diff --git a/api-editor/client/src/features/packageData/selectionView/DocumentationText.tsx b/api-editor/client/src/features/packageData/selectionView/DocumentationText.tsx index d1fc4ea75..de6409d25 100644 --- a/api-editor/client/src/features/packageData/selectionView/DocumentationText.tsx +++ b/api-editor/client/src/features/packageData/selectionView/DocumentationText.tsx @@ -43,8 +43,11 @@ const DocumentationText: React.FC = function ({ inputText = '', }) { const preprocessedText = inputText + // replace single new-lines by spaces .replaceAll(/(?=1.')\n elif 1 <= power < 2:\n self._lower_bound = DistributionBoundary(0, inclusive=True)\n elif power >= 2:\n self._lower_bound = DistributionBoundary(0, inclusive=False)\n else:\n raise ValueError\n self._power = power\n \n def unit_variance(self, y_pred):\n \"\"\"Compute the unit variance of a Tweedie distribution\n v(y_\textrm{pred})=y_\textrm{pred}**power.\n\n Parameters\n ----------\n y_pred : array of shape (n_samples,)\n Predicted mean.\n \"\"\"\n return np.power(y_pred, self.power)\n \n def unit_deviance(self, y, y_pred, check_input=False):\n \"\"\"Compute the unit deviance.\n\n The unit_deviance :math:`d(y,y_\\textrm{pred})` can be defined by the\n log-likelihood as\n :math:`d(y,y_\\textrm{pred}) = -2\\phi\\cdot\n \\left(loglike(y,y_\\textrm{pred},\\phi) - loglike(y,y,\\phi)\\right).`\n\n Parameters\n ----------\n y : array of shape (n_samples,)\n Target values.\n\n y_pred : array of shape (n_samples,)\n Predicted mean.\n\n check_input : bool, default=False\n If True raise an exception on invalid y or y_pred values, otherwise\n they will be propagated as NaN.\n Returns\n -------\n deviance: array of shape (n_samples,)\n Computed deviance\n \"\"\"\n p = self.power\n if check_input:\n message = 'Mean Tweedie deviance error with power={} can only be used on '.format(p)\n if p < 0:\n if (y_pred <= 0).any():\n raise ValueError(message + 'strictly positive y_pred.')\n elif p == 0:\n pass\n elif 0 < p < 1:\n raise ValueError('Tweedie deviance is only defined for power<=0 and power>=1.')\n elif 1 <= p < 2:\n if (y < 0).any() or (y_pred <= 0).any():\n raise ValueError(message + 'non-negative y and strictly positive y_pred.')\n elif p >= 2:\n if (y <= 0).any() or (y_pred <= 0).any():\n raise ValueError(message + 'strictly positive y and y_pred.')\n else:\n raise ValueError\n if p < 0:\n dev = 2 * (np.power(np.maximum(y, 0), 2 - p) / ((1 - p) * (2 - p)) - y * np.power(y_pred, 1 - p) / (1 - p) + np.power(y_pred, 2 - p) / (2 - p))\n elif p == 0:\n dev = (y - y_pred)**2\n elif p < 1:\n raise ValueError('Tweedie deviance is only defined for power<=0 and power>=1.')\n elif p == 1:\n dev = 2 * (xlogy(y, y / y_pred) - y + y_pred)\n elif p == 2:\n dev = 2 * (np.log(y_pred / y) + y / y_pred - 1)\n else:\n dev = 2 * (np.power(y, 2 - p) / ((1 - p) * (2 - p)) - y * np.power(y_pred, 1 - p) / (1 - p) + np.power(y_pred, 2 - p) / (2 - p))\n return dev\n" }, @@ -19746,7 +19811,7 @@ "sklearn.base._OneToOneFeatureMixin.get_feature_names_out" ], "is_public": false, - "description": "Provides `get_feature_names_out` for simple transformers.\n\nAssumes there's a 1-to-1 correspondence between input features and output features.", + "description": "Provides `get_feature_names_out` for simple transformers.\n\nAssumes there's a 1-to-1 correspondence between input features\nand output features.", "docstring": "Provides `get_feature_names_out` for simple transformers.\n\n Assumes there's a 1-to-1 correspondence between input features\n and output features.\n ", 
"source_code": "\n\nclass _OneToOneFeatureMixin:\n \"\"\"Provides `get_feature_names_out` for simple transformers.\n\n Assumes there's a 1-to-1 correspondence between input features\n and output features.\n \"\"\"\n \n def get_feature_names_out(self, input_features=None):\n \"\"\"Get output feature names for transformation.\n\n Parameters\n ----------\n input_features : array-like of str or None, default=None\n Input features.\n\n - If `input_features` is `None`, then `feature_names_in_` is\n used as feature names in. If `feature_names_in_` is not defined,\n then names are generated: `[x0, x1, ..., x(n_features_in_)]`.\n - If `input_features` is an array-like, then `input_features` must\n match `feature_names_in_` if `feature_names_in_` is defined.\n\n Returns\n -------\n feature_names_out : ndarray of str objects\n Same as input features.\n \"\"\"\n return _check_feature_names_in(self, input_features)\n" }, @@ -19778,7 +19843,7 @@ "sklearn.calibration.CalibratedClassifierCV._more_tags" ], "is_public": true, - "description": "Probability calibration with isotonic regression or logistic regression.\n\nThis class uses cross-validation to both estimate the parameters of a classifier and subsequently calibrate a classifier. With default `ensemble=True`, for each cv split it fits a copy of the base estimator to the training subset, and calibrates it using the testing subset. For prediction, predicted probabilities are averaged across these individual calibrated classifiers. When `ensemble=False`, cross-validation is used to obtain unbiased predictions, via :func:`~sklearn.model_selection.cross_val_predict`, which are then used for calibration. For prediction, the base estimator, trained using all the data, is used. This is the method implemented when `probabilities=True` for :mod:`sklearn.svm` estimators. Already fitted classifiers can be calibrated via the parameter `cv=\"prefit\"`. In this case, no cross-validation is used and all provided data is used for calibration. The user has to take care manually that data for model fitting and calibration are disjoint. The calibration is based on the :term:`decision_function` method of the `base_estimator` if it exists, else on :term:`predict_proba`. Read more in the :ref:`User Guide `.", + "description": "Probability calibration with isotonic regression or logistic regression.\n\nThis class uses cross-validation to both estimate the parameters of a\nclassifier and subsequently calibrate a classifier. With default\n`ensemble=True`, for each cv split it\nfits a copy of the base estimator to the training subset, and calibrates it\nusing the testing subset. For prediction, predicted probabilities are\naveraged across these individual calibrated classifiers. When\n`ensemble=False`, cross-validation is used to obtain unbiased predictions,\nvia :func:`~sklearn.model_selection.cross_val_predict`, which are then\nused for calibration. For prediction, the base estimator, trained using all\nthe data, is used. This is the method implemented when `probabilities=True`\nfor :mod:`sklearn.svm` estimators.\n\nAlready fitted classifiers can be calibrated via the parameter\n`cv=\"prefit\"`. In this case, no cross-validation is used and all provided\ndata is used for calibration. 
The user has to take care manually that data\nfor model fitting and calibration are disjoint.\n\nThe calibration is based on the :term:`decision_function` method of the\n`base_estimator` if it exists, else on :term:`predict_proba`.\n\nRead more in the :ref:`User Guide `.", "docstring": "Probability calibration with isotonic regression or logistic regression.\n\n This class uses cross-validation to both estimate the parameters of a\n classifier and subsequently calibrate a classifier. With default\n `ensemble=True`, for each cv split it\n fits a copy of the base estimator to the training subset, and calibrates it\n using the testing subset. For prediction, predicted probabilities are\n averaged across these individual calibrated classifiers. When\n `ensemble=False`, cross-validation is used to obtain unbiased predictions,\n via :func:`~sklearn.model_selection.cross_val_predict`, which are then\n used for calibration. For prediction, the base estimator, trained using all\n the data, is used. This is the method implemented when `probabilities=True`\n for :mod:`sklearn.svm` estimators.\n\n Already fitted classifiers can be calibrated via the parameter\n `cv=\"prefit\"`. In this case, no cross-validation is used and all provided\n data is used for calibration. The user has to take care manually that data\n for model fitting and calibration are disjoint.\n\n The calibration is based on the :term:`decision_function` method of the\n `base_estimator` if it exists, else on :term:`predict_proba`.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n base_estimator : estimator instance, default=None\n The classifier whose output need to be calibrated to provide more\n accurate `predict_proba` outputs. The default classifier is\n a :class:`~sklearn.svm.LinearSVC`.\n\n method : {'sigmoid', 'isotonic'}, default='sigmoid'\n The method to use for calibration. Can be 'sigmoid' which\n corresponds to Platt's method (i.e. a logistic regression model) or\n 'isotonic' which is a non-parametric approach. It is not advised to\n use isotonic calibration with too few calibration samples\n ``(<<1000)`` since it tends to overfit.\n\n cv : int, cross-validation generator, iterable or \"prefit\", default=None\n Determines the cross-validation splitting strategy.\n Possible inputs for cv are:\n\n - None, to use the default 5-fold cross-validation,\n - integer, to specify the number of folds.\n - :term:`CV splitter`,\n - An iterable yielding (train, test) splits as arrays of indices.\n\n For integer/None inputs, if ``y`` is binary or multiclass,\n :class:`~sklearn.model_selection.StratifiedKFold` is used. If ``y`` is\n neither binary nor multiclass, :class:`~sklearn.model_selection.KFold`\n is used.\n\n Refer to the :ref:`User Guide ` for the various\n cross-validation strategies that can be used here.\n\n If \"prefit\" is passed, it is assumed that `base_estimator` has been\n fitted already and all data is used for calibration.\n\n .. versionchanged:: 0.22\n ``cv`` default value if None changed from 3-fold to 5-fold.\n\n n_jobs : int, default=None\n Number of jobs to run in parallel.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors.\n\n Base estimator clones are fitted in parallel across cross-validation\n iterations. Therefore parallelism happens only when `cv != \"prefit\"`.\n\n See :term:`Glossary ` for more details.\n\n .. 
versionadded:: 0.24\n\n ensemble : bool, default=True\n Determines how the calibrator is fitted when `cv` is not `'prefit'`.\n Ignored if `cv='prefit'`.\n\n If `True`, the `base_estimator` is fitted using training data and\n calibrated using testing data, for each `cv` fold. The final estimator\n is an ensemble of `n_cv` fitted classifier and calibrator pairs, where\n `n_cv` is the number of cross-validation folds. The output is the\n average predicted probabilities of all pairs.\n\n If `False`, `cv` is used to compute unbiased predictions, via\n :func:`~sklearn.model_selection.cross_val_predict`, which are then\n used for calibration. At prediction time, the classifier used is the\n `base_estimator` trained on all the data.\n Note that this method is also internally implemented in\n :mod:`sklearn.svm` estimators with the `probabilities=True` parameter.\n\n .. versionadded:: 0.24\n\n Attributes\n ----------\n classes_ : ndarray of shape (n_classes,)\n The class labels.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`. Only defined if the\n underlying base_estimator exposes such an attribute when fit.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Only defined if the\n underlying base_estimator exposes such an attribute when fit.\n\n .. versionadded:: 1.0\n\n calibrated_classifiers_ : list (len() equal to cv or 1 if `cv=\"prefit\"` or `ensemble=False`)\n The list of classifier and calibrator pairs.\n\n - When `cv=\"prefit\"`, the fitted `base_estimator` and fitted\n calibrator.\n - When `cv` is not \"prefit\" and `ensemble=True`, `n_cv` fitted\n `base_estimator` and calibrator pairs. `n_cv` is the number of\n cross-validation folds.\n - When `cv` is not \"prefit\" and `ensemble=False`, the `base_estimator`,\n fitted on all the data, and fitted calibrator.\n\n .. versionchanged:: 0.24\n Single calibrated classifier case when `ensemble=False`.\n\n See Also\n --------\n calibration_curve : Compute true and predicted probabilities\n for a calibration curve.\n\n References\n ----------\n .. [1] Obtaining calibrated probability estimates from decision trees\n and naive Bayesian classifiers, B. Zadrozny & C. Elkan, ICML 2001\n\n .. [2] Transforming Classifier Scores into Accurate Multiclass\n Probability Estimates, B. Zadrozny & C. Elkan, (KDD 2002)\n\n .. [3] Probabilistic Outputs for Support Vector Machines and Comparisons to\n Regularized Likelihood Methods, J. Platt, (1999)\n\n .. [4] Predicting Good Probabilities with Supervised Learning,\n A. Niculescu-Mizil & R. Caruana, ICML 2005\n\n Examples\n --------\n >>> from sklearn.datasets import make_classification\n >>> from sklearn.naive_bayes import GaussianNB\n >>> from sklearn.calibration import CalibratedClassifierCV\n >>> X, y = make_classification(n_samples=100, n_features=2,\n ... n_redundant=0, random_state=42)\n >>> base_clf = GaussianNB()\n >>> calibrated_clf = CalibratedClassifierCV(base_estimator=base_clf, cv=3)\n >>> calibrated_clf.fit(X, y)\n CalibratedClassifierCV(base_estimator=GaussianNB(), cv=3)\n >>> len(calibrated_clf.calibrated_classifiers_)\n 3\n >>> calibrated_clf.predict_proba(X)[:5, :]\n array([[0.110..., 0.889...],\n [0.072..., 0.927...],\n [0.928..., 0.071...],\n [0.928..., 0.071...],\n [0.071..., 0.928...]])\n >>> from sklearn.model_selection import train_test_split\n >>> X, y = make_classification(n_samples=100, n_features=2,\n ... 
n_redundant=0, random_state=42)\n >>> X_train, X_calib, y_train, y_calib = train_test_split(\n ... X, y, random_state=42\n ... )\n >>> base_clf = GaussianNB()\n >>> base_clf.fit(X_train, y_train)\n GaussianNB()\n >>> calibrated_clf = CalibratedClassifierCV(\n ... base_estimator=base_clf,\n ... cv=\"prefit\"\n ... )\n >>> calibrated_clf.fit(X_calib, y_calib)\n CalibratedClassifierCV(base_estimator=GaussianNB(), cv='prefit')\n >>> len(calibrated_clf.calibrated_classifiers_)\n 1\n >>> calibrated_clf.predict_proba([[-0.5, 0.5]])\n array([[0.936..., 0.063...]])\n ", "source_code": "\n\nclass CalibratedClassifierCV(ClassifierMixin, MetaEstimatorMixin, BaseEstimator):\n \"\"\"Probability calibration with isotonic regression or logistic regression.\n\n This class uses cross-validation to both estimate the parameters of a\n classifier and subsequently calibrate a classifier. With default\n `ensemble=True`, for each cv split it\n fits a copy of the base estimator to the training subset, and calibrates it\n using the testing subset. For prediction, predicted probabilities are\n averaged across these individual calibrated classifiers. When\n `ensemble=False`, cross-validation is used to obtain unbiased predictions,\n via :func:`~sklearn.model_selection.cross_val_predict`, which are then\n used for calibration. For prediction, the base estimator, trained using all\n the data, is used. This is the method implemented when `probabilities=True`\n for :mod:`sklearn.svm` estimators.\n\n Already fitted classifiers can be calibrated via the parameter\n `cv=\"prefit\"`. In this case, no cross-validation is used and all provided\n data is used for calibration. The user has to take care manually that data\n for model fitting and calibration are disjoint.\n\n The calibration is based on the :term:`decision_function` method of the\n `base_estimator` if it exists, else on :term:`predict_proba`.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n base_estimator : estimator instance, default=None\n The classifier whose output need to be calibrated to provide more\n accurate `predict_proba` outputs. The default classifier is\n a :class:`~sklearn.svm.LinearSVC`.\n\n method : {'sigmoid', 'isotonic'}, default='sigmoid'\n The method to use for calibration. Can be 'sigmoid' which\n corresponds to Platt's method (i.e. a logistic regression model) or\n 'isotonic' which is a non-parametric approach. It is not advised to\n use isotonic calibration with too few calibration samples\n ``(<<1000)`` since it tends to overfit.\n\n cv : int, cross-validation generator, iterable or \"prefit\", default=None\n Determines the cross-validation splitting strategy.\n Possible inputs for cv are:\n\n - None, to use the default 5-fold cross-validation,\n - integer, to specify the number of folds.\n - :term:`CV splitter`,\n - An iterable yielding (train, test) splits as arrays of indices.\n\n For integer/None inputs, if ``y`` is binary or multiclass,\n :class:`~sklearn.model_selection.StratifiedKFold` is used. If ``y`` is\n neither binary nor multiclass, :class:`~sklearn.model_selection.KFold`\n is used.\n\n Refer to the :ref:`User Guide ` for the various\n cross-validation strategies that can be used here.\n\n If \"prefit\" is passed, it is assumed that `base_estimator` has been\n fitted already and all data is used for calibration.\n\n .. 
versionchanged:: 0.22\n ``cv`` default value if None changed from 3-fold to 5-fold.\n\n n_jobs : int, default=None\n Number of jobs to run in parallel.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors.\n\n Base estimator clones are fitted in parallel across cross-validation\n iterations. Therefore parallelism happens only when `cv != \"prefit\"`.\n\n See :term:`Glossary ` for more details.\n\n .. versionadded:: 0.24\n\n ensemble : bool, default=True\n Determines how the calibrator is fitted when `cv` is not `'prefit'`.\n Ignored if `cv='prefit'`.\n\n If `True`, the `base_estimator` is fitted using training data and\n calibrated using testing data, for each `cv` fold. The final estimator\n is an ensemble of `n_cv` fitted classifier and calibrator pairs, where\n `n_cv` is the number of cross-validation folds. The output is the\n average predicted probabilities of all pairs.\n\n If `False`, `cv` is used to compute unbiased predictions, via\n :func:`~sklearn.model_selection.cross_val_predict`, which are then\n used for calibration. At prediction time, the classifier used is the\n `base_estimator` trained on all the data.\n Note that this method is also internally implemented in\n :mod:`sklearn.svm` estimators with the `probabilities=True` parameter.\n\n .. versionadded:: 0.24\n\n Attributes\n ----------\n classes_ : ndarray of shape (n_classes,)\n The class labels.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`. Only defined if the\n underlying base_estimator exposes such an attribute when fit.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Only defined if the\n underlying base_estimator exposes such an attribute when fit.\n\n .. versionadded:: 1.0\n\n calibrated_classifiers_ : list (len() equal to cv or 1 if `cv=\"prefit\"` or `ensemble=False`)\n The list of classifier and calibrator pairs.\n\n - When `cv=\"prefit\"`, the fitted `base_estimator` and fitted\n calibrator.\n - When `cv` is not \"prefit\" and `ensemble=True`, `n_cv` fitted\n `base_estimator` and calibrator pairs. `n_cv` is the number of\n cross-validation folds.\n - When `cv` is not \"prefit\" and `ensemble=False`, the `base_estimator`,\n fitted on all the data, and fitted calibrator.\n\n .. versionchanged:: 0.24\n Single calibrated classifier case when `ensemble=False`.\n\n See Also\n --------\n calibration_curve : Compute true and predicted probabilities\n for a calibration curve.\n\n References\n ----------\n .. [1] Obtaining calibrated probability estimates from decision trees\n and naive Bayesian classifiers, B. Zadrozny & C. Elkan, ICML 2001\n\n .. [2] Transforming Classifier Scores into Accurate Multiclass\n Probability Estimates, B. Zadrozny & C. Elkan, (KDD 2002)\n\n .. [3] Probabilistic Outputs for Support Vector Machines and Comparisons to\n Regularized Likelihood Methods, J. Platt, (1999)\n\n .. [4] Predicting Good Probabilities with Supervised Learning,\n A. Niculescu-Mizil & R. Caruana, ICML 2005\n\n Examples\n --------\n >>> from sklearn.datasets import make_classification\n >>> from sklearn.naive_bayes import GaussianNB\n >>> from sklearn.calibration import CalibratedClassifierCV\n >>> X, y = make_classification(n_samples=100, n_features=2,\n ... 
n_redundant=0, random_state=42)\n >>> base_clf = GaussianNB()\n >>> calibrated_clf = CalibratedClassifierCV(base_estimator=base_clf, cv=3)\n >>> calibrated_clf.fit(X, y)\n CalibratedClassifierCV(base_estimator=GaussianNB(), cv=3)\n >>> len(calibrated_clf.calibrated_classifiers_)\n 3\n >>> calibrated_clf.predict_proba(X)[:5, :]\n array([[0.110..., 0.889...],\n [0.072..., 0.927...],\n [0.928..., 0.071...],\n [0.928..., 0.071...],\n [0.071..., 0.928...]])\n >>> from sklearn.model_selection import train_test_split\n >>> X, y = make_classification(n_samples=100, n_features=2,\n ... n_redundant=0, random_state=42)\n >>> X_train, X_calib, y_train, y_calib = train_test_split(\n ... X, y, random_state=42\n ... )\n >>> base_clf = GaussianNB()\n >>> base_clf.fit(X_train, y_train)\n GaussianNB()\n >>> calibrated_clf = CalibratedClassifierCV(\n ... base_estimator=base_clf,\n ... cv=\"prefit\"\n ... )\n >>> calibrated_clf.fit(X_calib, y_calib)\n CalibratedClassifierCV(base_estimator=GaussianNB(), cv='prefit')\n >>> len(calibrated_clf.calibrated_classifiers_)\n 1\n >>> calibrated_clf.predict_proba([[-0.5, 0.5]])\n array([[0.936..., 0.063...]])\n \"\"\"\n \n def __init__(self, base_estimator=None, *, method='sigmoid', cv=None, n_jobs=None, ensemble=True):\n self.base_estimator = base_estimator\n self.method = method\n self.cv = cv\n self.n_jobs = n_jobs\n self.ensemble = ensemble\n \n def fit(self, X, y, sample_weight=None):\n \"\"\"Fit the calibrated model.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training data.\n\n y : array-like of shape (n_samples,)\n Target values.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights. If None, then samples are equally weighted.\n\n Returns\n -------\n self : object\n Returns an instance of self.\n \"\"\"\n check_classification_targets(y)\n (X, y) = indexable(X, y)\n if sample_weight is not None:\n sample_weight = _check_sample_weight(sample_weight, X)\n if self.base_estimator is None:\n base_estimator = LinearSVC(random_state=0)\n else:\n base_estimator = self.base_estimator\n self.calibrated_classifiers_ = []\n if self.cv == 'prefit':\n check_is_fitted(self.base_estimator, attributes=['classes_'])\n self.classes_ = self.base_estimator.classes_\n (pred_method, method_name) = _get_prediction_method(base_estimator)\n n_classes = len(self.classes_)\n predictions = _compute_predictions(pred_method, method_name, X, n_classes)\n calibrated_classifier = _fit_calibrator(base_estimator, predictions, y, self.classes_, self.method, sample_weight)\n self.calibrated_classifiers_.append(calibrated_classifier)\n else:\n label_encoder_ = LabelEncoder().fit(y)\n self.classes_ = label_encoder_.classes_\n n_classes = len(self.classes_)\n fit_parameters = signature(base_estimator.fit).parameters\n supports_sw = 'sample_weight' in fit_parameters\n if sample_weight is not None and not supports_sw:\n estimator_name = type(base_estimator).__name__\n warnings.warn(f'Since {estimator_name} does not appear to accept sample_weight, sample weights will only be used for the calibration itself. This can be caused by a limitation of the current scikit-learn API. See the following issue for more details: https://github.com/scikit-learn/scikit-learn/issues/21134. 
Be warned that the result of the calibration is likely to be incorrect.')\n if isinstance(self.cv, int):\n n_folds = self.cv\n elif hasattr(self.cv, 'n_splits'):\n n_folds = self.cv.n_splits\n else:\n n_folds = None\n if n_folds and np.any([np.sum(y == class_) < n_folds for class_ in self.classes_]):\n raise ValueError(f'Requesting {n_folds}-fold cross-validation but provided less than {n_folds} examples for at least one class.')\n cv = check_cv(self.cv, y, classifier=True)\n if self.ensemble:\n parallel = Parallel(n_jobs=self.n_jobs)\n self.calibrated_classifiers_ = parallel((delayed(_fit_classifier_calibrator_pair)(clone(base_estimator), X, y, train=train, test=test, method=self.method, classes=self.classes_, supports_sw=supports_sw, sample_weight=sample_weight) for (train, test) in cv.split(X, y)))\n else:\n this_estimator = clone(base_estimator)\n (_, method_name) = _get_prediction_method(this_estimator)\n fit_params = {'sample_weight': sample_weight} if sample_weight is not None and supports_sw else None\n pred_method = partial(cross_val_predict, estimator=this_estimator, X=X, y=y, cv=cv, method=method_name, n_jobs=self.n_jobs, fit_params=fit_params)\n predictions = _compute_predictions(pred_method, method_name, X, n_classes)\n if sample_weight is not None and supports_sw:\n this_estimator.fit(X, y, sample_weight)\n else:\n this_estimator.fit(X, y)\n calibrated_classifier = _fit_calibrator(this_estimator, predictions, y, self.classes_, self.method, sample_weight)\n self.calibrated_classifiers_.append(calibrated_classifier)\n first_clf = self.calibrated_classifiers_[0].base_estimator\n if hasattr(first_clf, 'n_features_in_'):\n self.n_features_in_ = first_clf.n_features_in_\n if hasattr(first_clf, 'feature_names_in_'):\n self.feature_names_in_ = first_clf.feature_names_in_\n return self\n \n def predict_proba(self, X):\n \"\"\"Calibrated probabilities of classification.\n\n This function returns calibrated probabilities of classification\n according to each class on an array of test vectors X.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The samples, as accepted by `base_estimator.predict_proba`.\n\n Returns\n -------\n C : ndarray of shape (n_samples, n_classes)\n The predicted probas.\n \"\"\"\n check_is_fitted(self)\n mean_proba = np.zeros((_num_samples(X), len(self.classes_)))\n for calibrated_classifier in self.calibrated_classifiers_:\n proba = calibrated_classifier.predict_proba(X)\n mean_proba += proba\n mean_proba /= len(self.calibrated_classifiers_)\n return mean_proba\n \n def predict(self, X):\n \"\"\"Predict the target of new samples.\n\n The predicted class is the class that has the highest probability,\n and can thus be different from the prediction of the uncalibrated classifier.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The samples, as accepted by `base_estimator.predict`.\n\n Returns\n -------\n C : ndarray of shape (n_samples,)\n The predicted class.\n \"\"\"\n check_is_fitted(self)\n return self.classes_[np.argmax(self.predict_proba(X), axis=1)]\n \n def _more_tags(self):\n return {'_xfail_checks': {'check_sample_weights_invariance': 'Due to the cross-validation and sample ordering, removing a sample is not strictly equal to putting is weight to zero. 
Specific unit tests are added for CalibratedClassifierCV specifically.'}}\n" }, @@ -19794,9 +19859,9 @@ "sklearn.calibration.CalibrationDisplay.from_predictions" ], "is_public": true, - "description": "Calibration curve (also known as reliability diagram) visualization.\n\nIt is recommended to use :func:`~sklearn.calibration.CalibrationDisplay.from_estimator` or :func:`~sklearn.calibration.CalibrationDisplay.from_predictions` to create a `CalibrationDisplay`. All parameters are stored as attributes. Read more about calibration in the :ref:`User Guide ` and more about the scikit-learn visualization API in :ref:`visualizations`. .. versionadded:: 1.0", + "description": "Calibration curve (also known as reliability diagram) visualization.\n\nIt is recommended to use\n:func:`~sklearn.calibration.CalibrationDisplay.from_estimator` or\n:func:`~sklearn.calibration.CalibrationDisplay.from_predictions`\nto create a `CalibrationDisplay`. All parameters are stored as attributes.\n\nRead more about calibration in the :ref:`User Guide ` and\nmore about the scikit-learn visualization API in :ref:`visualizations`.\n\n.. versionadded:: 1.0", "docstring": "Calibration curve (also known as reliability diagram) visualization.\n\n It is recommended to use\n :func:`~sklearn.calibration.CalibrationDisplay.from_estimator` or\n :func:`~sklearn.calibration.CalibrationDisplay.from_predictions`\n to create a `CalibrationDisplay`. All parameters are stored as attributes.\n\n Read more about calibration in the :ref:`User Guide ` and\n more about the scikit-learn visualization API in :ref:`visualizations`.\n\n .. versionadded:: 1.0\n\n Parameters\n -----------\n prob_true : ndarray of shape (n_bins,)\n The proportion of samples whose class is the positive class (fraction\n of positives), in each bin.\n\n prob_pred : ndarray of shape (n_bins,)\n The mean predicted probability in each bin.\n\n y_prob : ndarray of shape (n_samples,)\n Probability estimates for the positive class, for each sample.\n\n estimator_name : str, default=None\n Name of estimator. If None, the estimator name is not shown.\n\n Attributes\n ----------\n line_ : matplotlib Artist\n Calibration curve.\n\n ax_ : matplotlib Axes\n Axes with calibration curve.\n\n figure_ : matplotlib Figure\n Figure containing the curve.\n\n See Also\n --------\n calibration_curve : Compute true and predicted probabilities for a\n calibration curve.\n CalibrationDisplay.from_predictions : Plot calibration curve using true\n and predicted labels.\n CalibrationDisplay.from_estimator : Plot calibration curve using an\n estimator and data.\n\n Examples\n --------\n >>> from sklearn.datasets import make_classification\n >>> from sklearn.model_selection import train_test_split\n >>> from sklearn.linear_model import LogisticRegression\n >>> from sklearn.calibration import calibration_curve, CalibrationDisplay\n >>> X, y = make_classification(random_state=0)\n >>> X_train, X_test, y_train, y_test = train_test_split(\n ... 
X, y, random_state=0)\n >>> clf = LogisticRegression(random_state=0)\n >>> clf.fit(X_train, y_train)\n LogisticRegression(random_state=0)\n >>> y_prob = clf.predict_proba(X_test)[:, 1]\n >>> prob_true, prob_pred = calibration_curve(y_test, y_prob, n_bins=10)\n >>> disp = CalibrationDisplay(prob_true, prob_pred, y_prob)\n >>> disp.plot()\n <...>\n ", - "source_code": "\n\nclass CalibrationDisplay:\n \"\"\"Calibration curve (also known as reliability diagram) visualization.\n\n It is recommended to use\n :func:`~sklearn.calibration.CalibrationDisplay.from_estimator` or\n :func:`~sklearn.calibration.CalibrationDisplay.from_predictions`\n to create a `CalibrationDisplay`. All parameters are stored as attributes.\n\n Read more about calibration in the :ref:`User Guide ` and\n more about the scikit-learn visualization API in :ref:`visualizations`.\n\n .. versionadded:: 1.0\n\n Parameters\n -----------\n prob_true : ndarray of shape (n_bins,)\n The proportion of samples whose class is the positive class (fraction\n of positives), in each bin.\n\n prob_pred : ndarray of shape (n_bins,)\n The mean predicted probability in each bin.\n\n y_prob : ndarray of shape (n_samples,)\n Probability estimates for the positive class, for each sample.\n\n estimator_name : str, default=None\n Name of estimator. If None, the estimator name is not shown.\n\n Attributes\n ----------\n line_ : matplotlib Artist\n Calibration curve.\n\n ax_ : matplotlib Axes\n Axes with calibration curve.\n\n figure_ : matplotlib Figure\n Figure containing the curve.\n\n See Also\n --------\n calibration_curve : Compute true and predicted probabilities for a\n calibration curve.\n CalibrationDisplay.from_predictions : Plot calibration curve using true\n and predicted labels.\n CalibrationDisplay.from_estimator : Plot calibration curve using an\n estimator and data.\n\n Examples\n --------\n >>> from sklearn.datasets import make_classification\n >>> from sklearn.model_selection import train_test_split\n >>> from sklearn.linear_model import LogisticRegression\n >>> from sklearn.calibration import calibration_curve, CalibrationDisplay\n >>> X, y = make_classification(random_state=0)\n >>> X_train, X_test, y_train, y_test = train_test_split(\n ... X, y, random_state=0)\n >>> clf = LogisticRegression(random_state=0)\n >>> clf.fit(X_train, y_train)\n LogisticRegression(random_state=0)\n >>> y_prob = clf.predict_proba(X_test)[:, 1]\n >>> prob_true, prob_pred = calibration_curve(y_test, y_prob, n_bins=10)\n >>> disp = CalibrationDisplay(prob_true, prob_pred, y_prob)\n >>> disp.plot()\n <...>\n \"\"\"\n \n def __init__(self, prob_true, prob_pred, y_prob, *, estimator_name=None):\n self.prob_true = prob_true\n self.prob_pred = prob_pred\n self.y_prob = y_prob\n self.estimator_name = estimator_name\n \n def plot(self, *, ax=None, name=None, ref_line=True, **kwargs):\n \"\"\"Plot visualization.\n\n Extra keyword arguments will be passed to\n :func:`matplotlib.pyplot.plot`.\n\n Parameters\n ----------\n ax : Matplotlib Axes, default=None\n Axes object to plot on. If `None`, a new figure and axes is\n created.\n\n name : str, default=None\n Name for labeling curve. 
If `None`, use `estimator_name` if\n not `None`, otherwise no labeling is shown.\n\n ref_line : bool, default=True\n If `True`, plots a reference line representing a perfectly\n calibrated classifier.\n\n **kwargs : dict\n Keyword arguments to be passed to :func:`matplotlib.pyplot.plot`.\n\n Returns\n -------\n display : :class:`~sklearn.calibration.CalibrationDisplay`\n Object that stores computed values.\n \"\"\"\n check_matplotlib_support('CalibrationDisplay.plot')\n import matplotlib.pyplot as plt\n if ax is None:\n (fig, ax) = plt.subplots()\n name = self.estimator_name if name is None else name\n line_kwargs = {}\n if name is not None:\n line_kwargs['label'] = name\n line_kwargs.update(**kwargs)\n ref_line_label = 'Perfectly calibrated'\n existing_ref_line = ref_line_label in ax.get_legend_handles_labels()[1]\n if ref_line and not existing_ref_line:\n ax.plot([0, 1], [0, 1], 'k:', label=ref_line_label)\n self.line_ = ax.plot(self.prob_pred, self.prob_true, 's-', **line_kwargs)[0]\n if 'label' in line_kwargs:\n ax.legend(loc='lower right')\n ax.set(xlabel='Mean predicted probability', ylabel='Fraction of positives')\n self.ax_ = ax\n self.figure_ = ax.figure\n return self\n \n @classmethod\n def from_estimator(cls, estimator, X, y, *, n_bins=5, strategy='uniform', name=None, ref_line=True, ax=None, **kwargs):\n \"\"\"Plot calibration curve using a binary classifier and data.\n\n A calibration curve, also known as a reliability diagram, uses inputs\n from a binary classifier and plots the average predicted probability\n for each bin against the fraction of positive classes, on the\n y-axis.\n\n Extra keyword arguments will be passed to\n :func:`matplotlib.pyplot.plot`.\n\n Read more about calibration in the :ref:`User Guide ` and\n more about the scikit-learn visualization API in :ref:`visualizations`.\n\n .. versionadded:: 1.0\n\n Parameters\n ----------\n estimator : estimator instance\n Fitted classifier or a fitted :class:`~sklearn.pipeline.Pipeline`\n in which the last estimator is a classifier. The classifier must\n have a :term:`predict_proba` method.\n\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Input values.\n\n y : array-like of shape (n_samples,)\n Binary target values.\n\n n_bins : int, default=5\n Number of bins to discretize the [0, 1] interval into when\n calculating the calibration curve. A bigger number requires more\n data.\n\n strategy : {'uniform', 'quantile'}, default='uniform'\n Strategy used to define the widths of the bins.\n\n - `'uniform'`: The bins have identical widths.\n - `'quantile'`: The bins have the same number of samples and depend\n on predicted probabilities.\n\n name : str, default=None\n Name for labeling curve. If `None`, the name of the estimator is\n used.\n\n ref_line : bool, default=True\n If `True`, plots a reference line representing a perfectly\n calibrated classifier.\n\n ax : matplotlib axes, default=None\n Axes object to plot on. 
If `None`, a new figure and axes is\n created.\n\n **kwargs : dict\n Keyword arguments to be passed to :func:`matplotlib.pyplot.plot`.\n\n Returns\n -------\n display : :class:`~sklearn.calibration.CalibrationDisplay`.\n Object that stores computed values.\n\n See Also\n --------\n CalibrationDisplay.from_predictions : Plot calibration curve using true\n and predicted labels.\n\n Examples\n --------\n >>> import matplotlib.pyplot as plt\n >>> from sklearn.datasets import make_classification\n >>> from sklearn.model_selection import train_test_split\n >>> from sklearn.linear_model import LogisticRegression\n >>> from sklearn.calibration import CalibrationDisplay\n >>> X, y = make_classification(random_state=0)\n >>> X_train, X_test, y_train, y_test = train_test_split(\n ... X, y, random_state=0)\n >>> clf = LogisticRegression(random_state=0)\n >>> clf.fit(X_train, y_train)\n LogisticRegression(random_state=0)\n >>> disp = CalibrationDisplay.from_estimator(clf, X_test, y_test)\n >>> plt.show()\n \"\"\"\n method_name = f'{cls.__name__}.from_estimator'\n check_matplotlib_support(method_name)\n if not is_classifier(estimator):\n raise ValueError(\"'estimator' should be a fitted classifier.\")\n (y_prob, _) = _get_response(X, estimator, response_method='predict_proba', pos_label=None)\n name = name if name is not None else estimator.__class__.__name__\n return cls.from_predictions(y, y_prob, n_bins=n_bins, strategy=strategy, name=name, ref_line=ref_line, ax=ax, **kwargs)\n \n @classmethod\n def from_predictions(cls, y_true, y_prob, *, n_bins=5, strategy='uniform', name=None, ref_line=True, ax=None, **kwargs):\n \"\"\"Plot calibration curve using true labels and predicted probabilities.\n\n Calibration curve, also known as reliability diagram, uses inputs\n from a binary classifier and plots the average predicted probability\n for each bin against the fraction of positive classes, on the\n y-axis.\n\n Extra keyword arguments will be passed to\n :func:`matplotlib.pyplot.plot`.\n\n Read more about calibration in the :ref:`User Guide ` and\n more about the scikit-learn visualization API in :ref:`visualizations`.\n\n .. versionadded:: 1.0\n\n Parameters\n ----------\n y_true : array-like of shape (n_samples,)\n True labels.\n\n y_prob : array-like of shape (n_samples,)\n The predicted probabilities of the positive class.\n\n n_bins : int, default=5\n Number of bins to discretize the [0, 1] interval into when\n calculating the calibration curve. A bigger number requires more\n data.\n\n strategy : {'uniform', 'quantile'}, default='uniform'\n Strategy used to define the widths of the bins.\n\n - `'uniform'`: The bins have identical widths.\n - `'quantile'`: The bins have the same number of samples and depend\n on predicted probabilities.\n\n name : str, default=None\n Name for labeling curve.\n\n ref_line : bool, default=True\n If `True`, plots a reference line representing a perfectly\n calibrated classifier.\n\n ax : matplotlib axes, default=None\n Axes object to plot on. 
If `None`, a new figure and axes is\n created.\n\n **kwargs : dict\n Keyword arguments to be passed to :func:`matplotlib.pyplot.plot`.\n\n Returns\n -------\n display : :class:`~sklearn.calibration.CalibrationDisplay`.\n Object that stores computed values.\n\n See Also\n --------\n CalibrationDisplay.from_estimator : Plot calibration curve using an\n estimator and data.\n\n Examples\n --------\n >>> import matplotlib.pyplot as plt\n >>> from sklearn.datasets import make_classification\n >>> from sklearn.model_selection import train_test_split\n >>> from sklearn.linear_model import LogisticRegression\n >>> from sklearn.calibration import CalibrationDisplay\n >>> X, y = make_classification(random_state=0)\n >>> X_train, X_test, y_train, y_test = train_test_split(\n ... X, y, random_state=0)\n >>> clf = LogisticRegression(random_state=0)\n >>> clf.fit(X_train, y_train)\n LogisticRegression(random_state=0)\n >>> y_prob = clf.predict_proba(X_test)[:, 1]\n >>> disp = CalibrationDisplay.from_predictions(y_test, y_prob)\n >>> plt.show()\n \"\"\"\n method_name = f'{cls.__name__}.from_estimator'\n check_matplotlib_support(method_name)\n (prob_true, prob_pred) = calibration_curve(y_true, y_prob, n_bins=n_bins, strategy=strategy)\n name = name if name is not None else 'Classifier'\n disp = cls(prob_true=prob_true, prob_pred=prob_pred, y_prob=y_prob, estimator_name=name)\n return disp.plot(ax=ax, ref_line=ref_line, **kwargs)\n" + "source_code": "\n\nclass CalibrationDisplay:\n \"\"\"Calibration curve (also known as reliability diagram) visualization.\n\n It is recommended to use\n :func:`~sklearn.calibration.CalibrationDisplay.from_estimator` or\n :func:`~sklearn.calibration.CalibrationDisplay.from_predictions`\n to create a `CalibrationDisplay`. All parameters are stored as attributes.\n\n Read more about calibration in the :ref:`User Guide ` and\n more about the scikit-learn visualization API in :ref:`visualizations`.\n\n .. versionadded:: 1.0\n\n Parameters\n -----------\n prob_true : ndarray of shape (n_bins,)\n The proportion of samples whose class is the positive class (fraction\n of positives), in each bin.\n\n prob_pred : ndarray of shape (n_bins,)\n The mean predicted probability in each bin.\n\n y_prob : ndarray of shape (n_samples,)\n Probability estimates for the positive class, for each sample.\n\n estimator_name : str, default=None\n Name of estimator. If None, the estimator name is not shown.\n\n Attributes\n ----------\n line_ : matplotlib Artist\n Calibration curve.\n\n ax_ : matplotlib Axes\n Axes with calibration curve.\n\n figure_ : matplotlib Figure\n Figure containing the curve.\n\n See Also\n --------\n calibration_curve : Compute true and predicted probabilities for a\n calibration curve.\n CalibrationDisplay.from_predictions : Plot calibration curve using true\n and predicted labels.\n CalibrationDisplay.from_estimator : Plot calibration curve using an\n estimator and data.\n\n Examples\n --------\n >>> from sklearn.datasets import make_classification\n >>> from sklearn.model_selection import train_test_split\n >>> from sklearn.linear_model import LogisticRegression\n >>> from sklearn.calibration import calibration_curve, CalibrationDisplay\n >>> X, y = make_classification(random_state=0)\n >>> X_train, X_test, y_train, y_test = train_test_split(\n ... 
X, y, random_state=0)\n >>> clf = LogisticRegression(random_state=0)\n >>> clf.fit(X_train, y_train)\n LogisticRegression(random_state=0)\n >>> y_prob = clf.predict_proba(X_test)[:, 1]\n >>> prob_true, prob_pred = calibration_curve(y_test, y_prob, n_bins=10)\n >>> disp = CalibrationDisplay(prob_true, prob_pred, y_prob)\n >>> disp.plot()\n <...>\n \"\"\"\n \n def __init__(self, prob_true, prob_pred, y_prob, *, estimator_name=None):\n self.prob_true = prob_true\n self.prob_pred = prob_pred\n self.y_prob = y_prob\n self.estimator_name = estimator_name\n \n def plot(self, *, ax=None, name=None, ref_line=True, **kwargs):\n \"\"\"Plot visualization.\n\n Extra keyword arguments will be passed to\n :func:`matplotlib.pyplot.plot`.\n\n Parameters\n ----------\n ax : Matplotlib Axes, default=None\n Axes object to plot on. If `None`, a new figure and axes is\n created.\n\n name : str, default=None\n Name for labeling curve. If `None`, use `estimator_name` if\n not `None`, otherwise no labeling is shown.\n\n ref_line : bool, default=True\n If `True`, plots a reference line representing a perfectly\n calibrated classifier.\n\n **kwargs : dict\n Keyword arguments to be passed to :func:`matplotlib.pyplot.plot`.\n\n Returns\n -------\n display : :class:`~sklearn.calibration.CalibrationDisplay`\n Object that stores computed values.\n \"\"\"\n check_matplotlib_support('CalibrationDisplay.plot')\n import matplotlib.pyplot as plt\n if ax is None:\n (fig, ax) = plt.subplots()\n name = self.estimator_name if name is None else name\n line_kwargs = {}\n if name is not None:\n line_kwargs['label'] = name\n line_kwargs.update(**kwargs)\n ref_line_label = 'Perfectly calibrated'\n existing_ref_line = ref_line_label in ax.get_legend_handles_labels()[1]\n if ref_line and not existing_ref_line:\n ax.plot([0, 1], [0, 1], 'k:', label=ref_line_label)\n self.line_ = ax.plot(self.prob_pred, self.prob_true, 's-', **line_kwargs)[0]\n ax.legend(loc='lower right')\n ax.set(xlabel='Mean predicted probability', ylabel='Fraction of positives')\n self.ax_ = ax\n self.figure_ = ax.figure\n return self\n \n @classmethod\n def from_estimator(cls, estimator, X, y, *, n_bins=5, strategy='uniform', name=None, ref_line=True, ax=None, **kwargs):\n \"\"\"Plot calibration curve using a binary classifier and data.\n\n A calibration curve, also known as a reliability diagram, uses inputs\n from a binary classifier and plots the average predicted probability\n for each bin against the fraction of positive classes, on the\n y-axis.\n\n Extra keyword arguments will be passed to\n :func:`matplotlib.pyplot.plot`.\n\n Read more about calibration in the :ref:`User Guide ` and\n more about the scikit-learn visualization API in :ref:`visualizations`.\n\n .. versionadded:: 1.0\n\n Parameters\n ----------\n estimator : estimator instance\n Fitted classifier or a fitted :class:`~sklearn.pipeline.Pipeline`\n in which the last estimator is a classifier. The classifier must\n have a :term:`predict_proba` method.\n\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Input values.\n\n y : array-like of shape (n_samples,)\n Binary target values.\n\n n_bins : int, default=5\n Number of bins to discretize the [0, 1] interval into when\n calculating the calibration curve. 
A bigger number requires more\n data.\n\n strategy : {'uniform', 'quantile'}, default='uniform'\n Strategy used to define the widths of the bins.\n\n - `'uniform'`: The bins have identical widths.\n - `'quantile'`: The bins have the same number of samples and depend\n on predicted probabilities.\n\n name : str, default=None\n Name for labeling curve. If `None`, the name of the estimator is\n used.\n\n ref_line : bool, default=True\n If `True`, plots a reference line representing a perfectly\n calibrated classifier.\n\n ax : matplotlib axes, default=None\n Axes object to plot on. If `None`, a new figure and axes is\n created.\n\n **kwargs : dict\n Keyword arguments to be passed to :func:`matplotlib.pyplot.plot`.\n\n Returns\n -------\n display : :class:`~sklearn.calibration.CalibrationDisplay`.\n Object that stores computed values.\n\n See Also\n --------\n CalibrationDisplay.from_predictions : Plot calibration curve using true\n and predicted labels.\n\n Examples\n --------\n >>> import matplotlib.pyplot as plt\n >>> from sklearn.datasets import make_classification\n >>> from sklearn.model_selection import train_test_split\n >>> from sklearn.linear_model import LogisticRegression\n >>> from sklearn.calibration import CalibrationDisplay\n >>> X, y = make_classification(random_state=0)\n >>> X_train, X_test, y_train, y_test = train_test_split(\n ... X, y, random_state=0)\n >>> clf = LogisticRegression(random_state=0)\n >>> clf.fit(X_train, y_train)\n LogisticRegression(random_state=0)\n >>> disp = CalibrationDisplay.from_estimator(clf, X_test, y_test)\n >>> plt.show()\n \"\"\"\n method_name = f'{cls.__name__}.from_estimator'\n check_matplotlib_support(method_name)\n if not is_classifier(estimator):\n raise ValueError(\"'estimator' should be a fitted classifier.\")\n (y_prob, _) = _get_response(X, estimator, response_method='predict_proba', pos_label=None)\n name = name if name is not None else estimator.__class__.__name__\n return cls.from_predictions(y, y_prob, n_bins=n_bins, strategy=strategy, name=name, ref_line=ref_line, ax=ax, **kwargs)\n \n @classmethod\n def from_predictions(cls, y_true, y_prob, *, n_bins=5, strategy='uniform', name=None, ref_line=True, ax=None, **kwargs):\n \"\"\"Plot calibration curve using true labels and predicted probabilities.\n\n Calibration curve, also known as reliability diagram, uses inputs\n from a binary classifier and plots the average predicted probability\n for each bin against the fraction of positive classes, on the\n y-axis.\n\n Extra keyword arguments will be passed to\n :func:`matplotlib.pyplot.plot`.\n\n Read more about calibration in the :ref:`User Guide ` and\n more about the scikit-learn visualization API in :ref:`visualizations`.\n\n .. versionadded:: 1.0\n\n Parameters\n ----------\n y_true : array-like of shape (n_samples,)\n True labels.\n\n y_prob : array-like of shape (n_samples,)\n The predicted probabilities of the positive class.\n\n n_bins : int, default=5\n Number of bins to discretize the [0, 1] interval into when\n calculating the calibration curve. 
A bigger number requires more\n data.\n\n strategy : {'uniform', 'quantile'}, default='uniform'\n Strategy used to define the widths of the bins.\n\n - `'uniform'`: The bins have identical widths.\n - `'quantile'`: The bins have the same number of samples and depend\n on predicted probabilities.\n\n name : str, default=None\n Name for labeling curve.\n\n ref_line : bool, default=True\n If `True`, plots a reference line representing a perfectly\n calibrated classifier.\n\n ax : matplotlib axes, default=None\n Axes object to plot on. If `None`, a new figure and axes is\n created.\n\n **kwargs : dict\n Keyword arguments to be passed to :func:`matplotlib.pyplot.plot`.\n\n Returns\n -------\n display : :class:`~sklearn.calibration.CalibrationDisplay`.\n Object that stores computed values.\n\n See Also\n --------\n CalibrationDisplay.from_estimator : Plot calibration curve using an\n estimator and data.\n\n Examples\n --------\n >>> import matplotlib.pyplot as plt\n >>> from sklearn.datasets import make_classification\n >>> from sklearn.model_selection import train_test_split\n >>> from sklearn.linear_model import LogisticRegression\n >>> from sklearn.calibration import CalibrationDisplay\n >>> X, y = make_classification(random_state=0)\n >>> X_train, X_test, y_train, y_test = train_test_split(\n ... X, y, random_state=0)\n >>> clf = LogisticRegression(random_state=0)\n >>> clf.fit(X_train, y_train)\n LogisticRegression(random_state=0)\n >>> y_prob = clf.predict_proba(X_test)[:, 1]\n >>> disp = CalibrationDisplay.from_predictions(y_test, y_prob)\n >>> plt.show()\n \"\"\"\n method_name = f'{cls.__name__}.from_estimator'\n check_matplotlib_support(method_name)\n (prob_true, prob_pred) = calibration_curve(y_true, y_prob, n_bins=n_bins, strategy=strategy)\n name = name if name is not None else 'Classifier'\n disp = cls(prob_true=prob_true, prob_pred=prob_pred, y_prob=y_prob, estimator_name=name)\n return disp.plot(ax=ax, ref_line=ref_line, **kwargs)\n" }, { "name": "_CalibratedClassifier", @@ -19857,7 +19922,7 @@ "sklearn.cluster._agglomerative.AgglomerativeClustering.fit_predict" ], "is_public": true, - "description": "Agglomerative Clustering.\n\nRecursively merges pair of clusters of sample data; uses linkage distance. Read more in the :ref:`User Guide `.", + "description": "Agglomerative Clustering.\n\nRecursively merges pair of clusters of sample data; uses linkage distance.\n\nRead more in the :ref:`User Guide `.", "docstring": "\n Agglomerative Clustering.\n\n Recursively merges pair of clusters of sample data; uses linkage distance.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n n_clusters : int or None, default=2\n The number of clusters to find. It must be ``None`` if\n ``distance_threshold`` is not ``None``.\n\n affinity : str or callable, default='euclidean'\n Metric used to compute the linkage. Can be \"euclidean\", \"l1\", \"l2\",\n \"manhattan\", \"cosine\", or \"precomputed\".\n If linkage is \"ward\", only \"euclidean\" is accepted.\n If \"precomputed\", a distance matrix (instead of a similarity matrix)\n is needed as input for the fit method.\n\n memory : str or object with the joblib.Memory interface, default=None\n Used to cache the output of the computation of the tree.\n By default, no caching is done. If a string is given, it is the\n path to the caching directory.\n\n connectivity : array-like or callable, default=None\n Connectivity matrix. 
Defines for each sample the neighboring\n samples following a given structure of the data.\n This can be a connectivity matrix itself or a callable that transforms\n the data into a connectivity matrix, such as derived from\n `kneighbors_graph`. Default is ``None``, i.e, the\n hierarchical clustering algorithm is unstructured.\n\n compute_full_tree : 'auto' or bool, default='auto'\n Stop early the construction of the tree at ``n_clusters``. This is\n useful to decrease computation time if the number of clusters is not\n small compared to the number of samples. This option is useful only\n when specifying a connectivity matrix. Note also that when varying the\n number of clusters and using caching, it may be advantageous to compute\n the full tree. It must be ``True`` if ``distance_threshold`` is not\n ``None``. By default `compute_full_tree` is \"auto\", which is equivalent\n to `True` when `distance_threshold` is not `None` or that `n_clusters`\n is inferior to the maximum between 100 or `0.02 * n_samples`.\n Otherwise, \"auto\" is equivalent to `False`.\n\n linkage : {'ward', 'complete', 'average', 'single'}, default='ward'\n Which linkage criterion to use. The linkage criterion determines which\n distance to use between sets of observation. The algorithm will merge\n the pairs of cluster that minimize this criterion.\n\n - 'ward' minimizes the variance of the clusters being merged.\n - 'average' uses the average of the distances of each observation of\n the two sets.\n - 'complete' or 'maximum' linkage uses the maximum distances between\n all observations of the two sets.\n - 'single' uses the minimum of the distances between all observations\n of the two sets.\n\n .. versionadded:: 0.20\n Added the 'single' option\n\n distance_threshold : float, default=None\n The linkage distance threshold above which, clusters will not be\n merged. If not ``None``, ``n_clusters`` must be ``None`` and\n ``compute_full_tree`` must be ``True``.\n\n .. versionadded:: 0.21\n\n compute_distances : bool, default=False\n Computes distances between clusters even if `distance_threshold` is not\n used. This can be used to make dendrogram visualization, but introduces\n a computational and memory overhead.\n\n .. versionadded:: 0.24\n\n Attributes\n ----------\n n_clusters_ : int\n The number of clusters found by the algorithm. If\n ``distance_threshold=None``, it will be equal to the given\n ``n_clusters``.\n\n labels_ : ndarray of shape (n_samples)\n Cluster labels for each point.\n\n n_leaves_ : int\n Number of leaves in the hierarchical tree.\n\n n_connected_components_ : int\n The estimated number of connected components in the graph.\n\n .. versionadded:: 0.21\n ``n_connected_components_`` was added to replace ``n_components_``.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n children_ : array-like of shape (n_samples-1, 2)\n The children of each non-leaf node. Values less than `n_samples`\n correspond to leaves of the tree which are the original samples.\n A node `i` greater than or equal to `n_samples` is a non-leaf\n node and has children `children_[i - n_samples]`. 
Alternatively\n at the i-th iteration, children[i][0] and children[i][1]\n are merged to form node `n_samples + i`.\n\n distances_ : array-like of shape (n_nodes-1,)\n Distances between nodes in the corresponding place in `children_`.\n Only computed if `distance_threshold` is used or `compute_distances`\n is set to `True`.\n\n See Also\n --------\n FeatureAgglomeration : Agglomerative clustering but for features instead of\n samples.\n ward_tree : Hierarchical clustering with ward linkage.\n\n Examples\n --------\n >>> from sklearn.cluster import AgglomerativeClustering\n >>> import numpy as np\n >>> X = np.array([[1, 2], [1, 4], [1, 0],\n ... [4, 2], [4, 4], [4, 0]])\n >>> clustering = AgglomerativeClustering().fit(X)\n >>> clustering\n AgglomerativeClustering()\n >>> clustering.labels_\n array([1, 1, 1, 0, 0, 0])\n ", "source_code": "\n\nclass AgglomerativeClustering(ClusterMixin, BaseEstimator):\n \"\"\"\n Agglomerative Clustering.\n\n Recursively merges pair of clusters of sample data; uses linkage distance.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n n_clusters : int or None, default=2\n The number of clusters to find. It must be ``None`` if\n ``distance_threshold`` is not ``None``.\n\n affinity : str or callable, default='euclidean'\n Metric used to compute the linkage. Can be \"euclidean\", \"l1\", \"l2\",\n \"manhattan\", \"cosine\", or \"precomputed\".\n If linkage is \"ward\", only \"euclidean\" is accepted.\n If \"precomputed\", a distance matrix (instead of a similarity matrix)\n is needed as input for the fit method.\n\n memory : str or object with the joblib.Memory interface, default=None\n Used to cache the output of the computation of the tree.\n By default, no caching is done. If a string is given, it is the\n path to the caching directory.\n\n connectivity : array-like or callable, default=None\n Connectivity matrix. Defines for each sample the neighboring\n samples following a given structure of the data.\n This can be a connectivity matrix itself or a callable that transforms\n the data into a connectivity matrix, such as derived from\n `kneighbors_graph`. Default is ``None``, i.e, the\n hierarchical clustering algorithm is unstructured.\n\n compute_full_tree : 'auto' or bool, default='auto'\n Stop early the construction of the tree at ``n_clusters``. This is\n useful to decrease computation time if the number of clusters is not\n small compared to the number of samples. This option is useful only\n when specifying a connectivity matrix. Note also that when varying the\n number of clusters and using caching, it may be advantageous to compute\n the full tree. It must be ``True`` if ``distance_threshold`` is not\n ``None``. By default `compute_full_tree` is \"auto\", which is equivalent\n to `True` when `distance_threshold` is not `None` or that `n_clusters`\n is inferior to the maximum between 100 or `0.02 * n_samples`.\n Otherwise, \"auto\" is equivalent to `False`.\n\n linkage : {'ward', 'complete', 'average', 'single'}, default='ward'\n Which linkage criterion to use. The linkage criterion determines which\n distance to use between sets of observation. 
The algorithm will merge\n the pairs of cluster that minimize this criterion.\n\n - 'ward' minimizes the variance of the clusters being merged.\n - 'average' uses the average of the distances of each observation of\n the two sets.\n - 'complete' or 'maximum' linkage uses the maximum distances between\n all observations of the two sets.\n - 'single' uses the minimum of the distances between all observations\n of the two sets.\n\n .. versionadded:: 0.20\n Added the 'single' option\n\n distance_threshold : float, default=None\n The linkage distance threshold above which, clusters will not be\n merged. If not ``None``, ``n_clusters`` must be ``None`` and\n ``compute_full_tree`` must be ``True``.\n\n .. versionadded:: 0.21\n\n compute_distances : bool, default=False\n Computes distances between clusters even if `distance_threshold` is not\n used. This can be used to make dendrogram visualization, but introduces\n a computational and memory overhead.\n\n .. versionadded:: 0.24\n\n Attributes\n ----------\n n_clusters_ : int\n The number of clusters found by the algorithm. If\n ``distance_threshold=None``, it will be equal to the given\n ``n_clusters``.\n\n labels_ : ndarray of shape (n_samples)\n Cluster labels for each point.\n\n n_leaves_ : int\n Number of leaves in the hierarchical tree.\n\n n_connected_components_ : int\n The estimated number of connected components in the graph.\n\n .. versionadded:: 0.21\n ``n_connected_components_`` was added to replace ``n_components_``.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n children_ : array-like of shape (n_samples-1, 2)\n The children of each non-leaf node. Values less than `n_samples`\n correspond to leaves of the tree which are the original samples.\n A node `i` greater than or equal to `n_samples` is a non-leaf\n node and has children `children_[i - n_samples]`. Alternatively\n at the i-th iteration, children[i][0] and children[i][1]\n are merged to form node `n_samples + i`.\n\n distances_ : array-like of shape (n_nodes-1,)\n Distances between nodes in the corresponding place in `children_`.\n Only computed if `distance_threshold` is used or `compute_distances`\n is set to `True`.\n\n See Also\n --------\n FeatureAgglomeration : Agglomerative clustering but for features instead of\n samples.\n ward_tree : Hierarchical clustering with ward linkage.\n\n Examples\n --------\n >>> from sklearn.cluster import AgglomerativeClustering\n >>> import numpy as np\n >>> X = np.array([[1, 2], [1, 4], [1, 0],\n ... 
[4, 2], [4, 4], [4, 0]])\n >>> clustering = AgglomerativeClustering().fit(X)\n >>> clustering\n AgglomerativeClustering()\n >>> clustering.labels_\n array([1, 1, 1, 0, 0, 0])\n \"\"\"\n \n def __init__(self, n_clusters=2, *, affinity='euclidean', memory=None, connectivity=None, compute_full_tree='auto', linkage='ward', distance_threshold=None, compute_distances=False):\n self.n_clusters = n_clusters\n self.distance_threshold = distance_threshold\n self.memory = memory\n self.connectivity = connectivity\n self.compute_full_tree = compute_full_tree\n self.linkage = linkage\n self.affinity = affinity\n self.compute_distances = compute_distances\n \n def fit(self, X, y=None):\n \"\"\"Fit the hierarchical clustering from features, or distance matrix.\n\n Parameters\n ----------\n X : array-like, shape (n_samples, n_features) or (n_samples, n_samples)\n Training instances to cluster, or distances between instances if\n ``affinity='precomputed'``.\n\n y : Ignored\n Not used, present here for API consistency by convention.\n\n Returns\n -------\n self : object\n Returns the fitted instance.\n \"\"\"\n X = self._validate_data(X, ensure_min_samples=2, estimator=self)\n return self._fit(X)\n \n def _fit(self, X):\n \"\"\"Fit without validation\n\n Parameters\n ----------\n X : ndarray of shape (n_samples, n_features) or (n_samples, n_samples)\n Training instances to cluster, or distances between instances if\n ``affinity='precomputed'``.\n\n Returns\n -------\n self : object\n Returns the fitted instance.\n \"\"\"\n memory = check_memory(self.memory)\n if self.n_clusters is not None and self.n_clusters <= 0:\n raise ValueError('n_clusters should be an integer greater than 0. %s was provided.' % str(self.n_clusters))\n if not (self.n_clusters is None) ^ (self.distance_threshold is None):\n raise ValueError('Exactly one of n_clusters and distance_threshold has to be set, and the other needs to be None.')\n if self.distance_threshold is not None and not self.compute_full_tree:\n raise ValueError('compute_full_tree must be True if distance_threshold is set.')\n if self.linkage == 'ward' and self.affinity != 'euclidean':\n raise ValueError('%s was provided as affinity. Ward can only work with euclidean distances.' % (self.affinity, ))\n if self.linkage not in _TREE_BUILDERS:\n raise ValueError('Unknown linkage type %s. 
Valid options are %s' % (self.linkage, _TREE_BUILDERS.keys()))\n tree_builder = _TREE_BUILDERS[self.linkage]\n connectivity = self.connectivity\n if self.connectivity is not None:\n if callable(self.connectivity):\n connectivity = self.connectivity(X)\n connectivity = check_array(connectivity, accept_sparse=['csr', 'coo', 'lil'])\n n_samples = len(X)\n compute_full_tree = self.compute_full_tree\n if self.connectivity is None:\n compute_full_tree = True\n if compute_full_tree == 'auto':\n if self.distance_threshold is not None:\n compute_full_tree = True\n else:\n compute_full_tree = self.n_clusters < max(100, 0.02 * n_samples)\n n_clusters = self.n_clusters\n if compute_full_tree:\n n_clusters = None\n kwargs = {}\n if self.linkage != 'ward':\n kwargs['linkage'] = self.linkage\n kwargs['affinity'] = self.affinity\n distance_threshold = self.distance_threshold\n return_distance = distance_threshold is not None or self.compute_distances\n out = memory.cache(tree_builder)(X, connectivity=connectivity, n_clusters=n_clusters, return_distance=return_distance, **kwargs)\n (self.children_, self.n_connected_components_, self.n_leaves_, parents) = out[:4]\n if return_distance:\n self.distances_ = out[-1]\n if self.distance_threshold is not None:\n self.n_clusters_ = np.count_nonzero(self.distances_ >= distance_threshold) + 1\n else:\n self.n_clusters_ = self.n_clusters\n if compute_full_tree:\n self.labels_ = _hc_cut(self.n_clusters_, self.children_, self.n_leaves_)\n else:\n labels = _hierarchical.hc_get_heads(parents, copy=False)\n labels = np.copy(labels[:n_samples])\n self.labels_ = np.searchsorted(np.unique(labels), labels)\n return self\n \n def fit_predict(self, X, y=None):\n \"\"\"Fit and return the result of each sample's clustering assignment.\n\n In addition to fitting, this method also return the result of the\n clustering assignment for each sample in the training set.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features) or (n_samples, n_samples)\n Training instances to cluster, or distances between instances if\n ``affinity='precomputed'``.\n\n y : Ignored\n Not used, present here for API consistency by convention.\n\n Returns\n -------\n labels : ndarray of shape (n_samples,)\n Cluster labels.\n \"\"\"\n return super().fit_predict(X, y)\n" }, @@ -19875,7 +19940,7 @@ "sklearn.cluster._agglomerative.FeatureAgglomeration.fit_predict@getter" ], "is_public": true, - "description": "Agglomerate features.\n\nRecursively merges pair of clusters of features. Read more in the :ref:`User Guide `.", + "description": "Agglomerate features.\n\nRecursively merges pair of clusters of features.\n\nRead more in the :ref:`User Guide `.", "docstring": "Agglomerate features.\n\n Recursively merges pair of clusters of features.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n n_clusters : int, default=2\n The number of clusters to find. It must be ``None`` if\n ``distance_threshold`` is not ``None``.\n\n affinity : str or callable, default='euclidean'\n Metric used to compute the linkage. Can be \"euclidean\", \"l1\", \"l2\",\n \"manhattan\", \"cosine\", or 'precomputed'.\n If linkage is \"ward\", only \"euclidean\" is accepted.\n\n memory : str or object with the joblib.Memory interface, default=None\n Used to cache the output of the computation of the tree.\n By default, no caching is done. If a string is given, it is the\n path to the caching directory.\n\n connectivity : array-like or callable, default=None\n Connectivity matrix. 
Defines for each feature the neighboring\n features following a given structure of the data.\n This can be a connectivity matrix itself or a callable that transforms\n the data into a connectivity matrix, such as derived from\n `kneighbors_graph`. Default is `None`, i.e, the\n hierarchical clustering algorithm is unstructured.\n\n compute_full_tree : 'auto' or bool, default='auto'\n Stop early the construction of the tree at `n_clusters`. This is useful\n to decrease computation time if the number of clusters is not small\n compared to the number of features. This option is useful only when\n specifying a connectivity matrix. Note also that when varying the\n number of clusters and using caching, it may be advantageous to compute\n the full tree. It must be ``True`` if ``distance_threshold`` is not\n ``None``. By default `compute_full_tree` is \"auto\", which is equivalent\n to `True` when `distance_threshold` is not `None` or that `n_clusters`\n is inferior to the maximum between 100 or `0.02 * n_samples`.\n Otherwise, \"auto\" is equivalent to `False`.\n\n linkage : {\"ward\", \"complete\", \"average\", \"single\"}, default=\"ward\"\n Which linkage criterion to use. The linkage criterion determines which\n distance to use between sets of features. The algorithm will merge\n the pairs of cluster that minimize this criterion.\n\n - \"ward\" minimizes the variance of the clusters being merged.\n - \"complete\" or maximum linkage uses the maximum distances between\n all features of the two sets.\n - \"average\" uses the average of the distances of each feature of\n the two sets.\n - \"single\" uses the minimum of the distances between all features\n of the two sets.\n\n pooling_func : callable, default=np.mean\n This combines the values of agglomerated features into a single\n value, and should accept an array of shape [M, N] and the keyword\n argument `axis=1`, and reduce it to an array of size [M].\n\n distance_threshold : float, default=None\n The linkage distance threshold above which, clusters will not be\n merged. If not ``None``, ``n_clusters`` must be ``None`` and\n ``compute_full_tree`` must be ``True``.\n\n .. versionadded:: 0.21\n\n compute_distances : bool, default=False\n Computes distances between clusters even if `distance_threshold` is not\n used. This can be used to make dendrogram visualization, but introduces\n a computational and memory overhead.\n\n .. versionadded:: 0.24\n\n Attributes\n ----------\n n_clusters_ : int\n The number of clusters found by the algorithm. If\n ``distance_threshold=None``, it will be equal to the given\n ``n_clusters``.\n\n labels_ : array-like of (n_features,)\n Cluster labels for each feature.\n\n n_leaves_ : int\n Number of leaves in the hierarchical tree.\n\n n_connected_components_ : int\n The estimated number of connected components in the graph.\n\n .. versionadded:: 0.21\n ``n_connected_components_`` was added to replace ``n_components_``.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n children_ : array-like of shape (n_nodes-1, 2)\n The children of each non-leaf node. Values less than `n_features`\n correspond to leaves of the tree which are the original samples.\n A node `i` greater than or equal to `n_features` is a non-leaf\n node and has children `children_[i - n_features]`. 
Alternatively\n at the i-th iteration, children[i][0] and children[i][1]\n are merged to form node `n_features + i`.\n\n distances_ : array-like of shape (n_nodes-1,)\n Distances between nodes in the corresponding place in `children_`.\n Only computed if `distance_threshold` is used or `compute_distances`\n is set to `True`.\n\n See Also\n --------\n AgglomerativeClustering : Agglomerative clustering samples instead of\n features.\n ward_tree : Hierarchical clustering with ward linkage.\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn import datasets, cluster\n >>> digits = datasets.load_digits()\n >>> images = digits.images\n >>> X = np.reshape(images, (len(images), -1))\n >>> agglo = cluster.FeatureAgglomeration(n_clusters=32)\n >>> agglo.fit(X)\n FeatureAgglomeration(n_clusters=32)\n >>> X_reduced = agglo.transform(X)\n >>> X_reduced.shape\n (1797, 32)\n ", "source_code": "\n\nclass FeatureAgglomeration(AgglomerativeClustering, AgglomerationTransform):\n \"\"\"Agglomerate features.\n\n Recursively merges pair of clusters of features.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n n_clusters : int, default=2\n The number of clusters to find. It must be ``None`` if\n ``distance_threshold`` is not ``None``.\n\n affinity : str or callable, default='euclidean'\n Metric used to compute the linkage. Can be \"euclidean\", \"l1\", \"l2\",\n \"manhattan\", \"cosine\", or 'precomputed'.\n If linkage is \"ward\", only \"euclidean\" is accepted.\n\n memory : str or object with the joblib.Memory interface, default=None\n Used to cache the output of the computation of the tree.\n By default, no caching is done. If a string is given, it is the\n path to the caching directory.\n\n connectivity : array-like or callable, default=None\n Connectivity matrix. Defines for each feature the neighboring\n features following a given structure of the data.\n This can be a connectivity matrix itself or a callable that transforms\n the data into a connectivity matrix, such as derived from\n `kneighbors_graph`. Default is `None`, i.e, the\n hierarchical clustering algorithm is unstructured.\n\n compute_full_tree : 'auto' or bool, default='auto'\n Stop early the construction of the tree at `n_clusters`. This is useful\n to decrease computation time if the number of clusters is not small\n compared to the number of features. This option is useful only when\n specifying a connectivity matrix. Note also that when varying the\n number of clusters and using caching, it may be advantageous to compute\n the full tree. It must be ``True`` if ``distance_threshold`` is not\n ``None``. By default `compute_full_tree` is \"auto\", which is equivalent\n to `True` when `distance_threshold` is not `None` or that `n_clusters`\n is inferior to the maximum between 100 or `0.02 * n_samples`.\n Otherwise, \"auto\" is equivalent to `False`.\n\n linkage : {\"ward\", \"complete\", \"average\", \"single\"}, default=\"ward\"\n Which linkage criterion to use. The linkage criterion determines which\n distance to use between sets of features. 
The algorithm will merge\n the pairs of cluster that minimize this criterion.\n\n - \"ward\" minimizes the variance of the clusters being merged.\n - \"complete\" or maximum linkage uses the maximum distances between\n all features of the two sets.\n - \"average\" uses the average of the distances of each feature of\n the two sets.\n - \"single\" uses the minimum of the distances between all features\n of the two sets.\n\n pooling_func : callable, default=np.mean\n This combines the values of agglomerated features into a single\n value, and should accept an array of shape [M, N] and the keyword\n argument `axis=1`, and reduce it to an array of size [M].\n\n distance_threshold : float, default=None\n The linkage distance threshold above which, clusters will not be\n merged. If not ``None``, ``n_clusters`` must be ``None`` and\n ``compute_full_tree`` must be ``True``.\n\n .. versionadded:: 0.21\n\n compute_distances : bool, default=False\n Computes distances between clusters even if `distance_threshold` is not\n used. This can be used to make dendrogram visualization, but introduces\n a computational and memory overhead.\n\n .. versionadded:: 0.24\n\n Attributes\n ----------\n n_clusters_ : int\n The number of clusters found by the algorithm. If\n ``distance_threshold=None``, it will be equal to the given\n ``n_clusters``.\n\n labels_ : array-like of (n_features,)\n Cluster labels for each feature.\n\n n_leaves_ : int\n Number of leaves in the hierarchical tree.\n\n n_connected_components_ : int\n The estimated number of connected components in the graph.\n\n .. versionadded:: 0.21\n ``n_connected_components_`` was added to replace ``n_components_``.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n children_ : array-like of shape (n_nodes-1, 2)\n The children of each non-leaf node. Values less than `n_features`\n correspond to leaves of the tree which are the original samples.\n A node `i` greater than or equal to `n_features` is a non-leaf\n node and has children `children_[i - n_features]`. 
Alternatively\n at the i-th iteration, children[i][0] and children[i][1]\n are merged to form node `n_features + i`.\n\n distances_ : array-like of shape (n_nodes-1,)\n Distances between nodes in the corresponding place in `children_`.\n Only computed if `distance_threshold` is used or `compute_distances`\n is set to `True`.\n\n See Also\n --------\n AgglomerativeClustering : Agglomerative clustering samples instead of\n features.\n ward_tree : Hierarchical clustering with ward linkage.\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn import datasets, cluster\n >>> digits = datasets.load_digits()\n >>> images = digits.images\n >>> X = np.reshape(images, (len(images), -1))\n >>> agglo = cluster.FeatureAgglomeration(n_clusters=32)\n >>> agglo.fit(X)\n FeatureAgglomeration(n_clusters=32)\n >>> X_reduced = agglo.transform(X)\n >>> X_reduced.shape\n (1797, 32)\n \"\"\"\n \n def __init__(self, n_clusters=2, *, affinity='euclidean', memory=None, connectivity=None, compute_full_tree='auto', linkage='ward', pooling_func=np.mean, distance_threshold=None, compute_distances=False):\n super().__init__(n_clusters=n_clusters, memory=memory, connectivity=connectivity, compute_full_tree=compute_full_tree, linkage=linkage, affinity=affinity, distance_threshold=distance_threshold, compute_distances=compute_distances)\n self.pooling_func = pooling_func\n \n def fit(self, X, y=None):\n \"\"\"Fit the hierarchical clustering on the data.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The data.\n\n y : Ignored\n Not used, present here for API consistency by convention.\n\n Returns\n -------\n self : object\n Returns the transformer.\n \"\"\"\n X = self._validate_data(X, ensure_min_features=2, estimator=self)\n super()._fit(X.T)\n return self\n \n @property\n def fit_predict(self):\n \"\"\"Fit and return the result of each sample's clustering assignment.\"\"\"\n raise AttributeError\n" }, @@ -19895,7 +19960,7 @@ "is_public": false, "description": "Base class for spectral biclustering.", "docstring": "Base class for spectral biclustering.", - "source_code": "\n\nclass BaseSpectral(BiclusterMixin, BaseEstimator, metaclass=ABCMeta):\n \"\"\"Base class for spectral biclustering.\"\"\"\n \n @abstractmethod\n def __init__(self, n_clusters=3, svd_method='randomized', n_svd_vecs=None, mini_batch=False, init='k-means++', n_init=10, random_state=None):\n self.n_clusters = n_clusters\n self.svd_method = svd_method\n self.n_svd_vecs = n_svd_vecs\n self.mini_batch = mini_batch\n self.init = init\n self.n_init = n_init\n self.random_state = random_state\n \n def _check_parameters(self):\n legal_svd_methods = ('randomized', 'arpack')\n if self.svd_method not in legal_svd_methods:\n raise ValueError(\"Unknown SVD method: '{0}'. 
svd_method must be one of {1}.\".format(self.svd_method, legal_svd_methods))\n \n def fit(self, X, y=None):\n \"\"\"Create a biclustering for X.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training data.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n Returns\n -------\n self : object\n SpectralBiclustering instance.\n \"\"\"\n X = self._validate_data(X, accept_sparse='csr', dtype=np.float64)\n self._check_parameters()\n self._fit(X)\n return self\n \n def _svd(self, array, n_components, n_discard):\n \"\"\"Returns first `n_components` left and right singular\n vectors u and v, discarding the first `n_discard`.\n\n \"\"\"\n if self.svd_method == 'randomized':\n kwargs = {}\n if self.n_svd_vecs is not None:\n kwargs['n_oversamples'] = self.n_svd_vecs\n (u, _, vt) = randomized_svd(array, n_components, random_state=self.random_state, **kwargs)\n elif self.svd_method == 'arpack':\n (u, _, vt) = svds(array, k=n_components, ncv=self.n_svd_vecs)\n if np.any(np.isnan(vt)):\n A = safe_sparse_dot(array.T, array)\n random_state = check_random_state(self.random_state)\n v0 = random_state.uniform(-1, 1, A.shape[0])\n (_, v) = eigsh(A, ncv=self.n_svd_vecs, v0=v0)\n vt = v.T\n if np.any(np.isnan(u)):\n A = safe_sparse_dot(array, array.T)\n random_state = check_random_state(self.random_state)\n v0 = random_state.uniform(-1, 1, A.shape[0])\n (_, u) = eigsh(A, ncv=self.n_svd_vecs, v0=v0)\n assert_all_finite(u)\n assert_all_finite(vt)\n u = u[:, n_discard:]\n vt = vt[n_discard:]\n return u, vt.T\n \n def _k_means(self, data, n_clusters):\n if self.mini_batch:\n model = MiniBatchKMeans(n_clusters, init=self.init, n_init=self.n_init, random_state=self.random_state)\n else:\n model = KMeans(n_clusters, init=self.init, n_init=self.n_init, random_state=self.random_state)\n model.fit(data)\n centroid = model.cluster_centers_\n labels = model.labels_\n return centroid, labels\n \n def _more_tags(self):\n return {'_xfail_checks': {'check_estimators_dtypes': 'raises nan error', 'check_fit2d_1sample': '_scale_normalize fails', 'check_fit2d_1feature': 'raises apply_along_axis error', 'check_estimator_sparse_data': 'does not fail gracefully', 'check_methods_subset_invariance': 'empty array passed inside', 'check_dont_overwrite_parameters': 'empty array passed inside', 'check_fit2d_predict1d': 'empty array passed inside'}}\n" + "source_code": "\n\nclass BaseSpectral(BiclusterMixin, BaseEstimator, metaclass=ABCMeta):\n \"\"\"Base class for spectral biclustering.\"\"\"\n \n @abstractmethod\n def __init__(self, n_clusters=3, svd_method='randomized', n_svd_vecs=None, mini_batch=False, init='k-means++', n_init=10, random_state=None):\n self.n_clusters = n_clusters\n self.svd_method = svd_method\n self.n_svd_vecs = n_svd_vecs\n self.mini_batch = mini_batch\n self.init = init\n self.n_init = n_init\n self.random_state = random_state\n \n def _check_parameters(self):\n legal_svd_methods = ('randomized', 'arpack')\n if self.svd_method not in legal_svd_methods:\n raise ValueError(\"Unknown SVD method: '{0}'. 
svd_method must be one of {1}.\".format(self.svd_method, legal_svd_methods))\n \n def fit(self, X, y=None):\n \"\"\"Create a biclustering for X.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training data.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n Returns\n -------\n self : object\n SpectralBiclustering instance.\n \"\"\"\n X = self._validate_data(X, accept_sparse='csr', dtype=np.float64)\n self._check_parameters()\n self._fit(X)\n return self\n \n def _svd(self, array, n_components, n_discard):\n \"\"\"Returns first `n_components` left and right singular\n vectors u and v, discarding the first `n_discard`.\n \"\"\"\n if self.svd_method == 'randomized':\n kwargs = {}\n if self.n_svd_vecs is not None:\n kwargs['n_oversamples'] = self.n_svd_vecs\n (u, _, vt) = randomized_svd(array, n_components, random_state=self.random_state, **kwargs)\n elif self.svd_method == 'arpack':\n (u, _, vt) = svds(array, k=n_components, ncv=self.n_svd_vecs)\n if np.any(np.isnan(vt)):\n A = safe_sparse_dot(array.T, array)\n random_state = check_random_state(self.random_state)\n v0 = random_state.uniform(-1, 1, A.shape[0])\n (_, v) = eigsh(A, ncv=self.n_svd_vecs, v0=v0)\n vt = v.T\n if np.any(np.isnan(u)):\n A = safe_sparse_dot(array, array.T)\n random_state = check_random_state(self.random_state)\n v0 = random_state.uniform(-1, 1, A.shape[0])\n (_, u) = eigsh(A, ncv=self.n_svd_vecs, v0=v0)\n assert_all_finite(u)\n assert_all_finite(vt)\n u = u[:, n_discard:]\n vt = vt[n_discard:]\n return u, vt.T\n \n def _k_means(self, data, n_clusters):\n if self.mini_batch:\n model = MiniBatchKMeans(n_clusters, init=self.init, n_init=self.n_init, random_state=self.random_state)\n else:\n model = KMeans(n_clusters, init=self.init, n_init=self.n_init, random_state=self.random_state)\n model.fit(data)\n centroid = model.cluster_centers_\n labels = model.labels_\n return centroid, labels\n \n def _more_tags(self):\n return {'_xfail_checks': {'check_estimators_dtypes': 'raises nan error', 'check_fit2d_1sample': '_scale_normalize fails', 'check_fit2d_1feature': 'raises apply_along_axis error', 'check_estimator_sparse_data': 'does not fail gracefully', 'check_methods_subset_invariance': 'empty array passed inside', 'check_dont_overwrite_parameters': 'empty array passed inside', 'check_fit2d_predict1d': 'empty array passed inside'}}\n" }, { "name": "SpectralBiclustering", @@ -19910,7 +19975,7 @@ "sklearn.cluster._bicluster.SpectralBiclustering._project_and_cluster" ], "is_public": true, - "description": "Spectral biclustering (Kluger, 2003).\n\nPartitions rows and columns under the assumption that the data has an underlying checkerboard structure. For instance, if there are two row partitions and three column partitions, each row will belong to three biclusters, and each column will belong to two biclusters. The outer product of the corresponding row and column label vectors gives this checkerboard structure. Read more in the :ref:`User Guide `.", + "description": "Spectral biclustering (Kluger, 2003).\n\nPartitions rows and columns under the assumption that the data has\nan underlying checkerboard structure. For instance, if there are\ntwo row partitions and three column partitions, each row will\nbelong to three biclusters, and each column will belong to two\nbiclusters. 
The outer product of the corresponding row and column\nlabel vectors gives this checkerboard structure.\n\nRead more in the :ref:`User Guide `.", "docstring": "Spectral biclustering (Kluger, 2003).\n\n Partitions rows and columns under the assumption that the data has\n an underlying checkerboard structure. For instance, if there are\n two row partitions and three column partitions, each row will\n belong to three biclusters, and each column will belong to two\n biclusters. The outer product of the corresponding row and column\n label vectors gives this checkerboard structure.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n n_clusters : int or tuple (n_row_clusters, n_column_clusters), default=3\n The number of row and column clusters in the checkerboard\n structure.\n\n method : {'bistochastic', 'scale', 'log'}, default='bistochastic'\n Method of normalizing and converting singular vectors into\n biclusters. May be one of 'scale', 'bistochastic', or 'log'.\n The authors recommend using 'log'. If the data is sparse,\n however, log normalization will not work, which is why the\n default is 'bistochastic'.\n\n .. warning::\n if `method='log'`, the data must be sparse.\n\n n_components : int, default=6\n Number of singular vectors to check.\n\n n_best : int, default=3\n Number of best singular vectors to which to project the data\n for clustering.\n\n svd_method : {'randomized', 'arpack'}, default='randomized'\n Selects the algorithm for finding singular vectors. May be\n 'randomized' or 'arpack'. If 'randomized', uses\n :func:`~sklearn.utils.extmath.randomized_svd`, which may be faster\n for large matrices. If 'arpack', uses\n `scipy.sparse.linalg.svds`, which is more accurate, but\n possibly slower in some cases.\n\n n_svd_vecs : int, default=None\n Number of vectors to use in calculating the SVD. Corresponds\n to `ncv` when `svd_method=arpack` and `n_oversamples` when\n `svd_method` is 'randomized`.\n\n mini_batch : bool, default=False\n Whether to use mini-batch k-means, which is faster but may get\n different results.\n\n init : {'k-means++', 'random'} or ndarray of (n_clusters, n_features), default='k-means++'\n Method for initialization of k-means algorithm; defaults to\n 'k-means++'.\n\n n_init : int, default=10\n Number of random initializations that are tried with the\n k-means algorithm.\n\n If mini-batch k-means is used, the best initialization is\n chosen and the algorithm runs once. Otherwise, the algorithm\n is run for each initialization and the best solution chosen.\n\n random_state : int, RandomState instance, default=None\n Used for randomizing the singular value decomposition and the k-means\n initialization. Use an int to make the randomness deterministic.\n See :term:`Glossary `.\n\n Attributes\n ----------\n rows_ : array-like of shape (n_row_clusters, n_rows)\n Results of the clustering. `rows[i, r]` is True if\n cluster `i` contains row `r`. Available only after calling ``fit``.\n\n columns_ : array-like of shape (n_column_clusters, n_columns)\n Results of the clustering, like `rows`.\n\n row_labels_ : array-like of shape (n_rows,)\n Row partition labels.\n\n column_labels_ : array-like of shape (n_cols,)\n Column partition labels.\n\n biclusters_ : tuple of two ndarrays\n The tuple contains the `rows_` and `columns_` arrays.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. 
Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n SpectralCoclustering : Spectral Co-Clustering algorithm (Dhillon, 2001).\n\n References\n ----------\n\n * Kluger, Yuval, et. al., 2003. `Spectral biclustering of microarray\n data: coclustering genes and conditions\n `__.\n\n Examples\n --------\n >>> from sklearn.cluster import SpectralBiclustering\n >>> import numpy as np\n >>> X = np.array([[1, 1], [2, 1], [1, 0],\n ... [4, 7], [3, 5], [3, 6]])\n >>> clustering = SpectralBiclustering(n_clusters=2, random_state=0).fit(X)\n >>> clustering.row_labels_\n array([1, 1, 1, 0, 0, 0], dtype=int32)\n >>> clustering.column_labels_\n array([0, 1], dtype=int32)\n >>> clustering\n SpectralBiclustering(n_clusters=2, random_state=0)\n ", "source_code": "\n\nclass SpectralBiclustering(BaseSpectral):\n \"\"\"Spectral biclustering (Kluger, 2003).\n\n Partitions rows and columns under the assumption that the data has\n an underlying checkerboard structure. For instance, if there are\n two row partitions and three column partitions, each row will\n belong to three biclusters, and each column will belong to two\n biclusters. The outer product of the corresponding row and column\n label vectors gives this checkerboard structure.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n n_clusters : int or tuple (n_row_clusters, n_column_clusters), default=3\n The number of row and column clusters in the checkerboard\n structure.\n\n method : {'bistochastic', 'scale', 'log'}, default='bistochastic'\n Method of normalizing and converting singular vectors into\n biclusters. May be one of 'scale', 'bistochastic', or 'log'.\n The authors recommend using 'log'. If the data is sparse,\n however, log normalization will not work, which is why the\n default is 'bistochastic'.\n\n .. warning::\n if `method='log'`, the data must be sparse.\n\n n_components : int, default=6\n Number of singular vectors to check.\n\n n_best : int, default=3\n Number of best singular vectors to which to project the data\n for clustering.\n\n svd_method : {'randomized', 'arpack'}, default='randomized'\n Selects the algorithm for finding singular vectors. May be\n 'randomized' or 'arpack'. If 'randomized', uses\n :func:`~sklearn.utils.extmath.randomized_svd`, which may be faster\n for large matrices. If 'arpack', uses\n `scipy.sparse.linalg.svds`, which is more accurate, but\n possibly slower in some cases.\n\n n_svd_vecs : int, default=None\n Number of vectors to use in calculating the SVD. Corresponds\n to `ncv` when `svd_method=arpack` and `n_oversamples` when\n `svd_method` is 'randomized`.\n\n mini_batch : bool, default=False\n Whether to use mini-batch k-means, which is faster but may get\n different results.\n\n init : {'k-means++', 'random'} or ndarray of (n_clusters, n_features), default='k-means++'\n Method for initialization of k-means algorithm; defaults to\n 'k-means++'.\n\n n_init : int, default=10\n Number of random initializations that are tried with the\n k-means algorithm.\n\n If mini-batch k-means is used, the best initialization is\n chosen and the algorithm runs once. Otherwise, the algorithm\n is run for each initialization and the best solution chosen.\n\n random_state : int, RandomState instance, default=None\n Used for randomizing the singular value decomposition and the k-means\n initialization. 
Use an int to make the randomness deterministic.\n See :term:`Glossary `.\n\n Attributes\n ----------\n rows_ : array-like of shape (n_row_clusters, n_rows)\n Results of the clustering. `rows[i, r]` is True if\n cluster `i` contains row `r`. Available only after calling ``fit``.\n\n columns_ : array-like of shape (n_column_clusters, n_columns)\n Results of the clustering, like `rows`.\n\n row_labels_ : array-like of shape (n_rows,)\n Row partition labels.\n\n column_labels_ : array-like of shape (n_cols,)\n Column partition labels.\n\n biclusters_ : tuple of two ndarrays\n The tuple contains the `rows_` and `columns_` arrays.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n SpectralCoclustering : Spectral Co-Clustering algorithm (Dhillon, 2001).\n\n References\n ----------\n\n * Kluger, Yuval, et. al., 2003. `Spectral biclustering of microarray\n data: coclustering genes and conditions\n `__.\n\n Examples\n --------\n >>> from sklearn.cluster import SpectralBiclustering\n >>> import numpy as np\n >>> X = np.array([[1, 1], [2, 1], [1, 0],\n ... [4, 7], [3, 5], [3, 6]])\n >>> clustering = SpectralBiclustering(n_clusters=2, random_state=0).fit(X)\n >>> clustering.row_labels_\n array([1, 1, 1, 0, 0, 0], dtype=int32)\n >>> clustering.column_labels_\n array([0, 1], dtype=int32)\n >>> clustering\n SpectralBiclustering(n_clusters=2, random_state=0)\n \"\"\"\n \n def __init__(self, n_clusters=3, *, method='bistochastic', n_components=6, n_best=3, svd_method='randomized', n_svd_vecs=None, mini_batch=False, init='k-means++', n_init=10, random_state=None):\n super().__init__(n_clusters, svd_method, n_svd_vecs, mini_batch, init, n_init, random_state)\n self.method = method\n self.n_components = n_components\n self.n_best = n_best\n \n def _check_parameters(self):\n super()._check_parameters()\n legal_methods = ('bistochastic', 'scale', 'log')\n if self.method not in legal_methods:\n raise ValueError(\"Unknown method: '{0}'. method must be one of {1}.\".format(self.method, legal_methods))\n try:\n int(self.n_clusters)\n except TypeError:\n try:\n (r, c) = self.n_clusters\n int(r)\n int(c)\n except (ValueError, TypeError) as e:\n raise ValueError('Incorrect parameter n_clusters has value: {}. 
It should either be a single integer or an iterable with two integers: (n_row_clusters, n_column_clusters)') from e\n if self.n_components < 1:\n raise ValueError('Parameter n_components must be greater than 0, but its value is {}'.format(self.n_components))\n if self.n_best < 1:\n raise ValueError('Parameter n_best must be greater than 0, but its value is {}'.format(self.n_best))\n if self.n_best > self.n_components:\n raise ValueError('n_best cannot be larger than n_components, but {} > {}'.format(self.n_best, self.n_components))\n \n def _fit(self, X):\n n_sv = self.n_components\n if self.method == 'bistochastic':\n normalized_data = _bistochastic_normalize(X)\n n_sv += 1\n elif self.method == 'scale':\n (normalized_data, _, _) = _scale_normalize(X)\n n_sv += 1\n elif self.method == 'log':\n normalized_data = _log_normalize(X)\n n_discard = 0 if self.method == 'log' else 1\n (u, v) = self._svd(normalized_data, n_sv, n_discard)\n ut = u.T\n vt = v.T\n try:\n (n_row_clusters, n_col_clusters) = self.n_clusters\n except TypeError:\n n_row_clusters = n_col_clusters = self.n_clusters\n best_ut = self._fit_best_piecewise(ut, self.n_best, n_row_clusters)\n best_vt = self._fit_best_piecewise(vt, self.n_best, n_col_clusters)\n self.row_labels_ = self._project_and_cluster(X, best_vt.T, n_row_clusters)\n self.column_labels_ = self._project_and_cluster(X.T, best_ut.T, n_col_clusters)\n self.rows_ = np.vstack([self.row_labels_ == label for label in range(n_row_clusters) for _ in range(n_col_clusters)])\n self.columns_ = np.vstack([self.column_labels_ == label for _ in range(n_row_clusters) for label in range(n_col_clusters)])\n \n def _fit_best_piecewise(self, vectors, n_best, n_clusters):\n \"\"\"Find the ``n_best`` vectors that are best approximated by piecewise\n constant vectors.\n\n The piecewise vectors are found by k-means; the best is chosen\n according to Euclidean distance.\n\n \"\"\"\n \n def make_piecewise(v):\n (centroid, labels) = self._k_means(v.reshape(-1, 1), n_clusters)\n return centroid[labels].ravel()\n piecewise_vectors = np.apply_along_axis(make_piecewise, axis=1, arr=vectors)\n dists = np.apply_along_axis(norm, axis=1, arr=vectors - piecewise_vectors)\n result = vectors[np.argsort(dists)[:n_best]]\n return result\n \n def _project_and_cluster(self, data, vectors, n_clusters):\n \"\"\"Project ``data`` to ``vectors`` and cluster the result.\"\"\"\n projected = safe_sparse_dot(data, vectors)\n (_, labels) = self._k_means(projected, n_clusters)\n return labels\n" }, @@ -19924,9 +19989,9 @@ "sklearn.cluster._bicluster.SpectralCoclustering._fit" ], "is_public": true, - "description": "Spectral Co-Clustering algorithm (Dhillon, 2001).\n\nClusters rows and columns of an array `X` to solve the relaxed normalized cut of the bipartite graph created from `X` as follows: the edge between row vertex `i` and column vertex `j` has weight `X[i, j]`. The resulting bicluster structure is block-diagonal, since each row and each column belongs to exactly one bicluster. Supports sparse matrices, as long as they are nonnegative. 
Read more in the :ref:`User Guide `.", - "docstring": "Spectral Co-Clustering algorithm (Dhillon, 2001).\n\n Clusters rows and columns of an array `X` to solve the relaxed\n normalized cut of the bipartite graph created from `X` as follows:\n the edge between row vertex `i` and column vertex `j` has weight\n `X[i, j]`.\n\n The resulting bicluster structure is block-diagonal, since each\n row and each column belongs to exactly one bicluster.\n\n Supports sparse matrices, as long as they are nonnegative.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n n_clusters : int, default=3\n The number of biclusters to find.\n\n svd_method : {'randomized', 'arpack'}, default='randomized'\n Selects the algorithm for finding singular vectors. May be\n 'randomized' or 'arpack'. If 'randomized', use\n :func:`sklearn.utils.extmath.randomized_svd`, which may be faster\n for large matrices. If 'arpack', use\n :func:`scipy.sparse.linalg.svds`, which is more accurate, but\n possibly slower in some cases.\n\n n_svd_vecs : int, default=None\n Number of vectors to use in calculating the SVD. Corresponds\n to `ncv` when `svd_method=arpack` and `n_oversamples` when\n `svd_method` is 'randomized`.\n\n mini_batch : bool, default=False\n Whether to use mini-batch k-means, which is faster but may get\n different results.\n\n init : {'k-means++', 'random', or ndarray of shape (n_clusters, n_features), default='k-means++'\n Method for initialization of k-means algorithm; defaults to\n 'k-means++'.\n\n n_init : int, default=10\n Number of random initializations that are tried with the\n k-means algorithm.\n\n If mini-batch k-means is used, the best initialization is\n chosen and the algorithm runs once. Otherwise, the algorithm\n is run for each initialization and the best solution chosen.\n\n random_state : int, RandomState instance, default=None\n Used for randomizing the singular value decomposition and the k-means\n initialization. Use an int to make the randomness deterministic.\n See :term:`Glossary `.\n\n Attributes\n ----------\n rows_ : array-like of shape (n_row_clusters, n_rows)\n Results of the clustering. `rows[i, r]` is True if\n cluster `i` contains row `r`. Available only after calling ``fit``.\n\n columns_ : array-like of shape (n_column_clusters, n_columns)\n Results of the clustering, like `rows`.\n\n row_labels_ : array-like of shape (n_rows,)\n The bicluster label of each row.\n\n column_labels_ : array-like of shape (n_cols,)\n The bicluster label of each column.\n\n biclusters_ : tuple of two ndarrays\n The tuple contains the `rows_` and `columns_` arrays.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n Examples\n --------\n >>> from sklearn.cluster import SpectralCoclustering\n >>> import numpy as np\n >>> X = np.array([[1, 1], [2, 1], [1, 0],\n ... [4, 7], [3, 5], [3, 6]])\n >>> clustering = SpectralCoclustering(n_clusters=2, random_state=0).fit(X)\n >>> clustering.row_labels_ #doctest: +SKIP\n array([0, 1, 1, 0, 0, 0], dtype=int32)\n >>> clustering.column_labels_ #doctest: +SKIP\n array([0, 0], dtype=int32)\n >>> clustering\n SpectralCoclustering(n_clusters=2, random_state=0)\n\n References\n ----------\n\n * Dhillon, Inderjit S, 2001. 
`Co-clustering documents and words using\n bipartite spectral graph partitioning\n `__.\n\n ", - "source_code": "\n\nclass SpectralCoclustering(BaseSpectral):\n \"\"\"Spectral Co-Clustering algorithm (Dhillon, 2001).\n\n Clusters rows and columns of an array `X` to solve the relaxed\n normalized cut of the bipartite graph created from `X` as follows:\n the edge between row vertex `i` and column vertex `j` has weight\n `X[i, j]`.\n\n The resulting bicluster structure is block-diagonal, since each\n row and each column belongs to exactly one bicluster.\n\n Supports sparse matrices, as long as they are nonnegative.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n n_clusters : int, default=3\n The number of biclusters to find.\n\n svd_method : {'randomized', 'arpack'}, default='randomized'\n Selects the algorithm for finding singular vectors. May be\n 'randomized' or 'arpack'. If 'randomized', use\n :func:`sklearn.utils.extmath.randomized_svd`, which may be faster\n for large matrices. If 'arpack', use\n :func:`scipy.sparse.linalg.svds`, which is more accurate, but\n possibly slower in some cases.\n\n n_svd_vecs : int, default=None\n Number of vectors to use in calculating the SVD. Corresponds\n to `ncv` when `svd_method=arpack` and `n_oversamples` when\n `svd_method` is 'randomized`.\n\n mini_batch : bool, default=False\n Whether to use mini-batch k-means, which is faster but may get\n different results.\n\n init : {'k-means++', 'random', or ndarray of shape (n_clusters, n_features), default='k-means++'\n Method for initialization of k-means algorithm; defaults to\n 'k-means++'.\n\n n_init : int, default=10\n Number of random initializations that are tried with the\n k-means algorithm.\n\n If mini-batch k-means is used, the best initialization is\n chosen and the algorithm runs once. Otherwise, the algorithm\n is run for each initialization and the best solution chosen.\n\n random_state : int, RandomState instance, default=None\n Used for randomizing the singular value decomposition and the k-means\n initialization. Use an int to make the randomness deterministic.\n See :term:`Glossary `.\n\n Attributes\n ----------\n rows_ : array-like of shape (n_row_clusters, n_rows)\n Results of the clustering. `rows[i, r]` is True if\n cluster `i` contains row `r`. Available only after calling ``fit``.\n\n columns_ : array-like of shape (n_column_clusters, n_columns)\n Results of the clustering, like `rows`.\n\n row_labels_ : array-like of shape (n_rows,)\n The bicluster label of each row.\n\n column_labels_ : array-like of shape (n_cols,)\n The bicluster label of each column.\n\n biclusters_ : tuple of two ndarrays\n The tuple contains the `rows_` and `columns_` arrays.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n Examples\n --------\n >>> from sklearn.cluster import SpectralCoclustering\n >>> import numpy as np\n >>> X = np.array([[1, 1], [2, 1], [1, 0],\n ... 
[4, 7], [3, 5], [3, 6]])\n >>> clustering = SpectralCoclustering(n_clusters=2, random_state=0).fit(X)\n >>> clustering.row_labels_ #doctest: +SKIP\n array([0, 1, 1, 0, 0, 0], dtype=int32)\n >>> clustering.column_labels_ #doctest: +SKIP\n array([0, 0], dtype=int32)\n >>> clustering\n SpectralCoclustering(n_clusters=2, random_state=0)\n\n References\n ----------\n\n * Dhillon, Inderjit S, 2001. `Co-clustering documents and words using\n bipartite spectral graph partitioning\n `__.\n\n \"\"\"\n \n def __init__(self, n_clusters=3, *, svd_method='randomized', n_svd_vecs=None, mini_batch=False, init='k-means++', n_init=10, random_state=None):\n super().__init__(n_clusters, svd_method, n_svd_vecs, mini_batch, init, n_init, random_state)\n \n def _fit(self, X):\n (normalized_data, row_diag, col_diag) = _scale_normalize(X)\n n_sv = 1 + int(np.ceil(np.log2(self.n_clusters)))\n (u, v) = self._svd(normalized_data, n_sv, n_discard=1)\n z = np.vstack((row_diag[:, np.newaxis] * u, col_diag[:, np.newaxis] * v))\n (_, labels) = self._k_means(z, self.n_clusters)\n n_rows = X.shape[0]\n self.row_labels_ = labels[:n_rows]\n self.column_labels_ = labels[n_rows:]\n self.rows_ = np.vstack([self.row_labels_ == c for c in range(self.n_clusters)])\n self.columns_ = np.vstack([self.column_labels_ == c for c in range(self.n_clusters)])\n" + "description": "Spectral Co-Clustering algorithm (Dhillon, 2001).\n\nClusters rows and columns of an array `X` to solve the relaxed\nnormalized cut of the bipartite graph created from `X` as follows:\nthe edge between row vertex `i` and column vertex `j` has weight\n`X[i, j]`.\n\nThe resulting bicluster structure is block-diagonal, since each\nrow and each column belongs to exactly one bicluster.\n\nSupports sparse matrices, as long as they are nonnegative.\n\nRead more in the :ref:`User Guide `.", + "docstring": "Spectral Co-Clustering algorithm (Dhillon, 2001).\n\n Clusters rows and columns of an array `X` to solve the relaxed\n normalized cut of the bipartite graph created from `X` as follows:\n the edge between row vertex `i` and column vertex `j` has weight\n `X[i, j]`.\n\n The resulting bicluster structure is block-diagonal, since each\n row and each column belongs to exactly one bicluster.\n\n Supports sparse matrices, as long as they are nonnegative.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n n_clusters : int, default=3\n The number of biclusters to find.\n\n svd_method : {'randomized', 'arpack'}, default='randomized'\n Selects the algorithm for finding singular vectors. May be\n 'randomized' or 'arpack'. If 'randomized', use\n :func:`sklearn.utils.extmath.randomized_svd`, which may be faster\n for large matrices. If 'arpack', use\n :func:`scipy.sparse.linalg.svds`, which is more accurate, but\n possibly slower in some cases.\n\n n_svd_vecs : int, default=None\n Number of vectors to use in calculating the SVD. Corresponds\n to `ncv` when `svd_method=arpack` and `n_oversamples` when\n `svd_method` is 'randomized`.\n\n mini_batch : bool, default=False\n Whether to use mini-batch k-means, which is faster but may get\n different results.\n\n init : {'k-means++', 'random', or ndarray of shape (n_clusters, n_features), default='k-means++'\n Method for initialization of k-means algorithm; defaults to\n 'k-means++'.\n\n n_init : int, default=10\n Number of random initializations that are tried with the\n k-means algorithm.\n\n If mini-batch k-means is used, the best initialization is\n chosen and the algorithm runs once. 
Otherwise, the algorithm\n is run for each initialization and the best solution chosen.\n\n random_state : int, RandomState instance, default=None\n Used for randomizing the singular value decomposition and the k-means\n initialization. Use an int to make the randomness deterministic.\n See :term:`Glossary `.\n\n Attributes\n ----------\n rows_ : array-like of shape (n_row_clusters, n_rows)\n Results of the clustering. `rows[i, r]` is True if\n cluster `i` contains row `r`. Available only after calling ``fit``.\n\n columns_ : array-like of shape (n_column_clusters, n_columns)\n Results of the clustering, like `rows`.\n\n row_labels_ : array-like of shape (n_rows,)\n The bicluster label of each row.\n\n column_labels_ : array-like of shape (n_cols,)\n The bicluster label of each column.\n\n biclusters_ : tuple of two ndarrays\n The tuple contains the `rows_` and `columns_` arrays.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n SpectralBiclustering : Partitions rows and columns under the assumption\n that the data has an underlying checkerboard structure.\n\n References\n ----------\n * Dhillon, Inderjit S, 2001. `Co-clustering documents and words using\n bipartite spectral graph partitioning\n `__.\n\n Examples\n --------\n >>> from sklearn.cluster import SpectralCoclustering\n >>> import numpy as np\n >>> X = np.array([[1, 1], [2, 1], [1, 0],\n ... [4, 7], [3, 5], [3, 6]])\n >>> clustering = SpectralCoclustering(n_clusters=2, random_state=0).fit(X)\n >>> clustering.row_labels_ #doctest: +SKIP\n array([0, 1, 1, 0, 0, 0], dtype=int32)\n >>> clustering.column_labels_ #doctest: +SKIP\n array([0, 0], dtype=int32)\n >>> clustering\n SpectralCoclustering(n_clusters=2, random_state=0)\n ", + "source_code": "\n\nclass SpectralCoclustering(BaseSpectral):\n \"\"\"Spectral Co-Clustering algorithm (Dhillon, 2001).\n\n Clusters rows and columns of an array `X` to solve the relaxed\n normalized cut of the bipartite graph created from `X` as follows:\n the edge between row vertex `i` and column vertex `j` has weight\n `X[i, j]`.\n\n The resulting bicluster structure is block-diagonal, since each\n row and each column belongs to exactly one bicluster.\n\n Supports sparse matrices, as long as they are nonnegative.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n n_clusters : int, default=3\n The number of biclusters to find.\n\n svd_method : {'randomized', 'arpack'}, default='randomized'\n Selects the algorithm for finding singular vectors. May be\n 'randomized' or 'arpack'. If 'randomized', use\n :func:`sklearn.utils.extmath.randomized_svd`, which may be faster\n for large matrices. If 'arpack', use\n :func:`scipy.sparse.linalg.svds`, which is more accurate, but\n possibly slower in some cases.\n\n n_svd_vecs : int, default=None\n Number of vectors to use in calculating the SVD. 
Corresponds\n to `ncv` when `svd_method=arpack` and `n_oversamples` when\n `svd_method` is 'randomized`.\n\n mini_batch : bool, default=False\n Whether to use mini-batch k-means, which is faster but may get\n different results.\n\n init : {'k-means++', 'random', or ndarray of shape (n_clusters, n_features), default='k-means++'\n Method for initialization of k-means algorithm; defaults to\n 'k-means++'.\n\n n_init : int, default=10\n Number of random initializations that are tried with the\n k-means algorithm.\n\n If mini-batch k-means is used, the best initialization is\n chosen and the algorithm runs once. Otherwise, the algorithm\n is run for each initialization and the best solution chosen.\n\n random_state : int, RandomState instance, default=None\n Used for randomizing the singular value decomposition and the k-means\n initialization. Use an int to make the randomness deterministic.\n See :term:`Glossary `.\n\n Attributes\n ----------\n rows_ : array-like of shape (n_row_clusters, n_rows)\n Results of the clustering. `rows[i, r]` is True if\n cluster `i` contains row `r`. Available only after calling ``fit``.\n\n columns_ : array-like of shape (n_column_clusters, n_columns)\n Results of the clustering, like `rows`.\n\n row_labels_ : array-like of shape (n_rows,)\n The bicluster label of each row.\n\n column_labels_ : array-like of shape (n_cols,)\n The bicluster label of each column.\n\n biclusters_ : tuple of two ndarrays\n The tuple contains the `rows_` and `columns_` arrays.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n SpectralBiclustering : Partitions rows and columns under the assumption\n that the data has an underlying checkerboard structure.\n\n References\n ----------\n * Dhillon, Inderjit S, 2001. `Co-clustering documents and words using\n bipartite spectral graph partitioning\n `__.\n\n Examples\n --------\n >>> from sklearn.cluster import SpectralCoclustering\n >>> import numpy as np\n >>> X = np.array([[1, 1], [2, 1], [1, 0],\n ... 
[4, 7], [3, 5], [3, 6]])\n >>> clustering = SpectralCoclustering(n_clusters=2, random_state=0).fit(X)\n >>> clustering.row_labels_ #doctest: +SKIP\n array([0, 1, 1, 0, 0, 0], dtype=int32)\n >>> clustering.column_labels_ #doctest: +SKIP\n array([0, 0], dtype=int32)\n >>> clustering\n SpectralCoclustering(n_clusters=2, random_state=0)\n \"\"\"\n \n def __init__(self, n_clusters=3, *, svd_method='randomized', n_svd_vecs=None, mini_batch=False, init='k-means++', n_init=10, random_state=None):\n super().__init__(n_clusters, svd_method, n_svd_vecs, mini_batch, init, n_init, random_state)\n \n def _fit(self, X):\n (normalized_data, row_diag, col_diag) = _scale_normalize(X)\n n_sv = 1 + int(np.ceil(np.log2(self.n_clusters)))\n (u, v) = self._svd(normalized_data, n_sv, n_discard=1)\n z = np.vstack((row_diag[:, np.newaxis] * u, col_diag[:, np.newaxis] * v))\n (_, labels) = self._k_means(z, self.n_clusters)\n n_rows = X.shape[0]\n self.row_labels_ = labels[:n_rows]\n self.column_labels_ = labels[n_rows:]\n self.rows_ = np.vstack([self.row_labels_ == c for c in range(self.n_clusters)])\n self.columns_ = np.vstack([self.column_labels_ == c for c in range(self.n_clusters)])\n" }, { "name": "Birch", @@ -19947,13 +20012,14 @@ "sklearn.cluster._birch.Birch.partial_fit", "sklearn.cluster._birch.Birch._check_fit", "sklearn.cluster._birch.Birch.predict", + "sklearn.cluster._birch.Birch._predict", "sklearn.cluster._birch.Birch.transform", "sklearn.cluster._birch.Birch._global_clustering" ], "is_public": true, - "description": "Implements the BIRCH clustering algorithm.\n\nIt is a memory-efficient, online-learning algorithm provided as an alternative to :class:`MiniBatchKMeans`. It constructs a tree data structure with the cluster centroids being read off the leaf. These can be either the final cluster centroids or can be provided as input to another clustering algorithm such as :class:`AgglomerativeClustering`. Read more in the :ref:`User Guide `. .. versionadded:: 0.16", + "description": "Implements the BIRCH clustering algorithm.\n\nIt is a memory-efficient, online-learning algorithm provided as an\nalternative to :class:`MiniBatchKMeans`. It constructs a tree\ndata structure with the cluster centroids being read off the leaf.\nThese can be either the final cluster centroids or can be provided as input\nto another clustering algorithm such as :class:`AgglomerativeClustering`.\n\nRead more in the :ref:`User Guide `.\n\n.. versionadded:: 0.16", "docstring": "Implements the BIRCH clustering algorithm.\n\n It is a memory-efficient, online-learning algorithm provided as an\n alternative to :class:`MiniBatchKMeans`. It constructs a tree\n data structure with the cluster centroids being read off the leaf.\n These can be either the final cluster centroids or can be provided as input\n to another clustering algorithm such as :class:`AgglomerativeClustering`.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.16\n\n Parameters\n ----------\n threshold : float, default=0.5\n The radius of the subcluster obtained by merging a new sample and the\n closest subcluster should be lesser than the threshold. Otherwise a new\n subcluster is started. Setting this value to be very low promotes\n splitting and vice-versa.\n\n branching_factor : int, default=50\n Maximum number of CF subclusters in each node. If a new samples enters\n such that the number of subclusters exceed the branching_factor then\n that node is split into two nodes with the subclusters redistributed\n in each. 
The parent subcluster of that node is removed and two new\n subclusters are added as parents of the 2 split nodes.\n\n n_clusters : int, instance of sklearn.cluster model, default=3\n Number of clusters after the final clustering step, which treats the\n subclusters from the leaves as new samples.\n\n - `None` : the final clustering step is not performed and the\n subclusters are returned as they are.\n\n - :mod:`sklearn.cluster` Estimator : If a model is provided, the model\n is fit treating the subclusters as new samples and the initial data\n is mapped to the label of the closest subcluster.\n\n - `int` : the model fit is :class:`AgglomerativeClustering` with\n `n_clusters` set to be equal to the int.\n\n compute_labels : bool, default=True\n Whether or not to compute labels for each fit.\n\n copy : bool, default=True\n Whether or not to make a copy of the given data. If set to False,\n the initial data will be overwritten.\n\n Attributes\n ----------\n root_ : _CFNode\n Root of the CFTree.\n\n dummy_leaf_ : _CFNode\n Start pointer to all the leaves.\n\n subcluster_centers_ : ndarray\n Centroids of all subclusters read directly from the leaves.\n\n subcluster_labels_ : ndarray\n Labels assigned to the centroids of the subclusters after\n they are clustered globally.\n\n labels_ : ndarray of shape (n_samples,)\n Array of labels assigned to the input data.\n if partial_fit is used instead of fit, they are assigned to the\n last batch of data.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n MiniBatchKMeans : Alternative implementation that does incremental updates\n of the centers' positions using mini-batches.\n\n Notes\n -----\n The tree data structure consists of nodes with each node consisting of\n a number of subclusters. The maximum number of subclusters in a node\n is determined by the branching factor. Each subcluster maintains a\n linear sum, squared sum and the number of samples in that subcluster.\n In addition, each subcluster can also have a node as its child, if the\n subcluster is not a member of a leaf node.\n\n For a new point entering the root, it is merged with the subcluster closest\n to it and the linear sum, squared sum and the number of samples of that\n subcluster are updated. This is done recursively till the properties of\n the leaf node are updated.\n\n References\n ----------\n * Tian Zhang, Raghu Ramakrishnan, Maron Livny\n BIRCH: An efficient data clustering method for large databases.\n https://www.cs.sfu.ca/CourseCentral/459/han/papers/zhang96.pdf\n\n * Roberto Perdisci\n JBirch - Java implementation of BIRCH clustering algorithm\n https://code.google.com/archive/p/jbirch\n\n Examples\n --------\n >>> from sklearn.cluster import Birch\n >>> X = [[0, 1], [0.3, 1], [-0.3, 1], [0, -1], [0.3, -1], [-0.3, -1]]\n >>> brc = Birch(n_clusters=None)\n >>> brc.fit(X)\n Birch(n_clusters=None)\n >>> brc.predict(X)\n array([0, 0, 0, 1, 1, 1])\n ", - "source_code": "\n\nclass Birch(ClusterMixin, TransformerMixin, BaseEstimator):\n \"\"\"Implements the BIRCH clustering algorithm.\n\n It is a memory-efficient, online-learning algorithm provided as an\n alternative to :class:`MiniBatchKMeans`. 
It constructs a tree\n data structure with the cluster centroids being read off the leaf.\n These can be either the final cluster centroids or can be provided as input\n to another clustering algorithm such as :class:`AgglomerativeClustering`.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.16\n\n Parameters\n ----------\n threshold : float, default=0.5\n The radius of the subcluster obtained by merging a new sample and the\n closest subcluster should be lesser than the threshold. Otherwise a new\n subcluster is started. Setting this value to be very low promotes\n splitting and vice-versa.\n\n branching_factor : int, default=50\n Maximum number of CF subclusters in each node. If a new samples enters\n such that the number of subclusters exceed the branching_factor then\n that node is split into two nodes with the subclusters redistributed\n in each. The parent subcluster of that node is removed and two new\n subclusters are added as parents of the 2 split nodes.\n\n n_clusters : int, instance of sklearn.cluster model, default=3\n Number of clusters after the final clustering step, which treats the\n subclusters from the leaves as new samples.\n\n - `None` : the final clustering step is not performed and the\n subclusters are returned as they are.\n\n - :mod:`sklearn.cluster` Estimator : If a model is provided, the model\n is fit treating the subclusters as new samples and the initial data\n is mapped to the label of the closest subcluster.\n\n - `int` : the model fit is :class:`AgglomerativeClustering` with\n `n_clusters` set to be equal to the int.\n\n compute_labels : bool, default=True\n Whether or not to compute labels for each fit.\n\n copy : bool, default=True\n Whether or not to make a copy of the given data. If set to False,\n the initial data will be overwritten.\n\n Attributes\n ----------\n root_ : _CFNode\n Root of the CFTree.\n\n dummy_leaf_ : _CFNode\n Start pointer to all the leaves.\n\n subcluster_centers_ : ndarray\n Centroids of all subclusters read directly from the leaves.\n\n subcluster_labels_ : ndarray\n Labels assigned to the centroids of the subclusters after\n they are clustered globally.\n\n labels_ : ndarray of shape (n_samples,)\n Array of labels assigned to the input data.\n if partial_fit is used instead of fit, they are assigned to the\n last batch of data.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n MiniBatchKMeans : Alternative implementation that does incremental updates\n of the centers' positions using mini-batches.\n\n Notes\n -----\n The tree data structure consists of nodes with each node consisting of\n a number of subclusters. The maximum number of subclusters in a node\n is determined by the branching factor. Each subcluster maintains a\n linear sum, squared sum and the number of samples in that subcluster.\n In addition, each subcluster can also have a node as its child, if the\n subcluster is not a member of a leaf node.\n\n For a new point entering the root, it is merged with the subcluster closest\n to it and the linear sum, squared sum and the number of samples of that\n subcluster are updated. 
This is done recursively till the properties of\n the leaf node are updated.\n\n References\n ----------\n * Tian Zhang, Raghu Ramakrishnan, Maron Livny\n BIRCH: An efficient data clustering method for large databases.\n https://www.cs.sfu.ca/CourseCentral/459/han/papers/zhang96.pdf\n\n * Roberto Perdisci\n JBirch - Java implementation of BIRCH clustering algorithm\n https://code.google.com/archive/p/jbirch\n\n Examples\n --------\n >>> from sklearn.cluster import Birch\n >>> X = [[0, 1], [0.3, 1], [-0.3, 1], [0, -1], [0.3, -1], [-0.3, -1]]\n >>> brc = Birch(n_clusters=None)\n >>> brc.fit(X)\n Birch(n_clusters=None)\n >>> brc.predict(X)\n array([0, 0, 0, 1, 1, 1])\n \"\"\"\n \n def __init__(self, *, threshold=0.5, branching_factor=50, n_clusters=3, compute_labels=True, copy=True):\n self.threshold = threshold\n self.branching_factor = branching_factor\n self.n_clusters = n_clusters\n self.compute_labels = compute_labels\n self.copy = copy\n \n @deprecated('`fit_` is deprecated in 1.0 and will be removed in 1.2.')\n @property\n def fit_(self):\n return self._deprecated_fit\n \n @deprecated('`partial_fit_` is deprecated in 1.0 and will be removed in 1.2.')\n @property\n def partial_fit_(self):\n return self._deprecated_partial_fit\n \n def fit(self, X, y=None):\n \"\"\"\n Build a CF Tree for the input data.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Input data.\n\n y : Ignored\n Not used, present here for API consistency by convention.\n\n Returns\n -------\n self\n Fitted estimator.\n \"\"\"\n (self._deprecated_fit, self._deprecated_partial_fit) = (True, False)\n return self._fit(X, partial=False)\n \n def _fit(self, X, partial):\n has_root = getattr(self, 'root_', None)\n first_call = not (partial and has_root)\n X = self._validate_data(X, accept_sparse='csr', copy=self.copy, reset=first_call)\n threshold = self.threshold\n branching_factor = self.branching_factor\n if branching_factor <= 1:\n raise ValueError('Branching_factor should be greater than one.')\n (n_samples, n_features) = X.shape\n if first_call:\n self.root_ = _CFNode(threshold=threshold, branching_factor=branching_factor, is_leaf=True, n_features=n_features)\n self.dummy_leaf_ = _CFNode(threshold=threshold, branching_factor=branching_factor, is_leaf=True, n_features=n_features)\n self.dummy_leaf_.next_leaf_ = self.root_\n self.root_.prev_leaf_ = self.dummy_leaf_\n if not sparse.issparse(X):\n iter_func = iter\n else:\n iter_func = _iterate_sparse_X\n for sample in iter_func(X):\n subcluster = _CFSubcluster(linear_sum=sample)\n split = self.root_.insert_cf_subcluster(subcluster)\n if split:\n (new_subcluster1, new_subcluster2) = _split_node(self.root_, threshold, branching_factor)\n del self.root_\n self.root_ = _CFNode(threshold=threshold, branching_factor=branching_factor, is_leaf=False, n_features=n_features)\n self.root_.append_subcluster(new_subcluster1)\n self.root_.append_subcluster(new_subcluster2)\n centroids = np.concatenate([leaf.centroids_ for leaf in self._get_leaves()])\n self.subcluster_centers_ = centroids\n self._global_clustering(X)\n return self\n \n def _get_leaves(self):\n \"\"\"\n Retrieve the leaves of the CF Node.\n\n Returns\n -------\n leaves : list of shape (n_leaves,)\n List of the leaf nodes.\n \"\"\"\n leaf_ptr = self.dummy_leaf_.next_leaf_\n leaves = []\n while leaf_ptr is not None:\n leaves.append(leaf_ptr)\n leaf_ptr = leaf_ptr.next_leaf_\n return leaves\n \n def partial_fit(self, X=None, y=None):\n \"\"\"\n Online learning. 
Prevents rebuilding of CFTree from scratch.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features), default=None\n Input data. If X is not provided, only the global clustering\n step is done.\n\n y : Ignored\n Not used, present here for API consistency by convention.\n\n Returns\n -------\n self\n Fitted estimator.\n \"\"\"\n (self._deprecated_partial_fit, self._deprecated_fit) = (True, False)\n if X is None:\n self._global_clustering()\n return self\n else:\n return self._fit(X, partial=True)\n \n def _check_fit(self, X):\n check_is_fitted(self)\n if hasattr(self, 'subcluster_centers_') and X.shape[1] != self.subcluster_centers_.shape[1]:\n raise ValueError('Training data and predicted data do not have same number of features.')\n \n def predict(self, X):\n \"\"\"\n Predict data using the ``centroids_`` of subclusters.\n\n Avoid computation of the row norms of X.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Input data.\n\n Returns\n -------\n labels : ndarray of shape(n_samples,)\n Labelled data.\n \"\"\"\n check_is_fitted(self)\n X = self._validate_data(X, accept_sparse='csr', reset=False)\n kwargs = {'Y_norm_squared': self._subcluster_norms}\n with config_context(assume_finite=True):\n argmin = pairwise_distances_argmin(X, self.subcluster_centers_, metric_kwargs=kwargs)\n return self.subcluster_labels_[argmin]\n \n def transform(self, X):\n \"\"\"\n Transform X into subcluster centroids dimension.\n\n Each dimension represents the distance from the sample point to each\n cluster centroid.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Input data.\n\n Returns\n -------\n X_trans : {array-like, sparse matrix} of shape (n_samples, n_clusters)\n Transformed data.\n \"\"\"\n check_is_fitted(self)\n self._validate_data(X, accept_sparse='csr', reset=False)\n with config_context(assume_finite=True):\n return euclidean_distances(X, self.subcluster_centers_)\n \n def _global_clustering(self, X=None):\n \"\"\"\n Global clustering for the subclusters obtained after fitting\n \"\"\"\n clusterer = self.n_clusters\n centroids = self.subcluster_centers_\n compute_labels = X is not None and self.compute_labels\n not_enough_centroids = False\n if isinstance(clusterer, numbers.Integral):\n clusterer = AgglomerativeClustering(n_clusters=self.n_clusters)\n if len(centroids) < self.n_clusters:\n not_enough_centroids = True\n elif clusterer is not None and not hasattr(clusterer, 'fit_predict'):\n raise ValueError('n_clusters should be an instance of ClusterMixin or an int')\n self._subcluster_norms = row_norms(self.subcluster_centers_, squared=True)\n if clusterer is None or not_enough_centroids:\n self.subcluster_labels_ = np.arange(len(centroids))\n if not_enough_centroids:\n warnings.warn('Number of subclusters found (%d) by BIRCH is less than (%d). Decrease the threshold.' % (len(centroids), self.n_clusters), ConvergenceWarning)\n else:\n self.subcluster_labels_ = clusterer.fit_predict(self.subcluster_centers_)\n if compute_labels:\n self.labels_ = self.predict(X)\n" + "source_code": "\n\nclass Birch(ClusterMixin, TransformerMixin, BaseEstimator):\n \"\"\"Implements the BIRCH clustering algorithm.\n\n It is a memory-efficient, online-learning algorithm provided as an\n alternative to :class:`MiniBatchKMeans`. 
It constructs a tree\n data structure with the cluster centroids being read off the leaf.\n These can be either the final cluster centroids or can be provided as input\n to another clustering algorithm such as :class:`AgglomerativeClustering`.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.16\n\n Parameters\n ----------\n threshold : float, default=0.5\n The radius of the subcluster obtained by merging a new sample and the\n closest subcluster should be lesser than the threshold. Otherwise a new\n subcluster is started. Setting this value to be very low promotes\n splitting and vice-versa.\n\n branching_factor : int, default=50\n Maximum number of CF subclusters in each node. If a new samples enters\n such that the number of subclusters exceed the branching_factor then\n that node is split into two nodes with the subclusters redistributed\n in each. The parent subcluster of that node is removed and two new\n subclusters are added as parents of the 2 split nodes.\n\n n_clusters : int, instance of sklearn.cluster model, default=3\n Number of clusters after the final clustering step, which treats the\n subclusters from the leaves as new samples.\n\n - `None` : the final clustering step is not performed and the\n subclusters are returned as they are.\n\n - :mod:`sklearn.cluster` Estimator : If a model is provided, the model\n is fit treating the subclusters as new samples and the initial data\n is mapped to the label of the closest subcluster.\n\n - `int` : the model fit is :class:`AgglomerativeClustering` with\n `n_clusters` set to be equal to the int.\n\n compute_labels : bool, default=True\n Whether or not to compute labels for each fit.\n\n copy : bool, default=True\n Whether or not to make a copy of the given data. If set to False,\n the initial data will be overwritten.\n\n Attributes\n ----------\n root_ : _CFNode\n Root of the CFTree.\n\n dummy_leaf_ : _CFNode\n Start pointer to all the leaves.\n\n subcluster_centers_ : ndarray\n Centroids of all subclusters read directly from the leaves.\n\n subcluster_labels_ : ndarray\n Labels assigned to the centroids of the subclusters after\n they are clustered globally.\n\n labels_ : ndarray of shape (n_samples,)\n Array of labels assigned to the input data.\n if partial_fit is used instead of fit, they are assigned to the\n last batch of data.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n MiniBatchKMeans : Alternative implementation that does incremental updates\n of the centers' positions using mini-batches.\n\n Notes\n -----\n The tree data structure consists of nodes with each node consisting of\n a number of subclusters. The maximum number of subclusters in a node\n is determined by the branching factor. Each subcluster maintains a\n linear sum, squared sum and the number of samples in that subcluster.\n In addition, each subcluster can also have a node as its child, if the\n subcluster is not a member of a leaf node.\n\n For a new point entering the root, it is merged with the subcluster closest\n to it and the linear sum, squared sum and the number of samples of that\n subcluster are updated. 
This is done recursively till the properties of\n the leaf node are updated.\n\n References\n ----------\n * Tian Zhang, Raghu Ramakrishnan, Maron Livny\n BIRCH: An efficient data clustering method for large databases.\n https://www.cs.sfu.ca/CourseCentral/459/han/papers/zhang96.pdf\n\n * Roberto Perdisci\n JBirch - Java implementation of BIRCH clustering algorithm\n https://code.google.com/archive/p/jbirch\n\n Examples\n --------\n >>> from sklearn.cluster import Birch\n >>> X = [[0, 1], [0.3, 1], [-0.3, 1], [0, -1], [0.3, -1], [-0.3, -1]]\n >>> brc = Birch(n_clusters=None)\n >>> brc.fit(X)\n Birch(n_clusters=None)\n >>> brc.predict(X)\n array([0, 0, 0, 1, 1, 1])\n \"\"\"\n \n def __init__(self, *, threshold=0.5, branching_factor=50, n_clusters=3, compute_labels=True, copy=True):\n self.threshold = threshold\n self.branching_factor = branching_factor\n self.n_clusters = n_clusters\n self.compute_labels = compute_labels\n self.copy = copy\n \n @deprecated('`fit_` is deprecated in 1.0 and will be removed in 1.2.')\n @property\n def fit_(self):\n return self._deprecated_fit\n \n @deprecated('`partial_fit_` is deprecated in 1.0 and will be removed in 1.2.')\n @property\n def partial_fit_(self):\n return self._deprecated_partial_fit\n \n def fit(self, X, y=None):\n \"\"\"\n Build a CF Tree for the input data.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Input data.\n\n y : Ignored\n Not used, present here for API consistency by convention.\n\n Returns\n -------\n self\n Fitted estimator.\n \"\"\"\n (self._deprecated_fit, self._deprecated_partial_fit) = (True, False)\n return self._fit(X, partial=False)\n \n def _fit(self, X, partial):\n has_root = getattr(self, 'root_', None)\n first_call = not (partial and has_root)\n X = self._validate_data(X, accept_sparse='csr', copy=self.copy, reset=first_call)\n threshold = self.threshold\n branching_factor = self.branching_factor\n if branching_factor <= 1:\n raise ValueError('Branching_factor should be greater than one.')\n (n_samples, n_features) = X.shape\n if first_call:\n self.root_ = _CFNode(threshold=threshold, branching_factor=branching_factor, is_leaf=True, n_features=n_features)\n self.dummy_leaf_ = _CFNode(threshold=threshold, branching_factor=branching_factor, is_leaf=True, n_features=n_features)\n self.dummy_leaf_.next_leaf_ = self.root_\n self.root_.prev_leaf_ = self.dummy_leaf_\n if not sparse.issparse(X):\n iter_func = iter\n else:\n iter_func = _iterate_sparse_X\n for sample in iter_func(X):\n subcluster = _CFSubcluster(linear_sum=sample)\n split = self.root_.insert_cf_subcluster(subcluster)\n if split:\n (new_subcluster1, new_subcluster2) = _split_node(self.root_, threshold, branching_factor)\n del self.root_\n self.root_ = _CFNode(threshold=threshold, branching_factor=branching_factor, is_leaf=False, n_features=n_features)\n self.root_.append_subcluster(new_subcluster1)\n self.root_.append_subcluster(new_subcluster2)\n centroids = np.concatenate([leaf.centroids_ for leaf in self._get_leaves()])\n self.subcluster_centers_ = centroids\n self._global_clustering(X)\n return self\n \n def _get_leaves(self):\n \"\"\"\n Retrieve the leaves of the CF Node.\n\n Returns\n -------\n leaves : list of shape (n_leaves,)\n List of the leaf nodes.\n \"\"\"\n leaf_ptr = self.dummy_leaf_.next_leaf_\n leaves = []\n while leaf_ptr is not None:\n leaves.append(leaf_ptr)\n leaf_ptr = leaf_ptr.next_leaf_\n return leaves\n \n def partial_fit(self, X=None, y=None):\n \"\"\"\n Online learning. 
Prevents rebuilding of CFTree from scratch.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features), default=None\n Input data. If X is not provided, only the global clustering\n step is done.\n\n y : Ignored\n Not used, present here for API consistency by convention.\n\n Returns\n -------\n self\n Fitted estimator.\n \"\"\"\n (self._deprecated_partial_fit, self._deprecated_fit) = (True, False)\n if X is None:\n self._global_clustering()\n return self\n else:\n return self._fit(X, partial=True)\n \n def _check_fit(self, X):\n check_is_fitted(self)\n if hasattr(self, 'subcluster_centers_') and X.shape[1] != self.subcluster_centers_.shape[1]:\n raise ValueError('Training data and predicted data do not have same number of features.')\n \n def predict(self, X):\n \"\"\"\n Predict data using the ``centroids_`` of subclusters.\n\n Avoid computation of the row norms of X.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Input data.\n\n Returns\n -------\n labels : ndarray of shape(n_samples,)\n Labelled data.\n \"\"\"\n check_is_fitted(self)\n X = self._validate_data(X, accept_sparse='csr', reset=False)\n return self._predict(X)\n \n def _predict(self, X):\n \"\"\"Predict data using the ``centroids_`` of subclusters.\"\"\"\n kwargs = {'Y_norm_squared': self._subcluster_norms}\n with config_context(assume_finite=True):\n argmin = pairwise_distances_argmin(X, self.subcluster_centers_, metric_kwargs=kwargs)\n return self.subcluster_labels_[argmin]\n \n def transform(self, X):\n \"\"\"\n Transform X into subcluster centroids dimension.\n\n Each dimension represents the distance from the sample point to each\n cluster centroid.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Input data.\n\n Returns\n -------\n X_trans : {array-like, sparse matrix} of shape (n_samples, n_clusters)\n Transformed data.\n \"\"\"\n check_is_fitted(self)\n self._validate_data(X, accept_sparse='csr', reset=False)\n with config_context(assume_finite=True):\n return euclidean_distances(X, self.subcluster_centers_)\n \n def _global_clustering(self, X=None):\n \"\"\"\n Global clustering for the subclusters obtained after fitting\n \"\"\"\n clusterer = self.n_clusters\n centroids = self.subcluster_centers_\n compute_labels = X is not None and self.compute_labels\n not_enough_centroids = False\n if isinstance(clusterer, numbers.Integral):\n clusterer = AgglomerativeClustering(n_clusters=self.n_clusters)\n if len(centroids) < self.n_clusters:\n not_enough_centroids = True\n elif clusterer is not None and not hasattr(clusterer, 'fit_predict'):\n raise ValueError('n_clusters should be an instance of ClusterMixin or an int')\n self._subcluster_norms = row_norms(self.subcluster_centers_, squared=True)\n if clusterer is None or not_enough_centroids:\n self.subcluster_labels_ = np.arange(len(centroids))\n if not_enough_centroids:\n warnings.warn('Number of subclusters found (%d) by BIRCH is less than (%d). Decrease the threshold.' 
% (len(centroids), self.n_clusters), ConvergenceWarning)\n else:\n self.subcluster_labels_ = clusterer.fit_predict(self.subcluster_centers_)\n if compute_labels:\n self.labels_ = self._predict(X)\n" }, { "name": "_CFNode", @@ -19967,7 +20033,7 @@ "sklearn.cluster._birch._CFNode.insert_cf_subcluster" ], "is_public": false, - "description": "Each node in a CFTree is called a CFNode.\n\nThe CFNode can have a maximum of branching_factor number of CFSubclusters.", + "description": "Each node in a CFTree is called a CFNode.\n\nThe CFNode can have a maximum of branching_factor\nnumber of CFSubclusters.", "docstring": "Each node in a CFTree is called a CFNode.\n\n The CFNode can have a maximum of branching_factor\n number of CFSubclusters.\n\n Parameters\n ----------\n threshold : float\n Threshold needed for a new subcluster to enter a CFSubcluster.\n\n branching_factor : int\n Maximum number of CF subclusters in each node.\n\n is_leaf : bool\n We need to know if the CFNode is a leaf or not, in order to\n retrieve the final subclusters.\n\n n_features : int\n The number of features.\n\n Attributes\n ----------\n subclusters_ : list\n List of subclusters for a particular CFNode.\n\n prev_leaf_ : _CFNode\n Useful only if is_leaf is True.\n\n next_leaf_ : _CFNode\n next_leaf. Useful only if is_leaf is True.\n the final subclusters.\n\n init_centroids_ : ndarray of shape (branching_factor + 1, n_features)\n Manipulate ``init_centroids_`` throughout rather than centroids_ since\n the centroids are just a view of the ``init_centroids_`` .\n\n init_sq_norm_ : ndarray of shape (branching_factor + 1,)\n manipulate init_sq_norm_ throughout. similar to ``init_centroids_``.\n\n centroids_ : ndarray of shape (branching_factor + 1, n_features)\n View of ``init_centroids_``.\n\n squared_norm_ : ndarray of shape (branching_factor + 1,)\n View of ``init_sq_norm_``.\n\n ", "source_code": "\n\nclass _CFNode:\n \"\"\"Each node in a CFTree is called a CFNode.\n\n The CFNode can have a maximum of branching_factor\n number of CFSubclusters.\n\n Parameters\n ----------\n threshold : float\n Threshold needed for a new subcluster to enter a CFSubcluster.\n\n branching_factor : int\n Maximum number of CF subclusters in each node.\n\n is_leaf : bool\n We need to know if the CFNode is a leaf or not, in order to\n retrieve the final subclusters.\n\n n_features : int\n The number of features.\n\n Attributes\n ----------\n subclusters_ : list\n List of subclusters for a particular CFNode.\n\n prev_leaf_ : _CFNode\n Useful only if is_leaf is True.\n\n next_leaf_ : _CFNode\n next_leaf. Useful only if is_leaf is True.\n the final subclusters.\n\n init_centroids_ : ndarray of shape (branching_factor + 1, n_features)\n Manipulate ``init_centroids_`` throughout rather than centroids_ since\n the centroids are just a view of the ``init_centroids_`` .\n\n init_sq_norm_ : ndarray of shape (branching_factor + 1,)\n manipulate init_sq_norm_ throughout. 
similar to ``init_centroids_``.\n\n centroids_ : ndarray of shape (branching_factor + 1, n_features)\n View of ``init_centroids_``.\n\n squared_norm_ : ndarray of shape (branching_factor + 1,)\n View of ``init_sq_norm_``.\n\n \"\"\"\n \n def __init__(self, *, threshold, branching_factor, is_leaf, n_features):\n self.threshold = threshold\n self.branching_factor = branching_factor\n self.is_leaf = is_leaf\n self.n_features = n_features\n self.subclusters_ = []\n self.init_centroids_ = np.zeros((branching_factor + 1, n_features))\n self.init_sq_norm_ = np.zeros(branching_factor + 1)\n self.squared_norm_ = []\n self.prev_leaf_ = None\n self.next_leaf_ = None\n \n def append_subcluster(self, subcluster):\n n_samples = len(self.subclusters_)\n self.subclusters_.append(subcluster)\n self.init_centroids_[n_samples] = subcluster.centroid_\n self.init_sq_norm_[n_samples] = subcluster.sq_norm_\n self.centroids_ = self.init_centroids_[:n_samples + 1, :]\n self.squared_norm_ = self.init_sq_norm_[:n_samples + 1]\n \n def update_split_subclusters(self, subcluster, new_subcluster1, new_subcluster2):\n \"\"\"Remove a subcluster from a node and update it with the\n split subclusters.\n \"\"\"\n ind = self.subclusters_.index(subcluster)\n self.subclusters_[ind] = new_subcluster1\n self.init_centroids_[ind] = new_subcluster1.centroid_\n self.init_sq_norm_[ind] = new_subcluster1.sq_norm_\n self.append_subcluster(new_subcluster2)\n \n def insert_cf_subcluster(self, subcluster):\n \"\"\"Insert a new subcluster into the node.\"\"\"\n if not self.subclusters_:\n self.append_subcluster(subcluster)\n return False\n threshold = self.threshold\n branching_factor = self.branching_factor\n dist_matrix = np.dot(self.centroids_, subcluster.centroid_)\n dist_matrix *= -2.0\n dist_matrix += self.squared_norm_\n closest_index = np.argmin(dist_matrix)\n closest_subcluster = self.subclusters_[closest_index]\n if closest_subcluster.child_ is not None:\n split_child = closest_subcluster.child_.insert_cf_subcluster(subcluster)\n if not split_child:\n closest_subcluster.update(subcluster)\n self.init_centroids_[closest_index] = self.subclusters_[closest_index].centroid_\n self.init_sq_norm_[closest_index] = self.subclusters_[closest_index].sq_norm_\n return False\n else:\n (new_subcluster1, new_subcluster2) = _split_node(closest_subcluster.child_, threshold, branching_factor)\n self.update_split_subclusters(closest_subcluster, new_subcluster1, new_subcluster2)\n if len(self.subclusters_) > self.branching_factor:\n return True\n return False\n else:\n merged = closest_subcluster.merge_subcluster(subcluster, self.threshold)\n if merged:\n self.init_centroids_[closest_index] = closest_subcluster.centroid_\n self.init_sq_norm_[closest_index] = closest_subcluster.sq_norm_\n return False\n elif len(self.subclusters_) < self.branching_factor:\n self.append_subcluster(subcluster)\n return False\n else:\n self.append_subcluster(subcluster)\n return True\n" }, @@ -19998,7 +20064,7 @@ "sklearn.cluster._dbscan.DBSCAN.fit_predict" ], "is_public": true, - "description": "Perform DBSCAN clustering from vector array or distance matrix.\n\nDBSCAN - Density-Based Spatial Clustering of Applications with Noise. Finds core samples of high density and expands clusters from them. Good for data which contains clusters of similar density. 
Read more in the :ref:`User Guide `.", + "description": "Perform DBSCAN clustering from vector array or distance matrix.\n\nDBSCAN - Density-Based Spatial Clustering of Applications with Noise.\nFinds core samples of high density and expands clusters from them.\nGood for data which contains clusters of similar density.\n\nRead more in the :ref:`User Guide `.", "docstring": "Perform DBSCAN clustering from vector array or distance matrix.\n\n DBSCAN - Density-Based Spatial Clustering of Applications with Noise.\n Finds core samples of high density and expands clusters from them.\n Good for data which contains clusters of similar density.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n eps : float, default=0.5\n The maximum distance between two samples for one to be considered\n as in the neighborhood of the other. This is not a maximum bound\n on the distances of points within a cluster. This is the most\n important DBSCAN parameter to choose appropriately for your data set\n and distance function.\n\n min_samples : int, default=5\n The number of samples (or total weight) in a neighborhood for a point\n to be considered as a core point. This includes the point itself.\n\n metric : str, or callable, default='euclidean'\n The metric to use when calculating distance between instances in a\n feature array. If metric is a string or callable, it must be one of\n the options allowed by :func:`sklearn.metrics.pairwise_distances` for\n its metric parameter.\n If metric is \"precomputed\", X is assumed to be a distance matrix and\n must be square. X may be a :term:`Glossary `, in which\n case only \"nonzero\" elements may be considered neighbors for DBSCAN.\n\n .. versionadded:: 0.17\n metric *precomputed* to accept precomputed sparse matrix.\n\n metric_params : dict, default=None\n Additional keyword arguments for the metric function.\n\n .. versionadded:: 0.19\n\n algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto'\n The algorithm to be used by the NearestNeighbors module\n to compute pointwise distances and find nearest neighbors.\n See NearestNeighbors module documentation for details.\n\n leaf_size : int, default=30\n Leaf size passed to BallTree or cKDTree. This can affect the speed\n of the construction and query, as well as the memory required\n to store the tree. The optimal value depends\n on the nature of the problem.\n\n p : float, default=None\n The power of the Minkowski metric to be used to calculate distance\n between points. If None, then ``p=2`` (equivalent to the Euclidean\n distance).\n\n n_jobs : int, default=None\n The number of parallel jobs to run.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n Attributes\n ----------\n core_sample_indices_ : ndarray of shape (n_core_samples,)\n Indices of core samples.\n\n components_ : ndarray of shape (n_core_samples, n_features)\n Copy of each core sample found by training.\n\n labels_ : ndarray of shape (n_samples)\n Cluster labels for each point in the dataset given to fit().\n Noisy samples are given the label -1.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n OPTICS : A similar clustering at multiple values of eps. 
Our implementation\n is optimized for memory usage.\n\n Notes\n -----\n For an example, see :ref:`examples/cluster/plot_dbscan.py\n `.\n\n This implementation bulk-computes all neighborhood queries, which increases\n the memory complexity to O(n.d) where d is the average number of neighbors,\n while original DBSCAN had memory complexity O(n). It may attract a higher\n memory complexity when querying these nearest neighborhoods, depending\n on the ``algorithm``.\n\n One way to avoid the query complexity is to pre-compute sparse\n neighborhoods in chunks using\n :func:`NearestNeighbors.radius_neighbors_graph\n ` with\n ``mode='distance'``, then using ``metric='precomputed'`` here.\n\n Another way to reduce memory and computation time is to remove\n (near-)duplicate points and use ``sample_weight`` instead.\n\n :class:`cluster.OPTICS` provides a similar clustering with lower memory\n usage.\n\n References\n ----------\n Ester, M., H. P. Kriegel, J. Sander, and X. Xu, \"A Density-Based\n Algorithm for Discovering Clusters in Large Spatial Databases with Noise\".\n In: Proceedings of the 2nd International Conference on Knowledge Discovery\n and Data Mining, Portland, OR, AAAI Press, pp. 226-231. 1996\n\n Schubert, E., Sander, J., Ester, M., Kriegel, H. P., & Xu, X. (2017).\n DBSCAN revisited, revisited: why and how you should (still) use DBSCAN.\n ACM Transactions on Database Systems (TODS), 42(3), 19.\n\n Examples\n --------\n >>> from sklearn.cluster import DBSCAN\n >>> import numpy as np\n >>> X = np.array([[1, 2], [2, 2], [2, 3],\n ... [8, 7], [8, 8], [25, 80]])\n >>> clustering = DBSCAN(eps=3, min_samples=2).fit(X)\n >>> clustering.labels_\n array([ 0, 0, 0, 1, 1, -1])\n >>> clustering\n DBSCAN(eps=3, min_samples=2)\n ", "source_code": "\n\nclass DBSCAN(ClusterMixin, BaseEstimator):\n \"\"\"Perform DBSCAN clustering from vector array or distance matrix.\n\n DBSCAN - Density-Based Spatial Clustering of Applications with Noise.\n Finds core samples of high density and expands clusters from them.\n Good for data which contains clusters of similar density.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n eps : float, default=0.5\n The maximum distance between two samples for one to be considered\n as in the neighborhood of the other. This is not a maximum bound\n on the distances of points within a cluster. This is the most\n important DBSCAN parameter to choose appropriately for your data set\n and distance function.\n\n min_samples : int, default=5\n The number of samples (or total weight) in a neighborhood for a point\n to be considered as a core point. This includes the point itself.\n\n metric : str, or callable, default='euclidean'\n The metric to use when calculating distance between instances in a\n feature array. If metric is a string or callable, it must be one of\n the options allowed by :func:`sklearn.metrics.pairwise_distances` for\n its metric parameter.\n If metric is \"precomputed\", X is assumed to be a distance matrix and\n must be square. X may be a :term:`Glossary `, in which\n case only \"nonzero\" elements may be considered neighbors for DBSCAN.\n\n .. versionadded:: 0.17\n metric *precomputed* to accept precomputed sparse matrix.\n\n metric_params : dict, default=None\n Additional keyword arguments for the metric function.\n\n .. 
versionadded:: 0.19\n\n algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto'\n The algorithm to be used by the NearestNeighbors module\n to compute pointwise distances and find nearest neighbors.\n See NearestNeighbors module documentation for details.\n\n leaf_size : int, default=30\n Leaf size passed to BallTree or cKDTree. This can affect the speed\n of the construction and query, as well as the memory required\n to store the tree. The optimal value depends\n on the nature of the problem.\n\n p : float, default=None\n The power of the Minkowski metric to be used to calculate distance\n between points. If None, then ``p=2`` (equivalent to the Euclidean\n distance).\n\n n_jobs : int, default=None\n The number of parallel jobs to run.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n Attributes\n ----------\n core_sample_indices_ : ndarray of shape (n_core_samples,)\n Indices of core samples.\n\n components_ : ndarray of shape (n_core_samples, n_features)\n Copy of each core sample found by training.\n\n labels_ : ndarray of shape (n_samples)\n Cluster labels for each point in the dataset given to fit().\n Noisy samples are given the label -1.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n OPTICS : A similar clustering at multiple values of eps. Our implementation\n is optimized for memory usage.\n\n Notes\n -----\n For an example, see :ref:`examples/cluster/plot_dbscan.py\n `.\n\n This implementation bulk-computes all neighborhood queries, which increases\n the memory complexity to O(n.d) where d is the average number of neighbors,\n while original DBSCAN had memory complexity O(n). It may attract a higher\n memory complexity when querying these nearest neighborhoods, depending\n on the ``algorithm``.\n\n One way to avoid the query complexity is to pre-compute sparse\n neighborhoods in chunks using\n :func:`NearestNeighbors.radius_neighbors_graph\n ` with\n ``mode='distance'``, then using ``metric='precomputed'`` here.\n\n Another way to reduce memory and computation time is to remove\n (near-)duplicate points and use ``sample_weight`` instead.\n\n :class:`cluster.OPTICS` provides a similar clustering with lower memory\n usage.\n\n References\n ----------\n Ester, M., H. P. Kriegel, J. Sander, and X. Xu, \"A Density-Based\n Algorithm for Discovering Clusters in Large Spatial Databases with Noise\".\n In: Proceedings of the 2nd International Conference on Knowledge Discovery\n and Data Mining, Portland, OR, AAAI Press, pp. 226-231. 1996\n\n Schubert, E., Sander, J., Ester, M., Kriegel, H. P., & Xu, X. (2017).\n DBSCAN revisited, revisited: why and how you should (still) use DBSCAN.\n ACM Transactions on Database Systems (TODS), 42(3), 19.\n\n Examples\n --------\n >>> from sklearn.cluster import DBSCAN\n >>> import numpy as np\n >>> X = np.array([[1, 2], [2, 2], [2, 3],\n ... 
[8, 7], [8, 8], [25, 80]])\n >>> clustering = DBSCAN(eps=3, min_samples=2).fit(X)\n >>> clustering.labels_\n array([ 0, 0, 0, 1, 1, -1])\n >>> clustering\n DBSCAN(eps=3, min_samples=2)\n \"\"\"\n \n def __init__(self, eps=0.5, *, min_samples=5, metric='euclidean', metric_params=None, algorithm='auto', leaf_size=30, p=None, n_jobs=None):\n self.eps = eps\n self.min_samples = min_samples\n self.metric = metric\n self.metric_params = metric_params\n self.algorithm = algorithm\n self.leaf_size = leaf_size\n self.p = p\n self.n_jobs = n_jobs\n \n def fit(self, X, y=None, sample_weight=None):\n \"\"\"Perform DBSCAN clustering from features, or distance matrix.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features), or (n_samples, n_samples)\n Training instances to cluster, or distances between instances if\n ``metric='precomputed'``. If a sparse matrix is provided, it will\n be converted into a sparse ``csr_matrix``.\n\n y : Ignored\n Not used, present here for API consistency by convention.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Weight of each sample, such that a sample with a weight of at least\n ``min_samples`` is by itself a core sample; a sample with a\n negative weight may inhibit its eps-neighbor from being core.\n Note that weights are absolute, and default to 1.\n\n Returns\n -------\n self : object\n Returns a fitted instance of self.\n \"\"\"\n X = self._validate_data(X, accept_sparse='csr')\n if not self.eps > 0.0:\n raise ValueError('eps must be positive.')\n if sample_weight is not None:\n sample_weight = _check_sample_weight(sample_weight, X)\n if self.metric == 'precomputed' and sparse.issparse(X):\n with warnings.catch_warnings():\n warnings.simplefilter('ignore', sparse.SparseEfficiencyWarning)\n X.setdiag(X.diagonal())\n neighbors_model = NearestNeighbors(radius=self.eps, algorithm=self.algorithm, leaf_size=self.leaf_size, metric=self.metric, metric_params=self.metric_params, p=self.p, n_jobs=self.n_jobs)\n neighbors_model.fit(X)\n neighborhoods = neighbors_model.radius_neighbors(X, return_distance=False)\n if sample_weight is None:\n n_neighbors = np.array([len(neighbors) for neighbors in neighborhoods])\n else:\n n_neighbors = np.array([np.sum(sample_weight[neighbors]) for neighbors in neighborhoods])\n labels = np.full(X.shape[0], -1, dtype=np.intp)\n core_samples = np.asarray(n_neighbors >= self.min_samples, dtype=np.uint8)\n dbscan_inner(core_samples, neighborhoods, labels)\n self.core_sample_indices_ = np.where(core_samples)[0]\n self.labels_ = labels\n if len(self.core_sample_indices_):\n self.components_ = X[self.core_sample_indices_].copy()\n else:\n self.components_ = np.empty((0, X.shape[1]))\n return self\n \n def fit_predict(self, X, y=None, sample_weight=None):\n \"\"\"Compute clusters from a data or distance matrix and predict labels.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features), or (n_samples, n_samples)\n Training instances to cluster, or distances between instances if\n ``metric='precomputed'``. 
If a sparse matrix is provided, it will\n be converted into a sparse ``csr_matrix``.\n\n y : Ignored\n Not used, present here for API consistency by convention.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Weight of each sample, such that a sample with a weight of at least\n ``min_samples`` is by itself a core sample; a sample with a\n negative weight may inhibit its eps-neighbor from being core.\n Note that weights are absolute, and default to 1.\n\n Returns\n -------\n labels : ndarray of shape (n_samples,)\n Cluster labels. Noisy samples are given the label -1.\n \"\"\"\n self.fit(X, sample_weight=sample_weight)\n return self.labels_\n" }, @@ -20080,7 +20146,7 @@ "sklearn.cluster._mean_shift.MeanShift.predict" ], "is_public": true, - "description": "Mean shift clustering using a flat kernel.\n\nMean shift clustering aims to discover \"blobs\" in a smooth density of samples. It is a centroid-based algorithm, which works by updating candidates for centroids to be the mean of the points within a given region. These candidates are then filtered in a post-processing stage to eliminate near-duplicates to form the final set of centroids. Seeding is performed using a binning technique for scalability. Read more in the :ref:`User Guide `.", + "description": "Mean shift clustering using a flat kernel.\n\nMean shift clustering aims to discover \"blobs\" in a smooth density of\nsamples. It is a centroid-based algorithm, which works by updating\ncandidates for centroids to be the mean of the points within a given\nregion. These candidates are then filtered in a post-processing stage to\neliminate near-duplicates to form the final set of centroids.\n\nSeeding is performed using a binning technique for scalability.\n\nRead more in the :ref:`User Guide `.", "docstring": "Mean shift clustering using a flat kernel.\n\n Mean shift clustering aims to discover \"blobs\" in a smooth density of\n samples. It is a centroid-based algorithm, which works by updating\n candidates for centroids to be the mean of the points within a given\n region. These candidates are then filtered in a post-processing stage to\n eliminate near-duplicates to form the final set of centroids.\n\n Seeding is performed using a binning technique for scalability.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n bandwidth : float, default=None\n Bandwidth used in the RBF kernel.\n\n If not given, the bandwidth is estimated using\n sklearn.cluster.estimate_bandwidth; see the documentation for that\n function for hints on scalability (see also the Notes, below).\n\n seeds : array-like of shape (n_samples, n_features), default=None\n Seeds used to initialize kernels. If not set,\n the seeds are calculated by clustering.get_bin_seeds\n with bandwidth as the grid size and default values for\n other parameters.\n\n bin_seeding : bool, default=False\n If true, initial kernel locations are not locations of all\n points, but rather the location of the discretized version of\n points, where points are binned onto a grid whose coarseness\n corresponds to the bandwidth. Setting this option to True will speed\n up the algorithm because fewer seeds will be initialized.\n The default value is False.\n Ignored if seeds argument is not None.\n\n min_bin_freq : int, default=1\n To speed up the algorithm, accept only those bins with at least\n min_bin_freq points as seeds.\n\n cluster_all : bool, default=True\n If true, then all points are clustered, even those orphans that are\n not within any kernel. 
Orphans are assigned to the nearest kernel.\n If false, then orphans are given cluster label -1.\n\n n_jobs : int, default=None\n The number of jobs to use for the computation. This works by computing\n each of the n_init runs in parallel.\n\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n max_iter : int, default=300\n Maximum number of iterations, per seed point before the clustering\n operation terminates (for that seed point), if has not converged yet.\n\n .. versionadded:: 0.22\n\n Attributes\n ----------\n cluster_centers_ : ndarray of shape (n_clusters, n_features)\n Coordinates of cluster centers.\n\n labels_ : ndarray of shape (n_samples,)\n Labels of each point.\n\n n_iter_ : int\n Maximum number of iterations performed on each seed.\n\n .. versionadded:: 0.22\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n KMeans : K-Means clustering.\n\n Notes\n -----\n\n Scalability:\n\n Because this implementation uses a flat kernel and\n a Ball Tree to look up members of each kernel, the complexity will tend\n towards O(T*n*log(n)) in lower dimensions, with n the number of samples\n and T the number of points. In higher dimensions the complexity will\n tend towards O(T*n^2).\n\n Scalability can be boosted by using fewer seeds, for example by using\n a higher value of min_bin_freq in the get_bin_seeds function.\n\n Note that the estimate_bandwidth function is much less scalable than the\n mean shift algorithm and will be the bottleneck if it is used.\n\n References\n ----------\n\n Dorin Comaniciu and Peter Meer, \"Mean Shift: A robust approach toward\n feature space analysis\". IEEE Transactions on Pattern Analysis and\n Machine Intelligence. 2002. pp. 603-619.\n\n Examples\n --------\n >>> from sklearn.cluster import MeanShift\n >>> import numpy as np\n >>> X = np.array([[1, 1], [2, 1], [1, 0],\n ... [4, 7], [3, 5], [3, 6]])\n >>> clustering = MeanShift(bandwidth=2).fit(X)\n >>> clustering.labels_\n array([1, 1, 1, 0, 0, 0])\n >>> clustering.predict([[0, 0], [5, 5]])\n array([1, 0])\n >>> clustering\n MeanShift(bandwidth=2)\n ", "source_code": "\n\nclass MeanShift(ClusterMixin, BaseEstimator):\n \"\"\"Mean shift clustering using a flat kernel.\n\n Mean shift clustering aims to discover \"blobs\" in a smooth density of\n samples. It is a centroid-based algorithm, which works by updating\n candidates for centroids to be the mean of the points within a given\n region. These candidates are then filtered in a post-processing stage to\n eliminate near-duplicates to form the final set of centroids.\n\n Seeding is performed using a binning technique for scalability.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n bandwidth : float, default=None\n Bandwidth used in the RBF kernel.\n\n If not given, the bandwidth is estimated using\n sklearn.cluster.estimate_bandwidth; see the documentation for that\n function for hints on scalability (see also the Notes, below).\n\n seeds : array-like of shape (n_samples, n_features), default=None\n Seeds used to initialize kernels. 
If not set,\n the seeds are calculated by clustering.get_bin_seeds\n with bandwidth as the grid size and default values for\n other parameters.\n\n bin_seeding : bool, default=False\n If true, initial kernel locations are not locations of all\n points, but rather the location of the discretized version of\n points, where points are binned onto a grid whose coarseness\n corresponds to the bandwidth. Setting this option to True will speed\n up the algorithm because fewer seeds will be initialized.\n The default value is False.\n Ignored if seeds argument is not None.\n\n min_bin_freq : int, default=1\n To speed up the algorithm, accept only those bins with at least\n min_bin_freq points as seeds.\n\n cluster_all : bool, default=True\n If true, then all points are clustered, even those orphans that are\n not within any kernel. Orphans are assigned to the nearest kernel.\n If false, then orphans are given cluster label -1.\n\n n_jobs : int, default=None\n The number of jobs to use for the computation. This works by computing\n each of the n_init runs in parallel.\n\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n max_iter : int, default=300\n Maximum number of iterations, per seed point before the clustering\n operation terminates (for that seed point), if has not converged yet.\n\n .. versionadded:: 0.22\n\n Attributes\n ----------\n cluster_centers_ : ndarray of shape (n_clusters, n_features)\n Coordinates of cluster centers.\n\n labels_ : ndarray of shape (n_samples,)\n Labels of each point.\n\n n_iter_ : int\n Maximum number of iterations performed on each seed.\n\n .. versionadded:: 0.22\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n KMeans : K-Means clustering.\n\n Notes\n -----\n\n Scalability:\n\n Because this implementation uses a flat kernel and\n a Ball Tree to look up members of each kernel, the complexity will tend\n towards O(T*n*log(n)) in lower dimensions, with n the number of samples\n and T the number of points. In higher dimensions the complexity will\n tend towards O(T*n^2).\n\n Scalability can be boosted by using fewer seeds, for example by using\n a higher value of min_bin_freq in the get_bin_seeds function.\n\n Note that the estimate_bandwidth function is much less scalable than the\n mean shift algorithm and will be the bottleneck if it is used.\n\n References\n ----------\n\n Dorin Comaniciu and Peter Meer, \"Mean Shift: A robust approach toward\n feature space analysis\". IEEE Transactions on Pattern Analysis and\n Machine Intelligence. 2002. pp. 603-619.\n\n Examples\n --------\n >>> from sklearn.cluster import MeanShift\n >>> import numpy as np\n >>> X = np.array([[1, 1], [2, 1], [1, 0],\n ... 
[4, 7], [3, 5], [3, 6]])\n >>> clustering = MeanShift(bandwidth=2).fit(X)\n >>> clustering.labels_\n array([1, 1, 1, 0, 0, 0])\n >>> clustering.predict([[0, 0], [5, 5]])\n array([1, 0])\n >>> clustering\n MeanShift(bandwidth=2)\n \"\"\"\n \n def __init__(self, *, bandwidth=None, seeds=None, bin_seeding=False, min_bin_freq=1, cluster_all=True, n_jobs=None, max_iter=300):\n self.bandwidth = bandwidth\n self.seeds = seeds\n self.bin_seeding = bin_seeding\n self.cluster_all = cluster_all\n self.min_bin_freq = min_bin_freq\n self.n_jobs = n_jobs\n self.max_iter = max_iter\n \n def fit(self, X, y=None):\n \"\"\"Perform clustering.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Samples to cluster.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n Returns\n -------\n self : object\n Fitted instance.\n \"\"\"\n X = self._validate_data(X)\n bandwidth = self.bandwidth\n if bandwidth is None:\n bandwidth = estimate_bandwidth(X, n_jobs=self.n_jobs)\n elif bandwidth <= 0:\n raise ValueError('bandwidth needs to be greater than zero or None, got %f' % bandwidth)\n seeds = self.seeds\n if seeds is None:\n if self.bin_seeding:\n seeds = get_bin_seeds(X, bandwidth, self.min_bin_freq)\n else:\n seeds = X\n (n_samples, n_features) = X.shape\n center_intensity_dict = {}\n nbrs = NearestNeighbors(radius=bandwidth, n_jobs=1).fit(X)\n all_res = Parallel(n_jobs=self.n_jobs)((delayed(_mean_shift_single_seed)(seed, X, nbrs, self.max_iter) for seed in seeds))\n for i in range(len(seeds)):\n if all_res[i][1]:\n center_intensity_dict[all_res[i][0]] = all_res[i][1]\n self.n_iter_ = max([x[2] for x in all_res])\n if not center_intensity_dict:\n raise ValueError('No point was within bandwidth=%f of any seed. Try a different seeding strategy or increase the bandwidth.' 
% bandwidth)\n sorted_by_intensity = sorted(center_intensity_dict.items(), key=lambda tup: (tup[1], tup[0]), reverse=True)\n sorted_centers = np.array([tup[0] for tup in sorted_by_intensity])\n unique = np.ones(len(sorted_centers), dtype=bool)\n nbrs = NearestNeighbors(radius=bandwidth, n_jobs=self.n_jobs).fit(sorted_centers)\n for (i, center) in enumerate(sorted_centers):\n if unique[i]:\n neighbor_idxs = nbrs.radius_neighbors([center], return_distance=False)[0]\n unique[neighbor_idxs] = 0\n unique[i] = 1\n cluster_centers = sorted_centers[unique]\n nbrs = NearestNeighbors(n_neighbors=1, n_jobs=self.n_jobs).fit(cluster_centers)\n labels = np.zeros(n_samples, dtype=int)\n (distances, idxs) = nbrs.kneighbors(X)\n if self.cluster_all:\n labels = idxs.flatten()\n else:\n labels.fill(-1)\n bool_selector = distances.flatten() <= bandwidth\n labels[bool_selector] = idxs.flatten()[bool_selector]\n (self.cluster_centers_, self.labels_) = (cluster_centers, labels)\n return self\n \n def predict(self, X):\n \"\"\"Predict the closest cluster each sample in X belongs to.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n New data to predict.\n\n Returns\n -------\n labels : ndarray of shape (n_samples,)\n Index of the cluster each sample belongs to.\n \"\"\"\n check_is_fitted(self)\n X = self._validate_data(X, reset=False)\n with config_context(assume_finite=True):\n return pairwise_distances_argmin(X, self.cluster_centers_)\n" }, @@ -20094,7 +20160,7 @@ "sklearn.cluster._optics.OPTICS.fit" ], "is_public": true, - "description": "Estimate clustering structure from vector array.\n\nOPTICS (Ordering Points To Identify the Clustering Structure), closely related to DBSCAN, finds core sample of high density and expands clusters from them [1]_. Unlike DBSCAN, keeps cluster hierarchy for a variable neighborhood radius. Better suited for usage on large datasets than the current sklearn implementation of DBSCAN. Clusters are then extracted using a DBSCAN-like method (cluster_method = 'dbscan') or an automatic technique proposed in [1]_ (cluster_method = 'xi'). This implementation deviates from the original OPTICS by first performing k-nearest-neighborhood searches on all points to identify core sizes, then computing only the distances to unprocessed points when constructing the cluster order. Note that we do not employ a heap to manage the expansion candidates, so the time complexity will be O(n^2). Read more in the :ref:`User Guide `.", + "description": "Estimate clustering structure from vector array.\n\nOPTICS (Ordering Points To Identify the Clustering Structure), closely\nrelated to DBSCAN, finds core sample of high density and expands clusters\nfrom them [1]_. Unlike DBSCAN, keeps cluster hierarchy for a variable\nneighborhood radius. Better suited for usage on large datasets than the\ncurrent sklearn implementation of DBSCAN.\n\nClusters are then extracted using a DBSCAN-like method\n(cluster_method = 'dbscan') or an automatic\ntechnique proposed in [1]_ (cluster_method = 'xi').\n\nThis implementation deviates from the original OPTICS by first performing\nk-nearest-neighborhood searches on all points to identify core sizes, then\ncomputing only the distances to unprocessed points when constructing the\ncluster order. 
Note that we do not employ a heap to manage the expansion\ncandidates, so the time complexity will be O(n^2).\n\nRead more in the :ref:`User Guide `.", "docstring": "Estimate clustering structure from vector array.\n\n OPTICS (Ordering Points To Identify the Clustering Structure), closely\n related to DBSCAN, finds core sample of high density and expands clusters\n from them [1]_. Unlike DBSCAN, keeps cluster hierarchy for a variable\n neighborhood radius. Better suited for usage on large datasets than the\n current sklearn implementation of DBSCAN.\n\n Clusters are then extracted using a DBSCAN-like method\n (cluster_method = 'dbscan') or an automatic\n technique proposed in [1]_ (cluster_method = 'xi').\n\n This implementation deviates from the original OPTICS by first performing\n k-nearest-neighborhood searches on all points to identify core sizes, then\n computing only the distances to unprocessed points when constructing the\n cluster order. Note that we do not employ a heap to manage the expansion\n candidates, so the time complexity will be O(n^2).\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n min_samples : int > 1 or float between 0 and 1, default=5\n The number of samples in a neighborhood for a point to be considered as\n a core point. Also, up and down steep regions can't have more than\n ``min_samples`` consecutive non-steep points. Expressed as an absolute\n number or a fraction of the number of samples (rounded to be at least\n 2).\n\n max_eps : float, default=np.inf\n The maximum distance between two samples for one to be considered as\n in the neighborhood of the other. Default value of ``np.inf`` will\n identify clusters across all scales; reducing ``max_eps`` will result\n in shorter run times.\n\n metric : str or callable, default='minkowski'\n Metric to use for distance computation. Any metric from scikit-learn\n or scipy.spatial.distance can be used.\n\n If metric is a callable function, it is called on each\n pair of instances (rows) and the resulting value recorded. The callable\n should take two arrays as input and return one value indicating the\n distance between them. This works for Scipy's metrics, but is less\n efficient than passing the metric name as a string. If metric is\n \"precomputed\", X is assumed to be a distance matrix and must be square.\n\n Valid values for metric are:\n\n - from scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2',\n 'manhattan']\n\n - from scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev',\n 'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski',\n 'mahalanobis', 'minkowski', 'rogerstanimoto', 'russellrao',\n 'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean',\n 'yule']\n\n See the documentation for scipy.spatial.distance for details on these\n metrics.\n\n p : int, default=2\n Parameter for the Minkowski metric from\n :class:`~sklearn.metrics.pairwise_distances`. When p = 1, this is\n equivalent to using manhattan_distance (l1), and euclidean_distance\n (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.\n\n metric_params : dict, default=None\n Additional keyword arguments for the metric function.\n\n cluster_method : str, default='xi'\n The extraction method used to extract clusters using the calculated\n reachability and ordering. Possible values are \"xi\" and \"dbscan\".\n\n eps : float, default=None\n The maximum distance between two samples for one to be considered as\n in the neighborhood of the other. 
By default it assumes the same value\n as ``max_eps``.\n Used only when ``cluster_method='dbscan'``.\n\n xi : float between 0 and 1, default=0.05\n Determines the minimum steepness on the reachability plot that\n constitutes a cluster boundary. For example, an upwards point in the\n reachability plot is defined by the ratio from one point to its\n successor being at most 1-xi.\n Used only when ``cluster_method='xi'``.\n\n predecessor_correction : bool, default=True\n Correct clusters according to the predecessors calculated by OPTICS\n [2]_. This parameter has minimal effect on most datasets.\n Used only when ``cluster_method='xi'``.\n\n min_cluster_size : int > 1 or float between 0 and 1, default=None\n Minimum number of samples in an OPTICS cluster, expressed as an\n absolute number or a fraction of the number of samples (rounded to be\n at least 2). If ``None``, the value of ``min_samples`` is used instead.\n Used only when ``cluster_method='xi'``.\n\n algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto'\n Algorithm used to compute the nearest neighbors:\n\n - 'ball_tree' will use :class:`BallTree`\n - 'kd_tree' will use :class:`KDTree`\n - 'brute' will use a brute-force search.\n - 'auto' will attempt to decide the most appropriate algorithm\n based on the values passed to :meth:`fit` method. (default)\n\n Note: fitting on sparse input will override the setting of\n this parameter, using brute force.\n\n leaf_size : int, default=30\n Leaf size passed to :class:`BallTree` or :class:`KDTree`. This can\n affect the speed of the construction and query, as well as the memory\n required to store the tree. The optimal value depends on the\n nature of the problem.\n\n memory : str or object with the joblib.Memory interface, default=None\n Used to cache the output of the computation of the tree.\n By default, no caching is done. If a string is given, it is the\n path to the caching directory.\n\n n_jobs : int, default=None\n The number of parallel jobs to run for neighbors search.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n Attributes\n ----------\n labels_ : ndarray of shape (n_samples,)\n Cluster labels for each point in the dataset given to fit().\n Noisy samples and points which are not included in a leaf cluster\n of ``cluster_hierarchy_`` are labeled as -1.\n\n reachability_ : ndarray of shape (n_samples,)\n Reachability distances per sample, indexed by object order. Use\n ``clust.reachability_[clust.ordering_]`` to access in cluster order.\n\n ordering_ : ndarray of shape (n_samples,)\n The cluster ordered list of sample indices.\n\n core_distances_ : ndarray of shape (n_samples,)\n Distance at which each sample becomes a core point, indexed by object\n order. Points which will never be core have a distance of inf. Use\n ``clust.core_distances_[clust.ordering_]`` to access in cluster order.\n\n predecessor_ : ndarray of shape (n_samples,)\n Point that a sample was reached from, indexed by object order.\n Seed points have a predecessor of -1.\n\n cluster_hierarchy_ : ndarray of shape (n_clusters, 2)\n The list of clusters in the form of ``[start, end]`` in each row, with\n all indices inclusive. The clusters are ordered according to\n ``(end, -start)`` (ascending) so that larger clusters encompassing\n smaller clusters come after those smaller ones. 
Since ``labels_`` does\n not reflect the hierarchy, usually\n ``len(cluster_hierarchy_) > np.unique(optics.labels_)``. Please also\n note that these indices are of the ``ordering_``, i.e.\n ``X[ordering_][start:end + 1]`` form a cluster.\n Only available when ``cluster_method='xi'``.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n DBSCAN : A similar clustering for a specified neighborhood radius (eps).\n Our implementation is optimized for runtime.\n\n References\n ----------\n .. [1] Ankerst, Mihael, Markus M. Breunig, Hans-Peter Kriegel,\n and J\u00f6rg Sander. \"OPTICS: ordering points to identify the clustering\n structure.\" ACM SIGMOD Record 28, no. 2 (1999): 49-60.\n\n .. [2] Schubert, Erich, Michael Gertz.\n \"Improving the Cluster Structure Extracted from OPTICS Plots.\" Proc. of\n the Conference \"Lernen, Wissen, Daten, Analysen\" (LWDA) (2018): 318-329.\n\n Examples\n --------\n >>> from sklearn.cluster import OPTICS\n >>> import numpy as np\n >>> X = np.array([[1, 2], [2, 5], [3, 6],\n ... [8, 7], [8, 8], [7, 3]])\n >>> clustering = OPTICS(min_samples=2).fit(X)\n >>> clustering.labels_\n array([0, 0, 0, 1, 1, 1])\n ", "source_code": "\n\nclass OPTICS(ClusterMixin, BaseEstimator):\n \"\"\"Estimate clustering structure from vector array.\n\n OPTICS (Ordering Points To Identify the Clustering Structure), closely\n related to DBSCAN, finds core sample of high density and expands clusters\n from them [1]_. Unlike DBSCAN, keeps cluster hierarchy for a variable\n neighborhood radius. Better suited for usage on large datasets than the\n current sklearn implementation of DBSCAN.\n\n Clusters are then extracted using a DBSCAN-like method\n (cluster_method = 'dbscan') or an automatic\n technique proposed in [1]_ (cluster_method = 'xi').\n\n This implementation deviates from the original OPTICS by first performing\n k-nearest-neighborhood searches on all points to identify core sizes, then\n computing only the distances to unprocessed points when constructing the\n cluster order. Note that we do not employ a heap to manage the expansion\n candidates, so the time complexity will be O(n^2).\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n min_samples : int > 1 or float between 0 and 1, default=5\n The number of samples in a neighborhood for a point to be considered as\n a core point. Also, up and down steep regions can't have more than\n ``min_samples`` consecutive non-steep points. Expressed as an absolute\n number or a fraction of the number of samples (rounded to be at least\n 2).\n\n max_eps : float, default=np.inf\n The maximum distance between two samples for one to be considered as\n in the neighborhood of the other. Default value of ``np.inf`` will\n identify clusters across all scales; reducing ``max_eps`` will result\n in shorter run times.\n\n metric : str or callable, default='minkowski'\n Metric to use for distance computation. Any metric from scikit-learn\n or scipy.spatial.distance can be used.\n\n If metric is a callable function, it is called on each\n pair of instances (rows) and the resulting value recorded. The callable\n should take two arrays as input and return one value indicating the\n distance between them. 
This works for Scipy's metrics, but is less\n efficient than passing the metric name as a string. If metric is\n \"precomputed\", X is assumed to be a distance matrix and must be square.\n\n Valid values for metric are:\n\n - from scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2',\n 'manhattan']\n\n - from scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev',\n 'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski',\n 'mahalanobis', 'minkowski', 'rogerstanimoto', 'russellrao',\n 'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean',\n 'yule']\n\n See the documentation for scipy.spatial.distance for details on these\n metrics.\n\n p : int, default=2\n Parameter for the Minkowski metric from\n :class:`~sklearn.metrics.pairwise_distances`. When p = 1, this is\n equivalent to using manhattan_distance (l1), and euclidean_distance\n (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.\n\n metric_params : dict, default=None\n Additional keyword arguments for the metric function.\n\n cluster_method : str, default='xi'\n The extraction method used to extract clusters using the calculated\n reachability and ordering. Possible values are \"xi\" and \"dbscan\".\n\n eps : float, default=None\n The maximum distance between two samples for one to be considered as\n in the neighborhood of the other. By default it assumes the same value\n as ``max_eps``.\n Used only when ``cluster_method='dbscan'``.\n\n xi : float between 0 and 1, default=0.05\n Determines the minimum steepness on the reachability plot that\n constitutes a cluster boundary. For example, an upwards point in the\n reachability plot is defined by the ratio from one point to its\n successor being at most 1-xi.\n Used only when ``cluster_method='xi'``.\n\n predecessor_correction : bool, default=True\n Correct clusters according to the predecessors calculated by OPTICS\n [2]_. This parameter has minimal effect on most datasets.\n Used only when ``cluster_method='xi'``.\n\n min_cluster_size : int > 1 or float between 0 and 1, default=None\n Minimum number of samples in an OPTICS cluster, expressed as an\n absolute number or a fraction of the number of samples (rounded to be\n at least 2). If ``None``, the value of ``min_samples`` is used instead.\n Used only when ``cluster_method='xi'``.\n\n algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto'\n Algorithm used to compute the nearest neighbors:\n\n - 'ball_tree' will use :class:`BallTree`\n - 'kd_tree' will use :class:`KDTree`\n - 'brute' will use a brute-force search.\n - 'auto' will attempt to decide the most appropriate algorithm\n based on the values passed to :meth:`fit` method. (default)\n\n Note: fitting on sparse input will override the setting of\n this parameter, using brute force.\n\n leaf_size : int, default=30\n Leaf size passed to :class:`BallTree` or :class:`KDTree`. This can\n affect the speed of the construction and query, as well as the memory\n required to store the tree. The optimal value depends on the\n nature of the problem.\n\n memory : str or object with the joblib.Memory interface, default=None\n Used to cache the output of the computation of the tree.\n By default, no caching is done. If a string is given, it is the\n path to the caching directory.\n\n n_jobs : int, default=None\n The number of parallel jobs to run for neighbors search.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. 
See :term:`Glossary `\n for more details.\n\n Attributes\n ----------\n labels_ : ndarray of shape (n_samples,)\n Cluster labels for each point in the dataset given to fit().\n Noisy samples and points which are not included in a leaf cluster\n of ``cluster_hierarchy_`` are labeled as -1.\n\n reachability_ : ndarray of shape (n_samples,)\n Reachability distances per sample, indexed by object order. Use\n ``clust.reachability_[clust.ordering_]`` to access in cluster order.\n\n ordering_ : ndarray of shape (n_samples,)\n The cluster ordered list of sample indices.\n\n core_distances_ : ndarray of shape (n_samples,)\n Distance at which each sample becomes a core point, indexed by object\n order. Points which will never be core have a distance of inf. Use\n ``clust.core_distances_[clust.ordering_]`` to access in cluster order.\n\n predecessor_ : ndarray of shape (n_samples,)\n Point that a sample was reached from, indexed by object order.\n Seed points have a predecessor of -1.\n\n cluster_hierarchy_ : ndarray of shape (n_clusters, 2)\n The list of clusters in the form of ``[start, end]`` in each row, with\n all indices inclusive. The clusters are ordered according to\n ``(end, -start)`` (ascending) so that larger clusters encompassing\n smaller clusters come after those smaller ones. Since ``labels_`` does\n not reflect the hierarchy, usually\n ``len(cluster_hierarchy_) > np.unique(optics.labels_)``. Please also\n note that these indices are of the ``ordering_``, i.e.\n ``X[ordering_][start:end + 1]`` form a cluster.\n Only available when ``cluster_method='xi'``.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n DBSCAN : A similar clustering for a specified neighborhood radius (eps).\n Our implementation is optimized for runtime.\n\n References\n ----------\n .. [1] Ankerst, Mihael, Markus M. Breunig, Hans-Peter Kriegel,\n and J\u00f6rg Sander. \"OPTICS: ordering points to identify the clustering\n structure.\" ACM SIGMOD Record 28, no. 2 (1999): 49-60.\n\n .. [2] Schubert, Erich, Michael Gertz.\n \"Improving the Cluster Structure Extracted from OPTICS Plots.\" Proc. of\n the Conference \"Lernen, Wissen, Daten, Analysen\" (LWDA) (2018): 318-329.\n\n Examples\n --------\n >>> from sklearn.cluster import OPTICS\n >>> import numpy as np\n >>> X = np.array([[1, 2], [2, 5], [3, 6],\n ... 
[8, 7], [8, 8], [7, 3]])\n >>> clustering = OPTICS(min_samples=2).fit(X)\n >>> clustering.labels_\n array([0, 0, 0, 1, 1, 1])\n \"\"\"\n \n def __init__(self, *, min_samples=5, max_eps=np.inf, metric='minkowski', p=2, metric_params=None, cluster_method='xi', eps=None, xi=0.05, predecessor_correction=True, min_cluster_size=None, algorithm='auto', leaf_size=30, memory=None, n_jobs=None):\n self.max_eps = max_eps\n self.min_samples = min_samples\n self.min_cluster_size = min_cluster_size\n self.algorithm = algorithm\n self.metric = metric\n self.metric_params = metric_params\n self.p = p\n self.leaf_size = leaf_size\n self.cluster_method = cluster_method\n self.eps = eps\n self.xi = xi\n self.predecessor_correction = predecessor_correction\n self.memory = memory\n self.n_jobs = n_jobs\n \n def fit(self, X, y=None):\n \"\"\"Perform OPTICS clustering.\n\n Extracts an ordered list of points and reachability distances, and\n performs initial clustering using ``max_eps`` distance specified at\n OPTICS object instantiation.\n\n Parameters\n ----------\n X : ndarray of shape (n_samples, n_features), or (n_samples, n_samples) if metric=\u2019precomputed\u2019\n A feature array, or array of distances between samples if\n metric='precomputed'.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n Returns\n -------\n self : object\n Returns a fitted instance of self.\n \"\"\"\n dtype = bool if self.metric in PAIRWISE_BOOLEAN_FUNCTIONS else float\n if dtype == bool and X.dtype != bool:\n msg = f'Data will be converted to boolean for metric {self.metric}, to avoid this warning, you may convert the data prior to calling fit.'\n warnings.warn(msg, DataConversionWarning)\n X = self._validate_data(X, dtype=dtype)\n memory = check_memory(self.memory)\n if self.cluster_method not in ['dbscan', 'xi']:\n raise ValueError(\"cluster_method should be one of 'dbscan' or 'xi' but is %s\" % self.cluster_method)\n (self.ordering_, self.core_distances_, self.reachability_, self.predecessor_) = memory.cache(compute_optics_graph)(X=X, min_samples=self.min_samples, algorithm=self.algorithm, leaf_size=self.leaf_size, metric=self.metric, metric_params=self.metric_params, p=self.p, n_jobs=self.n_jobs, max_eps=self.max_eps)\n if self.cluster_method == 'xi':\n (labels_, clusters_) = cluster_optics_xi(reachability=self.reachability_, predecessor=self.predecessor_, ordering=self.ordering_, min_samples=self.min_samples, min_cluster_size=self.min_cluster_size, xi=self.xi, predecessor_correction=self.predecessor_correction)\n self.cluster_hierarchy_ = clusters_\n elif self.cluster_method == 'dbscan':\n if self.eps is None:\n eps = self.max_eps\n else:\n eps = self.eps\n if eps > self.max_eps:\n raise ValueError('Specify an epsilon smaller than %s. Got %s.' % (self.max_eps, eps))\n labels_ = cluster_optics_dbscan(reachability=self.reachability_, core_distances=self.core_distances_, ordering=self.ordering_, eps=eps)\n self.labels_ = labels_\n return self\n" }, @@ -20111,7 +20177,7 @@ "sklearn.cluster._spectral.SpectralClustering._pairwise@getter" ], "is_public": true, - "description": "Apply clustering to a projection of the normalized Laplacian.\n\nIn practice Spectral Clustering is very useful when the structure of the individual clusters is highly non-convex, or more generally when a measure of the center and spread of the cluster is not a suitable description of the complete cluster, such as when clusters are nested circles on the 2D plane. 
If the affinity matrix is the adjacency matrix of a graph, this method can be used to find normalized graph cuts [1]_, [2]_. When calling ``fit``, an affinity matrix is constructed using either a kernel function such the Gaussian (aka RBF) kernel with Euclidean distance ``d(X, X)``:: np.exp(-gamma * d(X,X) ** 2) or a k-nearest neighbors connectivity matrix. Alternatively, a user-provided affinity matrix can be specified by setting ``affinity='precomputed'``. Read more in the :ref:`User Guide `.", + "description": "Apply clustering to a projection of the normalized Laplacian.\n\nIn practice Spectral Clustering is very useful when the structure of\nthe individual clusters is highly non-convex, or more generally when\na measure of the center and spread of the cluster is not a suitable\ndescription of the complete cluster, such as when clusters are\nnested circles on the 2D plane.\n\nIf the affinity matrix is the adjacency matrix of a graph, this method\ncan be used to find normalized graph cuts [1]_, [2]_.\n\nWhen calling ``fit``, an affinity matrix is constructed using either\na kernel function such the Gaussian (aka RBF) kernel with Euclidean\ndistance ``d(X, X)``::\n\n np.exp(-gamma * d(X,X) ** 2)\n\nor a k-nearest neighbors connectivity matrix.\n\nAlternatively, a user-provided affinity matrix can be specified by\nsetting ``affinity='precomputed'``.\n\nRead more in the :ref:`User Guide `.", "docstring": "Apply clustering to a projection of the normalized Laplacian.\n\n In practice Spectral Clustering is very useful when the structure of\n the individual clusters is highly non-convex, or more generally when\n a measure of the center and spread of the cluster is not a suitable\n description of the complete cluster, such as when clusters are\n nested circles on the 2D plane.\n\n If the affinity matrix is the adjacency matrix of a graph, this method\n can be used to find normalized graph cuts [1]_, [2]_.\n\n When calling ``fit``, an affinity matrix is constructed using either\n a kernel function such the Gaussian (aka RBF) kernel with Euclidean\n distance ``d(X, X)``::\n\n np.exp(-gamma * d(X,X) ** 2)\n\n or a k-nearest neighbors connectivity matrix.\n\n Alternatively, a user-provided affinity matrix can be specified by\n setting ``affinity='precomputed'``.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n n_clusters : int, default=8\n The dimension of the projection subspace.\n\n eigen_solver : {'arpack', 'lobpcg', 'amg'}, default=None\n The eigenvalue decomposition strategy to use. AMG requires pyamg\n to be installed. It can be faster on very large, sparse problems,\n but may also lead to instabilities. If None, then ``'arpack'`` is\n used. See [4]_ for more details regarding `'lobpcg'`.\n\n n_components : int, default=n_clusters\n Number of eigenvectors to use for the spectral embedding.\n\n random_state : int, RandomState instance, default=None\n A pseudo random number generator used for the initialization\n of the lobpcg eigenvectors decomposition when `eigen_solver ==\n 'amg'`, and for the K-Means initialization. Use an int to make\n the results deterministic across calls (See\n :term:`Glossary `).\n\n .. note::\n When using `eigen_solver == 'amg'`,\n it is necessary to also fix the global numpy seed with\n `np.random.seed(int)` to get deterministic results. See\n https://github.com/pyamg/pyamg/issues/139 for further\n information.\n\n n_init : int, default=10\n Number of time the k-means algorithm will be run with different\n centroid seeds. 
The final results will be the best output of n_init\n consecutive runs in terms of inertia. Only used if\n ``assign_labels='kmeans'``.\n\n gamma : float, default=1.0\n Kernel coefficient for rbf, poly, sigmoid, laplacian and chi2 kernels.\n Ignored for ``affinity='nearest_neighbors'``.\n\n affinity : str or callable, default='rbf'\n How to construct the affinity matrix.\n - 'nearest_neighbors': construct the affinity matrix by computing a\n graph of nearest neighbors.\n - 'rbf': construct the affinity matrix using a radial basis function\n (RBF) kernel.\n - 'precomputed': interpret ``X`` as a precomputed affinity matrix,\n where larger values indicate greater similarity between instances.\n - 'precomputed_nearest_neighbors': interpret ``X`` as a sparse graph\n of precomputed distances, and construct a binary affinity matrix\n from the ``n_neighbors`` nearest neighbors of each instance.\n - one of the kernels supported by\n :func:`~sklearn.metrics.pairwise_kernels`.\n\n Only kernels that produce similarity scores (non-negative values that\n increase with similarity) should be used. This property is not checked\n by the clustering algorithm.\n\n n_neighbors : int, default=10\n Number of neighbors to use when constructing the affinity matrix using\n the nearest neighbors method. Ignored for ``affinity='rbf'``.\n\n eigen_tol : float, default=0.0\n Stopping criterion for eigendecomposition of the Laplacian matrix\n when ``eigen_solver='arpack'``.\n\n assign_labels : {'kmeans', 'discretize'}, default='kmeans'\n The strategy for assigning labels in the embedding space. There are two\n ways to assign labels after the Laplacian embedding. k-means is a\n popular choice, but it can be sensitive to initialization.\n Discretization is another approach which is less sensitive to random\n initialization [3]_.\n\n degree : float, default=3\n Degree of the polynomial kernel. Ignored by other kernels.\n\n coef0 : float, default=1\n Zero coefficient for polynomial and sigmoid kernels.\n Ignored by other kernels.\n\n kernel_params : dict of str to any, default=None\n Parameters (keyword arguments) and values for kernel passed as\n callable object. Ignored by other kernels.\n\n n_jobs : int, default=None\n The number of parallel jobs to run when `affinity='nearest_neighbors'`\n or `affinity='precomputed_nearest_neighbors'`. The neighbors search\n will be done in parallel.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n verbose : bool, default=False\n Verbosity mode.\n\n .. versionadded:: 0.24\n\n Attributes\n ----------\n affinity_matrix_ : array-like of shape (n_samples, n_samples)\n Affinity matrix used for clustering. Available only after calling\n ``fit``.\n\n labels_ : ndarray of shape (n_samples,)\n Labels of each point\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. 
versionadded:: 1.0\n\n See Also\n --------\n sklearn.cluster.KMeans : K-Means clustering.\n sklearn.cluster.DBSCAN : Density-Based Spatial Clustering of\n Applications with Noise.\n\n Notes\n -----\n A distance matrix for which 0 indicates identical elements and high values\n indicate very dissimilar elements can be transformed into an affinity /\n similarity matrix that is well-suited for the algorithm by\n applying the Gaussian (aka RBF, heat) kernel::\n\n np.exp(- dist_matrix ** 2 / (2. * delta ** 2))\n\n where ``delta`` is a free parameter representing the width of the Gaussian\n kernel.\n\n An alternative is to take a symmetric version of the k-nearest neighbors\n connectivity matrix of the points.\n\n If the pyamg package is installed, it is used: this greatly\n speeds up computation.\n\n References\n ----------\n .. [1] `Normalized cuts and image segmentation, 2000\n Jianbo Shi, Jitendra Malik\n `_\n\n .. [2] `A Tutorial on Spectral Clustering, 2007\n Ulrike von Luxburg\n `_\n\n .. [3] `Multiclass spectral clustering, 2003\n Stella X. Yu, Jianbo Shi\n `_\n\n .. [4] `Toward the Optimal Preconditioned Eigensolver:\n Locally Optimal Block Preconditioned Conjugate Gradient Method, 2001.\n A. V. Knyazev\n SIAM Journal on Scientific Computing 23, no. 2, pp. 517-541.\n `_\n\n Examples\n --------\n >>> from sklearn.cluster import SpectralClustering\n >>> import numpy as np\n >>> X = np.array([[1, 1], [2, 1], [1, 0],\n ... [4, 7], [3, 5], [3, 6]])\n >>> clustering = SpectralClustering(n_clusters=2,\n ... assign_labels='discretize',\n ... random_state=0).fit(X)\n >>> clustering.labels_\n array([1, 1, 1, 0, 0, 0])\n >>> clustering\n SpectralClustering(assign_labels='discretize', n_clusters=2,\n random_state=0)\n ", "source_code": "\n\nclass SpectralClustering(ClusterMixin, BaseEstimator):\n \"\"\"Apply clustering to a projection of the normalized Laplacian.\n\n In practice Spectral Clustering is very useful when the structure of\n the individual clusters is highly non-convex, or more generally when\n a measure of the center and spread of the cluster is not a suitable\n description of the complete cluster, such as when clusters are\n nested circles on the 2D plane.\n\n If the affinity matrix is the adjacency matrix of a graph, this method\n can be used to find normalized graph cuts [1]_, [2]_.\n\n When calling ``fit``, an affinity matrix is constructed using either\n a kernel function such the Gaussian (aka RBF) kernel with Euclidean\n distance ``d(X, X)``::\n\n np.exp(-gamma * d(X,X) ** 2)\n\n or a k-nearest neighbors connectivity matrix.\n\n Alternatively, a user-provided affinity matrix can be specified by\n setting ``affinity='precomputed'``.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n n_clusters : int, default=8\n The dimension of the projection subspace.\n\n eigen_solver : {'arpack', 'lobpcg', 'amg'}, default=None\n The eigenvalue decomposition strategy to use. AMG requires pyamg\n to be installed. It can be faster on very large, sparse problems,\n but may also lead to instabilities. If None, then ``'arpack'`` is\n used. See [4]_ for more details regarding `'lobpcg'`.\n\n n_components : int, default=n_clusters\n Number of eigenvectors to use for the spectral embedding.\n\n random_state : int, RandomState instance, default=None\n A pseudo random number generator used for the initialization\n of the lobpcg eigenvectors decomposition when `eigen_solver ==\n 'amg'`, and for the K-Means initialization. 
Use an int to make\n the results deterministic across calls (See\n :term:`Glossary `).\n\n .. note::\n When using `eigen_solver == 'amg'`,\n it is necessary to also fix the global numpy seed with\n `np.random.seed(int)` to get deterministic results. See\n https://github.com/pyamg/pyamg/issues/139 for further\n information.\n\n n_init : int, default=10\n Number of time the k-means algorithm will be run with different\n centroid seeds. The final results will be the best output of n_init\n consecutive runs in terms of inertia. Only used if\n ``assign_labels='kmeans'``.\n\n gamma : float, default=1.0\n Kernel coefficient for rbf, poly, sigmoid, laplacian and chi2 kernels.\n Ignored for ``affinity='nearest_neighbors'``.\n\n affinity : str or callable, default='rbf'\n How to construct the affinity matrix.\n - 'nearest_neighbors': construct the affinity matrix by computing a\n graph of nearest neighbors.\n - 'rbf': construct the affinity matrix using a radial basis function\n (RBF) kernel.\n - 'precomputed': interpret ``X`` as a precomputed affinity matrix,\n where larger values indicate greater similarity between instances.\n - 'precomputed_nearest_neighbors': interpret ``X`` as a sparse graph\n of precomputed distances, and construct a binary affinity matrix\n from the ``n_neighbors`` nearest neighbors of each instance.\n - one of the kernels supported by\n :func:`~sklearn.metrics.pairwise_kernels`.\n\n Only kernels that produce similarity scores (non-negative values that\n increase with similarity) should be used. This property is not checked\n by the clustering algorithm.\n\n n_neighbors : int, default=10\n Number of neighbors to use when constructing the affinity matrix using\n the nearest neighbors method. Ignored for ``affinity='rbf'``.\n\n eigen_tol : float, default=0.0\n Stopping criterion for eigendecomposition of the Laplacian matrix\n when ``eigen_solver='arpack'``.\n\n assign_labels : {'kmeans', 'discretize'}, default='kmeans'\n The strategy for assigning labels in the embedding space. There are two\n ways to assign labels after the Laplacian embedding. k-means is a\n popular choice, but it can be sensitive to initialization.\n Discretization is another approach which is less sensitive to random\n initialization [3]_.\n\n degree : float, default=3\n Degree of the polynomial kernel. Ignored by other kernels.\n\n coef0 : float, default=1\n Zero coefficient for polynomial and sigmoid kernels.\n Ignored by other kernels.\n\n kernel_params : dict of str to any, default=None\n Parameters (keyword arguments) and values for kernel passed as\n callable object. Ignored by other kernels.\n\n n_jobs : int, default=None\n The number of parallel jobs to run when `affinity='nearest_neighbors'`\n or `affinity='precomputed_nearest_neighbors'`. The neighbors search\n will be done in parallel.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n verbose : bool, default=False\n Verbosity mode.\n\n .. versionadded:: 0.24\n\n Attributes\n ----------\n affinity_matrix_ : array-like of shape (n_samples, n_samples)\n Affinity matrix used for clustering. Available only after calling\n ``fit``.\n\n labels_ : ndarray of shape (n_samples,)\n Labels of each point\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. 
Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n sklearn.cluster.KMeans : K-Means clustering.\n sklearn.cluster.DBSCAN : Density-Based Spatial Clustering of\n Applications with Noise.\n\n Notes\n -----\n A distance matrix for which 0 indicates identical elements and high values\n indicate very dissimilar elements can be transformed into an affinity /\n similarity matrix that is well-suited for the algorithm by\n applying the Gaussian (aka RBF, heat) kernel::\n\n np.exp(- dist_matrix ** 2 / (2. * delta ** 2))\n\n where ``delta`` is a free parameter representing the width of the Gaussian\n kernel.\n\n An alternative is to take a symmetric version of the k-nearest neighbors\n connectivity matrix of the points.\n\n If the pyamg package is installed, it is used: this greatly\n speeds up computation.\n\n References\n ----------\n .. [1] `Normalized cuts and image segmentation, 2000\n Jianbo Shi, Jitendra Malik\n `_\n\n .. [2] `A Tutorial on Spectral Clustering, 2007\n Ulrike von Luxburg\n `_\n\n .. [3] `Multiclass spectral clustering, 2003\n Stella X. Yu, Jianbo Shi\n `_\n\n .. [4] `Toward the Optimal Preconditioned Eigensolver:\n Locally Optimal Block Preconditioned Conjugate Gradient Method, 2001.\n A. V. Knyazev\n SIAM Journal on Scientific Computing 23, no. 2, pp. 517-541.\n `_\n\n Examples\n --------\n >>> from sklearn.cluster import SpectralClustering\n >>> import numpy as np\n >>> X = np.array([[1, 1], [2, 1], [1, 0],\n ... [4, 7], [3, 5], [3, 6]])\n >>> clustering = SpectralClustering(n_clusters=2,\n ... assign_labels='discretize',\n ... random_state=0).fit(X)\n >>> clustering.labels_\n array([1, 1, 1, 0, 0, 0])\n >>> clustering\n SpectralClustering(assign_labels='discretize', n_clusters=2,\n random_state=0)\n \"\"\"\n \n def __init__(self, n_clusters=8, *, eigen_solver=None, n_components=None, random_state=None, n_init=10, gamma=1.0, affinity='rbf', n_neighbors=10, eigen_tol=0.0, assign_labels='kmeans', degree=3, coef0=1, kernel_params=None, n_jobs=None, verbose=False):\n self.n_clusters = n_clusters\n self.eigen_solver = eigen_solver\n self.n_components = n_components\n self.random_state = random_state\n self.n_init = n_init\n self.gamma = gamma\n self.affinity = affinity\n self.n_neighbors = n_neighbors\n self.eigen_tol = eigen_tol\n self.assign_labels = assign_labels\n self.degree = degree\n self.coef0 = coef0\n self.kernel_params = kernel_params\n self.n_jobs = n_jobs\n self.verbose = verbose\n \n def fit(self, X, y=None):\n \"\"\"Perform spectral clustering from features, or affinity matrix.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features) or (n_samples, n_samples)\n Training instances to cluster, similarities / affinities between\n instances if ``affinity='precomputed'``, or distances between\n instances if ``affinity='precomputed_nearest_neighbors``. If a\n sparse matrix is provided in a format other than ``csr_matrix``,\n ``csc_matrix``, or ``coo_matrix``, it will be converted into a\n sparse ``csr_matrix``.\n\n y : Ignored\n Not used, present here for API consistency by convention.\n\n Returns\n -------\n self : object\n A fitted instance of the estimator.\n \"\"\"\n X = self._validate_data(X, accept_sparse=['csr', 'csc', 'coo'], dtype=np.float64, ensure_min_samples=2)\n allow_squared = self.affinity in ['precomputed', 'precomputed_nearest_neighbors']\n if X.shape[0] == X.shape[1] and not allow_squared:\n warnings.warn('The spectral clustering API has changed. 
``fit``now constructs an affinity matrix from data. To use a custom affinity matrix, set ``affinity=precomputed``.')\n if self.affinity == 'nearest_neighbors':\n connectivity = kneighbors_graph(X, n_neighbors=self.n_neighbors, include_self=True, n_jobs=self.n_jobs)\n self.affinity_matrix_ = 0.5 * (connectivity + connectivity.T)\n elif self.affinity == 'precomputed_nearest_neighbors':\n estimator = NearestNeighbors(n_neighbors=self.n_neighbors, n_jobs=self.n_jobs, metric='precomputed').fit(X)\n connectivity = estimator.kneighbors_graph(X=X, mode='connectivity')\n self.affinity_matrix_ = 0.5 * (connectivity + connectivity.T)\n elif self.affinity == 'precomputed':\n self.affinity_matrix_ = X\n else:\n params = self.kernel_params\n if params is None:\n params = {}\n if not callable(self.affinity):\n params['gamma'] = self.gamma\n params['degree'] = self.degree\n params['coef0'] = self.coef0\n self.affinity_matrix_ = pairwise_kernels(X, metric=self.affinity, filter_params=True, **params)\n random_state = check_random_state(self.random_state)\n self.labels_ = spectral_clustering(self.affinity_matrix_, n_clusters=self.n_clusters, n_components=self.n_components, eigen_solver=self.eigen_solver, random_state=random_state, n_init=self.n_init, eigen_tol=self.eigen_tol, assign_labels=self.assign_labels, verbose=self.verbose)\n return self\n \n def fit_predict(self, X, y=None):\n \"\"\"Perform spectral clustering on `X` and return cluster labels.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features) or (n_samples, n_samples)\n Training instances to cluster, similarities / affinities between\n instances if ``affinity='precomputed'``, or distances between\n instances if ``affinity='precomputed_nearest_neighbors``. If a\n sparse matrix is provided in a format other than ``csr_matrix``,\n ``csc_matrix``, or ``coo_matrix``, it will be converted into a\n sparse ``csr_matrix``.\n\n y : Ignored\n Not used, present here for API consistency by convention.\n\n Returns\n -------\n labels : ndarray of shape (n_samples,)\n Cluster labels.\n \"\"\"\n return super().fit_predict(X, y)\n \n def _more_tags(self):\n return {'pairwise': self.affinity in ['precomputed', 'precomputed_nearest_neighbors']}\n \n @deprecated('Attribute `_pairwise` was deprecated in version 0.24 and will be removed in 1.1 (renaming of 0.26).')\n @property\n def _pairwise(self):\n return self.affinity in ['precomputed', 'precomputed_nearest_neighbors']\n" }, @@ -20146,7 +20212,7 @@ "sklearn.compose._column_transformer.ColumnTransformer._sk_visual_block_" ], "is_public": true, - "description": "Applies transformers to columns of an array or pandas DataFrame.\n\nThis estimator allows different columns or column subsets of the input to be transformed separately and the features generated by each transformer will be concatenated to form a single feature space. This is useful for heterogeneous or columnar data, to combine several feature extraction mechanisms or transformations into a single transformer. Read more in the :ref:`User Guide `. .. 
versionadded:: 0.20", + "description": "Applies transformers to columns of an array or pandas DataFrame.\n\nThis estimator allows different columns or column subsets of the input\nto be transformed separately and the features generated by each transformer\nwill be concatenated to form a single feature space.\nThis is useful for heterogeneous or columnar data, to combine several\nfeature extraction mechanisms or transformations into a single transformer.\n\nRead more in the :ref:`User Guide `.\n\n.. versionadded:: 0.20", "docstring": "Applies transformers to columns of an array or pandas DataFrame.\n\n This estimator allows different columns or column subsets of the input\n to be transformed separately and the features generated by each transformer\n will be concatenated to form a single feature space.\n This is useful for heterogeneous or columnar data, to combine several\n feature extraction mechanisms or transformations into a single transformer.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.20\n\n Parameters\n ----------\n transformers : list of tuples\n List of (name, transformer, columns) tuples specifying the\n transformer objects to be applied to subsets of the data.\n\n name : str\n Like in Pipeline and FeatureUnion, this allows the transformer and\n its parameters to be set using ``set_params`` and searched in grid\n search.\n transformer : {'drop', 'passthrough'} or estimator\n Estimator must support :term:`fit` and :term:`transform`.\n Special-cased strings 'drop' and 'passthrough' are accepted as\n well, to indicate to drop the columns or to pass them through\n untransformed, respectively.\n columns : str, array-like of str, int, array-like of int, array-like of bool, slice or callable\n Indexes the data on its second axis. Integers are interpreted as\n positional columns, while strings can reference DataFrame columns\n by name. A scalar string or int should be used where\n ``transformer`` expects X to be a 1d array-like (vector),\n otherwise a 2d array will be passed to the transformer.\n A callable is passed the input data `X` and can return any of the\n above. To select multiple columns by name or dtype, you can use\n :obj:`make_column_selector`.\n\n remainder : {'drop', 'passthrough'} or estimator, default='drop'\n By default, only the specified columns in `transformers` are\n transformed and combined in the output, and the non-specified\n columns are dropped. (default of ``'drop'``).\n By specifying ``remainder='passthrough'``, all remaining columns that\n were not specified in `transformers` will be automatically passed\n through. This subset of columns is concatenated with the output of\n the transformers.\n By setting ``remainder`` to be an estimator, the remaining\n non-specified columns will use the ``remainder`` estimator. The\n estimator must support :term:`fit` and :term:`transform`.\n Note that using this feature requires that the DataFrame columns\n input at :term:`fit` and :term:`transform` have identical order.\n\n sparse_threshold : float, default=0.3\n If the output of the different transformers contains sparse matrices,\n these will be stacked as a sparse matrix if the overall density is\n lower than this value. Use ``sparse_threshold=0`` to always return\n dense. 
When the transformed output consists of all dense data, the\n stacked result will be dense, and this keyword will be ignored.\n\n n_jobs : int, default=None\n Number of jobs to run in parallel.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n transformer_weights : dict, default=None\n Multiplicative weights for features per transformer. The output of the\n transformer is multiplied by these weights. Keys are transformer names,\n values the weights.\n\n verbose : bool, default=False\n If True, the time elapsed while fitting each transformer will be\n printed as it is completed.\n\n verbose_feature_names_out : bool, default=True\n If True, :meth:`get_feature_names_out` will prefix all feature names\n with the name of the transformer that generated that feature.\n If False, :meth:`get_feature_names_out` will not prefix any feature\n names and will error if feature names are not unique.\n\n .. versionadded:: 1.0\n\n Attributes\n ----------\n transformers_ : list\n The collection of fitted transformers as tuples of\n (name, fitted_transformer, column). `fitted_transformer` can be an\n estimator, 'drop', or 'passthrough'. In case there were no columns\n selected, this will be the unfitted transformer.\n If there are remaining columns, the final element is a tuple of the\n form:\n ('remainder', transformer, remaining_columns) corresponding to the\n ``remainder`` parameter. If there are remaining columns, then\n ``len(transformers_)==len(transformers)+1``, otherwise\n ``len(transformers_)==len(transformers)``.\n\n named_transformers_ : :class:`~sklearn.utils.Bunch`\n Read-only attribute to access any transformer by given name.\n Keys are transformer names and values are the fitted transformer\n objects.\n\n sparse_output_ : bool\n Boolean flag indicating whether the output of ``transform`` is a\n sparse matrix or a dense numpy array, which depends on the output\n of the individual transformers and the `sparse_threshold` keyword.\n\n output_indices_ : dict\n A dictionary from each transformer name to a slice, where the slice\n corresponds to indices in the transformed output. This is useful to\n inspect which transformer is responsible for which transformed\n feature(s).\n\n .. versionadded:: 1.0\n\n n_features_in_ : int\n Number of features seen during :term:`fit`. Only defined if the\n underlying transformers expose such an attribute when fit.\n\n .. versionadded:: 0.24\n\n See Also\n --------\n make_column_transformer : Convenience function for\n combining the outputs of multiple transformer objects applied to\n column subsets of the original feature space.\n make_column_selector : Convenience function for selecting\n columns based on datatype or the columns name with a regex pattern.\n\n Notes\n -----\n The order of the columns in the transformed feature matrix follows the\n order of how the columns are specified in the `transformers` list.\n Columns of the original feature matrix that are not specified are\n dropped from the resulting transformed feature matrix, unless specified\n in the `passthrough` keyword. Those columns specified with `passthrough`\n are added at the right to the output of the transformers.\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.compose import ColumnTransformer\n >>> from sklearn.preprocessing import Normalizer\n >>> ct = ColumnTransformer(\n ... [(\"norm1\", Normalizer(norm='l1'), [0, 1]),\n ... 
(\"norm2\", Normalizer(norm='l1'), slice(2, 4))])\n >>> X = np.array([[0., 1., 2., 2.],\n ... [1., 1., 0., 1.]])\n >>> # Normalizer scales each row of X to unit norm. A separate scaling\n >>> # is applied for the two first and two last elements of each\n >>> # row independently.\n >>> ct.fit_transform(X)\n array([[0. , 1. , 0.5, 0.5],\n [0.5, 0.5, 0. , 1. ]])\n ", "source_code": "\n\nclass ColumnTransformer(TransformerMixin, _BaseComposition):\n \"\"\"Applies transformers to columns of an array or pandas DataFrame.\n\n This estimator allows different columns or column subsets of the input\n to be transformed separately and the features generated by each transformer\n will be concatenated to form a single feature space.\n This is useful for heterogeneous or columnar data, to combine several\n feature extraction mechanisms or transformations into a single transformer.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.20\n\n Parameters\n ----------\n transformers : list of tuples\n List of (name, transformer, columns) tuples specifying the\n transformer objects to be applied to subsets of the data.\n\n name : str\n Like in Pipeline and FeatureUnion, this allows the transformer and\n its parameters to be set using ``set_params`` and searched in grid\n search.\n transformer : {'drop', 'passthrough'} or estimator\n Estimator must support :term:`fit` and :term:`transform`.\n Special-cased strings 'drop' and 'passthrough' are accepted as\n well, to indicate to drop the columns or to pass them through\n untransformed, respectively.\n columns : str, array-like of str, int, array-like of int, array-like of bool, slice or callable\n Indexes the data on its second axis. Integers are interpreted as\n positional columns, while strings can reference DataFrame columns\n by name. A scalar string or int should be used where\n ``transformer`` expects X to be a 1d array-like (vector),\n otherwise a 2d array will be passed to the transformer.\n A callable is passed the input data `X` and can return any of the\n above. To select multiple columns by name or dtype, you can use\n :obj:`make_column_selector`.\n\n remainder : {'drop', 'passthrough'} or estimator, default='drop'\n By default, only the specified columns in `transformers` are\n transformed and combined in the output, and the non-specified\n columns are dropped. (default of ``'drop'``).\n By specifying ``remainder='passthrough'``, all remaining columns that\n were not specified in `transformers` will be automatically passed\n through. This subset of columns is concatenated with the output of\n the transformers.\n By setting ``remainder`` to be an estimator, the remaining\n non-specified columns will use the ``remainder`` estimator. The\n estimator must support :term:`fit` and :term:`transform`.\n Note that using this feature requires that the DataFrame columns\n input at :term:`fit` and :term:`transform` have identical order.\n\n sparse_threshold : float, default=0.3\n If the output of the different transformers contains sparse matrices,\n these will be stacked as a sparse matrix if the overall density is\n lower than this value. Use ``sparse_threshold=0`` to always return\n dense. When the transformed output consists of all dense data, the\n stacked result will be dense, and this keyword will be ignored.\n\n n_jobs : int, default=None\n Number of jobs to run in parallel.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. 
See :term:`Glossary `\n for more details.\n\n transformer_weights : dict, default=None\n Multiplicative weights for features per transformer. The output of the\n transformer is multiplied by these weights. Keys are transformer names,\n values the weights.\n\n verbose : bool, default=False\n If True, the time elapsed while fitting each transformer will be\n printed as it is completed.\n\n verbose_feature_names_out : bool, default=True\n If True, :meth:`get_feature_names_out` will prefix all feature names\n with the name of the transformer that generated that feature.\n If False, :meth:`get_feature_names_out` will not prefix any feature\n names and will error if feature names are not unique.\n\n .. versionadded:: 1.0\n\n Attributes\n ----------\n transformers_ : list\n The collection of fitted transformers as tuples of\n (name, fitted_transformer, column). `fitted_transformer` can be an\n estimator, 'drop', or 'passthrough'. In case there were no columns\n selected, this will be the unfitted transformer.\n If there are remaining columns, the final element is a tuple of the\n form:\n ('remainder', transformer, remaining_columns) corresponding to the\n ``remainder`` parameter. If there are remaining columns, then\n ``len(transformers_)==len(transformers)+1``, otherwise\n ``len(transformers_)==len(transformers)``.\n\n named_transformers_ : :class:`~sklearn.utils.Bunch`\n Read-only attribute to access any transformer by given name.\n Keys are transformer names and values are the fitted transformer\n objects.\n\n sparse_output_ : bool\n Boolean flag indicating whether the output of ``transform`` is a\n sparse matrix or a dense numpy array, which depends on the output\n of the individual transformers and the `sparse_threshold` keyword.\n\n output_indices_ : dict\n A dictionary from each transformer name to a slice, where the slice\n corresponds to indices in the transformed output. This is useful to\n inspect which transformer is responsible for which transformed\n feature(s).\n\n .. versionadded:: 1.0\n\n n_features_in_ : int\n Number of features seen during :term:`fit`. Only defined if the\n underlying transformers expose such an attribute when fit.\n\n .. versionadded:: 0.24\n\n See Also\n --------\n make_column_transformer : Convenience function for\n combining the outputs of multiple transformer objects applied to\n column subsets of the original feature space.\n make_column_selector : Convenience function for selecting\n columns based on datatype or the columns name with a regex pattern.\n\n Notes\n -----\n The order of the columns in the transformed feature matrix follows the\n order of how the columns are specified in the `transformers` list.\n Columns of the original feature matrix that are not specified are\n dropped from the resulting transformed feature matrix, unless specified\n in the `passthrough` keyword. Those columns specified with `passthrough`\n are added at the right to the output of the transformers.\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.compose import ColumnTransformer\n >>> from sklearn.preprocessing import Normalizer\n >>> ct = ColumnTransformer(\n ... [(\"norm1\", Normalizer(norm='l1'), [0, 1]),\n ... (\"norm2\", Normalizer(norm='l1'), slice(2, 4))])\n >>> X = np.array([[0., 1., 2., 2.],\n ... [1., 1., 0., 1.]])\n >>> # Normalizer scales each row of X to unit norm. A separate scaling\n >>> # is applied for the two first and two last elements of each\n >>> # row independently.\n >>> ct.fit_transform(X)\n array([[0. , 1. 
, 0.5, 0.5],\n [0.5, 0.5, 0. , 1. ]])\n \"\"\"\n _required_parameters = ['transformers']\n \n def __init__(self, transformers, *, remainder='drop', sparse_threshold=0.3, n_jobs=None, transformer_weights=None, verbose=False, verbose_feature_names_out=True):\n self.transformers = transformers\n self.remainder = remainder\n self.sparse_threshold = sparse_threshold\n self.n_jobs = n_jobs\n self.transformer_weights = transformer_weights\n self.verbose = verbose\n self.verbose_feature_names_out = verbose_feature_names_out\n \n @property\n def _transformers(self):\n \"\"\"\n Internal list of transformer only containing the name and\n transformers, dropping the columns. This is for the implementation\n of get_params via BaseComposition._get_params which expects lists\n of tuples of len 2.\n \"\"\"\n return [(name, trans) for (name, trans, _) in self.transformers]\n \n @_transformers.setter\n def _transformers(self, value):\n self.transformers = [(name, trans, col) for ((name, trans), (_, _, col)) in zip(value, self.transformers)]\n \n def get_params(self, deep=True):\n \"\"\"Get parameters for this estimator.\n\n Returns the parameters given in the constructor as well as the\n estimators contained within the `transformers` of the\n `ColumnTransformer`.\n\n Parameters\n ----------\n deep : bool, default=True\n If True, will return the parameters for this estimator and\n contained subobjects that are estimators.\n\n Returns\n -------\n params : dict\n Parameter names mapped to their values.\n \"\"\"\n return self._get_params('_transformers', deep=deep)\n \n def set_params(self, **kwargs):\n \"\"\"Set the parameters of this estimator.\n\n Valid parameter keys can be listed with ``get_params()``. Note that you\n can directly set the parameters of the estimators contained in\n `transformers` of `ColumnTransformer`.\n\n Parameters\n ----------\n **kwargs : dict\n Estimator parameters.\n\n Returns\n -------\n self : ColumnTransformer\n This estimator.\n \"\"\"\n self._set_params('_transformers', **kwargs)\n return self\n \n def _iter(self, fitted=False, replace_strings=False, column_as_strings=False):\n \"\"\"\n Generate (name, trans, column, weight) tuples.\n\n If fitted=True, use the fitted transformers, else use the\n user specified transformers updated with converted column names\n and potentially appended with transformer for remainder.\n\n \"\"\"\n if fitted:\n transformers = self.transformers_\n else:\n transformers = [(name, trans, column) for ((name, trans, _), column) in zip(self.transformers, self._columns)]\n if self._remainder[2]:\n transformers = chain(transformers, [self._remainder])\n get_weight = (self.transformer_weights or {}).get\n for (name, trans, columns) in transformers:\n if replace_strings:\n if trans == 'passthrough':\n trans = FunctionTransformer(accept_sparse=True, check_inverse=False)\n elif trans == 'drop':\n continue\n elif _is_empty_column_selection(columns):\n continue\n if column_as_strings:\n columns_is_scalar = np.isscalar(columns)\n indices = self._transformer_to_input_indices[name]\n columns = self.feature_names_in_[indices]\n if columns_is_scalar:\n columns = columns[0]\n yield (name, trans, columns, get_weight(name))\n \n def _validate_transformers(self):\n if not self.transformers:\n return\n (names, transformers, _) = zip(*self.transformers)\n self._validate_names(names)\n for t in transformers:\n if t in ('drop', 'passthrough'):\n continue\n if not (hasattr(t, 'fit') or hasattr(t, 'fit_transform')) or not hasattr(t, 'transform'):\n raise TypeError(\"All 
estimators should implement fit and transform, or can be 'drop' or 'passthrough' specifiers. '%s' (type %s) doesn't.\" % (t, type(t)))\n \n def _validate_column_callables(self, X):\n \"\"\"\n Converts callable column specifications.\n \"\"\"\n all_columns = []\n transformer_to_input_indices = {}\n for (name, _, columns) in self.transformers:\n if callable(columns):\n columns = columns(X)\n all_columns.append(columns)\n transformer_to_input_indices[name] = _get_column_indices(X, columns)\n self._columns = all_columns\n self._transformer_to_input_indices = transformer_to_input_indices\n \n def _validate_remainder(self, X):\n \"\"\"\n Validates ``remainder`` and defines ``_remainder`` targeting\n the remaining columns.\n \"\"\"\n is_transformer = (hasattr(self.remainder, 'fit') or hasattr(self.remainder, 'fit_transform')) and hasattr(self.remainder, 'transform')\n if self.remainder not in ('drop', 'passthrough') and not is_transformer:\n raise ValueError(\"The remainder keyword needs to be one of 'drop', 'passthrough', or estimator. '%s' was passed instead\" % self.remainder)\n self._n_features = X.shape[1]\n cols = set(chain(*self._transformer_to_input_indices.values()))\n remaining = sorted(set(range(self._n_features)) - cols)\n self._remainder = ('remainder', self.remainder, remaining)\n self._transformer_to_input_indices['remainder'] = remaining\n \n @property\n def named_transformers_(self):\n \"\"\"Access the fitted transformer by name.\n\n Read-only attribute to access any transformer by given name.\n Keys are transformer names and values are the fitted transformer\n objects.\n \"\"\"\n return Bunch(**{name: trans for (name, trans, _) in self.transformers_})\n \n @deprecated('get_feature_names is deprecated in 1.0 and will be removed in 1.2. Please use get_feature_names_out instead.')\n def get_feature_names(self):\n \"\"\"Get feature names from all transformers.\n\n Returns\n -------\n feature_names : list of strings\n Names of the features produced by transform.\n \"\"\"\n check_is_fitted(self)\n feature_names = []\n for (name, trans, column, _) in self._iter(fitted=True):\n if trans == 'drop' or _is_empty_column_selection(column):\n continue\n if trans == 'passthrough':\n if hasattr(self, 'feature_names_in_'):\n if not isinstance(column, slice) and all((isinstance(col, str) for col in column)):\n feature_names.extend(column)\n else:\n feature_names.extend(self.feature_names_in_[column])\n else:\n indices = np.arange(self._n_features)\n feature_names.extend(['x%d' % i for i in indices[column]])\n continue\n if not hasattr(trans, 'get_feature_names'):\n raise AttributeError('Transformer %s (type %s) does not provide get_feature_names.' 
% (str(name), type(trans).__name__))\n feature_names.extend([f'{name}__{f}' for f in trans.get_feature_names()])\n return feature_names\n \n def _get_feature_name_out_for_transformer(self, name, trans, column, feature_names_in):\n \"\"\"Gets feature names of transformer.\n\n Used in conjunction with self._iter(fitted=True) in get_feature_names_out.\n \"\"\"\n if trans == 'drop' or _is_empty_column_selection(column):\n return\n elif trans == 'passthrough':\n if not isinstance(column, slice) and all((isinstance(col, str) for col in column)):\n return column\n else:\n return feature_names_in[column]\n if not hasattr(trans, 'get_feature_names_out'):\n raise AttributeError(f'Transformer {name} (type {type(trans).__name__}) does not provide get_feature_names_out.')\n if isinstance(column, Iterable) and not all((isinstance(col, str) for col in column)):\n column = _safe_indexing(feature_names_in, column)\n return trans.get_feature_names_out(column)\n \n def get_feature_names_out(self, input_features=None):\n \"\"\"Get output feature names for transformation.\n\n Parameters\n ----------\n input_features : array-like of str or None, default=None\n Input features.\n\n - If `input_features` is `None`, then `feature_names_in_` is\n used as feature names in. If `feature_names_in_` is not defined,\n then names are generated: `[x0, x1, ..., x(n_features_in_)]`.\n - If `input_features` is an array-like, then `input_features` must\n match `feature_names_in_` if `feature_names_in_` is defined.\n\n Returns\n -------\n feature_names_out : ndarray of str objects\n Transformed feature names.\n \"\"\"\n check_is_fitted(self)\n input_features = _check_feature_names_in(self, input_features)\n transformer_with_feature_names_out = []\n for (name, trans, column, _) in self._iter(fitted=True):\n feature_names_out = self._get_feature_name_out_for_transformer(name, trans, column, input_features)\n if feature_names_out is None:\n continue\n transformer_with_feature_names_out.append((name, feature_names_out))\n if not transformer_with_feature_names_out:\n return np.array([], dtype=object)\n if self.verbose_feature_names_out:\n names = list(chain.from_iterable(((f'{name}__{i}' for i in feature_names_out) for (name, feature_names_out) in transformer_with_feature_names_out)))\n return np.asarray(names, dtype=object)\n feature_names_count = Counter(chain.from_iterable((s for (_, s) in transformer_with_feature_names_out)))\n top_6_overlap = [name for (name, count) in feature_names_count.most_common(6) if count > 1]\n top_6_overlap.sort()\n if top_6_overlap:\n if len(top_6_overlap) == 6:\n names_repr = str(top_6_overlap[:5])[:-1] + ', ...]'\n else:\n names_repr = str(top_6_overlap)\n raise ValueError(f'Output feature names: {names_repr} are not unique. Please set verbose_feature_names_out=True to add prefixes to feature names')\n return np.concatenate([name for (_, name) in transformer_with_feature_names_out])\n \n def _update_fitted_transformers(self, transformers):\n fitted_transformers = iter(transformers)\n transformers_ = []\n for (name, old, column, _) in self._iter():\n if old == 'drop':\n trans = 'drop'\n elif old == 'passthrough':\n next(fitted_transformers)\n trans = 'passthrough'\n elif _is_empty_column_selection(column):\n trans = old\n else:\n trans = next(fitted_transformers)\n transformers_.append((name, trans, column))\n assert not list(fitted_transformers)\n self.transformers_ = transformers_\n \n def _validate_output(self, result):\n \"\"\"\n Ensure that the output of each transformer is 2D. 
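A minimal sketch of the feature-name behaviour covered in this entry, using an invented two-column frame: with the default `verbose_feature_names_out=True`, every name returned by `get_feature_names_out` is prefixed by the transformer that produced it, so names stay unique across transformers.

import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Toy frame; the column names are illustrative only.
X = pd.DataFrame({"city": ["London", "Paris"], "rating": [5, 3]})

ct = ColumnTransformer(
    [("cat", OneHotEncoder(), ["city"]),
     ("num", StandardScaler(), ["rating"])],
    verbose_feature_names_out=True,  # default: prefix with the transformer name
)
ct.fit(X)
print(ct.get_feature_names_out())
# expected to look like ['cat__city_London', 'cat__city_Paris', 'num__rating']

Setting `verbose_feature_names_out=False` drops the prefixes and raises if the resulting names collide, which is the error path implemented above.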
Otherwise\n hstack can raise an error or produce incorrect results.\n \"\"\"\n names = [name for (name, _, _, _) in self._iter(fitted=True, replace_strings=True)]\n for (Xs, name) in zip(result, names):\n if not getattr(Xs, 'ndim', 0) == 2:\n raise ValueError(\"The output of the '{0}' transformer should be 2D (scipy matrix, array, or pandas DataFrame).\".format(name))\n \n def _record_output_indices(self, Xs):\n \"\"\"\n Record which transformer produced which column.\n \"\"\"\n idx = 0\n self.output_indices_ = {}\n for (transformer_idx, (name, _, _, _)) in enumerate(self._iter(fitted=True, replace_strings=True)):\n n_columns = Xs[transformer_idx].shape[1]\n self.output_indices_[name] = slice(idx, idx + n_columns)\n idx += n_columns\n all_names = [t[0] for t in self.transformers] + ['remainder']\n for name in all_names:\n if name not in self.output_indices_:\n self.output_indices_[name] = slice(0, 0)\n \n def _log_message(self, name, idx, total):\n if not self.verbose:\n return None\n return '(%d of %d) Processing %s' % (idx, total, name)\n \n def _fit_transform(self, X, y, func, fitted=False, column_as_strings=False):\n \"\"\"\n Private function to fit and/or transform on demand.\n\n Return value (transformers and/or transformed X data) depends\n on the passed function.\n ``fitted=True`` ensures the fitted transformers are used.\n \"\"\"\n transformers = list(self._iter(fitted=fitted, replace_strings=True, column_as_strings=column_as_strings))\n try:\n return Parallel(n_jobs=self.n_jobs)((delayed(func)(transformer=clone(trans) if not fitted else trans, X=_safe_indexing(X, column, axis=1), y=y, weight=weight, message_clsname='ColumnTransformer', message=self._log_message(name, idx, len(transformers))) for (idx, (name, trans, column, weight)) in enumerate(transformers, 1)))\n except ValueError as e:\n if 'Expected 2D array, got 1D array instead' in str(e):\n raise ValueError(_ERR_MSG_1DCOLUMN) from e\n else:\n raise\n \n def fit(self, X, y=None):\n \"\"\"Fit all transformers using X.\n\n Parameters\n ----------\n X : {array-like, dataframe} of shape (n_samples, n_features)\n Input data, of which specified subsets are used to fit the\n transformers.\n\n y : array-like of shape (n_samples,...), default=None\n Targets for supervised learning.\n\n Returns\n -------\n self : ColumnTransformer\n This estimator.\n \"\"\"\n self.fit_transform(X, y=y)\n return self\n \n def fit_transform(self, X, y=None):\n \"\"\"Fit all transformers, transform the data and concatenate results.\n\n Parameters\n ----------\n X : {array-like, dataframe} of shape (n_samples, n_features)\n Input data, of which specified subsets are used to fit the\n transformers.\n\n y : array-like of shape (n_samples,), default=None\n Targets for supervised learning.\n\n Returns\n -------\n X_t : {array-like, sparse matrix} of shape (n_samples, sum_n_components)\n Horizontally stacked results of transformers. sum_n_components is the\n sum of n_components (output dimension) over transformers. 
If\n any result is a sparse matrix, everything will be converted to\n sparse matrices.\n \"\"\"\n self._check_feature_names(X, reset=True)\n X = _check_X(X)\n self._check_n_features(X, reset=True)\n self._validate_transformers()\n self._validate_column_callables(X)\n self._validate_remainder(X)\n result = self._fit_transform(X, y, _fit_transform_one)\n if not result:\n self._update_fitted_transformers([])\n return np.zeros((X.shape[0], 0))\n (Xs, transformers) = zip(*result)\n if any((sparse.issparse(X) for X in Xs)):\n nnz = sum((X.nnz if sparse.issparse(X) else X.size for X in Xs))\n total = sum((X.shape[0] * X.shape[1] if sparse.issparse(X) else X.size for X in Xs))\n density = nnz / total\n self.sparse_output_ = density < self.sparse_threshold\n else:\n self.sparse_output_ = False\n self._update_fitted_transformers(transformers)\n self._validate_output(Xs)\n self._record_output_indices(Xs)\n return self._hstack(list(Xs))\n \n def transform(self, X):\n \"\"\"Transform X separately by each transformer, concatenate results.\n\n Parameters\n ----------\n X : {array-like, dataframe} of shape (n_samples, n_features)\n The data to be transformed by subset.\n\n Returns\n -------\n X_t : {array-like, sparse matrix} of shape (n_samples, sum_n_components)\n Horizontally stacked results of transformers. sum_n_components is the\n sum of n_components (output dimension) over transformers. If\n any result is a sparse matrix, everything will be converted to\n sparse matrices.\n \"\"\"\n check_is_fitted(self)\n X = _check_X(X)\n fit_dataframe_and_transform_dataframe = hasattr(self, 'feature_names_in_') and hasattr(X, 'columns')\n if fit_dataframe_and_transform_dataframe:\n named_transformers = self.named_transformers_\n non_dropped_indices = [ind for (name, ind) in self._transformer_to_input_indices.items() if name in named_transformers and isinstance(named_transformers[name], str) and named_transformers[name] != 'drop']\n all_indices = set(chain(*non_dropped_indices))\n all_names = set((self.feature_names_in_[ind] for ind in all_indices))\n diff = all_names - set(X.columns)\n if diff:\n raise ValueError(f'columns are missing: {diff}')\n else:\n self._check_n_features(X, reset=False)\n Xs = self._fit_transform(X, None, _transform_one, fitted=True, column_as_strings=fit_dataframe_and_transform_dataframe)\n self._validate_output(Xs)\n if not Xs:\n return np.zeros((X.shape[0], 0))\n return self._hstack(list(Xs))\n \n def _hstack(self, Xs):\n \"\"\"Stacks Xs horizontally.\n\n This allows subclasses to control the stacking behavior, while reusing\n everything else from ColumnTransformer.\n\n Parameters\n ----------\n Xs : list of {array-like, sparse matrix, dataframe}\n \"\"\"\n if self.sparse_output_:\n try:\n converted_Xs = [check_array(X, accept_sparse=True, force_all_finite=False) for X in Xs]\n except ValueError as e:\n raise ValueError('For a sparse output, all columns should be a numeric or convertible to a numeric.') from e\n return sparse.hstack(converted_Xs).tocsr()\n else:\n Xs = [f.toarray() if sparse.issparse(f) else f for f in Xs]\n return np.hstack(Xs)\n \n def _sk_visual_block_(self):\n if isinstance(self.remainder, str) and self.remainder == 'drop':\n transformers = self.transformers\n elif hasattr(self, '_remainder'):\n remainder_columns = self._remainder[2]\n if hasattr(self, 'feature_names_in_') and remainder_columns and not all((isinstance(col, str) for col in remainder_columns)):\n remainder_columns = self.feature_names_in_[remainder_columns].tolist()\n transformers = 
chain(self.transformers, [('remainder', self.remainder, remainder_columns)])\n else:\n transformers = chain(self.transformers, [('remainder', self.remainder, '')])\n (names, transformers, name_details) = zip(*transformers)\n return _VisualBlock('parallel', transformers, names=names, name_details=name_details)\n" }, @@ -20160,7 +20226,7 @@ "sklearn.compose._column_transformer.make_column_selector.__call__" ], "is_public": true, - "description": "Create a callable to select columns to be used with :class:`ColumnTransformer`.\n\n:func:`make_column_selector` can select columns based on datatype or the columns name with a regex. When using multiple selection criteria, **all** criteria must match for a column to be selected.", + "description": "Create a callable to select columns to be used with\n:class:`ColumnTransformer`.\n\n:func:`make_column_selector` can select columns based on datatype or the\ncolumns name with a regex. When using multiple selection criteria, **all**\ncriteria must match for a column to be selected.", "docstring": "Create a callable to select columns to be used with\n :class:`ColumnTransformer`.\n\n :func:`make_column_selector` can select columns based on datatype or the\n columns name with a regex. When using multiple selection criteria, **all**\n criteria must match for a column to be selected.\n\n Parameters\n ----------\n pattern : str, default=None\n Name of columns containing this regex pattern will be included. If\n None, column selection will not be selected based on pattern.\n\n dtype_include : column dtype or list of column dtypes, default=None\n A selection of dtypes to include. For more details, see\n :meth:`pandas.DataFrame.select_dtypes`.\n\n dtype_exclude : column dtype or list of column dtypes, default=None\n A selection of dtypes to exclude. For more details, see\n :meth:`pandas.DataFrame.select_dtypes`.\n\n Returns\n -------\n selector : callable\n Callable for column selection to be used by a\n :class:`ColumnTransformer`.\n\n See Also\n --------\n ColumnTransformer : Class that allows combining the\n outputs of multiple transformer objects used on column subsets\n of the data into a single feature space.\n\n Examples\n --------\n >>> from sklearn.preprocessing import StandardScaler, OneHotEncoder\n >>> from sklearn.compose import make_column_transformer\n >>> from sklearn.compose import make_column_selector\n >>> import numpy as np\n >>> import pandas as pd # doctest: +SKIP\n >>> X = pd.DataFrame({'city': ['London', 'London', 'Paris', 'Sallisaw'],\n ... 'rating': [5, 3, 4, 5]}) # doctest: +SKIP\n >>> ct = make_column_transformer(\n ... (StandardScaler(),\n ... make_column_selector(dtype_include=np.number)), # rating\n ... (OneHotEncoder(),\n ... make_column_selector(dtype_include=object))) # city\n >>> ct.fit_transform(X) # doctest: +SKIP\n array([[ 0.90453403, 1. , 0. , 0. ],\n [-1.50755672, 1. , 0. , 0. ],\n [-0.30151134, 0. , 1. , 0. ],\n [ 0.90453403, 0. , 0. , 1. ]])\n ", "source_code": "\n\nclass make_column_selector:\n \"\"\"Create a callable to select columns to be used with\n :class:`ColumnTransformer`.\n\n :func:`make_column_selector` can select columns based on datatype or the\n columns name with a regex. When using multiple selection criteria, **all**\n criteria must match for a column to be selected.\n\n Parameters\n ----------\n pattern : str, default=None\n Name of columns containing this regex pattern will be included. 
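A short sketch of the selector in use, with an invented frame: the regex `pattern` and `dtype_include` are applied together, and a column is selected only when every criterion matches.

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Illustrative data; names and values are made up.
X = pd.DataFrame({"city": ["London", "Paris"],
                  "zip_code": ["E1", "75001"],
                  "rating": [5, 3]})

ct = ColumnTransformer([
    ("num", StandardScaler(),
     make_column_selector(dtype_include=np.number)),                    # -> rating
    ("cat", OneHotEncoder(),
     make_column_selector(pattern="city|zip", dtype_include=object)),   # -> city, zip_code
])
ct.fit_transform(X)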
If\n None, column selection will not be selected based on pattern.\n\n dtype_include : column dtype or list of column dtypes, default=None\n A selection of dtypes to include. For more details, see\n :meth:`pandas.DataFrame.select_dtypes`.\n\n dtype_exclude : column dtype or list of column dtypes, default=None\n A selection of dtypes to exclude. For more details, see\n :meth:`pandas.DataFrame.select_dtypes`.\n\n Returns\n -------\n selector : callable\n Callable for column selection to be used by a\n :class:`ColumnTransformer`.\n\n See Also\n --------\n ColumnTransformer : Class that allows combining the\n outputs of multiple transformer objects used on column subsets\n of the data into a single feature space.\n\n Examples\n --------\n >>> from sklearn.preprocessing import StandardScaler, OneHotEncoder\n >>> from sklearn.compose import make_column_transformer\n >>> from sklearn.compose import make_column_selector\n >>> import numpy as np\n >>> import pandas as pd # doctest: +SKIP\n >>> X = pd.DataFrame({'city': ['London', 'London', 'Paris', 'Sallisaw'],\n ... 'rating': [5, 3, 4, 5]}) # doctest: +SKIP\n >>> ct = make_column_transformer(\n ... (StandardScaler(),\n ... make_column_selector(dtype_include=np.number)), # rating\n ... (OneHotEncoder(),\n ... make_column_selector(dtype_include=object))) # city\n >>> ct.fit_transform(X) # doctest: +SKIP\n array([[ 0.90453403, 1. , 0. , 0. ],\n [-1.50755672, 1. , 0. , 0. ],\n [-0.30151134, 0. , 1. , 0. ],\n [ 0.90453403, 0. , 0. , 1. ]])\n \"\"\"\n \n def __init__(self, pattern=None, *, dtype_include=None, dtype_exclude=None):\n self.pattern = pattern\n self.dtype_include = dtype_include\n self.dtype_exclude = dtype_exclude\n \n def __call__(self, df):\n \"\"\"Callable for column selection to be used by a\n :class:`ColumnTransformer`.\n\n Parameters\n ----------\n df : dataframe of shape (n_features, n_samples)\n DataFrame to select columns from.\n \"\"\"\n if not hasattr(df, 'iloc'):\n raise ValueError('make_column_selector can only be applied to pandas dataframes')\n df_row = df.iloc[:1]\n if self.dtype_include is not None or self.dtype_exclude is not None:\n df_row = df_row.select_dtypes(include=self.dtype_include, exclude=self.dtype_exclude)\n cols = df_row.columns\n if self.pattern is not None:\n cols = cols[cols.str.contains(self.pattern, regex=True)]\n return cols.tolist()\n" }, @@ -20178,7 +20244,7 @@ "sklearn.compose._target.TransformedTargetRegressor.n_features_in_@getter" ], "is_public": true, - "description": "Meta-estimator to regress on a transformed target.\n\nUseful for applying a non-linear transformation to the target `y` in regression problems. This transformation can be given as a Transformer such as the :class:`~sklearn.preprocessing.QuantileTransformer` or as a function and its inverse such as `np.log` and `np.exp`. The computation during :meth:`fit` is:: regressor.fit(X, func(y)) or:: regressor.fit(X, transformer.transform(y)) The computation during :meth:`predict` is:: inverse_func(regressor.predict(X)) or:: transformer.inverse_transform(regressor.predict(X)) Read more in the :ref:`User Guide `. .. versionadded:: 0.20", + "description": "Meta-estimator to regress on a transformed target.\n\nUseful for applying a non-linear transformation to the target `y` in\nregression problems. 
This transformation can be given as a Transformer\nsuch as the :class:`~sklearn.preprocessing.QuantileTransformer` or as a\nfunction and its inverse such as `np.log` and `np.exp`.\n\nThe computation during :meth:`fit` is::\n\n regressor.fit(X, func(y))\n\nor::\n\n regressor.fit(X, transformer.transform(y))\n\nThe computation during :meth:`predict` is::\n\n inverse_func(regressor.predict(X))\n\nor::\n\n transformer.inverse_transform(regressor.predict(X))\n\nRead more in the :ref:`User Guide `.\n\n.. versionadded:: 0.20", "docstring": "Meta-estimator to regress on a transformed target.\n\n Useful for applying a non-linear transformation to the target `y` in\n regression problems. This transformation can be given as a Transformer\n such as the :class:`~sklearn.preprocessing.QuantileTransformer` or as a\n function and its inverse such as `np.log` and `np.exp`.\n\n The computation during :meth:`fit` is::\n\n regressor.fit(X, func(y))\n\n or::\n\n regressor.fit(X, transformer.transform(y))\n\n The computation during :meth:`predict` is::\n\n inverse_func(regressor.predict(X))\n\n or::\n\n transformer.inverse_transform(regressor.predict(X))\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.20\n\n Parameters\n ----------\n regressor : object, default=None\n Regressor object such as derived from\n :class:`~sklearn.base.RegressorMixin`. This regressor will\n automatically be cloned each time prior to fitting. If `regressor is\n None`, :class:`~sklearn.linear_model.LinearRegression` is created and used.\n\n transformer : object, default=None\n Estimator object such as derived from\n :class:`~sklearn.base.TransformerMixin`. Cannot be set at the same time\n as `func` and `inverse_func`. If `transformer is None` as well as\n `func` and `inverse_func`, the transformer will be an identity\n transformer. Note that the transformer will be cloned during fitting.\n Also, the transformer is restricting `y` to be a numpy array.\n\n func : function, default=None\n Function to apply to `y` before passing to :meth:`fit`. Cannot be set\n at the same time as `transformer`. The function needs to return a\n 2-dimensional array. If `func is None`, the function used will be the\n identity function.\n\n inverse_func : function, default=None\n Function to apply to the prediction of the regressor. Cannot be set at\n the same time as `transformer`. The function needs to return a\n 2-dimensional array. The inverse function is used to return\n predictions to the same space of the original training labels.\n\n check_inverse : bool, default=True\n Whether to check that `transform` followed by `inverse_transform`\n or `func` followed by `inverse_func` leads to the original targets.\n\n Attributes\n ----------\n regressor_ : object\n Fitted regressor.\n\n transformer_ : object\n Transformer used in :meth:`fit` and :meth:`predict`.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`. Only defined if the\n underlying regressor exposes such an attribute when fit.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n sklearn.preprocessing.FunctionTransformer : Construct a transformer from an\n arbitrary callable.\n\n Notes\n -----\n Internally, the target `y` is always converted into a 2-dimensional array\n to be used by scikit-learn transformers. 
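The two computation paths listed above can be sketched side by side on synthetic data; the numbers below are invented and only meant to show the wiring of `func`/`inverse_func` versus a `transformer` object.

import numpy as np
from sklearn.compose import TransformedTargetRegressor
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import QuantileTransformer

X = np.arange(100).reshape(-1, 1)
y = np.exp(0.05 * X).ravel()

# Function pair: the regressor is fitted on np.log(y); predictions pass through np.exp.
tt_func = TransformedTargetRegressor(regressor=LinearRegression(),
                                     func=np.log, inverse_func=np.exp).fit(X, y)

# Transformer object instead of a function pair (the two options cannot be combined).
tt_trans = TransformedTargetRegressor(
    regressor=LinearRegression(),
    transformer=QuantileTransformer(n_quantiles=10, output_distribution="normal"),
).fit(X, y)

tt_func.predict(X[:3]), tt_trans.predict(X[:3])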
At the time of prediction, the\n output will be reshaped to a have the same number of dimensions as `y`.\n\n See :ref:`examples/compose/plot_transformed_target.py\n `.\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.linear_model import LinearRegression\n >>> from sklearn.compose import TransformedTargetRegressor\n >>> tt = TransformedTargetRegressor(regressor=LinearRegression(),\n ... func=np.log, inverse_func=np.exp)\n >>> X = np.arange(4).reshape(-1, 1)\n >>> y = np.exp(2 * X).ravel()\n >>> tt.fit(X, y)\n TransformedTargetRegressor(...)\n >>> tt.score(X, y)\n 1.0\n >>> tt.regressor_.coef_\n array([2.])\n ", "source_code": "\n\nclass TransformedTargetRegressor(RegressorMixin, BaseEstimator):\n \"\"\"Meta-estimator to regress on a transformed target.\n\n Useful for applying a non-linear transformation to the target `y` in\n regression problems. This transformation can be given as a Transformer\n such as the :class:`~sklearn.preprocessing.QuantileTransformer` or as a\n function and its inverse such as `np.log` and `np.exp`.\n\n The computation during :meth:`fit` is::\n\n regressor.fit(X, func(y))\n\n or::\n\n regressor.fit(X, transformer.transform(y))\n\n The computation during :meth:`predict` is::\n\n inverse_func(regressor.predict(X))\n\n or::\n\n transformer.inverse_transform(regressor.predict(X))\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.20\n\n Parameters\n ----------\n regressor : object, default=None\n Regressor object such as derived from\n :class:`~sklearn.base.RegressorMixin`. This regressor will\n automatically be cloned each time prior to fitting. If `regressor is\n None`, :class:`~sklearn.linear_model.LinearRegression` is created and used.\n\n transformer : object, default=None\n Estimator object such as derived from\n :class:`~sklearn.base.TransformerMixin`. Cannot be set at the same time\n as `func` and `inverse_func`. If `transformer is None` as well as\n `func` and `inverse_func`, the transformer will be an identity\n transformer. Note that the transformer will be cloned during fitting.\n Also, the transformer is restricting `y` to be a numpy array.\n\n func : function, default=None\n Function to apply to `y` before passing to :meth:`fit`. Cannot be set\n at the same time as `transformer`. The function needs to return a\n 2-dimensional array. If `func is None`, the function used will be the\n identity function.\n\n inverse_func : function, default=None\n Function to apply to the prediction of the regressor. Cannot be set at\n the same time as `transformer`. The function needs to return a\n 2-dimensional array. The inverse function is used to return\n predictions to the same space of the original training labels.\n\n check_inverse : bool, default=True\n Whether to check that `transform` followed by `inverse_transform`\n or `func` followed by `inverse_func` leads to the original targets.\n\n Attributes\n ----------\n regressor_ : object\n Fitted regressor.\n\n transformer_ : object\n Transformer used in :meth:`fit` and :meth:`predict`.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`. Only defined if the\n underlying regressor exposes such an attribute when fit.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. 
versionadded:: 1.0\n\n See Also\n --------\n sklearn.preprocessing.FunctionTransformer : Construct a transformer from an\n arbitrary callable.\n\n Notes\n -----\n Internally, the target `y` is always converted into a 2-dimensional array\n to be used by scikit-learn transformers. At the time of prediction, the\n output will be reshaped to a have the same number of dimensions as `y`.\n\n See :ref:`examples/compose/plot_transformed_target.py\n `.\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.linear_model import LinearRegression\n >>> from sklearn.compose import TransformedTargetRegressor\n >>> tt = TransformedTargetRegressor(regressor=LinearRegression(),\n ... func=np.log, inverse_func=np.exp)\n >>> X = np.arange(4).reshape(-1, 1)\n >>> y = np.exp(2 * X).ravel()\n >>> tt.fit(X, y)\n TransformedTargetRegressor(...)\n >>> tt.score(X, y)\n 1.0\n >>> tt.regressor_.coef_\n array([2.])\n \"\"\"\n \n def __init__(self, regressor=None, *, transformer=None, func=None, inverse_func=None, check_inverse=True):\n self.regressor = regressor\n self.transformer = transformer\n self.func = func\n self.inverse_func = inverse_func\n self.check_inverse = check_inverse\n \n def _fit_transformer(self, y):\n \"\"\"Check transformer and fit transformer.\n\n Create the default transformer, fit it and make additional inverse\n check on a subset (optional).\n\n \"\"\"\n if self.transformer is not None and (self.func is not None or self.inverse_func is not None):\n raise ValueError(\"'transformer' and functions 'func'/'inverse_func' cannot both be set.\")\n elif self.transformer is not None:\n self.transformer_ = clone(self.transformer)\n else:\n if self.func is not None and self.inverse_func is None:\n raise ValueError(\"When 'func' is provided, 'inverse_func' must also be provided\")\n self.transformer_ = FunctionTransformer(func=self.func, inverse_func=self.inverse_func, validate=True, check_inverse=self.check_inverse)\n self.transformer_.fit(y)\n if self.check_inverse:\n idx_selected = slice(None, None, max(1, y.shape[0] // 10))\n y_sel = _safe_indexing(y, idx_selected)\n y_sel_t = self.transformer_.transform(y_sel)\n if not np.allclose(y_sel, self.transformer_.inverse_transform(y_sel_t)):\n warnings.warn(\"The provided functions or transformer are not strictly inverse of each other. 
If you are sure you want to proceed regardless, set 'check_inverse=False'\", UserWarning)\n \n def fit(self, X, y, **fit_params):\n \"\"\"Fit the model according to the given training data.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training vector, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\n y : array-like of shape (n_samples,)\n Target values.\n\n **fit_params : dict\n Parameters passed to the `fit` method of the underlying\n regressor.\n\n Returns\n -------\n self : object\n Fitted estimator.\n \"\"\"\n y = check_array(y, accept_sparse=False, force_all_finite=True, ensure_2d=False, dtype='numeric', allow_nd=True)\n self._training_dim = y.ndim\n if y.ndim == 1:\n y_2d = y.reshape(-1, 1)\n else:\n y_2d = y\n self._fit_transformer(y_2d)\n y_trans = self.transformer_.transform(y_2d)\n if y_trans.ndim == 2 and y_trans.shape[1] == 1:\n y_trans = y_trans.squeeze(axis=1)\n if self.regressor is None:\n from ..linear_model import LinearRegression\n self.regressor_ = LinearRegression()\n else:\n self.regressor_ = clone(self.regressor)\n self.regressor_.fit(X, y_trans, **fit_params)\n if hasattr(self.regressor_, 'feature_names_in_'):\n self.feature_names_in_ = self.regressor_.feature_names_in_\n return self\n \n def predict(self, X, **predict_params):\n \"\"\"Predict using the base regressor, applying inverse.\n\n The regressor is used to predict and the `inverse_func` or\n `inverse_transform` is applied before returning the prediction.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Samples.\n\n **predict_params : dict of str -> object\n Parameters passed to the `predict` method of the underlying\n regressor.\n\n Returns\n -------\n y_hat : ndarray of shape (n_samples,)\n Predicted values.\n \"\"\"\n check_is_fitted(self)\n pred = self.regressor_.predict(X, **predict_params)\n if pred.ndim == 1:\n pred_trans = self.transformer_.inverse_transform(pred.reshape(-1, 1))\n else:\n pred_trans = self.transformer_.inverse_transform(pred)\n if self._training_dim == 1 and pred_trans.ndim == 2 and pred_trans.shape[1] == 1:\n pred_trans = pred_trans.squeeze(axis=1)\n return pred_trans\n \n def _more_tags(self):\n regressor = self.regressor\n if regressor is None:\n from ..linear_model import LinearRegression\n regressor = LinearRegression()\n return {'poor_score': True, 'multioutput': _safe_tags(regressor, key='multioutput')}\n \n @property\n def n_features_in_(self):\n \"\"\"Number of features seen during :term:`fit`.\"\"\"\n try:\n check_is_fitted(self)\n except NotFittedError as nfe:\n raise AttributeError('{} object has no n_features_in_ attribute.'.format(self.__class__.__name__)) from nfe\n return self.regressor_.n_features_in_\n" }, @@ -20229,7 +20295,7 @@ "sklearn.covariance._graph_lasso.GraphicalLasso.fit" ], "is_public": true, - "description": "Sparse inverse covariance estimation with an l1-penalized estimator.\n\nRead more in the :ref:`User Guide `. .. versionchanged:: v0.20 GraphLasso has been renamed to GraphicalLasso", + "description": "Sparse inverse covariance estimation with an l1-penalized estimator.\n\nRead more in the :ref:`User Guide `.\n\n.. versionchanged:: v0.20\n GraphLasso has been renamed to GraphicalLasso", "docstring": "Sparse inverse covariance estimation with an l1-penalized estimator.\n\n Read more in the :ref:`User Guide `.\n\n .. 
versionchanged:: v0.20\n GraphLasso has been renamed to GraphicalLasso\n\n Parameters\n ----------\n alpha : float, default=0.01\n The regularization parameter: the higher alpha, the more\n regularization, the sparser the inverse covariance.\n Range is (0, inf].\n\n mode : {'cd', 'lars'}, default='cd'\n The Lasso solver to use: coordinate descent or LARS. Use LARS for\n very sparse underlying graphs, where p > n. Elsewhere prefer cd\n which is more numerically stable.\n\n tol : float, default=1e-4\n The tolerance to declare convergence: if the dual gap goes below\n this value, iterations are stopped. Range is (0, inf].\n\n enet_tol : float, default=1e-4\n The tolerance for the elastic net solver used to calculate the descent\n direction. This parameter controls the accuracy of the search direction\n for a given column update, not of the overall parameter estimate. Only\n used for mode='cd'. Range is (0, inf].\n\n max_iter : int, default=100\n The maximum number of iterations.\n\n verbose : bool, default=False\n If verbose is True, the objective function and dual gap are\n plotted at each iteration.\n\n assume_centered : bool, default=False\n If True, data are not centered before computation.\n Useful when working with data whose mean is almost, but not exactly\n zero.\n If False, data are centered before computation.\n\n Attributes\n ----------\n location_ : ndarray of shape (n_features,)\n Estimated location, i.e. the estimated mean.\n\n covariance_ : ndarray of shape (n_features, n_features)\n Estimated covariance matrix\n\n precision_ : ndarray of shape (n_features, n_features)\n Estimated pseudo inverse matrix.\n\n n_iter_ : int\n Number of iterations run.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n graphical_lasso : L1-penalized covariance estimator.\n GraphicalLassoCV : Sparse inverse covariance with\n cross-validated choice of the l1 penalty.\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.covariance import GraphicalLasso\n >>> true_cov = np.array([[0.8, 0.0, 0.2, 0.0],\n ... [0.0, 0.4, 0.0, 0.0],\n ... [0.2, 0.0, 0.3, 0.1],\n ... [0.0, 0.0, 0.1, 0.7]])\n >>> np.random.seed(0)\n >>> X = np.random.multivariate_normal(mean=[0, 0, 0, 0],\n ... cov=true_cov,\n ... size=200)\n >>> cov = GraphicalLasso().fit(X)\n >>> np.around(cov.covariance_, decimals=3)\n array([[0.816, 0.049, 0.218, 0.019],\n [0.049, 0.364, 0.017, 0.034],\n [0.218, 0.017, 0.322, 0.093],\n [0.019, 0.034, 0.093, 0.69 ]])\n >>> np.around(cov.location_, decimals=3)\n array([0.073, 0.04 , 0.038, 0.143])\n ", "source_code": "\n\nclass GraphicalLasso(EmpiricalCovariance):\n \"\"\"Sparse inverse covariance estimation with an l1-penalized estimator.\n\n Read more in the :ref:`User Guide `.\n\n .. versionchanged:: v0.20\n GraphLasso has been renamed to GraphicalLasso\n\n Parameters\n ----------\n alpha : float, default=0.01\n The regularization parameter: the higher alpha, the more\n regularization, the sparser the inverse covariance.\n Range is (0, inf].\n\n mode : {'cd', 'lars'}, default='cd'\n The Lasso solver to use: coordinate descent or LARS. Use LARS for\n very sparse underlying graphs, where p > n. 
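A rough, synthetic illustration of the `alpha` parameter described above (data and threshold are invented): a larger l1 penalty drives more off-diagonal entries of the estimated precision matrix to zero.

import numpy as np
from sklearn.covariance import GraphicalLasso

rng = np.random.RandomState(0)
X = rng.randn(60, 5)  # 60 samples, 5 roughly uncorrelated features

for alpha in (0.01, 0.5):
    model = GraphicalLasso(alpha=alpha, mode="cd", max_iter=200).fit(X)
    n_zero = int(np.sum(np.abs(model.precision_) < 1e-8))
    print(alpha, n_zero)  # the zero count grows with alpha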
Elsewhere prefer cd\n which is more numerically stable.\n\n tol : float, default=1e-4\n The tolerance to declare convergence: if the dual gap goes below\n this value, iterations are stopped. Range is (0, inf].\n\n enet_tol : float, default=1e-4\n The tolerance for the elastic net solver used to calculate the descent\n direction. This parameter controls the accuracy of the search direction\n for a given column update, not of the overall parameter estimate. Only\n used for mode='cd'. Range is (0, inf].\n\n max_iter : int, default=100\n The maximum number of iterations.\n\n verbose : bool, default=False\n If verbose is True, the objective function and dual gap are\n plotted at each iteration.\n\n assume_centered : bool, default=False\n If True, data are not centered before computation.\n Useful when working with data whose mean is almost, but not exactly\n zero.\n If False, data are centered before computation.\n\n Attributes\n ----------\n location_ : ndarray of shape (n_features,)\n Estimated location, i.e. the estimated mean.\n\n covariance_ : ndarray of shape (n_features, n_features)\n Estimated covariance matrix\n\n precision_ : ndarray of shape (n_features, n_features)\n Estimated pseudo inverse matrix.\n\n n_iter_ : int\n Number of iterations run.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n graphical_lasso : L1-penalized covariance estimator.\n GraphicalLassoCV : Sparse inverse covariance with\n cross-validated choice of the l1 penalty.\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.covariance import GraphicalLasso\n >>> true_cov = np.array([[0.8, 0.0, 0.2, 0.0],\n ... [0.0, 0.4, 0.0, 0.0],\n ... [0.2, 0.0, 0.3, 0.1],\n ... [0.0, 0.0, 0.1, 0.7]])\n >>> np.random.seed(0)\n >>> X = np.random.multivariate_normal(mean=[0, 0, 0, 0],\n ... cov=true_cov,\n ... 
size=200)\n >>> cov = GraphicalLasso().fit(X)\n >>> np.around(cov.covariance_, decimals=3)\n array([[0.816, 0.049, 0.218, 0.019],\n [0.049, 0.364, 0.017, 0.034],\n [0.218, 0.017, 0.322, 0.093],\n [0.019, 0.034, 0.093, 0.69 ]])\n >>> np.around(cov.location_, decimals=3)\n array([0.073, 0.04 , 0.038, 0.143])\n \"\"\"\n \n def __init__(self, alpha=0.01, *, mode='cd', tol=0.0001, enet_tol=0.0001, max_iter=100, verbose=False, assume_centered=False):\n super().__init__(assume_centered=assume_centered)\n self.alpha = alpha\n self.mode = mode\n self.tol = tol\n self.enet_tol = enet_tol\n self.max_iter = max_iter\n self.verbose = verbose\n \n def fit(self, X, y=None):\n \"\"\"Fit the GraphicalLasso model to X.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Data from which to compute the covariance estimate.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n Returns\n -------\n self : object\n Returns the instance itself.\n \"\"\"\n X = self._validate_data(X, ensure_min_features=2, ensure_min_samples=2, estimator=self)\n if self.assume_centered:\n self.location_ = np.zeros(X.shape[1])\n else:\n self.location_ = X.mean(0)\n emp_cov = empirical_covariance(X, assume_centered=self.assume_centered)\n (self.covariance_, self.precision_, self.n_iter_) = graphical_lasso(emp_cov, alpha=self.alpha, mode=self.mode, tol=self.tol, enet_tol=self.enet_tol, max_iter=self.max_iter, verbose=self.verbose, return_n_iter=True)\n return self\n" }, @@ -20245,7 +20311,7 @@ "sklearn.covariance._graph_lasso.GraphicalLassoCV.cv_alphas_@getter" ], "is_public": true, - "description": "Sparse inverse covariance w/ cross-validated choice of the l1 penalty.\n\nSee glossary entry for :term:`cross-validation estimator`. Read more in the :ref:`User Guide `. .. versionchanged:: v0.20 GraphLassoCV has been renamed to GraphicalLassoCV", + "description": "Sparse inverse covariance w/ cross-validated choice of the l1 penalty.\n\nSee glossary entry for :term:`cross-validation estimator`.\n\nRead more in the :ref:`User Guide `.\n\n.. versionchanged:: v0.20\n GraphLassoCV has been renamed to GraphicalLassoCV", "docstring": "Sparse inverse covariance w/ cross-validated choice of the l1 penalty.\n\n See glossary entry for :term:`cross-validation estimator`.\n\n Read more in the :ref:`User Guide `.\n\n .. versionchanged:: v0.20\n GraphLassoCV has been renamed to GraphicalLassoCV\n\n Parameters\n ----------\n alphas : int or array-like of shape (n_alphas,), dtype=float, default=4\n If an integer is given, it fixes the number of points on the\n grids of alpha to be used. If a list is given, it gives the\n grid to be used. See the notes in the class docstring for\n more details. Range is (0, inf] when floats given.\n\n n_refinements : int, default=4\n The number of times the grid is refined. Not used if explicit\n values of alphas are passed. Range is [1, inf).\n\n cv : int, cross-validation generator or iterable, default=None\n Determines the cross-validation splitting strategy.\n Possible inputs for cv are:\n\n - None, to use the default 5-fold cross-validation,\n - integer, to specify the number of folds.\n - :term:`CV splitter`,\n - An iterable yielding (train, test) splits as arrays of indices.\n\n For integer/None inputs :class:`KFold` is used.\n\n Refer :ref:`User Guide ` for the various\n cross-validation strategies that can be used here.\n\n .. 
versionchanged:: 0.20\n ``cv`` default value if None changed from 3-fold to 5-fold.\n\n tol : float, default=1e-4\n The tolerance to declare convergence: if the dual gap goes below\n this value, iterations are stopped. Range is (0, inf].\n\n enet_tol : float, default=1e-4\n The tolerance for the elastic net solver used to calculate the descent\n direction. This parameter controls the accuracy of the search direction\n for a given column update, not of the overall parameter estimate. Only\n used for mode='cd'. Range is (0, inf].\n\n max_iter : int, default=100\n Maximum number of iterations.\n\n mode : {'cd', 'lars'}, default='cd'\n The Lasso solver to use: coordinate descent or LARS. Use LARS for\n very sparse underlying graphs, where number of features is greater\n than number of samples. Elsewhere prefer cd which is more numerically\n stable.\n\n n_jobs : int, default=None\n Number of jobs to run in parallel.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n .. versionchanged:: v0.20\n `n_jobs` default changed from 1 to None\n\n verbose : bool, default=False\n If verbose is True, the objective function and duality gap are\n printed at each iteration.\n\n assume_centered : bool, default=False\n If True, data are not centered before computation.\n Useful when working with data whose mean is almost, but not exactly\n zero.\n If False, data are centered before computation.\n\n Attributes\n ----------\n location_ : ndarray of shape (n_features,)\n Estimated location, i.e. the estimated mean.\n\n covariance_ : ndarray of shape (n_features, n_features)\n Estimated covariance matrix.\n\n precision_ : ndarray of shape (n_features, n_features)\n Estimated precision matrix (inverse covariance).\n\n alpha_ : float\n Penalization parameter selected.\n\n cv_alphas_ : list of shape (n_alphas,), dtype=float\n All penalization parameters explored.\n\n .. deprecated:: 0.24\n The `cv_alphas_` attribute is deprecated in version 0.24 in favor\n of `cv_results_['alphas']` and will be removed in version\n 1.1 (renaming of 0.26).\n\n grid_scores_ : ndarray of shape (n_alphas, n_folds)\n Log-likelihood score on left-out data across folds.\n\n .. deprecated:: 0.24\n The `grid_scores_` attribute is deprecated in version 0.24 in favor\n of `cv_results_` and will be removed in version\n 1.1 (renaming of 0.26).\n\n cv_results_ : dict of ndarrays\n A dict with keys:\n\n alphas : ndarray of shape (n_alphas,)\n All penalization parameters explored.\n\n split(k)_test_score : ndarray of shape (n_alphas,)\n Log-likelihood score on left-out data across (k)th fold.\n\n .. versionadded:: 1.0\n\n mean_test_score : ndarray of shape (n_alphas,)\n Mean of scores over the folds.\n\n .. versionadded:: 1.0\n\n std_test_score : ndarray of shape (n_alphas,)\n Standard deviation of scores over the folds.\n\n .. versionadded:: 1.0\n\n split(k)_score : ndarray of shape (n_alphas,)\n Log-likelihood score on left-out data across (k)th fold.\n\n .. deprecated:: 1.0\n `split(k)_score` is deprecated in 1.0 and will be removed in 1.2.\n Use `split(k)_test_score` instead.\n\n mean_score : ndarray of shape (n_alphas,)\n Mean of scores over the folds.\n\n .. deprecated:: 1.0\n `mean_score` is deprecated in 1.0 and will be removed in 1.2.\n Use `mean_test_score` instead.\n\n std_score : ndarray of shape (n_alphas,)\n Standard deviation of scores over the folds.\n\n .. 
deprecated:: 1.0\n `std_score` is deprecated in 1.0 and will be removed in 1.2.\n Use `std_test_score` instead.\n\n .. versionadded:: 0.24\n\n n_iter_ : int\n Number of iterations run for the optimal alpha.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n graphical_lasso : L1-penalized covariance estimator.\n GraphicalLasso : Sparse inverse covariance with\n cross-validated choice of the l1 penalty.\n\n Notes\n -----\n The search for the optimal penalization parameter (alpha) is done on an\n iteratively refined grid: first the cross-validated scores on a grid are\n computed, then a new refined grid is centered around the maximum, and so\n on.\n\n One of the challenges which is faced here is that the solvers can\n fail to converge to a well-conditioned estimate. The corresponding\n values of alpha then come out as missing values, but the optimum may\n be close to these missing values.\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.covariance import GraphicalLassoCV\n >>> true_cov = np.array([[0.8, 0.0, 0.2, 0.0],\n ... [0.0, 0.4, 0.0, 0.0],\n ... [0.2, 0.0, 0.3, 0.1],\n ... [0.0, 0.0, 0.1, 0.7]])\n >>> np.random.seed(0)\n >>> X = np.random.multivariate_normal(mean=[0, 0, 0, 0],\n ... cov=true_cov,\n ... size=200)\n >>> cov = GraphicalLassoCV().fit(X)\n >>> np.around(cov.covariance_, decimals=3)\n array([[0.816, 0.051, 0.22 , 0.017],\n [0.051, 0.364, 0.018, 0.036],\n [0.22 , 0.018, 0.322, 0.094],\n [0.017, 0.036, 0.094, 0.69 ]])\n >>> np.around(cov.location_, decimals=3)\n array([0.073, 0.04 , 0.038, 0.143])\n ", "source_code": "\n\nclass GraphicalLassoCV(GraphicalLasso):\n \"\"\"Sparse inverse covariance w/ cross-validated choice of the l1 penalty.\n\n See glossary entry for :term:`cross-validation estimator`.\n\n Read more in the :ref:`User Guide `.\n\n .. versionchanged:: v0.20\n GraphLassoCV has been renamed to GraphicalLassoCV\n\n Parameters\n ----------\n alphas : int or array-like of shape (n_alphas,), dtype=float, default=4\n If an integer is given, it fixes the number of points on the\n grids of alpha to be used. If a list is given, it gives the\n grid to be used. See the notes in the class docstring for\n more details. Range is (0, inf] when floats given.\n\n n_refinements : int, default=4\n The number of times the grid is refined. Not used if explicit\n values of alphas are passed. Range is [1, inf).\n\n cv : int, cross-validation generator or iterable, default=None\n Determines the cross-validation splitting strategy.\n Possible inputs for cv are:\n\n - None, to use the default 5-fold cross-validation,\n - integer, to specify the number of folds.\n - :term:`CV splitter`,\n - An iterable yielding (train, test) splits as arrays of indices.\n\n For integer/None inputs :class:`KFold` is used.\n\n Refer :ref:`User Guide ` for the various\n cross-validation strategies that can be used here.\n\n .. versionchanged:: 0.20\n ``cv`` default value if None changed from 3-fold to 5-fold.\n\n tol : float, default=1e-4\n The tolerance to declare convergence: if the dual gap goes below\n this value, iterations are stopped. Range is (0, inf].\n\n enet_tol : float, default=1e-4\n The tolerance for the elastic net solver used to calculate the descent\n direction. 
This parameter controls the accuracy of the search direction\n for a given column update, not of the overall parameter estimate. Only\n used for mode='cd'. Range is (0, inf].\n\n max_iter : int, default=100\n Maximum number of iterations.\n\n mode : {'cd', 'lars'}, default='cd'\n The Lasso solver to use: coordinate descent or LARS. Use LARS for\n very sparse underlying graphs, where number of features is greater\n than number of samples. Elsewhere prefer cd which is more numerically\n stable.\n\n n_jobs : int, default=None\n Number of jobs to run in parallel.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n .. versionchanged:: v0.20\n `n_jobs` default changed from 1 to None\n\n verbose : bool, default=False\n If verbose is True, the objective function and duality gap are\n printed at each iteration.\n\n assume_centered : bool, default=False\n If True, data are not centered before computation.\n Useful when working with data whose mean is almost, but not exactly\n zero.\n If False, data are centered before computation.\n\n Attributes\n ----------\n location_ : ndarray of shape (n_features,)\n Estimated location, i.e. the estimated mean.\n\n covariance_ : ndarray of shape (n_features, n_features)\n Estimated covariance matrix.\n\n precision_ : ndarray of shape (n_features, n_features)\n Estimated precision matrix (inverse covariance).\n\n alpha_ : float\n Penalization parameter selected.\n\n cv_alphas_ : list of shape (n_alphas,), dtype=float\n All penalization parameters explored.\n\n .. deprecated:: 0.24\n The `cv_alphas_` attribute is deprecated in version 0.24 in favor\n of `cv_results_['alphas']` and will be removed in version\n 1.1 (renaming of 0.26).\n\n grid_scores_ : ndarray of shape (n_alphas, n_folds)\n Log-likelihood score on left-out data across folds.\n\n .. deprecated:: 0.24\n The `grid_scores_` attribute is deprecated in version 0.24 in favor\n of `cv_results_` and will be removed in version\n 1.1 (renaming of 0.26).\n\n cv_results_ : dict of ndarrays\n A dict with keys:\n\n alphas : ndarray of shape (n_alphas,)\n All penalization parameters explored.\n\n split(k)_test_score : ndarray of shape (n_alphas,)\n Log-likelihood score on left-out data across (k)th fold.\n\n .. versionadded:: 1.0\n\n mean_test_score : ndarray of shape (n_alphas,)\n Mean of scores over the folds.\n\n .. versionadded:: 1.0\n\n std_test_score : ndarray of shape (n_alphas,)\n Standard deviation of scores over the folds.\n\n .. versionadded:: 1.0\n\n split(k)_score : ndarray of shape (n_alphas,)\n Log-likelihood score on left-out data across (k)th fold.\n\n .. deprecated:: 1.0\n `split(k)_score` is deprecated in 1.0 and will be removed in 1.2.\n Use `split(k)_test_score` instead.\n\n mean_score : ndarray of shape (n_alphas,)\n Mean of scores over the folds.\n\n .. deprecated:: 1.0\n `mean_score` is deprecated in 1.0 and will be removed in 1.2.\n Use `mean_test_score` instead.\n\n std_score : ndarray of shape (n_alphas,)\n Standard deviation of scores over the folds.\n\n .. deprecated:: 1.0\n `std_score` is deprecated in 1.0 and will be removed in 1.2.\n Use `std_test_score` instead.\n\n .. versionadded:: 0.24\n\n n_iter_ : int\n Number of iterations run for the optimal alpha.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. 
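A brief sketch of reading the selected penalty and the `cv_results_` entries documented above; it reuses the covariance matrix from the docstring example and assumes the 1.0 key names (`alphas`, `mean_test_score`).

import numpy as np
from sklearn.covariance import GraphicalLassoCV

rng = np.random.RandomState(0)
true_cov = np.array([[0.8, 0.0, 0.2, 0.0],
                     [0.0, 0.4, 0.0, 0.0],
                     [0.2, 0.0, 0.3, 0.1],
                     [0.0, 0.0, 0.1, 0.7]])
X = rng.multivariate_normal(mean=np.zeros(4), cov=true_cov, size=200)

cov = GraphicalLassoCV(cv=5).fit(X)
print(cov.alpha_)                          # penalty selected on the refined grid
print(cov.cv_results_["alphas"])           # all penalties explored
print(cov.cv_results_["mean_test_score"])  # mean log-likelihood per alpha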
Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n graphical_lasso : L1-penalized covariance estimator.\n GraphicalLasso : Sparse inverse covariance with\n cross-validated choice of the l1 penalty.\n\n Notes\n -----\n The search for the optimal penalization parameter (alpha) is done on an\n iteratively refined grid: first the cross-validated scores on a grid are\n computed, then a new refined grid is centered around the maximum, and so\n on.\n\n One of the challenges which is faced here is that the solvers can\n fail to converge to a well-conditioned estimate. The corresponding\n values of alpha then come out as missing values, but the optimum may\n be close to these missing values.\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.covariance import GraphicalLassoCV\n >>> true_cov = np.array([[0.8, 0.0, 0.2, 0.0],\n ... [0.0, 0.4, 0.0, 0.0],\n ... [0.2, 0.0, 0.3, 0.1],\n ... [0.0, 0.0, 0.1, 0.7]])\n >>> np.random.seed(0)\n >>> X = np.random.multivariate_normal(mean=[0, 0, 0, 0],\n ... cov=true_cov,\n ... size=200)\n >>> cov = GraphicalLassoCV().fit(X)\n >>> np.around(cov.covariance_, decimals=3)\n array([[0.816, 0.051, 0.22 , 0.017],\n [0.051, 0.364, 0.018, 0.036],\n [0.22 , 0.018, 0.322, 0.094],\n [0.017, 0.036, 0.094, 0.69 ]])\n >>> np.around(cov.location_, decimals=3)\n array([0.073, 0.04 , 0.038, 0.143])\n \"\"\"\n \n def __init__(self, *, alphas=4, n_refinements=4, cv=None, tol=0.0001, enet_tol=0.0001, max_iter=100, mode='cd', n_jobs=None, verbose=False, assume_centered=False):\n super().__init__(mode=mode, tol=tol, verbose=verbose, enet_tol=enet_tol, max_iter=max_iter, assume_centered=assume_centered)\n self.alphas = alphas\n self.n_refinements = n_refinements\n self.cv = cv\n self.n_jobs = n_jobs\n \n def fit(self, X, y=None):\n \"\"\"Fit the GraphicalLasso covariance model to X.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Data from which to compute the covariance estimate.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n Returns\n -------\n self : object\n Returns the instance itself.\n \"\"\"\n X = self._validate_data(X, ensure_min_features=2, estimator=self)\n if self.assume_centered:\n self.location_ = np.zeros(X.shape[1])\n else:\n self.location_ = X.mean(0)\n emp_cov = empirical_covariance(X, assume_centered=self.assume_centered)\n cv = check_cv(self.cv, y, classifier=False)\n path = list()\n n_alphas = self.alphas\n inner_verbose = max(0, self.verbose - 1)\n if isinstance(n_alphas, Sequence):\n alphas = self.alphas\n n_refinements = 1\n else:\n n_refinements = self.n_refinements\n alpha_1 = alpha_max(emp_cov)\n alpha_0 = 0.01 * alpha_1\n alphas = np.logspace(np.log10(alpha_0), np.log10(alpha_1), n_alphas)[::-1]\n t0 = time.time()\n for i in range(n_refinements):\n with warnings.catch_warnings():\n warnings.simplefilter('ignore', ConvergenceWarning)\n this_path = Parallel(n_jobs=self.n_jobs, verbose=self.verbose)((delayed(graphical_lasso_path)(X[train], alphas=alphas, X_test=X[test], mode=self.mode, tol=self.tol, enet_tol=self.enet_tol, max_iter=int(0.1 * self.max_iter), verbose=inner_verbose) for (train, test) in cv.split(X, y)))\n (covs, _, scores) = zip(*this_path)\n covs = zip(*covs)\n scores = zip(*scores)\n path.extend(zip(alphas, scores, covs))\n path = sorted(path, key=operator.itemgetter(0), reverse=True)\n best_score = -np.inf\n last_finite_idx = 0\n for (index, (alpha, scores, _)) in enumerate(path):\n this_score = 
np.mean(scores)\n if this_score >= 0.1 / np.finfo(np.float64).eps:\n this_score = np.nan\n if np.isfinite(this_score):\n last_finite_idx = index\n if this_score >= best_score:\n best_score = this_score\n best_index = index\n if best_index == 0:\n alpha_1 = path[0][0]\n alpha_0 = path[1][0]\n elif best_index == last_finite_idx and not best_index == len(path) - 1:\n alpha_1 = path[best_index][0]\n alpha_0 = path[best_index + 1][0]\n elif best_index == len(path) - 1:\n alpha_1 = path[best_index][0]\n alpha_0 = 0.01 * path[best_index][0]\n else:\n alpha_1 = path[best_index - 1][0]\n alpha_0 = path[best_index + 1][0]\n if not isinstance(n_alphas, Sequence):\n alphas = np.logspace(np.log10(alpha_1), np.log10(alpha_0), n_alphas + 2)\n alphas = alphas[1:-1]\n if self.verbose and n_refinements > 1:\n print('[GraphicalLassoCV] Done refinement % 2i out of %i: % 3is' % (i + 1, n_refinements, time.time() - t0))\n path = list(zip(*path))\n grid_scores = list(path[1])\n alphas = list(path[0])\n alphas.append(0)\n grid_scores.append(cross_val_score(EmpiricalCovariance(), X, cv=cv, n_jobs=self.n_jobs, verbose=inner_verbose))\n grid_scores = np.array(grid_scores)\n self.cv_results_ = _DictWithDeprecatedKeys(alphas=np.array(alphas))\n for i in range(grid_scores.shape[1]):\n self.cv_results_._set_deprecated(grid_scores[:, i], new_key=f'split{i}_test_score', deprecated_key=f'split{i}_score')\n self.cv_results_._set_deprecated(np.mean(grid_scores, axis=1), new_key='mean_test_score', deprecated_key='mean_score')\n self.cv_results_._set_deprecated(np.std(grid_scores, axis=1), new_key='std_test_score', deprecated_key='std_score')\n best_alpha = alphas[best_index]\n self.alpha_ = best_alpha\n (self.covariance_, self.precision_, self.n_iter_) = graphical_lasso(emp_cov, alpha=best_alpha, mode=self.mode, tol=self.tol, enet_tol=self.enet_tol, max_iter=self.max_iter, verbose=inner_verbose, return_n_iter=True)\n return self\n \n @deprecated('The `grid_scores_` attribute is deprecated in version 0.24 in favor of `cv_results_` and will be removed in version 1.1 (renaming of 0.26).')\n @property\n def grid_scores_(self):\n n_splits = len([key for key in self.cv_results_ if key.startswith('split') and key.endswith('_test_score')])\n return np.asarray([self.cv_results_['split{}_test_score'.format(i)] for i in range(n_splits)]).T\n \n @deprecated(\"The `cv_alphas_` attribute is deprecated in version 0.24 in favor of `cv_results_['alpha']` and will be removed in version 1.1 (renaming of 0.26).\")\n @property\n def cv_alphas_(self):\n return self.cv_results_['alphas'].tolist()\n" }, @@ -20276,7 +20342,7 @@ "sklearn.covariance._robust_covariance.MinCovDet.reweight_covariance" ], "is_public": true, - "description": "Minimum Covariance Determinant (MCD): robust estimator of covariance.\n\nThe Minimum Covariance Determinant covariance estimator is to be applied on Gaussian-distributed data, but could still be relevant on data drawn from a unimodal, symmetric distribution. It is not meant to be used with multi-modal data (the algorithm used to fit a MinCovDet object is likely to fail in such a case). One should consider projection pursuit methods to deal with multi-modal datasets. Read more in the :ref:`User Guide `.", + "description": "Minimum Covariance Determinant (MCD): robust estimator of covariance.\n\nThe Minimum Covariance Determinant covariance estimator is to be applied\non Gaussian-distributed data, but could still be relevant on data\ndrawn from a unimodal, symmetric distribution. 
It is not meant to be used\nwith multi-modal data (the algorithm used to fit a MinCovDet object is\nlikely to fail in such a case).\nOne should consider projection pursuit methods to deal with multi-modal\ndatasets.\n\nRead more in the :ref:`User Guide `.", "docstring": "Minimum Covariance Determinant (MCD): robust estimator of covariance.\n\n The Minimum Covariance Determinant covariance estimator is to be applied\n on Gaussian-distributed data, but could still be relevant on data\n drawn from a unimodal, symmetric distribution. It is not meant to be used\n with multi-modal data (the algorithm used to fit a MinCovDet object is\n likely to fail in such a case).\n One should consider projection pursuit methods to deal with multi-modal\n datasets.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n store_precision : bool, default=True\n Specify if the estimated precision is stored.\n\n assume_centered : bool, default=False\n If True, the support of the robust location and the covariance\n estimates is computed, and a covariance estimate is recomputed from\n it, without centering the data.\n Useful to work with data whose mean is significantly equal to\n zero but is not exactly zero.\n If False, the robust location and covariance are directly computed\n with the FastMCD algorithm without additional treatment.\n\n support_fraction : float, default=None\n The proportion of points to be included in the support of the raw\n MCD estimate. Default is None, which implies that the minimum\n value of support_fraction will be used within the algorithm:\n `(n_sample + n_features + 1) / 2`. The parameter must be in the range\n (0, 1).\n\n random_state : int, RandomState instance or None, default=None\n Determines the pseudo random number generator for shuffling the data.\n Pass an int for reproducible results across multiple function calls.\n See :term:`Glossary `.\n\n Attributes\n ----------\n raw_location_ : ndarray of shape (n_features,)\n The raw robust estimated location before correction and re-weighting.\n\n raw_covariance_ : ndarray of shape (n_features, n_features)\n The raw robust estimated covariance before correction and re-weighting.\n\n raw_support_ : ndarray of shape (n_samples,)\n A mask of the observations that have been used to compute\n the raw robust estimates of location and shape, before correction\n and re-weighting.\n\n location_ : ndarray of shape (n_features,)\n Estimated robust location.\n\n covariance_ : ndarray of shape (n_features, n_features)\n Estimated robust covariance matrix.\n\n precision_ : ndarray of shape (n_features, n_features)\n Estimated pseudo inverse matrix.\n (stored only if store_precision is True)\n\n support_ : ndarray of shape (n_samples,)\n A mask of the observations that have been used to compute\n the robust estimates of location and shape.\n\n dist_ : ndarray of shape (n_samples,)\n Mahalanobis distances of the training set (on which :meth:`fit` is\n called) observations.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. 
versionadded:: 1.0\n\n See Also\n --------\n EllipticEnvelope : An object for detecting outliers in\n a Gaussian distributed dataset.\n EmpiricalCovariance : Maximum likelihood covariance estimator.\n GraphicalLasso : Sparse inverse covariance estimation\n with an l1-penalized estimator.\n GraphicalLassoCV : Sparse inverse covariance with cross-validated\n choice of the l1 penalty.\n LedoitWolf : LedoitWolf Estimator.\n OAS : Oracle Approximating Shrinkage Estimator.\n ShrunkCovariance : Covariance estimator with shrinkage.\n\n References\n ----------\n\n .. [Rouseeuw1984] P. J. Rousseeuw. Least median of squares regression.\n J. Am Stat Ass, 79:871, 1984.\n .. [Rousseeuw] A Fast Algorithm for the Minimum Covariance Determinant\n Estimator, 1999, American Statistical Association and the American\n Society for Quality, TECHNOMETRICS\n .. [ButlerDavies] R. W. Butler, P. L. Davies and M. Jhun,\n Asymptotics For The Minimum Covariance Determinant Estimator,\n The Annals of Statistics, 1993, Vol. 21, No. 3, 1385-1400\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.covariance import MinCovDet\n >>> from sklearn.datasets import make_gaussian_quantiles\n >>> real_cov = np.array([[.8, .3],\n ... [.3, .4]])\n >>> rng = np.random.RandomState(0)\n >>> X = rng.multivariate_normal(mean=[0, 0],\n ... cov=real_cov,\n ... size=500)\n >>> cov = MinCovDet(random_state=0).fit(X)\n >>> cov.covariance_\n array([[0.7411..., 0.2535...],\n [0.2535..., 0.3053...]])\n >>> cov.location_\n array([0.0813... , 0.0427...])\n ", "source_code": "\n\nclass MinCovDet(EmpiricalCovariance):\n \"\"\"Minimum Covariance Determinant (MCD): robust estimator of covariance.\n\n The Minimum Covariance Determinant covariance estimator is to be applied\n on Gaussian-distributed data, but could still be relevant on data\n drawn from a unimodal, symmetric distribution. It is not meant to be used\n with multi-modal data (the algorithm used to fit a MinCovDet object is\n likely to fail in such a case).\n One should consider projection pursuit methods to deal with multi-modal\n datasets.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n store_precision : bool, default=True\n Specify if the estimated precision is stored.\n\n assume_centered : bool, default=False\n If True, the support of the robust location and the covariance\n estimates is computed, and a covariance estimate is recomputed from\n it, without centering the data.\n Useful to work with data whose mean is significantly equal to\n zero but is not exactly zero.\n If False, the robust location and covariance are directly computed\n with the FastMCD algorithm without additional treatment.\n\n support_fraction : float, default=None\n The proportion of points to be included in the support of the raw\n MCD estimate. Default is None, which implies that the minimum\n value of support_fraction will be used within the algorithm:\n `(n_sample + n_features + 1) / 2`. 
The parameter must be in the range\n (0, 1).\n\n random_state : int, RandomState instance or None, default=None\n Determines the pseudo random number generator for shuffling the data.\n Pass an int for reproducible results across multiple function calls.\n See :term:`Glossary `.\n\n Attributes\n ----------\n raw_location_ : ndarray of shape (n_features,)\n The raw robust estimated location before correction and re-weighting.\n\n raw_covariance_ : ndarray of shape (n_features, n_features)\n The raw robust estimated covariance before correction and re-weighting.\n\n raw_support_ : ndarray of shape (n_samples,)\n A mask of the observations that have been used to compute\n the raw robust estimates of location and shape, before correction\n and re-weighting.\n\n location_ : ndarray of shape (n_features,)\n Estimated robust location.\n\n covariance_ : ndarray of shape (n_features, n_features)\n Estimated robust covariance matrix.\n\n precision_ : ndarray of shape (n_features, n_features)\n Estimated pseudo inverse matrix.\n (stored only if store_precision is True)\n\n support_ : ndarray of shape (n_samples,)\n A mask of the observations that have been used to compute\n the robust estimates of location and shape.\n\n dist_ : ndarray of shape (n_samples,)\n Mahalanobis distances of the training set (on which :meth:`fit` is\n called) observations.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n EllipticEnvelope : An object for detecting outliers in\n a Gaussian distributed dataset.\n EmpiricalCovariance : Maximum likelihood covariance estimator.\n GraphicalLasso : Sparse inverse covariance estimation\n with an l1-penalized estimator.\n GraphicalLassoCV : Sparse inverse covariance with cross-validated\n choice of the l1 penalty.\n LedoitWolf : LedoitWolf Estimator.\n OAS : Oracle Approximating Shrinkage Estimator.\n ShrunkCovariance : Covariance estimator with shrinkage.\n\n References\n ----------\n\n .. [Rouseeuw1984] P. J. Rousseeuw. Least median of squares regression.\n J. Am Stat Ass, 79:871, 1984.\n .. [Rousseeuw] A Fast Algorithm for the Minimum Covariance Determinant\n Estimator, 1999, American Statistical Association and the American\n Society for Quality, TECHNOMETRICS\n .. [ButlerDavies] R. W. Butler, P. L. Davies and M. Jhun,\n Asymptotics For The Minimum Covariance Determinant Estimator,\n The Annals of Statistics, 1993, Vol. 21, No. 3, 1385-1400\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.covariance import MinCovDet\n >>> from sklearn.datasets import make_gaussian_quantiles\n >>> real_cov = np.array([[.8, .3],\n ... [.3, .4]])\n >>> rng = np.random.RandomState(0)\n >>> X = rng.multivariate_normal(mean=[0, 0],\n ... cov=real_cov,\n ... size=500)\n >>> cov = MinCovDet(random_state=0).fit(X)\n >>> cov.covariance_\n array([[0.7411..., 0.2535...],\n [0.2535..., 0.3053...]])\n >>> cov.location_\n array([0.0813... 
, 0.0427...])\n \"\"\"\n _nonrobust_covariance = staticmethod(empirical_covariance)\n \n def __init__(self, *, store_precision=True, assume_centered=False, support_fraction=None, random_state=None):\n self.store_precision = store_precision\n self.assume_centered = assume_centered\n self.support_fraction = support_fraction\n self.random_state = random_state\n \n def fit(self, X, y=None):\n \"\"\"Fit a Minimum Covariance Determinant with the FastMCD algorithm.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training data, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n Returns\n -------\n self : object\n Returns the instance itself.\n \"\"\"\n X = self._validate_data(X, ensure_min_samples=2, estimator='MinCovDet')\n random_state = check_random_state(self.random_state)\n (n_samples, n_features) = X.shape\n if (linalg.svdvals(np.dot(X.T, X)) > 1e-08).sum() != n_features:\n warnings.warn('The covariance matrix associated to your dataset is not full rank')\n (raw_location, raw_covariance, raw_support, raw_dist) = fast_mcd(X, support_fraction=self.support_fraction, cov_computation_method=self._nonrobust_covariance, random_state=random_state)\n if self.assume_centered:\n raw_location = np.zeros(n_features)\n raw_covariance = self._nonrobust_covariance(X[raw_support], assume_centered=True)\n precision = linalg.pinvh(raw_covariance)\n raw_dist = np.sum(np.dot(X, precision) * X, 1)\n self.raw_location_ = raw_location\n self.raw_covariance_ = raw_covariance\n self.raw_support_ = raw_support\n self.location_ = raw_location\n self.support_ = raw_support\n self.dist_ = raw_dist\n self.correct_covariance(X)\n self.reweight_covariance(X)\n return self\n \n def correct_covariance(self, data):\n \"\"\"Apply a correction to raw Minimum Covariance Determinant estimates.\n\n Correction using the empirical correction factor suggested\n by Rousseeuw and Van Driessen in [RVD]_.\n\n Parameters\n ----------\n data : array-like of shape (n_samples, n_features)\n The data matrix, with p features and n samples.\n The data set must be the one which was used to compute\n the raw estimates.\n\n Returns\n -------\n covariance_corrected : ndarray of shape (n_features, n_features)\n Corrected robust covariance estimate.\n\n References\n ----------\n\n .. 
[RVD] A Fast Algorithm for the Minimum Covariance\n Determinant Estimator, 1999, American Statistical Association\n and the American Society for Quality, TECHNOMETRICS\n \"\"\"\n n_samples = len(self.dist_)\n n_support = np.sum(self.support_)\n if n_support < n_samples and np.allclose(self.raw_covariance_, 0):\n raise ValueError('The covariance matrix of the support data is equal to 0, try to increase support_fraction')\n correction = np.median(self.dist_) / chi2(data.shape[1]).isf(0.5)\n covariance_corrected = self.raw_covariance_ * correction\n self.dist_ /= correction\n return covariance_corrected\n \n def reweight_covariance(self, data):\n \"\"\"Re-weight raw Minimum Covariance Determinant estimates.\n\n Re-weight observations using Rousseeuw's method (equivalent to\n deleting outlying observations from the data set before\n computing location and covariance estimates) described\n in [RVDriessen]_.\n\n Parameters\n ----------\n data : array-like of shape (n_samples, n_features)\n The data matrix, with p features and n samples.\n The data set must be the one which was used to compute\n the raw estimates.\n\n Returns\n -------\n location_reweighted : ndarray of shape (n_features,)\n Re-weighted robust location estimate.\n\n covariance_reweighted : ndarray of shape (n_features, n_features)\n Re-weighted robust covariance estimate.\n\n support_reweighted : ndarray of shape (n_samples,), dtype=bool\n A mask of the observations that have been used to compute\n the re-weighted robust location and covariance estimates.\n\n References\n ----------\n\n .. [RVDriessen] A Fast Algorithm for the Minimum Covariance\n Determinant Estimator, 1999, American Statistical Association\n and the American Society for Quality, TECHNOMETRICS\n \"\"\"\n (n_samples, n_features) = data.shape\n mask = self.dist_ < chi2(n_features).isf(0.025)\n if self.assume_centered:\n location_reweighted = np.zeros(n_features)\n else:\n location_reweighted = data[mask].mean(0)\n covariance_reweighted = self._nonrobust_covariance(data[mask], assume_centered=self.assume_centered)\n support_reweighted = np.zeros(n_samples, dtype=bool)\n support_reweighted[mask] = True\n self._set_covariance(covariance_reweighted)\n self.location_ = location_reweighted\n self.support_ = support_reweighted\n X_centered = data - self.location_\n self.dist_ = np.sum(np.dot(X_centered, self.get_precision()) * X_centered, 1)\n return location_reweighted, covariance_reweighted, support_reweighted\n" }, @@ -20290,7 +20356,7 @@ "sklearn.covariance._shrunk_covariance.LedoitWolf.fit" ], "is_public": true, - "description": "LedoitWolf Estimator.\n\nLedoit-Wolf is a particular form of shrinkage, where the shrinkage coefficient is computed using O. Ledoit and M. Wolf's formula as described in \"A Well-Conditioned Estimator for Large-Dimensional Covariance Matrices\", Ledoit and Wolf, Journal of Multivariate Analysis, Volume 88, Issue 2, February 2004, pages 365-411. Read more in the :ref:`User Guide `.", + "description": "LedoitWolf Estimator.\n\nLedoit-Wolf is a particular form of shrinkage, where the shrinkage\ncoefficient is computed using O. Ledoit and M. Wolf's formula as\ndescribed in \"A Well-Conditioned Estimator for Large-Dimensional\nCovariance Matrices\", Ledoit and Wolf, Journal of Multivariate\nAnalysis, Volume 88, Issue 2, February 2004, pages 365-411.\n\nRead more in the :ref:`User Guide `.", "docstring": "LedoitWolf Estimator.\n\n Ledoit-Wolf is a particular form of shrinkage, where the shrinkage\n coefficient is computed using O. Ledoit and M. 
Wolf's formula as\n described in \"A Well-Conditioned Estimator for Large-Dimensional\n Covariance Matrices\", Ledoit and Wolf, Journal of Multivariate\n Analysis, Volume 88, Issue 2, February 2004, pages 365-411.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n store_precision : bool, default=True\n Specify if the estimated precision is stored.\n\n assume_centered : bool, default=False\n If True, data will not be centered before computation.\n Useful when working with data whose mean is almost, but not exactly\n zero.\n If False (default), data will be centered before computation.\n\n block_size : int, default=1000\n Size of blocks into which the covariance matrix will be split\n during its Ledoit-Wolf estimation. This is purely a memory\n optimization and does not affect results.\n\n Attributes\n ----------\n covariance_ : ndarray of shape (n_features, n_features)\n Estimated covariance matrix.\n\n location_ : ndarray of shape (n_features,)\n Estimated location, i.e. the estimated mean.\n\n precision_ : ndarray of shape (n_features, n_features)\n Estimated pseudo inverse matrix.\n (stored only if store_precision is True)\n\n shrinkage_ : float\n Coefficient in the convex combination used for the computation\n of the shrunk estimate. Range is [0, 1].\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n EllipticEnvelope : An object for detecting outliers in\n a Gaussian distributed dataset.\n EmpiricalCovariance : Maximum likelihood covariance estimator.\n GraphicalLasso : Sparse inverse covariance estimation\n with an l1-penalized estimator.\n GraphicalLassoCV : Sparse inverse covariance with cross-validated\n choice of the l1 penalty.\n MinCovDet : Minimum Covariance Determinant\n (robust estimator of covariance).\n OAS : Oracle Approximating Shrinkage Estimator.\n ShrunkCovariance : Covariance estimator with shrinkage.\n\n Notes\n -----\n The regularised covariance is:\n\n (1 - shrinkage) * cov + shrinkage * mu * np.identity(n_features)\n\n where mu = trace(cov) / n_features\n and shrinkage is given by the Ledoit and Wolf formula (see References)\n\n References\n ----------\n \"A Well-Conditioned Estimator for Large-Dimensional Covariance Matrices\",\n Ledoit and Wolf, Journal of Multivariate Analysis, Volume 88, Issue 2,\n February 2004, pages 365-411.\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.covariance import LedoitWolf\n >>> real_cov = np.array([[.4, .2],\n ... [.2, .8]])\n >>> np.random.seed(0)\n >>> X = np.random.multivariate_normal(mean=[0, 0],\n ... cov=real_cov,\n ... size=50)\n >>> cov = LedoitWolf().fit(X)\n >>> cov.covariance_\n array([[0.4406..., 0.1616...],\n [0.1616..., 0.8022...]])\n >>> cov.location_\n array([ 0.0595... , -0.0075...])\n ", "source_code": "\n\nclass LedoitWolf(EmpiricalCovariance):\n \"\"\"LedoitWolf Estimator.\n\n Ledoit-Wolf is a particular form of shrinkage, where the shrinkage\n coefficient is computed using O. Ledoit and M. 
Wolf's formula as\n described in \"A Well-Conditioned Estimator for Large-Dimensional\n Covariance Matrices\", Ledoit and Wolf, Journal of Multivariate\n Analysis, Volume 88, Issue 2, February 2004, pages 365-411.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n store_precision : bool, default=True\n Specify if the estimated precision is stored.\n\n assume_centered : bool, default=False\n If True, data will not be centered before computation.\n Useful when working with data whose mean is almost, but not exactly\n zero.\n If False (default), data will be centered before computation.\n\n block_size : int, default=1000\n Size of blocks into which the covariance matrix will be split\n during its Ledoit-Wolf estimation. This is purely a memory\n optimization and does not affect results.\n\n Attributes\n ----------\n covariance_ : ndarray of shape (n_features, n_features)\n Estimated covariance matrix.\n\n location_ : ndarray of shape (n_features,)\n Estimated location, i.e. the estimated mean.\n\n precision_ : ndarray of shape (n_features, n_features)\n Estimated pseudo inverse matrix.\n (stored only if store_precision is True)\n\n shrinkage_ : float\n Coefficient in the convex combination used for the computation\n of the shrunk estimate. Range is [0, 1].\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n EllipticEnvelope : An object for detecting outliers in\n a Gaussian distributed dataset.\n EmpiricalCovariance : Maximum likelihood covariance estimator.\n GraphicalLasso : Sparse inverse covariance estimation\n with an l1-penalized estimator.\n GraphicalLassoCV : Sparse inverse covariance with cross-validated\n choice of the l1 penalty.\n MinCovDet : Minimum Covariance Determinant\n (robust estimator of covariance).\n OAS : Oracle Approximating Shrinkage Estimator.\n ShrunkCovariance : Covariance estimator with shrinkage.\n\n Notes\n -----\n The regularised covariance is:\n\n (1 - shrinkage) * cov + shrinkage * mu * np.identity(n_features)\n\n where mu = trace(cov) / n_features\n and shrinkage is given by the Ledoit and Wolf formula (see References)\n\n References\n ----------\n \"A Well-Conditioned Estimator for Large-Dimensional Covariance Matrices\",\n Ledoit and Wolf, Journal of Multivariate Analysis, Volume 88, Issue 2,\n February 2004, pages 365-411.\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.covariance import LedoitWolf\n >>> real_cov = np.array([[.4, .2],\n ... [.2, .8]])\n >>> np.random.seed(0)\n >>> X = np.random.multivariate_normal(mean=[0, 0],\n ... cov=real_cov,\n ... size=50)\n >>> cov = LedoitWolf().fit(X)\n >>> cov.covariance_\n array([[0.4406..., 0.1616...],\n [0.1616..., 0.8022...]])\n >>> cov.location_\n array([ 0.0595... 
, -0.0075...])\n \"\"\"\n \n def __init__(self, *, store_precision=True, assume_centered=False, block_size=1000):\n super().__init__(store_precision=store_precision, assume_centered=assume_centered)\n self.block_size = block_size\n \n def fit(self, X, y=None):\n \"\"\"Fit the Ledoit-Wolf shrunk covariance model to X.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training data, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n y : Ignored\n Not used, present for API consistency by convention.\n\n Returns\n -------\n self : object\n Returns the instance itself.\n \"\"\"\n X = self._validate_data(X)\n if self.assume_centered:\n self.location_ = np.zeros(X.shape[1])\n else:\n self.location_ = X.mean(0)\n with config_context(assume_finite=True):\n (covariance, shrinkage) = ledoit_wolf(X - self.location_, assume_centered=True, block_size=self.block_size)\n self.shrinkage_ = shrinkage\n self._set_covariance(covariance)\n return self\n" }, @@ -20301,7 +20367,7 @@ "superclasses": ["EmpiricalCovariance"], "methods": ["sklearn.covariance._shrunk_covariance.OAS.fit"], "is_public": true, - "description": "Oracle Approximating Shrinkage Estimator.\n\nRead more in the :ref:`User Guide `. OAS is a particular form of shrinkage described in \"Shrinkage Algorithms for MMSE Covariance Estimation\" Chen et al., IEEE Trans. on Sign. Proc., Volume 58, Issue 10, October 2010. The formula used here does not correspond to the one given in the article. In the original article, formula (23) states that 2/p is multiplied by Trace(cov*cov) in both the numerator and denominator, but this operation is omitted because for a large p, the value of 2/p is so small that it doesn't affect the value of the estimator.", + "description": "Oracle Approximating Shrinkage Estimator.\n\nRead more in the :ref:`User Guide `.\n\nOAS is a particular form of shrinkage described in\n\"Shrinkage Algorithms for MMSE Covariance Estimation\"\nChen et al., IEEE Trans. on Sign. Proc., Volume 58, Issue 10, October 2010.\n\nThe formula used here does not correspond to the one given in the\narticle. In the original article, formula (23) states that 2/p is\nmultiplied by Trace(cov*cov) in both the numerator and denominator, but\nthis operation is omitted because for a large p, the value of 2/p is\nso small that it doesn't affect the value of the estimator.", "docstring": "Oracle Approximating Shrinkage Estimator.\n\n Read more in the :ref:`User Guide `.\n\n OAS is a particular form of shrinkage described in\n \"Shrinkage Algorithms for MMSE Covariance Estimation\"\n Chen et al., IEEE Trans. on Sign. Proc., Volume 58, Issue 10, October 2010.\n\n The formula used here does not correspond to the one given in the\n article. 
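As a quick check on the shrinkage combination given in the Notes above (a sketch with arbitrary synthetic data; the reconstruction is expected, to numerical tolerance, to reproduce the fitted covariance):

import numpy as np
from sklearn.covariance import LedoitWolf, empirical_covariance

rng = np.random.RandomState(0)
real_cov = np.array([[0.4, 0.2],
                     [0.2, 0.8]])
X = rng.multivariate_normal(mean=[0, 0], cov=real_cov, size=50)

lw = LedoitWolf().fit(X)
emp = empirical_covariance(X)          # maximum-likelihood covariance (normalized by n_samples)
mu = np.trace(emp) / emp.shape[0]      # mean of the eigenvalues of emp
shrunk = (1 - lw.shrinkage_) * emp + lw.shrinkage_ * mu * np.eye(emp.shape[0])
print(np.allclose(lw.covariance_, shrunk))  # expected: True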
In the original article, formula (23) states that 2/p is\n multiplied by Trace(cov*cov) in both the numerator and denominator, but\n this operation is omitted because for a large p, the value of 2/p is\n so small that it doesn't affect the value of the estimator.\n\n Parameters\n ----------\n store_precision : bool, default=True\n Specify if the estimated precision is stored.\n\n assume_centered : bool, default=False\n If True, data will not be centered before computation.\n Useful when working with data whose mean is almost, but not exactly\n zero.\n If False (default), data will be centered before computation.\n\n Attributes\n ----------\n covariance_ : ndarray of shape (n_features, n_features)\n Estimated covariance matrix.\n\n location_ : ndarray of shape (n_features,)\n Estimated location, i.e. the estimated mean.\n\n precision_ : ndarray of shape (n_features, n_features)\n Estimated pseudo inverse matrix.\n (stored only if store_precision is True)\n\n shrinkage_ : float\n coefficient in the convex combination used for the computation\n of the shrunk estimate. Range is [0, 1].\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n EllipticEnvelope : An object for detecting outliers in\n a Gaussian distributed dataset.\n EmpiricalCovariance : Maximum likelihood covariance estimator.\n GraphicalLasso : Sparse inverse covariance estimation\n with an l1-penalized estimator.\n GraphicalLassoCV : Sparse inverse covariance with cross-validated\n choice of the l1 penalty.\n LedoitWolf : LedoitWolf Estimator.\n MinCovDet : Minimum Covariance Determinant\n (robust estimator of covariance).\n ShrunkCovariance : Covariance estimator with shrinkage.\n\n Notes\n -----\n The regularised covariance is:\n\n (1 - shrinkage) * cov + shrinkage * mu * np.identity(n_features)\n\n where mu = trace(cov) / n_features\n and shrinkage is given by the OAS formula (see References)\n\n References\n ----------\n \"Shrinkage Algorithms for MMSE Covariance Estimation\"\n Chen et al., IEEE Trans. on Sign. Proc., Volume 58, Issue 10, October 2010.\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.covariance import OAS\n >>> from sklearn.datasets import make_gaussian_quantiles\n >>> real_cov = np.array([[.8, .3],\n ... [.3, .4]])\n >>> rng = np.random.RandomState(0)\n >>> X = rng.multivariate_normal(mean=[0, 0],\n ... cov=real_cov,\n ... size=500)\n >>> oas = OAS().fit(X)\n >>> oas.covariance_\n array([[0.7533..., 0.2763...],\n [0.2763..., 0.3964...]])\n >>> oas.precision_\n array([[ 1.7833..., -1.2431... ],\n [-1.2431..., 3.3889...]])\n >>> oas.shrinkage_\n 0.0195...\n ", "source_code": "\n\nclass OAS(EmpiricalCovariance):\n \"\"\"Oracle Approximating Shrinkage Estimator.\n\n Read more in the :ref:`User Guide `.\n\n OAS is a particular form of shrinkage described in\n \"Shrinkage Algorithms for MMSE Covariance Estimation\"\n Chen et al., IEEE Trans. on Sign. Proc., Volume 58, Issue 10, October 2010.\n\n The formula used here does not correspond to the one given in the\n article. 
In the original article, formula (23) states that 2/p is\n multiplied by Trace(cov*cov) in both the numerator and denominator, but\n this operation is omitted because for a large p, the value of 2/p is\n so small that it doesn't affect the value of the estimator.\n\n Parameters\n ----------\n store_precision : bool, default=True\n Specify if the estimated precision is stored.\n\n assume_centered : bool, default=False\n If True, data will not be centered before computation.\n Useful when working with data whose mean is almost, but not exactly\n zero.\n If False (default), data will be centered before computation.\n\n Attributes\n ----------\n covariance_ : ndarray of shape (n_features, n_features)\n Estimated covariance matrix.\n\n location_ : ndarray of shape (n_features,)\n Estimated location, i.e. the estimated mean.\n\n precision_ : ndarray of shape (n_features, n_features)\n Estimated pseudo inverse matrix.\n (stored only if store_precision is True)\n\n shrinkage_ : float\n coefficient in the convex combination used for the computation\n of the shrunk estimate. Range is [0, 1].\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n EllipticEnvelope : An object for detecting outliers in\n a Gaussian distributed dataset.\n EmpiricalCovariance : Maximum likelihood covariance estimator.\n GraphicalLasso : Sparse inverse covariance estimation\n with an l1-penalized estimator.\n GraphicalLassoCV : Sparse inverse covariance with cross-validated\n choice of the l1 penalty.\n LedoitWolf : LedoitWolf Estimator.\n MinCovDet : Minimum Covariance Determinant\n (robust estimator of covariance).\n ShrunkCovariance : Covariance estimator with shrinkage.\n\n Notes\n -----\n The regularised covariance is:\n\n (1 - shrinkage) * cov + shrinkage * mu * np.identity(n_features)\n\n where mu = trace(cov) / n_features\n and shrinkage is given by the OAS formula (see References)\n\n References\n ----------\n \"Shrinkage Algorithms for MMSE Covariance Estimation\"\n Chen et al., IEEE Trans. on Sign. Proc., Volume 58, Issue 10, October 2010.\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.covariance import OAS\n >>> from sklearn.datasets import make_gaussian_quantiles\n >>> real_cov = np.array([[.8, .3],\n ... [.3, .4]])\n >>> rng = np.random.RandomState(0)\n >>> X = rng.multivariate_normal(mean=[0, 0],\n ... cov=real_cov,\n ... size=500)\n >>> oas = OAS().fit(X)\n >>> oas.covariance_\n array([[0.7533..., 0.2763...],\n [0.2763..., 0.3964...]])\n >>> oas.precision_\n array([[ 1.7833..., -1.2431... 
],\n [-1.2431..., 3.3889...]])\n >>> oas.shrinkage_\n 0.0195...\n \"\"\"\n \n def fit(self, X, y=None):\n \"\"\"Fit the Oracle Approximating Shrinkage covariance model to X.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training data, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n y : Ignored\n Not used, present for API consistency by convention.\n\n Returns\n -------\n self : object\n Returns the instance itself.\n \"\"\"\n X = self._validate_data(X)\n if self.assume_centered:\n self.location_ = np.zeros(X.shape[1])\n else:\n self.location_ = X.mean(0)\n (covariance, shrinkage) = oas(X - self.location_, assume_centered=True)\n self.shrinkage_ = shrinkage\n self._set_covariance(covariance)\n return self\n" }, @@ -20339,7 +20405,7 @@ "sklearn.cross_decomposition._pls.PLSCanonical.__init__" ], "is_public": true, - "description": "Partial Least Squares transformer and regressor.\n\nRead more in the :ref:`User Guide `. .. versionadded:: 0.8", + "description": "Partial Least Squares transformer and regressor.\n\nRead more in the :ref:`User Guide `.\n\n.. versionadded:: 0.8", "docstring": "Partial Least Squares transformer and regressor.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.8\n\n Parameters\n ----------\n n_components : int, default=2\n Number of components to keep. Should be in `[1, min(n_samples,\n n_features, n_targets)]`.\n\n scale : bool, default=True\n Whether to scale `X` and `Y`.\n\n algorithm : {'nipals', 'svd'}, default='nipals'\n The algorithm used to estimate the first singular vectors of the\n cross-covariance matrix. 'nipals' uses the power method while 'svd'\n will compute the whole SVD.\n\n max_iter : int, default=500\n The maximum number of iterations of the power method when\n `algorithm='nipals'`. Ignored otherwise.\n\n tol : float, default=1e-06\n The tolerance used as convergence criteria in the power method: the\n algorithm stops whenever the squared norm of `u_i - u_{i-1}` is less\n than `tol`, where `u` corresponds to the left singular vector.\n\n copy : bool, default=True\n Whether to copy `X` and `Y` in fit before applying centering, and\n potentially scaling. If False, these operations will be done inplace,\n modifying both arrays.\n\n Attributes\n ----------\n x_weights_ : ndarray of shape (n_features, n_components)\n The left singular vectors of the cross-covariance matrices of each\n iteration.\n\n y_weights_ : ndarray of shape (n_targets, n_components)\n The right singular vectors of the cross-covariance matrices of each\n iteration.\n\n x_loadings_ : ndarray of shape (n_features, n_components)\n The loadings of `X`.\n\n y_loadings_ : ndarray of shape (n_targets, n_components)\n The loadings of `Y`.\n\n x_scores_ : ndarray of shape (n_samples, n_components)\n The transformed training samples.\n\n .. deprecated:: 0.24\n `x_scores_` is deprecated in 0.24 and will be removed in 1.1\n (renaming of 0.26). You can just call `transform` on the training\n data instead.\n\n y_scores_ : ndarray of shape (n_samples, n_components)\n The transformed training targets.\n\n .. deprecated:: 0.24\n `y_scores_` is deprecated in 0.24 and will be removed in 1.1\n (renaming of 0.26). 
You can just call `transform` on the training\n data instead.\n\n x_rotations_ : ndarray of shape (n_features, n_components)\n The projection matrix used to transform `X`.\n\n y_rotations_ : ndarray of shape (n_features, n_components)\n The projection matrix used to transform `Y`.\n\n coef_ : ndarray of shape (n_features, n_targets)\n The coefficients of the linear model such that `Y` is approximated as\n `Y = X @ coef_`.\n\n n_iter_ : list of shape (n_components,)\n Number of iterations of the power method, for each\n component. Empty if `algorithm='svd'`.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n CCA : Canonical Correlation Analysis.\n PLSSVD : Partial Least Square SVD.\n\n Examples\n --------\n >>> from sklearn.cross_decomposition import PLSCanonical\n >>> X = [[0., 0., 1.], [1.,0.,0.], [2.,2.,2.], [2.,5.,4.]]\n >>> Y = [[0.1, -0.2], [0.9, 1.1], [6.2, 5.9], [11.9, 12.3]]\n >>> plsca = PLSCanonical(n_components=2)\n >>> plsca.fit(X, Y)\n PLSCanonical()\n >>> X_c, Y_c = plsca.transform(X, Y)\n ", "source_code": "\n\nclass PLSCanonical(_PLS):\n \"\"\"Partial Least Squares transformer and regressor.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.8\n\n Parameters\n ----------\n n_components : int, default=2\n Number of components to keep. Should be in `[1, min(n_samples,\n n_features, n_targets)]`.\n\n scale : bool, default=True\n Whether to scale `X` and `Y`.\n\n algorithm : {'nipals', 'svd'}, default='nipals'\n The algorithm used to estimate the first singular vectors of the\n cross-covariance matrix. 'nipals' uses the power method while 'svd'\n will compute the whole SVD.\n\n max_iter : int, default=500\n The maximum number of iterations of the power method when\n `algorithm='nipals'`. Ignored otherwise.\n\n tol : float, default=1e-06\n The tolerance used as convergence criteria in the power method: the\n algorithm stops whenever the squared norm of `u_i - u_{i-1}` is less\n than `tol`, where `u` corresponds to the left singular vector.\n\n copy : bool, default=True\n Whether to copy `X` and `Y` in fit before applying centering, and\n potentially scaling. If False, these operations will be done inplace,\n modifying both arrays.\n\n Attributes\n ----------\n x_weights_ : ndarray of shape (n_features, n_components)\n The left singular vectors of the cross-covariance matrices of each\n iteration.\n\n y_weights_ : ndarray of shape (n_targets, n_components)\n The right singular vectors of the cross-covariance matrices of each\n iteration.\n\n x_loadings_ : ndarray of shape (n_features, n_components)\n The loadings of `X`.\n\n y_loadings_ : ndarray of shape (n_targets, n_components)\n The loadings of `Y`.\n\n x_scores_ : ndarray of shape (n_samples, n_components)\n The transformed training samples.\n\n .. deprecated:: 0.24\n `x_scores_` is deprecated in 0.24 and will be removed in 1.1\n (renaming of 0.26). You can just call `transform` on the training\n data instead.\n\n y_scores_ : ndarray of shape (n_samples, n_components)\n The transformed training targets.\n\n .. deprecated:: 0.24\n `y_scores_` is deprecated in 0.24 and will be removed in 1.1\n (renaming of 0.26). 
You can just call `transform` on the training\n data instead.\n\n x_rotations_ : ndarray of shape (n_features, n_components)\n The projection matrix used to transform `X`.\n\n y_rotations_ : ndarray of shape (n_features, n_components)\n The projection matrix used to transform `Y`.\n\n coef_ : ndarray of shape (n_features, n_targets)\n The coefficients of the linear model such that `Y` is approximated as\n `Y = X @ coef_`.\n\n n_iter_ : list of shape (n_components,)\n Number of iterations of the power method, for each\n component. Empty if `algorithm='svd'`.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n CCA : Canonical Correlation Analysis.\n PLSSVD : Partial Least Square SVD.\n\n Examples\n --------\n >>> from sklearn.cross_decomposition import PLSCanonical\n >>> X = [[0., 0., 1.], [1.,0.,0.], [2.,2.,2.], [2.,5.,4.]]\n >>> Y = [[0.1, -0.2], [0.9, 1.1], [6.2, 5.9], [11.9, 12.3]]\n >>> plsca = PLSCanonical(n_components=2)\n >>> plsca.fit(X, Y)\n PLSCanonical()\n >>> X_c, Y_c = plsca.transform(X, Y)\n \"\"\"\n \n def __init__(self, n_components=2, *, scale=True, algorithm='nipals', max_iter=500, tol=1e-06, copy=True):\n super().__init__(n_components=n_components, scale=scale, deflation_mode='canonical', mode='A', algorithm=algorithm, max_iter=max_iter, tol=tol, copy=copy)\n" }, @@ -20352,7 +20418,7 @@ "sklearn.cross_decomposition._pls.PLSRegression.__init__" ], "is_public": true, - "description": "PLS regression.\n\nPLSRegression is also known as PLS2 or PLS1, depending on the number of targets. Read more in the :ref:`User Guide `. .. versionadded:: 0.8", + "description": "PLS regression.\n\nPLSRegression is also known as PLS2 or PLS1, depending on the number of\ntargets.\n\nRead more in the :ref:`User Guide `.\n\n.. versionadded:: 0.8", "docstring": "PLS regression.\n\n PLSRegression is also known as PLS2 or PLS1, depending on the number of\n targets.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.8\n\n Parameters\n ----------\n n_components : int, default=2\n Number of components to keep. Should be in `[1, min(n_samples,\n n_features, n_targets)]`.\n\n scale : bool, default=True\n Whether to scale `X` and `Y`.\n\n max_iter : int, default=500\n The maximum number of iterations of the power method when\n `algorithm='nipals'`. Ignored otherwise.\n\n tol : float, default=1e-06\n The tolerance used as convergence criteria in the power method: the\n algorithm stops whenever the squared norm of `u_i - u_{i-1}` is less\n than `tol`, where `u` corresponds to the left singular vector.\n\n copy : bool, default=True\n Whether to copy `X` and `Y` in :term:`fit` before applying centering,\n and potentially scaling. 
If `False`, these operations will be done\n inplace, modifying both arrays.\n\n Attributes\n ----------\n x_weights_ : ndarray of shape (n_features, n_components)\n The left singular vectors of the cross-covariance matrices of each\n iteration.\n\n y_weights_ : ndarray of shape (n_targets, n_components)\n The right singular vectors of the cross-covariance matrices of each\n iteration.\n\n x_loadings_ : ndarray of shape (n_features, n_components)\n The loadings of `X`.\n\n y_loadings_ : ndarray of shape (n_targets, n_components)\n The loadings of `Y`.\n\n x_scores_ : ndarray of shape (n_samples, n_components)\n The transformed training samples.\n\n y_scores_ : ndarray of shape (n_samples, n_components)\n The transformed training targets.\n\n x_rotations_ : ndarray of shape (n_features, n_components)\n The projection matrix used to transform `X`.\n\n y_rotations_ : ndarray of shape (n_features, n_components)\n The projection matrix used to transform `Y`.\n\n coef_ : ndarray of shape (n_features, n_targets)\n The coefficients of the linear model such that `Y` is approximated as\n `Y = X @ coef_`.\n\n n_iter_ : list of shape (n_components,)\n Number of iterations of the power method, for each\n component.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n PLSCanonical : Partial Least Squares transformer and regressor.\n\n Examples\n --------\n >>> from sklearn.cross_decomposition import PLSRegression\n >>> X = [[0., 0., 1.], [1.,0.,0.], [2.,2.,2.], [2.,5.,4.]]\n >>> Y = [[0.1, -0.2], [0.9, 1.1], [6.2, 5.9], [11.9, 12.3]]\n >>> pls2 = PLSRegression(n_components=2)\n >>> pls2.fit(X, Y)\n PLSRegression()\n >>> Y_pred = pls2.predict(X)\n ", "source_code": "\n\nclass PLSRegression(_PLS):\n \"\"\"PLS regression.\n\n PLSRegression is also known as PLS2 or PLS1, depending on the number of\n targets.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.8\n\n Parameters\n ----------\n n_components : int, default=2\n Number of components to keep. Should be in `[1, min(n_samples,\n n_features, n_targets)]`.\n\n scale : bool, default=True\n Whether to scale `X` and `Y`.\n\n max_iter : int, default=500\n The maximum number of iterations of the power method when\n `algorithm='nipals'`. Ignored otherwise.\n\n tol : float, default=1e-06\n The tolerance used as convergence criteria in the power method: the\n algorithm stops whenever the squared norm of `u_i - u_{i-1}` is less\n than `tol`, where `u` corresponds to the left singular vector.\n\n copy : bool, default=True\n Whether to copy `X` and `Y` in :term:`fit` before applying centering,\n and potentially scaling. 
If `False`, these operations will be done\n inplace, modifying both arrays.\n\n Attributes\n ----------\n x_weights_ : ndarray of shape (n_features, n_components)\n The left singular vectors of the cross-covariance matrices of each\n iteration.\n\n y_weights_ : ndarray of shape (n_targets, n_components)\n The right singular vectors of the cross-covariance matrices of each\n iteration.\n\n x_loadings_ : ndarray of shape (n_features, n_components)\n The loadings of `X`.\n\n y_loadings_ : ndarray of shape (n_targets, n_components)\n The loadings of `Y`.\n\n x_scores_ : ndarray of shape (n_samples, n_components)\n The transformed training samples.\n\n y_scores_ : ndarray of shape (n_samples, n_components)\n The transformed training targets.\n\n x_rotations_ : ndarray of shape (n_features, n_components)\n The projection matrix used to transform `X`.\n\n y_rotations_ : ndarray of shape (n_features, n_components)\n The projection matrix used to transform `Y`.\n\n coef_ : ndarray of shape (n_features, n_targets)\n The coefficients of the linear model such that `Y` is approximated as\n `Y = X @ coef_`.\n\n n_iter_ : list of shape (n_components,)\n Number of iterations of the power method, for each\n component.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n PLSCanonical : Partial Least Squares transformer and regressor.\n\n Examples\n --------\n >>> from sklearn.cross_decomposition import PLSRegression\n >>> X = [[0., 0., 1.], [1.,0.,0.], [2.,2.,2.], [2.,5.,4.]]\n >>> Y = [[0.1, -0.2], [0.9, 1.1], [6.2, 5.9], [11.9, 12.3]]\n >>> pls2 = PLSRegression(n_components=2)\n >>> pls2.fit(X, Y)\n PLSRegression()\n >>> Y_pred = pls2.predict(X)\n \"\"\"\n \n def __init__(self, n_components=2, *, scale=True, max_iter=500, tol=1e-06, copy=True):\n super().__init__(n_components=n_components, scale=scale, deflation_mode='regression', mode='A', algorithm='nipals', max_iter=max_iter, tol=tol, copy=copy)\n" }, @@ -20374,7 +20440,7 @@ "sklearn.cross_decomposition._pls.PLSSVD.fit_transform" ], "is_public": true, - "description": "Partial Least Square SVD.\n\nThis transformer simply performs a SVD on the cross-covariance matrix `X'Y`. It is able to project both the training data `X` and the targets `Y`. The training data `X` is projected on the left singular vectors, while the targets are projected on the right singular vectors. Read more in the :ref:`User Guide `. .. versionadded:: 0.8", + "description": "Partial Least Square SVD.\n\nThis transformer simply performs a SVD on the cross-covariance matrix\n`X'Y`. It is able to project both the training data `X` and the targets\n`Y`. The training data `X` is projected on the left singular vectors, while\nthe targets are projected on the right singular vectors.\n\nRead more in the :ref:`User Guide `.\n\n.. versionadded:: 0.8", "docstring": "Partial Least Square SVD.\n\n This transformer simply performs a SVD on the cross-covariance matrix\n `X'Y`. It is able to project both the training data `X` and the targets\n `Y`. The training data `X` is projected on the left singular vectors, while\n the targets are projected on the right singular vectors.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.8\n\n Parameters\n ----------\n n_components : int, default=2\n The number of components to keep. 
Should be in `[1,\n min(n_samples, n_features, n_targets)]`.\n\n scale : bool, default=True\n Whether to scale `X` and `Y`.\n\n copy : bool, default=True\n Whether to copy `X` and `Y` in fit before applying centering, and\n potentially scaling. If `False`, these operations will be done inplace,\n modifying both arrays.\n\n Attributes\n ----------\n x_weights_ : ndarray of shape (n_features, n_components)\n The left singular vectors of the SVD of the cross-covariance matrix.\n Used to project `X` in :meth:`transform`.\n\n y_weights_ : ndarray of (n_targets, n_components)\n The right singular vectors of the SVD of the cross-covariance matrix.\n Used to project `X` in :meth:`transform`.\n\n x_scores_ : ndarray of shape (n_samples, n_components)\n The transformed training samples.\n\n .. deprecated:: 0.24\n `x_scores_` is deprecated in 0.24 and will be removed in 1.1\n (renaming of 0.26). You can just call `transform` on the training\n data instead.\n\n y_scores_ : ndarray of shape (n_samples, n_components)\n The transformed training targets.\n\n .. deprecated:: 0.24\n `y_scores_` is deprecated in 0.24 and will be removed in 1.1\n (renaming of 0.26). You can just call `transform` on the training\n data instead.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n PLSCanonical : Partial Least Squares transformer and regressor.\n CCA : Canonical Correlation Analysis.\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.cross_decomposition import PLSSVD\n >>> X = np.array([[0., 0., 1.],\n ... [1., 0., 0.],\n ... [2., 2., 2.],\n ... [2., 5., 4.]])\n >>> Y = np.array([[0.1, -0.2],\n ... [0.9, 1.1],\n ... [6.2, 5.9],\n ... [11.9, 12.3]])\n >>> pls = PLSSVD(n_components=2).fit(X, Y)\n >>> X_c, Y_c = pls.transform(X, Y)\n >>> X_c.shape, Y_c.shape\n ((4, 2), (4, 2))\n ", "source_code": "\n\nclass PLSSVD(TransformerMixin, BaseEstimator):\n \"\"\"Partial Least Square SVD.\n\n This transformer simply performs a SVD on the cross-covariance matrix\n `X'Y`. It is able to project both the training data `X` and the targets\n `Y`. The training data `X` is projected on the left singular vectors, while\n the targets are projected on the right singular vectors.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.8\n\n Parameters\n ----------\n n_components : int, default=2\n The number of components to keep. Should be in `[1,\n min(n_samples, n_features, n_targets)]`.\n\n scale : bool, default=True\n Whether to scale `X` and `Y`.\n\n copy : bool, default=True\n Whether to copy `X` and `Y` in fit before applying centering, and\n potentially scaling. If `False`, these operations will be done inplace,\n modifying both arrays.\n\n Attributes\n ----------\n x_weights_ : ndarray of shape (n_features, n_components)\n The left singular vectors of the SVD of the cross-covariance matrix.\n Used to project `X` in :meth:`transform`.\n\n y_weights_ : ndarray of (n_targets, n_components)\n The right singular vectors of the SVD of the cross-covariance matrix.\n Used to project `X` in :meth:`transform`.\n\n x_scores_ : ndarray of shape (n_samples, n_components)\n The transformed training samples.\n\n .. deprecated:: 0.24\n `x_scores_` is deprecated in 0.24 and will be removed in 1.1\n (renaming of 0.26). 
You can just call `transform` on the training\n data instead.\n\n y_scores_ : ndarray of shape (n_samples, n_components)\n The transformed training targets.\n\n .. deprecated:: 0.24\n `y_scores_` is deprecated in 0.24 and will be removed in 1.1\n (renaming of 0.26). You can just call `transform` on the training\n data instead.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n PLSCanonical : Partial Least Squares transformer and regressor.\n CCA : Canonical Correlation Analysis.\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.cross_decomposition import PLSSVD\n >>> X = np.array([[0., 0., 1.],\n ... [1., 0., 0.],\n ... [2., 2., 2.],\n ... [2., 5., 4.]])\n >>> Y = np.array([[0.1, -0.2],\n ... [0.9, 1.1],\n ... [6.2, 5.9],\n ... [11.9, 12.3]])\n >>> pls = PLSSVD(n_components=2).fit(X, Y)\n >>> X_c, Y_c = pls.transform(X, Y)\n >>> X_c.shape, Y_c.shape\n ((4, 2), (4, 2))\n \"\"\"\n \n def __init__(self, n_components=2, *, scale=True, copy=True):\n self.n_components = n_components\n self.scale = scale\n self.copy = copy\n \n def fit(self, X, Y):\n \"\"\"Fit model to data.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training samples.\n\n Y : array-like of shape (n_samples,) or (n_samples, n_targets)\n Targets.\n\n Returns\n -------\n self : object\n Fitted estimator.\n \"\"\"\n check_consistent_length(X, Y)\n X = self._validate_data(X, dtype=np.float64, copy=self.copy, ensure_min_samples=2)\n Y = check_array(Y, dtype=np.float64, copy=self.copy, ensure_2d=False)\n if Y.ndim == 1:\n Y = Y.reshape(-1, 1)\n n_components = self.n_components\n rank_upper_bound = min(X.shape[0], X.shape[1], Y.shape[1])\n if not 1 <= n_components <= rank_upper_bound:\n warnings.warn(f'As of version 0.24, n_components({n_components}) should be in [1, min(n_features, n_samples, n_targets)] = [1, {rank_upper_bound}]. n_components={rank_upper_bound} will be used instead. In version 1.1 (renaming of 0.26), an error will be raised.', FutureWarning)\n n_components = rank_upper_bound\n (X, Y, self._x_mean, self._y_mean, self._x_std, self._y_std) = _center_scale_xy(X, Y, self.scale)\n C = np.dot(X.T, Y)\n (U, s, Vt) = svd(C, full_matrices=False)\n U = U[:, :n_components]\n Vt = Vt[:n_components]\n (U, Vt) = svd_flip(U, Vt)\n V = Vt.T\n self._x_scores = np.dot(X, U)\n self._y_scores = np.dot(Y, V)\n self.x_weights_ = U\n self.y_weights_ = V\n return self\n \n @deprecated('Attribute `x_scores_` was deprecated in version 0.24 and will be removed in 1.1 (renaming of 0.26). Use est.transform(X) on the training data instead.')\n @property\n def x_scores_(self):\n return self._x_scores\n \n @deprecated('Attribute `y_scores_` was deprecated in version 0.24 and will be removed in 1.1 (renaming of 0.26). 
Use est.transform(X, Y) on the training data instead.')\n @property\n def y_scores_(self):\n return self._y_scores\n \n @deprecated('Attribute `x_mean_` was deprecated in version 0.24 and will be removed in 1.1 (renaming of 0.26).')\n @property\n def x_mean_(self):\n return self._x_mean\n \n @deprecated('Attribute `y_mean_` was deprecated in version 0.24 and will be removed in 1.1 (renaming of 0.26).')\n @property\n def y_mean_(self):\n return self._y_mean\n \n @deprecated('Attribute `x_std_` was deprecated in version 0.24 and will be removed in 1.1 (renaming of 0.26).')\n @property\n def x_std_(self):\n return self._x_std\n \n @deprecated('Attribute `y_std_` was deprecated in version 0.24 and will be removed in 1.1 (renaming of 0.26).')\n @property\n def y_std_(self):\n return self._y_std\n \n def transform(self, X, Y=None):\n \"\"\"\n Apply the dimensionality reduction.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Samples to be transformed.\n\n Y : array-like of shape (n_samples,) or (n_samples, n_targets), default=None\n Targets.\n\n Returns\n -------\n x_scores : array-like or tuple of array-like\n The transformed data `X_tranformed` if `Y is not None`,\n `(X_transformed, Y_transformed)` otherwise.\n \"\"\"\n check_is_fitted(self)\n X = self._validate_data(X, dtype=np.float64, reset=False)\n Xr = (X - self._x_mean) / self._x_std\n x_scores = np.dot(Xr, self.x_weights_)\n if Y is not None:\n Y = check_array(Y, ensure_2d=False, dtype=np.float64)\n if Y.ndim == 1:\n Y = Y.reshape(-1, 1)\n Yr = (Y - self._y_mean) / self._y_std\n y_scores = np.dot(Yr, self.y_weights_)\n return x_scores, y_scores\n return x_scores\n \n def fit_transform(self, X, y=None):\n \"\"\"Learn and apply the dimensionality reduction.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training samples.\n\n y : array-like of shape (n_samples,) or (n_samples, n_targets), default=None\n Targets.\n\n Returns\n -------\n out : array-like or tuple of array-like\n The transformed data `X_tranformed` if `Y is not None`,\n `(X_transformed, Y_transformed)` otherwise.\n \"\"\"\n return self.fit(X, y).transform(X, y)\n" }, @@ -20405,7 +20471,7 @@ "sklearn.cross_decomposition._pls._PLS._more_tags" ], "is_public": false, - "description": "Partial Least Squares (PLS)\n\nThis class implements the generic PLS algorithm. 
Main ref: Wegelin, a survey of Partial Least Squares (PLS) methods, with emphasis on the two-block case https://www.stat.washington.edu/research/reports/2000/tr371.pdf", + "description": "Partial Least Squares (PLS)\n\nThis class implements the generic PLS algorithm.\n\nMain ref: Wegelin, a survey of Partial Least Squares (PLS) methods,\nwith emphasis on the two-block case\nhttps://www.stat.washington.edu/research/reports/2000/tr371.pdf", "docstring": "Partial Least Squares (PLS)\n\n This class implements the generic PLS algorithm.\n\n Main ref: Wegelin, a survey of Partial Least Squares (PLS) methods,\n with emphasis on the two-block case\n https://www.stat.washington.edu/research/reports/2000/tr371.pdf\n ", "source_code": "\n\nclass _PLS(TransformerMixin, RegressorMixin, MultiOutputMixin, BaseEstimator, metaclass=ABCMeta):\n \"\"\"Partial Least Squares (PLS)\n\n This class implements the generic PLS algorithm.\n\n Main ref: Wegelin, a survey of Partial Least Squares (PLS) methods,\n with emphasis on the two-block case\n https://www.stat.washington.edu/research/reports/2000/tr371.pdf\n \"\"\"\n \n @abstractmethod\n def __init__(self, n_components=2, *, scale=True, deflation_mode='regression', mode='A', algorithm='nipals', max_iter=500, tol=1e-06, copy=True):\n self.n_components = n_components\n self.deflation_mode = deflation_mode\n self.mode = mode\n self.scale = scale\n self.algorithm = algorithm\n self.max_iter = max_iter\n self.tol = tol\n self.copy = copy\n \n def fit(self, X, Y):\n \"\"\"Fit model to data.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training vectors, where `n_samples` is the number of samples and\n `n_features` is the number of predictors.\n\n Y : array-like of shape (n_samples,) or (n_samples, n_targets)\n Target vectors, where `n_samples` is the number of samples and\n `n_targets` is the number of response variables.\n\n Returns\n -------\n self : object\n Fitted model.\n \"\"\"\n check_consistent_length(X, Y)\n X = self._validate_data(X, dtype=np.float64, copy=self.copy, ensure_min_samples=2)\n Y = check_array(Y, dtype=np.float64, copy=self.copy, ensure_2d=False)\n if Y.ndim == 1:\n Y = Y.reshape(-1, 1)\n n = X.shape[0]\n p = X.shape[1]\n q = Y.shape[1]\n n_components = self.n_components\n if self.deflation_mode == 'regression':\n rank_upper_bound = p\n if not 1 <= n_components <= rank_upper_bound:\n warnings.warn(f'As of version 0.24, n_components({n_components}) should be in [1, n_features].n_components={rank_upper_bound} will be used instead. In version 1.1 (renaming of 0.26), an error will be raised.', FutureWarning)\n n_components = rank_upper_bound\n else:\n rank_upper_bound = min(n, p, q)\n if not 1 <= self.n_components <= rank_upper_bound:\n warnings.warn(f'As of version 0.24, n_components({n_components}) should be in [1, min(n_features, n_samples, n_targets)] = [1, {rank_upper_bound}]. n_components={rank_upper_bound} will be used instead. 
In version 1.1 (renaming of 0.26), an error will be raised.', FutureWarning)\n n_components = rank_upper_bound\n if self.algorithm not in ('svd', 'nipals'):\n raise ValueError(f\"algorithm should be 'svd' or 'nipals', got {self.algorithm}.\")\n self._norm_y_weights = self.deflation_mode == 'canonical'\n norm_y_weights = self._norm_y_weights\n (Xk, Yk, self._x_mean, self._y_mean, self._x_std, self._y_std) = _center_scale_xy(X, Y, self.scale)\n self.x_weights_ = np.zeros((p, n_components))\n self.y_weights_ = np.zeros((q, n_components))\n self._x_scores = np.zeros((n, n_components))\n self._y_scores = np.zeros((n, n_components))\n self.x_loadings_ = np.zeros((p, n_components))\n self.y_loadings_ = np.zeros((q, n_components))\n self.n_iter_ = []\n Y_eps = np.finfo(Yk.dtype).eps\n for k in range(n_components):\n if self.algorithm == 'nipals':\n Yk_mask = np.all(np.abs(Yk) < 10 * Y_eps, axis=0)\n Yk[:, Yk_mask] = 0.0\n try:\n (x_weights, y_weights, n_iter_) = _get_first_singular_vectors_power_method(Xk, Yk, mode=self.mode, max_iter=self.max_iter, tol=self.tol, norm_y_weights=norm_y_weights)\n except StopIteration as e:\n if str(e) != 'Y residual is constant':\n raise\n warnings.warn(f'Y residual is constant at iteration {k}')\n break\n self.n_iter_.append(n_iter_)\n elif self.algorithm == 'svd':\n (x_weights, y_weights) = _get_first_singular_vectors_svd(Xk, Yk)\n _svd_flip_1d(x_weights, y_weights)\n x_scores = np.dot(Xk, x_weights)\n if norm_y_weights:\n y_ss = 1\n else:\n y_ss = np.dot(y_weights, y_weights)\n y_scores = np.dot(Yk, y_weights) / y_ss\n x_loadings = np.dot(x_scores, Xk) / np.dot(x_scores, x_scores)\n Xk -= np.outer(x_scores, x_loadings)\n if self.deflation_mode == 'canonical':\n y_loadings = np.dot(y_scores, Yk) / np.dot(y_scores, y_scores)\n Yk -= np.outer(y_scores, y_loadings)\n if self.deflation_mode == 'regression':\n y_loadings = np.dot(x_scores, Yk) / np.dot(x_scores, x_scores)\n Yk -= np.outer(x_scores, y_loadings)\n self.x_weights_[:, k] = x_weights\n self.y_weights_[:, k] = y_weights\n self._x_scores[:, k] = x_scores\n self._y_scores[:, k] = y_scores\n self.x_loadings_[:, k] = x_loadings\n self.y_loadings_[:, k] = y_loadings\n self.x_rotations_ = np.dot(self.x_weights_, pinv2(np.dot(self.x_loadings_.T, self.x_weights_), check_finite=False))\n self.y_rotations_ = np.dot(self.y_weights_, pinv2(np.dot(self.y_loadings_.T, self.y_weights_), check_finite=False))\n self.coef_ = np.dot(self.x_rotations_, self.y_loadings_.T)\n self.coef_ = self.coef_ * self._y_std\n return self\n \n def transform(self, X, Y=None, copy=True):\n \"\"\"Apply the dimension reduction.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Samples to transform.\n\n Y : array-like of shape (n_samples, n_targets), default=None\n Target vectors.\n\n copy : bool, default=True\n Whether to copy `X` and `Y`, or perform in-place normalization.\n\n Returns\n -------\n x_scores, y_scores : array-like or tuple of array-like\n Return `x_scores` if `Y` is not given, `(x_scores, y_scores)` otherwise.\n \"\"\"\n check_is_fitted(self)\n X = self._validate_data(X, copy=copy, dtype=FLOAT_DTYPES, reset=False)\n X -= self._x_mean\n X /= self._x_std\n x_scores = np.dot(X, self.x_rotations_)\n if Y is not None:\n Y = check_array(Y, ensure_2d=False, copy=copy, dtype=FLOAT_DTYPES)\n if Y.ndim == 1:\n Y = Y.reshape(-1, 1)\n Y -= self._y_mean\n Y /= self._y_std\n y_scores = np.dot(Y, self.y_rotations_)\n return x_scores, y_scores\n return x_scores\n \n def inverse_transform(self, X):\n \"\"\"Transform 
data back to its original space.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_components)\n New data, where `n_samples` is the number of samples\n and `n_components` is the number of pls components.\n\n Returns\n -------\n self : ndarray of shape (n_samples, n_features)\n Return the reconstructed array.\n\n Notes\n -----\n This transformation will only be exact if `n_components=n_features`.\n \"\"\"\n check_is_fitted(self)\n X = check_array(X, dtype=FLOAT_DTYPES)\n X_reconstructed = np.matmul(X, self.x_loadings_.T)\n X_reconstructed *= self._x_std\n X_reconstructed += self._x_mean\n return X_reconstructed\n \n def predict(self, X, copy=True):\n \"\"\"Predict targets of given samples.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Samples.\n\n copy : bool, default=True\n Whether to copy `X` and `Y`, or perform in-place normalization.\n\n Returns\n -------\n y_pred : ndarray of shape (n_samples,) or (n_samples, n_targets)\n Returns predicted values.\n\n Notes\n -----\n This call requires the estimation of a matrix of shape\n `(n_features, n_targets)`, which may be an issue in high dimensional\n space.\n \"\"\"\n check_is_fitted(self)\n X = self._validate_data(X, copy=copy, dtype=FLOAT_DTYPES, reset=False)\n X -= self._x_mean\n X /= self._x_std\n Ypred = np.dot(X, self.coef_)\n return Ypred + self._y_mean\n \n def fit_transform(self, X, y=None):\n \"\"\"Learn and apply the dimension reduction on the train data.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training vectors, where `n_samples` is the number of samples and\n `n_features` is the number of predictors.\n\n y : array-like of shape (n_samples, n_targets), default=None\n Target vectors, where `n_samples` is the number of samples and\n `n_targets` is the number of response variables.\n\n Returns\n -------\n self : ndarray of shape (n_samples, n_components)\n Return `x_scores` if `Y` is not given, `(x_scores, y_scores)` otherwise.\n \"\"\"\n return self.fit(X, y).transform(X, y)\n \n @deprecated('Attribute `norm_y_weights` was deprecated in version 0.24 and will be removed in 1.1 (renaming of 0.26).')\n @property\n def norm_y_weights(self):\n return self._norm_y_weights\n \n @deprecated('Attribute `x_mean_` was deprecated in version 0.24 and will be removed in 1.1 (renaming of 0.26).')\n @property\n def x_mean_(self):\n return self._x_mean\n \n @deprecated('Attribute `y_mean_` was deprecated in version 0.24 and will be removed in 1.1 (renaming of 0.26).')\n @property\n def y_mean_(self):\n return self._y_mean\n \n @deprecated('Attribute `x_std_` was deprecated in version 0.24 and will be removed in 1.1 (renaming of 0.26).')\n @property\n def x_std_(self):\n return self._x_std\n \n @deprecated('Attribute `y_std_` was deprecated in version 0.24 and will be removed in 1.1 (renaming of 0.26).')\n @property\n def y_std_(self):\n return self._y_std\n \n @property\n def x_scores_(self):\n \"\"\"Attribute `x_scores_` was deprecated in version 0.24.\"\"\"\n if not isinstance(self, PLSRegression):\n pass\n warnings.warn('Attribute `x_scores_` was deprecated in version 0.24 and will be removed in 1.1 (renaming of 0.26). 
Use est.transform(X) on the training data instead.', FutureWarning)\n return self._x_scores\n \n @property\n def y_scores_(self):\n \"\"\"Attribute `y_scores_` was deprecated in version 0.24.\"\"\"\n if not isinstance(self, PLSRegression):\n warnings.warn('Attribute `y_scores_` was deprecated in version 0.24 and will be removed in 1.1 (renaming of 0.26). Use est.transform(X) on the training data instead.', FutureWarning)\n return self._y_scores\n \n def _more_tags(self):\n return {'poor_score': True, 'requires_y': False}\n" }, @@ -20433,7 +20499,7 @@ "sklearn.decomposition._base._BasePCA.inverse_transform" ], "is_public": false, - "description": "Base class for PCA methods.\n\nWarning: This class should not be used directly. Use derived classes instead.", + "description": "Base class for PCA methods.\n\nWarning: This class should not be used directly.\nUse derived classes instead.", "docstring": "Base class for PCA methods.\n\n Warning: This class should not be used directly.\n Use derived classes instead.\n ", "source_code": "\n\nclass _BasePCA(TransformerMixin, BaseEstimator, metaclass=ABCMeta):\n \"\"\"Base class for PCA methods.\n\n Warning: This class should not be used directly.\n Use derived classes instead.\n \"\"\"\n \n def get_covariance(self):\n \"\"\"Compute data covariance with the generative model.\n\n ``cov = components_.T * S**2 * components_ + sigma2 * eye(n_features)``\n where S**2 contains the explained variances, and sigma2 contains the\n noise variances.\n\n Returns\n -------\n cov : array of shape=(n_features, n_features)\n Estimated covariance of data.\n \"\"\"\n components_ = self.components_\n exp_var = self.explained_variance_\n if self.whiten:\n components_ = components_ * np.sqrt(exp_var[:, np.newaxis])\n exp_var_diff = np.maximum(exp_var - self.noise_variance_, 0.0)\n cov = np.dot(components_.T * exp_var_diff, components_)\n cov.flat[::len(cov) + 1] += self.noise_variance_\n return cov\n \n def get_precision(self):\n \"\"\"Compute data precision matrix with the generative model.\n\n Equals the inverse of the covariance but computed with\n the matrix inversion lemma for efficiency.\n\n Returns\n -------\n precision : array, shape=(n_features, n_features)\n Estimated precision of data.\n \"\"\"\n n_features = self.components_.shape[1]\n if self.n_components_ == 0:\n return np.eye(n_features) / self.noise_variance_\n if self.n_components_ == n_features:\n return linalg.inv(self.get_covariance())\n components_ = self.components_\n exp_var = self.explained_variance_\n if self.whiten:\n components_ = components_ * np.sqrt(exp_var[:, np.newaxis])\n exp_var_diff = np.maximum(exp_var - self.noise_variance_, 0.0)\n precision = np.dot(components_, components_.T) / self.noise_variance_\n precision.flat[::len(precision) + 1] += 1.0 / exp_var_diff\n precision = np.dot(components_.T, np.dot(linalg.inv(precision), components_))\n precision /= -self.noise_variance_**2\n precision.flat[::len(precision) + 1] += 1.0 / self.noise_variance_\n return precision\n \n @abstractmethod\n def fit(self, X, y=None):\n \"\"\"Placeholder for fit. 
Subclasses should implement this method!\n\n Fit the model with X.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training data, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\n Returns\n -------\n self : object\n Returns the instance itself.\n \"\"\"\n \n \n def transform(self, X):\n \"\"\"Apply dimensionality reduction to X.\n\n X is projected on the first principal components previously extracted\n from a training set.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n New data, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n Returns\n -------\n X_new : array-like of shape (n_samples, n_components)\n Projection of X in the first principal components, where `n_samples`\n is the number of samples and `n_components` is the number of the components.\n \"\"\"\n check_is_fitted(self)\n X = self._validate_data(X, dtype=[np.float64, np.float32], reset=False)\n if self.mean_ is not None:\n X = X - self.mean_\n X_transformed = np.dot(X, self.components_.T)\n if self.whiten:\n X_transformed /= np.sqrt(self.explained_variance_)\n return X_transformed\n \n def inverse_transform(self, X):\n \"\"\"Transform data back to its original space.\n\n In other words, return an input `X_original` whose transform would be X.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_components)\n New data, where `n_samples` is the number of samples\n and `n_components` is the number of components.\n\n Returns\n -------\n X_original array-like of shape (n_samples, n_features)\n Original data, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n Notes\n -----\n If whitening is enabled, inverse_transform will compute the\n exact inverse operation, which includes reversing whitening.\n \"\"\"\n if self.whiten:\n return np.dot(X, np.sqrt(self.explained_variance_[:, np.newaxis]) * self.components_) + self.mean_\n else:\n return np.dot(X, self.components_) + self.mean_\n" }, @@ -20447,9 +20513,9 @@ "sklearn.decomposition._dict_learning.DictionaryLearning.fit" ], "is_public": true, - "description": "Dictionary learning.\n\nFinds a dictionary (a set of atoms) that performs well at sparsely encoding the fitted data. Solves the optimization problem:: (U^*,V^*) = argmin 0.5 || X - U V ||_Fro^2 + alpha * || U ||_1,1 (U,V) with || V_k ||_2 = 1 for all 0 <= k < n_components ||.||_Fro stands for the Frobenius norm and ||.||_1,1 stands for the entry-wise matrix norm which is the sum of the absolute values of all the entries in the matrix. 
Read more in the :ref:`User Guide `.", - "docstring": "Dictionary learning.\n\n Finds a dictionary (a set of atoms) that performs well at sparsely\n encoding the fitted data.\n\n Solves the optimization problem::\n\n (U^*,V^*) = argmin 0.5 || X - U V ||_Fro^2 + alpha * || U ||_1,1\n (U,V)\n with || V_k ||_2 = 1 for all 0 <= k < n_components\n\n ||.||_Fro stands for the Frobenius norm and ||.||_1,1 stands for\n the entry-wise matrix norm which is the sum of the absolute values\n of all the entries in the matrix.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n n_components : int, default=n_features\n Number of dictionary elements to extract.\n\n alpha : float, default=1.0\n Sparsity controlling parameter.\n\n max_iter : int, default=1000\n Maximum number of iterations to perform.\n\n tol : float, default=1e-8\n Tolerance for numerical error.\n\n fit_algorithm : {'lars', 'cd'}, default='lars'\n * `'lars'`: uses the least angle regression method to solve the lasso\n problem (:func:`~sklearn.linear_model.lars_path`);\n * `'cd'`: uses the coordinate descent method to compute the\n Lasso solution (:class:`~sklearn.linear_model.Lasso`). Lars will be\n faster if the estimated components are sparse.\n\n .. versionadded:: 0.17\n *cd* coordinate descent method to improve speed.\n\n transform_algorithm : {'lasso_lars', 'lasso_cd', 'lars', 'omp', 'threshold'}, default='omp'\n Algorithm used to transform the data:\n\n - `'lars'`: uses the least angle regression method\n (:func:`~sklearn.linear_model.lars_path`);\n - `'lasso_lars'`: uses Lars to compute the Lasso solution.\n - `'lasso_cd'`: uses the coordinate descent method to compute the\n Lasso solution (:class:`~sklearn.linear_model.Lasso`). `'lasso_lars'`\n will be faster if the estimated components are sparse.\n - `'omp'`: uses orthogonal matching pursuit to estimate the sparse\n solution.\n - `'threshold'`: squashes to zero all coefficients less than alpha from\n the projection ``dictionary * X'``.\n\n .. versionadded:: 0.17\n *lasso_cd* coordinate descent method to improve speed.\n\n transform_n_nonzero_coefs : int, default=None\n Number of nonzero coefficients to target in each column of the\n solution. This is only used by `algorithm='lars'` and\n `algorithm='omp'`. If `None`, then\n `transform_n_nonzero_coefs=int(n_features / 10)`.\n\n transform_alpha : float, default=None\n If `algorithm='lasso_lars'` or `algorithm='lasso_cd'`, `alpha` is the\n penalty applied to the L1 norm.\n If `algorithm='threshold'`, `alpha` is the absolute value of the\n threshold below which coefficients will be squashed to zero.\n If `None`, defaults to `alpha`.\n\n n_jobs : int or None, default=None\n Number of parallel jobs to run.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n code_init : ndarray of shape (n_samples, n_components), default=None\n Initial value for the code, for warm restart. Only used if `code_init`\n and `dict_init` are not None.\n\n dict_init : ndarray of shape (n_components, n_features), default=None\n Initial values for the dictionary, for warm restart. Only used if\n `code_init` and `dict_init` are not None.\n\n verbose : bool, default=False\n To control the verbosity of the procedure.\n\n split_sign : bool, default=False\n Whether to split the sparse feature vector into the concatenation of\n its negative part and its positive part. 
This can improve the\n performance of downstream classifiers.\n\n random_state : int, RandomState instance or None, default=None\n Used for initializing the dictionary when ``dict_init`` is not\n specified, randomly shuffling the data when ``shuffle`` is set to\n ``True``, and updating the dictionary. Pass an int for reproducible\n results across multiple function calls.\n See :term:`Glossary `.\n\n positive_code : bool, default=False\n Whether to enforce positivity when finding the code.\n\n .. versionadded:: 0.20\n\n positive_dict : bool, default=False\n Whether to enforce positivity when finding the dictionary.\n\n .. versionadded:: 0.20\n\n transform_max_iter : int, default=1000\n Maximum number of iterations to perform if `algorithm='lasso_cd'` or\n `'lasso_lars'`.\n\n .. versionadded:: 0.22\n\n Attributes\n ----------\n components_ : ndarray of shape (n_components, n_features)\n dictionary atoms extracted from the data\n\n error_ : array\n vector of errors at each iteration\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n n_iter_ : int\n Number of iterations run.\n\n See Also\n --------\n MiniBatchDictionaryLearning: A faster, less accurate, version of the\n dictionary learning algorithm.\n MiniBatchSparsePCA : Mini-batch Sparse Principal Components Analysis.\n SparseCoder : Find a sparse representation of data from a fixed,\n precomputed dictionary.\n SparsePCA : Sparse Principal Components Analysis.\n\n References\n ----------\n\n J. Mairal, F. Bach, J. Ponce, G. Sapiro, 2009: Online dictionary learning\n for sparse coding (https://www.di.ens.fr/sierra/pdfs/icml09.pdf)\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.datasets import make_sparse_coded_signal\n >>> from sklearn.decomposition import DictionaryLearning\n >>> X, dictionary, code = make_sparse_coded_signal(\n ... n_samples=100, n_components=15, n_features=20, n_nonzero_coefs=10,\n ... random_state=42,\n ... )\n >>> dict_learner = DictionaryLearning(\n ... n_components=15, transform_algorithm='lasso_lars', random_state=42,\n ... 
)\n >>> X_transformed = dict_learner.fit_transform(X)\n\n We can check the level of sparsity of `X_transformed`:\n\n >>> np.mean(X_transformed == 0)\n 0.87...\n\n We can compare the average squared euclidean norm of the reconstruction\n error of the sparse coded signal relative to the squared euclidean norm of\n the original signal:\n\n >>> X_hat = X_transformed @ dict_learner.components_\n >>> np.mean(np.sum((X_hat - X) ** 2, axis=1) / np.sum(X ** 2, axis=1))\n 0.08...\n ", - "source_code": "\n\nclass DictionaryLearning(_BaseSparseCoding, BaseEstimator):\n \"\"\"Dictionary learning.\n\n Finds a dictionary (a set of atoms) that performs well at sparsely\n encoding the fitted data.\n\n Solves the optimization problem::\n\n (U^*,V^*) = argmin 0.5 || X - U V ||_Fro^2 + alpha * || U ||_1,1\n (U,V)\n with || V_k ||_2 = 1 for all 0 <= k < n_components\n\n ||.||_Fro stands for the Frobenius norm and ||.||_1,1 stands for\n the entry-wise matrix norm which is the sum of the absolute values\n of all the entries in the matrix.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n n_components : int, default=n_features\n Number of dictionary elements to extract.\n\n alpha : float, default=1.0\n Sparsity controlling parameter.\n\n max_iter : int, default=1000\n Maximum number of iterations to perform.\n\n tol : float, default=1e-8\n Tolerance for numerical error.\n\n fit_algorithm : {'lars', 'cd'}, default='lars'\n * `'lars'`: uses the least angle regression method to solve the lasso\n problem (:func:`~sklearn.linear_model.lars_path`);\n * `'cd'`: uses the coordinate descent method to compute the\n Lasso solution (:class:`~sklearn.linear_model.Lasso`). Lars will be\n faster if the estimated components are sparse.\n\n .. versionadded:: 0.17\n *cd* coordinate descent method to improve speed.\n\n transform_algorithm : {'lasso_lars', 'lasso_cd', 'lars', 'omp', 'threshold'}, default='omp'\n Algorithm used to transform the data:\n\n - `'lars'`: uses the least angle regression method\n (:func:`~sklearn.linear_model.lars_path`);\n - `'lasso_lars'`: uses Lars to compute the Lasso solution.\n - `'lasso_cd'`: uses the coordinate descent method to compute the\n Lasso solution (:class:`~sklearn.linear_model.Lasso`). `'lasso_lars'`\n will be faster if the estimated components are sparse.\n - `'omp'`: uses orthogonal matching pursuit to estimate the sparse\n solution.\n - `'threshold'`: squashes to zero all coefficients less than alpha from\n the projection ``dictionary * X'``.\n\n .. versionadded:: 0.17\n *lasso_cd* coordinate descent method to improve speed.\n\n transform_n_nonzero_coefs : int, default=None\n Number of nonzero coefficients to target in each column of the\n solution. This is only used by `algorithm='lars'` and\n `algorithm='omp'`. If `None`, then\n `transform_n_nonzero_coefs=int(n_features / 10)`.\n\n transform_alpha : float, default=None\n If `algorithm='lasso_lars'` or `algorithm='lasso_cd'`, `alpha` is the\n penalty applied to the L1 norm.\n If `algorithm='threshold'`, `alpha` is the absolute value of the\n threshold below which coefficients will be squashed to zero.\n If `None`, defaults to `alpha`.\n\n n_jobs : int or None, default=None\n Number of parallel jobs to run.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n code_init : ndarray of shape (n_samples, n_components), default=None\n Initial value for the code, for warm restart. 
Only used if `code_init`\n and `dict_init` are not None.\n\n dict_init : ndarray of shape (n_components, n_features), default=None\n Initial values for the dictionary, for warm restart. Only used if\n `code_init` and `dict_init` are not None.\n\n verbose : bool, default=False\n To control the verbosity of the procedure.\n\n split_sign : bool, default=False\n Whether to split the sparse feature vector into the concatenation of\n its negative part and its positive part. This can improve the\n performance of downstream classifiers.\n\n random_state : int, RandomState instance or None, default=None\n Used for initializing the dictionary when ``dict_init`` is not\n specified, randomly shuffling the data when ``shuffle`` is set to\n ``True``, and updating the dictionary. Pass an int for reproducible\n results across multiple function calls.\n See :term:`Glossary `.\n\n positive_code : bool, default=False\n Whether to enforce positivity when finding the code.\n\n .. versionadded:: 0.20\n\n positive_dict : bool, default=False\n Whether to enforce positivity when finding the dictionary.\n\n .. versionadded:: 0.20\n\n transform_max_iter : int, default=1000\n Maximum number of iterations to perform if `algorithm='lasso_cd'` or\n `'lasso_lars'`.\n\n .. versionadded:: 0.22\n\n Attributes\n ----------\n components_ : ndarray of shape (n_components, n_features)\n dictionary atoms extracted from the data\n\n error_ : array\n vector of errors at each iteration\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n n_iter_ : int\n Number of iterations run.\n\n See Also\n --------\n MiniBatchDictionaryLearning: A faster, less accurate, version of the\n dictionary learning algorithm.\n MiniBatchSparsePCA : Mini-batch Sparse Principal Components Analysis.\n SparseCoder : Find a sparse representation of data from a fixed,\n precomputed dictionary.\n SparsePCA : Sparse Principal Components Analysis.\n\n References\n ----------\n\n J. Mairal, F. Bach, J. Ponce, G. Sapiro, 2009: Online dictionary learning\n for sparse coding (https://www.di.ens.fr/sierra/pdfs/icml09.pdf)\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.datasets import make_sparse_coded_signal\n >>> from sklearn.decomposition import DictionaryLearning\n >>> X, dictionary, code = make_sparse_coded_signal(\n ... n_samples=100, n_components=15, n_features=20, n_nonzero_coefs=10,\n ... random_state=42,\n ... )\n >>> dict_learner = DictionaryLearning(\n ... n_components=15, transform_algorithm='lasso_lars', random_state=42,\n ... 
)\n >>> X_transformed = dict_learner.fit_transform(X)\n\n We can check the level of sparsity of `X_transformed`:\n\n >>> np.mean(X_transformed == 0)\n 0.87...\n\n We can compare the average squared euclidean norm of the reconstruction\n error of the sparse coded signal relative to the squared euclidean norm of\n the original signal:\n\n >>> X_hat = X_transformed @ dict_learner.components_\n >>> np.mean(np.sum((X_hat - X) ** 2, axis=1) / np.sum(X ** 2, axis=1))\n 0.08...\n \"\"\"\n \n def __init__(self, n_components=None, *, alpha=1, max_iter=1000, tol=1e-08, fit_algorithm='lars', transform_algorithm='omp', transform_n_nonzero_coefs=None, transform_alpha=None, n_jobs=None, code_init=None, dict_init=None, verbose=False, split_sign=False, random_state=None, positive_code=False, positive_dict=False, transform_max_iter=1000):\n super().__init__(transform_algorithm, transform_n_nonzero_coefs, transform_alpha, split_sign, n_jobs, positive_code, transform_max_iter)\n self.n_components = n_components\n self.alpha = alpha\n self.max_iter = max_iter\n self.tol = tol\n self.fit_algorithm = fit_algorithm\n self.code_init = code_init\n self.dict_init = dict_init\n self.verbose = verbose\n self.random_state = random_state\n self.positive_dict = positive_dict\n \n def fit(self, X, y=None):\n \"\"\"Fit the model from data in X.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training vector, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n Returns\n -------\n self : object\n Returns the instance itself.\n \"\"\"\n random_state = check_random_state(self.random_state)\n X = self._validate_data(X)\n if self.n_components is None:\n n_components = X.shape[1]\n else:\n n_components = self.n_components\n (V, U, E, self.n_iter_) = dict_learning(X, n_components, alpha=self.alpha, tol=self.tol, max_iter=self.max_iter, method=self.fit_algorithm, method_max_iter=self.transform_max_iter, n_jobs=self.n_jobs, code_init=self.code_init, dict_init=self.dict_init, verbose=self.verbose, random_state=random_state, return_n_iter=True, positive_dict=self.positive_dict, positive_code=self.positive_code)\n self.components_ = U\n self.error_ = E\n return self\n" + "description": "Dictionary learning.\n\nFinds a dictionary (a set of atoms) that performs well at sparsely\nencoding the fitted data.\n\nSolves the optimization problem::\n\n (U^*,V^*) = argmin 0.5 || X - U V ||_Fro^2 + alpha * || U ||_1,1\n (U,V)\n with || V_k ||_2 <= 1 for all 0 <= k < n_components\n\n||.||_Fro stands for the Frobenius norm and ||.||_1,1 stands for\nthe entry-wise matrix norm which is the sum of the absolute values\nof all the entries in the matrix.\n\nRead more in the :ref:`User Guide `.", + "docstring": "Dictionary learning.\n\n Finds a dictionary (a set of atoms) that performs well at sparsely\n encoding the fitted data.\n\n Solves the optimization problem::\n\n (U^*,V^*) = argmin 0.5 || X - U V ||_Fro^2 + alpha * || U ||_1,1\n (U,V)\n with || V_k ||_2 <= 1 for all 0 <= k < n_components\n\n ||.||_Fro stands for the Frobenius norm and ||.||_1,1 stands for\n the entry-wise matrix norm which is the sum of the absolute values\n of all the entries in the matrix.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n n_components : int, default=None\n Number of dictionary elements to extract. 
If None, then ``n_components``\n is set to ``n_features``.\n\n alpha : float, default=1.0\n Sparsity controlling parameter.\n\n max_iter : int, default=1000\n Maximum number of iterations to perform.\n\n tol : float, default=1e-8\n Tolerance for numerical error.\n\n fit_algorithm : {'lars', 'cd'}, default='lars'\n * `'lars'`: uses the least angle regression method to solve the lasso\n problem (:func:`~sklearn.linear_model.lars_path`);\n * `'cd'`: uses the coordinate descent method to compute the\n Lasso solution (:class:`~sklearn.linear_model.Lasso`). Lars will be\n faster if the estimated components are sparse.\n\n .. versionadded:: 0.17\n *cd* coordinate descent method to improve speed.\n\n transform_algorithm : {'lasso_lars', 'lasso_cd', 'lars', 'omp', 'threshold'}, default='omp'\n Algorithm used to transform the data:\n\n - `'lars'`: uses the least angle regression method\n (:func:`~sklearn.linear_model.lars_path`);\n - `'lasso_lars'`: uses Lars to compute the Lasso solution.\n - `'lasso_cd'`: uses the coordinate descent method to compute the\n Lasso solution (:class:`~sklearn.linear_model.Lasso`). `'lasso_lars'`\n will be faster if the estimated components are sparse.\n - `'omp'`: uses orthogonal matching pursuit to estimate the sparse\n solution.\n - `'threshold'`: squashes to zero all coefficients less than alpha from\n the projection ``dictionary * X'``.\n\n .. versionadded:: 0.17\n *lasso_cd* coordinate descent method to improve speed.\n\n transform_n_nonzero_coefs : int, default=None\n Number of nonzero coefficients to target in each column of the\n solution. This is only used by `algorithm='lars'` and\n `algorithm='omp'`. If `None`, then\n `transform_n_nonzero_coefs=int(n_features / 10)`.\n\n transform_alpha : float, default=None\n If `algorithm='lasso_lars'` or `algorithm='lasso_cd'`, `alpha` is the\n penalty applied to the L1 norm.\n If `algorithm='threshold'`, `alpha` is the absolute value of the\n threshold below which coefficients will be squashed to zero.\n If `None`, defaults to `alpha`.\n\n n_jobs : int or None, default=None\n Number of parallel jobs to run.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n code_init : ndarray of shape (n_samples, n_components), default=None\n Initial value for the code, for warm restart. Only used if `code_init`\n and `dict_init` are not None.\n\n dict_init : ndarray of shape (n_components, n_features), default=None\n Initial values for the dictionary, for warm restart. Only used if\n `code_init` and `dict_init` are not None.\n\n verbose : bool, default=False\n To control the verbosity of the procedure.\n\n split_sign : bool, default=False\n Whether to split the sparse feature vector into the concatenation of\n its negative part and its positive part. This can improve the\n performance of downstream classifiers.\n\n random_state : int, RandomState instance or None, default=None\n Used for initializing the dictionary when ``dict_init`` is not\n specified, randomly shuffling the data when ``shuffle`` is set to\n ``True``, and updating the dictionary. Pass an int for reproducible\n results across multiple function calls.\n See :term:`Glossary `.\n\n positive_code : bool, default=False\n Whether to enforce positivity when finding the code.\n\n .. versionadded:: 0.20\n\n positive_dict : bool, default=False\n Whether to enforce positivity when finding the dictionary.\n\n .. 
versionadded:: 0.20\n\n transform_max_iter : int, default=1000\n Maximum number of iterations to perform if `algorithm='lasso_cd'` or\n `'lasso_lars'`.\n\n .. versionadded:: 0.22\n\n Attributes\n ----------\n components_ : ndarray of shape (n_components, n_features)\n dictionary atoms extracted from the data\n\n error_ : array\n vector of errors at each iteration\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n n_iter_ : int\n Number of iterations run.\n\n See Also\n --------\n MiniBatchDictionaryLearning: A faster, less accurate, version of the\n dictionary learning algorithm.\n MiniBatchSparsePCA : Mini-batch Sparse Principal Components Analysis.\n SparseCoder : Find a sparse representation of data from a fixed,\n precomputed dictionary.\n SparsePCA : Sparse Principal Components Analysis.\n\n References\n ----------\n\n J. Mairal, F. Bach, J. Ponce, G. Sapiro, 2009: Online dictionary learning\n for sparse coding (https://www.di.ens.fr/sierra/pdfs/icml09.pdf)\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.datasets import make_sparse_coded_signal\n >>> from sklearn.decomposition import DictionaryLearning\n >>> X, dictionary, code = make_sparse_coded_signal(\n ... n_samples=100, n_components=15, n_features=20, n_nonzero_coefs=10,\n ... random_state=42,\n ... )\n >>> dict_learner = DictionaryLearning(\n ... n_components=15, transform_algorithm='lasso_lars', random_state=42,\n ... )\n >>> X_transformed = dict_learner.fit_transform(X)\n\n We can check the level of sparsity of `X_transformed`:\n\n >>> np.mean(X_transformed == 0)\n 0.87...\n\n We can compare the average squared euclidean norm of the reconstruction\n error of the sparse coded signal relative to the squared euclidean norm of\n the original signal:\n\n >>> X_hat = X_transformed @ dict_learner.components_\n >>> np.mean(np.sum((X_hat - X) ** 2, axis=1) / np.sum(X ** 2, axis=1))\n 0.08...\n ", + "source_code": "\n\nclass DictionaryLearning(_BaseSparseCoding, BaseEstimator):\n \"\"\"Dictionary learning.\n\n Finds a dictionary (a set of atoms) that performs well at sparsely\n encoding the fitted data.\n\n Solves the optimization problem::\n\n (U^*,V^*) = argmin 0.5 || X - U V ||_Fro^2 + alpha * || U ||_1,1\n (U,V)\n with || V_k ||_2 <= 1 for all 0 <= k < n_components\n\n ||.||_Fro stands for the Frobenius norm and ||.||_1,1 stands for\n the entry-wise matrix norm which is the sum of the absolute values\n of all the entries in the matrix.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n n_components : int, default=None\n Number of dictionary elements to extract. If None, then ``n_components``\n is set to ``n_features``.\n\n alpha : float, default=1.0\n Sparsity controlling parameter.\n\n max_iter : int, default=1000\n Maximum number of iterations to perform.\n\n tol : float, default=1e-8\n Tolerance for numerical error.\n\n fit_algorithm : {'lars', 'cd'}, default='lars'\n * `'lars'`: uses the least angle regression method to solve the lasso\n problem (:func:`~sklearn.linear_model.lars_path`);\n * `'cd'`: uses the coordinate descent method to compute the\n Lasso solution (:class:`~sklearn.linear_model.Lasso`). Lars will be\n faster if the estimated components are sparse.\n\n .. 
versionadded:: 0.17\n *cd* coordinate descent method to improve speed.\n\n transform_algorithm : {'lasso_lars', 'lasso_cd', 'lars', 'omp', 'threshold'}, default='omp'\n Algorithm used to transform the data:\n\n - `'lars'`: uses the least angle regression method\n (:func:`~sklearn.linear_model.lars_path`);\n - `'lasso_lars'`: uses Lars to compute the Lasso solution.\n - `'lasso_cd'`: uses the coordinate descent method to compute the\n Lasso solution (:class:`~sklearn.linear_model.Lasso`). `'lasso_lars'`\n will be faster if the estimated components are sparse.\n - `'omp'`: uses orthogonal matching pursuit to estimate the sparse\n solution.\n - `'threshold'`: squashes to zero all coefficients less than alpha from\n the projection ``dictionary * X'``.\n\n .. versionadded:: 0.17\n *lasso_cd* coordinate descent method to improve speed.\n\n transform_n_nonzero_coefs : int, default=None\n Number of nonzero coefficients to target in each column of the\n solution. This is only used by `algorithm='lars'` and\n `algorithm='omp'`. If `None`, then\n `transform_n_nonzero_coefs=int(n_features / 10)`.\n\n transform_alpha : float, default=None\n If `algorithm='lasso_lars'` or `algorithm='lasso_cd'`, `alpha` is the\n penalty applied to the L1 norm.\n If `algorithm='threshold'`, `alpha` is the absolute value of the\n threshold below which coefficients will be squashed to zero.\n If `None`, defaults to `alpha`.\n\n n_jobs : int or None, default=None\n Number of parallel jobs to run.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n code_init : ndarray of shape (n_samples, n_components), default=None\n Initial value for the code, for warm restart. Only used if `code_init`\n and `dict_init` are not None.\n\n dict_init : ndarray of shape (n_components, n_features), default=None\n Initial values for the dictionary, for warm restart. Only used if\n `code_init` and `dict_init` are not None.\n\n verbose : bool, default=False\n To control the verbosity of the procedure.\n\n split_sign : bool, default=False\n Whether to split the sparse feature vector into the concatenation of\n its negative part and its positive part. This can improve the\n performance of downstream classifiers.\n\n random_state : int, RandomState instance or None, default=None\n Used for initializing the dictionary when ``dict_init`` is not\n specified, randomly shuffling the data when ``shuffle`` is set to\n ``True``, and updating the dictionary. Pass an int for reproducible\n results across multiple function calls.\n See :term:`Glossary `.\n\n positive_code : bool, default=False\n Whether to enforce positivity when finding the code.\n\n .. versionadded:: 0.20\n\n positive_dict : bool, default=False\n Whether to enforce positivity when finding the dictionary.\n\n .. versionadded:: 0.20\n\n transform_max_iter : int, default=1000\n Maximum number of iterations to perform if `algorithm='lasso_cd'` or\n `'lasso_lars'`.\n\n .. versionadded:: 0.22\n\n Attributes\n ----------\n components_ : ndarray of shape (n_components, n_features)\n dictionary atoms extracted from the data\n\n error_ : array\n vector of errors at each iteration\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. 
versionadded:: 1.0\n\n n_iter_ : int\n Number of iterations run.\n\n See Also\n --------\n MiniBatchDictionaryLearning: A faster, less accurate, version of the\n dictionary learning algorithm.\n MiniBatchSparsePCA : Mini-batch Sparse Principal Components Analysis.\n SparseCoder : Find a sparse representation of data from a fixed,\n precomputed dictionary.\n SparsePCA : Sparse Principal Components Analysis.\n\n References\n ----------\n\n J. Mairal, F. Bach, J. Ponce, G. Sapiro, 2009: Online dictionary learning\n for sparse coding (https://www.di.ens.fr/sierra/pdfs/icml09.pdf)\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.datasets import make_sparse_coded_signal\n >>> from sklearn.decomposition import DictionaryLearning\n >>> X, dictionary, code = make_sparse_coded_signal(\n ... n_samples=100, n_components=15, n_features=20, n_nonzero_coefs=10,\n ... random_state=42,\n ... )\n >>> dict_learner = DictionaryLearning(\n ... n_components=15, transform_algorithm='lasso_lars', random_state=42,\n ... )\n >>> X_transformed = dict_learner.fit_transform(X)\n\n We can check the level of sparsity of `X_transformed`:\n\n >>> np.mean(X_transformed == 0)\n 0.87...\n\n We can compare the average squared euclidean norm of the reconstruction\n error of the sparse coded signal relative to the squared euclidean norm of\n the original signal:\n\n >>> X_hat = X_transformed @ dict_learner.components_\n >>> np.mean(np.sum((X_hat - X) ** 2, axis=1) / np.sum(X ** 2, axis=1))\n 0.08...\n \"\"\"\n \n def __init__(self, n_components=None, *, alpha=1, max_iter=1000, tol=1e-08, fit_algorithm='lars', transform_algorithm='omp', transform_n_nonzero_coefs=None, transform_alpha=None, n_jobs=None, code_init=None, dict_init=None, verbose=False, split_sign=False, random_state=None, positive_code=False, positive_dict=False, transform_max_iter=1000):\n super().__init__(transform_algorithm, transform_n_nonzero_coefs, transform_alpha, split_sign, n_jobs, positive_code, transform_max_iter)\n self.n_components = n_components\n self.alpha = alpha\n self.max_iter = max_iter\n self.tol = tol\n self.fit_algorithm = fit_algorithm\n self.code_init = code_init\n self.dict_init = dict_init\n self.verbose = verbose\n self.random_state = random_state\n self.positive_dict = positive_dict\n \n def fit(self, X, y=None):\n \"\"\"Fit the model from data in X.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training vector, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n Returns\n -------\n self : object\n Returns the instance itself.\n \"\"\"\n random_state = check_random_state(self.random_state)\n X = self._validate_data(X)\n if self.n_components is None:\n n_components = X.shape[1]\n else:\n n_components = self.n_components\n (V, U, E, self.n_iter_) = dict_learning(X, n_components, alpha=self.alpha, tol=self.tol, max_iter=self.max_iter, method=self.fit_algorithm, method_max_iter=self.transform_max_iter, n_jobs=self.n_jobs, code_init=self.code_init, dict_init=self.dict_init, verbose=self.verbose, random_state=random_state, return_n_iter=True, positive_dict=self.positive_dict, positive_code=self.positive_code)\n self.components_ = U\n self.error_ = E\n return self\n" }, { "name": "MiniBatchDictionaryLearning", @@ -20462,9 +20528,9 @@ "sklearn.decomposition._dict_learning.MiniBatchDictionaryLearning.partial_fit" ], "is_public": true, - "description": "Mini-batch dictionary 
learning.\n\nFinds a dictionary (a set of atoms) that performs well at sparsely encoding the fitted data. Solves the optimization problem:: (U^*,V^*) = argmin 0.5 || X - U V ||_Fro^2 + alpha * || U ||_1,1 (U,V) with || V_k ||_2 = 1 for all 0 <= k < n_components ||.||_Fro stands for the Frobenius norm and ||.||_1,1 stands for the entry-wise matrix norm which is the sum of the absolute values of all the entries in the matrix. Read more in the :ref:`User Guide `.", - "docstring": "Mini-batch dictionary learning.\n\n Finds a dictionary (a set of atoms) that performs well at sparsely\n encoding the fitted data.\n\n Solves the optimization problem::\n\n (U^*,V^*) = argmin 0.5 || X - U V ||_Fro^2 + alpha * || U ||_1,1\n (U,V)\n with || V_k ||_2 = 1 for all 0 <= k < n_components\n\n ||.||_Fro stands for the Frobenius norm and ||.||_1,1 stands for\n the entry-wise matrix norm which is the sum of the absolute values\n of all the entries in the matrix.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n n_components : int, default=None\n Number of dictionary elements to extract.\n\n alpha : float, default=1\n Sparsity controlling parameter.\n\n n_iter : int, default=1000\n Total number of iterations to perform.\n\n fit_algorithm : {'lars', 'cd'}, default='lars'\n The algorithm used:\n\n - `'lars'`: uses the least angle regression method to solve the lasso\n problem (`linear_model.lars_path`)\n - `'cd'`: uses the coordinate descent method to compute the\n Lasso solution (`linear_model.Lasso`). Lars will be faster if\n the estimated components are sparse.\n\n n_jobs : int, default=None\n Number of parallel jobs to run.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n batch_size : int, default=3\n Number of samples in each mini-batch.\n\n shuffle : bool, default=True\n Whether to shuffle the samples before forming batches.\n\n dict_init : ndarray of shape (n_components, n_features), default=None\n Initial value of the dictionary for warm restart scenarios.\n\n transform_algorithm : {'lasso_lars', 'lasso_cd', 'lars', 'omp', 'threshold'}, default='omp'\n Algorithm used to transform the data:\n\n - `'lars'`: uses the least angle regression method\n (`linear_model.lars_path`);\n - `'lasso_lars'`: uses Lars to compute the Lasso solution.\n - `'lasso_cd'`: uses the coordinate descent method to compute the\n Lasso solution (`linear_model.Lasso`). `'lasso_lars'` will be faster\n if the estimated components are sparse.\n - `'omp'`: uses orthogonal matching pursuit to estimate the sparse\n solution.\n - `'threshold'`: squashes to zero all coefficients less than alpha from\n the projection ``dictionary * X'``.\n\n transform_n_nonzero_coefs : int, default=None\n Number of nonzero coefficients to target in each column of the\n solution. This is only used by `algorithm='lars'` and\n `algorithm='omp'`. 
If `None`, then\n `transform_n_nonzero_coefs=int(n_features / 10)`.\n\n transform_alpha : float, default=None\n If `algorithm='lasso_lars'` or `algorithm='lasso_cd'`, `alpha` is the\n penalty applied to the L1 norm.\n If `algorithm='threshold'`, `alpha` is the absolute value of the\n threshold below which coefficients will be squashed to zero.\n If `None`, defaults to `alpha`.\n\n verbose : bool, default=False\n To control the verbosity of the procedure.\n\n split_sign : bool, default=False\n Whether to split the sparse feature vector into the concatenation of\n its negative part and its positive part. This can improve the\n performance of downstream classifiers.\n\n random_state : int, RandomState instance or None, default=None\n Used for initializing the dictionary when ``dict_init`` is not\n specified, randomly shuffling the data when ``shuffle`` is set to\n ``True``, and updating the dictionary. Pass an int for reproducible\n results across multiple function calls.\n See :term:`Glossary `.\n\n positive_code : bool, default=False\n Whether to enforce positivity when finding the code.\n\n .. versionadded:: 0.20\n\n positive_dict : bool, default=False\n Whether to enforce positivity when finding the dictionary.\n\n .. versionadded:: 0.20\n\n transform_max_iter : int, default=1000\n Maximum number of iterations to perform if `algorithm='lasso_cd'` or\n `'lasso_lars'`.\n\n .. versionadded:: 0.22\n\n Attributes\n ----------\n components_ : ndarray of shape (n_components, n_features)\n Components extracted from the data.\n\n inner_stats_ : tuple of (A, B) ndarrays\n Internal sufficient statistics that are kept by the algorithm.\n Keeping them is useful in online settings, to avoid losing the\n history of the evolution, but they shouldn't have any use for the\n end user.\n `A` `(n_components, n_components)` is the dictionary covariance matrix.\n `B` `(n_features, n_components)` is the data approximation matrix.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n n_iter_ : int\n Number of iterations run.\n\n iter_offset_ : int\n The number of iteration on data batches that has been\n performed before.\n\n random_state_ : RandomState instance\n RandomState instance that is generated either from a seed, the random\n number generattor or by `np.random`.\n\n See Also\n --------\n DictionaryLearning : Find a dictionary that sparsely encodes data.\n MiniBatchSparsePCA : Mini-batch Sparse Principal Components Analysis.\n SparseCoder : Find a sparse representation of data from a fixed,\n precomputed dictionary.\n SparsePCA : Sparse Principal Components Analysis.\n\n References\n ----------\n\n J. Mairal, F. Bach, J. Ponce, G. Sapiro, 2009: Online dictionary learning\n for sparse coding (https://www.di.ens.fr/sierra/pdfs/icml09.pdf)\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.datasets import make_sparse_coded_signal\n >>> from sklearn.decomposition import MiniBatchDictionaryLearning\n >>> X, dictionary, code = make_sparse_coded_signal(\n ... n_samples=100, n_components=15, n_features=20, n_nonzero_coefs=10,\n ... random_state=42)\n >>> dict_learner = MiniBatchDictionaryLearning(\n ... n_components=15, transform_algorithm='lasso_lars', random_state=42,\n ... 
)\n >>> X_transformed = dict_learner.fit_transform(X)\n\n We can check the level of sparsity of `X_transformed`:\n\n >>> np.mean(X_transformed == 0)\n 0.86...\n\n We can compare the average squared euclidean norm of the reconstruction\n error of the sparse coded signal relative to the squared euclidean norm of\n the original signal:\n\n >>> X_hat = X_transformed @ dict_learner.components_\n >>> np.mean(np.sum((X_hat - X) ** 2, axis=1) / np.sum(X ** 2, axis=1))\n 0.07...\n ", - "source_code": "\n\nclass MiniBatchDictionaryLearning(_BaseSparseCoding, BaseEstimator):\n \"\"\"Mini-batch dictionary learning.\n\n Finds a dictionary (a set of atoms) that performs well at sparsely\n encoding the fitted data.\n\n Solves the optimization problem::\n\n (U^*,V^*) = argmin 0.5 || X - U V ||_Fro^2 + alpha * || U ||_1,1\n (U,V)\n with || V_k ||_2 = 1 for all 0 <= k < n_components\n\n ||.||_Fro stands for the Frobenius norm and ||.||_1,1 stands for\n the entry-wise matrix norm which is the sum of the absolute values\n of all the entries in the matrix.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n n_components : int, default=None\n Number of dictionary elements to extract.\n\n alpha : float, default=1\n Sparsity controlling parameter.\n\n n_iter : int, default=1000\n Total number of iterations to perform.\n\n fit_algorithm : {'lars', 'cd'}, default='lars'\n The algorithm used:\n\n - `'lars'`: uses the least angle regression method to solve the lasso\n problem (`linear_model.lars_path`)\n - `'cd'`: uses the coordinate descent method to compute the\n Lasso solution (`linear_model.Lasso`). Lars will be faster if\n the estimated components are sparse.\n\n n_jobs : int, default=None\n Number of parallel jobs to run.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n batch_size : int, default=3\n Number of samples in each mini-batch.\n\n shuffle : bool, default=True\n Whether to shuffle the samples before forming batches.\n\n dict_init : ndarray of shape (n_components, n_features), default=None\n Initial value of the dictionary for warm restart scenarios.\n\n transform_algorithm : {'lasso_lars', 'lasso_cd', 'lars', 'omp', 'threshold'}, default='omp'\n Algorithm used to transform the data:\n\n - `'lars'`: uses the least angle regression method\n (`linear_model.lars_path`);\n - `'lasso_lars'`: uses Lars to compute the Lasso solution.\n - `'lasso_cd'`: uses the coordinate descent method to compute the\n Lasso solution (`linear_model.Lasso`). `'lasso_lars'` will be faster\n if the estimated components are sparse.\n - `'omp'`: uses orthogonal matching pursuit to estimate the sparse\n solution.\n - `'threshold'`: squashes to zero all coefficients less than alpha from\n the projection ``dictionary * X'``.\n\n transform_n_nonzero_coefs : int, default=None\n Number of nonzero coefficients to target in each column of the\n solution. This is only used by `algorithm='lars'` and\n `algorithm='omp'`. 
If `None`, then\n `transform_n_nonzero_coefs=int(n_features / 10)`.\n\n transform_alpha : float, default=None\n If `algorithm='lasso_lars'` or `algorithm='lasso_cd'`, `alpha` is the\n penalty applied to the L1 norm.\n If `algorithm='threshold'`, `alpha` is the absolute value of the\n threshold below which coefficients will be squashed to zero.\n If `None`, defaults to `alpha`.\n\n verbose : bool, default=False\n To control the verbosity of the procedure.\n\n split_sign : bool, default=False\n Whether to split the sparse feature vector into the concatenation of\n its negative part and its positive part. This can improve the\n performance of downstream classifiers.\n\n random_state : int, RandomState instance or None, default=None\n Used for initializing the dictionary when ``dict_init`` is not\n specified, randomly shuffling the data when ``shuffle`` is set to\n ``True``, and updating the dictionary. Pass an int for reproducible\n results across multiple function calls.\n See :term:`Glossary `.\n\n positive_code : bool, default=False\n Whether to enforce positivity when finding the code.\n\n .. versionadded:: 0.20\n\n positive_dict : bool, default=False\n Whether to enforce positivity when finding the dictionary.\n\n .. versionadded:: 0.20\n\n transform_max_iter : int, default=1000\n Maximum number of iterations to perform if `algorithm='lasso_cd'` or\n `'lasso_lars'`.\n\n .. versionadded:: 0.22\n\n Attributes\n ----------\n components_ : ndarray of shape (n_components, n_features)\n Components extracted from the data.\n\n inner_stats_ : tuple of (A, B) ndarrays\n Internal sufficient statistics that are kept by the algorithm.\n Keeping them is useful in online settings, to avoid losing the\n history of the evolution, but they shouldn't have any use for the\n end user.\n `A` `(n_components, n_components)` is the dictionary covariance matrix.\n `B` `(n_features, n_components)` is the data approximation matrix.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n n_iter_ : int\n Number of iterations run.\n\n iter_offset_ : int\n The number of iteration on data batches that has been\n performed before.\n\n random_state_ : RandomState instance\n RandomState instance that is generated either from a seed, the random\n number generattor or by `np.random`.\n\n See Also\n --------\n DictionaryLearning : Find a dictionary that sparsely encodes data.\n MiniBatchSparsePCA : Mini-batch Sparse Principal Components Analysis.\n SparseCoder : Find a sparse representation of data from a fixed,\n precomputed dictionary.\n SparsePCA : Sparse Principal Components Analysis.\n\n References\n ----------\n\n J. Mairal, F. Bach, J. Ponce, G. Sapiro, 2009: Online dictionary learning\n for sparse coding (https://www.di.ens.fr/sierra/pdfs/icml09.pdf)\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.datasets import make_sparse_coded_signal\n >>> from sklearn.decomposition import MiniBatchDictionaryLearning\n >>> X, dictionary, code = make_sparse_coded_signal(\n ... n_samples=100, n_components=15, n_features=20, n_nonzero_coefs=10,\n ... random_state=42)\n >>> dict_learner = MiniBatchDictionaryLearning(\n ... n_components=15, transform_algorithm='lasso_lars', random_state=42,\n ... 
)\n >>> X_transformed = dict_learner.fit_transform(X)\n\n We can check the level of sparsity of `X_transformed`:\n\n >>> np.mean(X_transformed == 0)\n 0.86...\n\n We can compare the average squared euclidean norm of the reconstruction\n error of the sparse coded signal relative to the squared euclidean norm of\n the original signal:\n\n >>> X_hat = X_transformed @ dict_learner.components_\n >>> np.mean(np.sum((X_hat - X) ** 2, axis=1) / np.sum(X ** 2, axis=1))\n 0.07...\n \"\"\"\n \n def __init__(self, n_components=None, *, alpha=1, n_iter=1000, fit_algorithm='lars', n_jobs=None, batch_size=3, shuffle=True, dict_init=None, transform_algorithm='omp', transform_n_nonzero_coefs=None, transform_alpha=None, verbose=False, split_sign=False, random_state=None, positive_code=False, positive_dict=False, transform_max_iter=1000):\n super().__init__(transform_algorithm, transform_n_nonzero_coefs, transform_alpha, split_sign, n_jobs, positive_code, transform_max_iter)\n self.n_components = n_components\n self.alpha = alpha\n self.n_iter = n_iter\n self.fit_algorithm = fit_algorithm\n self.dict_init = dict_init\n self.verbose = verbose\n self.shuffle = shuffle\n self.batch_size = batch_size\n self.split_sign = split_sign\n self.random_state = random_state\n self.positive_dict = positive_dict\n \n def fit(self, X, y=None):\n \"\"\"Fit the model from data in X.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training vector, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n Returns\n -------\n self : object\n Returns the instance itself.\n \"\"\"\n random_state = check_random_state(self.random_state)\n X = self._validate_data(X)\n (U, (A, B), self.n_iter_) = dict_learning_online(X, self.n_components, alpha=self.alpha, n_iter=self.n_iter, return_code=False, method=self.fit_algorithm, method_max_iter=self.transform_max_iter, n_jobs=self.n_jobs, dict_init=self.dict_init, batch_size=self.batch_size, shuffle=self.shuffle, verbose=self.verbose, random_state=random_state, return_inner_stats=True, return_n_iter=True, positive_dict=self.positive_dict, positive_code=self.positive_code)\n self.components_ = U\n self.inner_stats_ = (A, B)\n self.iter_offset_ = self.n_iter\n self.random_state_ = random_state\n return self\n \n def partial_fit(self, X, y=None, iter_offset=None):\n \"\"\"Update the model using the data in X as a mini-batch.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training vector, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n iter_offset : int, default=None\n The number of iteration on data batches that has been\n performed before this call to `partial_fit`. 
This is optional:\n if no number is passed, the memory of the object is\n used.\n\n Returns\n -------\n self : object\n Returns the instance itself.\n \"\"\"\n if not hasattr(self, 'random_state_'):\n self.random_state_ = check_random_state(self.random_state)\n if hasattr(self, 'components_'):\n dict_init = self.components_\n else:\n dict_init = self.dict_init\n inner_stats = getattr(self, 'inner_stats_', None)\n if iter_offset is None:\n iter_offset = getattr(self, 'iter_offset_', 0)\n X = self._validate_data(X, reset=iter_offset == 0)\n (U, (A, B)) = dict_learning_online(X, self.n_components, alpha=self.alpha, n_iter=1, method=self.fit_algorithm, method_max_iter=self.transform_max_iter, n_jobs=self.n_jobs, dict_init=dict_init, batch_size=len(X), shuffle=False, verbose=self.verbose, return_code=False, iter_offset=iter_offset, random_state=self.random_state_, return_inner_stats=True, inner_stats=inner_stats, positive_dict=self.positive_dict, positive_code=self.positive_code)\n self.components_ = U\n self.inner_stats_ = (A, B)\n self.iter_offset_ = iter_offset + 1\n return self\n" + "description": "Mini-batch dictionary learning.\n\nFinds a dictionary (a set of atoms) that performs well at sparsely\nencoding the fitted data.\n\nSolves the optimization problem::\n\n (U^*,V^*) = argmin 0.5 || X - U V ||_Fro^2 + alpha * || U ||_1,1\n (U,V)\n with || V_k ||_2 <= 1 for all 0 <= k < n_components\n\n||.||_Fro stands for the Frobenius norm and ||.||_1,1 stands for\nthe entry-wise matrix norm which is the sum of the absolute values\nof all the entries in the matrix.\n\nRead more in the :ref:`User Guide `.", + "docstring": "Mini-batch dictionary learning.\n\n Finds a dictionary (a set of atoms) that performs well at sparsely\n encoding the fitted data.\n\n Solves the optimization problem::\n\n (U^*,V^*) = argmin 0.5 || X - U V ||_Fro^2 + alpha * || U ||_1,1\n (U,V)\n with || V_k ||_2 <= 1 for all 0 <= k < n_components\n\n ||.||_Fro stands for the Frobenius norm and ||.||_1,1 stands for\n the entry-wise matrix norm which is the sum of the absolute values\n of all the entries in the matrix.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n n_components : int, default=None\n Number of dictionary elements to extract.\n\n alpha : float, default=1\n Sparsity controlling parameter.\n\n n_iter : int, default=1000\n Total number of iterations to perform.\n\n fit_algorithm : {'lars', 'cd'}, default='lars'\n The algorithm used:\n\n - `'lars'`: uses the least angle regression method to solve the lasso\n problem (`linear_model.lars_path`)\n - `'cd'`: uses the coordinate descent method to compute the\n Lasso solution (`linear_model.Lasso`). Lars will be faster if\n the estimated components are sparse.\n\n n_jobs : int, default=None\n Number of parallel jobs to run.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. 
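The MiniBatchDictionaryLearning entry above states the objective being minimised, 0.5 * ||X - U V||_Fro^2 + alpha * ||U||_1,1 with unit-norm atoms. As a minimal sketch (toy random data and arbitrarily chosen hyper-parameters, none of which come from the original entry), the fitted code and dictionary can be plugged back into that objective:

import numpy as np
from sklearn.decomposition import MiniBatchDictionaryLearning

rng = np.random.RandomState(0)
X = rng.randn(200, 30)  # toy data: 200 samples, 30 features

est = MiniBatchDictionaryLearning(n_components=10, alpha=1.0, n_iter=500,
                                  transform_algorithm='lasso_lars',
                                  transform_alpha=1.0, random_state=0)
U = est.fit_transform(X)   # sparse code, shape (200, 10)
V = est.components_        # dictionary atoms, shape (10, 30)

# Objective from the class description: 0.5 * ||X - U V||_Fro^2 + alpha * ||U||_1,1
objective = 0.5 * np.sum((X - U @ V) ** 2) + est.alpha * np.abs(U).sum()
print(objective)
print(np.linalg.norm(V, axis=1).max())  # atoms respect ||V_k||_2 <= 1
print(np.mean(U == 0))                  # fraction of exactly-zero coefficients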
See :term:`Glossary `\n for more details.\n\n batch_size : int, default=3\n Number of samples in each mini-batch.\n\n shuffle : bool, default=True\n Whether to shuffle the samples before forming batches.\n\n dict_init : ndarray of shape (n_components, n_features), default=None\n Initial value of the dictionary for warm restart scenarios.\n\n transform_algorithm : {'lasso_lars', 'lasso_cd', 'lars', 'omp', 'threshold'}, default='omp'\n Algorithm used to transform the data:\n\n - `'lars'`: uses the least angle regression method\n (`linear_model.lars_path`);\n - `'lasso_lars'`: uses Lars to compute the Lasso solution.\n - `'lasso_cd'`: uses the coordinate descent method to compute the\n Lasso solution (`linear_model.Lasso`). `'lasso_lars'` will be faster\n if the estimated components are sparse.\n - `'omp'`: uses orthogonal matching pursuit to estimate the sparse\n solution.\n - `'threshold'`: squashes to zero all coefficients less than alpha from\n the projection ``dictionary * X'``.\n\n transform_n_nonzero_coefs : int, default=None\n Number of nonzero coefficients to target in each column of the\n solution. This is only used by `algorithm='lars'` and\n `algorithm='omp'`. If `None`, then\n `transform_n_nonzero_coefs=int(n_features / 10)`.\n\n transform_alpha : float, default=None\n If `algorithm='lasso_lars'` or `algorithm='lasso_cd'`, `alpha` is the\n penalty applied to the L1 norm.\n If `algorithm='threshold'`, `alpha` is the absolute value of the\n threshold below which coefficients will be squashed to zero.\n If `None`, defaults to `alpha`.\n\n verbose : bool, default=False\n To control the verbosity of the procedure.\n\n split_sign : bool, default=False\n Whether to split the sparse feature vector into the concatenation of\n its negative part and its positive part. This can improve the\n performance of downstream classifiers.\n\n random_state : int, RandomState instance or None, default=None\n Used for initializing the dictionary when ``dict_init`` is not\n specified, randomly shuffling the data when ``shuffle`` is set to\n ``True``, and updating the dictionary. Pass an int for reproducible\n results across multiple function calls.\n See :term:`Glossary `.\n\n positive_code : bool, default=False\n Whether to enforce positivity when finding the code.\n\n .. versionadded:: 0.20\n\n positive_dict : bool, default=False\n Whether to enforce positivity when finding the dictionary.\n\n .. versionadded:: 0.20\n\n transform_max_iter : int, default=1000\n Maximum number of iterations to perform if `algorithm='lasso_cd'` or\n `'lasso_lars'`.\n\n .. versionadded:: 0.22\n\n Attributes\n ----------\n components_ : ndarray of shape (n_components, n_features)\n Components extracted from the data.\n\n inner_stats_ : tuple of (A, B) ndarrays\n Internal sufficient statistics that are kept by the algorithm.\n Keeping them is useful in online settings, to avoid losing the\n history of the evolution, but they shouldn't have any use for the\n end user.\n `A` `(n_components, n_components)` is the dictionary covariance matrix.\n `B` `(n_features, n_components)` is the data approximation matrix.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. 
versionadded:: 1.0\n\n n_iter_ : int\n Number of iterations run.\n\n iter_offset_ : int\n The number of iteration on data batches that has been\n performed before.\n\n random_state_ : RandomState instance\n RandomState instance that is generated either from a seed, the random\n number generattor or by `np.random`.\n\n See Also\n --------\n DictionaryLearning : Find a dictionary that sparsely encodes data.\n MiniBatchSparsePCA : Mini-batch Sparse Principal Components Analysis.\n SparseCoder : Find a sparse representation of data from a fixed,\n precomputed dictionary.\n SparsePCA : Sparse Principal Components Analysis.\n\n References\n ----------\n\n J. Mairal, F. Bach, J. Ponce, G. Sapiro, 2009: Online dictionary learning\n for sparse coding (https://www.di.ens.fr/sierra/pdfs/icml09.pdf)\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.datasets import make_sparse_coded_signal\n >>> from sklearn.decomposition import MiniBatchDictionaryLearning\n >>> X, dictionary, code = make_sparse_coded_signal(\n ... n_samples=100, n_components=15, n_features=20, n_nonzero_coefs=10,\n ... random_state=42)\n >>> dict_learner = MiniBatchDictionaryLearning(\n ... n_components=15, transform_algorithm='lasso_lars', random_state=42,\n ... )\n >>> X_transformed = dict_learner.fit_transform(X)\n\n We can check the level of sparsity of `X_transformed`:\n\n >>> np.mean(X_transformed == 0)\n 0.86...\n\n We can compare the average squared euclidean norm of the reconstruction\n error of the sparse coded signal relative to the squared euclidean norm of\n the original signal:\n\n >>> X_hat = X_transformed @ dict_learner.components_\n >>> np.mean(np.sum((X_hat - X) ** 2, axis=1) / np.sum(X ** 2, axis=1))\n 0.07...\n ", + "source_code": "\n\nclass MiniBatchDictionaryLearning(_BaseSparseCoding, BaseEstimator):\n \"\"\"Mini-batch dictionary learning.\n\n Finds a dictionary (a set of atoms) that performs well at sparsely\n encoding the fitted data.\n\n Solves the optimization problem::\n\n (U^*,V^*) = argmin 0.5 || X - U V ||_Fro^2 + alpha * || U ||_1,1\n (U,V)\n with || V_k ||_2 <= 1 for all 0 <= k < n_components\n\n ||.||_Fro stands for the Frobenius norm and ||.||_1,1 stands for\n the entry-wise matrix norm which is the sum of the absolute values\n of all the entries in the matrix.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n n_components : int, default=None\n Number of dictionary elements to extract.\n\n alpha : float, default=1\n Sparsity controlling parameter.\n\n n_iter : int, default=1000\n Total number of iterations to perform.\n\n fit_algorithm : {'lars', 'cd'}, default='lars'\n The algorithm used:\n\n - `'lars'`: uses the least angle regression method to solve the lasso\n problem (`linear_model.lars_path`)\n - `'cd'`: uses the coordinate descent method to compute the\n Lasso solution (`linear_model.Lasso`). Lars will be faster if\n the estimated components are sparse.\n\n n_jobs : int, default=None\n Number of parallel jobs to run.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. 
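partial_fit (shown further down in this entry) updates the dictionary one mini-batch at a time, reusing the stored inner_stats_ and advancing iter_offset_. A hedged sketch of that streaming usage, on synthetic data that is not part of the original entry:

import numpy as np
from sklearn.decomposition import MiniBatchDictionaryLearning

rng = np.random.RandomState(0)
X = rng.randn(500, 30)

est = MiniBatchDictionaryLearning(n_components=10, random_state=0)

# Each call consumes one mini-batch, refines the dictionary from the kept
# sufficient statistics, and increments iter_offset_ by one.
for batch in np.array_split(X, 10):
    est.partial_fit(batch)

print(est.components_.shape)  # (10, 30)
print(est.iter_offset_)       # 10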
See :term:`Glossary `\n for more details.\n\n batch_size : int, default=3\n Number of samples in each mini-batch.\n\n shuffle : bool, default=True\n Whether to shuffle the samples before forming batches.\n\n dict_init : ndarray of shape (n_components, n_features), default=None\n Initial value of the dictionary for warm restart scenarios.\n\n transform_algorithm : {'lasso_lars', 'lasso_cd', 'lars', 'omp', 'threshold'}, default='omp'\n Algorithm used to transform the data:\n\n - `'lars'`: uses the least angle regression method\n (`linear_model.lars_path`);\n - `'lasso_lars'`: uses Lars to compute the Lasso solution.\n - `'lasso_cd'`: uses the coordinate descent method to compute the\n Lasso solution (`linear_model.Lasso`). `'lasso_lars'` will be faster\n if the estimated components are sparse.\n - `'omp'`: uses orthogonal matching pursuit to estimate the sparse\n solution.\n - `'threshold'`: squashes to zero all coefficients less than alpha from\n the projection ``dictionary * X'``.\n\n transform_n_nonzero_coefs : int, default=None\n Number of nonzero coefficients to target in each column of the\n solution. This is only used by `algorithm='lars'` and\n `algorithm='omp'`. If `None`, then\n `transform_n_nonzero_coefs=int(n_features / 10)`.\n\n transform_alpha : float, default=None\n If `algorithm='lasso_lars'` or `algorithm='lasso_cd'`, `alpha` is the\n penalty applied to the L1 norm.\n If `algorithm='threshold'`, `alpha` is the absolute value of the\n threshold below which coefficients will be squashed to zero.\n If `None`, defaults to `alpha`.\n\n verbose : bool, default=False\n To control the verbosity of the procedure.\n\n split_sign : bool, default=False\n Whether to split the sparse feature vector into the concatenation of\n its negative part and its positive part. This can improve the\n performance of downstream classifiers.\n\n random_state : int, RandomState instance or None, default=None\n Used for initializing the dictionary when ``dict_init`` is not\n specified, randomly shuffling the data when ``shuffle`` is set to\n ``True``, and updating the dictionary. Pass an int for reproducible\n results across multiple function calls.\n See :term:`Glossary `.\n\n positive_code : bool, default=False\n Whether to enforce positivity when finding the code.\n\n .. versionadded:: 0.20\n\n positive_dict : bool, default=False\n Whether to enforce positivity when finding the dictionary.\n\n .. versionadded:: 0.20\n\n transform_max_iter : int, default=1000\n Maximum number of iterations to perform if `algorithm='lasso_cd'` or\n `'lasso_lars'`.\n\n .. versionadded:: 0.22\n\n Attributes\n ----------\n components_ : ndarray of shape (n_components, n_features)\n Components extracted from the data.\n\n inner_stats_ : tuple of (A, B) ndarrays\n Internal sufficient statistics that are kept by the algorithm.\n Keeping them is useful in online settings, to avoid losing the\n history of the evolution, but they shouldn't have any use for the\n end user.\n `A` `(n_components, n_components)` is the dictionary covariance matrix.\n `B` `(n_features, n_components)` is the data approximation matrix.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. 
versionadded:: 1.0\n\n n_iter_ : int\n Number of iterations run.\n\n iter_offset_ : int\n The number of iteration on data batches that has been\n performed before.\n\n random_state_ : RandomState instance\n RandomState instance that is generated either from a seed, the random\n number generattor or by `np.random`.\n\n See Also\n --------\n DictionaryLearning : Find a dictionary that sparsely encodes data.\n MiniBatchSparsePCA : Mini-batch Sparse Principal Components Analysis.\n SparseCoder : Find a sparse representation of data from a fixed,\n precomputed dictionary.\n SparsePCA : Sparse Principal Components Analysis.\n\n References\n ----------\n\n J. Mairal, F. Bach, J. Ponce, G. Sapiro, 2009: Online dictionary learning\n for sparse coding (https://www.di.ens.fr/sierra/pdfs/icml09.pdf)\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.datasets import make_sparse_coded_signal\n >>> from sklearn.decomposition import MiniBatchDictionaryLearning\n >>> X, dictionary, code = make_sparse_coded_signal(\n ... n_samples=100, n_components=15, n_features=20, n_nonzero_coefs=10,\n ... random_state=42)\n >>> dict_learner = MiniBatchDictionaryLearning(\n ... n_components=15, transform_algorithm='lasso_lars', random_state=42,\n ... )\n >>> X_transformed = dict_learner.fit_transform(X)\n\n We can check the level of sparsity of `X_transformed`:\n\n >>> np.mean(X_transformed == 0)\n 0.86...\n\n We can compare the average squared euclidean norm of the reconstruction\n error of the sparse coded signal relative to the squared euclidean norm of\n the original signal:\n\n >>> X_hat = X_transformed @ dict_learner.components_\n >>> np.mean(np.sum((X_hat - X) ** 2, axis=1) / np.sum(X ** 2, axis=1))\n 0.07...\n \"\"\"\n \n def __init__(self, n_components=None, *, alpha=1, n_iter=1000, fit_algorithm='lars', n_jobs=None, batch_size=3, shuffle=True, dict_init=None, transform_algorithm='omp', transform_n_nonzero_coefs=None, transform_alpha=None, verbose=False, split_sign=False, random_state=None, positive_code=False, positive_dict=False, transform_max_iter=1000):\n super().__init__(transform_algorithm, transform_n_nonzero_coefs, transform_alpha, split_sign, n_jobs, positive_code, transform_max_iter)\n self.n_components = n_components\n self.alpha = alpha\n self.n_iter = n_iter\n self.fit_algorithm = fit_algorithm\n self.dict_init = dict_init\n self.verbose = verbose\n self.shuffle = shuffle\n self.batch_size = batch_size\n self.split_sign = split_sign\n self.random_state = random_state\n self.positive_dict = positive_dict\n \n def fit(self, X, y=None):\n \"\"\"Fit the model from data in X.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training vector, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n Returns\n -------\n self : object\n Returns the instance itself.\n \"\"\"\n random_state = check_random_state(self.random_state)\n X = self._validate_data(X)\n (U, (A, B), self.n_iter_) = dict_learning_online(X, self.n_components, alpha=self.alpha, n_iter=self.n_iter, return_code=False, method=self.fit_algorithm, method_max_iter=self.transform_max_iter, n_jobs=self.n_jobs, dict_init=self.dict_init, batch_size=self.batch_size, shuffle=self.shuffle, verbose=self.verbose, random_state=random_state, return_inner_stats=True, return_n_iter=True, positive_dict=self.positive_dict, positive_code=self.positive_code)\n self.components_ = U\n self.inner_stats_ = (A, B)\n 
self.iter_offset_ = self.n_iter\n self.random_state_ = random_state\n return self\n \n def partial_fit(self, X, y=None, iter_offset=None):\n \"\"\"Update the model using the data in X as a mini-batch.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training vector, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n iter_offset : int, default=None\n The number of iteration on data batches that has been\n performed before this call to `partial_fit`. This is optional:\n if no number is passed, the memory of the object is\n used.\n\n Returns\n -------\n self : object\n Returns the instance itself.\n \"\"\"\n if not hasattr(self, 'random_state_'):\n self.random_state_ = check_random_state(self.random_state)\n if hasattr(self, 'components_'):\n dict_init = self.components_\n else:\n dict_init = self.dict_init\n inner_stats = getattr(self, 'inner_stats_', None)\n if iter_offset is None:\n iter_offset = getattr(self, 'iter_offset_', 0)\n X = self._validate_data(X, reset=iter_offset == 0)\n (U, (A, B)) = dict_learning_online(X, self.n_components, alpha=self.alpha, n_iter=1, method=self.fit_algorithm, method_max_iter=self.transform_max_iter, n_jobs=self.n_jobs, dict_init=dict_init, batch_size=len(X), shuffle=False, verbose=self.verbose, return_code=False, iter_offset=iter_offset, random_state=self.random_state_, return_inner_stats=True, inner_stats=inner_stats, positive_dict=self.positive_dict, positive_code=self.positive_code)\n self.components_ = U\n self.inner_stats_ = (A, B)\n self.iter_offset_ = iter_offset + 1\n return self\n" }, { "name": "SparseCoder", @@ -20481,7 +20547,7 @@ "sklearn.decomposition._dict_learning.SparseCoder.n_features_in_@getter" ], "is_public": true, - "description": "Sparse coding.\n\nFinds a sparse representation of data against a fixed, precomputed dictionary. Each row of the result is the solution to a sparse coding problem. The goal is to find a sparse array `code` such that:: X ~= code * dictionary Read more in the :ref:`User Guide `.", + "description": "Sparse coding.\n\nFinds a sparse representation of data against a fixed, precomputed\ndictionary.\n\nEach row of the result is the solution to a sparse coding problem.\nThe goal is to find a sparse array `code` such that::\n\n X ~= code * dictionary\n\nRead more in the :ref:`User Guide `.", "docstring": "Sparse coding.\n\n Finds a sparse representation of data against a fixed, precomputed\n dictionary.\n\n Each row of the result is the solution to a sparse coding problem.\n The goal is to find a sparse array `code` such that::\n\n X ~= code * dictionary\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n dictionary : ndarray of shape (n_components, n_features)\n The dictionary atoms used for sparse coding. Lines are assumed to be\n normalized to unit norm.\n\n transform_algorithm : {'lasso_lars', 'lasso_cd', 'lars', 'omp', 'threshold'}, default='omp'\n Algorithm used to transform the data:\n\n - `'lars'`: uses the least angle regression method\n (`linear_model.lars_path`);\n - `'lasso_lars'`: uses Lars to compute the Lasso solution;\n - `'lasso_cd'`: uses the coordinate descent method to compute the\n Lasso solution (linear_model.Lasso). 
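The SparseCoder entry above describes encoding data against a fixed, precomputed dictionary so that X ~= code * dictionary. A minimal sketch of that relation, assuming a small random dictionary with unit-norm rows (none of these values come from the original entry):

import numpy as np
from sklearn.decomposition import SparseCoder

rng = np.random.RandomState(0)
dictionary = rng.randn(8, 5)  # 8 atoms over 5 features
dictionary /= np.linalg.norm(dictionary, axis=1, keepdims=True)  # unit-norm rows

X = rng.randn(3, 5)  # 3 signals to encode
coder = SparseCoder(dictionary=dictionary,
                    transform_algorithm='lasso_lars', transform_alpha=1e-6)
code = coder.transform(X)  # shape (3, 8)

# The relation from the description: X ~= code * dictionary
print(np.abs(X - code @ dictionary).max())  # reconstruction error, close to 0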
`'lasso_lars'` will be faster if\n the estimated components are sparse;\n - `'omp'`: uses orthogonal matching pursuit to estimate the sparse\n solution;\n - `'threshold'`: squashes to zero all coefficients less than alpha from\n the projection ``dictionary * X'``.\n\n transform_n_nonzero_coefs : int, default=None\n Number of nonzero coefficients to target in each column of the\n solution. This is only used by `algorithm='lars'` and `algorithm='omp'`\n and is overridden by `alpha` in the `omp` case. If `None`, then\n `transform_n_nonzero_coefs=int(n_features / 10)`.\n\n transform_alpha : float, default=None\n If `algorithm='lasso_lars'` or `algorithm='lasso_cd'`, `alpha` is the\n penalty applied to the L1 norm.\n If `algorithm='threshold'`, `alpha` is the absolute value of the\n threshold below which coefficients will be squashed to zero.\n If `algorithm='omp'`, `alpha` is the tolerance parameter: the value of\n the reconstruction error targeted. In this case, it overrides\n `n_nonzero_coefs`.\n If `None`, default to 1.\n\n split_sign : bool, default=False\n Whether to split the sparse feature vector into the concatenation of\n its negative part and its positive part. This can improve the\n performance of downstream classifiers.\n\n n_jobs : int, default=None\n Number of parallel jobs to run.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n positive_code : bool, default=False\n Whether to enforce positivity when finding the code.\n\n .. versionadded:: 0.20\n\n transform_max_iter : int, default=1000\n Maximum number of iterations to perform if `algorithm='lasso_cd'` or\n `lasso_lars`.\n\n .. versionadded:: 0.22\n\n Attributes\n ----------\n components_ : ndarray of shape (n_components, n_features)\n The unchanged dictionary atoms.\n\n .. deprecated:: 0.24\n This attribute is deprecated in 0.24 and will be removed in\n 1.1 (renaming of 0.26). Use `dictionary` instead.\n\n n_components_ : int\n Number of atoms.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n DictionaryLearning : Find a dictionary that sparsely encodes data.\n MiniBatchDictionaryLearning : A faster, less accurate, version of the\n dictionary learning algorithm.\n MiniBatchSparsePCA : Mini-batch Sparse Principal Components Analysis.\n SparsePCA : Mini-batch Sparse Principal Components Analysis.\n sparse_encode : Sparse coding where each row of the result is the solution\n to a sparse coding problem.\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.decomposition import SparseCoder\n >>> X = np.array([[-1, -1, -1], [0, 0, 3]])\n >>> dictionary = np.array(\n ... [[0, 1, 0],\n ... [-1, -1, 2],\n ... [1, 1, 1],\n ... [0, 1, 1],\n ... [0, 2, 1]],\n ... dtype=np.float64\n ... )\n >>> coder = SparseCoder(\n ... dictionary=dictionary, transform_algorithm='lasso_lars',\n ... transform_alpha=1e-10,\n ... 
)\n >>> coder.transform(X)\n array([[ 0., 0., -1., 0., 0.],\n [ 0., 1., 1., 0., 0.]])\n ", "source_code": "\n\nclass SparseCoder(_BaseSparseCoding, BaseEstimator):\n \"\"\"Sparse coding.\n\n Finds a sparse representation of data against a fixed, precomputed\n dictionary.\n\n Each row of the result is the solution to a sparse coding problem.\n The goal is to find a sparse array `code` such that::\n\n X ~= code * dictionary\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n dictionary : ndarray of shape (n_components, n_features)\n The dictionary atoms used for sparse coding. Lines are assumed to be\n normalized to unit norm.\n\n transform_algorithm : {'lasso_lars', 'lasso_cd', 'lars', 'omp', 'threshold'}, default='omp'\n Algorithm used to transform the data:\n\n - `'lars'`: uses the least angle regression method\n (`linear_model.lars_path`);\n - `'lasso_lars'`: uses Lars to compute the Lasso solution;\n - `'lasso_cd'`: uses the coordinate descent method to compute the\n Lasso solution (linear_model.Lasso). `'lasso_lars'` will be faster if\n the estimated components are sparse;\n - `'omp'`: uses orthogonal matching pursuit to estimate the sparse\n solution;\n - `'threshold'`: squashes to zero all coefficients less than alpha from\n the projection ``dictionary * X'``.\n\n transform_n_nonzero_coefs : int, default=None\n Number of nonzero coefficients to target in each column of the\n solution. This is only used by `algorithm='lars'` and `algorithm='omp'`\n and is overridden by `alpha` in the `omp` case. If `None`, then\n `transform_n_nonzero_coefs=int(n_features / 10)`.\n\n transform_alpha : float, default=None\n If `algorithm='lasso_lars'` or `algorithm='lasso_cd'`, `alpha` is the\n penalty applied to the L1 norm.\n If `algorithm='threshold'`, `alpha` is the absolute value of the\n threshold below which coefficients will be squashed to zero.\n If `algorithm='omp'`, `alpha` is the tolerance parameter: the value of\n the reconstruction error targeted. In this case, it overrides\n `n_nonzero_coefs`.\n If `None`, default to 1.\n\n split_sign : bool, default=False\n Whether to split the sparse feature vector into the concatenation of\n its negative part and its positive part. This can improve the\n performance of downstream classifiers.\n\n n_jobs : int, default=None\n Number of parallel jobs to run.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n positive_code : bool, default=False\n Whether to enforce positivity when finding the code.\n\n .. versionadded:: 0.20\n\n transform_max_iter : int, default=1000\n Maximum number of iterations to perform if `algorithm='lasso_cd'` or\n `lasso_lars`.\n\n .. versionadded:: 0.22\n\n Attributes\n ----------\n components_ : ndarray of shape (n_components, n_features)\n The unchanged dictionary atoms.\n\n .. deprecated:: 0.24\n This attribute is deprecated in 0.24 and will be removed in\n 1.1 (renaming of 0.26). Use `dictionary` instead.\n\n n_components_ : int\n Number of atoms.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. 
versionadded:: 1.0\n\n See Also\n --------\n DictionaryLearning : Find a dictionary that sparsely encodes data.\n MiniBatchDictionaryLearning : A faster, less accurate, version of the\n dictionary learning algorithm.\n MiniBatchSparsePCA : Mini-batch Sparse Principal Components Analysis.\n SparsePCA : Mini-batch Sparse Principal Components Analysis.\n sparse_encode : Sparse coding where each row of the result is the solution\n to a sparse coding problem.\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.decomposition import SparseCoder\n >>> X = np.array([[-1, -1, -1], [0, 0, 3]])\n >>> dictionary = np.array(\n ... [[0, 1, 0],\n ... [-1, -1, 2],\n ... [1, 1, 1],\n ... [0, 1, 1],\n ... [0, 2, 1]],\n ... dtype=np.float64\n ... )\n >>> coder = SparseCoder(\n ... dictionary=dictionary, transform_algorithm='lasso_lars',\n ... transform_alpha=1e-10,\n ... )\n >>> coder.transform(X)\n array([[ 0., 0., -1., 0., 0.],\n [ 0., 1., 1., 0., 0.]])\n \"\"\"\n _required_parameters = ['dictionary']\n \n def __init__(self, dictionary, *, transform_algorithm='omp', transform_n_nonzero_coefs=None, transform_alpha=None, split_sign=False, n_jobs=None, positive_code=False, transform_max_iter=1000):\n super().__init__(transform_algorithm, transform_n_nonzero_coefs, transform_alpha, split_sign, n_jobs, positive_code, transform_max_iter)\n self.dictionary = dictionary\n \n def fit(self, X, y=None):\n \"\"\"Do nothing and return the estimator unchanged.\n\n This method is just there to implement the usual API and hence\n work in pipelines.\n\n Parameters\n ----------\n X : Ignored\n Not used, present for API consistency by convention.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n Returns\n -------\n self : object\n Returns the instance itself.\n \"\"\"\n return self\n \n @deprecated('The attribute `components_` is deprecated in 0.24 and will be removed in 1.1 (renaming of 0.26). Use the `dictionary` instead.')\n @property\n def components_(self):\n return self.dictionary\n \n def transform(self, X, y=None):\n \"\"\"Encode the data as a sparse combination of the dictionary atoms.\n\n Coding method is determined by the object parameter\n `transform_algorithm`.\n\n Parameters\n ----------\n X : ndarray of shape (n_samples, n_features)\n Training vector, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n Returns\n -------\n X_new : ndarray of shape (n_samples, n_components)\n Transformed data.\n \"\"\"\n return super()._transform(X, self.dictionary)\n \n def _more_tags(self):\n return {'requires_fit': False}\n \n @property\n def n_components_(self):\n \"\"\"Number of atoms.\"\"\"\n return self.dictionary.shape[0]\n \n @property\n def n_features_in_(self):\n \"\"\"Number of features seen during `fit`.\"\"\"\n return self.dictionary.shape[1]\n" }, @@ -20516,7 +20582,7 @@ "sklearn.decomposition._factor_analysis.FactorAnalysis._rotate" ], "is_public": true, - "description": "Factor Analysis (FA).\n\nA simple linear generative model with Gaussian latent variables. The observations are assumed to be caused by a linear transformation of lower dimensional latent factors and added Gaussian noise. Without loss of generality the factors are distributed according to a Gaussian with zero mean and unit covariance. The noise is also zero mean and has an arbitrary diagonal covariance matrix. 
If we would restrict the model further, by assuming that the Gaussian noise is even isotropic (all diagonal entries are the same) we would obtain :class:`PPCA`. FactorAnalysis performs a maximum likelihood estimate of the so-called `loading` matrix, the transformation of the latent variables to the observed ones, using SVD based approach. Read more in the :ref:`User Guide `. .. versionadded:: 0.13", + "description": "Factor Analysis (FA).\n\nA simple linear generative model with Gaussian latent variables.\n\nThe observations are assumed to be caused by a linear transformation of\nlower dimensional latent factors and added Gaussian noise.\nWithout loss of generality the factors are distributed according to a\nGaussian with zero mean and unit covariance. The noise is also zero mean\nand has an arbitrary diagonal covariance matrix.\n\nIf we would restrict the model further, by assuming that the Gaussian\nnoise is even isotropic (all diagonal entries are the same) we would obtain\n:class:`PPCA`.\n\nFactorAnalysis performs a maximum likelihood estimate of the so-called\n`loading` matrix, the transformation of the latent variables to the\nobserved ones, using SVD based approach.\n\nRead more in the :ref:`User Guide `.\n\n.. versionadded:: 0.13", "docstring": "Factor Analysis (FA).\n\n A simple linear generative model with Gaussian latent variables.\n\n The observations are assumed to be caused by a linear transformation of\n lower dimensional latent factors and added Gaussian noise.\n Without loss of generality the factors are distributed according to a\n Gaussian with zero mean and unit covariance. The noise is also zero mean\n and has an arbitrary diagonal covariance matrix.\n\n If we would restrict the model further, by assuming that the Gaussian\n noise is even isotropic (all diagonal entries are the same) we would obtain\n :class:`PPCA`.\n\n FactorAnalysis performs a maximum likelihood estimate of the so-called\n `loading` matrix, the transformation of the latent variables to the\n observed ones, using SVD based approach.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.13\n\n Parameters\n ----------\n n_components : int, default=None\n Dimensionality of latent space, the number of components\n of ``X`` that are obtained after ``transform``.\n If None, n_components is set to the number of features.\n\n tol : float, default=1e-2\n Stopping tolerance for log-likelihood increase.\n\n copy : bool, default=True\n Whether to make a copy of X. If ``False``, the input X gets overwritten\n during fitting.\n\n max_iter : int, default=1000\n Maximum number of iterations.\n\n noise_variance_init : ndarray of shape (n_features,), default=None\n The initial guess of the noise variance for each feature.\n If None, it defaults to np.ones(n_features).\n\n svd_method : {'lapack', 'randomized'}, default='randomized'\n Which SVD method to use. If 'lapack' use standard SVD from\n scipy.linalg, if 'randomized' use fast ``randomized_svd`` function.\n Defaults to 'randomized'. For most applications 'randomized' will\n be sufficiently precise while providing significant speed gains.\n Accuracy can also be improved by setting higher values for\n `iterated_power`. If this is not sufficient, for maximum precision\n you should choose 'lapack'.\n\n iterated_power : int, default=3\n Number of iterations for the power method. 3 by default. Only used\n if ``svd_method`` equals 'randomized'.\n\n rotation : {'varimax', 'quartimax'}, default=None\n If not None, apply the indicated rotation. 
Currently, varimax and\n quartimax are implemented. See\n `\"The varimax criterion for analytic rotation in factor analysis\"\n `_\n H. F. Kaiser, 1958.\n\n .. versionadded:: 0.24\n\n random_state : int or RandomState instance, default=0\n Only used when ``svd_method`` equals 'randomized'. Pass an int for\n reproducible results across multiple function calls.\n See :term:`Glossary `.\n\n Attributes\n ----------\n components_ : ndarray of shape (n_components, n_features)\n Components with maximum variance.\n\n loglike_ : list of shape (n_iterations,)\n The log likelihood at each iteration.\n\n noise_variance_ : ndarray of shape (n_features,)\n The estimated noise variance for each feature.\n\n n_iter_ : int\n Number of iterations run.\n\n mean_ : ndarray of shape (n_features,)\n Per-feature empirical mean, estimated from the training set.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n PCA: Principal component analysis is also a latent linear variable model\n which however assumes equal noise variance for each feature.\n This extra assumption makes probabilistic PCA faster as it can be\n computed in closed form.\n FastICA: Independent component analysis, a latent variable model with\n non-Gaussian latent variables.\n\n References\n ----------\n - David Barber, Bayesian Reasoning and Machine Learning,\n Algorithm 21.1.\n\n - Christopher M. Bishop: Pattern Recognition and Machine Learning,\n Chapter 12.2.4.\n\n Examples\n --------\n >>> from sklearn.datasets import load_digits\n >>> from sklearn.decomposition import FactorAnalysis\n >>> X, _ = load_digits(return_X_y=True)\n >>> transformer = FactorAnalysis(n_components=7, random_state=0)\n >>> X_transformed = transformer.fit_transform(X)\n >>> X_transformed.shape\n (1797, 7)\n ", "source_code": "\n\nclass FactorAnalysis(TransformerMixin, BaseEstimator):\n \"\"\"Factor Analysis (FA).\n\n A simple linear generative model with Gaussian latent variables.\n\n The observations are assumed to be caused by a linear transformation of\n lower dimensional latent factors and added Gaussian noise.\n Without loss of generality the factors are distributed according to a\n Gaussian with zero mean and unit covariance. The noise is also zero mean\n and has an arbitrary diagonal covariance matrix.\n\n If we would restrict the model further, by assuming that the Gaussian\n noise is even isotropic (all diagonal entries are the same) we would obtain\n :class:`PPCA`.\n\n FactorAnalysis performs a maximum likelihood estimate of the so-called\n `loading` matrix, the transformation of the latent variables to the\n observed ones, using SVD based approach.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.13\n\n Parameters\n ----------\n n_components : int, default=None\n Dimensionality of latent space, the number of components\n of ``X`` that are obtained after ``transform``.\n If None, n_components is set to the number of features.\n\n tol : float, default=1e-2\n Stopping tolerance for log-likelihood increase.\n\n copy : bool, default=True\n Whether to make a copy of X. 
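The FactorAnalysis entry models each observation as a linear map of latent factors plus diagonal Gaussian noise, so the implied data covariance is components_.T @ components_ + diag(noise_variance_), which is what get_covariance computes (see the source further down). A small sketch checking that identity on the load_digits example already used in this entry:

import numpy as np
from sklearn.datasets import load_digits
from sklearn.decomposition import FactorAnalysis

X, _ = load_digits(return_X_y=True)
fa = FactorAnalysis(n_components=7, random_state=0).fit(X)

# Covariance implied by the loading matrix plus the diagonal noise term
cov = fa.components_.T @ fa.components_ + np.diag(fa.noise_variance_)
print(np.allclose(cov, fa.get_covariance()))  # True by construction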
If ``False``, the input X gets overwritten\n during fitting.\n\n max_iter : int, default=1000\n Maximum number of iterations.\n\n noise_variance_init : ndarray of shape (n_features,), default=None\n The initial guess of the noise variance for each feature.\n If None, it defaults to np.ones(n_features).\n\n svd_method : {'lapack', 'randomized'}, default='randomized'\n Which SVD method to use. If 'lapack' use standard SVD from\n scipy.linalg, if 'randomized' use fast ``randomized_svd`` function.\n Defaults to 'randomized'. For most applications 'randomized' will\n be sufficiently precise while providing significant speed gains.\n Accuracy can also be improved by setting higher values for\n `iterated_power`. If this is not sufficient, for maximum precision\n you should choose 'lapack'.\n\n iterated_power : int, default=3\n Number of iterations for the power method. 3 by default. Only used\n if ``svd_method`` equals 'randomized'.\n\n rotation : {'varimax', 'quartimax'}, default=None\n If not None, apply the indicated rotation. Currently, varimax and\n quartimax are implemented. See\n `\"The varimax criterion for analytic rotation in factor analysis\"\n `_\n H. F. Kaiser, 1958.\n\n .. versionadded:: 0.24\n\n random_state : int or RandomState instance, default=0\n Only used when ``svd_method`` equals 'randomized'. Pass an int for\n reproducible results across multiple function calls.\n See :term:`Glossary `.\n\n Attributes\n ----------\n components_ : ndarray of shape (n_components, n_features)\n Components with maximum variance.\n\n loglike_ : list of shape (n_iterations,)\n The log likelihood at each iteration.\n\n noise_variance_ : ndarray of shape (n_features,)\n The estimated noise variance for each feature.\n\n n_iter_ : int\n Number of iterations run.\n\n mean_ : ndarray of shape (n_features,)\n Per-feature empirical mean, estimated from the training set.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n PCA: Principal component analysis is also a latent linear variable model\n which however assumes equal noise variance for each feature.\n This extra assumption makes probabilistic PCA faster as it can be\n computed in closed form.\n FastICA: Independent component analysis, a latent variable model with\n non-Gaussian latent variables.\n\n References\n ----------\n - David Barber, Bayesian Reasoning and Machine Learning,\n Algorithm 21.1.\n\n - Christopher M. Bishop: Pattern Recognition and Machine Learning,\n Chapter 12.2.4.\n\n Examples\n --------\n >>> from sklearn.datasets import load_digits\n >>> from sklearn.decomposition import FactorAnalysis\n >>> X, _ = load_digits(return_X_y=True)\n >>> transformer = FactorAnalysis(n_components=7, random_state=0)\n >>> X_transformed = transformer.fit_transform(X)\n >>> X_transformed.shape\n (1797, 7)\n \"\"\"\n \n def __init__(self, n_components=None, *, tol=0.01, copy=True, max_iter=1000, noise_variance_init=None, svd_method='randomized', iterated_power=3, rotation=None, random_state=0):\n self.n_components = n_components\n self.copy = copy\n self.tol = tol\n self.max_iter = max_iter\n if svd_method not in ['lapack', 'randomized']:\n raise ValueError('SVD method %s is not supported. 
Please consider the documentation' % svd_method)\n self.svd_method = svd_method\n self.noise_variance_init = noise_variance_init\n self.iterated_power = iterated_power\n self.random_state = random_state\n self.rotation = rotation\n \n def fit(self, X, y=None):\n \"\"\"Fit the FactorAnalysis model to X using SVD based approach.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training data.\n\n y : Ignored\n Ignored parameter.\n\n Returns\n -------\n self : object\n FactorAnalysis class instance.\n \"\"\"\n X = self._validate_data(X, copy=self.copy, dtype=np.float64)\n (n_samples, n_features) = X.shape\n n_components = self.n_components\n if n_components is None:\n n_components = n_features\n self.mean_ = np.mean(X, axis=0)\n X -= self.mean_\n nsqrt = sqrt(n_samples)\n llconst = n_features * log(2.0 * np.pi) + n_components\n var = np.var(X, axis=0)\n if self.noise_variance_init is None:\n psi = np.ones(n_features, dtype=X.dtype)\n else:\n if len(self.noise_variance_init) != n_features:\n raise ValueError('noise_variance_init dimension does not with number of features : %d != %d' % (len(self.noise_variance_init), n_features))\n psi = np.array(self.noise_variance_init)\n loglike = []\n old_ll = -np.inf\n SMALL = 1e-12\n if self.svd_method == 'lapack':\n \n def my_svd(X):\n (_, s, Vt) = linalg.svd(X, full_matrices=False, check_finite=False)\n return s[:n_components], Vt[:n_components], squared_norm(s[n_components:])\n elif self.svd_method == 'randomized':\n random_state = check_random_state(self.random_state)\n \n def my_svd(X):\n (_, s, Vt) = randomized_svd(X, n_components, random_state=random_state, n_iter=self.iterated_power)\n return s, Vt, squared_norm(X) - squared_norm(s)\n else:\n raise ValueError('SVD method %s is not supported. Please consider the documentation' % self.svd_method)\n for i in range(self.max_iter):\n sqrt_psi = np.sqrt(psi) + SMALL\n (s, Vt, unexp_var) = my_svd(X / (sqrt_psi * nsqrt))\n s **= 2\n W = np.sqrt(np.maximum(s - 1.0, 0.0))[:, np.newaxis] * Vt\n del Vt\n W *= sqrt_psi\n ll = llconst + np.sum(np.log(s))\n ll += unexp_var + np.sum(np.log(psi))\n ll *= -n_samples / 2.0\n loglike.append(ll)\n if ll - old_ll < self.tol:\n break\n old_ll = ll\n psi = np.maximum(var - np.sum(W**2, axis=0), SMALL)\n else:\n warnings.warn('FactorAnalysis did not converge.' 
+ ' You might want' + ' to increase the number of iterations.', ConvergenceWarning)\n self.components_ = W\n if self.rotation is not None:\n self.components_ = self._rotate(W)\n self.noise_variance_ = psi\n self.loglike_ = loglike\n self.n_iter_ = i + 1\n return self\n \n def transform(self, X):\n \"\"\"Apply dimensionality reduction to X using the model.\n\n Compute the expected mean of the latent variables.\n See Barber, 21.2.33 (or Bishop, 12.66).\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training data.\n\n Returns\n -------\n X_new : ndarray of shape (n_samples, n_components)\n The latent variables of X.\n \"\"\"\n check_is_fitted(self)\n X = self._validate_data(X, reset=False)\n Ih = np.eye(len(self.components_))\n X_transformed = X - self.mean_\n Wpsi = self.components_ / self.noise_variance_\n cov_z = linalg.inv(Ih + np.dot(Wpsi, self.components_.T))\n tmp = np.dot(X_transformed, Wpsi.T)\n X_transformed = np.dot(tmp, cov_z)\n return X_transformed\n \n def get_covariance(self):\n \"\"\"Compute data covariance with the FactorAnalysis model.\n\n ``cov = components_.T * components_ + diag(noise_variance)``\n\n Returns\n -------\n cov : ndarray of shape (n_features, n_features)\n Estimated covariance of data.\n \"\"\"\n check_is_fitted(self)\n cov = np.dot(self.components_.T, self.components_)\n cov.flat[::len(cov) + 1] += self.noise_variance_\n return cov\n \n def get_precision(self):\n \"\"\"Compute data precision matrix with the FactorAnalysis model.\n\n Returns\n -------\n precision : ndarray of shape (n_features, n_features)\n Estimated precision of data.\n \"\"\"\n check_is_fitted(self)\n n_features = self.components_.shape[1]\n if self.n_components == 0:\n return np.diag(1.0 / self.noise_variance_)\n if self.n_components == n_features:\n return linalg.inv(self.get_covariance())\n components_ = self.components_\n precision = np.dot(components_ / self.noise_variance_, components_.T)\n precision.flat[::len(precision) + 1] += 1.0\n precision = np.dot(components_.T, np.dot(linalg.inv(precision), components_))\n precision /= self.noise_variance_[:, np.newaxis]\n precision /= -self.noise_variance_[np.newaxis, :]\n precision.flat[::len(precision) + 1] += 1.0 / self.noise_variance_\n return precision\n \n def score_samples(self, X):\n \"\"\"Compute the log-likelihood of each sample.\n\n Parameters\n ----------\n X : ndarray of shape (n_samples, n_features)\n The data.\n\n Returns\n -------\n ll : ndarray of shape (n_samples,)\n Log-likelihood of each sample under the current model.\n \"\"\"\n check_is_fitted(self)\n X = self._validate_data(X, reset=False)\n Xr = X - self.mean_\n precision = self.get_precision()\n n_features = X.shape[1]\n log_like = -0.5 * (Xr * np.dot(Xr, precision)).sum(axis=1)\n log_like -= 0.5 * (n_features * log(2.0 * np.pi) - fast_logdet(precision))\n return log_like\n \n def score(self, X, y=None):\n \"\"\"Compute the average log-likelihood of the samples.\n\n Parameters\n ----------\n X : ndarray of shape (n_samples, n_features)\n The data.\n\n y : Ignored\n Ignored parameter.\n\n Returns\n -------\n ll : float\n Average log-likelihood of the samples under the current model.\n \"\"\"\n return np.mean(self.score_samples(X))\n \n def _rotate(self, components, n_components=None, tol=1e-06):\n \"\"\"Rotate the factor analysis solution.\"\"\"\n implemented = ('varimax', 'quartimax')\n method = self.rotation\n if method in implemented:\n return _ortho_rotation(components.T, method=method, tol=tol)[:self.n_components]\n else:\n raise 
ValueError(\"'method' must be in %s, not %s\" % (implemented, method))\n" }, @@ -20534,7 +20600,7 @@ "sklearn.decomposition._fastica.FastICA.inverse_transform" ], "is_public": true, - "description": "FastICA: a fast algorithm for Independent Component Analysis.\n\nThe implementation is based on [1]_. Read more in the :ref:`User Guide `.", + "description": "FastICA: a fast algorithm for Independent Component Analysis.\n\nThe implementation is based on [1]_.\n\nRead more in the :ref:`User Guide `.", "docstring": "FastICA: a fast algorithm for Independent Component Analysis.\n\n The implementation is based on [1]_.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n n_components : int, default=None\n Number of components to use. If None is passed, all are used.\n\n algorithm : {'parallel', 'deflation'}, default='parallel'\n Apply parallel or deflational algorithm for FastICA.\n\n whiten : bool, default=True\n If whiten is false, the data is already considered to be\n whitened, and no whitening is performed.\n\n fun : {'logcosh', 'exp', 'cube'} or callable, default='logcosh'\n The functional form of the G function used in the\n approximation to neg-entropy. Could be either 'logcosh', 'exp',\n or 'cube'.\n You can also provide your own function. It should return a tuple\n containing the value of the function, and of its derivative, in the\n point. Example::\n\n def my_g(x):\n return x ** 3, (3 * x ** 2).mean(axis=-1)\n\n fun_args : dict, default=None\n Arguments to send to the functional form.\n If empty and if fun='logcosh', fun_args will take value\n {'alpha' : 1.0}.\n\n max_iter : int, default=200\n Maximum number of iterations during fit.\n\n tol : float, default=1e-4\n Tolerance on update at each iteration.\n\n w_init : ndarray of shape (n_components, n_components), default=None\n The mixing matrix to be used to initialize the algorithm.\n\n random_state : int, RandomState instance or None, default=None\n Used to initialize ``w_init`` when not specified, with a\n normal distribution. Pass an int, for reproducible results\n across multiple function calls.\n See :term:`Glossary `.\n\n Attributes\n ----------\n components_ : ndarray of shape (n_components, n_features)\n The linear operator to apply to the data to get the independent\n sources. This is equal to the unmixing matrix when ``whiten`` is\n False, and equal to ``np.dot(unmixing_matrix, self.whitening_)`` when\n ``whiten`` is True.\n\n mixing_ : ndarray of shape (n_features, n_components)\n The pseudo-inverse of ``components_``. It is the linear operator\n that maps independent sources to the data.\n\n mean_ : ndarray of shape(n_features,)\n The mean over features. Only set if `self.whiten` is True.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n n_iter_ : int\n If the algorithm is \"deflation\", n_iter is the\n maximum number of iterations run across all components. Else\n they are just the number of iterations taken to converge.\n\n whitening_ : ndarray of shape (n_components, n_features)\n Only set if whiten is 'True'. 
This is the pre-whitening matrix\n that projects data onto the first `n_components` principal components.\n\n See Also\n --------\n PCA : Principal component analysis (PCA).\n IncrementalPCA : Incremental principal components analysis (IPCA).\n KernelPCA : Kernel Principal component analysis (KPCA).\n MiniBatchSparsePCA : Mini-batch Sparse Principal Components Analysis.\n SparsePCA : Sparse Principal Components Analysis (SparsePCA).\n\n References\n ----------\n .. [1] A. Hyvarinen and E. Oja, Independent Component Analysis:\n Algorithms and Applications, Neural Networks, 13(4-5), 2000,\n pp. 411-430.\n\n Examples\n --------\n >>> from sklearn.datasets import load_digits\n >>> from sklearn.decomposition import FastICA\n >>> X, _ = load_digits(return_X_y=True)\n >>> transformer = FastICA(n_components=7,\n ... random_state=0)\n >>> X_transformed = transformer.fit_transform(X)\n >>> X_transformed.shape\n (1797, 7)\n ", "source_code": "\n\nclass FastICA(TransformerMixin, BaseEstimator):\n \"\"\"FastICA: a fast algorithm for Independent Component Analysis.\n\n The implementation is based on [1]_.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n n_components : int, default=None\n Number of components to use. If None is passed, all are used.\n\n algorithm : {'parallel', 'deflation'}, default='parallel'\n Apply parallel or deflational algorithm for FastICA.\n\n whiten : bool, default=True\n If whiten is false, the data is already considered to be\n whitened, and no whitening is performed.\n\n fun : {'logcosh', 'exp', 'cube'} or callable, default='logcosh'\n The functional form of the G function used in the\n approximation to neg-entropy. Could be either 'logcosh', 'exp',\n or 'cube'.\n You can also provide your own function. It should return a tuple\n containing the value of the function, and of its derivative, in the\n point. Example::\n\n def my_g(x):\n return x ** 3, (3 * x ** 2).mean(axis=-1)\n\n fun_args : dict, default=None\n Arguments to send to the functional form.\n If empty and if fun='logcosh', fun_args will take value\n {'alpha' : 1.0}.\n\n max_iter : int, default=200\n Maximum number of iterations during fit.\n\n tol : float, default=1e-4\n Tolerance on update at each iteration.\n\n w_init : ndarray of shape (n_components, n_components), default=None\n The mixing matrix to be used to initialize the algorithm.\n\n random_state : int, RandomState instance or None, default=None\n Used to initialize ``w_init`` when not specified, with a\n normal distribution. Pass an int, for reproducible results\n across multiple function calls.\n See :term:`Glossary `.\n\n Attributes\n ----------\n components_ : ndarray of shape (n_components, n_features)\n The linear operator to apply to the data to get the independent\n sources. This is equal to the unmixing matrix when ``whiten`` is\n False, and equal to ``np.dot(unmixing_matrix, self.whitening_)`` when\n ``whiten`` is True.\n\n mixing_ : ndarray of shape (n_features, n_components)\n The pseudo-inverse of ``components_``. It is the linear operator\n that maps independent sources to the data.\n\n mean_ : ndarray of shape(n_features,)\n The mean over features. Only set if `self.whiten` is True.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. 
versionadded:: 1.0\n\n n_iter_ : int\n If the algorithm is \"deflation\", n_iter is the\n maximum number of iterations run across all components. Else\n they are just the number of iterations taken to converge.\n\n whitening_ : ndarray of shape (n_components, n_features)\n Only set if whiten is 'True'. This is the pre-whitening matrix\n that projects data onto the first `n_components` principal components.\n\n See Also\n --------\n PCA : Principal component analysis (PCA).\n IncrementalPCA : Incremental principal components analysis (IPCA).\n KernelPCA : Kernel Principal component analysis (KPCA).\n MiniBatchSparsePCA : Mini-batch Sparse Principal Components Analysis.\n SparsePCA : Sparse Principal Components Analysis (SparsePCA).\n\n References\n ----------\n .. [1] A. Hyvarinen and E. Oja, Independent Component Analysis:\n Algorithms and Applications, Neural Networks, 13(4-5), 2000,\n pp. 411-430.\n\n Examples\n --------\n >>> from sklearn.datasets import load_digits\n >>> from sklearn.decomposition import FastICA\n >>> X, _ = load_digits(return_X_y=True)\n >>> transformer = FastICA(n_components=7,\n ... random_state=0)\n >>> X_transformed = transformer.fit_transform(X)\n >>> X_transformed.shape\n (1797, 7)\n \"\"\"\n \n def __init__(self, n_components=None, *, algorithm='parallel', whiten=True, fun='logcosh', fun_args=None, max_iter=200, tol=0.0001, w_init=None, random_state=None):\n super().__init__()\n if max_iter < 1:\n raise ValueError('max_iter should be greater than 1, got (max_iter={})'.format(max_iter))\n self.n_components = n_components\n self.algorithm = algorithm\n self.whiten = whiten\n self.fun = fun\n self.fun_args = fun_args\n self.max_iter = max_iter\n self.tol = tol\n self.w_init = w_init\n self.random_state = random_state\n \n def _fit(self, X, compute_sources=False):\n \"\"\"Fit the model\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training data, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n compute_sources : bool, default=False\n If False, sources are not computes but only the rotation matrix.\n This can save memory when working with big data. Defaults to False.\n\n Returns\n -------\n S : ndarray of shape (n_samples, n_components) or None\n Sources matrix. 
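In the FastICA entry, components_ is the unmixing operator and mixing_ its pseudo-inverse, so transform followed by inverse_transform reproduces the input when the mixture is square and invertible. A hedged sketch on toy mixed signals (the sources and mixing matrix below are illustrative, not from the original entry):

import numpy as np
from sklearn.decomposition import FastICA

t = np.linspace(0, 8, 2000)
S = np.c_[np.sin(2 * t), np.sign(np.sin(3 * t))]  # two independent sources
A = np.array([[1.0, 0.5], [0.5, 2.0]])            # mixing matrix
X = S @ A.T                                       # observed mixtures

ica = FastICA(n_components=2, random_state=0)
S_est = ica.fit_transform(X)           # unmix: estimated sources
X_back = ica.inverse_transform(S_est)  # remix through mixing_

# Square, invertible case: the round trip recovers the observations
print(np.allclose(X, X_back, atol=1e-6))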
`None` if `compute_sources` is `False`.\n \"\"\"\n XT = self._validate_data(X, copy=self.whiten, dtype=FLOAT_DTYPES, ensure_min_samples=2).T\n fun_args = {} if self.fun_args is None else self.fun_args\n random_state = check_random_state(self.random_state)\n alpha = fun_args.get('alpha', 1.0)\n if not 1 <= alpha <= 2:\n raise ValueError('alpha must be in [1,2]')\n if self.fun == 'logcosh':\n g = _logcosh\n elif self.fun == 'exp':\n g = _exp\n elif self.fun == 'cube':\n g = _cube\n elif callable(self.fun):\n \n def g(x, fun_args):\n return self.fun(x, **fun_args)\n else:\n exc = ValueError if isinstance(self.fun, str) else TypeError\n raise exc(\"Unknown function %r; should be one of 'logcosh', 'exp', 'cube' or callable\" % self.fun)\n (n_features, n_samples) = XT.shape\n n_components = self.n_components\n if not self.whiten and n_components is not None:\n n_components = None\n warnings.warn('Ignoring n_components with whiten=False.')\n if n_components is None:\n n_components = min(n_samples, n_features)\n if n_components > min(n_samples, n_features):\n n_components = min(n_samples, n_features)\n warnings.warn('n_components is too large: it will be set to %s' % n_components)\n if self.whiten:\n X_mean = XT.mean(axis=-1)\n XT -= X_mean[:, np.newaxis]\n (u, d, _) = linalg.svd(XT, full_matrices=False, check_finite=False)\n del _\n K = (u / d).T[:n_components]\n del u, d\n X1 = np.dot(K, XT)\n X1 *= np.sqrt(n_samples)\n else:\n X1 = as_float_array(XT, copy=False)\n w_init = self.w_init\n if w_init is None:\n w_init = np.asarray(random_state.normal(size=(n_components, n_components)), dtype=X1.dtype)\n else:\n w_init = np.asarray(w_init)\n if w_init.shape != (n_components, n_components):\n raise ValueError('w_init has invalid shape -- should be %(shape)s' % {'shape': (n_components, n_components)})\n kwargs = {'tol': self.tol, 'g': g, 'fun_args': fun_args, 'max_iter': self.max_iter, 'w_init': w_init}\n if self.algorithm == 'parallel':\n (W, n_iter) = _ica_par(X1, **kwargs)\n elif self.algorithm == 'deflation':\n (W, n_iter) = _ica_def(X1, **kwargs)\n else:\n raise ValueError('Invalid algorithm: must be either `parallel` or `deflation`.')\n del X1\n if compute_sources:\n if self.whiten:\n S = np.linalg.multi_dot([W, K, XT]).T\n else:\n S = np.dot(W, XT).T\n else:\n S = None\n self.n_iter_ = n_iter\n if self.whiten:\n self.components_ = np.dot(W, K)\n self.mean_ = X_mean\n self.whitening_ = K\n else:\n self.components_ = W\n self.mixing_ = linalg.pinv(self.components_, check_finite=False)\n self._unmixing = W\n return S\n \n def fit_transform(self, X, y=None):\n \"\"\"Fit the model and recover the sources from X.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training data, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n Returns\n -------\n X_new : ndarray of shape (n_samples, n_components)\n Estimated sources obtained by transforming the data with the\n estimated unmixing matrix.\n \"\"\"\n return self._fit(X, compute_sources=True)\n \n def fit(self, X, y=None):\n \"\"\"Fit the model to X.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training data, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n Returns\n -------\n self : object\n Returns the instance itself.\n \"\"\"\n self._fit(X, 
compute_sources=False)\n return self\n \n def transform(self, X, copy=True):\n \"\"\"Recover the sources from X (apply the unmixing matrix).\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Data to transform, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n copy : bool, default=True\n If False, data passed to fit can be overwritten. Defaults to True.\n\n Returns\n -------\n X_new : ndarray of shape (n_samples, n_components)\n Estimated sources obtained by transforming the data with the\n estimated unmixing matrix.\n \"\"\"\n check_is_fitted(self)\n X = self._validate_data(X, copy=copy and self.whiten, dtype=FLOAT_DTYPES, reset=False)\n if self.whiten:\n X -= self.mean_\n return np.dot(X, self.components_.T)\n \n def inverse_transform(self, X, copy=True):\n \"\"\"Transform the sources back to the mixed data (apply mixing matrix).\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_components)\n Sources, where `n_samples` is the number of samples\n and `n_components` is the number of components.\n copy : bool, default=True\n If False, data passed to fit are overwritten. Defaults to True.\n\n Returns\n -------\n X_new : ndarray of shape (n_samples, n_features)\n Reconstructed data obtained with the mixing matrix.\n \"\"\"\n check_is_fitted(self)\n X = check_array(X, copy=copy and self.whiten, dtype=FLOAT_DTYPES)\n X = np.dot(X, self.mixing_.T)\n if self.whiten:\n X += self.mean_\n return X\n" }, @@ -20550,7 +20616,7 @@ "sklearn.decomposition._incremental_pca.IncrementalPCA.transform" ], "is_public": true, - "description": "Incremental principal components analysis (IPCA).\n\nLinear dimensionality reduction using Singular Value Decomposition of the data, keeping only the most significant singular vectors to project the data to a lower dimensional space. The input data is centered but not scaled for each feature before applying the SVD. Depending on the size of the input data, this algorithm can be much more memory efficient than a PCA, and allows sparse input. This algorithm has constant memory complexity, on the order of ``batch_size * n_features``, enabling use of np.memmap files without loading the entire file into memory. For sparse matrices, the input is converted to dense in batches (in order to be able to subtract the mean) which avoids storing the entire dense matrix at any one time. The computational overhead of each SVD is ``O(batch_size * n_features ** 2)``, but only 2 * batch_size samples remain in memory at a time. There will be ``n_samples / batch_size`` SVD computations to get the principal components, versus 1 large SVD of complexity ``O(n_samples * n_features ** 2)`` for PCA. Read more in the :ref:`User Guide `. .. versionadded:: 0.16", + "description": "Incremental principal components analysis (IPCA).\n\nLinear dimensionality reduction using Singular Value Decomposition of\nthe data, keeping only the most significant singular vectors to\nproject the data to a lower dimensional space. The input data is centered\nbut not scaled for each feature before applying the SVD.\n\nDepending on the size of the input data, this algorithm can be much more\nmemory efficient than a PCA, and allows sparse input.\n\nThis algorithm has constant memory complexity, on the order\nof ``batch_size * n_features``, enabling use of np.memmap files without\nloading the entire file into memory. 
For sparse matrices, the input\nis converted to dense in batches (in order to be able to subtract the\nmean) which avoids storing the entire dense matrix at any one time.\n\nThe computational overhead of each SVD is\n``O(batch_size * n_features ** 2)``, but only 2 * batch_size samples\nremain in memory at a time. There will be ``n_samples / batch_size`` SVD\ncomputations to get the principal components, versus 1 large SVD of\ncomplexity ``O(n_samples * n_features ** 2)`` for PCA.\n\nRead more in the :ref:`User Guide `.\n\n.. versionadded:: 0.16", "docstring": "Incremental principal components analysis (IPCA).\n\n Linear dimensionality reduction using Singular Value Decomposition of\n the data, keeping only the most significant singular vectors to\n project the data to a lower dimensional space. The input data is centered\n but not scaled for each feature before applying the SVD.\n\n Depending on the size of the input data, this algorithm can be much more\n memory efficient than a PCA, and allows sparse input.\n\n This algorithm has constant memory complexity, on the order\n of ``batch_size * n_features``, enabling use of np.memmap files without\n loading the entire file into memory. For sparse matrices, the input\n is converted to dense in batches (in order to be able to subtract the\n mean) which avoids storing the entire dense matrix at any one time.\n\n The computational overhead of each SVD is\n ``O(batch_size * n_features ** 2)``, but only 2 * batch_size samples\n remain in memory at a time. There will be ``n_samples / batch_size`` SVD\n computations to get the principal components, versus 1 large SVD of\n complexity ``O(n_samples * n_features ** 2)`` for PCA.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.16\n\n Parameters\n ----------\n n_components : int, default=None\n Number of components to keep. If ``n_components`` is ``None``,\n then ``n_components`` is set to ``min(n_samples, n_features)``.\n\n whiten : bool, default=False\n When True (False by default) the ``components_`` vectors are divided\n by ``n_samples`` times ``components_`` to ensure uncorrelated outputs\n with unit component-wise variances.\n\n Whitening will remove some information from the transformed signal\n (the relative variance scales of the components) but can sometimes\n improve the predictive accuracy of the downstream estimators by\n making data respect some hard-wired assumptions.\n\n copy : bool, default=True\n If False, X will be overwritten. ``copy=False`` can be used to\n save memory but is unsafe for general use.\n\n batch_size : int, default=None\n The number of samples to use for each batch. Only used when calling\n ``fit``. If ``batch_size`` is ``None``, then ``batch_size``\n is inferred from the data and set to ``5 * n_features``, to provide a\n balance between approximation accuracy and memory consumption.\n\n Attributes\n ----------\n components_ : ndarray of shape (n_components, n_features)\n Principal axes in feature space, representing the directions of\n maximum variance in the data. 
Equivalently, the right singular\n vectors of the centered input data, parallel to its eigenvectors.\n The components are sorted by ``explained_variance_``.\n\n explained_variance_ : ndarray of shape (n_components,)\n Variance explained by each of the selected components.\n\n explained_variance_ratio_ : ndarray of shape (n_components,)\n Percentage of variance explained by each of the selected components.\n If all components are stored, the sum of explained variances is equal\n to 1.0.\n\n singular_values_ : ndarray of shape (n_components,)\n The singular values corresponding to each of the selected components.\n The singular values are equal to the 2-norms of the ``n_components``\n variables in the lower-dimensional space.\n\n mean_ : ndarray of shape (n_features,)\n Per-feature empirical mean, aggregate over calls to ``partial_fit``.\n\n var_ : ndarray of shape (n_features,)\n Per-feature empirical variance, aggregate over calls to\n ``partial_fit``.\n\n noise_variance_ : float\n The estimated noise covariance following the Probabilistic PCA model\n from Tipping and Bishop 1999. See \"Pattern Recognition and\n Machine Learning\" by C. Bishop, 12.2.1 p. 574 or\n http://www.miketipping.com/papers/met-mppca.pdf.\n\n n_components_ : int\n The estimated number of components. Relevant when\n ``n_components=None``.\n\n n_samples_seen_ : int\n The number of samples processed by the estimator. Will be reset on\n new calls to fit, but increments across ``partial_fit`` calls.\n\n batch_size_ : int\n Inferred batch size from ``batch_size``.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n PCA : Principal component analysis (PCA).\n KernelPCA : Kernel Principal component analysis (KPCA).\n SparsePCA : Sparse Principal Components Analysis (SparsePCA).\n TruncatedSVD : Dimensionality reduction using truncated SVD.\n\n Notes\n -----\n Implements the incremental PCA model from:\n *D. Ross, J. Lim, R. Lin, M. Yang, Incremental Learning for Robust Visual\n Tracking, International Journal of Computer Vision, Volume 77, Issue 1-3,\n pp. 125-141, May 2008.*\n See https://www.cs.toronto.edu/~dross/ivt/RossLimLinYang_ijcv.pdf\n\n This model is an extension of the Sequential Karhunen-Loeve Transform from:\n *A. Levy and M. Lindenbaum, Sequential Karhunen-Loeve Basis Extraction and\n its Application to Images, IEEE Transactions on Image Processing, Volume 9,\n Number 8, pp. 1371-1374, August 2000.*\n See https://www.cs.technion.ac.il/~mic/doc/skl-ip.pdf\n\n We have specifically abstained from an optimization used by authors of both\n papers, a QR decomposition used in specific situations to reduce the\n algorithmic complexity of the SVD. The source for this technique is\n *Matrix Computations, Third Edition, G. Holub and C. Van Loan, Chapter 5,\n section 5.4.4, pp 252-253.*. This technique has been omitted because it is\n advantageous only when decomposing a matrix with ``n_samples`` (rows)\n >= 5/3 * ``n_features`` (columns), and hurts the readability of the\n implemented algorithm. This would be a good opportunity for future\n optimization, if it is deemed necessary.\n\n References\n ----------\n D. Ross, J. Lim, R. Lin, M. Yang. 
Incremental Learning for Robust Visual\n Tracking, International Journal of Computer Vision, Volume 77,\n Issue 1-3, pp. 125-141, May 2008.\n\n G. Golub and C. Van Loan. Matrix Computations, Third Edition, Chapter 5,\n Section 5.4.4, pp. 252-253.\n\n Examples\n --------\n >>> from sklearn.datasets import load_digits\n >>> from sklearn.decomposition import IncrementalPCA\n >>> from scipy import sparse\n >>> X, _ = load_digits(return_X_y=True)\n >>> transformer = IncrementalPCA(n_components=7, batch_size=200)\n >>> # either partially fit on smaller batches of data\n >>> transformer.partial_fit(X[:100, :])\n IncrementalPCA(batch_size=200, n_components=7)\n >>> # or let the fit function itself divide the data into batches\n >>> X_sparse = sparse.csr_matrix(X)\n >>> X_transformed = transformer.fit_transform(X_sparse)\n >>> X_transformed.shape\n (1797, 7)\n ", "source_code": "\n\nclass IncrementalPCA(_BasePCA):\n \"\"\"Incremental principal components analysis (IPCA).\n\n Linear dimensionality reduction using Singular Value Decomposition of\n the data, keeping only the most significant singular vectors to\n project the data to a lower dimensional space. The input data is centered\n but not scaled for each feature before applying the SVD.\n\n Depending on the size of the input data, this algorithm can be much more\n memory efficient than a PCA, and allows sparse input.\n\n This algorithm has constant memory complexity, on the order\n of ``batch_size * n_features``, enabling use of np.memmap files without\n loading the entire file into memory. For sparse matrices, the input\n is converted to dense in batches (in order to be able to subtract the\n mean) which avoids storing the entire dense matrix at any one time.\n\n The computational overhead of each SVD is\n ``O(batch_size * n_features ** 2)``, but only 2 * batch_size samples\n remain in memory at a time. There will be ``n_samples / batch_size`` SVD\n computations to get the principal components, versus 1 large SVD of\n complexity ``O(n_samples * n_features ** 2)`` for PCA.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.16\n\n Parameters\n ----------\n n_components : int, default=None\n Number of components to keep. If ``n_components`` is ``None``,\n then ``n_components`` is set to ``min(n_samples, n_features)``.\n\n whiten : bool, default=False\n When True (False by default) the ``components_`` vectors are divided\n by ``n_samples`` times ``components_`` to ensure uncorrelated outputs\n with unit component-wise variances.\n\n Whitening will remove some information from the transformed signal\n (the relative variance scales of the components) but can sometimes\n improve the predictive accuracy of the downstream estimators by\n making data respect some hard-wired assumptions.\n\n copy : bool, default=True\n If False, X will be overwritten. ``copy=False`` can be used to\n save memory but is unsafe for general use.\n\n batch_size : int, default=None\n The number of samples to use for each batch. Only used when calling\n ``fit``. If ``batch_size`` is ``None``, then ``batch_size``\n is inferred from the data and set to ``5 * n_features``, to provide a\n balance between approximation accuracy and memory consumption.\n\n Attributes\n ----------\n components_ : ndarray of shape (n_components, n_features)\n Principal axes in feature space, representing the directions of\n maximum variance in the data. 
Equivalently, the right singular\n vectors of the centered input data, parallel to its eigenvectors.\n The components are sorted by ``explained_variance_``.\n\n explained_variance_ : ndarray of shape (n_components,)\n Variance explained by each of the selected components.\n\n explained_variance_ratio_ : ndarray of shape (n_components,)\n Percentage of variance explained by each of the selected components.\n If all components are stored, the sum of explained variances is equal\n to 1.0.\n\n singular_values_ : ndarray of shape (n_components,)\n The singular values corresponding to each of the selected components.\n The singular values are equal to the 2-norms of the ``n_components``\n variables in the lower-dimensional space.\n\n mean_ : ndarray of shape (n_features,)\n Per-feature empirical mean, aggregate over calls to ``partial_fit``.\n\n var_ : ndarray of shape (n_features,)\n Per-feature empirical variance, aggregate over calls to\n ``partial_fit``.\n\n noise_variance_ : float\n The estimated noise covariance following the Probabilistic PCA model\n from Tipping and Bishop 1999. See \"Pattern Recognition and\n Machine Learning\" by C. Bishop, 12.2.1 p. 574 or\n http://www.miketipping.com/papers/met-mppca.pdf.\n\n n_components_ : int\n The estimated number of components. Relevant when\n ``n_components=None``.\n\n n_samples_seen_ : int\n The number of samples processed by the estimator. Will be reset on\n new calls to fit, but increments across ``partial_fit`` calls.\n\n batch_size_ : int\n Inferred batch size from ``batch_size``.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n PCA : Principal component analysis (PCA).\n KernelPCA : Kernel Principal component analysis (KPCA).\n SparsePCA : Sparse Principal Components Analysis (SparsePCA).\n TruncatedSVD : Dimensionality reduction using truncated SVD.\n\n Notes\n -----\n Implements the incremental PCA model from:\n *D. Ross, J. Lim, R. Lin, M. Yang, Incremental Learning for Robust Visual\n Tracking, International Journal of Computer Vision, Volume 77, Issue 1-3,\n pp. 125-141, May 2008.*\n See https://www.cs.toronto.edu/~dross/ivt/RossLimLinYang_ijcv.pdf\n\n This model is an extension of the Sequential Karhunen-Loeve Transform from:\n *A. Levy and M. Lindenbaum, Sequential Karhunen-Loeve Basis Extraction and\n its Application to Images, IEEE Transactions on Image Processing, Volume 9,\n Number 8, pp. 1371-1374, August 2000.*\n See https://www.cs.technion.ac.il/~mic/doc/skl-ip.pdf\n\n We have specifically abstained from an optimization used by authors of both\n papers, a QR decomposition used in specific situations to reduce the\n algorithmic complexity of the SVD. The source for this technique is\n *Matrix Computations, Third Edition, G. Holub and C. Van Loan, Chapter 5,\n section 5.4.4, pp 252-253.*. This technique has been omitted because it is\n advantageous only when decomposing a matrix with ``n_samples`` (rows)\n >= 5/3 * ``n_features`` (columns), and hurts the readability of the\n implemented algorithm. This would be a good opportunity for future\n optimization, if it is deemed necessary.\n\n References\n ----------\n D. Ross, J. Lim, R. Lin, M. Yang. 
Incremental Learning for Robust Visual\n Tracking, International Journal of Computer Vision, Volume 77,\n Issue 1-3, pp. 125-141, May 2008.\n\n G. Golub and C. Van Loan. Matrix Computations, Third Edition, Chapter 5,\n Section 5.4.4, pp. 252-253.\n\n Examples\n --------\n >>> from sklearn.datasets import load_digits\n >>> from sklearn.decomposition import IncrementalPCA\n >>> from scipy import sparse\n >>> X, _ = load_digits(return_X_y=True)\n >>> transformer = IncrementalPCA(n_components=7, batch_size=200)\n >>> # either partially fit on smaller batches of data\n >>> transformer.partial_fit(X[:100, :])\n IncrementalPCA(batch_size=200, n_components=7)\n >>> # or let the fit function itself divide the data into batches\n >>> X_sparse = sparse.csr_matrix(X)\n >>> X_transformed = transformer.fit_transform(X_sparse)\n >>> X_transformed.shape\n (1797, 7)\n \"\"\"\n \n def __init__(self, n_components=None, *, whiten=False, copy=True, batch_size=None):\n self.n_components = n_components\n self.whiten = whiten\n self.copy = copy\n self.batch_size = batch_size\n \n def fit(self, X, y=None):\n \"\"\"Fit the model with X, using minibatches of size batch_size.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training data, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n Returns\n -------\n self : object\n Returns the instance itself.\n \"\"\"\n self.components_ = None\n self.n_samples_seen_ = 0\n self.mean_ = 0.0\n self.var_ = 0.0\n self.singular_values_ = None\n self.explained_variance_ = None\n self.explained_variance_ratio_ = None\n self.noise_variance_ = None\n X = self._validate_data(X, accept_sparse=['csr', 'csc', 'lil'], copy=self.copy, dtype=[np.float64, np.float32])\n (n_samples, n_features) = X.shape\n if self.batch_size is None:\n self.batch_size_ = 5 * n_features\n else:\n self.batch_size_ = self.batch_size\n for batch in gen_batches(n_samples, self.batch_size_, min_batch_size=self.n_components or 0):\n X_batch = X[batch]\n if sparse.issparse(X_batch):\n X_batch = X_batch.toarray()\n self.partial_fit(X_batch, check_input=False)\n return self\n \n def partial_fit(self, X, y=None, check_input=True):\n \"\"\"Incremental fit with X. All of X is processed as a single batch.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training data, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n check_input : bool, default=True\n Run check_array on X.\n\n Returns\n -------\n self : object\n Returns the instance itself.\n \"\"\"\n first_pass = not hasattr(self, 'components_')\n if check_input:\n if sparse.issparse(X):\n raise TypeError('IncrementalPCA.partial_fit does not support sparse input. 
Either convert data to dense or use IncrementalPCA.fit to do so in batches.')\n X = self._validate_data(X, copy=self.copy, dtype=[np.float64, np.float32], reset=first_pass)\n (n_samples, n_features) = X.shape\n if first_pass:\n self.components_ = None\n if self.n_components is None:\n if self.components_ is None:\n self.n_components_ = min(n_samples, n_features)\n else:\n self.n_components_ = self.components_.shape[0]\n elif not 1 <= self.n_components <= n_features:\n raise ValueError('n_components=%r invalid for n_features=%d, need more rows than columns for IncrementalPCA processing' % (self.n_components, n_features))\n elif not self.n_components <= n_samples:\n raise ValueError('n_components=%r must be less or equal to the batch number of samples %d.' % (self.n_components, n_samples))\n else:\n self.n_components_ = self.n_components\n if self.components_ is not None and self.components_.shape[0] != self.n_components_:\n raise ValueError('Number of input features has changed from %i to %i between calls to partial_fit! Try setting n_components to a fixed value.' % (self.components_.shape[0], self.n_components_))\n if not hasattr(self, 'n_samples_seen_'):\n self.n_samples_seen_ = 0\n self.mean_ = 0.0\n self.var_ = 0.0\n (col_mean, col_var, n_total_samples) = _incremental_mean_and_var(X, last_mean=self.mean_, last_variance=self.var_, last_sample_count=np.repeat(self.n_samples_seen_, X.shape[1]))\n n_total_samples = n_total_samples[0]\n if self.n_samples_seen_ == 0:\n X -= col_mean\n else:\n col_batch_mean = np.mean(X, axis=0)\n X -= col_batch_mean\n mean_correction = np.sqrt(self.n_samples_seen_ / n_total_samples * n_samples) * (self.mean_ - col_batch_mean)\n X = np.vstack((self.singular_values_.reshape((-1, 1)) * self.components_, X, mean_correction))\n (U, S, Vt) = linalg.svd(X, full_matrices=False, check_finite=False)\n (U, Vt) = svd_flip(U, Vt, u_based_decision=False)\n explained_variance = S**2 / (n_total_samples - 1)\n explained_variance_ratio = S**2 / np.sum(col_var * n_total_samples)\n self.n_samples_seen_ = n_total_samples\n self.components_ = Vt[:self.n_components_]\n self.singular_values_ = S[:self.n_components_]\n self.mean_ = col_mean\n self.var_ = col_var\n self.explained_variance_ = explained_variance[:self.n_components_]\n self.explained_variance_ratio_ = explained_variance_ratio[:self.n_components_]\n if self.n_components_ < n_features:\n self.noise_variance_ = explained_variance[self.n_components_:].mean()\n else:\n self.noise_variance_ = 0.0\n return self\n \n def transform(self, X):\n \"\"\"Apply dimensionality reduction to X.\n\n X is projected on the first principal components previously extracted\n from a training set, using minibatches of size batch_size if X is\n sparse.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n New data, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n Returns\n -------\n X_new : ndarray of shape (n_samples, n_components)\n Projection of X in the first principal components.\n\n Examples\n --------\n\n >>> import numpy as np\n >>> from sklearn.decomposition import IncrementalPCA\n >>> X = np.array([[-1, -1], [-2, -1], [-3, -2],\n ... 
[1, 1], [2, 1], [3, 2]])\n >>> ipca = IncrementalPCA(n_components=2, batch_size=3)\n >>> ipca.fit(X)\n IncrementalPCA(batch_size=3, n_components=2)\n >>> ipca.transform(X) # doctest: +SKIP\n \"\"\"\n if sparse.issparse(X):\n n_samples = X.shape[0]\n output = []\n for batch in gen_batches(n_samples, self.batch_size_, min_batch_size=self.n_components or 0):\n output.append(super().transform(X[batch].toarray()))\n return np.vstack(output)\n else:\n return super().transform(X)\n" }, @@ -20574,9 +20640,9 @@ "sklearn.decomposition._kernel_pca.KernelPCA._more_tags" ], "is_public": true, - "description": "Kernel Principal component analysis (KPCA).\n\nNon-linear dimensionality reduction through the use of kernels (see :ref:`metrics`). It uses the `scipy.linalg.eigh` LAPACK implementation of the full SVD or the `scipy.sparse.linalg.eigsh` ARPACK implementation of the truncated SVD, depending on the shape of the input data and the number of components to extract. It can also use a randomized truncated SVD by the method of Halko et al. 2009, see `eigen_solver`. Read more in the :ref:`User Guide `.", - "docstring": "Kernel Principal component analysis (KPCA).\n\n Non-linear dimensionality reduction through the use of kernels (see\n :ref:`metrics`).\n\n It uses the `scipy.linalg.eigh` LAPACK implementation of the full SVD or\n the `scipy.sparse.linalg.eigsh` ARPACK implementation of the truncated SVD,\n depending on the shape of the input data and the number of components to\n extract. It can also use a randomized truncated SVD by the method of\n Halko et al. 2009, see `eigen_solver`.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n n_components : int, default=None\n Number of components. If None, all non-zero components are kept.\n\n kernel : {'linear', 'poly', 'rbf', 'sigmoid', 'cosine', 'precomputed'}, default='linear'\n Kernel used for PCA.\n\n gamma : float, default=None\n Kernel coefficient for rbf, poly and sigmoid kernels. Ignored by other\n kernels. If ``gamma`` is ``None``, then it is set to ``1/n_features``.\n\n degree : int, default=3\n Degree for poly kernels. Ignored by other kernels.\n\n coef0 : float, default=1\n Independent term in poly and sigmoid kernels.\n Ignored by other kernels.\n\n kernel_params : dict, default=None\n Parameters (keyword arguments) and\n values for kernel passed as callable object.\n Ignored by other kernels.\n\n alpha : float, default=1.0\n Hyperparameter of the ridge regression that learns the\n inverse transform (when fit_inverse_transform=True).\n\n fit_inverse_transform : bool, default=False\n Learn the inverse transform for non-precomputed kernels\n (i.e. learn to find the pre-image of a point).\n\n eigen_solver : {'auto', 'dense', 'arpack', 'randomized'}, default='auto'\n Select eigensolver to use. If `n_components` is much\n less than the number of training samples, randomized (or arpack to a\n smaller extend) may be more efficient than the dense eigensolver.\n Randomized SVD is performed according to the method of Halko et al.\n\n auto :\n the solver is selected by a default policy based on n_samples\n (the number of training samples) and `n_components`:\n if the number of components to extract is less than 10 (strict) and\n the number of samples is more than 200 (strict), the 'arpack'\n method is enabled. 
Otherwise the exact full eigenvalue\n decomposition is computed and optionally truncated afterwards\n ('dense' method).\n dense :\n run exact full eigenvalue decomposition calling the standard\n LAPACK solver via `scipy.linalg.eigh`, and select the components\n by postprocessing\n arpack :\n run SVD truncated to n_components calling ARPACK solver using\n `scipy.sparse.linalg.eigsh`. It requires strictly\n 0 < n_components < n_samples\n randomized :\n run randomized SVD by the method of Halko et al. The current\n implementation selects eigenvalues based on their module; therefore\n using this method can lead to unexpected results if the kernel is\n not positive semi-definite.\n\n .. versionchanged:: 1.0\n `'randomized'` was added.\n\n tol : float, default=0\n Convergence tolerance for arpack.\n If 0, optimal value will be chosen by arpack.\n\n max_iter : int, default=None\n Maximum number of iterations for arpack.\n If None, optimal value will be chosen by arpack.\n\n iterated_power : int >= 0, or 'auto', default='auto'\n Number of iterations for the power method computed by\n svd_solver == 'randomized'. When 'auto', it is set to 7 when\n `n_components < 0.1 * min(X.shape)`, other it is set to 4.\n\n .. versionadded:: 1.0\n\n remove_zero_eig : bool, default=False\n If True, then all components with zero eigenvalues are removed, so\n that the number of components in the output may be < n_components\n (and sometimes even zero due to numerical instability).\n When n_components is None, this parameter is ignored and components\n with zero eigenvalues are removed regardless.\n\n random_state : int, RandomState instance or None, default=None\n Used when ``eigen_solver`` == 'arpack' or 'randomized'. Pass an int\n for reproducible results across multiple function calls.\n See :term:`Glossary `.\n\n .. versionadded:: 0.18\n\n copy_X : bool, default=True\n If True, input X is copied and stored by the model in the `X_fit_`\n attribute. If no further changes will be done to X, setting\n `copy_X=False` saves memory by storing a reference.\n\n .. versionadded:: 0.18\n\n n_jobs : int, default=None\n The number of parallel jobs to run.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n .. versionadded:: 0.18\n\n Attributes\n ----------\n eigenvalues_ : ndarray of shape (n_components,)\n Eigenvalues of the centered kernel matrix in decreasing order.\n If `n_components` and `remove_zero_eig` are not set,\n then all values are stored.\n\n lambdas_ : ndarray of shape (n_components,)\n Same as `eigenvalues_` but this attribute is deprecated.\n\n .. deprecated:: 1.0\n `lambdas_` was renamed to `eigenvalues_` in version 1.0 and will be\n removed in 1.2.\n\n eigenvectors_ : ndarray of shape (n_samples, n_components)\n Eigenvectors of the centered kernel matrix. If `n_components` and\n `remove_zero_eig` are not set, then all components are stored.\n\n alphas_ : ndarray of shape (n_samples, n_components)\n Same as `eigenvectors_` but this attribute is deprecated.\n\n .. deprecated:: 1.0\n `alphas_` was renamed to `eigenvectors_` in version 1.0 and will be\n removed in 1.2.\n\n dual_coef_ : ndarray of shape (n_samples, n_features)\n Inverse transform matrix. 
Only available when\n ``fit_inverse_transform`` is True.\n\n X_transformed_fit_ : ndarray of shape (n_samples, n_components)\n Projection of the fitted data on the kernel principal components.\n Only available when ``fit_inverse_transform`` is True.\n\n X_fit_ : ndarray of shape (n_samples, n_features)\n The data used to fit the model. If `copy_X=False`, then `X_fit_` is\n a reference. This attribute is used for the calls to transform.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n FastICA : A fast algorithm for Independent Component Analysis.\n IncrementalPCA : Incremental Principal Component Analysis.\n NMF : Non-Negative Matrix Factorization.\n PCA : Principal Component Analysis.\n SparsePCA : Sparse Principal Component Analysis.\n TruncatedSVD : Dimensionality reduction using truncated SVD.\n\n References\n ----------\n Kernel PCA was introduced in:\n Bernhard Schoelkopf, Alexander J. Smola,\n and Klaus-Robert Mueller. 1999. Kernel principal\n component analysis. In Advances in kernel methods,\n MIT Press, Cambridge, MA, USA 327-352.\n\n For eigen_solver == 'arpack', refer to `scipy.sparse.linalg.eigsh`.\n\n For eigen_solver == 'randomized', see:\n Finding structure with randomness: Stochastic algorithms\n for constructing approximate matrix decompositions Halko, et al., 2009\n (arXiv:909)\n A randomized algorithm for the decomposition of matrices\n Per-Gunnar Martinsson, Vladimir Rokhlin and Mark Tygert\n\n Examples\n --------\n >>> from sklearn.datasets import load_digits\n >>> from sklearn.decomposition import KernelPCA\n >>> X, _ = load_digits(return_X_y=True)\n >>> transformer = KernelPCA(n_components=7, kernel='linear')\n >>> X_transformed = transformer.fit_transform(X)\n >>> X_transformed.shape\n (1797, 7)\n ", - "source_code": "\n\nclass KernelPCA(TransformerMixin, BaseEstimator):\n \"\"\"Kernel Principal component analysis (KPCA).\n\n Non-linear dimensionality reduction through the use of kernels (see\n :ref:`metrics`).\n\n It uses the `scipy.linalg.eigh` LAPACK implementation of the full SVD or\n the `scipy.sparse.linalg.eigsh` ARPACK implementation of the truncated SVD,\n depending on the shape of the input data and the number of components to\n extract. It can also use a randomized truncated SVD by the method of\n Halko et al. 2009, see `eigen_solver`.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n n_components : int, default=None\n Number of components. If None, all non-zero components are kept.\n\n kernel : {'linear', 'poly', 'rbf', 'sigmoid', 'cosine', 'precomputed'}, default='linear'\n Kernel used for PCA.\n\n gamma : float, default=None\n Kernel coefficient for rbf, poly and sigmoid kernels. Ignored by other\n kernels. If ``gamma`` is ``None``, then it is set to ``1/n_features``.\n\n degree : int, default=3\n Degree for poly kernels. 
Ignored by other kernels.\n\n coef0 : float, default=1\n Independent term in poly and sigmoid kernels.\n Ignored by other kernels.\n\n kernel_params : dict, default=None\n Parameters (keyword arguments) and\n values for kernel passed as callable object.\n Ignored by other kernels.\n\n alpha : float, default=1.0\n Hyperparameter of the ridge regression that learns the\n inverse transform (when fit_inverse_transform=True).\n\n fit_inverse_transform : bool, default=False\n Learn the inverse transform for non-precomputed kernels\n (i.e. learn to find the pre-image of a point).\n\n eigen_solver : {'auto', 'dense', 'arpack', 'randomized'}, default='auto'\n Select eigensolver to use. If `n_components` is much\n less than the number of training samples, randomized (or arpack to a\n smaller extend) may be more efficient than the dense eigensolver.\n Randomized SVD is performed according to the method of Halko et al.\n\n auto :\n the solver is selected by a default policy based on n_samples\n (the number of training samples) and `n_components`:\n if the number of components to extract is less than 10 (strict) and\n the number of samples is more than 200 (strict), the 'arpack'\n method is enabled. Otherwise the exact full eigenvalue\n decomposition is computed and optionally truncated afterwards\n ('dense' method).\n dense :\n run exact full eigenvalue decomposition calling the standard\n LAPACK solver via `scipy.linalg.eigh`, and select the components\n by postprocessing\n arpack :\n run SVD truncated to n_components calling ARPACK solver using\n `scipy.sparse.linalg.eigsh`. It requires strictly\n 0 < n_components < n_samples\n randomized :\n run randomized SVD by the method of Halko et al. The current\n implementation selects eigenvalues based on their module; therefore\n using this method can lead to unexpected results if the kernel is\n not positive semi-definite.\n\n .. versionchanged:: 1.0\n `'randomized'` was added.\n\n tol : float, default=0\n Convergence tolerance for arpack.\n If 0, optimal value will be chosen by arpack.\n\n max_iter : int, default=None\n Maximum number of iterations for arpack.\n If None, optimal value will be chosen by arpack.\n\n iterated_power : int >= 0, or 'auto', default='auto'\n Number of iterations for the power method computed by\n svd_solver == 'randomized'. When 'auto', it is set to 7 when\n `n_components < 0.1 * min(X.shape)`, other it is set to 4.\n\n .. versionadded:: 1.0\n\n remove_zero_eig : bool, default=False\n If True, then all components with zero eigenvalues are removed, so\n that the number of components in the output may be < n_components\n (and sometimes even zero due to numerical instability).\n When n_components is None, this parameter is ignored and components\n with zero eigenvalues are removed regardless.\n\n random_state : int, RandomState instance or None, default=None\n Used when ``eigen_solver`` == 'arpack' or 'randomized'. Pass an int\n for reproducible results across multiple function calls.\n See :term:`Glossary `.\n\n .. versionadded:: 0.18\n\n copy_X : bool, default=True\n If True, input X is copied and stored by the model in the `X_fit_`\n attribute. If no further changes will be done to X, setting\n `copy_X=False` saves memory by storing a reference.\n\n .. versionadded:: 0.18\n\n n_jobs : int, default=None\n The number of parallel jobs to run.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n .. 
versionadded:: 0.18\n\n Attributes\n ----------\n eigenvalues_ : ndarray of shape (n_components,)\n Eigenvalues of the centered kernel matrix in decreasing order.\n If `n_components` and `remove_zero_eig` are not set,\n then all values are stored.\n\n lambdas_ : ndarray of shape (n_components,)\n Same as `eigenvalues_` but this attribute is deprecated.\n\n .. deprecated:: 1.0\n `lambdas_` was renamed to `eigenvalues_` in version 1.0 and will be\n removed in 1.2.\n\n eigenvectors_ : ndarray of shape (n_samples, n_components)\n Eigenvectors of the centered kernel matrix. If `n_components` and\n `remove_zero_eig` are not set, then all components are stored.\n\n alphas_ : ndarray of shape (n_samples, n_components)\n Same as `eigenvectors_` but this attribute is deprecated.\n\n .. deprecated:: 1.0\n `alphas_` was renamed to `eigenvectors_` in version 1.0 and will be\n removed in 1.2.\n\n dual_coef_ : ndarray of shape (n_samples, n_features)\n Inverse transform matrix. Only available when\n ``fit_inverse_transform`` is True.\n\n X_transformed_fit_ : ndarray of shape (n_samples, n_components)\n Projection of the fitted data on the kernel principal components.\n Only available when ``fit_inverse_transform`` is True.\n\n X_fit_ : ndarray of shape (n_samples, n_features)\n The data used to fit the model. If `copy_X=False`, then `X_fit_` is\n a reference. This attribute is used for the calls to transform.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n FastICA : A fast algorithm for Independent Component Analysis.\n IncrementalPCA : Incremental Principal Component Analysis.\n NMF : Non-Negative Matrix Factorization.\n PCA : Principal Component Analysis.\n SparsePCA : Sparse Principal Component Analysis.\n TruncatedSVD : Dimensionality reduction using truncated SVD.\n\n References\n ----------\n Kernel PCA was introduced in:\n Bernhard Schoelkopf, Alexander J. Smola,\n and Klaus-Robert Mueller. 1999. Kernel principal\n component analysis. 
In Advances in kernel methods,\n MIT Press, Cambridge, MA, USA 327-352.\n\n For eigen_solver == 'arpack', refer to `scipy.sparse.linalg.eigsh`.\n\n For eigen_solver == 'randomized', see:\n Finding structure with randomness: Stochastic algorithms\n for constructing approximate matrix decompositions Halko, et al., 2009\n (arXiv:909)\n A randomized algorithm for the decomposition of matrices\n Per-Gunnar Martinsson, Vladimir Rokhlin and Mark Tygert\n\n Examples\n --------\n >>> from sklearn.datasets import load_digits\n >>> from sklearn.decomposition import KernelPCA\n >>> X, _ = load_digits(return_X_y=True)\n >>> transformer = KernelPCA(n_components=7, kernel='linear')\n >>> X_transformed = transformer.fit_transform(X)\n >>> X_transformed.shape\n (1797, 7)\n \"\"\"\n \n def __init__(self, n_components=None, *, kernel='linear', gamma=None, degree=3, coef0=1, kernel_params=None, alpha=1.0, fit_inverse_transform=False, eigen_solver='auto', tol=0, max_iter=None, iterated_power='auto', remove_zero_eig=False, random_state=None, copy_X=True, n_jobs=None):\n if fit_inverse_transform and kernel == 'precomputed':\n raise ValueError('Cannot fit_inverse_transform with a precomputed kernel.')\n self.n_components = n_components\n self.kernel = kernel\n self.kernel_params = kernel_params\n self.gamma = gamma\n self.degree = degree\n self.coef0 = coef0\n self.alpha = alpha\n self.fit_inverse_transform = fit_inverse_transform\n self.eigen_solver = eigen_solver\n self.tol = tol\n self.max_iter = max_iter\n self.iterated_power = iterated_power\n self.remove_zero_eig = remove_zero_eig\n self.random_state = random_state\n self.n_jobs = n_jobs\n self.copy_X = copy_X\n \n @deprecated('Attribute `_pairwise` was deprecated in version 0.24 and will be removed in 1.1 (renaming of 0.26).')\n @property\n def _pairwise(self):\n return self.kernel == 'precomputed'\n \n @deprecated('Attribute `lambdas_` was deprecated in version 1.0 and will be removed in 1.2. Use `eigenvalues_` instead.')\n @property\n def lambdas_(self):\n return self.eigenvalues_\n \n @deprecated('Attribute `alphas_` was deprecated in version 1.0 and will be removed in 1.2. 
Use `eigenvectors_` instead.')\n @property\n def alphas_(self):\n return self.eigenvectors_\n \n def _get_kernel(self, X, Y=None):\n if callable(self.kernel):\n params = self.kernel_params or {}\n else:\n params = {'gamma': self.gamma, 'degree': self.degree, 'coef0': self.coef0}\n return pairwise_kernels(X, Y, metric=self.kernel, filter_params=True, n_jobs=self.n_jobs, **params)\n \n def _fit_transform(self, K):\n \"\"\"Fit's using kernel K\"\"\"\n K = self._centerer.fit_transform(K)\n if self.n_components is None:\n n_components = K.shape[0]\n else:\n if self.n_components < 1:\n raise ValueError(f'`n_components` should be >= 1, got: {self.n_component}')\n n_components = min(K.shape[0], self.n_components)\n if self.eigen_solver == 'auto':\n if K.shape[0] > 200 and n_components < 10:\n eigen_solver = 'arpack'\n else:\n eigen_solver = 'dense'\n else:\n eigen_solver = self.eigen_solver\n if eigen_solver == 'dense':\n (self.eigenvalues_, self.eigenvectors_) = linalg.eigh(K, eigvals=(K.shape[0] - n_components, K.shape[0] - 1))\n elif eigen_solver == 'arpack':\n v0 = _init_arpack_v0(K.shape[0], self.random_state)\n (self.eigenvalues_, self.eigenvectors_) = eigsh(K, n_components, which='LA', tol=self.tol, maxiter=self.max_iter, v0=v0)\n elif eigen_solver == 'randomized':\n (self.eigenvalues_, self.eigenvectors_) = _randomized_eigsh(K, n_components=n_components, n_iter=self.iterated_power, random_state=self.random_state, selection='module')\n else:\n raise ValueError('Unsupported value for `eigen_solver`: %r' % eigen_solver)\n self.eigenvalues_ = _check_psd_eigenvalues(self.eigenvalues_, enable_warnings=False)\n (self.eigenvectors_, _) = svd_flip(self.eigenvectors_, np.zeros_like(self.eigenvectors_).T)\n indices = self.eigenvalues_.argsort()[::-1]\n self.eigenvalues_ = self.eigenvalues_[indices]\n self.eigenvectors_ = self.eigenvectors_[:, indices]\n if self.remove_zero_eig or self.n_components is None:\n self.eigenvectors_ = self.eigenvectors_[:, self.eigenvalues_ > 0]\n self.eigenvalues_ = self.eigenvalues_[self.eigenvalues_ > 0]\n return K\n \n def _fit_inverse_transform(self, X_transformed, X):\n if hasattr(X, 'tocsr'):\n raise NotImplementedError('Inverse transform not implemented for sparse matrices!')\n n_samples = X_transformed.shape[0]\n K = self._get_kernel(X_transformed)\n K.flat[::n_samples + 1] += self.alpha\n self.dual_coef_ = linalg.solve(K, X, sym_pos=True, overwrite_a=True)\n self.X_transformed_fit_ = X_transformed\n \n def fit(self, X, y=None):\n \"\"\"Fit the model from data in X.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training vector, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n Returns\n -------\n self : object\n Returns the instance itself.\n \"\"\"\n X = self._validate_data(X, accept_sparse='csr', copy=self.copy_X)\n self._centerer = KernelCenterer()\n K = self._get_kernel(X)\n self._fit_transform(K)\n if self.fit_inverse_transform:\n X_transformed = self.eigenvectors_ * np.sqrt(self.eigenvalues_)\n self._fit_inverse_transform(X_transformed, X)\n self.X_fit_ = X\n return self\n \n def fit_transform(self, X, y=None, **params):\n \"\"\"Fit the model from data in X and transform X.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training vector, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n y : Ignored\n Not 
used, present for API consistency by convention.\n\n **params : kwargs\n Parameters (keyword arguments) and values passed to\n the fit_transform instance.\n\n Returns\n -------\n X_new : ndarray of shape (n_samples, n_components)\n Returns the instance itself.\n \"\"\"\n self.fit(X, **params)\n X_transformed = self.eigenvectors_ * np.sqrt(self.eigenvalues_)\n if self.fit_inverse_transform:\n self._fit_inverse_transform(X_transformed, X)\n return X_transformed\n \n def transform(self, X):\n \"\"\"Transform X.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training vector, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n Returns\n -------\n X_new : ndarray of shape (n_samples, n_components)\n Returns the instance itself.\n \"\"\"\n check_is_fitted(self)\n X = self._validate_data(X, accept_sparse='csr', reset=False)\n K = self._centerer.transform(self._get_kernel(X, self.X_fit_))\n non_zeros = np.flatnonzero(self.eigenvalues_)\n scaled_alphas = np.zeros_like(self.eigenvectors_)\n scaled_alphas[:, non_zeros] = self.eigenvectors_[:, non_zeros] / np.sqrt(self.eigenvalues_[non_zeros])\n return np.dot(K, scaled_alphas)\n \n def inverse_transform(self, X):\n \"\"\"Transform X back to original space.\n\n ``inverse_transform`` approximates the inverse transformation using\n a learned pre-image. The pre-image is learned by kernel ridge\n regression of the original data on their low-dimensional representation\n vectors.\n\n .. note:\n :meth:`~sklearn.decomposition.fit` internally uses a centered\n kernel. As the centered kernel no longer contains the information\n of the mean of kernel features, such information is not taken into\n account in reconstruction.\n\n .. note::\n When users want to compute inverse transformation for 'linear'\n kernel, it is recommended that they use\n :class:`~sklearn.decomposition.PCA` instead. Unlike\n :class:`~sklearn.decomposition.PCA`,\n :class:`~sklearn.decomposition.KernelPCA`'s ``inverse_transform``\n does not reconstruct the mean of data when 'linear' kernel is used\n due to the use of centered kernel.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_components)\n Training vector, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n Returns\n -------\n X_new : ndarray of shape (n_samples, n_features)\n Returns the instance itself.\n\n References\n ----------\n \"Learning to Find Pre-Images\", G BakIr et al, 2004.\n \"\"\"\n if not self.fit_inverse_transform:\n raise NotFittedError('The fit_inverse_transform parameter was not set to True when instantiating and hence the inverse transform is not available.')\n K = self._get_kernel(X, self.X_transformed_fit_)\n return np.dot(K, self.dual_coef_)\n \n def _more_tags(self):\n return {'preserves_dtype': [np.float64, np.float32], 'pairwise': self.kernel == 'precomputed'}\n" + "description": "Kernel Principal component analysis (KPCA).\n\nNon-linear dimensionality reduction through the use of kernels (see\n:ref:`metrics`).\n\nIt uses the :func:`scipy.linalg.eigh` LAPACK implementation of the full SVD\nor the :func:`scipy.sparse.linalg.eigsh` ARPACK implementation of the\ntruncated SVD, depending on the shape of the input data and the number of\ncomponents to extract. 
It can also use a randomized truncated SVD by the\nmethod proposed in [3]_, see `eigen_solver`.\n\nRead more in the :ref:`User Guide `.", + "docstring": "Kernel Principal component analysis (KPCA).\n\n Non-linear dimensionality reduction through the use of kernels (see\n :ref:`metrics`).\n\n It uses the :func:`scipy.linalg.eigh` LAPACK implementation of the full SVD\n or the :func:`scipy.sparse.linalg.eigsh` ARPACK implementation of the\n truncated SVD, depending on the shape of the input data and the number of\n components to extract. It can also use a randomized truncated SVD by the\n method proposed in [3]_, see `eigen_solver`.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n n_components : int, default=None\n Number of components. If None, all non-zero components are kept.\n\n kernel : {'linear', 'poly', 'rbf', 'sigmoid', 'cosine', 'precomputed'}, default='linear'\n Kernel used for PCA.\n\n gamma : float, default=None\n Kernel coefficient for rbf, poly and sigmoid kernels. Ignored by other\n kernels. If ``gamma`` is ``None``, then it is set to ``1/n_features``.\n\n degree : int, default=3\n Degree for poly kernels. Ignored by other kernels.\n\n coef0 : float, default=1\n Independent term in poly and sigmoid kernels.\n Ignored by other kernels.\n\n kernel_params : dict, default=None\n Parameters (keyword arguments) and\n values for kernel passed as callable object.\n Ignored by other kernels.\n\n alpha : float, default=1.0\n Hyperparameter of the ridge regression that learns the\n inverse transform (when fit_inverse_transform=True).\n\n fit_inverse_transform : bool, default=False\n Learn the inverse transform for non-precomputed kernels\n (i.e. learn to find the pre-image of a point). This method is based\n on [2]_.\n\n eigen_solver : {'auto', 'dense', 'arpack', 'randomized'}, default='auto'\n Select eigensolver to use. If `n_components` is much\n less than the number of training samples, randomized (or arpack to a\n smaller extend) may be more efficient than the dense eigensolver.\n Randomized SVD is performed according to the method of Halko et al\n [3]_.\n\n auto :\n the solver is selected by a default policy based on n_samples\n (the number of training samples) and `n_components`:\n if the number of components to extract is less than 10 (strict) and\n the number of samples is more than 200 (strict), the 'arpack'\n method is enabled. Otherwise the exact full eigenvalue\n decomposition is computed and optionally truncated afterwards\n ('dense' method).\n dense :\n run exact full eigenvalue decomposition calling the standard\n LAPACK solver via `scipy.linalg.eigh`, and select the components\n by postprocessing\n arpack :\n run SVD truncated to n_components calling ARPACK solver using\n `scipy.sparse.linalg.eigsh`. It requires strictly\n 0 < n_components < n_samples\n randomized :\n run randomized SVD by the method of Halko et al. [3]_. The current\n implementation selects eigenvalues based on their module; therefore\n using this method can lead to unexpected results if the kernel is\n not positive semi-definite. See also [4]_.\n\n .. versionchanged:: 1.0\n `'randomized'` was added.\n\n tol : float, default=0\n Convergence tolerance for arpack.\n If 0, optimal value will be chosen by arpack.\n\n max_iter : int, default=None\n Maximum number of iterations for arpack.\n If None, optimal value will be chosen by arpack.\n\n iterated_power : int >= 0, or 'auto', default='auto'\n Number of iterations for the power method computed by\n svd_solver == 'randomized'. 
When 'auto', it is set to 7 when\n `n_components < 0.1 * min(X.shape)`, other it is set to 4.\n\n .. versionadded:: 1.0\n\n remove_zero_eig : bool, default=False\n If True, then all components with zero eigenvalues are removed, so\n that the number of components in the output may be < n_components\n (and sometimes even zero due to numerical instability).\n When n_components is None, this parameter is ignored and components\n with zero eigenvalues are removed regardless.\n\n random_state : int, RandomState instance or None, default=None\n Used when ``eigen_solver`` == 'arpack' or 'randomized'. Pass an int\n for reproducible results across multiple function calls.\n See :term:`Glossary `.\n\n .. versionadded:: 0.18\n\n copy_X : bool, default=True\n If True, input X is copied and stored by the model in the `X_fit_`\n attribute. If no further changes will be done to X, setting\n `copy_X=False` saves memory by storing a reference.\n\n .. versionadded:: 0.18\n\n n_jobs : int, default=None\n The number of parallel jobs to run.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n .. versionadded:: 0.18\n\n Attributes\n ----------\n eigenvalues_ : ndarray of shape (n_components,)\n Eigenvalues of the centered kernel matrix in decreasing order.\n If `n_components` and `remove_zero_eig` are not set,\n then all values are stored.\n\n lambdas_ : ndarray of shape (n_components,)\n Same as `eigenvalues_` but this attribute is deprecated.\n\n .. deprecated:: 1.0\n `lambdas_` was renamed to `eigenvalues_` in version 1.0 and will be\n removed in 1.2.\n\n eigenvectors_ : ndarray of shape (n_samples, n_components)\n Eigenvectors of the centered kernel matrix. If `n_components` and\n `remove_zero_eig` are not set, then all components are stored.\n\n alphas_ : ndarray of shape (n_samples, n_components)\n Same as `eigenvectors_` but this attribute is deprecated.\n\n .. deprecated:: 1.0\n `alphas_` was renamed to `eigenvectors_` in version 1.0 and will be\n removed in 1.2.\n\n dual_coef_ : ndarray of shape (n_samples, n_features)\n Inverse transform matrix. Only available when\n ``fit_inverse_transform`` is True.\n\n X_transformed_fit_ : ndarray of shape (n_samples, n_components)\n Projection of the fitted data on the kernel principal components.\n Only available when ``fit_inverse_transform`` is True.\n\n X_fit_ : ndarray of shape (n_samples, n_features)\n The data used to fit the model. If `copy_X=False`, then `X_fit_` is\n a reference. This attribute is used for the calls to transform.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n FastICA : A fast algorithm for Independent Component Analysis.\n IncrementalPCA : Incremental Principal Component Analysis.\n NMF : Non-Negative Matrix Factorization.\n PCA : Principal Component Analysis.\n SparsePCA : Sparse Principal Component Analysis.\n TruncatedSVD : Dimensionality reduction using truncated SVD.\n\n References\n ----------\n .. [1] `Sch\u00f6lkopf, Bernhard, Alexander Smola, and Klaus-Robert M\u00fcller.\n \"Kernel principal component analysis.\"\n International conference on artificial neural networks.\n Springer, Berlin, Heidelberg, 1997.\n `_\n\n .. 
[2] `Bak\u0131r, G\u00f6khan H., Jason Weston, and Bernhard Sch\u00f6lkopf.\n \"Learning to find pre-images.\"\n Advances in neural information processing systems 16 (2004): 449-456.\n `_\n\n .. [3] :arxiv:`Halko, Nathan, Per-Gunnar Martinsson, and Joel A. Tropp.\n \"Finding structure with randomness: Probabilistic algorithms for\n constructing approximate matrix decompositions.\"\n SIAM review 53.2 (2011): 217-288. <0909.4061>`\n\n .. [4] `Martinsson, Per-Gunnar, Vladimir Rokhlin, and Mark Tygert.\n \"A randomized algorithm for the decomposition of matrices.\"\n Applied and Computational Harmonic Analysis 30.1 (2011): 47-68.\n `_\n\n Examples\n --------\n >>> from sklearn.datasets import load_digits\n >>> from sklearn.decomposition import KernelPCA\n >>> X, _ = load_digits(return_X_y=True)\n >>> transformer = KernelPCA(n_components=7, kernel='linear')\n >>> X_transformed = transformer.fit_transform(X)\n >>> X_transformed.shape\n (1797, 7)\n ", + "source_code": "\n\nclass KernelPCA(TransformerMixin, BaseEstimator):\n \"\"\"Kernel Principal component analysis (KPCA).\n\n Non-linear dimensionality reduction through the use of kernels (see\n :ref:`metrics`).\n\n It uses the :func:`scipy.linalg.eigh` LAPACK implementation of the full SVD\n or the :func:`scipy.sparse.linalg.eigsh` ARPACK implementation of the\n truncated SVD, depending on the shape of the input data and the number of\n components to extract. It can also use a randomized truncated SVD by the\n method proposed in [3]_, see `eigen_solver`.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n n_components : int, default=None\n Number of components. If None, all non-zero components are kept.\n\n kernel : {'linear', 'poly', 'rbf', 'sigmoid', 'cosine', 'precomputed'}, default='linear'\n Kernel used for PCA.\n\n gamma : float, default=None\n Kernel coefficient for rbf, poly and sigmoid kernels. Ignored by other\n kernels. If ``gamma`` is ``None``, then it is set to ``1/n_features``.\n\n degree : int, default=3\n Degree for poly kernels. Ignored by other kernels.\n\n coef0 : float, default=1\n Independent term in poly and sigmoid kernels.\n Ignored by other kernels.\n\n kernel_params : dict, default=None\n Parameters (keyword arguments) and\n values for kernel passed as callable object.\n Ignored by other kernels.\n\n alpha : float, default=1.0\n Hyperparameter of the ridge regression that learns the\n inverse transform (when fit_inverse_transform=True).\n\n fit_inverse_transform : bool, default=False\n Learn the inverse transform for non-precomputed kernels\n (i.e. learn to find the pre-image of a point). This method is based\n on [2]_.\n\n eigen_solver : {'auto', 'dense', 'arpack', 'randomized'}, default='auto'\n Select eigensolver to use. If `n_components` is much\n less than the number of training samples, randomized (or arpack to a\n smaller extend) may be more efficient than the dense eigensolver.\n Randomized SVD is performed according to the method of Halko et al\n [3]_.\n\n auto :\n the solver is selected by a default policy based on n_samples\n (the number of training samples) and `n_components`:\n if the number of components to extract is less than 10 (strict) and\n the number of samples is more than 200 (strict), the 'arpack'\n method is enabled. 
Otherwise the exact full eigenvalue\n decomposition is computed and optionally truncated afterwards\n ('dense' method).\n dense :\n run exact full eigenvalue decomposition calling the standard\n LAPACK solver via `scipy.linalg.eigh`, and select the components\n by postprocessing\n arpack :\n run SVD truncated to n_components calling ARPACK solver using\n `scipy.sparse.linalg.eigsh`. It requires strictly\n 0 < n_components < n_samples\n randomized :\n run randomized SVD by the method of Halko et al. [3]_. The current\n implementation selects eigenvalues based on their module; therefore\n using this method can lead to unexpected results if the kernel is\n not positive semi-definite. See also [4]_.\n\n .. versionchanged:: 1.0\n `'randomized'` was added.\n\n tol : float, default=0\n Convergence tolerance for arpack.\n If 0, optimal value will be chosen by arpack.\n\n max_iter : int, default=None\n Maximum number of iterations for arpack.\n If None, optimal value will be chosen by arpack.\n\n iterated_power : int >= 0, or 'auto', default='auto'\n Number of iterations for the power method computed by\n svd_solver == 'randomized'. When 'auto', it is set to 7 when\n `n_components < 0.1 * min(X.shape)`, other it is set to 4.\n\n .. versionadded:: 1.0\n\n remove_zero_eig : bool, default=False\n If True, then all components with zero eigenvalues are removed, so\n that the number of components in the output may be < n_components\n (and sometimes even zero due to numerical instability).\n When n_components is None, this parameter is ignored and components\n with zero eigenvalues are removed regardless.\n\n random_state : int, RandomState instance or None, default=None\n Used when ``eigen_solver`` == 'arpack' or 'randomized'. Pass an int\n for reproducible results across multiple function calls.\n See :term:`Glossary `.\n\n .. versionadded:: 0.18\n\n copy_X : bool, default=True\n If True, input X is copied and stored by the model in the `X_fit_`\n attribute. If no further changes will be done to X, setting\n `copy_X=False` saves memory by storing a reference.\n\n .. versionadded:: 0.18\n\n n_jobs : int, default=None\n The number of parallel jobs to run.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n .. versionadded:: 0.18\n\n Attributes\n ----------\n eigenvalues_ : ndarray of shape (n_components,)\n Eigenvalues of the centered kernel matrix in decreasing order.\n If `n_components` and `remove_zero_eig` are not set,\n then all values are stored.\n\n lambdas_ : ndarray of shape (n_components,)\n Same as `eigenvalues_` but this attribute is deprecated.\n\n .. deprecated:: 1.0\n `lambdas_` was renamed to `eigenvalues_` in version 1.0 and will be\n removed in 1.2.\n\n eigenvectors_ : ndarray of shape (n_samples, n_components)\n Eigenvectors of the centered kernel matrix. If `n_components` and\n `remove_zero_eig` are not set, then all components are stored.\n\n alphas_ : ndarray of shape (n_samples, n_components)\n Same as `eigenvectors_` but this attribute is deprecated.\n\n .. deprecated:: 1.0\n `alphas_` was renamed to `eigenvectors_` in version 1.0 and will be\n removed in 1.2.\n\n dual_coef_ : ndarray of shape (n_samples, n_features)\n Inverse transform matrix. 
Only available when\n ``fit_inverse_transform`` is True.\n\n X_transformed_fit_ : ndarray of shape (n_samples, n_components)\n Projection of the fitted data on the kernel principal components.\n Only available when ``fit_inverse_transform`` is True.\n\n X_fit_ : ndarray of shape (n_samples, n_features)\n The data used to fit the model. If `copy_X=False`, then `X_fit_` is\n a reference. This attribute is used for the calls to transform.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n FastICA : A fast algorithm for Independent Component Analysis.\n IncrementalPCA : Incremental Principal Component Analysis.\n NMF : Non-Negative Matrix Factorization.\n PCA : Principal Component Analysis.\n SparsePCA : Sparse Principal Component Analysis.\n TruncatedSVD : Dimensionality reduction using truncated SVD.\n\n References\n ----------\n .. [1] `Sch\u00f6lkopf, Bernhard, Alexander Smola, and Klaus-Robert M\u00fcller.\n \"Kernel principal component analysis.\"\n International conference on artificial neural networks.\n Springer, Berlin, Heidelberg, 1997.\n `_\n\n .. [2] `Bak\u0131r, G\u00f6khan H., Jason Weston, and Bernhard Sch\u00f6lkopf.\n \"Learning to find pre-images.\"\n Advances in neural information processing systems 16 (2004): 449-456.\n `_\n\n .. [3] :arxiv:`Halko, Nathan, Per-Gunnar Martinsson, and Joel A. Tropp.\n \"Finding structure with randomness: Probabilistic algorithms for\n constructing approximate matrix decompositions.\"\n SIAM review 53.2 (2011): 217-288. <0909.4061>`\n\n .. [4] `Martinsson, Per-Gunnar, Vladimir Rokhlin, and Mark Tygert.\n \"A randomized algorithm for the decomposition of matrices.\"\n Applied and Computational Harmonic Analysis 30.1 (2011): 47-68.\n `_\n\n Examples\n --------\n >>> from sklearn.datasets import load_digits\n >>> from sklearn.decomposition import KernelPCA\n >>> X, _ = load_digits(return_X_y=True)\n >>> transformer = KernelPCA(n_components=7, kernel='linear')\n >>> X_transformed = transformer.fit_transform(X)\n >>> X_transformed.shape\n (1797, 7)\n \"\"\"\n \n def __init__(self, n_components=None, *, kernel='linear', gamma=None, degree=3, coef0=1, kernel_params=None, alpha=1.0, fit_inverse_transform=False, eigen_solver='auto', tol=0, max_iter=None, iterated_power='auto', remove_zero_eig=False, random_state=None, copy_X=True, n_jobs=None):\n if fit_inverse_transform and kernel == 'precomputed':\n raise ValueError('Cannot fit_inverse_transform with a precomputed kernel.')\n self.n_components = n_components\n self.kernel = kernel\n self.kernel_params = kernel_params\n self.gamma = gamma\n self.degree = degree\n self.coef0 = coef0\n self.alpha = alpha\n self.fit_inverse_transform = fit_inverse_transform\n self.eigen_solver = eigen_solver\n self.tol = tol\n self.max_iter = max_iter\n self.iterated_power = iterated_power\n self.remove_zero_eig = remove_zero_eig\n self.random_state = random_state\n self.n_jobs = n_jobs\n self.copy_X = copy_X\n \n @deprecated('Attribute `_pairwise` was deprecated in version 0.24 and will be removed in 1.1 (renaming of 0.26).')\n @property\n def _pairwise(self):\n return self.kernel == 'precomputed'\n \n @deprecated('Attribute `lambdas_` was deprecated in version 1.0 and will be removed in 1.2. 
Use `eigenvalues_` instead.')\n @property\n def lambdas_(self):\n return self.eigenvalues_\n \n @deprecated('Attribute `alphas_` was deprecated in version 1.0 and will be removed in 1.2. Use `eigenvectors_` instead.')\n @property\n def alphas_(self):\n return self.eigenvectors_\n \n def _get_kernel(self, X, Y=None):\n if callable(self.kernel):\n params = self.kernel_params or {}\n else:\n params = {'gamma': self.gamma, 'degree': self.degree, 'coef0': self.coef0}\n return pairwise_kernels(X, Y, metric=self.kernel, filter_params=True, n_jobs=self.n_jobs, **params)\n \n def _fit_transform(self, K):\n \"\"\"Fit using kernel K.\"\"\"\n K = self._centerer.fit_transform(K)\n if self.n_components is None:\n n_components = K.shape[0]\n else:\n if self.n_components < 1:\n raise ValueError(f'`n_components` should be >= 1, got: {self.n_components}')\n n_components = min(K.shape[0], self.n_components)\n if self.eigen_solver == 'auto':\n if K.shape[0] > 200 and n_components < 10:\n eigen_solver = 'arpack'\n else:\n eigen_solver = 'dense'\n else:\n eigen_solver = self.eigen_solver\n if eigen_solver == 'dense':\n (self.eigenvalues_, self.eigenvectors_) = linalg.eigh(K, eigvals=(K.shape[0] - n_components, K.shape[0] - 1))\n elif eigen_solver == 'arpack':\n v0 = _init_arpack_v0(K.shape[0], self.random_state)\n (self.eigenvalues_, self.eigenvectors_) = eigsh(K, n_components, which='LA', tol=self.tol, maxiter=self.max_iter, v0=v0)\n elif eigen_solver == 'randomized':\n (self.eigenvalues_, self.eigenvectors_) = _randomized_eigsh(K, n_components=n_components, n_iter=self.iterated_power, random_state=self.random_state, selection='module')\n else:\n raise ValueError('Unsupported value for `eigen_solver`: %r' % eigen_solver)\n self.eigenvalues_ = _check_psd_eigenvalues(self.eigenvalues_, enable_warnings=False)\n (self.eigenvectors_, _) = svd_flip(self.eigenvectors_, np.zeros_like(self.eigenvectors_).T)\n indices = self.eigenvalues_.argsort()[::-1]\n self.eigenvalues_ = self.eigenvalues_[indices]\n self.eigenvectors_ = self.eigenvectors_[:, indices]\n if self.remove_zero_eig or self.n_components is None:\n self.eigenvectors_ = self.eigenvectors_[:, self.eigenvalues_ > 0]\n self.eigenvalues_ = self.eigenvalues_[self.eigenvalues_ > 0]\n return K\n \n def _fit_inverse_transform(self, X_transformed, X):\n if hasattr(X, 'tocsr'):\n raise NotImplementedError('Inverse transform not implemented for sparse matrices!')\n n_samples = X_transformed.shape[0]\n K = self._get_kernel(X_transformed)\n K.flat[::n_samples + 1] += self.alpha\n self.dual_coef_ = linalg.solve(K, X, sym_pos=True, overwrite_a=True)\n self.X_transformed_fit_ = X_transformed\n \n def fit(self, X, y=None):\n \"\"\"Fit the model from data in X.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training vector, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n Returns\n -------\n self : object\n Returns the instance itself.\n \"\"\"\n X = self._validate_data(X, accept_sparse='csr', copy=self.copy_X)\n self._centerer = KernelCenterer()\n K = self._get_kernel(X)\n self._fit_transform(K)\n if self.fit_inverse_transform:\n X_transformed = self.eigenvectors_ * np.sqrt(self.eigenvalues_)\n self._fit_inverse_transform(X_transformed, X)\n self.X_fit_ = X\n return self\n \n def fit_transform(self, X, y=None, **params):\n \"\"\"Fit the model from data in X and transform X.\n\n Parameters\n ----------\n X : 
{array-like, sparse matrix} of shape (n_samples, n_features)\n Training vector, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n **params : kwargs\n Parameters (keyword arguments) and values passed to\n the fit_transform method.\n\n Returns\n -------\n X_new : ndarray of shape (n_samples, n_components)\n Transformed values.\n \"\"\"\n self.fit(X, **params)\n X_transformed = self.eigenvectors_ * np.sqrt(self.eigenvalues_)\n if self.fit_inverse_transform:\n self._fit_inverse_transform(X_transformed, X)\n return X_transformed\n \n def transform(self, X):\n \"\"\"Transform X.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training vector, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n Returns\n -------\n X_new : ndarray of shape (n_samples, n_components)\n Transformed values.\n \"\"\"\n check_is_fitted(self)\n X = self._validate_data(X, accept_sparse='csr', reset=False)\n K = self._centerer.transform(self._get_kernel(X, self.X_fit_))\n non_zeros = np.flatnonzero(self.eigenvalues_)\n scaled_alphas = np.zeros_like(self.eigenvectors_)\n scaled_alphas[:, non_zeros] = self.eigenvectors_[:, non_zeros] / np.sqrt(self.eigenvalues_[non_zeros])\n return np.dot(K, scaled_alphas)\n \n def inverse_transform(self, X):\n \"\"\"Transform X back to original space.\n\n ``inverse_transform`` approximates the inverse transformation using\n a learned pre-image. The pre-image is learned by kernel ridge\n regression of the original data on their low-dimensional representation\n vectors.\n\n .. note::\n :meth:`~sklearn.decomposition.KernelPCA.fit` internally uses a centered\n kernel. As the centered kernel no longer contains the information\n of the mean of kernel features, such information is not taken into\n account in reconstruction.\n\n .. note::\n When users want to compute inverse transformation for 'linear'\n kernel, it is recommended that they use\n :class:`~sklearn.decomposition.PCA` instead. 
Unlike\n :class:`~sklearn.decomposition.PCA`,\n :class:`~sklearn.decomposition.KernelPCA`'s ``inverse_transform``\n does not reconstruct the mean of data when 'linear' kernel is used\n due to the use of centered kernel.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_components)\n Training vector, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n Returns\n -------\n X_new : ndarray of shape (n_samples, n_features)\n Returns the instance itself.\n\n References\n ----------\n `Bak\u0131r, G\u00f6khan H., Jason Weston, and Bernhard Sch\u00f6lkopf.\n \"Learning to find pre-images.\"\n Advances in neural information processing systems 16 (2004): 449-456.\n `_\n \"\"\"\n if not self.fit_inverse_transform:\n raise NotFittedError('The fit_inverse_transform parameter was not set to True when instantiating and hence the inverse transform is not available.')\n K = self._get_kernel(X, self.X_transformed_fit_)\n return np.dot(K, self.dual_coef_)\n \n def _more_tags(self):\n return {'preserves_dtype': [np.float64, np.float32], 'pairwise': self.kernel == 'precomputed'}\n" }, { "name": "LatentDirichletAllocation", @@ -20601,7 +20667,7 @@ "sklearn.decomposition._lda.LatentDirichletAllocation.perplexity" ], "is_public": true, - "description": "Latent Dirichlet Allocation with online variational Bayes algorithm.\n\nThe implementation is based on [1]_ and [2]_. .. versionadded:: 0.17 Read more in the :ref:`User Guide `.", + "description": "Latent Dirichlet Allocation with online variational Bayes algorithm.\n\nThe implementation is based on [1]_ and [2]_.\n\n.. versionadded:: 0.17\n\nRead more in the :ref:`User Guide `.", "docstring": "Latent Dirichlet Allocation with online variational Bayes algorithm.\n\n The implementation is based on [1]_ and [2]_.\n\n .. versionadded:: 0.17\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n n_components : int, default=10\n Number of topics.\n\n .. versionchanged:: 0.19\n ``n_topics`` was renamed to ``n_components``\n\n doc_topic_prior : float, default=None\n Prior of document topic distribution `theta`. If the value is None,\n defaults to `1 / n_components`.\n In [1]_, this is called `alpha`.\n\n topic_word_prior : float, default=None\n Prior of topic word distribution `beta`. If the value is None, defaults\n to `1 / n_components`.\n In [1]_, this is called `eta`.\n\n learning_method : {'batch', 'online'}, default='batch'\n Method used to update `_component`. Only used in :meth:`fit` method.\n In general, if the data size is large, the online update will be much\n faster than the batch update.\n\n Valid options::\n\n 'batch': Batch variational Bayes method. Use all training data in\n each EM update.\n Old `components_` will be overwritten in each iteration.\n 'online': Online variational Bayes method. In each EM update, use\n mini-batch of training data to update the ``components_``\n variable incrementally. The learning rate is controlled by the\n ``learning_decay`` and the ``learning_offset`` parameters.\n\n .. versionchanged:: 0.20\n The default learning method is now ``\"batch\"``.\n\n learning_decay : float, default=0.7\n It is a parameter that control learning rate in the online learning\n method. The value should be set between (0.5, 1.0] to guarantee\n asymptotic convergence. When the value is 0.0 and batch_size is\n ``n_samples``, the update method is same as batch learning. 
In the\n literature, this is called kappa.\n\n learning_offset : float, default=10.0\n A (positive) parameter that downweights early iterations in online\n learning. It should be greater than 1.0. In the literature, this is\n called tau_0.\n\n max_iter : int, default=10\n The maximum number of passes over the training data (aka epochs).\n It only impacts the behavior in the :meth:`fit` method, and not the\n :meth:`partial_fit` method.\n\n batch_size : int, default=128\n Number of documents to use in each EM iteration. Only used in online\n learning.\n\n evaluate_every : int, default=-1\n How often to evaluate perplexity. Only used in `fit` method.\n set it to 0 or negative number to not evaluate perplexity in\n training at all. Evaluating perplexity can help you check convergence\n in training process, but it will also increase total training time.\n Evaluating perplexity in every iteration might increase training time\n up to two-fold.\n\n total_samples : int, default=1e6\n Total number of documents. Only used in the :meth:`partial_fit` method.\n\n perp_tol : float, default=1e-1\n Perplexity tolerance in batch learning. Only used when\n ``evaluate_every`` is greater than 0.\n\n mean_change_tol : float, default=1e-3\n Stopping tolerance for updating document topic distribution in E-step.\n\n max_doc_update_iter : int, default=100\n Max number of iterations for updating document topic distribution in\n the E-step.\n\n n_jobs : int, default=None\n The number of jobs to use in the E-step.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n verbose : int, default=0\n Verbosity level.\n\n random_state : int, RandomState instance or None, default=None\n Pass an int for reproducible results across multiple function calls.\n See :term:`Glossary `.\n\n Attributes\n ----------\n components_ : ndarray of shape (n_components, n_features)\n Variational parameters for topic word distribution. Since the complete\n conditional for topic word distribution is a Dirichlet,\n ``components_[i, j]`` can be viewed as pseudocount that represents the\n number of times word `j` was assigned to topic `i`.\n It can also be viewed as distribution over the words for each topic\n after normalization:\n ``model.components_ / model.components_.sum(axis=1)[:, np.newaxis]``.\n\n exp_dirichlet_component_ : ndarray of shape (n_components, n_features)\n Exponential value of expectation of log topic word distribution.\n In the literature, this is `exp(E[log(beta)])`.\n\n n_batch_iter_ : int\n Number of iterations of the EM step.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n n_iter_ : int\n Number of passes over the dataset.\n\n bound_ : float\n Final perplexity score on training set.\n\n doc_topic_prior_ : float\n Prior of document topic distribution `theta`. If the value is None,\n it is `1 / n_components`.\n\n random_state_ : RandomState instance\n RandomState instance that is generated either from a seed, the random\n number generator or by `np.random`.\n\n topic_word_prior_ : float\n Prior of topic word distribution `beta`. 
If the value is None, it is\n `1 / n_components`.\n\n See Also\n --------\n sklearn.discriminant_analysis.LinearDiscriminantAnalysis:\n A classifier with a linear decision boundary, generated by fitting\n class conditional densities to the data and using Bayes\u2019 rule.\n\n References\n ----------\n .. [1] \"Online Learning for Latent Dirichlet Allocation\", Matthew D.\n Hoffman, David M. Blei, Francis Bach, 2010\n https://github.com/blei-lab/onlineldavb\n\n .. [2] \"Stochastic Variational Inference\", Matthew D. Hoffman,\n David M. Blei, Chong Wang, John Paisley, 2013\n\n Examples\n --------\n >>> from sklearn.decomposition import LatentDirichletAllocation\n >>> from sklearn.datasets import make_multilabel_classification\n >>> # This produces a feature matrix of token counts, similar to what\n >>> # CountVectorizer would produce on text.\n >>> X, _ = make_multilabel_classification(random_state=0)\n >>> lda = LatentDirichletAllocation(n_components=5,\n ... random_state=0)\n >>> lda.fit(X)\n LatentDirichletAllocation(...)\n >>> # get topics for some given samples:\n >>> lda.transform(X[-2:])\n array([[0.00360392, 0.25499205, 0.0036211 , 0.64236448, 0.09541846],\n [0.15297572, 0.00362644, 0.44412786, 0.39568399, 0.003586 ]])\n ", "source_code": "\n\nclass LatentDirichletAllocation(TransformerMixin, BaseEstimator):\n \"\"\"Latent Dirichlet Allocation with online variational Bayes algorithm.\n\n The implementation is based on [1]_ and [2]_.\n\n .. versionadded:: 0.17\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n n_components : int, default=10\n Number of topics.\n\n .. versionchanged:: 0.19\n ``n_topics`` was renamed to ``n_components``\n\n doc_topic_prior : float, default=None\n Prior of document topic distribution `theta`. If the value is None,\n defaults to `1 / n_components`.\n In [1]_, this is called `alpha`.\n\n topic_word_prior : float, default=None\n Prior of topic word distribution `beta`. If the value is None, defaults\n to `1 / n_components`.\n In [1]_, this is called `eta`.\n\n learning_method : {'batch', 'online'}, default='batch'\n Method used to update `_component`. Only used in :meth:`fit` method.\n In general, if the data size is large, the online update will be much\n faster than the batch update.\n\n Valid options::\n\n 'batch': Batch variational Bayes method. Use all training data in\n each EM update.\n Old `components_` will be overwritten in each iteration.\n 'online': Online variational Bayes method. In each EM update, use\n mini-batch of training data to update the ``components_``\n variable incrementally. The learning rate is controlled by the\n ``learning_decay`` and the ``learning_offset`` parameters.\n\n .. versionchanged:: 0.20\n The default learning method is now ``\"batch\"``.\n\n learning_decay : float, default=0.7\n It is a parameter that control learning rate in the online learning\n method. The value should be set between (0.5, 1.0] to guarantee\n asymptotic convergence. When the value is 0.0 and batch_size is\n ``n_samples``, the update method is same as batch learning. In the\n literature, this is called kappa.\n\n learning_offset : float, default=10.0\n A (positive) parameter that downweights early iterations in online\n learning. It should be greater than 1.0. 
In the literature, this is\n called tau_0.\n\n max_iter : int, default=10\n The maximum number of passes over the training data (aka epochs).\n It only impacts the behavior in the :meth:`fit` method, and not the\n :meth:`partial_fit` method.\n\n batch_size : int, default=128\n Number of documents to use in each EM iteration. Only used in online\n learning.\n\n evaluate_every : int, default=-1\n How often to evaluate perplexity. Only used in `fit` method.\n set it to 0 or negative number to not evaluate perplexity in\n training at all. Evaluating perplexity can help you check convergence\n in training process, but it will also increase total training time.\n Evaluating perplexity in every iteration might increase training time\n up to two-fold.\n\n total_samples : int, default=1e6\n Total number of documents. Only used in the :meth:`partial_fit` method.\n\n perp_tol : float, default=1e-1\n Perplexity tolerance in batch learning. Only used when\n ``evaluate_every`` is greater than 0.\n\n mean_change_tol : float, default=1e-3\n Stopping tolerance for updating document topic distribution in E-step.\n\n max_doc_update_iter : int, default=100\n Max number of iterations for updating document topic distribution in\n the E-step.\n\n n_jobs : int, default=None\n The number of jobs to use in the E-step.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n verbose : int, default=0\n Verbosity level.\n\n random_state : int, RandomState instance or None, default=None\n Pass an int for reproducible results across multiple function calls.\n See :term:`Glossary `.\n\n Attributes\n ----------\n components_ : ndarray of shape (n_components, n_features)\n Variational parameters for topic word distribution. Since the complete\n conditional for topic word distribution is a Dirichlet,\n ``components_[i, j]`` can be viewed as pseudocount that represents the\n number of times word `j` was assigned to topic `i`.\n It can also be viewed as distribution over the words for each topic\n after normalization:\n ``model.components_ / model.components_.sum(axis=1)[:, np.newaxis]``.\n\n exp_dirichlet_component_ : ndarray of shape (n_components, n_features)\n Exponential value of expectation of log topic word distribution.\n In the literature, this is `exp(E[log(beta)])`.\n\n n_batch_iter_ : int\n Number of iterations of the EM step.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n n_iter_ : int\n Number of passes over the dataset.\n\n bound_ : float\n Final perplexity score on training set.\n\n doc_topic_prior_ : float\n Prior of document topic distribution `theta`. If the value is None,\n it is `1 / n_components`.\n\n random_state_ : RandomState instance\n RandomState instance that is generated either from a seed, the random\n number generator or by `np.random`.\n\n topic_word_prior_ : float\n Prior of topic word distribution `beta`. If the value is None, it is\n `1 / n_components`.\n\n See Also\n --------\n sklearn.discriminant_analysis.LinearDiscriminantAnalysis:\n A classifier with a linear decision boundary, generated by fitting\n class conditional densities to the data and using Bayes\u2019 rule.\n\n References\n ----------\n .. 
[1] \"Online Learning for Latent Dirichlet Allocation\", Matthew D.\n Hoffman, David M. Blei, Francis Bach, 2010\n https://github.com/blei-lab/onlineldavb\n\n .. [2] \"Stochastic Variational Inference\", Matthew D. Hoffman,\n David M. Blei, Chong Wang, John Paisley, 2013\n\n Examples\n --------\n >>> from sklearn.decomposition import LatentDirichletAllocation\n >>> from sklearn.datasets import make_multilabel_classification\n >>> # This produces a feature matrix of token counts, similar to what\n >>> # CountVectorizer would produce on text.\n >>> X, _ = make_multilabel_classification(random_state=0)\n >>> lda = LatentDirichletAllocation(n_components=5,\n ... random_state=0)\n >>> lda.fit(X)\n LatentDirichletAllocation(...)\n >>> # get topics for some given samples:\n >>> lda.transform(X[-2:])\n array([[0.00360392, 0.25499205, 0.0036211 , 0.64236448, 0.09541846],\n [0.15297572, 0.00362644, 0.44412786, 0.39568399, 0.003586 ]])\n \"\"\"\n \n def __init__(self, n_components=10, *, doc_topic_prior=None, topic_word_prior=None, learning_method='batch', learning_decay=0.7, learning_offset=10.0, max_iter=10, batch_size=128, evaluate_every=-1, total_samples=1000000.0, perp_tol=0.1, mean_change_tol=0.001, max_doc_update_iter=100, n_jobs=None, verbose=0, random_state=None):\n self.n_components = n_components\n self.doc_topic_prior = doc_topic_prior\n self.topic_word_prior = topic_word_prior\n self.learning_method = learning_method\n self.learning_decay = learning_decay\n self.learning_offset = learning_offset\n self.max_iter = max_iter\n self.batch_size = batch_size\n self.evaluate_every = evaluate_every\n self.total_samples = total_samples\n self.perp_tol = perp_tol\n self.mean_change_tol = mean_change_tol\n self.max_doc_update_iter = max_doc_update_iter\n self.n_jobs = n_jobs\n self.verbose = verbose\n self.random_state = random_state\n \n def _check_params(self):\n \"\"\"Check model parameters.\"\"\"\n if self.n_components <= 0:\n raise ValueError(\"Invalid 'n_components' parameter: %r\" % self.n_components)\n if self.total_samples <= 0:\n raise ValueError(\"Invalid 'total_samples' parameter: %r\" % self.total_samples)\n if self.learning_offset < 0:\n raise ValueError(\"Invalid 'learning_offset' parameter: %r\" % self.learning_offset)\n if self.learning_method not in ('batch', 'online'):\n raise ValueError(\"Invalid 'learning_method' parameter: %r\" % self.learning_method)\n \n def _init_latent_vars(self, n_features):\n \"\"\"Initialize latent variables.\"\"\"\n self.random_state_ = check_random_state(self.random_state)\n self.n_batch_iter_ = 1\n self.n_iter_ = 0\n if self.doc_topic_prior is None:\n self.doc_topic_prior_ = 1.0 / self.n_components\n else:\n self.doc_topic_prior_ = self.doc_topic_prior\n if self.topic_word_prior is None:\n self.topic_word_prior_ = 1.0 / self.n_components\n else:\n self.topic_word_prior_ = self.topic_word_prior\n init_gamma = 100.0\n init_var = 1.0 / init_gamma\n self.components_ = self.random_state_.gamma(init_gamma, init_var, (self.n_components, n_features))\n self.exp_dirichlet_component_ = np.exp(_dirichlet_expectation_2d(self.components_))\n \n def _e_step(self, X, cal_sstats, random_init, parallel=None):\n \"\"\"E-step in EM update.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Document word matrix.\n\n cal_sstats : bool\n Parameter that indicate whether to calculate sufficient statistics\n or not. 
Set ``cal_sstats`` to True when we need to run M-step.\n\n random_init : bool\n Parameter that indicate whether to initialize document topic\n distribution randomly in the E-step. Set it to True in training\n steps.\n\n parallel : joblib.Parallel, default=None\n Pre-initialized instance of joblib.Parallel.\n\n Returns\n -------\n (doc_topic_distr, suff_stats) :\n `doc_topic_distr` is unnormalized topic distribution for each\n document. In the literature, this is called `gamma`.\n `suff_stats` is expected sufficient statistics for the M-step.\n When `cal_sstats == False`, it will be None.\n\n \"\"\"\n random_state = self.random_state_ if random_init else None\n n_jobs = effective_n_jobs(self.n_jobs)\n if parallel is None:\n parallel = Parallel(n_jobs=n_jobs, verbose=max(0, self.verbose - 1))\n results = parallel((delayed(_update_doc_distribution)(X[idx_slice, :], self.exp_dirichlet_component_, self.doc_topic_prior_, self.max_doc_update_iter, self.mean_change_tol, cal_sstats, random_state) for idx_slice in gen_even_slices(X.shape[0], n_jobs)))\n (doc_topics, sstats_list) = zip(*results)\n doc_topic_distr = np.vstack(doc_topics)\n if cal_sstats:\n suff_stats = np.zeros(self.components_.shape)\n for sstats in sstats_list:\n suff_stats += sstats\n suff_stats *= self.exp_dirichlet_component_\n else:\n suff_stats = None\n return doc_topic_distr, suff_stats\n \n def _em_step(self, X, total_samples, batch_update, parallel=None):\n \"\"\"EM update for 1 iteration.\n\n update `_component` by batch VB or online VB.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Document word matrix.\n\n total_samples : int\n Total number of documents. It is only used when\n batch_update is `False`.\n\n batch_update : bool\n Parameter that controls updating method.\n `True` for batch learning, `False` for online learning.\n\n parallel : joblib.Parallel, default=None\n Pre-initialized instance of joblib.Parallel\n\n Returns\n -------\n doc_topic_distr : ndarray of shape (n_samples, n_components)\n Unnormalized document topic distribution.\n \"\"\"\n (_, suff_stats) = self._e_step(X, cal_sstats=True, random_init=True, parallel=parallel)\n if batch_update:\n self.components_ = self.topic_word_prior_ + suff_stats\n else:\n weight = np.power(self.learning_offset + self.n_batch_iter_, -self.learning_decay)\n doc_ratio = float(total_samples) / X.shape[0]\n self.components_ *= 1 - weight\n self.components_ += weight * (self.topic_word_prior_ + doc_ratio * suff_stats)\n self.exp_dirichlet_component_ = np.exp(_dirichlet_expectation_2d(self.components_))\n self.n_batch_iter_ += 1\n return\n \n def _more_tags(self):\n return {'requires_positive_X': True}\n \n def _check_non_neg_array(self, X, reset_n_features, whom):\n \"\"\"check X format\n\n check X format and make sure no negative value in X.\n\n Parameters\n ----------\n X : array-like or sparse matrix\n\n \"\"\"\n X = self._validate_data(X, reset=reset_n_features, accept_sparse='csr')\n check_non_negative(X, whom)\n return X\n \n def partial_fit(self, X, y=None):\n \"\"\"Online VB with Mini-Batch update.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Document word matrix.\n\n y : Ignored\n Not used, present here for API consistency by convention.\n\n Returns\n -------\n self\n Partially fitted estimator.\n \"\"\"\n self._check_params()\n first_time = not hasattr(self, 'components_')\n X = self._check_non_neg_array(X, reset_n_features=first_time, 
whom='LatentDirichletAllocation.partial_fit')\n (n_samples, n_features) = X.shape\n batch_size = self.batch_size\n if first_time:\n self._init_latent_vars(n_features)\n if n_features != self.components_.shape[1]:\n raise ValueError('The provided data has %d dimensions while the model was trained with feature size %d.' % (n_features, self.components_.shape[1]))\n n_jobs = effective_n_jobs(self.n_jobs)\n with Parallel(n_jobs=n_jobs, verbose=max(0, self.verbose - 1)) as parallel:\n for idx_slice in gen_batches(n_samples, batch_size):\n self._em_step(X[idx_slice, :], total_samples=self.total_samples, batch_update=False, parallel=parallel)\n return self\n \n def fit(self, X, y=None):\n \"\"\"Learn model for the data X with variational Bayes method.\n\n When `learning_method` is 'online', use mini-batch update.\n Otherwise, use batch update.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Document word matrix.\n\n y : Ignored\n Not used, present here for API consistency by convention.\n\n Returns\n -------\n self\n Fitted estimator.\n \"\"\"\n self._check_params()\n X = self._check_non_neg_array(X, reset_n_features=True, whom='LatentDirichletAllocation.fit')\n (n_samples, n_features) = X.shape\n max_iter = self.max_iter\n evaluate_every = self.evaluate_every\n learning_method = self.learning_method\n batch_size = self.batch_size\n self._init_latent_vars(n_features)\n last_bound = None\n n_jobs = effective_n_jobs(self.n_jobs)\n with Parallel(n_jobs=n_jobs, verbose=max(0, self.verbose - 1)) as parallel:\n for i in range(max_iter):\n if learning_method == 'online':\n for idx_slice in gen_batches(n_samples, batch_size):\n self._em_step(X[idx_slice, :], total_samples=n_samples, batch_update=False, parallel=parallel)\n else:\n self._em_step(X, total_samples=n_samples, batch_update=True, parallel=parallel)\n if evaluate_every > 0 and (i + 1) % evaluate_every == 0:\n (doc_topics_distr, _) = self._e_step(X, cal_sstats=False, random_init=False, parallel=parallel)\n bound = self._perplexity_precomp_distr(X, doc_topics_distr, sub_sampling=False)\n if self.verbose:\n print('iteration: %d of max_iter: %d, perplexity: %.4f' % (i + 1, max_iter, bound))\n if last_bound and abs(last_bound - bound) < self.perp_tol:\n break\n last_bound = bound\n elif self.verbose:\n print('iteration: %d of max_iter: %d' % (i + 1, max_iter))\n self.n_iter_ += 1\n (doc_topics_distr, _) = self._e_step(X, cal_sstats=False, random_init=False, parallel=parallel)\n self.bound_ = self._perplexity_precomp_distr(X, doc_topics_distr, sub_sampling=False)\n return self\n \n def _unnormalized_transform(self, X):\n \"\"\"Transform data X according to fitted model.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Document word matrix.\n\n Returns\n -------\n doc_topic_distr : ndarray of shape (n_samples, n_components)\n Document topic distribution for X.\n \"\"\"\n (doc_topic_distr, _) = self._e_step(X, cal_sstats=False, random_init=False)\n return doc_topic_distr\n \n def transform(self, X):\n \"\"\"Transform data X according to the fitted model.\n\n .. 
versionchanged:: 0.18\n *doc_topic_distr* is now normalized\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Document word matrix.\n\n Returns\n -------\n doc_topic_distr : ndarray of shape (n_samples, n_components)\n Document topic distribution for X.\n \"\"\"\n check_is_fitted(self)\n X = self._check_non_neg_array(X, reset_n_features=False, whom='LatentDirichletAllocation.transform')\n doc_topic_distr = self._unnormalized_transform(X)\n doc_topic_distr /= doc_topic_distr.sum(axis=1)[:, np.newaxis]\n return doc_topic_distr\n \n def _approx_bound(self, X, doc_topic_distr, sub_sampling):\n \"\"\"Estimate the variational bound.\n\n Estimate the variational bound over \"all documents\" using only the\n documents passed in as X. Since log-likelihood of each word cannot\n be computed directly, we use this bound to estimate it.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Document word matrix.\n\n doc_topic_distr : ndarray of shape (n_samples, n_components)\n Document topic distribution. In the literature, this is called\n gamma.\n\n sub_sampling : bool, default=False\n Compensate for subsampling of documents.\n It is used in calculate bound in online learning.\n\n Returns\n -------\n score : float\n\n \"\"\"\n \n def _loglikelihood(prior, distr, dirichlet_distr, size):\n score = np.sum((prior - distr) * dirichlet_distr)\n score += np.sum(gammaln(distr) - gammaln(prior))\n score += np.sum(gammaln(prior * size) - gammaln(np.sum(distr, 1)))\n return score\n is_sparse_x = sp.issparse(X)\n (n_samples, n_components) = doc_topic_distr.shape\n n_features = self.components_.shape[1]\n score = 0\n dirichlet_doc_topic = _dirichlet_expectation_2d(doc_topic_distr)\n dirichlet_component_ = _dirichlet_expectation_2d(self.components_)\n doc_topic_prior = self.doc_topic_prior_\n topic_word_prior = self.topic_word_prior_\n if is_sparse_x:\n X_data = X.data\n X_indices = X.indices\n X_indptr = X.indptr\n for idx_d in range(0, n_samples):\n if is_sparse_x:\n ids = X_indices[X_indptr[idx_d]:X_indptr[idx_d + 1]]\n cnts = X_data[X_indptr[idx_d]:X_indptr[idx_d + 1]]\n else:\n ids = np.nonzero(X[idx_d, :])[0]\n cnts = X[idx_d, ids]\n temp = dirichlet_doc_topic[idx_d, :, np.newaxis] + dirichlet_component_[:, ids]\n norm_phi = logsumexp(temp, axis=0)\n score += np.dot(cnts, norm_phi)\n score += _loglikelihood(doc_topic_prior, doc_topic_distr, dirichlet_doc_topic, self.n_components)\n if sub_sampling:\n doc_ratio = float(self.total_samples) / n_samples\n score *= doc_ratio\n score += _loglikelihood(topic_word_prior, self.components_, dirichlet_component_, n_features)\n return score\n \n def score(self, X, y=None):\n \"\"\"Calculate approximate log-likelihood as score.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Document word matrix.\n\n y : Ignored\n Not used, present here for API consistency by convention.\n\n Returns\n -------\n score : float\n Use approximate bound as score.\n \"\"\"\n check_is_fitted(self)\n X = self._check_non_neg_array(X, reset_n_features=False, whom='LatentDirichletAllocation.score')\n doc_topic_distr = self._unnormalized_transform(X)\n score = self._approx_bound(X, doc_topic_distr, sub_sampling=False)\n return score\n \n def _perplexity_precomp_distr(self, X, doc_topic_distr=None, sub_sampling=False):\n \"\"\"Calculate approximate perplexity for data X with ability to accept\n precomputed doc_topic_distr\n\n Perplexity is defined as exp(-1. 
* log-likelihood per word)\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Document word matrix.\n\n doc_topic_distr : ndarray of shape (n_samples, n_components), default=None\n Document topic distribution.\n If it is None, it will be generated by applying transform on X.\n\n Returns\n -------\n score : float\n Perplexity score.\n \"\"\"\n if doc_topic_distr is None:\n doc_topic_distr = self._unnormalized_transform(X)\n else:\n (n_samples, n_components) = doc_topic_distr.shape\n if n_samples != X.shape[0]:\n raise ValueError('Number of samples in X and doc_topic_distr do not match.')\n if n_components != self.n_components:\n raise ValueError('Number of topics does not match.')\n current_samples = X.shape[0]\n bound = self._approx_bound(X, doc_topic_distr, sub_sampling)\n if sub_sampling:\n word_cnt = X.sum() * (float(self.total_samples) / current_samples)\n else:\n word_cnt = X.sum()\n perword_bound = bound / word_cnt\n return np.exp(-1.0 * perword_bound)\n \n def perplexity(self, X, sub_sampling=False):\n \"\"\"Calculate approximate perplexity for data X.\n\n Perplexity is defined as exp(-1. * log-likelihood per word)\n\n .. versionchanged:: 0.19\n *doc_topic_distr* argument has been deprecated and is ignored\n because user no longer has access to unnormalized distribution\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Document word matrix.\n\n sub_sampling : bool\n Do sub-sampling or not.\n\n Returns\n -------\n score : float\n Perplexity score.\n \"\"\"\n check_is_fitted(self)\n X = self._check_non_neg_array(X, reset_n_features=True, whom='LatentDirichletAllocation.perplexity')\n return self._perplexity_precomp_distr(X, sub_sampling=sub_sampling)\n" }, @@ -20623,7 +20689,7 @@ "sklearn.decomposition._nmf.NMF.inverse_transform" ], "is_public": true, - "description": "Non-Negative Matrix Factorization (NMF).\n\nFind two non-negative matrices (W, H) whose product approximates the non- negative matrix X. This factorization can be used for example for dimensionality reduction, source separation or topic extraction. The objective function is: .. math:: 0.5 * ||X - WH||_{loss}^2 + alpha\\_W * l1_{ratio} * n\\_features * ||vec(W)||_1 + alpha\\_H * l1_{ratio} * n\\_samples * ||vec(H)||_1 + 0.5 * alpha\\_W * (1 - l1_{ratio}) * n\\_features * ||W||_{Fro}^2 + 0.5 * alpha\\_H * (1 - l1_{ratio}) * n\\_samples * ||H||_{Fro}^2 Where: :math:`||A||_{Fro}^2 = \\sum_{i,j} A_{ij}^2` (Frobenius norm) :math:`||vec(A)||_1 = \\sum_{i,j} abs(A_{ij})` (Elementwise L1 norm) The generic norm :math:`||X - WH||_{loss}` may represent the Frobenius norm or another supported beta-divergence loss. The choice between options is controlled by the `beta_loss` parameter. The regularization terms are scaled by `n_features` for `W` and by `n_samples` for `H` to keep their impact balanced with respect to one another and to the data fit term as independent as possible of the size `n_samples` of the training set. The objective function is minimized with an alternating minimization of W and H. Read more in the :ref:`User Guide `.", + "description": "Non-Negative Matrix Factorization (NMF).\n\nFind two non-negative matrices (W, H) whose product approximates the non-\nnegative matrix X. This factorization can be used for example for\ndimensionality reduction, source separation or topic extraction.\n\nThe objective function is:\n\n .. 
math::\n\n 0.5 * ||X - WH||_{loss}^2\n\n + alpha\\_W * l1_{ratio} * n\\_features * ||vec(W)||_1\n\n + alpha\\_H * l1_{ratio} * n\\_samples * ||vec(H)||_1\n\n + 0.5 * alpha\\_W * (1 - l1_{ratio}) * n\\_features * ||W||_{Fro}^2\n\n + 0.5 * alpha\\_H * (1 - l1_{ratio}) * n\\_samples * ||H||_{Fro}^2\n\nWhere:\n\n:math:`||A||_{Fro}^2 = \\sum_{i,j} A_{ij}^2` (Frobenius norm)\n\n:math:`||vec(A)||_1 = \\sum_{i,j} abs(A_{ij})` (Elementwise L1 norm)\n\nThe generic norm :math:`||X - WH||_{loss}` may represent\nthe Frobenius norm or another supported beta-divergence loss.\nThe choice between options is controlled by the `beta_loss` parameter.\n\nThe regularization terms are scaled by `n_features` for `W` and by `n_samples` for\n`H` to keep their impact balanced with respect to one another and to the data fit\nterm as independent as possible of the size `n_samples` of the training set.\n\nThe objective function is minimized with an alternating minimization of W\nand H.\n\nRead more in the :ref:`User Guide `.", "docstring": "Non-Negative Matrix Factorization (NMF).\n\n Find two non-negative matrices (W, H) whose product approximates the non-\n negative matrix X. This factorization can be used for example for\n dimensionality reduction, source separation or topic extraction.\n\n The objective function is:\n\n .. math::\n\n 0.5 * ||X - WH||_{loss}^2\n\n + alpha\\_W * l1_{ratio} * n\\_features * ||vec(W)||_1\n\n + alpha\\_H * l1_{ratio} * n\\_samples * ||vec(H)||_1\n\n + 0.5 * alpha\\_W * (1 - l1_{ratio}) * n\\_features * ||W||_{Fro}^2\n\n + 0.5 * alpha\\_H * (1 - l1_{ratio}) * n\\_samples * ||H||_{Fro}^2\n\n Where:\n\n :math:`||A||_{Fro}^2 = \\sum_{i,j} A_{ij}^2` (Frobenius norm)\n\n :math:`||vec(A)||_1 = \\sum_{i,j} abs(A_{ij})` (Elementwise L1 norm)\n\n The generic norm :math:`||X - WH||_{loss}` may represent\n the Frobenius norm or another supported beta-divergence loss.\n The choice between options is controlled by the `beta_loss` parameter.\n\n The regularization terms are scaled by `n_features` for `W` and by `n_samples` for\n `H` to keep their impact balanced with respect to one another and to the data fit\n term as independent as possible of the size `n_samples` of the training set.\n\n The objective function is minimized with an alternating minimization of W\n and H.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n n_components : int, default=None\n Number of components, if n_components is not set all features\n are kept.\n\n init : {'random', 'nndsvd', 'nndsvda', 'nndsvdar', 'custom'}, default=None\n Method used to initialize the procedure.\n Default: None.\n Valid options:\n\n - `None`: 'nndsvd' if n_components <= min(n_samples, n_features),\n otherwise random.\n\n - `'random'`: non-negative random matrices, scaled with:\n sqrt(X.mean() / n_components)\n\n - `'nndsvd'`: Nonnegative Double Singular Value Decomposition (NNDSVD)\n initialization (better for sparseness)\n\n - `'nndsvda'`: NNDSVD with zeros filled with the average of X\n (better when sparsity is not desired)\n\n - `'nndsvdar'` NNDSVD with zeros filled with small random values\n (generally faster, less accurate alternative to NNDSVDa\n for when sparsity is not desired)\n\n - `'custom'`: use custom matrices W and H\n\n solver : {'cd', 'mu'}, default='cd'\n Numerical solver to use:\n 'cd' is a Coordinate Descent solver.\n 'mu' is a Multiplicative Update solver.\n\n .. versionadded:: 0.17\n Coordinate Descent solver.\n\n .. 
versionadded:: 0.19\n Multiplicative Update solver.\n\n beta_loss : float or {'frobenius', 'kullback-leibler', 'itakura-saito'}, default='frobenius'\n Beta divergence to be minimized, measuring the distance between X\n and the dot product WH. Note that values different from 'frobenius'\n (or 2) and 'kullback-leibler' (or 1) lead to significantly slower\n fits. Note that for beta_loss <= 0 (or 'itakura-saito'), the input\n matrix X cannot contain zeros. Used only in 'mu' solver.\n\n .. versionadded:: 0.19\n\n tol : float, default=1e-4\n Tolerance of the stopping condition.\n\n max_iter : int, default=200\n Maximum number of iterations before timing out.\n\n random_state : int, RandomState instance or None, default=None\n Used for initialisation (when ``init`` == 'nndsvdar' or\n 'random'), and in Coordinate Descent. Pass an int for reproducible\n results across multiple function calls.\n See :term:`Glossary `.\n\n alpha : float, default=0.0\n Constant that multiplies the regularization terms. Set it to zero to\n have no regularization. When using `alpha` instead of `alpha_W` and `alpha_H`,\n the regularization terms are not scaled by the `n_features` (resp. `n_samples`)\n factors for `W` (resp. `H`).\n\n .. versionadded:: 0.17\n *alpha* used in the Coordinate Descent solver.\n\n .. deprecated:: 1.0\n The `alpha` parameter is deprecated in 1.0 and will be removed in 1.2.\n Use `alpha_W` and `alpha_H` instead.\n\n alpha_W : float, default=0.0\n Constant that multiplies the regularization terms of `W`. Set it to zero\n (default) to have no regularization on `W`.\n\n .. versionadded:: 1.0\n\n alpha_H : float or \"same\", default=\"same\"\n Constant that multiplies the regularization terms of `H`. Set it to zero to\n have no regularization on `H`. If \"same\" (default), it takes the same value as\n `alpha_W`.\n\n .. versionadded:: 1.0\n\n l1_ratio : float, default=0.0\n The regularization mixing parameter, with 0 <= l1_ratio <= 1.\n For l1_ratio = 0 the penalty is an elementwise L2 penalty\n (aka Frobenius Norm).\n For l1_ratio = 1 it is an elementwise L1 penalty.\n For 0 < l1_ratio < 1, the penalty is a combination of L1 and L2.\n\n .. versionadded:: 0.17\n Regularization parameter *l1_ratio* used in the Coordinate Descent\n solver.\n\n verbose : int, default=0\n Whether to be verbose.\n\n shuffle : bool, default=False\n If true, randomize the order of coordinates in the CD solver.\n\n .. versionadded:: 0.17\n *shuffle* parameter used in the Coordinate Descent solver.\n\n regularization : {'both', 'components', 'transformation', None}, default='both'\n Select whether the regularization affects the components (H), the\n transformation (W), both or none of them.\n\n .. versionadded:: 0.24\n\n .. deprecated:: 1.0\n The `regularization` parameter is deprecated in 1.0 and will be removed in\n 1.2. Use `alpha_W` and `alpha_H` instead.\n\n Attributes\n ----------\n components_ : ndarray of shape (n_components, n_features)\n Factorization matrix, sometimes called 'dictionary'.\n\n n_components_ : int\n The number of components. It is same as the `n_components` parameter\n if it was given. Otherwise, it will be same as the number of\n features.\n\n reconstruction_err_ : float\n Frobenius norm of the matrix difference, or beta-divergence, between\n the training data ``X`` and the reconstructed data ``WH`` from\n the fitted model.\n\n n_iter_ : int\n Actual number of iterations.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. 
versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n DictionaryLearning : Find a dictionary that sparsely encodes data.\n MiniBatchSparsePCA : Mini-batch Sparse Principal Components Analysis.\n PCA : Principal component analysis.\n SparseCoder : Find a sparse representation of data from a fixed,\n precomputed dictionary.\n SparsePCA : Sparse Principal Components Analysis.\n TruncatedSVD : Dimensionality reduction using truncated SVD.\n\n References\n ----------\n Cichocki, Andrzej, and P. H. A. N. Anh-Huy. \"Fast local algorithms for\n large scale nonnegative matrix and tensor factorizations.\"\n IEICE transactions on fundamentals of electronics, communications and\n computer sciences 92.3: 708-721, 2009.\n\n Fevotte, C., & Idier, J. (2011). Algorithms for nonnegative matrix\n factorization with the beta-divergence. Neural Computation, 23(9).\n\n Examples\n --------\n >>> import numpy as np\n >>> X = np.array([[1, 1], [2, 1], [3, 1.2], [4, 1], [5, 0.8], [6, 1]])\n >>> from sklearn.decomposition import NMF\n >>> model = NMF(n_components=2, init='random', random_state=0)\n >>> W = model.fit_transform(X)\n >>> H = model.components_\n ", "source_code": "\n\nclass NMF(TransformerMixin, BaseEstimator):\n \"\"\"Non-Negative Matrix Factorization (NMF).\n\n Find two non-negative matrices (W, H) whose product approximates the non-\n negative matrix X. This factorization can be used for example for\n dimensionality reduction, source separation or topic extraction.\n\n The objective function is:\n\n .. math::\n\n 0.5 * ||X - WH||_{loss}^2\n\n + alpha\\_W * l1_{ratio} * n\\_features * ||vec(W)||_1\n\n + alpha\\_H * l1_{ratio} * n\\_samples * ||vec(H)||_1\n\n + 0.5 * alpha\\_W * (1 - l1_{ratio}) * n\\_features * ||W||_{Fro}^2\n\n + 0.5 * alpha\\_H * (1 - l1_{ratio}) * n\\_samples * ||H||_{Fro}^2\n\n Where:\n\n :math:`||A||_{Fro}^2 = \\sum_{i,j} A_{ij}^2` (Frobenius norm)\n\n :math:`||vec(A)||_1 = \\sum_{i,j} abs(A_{ij})` (Elementwise L1 norm)\n\n The generic norm :math:`||X - WH||_{loss}` may represent\n the Frobenius norm or another supported beta-divergence loss.\n The choice between options is controlled by the `beta_loss` parameter.\n\n The regularization terms are scaled by `n_features` for `W` and by `n_samples` for\n `H` to keep their impact balanced with respect to one another and to the data fit\n term as independent as possible of the size `n_samples` of the training set.\n\n The objective function is minimized with an alternating minimization of W\n and H.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n n_components : int, default=None\n Number of components, if n_components is not set all features\n are kept.\n\n init : {'random', 'nndsvd', 'nndsvda', 'nndsvdar', 'custom'}, default=None\n Method used to initialize the procedure.\n Default: None.\n Valid options:\n\n - `None`: 'nndsvd' if n_components <= min(n_samples, n_features),\n otherwise random.\n\n - `'random'`: non-negative random matrices, scaled with:\n sqrt(X.mean() / n_components)\n\n - `'nndsvd'`: Nonnegative Double Singular Value Decomposition (NNDSVD)\n initialization (better for sparseness)\n\n - `'nndsvda'`: NNDSVD with zeros filled with the average of X\n (better when sparsity is not desired)\n\n - `'nndsvdar'` NNDSVD with zeros filled with small random values\n (generally faster, less accurate alternative to 
NNDSVDa\n for when sparsity is not desired)\n\n - `'custom'`: use custom matrices W and H\n\n solver : {'cd', 'mu'}, default='cd'\n Numerical solver to use:\n 'cd' is a Coordinate Descent solver.\n 'mu' is a Multiplicative Update solver.\n\n .. versionadded:: 0.17\n Coordinate Descent solver.\n\n .. versionadded:: 0.19\n Multiplicative Update solver.\n\n beta_loss : float or {'frobenius', 'kullback-leibler', 'itakura-saito'}, default='frobenius'\n Beta divergence to be minimized, measuring the distance between X\n and the dot product WH. Note that values different from 'frobenius'\n (or 2) and 'kullback-leibler' (or 1) lead to significantly slower\n fits. Note that for beta_loss <= 0 (or 'itakura-saito'), the input\n matrix X cannot contain zeros. Used only in 'mu' solver.\n\n .. versionadded:: 0.19\n\n tol : float, default=1e-4\n Tolerance of the stopping condition.\n\n max_iter : int, default=200\n Maximum number of iterations before timing out.\n\n random_state : int, RandomState instance or None, default=None\n Used for initialisation (when ``init`` == 'nndsvdar' or\n 'random'), and in Coordinate Descent. Pass an int for reproducible\n results across multiple function calls.\n See :term:`Glossary `.\n\n alpha : float, default=0.0\n Constant that multiplies the regularization terms. Set it to zero to\n have no regularization. When using `alpha` instead of `alpha_W` and `alpha_H`,\n the regularization terms are not scaled by the `n_features` (resp. `n_samples`)\n factors for `W` (resp. `H`).\n\n .. versionadded:: 0.17\n *alpha* used in the Coordinate Descent solver.\n\n .. deprecated:: 1.0\n The `alpha` parameter is deprecated in 1.0 and will be removed in 1.2.\n Use `alpha_W` and `alpha_H` instead.\n\n alpha_W : float, default=0.0\n Constant that multiplies the regularization terms of `W`. Set it to zero\n (default) to have no regularization on `W`.\n\n .. versionadded:: 1.0\n\n alpha_H : float or \"same\", default=\"same\"\n Constant that multiplies the regularization terms of `H`. Set it to zero to\n have no regularization on `H`. If \"same\" (default), it takes the same value as\n `alpha_W`.\n\n .. versionadded:: 1.0\n\n l1_ratio : float, default=0.0\n The regularization mixing parameter, with 0 <= l1_ratio <= 1.\n For l1_ratio = 0 the penalty is an elementwise L2 penalty\n (aka Frobenius Norm).\n For l1_ratio = 1 it is an elementwise L1 penalty.\n For 0 < l1_ratio < 1, the penalty is a combination of L1 and L2.\n\n .. versionadded:: 0.17\n Regularization parameter *l1_ratio* used in the Coordinate Descent\n solver.\n\n verbose : int, default=0\n Whether to be verbose.\n\n shuffle : bool, default=False\n If true, randomize the order of coordinates in the CD solver.\n\n .. versionadded:: 0.17\n *shuffle* parameter used in the Coordinate Descent solver.\n\n regularization : {'both', 'components', 'transformation', None}, default='both'\n Select whether the regularization affects the components (H), the\n transformation (W), both or none of them.\n\n .. versionadded:: 0.24\n\n .. deprecated:: 1.0\n The `regularization` parameter is deprecated in 1.0 and will be removed in\n 1.2. Use `alpha_W` and `alpha_H` instead.\n\n Attributes\n ----------\n components_ : ndarray of shape (n_components, n_features)\n Factorization matrix, sometimes called 'dictionary'.\n\n n_components_ : int\n The number of components. It is same as the `n_components` parameter\n if it was given. 
Otherwise, it will be same as the number of\n features.\n\n reconstruction_err_ : float\n Frobenius norm of the matrix difference, or beta-divergence, between\n the training data ``X`` and the reconstructed data ``WH`` from\n the fitted model.\n\n n_iter_ : int\n Actual number of iterations.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n DictionaryLearning : Find a dictionary that sparsely encodes data.\n MiniBatchSparsePCA : Mini-batch Sparse Principal Components Analysis.\n PCA : Principal component analysis.\n SparseCoder : Find a sparse representation of data from a fixed,\n precomputed dictionary.\n SparsePCA : Sparse Principal Components Analysis.\n TruncatedSVD : Dimensionality reduction using truncated SVD.\n\n References\n ----------\n Cichocki, Andrzej, and P. H. A. N. Anh-Huy. \"Fast local algorithms for\n large scale nonnegative matrix and tensor factorizations.\"\n IEICE transactions on fundamentals of electronics, communications and\n computer sciences 92.3: 708-721, 2009.\n\n Fevotte, C., & Idier, J. (2011). Algorithms for nonnegative matrix\n factorization with the beta-divergence. Neural Computation, 23(9).\n\n Examples\n --------\n >>> import numpy as np\n >>> X = np.array([[1, 1], [2, 1], [3, 1.2], [4, 1], [5, 0.8], [6, 1]])\n >>> from sklearn.decomposition import NMF\n >>> model = NMF(n_components=2, init='random', random_state=0)\n >>> W = model.fit_transform(X)\n >>> H = model.components_\n \"\"\"\n \n def __init__(self, n_components=None, *, init='warn', solver='cd', beta_loss='frobenius', tol=0.0001, max_iter=200, random_state=None, alpha='deprecated', alpha_W=0.0, alpha_H='same', l1_ratio=0.0, verbose=0, shuffle=False, regularization='deprecated'):\n self.n_components = n_components\n self.init = init\n self.solver = solver\n self.beta_loss = beta_loss\n self.tol = tol\n self.max_iter = max_iter\n self.random_state = random_state\n self.alpha = alpha\n self.alpha_W = alpha_W\n self.alpha_H = alpha_H\n self.l1_ratio = l1_ratio\n self.verbose = verbose\n self.shuffle = shuffle\n self.regularization = regularization\n \n def _more_tags(self):\n return {'requires_positive_X': True}\n \n def _check_params(self, X):\n self._n_components = self.n_components\n if self._n_components is None:\n self._n_components = X.shape[1]\n if not isinstance(self._n_components, numbers.Integral) or self._n_components <= 0:\n raise ValueError(f'Number of components must be a positive integer; got (n_components={self._n_components!r})')\n if not isinstance(self.max_iter, numbers.Integral) or self.max_iter < 0:\n raise ValueError(f'Maximum number of iterations must be a positive integer; got (max_iter={self.max_iter!r})')\n if not isinstance(self.tol, numbers.Number) or self.tol < 0:\n raise ValueError(f'Tolerance for stopping criteria must be positive; got (tol={self.tol!r})')\n self._beta_loss = _beta_loss_to_float(self.beta_loss)\n allowed_solver = ('cd', 'mu')\n if self.solver not in allowed_solver:\n raise ValueError(f'Invalid solver parameter: got {self.solver!r} instead of one of {allowed_solver}')\n if self.solver != 'mu' and self.beta_loss not in (2, 'frobenius'):\n raise ValueError(f'Invalid beta_loss parameter: solver {self.solver!r} does not handle beta_loss = {self.beta_loss!r}')\n if self.solver == 'mu' and 
self.init == 'nndsvd':\n warnings.warn(\"The multiplicative update ('mu') solver cannot update zeros present in the initialization, and so leads to poorer results when used jointly with init='nndsvd'. You may try init='nndsvda' or init='nndsvdar' instead.\", UserWarning)\n if self.alpha != 'deprecated':\n warnings.warn('`alpha` was deprecated in version 1.0 and will be removed in 1.2. Use `alpha_W` and `alpha_H` instead', FutureWarning)\n alpha = self.alpha\n else:\n alpha = 0.0\n if self.regularization != 'deprecated':\n warnings.warn('`regularization` was deprecated in version 1.0 and will be removed in 1.2. Use `alpha_W` and `alpha_H` instead', FutureWarning)\n allowed_regularization = ('both', 'components', 'transformation', None)\n if self.regularization not in allowed_regularization:\n raise ValueError(f'Invalid regularization parameter: got {self.regularization!r} instead of one of {allowed_regularization}')\n regularization = self.regularization\n else:\n regularization = 'both'\n (self._l1_reg_W, self._l1_reg_H, self._l2_reg_W, self._l2_reg_H) = _compute_regularization(alpha, self.alpha_W, self.alpha_H, self.l1_ratio, regularization)\n return self\n \n def _check_w_h(self, X, W, H, update_H):\n (n_samples, n_features) = X.shape\n if self.init == 'custom' and update_H:\n _check_init(H, (self._n_components, n_features), 'NMF (input H)')\n _check_init(W, (n_samples, self._n_components), 'NMF (input W)')\n if H.dtype != X.dtype or W.dtype != X.dtype:\n raise TypeError('H and W should have the same dtype as X. Got H.dtype = {} and W.dtype = {}.'.format(H.dtype, W.dtype))\n elif not update_H:\n _check_init(H, (self._n_components, n_features), 'NMF (input H)')\n if H.dtype != X.dtype:\n raise TypeError('H should have the same dtype as X. Got H.dtype = {}.'.format(H.dtype))\n if self.solver == 'mu':\n avg = np.sqrt(X.mean() / self._n_components)\n W = np.full((n_samples, self._n_components), avg, dtype=X.dtype)\n else:\n W = np.zeros((n_samples, self._n_components), dtype=X.dtype)\n else:\n (W, H) = _initialize_nmf(X, self._n_components, init=self.init, random_state=self.random_state)\n return W, H\n \n def _scale_regularization(self, X):\n (n_samples, n_features) = X.shape\n if self.alpha_W != 0 or self.alpha_H != 'same':\n l1_reg_W = n_features * self._l1_reg_W\n l1_reg_H = n_samples * self._l1_reg_H\n l2_reg_W = n_features * self._l2_reg_W\n l2_reg_H = n_samples * self._l2_reg_H\n else:\n l1_reg_W = self._l1_reg_W\n l1_reg_H = self._l1_reg_H\n l2_reg_W = self._l2_reg_W\n l2_reg_H = self._l2_reg_H\n return l1_reg_W, l1_reg_H, l2_reg_W, l2_reg_H\n \n def fit_transform(self, X, y=None, W=None, H=None):\n \"\"\"Learn a NMF model for the data X and returns the transformed data.\n\n This is more efficient than calling fit followed by transform.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training vector, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n W : array-like of shape (n_samples, n_components)\n If init='custom', it is used as initial guess for the solution.\n\n H : array-like of shape (n_components, n_features)\n If init='custom', it is used as initial guess for the solution.\n\n Returns\n -------\n W : ndarray of shape (n_samples, n_components)\n Transformed data.\n \"\"\"\n X = self._validate_data(X, accept_sparse=('csr', 'csc'), dtype=[np.float64, np.float32])\n with config_context(assume_finite=True):\n (W, H, 
n_iter) = self._fit_transform(X, W=W, H=H)\n self.reconstruction_err_ = _beta_divergence(X, W, H, self._beta_loss, square_root=True)\n self.n_components_ = H.shape[0]\n self.components_ = H\n self.n_iter_ = n_iter\n return W\n \n def _fit_transform(self, X, y=None, W=None, H=None, update_H=True):\n \"\"\"Learn a NMF model for the data X and returns the transformed data.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Data matrix to be decomposed\n\n y : Ignored\n\n W : array-like of shape (n_samples, n_components)\n If init='custom', it is used as initial guess for the solution.\n\n H : array-like of shape (n_components, n_features)\n If init='custom', it is used as initial guess for the solution.\n If update_H=False, it is used as a constant, to solve for W only.\n\n update_H : bool, default=True\n If True, both W and H will be estimated from initial guesses,\n this corresponds to a call to the 'fit_transform' method.\n If False, only W will be estimated, this corresponds to a call\n to the 'transform' method.\n\n Returns\n -------\n W : ndarray of shape (n_samples, n_components)\n Transformed data.\n\n H : ndarray of shape (n_components, n_features)\n Factorization matrix, sometimes called 'dictionary'.\n\n n_iter_ : int\n Actual number of iterations.\n \"\"\"\n check_non_negative(X, 'NMF (input X)')\n self._check_params(X)\n if X.min() == 0 and self._beta_loss <= 0:\n raise ValueError('When beta_loss <= 0 and X contains zeros, the solver may diverge. Please add small values to X, or use a positive beta_loss.')\n (W, H) = self._check_w_h(X, W, H, update_H)\n (l1_reg_W, l1_reg_H, l2_reg_W, l2_reg_H) = self._scale_regularization(X)\n if self.solver == 'cd':\n (W, H, n_iter) = _fit_coordinate_descent(X, W, H, self.tol, self.max_iter, l1_reg_W, l1_reg_H, l2_reg_W, l2_reg_H, update_H=update_H, verbose=self.verbose, shuffle=self.shuffle, random_state=self.random_state)\n elif self.solver == 'mu':\n (W, H, n_iter) = _fit_multiplicative_update(X, W, H, self._beta_loss, self.max_iter, self.tol, l1_reg_W, l1_reg_H, l2_reg_W, l2_reg_H, update_H=update_H, verbose=self.verbose)\n else:\n raise ValueError(\"Invalid solver parameter '%s'.\" % self.solver)\n if n_iter == self.max_iter and self.tol > 0:\n warnings.warn('Maximum number of iterations %d reached. Increase it to improve convergence.' 
% self.max_iter, ConvergenceWarning)\n return W, H, n_iter\n \n def fit(self, X, y=None, **params):\n \"\"\"Learn a NMF model for the data X.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training vector, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n **params : kwargs\n Parameters (keyword arguments) and values passed to\n the fit_transform instance.\n\n Returns\n -------\n self : object\n Returns the instance itself.\n \"\"\"\n self.fit_transform(X, **params)\n return self\n \n def transform(self, X):\n \"\"\"Transform the data X according to the fitted NMF model.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training vector, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n Returns\n -------\n W : ndarray of shape (n_samples, n_components)\n Transformed data.\n \"\"\"\n check_is_fitted(self)\n X = self._validate_data(X, accept_sparse=('csr', 'csc'), dtype=[np.float64, np.float32], reset=False)\n with config_context(assume_finite=True):\n (W, *_) = self._fit_transform(X, H=self.components_, update_H=False)\n return W\n \n def inverse_transform(self, W):\n \"\"\"Transform data back to its original space.\n\n .. versionadded:: 0.18\n\n Parameters\n ----------\n W : {ndarray, sparse matrix} of shape (n_samples, n_components)\n Transformed data matrix.\n\n Returns\n -------\n X : {ndarray, sparse matrix} of shape (n_samples, n_features)\n Returns a data matrix of the original shape.\n \"\"\"\n check_is_fitted(self)\n return np.dot(W, self.components_)\n" }, @@ -20644,7 +20710,7 @@ "sklearn.decomposition._pca.PCA._more_tags" ], "is_public": true, - "description": "Principal component analysis (PCA).\n\nLinear dimensionality reduction using Singular Value Decomposition of the data to project it to a lower dimensional space. The input data is centered but not scaled for each feature before applying the SVD. It uses the LAPACK implementation of the full SVD or a randomized truncated SVD by the method of Halko et al. 2009, depending on the shape of the input data and the number of components to extract. It can also use the scipy.sparse.linalg ARPACK implementation of the truncated SVD. Notice that this class does not support sparse input. See :class:`TruncatedSVD` for an alternative with sparse data. Read more in the :ref:`User Guide `.", + "description": "Principal component analysis (PCA).\n\nLinear dimensionality reduction using Singular Value Decomposition of the\ndata to project it to a lower dimensional space. The input data is centered\nbut not scaled for each feature before applying the SVD.\n\nIt uses the LAPACK implementation of the full SVD or a randomized truncated\nSVD by the method of Halko et al. 2009, depending on the shape of the input\ndata and the number of components to extract.\n\nIt can also use the scipy.sparse.linalg ARPACK implementation of the\ntruncated SVD.\n\nNotice that this class does not support sparse input. See\n:class:`TruncatedSVD` for an alternative with sparse data.\n\nRead more in the :ref:`User Guide `.", "docstring": "Principal component analysis (PCA).\n\n Linear dimensionality reduction using Singular Value Decomposition of the\n data to project it to a lower dimensional space. 
The input data is centered\n but not scaled for each feature before applying the SVD.\n\n It uses the LAPACK implementation of the full SVD or a randomized truncated\n SVD by the method of Halko et al. 2009, depending on the shape of the input\n data and the number of components to extract.\n\n It can also use the scipy.sparse.linalg ARPACK implementation of the\n truncated SVD.\n\n Notice that this class does not support sparse input. See\n :class:`TruncatedSVD` for an alternative with sparse data.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n n_components : int, float or 'mle', default=None\n Number of components to keep.\n if n_components is not set all components are kept::\n\n n_components == min(n_samples, n_features)\n\n If ``n_components == 'mle'`` and ``svd_solver == 'full'``, Minka's\n MLE is used to guess the dimension. Use of ``n_components == 'mle'``\n will interpret ``svd_solver == 'auto'`` as ``svd_solver == 'full'``.\n\n If ``0 < n_components < 1`` and ``svd_solver == 'full'``, select the\n number of components such that the amount of variance that needs to be\n explained is greater than the percentage specified by n_components.\n\n If ``svd_solver == 'arpack'``, the number of components must be\n strictly less than the minimum of n_features and n_samples.\n\n Hence, the None case results in::\n\n n_components == min(n_samples, n_features) - 1\n\n copy : bool, default=True\n If False, data passed to fit are overwritten and running\n fit(X).transform(X) will not yield the expected results,\n use fit_transform(X) instead.\n\n whiten : bool, default=False\n When True (False by default) the `components_` vectors are multiplied\n by the square root of n_samples and then divided by the singular values\n to ensure uncorrelated outputs with unit component-wise variances.\n\n Whitening will remove some information from the transformed signal\n (the relative variance scales of the components) but can sometime\n improve the predictive accuracy of the downstream estimators by\n making their data respect some hard-wired assumptions.\n\n svd_solver : {'auto', 'full', 'arpack', 'randomized'}, default='auto'\n If auto :\n The solver is selected by a default policy based on `X.shape` and\n `n_components`: if the input data is larger than 500x500 and the\n number of components to extract is lower than 80% of the smallest\n dimension of the data, then the more efficient 'randomized'\n method is enabled. Otherwise the exact full SVD is computed and\n optionally truncated afterwards.\n If full :\n run exact full SVD calling the standard LAPACK solver via\n `scipy.linalg.svd` and select the components by postprocessing\n If arpack :\n run SVD truncated to n_components calling ARPACK solver via\n `scipy.sparse.linalg.svds`. It requires strictly\n 0 < n_components < min(X.shape)\n If randomized :\n run randomized SVD by the method of Halko et al.\n\n .. versionadded:: 0.18.0\n\n tol : float, default=0.0\n Tolerance for singular values computed by svd_solver == 'arpack'.\n Must be of range [0.0, infinity).\n\n .. versionadded:: 0.18.0\n\n iterated_power : int or 'auto', default='auto'\n Number of iterations for the power method computed by\n svd_solver == 'randomized'.\n Must be of range [0, infinity).\n\n .. versionadded:: 0.18.0\n\n random_state : int, RandomState instance or None, default=None\n Used when the 'arpack' or 'randomized' solvers are used. Pass an int\n for reproducible results across multiple function calls.\n See :term:`Glossary `.\n\n .. 
versionadded:: 0.18.0\n\n Attributes\n ----------\n components_ : ndarray of shape (n_components, n_features)\n Principal axes in feature space, representing the directions of\n maximum variance in the data. Equivalently, the right singular\n vectors of the centered input data, parallel to its eigenvectors.\n The components are sorted by ``explained_variance_``.\n\n explained_variance_ : ndarray of shape (n_components,)\n The amount of variance explained by each of the selected components.\n The variance estimation uses `n_samples - 1` degrees of freedom.\n\n Equal to n_components largest eigenvalues\n of the covariance matrix of X.\n\n .. versionadded:: 0.18\n\n explained_variance_ratio_ : ndarray of shape (n_components,)\n Percentage of variance explained by each of the selected components.\n\n If ``n_components`` is not set then all components are stored and the\n sum of the ratios is equal to 1.0.\n\n singular_values_ : ndarray of shape (n_components,)\n The singular values corresponding to each of the selected components.\n The singular values are equal to the 2-norms of the ``n_components``\n variables in the lower-dimensional space.\n\n .. versionadded:: 0.19\n\n mean_ : ndarray of shape (n_features,)\n Per-feature empirical mean, estimated from the training set.\n\n Equal to `X.mean(axis=0)`.\n\n n_components_ : int\n The estimated number of components. When n_components is set\n to 'mle' or a number between 0 and 1 (with svd_solver == 'full') this\n number is estimated from input data. Otherwise it equals the parameter\n n_components, or the lesser value of n_features and n_samples\n if n_components is None.\n\n n_features_ : int\n Number of features in the training data.\n\n n_samples_ : int\n Number of samples in the training data.\n\n noise_variance_ : float\n The estimated noise covariance following the Probabilistic PCA model\n from Tipping and Bishop 1999. See \"Pattern Recognition and\n Machine Learning\" by C. Bishop, 12.2.1 p. 574 or\n http://www.miketipping.com/papers/met-mppca.pdf. It is required to\n compute the estimated data covariance and score samples.\n\n Equal to the average of (min(n_features, n_samples) - n_components)\n smallest eigenvalues of the covariance matrix of X.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n KernelPCA : Kernel Principal Component Analysis.\n SparsePCA : Sparse Principal Component Analysis.\n TruncatedSVD : Dimensionality reduction using truncated SVD.\n IncrementalPCA : Incremental Principal Component Analysis.\n\n References\n ----------\n For n_components == 'mle', this class uses the method from:\n `Minka, T. P.. \"Automatic choice of dimensionality for PCA\".\n In NIPS, pp. 598-604 `_\n\n Implements the probabilistic PCA model from:\n `Tipping, M. E., and Bishop, C. M. (1999). \"Probabilistic principal\n component analysis\". Journal of the Royal Statistical Society:\n Series B (Statistical Methodology), 61(3), 611-622.\n `_\n via the score and score_samples methods.\n\n For svd_solver == 'arpack', refer to `scipy.sparse.linalg.svds`.\n\n For svd_solver == 'randomized', see:\n `Halko, N., Martinsson, P. G., and Tropp, J. A. 
(2011).\n \"Finding structure with randomness: Probabilistic algorithms for\n constructing approximate matrix decompositions\".\n SIAM review, 53(2), 217-288.\n `_\n and also\n `Martinsson, P. G., Rokhlin, V., and Tygert, M. (2011).\n \"A randomized algorithm for the decomposition of matrices\".\n Applied and Computational Harmonic Analysis, 30(1), 47-68\n `_.\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.decomposition import PCA\n >>> X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])\n >>> pca = PCA(n_components=2)\n >>> pca.fit(X)\n PCA(n_components=2)\n >>> print(pca.explained_variance_ratio_)\n [0.9924... 0.0075...]\n >>> print(pca.singular_values_)\n [6.30061... 0.54980...]\n\n >>> pca = PCA(n_components=2, svd_solver='full')\n >>> pca.fit(X)\n PCA(n_components=2, svd_solver='full')\n >>> print(pca.explained_variance_ratio_)\n [0.9924... 0.00755...]\n >>> print(pca.singular_values_)\n [6.30061... 0.54980...]\n\n >>> pca = PCA(n_components=1, svd_solver='arpack')\n >>> pca.fit(X)\n PCA(n_components=1, svd_solver='arpack')\n >>> print(pca.explained_variance_ratio_)\n [0.99244...]\n >>> print(pca.singular_values_)\n [6.30061...]\n ", "source_code": "\n\nclass PCA(_BasePCA):\n \"\"\"Principal component analysis (PCA).\n\n Linear dimensionality reduction using Singular Value Decomposition of the\n data to project it to a lower dimensional space. The input data is centered\n but not scaled for each feature before applying the SVD.\n\n It uses the LAPACK implementation of the full SVD or a randomized truncated\n SVD by the method of Halko et al. 2009, depending on the shape of the input\n data and the number of components to extract.\n\n It can also use the scipy.sparse.linalg ARPACK implementation of the\n truncated SVD.\n\n Notice that this class does not support sparse input. See\n :class:`TruncatedSVD` for an alternative with sparse data.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n n_components : int, float or 'mle', default=None\n Number of components to keep.\n if n_components is not set all components are kept::\n\n n_components == min(n_samples, n_features)\n\n If ``n_components == 'mle'`` and ``svd_solver == 'full'``, Minka's\n MLE is used to guess the dimension. 
Use of ``n_components == 'mle'``\n will interpret ``svd_solver == 'auto'`` as ``svd_solver == 'full'``.\n\n If ``0 < n_components < 1`` and ``svd_solver == 'full'``, select the\n number of components such that the amount of variance that needs to be\n explained is greater than the percentage specified by n_components.\n\n If ``svd_solver == 'arpack'``, the number of components must be\n strictly less than the minimum of n_features and n_samples.\n\n Hence, the None case results in::\n\n n_components == min(n_samples, n_features) - 1\n\n copy : bool, default=True\n If False, data passed to fit are overwritten and running\n fit(X).transform(X) will not yield the expected results,\n use fit_transform(X) instead.\n\n whiten : bool, default=False\n When True (False by default) the `components_` vectors are multiplied\n by the square root of n_samples and then divided by the singular values\n to ensure uncorrelated outputs with unit component-wise variances.\n\n Whitening will remove some information from the transformed signal\n (the relative variance scales of the components) but can sometime\n improve the predictive accuracy of the downstream estimators by\n making their data respect some hard-wired assumptions.\n\n svd_solver : {'auto', 'full', 'arpack', 'randomized'}, default='auto'\n If auto :\n The solver is selected by a default policy based on `X.shape` and\n `n_components`: if the input data is larger than 500x500 and the\n number of components to extract is lower than 80% of the smallest\n dimension of the data, then the more efficient 'randomized'\n method is enabled. Otherwise the exact full SVD is computed and\n optionally truncated afterwards.\n If full :\n run exact full SVD calling the standard LAPACK solver via\n `scipy.linalg.svd` and select the components by postprocessing\n If arpack :\n run SVD truncated to n_components calling ARPACK solver via\n `scipy.sparse.linalg.svds`. It requires strictly\n 0 < n_components < min(X.shape)\n If randomized :\n run randomized SVD by the method of Halko et al.\n\n .. versionadded:: 0.18.0\n\n tol : float, default=0.0\n Tolerance for singular values computed by svd_solver == 'arpack'.\n Must be of range [0.0, infinity).\n\n .. versionadded:: 0.18.0\n\n iterated_power : int or 'auto', default='auto'\n Number of iterations for the power method computed by\n svd_solver == 'randomized'.\n Must be of range [0, infinity).\n\n .. versionadded:: 0.18.0\n\n random_state : int, RandomState instance or None, default=None\n Used when the 'arpack' or 'randomized' solvers are used. Pass an int\n for reproducible results across multiple function calls.\n See :term:`Glossary `.\n\n .. versionadded:: 0.18.0\n\n Attributes\n ----------\n components_ : ndarray of shape (n_components, n_features)\n Principal axes in feature space, representing the directions of\n maximum variance in the data. Equivalently, the right singular\n vectors of the centered input data, parallel to its eigenvectors.\n The components are sorted by ``explained_variance_``.\n\n explained_variance_ : ndarray of shape (n_components,)\n The amount of variance explained by each of the selected components.\n The variance estimation uses `n_samples - 1` degrees of freedom.\n\n Equal to n_components largest eigenvalues\n of the covariance matrix of X.\n\n .. 
versionadded:: 0.18\n\n explained_variance_ratio_ : ndarray of shape (n_components,)\n Percentage of variance explained by each of the selected components.\n\n If ``n_components`` is not set then all components are stored and the\n sum of the ratios is equal to 1.0.\n\n singular_values_ : ndarray of shape (n_components,)\n The singular values corresponding to each of the selected components.\n The singular values are equal to the 2-norms of the ``n_components``\n variables in the lower-dimensional space.\n\n .. versionadded:: 0.19\n\n mean_ : ndarray of shape (n_features,)\n Per-feature empirical mean, estimated from the training set.\n\n Equal to `X.mean(axis=0)`.\n\n n_components_ : int\n The estimated number of components. When n_components is set\n to 'mle' or a number between 0 and 1 (with svd_solver == 'full') this\n number is estimated from input data. Otherwise it equals the parameter\n n_components, or the lesser value of n_features and n_samples\n if n_components is None.\n\n n_features_ : int\n Number of features in the training data.\n\n n_samples_ : int\n Number of samples in the training data.\n\n noise_variance_ : float\n The estimated noise covariance following the Probabilistic PCA model\n from Tipping and Bishop 1999. See \"Pattern Recognition and\n Machine Learning\" by C. Bishop, 12.2.1 p. 574 or\n http://www.miketipping.com/papers/met-mppca.pdf. It is required to\n compute the estimated data covariance and score samples.\n\n Equal to the average of (min(n_features, n_samples) - n_components)\n smallest eigenvalues of the covariance matrix of X.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n KernelPCA : Kernel Principal Component Analysis.\n SparsePCA : Sparse Principal Component Analysis.\n TruncatedSVD : Dimensionality reduction using truncated SVD.\n IncrementalPCA : Incremental Principal Component Analysis.\n\n References\n ----------\n For n_components == 'mle', this class uses the method from:\n `Minka, T. P.. \"Automatic choice of dimensionality for PCA\".\n In NIPS, pp. 598-604 `_\n\n Implements the probabilistic PCA model from:\n `Tipping, M. E., and Bishop, C. M. (1999). \"Probabilistic principal\n component analysis\". Journal of the Royal Statistical Society:\n Series B (Statistical Methodology), 61(3), 611-622.\n `_\n via the score and score_samples methods.\n\n For svd_solver == 'arpack', refer to `scipy.sparse.linalg.svds`.\n\n For svd_solver == 'randomized', see:\n `Halko, N., Martinsson, P. G., and Tropp, J. A. (2011).\n \"Finding structure with randomness: Probabilistic algorithms for\n constructing approximate matrix decompositions\".\n SIAM review, 53(2), 217-288.\n `_\n and also\n `Martinsson, P. G., Rokhlin, V., and Tygert, M. (2011).\n \"A randomized algorithm for the decomposition of matrices\".\n Applied and Computational Harmonic Analysis, 30(1), 47-68\n `_.\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.decomposition import PCA\n >>> X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])\n >>> pca = PCA(n_components=2)\n >>> pca.fit(X)\n PCA(n_components=2)\n >>> print(pca.explained_variance_ratio_)\n [0.9924... 0.0075...]\n >>> print(pca.singular_values_)\n [6.30061... 
0.54980...]\n\n >>> pca = PCA(n_components=2, svd_solver='full')\n >>> pca.fit(X)\n PCA(n_components=2, svd_solver='full')\n >>> print(pca.explained_variance_ratio_)\n [0.9924... 0.00755...]\n >>> print(pca.singular_values_)\n [6.30061... 0.54980...]\n\n >>> pca = PCA(n_components=1, svd_solver='arpack')\n >>> pca.fit(X)\n PCA(n_components=1, svd_solver='arpack')\n >>> print(pca.explained_variance_ratio_)\n [0.99244...]\n >>> print(pca.singular_values_)\n [6.30061...]\n \"\"\"\n \n def __init__(self, n_components=None, *, copy=True, whiten=False, svd_solver='auto', tol=0.0, iterated_power='auto', random_state=None):\n self.n_components = n_components\n self.copy = copy\n self.whiten = whiten\n self.svd_solver = svd_solver\n self.tol = tol\n self.iterated_power = iterated_power\n self.random_state = random_state\n \n def fit(self, X, y=None):\n \"\"\"Fit the model with X.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training data, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n y : Ignored\n Ignored.\n\n Returns\n -------\n self : object\n Returns the instance itself.\n \"\"\"\n self._fit(X)\n return self\n \n def fit_transform(self, X, y=None):\n \"\"\"Fit the model with X and apply the dimensionality reduction on X.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training data, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n y : Ignored\n Ignored.\n\n Returns\n -------\n X_new : ndarray of shape (n_samples, n_components)\n Transformed values.\n\n Notes\n -----\n This method returns a Fortran-ordered array. To convert it to a\n C-ordered array, use 'np.ascontiguousarray'.\n \"\"\"\n (U, S, Vt) = self._fit(X)\n U = U[:, :self.n_components_]\n if self.whiten:\n U *= sqrt(X.shape[0] - 1)\n else:\n U *= S[:self.n_components_]\n return U\n \n def _fit(self, X):\n \"\"\"Dispatch to the right submethod depending on the chosen solver.\"\"\"\n if issparse(X):\n raise TypeError('PCA does not support sparse input. 
See TruncatedSVD for a possible alternative.')\n X = self._validate_data(X, dtype=[np.float64, np.float32], ensure_2d=True, copy=self.copy)\n if self.n_components is None:\n if self.svd_solver != 'arpack':\n n_components = min(X.shape)\n else:\n n_components = min(X.shape) - 1\n else:\n n_components = self.n_components\n self._fit_svd_solver = self.svd_solver\n if self._fit_svd_solver == 'auto':\n if max(X.shape) <= 500 or n_components == 'mle':\n self._fit_svd_solver = 'full'\n elif n_components >= 1 and n_components < 0.8 * min(X.shape):\n self._fit_svd_solver = 'randomized'\n else:\n self._fit_svd_solver = 'full'\n if self._fit_svd_solver == 'full':\n return self._fit_full(X, n_components)\n elif self._fit_svd_solver in ['arpack', 'randomized']:\n return self._fit_truncated(X, n_components, self._fit_svd_solver)\n else:\n raise ValueError(\"Unrecognized svd_solver='{0}'\".format(self._fit_svd_solver))\n \n def _fit_full(self, X, n_components):\n \"\"\"Fit the model by computing full SVD on X.\"\"\"\n (n_samples, n_features) = X.shape\n if n_components == 'mle':\n if n_samples < n_features:\n raise ValueError(\"n_components='mle' is only supported if n_samples >= n_features\")\n elif not 0 <= n_components <= min(n_samples, n_features):\n raise ValueError(\"n_components=%r must be between 0 and min(n_samples, n_features)=%r with svd_solver='full'\" % (n_components, min(n_samples, n_features)))\n elif n_components >= 1:\n if not isinstance(n_components, numbers.Integral):\n raise ValueError('n_components=%r must be of type int when greater than or equal to 1, was of type=%r' % (n_components, type(n_components)))\n self.mean_ = np.mean(X, axis=0)\n X -= self.mean_\n (U, S, Vt) = linalg.svd(X, full_matrices=False)\n (U, Vt) = svd_flip(U, Vt)\n components_ = Vt\n explained_variance_ = S**2 / (n_samples - 1)\n total_var = explained_variance_.sum()\n explained_variance_ratio_ = explained_variance_ / total_var\n singular_values_ = S.copy()\n if n_components == 'mle':\n n_components = _infer_dimension(explained_variance_, n_samples)\n elif 0 < n_components < 1.0:\n ratio_cumsum = stable_cumsum(explained_variance_ratio_)\n n_components = np.searchsorted(ratio_cumsum, n_components, side='right') + 1\n if n_components < min(n_features, n_samples):\n self.noise_variance_ = explained_variance_[n_components:].mean()\n else:\n self.noise_variance_ = 0.0\n (self.n_samples_, self.n_features_) = (n_samples, n_features)\n self.components_ = components_[:n_components]\n self.n_components_ = n_components\n self.explained_variance_ = explained_variance_[:n_components]\n self.explained_variance_ratio_ = explained_variance_ratio_[:n_components]\n self.singular_values_ = singular_values_[:n_components]\n return U, S, Vt\n \n def _fit_truncated(self, X, n_components, svd_solver):\n \"\"\"Fit the model by computing truncated SVD (by ARPACK or randomized)\n on X.\n \"\"\"\n (n_samples, n_features) = X.shape\n if isinstance(n_components, str):\n raise ValueError(\"n_components=%r cannot be a string with svd_solver='%s'\" % (n_components, svd_solver))\n elif not 1 <= n_components <= min(n_samples, n_features):\n raise ValueError(\"n_components=%r must be between 1 and min(n_samples, n_features)=%r with svd_solver='%s'\" % (n_components, min(n_samples, n_features), svd_solver))\n elif not isinstance(n_components, numbers.Integral):\n raise ValueError('n_components=%r must be of type int when greater than or equal to 1, was of type=%r' % (n_components, type(n_components)))\n elif svd_solver == 'arpack' and n_components 
== min(n_samples, n_features):\n raise ValueError(\"n_components=%r must be strictly less than min(n_samples, n_features)=%r with svd_solver='%s'\" % (n_components, min(n_samples, n_features), svd_solver))\n random_state = check_random_state(self.random_state)\n self.mean_ = np.mean(X, axis=0)\n X -= self.mean_\n if svd_solver == 'arpack':\n v0 = _init_arpack_v0(min(X.shape), random_state)\n (U, S, Vt) = svds(X, k=n_components, tol=self.tol, v0=v0)\n S = S[::-1]\n (U, Vt) = svd_flip(U[:, ::-1], Vt[::-1])\n elif svd_solver == 'randomized':\n (U, S, Vt) = randomized_svd(X, n_components=n_components, n_iter=self.iterated_power, flip_sign=True, random_state=random_state)\n (self.n_samples_, self.n_features_) = (n_samples, n_features)\n self.components_ = Vt\n self.n_components_ = n_components\n self.explained_variance_ = S**2 / (n_samples - 1)\n total_var = np.var(X, ddof=1, axis=0)\n self.explained_variance_ratio_ = self.explained_variance_ / total_var.sum()\n self.singular_values_ = S.copy()\n if self.n_components_ < min(n_features, n_samples):\n self.noise_variance_ = total_var.sum() - self.explained_variance_.sum()\n self.noise_variance_ /= min(n_features, n_samples) - n_components\n else:\n self.noise_variance_ = 0.0\n return U, S, Vt\n \n def score_samples(self, X):\n \"\"\"Return the log-likelihood of each sample.\n\n See. \"Pattern Recognition and Machine Learning\"\n by C. Bishop, 12.2.1 p. 574\n or http://www.miketipping.com/papers/met-mppca.pdf\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The data.\n\n Returns\n -------\n ll : ndarray of shape (n_samples,)\n Log-likelihood of each sample under the current model.\n \"\"\"\n check_is_fitted(self)\n X = self._validate_data(X, dtype=[np.float64, np.float32], reset=False)\n Xr = X - self.mean_\n n_features = X.shape[1]\n precision = self.get_precision()\n log_like = -0.5 * (Xr * np.dot(Xr, precision)).sum(axis=1)\n log_like -= 0.5 * (n_features * log(2.0 * np.pi) - fast_logdet(precision))\n return log_like\n \n def score(self, X, y=None):\n \"\"\"Return the average log-likelihood of all samples.\n\n See. \"Pattern Recognition and Machine Learning\"\n by C. Bishop, 12.2.1 p. 574\n or http://www.miketipping.com/papers/met-mppca.pdf\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The data.\n\n y : Ignored\n Ignored.\n\n Returns\n -------\n ll : float\n Average log-likelihood of the samples under the current model.\n \"\"\"\n return np.mean(self.score_samples(X))\n \n def _more_tags(self):\n return {'preserves_dtype': [np.float64, np.float32]}\n" }, @@ -20658,9 +20724,9 @@ "sklearn.decomposition._sparse_pca.MiniBatchSparsePCA.fit" ], "is_public": true, - "description": "Mini-batch Sparse Principal Components Analysis.\n\nFinds the set of sparse components that can optimally reconstruct the data. The amount of sparseness is controllable by the coefficient of the L1 penalty, given by the parameter alpha. Read more in the :ref:`User Guide `.", - "docstring": "Mini-batch Sparse Principal Components Analysis.\n\n Finds the set of sparse components that can optimally reconstruct\n the data. The amount of sparseness is controllable by the coefficient\n of the L1 penalty, given by the parameter alpha.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n n_components : int, default=None\n Number of sparse atoms to extract.\n\n alpha : int, default=1\n Sparsity controlling parameter. 
Higher values lead to sparser\n components.\n\n ridge_alpha : float, default=0.01\n Amount of ridge shrinkage to apply in order to improve\n conditioning when calling the transform method.\n\n n_iter : int, default=100\n Number of iterations to perform for each mini batch.\n\n callback : callable, default=None\n Callable that gets invoked every five iterations.\n\n batch_size : int, default=3\n The number of features to take in each mini batch.\n\n verbose : int or bool, default=False\n Controls the verbosity; the higher, the more messages. Defaults to 0.\n\n shuffle : bool, default=True\n Whether to shuffle the data before splitting it in batches.\n\n n_jobs : int, default=None\n Number of parallel jobs to run.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n method : {'lars', 'cd'}, default='lars'\n Method to be used for optimization.\n lars: uses the least angle regression method to solve the lasso problem\n (linear_model.lars_path)\n cd: uses the coordinate descent method to compute the\n Lasso solution (linear_model.Lasso). Lars will be faster if\n the estimated components are sparse.\n\n random_state : int, RandomState instance or None, default=None\n Used for random shuffling when ``shuffle`` is set to ``True``,\n during online dictionary learning. Pass an int for reproducible results\n across multiple function calls.\n See :term:`Glossary `.\n\n Attributes\n ----------\n components_ : ndarray of shape (n_components, n_features)\n Sparse components extracted from the data.\n\n n_components_ : int\n Estimated number of components.\n\n .. versionadded:: 0.23\n\n n_iter_ : int\n Number of iterations run.\n\n mean_ : ndarray of shape (n_features,)\n Per-feature empirical mean, estimated from the training set.\n Equal to ``X.mean(axis=0)``.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n DictionaryLearning : Find a dictionary that sparsely encodes data.\n IncrementalPCA : Incremental principal components analysis.\n PCA : Principal component analysis.\n SparsePCA : Sparse Principal Components Analysis.\n TruncatedSVD : Dimensionality reduction using truncated SVD.\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.datasets import make_friedman1\n >>> from sklearn.decomposition import MiniBatchSparsePCA\n >>> X, _ = make_friedman1(n_samples=200, n_features=30, random_state=0)\n >>> transformer = MiniBatchSparsePCA(n_components=5, batch_size=50,\n ... random_state=0)\n >>> transformer.fit(X)\n MiniBatchSparsePCA(...)\n >>> X_transformed = transformer.transform(X)\n >>> X_transformed.shape\n (200, 5)\n >>> # most values in the components_ are zero (sparsity)\n >>> np.mean(transformer.components_ == 0)\n 0.94\n ", - "source_code": "\n\nclass MiniBatchSparsePCA(SparsePCA):\n \"\"\"Mini-batch Sparse Principal Components Analysis.\n\n Finds the set of sparse components that can optimally reconstruct\n the data. 
The amount of sparseness is controllable by the coefficient\n of the L1 penalty, given by the parameter alpha.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n n_components : int, default=None\n Number of sparse atoms to extract.\n\n alpha : int, default=1\n Sparsity controlling parameter. Higher values lead to sparser\n components.\n\n ridge_alpha : float, default=0.01\n Amount of ridge shrinkage to apply in order to improve\n conditioning when calling the transform method.\n\n n_iter : int, default=100\n Number of iterations to perform for each mini batch.\n\n callback : callable, default=None\n Callable that gets invoked every five iterations.\n\n batch_size : int, default=3\n The number of features to take in each mini batch.\n\n verbose : int or bool, default=False\n Controls the verbosity; the higher, the more messages. Defaults to 0.\n\n shuffle : bool, default=True\n Whether to shuffle the data before splitting it in batches.\n\n n_jobs : int, default=None\n Number of parallel jobs to run.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n method : {'lars', 'cd'}, default='lars'\n Method to be used for optimization.\n lars: uses the least angle regression method to solve the lasso problem\n (linear_model.lars_path)\n cd: uses the coordinate descent method to compute the\n Lasso solution (linear_model.Lasso). Lars will be faster if\n the estimated components are sparse.\n\n random_state : int, RandomState instance or None, default=None\n Used for random shuffling when ``shuffle`` is set to ``True``,\n during online dictionary learning. Pass an int for reproducible results\n across multiple function calls.\n See :term:`Glossary `.\n\n Attributes\n ----------\n components_ : ndarray of shape (n_components, n_features)\n Sparse components extracted from the data.\n\n n_components_ : int\n Estimated number of components.\n\n .. versionadded:: 0.23\n\n n_iter_ : int\n Number of iterations run.\n\n mean_ : ndarray of shape (n_features,)\n Per-feature empirical mean, estimated from the training set.\n Equal to ``X.mean(axis=0)``.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n DictionaryLearning : Find a dictionary that sparsely encodes data.\n IncrementalPCA : Incremental principal components analysis.\n PCA : Principal component analysis.\n SparsePCA : Sparse Principal Components Analysis.\n TruncatedSVD : Dimensionality reduction using truncated SVD.\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.datasets import make_friedman1\n >>> from sklearn.decomposition import MiniBatchSparsePCA\n >>> X, _ = make_friedman1(n_samples=200, n_features=30, random_state=0)\n >>> transformer = MiniBatchSparsePCA(n_components=5, batch_size=50,\n ... 
random_state=0)\n >>> transformer.fit(X)\n MiniBatchSparsePCA(...)\n >>> X_transformed = transformer.transform(X)\n >>> X_transformed.shape\n (200, 5)\n >>> # most values in the components_ are zero (sparsity)\n >>> np.mean(transformer.components_ == 0)\n 0.94\n \"\"\"\n \n def __init__(self, n_components=None, *, alpha=1, ridge_alpha=0.01, n_iter=100, callback=None, batch_size=3, verbose=False, shuffle=True, n_jobs=None, method='lars', random_state=None):\n super().__init__(n_components=n_components, alpha=alpha, verbose=verbose, ridge_alpha=ridge_alpha, n_jobs=n_jobs, method=method, random_state=random_state)\n self.n_iter = n_iter\n self.callback = callback\n self.batch_size = batch_size\n self.shuffle = shuffle\n \n def fit(self, X, y=None):\n \"\"\"Fit the model from data in X.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training vector, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n Returns\n -------\n self : object\n Returns the instance itself.\n \"\"\"\n random_state = check_random_state(self.random_state)\n X = self._validate_data(X)\n self.mean_ = X.mean(axis=0)\n X = X - self.mean_\n if self.n_components is None:\n n_components = X.shape[1]\n else:\n n_components = self.n_components\n (Vt, _, self.n_iter_) = dict_learning_online(X.T, n_components, alpha=self.alpha, n_iter=self.n_iter, return_code=True, dict_init=None, verbose=self.verbose, callback=self.callback, batch_size=self.batch_size, shuffle=self.shuffle, n_jobs=self.n_jobs, method=self.method, random_state=random_state, return_n_iter=True)\n self.components_ = Vt.T\n components_norm = np.linalg.norm(self.components_, axis=1)[:, np.newaxis]\n components_norm[components_norm == 0] = 1\n self.components_ /= components_norm\n self.n_components_ = len(self.components_)\n return self\n" + "description": "Mini-batch Sparse Principal Components Analysis.\n\nFinds the set of sparse components that can optimally reconstruct\nthe data. The amount of sparseness is controllable by the coefficient\nof the L1 penalty, given by the parameter alpha.\n\nRead more in the :ref:`User Guide `.", + "docstring": "Mini-batch Sparse Principal Components Analysis.\n\n Finds the set of sparse components that can optimally reconstruct\n the data. The amount of sparseness is controllable by the coefficient\n of the L1 penalty, given by the parameter alpha.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n n_components : int, default=None\n Number of sparse atoms to extract. If None, then ``n_components``\n is set to ``n_features``.\n\n alpha : int, default=1\n Sparsity controlling parameter. Higher values lead to sparser\n components.\n\n ridge_alpha : float, default=0.01\n Amount of ridge shrinkage to apply in order to improve\n conditioning when calling the transform method.\n\n n_iter : int, default=100\n Number of iterations to perform for each mini batch.\n\n callback : callable, default=None\n Callable that gets invoked every five iterations.\n\n batch_size : int, default=3\n The number of features to take in each mini batch.\n\n verbose : int or bool, default=False\n Controls the verbosity; the higher, the more messages. 
Defaults to 0.\n\n shuffle : bool, default=True\n Whether to shuffle the data before splitting it in batches.\n\n n_jobs : int, default=None\n Number of parallel jobs to run.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n method : {'lars', 'cd'}, default='lars'\n Method to be used for optimization.\n lars: uses the least angle regression method to solve the lasso problem\n (linear_model.lars_path)\n cd: uses the coordinate descent method to compute the\n Lasso solution (linear_model.Lasso). Lars will be faster if\n the estimated components are sparse.\n\n random_state : int, RandomState instance or None, default=None\n Used for random shuffling when ``shuffle`` is set to ``True``,\n during online dictionary learning. Pass an int for reproducible results\n across multiple function calls.\n See :term:`Glossary `.\n\n Attributes\n ----------\n components_ : ndarray of shape (n_components, n_features)\n Sparse components extracted from the data.\n\n n_components_ : int\n Estimated number of components.\n\n .. versionadded:: 0.23\n\n n_iter_ : int\n Number of iterations run.\n\n mean_ : ndarray of shape (n_features,)\n Per-feature empirical mean, estimated from the training set.\n Equal to ``X.mean(axis=0)``.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n DictionaryLearning : Find a dictionary that sparsely encodes data.\n IncrementalPCA : Incremental principal components analysis.\n PCA : Principal component analysis.\n SparsePCA : Sparse Principal Components Analysis.\n TruncatedSVD : Dimensionality reduction using truncated SVD.\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.datasets import make_friedman1\n >>> from sklearn.decomposition import MiniBatchSparsePCA\n >>> X, _ = make_friedman1(n_samples=200, n_features=30, random_state=0)\n >>> transformer = MiniBatchSparsePCA(n_components=5, batch_size=50,\n ... random_state=0)\n >>> transformer.fit(X)\n MiniBatchSparsePCA(...)\n >>> X_transformed = transformer.transform(X)\n >>> X_transformed.shape\n (200, 5)\n >>> # most values in the components_ are zero (sparsity)\n >>> np.mean(transformer.components_ == 0)\n 0.94\n ", + "source_code": "\n\nclass MiniBatchSparsePCA(SparsePCA):\n \"\"\"Mini-batch Sparse Principal Components Analysis.\n\n Finds the set of sparse components that can optimally reconstruct\n the data. The amount of sparseness is controllable by the coefficient\n of the L1 penalty, given by the parameter alpha.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n n_components : int, default=None\n Number of sparse atoms to extract. If None, then ``n_components``\n is set to ``n_features``.\n\n alpha : int, default=1\n Sparsity controlling parameter. 
Higher values lead to sparser\n components.\n\n ridge_alpha : float, default=0.01\n Amount of ridge shrinkage to apply in order to improve\n conditioning when calling the transform method.\n\n n_iter : int, default=100\n Number of iterations to perform for each mini batch.\n\n callback : callable, default=None\n Callable that gets invoked every five iterations.\n\n batch_size : int, default=3\n The number of features to take in each mini batch.\n\n verbose : int or bool, default=False\n Controls the verbosity; the higher, the more messages. Defaults to 0.\n\n shuffle : bool, default=True\n Whether to shuffle the data before splitting it in batches.\n\n n_jobs : int, default=None\n Number of parallel jobs to run.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n method : {'lars', 'cd'}, default='lars'\n Method to be used for optimization.\n lars: uses the least angle regression method to solve the lasso problem\n (linear_model.lars_path)\n cd: uses the coordinate descent method to compute the\n Lasso solution (linear_model.Lasso). Lars will be faster if\n the estimated components are sparse.\n\n random_state : int, RandomState instance or None, default=None\n Used for random shuffling when ``shuffle`` is set to ``True``,\n during online dictionary learning. Pass an int for reproducible results\n across multiple function calls.\n See :term:`Glossary `.\n\n Attributes\n ----------\n components_ : ndarray of shape (n_components, n_features)\n Sparse components extracted from the data.\n\n n_components_ : int\n Estimated number of components.\n\n .. versionadded:: 0.23\n\n n_iter_ : int\n Number of iterations run.\n\n mean_ : ndarray of shape (n_features,)\n Per-feature empirical mean, estimated from the training set.\n Equal to ``X.mean(axis=0)``.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n DictionaryLearning : Find a dictionary that sparsely encodes data.\n IncrementalPCA : Incremental principal components analysis.\n PCA : Principal component analysis.\n SparsePCA : Sparse Principal Components Analysis.\n TruncatedSVD : Dimensionality reduction using truncated SVD.\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.datasets import make_friedman1\n >>> from sklearn.decomposition import MiniBatchSparsePCA\n >>> X, _ = make_friedman1(n_samples=200, n_features=30, random_state=0)\n >>> transformer = MiniBatchSparsePCA(n_components=5, batch_size=50,\n ... 
random_state=0)\n >>> transformer.fit(X)\n MiniBatchSparsePCA(...)\n >>> X_transformed = transformer.transform(X)\n >>> X_transformed.shape\n (200, 5)\n >>> # most values in the components_ are zero (sparsity)\n >>> np.mean(transformer.components_ == 0)\n 0.94\n \"\"\"\n \n def __init__(self, n_components=None, *, alpha=1, ridge_alpha=0.01, n_iter=100, callback=None, batch_size=3, verbose=False, shuffle=True, n_jobs=None, method='lars', random_state=None):\n super().__init__(n_components=n_components, alpha=alpha, verbose=verbose, ridge_alpha=ridge_alpha, n_jobs=n_jobs, method=method, random_state=random_state)\n self.n_iter = n_iter\n self.callback = callback\n self.batch_size = batch_size\n self.shuffle = shuffle\n \n def fit(self, X, y=None):\n \"\"\"Fit the model from data in X.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training vector, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n Returns\n -------\n self : object\n Returns the instance itself.\n \"\"\"\n random_state = check_random_state(self.random_state)\n X = self._validate_data(X)\n self.mean_ = X.mean(axis=0)\n X = X - self.mean_\n if self.n_components is None:\n n_components = X.shape[1]\n else:\n n_components = self.n_components\n (Vt, _, self.n_iter_) = dict_learning_online(X.T, n_components, alpha=self.alpha, n_iter=self.n_iter, return_code=True, dict_init=None, verbose=self.verbose, callback=self.callback, batch_size=self.batch_size, shuffle=self.shuffle, n_jobs=self.n_jobs, method=self.method, random_state=random_state, return_n_iter=True)\n self.components_ = Vt.T\n components_norm = np.linalg.norm(self.components_, axis=1)[:, np.newaxis]\n components_norm[components_norm == 0] = 1\n self.components_ /= components_norm\n self.n_components_ = len(self.components_)\n return self\n" }, { "name": "SparsePCA", @@ -20673,9 +20739,9 @@ "sklearn.decomposition._sparse_pca.SparsePCA.transform" ], "is_public": true, - "description": "Sparse Principal Components Analysis (SparsePCA).\n\nFinds the set of sparse components that can optimally reconstruct the data. The amount of sparseness is controllable by the coefficient of the L1 penalty, given by the parameter alpha. Read more in the :ref:`User Guide `.", - "docstring": "Sparse Principal Components Analysis (SparsePCA).\n\n Finds the set of sparse components that can optimally reconstruct\n the data. The amount of sparseness is controllable by the coefficient\n of the L1 penalty, given by the parameter alpha.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n n_components : int, default=None\n Number of sparse atoms to extract.\n\n alpha : float, default=1\n Sparsity controlling parameter. Higher values lead to sparser\n components.\n\n ridge_alpha : float, default=0.01\n Amount of ridge shrinkage to apply in order to improve\n conditioning when calling the transform method.\n\n max_iter : int, default=1000\n Maximum number of iterations to perform.\n\n tol : float, default=1e-8\n Tolerance for the stopping condition.\n\n method : {'lars', 'cd'}, default='lars'\n Method to be used for optimization.\n lars: uses the least angle regression method to solve the lasso problem\n (linear_model.lars_path)\n cd: uses the coordinate descent method to compute the\n Lasso solution (linear_model.Lasso). 
Lars will be faster if\n the estimated components are sparse.\n\n n_jobs : int, default=None\n Number of parallel jobs to run.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n U_init : ndarray of shape (n_samples, n_components), default=None\n Initial values for the loadings for warm restart scenarios. Only used\n if `U_init` and `V_init` are not None.\n\n V_init : ndarray of shape (n_components, n_features), default=None\n Initial values for the components for warm restart scenarios. Only used\n if `U_init` and `V_init` are not None.\n\n verbose : int or bool, default=False\n Controls the verbosity; the higher, the more messages. Defaults to 0.\n\n random_state : int, RandomState instance or None, default=None\n Used during dictionary learning. Pass an int for reproducible results\n across multiple function calls.\n See :term:`Glossary `.\n\n Attributes\n ----------\n components_ : ndarray of shape (n_components, n_features)\n Sparse components extracted from the data.\n\n error_ : ndarray\n Vector of errors at each iteration.\n\n n_components_ : int\n Estimated number of components.\n\n .. versionadded:: 0.23\n\n n_iter_ : int\n Number of iterations run.\n\n mean_ : ndarray of shape (n_features,)\n Per-feature empirical mean, estimated from the training set.\n Equal to ``X.mean(axis=0)``.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n PCA : Principal Component Analysis implementation.\n MiniBatchSparsePCA : Mini batch variant of `SparsePCA` that is faster but less\n accurate.\n DictionaryLearning : Generic dictionary learning problem using a sparse code.\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.datasets import make_friedman1\n >>> from sklearn.decomposition import SparsePCA\n >>> X, _ = make_friedman1(n_samples=200, n_features=30, random_state=0)\n >>> transformer = SparsePCA(n_components=5, random_state=0)\n >>> transformer.fit(X)\n SparsePCA(...)\n >>> X_transformed = transformer.transform(X)\n >>> X_transformed.shape\n (200, 5)\n >>> # most values in the components_ are zero (sparsity)\n >>> np.mean(transformer.components_ == 0)\n 0.9666...\n ", - "source_code": "\n\nclass SparsePCA(TransformerMixin, BaseEstimator):\n \"\"\"Sparse Principal Components Analysis (SparsePCA).\n\n Finds the set of sparse components that can optimally reconstruct\n the data. The amount of sparseness is controllable by the coefficient\n of the L1 penalty, given by the parameter alpha.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n n_components : int, default=None\n Number of sparse atoms to extract.\n\n alpha : float, default=1\n Sparsity controlling parameter. 
Higher values lead to sparser\n components.\n\n ridge_alpha : float, default=0.01\n Amount of ridge shrinkage to apply in order to improve\n conditioning when calling the transform method.\n\n max_iter : int, default=1000\n Maximum number of iterations to perform.\n\n tol : float, default=1e-8\n Tolerance for the stopping condition.\n\n method : {'lars', 'cd'}, default='lars'\n Method to be used for optimization.\n lars: uses the least angle regression method to solve the lasso problem\n (linear_model.lars_path)\n cd: uses the coordinate descent method to compute the\n Lasso solution (linear_model.Lasso). Lars will be faster if\n the estimated components are sparse.\n\n n_jobs : int, default=None\n Number of parallel jobs to run.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n U_init : ndarray of shape (n_samples, n_components), default=None\n Initial values for the loadings for warm restart scenarios. Only used\n if `U_init` and `V_init` are not None.\n\n V_init : ndarray of shape (n_components, n_features), default=None\n Initial values for the components for warm restart scenarios. Only used\n if `U_init` and `V_init` are not None.\n\n verbose : int or bool, default=False\n Controls the verbosity; the higher, the more messages. Defaults to 0.\n\n random_state : int, RandomState instance or None, default=None\n Used during dictionary learning. Pass an int for reproducible results\n across multiple function calls.\n See :term:`Glossary `.\n\n Attributes\n ----------\n components_ : ndarray of shape (n_components, n_features)\n Sparse components extracted from the data.\n\n error_ : ndarray\n Vector of errors at each iteration.\n\n n_components_ : int\n Estimated number of components.\n\n .. versionadded:: 0.23\n\n n_iter_ : int\n Number of iterations run.\n\n mean_ : ndarray of shape (n_features,)\n Per-feature empirical mean, estimated from the training set.\n Equal to ``X.mean(axis=0)``.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. 
versionadded:: 1.0\n\n See Also\n --------\n PCA : Principal Component Analysis implementation.\n MiniBatchSparsePCA : Mini batch variant of `SparsePCA` that is faster but less\n accurate.\n DictionaryLearning : Generic dictionary learning problem using a sparse code.\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.datasets import make_friedman1\n >>> from sklearn.decomposition import SparsePCA\n >>> X, _ = make_friedman1(n_samples=200, n_features=30, random_state=0)\n >>> transformer = SparsePCA(n_components=5, random_state=0)\n >>> transformer.fit(X)\n SparsePCA(...)\n >>> X_transformed = transformer.transform(X)\n >>> X_transformed.shape\n (200, 5)\n >>> # most values in the components_ are zero (sparsity)\n >>> np.mean(transformer.components_ == 0)\n 0.9666...\n \"\"\"\n \n def __init__(self, n_components=None, *, alpha=1, ridge_alpha=0.01, max_iter=1000, tol=1e-08, method='lars', n_jobs=None, U_init=None, V_init=None, verbose=False, random_state=None):\n self.n_components = n_components\n self.alpha = alpha\n self.ridge_alpha = ridge_alpha\n self.max_iter = max_iter\n self.tol = tol\n self.method = method\n self.n_jobs = n_jobs\n self.U_init = U_init\n self.V_init = V_init\n self.verbose = verbose\n self.random_state = random_state\n \n def fit(self, X, y=None):\n \"\"\"Fit the model from data in X.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training vector, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n y : Ignored\n Not used, present here for API consistency by convention.\n\n Returns\n -------\n self : object\n Returns the instance itself.\n \"\"\"\n random_state = check_random_state(self.random_state)\n X = self._validate_data(X)\n self.mean_ = X.mean(axis=0)\n X = X - self.mean_\n if self.n_components is None:\n n_components = X.shape[1]\n else:\n n_components = self.n_components\n code_init = self.V_init.T if self.V_init is not None else None\n dict_init = self.U_init.T if self.U_init is not None else None\n (Vt, _, E, self.n_iter_) = dict_learning(X.T, n_components, alpha=self.alpha, tol=self.tol, max_iter=self.max_iter, method=self.method, n_jobs=self.n_jobs, verbose=self.verbose, random_state=random_state, code_init=code_init, dict_init=dict_init, return_n_iter=True)\n self.components_ = Vt.T\n components_norm = np.linalg.norm(self.components_, axis=1)[:, np.newaxis]\n components_norm[components_norm == 0] = 1\n self.components_ /= components_norm\n self.n_components_ = len(self.components_)\n self.error_ = E\n return self\n \n def transform(self, X):\n \"\"\"Least Squares projection of the data onto the sparse components.\n\n To avoid instability issues in case the system is under-determined,\n regularization can be applied (Ridge regression) via the\n `ridge_alpha` parameter.\n\n Note that Sparse PCA components orthogonality is not enforced as in PCA\n hence one cannot use a simple linear projection.\n\n Parameters\n ----------\n X : ndarray of shape (n_samples, n_features)\n Test data to be transformed, must have the same number of\n features as the data used to train the model.\n\n Returns\n -------\n X_new : ndarray of shape (n_samples, n_components)\n Transformed data.\n \"\"\"\n check_is_fitted(self)\n X = self._validate_data(X, reset=False)\n X = X - self.mean_\n U = ridge_regression(self.components_.T, X.T, self.ridge_alpha, solver='cholesky')\n return U\n" + "description": "Sparse Principal Components Analysis (SparsePCA).\n\nFinds the set of sparse components that 
can optimally reconstruct\nthe data. The amount of sparseness is controllable by the coefficient\nof the L1 penalty, given by the parameter alpha.\n\nRead more in the :ref:`User Guide `.", + "docstring": "Sparse Principal Components Analysis (SparsePCA).\n\n Finds the set of sparse components that can optimally reconstruct\n the data. The amount of sparseness is controllable by the coefficient\n of the L1 penalty, given by the parameter alpha.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n n_components : int, default=None\n Number of sparse atoms to extract. If None, then ``n_components``\n is set to ``n_features``.\n\n alpha : float, default=1\n Sparsity controlling parameter. Higher values lead to sparser\n components.\n\n ridge_alpha : float, default=0.01\n Amount of ridge shrinkage to apply in order to improve\n conditioning when calling the transform method.\n\n max_iter : int, default=1000\n Maximum number of iterations to perform.\n\n tol : float, default=1e-8\n Tolerance for the stopping condition.\n\n method : {'lars', 'cd'}, default='lars'\n Method to be used for optimization.\n lars: uses the least angle regression method to solve the lasso problem\n (linear_model.lars_path)\n cd: uses the coordinate descent method to compute the\n Lasso solution (linear_model.Lasso). Lars will be faster if\n the estimated components are sparse.\n\n n_jobs : int, default=None\n Number of parallel jobs to run.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n U_init : ndarray of shape (n_samples, n_components), default=None\n Initial values for the loadings for warm restart scenarios. Only used\n if `U_init` and `V_init` are not None.\n\n V_init : ndarray of shape (n_components, n_features), default=None\n Initial values for the components for warm restart scenarios. Only used\n if `U_init` and `V_init` are not None.\n\n verbose : int or bool, default=False\n Controls the verbosity; the higher, the more messages. Defaults to 0.\n\n random_state : int, RandomState instance or None, default=None\n Used during dictionary learning. Pass an int for reproducible results\n across multiple function calls.\n See :term:`Glossary `.\n\n Attributes\n ----------\n components_ : ndarray of shape (n_components, n_features)\n Sparse components extracted from the data.\n\n error_ : ndarray\n Vector of errors at each iteration.\n\n n_components_ : int\n Estimated number of components.\n\n .. versionadded:: 0.23\n\n n_iter_ : int\n Number of iterations run.\n\n mean_ : ndarray of shape (n_features,)\n Per-feature empirical mean, estimated from the training set.\n Equal to ``X.mean(axis=0)``.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. 
versionadded:: 1.0\n\n See Also\n --------\n PCA : Principal Component Analysis implementation.\n MiniBatchSparsePCA : Mini batch variant of `SparsePCA` that is faster but less\n accurate.\n DictionaryLearning : Generic dictionary learning problem using a sparse code.\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.datasets import make_friedman1\n >>> from sklearn.decomposition import SparsePCA\n >>> X, _ = make_friedman1(n_samples=200, n_features=30, random_state=0)\n >>> transformer = SparsePCA(n_components=5, random_state=0)\n >>> transformer.fit(X)\n SparsePCA(...)\n >>> X_transformed = transformer.transform(X)\n >>> X_transformed.shape\n (200, 5)\n >>> # most values in the components_ are zero (sparsity)\n >>> np.mean(transformer.components_ == 0)\n 0.9666...\n ", + "source_code": "\n\nclass SparsePCA(TransformerMixin, BaseEstimator):\n \"\"\"Sparse Principal Components Analysis (SparsePCA).\n\n Finds the set of sparse components that can optimally reconstruct\n the data. The amount of sparseness is controllable by the coefficient\n of the L1 penalty, given by the parameter alpha.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n n_components : int, default=None\n Number of sparse atoms to extract. If None, then ``n_components``\n is set to ``n_features``.\n\n alpha : float, default=1\n Sparsity controlling parameter. Higher values lead to sparser\n components.\n\n ridge_alpha : float, default=0.01\n Amount of ridge shrinkage to apply in order to improve\n conditioning when calling the transform method.\n\n max_iter : int, default=1000\n Maximum number of iterations to perform.\n\n tol : float, default=1e-8\n Tolerance for the stopping condition.\n\n method : {'lars', 'cd'}, default='lars'\n Method to be used for optimization.\n lars: uses the least angle regression method to solve the lasso problem\n (linear_model.lars_path)\n cd: uses the coordinate descent method to compute the\n Lasso solution (linear_model.Lasso). Lars will be faster if\n the estimated components are sparse.\n\n n_jobs : int, default=None\n Number of parallel jobs to run.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n U_init : ndarray of shape (n_samples, n_components), default=None\n Initial values for the loadings for warm restart scenarios. Only used\n if `U_init` and `V_init` are not None.\n\n V_init : ndarray of shape (n_components, n_features), default=None\n Initial values for the components for warm restart scenarios. Only used\n if `U_init` and `V_init` are not None.\n\n verbose : int or bool, default=False\n Controls the verbosity; the higher, the more messages. Defaults to 0.\n\n random_state : int, RandomState instance or None, default=None\n Used during dictionary learning. Pass an int for reproducible results\n across multiple function calls.\n See :term:`Glossary `.\n\n Attributes\n ----------\n components_ : ndarray of shape (n_components, n_features)\n Sparse components extracted from the data.\n\n error_ : ndarray\n Vector of errors at each iteration.\n\n n_components_ : int\n Estimated number of components.\n\n .. versionadded:: 0.23\n\n n_iter_ : int\n Number of iterations run.\n\n mean_ : ndarray of shape (n_features,)\n Per-feature empirical mean, estimated from the training set.\n Equal to ``X.mean(axis=0)``.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. 
versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n PCA : Principal Component Analysis implementation.\n MiniBatchSparsePCA : Mini batch variant of `SparsePCA` that is faster but less\n accurate.\n DictionaryLearning : Generic dictionary learning problem using a sparse code.\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.datasets import make_friedman1\n >>> from sklearn.decomposition import SparsePCA\n >>> X, _ = make_friedman1(n_samples=200, n_features=30, random_state=0)\n >>> transformer = SparsePCA(n_components=5, random_state=0)\n >>> transformer.fit(X)\n SparsePCA(...)\n >>> X_transformed = transformer.transform(X)\n >>> X_transformed.shape\n (200, 5)\n >>> # most values in the components_ are zero (sparsity)\n >>> np.mean(transformer.components_ == 0)\n 0.9666...\n \"\"\"\n \n def __init__(self, n_components=None, *, alpha=1, ridge_alpha=0.01, max_iter=1000, tol=1e-08, method='lars', n_jobs=None, U_init=None, V_init=None, verbose=False, random_state=None):\n self.n_components = n_components\n self.alpha = alpha\n self.ridge_alpha = ridge_alpha\n self.max_iter = max_iter\n self.tol = tol\n self.method = method\n self.n_jobs = n_jobs\n self.U_init = U_init\n self.V_init = V_init\n self.verbose = verbose\n self.random_state = random_state\n \n def fit(self, X, y=None):\n \"\"\"Fit the model from data in X.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training vector, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n y : Ignored\n Not used, present here for API consistency by convention.\n\n Returns\n -------\n self : object\n Returns the instance itself.\n \"\"\"\n random_state = check_random_state(self.random_state)\n X = self._validate_data(X)\n self.mean_ = X.mean(axis=0)\n X = X - self.mean_\n if self.n_components is None:\n n_components = X.shape[1]\n else:\n n_components = self.n_components\n code_init = self.V_init.T if self.V_init is not None else None\n dict_init = self.U_init.T if self.U_init is not None else None\n (Vt, _, E, self.n_iter_) = dict_learning(X.T, n_components, alpha=self.alpha, tol=self.tol, max_iter=self.max_iter, method=self.method, n_jobs=self.n_jobs, verbose=self.verbose, random_state=random_state, code_init=code_init, dict_init=dict_init, return_n_iter=True)\n self.components_ = Vt.T\n components_norm = np.linalg.norm(self.components_, axis=1)[:, np.newaxis]\n components_norm[components_norm == 0] = 1\n self.components_ /= components_norm\n self.n_components_ = len(self.components_)\n self.error_ = E\n return self\n \n def transform(self, X):\n \"\"\"Least Squares projection of the data onto the sparse components.\n\n To avoid instability issues in case the system is under-determined,\n regularization can be applied (Ridge regression) via the\n `ridge_alpha` parameter.\n\n Note that Sparse PCA components orthogonality is not enforced as in PCA\n hence one cannot use a simple linear projection.\n\n Parameters\n ----------\n X : ndarray of shape (n_samples, n_features)\n Test data to be transformed, must have the same number of\n features as the data used to train the model.\n\n Returns\n -------\n X_new : ndarray of shape (n_samples, n_components)\n Transformed data.\n \"\"\"\n check_is_fitted(self)\n X = self._validate_data(X, reset=False)\n X = X - self.mean_\n U = 
ridge_regression(self.components_.T, X.T, self.ridge_alpha, solver='cholesky')\n return U\n" }, { "name": "TruncatedSVD", @@ -20691,7 +20757,7 @@ "sklearn.decomposition._truncated_svd.TruncatedSVD._more_tags" ], "is_public": true, - "description": "Dimensionality reduction using truncated SVD (aka LSA).\n\nThis transformer performs linear dimensionality reduction by means of truncated singular value decomposition (SVD). Contrary to PCA, this estimator does not center the data before computing the singular value decomposition. This means it can work with sparse matrices efficiently. In particular, truncated SVD works on term count/tf-idf matrices as returned by the vectorizers in :mod:`sklearn.feature_extraction.text`. In that context, it is known as latent semantic analysis (LSA). This estimator supports two algorithms: a fast randomized SVD solver, and a \"naive\" algorithm that uses ARPACK as an eigensolver on `X * X.T` or `X.T * X`, whichever is more efficient. Read more in the :ref:`User Guide `.", + "description": "Dimensionality reduction using truncated SVD (aka LSA).\n\nThis transformer performs linear dimensionality reduction by means of\ntruncated singular value decomposition (SVD). Contrary to PCA, this\nestimator does not center the data before computing the singular value\ndecomposition. This means it can work with sparse matrices\nefficiently.\n\nIn particular, truncated SVD works on term count/tf-idf matrices as\nreturned by the vectorizers in :mod:`sklearn.feature_extraction.text`. In\nthat context, it is known as latent semantic analysis (LSA).\n\nThis estimator supports two algorithms: a fast randomized SVD solver, and\na \"naive\" algorithm that uses ARPACK as an eigensolver on `X * X.T` or\n`X.T * X`, whichever is more efficient.\n\nRead more in the :ref:`User Guide `.", "docstring": "Dimensionality reduction using truncated SVD (aka LSA).\n\n This transformer performs linear dimensionality reduction by means of\n truncated singular value decomposition (SVD). Contrary to PCA, this\n estimator does not center the data before computing the singular value\n decomposition. This means it can work with sparse matrices\n efficiently.\n\n In particular, truncated SVD works on term count/tf-idf matrices as\n returned by the vectorizers in :mod:`sklearn.feature_extraction.text`. In\n that context, it is known as latent semantic analysis (LSA).\n\n This estimator supports two algorithms: a fast randomized SVD solver, and\n a \"naive\" algorithm that uses ARPACK as an eigensolver on `X * X.T` or\n `X.T * X`, whichever is more efficient.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n n_components : int, default=2\n Desired dimensionality of output data.\n Must be strictly less than the number of features.\n The default value is useful for visualisation. For LSA, a value of\n 100 is recommended.\n\n algorithm : {'arpack', 'randomized'}, default='randomized'\n SVD solver to use. Either \"arpack\" for the ARPACK wrapper in SciPy\n (scipy.sparse.linalg.svds), or \"randomized\" for the randomized\n algorithm due to Halko (2009).\n\n n_iter : int, default=5\n Number of iterations for randomized SVD solver. Not used by ARPACK. The\n default is larger than the default in\n :func:`~sklearn.utils.extmath.randomized_svd` to handle sparse\n matrices that may have large slowly decaying spectrum.\n\n random_state : int, RandomState instance or None, default=None\n Used during randomized svd. 
Pass an int for reproducible results across\n multiple function calls.\n See :term:`Glossary `.\n\n tol : float, default=0.0\n Tolerance for ARPACK. 0 means machine precision. Ignored by randomized\n SVD solver.\n\n Attributes\n ----------\n components_ : ndarray of shape (n_components, n_features)\n The right singular vectors of the input data.\n\n explained_variance_ : ndarray of shape (n_components,)\n The variance of the training samples transformed by a projection to\n each component.\n\n explained_variance_ratio_ : ndarray of shape (n_components,)\n Percentage of variance explained by each of the selected components.\n\n singular_values_ : ndarray od shape (n_components,)\n The singular values corresponding to each of the selected components.\n The singular values are equal to the 2-norms of the ``n_components``\n variables in the lower-dimensional space.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n DictionaryLearning : Find a dictionary that sparsely encodes data.\n FactorAnalysis : A simple linear generative model with\n Gaussian latent variables.\n IncrementalPCA : Incremental principal components analysis.\n KernelPCA : Kernel Principal component analysis.\n NMF : Non-Negative Matrix Factorization.\n PCA : Principal component analysis.\n\n Notes\n -----\n SVD suffers from a problem called \"sign indeterminacy\", which means the\n sign of the ``components_`` and the output from transform depend on the\n algorithm and random state. To work around this, fit instances of this\n class to data once, then keep the instance around to do transformations.\n\n References\n ----------\n Finding structure with randomness: Stochastic algorithms for constructing\n approximate matrix decompositions\n Halko, et al., 2009 (arXiv:909) https://arxiv.org/pdf/0909.4061.pdf\n\n Examples\n --------\n >>> from sklearn.decomposition import TruncatedSVD\n >>> from scipy.sparse import csr_matrix\n >>> import numpy as np\n >>> np.random.seed(0)\n >>> X_dense = np.random.rand(100, 100)\n >>> X_dense[:, 2 * np.arange(50)] = 0\n >>> X = csr_matrix(X_dense)\n >>> svd = TruncatedSVD(n_components=5, n_iter=7, random_state=42)\n >>> svd.fit(X)\n TruncatedSVD(n_components=5, n_iter=7, random_state=42)\n >>> print(svd.explained_variance_ratio_)\n [0.0157... 0.0512... 0.0499... 0.0479... 0.0453...]\n >>> print(svd.explained_variance_ratio_.sum())\n 0.2102...\n >>> print(svd.singular_values_)\n [35.2410... 4.5981... 4.5420... 4.4486... 4.3288...]\n ", "source_code": "\n\nclass TruncatedSVD(TransformerMixin, BaseEstimator):\n \"\"\"Dimensionality reduction using truncated SVD (aka LSA).\n\n This transformer performs linear dimensionality reduction by means of\n truncated singular value decomposition (SVD). Contrary to PCA, this\n estimator does not center the data before computing the singular value\n decomposition. This means it can work with sparse matrices\n efficiently.\n\n In particular, truncated SVD works on term count/tf-idf matrices as\n returned by the vectorizers in :mod:`sklearn.feature_extraction.text`. 
In\n that context, it is known as latent semantic analysis (LSA).\n\n This estimator supports two algorithms: a fast randomized SVD solver, and\n a \"naive\" algorithm that uses ARPACK as an eigensolver on `X * X.T` or\n `X.T * X`, whichever is more efficient.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n n_components : int, default=2\n Desired dimensionality of output data.\n Must be strictly less than the number of features.\n The default value is useful for visualisation. For LSA, a value of\n 100 is recommended.\n\n algorithm : {'arpack', 'randomized'}, default='randomized'\n SVD solver to use. Either \"arpack\" for the ARPACK wrapper in SciPy\n (scipy.sparse.linalg.svds), or \"randomized\" for the randomized\n algorithm due to Halko (2009).\n\n n_iter : int, default=5\n Number of iterations for randomized SVD solver. Not used by ARPACK. The\n default is larger than the default in\n :func:`~sklearn.utils.extmath.randomized_svd` to handle sparse\n matrices that may have large slowly decaying spectrum.\n\n random_state : int, RandomState instance or None, default=None\n Used during randomized svd. Pass an int for reproducible results across\n multiple function calls.\n See :term:`Glossary `.\n\n tol : float, default=0.0\n Tolerance for ARPACK. 0 means machine precision. Ignored by randomized\n SVD solver.\n\n Attributes\n ----------\n components_ : ndarray of shape (n_components, n_features)\n The right singular vectors of the input data.\n\n explained_variance_ : ndarray of shape (n_components,)\n The variance of the training samples transformed by a projection to\n each component.\n\n explained_variance_ratio_ : ndarray of shape (n_components,)\n Percentage of variance explained by each of the selected components.\n\n singular_values_ : ndarray od shape (n_components,)\n The singular values corresponding to each of the selected components.\n The singular values are equal to the 2-norms of the ``n_components``\n variables in the lower-dimensional space.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n DictionaryLearning : Find a dictionary that sparsely encodes data.\n FactorAnalysis : A simple linear generative model with\n Gaussian latent variables.\n IncrementalPCA : Incremental principal components analysis.\n KernelPCA : Kernel Principal component analysis.\n NMF : Non-Negative Matrix Factorization.\n PCA : Principal component analysis.\n\n Notes\n -----\n SVD suffers from a problem called \"sign indeterminacy\", which means the\n sign of the ``components_`` and the output from transform depend on the\n algorithm and random state. 
To work around this, fit instances of this\n class to data once, then keep the instance around to do transformations.\n\n References\n ----------\n Finding structure with randomness: Stochastic algorithms for constructing\n approximate matrix decompositions\n Halko, et al., 2009 (arXiv:909) https://arxiv.org/pdf/0909.4061.pdf\n\n Examples\n --------\n >>> from sklearn.decomposition import TruncatedSVD\n >>> from scipy.sparse import csr_matrix\n >>> import numpy as np\n >>> np.random.seed(0)\n >>> X_dense = np.random.rand(100, 100)\n >>> X_dense[:, 2 * np.arange(50)] = 0\n >>> X = csr_matrix(X_dense)\n >>> svd = TruncatedSVD(n_components=5, n_iter=7, random_state=42)\n >>> svd.fit(X)\n TruncatedSVD(n_components=5, n_iter=7, random_state=42)\n >>> print(svd.explained_variance_ratio_)\n [0.0157... 0.0512... 0.0499... 0.0479... 0.0453...]\n >>> print(svd.explained_variance_ratio_.sum())\n 0.2102...\n >>> print(svd.singular_values_)\n [35.2410... 4.5981... 4.5420... 4.4486... 4.3288...]\n \"\"\"\n \n def __init__(self, n_components=2, *, algorithm='randomized', n_iter=5, random_state=None, tol=0.0):\n self.algorithm = algorithm\n self.n_components = n_components\n self.n_iter = n_iter\n self.random_state = random_state\n self.tol = tol\n \n def fit(self, X, y=None):\n \"\"\"Fit model on training data X.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training data.\n\n y : Ignored\n Not used, present here for API consistency by convention.\n\n Returns\n -------\n self : object\n Returns the transformer object.\n \"\"\"\n self.fit_transform(X)\n return self\n \n def fit_transform(self, X, y=None):\n \"\"\"Fit model to X and perform dimensionality reduction on X.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training data.\n\n y : Ignored\n Not used, present here for API consistency by convention.\n\n Returns\n -------\n X_new : ndarray of shape (n_samples, n_components)\n Reduced version of X. 
This will always be a dense array.\n \"\"\"\n X = self._validate_data(X, accept_sparse=['csr', 'csc'], ensure_min_features=2)\n random_state = check_random_state(self.random_state)\n if self.algorithm == 'arpack':\n v0 = _init_arpack_v0(min(X.shape), random_state)\n (U, Sigma, VT) = svds(X, k=self.n_components, tol=self.tol, v0=v0)\n Sigma = Sigma[::-1]\n (U, VT) = svd_flip(U[:, ::-1], VT[::-1])\n elif self.algorithm == 'randomized':\n k = self.n_components\n n_features = X.shape[1]\n if k >= n_features:\n raise ValueError('n_components must be < n_features; got %d >= %d' % (k, n_features))\n (U, Sigma, VT) = randomized_svd(X, self.n_components, n_iter=self.n_iter, random_state=random_state)\n else:\n raise ValueError('unknown algorithm %r' % self.algorithm)\n self.components_ = VT\n if self.algorithm == 'randomized' or self.algorithm == 'arpack' and self.tol > 0:\n X_transformed = safe_sparse_dot(X, self.components_.T)\n else:\n X_transformed = U * Sigma\n self.explained_variance_ = exp_var = np.var(X_transformed, axis=0)\n if sp.issparse(X):\n (_, full_var) = mean_variance_axis(X, axis=0)\n full_var = full_var.sum()\n else:\n full_var = np.var(X, axis=0).sum()\n self.explained_variance_ratio_ = exp_var / full_var\n self.singular_values_ = Sigma\n return X_transformed\n \n def transform(self, X):\n \"\"\"Perform dimensionality reduction on X.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n New data.\n\n Returns\n -------\n X_new : ndarray of shape (n_samples, n_components)\n Reduced version of X. This will always be a dense array.\n \"\"\"\n check_is_fitted(self)\n X = self._validate_data(X, accept_sparse=['csr', 'csc'], reset=False)\n return safe_sparse_dot(X, self.components_.T)\n \n def inverse_transform(self, X):\n \"\"\"Transform X back to its original space.\n\n Returns an array X_original whose transform would be X.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_components)\n New data.\n\n Returns\n -------\n X_original : ndarray of shape (n_samples, n_features)\n Note that this is always a dense array.\n \"\"\"\n X = check_array(X)\n return np.dot(X, self.components_)\n \n def _more_tags(self):\n return {'preserves_dtype': [np.float64, np.float32]}\n" }, @@ -20716,7 +20782,7 @@ "sklearn.discriminant_analysis.LinearDiscriminantAnalysis.decision_function" ], "is_public": true, - "description": "Linear Discriminant Analysis.\n\nA classifier with a linear decision boundary, generated by fitting class conditional densities to the data and using Bayes' rule. The model fits a Gaussian density to each class, assuming that all classes share the same covariance matrix. The fitted model can also be used to reduce the dimensionality of the input by projecting it to the most discriminative directions, using the `transform` method. .. versionadded:: 0.17 *LinearDiscriminantAnalysis*. Read more in the :ref:`User Guide `.", + "description": "Linear Discriminant Analysis.\n\nA classifier with a linear decision boundary, generated by fitting class\nconditional densities to the data and using Bayes' rule.\n\nThe model fits a Gaussian density to each class, assuming that all classes\nshare the same covariance matrix.\n\nThe fitted model can also be used to reduce the dimensionality of the input\nby projecting it to the most discriminative directions, using the\n`transform` method.\n\n.. 
versionadded:: 0.17\n *LinearDiscriminantAnalysis*.\n\nRead more in the :ref:`User Guide `.", "docstring": "Linear Discriminant Analysis.\n\n A classifier with a linear decision boundary, generated by fitting class\n conditional densities to the data and using Bayes' rule.\n\n The model fits a Gaussian density to each class, assuming that all classes\n share the same covariance matrix.\n\n The fitted model can also be used to reduce the dimensionality of the input\n by projecting it to the most discriminative directions, using the\n `transform` method.\n\n .. versionadded:: 0.17\n *LinearDiscriminantAnalysis*.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n solver : {'svd', 'lsqr', 'eigen'}, default='svd'\n Solver to use, possible values:\n - 'svd': Singular value decomposition (default).\n Does not compute the covariance matrix, therefore this solver is\n recommended for data with a large number of features.\n - 'lsqr': Least squares solution.\n Can be combined with shrinkage or custom covariance estimator.\n - 'eigen': Eigenvalue decomposition.\n Can be combined with shrinkage or custom covariance estimator.\n\n shrinkage : 'auto' or float, default=None\n Shrinkage parameter, possible values:\n - None: no shrinkage (default).\n - 'auto': automatic shrinkage using the Ledoit-Wolf lemma.\n - float between 0 and 1: fixed shrinkage parameter.\n\n This should be left to None if `covariance_estimator` is used.\n Note that shrinkage works only with 'lsqr' and 'eigen' solvers.\n\n priors : array-like of shape (n_classes,), default=None\n The class prior probabilities. By default, the class proportions are\n inferred from the training data.\n\n n_components : int, default=None\n Number of components (<= min(n_classes - 1, n_features)) for\n dimensionality reduction. If None, will be set to\n min(n_classes - 1, n_features). This parameter only affects the\n `transform` method.\n\n store_covariance : bool, default=False\n If True, explicitly compute the weighted within-class covariance\n matrix when solver is 'svd'. The matrix is always computed\n and stored for the other solvers.\n\n .. versionadded:: 0.17\n\n tol : float, default=1.0e-4\n Absolute threshold for a singular value of X to be considered\n significant, used to estimate the rank of X. Dimensions whose\n singular values are non-significant are discarded. Only used if\n solver is 'svd'.\n\n .. versionadded:: 0.17\n\n covariance_estimator : covariance estimator, default=None\n If not None, `covariance_estimator` is used to estimate\n the covariance matrices instead of relying on the empirical\n covariance estimator (with potential shrinkage).\n The object should have a fit method and a ``covariance_`` attribute\n like the estimators in :mod:`sklearn.covariance`.\n if None the shrinkage parameter drives the estimate.\n\n This should be left to None if `shrinkage` is used.\n Note that `covariance_estimator` works only with 'lsqr' and 'eigen'\n solvers.\n\n .. versionadded:: 0.24\n\n Attributes\n ----------\n coef_ : ndarray of shape (n_features,) or (n_classes, n_features)\n Weight vector(s).\n\n intercept_ : ndarray of shape (n_classes,)\n Intercept term.\n\n covariance_ : array-like of shape (n_features, n_features)\n Weighted within-class covariance matrix. It corresponds to\n `sum_k prior_k * C_k` where `C_k` is the covariance matrix of the\n samples in class `k`. The `C_k` are estimated using the (potentially\n shrunk) biased estimator of covariance. 
If solver is 'svd', only\n exists when `store_covariance` is True.\n\n explained_variance_ratio_ : ndarray of shape (n_components,)\n Percentage of variance explained by each of the selected components.\n If ``n_components`` is not set then all components are stored and the\n sum of explained variances is equal to 1.0. Only available when eigen\n or svd solver is used.\n\n means_ : array-like of shape (n_classes, n_features)\n Class-wise means.\n\n priors_ : array-like of shape (n_classes,)\n Class priors (sum to 1).\n\n scalings_ : array-like of shape (rank, n_classes - 1)\n Scaling of the features in the space spanned by the class centroids.\n Only available for 'svd' and 'eigen' solvers.\n\n xbar_ : array-like of shape (n_features,)\n Overall mean. Only present if solver is 'svd'.\n\n classes_ : array-like of shape (n_classes,)\n Unique class labels.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n QuadraticDiscriminantAnalysis : Quadratic Discriminant Analysis.\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.discriminant_analysis import LinearDiscriminantAnalysis\n >>> X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])\n >>> y = np.array([1, 1, 1, 2, 2, 2])\n >>> clf = LinearDiscriminantAnalysis()\n >>> clf.fit(X, y)\n LinearDiscriminantAnalysis()\n >>> print(clf.predict([[-0.8, -1]]))\n [1]\n ", "source_code": "\n\nclass LinearDiscriminantAnalysis(LinearClassifierMixin, TransformerMixin, BaseEstimator):\n \"\"\"Linear Discriminant Analysis.\n\n A classifier with a linear decision boundary, generated by fitting class\n conditional densities to the data and using Bayes' rule.\n\n The model fits a Gaussian density to each class, assuming that all classes\n share the same covariance matrix.\n\n The fitted model can also be used to reduce the dimensionality of the input\n by projecting it to the most discriminative directions, using the\n `transform` method.\n\n .. versionadded:: 0.17\n *LinearDiscriminantAnalysis*.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n solver : {'svd', 'lsqr', 'eigen'}, default='svd'\n Solver to use, possible values:\n - 'svd': Singular value decomposition (default).\n Does not compute the covariance matrix, therefore this solver is\n recommended for data with a large number of features.\n - 'lsqr': Least squares solution.\n Can be combined with shrinkage or custom covariance estimator.\n - 'eigen': Eigenvalue decomposition.\n Can be combined with shrinkage or custom covariance estimator.\n\n shrinkage : 'auto' or float, default=None\n Shrinkage parameter, possible values:\n - None: no shrinkage (default).\n - 'auto': automatic shrinkage using the Ledoit-Wolf lemma.\n - float between 0 and 1: fixed shrinkage parameter.\n\n This should be left to None if `covariance_estimator` is used.\n Note that shrinkage works only with 'lsqr' and 'eigen' solvers.\n\n priors : array-like of shape (n_classes,), default=None\n The class prior probabilities. By default, the class proportions are\n inferred from the training data.\n\n n_components : int, default=None\n Number of components (<= min(n_classes - 1, n_features)) for\n dimensionality reduction. If None, will be set to\n min(n_classes - 1, n_features). 
This parameter only affects the\n `transform` method.\n\n store_covariance : bool, default=False\n If True, explicitly compute the weighted within-class covariance\n matrix when solver is 'svd'. The matrix is always computed\n and stored for the other solvers.\n\n .. versionadded:: 0.17\n\n tol : float, default=1.0e-4\n Absolute threshold for a singular value of X to be considered\n significant, used to estimate the rank of X. Dimensions whose\n singular values are non-significant are discarded. Only used if\n solver is 'svd'.\n\n .. versionadded:: 0.17\n\n covariance_estimator : covariance estimator, default=None\n If not None, `covariance_estimator` is used to estimate\n the covariance matrices instead of relying on the empirical\n covariance estimator (with potential shrinkage).\n The object should have a fit method and a ``covariance_`` attribute\n like the estimators in :mod:`sklearn.covariance`.\n if None the shrinkage parameter drives the estimate.\n\n This should be left to None if `shrinkage` is used.\n Note that `covariance_estimator` works only with 'lsqr' and 'eigen'\n solvers.\n\n .. versionadded:: 0.24\n\n Attributes\n ----------\n coef_ : ndarray of shape (n_features,) or (n_classes, n_features)\n Weight vector(s).\n\n intercept_ : ndarray of shape (n_classes,)\n Intercept term.\n\n covariance_ : array-like of shape (n_features, n_features)\n Weighted within-class covariance matrix. It corresponds to\n `sum_k prior_k * C_k` where `C_k` is the covariance matrix of the\n samples in class `k`. The `C_k` are estimated using the (potentially\n shrunk) biased estimator of covariance. If solver is 'svd', only\n exists when `store_covariance` is True.\n\n explained_variance_ratio_ : ndarray of shape (n_components,)\n Percentage of variance explained by each of the selected components.\n If ``n_components`` is not set then all components are stored and the\n sum of explained variances is equal to 1.0. Only available when eigen\n or svd solver is used.\n\n means_ : array-like of shape (n_classes, n_features)\n Class-wise means.\n\n priors_ : array-like of shape (n_classes,)\n Class priors (sum to 1).\n\n scalings_ : array-like of shape (rank, n_classes - 1)\n Scaling of the features in the space spanned by the class centroids.\n Only available for 'svd' and 'eigen' solvers.\n\n xbar_ : array-like of shape (n_features,)\n Overall mean. Only present if solver is 'svd'.\n\n classes_ : array-like of shape (n_classes,)\n Unique class labels.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. 
versionadded:: 1.0\n\n See Also\n --------\n QuadraticDiscriminantAnalysis : Quadratic Discriminant Analysis.\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.discriminant_analysis import LinearDiscriminantAnalysis\n >>> X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])\n >>> y = np.array([1, 1, 1, 2, 2, 2])\n >>> clf = LinearDiscriminantAnalysis()\n >>> clf.fit(X, y)\n LinearDiscriminantAnalysis()\n >>> print(clf.predict([[-0.8, -1]]))\n [1]\n \"\"\"\n \n def __init__(self, solver='svd', shrinkage=None, priors=None, n_components=None, store_covariance=False, tol=0.0001, covariance_estimator=None):\n self.solver = solver\n self.shrinkage = shrinkage\n self.priors = priors\n self.n_components = n_components\n self.store_covariance = store_covariance\n self.tol = tol\n self.covariance_estimator = covariance_estimator\n \n def _solve_lsqr(self, X, y, shrinkage, covariance_estimator):\n \"\"\"Least squares solver.\n\n The least squares solver computes a straightforward solution of the\n optimal decision rule based directly on the discriminant functions. It\n can only be used for classification (with any covariance estimator),\n because\n estimation of eigenvectors is not performed. Therefore, dimensionality\n reduction with the transform is not supported.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training data.\n\n y : array-like of shape (n_samples,) or (n_samples, n_classes)\n Target values.\n\n shrinkage : 'auto', float or None\n Shrinkage parameter, possible values:\n - None: no shrinkage.\n - 'auto': automatic shrinkage using the Ledoit-Wolf lemma.\n - float between 0 and 1: fixed shrinkage parameter.\n\n Shrinkage parameter is ignored if `covariance_estimator` i\n not None\n\n covariance_estimator : estimator, default=None\n If not None, `covariance_estimator` is used to estimate\n the covariance matrices instead of relying the empirical\n covariance estimator (with potential shrinkage).\n The object should have a fit method and a ``covariance_`` attribute\n like the estimators in sklearn.covariance.\n if None the shrinkage parameter drives the estimate.\n\n .. versionadded:: 0.24\n\n Notes\n -----\n This solver is based on [1]_, section 2.6.2, pp. 39-41.\n\n References\n ----------\n .. [1] R. O. Duda, P. E. Hart, D. G. Stork. Pattern Classification\n (Second Edition). John Wiley & Sons, Inc., New York, 2001. ISBN\n 0-471-05669-3.\n \"\"\"\n self.means_ = _class_means(X, y)\n self.covariance_ = _class_cov(X, y, self.priors_, shrinkage, covariance_estimator)\n self.coef_ = linalg.lstsq(self.covariance_, self.means_.T)[0].T\n self.intercept_ = -0.5 * np.diag(np.dot(self.means_, self.coef_.T)) + np.log(self.priors_)\n \n def _solve_eigen(self, X, y, shrinkage, covariance_estimator):\n \"\"\"Eigenvalue solver.\n\n The eigenvalue solver computes the optimal solution of the Rayleigh\n coefficient (basically the ratio of between class scatter to within\n class scatter). 
This solver supports both classification and\n dimensionality reduction (with any covariance estimator).\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training data.\n\n y : array-like of shape (n_samples,) or (n_samples, n_targets)\n Target values.\n\n shrinkage : 'auto', float or None\n Shrinkage parameter, possible values:\n - None: no shrinkage.\n - 'auto': automatic shrinkage using the Ledoit-Wolf lemma.\n - float between 0 and 1: fixed shrinkage constant.\n\n Shrinkage parameter is ignored if `covariance_estimator` i\n not None\n\n covariance_estimator : estimator, default=None\n If not None, `covariance_estimator` is used to estimate\n the covariance matrices instead of relying the empirical\n covariance estimator (with potential shrinkage).\n The object should have a fit method and a ``covariance_`` attribute\n like the estimators in sklearn.covariance.\n if None the shrinkage parameter drives the estimate.\n\n .. versionadded:: 0.24\n\n Notes\n -----\n This solver is based on [1]_, section 3.8.3, pp. 121-124.\n\n References\n ----------\n .. [1] R. O. Duda, P. E. Hart, D. G. Stork. Pattern Classification\n (Second Edition). John Wiley & Sons, Inc., New York, 2001. ISBN\n 0-471-05669-3.\n \"\"\"\n self.means_ = _class_means(X, y)\n self.covariance_ = _class_cov(X, y, self.priors_, shrinkage, covariance_estimator)\n Sw = self.covariance_\n St = _cov(X, shrinkage, covariance_estimator)\n Sb = St - Sw\n (evals, evecs) = linalg.eigh(Sb, Sw)\n self.explained_variance_ratio_ = np.sort(evals / np.sum(evals))[::-1][:self._max_components]\n evecs = evecs[:, np.argsort(evals)[::-1]]\n self.scalings_ = evecs\n self.coef_ = np.dot(self.means_, evecs).dot(evecs.T)\n self.intercept_ = -0.5 * np.diag(np.dot(self.means_, self.coef_.T)) + np.log(self.priors_)\n \n def _solve_svd(self, X, y):\n \"\"\"SVD solver.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training data.\n\n y : array-like of shape (n_samples,) or (n_samples, n_targets)\n Target values.\n \"\"\"\n (n_samples, n_features) = X.shape\n n_classes = len(self.classes_)\n self.means_ = _class_means(X, y)\n if self.store_covariance:\n self.covariance_ = _class_cov(X, y, self.priors_)\n Xc = []\n for (idx, group) in enumerate(self.classes_):\n Xg = X[y == group, :]\n Xc.append(Xg - self.means_[idx])\n self.xbar_ = np.dot(self.priors_, self.means_)\n Xc = np.concatenate(Xc, axis=0)\n std = Xc.std(axis=0)\n std[std == 0] = 1.0\n fac = 1.0 / (n_samples - n_classes)\n X = np.sqrt(fac) * (Xc / std)\n (U, S, Vt) = linalg.svd(X, full_matrices=False)\n rank = np.sum(S > self.tol)\n scalings = (Vt[:rank] / std).T / S[:rank]\n X = np.dot((np.sqrt(n_samples * self.priors_ * fac) * (self.means_ - self.xbar_).T).T, scalings)\n (_, S, Vt) = linalg.svd(X, full_matrices=0)\n if self._max_components == 0:\n self.explained_variance_ratio_ = np.empty((0, ), dtype=S.dtype)\n else:\n self.explained_variance_ratio_ = (S**2 / np.sum(S**2))[:self._max_components]\n rank = np.sum(S > self.tol * S[0])\n self.scalings_ = np.dot(scalings, Vt.T[:, :rank])\n coef = np.dot(self.means_ - self.xbar_, self.scalings_)\n self.intercept_ = -0.5 * np.sum(coef**2, axis=1) + np.log(self.priors_)\n self.coef_ = np.dot(coef, self.scalings_.T)\n self.intercept_ -= np.dot(self.xbar_, self.coef_.T)\n \n def fit(self, X, y):\n \"\"\"Fit the Linear Discriminant Analysis model.\n\n .. versionchanged:: 0.19\n *store_covariance* has been moved to main constructor.\n\n .. 
versionchanged:: 0.19\n *tol* has been moved to main constructor.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training data.\n\n y : array-like of shape (n_samples,)\n Target values.\n\n Returns\n -------\n self : object\n Fitted estimator.\n \"\"\"\n (X, y) = self._validate_data(X, y, ensure_min_samples=2, estimator=self, dtype=[np.float64, np.float32])\n self.classes_ = unique_labels(y)\n (n_samples, _) = X.shape\n n_classes = len(self.classes_)\n if n_samples == n_classes:\n raise ValueError('The number of samples must be more than the number of classes.')\n if self.priors is None:\n (_, y_t) = np.unique(y, return_inverse=True)\n self.priors_ = np.bincount(y_t) / float(len(y))\n else:\n self.priors_ = np.asarray(self.priors)\n if (self.priors_ < 0).any():\n raise ValueError('priors must be non-negative')\n if not np.isclose(self.priors_.sum(), 1.0):\n warnings.warn('The priors do not sum to 1. Renormalizing', UserWarning)\n self.priors_ = self.priors_ / self.priors_.sum()\n max_components = min(len(self.classes_) - 1, X.shape[1])\n if self.n_components is None:\n self._max_components = max_components\n else:\n if self.n_components > max_components:\n raise ValueError('n_components cannot be larger than min(n_features, n_classes - 1).')\n self._max_components = self.n_components\n if self.solver == 'svd':\n if self.shrinkage is not None:\n raise NotImplementedError('shrinkage not supported')\n if self.covariance_estimator is not None:\n raise ValueError('covariance estimator is not supported with svd solver. Try another solver')\n self._solve_svd(X, y)\n elif self.solver == 'lsqr':\n self._solve_lsqr(X, y, shrinkage=self.shrinkage, covariance_estimator=self.covariance_estimator)\n elif self.solver == 'eigen':\n self._solve_eigen(X, y, shrinkage=self.shrinkage, covariance_estimator=self.covariance_estimator)\n else:\n raise ValueError(\"unknown solver {} (valid solvers are 'svd', 'lsqr', and 'eigen').\".format(self.solver))\n if self.classes_.size == 2:\n self.coef_ = np.array(self.coef_[1, :] - self.coef_[0, :], ndmin=2, dtype=X.dtype)\n self.intercept_ = np.array(self.intercept_[1] - self.intercept_[0], ndmin=1, dtype=X.dtype)\n return self\n \n def transform(self, X):\n \"\"\"Project data to maximize class separation.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Input data.\n\n Returns\n -------\n X_new : ndarray of shape (n_samples, n_components)\n Transformed data.\n \"\"\"\n if self.solver == 'lsqr':\n raise NotImplementedError(\"transform not implemented for 'lsqr' solver (use 'svd' or 'eigen').\")\n check_is_fitted(self)\n X = self._validate_data(X, reset=False)\n if self.solver == 'svd':\n X_new = np.dot(X - self.xbar_, self.scalings_)\n elif self.solver == 'eigen':\n X_new = np.dot(X, self.scalings_)\n return X_new[:, :self._max_components]\n \n def predict_proba(self, X):\n \"\"\"Estimate probability.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Input data.\n\n Returns\n -------\n C : ndarray of shape (n_samples, n_classes)\n Estimated probabilities.\n \"\"\"\n check_is_fitted(self)\n decision = self.decision_function(X)\n if self.classes_.size == 2:\n proba = expit(decision)\n return np.vstack([1 - proba, proba]).T\n else:\n return softmax(decision)\n \n def predict_log_proba(self, X):\n \"\"\"Estimate log probability.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Input data.\n\n Returns\n -------\n C : ndarray of shape (n_samples, 
n_classes)\n Estimated log probabilities.\n \"\"\"\n prediction = self.predict_proba(X)\n prediction[prediction == 0.0] += np.finfo(prediction.dtype).tiny\n return np.log(prediction)\n \n def decision_function(self, X):\n \"\"\"Apply decision function to an array of samples.\n\n The decision function is equal (up to a constant factor) to the\n log-posterior of the model, i.e. `log p(y = k | x)`. In a binary\n classification setting this instead corresponds to the difference\n `log p(y = 1 | x) - log p(y = 0 | x)`. See :ref:`lda_qda_math`.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Array of samples (test vectors).\n\n Returns\n -------\n C : ndarray of shape (n_samples,) or (n_samples, n_classes)\n Decision function values related to each class, per sample.\n In the two-class case, the shape is (n_samples,), giving the\n log likelihood ratio of the positive class.\n \"\"\"\n return super().decision_function(X)\n" }, @@ -20735,7 +20801,7 @@ "sklearn.discriminant_analysis.QuadraticDiscriminantAnalysis.predict_log_proba" ], "is_public": true, - "description": "Quadratic Discriminant Analysis.\n\nA classifier with a quadratic decision boundary, generated by fitting class conditional densities to the data and using Bayes' rule. The model fits a Gaussian density to each class. .. versionadded:: 0.17 *QuadraticDiscriminantAnalysis* Read more in the :ref:`User Guide `.", + "description": "Quadratic Discriminant Analysis.\n\nA classifier with a quadratic decision boundary, generated\nby fitting class conditional densities to the data\nand using Bayes' rule.\n\nThe model fits a Gaussian density to each class.\n\n.. versionadded:: 0.17\n *QuadraticDiscriminantAnalysis*\n\nRead more in the :ref:`User Guide `.", "docstring": "Quadratic Discriminant Analysis.\n\n A classifier with a quadratic decision boundary, generated\n by fitting class conditional densities to the data\n and using Bayes' rule.\n\n The model fits a Gaussian density to each class.\n\n .. versionadded:: 0.17\n *QuadraticDiscriminantAnalysis*\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n priors : ndarray of shape (n_classes,), default=None\n Class priors. By default, the class proportions are inferred from the\n training data.\n\n reg_param : float, default=0.0\n Regularizes the per-class covariance estimates by transforming S2 as\n ``S2 = (1 - reg_param) * S2 + reg_param * np.eye(n_features)``,\n where S2 corresponds to the `scaling_` attribute of a given class.\n\n store_covariance : bool, default=False\n If True, the class covariance matrices are explicitly computed and\n stored in the `self.covariance_` attribute.\n\n .. versionadded:: 0.17\n\n tol : float, default=1.0e-4\n Absolute threshold for a singular value to be considered significant,\n used to estimate the rank of `Xk` where `Xk` is the centered matrix\n of samples in class k. This parameter does not affect the\n predictions. It only controls a warning that is raised when features\n are considered to be colinear.\n\n .. versionadded:: 0.17\n\n Attributes\n ----------\n covariance_ : list of len n_classes of ndarray of shape (n_features, n_features)\n For each class, gives the covariance matrix estimated using the\n samples of that class. The estimations are unbiased. 
Only present if\n `store_covariance` is True.\n\n means_ : array-like of shape (n_classes, n_features)\n Class-wise means.\n\n priors_ : array-like of shape (n_classes,)\n Class priors (sum to 1).\n\n rotations_ : list of len n_classes of ndarray of shape (n_features, n_k)\n For each class k an array of shape (n_features, n_k), where\n ``n_k = min(n_features, number of elements in class k)``\n It is the rotation of the Gaussian distribution, i.e. its\n principal axis. It corresponds to `V`, the matrix of eigenvectors\n coming from the SVD of `Xk = U S Vt` where `Xk` is the centered\n matrix of samples from class k.\n\n scalings_ : list of len n_classes of ndarray of shape (n_k,)\n For each class, contains the scaling of\n the Gaussian distributions along its principal axes, i.e. the\n variance in the rotated coordinate system. It corresponds to `S^2 /\n (n_samples - 1)`, where `S` is the diagonal matrix of singular values\n from the SVD of `Xk`, where `Xk` is the centered matrix of samples\n from class k.\n\n classes_ : ndarray of shape (n_classes,)\n Unique class labels.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n LinearDiscriminantAnalysis : Linear Discriminant Analysis.\n\n Examples\n --------\n >>> from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis\n >>> import numpy as np\n >>> X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])\n >>> y = np.array([1, 1, 1, 2, 2, 2])\n >>> clf = QuadraticDiscriminantAnalysis()\n >>> clf.fit(X, y)\n QuadraticDiscriminantAnalysis()\n >>> print(clf.predict([[-0.8, -1]]))\n [1]\n ", "source_code": "\n\nclass QuadraticDiscriminantAnalysis(ClassifierMixin, BaseEstimator):\n \"\"\"Quadratic Discriminant Analysis.\n\n A classifier with a quadratic decision boundary, generated\n by fitting class conditional densities to the data\n and using Bayes' rule.\n\n The model fits a Gaussian density to each class.\n\n .. versionadded:: 0.17\n *QuadraticDiscriminantAnalysis*\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n priors : ndarray of shape (n_classes,), default=None\n Class priors. By default, the class proportions are inferred from the\n training data.\n\n reg_param : float, default=0.0\n Regularizes the per-class covariance estimates by transforming S2 as\n ``S2 = (1 - reg_param) * S2 + reg_param * np.eye(n_features)``,\n where S2 corresponds to the `scaling_` attribute of a given class.\n\n store_covariance : bool, default=False\n If True, the class covariance matrices are explicitly computed and\n stored in the `self.covariance_` attribute.\n\n .. versionadded:: 0.17\n\n tol : float, default=1.0e-4\n Absolute threshold for a singular value to be considered significant,\n used to estimate the rank of `Xk` where `Xk` is the centered matrix\n of samples in class k. This parameter does not affect the\n predictions. It only controls a warning that is raised when features\n are considered to be colinear.\n\n .. versionadded:: 0.17\n\n Attributes\n ----------\n covariance_ : list of len n_classes of ndarray of shape (n_features, n_features)\n For each class, gives the covariance matrix estimated using the\n samples of that class. The estimations are unbiased. 
Only present if\n `store_covariance` is True.\n\n means_ : array-like of shape (n_classes, n_features)\n Class-wise means.\n\n priors_ : array-like of shape (n_classes,)\n Class priors (sum to 1).\n\n rotations_ : list of len n_classes of ndarray of shape (n_features, n_k)\n For each class k an array of shape (n_features, n_k), where\n ``n_k = min(n_features, number of elements in class k)``\n It is the rotation of the Gaussian distribution, i.e. its\n principal axis. It corresponds to `V`, the matrix of eigenvectors\n coming from the SVD of `Xk = U S Vt` where `Xk` is the centered\n matrix of samples from class k.\n\n scalings_ : list of len n_classes of ndarray of shape (n_k,)\n For each class, contains the scaling of\n the Gaussian distributions along its principal axes, i.e. the\n variance in the rotated coordinate system. It corresponds to `S^2 /\n (n_samples - 1)`, where `S` is the diagonal matrix of singular values\n from the SVD of `Xk`, where `Xk` is the centered matrix of samples\n from class k.\n\n classes_ : ndarray of shape (n_classes,)\n Unique class labels.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n LinearDiscriminantAnalysis : Linear Discriminant Analysis.\n\n Examples\n --------\n >>> from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis\n >>> import numpy as np\n >>> X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])\n >>> y = np.array([1, 1, 1, 2, 2, 2])\n >>> clf = QuadraticDiscriminantAnalysis()\n >>> clf.fit(X, y)\n QuadraticDiscriminantAnalysis()\n >>> print(clf.predict([[-0.8, -1]]))\n [1]\n \"\"\"\n \n def __init__(self, *, priors=None, reg_param=0.0, store_covariance=False, tol=0.0001):\n self.priors = np.asarray(priors) if priors is not None else None\n self.reg_param = reg_param\n self.store_covariance = store_covariance\n self.tol = tol\n \n def fit(self, X, y):\n \"\"\"Fit the model according to the given training data and parameters.\n\n .. versionchanged:: 0.19\n ``store_covariances`` has been moved to main constructor as\n ``store_covariance``\n\n .. versionchanged:: 0.19\n ``tol`` has been moved to main constructor.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training vector, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\n y : array-like of shape (n_samples,)\n Target values (integers).\n\n Returns\n -------\n self : object\n Fitted estimator.\n \"\"\"\n (X, y) = self._validate_data(X, y)\n check_classification_targets(y)\n (self.classes_, y) = np.unique(y, return_inverse=True)\n (n_samples, n_features) = X.shape\n n_classes = len(self.classes_)\n if n_classes < 2:\n raise ValueError('The number of classes has to be greater than one; got %d class' % n_classes)\n if self.priors is None:\n self.priors_ = np.bincount(y) / float(n_samples)\n else:\n self.priors_ = self.priors\n cov = None\n store_covariance = self.store_covariance\n if store_covariance:\n cov = []\n means = []\n scalings = []\n rotations = []\n for ind in range(n_classes):\n Xg = X[y == ind, :]\n meang = Xg.mean(0)\n means.append(meang)\n if len(Xg) == 1:\n raise ValueError('y has only 1 sample in class %s, covariance is ill defined.' 
% str(self.classes_[ind]))\n Xgc = Xg - meang\n (_, S, Vt) = np.linalg.svd(Xgc, full_matrices=False)\n rank = np.sum(S > self.tol)\n if rank < n_features:\n warnings.warn('Variables are collinear')\n S2 = S**2 / (len(Xg) - 1)\n S2 = (1 - self.reg_param) * S2 + self.reg_param\n if self.store_covariance or store_covariance:\n cov.append(np.dot(S2 * Vt.T, Vt))\n scalings.append(S2)\n rotations.append(Vt.T)\n if self.store_covariance or store_covariance:\n self.covariance_ = cov\n self.means_ = np.asarray(means)\n self.scalings_ = scalings\n self.rotations_ = rotations\n return self\n \n def _decision_function(self, X):\n check_is_fitted(self)\n X = self._validate_data(X, reset=False)\n norm2 = []\n for i in range(len(self.classes_)):\n R = self.rotations_[i]\n S = self.scalings_[i]\n Xm = X - self.means_[i]\n X2 = np.dot(Xm, R * S**(-0.5))\n norm2.append(np.sum(X2**2, axis=1))\n norm2 = np.array(norm2).T\n u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])\n return -0.5 * (norm2 + u) + np.log(self.priors_)\n \n def decision_function(self, X):\n \"\"\"Apply decision function to an array of samples.\n\n The decision function is equal (up to a constant factor) to the\n log-posterior of the model, i.e. `log p(y = k | x)`. In a binary\n classification setting this instead corresponds to the difference\n `log p(y = 1 | x) - log p(y = 0 | x)`. See :ref:`lda_qda_math`.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Array of samples (test vectors).\n\n Returns\n -------\n C : ndarray of shape (n_samples,) or (n_samples, n_classes)\n Decision function values related to each class, per sample.\n In the two-class case, the shape is (n_samples,), giving the\n log likelihood ratio of the positive class.\n \"\"\"\n dec_func = self._decision_function(X)\n if len(self.classes_) == 2:\n return dec_func[:, 1] - dec_func[:, 0]\n return dec_func\n \n def predict(self, X):\n \"\"\"Perform classification on an array of test vectors X.\n\n The predicted class C for each sample in X is returned.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Vector to be scored, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\n Returns\n -------\n C : ndarray of shape (n_samples,)\n Estimated probabilities.\n \"\"\"\n d = self._decision_function(X)\n y_pred = self.classes_.take(d.argmax(1))\n return y_pred\n \n def predict_proba(self, X):\n \"\"\"Return posterior probabilities of classification.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Array of samples/test vectors.\n\n Returns\n -------\n C : ndarray of shape (n_samples, n_classes)\n Posterior probabilities of classification per class.\n \"\"\"\n values = self._decision_function(X)\n likelihood = np.exp(values - values.max(axis=1)[:, np.newaxis])\n return likelihood / likelihood.sum(axis=1)[:, np.newaxis]\n \n def predict_log_proba(self, X):\n \"\"\"Return log of posterior probabilities of classification.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Array of samples/test vectors.\n\n Returns\n -------\n C : ndarray of shape (n_samples, n_classes)\n Posterior log-probabilities of classification per class.\n \"\"\"\n probas_ = self.predict_proba(X)\n return np.log(probas_)\n" }, @@ -20759,9 +20825,9 @@ "sklearn.dummy.DummyClassifier.n_features_in_@getter" ], "is_public": true, - "description": "DummyClassifier is a classifier that makes predictions using simple rules.\n\nThis classifier is useful 
as a simple baseline to compare with other (real) classifiers. Do not use it for real problems. Read more in the :ref:`User Guide `. .. versionadded:: 0.13", - "docstring": "\n DummyClassifier is a classifier that makes predictions using simple rules.\n\n This classifier is useful as a simple baseline to compare with other\n (real) classifiers. Do not use it for real problems.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.13\n\n Parameters\n ----------\n strategy : {\"stratified\", \"most_frequent\", \"prior\", \"uniform\", \"constant\"}, default=\"prior\"\n Strategy to use to generate predictions.\n\n * \"stratified\": generates predictions by respecting the training\n set's class distribution.\n * \"most_frequent\": always predicts the most frequent label in the\n training set.\n * \"prior\": always predicts the class that maximizes the class prior\n (like \"most_frequent\") and ``predict_proba`` returns the class prior.\n * \"uniform\": generates predictions uniformly at random.\n * \"constant\": always predicts a constant label that is provided by\n the user. This is useful for metrics that evaluate a non-majority\n class\n\n .. versionchanged:: 0.24\n The default value of `strategy` has changed to \"prior\" in version\n 0.24.\n\n random_state : int, RandomState instance or None, default=None\n Controls the randomness to generate the predictions when\n ``strategy='stratified'`` or ``strategy='uniform'``.\n Pass an int for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n constant : int or str or array-like of shape (n_outputs,)\n The explicit constant as predicted by the \"constant\" strategy. This\n parameter is useful only for the \"constant\" strategy.\n\n Attributes\n ----------\n classes_ : ndarray of shape (n_classes,) or list of such arrays\n Class labels for each output.\n\n n_classes_ : int or list of int\n Number of label for each output.\n\n class_prior_ : ndarray of shape (n_classes,) or list of such arrays\n Probability of each class for each output.\n\n n_outputs_ : int\n Number of outputs.\n\n n_features_in_ : `None`\n Always set to `None`.\n\n .. versionadded:: 0.24\n .. deprecated:: 1.0\n Will be removed in 1.0\n\n sparse_output_ : bool\n True if the array returned from predict is to be in sparse CSC format.\n Is automatically set to True if the input y is passed in sparse format.\n\n See Also\n --------\n DummyRegressor : Regressor that makes predictions using simple rules.\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.dummy import DummyClassifier\n >>> X = np.array([-1, 1, 1, 1])\n >>> y = np.array([0, 1, 1, 1])\n >>> dummy_clf = DummyClassifier(strategy=\"most_frequent\")\n >>> dummy_clf.fit(X, y)\n DummyClassifier(strategy='most_frequent')\n >>> dummy_clf.predict(X)\n array([1, 1, 1, 1])\n >>> dummy_clf.score(X, y)\n 0.75\n ", - "source_code": "\n\nclass DummyClassifier(MultiOutputMixin, ClassifierMixin, BaseEstimator):\n \"\"\"\n DummyClassifier is a classifier that makes predictions using simple rules.\n\n This classifier is useful as a simple baseline to compare with other\n (real) classifiers. Do not use it for real problems.\n\n Read more in the :ref:`User Guide `.\n\n .. 
versionadded:: 0.13\n\n Parameters\n ----------\n strategy : {\"stratified\", \"most_frequent\", \"prior\", \"uniform\", \"constant\"}, default=\"prior\"\n Strategy to use to generate predictions.\n\n * \"stratified\": generates predictions by respecting the training\n set's class distribution.\n * \"most_frequent\": always predicts the most frequent label in the\n training set.\n * \"prior\": always predicts the class that maximizes the class prior\n (like \"most_frequent\") and ``predict_proba`` returns the class prior.\n * \"uniform\": generates predictions uniformly at random.\n * \"constant\": always predicts a constant label that is provided by\n the user. This is useful for metrics that evaluate a non-majority\n class\n\n .. versionchanged:: 0.24\n The default value of `strategy` has changed to \"prior\" in version\n 0.24.\n\n random_state : int, RandomState instance or None, default=None\n Controls the randomness to generate the predictions when\n ``strategy='stratified'`` or ``strategy='uniform'``.\n Pass an int for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n constant : int or str or array-like of shape (n_outputs,)\n The explicit constant as predicted by the \"constant\" strategy. This\n parameter is useful only for the \"constant\" strategy.\n\n Attributes\n ----------\n classes_ : ndarray of shape (n_classes,) or list of such arrays\n Class labels for each output.\n\n n_classes_ : int or list of int\n Number of label for each output.\n\n class_prior_ : ndarray of shape (n_classes,) or list of such arrays\n Probability of each class for each output.\n\n n_outputs_ : int\n Number of outputs.\n\n n_features_in_ : `None`\n Always set to `None`.\n\n .. versionadded:: 0.24\n .. deprecated:: 1.0\n Will be removed in 1.0\n\n sparse_output_ : bool\n True if the array returned from predict is to be in sparse CSC format.\n Is automatically set to True if the input y is passed in sparse format.\n\n See Also\n --------\n DummyRegressor : Regressor that makes predictions using simple rules.\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.dummy import DummyClassifier\n >>> X = np.array([-1, 1, 1, 1])\n >>> y = np.array([0, 1, 1, 1])\n >>> dummy_clf = DummyClassifier(strategy=\"most_frequent\")\n >>> dummy_clf.fit(X, y)\n DummyClassifier(strategy='most_frequent')\n >>> dummy_clf.predict(X)\n array([1, 1, 1, 1])\n >>> dummy_clf.score(X, y)\n 0.75\n \"\"\"\n \n def __init__(self, *, strategy='prior', random_state=None, constant=None):\n self.strategy = strategy\n self.random_state = random_state\n self.constant = constant\n \n def fit(self, X, y, sample_weight=None):\n \"\"\"Fit the random classifier.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training data.\n\n y : array-like of shape (n_samples,) or (n_samples, n_outputs)\n Target values.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n Returns\n -------\n self : object\n Returns the instance itself.\n \"\"\"\n allowed_strategies = ('most_frequent', 'stratified', 'uniform', 'constant', 'prior')\n if self.strategy not in allowed_strategies:\n raise ValueError('Unknown strategy type: %s, expected one of %s.' % (self.strategy, allowed_strategies))\n self._strategy = self.strategy\n if self._strategy == 'uniform' and sp.issparse(y):\n y = y.toarray()\n warnings.warn('A local copy of the target data has been converted to a numpy array. 
Predicting on sparse target data with the uniform strategy would not save memory and would be slower.', UserWarning)\n self.sparse_output_ = sp.issparse(y)\n if not self.sparse_output_:\n y = np.asarray(y)\n y = np.atleast_1d(y)\n if y.ndim == 1:\n y = np.reshape(y, (-1, 1))\n self.n_outputs_ = y.shape[1]\n check_consistent_length(X, y)\n if sample_weight is not None:\n sample_weight = _check_sample_weight(sample_weight, X)\n if self._strategy == 'constant':\n if self.constant is None:\n raise ValueError('Constant target value has to be specified when the constant strategy is used.')\n else:\n constant = np.reshape(np.atleast_1d(self.constant), (-1, 1))\n if constant.shape[0] != self.n_outputs_:\n raise ValueError('Constant target value should have shape (%d, 1).' % self.n_outputs_)\n (self.classes_, self.n_classes_, self.class_prior_) = class_distribution(y, sample_weight)\n if self._strategy == 'constant':\n for k in range(self.n_outputs_):\n if not any((constant[k][0] == c for c in self.classes_[k])):\n err_msg = 'The constant target value must be present in the training data. You provided constant={}. Possible values are: {}.'.format(self.constant, list(self.classes_[k]))\n raise ValueError(err_msg)\n if self.n_outputs_ == 1:\n self.n_classes_ = self.n_classes_[0]\n self.classes_ = self.classes_[0]\n self.class_prior_ = self.class_prior_[0]\n return self\n \n def predict(self, X):\n \"\"\"Perform classification on test vectors X.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Test data.\n\n Returns\n -------\n y : array-like of shape (n_samples,) or (n_samples, n_outputs)\n Predicted target values for X.\n \"\"\"\n check_is_fitted(self)\n n_samples = _num_samples(X)\n rs = check_random_state(self.random_state)\n n_classes_ = self.n_classes_\n classes_ = self.classes_\n class_prior_ = self.class_prior_\n constant = self.constant\n if self.n_outputs_ == 1:\n n_classes_ = [n_classes_]\n classes_ = [classes_]\n class_prior_ = [class_prior_]\n constant = [constant]\n if self._strategy == 'stratified':\n proba = self.predict_proba(X)\n if self.n_outputs_ == 1:\n proba = [proba]\n if self.sparse_output_:\n class_prob = None\n if self._strategy in ('most_frequent', 'prior'):\n classes_ = [np.array([cp.argmax()]) for cp in class_prior_]\n elif self._strategy == 'stratified':\n class_prob = class_prior_\n elif self._strategy == 'uniform':\n raise ValueError('Sparse target prediction is not supported with the uniform strategy')\n elif self._strategy == 'constant':\n classes_ = [np.array([c]) for c in constant]\n y = _random_choice_csc(n_samples, classes_, class_prob, self.random_state)\n else:\n if self._strategy in ('most_frequent', 'prior'):\n y = np.tile([classes_[k][class_prior_[k].argmax()] for k in range(self.n_outputs_)], [n_samples, 1])\n elif self._strategy == 'stratified':\n y = np.vstack([classes_[k][proba[k].argmax(axis=1)] for k in range(self.n_outputs_)]).T\n elif self._strategy == 'uniform':\n ret = [classes_[k][rs.randint(n_classes_[k], size=n_samples)] for k in range(self.n_outputs_)]\n y = np.vstack(ret).T\n elif self._strategy == 'constant':\n y = np.tile(self.constant, (n_samples, 1))\n if self.n_outputs_ == 1:\n y = np.ravel(y)\n return y\n \n def predict_proba(self, X):\n \"\"\"\n Return probability estimates for the test vectors X.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Test data.\n\n Returns\n -------\n P : ndarray of shape (n_samples, n_classes) or list of such arrays\n Returns the probability of 
the sample for each class in\n the model, where classes are ordered arithmetically, for each\n output.\n \"\"\"\n check_is_fitted(self)\n n_samples = _num_samples(X)\n rs = check_random_state(self.random_state)\n n_classes_ = self.n_classes_\n classes_ = self.classes_\n class_prior_ = self.class_prior_\n constant = self.constant\n if self.n_outputs_ == 1:\n n_classes_ = [n_classes_]\n classes_ = [classes_]\n class_prior_ = [class_prior_]\n constant = [constant]\n P = []\n for k in range(self.n_outputs_):\n if self._strategy == 'most_frequent':\n ind = class_prior_[k].argmax()\n out = np.zeros((n_samples, n_classes_[k]), dtype=np.float64)\n out[:, ind] = 1.0\n elif self._strategy == 'prior':\n out = np.ones((n_samples, 1)) * class_prior_[k]\n elif self._strategy == 'stratified':\n out = rs.multinomial(1, class_prior_[k], size=n_samples)\n out = out.astype(np.float64)\n elif self._strategy == 'uniform':\n out = np.ones((n_samples, n_classes_[k]), dtype=np.float64)\n out /= n_classes_[k]\n elif self._strategy == 'constant':\n ind = np.where(classes_[k] == constant[k])\n out = np.zeros((n_samples, n_classes_[k]), dtype=np.float64)\n out[:, ind] = 1.0\n P.append(out)\n if self.n_outputs_ == 1:\n P = P[0]\n return P\n \n def predict_log_proba(self, X):\n \"\"\"\n Return log probability estimates for the test vectors X.\n\n Parameters\n ----------\n X : {array-like, object with finite length or shape}\n Training data.\n\n Returns\n -------\n P : ndarray of shape (n_samples, n_classes) or list of such arrays\n Returns the log probability of the sample for each class in\n the model, where classes are ordered arithmetically for each\n output.\n \"\"\"\n proba = self.predict_proba(X)\n if self.n_outputs_ == 1:\n return np.log(proba)\n else:\n return [np.log(p) for p in proba]\n \n def _more_tags(self):\n return {'poor_score': True, 'no_validation': True, '_xfail_checks': {'check_methods_subset_invariance': 'fails for the predict method', 'check_methods_sample_order_invariance': 'fails for the predict method'}}\n \n def score(self, X, y, sample_weight=None):\n \"\"\"Return the mean accuracy on the given test data and labels.\n\n In multi-label classification, this is the subset accuracy\n which is a harsh metric since you require for each sample that\n each label set be correctly predicted.\n\n Parameters\n ----------\n X : None or array-like of shape (n_samples, n_features)\n Test samples. Passing None as test samples gives the same result\n as passing real test samples, since DummyClassifier\n operates independently of the sampled observations.\n\n y : array-like of shape (n_samples,) or (n_samples, n_outputs)\n True labels for X.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n Returns\n -------\n score : float\n Mean accuracy of self.predict(X) wrt. y.\n \"\"\"\n if X is None:\n X = np.zeros(shape=(len(y), 1))\n return super().score(X, y, sample_weight)\n \n @deprecated('`n_features_in_` is deprecated in 1.0 and will be removed in 1.2.')\n @property\n def n_features_in_(self):\n check_is_fitted(self)\n return None\n" + "description": "DummyClassifier makes predictions that ignore the input features.\n\nThis classifier serves as a simple baseline to compare against other more\ncomplex classifiers.\n\nThe specific behavior of the baseline is selected with the `strategy`\nparameter.\n\nAll strategies make predictions that ignore the input feature values passed\nas the `X` argument to `fit` and `predict`. 
The predictions, however,\ntypically depend on values observed in the `y` parameter passed to `fit`.\n\nNote that the \"stratified\" and \"uniform\" strategies lead to\nnon-deterministic predictions that can be rendered deterministic by setting\nthe `random_state` parameter if needed. The other strategies are naturally\ndeterministic and, once fit, always return a the same constant prediction\nfor any value of `X`.\n\nRead more in the :ref:`User Guide `.\n\n.. versionadded:: 0.13", + "docstring": "DummyClassifier makes predictions that ignore the input features.\n\n This classifier serves as a simple baseline to compare against other more\n complex classifiers.\n\n The specific behavior of the baseline is selected with the `strategy`\n parameter.\n\n All strategies make predictions that ignore the input feature values passed\n as the `X` argument to `fit` and `predict`. The predictions, however,\n typically depend on values observed in the `y` parameter passed to `fit`.\n\n Note that the \"stratified\" and \"uniform\" strategies lead to\n non-deterministic predictions that can be rendered deterministic by setting\n the `random_state` parameter if needed. The other strategies are naturally\n deterministic and, once fit, always return a the same constant prediction\n for any value of `X`.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.13\n\n Parameters\n ----------\n strategy : {\"most_frequent\", \"prior\", \"stratified\", \"uniform\", \"constant\"}, default=\"prior\"\n Strategy to use to generate predictions.\n\n * \"most_frequent\": the `predict` method always returns the most\n frequent class label in the observed `y` argument passed to `fit`.\n The `predict_proba` method returns the matching one-hot encoded\n vector.\n * \"prior\": the `predict` method always returns the most frequent\n class label in the observed `y` argument passed to `fit` (like\n \"most_frequent\"). ``predict_proba`` always returns the empirical\n class distribution of `y` also known as the empirical class prior\n distribution.\n * \"stratified\": the `predict_proba` method randomly samples one-hot\n vectors from a multinomial distribution parametrized by the empirical\n class prior probabilities.\n The `predict` method returns the class label which got probability\n one in the one-hot vector of `predict_proba`.\n Each sampled row of both methods is therefore independent and\n identically distributed.\n * \"uniform\": generates predictions uniformly at random from the list\n of unique classes observed in `y`, i.e. each class has equal\n probability.\n * \"constant\": always predicts a constant label that is provided by\n the user. This is useful for metrics that evaluate a non-majority\n class.\n\n .. versionchanged:: 0.24\n The default value of `strategy` has changed to \"prior\" in version\n 0.24.\n\n random_state : int, RandomState instance or None, default=None\n Controls the randomness to generate the predictions when\n ``strategy='stratified'`` or ``strategy='uniform'``.\n Pass an int for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n constant : int or str or array-like of shape (n_outputs,), default=None\n The explicit constant as predicted by the \"constant\" strategy. This\n parameter is useful only for the \"constant\" strategy.\n\n Attributes\n ----------\n classes_ : ndarray of shape (n_classes,) or list of such arrays\n Unique class labels observed in `y`. 
For multi-output classification\n problems, this attribute is a list of arrays as each output has an\n independent set of possible classes.\n\n n_classes_ : int or list of int\n Number of label for each output.\n\n class_prior_ : ndarray of shape (n_classes,) or list of such arrays\n Frequency of each class observed in `y`. For multioutput classification\n problems, this is computed independently for each output.\n\n n_outputs_ : int\n Number of outputs.\n\n n_features_in_ : `None`\n Always set to `None`.\n\n .. versionadded:: 0.24\n .. deprecated:: 1.0\n Will be removed in 1.0\n\n sparse_output_ : bool\n True if the array returned from predict is to be in sparse CSC format.\n Is automatically set to True if the input `y` is passed in sparse\n format.\n\n See Also\n --------\n DummyRegressor : Regressor that makes predictions using simple rules.\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.dummy import DummyClassifier\n >>> X = np.array([-1, 1, 1, 1])\n >>> y = np.array([0, 1, 1, 1])\n >>> dummy_clf = DummyClassifier(strategy=\"most_frequent\")\n >>> dummy_clf.fit(X, y)\n DummyClassifier(strategy='most_frequent')\n >>> dummy_clf.predict(X)\n array([1, 1, 1, 1])\n >>> dummy_clf.score(X, y)\n 0.75\n ", + "source_code": "\n\nclass DummyClassifier(MultiOutputMixin, ClassifierMixin, BaseEstimator):\n \"\"\"DummyClassifier makes predictions that ignore the input features.\n\n This classifier serves as a simple baseline to compare against other more\n complex classifiers.\n\n The specific behavior of the baseline is selected with the `strategy`\n parameter.\n\n All strategies make predictions that ignore the input feature values passed\n as the `X` argument to `fit` and `predict`. The predictions, however,\n typically depend on values observed in the `y` parameter passed to `fit`.\n\n Note that the \"stratified\" and \"uniform\" strategies lead to\n non-deterministic predictions that can be rendered deterministic by setting\n the `random_state` parameter if needed. The other strategies are naturally\n deterministic and, once fit, always return a the same constant prediction\n for any value of `X`.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.13\n\n Parameters\n ----------\n strategy : {\"most_frequent\", \"prior\", \"stratified\", \"uniform\", \"constant\"}, default=\"prior\"\n Strategy to use to generate predictions.\n\n * \"most_frequent\": the `predict` method always returns the most\n frequent class label in the observed `y` argument passed to `fit`.\n The `predict_proba` method returns the matching one-hot encoded\n vector.\n * \"prior\": the `predict` method always returns the most frequent\n class label in the observed `y` argument passed to `fit` (like\n \"most_frequent\"). ``predict_proba`` always returns the empirical\n class distribution of `y` also known as the empirical class prior\n distribution.\n * \"stratified\": the `predict_proba` method randomly samples one-hot\n vectors from a multinomial distribution parametrized by the empirical\n class prior probabilities.\n The `predict` method returns the class label which got probability\n one in the one-hot vector of `predict_proba`.\n Each sampled row of both methods is therefore independent and\n identically distributed.\n * \"uniform\": generates predictions uniformly at random from the list\n of unique classes observed in `y`, i.e. each class has equal\n probability.\n * \"constant\": always predicts a constant label that is provided by\n the user. 
This is useful for metrics that evaluate a non-majority\n class.\n\n .. versionchanged:: 0.24\n The default value of `strategy` has changed to \"prior\" in version\n 0.24.\n\n random_state : int, RandomState instance or None, default=None\n Controls the randomness to generate the predictions when\n ``strategy='stratified'`` or ``strategy='uniform'``.\n Pass an int for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n constant : int or str or array-like of shape (n_outputs,), default=None\n The explicit constant as predicted by the \"constant\" strategy. This\n parameter is useful only for the \"constant\" strategy.\n\n Attributes\n ----------\n classes_ : ndarray of shape (n_classes,) or list of such arrays\n Unique class labels observed in `y`. For multi-output classification\n problems, this attribute is a list of arrays as each output has an\n independent set of possible classes.\n\n n_classes_ : int or list of int\n Number of label for each output.\n\n class_prior_ : ndarray of shape (n_classes,) or list of such arrays\n Frequency of each class observed in `y`. For multioutput classification\n problems, this is computed independently for each output.\n\n n_outputs_ : int\n Number of outputs.\n\n n_features_in_ : `None`\n Always set to `None`.\n\n .. versionadded:: 0.24\n .. deprecated:: 1.0\n Will be removed in 1.0\n\n sparse_output_ : bool\n True if the array returned from predict is to be in sparse CSC format.\n Is automatically set to True if the input `y` is passed in sparse\n format.\n\n See Also\n --------\n DummyRegressor : Regressor that makes predictions using simple rules.\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.dummy import DummyClassifier\n >>> X = np.array([-1, 1, 1, 1])\n >>> y = np.array([0, 1, 1, 1])\n >>> dummy_clf = DummyClassifier(strategy=\"most_frequent\")\n >>> dummy_clf.fit(X, y)\n DummyClassifier(strategy='most_frequent')\n >>> dummy_clf.predict(X)\n array([1, 1, 1, 1])\n >>> dummy_clf.score(X, y)\n 0.75\n \"\"\"\n \n def __init__(self, *, strategy='prior', random_state=None, constant=None):\n self.strategy = strategy\n self.random_state = random_state\n self.constant = constant\n \n def fit(self, X, y, sample_weight=None):\n \"\"\"Fit the baseline classifier.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training data.\n\n y : array-like of shape (n_samples,) or (n_samples, n_outputs)\n Target values.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n Returns\n -------\n self : object\n Returns the instance itself.\n \"\"\"\n allowed_strategies = ('most_frequent', 'stratified', 'uniform', 'constant', 'prior')\n if self.strategy not in allowed_strategies:\n raise ValueError('Unknown strategy type: %s, expected one of %s.' % (self.strategy, allowed_strategies))\n self._strategy = self.strategy\n if self._strategy == 'uniform' and sp.issparse(y):\n y = y.toarray()\n warnings.warn('A local copy of the target data has been converted to a numpy array. 
Predicting on sparse target data with the uniform strategy would not save memory and would be slower.', UserWarning)\n self.sparse_output_ = sp.issparse(y)\n if not self.sparse_output_:\n y = np.asarray(y)\n y = np.atleast_1d(y)\n if y.ndim == 1:\n y = np.reshape(y, (-1, 1))\n self.n_outputs_ = y.shape[1]\n check_consistent_length(X, y)\n if sample_weight is not None:\n sample_weight = _check_sample_weight(sample_weight, X)\n if self._strategy == 'constant':\n if self.constant is None:\n raise ValueError('Constant target value has to be specified when the constant strategy is used.')\n else:\n constant = np.reshape(np.atleast_1d(self.constant), (-1, 1))\n if constant.shape[0] != self.n_outputs_:\n raise ValueError('Constant target value should have shape (%d, 1).' % self.n_outputs_)\n (self.classes_, self.n_classes_, self.class_prior_) = class_distribution(y, sample_weight)\n if self._strategy == 'constant':\n for k in range(self.n_outputs_):\n if not any((constant[k][0] == c for c in self.classes_[k])):\n err_msg = 'The constant target value must be present in the training data. You provided constant={}. Possible values are: {}.'.format(self.constant, list(self.classes_[k]))\n raise ValueError(err_msg)\n if self.n_outputs_ == 1:\n self.n_classes_ = self.n_classes_[0]\n self.classes_ = self.classes_[0]\n self.class_prior_ = self.class_prior_[0]\n return self\n \n def predict(self, X):\n \"\"\"Perform classification on test vectors X.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Test data.\n\n Returns\n -------\n y : array-like of shape (n_samples,) or (n_samples, n_outputs)\n Predicted target values for X.\n \"\"\"\n check_is_fitted(self)\n n_samples = _num_samples(X)\n rs = check_random_state(self.random_state)\n n_classes_ = self.n_classes_\n classes_ = self.classes_\n class_prior_ = self.class_prior_\n constant = self.constant\n if self.n_outputs_ == 1:\n n_classes_ = [n_classes_]\n classes_ = [classes_]\n class_prior_ = [class_prior_]\n constant = [constant]\n if self._strategy == 'stratified':\n proba = self.predict_proba(X)\n if self.n_outputs_ == 1:\n proba = [proba]\n if self.sparse_output_:\n class_prob = None\n if self._strategy in ('most_frequent', 'prior'):\n classes_ = [np.array([cp.argmax()]) for cp in class_prior_]\n elif self._strategy == 'stratified':\n class_prob = class_prior_\n elif self._strategy == 'uniform':\n raise ValueError('Sparse target prediction is not supported with the uniform strategy')\n elif self._strategy == 'constant':\n classes_ = [np.array([c]) for c in constant]\n y = _random_choice_csc(n_samples, classes_, class_prob, self.random_state)\n else:\n if self._strategy in ('most_frequent', 'prior'):\n y = np.tile([classes_[k][class_prior_[k].argmax()] for k in range(self.n_outputs_)], [n_samples, 1])\n elif self._strategy == 'stratified':\n y = np.vstack([classes_[k][proba[k].argmax(axis=1)] for k in range(self.n_outputs_)]).T\n elif self._strategy == 'uniform':\n ret = [classes_[k][rs.randint(n_classes_[k], size=n_samples)] for k in range(self.n_outputs_)]\n y = np.vstack(ret).T\n elif self._strategy == 'constant':\n y = np.tile(self.constant, (n_samples, 1))\n if self.n_outputs_ == 1:\n y = np.ravel(y)\n return y\n \n def predict_proba(self, X):\n \"\"\"\n Return probability estimates for the test vectors X.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Test data.\n\n Returns\n -------\n P : ndarray of shape (n_samples, n_classes) or list of such arrays\n Returns the probability of 
the sample for each class in\n the model, where classes are ordered arithmetically, for each\n output.\n \"\"\"\n check_is_fitted(self)\n n_samples = _num_samples(X)\n rs = check_random_state(self.random_state)\n n_classes_ = self.n_classes_\n classes_ = self.classes_\n class_prior_ = self.class_prior_\n constant = self.constant\n if self.n_outputs_ == 1:\n n_classes_ = [n_classes_]\n classes_ = [classes_]\n class_prior_ = [class_prior_]\n constant = [constant]\n P = []\n for k in range(self.n_outputs_):\n if self._strategy == 'most_frequent':\n ind = class_prior_[k].argmax()\n out = np.zeros((n_samples, n_classes_[k]), dtype=np.float64)\n out[:, ind] = 1.0\n elif self._strategy == 'prior':\n out = np.ones((n_samples, 1)) * class_prior_[k]\n elif self._strategy == 'stratified':\n out = rs.multinomial(1, class_prior_[k], size=n_samples)\n out = out.astype(np.float64)\n elif self._strategy == 'uniform':\n out = np.ones((n_samples, n_classes_[k]), dtype=np.float64)\n out /= n_classes_[k]\n elif self._strategy == 'constant':\n ind = np.where(classes_[k] == constant[k])\n out = np.zeros((n_samples, n_classes_[k]), dtype=np.float64)\n out[:, ind] = 1.0\n P.append(out)\n if self.n_outputs_ == 1:\n P = P[0]\n return P\n \n def predict_log_proba(self, X):\n \"\"\"\n Return log probability estimates for the test vectors X.\n\n Parameters\n ----------\n X : {array-like, object with finite length or shape}\n Training data.\n\n Returns\n -------\n P : ndarray of shape (n_samples, n_classes) or list of such arrays\n Returns the log probability of the sample for each class in\n the model, where classes are ordered arithmetically for each\n output.\n \"\"\"\n proba = self.predict_proba(X)\n if self.n_outputs_ == 1:\n return np.log(proba)\n else:\n return [np.log(p) for p in proba]\n \n def _more_tags(self):\n return {'poor_score': True, 'no_validation': True, '_xfail_checks': {'check_methods_subset_invariance': 'fails for the predict method', 'check_methods_sample_order_invariance': 'fails for the predict method'}}\n \n def score(self, X, y, sample_weight=None):\n \"\"\"Return the mean accuracy on the given test data and labels.\n\n In multi-label classification, this is the subset accuracy\n which is a harsh metric since you require for each sample that\n each label set be correctly predicted.\n\n Parameters\n ----------\n X : None or array-like of shape (n_samples, n_features)\n Test samples. Passing None as test samples gives the same result\n as passing real test samples, since DummyClassifier\n operates independently of the sampled observations.\n\n y : array-like of shape (n_samples,) or (n_samples, n_outputs)\n True labels for X.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n Returns\n -------\n score : float\n Mean accuracy of self.predict(X) wrt. y.\n \"\"\"\n if X is None:\n X = np.zeros(shape=(len(y), 1))\n return super().score(X, y, sample_weight)\n \n @deprecated('`n_features_in_` is deprecated in 1.0 and will be removed in 1.2.')\n @property\n def n_features_in_(self):\n check_is_fitted(self)\n return None\n" }, { "name": "DummyRegressor", @@ -20781,7 +20847,7 @@ "sklearn.dummy.DummyRegressor.n_features_in_@getter" ], "is_public": true, - "description": "Regressor that makes predictions using simple rules.\n\nThis regressor is useful as a simple baseline to compare with other (real) regressors. Do not use it for real problems. Read more in the :ref:`User Guide `. .. 
versionadded:: 0.13", + "description": "Regressor that makes predictions using simple rules.\n\nThis regressor is useful as a simple baseline to compare with other\n(real) regressors. Do not use it for real problems.\n\nRead more in the :ref:`User Guide `.\n\n.. versionadded:: 0.13", "docstring": "Regressor that makes predictions using simple rules.\n\n This regressor is useful as a simple baseline to compare with other\n (real) regressors. Do not use it for real problems.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.13\n\n Parameters\n ----------\n strategy : {\"mean\", \"median\", \"quantile\", \"constant\"}, default=\"mean\"\n Strategy to use to generate predictions.\n\n * \"mean\": always predicts the mean of the training set\n * \"median\": always predicts the median of the training set\n * \"quantile\": always predicts a specified quantile of the training set,\n provided with the quantile parameter.\n * \"constant\": always predicts a constant value that is provided by\n the user.\n\n constant : int or float or array-like of shape (n_outputs,), default=None\n The explicit constant as predicted by the \"constant\" strategy. This\n parameter is useful only for the \"constant\" strategy.\n\n quantile : float in [0.0, 1.0], default=None\n The quantile to predict using the \"quantile\" strategy. A quantile of\n 0.5 corresponds to the median, while 0.0 to the minimum and 1.0 to the\n maximum.\n\n Attributes\n ----------\n constant_ : ndarray of shape (1, n_outputs)\n Mean or median or quantile of the training targets or constant value\n given by the user.\n\n n_features_in_ : `None`\n Always set to `None`.\n\n .. versionadded:: 0.24\n .. deprecated:: 1.0\n Will be removed in 1.0\n\n n_outputs_ : int\n Number of outputs.\n\n See Also\n --------\n DummyClassifier: Classifier that makes predictions using simple rules.\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.dummy import DummyRegressor\n >>> X = np.array([1.0, 2.0, 3.0, 4.0])\n >>> y = np.array([2.0, 3.0, 5.0, 10.0])\n >>> dummy_regr = DummyRegressor(strategy=\"mean\")\n >>> dummy_regr.fit(X, y)\n DummyRegressor()\n >>> dummy_regr.predict(X)\n array([5., 5., 5., 5.])\n >>> dummy_regr.score(X, y)\n 0.0\n ", "source_code": "\n\nclass DummyRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):\n \"\"\"Regressor that makes predictions using simple rules.\n\n This regressor is useful as a simple baseline to compare with other\n (real) regressors. Do not use it for real problems.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.13\n\n Parameters\n ----------\n strategy : {\"mean\", \"median\", \"quantile\", \"constant\"}, default=\"mean\"\n Strategy to use to generate predictions.\n\n * \"mean\": always predicts the mean of the training set\n * \"median\": always predicts the median of the training set\n * \"quantile\": always predicts a specified quantile of the training set,\n provided with the quantile parameter.\n * \"constant\": always predicts a constant value that is provided by\n the user.\n\n constant : int or float or array-like of shape (n_outputs,), default=None\n The explicit constant as predicted by the \"constant\" strategy. This\n parameter is useful only for the \"constant\" strategy.\n\n quantile : float in [0.0, 1.0], default=None\n The quantile to predict using the \"quantile\" strategy. 
A quantile of\n 0.5 corresponds to the median, while 0.0 to the minimum and 1.0 to the\n maximum.\n\n Attributes\n ----------\n constant_ : ndarray of shape (1, n_outputs)\n Mean or median or quantile of the training targets or constant value\n given by the user.\n\n n_features_in_ : `None`\n Always set to `None`.\n\n .. versionadded:: 0.24\n .. deprecated:: 1.0\n Will be removed in 1.0\n\n n_outputs_ : int\n Number of outputs.\n\n See Also\n --------\n DummyClassifier: Classifier that makes predictions using simple rules.\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.dummy import DummyRegressor\n >>> X = np.array([1.0, 2.0, 3.0, 4.0])\n >>> y = np.array([2.0, 3.0, 5.0, 10.0])\n >>> dummy_regr = DummyRegressor(strategy=\"mean\")\n >>> dummy_regr.fit(X, y)\n DummyRegressor()\n >>> dummy_regr.predict(X)\n array([5., 5., 5., 5.])\n >>> dummy_regr.score(X, y)\n 0.0\n \"\"\"\n \n def __init__(self, *, strategy='mean', constant=None, quantile=None):\n self.strategy = strategy\n self.constant = constant\n self.quantile = quantile\n \n def fit(self, X, y, sample_weight=None):\n \"\"\"Fit the random regressor.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training data.\n\n y : array-like of shape (n_samples,) or (n_samples, n_outputs)\n Target values.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n Returns\n -------\n self : object\n Fitted estimator.\n \"\"\"\n allowed_strategies = ('mean', 'median', 'quantile', 'constant')\n if self.strategy not in allowed_strategies:\n raise ValueError('Unknown strategy type: %s, expected one of %s.' % (self.strategy, allowed_strategies))\n y = check_array(y, ensure_2d=False)\n if len(y) == 0:\n raise ValueError('y must not be empty.')\n if y.ndim == 1:\n y = np.reshape(y, (-1, 1))\n self.n_outputs_ = y.shape[1]\n check_consistent_length(X, y, sample_weight)\n if sample_weight is not None:\n sample_weight = _check_sample_weight(sample_weight, X)\n if self.strategy == 'mean':\n self.constant_ = np.average(y, axis=0, weights=sample_weight)\n elif self.strategy == 'median':\n if sample_weight is None:\n self.constant_ = np.median(y, axis=0)\n else:\n self.constant_ = [_weighted_percentile(y[:, k], sample_weight, percentile=50.0) for k in range(self.n_outputs_)]\n elif self.strategy == 'quantile':\n if self.quantile is None or not np.isscalar(self.quantile):\n raise ValueError('Quantile must be a scalar in the range [0.0, 1.0], but got %s.' % self.quantile)\n percentile = self.quantile * 100.0\n if sample_weight is None:\n self.constant_ = np.percentile(y, axis=0, q=percentile)\n else:\n self.constant_ = [_weighted_percentile(y[:, k], sample_weight, percentile=percentile) for k in range(self.n_outputs_)]\n elif self.strategy == 'constant':\n if self.constant is None:\n raise TypeError('Constant target value has to be specified when the constant strategy is used.')\n self.constant = check_array(self.constant, accept_sparse=['csr', 'csc', 'coo'], ensure_2d=False, ensure_min_samples=0)\n if self.n_outputs_ != 1 and self.constant.shape[0] != y.shape[1]:\n raise ValueError('Constant target value should have shape (%d, 1).' 
% y.shape[1])\n self.constant_ = self.constant\n self.constant_ = np.reshape(self.constant_, (1, -1))\n return self\n \n def predict(self, X, return_std=False):\n \"\"\"Perform classification on test vectors X.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Test data.\n\n return_std : bool, default=False\n Whether to return the standard deviation of posterior prediction.\n All zeros in this case.\n\n .. versionadded:: 0.20\n\n Returns\n -------\n y : array-like of shape (n_samples,) or (n_samples, n_outputs)\n Predicted target values for X.\n\n y_std : array-like of shape (n_samples,) or (n_samples, n_outputs)\n Standard deviation of predictive distribution of query points.\n \"\"\"\n check_is_fitted(self)\n n_samples = _num_samples(X)\n y = np.full((n_samples, self.n_outputs_), self.constant_, dtype=np.array(self.constant_).dtype)\n y_std = np.zeros((n_samples, self.n_outputs_))\n if self.n_outputs_ == 1:\n y = np.ravel(y)\n y_std = np.ravel(y_std)\n return (y, y_std) if return_std else y\n \n def _more_tags(self):\n return {'poor_score': True, 'no_validation': True}\n \n def score(self, X, y, sample_weight=None):\n \"\"\"Return the coefficient of determination R^2 of the prediction.\n\n The coefficient R^2 is defined as `(1 - u/v)`, where `u` is the\n residual sum of squares `((y_true - y_pred) ** 2).sum()` and `v` is the\n total sum of squares `((y_true - y_true.mean()) ** 2).sum()`. The best\n possible score is 1.0 and it can be negative (because the model can be\n arbitrarily worse). A constant model that always predicts the expected\n value of y, disregarding the input features, would get a R^2 score of\n 0.0.\n\n Parameters\n ----------\n X : None or array-like of shape (n_samples, n_features)\n Test samples. Passing None as test samples gives the same result\n as passing real test samples, since `DummyRegressor`\n operates independently of the sampled observations.\n\n y : array-like of shape (n_samples,) or (n_samples, n_outputs)\n True values for X.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n Returns\n -------\n score : float\n R^2 of `self.predict(X)` wrt. y.\n \"\"\"\n if X is None:\n X = np.zeros(shape=(len(y), 1))\n return super().score(X, y, sample_weight)\n \n @deprecated('`n_features_in_` is deprecated in 1.0 and will be removed in 1.2.')\n @property\n def n_features_in_(self):\n check_is_fitted(self)\n return None\n" }, @@ -20801,7 +20867,7 @@ "sklearn.ensemble._bagging.BaggingClassifier.decision_function" ], "is_public": true, - "description": "A Bagging classifier.\n\nA Bagging classifier is an ensemble meta-estimator that fits base classifiers each on random subsets of the original dataset and then aggregate their individual predictions (either by voting or by averaging) to form a final prediction. Such a meta-estimator can typically be used as a way to reduce the variance of a black-box estimator (e.g., a decision tree), by introducing randomization into its construction procedure and then making an ensemble out of it. This algorithm encompasses several works from the literature. When random subsets of the dataset are drawn as random subsets of the samples, then this algorithm is known as Pasting [1]_. If samples are drawn with replacement, then the method is known as Bagging [2]_. When random subsets of the dataset are drawn as random subsets of the features, then the method is known as Random Subspaces [3]_. 
Finally, when base estimators are built on subsets of both samples and features, then the method is known as Random Patches [4]_. Read more in the :ref:`User Guide `. .. versionadded:: 0.15", + "description": "A Bagging classifier.\n\nA Bagging classifier is an ensemble meta-estimator that fits base\nclassifiers each on random subsets of the original dataset and then\naggregate their individual predictions (either by voting or by averaging)\nto form a final prediction. Such a meta-estimator can typically be used as\na way to reduce the variance of a black-box estimator (e.g., a decision\ntree), by introducing randomization into its construction procedure and\nthen making an ensemble out of it.\n\nThis algorithm encompasses several works from the literature. When random\nsubsets of the dataset are drawn as random subsets of the samples, then\nthis algorithm is known as Pasting [1]_. If samples are drawn with\nreplacement, then the method is known as Bagging [2]_. When random subsets\nof the dataset are drawn as random subsets of the features, then the method\nis known as Random Subspaces [3]_. Finally, when base estimators are built\non subsets of both samples and features, then the method is known as\nRandom Patches [4]_.\n\nRead more in the :ref:`User Guide `.\n\n.. versionadded:: 0.15", "docstring": "A Bagging classifier.\n\n A Bagging classifier is an ensemble meta-estimator that fits base\n classifiers each on random subsets of the original dataset and then\n aggregate their individual predictions (either by voting or by averaging)\n to form a final prediction. Such a meta-estimator can typically be used as\n a way to reduce the variance of a black-box estimator (e.g., a decision\n tree), by introducing randomization into its construction procedure and\n then making an ensemble out of it.\n\n This algorithm encompasses several works from the literature. When random\n subsets of the dataset are drawn as random subsets of the samples, then\n this algorithm is known as Pasting [1]_. If samples are drawn with\n replacement, then the method is known as Bagging [2]_. When random subsets\n of the dataset are drawn as random subsets of the features, then the method\n is known as Random Subspaces [3]_. Finally, when base estimators are built\n on subsets of both samples and features, then the method is known as\n Random Patches [4]_.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.15\n\n Parameters\n ----------\n base_estimator : object, default=None\n The base estimator to fit on random subsets of the dataset.\n If None, then the base estimator is a\n :class:`~sklearn.tree.DecisionTreeClassifier`.\n\n n_estimators : int, default=10\n The number of base estimators in the ensemble.\n\n max_samples : int or float, default=1.0\n The number of samples to draw from X to train each base estimator (with\n replacement by default, see `bootstrap` for more details).\n\n - If int, then draw `max_samples` samples.\n - If float, then draw `max_samples * X.shape[0]` samples.\n\n max_features : int or float, default=1.0\n The number of features to draw from X to train each base estimator (\n without replacement by default, see `bootstrap_features` for more\n details).\n\n - If int, then draw `max_features` features.\n - If float, then draw `max_features * X.shape[1]` features.\n\n bootstrap : bool, default=True\n Whether samples are drawn with replacement. 
If False, sampling\n without replacement is performed.\n\n bootstrap_features : bool, default=False\n Whether features are drawn with replacement.\n\n oob_score : bool, default=False\n Whether to use out-of-bag samples to estimate\n the generalization error. Only available if bootstrap=True.\n\n warm_start : bool, default=False\n When set to True, reuse the solution of the previous call to fit\n and add more estimators to the ensemble, otherwise, just fit\n a whole new ensemble. See :term:`the Glossary `.\n\n .. versionadded:: 0.17\n *warm_start* constructor parameter.\n\n n_jobs : int, default=None\n The number of jobs to run in parallel for both :meth:`fit` and\n :meth:`predict`. ``None`` means 1 unless in a\n :obj:`joblib.parallel_backend` context. ``-1`` means using all\n processors. See :term:`Glossary ` for more details.\n\n random_state : int, RandomState instance or None, default=None\n Controls the random resampling of the original dataset\n (sample wise and feature wise).\n If the base estimator accepts a `random_state` attribute, a different\n seed is generated for each instance in the ensemble.\n Pass an int for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n verbose : int, default=0\n Controls the verbosity when fitting and predicting.\n\n Attributes\n ----------\n base_estimator_ : estimator\n The base estimator from which the ensemble is grown.\n\n n_features_ : int\n The number of features when :meth:`fit` is performed.\n\n .. deprecated:: 1.0\n Attribute `n_features_` was deprecated in version 1.0 and will be\n removed in 1.2. Use `n_features_in_` instead.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n estimators_ : list of estimators\n The collection of fitted base estimators.\n\n estimators_samples_ : list of arrays\n The subset of drawn samples (i.e., the in-bag samples) for each base\n estimator. Each subset is defined by an array of the indices selected.\n\n estimators_features_ : list of arrays\n The subset of drawn features for each base estimator.\n\n classes_ : ndarray of shape (n_classes,)\n The classes labels.\n\n n_classes_ : int or list\n The number of classes.\n\n oob_score_ : float\n Score of the training dataset obtained using an out-of-bag estimate.\n This attribute exists only when ``oob_score`` is True.\n\n oob_decision_function_ : ndarray of shape (n_samples, n_classes)\n Decision function computed with out-of-bag estimate on the training\n set. If n_estimators is small it might be possible that a data point\n was never left out during the bootstrap. In this case,\n `oob_decision_function_` might contain NaN. This attribute exists\n only when ``oob_score`` is True.\n\n See Also\n --------\n BaggingRegressor : A Bagging regressor.\n\n References\n ----------\n\n .. [1] L. Breiman, \"Pasting small votes for classification in large\n databases and on-line\", Machine Learning, 36(1), 85-103, 1999.\n\n .. [2] L. Breiman, \"Bagging predictors\", Machine Learning, 24(2), 123-140,\n 1996.\n\n .. [3] T. Ho, \"The random subspace method for constructing decision\n forests\", Pattern Analysis and Machine Intelligence, 20(8), 832-844,\n 1998.\n\n .. [4] G. Louppe and P. 
Geurts, \"Ensembles on Random Patches\", Machine\n Learning and Knowledge Discovery in Databases, 346-361, 2012.\n\n Examples\n --------\n >>> from sklearn.svm import SVC\n >>> from sklearn.ensemble import BaggingClassifier\n >>> from sklearn.datasets import make_classification\n >>> X, y = make_classification(n_samples=100, n_features=4,\n ... n_informative=2, n_redundant=0,\n ... random_state=0, shuffle=False)\n >>> clf = BaggingClassifier(base_estimator=SVC(),\n ... n_estimators=10, random_state=0).fit(X, y)\n >>> clf.predict([[0, 0, 0, 0]])\n array([1])\n ", "source_code": "\n\nclass BaggingClassifier(ClassifierMixin, BaseBagging):\n \"\"\"A Bagging classifier.\n\n A Bagging classifier is an ensemble meta-estimator that fits base\n classifiers each on random subsets of the original dataset and then\n aggregate their individual predictions (either by voting or by averaging)\n to form a final prediction. Such a meta-estimator can typically be used as\n a way to reduce the variance of a black-box estimator (e.g., a decision\n tree), by introducing randomization into its construction procedure and\n then making an ensemble out of it.\n\n This algorithm encompasses several works from the literature. When random\n subsets of the dataset are drawn as random subsets of the samples, then\n this algorithm is known as Pasting [1]_. If samples are drawn with\n replacement, then the method is known as Bagging [2]_. When random subsets\n of the dataset are drawn as random subsets of the features, then the method\n is known as Random Subspaces [3]_. Finally, when base estimators are built\n on subsets of both samples and features, then the method is known as\n Random Patches [4]_.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.15\n\n Parameters\n ----------\n base_estimator : object, default=None\n The base estimator to fit on random subsets of the dataset.\n If None, then the base estimator is a\n :class:`~sklearn.tree.DecisionTreeClassifier`.\n\n n_estimators : int, default=10\n The number of base estimators in the ensemble.\n\n max_samples : int or float, default=1.0\n The number of samples to draw from X to train each base estimator (with\n replacement by default, see `bootstrap` for more details).\n\n - If int, then draw `max_samples` samples.\n - If float, then draw `max_samples * X.shape[0]` samples.\n\n max_features : int or float, default=1.0\n The number of features to draw from X to train each base estimator (\n without replacement by default, see `bootstrap_features` for more\n details).\n\n - If int, then draw `max_features` features.\n - If float, then draw `max_features * X.shape[1]` features.\n\n bootstrap : bool, default=True\n Whether samples are drawn with replacement. If False, sampling\n without replacement is performed.\n\n bootstrap_features : bool, default=False\n Whether features are drawn with replacement.\n\n oob_score : bool, default=False\n Whether to use out-of-bag samples to estimate\n the generalization error. Only available if bootstrap=True.\n\n warm_start : bool, default=False\n When set to True, reuse the solution of the previous call to fit\n and add more estimators to the ensemble, otherwise, just fit\n a whole new ensemble. See :term:`the Glossary `.\n\n .. versionadded:: 0.17\n *warm_start* constructor parameter.\n\n n_jobs : int, default=None\n The number of jobs to run in parallel for both :meth:`fit` and\n :meth:`predict`. ``None`` means 1 unless in a\n :obj:`joblib.parallel_backend` context. ``-1`` means using all\n processors. 
See :term:`Glossary ` for more details.\n\n random_state : int, RandomState instance or None, default=None\n Controls the random resampling of the original dataset\n (sample wise and feature wise).\n If the base estimator accepts a `random_state` attribute, a different\n seed is generated for each instance in the ensemble.\n Pass an int for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n verbose : int, default=0\n Controls the verbosity when fitting and predicting.\n\n Attributes\n ----------\n base_estimator_ : estimator\n The base estimator from which the ensemble is grown.\n\n n_features_ : int\n The number of features when :meth:`fit` is performed.\n\n .. deprecated:: 1.0\n Attribute `n_features_` was deprecated in version 1.0 and will be\n removed in 1.2. Use `n_features_in_` instead.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n estimators_ : list of estimators\n The collection of fitted base estimators.\n\n estimators_samples_ : list of arrays\n The subset of drawn samples (i.e., the in-bag samples) for each base\n estimator. Each subset is defined by an array of the indices selected.\n\n estimators_features_ : list of arrays\n The subset of drawn features for each base estimator.\n\n classes_ : ndarray of shape (n_classes,)\n The classes labels.\n\n n_classes_ : int or list\n The number of classes.\n\n oob_score_ : float\n Score of the training dataset obtained using an out-of-bag estimate.\n This attribute exists only when ``oob_score`` is True.\n\n oob_decision_function_ : ndarray of shape (n_samples, n_classes)\n Decision function computed with out-of-bag estimate on the training\n set. If n_estimators is small it might be possible that a data point\n was never left out during the bootstrap. In this case,\n `oob_decision_function_` might contain NaN. This attribute exists\n only when ``oob_score`` is True.\n\n See Also\n --------\n BaggingRegressor : A Bagging regressor.\n\n References\n ----------\n\n .. [1] L. Breiman, \"Pasting small votes for classification in large\n databases and on-line\", Machine Learning, 36(1), 85-103, 1999.\n\n .. [2] L. Breiman, \"Bagging predictors\", Machine Learning, 24(2), 123-140,\n 1996.\n\n .. [3] T. Ho, \"The random subspace method for constructing decision\n forests\", Pattern Analysis and Machine Intelligence, 20(8), 832-844,\n 1998.\n\n .. [4] G. Louppe and P. Geurts, \"Ensembles on Random Patches\", Machine\n Learning and Knowledge Discovery in Databases, 346-361, 2012.\n\n Examples\n --------\n >>> from sklearn.svm import SVC\n >>> from sklearn.ensemble import BaggingClassifier\n >>> from sklearn.datasets import make_classification\n >>> X, y = make_classification(n_samples=100, n_features=4,\n ... n_informative=2, n_redundant=0,\n ... random_state=0, shuffle=False)\n >>> clf = BaggingClassifier(base_estimator=SVC(),\n ... 
n_estimators=10, random_state=0).fit(X, y)\n >>> clf.predict([[0, 0, 0, 0]])\n array([1])\n \"\"\"\n \n def __init__(self, base_estimator=None, n_estimators=10, *, max_samples=1.0, max_features=1.0, bootstrap=True, bootstrap_features=False, oob_score=False, warm_start=False, n_jobs=None, random_state=None, verbose=0):\n super().__init__(base_estimator, n_estimators=n_estimators, max_samples=max_samples, max_features=max_features, bootstrap=bootstrap, bootstrap_features=bootstrap_features, oob_score=oob_score, warm_start=warm_start, n_jobs=n_jobs, random_state=random_state, verbose=verbose)\n \n def _validate_estimator(self):\n \"\"\"Check the estimator and set the base_estimator_ attribute.\"\"\"\n super()._validate_estimator(default=DecisionTreeClassifier())\n \n def _set_oob_score(self, X, y):\n n_samples = y.shape[0]\n n_classes_ = self.n_classes_\n predictions = np.zeros((n_samples, n_classes_))\n for (estimator, samples, features) in zip(self.estimators_, self.estimators_samples_, self.estimators_features_):\n mask = ~indices_to_mask(samples, n_samples)\n if hasattr(estimator, 'predict_proba'):\n predictions[mask, :] += estimator.predict_proba(X[mask, :][:, features])\n else:\n p = estimator.predict(X[mask, :][:, features])\n j = 0\n for i in range(n_samples):\n if mask[i]:\n predictions[i, p[j]] += 1\n j += 1\n if (predictions.sum(axis=1) == 0).any():\n warn('Some inputs do not have OOB scores. This probably means too few estimators were used to compute any reliable oob estimates.')\n oob_decision_function = predictions / predictions.sum(axis=1)[:, np.newaxis]\n oob_score = accuracy_score(y, np.argmax(predictions, axis=1))\n self.oob_decision_function_ = oob_decision_function\n self.oob_score_ = oob_score\n \n def _validate_y(self, y):\n y = column_or_1d(y, warn=True)\n check_classification_targets(y)\n (self.classes_, y) = np.unique(y, return_inverse=True)\n self.n_classes_ = len(self.classes_)\n return y\n \n def predict(self, X):\n \"\"\"Predict class for X.\n\n The predicted class of an input sample is computed as the class with\n the highest mean predicted probability. If base estimators do not\n implement a ``predict_proba`` method, then it resorts to voting.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The training input samples. Sparse matrices are accepted only if\n they are supported by the base estimator.\n\n Returns\n -------\n y : ndarray of shape (n_samples,)\n The predicted classes.\n \"\"\"\n predicted_probabilitiy = self.predict_proba(X)\n return self.classes_.take(np.argmax(predicted_probabilitiy, axis=1), axis=0)\n \n def predict_proba(self, X):\n \"\"\"Predict class probabilities for X.\n\n The predicted class probabilities of an input sample is computed as\n the mean predicted class probabilities of the base estimators in the\n ensemble. If base estimators do not implement a ``predict_proba``\n method, then it resorts to voting and the predicted class probabilities\n of an input sample represents the proportion of estimators predicting\n each class.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The training input samples. Sparse matrices are accepted only if\n they are supported by the base estimator.\n\n Returns\n -------\n p : ndarray of shape (n_samples, n_classes)\n The class probabilities of the input samples. 
The order of the\n classes corresponds to that in the attribute :term:`classes_`.\n \"\"\"\n check_is_fitted(self)\n X = self._validate_data(X, accept_sparse=['csr', 'csc'], dtype=None, force_all_finite=False, reset=False)\n (n_jobs, n_estimators, starts) = _partition_estimators(self.n_estimators, self.n_jobs)\n all_proba = Parallel(n_jobs=n_jobs, verbose=self.verbose, **self._parallel_args())((delayed(_parallel_predict_proba)(self.estimators_[starts[i]:starts[i + 1]], self.estimators_features_[starts[i]:starts[i + 1]], X, self.n_classes_) for i in range(n_jobs)))\n proba = sum(all_proba) / self.n_estimators\n return proba\n \n def predict_log_proba(self, X):\n \"\"\"Predict class log-probabilities for X.\n\n The predicted class log-probabilities of an input sample is computed as\n the log of the mean predicted class probabilities of the base\n estimators in the ensemble.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The training input samples. Sparse matrices are accepted only if\n they are supported by the base estimator.\n\n Returns\n -------\n p : ndarray of shape (n_samples, n_classes)\n The class log-probabilities of the input samples. The order of the\n classes corresponds to that in the attribute :term:`classes_`.\n \"\"\"\n check_is_fitted(self)\n if hasattr(self.base_estimator_, 'predict_log_proba'):\n X = self._validate_data(X, accept_sparse=['csr', 'csc'], dtype=None, force_all_finite=False, reset=False)\n (n_jobs, n_estimators, starts) = _partition_estimators(self.n_estimators, self.n_jobs)\n all_log_proba = Parallel(n_jobs=n_jobs, verbose=self.verbose)((delayed(_parallel_predict_log_proba)(self.estimators_[starts[i]:starts[i + 1]], self.estimators_features_[starts[i]:starts[i + 1]], X, self.n_classes_) for i in range(n_jobs)))\n log_proba = all_log_proba[0]\n for j in range(1, len(all_log_proba)):\n log_proba = np.logaddexp(log_proba, all_log_proba[j])\n log_proba -= np.log(self.n_estimators)\n return log_proba\n else:\n return np.log(self.predict_proba(X))\n \n @if_delegate_has_method(delegate='base_estimator')\n def decision_function(self, X):\n \"\"\"Average of the decision functions of the base classifiers.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The training input samples. Sparse matrices are accepted only if\n they are supported by the base estimator.\n\n Returns\n -------\n score : ndarray of shape (n_samples, k)\n The decision function of the input samples. The columns correspond\n to the classes in sorted order, as they appear in the attribute\n ``classes_``. 
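Editor's illustrative sketch (not the recorded source): the soft-vote averaging implemented by the `predict_proba` and `predict` methods shown above, assuming every base estimator exposes `predict_proba`; the helper name `averaged_predict` is hypothetical.

import numpy as np

def averaged_predict(estimators, estimators_features, classes, X):
    # Mean of the per-estimator class probabilities (soft vote), each estimator
    # restricted to the feature subset it was trained on.
    proba = np.mean(
        [est.predict_proba(X[:, feats]) for est, feats in zip(estimators, estimators_features)],
        axis=0,
    )
    # predict() then maps the argmax back to the original class labels.
    return classes.take(np.argmax(proba, axis=1), axis=0)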
Regression and binary classification are special\n cases with ``k == 1``, otherwise ``k==n_classes``.\n \"\"\"\n check_is_fitted(self)\n X = self._validate_data(X, accept_sparse=['csr', 'csc'], dtype=None, force_all_finite=False, reset=False)\n (n_jobs, n_estimators, starts) = _partition_estimators(self.n_estimators, self.n_jobs)\n all_decisions = Parallel(n_jobs=n_jobs, verbose=self.verbose)((delayed(_parallel_decision_function)(self.estimators_[starts[i]:starts[i + 1]], self.estimators_features_[starts[i]:starts[i + 1]], X) for i in range(n_jobs)))\n decisions = sum(all_decisions) / self.n_estimators\n return decisions\n" }, @@ -20817,7 +20883,7 @@ "sklearn.ensemble._bagging.BaggingRegressor._set_oob_score" ], "is_public": true, - "description": "A Bagging regressor.\n\nA Bagging regressor is an ensemble meta-estimator that fits base regressors each on random subsets of the original dataset and then aggregate their individual predictions (either by voting or by averaging) to form a final prediction. Such a meta-estimator can typically be used as a way to reduce the variance of a black-box estimator (e.g., a decision tree), by introducing randomization into its construction procedure and then making an ensemble out of it. This algorithm encompasses several works from the literature. When random subsets of the dataset are drawn as random subsets of the samples, then this algorithm is known as Pasting [1]_. If samples are drawn with replacement, then the method is known as Bagging [2]_. When random subsets of the dataset are drawn as random subsets of the features, then the method is known as Random Subspaces [3]_. Finally, when base estimators are built on subsets of both samples and features, then the method is known as Random Patches [4]_. Read more in the :ref:`User Guide `. .. versionadded:: 0.15", + "description": "A Bagging regressor.\n\nA Bagging regressor is an ensemble meta-estimator that fits base\nregressors each on random subsets of the original dataset and then\naggregate their individual predictions (either by voting or by averaging)\nto form a final prediction. Such a meta-estimator can typically be used as\na way to reduce the variance of a black-box estimator (e.g., a decision\ntree), by introducing randomization into its construction procedure and\nthen making an ensemble out of it.\n\nThis algorithm encompasses several works from the literature. When random\nsubsets of the dataset are drawn as random subsets of the samples, then\nthis algorithm is known as Pasting [1]_. If samples are drawn with\nreplacement, then the method is known as Bagging [2]_. When random subsets\nof the dataset are drawn as random subsets of the features, then the method\nis known as Random Subspaces [3]_. Finally, when base estimators are built\non subsets of both samples and features, then the method is known as\nRandom Patches [4]_.\n\nRead more in the :ref:`User Guide `.\n\n.. versionadded:: 0.15", "docstring": "A Bagging regressor.\n\n A Bagging regressor is an ensemble meta-estimator that fits base\n regressors each on random subsets of the original dataset and then\n aggregate their individual predictions (either by voting or by averaging)\n to form a final prediction. Such a meta-estimator can typically be used as\n a way to reduce the variance of a black-box estimator (e.g., a decision\n tree), by introducing randomization into its construction procedure and\n then making an ensemble out of it.\n\n This algorithm encompasses several works from the literature. 
When random\n subsets of the dataset are drawn as random subsets of the samples, then\n this algorithm is known as Pasting [1]_. If samples are drawn with\n replacement, then the method is known as Bagging [2]_. When random subsets\n of the dataset are drawn as random subsets of the features, then the method\n is known as Random Subspaces [3]_. Finally, when base estimators are built\n on subsets of both samples and features, then the method is known as\n Random Patches [4]_.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.15\n\n Parameters\n ----------\n base_estimator : object, default=None\n The base estimator to fit on random subsets of the dataset.\n If None, then the base estimator is a\n :class:`~sklearn.tree.DecisionTreeRegressor`.\n\n n_estimators : int, default=10\n The number of base estimators in the ensemble.\n\n max_samples : int or float, default=1.0\n The number of samples to draw from X to train each base estimator (with\n replacement by default, see `bootstrap` for more details).\n\n - If int, then draw `max_samples` samples.\n - If float, then draw `max_samples * X.shape[0]` samples.\n\n max_features : int or float, default=1.0\n The number of features to draw from X to train each base estimator (\n without replacement by default, see `bootstrap_features` for more\n details).\n\n - If int, then draw `max_features` features.\n - If float, then draw `max_features * X.shape[1]` features.\n\n bootstrap : bool, default=True\n Whether samples are drawn with replacement. If False, sampling\n without replacement is performed.\n\n bootstrap_features : bool, default=False\n Whether features are drawn with replacement.\n\n oob_score : bool, default=False\n Whether to use out-of-bag samples to estimate\n the generalization error. Only available if bootstrap=True.\n\n warm_start : bool, default=False\n When set to True, reuse the solution of the previous call to fit\n and add more estimators to the ensemble, otherwise, just fit\n a whole new ensemble. See :term:`the Glossary `.\n\n n_jobs : int, default=None\n The number of jobs to run in parallel for both :meth:`fit` and\n :meth:`predict`. ``None`` means 1 unless in a\n :obj:`joblib.parallel_backend` context. ``-1`` means using all\n processors. See :term:`Glossary ` for more details.\n\n random_state : int, RandomState instance or None, default=None\n Controls the random resampling of the original dataset\n (sample wise and feature wise).\n If the base estimator accepts a `random_state` attribute, a different\n seed is generated for each instance in the ensemble.\n Pass an int for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n verbose : int, default=0\n Controls the verbosity when fitting and predicting.\n\n Attributes\n ----------\n base_estimator_ : estimator\n The base estimator from which the ensemble is grown.\n\n n_features_ : int\n The number of features when :meth:`fit` is performed.\n\n .. deprecated:: 1.0\n Attribute `n_features_` was deprecated in version 1.0 and will be\n removed in 1.2. Use `n_features_in_` instead.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. 
versionadded:: 1.0\n\n estimators_ : list of estimators\n The collection of fitted sub-estimators.\n\n estimators_samples_ : list of arrays\n The subset of drawn samples (i.e., the in-bag samples) for each base\n estimator. Each subset is defined by an array of the indices selected.\n\n estimators_features_ : list of arrays\n The subset of drawn features for each base estimator.\n\n oob_score_ : float\n Score of the training dataset obtained using an out-of-bag estimate.\n This attribute exists only when ``oob_score`` is True.\n\n oob_prediction_ : ndarray of shape (n_samples,)\n Prediction computed with out-of-bag estimate on the training\n set. If n_estimators is small it might be possible that a data point\n was never left out during the bootstrap. In this case,\n `oob_prediction_` might contain NaN. This attribute exists only\n when ``oob_score`` is True.\n\n See Also\n --------\n BaggingClassifier : A Bagging classifier.\n\n References\n ----------\n\n .. [1] L. Breiman, \"Pasting small votes for classification in large\n databases and on-line\", Machine Learning, 36(1), 85-103, 1999.\n\n .. [2] L. Breiman, \"Bagging predictors\", Machine Learning, 24(2), 123-140,\n 1996.\n\n .. [3] T. Ho, \"The random subspace method for constructing decision\n forests\", Pattern Analysis and Machine Intelligence, 20(8), 832-844,\n 1998.\n\n .. [4] G. Louppe and P. Geurts, \"Ensembles on Random Patches\", Machine\n Learning and Knowledge Discovery in Databases, 346-361, 2012.\n\n Examples\n --------\n >>> from sklearn.svm import SVR\n >>> from sklearn.ensemble import BaggingRegressor\n >>> from sklearn.datasets import make_regression\n >>> X, y = make_regression(n_samples=100, n_features=4,\n ... n_informative=2, n_targets=1,\n ... random_state=0, shuffle=False)\n >>> regr = BaggingRegressor(base_estimator=SVR(),\n ... n_estimators=10, random_state=0).fit(X, y)\n >>> regr.predict([[0, 0, 0, 0]])\n array([-2.8720...])\n ", "source_code": "\n\nclass BaggingRegressor(RegressorMixin, BaseBagging):\n \"\"\"A Bagging regressor.\n\n A Bagging regressor is an ensemble meta-estimator that fits base\n regressors each on random subsets of the original dataset and then\n aggregate their individual predictions (either by voting or by averaging)\n to form a final prediction. Such a meta-estimator can typically be used as\n a way to reduce the variance of a black-box estimator (e.g., a decision\n tree), by introducing randomization into its construction procedure and\n then making an ensemble out of it.\n\n This algorithm encompasses several works from the literature. When random\n subsets of the dataset are drawn as random subsets of the samples, then\n this algorithm is known as Pasting [1]_. If samples are drawn with\n replacement, then the method is known as Bagging [2]_. When random subsets\n of the dataset are drawn as random subsets of the features, then the method\n is known as Random Subspaces [3]_. Finally, when base estimators are built\n on subsets of both samples and features, then the method is known as\n Random Patches [4]_.\n\n Read more in the :ref:`User Guide `.\n\n .. 
versionadded:: 0.15\n\n Parameters\n ----------\n base_estimator : object, default=None\n The base estimator to fit on random subsets of the dataset.\n If None, then the base estimator is a\n :class:`~sklearn.tree.DecisionTreeRegressor`.\n\n n_estimators : int, default=10\n The number of base estimators in the ensemble.\n\n max_samples : int or float, default=1.0\n The number of samples to draw from X to train each base estimator (with\n replacement by default, see `bootstrap` for more details).\n\n - If int, then draw `max_samples` samples.\n - If float, then draw `max_samples * X.shape[0]` samples.\n\n max_features : int or float, default=1.0\n The number of features to draw from X to train each base estimator (\n without replacement by default, see `bootstrap_features` for more\n details).\n\n - If int, then draw `max_features` features.\n - If float, then draw `max_features * X.shape[1]` features.\n\n bootstrap : bool, default=True\n Whether samples are drawn with replacement. If False, sampling\n without replacement is performed.\n\n bootstrap_features : bool, default=False\n Whether features are drawn with replacement.\n\n oob_score : bool, default=False\n Whether to use out-of-bag samples to estimate\n the generalization error. Only available if bootstrap=True.\n\n warm_start : bool, default=False\n When set to True, reuse the solution of the previous call to fit\n and add more estimators to the ensemble, otherwise, just fit\n a whole new ensemble. See :term:`the Glossary `.\n\n n_jobs : int, default=None\n The number of jobs to run in parallel for both :meth:`fit` and\n :meth:`predict`. ``None`` means 1 unless in a\n :obj:`joblib.parallel_backend` context. ``-1`` means using all\n processors. See :term:`Glossary ` for more details.\n\n random_state : int, RandomState instance or None, default=None\n Controls the random resampling of the original dataset\n (sample wise and feature wise).\n If the base estimator accepts a `random_state` attribute, a different\n seed is generated for each instance in the ensemble.\n Pass an int for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n verbose : int, default=0\n Controls the verbosity when fitting and predicting.\n\n Attributes\n ----------\n base_estimator_ : estimator\n The base estimator from which the ensemble is grown.\n\n n_features_ : int\n The number of features when :meth:`fit` is performed.\n\n .. deprecated:: 1.0\n Attribute `n_features_` was deprecated in version 1.0 and will be\n removed in 1.2. Use `n_features_in_` instead.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n estimators_ : list of estimators\n The collection of fitted sub-estimators.\n\n estimators_samples_ : list of arrays\n The subset of drawn samples (i.e., the in-bag samples) for each base\n estimator. Each subset is defined by an array of the indices selected.\n\n estimators_features_ : list of arrays\n The subset of drawn features for each base estimator.\n\n oob_score_ : float\n Score of the training dataset obtained using an out-of-bag estimate.\n This attribute exists only when ``oob_score`` is True.\n\n oob_prediction_ : ndarray of shape (n_samples,)\n Prediction computed with out-of-bag estimate on the training\n set. 
If n_estimators is small it might be possible that a data point\n was never left out during the bootstrap. In this case,\n `oob_prediction_` might contain NaN. This attribute exists only\n when ``oob_score`` is True.\n\n See Also\n --------\n BaggingClassifier : A Bagging classifier.\n\n References\n ----------\n\n .. [1] L. Breiman, \"Pasting small votes for classification in large\n databases and on-line\", Machine Learning, 36(1), 85-103, 1999.\n\n .. [2] L. Breiman, \"Bagging predictors\", Machine Learning, 24(2), 123-140,\n 1996.\n\n .. [3] T. Ho, \"The random subspace method for constructing decision\n forests\", Pattern Analysis and Machine Intelligence, 20(8), 832-844,\n 1998.\n\n .. [4] G. Louppe and P. Geurts, \"Ensembles on Random Patches\", Machine\n Learning and Knowledge Discovery in Databases, 346-361, 2012.\n\n Examples\n --------\n >>> from sklearn.svm import SVR\n >>> from sklearn.ensemble import BaggingRegressor\n >>> from sklearn.datasets import make_regression\n >>> X, y = make_regression(n_samples=100, n_features=4,\n ... n_informative=2, n_targets=1,\n ... random_state=0, shuffle=False)\n >>> regr = BaggingRegressor(base_estimator=SVR(),\n ... n_estimators=10, random_state=0).fit(X, y)\n >>> regr.predict([[0, 0, 0, 0]])\n array([-2.8720...])\n \"\"\"\n \n def __init__(self, base_estimator=None, n_estimators=10, *, max_samples=1.0, max_features=1.0, bootstrap=True, bootstrap_features=False, oob_score=False, warm_start=False, n_jobs=None, random_state=None, verbose=0):\n super().__init__(base_estimator, n_estimators=n_estimators, max_samples=max_samples, max_features=max_features, bootstrap=bootstrap, bootstrap_features=bootstrap_features, oob_score=oob_score, warm_start=warm_start, n_jobs=n_jobs, random_state=random_state, verbose=verbose)\n \n def predict(self, X):\n \"\"\"Predict regression target for X.\n\n The predicted regression target of an input sample is computed as the\n mean predicted regression targets of the estimators in the ensemble.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The training input samples. Sparse matrices are accepted only if\n they are supported by the base estimator.\n\n Returns\n -------\n y : ndarray of shape (n_samples,)\n The predicted values.\n \"\"\"\n check_is_fitted(self)\n X = self._validate_data(X, accept_sparse=['csr', 'csc'], dtype=None, force_all_finite=False, reset=False)\n (n_jobs, n_estimators, starts) = _partition_estimators(self.n_estimators, self.n_jobs)\n all_y_hat = Parallel(n_jobs=n_jobs, verbose=self.verbose)((delayed(_parallel_predict_regression)(self.estimators_[starts[i]:starts[i + 1]], self.estimators_features_[starts[i]:starts[i + 1]], X) for i in range(n_jobs)))\n y_hat = sum(all_y_hat) / self.n_estimators\n return y_hat\n \n def _validate_estimator(self):\n \"\"\"Check the estimator and set the base_estimator_ attribute.\"\"\"\n super()._validate_estimator(default=DecisionTreeRegressor())\n \n def _set_oob_score(self, X, y):\n n_samples = y.shape[0]\n predictions = np.zeros((n_samples, ))\n n_predictions = np.zeros((n_samples, ))\n for (estimator, samples, features) in zip(self.estimators_, self.estimators_samples_, self.estimators_features_):\n mask = ~indices_to_mask(samples, n_samples)\n predictions[mask] += estimator.predict(X[mask, :][:, features])\n n_predictions[mask] += 1\n if (n_predictions == 0).any():\n warn('Some inputs do not have OOB scores. 
This probably means too few estimators were used to compute any reliable oob estimates.')\n n_predictions[n_predictions == 0] = 1\n predictions /= n_predictions\n self.oob_prediction_ = predictions\n self.oob_score_ = r2_score(y, predictions)\n" }, @@ -20838,7 +20904,7 @@ "sklearn.ensemble._bagging.BaseBagging.n_features_@getter" ], "is_public": false, - "description": "Base class for Bagging meta-estimator.\n\nWarning: This class should not be used directly. Use derived classes instead.", + "description": "Base class for Bagging meta-estimator.\n\nWarning: This class should not be used directly. Use derived classes\ninstead.", "docstring": "Base class for Bagging meta-estimator.\n\n Warning: This class should not be used directly. Use derived classes\n instead.\n ", "source_code": "\n\nclass BaseBagging(BaseEnsemble, metaclass=ABCMeta):\n \"\"\"Base class for Bagging meta-estimator.\n\n Warning: This class should not be used directly. Use derived classes\n instead.\n \"\"\"\n \n @abstractmethod\n def __init__(self, base_estimator=None, n_estimators=10, *, max_samples=1.0, max_features=1.0, bootstrap=True, bootstrap_features=False, oob_score=False, warm_start=False, n_jobs=None, random_state=None, verbose=0):\n super().__init__(base_estimator=base_estimator, n_estimators=n_estimators)\n self.max_samples = max_samples\n self.max_features = max_features\n self.bootstrap = bootstrap\n self.bootstrap_features = bootstrap_features\n self.oob_score = oob_score\n self.warm_start = warm_start\n self.n_jobs = n_jobs\n self.random_state = random_state\n self.verbose = verbose\n \n def fit(self, X, y, sample_weight=None):\n \"\"\"Build a Bagging ensemble of estimators from the training set (X, y).\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The training input samples. Sparse matrices are accepted only if\n they are supported by the base estimator.\n\n y : array-like of shape (n_samples,)\n The target values (class labels in classification, real numbers in\n regression).\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights. If None, then samples are equally weighted.\n Note that this is supported only if the base estimator supports\n sample weighting.\n\n Returns\n -------\n self : object\n Fitted estimator.\n \"\"\"\n (X, y) = self._validate_data(X, y, accept_sparse=['csr', 'csc'], dtype=None, force_all_finite=False, multi_output=True)\n return self._fit(X, y, self.max_samples, sample_weight=sample_weight)\n \n def _parallel_args(self):\n return {}\n \n def _fit(self, X, y, max_samples=None, max_depth=None, sample_weight=None):\n \"\"\"Build a Bagging ensemble of estimators from the training\n set (X, y).\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The training input samples. Sparse matrices are accepted only if\n they are supported by the base estimator.\n\n y : array-like of shape (n_samples,)\n The target values (class labels in classification, real numbers in\n regression).\n\n max_samples : int or float, default=None\n Argument to use instead of self.max_samples.\n\n max_depth : int, default=None\n Override value used when constructing base estimator. Only\n supported if the base estimator has a max_depth parameter.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights. 
If None, then samples are equally weighted.\n Note that this is supported only if the base estimator supports\n sample weighting.\n\n Returns\n -------\n self : object\n Fitted estimator.\n \"\"\"\n random_state = check_random_state(self.random_state)\n if sample_weight is not None:\n sample_weight = _check_sample_weight(sample_weight, X, dtype=None)\n n_samples = X.shape[0]\n self._n_samples = n_samples\n y = self._validate_y(y)\n self._validate_estimator()\n if max_depth is not None:\n self.base_estimator_.max_depth = max_depth\n if max_samples is None:\n max_samples = self.max_samples\n elif not isinstance(max_samples, numbers.Integral):\n max_samples = int(max_samples * X.shape[0])\n if not 0 < max_samples <= X.shape[0]:\n raise ValueError('max_samples must be in (0, n_samples]')\n self._max_samples = max_samples\n if isinstance(self.max_features, numbers.Integral):\n max_features = self.max_features\n elif isinstance(self.max_features, float):\n max_features = self.max_features * self.n_features_in_\n else:\n raise ValueError('max_features must be int or float')\n if not 0 < max_features <= self.n_features_in_:\n raise ValueError('max_features must be in (0, n_features]')\n max_features = max(1, int(max_features))\n self._max_features = max_features\n if not self.bootstrap and self.oob_score:\n raise ValueError('Out of bag estimation only available if bootstrap=True')\n if self.warm_start and self.oob_score:\n raise ValueError('Out of bag estimate only available if warm_start=False')\n if hasattr(self, 'oob_score_') and self.warm_start:\n del self.oob_score_\n if not self.warm_start or not hasattr(self, 'estimators_'):\n self.estimators_ = []\n self.estimators_features_ = []\n n_more_estimators = self.n_estimators - len(self.estimators_)\n if n_more_estimators < 0:\n raise ValueError('n_estimators=%d must be larger or equal to len(estimators_)=%d when warm_start==True' % (self.n_estimators, len(self.estimators_)))\n elif n_more_estimators == 0:\n warn('Warm-start fitting without increasing n_estimators does not fit new trees.')\n return self\n (n_jobs, n_estimators, starts) = _partition_estimators(n_more_estimators, self.n_jobs)\n total_n_estimators = sum(n_estimators)\n if self.warm_start and len(self.estimators_) > 0:\n random_state.randint(MAX_INT, size=len(self.estimators_))\n seeds = random_state.randint(MAX_INT, size=n_more_estimators)\n self._seeds = seeds\n all_results = Parallel(n_jobs=n_jobs, verbose=self.verbose, **self._parallel_args())((delayed(_parallel_build_estimators)(n_estimators[i], self, X, y, sample_weight, seeds[starts[i]:starts[i + 1]], total_n_estimators, verbose=self.verbose) for i in range(n_jobs)))\n self.estimators_ += list(itertools.chain.from_iterable((t[0] for t in all_results)))\n self.estimators_features_ += list(itertools.chain.from_iterable((t[1] for t in all_results)))\n if self.oob_score:\n self._set_oob_score(X, y)\n return self\n \n @abstractmethod\n def _set_oob_score(self, X, y):\n \"\"\"Calculate out of bag predictions and score.\"\"\"\n \n \n def _validate_y(self, y):\n if len(y.shape) == 1 or y.shape[1] == 1:\n return column_or_1d(y, warn=True)\n else:\n return y\n \n def _get_estimators_indices(self):\n for seed in self._seeds:\n (feature_indices, sample_indices) = _generate_bagging_indices(seed, self.bootstrap_features, self.bootstrap, self.n_features_in_, self._n_samples, self._max_features, self._max_samples)\n yield (feature_indices, sample_indices)\n \n @property\n def estimators_samples_(self):\n \"\"\"\n The subset of drawn samples 
for each base estimator.\n\n Returns a dynamically generated list of indices identifying\n the samples used for fitting each member of the ensemble, i.e.,\n the in-bag samples.\n\n Note: the list is re-created at each call to the property in order\n to reduce the object memory footprint by not storing the sampling\n data. Thus fetching the property may be slower than expected.\n \"\"\"\n return [sample_indices for (_, sample_indices) in self._get_estimators_indices()]\n \n @deprecated('Attribute `n_features_` was deprecated in version 1.0 and will be removed in 1.2. Use `n_features_in_` instead.')\n @property\n def n_features_(self):\n return self.n_features_in_\n" }, @@ -20856,7 +20922,7 @@ "sklearn.ensemble._base.BaseEnsemble.__iter__" ], "is_public": true, - "description": "Base class for all ensemble classes.\n\nWarning: This class should not be used directly. Use derived classes instead.", + "description": "Base class for all ensemble classes.\n\nWarning: This class should not be used directly. Use derived classes\ninstead.", "docstring": "Base class for all ensemble classes.\n\n Warning: This class should not be used directly. Use derived classes\n instead.\n\n Parameters\n ----------\n base_estimator : object\n The base estimator from which the ensemble is built.\n\n n_estimators : int, default=10\n The number of estimators in the ensemble.\n\n estimator_params : list of str, default=tuple()\n The list of attributes to use as parameters when instantiating a\n new base estimator. If none are given, default parameters are used.\n\n Attributes\n ----------\n base_estimator_ : estimator\n The base estimator from which the ensemble is grown.\n\n estimators_ : list of estimators\n The collection of fitted base estimators.\n ", "source_code": "\n\nclass BaseEnsemble(MetaEstimatorMixin, BaseEstimator, metaclass=ABCMeta):\n \"\"\"Base class for all ensemble classes.\n\n Warning: This class should not be used directly. Use derived classes\n instead.\n\n Parameters\n ----------\n base_estimator : object\n The base estimator from which the ensemble is built.\n\n n_estimators : int, default=10\n The number of estimators in the ensemble.\n\n estimator_params : list of str, default=tuple()\n The list of attributes to use as parameters when instantiating a\n new base estimator. 
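Editor's illustrative sketch: how the `BaseBagging._fit` logic recorded a few lines above resolves `max_samples` and `max_features`, with ints taken as absolute counts and floats as fractions of the training set. The helper name `resolve_bagging_limits` is hypothetical.

import numbers

def resolve_bagging_limits(max_samples, max_features, n_samples, n_features):
    if not isinstance(max_samples, numbers.Integral):
        max_samples = int(max_samples * n_samples)      # float -> fraction of samples
    if not 0 < max_samples <= n_samples:
        raise ValueError('max_samples must be in (0, n_samples]')
    if isinstance(max_features, float):
        max_features = max_features * n_features        # float -> fraction of features
    elif not isinstance(max_features, numbers.Integral):
        raise ValueError('max_features must be int or float')
    if not 0 < max_features <= n_features:
        raise ValueError('max_features must be in (0, n_features]')
    return max_samples, max(1, int(max_features))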
If none are given, default parameters are used.\n\n Attributes\n ----------\n base_estimator_ : estimator\n The base estimator from which the ensemble is grown.\n\n estimators_ : list of estimators\n The collection of fitted base estimators.\n \"\"\"\n _required_parameters: List[str] = []\n \n @abstractmethod\n def __init__(self, base_estimator, *, n_estimators=10, estimator_params=tuple()):\n self.base_estimator = base_estimator\n self.n_estimators = n_estimators\n self.estimator_params = estimator_params\n \n def _validate_estimator(self, default=None):\n \"\"\"Check the estimator and the n_estimator attribute.\n\n Sets the base_estimator_` attributes.\n \"\"\"\n if not isinstance(self.n_estimators, numbers.Integral):\n raise ValueError('n_estimators must be an integer, got {0}.'.format(type(self.n_estimators)))\n if self.n_estimators <= 0:\n raise ValueError('n_estimators must be greater than zero, got {0}.'.format(self.n_estimators))\n if self.base_estimator is not None:\n self.base_estimator_ = self.base_estimator\n else:\n self.base_estimator_ = default\n if self.base_estimator_ is None:\n raise ValueError('base_estimator cannot be None')\n \n def _make_estimator(self, append=True, random_state=None):\n \"\"\"Make and configure a copy of the `base_estimator_` attribute.\n\n Warning: This method should be used to properly instantiate new\n sub-estimators.\n \"\"\"\n estimator = clone(self.base_estimator_)\n estimator.set_params(**{p: getattr(self, p) for p in self.estimator_params})\n if isinstance(estimator, (DecisionTreeRegressor, ExtraTreeRegressor)):\n if getattr(estimator, 'criterion', None) == 'mse':\n estimator.set_params(criterion='squared_error')\n elif getattr(estimator, 'criterion', None) == 'mae':\n estimator.set_params(criterion='absolute_error')\n if random_state is not None:\n _set_random_states(estimator, random_state)\n if append:\n self.estimators_.append(estimator)\n return estimator\n \n def __len__(self):\n \"\"\"Return the number of estimators in the ensemble.\"\"\"\n return len(self.estimators_)\n \n def __getitem__(self, index):\n \"\"\"Return the index'th estimator in the ensemble.\"\"\"\n return self.estimators_[index]\n \n def __iter__(self):\n \"\"\"Return iterator over estimators in the ensemble.\"\"\"\n return iter(self.estimators_)\n" }, @@ -20895,9 +20961,9 @@ "sklearn.ensemble._forest.BaseForest.n_features_@getter" ], "is_public": false, - "description": "Base class for forests of trees.\n\nWarning: This class should not be used directly. Use derived classes instead.", + "description": "Base class for forests of trees.\n\nWarning: This class should not be used directly. Use derived classes\ninstead.", "docstring": "\n Base class for forests of trees.\n\n Warning: This class should not be used directly. Use derived classes\n instead.\n ", - "source_code": "\n\nclass BaseForest(MultiOutputMixin, BaseEnsemble, metaclass=ABCMeta):\n \"\"\"\n Base class for forests of trees.\n\n Warning: This class should not be used directly. 
Use derived classes\n instead.\n \"\"\"\n \n @abstractmethod\n def __init__(self, base_estimator, n_estimators=100, *, estimator_params=tuple(), bootstrap=False, oob_score=False, n_jobs=None, random_state=None, verbose=0, warm_start=False, class_weight=None, max_samples=None):\n super().__init__(base_estimator=base_estimator, n_estimators=n_estimators, estimator_params=estimator_params)\n self.bootstrap = bootstrap\n self.oob_score = oob_score\n self.n_jobs = n_jobs\n self.random_state = random_state\n self.verbose = verbose\n self.warm_start = warm_start\n self.class_weight = class_weight\n self.max_samples = max_samples\n \n def apply(self, X):\n \"\"\"\n Apply trees in the forest to X, return leaf indices.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input samples. Internally, its dtype will be converted to\n ``dtype=np.float32``. If a sparse matrix is provided, it will be\n converted into a sparse ``csr_matrix``.\n\n Returns\n -------\n X_leaves : ndarray of shape (n_samples, n_estimators)\n For each datapoint x in X and for each tree in the forest,\n return the index of the leaf x ends up in.\n \"\"\"\n X = self._validate_X_predict(X)\n results = Parallel(n_jobs=self.n_jobs, verbose=self.verbose, **_joblib_parallel_args(prefer='threads'))((delayed(tree.apply)(X, check_input=False) for tree in self.estimators_))\n return np.array(results).T\n \n def decision_path(self, X):\n \"\"\"\n Return the decision path in the forest.\n\n .. versionadded:: 0.18\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input samples. Internally, its dtype will be converted to\n ``dtype=np.float32``. If a sparse matrix is provided, it will be\n converted into a sparse ``csr_matrix``.\n\n Returns\n -------\n indicator : sparse matrix of shape (n_samples, n_nodes)\n Return a node indicator matrix where non zero elements indicates\n that the samples goes through the nodes. The matrix is of CSR\n format.\n\n n_nodes_ptr : ndarray of shape (n_estimators + 1,)\n The columns from indicator[n_nodes_ptr[i]:n_nodes_ptr[i+1]]\n gives the indicator value for the i-th estimator.\n \"\"\"\n X = self._validate_X_predict(X)\n indicators = Parallel(n_jobs=self.n_jobs, verbose=self.verbose, **_joblib_parallel_args(prefer='threads'))((delayed(tree.decision_path)(X, check_input=False) for tree in self.estimators_))\n n_nodes = [0]\n n_nodes.extend([i.shape[1] for i in indicators])\n n_nodes_ptr = np.array(n_nodes).cumsum()\n return sparse_hstack(indicators).tocsr(), n_nodes_ptr\n \n def fit(self, X, y, sample_weight=None):\n \"\"\"\n Build a forest of trees from the training set (X, y).\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The training input samples. Internally, its dtype will be converted\n to ``dtype=np.float32``. If a sparse matrix is provided, it will be\n converted into a sparse ``csc_matrix``.\n\n y : array-like of shape (n_samples,) or (n_samples, n_outputs)\n The target values (class labels in classification, real numbers in\n regression).\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights. If None, then samples are equally weighted. Splits\n that would create child nodes with net zero or negative weight are\n ignored while searching for a split in each node. 
In the case of\n classification, splits are also ignored if they would result in any\n single class carrying a negative weight in either child node.\n\n Returns\n -------\n self : object\n Fitted estimator.\n \"\"\"\n if issparse(y):\n raise ValueError('sparse multilabel-indicator for y is not supported.')\n (X, y) = self._validate_data(X, y, multi_output=True, accept_sparse='csc', dtype=DTYPE)\n if sample_weight is not None:\n sample_weight = _check_sample_weight(sample_weight, X)\n if issparse(X):\n X.sort_indices()\n y = np.atleast_1d(y)\n if y.ndim == 2 and y.shape[1] == 1:\n warn('A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().', DataConversionWarning, stacklevel=2)\n if y.ndim == 1:\n y = np.reshape(y, (-1, 1))\n if self.criterion == 'poisson':\n if np.any(y < 0):\n raise ValueError('Some value(s) of y are negative which is not allowed for Poisson regression.')\n if np.sum(y) <= 0:\n raise ValueError('Sum of y is not strictly positive which is necessary for Poisson regression.')\n self.n_outputs_ = y.shape[1]\n (y, expanded_class_weight) = self._validate_y_class_weight(y)\n if getattr(y, 'dtype', None) != DOUBLE or not y.flags.contiguous:\n y = np.ascontiguousarray(y, dtype=DOUBLE)\n if expanded_class_weight is not None:\n if sample_weight is not None:\n sample_weight = sample_weight * expanded_class_weight\n else:\n sample_weight = expanded_class_weight\n n_samples_bootstrap = _get_n_samples_bootstrap(n_samples=X.shape[0], max_samples=self.max_samples)\n self._validate_estimator()\n if isinstance(self, (RandomForestRegressor, ExtraTreesRegressor)):\n if self.criterion == 'mse':\n warn(\"Criterion 'mse' was deprecated in v1.0 and will be removed in version 1.2. Use `criterion='squared_error'` which is equivalent.\", FutureWarning)\n elif self.criterion == 'mae':\n warn(\"Criterion 'mae' was deprecated in v1.0 and will be removed in version 1.2. Use `criterion='absolute_error'` which is equivalent.\", FutureWarning)\n if not self.bootstrap and self.oob_score:\n raise ValueError('Out of bag estimation only available if bootstrap=True')\n random_state = check_random_state(self.random_state)\n if not self.warm_start or not hasattr(self, 'estimators_'):\n self.estimators_ = []\n n_more_estimators = self.n_estimators - len(self.estimators_)\n if n_more_estimators < 0:\n raise ValueError('n_estimators=%d must be larger or equal to len(estimators_)=%d when warm_start==True' % (self.n_estimators, len(self.estimators_)))\n elif n_more_estimators == 0:\n warn('Warm-start fitting without increasing n_estimators does not fit new trees.')\n else:\n if self.warm_start and len(self.estimators_) > 0:\n random_state.randint(MAX_INT, size=len(self.estimators_))\n trees = [self._make_estimator(append=False, random_state=random_state) for i in range(n_more_estimators)]\n trees = Parallel(n_jobs=self.n_jobs, verbose=self.verbose, **_joblib_parallel_args(prefer='threads'))((delayed(_parallel_build_trees)(t, self, X, y, sample_weight, i, len(trees), verbose=self.verbose, class_weight=self.class_weight, n_samples_bootstrap=n_samples_bootstrap) for (i, t) in enumerate(trees)))\n self.estimators_.extend(trees)\n if self.oob_score:\n y_type = type_of_target(y)\n if y_type in ('multiclass-multioutput', 'unknown'):\n raise ValueError(f'The type of target cannot be used to compute OOB estimates. 
Got {y_type} while only the following are supported: continuous, continuous-multioutput, binary, multiclass, multilabel-indicator.')\n self._set_oob_score_and_attributes(X, y)\n if hasattr(self, 'classes_') and self.n_outputs_ == 1:\n self.n_classes_ = self.n_classes_[0]\n self.classes_ = self.classes_[0]\n return self\n \n @abstractmethod\n def _set_oob_score_and_attributes(self, X, y):\n \"\"\"Compute and set the OOB score and attributes.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The data matrix.\n y : ndarray of shape (n_samples, n_outputs)\n The target matrix.\n \"\"\"\n \n \n def _compute_oob_predictions(self, X, y):\n \"\"\"Compute and set the OOB score.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The data matrix.\n y : ndarray of shape (n_samples, n_outputs)\n The target matrix.\n\n Returns\n -------\n oob_pred : ndarray of shape (n_samples, n_classes, n_outputs) or (n_samples, 1, n_outputs)\n The OOB predictions.\n \"\"\"\n X = self._validate_data(X, dtype=DTYPE, accept_sparse='csr', reset=False)\n n_samples = y.shape[0]\n n_outputs = self.n_outputs_\n if is_classifier(self) and hasattr(self, 'n_classes_'):\n oob_pred_shape = (n_samples, self.n_classes_[0], n_outputs)\n else:\n oob_pred_shape = (n_samples, 1, n_outputs)\n oob_pred = np.zeros(shape=oob_pred_shape, dtype=np.float64)\n n_oob_pred = np.zeros((n_samples, n_outputs), dtype=np.int64)\n n_samples_bootstrap = _get_n_samples_bootstrap(n_samples, self.max_samples)\n for estimator in self.estimators_:\n unsampled_indices = _generate_unsampled_indices(estimator.random_state, n_samples, n_samples_bootstrap)\n y_pred = self._get_oob_predictions(estimator, X[unsampled_indices, :])\n oob_pred[unsampled_indices, ...] += y_pred\n n_oob_pred[unsampled_indices, :] += 1\n for k in range(n_outputs):\n if (n_oob_pred == 0).any():\n warn('Some inputs do not have OOB scores. This probably means too few trees were used to compute any reliable OOB estimates.', UserWarning)\n n_oob_pred[n_oob_pred == 0] = 1\n oob_pred[..., k] /= n_oob_pred[..., [k]]\n return oob_pred\n \n def _validate_y_class_weight(self, y):\n return y, None\n \n def _validate_X_predict(self, X):\n \"\"\"\n Validate X whenever one tries to predict, apply, predict_proba.\"\"\"\n check_is_fitted(self)\n X = self._validate_data(X, dtype=DTYPE, accept_sparse='csr', reset=False)\n if issparse(X) and (X.indices.dtype != np.intc or X.indptr.dtype != np.intc):\n raise ValueError('No support for np.int64 index based sparse matrices')\n return X\n \n @property\n def feature_importances_(self):\n \"\"\"\n The impurity-based feature importances.\n\n The higher, the more important the feature.\n The importance of a feature is computed as the (normalized)\n total reduction of the criterion brought by that feature. It is also\n known as the Gini importance.\n\n Warning: impurity-based feature importances can be misleading for\n high cardinality features (many unique values). 
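Editor's illustrative sketch: the aggregation behind the `feature_importances_` property whose docstring continues just below; single-node trees are skipped and the mean importance vector is renormalised to sum to 1. The helper name is hypothetical.

import numpy as np

def forest_feature_importances(trees, n_features):
    per_tree = [t.feature_importances_ for t in trees if t.tree_.node_count > 1]
    if not per_tree:
        return np.zeros(n_features, dtype=np.float64)   # all trees are root-only
    mean_importances = np.mean(per_tree, axis=0)
    return mean_importances / mean_importances.sum()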
See\n :func:`sklearn.inspection.permutation_importance` as an alternative.\n\n Returns\n -------\n feature_importances_ : ndarray of shape (n_features,)\n The values of this array sum to 1, unless all trees are single node\n trees consisting of only the root node, in which case it will be an\n array of zeros.\n \"\"\"\n check_is_fitted(self)\n all_importances = Parallel(n_jobs=self.n_jobs, **_joblib_parallel_args(prefer='threads'))((delayed(getattr)(tree, 'feature_importances_') for tree in self.estimators_ if tree.tree_.node_count > 1))\n if not all_importances:\n return np.zeros(self.n_features_in_, dtype=np.float64)\n all_importances = np.mean(all_importances, axis=0, dtype=np.float64)\n return all_importances / np.sum(all_importances)\n \n @deprecated('Attribute `n_features_` was deprecated in version 1.0 and will be removed in 1.2. Use `n_features_in_` instead.')\n @property\n def n_features_(self):\n \"\"\"Number of features when fitting the estimator.\"\"\"\n return self.n_features_in_\n" + "source_code": "\n\nclass BaseForest(MultiOutputMixin, BaseEnsemble, metaclass=ABCMeta):\n \"\"\"\n Base class for forests of trees.\n\n Warning: This class should not be used directly. Use derived classes\n instead.\n \"\"\"\n \n @abstractmethod\n def __init__(self, base_estimator, n_estimators=100, *, estimator_params=tuple(), bootstrap=False, oob_score=False, n_jobs=None, random_state=None, verbose=0, warm_start=False, class_weight=None, max_samples=None):\n super().__init__(base_estimator=base_estimator, n_estimators=n_estimators, estimator_params=estimator_params)\n self.bootstrap = bootstrap\n self.oob_score = oob_score\n self.n_jobs = n_jobs\n self.random_state = random_state\n self.verbose = verbose\n self.warm_start = warm_start\n self.class_weight = class_weight\n self.max_samples = max_samples\n \n def apply(self, X):\n \"\"\"\n Apply trees in the forest to X, return leaf indices.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input samples. Internally, its dtype will be converted to\n ``dtype=np.float32``. If a sparse matrix is provided, it will be\n converted into a sparse ``csr_matrix``.\n\n Returns\n -------\n X_leaves : ndarray of shape (n_samples, n_estimators)\n For each datapoint x in X and for each tree in the forest,\n return the index of the leaf x ends up in.\n \"\"\"\n X = self._validate_X_predict(X)\n results = Parallel(n_jobs=self.n_jobs, verbose=self.verbose, **_joblib_parallel_args(prefer='threads'))((delayed(tree.apply)(X, check_input=False) for tree in self.estimators_))\n return np.array(results).T\n \n def decision_path(self, X):\n \"\"\"\n Return the decision path in the forest.\n\n .. versionadded:: 0.18\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input samples. Internally, its dtype will be converted to\n ``dtype=np.float32``. If a sparse matrix is provided, it will be\n converted into a sparse ``csr_matrix``.\n\n Returns\n -------\n indicator : sparse matrix of shape (n_samples, n_nodes)\n Return a node indicator matrix where non zero elements indicates\n that the samples goes through the nodes. 
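Editor's usage sketch (assuming a fitted forest): `decision_path` returns the horizontally stacked per-tree node-indicator matrix together with `n_nodes_ptr`, which delimits the columns belonging to each tree.

from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

X, y = make_classification(n_samples=20, n_features=4, random_state=0)
forest = RandomForestClassifier(n_estimators=3, random_state=0).fit(X, y)
indicator, n_nodes_ptr = forest.decision_path(X)
tree0_indicator = indicator[:, n_nodes_ptr[0]:n_nodes_ptr[1]]   # columns of the first tree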
The matrix is of CSR\n format.\n\n n_nodes_ptr : ndarray of shape (n_estimators + 1,)\n The columns from indicator[n_nodes_ptr[i]:n_nodes_ptr[i+1]]\n gives the indicator value for the i-th estimator.\n \"\"\"\n X = self._validate_X_predict(X)\n indicators = Parallel(n_jobs=self.n_jobs, verbose=self.verbose, **_joblib_parallel_args(prefer='threads'))((delayed(tree.decision_path)(X, check_input=False) for tree in self.estimators_))\n n_nodes = [0]\n n_nodes.extend([i.shape[1] for i in indicators])\n n_nodes_ptr = np.array(n_nodes).cumsum()\n return sparse_hstack(indicators).tocsr(), n_nodes_ptr\n \n def fit(self, X, y, sample_weight=None):\n \"\"\"\n Build a forest of trees from the training set (X, y).\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The training input samples. Internally, its dtype will be converted\n to ``dtype=np.float32``. If a sparse matrix is provided, it will be\n converted into a sparse ``csc_matrix``.\n\n y : array-like of shape (n_samples,) or (n_samples, n_outputs)\n The target values (class labels in classification, real numbers in\n regression).\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights. If None, then samples are equally weighted. Splits\n that would create child nodes with net zero or negative weight are\n ignored while searching for a split in each node. In the case of\n classification, splits are also ignored if they would result in any\n single class carrying a negative weight in either child node.\n\n Returns\n -------\n self : object\n Fitted estimator.\n \"\"\"\n if issparse(y):\n raise ValueError('sparse multilabel-indicator for y is not supported.')\n (X, y) = self._validate_data(X, y, multi_output=True, accept_sparse='csc', dtype=DTYPE)\n if sample_weight is not None:\n sample_weight = _check_sample_weight(sample_weight, X)\n if issparse(X):\n X.sort_indices()\n y = np.atleast_1d(y)\n if y.ndim == 2 and y.shape[1] == 1:\n warn('A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().', DataConversionWarning, stacklevel=2)\n if y.ndim == 1:\n y = np.reshape(y, (-1, 1))\n if self.criterion == 'poisson':\n if np.any(y < 0):\n raise ValueError('Some value(s) of y are negative which is not allowed for Poisson regression.')\n if np.sum(y) <= 0:\n raise ValueError('Sum of y is not strictly positive which is necessary for Poisson regression.')\n self.n_outputs_ = y.shape[1]\n (y, expanded_class_weight) = self._validate_y_class_weight(y)\n if getattr(y, 'dtype', None) != DOUBLE or not y.flags.contiguous:\n y = np.ascontiguousarray(y, dtype=DOUBLE)\n if expanded_class_weight is not None:\n if sample_weight is not None:\n sample_weight = sample_weight * expanded_class_weight\n else:\n sample_weight = expanded_class_weight\n if not self.bootstrap and self.max_samples is not None:\n raise ValueError('`max_sample` cannot be set if `bootstrap=False`. Either switch to `bootstrap=True` or set `max_sample=None`.')\n elif self.bootstrap:\n n_samples_bootstrap = _get_n_samples_bootstrap(n_samples=X.shape[0], max_samples=self.max_samples)\n else:\n n_samples_bootstrap = None\n self._validate_estimator()\n if isinstance(self, (RandomForestRegressor, ExtraTreesRegressor)):\n if self.criterion == 'mse':\n warn(\"Criterion 'mse' was deprecated in v1.0 and will be removed in version 1.2. 
Use `criterion='squared_error'` which is equivalent.\", FutureWarning)\n elif self.criterion == 'mae':\n warn(\"Criterion 'mae' was deprecated in v1.0 and will be removed in version 1.2. Use `criterion='absolute_error'` which is equivalent.\", FutureWarning)\n if not self.bootstrap and self.oob_score:\n raise ValueError('Out of bag estimation only available if bootstrap=True')\n random_state = check_random_state(self.random_state)\n if not self.warm_start or not hasattr(self, 'estimators_'):\n self.estimators_ = []\n n_more_estimators = self.n_estimators - len(self.estimators_)\n if n_more_estimators < 0:\n raise ValueError('n_estimators=%d must be larger or equal to len(estimators_)=%d when warm_start==True' % (self.n_estimators, len(self.estimators_)))\n elif n_more_estimators == 0:\n warn('Warm-start fitting without increasing n_estimators does not fit new trees.')\n else:\n if self.warm_start and len(self.estimators_) > 0:\n random_state.randint(MAX_INT, size=len(self.estimators_))\n trees = [self._make_estimator(append=False, random_state=random_state) for i in range(n_more_estimators)]\n trees = Parallel(n_jobs=self.n_jobs, verbose=self.verbose, **_joblib_parallel_args(prefer='threads'))((delayed(_parallel_build_trees)(t, self, X, y, sample_weight, i, len(trees), verbose=self.verbose, class_weight=self.class_weight, n_samples_bootstrap=n_samples_bootstrap) for (i, t) in enumerate(trees)))\n self.estimators_.extend(trees)\n if self.oob_score:\n y_type = type_of_target(y)\n if y_type in ('multiclass-multioutput', 'unknown'):\n raise ValueError(f'The type of target cannot be used to compute OOB estimates. Got {y_type} while only the following are supported: continuous, continuous-multioutput, binary, multiclass, multilabel-indicator.')\n self._set_oob_score_and_attributes(X, y)\n if hasattr(self, 'classes_') and self.n_outputs_ == 1:\n self.n_classes_ = self.n_classes_[0]\n self.classes_ = self.classes_[0]\n return self\n \n @abstractmethod\n def _set_oob_score_and_attributes(self, X, y):\n \"\"\"Compute and set the OOB score and attributes.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The data matrix.\n y : ndarray of shape (n_samples, n_outputs)\n The target matrix.\n \"\"\"\n \n \n def _compute_oob_predictions(self, X, y):\n \"\"\"Compute and set the OOB score.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The data matrix.\n y : ndarray of shape (n_samples, n_outputs)\n The target matrix.\n\n Returns\n -------\n oob_pred : ndarray of shape (n_samples, n_classes, n_outputs) or (n_samples, 1, n_outputs)\n The OOB predictions.\n \"\"\"\n if issparse(X):\n X = X.tocsr()\n n_samples = y.shape[0]\n n_outputs = self.n_outputs_\n if is_classifier(self) and hasattr(self, 'n_classes_'):\n oob_pred_shape = (n_samples, self.n_classes_[0], n_outputs)\n else:\n oob_pred_shape = (n_samples, 1, n_outputs)\n oob_pred = np.zeros(shape=oob_pred_shape, dtype=np.float64)\n n_oob_pred = np.zeros((n_samples, n_outputs), dtype=np.int64)\n n_samples_bootstrap = _get_n_samples_bootstrap(n_samples, self.max_samples)\n for estimator in self.estimators_:\n unsampled_indices = _generate_unsampled_indices(estimator.random_state, n_samples, n_samples_bootstrap)\n y_pred = self._get_oob_predictions(estimator, X[unsampled_indices, :])\n oob_pred[unsampled_indices, ...] += y_pred\n n_oob_pred[unsampled_indices, :] += 1\n for k in range(n_outputs):\n if (n_oob_pred == 0).any():\n warn('Some inputs do not have OOB scores. 
This probably means too few trees were used to compute any reliable OOB estimates.', UserWarning)\n n_oob_pred[n_oob_pred == 0] = 1\n oob_pred[..., k] /= n_oob_pred[..., [k]]\n return oob_pred\n \n def _validate_y_class_weight(self, y):\n return y, None\n \n def _validate_X_predict(self, X):\n \"\"\"\n Validate X whenever one tries to predict, apply, predict_proba.\"\"\"\n check_is_fitted(self)\n X = self._validate_data(X, dtype=DTYPE, accept_sparse='csr', reset=False)\n if issparse(X) and (X.indices.dtype != np.intc or X.indptr.dtype != np.intc):\n raise ValueError('No support for np.int64 index based sparse matrices')\n return X\n \n @property\n def feature_importances_(self):\n \"\"\"\n The impurity-based feature importances.\n\n The higher, the more important the feature.\n The importance of a feature is computed as the (normalized)\n total reduction of the criterion brought by that feature. It is also\n known as the Gini importance.\n\n Warning: impurity-based feature importances can be misleading for\n high cardinality features (many unique values). See\n :func:`sklearn.inspection.permutation_importance` as an alternative.\n\n Returns\n -------\n feature_importances_ : ndarray of shape (n_features,)\n The values of this array sum to 1, unless all trees are single node\n trees consisting of only the root node, in which case it will be an\n array of zeros.\n \"\"\"\n check_is_fitted(self)\n all_importances = Parallel(n_jobs=self.n_jobs, **_joblib_parallel_args(prefer='threads'))((delayed(getattr)(tree, 'feature_importances_') for tree in self.estimators_ if tree.tree_.node_count > 1))\n if not all_importances:\n return np.zeros(self.n_features_in_, dtype=np.float64)\n all_importances = np.mean(all_importances, axis=0, dtype=np.float64)\n return all_importances / np.sum(all_importances)\n \n @deprecated('Attribute `n_features_` was deprecated in version 1.0 and will be removed in 1.2. Use `n_features_in_` instead.')\n @property\n def n_features_(self):\n \"\"\"Number of features when fitting the estimator.\"\"\"\n return self.n_features_in_\n" }, { "name": "ExtraTreesClassifier", @@ -20908,9 +20974,9 @@ "sklearn.ensemble._forest.ExtraTreesClassifier.__init__" ], "is_public": true, - "description": "An extra-trees classifier.\n\nThis class implements a meta estimator that fits a number of randomized decision trees (a.k.a. extra-trees) on various sub-samples of the dataset and uses averaging to improve the predictive accuracy and control over-fitting. Read more in the :ref:`User Guide `.", - "docstring": "\n An extra-trees classifier.\n\n This class implements a meta estimator that fits a number of\n randomized decision trees (a.k.a. extra-trees) on various sub-samples\n of the dataset and uses averaging to improve the predictive accuracy\n and control over-fitting.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n n_estimators : int, default=100\n The number of trees in the forest.\n\n .. versionchanged:: 0.22\n The default value of ``n_estimators`` changed from 10 to 100\n in 0.22.\n\n criterion : {\"gini\", \"entropy\"}, default=\"gini\"\n The function to measure the quality of a split. Supported criteria are\n \"gini\" for the Gini impurity and \"entropy\" for the information gain.\n\n max_depth : int, default=None\n The maximum depth of the tree. 
If None, then nodes are expanded until\n all leaves are pure or until all leaves contain less than\n min_samples_split samples.\n\n min_samples_split : int or float, default=2\n The minimum number of samples required to split an internal node:\n\n - If int, then consider `min_samples_split` as the minimum number.\n - If float, then `min_samples_split` is a fraction and\n `ceil(min_samples_split * n_samples)` are the minimum\n number of samples for each split.\n\n .. versionchanged:: 0.18\n Added float values for fractions.\n\n min_samples_leaf : int or float, default=1\n The minimum number of samples required to be at a leaf node.\n A split point at any depth will only be considered if it leaves at\n least ``min_samples_leaf`` training samples in each of the left and\n right branches. This may have the effect of smoothing the model,\n especially in regression.\n\n - If int, then consider `min_samples_leaf` as the minimum number.\n - If float, then `min_samples_leaf` is a fraction and\n `ceil(min_samples_leaf * n_samples)` are the minimum\n number of samples for each node.\n\n .. versionchanged:: 0.18\n Added float values for fractions.\n\n min_weight_fraction_leaf : float, default=0.0\n The minimum weighted fraction of the sum total of weights (of all\n the input samples) required to be at a leaf node. Samples have\n equal weight when sample_weight is not provided.\n\n max_features : {\"auto\", \"sqrt\", \"log2\"}, int or float, default=\"auto\"\n The number of features to consider when looking for the best split:\n\n - If int, then consider `max_features` features at each split.\n - If float, then `max_features` is a fraction and\n `round(max_features * n_features)` features are considered at each\n split.\n - If \"auto\", then `max_features=sqrt(n_features)`.\n - If \"sqrt\", then `max_features=sqrt(n_features)`.\n - If \"log2\", then `max_features=log2(n_features)`.\n - If None, then `max_features=n_features`.\n\n Note: the search for a split does not stop until at least one\n valid partition of the node samples is found, even if it requires to\n effectively inspect more than ``max_features`` features.\n\n max_leaf_nodes : int, default=None\n Grow trees with ``max_leaf_nodes`` in best-first fashion.\n Best nodes are defined as relative reduction in impurity.\n If None then unlimited number of leaf nodes.\n\n min_impurity_decrease : float, default=0.0\n A node will be split if this split induces a decrease of the impurity\n greater than or equal to this value.\n\n The weighted impurity decrease equation is the following::\n\n N_t / N * (impurity - N_t_R / N_t * right_impurity\n - N_t_L / N_t * left_impurity)\n\n where ``N`` is the total number of samples, ``N_t`` is the number of\n samples at the current node, ``N_t_L`` is the number of samples in the\n left child, and ``N_t_R`` is the number of samples in the right child.\n\n ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum,\n if ``sample_weight`` is passed.\n\n .. versionadded:: 0.19\n\n bootstrap : bool, default=False\n Whether bootstrap samples are used when building trees. If False, the\n whole dataset is used to build each tree.\n\n oob_score : bool, default=False\n Whether to use out-of-bag samples to estimate the generalization score.\n Only available if bootstrap=True.\n\n n_jobs : int, default=None\n The number of jobs to run in parallel. :meth:`fit`, :meth:`predict`,\n :meth:`decision_path` and :meth:`apply` are all parallelized over the\n trees. 
``None`` means 1 unless in a :obj:`joblib.parallel_backend`\n context. ``-1`` means using all processors. See :term:`Glossary\n ` for more details.\n\n random_state : int, RandomState instance or None, default=None\n Controls 3 sources of randomness:\n\n - the bootstrapping of the samples used when building trees\n (if ``bootstrap=True``)\n - the sampling of the features to consider when looking for the best\n split at each node (if ``max_features < n_features``)\n - the draw of the splits for each of the `max_features`\n\n See :term:`Glossary ` for details.\n\n verbose : int, default=0\n Controls the verbosity when fitting and predicting.\n\n warm_start : bool, default=False\n When set to ``True``, reuse the solution of the previous call to fit\n and add more estimators to the ensemble, otherwise, just fit a whole\n new forest. See :term:`the Glossary `.\n\n class_weight : {\"balanced\", \"balanced_subsample\"}, dict or list of dicts, default=None\n Weights associated with classes in the form ``{class_label: weight}``.\n If not given, all classes are supposed to have weight one. For\n multi-output problems, a list of dicts can be provided in the same\n order as the columns of y.\n\n Note that for multioutput (including multilabel) weights should be\n defined for each class of every column in its own dict. For example,\n for four-class multilabel classification weights should be\n [{0: 1, 1: 1}, {0: 1, 1: 5}, {0: 1, 1: 1}, {0: 1, 1: 1}] instead of\n [{1:1}, {2:5}, {3:1}, {4:1}].\n\n The \"balanced\" mode uses the values of y to automatically adjust\n weights inversely proportional to class frequencies in the input data\n as ``n_samples / (n_classes * np.bincount(y))``\n\n The \"balanced_subsample\" mode is the same as \"balanced\" except that\n weights are computed based on the bootstrap sample for every tree\n grown.\n\n For multi-output, the weights of each column of y will be multiplied.\n\n Note that these weights will be multiplied with sample_weight (passed\n through the fit method) if sample_weight is specified.\n\n ccp_alpha : non-negative float, default=0.0\n Complexity parameter used for Minimal Cost-Complexity Pruning. The\n subtree with the largest cost complexity that is smaller than\n ``ccp_alpha`` will be chosen. By default, no pruning is performed. See\n :ref:`minimal_cost_complexity_pruning` for details.\n\n .. versionadded:: 0.22\n\n max_samples : int or float, default=None\n If bootstrap is True, the number of samples to draw from X\n to train each base estimator.\n\n - If None (default), then draw `X.shape[0]` samples.\n - If int, then draw `max_samples` samples.\n - If float, then draw `max_samples * X.shape[0]` samples. Thus,\n `max_samples` should be in the interval `(0.0, 1.0]`.\n\n .. 
versionadded:: 0.22\n\n Attributes\n ----------\n base_estimator_ : ExtraTreesClassifier\n The child estimator template used to create the collection of fitted\n sub-estimators.\n\n estimators_ : list of DecisionTreeClassifier\n The collection of fitted sub-estimators.\n\n classes_ : ndarray of shape (n_classes,) or a list of such arrays\n The classes labels (single output problem), or a list of arrays of\n class labels (multi-output problem).\n\n n_classes_ : int or list\n The number of classes (single output problem), or a list containing the\n number of classes for each output (multi-output problem).\n\n feature_importances_ : ndarray of shape (n_features,)\n The impurity-based feature importances.\n The higher, the more important the feature.\n The importance of a feature is computed as the (normalized)\n total reduction of the criterion brought by that feature. It is also\n known as the Gini importance.\n\n Warning: impurity-based feature importances can be misleading for\n high cardinality features (many unique values). See\n :func:`sklearn.inspection.permutation_importance` as an alternative.\n\n n_features_ : int\n The number of features when ``fit`` is performed.\n\n .. deprecated:: 1.0\n Attribute `n_features_` was deprecated in version 1.0 and will be\n removed in 1.2. Use `n_features_in_` instead.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n .. versionadded:: 1.0\n\n n_outputs_ : int\n The number of outputs when ``fit`` is performed.\n\n oob_score_ : float\n Score of the training dataset obtained using an out-of-bag estimate.\n This attribute exists only when ``oob_score`` is True.\n\n oob_decision_function_ : ndarray of shape (n_samples, n_classes) or (n_samples, n_classes, n_outputs)\n Decision function computed with out-of-bag estimate on the training\n set. If n_estimators is small it might be possible that a data point\n was never left out during the bootstrap. In this case,\n `oob_decision_function_` might contain NaN. This attribute exists\n only when ``oob_score`` is True.\n\n See Also\n --------\n ExtraTreesRegressor : An extra-trees regressor with random splits.\n RandomForestClassifier : A random forest classifier with optimal splits.\n RandomForestRegressor : Ensemble regressor using trees with optimal splits.\n\n Notes\n -----\n The default values for the parameters controlling the size of the trees\n (e.g. ``max_depth``, ``min_samples_leaf``, etc.) lead to fully grown and\n unpruned trees which can potentially be very large on some data sets. To\n reduce memory consumption, the complexity and size of the trees should be\n controlled by setting those parameter values.\n\n References\n ----------\n .. [1] P. Geurts, D. Ernst., and L. 
Wehenkel, \"Extremely randomized\n trees\", Machine Learning, 63(1), 3-42, 2006.\n\n Examples\n --------\n >>> from sklearn.ensemble import ExtraTreesClassifier\n >>> from sklearn.datasets import make_classification\n >>> X, y = make_classification(n_features=4, random_state=0)\n >>> clf = ExtraTreesClassifier(n_estimators=100, random_state=0)\n >>> clf.fit(X, y)\n ExtraTreesClassifier(random_state=0)\n >>> clf.predict([[0, 0, 0, 0]])\n array([1])\n ", - "source_code": "\n\nclass ExtraTreesClassifier(ForestClassifier):\n \"\"\"\n An extra-trees classifier.\n\n This class implements a meta estimator that fits a number of\n randomized decision trees (a.k.a. extra-trees) on various sub-samples\n of the dataset and uses averaging to improve the predictive accuracy\n and control over-fitting.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n n_estimators : int, default=100\n The number of trees in the forest.\n\n .. versionchanged:: 0.22\n The default value of ``n_estimators`` changed from 10 to 100\n in 0.22.\n\n criterion : {\"gini\", \"entropy\"}, default=\"gini\"\n The function to measure the quality of a split. Supported criteria are\n \"gini\" for the Gini impurity and \"entropy\" for the information gain.\n\n max_depth : int, default=None\n The maximum depth of the tree. If None, then nodes are expanded until\n all leaves are pure or until all leaves contain less than\n min_samples_split samples.\n\n min_samples_split : int or float, default=2\n The minimum number of samples required to split an internal node:\n\n - If int, then consider `min_samples_split` as the minimum number.\n - If float, then `min_samples_split` is a fraction and\n `ceil(min_samples_split * n_samples)` are the minimum\n number of samples for each split.\n\n .. versionchanged:: 0.18\n Added float values for fractions.\n\n min_samples_leaf : int or float, default=1\n The minimum number of samples required to be at a leaf node.\n A split point at any depth will only be considered if it leaves at\n least ``min_samples_leaf`` training samples in each of the left and\n right branches. This may have the effect of smoothing the model,\n especially in regression.\n\n - If int, then consider `min_samples_leaf` as the minimum number.\n - If float, then `min_samples_leaf` is a fraction and\n `ceil(min_samples_leaf * n_samples)` are the minimum\n number of samples for each node.\n\n .. versionchanged:: 0.18\n Added float values for fractions.\n\n min_weight_fraction_leaf : float, default=0.0\n The minimum weighted fraction of the sum total of weights (of all\n the input samples) required to be at a leaf node. 
Samples have\n equal weight when sample_weight is not provided.\n\n max_features : {\"auto\", \"sqrt\", \"log2\"}, int or float, default=\"auto\"\n The number of features to consider when looking for the best split:\n\n - If int, then consider `max_features` features at each split.\n - If float, then `max_features` is a fraction and\n `round(max_features * n_features)` features are considered at each\n split.\n - If \"auto\", then `max_features=sqrt(n_features)`.\n - If \"sqrt\", then `max_features=sqrt(n_features)`.\n - If \"log2\", then `max_features=log2(n_features)`.\n - If None, then `max_features=n_features`.\n\n Note: the search for a split does not stop until at least one\n valid partition of the node samples is found, even if it requires to\n effectively inspect more than ``max_features`` features.\n\n max_leaf_nodes : int, default=None\n Grow trees with ``max_leaf_nodes`` in best-first fashion.\n Best nodes are defined as relative reduction in impurity.\n If None then unlimited number of leaf nodes.\n\n min_impurity_decrease : float, default=0.0\n A node will be split if this split induces a decrease of the impurity\n greater than or equal to this value.\n\n The weighted impurity decrease equation is the following::\n\n N_t / N * (impurity - N_t_R / N_t * right_impurity\n - N_t_L / N_t * left_impurity)\n\n where ``N`` is the total number of samples, ``N_t`` is the number of\n samples at the current node, ``N_t_L`` is the number of samples in the\n left child, and ``N_t_R`` is the number of samples in the right child.\n\n ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum,\n if ``sample_weight`` is passed.\n\n .. versionadded:: 0.19\n\n bootstrap : bool, default=False\n Whether bootstrap samples are used when building trees. If False, the\n whole dataset is used to build each tree.\n\n oob_score : bool, default=False\n Whether to use out-of-bag samples to estimate the generalization score.\n Only available if bootstrap=True.\n\n n_jobs : int, default=None\n The number of jobs to run in parallel. :meth:`fit`, :meth:`predict`,\n :meth:`decision_path` and :meth:`apply` are all parallelized over the\n trees. ``None`` means 1 unless in a :obj:`joblib.parallel_backend`\n context. ``-1`` means using all processors. See :term:`Glossary\n ` for more details.\n\n random_state : int, RandomState instance or None, default=None\n Controls 3 sources of randomness:\n\n - the bootstrapping of the samples used when building trees\n (if ``bootstrap=True``)\n - the sampling of the features to consider when looking for the best\n split at each node (if ``max_features < n_features``)\n - the draw of the splits for each of the `max_features`\n\n See :term:`Glossary ` for details.\n\n verbose : int, default=0\n Controls the verbosity when fitting and predicting.\n\n warm_start : bool, default=False\n When set to ``True``, reuse the solution of the previous call to fit\n and add more estimators to the ensemble, otherwise, just fit a whole\n new forest. See :term:`the Glossary `.\n\n class_weight : {\"balanced\", \"balanced_subsample\"}, dict or list of dicts, default=None\n Weights associated with classes in the form ``{class_label: weight}``.\n If not given, all classes are supposed to have weight one. For\n multi-output problems, a list of dicts can be provided in the same\n order as the columns of y.\n\n Note that for multioutput (including multilabel) weights should be\n defined for each class of every column in its own dict. 
For example,\n for four-class multilabel classification weights should be\n [{0: 1, 1: 1}, {0: 1, 1: 5}, {0: 1, 1: 1}, {0: 1, 1: 1}] instead of\n [{1:1}, {2:5}, {3:1}, {4:1}].\n\n The \"balanced\" mode uses the values of y to automatically adjust\n weights inversely proportional to class frequencies in the input data\n as ``n_samples / (n_classes * np.bincount(y))``\n\n The \"balanced_subsample\" mode is the same as \"balanced\" except that\n weights are computed based on the bootstrap sample for every tree\n grown.\n\n For multi-output, the weights of each column of y will be multiplied.\n\n Note that these weights will be multiplied with sample_weight (passed\n through the fit method) if sample_weight is specified.\n\n ccp_alpha : non-negative float, default=0.0\n Complexity parameter used for Minimal Cost-Complexity Pruning. The\n subtree with the largest cost complexity that is smaller than\n ``ccp_alpha`` will be chosen. By default, no pruning is performed. See\n :ref:`minimal_cost_complexity_pruning` for details.\n\n .. versionadded:: 0.22\n\n max_samples : int or float, default=None\n If bootstrap is True, the number of samples to draw from X\n to train each base estimator.\n\n - If None (default), then draw `X.shape[0]` samples.\n - If int, then draw `max_samples` samples.\n - If float, then draw `max_samples * X.shape[0]` samples. Thus,\n `max_samples` should be in the interval `(0.0, 1.0]`.\n\n .. versionadded:: 0.22\n\n Attributes\n ----------\n base_estimator_ : ExtraTreesClassifier\n The child estimator template used to create the collection of fitted\n sub-estimators.\n\n estimators_ : list of DecisionTreeClassifier\n The collection of fitted sub-estimators.\n\n classes_ : ndarray of shape (n_classes,) or a list of such arrays\n The classes labels (single output problem), or a list of arrays of\n class labels (multi-output problem).\n\n n_classes_ : int or list\n The number of classes (single output problem), or a list containing the\n number of classes for each output (multi-output problem).\n\n feature_importances_ : ndarray of shape (n_features,)\n The impurity-based feature importances.\n The higher, the more important the feature.\n The importance of a feature is computed as the (normalized)\n total reduction of the criterion brought by that feature. It is also\n known as the Gini importance.\n\n Warning: impurity-based feature importances can be misleading for\n high cardinality features (many unique values). See\n :func:`sklearn.inspection.permutation_importance` as an alternative.\n\n n_features_ : int\n The number of features when ``fit`` is performed.\n\n .. deprecated:: 1.0\n Attribute `n_features_` was deprecated in version 1.0 and will be\n removed in 1.2. Use `n_features_in_` instead.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n .. versionadded:: 1.0\n\n n_outputs_ : int\n The number of outputs when ``fit`` is performed.\n\n oob_score_ : float\n Score of the training dataset obtained using an out-of-bag estimate.\n This attribute exists only when ``oob_score`` is True.\n\n oob_decision_function_ : ndarray of shape (n_samples, n_classes) or (n_samples, n_classes, n_outputs)\n Decision function computed with out-of-bag estimate on the training\n set. 
If n_estimators is small it might be possible that a data point\n was never left out during the bootstrap. In this case,\n `oob_decision_function_` might contain NaN. This attribute exists\n only when ``oob_score`` is True.\n\n See Also\n --------\n ExtraTreesRegressor : An extra-trees regressor with random splits.\n RandomForestClassifier : A random forest classifier with optimal splits.\n RandomForestRegressor : Ensemble regressor using trees with optimal splits.\n\n Notes\n -----\n The default values for the parameters controlling the size of the trees\n (e.g. ``max_depth``, ``min_samples_leaf``, etc.) lead to fully grown and\n unpruned trees which can potentially be very large on some data sets. To\n reduce memory consumption, the complexity and size of the trees should be\n controlled by setting those parameter values.\n\n References\n ----------\n .. [1] P. Geurts, D. Ernst., and L. Wehenkel, \"Extremely randomized\n trees\", Machine Learning, 63(1), 3-42, 2006.\n\n Examples\n --------\n >>> from sklearn.ensemble import ExtraTreesClassifier\n >>> from sklearn.datasets import make_classification\n >>> X, y = make_classification(n_features=4, random_state=0)\n >>> clf = ExtraTreesClassifier(n_estimators=100, random_state=0)\n >>> clf.fit(X, y)\n ExtraTreesClassifier(random_state=0)\n >>> clf.predict([[0, 0, 0, 0]])\n array([1])\n \"\"\"\n \n def __init__(self, n_estimators=100, *, criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0, bootstrap=False, oob_score=False, n_jobs=None, random_state=None, verbose=0, warm_start=False, class_weight=None, ccp_alpha=0.0, max_samples=None):\n super().__init__(base_estimator=ExtraTreeClassifier(), n_estimators=n_estimators, estimator_params=('criterion', 'max_depth', 'min_samples_split', 'min_samples_leaf', 'min_weight_fraction_leaf', 'max_features', 'max_leaf_nodes', 'min_impurity_decrease', 'random_state', 'ccp_alpha'), bootstrap=bootstrap, oob_score=oob_score, n_jobs=n_jobs, random_state=random_state, verbose=verbose, warm_start=warm_start, class_weight=class_weight, max_samples=max_samples)\n self.criterion = criterion\n self.max_depth = max_depth\n self.min_samples_split = min_samples_split\n self.min_samples_leaf = min_samples_leaf\n self.min_weight_fraction_leaf = min_weight_fraction_leaf\n self.max_features = max_features\n self.max_leaf_nodes = max_leaf_nodes\n self.min_impurity_decrease = min_impurity_decrease\n self.ccp_alpha = ccp_alpha\n" + "description": "An extra-trees classifier.\n\nThis class implements a meta estimator that fits a number of\nrandomized decision trees (a.k.a. extra-trees) on various sub-samples\nof the dataset and uses averaging to improve the predictive accuracy\nand control over-fitting.\n\nRead more in the :ref:`User Guide `.", + "docstring": "\n An extra-trees classifier.\n\n This class implements a meta estimator that fits a number of\n randomized decision trees (a.k.a. extra-trees) on various sub-samples\n of the dataset and uses averaging to improve the predictive accuracy\n and control over-fitting.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n n_estimators : int, default=100\n The number of trees in the forest.\n\n .. versionchanged:: 0.22\n The default value of ``n_estimators`` changed from 10 to 100\n in 0.22.\n\n criterion : {\"gini\", \"entropy\"}, default=\"gini\"\n The function to measure the quality of a split. 
Supported criteria are\n \"gini\" for the Gini impurity and \"entropy\" for the information gain.\n\n max_depth : int, default=None\n The maximum depth of the tree. If None, then nodes are expanded until\n all leaves are pure or until all leaves contain less than\n min_samples_split samples.\n\n min_samples_split : int or float, default=2\n The minimum number of samples required to split an internal node:\n\n - If int, then consider `min_samples_split` as the minimum number.\n - If float, then `min_samples_split` is a fraction and\n `ceil(min_samples_split * n_samples)` are the minimum\n number of samples for each split.\n\n .. versionchanged:: 0.18\n Added float values for fractions.\n\n min_samples_leaf : int or float, default=1\n The minimum number of samples required to be at a leaf node.\n A split point at any depth will only be considered if it leaves at\n least ``min_samples_leaf`` training samples in each of the left and\n right branches. This may have the effect of smoothing the model,\n especially in regression.\n\n - If int, then consider `min_samples_leaf` as the minimum number.\n - If float, then `min_samples_leaf` is a fraction and\n `ceil(min_samples_leaf * n_samples)` are the minimum\n number of samples for each node.\n\n .. versionchanged:: 0.18\n Added float values for fractions.\n\n min_weight_fraction_leaf : float, default=0.0\n The minimum weighted fraction of the sum total of weights (of all\n the input samples) required to be at a leaf node. Samples have\n equal weight when sample_weight is not provided.\n\n max_features : {\"auto\", \"sqrt\", \"log2\"}, int or float, default=\"auto\"\n The number of features to consider when looking for the best split:\n\n - If int, then consider `max_features` features at each split.\n - If float, then `max_features` is a fraction and\n `round(max_features * n_features)` features are considered at each\n split.\n - If \"auto\", then `max_features=sqrt(n_features)`.\n - If \"sqrt\", then `max_features=sqrt(n_features)`.\n - If \"log2\", then `max_features=log2(n_features)`.\n - If None, then `max_features=n_features`.\n\n Note: the search for a split does not stop until at least one\n valid partition of the node samples is found, even if it requires to\n effectively inspect more than ``max_features`` features.\n\n max_leaf_nodes : int, default=None\n Grow trees with ``max_leaf_nodes`` in best-first fashion.\n Best nodes are defined as relative reduction in impurity.\n If None then unlimited number of leaf nodes.\n\n min_impurity_decrease : float, default=0.0\n A node will be split if this split induces a decrease of the impurity\n greater than or equal to this value.\n\n The weighted impurity decrease equation is the following::\n\n N_t / N * (impurity - N_t_R / N_t * right_impurity\n - N_t_L / N_t * left_impurity)\n\n where ``N`` is the total number of samples, ``N_t`` is the number of\n samples at the current node, ``N_t_L`` is the number of samples in the\n left child, and ``N_t_R`` is the number of samples in the right child.\n\n ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum,\n if ``sample_weight`` is passed.\n\n .. versionadded:: 0.19\n\n bootstrap : bool, default=False\n Whether bootstrap samples are used when building trees. If False, the\n whole dataset is used to build each tree.\n\n oob_score : bool, default=False\n Whether to use out-of-bag samples to estimate the generalization score.\n Only available if bootstrap=True.\n\n n_jobs : int, default=None\n The number of jobs to run in parallel. 
:meth:`fit`, :meth:`predict`,\n :meth:`decision_path` and :meth:`apply` are all parallelized over the\n trees. ``None`` means 1 unless in a :obj:`joblib.parallel_backend`\n context. ``-1`` means using all processors. See :term:`Glossary\n ` for more details.\n\n random_state : int, RandomState instance or None, default=None\n Controls 3 sources of randomness:\n\n - the bootstrapping of the samples used when building trees\n (if ``bootstrap=True``)\n - the sampling of the features to consider when looking for the best\n split at each node (if ``max_features < n_features``)\n - the draw of the splits for each of the `max_features`\n\n See :term:`Glossary ` for details.\n\n verbose : int, default=0\n Controls the verbosity when fitting and predicting.\n\n warm_start : bool, default=False\n When set to ``True``, reuse the solution of the previous call to fit\n and add more estimators to the ensemble, otherwise, just fit a whole\n new forest. See :term:`the Glossary `.\n\n class_weight : {\"balanced\", \"balanced_subsample\"}, dict or list of dicts, default=None\n Weights associated with classes in the form ``{class_label: weight}``.\n If not given, all classes are supposed to have weight one. For\n multi-output problems, a list of dicts can be provided in the same\n order as the columns of y.\n\n Note that for multioutput (including multilabel) weights should be\n defined for each class of every column in its own dict. For example,\n for four-class multilabel classification weights should be\n [{0: 1, 1: 1}, {0: 1, 1: 5}, {0: 1, 1: 1}, {0: 1, 1: 1}] instead of\n [{1:1}, {2:5}, {3:1}, {4:1}].\n\n The \"balanced\" mode uses the values of y to automatically adjust\n weights inversely proportional to class frequencies in the input data\n as ``n_samples / (n_classes * np.bincount(y))``\n\n The \"balanced_subsample\" mode is the same as \"balanced\" except that\n weights are computed based on the bootstrap sample for every tree\n grown.\n\n For multi-output, the weights of each column of y will be multiplied.\n\n Note that these weights will be multiplied with sample_weight (passed\n through the fit method) if sample_weight is specified.\n\n ccp_alpha : non-negative float, default=0.0\n Complexity parameter used for Minimal Cost-Complexity Pruning. The\n subtree with the largest cost complexity that is smaller than\n ``ccp_alpha`` will be chosen. By default, no pruning is performed. See\n :ref:`minimal_cost_complexity_pruning` for details.\n\n .. versionadded:: 0.22\n\n max_samples : int or float, default=None\n If bootstrap is True, the number of samples to draw from X\n to train each base estimator.\n\n - If None (default), then draw `X.shape[0]` samples.\n - If int, then draw `max_samples` samples.\n - If float, then draw `max_samples * X.shape[0]` samples. Thus,\n `max_samples` should be in the interval `(0.0, 1.0]`.\n\n .. 
versionadded:: 0.22\n\n Attributes\n ----------\n base_estimator_ : ExtraTreesClassifier\n The child estimator template used to create the collection of fitted\n sub-estimators.\n\n estimators_ : list of DecisionTreeClassifier\n The collection of fitted sub-estimators.\n\n classes_ : ndarray of shape (n_classes,) or a list of such arrays\n The classes labels (single output problem), or a list of arrays of\n class labels (multi-output problem).\n\n n_classes_ : int or list\n The number of classes (single output problem), or a list containing the\n number of classes for each output (multi-output problem).\n\n feature_importances_ : ndarray of shape (n_features,)\n The impurity-based feature importances.\n The higher, the more important the feature.\n The importance of a feature is computed as the (normalized)\n total reduction of the criterion brought by that feature. It is also\n known as the Gini importance.\n\n Warning: impurity-based feature importances can be misleading for\n high cardinality features (many unique values). See\n :func:`sklearn.inspection.permutation_importance` as an alternative.\n\n n_features_ : int\n The number of features when ``fit`` is performed.\n\n .. deprecated:: 1.0\n Attribute `n_features_` was deprecated in version 1.0 and will be\n removed in 1.2. Use `n_features_in_` instead.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n n_outputs_ : int\n The number of outputs when ``fit`` is performed.\n\n oob_score_ : float\n Score of the training dataset obtained using an out-of-bag estimate.\n This attribute exists only when ``oob_score`` is True.\n\n oob_decision_function_ : ndarray of shape (n_samples, n_classes) or (n_samples, n_classes, n_outputs)\n Decision function computed with out-of-bag estimate on the training\n set. If n_estimators is small it might be possible that a data point\n was never left out during the bootstrap. In this case,\n `oob_decision_function_` might contain NaN. This attribute exists\n only when ``oob_score`` is True.\n\n See Also\n --------\n ExtraTreesRegressor : An extra-trees regressor with random splits.\n RandomForestClassifier : A random forest classifier with optimal splits.\n RandomForestRegressor : Ensemble regressor using trees with optimal splits.\n\n Notes\n -----\n The default values for the parameters controlling the size of the trees\n (e.g. ``max_depth``, ``min_samples_leaf``, etc.) lead to fully grown and\n unpruned trees which can potentially be very large on some data sets. To\n reduce memory consumption, the complexity and size of the trees should be\n controlled by setting those parameter values.\n\n References\n ----------\n .. [1] P. Geurts, D. Ernst., and L. 
Wehenkel, \"Extremely randomized\n trees\", Machine Learning, 63(1), 3-42, 2006.\n\n Examples\n --------\n >>> from sklearn.ensemble import ExtraTreesClassifier\n >>> from sklearn.datasets import make_classification\n >>> X, y = make_classification(n_features=4, random_state=0)\n >>> clf = ExtraTreesClassifier(n_estimators=100, random_state=0)\n >>> clf.fit(X, y)\n ExtraTreesClassifier(random_state=0)\n >>> clf.predict([[0, 0, 0, 0]])\n array([1])\n ", + "source_code": "\n\nclass ExtraTreesClassifier(ForestClassifier):\n \"\"\"\n An extra-trees classifier.\n\n This class implements a meta estimator that fits a number of\n randomized decision trees (a.k.a. extra-trees) on various sub-samples\n of the dataset and uses averaging to improve the predictive accuracy\n and control over-fitting.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n n_estimators : int, default=100\n The number of trees in the forest.\n\n .. versionchanged:: 0.22\n The default value of ``n_estimators`` changed from 10 to 100\n in 0.22.\n\n criterion : {\"gini\", \"entropy\"}, default=\"gini\"\n The function to measure the quality of a split. Supported criteria are\n \"gini\" for the Gini impurity and \"entropy\" for the information gain.\n\n max_depth : int, default=None\n The maximum depth of the tree. If None, then nodes are expanded until\n all leaves are pure or until all leaves contain less than\n min_samples_split samples.\n\n min_samples_split : int or float, default=2\n The minimum number of samples required to split an internal node:\n\n - If int, then consider `min_samples_split` as the minimum number.\n - If float, then `min_samples_split` is a fraction and\n `ceil(min_samples_split * n_samples)` are the minimum\n number of samples for each split.\n\n .. versionchanged:: 0.18\n Added float values for fractions.\n\n min_samples_leaf : int or float, default=1\n The minimum number of samples required to be at a leaf node.\n A split point at any depth will only be considered if it leaves at\n least ``min_samples_leaf`` training samples in each of the left and\n right branches. This may have the effect of smoothing the model,\n especially in regression.\n\n - If int, then consider `min_samples_leaf` as the minimum number.\n - If float, then `min_samples_leaf` is a fraction and\n `ceil(min_samples_leaf * n_samples)` are the minimum\n number of samples for each node.\n\n .. versionchanged:: 0.18\n Added float values for fractions.\n\n min_weight_fraction_leaf : float, default=0.0\n The minimum weighted fraction of the sum total of weights (of all\n the input samples) required to be at a leaf node. 
Samples have\n equal weight when sample_weight is not provided.\n\n max_features : {\"auto\", \"sqrt\", \"log2\"}, int or float, default=\"auto\"\n The number of features to consider when looking for the best split:\n\n - If int, then consider `max_features` features at each split.\n - If float, then `max_features` is a fraction and\n `round(max_features * n_features)` features are considered at each\n split.\n - If \"auto\", then `max_features=sqrt(n_features)`.\n - If \"sqrt\", then `max_features=sqrt(n_features)`.\n - If \"log2\", then `max_features=log2(n_features)`.\n - If None, then `max_features=n_features`.\n\n Note: the search for a split does not stop until at least one\n valid partition of the node samples is found, even if it requires to\n effectively inspect more than ``max_features`` features.\n\n max_leaf_nodes : int, default=None\n Grow trees with ``max_leaf_nodes`` in best-first fashion.\n Best nodes are defined as relative reduction in impurity.\n If None then unlimited number of leaf nodes.\n\n min_impurity_decrease : float, default=0.0\n A node will be split if this split induces a decrease of the impurity\n greater than or equal to this value.\n\n The weighted impurity decrease equation is the following::\n\n N_t / N * (impurity - N_t_R / N_t * right_impurity\n - N_t_L / N_t * left_impurity)\n\n where ``N`` is the total number of samples, ``N_t`` is the number of\n samples at the current node, ``N_t_L`` is the number of samples in the\n left child, and ``N_t_R`` is the number of samples in the right child.\n\n ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum,\n if ``sample_weight`` is passed.\n\n .. versionadded:: 0.19\n\n bootstrap : bool, default=False\n Whether bootstrap samples are used when building trees. If False, the\n whole dataset is used to build each tree.\n\n oob_score : bool, default=False\n Whether to use out-of-bag samples to estimate the generalization score.\n Only available if bootstrap=True.\n\n n_jobs : int, default=None\n The number of jobs to run in parallel. :meth:`fit`, :meth:`predict`,\n :meth:`decision_path` and :meth:`apply` are all parallelized over the\n trees. ``None`` means 1 unless in a :obj:`joblib.parallel_backend`\n context. ``-1`` means using all processors. See :term:`Glossary\n ` for more details.\n\n random_state : int, RandomState instance or None, default=None\n Controls 3 sources of randomness:\n\n - the bootstrapping of the samples used when building trees\n (if ``bootstrap=True``)\n - the sampling of the features to consider when looking for the best\n split at each node (if ``max_features < n_features``)\n - the draw of the splits for each of the `max_features`\n\n See :term:`Glossary ` for details.\n\n verbose : int, default=0\n Controls the verbosity when fitting and predicting.\n\n warm_start : bool, default=False\n When set to ``True``, reuse the solution of the previous call to fit\n and add more estimators to the ensemble, otherwise, just fit a whole\n new forest. See :term:`the Glossary `.\n\n class_weight : {\"balanced\", \"balanced_subsample\"}, dict or list of dicts, default=None\n Weights associated with classes in the form ``{class_label: weight}``.\n If not given, all classes are supposed to have weight one. For\n multi-output problems, a list of dicts can be provided in the same\n order as the columns of y.\n\n Note that for multioutput (including multilabel) weights should be\n defined for each class of every column in its own dict. 
For example,\n for four-class multilabel classification weights should be\n [{0: 1, 1: 1}, {0: 1, 1: 5}, {0: 1, 1: 1}, {0: 1, 1: 1}] instead of\n [{1:1}, {2:5}, {3:1}, {4:1}].\n\n The \"balanced\" mode uses the values of y to automatically adjust\n weights inversely proportional to class frequencies in the input data\n as ``n_samples / (n_classes * np.bincount(y))``\n\n The \"balanced_subsample\" mode is the same as \"balanced\" except that\n weights are computed based on the bootstrap sample for every tree\n grown.\n\n For multi-output, the weights of each column of y will be multiplied.\n\n Note that these weights will be multiplied with sample_weight (passed\n through the fit method) if sample_weight is specified.\n\n ccp_alpha : non-negative float, default=0.0\n Complexity parameter used for Minimal Cost-Complexity Pruning. The\n subtree with the largest cost complexity that is smaller than\n ``ccp_alpha`` will be chosen. By default, no pruning is performed. See\n :ref:`minimal_cost_complexity_pruning` for details.\n\n .. versionadded:: 0.22\n\n max_samples : int or float, default=None\n If bootstrap is True, the number of samples to draw from X\n to train each base estimator.\n\n - If None (default), then draw `X.shape[0]` samples.\n - If int, then draw `max_samples` samples.\n - If float, then draw `max_samples * X.shape[0]` samples. Thus,\n `max_samples` should be in the interval `(0.0, 1.0]`.\n\n .. versionadded:: 0.22\n\n Attributes\n ----------\n base_estimator_ : ExtraTreesClassifier\n The child estimator template used to create the collection of fitted\n sub-estimators.\n\n estimators_ : list of DecisionTreeClassifier\n The collection of fitted sub-estimators.\n\n classes_ : ndarray of shape (n_classes,) or a list of such arrays\n The classes labels (single output problem), or a list of arrays of\n class labels (multi-output problem).\n\n n_classes_ : int or list\n The number of classes (single output problem), or a list containing the\n number of classes for each output (multi-output problem).\n\n feature_importances_ : ndarray of shape (n_features,)\n The impurity-based feature importances.\n The higher, the more important the feature.\n The importance of a feature is computed as the (normalized)\n total reduction of the criterion brought by that feature. It is also\n known as the Gini importance.\n\n Warning: impurity-based feature importances can be misleading for\n high cardinality features (many unique values). See\n :func:`sklearn.inspection.permutation_importance` as an alternative.\n\n n_features_ : int\n The number of features when ``fit`` is performed.\n\n .. deprecated:: 1.0\n Attribute `n_features_` was deprecated in version 1.0 and will be\n removed in 1.2. Use `n_features_in_` instead.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n n_outputs_ : int\n The number of outputs when ``fit`` is performed.\n\n oob_score_ : float\n Score of the training dataset obtained using an out-of-bag estimate.\n This attribute exists only when ``oob_score`` is True.\n\n oob_decision_function_ : ndarray of shape (n_samples, n_classes) or (n_samples, n_classes, n_outputs)\n Decision function computed with out-of-bag estimate on the training\n set. 
If n_estimators is small it might be possible that a data point\n was never left out during the bootstrap. In this case,\n `oob_decision_function_` might contain NaN. This attribute exists\n only when ``oob_score`` is True.\n\n See Also\n --------\n ExtraTreesRegressor : An extra-trees regressor with random splits.\n RandomForestClassifier : A random forest classifier with optimal splits.\n RandomForestRegressor : Ensemble regressor using trees with optimal splits.\n\n Notes\n -----\n The default values for the parameters controlling the size of the trees\n (e.g. ``max_depth``, ``min_samples_leaf``, etc.) lead to fully grown and\n unpruned trees which can potentially be very large on some data sets. To\n reduce memory consumption, the complexity and size of the trees should be\n controlled by setting those parameter values.\n\n References\n ----------\n .. [1] P. Geurts, D. Ernst., and L. Wehenkel, \"Extremely randomized\n trees\", Machine Learning, 63(1), 3-42, 2006.\n\n Examples\n --------\n >>> from sklearn.ensemble import ExtraTreesClassifier\n >>> from sklearn.datasets import make_classification\n >>> X, y = make_classification(n_features=4, random_state=0)\n >>> clf = ExtraTreesClassifier(n_estimators=100, random_state=0)\n >>> clf.fit(X, y)\n ExtraTreesClassifier(random_state=0)\n >>> clf.predict([[0, 0, 0, 0]])\n array([1])\n \"\"\"\n \n def __init__(self, n_estimators=100, *, criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0, bootstrap=False, oob_score=False, n_jobs=None, random_state=None, verbose=0, warm_start=False, class_weight=None, ccp_alpha=0.0, max_samples=None):\n super().__init__(base_estimator=ExtraTreeClassifier(), n_estimators=n_estimators, estimator_params=('criterion', 'max_depth', 'min_samples_split', 'min_samples_leaf', 'min_weight_fraction_leaf', 'max_features', 'max_leaf_nodes', 'min_impurity_decrease', 'random_state', 'ccp_alpha'), bootstrap=bootstrap, oob_score=oob_score, n_jobs=n_jobs, random_state=random_state, verbose=verbose, warm_start=warm_start, class_weight=class_weight, max_samples=max_samples)\n self.criterion = criterion\n self.max_depth = max_depth\n self.min_samples_split = min_samples_split\n self.min_samples_leaf = min_samples_leaf\n self.min_weight_fraction_leaf = min_weight_fraction_leaf\n self.max_features = max_features\n self.max_leaf_nodes = max_leaf_nodes\n self.min_impurity_decrease = min_impurity_decrease\n self.ccp_alpha = ccp_alpha\n" }, { "name": "ExtraTreesRegressor", @@ -20921,9 +20987,9 @@ "sklearn.ensemble._forest.ExtraTreesRegressor.__init__" ], "is_public": true, - "description": "An extra-trees regressor.\n\nThis class implements a meta estimator that fits a number of randomized decision trees (a.k.a. extra-trees) on various sub-samples of the dataset and uses averaging to improve the predictive accuracy and control over-fitting. Read more in the :ref:`User Guide `.", - "docstring": "\n An extra-trees regressor.\n\n This class implements a meta estimator that fits a number of\n randomized decision trees (a.k.a. extra-trees) on various sub-samples\n of the dataset and uses averaging to improve the predictive accuracy\n and control over-fitting.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n n_estimators : int, default=100\n The number of trees in the forest.\n\n .. 
versionchanged:: 0.22\n The default value of ``n_estimators`` changed from 10 to 100\n in 0.22.\n\n criterion : {\"squared_error\", \"absolute_error\"}, default=\"squared_error\"\n The function to measure the quality of a split. Supported criteria\n are \"squared_error\" for the mean squared error, which is equal to\n variance reduction as feature selection criterion, and \"absolute_error\"\n for the mean absolute error.\n\n .. versionadded:: 0.18\n Mean Absolute Error (MAE) criterion.\n\n .. deprecated:: 1.0\n Criterion \"mse\" was deprecated in v1.0 and will be removed in\n version 1.2. Use `criterion=\"squared_error\"` which is equivalent.\n\n .. deprecated:: 1.0\n Criterion \"mae\" was deprecated in v1.0 and will be removed in\n version 1.2. Use `criterion=\"absolute_error\"` which is equivalent.\n\n max_depth : int, default=None\n The maximum depth of the tree. If None, then nodes are expanded until\n all leaves are pure or until all leaves contain less than\n min_samples_split samples.\n\n min_samples_split : int or float, default=2\n The minimum number of samples required to split an internal node:\n\n - If int, then consider `min_samples_split` as the minimum number.\n - If float, then `min_samples_split` is a fraction and\n `ceil(min_samples_split * n_samples)` are the minimum\n number of samples for each split.\n\n .. versionchanged:: 0.18\n Added float values for fractions.\n\n min_samples_leaf : int or float, default=1\n The minimum number of samples required to be at a leaf node.\n A split point at any depth will only be considered if it leaves at\n least ``min_samples_leaf`` training samples in each of the left and\n right branches. This may have the effect of smoothing the model,\n especially in regression.\n\n - If int, then consider `min_samples_leaf` as the minimum number.\n - If float, then `min_samples_leaf` is a fraction and\n `ceil(min_samples_leaf * n_samples)` are the minimum\n number of samples for each node.\n\n .. versionchanged:: 0.18\n Added float values for fractions.\n\n min_weight_fraction_leaf : float, default=0.0\n The minimum weighted fraction of the sum total of weights (of all\n the input samples) required to be at a leaf node. 
Samples have\n equal weight when sample_weight is not provided.\n\n max_features : {\"auto\", \"sqrt\", \"log2\"}, int or float, default=\"auto\"\n The number of features to consider when looking for the best split:\n\n - If int, then consider `max_features` features at each split.\n - If float, then `max_features` is a fraction and\n `round(max_features * n_features)` features are considered at each\n split.\n - If \"auto\", then `max_features=n_features`.\n - If \"sqrt\", then `max_features=sqrt(n_features)`.\n - If \"log2\", then `max_features=log2(n_features)`.\n - If None, then `max_features=n_features`.\n\n Note: the search for a split does not stop until at least one\n valid partition of the node samples is found, even if it requires to\n effectively inspect more than ``max_features`` features.\n\n max_leaf_nodes : int, default=None\n Grow trees with ``max_leaf_nodes`` in best-first fashion.\n Best nodes are defined as relative reduction in impurity.\n If None then unlimited number of leaf nodes.\n\n min_impurity_decrease : float, default=0.0\n A node will be split if this split induces a decrease of the impurity\n greater than or equal to this value.\n\n The weighted impurity decrease equation is the following::\n\n N_t / N * (impurity - N_t_R / N_t * right_impurity\n - N_t_L / N_t * left_impurity)\n\n where ``N`` is the total number of samples, ``N_t`` is the number of\n samples at the current node, ``N_t_L`` is the number of samples in the\n left child, and ``N_t_R`` is the number of samples in the right child.\n\n ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum,\n if ``sample_weight`` is passed.\n\n .. versionadded:: 0.19\n\n bootstrap : bool, default=False\n Whether bootstrap samples are used when building trees. If False, the\n whole dataset is used to build each tree.\n\n oob_score : bool, default=False\n Whether to use out-of-bag samples to estimate the generalization score.\n Only available if bootstrap=True.\n\n n_jobs : int, default=None\n The number of jobs to run in parallel. :meth:`fit`, :meth:`predict`,\n :meth:`decision_path` and :meth:`apply` are all parallelized over the\n trees. ``None`` means 1 unless in a :obj:`joblib.parallel_backend`\n context. ``-1`` means using all processors. See :term:`Glossary\n ` for more details.\n\n random_state : int, RandomState instance or None, default=None\n Controls 3 sources of randomness:\n\n - the bootstrapping of the samples used when building trees\n (if ``bootstrap=True``)\n - the sampling of the features to consider when looking for the best\n split at each node (if ``max_features < n_features``)\n - the draw of the splits for each of the `max_features`\n\n See :term:`Glossary ` for details.\n\n verbose : int, default=0\n Controls the verbosity when fitting and predicting.\n\n warm_start : bool, default=False\n When set to ``True``, reuse the solution of the previous call to fit\n and add more estimators to the ensemble, otherwise, just fit a whole\n new forest. See :term:`the Glossary `.\n\n ccp_alpha : non-negative float, default=0.0\n Complexity parameter used for Minimal Cost-Complexity Pruning. The\n subtree with the largest cost complexity that is smaller than\n ``ccp_alpha`` will be chosen. By default, no pruning is performed. See\n :ref:`minimal_cost_complexity_pruning` for details.\n\n .. 
versionadded:: 0.22\n\n max_samples : int or float, default=None\n If bootstrap is True, the number of samples to draw from X\n to train each base estimator.\n\n - If None (default), then draw `X.shape[0]` samples.\n - If int, then draw `max_samples` samples.\n - If float, then draw `max_samples * X.shape[0]` samples. Thus,\n `max_samples` should be in the interval `(0.0, 1.0]`.\n\n .. versionadded:: 0.22\n\n Attributes\n ----------\n base_estimator_ : ExtraTreeRegressor\n The child estimator template used to create the collection of fitted\n sub-estimators.\n\n estimators_ : list of DecisionTreeRegressor\n The collection of fitted sub-estimators.\n\n feature_importances_ : ndarray of shape (n_features,)\n The impurity-based feature importances.\n The higher, the more important the feature.\n The importance of a feature is computed as the (normalized)\n total reduction of the criterion brought by that feature. It is also\n known as the Gini importance.\n\n Warning: impurity-based feature importances can be misleading for\n high cardinality features (many unique values). See\n :func:`sklearn.inspection.permutation_importance` as an alternative.\n\n n_features_ : int\n The number of features.\n\n .. deprecated:: 1.0\n Attribute `n_features_` was deprecated in version 1.0 and will be\n removed in 1.2. Use `n_features_in_` instead.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n .. versionadded:: 1.0\n\n n_outputs_ : int\n The number of outputs.\n\n oob_score_ : float\n Score of the training dataset obtained using an out-of-bag estimate.\n This attribute exists only when ``oob_score`` is True.\n\n oob_prediction_ : ndarray of shape (n_samples,) or (n_samples, n_outputs)\n Prediction computed with out-of-bag estimate on the training set.\n This attribute exists only when ``oob_score`` is True.\n\n See Also\n --------\n ExtraTreesClassifier : An extra-trees classifier with random splits.\n RandomForestClassifier : A random forest classifier with optimal splits.\n RandomForestRegressor : Ensemble regressor using trees with optimal splits.\n\n Notes\n -----\n The default values for the parameters controlling the size of the trees\n (e.g. ``max_depth``, ``min_samples_leaf``, etc.) lead to fully grown and\n unpruned trees which can potentially be very large on some data sets. To\n reduce memory consumption, the complexity and size of the trees should be\n controlled by setting those parameter values.\n\n References\n ----------\n .. [1] P. Geurts, D. Ernst., and L. Wehenkel, \"Extremely randomized trees\",\n Machine Learning, 63(1), 3-42, 2006.\n\n Examples\n --------\n >>> from sklearn.datasets import load_diabetes\n >>> from sklearn.model_selection import train_test_split\n >>> from sklearn.ensemble import ExtraTreesRegressor\n >>> X, y = load_diabetes(return_X_y=True)\n >>> X_train, X_test, y_train, y_test = train_test_split(\n ... X, y, random_state=0)\n >>> reg = ExtraTreesRegressor(n_estimators=100, random_state=0).fit(\n ... X_train, y_train)\n >>> reg.score(X_test, y_test)\n 0.2708...\n ", - "source_code": "\n\nclass ExtraTreesRegressor(ForestRegressor):\n \"\"\"\n An extra-trees regressor.\n\n This class implements a meta estimator that fits a number of\n randomized decision trees (a.k.a. 
extra-trees) on various sub-samples\n of the dataset and uses averaging to improve the predictive accuracy\n and control over-fitting.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n n_estimators : int, default=100\n The number of trees in the forest.\n\n .. versionchanged:: 0.22\n The default value of ``n_estimators`` changed from 10 to 100\n in 0.22.\n\n criterion : {\"squared_error\", \"absolute_error\"}, default=\"squared_error\"\n The function to measure the quality of a split. Supported criteria\n are \"squared_error\" for the mean squared error, which is equal to\n variance reduction as feature selection criterion, and \"absolute_error\"\n for the mean absolute error.\n\n .. versionadded:: 0.18\n Mean Absolute Error (MAE) criterion.\n\n .. deprecated:: 1.0\n Criterion \"mse\" was deprecated in v1.0 and will be removed in\n version 1.2. Use `criterion=\"squared_error\"` which is equivalent.\n\n .. deprecated:: 1.0\n Criterion \"mae\" was deprecated in v1.0 and will be removed in\n version 1.2. Use `criterion=\"absolute_error\"` which is equivalent.\n\n max_depth : int, default=None\n The maximum depth of the tree. If None, then nodes are expanded until\n all leaves are pure or until all leaves contain less than\n min_samples_split samples.\n\n min_samples_split : int or float, default=2\n The minimum number of samples required to split an internal node:\n\n - If int, then consider `min_samples_split` as the minimum number.\n - If float, then `min_samples_split` is a fraction and\n `ceil(min_samples_split * n_samples)` are the minimum\n number of samples for each split.\n\n .. versionchanged:: 0.18\n Added float values for fractions.\n\n min_samples_leaf : int or float, default=1\n The minimum number of samples required to be at a leaf node.\n A split point at any depth will only be considered if it leaves at\n least ``min_samples_leaf`` training samples in each of the left and\n right branches. This may have the effect of smoothing the model,\n especially in regression.\n\n - If int, then consider `min_samples_leaf` as the minimum number.\n - If float, then `min_samples_leaf` is a fraction and\n `ceil(min_samples_leaf * n_samples)` are the minimum\n number of samples for each node.\n\n .. versionchanged:: 0.18\n Added float values for fractions.\n\n min_weight_fraction_leaf : float, default=0.0\n The minimum weighted fraction of the sum total of weights (of all\n the input samples) required to be at a leaf node. 
Samples have\n equal weight when sample_weight is not provided.\n\n max_features : {\"auto\", \"sqrt\", \"log2\"}, int or float, default=\"auto\"\n The number of features to consider when looking for the best split:\n\n - If int, then consider `max_features` features at each split.\n - If float, then `max_features` is a fraction and\n `round(max_features * n_features)` features are considered at each\n split.\n - If \"auto\", then `max_features=n_features`.\n - If \"sqrt\", then `max_features=sqrt(n_features)`.\n - If \"log2\", then `max_features=log2(n_features)`.\n - If None, then `max_features=n_features`.\n\n Note: the search for a split does not stop until at least one\n valid partition of the node samples is found, even if it requires to\n effectively inspect more than ``max_features`` features.\n\n max_leaf_nodes : int, default=None\n Grow trees with ``max_leaf_nodes`` in best-first fashion.\n Best nodes are defined as relative reduction in impurity.\n If None then unlimited number of leaf nodes.\n\n min_impurity_decrease : float, default=0.0\n A node will be split if this split induces a decrease of the impurity\n greater than or equal to this value.\n\n The weighted impurity decrease equation is the following::\n\n N_t / N * (impurity - N_t_R / N_t * right_impurity\n - N_t_L / N_t * left_impurity)\n\n where ``N`` is the total number of samples, ``N_t`` is the number of\n samples at the current node, ``N_t_L`` is the number of samples in the\n left child, and ``N_t_R`` is the number of samples in the right child.\n\n ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum,\n if ``sample_weight`` is passed.\n\n .. versionadded:: 0.19\n\n bootstrap : bool, default=False\n Whether bootstrap samples are used when building trees. If False, the\n whole dataset is used to build each tree.\n\n oob_score : bool, default=False\n Whether to use out-of-bag samples to estimate the generalization score.\n Only available if bootstrap=True.\n\n n_jobs : int, default=None\n The number of jobs to run in parallel. :meth:`fit`, :meth:`predict`,\n :meth:`decision_path` and :meth:`apply` are all parallelized over the\n trees. ``None`` means 1 unless in a :obj:`joblib.parallel_backend`\n context. ``-1`` means using all processors. See :term:`Glossary\n ` for more details.\n\n random_state : int, RandomState instance or None, default=None\n Controls 3 sources of randomness:\n\n - the bootstrapping of the samples used when building trees\n (if ``bootstrap=True``)\n - the sampling of the features to consider when looking for the best\n split at each node (if ``max_features < n_features``)\n - the draw of the splits for each of the `max_features`\n\n See :term:`Glossary ` for details.\n\n verbose : int, default=0\n Controls the verbosity when fitting and predicting.\n\n warm_start : bool, default=False\n When set to ``True``, reuse the solution of the previous call to fit\n and add more estimators to the ensemble, otherwise, just fit a whole\n new forest. See :term:`the Glossary `.\n\n ccp_alpha : non-negative float, default=0.0\n Complexity parameter used for Minimal Cost-Complexity Pruning. The\n subtree with the largest cost complexity that is smaller than\n ``ccp_alpha`` will be chosen. By default, no pruning is performed. See\n :ref:`minimal_cost_complexity_pruning` for details.\n\n .. 
versionadded:: 0.22\n\n max_samples : int or float, default=None\n If bootstrap is True, the number of samples to draw from X\n to train each base estimator.\n\n - If None (default), then draw `X.shape[0]` samples.\n - If int, then draw `max_samples` samples.\n - If float, then draw `max_samples * X.shape[0]` samples. Thus,\n `max_samples` should be in the interval `(0.0, 1.0]`.\n\n .. versionadded:: 0.22\n\n Attributes\n ----------\n base_estimator_ : ExtraTreeRegressor\n The child estimator template used to create the collection of fitted\n sub-estimators.\n\n estimators_ : list of DecisionTreeRegressor\n The collection of fitted sub-estimators.\n\n feature_importances_ : ndarray of shape (n_features,)\n The impurity-based feature importances.\n The higher, the more important the feature.\n The importance of a feature is computed as the (normalized)\n total reduction of the criterion brought by that feature. It is also\n known as the Gini importance.\n\n Warning: impurity-based feature importances can be misleading for\n high cardinality features (many unique values). See\n :func:`sklearn.inspection.permutation_importance` as an alternative.\n\n n_features_ : int\n The number of features.\n\n .. deprecated:: 1.0\n Attribute `n_features_` was deprecated in version 1.0 and will be\n removed in 1.2. Use `n_features_in_` instead.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n .. versionadded:: 1.0\n\n n_outputs_ : int\n The number of outputs.\n\n oob_score_ : float\n Score of the training dataset obtained using an out-of-bag estimate.\n This attribute exists only when ``oob_score`` is True.\n\n oob_prediction_ : ndarray of shape (n_samples,) or (n_samples, n_outputs)\n Prediction computed with out-of-bag estimate on the training set.\n This attribute exists only when ``oob_score`` is True.\n\n See Also\n --------\n ExtraTreesClassifier : An extra-trees classifier with random splits.\n RandomForestClassifier : A random forest classifier with optimal splits.\n RandomForestRegressor : Ensemble regressor using trees with optimal splits.\n\n Notes\n -----\n The default values for the parameters controlling the size of the trees\n (e.g. ``max_depth``, ``min_samples_leaf``, etc.) lead to fully grown and\n unpruned trees which can potentially be very large on some data sets. To\n reduce memory consumption, the complexity and size of the trees should be\n controlled by setting those parameter values.\n\n References\n ----------\n .. [1] P. Geurts, D. Ernst., and L. Wehenkel, \"Extremely randomized trees\",\n Machine Learning, 63(1), 3-42, 2006.\n\n Examples\n --------\n >>> from sklearn.datasets import load_diabetes\n >>> from sklearn.model_selection import train_test_split\n >>> from sklearn.ensemble import ExtraTreesRegressor\n >>> X, y = load_diabetes(return_X_y=True)\n >>> X_train, X_test, y_train, y_test = train_test_split(\n ... X, y, random_state=0)\n >>> reg = ExtraTreesRegressor(n_estimators=100, random_state=0).fit(\n ... 
X_train, y_train)\n >>> reg.score(X_test, y_test)\n 0.2708...\n \"\"\"\n \n def __init__(self, n_estimators=100, *, criterion='squared_error', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0, bootstrap=False, oob_score=False, n_jobs=None, random_state=None, verbose=0, warm_start=False, ccp_alpha=0.0, max_samples=None):\n super().__init__(base_estimator=ExtraTreeRegressor(), n_estimators=n_estimators, estimator_params=('criterion', 'max_depth', 'min_samples_split', 'min_samples_leaf', 'min_weight_fraction_leaf', 'max_features', 'max_leaf_nodes', 'min_impurity_decrease', 'random_state', 'ccp_alpha'), bootstrap=bootstrap, oob_score=oob_score, n_jobs=n_jobs, random_state=random_state, verbose=verbose, warm_start=warm_start, max_samples=max_samples)\n self.criterion = criterion\n self.max_depth = max_depth\n self.min_samples_split = min_samples_split\n self.min_samples_leaf = min_samples_leaf\n self.min_weight_fraction_leaf = min_weight_fraction_leaf\n self.max_features = max_features\n self.max_leaf_nodes = max_leaf_nodes\n self.min_impurity_decrease = min_impurity_decrease\n self.ccp_alpha = ccp_alpha\n" + "description": "An extra-trees regressor.\n\nThis class implements a meta estimator that fits a number of\nrandomized decision trees (a.k.a. extra-trees) on various sub-samples\nof the dataset and uses averaging to improve the predictive accuracy\nand control over-fitting.\n\nRead more in the :ref:`User Guide `.", + "docstring": "\n An extra-trees regressor.\n\n This class implements a meta estimator that fits a number of\n randomized decision trees (a.k.a. extra-trees) on various sub-samples\n of the dataset and uses averaging to improve the predictive accuracy\n and control over-fitting.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n n_estimators : int, default=100\n The number of trees in the forest.\n\n .. versionchanged:: 0.22\n The default value of ``n_estimators`` changed from 10 to 100\n in 0.22.\n\n criterion : {\"squared_error\", \"absolute_error\"}, default=\"squared_error\"\n The function to measure the quality of a split. Supported criteria\n are \"squared_error\" for the mean squared error, which is equal to\n variance reduction as feature selection criterion, and \"absolute_error\"\n for the mean absolute error.\n\n .. versionadded:: 0.18\n Mean Absolute Error (MAE) criterion.\n\n .. deprecated:: 1.0\n Criterion \"mse\" was deprecated in v1.0 and will be removed in\n version 1.2. Use `criterion=\"squared_error\"` which is equivalent.\n\n .. deprecated:: 1.0\n Criterion \"mae\" was deprecated in v1.0 and will be removed in\n version 1.2. Use `criterion=\"absolute_error\"` which is equivalent.\n\n max_depth : int, default=None\n The maximum depth of the tree. If None, then nodes are expanded until\n all leaves are pure or until all leaves contain less than\n min_samples_split samples.\n\n min_samples_split : int or float, default=2\n The minimum number of samples required to split an internal node:\n\n - If int, then consider `min_samples_split` as the minimum number.\n - If float, then `min_samples_split` is a fraction and\n `ceil(min_samples_split * n_samples)` are the minimum\n number of samples for each split.\n\n .. 
versionchanged:: 0.18\n Added float values for fractions.\n\n min_samples_leaf : int or float, default=1\n The minimum number of samples required to be at a leaf node.\n A split point at any depth will only be considered if it leaves at\n least ``min_samples_leaf`` training samples in each of the left and\n right branches. This may have the effect of smoothing the model,\n especially in regression.\n\n - If int, then consider `min_samples_leaf` as the minimum number.\n - If float, then `min_samples_leaf` is a fraction and\n `ceil(min_samples_leaf * n_samples)` are the minimum\n number of samples for each node.\n\n .. versionchanged:: 0.18\n Added float values for fractions.\n\n min_weight_fraction_leaf : float, default=0.0\n The minimum weighted fraction of the sum total of weights (of all\n the input samples) required to be at a leaf node. Samples have\n equal weight when sample_weight is not provided.\n\n max_features : {\"auto\", \"sqrt\", \"log2\"}, int or float, default=\"auto\"\n The number of features to consider when looking for the best split:\n\n - If int, then consider `max_features` features at each split.\n - If float, then `max_features` is a fraction and\n `round(max_features * n_features)` features are considered at each\n split.\n - If \"auto\", then `max_features=n_features`.\n - If \"sqrt\", then `max_features=sqrt(n_features)`.\n - If \"log2\", then `max_features=log2(n_features)`.\n - If None, then `max_features=n_features`.\n\n Note: the search for a split does not stop until at least one\n valid partition of the node samples is found, even if it requires to\n effectively inspect more than ``max_features`` features.\n\n max_leaf_nodes : int, default=None\n Grow trees with ``max_leaf_nodes`` in best-first fashion.\n Best nodes are defined as relative reduction in impurity.\n If None then unlimited number of leaf nodes.\n\n min_impurity_decrease : float, default=0.0\n A node will be split if this split induces a decrease of the impurity\n greater than or equal to this value.\n\n The weighted impurity decrease equation is the following::\n\n N_t / N * (impurity - N_t_R / N_t * right_impurity\n - N_t_L / N_t * left_impurity)\n\n where ``N`` is the total number of samples, ``N_t`` is the number of\n samples at the current node, ``N_t_L`` is the number of samples in the\n left child, and ``N_t_R`` is the number of samples in the right child.\n\n ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum,\n if ``sample_weight`` is passed.\n\n .. versionadded:: 0.19\n\n bootstrap : bool, default=False\n Whether bootstrap samples are used when building trees. If False, the\n whole dataset is used to build each tree.\n\n oob_score : bool, default=False\n Whether to use out-of-bag samples to estimate the generalization score.\n Only available if bootstrap=True.\n\n n_jobs : int, default=None\n The number of jobs to run in parallel. :meth:`fit`, :meth:`predict`,\n :meth:`decision_path` and :meth:`apply` are all parallelized over the\n trees. ``None`` means 1 unless in a :obj:`joblib.parallel_backend`\n context. ``-1`` means using all processors. 
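The ``min_impurity_decrease`` criterion quoted above can be reproduced in a few lines; the values below are made up purely to show how the weighted formula is evaluated::

    # Illustrative values, not taken from a real tree.
    N = 100.0                      # weighted number of samples at the root
    N_t = 40.0                     # weighted samples at the node being split
    N_t_L, N_t_R = 25.0, 15.0      # weighted samples in the left/right children
    impurity, left_impurity, right_impurity = 0.50, 0.30, 0.20

    decrease = N_t / N * (impurity
                          - N_t_R / N_t * right_impurity
                          - N_t_L / N_t * left_impurity)
    print(decrease)  # 0.095 -> the node is split only if this is >= min_impurity_decrease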
See :term:`Glossary\n ` for more details.\n\n random_state : int, RandomState instance or None, default=None\n Controls 3 sources of randomness:\n\n - the bootstrapping of the samples used when building trees\n (if ``bootstrap=True``)\n - the sampling of the features to consider when looking for the best\n split at each node (if ``max_features < n_features``)\n - the draw of the splits for each of the `max_features`\n\n See :term:`Glossary ` for details.\n\n verbose : int, default=0\n Controls the verbosity when fitting and predicting.\n\n warm_start : bool, default=False\n When set to ``True``, reuse the solution of the previous call to fit\n and add more estimators to the ensemble, otherwise, just fit a whole\n new forest. See :term:`the Glossary `.\n\n ccp_alpha : non-negative float, default=0.0\n Complexity parameter used for Minimal Cost-Complexity Pruning. The\n subtree with the largest cost complexity that is smaller than\n ``ccp_alpha`` will be chosen. By default, no pruning is performed. See\n :ref:`minimal_cost_complexity_pruning` for details.\n\n .. versionadded:: 0.22\n\n max_samples : int or float, default=None\n If bootstrap is True, the number of samples to draw from X\n to train each base estimator.\n\n - If None (default), then draw `X.shape[0]` samples.\n - If int, then draw `max_samples` samples.\n - If float, then draw `max_samples * X.shape[0]` samples. Thus,\n `max_samples` should be in the interval `(0.0, 1.0]`.\n\n .. versionadded:: 0.22\n\n Attributes\n ----------\n base_estimator_ : ExtraTreeRegressor\n The child estimator template used to create the collection of fitted\n sub-estimators.\n\n estimators_ : list of DecisionTreeRegressor\n The collection of fitted sub-estimators.\n\n feature_importances_ : ndarray of shape (n_features,)\n The impurity-based feature importances.\n The higher, the more important the feature.\n The importance of a feature is computed as the (normalized)\n total reduction of the criterion brought by that feature. It is also\n known as the Gini importance.\n\n Warning: impurity-based feature importances can be misleading for\n high cardinality features (many unique values). See\n :func:`sklearn.inspection.permutation_importance` as an alternative.\n\n n_features_ : int\n The number of features.\n\n .. deprecated:: 1.0\n Attribute `n_features_` was deprecated in version 1.0 and will be\n removed in 1.2. Use `n_features_in_` instead.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n n_outputs_ : int\n The number of outputs.\n\n oob_score_ : float\n Score of the training dataset obtained using an out-of-bag estimate.\n This attribute exists only when ``oob_score`` is True.\n\n oob_prediction_ : ndarray of shape (n_samples,) or (n_samples, n_outputs)\n Prediction computed with out-of-bag estimate on the training set.\n This attribute exists only when ``oob_score`` is True.\n\n See Also\n --------\n ExtraTreesClassifier : An extra-trees classifier with random splits.\n RandomForestClassifier : A random forest classifier with optimal splits.\n RandomForestRegressor : Ensemble regressor using trees with optimal splits.\n\n Notes\n -----\n The default values for the parameters controlling the size of the trees\n (e.g. ``max_depth``, ``min_samples_leaf``, etc.) 
lead to fully grown and\n unpruned trees which can potentially be very large on some data sets. To\n reduce memory consumption, the complexity and size of the trees should be\n controlled by setting those parameter values.\n\n References\n ----------\n .. [1] P. Geurts, D. Ernst., and L. Wehenkel, \"Extremely randomized trees\",\n Machine Learning, 63(1), 3-42, 2006.\n\n Examples\n --------\n >>> from sklearn.datasets import load_diabetes\n >>> from sklearn.model_selection import train_test_split\n >>> from sklearn.ensemble import ExtraTreesRegressor\n >>> X, y = load_diabetes(return_X_y=True)\n >>> X_train, X_test, y_train, y_test = train_test_split(\n ... X, y, random_state=0)\n >>> reg = ExtraTreesRegressor(n_estimators=100, random_state=0).fit(\n ... X_train, y_train)\n >>> reg.score(X_test, y_test)\n 0.2708...\n ", + "source_code": "\n\nclass ExtraTreesRegressor(ForestRegressor):\n \"\"\"\n An extra-trees regressor.\n\n This class implements a meta estimator that fits a number of\n randomized decision trees (a.k.a. extra-trees) on various sub-samples\n of the dataset and uses averaging to improve the predictive accuracy\n and control over-fitting.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n n_estimators : int, default=100\n The number of trees in the forest.\n\n .. versionchanged:: 0.22\n The default value of ``n_estimators`` changed from 10 to 100\n in 0.22.\n\n criterion : {\"squared_error\", \"absolute_error\"}, default=\"squared_error\"\n The function to measure the quality of a split. Supported criteria\n are \"squared_error\" for the mean squared error, which is equal to\n variance reduction as feature selection criterion, and \"absolute_error\"\n for the mean absolute error.\n\n .. versionadded:: 0.18\n Mean Absolute Error (MAE) criterion.\n\n .. deprecated:: 1.0\n Criterion \"mse\" was deprecated in v1.0 and will be removed in\n version 1.2. Use `criterion=\"squared_error\"` which is equivalent.\n\n .. deprecated:: 1.0\n Criterion \"mae\" was deprecated in v1.0 and will be removed in\n version 1.2. Use `criterion=\"absolute_error\"` which is equivalent.\n\n max_depth : int, default=None\n The maximum depth of the tree. If None, then nodes are expanded until\n all leaves are pure or until all leaves contain less than\n min_samples_split samples.\n\n min_samples_split : int or float, default=2\n The minimum number of samples required to split an internal node:\n\n - If int, then consider `min_samples_split` as the minimum number.\n - If float, then `min_samples_split` is a fraction and\n `ceil(min_samples_split * n_samples)` are the minimum\n number of samples for each split.\n\n .. versionchanged:: 0.18\n Added float values for fractions.\n\n min_samples_leaf : int or float, default=1\n The minimum number of samples required to be at a leaf node.\n A split point at any depth will only be considered if it leaves at\n least ``min_samples_leaf`` training samples in each of the left and\n right branches. This may have the effect of smoothing the model,\n especially in regression.\n\n - If int, then consider `min_samples_leaf` as the minimum number.\n - If float, then `min_samples_leaf` is a fraction and\n `ceil(min_samples_leaf * n_samples)` are the minimum\n number of samples for each node.\n\n .. versionchanged:: 0.18\n Added float values for fractions.\n\n min_weight_fraction_leaf : float, default=0.0\n The minimum weighted fraction of the sum total of weights (of all\n the input samples) required to be at a leaf node. 
Samples have\n equal weight when sample_weight is not provided.\n\n max_features : {\"auto\", \"sqrt\", \"log2\"}, int or float, default=\"auto\"\n The number of features to consider when looking for the best split:\n\n - If int, then consider `max_features` features at each split.\n - If float, then `max_features` is a fraction and\n `round(max_features * n_features)` features are considered at each\n split.\n - If \"auto\", then `max_features=n_features`.\n - If \"sqrt\", then `max_features=sqrt(n_features)`.\n - If \"log2\", then `max_features=log2(n_features)`.\n - If None, then `max_features=n_features`.\n\n Note: the search for a split does not stop until at least one\n valid partition of the node samples is found, even if it requires to\n effectively inspect more than ``max_features`` features.\n\n max_leaf_nodes : int, default=None\n Grow trees with ``max_leaf_nodes`` in best-first fashion.\n Best nodes are defined as relative reduction in impurity.\n If None then unlimited number of leaf nodes.\n\n min_impurity_decrease : float, default=0.0\n A node will be split if this split induces a decrease of the impurity\n greater than or equal to this value.\n\n The weighted impurity decrease equation is the following::\n\n N_t / N * (impurity - N_t_R / N_t * right_impurity\n - N_t_L / N_t * left_impurity)\n\n where ``N`` is the total number of samples, ``N_t`` is the number of\n samples at the current node, ``N_t_L`` is the number of samples in the\n left child, and ``N_t_R`` is the number of samples in the right child.\n\n ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum,\n if ``sample_weight`` is passed.\n\n .. versionadded:: 0.19\n\n bootstrap : bool, default=False\n Whether bootstrap samples are used when building trees. If False, the\n whole dataset is used to build each tree.\n\n oob_score : bool, default=False\n Whether to use out-of-bag samples to estimate the generalization score.\n Only available if bootstrap=True.\n\n n_jobs : int, default=None\n The number of jobs to run in parallel. :meth:`fit`, :meth:`predict`,\n :meth:`decision_path` and :meth:`apply` are all parallelized over the\n trees. ``None`` means 1 unless in a :obj:`joblib.parallel_backend`\n context. ``-1`` means using all processors. See :term:`Glossary\n ` for more details.\n\n random_state : int, RandomState instance or None, default=None\n Controls 3 sources of randomness:\n\n - the bootstrapping of the samples used when building trees\n (if ``bootstrap=True``)\n - the sampling of the features to consider when looking for the best\n split at each node (if ``max_features < n_features``)\n - the draw of the splits for each of the `max_features`\n\n See :term:`Glossary ` for details.\n\n verbose : int, default=0\n Controls the verbosity when fitting and predicting.\n\n warm_start : bool, default=False\n When set to ``True``, reuse the solution of the previous call to fit\n and add more estimators to the ensemble, otherwise, just fit a whole\n new forest. See :term:`the Glossary `.\n\n ccp_alpha : non-negative float, default=0.0\n Complexity parameter used for Minimal Cost-Complexity Pruning. The\n subtree with the largest cost complexity that is smaller than\n ``ccp_alpha`` will be chosen. By default, no pruning is performed. See\n :ref:`minimal_cost_complexity_pruning` for details.\n\n .. 
versionadded:: 0.22\n\n max_samples : int or float, default=None\n If bootstrap is True, the number of samples to draw from X\n to train each base estimator.\n\n - If None (default), then draw `X.shape[0]` samples.\n - If int, then draw `max_samples` samples.\n - If float, then draw `max_samples * X.shape[0]` samples. Thus,\n `max_samples` should be in the interval `(0.0, 1.0]`.\n\n .. versionadded:: 0.22\n\n Attributes\n ----------\n base_estimator_ : ExtraTreeRegressor\n The child estimator template used to create the collection of fitted\n sub-estimators.\n\n estimators_ : list of DecisionTreeRegressor\n The collection of fitted sub-estimators.\n\n feature_importances_ : ndarray of shape (n_features,)\n The impurity-based feature importances.\n The higher, the more important the feature.\n The importance of a feature is computed as the (normalized)\n total reduction of the criterion brought by that feature. It is also\n known as the Gini importance.\n\n Warning: impurity-based feature importances can be misleading for\n high cardinality features (many unique values). See\n :func:`sklearn.inspection.permutation_importance` as an alternative.\n\n n_features_ : int\n The number of features.\n\n .. deprecated:: 1.0\n Attribute `n_features_` was deprecated in version 1.0 and will be\n removed in 1.2. Use `n_features_in_` instead.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n n_outputs_ : int\n The number of outputs.\n\n oob_score_ : float\n Score of the training dataset obtained using an out-of-bag estimate.\n This attribute exists only when ``oob_score`` is True.\n\n oob_prediction_ : ndarray of shape (n_samples,) or (n_samples, n_outputs)\n Prediction computed with out-of-bag estimate on the training set.\n This attribute exists only when ``oob_score`` is True.\n\n See Also\n --------\n ExtraTreesClassifier : An extra-trees classifier with random splits.\n RandomForestClassifier : A random forest classifier with optimal splits.\n RandomForestRegressor : Ensemble regressor using trees with optimal splits.\n\n Notes\n -----\n The default values for the parameters controlling the size of the trees\n (e.g. ``max_depth``, ``min_samples_leaf``, etc.) lead to fully grown and\n unpruned trees which can potentially be very large on some data sets. To\n reduce memory consumption, the complexity and size of the trees should be\n controlled by setting those parameter values.\n\n References\n ----------\n .. [1] P. Geurts, D. Ernst., and L. Wehenkel, \"Extremely randomized trees\",\n Machine Learning, 63(1), 3-42, 2006.\n\n Examples\n --------\n >>> from sklearn.datasets import load_diabetes\n >>> from sklearn.model_selection import train_test_split\n >>> from sklearn.ensemble import ExtraTreesRegressor\n >>> X, y = load_diabetes(return_X_y=True)\n >>> X_train, X_test, y_train, y_test = train_test_split(\n ... X, y, random_state=0)\n >>> reg = ExtraTreesRegressor(n_estimators=100, random_state=0).fit(\n ... 
X_train, y_train)\n >>> reg.score(X_test, y_test)\n 0.2708...\n \"\"\"\n \n def __init__(self, n_estimators=100, *, criterion='squared_error', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0, bootstrap=False, oob_score=False, n_jobs=None, random_state=None, verbose=0, warm_start=False, ccp_alpha=0.0, max_samples=None):\n super().__init__(base_estimator=ExtraTreeRegressor(), n_estimators=n_estimators, estimator_params=('criterion', 'max_depth', 'min_samples_split', 'min_samples_leaf', 'min_weight_fraction_leaf', 'max_features', 'max_leaf_nodes', 'min_impurity_decrease', 'random_state', 'ccp_alpha'), bootstrap=bootstrap, oob_score=oob_score, n_jobs=n_jobs, random_state=random_state, verbose=verbose, warm_start=warm_start, max_samples=max_samples)\n self.criterion = criterion\n self.max_depth = max_depth\n self.min_samples_split = min_samples_split\n self.min_samples_leaf = min_samples_leaf\n self.min_weight_fraction_leaf = min_weight_fraction_leaf\n self.max_features = max_features\n self.max_leaf_nodes = max_leaf_nodes\n self.min_impurity_decrease = min_impurity_decrease\n self.ccp_alpha = ccp_alpha\n" }, { "name": "ForestClassifier", @@ -20941,7 +21007,7 @@ "sklearn.ensemble._forest.ForestClassifier._more_tags" ], "is_public": false, - "description": "Base class for forest of trees-based classifiers.\n\nWarning: This class should not be used directly. Use derived classes instead.", + "description": "Base class for forest of trees-based classifiers.\n\nWarning: This class should not be used directly. Use derived classes\ninstead.", "docstring": "\n Base class for forest of trees-based classifiers.\n\n Warning: This class should not be used directly. Use derived classes\n instead.\n ", "source_code": "\n\nclass ForestClassifier(ClassifierMixin, BaseForest, metaclass=ABCMeta):\n \"\"\"\n Base class for forest of trees-based classifiers.\n\n Warning: This class should not be used directly. 
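Tying together several of the ``ExtraTreesRegressor`` parameters listed in the entry above (``bootstrap``, ``oob_score`` and ``max_samples``), a hedged usage sketch; the dataset and the resulting out-of-bag score are illustrative only::

    from sklearn.datasets import load_diabetes
    from sklearn.ensemble import ExtraTreesRegressor

    X, y = load_diabetes(return_X_y=True)

    # max_samples is only honoured when bootstrap=True; as a float it is a
    # fraction of X.shape[0], so 0.8 draws roughly 354 of the 442 samples per tree.
    reg = ExtraTreesRegressor(
        n_estimators=100,
        bootstrap=True,   # the extra-trees default is False
        oob_score=True,   # requires bootstrap=True
        max_samples=0.8,
        random_state=0,
    ).fit(X, y)

    print(reg.oob_score_)             # R^2 estimated from out-of-bag samples
    print(reg.oob_prediction_.shape)  # (442,)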
Use derived classes\n instead.\n \"\"\"\n \n @abstractmethod\n def __init__(self, base_estimator, n_estimators=100, *, estimator_params=tuple(), bootstrap=False, oob_score=False, n_jobs=None, random_state=None, verbose=0, warm_start=False, class_weight=None, max_samples=None):\n super().__init__(base_estimator, n_estimators=n_estimators, estimator_params=estimator_params, bootstrap=bootstrap, oob_score=oob_score, n_jobs=n_jobs, random_state=random_state, verbose=verbose, warm_start=warm_start, class_weight=class_weight, max_samples=max_samples)\n \n @staticmethod\n def _get_oob_predictions(tree, X):\n \"\"\"Compute the OOB predictions for an individual tree.\n\n Parameters\n ----------\n tree : DecisionTreeClassifier object\n A single decision tree classifier.\n X : ndarray of shape (n_samples, n_features)\n The OOB samples.\n\n Returns\n -------\n y_pred : ndarray of shape (n_samples, n_classes, n_outputs)\n The OOB associated predictions.\n \"\"\"\n y_pred = tree.predict_proba(X, check_input=False)\n y_pred = np.array(y_pred, copy=False)\n if y_pred.ndim == 2:\n y_pred = y_pred[..., np.newaxis]\n else:\n y_pred = np.rollaxis(y_pred, axis=0, start=3)\n return y_pred\n \n def _set_oob_score_and_attributes(self, X, y):\n \"\"\"Compute and set the OOB score and attributes.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The data matrix.\n y : ndarray of shape (n_samples, n_outputs)\n The target matrix.\n \"\"\"\n self.oob_decision_function_ = super()._compute_oob_predictions(X, y)\n if self.oob_decision_function_.shape[-1] == 1:\n self.oob_decision_function_ = self.oob_decision_function_.squeeze(axis=-1)\n self.oob_score_ = accuracy_score(y, np.argmax(self.oob_decision_function_, axis=1))\n \n def _validate_y_class_weight(self, y):\n check_classification_targets(y)\n y = np.copy(y)\n expanded_class_weight = None\n if self.class_weight is not None:\n y_original = np.copy(y)\n self.classes_ = []\n self.n_classes_ = []\n y_store_unique_indices = np.zeros(y.shape, dtype=int)\n for k in range(self.n_outputs_):\n (classes_k, y_store_unique_indices[:, k]) = np.unique(y[:, k], return_inverse=True)\n self.classes_.append(classes_k)\n self.n_classes_.append(classes_k.shape[0])\n y = y_store_unique_indices\n if self.class_weight is not None:\n valid_presets = ('balanced', 'balanced_subsample')\n if isinstance(self.class_weight, str):\n if self.class_weight not in valid_presets:\n raise ValueError('Valid presets for class_weight include \"balanced\" and \"balanced_subsample\".Given \"%s\".' % self.class_weight)\n if self.warm_start:\n warn('class_weight presets \"balanced\" or \"balanced_subsample\" are not recommended for warm_start if the fitted data differs from the full dataset. In order to use \"balanced\" weights, use compute_class_weight (\"balanced\", classes, y). In place of y you can use a large enough sample of the full training set target to properly estimate the class frequency distributions. Pass the resulting weights as the class_weight parameter.')\n if self.class_weight != 'balanced_subsample' or not self.bootstrap:\n if self.class_weight == 'balanced_subsample':\n class_weight = 'balanced'\n else:\n class_weight = self.class_weight\n expanded_class_weight = compute_sample_weight(class_weight, y_original)\n return y, expanded_class_weight\n \n def predict(self, X):\n \"\"\"\n Predict class for X.\n\n The predicted class of an input sample is a vote by the trees in\n the forest, weighted by their probability estimates. 
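The ``_validate_y_class_weight`` helper shown above ultimately delegates to ``compute_sample_weight``; a small sketch of what the "balanced" preset produces for an imbalanced toy label vector (the labels are made up for illustration)::

    import numpy as np
    from sklearn.utils.class_weight import compute_sample_weight

    y = np.array([0, 0, 0, 1])

    # "balanced" weights follow n_samples / (n_classes * np.bincount(y)):
    # class 0 -> 4 / (2 * 3) ~= 0.667, class 1 -> 4 / (2 * 1) = 2.0
    print(compute_sample_weight("balanced", y))  # approx. [0.667 0.667 0.667 2.0]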
That is,\n the predicted class is the one with highest mean probability\n estimate across the trees.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input samples. Internally, its dtype will be converted to\n ``dtype=np.float32``. If a sparse matrix is provided, it will be\n converted into a sparse ``csr_matrix``.\n\n Returns\n -------\n y : ndarray of shape (n_samples,) or (n_samples, n_outputs)\n The predicted classes.\n \"\"\"\n proba = self.predict_proba(X)\n if self.n_outputs_ == 1:\n return self.classes_.take(np.argmax(proba, axis=1), axis=0)\n else:\n n_samples = proba[0].shape[0]\n class_type = self.classes_[0].dtype\n predictions = np.empty((n_samples, self.n_outputs_), dtype=class_type)\n for k in range(self.n_outputs_):\n predictions[:, k] = self.classes_[k].take(np.argmax(proba[k], axis=1), axis=0)\n return predictions\n \n def predict_proba(self, X):\n \"\"\"\n Predict class probabilities for X.\n\n The predicted class probabilities of an input sample are computed as\n the mean predicted class probabilities of the trees in the forest.\n The class probability of a single tree is the fraction of samples of\n the same class in a leaf.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input samples. Internally, its dtype will be converted to\n ``dtype=np.float32``. If a sparse matrix is provided, it will be\n converted into a sparse ``csr_matrix``.\n\n Returns\n -------\n p : ndarray of shape (n_samples, n_classes), or a list of such arrays\n The class probabilities of the input samples. The order of the\n classes corresponds to that in the attribute :term:`classes_`.\n \"\"\"\n check_is_fitted(self)\n X = self._validate_X_predict(X)\n (n_jobs, _, _) = _partition_estimators(self.n_estimators, self.n_jobs)\n all_proba = [np.zeros((X.shape[0], j), dtype=np.float64) for j in np.atleast_1d(self.n_classes_)]\n lock = threading.Lock()\n Parallel(n_jobs=n_jobs, verbose=self.verbose, **_joblib_parallel_args(require='sharedmem'))((delayed(_accumulate_prediction)(e.predict_proba, X, all_proba, lock) for e in self.estimators_))\n for proba in all_proba:\n proba /= len(self.estimators_)\n if len(all_proba) == 1:\n return all_proba[0]\n else:\n return all_proba\n \n def predict_log_proba(self, X):\n \"\"\"\n Predict class log-probabilities for X.\n\n The predicted class log-probabilities of an input sample is computed as\n the log of the mean predicted class probabilities of the trees in the\n forest.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input samples. Internally, its dtype will be converted to\n ``dtype=np.float32``. If a sparse matrix is provided, it will be\n converted into a sparse ``csr_matrix``.\n\n Returns\n -------\n p : ndarray of shape (n_samples, n_classes), or a list of such arrays\n The class probabilities of the input samples. The order of the\n classes corresponds to that in the attribute :term:`classes_`.\n \"\"\"\n proba = self.predict_proba(X)\n if self.n_outputs_ == 1:\n return np.log(proba)\n else:\n for k in range(self.n_outputs_):\n proba[k] = np.log(proba[k])\n return proba\n \n def _more_tags(self):\n return {'multilabel': True}\n" }, @@ -20959,7 +21025,7 @@ "sklearn.ensemble._forest.ForestRegressor._more_tags" ], "is_public": false, - "description": "Base class for forest of trees-based regressors.\n\nWarning: This class should not be used directly. 
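As the ``predict_proba`` implementation above shows, the forest's probabilities are the plain average of the per-tree probabilities, and ``predict`` then takes the class with the highest mean probability. A small sketch verifying both relationships on synthetic data (``RandomForestClassifier`` is used here simply as a concrete derived class)::

    import numpy as np
    from sklearn.datasets import make_classification
    from sklearn.ensemble import RandomForestClassifier

    X, y = make_classification(n_samples=200, n_features=4, random_state=0)
    clf = RandomForestClassifier(n_estimators=25, random_state=0).fit(X, y)

    # Forest probabilities == mean of the individual trees' probabilities.
    per_tree = np.stack([tree.predict_proba(X) for tree in clf.estimators_])
    assert np.allclose(per_tree.mean(axis=0), clf.predict_proba(X))

    # predict() picks the class with the highest mean probability.
    expected = clf.classes_.take(np.argmax(clf.predict_proba(X), axis=1))
    assert np.array_equal(expected, clf.predict(X))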
Use derived classes instead.", + "description": "Base class for forest of trees-based regressors.\n\nWarning: This class should not be used directly. Use derived classes\ninstead.", "docstring": "\n Base class for forest of trees-based regressors.\n\n Warning: This class should not be used directly. Use derived classes\n instead.\n ", "source_code": "\n\nclass ForestRegressor(RegressorMixin, BaseForest, metaclass=ABCMeta):\n \"\"\"\n Base class for forest of trees-based regressors.\n\n Warning: This class should not be used directly. Use derived classes\n instead.\n \"\"\"\n \n @abstractmethod\n def __init__(self, base_estimator, n_estimators=100, *, estimator_params=tuple(), bootstrap=False, oob_score=False, n_jobs=None, random_state=None, verbose=0, warm_start=False, max_samples=None):\n super().__init__(base_estimator, n_estimators=n_estimators, estimator_params=estimator_params, bootstrap=bootstrap, oob_score=oob_score, n_jobs=n_jobs, random_state=random_state, verbose=verbose, warm_start=warm_start, max_samples=max_samples)\n \n def predict(self, X):\n \"\"\"\n Predict regression target for X.\n\n The predicted regression target of an input sample is computed as the\n mean predicted regression targets of the trees in the forest.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input samples. Internally, its dtype will be converted to\n ``dtype=np.float32``. If a sparse matrix is provided, it will be\n converted into a sparse ``csr_matrix``.\n\n Returns\n -------\n y : ndarray of shape (n_samples,) or (n_samples, n_outputs)\n The predicted values.\n \"\"\"\n check_is_fitted(self)\n X = self._validate_X_predict(X)\n (n_jobs, _, _) = _partition_estimators(self.n_estimators, self.n_jobs)\n if self.n_outputs_ > 1:\n y_hat = np.zeros((X.shape[0], self.n_outputs_), dtype=np.float64)\n else:\n y_hat = np.zeros(X.shape[0], dtype=np.float64)\n lock = threading.Lock()\n Parallel(n_jobs=n_jobs, verbose=self.verbose, **_joblib_parallel_args(require='sharedmem'))((delayed(_accumulate_prediction)(e.predict, X, [y_hat], lock) for e in self.estimators_))\n y_hat /= len(self.estimators_)\n return y_hat\n \n @staticmethod\n def _get_oob_predictions(tree, X):\n \"\"\"Compute the OOB predictions for an individual tree.\n\n Parameters\n ----------\n tree : DecisionTreeRegressor object\n A single decision tree regressor.\n X : ndarray of shape (n_samples, n_features)\n The OOB samples.\n\n Returns\n -------\n y_pred : ndarray of shape (n_samples, 1, n_outputs)\n The OOB associated predictions.\n \"\"\"\n y_pred = tree.predict(X, check_input=False)\n if y_pred.ndim == 1:\n y_pred = y_pred[:, np.newaxis, np.newaxis]\n else:\n y_pred = y_pred[:, np.newaxis, :]\n return y_pred\n \n def _set_oob_score_and_attributes(self, X, y):\n \"\"\"Compute and set the OOB score and attributes.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The data matrix.\n y : ndarray of shape (n_samples, n_outputs)\n The target matrix.\n \"\"\"\n self.oob_prediction_ = super()._compute_oob_predictions(X, y).squeeze(axis=1)\n if self.oob_prediction_.shape[-1] == 1:\n self.oob_prediction_ = self.oob_prediction_.squeeze(axis=-1)\n self.oob_score_ = r2_score(y, self.oob_prediction_)\n \n def _compute_partial_dependence_recursion(self, grid, target_features):\n \"\"\"Fast partial dependence computation.\n\n Parameters\n ----------\n grid : ndarray of shape (n_samples, n_target_features)\n The grid points on which the partial dependence should be\n 
evaluated.\n target_features : ndarray of shape (n_target_features)\n The set of target features for which the partial dependence\n should be evaluated.\n\n Returns\n -------\n averaged_predictions : ndarray of shape (n_samples,)\n The value of the partial dependence function on each grid point.\n \"\"\"\n grid = np.asarray(grid, dtype=DTYPE, order='C')\n averaged_predictions = np.zeros(shape=grid.shape[0], dtype=np.float64, order='C')\n for tree in self.estimators_:\n tree.tree_.compute_partial_dependence(grid, target_features, averaged_predictions)\n averaged_predictions /= len(self.estimators_)\n return averaged_predictions\n \n def _more_tags(self):\n return {'multilabel': True}\n" }, @@ -20972,9 +21038,9 @@ "sklearn.ensemble._forest.RandomForestClassifier.__init__" ], "is_public": true, - "description": "A random forest classifier.\n\nA random forest is a meta estimator that fits a number of decision tree classifiers on various sub-samples of the dataset and uses averaging to improve the predictive accuracy and control over-fitting. The sub-sample size is controlled with the `max_samples` parameter if `bootstrap=True` (default), otherwise the whole dataset is used to build each tree. Read more in the :ref:`User Guide `.", - "docstring": "\n A random forest classifier.\n\n A random forest is a meta estimator that fits a number of decision tree\n classifiers on various sub-samples of the dataset and uses averaging to\n improve the predictive accuracy and control over-fitting.\n The sub-sample size is controlled with the `max_samples` parameter if\n `bootstrap=True` (default), otherwise the whole dataset is used to build\n each tree.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n n_estimators : int, default=100\n The number of trees in the forest.\n\n .. versionchanged:: 0.22\n The default value of ``n_estimators`` changed from 10 to 100\n in 0.22.\n\n criterion : {\"gini\", \"entropy\"}, default=\"gini\"\n The function to measure the quality of a split. Supported criteria are\n \"gini\" for the Gini impurity and \"entropy\" for the information gain.\n Note: this parameter is tree-specific.\n\n max_depth : int, default=None\n The maximum depth of the tree. If None, then nodes are expanded until\n all leaves are pure or until all leaves contain less than\n min_samples_split samples.\n\n min_samples_split : int or float, default=2\n The minimum number of samples required to split an internal node:\n\n - If int, then consider `min_samples_split` as the minimum number.\n - If float, then `min_samples_split` is a fraction and\n `ceil(min_samples_split * n_samples)` are the minimum\n number of samples for each split.\n\n .. versionchanged:: 0.18\n Added float values for fractions.\n\n min_samples_leaf : int or float, default=1\n The minimum number of samples required to be at a leaf node.\n A split point at any depth will only be considered if it leaves at\n least ``min_samples_leaf`` training samples in each of the left and\n right branches. This may have the effect of smoothing the model,\n especially in regression.\n\n - If int, then consider `min_samples_leaf` as the minimum number.\n - If float, then `min_samples_leaf` is a fraction and\n `ceil(min_samples_leaf * n_samples)` are the minimum\n number of samples for each node.\n\n .. versionchanged:: 0.18\n Added float values for fractions.\n\n min_weight_fraction_leaf : float, default=0.0\n The minimum weighted fraction of the sum total of weights (of all\n the input samples) required to be at a leaf node. 
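The ``_compute_partial_dependence_recursion`` helper above is the hook that :func:`sklearn.inspection.partial_dependence` uses when ``method="recursion"`` is selected for a forest regressor. A hedged sketch of exercising it through the public API (the feature index and grid resolution are arbitrary choices)::

    from sklearn.datasets import load_diabetes
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.inspection import partial_dependence

    X, y = load_diabetes(return_X_y=True)
    reg = RandomForestRegressor(n_estimators=50, random_state=0).fit(X, y)

    # method="recursion" traverses the fitted trees directly instead of
    # calling predict() on modified copies of X for every grid point.
    result = partial_dependence(reg, X, features=[0],
                                grid_resolution=20, method="recursion")
    print(result["average"].shape)  # (n_outputs, n_grid_points)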
Samples have\n equal weight when sample_weight is not provided.\n\n max_features : {\"auto\", \"sqrt\", \"log2\"}, int or float, default=\"auto\"\n The number of features to consider when looking for the best split:\n\n - If int, then consider `max_features` features at each split.\n - If float, then `max_features` is a fraction and\n `round(max_features * n_features)` features are considered at each\n split.\n - If \"auto\", then `max_features=sqrt(n_features)`.\n - If \"sqrt\", then `max_features=sqrt(n_features)` (same as \"auto\").\n - If \"log2\", then `max_features=log2(n_features)`.\n - If None, then `max_features=n_features`.\n\n Note: the search for a split does not stop until at least one\n valid partition of the node samples is found, even if it requires to\n effectively inspect more than ``max_features`` features.\n\n max_leaf_nodes : int, default=None\n Grow trees with ``max_leaf_nodes`` in best-first fashion.\n Best nodes are defined as relative reduction in impurity.\n If None then unlimited number of leaf nodes.\n\n min_impurity_decrease : float, default=0.0\n A node will be split if this split induces a decrease of the impurity\n greater than or equal to this value.\n\n The weighted impurity decrease equation is the following::\n\n N_t / N * (impurity - N_t_R / N_t * right_impurity\n - N_t_L / N_t * left_impurity)\n\n where ``N`` is the total number of samples, ``N_t`` is the number of\n samples at the current node, ``N_t_L`` is the number of samples in the\n left child, and ``N_t_R`` is the number of samples in the right child.\n\n ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum,\n if ``sample_weight`` is passed.\n\n .. versionadded:: 0.19\n\n bootstrap : bool, default=True\n Whether bootstrap samples are used when building trees. If False, the\n whole dataset is used to build each tree.\n\n oob_score : bool, default=False\n Whether to use out-of-bag samples to estimate the generalization score.\n Only available if bootstrap=True.\n\n n_jobs : int, default=None\n The number of jobs to run in parallel. :meth:`fit`, :meth:`predict`,\n :meth:`decision_path` and :meth:`apply` are all parallelized over the\n trees. ``None`` means 1 unless in a :obj:`joblib.parallel_backend`\n context. ``-1`` means using all processors. See :term:`Glossary\n ` for more details.\n\n random_state : int, RandomState instance or None, default=None\n Controls both the randomness of the bootstrapping of the samples used\n when building trees (if ``bootstrap=True``) and the sampling of the\n features to consider when looking for the best split at each node\n (if ``max_features < n_features``).\n See :term:`Glossary ` for details.\n\n verbose : int, default=0\n Controls the verbosity when fitting and predicting.\n\n warm_start : bool, default=False\n When set to ``True``, reuse the solution of the previous call to fit\n and add more estimators to the ensemble, otherwise, just fit a whole\n new forest. See :term:`the Glossary `.\n\n class_weight : {\"balanced\", \"balanced_subsample\"}, dict or list of dicts, default=None\n Weights associated with classes in the form ``{class_label: weight}``.\n If not given, all classes are supposed to have weight one. For\n multi-output problems, a list of dicts can be provided in the same\n order as the columns of y.\n\n Note that for multioutput (including multilabel) weights should be\n defined for each class of every column in its own dict. 
For example,\n for four-class multilabel classification weights should be\n [{0: 1, 1: 1}, {0: 1, 1: 5}, {0: 1, 1: 1}, {0: 1, 1: 1}] instead of\n [{1:1}, {2:5}, {3:1}, {4:1}].\n\n The \"balanced\" mode uses the values of y to automatically adjust\n weights inversely proportional to class frequencies in the input data\n as ``n_samples / (n_classes * np.bincount(y))``\n\n The \"balanced_subsample\" mode is the same as \"balanced\" except that\n weights are computed based on the bootstrap sample for every tree\n grown.\n\n For multi-output, the weights of each column of y will be multiplied.\n\n Note that these weights will be multiplied with sample_weight (passed\n through the fit method) if sample_weight is specified.\n\n ccp_alpha : non-negative float, default=0.0\n Complexity parameter used for Minimal Cost-Complexity Pruning. The\n subtree with the largest cost complexity that is smaller than\n ``ccp_alpha`` will be chosen. By default, no pruning is performed. See\n :ref:`minimal_cost_complexity_pruning` for details.\n\n .. versionadded:: 0.22\n\n max_samples : int or float, default=None\n If bootstrap is True, the number of samples to draw from X\n to train each base estimator.\n\n - If None (default), then draw `X.shape[0]` samples.\n - If int, then draw `max_samples` samples.\n - If float, then draw `max_samples * X.shape[0]` samples. Thus,\n `max_samples` should be in the interval `(0.0, 1.0]`.\n\n .. versionadded:: 0.22\n\n Attributes\n ----------\n base_estimator_ : DecisionTreeClassifier\n The child estimator template used to create the collection of fitted\n sub-estimators.\n\n estimators_ : list of DecisionTreeClassifier\n The collection of fitted sub-estimators.\n\n classes_ : ndarray of shape (n_classes,) or a list of such arrays\n The classes labels (single output problem), or a list of arrays of\n class labels (multi-output problem).\n\n n_classes_ : int or list\n The number of classes (single output problem), or a list containing the\n number of classes for each output (multi-output problem).\n\n n_features_ : int\n The number of features when ``fit`` is performed.\n\n .. deprecated:: 1.0\n Attribute `n_features_` was deprecated in version 1.0 and will be\n removed in 1.2. Use `n_features_in_` instead.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n .. versionadded:: 1.0\n\n n_outputs_ : int\n The number of outputs when ``fit`` is performed.\n\n feature_importances_ : ndarray of shape (n_features,)\n The impurity-based feature importances.\n The higher, the more important the feature.\n The importance of a feature is computed as the (normalized)\n total reduction of the criterion brought by that feature. It is also\n known as the Gini importance.\n\n Warning: impurity-based feature importances can be misleading for\n high cardinality features (many unique values). See\n :func:`sklearn.inspection.permutation_importance` as an alternative.\n\n oob_score_ : float\n Score of the training dataset obtained using an out-of-bag estimate.\n This attribute exists only when ``oob_score`` is True.\n\n oob_decision_function_ : ndarray of shape (n_samples, n_classes) or (n_samples, n_classes, n_outputs)\n Decision function computed with out-of-bag estimate on the training\n set. 
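The ``oob_score_`` and ``oob_decision_function_`` attributes described just above are populated when the forest is fitted with ``oob_score=True`` (bootstrapping is already the default for random forests). A minimal sketch on synthetic data; the dataset and printed values are illustrative only::

    from sklearn.datasets import make_classification
    from sklearn.ensemble import RandomForestClassifier

    X, y = make_classification(n_samples=300, n_features=4, random_state=0)
    clf = RandomForestClassifier(n_estimators=100, oob_score=True,
                                 random_state=0).fit(X, y)

    # One row of class probabilities per training sample, computed only from
    # the trees whose bootstrap sample did not contain that sample.
    print(clf.oob_decision_function_.shape)  # (300, 2)
    print(clf.oob_score_)                    # out-of-bag accuracy estimate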
If n_estimators is small it might be possible that a data point\n was never left out during the bootstrap. In this case,\n `oob_decision_function_` might contain NaN. This attribute exists\n only when ``oob_score`` is True.\n\n See Also\n --------\n sklearn.tree.DecisionTreeClassifier : A decision tree classifier.\n sklearn.ensemble.ExtraTreesClassifier : Ensemble of extremely randomized\n tree classifiers.\n\n Notes\n -----\n The default values for the parameters controlling the size of the trees\n (e.g. ``max_depth``, ``min_samples_leaf``, etc.) lead to fully grown and\n unpruned trees which can potentially be very large on some data sets. To\n reduce memory consumption, the complexity and size of the trees should be\n controlled by setting those parameter values.\n\n The features are always randomly permuted at each split. Therefore,\n the best found split may vary, even with the same training data,\n ``max_features=n_features`` and ``bootstrap=False``, if the improvement\n of the criterion is identical for several splits enumerated during the\n search of the best split. To obtain a deterministic behaviour during\n fitting, ``random_state`` has to be fixed.\n\n References\n ----------\n .. [1] L. Breiman, \"Random Forests\", Machine Learning, 45(1), 5-32, 2001.\n\n Examples\n --------\n >>> from sklearn.ensemble import RandomForestClassifier\n >>> from sklearn.datasets import make_classification\n >>> X, y = make_classification(n_samples=1000, n_features=4,\n ... n_informative=2, n_redundant=0,\n ... random_state=0, shuffle=False)\n >>> clf = RandomForestClassifier(max_depth=2, random_state=0)\n >>> clf.fit(X, y)\n RandomForestClassifier(...)\n >>> print(clf.predict([[0, 0, 0, 0]]))\n [1]\n ", - "source_code": "\n\nclass RandomForestClassifier(ForestClassifier):\n \"\"\"\n A random forest classifier.\n\n A random forest is a meta estimator that fits a number of decision tree\n classifiers on various sub-samples of the dataset and uses averaging to\n improve the predictive accuracy and control over-fitting.\n The sub-sample size is controlled with the `max_samples` parameter if\n `bootstrap=True` (default), otherwise the whole dataset is used to build\n each tree.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n n_estimators : int, default=100\n The number of trees in the forest.\n\n .. versionchanged:: 0.22\n The default value of ``n_estimators`` changed from 10 to 100\n in 0.22.\n\n criterion : {\"gini\", \"entropy\"}, default=\"gini\"\n The function to measure the quality of a split. Supported criteria are\n \"gini\" for the Gini impurity and \"entropy\" for the information gain.\n Note: this parameter is tree-specific.\n\n max_depth : int, default=None\n The maximum depth of the tree. If None, then nodes are expanded until\n all leaves are pure or until all leaves contain less than\n min_samples_split samples.\n\n min_samples_split : int or float, default=2\n The minimum number of samples required to split an internal node:\n\n - If int, then consider `min_samples_split` as the minimum number.\n - If float, then `min_samples_split` is a fraction and\n `ceil(min_samples_split * n_samples)` are the minimum\n number of samples for each split.\n\n .. 
versionchanged:: 0.18\n Added float values for fractions.\n\n min_samples_leaf : int or float, default=1\n The minimum number of samples required to be at a leaf node.\n A split point at any depth will only be considered if it leaves at\n least ``min_samples_leaf`` training samples in each of the left and\n right branches. This may have the effect of smoothing the model,\n especially in regression.\n\n - If int, then consider `min_samples_leaf` as the minimum number.\n - If float, then `min_samples_leaf` is a fraction and\n `ceil(min_samples_leaf * n_samples)` are the minimum\n number of samples for each node.\n\n .. versionchanged:: 0.18\n Added float values for fractions.\n\n min_weight_fraction_leaf : float, default=0.0\n The minimum weighted fraction of the sum total of weights (of all\n the input samples) required to be at a leaf node. Samples have\n equal weight when sample_weight is not provided.\n\n max_features : {\"auto\", \"sqrt\", \"log2\"}, int or float, default=\"auto\"\n The number of features to consider when looking for the best split:\n\n - If int, then consider `max_features` features at each split.\n - If float, then `max_features` is a fraction and\n `round(max_features * n_features)` features are considered at each\n split.\n - If \"auto\", then `max_features=sqrt(n_features)`.\n - If \"sqrt\", then `max_features=sqrt(n_features)` (same as \"auto\").\n - If \"log2\", then `max_features=log2(n_features)`.\n - If None, then `max_features=n_features`.\n\n Note: the search for a split does not stop until at least one\n valid partition of the node samples is found, even if it requires to\n effectively inspect more than ``max_features`` features.\n\n max_leaf_nodes : int, default=None\n Grow trees with ``max_leaf_nodes`` in best-first fashion.\n Best nodes are defined as relative reduction in impurity.\n If None then unlimited number of leaf nodes.\n\n min_impurity_decrease : float, default=0.0\n A node will be split if this split induces a decrease of the impurity\n greater than or equal to this value.\n\n The weighted impurity decrease equation is the following::\n\n N_t / N * (impurity - N_t_R / N_t * right_impurity\n - N_t_L / N_t * left_impurity)\n\n where ``N`` is the total number of samples, ``N_t`` is the number of\n samples at the current node, ``N_t_L`` is the number of samples in the\n left child, and ``N_t_R`` is the number of samples in the right child.\n\n ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum,\n if ``sample_weight`` is passed.\n\n .. versionadded:: 0.19\n\n bootstrap : bool, default=True\n Whether bootstrap samples are used when building trees. If False, the\n whole dataset is used to build each tree.\n\n oob_score : bool, default=False\n Whether to use out-of-bag samples to estimate the generalization score.\n Only available if bootstrap=True.\n\n n_jobs : int, default=None\n The number of jobs to run in parallel. :meth:`fit`, :meth:`predict`,\n :meth:`decision_path` and :meth:`apply` are all parallelized over the\n trees. ``None`` means 1 unless in a :obj:`joblib.parallel_backend`\n context. ``-1`` means using all processors. 
See :term:`Glossary\n ` for more details.\n\n random_state : int, RandomState instance or None, default=None\n Controls both the randomness of the bootstrapping of the samples used\n when building trees (if ``bootstrap=True``) and the sampling of the\n features to consider when looking for the best split at each node\n (if ``max_features < n_features``).\n See :term:`Glossary ` for details.\n\n verbose : int, default=0\n Controls the verbosity when fitting and predicting.\n\n warm_start : bool, default=False\n When set to ``True``, reuse the solution of the previous call to fit\n and add more estimators to the ensemble, otherwise, just fit a whole\n new forest. See :term:`the Glossary `.\n\n class_weight : {\"balanced\", \"balanced_subsample\"}, dict or list of dicts, default=None\n Weights associated with classes in the form ``{class_label: weight}``.\n If not given, all classes are supposed to have weight one. For\n multi-output problems, a list of dicts can be provided in the same\n order as the columns of y.\n\n Note that for multioutput (including multilabel) weights should be\n defined for each class of every column in its own dict. For example,\n for four-class multilabel classification weights should be\n [{0: 1, 1: 1}, {0: 1, 1: 5}, {0: 1, 1: 1}, {0: 1, 1: 1}] instead of\n [{1:1}, {2:5}, {3:1}, {4:1}].\n\n The \"balanced\" mode uses the values of y to automatically adjust\n weights inversely proportional to class frequencies in the input data\n as ``n_samples / (n_classes * np.bincount(y))``\n\n The \"balanced_subsample\" mode is the same as \"balanced\" except that\n weights are computed based on the bootstrap sample for every tree\n grown.\n\n For multi-output, the weights of each column of y will be multiplied.\n\n Note that these weights will be multiplied with sample_weight (passed\n through the fit method) if sample_weight is specified.\n\n ccp_alpha : non-negative float, default=0.0\n Complexity parameter used for Minimal Cost-Complexity Pruning. The\n subtree with the largest cost complexity that is smaller than\n ``ccp_alpha`` will be chosen. By default, no pruning is performed. See\n :ref:`minimal_cost_complexity_pruning` for details.\n\n .. versionadded:: 0.22\n\n max_samples : int or float, default=None\n If bootstrap is True, the number of samples to draw from X\n to train each base estimator.\n\n - If None (default), then draw `X.shape[0]` samples.\n - If int, then draw `max_samples` samples.\n - If float, then draw `max_samples * X.shape[0]` samples. Thus,\n `max_samples` should be in the interval `(0.0, 1.0]`.\n\n .. versionadded:: 0.22\n\n Attributes\n ----------\n base_estimator_ : DecisionTreeClassifier\n The child estimator template used to create the collection of fitted\n sub-estimators.\n\n estimators_ : list of DecisionTreeClassifier\n The collection of fitted sub-estimators.\n\n classes_ : ndarray of shape (n_classes,) or a list of such arrays\n The classes labels (single output problem), or a list of arrays of\n class labels (multi-output problem).\n\n n_classes_ : int or list\n The number of classes (single output problem), or a list containing the\n number of classes for each output (multi-output problem).\n\n n_features_ : int\n The number of features when ``fit`` is performed.\n\n .. deprecated:: 1.0\n Attribute `n_features_` was deprecated in version 1.0 and will be\n removed in 1.2. Use `n_features_in_` instead.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. 
versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n .. versionadded:: 1.0\n\n n_outputs_ : int\n The number of outputs when ``fit`` is performed.\n\n feature_importances_ : ndarray of shape (n_features,)\n The impurity-based feature importances.\n The higher, the more important the feature.\n The importance of a feature is computed as the (normalized)\n total reduction of the criterion brought by that feature. It is also\n known as the Gini importance.\n\n Warning: impurity-based feature importances can be misleading for\n high cardinality features (many unique values). See\n :func:`sklearn.inspection.permutation_importance` as an alternative.\n\n oob_score_ : float\n Score of the training dataset obtained using an out-of-bag estimate.\n This attribute exists only when ``oob_score`` is True.\n\n oob_decision_function_ : ndarray of shape (n_samples, n_classes) or (n_samples, n_classes, n_outputs)\n Decision function computed with out-of-bag estimate on the training\n set. If n_estimators is small it might be possible that a data point\n was never left out during the bootstrap. In this case,\n `oob_decision_function_` might contain NaN. This attribute exists\n only when ``oob_score`` is True.\n\n See Also\n --------\n sklearn.tree.DecisionTreeClassifier : A decision tree classifier.\n sklearn.ensemble.ExtraTreesClassifier : Ensemble of extremely randomized\n tree classifiers.\n\n Notes\n -----\n The default values for the parameters controlling the size of the trees\n (e.g. ``max_depth``, ``min_samples_leaf``, etc.) lead to fully grown and\n unpruned trees which can potentially be very large on some data sets. To\n reduce memory consumption, the complexity and size of the trees should be\n controlled by setting those parameter values.\n\n The features are always randomly permuted at each split. Therefore,\n the best found split may vary, even with the same training data,\n ``max_features=n_features`` and ``bootstrap=False``, if the improvement\n of the criterion is identical for several splits enumerated during the\n search of the best split. To obtain a deterministic behaviour during\n fitting, ``random_state`` has to be fixed.\n\n References\n ----------\n .. [1] L. Breiman, \"Random Forests\", Machine Learning, 45(1), 5-32, 2001.\n\n Examples\n --------\n >>> from sklearn.ensemble import RandomForestClassifier\n >>> from sklearn.datasets import make_classification\n >>> X, y = make_classification(n_samples=1000, n_features=4,\n ... n_informative=2, n_redundant=0,\n ... 
random_state=0, shuffle=False)\n >>> clf = RandomForestClassifier(max_depth=2, random_state=0)\n >>> clf.fit(X, y)\n RandomForestClassifier(...)\n >>> print(clf.predict([[0, 0, 0, 0]]))\n [1]\n \"\"\"\n \n def __init__(self, n_estimators=100, *, criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0, bootstrap=True, oob_score=False, n_jobs=None, random_state=None, verbose=0, warm_start=False, class_weight=None, ccp_alpha=0.0, max_samples=None):\n super().__init__(base_estimator=DecisionTreeClassifier(), n_estimators=n_estimators, estimator_params=('criterion', 'max_depth', 'min_samples_split', 'min_samples_leaf', 'min_weight_fraction_leaf', 'max_features', 'max_leaf_nodes', 'min_impurity_decrease', 'random_state', 'ccp_alpha'), bootstrap=bootstrap, oob_score=oob_score, n_jobs=n_jobs, random_state=random_state, verbose=verbose, warm_start=warm_start, class_weight=class_weight, max_samples=max_samples)\n self.criterion = criterion\n self.max_depth = max_depth\n self.min_samples_split = min_samples_split\n self.min_samples_leaf = min_samples_leaf\n self.min_weight_fraction_leaf = min_weight_fraction_leaf\n self.max_features = max_features\n self.max_leaf_nodes = max_leaf_nodes\n self.min_impurity_decrease = min_impurity_decrease\n self.ccp_alpha = ccp_alpha\n" + "description": "A random forest classifier.\n\nA random forest is a meta estimator that fits a number of decision tree\nclassifiers on various sub-samples of the dataset and uses averaging to\nimprove the predictive accuracy and control over-fitting.\nThe sub-sample size is controlled with the `max_samples` parameter if\n`bootstrap=True` (default), otherwise the whole dataset is used to build\neach tree.\n\nRead more in the :ref:`User Guide `.", + "docstring": "\n A random forest classifier.\n\n A random forest is a meta estimator that fits a number of decision tree\n classifiers on various sub-samples of the dataset and uses averaging to\n improve the predictive accuracy and control over-fitting.\n The sub-sample size is controlled with the `max_samples` parameter if\n `bootstrap=True` (default), otherwise the whole dataset is used to build\n each tree.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n n_estimators : int, default=100\n The number of trees in the forest.\n\n .. versionchanged:: 0.22\n The default value of ``n_estimators`` changed from 10 to 100\n in 0.22.\n\n criterion : {\"gini\", \"entropy\"}, default=\"gini\"\n The function to measure the quality of a split. Supported criteria are\n \"gini\" for the Gini impurity and \"entropy\" for the information gain.\n Note: this parameter is tree-specific.\n\n max_depth : int, default=None\n The maximum depth of the tree. If None, then nodes are expanded until\n all leaves are pure or until all leaves contain less than\n min_samples_split samples.\n\n min_samples_split : int or float, default=2\n The minimum number of samples required to split an internal node:\n\n - If int, then consider `min_samples_split` as the minimum number.\n - If float, then `min_samples_split` is a fraction and\n `ceil(min_samples_split * n_samples)` are the minimum\n number of samples for each split.\n\n .. 
versionchanged:: 0.18\n Added float values for fractions.\n\n min_samples_leaf : int or float, default=1\n The minimum number of samples required to be at a leaf node.\n A split point at any depth will only be considered if it leaves at\n least ``min_samples_leaf`` training samples in each of the left and\n right branches. This may have the effect of smoothing the model,\n especially in regression.\n\n - If int, then consider `min_samples_leaf` as the minimum number.\n - If float, then `min_samples_leaf` is a fraction and\n `ceil(min_samples_leaf * n_samples)` are the minimum\n number of samples for each node.\n\n .. versionchanged:: 0.18\n Added float values for fractions.\n\n min_weight_fraction_leaf : float, default=0.0\n The minimum weighted fraction of the sum total of weights (of all\n the input samples) required to be at a leaf node. Samples have\n equal weight when sample_weight is not provided.\n\n max_features : {\"auto\", \"sqrt\", \"log2\"}, int or float, default=\"auto\"\n The number of features to consider when looking for the best split:\n\n - If int, then consider `max_features` features at each split.\n - If float, then `max_features` is a fraction and\n `round(max_features * n_features)` features are considered at each\n split.\n - If \"auto\", then `max_features=sqrt(n_features)`.\n - If \"sqrt\", then `max_features=sqrt(n_features)` (same as \"auto\").\n - If \"log2\", then `max_features=log2(n_features)`.\n - If None, then `max_features=n_features`.\n\n Note: the search for a split does not stop until at least one\n valid partition of the node samples is found, even if it requires to\n effectively inspect more than ``max_features`` features.\n\n max_leaf_nodes : int, default=None\n Grow trees with ``max_leaf_nodes`` in best-first fashion.\n Best nodes are defined as relative reduction in impurity.\n If None then unlimited number of leaf nodes.\n\n min_impurity_decrease : float, default=0.0\n A node will be split if this split induces a decrease of the impurity\n greater than or equal to this value.\n\n The weighted impurity decrease equation is the following::\n\n N_t / N * (impurity - N_t_R / N_t * right_impurity\n - N_t_L / N_t * left_impurity)\n\n where ``N`` is the total number of samples, ``N_t`` is the number of\n samples at the current node, ``N_t_L`` is the number of samples in the\n left child, and ``N_t_R`` is the number of samples in the right child.\n\n ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum,\n if ``sample_weight`` is passed.\n\n .. versionadded:: 0.19\n\n bootstrap : bool, default=True\n Whether bootstrap samples are used when building trees. If False, the\n whole dataset is used to build each tree.\n\n oob_score : bool, default=False\n Whether to use out-of-bag samples to estimate the generalization score.\n Only available if bootstrap=True.\n\n n_jobs : int, default=None\n The number of jobs to run in parallel. :meth:`fit`, :meth:`predict`,\n :meth:`decision_path` and :meth:`apply` are all parallelized over the\n trees. ``None`` means 1 unless in a :obj:`joblib.parallel_backend`\n context. ``-1`` means using all processors. 
See :term:`Glossary\n ` for more details.\n\n random_state : int, RandomState instance or None, default=None\n Controls both the randomness of the bootstrapping of the samples used\n when building trees (if ``bootstrap=True``) and the sampling of the\n features to consider when looking for the best split at each node\n (if ``max_features < n_features``).\n See :term:`Glossary ` for details.\n\n verbose : int, default=0\n Controls the verbosity when fitting and predicting.\n\n warm_start : bool, default=False\n When set to ``True``, reuse the solution of the previous call to fit\n and add more estimators to the ensemble, otherwise, just fit a whole\n new forest. See :term:`the Glossary `.\n\n class_weight : {\"balanced\", \"balanced_subsample\"}, dict or list of dicts, default=None\n Weights associated with classes in the form ``{class_label: weight}``.\n If not given, all classes are supposed to have weight one. For\n multi-output problems, a list of dicts can be provided in the same\n order as the columns of y.\n\n Note that for multioutput (including multilabel) weights should be\n defined for each class of every column in its own dict. For example,\n for four-class multilabel classification weights should be\n [{0: 1, 1: 1}, {0: 1, 1: 5}, {0: 1, 1: 1}, {0: 1, 1: 1}] instead of\n [{1:1}, {2:5}, {3:1}, {4:1}].\n\n The \"balanced\" mode uses the values of y to automatically adjust\n weights inversely proportional to class frequencies in the input data\n as ``n_samples / (n_classes * np.bincount(y))``\n\n The \"balanced_subsample\" mode is the same as \"balanced\" except that\n weights are computed based on the bootstrap sample for every tree\n grown.\n\n For multi-output, the weights of each column of y will be multiplied.\n\n Note that these weights will be multiplied with sample_weight (passed\n through the fit method) if sample_weight is specified.\n\n ccp_alpha : non-negative float, default=0.0\n Complexity parameter used for Minimal Cost-Complexity Pruning. The\n subtree with the largest cost complexity that is smaller than\n ``ccp_alpha`` will be chosen. By default, no pruning is performed. See\n :ref:`minimal_cost_complexity_pruning` for details.\n\n .. versionadded:: 0.22\n\n max_samples : int or float, default=None\n If bootstrap is True, the number of samples to draw from X\n to train each base estimator.\n\n - If None (default), then draw `X.shape[0]` samples.\n - If int, then draw `max_samples` samples.\n - If float, then draw `max_samples * X.shape[0]` samples. Thus,\n `max_samples` should be in the interval `(0.0, 1.0]`.\n\n .. versionadded:: 0.22\n\n Attributes\n ----------\n base_estimator_ : DecisionTreeClassifier\n The child estimator template used to create the collection of fitted\n sub-estimators.\n\n estimators_ : list of DecisionTreeClassifier\n The collection of fitted sub-estimators.\n\n classes_ : ndarray of shape (n_classes,) or a list of such arrays\n The classes labels (single output problem), or a list of arrays of\n class labels (multi-output problem).\n\n n_classes_ : int or list\n The number of classes (single output problem), or a list containing the\n number of classes for each output (multi-output problem).\n\n n_features_ : int\n The number of features when ``fit`` is performed.\n\n .. deprecated:: 1.0\n Attribute `n_features_` was deprecated in version 1.0 and will be\n removed in 1.2. Use `n_features_in_` instead.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. 
versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n n_outputs_ : int\n The number of outputs when ``fit`` is performed.\n\n feature_importances_ : ndarray of shape (n_features,)\n The impurity-based feature importances.\n The higher, the more important the feature.\n The importance of a feature is computed as the (normalized)\n total reduction of the criterion brought by that feature. It is also\n known as the Gini importance.\n\n Warning: impurity-based feature importances can be misleading for\n high cardinality features (many unique values). See\n :func:`sklearn.inspection.permutation_importance` as an alternative.\n\n oob_score_ : float\n Score of the training dataset obtained using an out-of-bag estimate.\n This attribute exists only when ``oob_score`` is True.\n\n oob_decision_function_ : ndarray of shape (n_samples, n_classes) or (n_samples, n_classes, n_outputs)\n Decision function computed with out-of-bag estimate on the training\n set. If n_estimators is small it might be possible that a data point\n was never left out during the bootstrap. In this case,\n `oob_decision_function_` might contain NaN. This attribute exists\n only when ``oob_score`` is True.\n\n See Also\n --------\n sklearn.tree.DecisionTreeClassifier : A decision tree classifier.\n sklearn.ensemble.ExtraTreesClassifier : Ensemble of extremely randomized\n tree classifiers.\n\n Notes\n -----\n The default values for the parameters controlling the size of the trees\n (e.g. ``max_depth``, ``min_samples_leaf``, etc.) lead to fully grown and\n unpruned trees which can potentially be very large on some data sets. To\n reduce memory consumption, the complexity and size of the trees should be\n controlled by setting those parameter values.\n\n The features are always randomly permuted at each split. Therefore,\n the best found split may vary, even with the same training data,\n ``max_features=n_features`` and ``bootstrap=False``, if the improvement\n of the criterion is identical for several splits enumerated during the\n search of the best split. To obtain a deterministic behaviour during\n fitting, ``random_state`` has to be fixed.\n\n References\n ----------\n .. [1] L. Breiman, \"Random Forests\", Machine Learning, 45(1), 5-32, 2001.\n\n Examples\n --------\n >>> from sklearn.ensemble import RandomForestClassifier\n >>> from sklearn.datasets import make_classification\n >>> X, y = make_classification(n_samples=1000, n_features=4,\n ... n_informative=2, n_redundant=0,\n ... random_state=0, shuffle=False)\n >>> clf = RandomForestClassifier(max_depth=2, random_state=0)\n >>> clf.fit(X, y)\n RandomForestClassifier(...)\n >>> print(clf.predict([[0, 0, 0, 0]]))\n [1]\n ", + "source_code": "\n\nclass RandomForestClassifier(ForestClassifier):\n \"\"\"\n A random forest classifier.\n\n A random forest is a meta estimator that fits a number of decision tree\n classifiers on various sub-samples of the dataset and uses averaging to\n improve the predictive accuracy and control over-fitting.\n The sub-sample size is controlled with the `max_samples` parameter if\n `bootstrap=True` (default), otherwise the whole dataset is used to build\n each tree.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n n_estimators : int, default=100\n The number of trees in the forest.\n\n .. 
versionchanged:: 0.22\n The default value of ``n_estimators`` changed from 10 to 100\n in 0.22.\n\n criterion : {\"gini\", \"entropy\"}, default=\"gini\"\n The function to measure the quality of a split. Supported criteria are\n \"gini\" for the Gini impurity and \"entropy\" for the information gain.\n Note: this parameter is tree-specific.\n\n max_depth : int, default=None\n The maximum depth of the tree. If None, then nodes are expanded until\n all leaves are pure or until all leaves contain less than\n min_samples_split samples.\n\n min_samples_split : int or float, default=2\n The minimum number of samples required to split an internal node:\n\n - If int, then consider `min_samples_split` as the minimum number.\n - If float, then `min_samples_split` is a fraction and\n `ceil(min_samples_split * n_samples)` are the minimum\n number of samples for each split.\n\n .. versionchanged:: 0.18\n Added float values for fractions.\n\n min_samples_leaf : int or float, default=1\n The minimum number of samples required to be at a leaf node.\n A split point at any depth will only be considered if it leaves at\n least ``min_samples_leaf`` training samples in each of the left and\n right branches. This may have the effect of smoothing the model,\n especially in regression.\n\n - If int, then consider `min_samples_leaf` as the minimum number.\n - If float, then `min_samples_leaf` is a fraction and\n `ceil(min_samples_leaf * n_samples)` are the minimum\n number of samples for each node.\n\n .. versionchanged:: 0.18\n Added float values for fractions.\n\n min_weight_fraction_leaf : float, default=0.0\n The minimum weighted fraction of the sum total of weights (of all\n the input samples) required to be at a leaf node. Samples have\n equal weight when sample_weight is not provided.\n\n max_features : {\"auto\", \"sqrt\", \"log2\"}, int or float, default=\"auto\"\n The number of features to consider when looking for the best split:\n\n - If int, then consider `max_features` features at each split.\n - If float, then `max_features` is a fraction and\n `round(max_features * n_features)` features are considered at each\n split.\n - If \"auto\", then `max_features=sqrt(n_features)`.\n - If \"sqrt\", then `max_features=sqrt(n_features)` (same as \"auto\").\n - If \"log2\", then `max_features=log2(n_features)`.\n - If None, then `max_features=n_features`.\n\n Note: the search for a split does not stop until at least one\n valid partition of the node samples is found, even if it requires to\n effectively inspect more than ``max_features`` features.\n\n max_leaf_nodes : int, default=None\n Grow trees with ``max_leaf_nodes`` in best-first fashion.\n Best nodes are defined as relative reduction in impurity.\n If None then unlimited number of leaf nodes.\n\n min_impurity_decrease : float, default=0.0\n A node will be split if this split induces a decrease of the impurity\n greater than or equal to this value.\n\n The weighted impurity decrease equation is the following::\n\n N_t / N * (impurity - N_t_R / N_t * right_impurity\n - N_t_L / N_t * left_impurity)\n\n where ``N`` is the total number of samples, ``N_t`` is the number of\n samples at the current node, ``N_t_L`` is the number of samples in the\n left child, and ``N_t_R`` is the number of samples in the right child.\n\n ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum,\n if ``sample_weight`` is passed.\n\n .. versionadded:: 0.19\n\n bootstrap : bool, default=True\n Whether bootstrap samples are used when building trees. 
If False, the\n whole dataset is used to build each tree.\n\n oob_score : bool, default=False\n Whether to use out-of-bag samples to estimate the generalization score.\n Only available if bootstrap=True.\n\n n_jobs : int, default=None\n The number of jobs to run in parallel. :meth:`fit`, :meth:`predict`,\n :meth:`decision_path` and :meth:`apply` are all parallelized over the\n trees. ``None`` means 1 unless in a :obj:`joblib.parallel_backend`\n context. ``-1`` means using all processors. See :term:`Glossary\n ` for more details.\n\n random_state : int, RandomState instance or None, default=None\n Controls both the randomness of the bootstrapping of the samples used\n when building trees (if ``bootstrap=True``) and the sampling of the\n features to consider when looking for the best split at each node\n (if ``max_features < n_features``).\n See :term:`Glossary ` for details.\n\n verbose : int, default=0\n Controls the verbosity when fitting and predicting.\n\n warm_start : bool, default=False\n When set to ``True``, reuse the solution of the previous call to fit\n and add more estimators to the ensemble, otherwise, just fit a whole\n new forest. See :term:`the Glossary `.\n\n class_weight : {\"balanced\", \"balanced_subsample\"}, dict or list of dicts, default=None\n Weights associated with classes in the form ``{class_label: weight}``.\n If not given, all classes are supposed to have weight one. For\n multi-output problems, a list of dicts can be provided in the same\n order as the columns of y.\n\n Note that for multioutput (including multilabel) weights should be\n defined for each class of every column in its own dict. For example,\n for four-class multilabel classification weights should be\n [{0: 1, 1: 1}, {0: 1, 1: 5}, {0: 1, 1: 1}, {0: 1, 1: 1}] instead of\n [{1:1}, {2:5}, {3:1}, {4:1}].\n\n The \"balanced\" mode uses the values of y to automatically adjust\n weights inversely proportional to class frequencies in the input data\n as ``n_samples / (n_classes * np.bincount(y))``\n\n The \"balanced_subsample\" mode is the same as \"balanced\" except that\n weights are computed based on the bootstrap sample for every tree\n grown.\n\n For multi-output, the weights of each column of y will be multiplied.\n\n Note that these weights will be multiplied with sample_weight (passed\n through the fit method) if sample_weight is specified.\n\n ccp_alpha : non-negative float, default=0.0\n Complexity parameter used for Minimal Cost-Complexity Pruning. The\n subtree with the largest cost complexity that is smaller than\n ``ccp_alpha`` will be chosen. By default, no pruning is performed. See\n :ref:`minimal_cost_complexity_pruning` for details.\n\n .. versionadded:: 0.22\n\n max_samples : int or float, default=None\n If bootstrap is True, the number of samples to draw from X\n to train each base estimator.\n\n - If None (default), then draw `X.shape[0]` samples.\n - If int, then draw `max_samples` samples.\n - If float, then draw `max_samples * X.shape[0]` samples. Thus,\n `max_samples` should be in the interval `(0.0, 1.0]`.\n\n .. 
versionadded:: 0.22\n\n Attributes\n ----------\n base_estimator_ : DecisionTreeClassifier\n The child estimator template used to create the collection of fitted\n sub-estimators.\n\n estimators_ : list of DecisionTreeClassifier\n The collection of fitted sub-estimators.\n\n classes_ : ndarray of shape (n_classes,) or a list of such arrays\n The classes labels (single output problem), or a list of arrays of\n class labels (multi-output problem).\n\n n_classes_ : int or list\n The number of classes (single output problem), or a list containing the\n number of classes for each output (multi-output problem).\n\n n_features_ : int\n The number of features when ``fit`` is performed.\n\n .. deprecated:: 1.0\n Attribute `n_features_` was deprecated in version 1.0 and will be\n removed in 1.2. Use `n_features_in_` instead.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n n_outputs_ : int\n The number of outputs when ``fit`` is performed.\n\n feature_importances_ : ndarray of shape (n_features,)\n The impurity-based feature importances.\n The higher, the more important the feature.\n The importance of a feature is computed as the (normalized)\n total reduction of the criterion brought by that feature. It is also\n known as the Gini importance.\n\n Warning: impurity-based feature importances can be misleading for\n high cardinality features (many unique values). See\n :func:`sklearn.inspection.permutation_importance` as an alternative.\n\n oob_score_ : float\n Score of the training dataset obtained using an out-of-bag estimate.\n This attribute exists only when ``oob_score`` is True.\n\n oob_decision_function_ : ndarray of shape (n_samples, n_classes) or (n_samples, n_classes, n_outputs)\n Decision function computed with out-of-bag estimate on the training\n set. If n_estimators is small it might be possible that a data point\n was never left out during the bootstrap. In this case,\n `oob_decision_function_` might contain NaN. This attribute exists\n only when ``oob_score`` is True.\n\n See Also\n --------\n sklearn.tree.DecisionTreeClassifier : A decision tree classifier.\n sklearn.ensemble.ExtraTreesClassifier : Ensemble of extremely randomized\n tree classifiers.\n\n Notes\n -----\n The default values for the parameters controlling the size of the trees\n (e.g. ``max_depth``, ``min_samples_leaf``, etc.) lead to fully grown and\n unpruned trees which can potentially be very large on some data sets. To\n reduce memory consumption, the complexity and size of the trees should be\n controlled by setting those parameter values.\n\n The features are always randomly permuted at each split. Therefore,\n the best found split may vary, even with the same training data,\n ``max_features=n_features`` and ``bootstrap=False``, if the improvement\n of the criterion is identical for several splits enumerated during the\n search of the best split. To obtain a deterministic behaviour during\n fitting, ``random_state`` has to be fixed.\n\n References\n ----------\n .. [1] L. Breiman, \"Random Forests\", Machine Learning, 45(1), 5-32, 2001.\n\n Examples\n --------\n >>> from sklearn.ensemble import RandomForestClassifier\n >>> from sklearn.datasets import make_classification\n >>> X, y = make_classification(n_samples=1000, n_features=4,\n ... 
n_informative=2, n_redundant=0,\n ... random_state=0, shuffle=False)\n >>> clf = RandomForestClassifier(max_depth=2, random_state=0)\n >>> clf.fit(X, y)\n RandomForestClassifier(...)\n >>> print(clf.predict([[0, 0, 0, 0]]))\n [1]\n \"\"\"\n \n def __init__(self, n_estimators=100, *, criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0, bootstrap=True, oob_score=False, n_jobs=None, random_state=None, verbose=0, warm_start=False, class_weight=None, ccp_alpha=0.0, max_samples=None):\n super().__init__(base_estimator=DecisionTreeClassifier(), n_estimators=n_estimators, estimator_params=('criterion', 'max_depth', 'min_samples_split', 'min_samples_leaf', 'min_weight_fraction_leaf', 'max_features', 'max_leaf_nodes', 'min_impurity_decrease', 'random_state', 'ccp_alpha'), bootstrap=bootstrap, oob_score=oob_score, n_jobs=n_jobs, random_state=random_state, verbose=verbose, warm_start=warm_start, class_weight=class_weight, max_samples=max_samples)\n self.criterion = criterion\n self.max_depth = max_depth\n self.min_samples_split = min_samples_split\n self.min_samples_leaf = min_samples_leaf\n self.min_weight_fraction_leaf = min_weight_fraction_leaf\n self.max_features = max_features\n self.max_leaf_nodes = max_leaf_nodes\n self.min_impurity_decrease = min_impurity_decrease\n self.ccp_alpha = ccp_alpha\n" }, { "name": "RandomForestRegressor", @@ -20985,9 +21051,9 @@ "sklearn.ensemble._forest.RandomForestRegressor.__init__" ], "is_public": true, - "description": "A random forest regressor.\n\nA random forest is a meta estimator that fits a number of classifying decision trees on various sub-samples of the dataset and uses averaging to improve the predictive accuracy and control over-fitting. The sub-sample size is controlled with the `max_samples` parameter if `bootstrap=True` (default), otherwise the whole dataset is used to build each tree. Read more in the :ref:`User Guide `.", - "docstring": "\n A random forest regressor.\n\n A random forest is a meta estimator that fits a number of classifying\n decision trees on various sub-samples of the dataset and uses averaging\n to improve the predictive accuracy and control over-fitting.\n The sub-sample size is controlled with the `max_samples` parameter if\n `bootstrap=True` (default), otherwise the whole dataset is used to build\n each tree.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n n_estimators : int, default=100\n The number of trees in the forest.\n\n .. versionchanged:: 0.22\n The default value of ``n_estimators`` changed from 10 to 100\n in 0.22.\n\n criterion : {\"squared_error\", \"absolute_error\", \"poisson\"}, default=\"squared_error\"\n The function to measure the quality of a split. Supported criteria\n are \"squared_error\" for the mean squared error, which is equal to\n variance reduction as feature selection criterion, \"absolute_error\"\n for the mean absolute error, and \"poisson\" which uses reduction in\n Poisson deviance to find splits.\n Training using \"absolute_error\" is significantly slower\n than when using \"squared_error\".\n\n .. versionadded:: 0.18\n Mean Absolute Error (MAE) criterion.\n\n .. versionadded:: 1.0\n Poisson criterion.\n\n .. deprecated:: 1.0\n Criterion \"mse\" was deprecated in v1.0 and will be removed in\n version 1.2. Use `criterion=\"squared_error\"` which is equivalent.\n\n .. 
deprecated:: 1.0\n Criterion \"mae\" was deprecated in v1.0 and will be removed in\n version 1.2. Use `criterion=\"absolute_error\"` which is equivalent.\n\n max_depth : int, default=None\n The maximum depth of the tree. If None, then nodes are expanded until\n all leaves are pure or until all leaves contain less than\n min_samples_split samples.\n\n min_samples_split : int or float, default=2\n The minimum number of samples required to split an internal node:\n\n - If int, then consider `min_samples_split` as the minimum number.\n - If float, then `min_samples_split` is a fraction and\n `ceil(min_samples_split * n_samples)` are the minimum\n number of samples for each split.\n\n .. versionchanged:: 0.18\n Added float values for fractions.\n\n min_samples_leaf : int or float, default=1\n The minimum number of samples required to be at a leaf node.\n A split point at any depth will only be considered if it leaves at\n least ``min_samples_leaf`` training samples in each of the left and\n right branches. This may have the effect of smoothing the model,\n especially in regression.\n\n - If int, then consider `min_samples_leaf` as the minimum number.\n - If float, then `min_samples_leaf` is a fraction and\n `ceil(min_samples_leaf * n_samples)` are the minimum\n number of samples for each node.\n\n .. versionchanged:: 0.18\n Added float values for fractions.\n\n min_weight_fraction_leaf : float, default=0.0\n The minimum weighted fraction of the sum total of weights (of all\n the input samples) required to be at a leaf node. Samples have\n equal weight when sample_weight is not provided.\n\n max_features : {\"auto\", \"sqrt\", \"log2\"}, int or float, default=\"auto\"\n The number of features to consider when looking for the best split:\n\n - If int, then consider `max_features` features at each split.\n - If float, then `max_features` is a fraction and\n `round(max_features * n_features)` features are considered at each\n split.\n - If \"auto\", then `max_features=n_features`.\n - If \"sqrt\", then `max_features=sqrt(n_features)`.\n - If \"log2\", then `max_features=log2(n_features)`.\n - If None, then `max_features=n_features`.\n\n Note: the search for a split does not stop until at least one\n valid partition of the node samples is found, even if it requires to\n effectively inspect more than ``max_features`` features.\n\n max_leaf_nodes : int, default=None\n Grow trees with ``max_leaf_nodes`` in best-first fashion.\n Best nodes are defined as relative reduction in impurity.\n If None then unlimited number of leaf nodes.\n\n min_impurity_decrease : float, default=0.0\n A node will be split if this split induces a decrease of the impurity\n greater than or equal to this value.\n\n The weighted impurity decrease equation is the following::\n\n N_t / N * (impurity - N_t_R / N_t * right_impurity\n - N_t_L / N_t * left_impurity)\n\n where ``N`` is the total number of samples, ``N_t`` is the number of\n samples at the current node, ``N_t_L`` is the number of samples in the\n left child, and ``N_t_R`` is the number of samples in the right child.\n\n ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum,\n if ``sample_weight`` is passed.\n\n .. versionadded:: 0.19\n\n bootstrap : bool, default=True\n Whether bootstrap samples are used when building trees. 
If False, the\n whole dataset is used to build each tree.\n\n oob_score : bool, default=False\n Whether to use out-of-bag samples to estimate the generalization score.\n Only available if bootstrap=True.\n\n n_jobs : int, default=None\n The number of jobs to run in parallel. :meth:`fit`, :meth:`predict`,\n :meth:`decision_path` and :meth:`apply` are all parallelized over the\n trees. ``None`` means 1 unless in a :obj:`joblib.parallel_backend`\n context. ``-1`` means using all processors. See :term:`Glossary\n ` for more details.\n\n random_state : int, RandomState instance or None, default=None\n Controls both the randomness of the bootstrapping of the samples used\n when building trees (if ``bootstrap=True``) and the sampling of the\n features to consider when looking for the best split at each node\n (if ``max_features < n_features``).\n See :term:`Glossary ` for details.\n\n verbose : int, default=0\n Controls the verbosity when fitting and predicting.\n\n warm_start : bool, default=False\n When set to ``True``, reuse the solution of the previous call to fit\n and add more estimators to the ensemble, otherwise, just fit a whole\n new forest. See :term:`the Glossary `.\n\n ccp_alpha : non-negative float, default=0.0\n Complexity parameter used for Minimal Cost-Complexity Pruning. The\n subtree with the largest cost complexity that is smaller than\n ``ccp_alpha`` will be chosen. By default, no pruning is performed. See\n :ref:`minimal_cost_complexity_pruning` for details.\n\n .. versionadded:: 0.22\n\n max_samples : int or float, default=None\n If bootstrap is True, the number of samples to draw from X\n to train each base estimator.\n\n - If None (default), then draw `X.shape[0]` samples.\n - If int, then draw `max_samples` samples.\n - If float, then draw `max_samples * X.shape[0]` samples. Thus,\n `max_samples` should be in the interval `(0.0, 1.0]`.\n\n .. versionadded:: 0.22\n\n Attributes\n ----------\n base_estimator_ : DecisionTreeRegressor\n The child estimator template used to create the collection of fitted\n sub-estimators.\n\n estimators_ : list of DecisionTreeRegressor\n The collection of fitted sub-estimators.\n\n feature_importances_ : ndarray of shape (n_features,)\n The impurity-based feature importances.\n The higher, the more important the feature.\n The importance of a feature is computed as the (normalized)\n total reduction of the criterion brought by that feature. It is also\n known as the Gini importance.\n\n Warning: impurity-based feature importances can be misleading for\n high cardinality features (many unique values). See\n :func:`sklearn.inspection.permutation_importance` as an alternative.\n\n n_features_ : int\n The number of features when ``fit`` is performed.\n\n .. deprecated:: 1.0\n Attribute `n_features_` was deprecated in version 1.0 and will be\n removed in 1.2. Use `n_features_in_` instead.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n .. 
versionadded:: 1.0\n\n n_outputs_ : int\n The number of outputs when ``fit`` is performed.\n\n oob_score_ : float\n Score of the training dataset obtained using an out-of-bag estimate.\n This attribute exists only when ``oob_score`` is True.\n\n oob_prediction_ : ndarray of shape (n_samples,) or (n_samples, n_outputs)\n Prediction computed with out-of-bag estimate on the training set.\n This attribute exists only when ``oob_score`` is True.\n\n See Also\n --------\n sklearn.tree.DecisionTreeRegressor : A decision tree regressor.\n sklearn.ensemble.ExtraTreesRegressor : Ensemble of extremely randomized\n tree regressors.\n\n Notes\n -----\n The default values for the parameters controlling the size of the trees\n (e.g. ``max_depth``, ``min_samples_leaf``, etc.) lead to fully grown and\n unpruned trees which can potentially be very large on some data sets. To\n reduce memory consumption, the complexity and size of the trees should be\n controlled by setting those parameter values.\n\n The features are always randomly permuted at each split. Therefore,\n the best found split may vary, even with the same training data,\n ``max_features=n_features`` and ``bootstrap=False``, if the improvement\n of the criterion is identical for several splits enumerated during the\n search of the best split. To obtain a deterministic behaviour during\n fitting, ``random_state`` has to be fixed.\n\n The default value ``max_features=\"auto\"`` uses ``n_features``\n rather than ``n_features / 3``. The latter was originally suggested in\n [1], whereas the former was more recently justified empirically in [2].\n\n References\n ----------\n .. [1] L. Breiman, \"Random Forests\", Machine Learning, 45(1), 5-32, 2001.\n\n .. [2] P. Geurts, D. Ernst., and L. Wehenkel, \"Extremely randomized\n trees\", Machine Learning, 63(1), 3-42, 2006.\n\n Examples\n --------\n >>> from sklearn.ensemble import RandomForestRegressor\n >>> from sklearn.datasets import make_regression\n >>> X, y = make_regression(n_features=4, n_informative=2,\n ... random_state=0, shuffle=False)\n >>> regr = RandomForestRegressor(max_depth=2, random_state=0)\n >>> regr.fit(X, y)\n RandomForestRegressor(...)\n >>> print(regr.predict([[0, 0, 0, 0]]))\n [-8.32987858]\n ", - "source_code": "\n\nclass RandomForestRegressor(ForestRegressor):\n \"\"\"\n A random forest regressor.\n\n A random forest is a meta estimator that fits a number of classifying\n decision trees on various sub-samples of the dataset and uses averaging\n to improve the predictive accuracy and control over-fitting.\n The sub-sample size is controlled with the `max_samples` parameter if\n `bootstrap=True` (default), otherwise the whole dataset is used to build\n each tree.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n n_estimators : int, default=100\n The number of trees in the forest.\n\n .. versionchanged:: 0.22\n The default value of ``n_estimators`` changed from 10 to 100\n in 0.22.\n\n criterion : {\"squared_error\", \"absolute_error\", \"poisson\"}, default=\"squared_error\"\n The function to measure the quality of a split. Supported criteria\n are \"squared_error\" for the mean squared error, which is equal to\n variance reduction as feature selection criterion, \"absolute_error\"\n for the mean absolute error, and \"poisson\" which uses reduction in\n Poisson deviance to find splits.\n Training using \"absolute_error\" is significantly slower\n than when using \"squared_error\".\n\n .. versionadded:: 0.18\n Mean Absolute Error (MAE) criterion.\n\n .. 
versionadded:: 1.0\n Poisson criterion.\n\n .. deprecated:: 1.0\n Criterion \"mse\" was deprecated in v1.0 and will be removed in\n version 1.2. Use `criterion=\"squared_error\"` which is equivalent.\n\n .. deprecated:: 1.0\n Criterion \"mae\" was deprecated in v1.0 and will be removed in\n version 1.2. Use `criterion=\"absolute_error\"` which is equivalent.\n\n max_depth : int, default=None\n The maximum depth of the tree. If None, then nodes are expanded until\n all leaves are pure or until all leaves contain less than\n min_samples_split samples.\n\n min_samples_split : int or float, default=2\n The minimum number of samples required to split an internal node:\n\n - If int, then consider `min_samples_split` as the minimum number.\n - If float, then `min_samples_split` is a fraction and\n `ceil(min_samples_split * n_samples)` are the minimum\n number of samples for each split.\n\n .. versionchanged:: 0.18\n Added float values for fractions.\n\n min_samples_leaf : int or float, default=1\n The minimum number of samples required to be at a leaf node.\n A split point at any depth will only be considered if it leaves at\n least ``min_samples_leaf`` training samples in each of the left and\n right branches. This may have the effect of smoothing the model,\n especially in regression.\n\n - If int, then consider `min_samples_leaf` as the minimum number.\n - If float, then `min_samples_leaf` is a fraction and\n `ceil(min_samples_leaf * n_samples)` are the minimum\n number of samples for each node.\n\n .. versionchanged:: 0.18\n Added float values for fractions.\n\n min_weight_fraction_leaf : float, default=0.0\n The minimum weighted fraction of the sum total of weights (of all\n the input samples) required to be at a leaf node. Samples have\n equal weight when sample_weight is not provided.\n\n max_features : {\"auto\", \"sqrt\", \"log2\"}, int or float, default=\"auto\"\n The number of features to consider when looking for the best split:\n\n - If int, then consider `max_features` features at each split.\n - If float, then `max_features` is a fraction and\n `round(max_features * n_features)` features are considered at each\n split.\n - If \"auto\", then `max_features=n_features`.\n - If \"sqrt\", then `max_features=sqrt(n_features)`.\n - If \"log2\", then `max_features=log2(n_features)`.\n - If None, then `max_features=n_features`.\n\n Note: the search for a split does not stop until at least one\n valid partition of the node samples is found, even if it requires to\n effectively inspect more than ``max_features`` features.\n\n max_leaf_nodes : int, default=None\n Grow trees with ``max_leaf_nodes`` in best-first fashion.\n Best nodes are defined as relative reduction in impurity.\n If None then unlimited number of leaf nodes.\n\n min_impurity_decrease : float, default=0.0\n A node will be split if this split induces a decrease of the impurity\n greater than or equal to this value.\n\n The weighted impurity decrease equation is the following::\n\n N_t / N * (impurity - N_t_R / N_t * right_impurity\n - N_t_L / N_t * left_impurity)\n\n where ``N`` is the total number of samples, ``N_t`` is the number of\n samples at the current node, ``N_t_L`` is the number of samples in the\n left child, and ``N_t_R`` is the number of samples in the right child.\n\n ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum,\n if ``sample_weight`` is passed.\n\n .. versionadded:: 0.19\n\n bootstrap : bool, default=True\n Whether bootstrap samples are used when building trees. 
If False, the\n whole dataset is used to build each tree.\n\n oob_score : bool, default=False\n Whether to use out-of-bag samples to estimate the generalization score.\n Only available if bootstrap=True.\n\n n_jobs : int, default=None\n The number of jobs to run in parallel. :meth:`fit`, :meth:`predict`,\n :meth:`decision_path` and :meth:`apply` are all parallelized over the\n trees. ``None`` means 1 unless in a :obj:`joblib.parallel_backend`\n context. ``-1`` means using all processors. See :term:`Glossary\n ` for more details.\n\n random_state : int, RandomState instance or None, default=None\n Controls both the randomness of the bootstrapping of the samples used\n when building trees (if ``bootstrap=True``) and the sampling of the\n features to consider when looking for the best split at each node\n (if ``max_features < n_features``).\n See :term:`Glossary ` for details.\n\n verbose : int, default=0\n Controls the verbosity when fitting and predicting.\n\n warm_start : bool, default=False\n When set to ``True``, reuse the solution of the previous call to fit\n and add more estimators to the ensemble, otherwise, just fit a whole\n new forest. See :term:`the Glossary `.\n\n ccp_alpha : non-negative float, default=0.0\n Complexity parameter used for Minimal Cost-Complexity Pruning. The\n subtree with the largest cost complexity that is smaller than\n ``ccp_alpha`` will be chosen. By default, no pruning is performed. See\n :ref:`minimal_cost_complexity_pruning` for details.\n\n .. versionadded:: 0.22\n\n max_samples : int or float, default=None\n If bootstrap is True, the number of samples to draw from X\n to train each base estimator.\n\n - If None (default), then draw `X.shape[0]` samples.\n - If int, then draw `max_samples` samples.\n - If float, then draw `max_samples * X.shape[0]` samples. Thus,\n `max_samples` should be in the interval `(0.0, 1.0]`.\n\n .. versionadded:: 0.22\n\n Attributes\n ----------\n base_estimator_ : DecisionTreeRegressor\n The child estimator template used to create the collection of fitted\n sub-estimators.\n\n estimators_ : list of DecisionTreeRegressor\n The collection of fitted sub-estimators.\n\n feature_importances_ : ndarray of shape (n_features,)\n The impurity-based feature importances.\n The higher, the more important the feature.\n The importance of a feature is computed as the (normalized)\n total reduction of the criterion brought by that feature. It is also\n known as the Gini importance.\n\n Warning: impurity-based feature importances can be misleading for\n high cardinality features (many unique values). See\n :func:`sklearn.inspection.permutation_importance` as an alternative.\n\n n_features_ : int\n The number of features when ``fit`` is performed.\n\n .. deprecated:: 1.0\n Attribute `n_features_` was deprecated in version 1.0 and will be\n removed in 1.2. Use `n_features_in_` instead.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n .. 
versionadded:: 1.0\n\n n_outputs_ : int\n The number of outputs when ``fit`` is performed.\n\n oob_score_ : float\n Score of the training dataset obtained using an out-of-bag estimate.\n This attribute exists only when ``oob_score`` is True.\n\n oob_prediction_ : ndarray of shape (n_samples,) or (n_samples, n_outputs)\n Prediction computed with out-of-bag estimate on the training set.\n This attribute exists only when ``oob_score`` is True.\n\n See Also\n --------\n sklearn.tree.DecisionTreeRegressor : A decision tree regressor.\n sklearn.ensemble.ExtraTreesRegressor : Ensemble of extremely randomized\n tree regressors.\n\n Notes\n -----\n The default values for the parameters controlling the size of the trees\n (e.g. ``max_depth``, ``min_samples_leaf``, etc.) lead to fully grown and\n unpruned trees which can potentially be very large on some data sets. To\n reduce memory consumption, the complexity and size of the trees should be\n controlled by setting those parameter values.\n\n The features are always randomly permuted at each split. Therefore,\n the best found split may vary, even with the same training data,\n ``max_features=n_features`` and ``bootstrap=False``, if the improvement\n of the criterion is identical for several splits enumerated during the\n search of the best split. To obtain a deterministic behaviour during\n fitting, ``random_state`` has to be fixed.\n\n The default value ``max_features=\"auto\"`` uses ``n_features``\n rather than ``n_features / 3``. The latter was originally suggested in\n [1], whereas the former was more recently justified empirically in [2].\n\n References\n ----------\n .. [1] L. Breiman, \"Random Forests\", Machine Learning, 45(1), 5-32, 2001.\n\n .. [2] P. Geurts, D. Ernst., and L. Wehenkel, \"Extremely randomized\n trees\", Machine Learning, 63(1), 3-42, 2006.\n\n Examples\n --------\n >>> from sklearn.ensemble import RandomForestRegressor\n >>> from sklearn.datasets import make_regression\n >>> X, y = make_regression(n_features=4, n_informative=2,\n ... 
random_state=0, shuffle=False)\n >>> regr = RandomForestRegressor(max_depth=2, random_state=0)\n >>> regr.fit(X, y)\n RandomForestRegressor(...)\n >>> print(regr.predict([[0, 0, 0, 0]]))\n [-8.32987858]\n \"\"\"\n \n def __init__(self, n_estimators=100, *, criterion='squared_error', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0, bootstrap=True, oob_score=False, n_jobs=None, random_state=None, verbose=0, warm_start=False, ccp_alpha=0.0, max_samples=None):\n super().__init__(base_estimator=DecisionTreeRegressor(), n_estimators=n_estimators, estimator_params=('criterion', 'max_depth', 'min_samples_split', 'min_samples_leaf', 'min_weight_fraction_leaf', 'max_features', 'max_leaf_nodes', 'min_impurity_decrease', 'random_state', 'ccp_alpha'), bootstrap=bootstrap, oob_score=oob_score, n_jobs=n_jobs, random_state=random_state, verbose=verbose, warm_start=warm_start, max_samples=max_samples)\n self.criterion = criterion\n self.max_depth = max_depth\n self.min_samples_split = min_samples_split\n self.min_samples_leaf = min_samples_leaf\n self.min_weight_fraction_leaf = min_weight_fraction_leaf\n self.max_features = max_features\n self.max_leaf_nodes = max_leaf_nodes\n self.min_impurity_decrease = min_impurity_decrease\n self.ccp_alpha = ccp_alpha\n" + "description": "A random forest regressor.\n\nA random forest is a meta estimator that fits a number of classifying\ndecision trees on various sub-samples of the dataset and uses averaging\nto improve the predictive accuracy and control over-fitting.\nThe sub-sample size is controlled with the `max_samples` parameter if\n`bootstrap=True` (default), otherwise the whole dataset is used to build\neach tree.\n\nRead more in the :ref:`User Guide `.", + "docstring": "\n A random forest regressor.\n\n A random forest is a meta estimator that fits a number of classifying\n decision trees on various sub-samples of the dataset and uses averaging\n to improve the predictive accuracy and control over-fitting.\n The sub-sample size is controlled with the `max_samples` parameter if\n `bootstrap=True` (default), otherwise the whole dataset is used to build\n each tree.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n n_estimators : int, default=100\n The number of trees in the forest.\n\n .. versionchanged:: 0.22\n The default value of ``n_estimators`` changed from 10 to 100\n in 0.22.\n\n criterion : {\"squared_error\", \"absolute_error\", \"poisson\"}, default=\"squared_error\"\n The function to measure the quality of a split. Supported criteria\n are \"squared_error\" for the mean squared error, which is equal to\n variance reduction as feature selection criterion, \"absolute_error\"\n for the mean absolute error, and \"poisson\" which uses reduction in\n Poisson deviance to find splits.\n Training using \"absolute_error\" is significantly slower\n than when using \"squared_error\".\n\n .. versionadded:: 0.18\n Mean Absolute Error (MAE) criterion.\n\n .. versionadded:: 1.0\n Poisson criterion.\n\n .. deprecated:: 1.0\n Criterion \"mse\" was deprecated in v1.0 and will be removed in\n version 1.2. Use `criterion=\"squared_error\"` which is equivalent.\n\n .. deprecated:: 1.0\n Criterion \"mae\" was deprecated in v1.0 and will be removed in\n version 1.2. Use `criterion=\"absolute_error\"` which is equivalent.\n\n max_depth : int, default=None\n The maximum depth of the tree. 
If None, then nodes are expanded until\n all leaves are pure or until all leaves contain less than\n min_samples_split samples.\n\n min_samples_split : int or float, default=2\n The minimum number of samples required to split an internal node:\n\n - If int, then consider `min_samples_split` as the minimum number.\n - If float, then `min_samples_split` is a fraction and\n `ceil(min_samples_split * n_samples)` are the minimum\n number of samples for each split.\n\n .. versionchanged:: 0.18\n Added float values for fractions.\n\n min_samples_leaf : int or float, default=1\n The minimum number of samples required to be at a leaf node.\n A split point at any depth will only be considered if it leaves at\n least ``min_samples_leaf`` training samples in each of the left and\n right branches. This may have the effect of smoothing the model,\n especially in regression.\n\n - If int, then consider `min_samples_leaf` as the minimum number.\n - If float, then `min_samples_leaf` is a fraction and\n `ceil(min_samples_leaf * n_samples)` are the minimum\n number of samples for each node.\n\n .. versionchanged:: 0.18\n Added float values for fractions.\n\n min_weight_fraction_leaf : float, default=0.0\n The minimum weighted fraction of the sum total of weights (of all\n the input samples) required to be at a leaf node. Samples have\n equal weight when sample_weight is not provided.\n\n max_features : {\"auto\", \"sqrt\", \"log2\"}, int or float, default=\"auto\"\n The number of features to consider when looking for the best split:\n\n - If int, then consider `max_features` features at each split.\n - If float, then `max_features` is a fraction and\n `round(max_features * n_features)` features are considered at each\n split.\n - If \"auto\", then `max_features=n_features`.\n - If \"sqrt\", then `max_features=sqrt(n_features)`.\n - If \"log2\", then `max_features=log2(n_features)`.\n - If None, then `max_features=n_features`.\n\n Note: the search for a split does not stop until at least one\n valid partition of the node samples is found, even if it requires to\n effectively inspect more than ``max_features`` features.\n\n max_leaf_nodes : int, default=None\n Grow trees with ``max_leaf_nodes`` in best-first fashion.\n Best nodes are defined as relative reduction in impurity.\n If None then unlimited number of leaf nodes.\n\n min_impurity_decrease : float, default=0.0\n A node will be split if this split induces a decrease of the impurity\n greater than or equal to this value.\n\n The weighted impurity decrease equation is the following::\n\n N_t / N * (impurity - N_t_R / N_t * right_impurity\n - N_t_L / N_t * left_impurity)\n\n where ``N`` is the total number of samples, ``N_t`` is the number of\n samples at the current node, ``N_t_L`` is the number of samples in the\n left child, and ``N_t_R`` is the number of samples in the right child.\n\n ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum,\n if ``sample_weight`` is passed.\n\n .. versionadded:: 0.19\n\n bootstrap : bool, default=True\n Whether bootstrap samples are used when building trees. If False, the\n whole dataset is used to build each tree.\n\n oob_score : bool, default=False\n Whether to use out-of-bag samples to estimate the generalization score.\n Only available if bootstrap=True.\n\n n_jobs : int, default=None\n The number of jobs to run in parallel. :meth:`fit`, :meth:`predict`,\n :meth:`decision_path` and :meth:`apply` are all parallelized over the\n trees. 
``None`` means 1 unless in a :obj:`joblib.parallel_backend`\n context. ``-1`` means using all processors. See :term:`Glossary\n ` for more details.\n\n random_state : int, RandomState instance or None, default=None\n Controls both the randomness of the bootstrapping of the samples used\n when building trees (if ``bootstrap=True``) and the sampling of the\n features to consider when looking for the best split at each node\n (if ``max_features < n_features``).\n See :term:`Glossary ` for details.\n\n verbose : int, default=0\n Controls the verbosity when fitting and predicting.\n\n warm_start : bool, default=False\n When set to ``True``, reuse the solution of the previous call to fit\n and add more estimators to the ensemble, otherwise, just fit a whole\n new forest. See :term:`the Glossary `.\n\n ccp_alpha : non-negative float, default=0.0\n Complexity parameter used for Minimal Cost-Complexity Pruning. The\n subtree with the largest cost complexity that is smaller than\n ``ccp_alpha`` will be chosen. By default, no pruning is performed. See\n :ref:`minimal_cost_complexity_pruning` for details.\n\n .. versionadded:: 0.22\n\n max_samples : int or float, default=None\n If bootstrap is True, the number of samples to draw from X\n to train each base estimator.\n\n - If None (default), then draw `X.shape[0]` samples.\n - If int, then draw `max_samples` samples.\n - If float, then draw `max_samples * X.shape[0]` samples. Thus,\n `max_samples` should be in the interval `(0.0, 1.0]`.\n\n .. versionadded:: 0.22\n\n Attributes\n ----------\n base_estimator_ : DecisionTreeRegressor\n The child estimator template used to create the collection of fitted\n sub-estimators.\n\n estimators_ : list of DecisionTreeRegressor\n The collection of fitted sub-estimators.\n\n feature_importances_ : ndarray of shape (n_features,)\n The impurity-based feature importances.\n The higher, the more important the feature.\n The importance of a feature is computed as the (normalized)\n total reduction of the criterion brought by that feature. It is also\n known as the Gini importance.\n\n Warning: impurity-based feature importances can be misleading for\n high cardinality features (many unique values). See\n :func:`sklearn.inspection.permutation_importance` as an alternative.\n\n n_features_ : int\n The number of features when ``fit`` is performed.\n\n .. deprecated:: 1.0\n Attribute `n_features_` was deprecated in version 1.0 and will be\n removed in 1.2. Use `n_features_in_` instead.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n n_outputs_ : int\n The number of outputs when ``fit`` is performed.\n\n oob_score_ : float\n Score of the training dataset obtained using an out-of-bag estimate.\n This attribute exists only when ``oob_score`` is True.\n\n oob_prediction_ : ndarray of shape (n_samples,) or (n_samples, n_outputs)\n Prediction computed with out-of-bag estimate on the training set.\n This attribute exists only when ``oob_score`` is True.\n\n See Also\n --------\n sklearn.tree.DecisionTreeRegressor : A decision tree regressor.\n sklearn.ensemble.ExtraTreesRegressor : Ensemble of extremely randomized\n tree regressors.\n\n Notes\n -----\n The default values for the parameters controlling the size of the trees\n (e.g. ``max_depth``, ``min_samples_leaf``, etc.) 
lead to fully grown and\n unpruned trees which can potentially be very large on some data sets. To\n reduce memory consumption, the complexity and size of the trees should be\n controlled by setting those parameter values.\n\n The features are always randomly permuted at each split. Therefore,\n the best found split may vary, even with the same training data,\n ``max_features=n_features`` and ``bootstrap=False``, if the improvement\n of the criterion is identical for several splits enumerated during the\n search of the best split. To obtain a deterministic behaviour during\n fitting, ``random_state`` has to be fixed.\n\n The default value ``max_features=\"auto\"`` uses ``n_features``\n rather than ``n_features / 3``. The latter was originally suggested in\n [1], whereas the former was more recently justified empirically in [2].\n\n References\n ----------\n .. [1] L. Breiman, \"Random Forests\", Machine Learning, 45(1), 5-32, 2001.\n\n .. [2] P. Geurts, D. Ernst., and L. Wehenkel, \"Extremely randomized\n trees\", Machine Learning, 63(1), 3-42, 2006.\n\n Examples\n --------\n >>> from sklearn.ensemble import RandomForestRegressor\n >>> from sklearn.datasets import make_regression\n >>> X, y = make_regression(n_features=4, n_informative=2,\n ... random_state=0, shuffle=False)\n >>> regr = RandomForestRegressor(max_depth=2, random_state=0)\n >>> regr.fit(X, y)\n RandomForestRegressor(...)\n >>> print(regr.predict([[0, 0, 0, 0]]))\n [-8.32987858]\n ", + "source_code": "\n\nclass RandomForestRegressor(ForestRegressor):\n \"\"\"\n A random forest regressor.\n\n A random forest is a meta estimator that fits a number of classifying\n decision trees on various sub-samples of the dataset and uses averaging\n to improve the predictive accuracy and control over-fitting.\n The sub-sample size is controlled with the `max_samples` parameter if\n `bootstrap=True` (default), otherwise the whole dataset is used to build\n each tree.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n n_estimators : int, default=100\n The number of trees in the forest.\n\n .. versionchanged:: 0.22\n The default value of ``n_estimators`` changed from 10 to 100\n in 0.22.\n\n criterion : {\"squared_error\", \"absolute_error\", \"poisson\"}, default=\"squared_error\"\n The function to measure the quality of a split. Supported criteria\n are \"squared_error\" for the mean squared error, which is equal to\n variance reduction as feature selection criterion, \"absolute_error\"\n for the mean absolute error, and \"poisson\" which uses reduction in\n Poisson deviance to find splits.\n Training using \"absolute_error\" is significantly slower\n than when using \"squared_error\".\n\n .. versionadded:: 0.18\n Mean Absolute Error (MAE) criterion.\n\n .. versionadded:: 1.0\n Poisson criterion.\n\n .. deprecated:: 1.0\n Criterion \"mse\" was deprecated in v1.0 and will be removed in\n version 1.2. Use `criterion=\"squared_error\"` which is equivalent.\n\n .. deprecated:: 1.0\n Criterion \"mae\" was deprecated in v1.0 and will be removed in\n version 1.2. Use `criterion=\"absolute_error\"` which is equivalent.\n\n max_depth : int, default=None\n The maximum depth of the tree. 
If None, then nodes are expanded until\n all leaves are pure or until all leaves contain less than\n min_samples_split samples.\n\n min_samples_split : int or float, default=2\n The minimum number of samples required to split an internal node:\n\n - If int, then consider `min_samples_split` as the minimum number.\n - If float, then `min_samples_split` is a fraction and\n `ceil(min_samples_split * n_samples)` are the minimum\n number of samples for each split.\n\n .. versionchanged:: 0.18\n Added float values for fractions.\n\n min_samples_leaf : int or float, default=1\n The minimum number of samples required to be at a leaf node.\n A split point at any depth will only be considered if it leaves at\n least ``min_samples_leaf`` training samples in each of the left and\n right branches. This may have the effect of smoothing the model,\n especially in regression.\n\n - If int, then consider `min_samples_leaf` as the minimum number.\n - If float, then `min_samples_leaf` is a fraction and\n `ceil(min_samples_leaf * n_samples)` are the minimum\n number of samples for each node.\n\n .. versionchanged:: 0.18\n Added float values for fractions.\n\n min_weight_fraction_leaf : float, default=0.0\n The minimum weighted fraction of the sum total of weights (of all\n the input samples) required to be at a leaf node. Samples have\n equal weight when sample_weight is not provided.\n\n max_features : {\"auto\", \"sqrt\", \"log2\"}, int or float, default=\"auto\"\n The number of features to consider when looking for the best split:\n\n - If int, then consider `max_features` features at each split.\n - If float, then `max_features` is a fraction and\n `round(max_features * n_features)` features are considered at each\n split.\n - If \"auto\", then `max_features=n_features`.\n - If \"sqrt\", then `max_features=sqrt(n_features)`.\n - If \"log2\", then `max_features=log2(n_features)`.\n - If None, then `max_features=n_features`.\n\n Note: the search for a split does not stop until at least one\n valid partition of the node samples is found, even if it requires to\n effectively inspect more than ``max_features`` features.\n\n max_leaf_nodes : int, default=None\n Grow trees with ``max_leaf_nodes`` in best-first fashion.\n Best nodes are defined as relative reduction in impurity.\n If None then unlimited number of leaf nodes.\n\n min_impurity_decrease : float, default=0.0\n A node will be split if this split induces a decrease of the impurity\n greater than or equal to this value.\n\n The weighted impurity decrease equation is the following::\n\n N_t / N * (impurity - N_t_R / N_t * right_impurity\n - N_t_L / N_t * left_impurity)\n\n where ``N`` is the total number of samples, ``N_t`` is the number of\n samples at the current node, ``N_t_L`` is the number of samples in the\n left child, and ``N_t_R`` is the number of samples in the right child.\n\n ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum,\n if ``sample_weight`` is passed.\n\n .. versionadded:: 0.19\n\n bootstrap : bool, default=True\n Whether bootstrap samples are used when building trees. If False, the\n whole dataset is used to build each tree.\n\n oob_score : bool, default=False\n Whether to use out-of-bag samples to estimate the generalization score.\n Only available if bootstrap=True.\n\n n_jobs : int, default=None\n The number of jobs to run in parallel. :meth:`fit`, :meth:`predict`,\n :meth:`decision_path` and :meth:`apply` are all parallelized over the\n trees. 
``None`` means 1 unless in a :obj:`joblib.parallel_backend`\n context. ``-1`` means using all processors. See :term:`Glossary\n ` for more details.\n\n random_state : int, RandomState instance or None, default=None\n Controls both the randomness of the bootstrapping of the samples used\n when building trees (if ``bootstrap=True``) and the sampling of the\n features to consider when looking for the best split at each node\n (if ``max_features < n_features``).\n See :term:`Glossary ` for details.\n\n verbose : int, default=0\n Controls the verbosity when fitting and predicting.\n\n warm_start : bool, default=False\n When set to ``True``, reuse the solution of the previous call to fit\n and add more estimators to the ensemble, otherwise, just fit a whole\n new forest. See :term:`the Glossary `.\n\n ccp_alpha : non-negative float, default=0.0\n Complexity parameter used for Minimal Cost-Complexity Pruning. The\n subtree with the largest cost complexity that is smaller than\n ``ccp_alpha`` will be chosen. By default, no pruning is performed. See\n :ref:`minimal_cost_complexity_pruning` for details.\n\n .. versionadded:: 0.22\n\n max_samples : int or float, default=None\n If bootstrap is True, the number of samples to draw from X\n to train each base estimator.\n\n - If None (default), then draw `X.shape[0]` samples.\n - If int, then draw `max_samples` samples.\n - If float, then draw `max_samples * X.shape[0]` samples. Thus,\n `max_samples` should be in the interval `(0.0, 1.0]`.\n\n .. versionadded:: 0.22\n\n Attributes\n ----------\n base_estimator_ : DecisionTreeRegressor\n The child estimator template used to create the collection of fitted\n sub-estimators.\n\n estimators_ : list of DecisionTreeRegressor\n The collection of fitted sub-estimators.\n\n feature_importances_ : ndarray of shape (n_features,)\n The impurity-based feature importances.\n The higher, the more important the feature.\n The importance of a feature is computed as the (normalized)\n total reduction of the criterion brought by that feature. It is also\n known as the Gini importance.\n\n Warning: impurity-based feature importances can be misleading for\n high cardinality features (many unique values). See\n :func:`sklearn.inspection.permutation_importance` as an alternative.\n\n n_features_ : int\n The number of features when ``fit`` is performed.\n\n .. deprecated:: 1.0\n Attribute `n_features_` was deprecated in version 1.0 and will be\n removed in 1.2. Use `n_features_in_` instead.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n n_outputs_ : int\n The number of outputs when ``fit`` is performed.\n\n oob_score_ : float\n Score of the training dataset obtained using an out-of-bag estimate.\n This attribute exists only when ``oob_score`` is True.\n\n oob_prediction_ : ndarray of shape (n_samples,) or (n_samples, n_outputs)\n Prediction computed with out-of-bag estimate on the training set.\n This attribute exists only when ``oob_score`` is True.\n\n See Also\n --------\n sklearn.tree.DecisionTreeRegressor : A decision tree regressor.\n sklearn.ensemble.ExtraTreesRegressor : Ensemble of extremely randomized\n tree regressors.\n\n Notes\n -----\n The default values for the parameters controlling the size of the trees\n (e.g. ``max_depth``, ``min_samples_leaf``, etc.) 
lead to fully grown and\n unpruned trees which can potentially be very large on some data sets. To\n reduce memory consumption, the complexity and size of the trees should be\n controlled by setting those parameter values.\n\n The features are always randomly permuted at each split. Therefore,\n the best found split may vary, even with the same training data,\n ``max_features=n_features`` and ``bootstrap=False``, if the improvement\n of the criterion is identical for several splits enumerated during the\n search of the best split. To obtain a deterministic behaviour during\n fitting, ``random_state`` has to be fixed.\n\n The default value ``max_features=\"auto\"`` uses ``n_features``\n rather than ``n_features / 3``. The latter was originally suggested in\n [1], whereas the former was more recently justified empirically in [2].\n\n References\n ----------\n .. [1] L. Breiman, \"Random Forests\", Machine Learning, 45(1), 5-32, 2001.\n\n .. [2] P. Geurts, D. Ernst., and L. Wehenkel, \"Extremely randomized\n trees\", Machine Learning, 63(1), 3-42, 2006.\n\n Examples\n --------\n >>> from sklearn.ensemble import RandomForestRegressor\n >>> from sklearn.datasets import make_regression\n >>> X, y = make_regression(n_features=4, n_informative=2,\n ... random_state=0, shuffle=False)\n >>> regr = RandomForestRegressor(max_depth=2, random_state=0)\n >>> regr.fit(X, y)\n RandomForestRegressor(...)\n >>> print(regr.predict([[0, 0, 0, 0]]))\n [-8.32987858]\n \"\"\"\n \n def __init__(self, n_estimators=100, *, criterion='squared_error', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0, bootstrap=True, oob_score=False, n_jobs=None, random_state=None, verbose=0, warm_start=False, ccp_alpha=0.0, max_samples=None):\n super().__init__(base_estimator=DecisionTreeRegressor(), n_estimators=n_estimators, estimator_params=('criterion', 'max_depth', 'min_samples_split', 'min_samples_leaf', 'min_weight_fraction_leaf', 'max_features', 'max_leaf_nodes', 'min_impurity_decrease', 'random_state', 'ccp_alpha'), bootstrap=bootstrap, oob_score=oob_score, n_jobs=n_jobs, random_state=random_state, verbose=verbose, warm_start=warm_start, max_samples=max_samples)\n self.criterion = criterion\n self.max_depth = max_depth\n self.min_samples_split = min_samples_split\n self.min_samples_leaf = min_samples_leaf\n self.min_weight_fraction_leaf = min_weight_fraction_leaf\n self.max_features = max_features\n self.max_leaf_nodes = max_leaf_nodes\n self.min_impurity_decrease = min_impurity_decrease\n self.ccp_alpha = ccp_alpha\n" }, { "name": "RandomTreesEmbedding", @@ -21002,9 +21068,9 @@ "sklearn.ensemble._forest.RandomTreesEmbedding.transform" ], "is_public": true, - "description": "An ensemble of totally random trees.\n\nAn unsupervised transformation of a dataset to a high-dimensional sparse representation. A datapoint is coded according to which leaf of each tree it is sorted into. Using a one-hot encoding of the leaves, this leads to a binary coding with as many ones as there are trees in the forest. The dimensionality of the resulting representation is ``n_out <= n_estimators * max_leaf_nodes``. If ``max_leaf_nodes == None``, the number of leaf nodes is at most ``n_estimators * 2 ** max_depth``. Read more in the :ref:`User Guide `.", - "docstring": "\n An ensemble of totally random trees.\n\n An unsupervised transformation of a dataset to a high-dimensional\n sparse representation. 
A datapoint is coded according to which leaf of\n each tree it is sorted into. Using a one-hot encoding of the leaves,\n this leads to a binary coding with as many ones as there are trees in\n the forest.\n\n The dimensionality of the resulting representation is\n ``n_out <= n_estimators * max_leaf_nodes``. If ``max_leaf_nodes == None``,\n the number of leaf nodes is at most ``n_estimators * 2 ** max_depth``.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n n_estimators : int, default=100\n Number of trees in the forest.\n\n .. versionchanged:: 0.22\n The default value of ``n_estimators`` changed from 10 to 100\n in 0.22.\n\n max_depth : int, default=5\n The maximum depth of each tree. If None, then nodes are expanded until\n all leaves are pure or until all leaves contain less than\n min_samples_split samples.\n\n min_samples_split : int or float, default=2\n The minimum number of samples required to split an internal node:\n\n - If int, then consider `min_samples_split` as the minimum number.\n - If float, then `min_samples_split` is a fraction and\n `ceil(min_samples_split * n_samples)` is the minimum\n number of samples for each split.\n\n .. versionchanged:: 0.18\n Added float values for fractions.\n\n min_samples_leaf : int or float, default=1\n The minimum number of samples required to be at a leaf node.\n A split point at any depth will only be considered if it leaves at\n least ``min_samples_leaf`` training samples in each of the left and\n right branches. This may have the effect of smoothing the model,\n especially in regression.\n\n - If int, then consider `min_samples_leaf` as the minimum number.\n - If float, then `min_samples_leaf` is a fraction and\n `ceil(min_samples_leaf * n_samples)` is the minimum\n number of samples for each node.\n\n .. versionchanged:: 0.18\n Added float values for fractions.\n\n min_weight_fraction_leaf : float, default=0.0\n The minimum weighted fraction of the sum total of weights (of all\n the input samples) required to be at a leaf node. Samples have\n equal weight when sample_weight is not provided.\n\n max_leaf_nodes : int, default=None\n Grow trees with ``max_leaf_nodes`` in best-first fashion.\n Best nodes are defined as relative reduction in impurity.\n If None then unlimited number of leaf nodes.\n\n min_impurity_decrease : float, default=0.0\n A node will be split if this split induces a decrease of the impurity\n greater than or equal to this value.\n\n The weighted impurity decrease equation is the following::\n\n N_t / N * (impurity - N_t_R / N_t * right_impurity\n - N_t_L / N_t * left_impurity)\n\n where ``N`` is the total number of samples, ``N_t`` is the number of\n samples at the current node, ``N_t_L`` is the number of samples in the\n left child, and ``N_t_R`` is the number of samples in the right child.\n\n ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum,\n if ``sample_weight`` is passed.\n\n .. versionadded:: 0.19\n\n sparse_output : bool, default=True\n Whether or not to return a sparse CSR matrix, as default behavior,\n or to return a dense array compatible with dense pipeline operators.\n\n n_jobs : int, default=None\n The number of jobs to run in parallel. :meth:`fit`, :meth:`transform`,\n :meth:`decision_path` and :meth:`apply` are all parallelized over the\n trees. ``None`` means 1 unless in a :obj:`joblib.parallel_backend`\n context. ``-1`` means using all processors. 
See :term:`Glossary\n ` for more details.\n\n random_state : int, RandomState instance or None, default=None\n Controls the generation of the random `y` used to fit the trees\n and the draw of the splits for each feature at the trees' nodes.\n See :term:`Glossary ` for details.\n\n verbose : int, default=0\n Controls the verbosity when fitting and predicting.\n\n warm_start : bool, default=False\n When set to ``True``, reuse the solution of the previous call to fit\n and add more estimators to the ensemble, otherwise, just fit a whole\n new forest. See :term:`the Glossary `.\n\n Attributes\n ----------\n base_estimator_ : :class:`~sklearn.tree.ExtraTreeClassifier` instance\n The child estimator template used to create the collection of fitted\n sub-estimators.\n\n estimators_ : list of :class:`~sklearn.tree.ExtraTreeClassifier` instances\n The collection of fitted sub-estimators.\n\n feature_importances_ : ndarray of shape (n_features,)\n The feature importances (the higher, the more important the feature).\n\n n_features_ : int\n The number of features when ``fit`` is performed.\n\n .. deprecated:: 1.0\n Attribute `n_features_` was deprecated in version 1.0 and will be\n removed in 1.2. Use `n_features_in_` instead.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n .. versionadded:: 1.0\n\n n_outputs_ : int\n The number of outputs when ``fit`` is performed.\n\n one_hot_encoder_ : OneHotEncoder instance\n One-hot encoder used to create the sparse embedding.\n\n See Also\n --------\n ExtraTreesClassifier : An extra-trees classifier.\n ExtraTreesRegressor : An extra-trees regressor.\n RandomForestClassifier : A random forest classifier.\n RandomForestRegressor : A random forest regressor.\n sklearn.tree.ExtraTreeClassifier: An extremely randomized\n tree classifier.\n sklearn.tree.ExtraTreeRegressor : An extremely randomized\n tree regressor.\n\n References\n ----------\n .. [1] P. Geurts, D. Ernst., and L. Wehenkel, \"Extremely randomized trees\",\n Machine Learning, 63(1), 3-42, 2006.\n .. [2] Moosmann, F. and Triggs, B. and Jurie, F. \"Fast discriminative\n visual codebooks using randomized clustering forests\"\n NIPS 2007\n\n Examples\n --------\n >>> from sklearn.ensemble import RandomTreesEmbedding\n >>> X = [[0,0], [1,0], [0,1], [-1,0], [0,-1]]\n >>> random_trees = RandomTreesEmbedding(\n ... n_estimators=5, random_state=0, max_depth=1).fit(X)\n >>> X_sparse_embedding = random_trees.transform(X)\n >>> X_sparse_embedding.toarray()\n array([[0., 1., 1., 0., 1., 0., 0., 1., 1., 0.],\n [0., 1., 1., 0., 1., 0., 0., 1., 1., 0.],\n [0., 1., 0., 1., 0., 1., 0., 1., 0., 1.],\n [1., 0., 1., 0., 1., 0., 1., 0., 1., 0.],\n [0., 1., 1., 0., 1., 0., 0., 1., 1., 0.]])\n ", - "source_code": "\n\nclass RandomTreesEmbedding(BaseForest):\n \"\"\"\n An ensemble of totally random trees.\n\n An unsupervised transformation of a dataset to a high-dimensional\n sparse representation. A datapoint is coded according to which leaf of\n each tree it is sorted into. Using a one-hot encoding of the leaves,\n this leads to a binary coding with as many ones as there are trees in\n the forest.\n\n The dimensionality of the resulting representation is\n ``n_out <= n_estimators * max_leaf_nodes``. 
If ``max_leaf_nodes == None``,\n the number of leaf nodes is at most ``n_estimators * 2 ** max_depth``.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n n_estimators : int, default=100\n Number of trees in the forest.\n\n .. versionchanged:: 0.22\n The default value of ``n_estimators`` changed from 10 to 100\n in 0.22.\n\n max_depth : int, default=5\n The maximum depth of each tree. If None, then nodes are expanded until\n all leaves are pure or until all leaves contain less than\n min_samples_split samples.\n\n min_samples_split : int or float, default=2\n The minimum number of samples required to split an internal node:\n\n - If int, then consider `min_samples_split` as the minimum number.\n - If float, then `min_samples_split` is a fraction and\n `ceil(min_samples_split * n_samples)` is the minimum\n number of samples for each split.\n\n .. versionchanged:: 0.18\n Added float values for fractions.\n\n min_samples_leaf : int or float, default=1\n The minimum number of samples required to be at a leaf node.\n A split point at any depth will only be considered if it leaves at\n least ``min_samples_leaf`` training samples in each of the left and\n right branches. This may have the effect of smoothing the model,\n especially in regression.\n\n - If int, then consider `min_samples_leaf` as the minimum number.\n - If float, then `min_samples_leaf` is a fraction and\n `ceil(min_samples_leaf * n_samples)` is the minimum\n number of samples for each node.\n\n .. versionchanged:: 0.18\n Added float values for fractions.\n\n min_weight_fraction_leaf : float, default=0.0\n The minimum weighted fraction of the sum total of weights (of all\n the input samples) required to be at a leaf node. Samples have\n equal weight when sample_weight is not provided.\n\n max_leaf_nodes : int, default=None\n Grow trees with ``max_leaf_nodes`` in best-first fashion.\n Best nodes are defined as relative reduction in impurity.\n If None then unlimited number of leaf nodes.\n\n min_impurity_decrease : float, default=0.0\n A node will be split if this split induces a decrease of the impurity\n greater than or equal to this value.\n\n The weighted impurity decrease equation is the following::\n\n N_t / N * (impurity - N_t_R / N_t * right_impurity\n - N_t_L / N_t * left_impurity)\n\n where ``N`` is the total number of samples, ``N_t`` is the number of\n samples at the current node, ``N_t_L`` is the number of samples in the\n left child, and ``N_t_R`` is the number of samples in the right child.\n\n ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum,\n if ``sample_weight`` is passed.\n\n .. versionadded:: 0.19\n\n sparse_output : bool, default=True\n Whether or not to return a sparse CSR matrix, as default behavior,\n or to return a dense array compatible with dense pipeline operators.\n\n n_jobs : int, default=None\n The number of jobs to run in parallel. :meth:`fit`, :meth:`transform`,\n :meth:`decision_path` and :meth:`apply` are all parallelized over the\n trees. ``None`` means 1 unless in a :obj:`joblib.parallel_backend`\n context. ``-1`` means using all processors. 
See :term:`Glossary\n ` for more details.\n\n random_state : int, RandomState instance or None, default=None\n Controls the generation of the random `y` used to fit the trees\n and the draw of the splits for each feature at the trees' nodes.\n See :term:`Glossary ` for details.\n\n verbose : int, default=0\n Controls the verbosity when fitting and predicting.\n\n warm_start : bool, default=False\n When set to ``True``, reuse the solution of the previous call to fit\n and add more estimators to the ensemble, otherwise, just fit a whole\n new forest. See :term:`the Glossary `.\n\n Attributes\n ----------\n base_estimator_ : :class:`~sklearn.tree.ExtraTreeClassifier` instance\n The child estimator template used to create the collection of fitted\n sub-estimators.\n\n estimators_ : list of :class:`~sklearn.tree.ExtraTreeClassifier` instances\n The collection of fitted sub-estimators.\n\n feature_importances_ : ndarray of shape (n_features,)\n The feature importances (the higher, the more important the feature).\n\n n_features_ : int\n The number of features when ``fit`` is performed.\n\n .. deprecated:: 1.0\n Attribute `n_features_` was deprecated in version 1.0 and will be\n removed in 1.2. Use `n_features_in_` instead.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n .. versionadded:: 1.0\n\n n_outputs_ : int\n The number of outputs when ``fit`` is performed.\n\n one_hot_encoder_ : OneHotEncoder instance\n One-hot encoder used to create the sparse embedding.\n\n See Also\n --------\n ExtraTreesClassifier : An extra-trees classifier.\n ExtraTreesRegressor : An extra-trees regressor.\n RandomForestClassifier : A random forest classifier.\n RandomForestRegressor : A random forest regressor.\n sklearn.tree.ExtraTreeClassifier: An extremely randomized\n tree classifier.\n sklearn.tree.ExtraTreeRegressor : An extremely randomized\n tree regressor.\n\n References\n ----------\n .. [1] P. Geurts, D. Ernst., and L. Wehenkel, \"Extremely randomized trees\",\n Machine Learning, 63(1), 3-42, 2006.\n .. [2] Moosmann, F. and Triggs, B. and Jurie, F. \"Fast discriminative\n visual codebooks using randomized clustering forests\"\n NIPS 2007\n\n Examples\n --------\n >>> from sklearn.ensemble import RandomTreesEmbedding\n >>> X = [[0,0], [1,0], [0,1], [-1,0], [0,-1]]\n >>> random_trees = RandomTreesEmbedding(\n ... 
n_estimators=5, random_state=0, max_depth=1).fit(X)\n >>> X_sparse_embedding = random_trees.transform(X)\n >>> X_sparse_embedding.toarray()\n array([[0., 1., 1., 0., 1., 0., 0., 1., 1., 0.],\n [0., 1., 1., 0., 1., 0., 0., 1., 1., 0.],\n [0., 1., 0., 1., 0., 1., 0., 1., 0., 1.],\n [1., 0., 1., 0., 1., 0., 1., 0., 1., 0.],\n [0., 1., 1., 0., 1., 0., 0., 1., 1., 0.]])\n \"\"\"\n criterion = 'squared_error'\n max_features = 1\n \n def __init__(self, n_estimators=100, *, max_depth=5, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_leaf_nodes=None, min_impurity_decrease=0.0, sparse_output=True, n_jobs=None, random_state=None, verbose=0, warm_start=False):\n super().__init__(base_estimator=ExtraTreeRegressor(), n_estimators=n_estimators, estimator_params=('criterion', 'max_depth', 'min_samples_split', 'min_samples_leaf', 'min_weight_fraction_leaf', 'max_features', 'max_leaf_nodes', 'min_impurity_decrease', 'random_state'), bootstrap=False, oob_score=False, n_jobs=n_jobs, random_state=random_state, verbose=verbose, warm_start=warm_start, max_samples=None)\n self.max_depth = max_depth\n self.min_samples_split = min_samples_split\n self.min_samples_leaf = min_samples_leaf\n self.min_weight_fraction_leaf = min_weight_fraction_leaf\n self.max_leaf_nodes = max_leaf_nodes\n self.min_impurity_decrease = min_impurity_decrease\n self.sparse_output = sparse_output\n \n def _set_oob_score_and_attributes(self, X, y):\n raise NotImplementedError('OOB score not supported by tree embedding')\n \n def fit(self, X, y=None, sample_weight=None):\n \"\"\"\n Fit estimator.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input samples. Use ``dtype=np.float32`` for maximum\n efficiency. Sparse matrices are also supported, use sparse\n ``csc_matrix`` for maximum efficiency.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights. If None, then samples are equally weighted. Splits\n that would create child nodes with net zero or negative weight are\n ignored while searching for a split in each node. In the case of\n classification, splits are also ignored if they would result in any\n single class carrying a negative weight in either child node.\n\n Returns\n -------\n self : object\n Returns the instance itself.\n \"\"\"\n self.fit_transform(X, y, sample_weight=sample_weight)\n return self\n \n def fit_transform(self, X, y=None, sample_weight=None):\n \"\"\"\n Fit estimator and transform dataset.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Input data used to build forests. Use ``dtype=np.float32`` for\n maximum efficiency.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights. If None, then samples are equally weighted. Splits\n that would create child nodes with net zero or negative weight are\n ignored while searching for a split in each node. 
In the case of\n classification, splits are also ignored if they would result in any\n single class carrying a negative weight in either child node.\n\n Returns\n -------\n X_transformed : sparse matrix of shape (n_samples, n_out)\n Transformed dataset.\n \"\"\"\n rnd = check_random_state(self.random_state)\n y = rnd.uniform(size=_num_samples(X))\n super().fit(X, y, sample_weight=sample_weight)\n self.one_hot_encoder_ = OneHotEncoder(sparse=self.sparse_output)\n return self.one_hot_encoder_.fit_transform(self.apply(X))\n \n def transform(self, X):\n \"\"\"\n Transform dataset.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Input data to be transformed. Use ``dtype=np.float32`` for maximum\n efficiency. Sparse matrices are also supported, use sparse\n ``csr_matrix`` for maximum efficiency.\n\n Returns\n -------\n X_transformed : sparse matrix of shape (n_samples, n_out)\n Transformed dataset.\n \"\"\"\n check_is_fitted(self)\n return self.one_hot_encoder_.transform(self.apply(X))\n" + "description": "An ensemble of totally random trees.\n\nAn unsupervised transformation of a dataset to a high-dimensional\nsparse representation. A datapoint is coded according to which leaf of\neach tree it is sorted into. Using a one-hot encoding of the leaves,\nthis leads to a binary coding with as many ones as there are trees in\nthe forest.\n\nThe dimensionality of the resulting representation is\n``n_out <= n_estimators * max_leaf_nodes``. If ``max_leaf_nodes == None``,\nthe number of leaf nodes is at most ``n_estimators * 2 ** max_depth``.\n\nRead more in the :ref:`User Guide `.", + "docstring": "\n An ensemble of totally random trees.\n\n An unsupervised transformation of a dataset to a high-dimensional\n sparse representation. A datapoint is coded according to which leaf of\n each tree it is sorted into. Using a one-hot encoding of the leaves,\n this leads to a binary coding with as many ones as there are trees in\n the forest.\n\n The dimensionality of the resulting representation is\n ``n_out <= n_estimators * max_leaf_nodes``. If ``max_leaf_nodes == None``,\n the number of leaf nodes is at most ``n_estimators * 2 ** max_depth``.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n n_estimators : int, default=100\n Number of trees in the forest.\n\n .. versionchanged:: 0.22\n The default value of ``n_estimators`` changed from 10 to 100\n in 0.22.\n\n max_depth : int, default=5\n The maximum depth of each tree. If None, then nodes are expanded until\n all leaves are pure or until all leaves contain less than\n min_samples_split samples.\n\n min_samples_split : int or float, default=2\n The minimum number of samples required to split an internal node:\n\n - If int, then consider `min_samples_split` as the minimum number.\n - If float, then `min_samples_split` is a fraction and\n `ceil(min_samples_split * n_samples)` is the minimum\n number of samples for each split.\n\n .. versionchanged:: 0.18\n Added float values for fractions.\n\n min_samples_leaf : int or float, default=1\n The minimum number of samples required to be at a leaf node.\n A split point at any depth will only be considered if it leaves at\n least ``min_samples_leaf`` training samples in each of the left and\n right branches. 
This may have the effect of smoothing the model,\n especially in regression.\n\n - If int, then consider `min_samples_leaf` as the minimum number.\n - If float, then `min_samples_leaf` is a fraction and\n `ceil(min_samples_leaf * n_samples)` is the minimum\n number of samples for each node.\n\n .. versionchanged:: 0.18\n Added float values for fractions.\n\n min_weight_fraction_leaf : float, default=0.0\n The minimum weighted fraction of the sum total of weights (of all\n the input samples) required to be at a leaf node. Samples have\n equal weight when sample_weight is not provided.\n\n max_leaf_nodes : int, default=None\n Grow trees with ``max_leaf_nodes`` in best-first fashion.\n Best nodes are defined as relative reduction in impurity.\n If None then unlimited number of leaf nodes.\n\n min_impurity_decrease : float, default=0.0\n A node will be split if this split induces a decrease of the impurity\n greater than or equal to this value.\n\n The weighted impurity decrease equation is the following::\n\n N_t / N * (impurity - N_t_R / N_t * right_impurity\n - N_t_L / N_t * left_impurity)\n\n where ``N`` is the total number of samples, ``N_t`` is the number of\n samples at the current node, ``N_t_L`` is the number of samples in the\n left child, and ``N_t_R`` is the number of samples in the right child.\n\n ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum,\n if ``sample_weight`` is passed.\n\n .. versionadded:: 0.19\n\n sparse_output : bool, default=True\n Whether or not to return a sparse CSR matrix, as default behavior,\n or to return a dense array compatible with dense pipeline operators.\n\n n_jobs : int, default=None\n The number of jobs to run in parallel. :meth:`fit`, :meth:`transform`,\n :meth:`decision_path` and :meth:`apply` are all parallelized over the\n trees. ``None`` means 1 unless in a :obj:`joblib.parallel_backend`\n context. ``-1`` means using all processors. See :term:`Glossary\n ` for more details.\n\n random_state : int, RandomState instance or None, default=None\n Controls the generation of the random `y` used to fit the trees\n and the draw of the splits for each feature at the trees' nodes.\n See :term:`Glossary ` for details.\n\n verbose : int, default=0\n Controls the verbosity when fitting and predicting.\n\n warm_start : bool, default=False\n When set to ``True``, reuse the solution of the previous call to fit\n and add more estimators to the ensemble, otherwise, just fit a whole\n new forest. See :term:`the Glossary `.\n\n Attributes\n ----------\n base_estimator_ : :class:`~sklearn.tree.ExtraTreeClassifier` instance\n The child estimator template used to create the collection of fitted\n sub-estimators.\n\n estimators_ : list of :class:`~sklearn.tree.ExtraTreeClassifier` instances\n The collection of fitted sub-estimators.\n\n feature_importances_ : ndarray of shape (n_features,)\n The feature importances (the higher, the more important the feature).\n\n n_features_ : int\n The number of features when ``fit`` is performed.\n\n .. deprecated:: 1.0\n Attribute `n_features_` was deprecated in version 1.0 and will be\n removed in 1.2. Use `n_features_in_` instead.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. 
versionadded:: 1.0\n\n n_outputs_ : int\n The number of outputs when ``fit`` is performed.\n\n one_hot_encoder_ : OneHotEncoder instance\n One-hot encoder used to create the sparse embedding.\n\n See Also\n --------\n ExtraTreesClassifier : An extra-trees classifier.\n ExtraTreesRegressor : An extra-trees regressor.\n RandomForestClassifier : A random forest classifier.\n RandomForestRegressor : A random forest regressor.\n sklearn.tree.ExtraTreeClassifier: An extremely randomized\n tree classifier.\n sklearn.tree.ExtraTreeRegressor : An extremely randomized\n tree regressor.\n\n References\n ----------\n .. [1] P. Geurts, D. Ernst., and L. Wehenkel, \"Extremely randomized trees\",\n Machine Learning, 63(1), 3-42, 2006.\n .. [2] Moosmann, F. and Triggs, B. and Jurie, F. \"Fast discriminative\n visual codebooks using randomized clustering forests\"\n NIPS 2007\n\n Examples\n --------\n >>> from sklearn.ensemble import RandomTreesEmbedding\n >>> X = [[0,0], [1,0], [0,1], [-1,0], [0,-1]]\n >>> random_trees = RandomTreesEmbedding(\n ... n_estimators=5, random_state=0, max_depth=1).fit(X)\n >>> X_sparse_embedding = random_trees.transform(X)\n >>> X_sparse_embedding.toarray()\n array([[0., 1., 1., 0., 1., 0., 0., 1., 1., 0.],\n [0., 1., 1., 0., 1., 0., 0., 1., 1., 0.],\n [0., 1., 0., 1., 0., 1., 0., 1., 0., 1.],\n [1., 0., 1., 0., 1., 0., 1., 0., 1., 0.],\n [0., 1., 1., 0., 1., 0., 0., 1., 1., 0.]])\n ", + "source_code": "\n\nclass RandomTreesEmbedding(BaseForest):\n \"\"\"\n An ensemble of totally random trees.\n\n An unsupervised transformation of a dataset to a high-dimensional\n sparse representation. A datapoint is coded according to which leaf of\n each tree it is sorted into. Using a one-hot encoding of the leaves,\n this leads to a binary coding with as many ones as there are trees in\n the forest.\n\n The dimensionality of the resulting representation is\n ``n_out <= n_estimators * max_leaf_nodes``. If ``max_leaf_nodes == None``,\n the number of leaf nodes is at most ``n_estimators * 2 ** max_depth``.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n n_estimators : int, default=100\n Number of trees in the forest.\n\n .. versionchanged:: 0.22\n The default value of ``n_estimators`` changed from 10 to 100\n in 0.22.\n\n max_depth : int, default=5\n The maximum depth of each tree. If None, then nodes are expanded until\n all leaves are pure or until all leaves contain less than\n min_samples_split samples.\n\n min_samples_split : int or float, default=2\n The minimum number of samples required to split an internal node:\n\n - If int, then consider `min_samples_split` as the minimum number.\n - If float, then `min_samples_split` is a fraction and\n `ceil(min_samples_split * n_samples)` is the minimum\n number of samples for each split.\n\n .. versionchanged:: 0.18\n Added float values for fractions.\n\n min_samples_leaf : int or float, default=1\n The minimum number of samples required to be at a leaf node.\n A split point at any depth will only be considered if it leaves at\n least ``min_samples_leaf`` training samples in each of the left and\n right branches. This may have the effect of smoothing the model,\n especially in regression.\n\n - If int, then consider `min_samples_leaf` as the minimum number.\n - If float, then `min_samples_leaf` is a fraction and\n `ceil(min_samples_leaf * n_samples)` is the minimum\n number of samples for each node.\n\n .. 
versionchanged:: 0.18\n Added float values for fractions.\n\n min_weight_fraction_leaf : float, default=0.0\n The minimum weighted fraction of the sum total of weights (of all\n the input samples) required to be at a leaf node. Samples have\n equal weight when sample_weight is not provided.\n\n max_leaf_nodes : int, default=None\n Grow trees with ``max_leaf_nodes`` in best-first fashion.\n Best nodes are defined as relative reduction in impurity.\n If None then unlimited number of leaf nodes.\n\n min_impurity_decrease : float, default=0.0\n A node will be split if this split induces a decrease of the impurity\n greater than or equal to this value.\n\n The weighted impurity decrease equation is the following::\n\n N_t / N * (impurity - N_t_R / N_t * right_impurity\n - N_t_L / N_t * left_impurity)\n\n where ``N`` is the total number of samples, ``N_t`` is the number of\n samples at the current node, ``N_t_L`` is the number of samples in the\n left child, and ``N_t_R`` is the number of samples in the right child.\n\n ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum,\n if ``sample_weight`` is passed.\n\n .. versionadded:: 0.19\n\n sparse_output : bool, default=True\n Whether or not to return a sparse CSR matrix, as default behavior,\n or to return a dense array compatible with dense pipeline operators.\n\n n_jobs : int, default=None\n The number of jobs to run in parallel. :meth:`fit`, :meth:`transform`,\n :meth:`decision_path` and :meth:`apply` are all parallelized over the\n trees. ``None`` means 1 unless in a :obj:`joblib.parallel_backend`\n context. ``-1`` means using all processors. See :term:`Glossary\n ` for more details.\n\n random_state : int, RandomState instance or None, default=None\n Controls the generation of the random `y` used to fit the trees\n and the draw of the splits for each feature at the trees' nodes.\n See :term:`Glossary ` for details.\n\n verbose : int, default=0\n Controls the verbosity when fitting and predicting.\n\n warm_start : bool, default=False\n When set to ``True``, reuse the solution of the previous call to fit\n and add more estimators to the ensemble, otherwise, just fit a whole\n new forest. See :term:`the Glossary `.\n\n Attributes\n ----------\n base_estimator_ : :class:`~sklearn.tree.ExtraTreeClassifier` instance\n The child estimator template used to create the collection of fitted\n sub-estimators.\n\n estimators_ : list of :class:`~sklearn.tree.ExtraTreeClassifier` instances\n The collection of fitted sub-estimators.\n\n feature_importances_ : ndarray of shape (n_features,)\n The feature importances (the higher, the more important the feature).\n\n n_features_ : int\n The number of features when ``fit`` is performed.\n\n .. deprecated:: 1.0\n Attribute `n_features_` was deprecated in version 1.0 and will be\n removed in 1.2. Use `n_features_in_` instead.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. 
versionadded:: 1.0\n\n n_outputs_ : int\n The number of outputs when ``fit`` is performed.\n\n one_hot_encoder_ : OneHotEncoder instance\n One-hot encoder used to create the sparse embedding.\n\n See Also\n --------\n ExtraTreesClassifier : An extra-trees classifier.\n ExtraTreesRegressor : An extra-trees regressor.\n RandomForestClassifier : A random forest classifier.\n RandomForestRegressor : A random forest regressor.\n sklearn.tree.ExtraTreeClassifier: An extremely randomized\n tree classifier.\n sklearn.tree.ExtraTreeRegressor : An extremely randomized\n tree regressor.\n\n References\n ----------\n .. [1] P. Geurts, D. Ernst., and L. Wehenkel, \"Extremely randomized trees\",\n Machine Learning, 63(1), 3-42, 2006.\n .. [2] Moosmann, F. and Triggs, B. and Jurie, F. \"Fast discriminative\n visual codebooks using randomized clustering forests\"\n NIPS 2007\n\n Examples\n --------\n >>> from sklearn.ensemble import RandomTreesEmbedding\n >>> X = [[0,0], [1,0], [0,1], [-1,0], [0,-1]]\n >>> random_trees = RandomTreesEmbedding(\n ... n_estimators=5, random_state=0, max_depth=1).fit(X)\n >>> X_sparse_embedding = random_trees.transform(X)\n >>> X_sparse_embedding.toarray()\n array([[0., 1., 1., 0., 1., 0., 0., 1., 1., 0.],\n [0., 1., 1., 0., 1., 0., 0., 1., 1., 0.],\n [0., 1., 0., 1., 0., 1., 0., 1., 0., 1.],\n [1., 0., 1., 0., 1., 0., 1., 0., 1., 0.],\n [0., 1., 1., 0., 1., 0., 0., 1., 1., 0.]])\n \"\"\"\n criterion = 'squared_error'\n max_features = 1\n \n def __init__(self, n_estimators=100, *, max_depth=5, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_leaf_nodes=None, min_impurity_decrease=0.0, sparse_output=True, n_jobs=None, random_state=None, verbose=0, warm_start=False):\n super().__init__(base_estimator=ExtraTreeRegressor(), n_estimators=n_estimators, estimator_params=('criterion', 'max_depth', 'min_samples_split', 'min_samples_leaf', 'min_weight_fraction_leaf', 'max_features', 'max_leaf_nodes', 'min_impurity_decrease', 'random_state'), bootstrap=False, oob_score=False, n_jobs=n_jobs, random_state=random_state, verbose=verbose, warm_start=warm_start, max_samples=None)\n self.max_depth = max_depth\n self.min_samples_split = min_samples_split\n self.min_samples_leaf = min_samples_leaf\n self.min_weight_fraction_leaf = min_weight_fraction_leaf\n self.max_leaf_nodes = max_leaf_nodes\n self.min_impurity_decrease = min_impurity_decrease\n self.sparse_output = sparse_output\n \n def _set_oob_score_and_attributes(self, X, y):\n raise NotImplementedError('OOB score not supported by tree embedding')\n \n def fit(self, X, y=None, sample_weight=None):\n \"\"\"\n Fit estimator.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input samples. Use ``dtype=np.float32`` for maximum\n efficiency. Sparse matrices are also supported, use sparse\n ``csc_matrix`` for maximum efficiency.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights. If None, then samples are equally weighted. Splits\n that would create child nodes with net zero or negative weight are\n ignored while searching for a split in each node. 
In the case of\n classification, splits are also ignored if they would result in any\n single class carrying a negative weight in either child node.\n\n Returns\n -------\n self : object\n Returns the instance itself.\n \"\"\"\n self.fit_transform(X, y, sample_weight=sample_weight)\n return self\n \n def fit_transform(self, X, y=None, sample_weight=None):\n \"\"\"\n Fit estimator and transform dataset.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Input data used to build forests. Use ``dtype=np.float32`` for\n maximum efficiency.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights. If None, then samples are equally weighted. Splits\n that would create child nodes with net zero or negative weight are\n ignored while searching for a split in each node. In the case of\n classification, splits are also ignored if they would result in any\n single class carrying a negative weight in either child node.\n\n Returns\n -------\n X_transformed : sparse matrix of shape (n_samples, n_out)\n Transformed dataset.\n \"\"\"\n rnd = check_random_state(self.random_state)\n y = rnd.uniform(size=_num_samples(X))\n super().fit(X, y, sample_weight=sample_weight)\n self.one_hot_encoder_ = OneHotEncoder(sparse=self.sparse_output)\n return self.one_hot_encoder_.fit_transform(self.apply(X))\n \n def transform(self, X):\n \"\"\"\n Transform dataset.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Input data to be transformed. Use ``dtype=np.float32`` for maximum\n efficiency. Sparse matrices are also supported, use sparse\n ``csr_matrix`` for maximum efficiency.\n\n Returns\n -------\n X_transformed : sparse matrix of shape (n_samples, n_out)\n Transformed dataset.\n \"\"\"\n check_is_fitted(self)\n return self.one_hot_encoder_.transform(self.apply(X))\n" }, { "name": "BaseGradientBoosting", @@ -21036,7 +21102,7 @@ "is_public": false, "description": "Abstract base class for Gradient Boosting.", "docstring": "Abstract base class for Gradient Boosting.", - "source_code": "\n\nclass BaseGradientBoosting(BaseEnsemble, metaclass=ABCMeta):\n \"\"\"Abstract base class for Gradient Boosting.\"\"\"\n \n @abstractmethod\n def __init__(self, *, loss, learning_rate, n_estimators, criterion, min_samples_split, min_samples_leaf, min_weight_fraction_leaf, max_depth, min_impurity_decrease, init, subsample, max_features, ccp_alpha, random_state, alpha=0.9, verbose=0, max_leaf_nodes=None, warm_start=False, validation_fraction=0.1, n_iter_no_change=None, tol=0.0001):\n self.n_estimators = n_estimators\n self.learning_rate = learning_rate\n self.loss = loss\n self.criterion = criterion\n self.min_samples_split = min_samples_split\n self.min_samples_leaf = min_samples_leaf\n self.min_weight_fraction_leaf = min_weight_fraction_leaf\n self.subsample = subsample\n self.max_features = max_features\n self.max_depth = max_depth\n self.min_impurity_decrease = min_impurity_decrease\n self.ccp_alpha = ccp_alpha\n self.init = init\n self.random_state = random_state\n self.alpha = alpha\n self.verbose = verbose\n self.max_leaf_nodes = max_leaf_nodes\n self.warm_start = warm_start\n self.validation_fraction = validation_fraction\n self.n_iter_no_change = n_iter_no_change\n self.tol = tol\n \n @abstractmethod\n def _validate_y(self, y, sample_weight=None):\n \"\"\"Called by fit to validate y.\"\"\"\n \n \n def _fit_stage(self, i, X, y, 
raw_predictions, sample_weight, sample_mask, random_state, X_csc=None, X_csr=None):\n \"\"\"Fit another stage of ``_n_classes`` trees to the boosting model.\"\"\"\n assert sample_mask.dtype == bool\n loss = self.loss_\n original_y = y\n raw_predictions_copy = raw_predictions.copy()\n for k in range(loss.K):\n if loss.is_multi_class:\n y = np.array(original_y == k, dtype=np.float64)\n residual = loss.negative_gradient(y, raw_predictions_copy, k=k, sample_weight=sample_weight)\n tree = DecisionTreeRegressor(criterion=self.criterion, splitter='best', max_depth=self.max_depth, min_samples_split=self.min_samples_split, min_samples_leaf=self.min_samples_leaf, min_weight_fraction_leaf=self.min_weight_fraction_leaf, min_impurity_decrease=self.min_impurity_decrease, max_features=self.max_features, max_leaf_nodes=self.max_leaf_nodes, random_state=random_state, ccp_alpha=self.ccp_alpha)\n if self.subsample < 1.0:\n sample_weight = sample_weight * sample_mask.astype(np.float64)\n X = X_csr if X_csr is not None else X\n tree.fit(X, residual, sample_weight=sample_weight, check_input=False)\n loss.update_terminal_regions(tree.tree_, X, y, residual, raw_predictions, sample_weight, sample_mask, learning_rate=self.learning_rate, k=k)\n self.estimators_[i, k] = tree\n return raw_predictions\n \n def _check_params(self):\n \"\"\"Check validity of parameters and raise ValueError if not valid.\"\"\"\n if self.n_estimators <= 0:\n raise ValueError('n_estimators must be greater than 0 but was %r' % self.n_estimators)\n if self.learning_rate <= 0.0:\n raise ValueError('learning_rate must be greater than 0 but was %r' % self.learning_rate)\n if self.loss not in self._SUPPORTED_LOSS or self.loss not in _gb_losses.LOSS_FUNCTIONS:\n raise ValueError(\"Loss '{0:s}' not supported. \".format(self.loss))\n if self.loss == 'ls':\n warnings.warn(\"The loss 'ls' was deprecated in v1.0 and will be removed in version 1.2. Use 'squared_error' which is equivalent.\", FutureWarning)\n elif self.loss == 'lad':\n warnings.warn(\"The loss 'lad' was deprecated in v1.0 and will be removed in version 1.2. Use 'absolute_error' which is equivalent.\", FutureWarning)\n if self.loss == 'deviance':\n loss_class = _gb_losses.MultinomialDeviance if len(self.classes_) > 2 else _gb_losses.BinomialDeviance\n else:\n loss_class = _gb_losses.LOSS_FUNCTIONS[self.loss]\n if is_classifier(self):\n self.loss_ = loss_class(self.n_classes_)\n elif self.loss in ('huber', 'quantile'):\n self.loss_ = loss_class(self.alpha)\n else:\n self.loss_ = loss_class()\n if not 0.0 < self.subsample <= 1.0:\n raise ValueError('subsample must be in (0,1] but was %r' % self.subsample)\n if self.init is not None:\n if isinstance(self.init, BaseEstimator):\n self.loss_.check_init_estimator(self.init)\n elif not (isinstance(self.init, str) and self.init == 'zero'):\n raise ValueError(\"The init parameter must be an estimator or 'zero'. Got init={}\".format(self.init))\n if not 0.0 < self.alpha < 1.0:\n raise ValueError('alpha must be in (0.0, 1.0) but was %r' % self.alpha)\n if isinstance(self.max_features, str):\n if self.max_features == 'auto':\n if is_classifier(self):\n max_features = max(1, int(np.sqrt(self.n_features_in_)))\n else:\n max_features = self.n_features_in_\n elif self.max_features == 'sqrt':\n max_features = max(1, int(np.sqrt(self.n_features_in_)))\n elif self.max_features == 'log2':\n max_features = max(1, int(np.log2(self.n_features_in_)))\n else:\n raise ValueError(\"Invalid value for max_features: %r. 
Allowed string values are 'auto', 'sqrt' or 'log2'.\" % self.max_features)\n elif self.max_features is None:\n max_features = self.n_features_in_\n elif isinstance(self.max_features, numbers.Integral):\n max_features = self.max_features\n elif 0.0 < self.max_features <= 1.0:\n max_features = max(int(self.max_features * self.n_features_in_), 1)\n else:\n raise ValueError('max_features must be in (0, n_features]')\n self.max_features_ = max_features\n if not isinstance(self.n_iter_no_change, (numbers.Integral, type(None))):\n raise ValueError('n_iter_no_change should either be None or an integer. %r was passed' % self.n_iter_no_change)\n \n def _init_state(self):\n \"\"\"Initialize model state and allocate model state data structures.\"\"\"\n self.init_ = self.init\n if self.init_ is None:\n self.init_ = self.loss_.init_estimator()\n self.estimators_ = np.empty((self.n_estimators, self.loss_.K), dtype=object)\n self.train_score_ = np.zeros((self.n_estimators, ), dtype=np.float64)\n if self.subsample < 1.0:\n self.oob_improvement_ = np.zeros(self.n_estimators, dtype=np.float64)\n \n def _clear_state(self):\n \"\"\"Clear the state of the gradient boosting model.\"\"\"\n if hasattr(self, 'estimators_'):\n self.estimators_ = np.empty((0, 0), dtype=object)\n if hasattr(self, 'train_score_'):\n del self.train_score_\n if hasattr(self, 'oob_improvement_'):\n del self.oob_improvement_\n if hasattr(self, 'init_'):\n del self.init_\n if hasattr(self, '_rng'):\n del self._rng\n \n def _resize_state(self):\n \"\"\"Add additional ``n_estimators`` entries to all attributes.\"\"\"\n total_n_estimators = self.n_estimators\n if total_n_estimators < self.estimators_.shape[0]:\n raise ValueError('resize with smaller n_estimators %d < %d' % (total_n_estimators, self.estimators_[0]))\n self.estimators_ = np.resize(self.estimators_, (total_n_estimators, self.loss_.K))\n self.train_score_ = np.resize(self.train_score_, total_n_estimators)\n if self.subsample < 1 or hasattr(self, 'oob_improvement_'):\n if hasattr(self, 'oob_improvement_'):\n self.oob_improvement_ = np.resize(self.oob_improvement_, total_n_estimators)\n else:\n self.oob_improvement_ = np.zeros((total_n_estimators, ), dtype=np.float64)\n \n def _is_initialized(self):\n return len(getattr(self, 'estimators_', [])) > 0\n \n def _check_initialized(self):\n \"\"\"Check that the estimator is initialized, raising an error if not.\"\"\"\n check_is_fitted(self)\n \n @abstractmethod\n def _warn_mae_for_criterion(self):\n pass\n \n def fit(self, X, y, sample_weight=None, monitor=None):\n \"\"\"Fit the gradient boosting model.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input samples. Internally, it will be converted to\n ``dtype=np.float32`` and if a sparse matrix is provided\n to a sparse ``csr_matrix``.\n\n y : array-like of shape (n_samples,)\n Target values (strings or integers in classification, real numbers\n in regression)\n For classification, labels must correspond to classes.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights. If None, then samples are equally weighted. Splits\n that would create child nodes with net zero or negative weight are\n ignored while searching for a split in each node. 
In the case of\n classification, splits are also ignored if they would result in any\n single class carrying a negative weight in either child node.\n\n monitor : callable, default=None\n The monitor is called after each iteration with the current\n iteration, a reference to the estimator and the local variables of\n ``_fit_stages`` as keyword arguments ``callable(i, self,\n locals())``. If the callable returns ``True`` the fitting procedure\n is stopped. The monitor can be used for various things such as\n computing held-out estimates, early stopping, model introspect, and\n snapshoting.\n\n Returns\n -------\n self : object\n Fitted estimator.\n \"\"\"\n if self.criterion in ('absolute_error', 'mae'):\n self._warn_mae_for_criterion()\n if self.criterion == 'mse':\n warnings.warn(\"Criterion 'mse' was deprecated in v1.0 and will be removed in version 1.2. Use `criterion='squared_error'` which is equivalent.\", FutureWarning)\n if not self.warm_start:\n self._clear_state()\n (X, y) = self._validate_data(X, y, accept_sparse=['csr', 'csc', 'coo'], dtype=DTYPE, multi_output=True)\n sample_weight_is_none = sample_weight is None\n sample_weight = _check_sample_weight(sample_weight, X)\n y = column_or_1d(y, warn=True)\n if is_classifier(self):\n y = self._validate_y(y, sample_weight)\n else:\n y = self._validate_y(y)\n if self.n_iter_no_change is not None:\n stratify = y if is_classifier(self) else None\n (X, X_val, y, y_val, sample_weight, sample_weight_val) = train_test_split(X, y, sample_weight, random_state=self.random_state, test_size=self.validation_fraction, stratify=stratify)\n if is_classifier(self):\n if self._n_classes != np.unique(y).shape[0]:\n raise ValueError('The training data after the early stopping split is missing some classes. Try using another random seed.')\n else:\n X_val = y_val = sample_weight_val = None\n self._check_params()\n if not self._is_initialized():\n self._init_state()\n if self.init_ == 'zero':\n raw_predictions = np.zeros(shape=(X.shape[0], self.loss_.K), dtype=np.float64)\n else:\n if sample_weight_is_none:\n self.init_.fit(X, y)\n else:\n msg = 'The initial estimator {} does not support sample weights.'.format(self.init_.__class__.__name__)\n try:\n self.init_.fit(X, y, sample_weight=sample_weight)\n except TypeError as e:\n raise ValueError(msg) from e\n except ValueError as e:\n if 'pass parameters to specific steps of your pipeline using the stepname__parameter' in str(e):\n raise ValueError(msg) from e\n else:\n raise\n raw_predictions = self.loss_.get_init_raw_predictions(X, self.init_)\n begin_at_stage = 0\n self._rng = check_random_state(self.random_state)\n else:\n if self.n_estimators < self.estimators_.shape[0]:\n raise ValueError('n_estimators=%d must be larger or equal to estimators_.shape[0]=%d when warm_start==True' % (self.n_estimators, self.estimators_.shape[0]))\n begin_at_stage = self.estimators_.shape[0]\n X = check_array(X, dtype=DTYPE, order='C', accept_sparse='csr')\n raw_predictions = self._raw_predict(X)\n self._resize_state()\n n_stages = self._fit_stages(X, y, raw_predictions, sample_weight, self._rng, X_val, y_val, sample_weight_val, begin_at_stage, monitor)\n if n_stages != self.estimators_.shape[0]:\n self.estimators_ = self.estimators_[:n_stages]\n self.train_score_ = self.train_score_[:n_stages]\n if hasattr(self, 'oob_improvement_'):\n self.oob_improvement_ = self.oob_improvement_[:n_stages]\n self.n_estimators_ = n_stages\n return self\n \n def _fit_stages(self, X, y, raw_predictions, sample_weight, random_state, X_val, 
y_val, sample_weight_val, begin_at_stage=0, monitor=None):\n \"\"\"Iteratively fits the stages.\n\n For each stage it computes the progress (OOB, train score)\n and delegates to ``_fit_stage``.\n Returns the number of stages fit; might differ from ``n_estimators``\n due to early stopping.\n \"\"\"\n n_samples = X.shape[0]\n do_oob = self.subsample < 1.0\n sample_mask = np.ones((n_samples, ), dtype=bool)\n n_inbag = max(1, int(self.subsample * n_samples))\n loss_ = self.loss_\n if self.verbose:\n verbose_reporter = VerboseReporter(verbose=self.verbose)\n verbose_reporter.init(self, begin_at_stage)\n X_csc = csc_matrix(X) if issparse(X) else None\n X_csr = csr_matrix(X) if issparse(X) else None\n if self.n_iter_no_change is not None:\n loss_history = np.full(self.n_iter_no_change, np.inf)\n y_val_pred_iter = self._staged_raw_predict(X_val)\n i = begin_at_stage\n for i in range(begin_at_stage, self.n_estimators):\n if do_oob:\n sample_mask = _random_sample_mask(n_samples, n_inbag, random_state)\n old_oob_score = loss_(y[~sample_mask], raw_predictions[~sample_mask], sample_weight[~sample_mask])\n raw_predictions = self._fit_stage(i, X, y, raw_predictions, sample_weight, sample_mask, random_state, X_csc, X_csr)\n if do_oob:\n self.train_score_[i] = loss_(y[sample_mask], raw_predictions[sample_mask], sample_weight[sample_mask])\n self.oob_improvement_[i] = old_oob_score - loss_(y[~sample_mask], raw_predictions[~sample_mask], sample_weight[~sample_mask])\n else:\n self.train_score_[i] = loss_(y, raw_predictions, sample_weight)\n if self.verbose > 0:\n verbose_reporter.update(i, self)\n if monitor is not None:\n early_stopping = monitor(i, self, locals())\n if early_stopping:\n break\n if self.n_iter_no_change is not None:\n validation_loss = loss_(y_val, next(y_val_pred_iter), sample_weight_val)\n if np.any(validation_loss + self.tol < loss_history):\n loss_history[i % len(loss_history)] = validation_loss\n else:\n break\n return i + 1\n \n def _make_estimator(self, append=True):\n raise NotImplementedError()\n \n def _raw_predict_init(self, X):\n \"\"\"Check input and compute raw predictions of the init estimator.\"\"\"\n self._check_initialized()\n X = self.estimators_[0, 0]._validate_X_predict(X, check_input=True)\n if self.init_ == 'zero':\n raw_predictions = np.zeros(shape=(X.shape[0], self.loss_.K), dtype=np.float64)\n else:\n raw_predictions = self.loss_.get_init_raw_predictions(X, self.init_).astype(np.float64)\n return raw_predictions\n \n def _raw_predict(self, X):\n \"\"\"Return the sum of the trees raw predictions (+ init estimator).\"\"\"\n raw_predictions = self._raw_predict_init(X)\n predict_stages(self.estimators_, X, self.learning_rate, raw_predictions)\n return raw_predictions\n \n def _staged_raw_predict(self, X):\n \"\"\"Compute raw predictions of ``X`` for each iteration.\n\n This method allows monitoring (i.e. determine error on testing set)\n after each stage.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input samples. Internally, it will be converted to\n ``dtype=np.float32`` and if a sparse matrix is provided\n to a sparse ``csr_matrix``.\n\n Returns\n -------\n raw_predictions : generator of ndarray of shape (n_samples, k)\n The raw predictions of the input samples. 
The order of the\n classes corresponds to that in the attribute :term:`classes_`.\n Regression and binary classification are special cases with\n ``k == 1``, otherwise ``k==n_classes``.\n \"\"\"\n X = self._validate_data(X, dtype=DTYPE, order='C', accept_sparse='csr', reset=False)\n raw_predictions = self._raw_predict_init(X)\n for i in range(self.estimators_.shape[0]):\n predict_stage(self.estimators_, i, X, self.learning_rate, raw_predictions)\n yield raw_predictions.copy()\n \n @property\n def feature_importances_(self):\n \"\"\"The impurity-based feature importances.\n\n The higher, the more important the feature.\n The importance of a feature is computed as the (normalized)\n total reduction of the criterion brought by that feature. It is also\n known as the Gini importance.\n\n Warning: impurity-based feature importances can be misleading for\n high cardinality features (many unique values). See\n :func:`sklearn.inspection.permutation_importance` as an alternative.\n\n Returns\n -------\n feature_importances_ : ndarray of shape (n_features,)\n The values of this array sum to 1, unless all trees are single node\n trees consisting of only the root node, in which case it will be an\n array of zeros.\n \"\"\"\n self._check_initialized()\n relevant_trees = [tree for stage in self.estimators_ for tree in stage if tree.tree_.node_count > 1]\n if not relevant_trees:\n return np.zeros(shape=self.n_features_in_, dtype=np.float64)\n relevant_feature_importances = [tree.tree_.compute_feature_importances(normalize=False) for tree in relevant_trees]\n avg_feature_importances = np.mean(relevant_feature_importances, axis=0, dtype=np.float64)\n return avg_feature_importances / np.sum(avg_feature_importances)\n \n def _compute_partial_dependence_recursion(self, grid, target_features):\n \"\"\"Fast partial dependence computation.\n\n Parameters\n ----------\n grid : ndarray of shape (n_samples, n_target_features)\n The grid points on which the partial dependence should be\n evaluated.\n target_features : ndarray of shape (n_target_features,)\n The set of target features for which the partial dependence\n should be evaluated.\n\n Returns\n -------\n averaged_predictions : ndarray of shape (n_trees_per_iteration, n_samples)\n The value of the partial dependence function on each grid point.\n \"\"\"\n if self.init is not None:\n warnings.warn('Using recursion method with a non-constant init predictor will lead to incorrect partial dependence values. Got init=%s.' % self.init, UserWarning)\n grid = np.asarray(grid, dtype=DTYPE, order='C')\n (n_estimators, n_trees_per_stage) = self.estimators_.shape\n averaged_predictions = np.zeros((n_trees_per_stage, grid.shape[0]), dtype=np.float64, order='C')\n for stage in range(n_estimators):\n for k in range(n_trees_per_stage):\n tree = self.estimators_[stage, k].tree_\n tree.compute_partial_dependence(grid, target_features, averaged_predictions[k])\n averaged_predictions *= self.learning_rate\n return averaged_predictions\n \n def apply(self, X):\n \"\"\"Apply trees in the ensemble to X, return leaf indices.\n\n .. versionadded:: 0.17\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input samples. Internally, its dtype will be converted to\n ``dtype=np.float32``. 
If a sparse matrix is provided, it will\n be converted to a sparse ``csr_matrix``.\n\n Returns\n -------\n X_leaves : array-like of shape (n_samples, n_estimators, n_classes)\n For each datapoint x in X and for each tree in the ensemble,\n return the index of the leaf x ends up in each estimator.\n In the case of binary classification n_classes is 1.\n \"\"\"\n self._check_initialized()\n X = self.estimators_[0, 0]._validate_X_predict(X, check_input=True)\n (n_estimators, n_classes) = self.estimators_.shape\n leaves = np.zeros((X.shape[0], n_estimators, n_classes))\n for i in range(n_estimators):\n for j in range(n_classes):\n estimator = self.estimators_[i, j]\n leaves[:, i, j] = estimator.apply(X, check_input=False)\n return leaves\n \n @deprecated('Attribute `n_features_` was deprecated in version 1.0 and will be removed in 1.2. Use `n_features_in_` instead.')\n @property\n def n_features_(self):\n return self.n_features_in_\n" + "source_code": "\n\nclass BaseGradientBoosting(BaseEnsemble, metaclass=ABCMeta):\n \"\"\"Abstract base class for Gradient Boosting.\"\"\"\n \n @abstractmethod\n def __init__(self, *, loss, learning_rate, n_estimators, criterion, min_samples_split, min_samples_leaf, min_weight_fraction_leaf, max_depth, min_impurity_decrease, init, subsample, max_features, ccp_alpha, random_state, alpha=0.9, verbose=0, max_leaf_nodes=None, warm_start=False, validation_fraction=0.1, n_iter_no_change=None, tol=0.0001):\n self.n_estimators = n_estimators\n self.learning_rate = learning_rate\n self.loss = loss\n self.criterion = criterion\n self.min_samples_split = min_samples_split\n self.min_samples_leaf = min_samples_leaf\n self.min_weight_fraction_leaf = min_weight_fraction_leaf\n self.subsample = subsample\n self.max_features = max_features\n self.max_depth = max_depth\n self.min_impurity_decrease = min_impurity_decrease\n self.ccp_alpha = ccp_alpha\n self.init = init\n self.random_state = random_state\n self.alpha = alpha\n self.verbose = verbose\n self.max_leaf_nodes = max_leaf_nodes\n self.warm_start = warm_start\n self.validation_fraction = validation_fraction\n self.n_iter_no_change = n_iter_no_change\n self.tol = tol\n \n @abstractmethod\n def _validate_y(self, y, sample_weight=None):\n \"\"\"Called by fit to validate y.\"\"\"\n \n \n def _fit_stage(self, i, X, y, raw_predictions, sample_weight, sample_mask, random_state, X_csc=None, X_csr=None):\n \"\"\"Fit another stage of ``_n_classes`` trees to the boosting model.\"\"\"\n assert sample_mask.dtype == bool\n loss = self.loss_\n original_y = y\n raw_predictions_copy = raw_predictions.copy()\n for k in range(loss.K):\n if loss.is_multi_class:\n y = np.array(original_y == k, dtype=np.float64)\n residual = loss.negative_gradient(y, raw_predictions_copy, k=k, sample_weight=sample_weight)\n tree = DecisionTreeRegressor(criterion=self.criterion, splitter='best', max_depth=self.max_depth, min_samples_split=self.min_samples_split, min_samples_leaf=self.min_samples_leaf, min_weight_fraction_leaf=self.min_weight_fraction_leaf, min_impurity_decrease=self.min_impurity_decrease, max_features=self.max_features, max_leaf_nodes=self.max_leaf_nodes, random_state=random_state, ccp_alpha=self.ccp_alpha)\n if self.subsample < 1.0:\n sample_weight = sample_weight * sample_mask.astype(np.float64)\n X = X_csr if X_csr is not None else X\n tree.fit(X, residual, sample_weight=sample_weight, check_input=False)\n loss.update_terminal_regions(tree.tree_, X, y, residual, raw_predictions, sample_weight, sample_mask, learning_rate=self.learning_rate, 
k=k)\n self.estimators_[i, k] = tree\n return raw_predictions\n \n def _check_params(self):\n \"\"\"Check validity of parameters and raise ValueError if not valid.\"\"\"\n if self.n_estimators <= 0:\n raise ValueError('n_estimators must be greater than 0 but was %r' % self.n_estimators)\n if self.learning_rate <= 0.0:\n raise ValueError('learning_rate must be greater than 0 but was %r' % self.learning_rate)\n if self.loss not in self._SUPPORTED_LOSS or self.loss not in _gb_losses.LOSS_FUNCTIONS:\n raise ValueError(\"Loss '{0:s}' not supported. \".format(self.loss))\n if self.loss == 'ls':\n warnings.warn(\"The loss 'ls' was deprecated in v1.0 and will be removed in version 1.2. Use 'squared_error' which is equivalent.\", FutureWarning)\n elif self.loss == 'lad':\n warnings.warn(\"The loss 'lad' was deprecated in v1.0 and will be removed in version 1.2. Use 'absolute_error' which is equivalent.\", FutureWarning)\n if self.loss == 'deviance':\n loss_class = _gb_losses.MultinomialDeviance if len(self.classes_) > 2 else _gb_losses.BinomialDeviance\n else:\n loss_class = _gb_losses.LOSS_FUNCTIONS[self.loss]\n if is_classifier(self):\n self.loss_ = loss_class(self.n_classes_)\n elif self.loss in ('huber', 'quantile'):\n self.loss_ = loss_class(self.alpha)\n else:\n self.loss_ = loss_class()\n if not 0.0 < self.subsample <= 1.0:\n raise ValueError('subsample must be in (0,1] but was %r' % self.subsample)\n if self.init is not None:\n if isinstance(self.init, BaseEstimator):\n self.loss_.check_init_estimator(self.init)\n elif not (isinstance(self.init, str) and self.init == 'zero'):\n raise ValueError(\"The init parameter must be an estimator or 'zero'. Got init={}\".format(self.init))\n if not 0.0 < self.alpha < 1.0:\n raise ValueError('alpha must be in (0.0, 1.0) but was %r' % self.alpha)\n if isinstance(self.max_features, str):\n if self.max_features == 'auto':\n if is_classifier(self):\n max_features = max(1, int(np.sqrt(self.n_features_in_)))\n else:\n max_features = self.n_features_in_\n elif self.max_features == 'sqrt':\n max_features = max(1, int(np.sqrt(self.n_features_in_)))\n elif self.max_features == 'log2':\n max_features = max(1, int(np.log2(self.n_features_in_)))\n else:\n raise ValueError(\"Invalid value for max_features: %r. Allowed string values are 'auto', 'sqrt' or 'log2'.\" % self.max_features)\n elif self.max_features is None:\n max_features = self.n_features_in_\n elif isinstance(self.max_features, numbers.Integral):\n max_features = self.max_features\n elif 0.0 < self.max_features <= 1.0:\n max_features = max(int(self.max_features * self.n_features_in_), 1)\n else:\n raise ValueError('max_features must be in (0, n_features]')\n self.max_features_ = max_features\n if not isinstance(self.n_iter_no_change, (numbers.Integral, type(None))):\n raise ValueError('n_iter_no_change should either be None or an integer. 
%r was passed' % self.n_iter_no_change)\n \n def _init_state(self):\n \"\"\"Initialize model state and allocate model state data structures.\"\"\"\n self.init_ = self.init\n if self.init_ is None:\n self.init_ = self.loss_.init_estimator()\n self.estimators_ = np.empty((self.n_estimators, self.loss_.K), dtype=object)\n self.train_score_ = np.zeros((self.n_estimators, ), dtype=np.float64)\n if self.subsample < 1.0:\n self.oob_improvement_ = np.zeros(self.n_estimators, dtype=np.float64)\n \n def _clear_state(self):\n \"\"\"Clear the state of the gradient boosting model.\"\"\"\n if hasattr(self, 'estimators_'):\n self.estimators_ = np.empty((0, 0), dtype=object)\n if hasattr(self, 'train_score_'):\n del self.train_score_\n if hasattr(self, 'oob_improvement_'):\n del self.oob_improvement_\n if hasattr(self, 'init_'):\n del self.init_\n if hasattr(self, '_rng'):\n del self._rng\n \n def _resize_state(self):\n \"\"\"Add additional ``n_estimators`` entries to all attributes.\"\"\"\n total_n_estimators = self.n_estimators\n if total_n_estimators < self.estimators_.shape[0]:\n raise ValueError('resize with smaller n_estimators %d < %d' % (total_n_estimators, self.estimators_[0]))\n self.estimators_ = np.resize(self.estimators_, (total_n_estimators, self.loss_.K))\n self.train_score_ = np.resize(self.train_score_, total_n_estimators)\n if self.subsample < 1 or hasattr(self, 'oob_improvement_'):\n if hasattr(self, 'oob_improvement_'):\n self.oob_improvement_ = np.resize(self.oob_improvement_, total_n_estimators)\n else:\n self.oob_improvement_ = np.zeros((total_n_estimators, ), dtype=np.float64)\n \n def _is_initialized(self):\n return len(getattr(self, 'estimators_', [])) > 0\n \n def _check_initialized(self):\n \"\"\"Check that the estimator is initialized, raising an error if not.\"\"\"\n check_is_fitted(self)\n \n @abstractmethod\n def _warn_mae_for_criterion(self):\n pass\n \n def fit(self, X, y, sample_weight=None, monitor=None):\n \"\"\"Fit the gradient boosting model.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input samples. Internally, it will be converted to\n ``dtype=np.float32`` and if a sparse matrix is provided\n to a sparse ``csr_matrix``.\n\n y : array-like of shape (n_samples,)\n Target values (strings or integers in classification, real numbers\n in regression)\n For classification, labels must correspond to classes.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights. If None, then samples are equally weighted. Splits\n that would create child nodes with net zero or negative weight are\n ignored while searching for a split in each node. In the case of\n classification, splits are also ignored if they would result in any\n single class carrying a negative weight in either child node.\n\n monitor : callable, default=None\n The monitor is called after each iteration with the current\n iteration, a reference to the estimator and the local variables of\n ``_fit_stages`` as keyword arguments ``callable(i, self,\n locals())``. If the callable returns ``True`` the fitting procedure\n is stopped. The monitor can be used for various things such as\n computing held-out estimates, early stopping, model introspect, and\n snapshoting.\n\n Returns\n -------\n self : object\n Fitted estimator.\n \"\"\"\n if self.criterion in ('absolute_error', 'mae'):\n self._warn_mae_for_criterion()\n if self.criterion == 'mse':\n warnings.warn(\"Criterion 'mse' was deprecated in v1.0 and will be removed in version 1.2. 
Use `criterion='squared_error'` which is equivalent.\", FutureWarning)\n if not self.warm_start:\n self._clear_state()\n (X, y) = self._validate_data(X, y, accept_sparse=['csr', 'csc', 'coo'], dtype=DTYPE, multi_output=True)\n sample_weight_is_none = sample_weight is None\n sample_weight = _check_sample_weight(sample_weight, X)\n y = column_or_1d(y, warn=True)\n if is_classifier(self):\n y = self._validate_y(y, sample_weight)\n else:\n y = self._validate_y(y)\n if self.n_iter_no_change is not None:\n stratify = y if is_classifier(self) else None\n (X, X_val, y, y_val, sample_weight, sample_weight_val) = train_test_split(X, y, sample_weight, random_state=self.random_state, test_size=self.validation_fraction, stratify=stratify)\n if is_classifier(self):\n if self._n_classes != np.unique(y).shape[0]:\n raise ValueError('The training data after the early stopping split is missing some classes. Try using another random seed.')\n else:\n X_val = y_val = sample_weight_val = None\n self._check_params()\n if not self._is_initialized():\n self._init_state()\n if self.init_ == 'zero':\n raw_predictions = np.zeros(shape=(X.shape[0], self.loss_.K), dtype=np.float64)\n else:\n if sample_weight_is_none:\n self.init_.fit(X, y)\n else:\n msg = 'The initial estimator {} does not support sample weights.'.format(self.init_.__class__.__name__)\n try:\n self.init_.fit(X, y, sample_weight=sample_weight)\n except TypeError as e:\n raise ValueError(msg) from e\n except ValueError as e:\n if 'pass parameters to specific steps of your pipeline using the stepname__parameter' in str(e):\n raise ValueError(msg) from e\n else:\n raise\n raw_predictions = self.loss_.get_init_raw_predictions(X, self.init_)\n begin_at_stage = 0\n self._rng = check_random_state(self.random_state)\n else:\n if self.n_estimators < self.estimators_.shape[0]:\n raise ValueError('n_estimators=%d must be larger or equal to estimators_.shape[0]=%d when warm_start==True' % (self.n_estimators, self.estimators_.shape[0]))\n begin_at_stage = self.estimators_.shape[0]\n X = check_array(X, dtype=DTYPE, order='C', accept_sparse='csr')\n raw_predictions = self._raw_predict(X)\n self._resize_state()\n n_stages = self._fit_stages(X, y, raw_predictions, sample_weight, self._rng, X_val, y_val, sample_weight_val, begin_at_stage, monitor)\n if n_stages != self.estimators_.shape[0]:\n self.estimators_ = self.estimators_[:n_stages]\n self.train_score_ = self.train_score_[:n_stages]\n if hasattr(self, 'oob_improvement_'):\n self.oob_improvement_ = self.oob_improvement_[:n_stages]\n self.n_estimators_ = n_stages\n return self\n \n def _fit_stages(self, X, y, raw_predictions, sample_weight, random_state, X_val, y_val, sample_weight_val, begin_at_stage=0, monitor=None):\n \"\"\"Iteratively fits the stages.\n\n For each stage it computes the progress (OOB, train score)\n and delegates to ``_fit_stage``.\n Returns the number of stages fit; might differ from ``n_estimators``\n due to early stopping.\n \"\"\"\n n_samples = X.shape[0]\n do_oob = self.subsample < 1.0\n sample_mask = np.ones((n_samples, ), dtype=bool)\n n_inbag = max(1, int(self.subsample * n_samples))\n loss_ = self.loss_\n if self.verbose:\n verbose_reporter = VerboseReporter(verbose=self.verbose)\n verbose_reporter.init(self, begin_at_stage)\n X_csc = csc_matrix(X) if issparse(X) else None\n X_csr = csr_matrix(X) if issparse(X) else None\n if self.n_iter_no_change is not None:\n loss_history = np.full(self.n_iter_no_change, np.inf)\n y_val_pred_iter = self._staged_raw_predict(X_val, check_input=False)\n 
i = begin_at_stage\n for i in range(begin_at_stage, self.n_estimators):\n if do_oob:\n sample_mask = _random_sample_mask(n_samples, n_inbag, random_state)\n old_oob_score = loss_(y[~sample_mask], raw_predictions[~sample_mask], sample_weight[~sample_mask])\n raw_predictions = self._fit_stage(i, X, y, raw_predictions, sample_weight, sample_mask, random_state, X_csc, X_csr)\n if do_oob:\n self.train_score_[i] = loss_(y[sample_mask], raw_predictions[sample_mask], sample_weight[sample_mask])\n self.oob_improvement_[i] = old_oob_score - loss_(y[~sample_mask], raw_predictions[~sample_mask], sample_weight[~sample_mask])\n else:\n self.train_score_[i] = loss_(y, raw_predictions, sample_weight)\n if self.verbose > 0:\n verbose_reporter.update(i, self)\n if monitor is not None:\n early_stopping = monitor(i, self, locals())\n if early_stopping:\n break\n if self.n_iter_no_change is not None:\n validation_loss = loss_(y_val, next(y_val_pred_iter), sample_weight_val)\n if np.any(validation_loss + self.tol < loss_history):\n loss_history[i % len(loss_history)] = validation_loss\n else:\n break\n return i + 1\n \n def _make_estimator(self, append=True):\n raise NotImplementedError()\n \n def _raw_predict_init(self, X):\n \"\"\"Check input and compute raw predictions of the init estimator.\"\"\"\n self._check_initialized()\n X = self.estimators_[0, 0]._validate_X_predict(X, check_input=True)\n if self.init_ == 'zero':\n raw_predictions = np.zeros(shape=(X.shape[0], self.loss_.K), dtype=np.float64)\n else:\n raw_predictions = self.loss_.get_init_raw_predictions(X, self.init_).astype(np.float64)\n return raw_predictions\n \n def _raw_predict(self, X):\n \"\"\"Return the sum of the trees raw predictions (+ init estimator).\"\"\"\n raw_predictions = self._raw_predict_init(X)\n predict_stages(self.estimators_, X, self.learning_rate, raw_predictions)\n return raw_predictions\n \n def _staged_raw_predict(self, X, check_input=True):\n \"\"\"Compute raw predictions of ``X`` for each iteration.\n\n This method allows monitoring (i.e. determine error on testing set)\n after each stage.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input samples. Internally, it will be converted to\n ``dtype=np.float32`` and if a sparse matrix is provided\n to a sparse ``csr_matrix``.\n\n check_input : bool, default=True\n If False, the input arrays X will not be checked.\n\n Returns\n -------\n raw_predictions : generator of ndarray of shape (n_samples, k)\n The raw predictions of the input samples. The order of the\n classes corresponds to that in the attribute :term:`classes_`.\n Regression and binary classification are special cases with\n ``k == 1``, otherwise ``k==n_classes``.\n \"\"\"\n if check_input:\n X = self._validate_data(X, dtype=DTYPE, order='C', accept_sparse='csr', reset=False)\n raw_predictions = self._raw_predict_init(X)\n for i in range(self.estimators_.shape[0]):\n predict_stage(self.estimators_, i, X, self.learning_rate, raw_predictions)\n yield raw_predictions.copy()\n \n @property\n def feature_importances_(self):\n \"\"\"The impurity-based feature importances.\n\n The higher, the more important the feature.\n The importance of a feature is computed as the (normalized)\n total reduction of the criterion brought by that feature. It is also\n known as the Gini importance.\n\n Warning: impurity-based feature importances can be misleading for\n high cardinality features (many unique values). 
See\n :func:`sklearn.inspection.permutation_importance` as an alternative.\n\n Returns\n -------\n feature_importances_ : ndarray of shape (n_features,)\n The values of this array sum to 1, unless all trees are single node\n trees consisting of only the root node, in which case it will be an\n array of zeros.\n \"\"\"\n self._check_initialized()\n relevant_trees = [tree for stage in self.estimators_ for tree in stage if tree.tree_.node_count > 1]\n if not relevant_trees:\n return np.zeros(shape=self.n_features_in_, dtype=np.float64)\n relevant_feature_importances = [tree.tree_.compute_feature_importances(normalize=False) for tree in relevant_trees]\n avg_feature_importances = np.mean(relevant_feature_importances, axis=0, dtype=np.float64)\n return avg_feature_importances / np.sum(avg_feature_importances)\n \n def _compute_partial_dependence_recursion(self, grid, target_features):\n \"\"\"Fast partial dependence computation.\n\n Parameters\n ----------\n grid : ndarray of shape (n_samples, n_target_features)\n The grid points on which the partial dependence should be\n evaluated.\n target_features : ndarray of shape (n_target_features,)\n The set of target features for which the partial dependence\n should be evaluated.\n\n Returns\n -------\n averaged_predictions : ndarray of shape (n_trees_per_iteration, n_samples)\n The value of the partial dependence function on each grid point.\n \"\"\"\n if self.init is not None:\n warnings.warn('Using recursion method with a non-constant init predictor will lead to incorrect partial dependence values. Got init=%s.' % self.init, UserWarning)\n grid = np.asarray(grid, dtype=DTYPE, order='C')\n (n_estimators, n_trees_per_stage) = self.estimators_.shape\n averaged_predictions = np.zeros((n_trees_per_stage, grid.shape[0]), dtype=np.float64, order='C')\n for stage in range(n_estimators):\n for k in range(n_trees_per_stage):\n tree = self.estimators_[stage, k].tree_\n tree.compute_partial_dependence(grid, target_features, averaged_predictions[k])\n averaged_predictions *= self.learning_rate\n return averaged_predictions\n \n def apply(self, X):\n \"\"\"Apply trees in the ensemble to X, return leaf indices.\n\n .. versionadded:: 0.17\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input samples. Internally, its dtype will be converted to\n ``dtype=np.float32``. If a sparse matrix is provided, it will\n be converted to a sparse ``csr_matrix``.\n\n Returns\n -------\n X_leaves : array-like of shape (n_samples, n_estimators, n_classes)\n For each datapoint x in X and for each tree in the ensemble,\n return the index of the leaf x ends up in each estimator.\n In the case of binary classification n_classes is 1.\n \"\"\"\n self._check_initialized()\n X = self.estimators_[0, 0]._validate_X_predict(X, check_input=True)\n (n_estimators, n_classes) = self.estimators_.shape\n leaves = np.zeros((X.shape[0], n_estimators, n_classes))\n for i in range(n_estimators):\n for j in range(n_classes):\n estimator = self.estimators_[i, j]\n leaves[:, i, j] = estimator.apply(X, check_input=False)\n return leaves\n \n @deprecated('Attribute `n_features_` was deprecated in version 1.0 and will be removed in 1.2. 
Use `n_features_in_` instead.')\n @property\n def n_features_(self):\n return self.n_features_in_\n" }, { "name": "GradientBoostingClassifier", @@ -21056,7 +21122,7 @@ "sklearn.ensemble._gb.GradientBoostingClassifier.staged_predict_proba" ], "is_public": true, - "description": "Gradient Boosting for classification.\n\nGB builds an additive model in a forward stage-wise fashion; it allows for the optimization of arbitrary differentiable loss functions. In each stage ``n_classes_`` regression trees are fit on the negative gradient of the binomial or multinomial deviance loss function. Binary classification is a special case where only a single regression tree is induced. Read more in the :ref:`User Guide `.", + "description": "Gradient Boosting for classification.\n\nGB builds an additive model in a\nforward stage-wise fashion; it allows for the optimization of\narbitrary differentiable loss functions. In each stage ``n_classes_``\nregression trees are fit on the negative gradient of the\nbinomial or multinomial deviance loss function. Binary classification\nis a special case where only a single regression tree is induced.\n\nRead more in the :ref:`User Guide `.", "docstring": "Gradient Boosting for classification.\n\n GB builds an additive model in a\n forward stage-wise fashion; it allows for the optimization of\n arbitrary differentiable loss functions. In each stage ``n_classes_``\n regression trees are fit on the negative gradient of the\n binomial or multinomial deviance loss function. Binary classification\n is a special case where only a single regression tree is induced.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n loss : {'deviance', 'exponential'}, default='deviance'\n The loss function to be optimized. 'deviance' refers to\n deviance (= logistic regression) for classification\n with probabilistic outputs. For loss 'exponential' gradient\n boosting recovers the AdaBoost algorithm.\n\n learning_rate : float, default=0.1\n Learning rate shrinks the contribution of each tree by `learning_rate`.\n There is a trade-off between learning_rate and n_estimators.\n\n n_estimators : int, default=100\n The number of boosting stages to perform. Gradient boosting\n is fairly robust to over-fitting so a large number usually\n results in better performance.\n\n subsample : float, default=1.0\n The fraction of samples to be used for fitting the individual base\n learners. If smaller than 1.0 this results in Stochastic Gradient\n Boosting. `subsample` interacts with the parameter `n_estimators`.\n Choosing `subsample < 1.0` leads to a reduction of variance\n and an increase in bias.\n\n criterion : {'friedman_mse', 'squared_error', 'mse', 'mae'}, default='friedman_mse'\n The function to measure the quality of a split. Supported criteria\n are 'friedman_mse' for the mean squared error with improvement\n score by Friedman, 'squared_error' for mean squared error, and 'mae'\n for the mean absolute error. The default value of 'friedman_mse' is\n generally the best as it can provide a better approximation in some\n cases.\n\n .. versionadded:: 0.18\n\n .. deprecated:: 0.24\n `criterion='mae'` is deprecated and will be removed in version\n 1.1 (renaming of 0.26). Use `criterion='friedman_mse'` or\n `'squared_error'` instead, as trees should use a squared error\n criterion in Gradient Boosting.\n\n .. deprecated:: 1.0\n Criterion 'mse' was deprecated in v1.0 and will be removed in\n version 1.2. 
Use `criterion='squared_error'` which is equivalent.\n\n min_samples_split : int or float, default=2\n The minimum number of samples required to split an internal node:\n\n - If int, then consider `min_samples_split` as the minimum number.\n - If float, then `min_samples_split` is a fraction and\n `ceil(min_samples_split * n_samples)` are the minimum\n number of samples for each split.\n\n .. versionchanged:: 0.18\n Added float values for fractions.\n\n min_samples_leaf : int or float, default=1\n The minimum number of samples required to be at a leaf node.\n A split point at any depth will only be considered if it leaves at\n least ``min_samples_leaf`` training samples in each of the left and\n right branches. This may have the effect of smoothing the model,\n especially in regression.\n\n - If int, then consider `min_samples_leaf` as the minimum number.\n - If float, then `min_samples_leaf` is a fraction and\n `ceil(min_samples_leaf * n_samples)` are the minimum\n number of samples for each node.\n\n .. versionchanged:: 0.18\n Added float values for fractions.\n\n min_weight_fraction_leaf : float, default=0.0\n The minimum weighted fraction of the sum total of weights (of all\n the input samples) required to be at a leaf node. Samples have\n equal weight when sample_weight is not provided.\n\n max_depth : int, default=3\n The maximum depth of the individual regression estimators. The maximum\n depth limits the number of nodes in the tree. Tune this parameter\n for best performance; the best value depends on the interaction\n of the input variables.\n\n min_impurity_decrease : float, default=0.0\n A node will be split if this split induces a decrease of the impurity\n greater than or equal to this value.\n\n The weighted impurity decrease equation is the following::\n\n N_t / N * (impurity - N_t_R / N_t * right_impurity\n - N_t_L / N_t * left_impurity)\n\n where ``N`` is the total number of samples, ``N_t`` is the number of\n samples at the current node, ``N_t_L`` is the number of samples in the\n left child, and ``N_t_R`` is the number of samples in the right child.\n\n ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum,\n if ``sample_weight`` is passed.\n\n .. versionadded:: 0.19\n\n init : estimator or 'zero', default=None\n An estimator object that is used to compute the initial predictions.\n ``init`` has to provide :meth:`fit` and :meth:`predict_proba`. If\n 'zero', the initial raw predictions are set to zero. 
By default, a\n ``DummyEstimator`` predicting the classes priors is used.\n\n random_state : int, RandomState instance or None, default=None\n Controls the random seed given to each Tree estimator at each\n boosting iteration.\n In addition, it controls the random permutation of the features at\n each split (see Notes for more details).\n It also controls the random splitting of the training data to obtain a\n validation set if `n_iter_no_change` is not None.\n Pass an int for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n max_features : {'auto', 'sqrt', 'log2'}, int or float, default=None\n The number of features to consider when looking for the best split:\n\n - If int, then consider `max_features` features at each split.\n - If float, then `max_features` is a fraction and\n `int(max_features * n_features)` features are considered at each\n split.\n - If 'auto', then `max_features=sqrt(n_features)`.\n - If 'sqrt', then `max_features=sqrt(n_features)`.\n - If 'log2', then `max_features=log2(n_features)`.\n - If None, then `max_features=n_features`.\n\n Choosing `max_features < n_features` leads to a reduction of variance\n and an increase in bias.\n\n Note: the search for a split does not stop until at least one\n valid partition of the node samples is found, even if it requires to\n effectively inspect more than ``max_features`` features.\n\n verbose : int, default=0\n Enable verbose output. If 1 then it prints progress and performance\n once in a while (the more trees the lower the frequency). If greater\n than 1 then it prints progress and performance for every tree.\n\n max_leaf_nodes : int, default=None\n Grow trees with ``max_leaf_nodes`` in best-first fashion.\n Best nodes are defined as relative reduction in impurity.\n If None then unlimited number of leaf nodes.\n\n warm_start : bool, default=False\n When set to ``True``, reuse the solution of the previous call to fit\n and add more estimators to the ensemble, otherwise, just erase the\n previous solution. See :term:`the Glossary `.\n\n validation_fraction : float, default=0.1\n The proportion of training data to set aside as validation set for\n early stopping. Must be between 0 and 1.\n Only used if ``n_iter_no_change`` is set to an integer.\n\n .. versionadded:: 0.20\n\n n_iter_no_change : int, default=None\n ``n_iter_no_change`` is used to decide if early stopping will be used\n to terminate training when validation score is not improving. By\n default it is set to None to disable early stopping. If set to a\n number, it will set aside ``validation_fraction`` size of the training\n data as validation and terminate training when validation score is not\n improving in all of the previous ``n_iter_no_change`` numbers of\n iterations. The split is stratified.\n\n .. versionadded:: 0.20\n\n tol : float, default=1e-4\n Tolerance for the early stopping. When the loss is not improving\n by at least tol for ``n_iter_no_change`` iterations (if set to a\n number), the training stops.\n\n .. versionadded:: 0.20\n\n ccp_alpha : non-negative float, default=0.0\n Complexity parameter used for Minimal Cost-Complexity Pruning. The\n subtree with the largest cost complexity that is smaller than\n ``ccp_alpha`` will be chosen. By default, no pruning is performed. See\n :ref:`minimal_cost_complexity_pruning` for details.\n\n .. versionadded:: 0.22\n\n Attributes\n ----------\n n_estimators_ : int\n The number of estimators as selected by early stopping (if\n ``n_iter_no_change`` is specified). 
Otherwise it is set to\n ``n_estimators``.\n\n .. versionadded:: 0.20\n\n feature_importances_ : ndarray of shape (n_features,)\n The impurity-based feature importances.\n The higher, the more important the feature.\n The importance of a feature is computed as the (normalized)\n total reduction of the criterion brought by that feature. It is also\n known as the Gini importance.\n\n Warning: impurity-based feature importances can be misleading for\n high cardinality features (many unique values). See\n :func:`sklearn.inspection.permutation_importance` as an alternative.\n\n oob_improvement_ : ndarray of shape (n_estimators,)\n The improvement in loss (= deviance) on the out-of-bag samples\n relative to the previous iteration.\n ``oob_improvement_[0]`` is the improvement in\n loss of the first stage over the ``init`` estimator.\n Only available if ``subsample < 1.0``\n\n train_score_ : ndarray of shape (n_estimators,)\n The i-th score ``train_score_[i]`` is the deviance (= loss) of the\n model at iteration ``i`` on the in-bag sample.\n If ``subsample == 1`` this is the deviance on the training data.\n\n loss_ : LossFunction\n The concrete ``LossFunction`` object.\n\n init_ : estimator\n The estimator that provides the initial predictions.\n Set via the ``init`` argument or ``loss.init_estimator``.\n\n estimators_ : ndarray of DecisionTreeRegressor of shape (n_estimators, ``loss_.K``)\n The collection of fitted sub-estimators. ``loss_.K`` is 1 for binary\n classification, otherwise n_classes.\n\n classes_ : ndarray of shape (n_classes,)\n The classes labels.\n\n n_features_ : int\n The number of data features.\n\n .. deprecated:: 1.0\n Attribute `n_features_` was deprecated in version 1.0 and will be\n removed in 1.2. Use `n_features_in_` instead.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n n_classes_ : int\n The number of classes.\n\n max_features_ : int\n The inferred value of max_features.\n\n See Also\n --------\n HistGradientBoostingClassifier : Histogram-based Gradient Boosting\n Classification Tree.\n sklearn.tree.DecisionTreeClassifier : A decision tree classifier.\n RandomForestClassifier : A meta-estimator that fits a number of decision\n tree classifiers on various sub-samples of the dataset and uses\n averaging to improve the predictive accuracy and control over-fitting.\n AdaBoostClassifier : A meta-estimator that begins by fitting a classifier\n on the original dataset and then fits additional copies of the\n classifier on the same dataset where the weights of incorrectly\n classified instances are adjusted such that subsequent classifiers\n focus more on difficult cases.\n\n Notes\n -----\n The features are always randomly permuted at each split. Therefore,\n the best found split may vary, even with the same training data and\n ``max_features=n_features``, if the improvement of the criterion is\n identical for several splits enumerated during the search of the best\n split. To obtain a deterministic behaviour during fitting,\n ``random_state`` has to be fixed.\n\n References\n ----------\n J. Friedman, Greedy Function Approximation: A Gradient Boosting\n Machine, The Annals of Statistics, Vol. 29, No. 5, 2001.\n\n J. Friedman, Stochastic Gradient Boosting, 1999\n\n T. Hastie, R. Tibshirani and J. 
Friedman.\n Elements of Statistical Learning Ed. 2, Springer, 2009.\n\n Examples\n --------\n The following example shows how to fit a gradient boosting classifier with\n 100 decision stumps as weak learners.\n\n >>> from sklearn.datasets import make_hastie_10_2\n >>> from sklearn.ensemble import GradientBoostingClassifier\n\n >>> X, y = make_hastie_10_2(random_state=0)\n >>> X_train, X_test = X[:2000], X[2000:]\n >>> y_train, y_test = y[:2000], y[2000:]\n\n >>> clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,\n ... max_depth=1, random_state=0).fit(X_train, y_train)\n >>> clf.score(X_test, y_test)\n 0.913...\n ", "source_code": "\n\nclass GradientBoostingClassifier(ClassifierMixin, BaseGradientBoosting):\n \"\"\"Gradient Boosting for classification.\n\n GB builds an additive model in a\n forward stage-wise fashion; it allows for the optimization of\n arbitrary differentiable loss functions. In each stage ``n_classes_``\n regression trees are fit on the negative gradient of the\n binomial or multinomial deviance loss function. Binary classification\n is a special case where only a single regression tree is induced.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n loss : {'deviance', 'exponential'}, default='deviance'\n The loss function to be optimized. 'deviance' refers to\n deviance (= logistic regression) for classification\n with probabilistic outputs. For loss 'exponential' gradient\n boosting recovers the AdaBoost algorithm.\n\n learning_rate : float, default=0.1\n Learning rate shrinks the contribution of each tree by `learning_rate`.\n There is a trade-off between learning_rate and n_estimators.\n\n n_estimators : int, default=100\n The number of boosting stages to perform. Gradient boosting\n is fairly robust to over-fitting so a large number usually\n results in better performance.\n\n subsample : float, default=1.0\n The fraction of samples to be used for fitting the individual base\n learners. If smaller than 1.0 this results in Stochastic Gradient\n Boosting. `subsample` interacts with the parameter `n_estimators`.\n Choosing `subsample < 1.0` leads to a reduction of variance\n and an increase in bias.\n\n criterion : {'friedman_mse', 'squared_error', 'mse', 'mae'}, default='friedman_mse'\n The function to measure the quality of a split. Supported criteria\n are 'friedman_mse' for the mean squared error with improvement\n score by Friedman, 'squared_error' for mean squared error, and 'mae'\n for the mean absolute error. The default value of 'friedman_mse' is\n generally the best as it can provide a better approximation in some\n cases.\n\n .. versionadded:: 0.18\n\n .. deprecated:: 0.24\n `criterion='mae'` is deprecated and will be removed in version\n 1.1 (renaming of 0.26). Use `criterion='friedman_mse'` or\n `'squared_error'` instead, as trees should use a squared error\n criterion in Gradient Boosting.\n\n .. deprecated:: 1.0\n Criterion 'mse' was deprecated in v1.0 and will be removed in\n version 1.2. Use `criterion='squared_error'` which is equivalent.\n\n min_samples_split : int or float, default=2\n The minimum number of samples required to split an internal node:\n\n - If int, then consider `min_samples_split` as the minimum number.\n - If float, then `min_samples_split` is a fraction and\n `ceil(min_samples_split * n_samples)` are the minimum\n number of samples for each split.\n\n .. 
versionchanged:: 0.18\n Added float values for fractions.\n\n min_samples_leaf : int or float, default=1\n The minimum number of samples required to be at a leaf node.\n A split point at any depth will only be considered if it leaves at\n least ``min_samples_leaf`` training samples in each of the left and\n right branches. This may have the effect of smoothing the model,\n especially in regression.\n\n - If int, then consider `min_samples_leaf` as the minimum number.\n - If float, then `min_samples_leaf` is a fraction and\n `ceil(min_samples_leaf * n_samples)` are the minimum\n number of samples for each node.\n\n .. versionchanged:: 0.18\n Added float values for fractions.\n\n min_weight_fraction_leaf : float, default=0.0\n The minimum weighted fraction of the sum total of weights (of all\n the input samples) required to be at a leaf node. Samples have\n equal weight when sample_weight is not provided.\n\n max_depth : int, default=3\n The maximum depth of the individual regression estimators. The maximum\n depth limits the number of nodes in the tree. Tune this parameter\n for best performance; the best value depends on the interaction\n of the input variables.\n\n min_impurity_decrease : float, default=0.0\n A node will be split if this split induces a decrease of the impurity\n greater than or equal to this value.\n\n The weighted impurity decrease equation is the following::\n\n N_t / N * (impurity - N_t_R / N_t * right_impurity\n - N_t_L / N_t * left_impurity)\n\n where ``N`` is the total number of samples, ``N_t`` is the number of\n samples at the current node, ``N_t_L`` is the number of samples in the\n left child, and ``N_t_R`` is the number of samples in the right child.\n\n ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum,\n if ``sample_weight`` is passed.\n\n .. versionadded:: 0.19\n\n init : estimator or 'zero', default=None\n An estimator object that is used to compute the initial predictions.\n ``init`` has to provide :meth:`fit` and :meth:`predict_proba`. If\n 'zero', the initial raw predictions are set to zero. By default, a\n ``DummyEstimator`` predicting the classes priors is used.\n\n random_state : int, RandomState instance or None, default=None\n Controls the random seed given to each Tree estimator at each\n boosting iteration.\n In addition, it controls the random permutation of the features at\n each split (see Notes for more details).\n It also controls the random splitting of the training data to obtain a\n validation set if `n_iter_no_change` is not None.\n Pass an int for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n max_features : {'auto', 'sqrt', 'log2'}, int or float, default=None\n The number of features to consider when looking for the best split:\n\n - If int, then consider `max_features` features at each split.\n - If float, then `max_features` is a fraction and\n `int(max_features * n_features)` features are considered at each\n split.\n - If 'auto', then `max_features=sqrt(n_features)`.\n - If 'sqrt', then `max_features=sqrt(n_features)`.\n - If 'log2', then `max_features=log2(n_features)`.\n - If None, then `max_features=n_features`.\n\n Choosing `max_features < n_features` leads to a reduction of variance\n and an increase in bias.\n\n Note: the search for a split does not stop until at least one\n valid partition of the node samples is found, even if it requires to\n effectively inspect more than ``max_features`` features.\n\n verbose : int, default=0\n Enable verbose output. 
If 1 then it prints progress and performance\n once in a while (the more trees the lower the frequency). If greater\n than 1 then it prints progress and performance for every tree.\n\n max_leaf_nodes : int, default=None\n Grow trees with ``max_leaf_nodes`` in best-first fashion.\n Best nodes are defined as relative reduction in impurity.\n If None then unlimited number of leaf nodes.\n\n warm_start : bool, default=False\n When set to ``True``, reuse the solution of the previous call to fit\n and add more estimators to the ensemble, otherwise, just erase the\n previous solution. See :term:`the Glossary `.\n\n validation_fraction : float, default=0.1\n The proportion of training data to set aside as validation set for\n early stopping. Must be between 0 and 1.\n Only used if ``n_iter_no_change`` is set to an integer.\n\n .. versionadded:: 0.20\n\n n_iter_no_change : int, default=None\n ``n_iter_no_change`` is used to decide if early stopping will be used\n to terminate training when validation score is not improving. By\n default it is set to None to disable early stopping. If set to a\n number, it will set aside ``validation_fraction`` size of the training\n data as validation and terminate training when validation score is not\n improving in all of the previous ``n_iter_no_change`` numbers of\n iterations. The split is stratified.\n\n .. versionadded:: 0.20\n\n tol : float, default=1e-4\n Tolerance for the early stopping. When the loss is not improving\n by at least tol for ``n_iter_no_change`` iterations (if set to a\n number), the training stops.\n\n .. versionadded:: 0.20\n\n ccp_alpha : non-negative float, default=0.0\n Complexity parameter used for Minimal Cost-Complexity Pruning. The\n subtree with the largest cost complexity that is smaller than\n ``ccp_alpha`` will be chosen. By default, no pruning is performed. See\n :ref:`minimal_cost_complexity_pruning` for details.\n\n .. versionadded:: 0.22\n\n Attributes\n ----------\n n_estimators_ : int\n The number of estimators as selected by early stopping (if\n ``n_iter_no_change`` is specified). Otherwise it is set to\n ``n_estimators``.\n\n .. versionadded:: 0.20\n\n feature_importances_ : ndarray of shape (n_features,)\n The impurity-based feature importances.\n The higher, the more important the feature.\n The importance of a feature is computed as the (normalized)\n total reduction of the criterion brought by that feature. It is also\n known as the Gini importance.\n\n Warning: impurity-based feature importances can be misleading for\n high cardinality features (many unique values). See\n :func:`sklearn.inspection.permutation_importance` as an alternative.\n\n oob_improvement_ : ndarray of shape (n_estimators,)\n The improvement in loss (= deviance) on the out-of-bag samples\n relative to the previous iteration.\n ``oob_improvement_[0]`` is the improvement in\n loss of the first stage over the ``init`` estimator.\n Only available if ``subsample < 1.0``\n\n train_score_ : ndarray of shape (n_estimators,)\n The i-th score ``train_score_[i]`` is the deviance (= loss) of the\n model at iteration ``i`` on the in-bag sample.\n If ``subsample == 1`` this is the deviance on the training data.\n\n loss_ : LossFunction\n The concrete ``LossFunction`` object.\n\n init_ : estimator\n The estimator that provides the initial predictions.\n Set via the ``init`` argument or ``loss.init_estimator``.\n\n estimators_ : ndarray of DecisionTreeRegressor of shape (n_estimators, ``loss_.K``)\n The collection of fitted sub-estimators. 
``loss_.K`` is 1 for binary\n classification, otherwise n_classes.\n\n classes_ : ndarray of shape (n_classes,)\n The classes labels.\n\n n_features_ : int\n The number of data features.\n\n .. deprecated:: 1.0\n Attribute `n_features_` was deprecated in version 1.0 and will be\n removed in 1.2. Use `n_features_in_` instead.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n n_classes_ : int\n The number of classes.\n\n max_features_ : int\n The inferred value of max_features.\n\n See Also\n --------\n HistGradientBoostingClassifier : Histogram-based Gradient Boosting\n Classification Tree.\n sklearn.tree.DecisionTreeClassifier : A decision tree classifier.\n RandomForestClassifier : A meta-estimator that fits a number of decision\n tree classifiers on various sub-samples of the dataset and uses\n averaging to improve the predictive accuracy and control over-fitting.\n AdaBoostClassifier : A meta-estimator that begins by fitting a classifier\n on the original dataset and then fits additional copies of the\n classifier on the same dataset where the weights of incorrectly\n classified instances are adjusted such that subsequent classifiers\n focus more on difficult cases.\n\n Notes\n -----\n The features are always randomly permuted at each split. Therefore,\n the best found split may vary, even with the same training data and\n ``max_features=n_features``, if the improvement of the criterion is\n identical for several splits enumerated during the search of the best\n split. To obtain a deterministic behaviour during fitting,\n ``random_state`` has to be fixed.\n\n References\n ----------\n J. Friedman, Greedy Function Approximation: A Gradient Boosting\n Machine, The Annals of Statistics, Vol. 29, No. 5, 2001.\n\n J. Friedman, Stochastic Gradient Boosting, 1999\n\n T. Hastie, R. Tibshirani and J. Friedman.\n Elements of Statistical Learning Ed. 2, Springer, 2009.\n\n Examples\n --------\n The following example shows how to fit a gradient boosting classifier with\n 100 decision stumps as weak learners.\n\n >>> from sklearn.datasets import make_hastie_10_2\n >>> from sklearn.ensemble import GradientBoostingClassifier\n\n >>> X, y = make_hastie_10_2(random_state=0)\n >>> X_train, X_test = X[:2000], X[2000:]\n >>> y_train, y_test = y[:2000], y[2000:]\n\n >>> clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,\n ... 
max_depth=1, random_state=0).fit(X_train, y_train)\n >>> clf.score(X_test, y_test)\n 0.913...\n \"\"\"\n _SUPPORTED_LOSS = ('deviance', 'exponential')\n \n def __init__(self, *, loss='deviance', learning_rate=0.1, n_estimators=100, subsample=1.0, criterion='friedman_mse', min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_depth=3, min_impurity_decrease=0.0, init=None, random_state=None, max_features=None, verbose=0, max_leaf_nodes=None, warm_start=False, validation_fraction=0.1, n_iter_no_change=None, tol=0.0001, ccp_alpha=0.0):\n super().__init__(loss=loss, learning_rate=learning_rate, n_estimators=n_estimators, criterion=criterion, min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf, min_weight_fraction_leaf=min_weight_fraction_leaf, max_depth=max_depth, init=init, subsample=subsample, max_features=max_features, random_state=random_state, verbose=verbose, max_leaf_nodes=max_leaf_nodes, min_impurity_decrease=min_impurity_decrease, warm_start=warm_start, validation_fraction=validation_fraction, n_iter_no_change=n_iter_no_change, tol=tol, ccp_alpha=ccp_alpha)\n \n def _validate_y(self, y, sample_weight):\n check_classification_targets(y)\n (self.classes_, y) = np.unique(y, return_inverse=True)\n n_trim_classes = np.count_nonzero(np.bincount(y, sample_weight))\n if n_trim_classes < 2:\n raise ValueError('y contains %d class after sample_weight trimmed classes with zero weights, while a minimum of 2 classes are required.' % n_trim_classes)\n self._n_classes = len(self.classes_)\n self.n_classes_ = self._n_classes\n return y\n \n def _warn_mae_for_criterion(self):\n warnings.warn(\"criterion='mae' was deprecated in version 0.24 and will be removed in version 1.1 (renaming of 0.26). Use criterion='friedman_mse' or 'squared_error' instead, as trees should use a squared error criterion in Gradient Boosting.\", FutureWarning)\n \n def decision_function(self, X):\n \"\"\"Compute the decision function of ``X``.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input samples. Internally, it will be converted to\n ``dtype=np.float32`` and if a sparse matrix is provided\n to a sparse ``csr_matrix``.\n\n Returns\n -------\n score : ndarray of shape (n_samples, n_classes) or (n_samples,)\n The decision function of the input samples, which corresponds to\n the raw values predicted from the trees of the ensemble . The\n order of the classes corresponds to that in the attribute\n :term:`classes_`. Regression and binary classification produce an\n array of shape (n_samples,).\n \"\"\"\n X = self._validate_data(X, dtype=DTYPE, order='C', accept_sparse='csr', reset=False)\n raw_predictions = self._raw_predict(X)\n if raw_predictions.shape[1] == 1:\n return raw_predictions.ravel()\n return raw_predictions\n \n def staged_decision_function(self, X):\n \"\"\"Compute decision function of ``X`` for each iteration.\n\n This method allows monitoring (i.e. determine error on testing set)\n after each stage.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input samples. Internally, it will be converted to\n ``dtype=np.float32`` and if a sparse matrix is provided\n to a sparse ``csr_matrix``.\n\n Yields\n ------\n score : generator of ndarray of shape (n_samples, k)\n The decision function of the input samples, which corresponds to\n the raw values predicted from the trees of the ensemble . 
The\n classes corresponds to that in the attribute :term:`classes_`.\n Regression and binary classification are special cases with\n ``k == 1``, otherwise ``k==n_classes``.\n \"\"\"\n yield from self._staged_raw_predict(X)\n \n def predict(self, X):\n \"\"\"Predict class for X.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input samples. Internally, it will be converted to\n ``dtype=np.float32`` and if a sparse matrix is provided\n to a sparse ``csr_matrix``.\n\n Returns\n -------\n y : ndarray of shape (n_samples,)\n The predicted values.\n \"\"\"\n raw_predictions = self.decision_function(X)\n encoded_labels = self.loss_._raw_prediction_to_decision(raw_predictions)\n return self.classes_.take(encoded_labels, axis=0)\n \n def staged_predict(self, X):\n \"\"\"Predict class at each stage for X.\n\n This method allows monitoring (i.e. determine error on testing set)\n after each stage.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input samples. Internally, it will be converted to\n ``dtype=np.float32`` and if a sparse matrix is provided\n to a sparse ``csr_matrix``.\n\n Yields\n -------\n y : generator of ndarray of shape (n_samples,)\n The predicted value of the input samples.\n \"\"\"\n for raw_predictions in self._staged_raw_predict(X):\n encoded_labels = self.loss_._raw_prediction_to_decision(raw_predictions)\n yield self.classes_.take(encoded_labels, axis=0)\n \n def predict_proba(self, X):\n \"\"\"Predict class probabilities for X.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input samples. Internally, it will be converted to\n ``dtype=np.float32`` and if a sparse matrix is provided\n to a sparse ``csr_matrix``.\n\n Returns\n -------\n p : ndarray of shape (n_samples, n_classes)\n The class probabilities of the input samples. The order of the\n classes corresponds to that in the attribute :term:`classes_`.\n\n Raises\n ------\n AttributeError\n If the ``loss`` does not support probabilities.\n \"\"\"\n raw_predictions = self.decision_function(X)\n try:\n return self.loss_._raw_prediction_to_proba(raw_predictions)\n except NotFittedError:\n raise\n except AttributeError as e:\n raise AttributeError('loss=%r does not support predict_proba' % self.loss) from e\n \n def predict_log_proba(self, X):\n \"\"\"Predict class log-probabilities for X.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input samples. Internally, it will be converted to\n ``dtype=np.float32`` and if a sparse matrix is provided\n to a sparse ``csr_matrix``.\n\n Returns\n -------\n p : ndarray of shape (n_samples, n_classes)\n The class log-probabilities of the input samples. The order of the\n classes corresponds to that in the attribute :term:`classes_`.\n\n Raises\n ------\n AttributeError\n If the ``loss`` does not support probabilities.\n \"\"\"\n proba = self.predict_proba(X)\n return np.log(proba)\n \n def staged_predict_proba(self, X):\n \"\"\"Predict class probabilities at each stage for X.\n\n This method allows monitoring (i.e. determine error on testing set)\n after each stage.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input samples. 
Internally, it will be converted to\n ``dtype=np.float32`` and if a sparse matrix is provided\n to a sparse ``csr_matrix``.\n\n Yields\n ------\n y : generator of ndarray of shape (n_samples,)\n The predicted value of the input samples.\n \"\"\"\n try:\n for raw_predictions in self._staged_raw_predict(X):\n yield self.loss_._raw_prediction_to_proba(raw_predictions)\n except NotFittedError:\n raise\n except AttributeError as e:\n raise AttributeError('loss=%r does not support predict_proba' % self.loss) from e\n" }, @@ -21075,7 +21141,7 @@ "sklearn.ensemble._gb.GradientBoostingRegressor.n_classes_@getter" ], "is_public": true, - "description": "Gradient Boosting for regression.\n\nGB builds an additive model in a forward stage-wise fashion; it allows for the optimization of arbitrary differentiable loss functions. In each stage a regression tree is fit on the negative gradient of the given loss function. Read more in the :ref:`User Guide `.", + "description": "Gradient Boosting for regression.\n\nGB builds an additive model in a forward stage-wise fashion;\nit allows for the optimization of arbitrary differentiable loss functions.\nIn each stage a regression tree is fit on the negative gradient of the\ngiven loss function.\n\nRead more in the :ref:`User Guide `.", "docstring": "Gradient Boosting for regression.\n\n GB builds an additive model in a forward stage-wise fashion;\n it allows for the optimization of arbitrary differentiable loss functions.\n In each stage a regression tree is fit on the negative gradient of the\n given loss function.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n loss : {'squared_error', 'absolute_error', 'huber', 'quantile'}, default='squared_error'\n Loss function to be optimized. 'squared_error' refers to the squared\n error for regression. 'absolute_error' refers to the absolute error of\n regression and is a robust loss function. 'huber' is a\n combination of the two. 'quantile' allows quantile regression (use\n `alpha` to specify the quantile).\n\n .. deprecated:: 1.0\n The loss 'ls' was deprecated in v1.0 and will be removed in\n version 1.2. Use `loss='squared_error'` which is equivalent.\n\n .. deprecated:: 1.0\n The loss 'lad' was deprecated in v1.0 and will be removed in\n version 1.2. Use `loss='absolute_error'` which is equivalent.\n\n learning_rate : float, default=0.1\n Learning rate shrinks the contribution of each tree by `learning_rate`.\n There is a trade-off between learning_rate and n_estimators.\n\n n_estimators : int, default=100\n The number of boosting stages to perform. Gradient boosting\n is fairly robust to over-fitting so a large number usually\n results in better performance.\n\n subsample : float, default=1.0\n The fraction of samples to be used for fitting the individual base\n learners. If smaller than 1.0 this results in Stochastic Gradient\n Boosting. `subsample` interacts with the parameter `n_estimators`.\n Choosing `subsample < 1.0` leads to a reduction of variance\n and an increase in bias.\n\n criterion : {'friedman_mse', 'squared_error', 'mse', 'mae'}, default='friedman_mse'\n The function to measure the quality of a split. Supported criteria\n are \"friedman_mse\" for the mean squared error with improvement\n score by Friedman, \"squared_error\" for mean squared error, and \"mae\"\n for the mean absolute error. The default value of \"friedman_mse\" is\n generally the best as it can provide a better approximation in some\n cases.\n\n .. versionadded:: 0.18\n\n .. 
deprecated:: 0.24\n `criterion='mae'` is deprecated and will be removed in version\n 1.1 (renaming of 0.26). The correct way of minimizing the absolute\n error is to use `loss='absolute_error'` instead.\n\n .. deprecated:: 1.0\n Criterion 'mse' was deprecated in v1.0 and will be removed in\n version 1.2. Use `criterion='squared_error'` which is equivalent.\n\n min_samples_split : int or float, default=2\n The minimum number of samples required to split an internal node:\n\n - If int, then consider `min_samples_split` as the minimum number.\n - If float, then `min_samples_split` is a fraction and\n `ceil(min_samples_split * n_samples)` are the minimum\n number of samples for each split.\n\n .. versionchanged:: 0.18\n Added float values for fractions.\n\n min_samples_leaf : int or float, default=1\n The minimum number of samples required to be at a leaf node.\n A split point at any depth will only be considered if it leaves at\n least ``min_samples_leaf`` training samples in each of the left and\n right branches. This may have the effect of smoothing the model,\n especially in regression.\n\n - If int, then consider `min_samples_leaf` as the minimum number.\n - If float, then `min_samples_leaf` is a fraction and\n `ceil(min_samples_leaf * n_samples)` are the minimum\n number of samples for each node.\n\n .. versionchanged:: 0.18\n Added float values for fractions.\n\n min_weight_fraction_leaf : float, default=0.0\n The minimum weighted fraction of the sum total of weights (of all\n the input samples) required to be at a leaf node. Samples have\n equal weight when sample_weight is not provided.\n\n max_depth : int, default=3\n Maximum depth of the individual regression estimators. The maximum\n depth limits the number of nodes in the tree. Tune this parameter\n for best performance; the best value depends on the interaction\n of the input variables.\n\n min_impurity_decrease : float, default=0.0\n A node will be split if this split induces a decrease of the impurity\n greater than or equal to this value.\n\n The weighted impurity decrease equation is the following::\n\n N_t / N * (impurity - N_t_R / N_t * right_impurity\n - N_t_L / N_t * left_impurity)\n\n where ``N`` is the total number of samples, ``N_t`` is the number of\n samples at the current node, ``N_t_L`` is the number of samples in the\n left child, and ``N_t_R`` is the number of samples in the right child.\n\n ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum,\n if ``sample_weight`` is passed.\n\n .. versionadded:: 0.19\n\n init : estimator or 'zero', default=None\n An estimator object that is used to compute the initial predictions.\n ``init`` has to provide :term:`fit` and :term:`predict`. If 'zero', the\n initial raw predictions are set to zero. 
By default a\n ``DummyEstimator`` is used, predicting either the average target value\n (for loss='squared_error'), or a quantile for the other losses.\n\n random_state : int, RandomState instance or None, default=None\n Controls the random seed given to each Tree estimator at each\n boosting iteration.\n In addition, it controls the random permutation of the features at\n each split (see Notes for more details).\n It also controls the random splitting of the training data to obtain a\n validation set if `n_iter_no_change` is not None.\n Pass an int for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n max_features : {'auto', 'sqrt', 'log2'}, int or float, default=None\n The number of features to consider when looking for the best split:\n\n - If int, then consider `max_features` features at each split.\n - If float, then `max_features` is a fraction and\n `int(max_features * n_features)` features are considered at each\n split.\n - If \"auto\", then `max_features=n_features`.\n - If \"sqrt\", then `max_features=sqrt(n_features)`.\n - If \"log2\", then `max_features=log2(n_features)`.\n - If None, then `max_features=n_features`.\n\n Choosing `max_features < n_features` leads to a reduction of variance\n and an increase in bias.\n\n Note: the search for a split does not stop until at least one\n valid partition of the node samples is found, even if it requires to\n effectively inspect more than ``max_features`` features.\n\n alpha : float, default=0.9\n The alpha-quantile of the huber loss function and the quantile\n loss function. Only if ``loss='huber'`` or ``loss='quantile'``.\n\n verbose : int, default=0\n Enable verbose output. If 1 then it prints progress and performance\n once in a while (the more trees the lower the frequency). If greater\n than 1 then it prints progress and performance for every tree.\n\n max_leaf_nodes : int, default=None\n Grow trees with ``max_leaf_nodes`` in best-first fashion.\n Best nodes are defined as relative reduction in impurity.\n If None then unlimited number of leaf nodes.\n\n warm_start : bool, default=False\n When set to ``True``, reuse the solution of the previous call to fit\n and add more estimators to the ensemble, otherwise, just erase the\n previous solution. See :term:`the Glossary `.\n\n validation_fraction : float, default=0.1\n The proportion of training data to set aside as validation set for\n early stopping. Must be between 0 and 1.\n Only used if ``n_iter_no_change`` is set to an integer.\n\n .. versionadded:: 0.20\n\n n_iter_no_change : int, default=None\n ``n_iter_no_change`` is used to decide if early stopping will be used\n to terminate training when validation score is not improving. By\n default it is set to None to disable early stopping. If set to a\n number, it will set aside ``validation_fraction`` size of the training\n data as validation and terminate training when validation score is not\n improving in all of the previous ``n_iter_no_change`` numbers of\n iterations.\n\n .. versionadded:: 0.20\n\n tol : float, default=1e-4\n Tolerance for the early stopping. When the loss is not improving\n by at least tol for ``n_iter_no_change`` iterations (if set to a\n number), the training stops.\n\n .. versionadded:: 0.20\n\n ccp_alpha : non-negative float, default=0.0\n Complexity parameter used for Minimal Cost-Complexity Pruning. The\n subtree with the largest cost complexity that is smaller than\n ``ccp_alpha`` will be chosen. By default, no pruning is performed. 
See\n :ref:`minimal_cost_complexity_pruning` for details.\n\n .. versionadded:: 0.22\n\n Attributes\n ----------\n feature_importances_ : ndarray of shape (n_features,)\n The impurity-based feature importances.\n The higher, the more important the feature.\n The importance of a feature is computed as the (normalized)\n total reduction of the criterion brought by that feature. It is also\n known as the Gini importance.\n\n Warning: impurity-based feature importances can be misleading for\n high cardinality features (many unique values). See\n :func:`sklearn.inspection.permutation_importance` as an alternative.\n\n oob_improvement_ : ndarray of shape (n_estimators,)\n The improvement in loss (= deviance) on the out-of-bag samples\n relative to the previous iteration.\n ``oob_improvement_[0]`` is the improvement in\n loss of the first stage over the ``init`` estimator.\n Only available if ``subsample < 1.0``\n\n train_score_ : ndarray of shape (n_estimators,)\n The i-th score ``train_score_[i]`` is the deviance (= loss) of the\n model at iteration ``i`` on the in-bag sample.\n If ``subsample == 1`` this is the deviance on the training data.\n\n loss_ : LossFunction\n The concrete ``LossFunction`` object.\n\n init_ : estimator\n The estimator that provides the initial predictions.\n Set via the ``init`` argument or ``loss.init_estimator``.\n\n estimators_ : ndarray of DecisionTreeRegressor of shape (n_estimators, 1)\n The collection of fitted sub-estimators.\n\n n_classes_ : int\n The number of classes, set to 1 for regressors.\n\n .. deprecated:: 0.24\n Attribute ``n_classes_`` was deprecated in version 0.24 and\n will be removed in 1.1 (renaming of 0.26).\n\n n_estimators_ : int\n The number of estimators as selected by early stopping (if\n ``n_iter_no_change`` is specified). Otherwise it is set to\n ``n_estimators``.\n\n n_features_ : int\n The number of data features.\n\n .. deprecated:: 1.0\n Attribute `n_features_` was deprecated in version 1.0 and will be\n removed in 1.2. Use `n_features_in_` instead.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n max_features_ : int\n The inferred value of max_features.\n\n See Also\n --------\n HistGradientBoostingRegressor : Histogram-based Gradient Boosting\n Classification Tree.\n sklearn.tree.DecisionTreeRegressor : A decision tree regressor.\n sklearn.ensemble.RandomForestRegressor : A random forest regressor.\n\n Notes\n -----\n The features are always randomly permuted at each split. Therefore,\n the best found split may vary, even with the same training data and\n ``max_features=n_features``, if the improvement of the criterion is\n identical for several splits enumerated during the search of the best\n split. To obtain a deterministic behaviour during fitting,\n ``random_state`` has to be fixed.\n\n References\n ----------\n J. Friedman, Greedy Function Approximation: A Gradient Boosting\n Machine, The Annals of Statistics, Vol. 29, No. 5, 2001.\n\n J. Friedman, Stochastic Gradient Boosting, 1999\n\n T. Hastie, R. Tibshirani and J. Friedman.\n Elements of Statistical Learning Ed. 
2, Springer, 2009.\n\n Examples\n --------\n >>> from sklearn.datasets import make_regression\n >>> from sklearn.ensemble import GradientBoostingRegressor\n >>> from sklearn.model_selection import train_test_split\n >>> X, y = make_regression(random_state=0)\n >>> X_train, X_test, y_train, y_test = train_test_split(\n ... X, y, random_state=0)\n >>> reg = GradientBoostingRegressor(random_state=0)\n >>> reg.fit(X_train, y_train)\n GradientBoostingRegressor(random_state=0)\n >>> reg.predict(X_test[1:2])\n array([-61...])\n >>> reg.score(X_test, y_test)\n 0.4...\n ", "source_code": "\n\nclass GradientBoostingRegressor(RegressorMixin, BaseGradientBoosting):\n \"\"\"Gradient Boosting for regression.\n\n GB builds an additive model in a forward stage-wise fashion;\n it allows for the optimization of arbitrary differentiable loss functions.\n In each stage a regression tree is fit on the negative gradient of the\n given loss function.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n loss : {'squared_error', 'absolute_error', 'huber', 'quantile'}, default='squared_error'\n Loss function to be optimized. 'squared_error' refers to the squared\n error for regression. 'absolute_error' refers to the absolute error of\n regression and is a robust loss function. 'huber' is a\n combination of the two. 'quantile' allows quantile regression (use\n `alpha` to specify the quantile).\n\n .. deprecated:: 1.0\n The loss 'ls' was deprecated in v1.0 and will be removed in\n version 1.2. Use `loss='squared_error'` which is equivalent.\n\n .. deprecated:: 1.0\n The loss 'lad' was deprecated in v1.0 and will be removed in\n version 1.2. Use `loss='absolute_error'` which is equivalent.\n\n learning_rate : float, default=0.1\n Learning rate shrinks the contribution of each tree by `learning_rate`.\n There is a trade-off between learning_rate and n_estimators.\n\n n_estimators : int, default=100\n The number of boosting stages to perform. Gradient boosting\n is fairly robust to over-fitting so a large number usually\n results in better performance.\n\n subsample : float, default=1.0\n The fraction of samples to be used for fitting the individual base\n learners. If smaller than 1.0 this results in Stochastic Gradient\n Boosting. `subsample` interacts with the parameter `n_estimators`.\n Choosing `subsample < 1.0` leads to a reduction of variance\n and an increase in bias.\n\n criterion : {'friedman_mse', 'squared_error', 'mse', 'mae'}, default='friedman_mse'\n The function to measure the quality of a split. Supported criteria\n are \"friedman_mse\" for the mean squared error with improvement\n score by Friedman, \"squared_error\" for mean squared error, and \"mae\"\n for the mean absolute error. The default value of \"friedman_mse\" is\n generally the best as it can provide a better approximation in some\n cases.\n\n .. versionadded:: 0.18\n\n .. deprecated:: 0.24\n `criterion='mae'` is deprecated and will be removed in version\n 1.1 (renaming of 0.26). The correct way of minimizing the absolute\n error is to use `loss='absolute_error'` instead.\n\n .. deprecated:: 1.0\n Criterion 'mse' was deprecated in v1.0 and will be removed in\n version 1.2. 
Use `criterion='squared_error'` which is equivalent.\n\n min_samples_split : int or float, default=2\n The minimum number of samples required to split an internal node:\n\n - If int, then consider `min_samples_split` as the minimum number.\n - If float, then `min_samples_split` is a fraction and\n `ceil(min_samples_split * n_samples)` are the minimum\n number of samples for each split.\n\n .. versionchanged:: 0.18\n Added float values for fractions.\n\n min_samples_leaf : int or float, default=1\n The minimum number of samples required to be at a leaf node.\n A split point at any depth will only be considered if it leaves at\n least ``min_samples_leaf`` training samples in each of the left and\n right branches. This may have the effect of smoothing the model,\n especially in regression.\n\n - If int, then consider `min_samples_leaf` as the minimum number.\n - If float, then `min_samples_leaf` is a fraction and\n `ceil(min_samples_leaf * n_samples)` are the minimum\n number of samples for each node.\n\n .. versionchanged:: 0.18\n Added float values for fractions.\n\n min_weight_fraction_leaf : float, default=0.0\n The minimum weighted fraction of the sum total of weights (of all\n the input samples) required to be at a leaf node. Samples have\n equal weight when sample_weight is not provided.\n\n max_depth : int, default=3\n Maximum depth of the individual regression estimators. The maximum\n depth limits the number of nodes in the tree. Tune this parameter\n for best performance; the best value depends on the interaction\n of the input variables.\n\n min_impurity_decrease : float, default=0.0\n A node will be split if this split induces a decrease of the impurity\n greater than or equal to this value.\n\n The weighted impurity decrease equation is the following::\n\n N_t / N * (impurity - N_t_R / N_t * right_impurity\n - N_t_L / N_t * left_impurity)\n\n where ``N`` is the total number of samples, ``N_t`` is the number of\n samples at the current node, ``N_t_L`` is the number of samples in the\n left child, and ``N_t_R`` is the number of samples in the right child.\n\n ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum,\n if ``sample_weight`` is passed.\n\n .. versionadded:: 0.19\n\n init : estimator or 'zero', default=None\n An estimator object that is used to compute the initial predictions.\n ``init`` has to provide :term:`fit` and :term:`predict`. If 'zero', the\n initial raw predictions are set to zero. 
By default a\n ``DummyEstimator`` is used, predicting either the average target value\n (for loss='squared_error'), or a quantile for the other losses.\n\n random_state : int, RandomState instance or None, default=None\n Controls the random seed given to each Tree estimator at each\n boosting iteration.\n In addition, it controls the random permutation of the features at\n each split (see Notes for more details).\n It also controls the random splitting of the training data to obtain a\n validation set if `n_iter_no_change` is not None.\n Pass an int for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n max_features : {'auto', 'sqrt', 'log2'}, int or float, default=None\n The number of features to consider when looking for the best split:\n\n - If int, then consider `max_features` features at each split.\n - If float, then `max_features` is a fraction and\n `int(max_features * n_features)` features are considered at each\n split.\n - If \"auto\", then `max_features=n_features`.\n - If \"sqrt\", then `max_features=sqrt(n_features)`.\n - If \"log2\", then `max_features=log2(n_features)`.\n - If None, then `max_features=n_features`.\n\n Choosing `max_features < n_features` leads to a reduction of variance\n and an increase in bias.\n\n Note: the search for a split does not stop until at least one\n valid partition of the node samples is found, even if it requires to\n effectively inspect more than ``max_features`` features.\n\n alpha : float, default=0.9\n The alpha-quantile of the huber loss function and the quantile\n loss function. Only if ``loss='huber'`` or ``loss='quantile'``.\n\n verbose : int, default=0\n Enable verbose output. If 1 then it prints progress and performance\n once in a while (the more trees the lower the frequency). If greater\n than 1 then it prints progress and performance for every tree.\n\n max_leaf_nodes : int, default=None\n Grow trees with ``max_leaf_nodes`` in best-first fashion.\n Best nodes are defined as relative reduction in impurity.\n If None then unlimited number of leaf nodes.\n\n warm_start : bool, default=False\n When set to ``True``, reuse the solution of the previous call to fit\n and add more estimators to the ensemble, otherwise, just erase the\n previous solution. See :term:`the Glossary `.\n\n validation_fraction : float, default=0.1\n The proportion of training data to set aside as validation set for\n early stopping. Must be between 0 and 1.\n Only used if ``n_iter_no_change`` is set to an integer.\n\n .. versionadded:: 0.20\n\n n_iter_no_change : int, default=None\n ``n_iter_no_change`` is used to decide if early stopping will be used\n to terminate training when validation score is not improving. By\n default it is set to None to disable early stopping. If set to a\n number, it will set aside ``validation_fraction`` size of the training\n data as validation and terminate training when validation score is not\n improving in all of the previous ``n_iter_no_change`` numbers of\n iterations.\n\n .. versionadded:: 0.20\n\n tol : float, default=1e-4\n Tolerance for the early stopping. When the loss is not improving\n by at least tol for ``n_iter_no_change`` iterations (if set to a\n number), the training stops.\n\n .. versionadded:: 0.20\n\n ccp_alpha : non-negative float, default=0.0\n Complexity parameter used for Minimal Cost-Complexity Pruning. The\n subtree with the largest cost complexity that is smaller than\n ``ccp_alpha`` will be chosen. By default, no pruning is performed. 
See\n :ref:`minimal_cost_complexity_pruning` for details.\n\n .. versionadded:: 0.22\n\n Attributes\n ----------\n feature_importances_ : ndarray of shape (n_features,)\n The impurity-based feature importances.\n The higher, the more important the feature.\n The importance of a feature is computed as the (normalized)\n total reduction of the criterion brought by that feature. It is also\n known as the Gini importance.\n\n Warning: impurity-based feature importances can be misleading for\n high cardinality features (many unique values). See\n :func:`sklearn.inspection.permutation_importance` as an alternative.\n\n oob_improvement_ : ndarray of shape (n_estimators,)\n The improvement in loss (= deviance) on the out-of-bag samples\n relative to the previous iteration.\n ``oob_improvement_[0]`` is the improvement in\n loss of the first stage over the ``init`` estimator.\n Only available if ``subsample < 1.0``\n\n train_score_ : ndarray of shape (n_estimators,)\n The i-th score ``train_score_[i]`` is the deviance (= loss) of the\n model at iteration ``i`` on the in-bag sample.\n If ``subsample == 1`` this is the deviance on the training data.\n\n loss_ : LossFunction\n The concrete ``LossFunction`` object.\n\n init_ : estimator\n The estimator that provides the initial predictions.\n Set via the ``init`` argument or ``loss.init_estimator``.\n\n estimators_ : ndarray of DecisionTreeRegressor of shape (n_estimators, 1)\n The collection of fitted sub-estimators.\n\n n_classes_ : int\n The number of classes, set to 1 for regressors.\n\n .. deprecated:: 0.24\n Attribute ``n_classes_`` was deprecated in version 0.24 and\n will be removed in 1.1 (renaming of 0.26).\n\n n_estimators_ : int\n The number of estimators as selected by early stopping (if\n ``n_iter_no_change`` is specified). Otherwise it is set to\n ``n_estimators``.\n\n n_features_ : int\n The number of data features.\n\n .. deprecated:: 1.0\n Attribute `n_features_` was deprecated in version 1.0 and will be\n removed in 1.2. Use `n_features_in_` instead.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n max_features_ : int\n The inferred value of max_features.\n\n See Also\n --------\n HistGradientBoostingRegressor : Histogram-based Gradient Boosting\n Classification Tree.\n sklearn.tree.DecisionTreeRegressor : A decision tree regressor.\n sklearn.ensemble.RandomForestRegressor : A random forest regressor.\n\n Notes\n -----\n The features are always randomly permuted at each split. Therefore,\n the best found split may vary, even with the same training data and\n ``max_features=n_features``, if the improvement of the criterion is\n identical for several splits enumerated during the search of the best\n split. To obtain a deterministic behaviour during fitting,\n ``random_state`` has to be fixed.\n\n References\n ----------\n J. Friedman, Greedy Function Approximation: A Gradient Boosting\n Machine, The Annals of Statistics, Vol. 29, No. 5, 2001.\n\n J. Friedman, Stochastic Gradient Boosting, 1999\n\n T. Hastie, R. Tibshirani and J. Friedman.\n Elements of Statistical Learning Ed. 
2, Springer, 2009.\n\n Examples\n --------\n >>> from sklearn.datasets import make_regression\n >>> from sklearn.ensemble import GradientBoostingRegressor\n >>> from sklearn.model_selection import train_test_split\n >>> X, y = make_regression(random_state=0)\n >>> X_train, X_test, y_train, y_test = train_test_split(\n ... X, y, random_state=0)\n >>> reg = GradientBoostingRegressor(random_state=0)\n >>> reg.fit(X_train, y_train)\n GradientBoostingRegressor(random_state=0)\n >>> reg.predict(X_test[1:2])\n array([-61...])\n >>> reg.score(X_test, y_test)\n 0.4...\n \"\"\"\n _SUPPORTED_LOSS = ('squared_error', 'ls', 'absolute_error', 'lad', 'huber', 'quantile')\n \n def __init__(self, *, loss='squared_error', learning_rate=0.1, n_estimators=100, subsample=1.0, criterion='friedman_mse', min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_depth=3, min_impurity_decrease=0.0, init=None, random_state=None, max_features=None, alpha=0.9, verbose=0, max_leaf_nodes=None, warm_start=False, validation_fraction=0.1, n_iter_no_change=None, tol=0.0001, ccp_alpha=0.0):\n super().__init__(loss=loss, learning_rate=learning_rate, n_estimators=n_estimators, criterion=criterion, min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf, min_weight_fraction_leaf=min_weight_fraction_leaf, max_depth=max_depth, init=init, subsample=subsample, max_features=max_features, min_impurity_decrease=min_impurity_decrease, random_state=random_state, alpha=alpha, verbose=verbose, max_leaf_nodes=max_leaf_nodes, warm_start=warm_start, validation_fraction=validation_fraction, n_iter_no_change=n_iter_no_change, tol=tol, ccp_alpha=ccp_alpha)\n \n def _validate_y(self, y, sample_weight=None):\n if y.dtype.kind == 'O':\n y = y.astype(DOUBLE)\n return y\n \n def _warn_mae_for_criterion(self):\n warnings.warn(\"criterion='mae' was deprecated in version 0.24 and will be removed in version 1.1 (renaming of 0.26). The correct way of minimizing the absolute error is to use loss='absolute_error' instead.\", FutureWarning)\n \n def predict(self, X):\n \"\"\"Predict regression target for X.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input samples. Internally, it will be converted to\n ``dtype=np.float32`` and if a sparse matrix is provided\n to a sparse ``csr_matrix``.\n\n Returns\n -------\n y : ndarray of shape (n_samples,)\n The predicted values.\n \"\"\"\n X = self._validate_data(X, dtype=DTYPE, order='C', accept_sparse='csr', reset=False)\n return self._raw_predict(X).ravel()\n \n def staged_predict(self, X):\n \"\"\"Predict regression target at each stage for X.\n\n This method allows monitoring (i.e. determine error on testing set)\n after each stage.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input samples. Internally, it will be converted to\n ``dtype=np.float32`` and if a sparse matrix is provided\n to a sparse ``csr_matrix``.\n\n Yields\n ------\n y : generator of ndarray of shape (n_samples,)\n The predicted value of the input samples.\n \"\"\"\n for raw_predictions in self._staged_raw_predict(X):\n yield raw_predictions.ravel()\n \n def apply(self, X):\n \"\"\"Apply trees in the ensemble to X, return leaf indices.\n\n .. versionadded:: 0.17\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input samples. Internally, its dtype will be converted to\n ``dtype=np.float32``. 
If a sparse matrix is provided, it will\n be converted to a sparse ``csr_matrix``.\n\n Returns\n -------\n X_leaves : array-like of shape (n_samples, n_estimators)\n For each datapoint x in X and for each tree in the ensemble,\n return the index of the leaf x ends up in each estimator.\n \"\"\"\n leaves = super().apply(X)\n leaves = leaves.reshape(X.shape[0], self.estimators_.shape[0])\n return leaves\n \n @deprecated('Attribute `n_classes_` was deprecated in version 0.24 and will be removed in 1.1 (renaming of 0.26).')\n @property\n def n_classes_(self):\n try:\n check_is_fitted(self)\n except NotFittedError as nfe:\n raise AttributeError('{} object has no n_classes_ attribute.'.format(self.__class__.__name__)) from nfe\n return 1\n" }, @@ -21110,7 +21176,7 @@ "sklearn.ensemble._gb_losses.BinomialDeviance.get_init_raw_predictions" ], "is_public": false, - "description": "Binomial deviance loss function for binary classification.\n\nBinary classification is a special case; here, we only need to fit one tree instead of ``n_classes`` trees.", + "description": "Binomial deviance loss function for binary classification.\n\nBinary classification is a special case; here, we only need to\nfit one tree instead of ``n_classes`` trees.", "docstring": "Binomial deviance loss function for binary classification.\n\n Binary classification is a special case; here, we only need to\n fit one tree instead of ``n_classes`` trees.\n\n Parameters\n ----------\n n_classes : int\n Number of classes.\n ", "source_code": "\n\nclass BinomialDeviance(ClassificationLossFunction):\n \"\"\"Binomial deviance loss function for binary classification.\n\n Binary classification is a special case; here, we only need to\n fit one tree instead of ``n_classes`` trees.\n\n Parameters\n ----------\n n_classes : int\n Number of classes.\n \"\"\"\n \n def __init__(self, n_classes):\n if n_classes != 2:\n raise ValueError('{0:s} requires 2 classes; got {1:d} class(es)'.format(self.__class__.__name__, n_classes))\n super().__init__(n_classes=1)\n \n def init_estimator(self):\n return DummyClassifier(strategy='prior')\n \n def __call__(self, y, raw_predictions, sample_weight=None):\n \"\"\"Compute the deviance (= 2 * negative log-likelihood).\n\n Parameters\n ----------\n y : ndarray of shape (n_samples,)\n True labels.\n\n raw_predictions : ndarray of shape (n_samples, K)\n The raw predictions (i.e. values from the tree leaves) of the\n tree ensemble.\n\n sample_weight : ndarray of shape (n_samples,), default=None\n Sample weights.\n \"\"\"\n raw_predictions = raw_predictions.ravel()\n if sample_weight is None:\n return -2 * np.mean(y * raw_predictions - np.logaddexp(0, raw_predictions))\n else:\n return -2 / sample_weight.sum() * np.sum(sample_weight * (y * raw_predictions - np.logaddexp(0, raw_predictions)))\n \n def negative_gradient(self, y, raw_predictions, **kargs):\n \"\"\"Compute half of the negative gradient.\n\n Parameters\n ----------\n y : ndarray of shape (n_samples,)\n True labels.\n\n raw_predictions : ndarray of shape (n_samples, K)\n The raw predictions (i.e. 
values from the tree leaves) of the\n tree ensemble at iteration ``i - 1``.\n \"\"\"\n return y - expit(raw_predictions.ravel())\n \n def _update_terminal_region(self, tree, terminal_regions, leaf, X, y, residual, raw_predictions, sample_weight):\n \"\"\"Make a single Newton-Raphson step.\n\n our node estimate is given by:\n\n sum(w * (y - prob)) / sum(w * prob * (1 - prob))\n\n we take advantage that: y - prob = residual\n \"\"\"\n terminal_region = np.where(terminal_regions == leaf)[0]\n residual = residual.take(terminal_region, axis=0)\n y = y.take(terminal_region, axis=0)\n sample_weight = sample_weight.take(terminal_region, axis=0)\n numerator = np.sum(sample_weight * residual)\n denominator = np.sum(sample_weight * (y - residual) * (1 - y + residual))\n if abs(denominator) < 1e-150:\n tree.value[leaf, 0, 0] = 0.0\n else:\n tree.value[leaf, 0, 0] = numerator / denominator\n \n def _raw_prediction_to_proba(self, raw_predictions):\n proba = np.ones((raw_predictions.shape[0], 2), dtype=np.float64)\n proba[:, 1] = expit(raw_predictions.ravel())\n proba[:, 0] -= proba[:, 1]\n return proba\n \n def _raw_prediction_to_decision(self, raw_predictions):\n proba = self._raw_prediction_to_proba(raw_predictions)\n return np.argmax(proba, axis=1)\n \n def get_init_raw_predictions(self, X, estimator):\n probas = estimator.predict_proba(X)\n proba_pos_class = probas[:, 1]\n eps = np.finfo(np.float32).eps\n proba_pos_class = np.clip(proba_pos_class, eps, 1 - eps)\n raw_predictions = np.log(proba_pos_class / (1 - proba_pos_class))\n return raw_predictions.reshape(-1, 1).astype(np.float64)\n" }, @@ -21147,7 +21213,7 @@ "is_public": false, "description": "Exponential loss function for binary classification.\n\nSame loss as AdaBoost.", "docstring": "Exponential loss function for binary classification.\n\n Same loss as AdaBoost.\n\n Parameters\n ----------\n n_classes : int\n Number of classes.\n\n References\n ----------\n Greg Ridgeway, Generalized Boosted Models: A guide to the gbm package, 2007\n ", - "source_code": "\n\nclass ExponentialLoss(ClassificationLossFunction):\n \"\"\"Exponential loss function for binary classification.\n\n Same loss as AdaBoost.\n\n Parameters\n ----------\n n_classes : int\n Number of classes.\n\n References\n ----------\n Greg Ridgeway, Generalized Boosted Models: A guide to the gbm package, 2007\n \"\"\"\n \n def __init__(self, n_classes):\n if n_classes != 2:\n raise ValueError('{0:s} requires 2 classes; got {1:d} class(es)'.format(self.__class__.__name__, n_classes))\n super().__init__(n_classes=1)\n \n def init_estimator(self):\n return DummyClassifier(strategy='prior')\n \n def __call__(self, y, raw_predictions, sample_weight=None):\n \"\"\"Compute the exponential loss\n\n Parameters\n ----------\n y : ndarray of shape (n_samples,)\n True labels.\n\n raw_predictions : ndarray of shape (n_samples, K)\n The raw predictions (i.e. 
values from the tree leaves) of the\n tree ensemble.\n\n sample_weight : ndarray of shape (n_samples,), default=None\n Sample weights.\n \"\"\"\n raw_predictions = raw_predictions.ravel()\n if sample_weight is None:\n return np.mean(np.exp(-(2.0 * y - 1.0) * raw_predictions))\n else:\n return 1.0 / sample_weight.sum() * np.sum(sample_weight * np.exp(-(2 * y - 1) * raw_predictions))\n \n def negative_gradient(self, y, raw_predictions, **kargs):\n \"\"\"Compute the residual (= negative gradient).\n\n Parameters\n ----------\n y : ndarray of shape (n_samples,)\n True labels.\n\n raw_predictions : ndarray of shape (n_samples, K)\n The raw predictions (i.e. values from the tree leaves) of the\n tree ensemble at iteration ``i - 1``.\n \"\"\"\n y_ = -(2.0 * y - 1.0)\n return y_ * np.exp(y_ * raw_predictions.ravel())\n \n def _update_terminal_region(self, tree, terminal_regions, leaf, X, y, residual, raw_predictions, sample_weight):\n terminal_region = np.where(terminal_regions == leaf)[0]\n raw_predictions = raw_predictions.take(terminal_region, axis=0)\n y = y.take(terminal_region, axis=0)\n sample_weight = sample_weight.take(terminal_region, axis=0)\n y_ = 2.0 * y - 1.0\n numerator = np.sum(y_ * sample_weight * np.exp(-y_ * raw_predictions))\n denominator = np.sum(sample_weight * np.exp(-y_ * raw_predictions))\n if abs(denominator) < 1e-150:\n tree.value[leaf, 0, 0] = 0.0\n else:\n tree.value[leaf, 0, 0] = numerator / denominator\n \n def _raw_prediction_to_proba(self, raw_predictions):\n proba = np.ones((raw_predictions.shape[0], 2), dtype=np.float64)\n proba[:, 1] = expit(2.0 * raw_predictions.ravel())\n proba[:, 0] -= proba[:, 1]\n return proba\n \n def _raw_prediction_to_decision(self, raw_predictions):\n return (raw_predictions.ravel() >= 0).astype(int)\n \n def get_init_raw_predictions(self, X, estimator):\n probas = estimator.predict_proba(X)\n proba_pos_class = probas[:, 1]\n eps = np.finfo(np.float32).eps\n proba_pos_class = np.clip(proba_pos_class, eps, 1 - eps)\n raw_predictions = 0.5 * np.log(proba_pos_class / (1 - proba_pos_class))\n return raw_predictions.reshape(-1, 1).astype(np.float64)\n" + "source_code": "\n\nclass ExponentialLoss(ClassificationLossFunction):\n \"\"\"Exponential loss function for binary classification.\n\n Same loss as AdaBoost.\n\n Parameters\n ----------\n n_classes : int\n Number of classes.\n\n References\n ----------\n Greg Ridgeway, Generalized Boosted Models: A guide to the gbm package, 2007\n \"\"\"\n \n def __init__(self, n_classes):\n if n_classes != 2:\n raise ValueError('{0:s} requires 2 classes; got {1:d} class(es)'.format(self.__class__.__name__, n_classes))\n super().__init__(n_classes=1)\n \n def init_estimator(self):\n return DummyClassifier(strategy='prior')\n \n def __call__(self, y, raw_predictions, sample_weight=None):\n \"\"\"Compute the exponential loss\n\n Parameters\n ----------\n y : ndarray of shape (n_samples,)\n True labels.\n\n raw_predictions : ndarray of shape (n_samples, K)\n The raw predictions (i.e. 
values from the tree leaves) of the\n tree ensemble.\n\n sample_weight : ndarray of shape (n_samples,), default=None\n Sample weights.\n \"\"\"\n raw_predictions = raw_predictions.ravel()\n if sample_weight is None:\n return np.mean(np.exp(-(2.0 * y - 1.0) * raw_predictions))\n else:\n return 1.0 / sample_weight.sum() * np.sum(sample_weight * np.exp(-(2 * y - 1) * raw_predictions))\n \n def negative_gradient(self, y, raw_predictions, **kargs):\n \"\"\"Compute the residual (= negative gradient).\n\n Parameters\n ----------\n y : ndarray of shape (n_samples,)\n True labels.\n\n raw_predictions : ndarray of shape (n_samples, K)\n The raw predictions (i.e. values from the tree leaves) of the\n tree ensemble at iteration ``i - 1``.\n \"\"\"\n y_ = 2.0 * y - 1.0\n return y_ * np.exp(-y_ * raw_predictions.ravel())\n \n def _update_terminal_region(self, tree, terminal_regions, leaf, X, y, residual, raw_predictions, sample_weight):\n terminal_region = np.where(terminal_regions == leaf)[0]\n raw_predictions = raw_predictions.take(terminal_region, axis=0)\n y = y.take(terminal_region, axis=0)\n sample_weight = sample_weight.take(terminal_region, axis=0)\n y_ = 2.0 * y - 1.0\n numerator = np.sum(y_ * sample_weight * np.exp(-y_ * raw_predictions))\n denominator = np.sum(sample_weight * np.exp(-y_ * raw_predictions))\n if abs(denominator) < 1e-150:\n tree.value[leaf, 0, 0] = 0.0\n else:\n tree.value[leaf, 0, 0] = numerator / denominator\n \n def _raw_prediction_to_proba(self, raw_predictions):\n proba = np.ones((raw_predictions.shape[0], 2), dtype=np.float64)\n proba[:, 1] = expit(2.0 * raw_predictions.ravel())\n proba[:, 0] -= proba[:, 1]\n return proba\n \n def _raw_prediction_to_decision(self, raw_predictions):\n return (raw_predictions.ravel() >= 0).astype(int)\n \n def get_init_raw_predictions(self, X, estimator):\n probas = estimator.predict_proba(X)\n proba_pos_class = probas[:, 1]\n eps = np.finfo(np.float32).eps\n proba_pos_class = np.clip(proba_pos_class, eps, 1 - eps)\n raw_predictions = 0.5 * np.log(proba_pos_class / (1 - proba_pos_class))\n return raw_predictions.reshape(-1, 1).astype(np.float64)\n" }, { "name": "HuberLossFunction", @@ -21195,7 +21261,7 @@ "sklearn.ensemble._gb_losses.LeastSquaresError._update_terminal_region" ], "is_public": false, - "description": "Loss function for least squares (LS) estimation. Terminal regions do not need to be updated for least squares.", + "description": "Loss function for least squares (LS) estimation.\nTerminal regions do not need to be updated for least squares.", "docstring": "Loss function for least squares (LS) estimation.\n Terminal regions do not need to be updated for least squares.\n\n Parameters\n ----------\n n_classes : int\n Number of classes.\n ", "source_code": "\n\nclass LeastSquaresError(RegressionLossFunction):\n \"\"\"Loss function for least squares (LS) estimation.\n Terminal regions do not need to be updated for least squares.\n\n Parameters\n ----------\n n_classes : int\n Number of classes.\n \"\"\"\n \n def init_estimator(self):\n return DummyRegressor(strategy='mean')\n \n def __call__(self, y, raw_predictions, sample_weight=None):\n \"\"\"Compute the least squares loss.\n\n Parameters\n ----------\n y : ndarray of shape (n_samples,)\n True labels.\n\n raw_predictions : ndarray of shape (n_samples, K)\n The raw predictions (i.e. 
values from the tree leaves).\n\n sample_weight : ndarray of shape (n_samples,), default=None\n Sample weights.\n \"\"\"\n if sample_weight is None:\n return np.mean((y - raw_predictions.ravel())**2)\n else:\n return 1 / sample_weight.sum() * np.sum(sample_weight * (y - raw_predictions.ravel())**2)\n \n def negative_gradient(self, y, raw_predictions, **kargs):\n \"\"\"Compute half of the negative gradient.\n\n Parameters\n ----------\n y : ndarray of shape (n_samples,)\n The target labels.\n\n raw_predictions : ndarray of shape (n_samples,)\n The raw predictions (i.e. values from the tree leaves) of the\n tree ensemble at iteration ``i - 1``.\n \"\"\"\n return y - raw_predictions.ravel()\n \n def update_terminal_regions(self, tree, X, y, residual, raw_predictions, sample_weight, sample_mask, learning_rate=0.1, k=0):\n \"\"\"Least squares does not need to update terminal regions.\n\n But it has to update the predictions.\n\n Parameters\n ----------\n tree : tree.Tree\n The tree object.\n X : ndarray of shape (n_samples, n_features)\n The data array.\n y : ndarray of shape (n_samples,)\n The target labels.\n residual : ndarray of shape (n_samples,)\n The residuals (usually the negative gradient).\n raw_predictions : ndarray of shape (n_samples, K)\n The raw predictions (i.e. values from the tree leaves) of the\n tree ensemble at iteration ``i - 1``.\n sample_weight : ndarray of shape (n,)\n The weight of each sample.\n sample_mask : ndarray of shape (n,)\n The sample mask to be used.\n learning_rate : float, default=0.1\n Learning rate shrinks the contribution of each tree by\n ``learning_rate``.\n k : int, default=0\n The index of the estimator being updated.\n \"\"\"\n raw_predictions[:, k] += learning_rate * tree.predict(X).ravel()\n \n def _update_terminal_region(self, tree, terminal_regions, leaf, X, y, residual, raw_predictions, sample_weight):\n pass\n" }, @@ -21234,7 +21300,7 @@ "sklearn.ensemble._gb_losses.MultinomialDeviance.get_init_raw_predictions" ], "is_public": false, - "description": "Multinomial deviance loss function for multi-class classification.\n\nFor multi-class classification we need to fit ``n_classes`` trees at each stage.", + "description": "Multinomial deviance loss function for multi-class classification.\n\nFor multi-class classification we need to fit ``n_classes`` trees at\neach stage.", "docstring": "Multinomial deviance loss function for multi-class classification.\n\n For multi-class classification we need to fit ``n_classes`` trees at\n each stage.\n\n Parameters\n ----------\n n_classes : int\n Number of classes.\n ", "source_code": "\n\nclass MultinomialDeviance(ClassificationLossFunction):\n \"\"\"Multinomial deviance loss function for multi-class classification.\n\n For multi-class classification we need to fit ``n_classes`` trees at\n each stage.\n\n Parameters\n ----------\n n_classes : int\n Number of classes.\n \"\"\"\n is_multi_class = True\n \n def __init__(self, n_classes):\n if n_classes < 3:\n raise ValueError('{0:s} requires more than 2 classes.'.format(self.__class__.__name__))\n super().__init__(n_classes)\n \n def init_estimator(self):\n return DummyClassifier(strategy='prior')\n \n def __call__(self, y, raw_predictions, sample_weight=None):\n \"\"\"Compute the Multinomial deviance.\n\n Parameters\n ----------\n y : ndarray of shape (n_samples,)\n True labels.\n\n raw_predictions : ndarray of shape (n_samples, K)\n The raw predictions (i.e. 
values from the tree leaves) of the\n tree ensemble.\n\n sample_weight : ndarray of shape (n_samples,), default=None\n Sample weights.\n \"\"\"\n Y = np.zeros((y.shape[0], self.K), dtype=np.float64)\n for k in range(self.K):\n Y[:, k] = y == k\n return np.average(-1 * (Y * raw_predictions).sum(axis=1) + logsumexp(raw_predictions, axis=1), weights=sample_weight)\n \n def negative_gradient(self, y, raw_predictions, k=0, **kwargs):\n \"\"\"Compute negative gradient for the ``k``-th class.\n\n Parameters\n ----------\n y : ndarray of shape (n_samples,)\n The target labels.\n\n raw_predictions : ndarray of shape (n_samples, K)\n The raw predictions (i.e. values from the tree leaves) of the\n tree ensemble at iteration ``i - 1``.\n\n k : int, default=0\n The index of the class.\n \"\"\"\n return y - np.nan_to_num(np.exp(raw_predictions[:, k] - logsumexp(raw_predictions, axis=1)))\n \n def _update_terminal_region(self, tree, terminal_regions, leaf, X, y, residual, raw_predictions, sample_weight):\n \"\"\"Make a single Newton-Raphson step.\"\"\"\n terminal_region = np.where(terminal_regions == leaf)[0]\n residual = residual.take(terminal_region, axis=0)\n y = y.take(terminal_region, axis=0)\n sample_weight = sample_weight.take(terminal_region, axis=0)\n numerator = np.sum(sample_weight * residual)\n numerator *= (self.K - 1) / self.K\n denominator = np.sum(sample_weight * (y - residual) * (1 - y + residual))\n if abs(denominator) < 1e-150:\n tree.value[leaf, 0, 0] = 0.0\n else:\n tree.value[leaf, 0, 0] = numerator / denominator\n \n def _raw_prediction_to_proba(self, raw_predictions):\n return np.nan_to_num(np.exp(raw_predictions - logsumexp(raw_predictions, axis=1)[:, np.newaxis]))\n \n def _raw_prediction_to_decision(self, raw_predictions):\n proba = self._raw_prediction_to_proba(raw_predictions)\n return np.argmax(proba, axis=1)\n \n def get_init_raw_predictions(self, X, estimator):\n probas = estimator.predict_proba(X)\n eps = np.finfo(np.float32).eps\n probas = np.clip(probas, eps, 1 - eps)\n raw_predictions = np.log(probas).astype(np.float64)\n return raw_predictions\n" }, @@ -21251,7 +21317,7 @@ "sklearn.ensemble._gb_losses.QuantileLossFunction._update_terminal_region" ], "is_public": false, - "description": "Loss function for quantile regression.\n\nQuantile regression allows to estimate the percentiles of the conditional distribution of the target.", + "description": "Loss function for quantile regression.\n\nQuantile regression allows to estimate the percentiles\nof the conditional distribution of the target.", "docstring": "Loss function for quantile regression.\n\n Quantile regression allows to estimate the percentiles\n of the conditional distribution of the target.\n\n Parameters\n ----------\n alpha : float, default=0.9\n The percentile.\n ", "source_code": "\n\nclass QuantileLossFunction(RegressionLossFunction):\n \"\"\"Loss function for quantile regression.\n\n Quantile regression allows to estimate the percentiles\n of the conditional distribution of the target.\n\n Parameters\n ----------\n alpha : float, default=0.9\n The percentile.\n \"\"\"\n \n def __init__(self, alpha=0.9):\n super().__init__()\n self.alpha = alpha\n self.percentile = alpha * 100\n \n def init_estimator(self):\n return DummyRegressor(strategy='quantile', quantile=self.alpha)\n \n def __call__(self, y, raw_predictions, sample_weight=None):\n \"\"\"Compute the Quantile loss.\n\n Parameters\n ----------\n y : ndarray of shape (n_samples,)\n True labels.\n\n raw_predictions : ndarray of shape (n_samples, K)\n 
The raw predictions (i.e. values from the tree leaves) of the\n tree ensemble.\n\n sample_weight : ndarray of shape (n_samples,), default=None\n Sample weights.\n \"\"\"\n raw_predictions = raw_predictions.ravel()\n diff = y - raw_predictions\n alpha = self.alpha\n mask = y > raw_predictions\n if sample_weight is None:\n loss = (alpha * diff[mask].sum() - (1 - alpha) * diff[~mask].sum()) / y.shape[0]\n else:\n loss = (alpha * np.sum(sample_weight[mask] * diff[mask]) - (1 - alpha) * np.sum(sample_weight[~mask] * diff[~mask])) / sample_weight.sum()\n return loss\n \n def negative_gradient(self, y, raw_predictions, **kargs):\n \"\"\"Compute the negative gradient.\n\n Parameters\n ----------\n y : ndarray of shape (n_samples,)\n The target labels.\n\n raw_predictions : ndarray of shape (n_samples, K)\n The raw predictions (i.e. values from the tree leaves) of the\n tree ensemble at iteration ``i - 1``.\n \"\"\"\n alpha = self.alpha\n raw_predictions = raw_predictions.ravel()\n mask = y > raw_predictions\n return alpha * mask - (1 - alpha) * ~mask\n \n def _update_terminal_region(self, tree, terminal_regions, leaf, X, y, residual, raw_predictions, sample_weight):\n terminal_region = np.where(terminal_regions == leaf)[0]\n diff = y.take(terminal_region, axis=0) - raw_predictions.take(terminal_region, axis=0)\n sample_weight = sample_weight.take(terminal_region, axis=0)\n val = _weighted_percentile(diff, sample_weight, self.percentile)\n tree.value[leaf, 0] = val\n" }, @@ -21282,7 +21348,7 @@ "sklearn.ensemble._hist_gradient_boosting.binning._BinMapper.make_known_categories_bitsets" ], "is_public": false, - "description": "Transformer that maps a dataset into integer-valued bins.\n\nFor continuous features, the bins are created in a feature-wise fashion, using quantiles so that each bins contains approximately the same number of samples. For large datasets, quantiles are computed on a subset of the data to speed-up the binning, but the quantiles should remain stable. For categorical features, the raw categorical values are expected to be in [0, 254] (this is not validated here though) and each category corresponds to a bin. All categorical values must be known at initialization: transform() doesn't know how to bin unknown categorical values. Note that transform() is only used on non-training data in the case of early stopping. Features with a small number of values may be binned into less than ``n_bins`` bins. The last bin (at index ``n_bins - 1``) is always reserved for missing values.", + "description": "Transformer that maps a dataset into integer-valued bins.\n\nFor continuous features, the bins are created in a feature-wise fashion,\nusing quantiles so that each bins contains approximately the same number\nof samples. For large datasets, quantiles are computed on a subset of the\ndata to speed-up the binning, but the quantiles should remain stable.\n\nFor categorical features, the raw categorical values are expected to be\nin [0, 254] (this is not validated here though) and each category\ncorresponds to a bin. All categorical values must be known at\ninitialization: transform() doesn't know how to bin unknown categorical\nvalues. Note that transform() is only used on non-training data in the\ncase of early stopping.\n\nFeatures with a small number of values may be binned into less than\n``n_bins`` bins. 
The last bin (at index ``n_bins - 1``) is always reserved\nfor missing values.", "docstring": "Transformer that maps a dataset into integer-valued bins.\n\n For continuous features, the bins are created in a feature-wise fashion,\n using quantiles so that each bins contains approximately the same number\n of samples. For large datasets, quantiles are computed on a subset of the\n data to speed-up the binning, but the quantiles should remain stable.\n\n For categorical features, the raw categorical values are expected to be\n in [0, 254] (this is not validated here though) and each category\n corresponds to a bin. All categorical values must be known at\n initialization: transform() doesn't know how to bin unknown categorical\n values. Note that transform() is only used on non-training data in the\n case of early stopping.\n\n Features with a small number of values may be binned into less than\n ``n_bins`` bins. The last bin (at index ``n_bins - 1``) is always reserved\n for missing values.\n\n Parameters\n ----------\n n_bins : int, default=256\n The maximum number of bins to use (including the bin for missing\n values). Should be in [3, 256]. Non-missing values are binned on\n ``max_bins = n_bins - 1`` bins. The last bin is always reserved for\n missing values. If for a given feature the number of unique values is\n less than ``max_bins``, then those unique values will be used to\n compute the bin thresholds, instead of the quantiles. For categorical\n features indicated by ``is_categorical``, the docstring for\n ``is_categorical`` details on this procedure.\n subsample : int or None, default=2e5\n If ``n_samples > subsample``, then ``sub_samples`` samples will be\n randomly chosen to compute the quantiles. If ``None``, the whole data\n is used.\n is_categorical : ndarray of bool of shape (n_features,), default=None\n Indicates categorical features. By default, all features are\n considered continuous.\n known_categories : list of {ndarray, None} of shape (n_features,), default=none\n For each categorical feature, the array indicates the set of unique\n categorical values. These should be the possible values over all the\n data, not just the training data. For continuous features, the\n corresponding entry should be None.\n random_state: int, RandomState instance or None, default=None\n Pseudo-random number generator to control the random sub-sampling.\n Pass an int for reproducible output across multiple\n function calls.\n See :term:`Glossary `.\n n_threads : int, default=None\n Number of OpenMP threads to use. `_openmp_effective_n_threads` is called\n to determine the effective number of threads use, which takes cgroups CPU\n quotes into account. See the docstring of `_openmp_effective_n_threads`\n for details.\n\n Attributes\n ----------\n bin_thresholds_ : list of ndarray\n For each feature, each array indicates how to map a feature into a\n binned feature. The semantic and size depends on the nature of the\n feature:\n - for real-valued features, the array corresponds to the real-valued\n bin thresholds (the upper bound of each bin). There are ``max_bins\n - 1`` thresholds, where ``max_bins = n_bins - 1`` is the number of\n bins used for non-missing values.\n - for categorical features, the array is a map from a binned category\n value to the raw category value. 
The size of the array is equal to\n ``min(max_bins, category_cardinality)`` where we ignore missing\n values in the cardinality.\n n_bins_non_missing_ : ndarray, dtype=np.uint32\n For each feature, gives the number of bins actually used for\n non-missing values. For features with a lot of unique values, this is\n equal to ``n_bins - 1``.\n is_categorical_ : ndarray of shape (n_features,), dtype=np.uint8\n Indicator for categorical features.\n missing_values_bin_idx_ : np.uint8\n The index of the bin where missing values are mapped. This is a\n constant across all features. This corresponds to the last bin, and\n it is always equal to ``n_bins - 1``. Note that if ``n_bins_missing_``\n is less than ``n_bins - 1`` for a given feature, then there are\n empty (and unused) bins.\n ", "source_code": "\n\nclass _BinMapper(TransformerMixin, BaseEstimator):\n \"\"\"Transformer that maps a dataset into integer-valued bins.\n\n For continuous features, the bins are created in a feature-wise fashion,\n using quantiles so that each bins contains approximately the same number\n of samples. For large datasets, quantiles are computed on a subset of the\n data to speed-up the binning, but the quantiles should remain stable.\n\n For categorical features, the raw categorical values are expected to be\n in [0, 254] (this is not validated here though) and each category\n corresponds to a bin. All categorical values must be known at\n initialization: transform() doesn't know how to bin unknown categorical\n values. Note that transform() is only used on non-training data in the\n case of early stopping.\n\n Features with a small number of values may be binned into less than\n ``n_bins`` bins. The last bin (at index ``n_bins - 1``) is always reserved\n for missing values.\n\n Parameters\n ----------\n n_bins : int, default=256\n The maximum number of bins to use (including the bin for missing\n values). Should be in [3, 256]. Non-missing values are binned on\n ``max_bins = n_bins - 1`` bins. The last bin is always reserved for\n missing values. If for a given feature the number of unique values is\n less than ``max_bins``, then those unique values will be used to\n compute the bin thresholds, instead of the quantiles. For categorical\n features indicated by ``is_categorical``, the docstring for\n ``is_categorical`` details on this procedure.\n subsample : int or None, default=2e5\n If ``n_samples > subsample``, then ``sub_samples`` samples will be\n randomly chosen to compute the quantiles. If ``None``, the whole data\n is used.\n is_categorical : ndarray of bool of shape (n_features,), default=None\n Indicates categorical features. By default, all features are\n considered continuous.\n known_categories : list of {ndarray, None} of shape (n_features,), default=none\n For each categorical feature, the array indicates the set of unique\n categorical values. These should be the possible values over all the\n data, not just the training data. For continuous features, the\n corresponding entry should be None.\n random_state: int, RandomState instance or None, default=None\n Pseudo-random number generator to control the random sub-sampling.\n Pass an int for reproducible output across multiple\n function calls.\n See :term:`Glossary `.\n n_threads : int, default=None\n Number of OpenMP threads to use. `_openmp_effective_n_threads` is called\n to determine the effective number of threads use, which takes cgroups CPU\n quotes into account. 
See the docstring of `_openmp_effective_n_threads`\n for details.\n\n Attributes\n ----------\n bin_thresholds_ : list of ndarray\n For each feature, each array indicates how to map a feature into a\n binned feature. The semantic and size depends on the nature of the\n feature:\n - for real-valued features, the array corresponds to the real-valued\n bin thresholds (the upper bound of each bin). There are ``max_bins\n - 1`` thresholds, where ``max_bins = n_bins - 1`` is the number of\n bins used for non-missing values.\n - for categorical features, the array is a map from a binned category\n value to the raw category value. The size of the array is equal to\n ``min(max_bins, category_cardinality)`` where we ignore missing\n values in the cardinality.\n n_bins_non_missing_ : ndarray, dtype=np.uint32\n For each feature, gives the number of bins actually used for\n non-missing values. For features with a lot of unique values, this is\n equal to ``n_bins - 1``.\n is_categorical_ : ndarray of shape (n_features,), dtype=np.uint8\n Indicator for categorical features.\n missing_values_bin_idx_ : np.uint8\n The index of the bin where missing values are mapped. This is a\n constant across all features. This corresponds to the last bin, and\n it is always equal to ``n_bins - 1``. Note that if ``n_bins_missing_``\n is less than ``n_bins - 1`` for a given feature, then there are\n empty (and unused) bins.\n \"\"\"\n \n def __init__(self, n_bins=256, subsample=int(200000.0), is_categorical=None, known_categories=None, random_state=None, n_threads=None):\n self.n_bins = n_bins\n self.subsample = subsample\n self.is_categorical = is_categorical\n self.known_categories = known_categories\n self.random_state = random_state\n self.n_threads = n_threads\n \n def fit(self, X, y=None):\n \"\"\"Fit data X by computing the binning thresholds.\n\n The last bin is reserved for missing values, whether missing values\n are present in the data or not.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The data to bin.\n y: None\n Ignored.\n\n Returns\n -------\n self : object\n \"\"\"\n if not 3 <= self.n_bins <= 256:\n raise ValueError('n_bins={} should be no smaller than 3 and no larger than 256.'.format(self.n_bins))\n X = check_array(X, dtype=[X_DTYPE], force_all_finite=False)\n max_bins = self.n_bins - 1\n rng = check_random_state(self.random_state)\n if self.subsample is not None and X.shape[0] > self.subsample:\n subset = rng.choice(X.shape[0], self.subsample, replace=False)\n X = X.take(subset, axis=0)\n if self.is_categorical is None:\n self.is_categorical_ = np.zeros(X.shape[1], dtype=np.uint8)\n else:\n self.is_categorical_ = np.asarray(self.is_categorical, dtype=np.uint8)\n n_features = X.shape[1]\n known_categories = self.known_categories\n if known_categories is None:\n known_categories = [None] * n_features\n for f_idx in range(n_features):\n is_categorical = self.is_categorical_[f_idx]\n known_cats = known_categories[f_idx]\n if is_categorical and known_cats is None:\n raise ValueError(f'Known categories for feature {f_idx} must be provided.')\n if not is_categorical and known_cats is not None:\n raise ValueError(f\"Feature {f_idx} isn't marked as a categorical feature, but categories were passed.\")\n self.missing_values_bin_idx_ = self.n_bins - 1\n self.bin_thresholds_ = []\n n_bins_non_missing = []\n for f_idx in range(n_features):\n if not self.is_categorical_[f_idx]:\n thresholds = _find_binning_thresholds(X[:, f_idx], max_bins)\n 
n_bins_non_missing.append(thresholds.shape[0] + 1)\n else:\n thresholds = known_categories[f_idx]\n n_bins_non_missing.append(thresholds.shape[0])\n self.bin_thresholds_.append(thresholds)\n self.n_bins_non_missing_ = np.array(n_bins_non_missing, dtype=np.uint32)\n return self\n \n def transform(self, X):\n \"\"\"Bin data X.\n\n Missing values will be mapped to the last bin.\n\n For categorical features, the mapping will be incorrect for unknown\n categories. Since the BinMapper is given known_categories of the\n entire training data (i.e. before the call to train_test_split() in\n case of early-stopping), this never happens.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The data to bin.\n\n Returns\n -------\n X_binned : array-like of shape (n_samples, n_features)\n The binned data (fortran-aligned).\n \"\"\"\n X = check_array(X, dtype=[X_DTYPE], force_all_finite=False)\n check_is_fitted(self)\n if X.shape[1] != self.n_bins_non_missing_.shape[0]:\n raise ValueError('This estimator was fitted with {} features but {} got passed to transform()'.format(self.n_bins_non_missing_.shape[0], X.shape[1]))\n n_threads = _openmp_effective_n_threads(self.n_threads)\n binned = np.zeros_like(X, dtype=X_BINNED_DTYPE, order='F')\n _map_to_bins(X, self.bin_thresholds_, self.missing_values_bin_idx_, n_threads, binned)\n return binned\n \n def make_known_categories_bitsets(self):\n \"\"\"Create bitsets of known categories.\n\n Returns\n -------\n - known_cat_bitsets : ndarray of shape (n_categorical_features, 8)\n Array of bitsets of known categories, for each categorical feature.\n - f_idx_map : ndarray of shape (n_features,)\n Map from original feature index to the corresponding index in the\n known_cat_bitsets array.\n \"\"\"\n categorical_features_indices = np.flatnonzero(self.is_categorical_)\n n_features = self.is_categorical_.size\n n_categorical_features = categorical_features_indices.size\n f_idx_map = np.zeros(n_features, dtype=np.uint32)\n f_idx_map[categorical_features_indices] = np.arange(n_categorical_features, dtype=np.uint32)\n known_categories = self.bin_thresholds_\n known_cat_bitsets = np.zeros((n_categorical_features, 8), dtype=X_BITSET_INNER_DTYPE)\n for (mapped_f_idx, f_idx) in enumerate(categorical_features_indices):\n for raw_cat_val in known_categories[f_idx]:\n set_bitset_memoryview(known_cat_bitsets[mapped_f_idx], raw_cat_val)\n return known_cat_bitsets, f_idx_map\n" }, @@ -21335,7 +21401,7 @@ "sklearn.ensemble._hist_gradient_boosting.gradient_boosting.HistGradientBoostingClassifier._get_loss" ], "is_public": true, - "description": "Histogram-based Gradient Boosting Classification Tree.\n\nThis estimator is much faster than :class:`GradientBoostingClassifier` for big datasets (n_samples >= 10 000). This estimator has native support for missing values (NaNs). During training, the tree grower learns at each split point whether samples with missing values should go to the left or right child, based on the potential gain. When predicting, samples with missing values are assigned to the left or right child consequently. If no missing values were encountered for a given feature during training, then samples with missing values are mapped to whichever child has the most samples. This implementation is inspired by `LightGBM `_. Read more in the :ref:`User Guide `. .. 
versionadded:: 0.21", + "description": "Histogram-based Gradient Boosting Classification Tree.\n\nThis estimator is much faster than\n:class:`GradientBoostingClassifier`\nfor big datasets (n_samples >= 10 000).\n\nThis estimator has native support for missing values (NaNs). During\ntraining, the tree grower learns at each split point whether samples\nwith missing values should go to the left or right child, based on the\npotential gain. When predicting, samples with missing values are\nassigned to the left or right child consequently. If no missing values\nwere encountered for a given feature during training, then samples with\nmissing values are mapped to whichever child has the most samples.\n\nThis implementation is inspired by\n`LightGBM `_.\n\nRead more in the :ref:`User Guide `.\n\n.. versionadded:: 0.21", "docstring": "Histogram-based Gradient Boosting Classification Tree.\n\n This estimator is much faster than\n :class:`GradientBoostingClassifier`\n for big datasets (n_samples >= 10 000).\n\n This estimator has native support for missing values (NaNs). During\n training, the tree grower learns at each split point whether samples\n with missing values should go to the left or right child, based on the\n potential gain. When predicting, samples with missing values are\n assigned to the left or right child consequently. If no missing values\n were encountered for a given feature during training, then samples with\n missing values are mapped to whichever child has the most samples.\n\n This implementation is inspired by\n `LightGBM `_.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.21\n\n Parameters\n ----------\n loss : {'auto', 'binary_crossentropy', 'categorical_crossentropy'}, default='auto'\n The loss function to use in the boosting process. 'binary_crossentropy'\n (also known as logistic loss) is used for binary classification and\n generalizes to 'categorical_crossentropy' for multiclass\n classification. 'auto' will automatically choose either loss depending\n on the nature of the problem.\n learning_rate : float, default=0.1\n The learning rate, also known as *shrinkage*. This is used as a\n multiplicative factor for the leaves values. Use ``1`` for no\n shrinkage.\n max_iter : int, default=100\n The maximum number of iterations of the boosting process, i.e. the\n maximum number of trees for binary classification. For multiclass\n classification, `n_classes` trees per iteration are built.\n max_leaf_nodes : int or None, default=31\n The maximum number of leaves for each tree. Must be strictly greater\n than 1. If None, there is no maximum limit.\n max_depth : int or None, default=None\n The maximum depth of each tree. The depth of a tree is the number of\n edges to go from the root to the deepest leaf.\n Depth isn't constrained by default.\n min_samples_leaf : int, default=20\n The minimum number of samples per leaf. For small datasets with less\n than a few hundred samples, it is recommended to lower this value\n since only very shallow trees would be built.\n l2_regularization : float, default=0\n The L2 regularization parameter. Use 0 for no regularization.\n max_bins : int, default=255\n The maximum number of bins to use for non-missing values. Before\n training, each feature of the input array `X` is binned into\n integer-valued bins, which allows for a much faster training stage.\n Features with a small number of unique values may use less than\n ``max_bins`` bins. 
In addition to the ``max_bins`` bins, one more bin\n is always reserved for missing values. Must be no larger than 255.\n categorical_features : array-like of {bool, int} of shape (n_features) or shape (n_categorical_features,), default=None\n Indicates the categorical features.\n\n - None : no feature will be considered categorical.\n - boolean array-like : boolean mask indicating categorical features.\n - integer array-like : integer indices indicating categorical\n features.\n\n For each categorical feature, there must be at most `max_bins` unique\n categories, and each categorical value must be in [0, max_bins -1].\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.24\n\n monotonic_cst : array-like of int of shape (n_features), default=None\n Indicates the monotonic constraint to enforce on each feature. -1, 1\n and 0 respectively correspond to a negative constraint, positive\n constraint and no constraint. Read more in the :ref:`User Guide\n `.\n\n .. versionadded:: 0.23\n\n warm_start : bool, default=False\n When set to ``True``, reuse the solution of the previous call to fit\n and add more estimators to the ensemble. For results to be valid, the\n estimator should be re-trained on the same data only.\n See :term:`the Glossary `.\n early_stopping : 'auto' or bool, default='auto'\n If 'auto', early stopping is enabled if the sample size is larger than\n 10000. If True, early stopping is enabled, otherwise early stopping is\n disabled.\n\n .. versionadded:: 0.23\n\n scoring : str or callable or None, default='loss'\n Scoring parameter to use for early stopping. It can be a single\n string (see :ref:`scoring_parameter`) or a callable (see\n :ref:`scoring`). If None, the estimator's default scorer\n is used. If ``scoring='loss'``, early stopping is checked\n w.r.t the loss value. Only used if early stopping is performed.\n validation_fraction : int or float or None, default=0.1\n Proportion (or absolute size) of training data to set aside as\n validation data for early stopping. If None, early stopping is done on\n the training data. Only used if early stopping is performed.\n n_iter_no_change : int, default=10\n Used to determine when to \"early stop\". The fitting process is\n stopped when none of the last ``n_iter_no_change`` scores are better\n than the ``n_iter_no_change - 1`` -th-to-last one, up to some\n tolerance. Only used if early stopping is performed.\n tol : float, default=1e-7\n The absolute tolerance to use when comparing scores. The higher the\n tolerance, the more likely we are to early stop: higher tolerance\n means that it will be harder for subsequent iterations to be\n considered an improvement upon the reference score.\n verbose : int, default=0\n The verbosity level. If not zero, print some information about the\n fitting process.\n random_state : int, RandomState instance or None, default=None\n Pseudo-random number generator to control the subsampling in the\n binning process, and the train/validation data split if early stopping\n is enabled.\n Pass an int for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n Attributes\n ----------\n classes_ : array, shape = (n_classes,)\n Class labels.\n do_early_stopping_ : bool\n Indicates whether early stopping is used during training.\n n_iter_ : int\n The number of iterations as selected by early stopping, depending on\n the `early_stopping` parameter. Otherwise it corresponds to max_iter.\n n_trees_per_iteration_ : int\n The number of tree that are built at each iteration. 
This is equal to 1\n for binary classification, and to ``n_classes`` for multiclass\n classification.\n train_score_ : ndarray, shape (n_iter_+1,)\n The scores at each iteration on the training data. The first entry\n is the score of the ensemble before the first iteration. Scores are\n computed according to the ``scoring`` parameter. If ``scoring`` is\n not 'loss', scores are computed on a subset of at most 10 000\n samples. Empty if no early stopping.\n validation_score_ : ndarray, shape (n_iter_+1,)\n The scores at each iteration on the held-out validation data. The\n first entry is the score of the ensemble before the first iteration.\n Scores are computed according to the ``scoring`` parameter. Empty if\n no early stopping or if ``validation_fraction`` is None.\n is_categorical_ : ndarray, shape (n_features, ) or None\n Boolean mask for the categorical features. ``None`` if there are no\n categorical features.\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n GradientBoostingClassifier : Exact gradient boosting method that does not\n scale as good on datasets with a large number of samples.\n sklearn.tree.DecisionTreeClassifier : A decision tree classifier.\n RandomForestClassifier : A meta-estimator that fits a number of decision\n tree classifiers on various sub-samples of the dataset and uses\n averaging to improve the predictive accuracy and control over-fitting.\n AdaBoostClassifier : A meta-estimator that begins by fitting a classifier\n on the original dataset and then fits additional copies of the\n classifier on the same dataset where the weights of incorrectly\n classified instances are adjusted such that subsequent classifiers\n focus more on difficult cases.\n\n Examples\n --------\n >>> from sklearn.ensemble import HistGradientBoostingClassifier\n >>> from sklearn.datasets import load_iris\n >>> X, y = load_iris(return_X_y=True)\n >>> clf = HistGradientBoostingClassifier().fit(X, y)\n >>> clf.score(X, y)\n 1.0\n ", "source_code": "\n\nclass HistGradientBoostingClassifier(ClassifierMixin, BaseHistGradientBoosting):\n \"\"\"Histogram-based Gradient Boosting Classification Tree.\n\n This estimator is much faster than\n :class:`GradientBoostingClassifier`\n for big datasets (n_samples >= 10 000).\n\n This estimator has native support for missing values (NaNs). During\n training, the tree grower learns at each split point whether samples\n with missing values should go to the left or right child, based on the\n potential gain. When predicting, samples with missing values are\n assigned to the left or right child consequently. If no missing values\n were encountered for a given feature during training, then samples with\n missing values are mapped to whichever child has the most samples.\n\n This implementation is inspired by\n `LightGBM `_.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.21\n\n Parameters\n ----------\n loss : {'auto', 'binary_crossentropy', 'categorical_crossentropy'}, default='auto'\n The loss function to use in the boosting process. 'binary_crossentropy'\n (also known as logistic loss) is used for binary classification and\n generalizes to 'categorical_crossentropy' for multiclass\n classification. 
'auto' will automatically choose either loss depending\n on the nature of the problem.\n learning_rate : float, default=0.1\n The learning rate, also known as *shrinkage*. This is used as a\n multiplicative factor for the leaves values. Use ``1`` for no\n shrinkage.\n max_iter : int, default=100\n The maximum number of iterations of the boosting process, i.e. the\n maximum number of trees for binary classification. For multiclass\n classification, `n_classes` trees per iteration are built.\n max_leaf_nodes : int or None, default=31\n The maximum number of leaves for each tree. Must be strictly greater\n than 1. If None, there is no maximum limit.\n max_depth : int or None, default=None\n The maximum depth of each tree. The depth of a tree is the number of\n edges to go from the root to the deepest leaf.\n Depth isn't constrained by default.\n min_samples_leaf : int, default=20\n The minimum number of samples per leaf. For small datasets with less\n than a few hundred samples, it is recommended to lower this value\n since only very shallow trees would be built.\n l2_regularization : float, default=0\n The L2 regularization parameter. Use 0 for no regularization.\n max_bins : int, default=255\n The maximum number of bins to use for non-missing values. Before\n training, each feature of the input array `X` is binned into\n integer-valued bins, which allows for a much faster training stage.\n Features with a small number of unique values may use less than\n ``max_bins`` bins. In addition to the ``max_bins`` bins, one more bin\n is always reserved for missing values. Must be no larger than 255.\n categorical_features : array-like of {bool, int} of shape (n_features) or shape (n_categorical_features,), default=None\n Indicates the categorical features.\n\n - None : no feature will be considered categorical.\n - boolean array-like : boolean mask indicating categorical features.\n - integer array-like : integer indices indicating categorical\n features.\n\n For each categorical feature, there must be at most `max_bins` unique\n categories, and each categorical value must be in [0, max_bins -1].\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.24\n\n monotonic_cst : array-like of int of shape (n_features), default=None\n Indicates the monotonic constraint to enforce on each feature. -1, 1\n and 0 respectively correspond to a negative constraint, positive\n constraint and no constraint. Read more in the :ref:`User Guide\n `.\n\n .. versionadded:: 0.23\n\n warm_start : bool, default=False\n When set to ``True``, reuse the solution of the previous call to fit\n and add more estimators to the ensemble. For results to be valid, the\n estimator should be re-trained on the same data only.\n See :term:`the Glossary `.\n early_stopping : 'auto' or bool, default='auto'\n If 'auto', early stopping is enabled if the sample size is larger than\n 10000. If True, early stopping is enabled, otherwise early stopping is\n disabled.\n\n .. versionadded:: 0.23\n\n scoring : str or callable or None, default='loss'\n Scoring parameter to use for early stopping. It can be a single\n string (see :ref:`scoring_parameter`) or a callable (see\n :ref:`scoring`). If None, the estimator's default scorer\n is used. If ``scoring='loss'``, early stopping is checked\n w.r.t the loss value. Only used if early stopping is performed.\n validation_fraction : int or float or None, default=0.1\n Proportion (or absolute size) of training data to set aside as\n validation data for early stopping. 
If None, early stopping is done on\n the training data. Only used if early stopping is performed.\n n_iter_no_change : int, default=10\n Used to determine when to \"early stop\". The fitting process is\n stopped when none of the last ``n_iter_no_change`` scores are better\n than the ``n_iter_no_change - 1`` -th-to-last one, up to some\n tolerance. Only used if early stopping is performed.\n tol : float, default=1e-7\n The absolute tolerance to use when comparing scores. The higher the\n tolerance, the more likely we are to early stop: higher tolerance\n means that it will be harder for subsequent iterations to be\n considered an improvement upon the reference score.\n verbose : int, default=0\n The verbosity level. If not zero, print some information about the\n fitting process.\n random_state : int, RandomState instance or None, default=None\n Pseudo-random number generator to control the subsampling in the\n binning process, and the train/validation data split if early stopping\n is enabled.\n Pass an int for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n Attributes\n ----------\n classes_ : array, shape = (n_classes,)\n Class labels.\n do_early_stopping_ : bool\n Indicates whether early stopping is used during training.\n n_iter_ : int\n The number of iterations as selected by early stopping, depending on\n the `early_stopping` parameter. Otherwise it corresponds to max_iter.\n n_trees_per_iteration_ : int\n The number of tree that are built at each iteration. This is equal to 1\n for binary classification, and to ``n_classes`` for multiclass\n classification.\n train_score_ : ndarray, shape (n_iter_+1,)\n The scores at each iteration on the training data. The first entry\n is the score of the ensemble before the first iteration. Scores are\n computed according to the ``scoring`` parameter. If ``scoring`` is\n not 'loss', scores are computed on a subset of at most 10 000\n samples. Empty if no early stopping.\n validation_score_ : ndarray, shape (n_iter_+1,)\n The scores at each iteration on the held-out validation data. The\n first entry is the score of the ensemble before the first iteration.\n Scores are computed according to the ``scoring`` parameter. Empty if\n no early stopping or if ``validation_fraction`` is None.\n is_categorical_ : ndarray, shape (n_features, ) or None\n Boolean mask for the categorical features. ``None`` if there are no\n categorical features.\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. 
versionadded:: 1.0\n\n See Also\n --------\n GradientBoostingClassifier : Exact gradient boosting method that does not\n scale as good on datasets with a large number of samples.\n sklearn.tree.DecisionTreeClassifier : A decision tree classifier.\n RandomForestClassifier : A meta-estimator that fits a number of decision\n tree classifiers on various sub-samples of the dataset and uses\n averaging to improve the predictive accuracy and control over-fitting.\n AdaBoostClassifier : A meta-estimator that begins by fitting a classifier\n on the original dataset and then fits additional copies of the\n classifier on the same dataset where the weights of incorrectly\n classified instances are adjusted such that subsequent classifiers\n focus more on difficult cases.\n\n Examples\n --------\n >>> from sklearn.ensemble import HistGradientBoostingClassifier\n >>> from sklearn.datasets import load_iris\n >>> X, y = load_iris(return_X_y=True)\n >>> clf = HistGradientBoostingClassifier().fit(X, y)\n >>> clf.score(X, y)\n 1.0\n \"\"\"\n _VALID_LOSSES = ('binary_crossentropy', 'categorical_crossentropy', 'auto')\n \n def __init__(self, loss='auto', *, learning_rate=0.1, max_iter=100, max_leaf_nodes=31, max_depth=None, min_samples_leaf=20, l2_regularization=0.0, max_bins=255, categorical_features=None, monotonic_cst=None, warm_start=False, early_stopping='auto', scoring='loss', validation_fraction=0.1, n_iter_no_change=10, tol=1e-07, verbose=0, random_state=None):\n super(HistGradientBoostingClassifier, self).__init__(loss=loss, learning_rate=learning_rate, max_iter=max_iter, max_leaf_nodes=max_leaf_nodes, max_depth=max_depth, min_samples_leaf=min_samples_leaf, l2_regularization=l2_regularization, max_bins=max_bins, categorical_features=categorical_features, monotonic_cst=monotonic_cst, warm_start=warm_start, early_stopping=early_stopping, scoring=scoring, validation_fraction=validation_fraction, n_iter_no_change=n_iter_no_change, tol=tol, verbose=verbose, random_state=random_state)\n \n def predict(self, X):\n \"\"\"Predict classes for X.\n\n Parameters\n ----------\n X : array-like, shape (n_samples, n_features)\n The input samples.\n\n Returns\n -------\n y : ndarray, shape (n_samples,)\n The predicted classes.\n \"\"\"\n encoded_classes = np.argmax(self.predict_proba(X), axis=1)\n return self.classes_[encoded_classes]\n \n def staged_predict(self, X):\n \"\"\"Predict classes at each iteration.\n\n This method allows monitoring (i.e. determine error on testing set)\n after each stage.\n\n .. versionadded:: 0.24\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The input samples.\n\n Yields\n -------\n y : generator of ndarray of shape (n_samples,)\n The predicted classes of the input samples, for each iteration.\n \"\"\"\n for proba in self.staged_predict_proba(X):\n encoded_classes = np.argmax(proba, axis=1)\n yield self.classes_.take(encoded_classes, axis=0)\n \n def predict_proba(self, X):\n \"\"\"Predict class probabilities for X.\n\n Parameters\n ----------\n X : array-like, shape (n_samples, n_features)\n The input samples.\n\n Returns\n -------\n p : ndarray, shape (n_samples, n_classes)\n The class probabilities of the input samples.\n \"\"\"\n raw_predictions = self._raw_predict(X)\n return self._loss.predict_proba(raw_predictions)\n \n def staged_predict_proba(self, X):\n \"\"\"Predict class probabilities at each iteration.\n\n This method allows monitoring (i.e. 
determine error on testing set)\n after each stage.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The input samples.\n\n Yields\n -------\n y : generator of ndarray of shape (n_samples,)\n The predicted class probabilities of the input samples,\n for each iteration.\n \"\"\"\n for raw_predictions in self._staged_raw_predict(X):\n yield self._loss.predict_proba(raw_predictions)\n \n def decision_function(self, X):\n \"\"\"Compute the decision function of ``X``.\n\n Parameters\n ----------\n X : array-like, shape (n_samples, n_features)\n The input samples.\n\n Returns\n -------\n decision : ndarray, shape (n_samples,) or (n_samples, n_trees_per_iteration)\n The raw predicted values (i.e. the sum of the trees leaves) for\n each sample. n_trees_per_iteration is equal to the number of\n classes in multiclass classification.\n \"\"\"\n decision = self._raw_predict(X)\n if decision.shape[0] == 1:\n decision = decision.ravel()\n return decision.T\n \n def staged_decision_function(self, X):\n \"\"\"Compute decision function of ``X`` for each iteration.\n\n This method allows monitoring (i.e. determine error on testing set)\n after each stage.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The input samples.\n\n Yields\n -------\n decision : generator of ndarray of shape (n_samples,) or (n_samples, n_trees_per_iteration)\n The decision function of the input samples, which corresponds to\n the raw values predicted from the trees of the ensemble . The\n classes corresponds to that in the attribute :term:`classes_`.\n \"\"\"\n for staged_decision in self._staged_raw_predict(X):\n if staged_decision.shape[0] == 1:\n staged_decision = staged_decision.ravel()\n yield staged_decision.T\n \n def _encode_y(self, y):\n check_classification_targets(y)\n label_encoder = LabelEncoder()\n encoded_y = label_encoder.fit_transform(y)\n self.classes_ = label_encoder.classes_\n n_classes = self.classes_.shape[0]\n self.n_trees_per_iteration_ = 1 if n_classes <= 2 else n_classes\n encoded_y = encoded_y.astype(Y_DTYPE, copy=False)\n return encoded_y\n \n def _get_loss(self, sample_weight, n_threads):\n if self.loss == 'categorical_crossentropy' and self.n_trees_per_iteration_ == 1:\n raise ValueError(\"'categorical_crossentropy' is not suitable for a binary classification problem. Please use 'auto' or 'binary_crossentropy' instead.\")\n if self.loss == 'auto':\n if self.n_trees_per_iteration_ == 1:\n return _LOSSES['binary_crossentropy'](sample_weight=sample_weight, n_threads=n_threads)\n else:\n return _LOSSES['categorical_crossentropy'](sample_weight=sample_weight, n_threads=n_threads)\n return _LOSSES[self.loss](sample_weight=sample_weight, n_threads=n_threads)\n" }, @@ -21352,7 +21418,7 @@ "sklearn.ensemble._hist_gradient_boosting.gradient_boosting.HistGradientBoostingRegressor._get_loss" ], "is_public": true, - "description": "Histogram-based Gradient Boosting Regression Tree.\n\nThis estimator is much faster than :class:`GradientBoostingRegressor` for big datasets (n_samples >= 10 000). This estimator has native support for missing values (NaNs). During training, the tree grower learns at each split point whether samples with missing values should go to the left or right child, based on the potential gain. When predicting, samples with missing values are assigned to the left or right child consequently. 
If no missing values were encountered for a given feature during training, then samples with missing values are mapped to whichever child has the most samples. This implementation is inspired by `LightGBM `_. Read more in the :ref:`User Guide `. .. versionadded:: 0.21", + "description": "Histogram-based Gradient Boosting Regression Tree.\n\nThis estimator is much faster than\n:class:`GradientBoostingRegressor`\nfor big datasets (n_samples >= 10 000).\n\nThis estimator has native support for missing values (NaNs). During\ntraining, the tree grower learns at each split point whether samples\nwith missing values should go to the left or right child, based on the\npotential gain. When predicting, samples with missing values are\nassigned to the left or right child consequently. If no missing values\nwere encountered for a given feature during training, then samples with\nmissing values are mapped to whichever child has the most samples.\n\nThis implementation is inspired by\n`LightGBM `_.\n\nRead more in the :ref:`User Guide `.\n\n.. versionadded:: 0.21", "docstring": "Histogram-based Gradient Boosting Regression Tree.\n\n This estimator is much faster than\n :class:`GradientBoostingRegressor`\n for big datasets (n_samples >= 10 000).\n\n This estimator has native support for missing values (NaNs). During\n training, the tree grower learns at each split point whether samples\n with missing values should go to the left or right child, based on the\n potential gain. When predicting, samples with missing values are\n assigned to the left or right child consequently. If no missing values\n were encountered for a given feature during training, then samples with\n missing values are mapped to whichever child has the most samples.\n\n This implementation is inspired by\n `LightGBM `_.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.21\n\n Parameters\n ----------\n loss : {'squared_error', 'absolute_error', 'poisson'}, default='squared_error'\n The loss function to use in the boosting process. Note that the\n \"squared error\" and \"poisson\" losses actually implement\n \"half least squares loss\" and \"half poisson deviance\" to simplify the\n computation of the gradient. Furthermore, \"poisson\" loss internally\n uses a log-link and requires ``y >= 0``.\n\n .. versionchanged:: 0.23\n Added option 'poisson'.\n\n .. deprecated:: 1.0\n The loss 'least_squares' was deprecated in v1.0 and will be removed\n in version 1.2. Use `loss='squared_error'` which is equivalent.\n\n .. deprecated:: 1.0\n The loss 'least_absolute_deviation' was deprecated in v1.0 and will\n be removed in version 1.2. Use `loss='absolute_error'` which is\n equivalent.\n\n learning_rate : float, default=0.1\n The learning rate, also known as *shrinkage*. This is used as a\n multiplicative factor for the leaves values. Use ``1`` for no\n shrinkage.\n max_iter : int, default=100\n The maximum number of iterations of the boosting process, i.e. the\n maximum number of trees.\n max_leaf_nodes : int or None, default=31\n The maximum number of leaves for each tree. Must be strictly greater\n than 1. If None, there is no maximum limit.\n max_depth : int or None, default=None\n The maximum depth of each tree. The depth of a tree is the number of\n edges to go from the root to the deepest leaf.\n Depth isn't constrained by default.\n min_samples_leaf : int, default=20\n The minimum number of samples per leaf. 
For small datasets with less\n than a few hundred samples, it is recommended to lower this value\n since only very shallow trees would be built.\n l2_regularization : float, default=0\n The L2 regularization parameter. Use ``0`` for no regularization\n (default).\n max_bins : int, default=255\n The maximum number of bins to use for non-missing values. Before\n training, each feature of the input array `X` is binned into\n integer-valued bins, which allows for a much faster training stage.\n Features with a small number of unique values may use less than\n ``max_bins`` bins. In addition to the ``max_bins`` bins, one more bin\n is always reserved for missing values. Must be no larger than 255.\n categorical_features : array-like of {bool, int} of shape (n_features) or shape (n_categorical_features,), default=None\n Indicates the categorical features.\n\n - None : no feature will be considered categorical.\n - boolean array-like : boolean mask indicating categorical features.\n - integer array-like : integer indices indicating categorical\n features.\n\n For each categorical feature, there must be at most `max_bins` unique\n categories, and each categorical value must be in [0, max_bins -1].\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.24\n\n monotonic_cst : array-like of int of shape (n_features), default=None\n Indicates the monotonic constraint to enforce on each feature. -1, 1\n and 0 respectively correspond to a negative constraint, positive\n constraint and no constraint. Read more in the :ref:`User Guide\n `.\n\n .. versionadded:: 0.23\n\n warm_start : bool, default=False\n When set to ``True``, reuse the solution of the previous call to fit\n and add more estimators to the ensemble. For results to be valid, the\n estimator should be re-trained on the same data only.\n See :term:`the Glossary `.\n early_stopping : 'auto' or bool, default='auto'\n If 'auto', early stopping is enabled if the sample size is larger than\n 10000. If True, early stopping is enabled, otherwise early stopping is\n disabled.\n\n .. versionadded:: 0.23\n\n scoring : str or callable or None, default='loss'\n Scoring parameter to use for early stopping. It can be a single\n string (see :ref:`scoring_parameter`) or a callable (see\n :ref:`scoring`). If None, the estimator's default scorer is used. If\n ``scoring='loss'``, early stopping is checked w.r.t the loss value.\n Only used if early stopping is performed.\n validation_fraction : int or float or None, default=0.1\n Proportion (or absolute size) of training data to set aside as\n validation data for early stopping. If None, early stopping is done on\n the training data. Only used if early stopping is performed.\n n_iter_no_change : int, default=10\n Used to determine when to \"early stop\". The fitting process is\n stopped when none of the last ``n_iter_no_change`` scores are better\n than the ``n_iter_no_change - 1`` -th-to-last one, up to some\n tolerance. Only used if early stopping is performed.\n tol : float, default=1e-7\n The absolute tolerance to use when comparing scores during early\n stopping. The higher the tolerance, the more likely we are to early\n stop: higher tolerance means that it will be harder for subsequent\n iterations to be considered an improvement upon the reference score.\n verbose : int, default=0\n The verbosity level. 
If not zero, print some information about the\n fitting process.\n random_state : int, RandomState instance or None, default=None\n Pseudo-random number generator to control the subsampling in the\n binning process, and the train/validation data split if early stopping\n is enabled.\n Pass an int for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n Attributes\n ----------\n do_early_stopping_ : bool\n Indicates whether early stopping is used during training.\n n_iter_ : int\n The number of iterations as selected by early stopping, depending on\n the `early_stopping` parameter. Otherwise it corresponds to max_iter.\n n_trees_per_iteration_ : int\n The number of tree that are built at each iteration. For regressors,\n this is always 1.\n train_score_ : ndarray, shape (n_iter_+1,)\n The scores at each iteration on the training data. The first entry\n is the score of the ensemble before the first iteration. Scores are\n computed according to the ``scoring`` parameter. If ``scoring`` is\n not 'loss', scores are computed on a subset of at most 10 000\n samples. Empty if no early stopping.\n validation_score_ : ndarray, shape (n_iter_+1,)\n The scores at each iteration on the held-out validation data. The\n first entry is the score of the ensemble before the first iteration.\n Scores are computed according to the ``scoring`` parameter. Empty if\n no early stopping or if ``validation_fraction`` is None.\n is_categorical_ : ndarray, shape (n_features, ) or None\n Boolean mask for the categorical features. ``None`` if there are no\n categorical features.\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n GradientBoostingRegressor : Exact gradient boosting method that does not\n scale as good on datasets with a large number of samples.\n sklearn.tree.DecisionTreeRegressor : A decision tree regressor.\n RandomForestRegressor : A meta-estimator that fits a number of decision\n tree regressors on various sub-samples of the dataset and uses\n averaging to improve the statistical performance and control\n over-fitting.\n AdaBoostRegressor : A meta-estimator that begins by fitting a regressor\n on the original dataset and then fits additional copies of the\n regressor on the same dataset but where the weights of instances are\n adjusted according to the error of the current prediction. As such,\n subsequent regressors focus more on difficult cases.\n\n Examples\n --------\n >>> from sklearn.ensemble import HistGradientBoostingRegressor\n >>> from sklearn.datasets import load_diabetes\n >>> X, y = load_diabetes(return_X_y=True)\n >>> est = HistGradientBoostingRegressor().fit(X, y)\n >>> est.score(X, y)\n 0.92...\n ", "source_code": "\n\nclass HistGradientBoostingRegressor(RegressorMixin, BaseHistGradientBoosting):\n \"\"\"Histogram-based Gradient Boosting Regression Tree.\n\n This estimator is much faster than\n :class:`GradientBoostingRegressor`\n for big datasets (n_samples >= 10 000).\n\n This estimator has native support for missing values (NaNs). During\n training, the tree grower learns at each split point whether samples\n with missing values should go to the left or right child, based on the\n potential gain. When predicting, samples with missing values are\n assigned to the left or right child consequently. 
If no missing values\n were encountered for a given feature during training, then samples with\n missing values are mapped to whichever child has the most samples.\n\n This implementation is inspired by\n `LightGBM `_.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.21\n\n Parameters\n ----------\n loss : {'squared_error', 'absolute_error', 'poisson'}, default='squared_error'\n The loss function to use in the boosting process. Note that the\n \"squared error\" and \"poisson\" losses actually implement\n \"half least squares loss\" and \"half poisson deviance\" to simplify the\n computation of the gradient. Furthermore, \"poisson\" loss internally\n uses a log-link and requires ``y >= 0``.\n\n .. versionchanged:: 0.23\n Added option 'poisson'.\n\n .. deprecated:: 1.0\n The loss 'least_squares' was deprecated in v1.0 and will be removed\n in version 1.2. Use `loss='squared_error'` which is equivalent.\n\n .. deprecated:: 1.0\n The loss 'least_absolute_deviation' was deprecated in v1.0 and will\n be removed in version 1.2. Use `loss='absolute_error'` which is\n equivalent.\n\n learning_rate : float, default=0.1\n The learning rate, also known as *shrinkage*. This is used as a\n multiplicative factor for the leaves values. Use ``1`` for no\n shrinkage.\n max_iter : int, default=100\n The maximum number of iterations of the boosting process, i.e. the\n maximum number of trees.\n max_leaf_nodes : int or None, default=31\n The maximum number of leaves for each tree. Must be strictly greater\n than 1. If None, there is no maximum limit.\n max_depth : int or None, default=None\n The maximum depth of each tree. The depth of a tree is the number of\n edges to go from the root to the deepest leaf.\n Depth isn't constrained by default.\n min_samples_leaf : int, default=20\n The minimum number of samples per leaf. For small datasets with less\n than a few hundred samples, it is recommended to lower this value\n since only very shallow trees would be built.\n l2_regularization : float, default=0\n The L2 regularization parameter. Use ``0`` for no regularization\n (default).\n max_bins : int, default=255\n The maximum number of bins to use for non-missing values. Before\n training, each feature of the input array `X` is binned into\n integer-valued bins, which allows for a much faster training stage.\n Features with a small number of unique values may use less than\n ``max_bins`` bins. In addition to the ``max_bins`` bins, one more bin\n is always reserved for missing values. Must be no larger than 255.\n categorical_features : array-like of {bool, int} of shape (n_features) or shape (n_categorical_features,), default=None\n Indicates the categorical features.\n\n - None : no feature will be considered categorical.\n - boolean array-like : boolean mask indicating categorical features.\n - integer array-like : integer indices indicating categorical\n features.\n\n For each categorical feature, there must be at most `max_bins` unique\n categories, and each categorical value must be in [0, max_bins -1].\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.24\n\n monotonic_cst : array-like of int of shape (n_features), default=None\n Indicates the monotonic constraint to enforce on each feature. -1, 1\n and 0 respectively correspond to a negative constraint, positive\n constraint and no constraint. Read more in the :ref:`User Guide\n `.\n\n .. 
versionadded:: 0.23\n\n warm_start : bool, default=False\n When set to ``True``, reuse the solution of the previous call to fit\n and add more estimators to the ensemble. For results to be valid, the\n estimator should be re-trained on the same data only.\n See :term:`the Glossary `.\n early_stopping : 'auto' or bool, default='auto'\n If 'auto', early stopping is enabled if the sample size is larger than\n 10000. If True, early stopping is enabled, otherwise early stopping is\n disabled.\n\n .. versionadded:: 0.23\n\n scoring : str or callable or None, default='loss'\n Scoring parameter to use for early stopping. It can be a single\n string (see :ref:`scoring_parameter`) or a callable (see\n :ref:`scoring`). If None, the estimator's default scorer is used. If\n ``scoring='loss'``, early stopping is checked w.r.t the loss value.\n Only used if early stopping is performed.\n validation_fraction : int or float or None, default=0.1\n Proportion (or absolute size) of training data to set aside as\n validation data for early stopping. If None, early stopping is done on\n the training data. Only used if early stopping is performed.\n n_iter_no_change : int, default=10\n Used to determine when to \"early stop\". The fitting process is\n stopped when none of the last ``n_iter_no_change`` scores are better\n than the ``n_iter_no_change - 1`` -th-to-last one, up to some\n tolerance. Only used if early stopping is performed.\n tol : float, default=1e-7\n The absolute tolerance to use when comparing scores during early\n stopping. The higher the tolerance, the more likely we are to early\n stop: higher tolerance means that it will be harder for subsequent\n iterations to be considered an improvement upon the reference score.\n verbose : int, default=0\n The verbosity level. If not zero, print some information about the\n fitting process.\n random_state : int, RandomState instance or None, default=None\n Pseudo-random number generator to control the subsampling in the\n binning process, and the train/validation data split if early stopping\n is enabled.\n Pass an int for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n Attributes\n ----------\n do_early_stopping_ : bool\n Indicates whether early stopping is used during training.\n n_iter_ : int\n The number of iterations as selected by early stopping, depending on\n the `early_stopping` parameter. Otherwise it corresponds to max_iter.\n n_trees_per_iteration_ : int\n The number of tree that are built at each iteration. For regressors,\n this is always 1.\n train_score_ : ndarray, shape (n_iter_+1,)\n The scores at each iteration on the training data. The first entry\n is the score of the ensemble before the first iteration. Scores are\n computed according to the ``scoring`` parameter. If ``scoring`` is\n not 'loss', scores are computed on a subset of at most 10 000\n samples. Empty if no early stopping.\n validation_score_ : ndarray, shape (n_iter_+1,)\n The scores at each iteration on the held-out validation data. The\n first entry is the score of the ensemble before the first iteration.\n Scores are computed according to the ``scoring`` parameter. Empty if\n no early stopping or if ``validation_fraction`` is None.\n is_categorical_ : ndarray, shape (n_features, ) or None\n Boolean mask for the categorical features. ``None`` if there are no\n categorical features.\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. 
versionadded:: 0.24\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n GradientBoostingRegressor : Exact gradient boosting method that does not\n scale as good on datasets with a large number of samples.\n sklearn.tree.DecisionTreeRegressor : A decision tree regressor.\n RandomForestRegressor : A meta-estimator that fits a number of decision\n tree regressors on various sub-samples of the dataset and uses\n averaging to improve the statistical performance and control\n over-fitting.\n AdaBoostRegressor : A meta-estimator that begins by fitting a regressor\n on the original dataset and then fits additional copies of the\n regressor on the same dataset but where the weights of instances are\n adjusted according to the error of the current prediction. As such,\n subsequent regressors focus more on difficult cases.\n\n Examples\n --------\n >>> from sklearn.ensemble import HistGradientBoostingRegressor\n >>> from sklearn.datasets import load_diabetes\n >>> X, y = load_diabetes(return_X_y=True)\n >>> est = HistGradientBoostingRegressor().fit(X, y)\n >>> est.score(X, y)\n 0.92...\n \"\"\"\n _VALID_LOSSES = ('squared_error', 'least_squares', 'absolute_error', 'least_absolute_deviation', 'poisson')\n \n def __init__(self, loss='squared_error', *, learning_rate=0.1, max_iter=100, max_leaf_nodes=31, max_depth=None, min_samples_leaf=20, l2_regularization=0.0, max_bins=255, categorical_features=None, monotonic_cst=None, warm_start=False, early_stopping='auto', scoring='loss', validation_fraction=0.1, n_iter_no_change=10, tol=1e-07, verbose=0, random_state=None):\n super(HistGradientBoostingRegressor, self).__init__(loss=loss, learning_rate=learning_rate, max_iter=max_iter, max_leaf_nodes=max_leaf_nodes, max_depth=max_depth, min_samples_leaf=min_samples_leaf, l2_regularization=l2_regularization, max_bins=max_bins, monotonic_cst=monotonic_cst, categorical_features=categorical_features, early_stopping=early_stopping, warm_start=warm_start, scoring=scoring, validation_fraction=validation_fraction, n_iter_no_change=n_iter_no_change, tol=tol, verbose=verbose, random_state=random_state)\n \n def predict(self, X):\n \"\"\"Predict values for X.\n\n Parameters\n ----------\n X : array-like, shape (n_samples, n_features)\n The input samples.\n\n Returns\n -------\n y : ndarray, shape (n_samples,)\n The predicted values.\n \"\"\"\n check_is_fitted(self)\n return self._loss.inverse_link_function(self._raw_predict(X).ravel())\n \n def staged_predict(self, X):\n \"\"\"Predict regression target for each iteration.\n\n This method allows monitoring (i.e. determine error on testing set)\n after each stage.\n\n .. 
versionadded:: 0.24\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The input samples.\n\n Yields\n -------\n y : generator of ndarray of shape (n_samples,)\n The predicted values of the input samples, for each iteration.\n \"\"\"\n for raw_predictions in self._staged_raw_predict(X):\n yield self._loss.inverse_link_function(raw_predictions.ravel())\n \n def _encode_y(self, y):\n self.n_trees_per_iteration_ = 1\n y = y.astype(Y_DTYPE, copy=False)\n if self.loss == 'poisson':\n if not (np.all(y >= 0) and np.sum(y) > 0):\n raise ValueError(\"loss='poisson' requires non-negative y and sum(y) > 0.\")\n return y\n \n def _get_loss(self, sample_weight, n_threads):\n if self.loss == 'least_squares':\n warnings.warn(\"The loss 'least_squares' was deprecated in v1.0 and will be removed in version 1.2. Use 'squared_error' which is equivalent.\", FutureWarning)\n return _LOSSES['squared_error'](sample_weight=sample_weight, n_threads=n_threads)\n elif self.loss == 'least_absolute_deviation':\n warnings.warn(\"The loss 'least_absolute_deviation' was deprecated in v1.0 and will be removed in version 1.2. Use 'absolute_error' which is equivalent.\", FutureWarning)\n return _LOSSES['absolute_error'](sample_weight=sample_weight, n_threads=n_threads)\n return _LOSSES[self.loss](sample_weight=sample_weight, n_threads=n_threads)\n" }, @@ -21374,7 +21440,7 @@ "sklearn.ensemble._hist_gradient_boosting.grower.TreeGrower.make_predictor" ], "is_public": false, - "description": "Tree grower class used to build a tree.\n\nThe tree is fitted to predict the values of a Newton-Raphson step. The splits are considered in a best-first fashion, and the quality of a split is defined in splitting._split_gain.", + "description": "Tree grower class used to build a tree.\n\nThe tree is fitted to predict the values of a Newton-Raphson step. The\nsplits are considered in a best-first fashion, and the quality of a\nsplit is defined in splitting._split_gain.", "docstring": "Tree grower class used to build a tree.\n\n The tree is fitted to predict the values of a Newton-Raphson step. The\n splits are considered in a best-first fashion, and the quality of a\n split is defined in splitting._split_gain.\n\n Parameters\n ----------\n X_binned : ndarray of shape (n_samples, n_features), dtype=np.uint8\n The binned input samples. Must be Fortran-aligned.\n gradients : ndarray of shape (n_samples,)\n The gradients of each training sample. Those are the gradients of the\n loss w.r.t the predictions, evaluated at iteration ``i - 1``.\n hessians : ndarray of shape (n_samples,)\n The hessians of each training sample. Those are the hessians of the\n loss w.r.t the predictions, evaluated at iteration ``i - 1``.\n max_leaf_nodes : int, default=None\n The maximum number of leaves for each tree. If None, there is no\n maximum limit.\n max_depth : int, default=None\n The maximum depth of each tree. The depth of a tree is the number of\n edges to go from the root to the deepest leaf.\n Depth isn't constrained by default.\n min_samples_leaf : int, default=20\n The minimum number of samples per leaf.\n min_gain_to_split : float, default=0.\n The minimum gain needed to split a node. Splits with lower gain will\n be ignored.\n n_bins : int, default=256\n The total number of bins, including the bin for missing values. Used\n to define the shape of the histograms.\n n_bins_non_missing : ndarray, dtype=np.uint32, default=None\n For each feature, gives the number of bins actually used for\n non-missing values. 
For features with a lot of unique values, this\n is equal to ``n_bins - 1``. If it's an int, all features are\n considered to have the same number of bins. If None, all features\n are considered to have ``n_bins - 1`` bins.\n has_missing_values : bool or ndarray, dtype=bool, default=False\n Whether each feature contains missing values (in the training data).\n If it's a bool, the same value is used for all features.\n is_categorical : ndarray of bool of shape (n_features,), default=None\n Indicates categorical features.\n monotonic_cst : array-like of shape (n_features,), dtype=int, default=None\n Indicates the monotonic constraint to enforce on each feature. -1, 1\n and 0 respectively correspond to a positive constraint, negative\n constraint and no constraint. Read more in the :ref:`User Guide\n `.\n l2_regularization : float, default=0.\n The L2 regularization parameter.\n min_hessian_to_split : float, default=1e-3\n The minimum sum of hessians needed in each node. Splits that result in\n at least one child having a sum of hessians less than\n ``min_hessian_to_split`` are discarded.\n shrinkage : float, default=1.\n The shrinkage parameter to apply to the leaves values, also known as\n learning rate.\n n_threads : int, default=None\n Number of OpenMP threads to use. `_openmp_effective_n_threads` is called\n to determine the effective number of threads use, which takes cgroups CPU\n quotes into account. See the docstring of `_openmp_effective_n_threads`\n for details.\n ", "source_code": "\n\nclass TreeGrower:\n \"\"\"Tree grower class used to build a tree.\n\n The tree is fitted to predict the values of a Newton-Raphson step. The\n splits are considered in a best-first fashion, and the quality of a\n split is defined in splitting._split_gain.\n\n Parameters\n ----------\n X_binned : ndarray of shape (n_samples, n_features), dtype=np.uint8\n The binned input samples. Must be Fortran-aligned.\n gradients : ndarray of shape (n_samples,)\n The gradients of each training sample. Those are the gradients of the\n loss w.r.t the predictions, evaluated at iteration ``i - 1``.\n hessians : ndarray of shape (n_samples,)\n The hessians of each training sample. Those are the hessians of the\n loss w.r.t the predictions, evaluated at iteration ``i - 1``.\n max_leaf_nodes : int, default=None\n The maximum number of leaves for each tree. If None, there is no\n maximum limit.\n max_depth : int, default=None\n The maximum depth of each tree. The depth of a tree is the number of\n edges to go from the root to the deepest leaf.\n Depth isn't constrained by default.\n min_samples_leaf : int, default=20\n The minimum number of samples per leaf.\n min_gain_to_split : float, default=0.\n The minimum gain needed to split a node. Splits with lower gain will\n be ignored.\n n_bins : int, default=256\n The total number of bins, including the bin for missing values. Used\n to define the shape of the histograms.\n n_bins_non_missing : ndarray, dtype=np.uint32, default=None\n For each feature, gives the number of bins actually used for\n non-missing values. For features with a lot of unique values, this\n is equal to ``n_bins - 1``. If it's an int, all features are\n considered to have the same number of bins. 
If None, all features\n are considered to have ``n_bins - 1`` bins.\n has_missing_values : bool or ndarray, dtype=bool, default=False\n Whether each feature contains missing values (in the training data).\n If it's a bool, the same value is used for all features.\n is_categorical : ndarray of bool of shape (n_features,), default=None\n Indicates categorical features.\n monotonic_cst : array-like of shape (n_features,), dtype=int, default=None\n Indicates the monotonic constraint to enforce on each feature. -1, 1\n and 0 respectively correspond to a positive constraint, negative\n constraint and no constraint. Read more in the :ref:`User Guide\n `.\n l2_regularization : float, default=0.\n The L2 regularization parameter.\n min_hessian_to_split : float, default=1e-3\n The minimum sum of hessians needed in each node. Splits that result in\n at least one child having a sum of hessians less than\n ``min_hessian_to_split`` are discarded.\n shrinkage : float, default=1.\n The shrinkage parameter to apply to the leaves values, also known as\n learning rate.\n n_threads : int, default=None\n Number of OpenMP threads to use. `_openmp_effective_n_threads` is called\n to determine the effective number of threads use, which takes cgroups CPU\n quotes into account. See the docstring of `_openmp_effective_n_threads`\n for details.\n \"\"\"\n \n def __init__(self, X_binned, gradients, hessians, max_leaf_nodes=None, max_depth=None, min_samples_leaf=20, min_gain_to_split=0.0, n_bins=256, n_bins_non_missing=None, has_missing_values=False, is_categorical=None, monotonic_cst=None, l2_regularization=0.0, min_hessian_to_split=0.001, shrinkage=1.0, n_threads=None):\n self._validate_parameters(X_binned, max_leaf_nodes, max_depth, min_samples_leaf, min_gain_to_split, l2_regularization, min_hessian_to_split)\n n_threads = _openmp_effective_n_threads(n_threads)\n if n_bins_non_missing is None:\n n_bins_non_missing = n_bins - 1\n if isinstance(n_bins_non_missing, numbers.Integral):\n n_bins_non_missing = np.array([n_bins_non_missing] * X_binned.shape[1], dtype=np.uint32)\n else:\n n_bins_non_missing = np.asarray(n_bins_non_missing, dtype=np.uint32)\n if isinstance(has_missing_values, bool):\n has_missing_values = [has_missing_values] * X_binned.shape[1]\n has_missing_values = np.asarray(has_missing_values, dtype=np.uint8)\n if monotonic_cst is None:\n self.with_monotonic_cst = False\n monotonic_cst = np.full(shape=X_binned.shape[1], fill_value=MonotonicConstraint.NO_CST, dtype=np.int8)\n else:\n self.with_monotonic_cst = True\n monotonic_cst = np.asarray(monotonic_cst, dtype=np.int8)\n if monotonic_cst.shape[0] != X_binned.shape[1]:\n raise ValueError('monotonic_cst has shape {} but the input data X has {} features.'.format(monotonic_cst.shape[0], X_binned.shape[1]))\n if np.any(monotonic_cst < -1) or np.any(monotonic_cst > 1):\n raise ValueError('monotonic_cst must be None or an array-like of -1, 0 or 1.')\n if is_categorical is None:\n is_categorical = np.zeros(shape=X_binned.shape[1], dtype=np.uint8)\n else:\n is_categorical = np.asarray(is_categorical, dtype=np.uint8)\n if np.any(np.logical_and(is_categorical == 1, monotonic_cst != MonotonicConstraint.NO_CST)):\n raise ValueError('Categorical features cannot have monotonic constraints.')\n hessians_are_constant = hessians.shape[0] == 1\n self.histogram_builder = HistogramBuilder(X_binned, n_bins, gradients, hessians, hessians_are_constant, n_threads)\n missing_values_bin_idx = n_bins - 1\n self.splitter = Splitter(X_binned, n_bins_non_missing, 
missing_values_bin_idx, has_missing_values, is_categorical, monotonic_cst, l2_regularization, min_hessian_to_split, min_samples_leaf, min_gain_to_split, hessians_are_constant, n_threads)\n self.n_bins_non_missing = n_bins_non_missing\n self.missing_values_bin_idx = missing_values_bin_idx\n self.max_leaf_nodes = max_leaf_nodes\n self.has_missing_values = has_missing_values\n self.monotonic_cst = monotonic_cst\n self.is_categorical = is_categorical\n self.l2_regularization = l2_regularization\n self.n_features = X_binned.shape[1]\n self.max_depth = max_depth\n self.min_samples_leaf = min_samples_leaf\n self.X_binned = X_binned\n self.min_gain_to_split = min_gain_to_split\n self.shrinkage = shrinkage\n self.n_threads = n_threads\n self.splittable_nodes = []\n self.finalized_leaves = []\n self.total_find_split_time = 0.0\n self.total_compute_hist_time = 0.0\n self.total_apply_split_time = 0.0\n self.n_categorical_splits = 0\n self._intilialize_root(gradients, hessians, hessians_are_constant)\n self.n_nodes = 1\n \n def _validate_parameters(self, X_binned, max_leaf_nodes, max_depth, min_samples_leaf, min_gain_to_split, l2_regularization, min_hessian_to_split):\n \"\"\"Validate parameters passed to __init__.\n\n Also validate parameters passed to splitter.\n \"\"\"\n if X_binned.dtype != np.uint8:\n raise NotImplementedError('X_binned must be of type uint8.')\n if not X_binned.flags.f_contiguous:\n raise ValueError('X_binned should be passed as Fortran contiguous array for maximum efficiency.')\n if max_leaf_nodes is not None and max_leaf_nodes <= 1:\n raise ValueError('max_leaf_nodes={} should not be smaller than 2'.format(max_leaf_nodes))\n if max_depth is not None and max_depth < 1:\n raise ValueError('max_depth={} should not be smaller than 1'.format(max_depth))\n if min_samples_leaf < 1:\n raise ValueError('min_samples_leaf={} should not be smaller than 1'.format(min_samples_leaf))\n if min_gain_to_split < 0:\n raise ValueError('min_gain_to_split={} must be positive.'.format(min_gain_to_split))\n if l2_regularization < 0:\n raise ValueError('l2_regularization={} must be positive.'.format(l2_regularization))\n if min_hessian_to_split < 0:\n raise ValueError('min_hessian_to_split={} must be positive.'.format(min_hessian_to_split))\n \n def grow(self):\n \"\"\"Grow the tree, from root to leaves.\"\"\"\n while self.splittable_nodes:\n self.split_next()\n self._apply_shrinkage()\n \n def _apply_shrinkage(self):\n \"\"\"Multiply leaves values by shrinkage parameter.\n\n This must be done at the very end of the growing process. If this were\n done during the growing process e.g. 
in finalize_leaf(), then a leaf\n would be shrunk but its sibling would potentially not be (if it's a\n non-leaf), which would lead to a wrong computation of the 'middle'\n value needed to enforce the monotonic constraints.\n \"\"\"\n for leaf in self.finalized_leaves:\n leaf.value *= self.shrinkage\n \n def _intilialize_root(self, gradients, hessians, hessians_are_constant):\n \"\"\"Initialize root node and finalize it if needed.\"\"\"\n n_samples = self.X_binned.shape[0]\n depth = 0\n sum_gradients = sum_parallel(gradients, self.n_threads)\n if self.histogram_builder.hessians_are_constant:\n sum_hessians = hessians[0] * n_samples\n else:\n sum_hessians = sum_parallel(hessians, self.n_threads)\n self.root = TreeNode(depth=depth, sample_indices=self.splitter.partition, sum_gradients=sum_gradients, sum_hessians=sum_hessians, value=0)\n self.root.partition_start = 0\n self.root.partition_stop = n_samples\n if self.root.n_samples < 2 * self.min_samples_leaf:\n self._finalize_leaf(self.root)\n return\n if sum_hessians < self.splitter.min_hessian_to_split:\n self._finalize_leaf(self.root)\n return\n self.root.histograms = self.histogram_builder.compute_histograms_brute(self.root.sample_indices)\n self._compute_best_split_and_push(self.root)\n \n def _compute_best_split_and_push(self, node):\n \"\"\"Compute the best possible split (SplitInfo) of a given node.\n\n Also push it in the heap of splittable nodes if gain isn't zero.\n The gain of a node is 0 if either all the leaves are pure\n (best gain = 0), or if no split would satisfy the constraints,\n (min_hessians_to_split, min_gain_to_split, min_samples_leaf)\n \"\"\"\n node.split_info = self.splitter.find_node_split(node.n_samples, node.histograms, node.sum_gradients, node.sum_hessians, node.value, node.children_lower_bound, node.children_upper_bound)\n if node.split_info.gain <= 0:\n self._finalize_leaf(node)\n else:\n heappush(self.splittable_nodes, node)\n \n def split_next(self):\n \"\"\"Split the node with highest potential gain.\n\n Returns\n -------\n left : TreeNode\n The resulting left child.\n right : TreeNode\n The resulting right child.\n \"\"\"\n node = heappop(self.splittable_nodes)\n tic = time()\n (sample_indices_left, sample_indices_right, right_child_pos) = self.splitter.split_indices(node.split_info, node.sample_indices)\n self.total_apply_split_time += time() - tic\n depth = node.depth + 1\n n_leaf_nodes = len(self.finalized_leaves) + len(self.splittable_nodes)\n n_leaf_nodes += 2\n left_child_node = TreeNode(depth, sample_indices_left, node.split_info.sum_gradient_left, node.split_info.sum_hessian_left, value=node.split_info.value_left)\n right_child_node = TreeNode(depth, sample_indices_right, node.split_info.sum_gradient_right, node.split_info.sum_hessian_right, value=node.split_info.value_right)\n node.right_child = right_child_node\n node.left_child = left_child_node\n left_child_node.partition_start = node.partition_start\n left_child_node.partition_stop = node.partition_start + right_child_pos\n right_child_node.partition_start = left_child_node.partition_stop\n right_child_node.partition_stop = node.partition_stop\n if not self.has_missing_values[node.split_info.feature_idx]:\n node.split_info.missing_go_to_left = left_child_node.n_samples > right_child_node.n_samples\n self.n_nodes += 2\n self.n_categorical_splits += node.split_info.is_categorical\n if self.max_leaf_nodes is not None and n_leaf_nodes == self.max_leaf_nodes:\n self._finalize_leaf(left_child_node)\n self._finalize_leaf(right_child_node)\n 
self._finalize_splittable_nodes()\n return left_child_node, right_child_node\n if self.max_depth is not None and depth == self.max_depth:\n self._finalize_leaf(left_child_node)\n self._finalize_leaf(right_child_node)\n return left_child_node, right_child_node\n if left_child_node.n_samples < self.min_samples_leaf * 2:\n self._finalize_leaf(left_child_node)\n if right_child_node.n_samples < self.min_samples_leaf * 2:\n self._finalize_leaf(right_child_node)\n if self.with_monotonic_cst:\n if self.monotonic_cst[node.split_info.feature_idx] == MonotonicConstraint.NO_CST:\n lower_left = lower_right = node.children_lower_bound\n upper_left = upper_right = node.children_upper_bound\n else:\n mid = (left_child_node.value + right_child_node.value) / 2\n if self.monotonic_cst[node.split_info.feature_idx] == MonotonicConstraint.POS:\n (lower_left, upper_left) = (node.children_lower_bound, mid)\n (lower_right, upper_right) = (mid, node.children_upper_bound)\n else:\n (lower_left, upper_left) = (mid, node.children_upper_bound)\n (lower_right, upper_right) = (node.children_lower_bound, mid)\n left_child_node.set_children_bounds(lower_left, upper_left)\n right_child_node.set_children_bounds(lower_right, upper_right)\n should_split_left = not left_child_node.is_leaf\n should_split_right = not right_child_node.is_leaf\n if should_split_left or should_split_right:\n n_samples_left = left_child_node.sample_indices.shape[0]\n n_samples_right = right_child_node.sample_indices.shape[0]\n if n_samples_left < n_samples_right:\n smallest_child = left_child_node\n largest_child = right_child_node\n else:\n smallest_child = right_child_node\n largest_child = left_child_node\n tic = time()\n smallest_child.histograms = self.histogram_builder.compute_histograms_brute(smallest_child.sample_indices)\n largest_child.histograms = self.histogram_builder.compute_histograms_subtraction(node.histograms, smallest_child.histograms)\n self.total_compute_hist_time += time() - tic\n tic = time()\n if should_split_left:\n self._compute_best_split_and_push(left_child_node)\n if should_split_right:\n self._compute_best_split_and_push(right_child_node)\n self.total_find_split_time += time() - tic\n for child in (left_child_node, right_child_node):\n if child.is_leaf:\n del child.histograms\n del node.histograms\n return left_child_node, right_child_node\n \n def _finalize_leaf(self, node):\n \"\"\"Make node a leaf of the tree being grown.\"\"\"\n node.is_leaf = True\n self.finalized_leaves.append(node)\n \n def _finalize_splittable_nodes(self):\n \"\"\"Transform all splittable nodes into leaves.\n\n Used when some constraint is met e.g. 
maximum number of leaves or\n maximum depth.\"\"\"\n while len(self.splittable_nodes) > 0:\n node = self.splittable_nodes.pop()\n self._finalize_leaf(node)\n \n def make_predictor(self, binning_thresholds):\n \"\"\"Make a TreePredictor object out of the current tree.\n\n Parameters\n ----------\n binning_thresholds : array-like of floats\n Corresponds to the bin_thresholds_ attribute of the BinMapper.\n For each feature, this stores:\n\n - the bin frontiers for continuous features\n - the unique raw category values for categorical features\n\n Returns\n -------\n A TreePredictor object.\n \"\"\"\n predictor_nodes = np.zeros(self.n_nodes, dtype=PREDICTOR_RECORD_DTYPE)\n binned_left_cat_bitsets = np.zeros((self.n_categorical_splits, 8), dtype=X_BITSET_INNER_DTYPE)\n raw_left_cat_bitsets = np.zeros((self.n_categorical_splits, 8), dtype=X_BITSET_INNER_DTYPE)\n _fill_predictor_arrays(predictor_nodes, binned_left_cat_bitsets, raw_left_cat_bitsets, self.root, binning_thresholds, self.n_bins_non_missing)\n return TreePredictor(predictor_nodes, binned_left_cat_bitsets, raw_left_cat_bitsets)\n" }, @@ -21389,7 +21455,7 @@ "sklearn.ensemble._hist_gradient_boosting.grower.TreeNode.__lt__" ], "is_public": false, - "description": "Tree Node class used in TreeGrower.\n\nThis isn't used for prediction purposes, only for training (see TreePredictor).", + "description": "Tree Node class used in TreeGrower.\n\nThis isn't used for prediction purposes, only for training (see\nTreePredictor).", "docstring": "Tree Node class used in TreeGrower.\n\n This isn't used for prediction purposes, only for training (see\n TreePredictor).\n\n Parameters\n ----------\n depth : int\n The depth of the node, i.e. its distance from the root.\n sample_indices : ndarray of shape (n_samples_at_node,), dtype=np.uint\n The indices of the samples at the node.\n sum_gradients : float\n The sum of the gradients of the samples at the node.\n sum_hessians : float\n The sum of the hessians of the samples at the node.\n\n Attributes\n ----------\n depth : int\n The depth of the node, i.e. its distance from the root.\n sample_indices : ndarray of shape (n_samples_at_node,), dtype=np.uint\n The indices of the samples at the node.\n sum_gradients : float\n The sum of the gradients of the samples at the node.\n sum_hessians : float\n The sum of the hessians of the samples at the node.\n split_info : SplitInfo or None\n The result of the split evaluation.\n left_child : TreeNode or None\n The left child of the node. None for leaves.\n right_child : TreeNode or None\n The right child of the node. None for leaves.\n value : float or None\n The value of the leaf, as computed in finalize_leaf(). None for\n non-leaf nodes.\n partition_start : int\n start position of the node's sample_indices in splitter.partition.\n partition_stop : int\n stop position of the node's sample_indices in splitter.partition.\n ", "source_code": "\n\nclass TreeNode:\n \"\"\"Tree Node class used in TreeGrower.\n\n This isn't used for prediction purposes, only for training (see\n TreePredictor).\n\n Parameters\n ----------\n depth : int\n The depth of the node, i.e. its distance from the root.\n sample_indices : ndarray of shape (n_samples_at_node,), dtype=np.uint\n The indices of the samples at the node.\n sum_gradients : float\n The sum of the gradients of the samples at the node.\n sum_hessians : float\n The sum of the hessians of the samples at the node.\n\n Attributes\n ----------\n depth : int\n The depth of the node, i.e. 
its distance from the root.\n sample_indices : ndarray of shape (n_samples_at_node,), dtype=np.uint\n The indices of the samples at the node.\n sum_gradients : float\n The sum of the gradients of the samples at the node.\n sum_hessians : float\n The sum of the hessians of the samples at the node.\n split_info : SplitInfo or None\n The result of the split evaluation.\n left_child : TreeNode or None\n The left child of the node. None for leaves.\n right_child : TreeNode or None\n The right child of the node. None for leaves.\n value : float or None\n The value of the leaf, as computed in finalize_leaf(). None for\n non-leaf nodes.\n partition_start : int\n start position of the node's sample_indices in splitter.partition.\n partition_stop : int\n stop position of the node's sample_indices in splitter.partition.\n \"\"\"\n split_info = None\n left_child = None\n right_child = None\n histograms = None\n partition_start = 0\n partition_stop = 0\n \n def __init__(self, depth, sample_indices, sum_gradients, sum_hessians, value=None):\n self.depth = depth\n self.sample_indices = sample_indices\n self.n_samples = sample_indices.shape[0]\n self.sum_gradients = sum_gradients\n self.sum_hessians = sum_hessians\n self.value = value\n self.is_leaf = False\n self.set_children_bounds(float('-inf'), float('+inf'))\n \n def set_children_bounds(self, lower, upper):\n \"\"\"Set children values bounds to respect monotonic constraints.\"\"\"\n self.children_lower_bound = lower\n self.children_upper_bound = upper\n \n def __lt__(self, other_node):\n \"\"\"Comparison for priority queue.\n\n Nodes with high gain are higher priority than nodes with low gain.\n\n heapq.heappush only need the '<' operator.\n heapq.heappop take the smallest item first (smaller is higher\n priority).\n\n Parameters\n ----------\n other_node : TreeNode\n The node to compare with.\n \"\"\"\n return self.split_info.gain > other_node.split_info.gain\n" }, @@ -21424,7 +21490,7 @@ "sklearn.ensemble._hist_gradient_boosting.loss.BinaryCrossEntropy.predict_proba" ], "is_public": false, - "description": "Binary cross-entropy loss, for binary classification.\n\nFor a given sample x_i, the binary cross-entropy loss is defined as the negative log-likelihood of the model which can be expressed as:: loss(x_i) = log(1 + exp(raw_pred_i)) - y_true_i * raw_pred_i See The Elements of Statistical Learning, by Hastie, Tibshirani, Friedman, section 4.4.1 (about logistic regression).", + "description": "Binary cross-entropy loss, for binary classification.\n\nFor a given sample x_i, the binary cross-entropy loss is defined as the\nnegative log-likelihood of the model which can be expressed as::\n\n loss(x_i) = log(1 + exp(raw_pred_i)) - y_true_i * raw_pred_i\n\nSee The Elements of Statistical Learning, by Hastie, Tibshirani, Friedman,\nsection 4.4.1 (about logistic regression).", "docstring": "Binary cross-entropy loss, for binary classification.\n\n For a given sample x_i, the binary cross-entropy loss is defined as the\n negative log-likelihood of the model which can be expressed as::\n\n loss(x_i) = log(1 + exp(raw_pred_i)) - y_true_i * raw_pred_i\n\n See The Elements of Statistical Learning, by Hastie, Tibshirani, Friedman,\n section 4.4.1 (about logistic regression).\n ", "source_code": "\n\nclass BinaryCrossEntropy(BaseLoss):\n \"\"\"Binary cross-entropy loss, for binary classification.\n\n For a given sample x_i, the binary cross-entropy loss is defined as the\n negative log-likelihood of the model which can be expressed as::\n\n loss(x_i) = log(1 + 
exp(raw_pred_i)) - y_true_i * raw_pred_i\n\n See The Elements of Statistical Learning, by Hastie, Tibshirani, Friedman,\n section 4.4.1 (about logistic regression).\n \"\"\"\n \n def __init__(self, sample_weight, n_threads=None):\n super().__init__(hessians_are_constant=False, n_threads=n_threads)\n inverse_link_function = staticmethod(expit)\n \n def pointwise_loss(self, y_true, raw_predictions):\n raw_predictions = raw_predictions.reshape(-1)\n loss = np.logaddexp(0, raw_predictions) - y_true * raw_predictions\n return loss\n \n def get_baseline_prediction(self, y_train, sample_weight, prediction_dim):\n if prediction_dim > 2:\n raise ValueError(\"loss='binary_crossentropy' is not defined for multiclass classification with n_classes=%d, use loss='categorical_crossentropy' instead\" % prediction_dim)\n proba_positive_class = np.average(y_train, weights=sample_weight)\n eps = np.finfo(y_train.dtype).eps\n proba_positive_class = np.clip(proba_positive_class, eps, 1 - eps)\n return np.log(proba_positive_class / (1 - proba_positive_class))\n \n def update_gradients_and_hessians(self, gradients, hessians, y_true, raw_predictions, sample_weight):\n raw_predictions = raw_predictions.reshape(-1)\n gradients = gradients.reshape(-1)\n hessians = hessians.reshape(-1)\n _update_gradients_hessians_binary_crossentropy(gradients, hessians, y_true, raw_predictions, sample_weight, self.n_threads)\n \n def predict_proba(self, raw_predictions):\n raw_predictions = raw_predictions.reshape(-1)\n proba = np.empty((raw_predictions.shape[0], 2), dtype=Y_DTYPE)\n proba[:, 1] = expit(raw_predictions)\n proba[:, 0] = 1 - proba[:, 1]\n return proba\n" }, @@ -21441,7 +21507,7 @@ "sklearn.ensemble._hist_gradient_boosting.loss.CategoricalCrossEntropy.predict_proba" ], "is_public": false, - "description": "Categorical cross-entropy loss, for multiclass classification.\n\nFor a given sample x_i, the categorical cross-entropy loss is defined as the negative log-likelihood of the model and generalizes the binary cross-entropy to more than 2 classes.", + "description": "Categorical cross-entropy loss, for multiclass classification.\n\nFor a given sample x_i, the categorical cross-entropy loss is defined as\nthe negative log-likelihood of the model and generalizes the binary\ncross-entropy to more than 2 classes.", "docstring": "Categorical cross-entropy loss, for multiclass classification.\n\n For a given sample x_i, the categorical cross-entropy loss is defined as\n the negative log-likelihood of the model and generalizes the binary\n cross-entropy to more than 2 classes.\n ", "source_code": "\n\nclass CategoricalCrossEntropy(BaseLoss):\n \"\"\"Categorical cross-entropy loss, for multiclass classification.\n\n For a given sample x_i, the categorical cross-entropy loss is defined as\n the negative log-likelihood of the model and generalizes the binary\n cross-entropy to more than 2 classes.\n \"\"\"\n \n def __init__(self, sample_weight, n_threads=None):\n super().__init__(hessians_are_constant=False, n_threads=n_threads)\n \n def pointwise_loss(self, y_true, raw_predictions):\n one_hot_true = np.zeros_like(raw_predictions)\n prediction_dim = raw_predictions.shape[0]\n for k in range(prediction_dim):\n one_hot_true[k, :] = y_true == k\n loss = logsumexp(raw_predictions, axis=0) - (one_hot_true * raw_predictions).sum(axis=0)\n return loss\n \n def get_baseline_prediction(self, y_train, sample_weight, prediction_dim):\n init_value = np.zeros(shape=(prediction_dim, 1), dtype=Y_DTYPE)\n eps = np.finfo(y_train.dtype).eps\n for k 
in range(prediction_dim):\n proba_kth_class = np.average(y_train == k, weights=sample_weight)\n proba_kth_class = np.clip(proba_kth_class, eps, 1 - eps)\n init_value[k, :] += np.log(proba_kth_class)\n return init_value\n \n def update_gradients_and_hessians(self, gradients, hessians, y_true, raw_predictions, sample_weight):\n _update_gradients_hessians_categorical_crossentropy(gradients, hessians, y_true, raw_predictions, sample_weight, self.n_threads)\n \n def predict_proba(self, raw_predictions):\n proba = np.exp(raw_predictions - logsumexp(raw_predictions, axis=0)[np.newaxis, :])\n return proba.T\n" }, @@ -21459,7 +21525,7 @@ "sklearn.ensemble._hist_gradient_boosting.loss.LeastAbsoluteDeviation.update_leaves_values" ], "is_public": false, - "description": "Least absolute deviation, for regression.\n\nFor a given sample x_i, the loss is defined as:: loss(x_i) = |y_true_i - raw_pred_i|", + "description": "Least absolute deviation, for regression.\n\nFor a given sample x_i, the loss is defined as::\n\n loss(x_i) = |y_true_i - raw_pred_i|", "docstring": "Least absolute deviation, for regression.\n\n For a given sample x_i, the loss is defined as::\n\n loss(x_i) = |y_true_i - raw_pred_i|\n ", "source_code": "\n\nclass LeastAbsoluteDeviation(BaseLoss):\n \"\"\"Least absolute deviation, for regression.\n\n For a given sample x_i, the loss is defined as::\n\n loss(x_i) = |y_true_i - raw_pred_i|\n \"\"\"\n \n def __init__(self, sample_weight, n_threads=None):\n super().__init__(hessians_are_constant=sample_weight is None, n_threads=n_threads)\n need_update_leaves_values = True\n \n def pointwise_loss(self, y_true, raw_predictions):\n raw_predictions = raw_predictions.reshape(-1)\n loss = np.abs(y_true - raw_predictions)\n return loss\n \n def get_baseline_prediction(self, y_train, sample_weight, prediction_dim):\n if sample_weight is None:\n return np.median(y_train)\n else:\n return _weighted_percentile(y_train, sample_weight, 50)\n \n @staticmethod\n def inverse_link_function(raw_predictions):\n return raw_predictions\n \n def update_gradients_and_hessians(self, gradients, hessians, y_true, raw_predictions, sample_weight):\n raw_predictions = raw_predictions.reshape(-1)\n gradients = gradients.reshape(-1)\n if sample_weight is None:\n _update_gradients_least_absolute_deviation(gradients, y_true, raw_predictions, self.n_threads)\n else:\n hessians = hessians.reshape(-1)\n _update_gradients_hessians_least_absolute_deviation(gradients, hessians, y_true, raw_predictions, sample_weight, self.n_threads)\n \n def update_leaves_values(self, grower, y_true, raw_predictions, sample_weight):\n for leaf in grower.finalized_leaves:\n indices = leaf.sample_indices\n if sample_weight is None:\n median_res = np.median(y_true[indices] - raw_predictions[indices])\n else:\n median_res = _weighted_percentile(y_true[indices] - raw_predictions[indices], sample_weight=sample_weight[indices], percentile=50)\n leaf.value = grower.shrinkage * median_res\n" }, @@ -21476,7 +21542,7 @@ "sklearn.ensemble._hist_gradient_boosting.loss.LeastSquares.update_gradients_and_hessians" ], "is_public": false, - "description": "Least squares loss, for regression.\n\nFor a given sample x_i, least squares loss is defined as:: loss(x_i) = 0.5 * (y_true_i - raw_pred_i)**2 This actually computes the half least squares loss to simplify the computation of the gradients and get a unit hessian (and be consistent with what is done in LightGBM).", + "description": "Least squares loss, for regression.\n\nFor a given sample x_i, least squares loss 
is defined as::\n\n loss(x_i) = 0.5 * (y_true_i - raw_pred_i)**2\n\nThis actually computes the half least squares loss to simplify\nthe computation of the gradients and get a unit hessian (and be consistent\nwith what is done in LightGBM).", "docstring": "Least squares loss, for regression.\n\n For a given sample x_i, least squares loss is defined as::\n\n loss(x_i) = 0.5 * (y_true_i - raw_pred_i)**2\n\n This actually computes the half least squares loss to simplify\n the computation of the gradients and get a unit hessian (and be consistent\n with what is done in LightGBM).\n ", "source_code": "\n\nclass LeastSquares(BaseLoss):\n \"\"\"Least squares loss, for regression.\n\n For a given sample x_i, least squares loss is defined as::\n\n loss(x_i) = 0.5 * (y_true_i - raw_pred_i)**2\n\n This actually computes the half least squares loss to simplify\n the computation of the gradients and get a unit hessian (and be consistent\n with what is done in LightGBM).\n \"\"\"\n \n def __init__(self, sample_weight, n_threads=None):\n super().__init__(hessians_are_constant=sample_weight is None, n_threads=n_threads)\n \n def pointwise_loss(self, y_true, raw_predictions):\n raw_predictions = raw_predictions.reshape(-1)\n loss = 0.5 * np.power(y_true - raw_predictions, 2)\n return loss\n \n def get_baseline_prediction(self, y_train, sample_weight, prediction_dim):\n return np.average(y_train, weights=sample_weight)\n \n @staticmethod\n def inverse_link_function(raw_predictions):\n return raw_predictions\n \n def update_gradients_and_hessians(self, gradients, hessians, y_true, raw_predictions, sample_weight):\n raw_predictions = raw_predictions.reshape(-1)\n gradients = gradients.reshape(-1)\n if sample_weight is None:\n _update_gradients_least_squares(gradients, y_true, raw_predictions, self.n_threads)\n else:\n hessians = hessians.reshape(-1)\n _update_gradients_hessians_least_squares(gradients, hessians, y_true, raw_predictions, sample_weight, self.n_threads)\n" }, @@ -21492,7 +21558,7 @@ "sklearn.ensemble._hist_gradient_boosting.loss.Poisson.update_gradients_and_hessians" ], "is_public": false, - "description": "Poisson deviance loss with log-link, for regression.\n\nFor a given sample x_i, Poisson deviance loss is defined as:: loss(x_i) = y_true_i * log(y_true_i/exp(raw_pred_i)) - y_true_i + exp(raw_pred_i)) This actually computes half the Poisson deviance to simplify the computation of the gradients.", + "description": "Poisson deviance loss with log-link, for regression.\n\nFor a given sample x_i, Poisson deviance loss is defined as::\n\n loss(x_i) = y_true_i * log(y_true_i/exp(raw_pred_i))\n - y_true_i + exp(raw_pred_i))\n\nThis actually computes half the Poisson deviance to simplify\nthe computation of the gradients.", "docstring": "Poisson deviance loss with log-link, for regression.\n\n For a given sample x_i, Poisson deviance loss is defined as::\n\n loss(x_i) = y_true_i * log(y_true_i/exp(raw_pred_i))\n - y_true_i + exp(raw_pred_i))\n\n This actually computes half the Poisson deviance to simplify\n the computation of the gradients.\n ", "source_code": "\n\nclass Poisson(BaseLoss):\n \"\"\"Poisson deviance loss with log-link, for regression.\n\n For a given sample x_i, Poisson deviance loss is defined as::\n\n loss(x_i) = y_true_i * log(y_true_i/exp(raw_pred_i))\n - y_true_i + exp(raw_pred_i))\n\n This actually computes half the Poisson deviance to simplify\n the computation of the gradients.\n \"\"\"\n \n def __init__(self, sample_weight, n_threads=None):\n 
super().__init__(hessians_are_constant=False, n_threads=n_threads)\n inverse_link_function = staticmethod(np.exp)\n \n def pointwise_loss(self, y_true, raw_predictions):\n raw_predictions = raw_predictions.reshape(-1)\n loss = xlogy(y_true, y_true) - y_true * (raw_predictions + 1) + np.exp(raw_predictions)\n return loss\n \n def get_baseline_prediction(self, y_train, sample_weight, prediction_dim):\n y_pred = np.average(y_train, weights=sample_weight)\n eps = np.finfo(y_train.dtype).eps\n y_pred = np.clip(y_pred, eps, None)\n return np.log(y_pred)\n \n def update_gradients_and_hessians(self, gradients, hessians, y_true, raw_predictions, sample_weight):\n raw_predictions = raw_predictions.reshape(-1)\n gradients = gradients.reshape(-1)\n hessians = hessians.reshape(-1)\n _update_gradients_hessians_poisson(gradients, hessians, y_true, raw_predictions, sample_weight, self.n_threads)\n" }, @@ -21532,7 +21598,7 @@ "sklearn.ensemble._iforest.IsolationForest._more_tags" ], "is_public": true, - "description": "Isolation Forest Algorithm.\n\nReturn the anomaly score of each sample using the IsolationForest algorithm The IsolationForest 'isolates' observations by randomly selecting a feature and then randomly selecting a split value between the maximum and minimum values of the selected feature. Since recursive partitioning can be represented by a tree structure, the number of splittings required to isolate a sample is equivalent to the path length from the root node to the terminating node. This path length, averaged over a forest of such random trees, is a measure of normality and our decision function. Random partitioning produces noticeably shorter paths for anomalies. Hence, when a forest of random trees collectively produce shorter path lengths for particular samples, they are highly likely to be anomalies. Read more in the :ref:`User Guide `. .. versionadded:: 0.18", + "description": "Isolation Forest Algorithm.\n\nReturn the anomaly score of each sample using the IsolationForest algorithm\n\nThe IsolationForest 'isolates' observations by randomly selecting a feature\nand then randomly selecting a split value between the maximum and minimum\nvalues of the selected feature.\n\nSince recursive partitioning can be represented by a tree structure, the\nnumber of splittings required to isolate a sample is equivalent to the path\nlength from the root node to the terminating node.\n\nThis path length, averaged over a forest of such random trees, is a\nmeasure of normality and our decision function.\n\nRandom partitioning produces noticeably shorter paths for anomalies.\nHence, when a forest of random trees collectively produce shorter path\nlengths for particular samples, they are highly likely to be anomalies.\n\nRead more in the :ref:`User Guide `.\n\n.. 
versionadded:: 0.18", "docstring": "\n Isolation Forest Algorithm.\n\n Return the anomaly score of each sample using the IsolationForest algorithm\n\n The IsolationForest 'isolates' observations by randomly selecting a feature\n and then randomly selecting a split value between the maximum and minimum\n values of the selected feature.\n\n Since recursive partitioning can be represented by a tree structure, the\n number of splittings required to isolate a sample is equivalent to the path\n length from the root node to the terminating node.\n\n This path length, averaged over a forest of such random trees, is a\n measure of normality and our decision function.\n\n Random partitioning produces noticeably shorter paths for anomalies.\n Hence, when a forest of random trees collectively produce shorter path\n lengths for particular samples, they are highly likely to be anomalies.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.18\n\n Parameters\n ----------\n n_estimators : int, default=100\n The number of base estimators in the ensemble.\n\n max_samples : \"auto\", int or float, default=\"auto\"\n The number of samples to draw from X to train each base estimator.\n - If int, then draw `max_samples` samples.\n - If float, then draw `max_samples * X.shape[0]` samples.\n - If \"auto\", then `max_samples=min(256, n_samples)`.\n\n If max_samples is larger than the number of samples provided,\n all samples will be used for all trees (no sampling).\n\n contamination : 'auto' or float, default='auto'\n The amount of contamination of the data set, i.e. the proportion\n of outliers in the data set. Used when fitting to define the threshold\n on the scores of the samples.\n\n - If 'auto', the threshold is determined as in the\n original paper.\n - If float, the contamination should be in the range (0, 0.5].\n\n .. versionchanged:: 0.22\n The default value of ``contamination`` changed from 0.1\n to ``'auto'``.\n\n max_features : int or float, default=1.0\n The number of features to draw from X to train each base estimator.\n\n - If int, then draw `max_features` features.\n - If float, then draw `max_features * X.shape[1]` features.\n\n bootstrap : bool, default=False\n If True, individual trees are fit on random subsets of the training\n data sampled with replacement. If False, sampling without replacement\n is performed.\n\n n_jobs : int, default=None\n The number of jobs to run in parallel for both :meth:`fit` and\n :meth:`predict`. ``None`` means 1 unless in a\n :obj:`joblib.parallel_backend` context. ``-1`` means using all\n processors. See :term:`Glossary ` for more details.\n\n random_state : int, RandomState instance or None, default=None\n Controls the pseudo-randomness of the selection of the feature\n and split values for each branching step and each tree in the forest.\n\n Pass an int for reproducible results across multiple function calls.\n See :term:`Glossary `.\n\n verbose : int, default=0\n Controls the verbosity of the tree building process.\n\n warm_start : bool, default=False\n When set to ``True``, reuse the solution of the previous call to fit\n and add more estimators to the ensemble, otherwise, just fit a whole\n new forest. See :term:`the Glossary `.\n\n .. 
versionadded:: 0.21\n\n Attributes\n ----------\n base_estimator_ : ExtraTreeRegressor instance\n The child estimator template used to create the collection of\n fitted sub-estimators.\n\n estimators_ : list of ExtraTreeRegressor instances\n The collection of fitted sub-estimators.\n\n estimators_features_ : list of ndarray\n The subset of drawn features for each base estimator.\n\n estimators_samples_ : list of ndarray\n The subset of drawn samples (i.e., the in-bag samples) for each base\n estimator.\n\n max_samples_ : int\n The actual number of samples.\n\n offset_ : float\n Offset used to define the decision function from the raw scores. We\n have the relation: ``decision_function = score_samples - offset_``.\n ``offset_`` is defined as follows. When the contamination parameter is\n set to \"auto\", the offset is equal to -0.5 as the scores of inliers are\n close to 0 and the scores of outliers are close to -1. When a\n contamination parameter different than \"auto\" is provided, the offset\n is defined in such a way we obtain the expected number of outliers\n (samples with decision function < 0) in training.\n\n .. versionadded:: 0.20\n\n n_features_ : int\n The number of features when ``fit`` is performed.\n\n .. deprecated:: 1.0\n Attribute `n_features_` was deprecated in version 1.0 and will be\n removed in 1.2. Use `n_features_in_` instead.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n Notes\n -----\n The implementation is based on an ensemble of ExtraTreeRegressor. The\n maximum depth of each tree is set to ``ceil(log_2(n))`` where\n :math:`n` is the number of samples used to build the tree\n (see (Liu et al., 2008) for more details).\n\n References\n ----------\n .. [1] Liu, Fei Tony, Ting, Kai Ming and Zhou, Zhi-Hua. \"Isolation forest.\"\n Data Mining, 2008. ICDM'08. Eighth IEEE International Conference on.\n .. [2] Liu, Fei Tony, Ting, Kai Ming and Zhou, Zhi-Hua. 
\"Isolation-based\n anomaly detection.\" ACM Transactions on Knowledge Discovery from\n Data (TKDD) 6.1 (2012): 3.\n\n See Also\n ----------\n sklearn.covariance.EllipticEnvelope : An object for detecting outliers in a\n Gaussian distributed dataset.\n sklearn.svm.OneClassSVM : Unsupervised Outlier Detection.\n Estimate the support of a high-dimensional distribution.\n The implementation is based on libsvm.\n sklearn.neighbors.LocalOutlierFactor : Unsupervised Outlier Detection\n using Local Outlier Factor (LOF).\n\n Examples\n --------\n >>> from sklearn.ensemble import IsolationForest\n >>> X = [[-1.1], [0.3], [0.5], [100]]\n >>> clf = IsolationForest(random_state=0).fit(X)\n >>> clf.predict([[0.1], [0], [90]])\n array([ 1, 1, -1])\n ", "source_code": "\n\nclass IsolationForest(OutlierMixin, BaseBagging):\n \"\"\"\n Isolation Forest Algorithm.\n\n Return the anomaly score of each sample using the IsolationForest algorithm\n\n The IsolationForest 'isolates' observations by randomly selecting a feature\n and then randomly selecting a split value between the maximum and minimum\n values of the selected feature.\n\n Since recursive partitioning can be represented by a tree structure, the\n number of splittings required to isolate a sample is equivalent to the path\n length from the root node to the terminating node.\n\n This path length, averaged over a forest of such random trees, is a\n measure of normality and our decision function.\n\n Random partitioning produces noticeably shorter paths for anomalies.\n Hence, when a forest of random trees collectively produce shorter path\n lengths for particular samples, they are highly likely to be anomalies.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.18\n\n Parameters\n ----------\n n_estimators : int, default=100\n The number of base estimators in the ensemble.\n\n max_samples : \"auto\", int or float, default=\"auto\"\n The number of samples to draw from X to train each base estimator.\n - If int, then draw `max_samples` samples.\n - If float, then draw `max_samples * X.shape[0]` samples.\n - If \"auto\", then `max_samples=min(256, n_samples)`.\n\n If max_samples is larger than the number of samples provided,\n all samples will be used for all trees (no sampling).\n\n contamination : 'auto' or float, default='auto'\n The amount of contamination of the data set, i.e. the proportion\n of outliers in the data set. Used when fitting to define the threshold\n on the scores of the samples.\n\n - If 'auto', the threshold is determined as in the\n original paper.\n - If float, the contamination should be in the range (0, 0.5].\n\n .. versionchanged:: 0.22\n The default value of ``contamination`` changed from 0.1\n to ``'auto'``.\n\n max_features : int or float, default=1.0\n The number of features to draw from X to train each base estimator.\n\n - If int, then draw `max_features` features.\n - If float, then draw `max_features * X.shape[1]` features.\n\n bootstrap : bool, default=False\n If True, individual trees are fit on random subsets of the training\n data sampled with replacement. If False, sampling without replacement\n is performed.\n\n n_jobs : int, default=None\n The number of jobs to run in parallel for both :meth:`fit` and\n :meth:`predict`. ``None`` means 1 unless in a\n :obj:`joblib.parallel_backend` context. ``-1`` means using all\n processors. 
See :term:`Glossary ` for more details.\n\n random_state : int, RandomState instance or None, default=None\n Controls the pseudo-randomness of the selection of the feature\n and split values for each branching step and each tree in the forest.\n\n Pass an int for reproducible results across multiple function calls.\n See :term:`Glossary `.\n\n verbose : int, default=0\n Controls the verbosity of the tree building process.\n\n warm_start : bool, default=False\n When set to ``True``, reuse the solution of the previous call to fit\n and add more estimators to the ensemble, otherwise, just fit a whole\n new forest. See :term:`the Glossary `.\n\n .. versionadded:: 0.21\n\n Attributes\n ----------\n base_estimator_ : ExtraTreeRegressor instance\n The child estimator template used to create the collection of\n fitted sub-estimators.\n\n estimators_ : list of ExtraTreeRegressor instances\n The collection of fitted sub-estimators.\n\n estimators_features_ : list of ndarray\n The subset of drawn features for each base estimator.\n\n estimators_samples_ : list of ndarray\n The subset of drawn samples (i.e., the in-bag samples) for each base\n estimator.\n\n max_samples_ : int\n The actual number of samples.\n\n offset_ : float\n Offset used to define the decision function from the raw scores. We\n have the relation: ``decision_function = score_samples - offset_``.\n ``offset_`` is defined as follows. When the contamination parameter is\n set to \"auto\", the offset is equal to -0.5 as the scores of inliers are\n close to 0 and the scores of outliers are close to -1. When a\n contamination parameter different than \"auto\" is provided, the offset\n is defined in such a way we obtain the expected number of outliers\n (samples with decision function < 0) in training.\n\n .. versionadded:: 0.20\n\n n_features_ : int\n The number of features when ``fit`` is performed.\n\n .. deprecated:: 1.0\n Attribute `n_features_` was deprecated in version 1.0 and will be\n removed in 1.2. Use `n_features_in_` instead.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n Notes\n -----\n The implementation is based on an ensemble of ExtraTreeRegressor. The\n maximum depth of each tree is set to ``ceil(log_2(n))`` where\n :math:`n` is the number of samples used to build the tree\n (see (Liu et al., 2008) for more details).\n\n References\n ----------\n .. [1] Liu, Fei Tony, Ting, Kai Ming and Zhou, Zhi-Hua. \"Isolation forest.\"\n Data Mining, 2008. ICDM'08. Eighth IEEE International Conference on.\n .. [2] Liu, Fei Tony, Ting, Kai Ming and Zhou, Zhi-Hua. 
\"Isolation-based\n anomaly detection.\" ACM Transactions on Knowledge Discovery from\n Data (TKDD) 6.1 (2012): 3.\n\n See Also\n ----------\n sklearn.covariance.EllipticEnvelope : An object for detecting outliers in a\n Gaussian distributed dataset.\n sklearn.svm.OneClassSVM : Unsupervised Outlier Detection.\n Estimate the support of a high-dimensional distribution.\n The implementation is based on libsvm.\n sklearn.neighbors.LocalOutlierFactor : Unsupervised Outlier Detection\n using Local Outlier Factor (LOF).\n\n Examples\n --------\n >>> from sklearn.ensemble import IsolationForest\n >>> X = [[-1.1], [0.3], [0.5], [100]]\n >>> clf = IsolationForest(random_state=0).fit(X)\n >>> clf.predict([[0.1], [0], [90]])\n array([ 1, 1, -1])\n \"\"\"\n \n def __init__(self, *, n_estimators=100, max_samples='auto', contamination='auto', max_features=1.0, bootstrap=False, n_jobs=None, random_state=None, verbose=0, warm_start=False):\n super().__init__(base_estimator=ExtraTreeRegressor(max_features=1, splitter='random', random_state=random_state), bootstrap=bootstrap, bootstrap_features=False, n_estimators=n_estimators, max_samples=max_samples, max_features=max_features, warm_start=warm_start, n_jobs=n_jobs, random_state=random_state, verbose=verbose)\n self.contamination = contamination\n \n def _set_oob_score(self, X, y):\n raise NotImplementedError('OOB score not supported by iforest')\n \n def _parallel_args(self):\n return _joblib_parallel_args(prefer='threads')\n \n def fit(self, X, y=None, sample_weight=None):\n \"\"\"\n Fit estimator.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input samples. Use ``dtype=np.float32`` for maximum\n efficiency. Sparse matrices are also supported, use sparse\n ``csc_matrix`` for maximum efficiency.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights. If None, then samples are equally weighted.\n\n Returns\n -------\n self : object\n Fitted estimator.\n \"\"\"\n X = self._validate_data(X, accept_sparse=['csc'])\n if issparse(X):\n X.sort_indices()\n rnd = check_random_state(self.random_state)\n y = rnd.uniform(size=X.shape[0])\n n_samples = X.shape[0]\n if self.contamination != 'auto':\n if not 0.0 < self.contamination <= 0.5:\n raise ValueError('contamination must be in (0, 0.5], got: %f' % self.contamination)\n if isinstance(self.max_samples, str):\n if self.max_samples == 'auto':\n max_samples = min(256, n_samples)\n else:\n raise ValueError('max_samples (%s) is not supported.Valid choices are: \"auto\", int orfloat' % self.max_samples)\n elif isinstance(self.max_samples, numbers.Integral):\n if self.max_samples > n_samples:\n warn('max_samples (%s) is greater than the total number of samples (%s). max_samples will be set to n_samples for estimation.' 
% (self.max_samples, n_samples))\n max_samples = n_samples\n else:\n max_samples = self.max_samples\n else:\n if not 0.0 < self.max_samples <= 1.0:\n raise ValueError('max_samples must be in (0, 1], got %r' % self.max_samples)\n max_samples = int(self.max_samples * X.shape[0])\n self.max_samples_ = max_samples\n max_depth = int(np.ceil(np.log2(max(max_samples, 2))))\n super()._fit(X, y, max_samples, max_depth=max_depth, sample_weight=sample_weight)\n if self.contamination == 'auto':\n self.offset_ = -0.5\n return self\n self.offset_ = np.percentile(self.score_samples(X), 100.0 * self.contamination)\n return self\n \n def predict(self, X):\n \"\"\"\n Predict if a particular sample is an outlier or not.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input samples. Internally, it will be converted to\n ``dtype=np.float32`` and if a sparse matrix is provided\n to a sparse ``csr_matrix``.\n\n Returns\n -------\n is_inlier : ndarray of shape (n_samples,)\n For each observation, tells whether or not (+1 or -1) it should\n be considered as an inlier according to the fitted model.\n \"\"\"\n check_is_fitted(self)\n decision_func = self.decision_function(X)\n is_inlier = np.ones_like(decision_func, dtype=int)\n is_inlier[decision_func < 0] = -1\n return is_inlier\n \n def decision_function(self, X):\n \"\"\"\n Average anomaly score of X of the base classifiers.\n\n The anomaly score of an input sample is computed as\n the mean anomaly score of the trees in the forest.\n\n The measure of normality of an observation given a tree is the depth\n of the leaf containing this observation, which is equivalent to\n the number of splittings required to isolate this point. In case of\n several observations n_left in the leaf, the average path length of\n a n_left samples isolation tree is added.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input samples. Internally, it will be converted to\n ``dtype=np.float32`` and if a sparse matrix is provided\n to a sparse ``csr_matrix``.\n\n Returns\n -------\n scores : ndarray of shape (n_samples,)\n The anomaly score of the input samples.\n The lower, the more abnormal. Negative scores represent outliers,\n positive scores represent inliers.\n \"\"\"\n return self.score_samples(X) - self.offset_\n \n def score_samples(self, X):\n \"\"\"\n Opposite of the anomaly score defined in the original paper.\n\n The anomaly score of an input sample is computed as\n the mean anomaly score of the trees in the forest.\n\n The measure of normality of an observation given a tree is the depth\n of the leaf containing this observation, which is equivalent to\n the number of splittings required to isolate this point. 
In case of\n several observations n_left in the leaf, the average path length of\n a n_left samples isolation tree is added.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input samples.\n\n Returns\n -------\n scores : ndarray of shape (n_samples,)\n The anomaly score of the input samples.\n The lower, the more abnormal.\n \"\"\"\n check_is_fitted(self)\n X = self._validate_data(X, accept_sparse='csr', reset=False)\n return -self._compute_chunked_score_samples(X)\n \n def _compute_chunked_score_samples(self, X):\n n_samples = _num_samples(X)\n if self._max_features == X.shape[1]:\n subsample_features = False\n else:\n subsample_features = True\n chunk_n_rows = get_chunk_n_rows(row_bytes=16 * self._max_features, max_n_rows=n_samples)\n slices = gen_batches(n_samples, chunk_n_rows)\n scores = np.zeros(n_samples, order='f')\n for sl in slices:\n scores[sl] = self._compute_score_samples(X[sl], subsample_features)\n return scores\n \n def _compute_score_samples(self, X, subsample_features):\n \"\"\"\n Compute the score of each samples in X going through the extra trees.\n\n Parameters\n ----------\n X : array-like or sparse matrix\n Data matrix.\n\n subsample_features : bool\n Whether features should be subsampled.\n \"\"\"\n n_samples = X.shape[0]\n depths = np.zeros(n_samples, order='f')\n for (tree, features) in zip(self.estimators_, self.estimators_features_):\n X_subset = X[:, features] if subsample_features else X\n leaves_index = tree.apply(X_subset)\n node_indicator = tree.decision_path(X_subset)\n n_samples_leaf = tree.tree_.n_node_samples[leaves_index]\n depths += np.ravel(node_indicator.sum(axis=1)) + _average_path_length(n_samples_leaf) - 1.0\n denominator = len(self.estimators_) * _average_path_length([self.max_samples_])\n scores = 2**(-np.divide(depths, denominator, out=np.ones_like(depths), where=denominator != 0))\n return scores\n \n def _more_tags(self):\n return {'_xfail_checks': {'check_sample_weights_invariance': 'zero sample_weight is not equivalent to removing samples'}}\n" }, @@ -21552,7 +21618,7 @@ "sklearn.ensemble._stacking.StackingClassifier._sk_visual_block_" ], "is_public": true, - "description": "Stack of estimators with a final classifier.\n\nStacked generalization consists in stacking the output of individual estimator and use a classifier to compute the final prediction. Stacking allows to use the strength of each individual estimator by using their output as input of a final estimator. Note that `estimators_` are fitted on the full `X` while `final_estimator_` is trained using cross-validated predictions of the base estimators using `cross_val_predict`. Read more in the :ref:`User Guide `. .. versionadded:: 0.22", + "description": "Stack of estimators with a final classifier.\n\nStacked generalization consists in stacking the output of individual\nestimator and use a classifier to compute the final prediction. Stacking\nallows to use the strength of each individual estimator by using their\noutput as input of a final estimator.\n\nNote that `estimators_` are fitted on the full `X` while `final_estimator_`\nis trained using cross-validated predictions of the base estimators using\n`cross_val_predict`.\n\nRead more in the :ref:`User Guide `.\n\n.. versionadded:: 0.22", "docstring": "Stack of estimators with a final classifier.\n\n Stacked generalization consists in stacking the output of individual\n estimator and use a classifier to compute the final prediction. 
Stacking\n allows to use the strength of each individual estimator by using their\n output as input of a final estimator.\n\n Note that `estimators_` are fitted on the full `X` while `final_estimator_`\n is trained using cross-validated predictions of the base estimators using\n `cross_val_predict`.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.22\n\n Parameters\n ----------\n estimators : list of (str, estimator)\n Base estimators which will be stacked together. Each element of the\n list is defined as a tuple of string (i.e. name) and an estimator\n instance. An estimator can be set to 'drop' using `set_params`.\n\n final_estimator : estimator, default=None\n A classifier which will be used to combine the base estimators.\n The default classifier is a\n :class:`~sklearn.linear_model.LogisticRegression`.\n\n cv : int, cross-validation generator or an iterable, default=None\n Determines the cross-validation splitting strategy used in\n `cross_val_predict` to train `final_estimator`. Possible inputs for\n cv are:\n\n * None, to use the default 5-fold cross validation,\n * integer, to specify the number of folds in a (Stratified) KFold,\n * An object to be used as a cross-validation generator,\n * An iterable yielding train, test splits.\n\n For integer/None inputs, if the estimator is a classifier and y is\n either binary or multiclass,\n :class:`~sklearn.model_selection.StratifiedKFold` is used.\n In all other cases, :class:`~sklearn.model_selection.KFold` is used.\n These splitters are instantiated with `shuffle=False` so the splits\n will be the same across calls.\n\n Refer :ref:`User Guide ` for the various\n cross-validation strategies that can be used here.\n\n .. note::\n A larger number of split will provide no benefits if the number\n of training samples is large enough. Indeed, the training time\n will increase. ``cv`` is not used for model evaluation but for\n prediction.\n\n stack_method : {'auto', 'predict_proba', 'decision_function', 'predict'}, default='auto'\n Methods called for each base estimator. It can be:\n\n * if 'auto', it will try to invoke, for each estimator,\n `'predict_proba'`, `'decision_function'` or `'predict'` in that\n order.\n * otherwise, one of `'predict_proba'`, `'decision_function'` or\n `'predict'`. If the method is not implemented by the estimator, it\n will raise an error.\n\n n_jobs : int, default=None\n The number of jobs to run in parallel all `estimators` `fit`.\n `None` means 1 unless in a `joblib.parallel_backend` context. -1 means\n using all processors. See Glossary for more details.\n\n passthrough : bool, default=False\n When False, only the predictions of estimators will be used as\n training data for `final_estimator`. When True, the\n `final_estimator` is trained on the predictions as well as the\n original training data.\n\n verbose : int, default=0\n Verbosity level.\n\n Attributes\n ----------\n classes_ : ndarray of shape (n_classes,)\n Class labels.\n\n estimators_ : list of estimators\n The elements of the estimators parameter, having been fitted on the\n training data. If an estimator has been set to `'drop'`, it\n will not appear in `estimators_`.\n\n named_estimators_ : :class:`~sklearn.utils.Bunch`\n Attribute to access any fitted sub-estimators by name.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`. Only defined if the\n underlying classifier exposes such an attribute when fit.\n\n .. 
versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Only defined if the\n underlying estimators expose such an attribute when fit.\n .. versionadded:: 1.0\n\n final_estimator_ : estimator\n The classifier which predicts given the output of `estimators_`.\n\n stack_method_ : list of str\n The method used by each base estimator.\n\n See Also\n --------\n StackingRegressor : Stack of estimators with a final regressor.\n\n Notes\n -----\n When `predict_proba` is used by each estimator (i.e. most of the time for\n `stack_method='auto'` or specifically for `stack_method='predict_proba'`),\n The first column predicted by each estimator will be dropped in the case\n of a binary classification problem. Indeed, both feature will be perfectly\n collinear.\n\n References\n ----------\n .. [1] Wolpert, David H. \"Stacked generalization.\" Neural networks 5.2\n (1992): 241-259.\n\n Examples\n --------\n >>> from sklearn.datasets import load_iris\n >>> from sklearn.ensemble import RandomForestClassifier\n >>> from sklearn.svm import LinearSVC\n >>> from sklearn.linear_model import LogisticRegression\n >>> from sklearn.preprocessing import StandardScaler\n >>> from sklearn.pipeline import make_pipeline\n >>> from sklearn.ensemble import StackingClassifier\n >>> X, y = load_iris(return_X_y=True)\n >>> estimators = [\n ... ('rf', RandomForestClassifier(n_estimators=10, random_state=42)),\n ... ('svr', make_pipeline(StandardScaler(),\n ... LinearSVC(random_state=42)))\n ... ]\n >>> clf = StackingClassifier(\n ... estimators=estimators, final_estimator=LogisticRegression()\n ... )\n >>> from sklearn.model_selection import train_test_split\n >>> X_train, X_test, y_train, y_test = train_test_split(\n ... X, y, stratify=y, random_state=42\n ... )\n >>> clf.fit(X_train, y_train).score(X_test, y_test)\n 0.9...\n ", "source_code": "\n\nclass StackingClassifier(ClassifierMixin, _BaseStacking):\n \"\"\"Stack of estimators with a final classifier.\n\n Stacked generalization consists in stacking the output of individual\n estimator and use a classifier to compute the final prediction. Stacking\n allows to use the strength of each individual estimator by using their\n output as input of a final estimator.\n\n Note that `estimators_` are fitted on the full `X` while `final_estimator_`\n is trained using cross-validated predictions of the base estimators using\n `cross_val_predict`.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.22\n\n Parameters\n ----------\n estimators : list of (str, estimator)\n Base estimators which will be stacked together. Each element of the\n list is defined as a tuple of string (i.e. name) and an estimator\n instance. An estimator can be set to 'drop' using `set_params`.\n\n final_estimator : estimator, default=None\n A classifier which will be used to combine the base estimators.\n The default classifier is a\n :class:`~sklearn.linear_model.LogisticRegression`.\n\n cv : int, cross-validation generator or an iterable, default=None\n Determines the cross-validation splitting strategy used in\n `cross_val_predict` to train `final_estimator`. 
Possible inputs for\n cv are:\n\n * None, to use the default 5-fold cross validation,\n * integer, to specify the number of folds in a (Stratified) KFold,\n * An object to be used as a cross-validation generator,\n * An iterable yielding train, test splits.\n\n For integer/None inputs, if the estimator is a classifier and y is\n either binary or multiclass,\n :class:`~sklearn.model_selection.StratifiedKFold` is used.\n In all other cases, :class:`~sklearn.model_selection.KFold` is used.\n These splitters are instantiated with `shuffle=False` so the splits\n will be the same across calls.\n\n Refer :ref:`User Guide ` for the various\n cross-validation strategies that can be used here.\n\n .. note::\n A larger number of split will provide no benefits if the number\n of training samples is large enough. Indeed, the training time\n will increase. ``cv`` is not used for model evaluation but for\n prediction.\n\n stack_method : {'auto', 'predict_proba', 'decision_function', 'predict'}, default='auto'\n Methods called for each base estimator. It can be:\n\n * if 'auto', it will try to invoke, for each estimator,\n `'predict_proba'`, `'decision_function'` or `'predict'` in that\n order.\n * otherwise, one of `'predict_proba'`, `'decision_function'` or\n `'predict'`. If the method is not implemented by the estimator, it\n will raise an error.\n\n n_jobs : int, default=None\n The number of jobs to run in parallel all `estimators` `fit`.\n `None` means 1 unless in a `joblib.parallel_backend` context. -1 means\n using all processors. See Glossary for more details.\n\n passthrough : bool, default=False\n When False, only the predictions of estimators will be used as\n training data for `final_estimator`. When True, the\n `final_estimator` is trained on the predictions as well as the\n original training data.\n\n verbose : int, default=0\n Verbosity level.\n\n Attributes\n ----------\n classes_ : ndarray of shape (n_classes,)\n Class labels.\n\n estimators_ : list of estimators\n The elements of the estimators parameter, having been fitted on the\n training data. If an estimator has been set to `'drop'`, it\n will not appear in `estimators_`.\n\n named_estimators_ : :class:`~sklearn.utils.Bunch`\n Attribute to access any fitted sub-estimators by name.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`. Only defined if the\n underlying classifier exposes such an attribute when fit.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Only defined if the\n underlying estimators expose such an attribute when fit.\n .. versionadded:: 1.0\n\n final_estimator_ : estimator\n The classifier which predicts given the output of `estimators_`.\n\n stack_method_ : list of str\n The method used by each base estimator.\n\n See Also\n --------\n StackingRegressor : Stack of estimators with a final regressor.\n\n Notes\n -----\n When `predict_proba` is used by each estimator (i.e. most of the time for\n `stack_method='auto'` or specifically for `stack_method='predict_proba'`),\n The first column predicted by each estimator will be dropped in the case\n of a binary classification problem. Indeed, both feature will be perfectly\n collinear.\n\n References\n ----------\n .. [1] Wolpert, David H. 
\"Stacked generalization.\" Neural networks 5.2\n (1992): 241-259.\n\n Examples\n --------\n >>> from sklearn.datasets import load_iris\n >>> from sklearn.ensemble import RandomForestClassifier\n >>> from sklearn.svm import LinearSVC\n >>> from sklearn.linear_model import LogisticRegression\n >>> from sklearn.preprocessing import StandardScaler\n >>> from sklearn.pipeline import make_pipeline\n >>> from sklearn.ensemble import StackingClassifier\n >>> X, y = load_iris(return_X_y=True)\n >>> estimators = [\n ... ('rf', RandomForestClassifier(n_estimators=10, random_state=42)),\n ... ('svr', make_pipeline(StandardScaler(),\n ... LinearSVC(random_state=42)))\n ... ]\n >>> clf = StackingClassifier(\n ... estimators=estimators, final_estimator=LogisticRegression()\n ... )\n >>> from sklearn.model_selection import train_test_split\n >>> X_train, X_test, y_train, y_test = train_test_split(\n ... X, y, stratify=y, random_state=42\n ... )\n >>> clf.fit(X_train, y_train).score(X_test, y_test)\n 0.9...\n \"\"\"\n \n def __init__(self, estimators, final_estimator=None, *, cv=None, stack_method='auto', n_jobs=None, passthrough=False, verbose=0):\n super().__init__(estimators=estimators, final_estimator=final_estimator, cv=cv, stack_method=stack_method, n_jobs=n_jobs, passthrough=passthrough, verbose=verbose)\n \n def _validate_final_estimator(self):\n self._clone_final_estimator(default=LogisticRegression())\n if not is_classifier(self.final_estimator_):\n raise ValueError(\"'final_estimator' parameter should be a classifier. Got {}\".format(self.final_estimator_))\n \n def fit(self, X, y, sample_weight=None):\n \"\"\"Fit the estimators.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training vectors, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\n y : array-like of shape (n_samples,)\n Target values.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights. If None, then samples are equally weighted.\n Note that this is supported only if all underlying estimators\n support sample weights.\n\n Returns\n -------\n self : object\n Returns a fitted instance of estimator.\n \"\"\"\n check_classification_targets(y)\n self._le = LabelEncoder().fit(y)\n self.classes_ = self._le.classes_\n return super().fit(X, self._le.transform(y), sample_weight)\n \n @if_delegate_has_method(delegate='final_estimator_')\n def predict(self, X, **predict_params):\n \"\"\"Predict target for X.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training vectors, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\n **predict_params : dict of str -> obj\n Parameters to the `predict` called by the `final_estimator`. Note\n that this may be used to return uncertainties from some estimators\n with `return_std` or `return_cov`. 
Be aware that it will only\n accounts for uncertainty in the final estimator.\n\n Returns\n -------\n y_pred : ndarray of shape (n_samples,) or (n_samples, n_output)\n Predicted targets.\n \"\"\"\n y_pred = super().predict(X, **predict_params)\n return self._le.inverse_transform(y_pred)\n \n @if_delegate_has_method(delegate='final_estimator_')\n def predict_proba(self, X):\n \"\"\"Predict class probabilities for `X` using the final estimator.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training vectors, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\n Returns\n -------\n probabilities : ndarray of shape (n_samples, n_classes) or list of ndarray of shape (n_output,)\n The class probabilities of the input samples.\n \"\"\"\n check_is_fitted(self)\n return self.final_estimator_.predict_proba(self.transform(X))\n \n @if_delegate_has_method(delegate='final_estimator_')\n def decision_function(self, X):\n \"\"\"Decision function for samples in `X` using the final estimator.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training vectors, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\n Returns\n -------\n decisions : ndarray of shape (n_samples,), (n_samples, n_classes), or (n_samples, n_classes * (n_classes-1) / 2)\n The decision function computed the final estimator.\n \"\"\"\n check_is_fitted(self)\n return self.final_estimator_.decision_function(self.transform(X))\n \n def transform(self, X):\n \"\"\"Return class labels or probabilities for X for each estimator.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training vectors, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\n Returns\n -------\n y_preds : ndarray of shape (n_samples, n_estimators) or (n_samples, n_classes * n_estimators)\n Prediction outputs for each estimator.\n \"\"\"\n return self._transform(X)\n \n def _sk_visual_block_(self):\n if self.final_estimator is None:\n final_estimator = LogisticRegression()\n else:\n final_estimator = self.final_estimator\n return super()._sk_visual_block_(final_estimator)\n" }, @@ -21569,9 +21635,9 @@ "sklearn.ensemble._stacking.StackingRegressor._sk_visual_block_" ], "is_public": true, - "description": "Stack of estimators with a final regressor.\n\nStacked generalization consists in stacking the output of individual estimator and use a regressor to compute the final prediction. Stacking allows to use the strength of each individual estimator by using their output as input of a final estimator. Note that `estimators_` are fitted on the full `X` while `final_estimator_` is trained using cross-validated predictions of the base estimators using `cross_val_predict`. Read more in the :ref:`User Guide `. .. versionadded:: 0.22", - "docstring": "Stack of estimators with a final regressor.\n\n Stacked generalization consists in stacking the output of individual\n estimator and use a regressor to compute the final prediction. Stacking\n allows to use the strength of each individual estimator by using their\n output as input of a final estimator.\n\n Note that `estimators_` are fitted on the full `X` while `final_estimator_`\n is trained using cross-validated predictions of the base estimators using\n `cross_val_predict`.\n\n Read more in the :ref:`User Guide `.\n\n .. 
versionadded:: 0.22\n\n Parameters\n ----------\n estimators : list of (str, estimator)\n Base estimators which will be stacked together. Each element of the\n list is defined as a tuple of string (i.e. name) and an estimator\n instance. An estimator can be set to 'drop' using `set_params`.\n\n final_estimator : estimator, default=None\n A regressor which will be used to combine the base estimators.\n The default regressor is a :class:`~sklearn.linear_model.RidgeCV`.\n\n cv : int, cross-validation generator or an iterable, default=None\n Determines the cross-validation splitting strategy used in\n `cross_val_predict` to train `final_estimator`. Possible inputs for\n cv are:\n\n * None, to use the default 5-fold cross validation,\n * integer, to specify the number of folds in a (Stratified) KFold,\n * An object to be used as a cross-validation generator,\n * An iterable yielding train, test splits.\n\n For integer/None inputs, if the estimator is a classifier and y is\n either binary or multiclass,\n :class:`~sklearn.model_selection.StratifiedKFold` is used.\n In all other cases, :class:`~sklearn.model_selection.KFold` is used.\n These splitters are instantiated with `shuffle=False` so the splits\n will be the same across calls.\n\n Refer :ref:`User Guide ` for the various\n cross-validation strategies that can be used here.\n\n .. note::\n A larger number of split will provide no benefits if the number\n of training samples is large enough. Indeed, the training time\n will increase. ``cv`` is not used for model evaluation but for\n prediction.\n\n n_jobs : int, default=None\n The number of jobs to run in parallel for `fit` of all `estimators`.\n `None` means 1 unless in a `joblib.parallel_backend` context. -1 means\n using all processors. See Glossary for more details.\n\n passthrough : bool, default=False\n When False, only the predictions of estimators will be used as\n training data for `final_estimator`. When True, the\n `final_estimator` is trained on the predictions as well as the\n original training data.\n\n verbose : int, default=0\n Verbosity level.\n\n Attributes\n ----------\n estimators_ : list of estimator\n The elements of the estimators parameter, having been fitted on the\n training data. If an estimator has been set to `'drop'`, it\n will not appear in `estimators_`.\n\n named_estimators_ : :class:`~sklearn.utils.Bunch`\n Attribute to access any fitted sub-estimators by name.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`. Only defined if the\n underlying regressor exposes such an attribute when fit.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Only defined if the\n underlying estimators expose such an attribute when fit.\n .. versionadded:: 1.0\n\n final_estimator_ : estimator\n The regressor to stacked the base estimators fitted.\n\n stack_method_ : list of str\n The method used by each base estimator.\n\n References\n ----------\n .. [1] Wolpert, David H. \"Stacked generalization.\" Neural networks 5.2\n (1992): 241-259.\n\n Examples\n --------\n >>> from sklearn.datasets import load_diabetes\n >>> from sklearn.linear_model import RidgeCV\n >>> from sklearn.svm import LinearSVR\n >>> from sklearn.ensemble import RandomForestRegressor\n >>> from sklearn.ensemble import StackingRegressor\n >>> X, y = load_diabetes(return_X_y=True)\n >>> estimators = [\n ... ('lr', RidgeCV()),\n ... ('svr', LinearSVR(random_state=42))\n ... ]\n >>> reg = StackingRegressor(\n ... 
estimators=estimators,\n ... final_estimator=RandomForestRegressor(n_estimators=10,\n ... random_state=42)\n ... )\n >>> from sklearn.model_selection import train_test_split\n >>> X_train, X_test, y_train, y_test = train_test_split(\n ... X, y, random_state=42\n ... )\n >>> reg.fit(X_train, y_train).score(X_test, y_test)\n 0.3...\n\n ", - "source_code": "\n\nclass StackingRegressor(RegressorMixin, _BaseStacking):\n \"\"\"Stack of estimators with a final regressor.\n\n Stacked generalization consists in stacking the output of individual\n estimator and use a regressor to compute the final prediction. Stacking\n allows to use the strength of each individual estimator by using their\n output as input of a final estimator.\n\n Note that `estimators_` are fitted on the full `X` while `final_estimator_`\n is trained using cross-validated predictions of the base estimators using\n `cross_val_predict`.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.22\n\n Parameters\n ----------\n estimators : list of (str, estimator)\n Base estimators which will be stacked together. Each element of the\n list is defined as a tuple of string (i.e. name) and an estimator\n instance. An estimator can be set to 'drop' using `set_params`.\n\n final_estimator : estimator, default=None\n A regressor which will be used to combine the base estimators.\n The default regressor is a :class:`~sklearn.linear_model.RidgeCV`.\n\n cv : int, cross-validation generator or an iterable, default=None\n Determines the cross-validation splitting strategy used in\n `cross_val_predict` to train `final_estimator`. Possible inputs for\n cv are:\n\n * None, to use the default 5-fold cross validation,\n * integer, to specify the number of folds in a (Stratified) KFold,\n * An object to be used as a cross-validation generator,\n * An iterable yielding train, test splits.\n\n For integer/None inputs, if the estimator is a classifier and y is\n either binary or multiclass,\n :class:`~sklearn.model_selection.StratifiedKFold` is used.\n In all other cases, :class:`~sklearn.model_selection.KFold` is used.\n These splitters are instantiated with `shuffle=False` so the splits\n will be the same across calls.\n\n Refer :ref:`User Guide ` for the various\n cross-validation strategies that can be used here.\n\n .. note::\n A larger number of split will provide no benefits if the number\n of training samples is large enough. Indeed, the training time\n will increase. ``cv`` is not used for model evaluation but for\n prediction.\n\n n_jobs : int, default=None\n The number of jobs to run in parallel for `fit` of all `estimators`.\n `None` means 1 unless in a `joblib.parallel_backend` context. -1 means\n using all processors. See Glossary for more details.\n\n passthrough : bool, default=False\n When False, only the predictions of estimators will be used as\n training data for `final_estimator`. When True, the\n `final_estimator` is trained on the predictions as well as the\n original training data.\n\n verbose : int, default=0\n Verbosity level.\n\n Attributes\n ----------\n estimators_ : list of estimator\n The elements of the estimators parameter, having been fitted on the\n training data. If an estimator has been set to `'drop'`, it\n will not appear in `estimators_`.\n\n named_estimators_ : :class:`~sklearn.utils.Bunch`\n Attribute to access any fitted sub-estimators by name.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`. Only defined if the\n underlying regressor exposes such an attribute when fit.\n\n .. 
versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Only defined if the\n underlying estimators expose such an attribute when fit.\n .. versionadded:: 1.0\n\n final_estimator_ : estimator\n The regressor to stacked the base estimators fitted.\n\n stack_method_ : list of str\n The method used by each base estimator.\n\n References\n ----------\n .. [1] Wolpert, David H. \"Stacked generalization.\" Neural networks 5.2\n (1992): 241-259.\n\n Examples\n --------\n >>> from sklearn.datasets import load_diabetes\n >>> from sklearn.linear_model import RidgeCV\n >>> from sklearn.svm import LinearSVR\n >>> from sklearn.ensemble import RandomForestRegressor\n >>> from sklearn.ensemble import StackingRegressor\n >>> X, y = load_diabetes(return_X_y=True)\n >>> estimators = [\n ... ('lr', RidgeCV()),\n ... ('svr', LinearSVR(random_state=42))\n ... ]\n >>> reg = StackingRegressor(\n ... estimators=estimators,\n ... final_estimator=RandomForestRegressor(n_estimators=10,\n ... random_state=42)\n ... )\n >>> from sklearn.model_selection import train_test_split\n >>> X_train, X_test, y_train, y_test = train_test_split(\n ... X, y, random_state=42\n ... )\n >>> reg.fit(X_train, y_train).score(X_test, y_test)\n 0.3...\n\n \"\"\"\n \n def __init__(self, estimators, final_estimator=None, *, cv=None, n_jobs=None, passthrough=False, verbose=0):\n super().__init__(estimators=estimators, final_estimator=final_estimator, cv=cv, stack_method='predict', n_jobs=n_jobs, passthrough=passthrough, verbose=verbose)\n \n def _validate_final_estimator(self):\n self._clone_final_estimator(default=RidgeCV())\n if not is_regressor(self.final_estimator_):\n raise ValueError(\"'final_estimator' parameter should be a regressor. Got {}\".format(self.final_estimator_))\n \n def fit(self, X, y, sample_weight=None):\n \"\"\"Fit the estimators.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training vectors, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\n y : array-like of shape (n_samples,)\n Target values.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights. If None, then samples are equally weighted.\n Note that this is supported only if all underlying estimators\n support sample weights.\n\n Returns\n -------\n self : object\n \"\"\"\n y = column_or_1d(y, warn=True)\n return super().fit(X, y, sample_weight)\n \n def transform(self, X):\n \"\"\"Return the predictions for X for each estimator.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training vectors, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\n Returns\n -------\n y_preds : ndarray of shape (n_samples, n_estimators)\n Prediction outputs for each estimator.\n \"\"\"\n return self._transform(X)\n \n def _sk_visual_block_(self):\n if self.final_estimator is None:\n final_estimator = RidgeCV()\n else:\n final_estimator = self.final_estimator\n return super()._sk_visual_block_(final_estimator)\n" + "description": "Stack of estimators with a final regressor.\n\nStacked generalization consists in stacking the output of individual\nestimator and use a regressor to compute the final prediction. 
Stacking\nallows to use the strength of each individual estimator by using their\noutput as input of a final estimator.\n\nNote that `estimators_` are fitted on the full `X` while `final_estimator_`\nis trained using cross-validated predictions of the base estimators using\n`cross_val_predict`.\n\nRead more in the :ref:`User Guide `.\n\n.. versionadded:: 0.22", + "docstring": "Stack of estimators with a final regressor.\n\n Stacked generalization consists in stacking the output of individual\n estimator and use a regressor to compute the final prediction. Stacking\n allows to use the strength of each individual estimator by using their\n output as input of a final estimator.\n\n Note that `estimators_` are fitted on the full `X` while `final_estimator_`\n is trained using cross-validated predictions of the base estimators using\n `cross_val_predict`.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.22\n\n Parameters\n ----------\n estimators : list of (str, estimator)\n Base estimators which will be stacked together. Each element of the\n list is defined as a tuple of string (i.e. name) and an estimator\n instance. An estimator can be set to 'drop' using `set_params`.\n\n final_estimator : estimator, default=None\n A regressor which will be used to combine the base estimators.\n The default regressor is a :class:`~sklearn.linear_model.RidgeCV`.\n\n cv : int, cross-validation generator or an iterable, default=None\n Determines the cross-validation splitting strategy used in\n `cross_val_predict` to train `final_estimator`. Possible inputs for\n cv are:\n\n * None, to use the default 5-fold cross validation,\n * integer, to specify the number of folds in a (Stratified) KFold,\n * An object to be used as a cross-validation generator,\n * An iterable yielding train, test splits.\n\n For integer/None inputs, if the estimator is a classifier and y is\n either binary or multiclass,\n :class:`~sklearn.model_selection.StratifiedKFold` is used.\n In all other cases, :class:`~sklearn.model_selection.KFold` is used.\n These splitters are instantiated with `shuffle=False` so the splits\n will be the same across calls.\n\n Refer :ref:`User Guide ` for the various\n cross-validation strategies that can be used here.\n\n .. note::\n A larger number of split will provide no benefits if the number\n of training samples is large enough. Indeed, the training time\n will increase. ``cv`` is not used for model evaluation but for\n prediction.\n\n n_jobs : int, default=None\n The number of jobs to run in parallel for `fit` of all `estimators`.\n `None` means 1 unless in a `joblib.parallel_backend` context. -1 means\n using all processors. See Glossary for more details.\n\n passthrough : bool, default=False\n When False, only the predictions of estimators will be used as\n training data for `final_estimator`. When True, the\n `final_estimator` is trained on the predictions as well as the\n original training data.\n\n verbose : int, default=0\n Verbosity level.\n\n Attributes\n ----------\n estimators_ : list of estimator\n The elements of the estimators parameter, having been fitted on the\n training data. If an estimator has been set to `'drop'`, it\n will not appear in `estimators_`.\n\n named_estimators_ : :class:`~sklearn.utils.Bunch`\n Attribute to access any fitted sub-estimators by name.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`. Only defined if the\n underlying regressor exposes such an attribute when fit.\n\n .. 
versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Only defined if the\n underlying estimators expose such an attribute when fit.\n .. versionadded:: 1.0\n\n final_estimator_ : estimator\n The regressor to stacked the base estimators fitted.\n\n stack_method_ : list of str\n The method used by each base estimator.\n\n See Also\n --------\n StackingClassifier : Stack of estimators with a final classifier.\n\n References\n ----------\n .. [1] Wolpert, David H. \"Stacked generalization.\" Neural networks 5.2\n (1992): 241-259.\n\n Examples\n --------\n >>> from sklearn.datasets import load_diabetes\n >>> from sklearn.linear_model import RidgeCV\n >>> from sklearn.svm import LinearSVR\n >>> from sklearn.ensemble import RandomForestRegressor\n >>> from sklearn.ensemble import StackingRegressor\n >>> X, y = load_diabetes(return_X_y=True)\n >>> estimators = [\n ... ('lr', RidgeCV()),\n ... ('svr', LinearSVR(random_state=42))\n ... ]\n >>> reg = StackingRegressor(\n ... estimators=estimators,\n ... final_estimator=RandomForestRegressor(n_estimators=10,\n ... random_state=42)\n ... )\n >>> from sklearn.model_selection import train_test_split\n >>> X_train, X_test, y_train, y_test = train_test_split(\n ... X, y, random_state=42\n ... )\n >>> reg.fit(X_train, y_train).score(X_test, y_test)\n 0.3...\n ", + "source_code": "\n\nclass StackingRegressor(RegressorMixin, _BaseStacking):\n \"\"\"Stack of estimators with a final regressor.\n\n Stacked generalization consists in stacking the output of individual\n estimator and use a regressor to compute the final prediction. Stacking\n allows to use the strength of each individual estimator by using their\n output as input of a final estimator.\n\n Note that `estimators_` are fitted on the full `X` while `final_estimator_`\n is trained using cross-validated predictions of the base estimators using\n `cross_val_predict`.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.22\n\n Parameters\n ----------\n estimators : list of (str, estimator)\n Base estimators which will be stacked together. Each element of the\n list is defined as a tuple of string (i.e. name) and an estimator\n instance. An estimator can be set to 'drop' using `set_params`.\n\n final_estimator : estimator, default=None\n A regressor which will be used to combine the base estimators.\n The default regressor is a :class:`~sklearn.linear_model.RidgeCV`.\n\n cv : int, cross-validation generator or an iterable, default=None\n Determines the cross-validation splitting strategy used in\n `cross_val_predict` to train `final_estimator`. Possible inputs for\n cv are:\n\n * None, to use the default 5-fold cross validation,\n * integer, to specify the number of folds in a (Stratified) KFold,\n * An object to be used as a cross-validation generator,\n * An iterable yielding train, test splits.\n\n For integer/None inputs, if the estimator is a classifier and y is\n either binary or multiclass,\n :class:`~sklearn.model_selection.StratifiedKFold` is used.\n In all other cases, :class:`~sklearn.model_selection.KFold` is used.\n These splitters are instantiated with `shuffle=False` so the splits\n will be the same across calls.\n\n Refer :ref:`User Guide ` for the various\n cross-validation strategies that can be used here.\n\n .. note::\n A larger number of split will provide no benefits if the number\n of training samples is large enough. Indeed, the training time\n will increase. 
``cv`` is not used for model evaluation but for\n prediction.\n\n n_jobs : int, default=None\n The number of jobs to run in parallel for `fit` of all `estimators`.\n `None` means 1 unless in a `joblib.parallel_backend` context. -1 means\n using all processors. See Glossary for more details.\n\n passthrough : bool, default=False\n When False, only the predictions of estimators will be used as\n training data for `final_estimator`. When True, the\n `final_estimator` is trained on the predictions as well as the\n original training data.\n\n verbose : int, default=0\n Verbosity level.\n\n Attributes\n ----------\n estimators_ : list of estimator\n The elements of the estimators parameter, having been fitted on the\n training data. If an estimator has been set to `'drop'`, it\n will not appear in `estimators_`.\n\n named_estimators_ : :class:`~sklearn.utils.Bunch`\n Attribute to access any fitted sub-estimators by name.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`. Only defined if the\n underlying regressor exposes such an attribute when fit.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Only defined if the\n underlying estimators expose such an attribute when fit.\n .. versionadded:: 1.0\n\n final_estimator_ : estimator\n The regressor to stacked the base estimators fitted.\n\n stack_method_ : list of str\n The method used by each base estimator.\n\n See Also\n --------\n StackingClassifier : Stack of estimators with a final classifier.\n\n References\n ----------\n .. [1] Wolpert, David H. \"Stacked generalization.\" Neural networks 5.2\n (1992): 241-259.\n\n Examples\n --------\n >>> from sklearn.datasets import load_diabetes\n >>> from sklearn.linear_model import RidgeCV\n >>> from sklearn.svm import LinearSVR\n >>> from sklearn.ensemble import RandomForestRegressor\n >>> from sklearn.ensemble import StackingRegressor\n >>> X, y = load_diabetes(return_X_y=True)\n >>> estimators = [\n ... ('lr', RidgeCV()),\n ... ('svr', LinearSVR(random_state=42))\n ... ]\n >>> reg = StackingRegressor(\n ... estimators=estimators,\n ... final_estimator=RandomForestRegressor(n_estimators=10,\n ... random_state=42)\n ... )\n >>> from sklearn.model_selection import train_test_split\n >>> X_train, X_test, y_train, y_test = train_test_split(\n ... X, y, random_state=42\n ... )\n >>> reg.fit(X_train, y_train).score(X_test, y_test)\n 0.3...\n \"\"\"\n \n def __init__(self, estimators, final_estimator=None, *, cv=None, n_jobs=None, passthrough=False, verbose=0):\n super().__init__(estimators=estimators, final_estimator=final_estimator, cv=cv, stack_method='predict', n_jobs=n_jobs, passthrough=passthrough, verbose=verbose)\n \n def _validate_final_estimator(self):\n self._clone_final_estimator(default=RidgeCV())\n if not is_regressor(self.final_estimator_):\n raise ValueError(\"'final_estimator' parameter should be a regressor. Got {}\".format(self.final_estimator_))\n \n def fit(self, X, y, sample_weight=None):\n \"\"\"Fit the estimators.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training vectors, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\n y : array-like of shape (n_samples,)\n Target values.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights. 
If None, then samples are equally weighted.\n Note that this is supported only if all underlying estimators\n support sample weights.\n\n Returns\n -------\n self : object\n Returns a fitted instance.\n \"\"\"\n y = column_or_1d(y, warn=True)\n return super().fit(X, y, sample_weight)\n \n def transform(self, X):\n \"\"\"Return the predictions for X for each estimator.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training vectors, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\n Returns\n -------\n y_preds : ndarray of shape (n_samples, n_estimators)\n Prediction outputs for each estimator.\n \"\"\"\n return self._transform(X)\n \n def _sk_visual_block_(self):\n if self.final_estimator is None:\n final_estimator = RidgeCV()\n else:\n final_estimator = self.final_estimator\n return super()._sk_visual_block_(final_estimator)\n" }, { "name": "_BaseStacking", @@ -21609,7 +21675,7 @@ "sklearn.ensemble._voting.VotingClassifier.transform" ], "is_public": true, - "description": "Soft Voting/Majority Rule classifier for unfitted estimators.\n\nRead more in the :ref:`User Guide `. .. versionadded:: 0.17", + "description": "Soft Voting/Majority Rule classifier for unfitted estimators.\n\nRead more in the :ref:`User Guide `.\n\n.. versionadded:: 0.17", "docstring": "Soft Voting/Majority Rule classifier for unfitted estimators.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.17\n\n Parameters\n ----------\n estimators : list of (str, estimator) tuples\n Invoking the ``fit`` method on the ``VotingClassifier`` will fit clones\n of those original estimators that will be stored in the class attribute\n ``self.estimators_``. An estimator can be set to ``'drop'``\n using ``set_params``.\n\n .. versionchanged:: 0.21\n ``'drop'`` is accepted. Using None was deprecated in 0.22 and\n support was removed in 0.24.\n\n voting : {'hard', 'soft'}, default='hard'\n If 'hard', uses predicted class labels for majority rule voting.\n Else if 'soft', predicts the class label based on the argmax of\n the sums of the predicted probabilities, which is recommended for\n an ensemble of well-calibrated classifiers.\n\n weights : array-like of shape (n_classifiers,), default=None\n Sequence of weights (`float` or `int`) to weight the occurrences of\n predicted class labels (`hard` voting) or class probabilities\n before averaging (`soft` voting). Uses uniform weights if `None`.\n\n n_jobs : int, default=None\n The number of jobs to run in parallel for ``fit``.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n .. versionadded:: 0.18\n\n flatten_transform : bool, default=True\n Affects shape of transform output only when voting='soft'\n If voting='soft' and flatten_transform=True, transform method returns\n matrix with shape (n_samples, n_classifiers * n_classes). If\n flatten_transform=False, it returns\n (n_classifiers, n_samples, n_classes).\n\n verbose : bool, default=False\n If True, the time elapsed while fitting will be printed as it\n is completed.\n\n .. versionadded:: 0.23\n\n Attributes\n ----------\n estimators_ : list of classifiers\n The collection of fitted sub-estimators as defined in ``estimators``\n that are not 'drop'.\n\n named_estimators_ : :class:`~sklearn.utils.Bunch`\n Attribute to access any fitted sub-estimators by name.\n\n .. 
versionadded:: 0.20\n\n le_ : :class:`~sklearn.preprocessing.LabelEncoder`\n Transformer used to encode the labels during fit and decode during\n prediction.\n\n classes_ : ndarray of shape (n_classes,)\n The classes labels.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`. Only defined if the\n underlying classifier exposes such an attribute when fit.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Only defined if the\n underlying estimators expose such an attribute when fit.\n .. versionadded:: 1.0\n\n See Also\n --------\n VotingRegressor : Prediction voting regressor.\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.linear_model import LogisticRegression\n >>> from sklearn.naive_bayes import GaussianNB\n >>> from sklearn.ensemble import RandomForestClassifier, VotingClassifier\n >>> clf1 = LogisticRegression(multi_class='multinomial', random_state=1)\n >>> clf2 = RandomForestClassifier(n_estimators=50, random_state=1)\n >>> clf3 = GaussianNB()\n >>> X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])\n >>> y = np.array([1, 1, 1, 2, 2, 2])\n >>> eclf1 = VotingClassifier(estimators=[\n ... ('lr', clf1), ('rf', clf2), ('gnb', clf3)], voting='hard')\n >>> eclf1 = eclf1.fit(X, y)\n >>> print(eclf1.predict(X))\n [1 1 1 2 2 2]\n >>> np.array_equal(eclf1.named_estimators_.lr.predict(X),\n ... eclf1.named_estimators_['lr'].predict(X))\n True\n >>> eclf2 = VotingClassifier(estimators=[\n ... ('lr', clf1), ('rf', clf2), ('gnb', clf3)],\n ... voting='soft')\n >>> eclf2 = eclf2.fit(X, y)\n >>> print(eclf2.predict(X))\n [1 1 1 2 2 2]\n >>> eclf3 = VotingClassifier(estimators=[\n ... ('lr', clf1), ('rf', clf2), ('gnb', clf3)],\n ... voting='soft', weights=[2,1,1],\n ... flatten_transform=True)\n >>> eclf3 = eclf3.fit(X, y)\n >>> print(eclf3.predict(X))\n [1 1 1 2 2 2]\n >>> print(eclf3.transform(X).shape)\n (6, 6)\n ", "source_code": "\n\nclass VotingClassifier(ClassifierMixin, _BaseVoting):\n \"\"\"Soft Voting/Majority Rule classifier for unfitted estimators.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.17\n\n Parameters\n ----------\n estimators : list of (str, estimator) tuples\n Invoking the ``fit`` method on the ``VotingClassifier`` will fit clones\n of those original estimators that will be stored in the class attribute\n ``self.estimators_``. An estimator can be set to ``'drop'``\n using ``set_params``.\n\n .. versionchanged:: 0.21\n ``'drop'`` is accepted. Using None was deprecated in 0.22 and\n support was removed in 0.24.\n\n voting : {'hard', 'soft'}, default='hard'\n If 'hard', uses predicted class labels for majority rule voting.\n Else if 'soft', predicts the class label based on the argmax of\n the sums of the predicted probabilities, which is recommended for\n an ensemble of well-calibrated classifiers.\n\n weights : array-like of shape (n_classifiers,), default=None\n Sequence of weights (`float` or `int`) to weight the occurrences of\n predicted class labels (`hard` voting) or class probabilities\n before averaging (`soft` voting). Uses uniform weights if `None`.\n\n n_jobs : int, default=None\n The number of jobs to run in parallel for ``fit``.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n .. 
versionadded:: 0.18\n\n flatten_transform : bool, default=True\n Affects shape of transform output only when voting='soft'\n If voting='soft' and flatten_transform=True, transform method returns\n matrix with shape (n_samples, n_classifiers * n_classes). If\n flatten_transform=False, it returns\n (n_classifiers, n_samples, n_classes).\n\n verbose : bool, default=False\n If True, the time elapsed while fitting will be printed as it\n is completed.\n\n .. versionadded:: 0.23\n\n Attributes\n ----------\n estimators_ : list of classifiers\n The collection of fitted sub-estimators as defined in ``estimators``\n that are not 'drop'.\n\n named_estimators_ : :class:`~sklearn.utils.Bunch`\n Attribute to access any fitted sub-estimators by name.\n\n .. versionadded:: 0.20\n\n le_ : :class:`~sklearn.preprocessing.LabelEncoder`\n Transformer used to encode the labels during fit and decode during\n prediction.\n\n classes_ : ndarray of shape (n_classes,)\n The classes labels.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`. Only defined if the\n underlying classifier exposes such an attribute when fit.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Only defined if the\n underlying estimators expose such an attribute when fit.\n .. versionadded:: 1.0\n\n See Also\n --------\n VotingRegressor : Prediction voting regressor.\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.linear_model import LogisticRegression\n >>> from sklearn.naive_bayes import GaussianNB\n >>> from sklearn.ensemble import RandomForestClassifier, VotingClassifier\n >>> clf1 = LogisticRegression(multi_class='multinomial', random_state=1)\n >>> clf2 = RandomForestClassifier(n_estimators=50, random_state=1)\n >>> clf3 = GaussianNB()\n >>> X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])\n >>> y = np.array([1, 1, 1, 2, 2, 2])\n >>> eclf1 = VotingClassifier(estimators=[\n ... ('lr', clf1), ('rf', clf2), ('gnb', clf3)], voting='hard')\n >>> eclf1 = eclf1.fit(X, y)\n >>> print(eclf1.predict(X))\n [1 1 1 2 2 2]\n >>> np.array_equal(eclf1.named_estimators_.lr.predict(X),\n ... eclf1.named_estimators_['lr'].predict(X))\n True\n >>> eclf2 = VotingClassifier(estimators=[\n ... ('lr', clf1), ('rf', clf2), ('gnb', clf3)],\n ... voting='soft')\n >>> eclf2 = eclf2.fit(X, y)\n >>> print(eclf2.predict(X))\n [1 1 1 2 2 2]\n >>> eclf3 = VotingClassifier(estimators=[\n ... ('lr', clf1), ('rf', clf2), ('gnb', clf3)],\n ... voting='soft', weights=[2,1,1],\n ... flatten_transform=True)\n >>> eclf3 = eclf3.fit(X, y)\n >>> print(eclf3.predict(X))\n [1 1 1 2 2 2]\n >>> print(eclf3.transform(X).shape)\n (6, 6)\n \"\"\"\n \n def __init__(self, estimators, *, voting='hard', weights=None, n_jobs=None, flatten_transform=True, verbose=False):\n super().__init__(estimators=estimators)\n self.voting = voting\n self.weights = weights\n self.n_jobs = n_jobs\n self.flatten_transform = flatten_transform\n self.verbose = verbose\n \n def fit(self, X, y, sample_weight=None):\n \"\"\"Fit the estimators.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training vectors, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\n y : array-like of shape (n_samples,)\n Target values.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights. 
If None, then samples are equally weighted.\n Note that this is supported only if all underlying estimators\n support sample weights.\n\n .. versionadded:: 0.18\n\n Returns\n -------\n self : object\n Returns the instance itself.\n \"\"\"\n check_classification_targets(y)\n if isinstance(y, np.ndarray) and len(y.shape) > 1 and y.shape[1] > 1:\n raise NotImplementedError('Multilabel and multi-output classification is not supported.')\n if self.voting not in ('soft', 'hard'):\n raise ValueError(\"Voting must be 'soft' or 'hard'; got (voting=%r)\" % self.voting)\n self.le_ = LabelEncoder().fit(y)\n self.classes_ = self.le_.classes_\n transformed_y = self.le_.transform(y)\n return super().fit(X, transformed_y, sample_weight)\n \n def predict(self, X):\n \"\"\"Predict class labels for X.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input samples.\n\n Returns\n -------\n maj : array-like of shape (n_samples,)\n Predicted class labels.\n \"\"\"\n check_is_fitted(self)\n if self.voting == 'soft':\n maj = np.argmax(self.predict_proba(X), axis=1)\n else:\n predictions = self._predict(X)\n maj = np.apply_along_axis(lambda x: np.argmax(np.bincount(x, weights=self._weights_not_none)), axis=1, arr=predictions)\n maj = self.le_.inverse_transform(maj)\n return maj\n \n def _collect_probas(self, X):\n \"\"\"Collect results from clf.predict calls.\"\"\"\n return np.asarray([clf.predict_proba(X) for clf in self.estimators_])\n \n def _check_voting(self):\n if self.voting == 'hard':\n raise AttributeError(f'predict_proba is not available when voting={repr(self.voting)}')\n return True\n \n @available_if(_check_voting)\n def predict_proba(self, X):\n \"\"\"Compute probabilities of possible outcomes for samples in X.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input samples.\n\n Returns\n -------\n avg : array-like of shape (n_samples, n_classes)\n Weighted average probability for each class per sample.\n \"\"\"\n check_is_fitted(self)\n avg = np.average(self._collect_probas(X), axis=0, weights=self._weights_not_none)\n return avg\n \n def transform(self, X):\n \"\"\"Return class labels or probabilities for X for each estimator.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training vectors, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\n Returns\n -------\n probabilities_or_labels\n If `voting='soft'` and `flatten_transform=True`:\n returns ndarray of shape (n_classifiers, n_samples *\n n_classes), being class probabilities calculated by each\n classifier.\n If `voting='soft' and `flatten_transform=False`:\n ndarray of shape (n_classifiers, n_samples, n_classes)\n If `voting='hard'`:\n ndarray of shape (n_samples, n_classifiers), being\n class labels predicted by each classifier.\n \"\"\"\n check_is_fitted(self)\n if self.voting == 'soft':\n probas = self._collect_probas(X)\n if not self.flatten_transform:\n return probas\n return np.hstack(probas)\n else:\n return self._predict(X)\n" }, @@ -21625,7 +21691,7 @@ "sklearn.ensemble._voting.VotingRegressor.transform" ], "is_public": true, - "description": "Prediction voting regressor for unfitted estimators.\n\nA voting regressor is an ensemble meta-estimator that fits several base regressors, each on the whole dataset. Then it averages the individual predictions to form a final prediction. Read more in the :ref:`User Guide `. .. 
versionadded:: 0.21", + "description": "Prediction voting regressor for unfitted estimators.\n\nA voting regressor is an ensemble meta-estimator that fits several base\nregressors, each on the whole dataset. Then it averages the individual\npredictions to form a final prediction.\n\nRead more in the :ref:`User Guide `.\n\n.. versionadded:: 0.21", "docstring": "Prediction voting regressor for unfitted estimators.\n\n A voting regressor is an ensemble meta-estimator that fits several base\n regressors, each on the whole dataset. Then it averages the individual\n predictions to form a final prediction.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.21\n\n Parameters\n ----------\n estimators : list of (str, estimator) tuples\n Invoking the ``fit`` method on the ``VotingRegressor`` will fit clones\n of those original estimators that will be stored in the class attribute\n ``self.estimators_``. An estimator can be set to ``'drop'`` using\n ``set_params``.\n\n .. versionchanged:: 0.21\n ``'drop'`` is accepted. Using None was deprecated in 0.22 and\n support was removed in 0.24.\n\n weights : array-like of shape (n_regressors,), default=None\n Sequence of weights (`float` or `int`) to weight the occurrences of\n predicted values before averaging. Uses uniform weights if `None`.\n\n n_jobs : int, default=None\n The number of jobs to run in parallel for ``fit``.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n verbose : bool, default=False\n If True, the time elapsed while fitting will be printed as it\n is completed.\n\n .. versionadded:: 0.23\n\n Attributes\n ----------\n estimators_ : list of regressors\n The collection of fitted sub-estimators as defined in ``estimators``\n that are not 'drop'.\n\n named_estimators_ : :class:`~sklearn.utils.Bunch`\n Attribute to access any fitted sub-estimators by name.\n\n .. versionadded:: 0.20\n\n n_features_in_ : int\n Number of features seen during :term:`fit`. Only defined if the\n underlying regressor exposes such an attribute when fit.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Only defined if the\n underlying estimators expose such an attribute when fit.\n .. versionadded:: 1.0\n\n See Also\n --------\n VotingClassifier : Soft Voting/Majority Rule classifier.\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.linear_model import LinearRegression\n >>> from sklearn.ensemble import RandomForestRegressor\n >>> from sklearn.ensemble import VotingRegressor\n >>> r1 = LinearRegression()\n >>> r2 = RandomForestRegressor(n_estimators=10, random_state=1)\n >>> X = np.array([[1, 1], [2, 4], [3, 9], [4, 16], [5, 25], [6, 36]])\n >>> y = np.array([2, 6, 12, 20, 30, 42])\n >>> er = VotingRegressor([('lr', r1), ('rf', r2)])\n >>> print(er.fit(X, y).predict(X))\n [ 3.3 5.7 11.8 19.7 28. 40.3]\n ", "source_code": "\n\nclass VotingRegressor(RegressorMixin, _BaseVoting):\n \"\"\"Prediction voting regressor for unfitted estimators.\n\n A voting regressor is an ensemble meta-estimator that fits several base\n regressors, each on the whole dataset. Then it averages the individual\n predictions to form a final prediction.\n\n Read more in the :ref:`User Guide `.\n\n .. 
versionadded:: 0.21\n\n Parameters\n ----------\n estimators : list of (str, estimator) tuples\n Invoking the ``fit`` method on the ``VotingRegressor`` will fit clones\n of those original estimators that will be stored in the class attribute\n ``self.estimators_``. An estimator can be set to ``'drop'`` using\n ``set_params``.\n\n .. versionchanged:: 0.21\n ``'drop'`` is accepted. Using None was deprecated in 0.22 and\n support was removed in 0.24.\n\n weights : array-like of shape (n_regressors,), default=None\n Sequence of weights (`float` or `int`) to weight the occurrences of\n predicted values before averaging. Uses uniform weights if `None`.\n\n n_jobs : int, default=None\n The number of jobs to run in parallel for ``fit``.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n verbose : bool, default=False\n If True, the time elapsed while fitting will be printed as it\n is completed.\n\n .. versionadded:: 0.23\n\n Attributes\n ----------\n estimators_ : list of regressors\n The collection of fitted sub-estimators as defined in ``estimators``\n that are not 'drop'.\n\n named_estimators_ : :class:`~sklearn.utils.Bunch`\n Attribute to access any fitted sub-estimators by name.\n\n .. versionadded:: 0.20\n\n n_features_in_ : int\n Number of features seen during :term:`fit`. Only defined if the\n underlying regressor exposes such an attribute when fit.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Only defined if the\n underlying estimators expose such an attribute when fit.\n .. versionadded:: 1.0\n\n See Also\n --------\n VotingClassifier : Soft Voting/Majority Rule classifier.\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.linear_model import LinearRegression\n >>> from sklearn.ensemble import RandomForestRegressor\n >>> from sklearn.ensemble import VotingRegressor\n >>> r1 = LinearRegression()\n >>> r2 = RandomForestRegressor(n_estimators=10, random_state=1)\n >>> X = np.array([[1, 1], [2, 4], [3, 9], [4, 16], [5, 25], [6, 36]])\n >>> y = np.array([2, 6, 12, 20, 30, 42])\n >>> er = VotingRegressor([('lr', r1), ('rf', r2)])\n >>> print(er.fit(X, y).predict(X))\n [ 3.3 5.7 11.8 19.7 28. 40.3]\n \"\"\"\n \n def __init__(self, estimators, *, weights=None, n_jobs=None, verbose=False):\n super().__init__(estimators=estimators)\n self.weights = weights\n self.n_jobs = n_jobs\n self.verbose = verbose\n \n def fit(self, X, y, sample_weight=None):\n \"\"\"Fit the estimators.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training vectors, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\n y : array-like of shape (n_samples,)\n Target values.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights. 
If None, then samples are equally weighted.\n Note that this is supported only if all underlying estimators\n support sample weights.\n\n Returns\n -------\n self : object\n Fitted estimator.\n \"\"\"\n y = column_or_1d(y, warn=True)\n return super().fit(X, y, sample_weight)\n \n def predict(self, X):\n \"\"\"Predict regression target for X.\n\n The predicted regression target of an input sample is computed as the\n mean predicted regression targets of the estimators in the ensemble.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input samples.\n\n Returns\n -------\n y : ndarray of shape (n_samples,)\n The predicted values.\n \"\"\"\n check_is_fitted(self)\n return np.average(self._predict(X), axis=1, weights=self._weights_not_none)\n \n def transform(self, X):\n \"\"\"Return predictions for X for each estimator.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input samples.\n\n Returns\n -------\n predictions : ndarray of shape (n_samples, n_classifiers)\n Values predicted by each regressor.\n \"\"\"\n check_is_fitted(self)\n return self._predict(X)\n" }, @@ -21645,7 +21711,7 @@ "sklearn.ensemble._voting._BaseVoting._more_tags" ], "is_public": false, - "description": "Base class for voting.\n\nWarning: This class should not be used directly. Use derived classes instead.", + "description": "Base class for voting.\n\nWarning: This class should not be used directly. Use derived classes\ninstead.", "docstring": "Base class for voting.\n\n Warning: This class should not be used directly. Use derived classes\n instead.\n ", "source_code": "\n\nclass _BaseVoting(TransformerMixin, _BaseHeterogeneousEnsemble):\n \"\"\"Base class for voting.\n\n Warning: This class should not be used directly. 
Use derived classes\n instead.\n \"\"\"\n \n def _log_message(self, name, idx, total):\n if not self.verbose:\n return None\n return '(%d of %d) Processing %s' % (idx, total, name)\n \n @property\n def _weights_not_none(self):\n \"\"\"Get the weights of not `None` estimators.\"\"\"\n if self.weights is None:\n return None\n return [w for (est, w) in zip(self.estimators, self.weights) if est[1] != 'drop']\n \n def _predict(self, X):\n \"\"\"Collect results from clf.predict calls.\"\"\"\n return np.asarray([est.predict(X) for est in self.estimators_]).T\n \n @abstractmethod\n def fit(self, X, y, sample_weight=None):\n \"\"\"Get common fit operations.\"\"\"\n (names, clfs) = self._validate_estimators()\n if self.weights is not None and len(self.weights) != len(self.estimators):\n raise ValueError('Number of `estimators` and weights must be equal; got %d weights, %d estimators' % (len(self.weights), len(self.estimators)))\n self.estimators_ = Parallel(n_jobs=self.n_jobs)((delayed(_fit_single_estimator)(clone(clf), X, y, sample_weight=sample_weight, message_clsname='Voting', message=self._log_message(names[idx], idx + 1, len(clfs))) for (idx, clf) in enumerate(clfs) if clf != 'drop'))\n self.named_estimators_ = Bunch()\n est_iter = iter(self.estimators_)\n for (name, est) in self.estimators:\n current_est = est if est == 'drop' else next(est_iter)\n self.named_estimators_[name] = current_est\n if hasattr(current_est, 'feature_names_in_'):\n self.feature_names_in_ = current_est.feature_names_in_\n return self\n \n def fit_transform(self, X, y=None, **fit_params):\n \"\"\"Return class labels or probabilities for each estimator.\n\n Return predictions for X for each estimator.\n\n Parameters\n ----------\n X : {array-like, sparse matrix, dataframe} of shape (n_samples, n_features)\n Input samples.\n\n y : ndarray of shape (n_samples,), default=None\n Target values (None for unsupervised transformations).\n\n **fit_params : dict\n Additional fit parameters.\n\n Returns\n -------\n X_new : ndarray array of shape (n_samples, n_features_new)\n Transformed array.\n \"\"\"\n return super().fit_transform(X, y, **fit_params)\n \n @property\n def n_features_in_(self):\n \"\"\"Number of features seen during :term:`fit`.\"\"\"\n try:\n check_is_fitted(self)\n except NotFittedError as nfe:\n raise AttributeError('{} object has no n_features_in_ attribute.'.format(self.__class__.__name__)) from nfe\n return self.estimators_[0].n_features_in_\n \n def _sk_visual_block_(self):\n (names, estimators) = zip(*self.estimators)\n return _VisualBlock('parallel', estimators, names=names)\n \n def _more_tags(self):\n return {'preserves_dtype': []}\n" }, @@ -21671,7 +21737,7 @@ "sklearn.ensemble._weight_boosting.AdaBoostClassifier.predict_log_proba" ], "is_public": true, - "description": "An AdaBoost classifier.\n\nAn AdaBoost [1] classifier is a meta-estimator that begins by fitting a classifier on the original dataset and then fits additional copies of the classifier on the same dataset but where the weights of incorrectly classified instances are adjusted such that subsequent classifiers focus more on difficult cases. This class implements the algorithm known as AdaBoost-SAMME [2]. Read more in the :ref:`User Guide `. .. 
versionadded:: 0.14", + "description": "An AdaBoost classifier.\n\nAn AdaBoost [1] classifier is a meta-estimator that begins by fitting a\nclassifier on the original dataset and then fits additional copies of the\nclassifier on the same dataset but where the weights of incorrectly\nclassified instances are adjusted such that subsequent classifiers focus\nmore on difficult cases.\n\nThis class implements the algorithm known as AdaBoost-SAMME [2].\n\nRead more in the :ref:`User Guide `.\n\n.. versionadded:: 0.14", "docstring": "An AdaBoost classifier.\n\n An AdaBoost [1] classifier is a meta-estimator that begins by fitting a\n classifier on the original dataset and then fits additional copies of the\n classifier on the same dataset but where the weights of incorrectly\n classified instances are adjusted such that subsequent classifiers focus\n more on difficult cases.\n\n This class implements the algorithm known as AdaBoost-SAMME [2].\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.14\n\n Parameters\n ----------\n base_estimator : object, default=None\n The base estimator from which the boosted ensemble is built.\n Support for sample weighting is required, as well as proper\n ``classes_`` and ``n_classes_`` attributes. If ``None``, then\n the base estimator is :class:`~sklearn.tree.DecisionTreeClassifier`\n initialized with `max_depth=1`.\n\n n_estimators : int, default=50\n The maximum number of estimators at which boosting is terminated.\n In case of perfect fit, the learning procedure is stopped early.\n\n learning_rate : float, default=1.0\n Weight applied to each classifier at each boosting iteration. A higher\n learning rate increases the contribution of each classifier. There is\n a trade-off between the `learning_rate` and `n_estimators` parameters.\n\n algorithm : {'SAMME', 'SAMME.R'}, default='SAMME.R'\n If 'SAMME.R' then use the SAMME.R real boosting algorithm.\n ``base_estimator`` must support calculation of class probabilities.\n If 'SAMME' then use the SAMME discrete boosting algorithm.\n The SAMME.R algorithm typically converges faster than SAMME,\n achieving a lower test error with fewer boosting iterations.\n\n random_state : int, RandomState instance or None, default=None\n Controls the random seed given at each `base_estimator` at each\n boosting iteration.\n Thus, it is only used when `base_estimator` exposes a `random_state`.\n Pass an int for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n Attributes\n ----------\n base_estimator_ : estimator\n The base estimator from which the ensemble is grown.\n\n estimators_ : list of classifiers\n The collection of fitted sub-estimators.\n\n classes_ : ndarray of shape (n_classes,)\n The classes labels.\n\n n_classes_ : int\n The number of classes.\n\n estimator_weights_ : ndarray of floats\n Weights for each estimator in the boosted ensemble.\n\n estimator_errors_ : ndarray of floats\n Classification error for each estimator in the boosted\n ensemble.\n\n feature_importances_ : ndarray of shape (n_features,)\n The impurity-based feature importances if supported by the\n ``base_estimator`` (when based on decision trees).\n\n Warning: impurity-based feature importances can be misleading for\n high cardinality features (many unique values). See\n :func:`sklearn.inspection.permutation_importance` as an alternative.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. 
versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n AdaBoostRegressor : An AdaBoost regressor that begins by fitting a\n regressor on the original dataset and then fits additional copies of\n the regressor on the same dataset but where the weights of instances\n are adjusted according to the error of the current prediction.\n\n GradientBoostingClassifier : GB builds an additive model in a forward\n stage-wise fashion. Regression trees are fit on the negative gradient\n of the binomial or multinomial deviance loss function. Binary\n classification is a special case where only a single regression tree is\n induced.\n\n sklearn.tree.DecisionTreeClassifier : A non-parametric supervised learning\n method used for classification.\n Creates a model that predicts the value of a target variable by\n learning simple decision rules inferred from the data features.\n\n References\n ----------\n .. [1] Y. Freund, R. Schapire, \"A Decision-Theoretic Generalization of\n on-Line Learning and an Application to Boosting\", 1995.\n\n .. [2] J. Zhu, H. Zou, S. Rosset, T. Hastie, \"Multi-class AdaBoost\", 2009.\n\n Examples\n --------\n >>> from sklearn.ensemble import AdaBoostClassifier\n >>> from sklearn.datasets import make_classification\n >>> X, y = make_classification(n_samples=1000, n_features=4,\n ... n_informative=2, n_redundant=0,\n ... random_state=0, shuffle=False)\n >>> clf = AdaBoostClassifier(n_estimators=100, random_state=0)\n >>> clf.fit(X, y)\n AdaBoostClassifier(n_estimators=100, random_state=0)\n >>> clf.predict([[0, 0, 0, 0]])\n array([1])\n >>> clf.score(X, y)\n 0.983...\n ", "source_code": "\n\nclass AdaBoostClassifier(ClassifierMixin, BaseWeightBoosting):\n \"\"\"An AdaBoost classifier.\n\n An AdaBoost [1] classifier is a meta-estimator that begins by fitting a\n classifier on the original dataset and then fits additional copies of the\n classifier on the same dataset but where the weights of incorrectly\n classified instances are adjusted such that subsequent classifiers focus\n more on difficult cases.\n\n This class implements the algorithm known as AdaBoost-SAMME [2].\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.14\n\n Parameters\n ----------\n base_estimator : object, default=None\n The base estimator from which the boosted ensemble is built.\n Support for sample weighting is required, as well as proper\n ``classes_`` and ``n_classes_`` attributes. If ``None``, then\n the base estimator is :class:`~sklearn.tree.DecisionTreeClassifier`\n initialized with `max_depth=1`.\n\n n_estimators : int, default=50\n The maximum number of estimators at which boosting is terminated.\n In case of perfect fit, the learning procedure is stopped early.\n\n learning_rate : float, default=1.0\n Weight applied to each classifier at each boosting iteration. A higher\n learning rate increases the contribution of each classifier. 
There is\n a trade-off between the `learning_rate` and `n_estimators` parameters.\n\n algorithm : {'SAMME', 'SAMME.R'}, default='SAMME.R'\n If 'SAMME.R' then use the SAMME.R real boosting algorithm.\n ``base_estimator`` must support calculation of class probabilities.\n If 'SAMME' then use the SAMME discrete boosting algorithm.\n The SAMME.R algorithm typically converges faster than SAMME,\n achieving a lower test error with fewer boosting iterations.\n\n random_state : int, RandomState instance or None, default=None\n Controls the random seed given at each `base_estimator` at each\n boosting iteration.\n Thus, it is only used when `base_estimator` exposes a `random_state`.\n Pass an int for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n Attributes\n ----------\n base_estimator_ : estimator\n The base estimator from which the ensemble is grown.\n\n estimators_ : list of classifiers\n The collection of fitted sub-estimators.\n\n classes_ : ndarray of shape (n_classes,)\n The classes labels.\n\n n_classes_ : int\n The number of classes.\n\n estimator_weights_ : ndarray of floats\n Weights for each estimator in the boosted ensemble.\n\n estimator_errors_ : ndarray of floats\n Classification error for each estimator in the boosted\n ensemble.\n\n feature_importances_ : ndarray of shape (n_features,)\n The impurity-based feature importances if supported by the\n ``base_estimator`` (when based on decision trees).\n\n Warning: impurity-based feature importances can be misleading for\n high cardinality features (many unique values). See\n :func:`sklearn.inspection.permutation_importance` as an alternative.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n AdaBoostRegressor : An AdaBoost regressor that begins by fitting a\n regressor on the original dataset and then fits additional copies of\n the regressor on the same dataset but where the weights of instances\n are adjusted according to the error of the current prediction.\n\n GradientBoostingClassifier : GB builds an additive model in a forward\n stage-wise fashion. Regression trees are fit on the negative gradient\n of the binomial or multinomial deviance loss function. Binary\n classification is a special case where only a single regression tree is\n induced.\n\n sklearn.tree.DecisionTreeClassifier : A non-parametric supervised learning\n method used for classification.\n Creates a model that predicts the value of a target variable by\n learning simple decision rules inferred from the data features.\n\n References\n ----------\n .. [1] Y. Freund, R. Schapire, \"A Decision-Theoretic Generalization of\n on-Line Learning and an Application to Boosting\", 1995.\n\n .. [2] J. Zhu, H. Zou, S. Rosset, T. Hastie, \"Multi-class AdaBoost\", 2009.\n\n Examples\n --------\n >>> from sklearn.ensemble import AdaBoostClassifier\n >>> from sklearn.datasets import make_classification\n >>> X, y = make_classification(n_samples=1000, n_features=4,\n ... n_informative=2, n_redundant=0,\n ... 
random_state=0, shuffle=False)\n >>> clf = AdaBoostClassifier(n_estimators=100, random_state=0)\n >>> clf.fit(X, y)\n AdaBoostClassifier(n_estimators=100, random_state=0)\n >>> clf.predict([[0, 0, 0, 0]])\n array([1])\n >>> clf.score(X, y)\n 0.983...\n \"\"\"\n \n def __init__(self, base_estimator=None, *, n_estimators=50, learning_rate=1.0, algorithm='SAMME.R', random_state=None):\n super().__init__(base_estimator=base_estimator, n_estimators=n_estimators, learning_rate=learning_rate, random_state=random_state)\n self.algorithm = algorithm\n \n def fit(self, X, y, sample_weight=None):\n \"\"\"Build a boosted classifier from the training set (X, y).\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The training input samples. Sparse matrix can be CSC, CSR, COO,\n DOK, or LIL. COO, DOK, and LIL are converted to CSR.\n\n y : array-like of shape (n_samples,)\n The target values (class labels).\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights. If None, the sample weights are initialized to\n ``1 / n_samples``.\n\n Returns\n -------\n self : object\n Fitted estimator.\n \"\"\"\n if self.algorithm not in ('SAMME', 'SAMME.R'):\n raise ValueError('algorithm %s is not supported' % self.algorithm)\n return super().fit(X, y, sample_weight)\n \n def _validate_estimator(self):\n \"\"\"Check the estimator and set the base_estimator_ attribute.\"\"\"\n super()._validate_estimator(default=DecisionTreeClassifier(max_depth=1))\n if self.algorithm == 'SAMME.R':\n if not hasattr(self.base_estimator_, 'predict_proba'):\n raise TypeError(\"AdaBoostClassifier with algorithm='SAMME.R' requires that the weak learner supports the calculation of class probabilities with a predict_proba method.\\nPlease change the base estimator or set algorithm='SAMME' instead.\")\n if not has_fit_parameter(self.base_estimator_, 'sample_weight'):\n raise ValueError(\"%s doesn't support sample_weight.\" % self.base_estimator_.__class__.__name__)\n \n def _boost(self, iboost, X, y, sample_weight, random_state):\n \"\"\"Implement a single boost.\n\n Perform a single boost according to the real multi-class SAMME.R\n algorithm or to the discrete SAMME algorithm and return the updated\n sample weights.\n\n Parameters\n ----------\n iboost : int\n The index of the current boost iteration.\n\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The training input samples.\n\n y : array-like of shape (n_samples,)\n The target values (class labels).\n\n sample_weight : array-like of shape (n_samples,)\n The current sample weights.\n\n random_state : RandomState instance\n The RandomState instance used if the base estimator accepts a\n `random_state` attribute.\n\n Returns\n -------\n sample_weight : array-like of shape (n_samples,) or None\n The reweighted sample weights.\n If None then boosting has terminated early.\n\n estimator_weight : float\n The weight for the current boost.\n If None then boosting has terminated early.\n\n estimator_error : float\n The classification error for the current boost.\n If None then boosting has terminated early.\n \"\"\"\n if self.algorithm == 'SAMME.R':\n return self._boost_real(iboost, X, y, sample_weight, random_state)\n else:\n return self._boost_discrete(iboost, X, y, sample_weight, random_state)\n \n def _boost_real(self, iboost, X, y, sample_weight, random_state):\n \"\"\"Implement a single boost using the SAMME.R real algorithm.\"\"\"\n estimator = self._make_estimator(random_state=random_state)\n 
estimator.fit(X, y, sample_weight=sample_weight)\n y_predict_proba = estimator.predict_proba(X)\n if iboost == 0:\n self.classes_ = getattr(estimator, 'classes_', None)\n self.n_classes_ = len(self.classes_)\n y_predict = self.classes_.take(np.argmax(y_predict_proba, axis=1), axis=0)\n incorrect = y_predict != y\n estimator_error = np.mean(np.average(incorrect, weights=sample_weight, axis=0))\n if estimator_error <= 0:\n return sample_weight, 1.0, 0.0\n n_classes = self.n_classes_\n classes = self.classes_\n y_codes = np.array([-1.0 / (n_classes - 1), 1.0])\n y_coding = y_codes.take(classes == y[:, np.newaxis])\n proba = y_predict_proba\n np.clip(proba, np.finfo(proba.dtype).eps, None, out=proba)\n estimator_weight = -1.0 * self.learning_rate * ((n_classes - 1.0) / n_classes) * xlogy(y_coding, y_predict_proba).sum(axis=1)\n if not iboost == self.n_estimators - 1:\n sample_weight *= np.exp(estimator_weight * ((sample_weight > 0) | (estimator_weight < 0)))\n return sample_weight, 1.0, estimator_error\n \n def _boost_discrete(self, iboost, X, y, sample_weight, random_state):\n \"\"\"Implement a single boost using the SAMME discrete algorithm.\"\"\"\n estimator = self._make_estimator(random_state=random_state)\n estimator.fit(X, y, sample_weight=sample_weight)\n y_predict = estimator.predict(X)\n if iboost == 0:\n self.classes_ = getattr(estimator, 'classes_', None)\n self.n_classes_ = len(self.classes_)\n incorrect = y_predict != y\n estimator_error = np.mean(np.average(incorrect, weights=sample_weight, axis=0))\n if estimator_error <= 0:\n return sample_weight, 1.0, 0.0\n n_classes = self.n_classes_\n if estimator_error >= 1.0 - 1.0 / n_classes:\n self.estimators_.pop(-1)\n if len(self.estimators_) == 0:\n raise ValueError('BaseClassifier in AdaBoostClassifier ensemble is worse than random, ensemble can not be fit.')\n return None, None, None\n estimator_weight = self.learning_rate * (np.log((1.0 - estimator_error) / estimator_error) + np.log(n_classes - 1.0))\n if not iboost == self.n_estimators - 1:\n sample_weight = np.exp(np.log(sample_weight) + estimator_weight * incorrect * (sample_weight > 0))\n return sample_weight, estimator_weight, estimator_error\n \n def predict(self, X):\n \"\"\"Predict classes for X.\n\n The predicted class of an input sample is computed as the weighted mean\n prediction of the classifiers in the ensemble.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The training input samples. Sparse matrix can be CSC, CSR, COO,\n DOK, or LIL. COO, DOK, and LIL are converted to CSR.\n\n Returns\n -------\n y : ndarray of shape (n_samples,)\n The predicted classes.\n \"\"\"\n pred = self.decision_function(X)\n if self.n_classes_ == 2:\n return self.classes_.take(pred > 0, axis=0)\n return self.classes_.take(np.argmax(pred, axis=1), axis=0)\n \n def staged_predict(self, X):\n \"\"\"Return staged predictions for X.\n\n The predicted class of an input sample is computed as the weighted mean\n prediction of the classifiers in the ensemble.\n\n This generator method yields the ensemble prediction after each\n iteration of boosting and therefore allows monitoring, such as to\n determine the prediction on a test set after each boost.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The input samples. Sparse matrix can be CSC, CSR, COO,\n DOK, or LIL. 
COO, DOK, and LIL are converted to CSR.\n\n Yields\n ------\n y : generator of ndarray of shape (n_samples,)\n The predicted classes.\n \"\"\"\n X = self._check_X(X)\n n_classes = self.n_classes_\n classes = self.classes_\n if n_classes == 2:\n for pred in self.staged_decision_function(X):\n yield np.array(classes.take(pred > 0, axis=0))\n else:\n for pred in self.staged_decision_function(X):\n yield np.array(classes.take(np.argmax(pred, axis=1), axis=0))\n \n def decision_function(self, X):\n \"\"\"Compute the decision function of ``X``.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The training input samples. Sparse matrix can be CSC, CSR, COO,\n DOK, or LIL. COO, DOK, and LIL are converted to CSR.\n\n Returns\n -------\n score : ndarray of shape of (n_samples, k)\n The decision function of the input samples. The order of\n outputs is the same of that of the :term:`classes_` attribute.\n Binary classification is a special cases with ``k == 1``,\n otherwise ``k==n_classes``. For binary classification,\n values closer to -1 or 1 mean more like the first or second\n class in ``classes_``, respectively.\n \"\"\"\n check_is_fitted(self)\n X = self._check_X(X)\n n_classes = self.n_classes_\n classes = self.classes_[:, np.newaxis]\n if self.algorithm == 'SAMME.R':\n pred = sum((_samme_proba(estimator, n_classes, X) for estimator in self.estimators_))\n else:\n pred = sum(((estimator.predict(X) == classes).T * w for (estimator, w) in zip(self.estimators_, self.estimator_weights_)))\n pred /= self.estimator_weights_.sum()\n if n_classes == 2:\n pred[:, 0] *= -1\n return pred.sum(axis=1)\n return pred\n \n def staged_decision_function(self, X):\n \"\"\"Compute decision function of ``X`` for each boosting iteration.\n\n This method allows monitoring (i.e. determine error on testing set)\n after each boosting iteration.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The training input samples. Sparse matrix can be CSC, CSR, COO,\n DOK, or LIL. COO, DOK, and LIL are converted to CSR.\n\n Yields\n ------\n score : generator of ndarray of shape (n_samples, k)\n The decision function of the input samples. The order of\n outputs is the same of that of the :term:`classes_` attribute.\n Binary classification is a special cases with ``k == 1``,\n otherwise ``k==n_classes``. For binary classification,\n values closer to -1 or 1 mean more like the first or second\n class in ``classes_``, respectively.\n \"\"\"\n check_is_fitted(self)\n X = self._check_X(X)\n n_classes = self.n_classes_\n classes = self.classes_[:, np.newaxis]\n pred = None\n norm = 0.0\n for (weight, estimator) in zip(self.estimator_weights_, self.estimators_):\n norm += weight\n if self.algorithm == 'SAMME.R':\n current_pred = _samme_proba(estimator, n_classes, X)\n else:\n current_pred = estimator.predict(X)\n current_pred = (current_pred == classes).T * weight\n if pred is None:\n pred = current_pred\n else:\n pred += current_pred\n if n_classes == 2:\n tmp_pred = np.copy(pred)\n tmp_pred[:, 0] *= -1\n yield (tmp_pred / norm).sum(axis=1)\n else:\n yield pred / norm\n \n @staticmethod\n def _compute_proba_from_decision(decision, n_classes):\n \"\"\"Compute probabilities from the decision function.\n\n This is based eq. (4) of [1] where:\n p(y=c|X) = exp((1 / K-1) f_c(X)) / sum_k(exp((1 / K-1) f_k(X)))\n = softmax((1 / K-1) * f(X))\n\n References\n ----------\n .. [1] J. Zhu, H. Zou, S. Rosset, T. 
Hastie, \"Multi-class AdaBoost\",\n 2009.\n \"\"\"\n if n_classes == 2:\n decision = np.vstack([-decision, decision]).T / 2\n else:\n decision /= n_classes - 1\n return softmax(decision, copy=False)\n \n def predict_proba(self, X):\n \"\"\"Predict class probabilities for X.\n\n The predicted class probabilities of an input sample is computed as\n the weighted mean predicted class probabilities of the classifiers\n in the ensemble.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The training input samples. Sparse matrix can be CSC, CSR, COO,\n DOK, or LIL. COO, DOK, and LIL are converted to CSR.\n\n Returns\n -------\n p : ndarray of shape (n_samples, n_classes)\n The class probabilities of the input samples. The order of\n outputs is the same of that of the :term:`classes_` attribute.\n \"\"\"\n check_is_fitted(self)\n n_classes = self.n_classes_\n if n_classes == 1:\n return np.ones((_num_samples(X), 1))\n decision = self.decision_function(X)\n return self._compute_proba_from_decision(decision, n_classes)\n \n def staged_predict_proba(self, X):\n \"\"\"Predict class probabilities for X.\n\n The predicted class probabilities of an input sample is computed as\n the weighted mean predicted class probabilities of the classifiers\n in the ensemble.\n\n This generator method yields the ensemble predicted class probabilities\n after each iteration of boosting and therefore allows monitoring, such\n as to determine the predicted class probabilities on a test set after\n each boost.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The training input samples. Sparse matrix can be CSC, CSR, COO,\n DOK, or LIL. COO, DOK, and LIL are converted to CSR.\n\n Yields\n ------\n p : generator of ndarray of shape (n_samples,)\n The class probabilities of the input samples. The order of\n outputs is the same of that of the :term:`classes_` attribute.\n \"\"\"\n n_classes = self.n_classes_\n for decision in self.staged_decision_function(X):\n yield self._compute_proba_from_decision(decision, n_classes)\n \n def predict_log_proba(self, X):\n \"\"\"Predict class log-probabilities for X.\n\n The predicted class log-probabilities of an input sample is computed as\n the weighted mean predicted class log-probabilities of the classifiers\n in the ensemble.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The training input samples. Sparse matrix can be CSC, CSR, COO,\n DOK, or LIL. COO, DOK, and LIL are converted to CSR.\n\n Returns\n -------\n p : ndarray of shape (n_samples, n_classes)\n The class probabilities of the input samples. The order of\n outputs is the same of that of the :term:`classes_` attribute.\n \"\"\"\n return np.log(self.predict_proba(X))\n" }, @@ -21690,7 +21756,7 @@ "sklearn.ensemble._weight_boosting.AdaBoostRegressor.staged_predict" ], "is_public": true, - "description": "An AdaBoost regressor.\n\nAn AdaBoost [1] regressor is a meta-estimator that begins by fitting a regressor on the original dataset and then fits additional copies of the regressor on the same dataset but where the weights of instances are adjusted according to the error of the current prediction. As such, subsequent regressors focus more on difficult cases. This class implements the algorithm known as AdaBoost.R2 [2]. Read more in the :ref:`User Guide `. .. 
versionadded:: 0.14", + "description": "An AdaBoost regressor.\n\nAn AdaBoost [1] regressor is a meta-estimator that begins by fitting a\nregressor on the original dataset and then fits additional copies of the\nregressor on the same dataset but where the weights of instances are\nadjusted according to the error of the current prediction. As such,\nsubsequent regressors focus more on difficult cases.\n\nThis class implements the algorithm known as AdaBoost.R2 [2].\n\nRead more in the :ref:`User Guide `.\n\n.. versionadded:: 0.14", "docstring": "An AdaBoost regressor.\n\n An AdaBoost [1] regressor is a meta-estimator that begins by fitting a\n regressor on the original dataset and then fits additional copies of the\n regressor on the same dataset but where the weights of instances are\n adjusted according to the error of the current prediction. As such,\n subsequent regressors focus more on difficult cases.\n\n This class implements the algorithm known as AdaBoost.R2 [2].\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.14\n\n Parameters\n ----------\n base_estimator : object, default=None\n The base estimator from which the boosted ensemble is built.\n If ``None``, then the base estimator is\n :class:`~sklearn.tree.DecisionTreeRegressor` initialized with\n `max_depth=3`.\n\n n_estimators : int, default=50\n The maximum number of estimators at which boosting is terminated.\n In case of perfect fit, the learning procedure is stopped early.\n\n learning_rate : float, default=1.0\n Weight applied to each regressor at each boosting iteration. A higher\n learning rate increases the contribution of each regressor. There is\n a trade-off between the `learning_rate` and `n_estimators` parameters.\n\n loss : {'linear', 'square', 'exponential'}, default='linear'\n The loss function to use when updating the weights after each\n boosting iteration.\n\n random_state : int, RandomState instance or None, default=None\n Controls the random seed given at each `base_estimator` at each\n boosting iteration.\n Thus, it is only used when `base_estimator` exposes a `random_state`.\n In addition, it controls the bootstrap of the weights used to train the\n `base_estimator` at each boosting iteration.\n Pass an int for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n Attributes\n ----------\n base_estimator_ : estimator\n The base estimator from which the ensemble is grown.\n\n estimators_ : list of regressors\n The collection of fitted sub-estimators.\n\n estimator_weights_ : ndarray of floats\n Weights for each estimator in the boosted ensemble.\n\n estimator_errors_ : ndarray of floats\n Regression error for each estimator in the boosted ensemble.\n\n feature_importances_ : ndarray of shape (n_features,)\n The impurity-based feature importances if supported by the\n ``base_estimator`` (when based on decision trees).\n\n Warning: impurity-based feature importances can be misleading for\n high cardinality features (many unique values). See\n :func:`sklearn.inspection.permutation_importance` as an alternative.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. 
versionadded:: 1.0\n\n See Also\n --------\n AdaBoostClassifier : An AdaBoost classifier.\n GradientBoostingRegressor : Gradient Boosting Classification Tree.\n sklearn.tree.DecisionTreeRegressor : A decision tree regressor.\n\n References\n ----------\n .. [1] Y. Freund, R. Schapire, \"A Decision-Theoretic Generalization of\n on-Line Learning and an Application to Boosting\", 1995.\n\n .. [2] H. Drucker, \"Improving Regressors using Boosting Techniques\", 1997.\n\n Examples\n --------\n >>> from sklearn.ensemble import AdaBoostRegressor\n >>> from sklearn.datasets import make_regression\n >>> X, y = make_regression(n_features=4, n_informative=2,\n ... random_state=0, shuffle=False)\n >>> regr = AdaBoostRegressor(random_state=0, n_estimators=100)\n >>> regr.fit(X, y)\n AdaBoostRegressor(n_estimators=100, random_state=0)\n >>> regr.predict([[0, 0, 0, 0]])\n array([4.7972...])\n >>> regr.score(X, y)\n 0.9771...\n ", "source_code": "\n\nclass AdaBoostRegressor(RegressorMixin, BaseWeightBoosting):\n \"\"\"An AdaBoost regressor.\n\n An AdaBoost [1] regressor is a meta-estimator that begins by fitting a\n regressor on the original dataset and then fits additional copies of the\n regressor on the same dataset but where the weights of instances are\n adjusted according to the error of the current prediction. As such,\n subsequent regressors focus more on difficult cases.\n\n This class implements the algorithm known as AdaBoost.R2 [2].\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.14\n\n Parameters\n ----------\n base_estimator : object, default=None\n The base estimator from which the boosted ensemble is built.\n If ``None``, then the base estimator is\n :class:`~sklearn.tree.DecisionTreeRegressor` initialized with\n `max_depth=3`.\n\n n_estimators : int, default=50\n The maximum number of estimators at which boosting is terminated.\n In case of perfect fit, the learning procedure is stopped early.\n\n learning_rate : float, default=1.0\n Weight applied to each regressor at each boosting iteration. A higher\n learning rate increases the contribution of each regressor. There is\n a trade-off between the `learning_rate` and `n_estimators` parameters.\n\n loss : {'linear', 'square', 'exponential'}, default='linear'\n The loss function to use when updating the weights after each\n boosting iteration.\n\n random_state : int, RandomState instance or None, default=None\n Controls the random seed given at each `base_estimator` at each\n boosting iteration.\n Thus, it is only used when `base_estimator` exposes a `random_state`.\n In addition, it controls the bootstrap of the weights used to train the\n `base_estimator` at each boosting iteration.\n Pass an int for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n Attributes\n ----------\n base_estimator_ : estimator\n The base estimator from which the ensemble is grown.\n\n estimators_ : list of regressors\n The collection of fitted sub-estimators.\n\n estimator_weights_ : ndarray of floats\n Weights for each estimator in the boosted ensemble.\n\n estimator_errors_ : ndarray of floats\n Regression error for each estimator in the boosted ensemble.\n\n feature_importances_ : ndarray of shape (n_features,)\n The impurity-based feature importances if supported by the\n ``base_estimator`` (when based on decision trees).\n\n Warning: impurity-based feature importances can be misleading for\n high cardinality features (many unique values). 
See\n :func:`sklearn.inspection.permutation_importance` as an alternative.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n AdaBoostClassifier : An AdaBoost classifier.\n GradientBoostingRegressor : Gradient Boosting Classification Tree.\n sklearn.tree.DecisionTreeRegressor : A decision tree regressor.\n\n References\n ----------\n .. [1] Y. Freund, R. Schapire, \"A Decision-Theoretic Generalization of\n on-Line Learning and an Application to Boosting\", 1995.\n\n .. [2] H. Drucker, \"Improving Regressors using Boosting Techniques\", 1997.\n\n Examples\n --------\n >>> from sklearn.ensemble import AdaBoostRegressor\n >>> from sklearn.datasets import make_regression\n >>> X, y = make_regression(n_features=4, n_informative=2,\n ... random_state=0, shuffle=False)\n >>> regr = AdaBoostRegressor(random_state=0, n_estimators=100)\n >>> regr.fit(X, y)\n AdaBoostRegressor(n_estimators=100, random_state=0)\n >>> regr.predict([[0, 0, 0, 0]])\n array([4.7972...])\n >>> regr.score(X, y)\n 0.9771...\n \"\"\"\n \n def __init__(self, base_estimator=None, *, n_estimators=50, learning_rate=1.0, loss='linear', random_state=None):\n super().__init__(base_estimator=base_estimator, n_estimators=n_estimators, learning_rate=learning_rate, random_state=random_state)\n self.loss = loss\n self.random_state = random_state\n \n def fit(self, X, y, sample_weight=None):\n \"\"\"Build a boosted regressor from the training set (X, y).\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The training input samples. Sparse matrix can be CSC, CSR, COO,\n DOK, or LIL. COO, DOK, and LIL are converted to CSR.\n\n y : array-like of shape (n_samples,)\n The target values (real numbers).\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights. 
If None, the sample weights are initialized to\n 1 / n_samples.\n\n Returns\n -------\n self : object\n Fitted AdaBoostRegressor estimator.\n \"\"\"\n if self.loss not in ('linear', 'square', 'exponential'):\n raise ValueError(\"loss must be 'linear', 'square', or 'exponential'\")\n return super().fit(X, y, sample_weight)\n \n def _validate_estimator(self):\n \"\"\"Check the estimator and set the base_estimator_ attribute.\"\"\"\n super()._validate_estimator(default=DecisionTreeRegressor(max_depth=3))\n \n def _boost(self, iboost, X, y, sample_weight, random_state):\n \"\"\"Implement a single boost for regression\n\n Perform a single boost according to the AdaBoost.R2 algorithm and\n return the updated sample weights.\n\n Parameters\n ----------\n iboost : int\n The index of the current boost iteration.\n\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The training input samples.\n\n y : array-like of shape (n_samples,)\n The target values (class labels in classification, real numbers in\n regression).\n\n sample_weight : array-like of shape (n_samples,)\n The current sample weights.\n\n random_state : RandomState\n The RandomState instance used if the base estimator accepts a\n `random_state` attribute.\n Controls also the bootstrap of the weights used to train the weak\n learner.\n replacement.\n\n Returns\n -------\n sample_weight : array-like of shape (n_samples,) or None\n The reweighted sample weights.\n If None then boosting has terminated early.\n\n estimator_weight : float\n The weight for the current boost.\n If None then boosting has terminated early.\n\n estimator_error : float\n The regression error for the current boost.\n If None then boosting has terminated early.\n \"\"\"\n estimator = self._make_estimator(random_state=random_state)\n bootstrap_idx = random_state.choice(np.arange(_num_samples(X)), size=_num_samples(X), replace=True, p=sample_weight)\n X_ = _safe_indexing(X, bootstrap_idx)\n y_ = _safe_indexing(y, bootstrap_idx)\n estimator.fit(X_, y_)\n y_predict = estimator.predict(X)\n error_vect = np.abs(y_predict - y)\n sample_mask = sample_weight > 0\n masked_sample_weight = sample_weight[sample_mask]\n masked_error_vector = error_vect[sample_mask]\n error_max = masked_error_vector.max()\n if error_max != 0:\n masked_error_vector /= error_max\n if self.loss == 'square':\n masked_error_vector **= 2\n elif self.loss == 'exponential':\n masked_error_vector = 1.0 - np.exp(-masked_error_vector)\n estimator_error = (masked_sample_weight * masked_error_vector).sum()\n if estimator_error <= 0:\n return sample_weight, 1.0, 0.0\n elif estimator_error >= 0.5:\n if len(self.estimators_) > 1:\n self.estimators_.pop(-1)\n return None, None, None\n beta = estimator_error / (1.0 - estimator_error)\n estimator_weight = self.learning_rate * np.log(1.0 / beta)\n if not iboost == self.n_estimators - 1:\n sample_weight[sample_mask] *= np.power(beta, (1.0 - masked_error_vector) * self.learning_rate)\n return sample_weight, estimator_weight, estimator_error\n \n def _get_median_predict(self, X, limit):\n predictions = np.array([est.predict(X) for est in self.estimators_[:limit]]).T\n sorted_idx = np.argsort(predictions, axis=1)\n weight_cdf = stable_cumsum(self.estimator_weights_[sorted_idx], axis=1)\n median_or_above = weight_cdf >= 0.5 * weight_cdf[:, -1][:, np.newaxis]\n median_idx = median_or_above.argmax(axis=1)\n median_estimators = sorted_idx[np.arange(_num_samples(X)), median_idx]\n return predictions[np.arange(_num_samples(X)), median_estimators]\n \n def 
predict(self, X):\n \"\"\"Predict regression value for X.\n\n The predicted regression value of an input sample is computed\n as the weighted median prediction of the regressors in the ensemble.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The training input samples. Sparse matrix can be CSC, CSR, COO,\n DOK, or LIL. COO, DOK, and LIL are converted to CSR.\n\n Returns\n -------\n y : ndarray of shape (n_samples,)\n The predicted regression values.\n \"\"\"\n check_is_fitted(self)\n X = self._check_X(X)\n return self._get_median_predict(X, len(self.estimators_))\n \n def staged_predict(self, X):\n \"\"\"Return staged predictions for X.\n\n The predicted regression value of an input sample is computed\n as the weighted median prediction of the regressors in the ensemble.\n\n This generator method yields the ensemble prediction after each\n iteration of boosting and therefore allows monitoring, such as to\n determine the prediction on a test set after each boost.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The training input samples.\n\n Yields\n -------\n y : generator of ndarray of shape (n_samples,)\n The predicted regression values.\n \"\"\"\n check_is_fitted(self)\n X = self._check_X(X)\n for (i, _) in enumerate(self.estimators_, 1):\n yield self._get_median_predict(X, limit=i)\n" }, @@ -21708,7 +21774,7 @@ "sklearn.ensemble._weight_boosting.BaseWeightBoosting.feature_importances_@getter" ], "is_public": false, - "description": "Base class for AdaBoost estimators.\n\nWarning: This class should not be used directly. Use derived classes instead.", + "description": "Base class for AdaBoost estimators.\n\nWarning: This class should not be used directly. Use derived classes\ninstead.", "docstring": "Base class for AdaBoost estimators.\n\n Warning: This class should not be used directly. Use derived classes\n instead.\n ", "source_code": "\n\nclass BaseWeightBoosting(BaseEnsemble, metaclass=ABCMeta):\n \"\"\"Base class for AdaBoost estimators.\n\n Warning: This class should not be used directly. Use derived classes\n instead.\n \"\"\"\n \n @abstractmethod\n def __init__(self, base_estimator=None, *, n_estimators=50, estimator_params=tuple(), learning_rate=1.0, random_state=None):\n super().__init__(base_estimator=base_estimator, n_estimators=n_estimators, estimator_params=estimator_params)\n self.learning_rate = learning_rate\n self.random_state = random_state\n \n def _check_X(self, X):\n return self._validate_data(X, accept_sparse=['csr', 'csc'], ensure_2d=True, allow_nd=True, dtype=None, reset=False)\n \n def fit(self, X, y, sample_weight=None):\n \"\"\"Build a boosted classifier/regressor from the training set (X, y).\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The training input samples. Sparse matrix can be CSC, CSR, COO,\n DOK, or LIL. COO, DOK, and LIL are converted to CSR.\n\n y : array-like of shape (n_samples,)\n The target values (class labels in classification, real numbers in\n regression).\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights. 
If None, the sample weights are initialized to\n 1 / n_samples.\n\n Returns\n -------\n self : object\n \"\"\"\n if self.learning_rate <= 0:\n raise ValueError('learning_rate must be greater than zero')\n (X, y) = self._validate_data(X, y, accept_sparse=['csr', 'csc'], ensure_2d=True, allow_nd=True, dtype=None, y_numeric=is_regressor(self))\n sample_weight = _check_sample_weight(sample_weight, X, np.float64, copy=True)\n sample_weight /= sample_weight.sum()\n if np.any(sample_weight < 0):\n raise ValueError('sample_weight cannot contain negative weights')\n self._validate_estimator()\n self.estimators_ = []\n self.estimator_weights_ = np.zeros(self.n_estimators, dtype=np.float64)\n self.estimator_errors_ = np.ones(self.n_estimators, dtype=np.float64)\n random_state = check_random_state(self.random_state)\n for iboost in range(self.n_estimators):\n (sample_weight, estimator_weight, estimator_error) = self._boost(iboost, X, y, sample_weight, random_state)\n if sample_weight is None:\n break\n self.estimator_weights_[iboost] = estimator_weight\n self.estimator_errors_[iboost] = estimator_error\n if estimator_error == 0:\n break\n sample_weight_sum = np.sum(sample_weight)\n if not np.isfinite(sample_weight_sum):\n warnings.warn(f'Sample weights have reached infinite values, at iteration {iboost}, causing overflow. Iterations stopped. Try lowering the learning rate.', stacklevel=2)\n break\n if sample_weight_sum <= 0:\n break\n if iboost < self.n_estimators - 1:\n sample_weight /= sample_weight_sum\n return self\n \n @abstractmethod\n def _boost(self, iboost, X, y, sample_weight, random_state):\n \"\"\"Implement a single boost.\n\n Warning: This method needs to be overridden by subclasses.\n\n Parameters\n ----------\n iboost : int\n The index of the current boost iteration.\n\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The training input samples. Sparse matrix can be CSC, CSR, COO,\n DOK, or LIL. COO, DOK, and LIL are converted to CSR.\n\n y : array-like of shape (n_samples,)\n The target values (class labels).\n\n sample_weight : array-like of shape (n_samples,)\n The current sample weights.\n\n random_state : RandomState\n The current random number generator\n\n Returns\n -------\n sample_weight : array-like of shape (n_samples,) or None\n The reweighted sample weights.\n If None then boosting has terminated early.\n\n estimator_weight : float\n The weight for the current boost.\n If None then boosting has terminated early.\n\n error : float\n The classification error for the current boost.\n If None then boosting has terminated early.\n \"\"\"\n pass\n \n def staged_score(self, X, y, sample_weight=None):\n \"\"\"Return staged scores for X, y.\n\n This generator method yields the ensemble score after each iteration of\n boosting and therefore allows monitoring, such as to determine the\n score on a test set after each boost.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The training input samples. Sparse matrix can be CSC, CSR, COO,\n DOK, or LIL. 
COO, DOK, and LIL are converted to CSR.\n\n y : array-like of shape (n_samples,)\n Labels for X.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n Yields\n ------\n z : float\n \"\"\"\n X = self._check_X(X)\n for y_pred in self.staged_predict(X):\n if is_classifier(self):\n yield accuracy_score(y, y_pred, sample_weight=sample_weight)\n else:\n yield r2_score(y, y_pred, sample_weight=sample_weight)\n \n @property\n def feature_importances_(self):\n \"\"\"The impurity-based feature importances.\n\n The higher, the more important the feature.\n The importance of a feature is computed as the (normalized)\n total reduction of the criterion brought by that feature. It is also\n known as the Gini importance.\n\n Warning: impurity-based feature importances can be misleading for\n high cardinality features (many unique values). See\n :func:`sklearn.inspection.permutation_importance` as an alternative.\n\n Returns\n -------\n feature_importances_ : ndarray of shape (n_features,)\n The feature importances.\n \"\"\"\n if self.estimators_ is None or len(self.estimators_) == 0:\n raise ValueError('Estimator not fitted, call `fit` before `feature_importances_`.')\n try:\n norm = self.estimator_weights_.sum()\n return sum((weight * clf.feature_importances_ for (weight, clf) in zip(self.estimator_weights_, self.estimators_))) / norm\n except AttributeError as e:\n raise AttributeError('Unable to compute feature importances since base_estimator does not have a feature_importances_ attribute') from e\n" }, @@ -21721,7 +21787,7 @@ "superclasses": ["UserWarning"], "methods": [], "is_public": true, - "description": "Warning class used to notify the user of any change in the behavior.\n\n.. versionchanged:: 0.18 Moved from sklearn.base.", + "description": "Warning class used to notify the user of any change in the behavior.\n\n.. versionchanged:: 0.18\n Moved from sklearn.base.", "docstring": "Warning class used to notify the user of any change in the behavior.\n\n .. versionchanged:: 0.18\n Moved from sklearn.base.\n ", "source_code": "\n\n@deprecated('ChangedBehaviorWarning is deprecated in 0.24 and will be removed in 1.1')\nclass ChangedBehaviorWarning(UserWarning):\n \"\"\"Warning class used to notify the user of any change in the behavior.\n\n .. versionchanged:: 0.18\n Moved from sklearn.base.\n \"\"\"\n \n" }, @@ -21732,7 +21798,7 @@ "superclasses": ["UserWarning"], "methods": [], "is_public": true, - "description": "Custom warning to capture convergence problems\n\n.. versionchanged:: 0.18 Moved from sklearn.utils.", + "description": "Custom warning to capture convergence problems\n\n.. versionchanged:: 0.18\n Moved from sklearn.utils.", "docstring": "Custom warning to capture convergence problems\n\n .. versionchanged:: 0.18\n Moved from sklearn.utils.\n ", "source_code": "\n\nclass ConvergenceWarning(UserWarning):\n \"\"\"Custom warning to capture convergence problems\n\n .. versionchanged:: 0.18\n Moved from sklearn.utils.\n \"\"\"\n \n" }, @@ -21743,7 +21809,7 @@ "superclasses": ["UserWarning"], "methods": [], "is_public": true, - "description": "Warning used to notify implicit data conversions happening in the code.\n\nThis warning occurs when some input data needs to be converted or interpreted in a way that may not match the user's expectations. 
For example, this warning may occur when the user - passes an integer array to a function which expects float input and will convert the input - requests a non-copying operation, but a copy is required to meet the implementation's data-type expectations; - passes an input whose shape can be interpreted ambiguously. .. versionchanged:: 0.18 Moved from sklearn.utils.validation.", + "description": "Warning used to notify implicit data conversions happening in the code.\n\nThis warning occurs when some input data needs to be converted or\ninterpreted in a way that may not match the user's expectations.\n\nFor example, this warning may occur when the user\n - passes an integer array to a function which expects float input and\n will convert the input\n - requests a non-copying operation, but a copy is required to meet the\n implementation's data-type expectations;\n - passes an input whose shape can be interpreted ambiguously.\n\n.. versionchanged:: 0.18\n Moved from sklearn.utils.validation.", "docstring": "Warning used to notify implicit data conversions happening in the code.\n\n This warning occurs when some input data needs to be converted or\n interpreted in a way that may not match the user's expectations.\n\n For example, this warning may occur when the user\n - passes an integer array to a function which expects float input and\n will convert the input\n - requests a non-copying operation, but a copy is required to meet the\n implementation's data-type expectations;\n - passes an input whose shape can be interpreted ambiguously.\n\n .. versionchanged:: 0.18\n Moved from sklearn.utils.validation.\n ", "source_code": "\n\nclass DataConversionWarning(UserWarning):\n \"\"\"Warning used to notify implicit data conversions happening in the code.\n\n This warning occurs when some input data needs to be converted or\n interpreted in a way that may not match the user's expectations.\n\n For example, this warning may occur when the user\n - passes an integer array to a function which expects float input and\n will convert the input\n - requests a non-copying operation, but a copy is required to meet the\n implementation's data-type expectations;\n - passes an input whose shape can be interpreted ambiguously.\n\n .. versionchanged:: 0.18\n Moved from sklearn.utils.validation.\n \"\"\"\n \n" }, @@ -21754,7 +21820,7 @@ "superclasses": ["UserWarning"], "methods": [], "is_public": true, - "description": "Custom warning to notify potential issues with data dimensionality.\n\nFor example, in random projection, this warning is raised when the number of components, which quantifies the dimensionality of the target projection space, is higher than the number of features, which quantifies the dimensionality of the original source space, to imply that the dimensionality of the problem will not be reduced. .. versionchanged:: 0.18 Moved from sklearn.utils.", + "description": "Custom warning to notify potential issues with data dimensionality.\n\nFor example, in random projection, this warning is raised when the\nnumber of components, which quantifies the dimensionality of the target\nprojection space, is higher than the number of features, which quantifies\nthe dimensionality of the original source space, to imply that the\ndimensionality of the problem will not be reduced.\n\n.. 
versionchanged:: 0.18\n Moved from sklearn.utils.", "docstring": "Custom warning to notify potential issues with data dimensionality.\n\n For example, in random projection, this warning is raised when the\n number of components, which quantifies the dimensionality of the target\n projection space, is higher than the number of features, which quantifies\n the dimensionality of the original source space, to imply that the\n dimensionality of the problem will not be reduced.\n\n .. versionchanged:: 0.18\n Moved from sklearn.utils.\n ", "source_code": "\n\nclass DataDimensionalityWarning(UserWarning):\n \"\"\"Custom warning to notify potential issues with data dimensionality.\n\n For example, in random projection, this warning is raised when the\n number of components, which quantifies the dimensionality of the target\n projection space, is higher than the number of features, which quantifies\n the dimensionality of the original source space, to imply that the\n dimensionality of the problem will not be reduced.\n\n .. versionchanged:: 0.18\n Moved from sklearn.utils.\n \"\"\"\n \n" }, @@ -21765,7 +21831,7 @@ "superclasses": ["UserWarning"], "methods": [], "is_public": true, - "description": "Warning used to notify the user of inefficient computation.\n\nThis warning notifies the user that the efficiency may not be optimal due to some reason which may be included as a part of the warning message. This may be subclassed into a more specific Warning class. .. versionadded:: 0.18", + "description": "Warning used to notify the user of inefficient computation.\n\nThis warning notifies the user that the efficiency may not be optimal due\nto some reason which may be included as a part of the warning message.\nThis may be subclassed into a more specific Warning class.\n\n.. versionadded:: 0.18", "docstring": "Warning used to notify the user of inefficient computation.\n\n This warning notifies the user that the efficiency may not be optimal due\n to some reason which may be included as a part of the warning message.\n This may be subclassed into a more specific Warning class.\n\n .. versionadded:: 0.18\n ", "source_code": "\n\nclass EfficiencyWarning(UserWarning):\n \"\"\"Warning used to notify the user of inefficient computation.\n\n This warning notifies the user that the efficiency may not be optimal due\n to some reason which may be included as a part of the warning message.\n This may be subclassed into a more specific Warning class.\n\n .. versionadded:: 0.18\n \"\"\"\n \n" }, @@ -21776,7 +21842,7 @@ "superclasses": ["RuntimeWarning"], "methods": [], "is_public": true, - "description": "Warning class used if there is an error while fitting the estimator.\n\nThis Warning is used in meta estimators GridSearchCV and RandomizedSearchCV and the cross-validation helper function cross_val_score to warn when there is an error while fitting the estimator. .. versionchanged:: 0.18 Moved from sklearn.cross_validation.", + "description": "Warning class used if there is an error while fitting the estimator.\n\nThis Warning is used in meta estimators GridSearchCV and RandomizedSearchCV\nand the cross-validation helper function cross_val_score to warn when there\nis an error while fitting the estimator.\n\n.. 
versionchanged:: 0.18\n Moved from sklearn.cross_validation.", "docstring": "Warning class used if there is an error while fitting the estimator.\n\n This Warning is used in meta estimators GridSearchCV and RandomizedSearchCV\n and the cross-validation helper function cross_val_score to warn when there\n is an error while fitting the estimator.\n\n .. versionchanged:: 0.18\n Moved from sklearn.cross_validation.\n ", "source_code": "\n\nclass FitFailedWarning(RuntimeWarning):\n \"\"\"Warning class used if there is an error while fitting the estimator.\n\n This Warning is used in meta estimators GridSearchCV and RandomizedSearchCV\n and the cross-validation helper function cross_val_score to warn when there\n is an error while fitting the estimator.\n\n .. versionchanged:: 0.18\n Moved from sklearn.cross_validation.\n \"\"\"\n \n" }, @@ -21789,7 +21855,7 @@ "superclasses": ["EfficiencyWarning"], "methods": [], "is_public": true, - "description": "Warning used when the dot operation does not use BLAS.\n\nThis warning is used to notify the user that BLAS was not used for dot operation and hence the efficiency may be affected. .. versionchanged:: 0.18 Moved from sklearn.utils.validation, extends EfficiencyWarning.", + "description": "Warning used when the dot operation does not use BLAS.\n\nThis warning is used to notify the user that BLAS was not used for dot\noperation and hence the efficiency may be affected.\n\n.. versionchanged:: 0.18\n Moved from sklearn.utils.validation, extends EfficiencyWarning.", "docstring": "Warning used when the dot operation does not use BLAS.\n\n This warning is used to notify the user that BLAS was not used for dot\n operation and hence the efficiency may be affected.\n\n .. versionchanged:: 0.18\n Moved from sklearn.utils.validation, extends EfficiencyWarning.\n ", "source_code": "\n\n@deprecated('NonBLASDotWarning is deprecated in 0.24 and will be removed in 1.1')\nclass NonBLASDotWarning(EfficiencyWarning):\n \"\"\"Warning used when the dot operation does not use BLAS.\n\n This warning is used to notify the user that BLAS was not used for dot\n operation and hence the efficiency may be affected.\n\n .. versionchanged:: 0.18\n Moved from sklearn.utils.validation, extends EfficiencyWarning.\n \"\"\"\n \n" }, @@ -21800,7 +21866,7 @@ "superclasses": ["ValueError", "AttributeError"], "methods": [], "is_public": true, - "description": "Exception class to raise if estimator is used before fitting.\n\nThis class inherits from both ValueError and AttributeError to help with exception handling and backward compatibility.", + "description": "Exception class to raise if estimator is used before fitting.\n\nThis class inherits from both ValueError and AttributeError to help with\nexception handling and backward compatibility.", "docstring": "Exception class to raise if estimator is used before fitting.\n\n This class inherits from both ValueError and AttributeError to help with\n exception handling and backward compatibility.\n\n Examples\n --------\n >>> from sklearn.svm import LinearSVC\n >>> from sklearn.exceptions import NotFittedError\n >>> try:\n ... LinearSVC().predict([[1, 2], [2, 3], [3, 4]])\n ... except NotFittedError as e:\n ... print(repr(e))\n NotFittedError(\"This LinearSVC instance is not fitted yet. Call 'fit' with\n appropriate arguments before using this estimator.\"...)\n\n .. 
versionchanged:: 0.18\n Moved from sklearn.utils.validation.\n ", "source_code": "\n\nclass NotFittedError(ValueError, AttributeError):\n \"\"\"Exception class to raise if estimator is used before fitting.\n\n This class inherits from both ValueError and AttributeError to help with\n exception handling and backward compatibility.\n\n Examples\n --------\n >>> from sklearn.svm import LinearSVC\n >>> from sklearn.exceptions import NotFittedError\n >>> try:\n ... LinearSVC().predict([[1, 2], [2, 3], [3, 4]])\n ... except NotFittedError as e:\n ... print(repr(e))\n NotFittedError(\"This LinearSVC instance is not fitted yet. Call 'fit' with\n appropriate arguments before using this estimator.\"...)\n\n .. versionchanged:: 0.18\n Moved from sklearn.utils.validation.\n \"\"\"\n \n" }, @@ -21811,7 +21877,7 @@ "superclasses": ["UserWarning"], "methods": [], "is_public": true, - "description": "Warning raised when the eigenvalues of a PSD matrix have issues\n\nThis warning is typically raised by ``_check_psd_eigenvalues`` when the eigenvalues of a positive semidefinite (PSD) matrix such as a gram matrix (kernel) present significant negative eigenvalues, or bad conditioning i.e. very small non-zero eigenvalues compared to the largest eigenvalue. .. versionadded:: 0.22", + "description": "Warning raised when the eigenvalues of a PSD matrix have issues\n\nThis warning is typically raised by ``_check_psd_eigenvalues`` when the\neigenvalues of a positive semidefinite (PSD) matrix such as a gram matrix\n(kernel) present significant negative eigenvalues, or bad conditioning i.e.\nvery small non-zero eigenvalues compared to the largest eigenvalue.\n\n.. versionadded:: 0.22", "docstring": "Warning raised when the eigenvalues of a PSD matrix have issues\n\n This warning is typically raised by ``_check_psd_eigenvalues`` when the\n eigenvalues of a positive semidefinite (PSD) matrix such as a gram matrix\n (kernel) present significant negative eigenvalues, or bad conditioning i.e.\n very small non-zero eigenvalues compared to the largest eigenvalue.\n\n .. versionadded:: 0.22\n ", "source_code": "\n\nclass PositiveSpectrumWarning(UserWarning):\n \"\"\"Warning raised when the eigenvalues of a PSD matrix have issues\n\n This warning is typically raised by ``_check_psd_eigenvalues`` when the\n eigenvalues of a positive semidefinite (PSD) matrix such as a gram matrix\n (kernel) present significant negative eigenvalues, or bad conditioning i.e.\n very small non-zero eigenvalues compared to the largest eigenvalue.\n\n .. versionadded:: 0.22\n \"\"\"\n \n" }, @@ -21822,7 +21888,7 @@ "superclasses": ["UserWarning"], "methods": [], "is_public": true, - "description": "Warning class used to notify the user of a test that was skipped.\n\nFor example, one of the estimator checks requires a pandas import. 
If the pandas package cannot be imported, the test will be skipped rather than register as a failure.", + "description": "Warning class used to notify the user of a test that was skipped.\n\nFor example, one of the estimator checks requires a pandas import.\nIf the pandas package cannot be imported, the test will be skipped rather\nthan register as a failure.", "docstring": "Warning class used to notify the user of a test that was skipped.\n\n For example, one of the estimator checks requires a pandas import.\n If the pandas package cannot be imported, the test will be skipped rather\n than register as a failure.\n ", "source_code": "\n\nclass SkipTestWarning(UserWarning):\n \"\"\"Warning class used to notify the user of a test that was skipped.\n\n For example, one of the estimator checks requires a pandas import.\n If the pandas package cannot be imported, the test will be skipped rather\n than register as a failure.\n \"\"\"\n \n" }, @@ -21833,7 +21899,7 @@ "superclasses": ["UserWarning"], "methods": [], "is_public": true, - "description": "Warning used when the metric is invalid\n\n.. versionchanged:: 0.18 Moved from sklearn.base.", + "description": "Warning used when the metric is invalid\n\n.. versionchanged:: 0.18\n Moved from sklearn.base.", "docstring": "Warning used when the metric is invalid\n\n .. versionchanged:: 0.18\n Moved from sklearn.base.\n ", "source_code": "\n\nclass UndefinedMetricWarning(UserWarning):\n \"\"\"Warning used when the metric is invalid\n\n .. versionchanged:: 0.18\n Moved from sklearn.base.\n \"\"\"\n \n" }, @@ -21915,7 +21981,7 @@ "superclasses": ["ArffException"], "methods": ["sklearn.externals._arff.BadAttributeName.__init__"], "is_public": false, - "description": "Error raised when an attribute name is provided twice the attribute declaration.", + "description": "Error raised when an attribute name is provided twice the attribute\ndeclaration.", "docstring": "Error raised when an attribute name is provided twice the attribute\n declaration.", "source_code": "\n\nclass BadAttributeName(ArffException):\n \"\"\"Error raised when an attribute name is provided twice the attribute\n declaration.\"\"\"\n \n def __init__(self, value, value2):\n super().__init__()\n self.message = 'Bad @ATTRIBUTE name %s at line' % value + ' %d, this name is already in use in line' + ' %d.' 
% value2\n" }, @@ -21926,7 +21992,7 @@ "superclasses": ["ArffException"], "methods": [], "is_public": false, - "description": "Error raised when some invalid type is provided into the attribute declaration.", + "description": "Error raised when some invalid type is provided into the attribute\ndeclaration.", "docstring": "Error raised when some invalid type is provided into the attribute\n declaration.", "source_code": "\n\nclass BadAttributeType(ArffException):\n \"\"\"Error raised when some invalid type is provided into the attribute\n declaration.\"\"\"\n message = 'Bad @ATTRIBUTE type, at line %d.'\n" }, @@ -21972,7 +22038,7 @@ "superclasses": ["ArffException"], "methods": ["sklearn.externals._arff.BadNominalValue.__init__"], "is_public": false, - "description": "Error raised when a value in used in some data instance but is not declared into it respective attribute declaration.", + "description": "Error raised when a value in used in some data instance but is not\ndeclared into it respective attribute declaration.", "docstring": "Error raised when a value in used in some data instance but is not\n declared into it respective attribute declaration.", "source_code": "\n\nclass BadNominalValue(ArffException):\n \"\"\"Error raised when a value in used in some data instance but is not\n declared into it respective attribute declaration.\"\"\"\n \n def __init__(self, value):\n super().__init__()\n self.message = 'Data value %s not found in nominal declaration, ' % value + 'at line %d.'\n" }, @@ -21983,7 +22049,7 @@ "superclasses": ["ArffException"], "methods": [], "is_public": false, - "description": "Error raised when and invalid numerical value is used in some data instance.", + "description": "Error raised when and invalid numerical value is used in some data\ninstance.", "docstring": "Error raised when and invalid numerical value is used in some data\n instance.", "source_code": "\n\nclass BadNumericalValue(ArffException):\n \"\"\"Error raised when and invalid numerical value is used in some data\n instance.\"\"\"\n message = 'Invalid numerical value, at line %d.'\n" }, @@ -21997,7 +22063,7 @@ "sklearn.externals._arff.BadObject.__str__" ], "is_public": false, - "description": "Error raised when the object representing the ARFF file has something wrong.", + "description": "Error raised when the object representing the ARFF file has something\nwrong.", "docstring": "Error raised when the object representing the ARFF file has something\n wrong.", "source_code": "\n\nclass BadObject(ArffException):\n \"\"\"Error raised when the object representing the ARFF file has something\n wrong.\"\"\"\n \n def __init__(self, msg='Invalid object.'):\n self.msg = msg\n \n def __str__(self):\n return '%s' % self.msg\n" }, @@ -22059,7 +22125,7 @@ "sklearn.externals._arff.DenseGeneratorData.encode_data" ], "is_public": false, - "description": "Internal helper class to allow for different matrix types without making the code a huge collection of if statements.", + "description": "Internal helper class to allow for different matrix types without\nmaking the code a huge collection of if statements.", "docstring": "Internal helper class to allow for different matrix types without\n making the code a huge collection of if statements.", "source_code": "\n\nclass DenseGeneratorData:\n \"\"\"Internal helper class to allow for different matrix types without\n making the code a huge collection of if statements.\"\"\"\n \n def decode_rows(self, stream, conversors):\n for row in stream:\n values = _parse_values(row)\n 
if isinstance(values, dict):\n if values and max(values) >= len(conversors):\n raise BadDataFormat(row)\n values = [values[i] if i in values else 0 for i in range(len(conversors))]\n elif len(values) != len(conversors):\n raise BadDataFormat(row)\n yield self._decode_values(values, conversors)\n \n @staticmethod\n def _decode_values(values, conversors):\n try:\n values = [None if value is None else conversor(value) for (conversor, value) in zip(conversors, values)]\n except ValueError as exc:\n if 'float: ' in str(exc):\n raise BadNumericalValue()\n return values\n \n def encode_data(self, data, attributes):\n \"\"\"(INTERNAL) Encodes a line of data.\n\n Data instances follow the csv format, i.e, attribute values are\n delimited by commas. After converted from csv.\n\n :param data: a list of values.\n :param attributes: a list of attributes. Used to check if data is valid.\n :return: a string with the encoded data line.\n \"\"\"\n current_row = 0\n for inst in data:\n if len(inst) != len(attributes):\n raise BadObject('Instance %d has %d attributes, expected %d' % (current_row, len(inst), len(attributes)))\n new_data = []\n for value in inst:\n if value is None or value == '' or value != value:\n s = '?'\n else:\n s = encode_string(str(value))\n new_data.append(s)\n current_row += 1\n yield ','.join(new_data)\n" }, @@ -22273,7 +22339,7 @@ "sklearn.feature_extraction._dict_vectorizer.DictVectorizer._more_tags" ], "is_public": true, - "description": "Transforms lists of feature-value mappings to vectors.\n\nThis transformer turns lists of mappings (dict-like objects) of feature names to feature values into Numpy arrays or scipy.sparse matrices for use with scikit-learn estimators. When feature values are strings, this transformer will do a binary one-hot (aka one-of-K) coding: one boolean-valued feature is constructed for each of the possible string values that the feature can take on. For instance, a feature \"f\" that can take on the values \"ham\" and \"spam\" will become two features in the output, one signifying \"f=ham\", the other \"f=spam\". If a feature value is a sequence or set of strings, this transformer will iterate over the values and will count the occurrences of each string value. However, note that this transformer will only do a binary one-hot encoding when feature values are of type string. If categorical features are represented as numeric values such as int or iterables of strings, the DictVectorizer can be followed by :class:`~sklearn.preprocessing.OneHotEncoder` to complete binary one-hot encoding. Features that do not occur in a sample (mapping) will have a zero value in the resulting array/matrix. Read more in the :ref:`User Guide `.", + "description": "Transforms lists of feature-value mappings to vectors.\n\nThis transformer turns lists of mappings (dict-like objects) of feature\nnames to feature values into Numpy arrays or scipy.sparse matrices for use\nwith scikit-learn estimators.\n\nWhen feature values are strings, this transformer will do a binary one-hot\n(aka one-of-K) coding: one boolean-valued feature is constructed for each\nof the possible string values that the feature can take on. 
For instance,\na feature \"f\" that can take on the values \"ham\" and \"spam\" will become two\nfeatures in the output, one signifying \"f=ham\", the other \"f=spam\".\n\nIf a feature value is a sequence or set of strings, this transformer\nwill iterate over the values and will count the occurrences of each string\nvalue.\n\nHowever, note that this transformer will only do a binary one-hot encoding\nwhen feature values are of type string. If categorical features are\nrepresented as numeric values such as int or iterables of strings, the\nDictVectorizer can be followed by\n:class:`~sklearn.preprocessing.OneHotEncoder` to complete\nbinary one-hot encoding.\n\nFeatures that do not occur in a sample (mapping) will have a zero value\nin the resulting array/matrix.\n\nRead more in the :ref:`User Guide `.", "docstring": "Transforms lists of feature-value mappings to vectors.\n\n This transformer turns lists of mappings (dict-like objects) of feature\n names to feature values into Numpy arrays or scipy.sparse matrices for use\n with scikit-learn estimators.\n\n When feature values are strings, this transformer will do a binary one-hot\n (aka one-of-K) coding: one boolean-valued feature is constructed for each\n of the possible string values that the feature can take on. For instance,\n a feature \"f\" that can take on the values \"ham\" and \"spam\" will become two\n features in the output, one signifying \"f=ham\", the other \"f=spam\".\n\n If a feature value is a sequence or set of strings, this transformer\n will iterate over the values and will count the occurrences of each string\n value.\n\n However, note that this transformer will only do a binary one-hot encoding\n when feature values are of type string. If categorical features are\n represented as numeric values such as int or iterables of strings, the\n DictVectorizer can be followed by\n :class:`~sklearn.preprocessing.OneHotEncoder` to complete\n binary one-hot encoding.\n\n Features that do not occur in a sample (mapping) will have a zero value\n in the resulting array/matrix.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n dtype : dtype, default=np.float64\n The type of feature values. Passed to Numpy array/scipy.sparse matrix\n constructors as the dtype argument.\n separator : str, default=\"=\"\n Separator string used when constructing new features for one-hot\n coding.\n sparse : bool, default=True\n Whether transform should produce scipy.sparse matrices.\n sort : bool, default=True\n Whether ``feature_names_`` and ``vocabulary_`` should be\n sorted when fitting.\n\n Attributes\n ----------\n vocabulary_ : dict\n A dictionary mapping feature names to feature indices.\n\n feature_names_ : list\n A list of length n_features containing the feature names (e.g., \"f=ham\"\n and \"f=spam\").\n\n See Also\n --------\n FeatureHasher : Performs vectorization using only a hash function.\n sklearn.preprocessing.OrdinalEncoder : Handles nominal/categorical\n features encoded as columns of arbitrary data types.\n\n Examples\n --------\n >>> from sklearn.feature_extraction import DictVectorizer\n >>> v = DictVectorizer(sparse=False)\n >>> D = [{'foo': 1, 'bar': 2}, {'foo': 3, 'baz': 1}]\n >>> X = v.fit_transform(D)\n >>> X\n array([[2., 0., 1.],\n [0., 1., 3.]])\n >>> v.inverse_transform(X) == [{'bar': 2.0, 'foo': 1.0},\n ... 
{'baz': 1.0, 'foo': 3.0}]\n True\n >>> v.transform({'foo': 4, 'unseen_feature': 3})\n array([[0., 0., 4.]])\n ", "source_code": "\n\nclass DictVectorizer(TransformerMixin, BaseEstimator):\n \"\"\"Transforms lists of feature-value mappings to vectors.\n\n This transformer turns lists of mappings (dict-like objects) of feature\n names to feature values into Numpy arrays or scipy.sparse matrices for use\n with scikit-learn estimators.\n\n When feature values are strings, this transformer will do a binary one-hot\n (aka one-of-K) coding: one boolean-valued feature is constructed for each\n of the possible string values that the feature can take on. For instance,\n a feature \"f\" that can take on the values \"ham\" and \"spam\" will become two\n features in the output, one signifying \"f=ham\", the other \"f=spam\".\n\n If a feature value is a sequence or set of strings, this transformer\n will iterate over the values and will count the occurrences of each string\n value.\n\n However, note that this transformer will only do a binary one-hot encoding\n when feature values are of type string. If categorical features are\n represented as numeric values such as int or iterables of strings, the\n DictVectorizer can be followed by\n :class:`~sklearn.preprocessing.OneHotEncoder` to complete\n binary one-hot encoding.\n\n Features that do not occur in a sample (mapping) will have a zero value\n in the resulting array/matrix.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n dtype : dtype, default=np.float64\n The type of feature values. Passed to Numpy array/scipy.sparse matrix\n constructors as the dtype argument.\n separator : str, default=\"=\"\n Separator string used when constructing new features for one-hot\n coding.\n sparse : bool, default=True\n Whether transform should produce scipy.sparse matrices.\n sort : bool, default=True\n Whether ``feature_names_`` and ``vocabulary_`` should be\n sorted when fitting.\n\n Attributes\n ----------\n vocabulary_ : dict\n A dictionary mapping feature names to feature indices.\n\n feature_names_ : list\n A list of length n_features containing the feature names (e.g., \"f=ham\"\n and \"f=spam\").\n\n See Also\n --------\n FeatureHasher : Performs vectorization using only a hash function.\n sklearn.preprocessing.OrdinalEncoder : Handles nominal/categorical\n features encoded as columns of arbitrary data types.\n\n Examples\n --------\n >>> from sklearn.feature_extraction import DictVectorizer\n >>> v = DictVectorizer(sparse=False)\n >>> D = [{'foo': 1, 'bar': 2}, {'foo': 3, 'baz': 1}]\n >>> X = v.fit_transform(D)\n >>> X\n array([[2., 0., 1.],\n [0., 1., 3.]])\n >>> v.inverse_transform(X) == [{'bar': 2.0, 'foo': 1.0},\n ... {'baz': 1.0, 'foo': 3.0}]\n True\n >>> v.transform({'foo': 4, 'unseen_feature': 3})\n array([[0., 0., 4.]])\n \"\"\"\n \n def __init__(self, *, dtype=np.float64, separator='=', sparse=True, sort=True):\n self.dtype = dtype\n self.separator = separator\n self.sparse = sparse\n self.sort = sort\n \n def _add_iterable_element(self, f, v, feature_names, vocab, *, fitting=True, transforming=False, indices=None, values=None):\n \"\"\"Add feature names for iterable of strings\"\"\"\n for vv in v:\n if isinstance(vv, str):\n feature_name = '%s%s%s' % (f, self.separator, vv)\n vv = 1\n else:\n raise TypeError(f'Unsupported type {type(vv)} in iterable value. 
Only iterables of string are supported.')\n if fitting and feature_name not in vocab:\n vocab[feature_name] = len(feature_names)\n feature_names.append(feature_name)\n if transforming and feature_name in vocab:\n indices.append(vocab[feature_name])\n values.append(self.dtype(vv))\n \n def fit(self, X, y=None):\n \"\"\"Learn a list of feature name -> indices mappings.\n\n Parameters\n ----------\n X : Mapping or iterable over Mappings\n Dict(s) or Mapping(s) from feature names (arbitrary Python\n objects) to feature values (strings or convertible to dtype).\n\n .. versionchanged:: 0.24\n Accepts multiple string values for one categorical feature.\n\n y : (ignored)\n Ignored parameter.\n\n Returns\n -------\n self : object\n DictVectorizer class instance.\n \"\"\"\n feature_names = []\n vocab = {}\n for x in X:\n for (f, v) in x.items():\n if isinstance(v, str):\n feature_name = '%s%s%s' % (f, self.separator, v)\n v = 1\n elif isinstance(v, Number) or v is None:\n feature_name = f\n elif isinstance(v, Mapping):\n raise TypeError(f'Unsupported value type {type(v)} for {f}: {v}.\\nMapping objects are not supported.')\n elif isinstance(v, Iterable):\n feature_name = None\n self._add_iterable_element(f, v, feature_names, vocab)\n if feature_name is not None:\n if feature_name not in vocab:\n vocab[feature_name] = len(feature_names)\n feature_names.append(feature_name)\n if self.sort:\n feature_names.sort()\n vocab = {f: i for (i, f) in enumerate(feature_names)}\n self.feature_names_ = feature_names\n self.vocabulary_ = vocab\n return self\n \n def _transform(self, X, fitting):\n assert array('i').itemsize == 4, 'sizeof(int) != 4 on your platform; please report this at https://github.com/scikit-learn/scikit-learn/issues and include the output from platform.platform() in your bug report'\n dtype = self.dtype\n if fitting:\n feature_names = []\n vocab = {}\n else:\n feature_names = self.feature_names_\n vocab = self.vocabulary_\n transforming = True\n X = [X] if isinstance(X, Mapping) else X\n indices = array('i')\n indptr = [0]\n values = []\n for x in X:\n for (f, v) in x.items():\n if isinstance(v, str):\n feature_name = '%s%s%s' % (f, self.separator, v)\n v = 1\n elif isinstance(v, Number) or v is None:\n feature_name = f\n elif not isinstance(v, Mapping) and isinstance(v, Iterable):\n feature_name = None\n self._add_iterable_element(f, v, feature_names, vocab, fitting=fitting, transforming=transforming, indices=indices, values=values)\n else:\n raise TypeError(f'Unsupported value Type {type(v)} for {f}: {v}.\\n{type(v)} objects are not supported.')\n if feature_name is not None:\n if fitting and feature_name not in vocab:\n vocab[feature_name] = len(feature_names)\n feature_names.append(feature_name)\n if feature_name in vocab:\n indices.append(vocab[feature_name])\n values.append(self.dtype(v))\n indptr.append(len(indices))\n if len(indptr) == 1:\n raise ValueError('Sample sequence X is empty.')\n indices = np.frombuffer(indices, dtype=np.intc)\n shape = (len(indptr) - 1, len(vocab))\n result_matrix = sp.csr_matrix((values, indices, indptr), shape=shape, dtype=dtype)\n if fitting and self.sort:\n feature_names.sort()\n map_index = np.empty(len(feature_names), dtype=np.int32)\n for (new_val, f) in enumerate(feature_names):\n map_index[new_val] = vocab[f]\n vocab[f] = new_val\n result_matrix = result_matrix[:, map_index]\n if self.sparse:\n result_matrix.sort_indices()\n else:\n result_matrix = result_matrix.toarray()\n if fitting:\n self.feature_names_ = feature_names\n 
self.vocabulary_ = vocab\n return result_matrix\n \n def fit_transform(self, X, y=None):\n \"\"\"Learn a list of feature name -> indices mappings and transform X.\n\n Like fit(X) followed by transform(X), but does not require\n materializing X in memory.\n\n Parameters\n ----------\n X : Mapping or iterable over Mappings\n Dict(s) or Mapping(s) from feature names (arbitrary Python\n objects) to feature values (strings or convertible to dtype).\n\n .. versionchanged:: 0.24\n Accepts multiple string values for one categorical feature.\n\n y : (ignored)\n Ignored parameter.\n\n Returns\n -------\n Xa : {array, sparse matrix}\n Feature vectors; always 2-d.\n \"\"\"\n return self._transform(X, fitting=True)\n \n def inverse_transform(self, X, dict_type=dict):\n \"\"\"Transform array or sparse matrix X back to feature mappings.\n\n X must have been produced by this DictVectorizer's transform or\n fit_transform method; it may only have passed through transformers\n that preserve the number of features and their order.\n\n In the case of one-hot/one-of-K coding, the constructed feature\n names and values are returned rather than the original ones.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Sample matrix.\n dict_type : type, default=dict\n Constructor for feature mappings. Must conform to the\n collections.Mapping API.\n\n Returns\n -------\n D : list of dict_type objects of shape (n_samples,)\n Feature mappings for the samples in X.\n \"\"\"\n X = check_array(X, accept_sparse=['csr', 'csc'])\n n_samples = X.shape[0]\n names = self.feature_names_\n dicts = [dict_type() for _ in range(n_samples)]\n if sp.issparse(X):\n for (i, j) in zip(*X.nonzero()):\n dicts[i][names[j]] = X[i, j]\n else:\n for (i, d) in enumerate(dicts):\n for (j, v) in enumerate(X[i, :]):\n if v != 0:\n d[names[j]] = X[i, j]\n return dicts\n \n def transform(self, X):\n \"\"\"Transform feature->value dicts to array or sparse matrix.\n\n Named features not encountered during fit or fit_transform will be\n silently ignored.\n\n Parameters\n ----------\n X : Mapping or iterable over Mappings of shape (n_samples,)\n Dict(s) or Mapping(s) from feature names (arbitrary Python\n objects) to feature values (strings or convertible to dtype).\n\n Returns\n -------\n Xa : {array, sparse matrix}\n Feature vectors; always 2-d.\n \"\"\"\n return self._transform(X, fitting=False)\n \n @deprecated('get_feature_names is deprecated in 1.0 and will be removed in 1.2. 
Please use get_feature_names_out instead.')\n def get_feature_names(self):\n \"\"\"Return a list of feature names, ordered by their indices.\n\n If one-of-K coding is applied to categorical features, this will\n include the constructed feature names but not the original ones.\n\n Returns\n -------\n feature_names_ : list of length (n_features,)\n List containing the feature names (e.g., \"f=ham\" and \"f=spam\").\n \"\"\"\n return self.feature_names_\n \n def get_feature_names_out(self, input_features=None):\n \"\"\"Get output feature names for transformation.\n\n Parameters\n ----------\n input_features : array-like of str or None, default=None\n Not used, present here for API consistency by convention.\n\n Returns\n -------\n feature_names_out : ndarray of str objects\n Transformed feature names.\n \"\"\"\n if any((not isinstance(name, str) for name in self.feature_names_)):\n feature_names = [str(name) for name in self.feature_names_]\n else:\n feature_names = self.feature_names_\n return np.asarray(feature_names, dtype=object)\n \n def restrict(self, support, indices=False):\n \"\"\"Restrict the features to those in support using feature selection.\n\n This function modifies the estimator in-place.\n\n Parameters\n ----------\n support : array-like\n Boolean mask or list of indices (as returned by the get_support\n member of feature selectors).\n indices : bool, default=False\n Whether support is a list of indices.\n\n Returns\n -------\n self : object\n DictVectorizer class instance.\n\n Examples\n --------\n >>> from sklearn.feature_extraction import DictVectorizer\n >>> from sklearn.feature_selection import SelectKBest, chi2\n >>> v = DictVectorizer()\n >>> D = [{'foo': 1, 'bar': 2}, {'foo': 3, 'baz': 1}]\n >>> X = v.fit_transform(D)\n >>> support = SelectKBest(chi2, k=2).fit(X, [0, 1])\n >>> v.get_feature_names_out()\n array(['bar', 'baz', 'foo'], ...)\n >>> v.restrict(support.get_support())\n DictVectorizer()\n >>> v.get_feature_names_out()\n array(['bar', 'foo'], ...)\n \"\"\"\n if not indices:\n support = np.where(support)[0]\n names = self.feature_names_\n new_vocab = {}\n for i in support:\n new_vocab[names[i]] = len(new_vocab)\n self.vocabulary_ = new_vocab\n self.feature_names_ = [f for (f, i) in sorted(new_vocab.items(), key=itemgetter(1))]\n return self\n \n def _more_tags(self):\n return {'X_types': ['dict']}\n" }, @@ -22290,7 +22356,7 @@ "sklearn.feature_extraction._hash.FeatureHasher._more_tags" ], "is_public": true, - "description": "Implements feature hashing, aka the hashing trick.\n\nThis class turns sequences of symbolic feature names (strings) into scipy.sparse matrices, using a hash function to compute the matrix column corresponding to a name. The hash function employed is the signed 32-bit version of Murmurhash3. Feature names of type byte string are used as-is. Unicode strings are converted to UTF-8 first, but no Unicode normalization is done. Feature values must be (finite) numbers. This class is a low-memory alternative to DictVectorizer and CountVectorizer, intended for large-scale (online) learning and situations where memory is tight, e.g. when running prediction code on embedded devices. Read more in the :ref:`User Guide `. .. versionadded:: 0.13", + "description": "Implements feature hashing, aka the hashing trick.\n\nThis class turns sequences of symbolic feature names (strings) into\nscipy.sparse matrices, using a hash function to compute the matrix column\ncorresponding to a name. 
The hash function employed is the signed 32-bit\nversion of Murmurhash3.\n\nFeature names of type byte string are used as-is. Unicode strings are\nconverted to UTF-8 first, but no Unicode normalization is done.\nFeature values must be (finite) numbers.\n\nThis class is a low-memory alternative to DictVectorizer and\nCountVectorizer, intended for large-scale (online) learning and situations\nwhere memory is tight, e.g. when running prediction code on embedded\ndevices.\n\nRead more in the :ref:`User Guide `.\n\n.. versionadded:: 0.13", "docstring": "Implements feature hashing, aka the hashing trick.\n\n This class turns sequences of symbolic feature names (strings) into\n scipy.sparse matrices, using a hash function to compute the matrix column\n corresponding to a name. The hash function employed is the signed 32-bit\n version of Murmurhash3.\n\n Feature names of type byte string are used as-is. Unicode strings are\n converted to UTF-8 first, but no Unicode normalization is done.\n Feature values must be (finite) numbers.\n\n This class is a low-memory alternative to DictVectorizer and\n CountVectorizer, intended for large-scale (online) learning and situations\n where memory is tight, e.g. when running prediction code on embedded\n devices.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.13\n\n Parameters\n ----------\n n_features : int, default=2**20\n The number of features (columns) in the output matrices. Small numbers\n of features are likely to cause hash collisions, but large numbers\n will cause larger coefficient dimensions in linear learners.\n input_type : str, default='dict'\n Choose a string from {'dict', 'pair', 'string'}.\n Either \"dict\" (the default) to accept dictionaries over\n (feature_name, value); \"pair\" to accept pairs of (feature_name, value);\n or \"string\" to accept single strings.\n feature_name should be a string, while value should be a number.\n In the case of \"string\", a value of 1 is implied.\n The feature_name is hashed to find the appropriate column for the\n feature. The value's sign might be flipped in the output (but see\n non_negative, below).\n dtype : numpy dtype, default=np.float64\n The type of feature values. Passed to scipy.sparse matrix constructors\n as the dtype argument. Do not set this to bool, np.boolean or any\n unsigned integer type.\n alternate_sign : bool, default=True\n When True, an alternating sign is added to the features as to\n approximately conserve the inner product in the hashed space even for\n small n_features. This approach is similar to sparse random projection.\n\n .. versionchanged:: 0.19\n ``alternate_sign`` replaces the now deprecated ``non_negative``\n parameter.\n\n See Also\n --------\n DictVectorizer : Vectorizes string-valued features using a hash table.\n sklearn.preprocessing.OneHotEncoder : Handles nominal/categorical features.\n\n Examples\n --------\n >>> from sklearn.feature_extraction import FeatureHasher\n >>> h = FeatureHasher(n_features=10)\n >>> D = [{'dog': 1, 'cat':2, 'elephant':4},{'dog': 2, 'run': 5}]\n >>> f = h.transform(D)\n >>> f.toarray()\n array([[ 0., 0., -4., -1., 0., 0., 0., 0., 0., 2.],\n [ 0., 0., 0., -2., -5., 0., 0., 0., 0., 0.]])\n ", "source_code": "\n\nclass FeatureHasher(TransformerMixin, BaseEstimator):\n \"\"\"Implements feature hashing, aka the hashing trick.\n\n This class turns sequences of symbolic feature names (strings) into\n scipy.sparse matrices, using a hash function to compute the matrix column\n corresponding to a name. 
The hash function employed is the signed 32-bit\n version of Murmurhash3.\n\n Feature names of type byte string are used as-is. Unicode strings are\n converted to UTF-8 first, but no Unicode normalization is done.\n Feature values must be (finite) numbers.\n\n This class is a low-memory alternative to DictVectorizer and\n CountVectorizer, intended for large-scale (online) learning and situations\n where memory is tight, e.g. when running prediction code on embedded\n devices.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.13\n\n Parameters\n ----------\n n_features : int, default=2**20\n The number of features (columns) in the output matrices. Small numbers\n of features are likely to cause hash collisions, but large numbers\n will cause larger coefficient dimensions in linear learners.\n input_type : str, default='dict'\n Choose a string from {'dict', 'pair', 'string'}.\n Either \"dict\" (the default) to accept dictionaries over\n (feature_name, value); \"pair\" to accept pairs of (feature_name, value);\n or \"string\" to accept single strings.\n feature_name should be a string, while value should be a number.\n In the case of \"string\", a value of 1 is implied.\n The feature_name is hashed to find the appropriate column for the\n feature. The value's sign might be flipped in the output (but see\n non_negative, below).\n dtype : numpy dtype, default=np.float64\n The type of feature values. Passed to scipy.sparse matrix constructors\n as the dtype argument. Do not set this to bool, np.boolean or any\n unsigned integer type.\n alternate_sign : bool, default=True\n When True, an alternating sign is added to the features as to\n approximately conserve the inner product in the hashed space even for\n small n_features. This approach is similar to sparse random projection.\n\n .. versionchanged:: 0.19\n ``alternate_sign`` replaces the now deprecated ``non_negative``\n parameter.\n\n See Also\n --------\n DictVectorizer : Vectorizes string-valued features using a hash table.\n sklearn.preprocessing.OneHotEncoder : Handles nominal/categorical features.\n\n Examples\n --------\n >>> from sklearn.feature_extraction import FeatureHasher\n >>> h = FeatureHasher(n_features=10)\n >>> D = [{'dog': 1, 'cat':2, 'elephant':4},{'dog': 2, 'run': 5}]\n >>> f = h.transform(D)\n >>> f.toarray()\n array([[ 0., 0., -4., -1., 0., 0., 0., 0., 0., 2.],\n [ 0., 0., 0., -2., -5., 0., 0., 0., 0., 0.]])\n \"\"\"\n \n def __init__(self, n_features=2**20, *, input_type='dict', dtype=np.float64, alternate_sign=True):\n self._validate_params(n_features, input_type)\n self.dtype = dtype\n self.input_type = input_type\n self.n_features = n_features\n self.alternate_sign = alternate_sign\n \n @staticmethod\n def _validate_params(n_features, input_type):\n if not isinstance(n_features, numbers.Integral):\n raise TypeError('n_features must be integral, got %r (%s).' % (n_features, type(n_features)))\n elif n_features < 1 or n_features >= np.iinfo(np.int32).max + 1:\n raise ValueError('Invalid number of features (%d).' % n_features)\n if input_type not in ('dict', 'pair', 'string'):\n raise ValueError(\"input_type must be 'dict', 'pair' or 'string', got %r.\" % input_type)\n \n def fit(self, X=None, y=None):\n \"\"\"No-op.\n\n This method doesn't do anything. 
It exists purely for compatibility\n with the scikit-learn transformer API.\n\n Parameters\n ----------\n X : Ignored\n Not used, present here for API consistency by convention.\n\n y : Ignored\n Not used, present here for API consistency by convention.\n\n Returns\n -------\n self : object\n FeatureHasher class instance.\n \"\"\"\n self._validate_params(self.n_features, self.input_type)\n return self\n \n def transform(self, raw_X):\n \"\"\"Transform a sequence of instances to a scipy.sparse matrix.\n\n Parameters\n ----------\n raw_X : iterable over iterable over raw features, length = n_samples\n Samples. Each sample must be iterable an (e.g., a list or tuple)\n containing/generating feature names (and optionally values, see\n the input_type constructor argument) which will be hashed.\n raw_X need not support the len function, so it can be the result\n of a generator; n_samples is determined on the fly.\n\n Returns\n -------\n X : sparse matrix of shape (n_samples, n_features)\n Feature matrix, for use with estimators or further transformers.\n \"\"\"\n raw_X = iter(raw_X)\n if self.input_type == 'dict':\n raw_X = (_iteritems(d) for d in raw_X)\n elif self.input_type == 'string':\n raw_X = (((f, 1) for f in x) for x in raw_X)\n (indices, indptr, values) = _hashing_transform(raw_X, self.n_features, self.dtype, self.alternate_sign, seed=0)\n n_samples = indptr.shape[0] - 1\n if n_samples == 0:\n raise ValueError('Cannot vectorize empty sequence.')\n X = sp.csr_matrix((values, indices, indptr), dtype=self.dtype, shape=(n_samples, self.n_features))\n X.sum_duplicates()\n return X\n \n def _more_tags(self):\n return {'X_types': [self.input_type]}\n" }, @@ -22306,7 +22372,7 @@ "sklearn.feature_extraction.image.PatchExtractor._more_tags" ], "is_public": true, - "description": "Extracts patches from a collection of images.\n\nRead more in the :ref:`User Guide `. .. versionadded:: 0.9", + "description": "Extracts patches from a collection of images.\n\nRead more in the :ref:`User Guide `.\n\n.. versionadded:: 0.9", "docstring": "Extracts patches from a collection of images.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.9\n\n Parameters\n ----------\n patch_size : tuple of int (patch_height, patch_width), default=None\n The dimensions of one patch.\n\n max_patches : int or float, default=None\n The maximum number of patches per image to extract. If `max_patches` is\n a float in (0, 1), it is taken to mean a proportion of the total number\n of patches.\n\n random_state : int, RandomState instance, default=None\n Determines the random number generator used for random sampling when\n `max_patches is not None`. Use an int to make the randomness\n deterministic.\n See :term:`Glossary `.\n\n See Also\n --------\n reconstruct_from_patches_2d : Reconstruct image from all of its patches.\n\n Examples\n --------\n >>> from sklearn.datasets import load_sample_images\n >>> from sklearn.feature_extraction import image\n >>> # Use the array data from the second image in this dataset:\n >>> X = load_sample_images().images[1]\n >>> print('Image shape: {}'.format(X.shape))\n Image shape: (427, 640, 3)\n >>> pe = image.PatchExtractor(patch_size=(2, 2))\n >>> pe_fit = pe.fit(X)\n >>> pe_trans = pe.transform(X)\n >>> print('Patches shape: {}'.format(pe_trans.shape))\n Patches shape: (545706, 2, 2)\n ", "source_code": "\n\nclass PatchExtractor(BaseEstimator):\n \"\"\"Extracts patches from a collection of images.\n\n Read more in the :ref:`User Guide `.\n\n .. 
versionadded:: 0.9\n\n Parameters\n ----------\n patch_size : tuple of int (patch_height, patch_width), default=None\n The dimensions of one patch.\n\n max_patches : int or float, default=None\n The maximum number of patches per image to extract. If `max_patches` is\n a float in (0, 1), it is taken to mean a proportion of the total number\n of patches.\n\n random_state : int, RandomState instance, default=None\n Determines the random number generator used for random sampling when\n `max_patches is not None`. Use an int to make the randomness\n deterministic.\n See :term:`Glossary `.\n\n See Also\n --------\n reconstruct_from_patches_2d : Reconstruct image from all of its patches.\n\n Examples\n --------\n >>> from sklearn.datasets import load_sample_images\n >>> from sklearn.feature_extraction import image\n >>> # Use the array data from the second image in this dataset:\n >>> X = load_sample_images().images[1]\n >>> print('Image shape: {}'.format(X.shape))\n Image shape: (427, 640, 3)\n >>> pe = image.PatchExtractor(patch_size=(2, 2))\n >>> pe_fit = pe.fit(X)\n >>> pe_trans = pe.transform(X)\n >>> print('Patches shape: {}'.format(pe_trans.shape))\n Patches shape: (545706, 2, 2)\n \"\"\"\n \n def __init__(self, *, patch_size=None, max_patches=None, random_state=None):\n self.patch_size = patch_size\n self.max_patches = max_patches\n self.random_state = random_state\n \n def fit(self, X, y=None):\n \"\"\"Do nothing and return the estimator unchanged.\n\n This method is just there to implement the usual API and hence\n work in pipelines.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training data.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n Returns\n -------\n self : object\n Returns the instance itself.\n \"\"\"\n return self\n \n def transform(self, X):\n \"\"\"Transform the image samples in `X` into a matrix of patch data.\n\n Parameters\n ----------\n X : ndarray of shape (n_samples, image_height, image_width) or (n_samples, image_height, image_width, n_channels)\n Array of images from which to extract patches. 
For color images,\n the last dimension specifies the channel: a RGB image would have\n `n_channels=3`.\n\n Returns\n -------\n patches : array of shape (n_patches, patch_height, patch_width) or (n_patches, patch_height, patch_width, n_channels)\n The collection of patches extracted from the images, where\n `n_patches` is either `n_samples * max_patches` or the total\n number of patches that can be extracted.\n \"\"\"\n self.random_state = check_random_state(self.random_state)\n (n_images, i_h, i_w) = X.shape[:3]\n X = np.reshape(X, (n_images, i_h, i_w, -1))\n n_channels = X.shape[-1]\n if self.patch_size is None:\n patch_size = (i_h // 10, i_w // 10)\n else:\n patch_size = self.patch_size\n (p_h, p_w) = patch_size\n n_patches = _compute_n_patches(i_h, i_w, p_h, p_w, self.max_patches)\n patches_shape = (n_images * n_patches, ) + patch_size\n if n_channels > 1:\n patches_shape += (n_channels, )\n patches = np.empty(patches_shape)\n for (ii, image) in enumerate(X):\n patches[ii * n_patches:(ii + 1) * n_patches] = extract_patches_2d(image, patch_size, max_patches=self.max_patches, random_state=self.random_state)\n return patches\n \n def _more_tags(self):\n return {'X_types': ['3darray']}\n" }, @@ -22330,9 +22396,9 @@ "sklearn.feature_extraction.text.CountVectorizer._more_tags" ], "is_public": true, - "description": "Convert a collection of text documents to a matrix of token counts.\n\nThis implementation produces a sparse representation of the counts using scipy.sparse.csr_matrix. If you do not provide an a-priori dictionary and you do not use an analyzer that does some kind of feature selection then the number of features will be equal to the vocabulary size found by analyzing the data. Read more in the :ref:`User Guide `.", - "docstring": "Convert a collection of text documents to a matrix of token counts.\n\n This implementation produces a sparse representation of the counts using\n scipy.sparse.csr_matrix.\n\n If you do not provide an a-priori dictionary and you do not use an analyzer\n that does some kind of feature selection then the number of features will\n be equal to the vocabulary size found by analyzing the data.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n input : {'filename', 'file', 'content'}, default='content'\n - If `'filename'`, the sequence passed as an argument to fit is\n expected to be a list of filenames that need reading to fetch\n the raw content to analyze.\n\n - If `'file'`, the sequence items must have a 'read' method (file-like\n object) that is called to fetch the bytes in memory.\n\n - If `'content'`, the input is expected to be a sequence of items that\n can be of type string or byte.\n\n encoding : str, default='utf-8'\n If bytes or files are given to analyze, this encoding is used to\n decode.\n\n decode_error : {'strict', 'ignore', 'replace'}, default='strict'\n Instruction on what to do if a byte sequence is given to analyze that\n contains characters not of the given `encoding`. By default, it is\n 'strict', meaning that a UnicodeDecodeError will be raised. 
Other\n values are 'ignore' and 'replace'.\n\n strip_accents : {'ascii', 'unicode'}, default=None\n Remove accents and perform other character normalization\n during the preprocessing step.\n 'ascii' is a fast method that only works on characters that have\n an direct ASCII mapping.\n 'unicode' is a slightly slower method that works on any characters.\n None (default) does nothing.\n\n Both 'ascii' and 'unicode' use NFKD normalization from\n :func:`unicodedata.normalize`.\n\n lowercase : bool, default=True\n Convert all characters to lowercase before tokenizing.\n\n preprocessor : callable, default=None\n Override the preprocessing (strip_accents and lowercase) stage while\n preserving the tokenizing and n-grams generation steps.\n Only applies if ``analyzer is not callable``.\n\n tokenizer : callable, default=None\n Override the string tokenization step while preserving the\n preprocessing and n-grams generation steps.\n Only applies if ``analyzer == 'word'``.\n\n stop_words : {'english'}, list, default=None\n If 'english', a built-in stop word list for English is used.\n There are several known issues with 'english' and you should\n consider an alternative (see :ref:`stop_words`).\n\n If a list, that list is assumed to contain stop words, all of which\n will be removed from the resulting tokens.\n Only applies if ``analyzer == 'word'``.\n\n If None, no stop words will be used. max_df can be set to a value\n in the range [0.7, 1.0) to automatically detect and filter stop\n words based on intra corpus document frequency of terms.\n\n token_pattern : str, default=r\"(?u)\\\\b\\\\w\\\\w+\\\\b\"\n Regular expression denoting what constitutes a \"token\", only used\n if ``analyzer == 'word'``. The default regexp select tokens of 2\n or more alphanumeric characters (punctuation is completely ignored\n and always treated as a token separator).\n\n If there is a capturing group in token_pattern then the\n captured group content, not the entire match, becomes the token.\n At most one capturing group is permitted.\n\n ngram_range : tuple (min_n, max_n), default=(1, 1)\n The lower and upper boundary of the range of n-values for different\n word n-grams or char n-grams to be extracted. All values of n such\n such that min_n <= n <= max_n will be used. For example an\n ``ngram_range`` of ``(1, 1)`` means only unigrams, ``(1, 2)`` means\n unigrams and bigrams, and ``(2, 2)`` means only bigrams.\n Only applies if ``analyzer is not callable``.\n\n analyzer : {'word', 'char', 'char_wb'} or callable, default='word'\n Whether the feature should be made of word n-gram or character\n n-grams.\n Option 'char_wb' creates character n-grams only from text inside\n word boundaries; n-grams at the edges of words are padded with space.\n\n If a callable is passed it is used to extract the sequence of features\n out of the raw, unprocessed input.\n\n .. 
versionchanged:: 0.21\n\n Since v0.21, if ``input`` is ``filename`` or ``file``, the data is\n first read from the file and then passed to the given callable\n analyzer.\n\n max_df : float in range [0.0, 1.0] or int, default=1.0\n When building the vocabulary ignore terms that have a document\n frequency strictly higher than the given threshold (corpus-specific\n stop words).\n If float, the parameter represents a proportion of documents, integer\n absolute counts.\n This parameter is ignored if vocabulary is not None.\n\n min_df : float in range [0.0, 1.0] or int, default=1\n When building the vocabulary ignore terms that have a document\n frequency strictly lower than the given threshold. This value is also\n called cut-off in the literature.\n If float, the parameter represents a proportion of documents, integer\n absolute counts.\n This parameter is ignored if vocabulary is not None.\n\n max_features : int, default=None\n If not None, build a vocabulary that only consider the top\n max_features ordered by term frequency across the corpus.\n\n This parameter is ignored if vocabulary is not None.\n\n vocabulary : Mapping or iterable, default=None\n Either a Mapping (e.g., a dict) where keys are terms and values are\n indices in the feature matrix, or an iterable over terms. If not\n given, a vocabulary is determined from the input documents. Indices\n in the mapping should not be repeated and should not have any gap\n between 0 and the largest index.\n\n binary : bool, default=False\n If True, all non zero counts are set to 1. This is useful for discrete\n probabilistic models that model binary events rather than integer\n counts.\n\n dtype : type, default=np.int64\n Type of the matrix returned by fit_transform() or transform().\n\n Attributes\n ----------\n vocabulary_ : dict\n A mapping of terms to feature indices.\n\n fixed_vocabulary_ : bool\n True if a fixed vocabulary of term to indices mapping\n is provided by the user.\n\n stop_words_ : set\n Terms that were ignored because they either:\n\n - occurred in too many documents (`max_df`)\n - occurred in too few documents (`min_df`)\n - were cut off by feature selection (`max_features`).\n\n This is only available if no vocabulary was given.\n\n See Also\n --------\n HashingVectorizer : Convert a collection of text documents to a\n matrix of token counts.\n\n TfidfVectorizer : Convert a collection of raw documents to a matrix\n of TF-IDF features.\n\n Notes\n -----\n The ``stop_words_`` attribute can get large and increase the model size\n when pickling. This attribute is provided only for introspection and can\n be safely removed using delattr or set to None before pickling.\n\n Examples\n --------\n >>> from sklearn.feature_extraction.text import CountVectorizer\n >>> corpus = [\n ... 'This is the first document.',\n ... 'This document is the second document.',\n ... 'And this is the third one.',\n ... 'Is this the first document?',\n ... 
]\n >>> vectorizer = CountVectorizer()\n >>> X = vectorizer.fit_transform(corpus)\n >>> vectorizer.get_feature_names_out()\n array(['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third',\n 'this'], ...)\n >>> print(X.toarray())\n [[0 1 1 1 0 0 1 0 1]\n [0 2 0 1 0 1 1 0 1]\n [1 0 0 1 1 0 1 1 1]\n [0 1 1 1 0 0 1 0 1]]\n >>> vectorizer2 = CountVectorizer(analyzer='word', ngram_range=(2, 2))\n >>> X2 = vectorizer2.fit_transform(corpus)\n >>> vectorizer2.get_feature_names_out()\n array(['and this', 'document is', 'first document', 'is the', 'is this',\n 'second document', 'the first', 'the second', 'the third', 'third one',\n 'this document', 'this is', 'this the'], ...)\n >>> print(X2.toarray())\n [[0 0 1 1 0 0 1 0 0 0 0 1 0]\n [0 1 0 1 0 1 0 1 0 0 1 0 0]\n [1 0 0 1 0 0 0 0 1 1 0 1 0]\n [0 0 1 0 1 0 1 0 0 0 0 0 1]]\n ", - "source_code": "\n\nclass CountVectorizer(_VectorizerMixin, BaseEstimator):\n \"\"\"Convert a collection of text documents to a matrix of token counts.\n\n This implementation produces a sparse representation of the counts using\n scipy.sparse.csr_matrix.\n\n If you do not provide an a-priori dictionary and you do not use an analyzer\n that does some kind of feature selection then the number of features will\n be equal to the vocabulary size found by analyzing the data.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n input : {'filename', 'file', 'content'}, default='content'\n - If `'filename'`, the sequence passed as an argument to fit is\n expected to be a list of filenames that need reading to fetch\n the raw content to analyze.\n\n - If `'file'`, the sequence items must have a 'read' method (file-like\n object) that is called to fetch the bytes in memory.\n\n - If `'content'`, the input is expected to be a sequence of items that\n can be of type string or byte.\n\n encoding : str, default='utf-8'\n If bytes or files are given to analyze, this encoding is used to\n decode.\n\n decode_error : {'strict', 'ignore', 'replace'}, default='strict'\n Instruction on what to do if a byte sequence is given to analyze that\n contains characters not of the given `encoding`. By default, it is\n 'strict', meaning that a UnicodeDecodeError will be raised. 
Other\n values are 'ignore' and 'replace'.\n\n strip_accents : {'ascii', 'unicode'}, default=None\n Remove accents and perform other character normalization\n during the preprocessing step.\n 'ascii' is a fast method that only works on characters that have\n an direct ASCII mapping.\n 'unicode' is a slightly slower method that works on any characters.\n None (default) does nothing.\n\n Both 'ascii' and 'unicode' use NFKD normalization from\n :func:`unicodedata.normalize`.\n\n lowercase : bool, default=True\n Convert all characters to lowercase before tokenizing.\n\n preprocessor : callable, default=None\n Override the preprocessing (strip_accents and lowercase) stage while\n preserving the tokenizing and n-grams generation steps.\n Only applies if ``analyzer is not callable``.\n\n tokenizer : callable, default=None\n Override the string tokenization step while preserving the\n preprocessing and n-grams generation steps.\n Only applies if ``analyzer == 'word'``.\n\n stop_words : {'english'}, list, default=None\n If 'english', a built-in stop word list for English is used.\n There are several known issues with 'english' and you should\n consider an alternative (see :ref:`stop_words`).\n\n If a list, that list is assumed to contain stop words, all of which\n will be removed from the resulting tokens.\n Only applies if ``analyzer == 'word'``.\n\n If None, no stop words will be used. max_df can be set to a value\n in the range [0.7, 1.0) to automatically detect and filter stop\n words based on intra corpus document frequency of terms.\n\n token_pattern : str, default=r\"(?u)\\\\b\\\\w\\\\w+\\\\b\"\n Regular expression denoting what constitutes a \"token\", only used\n if ``analyzer == 'word'``. The default regexp select tokens of 2\n or more alphanumeric characters (punctuation is completely ignored\n and always treated as a token separator).\n\n If there is a capturing group in token_pattern then the\n captured group content, not the entire match, becomes the token.\n At most one capturing group is permitted.\n\n ngram_range : tuple (min_n, max_n), default=(1, 1)\n The lower and upper boundary of the range of n-values for different\n word n-grams or char n-grams to be extracted. All values of n such\n such that min_n <= n <= max_n will be used. For example an\n ``ngram_range`` of ``(1, 1)`` means only unigrams, ``(1, 2)`` means\n unigrams and bigrams, and ``(2, 2)`` means only bigrams.\n Only applies if ``analyzer is not callable``.\n\n analyzer : {'word', 'char', 'char_wb'} or callable, default='word'\n Whether the feature should be made of word n-gram or character\n n-grams.\n Option 'char_wb' creates character n-grams only from text inside\n word boundaries; n-grams at the edges of words are padded with space.\n\n If a callable is passed it is used to extract the sequence of features\n out of the raw, unprocessed input.\n\n .. 
versionchanged:: 0.21\n\n Since v0.21, if ``input`` is ``filename`` or ``file``, the data is\n first read from the file and then passed to the given callable\n analyzer.\n\n max_df : float in range [0.0, 1.0] or int, default=1.0\n When building the vocabulary ignore terms that have a document\n frequency strictly higher than the given threshold (corpus-specific\n stop words).\n If float, the parameter represents a proportion of documents, integer\n absolute counts.\n This parameter is ignored if vocabulary is not None.\n\n min_df : float in range [0.0, 1.0] or int, default=1\n When building the vocabulary ignore terms that have a document\n frequency strictly lower than the given threshold. This value is also\n called cut-off in the literature.\n If float, the parameter represents a proportion of documents, integer\n absolute counts.\n This parameter is ignored if vocabulary is not None.\n\n max_features : int, default=None\n If not None, build a vocabulary that only consider the top\n max_features ordered by term frequency across the corpus.\n\n This parameter is ignored if vocabulary is not None.\n\n vocabulary : Mapping or iterable, default=None\n Either a Mapping (e.g., a dict) where keys are terms and values are\n indices in the feature matrix, or an iterable over terms. If not\n given, a vocabulary is determined from the input documents. Indices\n in the mapping should not be repeated and should not have any gap\n between 0 and the largest index.\n\n binary : bool, default=False\n If True, all non zero counts are set to 1. This is useful for discrete\n probabilistic models that model binary events rather than integer\n counts.\n\n dtype : type, default=np.int64\n Type of the matrix returned by fit_transform() or transform().\n\n Attributes\n ----------\n vocabulary_ : dict\n A mapping of terms to feature indices.\n\n fixed_vocabulary_ : bool\n True if a fixed vocabulary of term to indices mapping\n is provided by the user.\n\n stop_words_ : set\n Terms that were ignored because they either:\n\n - occurred in too many documents (`max_df`)\n - occurred in too few documents (`min_df`)\n - were cut off by feature selection (`max_features`).\n\n This is only available if no vocabulary was given.\n\n See Also\n --------\n HashingVectorizer : Convert a collection of text documents to a\n matrix of token counts.\n\n TfidfVectorizer : Convert a collection of raw documents to a matrix\n of TF-IDF features.\n\n Notes\n -----\n The ``stop_words_`` attribute can get large and increase the model size\n when pickling. This attribute is provided only for introspection and can\n be safely removed using delattr or set to None before pickling.\n\n Examples\n --------\n >>> from sklearn.feature_extraction.text import CountVectorizer\n >>> corpus = [\n ... 'This is the first document.',\n ... 'This document is the second document.',\n ... 'And this is the third one.',\n ... 'Is this the first document?',\n ... 
]\n >>> vectorizer = CountVectorizer()\n >>> X = vectorizer.fit_transform(corpus)\n >>> vectorizer.get_feature_names_out()\n array(['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third',\n 'this'], ...)\n >>> print(X.toarray())\n [[0 1 1 1 0 0 1 0 1]\n [0 2 0 1 0 1 1 0 1]\n [1 0 0 1 1 0 1 1 1]\n [0 1 1 1 0 0 1 0 1]]\n >>> vectorizer2 = CountVectorizer(analyzer='word', ngram_range=(2, 2))\n >>> X2 = vectorizer2.fit_transform(corpus)\n >>> vectorizer2.get_feature_names_out()\n array(['and this', 'document is', 'first document', 'is the', 'is this',\n 'second document', 'the first', 'the second', 'the third', 'third one',\n 'this document', 'this is', 'this the'], ...)\n >>> print(X2.toarray())\n [[0 0 1 1 0 0 1 0 0 0 0 1 0]\n [0 1 0 1 0 1 0 1 0 0 1 0 0]\n [1 0 0 1 0 0 0 0 1 1 0 1 0]\n [0 0 1 0 1 0 1 0 0 0 0 0 1]]\n \"\"\"\n \n def __init__(self, *, input='content', encoding='utf-8', decode_error='strict', strip_accents=None, lowercase=True, preprocessor=None, tokenizer=None, stop_words=None, token_pattern='(?u)\\\\b\\\\w\\\\w+\\\\b', ngram_range=(1, 1), analyzer='word', max_df=1.0, min_df=1, max_features=None, vocabulary=None, binary=False, dtype=np.int64):\n self.input = input\n self.encoding = encoding\n self.decode_error = decode_error\n self.strip_accents = strip_accents\n self.preprocessor = preprocessor\n self.tokenizer = tokenizer\n self.analyzer = analyzer\n self.lowercase = lowercase\n self.token_pattern = token_pattern\n self.stop_words = stop_words\n self.max_df = max_df\n self.min_df = min_df\n self.max_features = max_features\n self.ngram_range = ngram_range\n self.vocabulary = vocabulary\n self.binary = binary\n self.dtype = dtype\n \n def _sort_features(self, X, vocabulary):\n \"\"\"Sort features by name\n\n Returns a reordered matrix and modifies the vocabulary in place\n \"\"\"\n sorted_features = sorted(vocabulary.items())\n map_index = np.empty(len(sorted_features), dtype=X.indices.dtype)\n for (new_val, (term, old_val)) in enumerate(sorted_features):\n vocabulary[term] = new_val\n map_index[old_val] = new_val\n X.indices = map_index.take(X.indices, mode='clip')\n return X\n \n def _limit_features(self, X, vocabulary, high=None, low=None, limit=None):\n \"\"\"Remove too rare or too common features.\n\n Prune features that are non zero in more samples than high or less\n documents than low, modifying the vocabulary, and restricting it to\n at most the limit most frequent.\n\n This does not prune samples with zero features.\n \"\"\"\n if high is None and low is None and limit is None:\n return X, set()\n dfs = _document_frequency(X)\n mask = np.ones(len(dfs), dtype=bool)\n if high is not None:\n mask &= dfs <= high\n if low is not None:\n mask &= dfs >= low\n if limit is not None and mask.sum() > limit:\n tfs = np.asarray(X.sum(axis=0)).ravel()\n mask_inds = (-tfs[mask]).argsort()[:limit]\n new_mask = np.zeros(len(dfs), dtype=bool)\n new_mask[np.where(mask)[0][mask_inds]] = True\n mask = new_mask\n new_indices = np.cumsum(mask) - 1\n removed_terms = set()\n for (term, old_index) in list(vocabulary.items()):\n if mask[old_index]:\n vocabulary[term] = new_indices[old_index]\n else:\n del vocabulary[term]\n removed_terms.add(term)\n kept_indices = np.where(mask)[0]\n if len(kept_indices) == 0:\n raise ValueError('After pruning, no terms remain. 
Try a lower min_df or a higher max_df.')\n return X[:, kept_indices], removed_terms\n \n def _count_vocab(self, raw_documents, fixed_vocab):\n \"\"\"Create sparse feature matrix, and vocabulary where fixed_vocab=False\"\"\"\n if fixed_vocab:\n vocabulary = self.vocabulary_\n else:\n vocabulary = defaultdict()\n vocabulary.default_factory = vocabulary.__len__\n analyze = self.build_analyzer()\n j_indices = []\n indptr = []\n values = _make_int_array()\n indptr.append(0)\n for doc in raw_documents:\n feature_counter = {}\n for feature in analyze(doc):\n try:\n feature_idx = vocabulary[feature]\n if feature_idx not in feature_counter:\n feature_counter[feature_idx] = 1\n else:\n feature_counter[feature_idx] += 1\n except KeyError:\n continue\n j_indices.extend(feature_counter.keys())\n values.extend(feature_counter.values())\n indptr.append(len(j_indices))\n if not fixed_vocab:\n vocabulary = dict(vocabulary)\n if not vocabulary:\n raise ValueError('empty vocabulary; perhaps the documents only contain stop words')\n if indptr[-1] > np.iinfo(np.int32).max:\n if _IS_32BIT:\n raise ValueError('sparse CSR array has {} non-zero elements and requires 64 bit indexing, which is unsupported with 32 bit Python.'.format(indptr[-1]))\n indices_dtype = np.int64\n else:\n indices_dtype = np.int32\n j_indices = np.asarray(j_indices, dtype=indices_dtype)\n indptr = np.asarray(indptr, dtype=indices_dtype)\n values = np.frombuffer(values, dtype=np.intc)\n X = sp.csr_matrix((values, j_indices, indptr), shape=(len(indptr) - 1, len(vocabulary)), dtype=self.dtype)\n X.sort_indices()\n return vocabulary, X\n \n def _validate_params(self):\n \"\"\"Validation of min_df, max_df and max_features\"\"\"\n super()._validate_params()\n if self.max_features is not None:\n check_scalar(self.max_features, 'max_features', numbers.Integral, min_val=0)\n if isinstance(self.min_df, numbers.Integral):\n check_scalar(self.min_df, 'min_df', numbers.Integral, min_val=0)\n else:\n check_scalar(self.min_df, 'min_df', numbers.Real, min_val=0.0, max_val=1.0)\n if isinstance(self.max_df, numbers.Integral):\n check_scalar(self.max_df, 'max_df', numbers.Integral, min_val=0)\n else:\n check_scalar(self.max_df, 'max_df', numbers.Real, min_val=0.0, max_val=1.0)\n \n def fit(self, raw_documents, y=None):\n \"\"\"Learn a vocabulary dictionary of all tokens in the raw documents.\n\n Parameters\n ----------\n raw_documents : iterable\n An iterable which generates either str, unicode or file objects.\n\n y : None\n This parameter is ignored.\n\n Returns\n -------\n self : object\n Fitted vectorizer.\n \"\"\"\n self._warn_for_unused_params()\n self.fit_transform(raw_documents)\n return self\n \n def fit_transform(self, raw_documents, y=None):\n \"\"\"Learn the vocabulary dictionary and return document-term matrix.\n\n This is equivalent to fit followed by transform, but more efficiently\n implemented.\n\n Parameters\n ----------\n raw_documents : iterable\n An iterable which generates either str, unicode or file objects.\n\n y : None\n This parameter is ignored.\n\n Returns\n -------\n X : array of shape (n_samples, n_features)\n Document-term matrix.\n \"\"\"\n if isinstance(raw_documents, str):\n raise ValueError('Iterable over raw text documents expected, string object received.')\n self._validate_params()\n self._validate_vocabulary()\n max_df = self.max_df\n min_df = self.min_df\n max_features = self.max_features\n if self.fixed_vocabulary_ and self.lowercase:\n for term in self.vocabulary:\n if any(map(str.isupper, term)):\n 
warnings.warn(\"Upper case characters found in vocabulary while 'lowercase' is True. These entries will not be matched with any documents\")\n break\n (vocabulary, X) = self._count_vocab(raw_documents, self.fixed_vocabulary_)\n if self.binary:\n X.data.fill(1)\n if not self.fixed_vocabulary_:\n n_doc = X.shape[0]\n max_doc_count = max_df if isinstance(max_df, numbers.Integral) else max_df * n_doc\n min_doc_count = min_df if isinstance(min_df, numbers.Integral) else min_df * n_doc\n if max_doc_count < min_doc_count:\n raise ValueError('max_df corresponds to < documents than min_df')\n if max_features is not None:\n X = self._sort_features(X, vocabulary)\n (X, self.stop_words_) = self._limit_features(X, vocabulary, max_doc_count, min_doc_count, max_features)\n if max_features is None:\n X = self._sort_features(X, vocabulary)\n self.vocabulary_ = vocabulary\n return X\n \n def transform(self, raw_documents):\n \"\"\"Transform documents to document-term matrix.\n\n Extract token counts out of raw text documents using the vocabulary\n fitted with fit or the one provided to the constructor.\n\n Parameters\n ----------\n raw_documents : iterable\n An iterable which generates either str, unicode or file objects.\n\n Returns\n -------\n X : sparse matrix of shape (n_samples, n_features)\n Document-term matrix.\n \"\"\"\n if isinstance(raw_documents, str):\n raise ValueError('Iterable over raw text documents expected, string object received.')\n self._check_vocabulary()\n (_, X) = self._count_vocab(raw_documents, fixed_vocab=True)\n if self.binary:\n X.data.fill(1)\n return X\n \n def inverse_transform(self, X):\n \"\"\"Return terms per document with nonzero entries in X.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Document-term matrix.\n\n Returns\n -------\n X_inv : list of arrays of shape (n_samples,)\n List of arrays of terms.\n \"\"\"\n self._check_vocabulary()\n X = check_array(X, accept_sparse='csr')\n n_samples = X.shape[0]\n terms = np.array(list(self.vocabulary_.keys()))\n indices = np.array(list(self.vocabulary_.values()))\n inverse_vocabulary = terms[np.argsort(indices)]\n if sp.issparse(X):\n return [inverse_vocabulary[X[i, :].nonzero()[1]].ravel() for i in range(n_samples)]\n else:\n return [inverse_vocabulary[np.flatnonzero(X[i, :])].ravel() for i in range(n_samples)]\n \n @deprecated('get_feature_names is deprecated in 1.0 and will be removed in 1.2. 
Please use get_feature_names_out instead.')\n def get_feature_names(self):\n \"\"\"Array mapping from feature integer indices to feature name.\n\n Returns\n -------\n feature_names : list\n A list of feature names.\n \"\"\"\n self._check_vocabulary()\n return [t for (t, i) in sorted(self.vocabulary_.items(), key=itemgetter(1))]\n \n def get_feature_names_out(self, input_features=None):\n \"\"\"Get output feature names for transformation.\n\n Parameters\n ----------\n input_features : array-like of str or None, default=None\n Not used, present here for API consistency by convention.\n\n Returns\n -------\n feature_names_out : ndarray of str objects\n Transformed feature names.\n \"\"\"\n self._check_vocabulary()\n return np.asarray([t for (t, i) in sorted(self.vocabulary_.items(), key=itemgetter(1))], dtype=object)\n \n def _more_tags(self):\n return {'X_types': ['string']}\n" + "description": "Convert a collection of text documents to a matrix of token counts.\n\nThis implementation produces a sparse representation of the counts using\nscipy.sparse.csr_matrix.\n\nIf you do not provide an a-priori dictionary and you do not use an analyzer\nthat does some kind of feature selection then the number of features will\nbe equal to the vocabulary size found by analyzing the data.\n\nRead more in the :ref:`User Guide `.", + "docstring": "Convert a collection of text documents to a matrix of token counts.\n\n This implementation produces a sparse representation of the counts using\n scipy.sparse.csr_matrix.\n\n If you do not provide an a-priori dictionary and you do not use an analyzer\n that does some kind of feature selection then the number of features will\n be equal to the vocabulary size found by analyzing the data.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n input : {'filename', 'file', 'content'}, default='content'\n - If `'filename'`, the sequence passed as an argument to fit is\n expected to be a list of filenames that need reading to fetch\n the raw content to analyze.\n\n - If `'file'`, the sequence items must have a 'read' method (file-like\n object) that is called to fetch the bytes in memory.\n\n - If `'content'`, the input is expected to be a sequence of items that\n can be of type string or byte.\n\n encoding : str, default='utf-8'\n If bytes or files are given to analyze, this encoding is used to\n decode.\n\n decode_error : {'strict', 'ignore', 'replace'}, default='strict'\n Instruction on what to do if a byte sequence is given to analyze that\n contains characters not of the given `encoding`. By default, it is\n 'strict', meaning that a UnicodeDecodeError will be raised. 
Other\n values are 'ignore' and 'replace'.\n\n strip_accents : {'ascii', 'unicode'}, default=None\n Remove accents and perform other character normalization\n during the preprocessing step.\n 'ascii' is a fast method that only works on characters that have\n an direct ASCII mapping.\n 'unicode' is a slightly slower method that works on any characters.\n None (default) does nothing.\n\n Both 'ascii' and 'unicode' use NFKD normalization from\n :func:`unicodedata.normalize`.\n\n lowercase : bool, default=True\n Convert all characters to lowercase before tokenizing.\n\n preprocessor : callable, default=None\n Override the preprocessing (strip_accents and lowercase) stage while\n preserving the tokenizing and n-grams generation steps.\n Only applies if ``analyzer`` is not callable.\n\n tokenizer : callable, default=None\n Override the string tokenization step while preserving the\n preprocessing and n-grams generation steps.\n Only applies if ``analyzer == 'word'``.\n\n stop_words : {'english'}, list, default=None\n If 'english', a built-in stop word list for English is used.\n There are several known issues with 'english' and you should\n consider an alternative (see :ref:`stop_words`).\n\n If a list, that list is assumed to contain stop words, all of which\n will be removed from the resulting tokens.\n Only applies if ``analyzer == 'word'``.\n\n If None, no stop words will be used. max_df can be set to a value\n in the range [0.7, 1.0) to automatically detect and filter stop\n words based on intra corpus document frequency of terms.\n\n token_pattern : str, default=r\"(?u)\\\\b\\\\w\\\\w+\\\\b\"\n Regular expression denoting what constitutes a \"token\", only used\n if ``analyzer == 'word'``. The default regexp select tokens of 2\n or more alphanumeric characters (punctuation is completely ignored\n and always treated as a token separator).\n\n If there is a capturing group in token_pattern then the\n captured group content, not the entire match, becomes the token.\n At most one capturing group is permitted.\n\n ngram_range : tuple (min_n, max_n), default=(1, 1)\n The lower and upper boundary of the range of n-values for different\n word n-grams or char n-grams to be extracted. All values of n such\n such that min_n <= n <= max_n will be used. For example an\n ``ngram_range`` of ``(1, 1)`` means only unigrams, ``(1, 2)`` means\n unigrams and bigrams, and ``(2, 2)`` means only bigrams.\n Only applies if ``analyzer`` is not callable.\n\n analyzer : {'word', 'char', 'char_wb'} or callable, default='word'\n Whether the feature should be made of word n-gram or character\n n-grams.\n Option 'char_wb' creates character n-grams only from text inside\n word boundaries; n-grams at the edges of words are padded with space.\n\n If a callable is passed it is used to extract the sequence of features\n out of the raw, unprocessed input.\n\n .. 
versionchanged:: 0.21\n\n Since v0.21, if ``input`` is ``filename`` or ``file``, the data is\n first read from the file and then passed to the given callable\n analyzer.\n\n max_df : float in range [0.0, 1.0] or int, default=1.0\n When building the vocabulary ignore terms that have a document\n frequency strictly higher than the given threshold (corpus-specific\n stop words).\n If float, the parameter represents a proportion of documents, integer\n absolute counts.\n This parameter is ignored if vocabulary is not None.\n\n min_df : float in range [0.0, 1.0] or int, default=1\n When building the vocabulary ignore terms that have a document\n frequency strictly lower than the given threshold. This value is also\n called cut-off in the literature.\n If float, the parameter represents a proportion of documents, integer\n absolute counts.\n This parameter is ignored if vocabulary is not None.\n\n max_features : int, default=None\n If not None, build a vocabulary that only consider the top\n max_features ordered by term frequency across the corpus.\n\n This parameter is ignored if vocabulary is not None.\n\n vocabulary : Mapping or iterable, default=None\n Either a Mapping (e.g., a dict) where keys are terms and values are\n indices in the feature matrix, or an iterable over terms. If not\n given, a vocabulary is determined from the input documents. Indices\n in the mapping should not be repeated and should not have any gap\n between 0 and the largest index.\n\n binary : bool, default=False\n If True, all non zero counts are set to 1. This is useful for discrete\n probabilistic models that model binary events rather than integer\n counts.\n\n dtype : type, default=np.int64\n Type of the matrix returned by fit_transform() or transform().\n\n Attributes\n ----------\n vocabulary_ : dict\n A mapping of terms to feature indices.\n\n fixed_vocabulary_ : bool\n True if a fixed vocabulary of term to indices mapping\n is provided by the user.\n\n stop_words_ : set\n Terms that were ignored because they either:\n\n - occurred in too many documents (`max_df`)\n - occurred in too few documents (`min_df`)\n - were cut off by feature selection (`max_features`).\n\n This is only available if no vocabulary was given.\n\n See Also\n --------\n HashingVectorizer : Convert a collection of text documents to a\n matrix of token counts.\n\n TfidfVectorizer : Convert a collection of raw documents to a matrix\n of TF-IDF features.\n\n Notes\n -----\n The ``stop_words_`` attribute can get large and increase the model size\n when pickling. This attribute is provided only for introspection and can\n be safely removed using delattr or set to None before pickling.\n\n Examples\n --------\n >>> from sklearn.feature_extraction.text import CountVectorizer\n >>> corpus = [\n ... 'This is the first document.',\n ... 'This document is the second document.',\n ... 'And this is the third one.',\n ... 'Is this the first document?',\n ... 
]\n >>> vectorizer = CountVectorizer()\n >>> X = vectorizer.fit_transform(corpus)\n >>> vectorizer.get_feature_names_out()\n array(['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third',\n 'this'], ...)\n >>> print(X.toarray())\n [[0 1 1 1 0 0 1 0 1]\n [0 2 0 1 0 1 1 0 1]\n [1 0 0 1 1 0 1 1 1]\n [0 1 1 1 0 0 1 0 1]]\n >>> vectorizer2 = CountVectorizer(analyzer='word', ngram_range=(2, 2))\n >>> X2 = vectorizer2.fit_transform(corpus)\n >>> vectorizer2.get_feature_names_out()\n array(['and this', 'document is', 'first document', 'is the', 'is this',\n 'second document', 'the first', 'the second', 'the third', 'third one',\n 'this document', 'this is', 'this the'], ...)\n >>> print(X2.toarray())\n [[0 0 1 1 0 0 1 0 0 0 0 1 0]\n [0 1 0 1 0 1 0 1 0 0 1 0 0]\n [1 0 0 1 0 0 0 0 1 1 0 1 0]\n [0 0 1 0 1 0 1 0 0 0 0 0 1]]\n ", + "source_code": "\n\nclass CountVectorizer(_VectorizerMixin, BaseEstimator):\n \"\"\"Convert a collection of text documents to a matrix of token counts.\n\n This implementation produces a sparse representation of the counts using\n scipy.sparse.csr_matrix.\n\n If you do not provide an a-priori dictionary and you do not use an analyzer\n that does some kind of feature selection then the number of features will\n be equal to the vocabulary size found by analyzing the data.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n input : {'filename', 'file', 'content'}, default='content'\n - If `'filename'`, the sequence passed as an argument to fit is\n expected to be a list of filenames that need reading to fetch\n the raw content to analyze.\n\n - If `'file'`, the sequence items must have a 'read' method (file-like\n object) that is called to fetch the bytes in memory.\n\n - If `'content'`, the input is expected to be a sequence of items that\n can be of type string or byte.\n\n encoding : str, default='utf-8'\n If bytes or files are given to analyze, this encoding is used to\n decode.\n\n decode_error : {'strict', 'ignore', 'replace'}, default='strict'\n Instruction on what to do if a byte sequence is given to analyze that\n contains characters not of the given `encoding`. By default, it is\n 'strict', meaning that a UnicodeDecodeError will be raised. 
Other\n values are 'ignore' and 'replace'.\n\n strip_accents : {'ascii', 'unicode'}, default=None\n Remove accents and perform other character normalization\n during the preprocessing step.\n 'ascii' is a fast method that only works on characters that have\n an direct ASCII mapping.\n 'unicode' is a slightly slower method that works on any characters.\n None (default) does nothing.\n\n Both 'ascii' and 'unicode' use NFKD normalization from\n :func:`unicodedata.normalize`.\n\n lowercase : bool, default=True\n Convert all characters to lowercase before tokenizing.\n\n preprocessor : callable, default=None\n Override the preprocessing (strip_accents and lowercase) stage while\n preserving the tokenizing and n-grams generation steps.\n Only applies if ``analyzer`` is not callable.\n\n tokenizer : callable, default=None\n Override the string tokenization step while preserving the\n preprocessing and n-grams generation steps.\n Only applies if ``analyzer == 'word'``.\n\n stop_words : {'english'}, list, default=None\n If 'english', a built-in stop word list for English is used.\n There are several known issues with 'english' and you should\n consider an alternative (see :ref:`stop_words`).\n\n If a list, that list is assumed to contain stop words, all of which\n will be removed from the resulting tokens.\n Only applies if ``analyzer == 'word'``.\n\n If None, no stop words will be used. max_df can be set to a value\n in the range [0.7, 1.0) to automatically detect and filter stop\n words based on intra corpus document frequency of terms.\n\n token_pattern : str, default=r\"(?u)\\\\b\\\\w\\\\w+\\\\b\"\n Regular expression denoting what constitutes a \"token\", only used\n if ``analyzer == 'word'``. The default regexp select tokens of 2\n or more alphanumeric characters (punctuation is completely ignored\n and always treated as a token separator).\n\n If there is a capturing group in token_pattern then the\n captured group content, not the entire match, becomes the token.\n At most one capturing group is permitted.\n\n ngram_range : tuple (min_n, max_n), default=(1, 1)\n The lower and upper boundary of the range of n-values for different\n word n-grams or char n-grams to be extracted. All values of n such\n such that min_n <= n <= max_n will be used. For example an\n ``ngram_range`` of ``(1, 1)`` means only unigrams, ``(1, 2)`` means\n unigrams and bigrams, and ``(2, 2)`` means only bigrams.\n Only applies if ``analyzer`` is not callable.\n\n analyzer : {'word', 'char', 'char_wb'} or callable, default='word'\n Whether the feature should be made of word n-gram or character\n n-grams.\n Option 'char_wb' creates character n-grams only from text inside\n word boundaries; n-grams at the edges of words are padded with space.\n\n If a callable is passed it is used to extract the sequence of features\n out of the raw, unprocessed input.\n\n .. 
versionchanged:: 0.21\n\n Since v0.21, if ``input`` is ``filename`` or ``file``, the data is\n first read from the file and then passed to the given callable\n analyzer.\n\n max_df : float in range [0.0, 1.0] or int, default=1.0\n When building the vocabulary ignore terms that have a document\n frequency strictly higher than the given threshold (corpus-specific\n stop words).\n If float, the parameter represents a proportion of documents, integer\n absolute counts.\n This parameter is ignored if vocabulary is not None.\n\n min_df : float in range [0.0, 1.0] or int, default=1\n When building the vocabulary ignore terms that have a document\n frequency strictly lower than the given threshold. This value is also\n called cut-off in the literature.\n If float, the parameter represents a proportion of documents, integer\n absolute counts.\n This parameter is ignored if vocabulary is not None.\n\n max_features : int, default=None\n If not None, build a vocabulary that only consider the top\n max_features ordered by term frequency across the corpus.\n\n This parameter is ignored if vocabulary is not None.\n\n vocabulary : Mapping or iterable, default=None\n Either a Mapping (e.g., a dict) where keys are terms and values are\n indices in the feature matrix, or an iterable over terms. If not\n given, a vocabulary is determined from the input documents. Indices\n in the mapping should not be repeated and should not have any gap\n between 0 and the largest index.\n\n binary : bool, default=False\n If True, all non zero counts are set to 1. This is useful for discrete\n probabilistic models that model binary events rather than integer\n counts.\n\n dtype : type, default=np.int64\n Type of the matrix returned by fit_transform() or transform().\n\n Attributes\n ----------\n vocabulary_ : dict\n A mapping of terms to feature indices.\n\n fixed_vocabulary_ : bool\n True if a fixed vocabulary of term to indices mapping\n is provided by the user.\n\n stop_words_ : set\n Terms that were ignored because they either:\n\n - occurred in too many documents (`max_df`)\n - occurred in too few documents (`min_df`)\n - were cut off by feature selection (`max_features`).\n\n This is only available if no vocabulary was given.\n\n See Also\n --------\n HashingVectorizer : Convert a collection of text documents to a\n matrix of token counts.\n\n TfidfVectorizer : Convert a collection of raw documents to a matrix\n of TF-IDF features.\n\n Notes\n -----\n The ``stop_words_`` attribute can get large and increase the model size\n when pickling. This attribute is provided only for introspection and can\n be safely removed using delattr or set to None before pickling.\n\n Examples\n --------\n >>> from sklearn.feature_extraction.text import CountVectorizer\n >>> corpus = [\n ... 'This is the first document.',\n ... 'This document is the second document.',\n ... 'And this is the third one.',\n ... 'Is this the first document?',\n ... 
]\n >>> vectorizer = CountVectorizer()\n >>> X = vectorizer.fit_transform(corpus)\n >>> vectorizer.get_feature_names_out()\n array(['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third',\n 'this'], ...)\n >>> print(X.toarray())\n [[0 1 1 1 0 0 1 0 1]\n [0 2 0 1 0 1 1 0 1]\n [1 0 0 1 1 0 1 1 1]\n [0 1 1 1 0 0 1 0 1]]\n >>> vectorizer2 = CountVectorizer(analyzer='word', ngram_range=(2, 2))\n >>> X2 = vectorizer2.fit_transform(corpus)\n >>> vectorizer2.get_feature_names_out()\n array(['and this', 'document is', 'first document', 'is the', 'is this',\n 'second document', 'the first', 'the second', 'the third', 'third one',\n 'this document', 'this is', 'this the'], ...)\n >>> print(X2.toarray())\n [[0 0 1 1 0 0 1 0 0 0 0 1 0]\n [0 1 0 1 0 1 0 1 0 0 1 0 0]\n [1 0 0 1 0 0 0 0 1 1 0 1 0]\n [0 0 1 0 1 0 1 0 0 0 0 0 1]]\n \"\"\"\n \n def __init__(self, *, input='content', encoding='utf-8', decode_error='strict', strip_accents=None, lowercase=True, preprocessor=None, tokenizer=None, stop_words=None, token_pattern='(?u)\\\\b\\\\w\\\\w+\\\\b', ngram_range=(1, 1), analyzer='word', max_df=1.0, min_df=1, max_features=None, vocabulary=None, binary=False, dtype=np.int64):\n self.input = input\n self.encoding = encoding\n self.decode_error = decode_error\n self.strip_accents = strip_accents\n self.preprocessor = preprocessor\n self.tokenizer = tokenizer\n self.analyzer = analyzer\n self.lowercase = lowercase\n self.token_pattern = token_pattern\n self.stop_words = stop_words\n self.max_df = max_df\n self.min_df = min_df\n self.max_features = max_features\n self.ngram_range = ngram_range\n self.vocabulary = vocabulary\n self.binary = binary\n self.dtype = dtype\n \n def _sort_features(self, X, vocabulary):\n \"\"\"Sort features by name\n\n Returns a reordered matrix and modifies the vocabulary in place\n \"\"\"\n sorted_features = sorted(vocabulary.items())\n map_index = np.empty(len(sorted_features), dtype=X.indices.dtype)\n for (new_val, (term, old_val)) in enumerate(sorted_features):\n vocabulary[term] = new_val\n map_index[old_val] = new_val\n X.indices = map_index.take(X.indices, mode='clip')\n return X\n \n def _limit_features(self, X, vocabulary, high=None, low=None, limit=None):\n \"\"\"Remove too rare or too common features.\n\n Prune features that are non zero in more samples than high or less\n documents than low, modifying the vocabulary, and restricting it to\n at most the limit most frequent.\n\n This does not prune samples with zero features.\n \"\"\"\n if high is None and low is None and limit is None:\n return X, set()\n dfs = _document_frequency(X)\n mask = np.ones(len(dfs), dtype=bool)\n if high is not None:\n mask &= dfs <= high\n if low is not None:\n mask &= dfs >= low\n if limit is not None and mask.sum() > limit:\n tfs = np.asarray(X.sum(axis=0)).ravel()\n mask_inds = (-tfs[mask]).argsort()[:limit]\n new_mask = np.zeros(len(dfs), dtype=bool)\n new_mask[np.where(mask)[0][mask_inds]] = True\n mask = new_mask\n new_indices = np.cumsum(mask) - 1\n removed_terms = set()\n for (term, old_index) in list(vocabulary.items()):\n if mask[old_index]:\n vocabulary[term] = new_indices[old_index]\n else:\n del vocabulary[term]\n removed_terms.add(term)\n kept_indices = np.where(mask)[0]\n if len(kept_indices) == 0:\n raise ValueError('After pruning, no terms remain. 
Try a lower min_df or a higher max_df.')\n return X[:, kept_indices], removed_terms\n \n def _count_vocab(self, raw_documents, fixed_vocab):\n \"\"\"Create sparse feature matrix, and vocabulary where fixed_vocab=False\"\"\"\n if fixed_vocab:\n vocabulary = self.vocabulary_\n else:\n vocabulary = defaultdict()\n vocabulary.default_factory = vocabulary.__len__\n analyze = self.build_analyzer()\n j_indices = []\n indptr = []\n values = _make_int_array()\n indptr.append(0)\n for doc in raw_documents:\n feature_counter = {}\n for feature in analyze(doc):\n try:\n feature_idx = vocabulary[feature]\n if feature_idx not in feature_counter:\n feature_counter[feature_idx] = 1\n else:\n feature_counter[feature_idx] += 1\n except KeyError:\n continue\n j_indices.extend(feature_counter.keys())\n values.extend(feature_counter.values())\n indptr.append(len(j_indices))\n if not fixed_vocab:\n vocabulary = dict(vocabulary)\n if not vocabulary:\n raise ValueError('empty vocabulary; perhaps the documents only contain stop words')\n if indptr[-1] > np.iinfo(np.int32).max:\n if _IS_32BIT:\n raise ValueError('sparse CSR array has {} non-zero elements and requires 64 bit indexing, which is unsupported with 32 bit Python.'.format(indptr[-1]))\n indices_dtype = np.int64\n else:\n indices_dtype = np.int32\n j_indices = np.asarray(j_indices, dtype=indices_dtype)\n indptr = np.asarray(indptr, dtype=indices_dtype)\n values = np.frombuffer(values, dtype=np.intc)\n X = sp.csr_matrix((values, j_indices, indptr), shape=(len(indptr) - 1, len(vocabulary)), dtype=self.dtype)\n X.sort_indices()\n return vocabulary, X\n \n def _validate_params(self):\n \"\"\"Validation of min_df, max_df and max_features\"\"\"\n super()._validate_params()\n if self.max_features is not None:\n check_scalar(self.max_features, 'max_features', numbers.Integral, min_val=0)\n if isinstance(self.min_df, numbers.Integral):\n check_scalar(self.min_df, 'min_df', numbers.Integral, min_val=0)\n else:\n check_scalar(self.min_df, 'min_df', numbers.Real, min_val=0.0, max_val=1.0)\n if isinstance(self.max_df, numbers.Integral):\n check_scalar(self.max_df, 'max_df', numbers.Integral, min_val=0)\n else:\n check_scalar(self.max_df, 'max_df', numbers.Real, min_val=0.0, max_val=1.0)\n \n def fit(self, raw_documents, y=None):\n \"\"\"Learn a vocabulary dictionary of all tokens in the raw documents.\n\n Parameters\n ----------\n raw_documents : iterable\n An iterable which generates either str, unicode or file objects.\n\n y : None\n This parameter is ignored.\n\n Returns\n -------\n self : object\n Fitted vectorizer.\n \"\"\"\n self._warn_for_unused_params()\n self.fit_transform(raw_documents)\n return self\n \n def fit_transform(self, raw_documents, y=None):\n \"\"\"Learn the vocabulary dictionary and return document-term matrix.\n\n This is equivalent to fit followed by transform, but more efficiently\n implemented.\n\n Parameters\n ----------\n raw_documents : iterable\n An iterable which generates either str, unicode or file objects.\n\n y : None\n This parameter is ignored.\n\n Returns\n -------\n X : array of shape (n_samples, n_features)\n Document-term matrix.\n \"\"\"\n if isinstance(raw_documents, str):\n raise ValueError('Iterable over raw text documents expected, string object received.')\n self._validate_params()\n self._validate_vocabulary()\n max_df = self.max_df\n min_df = self.min_df\n max_features = self.max_features\n if self.fixed_vocabulary_ and self.lowercase:\n for term in self.vocabulary:\n if any(map(str.isupper, term)):\n 
warnings.warn(\"Upper case characters found in vocabulary while 'lowercase' is True. These entries will not be matched with any documents\")\n break\n (vocabulary, X) = self._count_vocab(raw_documents, self.fixed_vocabulary_)\n if self.binary:\n X.data.fill(1)\n if not self.fixed_vocabulary_:\n n_doc = X.shape[0]\n max_doc_count = max_df if isinstance(max_df, numbers.Integral) else max_df * n_doc\n min_doc_count = min_df if isinstance(min_df, numbers.Integral) else min_df * n_doc\n if max_doc_count < min_doc_count:\n raise ValueError('max_df corresponds to < documents than min_df')\n if max_features is not None:\n X = self._sort_features(X, vocabulary)\n (X, self.stop_words_) = self._limit_features(X, vocabulary, max_doc_count, min_doc_count, max_features)\n if max_features is None:\n X = self._sort_features(X, vocabulary)\n self.vocabulary_ = vocabulary\n return X\n \n def transform(self, raw_documents):\n \"\"\"Transform documents to document-term matrix.\n\n Extract token counts out of raw text documents using the vocabulary\n fitted with fit or the one provided to the constructor.\n\n Parameters\n ----------\n raw_documents : iterable\n An iterable which generates either str, unicode or file objects.\n\n Returns\n -------\n X : sparse matrix of shape (n_samples, n_features)\n Document-term matrix.\n \"\"\"\n if isinstance(raw_documents, str):\n raise ValueError('Iterable over raw text documents expected, string object received.')\n self._check_vocabulary()\n (_, X) = self._count_vocab(raw_documents, fixed_vocab=True)\n if self.binary:\n X.data.fill(1)\n return X\n \n def inverse_transform(self, X):\n \"\"\"Return terms per document with nonzero entries in X.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Document-term matrix.\n\n Returns\n -------\n X_inv : list of arrays of shape (n_samples,)\n List of arrays of terms.\n \"\"\"\n self._check_vocabulary()\n X = check_array(X, accept_sparse='csr')\n n_samples = X.shape[0]\n terms = np.array(list(self.vocabulary_.keys()))\n indices = np.array(list(self.vocabulary_.values()))\n inverse_vocabulary = terms[np.argsort(indices)]\n if sp.issparse(X):\n return [inverse_vocabulary[X[i, :].nonzero()[1]].ravel() for i in range(n_samples)]\n else:\n return [inverse_vocabulary[np.flatnonzero(X[i, :])].ravel() for i in range(n_samples)]\n \n @deprecated('get_feature_names is deprecated in 1.0 and will be removed in 1.2. 
Please use get_feature_names_out instead.')\n def get_feature_names(self):\n \"\"\"Array mapping from feature integer indices to feature name.\n\n Returns\n -------\n feature_names : list\n A list of feature names.\n \"\"\"\n self._check_vocabulary()\n return [t for (t, i) in sorted(self.vocabulary_.items(), key=itemgetter(1))]\n \n def get_feature_names_out(self, input_features=None):\n \"\"\"Get output feature names for transformation.\n\n Parameters\n ----------\n input_features : array-like of str or None, default=None\n Not used, present here for API consistency by convention.\n\n Returns\n -------\n feature_names_out : ndarray of str objects\n Transformed feature names.\n \"\"\"\n self._check_vocabulary()\n return np.asarray([t for (t, i) in sorted(self.vocabulary_.items(), key=itemgetter(1))], dtype=object)\n \n def _more_tags(self):\n return {'X_types': ['string']}\n" }, { "name": "HashingVectorizer", @@ -22353,9 +22419,9 @@ "sklearn.feature_extraction.text.HashingVectorizer._more_tags" ], "is_public": true, - "description": "Convert a collection of text documents to a matrix of token occurrences.\n\nIt turns a collection of text documents into a scipy.sparse matrix holding token occurrence counts (or binary occurrence information), possibly normalized as token frequencies if norm='l1' or projected on the euclidean unit sphere if norm='l2'. This text vectorizer implementation uses the hashing trick to find the token string name to feature integer index mapping. This strategy has several advantages: - it is very low memory scalable to large datasets as there is no need to store a vocabulary dictionary in memory. - it is fast to pickle and un-pickle as it holds no state besides the constructor parameters. - it can be used in a streaming (partial fit) or parallel pipeline as there is no state computed during fit. There are also a couple of cons (vs using a CountVectorizer with an in-memory vocabulary): - there is no way to compute the inverse transform (from feature indices to string feature names) which can be a problem when trying to introspect which features are most important to a model. - there can be collisions: distinct tokens can be mapped to the same feature index. However in practice this is rarely an issue if n_features is large enough (e.g. 2 ** 18 for text classification problems). - no IDF weighting as this would render the transformer stateful. The hash function employed is the signed 32-bit version of Murmurhash3. 
Read more in the :ref:`User Guide `.", - "docstring": "Convert a collection of text documents to a matrix of token occurrences.\n\n It turns a collection of text documents into a scipy.sparse matrix holding\n token occurrence counts (or binary occurrence information), possibly\n normalized as token frequencies if norm='l1' or projected on the euclidean\n unit sphere if norm='l2'.\n\n This text vectorizer implementation uses the hashing trick to find the\n token string name to feature integer index mapping.\n\n This strategy has several advantages:\n\n - it is very low memory scalable to large datasets as there is no need to\n store a vocabulary dictionary in memory.\n\n - it is fast to pickle and un-pickle as it holds no state besides the\n constructor parameters.\n\n - it can be used in a streaming (partial fit) or parallel pipeline as there\n is no state computed during fit.\n\n There are also a couple of cons (vs using a CountVectorizer with an\n in-memory vocabulary):\n\n - there is no way to compute the inverse transform (from feature indices to\n string feature names) which can be a problem when trying to introspect\n which features are most important to a model.\n\n - there can be collisions: distinct tokens can be mapped to the same\n feature index. However in practice this is rarely an issue if n_features\n is large enough (e.g. 2 ** 18 for text classification problems).\n\n - no IDF weighting as this would render the transformer stateful.\n\n The hash function employed is the signed 32-bit version of Murmurhash3.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n input : {'filename', 'file', 'content'}, default='content'\n - If `'filename'`, the sequence passed as an argument to fit is\n expected to be a list of filenames that need reading to fetch\n the raw content to analyze.\n\n - If `'file'`, the sequence items must have a 'read' method (file-like\n object) that is called to fetch the bytes in memory.\n\n - If `'content'`, the input is expected to be a sequence of items that\n can be of type string or byte.\n\n encoding : str, default='utf-8'\n If bytes or files are given to analyze, this encoding is used to\n decode.\n\n decode_error : {'strict', 'ignore', 'replace'}, default='strict'\n Instruction on what to do if a byte sequence is given to analyze that\n contains characters not of the given `encoding`. By default, it is\n 'strict', meaning that a UnicodeDecodeError will be raised. 
Other\n values are 'ignore' and 'replace'.\n\n strip_accents : {'ascii', 'unicode'}, default=None\n Remove accents and perform other character normalization\n during the preprocessing step.\n 'ascii' is a fast method that only works on characters that have\n a direct ASCII mapping.\n 'unicode' is a slightly slower method that works on any characters.\n None (default) does nothing.\n\n Both 'ascii' and 'unicode' use NFKD normalization from\n :func:`unicodedata.normalize`.\n\n lowercase : bool, default=True\n Convert all characters to lowercase before tokenizing.\n\n preprocessor : callable, default=None\n Override the preprocessing (string transformation) stage while\n preserving the tokenizing and n-grams generation steps.\n Only applies if ``analyzer is not callable``.\n\n tokenizer : callable, default=None\n Override the string tokenization step while preserving the\n preprocessing and n-grams generation steps.\n Only applies if ``analyzer == 'word'``.\n\n stop_words : {'english'}, list, default=None\n If 'english', a built-in stop word list for English is used.\n There are several known issues with 'english' and you should\n consider an alternative (see :ref:`stop_words`).\n\n If a list, that list is assumed to contain stop words, all of which\n will be removed from the resulting tokens.\n Only applies if ``analyzer == 'word'``.\n\n token_pattern : str, default=r\"(?u)\\\\b\\\\w\\\\w+\\\\b\"\n Regular expression denoting what constitutes a \"token\", only used\n if ``analyzer == 'word'``. The default regexp selects tokens of 2\n or more alphanumeric characters (punctuation is completely ignored\n and always treated as a token separator).\n\n If there is a capturing group in token_pattern then the\n captured group content, not the entire match, becomes the token.\n At most one capturing group is permitted.\n\n ngram_range : tuple (min_n, max_n), default=(1, 1)\n The lower and upper boundary of the range of n-values for different\n n-grams to be extracted. All values of n such that min_n <= n <= max_n\n will be used. For example an ``ngram_range`` of ``(1, 1)`` means only\n unigrams, ``(1, 2)`` means unigrams and bigrams, and ``(2, 2)`` means\n only bigrams.\n Only applies if ``analyzer is not callable``.\n\n analyzer : {'word', 'char', 'char_wb'} or callable, default='word'\n Whether the feature should be made of word or character n-grams.\n Option 'char_wb' creates character n-grams only from text inside\n word boundaries; n-grams at the edges of words are padded with space.\n\n If a callable is passed it is used to extract the sequence of features\n out of the raw, unprocessed input.\n\n .. versionchanged:: 0.21\n Since v0.21, if ``input`` is ``'filename'`` or ``'file'``, the data\n is first read from the file and then passed to the given callable\n analyzer.\n\n n_features : int, default=(2 ** 20)\n The number of features (columns) in the output matrices. Small numbers\n of features are likely to cause hash collisions, but large numbers\n will cause larger coefficient dimensions in linear learners.\n\n binary : bool, default=False\n If True, all non zero counts are set to 1. This is useful for discrete\n probabilistic models that model binary events rather than integer\n counts.\n\n norm : {'l1', 'l2'}, default='l2'\n Norm used to normalize term vectors. None for no normalization.\n\n alternate_sign : bool, default=True\n When True, an alternating sign is added to the features as to\n approximately conserve the inner product in the hashed space even for\n small n_features. 
This approach is similar to sparse random projection.\n\n .. versionadded:: 0.19\n\n dtype : type, default=np.float64\n Type of the matrix returned by fit_transform() or transform().\n\n See Also\n --------\n CountVectorizer : Convert a collection of text documents to a matrix of\n token counts.\n TfidfVectorizer : Convert a collection of raw documents to a matrix of\n TF-IDF features.\n\n Examples\n --------\n >>> from sklearn.feature_extraction.text import HashingVectorizer\n >>> corpus = [\n ... 'This is the first document.',\n ... 'This document is the second document.',\n ... 'And this is the third one.',\n ... 'Is this the first document?',\n ... ]\n >>> vectorizer = HashingVectorizer(n_features=2**4)\n >>> X = vectorizer.fit_transform(corpus)\n >>> print(X.shape)\n (4, 16)\n ", - "source_code": "\n\nclass HashingVectorizer(TransformerMixin, _VectorizerMixin, BaseEstimator):\n \"\"\"Convert a collection of text documents to a matrix of token occurrences.\n\n It turns a collection of text documents into a scipy.sparse matrix holding\n token occurrence counts (or binary occurrence information), possibly\n normalized as token frequencies if norm='l1' or projected on the euclidean\n unit sphere if norm='l2'.\n\n This text vectorizer implementation uses the hashing trick to find the\n token string name to feature integer index mapping.\n\n This strategy has several advantages:\n\n - it is very low memory scalable to large datasets as there is no need to\n store a vocabulary dictionary in memory.\n\n - it is fast to pickle and un-pickle as it holds no state besides the\n constructor parameters.\n\n - it can be used in a streaming (partial fit) or parallel pipeline as there\n is no state computed during fit.\n\n There are also a couple of cons (vs using a CountVectorizer with an\n in-memory vocabulary):\n\n - there is no way to compute the inverse transform (from feature indices to\n string feature names) which can be a problem when trying to introspect\n which features are most important to a model.\n\n - there can be collisions: distinct tokens can be mapped to the same\n feature index. However in practice this is rarely an issue if n_features\n is large enough (e.g. 2 ** 18 for text classification problems).\n\n - no IDF weighting as this would render the transformer stateful.\n\n The hash function employed is the signed 32-bit version of Murmurhash3.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n input : {'filename', 'file', 'content'}, default='content'\n - If `'filename'`, the sequence passed as an argument to fit is\n expected to be a list of filenames that need reading to fetch\n the raw content to analyze.\n\n - If `'file'`, the sequence items must have a 'read' method (file-like\n object) that is called to fetch the bytes in memory.\n\n - If `'content'`, the input is expected to be a sequence of items that\n can be of type string or byte.\n\n encoding : str, default='utf-8'\n If bytes or files are given to analyze, this encoding is used to\n decode.\n\n decode_error : {'strict', 'ignore', 'replace'}, default='strict'\n Instruction on what to do if a byte sequence is given to analyze that\n contains characters not of the given `encoding`. By default, it is\n 'strict', meaning that a UnicodeDecodeError will be raised. 
Other\n values are 'ignore' and 'replace'.\n\n strip_accents : {'ascii', 'unicode'}, default=None\n Remove accents and perform other character normalization\n during the preprocessing step.\n 'ascii' is a fast method that only works on characters that have\n a direct ASCII mapping.\n 'unicode' is a slightly slower method that works on any characters.\n None (default) does nothing.\n\n Both 'ascii' and 'unicode' use NFKD normalization from\n :func:`unicodedata.normalize`.\n\n lowercase : bool, default=True\n Convert all characters to lowercase before tokenizing.\n\n preprocessor : callable, default=None\n Override the preprocessing (string transformation) stage while\n preserving the tokenizing and n-grams generation steps.\n Only applies if ``analyzer is not callable``.\n\n tokenizer : callable, default=None\n Override the string tokenization step while preserving the\n preprocessing and n-grams generation steps.\n Only applies if ``analyzer == 'word'``.\n\n stop_words : {'english'}, list, default=None\n If 'english', a built-in stop word list for English is used.\n There are several known issues with 'english' and you should\n consider an alternative (see :ref:`stop_words`).\n\n If a list, that list is assumed to contain stop words, all of which\n will be removed from the resulting tokens.\n Only applies if ``analyzer == 'word'``.\n\n token_pattern : str, default=r\"(?u)\\\\b\\\\w\\\\w+\\\\b\"\n Regular expression denoting what constitutes a \"token\", only used\n if ``analyzer == 'word'``. The default regexp selects tokens of 2\n or more alphanumeric characters (punctuation is completely ignored\n and always treated as a token separator).\n\n If there is a capturing group in token_pattern then the\n captured group content, not the entire match, becomes the token.\n At most one capturing group is permitted.\n\n ngram_range : tuple (min_n, max_n), default=(1, 1)\n The lower and upper boundary of the range of n-values for different\n n-grams to be extracted. All values of n such that min_n <= n <= max_n\n will be used. For example an ``ngram_range`` of ``(1, 1)`` means only\n unigrams, ``(1, 2)`` means unigrams and bigrams, and ``(2, 2)`` means\n only bigrams.\n Only applies if ``analyzer is not callable``.\n\n analyzer : {'word', 'char', 'char_wb'} or callable, default='word'\n Whether the feature should be made of word or character n-grams.\n Option 'char_wb' creates character n-grams only from text inside\n word boundaries; n-grams at the edges of words are padded with space.\n\n If a callable is passed it is used to extract the sequence of features\n out of the raw, unprocessed input.\n\n .. versionchanged:: 0.21\n Since v0.21, if ``input`` is ``'filename'`` or ``'file'``, the data\n is first read from the file and then passed to the given callable\n analyzer.\n\n n_features : int, default=(2 ** 20)\n The number of features (columns) in the output matrices. Small numbers\n of features are likely to cause hash collisions, but large numbers\n will cause larger coefficient dimensions in linear learners.\n\n binary : bool, default=False\n If True, all non zero counts are set to 1. This is useful for discrete\n probabilistic models that model binary events rather than integer\n counts.\n\n norm : {'l1', 'l2'}, default='l2'\n Norm used to normalize term vectors. None for no normalization.\n\n alternate_sign : bool, default=True\n When True, an alternating sign is added to the features as to\n approximately conserve the inner product in the hashed space even for\n small n_features. 
This approach is similar to sparse random projection.\n\n .. versionadded:: 0.19\n\n dtype : type, default=np.float64\n Type of the matrix returned by fit_transform() or transform().\n\n See Also\n --------\n CountVectorizer : Convert a collection of text documents to a matrix of\n token counts.\n TfidfVectorizer : Convert a collection of raw documents to a matrix of\n TF-IDF features.\n\n Examples\n --------\n >>> from sklearn.feature_extraction.text import HashingVectorizer\n >>> corpus = [\n ... 'This is the first document.',\n ... 'This document is the second document.',\n ... 'And this is the third one.',\n ... 'Is this the first document?',\n ... ]\n >>> vectorizer = HashingVectorizer(n_features=2**4)\n >>> X = vectorizer.fit_transform(corpus)\n >>> print(X.shape)\n (4, 16)\n \"\"\"\n \n def __init__(self, *, input='content', encoding='utf-8', decode_error='strict', strip_accents=None, lowercase=True, preprocessor=None, tokenizer=None, stop_words=None, token_pattern='(?u)\\\\b\\\\w\\\\w+\\\\b', ngram_range=(1, 1), analyzer='word', n_features=2**20, binary=False, norm='l2', alternate_sign=True, dtype=np.float64):\n self.input = input\n self.encoding = encoding\n self.decode_error = decode_error\n self.strip_accents = strip_accents\n self.preprocessor = preprocessor\n self.tokenizer = tokenizer\n self.analyzer = analyzer\n self.lowercase = lowercase\n self.token_pattern = token_pattern\n self.stop_words = stop_words\n self.n_features = n_features\n self.ngram_range = ngram_range\n self.binary = binary\n self.norm = norm\n self.alternate_sign = alternate_sign\n self.dtype = dtype\n \n def partial_fit(self, X, y=None):\n \"\"\"No-op: this transformer is stateless.\n\n This method is just there to mark the fact that this transformer\n can work in a streaming setup.\n\n Parameters\n ----------\n X : ndarray of shape [n_samples, n_features]\n Training data.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n Returns\n -------\n self : object\n HashingVectorizer instance.\n \"\"\"\n return self\n \n def fit(self, X, y=None):\n \"\"\"No-op: this transformer is stateless.\n\n Parameters\n ----------\n X : ndarray of shape [n_samples, n_features]\n Training data.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n Returns\n -------\n self : object\n HashingVectorizer instance.\n \"\"\"\n if isinstance(X, str):\n raise ValueError('Iterable over raw text documents expected, string object received.')\n self._warn_for_unused_params()\n self._validate_params()\n self._get_hasher().fit(X, y=y)\n return self\n \n def transform(self, X):\n \"\"\"Transform a sequence of documents to a document-term matrix.\n\n Parameters\n ----------\n X : iterable over raw text documents, length = n_samples\n Samples. 
Each sample must be a text document (either bytes or\n unicode strings, file name or file object depending on the\n constructor argument) which will be tokenized and hashed.\n\n Returns\n -------\n X : sparse matrix of shape (n_samples, n_features)\n Document-term matrix.\n \"\"\"\n if isinstance(X, str):\n raise ValueError('Iterable over raw text documents expected, string object received.')\n self._validate_params()\n analyzer = self.build_analyzer()\n X = self._get_hasher().transform((analyzer(doc) for doc in X))\n if self.binary:\n X.data.fill(1)\n if self.norm is not None:\n X = normalize(X, norm=self.norm, copy=False)\n return X\n \n def fit_transform(self, X, y=None):\n \"\"\"Transform a sequence of documents to a document-term matrix.\n\n Parameters\n ----------\n X : iterable over raw text documents, length = n_samples\n Samples. Each sample must be a text document (either bytes or\n unicode strings, file name or file object depending on the\n constructor argument) which will be tokenized and hashed.\n y : any\n Ignored. This parameter exists only for compatibility with\n sklearn.pipeline.Pipeline.\n\n Returns\n -------\n X : sparse matrix of shape (n_samples, n_features)\n Document-term matrix.\n \"\"\"\n return self.fit(X, y).transform(X)\n \n def _get_hasher(self):\n return FeatureHasher(n_features=self.n_features, input_type='string', dtype=self.dtype, alternate_sign=self.alternate_sign)\n \n def _more_tags(self):\n return {'X_types': ['string']}\n" + "description": "Convert a collection of text documents to a matrix of token occurrences.\n\nIt turns a collection of text documents into a scipy.sparse matrix holding\ntoken occurrence counts (or binary occurrence information), possibly\nnormalized as token frequencies if norm='l1' or projected on the euclidean\nunit sphere if norm='l2'.\n\nThis text vectorizer implementation uses the hashing trick to find the\ntoken string name to feature integer index mapping.\n\nThis strategy has several advantages:\n\n- it is very low memory scalable to large datasets as there is no need to\n store a vocabulary dictionary in memory.\n\n- it is fast to pickle and un-pickle as it holds no state besides the\n constructor parameters.\n\n- it can be used in a streaming (partial fit) or parallel pipeline as there\n is no state computed during fit.\n\nThere are also a couple of cons (vs using a CountVectorizer with an\nin-memory vocabulary):\n\n- there is no way to compute the inverse transform (from feature indices to\n string feature names) which can be a problem when trying to introspect\n which features are most important to a model.\n\n- there can be collisions: distinct tokens can be mapped to the same\n feature index. However in practice this is rarely an issue if n_features\n is large enough (e.g. 
2 ** 18 for text classification problems).\n\n- no IDF weighting as this would render the transformer stateful.\n\nThe hash function employed is the signed 32-bit version of Murmurhash3.\n\nRead more in the :ref:`User Guide `.", + "docstring": "Convert a collection of text documents to a matrix of token occurrences.\n\n It turns a collection of text documents into a scipy.sparse matrix holding\n token occurrence counts (or binary occurrence information), possibly\n normalized as token frequencies if norm='l1' or projected on the euclidean\n unit sphere if norm='l2'.\n\n This text vectorizer implementation uses the hashing trick to find the\n token string name to feature integer index mapping.\n\n This strategy has several advantages:\n\n - it is very low memory scalable to large datasets as there is no need to\n store a vocabulary dictionary in memory.\n\n - it is fast to pickle and un-pickle as it holds no state besides the\n constructor parameters.\n\n - it can be used in a streaming (partial fit) or parallel pipeline as there\n is no state computed during fit.\n\n There are also a couple of cons (vs using a CountVectorizer with an\n in-memory vocabulary):\n\n - there is no way to compute the inverse transform (from feature indices to\n string feature names) which can be a problem when trying to introspect\n which features are most important to a model.\n\n - there can be collisions: distinct tokens can be mapped to the same\n feature index. However in practice this is rarely an issue if n_features\n is large enough (e.g. 2 ** 18 for text classification problems).\n\n - no IDF weighting as this would render the transformer stateful.\n\n The hash function employed is the signed 32-bit version of Murmurhash3.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n input : {'filename', 'file', 'content'}, default='content'\n - If `'filename'`, the sequence passed as an argument to fit is\n expected to be a list of filenames that need reading to fetch\n the raw content to analyze.\n\n - If `'file'`, the sequence items must have a 'read' method (file-like\n object) that is called to fetch the bytes in memory.\n\n - If `'content'`, the input is expected to be a sequence of items that\n can be of type string or byte.\n\n encoding : str, default='utf-8'\n If bytes or files are given to analyze, this encoding is used to\n decode.\n\n decode_error : {'strict', 'ignore', 'replace'}, default='strict'\n Instruction on what to do if a byte sequence is given to analyze that\n contains characters not of the given `encoding`. By default, it is\n 'strict', meaning that a UnicodeDecodeError will be raised. 
Other\n values are 'ignore' and 'replace'.\n\n strip_accents : {'ascii', 'unicode'}, default=None\n Remove accents and perform other character normalization\n during the preprocessing step.\n 'ascii' is a fast method that only works on characters that have\n a direct ASCII mapping.\n 'unicode' is a slightly slower method that works on any characters.\n None (default) does nothing.\n\n Both 'ascii' and 'unicode' use NFKD normalization from\n :func:`unicodedata.normalize`.\n\n lowercase : bool, default=True\n Convert all characters to lowercase before tokenizing.\n\n preprocessor : callable, default=None\n Override the preprocessing (string transformation) stage while\n preserving the tokenizing and n-grams generation steps.\n Only applies if ``analyzer`` is not callable.\n\n tokenizer : callable, default=None\n Override the string tokenization step while preserving the\n preprocessing and n-grams generation steps.\n Only applies if ``analyzer == 'word'``.\n\n stop_words : {'english'}, list, default=None\n If 'english', a built-in stop word list for English is used.\n There are several known issues with 'english' and you should\n consider an alternative (see :ref:`stop_words`).\n\n If a list, that list is assumed to contain stop words, all of which\n will be removed from the resulting tokens.\n Only applies if ``analyzer == 'word'``.\n\n token_pattern : str, default=r\"(?u)\\\\b\\\\w\\\\w+\\\\b\"\n Regular expression denoting what constitutes a \"token\", only used\n if ``analyzer == 'word'``. The default regexp selects tokens of 2\n or more alphanumeric characters (punctuation is completely ignored\n and always treated as a token separator).\n\n If there is a capturing group in token_pattern then the\n captured group content, not the entire match, becomes the token.\n At most one capturing group is permitted.\n\n ngram_range : tuple (min_n, max_n), default=(1, 1)\n The lower and upper boundary of the range of n-values for different\n n-grams to be extracted. All values of n such that min_n <= n <= max_n\n will be used. For example an ``ngram_range`` of ``(1, 1)`` means only\n unigrams, ``(1, 2)`` means unigrams and bigrams, and ``(2, 2)`` means\n only bigrams.\n Only applies if ``analyzer`` is not callable.\n\n analyzer : {'word', 'char', 'char_wb'} or callable, default='word'\n Whether the feature should be made of word or character n-grams.\n Option 'char_wb' creates character n-grams only from text inside\n word boundaries; n-grams at the edges of words are padded with space.\n\n If a callable is passed it is used to extract the sequence of features\n out of the raw, unprocessed input.\n\n .. versionchanged:: 0.21\n Since v0.21, if ``input`` is ``'filename'`` or ``'file'``, the data\n is first read from the file and then passed to the given callable\n analyzer.\n\n n_features : int, default=(2 ** 20)\n The number of features (columns) in the output matrices. Small numbers\n of features are likely to cause hash collisions, but large numbers\n will cause larger coefficient dimensions in linear learners.\n\n binary : bool, default=False\n If True, all non zero counts are set to 1. This is useful for discrete\n probabilistic models that model binary events rather than integer\n counts.\n\n norm : {'l1', 'l2'}, default='l2'\n Norm used to normalize term vectors. None for no normalization.\n\n alternate_sign : bool, default=True\n When True, an alternating sign is added to the features as to\n approximately conserve the inner product in the hashed space even for\n small n_features. 
This approach is similar to sparse random projection.\n\n .. versionadded:: 0.19\n\n dtype : type, default=np.float64\n Type of the matrix returned by fit_transform() or transform().\n\n See Also\n --------\n CountVectorizer : Convert a collection of text documents to a matrix of\n token counts.\n TfidfVectorizer : Convert a collection of raw documents to a matrix of\n TF-IDF features.\n\n Examples\n --------\n >>> from sklearn.feature_extraction.text import HashingVectorizer\n >>> corpus = [\n ... 'This is the first document.',\n ... 'This document is the second document.',\n ... 'And this is the third one.',\n ... 'Is this the first document?',\n ... ]\n >>> vectorizer = HashingVectorizer(n_features=2**4)\n >>> X = vectorizer.fit_transform(corpus)\n >>> print(X.shape)\n (4, 16)\n ", + "source_code": "\n\nclass HashingVectorizer(TransformerMixin, _VectorizerMixin, BaseEstimator):\n \"\"\"Convert a collection of text documents to a matrix of token occurrences.\n\n It turns a collection of text documents into a scipy.sparse matrix holding\n token occurrence counts (or binary occurrence information), possibly\n normalized as token frequencies if norm='l1' or projected on the euclidean\n unit sphere if norm='l2'.\n\n This text vectorizer implementation uses the hashing trick to find the\n token string name to feature integer index mapping.\n\n This strategy has several advantages:\n\n - it is very low memory scalable to large datasets as there is no need to\n store a vocabulary dictionary in memory.\n\n - it is fast to pickle and un-pickle as it holds no state besides the\n constructor parameters.\n\n - it can be used in a streaming (partial fit) or parallel pipeline as there\n is no state computed during fit.\n\n There are also a couple of cons (vs using a CountVectorizer with an\n in-memory vocabulary):\n\n - there is no way to compute the inverse transform (from feature indices to\n string feature names) which can be a problem when trying to introspect\n which features are most important to a model.\n\n - there can be collisions: distinct tokens can be mapped to the same\n feature index. However in practice this is rarely an issue if n_features\n is large enough (e.g. 2 ** 18 for text classification problems).\n\n - no IDF weighting as this would render the transformer stateful.\n\n The hash function employed is the signed 32-bit version of Murmurhash3.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n input : {'filename', 'file', 'content'}, default='content'\n - If `'filename'`, the sequence passed as an argument to fit is\n expected to be a list of filenames that need reading to fetch\n the raw content to analyze.\n\n - If `'file'`, the sequence items must have a 'read' method (file-like\n object) that is called to fetch the bytes in memory.\n\n - If `'content'`, the input is expected to be a sequence of items that\n can be of type string or byte.\n\n encoding : str, default='utf-8'\n If bytes or files are given to analyze, this encoding is used to\n decode.\n\n decode_error : {'strict', 'ignore', 'replace'}, default='strict'\n Instruction on what to do if a byte sequence is given to analyze that\n contains characters not of the given `encoding`. By default, it is\n 'strict', meaning that a UnicodeDecodeError will be raised. 
Other\n values are 'ignore' and 'replace'.\n\n strip_accents : {'ascii', 'unicode'}, default=None\n Remove accents and perform other character normalization\n during the preprocessing step.\n 'ascii' is a fast method that only works on characters that have\n a direct ASCII mapping.\n 'unicode' is a slightly slower method that works on any characters.\n None (default) does nothing.\n\n Both 'ascii' and 'unicode' use NFKD normalization from\n :func:`unicodedata.normalize`.\n\n lowercase : bool, default=True\n Convert all characters to lowercase before tokenizing.\n\n preprocessor : callable, default=None\n Override the preprocessing (string transformation) stage while\n preserving the tokenizing and n-grams generation steps.\n Only applies if ``analyzer`` is not callable.\n\n tokenizer : callable, default=None\n Override the string tokenization step while preserving the\n preprocessing and n-grams generation steps.\n Only applies if ``analyzer == 'word'``.\n\n stop_words : {'english'}, list, default=None\n If 'english', a built-in stop word list for English is used.\n There are several known issues with 'english' and you should\n consider an alternative (see :ref:`stop_words`).\n\n If a list, that list is assumed to contain stop words, all of which\n will be removed from the resulting tokens.\n Only applies if ``analyzer == 'word'``.\n\n token_pattern : str, default=r\"(?u)\\\\b\\\\w\\\\w+\\\\b\"\n Regular expression denoting what constitutes a \"token\", only used\n if ``analyzer == 'word'``. The default regexp selects tokens of 2\n or more alphanumeric characters (punctuation is completely ignored\n and always treated as a token separator).\n\n If there is a capturing group in token_pattern then the\n captured group content, not the entire match, becomes the token.\n At most one capturing group is permitted.\n\n ngram_range : tuple (min_n, max_n), default=(1, 1)\n The lower and upper boundary of the range of n-values for different\n n-grams to be extracted. All values of n such that min_n <= n <= max_n\n will be used. For example an ``ngram_range`` of ``(1, 1)`` means only\n unigrams, ``(1, 2)`` means unigrams and bigrams, and ``(2, 2)`` means\n only bigrams.\n Only applies if ``analyzer`` is not callable.\n\n analyzer : {'word', 'char', 'char_wb'} or callable, default='word'\n Whether the feature should be made of word or character n-grams.\n Option 'char_wb' creates character n-grams only from text inside\n word boundaries; n-grams at the edges of words are padded with space.\n\n If a callable is passed it is used to extract the sequence of features\n out of the raw, unprocessed input.\n\n .. versionchanged:: 0.21\n Since v0.21, if ``input`` is ``'filename'`` or ``'file'``, the data\n is first read from the file and then passed to the given callable\n analyzer.\n\n n_features : int, default=(2 ** 20)\n The number of features (columns) in the output matrices. Small numbers\n of features are likely to cause hash collisions, but large numbers\n will cause larger coefficient dimensions in linear learners.\n\n binary : bool, default=False\n If True, all non zero counts are set to 1. This is useful for discrete\n probabilistic models that model binary events rather than integer\n counts.\n\n norm : {'l1', 'l2'}, default='l2'\n Norm used to normalize term vectors. None for no normalization.\n\n alternate_sign : bool, default=True\n When True, an alternating sign is added to the features as to\n approximately conserve the inner product in the hashed space even for\n small n_features. 
This approach is similar to sparse random projection.\n\n .. versionadded:: 0.19\n\n dtype : type, default=np.float64\n Type of the matrix returned by fit_transform() or transform().\n\n See Also\n --------\n CountVectorizer : Convert a collection of text documents to a matrix of\n token counts.\n TfidfVectorizer : Convert a collection of raw documents to a matrix of\n TF-IDF features.\n\n Examples\n --------\n >>> from sklearn.feature_extraction.text import HashingVectorizer\n >>> corpus = [\n ... 'This is the first document.',\n ... 'This document is the second document.',\n ... 'And this is the third one.',\n ... 'Is this the first document?',\n ... ]\n >>> vectorizer = HashingVectorizer(n_features=2**4)\n >>> X = vectorizer.fit_transform(corpus)\n >>> print(X.shape)\n (4, 16)\n \"\"\"\n \n def __init__(self, *, input='content', encoding='utf-8', decode_error='strict', strip_accents=None, lowercase=True, preprocessor=None, tokenizer=None, stop_words=None, token_pattern='(?u)\\\\b\\\\w\\\\w+\\\\b', ngram_range=(1, 1), analyzer='word', n_features=2**20, binary=False, norm='l2', alternate_sign=True, dtype=np.float64):\n self.input = input\n self.encoding = encoding\n self.decode_error = decode_error\n self.strip_accents = strip_accents\n self.preprocessor = preprocessor\n self.tokenizer = tokenizer\n self.analyzer = analyzer\n self.lowercase = lowercase\n self.token_pattern = token_pattern\n self.stop_words = stop_words\n self.n_features = n_features\n self.ngram_range = ngram_range\n self.binary = binary\n self.norm = norm\n self.alternate_sign = alternate_sign\n self.dtype = dtype\n \n def partial_fit(self, X, y=None):\n \"\"\"No-op: this transformer is stateless.\n\n This method is just there to mark the fact that this transformer\n can work in a streaming setup.\n\n Parameters\n ----------\n X : ndarray of shape [n_samples, n_features]\n Training data.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n Returns\n -------\n self : object\n HashingVectorizer instance.\n \"\"\"\n return self\n \n def fit(self, X, y=None):\n \"\"\"No-op: this transformer is stateless.\n\n Parameters\n ----------\n X : ndarray of shape [n_samples, n_features]\n Training data.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n Returns\n -------\n self : object\n HashingVectorizer instance.\n \"\"\"\n if isinstance(X, str):\n raise ValueError('Iterable over raw text documents expected, string object received.')\n self._warn_for_unused_params()\n self._validate_params()\n self._get_hasher().fit(X, y=y)\n return self\n \n def transform(self, X):\n \"\"\"Transform a sequence of documents to a document-term matrix.\n\n Parameters\n ----------\n X : iterable over raw text documents, length = n_samples\n Samples. 
Each sample must be a text document (either bytes or\n unicode strings, file name or file object depending on the\n constructor argument) which will be tokenized and hashed.\n\n Returns\n -------\n X : sparse matrix of shape (n_samples, n_features)\n Document-term matrix.\n \"\"\"\n if isinstance(X, str):\n raise ValueError('Iterable over raw text documents expected, string object received.')\n self._validate_params()\n analyzer = self.build_analyzer()\n X = self._get_hasher().transform((analyzer(doc) for doc in X))\n if self.binary:\n X.data.fill(1)\n if self.norm is not None:\n X = normalize(X, norm=self.norm, copy=False)\n return X\n \n def fit_transform(self, X, y=None):\n \"\"\"Transform a sequence of documents to a document-term matrix.\n\n Parameters\n ----------\n X : iterable over raw text documents, length = n_samples\n Samples. Each sample must be a text document (either bytes or\n unicode strings, file name or file object depending on the\n constructor argument) which will be tokenized and hashed.\n y : any\n Ignored. This parameter exists only for compatibility with\n sklearn.pipeline.Pipeline.\n\n Returns\n -------\n X : sparse matrix of shape (n_samples, n_features)\n Document-term matrix.\n \"\"\"\n return self.fit(X, y).transform(X)\n \n def _get_hasher(self):\n return FeatureHasher(n_features=self.n_features, input_type='string', dtype=self.dtype, alternate_sign=self.alternate_sign)\n \n def _more_tags(self):\n return {'X_types': ['string']}\n" }, { "name": "TfidfTransformer", @@ -22375,7 +22441,7 @@ "sklearn.feature_extraction.text.TfidfTransformer._more_tags" ], "is_public": true, - "description": "Transform a count matrix to a normalized tf or tf-idf representation.\n\nTf means term-frequency while tf-idf means term-frequency times inverse document-frequency. This is a common term weighting scheme in information retrieval, that has also found good use in document classification. The goal of using tf-idf instead of the raw frequencies of occurrence of a token in a given document is to scale down the impact of tokens that occur very frequently in a given corpus and that are hence empirically less informative than features that occur in a small fraction of the training corpus. The formula that is used to compute the tf-idf for a term t of a document d in a document set is tf-idf(t, d) = tf(t, d) * idf(t), and the idf is computed as idf(t) = log [ n / df(t) ] + 1 (if ``smooth_idf=False``), where n is the total number of documents in the document set and df(t) is the document frequency of t; the document frequency is the number of documents in the document set that contain the term t. The effect of adding \"1\" to the idf in the equation above is that terms with zero idf, i.e., terms that occur in all documents in a training set, will not be entirely ignored. (Note that the idf formula above differs from the standard textbook notation that defines the idf as idf(t) = log [ n / (df(t) + 1) ]). If ``smooth_idf=True`` (the default), the constant \"1\" is added to the numerator and denominator of the idf as if an extra document was seen containing every term in the collection exactly once, which prevents zero divisions: idf(t) = log [ (1 + n) / (1 + df(t)) ] + 1. Furthermore, the formulas used to compute tf and idf depend on parameter settings that correspond to the SMART notation used in IR as follows: Tf is \"n\" (natural) by default, \"l\" (logarithmic) when ``sublinear_tf=True``. Idf is \"t\" when use_idf is given, \"n\" (none) otherwise. 
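The HashingVectorizer entry above stresses that the transformer keeps no fitted state, so batches of documents can be processed independently in a streaming setup. A minimal sketch of that usage pattern follows; the two-batch corpus and the small `n_features` value are illustrative assumptions, not part of the documented source.

```python
# Minimal sketch: HashingVectorizer keeps no fitted state, so batches can
# be transformed independently and stacked (a streaming-friendly pattern).
from sklearn.feature_extraction.text import HashingVectorizer
import scipy.sparse as sp

# Assumed toy batches; a real corpus would arrive incrementally.
batches = [
    ["first batch of documents", "hashing needs no vocabulary"],
    ["second batch arrives later", "same columns are produced"],
]

# Deliberately small n_features to keep the output readable; the docstring
# recommends much larger values (e.g. 2 ** 18) to limit hash collisions.
vectorizer = HashingVectorizer(n_features=2 ** 8, alternate_sign=True)

# transform() works without fit() because the token-to-column mapping is a
# pure hash function rather than a learned vocabulary.
parts = [vectorizer.transform(batch) for batch in batches]
X = sp.vstack(parts)
print(X.shape)  # (4, 256)
```

Because nothing is learned during fit, every batch maps tokens to the same columns, which is what makes stacking the per-batch matrices meaningful.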
Normalization is \"c\" (cosine) when ``norm='l2'``, \"n\" (none) when ``norm=None``. Read more in the :ref:`User Guide `.", + "description": "Transform a count matrix to a normalized tf or tf-idf representation.\n\nTf means term-frequency while tf-idf means term-frequency times inverse\ndocument-frequency. This is a common term weighting scheme in information\nretrieval, that has also found good use in document classification.\n\nThe goal of using tf-idf instead of the raw frequencies of occurrence of a\ntoken in a given document is to scale down the impact of tokens that occur\nvery frequently in a given corpus and that are hence empirically less\ninformative than features that occur in a small fraction of the training\ncorpus.\n\nThe formula that is used to compute the tf-idf for a term t of a document d\nin a document set is tf-idf(t, d) = tf(t, d) * idf(t), and the idf is\ncomputed as idf(t) = log [ n / df(t) ] + 1 (if ``smooth_idf=False``), where\nn is the total number of documents in the document set and df(t) is the\ndocument frequency of t; the document frequency is the number of documents\nin the document set that contain the term t. The effect of adding \"1\" to\nthe idf in the equation above is that terms with zero idf, i.e., terms\nthat occur in all documents in a training set, will not be entirely\nignored.\n(Note that the idf formula above differs from the standard textbook\nnotation that defines the idf as\nidf(t) = log [ n / (df(t) + 1) ]).\n\nIf ``smooth_idf=True`` (the default), the constant \"1\" is added to the\nnumerator and denominator of the idf as if an extra document was seen\ncontaining every term in the collection exactly once, which prevents\nzero divisions: idf(t) = log [ (1 + n) / (1 + df(t)) ] + 1.\n\nFurthermore, the formulas used to compute tf and idf depend\non parameter settings that correspond to the SMART notation used in IR\nas follows:\n\nTf is \"n\" (natural) by default, \"l\" (logarithmic) when\n``sublinear_tf=True``.\nIdf is \"t\" when use_idf is given, \"n\" (none) otherwise.\nNormalization is \"c\" (cosine) when ``norm='l2'``, \"n\" (none)\nwhen ``norm=None``.\n\nRead more in the :ref:`User Guide `.", "docstring": "Transform a count matrix to a normalized tf or tf-idf representation.\n\n Tf means term-frequency while tf-idf means term-frequency times inverse\n document-frequency. This is a common term weighting scheme in information\n retrieval, that has also found good use in document classification.\n\n The goal of using tf-idf instead of the raw frequencies of occurrence of a\n token in a given document is to scale down the impact of tokens that occur\n very frequently in a given corpus and that are hence empirically less\n informative than features that occur in a small fraction of the training\n corpus.\n\n The formula that is used to compute the tf-idf for a term t of a document d\n in a document set is tf-idf(t, d) = tf(t, d) * idf(t), and the idf is\n computed as idf(t) = log [ n / df(t) ] + 1 (if ``smooth_idf=False``), where\n n is the total number of documents in the document set and df(t) is the\n document frequency of t; the document frequency is the number of documents\n in the document set that contain the term t. 
The effect of adding \"1\" to\n the idf in the equation above is that terms with zero idf, i.e., terms\n that occur in all documents in a training set, will not be entirely\n ignored.\n (Note that the idf formula above differs from the standard textbook\n notation that defines the idf as\n idf(t) = log [ n / (df(t) + 1) ]).\n\n If ``smooth_idf=True`` (the default), the constant \"1\" is added to the\n numerator and denominator of the idf as if an extra document was seen\n containing every term in the collection exactly once, which prevents\n zero divisions: idf(t) = log [ (1 + n) / (1 + df(t)) ] + 1.\n\n Furthermore, the formulas used to compute tf and idf depend\n on parameter settings that correspond to the SMART notation used in IR\n as follows:\n\n Tf is \"n\" (natural) by default, \"l\" (logarithmic) when\n ``sublinear_tf=True``.\n Idf is \"t\" when use_idf is given, \"n\" (none) otherwise.\n Normalization is \"c\" (cosine) when ``norm='l2'``, \"n\" (none)\n when ``norm=None``.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n norm : {'l1', 'l2'}, default='l2'\n Each output row will have unit norm, either:\n\n - 'l2': Sum of squares of vector elements is 1. The cosine\n similarity between two vectors is their dot product when l2 norm has\n been applied.\n - 'l1': Sum of absolute values of vector elements is 1.\n See :func:`preprocessing.normalize`.\n\n use_idf : bool, default=True\n Enable inverse-document-frequency reweighting. If False, idf(t) = 1.\n\n smooth_idf : bool, default=True\n Smooth idf weights by adding one to document frequencies, as if an\n extra document was seen containing every term in the collection\n exactly once. Prevents zero divisions.\n\n sublinear_tf : bool, default=False\n Apply sublinear tf scaling, i.e. replace tf with 1 + log(tf).\n\n Attributes\n ----------\n idf_ : array of shape (n_features)\n The inverse document frequency (IDF) vector; only defined\n if ``use_idf`` is True.\n\n .. versionadded:: 0.20\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 1.0\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n CountVectorizer : Transforms text into a sparse matrix of n-gram counts.\n\n TfidfVectorizer : Convert a collection of raw documents to a matrix of\n TF-IDF features.\n\n HashingVectorizer : Convert a collection of text documents to a matrix\n of token occurrences.\n\n References\n ----------\n .. [Yates2011] R. Baeza-Yates and B. Ribeiro-Neto (2011). Modern\n Information Retrieval. Addison Wesley, pp. 68-74.\n\n .. [MRS2008] C.D. Manning, P. Raghavan and H. Sch\u00fctze (2008).\n Introduction to Information Retrieval. Cambridge University\n Press, pp. 118-120.\n\n Examples\n --------\n >>> from sklearn.feature_extraction.text import TfidfTransformer\n >>> from sklearn.feature_extraction.text import CountVectorizer\n >>> from sklearn.pipeline import Pipeline\n >>> corpus = ['this is the first document',\n ... 'this document is the second document',\n ... 'and this is the third one',\n ... 'is this the first document']\n >>> vocabulary = ['this', 'document', 'first', 'is', 'second', 'the',\n ... 'and', 'one']\n >>> pipe = Pipeline([('count', CountVectorizer(vocabulary=vocabulary)),\n ... 
('tfid', TfidfTransformer())]).fit(corpus)\n >>> pipe['count'].transform(corpus).toarray()\n array([[1, 1, 1, 1, 0, 1, 0, 0],\n [1, 2, 0, 1, 1, 1, 0, 0],\n [1, 0, 0, 1, 0, 1, 1, 1],\n [1, 1, 1, 1, 0, 1, 0, 0]])\n >>> pipe['tfid'].idf_\n array([1. , 1.22314355, 1.51082562, 1. , 1.91629073,\n 1. , 1.91629073, 1.91629073])\n >>> pipe.transform(corpus).shape\n (4, 8)\n ", "source_code": "\n\nclass TfidfTransformer(_OneToOneFeatureMixin, TransformerMixin, BaseEstimator):\n \"\"\"Transform a count matrix to a normalized tf or tf-idf representation.\n\n Tf means term-frequency while tf-idf means term-frequency times inverse\n document-frequency. This is a common term weighting scheme in information\n retrieval, that has also found good use in document classification.\n\n The goal of using tf-idf instead of the raw frequencies of occurrence of a\n token in a given document is to scale down the impact of tokens that occur\n very frequently in a given corpus and that are hence empirically less\n informative than features that occur in a small fraction of the training\n corpus.\n\n The formula that is used to compute the tf-idf for a term t of a document d\n in a document set is tf-idf(t, d) = tf(t, d) * idf(t), and the idf is\n computed as idf(t) = log [ n / df(t) ] + 1 (if ``smooth_idf=False``), where\n n is the total number of documents in the document set and df(t) is the\n document frequency of t; the document frequency is the number of documents\n in the document set that contain the term t. The effect of adding \"1\" to\n the idf in the equation above is that terms with zero idf, i.e., terms\n that occur in all documents in a training set, will not be entirely\n ignored.\n (Note that the idf formula above differs from the standard textbook\n notation that defines the idf as\n idf(t) = log [ n / (df(t) + 1) ]).\n\n If ``smooth_idf=True`` (the default), the constant \"1\" is added to the\n numerator and denominator of the idf as if an extra document was seen\n containing every term in the collection exactly once, which prevents\n zero divisions: idf(t) = log [ (1 + n) / (1 + df(t)) ] + 1.\n\n Furthermore, the formulas used to compute tf and idf depend\n on parameter settings that correspond to the SMART notation used in IR\n as follows:\n\n Tf is \"n\" (natural) by default, \"l\" (logarithmic) when\n ``sublinear_tf=True``.\n Idf is \"t\" when use_idf is given, \"n\" (none) otherwise.\n Normalization is \"c\" (cosine) when ``norm='l2'``, \"n\" (none)\n when ``norm=None``.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n norm : {'l1', 'l2'}, default='l2'\n Each output row will have unit norm, either:\n\n - 'l2': Sum of squares of vector elements is 1. The cosine\n similarity between two vectors is their dot product when l2 norm has\n been applied.\n - 'l1': Sum of absolute values of vector elements is 1.\n See :func:`preprocessing.normalize`.\n\n use_idf : bool, default=True\n Enable inverse-document-frequency reweighting. If False, idf(t) = 1.\n\n smooth_idf : bool, default=True\n Smooth idf weights by adding one to document frequencies, as if an\n extra document was seen containing every term in the collection\n exactly once. Prevents zero divisions.\n\n sublinear_tf : bool, default=False\n Apply sublinear tf scaling, i.e. replace tf with 1 + log(tf).\n\n Attributes\n ----------\n idf_ : array of shape (n_features)\n The inverse document frequency (IDF) vector; only defined\n if ``use_idf`` is True.\n\n .. 
versionadded:: 0.20\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 1.0\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n CountVectorizer : Transforms text into a sparse matrix of n-gram counts.\n\n TfidfVectorizer : Convert a collection of raw documents to a matrix of\n TF-IDF features.\n\n HashingVectorizer : Convert a collection of text documents to a matrix\n of token occurrences.\n\n References\n ----------\n .. [Yates2011] R. Baeza-Yates and B. Ribeiro-Neto (2011). Modern\n Information Retrieval. Addison Wesley, pp. 68-74.\n\n .. [MRS2008] C.D. Manning, P. Raghavan and H. Sch\u00fctze (2008).\n Introduction to Information Retrieval. Cambridge University\n Press, pp. 118-120.\n\n Examples\n --------\n >>> from sklearn.feature_extraction.text import TfidfTransformer\n >>> from sklearn.feature_extraction.text import CountVectorizer\n >>> from sklearn.pipeline import Pipeline\n >>> corpus = ['this is the first document',\n ... 'this document is the second document',\n ... 'and this is the third one',\n ... 'is this the first document']\n >>> vocabulary = ['this', 'document', 'first', 'is', 'second', 'the',\n ... 'and', 'one']\n >>> pipe = Pipeline([('count', CountVectorizer(vocabulary=vocabulary)),\n ... ('tfid', TfidfTransformer())]).fit(corpus)\n >>> pipe['count'].transform(corpus).toarray()\n array([[1, 1, 1, 1, 0, 1, 0, 0],\n [1, 2, 0, 1, 1, 1, 0, 0],\n [1, 0, 0, 1, 0, 1, 1, 1],\n [1, 1, 1, 1, 0, 1, 0, 0]])\n >>> pipe['tfid'].idf_\n array([1. , 1.22314355, 1.51082562, 1. , 1.91629073,\n 1. , 1.91629073, 1.91629073])\n >>> pipe.transform(corpus).shape\n (4, 8)\n \"\"\"\n \n def __init__(self, *, norm='l2', use_idf=True, smooth_idf=True, sublinear_tf=False):\n self.norm = norm\n self.use_idf = use_idf\n self.smooth_idf = smooth_idf\n self.sublinear_tf = sublinear_tf\n \n def fit(self, X, y=None):\n \"\"\"Learn the idf vector (global term weights).\n\n Parameters\n ----------\n X : sparse matrix of shape n_samples, n_features)\n A matrix of term/token counts.\n\n y : None\n This parameter is not needed to compute tf-idf.\n\n Returns\n -------\n self : object\n Fitted transformer.\n \"\"\"\n X = self._validate_data(X, accept_sparse=('csr', 'csc'), accept_large_sparse=not _IS_32BIT)\n if not sp.issparse(X):\n X = sp.csr_matrix(X)\n dtype = X.dtype if X.dtype in FLOAT_DTYPES else np.float64\n if self.use_idf:\n (n_samples, n_features) = X.shape\n df = _document_frequency(X)\n df = df.astype(dtype, **_astype_copy_false(df))\n df += int(self.smooth_idf)\n n_samples += int(self.smooth_idf)\n idf = np.log(n_samples / df) + 1\n self._idf_diag = sp.diags(idf, offsets=0, shape=(n_features, n_features), format='csr', dtype=dtype)\n return self\n \n def transform(self, X, copy=True):\n \"\"\"Transform a count matrix to a tf or tf-idf representation.\n\n Parameters\n ----------\n X : sparse matrix of (n_samples, n_features)\n A matrix of term/token counts.\n\n copy : bool, default=True\n Whether to copy X and operate on the copy or perform in-place\n operations.\n\n Returns\n -------\n vectors : sparse matrix of shape (n_samples, n_features)\n Tf-idf-weighted document-term matrix.\n \"\"\"\n X = self._validate_data(X, accept_sparse='csr', dtype=FLOAT_DTYPES, copy=copy, reset=False)\n if not sp.issparse(X):\n X = sp.csr_matrix(X, dtype=np.float64)\n if self.sublinear_tf:\n np.log(X.data, 
X.data)\n X.data += 1\n if self.use_idf:\n check_is_fitted(self, attributes=['idf_'], msg='idf vector is not fitted')\n X = X * self._idf_diag\n if self.norm:\n X = normalize(X, norm=self.norm, copy=False)\n return X\n \n @property\n def idf_(self):\n \"\"\"Inverse document frequency vector, only defined if `use_idf=True`.\n\n Returns\n -------\n ndarray of shape (n_features,)\n \"\"\"\n return np.ravel(self._idf_diag.sum(axis=0))\n \n @idf_.setter\n def idf_(self, value):\n value = np.asarray(value, dtype=np.float64)\n n_features = value.shape[0]\n self._idf_diag = sp.spdiags(value, diags=0, m=n_features, n=n_features, format='csr')\n \n def _more_tags(self):\n return {'X_types': ['2darray', 'sparse']}\n" }, @@ -22403,9 +22469,9 @@ "sklearn.feature_extraction.text.TfidfVectorizer._more_tags" ], "is_public": true, - "description": "Convert a collection of raw documents to a matrix of TF-IDF features.\n\nEquivalent to :class:`CountVectorizer` followed by :class:`TfidfTransformer`. Read more in the :ref:`User Guide `.", - "docstring": "Convert a collection of raw documents to a matrix of TF-IDF features.\n\n Equivalent to :class:`CountVectorizer` followed by\n :class:`TfidfTransformer`.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n input : {'filename', 'file', 'content'}, default='content'\n - If `'filename'`, the sequence passed as an argument to fit is\n expected to be a list of filenames that need reading to fetch\n the raw content to analyze.\n\n - If `'file'`, the sequence items must have a 'read' method (file-like\n object) that is called to fetch the bytes in memory.\n\n - If `'content'`, the input is expected to be a sequence of items that\n can be of type string or byte.\n\n encoding : str, default='utf-8'\n If bytes or files are given to analyze, this encoding is used to\n decode.\n\n decode_error : {'strict', 'ignore', 'replace'}, default='strict'\n Instruction on what to do if a byte sequence is given to analyze that\n contains characters not of the given `encoding`. By default, it is\n 'strict', meaning that a UnicodeDecodeError will be raised. Other\n values are 'ignore' and 'replace'.\n\n strip_accents : {'ascii', 'unicode'}, default=None\n Remove accents and perform other character normalization\n during the preprocessing step.\n 'ascii' is a fast method that only works on characters that have\n an direct ASCII mapping.\n 'unicode' is a slightly slower method that works on any characters.\n None (default) does nothing.\n\n Both 'ascii' and 'unicode' use NFKD normalization from\n :func:`unicodedata.normalize`.\n\n lowercase : bool, default=True\n Convert all characters to lowercase before tokenizing.\n\n preprocessor : callable, default=None\n Override the preprocessing (string transformation) stage while\n preserving the tokenizing and n-grams generation steps.\n Only applies if ``analyzer is not callable``.\n\n tokenizer : callable, default=None\n Override the string tokenization step while preserving the\n preprocessing and n-grams generation steps.\n Only applies if ``analyzer == 'word'``.\n\n analyzer : {'word', 'char', 'char_wb'} or callable, default='word'\n Whether the feature should be made of word or character n-grams.\n Option 'char_wb' creates character n-grams only from text inside\n word boundaries; n-grams at the edges of words are padded with space.\n\n If a callable is passed it is used to extract the sequence of features\n out of the raw, unprocessed input.\n\n .. 
versionchanged:: 0.21\n Since v0.21, if ``input`` is ``'filename'`` or ``'file'``, the data\n is first read from the file and then passed to the given callable\n analyzer.\n\n stop_words : {'english'}, list, default=None\n If a string, it is passed to _check_stop_list and the appropriate stop\n list is returned. 'english' is currently the only supported string\n value.\n There are several known issues with 'english' and you should\n consider an alternative (see :ref:`stop_words`).\n\n If a list, that list is assumed to contain stop words, all of which\n will be removed from the resulting tokens.\n Only applies if ``analyzer == 'word'``.\n\n If None, no stop words will be used. max_df can be set to a value\n in the range [0.7, 1.0) to automatically detect and filter stop\n words based on intra corpus document frequency of terms.\n\n token_pattern : str, default=r\"(?u)\\\\b\\\\w\\\\w+\\\\b\"\n Regular expression denoting what constitutes a \"token\", only used\n if ``analyzer == 'word'``. The default regexp selects tokens of 2\n or more alphanumeric characters (punctuation is completely ignored\n and always treated as a token separator).\n\n If there is a capturing group in token_pattern then the\n captured group content, not the entire match, becomes the token.\n At most one capturing group is permitted.\n\n ngram_range : tuple (min_n, max_n), default=(1, 1)\n The lower and upper boundary of the range of n-values for different\n n-grams to be extracted. All values of n such that min_n <= n <= max_n\n will be used. For example an ``ngram_range`` of ``(1, 1)`` means only\n unigrams, ``(1, 2)`` means unigrams and bigrams, and ``(2, 2)`` means\n only bigrams.\n Only applies if ``analyzer is not callable``.\n\n max_df : float or int, default=1.0\n When building the vocabulary ignore terms that have a document\n frequency strictly higher than the given threshold (corpus-specific\n stop words).\n If float in range [0.0, 1.0], the parameter represents a proportion of\n documents, integer absolute counts.\n This parameter is ignored if vocabulary is not None.\n\n min_df : float or int, default=1\n When building the vocabulary ignore terms that have a document\n frequency strictly lower than the given threshold. This value is also\n called cut-off in the literature.\n If float in range of [0.0, 1.0], the parameter represents a proportion\n of documents, integer absolute counts.\n This parameter is ignored if vocabulary is not None.\n\n max_features : int, default=None\n If not None, build a vocabulary that only consider the top\n max_features ordered by term frequency across the corpus.\n\n This parameter is ignored if vocabulary is not None.\n\n vocabulary : Mapping or iterable, default=None\n Either a Mapping (e.g., a dict) where keys are terms and values are\n indices in the feature matrix, or an iterable over terms. If not\n given, a vocabulary is determined from the input documents.\n\n binary : bool, default=False\n If True, all non-zero term counts are set to 1. This does not mean\n outputs will have only 0/1 values, only that the tf term in tf-idf\n is binary. (Set idf and normalization to False to get 0/1 outputs).\n\n dtype : dtype, default=float64\n Type of the matrix returned by fit_transform() or transform().\n\n norm : {'l1', 'l2'}, default='l2'\n Each output row will have unit norm, either:\n\n - 'l2': Sum of squares of vector elements is 1. 
The cosine\n similarity between two vectors is their dot product when l2 norm has\n been applied.\n - 'l1': Sum of absolute values of vector elements is 1.\n See :func:`preprocessing.normalize`.\n\n use_idf : bool, default=True\n Enable inverse-document-frequency reweighting. If False, idf(t) = 1.\n\n smooth_idf : bool, default=True\n Smooth idf weights by adding one to document frequencies, as if an\n extra document was seen containing every term in the collection\n exactly once. Prevents zero divisions.\n\n sublinear_tf : bool, default=False\n Apply sublinear tf scaling, i.e. replace tf with 1 + log(tf).\n\n Attributes\n ----------\n vocabulary_ : dict\n A mapping of terms to feature indices.\n\n fixed_vocabulary_ : bool\n True if a fixed vocabulary of term to indices mapping\n is provided by the user.\n\n idf_ : array of shape (n_features,)\n The inverse document frequency (IDF) vector; only defined\n if ``use_idf`` is True.\n\n stop_words_ : set\n Terms that were ignored because they either:\n\n - occurred in too many documents (`max_df`)\n - occurred in too few documents (`min_df`)\n - were cut off by feature selection (`max_features`).\n\n This is only available if no vocabulary was given.\n\n See Also\n --------\n CountVectorizer : Transforms text into a sparse matrix of n-gram counts.\n\n TfidfTransformer : Performs the TF-IDF transformation from a provided\n matrix of counts.\n\n Notes\n -----\n The ``stop_words_`` attribute can get large and increase the model size\n when pickling. This attribute is provided only for introspection and can\n be safely removed using delattr or set to None before pickling.\n\n Examples\n --------\n >>> from sklearn.feature_extraction.text import TfidfVectorizer\n >>> corpus = [\n ... 'This is the first document.',\n ... 'This document is the second document.',\n ... 'And this is the third one.',\n ... 'Is this the first document?',\n ... ]\n >>> vectorizer = TfidfVectorizer()\n >>> X = vectorizer.fit_transform(corpus)\n >>> vectorizer.get_feature_names_out()\n array(['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third',\n 'this'], ...)\n >>> print(X.shape)\n (4, 9)\n ", - "source_code": "\n\nclass TfidfVectorizer(CountVectorizer):\n \"\"\"Convert a collection of raw documents to a matrix of TF-IDF features.\n\n Equivalent to :class:`CountVectorizer` followed by\n :class:`TfidfTransformer`.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n input : {'filename', 'file', 'content'}, default='content'\n - If `'filename'`, the sequence passed as an argument to fit is\n expected to be a list of filenames that need reading to fetch\n the raw content to analyze.\n\n - If `'file'`, the sequence items must have a 'read' method (file-like\n object) that is called to fetch the bytes in memory.\n\n - If `'content'`, the input is expected to be a sequence of items that\n can be of type string or byte.\n\n encoding : str, default='utf-8'\n If bytes or files are given to analyze, this encoding is used to\n decode.\n\n decode_error : {'strict', 'ignore', 'replace'}, default='strict'\n Instruction on what to do if a byte sequence is given to analyze that\n contains characters not of the given `encoding`. By default, it is\n 'strict', meaning that a UnicodeDecodeError will be raised. 
Other\n values are 'ignore' and 'replace'.\n\n strip_accents : {'ascii', 'unicode'}, default=None\n Remove accents and perform other character normalization\n during the preprocessing step.\n 'ascii' is a fast method that only works on characters that have\n an direct ASCII mapping.\n 'unicode' is a slightly slower method that works on any characters.\n None (default) does nothing.\n\n Both 'ascii' and 'unicode' use NFKD normalization from\n :func:`unicodedata.normalize`.\n\n lowercase : bool, default=True\n Convert all characters to lowercase before tokenizing.\n\n preprocessor : callable, default=None\n Override the preprocessing (string transformation) stage while\n preserving the tokenizing and n-grams generation steps.\n Only applies if ``analyzer is not callable``.\n\n tokenizer : callable, default=None\n Override the string tokenization step while preserving the\n preprocessing and n-grams generation steps.\n Only applies if ``analyzer == 'word'``.\n\n analyzer : {'word', 'char', 'char_wb'} or callable, default='word'\n Whether the feature should be made of word or character n-grams.\n Option 'char_wb' creates character n-grams only from text inside\n word boundaries; n-grams at the edges of words are padded with space.\n\n If a callable is passed it is used to extract the sequence of features\n out of the raw, unprocessed input.\n\n .. versionchanged:: 0.21\n Since v0.21, if ``input`` is ``'filename'`` or ``'file'``, the data\n is first read from the file and then passed to the given callable\n analyzer.\n\n stop_words : {'english'}, list, default=None\n If a string, it is passed to _check_stop_list and the appropriate stop\n list is returned. 'english' is currently the only supported string\n value.\n There are several known issues with 'english' and you should\n consider an alternative (see :ref:`stop_words`).\n\n If a list, that list is assumed to contain stop words, all of which\n will be removed from the resulting tokens.\n Only applies if ``analyzer == 'word'``.\n\n If None, no stop words will be used. max_df can be set to a value\n in the range [0.7, 1.0) to automatically detect and filter stop\n words based on intra corpus document frequency of terms.\n\n token_pattern : str, default=r\"(?u)\\\\b\\\\w\\\\w+\\\\b\"\n Regular expression denoting what constitutes a \"token\", only used\n if ``analyzer == 'word'``. The default regexp selects tokens of 2\n or more alphanumeric characters (punctuation is completely ignored\n and always treated as a token separator).\n\n If there is a capturing group in token_pattern then the\n captured group content, not the entire match, becomes the token.\n At most one capturing group is permitted.\n\n ngram_range : tuple (min_n, max_n), default=(1, 1)\n The lower and upper boundary of the range of n-values for different\n n-grams to be extracted. All values of n such that min_n <= n <= max_n\n will be used. 
For example an ``ngram_range`` of ``(1, 1)`` means only\n unigrams, ``(1, 2)`` means unigrams and bigrams, and ``(2, 2)`` means\n only bigrams.\n Only applies if ``analyzer is not callable``.\n\n max_df : float or int, default=1.0\n When building the vocabulary ignore terms that have a document\n frequency strictly higher than the given threshold (corpus-specific\n stop words).\n If float in range [0.0, 1.0], the parameter represents a proportion of\n documents, integer absolute counts.\n This parameter is ignored if vocabulary is not None.\n\n min_df : float or int, default=1\n When building the vocabulary ignore terms that have a document\n frequency strictly lower than the given threshold. This value is also\n called cut-off in the literature.\n If float in range of [0.0, 1.0], the parameter represents a proportion\n of documents, integer absolute counts.\n This parameter is ignored if vocabulary is not None.\n\n max_features : int, default=None\n If not None, build a vocabulary that only consider the top\n max_features ordered by term frequency across the corpus.\n\n This parameter is ignored if vocabulary is not None.\n\n vocabulary : Mapping or iterable, default=None\n Either a Mapping (e.g., a dict) where keys are terms and values are\n indices in the feature matrix, or an iterable over terms. If not\n given, a vocabulary is determined from the input documents.\n\n binary : bool, default=False\n If True, all non-zero term counts are set to 1. This does not mean\n outputs will have only 0/1 values, only that the tf term in tf-idf\n is binary. (Set idf and normalization to False to get 0/1 outputs).\n\n dtype : dtype, default=float64\n Type of the matrix returned by fit_transform() or transform().\n\n norm : {'l1', 'l2'}, default='l2'\n Each output row will have unit norm, either:\n\n - 'l2': Sum of squares of vector elements is 1. The cosine\n similarity between two vectors is their dot product when l2 norm has\n been applied.\n - 'l1': Sum of absolute values of vector elements is 1.\n See :func:`preprocessing.normalize`.\n\n use_idf : bool, default=True\n Enable inverse-document-frequency reweighting. If False, idf(t) = 1.\n\n smooth_idf : bool, default=True\n Smooth idf weights by adding one to document frequencies, as if an\n extra document was seen containing every term in the collection\n exactly once. Prevents zero divisions.\n\n sublinear_tf : bool, default=False\n Apply sublinear tf scaling, i.e. replace tf with 1 + log(tf).\n\n Attributes\n ----------\n vocabulary_ : dict\n A mapping of terms to feature indices.\n\n fixed_vocabulary_ : bool\n True if a fixed vocabulary of term to indices mapping\n is provided by the user.\n\n idf_ : array of shape (n_features,)\n The inverse document frequency (IDF) vector; only defined\n if ``use_idf`` is True.\n\n stop_words_ : set\n Terms that were ignored because they either:\n\n - occurred in too many documents (`max_df`)\n - occurred in too few documents (`min_df`)\n - were cut off by feature selection (`max_features`).\n\n This is only available if no vocabulary was given.\n\n See Also\n --------\n CountVectorizer : Transforms text into a sparse matrix of n-gram counts.\n\n TfidfTransformer : Performs the TF-IDF transformation from a provided\n matrix of counts.\n\n Notes\n -----\n The ``stop_words_`` attribute can get large and increase the model size\n when pickling. 
This attribute is provided only for introspection and can\n be safely removed using delattr or set to None before pickling.\n\n Examples\n --------\n >>> from sklearn.feature_extraction.text import TfidfVectorizer\n >>> corpus = [\n ... 'This is the first document.',\n ... 'This document is the second document.',\n ... 'And this is the third one.',\n ... 'Is this the first document?',\n ... ]\n >>> vectorizer = TfidfVectorizer()\n >>> X = vectorizer.fit_transform(corpus)\n >>> vectorizer.get_feature_names_out()\n array(['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third',\n 'this'], ...)\n >>> print(X.shape)\n (4, 9)\n \"\"\"\n \n def __init__(self, *, input='content', encoding='utf-8', decode_error='strict', strip_accents=None, lowercase=True, preprocessor=None, tokenizer=None, analyzer='word', stop_words=None, token_pattern='(?u)\\\\b\\\\w\\\\w+\\\\b', ngram_range=(1, 1), max_df=1.0, min_df=1, max_features=None, vocabulary=None, binary=False, dtype=np.float64, norm='l2', use_idf=True, smooth_idf=True, sublinear_tf=False):\n super().__init__(input=input, encoding=encoding, decode_error=decode_error, strip_accents=strip_accents, lowercase=lowercase, preprocessor=preprocessor, tokenizer=tokenizer, analyzer=analyzer, stop_words=stop_words, token_pattern=token_pattern, ngram_range=ngram_range, max_df=max_df, min_df=min_df, max_features=max_features, vocabulary=vocabulary, binary=binary, dtype=dtype)\n self._tfidf = TfidfTransformer(norm=norm, use_idf=use_idf, smooth_idf=smooth_idf, sublinear_tf=sublinear_tf)\n \n @property\n def norm(self):\n \"\"\"Norm of each row output, can be either \"l1\" or \"l2\".\"\"\"\n return self._tfidf.norm\n \n @norm.setter\n def norm(self, value):\n self._tfidf.norm = value\n \n @property\n def use_idf(self):\n \"\"\"Whether or not IDF re-weighting is used.\"\"\"\n return self._tfidf.use_idf\n \n @use_idf.setter\n def use_idf(self, value):\n self._tfidf.use_idf = value\n \n @property\n def smooth_idf(self):\n \"\"\"Whether or not IDF weights are smoothed.\"\"\"\n return self._tfidf.smooth_idf\n \n @smooth_idf.setter\n def smooth_idf(self, value):\n self._tfidf.smooth_idf = value\n \n @property\n def sublinear_tf(self):\n \"\"\"Whether or not sublinear TF scaling is applied.\"\"\"\n return self._tfidf.sublinear_tf\n \n @sublinear_tf.setter\n def sublinear_tf(self, value):\n self._tfidf.sublinear_tf = value\n \n @property\n def idf_(self):\n \"\"\"Inverse document frequency vector, only defined if `use_idf=True`.\n\n Returns\n -------\n ndarray of shape (n_features,)\n \"\"\"\n return self._tfidf.idf_\n \n @idf_.setter\n def idf_(self, value):\n self._validate_vocabulary()\n if hasattr(self, 'vocabulary_'):\n if len(self.vocabulary_) != len(value):\n raise ValueError('idf length = %d must be equal to vocabulary size = %d' % (len(value), len(self.vocabulary)))\n self._tfidf.idf_ = value\n \n def _check_params(self):\n if self.dtype not in FLOAT_DTYPES:\n warnings.warn(\"Only {} 'dtype' should be used. 
{} 'dtype' will be converted to np.float64.\".format(FLOAT_DTYPES, self.dtype), UserWarning)\n \n def fit(self, raw_documents, y=None):\n \"\"\"Learn vocabulary and idf from training set.\n\n Parameters\n ----------\n raw_documents : iterable\n An iterable which generates either str, unicode or file objects.\n\n y : None\n This parameter is not needed to compute tfidf.\n\n Returns\n -------\n self : object\n Fitted vectorizer.\n \"\"\"\n self._check_params()\n self._warn_for_unused_params()\n X = super().fit_transform(raw_documents)\n self._tfidf.fit(X)\n return self\n \n def fit_transform(self, raw_documents, y=None):\n \"\"\"Learn vocabulary and idf, return document-term matrix.\n\n This is equivalent to fit followed by transform, but more efficiently\n implemented.\n\n Parameters\n ----------\n raw_documents : iterable\n An iterable which generates either str, unicode or file objects.\n\n y : None\n This parameter is ignored.\n\n Returns\n -------\n X : sparse matrix of (n_samples, n_features)\n Tf-idf-weighted document-term matrix.\n \"\"\"\n self._check_params()\n X = super().fit_transform(raw_documents)\n self._tfidf.fit(X)\n return self._tfidf.transform(X, copy=False)\n \n def transform(self, raw_documents):\n \"\"\"Transform documents to document-term matrix.\n\n Uses the vocabulary and document frequencies (df) learned by fit (or\n fit_transform).\n\n Parameters\n ----------\n raw_documents : iterable\n An iterable which generates either str, unicode or file objects.\n\n Returns\n -------\n X : sparse matrix of (n_samples, n_features)\n Tf-idf-weighted document-term matrix.\n \"\"\"\n check_is_fitted(self, msg='The TF-IDF vectorizer is not fitted')\n X = super().transform(raw_documents)\n return self._tfidf.transform(X, copy=False)\n \n def _more_tags(self):\n return {'X_types': ['string'], '_skip_test': True}\n" + "description": "Convert a collection of raw documents to a matrix of TF-IDF features.\n\nEquivalent to :class:`CountVectorizer` followed by\n:class:`TfidfTransformer`.\n\nRead more in the :ref:`User Guide `.", + "docstring": "Convert a collection of raw documents to a matrix of TF-IDF features.\n\n Equivalent to :class:`CountVectorizer` followed by\n :class:`TfidfTransformer`.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n input : {'filename', 'file', 'content'}, default='content'\n - If `'filename'`, the sequence passed as an argument to fit is\n expected to be a list of filenames that need reading to fetch\n the raw content to analyze.\n\n - If `'file'`, the sequence items must have a 'read' method (file-like\n object) that is called to fetch the bytes in memory.\n\n - If `'content'`, the input is expected to be a sequence of items that\n can be of type string or byte.\n\n encoding : str, default='utf-8'\n If bytes or files are given to analyze, this encoding is used to\n decode.\n\n decode_error : {'strict', 'ignore', 'replace'}, default='strict'\n Instruction on what to do if a byte sequence is given to analyze that\n contains characters not of the given `encoding`. By default, it is\n 'strict', meaning that a UnicodeDecodeError will be raised. 
Other\n values are 'ignore' and 'replace'.\n\n strip_accents : {'ascii', 'unicode'}, default=None\n Remove accents and perform other character normalization\n during the preprocessing step.\n 'ascii' is a fast method that only works on characters that have\n an direct ASCII mapping.\n 'unicode' is a slightly slower method that works on any characters.\n None (default) does nothing.\n\n Both 'ascii' and 'unicode' use NFKD normalization from\n :func:`unicodedata.normalize`.\n\n lowercase : bool, default=True\n Convert all characters to lowercase before tokenizing.\n\n preprocessor : callable, default=None\n Override the preprocessing (string transformation) stage while\n preserving the tokenizing and n-grams generation steps.\n Only applies if ``analyzer`` is not callable.\n\n tokenizer : callable, default=None\n Override the string tokenization step while preserving the\n preprocessing and n-grams generation steps.\n Only applies if ``analyzer == 'word'``.\n\n analyzer : {'word', 'char', 'char_wb'} or callable, default='word'\n Whether the feature should be made of word or character n-grams.\n Option 'char_wb' creates character n-grams only from text inside\n word boundaries; n-grams at the edges of words are padded with space.\n\n If a callable is passed it is used to extract the sequence of features\n out of the raw, unprocessed input.\n\n .. versionchanged:: 0.21\n Since v0.21, if ``input`` is ``'filename'`` or ``'file'``, the data\n is first read from the file and then passed to the given callable\n analyzer.\n\n stop_words : {'english'}, list, default=None\n If a string, it is passed to _check_stop_list and the appropriate stop\n list is returned. 'english' is currently the only supported string\n value.\n There are several known issues with 'english' and you should\n consider an alternative (see :ref:`stop_words`).\n\n If a list, that list is assumed to contain stop words, all of which\n will be removed from the resulting tokens.\n Only applies if ``analyzer == 'word'``.\n\n If None, no stop words will be used. max_df can be set to a value\n in the range [0.7, 1.0) to automatically detect and filter stop\n words based on intra corpus document frequency of terms.\n\n token_pattern : str, default=r\"(?u)\\\\b\\\\w\\\\w+\\\\b\"\n Regular expression denoting what constitutes a \"token\", only used\n if ``analyzer == 'word'``. The default regexp selects tokens of 2\n or more alphanumeric characters (punctuation is completely ignored\n and always treated as a token separator).\n\n If there is a capturing group in token_pattern then the\n captured group content, not the entire match, becomes the token.\n At most one capturing group is permitted.\n\n ngram_range : tuple (min_n, max_n), default=(1, 1)\n The lower and upper boundary of the range of n-values for different\n n-grams to be extracted. All values of n such that min_n <= n <= max_n\n will be used. 
For example an ``ngram_range`` of ``(1, 1)`` means only\n unigrams, ``(1, 2)`` means unigrams and bigrams, and ``(2, 2)`` means\n only bigrams.\n Only applies if ``analyzer`` is not callable.\n\n max_df : float or int, default=1.0\n When building the vocabulary ignore terms that have a document\n frequency strictly higher than the given threshold (corpus-specific\n stop words).\n If float in range [0.0, 1.0], the parameter represents a proportion of\n documents, integer absolute counts.\n This parameter is ignored if vocabulary is not None.\n\n min_df : float or int, default=1\n When building the vocabulary ignore terms that have a document\n frequency strictly lower than the given threshold. This value is also\n called cut-off in the literature.\n If float in range of [0.0, 1.0], the parameter represents a proportion\n of documents, integer absolute counts.\n This parameter is ignored if vocabulary is not None.\n\n max_features : int, default=None\n If not None, build a vocabulary that only consider the top\n max_features ordered by term frequency across the corpus.\n\n This parameter is ignored if vocabulary is not None.\n\n vocabulary : Mapping or iterable, default=None\n Either a Mapping (e.g., a dict) where keys are terms and values are\n indices in the feature matrix, or an iterable over terms. If not\n given, a vocabulary is determined from the input documents.\n\n binary : bool, default=False\n If True, all non-zero term counts are set to 1. This does not mean\n outputs will have only 0/1 values, only that the tf term in tf-idf\n is binary. (Set idf and normalization to False to get 0/1 outputs).\n\n dtype : dtype, default=float64\n Type of the matrix returned by fit_transform() or transform().\n\n norm : {'l1', 'l2'}, default='l2'\n Each output row will have unit norm, either:\n\n - 'l2': Sum of squares of vector elements is 1. The cosine\n similarity between two vectors is their dot product when l2 norm has\n been applied.\n - 'l1': Sum of absolute values of vector elements is 1.\n See :func:`preprocessing.normalize`.\n\n use_idf : bool, default=True\n Enable inverse-document-frequency reweighting. If False, idf(t) = 1.\n\n smooth_idf : bool, default=True\n Smooth idf weights by adding one to document frequencies, as if an\n extra document was seen containing every term in the collection\n exactly once. Prevents zero divisions.\n\n sublinear_tf : bool, default=False\n Apply sublinear tf scaling, i.e. replace tf with 1 + log(tf).\n\n Attributes\n ----------\n vocabulary_ : dict\n A mapping of terms to feature indices.\n\n fixed_vocabulary_ : bool\n True if a fixed vocabulary of term to indices mapping\n is provided by the user.\n\n idf_ : array of shape (n_features,)\n The inverse document frequency (IDF) vector; only defined\n if ``use_idf`` is True.\n\n stop_words_ : set\n Terms that were ignored because they either:\n\n - occurred in too many documents (`max_df`)\n - occurred in too few documents (`min_df`)\n - were cut off by feature selection (`max_features`).\n\n This is only available if no vocabulary was given.\n\n See Also\n --------\n CountVectorizer : Transforms text into a sparse matrix of n-gram counts.\n\n TfidfTransformer : Performs the TF-IDF transformation from a provided\n matrix of counts.\n\n Notes\n -----\n The ``stop_words_`` attribute can get large and increase the model size\n when pickling. 
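Illustrative sketch, not part of the recorded sklearn source: the entry above states that TfidfVectorizer is equivalent to CountVectorizer followed by TfidfTransformer, and that can be checked on the toy corpus from its own docstring example.

# Hedged example: verify the documented CountVectorizer + TfidfTransformer equivalence.
import numpy as np
from sklearn.feature_extraction.text import (
    CountVectorizer,
    TfidfTransformer,
    TfidfVectorizer,
)

corpus = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
    "Is this the first document?",
]

# One-step tf-idf features.
X_direct = TfidfVectorizer().fit_transform(corpus)

# Two-step route: raw counts, then tf-idf reweighting.
X_two_step = TfidfTransformer().fit_transform(
    CountVectorizer().fit_transform(corpus)
)

# Both routes yield the same tf-idf matrix (same vocabulary, same weighting).
assert np.allclose(X_direct.toarray(), X_two_step.toarray())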
This attribute is provided only for introspection and can\n be safely removed using delattr or set to None before pickling.\n\n Examples\n --------\n >>> from sklearn.feature_extraction.text import TfidfVectorizer\n >>> corpus = [\n ... 'This is the first document.',\n ... 'This document is the second document.',\n ... 'And this is the third one.',\n ... 'Is this the first document?',\n ... ]\n >>> vectorizer = TfidfVectorizer()\n >>> X = vectorizer.fit_transform(corpus)\n >>> vectorizer.get_feature_names_out()\n array(['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third',\n 'this'], ...)\n >>> print(X.shape)\n (4, 9)\n ", + "source_code": "\n\nclass TfidfVectorizer(CountVectorizer):\n \"\"\"Convert a collection of raw documents to a matrix of TF-IDF features.\n\n Equivalent to :class:`CountVectorizer` followed by\n :class:`TfidfTransformer`.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n input : {'filename', 'file', 'content'}, default='content'\n - If `'filename'`, the sequence passed as an argument to fit is\n expected to be a list of filenames that need reading to fetch\n the raw content to analyze.\n\n - If `'file'`, the sequence items must have a 'read' method (file-like\n object) that is called to fetch the bytes in memory.\n\n - If `'content'`, the input is expected to be a sequence of items that\n can be of type string or byte.\n\n encoding : str, default='utf-8'\n If bytes or files are given to analyze, this encoding is used to\n decode.\n\n decode_error : {'strict', 'ignore', 'replace'}, default='strict'\n Instruction on what to do if a byte sequence is given to analyze that\n contains characters not of the given `encoding`. By default, it is\n 'strict', meaning that a UnicodeDecodeError will be raised. Other\n values are 'ignore' and 'replace'.\n\n strip_accents : {'ascii', 'unicode'}, default=None\n Remove accents and perform other character normalization\n during the preprocessing step.\n 'ascii' is a fast method that only works on characters that have\n an direct ASCII mapping.\n 'unicode' is a slightly slower method that works on any characters.\n None (default) does nothing.\n\n Both 'ascii' and 'unicode' use NFKD normalization from\n :func:`unicodedata.normalize`.\n\n lowercase : bool, default=True\n Convert all characters to lowercase before tokenizing.\n\n preprocessor : callable, default=None\n Override the preprocessing (string transformation) stage while\n preserving the tokenizing and n-grams generation steps.\n Only applies if ``analyzer`` is not callable.\n\n tokenizer : callable, default=None\n Override the string tokenization step while preserving the\n preprocessing and n-grams generation steps.\n Only applies if ``analyzer == 'word'``.\n\n analyzer : {'word', 'char', 'char_wb'} or callable, default='word'\n Whether the feature should be made of word or character n-grams.\n Option 'char_wb' creates character n-grams only from text inside\n word boundaries; n-grams at the edges of words are padded with space.\n\n If a callable is passed it is used to extract the sequence of features\n out of the raw, unprocessed input.\n\n .. versionchanged:: 0.21\n Since v0.21, if ``input`` is ``'filename'`` or ``'file'``, the data\n is first read from the file and then passed to the given callable\n analyzer.\n\n stop_words : {'english'}, list, default=None\n If a string, it is passed to _check_stop_list and the appropriate stop\n list is returned. 
'english' is currently the only supported string\n value.\n There are several known issues with 'english' and you should\n consider an alternative (see :ref:`stop_words`).\n\n If a list, that list is assumed to contain stop words, all of which\n will be removed from the resulting tokens.\n Only applies if ``analyzer == 'word'``.\n\n If None, no stop words will be used. max_df can be set to a value\n in the range [0.7, 1.0) to automatically detect and filter stop\n words based on intra corpus document frequency of terms.\n\n token_pattern : str, default=r\"(?u)\\\\b\\\\w\\\\w+\\\\b\"\n Regular expression denoting what constitutes a \"token\", only used\n if ``analyzer == 'word'``. The default regexp selects tokens of 2\n or more alphanumeric characters (punctuation is completely ignored\n and always treated as a token separator).\n\n If there is a capturing group in token_pattern then the\n captured group content, not the entire match, becomes the token.\n At most one capturing group is permitted.\n\n ngram_range : tuple (min_n, max_n), default=(1, 1)\n The lower and upper boundary of the range of n-values for different\n n-grams to be extracted. All values of n such that min_n <= n <= max_n\n will be used. For example an ``ngram_range`` of ``(1, 1)`` means only\n unigrams, ``(1, 2)`` means unigrams and bigrams, and ``(2, 2)`` means\n only bigrams.\n Only applies if ``analyzer`` is not callable.\n\n max_df : float or int, default=1.0\n When building the vocabulary ignore terms that have a document\n frequency strictly higher than the given threshold (corpus-specific\n stop words).\n If float in range [0.0, 1.0], the parameter represents a proportion of\n documents, integer absolute counts.\n This parameter is ignored if vocabulary is not None.\n\n min_df : float or int, default=1\n When building the vocabulary ignore terms that have a document\n frequency strictly lower than the given threshold. This value is also\n called cut-off in the literature.\n If float in range of [0.0, 1.0], the parameter represents a proportion\n of documents, integer absolute counts.\n This parameter is ignored if vocabulary is not None.\n\n max_features : int, default=None\n If not None, build a vocabulary that only consider the top\n max_features ordered by term frequency across the corpus.\n\n This parameter is ignored if vocabulary is not None.\n\n vocabulary : Mapping or iterable, default=None\n Either a Mapping (e.g., a dict) where keys are terms and values are\n indices in the feature matrix, or an iterable over terms. If not\n given, a vocabulary is determined from the input documents.\n\n binary : bool, default=False\n If True, all non-zero term counts are set to 1. This does not mean\n outputs will have only 0/1 values, only that the tf term in tf-idf\n is binary. (Set idf and normalization to False to get 0/1 outputs).\n\n dtype : dtype, default=float64\n Type of the matrix returned by fit_transform() or transform().\n\n norm : {'l1', 'l2'}, default='l2'\n Each output row will have unit norm, either:\n\n - 'l2': Sum of squares of vector elements is 1. The cosine\n similarity between two vectors is their dot product when l2 norm has\n been applied.\n - 'l1': Sum of absolute values of vector elements is 1.\n See :func:`preprocessing.normalize`.\n\n use_idf : bool, default=True\n Enable inverse-document-frequency reweighting. 
If False, idf(t) = 1.\n\n smooth_idf : bool, default=True\n Smooth idf weights by adding one to document frequencies, as if an\n extra document was seen containing every term in the collection\n exactly once. Prevents zero divisions.\n\n sublinear_tf : bool, default=False\n Apply sublinear tf scaling, i.e. replace tf with 1 + log(tf).\n\n Attributes\n ----------\n vocabulary_ : dict\n A mapping of terms to feature indices.\n\n fixed_vocabulary_ : bool\n True if a fixed vocabulary of term to indices mapping\n is provided by the user.\n\n idf_ : array of shape (n_features,)\n The inverse document frequency (IDF) vector; only defined\n if ``use_idf`` is True.\n\n stop_words_ : set\n Terms that were ignored because they either:\n\n - occurred in too many documents (`max_df`)\n - occurred in too few documents (`min_df`)\n - were cut off by feature selection (`max_features`).\n\n This is only available if no vocabulary was given.\n\n See Also\n --------\n CountVectorizer : Transforms text into a sparse matrix of n-gram counts.\n\n TfidfTransformer : Performs the TF-IDF transformation from a provided\n matrix of counts.\n\n Notes\n -----\n The ``stop_words_`` attribute can get large and increase the model size\n when pickling. This attribute is provided only for introspection and can\n be safely removed using delattr or set to None before pickling.\n\n Examples\n --------\n >>> from sklearn.feature_extraction.text import TfidfVectorizer\n >>> corpus = [\n ... 'This is the first document.',\n ... 'This document is the second document.',\n ... 'And this is the third one.',\n ... 'Is this the first document?',\n ... ]\n >>> vectorizer = TfidfVectorizer()\n >>> X = vectorizer.fit_transform(corpus)\n >>> vectorizer.get_feature_names_out()\n array(['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third',\n 'this'], ...)\n >>> print(X.shape)\n (4, 9)\n \"\"\"\n \n def __init__(self, *, input='content', encoding='utf-8', decode_error='strict', strip_accents=None, lowercase=True, preprocessor=None, tokenizer=None, analyzer='word', stop_words=None, token_pattern='(?u)\\\\b\\\\w\\\\w+\\\\b', ngram_range=(1, 1), max_df=1.0, min_df=1, max_features=None, vocabulary=None, binary=False, dtype=np.float64, norm='l2', use_idf=True, smooth_idf=True, sublinear_tf=False):\n super().__init__(input=input, encoding=encoding, decode_error=decode_error, strip_accents=strip_accents, lowercase=lowercase, preprocessor=preprocessor, tokenizer=tokenizer, analyzer=analyzer, stop_words=stop_words, token_pattern=token_pattern, ngram_range=ngram_range, max_df=max_df, min_df=min_df, max_features=max_features, vocabulary=vocabulary, binary=binary, dtype=dtype)\n self._tfidf = TfidfTransformer(norm=norm, use_idf=use_idf, smooth_idf=smooth_idf, sublinear_tf=sublinear_tf)\n \n @property\n def norm(self):\n \"\"\"Norm of each row output, can be either \"l1\" or \"l2\".\"\"\"\n return self._tfidf.norm\n \n @norm.setter\n def norm(self, value):\n self._tfidf.norm = value\n \n @property\n def use_idf(self):\n \"\"\"Whether or not IDF re-weighting is used.\"\"\"\n return self._tfidf.use_idf\n \n @use_idf.setter\n def use_idf(self, value):\n self._tfidf.use_idf = value\n \n @property\n def smooth_idf(self):\n \"\"\"Whether or not IDF weights are smoothed.\"\"\"\n return self._tfidf.smooth_idf\n \n @smooth_idf.setter\n def smooth_idf(self, value):\n self._tfidf.smooth_idf = value\n \n @property\n def sublinear_tf(self):\n \"\"\"Whether or not sublinear TF scaling is applied.\"\"\"\n return self._tfidf.sublinear_tf\n \n 
@sublinear_tf.setter\n def sublinear_tf(self, value):\n self._tfidf.sublinear_tf = value\n \n @property\n def idf_(self):\n \"\"\"Inverse document frequency vector, only defined if `use_idf=True`.\n\n Returns\n -------\n ndarray of shape (n_features,)\n \"\"\"\n return self._tfidf.idf_\n \n @idf_.setter\n def idf_(self, value):\n self._validate_vocabulary()\n if hasattr(self, 'vocabulary_'):\n if len(self.vocabulary_) != len(value):\n raise ValueError('idf length = %d must be equal to vocabulary size = %d' % (len(value), len(self.vocabulary)))\n self._tfidf.idf_ = value\n \n def _check_params(self):\n if self.dtype not in FLOAT_DTYPES:\n warnings.warn(\"Only {} 'dtype' should be used. {} 'dtype' will be converted to np.float64.\".format(FLOAT_DTYPES, self.dtype), UserWarning)\n \n def fit(self, raw_documents, y=None):\n \"\"\"Learn vocabulary and idf from training set.\n\n Parameters\n ----------\n raw_documents : iterable\n An iterable which generates either str, unicode or file objects.\n\n y : None\n This parameter is not needed to compute tfidf.\n\n Returns\n -------\n self : object\n Fitted vectorizer.\n \"\"\"\n self._check_params()\n self._warn_for_unused_params()\n X = super().fit_transform(raw_documents)\n self._tfidf.fit(X)\n return self\n \n def fit_transform(self, raw_documents, y=None):\n \"\"\"Learn vocabulary and idf, return document-term matrix.\n\n This is equivalent to fit followed by transform, but more efficiently\n implemented.\n\n Parameters\n ----------\n raw_documents : iterable\n An iterable which generates either str, unicode or file objects.\n\n y : None\n This parameter is ignored.\n\n Returns\n -------\n X : sparse matrix of (n_samples, n_features)\n Tf-idf-weighted document-term matrix.\n \"\"\"\n self._check_params()\n X = super().fit_transform(raw_documents)\n self._tfidf.fit(X)\n return self._tfidf.transform(X, copy=False)\n \n def transform(self, raw_documents):\n \"\"\"Transform documents to document-term matrix.\n\n Uses the vocabulary and document frequencies (df) learned by fit (or\n fit_transform).\n\n Parameters\n ----------\n raw_documents : iterable\n An iterable which generates either str, unicode or file objects.\n\n Returns\n -------\n X : sparse matrix of (n_samples, n_features)\n Tf-idf-weighted document-term matrix.\n \"\"\"\n check_is_fitted(self, msg='The TF-IDF vectorizer is not fitted')\n X = super().transform(raw_documents)\n return self._tfidf.transform(X, copy=False)\n \n def _more_tags(self):\n return {'X_types': ['string'], '_skip_test': True}\n" }, { "name": "_VectorizerMixin", @@ -22441,13 +22507,14 @@ "sklearn.feature_selection._base.SelectorMixin.get_support", "sklearn.feature_selection._base.SelectorMixin._get_support_mask", "sklearn.feature_selection._base.SelectorMixin.transform", + "sklearn.feature_selection._base.SelectorMixin._transform", "sklearn.feature_selection._base.SelectorMixin.inverse_transform", "sklearn.feature_selection._base.SelectorMixin.get_feature_names_out" ], "is_public": true, - "description": "Transformer mixin that performs feature selection given a support mask\n\nThis mixin provides a feature selector implementation with `transform` and `inverse_transform` functionality given an implementation of `_get_support_mask`.", + "description": "Transformer mixin that performs feature selection given a support mask\n\nThis mixin provides a feature selector implementation with `transform` and\n`inverse_transform` functionality given an implementation of\n`_get_support_mask`.", "docstring": "\n Transformer mixin 
that performs feature selection given a support mask\n\n This mixin provides a feature selector implementation with `transform` and\n `inverse_transform` functionality given an implementation of\n `_get_support_mask`.\n ", - "source_code": "\n\nclass SelectorMixin(TransformerMixin, metaclass=ABCMeta):\n \"\"\"\n Transformer mixin that performs feature selection given a support mask\n\n This mixin provides a feature selector implementation with `transform` and\n `inverse_transform` functionality given an implementation of\n `_get_support_mask`.\n \"\"\"\n \n def get_support(self, indices=False):\n \"\"\"\n Get a mask, or integer index, of the features selected.\n\n Parameters\n ----------\n indices : bool, default=False\n If True, the return value will be an array of integers, rather\n than a boolean mask.\n\n Returns\n -------\n support : array\n An index that selects the retained features from a feature vector.\n If `indices` is False, this is a boolean array of shape\n [# input features], in which an element is True iff its\n corresponding feature is selected for retention. If `indices` is\n True, this is an integer array of shape [# output features] whose\n values are indices into the input feature vector.\n \"\"\"\n mask = self._get_support_mask()\n return mask if not indices else np.where(mask)[0]\n \n @abstractmethod\n def _get_support_mask(self):\n \"\"\"\n Get the boolean mask indicating which features are selected\n\n Returns\n -------\n support : boolean array of shape [# input features]\n An element is True iff its corresponding feature is selected for\n retention.\n \"\"\"\n \n \n def transform(self, X):\n \"\"\"Reduce X to the selected features.\n\n Parameters\n ----------\n X : array of shape [n_samples, n_features]\n The input samples.\n\n Returns\n -------\n X_r : array of shape [n_samples, n_selected_features]\n The input samples with only the selected features.\n \"\"\"\n X = self._validate_data(X, dtype=None, accept_sparse='csr', force_all_finite=not _safe_tags(self, key='allow_nan'), reset=False)\n mask = self.get_support()\n if not mask.any():\n warn('No features were selected: either the data is too noisy or the selection test too strict.', UserWarning)\n return np.empty(0).reshape((X.shape[0], 0))\n if len(mask) != X.shape[1]:\n raise ValueError('X has a different shape than during fitting.')\n return X[:, safe_mask(X, mask)]\n \n def inverse_transform(self, X):\n \"\"\"Reverse the transformation operation.\n\n Parameters\n ----------\n X : array of shape [n_samples, n_selected_features]\n The input samples.\n\n Returns\n -------\n X_r : array of shape [n_samples, n_original_features]\n `X` with columns of zeros inserted where features would have\n been removed by :meth:`transform`.\n \"\"\"\n if issparse(X):\n X = X.tocsc()\n it = self.inverse_transform(np.diff(X.indptr).reshape(1, -1))\n col_nonzeros = it.ravel()\n indptr = np.concatenate([[0], np.cumsum(col_nonzeros)])\n Xt = csc_matrix((X.data, X.indices, indptr), shape=(X.shape[0], len(indptr) - 1), dtype=X.dtype)\n return Xt\n support = self.get_support()\n X = check_array(X, dtype=None)\n if support.sum() != X.shape[1]:\n raise ValueError('X has a different shape than during fitting.')\n if X.ndim == 1:\n X = X[None, :]\n Xt = np.zeros((X.shape[0], support.size), dtype=X.dtype)\n Xt[:, support] = X\n return Xt\n \n def get_feature_names_out(self, input_features=None):\n \"\"\"Mask feature names according to selected features.\n\n Parameters\n ----------\n input_features : array-like of str or None, 
default=None\n Input features.\n\n - If `input_features` is `None`, then `feature_names_in_` is\n used as feature names in. If `feature_names_in_` is not defined,\n then names are generated: `[x0, x1, ..., x(n_features_in_)]`.\n - If `input_features` is an array-like, then `input_features` must\n match `feature_names_in_` if `feature_names_in_` is defined.\n\n Returns\n -------\n feature_names_out : ndarray of str objects\n Transformed feature names.\n \"\"\"\n input_features = _check_feature_names_in(self, input_features)\n return input_features[self.get_support()]\n" + "source_code": "\n\nclass SelectorMixin(TransformerMixin, metaclass=ABCMeta):\n \"\"\"\n Transformer mixin that performs feature selection given a support mask\n\n This mixin provides a feature selector implementation with `transform` and\n `inverse_transform` functionality given an implementation of\n `_get_support_mask`.\n \"\"\"\n \n def get_support(self, indices=False):\n \"\"\"\n Get a mask, or integer index, of the features selected.\n\n Parameters\n ----------\n indices : bool, default=False\n If True, the return value will be an array of integers, rather\n than a boolean mask.\n\n Returns\n -------\n support : array\n An index that selects the retained features from a feature vector.\n If `indices` is False, this is a boolean array of shape\n [# input features], in which an element is True iff its\n corresponding feature is selected for retention. If `indices` is\n True, this is an integer array of shape [# output features] whose\n values are indices into the input feature vector.\n \"\"\"\n mask = self._get_support_mask()\n return mask if not indices else np.where(mask)[0]\n \n @abstractmethod\n def _get_support_mask(self):\n \"\"\"\n Get the boolean mask indicating which features are selected\n\n Returns\n -------\n support : boolean array of shape [# input features]\n An element is True iff its corresponding feature is selected for\n retention.\n \"\"\"\n \n \n def transform(self, X):\n \"\"\"Reduce X to the selected features.\n\n Parameters\n ----------\n X : array of shape [n_samples, n_features]\n The input samples.\n\n Returns\n -------\n X_r : array of shape [n_samples, n_selected_features]\n The input samples with only the selected features.\n \"\"\"\n X = self._validate_data(X, dtype=None, accept_sparse='csr', force_all_finite=not _safe_tags(self, key='allow_nan'), reset=False)\n return self._transform(X)\n \n def _transform(self, X):\n \"\"\"Reduce X to the selected features.\"\"\"\n mask = self.get_support()\n if not mask.any():\n warn('No features were selected: either the data is too noisy or the selection test too strict.', UserWarning)\n return np.empty(0).reshape((X.shape[0], 0))\n if len(mask) != X.shape[1]:\n raise ValueError('X has a different shape than during fitting.')\n return X[:, safe_mask(X, mask)]\n \n def inverse_transform(self, X):\n \"\"\"Reverse the transformation operation.\n\n Parameters\n ----------\n X : array of shape [n_samples, n_selected_features]\n The input samples.\n\n Returns\n -------\n X_r : array of shape [n_samples, n_original_features]\n `X` with columns of zeros inserted where features would have\n been removed by :meth:`transform`.\n \"\"\"\n if issparse(X):\n X = X.tocsc()\n it = self.inverse_transform(np.diff(X.indptr).reshape(1, -1))\n col_nonzeros = it.ravel()\n indptr = np.concatenate([[0], np.cumsum(col_nonzeros)])\n Xt = csc_matrix((X.data, X.indices, indptr), shape=(X.shape[0], len(indptr) - 1), dtype=X.dtype)\n return Xt\n support = self.get_support()\n X 
= check_array(X, dtype=None)\n if support.sum() != X.shape[1]:\n raise ValueError('X has a different shape than during fitting.')\n if X.ndim == 1:\n X = X[None, :]\n Xt = np.zeros((X.shape[0], support.size), dtype=X.dtype)\n Xt[:, support] = X\n return Xt\n \n def get_feature_names_out(self, input_features=None):\n \"\"\"Mask feature names according to selected features.\n\n Parameters\n ----------\n input_features : array-like of str or None, default=None\n Input features.\n\n - If `input_features` is `None`, then `feature_names_in_` is\n used as feature names in. If `feature_names_in_` is not defined,\n then names are generated: `[x0, x1, ..., x(n_features_in_)]`.\n - If `input_features` is an array-like, then `input_features` must\n match `feature_names_in_` if `feature_names_in_` is defined.\n\n Returns\n -------\n feature_names_out : ndarray of str objects\n Transformed feature names.\n \"\"\"\n input_features = _check_feature_names_in(self, input_features)\n return input_features[self.get_support()]\n" }, { "name": "SelectFromModel", @@ -22468,9 +22535,9 @@ "sklearn.feature_selection._from_model.SelectFromModel._more_tags" ], "is_public": true, - "description": "Meta-transformer for selecting features based on importance weights.\n\n.. versionadded:: 0.17 Read more in the :ref:`User Guide `.", + "description": "Meta-transformer for selecting features based on importance weights.\n\n.. versionadded:: 0.17\n\nRead more in the :ref:`User Guide `.", "docstring": "Meta-transformer for selecting features based on importance weights.\n\n .. versionadded:: 0.17\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n estimator : object\n The base estimator from which the transformer is built.\n This can be both a fitted (if ``prefit`` is set to True)\n or a non-fitted estimator. The estimator should have a\n ``feature_importances_`` or ``coef_`` attribute after fitting.\n Otherwise, the ``importance_getter`` parameter should be used.\n\n threshold : str or float, default=None\n The threshold value to use for feature selection. Features whose\n importance is greater or equal are kept while the others are\n discarded. If \"median\" (resp. \"mean\"), then the ``threshold`` value is\n the median (resp. the mean) of the feature importances. A scaling\n factor (e.g., \"1.25*mean\") may also be used. If None and if the\n estimator has a parameter penalty set to l1, either explicitly\n or implicitly (e.g, Lasso), the threshold used is 1e-5.\n Otherwise, \"mean\" is used by default.\n\n prefit : bool, default=False\n Whether a prefit model is expected to be passed into the constructor\n directly or not. If True, ``transform`` must be called directly\n and SelectFromModel cannot be used with ``cross_val_score``,\n ``GridSearchCV`` and similar utilities that clone the estimator.\n Otherwise train the model using ``fit`` and then ``transform`` to do\n feature selection.\n\n norm_order : non-zero int, inf, -inf, default=1\n Order of the norm used to filter the vectors of coefficients below\n ``threshold`` in the case where the ``coef_`` attribute of the\n estimator is of dimension 2.\n\n max_features : int, default=None\n The maximum number of features to select.\n To only select based on ``max_features``, set ``threshold=-np.inf``.\n\n .. 
versionadded:: 0.20\n\n importance_getter : str or callable, default='auto'\n If 'auto', uses the feature importance either through a ``coef_``\n attribute or ``feature_importances_`` attribute of estimator.\n\n Also accepts a string that specifies an attribute name/path\n for extracting feature importance (implemented with `attrgetter`).\n For example, give `regressor_.coef_` in case of\n :class:`~sklearn.compose.TransformedTargetRegressor` or\n `named_steps.clf.feature_importances_` in case of\n :class:`~sklearn.pipeline.Pipeline` with its last step named `clf`.\n\n If `callable`, overrides the default feature importance getter.\n The callable is passed with the fitted estimator and it should\n return importance for each feature.\n\n .. versionadded:: 0.24\n\n Attributes\n ----------\n estimator_ : an estimator\n The base estimator from which the transformer is built.\n This is stored only when a non-fitted estimator is passed to the\n ``SelectFromModel``, i.e when prefit is False.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`. Only defined if the\n underlying estimator exposes such an attribute when fit.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n threshold_ : float\n The threshold value used for feature selection.\n\n See Also\n --------\n RFE : Recursive feature elimination based on importance weights.\n RFECV : Recursive feature elimination with built-in cross-validated\n selection of the best number of features.\n SequentialFeatureSelector : Sequential cross-validation based feature\n selection. Does not rely on importance weights.\n\n Notes\n -----\n Allows NaN/Inf in the input if the underlying estimator does as well.\n\n Examples\n --------\n >>> from sklearn.feature_selection import SelectFromModel\n >>> from sklearn.linear_model import LogisticRegression\n >>> X = [[ 0.87, -1.34, 0.31 ],\n ... [-2.79, -0.02, -0.85 ],\n ... [-1.34, -0.48, -2.55 ],\n ... [ 1.92, 1.48, 0.65 ]]\n >>> y = [0, 1, 0, 1]\n >>> selector = SelectFromModel(estimator=LogisticRegression()).fit(X, y)\n >>> selector.estimator_.coef_\n array([[-0.3252302 , 0.83462377, 0.49750423]])\n >>> selector.threshold_\n 0.55245...\n >>> selector.get_support()\n array([False, True, False])\n >>> selector.transform(X)\n array([[-1.34],\n [-0.02],\n [-0.48],\n [ 1.48]])\n ", - "source_code": "\n\nclass SelectFromModel(MetaEstimatorMixin, SelectorMixin, BaseEstimator):\n \"\"\"Meta-transformer for selecting features based on importance weights.\n\n .. versionadded:: 0.17\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n estimator : object\n The base estimator from which the transformer is built.\n This can be both a fitted (if ``prefit`` is set to True)\n or a non-fitted estimator. The estimator should have a\n ``feature_importances_`` or ``coef_`` attribute after fitting.\n Otherwise, the ``importance_getter`` parameter should be used.\n\n threshold : str or float, default=None\n The threshold value to use for feature selection. Features whose\n importance is greater or equal are kept while the others are\n discarded. If \"median\" (resp. \"mean\"), then the ``threshold`` value is\n the median (resp. the mean) of the feature importances. A scaling\n factor (e.g., \"1.25*mean\") may also be used. 
If None and if the\n estimator has a parameter penalty set to l1, either explicitly\n or implicitly (e.g, Lasso), the threshold used is 1e-5.\n Otherwise, \"mean\" is used by default.\n\n prefit : bool, default=False\n Whether a prefit model is expected to be passed into the constructor\n directly or not. If True, ``transform`` must be called directly\n and SelectFromModel cannot be used with ``cross_val_score``,\n ``GridSearchCV`` and similar utilities that clone the estimator.\n Otherwise train the model using ``fit`` and then ``transform`` to do\n feature selection.\n\n norm_order : non-zero int, inf, -inf, default=1\n Order of the norm used to filter the vectors of coefficients below\n ``threshold`` in the case where the ``coef_`` attribute of the\n estimator is of dimension 2.\n\n max_features : int, default=None\n The maximum number of features to select.\n To only select based on ``max_features``, set ``threshold=-np.inf``.\n\n .. versionadded:: 0.20\n\n importance_getter : str or callable, default='auto'\n If 'auto', uses the feature importance either through a ``coef_``\n attribute or ``feature_importances_`` attribute of estimator.\n\n Also accepts a string that specifies an attribute name/path\n for extracting feature importance (implemented with `attrgetter`).\n For example, give `regressor_.coef_` in case of\n :class:`~sklearn.compose.TransformedTargetRegressor` or\n `named_steps.clf.feature_importances_` in case of\n :class:`~sklearn.pipeline.Pipeline` with its last step named `clf`.\n\n If `callable`, overrides the default feature importance getter.\n The callable is passed with the fitted estimator and it should\n return importance for each feature.\n\n .. versionadded:: 0.24\n\n Attributes\n ----------\n estimator_ : an estimator\n The base estimator from which the transformer is built.\n This is stored only when a non-fitted estimator is passed to the\n ``SelectFromModel``, i.e when prefit is False.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`. Only defined if the\n underlying estimator exposes such an attribute when fit.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n threshold_ : float\n The threshold value used for feature selection.\n\n See Also\n --------\n RFE : Recursive feature elimination based on importance weights.\n RFECV : Recursive feature elimination with built-in cross-validated\n selection of the best number of features.\n SequentialFeatureSelector : Sequential cross-validation based feature\n selection. Does not rely on importance weights.\n\n Notes\n -----\n Allows NaN/Inf in the input if the underlying estimator does as well.\n\n Examples\n --------\n >>> from sklearn.feature_selection import SelectFromModel\n >>> from sklearn.linear_model import LogisticRegression\n >>> X = [[ 0.87, -1.34, 0.31 ],\n ... [-2.79, -0.02, -0.85 ],\n ... [-1.34, -0.48, -2.55 ],\n ... 
[ 1.92, 1.48, 0.65 ]]\n >>> y = [0, 1, 0, 1]\n >>> selector = SelectFromModel(estimator=LogisticRegression()).fit(X, y)\n >>> selector.estimator_.coef_\n array([[-0.3252302 , 0.83462377, 0.49750423]])\n >>> selector.threshold_\n 0.55245...\n >>> selector.get_support()\n array([False, True, False])\n >>> selector.transform(X)\n array([[-1.34],\n [-0.02],\n [-0.48],\n [ 1.48]])\n \"\"\"\n \n def __init__(self, estimator, *, threshold=None, prefit=False, norm_order=1, max_features=None, importance_getter='auto'):\n self.estimator = estimator\n self.threshold = threshold\n self.prefit = prefit\n self.importance_getter = importance_getter\n self.norm_order = norm_order\n self.max_features = max_features\n \n def _get_support_mask(self):\n if self.prefit:\n estimator = self.estimator\n elif hasattr(self, 'estimator_'):\n estimator = self.estimator_\n else:\n raise ValueError('Either fit the model before transform or set \"prefit=True\" while passing the fitted estimator to the constructor.')\n scores = _get_feature_importances(estimator=estimator, getter=self.importance_getter, transform_func='norm', norm_order=self.norm_order)\n threshold = _calculate_threshold(estimator, scores, self.threshold)\n if self.max_features is not None:\n mask = np.zeros_like(scores, dtype=bool)\n candidate_indices = np.argsort(-scores, kind='mergesort')[:self.max_features]\n mask[candidate_indices] = True\n else:\n mask = np.ones_like(scores, dtype=bool)\n mask[scores < threshold] = False\n return mask\n \n def fit(self, X, y=None, **fit_params):\n \"\"\"Fit the SelectFromModel meta-transformer.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The training input samples.\n\n y : array-like of shape (n_samples,), default=None\n The target values (integers that correspond to classes in\n classification, real numbers in regression).\n\n **fit_params : dict\n Other estimator specific parameters.\n\n Returns\n -------\n self : object\n Fitted estimator.\n \"\"\"\n if self.max_features is not None:\n if not isinstance(self.max_features, numbers.Integral):\n raise TypeError(\"'max_features' should be an integer between 0 and {} features. 
Got {!r} instead.\".format(X.shape[1], self.max_features))\n elif self.max_features < 0 or self.max_features > X.shape[1]:\n raise ValueError(\"'max_features' should be 0 and {} features.Got {} instead.\".format(X.shape[1], self.max_features))\n if self.prefit:\n raise NotFittedError(\"Since 'prefit=True', call transform directly\")\n self.estimator_ = clone(self.estimator)\n self.estimator_.fit(X, y, **fit_params)\n if hasattr(self.estimator_, 'feature_names_in_'):\n self.feature_names_in_ = self.estimator_.feature_names_in_\n return self\n \n @property\n def threshold_(self):\n \"\"\"Threshold value used for feature selection.\"\"\"\n scores = _get_feature_importances(estimator=self.estimator_, getter=self.importance_getter, transform_func='norm', norm_order=self.norm_order)\n return _calculate_threshold(self.estimator, scores, self.threshold)\n \n @if_delegate_has_method('estimator')\n def partial_fit(self, X, y=None, **fit_params):\n \"\"\"Fit the SelectFromModel meta-transformer only once.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The training input samples.\n\n y : array-like of shape (n_samples,), default=None\n The target values (integers that correspond to classes in\n classification, real numbers in regression).\n\n **fit_params : dict\n Other estimator specific parameters.\n\n Returns\n -------\n self : object\n Fitted estimator.\n \"\"\"\n if self.prefit:\n raise NotFittedError(\"Since 'prefit=True', call transform directly\")\n if not hasattr(self, 'estimator_'):\n self.estimator_ = clone(self.estimator)\n self.estimator_.partial_fit(X, y, **fit_params)\n return self\n \n @property\n def n_features_in_(self):\n \"\"\"Number of features seen during `fit`.\"\"\"\n try:\n check_is_fitted(self)\n except NotFittedError as nfe:\n raise AttributeError('{} object has no n_features_in_ attribute.'.format(self.__class__.__name__)) from nfe\n return self.estimator_.n_features_in_\n \n def _more_tags(self):\n return {'allow_nan': _safe_tags(self.estimator, key='allow_nan')}\n" + "source_code": "\n\nclass SelectFromModel(MetaEstimatorMixin, SelectorMixin, BaseEstimator):\n \"\"\"Meta-transformer for selecting features based on importance weights.\n\n .. versionadded:: 0.17\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n estimator : object\n The base estimator from which the transformer is built.\n This can be both a fitted (if ``prefit`` is set to True)\n or a non-fitted estimator. The estimator should have a\n ``feature_importances_`` or ``coef_`` attribute after fitting.\n Otherwise, the ``importance_getter`` parameter should be used.\n\n threshold : str or float, default=None\n The threshold value to use for feature selection. Features whose\n importance is greater or equal are kept while the others are\n discarded. If \"median\" (resp. \"mean\"), then the ``threshold`` value is\n the median (resp. the mean) of the feature importances. A scaling\n factor (e.g., \"1.25*mean\") may also be used. If None and if the\n estimator has a parameter penalty set to l1, either explicitly\n or implicitly (e.g, Lasso), the threshold used is 1e-5.\n Otherwise, \"mean\" is used by default.\n\n prefit : bool, default=False\n Whether a prefit model is expected to be passed into the constructor\n directly or not. 
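Illustrative sketch, not taken from the recorded source: with `prefit=True` the already-fitted estimator is handed to the constructor and `transform` can be called directly, and a scaled-mean threshold such as `"1.25*mean"` tightens the selection, as described in the parameter docs above. Dataset and estimator below are arbitrary choices for demonstration.

# Hedged example: SelectFromModel with a prefit estimator and a scaled-mean threshold.
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression

X, y = load_iris(return_X_y=True)
clf = LogisticRegression(max_iter=1000).fit(X, y)  # fitted before being wrapped

selector = SelectFromModel(clf, prefit=True, threshold="1.25*mean")
X_reduced = selector.transform(X)  # no call to selector.fit needed when prefit=True
print(X_reduced.shape)  # at most as many columns as X, usually fewer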
If True, ``transform`` must be called directly\n and SelectFromModel cannot be used with ``cross_val_score``,\n ``GridSearchCV`` and similar utilities that clone the estimator.\n Otherwise train the model using ``fit`` and then ``transform`` to do\n feature selection.\n\n norm_order : non-zero int, inf, -inf, default=1\n Order of the norm used to filter the vectors of coefficients below\n ``threshold`` in the case where the ``coef_`` attribute of the\n estimator is of dimension 2.\n\n max_features : int, default=None\n The maximum number of features to select.\n To only select based on ``max_features``, set ``threshold=-np.inf``.\n\n .. versionadded:: 0.20\n\n importance_getter : str or callable, default='auto'\n If 'auto', uses the feature importance either through a ``coef_``\n attribute or ``feature_importances_`` attribute of estimator.\n\n Also accepts a string that specifies an attribute name/path\n for extracting feature importance (implemented with `attrgetter`).\n For example, give `regressor_.coef_` in case of\n :class:`~sklearn.compose.TransformedTargetRegressor` or\n `named_steps.clf.feature_importances_` in case of\n :class:`~sklearn.pipeline.Pipeline` with its last step named `clf`.\n\n If `callable`, overrides the default feature importance getter.\n The callable is passed with the fitted estimator and it should\n return importance for each feature.\n\n .. versionadded:: 0.24\n\n Attributes\n ----------\n estimator_ : an estimator\n The base estimator from which the transformer is built.\n This is stored only when a non-fitted estimator is passed to the\n ``SelectFromModel``, i.e when prefit is False.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`. Only defined if the\n underlying estimator exposes such an attribute when fit.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n threshold_ : float\n The threshold value used for feature selection.\n\n See Also\n --------\n RFE : Recursive feature elimination based on importance weights.\n RFECV : Recursive feature elimination with built-in cross-validated\n selection of the best number of features.\n SequentialFeatureSelector : Sequential cross-validation based feature\n selection. Does not rely on importance weights.\n\n Notes\n -----\n Allows NaN/Inf in the input if the underlying estimator does as well.\n\n Examples\n --------\n >>> from sklearn.feature_selection import SelectFromModel\n >>> from sklearn.linear_model import LogisticRegression\n >>> X = [[ 0.87, -1.34, 0.31 ],\n ... [-2.79, -0.02, -0.85 ],\n ... [-1.34, -0.48, -2.55 ],\n ... 
[ 1.92, 1.48, 0.65 ]]\n >>> y = [0, 1, 0, 1]\n >>> selector = SelectFromModel(estimator=LogisticRegression()).fit(X, y)\n >>> selector.estimator_.coef_\n array([[-0.3252302 , 0.83462377, 0.49750423]])\n >>> selector.threshold_\n 0.55245...\n >>> selector.get_support()\n array([False, True, False])\n >>> selector.transform(X)\n array([[-1.34],\n [-0.02],\n [-0.48],\n [ 1.48]])\n \"\"\"\n \n def __init__(self, estimator, *, threshold=None, prefit=False, norm_order=1, max_features=None, importance_getter='auto'):\n self.estimator = estimator\n self.threshold = threshold\n self.prefit = prefit\n self.importance_getter = importance_getter\n self.norm_order = norm_order\n self.max_features = max_features\n \n def _get_support_mask(self):\n if self.prefit:\n estimator = self.estimator\n elif hasattr(self, 'estimator_'):\n estimator = self.estimator_\n else:\n raise ValueError('Either fit the model before transform or set \"prefit=True\" while passing the fitted estimator to the constructor.')\n scores = _get_feature_importances(estimator=estimator, getter=self.importance_getter, transform_func='norm', norm_order=self.norm_order)\n threshold = _calculate_threshold(estimator, scores, self.threshold)\n if self.max_features is not None:\n mask = np.zeros_like(scores, dtype=bool)\n candidate_indices = np.argsort(-scores, kind='mergesort')[:self.max_features]\n mask[candidate_indices] = True\n else:\n mask = np.ones_like(scores, dtype=bool)\n mask[scores < threshold] = False\n return mask\n \n def fit(self, X, y=None, **fit_params):\n \"\"\"Fit the SelectFromModel meta-transformer.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The training input samples.\n\n y : array-like of shape (n_samples,), default=None\n The target values (integers that correspond to classes in\n classification, real numbers in regression).\n\n **fit_params : dict\n Other estimator specific parameters.\n\n Returns\n -------\n self : object\n Fitted estimator.\n \"\"\"\n if self.max_features is not None:\n if not isinstance(self.max_features, numbers.Integral):\n raise TypeError(\"'max_features' should be an integer between 0 and {} features. 
Got {!r} instead.\".format(X.shape[1], self.max_features))\n elif self.max_features < 0 or self.max_features > X.shape[1]:\n raise ValueError(\"'max_features' should be 0 and {} features.Got {} instead.\".format(X.shape[1], self.max_features))\n if self.prefit:\n raise NotFittedError(\"Since 'prefit=True', call transform directly\")\n self.estimator_ = clone(self.estimator)\n self.estimator_.fit(X, y, **fit_params)\n if hasattr(self.estimator_, 'feature_names_in_'):\n self.feature_names_in_ = self.estimator_.feature_names_in_\n else:\n self._check_feature_names(X, reset=True)\n return self\n \n @property\n def threshold_(self):\n \"\"\"Threshold value used for feature selection.\"\"\"\n scores = _get_feature_importances(estimator=self.estimator_, getter=self.importance_getter, transform_func='norm', norm_order=self.norm_order)\n return _calculate_threshold(self.estimator, scores, self.threshold)\n \n @if_delegate_has_method('estimator')\n def partial_fit(self, X, y=None, **fit_params):\n \"\"\"Fit the SelectFromModel meta-transformer only once.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The training input samples.\n\n y : array-like of shape (n_samples,), default=None\n The target values (integers that correspond to classes in\n classification, real numbers in regression).\n\n **fit_params : dict\n Other estimator specific parameters.\n\n Returns\n -------\n self : object\n Fitted estimator.\n \"\"\"\n if self.prefit:\n raise NotFittedError(\"Since 'prefit=True', call transform directly\")\n if not hasattr(self, 'estimator_'):\n self.estimator_ = clone(self.estimator)\n self.estimator_.partial_fit(X, y, **fit_params)\n return self\n \n @property\n def n_features_in_(self):\n \"\"\"Number of features seen during `fit`.\"\"\"\n try:\n check_is_fitted(self)\n except NotFittedError as nfe:\n raise AttributeError('{} object has no n_features_in_ attribute.'.format(self.__class__.__name__)) from nfe\n return self.estimator_.n_features_in_\n \n def _more_tags(self):\n return {'allow_nan': _safe_tags(self.estimator, key='allow_nan')}\n" }, { "name": "RFE", @@ -22496,7 +22563,7 @@ "sklearn.feature_selection._rfe.RFE._more_tags" ], "is_public": true, - "description": "Feature ranking with recursive feature elimination.\n\nGiven an external estimator that assigns weights to features (e.g., the coefficients of a linear model), the goal of recursive feature elimination (RFE) is to select features by recursively considering smaller and smaller sets of features. First, the estimator is trained on the initial set of features and the importance of each feature is obtained either through any specific attribute or callable. Then, the least important features are pruned from current set of features. That procedure is recursively repeated on the pruned set until the desired number of features to select is eventually reached. Read more in the :ref:`User Guide `.", + "description": "Feature ranking with recursive feature elimination.\n\nGiven an external estimator that assigns weights to features (e.g., the\ncoefficients of a linear model), the goal of recursive feature elimination\n(RFE) is to select features by recursively considering smaller and smaller\nsets of features. 
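Schematic sketch of the recursive elimination loop just described; this is an illustration of the procedure, not the library's implementation (the supported API is sklearn.feature_selection.RFE). It reuses the dataset and estimator from the docstring example below and assumes a linear-kernel SVR so that per-feature coefficients are available.

# Hedged example: a hand-rolled version of the recursive elimination idea.
import numpy as np
from sklearn.datasets import make_friedman1
from sklearn.svm import SVR

X, y = make_friedman1(n_samples=50, n_features=10, random_state=0)
remaining = list(range(X.shape[1]))  # indices of the features still in play

while len(remaining) > 5:
    # 1. Train on the current feature subset.
    est = SVR(kernel="linear").fit(X[:, remaining], y)
    # 2. Obtain an importance per remaining feature (here: |coef_|).
    importances = np.abs(est.coef_).ravel()
    # 3. Prune the least important feature and repeat.
    weakest = remaining[int(np.argmin(importances))]
    remaining.remove(weakest)

print(sorted(remaining))  # indices of the retained features (typically the informative ones)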
First, the estimator is trained on the initial set of\nfeatures and the importance of each feature is obtained either through\nany specific attribute or callable.\nThen, the least important features are pruned from current set of features.\nThat procedure is recursively repeated on the pruned set until the desired\nnumber of features to select is eventually reached.\n\nRead more in the :ref:`User Guide `.", "docstring": "Feature ranking with recursive feature elimination.\n\n Given an external estimator that assigns weights to features (e.g., the\n coefficients of a linear model), the goal of recursive feature elimination\n (RFE) is to select features by recursively considering smaller and smaller\n sets of features. First, the estimator is trained on the initial set of\n features and the importance of each feature is obtained either through\n any specific attribute or callable.\n Then, the least important features are pruned from current set of features.\n That procedure is recursively repeated on the pruned set until the desired\n number of features to select is eventually reached.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n estimator : ``Estimator`` instance\n A supervised learning estimator with a ``fit`` method that provides\n information about feature importance\n (e.g. `coef_`, `feature_importances_`).\n\n n_features_to_select : int or float, default=None\n The number of features to select. If `None`, half of the features are\n selected. If integer, the parameter is the absolute number of features\n to select. If float between 0 and 1, it is the fraction of features to\n select.\n\n .. versionchanged:: 0.24\n Added float values for fractions.\n\n step : int or float, default=1\n If greater than or equal to 1, then ``step`` corresponds to the\n (integer) number of features to remove at each iteration.\n If within (0.0, 1.0), then ``step`` corresponds to the percentage\n (rounded down) of features to remove at each iteration.\n\n verbose : int, default=0\n Controls verbosity of output.\n\n importance_getter : str or callable, default='auto'\n If 'auto', uses the feature importance either through a `coef_`\n or `feature_importances_` attributes of estimator.\n\n Also accepts a string that specifies an attribute name/path\n for extracting feature importance (implemented with `attrgetter`).\n For example, give `regressor_.coef_` in case of\n :class:`~sklearn.compose.TransformedTargetRegressor` or\n `named_steps.clf.feature_importances_` in case of\n class:`~sklearn.pipeline.Pipeline` with its last step named `clf`.\n\n If `callable`, overrides the default feature importance getter.\n The callable is passed with the fitted estimator and it should\n return importance for each feature.\n\n .. versionadded:: 0.24\n\n Attributes\n ----------\n classes_ : ndarray of shape (n_classes,)\n The classes labels. Only available when `estimator` is a classifier.\n\n estimator_ : ``Estimator`` instance\n The fitted estimator used to select features.\n\n n_features_ : int\n The number of selected features.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`. Only defined if the\n underlying estimator exposes such an attribute when fit.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. 
versionadded:: 1.0\n\n ranking_ : ndarray of shape (n_features,)\n The feature ranking, such that ``ranking_[i]`` corresponds to the\n ranking position of the i-th feature. Selected (i.e., estimated\n best) features are assigned rank 1.\n\n support_ : ndarray of shape (n_features,)\n The mask of selected features.\n\n See Also\n --------\n RFECV : Recursive feature elimination with built-in cross-validated\n selection of the best number of features.\n SelectFromModel : Feature selection based on thresholds of importance\n weights.\n SequentialFeatureSelector : Sequential cross-validation based feature\n selection. Does not rely on importance weights.\n\n Notes\n -----\n Allows NaN/Inf in the input if the underlying estimator does as well.\n\n References\n ----------\n\n .. [1] Guyon, I., Weston, J., Barnhill, S., & Vapnik, V., \"Gene selection\n for cancer classification using support vector machines\",\n Mach. Learn., 46(1-3), 389--422, 2002.\n\n Examples\n --------\n The following example shows how to retrieve the 5 most informative\n features in the Friedman #1 dataset.\n\n >>> from sklearn.datasets import make_friedman1\n >>> from sklearn.feature_selection import RFE\n >>> from sklearn.svm import SVR\n >>> X, y = make_friedman1(n_samples=50, n_features=10, random_state=0)\n >>> estimator = SVR(kernel=\"linear\")\n >>> selector = RFE(estimator, n_features_to_select=5, step=1)\n >>> selector = selector.fit(X, y)\n >>> selector.support_\n array([ True, True, True, True, True, False, False, False, False,\n False])\n >>> selector.ranking_\n array([1, 1, 1, 1, 1, 6, 4, 3, 2, 5])\n ", "source_code": "\n\nclass RFE(SelectorMixin, MetaEstimatorMixin, BaseEstimator):\n \"\"\"Feature ranking with recursive feature elimination.\n\n Given an external estimator that assigns weights to features (e.g., the\n coefficients of a linear model), the goal of recursive feature elimination\n (RFE) is to select features by recursively considering smaller and smaller\n sets of features. First, the estimator is trained on the initial set of\n features and the importance of each feature is obtained either through\n any specific attribute or callable.\n Then, the least important features are pruned from current set of features.\n That procedure is recursively repeated on the pruned set until the desired\n number of features to select is eventually reached.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n estimator : ``Estimator`` instance\n A supervised learning estimator with a ``fit`` method that provides\n information about feature importance\n (e.g. `coef_`, `feature_importances_`).\n\n n_features_to_select : int or float, default=None\n The number of features to select. If `None`, half of the features are\n selected. If integer, the parameter is the absolute number of features\n to select. If float between 0 and 1, it is the fraction of features to\n select.\n\n .. 
versionchanged:: 0.24\n Added float values for fractions.\n\n step : int or float, default=1\n If greater than or equal to 1, then ``step`` corresponds to the\n (integer) number of features to remove at each iteration.\n If within (0.0, 1.0), then ``step`` corresponds to the percentage\n (rounded down) of features to remove at each iteration.\n\n verbose : int, default=0\n Controls verbosity of output.\n\n importance_getter : str or callable, default='auto'\n If 'auto', uses the feature importance either through a `coef_`\n or `feature_importances_` attributes of estimator.\n\n Also accepts a string that specifies an attribute name/path\n for extracting feature importance (implemented with `attrgetter`).\n For example, give `regressor_.coef_` in case of\n :class:`~sklearn.compose.TransformedTargetRegressor` or\n `named_steps.clf.feature_importances_` in case of\n class:`~sklearn.pipeline.Pipeline` with its last step named `clf`.\n\n If `callable`, overrides the default feature importance getter.\n The callable is passed with the fitted estimator and it should\n return importance for each feature.\n\n .. versionadded:: 0.24\n\n Attributes\n ----------\n classes_ : ndarray of shape (n_classes,)\n The classes labels. Only available when `estimator` is a classifier.\n\n estimator_ : ``Estimator`` instance\n The fitted estimator used to select features.\n\n n_features_ : int\n The number of selected features.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`. Only defined if the\n underlying estimator exposes such an attribute when fit.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n ranking_ : ndarray of shape (n_features,)\n The feature ranking, such that ``ranking_[i]`` corresponds to the\n ranking position of the i-th feature. Selected (i.e., estimated\n best) features are assigned rank 1.\n\n support_ : ndarray of shape (n_features,)\n The mask of selected features.\n\n See Also\n --------\n RFECV : Recursive feature elimination with built-in cross-validated\n selection of the best number of features.\n SelectFromModel : Feature selection based on thresholds of importance\n weights.\n SequentialFeatureSelector : Sequential cross-validation based feature\n selection. Does not rely on importance weights.\n\n Notes\n -----\n Allows NaN/Inf in the input if the underlying estimator does as well.\n\n References\n ----------\n\n .. [1] Guyon, I., Weston, J., Barnhill, S., & Vapnik, V., \"Gene selection\n for cancer classification using support vector machines\",\n Mach. 
Learn., 46(1-3), 389--422, 2002.\n\n Examples\n --------\n The following example shows how to retrieve the 5 most informative\n features in the Friedman #1 dataset.\n\n >>> from sklearn.datasets import make_friedman1\n >>> from sklearn.feature_selection import RFE\n >>> from sklearn.svm import SVR\n >>> X, y = make_friedman1(n_samples=50, n_features=10, random_state=0)\n >>> estimator = SVR(kernel=\"linear\")\n >>> selector = RFE(estimator, n_features_to_select=5, step=1)\n >>> selector = selector.fit(X, y)\n >>> selector.support_\n array([ True, True, True, True, True, False, False, False, False,\n False])\n >>> selector.ranking_\n array([1, 1, 1, 1, 1, 6, 4, 3, 2, 5])\n \"\"\"\n \n def __init__(self, estimator, *, n_features_to_select=None, step=1, verbose=0, importance_getter='auto'):\n self.estimator = estimator\n self.n_features_to_select = n_features_to_select\n self.step = step\n self.importance_getter = importance_getter\n self.verbose = verbose\n \n @property\n def _estimator_type(self):\n return self.estimator._estimator_type\n \n @property\n def classes_(self):\n \"\"\"Classes labels available when `estimator` is a classifier.\n\n Returns\n -------\n ndarray of shape (n_classes,)\n \"\"\"\n return self.estimator_.classes_\n \n def fit(self, X, y, **fit_params):\n \"\"\"Fit the RFE model and then the underlying estimator on the selected features.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The training input samples.\n\n y : array-like of shape (n_samples,)\n The target values.\n\n **fit_params : dict\n Additional parameters passed to the `fit` method of the underlying\n estimator.\n\n Returns\n -------\n self : object\n Fitted estimator.\n \"\"\"\n return self._fit(X, y, **fit_params)\n \n def _fit(self, X, y, step_score=None, **fit_params):\n tags = self._get_tags()\n (X, y) = self._validate_data(X, y, accept_sparse='csc', ensure_min_features=2, force_all_finite=not tags.get('allow_nan', True), multi_output=True)\n error_msg = f'n_features_to_select must be either None, a positive integer representing the absolute number of features or a float in (0.0, 1.0] representing a percentage of features to select. Got {self.n_features_to_select}'\n n_features = X.shape[1]\n if self.n_features_to_select is None:\n n_features_to_select = n_features // 2\n elif self.n_features_to_select < 0:\n raise ValueError(error_msg)\n elif isinstance(self.n_features_to_select, numbers.Integral):\n n_features_to_select = self.n_features_to_select\n elif self.n_features_to_select > 1.0:\n raise ValueError(error_msg)\n else:\n n_features_to_select = int(n_features * self.n_features_to_select)\n if 0.0 < self.step < 1.0:\n step = int(max(1, self.step * n_features))\n else:\n step = int(self.step)\n if step <= 0:\n raise ValueError('Step must be >0')\n support_ = np.ones(n_features, dtype=bool)\n ranking_ = np.ones(n_features, dtype=int)\n if step_score:\n self.scores_ = []\n while np.sum(support_) > n_features_to_select:\n features = np.arange(n_features)[support_]\n estimator = clone(self.estimator)\n if self.verbose > 0:\n print('Fitting estimator with %d features.' 
% np.sum(support_))\n estimator.fit(X[:, features], y, **fit_params)\n importances = _get_feature_importances(estimator, self.importance_getter, transform_func='square')\n ranks = np.argsort(importances)\n ranks = np.ravel(ranks)\n threshold = min(step, np.sum(support_) - n_features_to_select)\n if step_score:\n self.scores_.append(step_score(estimator, features))\n support_[features[ranks][:threshold]] = False\n ranking_[np.logical_not(support_)] += 1\n features = np.arange(n_features)[support_]\n self.estimator_ = clone(self.estimator)\n self.estimator_.fit(X[:, features], y, **fit_params)\n if step_score:\n self.scores_.append(step_score(self.estimator_, features))\n self.n_features_ = support_.sum()\n self.support_ = support_\n self.ranking_ = ranking_\n return self\n \n @if_delegate_has_method(delegate='estimator')\n def predict(self, X):\n \"\"\"Reduce X to the selected features and then predict using the underlying estimator.\n\n Parameters\n ----------\n X : array of shape [n_samples, n_features]\n The input samples.\n\n Returns\n -------\n y : array of shape [n_samples]\n The predicted target values.\n \"\"\"\n check_is_fitted(self)\n return self.estimator_.predict(self.transform(X))\n \n @if_delegate_has_method(delegate='estimator')\n def score(self, X, y, **fit_params):\n \"\"\"Reduce X to the selected features and return the score of the underlying estimator.\n\n Parameters\n ----------\n X : array of shape [n_samples, n_features]\n The input samples.\n\n y : array of shape [n_samples]\n The target values.\n\n **fit_params : dict\n Parameters to pass to the `score` method of the underlying\n estimator.\n\n .. versionadded:: 1.0\n\n Returns\n -------\n score : float\n Score of the underlying base estimator computed with the selected\n features returned by `rfe.transform(X)` and `y`.\n \"\"\"\n check_is_fitted(self)\n return self.estimator_.score(self.transform(X), y, **fit_params)\n \n def _get_support_mask(self):\n check_is_fitted(self)\n return self.support_\n \n @if_delegate_has_method(delegate='estimator')\n def decision_function(self, X):\n \"\"\"Compute the decision function of ``X``.\n\n Parameters\n ----------\n X : {array-like or sparse matrix} of shape (n_samples, n_features)\n The input samples. Internally, it will be converted to\n ``dtype=np.float32`` and if a sparse matrix is provided\n to a sparse ``csr_matrix``.\n\n Returns\n -------\n score : array, shape = [n_samples, n_classes] or [n_samples]\n The decision function of the input samples. The order of the\n classes corresponds to that in the attribute :term:`classes_`.\n Regression and binary classification produce an array of shape\n [n_samples].\n \"\"\"\n check_is_fitted(self)\n return self.estimator_.decision_function(self.transform(X))\n \n @if_delegate_has_method(delegate='estimator')\n def predict_proba(self, X):\n \"\"\"Predict class probabilities for X.\n\n Parameters\n ----------\n X : {array-like or sparse matrix} of shape (n_samples, n_features)\n The input samples. Internally, it will be converted to\n ``dtype=np.float32`` and if a sparse matrix is provided\n to a sparse ``csr_matrix``.\n\n Returns\n -------\n p : array of shape (n_samples, n_classes)\n The class probabilities of the input samples. 
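# Editor's sketch (not part of the RFE source above): the greedy loop in _fit
# trains the estimator, ranks the remaining features by importance, prunes the
# `step` least important ones, and repeats until n_features_to_select remain.
# A minimal standalone version of that idea, assuming an estimator that exposes
# a 1-D `feature_importances_` or `coef_` (the helper name is hypothetical):
import numpy as np
from sklearn.base import clone

def recursive_elimination(estimator, X, y, n_features_to_select, step=1):
    support = np.ones(X.shape[1], dtype=bool)          # mask of kept features
    while support.sum() > n_features_to_select:
        est = clone(estimator).fit(X[:, support], y)
        if hasattr(est, "feature_importances_"):
            importances = est.feature_importances_
        else:
            importances = np.abs(est.coef_).ravel()
        n_drop = min(step, int(support.sum()) - n_features_to_select)
        kept = np.flatnonzero(support)
        support[kept[np.argsort(importances)[:n_drop]]] = False  # drop least important
    return support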
The order of the\n classes corresponds to that in the attribute :term:`classes_`.\n \"\"\"\n check_is_fitted(self)\n return self.estimator_.predict_proba(self.transform(X))\n \n @if_delegate_has_method(delegate='estimator')\n def predict_log_proba(self, X):\n \"\"\"Predict class log-probabilities for X.\n\n Parameters\n ----------\n X : array of shape [n_samples, n_features]\n The input samples.\n\n Returns\n -------\n p : array of shape (n_samples, n_classes)\n The class log-probabilities of the input samples. The order of the\n classes corresponds to that in the attribute :term:`classes_`.\n \"\"\"\n check_is_fitted(self)\n return self.estimator_.predict_log_proba(self.transform(X))\n \n def _more_tags(self):\n return {'poor_score': True, 'allow_nan': _safe_tags(self.estimator, key='allow_nan'), 'requires_y': True}\n" }, @@ -22511,9 +22578,9 @@ "sklearn.feature_selection._rfe.RFECV.grid_scores_@getter" ], "is_public": true, - "description": "Recursive feature elimination with cross-validation to select the number of features.\n\nSee glossary entry for :term:`cross-validation estimator`. Read more in the :ref:`User Guide `.", + "description": "Recursive feature elimination with cross-validation to select the number of features.\n\nSee glossary entry for :term:`cross-validation estimator`.\n\nRead more in the :ref:`User Guide `.", "docstring": "Recursive feature elimination with cross-validation to select the number of features.\n\n See glossary entry for :term:`cross-validation estimator`.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n estimator : ``Estimator`` instance\n A supervised learning estimator with a ``fit`` method that provides\n information about feature importance either through a ``coef_``\n attribute or through a ``feature_importances_`` attribute.\n\n step : int or float, default=1\n If greater than or equal to 1, then ``step`` corresponds to the\n (integer) number of features to remove at each iteration.\n If within (0.0, 1.0), then ``step`` corresponds to the percentage\n (rounded down) of features to remove at each iteration.\n Note that the last iteration may remove fewer than ``step`` features in\n order to reach ``min_features_to_select``.\n\n min_features_to_select : int, default=1\n The minimum number of features to be selected. This number of features\n will always be scored, even if the difference between the original\n feature count and ``min_features_to_select`` isn't divisible by\n ``step``.\n\n .. versionadded:: 0.20\n\n cv : int, cross-validation generator or an iterable, default=None\n Determines the cross-validation splitting strategy.\n Possible inputs for cv are:\n\n - None, to use the default 5-fold cross-validation,\n - integer, to specify the number of folds.\n - :term:`CV splitter`,\n - An iterable yielding (train, test) splits as arrays of indices.\n\n For integer/None inputs, if ``y`` is binary or multiclass,\n :class:`~sklearn.model_selection.StratifiedKFold` is used. If the\n estimator is a classifier or if ``y`` is neither binary nor multiclass,\n :class:`~sklearn.model_selection.KFold` is used.\n\n Refer :ref:`User Guide ` for the various\n cross-validation strategies that can be used here.\n\n .. 
versionchanged:: 0.22\n ``cv`` default value of None changed from 3-fold to 5-fold.\n\n scoring : str, callable or None, default=None\n A string (see model evaluation documentation) or\n a scorer callable object / function with signature\n ``scorer(estimator, X, y)``.\n\n verbose : int, default=0\n Controls verbosity of output.\n\n n_jobs : int or None, default=None\n Number of cores to run in parallel while fitting across folds.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n .. versionadded:: 0.18\n\n importance_getter : str or callable, default='auto'\n If 'auto', uses the feature importance either through a `coef_`\n or `feature_importances_` attributes of estimator.\n\n Also accepts a string that specifies an attribute name/path\n for extracting feature importance.\n For example, give `regressor_.coef_` in case of\n :class:`~sklearn.compose.TransformedTargetRegressor` or\n `named_steps.clf.feature_importances_` in case of\n :class:`~sklearn.pipeline.Pipeline` with its last step named `clf`.\n\n If `callable`, overrides the default feature importance getter.\n The callable is passed with the fitted estimator and it should\n return importance for each feature.\n\n .. versionadded:: 0.24\n\n Attributes\n ----------\n classes_ : ndarray of shape (n_classes,)\n The classes labels. Only available when `estimator` is a classifier.\n\n estimator_ : ``Estimator`` instance\n The fitted estimator used to select features.\n\n grid_scores_ : ndarray of shape (n_subsets_of_features,)\n The cross-validation scores such that\n ``grid_scores_[i]`` corresponds to\n the CV score of the i-th subset of features.\n\n .. deprecated:: 1.0\n The `grid_scores_` attribute is deprecated in version 1.0 in favor\n of `cv_results_` and will be removed in version 1.2.\n\n cv_results_ : dict of ndarrays\n A dict with keys:\n\n split(k)_test_score : ndarray of shape (n_features,)\n The cross-validation scores across (k)th fold.\n\n mean_test_score : ndarray of shape (n_features,)\n Mean of scores over the folds.\n\n std_test_score : ndarray of shape (n_features,)\n Standard deviation of scores over the folds.\n\n .. versionadded:: 1.0\n\n n_features_ : int\n The number of selected features with cross-validation.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`. Only defined if the\n underlying estimator exposes such an attribute when fit.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n ranking_ : narray of shape (n_features,)\n The feature ranking, such that `ranking_[i]`\n corresponds to the ranking\n position of the i-th feature.\n Selected (i.e., estimated best)\n features are assigned rank 1.\n\n support_ : ndarray of shape (n_features,)\n The mask of selected features.\n\n See Also\n --------\n RFE : Recursive feature elimination.\n\n Notes\n -----\n The size of ``grid_scores_`` is equal to\n ``ceil((n_features - min_features_to_select) / step) + 1``,\n where step is the number of features removed at each iteration.\n\n Allows NaN/Inf in the input if the underlying estimator does as well.\n\n References\n ----------\n\n .. [1] Guyon, I., Weston, J., Barnhill, S., & Vapnik, V., \"Gene selection\n for cancer classification using support vector machines\",\n Mach. 
Learn., 46(1-3), 389--422, 2002.\n\n Examples\n --------\n The following example shows how to retrieve the a-priori not known 5\n informative features in the Friedman #1 dataset.\n\n >>> from sklearn.datasets import make_friedman1\n >>> from sklearn.feature_selection import RFECV\n >>> from sklearn.svm import SVR\n >>> X, y = make_friedman1(n_samples=50, n_features=10, random_state=0)\n >>> estimator = SVR(kernel=\"linear\")\n >>> selector = RFECV(estimator, step=1, cv=5)\n >>> selector = selector.fit(X, y)\n >>> selector.support_\n array([ True, True, True, True, True, False, False, False, False,\n False])\n >>> selector.ranking_\n array([1, 1, 1, 1, 1, 6, 4, 3, 2, 5])\n ", - "source_code": "\n\nclass RFECV(RFE):\n \"\"\"Recursive feature elimination with cross-validation to select the number of features.\n\n See glossary entry for :term:`cross-validation estimator`.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n estimator : ``Estimator`` instance\n A supervised learning estimator with a ``fit`` method that provides\n information about feature importance either through a ``coef_``\n attribute or through a ``feature_importances_`` attribute.\n\n step : int or float, default=1\n If greater than or equal to 1, then ``step`` corresponds to the\n (integer) number of features to remove at each iteration.\n If within (0.0, 1.0), then ``step`` corresponds to the percentage\n (rounded down) of features to remove at each iteration.\n Note that the last iteration may remove fewer than ``step`` features in\n order to reach ``min_features_to_select``.\n\n min_features_to_select : int, default=1\n The minimum number of features to be selected. This number of features\n will always be scored, even if the difference between the original\n feature count and ``min_features_to_select`` isn't divisible by\n ``step``.\n\n .. versionadded:: 0.20\n\n cv : int, cross-validation generator or an iterable, default=None\n Determines the cross-validation splitting strategy.\n Possible inputs for cv are:\n\n - None, to use the default 5-fold cross-validation,\n - integer, to specify the number of folds.\n - :term:`CV splitter`,\n - An iterable yielding (train, test) splits as arrays of indices.\n\n For integer/None inputs, if ``y`` is binary or multiclass,\n :class:`~sklearn.model_selection.StratifiedKFold` is used. If the\n estimator is a classifier or if ``y`` is neither binary nor multiclass,\n :class:`~sklearn.model_selection.KFold` is used.\n\n Refer :ref:`User Guide ` for the various\n cross-validation strategies that can be used here.\n\n .. versionchanged:: 0.22\n ``cv`` default value of None changed from 3-fold to 5-fold.\n\n scoring : str, callable or None, default=None\n A string (see model evaluation documentation) or\n a scorer callable object / function with signature\n ``scorer(estimator, X, y)``.\n\n verbose : int, default=0\n Controls verbosity of output.\n\n n_jobs : int or None, default=None\n Number of cores to run in parallel while fitting across folds.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n .. 
versionadded:: 0.18\n\n importance_getter : str or callable, default='auto'\n If 'auto', uses the feature importance either through a `coef_`\n or `feature_importances_` attributes of estimator.\n\n Also accepts a string that specifies an attribute name/path\n for extracting feature importance.\n For example, give `regressor_.coef_` in case of\n :class:`~sklearn.compose.TransformedTargetRegressor` or\n `named_steps.clf.feature_importances_` in case of\n :class:`~sklearn.pipeline.Pipeline` with its last step named `clf`.\n\n If `callable`, overrides the default feature importance getter.\n The callable is passed with the fitted estimator and it should\n return importance for each feature.\n\n .. versionadded:: 0.24\n\n Attributes\n ----------\n classes_ : ndarray of shape (n_classes,)\n The classes labels. Only available when `estimator` is a classifier.\n\n estimator_ : ``Estimator`` instance\n The fitted estimator used to select features.\n\n grid_scores_ : ndarray of shape (n_subsets_of_features,)\n The cross-validation scores such that\n ``grid_scores_[i]`` corresponds to\n the CV score of the i-th subset of features.\n\n .. deprecated:: 1.0\n The `grid_scores_` attribute is deprecated in version 1.0 in favor\n of `cv_results_` and will be removed in version 1.2.\n\n cv_results_ : dict of ndarrays\n A dict with keys:\n\n split(k)_test_score : ndarray of shape (n_features,)\n The cross-validation scores across (k)th fold.\n\n mean_test_score : ndarray of shape (n_features,)\n Mean of scores over the folds.\n\n std_test_score : ndarray of shape (n_features,)\n Standard deviation of scores over the folds.\n\n .. versionadded:: 1.0\n\n n_features_ : int\n The number of selected features with cross-validation.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`. Only defined if the\n underlying estimator exposes such an attribute when fit.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n ranking_ : narray of shape (n_features,)\n The feature ranking, such that `ranking_[i]`\n corresponds to the ranking\n position of the i-th feature.\n Selected (i.e., estimated best)\n features are assigned rank 1.\n\n support_ : ndarray of shape (n_features,)\n The mask of selected features.\n\n See Also\n --------\n RFE : Recursive feature elimination.\n\n Notes\n -----\n The size of ``grid_scores_`` is equal to\n ``ceil((n_features - min_features_to_select) / step) + 1``,\n where step is the number of features removed at each iteration.\n\n Allows NaN/Inf in the input if the underlying estimator does as well.\n\n References\n ----------\n\n .. [1] Guyon, I., Weston, J., Barnhill, S., & Vapnik, V., \"Gene selection\n for cancer classification using support vector machines\",\n Mach. 
Learn., 46(1-3), 389--422, 2002.\n\n Examples\n --------\n The following example shows how to retrieve the a-priori not known 5\n informative features in the Friedman #1 dataset.\n\n >>> from sklearn.datasets import make_friedman1\n >>> from sklearn.feature_selection import RFECV\n >>> from sklearn.svm import SVR\n >>> X, y = make_friedman1(n_samples=50, n_features=10, random_state=0)\n >>> estimator = SVR(kernel=\"linear\")\n >>> selector = RFECV(estimator, step=1, cv=5)\n >>> selector = selector.fit(X, y)\n >>> selector.support_\n array([ True, True, True, True, True, False, False, False, False,\n False])\n >>> selector.ranking_\n array([1, 1, 1, 1, 1, 6, 4, 3, 2, 5])\n \"\"\"\n \n def __init__(self, estimator, *, step=1, min_features_to_select=1, cv=None, scoring=None, verbose=0, n_jobs=None, importance_getter='auto'):\n self.estimator = estimator\n self.step = step\n self.importance_getter = importance_getter\n self.cv = cv\n self.scoring = scoring\n self.verbose = verbose\n self.n_jobs = n_jobs\n self.min_features_to_select = min_features_to_select\n \n def fit(self, X, y, groups=None):\n \"\"\"Fit the RFE model and automatically tune the number of selected features.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training vector, where `n_samples` is the number of samples and\n `n_features` is the total number of features.\n\n y : array-like of shape (n_samples,)\n Target values (integers for classification, real numbers for\n regression).\n\n groups : array-like of shape (n_samples,) or None, default=None\n Group labels for the samples used while splitting the dataset into\n train/test set. Only used in conjunction with a \"Group\" :term:`cv`\n instance (e.g., :class:`~sklearn.model_selection.GroupKFold`).\n\n .. 
versionadded:: 0.20\n\n Returns\n -------\n self : object\n Fitted estimator.\n \"\"\"\n tags = self._get_tags()\n (X, y) = self._validate_data(X, y, accept_sparse='csr', ensure_min_features=2, force_all_finite=not tags.get('allow_nan', True), multi_output=True)\n cv = check_cv(self.cv, y, classifier=is_classifier(self.estimator))\n scorer = check_scoring(self.estimator, scoring=self.scoring)\n n_features = X.shape[1]\n if 0.0 < self.step < 1.0:\n step = int(max(1, self.step * n_features))\n else:\n step = int(self.step)\n if step <= 0:\n raise ValueError('Step must be >0')\n rfe = RFE(estimator=self.estimator, n_features_to_select=self.min_features_to_select, importance_getter=self.importance_getter, step=self.step, verbose=self.verbose)\n if effective_n_jobs(self.n_jobs) == 1:\n (parallel, func) = (list, _rfe_single_fit)\n else:\n parallel = Parallel(n_jobs=self.n_jobs)\n func = delayed(_rfe_single_fit)\n scores = parallel((func(rfe, self.estimator, X, y, train, test, scorer) for (train, test) in cv.split(X, y, groups)))\n scores = np.array(scores)\n scores_sum = np.sum(scores, axis=0)\n scores_sum_rev = scores_sum[::-1]\n argmax_idx = len(scores_sum) - np.argmax(scores_sum_rev) - 1\n n_features_to_select = max(n_features - argmax_idx * step, self.min_features_to_select)\n rfe = RFE(estimator=self.estimator, n_features_to_select=n_features_to_select, step=self.step, importance_getter=self.importance_getter, verbose=self.verbose)\n rfe.fit(X, y)\n self.support_ = rfe.support_\n self.n_features_ = rfe.n_features_\n self.ranking_ = rfe.ranking_\n self.estimator_ = clone(self.estimator)\n self.estimator_.fit(self.transform(X), y)\n scores_rev = scores[:, ::-1]\n self.cv_results_ = {}\n self.cv_results_['mean_test_score'] = np.mean(scores_rev, axis=0)\n self.cv_results_['std_test_score'] = np.std(scores_rev, axis=0)\n for i in range(scores.shape[0]):\n self.cv_results_[f'split{i}_test_score'] = scores_rev[i]\n return self\n \n @deprecated('The `grid_scores_` attribute is deprecated in version 1.0 in favor of `cv_results_` and will be removed in version 1.2.')\n @property\n def grid_scores_(self):\n grid_size = len(self.cv_results_) - 2\n return np.asarray([self.cv_results_[f'split{i}_test_score'] for i in range(grid_size)]).T\n" + "source_code": "\n\nclass RFECV(RFE):\n \"\"\"Recursive feature elimination with cross-validation to select the number of features.\n\n See glossary entry for :term:`cross-validation estimator`.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n estimator : ``Estimator`` instance\n A supervised learning estimator with a ``fit`` method that provides\n information about feature importance either through a ``coef_``\n attribute or through a ``feature_importances_`` attribute.\n\n step : int or float, default=1\n If greater than or equal to 1, then ``step`` corresponds to the\n (integer) number of features to remove at each iteration.\n If within (0.0, 1.0), then ``step`` corresponds to the percentage\n (rounded down) of features to remove at each iteration.\n Note that the last iteration may remove fewer than ``step`` features in\n order to reach ``min_features_to_select``.\n\n min_features_to_select : int, default=1\n The minimum number of features to be selected. This number of features\n will always be scored, even if the difference between the original\n feature count and ``min_features_to_select`` isn't divisible by\n ``step``.\n\n .. 
versionadded:: 0.20\n\n cv : int, cross-validation generator or an iterable, default=None\n Determines the cross-validation splitting strategy.\n Possible inputs for cv are:\n\n - None, to use the default 5-fold cross-validation,\n - integer, to specify the number of folds.\n - :term:`CV splitter`,\n - An iterable yielding (train, test) splits as arrays of indices.\n\n For integer/None inputs, if ``y`` is binary or multiclass,\n :class:`~sklearn.model_selection.StratifiedKFold` is used. If the\n estimator is a classifier or if ``y`` is neither binary nor multiclass,\n :class:`~sklearn.model_selection.KFold` is used.\n\n Refer :ref:`User Guide ` for the various\n cross-validation strategies that can be used here.\n\n .. versionchanged:: 0.22\n ``cv`` default value of None changed from 3-fold to 5-fold.\n\n scoring : str, callable or None, default=None\n A string (see model evaluation documentation) or\n a scorer callable object / function with signature\n ``scorer(estimator, X, y)``.\n\n verbose : int, default=0\n Controls verbosity of output.\n\n n_jobs : int or None, default=None\n Number of cores to run in parallel while fitting across folds.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n .. versionadded:: 0.18\n\n importance_getter : str or callable, default='auto'\n If 'auto', uses the feature importance either through a `coef_`\n or `feature_importances_` attributes of estimator.\n\n Also accepts a string that specifies an attribute name/path\n for extracting feature importance.\n For example, give `regressor_.coef_` in case of\n :class:`~sklearn.compose.TransformedTargetRegressor` or\n `named_steps.clf.feature_importances_` in case of\n :class:`~sklearn.pipeline.Pipeline` with its last step named `clf`.\n\n If `callable`, overrides the default feature importance getter.\n The callable is passed with the fitted estimator and it should\n return importance for each feature.\n\n .. versionadded:: 0.24\n\n Attributes\n ----------\n classes_ : ndarray of shape (n_classes,)\n The classes labels. Only available when `estimator` is a classifier.\n\n estimator_ : ``Estimator`` instance\n The fitted estimator used to select features.\n\n grid_scores_ : ndarray of shape (n_subsets_of_features,)\n The cross-validation scores such that\n ``grid_scores_[i]`` corresponds to\n the CV score of the i-th subset of features.\n\n .. deprecated:: 1.0\n The `grid_scores_` attribute is deprecated in version 1.0 in favor\n of `cv_results_` and will be removed in version 1.2.\n\n cv_results_ : dict of ndarrays\n A dict with keys:\n\n split(k)_test_score : ndarray of shape (n_features,)\n The cross-validation scores across (k)th fold.\n\n mean_test_score : ndarray of shape (n_features,)\n Mean of scores over the folds.\n\n std_test_score : ndarray of shape (n_features,)\n Standard deviation of scores over the folds.\n\n .. versionadded:: 1.0\n\n n_features_ : int\n The number of selected features with cross-validation.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`. Only defined if the\n underlying estimator exposes such an attribute when fit.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. 
versionadded:: 1.0\n\n ranking_ : narray of shape (n_features,)\n The feature ranking, such that `ranking_[i]`\n corresponds to the ranking\n position of the i-th feature.\n Selected (i.e., estimated best)\n features are assigned rank 1.\n\n support_ : ndarray of shape (n_features,)\n The mask of selected features.\n\n See Also\n --------\n RFE : Recursive feature elimination.\n\n Notes\n -----\n The size of ``grid_scores_`` is equal to\n ``ceil((n_features - min_features_to_select) / step) + 1``,\n where step is the number of features removed at each iteration.\n\n Allows NaN/Inf in the input if the underlying estimator does as well.\n\n References\n ----------\n\n .. [1] Guyon, I., Weston, J., Barnhill, S., & Vapnik, V., \"Gene selection\n for cancer classification using support vector machines\",\n Mach. Learn., 46(1-3), 389--422, 2002.\n\n Examples\n --------\n The following example shows how to retrieve the a-priori not known 5\n informative features in the Friedman #1 dataset.\n\n >>> from sklearn.datasets import make_friedman1\n >>> from sklearn.feature_selection import RFECV\n >>> from sklearn.svm import SVR\n >>> X, y = make_friedman1(n_samples=50, n_features=10, random_state=0)\n >>> estimator = SVR(kernel=\"linear\")\n >>> selector = RFECV(estimator, step=1, cv=5)\n >>> selector = selector.fit(X, y)\n >>> selector.support_\n array([ True, True, True, True, True, False, False, False, False,\n False])\n >>> selector.ranking_\n array([1, 1, 1, 1, 1, 6, 4, 3, 2, 5])\n \"\"\"\n \n def __init__(self, estimator, *, step=1, min_features_to_select=1, cv=None, scoring=None, verbose=0, n_jobs=None, importance_getter='auto'):\n self.estimator = estimator\n self.step = step\n self.importance_getter = importance_getter\n self.cv = cv\n self.scoring = scoring\n self.verbose = verbose\n self.n_jobs = n_jobs\n self.min_features_to_select = min_features_to_select\n \n def fit(self, X, y, groups=None):\n \"\"\"Fit the RFE model and automatically tune the number of selected features.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training vector, where `n_samples` is the number of samples and\n `n_features` is the total number of features.\n\n y : array-like of shape (n_samples,)\n Target values (integers for classification, real numbers for\n regression).\n\n groups : array-like of shape (n_samples,) or None, default=None\n Group labels for the samples used while splitting the dataset into\n train/test set. Only used in conjunction with a \"Group\" :term:`cv`\n instance (e.g., :class:`~sklearn.model_selection.GroupKFold`).\n\n .. 
versionadded:: 0.20\n\n Returns\n -------\n self : object\n Fitted estimator.\n \"\"\"\n tags = self._get_tags()\n (X, y) = self._validate_data(X, y, accept_sparse='csr', ensure_min_features=2, force_all_finite=not tags.get('allow_nan', True), multi_output=True)\n cv = check_cv(self.cv, y, classifier=is_classifier(self.estimator))\n scorer = check_scoring(self.estimator, scoring=self.scoring)\n n_features = X.shape[1]\n if 0.0 < self.step < 1.0:\n step = int(max(1, self.step * n_features))\n else:\n step = int(self.step)\n if step <= 0:\n raise ValueError('Step must be >0')\n rfe = RFE(estimator=self.estimator, n_features_to_select=self.min_features_to_select, importance_getter=self.importance_getter, step=self.step, verbose=self.verbose)\n if effective_n_jobs(self.n_jobs) == 1:\n (parallel, func) = (list, _rfe_single_fit)\n else:\n parallel = Parallel(n_jobs=self.n_jobs)\n func = delayed(_rfe_single_fit)\n scores = parallel((func(rfe, self.estimator, X, y, train, test, scorer) for (train, test) in cv.split(X, y, groups)))\n scores = np.array(scores)\n scores_sum = np.sum(scores, axis=0)\n scores_sum_rev = scores_sum[::-1]\n argmax_idx = len(scores_sum) - np.argmax(scores_sum_rev) - 1\n n_features_to_select = max(n_features - argmax_idx * step, self.min_features_to_select)\n rfe = RFE(estimator=self.estimator, n_features_to_select=n_features_to_select, step=self.step, importance_getter=self.importance_getter, verbose=self.verbose)\n rfe.fit(X, y)\n self.support_ = rfe.support_\n self.n_features_ = rfe.n_features_\n self.ranking_ = rfe.ranking_\n self.estimator_ = clone(self.estimator)\n self.estimator_.fit(self._transform(X), y)\n scores_rev = scores[:, ::-1]\n self.cv_results_ = {}\n self.cv_results_['mean_test_score'] = np.mean(scores_rev, axis=0)\n self.cv_results_['std_test_score'] = np.std(scores_rev, axis=0)\n for i in range(scores.shape[0]):\n self.cv_results_[f'split{i}_test_score'] = scores_rev[i]\n return self\n \n @deprecated('The `grid_scores_` attribute is deprecated in version 1.0 in favor of `cv_results_` and will be removed in version 1.2.')\n @property\n def grid_scores_(self):\n grid_size = len(self.cv_results_) - 2\n return np.asarray([self.cv_results_[f'split{i}_test_score'] for i in range(grid_size)]).T\n" }, { "name": "SequentialFeatureSelector", @@ -22532,7 +22599,7 @@ "sklearn.feature_selection._sequential.SequentialFeatureSelector._more_tags" ], "is_public": true, - "description": "Transformer that performs Sequential Feature Selection.\n\nThis Sequential Feature Selector adds (forward selection) or removes (backward selection) features to form a feature subset in a greedy fashion. At each stage, this estimator chooses the best feature to add or remove based on the cross-validation score of an estimator. In the case of unsupervised learning, this Sequential Feature Selector looks only at the features (X), not the desired outputs (y). Read more in the :ref:`User Guide `. .. versionadded:: 0.24", + "description": "Transformer that performs Sequential Feature Selection.\n\nThis Sequential Feature Selector adds (forward selection) or\nremoves (backward selection) features to form a feature subset in a\ngreedy fashion. At each stage, this estimator chooses the best feature to\nadd or remove based on the cross-validation score of an estimator. In\nthe case of unsupervised learning, this Sequential Feature Selector\nlooks only at the features (X), not the desired outputs (y).\n\nRead more in the :ref:`User Guide `.\n\n.. 
versionadded:: 0.24", "docstring": "Transformer that performs Sequential Feature Selection.\n\n This Sequential Feature Selector adds (forward selection) or\n removes (backward selection) features to form a feature subset in a\n greedy fashion. At each stage, this estimator chooses the best feature to\n add or remove based on the cross-validation score of an estimator. In\n the case of unsupervised learning, this Sequential Feature Selector\n looks only at the features (X), not the desired outputs (y).\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.24\n\n Parameters\n ----------\n estimator : estimator instance\n An unfitted estimator.\n\n n_features_to_select : int or float, default=None\n The number of features to select. If `None`, half of the features are\n selected. If integer, the parameter is the absolute number of features\n to select. If float between 0 and 1, it is the fraction of features to\n select.\n\n direction : {'forward', 'backward'}, default='forward'\n Whether to perform forward selection or backward selection.\n\n scoring : str, callable, list/tuple or dict, default=None\n A single str (see :ref:`scoring_parameter`) or a callable\n (see :ref:`scoring`) to evaluate the predictions on the test set.\n\n NOTE that when using custom scorers, each scorer should return a single\n value. Metric functions returning a list/array of values can be wrapped\n into multiple scorers that return one value each.\n\n If None, the estimator's score method is used.\n\n cv : int, cross-validation generator or an iterable, default=None\n Determines the cross-validation splitting strategy.\n Possible inputs for cv are:\n\n - None, to use the default 5-fold cross validation,\n - integer, to specify the number of folds in a `(Stratified)KFold`,\n - :term:`CV splitter`,\n - An iterable yielding (train, test) splits as arrays of indices.\n\n For integer/None inputs, if the estimator is a classifier and ``y`` is\n either binary or multiclass, :class:`StratifiedKFold` is used. In all\n other cases, :class:`KFold` is used. These splitters are instantiated\n with `shuffle=False` so the splits will be the same across calls.\n\n Refer :ref:`User Guide ` for the various\n cross-validation strategies that can be used here.\n\n n_jobs : int, default=None\n Number of jobs to run in parallel. When evaluating a new feature to\n add or remove, the cross-validation procedure is parallel over the\n folds.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n Attributes\n ----------\n n_features_in_ : int\n Number of features seen during :term:`fit`. Only defined if the\n underlying estimator exposes such an attribute when fit.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. 
versionadded:: 1.0\n\n n_features_to_select_ : int\n The number of features that were selected.\n\n support_ : ndarray of shape (n_features,), dtype=bool\n The mask of selected features.\n\n See Also\n --------\n GenericUnivariateSelect : Univariate feature selector with configurable\n strategy.\n RFE : Recursive feature elimination based on importance weights.\n RFECV : Recursive feature elimination based on importance weights, with\n automatic selection of the number of features.\n SelectFromModel : Feature selection based on thresholds of importance\n weights.\n\n Examples\n --------\n >>> from sklearn.feature_selection import SequentialFeatureSelector\n >>> from sklearn.neighbors import KNeighborsClassifier\n >>> from sklearn.datasets import load_iris\n >>> X, y = load_iris(return_X_y=True)\n >>> knn = KNeighborsClassifier(n_neighbors=3)\n >>> sfs = SequentialFeatureSelector(knn, n_features_to_select=3)\n >>> sfs.fit(X, y)\n SequentialFeatureSelector(estimator=KNeighborsClassifier(n_neighbors=3),\n n_features_to_select=3)\n >>> sfs.get_support()\n array([ True, False, True, True])\n >>> sfs.transform(X).shape\n (150, 3)\n ", "source_code": "\n\nclass SequentialFeatureSelector(SelectorMixin, MetaEstimatorMixin, BaseEstimator):\n \"\"\"Transformer that performs Sequential Feature Selection.\n\n This Sequential Feature Selector adds (forward selection) or\n removes (backward selection) features to form a feature subset in a\n greedy fashion. At each stage, this estimator chooses the best feature to\n add or remove based on the cross-validation score of an estimator. In\n the case of unsupervised learning, this Sequential Feature Selector\n looks only at the features (X), not the desired outputs (y).\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.24\n\n Parameters\n ----------\n estimator : estimator instance\n An unfitted estimator.\n\n n_features_to_select : int or float, default=None\n The number of features to select. If `None`, half of the features are\n selected. If integer, the parameter is the absolute number of features\n to select. If float between 0 and 1, it is the fraction of features to\n select.\n\n direction : {'forward', 'backward'}, default='forward'\n Whether to perform forward selection or backward selection.\n\n scoring : str, callable, list/tuple or dict, default=None\n A single str (see :ref:`scoring_parameter`) or a callable\n (see :ref:`scoring`) to evaluate the predictions on the test set.\n\n NOTE that when using custom scorers, each scorer should return a single\n value. Metric functions returning a list/array of values can be wrapped\n into multiple scorers that return one value each.\n\n If None, the estimator's score method is used.\n\n cv : int, cross-validation generator or an iterable, default=None\n Determines the cross-validation splitting strategy.\n Possible inputs for cv are:\n\n - None, to use the default 5-fold cross validation,\n - integer, to specify the number of folds in a `(Stratified)KFold`,\n - :term:`CV splitter`,\n - An iterable yielding (train, test) splits as arrays of indices.\n\n For integer/None inputs, if the estimator is a classifier and ``y`` is\n either binary or multiclass, :class:`StratifiedKFold` is used. In all\n other cases, :class:`KFold` is used. 
These splitters are instantiated\n with `shuffle=False` so the splits will be the same across calls.\n\n Refer :ref:`User Guide ` for the various\n cross-validation strategies that can be used here.\n\n n_jobs : int, default=None\n Number of jobs to run in parallel. When evaluating a new feature to\n add or remove, the cross-validation procedure is parallel over the\n folds.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n Attributes\n ----------\n n_features_in_ : int\n Number of features seen during :term:`fit`. Only defined if the\n underlying estimator exposes such an attribute when fit.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n n_features_to_select_ : int\n The number of features that were selected.\n\n support_ : ndarray of shape (n_features,), dtype=bool\n The mask of selected features.\n\n See Also\n --------\n GenericUnivariateSelect : Univariate feature selector with configurable\n strategy.\n RFE : Recursive feature elimination based on importance weights.\n RFECV : Recursive feature elimination based on importance weights, with\n automatic selection of the number of features.\n SelectFromModel : Feature selection based on thresholds of importance\n weights.\n\n Examples\n --------\n >>> from sklearn.feature_selection import SequentialFeatureSelector\n >>> from sklearn.neighbors import KNeighborsClassifier\n >>> from sklearn.datasets import load_iris\n >>> X, y = load_iris(return_X_y=True)\n >>> knn = KNeighborsClassifier(n_neighbors=3)\n >>> sfs = SequentialFeatureSelector(knn, n_features_to_select=3)\n >>> sfs.fit(X, y)\n SequentialFeatureSelector(estimator=KNeighborsClassifier(n_neighbors=3),\n n_features_to_select=3)\n >>> sfs.get_support()\n array([ True, False, True, True])\n >>> sfs.transform(X).shape\n (150, 3)\n \"\"\"\n \n def __init__(self, estimator, *, n_features_to_select=None, direction='forward', scoring=None, cv=5, n_jobs=None):\n self.estimator = estimator\n self.n_features_to_select = n_features_to_select\n self.direction = direction\n self.scoring = scoring\n self.cv = cv\n self.n_jobs = n_jobs\n \n def fit(self, X, y=None):\n \"\"\"Learn the features to select from X.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training vectors, where `n_samples` is the number of samples and\n `n_features` is the number of predictors.\n\n y : array-like of shape (n_samples,), default=None\n Target values. This parameter may be ignored for\n unsupervised learning.\n\n Returns\n -------\n self : object\n Returns the instance itself.\n \"\"\"\n tags = self._get_tags()\n X = self._validate_data(X, accept_sparse='csc', ensure_min_features=2, force_all_finite=not tags.get('allow_nan', True))\n n_features = X.shape[1]\n error_msg = f'n_features_to_select must be either None, an integer in [1, n_features - 1] representing the absolute number of features, or a float in (0, 1] representing a percentage of features to select. 
Got {self.n_features_to_select}'\n if self.n_features_to_select is None:\n self.n_features_to_select_ = n_features // 2\n elif isinstance(self.n_features_to_select, numbers.Integral):\n if not 0 < self.n_features_to_select < n_features:\n raise ValueError(error_msg)\n self.n_features_to_select_ = self.n_features_to_select\n elif isinstance(self.n_features_to_select, numbers.Real):\n if not 0 < self.n_features_to_select <= 1:\n raise ValueError(error_msg)\n self.n_features_to_select_ = int(n_features * self.n_features_to_select)\n else:\n raise ValueError(error_msg)\n if self.direction not in ('forward', 'backward'):\n raise ValueError(f\"direction must be either 'forward' or 'backward'. Got {self.direction}.\")\n cloned_estimator = clone(self.estimator)\n current_mask = np.zeros(shape=n_features, dtype=bool)\n n_iterations = self.n_features_to_select_ if self.direction == 'forward' else n_features - self.n_features_to_select_\n for _ in range(n_iterations):\n new_feature_idx = self._get_best_new_feature(cloned_estimator, X, y, current_mask)\n current_mask[new_feature_idx] = True\n if self.direction == 'backward':\n current_mask = ~current_mask\n self.support_ = current_mask\n return self\n \n def _get_best_new_feature(self, estimator, X, y, current_mask):\n candidate_feature_indices = np.flatnonzero(~current_mask)\n scores = {}\n for feature_idx in candidate_feature_indices:\n candidate_mask = current_mask.copy()\n candidate_mask[feature_idx] = True\n if self.direction == 'backward':\n candidate_mask = ~candidate_mask\n X_new = X[:, candidate_mask]\n scores[feature_idx] = cross_val_score(estimator, X_new, y, cv=self.cv, scoring=self.scoring, n_jobs=self.n_jobs).mean()\n return max(scores, key=lambda feature_idx: scores[feature_idx])\n \n def _get_support_mask(self):\n check_is_fitted(self)\n return self.support_\n \n def _more_tags(self):\n return {'allow_nan': _safe_tags(self.estimator, key='allow_nan'), 'requires_y': True}\n" }, @@ -22562,7 +22629,7 @@ "sklearn.feature_selection._univariate_selection.SelectFdr._get_support_mask" ], "is_public": true, - "description": "Filter: Select the p-values for an estimated false discovery rate.\n\nThis uses the Benjamini-Hochberg procedure. ``alpha`` is an upper bound on the expected false discovery rate. Read more in the :ref:`User Guide `.", + "description": "Filter: Select the p-values for an estimated false discovery rate.\n\nThis uses the Benjamini-Hochberg procedure. ``alpha`` is an upper bound\non the expected false discovery rate.\n\nRead more in the :ref:`User Guide `.", "docstring": "Filter: Select the p-values for an estimated false discovery rate.\n\n This uses the Benjamini-Hochberg procedure. ``alpha`` is an upper bound\n on the expected false discovery rate.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n score_func : callable, default=f_classif\n Function taking two arrays X and y, and returning a pair of arrays\n (scores, pvalues).\n Default is f_classif (see below \"See Also\"). The default function only\n works with classification tasks.\n\n alpha : float, default=5e-2\n The highest uncorrected p-value for features to keep.\n\n Attributes\n ----------\n scores_ : array-like of shape (n_features,)\n Scores of features.\n\n pvalues_ : array-like of shape (n_features,)\n p-values of feature scores.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. 
Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n f_classif : ANOVA F-value between label/feature for classification tasks.\n mutual_info_classif : Mutual information for a discrete target.\n chi2 : Chi-squared stats of non-negative features for classification tasks.\n f_regression : F-value between label/feature for regression tasks.\n mutual_info_regression : Mutual information for a contnuous target.\n SelectPercentile : Select features based on percentile of the highest\n scores.\n SelectKBest : Select features based on the k highest scores.\n SelectFpr : Select features based on a false positive rate test.\n SelectFwe : Select features based on family-wise error rate.\n GenericUnivariateSelect : Univariate feature selector with configurable\n mode.\n\n References\n ----------\n https://en.wikipedia.org/wiki/False_discovery_rate\n\n Examples\n --------\n >>> from sklearn.datasets import load_breast_cancer\n >>> from sklearn.feature_selection import SelectFdr, chi2\n >>> X, y = load_breast_cancer(return_X_y=True)\n >>> X.shape\n (569, 30)\n >>> X_new = SelectFdr(chi2, alpha=0.01).fit_transform(X, y)\n >>> X_new.shape\n (569, 16)\n ", "source_code": "\n\nclass SelectFdr(_BaseFilter):\n \"\"\"Filter: Select the p-values for an estimated false discovery rate.\n\n This uses the Benjamini-Hochberg procedure. ``alpha`` is an upper bound\n on the expected false discovery rate.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n score_func : callable, default=f_classif\n Function taking two arrays X and y, and returning a pair of arrays\n (scores, pvalues).\n Default is f_classif (see below \"See Also\"). The default function only\n works with classification tasks.\n\n alpha : float, default=5e-2\n The highest uncorrected p-value for features to keep.\n\n Attributes\n ----------\n scores_ : array-like of shape (n_features,)\n Scores of features.\n\n pvalues_ : array-like of shape (n_features,)\n p-values of feature scores.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. 
versionadded:: 1.0\n\n See Also\n --------\n f_classif : ANOVA F-value between label/feature for classification tasks.\n mutual_info_classif : Mutual information for a discrete target.\n chi2 : Chi-squared stats of non-negative features for classification tasks.\n f_regression : F-value between label/feature for regression tasks.\n mutual_info_regression : Mutual information for a contnuous target.\n SelectPercentile : Select features based on percentile of the highest\n scores.\n SelectKBest : Select features based on the k highest scores.\n SelectFpr : Select features based on a false positive rate test.\n SelectFwe : Select features based on family-wise error rate.\n GenericUnivariateSelect : Univariate feature selector with configurable\n mode.\n\n References\n ----------\n https://en.wikipedia.org/wiki/False_discovery_rate\n\n Examples\n --------\n >>> from sklearn.datasets import load_breast_cancer\n >>> from sklearn.feature_selection import SelectFdr, chi2\n >>> X, y = load_breast_cancer(return_X_y=True)\n >>> X.shape\n (569, 30)\n >>> X_new = SelectFdr(chi2, alpha=0.01).fit_transform(X, y)\n >>> X_new.shape\n (569, 16)\n \"\"\"\n \n def __init__(self, score_func=f_classif, *, alpha=0.05):\n super().__init__(score_func=score_func)\n self.alpha = alpha\n \n def _get_support_mask(self):\n check_is_fitted(self)\n n_features = len(self.pvalues_)\n sv = np.sort(self.pvalues_)\n selected = sv[sv <= float(self.alpha) / n_features * np.arange(1, n_features + 1)]\n if selected.size == 0:\n return np.zeros_like(self.pvalues_, dtype=bool)\n return self.pvalues_ <= selected.max()\n" }, @@ -22576,9 +22643,9 @@ "sklearn.feature_selection._univariate_selection.SelectFpr._get_support_mask" ], "is_public": true, - "description": "Filter: Select the pvalues below alpha based on a FPR test.\n\nFPR test stands for False Positive Rate test. It controls the total amount of false detections. Read more in the :ref:`User Guide `.", - "docstring": "Filter: Select the pvalues below alpha based on a FPR test.\n\n FPR test stands for False Positive Rate test. It controls the total\n amount of false detections.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n score_func : callable, default=f_classif\n Function taking two arrays X and y, and returning a pair of arrays\n (scores, pvalues).\n Default is f_classif (see below \"See Also\"). The default function only\n works with classification tasks.\n\n alpha : float, default=5e-2\n The highest p-value for features to be kept.\n\n Attributes\n ----------\n scores_ : array-like of shape (n_features,)\n Scores of features.\n\n pvalues_ : array-like of shape (n_features,)\n p-values of feature scores.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. 
versionadded:: 1.0\n\n See Also\n --------\n f_classif : ANOVA F-value between label/feature for classification tasks.\n chi2 : Chi-squared stats of non-negative features for classification tasks.\n mutual_info_classif: Mutual information for a discrete target.\n f_regression : F-value between label/feature for regression tasks.\n mutual_info_regression : Mutual information for a continuous target.\n SelectPercentile : Select features based on percentile of the highest\n scores.\n SelectKBest : Select features based on the k highest scores.\n SelectFdr : Select features based on an estimated false discovery rate.\n SelectFwe : Select features based on family-wise error rate.\n GenericUnivariateSelect : Univariate feature selector with configurable\n mode.\n\n Examples\n --------\n >>> from sklearn.datasets import load_breast_cancer\n >>> from sklearn.feature_selection import SelectFpr, chi2\n >>> X, y = load_breast_cancer(return_X_y=True)\n >>> X.shape\n (569, 30)\n >>> X_new = SelectFpr(chi2, alpha=0.01).fit_transform(X, y)\n >>> X_new.shape\n (569, 16)\n ", - "source_code": "\n\nclass SelectFpr(_BaseFilter):\n \"\"\"Filter: Select the pvalues below alpha based on a FPR test.\n\n FPR test stands for False Positive Rate test. It controls the total\n amount of false detections.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n score_func : callable, default=f_classif\n Function taking two arrays X and y, and returning a pair of arrays\n (scores, pvalues).\n Default is f_classif (see below \"See Also\"). The default function only\n works with classification tasks.\n\n alpha : float, default=5e-2\n The highest p-value for features to be kept.\n\n Attributes\n ----------\n scores_ : array-like of shape (n_features,)\n Scores of features.\n\n pvalues_ : array-like of shape (n_features,)\n p-values of feature scores.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. 
versionadded:: 1.0\n\n See Also\n --------\n f_classif : ANOVA F-value between label/feature for classification tasks.\n chi2 : Chi-squared stats of non-negative features for classification tasks.\n mutual_info_classif: Mutual information for a discrete target.\n f_regression : F-value between label/feature for regression tasks.\n mutual_info_regression : Mutual information for a continuous target.\n SelectPercentile : Select features based on percentile of the highest\n scores.\n SelectKBest : Select features based on the k highest scores.\n SelectFdr : Select features based on an estimated false discovery rate.\n SelectFwe : Select features based on family-wise error rate.\n GenericUnivariateSelect : Univariate feature selector with configurable\n mode.\n\n Examples\n --------\n >>> from sklearn.datasets import load_breast_cancer\n >>> from sklearn.feature_selection import SelectFpr, chi2\n >>> X, y = load_breast_cancer(return_X_y=True)\n >>> X.shape\n (569, 30)\n >>> X_new = SelectFpr(chi2, alpha=0.01).fit_transform(X, y)\n >>> X_new.shape\n (569, 16)\n \"\"\"\n \n def __init__(self, score_func=f_classif, *, alpha=0.05):\n super().__init__(score_func=score_func)\n self.alpha = alpha\n \n def _get_support_mask(self):\n check_is_fitted(self)\n return self.pvalues_ < self.alpha\n" + "description": "Filter: Select the pvalues below alpha based on a FPR test.\n\nFPR test stands for False Positive Rate test. It controls the total\namount of false detections.\n\nRead more in the :ref:`User Guide `.", + "docstring": "Filter: Select the pvalues below alpha based on a FPR test.\n\n FPR test stands for False Positive Rate test. It controls the total\n amount of false detections.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n score_func : callable, default=f_classif\n Function taking two arrays X and y, and returning a pair of arrays\n (scores, pvalues).\n Default is f_classif (see below \"See Also\"). The default function only\n works with classification tasks.\n\n alpha : float, default=5e-2\n Features with p-values less than `alpha` are selected.\n\n Attributes\n ----------\n scores_ : array-like of shape (n_features,)\n Scores of features.\n\n pvalues_ : array-like of shape (n_features,)\n p-values of feature scores.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. 
versionadded:: 1.0\n\n See Also\n --------\n f_classif : ANOVA F-value between label/feature for classification tasks.\n chi2 : Chi-squared stats of non-negative features for classification tasks.\n mutual_info_classif: Mutual information for a discrete target.\n f_regression : F-value between label/feature for regression tasks.\n mutual_info_regression : Mutual information for a continuous target.\n SelectPercentile : Select features based on percentile of the highest\n scores.\n SelectKBest : Select features based on the k highest scores.\n SelectFdr : Select features based on an estimated false discovery rate.\n SelectFwe : Select features based on family-wise error rate.\n GenericUnivariateSelect : Univariate feature selector with configurable\n mode.\n\n Examples\n --------\n >>> from sklearn.datasets import load_breast_cancer\n >>> from sklearn.feature_selection import SelectFpr, chi2\n >>> X, y = load_breast_cancer(return_X_y=True)\n >>> X.shape\n (569, 30)\n >>> X_new = SelectFpr(chi2, alpha=0.01).fit_transform(X, y)\n >>> X_new.shape\n (569, 16)\n ", + "source_code": "\n\nclass SelectFpr(_BaseFilter):\n \"\"\"Filter: Select the pvalues below alpha based on a FPR test.\n\n FPR test stands for False Positive Rate test. It controls the total\n amount of false detections.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n score_func : callable, default=f_classif\n Function taking two arrays X and y, and returning a pair of arrays\n (scores, pvalues).\n Default is f_classif (see below \"See Also\"). The default function only\n works with classification tasks.\n\n alpha : float, default=5e-2\n Features with p-values less than `alpha` are selected.\n\n Attributes\n ----------\n scores_ : array-like of shape (n_features,)\n Scores of features.\n\n pvalues_ : array-like of shape (n_features,)\n p-values of feature scores.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. 
versionadded:: 1.0\n\n See Also\n --------\n f_classif : ANOVA F-value between label/feature for classification tasks.\n chi2 : Chi-squared stats of non-negative features for classification tasks.\n mutual_info_classif: Mutual information for a discrete target.\n f_regression : F-value between label/feature for regression tasks.\n mutual_info_regression : Mutual information for a continuous target.\n SelectPercentile : Select features based on percentile of the highest\n scores.\n SelectKBest : Select features based on the k highest scores.\n SelectFdr : Select features based on an estimated false discovery rate.\n SelectFwe : Select features based on family-wise error rate.\n GenericUnivariateSelect : Univariate feature selector with configurable\n mode.\n\n Examples\n --------\n >>> from sklearn.datasets import load_breast_cancer\n >>> from sklearn.feature_selection import SelectFpr, chi2\n >>> X, y = load_breast_cancer(return_X_y=True)\n >>> X.shape\n (569, 30)\n >>> X_new = SelectFpr(chi2, alpha=0.01).fit_transform(X, y)\n >>> X_new.shape\n (569, 16)\n \"\"\"\n \n def __init__(self, score_func=f_classif, *, alpha=0.05):\n super().__init__(score_func=score_func)\n self.alpha = alpha\n \n def _get_support_mask(self):\n check_is_fitted(self)\n return self.pvalues_ < self.alpha\n" }, { "name": "SelectFwe", @@ -22652,7 +22719,7 @@ "sklearn.feature_selection._variance_threshold.VarianceThreshold._more_tags" ], "is_public": true, - "description": "Feature selector that removes all low-variance features.\n\nThis feature selection algorithm looks only at the features (X), not the desired outputs (y), and can thus be used for unsupervised learning. Read more in the :ref:`User Guide `.", + "description": "Feature selector that removes all low-variance features.\n\nThis feature selection algorithm looks only at the features (X), not the\ndesired outputs (y), and can thus be used for unsupervised learning.\n\nRead more in the :ref:`User Guide `.", "docstring": "Feature selector that removes all low-variance features.\n\n This feature selection algorithm looks only at the features (X), not the\n desired outputs (y), and can thus be used for unsupervised learning.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n threshold : float, default=0\n Features with a training-set variance lower than this threshold will\n be removed. The default is to keep all features with non-zero variance,\n i.e. remove the features that have the same value in all samples.\n\n Attributes\n ----------\n variances_ : array, shape (n_features,)\n Variances of individual features.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n SelectFromModel: Meta-transformer for selecting features based on\n importance weights.\n SelectPercentile : Select features according to a percentile of the highest\n scores.\n SequentialFeatureSelector : Transformer that performs Sequential Feature\n Selection.\n\n Notes\n -----\n Allows NaN in the input.\n Raises ValueError if no feature in X meets the variance threshold.\n\n Examples\n --------\n The following dataset has integer features, two of which are the same\n in every sample. 
These are removed with the default setting for threshold::\n\n >>> from sklearn.feature_selection import VarianceThreshold\n >>> X = [[0, 2, 0, 3], [0, 1, 4, 3], [0, 1, 1, 3]]\n >>> selector = VarianceThreshold()\n >>> selector.fit_transform(X)\n array([[2, 0],\n [1, 4],\n [1, 1]])\n ", "source_code": "\n\nclass VarianceThreshold(SelectorMixin, BaseEstimator):\n \"\"\"Feature selector that removes all low-variance features.\n\n This feature selection algorithm looks only at the features (X), not the\n desired outputs (y), and can thus be used for unsupervised learning.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n threshold : float, default=0\n Features with a training-set variance lower than this threshold will\n be removed. The default is to keep all features with non-zero variance,\n i.e. remove the features that have the same value in all samples.\n\n Attributes\n ----------\n variances_ : array, shape (n_features,)\n Variances of individual features.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n SelectFromModel: Meta-transformer for selecting features based on\n importance weights.\n SelectPercentile : Select features according to a percentile of the highest\n scores.\n SequentialFeatureSelector : Transformer that performs Sequential Feature\n Selection.\n\n Notes\n -----\n Allows NaN in the input.\n Raises ValueError if no feature in X meets the variance threshold.\n\n Examples\n --------\n The following dataset has integer features, two of which are the same\n in every sample. These are removed with the default setting for threshold::\n\n >>> from sklearn.feature_selection import VarianceThreshold\n >>> X = [[0, 2, 0, 3], [0, 1, 4, 3], [0, 1, 1, 3]]\n >>> selector = VarianceThreshold()\n >>> selector.fit_transform(X)\n array([[2, 0],\n [1, 4],\n [1, 1]])\n \"\"\"\n \n def __init__(self, threshold=0.0):\n self.threshold = threshold\n \n def fit(self, X, y=None):\n \"\"\"Learn empirical variances from X.\n\n Parameters\n ----------\n X : {array-like, sparse matrix}, shape (n_samples, n_features)\n Data from which to compute variances, where `n_samples` is\n the number of samples and `n_features` is the number of features.\n\n y : any, default=None\n Ignored. This parameter exists only for compatibility with\n sklearn.pipeline.Pipeline.\n\n Returns\n -------\n self : object\n Returns the instance itself.\n \"\"\"\n X = self._validate_data(X, accept_sparse=('csr', 'csc'), dtype=np.float64, force_all_finite='allow-nan')\n if hasattr(X, 'toarray'):\n (_, self.variances_) = mean_variance_axis(X, axis=0)\n if self.threshold == 0:\n (mins, maxes) = min_max_axis(X, axis=0)\n peak_to_peaks = maxes - mins\n else:\n self.variances_ = np.nanvar(X, axis=0)\n if self.threshold == 0:\n peak_to_peaks = np.ptp(X, axis=0)\n if self.threshold == 0:\n compare_arr = np.array([self.variances_, peak_to_peaks])\n self.variances_ = np.nanmin(compare_arr, axis=0)\n elif self.threshold < 0.0:\n raise ValueError(f'Threshold must be non-negative. 
Got: {self.threshold}')\n if np.all(~np.isfinite(self.variances_) | (self.variances_ <= self.threshold)):\n msg = 'No feature in X meets the variance threshold {0:.5f}'\n if X.shape[0] == 1:\n msg += ' (X contains only one sample)'\n raise ValueError(msg.format(self.threshold))\n return self\n \n def _get_support_mask(self):\n check_is_fitted(self)\n return self.variances_ > self.threshold\n \n def _more_tags(self):\n return {'allow_nan': True}\n" }, @@ -22670,7 +22737,7 @@ "sklearn.gaussian_process._gpc.GaussianProcessClassifier.log_marginal_likelihood" ], "is_public": true, - "description": "Gaussian process classification (GPC) based on Laplace approximation.\n\nThe implementation is based on Algorithm 3.1, 3.2, and 5.1 of Gaussian Processes for Machine Learning (GPML) by Rasmussen and Williams. Internally, the Laplace approximation is used for approximating the non-Gaussian posterior by a Gaussian. Currently, the implementation is restricted to using the logistic link function. For multi-class classification, several binary one-versus rest classifiers are fitted. Note that this class thus does not implement a true multi-class Laplace approximation. Read more in the :ref:`User Guide `. .. versionadded:: 0.18", + "description": "Gaussian process classification (GPC) based on Laplace approximation.\n\nThe implementation is based on Algorithm 3.1, 3.2, and 5.1 of\nGaussian Processes for Machine Learning (GPML) by Rasmussen and\nWilliams.\n\nInternally, the Laplace approximation is used for approximating the\nnon-Gaussian posterior by a Gaussian.\n\nCurrently, the implementation is restricted to using the logistic link\nfunction. For multi-class classification, several binary one-versus rest\nclassifiers are fitted. Note that this class thus does not implement\na true multi-class Laplace approximation.\n\nRead more in the :ref:`User Guide `.\n\n.. versionadded:: 0.18", "docstring": "Gaussian process classification (GPC) based on Laplace approximation.\n\n The implementation is based on Algorithm 3.1, 3.2, and 5.1 of\n Gaussian Processes for Machine Learning (GPML) by Rasmussen and\n Williams.\n\n Internally, the Laplace approximation is used for approximating the\n non-Gaussian posterior by a Gaussian.\n\n Currently, the implementation is restricted to using the logistic link\n function. For multi-class classification, several binary one-versus rest\n classifiers are fitted. Note that this class thus does not implement\n a true multi-class Laplace approximation.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.18\n\n Parameters\n ----------\n kernel : kernel instance, default=None\n The kernel specifying the covariance function of the GP. If None is\n passed, the kernel \"1.0 * RBF(1.0)\" is used as default. Note that\n the kernel's hyperparameters are optimized during fitting.\n\n optimizer : 'fmin_l_bfgs_b' or callable, default='fmin_l_bfgs_b'\n Can either be one of the internally supported optimizers for optimizing\n the kernel's parameters, specified by a string, or an externally\n defined optimizer passed as a callable. 
If a callable is passed, it\n must have the signature::\n\n def optimizer(obj_func, initial_theta, bounds):\n # * 'obj_func' is the objective function to be maximized, which\n # takes the hyperparameters theta as parameter and an\n # optional flag eval_gradient, which determines if the\n # gradient is returned additionally to the function value\n # * 'initial_theta': the initial value for theta, which can be\n # used by local optimizers\n # * 'bounds': the bounds on the values of theta\n ....\n # Returned are the best found hyperparameters theta and\n # the corresponding value of the target function.\n return theta_opt, func_min\n\n Per default, the 'L-BFGS-B' algorithm from scipy.optimize.minimize\n is used. If None is passed, the kernel's parameters are kept fixed.\n Available internal optimizers are::\n\n 'fmin_l_bfgs_b'\n\n n_restarts_optimizer : int, default=0\n The number of restarts of the optimizer for finding the kernel's\n parameters which maximize the log-marginal likelihood. The first run\n of the optimizer is performed from the kernel's initial parameters,\n the remaining ones (if any) from thetas sampled log-uniform randomly\n from the space of allowed theta-values. If greater than 0, all bounds\n must be finite. Note that n_restarts_optimizer=0 implies that one\n run is performed.\n\n max_iter_predict : int, default=100\n The maximum number of iterations in Newton's method for approximating\n the posterior during predict. Smaller values will reduce computation\n time at the cost of worse results.\n\n warm_start : bool, default=False\n If warm-starts are enabled, the solution of the last Newton iteration\n on the Laplace approximation of the posterior mode is used as\n initialization for the next call of _posterior_mode(). This can speed\n up convergence when _posterior_mode is called several times on similar\n problems as in hyperparameter optimization. See :term:`the Glossary\n `.\n\n copy_X_train : bool, default=True\n If True, a persistent copy of the training data is stored in the\n object. Otherwise, just a reference to the training data is stored,\n which might cause predictions to change if the data is modified\n externally.\n\n random_state : int, RandomState instance or None, default=None\n Determines random number generation used to initialize the centers.\n Pass an int for reproducible results across multiple function calls.\n See :term:`Glossary `.\n\n multi_class : {'one_vs_rest', 'one_vs_one'}, default='one_vs_rest'\n Specifies how multi-class classification problems are handled.\n Supported are 'one_vs_rest' and 'one_vs_one'. In 'one_vs_rest',\n one binary Gaussian process classifier is fitted for each class, which\n is trained to separate this class from the rest. In 'one_vs_one', one\n binary Gaussian process classifier is fitted for each pair of classes,\n which is trained to separate these two classes. The predictions of\n these binary predictors are combined into multi-class predictions.\n Note that 'one_vs_one' does not support predicting probability\n estimates.\n\n n_jobs : int, default=None\n The number of jobs to use for the computation: the specified\n multiclass problems are computed in parallel.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. 
See :term:`Glossary `\n for more details.\n\n Attributes\n ----------\n base_estimator_ : ``Estimator`` instance\n The estimator instance that defines the likelihood function\n using the observed data.\n\n kernel_ : kernel instance\n The kernel used for prediction. In case of binary classification,\n the structure of the kernel is the same as the one passed as parameter\n but with optimized hyperparameters. In case of multi-class\n classification, a CompoundKernel is returned which consists of the\n different kernels used in the one-versus-rest classifiers.\n\n log_marginal_likelihood_value_ : float\n The log-marginal-likelihood of ``self.kernel_.theta``\n\n classes_ : array-like of shape (n_classes,)\n Unique class labels.\n\n n_classes_ : int\n The number of classes in the training data\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n GaussianProcessRegressor : Gaussian process regression (GPR).\n\n Examples\n --------\n >>> from sklearn.datasets import load_iris\n >>> from sklearn.gaussian_process import GaussianProcessClassifier\n >>> from sklearn.gaussian_process.kernels import RBF\n >>> X, y = load_iris(return_X_y=True)\n >>> kernel = 1.0 * RBF(1.0)\n >>> gpc = GaussianProcessClassifier(kernel=kernel,\n ... random_state=0).fit(X, y)\n >>> gpc.score(X, y)\n 0.9866...\n >>> gpc.predict_proba(X[:2,:])\n array([[0.83548752, 0.03228706, 0.13222543],\n [0.79064206, 0.06525643, 0.14410151]])\n ", "source_code": "\n\nclass GaussianProcessClassifier(ClassifierMixin, BaseEstimator):\n \"\"\"Gaussian process classification (GPC) based on Laplace approximation.\n\n The implementation is based on Algorithm 3.1, 3.2, and 5.1 of\n Gaussian Processes for Machine Learning (GPML) by Rasmussen and\n Williams.\n\n Internally, the Laplace approximation is used for approximating the\n non-Gaussian posterior by a Gaussian.\n\n Currently, the implementation is restricted to using the logistic link\n function. For multi-class classification, several binary one-versus rest\n classifiers are fitted. Note that this class thus does not implement\n a true multi-class Laplace approximation.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.18\n\n Parameters\n ----------\n kernel : kernel instance, default=None\n The kernel specifying the covariance function of the GP. If None is\n passed, the kernel \"1.0 * RBF(1.0)\" is used as default. Note that\n the kernel's hyperparameters are optimized during fitting.\n\n optimizer : 'fmin_l_bfgs_b' or callable, default='fmin_l_bfgs_b'\n Can either be one of the internally supported optimizers for optimizing\n the kernel's parameters, specified by a string, or an externally\n defined optimizer passed as a callable. 
If a callable is passed, it\n must have the signature::\n\n def optimizer(obj_func, initial_theta, bounds):\n # * 'obj_func' is the objective function to be maximized, which\n # takes the hyperparameters theta as parameter and an\n # optional flag eval_gradient, which determines if the\n # gradient is returned additionally to the function value\n # * 'initial_theta': the initial value for theta, which can be\n # used by local optimizers\n # * 'bounds': the bounds on the values of theta\n ....\n # Returned are the best found hyperparameters theta and\n # the corresponding value of the target function.\n return theta_opt, func_min\n\n Per default, the 'L-BFGS-B' algorithm from scipy.optimize.minimize\n is used. If None is passed, the kernel's parameters are kept fixed.\n Available internal optimizers are::\n\n 'fmin_l_bfgs_b'\n\n n_restarts_optimizer : int, default=0\n The number of restarts of the optimizer for finding the kernel's\n parameters which maximize the log-marginal likelihood. The first run\n of the optimizer is performed from the kernel's initial parameters,\n the remaining ones (if any) from thetas sampled log-uniform randomly\n from the space of allowed theta-values. If greater than 0, all bounds\n must be finite. Note that n_restarts_optimizer=0 implies that one\n run is performed.\n\n max_iter_predict : int, default=100\n The maximum number of iterations in Newton's method for approximating\n the posterior during predict. Smaller values will reduce computation\n time at the cost of worse results.\n\n warm_start : bool, default=False\n If warm-starts are enabled, the solution of the last Newton iteration\n on the Laplace approximation of the posterior mode is used as\n initialization for the next call of _posterior_mode(). This can speed\n up convergence when _posterior_mode is called several times on similar\n problems as in hyperparameter optimization. See :term:`the Glossary\n `.\n\n copy_X_train : bool, default=True\n If True, a persistent copy of the training data is stored in the\n object. Otherwise, just a reference to the training data is stored,\n which might cause predictions to change if the data is modified\n externally.\n\n random_state : int, RandomState instance or None, default=None\n Determines random number generation used to initialize the centers.\n Pass an int for reproducible results across multiple function calls.\n See :term:`Glossary `.\n\n multi_class : {'one_vs_rest', 'one_vs_one'}, default='one_vs_rest'\n Specifies how multi-class classification problems are handled.\n Supported are 'one_vs_rest' and 'one_vs_one'. In 'one_vs_rest',\n one binary Gaussian process classifier is fitted for each class, which\n is trained to separate this class from the rest. In 'one_vs_one', one\n binary Gaussian process classifier is fitted for each pair of classes,\n which is trained to separate these two classes. The predictions of\n these binary predictors are combined into multi-class predictions.\n Note that 'one_vs_one' does not support predicting probability\n estimates.\n\n n_jobs : int, default=None\n The number of jobs to use for the computation: the specified\n multiclass problems are computed in parallel.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. 
See :term:`Glossary `\n for more details.\n\n Attributes\n ----------\n base_estimator_ : ``Estimator`` instance\n The estimator instance that defines the likelihood function\n using the observed data.\n\n kernel_ : kernel instance\n The kernel used for prediction. In case of binary classification,\n the structure of the kernel is the same as the one passed as parameter\n but with optimized hyperparameters. In case of multi-class\n classification, a CompoundKernel is returned which consists of the\n different kernels used in the one-versus-rest classifiers.\n\n log_marginal_likelihood_value_ : float\n The log-marginal-likelihood of ``self.kernel_.theta``\n\n classes_ : array-like of shape (n_classes,)\n Unique class labels.\n\n n_classes_ : int\n The number of classes in the training data\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n GaussianProcessRegressor : Gaussian process regression (GPR).\n\n Examples\n --------\n >>> from sklearn.datasets import load_iris\n >>> from sklearn.gaussian_process import GaussianProcessClassifier\n >>> from sklearn.gaussian_process.kernels import RBF\n >>> X, y = load_iris(return_X_y=True)\n >>> kernel = 1.0 * RBF(1.0)\n >>> gpc = GaussianProcessClassifier(kernel=kernel,\n ... random_state=0).fit(X, y)\n >>> gpc.score(X, y)\n 0.9866...\n >>> gpc.predict_proba(X[:2,:])\n array([[0.83548752, 0.03228706, 0.13222543],\n [0.79064206, 0.06525643, 0.14410151]])\n \"\"\"\n \n def __init__(self, kernel=None, *, optimizer='fmin_l_bfgs_b', n_restarts_optimizer=0, max_iter_predict=100, warm_start=False, copy_X_train=True, random_state=None, multi_class='one_vs_rest', n_jobs=None):\n self.kernel = kernel\n self.optimizer = optimizer\n self.n_restarts_optimizer = n_restarts_optimizer\n self.max_iter_predict = max_iter_predict\n self.warm_start = warm_start\n self.copy_X_train = copy_X_train\n self.random_state = random_state\n self.multi_class = multi_class\n self.n_jobs = n_jobs\n \n def fit(self, X, y):\n \"\"\"Fit Gaussian process classification model.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features) or list of object\n Feature vectors or other representations of training data.\n\n y : array-like of shape (n_samples,)\n Target values, must be binary.\n\n Returns\n -------\n self : object\n Returns an instance of self.\n \"\"\"\n if self.kernel is None or self.kernel.requires_vector_input:\n (X, y) = self._validate_data(X, y, multi_output=False, ensure_2d=True, dtype='numeric')\n else:\n (X, y) = self._validate_data(X, y, multi_output=False, ensure_2d=False, dtype=None)\n self.base_estimator_ = _BinaryGaussianProcessClassifierLaplace(kernel=self.kernel, optimizer=self.optimizer, n_restarts_optimizer=self.n_restarts_optimizer, max_iter_predict=self.max_iter_predict, warm_start=self.warm_start, copy_X_train=self.copy_X_train, random_state=self.random_state)\n self.classes_ = np.unique(y)\n self.n_classes_ = self.classes_.size\n if self.n_classes_ == 1:\n raise ValueError('GaussianProcessClassifier requires 2 or more distinct classes; got %d class (only class %s is present)' % (self.n_classes_, self.classes_[0]))\n if self.n_classes_ > 2:\n if self.multi_class == 'one_vs_rest':\n self.base_estimator_ = OneVsRestClassifier(self.base_estimator_, n_jobs=self.n_jobs)\n elif 
self.multi_class == 'one_vs_one':\n self.base_estimator_ = OneVsOneClassifier(self.base_estimator_, n_jobs=self.n_jobs)\n else:\n raise ValueError('Unknown multi-class mode %s' % self.multi_class)\n self.base_estimator_.fit(X, y)\n if self.n_classes_ > 2:\n self.log_marginal_likelihood_value_ = np.mean([estimator.log_marginal_likelihood() for estimator in self.base_estimator_.estimators_])\n else:\n self.log_marginal_likelihood_value_ = self.base_estimator_.log_marginal_likelihood()\n return self\n \n def predict(self, X):\n \"\"\"Perform classification on an array of test vectors X.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features) or list of object\n Query points where the GP is evaluated for classification.\n\n Returns\n -------\n C : ndarray of shape (n_samples,)\n Predicted target values for X, values are from ``classes_``.\n \"\"\"\n check_is_fitted(self)\n if self.kernel is None or self.kernel.requires_vector_input:\n X = self._validate_data(X, ensure_2d=True, dtype='numeric', reset=False)\n else:\n X = self._validate_data(X, ensure_2d=False, dtype=None, reset=False)\n return self.base_estimator_.predict(X)\n \n def predict_proba(self, X):\n \"\"\"Return probability estimates for the test vector X.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features) or list of object\n Query points where the GP is evaluated for classification.\n\n Returns\n -------\n C : array-like of shape (n_samples, n_classes)\n Returns the probability of the samples for each class in\n the model. The columns correspond to the classes in sorted\n order, as they appear in the attribute :term:`classes_`.\n \"\"\"\n check_is_fitted(self)\n if self.n_classes_ > 2 and self.multi_class == 'one_vs_one':\n raise ValueError('one_vs_one multi-class mode does not support predicting probability estimates. Use one_vs_rest mode instead.')\n if self.kernel is None or self.kernel.requires_vector_input:\n X = self._validate_data(X, ensure_2d=True, dtype='numeric', reset=False)\n else:\n X = self._validate_data(X, ensure_2d=False, dtype=None, reset=False)\n return self.base_estimator_.predict_proba(X)\n \n @property\n def kernel_(self):\n \"\"\"Return the kernel of the base estimator.\"\"\"\n if self.n_classes_ == 2:\n return self.base_estimator_.kernel_\n else:\n return CompoundKernel([estimator.kernel_ for estimator in self.base_estimator_.estimators_])\n \n def log_marginal_likelihood(self, theta=None, eval_gradient=False, clone_kernel=True):\n \"\"\"Return log-marginal likelihood of theta for training data.\n\n In the case of multi-class classification, the mean log-marginal\n likelihood of the one-versus-rest classifiers are returned.\n\n Parameters\n ----------\n theta : array-like of shape (n_kernel_params,), default=None\n Kernel hyperparameters for which the log-marginal likelihood is\n evaluated. In the case of multi-class classification, theta may\n be the hyperparameters of the compound kernel or of an individual\n kernel. In the latter case, all individual kernel get assigned the\n same theta values. If None, the precomputed log_marginal_likelihood\n of ``self.kernel_.theta`` is returned.\n\n eval_gradient : bool, default=False\n If True, the gradient of the log-marginal likelihood with respect\n to the kernel hyperparameters at position theta is returned\n additionally. Note that gradient computation is not supported\n for non-binary classification. If True, theta must not be None.\n\n clone_kernel : bool, default=True\n If True, the kernel attribute is copied. 
If False, the kernel\n attribute is modified, but may result in a performance improvement.\n\n Returns\n -------\n log_likelihood : float\n Log-marginal likelihood of theta for training data.\n\n log_likelihood_gradient : ndarray of shape (n_kernel_params,), optional\n Gradient of the log-marginal likelihood with respect to the kernel\n hyperparameters at position theta.\n Only returned when `eval_gradient` is True.\n \"\"\"\n check_is_fitted(self)\n if theta is None:\n if eval_gradient:\n raise ValueError('Gradient can only be evaluated for theta!=None')\n return self.log_marginal_likelihood_value_\n theta = np.asarray(theta)\n if self.n_classes_ == 2:\n return self.base_estimator_.log_marginal_likelihood(theta, eval_gradient, clone_kernel=clone_kernel)\n else:\n if eval_gradient:\n raise NotImplementedError('Gradient of log-marginal-likelihood not implemented for multi-class GPC.')\n estimators = self.base_estimator_.estimators_\n n_dims = estimators[0].kernel_.n_dims\n if theta.shape[0] == n_dims:\n return np.mean([estimator.log_marginal_likelihood(theta, clone_kernel=clone_kernel) for (i, estimator) in enumerate(estimators)])\n elif theta.shape[0] == n_dims * self.classes_.shape[0]:\n return np.mean([estimator.log_marginal_likelihood(theta[n_dims * i:n_dims * (i + 1)], clone_kernel=clone_kernel) for (i, estimator) in enumerate(estimators)])\n else:\n raise ValueError('Shape of theta must be either %d or %d. Obtained theta with shape %d.' % (n_dims, n_dims * self.classes_.shape[0], theta.shape[0]))\n" }, @@ -22689,7 +22756,7 @@ "sklearn.gaussian_process._gpc._BinaryGaussianProcessClassifierLaplace._constrained_optimization" ], "is_public": false, - "description": "Binary Gaussian process classification based on Laplace approximation.\n\nThe implementation is based on Algorithm 3.1, 3.2, and 5.1 of ``Gaussian Processes for Machine Learning'' (GPML) by Rasmussen and Williams. Internally, the Laplace approximation is used for approximating the non-Gaussian posterior by a Gaussian. Currently, the implementation is restricted to using the logistic link function. .. versionadded:: 0.18", + "description": "Binary Gaussian process classification based on Laplace approximation.\n\nThe implementation is based on Algorithm 3.1, 3.2, and 5.1 of\n``Gaussian Processes for Machine Learning'' (GPML) by Rasmussen and\nWilliams.\n\nInternally, the Laplace approximation is used for approximating the\nnon-Gaussian posterior by a Gaussian.\n\nCurrently, the implementation is restricted to using the logistic link\nfunction.\n\n.. versionadded:: 0.18", "docstring": "Binary Gaussian process classification based on Laplace approximation.\n\n The implementation is based on Algorithm 3.1, 3.2, and 5.1 of\n ``Gaussian Processes for Machine Learning'' (GPML) by Rasmussen and\n Williams.\n\n Internally, the Laplace approximation is used for approximating the\n non-Gaussian posterior by a Gaussian.\n\n Currently, the implementation is restricted to using the logistic link\n function.\n\n .. versionadded:: 0.18\n\n Parameters\n ----------\n kernel : kernel instance, default=None\n The kernel specifying the covariance function of the GP. If None is\n passed, the kernel \"1.0 * RBF(1.0)\" is used as default. 
Note that\n the kernel's hyperparameters are optimized during fitting.\n\n optimizer : 'fmin_l_bfgs_b' or callable, default='fmin_l_bfgs_b'\n Can either be one of the internally supported optimizers for optimizing\n the kernel's parameters, specified by a string, or an externally\n defined optimizer passed as a callable. If a callable is passed, it\n must have the signature::\n\n def optimizer(obj_func, initial_theta, bounds):\n # * 'obj_func' is the objective function to be maximized, which\n # takes the hyperparameters theta as parameter and an\n # optional flag eval_gradient, which determines if the\n # gradient is returned additionally to the function value\n # * 'initial_theta': the initial value for theta, which can be\n # used by local optimizers\n # * 'bounds': the bounds on the values of theta\n ....\n # Returned are the best found hyperparameters theta and\n # the corresponding value of the target function.\n return theta_opt, func_min\n\n Per default, the 'L-BFGS-B' algorithm from scipy.optimize.minimize\n is used. If None is passed, the kernel's parameters are kept fixed.\n Available internal optimizers are::\n\n 'fmin_l_bfgs_b'\n\n n_restarts_optimizer : int, default=0\n The number of restarts of the optimizer for finding the kernel's\n parameters which maximize the log-marginal likelihood. The first run\n of the optimizer is performed from the kernel's initial parameters,\n the remaining ones (if any) from thetas sampled log-uniform randomly\n from the space of allowed theta-values. If greater than 0, all bounds\n must be finite. Note that n_restarts_optimizer=0 implies that one\n run is performed.\n\n max_iter_predict : int, default=100\n The maximum number of iterations in Newton's method for approximating\n the posterior during predict. Smaller values will reduce computation\n time at the cost of worse results.\n\n warm_start : bool, default=False\n If warm-starts are enabled, the solution of the last Newton iteration\n on the Laplace approximation of the posterior mode is used as\n initialization for the next call of _posterior_mode(). This can speed\n up convergence when _posterior_mode is called several times on similar\n problems as in hyperparameter optimization. See :term:`the Glossary\n `.\n\n copy_X_train : bool, default=True\n If True, a persistent copy of the training data is stored in the\n object. Otherwise, just a reference to the training data is stored,\n which might cause predictions to change if the data is modified\n externally.\n\n random_state : int, RandomState instance or None, default=None\n Determines random number generation used to initialize the centers.\n Pass an int for reproducible results across multiple function calls.\n See :term:`Glossary `.\n\n Attributes\n ----------\n X_train_ : array-like of shape (n_samples, n_features) or list of object\n Feature vectors or other representations of training data (also\n required for prediction).\n\n y_train_ : array-like of shape (n_samples,)\n Target values in training data (also required for prediction)\n\n classes_ : array-like of shape (n_classes,)\n Unique class labels.\n\n kernel_ : kernl instance\n The kernel used for prediction. 
The structure of the kernel is the\n same as the one passed as parameter but with optimized hyperparameters\n\n L_ : array-like of shape (n_samples, n_samples)\n Lower-triangular Cholesky decomposition of the kernel in X_train_\n\n pi_ : array-like of shape (n_samples,)\n The probabilities of the positive class for the training points\n X_train_\n\n W_sr_ : array-like of shape (n_samples,)\n Square root of W, the Hessian of log-likelihood of the latent function\n values for the observed labels. Since W is diagonal, only the diagonal\n of sqrt(W) is stored.\n\n log_marginal_likelihood_value_ : float\n The log-marginal-likelihood of ``self.kernel_.theta``\n\n ", "source_code": "\n\nclass _BinaryGaussianProcessClassifierLaplace(BaseEstimator):\n \"\"\"Binary Gaussian process classification based on Laplace approximation.\n\n The implementation is based on Algorithm 3.1, 3.2, and 5.1 of\n ``Gaussian Processes for Machine Learning'' (GPML) by Rasmussen and\n Williams.\n\n Internally, the Laplace approximation is used for approximating the\n non-Gaussian posterior by a Gaussian.\n\n Currently, the implementation is restricted to using the logistic link\n function.\n\n .. versionadded:: 0.18\n\n Parameters\n ----------\n kernel : kernel instance, default=None\n The kernel specifying the covariance function of the GP. If None is\n passed, the kernel \"1.0 * RBF(1.0)\" is used as default. Note that\n the kernel's hyperparameters are optimized during fitting.\n\n optimizer : 'fmin_l_bfgs_b' or callable, default='fmin_l_bfgs_b'\n Can either be one of the internally supported optimizers for optimizing\n the kernel's parameters, specified by a string, or an externally\n defined optimizer passed as a callable. If a callable is passed, it\n must have the signature::\n\n def optimizer(obj_func, initial_theta, bounds):\n # * 'obj_func' is the objective function to be maximized, which\n # takes the hyperparameters theta as parameter and an\n # optional flag eval_gradient, which determines if the\n # gradient is returned additionally to the function value\n # * 'initial_theta': the initial value for theta, which can be\n # used by local optimizers\n # * 'bounds': the bounds on the values of theta\n ....\n # Returned are the best found hyperparameters theta and\n # the corresponding value of the target function.\n return theta_opt, func_min\n\n Per default, the 'L-BFGS-B' algorithm from scipy.optimize.minimize\n is used. If None is passed, the kernel's parameters are kept fixed.\n Available internal optimizers are::\n\n 'fmin_l_bfgs_b'\n\n n_restarts_optimizer : int, default=0\n The number of restarts of the optimizer for finding the kernel's\n parameters which maximize the log-marginal likelihood. The first run\n of the optimizer is performed from the kernel's initial parameters,\n the remaining ones (if any) from thetas sampled log-uniform randomly\n from the space of allowed theta-values. If greater than 0, all bounds\n must be finite. Note that n_restarts_optimizer=0 implies that one\n run is performed.\n\n max_iter_predict : int, default=100\n The maximum number of iterations in Newton's method for approximating\n the posterior during predict. Smaller values will reduce computation\n time at the cost of worse results.\n\n warm_start : bool, default=False\n If warm-starts are enabled, the solution of the last Newton iteration\n on the Laplace approximation of the posterior mode is used as\n initialization for the next call of _posterior_mode(). 
This can speed\n up convergence when _posterior_mode is called several times on similar\n problems as in hyperparameter optimization. See :term:`the Glossary\n `.\n\n copy_X_train : bool, default=True\n If True, a persistent copy of the training data is stored in the\n object. Otherwise, just a reference to the training data is stored,\n which might cause predictions to change if the data is modified\n externally.\n\n random_state : int, RandomState instance or None, default=None\n Determines random number generation used to initialize the centers.\n Pass an int for reproducible results across multiple function calls.\n See :term:`Glossary `.\n\n Attributes\n ----------\n X_train_ : array-like of shape (n_samples, n_features) or list of object\n Feature vectors or other representations of training data (also\n required for prediction).\n\n y_train_ : array-like of shape (n_samples,)\n Target values in training data (also required for prediction)\n\n classes_ : array-like of shape (n_classes,)\n Unique class labels.\n\n kernel_ : kernl instance\n The kernel used for prediction. The structure of the kernel is the\n same as the one passed as parameter but with optimized hyperparameters\n\n L_ : array-like of shape (n_samples, n_samples)\n Lower-triangular Cholesky decomposition of the kernel in X_train_\n\n pi_ : array-like of shape (n_samples,)\n The probabilities of the positive class for the training points\n X_train_\n\n W_sr_ : array-like of shape (n_samples,)\n Square root of W, the Hessian of log-likelihood of the latent function\n values for the observed labels. Since W is diagonal, only the diagonal\n of sqrt(W) is stored.\n\n log_marginal_likelihood_value_ : float\n The log-marginal-likelihood of ``self.kernel_.theta``\n\n \"\"\"\n \n def __init__(self, kernel=None, *, optimizer='fmin_l_bfgs_b', n_restarts_optimizer=0, max_iter_predict=100, warm_start=False, copy_X_train=True, random_state=None):\n self.kernel = kernel\n self.optimizer = optimizer\n self.n_restarts_optimizer = n_restarts_optimizer\n self.max_iter_predict = max_iter_predict\n self.warm_start = warm_start\n self.copy_X_train = copy_X_train\n self.random_state = random_state\n \n def fit(self, X, y):\n \"\"\"Fit Gaussian process classification model.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features) or list of object\n Feature vectors or other representations of training data.\n\n y : array-like of shape (n_samples,)\n Target values, must be binary.\n\n Returns\n -------\n self : returns an instance of self.\n \"\"\"\n if self.kernel is None:\n self.kernel_ = C(1.0, constant_value_bounds='fixed') * RBF(1.0, length_scale_bounds='fixed')\n else:\n self.kernel_ = clone(self.kernel)\n self.rng = check_random_state(self.random_state)\n self.X_train_ = np.copy(X) if self.copy_X_train else X\n label_encoder = LabelEncoder()\n self.y_train_ = label_encoder.fit_transform(y)\n self.classes_ = label_encoder.classes_\n if self.classes_.size > 2:\n raise ValueError('%s supports only binary classification. 
y contains classes %s' % (self.__class__.__name__, self.classes_))\n elif self.classes_.size == 1:\n raise ValueError('{0:s} requires 2 classes; got {1:d} class'.format(self.__class__.__name__, self.classes_.size))\n if self.optimizer is not None and self.kernel_.n_dims > 0:\n \n def obj_func(theta, eval_gradient=True):\n if eval_gradient:\n (lml, grad) = self.log_marginal_likelihood(theta, eval_gradient=True, clone_kernel=False)\n return -lml, -grad\n else:\n return -self.log_marginal_likelihood(theta, clone_kernel=False)\n optima = [self._constrained_optimization(obj_func, self.kernel_.theta, self.kernel_.bounds)]\n if self.n_restarts_optimizer > 0:\n if not np.isfinite(self.kernel_.bounds).all():\n raise ValueError('Multiple optimizer restarts (n_restarts_optimizer>0) requires that all bounds are finite.')\n bounds = self.kernel_.bounds\n for iteration in range(self.n_restarts_optimizer):\n theta_initial = np.exp(self.rng.uniform(bounds[:, 0], bounds[:, 1]))\n optima.append(self._constrained_optimization(obj_func, theta_initial, bounds))\n lml_values = list(map(itemgetter(1), optima))\n self.kernel_.theta = optima[np.argmin(lml_values)][0]\n self.kernel_._check_bounds_params()\n self.log_marginal_likelihood_value_ = -np.min(lml_values)\n else:\n self.log_marginal_likelihood_value_ = self.log_marginal_likelihood(self.kernel_.theta)\n K = self.kernel_(self.X_train_)\n (_, (self.pi_, self.W_sr_, self.L_, _, _)) = self._posterior_mode(K, return_temporaries=True)\n return self\n \n def predict(self, X):\n \"\"\"Perform classification on an array of test vectors X.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features) or list of object\n Query points where the GP is evaluated for classification.\n\n Returns\n -------\n C : ndarray of shape (n_samples,)\n Predicted target values for X, values are from ``classes_``\n \"\"\"\n check_is_fitted(self)\n K_star = self.kernel_(self.X_train_, X)\n f_star = K_star.T.dot(self.y_train_ - self.pi_)\n return np.where(f_star > 0, self.classes_[1], self.classes_[0])\n \n def predict_proba(self, X):\n \"\"\"Return probability estimates for the test vector X.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features) or list of object\n Query points where the GP is evaluated for classification.\n\n Returns\n -------\n C : array-like of shape (n_samples, n_classes)\n Returns the probability of the samples for each class in\n the model. The columns correspond to the classes in sorted\n order, as they appear in the attribute ``classes_``.\n \"\"\"\n check_is_fitted(self)\n K_star = self.kernel_(self.X_train_, X)\n f_star = K_star.T.dot(self.y_train_ - self.pi_)\n v = solve(self.L_, self.W_sr_[:, np.newaxis] * K_star)\n var_f_star = self.kernel_.diag(X) - np.einsum('ij,ij->j', v, v)\n alpha = 1 / (2 * var_f_star)\n gamma = LAMBDAS * f_star\n integrals = np.sqrt(np.pi / alpha) * erf(gamma * np.sqrt(alpha / (alpha + LAMBDAS**2))) / (2 * np.sqrt(var_f_star * 2 * np.pi))\n pi_star = (COEFS * integrals).sum(axis=0) + 0.5 * COEFS.sum()\n return np.vstack((1 - pi_star, pi_star)).T\n \n def log_marginal_likelihood(self, theta=None, eval_gradient=False, clone_kernel=True):\n \"\"\"Returns log-marginal likelihood of theta for training data.\n\n Parameters\n ----------\n theta : array-like of shape (n_kernel_params,), default=None\n Kernel hyperparameters for which the log-marginal likelihood is\n evaluated. 
If None, the precomputed log_marginal_likelihood\n of ``self.kernel_.theta`` is returned.\n\n eval_gradient : bool, default=False\n If True, the gradient of the log-marginal likelihood with respect\n to the kernel hyperparameters at position theta is returned\n additionally. If True, theta must not be None.\n\n clone_kernel : bool, default=True\n If True, the kernel attribute is copied. If False, the kernel\n attribute is modified, but may result in a performance improvement.\n\n Returns\n -------\n log_likelihood : float\n Log-marginal likelihood of theta for training data.\n\n log_likelihood_gradient : ndarray of shape (n_kernel_params,), optional\n Gradient of the log-marginal likelihood with respect to the kernel\n hyperparameters at position theta.\n Only returned when `eval_gradient` is True.\n \"\"\"\n if theta is None:\n if eval_gradient:\n raise ValueError('Gradient can only be evaluated for theta!=None')\n return self.log_marginal_likelihood_value_\n if clone_kernel:\n kernel = self.kernel_.clone_with_theta(theta)\n else:\n kernel = self.kernel_\n kernel.theta = theta\n if eval_gradient:\n (K, K_gradient) = kernel(self.X_train_, eval_gradient=True)\n else:\n K = kernel(self.X_train_)\n (Z, (pi, W_sr, L, b, a)) = self._posterior_mode(K, return_temporaries=True)\n if not eval_gradient:\n return Z\n d_Z = np.empty(theta.shape[0])\n R = W_sr[:, np.newaxis] * cho_solve((L, True), np.diag(W_sr))\n C = solve(L, W_sr[:, np.newaxis] * K)\n s_2 = -0.5 * (np.diag(K) - np.einsum('ij, ij -> j', C, C)) * (pi * (1 - pi) * (1 - 2 * pi))\n for j in range(d_Z.shape[0]):\n C = K_gradient[:, :, j]\n s_1 = 0.5 * a.T.dot(C).dot(a) - 0.5 * R.T.ravel().dot(C.ravel())\n b = C.dot(self.y_train_ - pi)\n s_3 = b - K.dot(R.dot(b))\n d_Z[j] = s_1 + s_2.T.dot(s_3)\n return Z, d_Z\n \n def _posterior_mode(self, K, return_temporaries=False):\n \"\"\"Mode-finding for binary Laplace GPC and fixed kernel.\n\n This approximates the posterior of the latent function values for given\n inputs and target observations with a Gaussian approximation and uses\n Newton's iteration to find the mode of this approximation.\n \"\"\"\n if self.warm_start and hasattr(self, 'f_cached') and self.f_cached.shape == self.y_train_.shape:\n f = self.f_cached\n else:\n f = np.zeros_like(self.y_train_, dtype=np.float64)\n log_marginal_likelihood = -np.inf\n for _ in range(self.max_iter_predict):\n pi = expit(f)\n W = pi * (1 - pi)\n W_sr = np.sqrt(W)\n W_sr_K = W_sr[:, np.newaxis] * K\n B = np.eye(W.shape[0]) + W_sr_K * W_sr\n L = cholesky(B, lower=True)\n b = W * f + (self.y_train_ - pi)\n a = b - W_sr * cho_solve((L, True), W_sr_K.dot(b))\n f = K.dot(a)\n lml = -0.5 * a.T.dot(f) - np.log1p(np.exp(-(self.y_train_ * 2 - 1) * f)).sum() - np.log(np.diag(L)).sum()\n if lml - log_marginal_likelihood < 1e-10:\n break\n log_marginal_likelihood = lml\n self.f_cached = f\n if return_temporaries:\n return log_marginal_likelihood, (pi, W_sr, L, b, a)\n else:\n return log_marginal_likelihood\n \n def _constrained_optimization(self, obj_func, initial_theta, bounds):\n if self.optimizer == 'fmin_l_bfgs_b':\n opt_res = scipy.optimize.minimize(obj_func, initial_theta, method='L-BFGS-B', jac=True, bounds=bounds)\n _check_optimize_result('lbfgs', opt_res)\n (theta_opt, func_min) = (opt_res.x, opt_res.fun)\n elif callable(self.optimizer):\n (theta_opt, func_min) = self.optimizer(obj_func, initial_theta, bounds=bounds)\n else:\n raise ValueError('Unknown optimizer %s.' 
% self.optimizer)\n return theta_opt, func_min\n" }, @@ -22712,9 +22779,9 @@ "sklearn.gaussian_process._gpr.GaussianProcessRegressor._more_tags" ], "is_public": true, - "description": "Gaussian process regression (GPR).\n\nThe implementation is based on Algorithm 2.1 of [1]_. In addition to standard scikit-learn estimator API, :class:`GaussianProcessRegressor`: * allows prediction without prior fitting (based on the GP prior) * provides an additional method `sample_y(X)`, which evaluates samples drawn from the GPR (prior or posterior) at given inputs * exposes a method `log_marginal_likelihood(theta)`, which can be used externally for other ways of selecting hyperparameters, e.g., via Markov chain Monte Carlo. Read more in the :ref:`User Guide `. .. versionadded:: 0.18", - "docstring": "Gaussian process regression (GPR).\n\n The implementation is based on Algorithm 2.1 of [1]_.\n\n In addition to standard scikit-learn estimator API,\n :class:`GaussianProcessRegressor`:\n\n * allows prediction without prior fitting (based on the GP prior)\n * provides an additional method `sample_y(X)`, which evaluates samples\n drawn from the GPR (prior or posterior) at given inputs\n * exposes a method `log_marginal_likelihood(theta)`, which can be used\n externally for other ways of selecting hyperparameters, e.g., via\n Markov chain Monte Carlo.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.18\n\n Parameters\n ----------\n kernel : kernel instance, default=None\n The kernel specifying the covariance function of the GP. If None is\n passed, the kernel ``ConstantKernel(1.0, constant_value_bounds=\"fixed\"\n * RBF(1.0, length_scale_bounds=\"fixed\")`` is used as default. Note that\n the kernel hyperparameters are optimized during fitting unless the\n bounds are marked as \"fixed\".\n\n alpha : float or ndarray of shape (n_samples,), default=1e-10\n Value added to the diagonal of the kernel matrix during fitting.\n This can prevent a potential numerical issue during fitting, by\n ensuring that the calculated values form a positive definite matrix.\n It can also be interpreted as the variance of additional Gaussian\n measurement noise on the training observations. Note that this is\n different from using a `WhiteKernel`. If an array is passed, it must\n have the same number of entries as the data used for fitting and is\n used as datapoint-dependent noise level. Allowing to specify the\n noise level directly as a parameter is mainly for convenience and\n for consistency with :class:`~sklearn.linear_model.Ridge`.\n\n optimizer : \"fmin_l_bfgs_b\" or callable, default=\"fmin_l_bfgs_b\"\n Can either be one of the internally supported optimizers for optimizing\n the kernel's parameters, specified by a string, or an externally\n defined optimizer passed as a callable. If a callable is passed, it\n must have the signature::\n\n def optimizer(obj_func, initial_theta, bounds):\n # * 'obj_func': the objective function to be minimized, which\n # takes the hyperparameters theta as a parameter and an\n # optional flag eval_gradient, which determines if the\n # gradient is returned additionally to the function value\n # * 'initial_theta': the initial value for theta, which can be\n # used by local optimizers\n # * 'bounds': the bounds on the values of theta\n ....\n # Returned are the best found hyperparameters theta and\n # the corresponding value of the target function.\n return theta_opt, func_min\n\n Per default, the L-BFGS-B algorithm from `scipy.optimize.minimize`\n is used. 
If None is passed, the kernel's parameters are kept fixed.\n Available internal optimizers are: `{'fmin_l_bfgs_b'}`.\n\n n_restarts_optimizer : int, default=0\n The number of restarts of the optimizer for finding the kernel's\n parameters which maximize the log-marginal likelihood. The first run\n of the optimizer is performed from the kernel's initial parameters,\n the remaining ones (if any) from thetas sampled log-uniform randomly\n from the space of allowed theta-values. If greater than 0, all bounds\n must be finite. Note that `n_restarts_optimizer == 0` implies that one\n run is performed.\n\n normalize_y : bool, default=False\n Whether or not to normalized the target values `y` by removing the mean\n and scaling to unit-variance. This is recommended for cases where\n zero-mean, unit-variance priors are used. Note that, in this\n implementation, the normalisation is reversed before the GP predictions\n are reported.\n\n .. versionchanged:: 0.23\n\n copy_X_train : bool, default=True\n If True, a persistent copy of the training data is stored in the\n object. Otherwise, just a reference to the training data is stored,\n which might cause predictions to change if the data is modified\n externally.\n\n random_state : int, RandomState instance or None, default=None\n Determines random number generation used to initialize the centers.\n Pass an int for reproducible results across multiple function calls.\n See :term:`Glossary `.\n\n Attributes\n ----------\n X_train_ : array-like of shape (n_samples, n_features) or list of object\n Feature vectors or other representations of training data (also\n required for prediction).\n\n y_train_ : array-like of shape (n_samples,) or (n_samples, n_targets)\n Target values in training data (also required for prediction).\n\n kernel_ : kernel instance\n The kernel used for prediction. The structure of the kernel is the\n same as the one passed as parameter but with optimized hyperparameters.\n\n L_ : array-like of shape (n_samples, n_samples)\n Lower-triangular Cholesky decomposition of the kernel in ``X_train_``.\n\n alpha_ : array-like of shape (n_samples,)\n Dual coefficients of training data points in kernel space.\n\n log_marginal_likelihood_value_ : float\n The log-marginal-likelihood of ``self.kernel_.theta``.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n GaussianProcessClassifier : Gaussian process classification (GPC)\n based on Laplace approximation.\n\n References\n ----------\n .. [1] `Rasmussen, Carl Edward.\n \"Gaussian processes in machine learning.\"\n Summer school on machine learning. Springer, Berlin, Heidelberg, 2003\n `_.\n\n Examples\n --------\n >>> from sklearn.datasets import make_friedman2\n >>> from sklearn.gaussian_process import GaussianProcessRegressor\n >>> from sklearn.gaussian_process.kernels import DotProduct, WhiteKernel\n >>> X, y = make_friedman2(n_samples=500, noise=0, random_state=0)\n >>> kernel = DotProduct() + WhiteKernel()\n >>> gpr = GaussianProcessRegressor(kernel=kernel,\n ... 
random_state=0).fit(X, y)\n >>> gpr.score(X, y)\n 0.3680...\n >>> gpr.predict(X[:2,:], return_std=True)\n (array([653.0..., 592.1...]), array([316.6..., 316.6...]))\n ", - "source_code": "\n\nclass GaussianProcessRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):\n \"\"\"Gaussian process regression (GPR).\n\n The implementation is based on Algorithm 2.1 of [1]_.\n\n In addition to standard scikit-learn estimator API,\n :class:`GaussianProcessRegressor`:\n\n * allows prediction without prior fitting (based on the GP prior)\n * provides an additional method `sample_y(X)`, which evaluates samples\n drawn from the GPR (prior or posterior) at given inputs\n * exposes a method `log_marginal_likelihood(theta)`, which can be used\n externally for other ways of selecting hyperparameters, e.g., via\n Markov chain Monte Carlo.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.18\n\n Parameters\n ----------\n kernel : kernel instance, default=None\n The kernel specifying the covariance function of the GP. If None is\n passed, the kernel ``ConstantKernel(1.0, constant_value_bounds=\"fixed\"\n * RBF(1.0, length_scale_bounds=\"fixed\")`` is used as default. Note that\n the kernel hyperparameters are optimized during fitting unless the\n bounds are marked as \"fixed\".\n\n alpha : float or ndarray of shape (n_samples,), default=1e-10\n Value added to the diagonal of the kernel matrix during fitting.\n This can prevent a potential numerical issue during fitting, by\n ensuring that the calculated values form a positive definite matrix.\n It can also be interpreted as the variance of additional Gaussian\n measurement noise on the training observations. Note that this is\n different from using a `WhiteKernel`. If an array is passed, it must\n have the same number of entries as the data used for fitting and is\n used as datapoint-dependent noise level. Allowing to specify the\n noise level directly as a parameter is mainly for convenience and\n for consistency with :class:`~sklearn.linear_model.Ridge`.\n\n optimizer : \"fmin_l_bfgs_b\" or callable, default=\"fmin_l_bfgs_b\"\n Can either be one of the internally supported optimizers for optimizing\n the kernel's parameters, specified by a string, or an externally\n defined optimizer passed as a callable. If a callable is passed, it\n must have the signature::\n\n def optimizer(obj_func, initial_theta, bounds):\n # * 'obj_func': the objective function to be minimized, which\n # takes the hyperparameters theta as a parameter and an\n # optional flag eval_gradient, which determines if the\n # gradient is returned additionally to the function value\n # * 'initial_theta': the initial value for theta, which can be\n # used by local optimizers\n # * 'bounds': the bounds on the values of theta\n ....\n # Returned are the best found hyperparameters theta and\n # the corresponding value of the target function.\n return theta_opt, func_min\n\n Per default, the L-BFGS-B algorithm from `scipy.optimize.minimize`\n is used. If None is passed, the kernel's parameters are kept fixed.\n Available internal optimizers are: `{'fmin_l_bfgs_b'}`.\n\n n_restarts_optimizer : int, default=0\n The number of restarts of the optimizer for finding the kernel's\n parameters which maximize the log-marginal likelihood. The first run\n of the optimizer is performed from the kernel's initial parameters,\n the remaining ones (if any) from thetas sampled log-uniform randomly\n from the space of allowed theta-values. If greater than 0, all bounds\n must be finite. 
Note that `n_restarts_optimizer == 0` implies that one\n run is performed.\n\n normalize_y : bool, default=False\n Whether or not to normalized the target values `y` by removing the mean\n and scaling to unit-variance. This is recommended for cases where\n zero-mean, unit-variance priors are used. Note that, in this\n implementation, the normalisation is reversed before the GP predictions\n are reported.\n\n .. versionchanged:: 0.23\n\n copy_X_train : bool, default=True\n If True, a persistent copy of the training data is stored in the\n object. Otherwise, just a reference to the training data is stored,\n which might cause predictions to change if the data is modified\n externally.\n\n random_state : int, RandomState instance or None, default=None\n Determines random number generation used to initialize the centers.\n Pass an int for reproducible results across multiple function calls.\n See :term:`Glossary `.\n\n Attributes\n ----------\n X_train_ : array-like of shape (n_samples, n_features) or list of object\n Feature vectors or other representations of training data (also\n required for prediction).\n\n y_train_ : array-like of shape (n_samples,) or (n_samples, n_targets)\n Target values in training data (also required for prediction).\n\n kernel_ : kernel instance\n The kernel used for prediction. The structure of the kernel is the\n same as the one passed as parameter but with optimized hyperparameters.\n\n L_ : array-like of shape (n_samples, n_samples)\n Lower-triangular Cholesky decomposition of the kernel in ``X_train_``.\n\n alpha_ : array-like of shape (n_samples,)\n Dual coefficients of training data points in kernel space.\n\n log_marginal_likelihood_value_ : float\n The log-marginal-likelihood of ``self.kernel_.theta``.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n GaussianProcessClassifier : Gaussian process classification (GPC)\n based on Laplace approximation.\n\n References\n ----------\n .. [1] `Rasmussen, Carl Edward.\n \"Gaussian processes in machine learning.\"\n Summer school on machine learning. Springer, Berlin, Heidelberg, 2003\n `_.\n\n Examples\n --------\n >>> from sklearn.datasets import make_friedman2\n >>> from sklearn.gaussian_process import GaussianProcessRegressor\n >>> from sklearn.gaussian_process.kernels import DotProduct, WhiteKernel\n >>> X, y = make_friedman2(n_samples=500, noise=0, random_state=0)\n >>> kernel = DotProduct() + WhiteKernel()\n >>> gpr = GaussianProcessRegressor(kernel=kernel,\n ... 
random_state=0).fit(X, y)\n >>> gpr.score(X, y)\n 0.3680...\n >>> gpr.predict(X[:2,:], return_std=True)\n (array([653.0..., 592.1...]), array([316.6..., 316.6...]))\n \"\"\"\n \n def __init__(self, kernel=None, *, alpha=1e-10, optimizer='fmin_l_bfgs_b', n_restarts_optimizer=0, normalize_y=False, copy_X_train=True, random_state=None):\n self.kernel = kernel\n self.alpha = alpha\n self.optimizer = optimizer\n self.n_restarts_optimizer = n_restarts_optimizer\n self.normalize_y = normalize_y\n self.copy_X_train = copy_X_train\n self.random_state = random_state\n \n def fit(self, X, y):\n \"\"\"Fit Gaussian process regression model.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features) or list of object\n Feature vectors or other representations of training data.\n\n y : array-like of shape (n_samples,) or (n_samples, n_targets)\n Target values.\n\n Returns\n -------\n self : object\n GaussianProcessRegressor class instance.\n \"\"\"\n if self.kernel is None:\n self.kernel_ = C(1.0, constant_value_bounds='fixed') * RBF(1.0, length_scale_bounds='fixed')\n else:\n self.kernel_ = clone(self.kernel)\n self._rng = check_random_state(self.random_state)\n if self.kernel_.requires_vector_input:\n (dtype, ensure_2d) = ('numeric', True)\n else:\n (dtype, ensure_2d) = (None, False)\n (X, y) = self._validate_data(X, y, multi_output=True, y_numeric=True, ensure_2d=ensure_2d, dtype=dtype)\n if self.normalize_y:\n self._y_train_mean = np.mean(y, axis=0)\n self._y_train_std = _handle_zeros_in_scale(np.std(y, axis=0), copy=False)\n y = (y - self._y_train_mean) / self._y_train_std\n else:\n self._y_train_mean = np.zeros(1)\n self._y_train_std = 1\n if np.iterable(self.alpha) and self.alpha.shape[0] != y.shape[0]:\n if self.alpha.shape[0] == 1:\n self.alpha = self.alpha[0]\n else:\n raise ValueError(f'alpha must be a scalar or an array with same number of entries as y. ({self.alpha.shape[0]} != {y.shape[0]})')\n self.X_train_ = np.copy(X) if self.copy_X_train else X\n self.y_train_ = np.copy(y) if self.copy_X_train else y\n if self.optimizer is not None and self.kernel_.n_dims > 0:\n \n def obj_func(theta, eval_gradient=True):\n if eval_gradient:\n (lml, grad) = self.log_marginal_likelihood(theta, eval_gradient=True, clone_kernel=False)\n return -lml, -grad\n else:\n return -self.log_marginal_likelihood(theta, clone_kernel=False)\n optima = [self._constrained_optimization(obj_func, self.kernel_.theta, self.kernel_.bounds)]\n if self.n_restarts_optimizer > 0:\n if not np.isfinite(self.kernel_.bounds).all():\n raise ValueError('Multiple optimizer restarts (n_restarts_optimizer>0) requires that all bounds are finite.')\n bounds = self.kernel_.bounds\n for iteration in range(self.n_restarts_optimizer):\n theta_initial = self._rng.uniform(bounds[:, 0], bounds[:, 1])\n optima.append(self._constrained_optimization(obj_func, theta_initial, bounds))\n lml_values = list(map(itemgetter(1), optima))\n self.kernel_.theta = optima[np.argmin(lml_values)][0]\n self.kernel_._check_bounds_params()\n self.log_marginal_likelihood_value_ = -np.min(lml_values)\n else:\n self.log_marginal_likelihood_value_ = self.log_marginal_likelihood(self.kernel_.theta, clone_kernel=False)\n K = self.kernel_(self.X_train_)\n K[np.diag_indices_from(K)] += self.alpha\n try:\n self.L_ = cholesky(K, lower=GPR_CHOLESKY_LOWER, check_finite=False)\n except np.linalg.LinAlgError as exc:\n exc.args = (f\"The kernel, {self.kernel_}, is not returning a positive definite matrix. 
Try gradually increasing the 'alpha' parameter of your GaussianProcessRegressor estimator.\", ) + exc.args\n raise\n self.alpha_ = cho_solve((self.L_, GPR_CHOLESKY_LOWER), self.y_train_, check_finite=False)\n return self\n \n def predict(self, X, return_std=False, return_cov=False):\n \"\"\"Predict using the Gaussian process regression model.\n\n We can also predict based on an unfitted model by using the GP prior.\n In addition to the mean of the predictive distribution, optionally also\n returns its standard deviation (`return_std=True`) or covariance\n (`return_cov=True`). Note that at most one of the two can be requested.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features) or list of object\n Query points where the GP is evaluated.\n\n return_std : bool, default=False\n If True, the standard-deviation of the predictive distribution at\n the query points is returned along with the mean.\n\n return_cov : bool, default=False\n If True, the covariance of the joint predictive distribution at\n the query points is returned along with the mean.\n\n Returns\n -------\n y_mean : ndarray of shape (n_samples,) or (n_samples, n_targets)\n Mean of predictive distribution a query points.\n\n y_std : ndarray of shape (n_samples,) or (n_samples, n_targets), optional\n Standard deviation of predictive distribution at query points.\n Only returned when `return_std` is True.\n\n y_cov : ndarray of shape (n_samples, n_samples) or (n_samples, n_samples, n_targets), optional\n Covariance of joint predictive distribution a query points.\n Only returned when `return_cov` is True.\n \"\"\"\n if return_std and return_cov:\n raise RuntimeError('At most one of return_std or return_cov can be requested.')\n if self.kernel is None or self.kernel.requires_vector_input:\n (dtype, ensure_2d) = ('numeric', True)\n else:\n (dtype, ensure_2d) = (None, False)\n X = self._validate_data(X, ensure_2d=ensure_2d, dtype=dtype, reset=False)\n if not hasattr(self, 'X_train_'):\n if self.kernel is None:\n kernel = C(1.0, constant_value_bounds='fixed') * RBF(1.0, length_scale_bounds='fixed')\n else:\n kernel = self.kernel\n y_mean = np.zeros(X.shape[0])\n if return_cov:\n y_cov = kernel(X)\n return y_mean, y_cov\n elif return_std:\n y_var = kernel.diag(X)\n return y_mean, np.sqrt(y_var)\n else:\n return y_mean\n else:\n K_trans = self.kernel_(X, self.X_train_)\n y_mean = K_trans @ self.alpha_\n y_mean = self._y_train_std * y_mean + self._y_train_mean\n V = solve_triangular(self.L_, K_trans.T, lower=GPR_CHOLESKY_LOWER, check_finite=False)\n if return_cov:\n y_cov = self.kernel_(X) - V.T @ V\n y_cov = np.outer(y_cov, self._y_train_std**2).reshape(*y_cov.shape, -1)\n if y_cov.shape[2] == 1:\n y_cov = np.squeeze(y_cov, axis=2)\n return y_mean, y_cov\n elif return_std:\n y_var = self.kernel_.diag(X)\n y_var -= np.einsum('ij,ji->i', V.T, V)\n y_var_negative = y_var < 0\n if np.any(y_var_negative):\n warnings.warn('Predicted variances smaller than 0. 
Setting those variances to 0.')\n y_var[y_var_negative] = 0.0\n y_var = np.outer(y_var, self._y_train_std**2).reshape(*y_var.shape, -1)\n if y_var.shape[1] == 1:\n y_var = np.squeeze(y_var, axis=1)\n return y_mean, np.sqrt(y_var)\n else:\n return y_mean\n \n def sample_y(self, X, n_samples=1, random_state=0):\n \"\"\"Draw samples from Gaussian process and evaluate at X.\n\n Parameters\n ----------\n X : array-like of shape (n_samples_X, n_features) or list of object\n Query points where the GP is evaluated.\n\n n_samples : int, default=1\n Number of samples drawn from the Gaussian process per query point.\n\n random_state : int, RandomState instance or None, default=0\n Determines random number generation to randomly draw samples.\n Pass an int for reproducible results across multiple function\n calls.\n See :term:`Glossary `.\n\n Returns\n -------\n y_samples : ndarray of shape (n_samples_X, n_samples), or (n_samples_X, n_targets, n_samples)\n Values of n_samples samples drawn from Gaussian process and\n evaluated at query points.\n \"\"\"\n rng = check_random_state(random_state)\n (y_mean, y_cov) = self.predict(X, return_cov=True)\n if y_mean.ndim == 1:\n y_samples = rng.multivariate_normal(y_mean, y_cov, n_samples).T\n else:\n y_samples = [rng.multivariate_normal(y_mean[:, i], y_cov, n_samples).T[:, np.newaxis] for i in range(y_mean.shape[1])]\n y_samples = np.hstack(y_samples)\n return y_samples\n \n def log_marginal_likelihood(self, theta=None, eval_gradient=False, clone_kernel=True):\n \"\"\"Return log-marginal likelihood of theta for training data.\n\n Parameters\n ----------\n theta : array-like of shape (n_kernel_params,) default=None\n Kernel hyperparameters for which the log-marginal likelihood is\n evaluated. If None, the precomputed log_marginal_likelihood\n of ``self.kernel_.theta`` is returned.\n\n eval_gradient : bool, default=False\n If True, the gradient of the log-marginal likelihood with respect\n to the kernel hyperparameters at position theta is returned\n additionally. If True, theta must not be None.\n\n clone_kernel : bool, default=True\n If True, the kernel attribute is copied. 
If False, the kernel\n attribute is modified, but may result in a performance improvement.\n\n Returns\n -------\n log_likelihood : float\n Log-marginal likelihood of theta for training data.\n\n log_likelihood_gradient : ndarray of shape (n_kernel_params,), optional\n Gradient of the log-marginal likelihood with respect to the kernel\n hyperparameters at position theta.\n Only returned when eval_gradient is True.\n \"\"\"\n if theta is None:\n if eval_gradient:\n raise ValueError('Gradient can only be evaluated for theta!=None')\n return self.log_marginal_likelihood_value_\n if clone_kernel:\n kernel = self.kernel_.clone_with_theta(theta)\n else:\n kernel = self.kernel_\n kernel.theta = theta\n if eval_gradient:\n (K, K_gradient) = kernel(self.X_train_, eval_gradient=True)\n else:\n K = kernel(self.X_train_)\n K[np.diag_indices_from(K)] += self.alpha\n try:\n L = cholesky(K, lower=GPR_CHOLESKY_LOWER, check_finite=False)\n except np.linalg.LinAlgError:\n return (-np.inf, np.zeros_like(theta)) if eval_gradient else -np.inf\n y_train = self.y_train_\n if y_train.ndim == 1:\n y_train = y_train[:, np.newaxis]\n alpha = cho_solve((L, GPR_CHOLESKY_LOWER), y_train, check_finite=False)\n log_likelihood_dims = -0.5 * np.einsum('ik,ik->k', y_train, alpha)\n log_likelihood_dims -= np.log(np.diag(L)).sum()\n log_likelihood_dims -= K.shape[0] / 2 * np.log(2 * np.pi)\n log_likelihood = log_likelihood_dims.sum(axis=-1)\n if eval_gradient:\n inner_term = np.einsum('ik,jk->ijk', alpha, alpha)\n K_inv = cho_solve((L, GPR_CHOLESKY_LOWER), np.eye(K.shape[0]), check_finite=False)\n inner_term -= K_inv[..., np.newaxis]\n log_likelihood_gradient_dims = 0.5 * np.einsum('ijl,jik->kl', inner_term, K_gradient)\n log_likelihood_gradient = log_likelihood_gradient_dims.sum(axis=-1)\n if eval_gradient:\n return log_likelihood, log_likelihood_gradient\n else:\n return log_likelihood\n \n def _constrained_optimization(self, obj_func, initial_theta, bounds):\n if self.optimizer == 'fmin_l_bfgs_b':\n opt_res = scipy.optimize.minimize(obj_func, initial_theta, method='L-BFGS-B', jac=True, bounds=bounds)\n _check_optimize_result('lbfgs', opt_res)\n (theta_opt, func_min) = (opt_res.x, opt_res.fun)\n elif callable(self.optimizer):\n (theta_opt, func_min) = self.optimizer(obj_func, initial_theta, bounds=bounds)\n else:\n raise ValueError(f'Unknown optimizer {self.optimizer}.')\n return theta_opt, func_min\n \n def _more_tags(self):\n return {'requires_fit': False}\n" + "description": "Gaussian process regression (GPR).\n\nThe implementation is based on Algorithm 2.1 of [1]_.\n\nIn addition to standard scikit-learn estimator API,\n:class:`GaussianProcessRegressor`:\n\n * allows prediction without prior fitting (based on the GP prior)\n * provides an additional method `sample_y(X)`, which evaluates samples\n drawn from the GPR (prior or posterior) at given inputs\n * exposes a method `log_marginal_likelihood(theta)`, which can be used\n externally for other ways of selecting hyperparameters, e.g., via\n Markov chain Monte Carlo.\n\nRead more in the :ref:`User Guide `.\n\n.. 
versionadded:: 0.18", + "docstring": "Gaussian process regression (GPR).\n\n The implementation is based on Algorithm 2.1 of [1]_.\n\n In addition to standard scikit-learn estimator API,\n :class:`GaussianProcessRegressor`:\n\n * allows prediction without prior fitting (based on the GP prior)\n * provides an additional method `sample_y(X)`, which evaluates samples\n drawn from the GPR (prior or posterior) at given inputs\n * exposes a method `log_marginal_likelihood(theta)`, which can be used\n externally for other ways of selecting hyperparameters, e.g., via\n Markov chain Monte Carlo.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.18\n\n Parameters\n ----------\n kernel : kernel instance, default=None\n The kernel specifying the covariance function of the GP. If None is\n passed, the kernel ``ConstantKernel(1.0, constant_value_bounds=\"fixed\"\n * RBF(1.0, length_scale_bounds=\"fixed\")`` is used as default. Note that\n the kernel hyperparameters are optimized during fitting unless the\n bounds are marked as \"fixed\".\n\n alpha : float or ndarray of shape (n_samples,), default=1e-10\n Value added to the diagonal of the kernel matrix during fitting.\n This can prevent a potential numerical issue during fitting, by\n ensuring that the calculated values form a positive definite matrix.\n It can also be interpreted as the variance of additional Gaussian\n measurement noise on the training observations. Note that this is\n different from using a `WhiteKernel`. If an array is passed, it must\n have the same number of entries as the data used for fitting and is\n used as datapoint-dependent noise level. Allowing to specify the\n noise level directly as a parameter is mainly for convenience and\n for consistency with :class:`~sklearn.linear_model.Ridge`.\n\n optimizer : \"fmin_l_bfgs_b\" or callable, default=\"fmin_l_bfgs_b\"\n Can either be one of the internally supported optimizers for optimizing\n the kernel's parameters, specified by a string, or an externally\n defined optimizer passed as a callable. If a callable is passed, it\n must have the signature::\n\n def optimizer(obj_func, initial_theta, bounds):\n # * 'obj_func': the objective function to be minimized, which\n # takes the hyperparameters theta as a parameter and an\n # optional flag eval_gradient, which determines if the\n # gradient is returned additionally to the function value\n # * 'initial_theta': the initial value for theta, which can be\n # used by local optimizers\n # * 'bounds': the bounds on the values of theta\n ....\n # Returned are the best found hyperparameters theta and\n # the corresponding value of the target function.\n return theta_opt, func_min\n\n Per default, the L-BFGS-B algorithm from `scipy.optimize.minimize`\n is used. If None is passed, the kernel's parameters are kept fixed.\n Available internal optimizers are: `{'fmin_l_bfgs_b'}`.\n\n n_restarts_optimizer : int, default=0\n The number of restarts of the optimizer for finding the kernel's\n parameters which maximize the log-marginal likelihood. The first run\n of the optimizer is performed from the kernel's initial parameters,\n the remaining ones (if any) from thetas sampled log-uniform randomly\n from the space of allowed theta-values. If greater than 0, all bounds\n must be finite. Note that `n_restarts_optimizer == 0` implies that one\n run is performed.\n\n normalize_y : bool, default=False\n Whether or not to normalize the target values `y` by removing the mean\n and scaling to unit-variance. 
This is recommended for cases where\n zero-mean, unit-variance priors are used. Note that, in this\n implementation, the normalisation is reversed before the GP predictions\n are reported.\n\n .. versionchanged:: 0.23\n\n copy_X_train : bool, default=True\n If True, a persistent copy of the training data is stored in the\n object. Otherwise, just a reference to the training data is stored,\n which might cause predictions to change if the data is modified\n externally.\n\n random_state : int, RandomState instance or None, default=None\n Determines random number generation used to initialize the centers.\n Pass an int for reproducible results across multiple function calls.\n See :term:`Glossary `.\n\n Attributes\n ----------\n X_train_ : array-like of shape (n_samples, n_features) or list of object\n Feature vectors or other representations of training data (also\n required for prediction).\n\n y_train_ : array-like of shape (n_samples,) or (n_samples, n_targets)\n Target values in training data (also required for prediction).\n\n kernel_ : kernel instance\n The kernel used for prediction. The structure of the kernel is the\n same as the one passed as parameter but with optimized hyperparameters.\n\n L_ : array-like of shape (n_samples, n_samples)\n Lower-triangular Cholesky decomposition of the kernel in ``X_train_``.\n\n alpha_ : array-like of shape (n_samples,)\n Dual coefficients of training data points in kernel space.\n\n log_marginal_likelihood_value_ : float\n The log-marginal-likelihood of ``self.kernel_.theta``.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n GaussianProcessClassifier : Gaussian process classification (GPC)\n based on Laplace approximation.\n\n References\n ----------\n .. [1] `Rasmussen, Carl Edward.\n \"Gaussian processes in machine learning.\"\n Summer school on machine learning. Springer, Berlin, Heidelberg, 2003\n `_.\n\n Examples\n --------\n >>> from sklearn.datasets import make_friedman2\n >>> from sklearn.gaussian_process import GaussianProcessRegressor\n >>> from sklearn.gaussian_process.kernels import DotProduct, WhiteKernel\n >>> X, y = make_friedman2(n_samples=500, noise=0, random_state=0)\n >>> kernel = DotProduct() + WhiteKernel()\n >>> gpr = GaussianProcessRegressor(kernel=kernel,\n ... random_state=0).fit(X, y)\n >>> gpr.score(X, y)\n 0.3680...\n >>> gpr.predict(X[:2,:], return_std=True)\n (array([653.0..., 592.1...]), array([316.6..., 316.6...]))\n ", + "source_code": "\n\nclass GaussianProcessRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):\n \"\"\"Gaussian process regression (GPR).\n\n The implementation is based on Algorithm 2.1 of [1]_.\n\n In addition to standard scikit-learn estimator API,\n :class:`GaussianProcessRegressor`:\n\n * allows prediction without prior fitting (based on the GP prior)\n * provides an additional method `sample_y(X)`, which evaluates samples\n drawn from the GPR (prior or posterior) at given inputs\n * exposes a method `log_marginal_likelihood(theta)`, which can be used\n externally for other ways of selecting hyperparameters, e.g., via\n Markov chain Monte Carlo.\n\n Read more in the :ref:`User Guide `.\n\n .. 
versionadded:: 0.18\n\n Parameters\n ----------\n kernel : kernel instance, default=None\n The kernel specifying the covariance function of the GP. If None is\n passed, the kernel ``ConstantKernel(1.0, constant_value_bounds=\"fixed\"\n * RBF(1.0, length_scale_bounds=\"fixed\")`` is used as default. Note that\n the kernel hyperparameters are optimized during fitting unless the\n bounds are marked as \"fixed\".\n\n alpha : float or ndarray of shape (n_samples,), default=1e-10\n Value added to the diagonal of the kernel matrix during fitting.\n This can prevent a potential numerical issue during fitting, by\n ensuring that the calculated values form a positive definite matrix.\n It can also be interpreted as the variance of additional Gaussian\n measurement noise on the training observations. Note that this is\n different from using a `WhiteKernel`. If an array is passed, it must\n have the same number of entries as the data used for fitting and is\n used as datapoint-dependent noise level. Allowing to specify the\n noise level directly as a parameter is mainly for convenience and\n for consistency with :class:`~sklearn.linear_model.Ridge`.\n\n optimizer : \"fmin_l_bfgs_b\" or callable, default=\"fmin_l_bfgs_b\"\n Can either be one of the internally supported optimizers for optimizing\n the kernel's parameters, specified by a string, or an externally\n defined optimizer passed as a callable. If a callable is passed, it\n must have the signature::\n\n def optimizer(obj_func, initial_theta, bounds):\n # * 'obj_func': the objective function to be minimized, which\n # takes the hyperparameters theta as a parameter and an\n # optional flag eval_gradient, which determines if the\n # gradient is returned additionally to the function value\n # * 'initial_theta': the initial value for theta, which can be\n # used by local optimizers\n # * 'bounds': the bounds on the values of theta\n ....\n # Returned are the best found hyperparameters theta and\n # the corresponding value of the target function.\n return theta_opt, func_min\n\n Per default, the L-BFGS-B algorithm from `scipy.optimize.minimize`\n is used. If None is passed, the kernel's parameters are kept fixed.\n Available internal optimizers are: `{'fmin_l_bfgs_b'}`.\n\n n_restarts_optimizer : int, default=0\n The number of restarts of the optimizer for finding the kernel's\n parameters which maximize the log-marginal likelihood. The first run\n of the optimizer is performed from the kernel's initial parameters,\n the remaining ones (if any) from thetas sampled log-uniform randomly\n from the space of allowed theta-values. If greater than 0, all bounds\n must be finite. Note that `n_restarts_optimizer == 0` implies that one\n run is performed.\n\n normalize_y : bool, default=False\n Whether or not to normalize the target values `y` by removing the mean\n and scaling to unit-variance. This is recommended for cases where\n zero-mean, unit-variance priors are used. Note that, in this\n implementation, the normalisation is reversed before the GP predictions\n are reported.\n\n .. versionchanged:: 0.23\n\n copy_X_train : bool, default=True\n If True, a persistent copy of the training data is stored in the\n object. 
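The `optimizer` documentation above spells out the signature a custom callable must follow. A minimal sketch, assuming a hypothetical callable named `my_optimizer` that simply delegates to `scipy.optimize.minimize`, could look like::

    import scipy.optimize

    def my_optimizer(obj_func, initial_theta, bounds):
        # obj_func(theta) returns (-log-marginal-likelihood, -gradient),
        # which L-BFGS-B can consume directly with jac=True.
        res = scipy.optimize.minimize(
            obj_func, initial_theta, method="L-BFGS-B", jac=True, bounds=bounds
        )
        return res.x, res.fun

    # gpr = GaussianProcessRegressor(kernel=kernel, optimizer=my_optimizer)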
Otherwise, just a reference to the training data is stored,\n which might cause predictions to change if the data is modified\n externally.\n\n random_state : int, RandomState instance or None, default=None\n Determines random number generation used to initialize the centers.\n Pass an int for reproducible results across multiple function calls.\n See :term:`Glossary `.\n\n Attributes\n ----------\n X_train_ : array-like of shape (n_samples, n_features) or list of object\n Feature vectors or other representations of training data (also\n required for prediction).\n\n y_train_ : array-like of shape (n_samples,) or (n_samples, n_targets)\n Target values in training data (also required for prediction).\n\n kernel_ : kernel instance\n The kernel used for prediction. The structure of the kernel is the\n same as the one passed as parameter but with optimized hyperparameters.\n\n L_ : array-like of shape (n_samples, n_samples)\n Lower-triangular Cholesky decomposition of the kernel in ``X_train_``.\n\n alpha_ : array-like of shape (n_samples,)\n Dual coefficients of training data points in kernel space.\n\n log_marginal_likelihood_value_ : float\n The log-marginal-likelihood of ``self.kernel_.theta``.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n GaussianProcessClassifier : Gaussian process classification (GPC)\n based on Laplace approximation.\n\n References\n ----------\n .. [1] `Rasmussen, Carl Edward.\n \"Gaussian processes in machine learning.\"\n Summer school on machine learning. Springer, Berlin, Heidelberg, 2003\n `_.\n\n Examples\n --------\n >>> from sklearn.datasets import make_friedman2\n >>> from sklearn.gaussian_process import GaussianProcessRegressor\n >>> from sklearn.gaussian_process.kernels import DotProduct, WhiteKernel\n >>> X, y = make_friedman2(n_samples=500, noise=0, random_state=0)\n >>> kernel = DotProduct() + WhiteKernel()\n >>> gpr = GaussianProcessRegressor(kernel=kernel,\n ... 
random_state=0).fit(X, y)\n >>> gpr.score(X, y)\n 0.3680...\n >>> gpr.predict(X[:2,:], return_std=True)\n (array([653.0..., 592.1...]), array([316.6..., 316.6...]))\n \"\"\"\n \n def __init__(self, kernel=None, *, alpha=1e-10, optimizer='fmin_l_bfgs_b', n_restarts_optimizer=0, normalize_y=False, copy_X_train=True, random_state=None):\n self.kernel = kernel\n self.alpha = alpha\n self.optimizer = optimizer\n self.n_restarts_optimizer = n_restarts_optimizer\n self.normalize_y = normalize_y\n self.copy_X_train = copy_X_train\n self.random_state = random_state\n \n def fit(self, X, y):\n \"\"\"Fit Gaussian process regression model.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features) or list of object\n Feature vectors or other representations of training data.\n\n y : array-like of shape (n_samples,) or (n_samples, n_targets)\n Target values.\n\n Returns\n -------\n self : object\n GaussianProcessRegressor class instance.\n \"\"\"\n if self.kernel is None:\n self.kernel_ = C(1.0, constant_value_bounds='fixed') * RBF(1.0, length_scale_bounds='fixed')\n else:\n self.kernel_ = clone(self.kernel)\n self._rng = check_random_state(self.random_state)\n if self.kernel_.requires_vector_input:\n (dtype, ensure_2d) = ('numeric', True)\n else:\n (dtype, ensure_2d) = (None, False)\n (X, y) = self._validate_data(X, y, multi_output=True, y_numeric=True, ensure_2d=ensure_2d, dtype=dtype)\n if self.normalize_y:\n self._y_train_mean = np.mean(y, axis=0)\n self._y_train_std = _handle_zeros_in_scale(np.std(y, axis=0), copy=False)\n y = (y - self._y_train_mean) / self._y_train_std\n else:\n self._y_train_mean = np.zeros(1)\n self._y_train_std = 1\n if np.iterable(self.alpha) and self.alpha.shape[0] != y.shape[0]:\n if self.alpha.shape[0] == 1:\n self.alpha = self.alpha[0]\n else:\n raise ValueError(f'alpha must be a scalar or an array with same number of entries as y. ({self.alpha.shape[0]} != {y.shape[0]})')\n self.X_train_ = np.copy(X) if self.copy_X_train else X\n self.y_train_ = np.copy(y) if self.copy_X_train else y\n if self.optimizer is not None and self.kernel_.n_dims > 0:\n \n def obj_func(theta, eval_gradient=True):\n if eval_gradient:\n (lml, grad) = self.log_marginal_likelihood(theta, eval_gradient=True, clone_kernel=False)\n return -lml, -grad\n else:\n return -self.log_marginal_likelihood(theta, clone_kernel=False)\n optima = [self._constrained_optimization(obj_func, self.kernel_.theta, self.kernel_.bounds)]\n if self.n_restarts_optimizer > 0:\n if not np.isfinite(self.kernel_.bounds).all():\n raise ValueError('Multiple optimizer restarts (n_restarts_optimizer>0) requires that all bounds are finite.')\n bounds = self.kernel_.bounds\n for iteration in range(self.n_restarts_optimizer):\n theta_initial = self._rng.uniform(bounds[:, 0], bounds[:, 1])\n optima.append(self._constrained_optimization(obj_func, theta_initial, bounds))\n lml_values = list(map(itemgetter(1), optima))\n self.kernel_.theta = optima[np.argmin(lml_values)][0]\n self.kernel_._check_bounds_params()\n self.log_marginal_likelihood_value_ = -np.min(lml_values)\n else:\n self.log_marginal_likelihood_value_ = self.log_marginal_likelihood(self.kernel_.theta, clone_kernel=False)\n K = self.kernel_(self.X_train_)\n K[np.diag_indices_from(K)] += self.alpha\n try:\n self.L_ = cholesky(K, lower=GPR_CHOLESKY_LOWER, check_finite=False)\n except np.linalg.LinAlgError as exc:\n exc.args = (f\"The kernel, {self.kernel_}, is not returning a positive definite matrix. 
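As the `fit` code above shows, `alpha` is added to the diagonal of the kernel matrix before the Cholesky factorization. A small self-contained sketch, with an arbitrarily chosen singular matrix, of how this diagonal jitter keeps the factorization feasible::

    import numpy as np
    from scipy.linalg import cholesky

    # Kernel matrix of two duplicated training points: singular without jitter.
    K = np.array([[1.0, 1.0],
                  [1.0, 1.0]])
    alpha = 1e-10  # plays the same role as GaussianProcessRegressor(alpha=...)
    K[np.diag_indices_from(K)] += alpha
    L = cholesky(K, lower=True)  # now succeeds; the added term acts as jitter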
Try gradually increasing the 'alpha' parameter of your GaussianProcessRegressor estimator.\", ) + exc.args\n raise\n self.alpha_ = cho_solve((self.L_, GPR_CHOLESKY_LOWER), self.y_train_, check_finite=False)\n return self\n \n def predict(self, X, return_std=False, return_cov=False):\n \"\"\"Predict using the Gaussian process regression model.\n\n We can also predict based on an unfitted model by using the GP prior.\n In addition to the mean of the predictive distribution, optionally also\n returns its standard deviation (`return_std=True`) or covariance\n (`return_cov=True`). Note that at most one of the two can be requested.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features) or list of object\n Query points where the GP is evaluated.\n\n return_std : bool, default=False\n If True, the standard-deviation of the predictive distribution at\n the query points is returned along with the mean.\n\n return_cov : bool, default=False\n If True, the covariance of the joint predictive distribution at\n the query points is returned along with the mean.\n\n Returns\n -------\n y_mean : ndarray of shape (n_samples,) or (n_samples, n_targets)\n Mean of predictive distribution a query points.\n\n y_std : ndarray of shape (n_samples,) or (n_samples, n_targets), optional\n Standard deviation of predictive distribution at query points.\n Only returned when `return_std` is True.\n\n y_cov : ndarray of shape (n_samples, n_samples) or (n_samples, n_samples, n_targets), optional\n Covariance of joint predictive distribution a query points.\n Only returned when `return_cov` is True.\n \"\"\"\n if return_std and return_cov:\n raise RuntimeError('At most one of return_std or return_cov can be requested.')\n if self.kernel is None or self.kernel.requires_vector_input:\n (dtype, ensure_2d) = ('numeric', True)\n else:\n (dtype, ensure_2d) = (None, False)\n X = self._validate_data(X, ensure_2d=ensure_2d, dtype=dtype, reset=False)\n if not hasattr(self, 'X_train_'):\n if self.kernel is None:\n kernel = C(1.0, constant_value_bounds='fixed') * RBF(1.0, length_scale_bounds='fixed')\n else:\n kernel = self.kernel\n y_mean = np.zeros(X.shape[0])\n if return_cov:\n y_cov = kernel(X)\n return y_mean, y_cov\n elif return_std:\n y_var = kernel.diag(X)\n return y_mean, np.sqrt(y_var)\n else:\n return y_mean\n else:\n K_trans = self.kernel_(X, self.X_train_)\n y_mean = K_trans @ self.alpha_\n y_mean = self._y_train_std * y_mean + self._y_train_mean\n V = solve_triangular(self.L_, K_trans.T, lower=GPR_CHOLESKY_LOWER, check_finite=False)\n if return_cov:\n y_cov = self.kernel_(X) - V.T @ V\n y_cov = np.outer(y_cov, self._y_train_std**2).reshape(*y_cov.shape, -1)\n if y_cov.shape[2] == 1:\n y_cov = np.squeeze(y_cov, axis=2)\n return y_mean, y_cov\n elif return_std:\n y_var = self.kernel_.diag(X)\n y_var -= np.einsum('ij,ji->i', V.T, V)\n y_var_negative = y_var < 0\n if np.any(y_var_negative):\n warnings.warn('Predicted variances smaller than 0. 
Setting those variances to 0.')\n y_var[y_var_negative] = 0.0\n y_var = np.outer(y_var, self._y_train_std**2).reshape(*y_var.shape, -1)\n if y_var.shape[1] == 1:\n y_var = np.squeeze(y_var, axis=1)\n return y_mean, np.sqrt(y_var)\n else:\n return y_mean\n \n def sample_y(self, X, n_samples=1, random_state=0):\n \"\"\"Draw samples from Gaussian process and evaluate at X.\n\n Parameters\n ----------\n X : array-like of shape (n_samples_X, n_features) or list of object\n Query points where the GP is evaluated.\n\n n_samples : int, default=1\n Number of samples drawn from the Gaussian process per query point.\n\n random_state : int, RandomState instance or None, default=0\n Determines random number generation to randomly draw samples.\n Pass an int for reproducible results across multiple function\n calls.\n See :term:`Glossary `.\n\n Returns\n -------\n y_samples : ndarray of shape (n_samples_X, n_samples), or (n_samples_X, n_targets, n_samples)\n Values of n_samples samples drawn from Gaussian process and\n evaluated at query points.\n \"\"\"\n rng = check_random_state(random_state)\n (y_mean, y_cov) = self.predict(X, return_cov=True)\n if y_mean.ndim == 1:\n y_samples = rng.multivariate_normal(y_mean, y_cov, n_samples).T\n else:\n y_samples = [rng.multivariate_normal(y_mean[:, i], y_cov, n_samples).T[:, np.newaxis] for i in range(y_mean.shape[1])]\n y_samples = np.hstack(y_samples)\n return y_samples\n \n def log_marginal_likelihood(self, theta=None, eval_gradient=False, clone_kernel=True):\n \"\"\"Return log-marginal likelihood of theta for training data.\n\n Parameters\n ----------\n theta : array-like of shape (n_kernel_params,) default=None\n Kernel hyperparameters for which the log-marginal likelihood is\n evaluated. If None, the precomputed log_marginal_likelihood\n of ``self.kernel_.theta`` is returned.\n\n eval_gradient : bool, default=False\n If True, the gradient of the log-marginal likelihood with respect\n to the kernel hyperparameters at position theta is returned\n additionally. If True, theta must not be None.\n\n clone_kernel : bool, default=True\n If True, the kernel attribute is copied. 
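The class description highlights that `log_marginal_likelihood(theta)` can be used externally for hyperparameter selection. A hedged sketch, with arbitrarily chosen data and candidate length scales, of scanning it on a fitted model::

    import numpy as np
    from sklearn.datasets import make_friedman2
    from sklearn.gaussian_process import GaussianProcessRegressor
    from sklearn.gaussian_process.kernels import RBF

    X, y = make_friedman2(n_samples=50, noise=0.5, random_state=0)
    gpr = GaussianProcessRegressor(kernel=RBF(length_scale=1.0)).fit(X, y)

    # theta is the vector of log-transformed hyperparameters; RBF has one entry.
    for length_scale in (0.1, 1.0, 10.0):
        print(length_scale, gpr.log_marginal_likelihood(np.log([length_scale])))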
If False, the kernel\n attribute is modified, but may result in a performance improvement.\n\n Returns\n -------\n log_likelihood : float\n Log-marginal likelihood of theta for training data.\n\n log_likelihood_gradient : ndarray of shape (n_kernel_params,), optional\n Gradient of the log-marginal likelihood with respect to the kernel\n hyperparameters at position theta.\n Only returned when eval_gradient is True.\n \"\"\"\n if theta is None:\n if eval_gradient:\n raise ValueError('Gradient can only be evaluated for theta!=None')\n return self.log_marginal_likelihood_value_\n if clone_kernel:\n kernel = self.kernel_.clone_with_theta(theta)\n else:\n kernel = self.kernel_\n kernel.theta = theta\n if eval_gradient:\n (K, K_gradient) = kernel(self.X_train_, eval_gradient=True)\n else:\n K = kernel(self.X_train_)\n K[np.diag_indices_from(K)] += self.alpha\n try:\n L = cholesky(K, lower=GPR_CHOLESKY_LOWER, check_finite=False)\n except np.linalg.LinAlgError:\n return (-np.inf, np.zeros_like(theta)) if eval_gradient else -np.inf\n y_train = self.y_train_\n if y_train.ndim == 1:\n y_train = y_train[:, np.newaxis]\n alpha = cho_solve((L, GPR_CHOLESKY_LOWER), y_train, check_finite=False)\n log_likelihood_dims = -0.5 * np.einsum('ik,ik->k', y_train, alpha)\n log_likelihood_dims -= np.log(np.diag(L)).sum()\n log_likelihood_dims -= K.shape[0] / 2 * np.log(2 * np.pi)\n log_likelihood = log_likelihood_dims.sum(axis=-1)\n if eval_gradient:\n inner_term = np.einsum('ik,jk->ijk', alpha, alpha)\n K_inv = cho_solve((L, GPR_CHOLESKY_LOWER), np.eye(K.shape[0]), check_finite=False)\n inner_term -= K_inv[..., np.newaxis]\n log_likelihood_gradient_dims = 0.5 * np.einsum('ijl,jik->kl', inner_term, K_gradient)\n log_likelihood_gradient = log_likelihood_gradient_dims.sum(axis=-1)\n if eval_gradient:\n return log_likelihood, log_likelihood_gradient\n else:\n return log_likelihood\n \n def _constrained_optimization(self, obj_func, initial_theta, bounds):\n if self.optimizer == 'fmin_l_bfgs_b':\n opt_res = scipy.optimize.minimize(obj_func, initial_theta, method='L-BFGS-B', jac=True, bounds=bounds)\n _check_optimize_result('lbfgs', opt_res)\n (theta_opt, func_min) = (opt_res.x, opt_res.fun)\n elif callable(self.optimizer):\n (theta_opt, func_min) = self.optimizer(obj_func, initial_theta, bounds=bounds)\n else:\n raise ValueError(f'Unknown optimizer {self.optimizer}.')\n return theta_opt, func_min\n \n def _more_tags(self):\n return {'requires_fit': False}\n" }, { "name": "CompoundKernel", @@ -22755,7 +22822,7 @@ "sklearn.gaussian_process.kernels.ConstantKernel.__repr__" ], "is_public": true, - "description": "Constant kernel.\n\nCan be used as part of a product-kernel where it scales the magnitude of the other factor (kernel) or as part of a sum-kernel, where it modifies the mean of the Gaussian process. .. math:: k(x_1, x_2) = constant\\_value \\;\\forall\\; x_1, x_2 Adding a constant kernel is equivalent to adding a constant:: kernel = RBF() + ConstantKernel(constant_value=2) is the same as:: kernel = RBF() + 2 Read more in the :ref:`User Guide `. .. versionadded:: 0.18", + "description": "Constant kernel.\n\nCan be used as part of a product-kernel where it scales the magnitude of\nthe other factor (kernel) or as part of a sum-kernel, where it modifies\nthe mean of the Gaussian process.\n\n.. 
math::\n k(x_1, x_2) = constant\\_value \\;\\forall\\; x_1, x_2\n\nAdding a constant kernel is equivalent to adding a constant::\n\n kernel = RBF() + ConstantKernel(constant_value=2)\n\nis the same as::\n\n kernel = RBF() + 2\n\nRead more in the :ref:`User Guide `.\n\n.. versionadded:: 0.18", "docstring": "Constant kernel.\n\n Can be used as part of a product-kernel where it scales the magnitude of\n the other factor (kernel) or as part of a sum-kernel, where it modifies\n the mean of the Gaussian process.\n\n .. math::\n k(x_1, x_2) = constant\\_value \\;\\forall\\; x_1, x_2\n\n Adding a constant kernel is equivalent to adding a constant::\n\n kernel = RBF() + ConstantKernel(constant_value=2)\n\n is the same as::\n\n kernel = RBF() + 2\n\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.18\n\n Parameters\n ----------\n constant_value : float, default=1.0\n The constant value which defines the covariance:\n k(x_1, x_2) = constant_value\n\n constant_value_bounds : pair of floats >= 0 or \"fixed\", default=(1e-5, 1e5)\n The lower and upper bound on `constant_value`.\n If set to \"fixed\", `constant_value` cannot be changed during\n hyperparameter tuning.\n\n Examples\n --------\n >>> from sklearn.datasets import make_friedman2\n >>> from sklearn.gaussian_process import GaussianProcessRegressor\n >>> from sklearn.gaussian_process.kernels import RBF, ConstantKernel\n >>> X, y = make_friedman2(n_samples=500, noise=0, random_state=0)\n >>> kernel = RBF() + ConstantKernel(constant_value=2)\n >>> gpr = GaussianProcessRegressor(kernel=kernel, alpha=5,\n ... random_state=0).fit(X, y)\n >>> gpr.score(X, y)\n 0.3696...\n >>> gpr.predict(X[:1,:], return_std=True)\n (array([606.1...]), array([0.24...]))\n ", "source_code": "\n\nclass ConstantKernel(StationaryKernelMixin, GenericKernelMixin, Kernel):\n \"\"\"Constant kernel.\n\n Can be used as part of a product-kernel where it scales the magnitude of\n the other factor (kernel) or as part of a sum-kernel, where it modifies\n the mean of the Gaussian process.\n\n .. math::\n k(x_1, x_2) = constant\\_value \\;\\forall\\; x_1, x_2\n\n Adding a constant kernel is equivalent to adding a constant::\n\n kernel = RBF() + ConstantKernel(constant_value=2)\n\n is the same as::\n\n kernel = RBF() + 2\n\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.18\n\n Parameters\n ----------\n constant_value : float, default=1.0\n The constant value which defines the covariance:\n k(x_1, x_2) = constant_value\n\n constant_value_bounds : pair of floats >= 0 or \"fixed\", default=(1e-5, 1e5)\n The lower and upper bound on `constant_value`.\n If set to \"fixed\", `constant_value` cannot be changed during\n hyperparameter tuning.\n\n Examples\n --------\n >>> from sklearn.datasets import make_friedman2\n >>> from sklearn.gaussian_process import GaussianProcessRegressor\n >>> from sklearn.gaussian_process.kernels import RBF, ConstantKernel\n >>> X, y = make_friedman2(n_samples=500, noise=0, random_state=0)\n >>> kernel = RBF() + ConstantKernel(constant_value=2)\n >>> gpr = GaussianProcessRegressor(kernel=kernel, alpha=5,\n ... 
random_state=0).fit(X, y)\n >>> gpr.score(X, y)\n 0.3696...\n >>> gpr.predict(X[:1,:], return_std=True)\n (array([606.1...]), array([0.24...]))\n \"\"\"\n \n def __init__(self, constant_value=1.0, constant_value_bounds=(1e-05, 100000.0)):\n self.constant_value = constant_value\n self.constant_value_bounds = constant_value_bounds\n \n @property\n def hyperparameter_constant_value(self):\n return Hyperparameter('constant_value', 'numeric', self.constant_value_bounds)\n \n def __call__(self, X, Y=None, eval_gradient=False):\n \"\"\"Return the kernel k(X, Y) and optionally its gradient.\n\n Parameters\n ----------\n X : array-like of shape (n_samples_X, n_features) or list of object\n Left argument of the returned kernel k(X, Y)\n\n Y : array-like of shape (n_samples_X, n_features) or list of object, default=None\n Right argument of the returned kernel k(X, Y). If None, k(X, X)\n is evaluated instead.\n\n eval_gradient : bool, default=False\n Determines whether the gradient with respect to the log of\n the kernel hyperparameter is computed.\n Only supported when Y is None.\n\n Returns\n -------\n K : ndarray of shape (n_samples_X, n_samples_Y)\n Kernel k(X, Y)\n\n K_gradient : ndarray of shape (n_samples_X, n_samples_X, n_dims), optional\n The gradient of the kernel k(X, X) with respect to the log of the\n hyperparameter of the kernel. Only returned when eval_gradient\n is True.\n \"\"\"\n if Y is None:\n Y = X\n elif eval_gradient:\n raise ValueError('Gradient can only be evaluated when Y is None.')\n K = np.full((_num_samples(X), _num_samples(Y)), self.constant_value, dtype=np.array(self.constant_value).dtype)\n if eval_gradient:\n if not self.hyperparameter_constant_value.fixed:\n return K, np.full((_num_samples(X), _num_samples(X), 1), self.constant_value, dtype=np.array(self.constant_value).dtype)\n else:\n return K, np.empty((_num_samples(X), _num_samples(X), 0))\n else:\n return K\n \n def diag(self, X):\n \"\"\"Returns the diagonal of the kernel k(X, X).\n\n The result of this method is identical to np.diag(self(X)); however,\n it can be evaluated more efficiently since only the diagonal is\n evaluated.\n\n Parameters\n ----------\n X : array-like of shape (n_samples_X, n_features) or list of object\n Argument to the kernel.\n\n Returns\n -------\n K_diag : ndarray of shape (n_samples_X,)\n Diagonal of kernel k(X, X)\n \"\"\"\n return np.full(_num_samples(X), self.constant_value, dtype=np.array(self.constant_value).dtype)\n \n def __repr__(self):\n return '{0:.3g}**2'.format(np.sqrt(self.constant_value))\n" }, @@ -22773,7 +22840,7 @@ "sklearn.gaussian_process.kernels.DotProduct.__repr__" ], "is_public": true, - "description": "Dot-Product kernel.\n\nThe DotProduct kernel is non-stationary and can be obtained from linear regression by putting :math:`N(0, 1)` priors on the coefficients of :math:`x_d (d = 1, . . . , D)` and a prior of :math:`N(0, \\sigma_0^2)` on the bias. The DotProduct kernel is invariant to a rotation of the coordinates about the origin, but not translations. It is parameterized by a parameter sigma_0 :math:`\\sigma` which controls the inhomogenity of the kernel. For :math:`\\sigma_0^2 =0`, the kernel is called the homogeneous linear kernel, otherwise it is inhomogeneous. The kernel is given by .. math:: k(x_i, x_j) = \\sigma_0 ^ 2 + x_i \\cdot x_j The DotProduct kernel is commonly combined with exponentiation. See [1]_, Chapter 4, Section 4.2, for further details regarding the DotProduct kernel. Read more in the :ref:`User Guide `. .. 
versionadded:: 0.18", + "description": "Dot-Product kernel.\n\nThe DotProduct kernel is non-stationary and can be obtained from linear\nregression by putting :math:`N(0, 1)` priors on the coefficients\nof :math:`x_d (d = 1, . . . , D)` and a prior of :math:`N(0, \\sigma_0^2)`\non the bias. The DotProduct kernel is invariant to a rotation of\nthe coordinates about the origin, but not translations.\nIt is parameterized by a parameter sigma_0 :math:`\\sigma`\nwhich controls the inhomogenity of the kernel. For :math:`\\sigma_0^2 =0`,\nthe kernel is called the homogeneous linear kernel, otherwise\nit is inhomogeneous. The kernel is given by\n\n.. math::\n k(x_i, x_j) = \\sigma_0 ^ 2 + x_i \\cdot x_j\n\nThe DotProduct kernel is commonly combined with exponentiation.\n\nSee [1]_, Chapter 4, Section 4.2, for further details regarding the\nDotProduct kernel.\n\nRead more in the :ref:`User Guide `.\n\n.. versionadded:: 0.18", "docstring": "Dot-Product kernel.\n\n The DotProduct kernel is non-stationary and can be obtained from linear\n regression by putting :math:`N(0, 1)` priors on the coefficients\n of :math:`x_d (d = 1, . . . , D)` and a prior of :math:`N(0, \\sigma_0^2)`\n on the bias. The DotProduct kernel is invariant to a rotation of\n the coordinates about the origin, but not translations.\n It is parameterized by a parameter sigma_0 :math:`\\sigma`\n which controls the inhomogenity of the kernel. For :math:`\\sigma_0^2 =0`,\n the kernel is called the homogeneous linear kernel, otherwise\n it is inhomogeneous. The kernel is given by\n\n .. math::\n k(x_i, x_j) = \\sigma_0 ^ 2 + x_i \\cdot x_j\n\n The DotProduct kernel is commonly combined with exponentiation.\n\n See [1]_, Chapter 4, Section 4.2, for further details regarding the\n DotProduct kernel.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.18\n\n Parameters\n ----------\n sigma_0 : float >= 0, default=1.0\n Parameter controlling the inhomogenity of the kernel. If sigma_0=0,\n the kernel is homogeneous.\n\n sigma_0_bounds : pair of floats >= 0 or \"fixed\", default=(1e-5, 1e5)\n The lower and upper bound on 'sigma_0'.\n If set to \"fixed\", 'sigma_0' cannot be changed during\n hyperparameter tuning.\n\n References\n ----------\n .. [1] `Carl Edward Rasmussen, Christopher K. I. Williams (2006).\n \"Gaussian Processes for Machine Learning\". The MIT Press.\n `_\n\n Examples\n --------\n >>> from sklearn.datasets import make_friedman2\n >>> from sklearn.gaussian_process import GaussianProcessRegressor\n >>> from sklearn.gaussian_process.kernels import DotProduct, WhiteKernel\n >>> X, y = make_friedman2(n_samples=500, noise=0, random_state=0)\n >>> kernel = DotProduct() + WhiteKernel()\n >>> gpr = GaussianProcessRegressor(kernel=kernel,\n ... random_state=0).fit(X, y)\n >>> gpr.score(X, y)\n 0.3680...\n >>> gpr.predict(X[:2,:], return_std=True)\n (array([653.0..., 592.1...]), array([316.6..., 316.6...]))\n ", "source_code": "\n\nclass DotProduct(Kernel):\n \"\"\"Dot-Product kernel.\n\n The DotProduct kernel is non-stationary and can be obtained from linear\n regression by putting :math:`N(0, 1)` priors on the coefficients\n of :math:`x_d (d = 1, . . . , D)` and a prior of :math:`N(0, \\sigma_0^2)`\n on the bias. The DotProduct kernel is invariant to a rotation of\n the coordinates about the origin, but not translations.\n It is parameterized by a parameter sigma_0 :math:`\\sigma`\n which controls the inhomogenity of the kernel. 
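The formula documented above, :math:`k(x_i, x_j) = \sigma_0^2 + x_i \cdot x_j`, can be checked directly against the kernel object; a brief illustrative sketch with arbitrarily chosen inputs::

    import numpy as np
    from sklearn.gaussian_process.kernels import DotProduct

    X = np.array([[1.0, 2.0],
                  [3.0, 4.0]])
    K = DotProduct(sigma_0=1.0)(X)          # sigma_0**2 + X @ X.T
    assert np.allclose(K, 1.0 + X @ X.T)

    K_hom = DotProduct(sigma_0=0.0)(X)      # homogeneous linear kernel
    assert np.allclose(K_hom, X @ X.T)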
For :math:`\\sigma_0^2 =0`,\n the kernel is called the homogeneous linear kernel, otherwise\n it is inhomogeneous. The kernel is given by\n\n .. math::\n k(x_i, x_j) = \\sigma_0 ^ 2 + x_i \\cdot x_j\n\n The DotProduct kernel is commonly combined with exponentiation.\n\n See [1]_, Chapter 4, Section 4.2, for further details regarding the\n DotProduct kernel.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.18\n\n Parameters\n ----------\n sigma_0 : float >= 0, default=1.0\n Parameter controlling the inhomogenity of the kernel. If sigma_0=0,\n the kernel is homogeneous.\n\n sigma_0_bounds : pair of floats >= 0 or \"fixed\", default=(1e-5, 1e5)\n The lower and upper bound on 'sigma_0'.\n If set to \"fixed\", 'sigma_0' cannot be changed during\n hyperparameter tuning.\n\n References\n ----------\n .. [1] `Carl Edward Rasmussen, Christopher K. I. Williams (2006).\n \"Gaussian Processes for Machine Learning\". The MIT Press.\n `_\n\n Examples\n --------\n >>> from sklearn.datasets import make_friedman2\n >>> from sklearn.gaussian_process import GaussianProcessRegressor\n >>> from sklearn.gaussian_process.kernels import DotProduct, WhiteKernel\n >>> X, y = make_friedman2(n_samples=500, noise=0, random_state=0)\n >>> kernel = DotProduct() + WhiteKernel()\n >>> gpr = GaussianProcessRegressor(kernel=kernel,\n ... random_state=0).fit(X, y)\n >>> gpr.score(X, y)\n 0.3680...\n >>> gpr.predict(X[:2,:], return_std=True)\n (array([653.0..., 592.1...]), array([316.6..., 316.6...]))\n \"\"\"\n \n def __init__(self, sigma_0=1.0, sigma_0_bounds=(1e-05, 100000.0)):\n self.sigma_0 = sigma_0\n self.sigma_0_bounds = sigma_0_bounds\n \n @property\n def hyperparameter_sigma_0(self):\n return Hyperparameter('sigma_0', 'numeric', self.sigma_0_bounds)\n \n def __call__(self, X, Y=None, eval_gradient=False):\n \"\"\"Return the kernel k(X, Y) and optionally its gradient.\n\n Parameters\n ----------\n X : ndarray of shape (n_samples_X, n_features)\n Left argument of the returned kernel k(X, Y)\n\n Y : ndarray of shape (n_samples_Y, n_features), default=None\n Right argument of the returned kernel k(X, Y). If None, k(X, X)\n if evaluated instead.\n\n eval_gradient : bool, default=False\n Determines whether the gradient with respect to the log of\n the kernel hyperparameter is computed.\n Only supported when Y is None.\n\n Returns\n -------\n K : ndarray of shape (n_samples_X, n_samples_Y)\n Kernel k(X, Y)\n\n K_gradient : ndarray of shape (n_samples_X, n_samples_X, n_dims), optional\n The gradient of the kernel k(X, X) with respect to the log of the\n hyperparameter of the kernel. 
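As the `__call__` documentation above notes, `eval_gradient=True` additionally returns the gradient of the kernel with respect to the log of its hyperparameters. A short illustrative sketch of the returned shapes, using arbitrary input data::

    import numpy as np
    from sklearn.gaussian_process.kernels import DotProduct

    X = np.random.RandomState(0).rand(5, 3)
    K, K_gradient = DotProduct(sigma_0=1.0)(X, eval_gradient=True)
    print(K.shape)           # (5, 5)
    print(K_gradient.shape)  # (5, 5, 1): one slice per non-fixed hyperparameter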
Only returned when `eval_gradient`\n is True.\n \"\"\"\n X = np.atleast_2d(X)\n if Y is None:\n K = np.inner(X, X) + self.sigma_0**2\n else:\n if eval_gradient:\n raise ValueError('Gradient can only be evaluated when Y is None.')\n K = np.inner(X, Y) + self.sigma_0**2\n if eval_gradient:\n if not self.hyperparameter_sigma_0.fixed:\n K_gradient = np.empty((K.shape[0], K.shape[1], 1))\n K_gradient[..., 0] = 2 * self.sigma_0**2\n return K, K_gradient\n else:\n return K, np.empty((X.shape[0], X.shape[0], 0))\n else:\n return K\n \n def diag(self, X):\n \"\"\"Returns the diagonal of the kernel k(X, X).\n\n The result of this method is identical to np.diag(self(X)); however,\n it can be evaluated more efficiently since only the diagonal is\n evaluated.\n\n Parameters\n ----------\n X : ndarray of shape (n_samples_X, n_features)\n Left argument of the returned kernel k(X, Y).\n\n Returns\n -------\n K_diag : ndarray of shape (n_samples_X,)\n Diagonal of kernel k(X, X).\n \"\"\"\n return np.einsum('ij,ij->i', X, X) + self.sigma_0**2\n \n def is_stationary(self):\n \"\"\"Returns whether the kernel is stationary.\"\"\"\n return False\n \n def __repr__(self):\n return '{0}(sigma_0={1:.3g})'.format(self.__class__.__name__, self.sigma_0)\n" }, @@ -22794,7 +22861,7 @@ "sklearn.gaussian_process.kernels.ExpSineSquared.__repr__" ], "is_public": true, - "description": "Exp-Sine-Squared kernel (aka periodic kernel).\n\nThe ExpSineSquared kernel allows one to model functions which repeat themselves exactly. It is parameterized by a length scale parameter :math:`l>0` and a periodicity parameter :math:`p>0`. Only the isotropic variant where :math:`l` is a scalar is supported at the moment. The kernel is given by: .. math:: k(x_i, x_j) = \\text{exp}\\left(- \\frac{ 2\\sin^2(\\pi d(x_i, x_j)/p) }{ l^ 2} \\right) where :math:`l` is the length scale of the kernel, :math:`p` the periodicity of the kernel and :math:`d(\\\\cdot,\\\\cdot)` is the Euclidean distance. Read more in the :ref:`User Guide `. .. versionadded:: 0.18", + "description": "Exp-Sine-Squared kernel (aka periodic kernel).\n\nThe ExpSineSquared kernel allows one to model functions which repeat\nthemselves exactly. It is parameterized by a length scale\nparameter :math:`l>0` and a periodicity parameter :math:`p>0`.\nOnly the isotropic variant where :math:`l` is a scalar is\nsupported at the moment. The kernel is given by:\n\n.. math::\n k(x_i, x_j) = \\text{exp}\\left(-\n \\frac{ 2\\sin^2(\\pi d(x_i, x_j)/p) }{ l^ 2} \\right)\n\nwhere :math:`l` is the length scale of the kernel, :math:`p` the\nperiodicity of the kernel and :math:`d(\\\\cdot,\\\\cdot)` is the\nEuclidean distance.\n\nRead more in the :ref:`User Guide `.\n\n.. versionadded:: 0.18", "docstring": "Exp-Sine-Squared kernel (aka periodic kernel).\n\n The ExpSineSquared kernel allows one to model functions which repeat\n themselves exactly. It is parameterized by a length scale\n parameter :math:`l>0` and a periodicity parameter :math:`p>0`.\n Only the isotropic variant where :math:`l` is a scalar is\n supported at the moment. The kernel is given by:\n\n .. math::\n k(x_i, x_j) = \\text{exp}\\left(-\n \\frac{ 2\\sin^2(\\pi d(x_i, x_j)/p) }{ l^ 2} \\right)\n\n where :math:`l` is the length scale of the kernel, :math:`p` the\n periodicity of the kernel and :math:`d(\\\\cdot,\\\\cdot)` is the\n Euclidean distance.\n\n Read more in the :ref:`User Guide `.\n\n .. 
versionadded:: 0.18\n\n Parameters\n ----------\n\n length_scale : float > 0, default=1.0\n The length scale of the kernel.\n\n periodicity : float > 0, default=1.0\n The periodicity of the kernel.\n\n length_scale_bounds : pair of floats >= 0 or \"fixed\", default=(1e-5, 1e5)\n The lower and upper bound on 'length_scale'.\n If set to \"fixed\", 'length_scale' cannot be changed during\n hyperparameter tuning.\n\n periodicity_bounds : pair of floats >= 0 or \"fixed\", default=(1e-5, 1e5)\n The lower and upper bound on 'periodicity'.\n If set to \"fixed\", 'periodicity' cannot be changed during\n hyperparameter tuning.\n\n Examples\n --------\n >>> from sklearn.datasets import make_friedman2\n >>> from sklearn.gaussian_process import GaussianProcessRegressor\n >>> from sklearn.gaussian_process.kernels import ExpSineSquared\n >>> X, y = make_friedman2(n_samples=50, noise=0, random_state=0)\n >>> kernel = ExpSineSquared(length_scale=1, periodicity=1)\n >>> gpr = GaussianProcessRegressor(kernel=kernel, alpha=5,\n ... random_state=0).fit(X, y)\n >>> gpr.score(X, y)\n 0.0144...\n >>> gpr.predict(X[:2,:], return_std=True)\n (array([425.6..., 457.5...]), array([0.3894..., 0.3467...]))\n ", "source_code": "\n\nclass ExpSineSquared(StationaryKernelMixin, NormalizedKernelMixin, Kernel):\n \"\"\"Exp-Sine-Squared kernel (aka periodic kernel).\n\n The ExpSineSquared kernel allows one to model functions which repeat\n themselves exactly. It is parameterized by a length scale\n parameter :math:`l>0` and a periodicity parameter :math:`p>0`.\n Only the isotropic variant where :math:`l` is a scalar is\n supported at the moment. The kernel is given by:\n\n .. math::\n k(x_i, x_j) = \\text{exp}\\left(-\n \\frac{ 2\\sin^2(\\pi d(x_i, x_j)/p) }{ l^ 2} \\right)\n\n where :math:`l` is the length scale of the kernel, :math:`p` the\n periodicity of the kernel and :math:`d(\\\\cdot,\\\\cdot)` is the\n Euclidean distance.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.18\n\n Parameters\n ----------\n\n length_scale : float > 0, default=1.0\n The length scale of the kernel.\n\n periodicity : float > 0, default=1.0\n The periodicity of the kernel.\n\n length_scale_bounds : pair of floats >= 0 or \"fixed\", default=(1e-5, 1e5)\n The lower and upper bound on 'length_scale'.\n If set to \"fixed\", 'length_scale' cannot be changed during\n hyperparameter tuning.\n\n periodicity_bounds : pair of floats >= 0 or \"fixed\", default=(1e-5, 1e5)\n The lower and upper bound on 'periodicity'.\n If set to \"fixed\", 'periodicity' cannot be changed during\n hyperparameter tuning.\n\n Examples\n --------\n >>> from sklearn.datasets import make_friedman2\n >>> from sklearn.gaussian_process import GaussianProcessRegressor\n >>> from sklearn.gaussian_process.kernels import ExpSineSquared\n >>> X, y = make_friedman2(n_samples=50, noise=0, random_state=0)\n >>> kernel = ExpSineSquared(length_scale=1, periodicity=1)\n >>> gpr = GaussianProcessRegressor(kernel=kernel, alpha=5,\n ... 
random_state=0).fit(X, y)\n >>> gpr.score(X, y)\n 0.0144...\n >>> gpr.predict(X[:2,:], return_std=True)\n (array([425.6..., 457.5...]), array([0.3894..., 0.3467...]))\n \"\"\"\n \n def __init__(self, length_scale=1.0, periodicity=1.0, length_scale_bounds=(1e-05, 100000.0), periodicity_bounds=(1e-05, 100000.0)):\n self.length_scale = length_scale\n self.periodicity = periodicity\n self.length_scale_bounds = length_scale_bounds\n self.periodicity_bounds = periodicity_bounds\n \n @property\n def hyperparameter_length_scale(self):\n \"\"\"Returns the length scale\"\"\"\n return Hyperparameter('length_scale', 'numeric', self.length_scale_bounds)\n \n @property\n def hyperparameter_periodicity(self):\n return Hyperparameter('periodicity', 'numeric', self.periodicity_bounds)\n \n def __call__(self, X, Y=None, eval_gradient=False):\n \"\"\"Return the kernel k(X, Y) and optionally its gradient.\n\n Parameters\n ----------\n X : ndarray of shape (n_samples_X, n_features)\n Left argument of the returned kernel k(X, Y)\n\n Y : ndarray of shape (n_samples_Y, n_features), default=None\n Right argument of the returned kernel k(X, Y). If None, k(X, X)\n if evaluated instead.\n\n eval_gradient : bool, default=False\n Determines whether the gradient with respect to the log of\n the kernel hyperparameter is computed.\n Only supported when Y is None.\n\n Returns\n -------\n K : ndarray of shape (n_samples_X, n_samples_Y)\n Kernel k(X, Y)\n\n K_gradient : ndarray of shape (n_samples_X, n_samples_X, n_dims), optional\n The gradient of the kernel k(X, X) with respect to the log of the\n hyperparameter of the kernel. Only returned when `eval_gradient`\n is True.\n \"\"\"\n X = np.atleast_2d(X)\n if Y is None:\n dists = squareform(pdist(X, metric='euclidean'))\n arg = np.pi * dists / self.periodicity\n sin_of_arg = np.sin(arg)\n K = np.exp(-2 * (sin_of_arg / self.length_scale)**2)\n else:\n if eval_gradient:\n raise ValueError('Gradient can only be evaluated when Y is None.')\n dists = cdist(X, Y, metric='euclidean')\n K = np.exp(-2 * (np.sin(np.pi / self.periodicity * dists) / self.length_scale)**2)\n if eval_gradient:\n cos_of_arg = np.cos(arg)\n if not self.hyperparameter_length_scale.fixed:\n length_scale_gradient = 4 / self.length_scale**2 * sin_of_arg**2 * K\n length_scale_gradient = length_scale_gradient[:, :, np.newaxis]\n else:\n length_scale_gradient = np.empty((K.shape[0], K.shape[1], 0))\n if not self.hyperparameter_periodicity.fixed:\n periodicity_gradient = 4 * arg / self.length_scale**2 * cos_of_arg * sin_of_arg * K\n periodicity_gradient = periodicity_gradient[:, :, np.newaxis]\n else:\n periodicity_gradient = np.empty((K.shape[0], K.shape[1], 0))\n return K, np.dstack((length_scale_gradient, periodicity_gradient))\n else:\n return K\n \n def __repr__(self):\n return '{0}(length_scale={1:.3g}, periodicity={2:.3g})'.format(self.__class__.__name__, self.length_scale, self.periodicity)\n" }, @@ -22818,7 +22885,7 @@ "sklearn.gaussian_process.kernels.Exponentiation.requires_vector_input@getter" ], "is_public": true, - "description": "The Exponentiation kernel takes one base kernel and a scalar parameter :math:`p` and combines them via\n\n.. math:: k_{exp}(X, Y) = k(X, Y) ^p Note that the `__pow__` magic method is overridden, so `Exponentiation(RBF(), 2)` is equivalent to using the ** operator with `RBF() ** 2`. Read more in the :ref:`User Guide `. .. versionadded:: 0.18", + "description": "The Exponentiation kernel takes one base kernel and a scalar parameter\n:math:`p` and combines them via\n\n.. 
math::\n k_{exp}(X, Y) = k(X, Y) ^p\n\nNote that the `__pow__` magic method is overridden, so\n`Exponentiation(RBF(), 2)` is equivalent to using the ** operator\nwith `RBF() ** 2`.\n\nRead more in the :ref:`User Guide `.\n\n.. versionadded:: 0.18", "docstring": "The Exponentiation kernel takes one base kernel and a scalar parameter\n :math:`p` and combines them via\n\n .. math::\n k_{exp}(X, Y) = k(X, Y) ^p\n\n Note that the `__pow__` magic method is overridden, so\n `Exponentiation(RBF(), 2)` is equivalent to using the ** operator\n with `RBF() ** 2`.\n\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.18\n\n Parameters\n ----------\n kernel : Kernel\n The base kernel\n\n exponent : float\n The exponent for the base kernel\n\n\n Examples\n --------\n >>> from sklearn.datasets import make_friedman2\n >>> from sklearn.gaussian_process import GaussianProcessRegressor\n >>> from sklearn.gaussian_process.kernels import (RationalQuadratic,\n ... Exponentiation)\n >>> X, y = make_friedman2(n_samples=500, noise=0, random_state=0)\n >>> kernel = Exponentiation(RationalQuadratic(), exponent=2)\n >>> gpr = GaussianProcessRegressor(kernel=kernel, alpha=5,\n ... random_state=0).fit(X, y)\n >>> gpr.score(X, y)\n 0.419...\n >>> gpr.predict(X[:1,:], return_std=True)\n (array([635.5...]), array([0.559...]))\n ", "source_code": "\n\nclass Exponentiation(Kernel):\n \"\"\"The Exponentiation kernel takes one base kernel and a scalar parameter\n :math:`p` and combines them via\n\n .. math::\n k_{exp}(X, Y) = k(X, Y) ^p\n\n Note that the `__pow__` magic method is overridden, so\n `Exponentiation(RBF(), 2)` is equivalent to using the ** operator\n with `RBF() ** 2`.\n\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.18\n\n Parameters\n ----------\n kernel : Kernel\n The base kernel\n\n exponent : float\n The exponent for the base kernel\n\n\n Examples\n --------\n >>> from sklearn.datasets import make_friedman2\n >>> from sklearn.gaussian_process import GaussianProcessRegressor\n >>> from sklearn.gaussian_process.kernels import (RationalQuadratic,\n ... Exponentiation)\n >>> X, y = make_friedman2(n_samples=500, noise=0, random_state=0)\n >>> kernel = Exponentiation(RationalQuadratic(), exponent=2)\n >>> gpr = GaussianProcessRegressor(kernel=kernel, alpha=5,\n ... 
random_state=0).fit(X, y)\n >>> gpr.score(X, y)\n 0.419...\n >>> gpr.predict(X[:1,:], return_std=True)\n (array([635.5...]), array([0.559...]))\n \"\"\"\n \n def __init__(self, kernel, exponent):\n self.kernel = kernel\n self.exponent = exponent\n \n def get_params(self, deep=True):\n \"\"\"Get parameters of this kernel.\n\n Parameters\n ----------\n deep : bool, default=True\n If True, will return the parameters for this estimator and\n contained subobjects that are estimators.\n\n Returns\n -------\n params : dict\n Parameter names mapped to their values.\n \"\"\"\n params = dict(kernel=self.kernel, exponent=self.exponent)\n if deep:\n deep_items = self.kernel.get_params().items()\n params.update((('kernel__' + k, val) for (k, val) in deep_items))\n return params\n \n @property\n def hyperparameters(self):\n \"\"\"Returns a list of all hyperparameter.\"\"\"\n r = []\n for hyperparameter in self.kernel.hyperparameters:\n r.append(Hyperparameter('kernel__' + hyperparameter.name, hyperparameter.value_type, hyperparameter.bounds, hyperparameter.n_elements))\n return r\n \n @property\n def theta(self):\n \"\"\"Returns the (flattened, log-transformed) non-fixed hyperparameters.\n\n Note that theta are typically the log-transformed values of the\n kernel's hyperparameters as this representation of the search space\n is more amenable for hyperparameter search, as hyperparameters like\n length-scales naturally live on a log-scale.\n\n Returns\n -------\n theta : ndarray of shape (n_dims,)\n The non-fixed, log-transformed hyperparameters of the kernel\n \"\"\"\n return self.kernel.theta\n \n @theta.setter\n def theta(self, theta):\n \"\"\"Sets the (flattened, log-transformed) non-fixed hyperparameters.\n\n Parameters\n ----------\n theta : ndarray of shape (n_dims,)\n The non-fixed, log-transformed hyperparameters of the kernel\n \"\"\"\n self.kernel.theta = theta\n \n @property\n def bounds(self):\n \"\"\"Returns the log-transformed bounds on the theta.\n\n Returns\n -------\n bounds : ndarray of shape (n_dims, 2)\n The log-transformed bounds on the kernel's hyperparameters theta\n \"\"\"\n return self.kernel.bounds\n \n def __eq__(self, b):\n if type(self) != type(b):\n return False\n return self.kernel == b.kernel and self.exponent == b.exponent\n \n def __call__(self, X, Y=None, eval_gradient=False):\n \"\"\"Return the kernel k(X, Y) and optionally its gradient.\n\n Parameters\n ----------\n X : array-like of shape (n_samples_X, n_features) or list of object\n Left argument of the returned kernel k(X, Y)\n\n Y : array-like of shape (n_samples_Y, n_features) or list of object, default=None\n Right argument of the returned kernel k(X, Y). If None, k(X, X)\n is evaluated instead.\n\n eval_gradient : bool, default=False\n Determines whether the gradient with respect to the log of\n the kernel hyperparameter is computed.\n\n Returns\n -------\n K : ndarray of shape (n_samples_X, n_samples_Y)\n Kernel k(X, Y)\n\n K_gradient : ndarray of shape (n_samples_X, n_samples_X, n_dims), optional\n The gradient of the kernel k(X, X) with respect to the log of the\n hyperparameter of the kernel. 
Only returned when `eval_gradient`\n is True.\n \"\"\"\n if eval_gradient:\n (K, K_gradient) = self.kernel(X, Y, eval_gradient=True)\n K_gradient *= self.exponent * K[:, :, np.newaxis]**(self.exponent - 1)\n return K**self.exponent, K_gradient\n else:\n K = self.kernel(X, Y, eval_gradient=False)\n return K**self.exponent\n \n def diag(self, X):\n \"\"\"Returns the diagonal of the kernel k(X, X).\n\n The result of this method is identical to np.diag(self(X)); however,\n it can be evaluated more efficiently since only the diagonal is\n evaluated.\n\n Parameters\n ----------\n X : array-like of shape (n_samples_X, n_features) or list of object\n Argument to the kernel.\n\n Returns\n -------\n K_diag : ndarray of shape (n_samples_X,)\n Diagonal of kernel k(X, X)\n \"\"\"\n return self.kernel.diag(X)**self.exponent\n \n def __repr__(self):\n return '{0} ** {1}'.format(self.kernel, self.exponent)\n \n def is_stationary(self):\n \"\"\"Returns whether the kernel is stationary.\"\"\"\n return self.kernel.is_stationary()\n \n @property\n def requires_vector_input(self):\n \"\"\"Returns whether the kernel is defined on discrete structures.\"\"\"\n return self.kernel.requires_vector_input\n" }, @@ -22831,7 +22898,7 @@ "sklearn.gaussian_process.kernels.GenericKernelMixin.requires_vector_input@getter" ], "is_public": true, - "description": "Mixin for kernels which operate on generic objects such as variable- length sequences, trees, and graphs.\n\n.. versionadded:: 0.22", + "description": "Mixin for kernels which operate on generic objects such as variable-\nlength sequences, trees, and graphs.\n\n.. versionadded:: 0.22", "docstring": "Mixin for kernels which operate on generic objects such as variable-\n length sequences, trees, and graphs.\n\n .. versionadded:: 0.22\n ", "source_code": "\n\nclass GenericKernelMixin:\n \"\"\"Mixin for kernels which operate on generic objects such as variable-\n length sequences, trees, and graphs.\n\n .. versionadded:: 0.22\n \"\"\"\n \n @property\n def requires_vector_input(self):\n \"\"\"Whether the kernel works only on fixed-length feature vectors.\"\"\"\n return False\n" }, @@ -22915,7 +22982,7 @@ "sklearn.gaussian_process.kernels.Matern.__repr__" ], "is_public": true, - "description": "Matern kernel.\n\nThe class of Matern kernels is a generalization of the :class:`RBF`. It has an additional parameter :math:`\\nu` which controls the smoothness of the resulting function. The smaller :math:`\\nu`, the less smooth the approximated function is. As :math:`\\nu\\rightarrow\\infty`, the kernel becomes equivalent to the :class:`RBF` kernel. When :math:`\\nu = 1/2`, the Mat\u00e9rn kernel becomes identical to the absolute exponential kernel. Important intermediate values are :math:`\\nu=1.5` (once differentiable functions) and :math:`\\nu=2.5` (twice differentiable functions). The kernel is given by: .. math:: k(x_i, x_j) = \\frac{1}{\\Gamma(\\nu)2^{\\nu-1}}\\Bigg( \\frac{\\sqrt{2\\nu}}{l} d(x_i , x_j ) \\Bigg)^\\nu K_\\nu\\Bigg( \\frac{\\sqrt{2\\nu}}{l} d(x_i , x_j )\\Bigg) where :math:`d(\\cdot,\\cdot)` is the Euclidean distance, :math:`K_{\\nu}(\\cdot)` is a modified Bessel function and :math:`\\Gamma(\\cdot)` is the gamma function. See [1]_, Chapter 4, Section 4.2, for details regarding the different variants of the Matern kernel. Read more in the :ref:`User Guide `. .. 
versionadded:: 0.18", + "description": "Matern kernel.\n\nThe class of Matern kernels is a generalization of the :class:`RBF`.\nIt has an additional parameter :math:`\\nu` which controls the\nsmoothness of the resulting function. The smaller :math:`\\nu`,\nthe less smooth the approximated function is.\nAs :math:`\\nu\\rightarrow\\infty`, the kernel becomes equivalent to\nthe :class:`RBF` kernel. When :math:`\\nu = 1/2`, the Mat\u00e9rn kernel\nbecomes identical to the absolute exponential kernel.\nImportant intermediate values are\n:math:`\\nu=1.5` (once differentiable functions)\nand :math:`\\nu=2.5` (twice differentiable functions).\n\nThe kernel is given by:\n\n.. math::\n k(x_i, x_j) = \\frac{1}{\\Gamma(\\nu)2^{\\nu-1}}\\Bigg(\n \\frac{\\sqrt{2\\nu}}{l} d(x_i , x_j )\n \\Bigg)^\\nu K_\\nu\\Bigg(\n \\frac{\\sqrt{2\\nu}}{l} d(x_i , x_j )\\Bigg)\n\nwhere :math:`d(\\cdot,\\cdot)` is the Euclidean distance,\n:math:`K_{\\nu}(\\cdot)` is a modified Bessel function and\n:math:`\\Gamma(\\cdot)` is the gamma function.\nSee [1]_, Chapter 4, Section 4.2, for details regarding the different\nvariants of the Matern kernel.\n\nRead more in the :ref:`User Guide `.\n\n.. versionadded:: 0.18", "docstring": "Matern kernel.\n\n The class of Matern kernels is a generalization of the :class:`RBF`.\n It has an additional parameter :math:`\\nu` which controls the\n smoothness of the resulting function. The smaller :math:`\\nu`,\n the less smooth the approximated function is.\n As :math:`\\nu\\rightarrow\\infty`, the kernel becomes equivalent to\n the :class:`RBF` kernel. When :math:`\\nu = 1/2`, the Mat\u00e9rn kernel\n becomes identical to the absolute exponential kernel.\n Important intermediate values are\n :math:`\\nu=1.5` (once differentiable functions)\n and :math:`\\nu=2.5` (twice differentiable functions).\n\n The kernel is given by:\n\n .. math::\n k(x_i, x_j) = \\frac{1}{\\Gamma(\\nu)2^{\\nu-1}}\\Bigg(\n \\frac{\\sqrt{2\\nu}}{l} d(x_i , x_j )\n \\Bigg)^\\nu K_\\nu\\Bigg(\n \\frac{\\sqrt{2\\nu}}{l} d(x_i , x_j )\\Bigg)\n\n\n\n where :math:`d(\\cdot,\\cdot)` is the Euclidean distance,\n :math:`K_{\\nu}(\\cdot)` is a modified Bessel function and\n :math:`\\Gamma(\\cdot)` is the gamma function.\n See [1]_, Chapter 4, Section 4.2, for details regarding the different\n variants of the Matern kernel.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.18\n\n Parameters\n ----------\n length_scale : float or ndarray of shape (n_features,), default=1.0\n The length scale of the kernel. If a float, an isotropic kernel is\n used. If an array, an anisotropic kernel is used where each dimension\n of l defines the length-scale of the respective feature dimension.\n\n length_scale_bounds : pair of floats >= 0 or \"fixed\", default=(1e-5, 1e5)\n The lower and upper bound on 'length_scale'.\n If set to \"fixed\", 'length_scale' cannot be changed during\n hyperparameter tuning.\n\n nu : float, default=1.5\n The parameter nu controlling the smoothness of the learned function.\n The smaller nu, the less smooth the approximated function is.\n For nu=inf, the kernel becomes equivalent to the RBF kernel and for\n nu=0.5 to the absolute exponential kernel. Important intermediate\n values are nu=1.5 (once differentiable functions) and nu=2.5\n (twice differentiable functions). Note that values of nu not in\n [0.5, 1.5, 2.5, inf] incur a considerably higher computational cost\n (appr. 10 times higher) since they require to evaluate the modified\n Bessel function. 
Furthermore, in contrast to l, nu is kept fixed to\n its initial value and not optimized.\n\n References\n ----------\n .. [1] `Carl Edward Rasmussen, Christopher K. I. Williams (2006).\n \"Gaussian Processes for Machine Learning\". The MIT Press.\n `_\n\n Examples\n --------\n >>> from sklearn.datasets import load_iris\n >>> from sklearn.gaussian_process import GaussianProcessClassifier\n >>> from sklearn.gaussian_process.kernels import Matern\n >>> X, y = load_iris(return_X_y=True)\n >>> kernel = 1.0 * Matern(length_scale=1.0, nu=1.5)\n >>> gpc = GaussianProcessClassifier(kernel=kernel,\n ... random_state=0).fit(X, y)\n >>> gpc.score(X, y)\n 0.9866...\n >>> gpc.predict_proba(X[:2,:])\n array([[0.8513..., 0.0368..., 0.1117...],\n [0.8086..., 0.0693..., 0.1220...]])\n ", "source_code": "\n\nclass Matern(RBF):\n \"\"\"Matern kernel.\n\n The class of Matern kernels is a generalization of the :class:`RBF`.\n It has an additional parameter :math:`\\nu` which controls the\n smoothness of the resulting function. The smaller :math:`\\nu`,\n the less smooth the approximated function is.\n As :math:`\\nu\\rightarrow\\infty`, the kernel becomes equivalent to\n the :class:`RBF` kernel. When :math:`\\nu = 1/2`, the Mat\u00e9rn kernel\n becomes identical to the absolute exponential kernel.\n Important intermediate values are\n :math:`\\nu=1.5` (once differentiable functions)\n and :math:`\\nu=2.5` (twice differentiable functions).\n\n The kernel is given by:\n\n .. math::\n k(x_i, x_j) = \\frac{1}{\\Gamma(\\nu)2^{\\nu-1}}\\Bigg(\n \\frac{\\sqrt{2\\nu}}{l} d(x_i , x_j )\n \\Bigg)^\\nu K_\\nu\\Bigg(\n \\frac{\\sqrt{2\\nu}}{l} d(x_i , x_j )\\Bigg)\n\n\n\n where :math:`d(\\cdot,\\cdot)` is the Euclidean distance,\n :math:`K_{\\nu}(\\cdot)` is a modified Bessel function and\n :math:`\\Gamma(\\cdot)` is the gamma function.\n See [1]_, Chapter 4, Section 4.2, for details regarding the different\n variants of the Matern kernel.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.18\n\n Parameters\n ----------\n length_scale : float or ndarray of shape (n_features,), default=1.0\n The length scale of the kernel. If a float, an isotropic kernel is\n used. If an array, an anisotropic kernel is used where each dimension\n of l defines the length-scale of the respective feature dimension.\n\n length_scale_bounds : pair of floats >= 0 or \"fixed\", default=(1e-5, 1e5)\n The lower and upper bound on 'length_scale'.\n If set to \"fixed\", 'length_scale' cannot be changed during\n hyperparameter tuning.\n\n nu : float, default=1.5\n The parameter nu controlling the smoothness of the learned function.\n The smaller nu, the less smooth the approximated function is.\n For nu=inf, the kernel becomes equivalent to the RBF kernel and for\n nu=0.5 to the absolute exponential kernel. Important intermediate\n values are nu=1.5 (once differentiable functions) and nu=2.5\n (twice differentiable functions). Note that values of nu not in\n [0.5, 1.5, 2.5, inf] incur a considerably higher computational cost\n (appr. 10 times higher) since they require to evaluate the modified\n Bessel function. Furthermore, in contrast to l, nu is kept fixed to\n its initial value and not optimized.\n\n References\n ----------\n .. [1] `Carl Edward Rasmussen, Christopher K. I. Williams (2006).\n \"Gaussian Processes for Machine Learning\". 
The MIT Press.\n `_\n\n Examples\n --------\n >>> from sklearn.datasets import load_iris\n >>> from sklearn.gaussian_process import GaussianProcessClassifier\n >>> from sklearn.gaussian_process.kernels import Matern\n >>> X, y = load_iris(return_X_y=True)\n >>> kernel = 1.0 * Matern(length_scale=1.0, nu=1.5)\n >>> gpc = GaussianProcessClassifier(kernel=kernel,\n ... random_state=0).fit(X, y)\n >>> gpc.score(X, y)\n 0.9866...\n >>> gpc.predict_proba(X[:2,:])\n array([[0.8513..., 0.0368..., 0.1117...],\n [0.8086..., 0.0693..., 0.1220...]])\n \"\"\"\n \n def __init__(self, length_scale=1.0, length_scale_bounds=(1e-05, 100000.0), nu=1.5):\n super().__init__(length_scale, length_scale_bounds)\n self.nu = nu\n \n def __call__(self, X, Y=None, eval_gradient=False):\n \"\"\"Return the kernel k(X, Y) and optionally its gradient.\n\n Parameters\n ----------\n X : ndarray of shape (n_samples_X, n_features)\n Left argument of the returned kernel k(X, Y)\n\n Y : ndarray of shape (n_samples_Y, n_features), default=None\n Right argument of the returned kernel k(X, Y). If None, k(X, X)\n if evaluated instead.\n\n eval_gradient : bool, default=False\n Determines whether the gradient with respect to the log of\n the kernel hyperparameter is computed.\n Only supported when Y is None.\n\n Returns\n -------\n K : ndarray of shape (n_samples_X, n_samples_Y)\n Kernel k(X, Y)\n\n K_gradient : ndarray of shape (n_samples_X, n_samples_X, n_dims), optional\n The gradient of the kernel k(X, X) with respect to the log of the\n hyperparameter of the kernel. Only returned when `eval_gradient`\n is True.\n \"\"\"\n X = np.atleast_2d(X)\n length_scale = _check_length_scale(X, self.length_scale)\n if Y is None:\n dists = pdist(X / length_scale, metric='euclidean')\n else:\n if eval_gradient:\n raise ValueError('Gradient can only be evaluated when Y is None.')\n dists = cdist(X / length_scale, Y / length_scale, metric='euclidean')\n if self.nu == 0.5:\n K = np.exp(-dists)\n elif self.nu == 1.5:\n K = dists * math.sqrt(3)\n K = (1.0 + K) * np.exp(-K)\n elif self.nu == 2.5:\n K = dists * math.sqrt(5)\n K = (1.0 + K + K**2 / 3.0) * np.exp(-K)\n elif self.nu == np.inf:\n K = np.exp(-dists**2 / 2.0)\n else:\n K = dists\n K[K == 0.0] += np.finfo(float).eps\n tmp = math.sqrt(2 * self.nu) * K\n K.fill(2**(1.0 - self.nu) / gamma(self.nu))\n K *= tmp**self.nu\n K *= kv(self.nu, tmp)\n if Y is None:\n K = squareform(K)\n np.fill_diagonal(K, 1)\n if eval_gradient:\n if self.hyperparameter_length_scale.fixed:\n K_gradient = np.empty((X.shape[0], X.shape[0], 0))\n return K, K_gradient\n if self.anisotropic:\n D = (X[:, np.newaxis, :] - X[np.newaxis, :, :])**2 / length_scale**2\n else:\n D = squareform(dists**2)[:, :, np.newaxis]\n if self.nu == 0.5:\n denominator = np.sqrt(D.sum(axis=2))[:, :, np.newaxis]\n K_gradient = K[..., np.newaxis] * np.divide(D, denominator, where=denominator != 0)\n elif self.nu == 1.5:\n K_gradient = 3 * D * np.exp(-np.sqrt(3 * D.sum(-1)))[..., np.newaxis]\n elif self.nu == 2.5:\n tmp = np.sqrt(5 * D.sum(-1))[..., np.newaxis]\n K_gradient = 5.0 / 3.0 * D * (tmp + 1) * np.exp(-tmp)\n elif self.nu == np.inf:\n K_gradient = D * K[..., np.newaxis]\n else:\n \n def f(theta):\n return self.clone_with_theta(theta)(X, Y)\n return K, _approx_fprime(self.theta, f, 1e-10)\n if not self.anisotropic:\n return K, K_gradient[:, :].sum(-1)[:, :, np.newaxis]\n else:\n return K, K_gradient\n else:\n return K\n \n def __repr__(self):\n if self.anisotropic:\n return '{0}(length_scale=[{1}], 
nu={2:.3g})'.format(self.__class__.__name__, ', '.join(map('{0:.3g}'.format, self.length_scale)), self.nu)\n else:\n return '{0}(length_scale={1:.3g}, nu={2:.3g})'.format(self.__class__.__name__, np.ravel(self.length_scale)[0], self.nu)\n" }, @@ -22946,7 +23013,7 @@ "sklearn.gaussian_process.kernels.PairwiseKernel.__repr__" ], "is_public": true, - "description": "Wrapper for kernels in sklearn.metrics.pairwise.\n\nA thin wrapper around the functionality of the kernels in sklearn.metrics.pairwise. Note: Evaluation of eval_gradient is not analytic but numeric and all kernels support only isotropic distances. The parameter gamma is considered to be a hyperparameter and may be optimized. The other kernel parameters are set directly at initialization and are kept fixed. .. versionadded:: 0.18", + "description": "Wrapper for kernels in sklearn.metrics.pairwise.\n\nA thin wrapper around the functionality of the kernels in\nsklearn.metrics.pairwise.\n\nNote: Evaluation of eval_gradient is not analytic but numeric and all\n kernels support only isotropic distances. The parameter gamma is\n considered to be a hyperparameter and may be optimized. The other\n kernel parameters are set directly at initialization and are kept\n fixed.\n\n.. versionadded:: 0.18", "docstring": "Wrapper for kernels in sklearn.metrics.pairwise.\n\n A thin wrapper around the functionality of the kernels in\n sklearn.metrics.pairwise.\n\n Note: Evaluation of eval_gradient is not analytic but numeric and all\n kernels support only isotropic distances. The parameter gamma is\n considered to be a hyperparameter and may be optimized. The other\n kernel parameters are set directly at initialization and are kept\n fixed.\n\n .. versionadded:: 0.18\n\n Parameters\n ----------\n gamma : float, default=1.0\n Parameter gamma of the pairwise kernel specified by metric. It should\n be positive.\n\n gamma_bounds : pair of floats >= 0 or \"fixed\", default=(1e-5, 1e5)\n The lower and upper bound on 'gamma'.\n If set to \"fixed\", 'gamma' cannot be changed during\n hyperparameter tuning.\n\n metric : {\"linear\", \"additive_chi2\", \"chi2\", \"poly\", \"polynomial\", \"rbf\", \"laplacian\", \"sigmoid\", \"cosine\"} or callable, default=\"linear\"\n The metric to use when calculating kernel between instances in a\n feature array. If metric is a string, it must be one of the metrics\n in pairwise.PAIRWISE_KERNEL_FUNCTIONS.\n If metric is \"precomputed\", X is assumed to be a kernel matrix.\n Alternatively, if metric is a callable function, it is called on each\n pair of instances (rows) and the resulting value recorded. The callable\n should take two arrays from X as input and return a value indicating\n the distance between them.\n\n pairwise_kernels_kwargs : dict, default=None\n All entries of this dict (if any) are passed as keyword arguments to\n the pairwise kernel function.\n\n Examples\n --------\n >>> from sklearn.datasets import load_iris\n >>> from sklearn.gaussian_process import GaussianProcessClassifier\n >>> from sklearn.gaussian_process.kernels import PairwiseKernel\n >>> X, y = load_iris(return_X_y=True)\n >>> kernel = PairwiseKernel(metric='rbf')\n >>> gpc = GaussianProcessClassifier(kernel=kernel,\n ... 
random_state=0).fit(X, y)\n >>> gpc.score(X, y)\n 0.9733...\n >>> gpc.predict_proba(X[:2,:])\n array([[0.8880..., 0.05663..., 0.05532...],\n [0.8676..., 0.07073..., 0.06165...]])\n ", "source_code": "\n\nclass PairwiseKernel(Kernel):\n \"\"\"Wrapper for kernels in sklearn.metrics.pairwise.\n\n A thin wrapper around the functionality of the kernels in\n sklearn.metrics.pairwise.\n\n Note: Evaluation of eval_gradient is not analytic but numeric and all\n kernels support only isotropic distances. The parameter gamma is\n considered to be a hyperparameter and may be optimized. The other\n kernel parameters are set directly at initialization and are kept\n fixed.\n\n .. versionadded:: 0.18\n\n Parameters\n ----------\n gamma : float, default=1.0\n Parameter gamma of the pairwise kernel specified by metric. It should\n be positive.\n\n gamma_bounds : pair of floats >= 0 or \"fixed\", default=(1e-5, 1e5)\n The lower and upper bound on 'gamma'.\n If set to \"fixed\", 'gamma' cannot be changed during\n hyperparameter tuning.\n\n metric : {\"linear\", \"additive_chi2\", \"chi2\", \"poly\", \"polynomial\", \"rbf\", \"laplacian\", \"sigmoid\", \"cosine\"} or callable, default=\"linear\"\n The metric to use when calculating kernel between instances in a\n feature array. If metric is a string, it must be one of the metrics\n in pairwise.PAIRWISE_KERNEL_FUNCTIONS.\n If metric is \"precomputed\", X is assumed to be a kernel matrix.\n Alternatively, if metric is a callable function, it is called on each\n pair of instances (rows) and the resulting value recorded. The callable\n should take two arrays from X as input and return a value indicating\n the distance between them.\n\n pairwise_kernels_kwargs : dict, default=None\n All entries of this dict (if any) are passed as keyword arguments to\n the pairwise kernel function.\n\n Examples\n --------\n >>> from sklearn.datasets import load_iris\n >>> from sklearn.gaussian_process import GaussianProcessClassifier\n >>> from sklearn.gaussian_process.kernels import PairwiseKernel\n >>> X, y = load_iris(return_X_y=True)\n >>> kernel = PairwiseKernel(metric='rbf')\n >>> gpc = GaussianProcessClassifier(kernel=kernel,\n ... random_state=0).fit(X, y)\n >>> gpc.score(X, y)\n 0.9733...\n >>> gpc.predict_proba(X[:2,:])\n array([[0.8880..., 0.05663..., 0.05532...],\n [0.8676..., 0.07073..., 0.06165...]])\n \"\"\"\n \n def __init__(self, gamma=1.0, gamma_bounds=(1e-05, 100000.0), metric='linear', pairwise_kernels_kwargs=None):\n self.gamma = gamma\n self.gamma_bounds = gamma_bounds\n self.metric = metric\n self.pairwise_kernels_kwargs = pairwise_kernels_kwargs\n \n @property\n def hyperparameter_gamma(self):\n return Hyperparameter('gamma', 'numeric', self.gamma_bounds)\n \n def __call__(self, X, Y=None, eval_gradient=False):\n \"\"\"Return the kernel k(X, Y) and optionally its gradient.\n\n Parameters\n ----------\n X : ndarray of shape (n_samples_X, n_features)\n Left argument of the returned kernel k(X, Y)\n\n Y : ndarray of shape (n_samples_Y, n_features), default=None\n Right argument of the returned kernel k(X, Y). 
If None, k(X, X)\n if evaluated instead.\n\n eval_gradient : bool, default=False\n Determines whether the gradient with respect to the log of\n the kernel hyperparameter is computed.\n Only supported when Y is None.\n\n Returns\n -------\n K : ndarray of shape (n_samples_X, n_samples_Y)\n Kernel k(X, Y)\n\n K_gradient : ndarray of shape (n_samples_X, n_samples_X, n_dims), optional\n The gradient of the kernel k(X, X) with respect to the log of the\n hyperparameter of the kernel. Only returned when `eval_gradient`\n is True.\n \"\"\"\n pairwise_kernels_kwargs = self.pairwise_kernels_kwargs\n if self.pairwise_kernels_kwargs is None:\n pairwise_kernels_kwargs = {}\n X = np.atleast_2d(X)\n K = pairwise_kernels(X, Y, metric=self.metric, gamma=self.gamma, filter_params=True, **pairwise_kernels_kwargs)\n if eval_gradient:\n if self.hyperparameter_gamma.fixed:\n return K, np.empty((X.shape[0], X.shape[0], 0))\n else:\n \n def f(gamma):\n return pairwise_kernels(X, Y, metric=self.metric, gamma=np.exp(gamma), filter_params=True, **pairwise_kernels_kwargs)\n return K, _approx_fprime(self.theta, f, 1e-10)\n else:\n return K\n \n def diag(self, X):\n \"\"\"Returns the diagonal of the kernel k(X, X).\n\n The result of this method is identical to np.diag(self(X)); however,\n it can be evaluated more efficiently since only the diagonal is\n evaluated.\n\n Parameters\n ----------\n X : ndarray of shape (n_samples_X, n_features)\n Left argument of the returned kernel k(X, Y)\n\n Returns\n -------\n K_diag : ndarray of shape (n_samples_X,)\n Diagonal of kernel k(X, X)\n \"\"\"\n return np.apply_along_axis(self, 1, X).ravel()\n \n def is_stationary(self):\n \"\"\"Returns whether the kernel is stationary.\"\"\"\n return self.metric in ['rbf']\n \n def __repr__(self):\n return '{0}(gamma={1}, metric={2})'.format(self.__class__.__name__, self.gamma, self.metric)\n" }, @@ -22961,7 +23028,7 @@ "sklearn.gaussian_process.kernels.Product.__repr__" ], "is_public": true, - "description": "The `Product` kernel takes two kernels :math:`k_1` and :math:`k_2` and combines them via\n\n.. math:: k_{prod}(X, Y) = k_1(X, Y) * k_2(X, Y) Note that the `__mul__` magic method is overridden, so `Product(RBF(), RBF())` is equivalent to using the * operator with `RBF() * RBF()`. Read more in the :ref:`User Guide `. .. versionadded:: 0.18", + "description": "The `Product` kernel takes two kernels :math:`k_1` and :math:`k_2`\nand combines them via\n\n.. math::\n k_{prod}(X, Y) = k_1(X, Y) * k_2(X, Y)\n\nNote that the `__mul__` magic method is overridden, so\n`Product(RBF(), RBF())` is equivalent to using the * operator\nwith `RBF() * RBF()`.\n\nRead more in the :ref:`User Guide `.\n\n.. versionadded:: 0.18", "docstring": "The `Product` kernel takes two kernels :math:`k_1` and :math:`k_2`\n and combines them via\n\n .. math::\n k_{prod}(X, Y) = k_1(X, Y) * k_2(X, Y)\n\n Note that the `__mul__` magic method is overridden, so\n `Product(RBF(), RBF())` is equivalent to using the * operator\n with `RBF() * RBF()`.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.18\n\n Parameters\n ----------\n k1 : Kernel\n The first base-kernel of the product-kernel\n\n k2 : Kernel\n The second base-kernel of the product-kernel\n\n\n Examples\n --------\n >>> from sklearn.datasets import make_friedman2\n >>> from sklearn.gaussian_process import GaussianProcessRegressor\n >>> from sklearn.gaussian_process.kernels import (RBF, Product,\n ... 
ConstantKernel)\n >>> X, y = make_friedman2(n_samples=500, noise=0, random_state=0)\n >>> kernel = Product(ConstantKernel(2), RBF())\n >>> gpr = GaussianProcessRegressor(kernel=kernel,\n ... random_state=0).fit(X, y)\n >>> gpr.score(X, y)\n 1.0\n >>> kernel\n 1.41**2 * RBF(length_scale=1)\n ", "source_code": "\n\nclass Product(KernelOperator):\n \"\"\"The `Product` kernel takes two kernels :math:`k_1` and :math:`k_2`\n and combines them via\n\n .. math::\n k_{prod}(X, Y) = k_1(X, Y) * k_2(X, Y)\n\n Note that the `__mul__` magic method is overridden, so\n `Product(RBF(), RBF())` is equivalent to using the * operator\n with `RBF() * RBF()`.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.18\n\n Parameters\n ----------\n k1 : Kernel\n The first base-kernel of the product-kernel\n\n k2 : Kernel\n The second base-kernel of the product-kernel\n\n\n Examples\n --------\n >>> from sklearn.datasets import make_friedman2\n >>> from sklearn.gaussian_process import GaussianProcessRegressor\n >>> from sklearn.gaussian_process.kernels import (RBF, Product,\n ... ConstantKernel)\n >>> X, y = make_friedman2(n_samples=500, noise=0, random_state=0)\n >>> kernel = Product(ConstantKernel(2), RBF())\n >>> gpr = GaussianProcessRegressor(kernel=kernel,\n ... random_state=0).fit(X, y)\n >>> gpr.score(X, y)\n 1.0\n >>> kernel\n 1.41**2 * RBF(length_scale=1)\n \"\"\"\n \n def __call__(self, X, Y=None, eval_gradient=False):\n \"\"\"Return the kernel k(X, Y) and optionally its gradient.\n\n Parameters\n ----------\n X : array-like of shape (n_samples_X, n_features) or list of object\n Left argument of the returned kernel k(X, Y)\n\n Y : array-like of shape (n_samples_Y, n_features) or list of object, default=None\n Right argument of the returned kernel k(X, Y). If None, k(X, X)\n is evaluated instead.\n\n eval_gradient : bool, default=False\n Determines whether the gradient with respect to the log of\n the kernel hyperparameter is computed.\n\n Returns\n -------\n K : ndarray of shape (n_samples_X, n_samples_Y)\n Kernel k(X, Y)\n\n K_gradient : ndarray of shape (n_samples_X, n_samples_X, n_dims), optional\n The gradient of the kernel k(X, X) with respect to the log of the\n hyperparameter of the kernel. Only returned when `eval_gradient`\n is True.\n \"\"\"\n if eval_gradient:\n (K1, K1_gradient) = self.k1(X, Y, eval_gradient=True)\n (K2, K2_gradient) = self.k2(X, Y, eval_gradient=True)\n return K1 * K2, np.dstack((K1_gradient * K2[:, :, np.newaxis], K2_gradient * K1[:, :, np.newaxis]))\n else:\n return self.k1(X, Y) * self.k2(X, Y)\n \n def diag(self, X):\n \"\"\"Returns the diagonal of the kernel k(X, X).\n\n The result of this method is identical to np.diag(self(X)); however,\n it can be evaluated more efficiently since only the diagonal is\n evaluated.\n\n Parameters\n ----------\n X : array-like of shape (n_samples_X, n_features) or list of object\n Argument to the kernel.\n\n Returns\n -------\n K_diag : ndarray of shape (n_samples_X,)\n Diagonal of kernel k(X, X)\n \"\"\"\n return self.k1.diag(X) * self.k2.diag(X)\n \n def __repr__(self):\n return '{0} * {1}'.format(self.k1, self.k2)\n" }, @@ -22982,7 +23049,7 @@ "sklearn.gaussian_process.kernels.RBF.__repr__" ], "is_public": true, - "description": "Radial-basis function kernel (aka squared-exponential kernel).\n\nThe RBF kernel is a stationary kernel. It is also known as the \"squared exponential\" kernel. 
It is parameterized by a length scale parameter :math:`l>0`, which can either be a scalar (isotropic variant of the kernel) or a vector with the same number of dimensions as the inputs X (anisotropic variant of the kernel). The kernel is given by: .. math:: k(x_i, x_j) = \\exp\\left(- \\frac{d(x_i, x_j)^2}{2l^2} \\right) where :math:`l` is the length scale of the kernel and :math:`d(\\cdot,\\cdot)` is the Euclidean distance. For advice on how to set the length scale parameter, see e.g. [1]_. This kernel is infinitely differentiable, which implies that GPs with this kernel as covariance function have mean square derivatives of all orders, and are thus very smooth. See [2]_, Chapter 4, Section 4.2, for further details of the RBF kernel. Read more in the :ref:`User Guide `. .. versionadded:: 0.18", + "description": "Radial-basis function kernel (aka squared-exponential kernel).\n\nThe RBF kernel is a stationary kernel. It is also known as the\n\"squared exponential\" kernel. It is parameterized by a length scale\nparameter :math:`l>0`, which can either be a scalar (isotropic variant\nof the kernel) or a vector with the same number of dimensions as the inputs\nX (anisotropic variant of the kernel). The kernel is given by:\n\n.. math::\n k(x_i, x_j) = \\exp\\left(- \\frac{d(x_i, x_j)^2}{2l^2} \\right)\n\nwhere :math:`l` is the length scale of the kernel and\n:math:`d(\\cdot,\\cdot)` is the Euclidean distance.\nFor advice on how to set the length scale parameter, see e.g. [1]_.\n\nThis kernel is infinitely differentiable, which implies that GPs with this\nkernel as covariance function have mean square derivatives of all orders,\nand are thus very smooth.\nSee [2]_, Chapter 4, Section 4.2, for further details of the RBF kernel.\n\nRead more in the :ref:`User Guide `.\n\n.. versionadded:: 0.18", "docstring": "Radial-basis function kernel (aka squared-exponential kernel).\n\n The RBF kernel is a stationary kernel. It is also known as the\n \"squared exponential\" kernel. It is parameterized by a length scale\n parameter :math:`l>0`, which can either be a scalar (isotropic variant\n of the kernel) or a vector with the same number of dimensions as the inputs\n X (anisotropic variant of the kernel). The kernel is given by:\n\n .. math::\n k(x_i, x_j) = \\exp\\left(- \\frac{d(x_i, x_j)^2}{2l^2} \\right)\n\n where :math:`l` is the length scale of the kernel and\n :math:`d(\\cdot,\\cdot)` is the Euclidean distance.\n For advice on how to set the length scale parameter, see e.g. [1]_.\n\n This kernel is infinitely differentiable, which implies that GPs with this\n kernel as covariance function have mean square derivatives of all orders,\n and are thus very smooth.\n See [2]_, Chapter 4, Section 4.2, for further details of the RBF kernel.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.18\n\n Parameters\n ----------\n length_scale : float or ndarray of shape (n_features,), default=1.0\n The length scale of the kernel. If a float, an isotropic kernel is\n used. If an array, an anisotropic kernel is used where each dimension\n of l defines the length-scale of the respective feature dimension.\n\n length_scale_bounds : pair of floats >= 0 or \"fixed\", default=(1e-5, 1e5)\n The lower and upper bound on 'length_scale'.\n If set to \"fixed\", 'length_scale' cannot be changed during\n hyperparameter tuning.\n\n References\n ----------\n .. [1] `David Duvenaud (2014). \"The Kernel Cookbook:\n Advice on Covariance functions\".\n `_\n\n .. [2] `Carl Edward Rasmussen, Christopher K. I. 
Williams (2006).\n \"Gaussian Processes for Machine Learning\". The MIT Press.\n `_\n\n Examples\n --------\n >>> from sklearn.datasets import load_iris\n >>> from sklearn.gaussian_process import GaussianProcessClassifier\n >>> from sklearn.gaussian_process.kernels import RBF\n >>> X, y = load_iris(return_X_y=True)\n >>> kernel = 1.0 * RBF(1.0)\n >>> gpc = GaussianProcessClassifier(kernel=kernel,\n ... random_state=0).fit(X, y)\n >>> gpc.score(X, y)\n 0.9866...\n >>> gpc.predict_proba(X[:2,:])\n array([[0.8354..., 0.03228..., 0.1322...],\n [0.7906..., 0.0652..., 0.1441...]])\n ", "source_code": "\n\nclass RBF(StationaryKernelMixin, NormalizedKernelMixin, Kernel):\n \"\"\"Radial-basis function kernel (aka squared-exponential kernel).\n\n The RBF kernel is a stationary kernel. It is also known as the\n \"squared exponential\" kernel. It is parameterized by a length scale\n parameter :math:`l>0`, which can either be a scalar (isotropic variant\n of the kernel) or a vector with the same number of dimensions as the inputs\n X (anisotropic variant of the kernel). The kernel is given by:\n\n .. math::\n k(x_i, x_j) = \\exp\\left(- \\frac{d(x_i, x_j)^2}{2l^2} \\right)\n\n where :math:`l` is the length scale of the kernel and\n :math:`d(\\cdot,\\cdot)` is the Euclidean distance.\n For advice on how to set the length scale parameter, see e.g. [1]_.\n\n This kernel is infinitely differentiable, which implies that GPs with this\n kernel as covariance function have mean square derivatives of all orders,\n and are thus very smooth.\n See [2]_, Chapter 4, Section 4.2, for further details of the RBF kernel.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.18\n\n Parameters\n ----------\n length_scale : float or ndarray of shape (n_features,), default=1.0\n The length scale of the kernel. If a float, an isotropic kernel is\n used. If an array, an anisotropic kernel is used where each dimension\n of l defines the length-scale of the respective feature dimension.\n\n length_scale_bounds : pair of floats >= 0 or \"fixed\", default=(1e-5, 1e5)\n The lower and upper bound on 'length_scale'.\n If set to \"fixed\", 'length_scale' cannot be changed during\n hyperparameter tuning.\n\n References\n ----------\n .. [1] `David Duvenaud (2014). \"The Kernel Cookbook:\n Advice on Covariance functions\".\n `_\n\n .. [2] `Carl Edward Rasmussen, Christopher K. I. Williams (2006).\n \"Gaussian Processes for Machine Learning\". The MIT Press.\n `_\n\n Examples\n --------\n >>> from sklearn.datasets import load_iris\n >>> from sklearn.gaussian_process import GaussianProcessClassifier\n >>> from sklearn.gaussian_process.kernels import RBF\n >>> X, y = load_iris(return_X_y=True)\n >>> kernel = 1.0 * RBF(1.0)\n >>> gpc = GaussianProcessClassifier(kernel=kernel,\n ... 
random_state=0).fit(X, y)\n >>> gpc.score(X, y)\n 0.9866...\n >>> gpc.predict_proba(X[:2,:])\n array([[0.8354..., 0.03228..., 0.1322...],\n [0.7906..., 0.0652..., 0.1441...]])\n \"\"\"\n \n def __init__(self, length_scale=1.0, length_scale_bounds=(1e-05, 100000.0)):\n self.length_scale = length_scale\n self.length_scale_bounds = length_scale_bounds\n \n @property\n def anisotropic(self):\n return np.iterable(self.length_scale) and len(self.length_scale) > 1\n \n @property\n def hyperparameter_length_scale(self):\n if self.anisotropic:\n return Hyperparameter('length_scale', 'numeric', self.length_scale_bounds, len(self.length_scale))\n return Hyperparameter('length_scale', 'numeric', self.length_scale_bounds)\n \n def __call__(self, X, Y=None, eval_gradient=False):\n \"\"\"Return the kernel k(X, Y) and optionally its gradient.\n\n Parameters\n ----------\n X : ndarray of shape (n_samples_X, n_features)\n Left argument of the returned kernel k(X, Y)\n\n Y : ndarray of shape (n_samples_Y, n_features), default=None\n Right argument of the returned kernel k(X, Y). If None, k(X, X)\n if evaluated instead.\n\n eval_gradient : bool, default=False\n Determines whether the gradient with respect to the log of\n the kernel hyperparameter is computed.\n Only supported when Y is None.\n\n Returns\n -------\n K : ndarray of shape (n_samples_X, n_samples_Y)\n Kernel k(X, Y)\n\n K_gradient : ndarray of shape (n_samples_X, n_samples_X, n_dims), optional\n The gradient of the kernel k(X, X) with respect to the log of the\n hyperparameter of the kernel. Only returned when `eval_gradient`\n is True.\n \"\"\"\n X = np.atleast_2d(X)\n length_scale = _check_length_scale(X, self.length_scale)\n if Y is None:\n dists = pdist(X / length_scale, metric='sqeuclidean')\n K = np.exp(-0.5 * dists)\n K = squareform(K)\n np.fill_diagonal(K, 1)\n else:\n if eval_gradient:\n raise ValueError('Gradient can only be evaluated when Y is None.')\n dists = cdist(X / length_scale, Y / length_scale, metric='sqeuclidean')\n K = np.exp(-0.5 * dists)\n if eval_gradient:\n if self.hyperparameter_length_scale.fixed:\n return K, np.empty((X.shape[0], X.shape[0], 0))\n elif not self.anisotropic or length_scale.shape[0] == 1:\n K_gradient = (K * squareform(dists))[:, :, np.newaxis]\n return K, K_gradient\n elif self.anisotropic:\n K_gradient = (X[:, np.newaxis, :] - X[np.newaxis, :, :])**2 / length_scale**2\n K_gradient *= K[..., np.newaxis]\n return K, K_gradient\n else:\n return K\n \n def __repr__(self):\n if self.anisotropic:\n return '{0}(length_scale=[{1}])'.format(self.__class__.__name__, ', '.join(map('{0:.3g}'.format, self.length_scale)))\n else:\n return '{0}(length_scale={1:.3g})'.format(self.__class__.__name__, np.ravel(self.length_scale)[0])\n" }, @@ -23003,7 +23070,7 @@ "sklearn.gaussian_process.kernels.RationalQuadratic.__repr__" ], "is_public": true, - "description": "Rational Quadratic kernel.\n\nThe RationalQuadratic kernel can be seen as a scale mixture (an infinite sum) of RBF kernels with different characteristic length scales. It is parameterized by a length scale parameter :math:`l>0` and a scale mixture parameter :math:`\\alpha>0`. Only the isotropic variant where length_scale :math:`l` is a scalar is supported at the moment. The kernel is given by: .. math:: k(x_i, x_j) = \\left( 1 + \\frac{d(x_i, x_j)^2 }{ 2\\alpha l^2}\\right)^{-\\alpha} where :math:`\\alpha` is the scale mixture parameter, :math:`l` is the length scale of the kernel and :math:`d(\\cdot,\\cdot)` is the Euclidean distance. 
For advice on how to set the parameters, see e.g. [1]_. Read more in the :ref:`User Guide `. .. versionadded:: 0.18", + "description": "Rational Quadratic kernel.\n\nThe RationalQuadratic kernel can be seen as a scale mixture (an infinite\nsum) of RBF kernels with different characteristic length scales. It is\nparameterized by a length scale parameter :math:`l>0` and a scale\nmixture parameter :math:`\\alpha>0`. Only the isotropic variant\nwhere length_scale :math:`l` is a scalar is supported at the moment.\nThe kernel is given by:\n\n.. math::\n k(x_i, x_j) = \\left(\n 1 + \\frac{d(x_i, x_j)^2 }{ 2\\alpha l^2}\\right)^{-\\alpha}\n\nwhere :math:`\\alpha` is the scale mixture parameter, :math:`l` is\nthe length scale of the kernel and :math:`d(\\cdot,\\cdot)` is the\nEuclidean distance.\nFor advice on how to set the parameters, see e.g. [1]_.\n\nRead more in the :ref:`User Guide `.\n\n.. versionadded:: 0.18", "docstring": "Rational Quadratic kernel.\n\n The RationalQuadratic kernel can be seen as a scale mixture (an infinite\n sum) of RBF kernels with different characteristic length scales. It is\n parameterized by a length scale parameter :math:`l>0` and a scale\n mixture parameter :math:`\\alpha>0`. Only the isotropic variant\n where length_scale :math:`l` is a scalar is supported at the moment.\n The kernel is given by:\n\n .. math::\n k(x_i, x_j) = \\left(\n 1 + \\frac{d(x_i, x_j)^2 }{ 2\\alpha l^2}\\right)^{-\\alpha}\n\n where :math:`\\alpha` is the scale mixture parameter, :math:`l` is\n the length scale of the kernel and :math:`d(\\cdot,\\cdot)` is the\n Euclidean distance.\n For advice on how to set the parameters, see e.g. [1]_.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.18\n\n Parameters\n ----------\n length_scale : float > 0, default=1.0\n The length scale of the kernel.\n\n alpha : float > 0, default=1.0\n Scale mixture parameter\n\n length_scale_bounds : pair of floats >= 0 or \"fixed\", default=(1e-5, 1e5)\n The lower and upper bound on 'length_scale'.\n If set to \"fixed\", 'length_scale' cannot be changed during\n hyperparameter tuning.\n\n alpha_bounds : pair of floats >= 0 or \"fixed\", default=(1e-5, 1e5)\n The lower and upper bound on 'alpha'.\n If set to \"fixed\", 'alpha' cannot be changed during\n hyperparameter tuning.\n\n References\n ----------\n .. [1] `David Duvenaud (2014). \"The Kernel Cookbook:\n Advice on Covariance functions\".\n `_\n\n Examples\n --------\n >>> from sklearn.datasets import load_iris\n >>> from sklearn.gaussian_process import GaussianProcessClassifier\n >>> from sklearn.gaussian_process.kernels import RationalQuadratic\n >>> X, y = load_iris(return_X_y=True)\n >>> kernel = RationalQuadratic(length_scale=1.0, alpha=1.5)\n >>> gpc = GaussianProcessClassifier(kernel=kernel,\n ... random_state=0).fit(X, y)\n >>> gpc.score(X, y)\n 0.9733...\n >>> gpc.predict_proba(X[:2,:])\n array([[0.8881..., 0.0566..., 0.05518...],\n [0.8678..., 0.0707... , 0.0614...]])\n ", "source_code": "\n\nclass RationalQuadratic(StationaryKernelMixin, NormalizedKernelMixin, Kernel):\n \"\"\"Rational Quadratic kernel.\n\n The RationalQuadratic kernel can be seen as a scale mixture (an infinite\n sum) of RBF kernels with different characteristic length scales. It is\n parameterized by a length scale parameter :math:`l>0` and a scale\n mixture parameter :math:`\\alpha>0`. Only the isotropic variant\n where length_scale :math:`l` is a scalar is supported at the moment.\n The kernel is given by:\n\n .. 
math::\n k(x_i, x_j) = \\left(\n 1 + \\frac{d(x_i, x_j)^2 }{ 2\\alpha l^2}\\right)^{-\\alpha}\n\n where :math:`\\alpha` is the scale mixture parameter, :math:`l` is\n the length scale of the kernel and :math:`d(\\cdot,\\cdot)` is the\n Euclidean distance.\n For advice on how to set the parameters, see e.g. [1]_.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.18\n\n Parameters\n ----------\n length_scale : float > 0, default=1.0\n The length scale of the kernel.\n\n alpha : float > 0, default=1.0\n Scale mixture parameter\n\n length_scale_bounds : pair of floats >= 0 or \"fixed\", default=(1e-5, 1e5)\n The lower and upper bound on 'length_scale'.\n If set to \"fixed\", 'length_scale' cannot be changed during\n hyperparameter tuning.\n\n alpha_bounds : pair of floats >= 0 or \"fixed\", default=(1e-5, 1e5)\n The lower and upper bound on 'alpha'.\n If set to \"fixed\", 'alpha' cannot be changed during\n hyperparameter tuning.\n\n References\n ----------\n .. [1] `David Duvenaud (2014). \"The Kernel Cookbook:\n Advice on Covariance functions\".\n `_\n\n Examples\n --------\n >>> from sklearn.datasets import load_iris\n >>> from sklearn.gaussian_process import GaussianProcessClassifier\n >>> from sklearn.gaussian_process.kernels import RationalQuadratic\n >>> X, y = load_iris(return_X_y=True)\n >>> kernel = RationalQuadratic(length_scale=1.0, alpha=1.5)\n >>> gpc = GaussianProcessClassifier(kernel=kernel,\n ... random_state=0).fit(X, y)\n >>> gpc.score(X, y)\n 0.9733...\n >>> gpc.predict_proba(X[:2,:])\n array([[0.8881..., 0.0566..., 0.05518...],\n [0.8678..., 0.0707... , 0.0614...]])\n \"\"\"\n \n def __init__(self, length_scale=1.0, alpha=1.0, length_scale_bounds=(1e-05, 100000.0), alpha_bounds=(1e-05, 100000.0)):\n self.length_scale = length_scale\n self.alpha = alpha\n self.length_scale_bounds = length_scale_bounds\n self.alpha_bounds = alpha_bounds\n \n @property\n def hyperparameter_length_scale(self):\n return Hyperparameter('length_scale', 'numeric', self.length_scale_bounds)\n \n @property\n def hyperparameter_alpha(self):\n return Hyperparameter('alpha', 'numeric', self.alpha_bounds)\n \n def __call__(self, X, Y=None, eval_gradient=False):\n \"\"\"Return the kernel k(X, Y) and optionally its gradient.\n\n Parameters\n ----------\n X : ndarray of shape (n_samples_X, n_features)\n Left argument of the returned kernel k(X, Y)\n\n Y : ndarray of shape (n_samples_Y, n_features), default=None\n Right argument of the returned kernel k(X, Y). If None, k(X, X)\n if evaluated instead.\n\n eval_gradient : bool, default=False\n Determines whether the gradient with respect to the log of\n the kernel hyperparameter is computed.\n Only supported when Y is None.\n\n Returns\n -------\n K : ndarray of shape (n_samples_X, n_samples_Y)\n Kernel k(X, Y)\n\n K_gradient : ndarray of shape (n_samples_X, n_samples_X, n_dims)\n The gradient of the kernel k(X, X) with respect to the log of the\n hyperparameter of the kernel. 
Only returned when eval_gradient\n is True.\n \"\"\"\n if len(np.atleast_1d(self.length_scale)) > 1:\n raise AttributeError('RationalQuadratic kernel only supports isotropic version, please use a single scalar for length_scale')\n X = np.atleast_2d(X)\n if Y is None:\n dists = squareform(pdist(X, metric='sqeuclidean'))\n tmp = dists / (2 * self.alpha * self.length_scale**2)\n base = 1 + tmp\n K = base**(-self.alpha)\n np.fill_diagonal(K, 1)\n else:\n if eval_gradient:\n raise ValueError('Gradient can only be evaluated when Y is None.')\n dists = cdist(X, Y, metric='sqeuclidean')\n K = (1 + dists / (2 * self.alpha * self.length_scale**2))**(-self.alpha)\n if eval_gradient:\n if not self.hyperparameter_length_scale.fixed:\n length_scale_gradient = dists * K / (self.length_scale**2 * base)\n length_scale_gradient = length_scale_gradient[:, :, np.newaxis]\n else:\n length_scale_gradient = np.empty((K.shape[0], K.shape[1], 0))\n if not self.hyperparameter_alpha.fixed:\n alpha_gradient = K * (-self.alpha * np.log(base) + dists / (2 * self.length_scale**2 * base))\n alpha_gradient = alpha_gradient[:, :, np.newaxis]\n else:\n alpha_gradient = np.empty((K.shape[0], K.shape[1], 0))\n return K, np.dstack((alpha_gradient, length_scale_gradient))\n else:\n return K\n \n def __repr__(self):\n return '{0}(alpha={1:.3g}, length_scale={2:.3g})'.format(self.__class__.__name__, self.alpha, self.length_scale)\n" }, @@ -23031,7 +23098,7 @@ "sklearn.gaussian_process.kernels.Sum.__repr__" ], "is_public": true, - "description": "The `Sum` kernel takes two kernels :math:`k_1` and :math:`k_2` and combines them via\n\n.. math:: k_{sum}(X, Y) = k_1(X, Y) + k_2(X, Y) Note that the `__add__` magic method is overridden, so `Sum(RBF(), RBF())` is equivalent to using the + operator with `RBF() + RBF()`. Read more in the :ref:`User Guide `. .. versionadded:: 0.18", + "description": "The `Sum` kernel takes two kernels :math:`k_1` and :math:`k_2`\nand combines them via\n\n.. math::\n k_{sum}(X, Y) = k_1(X, Y) + k_2(X, Y)\n\nNote that the `__add__` magic method is overridden, so\n`Sum(RBF(), RBF())` is equivalent to using the + operator\nwith `RBF() + RBF()`.\n\nRead more in the :ref:`User Guide `.\n\n.. versionadded:: 0.18", "docstring": "The `Sum` kernel takes two kernels :math:`k_1` and :math:`k_2`\n and combines them via\n\n .. math::\n k_{sum}(X, Y) = k_1(X, Y) + k_2(X, Y)\n\n Note that the `__add__` magic method is overridden, so\n `Sum(RBF(), RBF())` is equivalent to using the + operator\n with `RBF() + RBF()`.\n\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.18\n\n Parameters\n ----------\n k1 : Kernel\n The first base-kernel of the sum-kernel\n\n k2 : Kernel\n The second base-kernel of the sum-kernel\n\n Examples\n --------\n >>> from sklearn.datasets import make_friedman2\n >>> from sklearn.gaussian_process import GaussianProcessRegressor\n >>> from sklearn.gaussian_process.kernels import RBF, Sum, ConstantKernel\n >>> X, y = make_friedman2(n_samples=500, noise=0, random_state=0)\n >>> kernel = Sum(ConstantKernel(2), RBF())\n >>> gpr = GaussianProcessRegressor(kernel=kernel,\n ... random_state=0).fit(X, y)\n >>> gpr.score(X, y)\n 1.0\n >>> kernel\n 1.41**2 + RBF(length_scale=1)\n ", "source_code": "\n\nclass Sum(KernelOperator):\n \"\"\"The `Sum` kernel takes two kernels :math:`k_1` and :math:`k_2`\n and combines them via\n\n .. 
math::\n k_{sum}(X, Y) = k_1(X, Y) + k_2(X, Y)\n\n Note that the `__add__` magic method is overridden, so\n `Sum(RBF(), RBF())` is equivalent to using the + operator\n with `RBF() + RBF()`.\n\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.18\n\n Parameters\n ----------\n k1 : Kernel\n The first base-kernel of the sum-kernel\n\n k2 : Kernel\n The second base-kernel of the sum-kernel\n\n Examples\n --------\n >>> from sklearn.datasets import make_friedman2\n >>> from sklearn.gaussian_process import GaussianProcessRegressor\n >>> from sklearn.gaussian_process.kernels import RBF, Sum, ConstantKernel\n >>> X, y = make_friedman2(n_samples=500, noise=0, random_state=0)\n >>> kernel = Sum(ConstantKernel(2), RBF())\n >>> gpr = GaussianProcessRegressor(kernel=kernel,\n ... random_state=0).fit(X, y)\n >>> gpr.score(X, y)\n 1.0\n >>> kernel\n 1.41**2 + RBF(length_scale=1)\n \"\"\"\n \n def __call__(self, X, Y=None, eval_gradient=False):\n \"\"\"Return the kernel k(X, Y) and optionally its gradient.\n\n Parameters\n ----------\n X : array-like of shape (n_samples_X, n_features) or list of object\n Left argument of the returned kernel k(X, Y)\n\n Y : array-like of shape (n_samples_X, n_features) or list of object, default=None\n Right argument of the returned kernel k(X, Y). If None, k(X, X)\n is evaluated instead.\n\n eval_gradient : bool, default=False\n Determines whether the gradient with respect to the log of\n the kernel hyperparameter is computed.\n\n Returns\n -------\n K : ndarray of shape (n_samples_X, n_samples_Y)\n Kernel k(X, Y)\n\n K_gradient : ndarray of shape (n_samples_X, n_samples_X, n_dims), optional\n The gradient of the kernel k(X, X) with respect to the log of the\n hyperparameter of the kernel. Only returned when `eval_gradient`\n is True.\n \"\"\"\n if eval_gradient:\n (K1, K1_gradient) = self.k1(X, Y, eval_gradient=True)\n (K2, K2_gradient) = self.k2(X, Y, eval_gradient=True)\n return K1 + K2, np.dstack((K1_gradient, K2_gradient))\n else:\n return self.k1(X, Y) + self.k2(X, Y)\n \n def diag(self, X):\n \"\"\"Returns the diagonal of the kernel k(X, X).\n\n The result of this method is identical to `np.diag(self(X))`; however,\n it can be evaluated more efficiently since only the diagonal is\n evaluated.\n\n Parameters\n ----------\n X : array-like of shape (n_samples_X, n_features) or list of object\n Argument to the kernel.\n\n Returns\n -------\n K_diag : ndarray of shape (n_samples_X,)\n Diagonal of kernel k(X, X)\n \"\"\"\n return self.k1.diag(X) + self.k2.diag(X)\n \n def __repr__(self):\n return '{0} + {1}'.format(self.k1, self.k2)\n" }, @@ -23052,7 +23119,7 @@ "sklearn.gaussian_process.kernels.WhiteKernel.__repr__" ], "is_public": true, - "description": "White kernel.\n\nThe main use-case of this kernel is as part of a sum-kernel where it explains the noise of the signal as independently and identically normally-distributed. The parameter noise_level equals the variance of this noise. .. math:: k(x_1, x_2) = noise\\_level \\text{ if } x_i == x_j \\text{ else } 0 Read more in the :ref:`User Guide `. .. versionadded:: 0.18", + "description": "White kernel.\n\nThe main use-case of this kernel is as part of a sum-kernel where it\nexplains the noise of the signal as independently and identically\nnormally-distributed. The parameter noise_level equals the variance of this\nnoise.\n\n.. math::\n k(x_1, x_2) = noise\\_level \\text{ if } x_i == x_j \\text{ else } 0\n\nRead more in the :ref:`User Guide `.\n\n.. 
versionadded:: 0.18", "docstring": "White kernel.\n\n The main use-case of this kernel is as part of a sum-kernel where it\n explains the noise of the signal as independently and identically\n normally-distributed. The parameter noise_level equals the variance of this\n noise.\n\n .. math::\n k(x_1, x_2) = noise\\_level \\text{ if } x_i == x_j \\text{ else } 0\n\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.18\n\n Parameters\n ----------\n noise_level : float, default=1.0\n Parameter controlling the noise level (variance)\n\n noise_level_bounds : pair of floats >= 0 or \"fixed\", default=(1e-5, 1e5)\n The lower and upper bound on 'noise_level'.\n If set to \"fixed\", 'noise_level' cannot be changed during\n hyperparameter tuning.\n\n Examples\n --------\n >>> from sklearn.datasets import make_friedman2\n >>> from sklearn.gaussian_process import GaussianProcessRegressor\n >>> from sklearn.gaussian_process.kernels import DotProduct, WhiteKernel\n >>> X, y = make_friedman2(n_samples=500, noise=0, random_state=0)\n >>> kernel = DotProduct() + WhiteKernel(noise_level=0.5)\n >>> gpr = GaussianProcessRegressor(kernel=kernel,\n ... random_state=0).fit(X, y)\n >>> gpr.score(X, y)\n 0.3680...\n >>> gpr.predict(X[:2,:], return_std=True)\n (array([653.0..., 592.1... ]), array([316.6..., 316.6...]))\n ", "source_code": "\n\nclass WhiteKernel(StationaryKernelMixin, GenericKernelMixin, Kernel):\n \"\"\"White kernel.\n\n The main use-case of this kernel is as part of a sum-kernel where it\n explains the noise of the signal as independently and identically\n normally-distributed. The parameter noise_level equals the variance of this\n noise.\n\n .. math::\n k(x_1, x_2) = noise\\_level \\text{ if } x_i == x_j \\text{ else } 0\n\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.18\n\n Parameters\n ----------\n noise_level : float, default=1.0\n Parameter controlling the noise level (variance)\n\n noise_level_bounds : pair of floats >= 0 or \"fixed\", default=(1e-5, 1e5)\n The lower and upper bound on 'noise_level'.\n If set to \"fixed\", 'noise_level' cannot be changed during\n hyperparameter tuning.\n\n Examples\n --------\n >>> from sklearn.datasets import make_friedman2\n >>> from sklearn.gaussian_process import GaussianProcessRegressor\n >>> from sklearn.gaussian_process.kernels import DotProduct, WhiteKernel\n >>> X, y = make_friedman2(n_samples=500, noise=0, random_state=0)\n >>> kernel = DotProduct() + WhiteKernel(noise_level=0.5)\n >>> gpr = GaussianProcessRegressor(kernel=kernel,\n ... random_state=0).fit(X, y)\n >>> gpr.score(X, y)\n 0.3680...\n >>> gpr.predict(X[:2,:], return_std=True)\n (array([653.0..., 592.1... ]), array([316.6..., 316.6...]))\n \"\"\"\n \n def __init__(self, noise_level=1.0, noise_level_bounds=(1e-05, 100000.0)):\n self.noise_level = noise_level\n self.noise_level_bounds = noise_level_bounds\n \n @property\n def hyperparameter_noise_level(self):\n return Hyperparameter('noise_level', 'numeric', self.noise_level_bounds)\n \n def __call__(self, X, Y=None, eval_gradient=False):\n \"\"\"Return the kernel k(X, Y) and optionally its gradient.\n\n Parameters\n ----------\n X : array-like of shape (n_samples_X, n_features) or list of object\n Left argument of the returned kernel k(X, Y)\n\n Y : array-like of shape (n_samples_X, n_features) or list of object, default=None\n Right argument of the returned kernel k(X, Y). 
If None, k(X, X)\n is evaluated instead.\n\n eval_gradient : bool, default=False\n Determines whether the gradient with respect to the log of\n the kernel hyperparameter is computed.\n Only supported when Y is None.\n\n Returns\n -------\n K : ndarray of shape (n_samples_X, n_samples_Y)\n Kernel k(X, Y)\n\n K_gradient : ndarray of shape (n_samples_X, n_samples_X, n_dims), optional\n The gradient of the kernel k(X, X) with respect to the log of the\n hyperparameter of the kernel. Only returned when eval_gradient\n is True.\n \"\"\"\n if Y is not None and eval_gradient:\n raise ValueError('Gradient can only be evaluated when Y is None.')\n if Y is None:\n K = self.noise_level * np.eye(_num_samples(X))\n if eval_gradient:\n if not self.hyperparameter_noise_level.fixed:\n return K, self.noise_level * np.eye(_num_samples(X))[:, :, np.newaxis]\n else:\n return K, np.empty((_num_samples(X), _num_samples(X), 0))\n else:\n return K\n else:\n return np.zeros((_num_samples(X), _num_samples(Y)))\n \n def diag(self, X):\n \"\"\"Returns the diagonal of the kernel k(X, X).\n\n The result of this method is identical to np.diag(self(X)); however,\n it can be evaluated more efficiently since only the diagonal is\n evaluated.\n\n Parameters\n ----------\n X : array-like of shape (n_samples_X, n_features) or list of object\n Argument to the kernel.\n\n Returns\n -------\n K_diag : ndarray of shape (n_samples_X,)\n Diagonal of kernel k(X, X)\n \"\"\"\n return np.full(_num_samples(X), self.noise_level, dtype=np.array(self.noise_level).dtype)\n \n def __repr__(self):\n return '{0}(noise_level={1:.3g})'.format(self.__class__.__name__, self.noise_level)\n" }, @@ -23072,7 +23139,7 @@ "sklearn.impute._base.MissingIndicator._more_tags" ], "is_public": true, - "description": "Binary indicators for missing values.\n\nNote that this component typically should not be used in a vanilla :class:`Pipeline` consisting of transformers and a classifier, but rather could be added using a :class:`FeatureUnion` or :class:`ColumnTransformer`. Read more in the :ref:`User Guide `. .. versionadded:: 0.20", + "description": "Binary indicators for missing values.\n\nNote that this component typically should not be used in a vanilla\n:class:`Pipeline` consisting of transformers and a classifier, but rather\ncould be added using a :class:`FeatureUnion` or :class:`ColumnTransformer`.\n\nRead more in the :ref:`User Guide `.\n\n.. versionadded:: 0.20", "docstring": "Binary indicators for missing values.\n\n Note that this component typically should not be used in a vanilla\n :class:`Pipeline` consisting of transformers and a classifier, but rather\n could be added using a :class:`FeatureUnion` or :class:`ColumnTransformer`.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.20\n\n Parameters\n ----------\n missing_values : int, float, str, np.nan or None, default=np.nan\n The placeholder for the missing values. All occurrences of\n `missing_values` will be imputed. 
For pandas' dataframes with\n nullable integer dtypes with missing values, `missing_values`\n should be set to `np.nan`, since `pd.NA` will be converted to `np.nan`.\n\n features : {'missing-only', 'all'}, default='missing-only'\n Whether the imputer mask should represent all or a subset of\n features.\n\n - If `'missing-only'` (default), the imputer mask will only represent\n features containing missing values during fit time.\n - If `'all'`, the imputer mask will represent all features.\n\n sparse : bool or 'auto', default='auto'\n Whether the imputer mask format should be sparse or dense.\n\n - If `'auto'` (default), the imputer mask will be of same type as\n input.\n - If `True`, the imputer mask will be a sparse matrix.\n - If `False`, the imputer mask will be a numpy array.\n\n error_on_new : bool, default=True\n If `True`, :meth:`transform` will raise an error when there are\n features with missing values that have no missing values in\n :meth:`fit`. This is applicable only when `features='missing-only'`.\n\n Attributes\n ----------\n features_ : ndarray of shape (n_missing_features,) or (n_features,)\n The features indices which will be returned when calling\n :meth:`transform`. They are computed during :meth:`fit`. If\n `features='all'`, `features_` is equal to `range(n_features)`.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n SimpleImputer : Univariate imputation of missing values.\n IterativeImputer : Multivariate imputation of missing values.\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.impute import MissingIndicator\n >>> X1 = np.array([[np.nan, 1, 3],\n ... [4, 0, np.nan],\n ... [8, 1, 0]])\n >>> X2 = np.array([[5, 1, np.nan],\n ... [np.nan, 2, 3],\n ... [2, 4, 0]])\n >>> indicator = MissingIndicator()\n >>> indicator.fit(X1)\n MissingIndicator()\n >>> X2_tr = indicator.transform(X2)\n >>> X2_tr\n array([[False, True],\n [ True, False],\n [False, False]])\n ", "source_code": "\n\nclass MissingIndicator(TransformerMixin, BaseEstimator):\n \"\"\"Binary indicators for missing values.\n\n Note that this component typically should not be used in a vanilla\n :class:`Pipeline` consisting of transformers and a classifier, but rather\n could be added using a :class:`FeatureUnion` or :class:`ColumnTransformer`.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.20\n\n Parameters\n ----------\n missing_values : int, float, str, np.nan or None, default=np.nan\n The placeholder for the missing values. All occurrences of\n `missing_values` will be imputed. 
For pandas' dataframes with\n nullable integer dtypes with missing values, `missing_values`\n should be set to `np.nan`, since `pd.NA` will be converted to `np.nan`.\n\n features : {'missing-only', 'all'}, default='missing-only'\n Whether the imputer mask should represent all or a subset of\n features.\n\n - If `'missing-only'` (default), the imputer mask will only represent\n features containing missing values during fit time.\n - If `'all'`, the imputer mask will represent all features.\n\n sparse : bool or 'auto', default='auto'\n Whether the imputer mask format should be sparse or dense.\n\n - If `'auto'` (default), the imputer mask will be of same type as\n input.\n - If `True`, the imputer mask will be a sparse matrix.\n - If `False`, the imputer mask will be a numpy array.\n\n error_on_new : bool, default=True\n If `True`, :meth:`transform` will raise an error when there are\n features with missing values that have no missing values in\n :meth:`fit`. This is applicable only when `features='missing-only'`.\n\n Attributes\n ----------\n features_ : ndarray of shape (n_missing_features,) or (n_features,)\n The features indices which will be returned when calling\n :meth:`transform`. They are computed during :meth:`fit`. If\n `features='all'`, `features_` is equal to `range(n_features)`.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n SimpleImputer : Univariate imputation of missing values.\n IterativeImputer : Multivariate imputation of missing values.\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.impute import MissingIndicator\n >>> X1 = np.array([[np.nan, 1, 3],\n ... [4, 0, np.nan],\n ... [8, 1, 0]])\n >>> X2 = np.array([[5, 1, np.nan],\n ... [np.nan, 2, 3],\n ... [2, 4, 0]])\n >>> indicator = MissingIndicator()\n >>> indicator.fit(X1)\n MissingIndicator()\n >>> X2_tr = indicator.transform(X2)\n >>> X2_tr\n array([[False, True],\n [ True, False],\n [False, False]])\n \"\"\"\n \n def __init__(self, *, missing_values=np.nan, features='missing-only', sparse='auto', error_on_new=True):\n self.missing_values = missing_values\n self.features = features\n self.sparse = sparse\n self.error_on_new = error_on_new\n \n def _get_missing_features_info(self, X):\n \"\"\"Compute the imputer mask and the indices of the features\n containing missing values.\n\n Parameters\n ----------\n X : {ndarray, sparse matrix} of shape (n_samples, n_features)\n The input data with missing values. 
Note that `X` has been\n checked in :meth:`fit` and :meth:`transform` before to call this\n function.\n\n Returns\n -------\n imputer_mask : {ndarray, sparse matrix} of shape (n_samples, n_features)\n The imputer mask of the original data.\n\n features_with_missing : ndarray of shape (n_features_with_missing)\n The features containing missing values.\n \"\"\"\n if not self._precomputed:\n imputer_mask = _get_mask(X, self.missing_values)\n else:\n imputer_mask = X\n if sp.issparse(X):\n imputer_mask.eliminate_zeros()\n if self.features == 'missing-only':\n n_missing = imputer_mask.getnnz(axis=0)\n if self.sparse is False:\n imputer_mask = imputer_mask.toarray()\n elif imputer_mask.format == 'csr':\n imputer_mask = imputer_mask.tocsc()\n else:\n if not self._precomputed:\n imputer_mask = _get_mask(X, self.missing_values)\n else:\n imputer_mask = X\n if self.features == 'missing-only':\n n_missing = imputer_mask.sum(axis=0)\n if self.sparse is True:\n imputer_mask = sp.csc_matrix(imputer_mask)\n if self.features == 'all':\n features_indices = np.arange(X.shape[1])\n else:\n features_indices = np.flatnonzero(n_missing)\n return imputer_mask, features_indices\n \n def _validate_input(self, X, in_fit):\n if not is_scalar_nan(self.missing_values):\n force_all_finite = True\n else:\n force_all_finite = 'allow-nan'\n X = self._validate_data(X, reset=in_fit, accept_sparse=('csc', 'csr'), dtype=None, force_all_finite=force_all_finite)\n _check_inputs_dtype(X, self.missing_values)\n if X.dtype.kind not in ('i', 'u', 'f', 'O'):\n raise ValueError('MissingIndicator does not support data with dtype {0}. Please provide either a numeric array (with a floating point or integer dtype) or categorical data represented either as an array with integer dtype or an array of string values with an object dtype.'.format(X.dtype))\n if sp.issparse(X) and self.missing_values == 0:\n raise ValueError('Sparse input with missing_values=0 is not supported. Provide a dense array instead.')\n return X\n \n def _fit(self, X, y=None, precomputed=False):\n \"\"\"Fit the transformer on `X`.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Input data, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n If `precomputed=True`, then `X` is a mask of the input data.\n\n precomputed : bool\n Whether the input data is a mask.\n\n Returns\n -------\n imputer_mask : {ndarray, sparse matrix} of shape (n_samples, n_features)\n The imputer mask of the original data.\n \"\"\"\n if precomputed:\n if not (hasattr(X, 'dtype') and X.dtype.kind == 'b'):\n raise ValueError('precomputed is True but the input data is not a mask')\n self._precomputed = True\n else:\n self._precomputed = False\n if not self._precomputed:\n X = self._validate_input(X, in_fit=True)\n self._n_features = X.shape[1]\n if self.features not in ('missing-only', 'all'):\n raise ValueError(\"'features' has to be either 'missing-only' or 'all'. Got {} instead.\".format(self.features))\n if not (isinstance(self.sparse, str) and self.sparse == 'auto' or isinstance(self.sparse, bool)):\n raise ValueError(\"'sparse' has to be a boolean or 'auto'. 
Got {!r} instead.\".format(self.sparse))\n missing_features_info = self._get_missing_features_info(X)\n self.features_ = missing_features_info[1]\n return missing_features_info[0]\n \n def fit(self, X, y=None):\n \"\"\"Fit the transformer on `X`.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Input data, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n Returns\n -------\n self : object\n Fitted estimator.\n \"\"\"\n self._fit(X, y)\n return self\n \n def transform(self, X):\n \"\"\"Generate missing values indicator for `X`.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input data to complete.\n\n Returns\n -------\n Xt : {ndarray, sparse matrix} of shape (n_samples, n_features) or (n_samples, n_features_with_missing)\n The missing indicator for input data. The data type of `Xt`\n will be boolean.\n \"\"\"\n check_is_fitted(self)\n if not self._precomputed:\n X = self._validate_input(X, in_fit=False)\n elif not (hasattr(X, 'dtype') and X.dtype.kind == 'b'):\n raise ValueError('precomputed is True but the input data is not a mask')\n (imputer_mask, features) = self._get_missing_features_info(X)\n if self.features == 'missing-only':\n features_diff_fit_trans = np.setdiff1d(features, self.features_)\n if self.error_on_new and features_diff_fit_trans.size > 0:\n raise ValueError('The features {} have missing values in transform but have no missing values in fit.'.format(features_diff_fit_trans))\n if self.features_.size < self._n_features:\n imputer_mask = imputer_mask[:, self.features_]\n return imputer_mask\n \n def fit_transform(self, X, y=None):\n \"\"\"Generate missing values indicator for `X`.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input data to complete.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n Returns\n -------\n Xt : {ndarray, sparse matrix} of shape (n_samples, n_features) or (n_samples, n_features_with_missing)\n The missing indicator for input data. The data type of `Xt`\n will be boolean.\n \"\"\"\n imputer_mask = self._fit(X, y)\n if self.features_.size < self._n_features:\n imputer_mask = imputer_mask[:, self.features_]\n return imputer_mask\n \n def _more_tags(self):\n return {'allow_nan': True, 'X_types': ['2darray', 'string'], 'preserves_dtype': []}\n" }, @@ -23091,7 +23158,7 @@ "sklearn.impute._base.SimpleImputer.inverse_transform" ], "is_public": true, - "description": "Imputation transformer for completing missing values.\n\nRead more in the :ref:`User Guide `. .. versionadded:: 0.20 `SimpleImputer` replaces the previous `sklearn.preprocessing.Imputer` estimator which is now removed.", + "description": "Imputation transformer for completing missing values.\n\nRead more in the :ref:`User Guide `.\n\n.. versionadded:: 0.20\n `SimpleImputer` replaces the previous `sklearn.preprocessing.Imputer`\n estimator which is now removed.", "docstring": "Imputation transformer for completing missing values.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.20\n `SimpleImputer` replaces the previous `sklearn.preprocessing.Imputer`\n estimator which is now removed.\n\n Parameters\n ----------\n missing_values : int, float, str, np.nan or None, default=np.nan\n The placeholder for the missing values. All occurrences of\n `missing_values` will be imputed. 
For pandas' dataframes with\n nullable integer dtypes with missing values, `missing_values`\n should be set to `np.nan`, since `pd.NA` will be converted to `np.nan`.\n\n strategy : str, default='mean'\n The imputation strategy.\n\n - If \"mean\", then replace missing values using the mean along\n each column. Can only be used with numeric data.\n - If \"median\", then replace missing values using the median along\n each column. Can only be used with numeric data.\n - If \"most_frequent\", then replace missing using the most frequent\n value along each column. Can be used with strings or numeric data.\n If there is more than one such value, only the smallest is returned.\n - If \"constant\", then replace missing values with fill_value. Can be\n used with strings or numeric data.\n\n .. versionadded:: 0.20\n strategy=\"constant\" for fixed value imputation.\n\n fill_value : str or numerical value, default=None\n When strategy == \"constant\", fill_value is used to replace all\n occurrences of missing_values.\n If left to the default, fill_value will be 0 when imputing numerical\n data and \"missing_value\" for strings or object data types.\n\n verbose : int, default=0\n Controls the verbosity of the imputer.\n\n copy : bool, default=True\n If True, a copy of `X` will be created. If False, imputation will\n be done in-place whenever possible. Note that, in the following cases,\n a new copy will always be made, even if `copy=False`:\n\n - If `X` is not an array of floating values;\n - If `X` is encoded as a CSR matrix;\n - If `add_indicator=True`.\n\n add_indicator : bool, default=False\n If True, a :class:`MissingIndicator` transform will stack onto output\n of the imputer's transform. This allows a predictive estimator\n to account for missingness despite imputation. If a feature has no\n missing values at fit/train time, the feature won't appear on\n the missing indicator even if there are missing values at\n transform/test time.\n\n Attributes\n ----------\n statistics_ : array of shape (n_features,)\n The imputation fill value for each feature.\n Computing statistics can result in `np.nan` values.\n During :meth:`transform`, features corresponding to `np.nan`\n statistics will be discarded.\n\n indicator_ : :class:`~sklearn.impute.MissingIndicator`\n Indicator used to add binary indicators for missing values.\n `None` if `add_indicator=False`.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n IterativeImputer : Multivariate imputation of missing values.\n\n Notes\n -----\n Columns which only contained missing values at :meth:`fit` are discarded\n upon :meth:`transform` if strategy is not `\"constant\"`.\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.impute import SimpleImputer\n >>> imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')\n >>> imp_mean.fit([[7, 2, 3], [4, np.nan, 6], [10, 5, 9]])\n SimpleImputer()\n >>> X = [[np.nan, 2, 3], [4, np.nan, 6], [10, np.nan, 9]]\n >>> print(imp_mean.transform(X))\n [[ 7. 2. 3. ]\n [ 4. 3.5 6. ]\n [10. 3.5 9. ]]\n ", "source_code": "\n\nclass SimpleImputer(_BaseImputer):\n \"\"\"Imputation transformer for completing missing values.\n\n Read more in the :ref:`User Guide `.\n\n .. 
versionadded:: 0.20\n `SimpleImputer` replaces the previous `sklearn.preprocessing.Imputer`\n estimator which is now removed.\n\n Parameters\n ----------\n missing_values : int, float, str, np.nan or None, default=np.nan\n The placeholder for the missing values. All occurrences of\n `missing_values` will be imputed. For pandas' dataframes with\n nullable integer dtypes with missing values, `missing_values`\n should be set to `np.nan`, since `pd.NA` will be converted to `np.nan`.\n\n strategy : str, default='mean'\n The imputation strategy.\n\n - If \"mean\", then replace missing values using the mean along\n each column. Can only be used with numeric data.\n - If \"median\", then replace missing values using the median along\n each column. Can only be used with numeric data.\n - If \"most_frequent\", then replace missing using the most frequent\n value along each column. Can be used with strings or numeric data.\n If there is more than one such value, only the smallest is returned.\n - If \"constant\", then replace missing values with fill_value. Can be\n used with strings or numeric data.\n\n .. versionadded:: 0.20\n strategy=\"constant\" for fixed value imputation.\n\n fill_value : str or numerical value, default=None\n When strategy == \"constant\", fill_value is used to replace all\n occurrences of missing_values.\n If left to the default, fill_value will be 0 when imputing numerical\n data and \"missing_value\" for strings or object data types.\n\n verbose : int, default=0\n Controls the verbosity of the imputer.\n\n copy : bool, default=True\n If True, a copy of `X` will be created. If False, imputation will\n be done in-place whenever possible. Note that, in the following cases,\n a new copy will always be made, even if `copy=False`:\n\n - If `X` is not an array of floating values;\n - If `X` is encoded as a CSR matrix;\n - If `add_indicator=True`.\n\n add_indicator : bool, default=False\n If True, a :class:`MissingIndicator` transform will stack onto output\n of the imputer's transform. This allows a predictive estimator\n to account for missingness despite imputation. If a feature has no\n missing values at fit/train time, the feature won't appear on\n the missing indicator even if there are missing values at\n transform/test time.\n\n Attributes\n ----------\n statistics_ : array of shape (n_features,)\n The imputation fill value for each feature.\n Computing statistics can result in `np.nan` values.\n During :meth:`transform`, features corresponding to `np.nan`\n statistics will be discarded.\n\n indicator_ : :class:`~sklearn.impute.MissingIndicator`\n Indicator used to add binary indicators for missing values.\n `None` if `add_indicator=False`.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. 
versionadded:: 1.0\n\n See Also\n --------\n IterativeImputer : Multivariate imputation of missing values.\n\n Notes\n -----\n Columns which only contained missing values at :meth:`fit` are discarded\n upon :meth:`transform` if strategy is not `\"constant\"`.\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.impute import SimpleImputer\n >>> imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')\n >>> imp_mean.fit([[7, 2, 3], [4, np.nan, 6], [10, 5, 9]])\n SimpleImputer()\n >>> X = [[np.nan, 2, 3], [4, np.nan, 6], [10, np.nan, 9]]\n >>> print(imp_mean.transform(X))\n [[ 7. 2. 3. ]\n [ 4. 3.5 6. ]\n [10. 3.5 9. ]]\n \"\"\"\n \n def __init__(self, *, missing_values=np.nan, strategy='mean', fill_value=None, verbose=0, copy=True, add_indicator=False):\n super().__init__(missing_values=missing_values, add_indicator=add_indicator)\n self.strategy = strategy\n self.fill_value = fill_value\n self.verbose = verbose\n self.copy = copy\n \n def _validate_input(self, X, in_fit):\n allowed_strategies = ['mean', 'median', 'most_frequent', 'constant']\n if self.strategy not in allowed_strategies:\n raise ValueError('Can only use these strategies: {0} got strategy={1}'.format(allowed_strategies, self.strategy))\n if self.strategy in ('most_frequent', 'constant'):\n if isinstance(X, list) and any((isinstance(elem, str) for row in X for elem in row)):\n dtype = object\n else:\n dtype = None\n else:\n dtype = FLOAT_DTYPES\n if not is_scalar_nan(self.missing_values):\n force_all_finite = True\n else:\n force_all_finite = 'allow-nan'\n try:\n X = self._validate_data(X, reset=in_fit, accept_sparse='csc', dtype=dtype, force_all_finite=force_all_finite, copy=self.copy)\n except ValueError as ve:\n if 'could not convert' in str(ve):\n new_ve = ValueError('Cannot use {} strategy with non-numeric data:\\n{}'.format(self.strategy, ve))\n raise new_ve from None\n else:\n raise ve\n _check_inputs_dtype(X, self.missing_values)\n if X.dtype.kind not in ('i', 'u', 'f', 'O'):\n raise ValueError('SimpleImputer does not support data with dtype {0}. Please provide either a numeric array (with a floating point or integer dtype) or categorical data represented either as an array with integer dtype or an array of string values with an object dtype.'.format(X.dtype))\n return X\n \n def fit(self, X, y=None):\n \"\"\"Fit the imputer on `X`.\n\n Parameters\n ----------\n X : {array-like, sparse matrix}, shape (n_samples, n_features)\n Input data, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\n y : Ignored\n Not used, present here for API consistency by convention.\n\n Returns\n -------\n self : object\n Fitted estimator.\n \"\"\"\n X = self._validate_input(X, in_fit=True)\n if self.fill_value is None:\n if X.dtype.kind in ('i', 'u', 'f'):\n fill_value = 0\n else:\n fill_value = 'missing_value'\n else:\n fill_value = self.fill_value\n if self.strategy == 'constant' and X.dtype.kind in ('i', 'u', 'f') and not isinstance(fill_value, numbers.Real):\n raise ValueError(\"'fill_value'={0} is invalid. Expected a numerical value when imputing numerical data\".format(fill_value))\n if sp.issparse(X):\n if self.missing_values == 0:\n raise ValueError('Imputation not possible when missing_values == 0 and input is sparse. 
Provide a dense array instead.')\n else:\n self.statistics_ = self._sparse_fit(X, self.strategy, self.missing_values, fill_value)\n else:\n self.statistics_ = self._dense_fit(X, self.strategy, self.missing_values, fill_value)\n return self\n \n def _sparse_fit(self, X, strategy, missing_values, fill_value):\n \"\"\"Fit the transformer on sparse data.\"\"\"\n missing_mask = _get_mask(X, missing_values)\n mask_data = missing_mask.data\n n_implicit_zeros = X.shape[0] - np.diff(X.indptr)\n statistics = np.empty(X.shape[1])\n if strategy == 'constant':\n statistics.fill(fill_value)\n else:\n for i in range(X.shape[1]):\n column = X.data[X.indptr[i]:X.indptr[i + 1]]\n mask_column = mask_data[X.indptr[i]:X.indptr[i + 1]]\n column = column[~mask_column]\n mask_zeros = _get_mask(column, 0)\n column = column[~mask_zeros]\n n_explicit_zeros = mask_zeros.sum()\n n_zeros = n_implicit_zeros[i] + n_explicit_zeros\n if strategy == 'mean':\n s = column.size + n_zeros\n statistics[i] = np.nan if s == 0 else column.sum() / s\n elif strategy == 'median':\n statistics[i] = _get_median(column, n_zeros)\n elif strategy == 'most_frequent':\n statistics[i] = _most_frequent(column, 0, n_zeros)\n super()._fit_indicator(missing_mask)\n return statistics\n \n def _dense_fit(self, X, strategy, missing_values, fill_value):\n \"\"\"Fit the transformer on dense data.\"\"\"\n missing_mask = _get_mask(X, missing_values)\n masked_X = ma.masked_array(X, mask=missing_mask)\n super()._fit_indicator(missing_mask)\n if strategy == 'mean':\n mean_masked = np.ma.mean(masked_X, axis=0)\n mean = np.ma.getdata(mean_masked)\n mean[np.ma.getmask(mean_masked)] = np.nan\n return mean\n elif strategy == 'median':\n median_masked = np.ma.median(masked_X, axis=0)\n median = np.ma.getdata(median_masked)\n median[np.ma.getmaskarray(median_masked)] = np.nan\n return median\n elif strategy == 'most_frequent':\n X = X.transpose()\n mask = missing_mask.transpose()\n if X.dtype.kind == 'O':\n most_frequent = np.empty(X.shape[0], dtype=object)\n else:\n most_frequent = np.empty(X.shape[0])\n for (i, (row, row_mask)) in enumerate(zip(X[:], mask[:])):\n row_mask = np.logical_not(row_mask).astype(bool)\n row = row[row_mask]\n most_frequent[i] = _most_frequent(row, np.nan, 0)\n return most_frequent\n elif strategy == 'constant':\n return np.full(X.shape[1], fill_value, dtype=X.dtype)\n \n def transform(self, X):\n \"\"\"Impute all missing values in `X`.\n\n Parameters\n ----------\n X : {array-like, sparse matrix}, shape (n_samples, n_features)\n The input data to complete.\n\n Returns\n -------\n X_imputed : {ndarray, sparse matrix} of shape (n_samples, n_features_out)\n `X` with imputed values.\n \"\"\"\n check_is_fitted(self)\n X = self._validate_input(X, in_fit=False)\n statistics = self.statistics_\n if X.shape[1] != statistics.shape[0]:\n raise ValueError('X has %d features per sample, expected %d' % (X.shape[1], self.statistics_.shape[0]))\n missing_mask = _get_mask(X, self.missing_values)\n if self.strategy == 'constant':\n valid_statistics = statistics\n valid_statistics_indexes = None\n else:\n invalid_mask = _get_mask(statistics, np.nan)\n valid_mask = np.logical_not(invalid_mask)\n valid_statistics = statistics[valid_mask]\n valid_statistics_indexes = np.flatnonzero(valid_mask)\n if invalid_mask.any():\n missing = np.arange(X.shape[1])[invalid_mask]\n if self.verbose:\n warnings.warn('Deleting features without observed values: %s' % missing)\n X = X[:, valid_statistics_indexes]\n if sp.issparse(X):\n if self.missing_values == 0:\n raise 
ValueError('Imputation not possible when missing_values == 0 and input is sparse. Provide a dense array instead.')\n else:\n if valid_statistics_indexes is None:\n mask = missing_mask.data\n else:\n mask = _get_mask(X.data, self.missing_values)\n indexes = np.repeat(np.arange(len(X.indptr) - 1, dtype=int), np.diff(X.indptr))[mask]\n X.data[mask] = valid_statistics[indexes].astype(X.dtype, copy=False)\n else:\n if valid_statistics_indexes is None:\n mask_valid_features = missing_mask\n else:\n mask_valid_features = missing_mask[:, valid_statistics_indexes]\n n_missing = np.sum(mask_valid_features, axis=0)\n values = np.repeat(valid_statistics, n_missing)\n coordinates = np.where(mask_valid_features.transpose())[::-1]\n X[coordinates] = values\n X_indicator = super()._transform_indicator(missing_mask)\n return super()._concatenate_indicator(X, X_indicator)\n \n def inverse_transform(self, X):\n \"\"\"Convert the data back to the original representation.\n\n Inverts the `transform` operation performed on an array.\n This operation can only be performed after :class:`SimpleImputer` is\n instantiated with `add_indicator=True`.\n\n Note that `inverse_transform` can only invert the transform in\n features that have binary indicators for missing values. If a feature\n has no missing values at `fit` time, the feature won't have a binary\n indicator, and the imputation done at `transform` time won't be\n inverted.\n\n .. versionadded:: 0.24\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features + n_features_missing_indicator)\n The imputed data to be reverted to original data. It has to be\n an augmented array of imputed data and the missing indicator mask.\n\n Returns\n -------\n X_original : ndarray of shape (n_samples, n_features)\n The original `X` with missing values as it was prior\n to imputation.\n \"\"\"\n check_is_fitted(self)\n if not self.add_indicator:\n raise ValueError(f\"'inverse_transform' works only when 'SimpleImputer' is instantiated with 'add_indicator=True'. Got 'add_indicator={self.add_indicator}' instead.\")\n n_features_missing = len(self.indicator_.features_)\n non_empty_feature_count = X.shape[1] - n_features_missing\n array_imputed = X[:, :non_empty_feature_count].copy()\n missing_mask = X[:, non_empty_feature_count:].astype(bool)\n n_features_original = len(self.statistics_)\n shape_original = (X.shape[0], n_features_original)\n X_original = np.zeros(shape_original)\n X_original[:, self.indicator_.features_] = missing_mask\n full_mask = X_original.astype(bool)\n (imputed_idx, original_idx) = (0, 0)\n while imputed_idx < len(array_imputed.T):\n if not np.all(X_original[:, original_idx]):\n X_original[:, original_idx] = array_imputed.T[imputed_idx]\n imputed_idx += 1\n original_idx += 1\n else:\n original_idx += 1\n X_original[full_mask] = self.missing_values\n return X_original\n" }, @@ -23130,7 +23197,7 @@ "sklearn.impute._iterative.IterativeImputer.fit" ], "is_public": true, - "description": "Multivariate imputer that estimates each feature from all the others.\n\nA strategy for imputing missing values by modeling each feature with missing values as a function of other features in a round-robin fashion. Read more in the :ref:`User Guide `. .. versionadded:: 0.21 .. note:: This estimator is still **experimental** for now: the predictions and the API might change without any deprecation cycle. 
To use it, you need to explicitly import `enable_iterative_imputer`:: >>> # explicitly require this experimental feature >>> from sklearn.experimental import enable_iterative_imputer # noqa >>> # now you can import normally from sklearn.impute >>> from sklearn.impute import IterativeImputer", + "description": "Multivariate imputer that estimates each feature from all the others.\n\nA strategy for imputing missing values by modeling each feature with\nmissing values as a function of other features in a round-robin fashion.\n\nRead more in the :ref:`User Guide `.\n\n.. versionadded:: 0.21\n\n.. note::\n\n This estimator is still **experimental** for now: the predictions\n and the API might change without any deprecation cycle. To use it,\n you need to explicitly import `enable_iterative_imputer`::\n\n >>> # explicitly require this experimental feature\n >>> from sklearn.experimental import enable_iterative_imputer # noqa\n >>> # now you can import normally from sklearn.impute\n >>> from sklearn.impute import IterativeImputer", "docstring": "Multivariate imputer that estimates each feature from all the others.\n\n A strategy for imputing missing values by modeling each feature with\n missing values as a function of other features in a round-robin fashion.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.21\n\n .. note::\n\n This estimator is still **experimental** for now: the predictions\n and the API might change without any deprecation cycle. To use it,\n you need to explicitly import `enable_iterative_imputer`::\n\n >>> # explicitly require this experimental feature\n >>> from sklearn.experimental import enable_iterative_imputer # noqa\n >>> # now you can import normally from sklearn.impute\n >>> from sklearn.impute import IterativeImputer\n\n Parameters\n ----------\n estimator : estimator object, default=BayesianRidge()\n The estimator to use at each step of the round-robin imputation.\n If `sample_posterior=True`, the estimator must support\n `return_std` in its `predict` method.\n\n missing_values : int or np.nan, default=np.nan\n The placeholder for the missing values. All occurrences of\n `missing_values` will be imputed. For pandas' dataframes with\n nullable integer dtypes with missing values, `missing_values`\n should be set to `np.nan`, since `pd.NA` will be converted to `np.nan`.\n\n sample_posterior : bool, default=False\n Whether to sample from the (Gaussian) predictive posterior of the\n fitted estimator for each imputation. Estimator must support\n `return_std` in its `predict` method if set to `True`. Set to\n `True` if using `IterativeImputer` for multiple imputations.\n\n max_iter : int, default=10\n Maximum number of imputation rounds to perform before returning the\n imputations computed during the final round. A round is a single\n imputation of each feature with missing values. The stopping criterion\n is met once `max(abs(X_t - X_{t-1}))/max(abs(X[known_vals])) < tol`,\n where `X_t` is `X` at iteration `t`. Note that early stopping is only\n applied if `sample_posterior=False`.\n\n tol : float, default=1e-3\n Tolerance of the stopping condition.\n\n n_nearest_features : int, default=None\n Number of other features to use to estimate the missing values of\n each feature column. Nearness between features is measured using\n the absolute correlation coefficient between each feature pair (after\n initial imputation). 
To ensure coverage of features throughout the\n imputation process, the neighbor features are not necessarily nearest,\n but are drawn with probability proportional to correlation for each\n imputed target feature. Can provide significant speed-up when the\n number of features is huge. If `None`, all features will be used.\n\n initial_strategy : {'mean', 'median', 'most_frequent', 'constant'}, default='mean'\n Which strategy to use to initialize the missing values. Same as the\n `strategy` parameter in :class:`~sklearn.impute.SimpleImputer`.\n\n imputation_order : {'ascending', 'descending', 'roman', 'arabic', 'random'}, default='ascending'\n The order in which the features will be imputed. Possible values:\n\n - `'ascending'`: From features with fewest missing values to most.\n - `'descending'`: From features with most missing values to fewest.\n - `'roman'`: Left to right.\n - `'arabic'`: Right to left.\n - `'random'`: A random order for each round.\n\n skip_complete : bool, default=False\n If `True` then features with missing values during :meth:`transform`\n which did not have any missing values during :meth:`fit` will be\n imputed with the initial imputation method only. Set to `True` if you\n have many features with no missing values at both :meth:`fit` and\n :meth:`transform` time to save compute.\n\n min_value : float or array-like of shape (n_features,), default=-np.inf\n Minimum possible imputed value. Broadcast to shape `(n_features,)` if\n scalar. If array-like, expects shape `(n_features,)`, one min value for\n each feature. The default is `-np.inf`.\n\n .. versionchanged:: 0.23\n Added support for array-like.\n\n max_value : float or array-like of shape (n_features,), default=np.inf\n Maximum possible imputed value. Broadcast to shape `(n_features,)` if\n scalar. If array-like, expects shape `(n_features,)`, one max value for\n each feature. The default is `np.inf`.\n\n .. versionchanged:: 0.23\n Added support for array-like.\n\n verbose : int, default=0\n Verbosity flag, controls the debug messages that are issued\n as functions are evaluated. The higher, the more verbose. Can be 0, 1,\n or 2.\n\n random_state : int, RandomState instance or None, default=None\n The seed of the pseudo random number generator to use. Randomizes\n selection of estimator features if `n_nearest_features` is not `None`,\n the `imputation_order` if `random`, and the sampling from posterior if\n `sample_posterior=True`. Use an integer for determinism.\n See :term:`the Glossary `.\n\n add_indicator : bool, default=False\n If `True`, a :class:`MissingIndicator` transform will stack onto output\n of the imputer's transform. This allows a predictive estimator\n to account for missingness despite imputation. If a feature has no\n missing values at fit/train time, the feature won't appear on\n the missing indicator even if there are missing values at\n transform/test time.\n\n Attributes\n ----------\n initial_imputer_ : object of type :class:`~sklearn.impute.SimpleImputer`\n Imputer used to initialize the missing values.\n\n imputation_sequence_ : list of tuples\n Each tuple has `(feat_idx, neighbor_feat_idx, estimator)`, where\n `feat_idx` is the current feature to be imputed,\n `neighbor_feat_idx` is the array of other features used to impute the\n current feature, and `estimator` is the trained estimator used for\n the imputation. Length is `self.n_features_with_missing_ *\n self.n_iter_`.\n\n n_iter_ : int\n Number of iteration rounds that occurred. 
Will be less than\n `self.max_iter` if early stopping criterion was reached.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n n_features_with_missing_ : int\n Number of features with missing values.\n\n indicator_ : :class:`~sklearn.impute.MissingIndicator`\n Indicator used to add binary indicators for missing values.\n `None` if `add_indicator=False`.\n\n random_state_ : RandomState instance\n RandomState instance that is generated either from a seed, the random\n number generator or by `np.random`.\n\n See Also\n --------\n SimpleImputer : Univariate imputation of missing values.\n\n Notes\n -----\n To support imputation in inductive mode we store each feature's estimator\n during the :meth:`fit` phase, and predict without refitting (in order)\n during the :meth:`transform` phase.\n\n Features which contain all missing values at :meth:`fit` are discarded upon\n :meth:`transform`.\n\n References\n ----------\n .. [1] `Stef van Buuren, Karin Groothuis-Oudshoorn (2011). \"mice:\n Multivariate Imputation by Chained Equations in R\". Journal of\n Statistical Software 45: 1-67.\n `_\n\n .. [2] `S. F. Buck, (1960). \"A Method of Estimation of Missing Values in\n Multivariate Data Suitable for use with an Electronic Computer\".\n Journal of the Royal Statistical Society 22(2): 302-306.\n `_\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.experimental import enable_iterative_imputer\n >>> from sklearn.impute import IterativeImputer\n >>> imp_mean = IterativeImputer(random_state=0)\n >>> imp_mean.fit([[7, 2, 3], [4, np.nan, 6], [10, 5, 9]])\n IterativeImputer(random_state=0)\n >>> X = [[np.nan, 2, 3], [4, np.nan, 6], [10, np.nan, 9]]\n >>> imp_mean.transform(X)\n array([[ 6.9584..., 2. , 3. ],\n [ 4. , 2.6000..., 6. ],\n [10. , 4.9999..., 9. ]])\n ", "source_code": "\n\nclass IterativeImputer(_BaseImputer):\n \"\"\"Multivariate imputer that estimates each feature from all the others.\n\n A strategy for imputing missing values by modeling each feature with\n missing values as a function of other features in a round-robin fashion.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.21\n\n .. note::\n\n This estimator is still **experimental** for now: the predictions\n and the API might change without any deprecation cycle. To use it,\n you need to explicitly import `enable_iterative_imputer`::\n\n >>> # explicitly require this experimental feature\n >>> from sklearn.experimental import enable_iterative_imputer # noqa\n >>> # now you can import normally from sklearn.impute\n >>> from sklearn.impute import IterativeImputer\n\n Parameters\n ----------\n estimator : estimator object, default=BayesianRidge()\n The estimator to use at each step of the round-robin imputation.\n If `sample_posterior=True`, the estimator must support\n `return_std` in its `predict` method.\n\n missing_values : int or np.nan, default=np.nan\n The placeholder for the missing values. All occurrences of\n `missing_values` will be imputed. For pandas' dataframes with\n nullable integer dtypes with missing values, `missing_values`\n should be set to `np.nan`, since `pd.NA` will be converted to `np.nan`.\n\n sample_posterior : bool, default=False\n Whether to sample from the (Gaussian) predictive posterior of the\n fitted estimator for each imputation. 
Estimator must support\n `return_std` in its `predict` method if set to `True`. Set to\n `True` if using `IterativeImputer` for multiple imputations.\n\n max_iter : int, default=10\n Maximum number of imputation rounds to perform before returning the\n imputations computed during the final round. A round is a single\n imputation of each feature with missing values. The stopping criterion\n is met once `max(abs(X_t - X_{t-1}))/max(abs(X[known_vals])) < tol`,\n where `X_t` is `X` at iteration `t`. Note that early stopping is only\n applied if `sample_posterior=False`.\n\n tol : float, default=1e-3\n Tolerance of the stopping condition.\n\n n_nearest_features : int, default=None\n Number of other features to use to estimate the missing values of\n each feature column. Nearness between features is measured using\n the absolute correlation coefficient between each feature pair (after\n initial imputation). To ensure coverage of features throughout the\n imputation process, the neighbor features are not necessarily nearest,\n but are drawn with probability proportional to correlation for each\n imputed target feature. Can provide significant speed-up when the\n number of features is huge. If `None`, all features will be used.\n\n initial_strategy : {'mean', 'median', 'most_frequent', 'constant'}, default='mean'\n Which strategy to use to initialize the missing values. Same as the\n `strategy` parameter in :class:`~sklearn.impute.SimpleImputer`.\n\n imputation_order : {'ascending', 'descending', 'roman', 'arabic', 'random'}, default='ascending'\n The order in which the features will be imputed. Possible values:\n\n - `'ascending'`: From features with fewest missing values to most.\n - `'descending'`: From features with most missing values to fewest.\n - `'roman'`: Left to right.\n - `'arabic'`: Right to left.\n - `'random'`: A random order for each round.\n\n skip_complete : bool, default=False\n If `True` then features with missing values during :meth:`transform`\n which did not have any missing values during :meth:`fit` will be\n imputed with the initial imputation method only. Set to `True` if you\n have many features with no missing values at both :meth:`fit` and\n :meth:`transform` time to save compute.\n\n min_value : float or array-like of shape (n_features,), default=-np.inf\n Minimum possible imputed value. Broadcast to shape `(n_features,)` if\n scalar. If array-like, expects shape `(n_features,)`, one min value for\n each feature. The default is `-np.inf`.\n\n .. versionchanged:: 0.23\n Added support for array-like.\n\n max_value : float or array-like of shape (n_features,), default=np.inf\n Maximum possible imputed value. Broadcast to shape `(n_features,)` if\n scalar. If array-like, expects shape `(n_features,)`, one max value for\n each feature. The default is `np.inf`.\n\n .. versionchanged:: 0.23\n Added support for array-like.\n\n verbose : int, default=0\n Verbosity flag, controls the debug messages that are issued\n as functions are evaluated. The higher, the more verbose. Can be 0, 1,\n or 2.\n\n random_state : int, RandomState instance or None, default=None\n The seed of the pseudo random number generator to use. Randomizes\n selection of estimator features if `n_nearest_features` is not `None`,\n the `imputation_order` if `random`, and the sampling from posterior if\n `sample_posterior=True`. 
Use an integer for determinism.\n See :term:`the Glossary `.\n\n add_indicator : bool, default=False\n If `True`, a :class:`MissingIndicator` transform will stack onto output\n of the imputer's transform. This allows a predictive estimator\n to account for missingness despite imputation. If a feature has no\n missing values at fit/train time, the feature won't appear on\n the missing indicator even if there are missing values at\n transform/test time.\n\n Attributes\n ----------\n initial_imputer_ : object of type :class:`~sklearn.impute.SimpleImputer`\n Imputer used to initialize the missing values.\n\n imputation_sequence_ : list of tuples\n Each tuple has `(feat_idx, neighbor_feat_idx, estimator)`, where\n `feat_idx` is the current feature to be imputed,\n `neighbor_feat_idx` is the array of other features used to impute the\n current feature, and `estimator` is the trained estimator used for\n the imputation. Length is `self.n_features_with_missing_ *\n self.n_iter_`.\n\n n_iter_ : int\n Number of iteration rounds that occurred. Will be less than\n `self.max_iter` if early stopping criterion was reached.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n n_features_with_missing_ : int\n Number of features with missing values.\n\n indicator_ : :class:`~sklearn.impute.MissingIndicator`\n Indicator used to add binary indicators for missing values.\n `None` if `add_indicator=False`.\n\n random_state_ : RandomState instance\n RandomState instance that is generated either from a seed, the random\n number generator or by `np.random`.\n\n See Also\n --------\n SimpleImputer : Univariate imputation of missing values.\n\n Notes\n -----\n To support imputation in inductive mode we store each feature's estimator\n during the :meth:`fit` phase, and predict without refitting (in order)\n during the :meth:`transform` phase.\n\n Features which contain all missing values at :meth:`fit` are discarded upon\n :meth:`transform`.\n\n References\n ----------\n .. [1] `Stef van Buuren, Karin Groothuis-Oudshoorn (2011). \"mice:\n Multivariate Imputation by Chained Equations in R\". Journal of\n Statistical Software 45: 1-67.\n `_\n\n .. [2] `S. F. Buck, (1960). \"A Method of Estimation of Missing Values in\n Multivariate Data Suitable for use with an Electronic Computer\".\n Journal of the Royal Statistical Society 22(2): 302-306.\n `_\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.experimental import enable_iterative_imputer\n >>> from sklearn.impute import IterativeImputer\n >>> imp_mean = IterativeImputer(random_state=0)\n >>> imp_mean.fit([[7, 2, 3], [4, np.nan, 6], [10, 5, 9]])\n IterativeImputer(random_state=0)\n >>> X = [[np.nan, 2, 3], [4, np.nan, 6], [10, np.nan, 9]]\n >>> imp_mean.transform(X)\n array([[ 6.9584..., 2. , 3. ],\n [ 4. , 2.6000..., 6. ],\n [10. , 4.9999..., 9. 
]])\n \"\"\"\n \n def __init__(self, estimator=None, *, missing_values=np.nan, sample_posterior=False, max_iter=10, tol=0.001, n_nearest_features=None, initial_strategy='mean', imputation_order='ascending', skip_complete=False, min_value=-np.inf, max_value=np.inf, verbose=0, random_state=None, add_indicator=False):\n super().__init__(missing_values=missing_values, add_indicator=add_indicator)\n self.estimator = estimator\n self.sample_posterior = sample_posterior\n self.max_iter = max_iter\n self.tol = tol\n self.n_nearest_features = n_nearest_features\n self.initial_strategy = initial_strategy\n self.imputation_order = imputation_order\n self.skip_complete = skip_complete\n self.min_value = min_value\n self.max_value = max_value\n self.verbose = verbose\n self.random_state = random_state\n \n def _impute_one_feature(self, X_filled, mask_missing_values, feat_idx, neighbor_feat_idx, estimator=None, fit_mode=True):\n \"\"\"Impute a single feature from the others provided.\n\n This function predicts the missing values of one of the features using\n the current estimates of all the other features. The `estimator` must\n support `return_std=True` in its `predict` method for this function\n to work.\n\n Parameters\n ----------\n X_filled : ndarray\n Input data with the most recent imputations.\n\n mask_missing_values : ndarray\n Input data's missing indicator matrix.\n\n feat_idx : int\n Index of the feature currently being imputed.\n\n neighbor_feat_idx : ndarray\n Indices of the features to be used in imputing `feat_idx`.\n\n estimator : object\n The estimator to use at this step of the round-robin imputation.\n If `sample_posterior=True`, the estimator must support\n `return_std` in its `predict` method.\n If None, it will be cloned from self._estimator.\n\n fit_mode : boolean, default=True\n Whether to fit and predict with the estimator or just predict.\n\n Returns\n -------\n X_filled : ndarray\n Input data with `X_filled[missing_row_mask, feat_idx]` updated.\n\n estimator : estimator with sklearn API\n The fitted estimator used to impute\n `X_filled[missing_row_mask, feat_idx]`.\n \"\"\"\n if estimator is None and fit_mode is False:\n raise ValueError('If fit_mode is False, then an already-fitted estimator should be passed in.')\n if estimator is None:\n estimator = clone(self._estimator)\n missing_row_mask = mask_missing_values[:, feat_idx]\n if fit_mode:\n X_train = _safe_indexing(X_filled[:, neighbor_feat_idx], ~missing_row_mask)\n y_train = _safe_indexing(X_filled[:, feat_idx], ~missing_row_mask)\n estimator.fit(X_train, y_train)\n if np.sum(missing_row_mask) == 0:\n return X_filled, estimator\n X_test = _safe_indexing(X_filled[:, neighbor_feat_idx], missing_row_mask)\n if self.sample_posterior:\n (mus, sigmas) = estimator.predict(X_test, return_std=True)\n imputed_values = np.zeros(mus.shape, dtype=X_filled.dtype)\n positive_sigmas = sigmas > 0\n imputed_values[~positive_sigmas] = mus[~positive_sigmas]\n mus_too_low = mus < self._min_value[feat_idx]\n imputed_values[mus_too_low] = self._min_value[feat_idx]\n mus_too_high = mus > self._max_value[feat_idx]\n imputed_values[mus_too_high] = self._max_value[feat_idx]\n inrange_mask = positive_sigmas & ~mus_too_low & ~mus_too_high\n mus = mus[inrange_mask]\n sigmas = sigmas[inrange_mask]\n a = (self._min_value[feat_idx] - mus) / sigmas\n b = (self._max_value[feat_idx] - mus) / sigmas\n truncated_normal = stats.truncnorm(a=a, b=b, loc=mus, scale=sigmas)\n imputed_values[inrange_mask] = truncated_normal.rvs(random_state=self.random_state_)\n 
else:\n imputed_values = estimator.predict(X_test)\n imputed_values = np.clip(imputed_values, self._min_value[feat_idx], self._max_value[feat_idx])\n X_filled[missing_row_mask, feat_idx] = imputed_values\n return X_filled, estimator\n \n def _get_neighbor_feat_idx(self, n_features, feat_idx, abs_corr_mat):\n \"\"\"Get a list of other features to predict `feat_idx`.\n\n If `self.n_nearest_features` is less than or equal to the total\n number of features, then use a probability proportional to the absolute\n correlation between `feat_idx` and each other feature to randomly\n choose a subsample of the other features (without replacement).\n\n Parameters\n ----------\n n_features : int\n Number of features in `X`.\n\n feat_idx : int\n Index of the feature currently being imputed.\n\n abs_corr_mat : ndarray, shape (n_features, n_features)\n Absolute correlation matrix of `X`. The diagonal has been zeroed\n out and each feature has been normalized to sum to 1. Can be None.\n\n Returns\n -------\n neighbor_feat_idx : array-like\n The features to use to impute `feat_idx`.\n \"\"\"\n if self.n_nearest_features is not None and self.n_nearest_features < n_features:\n p = abs_corr_mat[:, feat_idx]\n neighbor_feat_idx = self.random_state_.choice(np.arange(n_features), self.n_nearest_features, replace=False, p=p)\n else:\n inds_left = np.arange(feat_idx)\n inds_right = np.arange(feat_idx + 1, n_features)\n neighbor_feat_idx = np.concatenate((inds_left, inds_right))\n return neighbor_feat_idx\n \n def _get_ordered_idx(self, mask_missing_values):\n \"\"\"Decide in what order we will update the features.\n\n As a homage to the MICE R package, we will have 4 main options of\n how to order the updates, and use a random order if anything else\n is specified.\n\n Also, this function skips features which have no missing values.\n\n Parameters\n ----------\n mask_missing_values : array-like, shape (n_samples, n_features)\n Input data's missing indicator matrix, where `n_samples` is the\n number of samples and `n_features` is the number of features.\n\n Returns\n -------\n ordered_idx : ndarray, shape (n_features,)\n The order in which to impute the features.\n \"\"\"\n frac_of_missing_values = mask_missing_values.mean(axis=0)\n if self.skip_complete:\n missing_values_idx = np.flatnonzero(frac_of_missing_values)\n else:\n missing_values_idx = np.arange(np.shape(frac_of_missing_values)[0])\n if self.imputation_order == 'roman':\n ordered_idx = missing_values_idx\n elif self.imputation_order == 'arabic':\n ordered_idx = missing_values_idx[::-1]\n elif self.imputation_order == 'ascending':\n n = len(frac_of_missing_values) - len(missing_values_idx)\n ordered_idx = np.argsort(frac_of_missing_values, kind='mergesort')[n:]\n elif self.imputation_order == 'descending':\n n = len(frac_of_missing_values) - len(missing_values_idx)\n ordered_idx = np.argsort(frac_of_missing_values, kind='mergesort')[n:][::-1]\n elif self.imputation_order == 'random':\n ordered_idx = missing_values_idx\n self.random_state_.shuffle(ordered_idx)\n else:\n raise ValueError(\"Got an invalid imputation order: '{0}'. 
It must be one of the following: 'roman', 'arabic', 'ascending', 'descending', or 'random'.\".format(self.imputation_order))\n return ordered_idx\n \n def _get_abs_corr_mat(self, X_filled, tolerance=1e-06):\n \"\"\"Get absolute correlation matrix between features.\n\n Parameters\n ----------\n X_filled : ndarray, shape (n_samples, n_features)\n Input data with the most recent imputations.\n\n tolerance : float, default=1e-6\n `abs_corr_mat` can have nans, which will be replaced\n with `tolerance`.\n\n Returns\n -------\n abs_corr_mat : ndarray, shape (n_features, n_features)\n Absolute correlation matrix of `X` at the beginning of the\n current round. The diagonal has been zeroed out and each feature's\n absolute correlations with all others have been normalized to sum\n to 1.\n \"\"\"\n n_features = X_filled.shape[1]\n if self.n_nearest_features is None or self.n_nearest_features >= n_features:\n return None\n with np.errstate(invalid='ignore'):\n abs_corr_mat = np.abs(np.corrcoef(X_filled.T))\n abs_corr_mat[np.isnan(abs_corr_mat)] = tolerance\n np.clip(abs_corr_mat, tolerance, None, out=abs_corr_mat)\n np.fill_diagonal(abs_corr_mat, 0)\n abs_corr_mat = normalize(abs_corr_mat, norm='l1', axis=0, copy=False)\n return abs_corr_mat\n \n def _initial_imputation(self, X, in_fit=False):\n \"\"\"Perform initial imputation for input `X`.\n\n Parameters\n ----------\n X : ndarray, shape (n_samples, n_features)\n Input data, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\n in_fit : bool, default=False\n Whether function is called in :meth:`fit`.\n\n Returns\n -------\n Xt : ndarray, shape (n_samples, n_features)\n Input data, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\n X_filled : ndarray, shape (n_samples, n_features)\n Input data with the most recent imputations.\n\n mask_missing_values : ndarray, shape (n_samples, n_features)\n Input data's missing indicator matrix, where `n_samples` is the\n number of samples and `n_features` is the number of features.\n\n X_missing_mask : ndarray, shape (n_samples, n_features)\n Input data's mask matrix indicating missing datapoints, where\n `n_samples` is the number of samples and `n_features` is the\n number of features.\n \"\"\"\n if is_scalar_nan(self.missing_values):\n force_all_finite = 'allow-nan'\n else:\n force_all_finite = True\n X = self._validate_data(X, dtype=FLOAT_DTYPES, order='F', reset=in_fit, force_all_finite=force_all_finite)\n _check_inputs_dtype(X, self.missing_values)\n X_missing_mask = _get_mask(X, self.missing_values)\n mask_missing_values = X_missing_mask.copy()\n if self.initial_imputer_ is None:\n self.initial_imputer_ = SimpleImputer(missing_values=self.missing_values, strategy=self.initial_strategy)\n X_filled = self.initial_imputer_.fit_transform(X)\n else:\n X_filled = self.initial_imputer_.transform(X)\n valid_mask = np.flatnonzero(np.logical_not(np.isnan(self.initial_imputer_.statistics_)))\n Xt = X[:, valid_mask]\n mask_missing_values = mask_missing_values[:, valid_mask]\n return Xt, X_filled, mask_missing_values, X_missing_mask\n \n @staticmethod\n def _validate_limit(limit, limit_type, n_features):\n \"\"\"Validate the limits (min/max) of the feature values.\n\n Converts scalar min/max limits to vectors of shape `(n_features,)`.\n\n Parameters\n ----------\n limit: scalar or array-like\n The user-specified limit (i.e, min_value or max_value).\n limit_type: {'max', 'min'}\n Type of limit to validate.\n n_features: int\n Number of 
features in the dataset.\n\n Returns\n -------\n limit: ndarray, shape(n_features,)\n Array of limits, one for each feature.\n \"\"\"\n limit_bound = np.inf if limit_type == 'max' else -np.inf\n limit = limit_bound if limit is None else limit\n if np.isscalar(limit):\n limit = np.full(n_features, limit)\n limit = check_array(limit, force_all_finite=False, copy=False, ensure_2d=False)\n if not limit.shape[0] == n_features:\n raise ValueError(f\"'{limit_type}_value' should be of shape ({n_features},) when an array-like is provided. Got {limit.shape}, instead.\")\n return limit\n \n def fit_transform(self, X, y=None):\n \"\"\"Fit the imputer on `X` and return the transformed `X`.\n\n Parameters\n ----------\n X : array-like, shape (n_samples, n_features)\n Input data, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n Returns\n -------\n Xt : array-like, shape (n_samples, n_features)\n The imputed input data.\n \"\"\"\n self.random_state_ = getattr(self, 'random_state_', check_random_state(self.random_state))\n if self.max_iter < 0:\n raise ValueError(\"'max_iter' should be a positive integer. Got {} instead.\".format(self.max_iter))\n if self.tol < 0:\n raise ValueError(\"'tol' should be a non-negative float. Got {} instead.\".format(self.tol))\n if self.estimator is None:\n from ..linear_model import BayesianRidge\n self._estimator = BayesianRidge()\n else:\n self._estimator = clone(self.estimator)\n self.imputation_sequence_ = []\n self.initial_imputer_ = None\n (X, Xt, mask_missing_values, complete_mask) = self._initial_imputation(X, in_fit=True)\n super()._fit_indicator(complete_mask)\n X_indicator = super()._transform_indicator(complete_mask)\n if self.max_iter == 0 or np.all(mask_missing_values):\n self.n_iter_ = 0\n return super()._concatenate_indicator(Xt, X_indicator)\n if Xt.shape[1] == 1:\n self.n_iter_ = 0\n return super()._concatenate_indicator(Xt, X_indicator)\n self._min_value = self._validate_limit(self.min_value, 'min', X.shape[1])\n self._max_value = self._validate_limit(self.max_value, 'max', X.shape[1])\n if not np.all(np.greater(self._max_value, self._min_value)):\n raise ValueError('One (or more) features have min_value >= max_value.')\n ordered_idx = self._get_ordered_idx(mask_missing_values)\n self.n_features_with_missing_ = len(ordered_idx)\n abs_corr_mat = self._get_abs_corr_mat(Xt)\n (n_samples, n_features) = Xt.shape\n if self.verbose > 0:\n print('[IterativeImputer] Completing matrix with shape %s' % (X.shape, ))\n start_t = time()\n if not self.sample_posterior:\n Xt_previous = Xt.copy()\n normalized_tol = self.tol * np.max(np.abs(X[~mask_missing_values]))\n for self.n_iter_ in range(1, self.max_iter + 1):\n if self.imputation_order == 'random':\n ordered_idx = self._get_ordered_idx(mask_missing_values)\n for feat_idx in ordered_idx:\n neighbor_feat_idx = self._get_neighbor_feat_idx(n_features, feat_idx, abs_corr_mat)\n (Xt, estimator) = self._impute_one_feature(Xt, mask_missing_values, feat_idx, neighbor_feat_idx, estimator=None, fit_mode=True)\n estimator_triplet = _ImputerTriplet(feat_idx, neighbor_feat_idx, estimator)\n self.imputation_sequence_.append(estimator_triplet)\n if self.verbose > 1:\n print('[IterativeImputer] Ending imputation round %d/%d, elapsed time %0.2f' % (self.n_iter_, self.max_iter, time() - start_t))\n if not self.sample_posterior:\n inf_norm = np.linalg.norm(Xt - Xt_previous, ord=np.inf, axis=None)\n if self.verbose > 0:\n 
print('[IterativeImputer] Change: {}, scaled tolerance: {} '.format(inf_norm, normalized_tol))\n if inf_norm < normalized_tol:\n if self.verbose > 0:\n print('[IterativeImputer] Early stopping criterion reached.')\n break\n Xt_previous = Xt.copy()\n else:\n if not self.sample_posterior:\n warnings.warn('[IterativeImputer] Early stopping criterion not reached.', ConvergenceWarning)\n Xt[~mask_missing_values] = X[~mask_missing_values]\n return super()._concatenate_indicator(Xt, X_indicator)\n \n def transform(self, X):\n \"\"\"Impute all missing values in `X`.\n\n Note that this is stochastic, and that if `random_state` is not fixed,\n repeated calls, or permuted input, results will differ.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The input data to complete.\n\n Returns\n -------\n Xt : array-like, shape (n_samples, n_features)\n The imputed input data.\n \"\"\"\n check_is_fitted(self)\n (X, Xt, mask_missing_values, complete_mask) = self._initial_imputation(X)\n X_indicator = super()._transform_indicator(complete_mask)\n if self.n_iter_ == 0 or np.all(mask_missing_values):\n return super()._concatenate_indicator(Xt, X_indicator)\n imputations_per_round = len(self.imputation_sequence_) // self.n_iter_\n i_rnd = 0\n if self.verbose > 0:\n print('[IterativeImputer] Completing matrix with shape %s' % (X.shape, ))\n start_t = time()\n for (it, estimator_triplet) in enumerate(self.imputation_sequence_):\n (Xt, _) = self._impute_one_feature(Xt, mask_missing_values, estimator_triplet.feat_idx, estimator_triplet.neighbor_feat_idx, estimator=estimator_triplet.estimator, fit_mode=False)\n if not (it + 1) % imputations_per_round:\n if self.verbose > 1:\n print('[IterativeImputer] Ending imputation round %d/%d, elapsed time %0.2f' % (i_rnd + 1, self.n_iter_, time() - start_t))\n i_rnd += 1\n Xt[~mask_missing_values] = X[~mask_missing_values]\n return super()._concatenate_indicator(Xt, X_indicator)\n \n def fit(self, X, y=None):\n \"\"\"Fit the imputer on `X` and return self.\n\n Parameters\n ----------\n X : array-like, shape (n_samples, n_features)\n Input data, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n Returns\n -------\n self : object\n Fitted estimator.\n \"\"\"\n self.fit_transform(X)\n return self\n" }, @@ -23146,7 +23213,7 @@ "sklearn.impute._knn.KNNImputer.transform" ], "is_public": true, - "description": "Imputation for completing missing values using k-Nearest Neighbors.\n\nEach sample's missing values are imputed using the mean value from `n_neighbors` nearest neighbors found in the training set. Two samples are close if the features that neither is missing are close. Read more in the :ref:`User Guide `. .. versionadded:: 0.22", + "description": "Imputation for completing missing values using k-Nearest Neighbors.\n\nEach sample's missing values are imputed using the mean value from\n`n_neighbors` nearest neighbors found in the training set. Two samples are\nclose if the features that neither is missing are close.\n\nRead more in the :ref:`User Guide `.\n\n.. versionadded:: 0.22", "docstring": "Imputation for completing missing values using k-Nearest Neighbors.\n\n Each sample's missing values are imputed using the mean value from\n `n_neighbors` nearest neighbors found in the training set. Two samples are\n close if the features that neither is missing are close.\n\n Read more in the :ref:`User Guide `.\n\n .. 
versionadded:: 0.22\n\n Parameters\n ----------\n missing_values : int, float, str, np.nan or None, default=np.nan\n The placeholder for the missing values. All occurrences of\n `missing_values` will be imputed. For pandas' dataframes with\n nullable integer dtypes with missing values, `missing_values`\n should be set to np.nan, since `pd.NA` will be converted to np.nan.\n\n n_neighbors : int, default=5\n Number of neighboring samples to use for imputation.\n\n weights : {'uniform', 'distance'} or callable, default='uniform'\n Weight function used in prediction. Possible values:\n\n - 'uniform' : uniform weights. All points in each neighborhood are\n weighted equally.\n - 'distance' : weight points by the inverse of their distance.\n in this case, closer neighbors of a query point will have a\n greater influence than neighbors which are further away.\n - callable : a user-defined function which accepts an\n array of distances, and returns an array of the same shape\n containing the weights.\n\n metric : {'nan_euclidean'} or callable, default='nan_euclidean'\n Distance metric for searching neighbors. Possible values:\n\n - 'nan_euclidean'\n - callable : a user-defined function which conforms to the definition\n of ``_pairwise_callable(X, Y, metric, **kwds)``. The function\n accepts two arrays, X and Y, and a `missing_values` keyword in\n `kwds` and returns a scalar distance value.\n\n copy : bool, default=True\n If True, a copy of X will be created. If False, imputation will\n be done in-place whenever possible.\n\n add_indicator : bool, default=False\n If True, a :class:`MissingIndicator` transform will stack onto the\n output of the imputer's transform. This allows a predictive estimator\n to account for missingness despite imputation. If a feature has no\n missing values at fit/train time, the feature won't appear on the\n missing indicator even if there are missing values at transform/test\n time.\n\n Attributes\n ----------\n indicator_ : :class:`~sklearn.impute.MissingIndicator`\n Indicator used to add binary indicators for missing values.\n ``None`` if add_indicator is False.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n SimpleImputer : Imputation transformer for completing missing values\n with simple strategies.\n IterativeImputer : Multivariate imputer that estimates each feature\n from all the others.\n\n References\n ----------\n * Olga Troyanskaya, Michael Cantor, Gavin Sherlock, Pat Brown, Trevor\n Hastie, Robert Tibshirani, David Botstein and Russ B. Altman, Missing\n value estimation methods for DNA microarrays, BIOINFORMATICS Vol. 17\n no. 6, 2001 Pages 520-525.\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.impute import KNNImputer\n >>> X = [[1, 2, np.nan], [3, 4, 3], [np.nan, 6, 5], [8, 8, 7]]\n >>> imputer = KNNImputer(n_neighbors=2)\n >>> imputer.fit_transform(X)\n array([[1. , 2. , 4. ],\n [3. , 4. , 3. ],\n [5.5, 6. , 5. ],\n [8. , 8. , 7. ]])\n ", "source_code": "\n\nclass KNNImputer(_BaseImputer):\n \"\"\"Imputation for completing missing values using k-Nearest Neighbors.\n\n Each sample's missing values are imputed using the mean value from\n `n_neighbors` nearest neighbors found in the training set. 
Two samples are\n close if the features that neither is missing are close.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.22\n\n Parameters\n ----------\n missing_values : int, float, str, np.nan or None, default=np.nan\n The placeholder for the missing values. All occurrences of\n `missing_values` will be imputed. For pandas' dataframes with\n nullable integer dtypes with missing values, `missing_values`\n should be set to np.nan, since `pd.NA` will be converted to np.nan.\n\n n_neighbors : int, default=5\n Number of neighboring samples to use for imputation.\n\n weights : {'uniform', 'distance'} or callable, default='uniform'\n Weight function used in prediction. Possible values:\n\n - 'uniform' : uniform weights. All points in each neighborhood are\n weighted equally.\n - 'distance' : weight points by the inverse of their distance.\n in this case, closer neighbors of a query point will have a\n greater influence than neighbors which are further away.\n - callable : a user-defined function which accepts an\n array of distances, and returns an array of the same shape\n containing the weights.\n\n metric : {'nan_euclidean'} or callable, default='nan_euclidean'\n Distance metric for searching neighbors. Possible values:\n\n - 'nan_euclidean'\n - callable : a user-defined function which conforms to the definition\n of ``_pairwise_callable(X, Y, metric, **kwds)``. The function\n accepts two arrays, X and Y, and a `missing_values` keyword in\n `kwds` and returns a scalar distance value.\n\n copy : bool, default=True\n If True, a copy of X will be created. If False, imputation will\n be done in-place whenever possible.\n\n add_indicator : bool, default=False\n If True, a :class:`MissingIndicator` transform will stack onto the\n output of the imputer's transform. This allows a predictive estimator\n to account for missingness despite imputation. If a feature has no\n missing values at fit/train time, the feature won't appear on the\n missing indicator even if there are missing values at transform/test\n time.\n\n Attributes\n ----------\n indicator_ : :class:`~sklearn.impute.MissingIndicator`\n Indicator used to add binary indicators for missing values.\n ``None`` if add_indicator is False.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n SimpleImputer : Imputation transformer for completing missing values\n with simple strategies.\n IterativeImputer : Multivariate imputer that estimates each feature\n from all the others.\n\n References\n ----------\n * Olga Troyanskaya, Michael Cantor, Gavin Sherlock, Pat Brown, Trevor\n Hastie, Robert Tibshirani, David Botstein and Russ B. Altman, Missing\n value estimation methods for DNA microarrays, BIOINFORMATICS Vol. 17\n no. 6, 2001 Pages 520-525.\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.impute import KNNImputer\n >>> X = [[1, 2, np.nan], [3, 4, 3], [np.nan, 6, 5], [8, 8, 7]]\n >>> imputer = KNNImputer(n_neighbors=2)\n >>> imputer.fit_transform(X)\n array([[1. , 2. , 4. ],\n [3. , 4. , 3. ],\n [5.5, 6. , 5. ],\n [8. , 8. , 7. 
]])\n \"\"\"\n \n def __init__(self, *, missing_values=np.nan, n_neighbors=5, weights='uniform', metric='nan_euclidean', copy=True, add_indicator=False):\n super().__init__(missing_values=missing_values, add_indicator=add_indicator)\n self.n_neighbors = n_neighbors\n self.weights = weights\n self.metric = metric\n self.copy = copy\n \n def _calc_impute(self, dist_pot_donors, n_neighbors, fit_X_col, mask_fit_X_col):\n \"\"\"Helper function to impute a single column.\n\n Parameters\n ----------\n dist_pot_donors : ndarray of shape (n_receivers, n_potential_donors)\n Distance matrix between the receivers and potential donors from\n training set. There must be at least one non-nan distance between\n a receiver and a potential donor.\n\n n_neighbors : int\n Number of neighbors to consider.\n\n fit_X_col : ndarray of shape (n_potential_donors,)\n Column of potential donors from training set.\n\n mask_fit_X_col : ndarray of shape (n_potential_donors,)\n Missing mask for fit_X_col.\n\n Returns\n -------\n imputed_values: ndarray of shape (n_receivers,)\n Imputed values for receiver.\n \"\"\"\n donors_idx = np.argpartition(dist_pot_donors, n_neighbors - 1, axis=1)[:, :n_neighbors]\n donors_dist = dist_pot_donors[np.arange(donors_idx.shape[0])[:, None], donors_idx]\n weight_matrix = _get_weights(donors_dist, self.weights)\n if weight_matrix is not None:\n weight_matrix[np.isnan(weight_matrix)] = 0.0\n donors = fit_X_col.take(donors_idx)\n donors_mask = mask_fit_X_col.take(donors_idx)\n donors = np.ma.array(donors, mask=donors_mask)\n return np.ma.average(donors, axis=1, weights=weight_matrix).data\n \n def fit(self, X, y=None):\n \"\"\"Fit the imputer on X.\n\n Parameters\n ----------\n X : array-like shape of (n_samples, n_features)\n Input data, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\n y : Ignored\n Not used, present here for API consistency by convention.\n\n Returns\n -------\n self : object\n The fitted `KNNImputer` class instance.\n \"\"\"\n if not is_scalar_nan(self.missing_values):\n force_all_finite = True\n else:\n force_all_finite = 'allow-nan'\n if self.metric not in _NAN_METRICS and not callable(self.metric):\n raise ValueError('The selected metric does not support NaN values')\n if self.n_neighbors <= 0:\n raise ValueError('Expected n_neighbors > 0. Got {}'.format(self.n_neighbors))\n X = self._validate_data(X, accept_sparse=False, dtype=FLOAT_DTYPES, force_all_finite=force_all_finite, copy=self.copy)\n _check_weights(self.weights)\n self._fit_X = X\n self._mask_fit_X = _get_mask(self._fit_X, self.missing_values)\n super()._fit_indicator(self._mask_fit_X)\n return self\n \n def transform(self, X):\n \"\"\"Impute all missing values in X.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The input data to complete.\n\n Returns\n -------\n X : array-like of shape (n_samples, n_output_features)\n The imputed dataset. 
`n_output_features` is the number of features\n that is not always missing during `fit`.\n \"\"\"\n check_is_fitted(self)\n if not is_scalar_nan(self.missing_values):\n force_all_finite = True\n else:\n force_all_finite = 'allow-nan'\n X = self._validate_data(X, accept_sparse=False, dtype=FLOAT_DTYPES, force_all_finite=force_all_finite, copy=self.copy, reset=False)\n mask = _get_mask(X, self.missing_values)\n mask_fit_X = self._mask_fit_X\n valid_mask = ~np.all(mask_fit_X, axis=0)\n X_indicator = super()._transform_indicator(mask)\n if not np.any(mask):\n return X[:, valid_mask]\n row_missing_idx = np.flatnonzero(mask.any(axis=1))\n non_missing_fix_X = np.logical_not(mask_fit_X)\n dist_idx_map = np.zeros(X.shape[0], dtype=int)\n dist_idx_map[row_missing_idx] = np.arange(row_missing_idx.shape[0])\n \n def process_chunk(dist_chunk, start):\n row_missing_chunk = row_missing_idx[start:start + len(dist_chunk)]\n for col in range(X.shape[1]):\n if not valid_mask[col]:\n continue\n col_mask = mask[row_missing_chunk, col]\n if not np.any(col_mask):\n continue\n (potential_donors_idx, ) = np.nonzero(non_missing_fix_X[:, col])\n receivers_idx = row_missing_chunk[np.flatnonzero(col_mask)]\n dist_subset = dist_chunk[dist_idx_map[receivers_idx] - start][:, potential_donors_idx]\n all_nan_dist_mask = np.isnan(dist_subset).all(axis=1)\n all_nan_receivers_idx = receivers_idx[all_nan_dist_mask]\n if all_nan_receivers_idx.size:\n col_mean = np.ma.array(self._fit_X[:, col], mask=mask_fit_X[:, col]).mean()\n X[all_nan_receivers_idx, col] = col_mean\n if len(all_nan_receivers_idx) == len(receivers_idx):\n continue\n receivers_idx = receivers_idx[~all_nan_dist_mask]\n dist_subset = dist_chunk[dist_idx_map[receivers_idx] - start][:, potential_donors_idx]\n n_neighbors = min(self.n_neighbors, len(potential_donors_idx))\n value = self._calc_impute(dist_subset, n_neighbors, self._fit_X[potential_donors_idx, col], mask_fit_X[potential_donors_idx, col])\n X[receivers_idx, col] = value\n gen = pairwise_distances_chunked(X[row_missing_idx, :], self._fit_X, metric=self.metric, missing_values=self.missing_values, force_all_finite=force_all_finite, reduce_func=process_chunk)\n for chunk in gen:\n pass\n return super()._concatenate_indicator(X[:, valid_mask], X_indicator)\n" }, @@ -23166,7 +23233,7 @@ "sklearn.inspection._plot.partial_dependence.PartialDependenceDisplay.plot" ], "is_public": true, - "description": "Partial Dependence Plot (PDP).\n\nThis can also display individual partial dependencies which are often referred to as: Individual Condition Expectation (ICE). It is recommended to use :func:`~sklearn.inspection.PartialDependenceDisplay.from_estimator` to create a :class:`~sklearn.inspection.PartialDependenceDisplay`. All parameters are stored as attributes. Read more in :ref:`sphx_glr_auto_examples_miscellaneous_plot_partial_dependence_visualization_api.py` and the :ref:`User Guide `. .. versionadded:: 0.22", + "description": "Partial Dependence Plot (PDP).\n\nThis can also display individual partial dependencies which are often\nreferred to as: Individual Condition Expectation (ICE).\n\nIt is recommended to use\n:func:`~sklearn.inspection.PartialDependenceDisplay.from_estimator` to create a\n:class:`~sklearn.inspection.PartialDependenceDisplay`. All parameters are\nstored as attributes.\n\nRead more in\n:ref:`sphx_glr_auto_examples_miscellaneous_plot_partial_dependence_visualization_api.py`\nand the :ref:`User Guide `.\n\n .. 
versionadded:: 0.22", "docstring": "Partial Dependence Plot (PDP).\n\n This can also display individual partial dependencies which are often\n referred to as: Individual Condition Expectation (ICE).\n\n It is recommended to use\n :func:`~sklearn.inspection.PartialDependenceDisplay.from_estimator` to create a\n :class:`~sklearn.inspection.PartialDependenceDisplay`. All parameters are\n stored as attributes.\n\n Read more in\n :ref:`sphx_glr_auto_examples_miscellaneous_plot_partial_dependence_visualization_api.py`\n and the :ref:`User Guide `.\n\n .. versionadded:: 0.22\n\n Parameters\n ----------\n pd_results : list of Bunch\n Results of :func:`~sklearn.inspection.partial_dependence` for\n ``features``.\n\n features : list of (int,) or list of (int, int)\n Indices of features for a given plot. A tuple of one integer will plot\n a partial dependence curve of one feature. A tuple of two integers will\n plot a two-way partial dependence curve as a contour plot.\n\n feature_names : list of str\n Feature names corresponding to the indices in ``features``.\n\n target_idx : int\n\n - In a multiclass setting, specifies the class for which the PDPs\n should be computed. Note that for binary classification, the\n positive class (index 1) is always used.\n - In a multioutput setting, specifies the task for which the PDPs\n should be computed.\n\n Ignored in binary classification or classical regression settings.\n\n pdp_lim : dict\n Global min and max average predictions, such that all plots will have\n the same scale and y limits. `pdp_lim[1]` is the global min and max for\n single partial dependence curves. `pdp_lim[2]` is the global min and\n max for two-way partial dependence curves.\n\n deciles : dict\n Deciles for feature indices in ``features``.\n\n kind : {'average', 'individual', 'both'}, default='average'\n Whether to plot the partial dependence averaged across all the samples\n in the dataset or one line per sample or both.\n\n - ``kind='average'`` results in the traditional PD plot;\n - ``kind='individual'`` results in the ICE plot.\n\n Note that the fast ``method='recursion'`` option is only available for\n ``kind='average'``. Plotting individual dependencies requires using the\n slower ``method='brute'`` option.\n\n .. versionadded:: 0.24\n\n subsample : float, int or None, default=1000\n Sampling for ICE curves when `kind` is 'individual' or 'both'.\n If float, should be between 0.0 and 1.0 and represent the proportion\n of the dataset to be used to plot ICE curves. If int, represents the\n maximum absolute number of samples to use.\n\n Note that the full dataset is still used to calculate partial\n dependence when `kind='both'`.\n\n .. versionadded:: 0.24\n\n random_state : int, RandomState instance or None, default=None\n Controls the randomness of the selected samples when subsamples is not\n `None`. See :term:`Glossary ` for details.\n\n .. versionadded:: 0.24\n\n Attributes\n ----------\n bounding_ax_ : matplotlib Axes or None\n If `ax` is an axes or None, the `bounding_ax_` is the axes where the\n grid of partial dependence plots are drawn. If `ax` is a list of axes\n or a numpy array of axes, `bounding_ax_` is None.\n\n axes_ : ndarray of matplotlib Axes\n If `ax` is an axes or None, `axes_[i, j]` is the axes on the i-th row\n and j-th column. If `ax` is a list of axes, `axes_[i]` is the i-th item\n in `ax`. 
Elements that are None correspond to a nonexisting axes in\n that position.\n\n lines_ : ndarray of matplotlib Artists\n If `ax` is an axes or None, `lines_[i, j]` is the partial dependence\n curve on the i-th row and j-th column. If `ax` is a list of axes,\n `lines_[i]` is the partial dependence curve corresponding to the i-th\n item in `ax`. Elements that are None correspond to a nonexisting axes\n or an axes that does not include a line plot.\n\n deciles_vlines_ : ndarray of matplotlib LineCollection\n If `ax` is an axes or None, `vlines_[i, j]` is the line collection\n representing the x axis deciles of the i-th row and j-th column. If\n `ax` is a list of axes, `vlines_[i]` corresponds to the i-th item in\n `ax`. Elements that are None correspond to a nonexisting axes or an\n axes that does not include a PDP plot.\n\n .. versionadded:: 0.23\n\n deciles_hlines_ : ndarray of matplotlib LineCollection\n If `ax` is an axes or None, `vlines_[i, j]` is the line collection\n representing the y axis deciles of the i-th row and j-th column. If\n `ax` is a list of axes, `vlines_[i]` corresponds to the i-th item in\n `ax`. Elements that are None correspond to a nonexisting axes or an\n axes that does not include a 2-way plot.\n\n .. versionadded:: 0.23\n\n contours_ : ndarray of matplotlib Artists\n If `ax` is an axes or None, `contours_[i, j]` is the partial dependence\n plot on the i-th row and j-th column. If `ax` is a list of axes,\n `contours_[i]` is the partial dependence plot corresponding to the i-th\n item in `ax`. Elements that are None correspond to a nonexisting axes\n or an axes that does not include a contour plot.\n\n figure_ : matplotlib Figure\n Figure containing partial dependence plots.\n\n See Also\n --------\n partial_dependence : Compute Partial Dependence values.\n PartialDependenceDisplay.from_estimator : Plot Partial Dependence.\n ", "source_code": "\n\nclass PartialDependenceDisplay:\n \"\"\"Partial Dependence Plot (PDP).\n\n This can also display individual partial dependencies which are often\n referred to as: Individual Condition Expectation (ICE).\n\n It is recommended to use\n :func:`~sklearn.inspection.PartialDependenceDisplay.from_estimator` to create a\n :class:`~sklearn.inspection.PartialDependenceDisplay`. All parameters are\n stored as attributes.\n\n Read more in\n :ref:`sphx_glr_auto_examples_miscellaneous_plot_partial_dependence_visualization_api.py`\n and the :ref:`User Guide `.\n\n .. versionadded:: 0.22\n\n Parameters\n ----------\n pd_results : list of Bunch\n Results of :func:`~sklearn.inspection.partial_dependence` for\n ``features``.\n\n features : list of (int,) or list of (int, int)\n Indices of features for a given plot. A tuple of one integer will plot\n a partial dependence curve of one feature. A tuple of two integers will\n plot a two-way partial dependence curve as a contour plot.\n\n feature_names : list of str\n Feature names corresponding to the indices in ``features``.\n\n target_idx : int\n\n - In a multiclass setting, specifies the class for which the PDPs\n should be computed. Note that for binary classification, the\n positive class (index 1) is always used.\n - In a multioutput setting, specifies the task for which the PDPs\n should be computed.\n\n Ignored in binary classification or classical regression settings.\n\n pdp_lim : dict\n Global min and max average predictions, such that all plots will have\n the same scale and y limits. `pdp_lim[1]` is the global min and max for\n single partial dependence curves. 
`pdp_lim[2]` is the global min and\n max for two-way partial dependence curves.\n\n deciles : dict\n Deciles for feature indices in ``features``.\n\n kind : {'average', 'individual', 'both'}, default='average'\n Whether to plot the partial dependence averaged across all the samples\n in the dataset or one line per sample or both.\n\n - ``kind='average'`` results in the traditional PD plot;\n - ``kind='individual'`` results in the ICE plot.\n\n Note that the fast ``method='recursion'`` option is only available for\n ``kind='average'``. Plotting individual dependencies requires using the\n slower ``method='brute'`` option.\n\n .. versionadded:: 0.24\n\n subsample : float, int or None, default=1000\n Sampling for ICE curves when `kind` is 'individual' or 'both'.\n If float, should be between 0.0 and 1.0 and represent the proportion\n of the dataset to be used to plot ICE curves. If int, represents the\n maximum absolute number of samples to use.\n\n Note that the full dataset is still used to calculate partial\n dependence when `kind='both'`.\n\n .. versionadded:: 0.24\n\n random_state : int, RandomState instance or None, default=None\n Controls the randomness of the selected samples when subsamples is not\n `None`. See :term:`Glossary ` for details.\n\n .. versionadded:: 0.24\n\n Attributes\n ----------\n bounding_ax_ : matplotlib Axes or None\n If `ax` is an axes or None, the `bounding_ax_` is the axes where the\n grid of partial dependence plots are drawn. If `ax` is a list of axes\n or a numpy array of axes, `bounding_ax_` is None.\n\n axes_ : ndarray of matplotlib Axes\n If `ax` is an axes or None, `axes_[i, j]` is the axes on the i-th row\n and j-th column. If `ax` is a list of axes, `axes_[i]` is the i-th item\n in `ax`. Elements that are None correspond to a nonexisting axes in\n that position.\n\n lines_ : ndarray of matplotlib Artists\n If `ax` is an axes or None, `lines_[i, j]` is the partial dependence\n curve on the i-th row and j-th column. If `ax` is a list of axes,\n `lines_[i]` is the partial dependence curve corresponding to the i-th\n item in `ax`. Elements that are None correspond to a nonexisting axes\n or an axes that does not include a line plot.\n\n deciles_vlines_ : ndarray of matplotlib LineCollection\n If `ax` is an axes or None, `vlines_[i, j]` is the line collection\n representing the x axis deciles of the i-th row and j-th column. If\n `ax` is a list of axes, `vlines_[i]` corresponds to the i-th item in\n `ax`. Elements that are None correspond to a nonexisting axes or an\n axes that does not include a PDP plot.\n\n .. versionadded:: 0.23\n\n deciles_hlines_ : ndarray of matplotlib LineCollection\n If `ax` is an axes or None, `vlines_[i, j]` is the line collection\n representing the y axis deciles of the i-th row and j-th column. If\n `ax` is a list of axes, `vlines_[i]` corresponds to the i-th item in\n `ax`. Elements that are None correspond to a nonexisting axes or an\n axes that does not include a 2-way plot.\n\n .. versionadded:: 0.23\n\n contours_ : ndarray of matplotlib Artists\n If `ax` is an axes or None, `contours_[i, j]` is the partial dependence\n plot on the i-th row and j-th column. If `ax` is a list of axes,\n `contours_[i]` is the partial dependence plot corresponding to the i-th\n item in `ax`. 
Elements that are None correspond to a nonexisting axes\n or an axes that does not include a contour plot.\n\n figure_ : matplotlib Figure\n Figure containing partial dependence plots.\n\n See Also\n --------\n partial_dependence : Compute Partial Dependence values.\n PartialDependenceDisplay.from_estimator : Plot Partial Dependence.\n \"\"\"\n \n def __init__(self, pd_results, *, features, feature_names, target_idx, pdp_lim, deciles, kind='average', subsample=1000, random_state=None):\n self.pd_results = pd_results\n self.features = features\n self.feature_names = feature_names\n self.target_idx = target_idx\n self.pdp_lim = pdp_lim\n self.deciles = deciles\n self.kind = kind\n self.subsample = subsample\n self.random_state = random_state\n \n @classmethod\n def from_estimator(cls, estimator, X, features, *, feature_names=None, target=None, response_method='auto', n_cols=3, grid_resolution=100, percentiles=(0.05, 0.95), method='auto', n_jobs=None, verbose=0, line_kw=None, ice_lines_kw=None, pd_line_kw=None, contour_kw=None, ax=None, kind='average', subsample=1000, random_state=None):\n \"\"\"Partial dependence (PD) and individual conditional expectation (ICE) plots.\n\n Partial dependence plots, individual conditional expectation plots or an\n overlay of both of them can be plotted by setting the ``kind``\n parameter. The ``len(features)`` plots are arranged in a grid with\n ``n_cols`` columns. Two-way partial dependence plots are plotted as\n contour plots. The deciles of the feature values will be shown with tick\n marks on the x-axes for one-way plots, and on both axes for two-way\n plots.\n\n Read more in the :ref:`User Guide `.\n\n .. note::\n\n :func:`PartialDependenceDisplay.from_estimator` does not support using the\n same axes with multiple calls. To plot the the partial dependence for\n multiple estimators, please pass the axes created by the first call to the\n second call::\n\n >>> from sklearn.inspection import PartialDependenceDisplay\n >>> from sklearn.datasets import make_friedman1\n >>> from sklearn.linear_model import LinearRegression\n >>> from sklearn.ensemble import RandomForestRegressor\n >>> X, y = make_friedman1()\n >>> est1 = LinearRegression().fit(X, y)\n >>> est2 = RandomForestRegressor().fit(X, y)\n >>> disp1 = PartialDependenceDisplay.from_estimator(est1, X,\n ... [1, 2])\n >>> disp2 = PartialDependenceDisplay.from_estimator(est2, X, [1, 2],\n ... ax=disp1.axes_)\n\n .. warning::\n\n For :class:`~sklearn.ensemble.GradientBoostingClassifier` and\n :class:`~sklearn.ensemble.GradientBoostingRegressor`, the\n `'recursion'` method (used by default) will not account for the `init`\n predictor of the boosting process. In practice, this will produce\n the same values as `'brute'` up to a constant offset in the target\n response, provided that `init` is a constant estimator (which is the\n default). However, if `init` is not a constant estimator, the\n partial dependence values are incorrect for `'recursion'` because the\n offset will be sample-dependent. It is preferable to use the `'brute'`\n method. Note that this only applies to\n :class:`~sklearn.ensemble.GradientBoostingClassifier` and\n :class:`~sklearn.ensemble.GradientBoostingRegressor`, not to\n :class:`~sklearn.ensemble.HistGradientBoostingClassifier` and\n :class:`~sklearn.ensemble.HistGradientBoostingRegressor`.\n\n .. 
versionadded:: 1.0\n\n Parameters\n ----------\n estimator : BaseEstimator\n A fitted estimator object implementing :term:`predict`,\n :term:`predict_proba`, or :term:`decision_function`.\n Multioutput-multiclass classifiers are not supported.\n\n X : {array-like, dataframe} of shape (n_samples, n_features)\n ``X`` is used to generate a grid of values for the target\n ``features`` (where the partial dependence will be evaluated), and\n also to generate values for the complement features when the\n `method` is `'brute'`.\n\n features : list of {int, str, pair of int, pair of str}\n The target features for which to create the PDPs.\n If `features[i]` is an integer or a string, a one-way PDP is created;\n if `features[i]` is a tuple, a two-way PDP is created (only supported\n with `kind='average'`). Each tuple must be of size 2.\n if any entry is a string, then it must be in ``feature_names``.\n\n feature_names : array-like of shape (n_features,), dtype=str, default=None\n Name of each feature; `feature_names[i]` holds the name of the feature\n with index `i`.\n By default, the name of the feature corresponds to their numerical\n index for NumPy array and their column name for pandas dataframe.\n\n target : int, default=None\n - In a multiclass setting, specifies the class for which the PDPs\n should be computed. Note that for binary classification, the\n positive class (index 1) is always used.\n - In a multioutput setting, specifies the task for which the PDPs\n should be computed.\n\n Ignored in binary classification or classical regression settings.\n\n response_method : {'auto', 'predict_proba', 'decision_function'}, default='auto'\n Specifies whether to use :term:`predict_proba` or\n :term:`decision_function` as the target response. For regressors\n this parameter is ignored and the response is always the output of\n :term:`predict`. By default, :term:`predict_proba` is tried first\n and we revert to :term:`decision_function` if it doesn't exist. If\n ``method`` is `'recursion'`, the response is always the output of\n :term:`decision_function`.\n\n n_cols : int, default=3\n The maximum number of columns in the grid plot. Only active when `ax`\n is a single axis or `None`.\n\n grid_resolution : int, default=100\n The number of equally spaced points on the axes of the plots, for each\n target feature.\n\n percentiles : tuple of float, default=(0.05, 0.95)\n The lower and upper percentile used to create the extreme values\n for the PDP axes. Must be in [0, 1].\n\n method : str, default='auto'\n The method used to calculate the averaged predictions:\n\n - `'recursion'` is only supported for some tree-based estimators\n (namely\n :class:`~sklearn.ensemble.GradientBoostingClassifier`,\n :class:`~sklearn.ensemble.GradientBoostingRegressor`,\n :class:`~sklearn.ensemble.HistGradientBoostingClassifier`,\n :class:`~sklearn.ensemble.HistGradientBoostingRegressor`,\n :class:`~sklearn.tree.DecisionTreeRegressor`,\n :class:`~sklearn.ensemble.RandomForestRegressor`\n but is more efficient in terms of speed.\n With this method, the target response of a\n classifier is always the decision function, not the predicted\n probabilities. 
Since the `'recursion'` method implicitly computes\n the average of the ICEs by design, it is not compatible with ICE and\n thus `kind` must be `'average'`.\n\n - `'brute'` is supported for any estimator, but is more\n computationally intensive.\n\n - `'auto'`: the `'recursion'` is used for estimators that support it,\n and `'brute'` is used otherwise.\n\n Please see :ref:`this note ` for\n differences between the `'brute'` and `'recursion'` method.\n\n n_jobs : int, default=None\n The number of CPUs to use to compute the partial dependences.\n Computation is parallelized over features specified by the `features`\n parameter.\n\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n verbose : int, default=0\n Verbose output during PD computations.\n\n line_kw : dict, default=None\n Dict with keywords passed to the ``matplotlib.pyplot.plot`` call.\n For one-way partial dependence plots. It can be used to define common\n properties for both `ice_lines_kw` and `pdp_line_kw`.\n\n ice_lines_kw : dict, default=None\n Dictionary with keywords passed to the `matplotlib.pyplot.plot` call.\n For ICE lines in the one-way partial dependence plots.\n The key value pairs defined in `ice_lines_kw` takes priority over\n `line_kw`.\n\n pd_line_kw : dict, default=None\n Dictionary with keywords passed to the `matplotlib.pyplot.plot` call.\n For partial dependence in one-way partial dependence plots.\n The key value pairs defined in `pd_line_kw` takes priority over\n `line_kw`.\n\n contour_kw : dict, default=None\n Dict with keywords passed to the ``matplotlib.pyplot.contourf`` call.\n For two-way partial dependence plots.\n\n ax : Matplotlib axes or array-like of Matplotlib axes, default=None\n - If a single axis is passed in, it is treated as a bounding axes\n and a grid of partial dependence plots will be drawn within\n these bounds. The `n_cols` parameter controls the number of\n columns in the grid.\n - If an array-like of axes are passed in, the partial dependence\n plots will be drawn directly into these axes.\n - If `None`, a figure and a bounding axes is created and treated\n as the single axes case.\n\n kind : {'average', 'individual', 'both'}, default='average'\n Whether to plot the partial dependence averaged across all the samples\n in the dataset or one line per sample or both.\n\n - ``kind='average'`` results in the traditional PD plot;\n - ``kind='individual'`` results in the ICE plot.\n\n Note that the fast ``method='recursion'`` option is only available for\n ``kind='average'``. Plotting individual dependencies requires using the\n slower ``method='brute'`` option.\n\n subsample : float, int or None, default=1000\n Sampling for ICE curves when `kind` is 'individual' or 'both'.\n If `float`, should be between 0.0 and 1.0 and represent the proportion\n of the dataset to be used to plot ICE curves. 
If `int`, represents the\n absolute number samples to use.\n\n Note that the full dataset is still used to calculate averaged partial\n dependence when `kind='both'`.\n\n random_state : int, RandomState instance or None, default=None\n Controls the randomness of the selected samples when subsamples is not\n `None` and `kind` is either `'both'` or `'individual'`.\n See :term:`Glossary ` for details.\n\n Returns\n -------\n display : :class:`~sklearn.inspection.PartialDependenceDisplay`\n\n See Also\n --------\n partial_dependence : Compute Partial Dependence values.\n\n Examples\n --------\n >>> import matplotlib.pyplot as plt\n >>> from sklearn.datasets import make_friedman1\n >>> from sklearn.ensemble import GradientBoostingRegressor\n >>> from sklearn.inspection import PartialDependenceDisplay\n >>> X, y = make_friedman1()\n >>> clf = GradientBoostingRegressor(n_estimators=10).fit(X, y)\n >>> PartialDependenceDisplay.from_estimator(clf, X, [0, (0, 1)])\n <...>\n >>> plt.show()\n \"\"\"\n check_matplotlib_support(f'{cls.__name__}.from_estimator')\n return _plot_partial_dependence(estimator, X, features, feature_names=feature_names, target=target, response_method=response_method, n_cols=n_cols, grid_resolution=grid_resolution, percentiles=percentiles, method=method, n_jobs=n_jobs, verbose=verbose, line_kw=line_kw, ice_lines_kw=ice_lines_kw, pd_line_kw=pd_line_kw, contour_kw=contour_kw, ax=ax, kind=kind, subsample=subsample, random_state=random_state)\n \n def _get_sample_count(self, n_samples):\n \"\"\"Compute the number of samples as an integer.\"\"\"\n if isinstance(self.subsample, numbers.Integral):\n if self.subsample < n_samples:\n return self.subsample\n return n_samples\n elif isinstance(self.subsample, numbers.Real):\n return ceil(n_samples * self.subsample)\n return n_samples\n \n def _plot_ice_lines(self, preds, feature_values, n_ice_to_plot, ax, pd_plot_idx, n_total_lines_by_plot, individual_line_kw):\n \"\"\"Plot the ICE lines.\n\n Parameters\n ----------\n preds : ndarray of shape (n_instances, n_grid_points)\n The predictions computed for all points of `feature_values` for a\n given feature for all samples in `X`.\n feature_values : ndarray of shape (n_grid_points,)\n The feature values for which the predictions have been computed.\n n_ice_to_plot : int\n The number of ICE lines to plot.\n ax : Matplotlib axes\n The axis on which to plot the ICE lines.\n pd_plot_idx : int\n The sequential index of the plot. 
It will be unraveled to find the\n matching 2D position in the grid layout.\n n_total_lines_by_plot : int\n The total number of lines expected to be plot on the axis.\n individual_line_kw : dict\n Dict with keywords passed when plotting the ICE lines.\n \"\"\"\n rng = check_random_state(self.random_state)\n ice_lines_idx = rng.choice(preds.shape[0], n_ice_to_plot, replace=False)\n ice_lines_subsampled = preds[ice_lines_idx, :]\n for (ice_idx, ice) in enumerate(ice_lines_subsampled):\n line_idx = np.unravel_index(pd_plot_idx * n_total_lines_by_plot + ice_idx, self.lines_.shape)\n self.lines_[line_idx] = ax.plot(feature_values, ice.ravel(), **individual_line_kw)[0]\n \n def _plot_average_dependence(self, avg_preds, feature_values, ax, pd_line_idx, line_kw):\n \"\"\"Plot the average partial dependence.\n\n Parameters\n ----------\n avg_preds : ndarray of shape (n_grid_points,)\n The average predictions for all points of `feature_values` for a\n given feature for all samples in `X`.\n feature_values : ndarray of shape (n_grid_points,)\n The feature values for which the predictions have been computed.\n ax : Matplotlib axes\n The axis on which to plot the ICE lines.\n pd_line_idx : int\n The sequential index of the plot. It will be unraveled to find the\n matching 2D position in the grid layout.\n line_kw : dict\n Dict with keywords passed when plotting the PD plot.\n \"\"\"\n line_idx = np.unravel_index(pd_line_idx, self.lines_.shape)\n self.lines_[line_idx] = ax.plot(feature_values, avg_preds, **line_kw)[0]\n \n def _plot_one_way_partial_dependence(self, preds, avg_preds, feature_values, feature_idx, n_ice_lines, ax, n_cols, pd_plot_idx, n_lines, ice_lines_kw, pd_line_kw):\n \"\"\"Plot 1-way partial dependence: ICE and PDP.\n\n Parameters\n ----------\n preds : ndarray of shape (n_instances, n_grid_points) or None\n The predictions computed for all points of `feature_values` for a\n given feature for all samples in `X`.\n avg_preds : ndarray of shape (n_grid_points,)\n The average predictions for all points of `feature_values` for a\n given feature for all samples in `X`.\n feature_values : ndarray of shape (n_grid_points,)\n The feature values for which the predictions have been computed.\n feature_idx : int\n The index corresponding to the target feature.\n n_ice_lines : int\n The number of ICE lines to plot.\n ax : Matplotlib axes\n The axis on which to plot the ICE and PDP lines.\n n_cols : int or None\n The number of column in the axis.\n pd_plot_idx : int\n The sequential index of the plot. 
It will be unraveled to find the\n matching 2D position in the grid layout.\n n_lines : int\n The total number of lines expected to be plot on the axis.\n ice_lines_kw : dict\n Dict with keywords passed when plotting the ICE lines.\n pd_line_kw : dict\n Dict with keywords passed when plotting the PD plot.\n \"\"\"\n from matplotlib import transforms\n if self.kind in ('individual', 'both'):\n self._plot_ice_lines(preds[self.target_idx], feature_values, n_ice_lines, ax, pd_plot_idx, n_lines, ice_lines_kw)\n if self.kind in ('average', 'both'):\n if self.kind == 'average':\n pd_line_idx = pd_plot_idx\n else:\n pd_line_idx = pd_plot_idx * n_lines + n_ice_lines\n self._plot_average_dependence(avg_preds[self.target_idx].ravel(), feature_values, ax, pd_line_idx, pd_line_kw)\n trans = transforms.blended_transform_factory(ax.transData, ax.transAxes)\n vlines_idx = np.unravel_index(pd_plot_idx, self.deciles_vlines_.shape)\n self.deciles_vlines_[vlines_idx] = ax.vlines(self.deciles[feature_idx[0]], 0, 0.05, transform=trans, color='k')\n ax.set_ylim(self.pdp_lim[1])\n if not ax.get_xlabel():\n ax.set_xlabel(self.feature_names[feature_idx[0]])\n if n_cols is None or pd_plot_idx % n_cols == 0:\n if not ax.get_ylabel():\n ax.set_ylabel('Partial dependence')\n else:\n ax.set_yticklabels([])\n if pd_line_kw.get('label', None) and self.kind != 'individual':\n ax.legend()\n \n def _plot_two_way_partial_dependence(self, avg_preds, feature_values, feature_idx, ax, pd_plot_idx, Z_level, contour_kw):\n \"\"\"Plot 2-way partial dependence.\n\n Parameters\n ----------\n avg_preds : ndarray of shape (n_instances, n_grid_points, n_grid_points)\n The average predictions for all points of `feature_values[0]` and\n `feature_values[1]` for some given features for all samples in `X`.\n feature_values : seq of 1d array\n A sequence of array of the feature values for which the predictions\n have been computed.\n feature_idx : tuple of int\n The indices of the target features\n ax : Matplotlib axes\n The axis on which to plot the ICE and PDP lines.\n pd_plot_idx : int\n The sequential index of the plot. 
It will be unraveled to find the\n matching 2D position in the grid layout.\n Z_level : ndarray of shape (8, 8)\n The Z-level used to encode the average predictions.\n contour_kw : dict\n Dict with keywords passed when plotting the contours.\n \"\"\"\n from matplotlib import transforms\n (XX, YY) = np.meshgrid(feature_values[0], feature_values[1])\n Z = avg_preds[self.target_idx].T\n CS = ax.contour(XX, YY, Z, levels=Z_level, linewidths=0.5, colors='k')\n contour_idx = np.unravel_index(pd_plot_idx, self.contours_.shape)\n self.contours_[contour_idx] = ax.contourf(XX, YY, Z, levels=Z_level, vmax=Z_level[-1], vmin=Z_level[0], **contour_kw)\n ax.clabel(CS, fmt='%2.2f', colors='k', fontsize=10, inline=True)\n trans = transforms.blended_transform_factory(ax.transData, ax.transAxes)\n (xlim, ylim) = (ax.get_xlim(), ax.get_ylim())\n vlines_idx = np.unravel_index(pd_plot_idx, self.deciles_vlines_.shape)\n self.deciles_vlines_[vlines_idx] = ax.vlines(self.deciles[feature_idx[0]], 0, 0.05, transform=trans, color='k')\n hlines_idx = np.unravel_index(pd_plot_idx, self.deciles_hlines_.shape)\n self.deciles_hlines_[hlines_idx] = ax.hlines(self.deciles[feature_idx[1]], 0, 0.05, transform=trans, color='k')\n ax.set_xlim(xlim)\n ax.set_ylim(ylim)\n if not ax.get_xlabel():\n ax.set_xlabel(self.feature_names[feature_idx[0]])\n ax.set_ylabel(self.feature_names[feature_idx[1]])\n \n @_deprecate_positional_args(version='1.1')\n def plot(self, *, ax=None, n_cols=3, line_kw=None, ice_lines_kw=None, pd_line_kw=None, contour_kw=None):\n \"\"\"Plot partial dependence plots.\n\n Parameters\n ----------\n ax : Matplotlib axes or array-like of Matplotlib axes, default=None\n - If a single axis is passed in, it is treated as a bounding axes\n and a grid of partial dependence plots will be drawn within\n these bounds. The `n_cols` parameter controls the number of\n columns in the grid.\n - If an array-like of axes are passed in, the partial dependence\n plots will be drawn directly into these axes.\n - If `None`, a figure and a bounding axes is created and treated\n as the single axes case.\n\n n_cols : int, default=3\n The maximum number of columns in the grid plot. Only active when\n `ax` is a single axes or `None`.\n\n line_kw : dict, default=None\n Dict with keywords passed to the `matplotlib.pyplot.plot` call.\n For one-way partial dependence plots.\n\n ice_lines_kw : dict, default=None\n Dictionary with keywords passed to the `matplotlib.pyplot.plot` call.\n For ICE lines in the one-way partial dependence plots.\n The key value pairs defined in `ice_lines_kw` takes priority over\n `line_kw`.\n\n .. versionadded:: 1.0\n\n pd_line_kw : dict, default=None\n Dictionary with keywords passed to the `matplotlib.pyplot.plot` call.\n For partial dependence in one-way partial dependence plots.\n The key value pairs defined in `pd_line_kw` takes priority over\n `line_kw`.\n\n .. 
versionadded:: 1.0\n\n contour_kw : dict, default=None\n Dict with keywords passed to the `matplotlib.pyplot.contourf`\n call for two-way partial dependence plots.\n\n Returns\n -------\n display : :class:`~sklearn.inspection.PartialDependenceDisplay`\n \"\"\"\n check_matplotlib_support('plot_partial_dependence')\n import matplotlib.pyplot as plt\n from matplotlib.gridspec import GridSpecFromSubplotSpec\n if line_kw is None:\n line_kw = {}\n if ice_lines_kw is None:\n ice_lines_kw = {}\n if pd_line_kw is None:\n pd_line_kw = {}\n if contour_kw is None:\n contour_kw = {}\n if ax is None:\n (_, ax) = plt.subplots()\n default_contour_kws = {'alpha': 0.75}\n contour_kw = {**default_contour_kws, **contour_kw}\n default_line_kws = {'color': 'C0', 'label': 'average' if self.kind == 'both' else None}\n if self.kind in ('individual', 'both'):\n default_ice_lines_kws = {'alpha': 0.3, 'linewidth': 0.5}\n else:\n default_ice_lines_kws = {}\n ice_lines_kw = {**default_line_kws, **line_kw, **default_ice_lines_kws, **ice_lines_kw}\n del ice_lines_kw['label']\n pd_line_kw = {**default_line_kws, **line_kw, **pd_line_kw}\n n_features = len(self.features)\n if self.kind in ('individual', 'both'):\n n_ice_lines = self._get_sample_count(len(self.pd_results[0].individual[0]))\n if self.kind == 'individual':\n n_lines = n_ice_lines\n else:\n n_lines = n_ice_lines + 1\n else:\n n_ice_lines = 0\n n_lines = 1\n if isinstance(ax, plt.Axes):\n if not ax.axison:\n raise ValueError('The ax was already used in another plot function, please set ax=display.axes_ instead')\n ax.set_axis_off()\n self.bounding_ax_ = ax\n self.figure_ = ax.figure\n n_cols = min(n_cols, n_features)\n n_rows = int(np.ceil(n_features / float(n_cols)))\n self.axes_ = np.empty((n_rows, n_cols), dtype=object)\n if self.kind == 'average':\n self.lines_ = np.empty((n_rows, n_cols), dtype=object)\n else:\n self.lines_ = np.empty((n_rows, n_cols, n_lines), dtype=object)\n self.contours_ = np.empty((n_rows, n_cols), dtype=object)\n axes_ravel = self.axes_.ravel()\n gs = GridSpecFromSubplotSpec(n_rows, n_cols, subplot_spec=ax.get_subplotspec())\n for (i, spec) in zip(range(n_features), gs):\n axes_ravel[i] = self.figure_.add_subplot(spec)\n else:\n ax = np.asarray(ax, dtype=object)\n if ax.size != n_features:\n raise ValueError('Expected ax to have {} axes, got {}'.format(n_features, ax.size))\n if ax.ndim == 2:\n n_cols = ax.shape[1]\n else:\n n_cols = None\n self.bounding_ax_ = None\n self.figure_ = ax.ravel()[0].figure\n self.axes_ = ax\n if self.kind == 'average':\n self.lines_ = np.empty_like(ax, dtype=object)\n else:\n self.lines_ = np.empty(ax.shape + (n_lines, ), dtype=object)\n self.contours_ = np.empty_like(ax, dtype=object)\n if 2 in self.pdp_lim:\n Z_level = np.linspace(*self.pdp_lim[2], num=8)\n self.deciles_vlines_ = np.empty_like(self.axes_, dtype=object)\n self.deciles_hlines_ = np.empty_like(self.axes_, dtype=object)\n for (pd_plot_idx, (axi, feature_idx, pd_result)) in enumerate(zip(self.axes_.ravel(), self.features, self.pd_results)):\n avg_preds = None\n preds = None\n feature_values = pd_result['values']\n if self.kind == 'individual':\n preds = pd_result.individual\n elif self.kind == 'average':\n avg_preds = pd_result.average\n else:\n avg_preds = pd_result.average\n preds = pd_result.individual\n if len(feature_values) == 1:\n self._plot_one_way_partial_dependence(preds, avg_preds, feature_values[0], feature_idx, n_ice_lines, axi, n_cols, pd_plot_idx, n_lines, ice_lines_kw, pd_line_kw)\n else:\n 
self._plot_two_way_partial_dependence(avg_preds, feature_values, feature_idx, axi, pd_plot_idx, Z_level, contour_kw)\n return self\n" }, @@ -23192,7 +23259,7 @@ "sklearn.isotonic.IsotonicRegression._more_tags" ], "is_public": true, - "description": "Isotonic regression model.\n\nRead more in the :ref:`User Guide `. .. versionadded:: 0.13", + "description": "Isotonic regression model.\n\nRead more in the :ref:`User Guide `.\n\n.. versionadded:: 0.13", "docstring": "Isotonic regression model.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.13\n\n Parameters\n ----------\n y_min : float, default=None\n Lower bound on the lowest predicted value (the minimum value may\n still be higher). If not set, defaults to -inf.\n\n y_max : float, default=None\n Upper bound on the highest predicted value (the maximum may still be\n lower). If not set, defaults to +inf.\n\n increasing : bool or 'auto', default=True\n Determines whether the predictions should be constrained to increase\n or decrease with `X`. 'auto' will decide based on the Spearman\n correlation estimate's sign.\n\n out_of_bounds : {'nan', 'clip', 'raise'}, default='nan'\n Handles how `X` values outside of the training domain are handled\n during prediction.\n\n - 'nan', predictions will be NaN.\n - 'clip', predictions will be set to the value corresponding to\n the nearest train interval endpoint.\n - 'raise', a `ValueError` is raised.\n\n Attributes\n ----------\n X_min_ : float\n Minimum value of input array `X_` for left bound.\n\n X_max_ : float\n Maximum value of input array `X_` for right bound.\n\n X_thresholds_ : ndarray of shape (n_thresholds,)\n Unique ascending `X` values used to interpolate\n the y = f(X) monotonic function.\n\n .. versionadded:: 0.24\n\n y_thresholds_ : ndarray of shape (n_thresholds,)\n De-duplicated `y` values suitable to interpolate the y = f(X)\n monotonic function.\n\n .. versionadded:: 0.24\n\n f_ : function\n The stepwise interpolating function that covers the input domain ``X``.\n\n increasing_ : bool\n Inferred value for ``increasing``.\n\n See Also\n --------\n sklearn.linear_model.LinearRegression : Ordinary least squares Linear\n Regression.\n sklearn.ensemble.HistGradientBoostingRegressor : Gradient boosting that\n is a non-parametric model accepting monotonicity constraints.\n isotonic_regression : Function to solve the isotonic regression model.\n\n Notes\n -----\n Ties are broken using the secondary method from de Leeuw, 1977.\n\n References\n ----------\n Isotonic Median Regression: A Linear Programming Approach\n Nilotpal Chakravarti\n Mathematics of Operations Research\n Vol. 14, No. 2 (May, 1989), pp. 303-308\n\n Isotone Optimization in R : Pool-Adjacent-Violators\n Algorithm (PAVA) and Active Set Methods\n de Leeuw, Hornik, Mair\n Journal of Statistical Software 2009\n\n Correctness of Kruskal's algorithms for monotone regression with ties\n de Leeuw, Psychometrica, 1977\n\n Examples\n --------\n >>> from sklearn.datasets import make_regression\n >>> from sklearn.isotonic import IsotonicRegression\n >>> X, y = make_regression(n_samples=10, n_features=1, random_state=41)\n >>> iso_reg = IsotonicRegression().fit(X, y)\n >>> iso_reg.predict([.1, .2])\n array([1.8628..., 3.7256...])\n ", "source_code": "\n\nclass IsotonicRegression(RegressorMixin, TransformerMixin, BaseEstimator):\n \"\"\"Isotonic regression model.\n\n Read more in the :ref:`User Guide `.\n\n .. 
versionadded:: 0.13\n\n Parameters\n ----------\n y_min : float, default=None\n Lower bound on the lowest predicted value (the minimum value may\n still be higher). If not set, defaults to -inf.\n\n y_max : float, default=None\n Upper bound on the highest predicted value (the maximum may still be\n lower). If not set, defaults to +inf.\n\n increasing : bool or 'auto', default=True\n Determines whether the predictions should be constrained to increase\n or decrease with `X`. 'auto' will decide based on the Spearman\n correlation estimate's sign.\n\n out_of_bounds : {'nan', 'clip', 'raise'}, default='nan'\n Handles how `X` values outside of the training domain are handled\n during prediction.\n\n - 'nan', predictions will be NaN.\n - 'clip', predictions will be set to the value corresponding to\n the nearest train interval endpoint.\n - 'raise', a `ValueError` is raised.\n\n Attributes\n ----------\n X_min_ : float\n Minimum value of input array `X_` for left bound.\n\n X_max_ : float\n Maximum value of input array `X_` for right bound.\n\n X_thresholds_ : ndarray of shape (n_thresholds,)\n Unique ascending `X` values used to interpolate\n the y = f(X) monotonic function.\n\n .. versionadded:: 0.24\n\n y_thresholds_ : ndarray of shape (n_thresholds,)\n De-duplicated `y` values suitable to interpolate the y = f(X)\n monotonic function.\n\n .. versionadded:: 0.24\n\n f_ : function\n The stepwise interpolating function that covers the input domain ``X``.\n\n increasing_ : bool\n Inferred value for ``increasing``.\n\n See Also\n --------\n sklearn.linear_model.LinearRegression : Ordinary least squares Linear\n Regression.\n sklearn.ensemble.HistGradientBoostingRegressor : Gradient boosting that\n is a non-parametric model accepting monotonicity constraints.\n isotonic_regression : Function to solve the isotonic regression model.\n\n Notes\n -----\n Ties are broken using the secondary method from de Leeuw, 1977.\n\n References\n ----------\n Isotonic Median Regression: A Linear Programming Approach\n Nilotpal Chakravarti\n Mathematics of Operations Research\n Vol. 14, No. 2 (May, 1989), pp. 
303-308\n\n Isotone Optimization in R : Pool-Adjacent-Violators\n Algorithm (PAVA) and Active Set Methods\n de Leeuw, Hornik, Mair\n Journal of Statistical Software 2009\n\n Correctness of Kruskal's algorithms for monotone regression with ties\n de Leeuw, Psychometrica, 1977\n\n Examples\n --------\n >>> from sklearn.datasets import make_regression\n >>> from sklearn.isotonic import IsotonicRegression\n >>> X, y = make_regression(n_samples=10, n_features=1, random_state=41)\n >>> iso_reg = IsotonicRegression().fit(X, y)\n >>> iso_reg.predict([.1, .2])\n array([1.8628..., 3.7256...])\n \"\"\"\n \n def __init__(self, *, y_min=None, y_max=None, increasing=True, out_of_bounds='nan'):\n self.y_min = y_min\n self.y_max = y_max\n self.increasing = increasing\n self.out_of_bounds = out_of_bounds\n \n def _check_input_data_shape(self, X):\n if not (X.ndim == 1 or X.ndim == 2 and X.shape[1] == 1):\n msg = 'Isotonic regression input X should be a 1d array or 2d array with 1 feature'\n raise ValueError(msg)\n \n def _build_f(self, X, y):\n \"\"\"Build the f_ interp1d function.\"\"\"\n if self.out_of_bounds not in ['raise', 'nan', 'clip']:\n raise ValueError(\"The argument ``out_of_bounds`` must be in 'nan', 'clip', 'raise'; got {0}\".format(self.out_of_bounds))\n bounds_error = self.out_of_bounds == 'raise'\n if len(y) == 1:\n self.f_ = lambda x: y.repeat(x.shape)\n else:\n self.f_ = interpolate.interp1d(X, y, kind='linear', bounds_error=bounds_error)\n \n def _build_y(self, X, y, sample_weight, trim_duplicates=True):\n \"\"\"Build the y_ IsotonicRegression.\"\"\"\n self._check_input_data_shape(X)\n X = X.reshape(-1)\n if self.increasing == 'auto':\n self.increasing_ = check_increasing(X, y)\n else:\n self.increasing_ = self.increasing\n sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype)\n mask = sample_weight > 0\n (X, y, sample_weight) = (X[mask], y[mask], sample_weight[mask])\n order = np.lexsort((y, X))\n (X, y, sample_weight) = [array[order] for array in [X, y, sample_weight]]\n (unique_X, unique_y, unique_sample_weight) = _make_unique(X, y, sample_weight)\n X = unique_X\n y = isotonic_regression(unique_y, sample_weight=unique_sample_weight, y_min=self.y_min, y_max=self.y_max, increasing=self.increasing_)\n (self.X_min_, self.X_max_) = (np.min(X), np.max(X))\n if trim_duplicates:\n keep_data = np.ones((len(y), ), dtype=bool)\n keep_data[1:-1] = np.logical_or(np.not_equal(y[1:-1], y[:-2]), np.not_equal(y[1:-1], y[2:]))\n return X[keep_data], y[keep_data]\n else:\n return X, y\n \n def fit(self, X, y, sample_weight=None):\n \"\"\"Fit the model using X, y as training data.\n\n Parameters\n ----------\n X : array-like of shape (n_samples,) or (n_samples, 1)\n Training data.\n\n .. versionchanged:: 0.24\n Also accepts 2d array with 1 feature.\n\n y : array-like of shape (n_samples,)\n Training target.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Weights. 
If set to None, all weights will be set to 1 (equal\n weights).\n\n Returns\n -------\n self : object\n Returns an instance of self.\n\n Notes\n -----\n X is stored for future use, as :meth:`transform` needs X to interpolate\n new input data.\n \"\"\"\n check_params = dict(accept_sparse=False, ensure_2d=False)\n X = check_array(X, dtype=[np.float64, np.float32], **check_params)\n y = check_array(y, dtype=X.dtype, **check_params)\n check_consistent_length(X, y, sample_weight)\n (X, y) = self._build_y(X, y, sample_weight)\n (self.X_thresholds_, self.y_thresholds_) = (X, y)\n self._build_f(X, y)\n return self\n \n def transform(self, T):\n \"\"\"Transform new data by linear interpolation.\n\n Parameters\n ----------\n T : array-like of shape (n_samples,) or (n_samples, 1)\n Data to transform.\n\n .. versionchanged:: 0.24\n Also accepts 2d array with 1 feature.\n\n Returns\n -------\n y_pred : ndarray of shape (n_samples,)\n The transformed data.\n \"\"\"\n if hasattr(self, 'X_thresholds_'):\n dtype = self.X_thresholds_.dtype\n else:\n dtype = np.float64\n T = check_array(T, dtype=dtype, ensure_2d=False)\n self._check_input_data_shape(T)\n T = T.reshape(-1)\n if self.out_of_bounds not in ['raise', 'nan', 'clip']:\n raise ValueError(\"The argument ``out_of_bounds`` must be in 'nan', 'clip', 'raise'; got {0}\".format(self.out_of_bounds))\n if self.out_of_bounds == 'clip':\n T = np.clip(T, self.X_min_, self.X_max_)\n res = self.f_(T)\n res = res.astype(T.dtype)\n return res\n \n def predict(self, T):\n \"\"\"Predict new data by linear interpolation.\n\n Parameters\n ----------\n T : array-like of shape (n_samples,) or (n_samples, 1)\n Data to transform.\n\n Returns\n -------\n y_pred : ndarray of shape (n_samples,)\n Transformed data.\n \"\"\"\n return self.transform(T)\n \n def __getstate__(self):\n \"\"\"Pickle-protocol - return state of the estimator.\"\"\"\n state = super().__getstate__()\n state.pop('f_', None)\n return state\n \n def __setstate__(self, state):\n \"\"\"Pickle-protocol - set state of the estimator.\n\n We need to rebuild the interpolation function.\n \"\"\"\n super().__setstate__(state)\n if hasattr(self, 'X_thresholds_') and hasattr(self, 'y_thresholds_'):\n self._build_f(self.X_thresholds_, self.y_thresholds_)\n \n def _more_tags(self):\n return {'X_types': ['1darray']}\n" }, @@ -23210,7 +23277,7 @@ "sklearn.kernel_approximation.AdditiveChi2Sampler._more_tags" ], "is_public": true, - "description": "Approximate feature map for additive chi2 kernel.\n\nUses sampling the fourier transform of the kernel characteristic at regular intervals. Since the kernel that is to be approximated is additive, the components of the input vectors can be treated separately. Each entry in the original space is transformed into 2*sample_steps+1 features, where sample_steps is a parameter of the method. Typical values of sample_steps include 1, 2 and 3. Optimal choices for the sampling interval for certain data ranges can be computed (see the reference). The default values should be reasonable. Read more in the :ref:`User Guide `.", + "description": "Approximate feature map for additive chi2 kernel.\n\nUses sampling the fourier transform of the kernel characteristic\nat regular intervals.\n\nSince the kernel that is to be approximated is additive, the components of\nthe input vectors can be treated separately. Each entry in the original\nspace is transformed into 2*sample_steps+1 features, where sample_steps is\na parameter of the method. 
Typical values of sample_steps include 1, 2 and\n3.\n\nOptimal choices for the sampling interval for certain data ranges can be\ncomputed (see the reference). The default values should be reasonable.\n\nRead more in the :ref:`User Guide `.", "docstring": "Approximate feature map for additive chi2 kernel.\n\n Uses sampling the fourier transform of the kernel characteristic\n at regular intervals.\n\n Since the kernel that is to be approximated is additive, the components of\n the input vectors can be treated separately. Each entry in the original\n space is transformed into 2*sample_steps+1 features, where sample_steps is\n a parameter of the method. Typical values of sample_steps include 1, 2 and\n 3.\n\n Optimal choices for the sampling interval for certain data ranges can be\n computed (see the reference). The default values should be reasonable.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n sample_steps : int, default=2\n Gives the number of (complex) sampling points.\n\n sample_interval : float, default=None\n Sampling interval. Must be specified when sample_steps not in {1,2,3}.\n\n Attributes\n ----------\n sample_interval_ : float\n Stored sampling interval. Specified as a parameter if `sample_steps`\n not in {1,2,3}.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n SkewedChi2Sampler : A Fourier-approximation to a non-additive variant of\n the chi squared kernel.\n\n sklearn.metrics.pairwise.chi2_kernel : The exact chi squared kernel.\n\n sklearn.metrics.pairwise.additive_chi2_kernel : The exact additive chi\n squared kernel.\n\n Notes\n -----\n This estimator approximates a slightly different version of the additive\n chi squared kernel then ``metric.additive_chi2`` computes.\n\n References\n ----------\n See `\"Efficient additive kernels via explicit feature maps\"\n `_\n A. Vedaldi and A. Zisserman, Pattern Analysis and Machine Intelligence,\n 2011\n\n Examples\n --------\n >>> from sklearn.datasets import load_digits\n >>> from sklearn.linear_model import SGDClassifier\n >>> from sklearn.kernel_approximation import AdditiveChi2Sampler\n >>> X, y = load_digits(return_X_y=True)\n >>> chi2sampler = AdditiveChi2Sampler(sample_steps=2)\n >>> X_transformed = chi2sampler.fit_transform(X, y)\n >>> clf = SGDClassifier(max_iter=5, random_state=0, tol=1e-3)\n >>> clf.fit(X_transformed, y)\n SGDClassifier(max_iter=5, random_state=0)\n >>> clf.score(X_transformed, y)\n 0.9499...\n ", "source_code": "\n\nclass AdditiveChi2Sampler(TransformerMixin, BaseEstimator):\n \"\"\"Approximate feature map for additive chi2 kernel.\n\n Uses sampling the fourier transform of the kernel characteristic\n at regular intervals.\n\n Since the kernel that is to be approximated is additive, the components of\n the input vectors can be treated separately. Each entry in the original\n space is transformed into 2*sample_steps+1 features, where sample_steps is\n a parameter of the method. Typical values of sample_steps include 1, 2 and\n 3.\n\n Optimal choices for the sampling interval for certain data ranges can be\n computed (see the reference). 
The default values should be reasonable.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n sample_steps : int, default=2\n Gives the number of (complex) sampling points.\n\n sample_interval : float, default=None\n Sampling interval. Must be specified when sample_steps not in {1,2,3}.\n\n Attributes\n ----------\n sample_interval_ : float\n Stored sampling interval. Specified as a parameter if `sample_steps`\n not in {1,2,3}.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n SkewedChi2Sampler : A Fourier-approximation to a non-additive variant of\n the chi squared kernel.\n\n sklearn.metrics.pairwise.chi2_kernel : The exact chi squared kernel.\n\n sklearn.metrics.pairwise.additive_chi2_kernel : The exact additive chi\n squared kernel.\n\n Notes\n -----\n This estimator approximates a slightly different version of the additive\n chi squared kernel then ``metric.additive_chi2`` computes.\n\n References\n ----------\n See `\"Efficient additive kernels via explicit feature maps\"\n `_\n A. Vedaldi and A. Zisserman, Pattern Analysis and Machine Intelligence,\n 2011\n\n Examples\n --------\n >>> from sklearn.datasets import load_digits\n >>> from sklearn.linear_model import SGDClassifier\n >>> from sklearn.kernel_approximation import AdditiveChi2Sampler\n >>> X, y = load_digits(return_X_y=True)\n >>> chi2sampler = AdditiveChi2Sampler(sample_steps=2)\n >>> X_transformed = chi2sampler.fit_transform(X, y)\n >>> clf = SGDClassifier(max_iter=5, random_state=0, tol=1e-3)\n >>> clf.fit(X_transformed, y)\n SGDClassifier(max_iter=5, random_state=0)\n >>> clf.score(X_transformed, y)\n 0.9499...\n \"\"\"\n \n def __init__(self, *, sample_steps=2, sample_interval=None):\n self.sample_steps = sample_steps\n self.sample_interval = sample_interval\n \n def fit(self, X, y=None):\n \"\"\"Set the parameters.\n\n Parameters\n ----------\n X : array-like, shape (n_samples, n_features)\n Training data, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n y : array-like, shape (n_samples,) or (n_samples, n_outputs), default=None\n Target values (None for unsupervised transformations).\n\n Returns\n -------\n self : object\n Returns the transformer.\n \"\"\"\n X = self._validate_data(X, accept_sparse='csr')\n check_non_negative(X, 'X in AdditiveChi2Sampler.fit')\n if self.sample_interval is None:\n if self.sample_steps == 1:\n self.sample_interval_ = 0.8\n elif self.sample_steps == 2:\n self.sample_interval_ = 0.5\n elif self.sample_steps == 3:\n self.sample_interval_ = 0.4\n else:\n raise ValueError('If sample_steps is not in [1, 2, 3], you need to provide sample_interval')\n else:\n self.sample_interval_ = self.sample_interval\n return self\n \n def transform(self, X):\n \"\"\"Apply approximate feature map to X.\n\n Parameters\n ----------\n X : {array-like, sparse matrix}, shape (n_samples, n_features)\n Training data, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n Returns\n -------\n X_new : {ndarray, sparse matrix}, shape = (n_samples, n_features * (2*sample_steps + 1))\n Whether the return value is an array or sparse matrix depends on\n the type of the input X.\n \"\"\"\n msg = '%(name)s is not fitted. 
Call fit to set the parameters before calling transform'\n check_is_fitted(self, msg=msg)\n X = self._validate_data(X, accept_sparse='csr', reset=False)\n check_non_negative(X, 'X in AdditiveChi2Sampler.transform')\n sparse = sp.issparse(X)\n transf = self._transform_sparse if sparse else self._transform_dense\n return transf(X)\n \n def _transform_dense(self, X):\n non_zero = X != 0.0\n X_nz = X[non_zero]\n X_step = np.zeros_like(X)\n X_step[non_zero] = np.sqrt(X_nz * self.sample_interval_)\n X_new = [X_step]\n log_step_nz = self.sample_interval_ * np.log(X_nz)\n step_nz = 2 * X_nz * self.sample_interval_\n for j in range(1, self.sample_steps):\n factor_nz = np.sqrt(step_nz / np.cosh(np.pi * j * self.sample_interval_))\n X_step = np.zeros_like(X)\n X_step[non_zero] = factor_nz * np.cos(j * log_step_nz)\n X_new.append(X_step)\n X_step = np.zeros_like(X)\n X_step[non_zero] = factor_nz * np.sin(j * log_step_nz)\n X_new.append(X_step)\n return np.hstack(X_new)\n \n def _transform_sparse(self, X):\n indices = X.indices.copy()\n indptr = X.indptr.copy()\n data_step = np.sqrt(X.data * self.sample_interval_)\n X_step = sp.csr_matrix((data_step, indices, indptr), shape=X.shape, dtype=X.dtype, copy=False)\n X_new = [X_step]\n log_step_nz = self.sample_interval_ * np.log(X.data)\n step_nz = 2 * X.data * self.sample_interval_\n for j in range(1, self.sample_steps):\n factor_nz = np.sqrt(step_nz / np.cosh(np.pi * j * self.sample_interval_))\n data_step = factor_nz * np.cos(j * log_step_nz)\n X_step = sp.csr_matrix((data_step, indices, indptr), shape=X.shape, dtype=X.dtype, copy=False)\n X_new.append(X_step)\n data_step = factor_nz * np.sin(j * log_step_nz)\n X_step = sp.csr_matrix((data_step, indices, indptr), shape=X.shape, dtype=X.dtype, copy=False)\n X_new.append(X_step)\n return sp.hstack(X_new)\n \n def _more_tags(self):\n return {'stateless': True, 'requires_positive_X': True}\n" }, @@ -23227,7 +23294,7 @@ "sklearn.kernel_approximation.Nystroem._more_tags" ], "is_public": true, - "description": "Approximate a kernel map using a subset of the training data.\n\nConstructs an approximate feature map for an arbitrary kernel using a subset of the data as basis. Read more in the :ref:`User Guide `. .. versionadded:: 0.13", + "description": "Approximate a kernel map using a subset of the training data.\n\nConstructs an approximate feature map for an arbitrary kernel\nusing a subset of the data as basis.\n\nRead more in the :ref:`User Guide `.\n\n.. versionadded:: 0.13", "docstring": "Approximate a kernel map using a subset of the training data.\n\n Constructs an approximate feature map for an arbitrary kernel\n using a subset of the data as basis.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.13\n\n Parameters\n ----------\n kernel : str or callable, default='rbf'\n Kernel map to be approximated. A callable should accept two arguments\n and the keyword arguments passed to this object as `kernel_params`, and\n should return a floating point number.\n\n gamma : float, default=None\n Gamma parameter for the RBF, laplacian, polynomial, exponential chi2\n and sigmoid kernels. Interpretation of the default value is left to\n the kernel; see the documentation for sklearn.metrics.pairwise.\n Ignored by other kernels.\n\n coef0 : float, default=None\n Zero coefficient for polynomial and sigmoid kernels.\n Ignored by other kernels.\n\n degree : float, default=None\n Degree of the polynomial kernel. 
Ignored by other kernels.\n\n kernel_params : dict, default=None\n Additional parameters (keyword arguments) for kernel function passed\n as callable object.\n\n n_components : int, default=100\n Number of features to construct.\n How many data points will be used to construct the mapping.\n\n random_state : int, RandomState instance or None, default=None\n Pseudo-random number generator to control the uniform sampling without\n replacement of `n_components` of the training data to construct the\n basis kernel.\n Pass an int for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n n_jobs : int, default=None\n The number of jobs to use for the computation. This works by breaking\n down the kernel matrix into `n_jobs` even slices and computing them in\n parallel.\n\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n .. versionadded:: 0.24\n\n Attributes\n ----------\n components_ : ndarray of shape (n_components, n_features)\n Subset of training points used to construct the feature map.\n\n component_indices_ : ndarray of shape (n_components)\n Indices of ``components_`` in the training set.\n\n normalization_ : ndarray of shape (n_components, n_components)\n Normalization matrix needed for embedding.\n Square root of the kernel matrix on ``components_``.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n AdditiveChi2Sampler : Approximate feature map for additive chi2 kernel.\n PolynomialCountSketch : Polynomial kernel approximation via Tensor Sketch.\n RBFSampler : Approximate a RBF kernel feature map using random Fourier\n features.\n SkewedChi2Sampler : Approximate feature map for \"skewed chi-squared\" kernel.\n sklearn.metrics.pairwise.kernel_metrics : List of built-in kernels.\n\n References\n ----------\n * Williams, C.K.I. and Seeger, M.\n \"Using the Nystroem method to speed up kernel machines\",\n Advances in neural information processing systems 2001\n\n * T. Yang, Y. Li, M. Mahdavi, R. Jin and Z. Zhou\n \"Nystroem Method vs Random Fourier Features: A Theoretical and Empirical\n Comparison\",\n Advances in Neural Information Processing Systems 2012\n\n Examples\n --------\n >>> from sklearn import datasets, svm\n >>> from sklearn.kernel_approximation import Nystroem\n >>> X, y = datasets.load_digits(n_class=9, return_X_y=True)\n >>> data = X / 16.\n >>> clf = svm.LinearSVC()\n >>> feature_map_nystroem = Nystroem(gamma=.2,\n ... random_state=1,\n ... n_components=300)\n >>> data_transformed = feature_map_nystroem.fit_transform(data)\n >>> clf.fit(data_transformed, y)\n LinearSVC()\n >>> clf.score(data_transformed, y)\n 0.9987...\n ", "source_code": "\n\nclass Nystroem(TransformerMixin, BaseEstimator):\n \"\"\"Approximate a kernel map using a subset of the training data.\n\n Constructs an approximate feature map for an arbitrary kernel\n using a subset of the data as basis.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.13\n\n Parameters\n ----------\n kernel : str or callable, default='rbf'\n Kernel map to be approximated. 
A callable should accept two arguments\n and the keyword arguments passed to this object as `kernel_params`, and\n should return a floating point number.\n\n gamma : float, default=None\n Gamma parameter for the RBF, laplacian, polynomial, exponential chi2\n and sigmoid kernels. Interpretation of the default value is left to\n the kernel; see the documentation for sklearn.metrics.pairwise.\n Ignored by other kernels.\n\n coef0 : float, default=None\n Zero coefficient for polynomial and sigmoid kernels.\n Ignored by other kernels.\n\n degree : float, default=None\n Degree of the polynomial kernel. Ignored by other kernels.\n\n kernel_params : dict, default=None\n Additional parameters (keyword arguments) for kernel function passed\n as callable object.\n\n n_components : int, default=100\n Number of features to construct.\n How many data points will be used to construct the mapping.\n\n random_state : int, RandomState instance or None, default=None\n Pseudo-random number generator to control the uniform sampling without\n replacement of `n_components` of the training data to construct the\n basis kernel.\n Pass an int for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n n_jobs : int, default=None\n The number of jobs to use for the computation. This works by breaking\n down the kernel matrix into `n_jobs` even slices and computing them in\n parallel.\n\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n .. versionadded:: 0.24\n\n Attributes\n ----------\n components_ : ndarray of shape (n_components, n_features)\n Subset of training points used to construct the feature map.\n\n component_indices_ : ndarray of shape (n_components)\n Indices of ``components_`` in the training set.\n\n normalization_ : ndarray of shape (n_components, n_components)\n Normalization matrix needed for embedding.\n Square root of the kernel matrix on ``components_``.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n AdditiveChi2Sampler : Approximate feature map for additive chi2 kernel.\n PolynomialCountSketch : Polynomial kernel approximation via Tensor Sketch.\n RBFSampler : Approximate a RBF kernel feature map using random Fourier\n features.\n SkewedChi2Sampler : Approximate feature map for \"skewed chi-squared\" kernel.\n sklearn.metrics.pairwise.kernel_metrics : List of built-in kernels.\n\n References\n ----------\n * Williams, C.K.I. and Seeger, M.\n \"Using the Nystroem method to speed up kernel machines\",\n Advances in neural information processing systems 2001\n\n * T. Yang, Y. Li, M. Mahdavi, R. Jin and Z. Zhou\n \"Nystroem Method vs Random Fourier Features: A Theoretical and Empirical\n Comparison\",\n Advances in Neural Information Processing Systems 2012\n\n Examples\n --------\n >>> from sklearn import datasets, svm\n >>> from sklearn.kernel_approximation import Nystroem\n >>> X, y = datasets.load_digits(n_class=9, return_X_y=True)\n >>> data = X / 16.\n >>> clf = svm.LinearSVC()\n >>> feature_map_nystroem = Nystroem(gamma=.2,\n ... random_state=1,\n ... 
n_components=300)\n >>> data_transformed = feature_map_nystroem.fit_transform(data)\n >>> clf.fit(data_transformed, y)\n LinearSVC()\n >>> clf.score(data_transformed, y)\n 0.9987...\n \"\"\"\n \n def __init__(self, kernel='rbf', *, gamma=None, coef0=None, degree=None, kernel_params=None, n_components=100, random_state=None, n_jobs=None):\n self.kernel = kernel\n self.gamma = gamma\n self.coef0 = coef0\n self.degree = degree\n self.kernel_params = kernel_params\n self.n_components = n_components\n self.random_state = random_state\n self.n_jobs = n_jobs\n \n def fit(self, X, y=None):\n \"\"\"Fit estimator to data.\n\n Samples a subset of training points, computes kernel\n on these and computes normalization matrix.\n\n Parameters\n ----------\n X : array-like, shape (n_samples, n_features)\n Training data, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n y : array-like, shape (n_samples,) or (n_samples, n_outputs), default=None\n Target values (None for unsupervised transformations).\n\n Returns\n -------\n self : object\n Returns the instance itself.\n \"\"\"\n X = self._validate_data(X, accept_sparse='csr')\n rnd = check_random_state(self.random_state)\n n_samples = X.shape[0]\n if self.n_components > n_samples:\n n_components = n_samples\n warnings.warn('n_components > n_samples. This is not possible.\\nn_components was set to n_samples, which results in inefficient evaluation of the full kernel.')\n else:\n n_components = self.n_components\n n_components = min(n_samples, n_components)\n inds = rnd.permutation(n_samples)\n basis_inds = inds[:n_components]\n basis = X[basis_inds]\n basis_kernel = pairwise_kernels(basis, metric=self.kernel, filter_params=True, n_jobs=self.n_jobs, **self._get_kernel_params())\n (U, S, V) = svd(basis_kernel)\n S = np.maximum(S, 1e-12)\n self.normalization_ = np.dot(U / np.sqrt(S), V)\n self.components_ = basis\n self.component_indices_ = basis_inds\n return self\n \n def transform(self, X):\n \"\"\"Apply feature map to X.\n\n Computes an approximate feature map using the kernel\n between some training points and X.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Data to transform.\n\n Returns\n -------\n X_transformed : ndarray of shape (n_samples, n_components)\n Transformed data.\n \"\"\"\n check_is_fitted(self)\n X = self._validate_data(X, accept_sparse='csr', reset=False)\n kernel_params = self._get_kernel_params()\n embedded = pairwise_kernels(X, self.components_, metric=self.kernel, filter_params=True, n_jobs=self.n_jobs, **kernel_params)\n return np.dot(embedded, self.normalization_.T)\n \n def _get_kernel_params(self):\n params = self.kernel_params\n if params is None:\n params = {}\n if not callable(self.kernel) and self.kernel != 'precomputed':\n for param in KERNEL_PARAMS[self.kernel]:\n if getattr(self, param) is not None:\n params[param] = getattr(self, param)\n elif self.gamma is not None or self.coef0 is not None or self.degree is not None:\n raise ValueError(\"Don't pass gamma, coef0 or degree to Nystroem if using a callable or precomputed kernel\")\n return params\n \n def _more_tags(self):\n return {'_xfail_checks': {'check_transformer_preserve_dtypes': 'dtypes are preserved but not at a close enough precision'}, 'preserves_dtype': [np.float64, np.float32]}\n" }, @@ -23242,7 +23309,7 @@ "sklearn.kernel_approximation.PolynomialCountSketch.transform" ], "is_public": true, - "description": "Polynomial kernel approximation via Tensor Sketch.\n\nImplements Tensor Sketch, 
which approximates the feature map of the polynomial kernel:: K(X, Y) = (gamma * + coef0)^degree by efficiently computing a Count Sketch of the outer product of a vector with itself using Fast Fourier Transforms (FFT). Read more in the :ref:`User Guide `. .. versionadded:: 0.24", + "description": "Polynomial kernel approximation via Tensor Sketch.\n\nImplements Tensor Sketch, which approximates the feature map\nof the polynomial kernel::\n\n K(X, Y) = (gamma * + coef0)^degree\n\nby efficiently computing a Count Sketch of the outer product of a\nvector with itself using Fast Fourier Transforms (FFT). Read more in the\n:ref:`User Guide `.\n\n.. versionadded:: 0.24", "docstring": "Polynomial kernel approximation via Tensor Sketch.\n\n Implements Tensor Sketch, which approximates the feature map\n of the polynomial kernel::\n\n K(X, Y) = (gamma * + coef0)^degree\n\n by efficiently computing a Count Sketch of the outer product of a\n vector with itself using Fast Fourier Transforms (FFT). Read more in the\n :ref:`User Guide `.\n\n .. versionadded:: 0.24\n\n Parameters\n ----------\n gamma : float, default=1.0\n Parameter of the polynomial kernel whose feature map\n will be approximated.\n\n degree : int, default=2\n Degree of the polynomial kernel whose feature map\n will be approximated.\n\n coef0 : int, default=0\n Constant term of the polynomial kernel whose feature map\n will be approximated.\n\n n_components : int, default=100\n Dimensionality of the output feature space. Usually, `n_components`\n should be greater than the number of features in input samples in\n order to achieve good performance. The optimal score / run time\n balance is typically achieved around `n_components` = 10 * `n_features`,\n but this depends on the specific dataset being used.\n\n random_state : int, RandomState instance, default=None\n Determines random number generation for indexHash and bitHash\n initialization. Pass an int for reproducible results across multiple\n function calls. See :term:`Glossary `.\n\n Attributes\n ----------\n indexHash_ : ndarray of shape (degree, n_features), dtype=int64\n Array of indexes in range [0, n_components) used to represent\n the 2-wise independent hash functions for Count Sketch computation.\n\n bitHash_ : ndarray of shape (degree, n_features), dtype=float32\n Array with random entries in {+1, -1}, used to represent\n the 2-wise independent hash functions for Count Sketch computation.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. 
versionadded:: 1.0\n\n See Also\n --------\n AdditiveChi2Sampler : Approximate feature map for additive chi2 kernel.\n Nystroem : Approximate a kernel map using a subset of the training data.\n RBFSampler : Approximate a RBF kernel feature map using random Fourier\n features.\n SkewedChi2Sampler : Approximate feature map for \"skewed chi-squared\" kernel.\n sklearn.metrics.pairwise.kernel_metrics : List of built-in kernels.\n\n Examples\n --------\n >>> from sklearn.kernel_approximation import PolynomialCountSketch\n >>> from sklearn.linear_model import SGDClassifier\n >>> X = [[0, 0], [1, 1], [1, 0], [0, 1]]\n >>> y = [0, 0, 1, 1]\n >>> ps = PolynomialCountSketch(degree=3, random_state=1)\n >>> X_features = ps.fit_transform(X)\n >>> clf = SGDClassifier(max_iter=10, tol=1e-3)\n >>> clf.fit(X_features, y)\n SGDClassifier(max_iter=10)\n >>> clf.score(X_features, y)\n 1.0\n ", "source_code": "\n\nclass PolynomialCountSketch(BaseEstimator, TransformerMixin):\n \"\"\"Polynomial kernel approximation via Tensor Sketch.\n\n Implements Tensor Sketch, which approximates the feature map\n of the polynomial kernel::\n\n K(X, Y) = (gamma * + coef0)^degree\n\n by efficiently computing a Count Sketch of the outer product of a\n vector with itself using Fast Fourier Transforms (FFT). Read more in the\n :ref:`User Guide `.\n\n .. versionadded:: 0.24\n\n Parameters\n ----------\n gamma : float, default=1.0\n Parameter of the polynomial kernel whose feature map\n will be approximated.\n\n degree : int, default=2\n Degree of the polynomial kernel whose feature map\n will be approximated.\n\n coef0 : int, default=0\n Constant term of the polynomial kernel whose feature map\n will be approximated.\n\n n_components : int, default=100\n Dimensionality of the output feature space. Usually, `n_components`\n should be greater than the number of features in input samples in\n order to achieve good performance. The optimal score / run time\n balance is typically achieved around `n_components` = 10 * `n_features`,\n but this depends on the specific dataset being used.\n\n random_state : int, RandomState instance, default=None\n Determines random number generation for indexHash and bitHash\n initialization. Pass an int for reproducible results across multiple\n function calls. See :term:`Glossary `.\n\n Attributes\n ----------\n indexHash_ : ndarray of shape (degree, n_features), dtype=int64\n Array of indexes in range [0, n_components) used to represent\n the 2-wise independent hash functions for Count Sketch computation.\n\n bitHash_ : ndarray of shape (degree, n_features), dtype=float32\n Array with random entries in {+1, -1}, used to represent\n the 2-wise independent hash functions for Count Sketch computation.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. 
versionadded:: 1.0\n\n See Also\n --------\n AdditiveChi2Sampler : Approximate feature map for additive chi2 kernel.\n Nystroem : Approximate a kernel map using a subset of the training data.\n RBFSampler : Approximate a RBF kernel feature map using random Fourier\n features.\n SkewedChi2Sampler : Approximate feature map for \"skewed chi-squared\" kernel.\n sklearn.metrics.pairwise.kernel_metrics : List of built-in kernels.\n\n Examples\n --------\n >>> from sklearn.kernel_approximation import PolynomialCountSketch\n >>> from sklearn.linear_model import SGDClassifier\n >>> X = [[0, 0], [1, 1], [1, 0], [0, 1]]\n >>> y = [0, 0, 1, 1]\n >>> ps = PolynomialCountSketch(degree=3, random_state=1)\n >>> X_features = ps.fit_transform(X)\n >>> clf = SGDClassifier(max_iter=10, tol=1e-3)\n >>> clf.fit(X_features, y)\n SGDClassifier(max_iter=10)\n >>> clf.score(X_features, y)\n 1.0\n \"\"\"\n \n def __init__(self, *, gamma=1.0, degree=2, coef0=0, n_components=100, random_state=None):\n self.gamma = gamma\n self.degree = degree\n self.coef0 = coef0\n self.n_components = n_components\n self.random_state = random_state\n \n def fit(self, X, y=None):\n \"\"\"Fit the model with X.\n\n Initializes the internal variables. The method needs no information\n about the distribution of data, so we only care about n_features in X.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training data, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n y : array-like of shape (n_samples,) or (n_samples, n_outputs), default=None\n Target values (None for unsupervised transformations).\n\n Returns\n -------\n self : object\n Returns the instance itself.\n \"\"\"\n if not self.degree >= 1:\n raise ValueError(f'degree={self.degree} should be >=1.')\n X = self._validate_data(X, accept_sparse='csc')\n random_state = check_random_state(self.random_state)\n n_features = X.shape[1]\n if self.coef0 != 0:\n n_features += 1\n self.indexHash_ = random_state.randint(0, high=self.n_components, size=(self.degree, n_features))\n self.bitHash_ = random_state.choice(a=[-1, 1], size=(self.degree, n_features))\n return self\n \n def transform(self, X):\n \"\"\"Generate the feature map approximation for X.\n\n Parameters\n ----------\n X : {array-like}, shape (n_samples, n_features)\n New data, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n Returns\n -------\n X_new : array-like, shape (n_samples, n_components)\n Returns the instance itself.\n \"\"\"\n check_is_fitted(self)\n X = self._validate_data(X, accept_sparse='csc', reset=False)\n X_gamma = np.sqrt(self.gamma) * X\n if sp.issparse(X_gamma) and self.coef0 != 0:\n X_gamma = sp.hstack([X_gamma, np.sqrt(self.coef0) * np.ones((X_gamma.shape[0], 1))], format='csc')\n elif not sp.issparse(X_gamma) and self.coef0 != 0:\n X_gamma = np.hstack([X_gamma, np.sqrt(self.coef0) * np.ones((X_gamma.shape[0], 1))])\n if X_gamma.shape[1] != self.indexHash_.shape[1]:\n raise ValueError('Number of features of test samples does not match that of training samples.')\n count_sketches = np.zeros((X_gamma.shape[0], self.degree, self.n_components))\n if sp.issparse(X_gamma):\n for j in range(X_gamma.shape[1]):\n for d in range(self.degree):\n iHashIndex = self.indexHash_[d, j]\n iHashBit = self.bitHash_[d, j]\n count_sketches[:, d, iHashIndex] += (iHashBit * X_gamma[:, j]).toarray().ravel()\n else:\n for j in range(X_gamma.shape[1]):\n for d in range(self.degree):\n iHashIndex = 
self.indexHash_[d, j]\n iHashBit = self.bitHash_[d, j]\n count_sketches[:, d, iHashIndex] += iHashBit * X_gamma[:, j]\n count_sketches_fft = fft(count_sketches, axis=2, overwrite_x=True)\n count_sketches_fft_prod = np.prod(count_sketches_fft, axis=1)\n data_sketch = np.real(ifft(count_sketches_fft_prod, overwrite_x=True))\n return data_sketch\n" }, @@ -23257,7 +23324,7 @@ "sklearn.kernel_approximation.RBFSampler.transform" ], "is_public": true, - "description": "Approximate a RBF kernel feature map using random Fourier features.\n\nIt implements a variant of Random Kitchen Sinks.[1] Read more in the :ref:`User Guide `.", + "description": "Approximate a RBF kernel feature map using random Fourier features.\n\nIt implements a variant of Random Kitchen Sinks.[1]\n\nRead more in the :ref:`User Guide `.", "docstring": "Approximate a RBF kernel feature map using random Fourier features.\n\n It implements a variant of Random Kitchen Sinks.[1]\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n gamma : float, default=1.0\n Parameter of RBF kernel: exp(-gamma * x^2).\n\n n_components : int, default=100\n Number of Monte Carlo samples per original feature.\n Equals the dimensionality of the computed feature space.\n\n random_state : int, RandomState instance or None, default=None\n Pseudo-random number generator to control the generation of the random\n weights and random offset when fitting the training data.\n Pass an int for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n Attributes\n ----------\n random_offset_ : ndarray of shape (n_components,), dtype=float64\n Random offset used to compute the projection in the `n_components`\n dimensions of the feature space.\n\n random_weights_ : ndarray of shape (n_features, n_components), dtype=float64\n Random projection directions drawn from the Fourier transform\n of the RBF kernel.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n AdditiveChi2Sampler : Approximate feature map for additive chi2 kernel.\n Nystroem : Approximate a kernel map using a subset of the training data.\n PolynomialCountSketch : Polynomial kernel approximation via Tensor Sketch.\n SkewedChi2Sampler : Approximate feature map for\n \"skewed chi-squared\" kernel.\n sklearn.metrics.pairwise.kernel_metrics : List of built-in kernels.\n\n Notes\n -----\n See \"Random Features for Large-Scale Kernel Machines\" by A. Rahimi and\n Benjamin Recht.\n\n [1] \"Weighted Sums of Random Kitchen Sinks: Replacing\n minimization with randomization in learning\" by A. 
Rahimi and\n Benjamin Recht.\n (https://people.eecs.berkeley.edu/~brecht/papers/08.rah.rec.nips.pdf)\n\n Examples\n --------\n >>> from sklearn.kernel_approximation import RBFSampler\n >>> from sklearn.linear_model import SGDClassifier\n >>> X = [[0, 0], [1, 1], [1, 0], [0, 1]]\n >>> y = [0, 0, 1, 1]\n >>> rbf_feature = RBFSampler(gamma=1, random_state=1)\n >>> X_features = rbf_feature.fit_transform(X)\n >>> clf = SGDClassifier(max_iter=5, tol=1e-3)\n >>> clf.fit(X_features, y)\n SGDClassifier(max_iter=5)\n >>> clf.score(X_features, y)\n 1.0\n ", "source_code": "\n\nclass RBFSampler(TransformerMixin, BaseEstimator):\n \"\"\"Approximate a RBF kernel feature map using random Fourier features.\n\n It implements a variant of Random Kitchen Sinks.[1]\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n gamma : float, default=1.0\n Parameter of RBF kernel: exp(-gamma * x^2).\n\n n_components : int, default=100\n Number of Monte Carlo samples per original feature.\n Equals the dimensionality of the computed feature space.\n\n random_state : int, RandomState instance or None, default=None\n Pseudo-random number generator to control the generation of the random\n weights and random offset when fitting the training data.\n Pass an int for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n Attributes\n ----------\n random_offset_ : ndarray of shape (n_components,), dtype=float64\n Random offset used to compute the projection in the `n_components`\n dimensions of the feature space.\n\n random_weights_ : ndarray of shape (n_features, n_components), dtype=float64\n Random projection directions drawn from the Fourier transform\n of the RBF kernel.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n AdditiveChi2Sampler : Approximate feature map for additive chi2 kernel.\n Nystroem : Approximate a kernel map using a subset of the training data.\n PolynomialCountSketch : Polynomial kernel approximation via Tensor Sketch.\n SkewedChi2Sampler : Approximate feature map for\n \"skewed chi-squared\" kernel.\n sklearn.metrics.pairwise.kernel_metrics : List of built-in kernels.\n\n Notes\n -----\n See \"Random Features for Large-Scale Kernel Machines\" by A. Rahimi and\n Benjamin Recht.\n\n [1] \"Weighted Sums of Random Kitchen Sinks: Replacing\n minimization with randomization in learning\" by A. 
Rahimi and\n Benjamin Recht.\n (https://people.eecs.berkeley.edu/~brecht/papers/08.rah.rec.nips.pdf)\n\n Examples\n --------\n >>> from sklearn.kernel_approximation import RBFSampler\n >>> from sklearn.linear_model import SGDClassifier\n >>> X = [[0, 0], [1, 1], [1, 0], [0, 1]]\n >>> y = [0, 0, 1, 1]\n >>> rbf_feature = RBFSampler(gamma=1, random_state=1)\n >>> X_features = rbf_feature.fit_transform(X)\n >>> clf = SGDClassifier(max_iter=5, tol=1e-3)\n >>> clf.fit(X_features, y)\n SGDClassifier(max_iter=5)\n >>> clf.score(X_features, y)\n 1.0\n \"\"\"\n \n def __init__(self, *, gamma=1.0, n_components=100, random_state=None):\n self.gamma = gamma\n self.n_components = n_components\n self.random_state = random_state\n \n def fit(self, X, y=None):\n \"\"\"Fit the model with X.\n\n Samples random projection according to n_features.\n\n Parameters\n ----------\n X : {array-like, sparse matrix}, shape (n_samples, n_features)\n Training data, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n y : array-like, shape (n_samples,) or (n_samples, n_outputs), default=None\n Target values (None for unsupervised transformations).\n\n Returns\n -------\n self : object\n Returns the instance itself.\n \"\"\"\n X = self._validate_data(X, accept_sparse='csr')\n random_state = check_random_state(self.random_state)\n n_features = X.shape[1]\n self.random_weights_ = np.sqrt(2 * self.gamma) * random_state.normal(size=(n_features, self.n_components))\n self.random_offset_ = random_state.uniform(0, 2 * np.pi, size=self.n_components)\n return self\n \n def transform(self, X):\n \"\"\"Apply the approximate feature map to X.\n\n Parameters\n ----------\n X : {array-like, sparse matrix}, shape (n_samples, n_features)\n New data, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n Returns\n -------\n X_new : array-like, shape (n_samples, n_components)\n Returns the instance itself.\n \"\"\"\n check_is_fitted(self)\n X = self._validate_data(X, accept_sparse='csr', reset=False)\n projection = safe_sparse_dot(X, self.random_weights_)\n projection += self.random_offset_\n np.cos(projection, projection)\n projection *= np.sqrt(2.0) / np.sqrt(self.n_components)\n return projection\n" }, @@ -23294,9 +23361,9 @@ "sklearn.kernel_ridge.KernelRidge.predict" ], "is_public": true, - "description": "Kernel ridge regression.\n\nKernel ridge regression (KRR) combines ridge regression (linear least squares with l2-norm regularization) with the kernel trick. It thus learns a linear function in the space induced by the respective kernel and the data. For non-linear kernels, this corresponds to a non-linear function in the original space. The form of the model learned by KRR is identical to support vector regression (SVR). However, different loss functions are used: KRR uses squared error loss while support vector regression uses epsilon-insensitive loss, both combined with l2 regularization. In contrast to SVR, fitting a KRR model can be done in closed-form and is typically faster for medium-sized datasets. On the other hand, the learned model is non-sparse and thus slower than SVR, which learns a sparse model for epsilon > 0, at prediction-time. This estimator has built-in support for multi-variate regression (i.e., when y is a 2d-array of shape [n_samples, n_targets]). 
Read more in the :ref:`User Guide `.", - "docstring": "Kernel ridge regression.\n\n Kernel ridge regression (KRR) combines ridge regression (linear least\n squares with l2-norm regularization) with the kernel trick. It thus\n learns a linear function in the space induced by the respective kernel and\n the data. For non-linear kernels, this corresponds to a non-linear\n function in the original space.\n\n The form of the model learned by KRR is identical to support vector\n regression (SVR). However, different loss functions are used: KRR uses\n squared error loss while support vector regression uses epsilon-insensitive\n loss, both combined with l2 regularization. In contrast to SVR, fitting a\n KRR model can be done in closed-form and is typically faster for\n medium-sized datasets. On the other hand, the learned model is non-sparse\n and thus slower than SVR, which learns a sparse model for epsilon > 0, at\n prediction-time.\n\n This estimator has built-in support for multi-variate regression\n (i.e., when y is a 2d-array of shape [n_samples, n_targets]).\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n alpha : float or array-like of shape (n_targets,), default=1.0\n Regularization strength; must be a positive float. Regularization\n improves the conditioning of the problem and reduces the variance of\n the estimates. Larger values specify stronger regularization.\n Alpha corresponds to ``1 / (2C)`` in other linear models such as\n :class:`~sklearn.linear_model.LogisticRegression` or\n :class:`~sklearn.svm.LinearSVC`. If an array is passed, penalties are\n assumed to be specific to the targets. Hence they must correspond in\n number. See :ref:`ridge_regression` for formula.\n\n kernel : str or callable, default=\"linear\"\n Kernel mapping used internally. This parameter is directly passed to\n :class:`~sklearn.metrics.pairwise.pairwise_kernel`.\n If `kernel` is a string, it must be one of the metrics\n in `pairwise.PAIRWISE_KERNEL_FUNCTIONS`.\n If `kernel` is \"precomputed\", X is assumed to be a kernel matrix.\n Alternatively, if `kernel` is a callable function, it is called on\n each pair of instances (rows) and the resulting value recorded. The\n callable should take two rows from X as input and return the\n corresponding kernel value as a single number. This means that\n callables from :mod:`sklearn.metrics.pairwise` are not allowed, as\n they operate on matrices, not single samples. Use the string\n identifying the kernel instead.\n\n gamma : float, default=None\n Gamma parameter for the RBF, laplacian, polynomial, exponential chi2\n and sigmoid kernels. Interpretation of the default value is left to\n the kernel; see the documentation for sklearn.metrics.pairwise.\n Ignored by other kernels.\n\n degree : float, default=3\n Degree of the polynomial kernel. Ignored by other kernels.\n\n coef0 : float, default=1\n Zero coefficient for polynomial and sigmoid kernels.\n Ignored by other kernels.\n\n kernel_params : mapping of str to any, default=None\n Additional parameters (keyword arguments) for kernel function passed\n as callable object.\n\n Attributes\n ----------\n dual_coef_ : ndarray of shape (n_samples,) or (n_samples, n_targets)\n Representation of weight vector(s) in kernel space\n\n X_fit_ : {ndarray, sparse matrix} of shape (n_samples, n_features)\n Training data, which is also required for prediction. 
If\n kernel == \"precomputed\" this is instead the precomputed\n training matrix, of shape (n_samples, n_samples).\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n sklearn.gaussian_process.GaussianProcessRegressor : Gaussian\n Process regressor providing automatic kernel hyperparameters\n tuning and predictions uncertainty.\n sklearn.linear_model.Ridge : Linear ridge regression.\n sklearn.linear_model.RidgeCV : Ridge regression with built-in\n cross-validation.\n sklearn.svm.SVR : Support Vector Regression accepting a large variety\n of kernels.\n\n References\n ----------\n * Kevin P. Murphy\n \"Machine Learning: A Probabilistic Perspective\", The MIT Press\n chapter 14.4.3, pp. 492-493\n\n Examples\n --------\n >>> from sklearn.kernel_ridge import KernelRidge\n >>> import numpy as np\n >>> n_samples, n_features = 10, 5\n >>> rng = np.random.RandomState(0)\n >>> y = rng.randn(n_samples)\n >>> X = rng.randn(n_samples, n_features)\n >>> krr = KernelRidge(alpha=1.0)\n >>> krr.fit(X, y)\n KernelRidge(alpha=1.0)\n ", - "source_code": "\n\nclass KernelRidge(MultiOutputMixin, RegressorMixin, BaseEstimator):\n \"\"\"Kernel ridge regression.\n\n Kernel ridge regression (KRR) combines ridge regression (linear least\n squares with l2-norm regularization) with the kernel trick. It thus\n learns a linear function in the space induced by the respective kernel and\n the data. For non-linear kernels, this corresponds to a non-linear\n function in the original space.\n\n The form of the model learned by KRR is identical to support vector\n regression (SVR). However, different loss functions are used: KRR uses\n squared error loss while support vector regression uses epsilon-insensitive\n loss, both combined with l2 regularization. In contrast to SVR, fitting a\n KRR model can be done in closed-form and is typically faster for\n medium-sized datasets. On the other hand, the learned model is non-sparse\n and thus slower than SVR, which learns a sparse model for epsilon > 0, at\n prediction-time.\n\n This estimator has built-in support for multi-variate regression\n (i.e., when y is a 2d-array of shape [n_samples, n_targets]).\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n alpha : float or array-like of shape (n_targets,), default=1.0\n Regularization strength; must be a positive float. Regularization\n improves the conditioning of the problem and reduces the variance of\n the estimates. Larger values specify stronger regularization.\n Alpha corresponds to ``1 / (2C)`` in other linear models such as\n :class:`~sklearn.linear_model.LogisticRegression` or\n :class:`~sklearn.svm.LinearSVC`. If an array is passed, penalties are\n assumed to be specific to the targets. Hence they must correspond in\n number. See :ref:`ridge_regression` for formula.\n\n kernel : str or callable, default=\"linear\"\n Kernel mapping used internally. This parameter is directly passed to\n :class:`~sklearn.metrics.pairwise.pairwise_kernel`.\n If `kernel` is a string, it must be one of the metrics\n in `pairwise.PAIRWISE_KERNEL_FUNCTIONS`.\n If `kernel` is \"precomputed\", X is assumed to be a kernel matrix.\n Alternatively, if `kernel` is a callable function, it is called on\n each pair of instances (rows) and the resulting value recorded. 
The\n callable should take two rows from X as input and return the\n corresponding kernel value as a single number. This means that\n callables from :mod:`sklearn.metrics.pairwise` are not allowed, as\n they operate on matrices, not single samples. Use the string\n identifying the kernel instead.\n\n gamma : float, default=None\n Gamma parameter for the RBF, laplacian, polynomial, exponential chi2\n and sigmoid kernels. Interpretation of the default value is left to\n the kernel; see the documentation for sklearn.metrics.pairwise.\n Ignored by other kernels.\n\n degree : float, default=3\n Degree of the polynomial kernel. Ignored by other kernels.\n\n coef0 : float, default=1\n Zero coefficient for polynomial and sigmoid kernels.\n Ignored by other kernels.\n\n kernel_params : mapping of str to any, default=None\n Additional parameters (keyword arguments) for kernel function passed\n as callable object.\n\n Attributes\n ----------\n dual_coef_ : ndarray of shape (n_samples,) or (n_samples, n_targets)\n Representation of weight vector(s) in kernel space\n\n X_fit_ : {ndarray, sparse matrix} of shape (n_samples, n_features)\n Training data, which is also required for prediction. If\n kernel == \"precomputed\" this is instead the precomputed\n training matrix, of shape (n_samples, n_samples).\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n sklearn.gaussian_process.GaussianProcessRegressor : Gaussian\n Process regressor providing automatic kernel hyperparameters\n tuning and predictions uncertainty.\n sklearn.linear_model.Ridge : Linear ridge regression.\n sklearn.linear_model.RidgeCV : Ridge regression with built-in\n cross-validation.\n sklearn.svm.SVR : Support Vector Regression accepting a large variety\n of kernels.\n\n References\n ----------\n * Kevin P. Murphy\n \"Machine Learning: A Probabilistic Perspective\", The MIT Press\n chapter 14.4.3, pp. 492-493\n\n Examples\n --------\n >>> from sklearn.kernel_ridge import KernelRidge\n >>> import numpy as np\n >>> n_samples, n_features = 10, 5\n >>> rng = np.random.RandomState(0)\n >>> y = rng.randn(n_samples)\n >>> X = rng.randn(n_samples, n_features)\n >>> krr = KernelRidge(alpha=1.0)\n >>> krr.fit(X, y)\n KernelRidge(alpha=1.0)\n \"\"\"\n \n def __init__(self, alpha=1, *, kernel='linear', gamma=None, degree=3, coef0=1, kernel_params=None):\n self.alpha = alpha\n self.kernel = kernel\n self.gamma = gamma\n self.degree = degree\n self.coef0 = coef0\n self.kernel_params = kernel_params\n \n def _get_kernel(self, X, Y=None):\n if callable(self.kernel):\n params = self.kernel_params or {}\n else:\n params = {'gamma': self.gamma, 'degree': self.degree, 'coef0': self.coef0}\n return pairwise_kernels(X, Y, metric=self.kernel, filter_params=True, **params)\n \n def _more_tags(self):\n return {'pairwise': self.kernel == 'precomputed'}\n \n @deprecated('Attribute `_pairwise` was deprecated in version 0.24 and will be removed in 1.1 (renaming of 0.26).')\n @property\n def _pairwise(self):\n return self.kernel == 'precomputed'\n \n def fit(self, X, y, sample_weight=None):\n \"\"\"Fit Kernel Ridge regression model.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training data. 
If kernel == \"precomputed\" this is instead\n a precomputed kernel matrix, of shape (n_samples, n_samples).\n\n y : array-like of shape (n_samples,) or (n_samples, n_targets)\n Target values.\n\n sample_weight : float or array-like of shape (n_samples,), default=None\n Individual weights for each sample, ignored if None is passed.\n\n Returns\n -------\n self : object\n Returns the instance itself.\n \"\"\"\n (X, y) = self._validate_data(X, y, accept_sparse=('csr', 'csc'), multi_output=True, y_numeric=True)\n if sample_weight is not None and not isinstance(sample_weight, float):\n sample_weight = _check_sample_weight(sample_weight, X)\n K = self._get_kernel(X)\n alpha = np.atleast_1d(self.alpha)\n ravel = False\n if len(y.shape) == 1:\n y = y.reshape(-1, 1)\n ravel = True\n copy = self.kernel == 'precomputed'\n self.dual_coef_ = _solve_cholesky_kernel(K, y, alpha, sample_weight, copy)\n if ravel:\n self.dual_coef_ = self.dual_coef_.ravel()\n self.X_fit_ = X\n return self\n \n def predict(self, X):\n \"\"\"Predict using the kernel ridge model.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Samples. If kernel == \"precomputed\" this is instead a\n precomputed kernel matrix, shape = [n_samples,\n n_samples_fitted], where n_samples_fitted is the number of\n samples used in the fitting for this estimator.\n\n Returns\n -------\n C : ndarray of shape (n_samples,) or (n_samples, n_targets)\n Returns predicted values.\n \"\"\"\n check_is_fitted(self)\n X = self._validate_data(X, accept_sparse=('csr', 'csc'), reset=False)\n K = self._get_kernel(X, self.X_fit_)\n return np.dot(K, self.dual_coef_)\n" + "description": "Kernel ridge regression.\n\nKernel ridge regression (KRR) combines ridge regression (linear least\nsquares with l2-norm regularization) with the kernel trick. It thus\nlearns a linear function in the space induced by the respective kernel and\nthe data. For non-linear kernels, this corresponds to a non-linear\nfunction in the original space.\n\nThe form of the model learned by KRR is identical to support vector\nregression (SVR). However, different loss functions are used: KRR uses\nsquared error loss while support vector regression uses epsilon-insensitive\nloss, both combined with l2 regularization. In contrast to SVR, fitting a\nKRR model can be done in closed-form and is typically faster for\nmedium-sized datasets. On the other hand, the learned model is non-sparse\nand thus slower than SVR, which learns a sparse model for epsilon > 0, at\nprediction-time.\n\nThis estimator has built-in support for multi-variate regression\n(i.e., when y is a 2d-array of shape [n_samples, n_targets]).\n\nRead more in the :ref:`User Guide `.", + "docstring": "Kernel ridge regression.\n\n Kernel ridge regression (KRR) combines ridge regression (linear least\n squares with l2-norm regularization) with the kernel trick. It thus\n learns a linear function in the space induced by the respective kernel and\n the data. For non-linear kernels, this corresponds to a non-linear\n function in the original space.\n\n The form of the model learned by KRR is identical to support vector\n regression (SVR). However, different loss functions are used: KRR uses\n squared error loss while support vector regression uses epsilon-insensitive\n loss, both combined with l2 regularization. In contrast to SVR, fitting a\n KRR model can be done in closed-form and is typically faster for\n medium-sized datasets. 
On the other hand, the learned model is non-sparse\n and thus slower than SVR, which learns a sparse model for epsilon > 0, at\n prediction-time.\n\n This estimator has built-in support for multi-variate regression\n (i.e., when y is a 2d-array of shape [n_samples, n_targets]).\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n alpha : float or array-like of shape (n_targets,), default=1.0\n Regularization strength; must be a positive float. Regularization\n improves the conditioning of the problem and reduces the variance of\n the estimates. Larger values specify stronger regularization.\n Alpha corresponds to ``1 / (2C)`` in other linear models such as\n :class:`~sklearn.linear_model.LogisticRegression` or\n :class:`~sklearn.svm.LinearSVC`. If an array is passed, penalties are\n assumed to be specific to the targets. Hence they must correspond in\n number. See :ref:`ridge_regression` for formula.\n\n kernel : str or callable, default=\"linear\"\n Kernel mapping used internally. This parameter is directly passed to\n :class:`~sklearn.metrics.pairwise.pairwise_kernel`.\n If `kernel` is a string, it must be one of the metrics\n in `pairwise.PAIRWISE_KERNEL_FUNCTIONS` or \"precomputed\".\n If `kernel` is \"precomputed\", X is assumed to be a kernel matrix.\n Alternatively, if `kernel` is a callable function, it is called on\n each pair of instances (rows) and the resulting value recorded. The\n callable should take two rows from X as input and return the\n corresponding kernel value as a single number. This means that\n callables from :mod:`sklearn.metrics.pairwise` are not allowed, as\n they operate on matrices, not single samples. Use the string\n identifying the kernel instead.\n\n gamma : float, default=None\n Gamma parameter for the RBF, laplacian, polynomial, exponential chi2\n and sigmoid kernels. Interpretation of the default value is left to\n the kernel; see the documentation for sklearn.metrics.pairwise.\n Ignored by other kernels.\n\n degree : float, default=3\n Degree of the polynomial kernel. Ignored by other kernels.\n\n coef0 : float, default=1\n Zero coefficient for polynomial and sigmoid kernels.\n Ignored by other kernels.\n\n kernel_params : mapping of str to any, default=None\n Additional parameters (keyword arguments) for kernel function passed\n as callable object.\n\n Attributes\n ----------\n dual_coef_ : ndarray of shape (n_samples,) or (n_samples, n_targets)\n Representation of weight vector(s) in kernel space\n\n X_fit_ : {ndarray, sparse matrix} of shape (n_samples, n_features)\n Training data, which is also required for prediction. If\n kernel == \"precomputed\" this is instead the precomputed\n training matrix, of shape (n_samples, n_samples).\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n sklearn.gaussian_process.GaussianProcessRegressor : Gaussian\n Process regressor providing automatic kernel hyperparameters\n tuning and predictions uncertainty.\n sklearn.linear_model.Ridge : Linear ridge regression.\n sklearn.linear_model.RidgeCV : Ridge regression with built-in\n cross-validation.\n sklearn.svm.SVR : Support Vector Regression accepting a large variety\n of kernels.\n\n References\n ----------\n * Kevin P. 
Murphy\n \"Machine Learning: A Probabilistic Perspective\", The MIT Press\n chapter 14.4.3, pp. 492-493\n\n Examples\n --------\n >>> from sklearn.kernel_ridge import KernelRidge\n >>> import numpy as np\n >>> n_samples, n_features = 10, 5\n >>> rng = np.random.RandomState(0)\n >>> y = rng.randn(n_samples)\n >>> X = rng.randn(n_samples, n_features)\n >>> krr = KernelRidge(alpha=1.0)\n >>> krr.fit(X, y)\n KernelRidge(alpha=1.0)\n ", + "source_code": "\n\nclass KernelRidge(MultiOutputMixin, RegressorMixin, BaseEstimator):\n \"\"\"Kernel ridge regression.\n\n Kernel ridge regression (KRR) combines ridge regression (linear least\n squares with l2-norm regularization) with the kernel trick. It thus\n learns a linear function in the space induced by the respective kernel and\n the data. For non-linear kernels, this corresponds to a non-linear\n function in the original space.\n\n The form of the model learned by KRR is identical to support vector\n regression (SVR). However, different loss functions are used: KRR uses\n squared error loss while support vector regression uses epsilon-insensitive\n loss, both combined with l2 regularization. In contrast to SVR, fitting a\n KRR model can be done in closed-form and is typically faster for\n medium-sized datasets. On the other hand, the learned model is non-sparse\n and thus slower than SVR, which learns a sparse model for epsilon > 0, at\n prediction-time.\n\n This estimator has built-in support for multi-variate regression\n (i.e., when y is a 2d-array of shape [n_samples, n_targets]).\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n alpha : float or array-like of shape (n_targets,), default=1.0\n Regularization strength; must be a positive float. Regularization\n improves the conditioning of the problem and reduces the variance of\n the estimates. Larger values specify stronger regularization.\n Alpha corresponds to ``1 / (2C)`` in other linear models such as\n :class:`~sklearn.linear_model.LogisticRegression` or\n :class:`~sklearn.svm.LinearSVC`. If an array is passed, penalties are\n assumed to be specific to the targets. Hence they must correspond in\n number. See :ref:`ridge_regression` for formula.\n\n kernel : str or callable, default=\"linear\"\n Kernel mapping used internally. This parameter is directly passed to\n :class:`~sklearn.metrics.pairwise.pairwise_kernel`.\n If `kernel` is a string, it must be one of the metrics\n in `pairwise.PAIRWISE_KERNEL_FUNCTIONS` or \"precomputed\".\n If `kernel` is \"precomputed\", X is assumed to be a kernel matrix.\n Alternatively, if `kernel` is a callable function, it is called on\n each pair of instances (rows) and the resulting value recorded. The\n callable should take two rows from X as input and return the\n corresponding kernel value as a single number. This means that\n callables from :mod:`sklearn.metrics.pairwise` are not allowed, as\n they operate on matrices, not single samples. Use the string\n identifying the kernel instead.\n\n gamma : float, default=None\n Gamma parameter for the RBF, laplacian, polynomial, exponential chi2\n and sigmoid kernels. Interpretation of the default value is left to\n the kernel; see the documentation for sklearn.metrics.pairwise.\n Ignored by other kernels.\n\n degree : float, default=3\n Degree of the polynomial kernel. 
Ignored by other kernels.\n\n coef0 : float, default=1\n Zero coefficient for polynomial and sigmoid kernels.\n Ignored by other kernels.\n\n kernel_params : mapping of str to any, default=None\n Additional parameters (keyword arguments) for kernel function passed\n as callable object.\n\n Attributes\n ----------\n dual_coef_ : ndarray of shape (n_samples,) or (n_samples, n_targets)\n Representation of weight vector(s) in kernel space\n\n X_fit_ : {ndarray, sparse matrix} of shape (n_samples, n_features)\n Training data, which is also required for prediction. If\n kernel == \"precomputed\" this is instead the precomputed\n training matrix, of shape (n_samples, n_samples).\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n sklearn.gaussian_process.GaussianProcessRegressor : Gaussian\n Process regressor providing automatic kernel hyperparameters\n tuning and predictions uncertainty.\n sklearn.linear_model.Ridge : Linear ridge regression.\n sklearn.linear_model.RidgeCV : Ridge regression with built-in\n cross-validation.\n sklearn.svm.SVR : Support Vector Regression accepting a large variety\n of kernels.\n\n References\n ----------\n * Kevin P. Murphy\n \"Machine Learning: A Probabilistic Perspective\", The MIT Press\n chapter 14.4.3, pp. 492-493\n\n Examples\n --------\n >>> from sklearn.kernel_ridge import KernelRidge\n >>> import numpy as np\n >>> n_samples, n_features = 10, 5\n >>> rng = np.random.RandomState(0)\n >>> y = rng.randn(n_samples)\n >>> X = rng.randn(n_samples, n_features)\n >>> krr = KernelRidge(alpha=1.0)\n >>> krr.fit(X, y)\n KernelRidge(alpha=1.0)\n \"\"\"\n \n def __init__(self, alpha=1, *, kernel='linear', gamma=None, degree=3, coef0=1, kernel_params=None):\n self.alpha = alpha\n self.kernel = kernel\n self.gamma = gamma\n self.degree = degree\n self.coef0 = coef0\n self.kernel_params = kernel_params\n \n def _get_kernel(self, X, Y=None):\n if callable(self.kernel):\n params = self.kernel_params or {}\n else:\n params = {'gamma': self.gamma, 'degree': self.degree, 'coef0': self.coef0}\n return pairwise_kernels(X, Y, metric=self.kernel, filter_params=True, **params)\n \n def _more_tags(self):\n return {'pairwise': self.kernel == 'precomputed'}\n \n @deprecated('Attribute `_pairwise` was deprecated in version 0.24 and will be removed in 1.1 (renaming of 0.26).')\n @property\n def _pairwise(self):\n return self.kernel == 'precomputed'\n \n def fit(self, X, y, sample_weight=None):\n \"\"\"Fit Kernel Ridge regression model.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training data. 
If kernel == \"precomputed\" this is instead\n a precomputed kernel matrix, of shape (n_samples, n_samples).\n\n y : array-like of shape (n_samples,) or (n_samples, n_targets)\n Target values.\n\n sample_weight : float or array-like of shape (n_samples,), default=None\n Individual weights for each sample, ignored if None is passed.\n\n Returns\n -------\n self : object\n Returns the instance itself.\n \"\"\"\n (X, y) = self._validate_data(X, y, accept_sparse=('csr', 'csc'), multi_output=True, y_numeric=True)\n if sample_weight is not None and not isinstance(sample_weight, float):\n sample_weight = _check_sample_weight(sample_weight, X)\n K = self._get_kernel(X)\n alpha = np.atleast_1d(self.alpha)\n ravel = False\n if len(y.shape) == 1:\n y = y.reshape(-1, 1)\n ravel = True\n copy = self.kernel == 'precomputed'\n self.dual_coef_ = _solve_cholesky_kernel(K, y, alpha, sample_weight, copy)\n if ravel:\n self.dual_coef_ = self.dual_coef_.ravel()\n self.X_fit_ = X\n return self\n \n def predict(self, X):\n \"\"\"Predict using the kernel ridge model.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Samples. If kernel == \"precomputed\" this is instead a\n precomputed kernel matrix, shape = [n_samples,\n n_samples_fitted], where n_samples_fitted is the number of\n samples used in the fitting for this estimator.\n\n Returns\n -------\n C : ndarray of shape (n_samples,) or (n_samples, n_targets)\n Returns predicted values.\n \"\"\"\n check_is_fitted(self)\n X = self._validate_data(X, accept_sparse=('csr', 'csc'), reset=False)\n K = self._get_kernel(X, self.X_fit_)\n return np.dot(K, self.dual_coef_)\n" }, { "name": "LinearClassifierMixin", @@ -23311,7 +23378,7 @@ "is_public": false, "description": "Mixin for linear classifiers.\n\nHandles prediction for sparse and dense X.", "docstring": "Mixin for linear classifiers.\n\n Handles prediction for sparse and dense X.\n ", - "source_code": "\n\nclass LinearClassifierMixin(ClassifierMixin):\n \"\"\"Mixin for linear classifiers.\n\n Handles prediction for sparse and dense X.\n \"\"\"\n \n def decision_function(self, X):\n \"\"\"\n Predict confidence scores for samples.\n\n The confidence score for a sample is proportional to the signed\n distance of that sample to the hyperplane.\n\n Parameters\n ----------\n X : array-like or sparse matrix, shape (n_samples, n_features)\n Samples.\n\n Returns\n -------\n array, shape=(n_samples,) if n_classes == 2 else (n_samples, n_classes)\n Confidence scores per (sample, class) combination. In the binary\n case, confidence score for self.classes_[1] where >0 means this\n class would be predicted.\n \"\"\"\n check_is_fitted(self)\n X = self._validate_data(X, accept_sparse='csr', reset=False)\n scores = safe_sparse_dot(X, self.coef_.T, dense_output=True) + self.intercept_\n return scores.ravel() if scores.shape[1] == 1 else scores\n \n def predict(self, X):\n \"\"\"\n Predict class labels for samples in X.\n\n Parameters\n ----------\n X : array-like or sparse matrix, shape (n_samples, n_features)\n Samples.\n\n Returns\n -------\n C : array, shape [n_samples]\n Predicted class label per sample.\n \"\"\"\n scores = self.decision_function(X)\n if len(scores.shape) == 1:\n indices = (scores > 0).astype(int)\n else:\n indices = scores.argmax(axis=1)\n return self.classes_[indices]\n \n def _predict_proba_lr(self, X):\n \"\"\"Probability estimation for OvR logistic regression.\n\n Positive class probabilities are computed as\n 1. / (1. 
+ np.exp(-self.decision_function(X)));\n multiclass is handled by normalizing that over all classes.\n \"\"\"\n prob = self.decision_function(X)\n expit(prob, out=prob)\n if prob.ndim == 1:\n return np.vstack([1 - prob, prob]).T\n else:\n prob /= prob.sum(axis=1).reshape((prob.shape[0], -1))\n return prob\n" + "source_code": "\n\nclass LinearClassifierMixin(ClassifierMixin):\n \"\"\"Mixin for linear classifiers.\n\n Handles prediction for sparse and dense X.\n \"\"\"\n \n def decision_function(self, X):\n \"\"\"\n Predict confidence scores for samples.\n\n The confidence score for a sample is proportional to the signed\n distance of that sample to the hyperplane.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The data matrix for which we want to get the confidence scores.\n\n Returns\n -------\n scores : ndarray of shape (n_samples,) or (n_samples, n_classes)\n Confidence scores per `(n_samples, n_classes)` combination. In the\n binary case, confidence score for `self.classes_[1]` where >0 means\n this class would be predicted.\n \"\"\"\n check_is_fitted(self)\n X = self._validate_data(X, accept_sparse='csr', reset=False)\n scores = safe_sparse_dot(X, self.coef_.T, dense_output=True) + self.intercept_\n return scores.ravel() if scores.shape[1] == 1 else scores\n \n def predict(self, X):\n \"\"\"\n Predict class labels for samples in X.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The data matrix for which we want to get the predictions.\n\n Returns\n -------\n y_pred : ndarray of shape (n_samples,)\n Vector containing the class labels for each sample.\n \"\"\"\n scores = self.decision_function(X)\n if len(scores.shape) == 1:\n indices = (scores > 0).astype(int)\n else:\n indices = scores.argmax(axis=1)\n return self.classes_[indices]\n \n def _predict_proba_lr(self, X):\n \"\"\"Probability estimation for OvR logistic regression.\n\n Positive class probabilities are computed as\n 1. / (1. + np.exp(-self.decision_function(X)));\n multiclass is handled by normalizing that over all classes.\n \"\"\"\n prob = self.decision_function(X)\n expit(prob, out=prob)\n if prob.ndim == 1:\n return np.vstack([1 - prob, prob]).T\n else:\n prob /= prob.sum(axis=1).reshape((prob.shape[0], -1))\n return prob\n" }, { "name": "LinearModel", @@ -23344,7 +23411,7 @@ "sklearn.linear_model._base.LinearRegression.fit" ], "is_public": true, - "description": "Ordinary least squares Linear Regression.\n\nLinearRegression fits a linear model with coefficients w = (w1, ..., wp) to minimize the residual sum of squares between the observed targets in the dataset, and the targets predicted by the linear approximation.", + "description": "Ordinary least squares Linear Regression.\n\nLinearRegression fits a linear model with coefficients w = (w1, ..., wp)\nto minimize the residual sum of squares between the observed targets in\nthe dataset, and the targets predicted by the linear approximation.", "docstring": "\n Ordinary least squares Linear Regression.\n\n LinearRegression fits a linear model with coefficients w = (w1, ..., wp)\n to minimize the residual sum of squares between the observed targets in\n the dataset, and the targets predicted by the linear approximation.\n\n Parameters\n ----------\n fit_intercept : bool, default=True\n Whether to calculate the intercept for this model. If set\n to False, no intercept will be used in calculations\n (i.e. 
data is expected to be centered).\n\n normalize : bool, default=False\n This parameter is ignored when ``fit_intercept`` is set to False.\n If True, the regressors X will be normalized before regression by\n subtracting the mean and dividing by the l2-norm.\n If you wish to standardize, please use\n :class:`~sklearn.preprocessing.StandardScaler` before calling ``fit``\n on an estimator with ``normalize=False``.\n\n .. deprecated:: 1.0\n `normalize` was deprecated in version 1.0 and will be\n removed in 1.2.\n\n copy_X : bool, default=True\n If True, X will be copied; else, it may be overwritten.\n\n n_jobs : int, default=None\n The number of jobs to use for the computation. This will only provide\n speedup in case of sufficiently large problems, that is if firstly\n `n_targets > 1` and secondly `X` is sparse or if `positive` is set\n to `True`. ``None`` means 1 unless in a\n :obj:`joblib.parallel_backend` context. ``-1`` means using all\n processors. See :term:`Glossary ` for more details.\n\n positive : bool, default=False\n When set to ``True``, forces the coefficients to be positive. This\n option is only supported for dense arrays.\n\n .. versionadded:: 0.24\n\n Attributes\n ----------\n coef_ : array of shape (n_features, ) or (n_targets, n_features)\n Estimated coefficients for the linear regression problem.\n If multiple targets are passed during the fit (y 2D), this\n is a 2D array of shape (n_targets, n_features), while if only\n one target is passed, this is a 1D array of length n_features.\n\n rank_ : int\n Rank of matrix `X`. Only available when `X` is dense.\n\n singular_ : array of shape (min(X, y),)\n Singular values of `X`. Only available when `X` is dense.\n\n intercept_ : float or array of shape (n_targets,)\n Independent term in the linear model. Set to 0.0 if\n `fit_intercept = False`.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. 
versionadded:: 1.0\n\n See Also\n --------\n Ridge : Ridge regression addresses some of the\n problems of Ordinary Least Squares by imposing a penalty on the\n size of the coefficients with l2 regularization.\n Lasso : The Lasso is a linear model that estimates\n sparse coefficients with l1 regularization.\n ElasticNet : Elastic-Net is a linear regression\n model trained with both l1 and l2 -norm regularization of the\n coefficients.\n\n Notes\n -----\n From the implementation point of view, this is just plain Ordinary\n Least Squares (scipy.linalg.lstsq) or Non Negative Least Squares\n (scipy.optimize.nnls) wrapped as a predictor object.\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.linear_model import LinearRegression\n >>> X = np.array([[1, 1], [1, 2], [2, 2], [2, 3]])\n >>> # y = 1 * x_0 + 2 * x_1 + 3\n >>> y = np.dot(X, np.array([1, 2])) + 3\n >>> reg = LinearRegression().fit(X, y)\n >>> reg.score(X, y)\n 1.0\n >>> reg.coef_\n array([1., 2.])\n >>> reg.intercept_\n 3.0...\n >>> reg.predict(np.array([[3, 5]]))\n array([16.])\n ", "source_code": "\n\nclass LinearRegression(MultiOutputMixin, RegressorMixin, LinearModel):\n \"\"\"\n Ordinary least squares Linear Regression.\n\n LinearRegression fits a linear model with coefficients w = (w1, ..., wp)\n to minimize the residual sum of squares between the observed targets in\n the dataset, and the targets predicted by the linear approximation.\n\n Parameters\n ----------\n fit_intercept : bool, default=True\n Whether to calculate the intercept for this model. If set\n to False, no intercept will be used in calculations\n (i.e. data is expected to be centered).\n\n normalize : bool, default=False\n This parameter is ignored when ``fit_intercept`` is set to False.\n If True, the regressors X will be normalized before regression by\n subtracting the mean and dividing by the l2-norm.\n If you wish to standardize, please use\n :class:`~sklearn.preprocessing.StandardScaler` before calling ``fit``\n on an estimator with ``normalize=False``.\n\n .. deprecated:: 1.0\n `normalize` was deprecated in version 1.0 and will be\n removed in 1.2.\n\n copy_X : bool, default=True\n If True, X will be copied; else, it may be overwritten.\n\n n_jobs : int, default=None\n The number of jobs to use for the computation. This will only provide\n speedup in case of sufficiently large problems, that is if firstly\n `n_targets > 1` and secondly `X` is sparse or if `positive` is set\n to `True`. ``None`` means 1 unless in a\n :obj:`joblib.parallel_backend` context. ``-1`` means using all\n processors. See :term:`Glossary ` for more details.\n\n positive : bool, default=False\n When set to ``True``, forces the coefficients to be positive. This\n option is only supported for dense arrays.\n\n .. versionadded:: 0.24\n\n Attributes\n ----------\n coef_ : array of shape (n_features, ) or (n_targets, n_features)\n Estimated coefficients for the linear regression problem.\n If multiple targets are passed during the fit (y 2D), this\n is a 2D array of shape (n_targets, n_features), while if only\n one target is passed, this is a 1D array of length n_features.\n\n rank_ : int\n Rank of matrix `X`. Only available when `X` is dense.\n\n singular_ : array of shape (min(X, y),)\n Singular values of `X`. Only available when `X` is dense.\n\n intercept_ : float or array of shape (n_targets,)\n Independent term in the linear model. Set to 0.0 if\n `fit_intercept = False`.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. 
versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n Ridge : Ridge regression addresses some of the\n problems of Ordinary Least Squares by imposing a penalty on the\n size of the coefficients with l2 regularization.\n Lasso : The Lasso is a linear model that estimates\n sparse coefficients with l1 regularization.\n ElasticNet : Elastic-Net is a linear regression\n model trained with both l1 and l2 -norm regularization of the\n coefficients.\n\n Notes\n -----\n From the implementation point of view, this is just plain Ordinary\n Least Squares (scipy.linalg.lstsq) or Non Negative Least Squares\n (scipy.optimize.nnls) wrapped as a predictor object.\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.linear_model import LinearRegression\n >>> X = np.array([[1, 1], [1, 2], [2, 2], [2, 3]])\n >>> # y = 1 * x_0 + 2 * x_1 + 3\n >>> y = np.dot(X, np.array([1, 2])) + 3\n >>> reg = LinearRegression().fit(X, y)\n >>> reg.score(X, y)\n 1.0\n >>> reg.coef_\n array([1., 2.])\n >>> reg.intercept_\n 3.0...\n >>> reg.predict(np.array([[3, 5]]))\n array([16.])\n \"\"\"\n \n def __init__(self, *, fit_intercept=True, normalize='deprecated', copy_X=True, n_jobs=None, positive=False):\n self.fit_intercept = fit_intercept\n self.normalize = normalize\n self.copy_X = copy_X\n self.n_jobs = n_jobs\n self.positive = positive\n \n def fit(self, X, y, sample_weight=None):\n \"\"\"\n Fit linear model.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training data.\n\n y : array-like of shape (n_samples,) or (n_samples, n_targets)\n Target values. Will be cast to X's dtype if necessary.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Individual weights for each sample.\n\n .. 
versionadded:: 0.17\n parameter *sample_weight* support to LinearRegression.\n\n Returns\n -------\n self : object\n Fitted Estimator.\n \"\"\"\n _normalize = _deprecate_normalize(self.normalize, default=False, estimator_name=self.__class__.__name__)\n n_jobs_ = self.n_jobs\n accept_sparse = False if self.positive else ['csr', 'csc', 'coo']\n (X, y) = self._validate_data(X, y, accept_sparse=accept_sparse, y_numeric=True, multi_output=True)\n if sample_weight is not None:\n sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype)\n (X, y, X_offset, y_offset, X_scale) = self._preprocess_data(X, y, fit_intercept=self.fit_intercept, normalize=_normalize, copy=self.copy_X, sample_weight=sample_weight, return_mean=True)\n if sample_weight is not None:\n (X, y) = _rescale_data(X, y, sample_weight)\n if self.positive:\n if y.ndim < 2:\n (self.coef_, self._residues) = optimize.nnls(X, y)\n else:\n outs = Parallel(n_jobs=n_jobs_)((delayed(optimize.nnls)(X, y[:, j]) for j in range(y.shape[1])))\n (self.coef_, self._residues) = map(np.vstack, zip(*outs))\n elif sp.issparse(X):\n X_offset_scale = X_offset / X_scale\n \n def matvec(b):\n return X.dot(b) - b.dot(X_offset_scale)\n \n def rmatvec(b):\n return X.T.dot(b) - X_offset_scale * np.sum(b)\n X_centered = sparse.linalg.LinearOperator(shape=X.shape, matvec=matvec, rmatvec=rmatvec)\n if y.ndim < 2:\n out = sparse_lsqr(X_centered, y)\n self.coef_ = out[0]\n self._residues = out[3]\n else:\n outs = Parallel(n_jobs=n_jobs_)((delayed(sparse_lsqr)(X_centered, y[:, j].ravel()) for j in range(y.shape[1])))\n self.coef_ = np.vstack([out[0] for out in outs])\n self._residues = np.vstack([out[3] for out in outs])\n else:\n (self.coef_, self._residues, self.rank_, self.singular_) = linalg.lstsq(X, y)\n self.coef_ = self.coef_.T\n if y.ndim == 1:\n self.coef_ = np.ravel(self.coef_)\n self._set_intercept(X_offset, y_offset, X_scale)\n return self\n" }, @@ -23375,7 +23442,7 @@ "sklearn.linear_model._bayes.ARDRegression.predict" ], "is_public": true, - "description": "Bayesian ARD regression.\n\nFit the weights of a regression model, using an ARD prior. The weights of the regression model are assumed to be in Gaussian distributions. Also estimate the parameters lambda (precisions of the distributions of the weights) and alpha (precision of the distribution of the noise). The estimation is done by an iterative procedures (Evidence Maximization) Read more in the :ref:`User Guide `.", + "description": "Bayesian ARD regression.\n\nFit the weights of a regression model, using an ARD prior. The weights of\nthe regression model are assumed to be in Gaussian distributions.\nAlso estimate the parameters lambda (precisions of the distributions of the\nweights) and alpha (precision of the distribution of the noise).\nThe estimation is done by an iterative procedures (Evidence Maximization)\n\nRead more in the :ref:`User Guide `.", "docstring": "Bayesian ARD regression.\n\n Fit the weights of a regression model, using an ARD prior. 
The weights of\n the regression model are assumed to be in Gaussian distributions.\n Also estimate the parameters lambda (precisions of the distributions of the\n weights) and alpha (precision of the distribution of the noise).\n The estimation is done by an iterative procedures (Evidence Maximization)\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n n_iter : int, default=300\n Maximum number of iterations.\n\n tol : float, default=1e-3\n Stop the algorithm if w has converged.\n\n alpha_1 : float, default=1e-6\n Hyper-parameter : shape parameter for the Gamma distribution prior\n over the alpha parameter.\n\n alpha_2 : float, default=1e-6\n Hyper-parameter : inverse scale parameter (rate parameter) for the\n Gamma distribution prior over the alpha parameter.\n\n lambda_1 : float, default=1e-6\n Hyper-parameter : shape parameter for the Gamma distribution prior\n over the lambda parameter.\n\n lambda_2 : float, default=1e-6\n Hyper-parameter : inverse scale parameter (rate parameter) for the\n Gamma distribution prior over the lambda parameter.\n\n compute_score : bool, default=False\n If True, compute the objective function at each step of the model.\n\n threshold_lambda : float, default=10 000\n Threshold for removing (pruning) weights with high precision from\n the computation.\n\n fit_intercept : bool, default=True\n Whether to calculate the intercept for this model. If set\n to false, no intercept will be used in calculations\n (i.e. data is expected to be centered).\n\n normalize : bool, default=False\n This parameter is ignored when ``fit_intercept`` is set to False.\n If True, the regressors X will be normalized before regression by\n subtracting the mean and dividing by the l2-norm.\n If you wish to standardize, please use\n :class:`~sklearn.preprocessing.StandardScaler` before calling ``fit``\n on an estimator with ``normalize=False``.\n\n .. deprecated:: 1.0\n ``normalize`` was deprecated in version 1.0 and will be removed in\n 1.2.\n\n copy_X : bool, default=True\n If True, X will be copied; else, it may be overwritten.\n\n verbose : bool, default=False\n Verbose mode when fitting the model.\n\n Attributes\n ----------\n coef_ : array-like of shape (n_features,)\n Coefficients of the regression model (mean of distribution)\n\n alpha_ : float\n estimated precision of the noise.\n\n lambda_ : array-like of shape (n_features,)\n estimated precisions of the weights.\n\n sigma_ : array-like of shape (n_features, n_features)\n estimated variance-covariance matrix of the weights\n\n scores_ : float\n if computed, value of the objective function (to be maximized)\n\n intercept_ : float\n Independent term in decision function. Set to 0.0 if\n ``fit_intercept = False``.\n\n X_offset_ : float\n If `normalize=True`, offset subtracted for centering data to a\n zero mean.\n\n X_scale_ : float\n If `normalize=True`, parameter used to scale data to a unit\n standard deviation.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n BayesianRidge : Bayesian ridge regression.\n\n Notes\n -----\n For an example, see :ref:`examples/linear_model/plot_ard.py\n `.\n\n References\n ----------\n D. J. C. MacKay, Bayesian nonlinear modeling for the prediction\n competition, ASHRAE Transactions, 1994.\n\n R. 
Salakhutdinov, Lecture notes on Statistical Machine Learning,\n http://www.utstat.toronto.edu/~rsalakhu/sta4273/notes/Lecture2.pdf#page=15\n Their beta is our ``self.alpha_``\n Their alpha is our ``self.lambda_``\n ARD is a little different than the slide: only dimensions/features for\n which ``self.lambda_ < self.threshold_lambda`` are kept and the rest are\n discarded.\n\n Examples\n --------\n >>> from sklearn import linear_model\n >>> clf = linear_model.ARDRegression()\n >>> clf.fit([[0,0], [1, 1], [2, 2]], [0, 1, 2])\n ARDRegression()\n >>> clf.predict([[1, 1]])\n array([1.])\n ", "source_code": "\n\nclass ARDRegression(RegressorMixin, LinearModel):\n \"\"\"Bayesian ARD regression.\n\n Fit the weights of a regression model, using an ARD prior. The weights of\n the regression model are assumed to be in Gaussian distributions.\n Also estimate the parameters lambda (precisions of the distributions of the\n weights) and alpha (precision of the distribution of the noise).\n The estimation is done by an iterative procedures (Evidence Maximization)\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n n_iter : int, default=300\n Maximum number of iterations.\n\n tol : float, default=1e-3\n Stop the algorithm if w has converged.\n\n alpha_1 : float, default=1e-6\n Hyper-parameter : shape parameter for the Gamma distribution prior\n over the alpha parameter.\n\n alpha_2 : float, default=1e-6\n Hyper-parameter : inverse scale parameter (rate parameter) for the\n Gamma distribution prior over the alpha parameter.\n\n lambda_1 : float, default=1e-6\n Hyper-parameter : shape parameter for the Gamma distribution prior\n over the lambda parameter.\n\n lambda_2 : float, default=1e-6\n Hyper-parameter : inverse scale parameter (rate parameter) for the\n Gamma distribution prior over the lambda parameter.\n\n compute_score : bool, default=False\n If True, compute the objective function at each step of the model.\n\n threshold_lambda : float, default=10 000\n Threshold for removing (pruning) weights with high precision from\n the computation.\n\n fit_intercept : bool, default=True\n Whether to calculate the intercept for this model. If set\n to false, no intercept will be used in calculations\n (i.e. data is expected to be centered).\n\n normalize : bool, default=False\n This parameter is ignored when ``fit_intercept`` is set to False.\n If True, the regressors X will be normalized before regression by\n subtracting the mean and dividing by the l2-norm.\n If you wish to standardize, please use\n :class:`~sklearn.preprocessing.StandardScaler` before calling ``fit``\n on an estimator with ``normalize=False``.\n\n .. deprecated:: 1.0\n ``normalize`` was deprecated in version 1.0 and will be removed in\n 1.2.\n\n copy_X : bool, default=True\n If True, X will be copied; else, it may be overwritten.\n\n verbose : bool, default=False\n Verbose mode when fitting the model.\n\n Attributes\n ----------\n coef_ : array-like of shape (n_features,)\n Coefficients of the regression model (mean of distribution)\n\n alpha_ : float\n estimated precision of the noise.\n\n lambda_ : array-like of shape (n_features,)\n estimated precisions of the weights.\n\n sigma_ : array-like of shape (n_features, n_features)\n estimated variance-covariance matrix of the weights\n\n scores_ : float\n if computed, value of the objective function (to be maximized)\n\n intercept_ : float\n Independent term in decision function. 
Set to 0.0 if\n ``fit_intercept = False``.\n\n X_offset_ : float\n If `normalize=True`, offset subtracted for centering data to a\n zero mean.\n\n X_scale_ : float\n If `normalize=True`, parameter used to scale data to a unit\n standard deviation.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n BayesianRidge : Bayesian ridge regression.\n\n Notes\n -----\n For an example, see :ref:`examples/linear_model/plot_ard.py\n `.\n\n References\n ----------\n D. J. C. MacKay, Bayesian nonlinear modeling for the prediction\n competition, ASHRAE Transactions, 1994.\n\n R. Salakhutdinov, Lecture notes on Statistical Machine Learning,\n http://www.utstat.toronto.edu/~rsalakhu/sta4273/notes/Lecture2.pdf#page=15\n Their beta is our ``self.alpha_``\n Their alpha is our ``self.lambda_``\n ARD is a little different than the slide: only dimensions/features for\n which ``self.lambda_ < self.threshold_lambda`` are kept and the rest are\n discarded.\n\n Examples\n --------\n >>> from sklearn import linear_model\n >>> clf = linear_model.ARDRegression()\n >>> clf.fit([[0,0], [1, 1], [2, 2]], [0, 1, 2])\n ARDRegression()\n >>> clf.predict([[1, 1]])\n array([1.])\n \"\"\"\n \n def __init__(self, *, n_iter=300, tol=0.001, alpha_1=1e-06, alpha_2=1e-06, lambda_1=1e-06, lambda_2=1e-06, compute_score=False, threshold_lambda=10000.0, fit_intercept=True, normalize='deprecated', copy_X=True, verbose=False):\n self.n_iter = n_iter\n self.tol = tol\n self.fit_intercept = fit_intercept\n self.normalize = normalize\n self.alpha_1 = alpha_1\n self.alpha_2 = alpha_2\n self.lambda_1 = lambda_1\n self.lambda_2 = lambda_2\n self.compute_score = compute_score\n self.threshold_lambda = threshold_lambda\n self.copy_X = copy_X\n self.verbose = verbose\n \n def fit(self, X, y):\n \"\"\"Fit the model according to the given training data and parameters.\n\n Iterative procedure to maximize the evidence\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training vector, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n y : array-like of shape (n_samples,)\n Target values (integers). 
Will be cast to X's dtype if necessary.\n\n Returns\n -------\n self : object\n Fitted estimator.\n \"\"\"\n self._normalize = _deprecate_normalize(self.normalize, default=False, estimator_name=self.__class__.__name__)\n (X, y) = self._validate_data(X, y, dtype=np.float64, y_numeric=True, ensure_min_samples=2)\n (n_samples, n_features) = X.shape\n coef_ = np.zeros(n_features)\n (X, y, X_offset_, y_offset_, X_scale_) = self._preprocess_data(X, y, self.fit_intercept, self._normalize, self.copy_X)\n self.X_offset_ = X_offset_\n self.X_scale_ = X_scale_\n keep_lambda = np.ones(n_features, dtype=bool)\n lambda_1 = self.lambda_1\n lambda_2 = self.lambda_2\n alpha_1 = self.alpha_1\n alpha_2 = self.alpha_2\n verbose = self.verbose\n eps = np.finfo(np.float64).eps\n alpha_ = 1.0 / (np.var(y) + eps)\n lambda_ = np.ones(n_features)\n self.scores_ = list()\n coef_old_ = None\n \n def update_coeff(X, y, coef_, alpha_, keep_lambda, sigma_):\n coef_[keep_lambda] = alpha_ * np.linalg.multi_dot([sigma_, X[:, keep_lambda].T, y])\n return coef_\n update_sigma = self._update_sigma if n_samples >= n_features else self._update_sigma_woodbury\n for iter_ in range(self.n_iter):\n sigma_ = update_sigma(X, alpha_, lambda_, keep_lambda)\n coef_ = update_coeff(X, y, coef_, alpha_, keep_lambda, sigma_)\n rmse_ = np.sum((y - np.dot(X, coef_))**2)\n gamma_ = 1.0 - lambda_[keep_lambda] * np.diag(sigma_)\n lambda_[keep_lambda] = (gamma_ + 2.0 * lambda_1) / (coef_[keep_lambda]**2 + 2.0 * lambda_2)\n alpha_ = (n_samples - gamma_.sum() + 2.0 * alpha_1) / (rmse_ + 2.0 * alpha_2)\n keep_lambda = lambda_ < self.threshold_lambda\n coef_[~keep_lambda] = 0\n if self.compute_score:\n s = (lambda_1 * np.log(lambda_) - lambda_2 * lambda_).sum()\n s += alpha_1 * log(alpha_) - alpha_2 * alpha_\n s += 0.5 * (fast_logdet(sigma_) + n_samples * log(alpha_) + np.sum(np.log(lambda_)))\n s -= 0.5 * (alpha_ * rmse_ + (lambda_ * coef_**2).sum())\n self.scores_.append(s)\n if iter_ > 0 and np.sum(np.abs(coef_old_ - coef_)) < self.tol:\n if verbose:\n print('Converged after %s iterations' % iter_)\n break\n coef_old_ = np.copy(coef_)\n if not keep_lambda.any():\n break\n if keep_lambda.any():\n sigma_ = update_sigma(X, alpha_, lambda_, keep_lambda)\n coef_ = update_coeff(X, y, coef_, alpha_, keep_lambda, sigma_)\n else:\n sigma_ = np.array([]).reshape(0, 0)\n self.coef_ = coef_\n self.alpha_ = alpha_\n self.sigma_ = sigma_\n self.lambda_ = lambda_\n self._set_intercept(X_offset_, y_offset_, X_scale_)\n return self\n \n def _update_sigma_woodbury(self, X, alpha_, lambda_, keep_lambda):\n n_samples = X.shape[0]\n X_keep = X[:, keep_lambda]\n inv_lambda = 1 / lambda_[keep_lambda].reshape(1, -1)\n sigma_ = pinvh(np.eye(n_samples) / alpha_ + np.dot(X_keep * inv_lambda, X_keep.T))\n sigma_ = np.dot(sigma_, X_keep * inv_lambda)\n sigma_ = -np.dot(inv_lambda.reshape(-1, 1) * X_keep.T, sigma_)\n sigma_[np.diag_indices(sigma_.shape[1])] += 1.0 / lambda_[keep_lambda]\n return sigma_\n \n def _update_sigma(self, X, alpha_, lambda_, keep_lambda):\n X_keep = X[:, keep_lambda]\n gram = np.dot(X_keep.T, X_keep)\n eye = np.eye(gram.shape[0])\n sigma_inv = lambda_[keep_lambda] * eye + alpha_ * gram\n sigma_ = pinvh(sigma_inv)\n return sigma_\n \n def predict(self, X, return_std=False):\n \"\"\"Predict using the linear model.\n\n In addition to the mean of the predictive distribution, also its\n standard deviation can be returned.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Samples.\n\n return_std : bool, 
default=False\n Whether to return the standard deviation of posterior prediction.\n\n Returns\n -------\n y_mean : array-like of shape (n_samples,)\n Mean of predictive distribution of query points.\n\n y_std : array-like of shape (n_samples,)\n Standard deviation of predictive distribution of query points.\n \"\"\"\n y_mean = self._decision_function(X)\n if return_std is False:\n return y_mean\n else:\n if self._normalize:\n X = (X - self.X_offset_) / self.X_scale_\n X = X[:, self.lambda_ < self.threshold_lambda]\n sigmas_squared_data = (np.dot(X, self.sigma_) * X).sum(axis=1)\n y_std = np.sqrt(sigmas_squared_data + 1.0 / self.alpha_)\n return y_mean, y_std\n" }, @@ -23392,7 +23459,7 @@ "sklearn.linear_model._bayes.BayesianRidge._log_marginal_likelihood" ], "is_public": true, - "description": "Bayesian ridge regression.\n\nFit a Bayesian ridge model. See the Notes section for details on this implementation and the optimization of the regularization parameters lambda (precision of the weights) and alpha (precision of the noise). Read more in the :ref:`User Guide `.", + "description": "Bayesian ridge regression.\n\nFit a Bayesian ridge model. See the Notes section for details on this\nimplementation and the optimization of the regularization parameters\nlambda (precision of the weights) and alpha (precision of the noise).\n\nRead more in the :ref:`User Guide `.", "docstring": "Bayesian ridge regression.\n\n Fit a Bayesian ridge model. See the Notes section for details on this\n implementation and the optimization of the regularization parameters\n lambda (precision of the weights) and alpha (precision of the noise).\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n n_iter : int, default=300\n Maximum number of iterations. Should be greater than or equal to 1.\n\n tol : float, default=1e-3\n Stop the algorithm if w has converged.\n\n alpha_1 : float, default=1e-6\n Hyper-parameter : shape parameter for the Gamma distribution prior\n over the alpha parameter.\n\n alpha_2 : float, default=1e-6\n Hyper-parameter : inverse scale parameter (rate parameter) for the\n Gamma distribution prior over the alpha parameter.\n\n lambda_1 : float, default=1e-6\n Hyper-parameter : shape parameter for the Gamma distribution prior\n over the lambda parameter.\n\n lambda_2 : float, default=1e-6\n Hyper-parameter : inverse scale parameter (rate parameter) for the\n Gamma distribution prior over the lambda parameter.\n\n alpha_init : float, default=None\n Initial value for alpha (precision of the noise).\n If not set, alpha_init is 1/Var(y).\n\n .. versionadded:: 0.22\n\n lambda_init : float, default=None\n Initial value for lambda (precision of the weights).\n If not set, lambda_init is 1.\n\n .. versionadded:: 0.22\n\n compute_score : bool, default=False\n If True, compute the log marginal likelihood at each iteration of the\n optimization.\n\n fit_intercept : bool, default=True\n Whether to calculate the intercept for this model.\n The intercept is not treated as a probabilistic parameter\n and thus has no associated variance. If set\n to False, no intercept will be used in calculations\n (i.e. 
data is expected to be centered).\n\n normalize : bool, default=False\n This parameter is ignored when ``fit_intercept`` is set to False.\n If True, the regressors X will be normalized before regression by\n subtracting the mean and dividing by the l2-norm.\n If you wish to standardize, please use\n :class:`~sklearn.preprocessing.StandardScaler` before calling ``fit``\n on an estimator with ``normalize=False``.\n\n .. deprecated:: 1.0\n ``normalize`` was deprecated in version 1.0 and will be removed in\n 1.2.\n\n copy_X : bool, default=True\n If True, X will be copied; else, it may be overwritten.\n\n verbose : bool, default=False\n Verbose mode when fitting the model.\n\n Attributes\n ----------\n coef_ : array-like of shape (n_features,)\n Coefficients of the regression model (mean of distribution)\n\n intercept_ : float\n Independent term in decision function. Set to 0.0 if\n ``fit_intercept = False``.\n\n alpha_ : float\n Estimated precision of the noise.\n\n lambda_ : float\n Estimated precision of the weights.\n\n sigma_ : array-like of shape (n_features, n_features)\n Estimated variance-covariance matrix of the weights\n\n scores_ : array-like of shape (n_iter_+1,)\n If computed_score is True, value of the log marginal likelihood (to be\n maximized) at each iteration of the optimization. The array starts\n with the value of the log marginal likelihood obtained for the initial\n values of alpha and lambda and ends with the value obtained for the\n estimated alpha and lambda.\n\n n_iter_ : int\n The actual number of iterations to reach the stopping criterion.\n\n X_offset_ : float\n If `normalize=True`, offset subtracted for centering data to a\n zero mean.\n\n X_scale_ : float\n If `normalize=True`, parameter used to scale data to a unit\n standard deviation.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n ARDRegression : Bayesian ARD regression.\n\n Notes\n -----\n There exist several strategies to perform Bayesian ridge regression. This\n implementation is based on the algorithm described in Appendix A of\n (Tipping, 2001) where updates of the regularization parameters are done as\n suggested in (MacKay, 1992). Note that according to A New\n View of Automatic Relevance Determination (Wipf and Nagarajan, 2008) these\n update rules do not guarantee that the marginal likelihood is increasing\n between two consecutive iterations of the optimization.\n\n References\n ----------\n D. J. C. MacKay, Bayesian Interpolation, Computation and Neural Systems,\n Vol. 4, No. 3, 1992.\n\n M. E. Tipping, Sparse Bayesian Learning and the Relevance Vector Machine,\n Journal of Machine Learning Research, Vol. 1, 2001.\n\n Examples\n --------\n >>> from sklearn import linear_model\n >>> clf = linear_model.BayesianRidge()\n >>> clf.fit([[0,0], [1, 1], [2, 2]], [0, 1, 2])\n BayesianRidge()\n >>> clf.predict([[1, 1]])\n array([1.])\n ", "source_code": "\n\nclass BayesianRidge(RegressorMixin, LinearModel):\n \"\"\"Bayesian ridge regression.\n\n Fit a Bayesian ridge model. 
See the Notes section for details on this\n implementation and the optimization of the regularization parameters\n lambda (precision of the weights) and alpha (precision of the noise).\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n n_iter : int, default=300\n Maximum number of iterations. Should be greater than or equal to 1.\n\n tol : float, default=1e-3\n Stop the algorithm if w has converged.\n\n alpha_1 : float, default=1e-6\n Hyper-parameter : shape parameter for the Gamma distribution prior\n over the alpha parameter.\n\n alpha_2 : float, default=1e-6\n Hyper-parameter : inverse scale parameter (rate parameter) for the\n Gamma distribution prior over the alpha parameter.\n\n lambda_1 : float, default=1e-6\n Hyper-parameter : shape parameter for the Gamma distribution prior\n over the lambda parameter.\n\n lambda_2 : float, default=1e-6\n Hyper-parameter : inverse scale parameter (rate parameter) for the\n Gamma distribution prior over the lambda parameter.\n\n alpha_init : float, default=None\n Initial value for alpha (precision of the noise).\n If not set, alpha_init is 1/Var(y).\n\n .. versionadded:: 0.22\n\n lambda_init : float, default=None\n Initial value for lambda (precision of the weights).\n If not set, lambda_init is 1.\n\n .. versionadded:: 0.22\n\n compute_score : bool, default=False\n If True, compute the log marginal likelihood at each iteration of the\n optimization.\n\n fit_intercept : bool, default=True\n Whether to calculate the intercept for this model.\n The intercept is not treated as a probabilistic parameter\n and thus has no associated variance. If set\n to False, no intercept will be used in calculations\n (i.e. data is expected to be centered).\n\n normalize : bool, default=False\n This parameter is ignored when ``fit_intercept`` is set to False.\n If True, the regressors X will be normalized before regression by\n subtracting the mean and dividing by the l2-norm.\n If you wish to standardize, please use\n :class:`~sklearn.preprocessing.StandardScaler` before calling ``fit``\n on an estimator with ``normalize=False``.\n\n .. deprecated:: 1.0\n ``normalize`` was deprecated in version 1.0 and will be removed in\n 1.2.\n\n copy_X : bool, default=True\n If True, X will be copied; else, it may be overwritten.\n\n verbose : bool, default=False\n Verbose mode when fitting the model.\n\n Attributes\n ----------\n coef_ : array-like of shape (n_features,)\n Coefficients of the regression model (mean of distribution)\n\n intercept_ : float\n Independent term in decision function. Set to 0.0 if\n ``fit_intercept = False``.\n\n alpha_ : float\n Estimated precision of the noise.\n\n lambda_ : float\n Estimated precision of the weights.\n\n sigma_ : array-like of shape (n_features, n_features)\n Estimated variance-covariance matrix of the weights\n\n scores_ : array-like of shape (n_iter_+1,)\n If computed_score is True, value of the log marginal likelihood (to be\n maximized) at each iteration of the optimization. 
The array starts\n with the value of the log marginal likelihood obtained for the initial\n values of alpha and lambda and ends with the value obtained for the\n estimated alpha and lambda.\n\n n_iter_ : int\n The actual number of iterations to reach the stopping criterion.\n\n X_offset_ : float\n If `normalize=True`, offset subtracted for centering data to a\n zero mean.\n\n X_scale_ : float\n If `normalize=True`, parameter used to scale data to a unit\n standard deviation.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n ARDRegression : Bayesian ARD regression.\n\n Notes\n -----\n There exist several strategies to perform Bayesian ridge regression. This\n implementation is based on the algorithm described in Appendix A of\n (Tipping, 2001) where updates of the regularization parameters are done as\n suggested in (MacKay, 1992). Note that according to A New\n View of Automatic Relevance Determination (Wipf and Nagarajan, 2008) these\n update rules do not guarantee that the marginal likelihood is increasing\n between two consecutive iterations of the optimization.\n\n References\n ----------\n D. J. C. MacKay, Bayesian Interpolation, Computation and Neural Systems,\n Vol. 4, No. 3, 1992.\n\n M. E. Tipping, Sparse Bayesian Learning and the Relevance Vector Machine,\n Journal of Machine Learning Research, Vol. 1, 2001.\n\n Examples\n --------\n >>> from sklearn import linear_model\n >>> clf = linear_model.BayesianRidge()\n >>> clf.fit([[0,0], [1, 1], [2, 2]], [0, 1, 2])\n BayesianRidge()\n >>> clf.predict([[1, 1]])\n array([1.])\n \"\"\"\n \n def __init__(self, *, n_iter=300, tol=0.001, alpha_1=1e-06, alpha_2=1e-06, lambda_1=1e-06, lambda_2=1e-06, alpha_init=None, lambda_init=None, compute_score=False, fit_intercept=True, normalize='deprecated', copy_X=True, verbose=False):\n self.n_iter = n_iter\n self.tol = tol\n self.alpha_1 = alpha_1\n self.alpha_2 = alpha_2\n self.lambda_1 = lambda_1\n self.lambda_2 = lambda_2\n self.alpha_init = alpha_init\n self.lambda_init = lambda_init\n self.compute_score = compute_score\n self.fit_intercept = fit_intercept\n self.normalize = normalize\n self.copy_X = copy_X\n self.verbose = verbose\n \n def fit(self, X, y, sample_weight=None):\n \"\"\"Fit the model.\n\n Parameters\n ----------\n X : ndarray of shape (n_samples, n_features)\n Training data.\n y : ndarray of shape (n_samples,)\n Target values. Will be cast to X's dtype if necessary.\n\n sample_weight : ndarray of shape (n_samples,), default=None\n Individual weights for each sample.\n\n .. versionadded:: 0.20\n parameter *sample_weight* support to BayesianRidge.\n\n Returns\n -------\n self : object\n Returns the instance itself.\n \"\"\"\n self._normalize = _deprecate_normalize(self.normalize, default=False, estimator_name=self.__class__.__name__)\n if self.n_iter < 1:\n raise ValueError('n_iter should be greater than or equal to 1. 
Got {!r}.'.format(self.n_iter))\n (X, y) = self._validate_data(X, y, dtype=np.float64, y_numeric=True)\n if sample_weight is not None:\n sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype)\n (X, y, X_offset_, y_offset_, X_scale_) = self._preprocess_data(X, y, self.fit_intercept, self._normalize, self.copy_X, sample_weight=sample_weight)\n if sample_weight is not None:\n (X, y) = _rescale_data(X, y, sample_weight)\n self.X_offset_ = X_offset_\n self.X_scale_ = X_scale_\n (n_samples, n_features) = X.shape\n eps = np.finfo(np.float64).eps\n alpha_ = self.alpha_init\n lambda_ = self.lambda_init\n if alpha_ is None:\n alpha_ = 1.0 / (np.var(y) + eps)\n if lambda_ is None:\n lambda_ = 1.0\n verbose = self.verbose\n lambda_1 = self.lambda_1\n lambda_2 = self.lambda_2\n alpha_1 = self.alpha_1\n alpha_2 = self.alpha_2\n self.scores_ = list()\n coef_old_ = None\n XT_y = np.dot(X.T, y)\n (U, S, Vh) = linalg.svd(X, full_matrices=False)\n eigen_vals_ = S**2\n for iter_ in range(self.n_iter):\n (coef_, rmse_) = self._update_coef_(X, y, n_samples, n_features, XT_y, U, Vh, eigen_vals_, alpha_, lambda_)\n if self.compute_score:\n s = self._log_marginal_likelihood(n_samples, n_features, eigen_vals_, alpha_, lambda_, coef_, rmse_)\n self.scores_.append(s)\n gamma_ = np.sum(alpha_ * eigen_vals_ / (lambda_ + alpha_ * eigen_vals_))\n lambda_ = (gamma_ + 2 * lambda_1) / (np.sum(coef_**2) + 2 * lambda_2)\n alpha_ = (n_samples - gamma_ + 2 * alpha_1) / (rmse_ + 2 * alpha_2)\n if iter_ != 0 and np.sum(np.abs(coef_old_ - coef_)) < self.tol:\n if verbose:\n print('Convergence after ', str(iter_), ' iterations')\n break\n coef_old_ = np.copy(coef_)\n self.n_iter_ = iter_ + 1\n self.alpha_ = alpha_\n self.lambda_ = lambda_\n (self.coef_, rmse_) = self._update_coef_(X, y, n_samples, n_features, XT_y, U, Vh, eigen_vals_, alpha_, lambda_)\n if self.compute_score:\n s = self._log_marginal_likelihood(n_samples, n_features, eigen_vals_, alpha_, lambda_, coef_, rmse_)\n self.scores_.append(s)\n self.scores_ = np.array(self.scores_)\n scaled_sigma_ = np.dot(Vh.T, Vh / (eigen_vals_ + lambda_ / alpha_)[:, np.newaxis])\n self.sigma_ = 1.0 / alpha_ * scaled_sigma_\n self._set_intercept(X_offset_, y_offset_, X_scale_)\n return self\n \n def predict(self, X, return_std=False):\n \"\"\"Predict using the linear model.\n\n In addition to the mean of the predictive distribution, also its\n standard deviation can be returned.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Samples.\n\n return_std : bool, default=False\n Whether to return the standard deviation of posterior prediction.\n\n Returns\n -------\n y_mean : array-like of shape (n_samples,)\n Mean of predictive distribution of query points.\n\n y_std : array-like of shape (n_samples,)\n Standard deviation of predictive distribution of query points.\n \"\"\"\n y_mean = self._decision_function(X)\n if return_std is False:\n return y_mean\n else:\n if self._normalize:\n X = (X - self.X_offset_) / self.X_scale_\n sigmas_squared_data = (np.dot(X, self.sigma_) * X).sum(axis=1)\n y_std = np.sqrt(sigmas_squared_data + 1.0 / self.alpha_)\n return y_mean, y_std\n \n def _update_coef_(self, X, y, n_samples, n_features, XT_y, U, Vh, eigen_vals_, alpha_, lambda_):\n \"\"\"Update posterior mean and compute corresponding rmse.\n\n Posterior mean is given by coef_ = scaled_sigma_ * X.T * y where\n scaled_sigma_ = (lambda_/alpha_ * np.eye(n_features)\n + np.dot(X.T, X))^-1\n \"\"\"\n if n_samples > n_features:\n coef_ = 
np.linalg.multi_dot([Vh.T, Vh / (eigen_vals_ + lambda_ / alpha_)[:, np.newaxis], XT_y])\n else:\n coef_ = np.linalg.multi_dot([X.T, U / (eigen_vals_ + lambda_ / alpha_)[None, :], U.T, y])\n rmse_ = np.sum((y - np.dot(X, coef_))**2)\n return coef_, rmse_\n \n def _log_marginal_likelihood(self, n_samples, n_features, eigen_vals, alpha_, lambda_, coef, rmse):\n \"\"\"Log marginal likelihood.\"\"\"\n alpha_1 = self.alpha_1\n alpha_2 = self.alpha_2\n lambda_1 = self.lambda_1\n lambda_2 = self.lambda_2\n if n_samples > n_features:\n logdet_sigma = -np.sum(np.log(lambda_ + alpha_ * eigen_vals))\n else:\n logdet_sigma = np.full(n_features, lambda_, dtype=np.array(lambda_).dtype)\n logdet_sigma[:n_samples] += alpha_ * eigen_vals\n logdet_sigma = -np.sum(np.log(logdet_sigma))\n score = lambda_1 * log(lambda_) - lambda_2 * lambda_\n score += alpha_1 * log(alpha_) - alpha_2 * alpha_\n score += 0.5 * (n_features * log(lambda_) + n_samples * log(alpha_) - alpha_ * rmse - lambda_ * np.sum(coef**2) + logdet_sigma - n_samples * log(2 * np.pi))\n return score\n" }, @@ -23412,7 +23479,7 @@ "sklearn.linear_model._coordinate_descent.ElasticNet._decision_function" ], "is_public": true, - "description": "Linear regression with combined L1 and L2 priors as regularizer.\n\nMinimizes the objective function:: 1 / (2 * n_samples) * ||y - Xw||^2_2 + alpha * l1_ratio * ||w||_1 + 0.5 * alpha * (1 - l1_ratio) * ||w||^2_2 If you are interested in controlling the L1 and L2 penalty separately, keep in mind that this is equivalent to:: a * ||w||_1 + 0.5 * b * ||w||_2^2 where:: alpha = a + b and l1_ratio = a / (a + b) The parameter l1_ratio corresponds to alpha in the glmnet R package while alpha corresponds to the lambda parameter in glmnet. Specifically, l1_ratio = 1 is the lasso penalty. Currently, l1_ratio <= 0.01 is not reliable, unless you supply your own sequence of alpha. Read more in the :ref:`User Guide `.", + "description": "Linear regression with combined L1 and L2 priors as regularizer.\n\nMinimizes the objective function::\n\n 1 / (2 * n_samples) * ||y - Xw||^2_2\n + alpha * l1_ratio * ||w||_1\n + 0.5 * alpha * (1 - l1_ratio) * ||w||^2_2\n\nIf you are interested in controlling the L1 and L2 penalty\nseparately, keep in mind that this is equivalent to::\n\n a * ||w||_1 + 0.5 * b * ||w||_2^2\n\nwhere::\n\n alpha = a + b and l1_ratio = a / (a + b)\n\nThe parameter l1_ratio corresponds to alpha in the glmnet R package while\nalpha corresponds to the lambda parameter in glmnet. Specifically, l1_ratio\n= 1 is the lasso penalty. Currently, l1_ratio <= 0.01 is not reliable,\nunless you supply your own sequence of alpha.\n\nRead more in the :ref:`User Guide `.", "docstring": "Linear regression with combined L1 and L2 priors as regularizer.\n\n Minimizes the objective function::\n\n 1 / (2 * n_samples) * ||y - Xw||^2_2\n + alpha * l1_ratio * ||w||_1\n + 0.5 * alpha * (1 - l1_ratio) * ||w||^2_2\n\n If you are interested in controlling the L1 and L2 penalty\n separately, keep in mind that this is equivalent to::\n\n a * ||w||_1 + 0.5 * b * ||w||_2^2\n\n where::\n\n alpha = a + b and l1_ratio = a / (a + b)\n\n The parameter l1_ratio corresponds to alpha in the glmnet R package while\n alpha corresponds to the lambda parameter in glmnet. Specifically, l1_ratio\n = 1 is the lasso penalty. Currently, l1_ratio <= 0.01 is not reliable,\n unless you supply your own sequence of alpha.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n alpha : float, default=1.0\n Constant that multiplies the penalty terms. 
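To make the `fit` loop of the BayesianRidge entry above easier to follow, the per-iteration update of the two precisions can be isolated into a standalone NumPy sketch; variable names mirror the source above, the data is made up, and centering/intercept handling is deliberately omitted::

    import numpy as np

    # Made-up data standing in for the preprocessed X, y inside fit.
    rng = np.random.RandomState(0)
    X = rng.randn(30, 3)
    y = X @ np.array([0.5, 0.0, -1.0]) + 0.05 * rng.randn(30)
    n_samples, n_features = X.shape

    alpha_ = 1.0 / (np.var(y) + np.finfo(np.float64).eps)  # default alpha_init
    lambda_ = 1.0                                           # default lambda_init
    alpha_1 = alpha_2 = lambda_1 = lambda_2 = 1e-6

    U, S, Vh = np.linalg.svd(X, full_matrices=False)
    eigen_vals_ = S ** 2
    XT_y = X.T @ y

    for _ in range(5):
        # Posterior mean of the weights (the n_samples > n_features branch above).
        coef_ = np.linalg.multi_dot(
            [Vh.T, Vh / (eigen_vals_ + lambda_ / alpha_)[:, np.newaxis], XT_y]
        )
        rmse_ = np.sum((y - X @ coef_) ** 2)
        # Effective number of well-determined parameters, then the
        # MacKay-style updates of the two precisions, as in fit above.
        gamma_ = np.sum(alpha_ * eigen_vals_ / (lambda_ + alpha_ * eigen_vals_))
        lambda_ = (gamma_ + 2 * lambda_1) / (np.sum(coef_ ** 2) + 2 * lambda_2)
        alpha_ = (n_samples - gamma_ + 2 * alpha_1) / (rmse_ + 2 * alpha_2)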
Defaults to 1.0.\n See the notes for the exact mathematical meaning of this\n parameter. ``alpha = 0`` is equivalent to an ordinary least square,\n solved by the :class:`LinearRegression` object. For numerical\n reasons, using ``alpha = 0`` with the ``Lasso`` object is not advised.\n Given this, you should use the :class:`LinearRegression` object.\n\n l1_ratio : float, default=0.5\n The ElasticNet mixing parameter, with ``0 <= l1_ratio <= 1``. For\n ``l1_ratio = 0`` the penalty is an L2 penalty. ``For l1_ratio = 1`` it\n is an L1 penalty. For ``0 < l1_ratio < 1``, the penalty is a\n combination of L1 and L2.\n\n fit_intercept : bool, default=True\n Whether the intercept should be estimated or not. If ``False``, the\n data is assumed to be already centered.\n\n normalize : bool, default=False\n This parameter is ignored when ``fit_intercept`` is set to False.\n If True, the regressors X will be normalized before regression by\n subtracting the mean and dividing by the l2-norm.\n If you wish to standardize, please use\n :class:`~sklearn.preprocessing.StandardScaler` before calling ``fit``\n on an estimator with ``normalize=False``.\n\n .. deprecated:: 1.0\n ``normalize`` was deprecated in version 1.0 and will be removed in\n 1.2.\n\n precompute : bool or array-like of shape (n_features, n_features), default=False\n Whether to use a precomputed Gram matrix to speed up\n calculations. The Gram matrix can also be passed as argument.\n For sparse input this option is always ``False`` to preserve sparsity.\n\n max_iter : int, default=1000\n The maximum number of iterations.\n\n copy_X : bool, default=True\n If ``True``, X will be copied; else, it may be overwritten.\n\n tol : float, default=1e-4\n The tolerance for the optimization: if the updates are\n smaller than ``tol``, the optimization code checks the\n dual gap for optimality and continues until it is smaller\n than ``tol``.\n\n warm_start : bool, default=False\n When set to ``True``, reuse the solution of the previous call to fit as\n initialization, otherwise, just erase the previous solution.\n See :term:`the Glossary `.\n\n positive : bool, default=False\n When set to ``True``, forces the coefficients to be positive.\n\n random_state : int, RandomState instance, default=None\n The seed of the pseudo random number generator that selects a random\n feature to update. Used when ``selection`` == 'random'.\n Pass an int for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n selection : {'cyclic', 'random'}, default='cyclic'\n If set to 'random', a random coefficient is updated every iteration\n rather than looping over features sequentially by default. This\n (setting to 'random') often leads to significantly faster convergence\n especially when tol is higher than 1e-4.\n\n Attributes\n ----------\n coef_ : ndarray of shape (n_features,) or (n_targets, n_features)\n Parameter vector (w in the cost function formula).\n\n sparse_coef_ : sparse matrix of shape (n_features,) or (n_targets, n_features)\n Sparse representation of the `coef_`.\n\n intercept_ : float or ndarray of shape (n_targets,)\n Independent term in decision function.\n\n n_iter_ : list of int\n Number of iterations run by the coordinate descent solver to reach\n the specified tolerance.\n\n dual_gap_ : float or ndarray of shape (n_targets,)\n Given param alpha, the dual gaps at the end of the optimization,\n same shape as each observation of y.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. 
versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n ElasticNetCV : Elastic net model with best model selection by\n cross-validation.\n SGDRegressor : Implements elastic net regression with incremental training.\n SGDClassifier : Implements logistic regression with elastic net penalty\n (``SGDClassifier(loss=\"log\", penalty=\"elasticnet\")``).\n\n Notes\n -----\n To avoid unnecessary memory duplication the X argument of the fit method\n should be directly passed as a Fortran-contiguous numpy array.\n\n Examples\n --------\n >>> from sklearn.linear_model import ElasticNet\n >>> from sklearn.datasets import make_regression\n\n >>> X, y = make_regression(n_features=2, random_state=0)\n >>> regr = ElasticNet(random_state=0)\n >>> regr.fit(X, y)\n ElasticNet(random_state=0)\n >>> print(regr.coef_)\n [18.83816048 64.55968825]\n >>> print(regr.intercept_)\n 1.451...\n >>> print(regr.predict([[0, 0]]))\n [1.451...]\n ", "source_code": "\n\nclass ElasticNet(MultiOutputMixin, RegressorMixin, LinearModel):\n \"\"\"Linear regression with combined L1 and L2 priors as regularizer.\n\n Minimizes the objective function::\n\n 1 / (2 * n_samples) * ||y - Xw||^2_2\n + alpha * l1_ratio * ||w||_1\n + 0.5 * alpha * (1 - l1_ratio) * ||w||^2_2\n\n If you are interested in controlling the L1 and L2 penalty\n separately, keep in mind that this is equivalent to::\n\n a * ||w||_1 + 0.5 * b * ||w||_2^2\n\n where::\n\n alpha = a + b and l1_ratio = a / (a + b)\n\n The parameter l1_ratio corresponds to alpha in the glmnet R package while\n alpha corresponds to the lambda parameter in glmnet. Specifically, l1_ratio\n = 1 is the lasso penalty. Currently, l1_ratio <= 0.01 is not reliable,\n unless you supply your own sequence of alpha.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n alpha : float, default=1.0\n Constant that multiplies the penalty terms. Defaults to 1.0.\n See the notes for the exact mathematical meaning of this\n parameter. ``alpha = 0`` is equivalent to an ordinary least square,\n solved by the :class:`LinearRegression` object. For numerical\n reasons, using ``alpha = 0`` with the ``Lasso`` object is not advised.\n Given this, you should use the :class:`LinearRegression` object.\n\n l1_ratio : float, default=0.5\n The ElasticNet mixing parameter, with ``0 <= l1_ratio <= 1``. For\n ``l1_ratio = 0`` the penalty is an L2 penalty. ``For l1_ratio = 1`` it\n is an L1 penalty. For ``0 < l1_ratio < 1``, the penalty is a\n combination of L1 and L2.\n\n fit_intercept : bool, default=True\n Whether the intercept should be estimated or not. If ``False``, the\n data is assumed to be already centered.\n\n normalize : bool, default=False\n This parameter is ignored when ``fit_intercept`` is set to False.\n If True, the regressors X will be normalized before regression by\n subtracting the mean and dividing by the l2-norm.\n If you wish to standardize, please use\n :class:`~sklearn.preprocessing.StandardScaler` before calling ``fit``\n on an estimator with ``normalize=False``.\n\n .. deprecated:: 1.0\n ``normalize`` was deprecated in version 1.0 and will be removed in\n 1.2.\n\n precompute : bool or array-like of shape (n_features, n_features), default=False\n Whether to use a precomputed Gram matrix to speed up\n calculations. 
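As a side note on the ``alpha``/``l1_ratio`` parameterization stated in the ElasticNet entry above: if one thinks in terms of separate L1 and L2 penalty weights ``a`` and ``b``, the documented identities ``alpha = a + b`` and ``l1_ratio = a / (a + b)`` translate directly into constructor arguments. A small hypothetical helper (the function name is illustrative, not part of scikit-learn)::

    from sklearn.linear_model import ElasticNet

    def elastic_net_from_l1_l2(a, b, **kwargs):
        """Build an ElasticNet from separate L1 (a) and L2 (b) penalty weights.

        Relies on the equivalence documented above:
        a * ||w||_1 + 0.5 * b * ||w||_2^2  corresponds to
        alpha = a + b and l1_ratio = a / (a + b).
        """
        return ElasticNet(alpha=a + b, l1_ratio=a / (a + b), **kwargs)

    # An L1 weight of 0.3 and an L2 weight of 0.1 ...
    enet = elastic_net_from_l1_l2(0.3, 0.1, random_state=0)
    # ... is the same model as ElasticNet(alpha=0.4, l1_ratio=0.75, random_state=0).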
The Gram matrix can also be passed as argument.\n For sparse input this option is always ``False`` to preserve sparsity.\n\n max_iter : int, default=1000\n The maximum number of iterations.\n\n copy_X : bool, default=True\n If ``True``, X will be copied; else, it may be overwritten.\n\n tol : float, default=1e-4\n The tolerance for the optimization: if the updates are\n smaller than ``tol``, the optimization code checks the\n dual gap for optimality and continues until it is smaller\n than ``tol``.\n\n warm_start : bool, default=False\n When set to ``True``, reuse the solution of the previous call to fit as\n initialization, otherwise, just erase the previous solution.\n See :term:`the Glossary `.\n\n positive : bool, default=False\n When set to ``True``, forces the coefficients to be positive.\n\n random_state : int, RandomState instance, default=None\n The seed of the pseudo random number generator that selects a random\n feature to update. Used when ``selection`` == 'random'.\n Pass an int for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n selection : {'cyclic', 'random'}, default='cyclic'\n If set to 'random', a random coefficient is updated every iteration\n rather than looping over features sequentially by default. This\n (setting to 'random') often leads to significantly faster convergence\n especially when tol is higher than 1e-4.\n\n Attributes\n ----------\n coef_ : ndarray of shape (n_features,) or (n_targets, n_features)\n Parameter vector (w in the cost function formula).\n\n sparse_coef_ : sparse matrix of shape (n_features,) or (n_targets, n_features)\n Sparse representation of the `coef_`.\n\n intercept_ : float or ndarray of shape (n_targets,)\n Independent term in decision function.\n\n n_iter_ : list of int\n Number of iterations run by the coordinate descent solver to reach\n the specified tolerance.\n\n dual_gap_ : float or ndarray of shape (n_targets,)\n Given param alpha, the dual gaps at the end of the optimization,\n same shape as each observation of y.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. 
versionadded:: 1.0\n\n See Also\n --------\n ElasticNetCV : Elastic net model with best model selection by\n cross-validation.\n SGDRegressor : Implements elastic net regression with incremental training.\n SGDClassifier : Implements logistic regression with elastic net penalty\n (``SGDClassifier(loss=\"log\", penalty=\"elasticnet\")``).\n\n Notes\n -----\n To avoid unnecessary memory duplication the X argument of the fit method\n should be directly passed as a Fortran-contiguous numpy array.\n\n Examples\n --------\n >>> from sklearn.linear_model import ElasticNet\n >>> from sklearn.datasets import make_regression\n\n >>> X, y = make_regression(n_features=2, random_state=0)\n >>> regr = ElasticNet(random_state=0)\n >>> regr.fit(X, y)\n ElasticNet(random_state=0)\n >>> print(regr.coef_)\n [18.83816048 64.55968825]\n >>> print(regr.intercept_)\n 1.451...\n >>> print(regr.predict([[0, 0]]))\n [1.451...]\n \"\"\"\n path = staticmethod(enet_path)\n \n def __init__(self, alpha=1.0, *, l1_ratio=0.5, fit_intercept=True, normalize='deprecated', precompute=False, max_iter=1000, copy_X=True, tol=0.0001, warm_start=False, positive=False, random_state=None, selection='cyclic'):\n self.alpha = alpha\n self.l1_ratio = l1_ratio\n self.fit_intercept = fit_intercept\n self.normalize = normalize\n self.precompute = precompute\n self.max_iter = max_iter\n self.copy_X = copy_X\n self.tol = tol\n self.warm_start = warm_start\n self.positive = positive\n self.random_state = random_state\n self.selection = selection\n \n def fit(self, X, y, sample_weight=None, check_input=True):\n \"\"\"Fit model with coordinate descent.\n\n Parameters\n ----------\n X : {ndarray, sparse matrix} of (n_samples, n_features)\n Data.\n\n y : {ndarray, sparse matrix} of shape (n_samples,) or (n_samples, n_targets)\n Target. Will be cast to X's dtype if necessary.\n\n sample_weight : float or array-like of shape (n_samples,), default=None\n Sample weights. Internally, the `sample_weight` vector will be\n rescaled to sum to `n_samples`.\n\n .. versionadded:: 0.23\n\n check_input : bool, default=True\n Allow to bypass several input checking.\n Don't use this parameter unless you know what you do.\n\n Returns\n -------\n self : object\n Fitted estimator.\n\n Notes\n -----\n Coordinate descent is an algorithm that considers each column of\n data at a time hence it will automatically convert the X input\n as a Fortran-contiguous numpy array if necessary.\n\n To avoid memory re-allocation it is advised to allocate the\n initial data in memory directly using that format.\n \"\"\"\n _normalize = _deprecate_normalize(self.normalize, default=False, estimator_name=self.__class__.__name__)\n if self.alpha == 0:\n warnings.warn('With alpha=0, this algorithm does not converge well. You are advised to use the LinearRegression estimator', stacklevel=2)\n if isinstance(self.precompute, str):\n raise ValueError('precompute should be one of True, False or array-like. 
Got %r' % self.precompute)\n if not isinstance(self.l1_ratio, numbers.Number) or self.l1_ratio < 0 or self.l1_ratio > 1:\n raise ValueError(f'l1_ratio must be between 0 and 1; got l1_ratio={self.l1_ratio}')\n X_copied = False\n if check_input:\n X_copied = self.copy_X and self.fit_intercept\n (X, y) = self._validate_data(X, y, accept_sparse='csc', order='F', dtype=[np.float64, np.float32], copy=X_copied, multi_output=True, y_numeric=True)\n y = check_array(y, order='F', copy=False, dtype=X.dtype.type, ensure_2d=False)\n (n_samples, n_features) = X.shape\n alpha = self.alpha\n if isinstance(sample_weight, numbers.Number):\n sample_weight = None\n if sample_weight is not None:\n if check_input:\n if sparse.issparse(X):\n raise ValueError('Sample weights do not (yet) support sparse matrices.')\n sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype)\n sample_weight = sample_weight * (n_samples / np.sum(sample_weight))\n should_copy = self.copy_X and not X_copied\n (X, y, X_offset, y_offset, X_scale, precompute, Xy) = _pre_fit(X, y, None, self.precompute, _normalize, self.fit_intercept, copy=should_copy, check_input=check_input, sample_weight=sample_weight)\n if check_input or sample_weight is not None:\n (X, y) = _set_order(X, y, order='F')\n if y.ndim == 1:\n y = y[:, np.newaxis]\n if Xy is not None and Xy.ndim == 1:\n Xy = Xy[:, np.newaxis]\n n_targets = y.shape[1]\n if self.selection not in ['cyclic', 'random']:\n raise ValueError('selection should be either random or cyclic.')\n if not self.warm_start or not hasattr(self, 'coef_'):\n coef_ = np.zeros((n_targets, n_features), dtype=X.dtype, order='F')\n else:\n coef_ = self.coef_\n if coef_.ndim == 1:\n coef_ = coef_[np.newaxis, :]\n dual_gaps_ = np.zeros(n_targets, dtype=X.dtype)\n self.n_iter_ = []\n for k in range(n_targets):\n if Xy is not None:\n this_Xy = Xy[:, k]\n else:\n this_Xy = None\n (_, this_coef, this_dual_gap, this_iter) = self.path(X, y[:, k], l1_ratio=self.l1_ratio, eps=None, n_alphas=None, alphas=[alpha], precompute=precompute, Xy=this_Xy, copy_X=True, verbose=False, tol=self.tol, positive=self.positive, X_offset=X_offset, X_scale=X_scale, return_n_iter=True, coef_init=coef_[k], max_iter=self.max_iter, random_state=self.random_state, selection=self.selection, check_input=False)\n coef_[k] = this_coef[:, 0]\n dual_gaps_[k] = this_dual_gap[0]\n self.n_iter_.append(this_iter[0])\n if n_targets == 1:\n self.n_iter_ = self.n_iter_[0]\n self.coef_ = coef_[0]\n self.dual_gap_ = dual_gaps_[0]\n else:\n self.coef_ = coef_\n self.dual_gap_ = dual_gaps_\n self._set_intercept(X_offset, y_offset, X_scale)\n self.coef_ = np.asarray(self.coef_, dtype=X.dtype)\n return self\n \n @property\n def sparse_coef_(self):\n \"\"\"Sparse representation of the fitted `coef_`.\"\"\"\n return sparse.csr_matrix(self.coef_)\n \n def _decision_function(self, X):\n \"\"\"Decision function of the linear model.\n\n Parameters\n ----------\n X : numpy array or scipy.sparse matrix of shape (n_samples, n_features)\n\n Returns\n -------\n T : ndarray of shape (n_samples,)\n The predicted decision function.\n \"\"\"\n check_is_fitted(self)\n if sparse.isspmatrix(X):\n return safe_sparse_dot(X, self.coef_.T, dense_output=True) + self.intercept_\n else:\n return super()._decision_function(X)\n" }, @@ -23428,7 +23495,7 @@ "sklearn.linear_model._coordinate_descent.ElasticNetCV._more_tags" ], "is_public": true, - "description": "Elastic Net model with iterative fitting along a regularization path.\n\nSee glossary entry for :term:`cross-validation 
estimator`. Read more in the :ref:`User Guide `.", + "description": "Elastic Net model with iterative fitting along a regularization path.\n\nSee glossary entry for :term:`cross-validation estimator`.\n\nRead more in the :ref:`User Guide `.", "docstring": "Elastic Net model with iterative fitting along a regularization path.\n\n See glossary entry for :term:`cross-validation estimator`.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n l1_ratio : float or list of float, default=0.5\n Float between 0 and 1 passed to ElasticNet (scaling between\n l1 and l2 penalties). For ``l1_ratio = 0``\n the penalty is an L2 penalty. For ``l1_ratio = 1`` it is an L1 penalty.\n For ``0 < l1_ratio < 1``, the penalty is a combination of L1 and L2\n This parameter can be a list, in which case the different\n values are tested by cross-validation and the one giving the best\n prediction score is used. Note that a good choice of list of\n values for l1_ratio is often to put more values close to 1\n (i.e. Lasso) and less close to 0 (i.e. Ridge), as in ``[.1, .5, .7,\n .9, .95, .99, 1]``.\n\n eps : float, default=1e-3\n Length of the path. ``eps=1e-3`` means that\n ``alpha_min / alpha_max = 1e-3``.\n\n n_alphas : int, default=100\n Number of alphas along the regularization path, used for each l1_ratio.\n\n alphas : ndarray, default=None\n List of alphas where to compute the models.\n If None alphas are set automatically.\n\n fit_intercept : bool, default=True\n Whether to calculate the intercept for this model. If set\n to false, no intercept will be used in calculations\n (i.e. data is expected to be centered).\n\n normalize : bool, default=False\n This parameter is ignored when ``fit_intercept`` is set to False.\n If True, the regressors X will be normalized before regression by\n subtracting the mean and dividing by the l2-norm.\n If you wish to standardize, please use\n :class:`~sklearn.preprocessing.StandardScaler` before calling ``fit``\n on an estimator with ``normalize=False``.\n\n .. deprecated:: 1.0\n ``normalize`` was deprecated in version 1.0 and will be removed in\n 1.2.\n\n precompute : 'auto', bool or array-like of shape (n_features, n_features), default='auto'\n Whether to use a precomputed Gram matrix to speed up\n calculations. If set to ``'auto'`` let us decide. The Gram\n matrix can also be passed as argument.\n\n max_iter : int, default=1000\n The maximum number of iterations.\n\n tol : float, default=1e-4\n The tolerance for the optimization: if the updates are\n smaller than ``tol``, the optimization code checks the\n dual gap for optimality and continues until it is smaller\n than ``tol``.\n\n cv : int, cross-validation generator or iterable, default=None\n Determines the cross-validation splitting strategy.\n Possible inputs for cv are:\n\n - None, to use the default 5-fold cross-validation,\n - int, to specify the number of folds.\n - :term:`CV splitter`,\n - An iterable yielding (train, test) splits as arrays of indices.\n\n For int/None inputs, :class:`KFold` is used.\n\n Refer :ref:`User Guide ` for the various\n cross-validation strategies that can be used here.\n\n .. 
versionchanged:: 0.22\n ``cv`` default value if None changed from 3-fold to 5-fold.\n\n copy_X : bool, default=True\n If ``True``, X will be copied; else, it may be overwritten.\n\n verbose : bool or int, default=0\n Amount of verbosity.\n\n n_jobs : int, default=None\n Number of CPUs to use during the cross validation.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n positive : bool, default=False\n When set to ``True``, forces the coefficients to be positive.\n\n random_state : int, RandomState instance, default=None\n The seed of the pseudo random number generator that selects a random\n feature to update. Used when ``selection`` == 'random'.\n Pass an int for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n selection : {'cyclic', 'random'}, default='cyclic'\n If set to 'random', a random coefficient is updated every iteration\n rather than looping over features sequentially by default. This\n (setting to 'random') often leads to significantly faster convergence\n especially when tol is higher than 1e-4.\n\n Attributes\n ----------\n alpha_ : float\n The amount of penalization chosen by cross validation.\n\n l1_ratio_ : float\n The compromise between l1 and l2 penalization chosen by\n cross validation.\n\n coef_ : ndarray of shape (n_features,) or (n_targets, n_features)\n Parameter vector (w in the cost function formula).\n\n intercept_ : float or ndarray of shape (n_targets, n_features)\n Independent term in the decision function.\n\n mse_path_ : ndarray of shape (n_l1_ratio, n_alpha, n_folds)\n Mean square error for the test set on each fold, varying l1_ratio and\n alpha.\n\n alphas_ : ndarray of shape (n_alphas,) or (n_l1_ratio, n_alphas)\n The grid of alphas used for fitting, for each l1_ratio.\n\n dual_gap_ : float\n The dual gaps at the end of the optimization for the optimal alpha.\n\n n_iter_ : int\n Number of iterations run by the coordinate descent solver to reach\n the specified tolerance for the optimal alpha.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. 
versionadded:: 1.0\n\n See Also\n --------\n enet_path : Compute elastic net path with coordinate descent.\n ElasticNet : Linear regression with combined L1 and L2 priors as regularizer.\n\n Notes\n -----\n For an example, see\n :ref:`examples/linear_model/plot_lasso_model_selection.py\n `.\n\n To avoid unnecessary memory duplication the X argument of the fit method\n should be directly passed as a Fortran-contiguous numpy array.\n\n The parameter l1_ratio corresponds to alpha in the glmnet R package\n while alpha corresponds to the lambda parameter in glmnet.\n More specifically, the optimization objective is::\n\n 1 / (2 * n_samples) * ||y - Xw||^2_2\n + alpha * l1_ratio * ||w||_1\n + 0.5 * alpha * (1 - l1_ratio) * ||w||^2_2\n\n If you are interested in controlling the L1 and L2 penalty\n separately, keep in mind that this is equivalent to::\n\n a * L1 + b * L2\n\n for::\n\n alpha = a + b and l1_ratio = a / (a + b).\n\n Examples\n --------\n >>> from sklearn.linear_model import ElasticNetCV\n >>> from sklearn.datasets import make_regression\n\n >>> X, y = make_regression(n_features=2, random_state=0)\n >>> regr = ElasticNetCV(cv=5, random_state=0)\n >>> regr.fit(X, y)\n ElasticNetCV(cv=5, random_state=0)\n >>> print(regr.alpha_)\n 0.199...\n >>> print(regr.intercept_)\n 0.398...\n >>> print(regr.predict([[0, 0]]))\n [0.398...]\n ", "source_code": "\n\nclass ElasticNetCV(RegressorMixin, LinearModelCV):\n \"\"\"Elastic Net model with iterative fitting along a regularization path.\n\n See glossary entry for :term:`cross-validation estimator`.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n l1_ratio : float or list of float, default=0.5\n Float between 0 and 1 passed to ElasticNet (scaling between\n l1 and l2 penalties). For ``l1_ratio = 0``\n the penalty is an L2 penalty. For ``l1_ratio = 1`` it is an L1 penalty.\n For ``0 < l1_ratio < 1``, the penalty is a combination of L1 and L2\n This parameter can be a list, in which case the different\n values are tested by cross-validation and the one giving the best\n prediction score is used. Note that a good choice of list of\n values for l1_ratio is often to put more values close to 1\n (i.e. Lasso) and less close to 0 (i.e. Ridge), as in ``[.1, .5, .7,\n .9, .95, .99, 1]``.\n\n eps : float, default=1e-3\n Length of the path. ``eps=1e-3`` means that\n ``alpha_min / alpha_max = 1e-3``.\n\n n_alphas : int, default=100\n Number of alphas along the regularization path, used for each l1_ratio.\n\n alphas : ndarray, default=None\n List of alphas where to compute the models.\n If None alphas are set automatically.\n\n fit_intercept : bool, default=True\n Whether to calculate the intercept for this model. If set\n to false, no intercept will be used in calculations\n (i.e. data is expected to be centered).\n\n normalize : bool, default=False\n This parameter is ignored when ``fit_intercept`` is set to False.\n If True, the regressors X will be normalized before regression by\n subtracting the mean and dividing by the l2-norm.\n If you wish to standardize, please use\n :class:`~sklearn.preprocessing.StandardScaler` before calling ``fit``\n on an estimator with ``normalize=False``.\n\n .. deprecated:: 1.0\n ``normalize`` was deprecated in version 1.0 and will be removed in\n 1.2.\n\n precompute : 'auto', bool or array-like of shape (n_features, n_features), default='auto'\n Whether to use a precomputed Gram matrix to speed up\n calculations. If set to ``'auto'`` let us decide. 
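The Notes in this entry advise passing ``X`` as a Fortran-contiguous array so the coordinate descent solver does not have to copy it into that layout; a minimal sketch of what that looks like in practice, with made-up data::

    import numpy as np
    from sklearn.linear_model import ElasticNet

    rng = np.random.RandomState(0)
    # Column-major storage, as the Notes above advise for coordinate descent.
    X = np.asfortranarray(rng.randn(1000, 20))
    y = rng.randn(1000)

    # The solver iterates over columns of X; Fortran order avoids an internal copy.
    model = ElasticNet(alpha=0.1).fit(X, y)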
The Gram\n matrix can also be passed as argument.\n\n max_iter : int, default=1000\n The maximum number of iterations.\n\n tol : float, default=1e-4\n The tolerance for the optimization: if the updates are\n smaller than ``tol``, the optimization code checks the\n dual gap for optimality and continues until it is smaller\n than ``tol``.\n\n cv : int, cross-validation generator or iterable, default=None\n Determines the cross-validation splitting strategy.\n Possible inputs for cv are:\n\n - None, to use the default 5-fold cross-validation,\n - int, to specify the number of folds.\n - :term:`CV splitter`,\n - An iterable yielding (train, test) splits as arrays of indices.\n\n For int/None inputs, :class:`KFold` is used.\n\n Refer :ref:`User Guide ` for the various\n cross-validation strategies that can be used here.\n\n .. versionchanged:: 0.22\n ``cv`` default value if None changed from 3-fold to 5-fold.\n\n copy_X : bool, default=True\n If ``True``, X will be copied; else, it may be overwritten.\n\n verbose : bool or int, default=0\n Amount of verbosity.\n\n n_jobs : int, default=None\n Number of CPUs to use during the cross validation.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n positive : bool, default=False\n When set to ``True``, forces the coefficients to be positive.\n\n random_state : int, RandomState instance, default=None\n The seed of the pseudo random number generator that selects a random\n feature to update. Used when ``selection`` == 'random'.\n Pass an int for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n selection : {'cyclic', 'random'}, default='cyclic'\n If set to 'random', a random coefficient is updated every iteration\n rather than looping over features sequentially by default. This\n (setting to 'random') often leads to significantly faster convergence\n especially when tol is higher than 1e-4.\n\n Attributes\n ----------\n alpha_ : float\n The amount of penalization chosen by cross validation.\n\n l1_ratio_ : float\n The compromise between l1 and l2 penalization chosen by\n cross validation.\n\n coef_ : ndarray of shape (n_features,) or (n_targets, n_features)\n Parameter vector (w in the cost function formula).\n\n intercept_ : float or ndarray of shape (n_targets, n_features)\n Independent term in the decision function.\n\n mse_path_ : ndarray of shape (n_l1_ratio, n_alpha, n_folds)\n Mean square error for the test set on each fold, varying l1_ratio and\n alpha.\n\n alphas_ : ndarray of shape (n_alphas,) or (n_l1_ratio, n_alphas)\n The grid of alphas used for fitting, for each l1_ratio.\n\n dual_gap_ : float\n The dual gaps at the end of the optimization for the optimal alpha.\n\n n_iter_ : int\n Number of iterations run by the coordinate descent solver to reach\n the specified tolerance for the optimal alpha.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. 
versionadded:: 1.0\n\n See Also\n --------\n enet_path : Compute elastic net path with coordinate descent.\n ElasticNet : Linear regression with combined L1 and L2 priors as regularizer.\n\n Notes\n -----\n For an example, see\n :ref:`examples/linear_model/plot_lasso_model_selection.py\n `.\n\n To avoid unnecessary memory duplication the X argument of the fit method\n should be directly passed as a Fortran-contiguous numpy array.\n\n The parameter l1_ratio corresponds to alpha in the glmnet R package\n while alpha corresponds to the lambda parameter in glmnet.\n More specifically, the optimization objective is::\n\n 1 / (2 * n_samples) * ||y - Xw||^2_2\n + alpha * l1_ratio * ||w||_1\n + 0.5 * alpha * (1 - l1_ratio) * ||w||^2_2\n\n If you are interested in controlling the L1 and L2 penalty\n separately, keep in mind that this is equivalent to::\n\n a * L1 + b * L2\n\n for::\n\n alpha = a + b and l1_ratio = a / (a + b).\n\n Examples\n --------\n >>> from sklearn.linear_model import ElasticNetCV\n >>> from sklearn.datasets import make_regression\n\n >>> X, y = make_regression(n_features=2, random_state=0)\n >>> regr = ElasticNetCV(cv=5, random_state=0)\n >>> regr.fit(X, y)\n ElasticNetCV(cv=5, random_state=0)\n >>> print(regr.alpha_)\n 0.199...\n >>> print(regr.intercept_)\n 0.398...\n >>> print(regr.predict([[0, 0]]))\n [0.398...]\n \"\"\"\n path = staticmethod(enet_path)\n \n def __init__(self, *, l1_ratio=0.5, eps=0.001, n_alphas=100, alphas=None, fit_intercept=True, normalize='deprecated', precompute='auto', max_iter=1000, tol=0.0001, cv=None, copy_X=True, verbose=0, n_jobs=None, positive=False, random_state=None, selection='cyclic'):\n self.l1_ratio = l1_ratio\n self.eps = eps\n self.n_alphas = n_alphas\n self.alphas = alphas\n self.fit_intercept = fit_intercept\n self.normalize = normalize\n self.precompute = precompute\n self.max_iter = max_iter\n self.tol = tol\n self.cv = cv\n self.copy_X = copy_X\n self.verbose = verbose\n self.n_jobs = n_jobs\n self.positive = positive\n self.random_state = random_state\n self.selection = selection\n \n def _get_estimator(self):\n return ElasticNet()\n \n def _is_multitask(self):\n return False\n \n def _more_tags(self):\n return {'multioutput': False}\n" }, @@ -23441,9 +23508,9 @@ "sklearn.linear_model._coordinate_descent.Lasso.__init__" ], "is_public": true, - "description": "Linear Model trained with L1 prior as regularizer (aka the Lasso).\n\nThe optimization objective for Lasso is:: (1 / (2 * n_samples)) * ||y - Xw||^2_2 + alpha * ||w||_1 Technically the Lasso model is optimizing the same objective function as the Elastic Net with ``l1_ratio=1.0`` (no L2 penalty). Read more in the :ref:`User Guide `.", - "docstring": "Linear Model trained with L1 prior as regularizer (aka the Lasso).\n\n The optimization objective for Lasso is::\n\n (1 / (2 * n_samples)) * ||y - Xw||^2_2 + alpha * ||w||_1\n\n Technically the Lasso model is optimizing the same objective function as\n the Elastic Net with ``l1_ratio=1.0`` (no L2 penalty).\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n alpha : float, default=1.0\n Constant that multiplies the L1 term. Defaults to 1.0.\n ``alpha = 0`` is equivalent to an ordinary least square, solved\n by the :class:`LinearRegression` object. For numerical\n reasons, using ``alpha = 0`` with the ``Lasso`` object is not advised.\n Given this, you should use the :class:`LinearRegression` object.\n\n fit_intercept : bool, default=True\n Whether to calculate the intercept for this model. 
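Tying the ElasticNetCV entry above together: ``l1_ratio`` may be given as a list (the docstring suggests weighting values toward 1), and the selected ``alpha_`` and ``l1_ratio_`` can then be read off the fitted object. An illustrative sketch, assuming only what that docstring states::

    from sklearn.datasets import make_regression
    from sklearn.linear_model import ElasticNetCV

    X, y = make_regression(n_samples=200, n_features=10, noise=5.0, random_state=0)

    # List of mixing parameters, skewed toward the Lasso end as suggested above.
    cv_model = ElasticNetCV(
        l1_ratio=[.1, .5, .7, .9, .95, .99, 1],
        n_alphas=50,
        cv=5,
        random_state=0,
    )
    cv_model.fit(X, y)

    print(cv_model.l1_ratio_)        # mixing parameter chosen by cross-validation
    print(cv_model.alpha_)           # penalty strength chosen by cross-validation
    print(cv_model.mse_path_.shape)  # (n_l1_ratio, n_alphas, n_folds)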
If set\n to False, no intercept will be used in calculations\n (i.e. data is expected to be centered).\n\n normalize : bool, default=False\n This parameter is ignored when ``fit_intercept`` is set to False.\n If True, the regressors X will be normalized before regression by\n subtracting the mean and dividing by the l2-norm.\n If you wish to standardize, please use\n :class:`~sklearn.preprocessing.StandardScaler` before calling ``fit``\n on an estimator with ``normalize=False``.\n\n .. deprecated:: 1.0\n ``normalize`` was deprecated in version 1.0 and will be removed in\n 1.2.\n\n precompute : 'auto', bool or array-like of shape (n_features, n_features), precompute : bool or array-like of shape (n_features, n_features), default=False\n Whether to use a precomputed Gram matrix to speed up\n calculations. The Gram matrix can also be passed as argument.\n For sparse input this option is always ``False`` to preserve sparsity.\n\n copy_X : bool, default=True\n If ``True``, X will be copied; else, it may be overwritten.\n\n max_iter : int, default=1000\n The maximum number of iterations.\n\n tol : float, default=1e-4\n The tolerance for the optimization: if the updates are\n smaller than ``tol``, the optimization code checks the\n dual gap for optimality and continues until it is smaller\n than ``tol``.\n\n warm_start : bool, default=False\n When set to True, reuse the solution of the previous call to fit as\n initialization, otherwise, just erase the previous solution.\n See :term:`the Glossary `.\n\n positive : bool, default=False\n When set to ``True``, forces the coefficients to be positive.\n\n random_state : int, RandomState instance, default=None\n The seed of the pseudo random number generator that selects a random\n feature to update. Used when ``selection`` == 'random'.\n Pass an int for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n selection : {'cyclic', 'random'}, default='cyclic'\n If set to 'random', a random coefficient is updated every iteration\n rather than looping over features sequentially by default. This\n (setting to 'random') often leads to significantly faster convergence\n especially when tol is higher than 1e-4.\n\n Attributes\n ----------\n coef_ : ndarray of shape (n_features,) or (n_targets, n_features)\n Parameter vector (w in the cost function formula).\n\n dual_gap_ : float or ndarray of shape (n_targets,)\n Given param alpha, the dual gaps at the end of the optimization,\n same shape as each observation of y.\n\n sparse_coef_ : sparse matrix of shape (n_features, 1) or (n_targets, n_features)\n Readonly property derived from ``coef_``.\n\n intercept_ : float or ndarray of shape (n_targets,)\n Independent term in decision function.\n\n n_iter_ : int or list of int\n Number of iterations run by the coordinate descent solver to reach\n the specified tolerance.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. 
versionadded:: 1.0\n\n See Also\n --------\n lars_path : Regularization path using LARS.\n lasso_path : Regularization path using Lasso.\n LassoLars : Lasso Path along the regularization parameter usingLARS algorithm.\n LassoCV : Lasso alpha parameter by cross-validation.\n LassoLarsCV : Lasso least angle parameter algorithm by cross-validation.\n sklearn.decomposition.sparse_encode : Sparse coding array estimator.\n\n Notes\n -----\n The algorithm used to fit the model is coordinate descent.\n\n To avoid unnecessary memory duplication the X argument of the fit method\n should be directly passed as a Fortran-contiguous numpy array.\n\n Examples\n --------\n >>> from sklearn import linear_model\n >>> clf = linear_model.Lasso(alpha=0.1)\n >>> clf.fit([[0,0], [1, 1], [2, 2]], [0, 1, 2])\n Lasso(alpha=0.1)\n >>> print(clf.coef_)\n [0.85 0. ]\n >>> print(clf.intercept_)\n 0.15...\n ", - "source_code": "\n\nclass Lasso(ElasticNet):\n \"\"\"Linear Model trained with L1 prior as regularizer (aka the Lasso).\n\n The optimization objective for Lasso is::\n\n (1 / (2 * n_samples)) * ||y - Xw||^2_2 + alpha * ||w||_1\n\n Technically the Lasso model is optimizing the same objective function as\n the Elastic Net with ``l1_ratio=1.0`` (no L2 penalty).\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n alpha : float, default=1.0\n Constant that multiplies the L1 term. Defaults to 1.0.\n ``alpha = 0`` is equivalent to an ordinary least square, solved\n by the :class:`LinearRegression` object. For numerical\n reasons, using ``alpha = 0`` with the ``Lasso`` object is not advised.\n Given this, you should use the :class:`LinearRegression` object.\n\n fit_intercept : bool, default=True\n Whether to calculate the intercept for this model. If set\n to False, no intercept will be used in calculations\n (i.e. data is expected to be centered).\n\n normalize : bool, default=False\n This parameter is ignored when ``fit_intercept`` is set to False.\n If True, the regressors X will be normalized before regression by\n subtracting the mean and dividing by the l2-norm.\n If you wish to standardize, please use\n :class:`~sklearn.preprocessing.StandardScaler` before calling ``fit``\n on an estimator with ``normalize=False``.\n\n .. deprecated:: 1.0\n ``normalize`` was deprecated in version 1.0 and will be removed in\n 1.2.\n\n precompute : 'auto', bool or array-like of shape (n_features, n_features), precompute : bool or array-like of shape (n_features, n_features), default=False\n Whether to use a precomputed Gram matrix to speed up\n calculations. The Gram matrix can also be passed as argument.\n For sparse input this option is always ``False`` to preserve sparsity.\n\n copy_X : bool, default=True\n If ``True``, X will be copied; else, it may be overwritten.\n\n max_iter : int, default=1000\n The maximum number of iterations.\n\n tol : float, default=1e-4\n The tolerance for the optimization: if the updates are\n smaller than ``tol``, the optimization code checks the\n dual gap for optimality and continues until it is smaller\n than ``tol``.\n\n warm_start : bool, default=False\n When set to True, reuse the solution of the previous call to fit as\n initialization, otherwise, just erase the previous solution.\n See :term:`the Glossary `.\n\n positive : bool, default=False\n When set to ``True``, forces the coefficients to be positive.\n\n random_state : int, RandomState instance, default=None\n The seed of the pseudo random number generator that selects a random\n feature to update. 
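Relating the Lasso entry above to the earlier ElasticNet entry: as stated in its docstring, Lasso optimizes the same objective as ElasticNet with ``l1_ratio=1.0``, which can be checked numerically on the small example data used in that docstring; a hedged sketch (the equality holds up to solver tolerance)::

    import numpy as np
    from sklearn.linear_model import ElasticNet, Lasso

    X = np.array([[0.0, 0.0], [1.0, 1.0], [2.0, 2.0]])
    y = np.array([0.0, 1.0, 2.0])

    lasso = Lasso(alpha=0.1).fit(X, y)
    enet = ElasticNet(alpha=0.1, l1_ratio=1.0).fit(X, y)

    # Same objective and the same coordinate descent solver, so the fitted
    # coefficients and intercepts should agree.
    assert np.allclose(lasso.coef_, enet.coef_)
    assert np.isclose(lasso.intercept_, enet.intercept_)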
Used when ``selection`` == 'random'.\n Pass an int for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n selection : {'cyclic', 'random'}, default='cyclic'\n If set to 'random', a random coefficient is updated every iteration\n rather than looping over features sequentially by default. This\n (setting to 'random') often leads to significantly faster convergence\n especially when tol is higher than 1e-4.\n\n Attributes\n ----------\n coef_ : ndarray of shape (n_features,) or (n_targets, n_features)\n Parameter vector (w in the cost function formula).\n\n dual_gap_ : float or ndarray of shape (n_targets,)\n Given param alpha, the dual gaps at the end of the optimization,\n same shape as each observation of y.\n\n sparse_coef_ : sparse matrix of shape (n_features, 1) or (n_targets, n_features)\n Readonly property derived from ``coef_``.\n\n intercept_ : float or ndarray of shape (n_targets,)\n Independent term in decision function.\n\n n_iter_ : int or list of int\n Number of iterations run by the coordinate descent solver to reach\n the specified tolerance.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n lars_path : Regularization path using LARS.\n lasso_path : Regularization path using Lasso.\n LassoLars : Lasso Path along the regularization parameter usingLARS algorithm.\n LassoCV : Lasso alpha parameter by cross-validation.\n LassoLarsCV : Lasso least angle parameter algorithm by cross-validation.\n sklearn.decomposition.sparse_encode : Sparse coding array estimator.\n\n Notes\n -----\n The algorithm used to fit the model is coordinate descent.\n\n To avoid unnecessary memory duplication the X argument of the fit method\n should be directly passed as a Fortran-contiguous numpy array.\n\n Examples\n --------\n >>> from sklearn import linear_model\n >>> clf = linear_model.Lasso(alpha=0.1)\n >>> clf.fit([[0,0], [1, 1], [2, 2]], [0, 1, 2])\n Lasso(alpha=0.1)\n >>> print(clf.coef_)\n [0.85 0. 
]\n >>> print(clf.intercept_)\n 0.15...\n \"\"\"\n path = staticmethod(enet_path)\n \n def __init__(self, alpha=1.0, *, fit_intercept=True, normalize='deprecated', precompute=False, copy_X=True, max_iter=1000, tol=0.0001, warm_start=False, positive=False, random_state=None, selection='cyclic'):\n super().__init__(alpha=alpha, l1_ratio=1.0, fit_intercept=fit_intercept, normalize=normalize, precompute=precompute, copy_X=copy_X, max_iter=max_iter, tol=tol, warm_start=warm_start, positive=positive, random_state=random_state, selection=selection)\n" + "description": "Linear Model trained with L1 prior as regularizer (aka the Lasso).\n\nThe optimization objective for Lasso is::\n\n (1 / (2 * n_samples)) * ||y - Xw||^2_2 + alpha * ||w||_1\n\nTechnically the Lasso model is optimizing the same objective function as\nthe Elastic Net with ``l1_ratio=1.0`` (no L2 penalty).\n\nRead more in the :ref:`User Guide `.", + "docstring": "Linear Model trained with L1 prior as regularizer (aka the Lasso).\n\n The optimization objective for Lasso is::\n\n (1 / (2 * n_samples)) * ||y - Xw||^2_2 + alpha * ||w||_1\n\n Technically the Lasso model is optimizing the same objective function as\n the Elastic Net with ``l1_ratio=1.0`` (no L2 penalty).\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n alpha : float, default=1.0\n Constant that multiplies the L1 term. Defaults to 1.0.\n ``alpha = 0`` is equivalent to an ordinary least square, solved\n by the :class:`LinearRegression` object. For numerical\n reasons, using ``alpha = 0`` with the ``Lasso`` object is not advised.\n Given this, you should use the :class:`LinearRegression` object.\n\n fit_intercept : bool, default=True\n Whether to calculate the intercept for this model. If set\n to False, no intercept will be used in calculations\n (i.e. data is expected to be centered).\n\n normalize : bool, default=False\n This parameter is ignored when ``fit_intercept`` is set to False.\n If True, the regressors X will be normalized before regression by\n subtracting the mean and dividing by the l2-norm.\n If you wish to standardize, please use\n :class:`~sklearn.preprocessing.StandardScaler` before calling ``fit``\n on an estimator with ``normalize=False``.\n\n .. deprecated:: 1.0\n ``normalize`` was deprecated in version 1.0 and will be removed in\n 1.2.\n\n precompute : bool or array-like of shape (n_features, n_features), default=False\n Whether to use a precomputed Gram matrix to speed up\n calculations. The Gram matrix can also be passed as argument.\n For sparse input this option is always ``False`` to preserve sparsity.\n\n copy_X : bool, default=True\n If ``True``, X will be copied; else, it may be overwritten.\n\n max_iter : int, default=1000\n The maximum number of iterations.\n\n tol : float, default=1e-4\n The tolerance for the optimization: if the updates are\n smaller than ``tol``, the optimization code checks the\n dual gap for optimality and continues until it is smaller\n than ``tol``.\n\n warm_start : bool, default=False\n When set to True, reuse the solution of the previous call to fit as\n initialization, otherwise, just erase the previous solution.\n See :term:`the Glossary `.\n\n positive : bool, default=False\n When set to ``True``, forces the coefficients to be positive.\n\n random_state : int, RandomState instance, default=None\n The seed of the pseudo random number generator that selects a random\n feature to update. 
Used when ``selection`` == 'random'.\n Pass an int for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n selection : {'cyclic', 'random'}, default='cyclic'\n If set to 'random', a random coefficient is updated every iteration\n rather than looping over features sequentially by default. This\n (setting to 'random') often leads to significantly faster convergence\n especially when tol is higher than 1e-4.\n\n Attributes\n ----------\n coef_ : ndarray of shape (n_features,) or (n_targets, n_features)\n Parameter vector (w in the cost function formula).\n\n dual_gap_ : float or ndarray of shape (n_targets,)\n Given param alpha, the dual gaps at the end of the optimization,\n same shape as each observation of y.\n\n sparse_coef_ : sparse matrix of shape (n_features, 1) or (n_targets, n_features)\n Readonly property derived from ``coef_``.\n\n intercept_ : float or ndarray of shape (n_targets,)\n Independent term in decision function.\n\n n_iter_ : int or list of int\n Number of iterations run by the coordinate descent solver to reach\n the specified tolerance.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n lars_path : Regularization path using LARS.\n lasso_path : Regularization path using Lasso.\n LassoLars : Lasso Path along the regularization parameter usingLARS algorithm.\n LassoCV : Lasso alpha parameter by cross-validation.\n LassoLarsCV : Lasso least angle parameter algorithm by cross-validation.\n sklearn.decomposition.sparse_encode : Sparse coding array estimator.\n\n Notes\n -----\n The algorithm used to fit the model is coordinate descent.\n\n To avoid unnecessary memory duplication the X argument of the fit method\n should be directly passed as a Fortran-contiguous numpy array.\n\n Examples\n --------\n >>> from sklearn import linear_model\n >>> clf = linear_model.Lasso(alpha=0.1)\n >>> clf.fit([[0,0], [1, 1], [2, 2]], [0, 1, 2])\n Lasso(alpha=0.1)\n >>> print(clf.coef_)\n [0.85 0. ]\n >>> print(clf.intercept_)\n 0.15...\n ", + "source_code": "\n\nclass Lasso(ElasticNet):\n \"\"\"Linear Model trained with L1 prior as regularizer (aka the Lasso).\n\n The optimization objective for Lasso is::\n\n (1 / (2 * n_samples)) * ||y - Xw||^2_2 + alpha * ||w||_1\n\n Technically the Lasso model is optimizing the same objective function as\n the Elastic Net with ``l1_ratio=1.0`` (no L2 penalty).\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n alpha : float, default=1.0\n Constant that multiplies the L1 term. Defaults to 1.0.\n ``alpha = 0`` is equivalent to an ordinary least square, solved\n by the :class:`LinearRegression` object. For numerical\n reasons, using ``alpha = 0`` with the ``Lasso`` object is not advised.\n Given this, you should use the :class:`LinearRegression` object.\n\n fit_intercept : bool, default=True\n Whether to calculate the intercept for this model. If set\n to False, no intercept will be used in calculations\n (i.e. 
data is expected to be centered).\n\n normalize : bool, default=False\n This parameter is ignored when ``fit_intercept`` is set to False.\n If True, the regressors X will be normalized before regression by\n subtracting the mean and dividing by the l2-norm.\n If you wish to standardize, please use\n :class:`~sklearn.preprocessing.StandardScaler` before calling ``fit``\n on an estimator with ``normalize=False``.\n\n .. deprecated:: 1.0\n ``normalize`` was deprecated in version 1.0 and will be removed in\n 1.2.\n\n precompute : bool or array-like of shape (n_features, n_features), default=False\n Whether to use a precomputed Gram matrix to speed up\n calculations. The Gram matrix can also be passed as argument.\n For sparse input this option is always ``False`` to preserve sparsity.\n\n copy_X : bool, default=True\n If ``True``, X will be copied; else, it may be overwritten.\n\n max_iter : int, default=1000\n The maximum number of iterations.\n\n tol : float, default=1e-4\n The tolerance for the optimization: if the updates are\n smaller than ``tol``, the optimization code checks the\n dual gap for optimality and continues until it is smaller\n than ``tol``.\n\n warm_start : bool, default=False\n When set to True, reuse the solution of the previous call to fit as\n initialization, otherwise, just erase the previous solution.\n See :term:`the Glossary `.\n\n positive : bool, default=False\n When set to ``True``, forces the coefficients to be positive.\n\n random_state : int, RandomState instance, default=None\n The seed of the pseudo random number generator that selects a random\n feature to update. Used when ``selection`` == 'random'.\n Pass an int for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n selection : {'cyclic', 'random'}, default='cyclic'\n If set to 'random', a random coefficient is updated every iteration\n rather than looping over features sequentially by default. This\n (setting to 'random') often leads to significantly faster convergence\n especially when tol is higher than 1e-4.\n\n Attributes\n ----------\n coef_ : ndarray of shape (n_features,) or (n_targets, n_features)\n Parameter vector (w in the cost function formula).\n\n dual_gap_ : float or ndarray of shape (n_targets,)\n Given param alpha, the dual gaps at the end of the optimization,\n same shape as each observation of y.\n\n sparse_coef_ : sparse matrix of shape (n_features, 1) or (n_targets, n_features)\n Readonly property derived from ``coef_``.\n\n intercept_ : float or ndarray of shape (n_targets,)\n Independent term in decision function.\n\n n_iter_ : int or list of int\n Number of iterations run by the coordinate descent solver to reach\n the specified tolerance.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. 
versionadded:: 1.0\n\n See Also\n --------\n lars_path : Regularization path using LARS.\n lasso_path : Regularization path using Lasso.\n LassoLars : Lasso Path along the regularization parameter usingLARS algorithm.\n LassoCV : Lasso alpha parameter by cross-validation.\n LassoLarsCV : Lasso least angle parameter algorithm by cross-validation.\n sklearn.decomposition.sparse_encode : Sparse coding array estimator.\n\n Notes\n -----\n The algorithm used to fit the model is coordinate descent.\n\n To avoid unnecessary memory duplication the X argument of the fit method\n should be directly passed as a Fortran-contiguous numpy array.\n\n Examples\n --------\n >>> from sklearn import linear_model\n >>> clf = linear_model.Lasso(alpha=0.1)\n >>> clf.fit([[0,0], [1, 1], [2, 2]], [0, 1, 2])\n Lasso(alpha=0.1)\n >>> print(clf.coef_)\n [0.85 0. ]\n >>> print(clf.intercept_)\n 0.15...\n \"\"\"\n path = staticmethod(enet_path)\n \n def __init__(self, alpha=1.0, *, fit_intercept=True, normalize='deprecated', precompute=False, copy_X=True, max_iter=1000, tol=0.0001, warm_start=False, positive=False, random_state=None, selection='cyclic'):\n super().__init__(alpha=alpha, l1_ratio=1.0, fit_intercept=fit_intercept, normalize=normalize, precompute=precompute, copy_X=copy_X, max_iter=max_iter, tol=tol, warm_start=warm_start, positive=positive, random_state=random_state, selection=selection)\n" }, { "name": "LassoCV", @@ -23457,7 +23524,7 @@ "sklearn.linear_model._coordinate_descent.LassoCV._more_tags" ], "is_public": true, - "description": "Lasso linear model with iterative fitting along a regularization path.\n\nSee glossary entry for :term:`cross-validation estimator`. The best model is selected by cross-validation. The optimization objective for Lasso is:: (1 / (2 * n_samples)) * ||y - Xw||^2_2 + alpha * ||w||_1 Read more in the :ref:`User Guide `.", + "description": "Lasso linear model with iterative fitting along a regularization path.\n\nSee glossary entry for :term:`cross-validation estimator`.\n\nThe best model is selected by cross-validation.\n\nThe optimization objective for Lasso is::\n\n (1 / (2 * n_samples)) * ||y - Xw||^2_2 + alpha * ||w||_1\n\nRead more in the :ref:`User Guide `.", "docstring": "Lasso linear model with iterative fitting along a regularization path.\n\n See glossary entry for :term:`cross-validation estimator`.\n\n The best model is selected by cross-validation.\n\n The optimization objective for Lasso is::\n\n (1 / (2 * n_samples)) * ||y - Xw||^2_2 + alpha * ||w||_1\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n eps : float, default=1e-3\n Length of the path. ``eps=1e-3`` means that\n ``alpha_min / alpha_max = 1e-3``.\n\n n_alphas : int, default=100\n Number of alphas along the regularization path.\n\n alphas : ndarray, default=None\n List of alphas where to compute the models.\n If ``None`` alphas are set automatically.\n\n fit_intercept : bool, default=True\n Whether to calculate the intercept for this model. If set\n to false, no intercept will be used in calculations\n (i.e. data is expected to be centered).\n\n normalize : bool, default=False\n This parameter is ignored when ``fit_intercept`` is set to False.\n If True, the regressors X will be normalized before regression by\n subtracting the mean and dividing by the l2-norm.\n If you wish to standardize, please use\n :class:`~sklearn.preprocessing.StandardScaler` before calling ``fit``\n on an estimator with ``normalize=False``.\n\n .. 
deprecated:: 1.0\n ``normalize`` was deprecated in version 1.0 and will be removed in\n 1.2.\n\n precompute : 'auto', bool or array-like of shape (n_features, n_features), default='auto'\n Whether to use a precomputed Gram matrix to speed up\n calculations. If set to ``'auto'`` let us decide. The Gram\n matrix can also be passed as argument.\n\n max_iter : int, default=1000\n The maximum number of iterations.\n\n tol : float, default=1e-4\n The tolerance for the optimization: if the updates are\n smaller than ``tol``, the optimization code checks the\n dual gap for optimality and continues until it is smaller\n than ``tol``.\n\n copy_X : bool, default=True\n If ``True``, X will be copied; else, it may be overwritten.\n\n cv : int, cross-validation generator or iterable, default=None\n Determines the cross-validation splitting strategy.\n Possible inputs for cv are:\n\n - None, to use the default 5-fold cross-validation,\n - int, to specify the number of folds.\n - :term:`CV splitter`,\n - An iterable yielding (train, test) splits as arrays of indices.\n\n For int/None inputs, :class:`KFold` is used.\n\n Refer :ref:`User Guide ` for the various\n cross-validation strategies that can be used here.\n\n .. versionchanged:: 0.22\n ``cv`` default value if None changed from 3-fold to 5-fold.\n\n verbose : bool or int, default=False\n Amount of verbosity.\n\n n_jobs : int, default=None\n Number of CPUs to use during the cross validation.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n positive : bool, default=False\n If positive, restrict regression coefficients to be positive.\n\n random_state : int, RandomState instance, default=None\n The seed of the pseudo random number generator that selects a random\n feature to update. Used when ``selection`` == 'random'.\n Pass an int for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n selection : {'cyclic', 'random'}, default='cyclic'\n If set to 'random', a random coefficient is updated every iteration\n rather than looping over features sequentially by default. This\n (setting to 'random') often leads to significantly faster convergence\n especially when tol is higher than 1e-4.\n\n Attributes\n ----------\n alpha_ : float\n The amount of penalization chosen by cross validation.\n\n coef_ : ndarray of shape (n_features,) or (n_targets, n_features)\n Parameter vector (w in the cost function formula).\n\n intercept_ : float or ndarray of shape (n_targets,)\n Independent term in decision function.\n\n mse_path_ : ndarray of shape (n_alphas, n_folds)\n Mean square error for the test set on each fold, varying alpha.\n\n alphas_ : ndarray of shape (n_alphas,)\n The grid of alphas used for fitting.\n\n dual_gap_ : float or ndarray of shape (n_targets,)\n The dual gap at the end of the optimization for the optimal alpha\n (``alpha_``).\n\n n_iter_ : int\n Number of iterations run by the coordinate descent solver to reach\n the specified tolerance for the optimal alpha.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. 
versionadded:: 1.0\n\n See Also\n --------\n lars_path : Compute Least Angle Regression or Lasso path using LARS\n algorithm.\n lasso_path : Compute Lasso path with coordinate descent.\n Lasso : The Lasso is a linear model that estimates sparse coefficients.\n LassoLars : Lasso model fit with Least Angle Regression a.k.a. Lars.\n LassoCV : Lasso linear model with iterative fitting along a regularization\n path.\n LassoLarsCV : Cross-validated Lasso using the LARS algorithm.\n\n Notes\n -----\n For an example, see\n :ref:`examples/linear_model/plot_lasso_model_selection.py\n `.\n\n To avoid unnecessary memory duplication the X argument of the fit method\n should be directly passed as a Fortran-contiguous numpy array.\n\n Examples\n --------\n >>> from sklearn.linear_model import LassoCV\n >>> from sklearn.datasets import make_regression\n >>> X, y = make_regression(noise=4, random_state=0)\n >>> reg = LassoCV(cv=5, random_state=0).fit(X, y)\n >>> reg.score(X, y)\n 0.9993...\n >>> reg.predict(X[:1,])\n array([-78.4951...])\n ", "source_code": "\n\nclass LassoCV(RegressorMixin, LinearModelCV):\n \"\"\"Lasso linear model with iterative fitting along a regularization path.\n\n See glossary entry for :term:`cross-validation estimator`.\n\n The best model is selected by cross-validation.\n\n The optimization objective for Lasso is::\n\n (1 / (2 * n_samples)) * ||y - Xw||^2_2 + alpha * ||w||_1\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n eps : float, default=1e-3\n Length of the path. ``eps=1e-3`` means that\n ``alpha_min / alpha_max = 1e-3``.\n\n n_alphas : int, default=100\n Number of alphas along the regularization path.\n\n alphas : ndarray, default=None\n List of alphas where to compute the models.\n If ``None`` alphas are set automatically.\n\n fit_intercept : bool, default=True\n Whether to calculate the intercept for this model. If set\n to false, no intercept will be used in calculations\n (i.e. data is expected to be centered).\n\n normalize : bool, default=False\n This parameter is ignored when ``fit_intercept`` is set to False.\n If True, the regressors X will be normalized before regression by\n subtracting the mean and dividing by the l2-norm.\n If you wish to standardize, please use\n :class:`~sklearn.preprocessing.StandardScaler` before calling ``fit``\n on an estimator with ``normalize=False``.\n\n .. deprecated:: 1.0\n ``normalize`` was deprecated in version 1.0 and will be removed in\n 1.2.\n\n precompute : 'auto', bool or array-like of shape (n_features, n_features), default='auto'\n Whether to use a precomputed Gram matrix to speed up\n calculations. If set to ``'auto'`` let us decide. 
The Gram\n matrix can also be passed as argument.\n\n max_iter : int, default=1000\n The maximum number of iterations.\n\n tol : float, default=1e-4\n The tolerance for the optimization: if the updates are\n smaller than ``tol``, the optimization code checks the\n dual gap for optimality and continues until it is smaller\n than ``tol``.\n\n copy_X : bool, default=True\n If ``True``, X will be copied; else, it may be overwritten.\n\n cv : int, cross-validation generator or iterable, default=None\n Determines the cross-validation splitting strategy.\n Possible inputs for cv are:\n\n - None, to use the default 5-fold cross-validation,\n - int, to specify the number of folds.\n - :term:`CV splitter`,\n - An iterable yielding (train, test) splits as arrays of indices.\n\n For int/None inputs, :class:`KFold` is used.\n\n Refer :ref:`User Guide ` for the various\n cross-validation strategies that can be used here.\n\n .. versionchanged:: 0.22\n ``cv`` default value if None changed from 3-fold to 5-fold.\n\n verbose : bool or int, default=False\n Amount of verbosity.\n\n n_jobs : int, default=None\n Number of CPUs to use during the cross validation.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n positive : bool, default=False\n If positive, restrict regression coefficients to be positive.\n\n random_state : int, RandomState instance, default=None\n The seed of the pseudo random number generator that selects a random\n feature to update. Used when ``selection`` == 'random'.\n Pass an int for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n selection : {'cyclic', 'random'}, default='cyclic'\n If set to 'random', a random coefficient is updated every iteration\n rather than looping over features sequentially by default. This\n (setting to 'random') often leads to significantly faster convergence\n especially when tol is higher than 1e-4.\n\n Attributes\n ----------\n alpha_ : float\n The amount of penalization chosen by cross validation.\n\n coef_ : ndarray of shape (n_features,) or (n_targets, n_features)\n Parameter vector (w in the cost function formula).\n\n intercept_ : float or ndarray of shape (n_targets,)\n Independent term in decision function.\n\n mse_path_ : ndarray of shape (n_alphas, n_folds)\n Mean square error for the test set on each fold, varying alpha.\n\n alphas_ : ndarray of shape (n_alphas,)\n The grid of alphas used for fitting.\n\n dual_gap_ : float or ndarray of shape (n_targets,)\n The dual gap at the end of the optimization for the optimal alpha\n (``alpha_``).\n\n n_iter_ : int\n Number of iterations run by the coordinate descent solver to reach\n the specified tolerance for the optimal alpha.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n lars_path : Compute Least Angle Regression or Lasso path using LARS\n algorithm.\n lasso_path : Compute Lasso path with coordinate descent.\n Lasso : The Lasso is a linear model that estimates sparse coefficients.\n LassoLars : Lasso model fit with Least Angle Regression a.k.a. 
Lars.\n LassoCV : Lasso linear model with iterative fitting along a regularization\n path.\n LassoLarsCV : Cross-validated Lasso using the LARS algorithm.\n\n Notes\n -----\n For an example, see\n :ref:`examples/linear_model/plot_lasso_model_selection.py\n `.\n\n To avoid unnecessary memory duplication the X argument of the fit method\n should be directly passed as a Fortran-contiguous numpy array.\n\n Examples\n --------\n >>> from sklearn.linear_model import LassoCV\n >>> from sklearn.datasets import make_regression\n >>> X, y = make_regression(noise=4, random_state=0)\n >>> reg = LassoCV(cv=5, random_state=0).fit(X, y)\n >>> reg.score(X, y)\n 0.9993...\n >>> reg.predict(X[:1,])\n array([-78.4951...])\n \"\"\"\n path = staticmethod(lasso_path)\n \n def __init__(self, *, eps=0.001, n_alphas=100, alphas=None, fit_intercept=True, normalize='deprecated', precompute='auto', max_iter=1000, tol=0.0001, copy_X=True, cv=None, verbose=False, n_jobs=None, positive=False, random_state=None, selection='cyclic'):\n super().__init__(eps=eps, n_alphas=n_alphas, alphas=alphas, fit_intercept=fit_intercept, normalize=normalize, precompute=precompute, max_iter=max_iter, tol=tol, copy_X=copy_X, cv=cv, verbose=verbose, n_jobs=n_jobs, positive=positive, random_state=random_state, selection=selection)\n \n def _get_estimator(self):\n return Lasso()\n \n def _is_multitask(self):\n return False\n \n def _more_tags(self):\n return {'multioutput': False}\n" }, @@ -23490,7 +23557,7 @@ "sklearn.linear_model._coordinate_descent.MultiTaskElasticNet._more_tags" ], "is_public": true, - "description": "Multi-task ElasticNet model trained with L1/L2 mixed-norm as regularizer.\n\nThe optimization objective for MultiTaskElasticNet is:: (1 / (2 * n_samples)) * ||Y - XW||_Fro^2 + alpha * l1_ratio * ||W||_21 + 0.5 * alpha * (1 - l1_ratio) * ||W||_Fro^2 Where:: ||W||_21 = sum_i sqrt(sum_j W_ij ^ 2) i.e. the sum of norms of each row. Read more in the :ref:`User Guide `.", + "description": "Multi-task ElasticNet model trained with L1/L2 mixed-norm as regularizer.\n\nThe optimization objective for MultiTaskElasticNet is::\n\n (1 / (2 * n_samples)) * ||Y - XW||_Fro^2\n + alpha * l1_ratio * ||W||_21\n + 0.5 * alpha * (1 - l1_ratio) * ||W||_Fro^2\n\nWhere::\n\n ||W||_21 = sum_i sqrt(sum_j W_ij ^ 2)\n\ni.e. the sum of norms of each row.\n\nRead more in the :ref:`User Guide `.", "docstring": "Multi-task ElasticNet model trained with L1/L2 mixed-norm as regularizer.\n\n The optimization objective for MultiTaskElasticNet is::\n\n (1 / (2 * n_samples)) * ||Y - XW||_Fro^2\n + alpha * l1_ratio * ||W||_21\n + 0.5 * alpha * (1 - l1_ratio) * ||W||_Fro^2\n\n Where::\n\n ||W||_21 = sum_i sqrt(sum_j W_ij ^ 2)\n\n i.e. the sum of norms of each row.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n alpha : float, default=1.0\n Constant that multiplies the L1/L2 term. Defaults to 1.0.\n\n l1_ratio : float, default=0.5\n The ElasticNet mixing parameter, with 0 < l1_ratio <= 1.\n For l1_ratio = 1 the penalty is an L1/L2 penalty. For l1_ratio = 0 it\n is an L2 penalty.\n For ``0 < l1_ratio < 1``, the penalty is a combination of L1/L2 and L2.\n\n fit_intercept : bool, default=True\n Whether to calculate the intercept for this model. If set\n to false, no intercept will be used in calculations\n (i.e. 
data is expected to be centered).\n\n normalize : bool, default=False\n This parameter is ignored when ``fit_intercept`` is set to False.\n If True, the regressors X will be normalized before regression by\n subtracting the mean and dividing by the l2-norm.\n If you wish to standardize, please use\n :class:`~sklearn.preprocessing.StandardScaler` before calling ``fit``\n on an estimator with ``normalize=False``.\n\n .. deprecated:: 1.0\n ``normalize`` was deprecated in version 1.0 and will be removed in\n 1.2.\n\n copy_X : bool, default=True\n If ``True``, X will be copied; else, it may be overwritten.\n\n max_iter : int, default=1000\n The maximum number of iterations.\n\n tol : float, default=1e-4\n The tolerance for the optimization: if the updates are\n smaller than ``tol``, the optimization code checks the\n dual gap for optimality and continues until it is smaller\n than ``tol``.\n\n warm_start : bool, default=False\n When set to ``True``, reuse the solution of the previous call to fit as\n initialization, otherwise, just erase the previous solution.\n See :term:`the Glossary `.\n\n random_state : int, RandomState instance, default=None\n The seed of the pseudo random number generator that selects a random\n feature to update. Used when ``selection`` == 'random'.\n Pass an int for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n selection : {'cyclic', 'random'}, default='cyclic'\n If set to 'random', a random coefficient is updated every iteration\n rather than looping over features sequentially by default. This\n (setting to 'random') often leads to significantly faster convergence\n especially when tol is higher than 1e-4.\n\n Attributes\n ----------\n intercept_ : ndarray of shape (n_targets,)\n Independent term in decision function.\n\n coef_ : ndarray of shape (n_targets, n_features)\n Parameter vector (W in the cost function formula). If a 1D y is\n passed in at fit (non multi-task usage), ``coef_`` is then a 1D array.\n Note that ``coef_`` stores the transpose of ``W``, ``W.T``.\n\n n_iter_ : int\n Number of iterations run by the coordinate descent solver to reach\n the specified tolerance.\n\n dual_gap_ : float\n The dual gaps at the end of the optimization.\n\n eps_ : float\n The tolerance scaled scaled by the variance of the target `y`.\n\n sparse_coef_ : sparse matrix of shape (n_features,) or (n_targets, n_features)\n Sparse representation of the `coef_`.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. 
versionadded:: 1.0\n\n See Also\n --------\n MultiTaskElasticNetCV : Multi-task L1/L2 ElasticNet with built-in\n cross-validation.\n ElasticNet : Linear regression with combined L1 and L2 priors as regularizer.\n MultiTaskLasso : Multi-task L1/L2 Lasso with built-in cross-validation.\n\n Notes\n -----\n The algorithm used to fit the model is coordinate descent.\n\n To avoid unnecessary memory duplication the X and y arguments of the fit\n method should be directly passed as Fortran-contiguous numpy arrays.\n\n Examples\n --------\n >>> from sklearn import linear_model\n >>> clf = linear_model.MultiTaskElasticNet(alpha=0.1)\n >>> clf.fit([[0,0], [1, 1], [2, 2]], [[0, 0], [1, 1], [2, 2]])\n MultiTaskElasticNet(alpha=0.1)\n >>> print(clf.coef_)\n [[0.45663524 0.45612256]\n [0.45663524 0.45612256]]\n >>> print(clf.intercept_)\n [0.0872422 0.0872422]\n ", "source_code": "\n\nclass MultiTaskElasticNet(Lasso):\n \"\"\"Multi-task ElasticNet model trained with L1/L2 mixed-norm as regularizer.\n\n The optimization objective for MultiTaskElasticNet is::\n\n (1 / (2 * n_samples)) * ||Y - XW||_Fro^2\n + alpha * l1_ratio * ||W||_21\n + 0.5 * alpha * (1 - l1_ratio) * ||W||_Fro^2\n\n Where::\n\n ||W||_21 = sum_i sqrt(sum_j W_ij ^ 2)\n\n i.e. the sum of norms of each row.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n alpha : float, default=1.0\n Constant that multiplies the L1/L2 term. Defaults to 1.0.\n\n l1_ratio : float, default=0.5\n The ElasticNet mixing parameter, with 0 < l1_ratio <= 1.\n For l1_ratio = 1 the penalty is an L1/L2 penalty. For l1_ratio = 0 it\n is an L2 penalty.\n For ``0 < l1_ratio < 1``, the penalty is a combination of L1/L2 and L2.\n\n fit_intercept : bool, default=True\n Whether to calculate the intercept for this model. If set\n to false, no intercept will be used in calculations\n (i.e. data is expected to be centered).\n\n normalize : bool, default=False\n This parameter is ignored when ``fit_intercept`` is set to False.\n If True, the regressors X will be normalized before regression by\n subtracting the mean and dividing by the l2-norm.\n If you wish to standardize, please use\n :class:`~sklearn.preprocessing.StandardScaler` before calling ``fit``\n on an estimator with ``normalize=False``.\n\n .. deprecated:: 1.0\n ``normalize`` was deprecated in version 1.0 and will be removed in\n 1.2.\n\n copy_X : bool, default=True\n If ``True``, X will be copied; else, it may be overwritten.\n\n max_iter : int, default=1000\n The maximum number of iterations.\n\n tol : float, default=1e-4\n The tolerance for the optimization: if the updates are\n smaller than ``tol``, the optimization code checks the\n dual gap for optimality and continues until it is smaller\n than ``tol``.\n\n warm_start : bool, default=False\n When set to ``True``, reuse the solution of the previous call to fit as\n initialization, otherwise, just erase the previous solution.\n See :term:`the Glossary `.\n\n random_state : int, RandomState instance, default=None\n The seed of the pseudo random number generator that selects a random\n feature to update. Used when ``selection`` == 'random'.\n Pass an int for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n selection : {'cyclic', 'random'}, default='cyclic'\n If set to 'random', a random coefficient is updated every iteration\n rather than looping over features sequentially by default. 
This\n (setting to 'random') often leads to significantly faster convergence\n especially when tol is higher than 1e-4.\n\n Attributes\n ----------\n intercept_ : ndarray of shape (n_targets,)\n Independent term in decision function.\n\n coef_ : ndarray of shape (n_targets, n_features)\n Parameter vector (W in the cost function formula). If a 1D y is\n passed in at fit (non multi-task usage), ``coef_`` is then a 1D array.\n Note that ``coef_`` stores the transpose of ``W``, ``W.T``.\n\n n_iter_ : int\n Number of iterations run by the coordinate descent solver to reach\n the specified tolerance.\n\n dual_gap_ : float\n The dual gaps at the end of the optimization.\n\n eps_ : float\n The tolerance scaled scaled by the variance of the target `y`.\n\n sparse_coef_ : sparse matrix of shape (n_features,) or (n_targets, n_features)\n Sparse representation of the `coef_`.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n MultiTaskElasticNetCV : Multi-task L1/L2 ElasticNet with built-in\n cross-validation.\n ElasticNet : Linear regression with combined L1 and L2 priors as regularizer.\n MultiTaskLasso : Multi-task L1/L2 Lasso with built-in cross-validation.\n\n Notes\n -----\n The algorithm used to fit the model is coordinate descent.\n\n To avoid unnecessary memory duplication the X and y arguments of the fit\n method should be directly passed as Fortran-contiguous numpy arrays.\n\n Examples\n --------\n >>> from sklearn import linear_model\n >>> clf = linear_model.MultiTaskElasticNet(alpha=0.1)\n >>> clf.fit([[0,0], [1, 1], [2, 2]], [[0, 0], [1, 1], [2, 2]])\n MultiTaskElasticNet(alpha=0.1)\n >>> print(clf.coef_)\n [[0.45663524 0.45612256]\n [0.45663524 0.45612256]]\n >>> print(clf.intercept_)\n [0.0872422 0.0872422]\n \"\"\"\n \n def __init__(self, alpha=1.0, *, l1_ratio=0.5, fit_intercept=True, normalize='deprecated', copy_X=True, max_iter=1000, tol=0.0001, warm_start=False, random_state=None, selection='cyclic'):\n self.l1_ratio = l1_ratio\n self.alpha = alpha\n self.fit_intercept = fit_intercept\n self.normalize = normalize\n self.max_iter = max_iter\n self.copy_X = copy_X\n self.tol = tol\n self.warm_start = warm_start\n self.random_state = random_state\n self.selection = selection\n \n def fit(self, X, y):\n \"\"\"Fit MultiTaskElasticNet model with coordinate descent.\n\n Parameters\n ----------\n X : ndarray of shape (n_samples, n_features)\n Data.\n y : ndarray of shape (n_samples, n_targets)\n Target. 
Will be cast to X's dtype if necessary.\n\n Returns\n -------\n self : object\n Fitted estimator.\n\n Notes\n -----\n Coordinate descent is an algorithm that considers each column of\n data at a time hence it will automatically convert the X input\n as a Fortran-contiguous numpy array if necessary.\n\n To avoid memory re-allocation it is advised to allocate the\n initial data in memory directly using that format.\n \"\"\"\n _normalize = _deprecate_normalize(self.normalize, default=False, estimator_name=self.__class__.__name__)\n check_X_params = dict(dtype=[np.float64, np.float32], order='F', copy=self.copy_X and self.fit_intercept)\n check_y_params = dict(ensure_2d=False, order='F')\n (X, y) = self._validate_data(X, y, validate_separately=(check_X_params, check_y_params))\n check_consistent_length(X, y)\n y = y.astype(X.dtype)\n if hasattr(self, 'l1_ratio'):\n model_str = 'ElasticNet'\n else:\n model_str = 'Lasso'\n if y.ndim == 1:\n raise ValueError('For mono-task outputs, use %s' % model_str)\n (n_samples, n_features) = X.shape\n n_targets = y.shape[1]\n (X, y, X_offset, y_offset, X_scale) = _preprocess_data(X, y, self.fit_intercept, _normalize, copy=False)\n if not self.warm_start or not hasattr(self, 'coef_'):\n self.coef_ = np.zeros((n_targets, n_features), dtype=X.dtype.type, order='F')\n l1_reg = self.alpha * self.l1_ratio * n_samples\n l2_reg = self.alpha * (1.0 - self.l1_ratio) * n_samples\n self.coef_ = np.asfortranarray(self.coef_)\n if self.selection not in ['random', 'cyclic']:\n raise ValueError('selection should be either random or cyclic.')\n random = self.selection == 'random'\n (self.coef_, self.dual_gap_, self.eps_, self.n_iter_) = cd_fast.enet_coordinate_descent_multi_task(self.coef_, l1_reg, l2_reg, X, y, self.max_iter, self.tol, check_random_state(self.random_state), random)\n self.dual_gap_ /= n_samples\n self._set_intercept(X_offset, y_offset, X_scale)\n return self\n \n def _more_tags(self):\n return {'multioutput_only': True}\n" }, @@ -23507,7 +23574,7 @@ "sklearn.linear_model._coordinate_descent.MultiTaskElasticNetCV.fit" ], "is_public": true, - "description": "Multi-task L1/L2 ElasticNet with built-in cross-validation.\n\nSee glossary entry for :term:`cross-validation estimator`. The optimization objective for MultiTaskElasticNet is:: (1 / (2 * n_samples)) * ||Y - XW||^Fro_2 + alpha * l1_ratio * ||W||_21 + 0.5 * alpha * (1 - l1_ratio) * ||W||_Fro^2 Where:: ||W||_21 = \\sum_i \\sqrt{\\sum_j w_{ij}^2} i.e. the sum of norm of each row. Read more in the :ref:`User Guide `. .. versionadded:: 0.15", + "description": "Multi-task L1/L2 ElasticNet with built-in cross-validation.\n\nSee glossary entry for :term:`cross-validation estimator`.\n\nThe optimization objective for MultiTaskElasticNet is::\n\n (1 / (2 * n_samples)) * ||Y - XW||^Fro_2\n + alpha * l1_ratio * ||W||_21\n + 0.5 * alpha * (1 - l1_ratio) * ||W||_Fro^2\n\nWhere::\n\n ||W||_21 = \\sum_i \\sqrt{\\sum_j w_{ij}^2}\n\ni.e. the sum of norm of each row.\n\nRead more in the :ref:`User Guide `.\n\n.. versionadded:: 0.15", "docstring": "Multi-task L1/L2 ElasticNet with built-in cross-validation.\n\n See glossary entry for :term:`cross-validation estimator`.\n\n The optimization objective for MultiTaskElasticNet is::\n\n (1 / (2 * n_samples)) * ||Y - XW||^Fro_2\n + alpha * l1_ratio * ||W||_21\n + 0.5 * alpha * (1 - l1_ratio) * ||W||_Fro^2\n\n Where::\n\n ||W||_21 = \\sum_i \\sqrt{\\sum_j w_{ij}^2}\n\n i.e. the sum of norm of each row.\n\n Read more in the :ref:`User Guide `.\n\n .. 
versionadded:: 0.15\n\n Parameters\n ----------\n l1_ratio : float or list of float, default=0.5\n The ElasticNet mixing parameter, with 0 < l1_ratio <= 1.\n For l1_ratio = 1 the penalty is an L1/L2 penalty. For l1_ratio = 0 it\n is an L2 penalty.\n For ``0 < l1_ratio < 1``, the penalty is a combination of L1/L2 and L2.\n This parameter can be a list, in which case the different\n values are tested by cross-validation and the one giving the best\n prediction score is used. Note that a good choice of list of\n values for l1_ratio is often to put more values close to 1\n (i.e. Lasso) and less close to 0 (i.e. Ridge), as in ``[.1, .5, .7,\n .9, .95, .99, 1]``.\n\n eps : float, default=1e-3\n Length of the path. ``eps=1e-3`` means that\n ``alpha_min / alpha_max = 1e-3``.\n\n n_alphas : int, default=100\n Number of alphas along the regularization path.\n\n alphas : array-like, default=None\n List of alphas where to compute the models.\n If not provided, set automatically.\n\n fit_intercept : bool, default=True\n Whether to calculate the intercept for this model. If set\n to false, no intercept will be used in calculations\n (i.e. data is expected to be centered).\n\n normalize : bool, default=False\n This parameter is ignored when ``fit_intercept`` is set to False.\n If True, the regressors X will be normalized before regression by\n subtracting the mean and dividing by the l2-norm.\n If you wish to standardize, please use\n :class:`~sklearn.preprocessing.StandardScaler` before calling ``fit``\n on an estimator with ``normalize=False``.\n\n .. deprecated:: 1.0\n ``normalize`` was deprecated in version 1.0 and will be removed in\n 1.2.\n\n max_iter : int, default=1000\n The maximum number of iterations.\n\n tol : float, default=1e-4\n The tolerance for the optimization: if the updates are\n smaller than ``tol``, the optimization code checks the\n dual gap for optimality and continues until it is smaller\n than ``tol``.\n\n cv : int, cross-validation generator or iterable, default=None\n Determines the cross-validation splitting strategy.\n Possible inputs for cv are:\n\n - None, to use the default 5-fold cross-validation,\n - int, to specify the number of folds.\n - :term:`CV splitter`,\n - An iterable yielding (train, test) splits as arrays of indices.\n\n For int/None inputs, :class:`KFold` is used.\n\n Refer :ref:`User Guide ` for the various\n cross-validation strategies that can be used here.\n\n .. versionchanged:: 0.22\n ``cv`` default value if None changed from 3-fold to 5-fold.\n\n copy_X : bool, default=True\n If ``True``, X will be copied; else, it may be overwritten.\n\n verbose : bool or int, default=0\n Amount of verbosity.\n\n n_jobs : int, default=None\n Number of CPUs to use during the cross validation. Note that this is\n used only if multiple values for l1_ratio are given.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n random_state : int, RandomState instance, default=None\n The seed of the pseudo random number generator that selects a random\n feature to update. Used when ``selection`` == 'random'.\n Pass an int for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n selection : {'cyclic', 'random'}, default='cyclic'\n If set to 'random', a random coefficient is updated every iteration\n rather than looping over features sequentially by default. 
This\n (setting to 'random') often leads to significantly faster convergence\n especially when tol is higher than 1e-4.\n\n Attributes\n ----------\n intercept_ : ndarray of shape (n_targets,)\n Independent term in decision function.\n\n coef_ : ndarray of shape (n_targets, n_features)\n Parameter vector (W in the cost function formula).\n Note that ``coef_`` stores the transpose of ``W``, ``W.T``.\n\n alpha_ : float\n The amount of penalization chosen by cross validation.\n\n mse_path_ : ndarray of shape (n_alphas, n_folds) or (n_l1_ratio, n_alphas, n_folds)\n Mean square error for the test set on each fold, varying alpha.\n\n alphas_ : ndarray of shape (n_alphas,) or (n_l1_ratio, n_alphas)\n The grid of alphas used for fitting, for each l1_ratio.\n\n l1_ratio_ : float\n Best l1_ratio obtained by cross-validation.\n\n n_iter_ : int\n Number of iterations run by the coordinate descent solver to reach\n the specified tolerance for the optimal alpha.\n\n dual_gap_ : float\n The dual gap at the end of the optimization for the optimal alpha.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n MultiTaskElasticNet : Multi-task L1/L2 ElasticNet with built-in cross-validation.\n ElasticNetCV : Elastic net model with best model selection by\n cross-validation.\n MultiTaskLassoCV : Multi-task Lasso model trained with L1/L2\n mixed-norm as regularizer.\n\n Notes\n -----\n The algorithm used to fit the model is coordinate descent.\n\n To avoid unnecessary memory duplication the X and y arguments of the fit\n method should be directly passed as Fortran-contiguous numpy arrays.\n\n Examples\n --------\n >>> from sklearn import linear_model\n >>> clf = linear_model.MultiTaskElasticNetCV(cv=3)\n >>> clf.fit([[0,0], [1, 1], [2, 2]],\n ... [[0, 0], [1, 1], [2, 2]])\n MultiTaskElasticNetCV(cv=3)\n >>> print(clf.coef_)\n [[0.52875032 0.46958558]\n [0.52875032 0.46958558]]\n >>> print(clf.intercept_)\n [0.00166409 0.00166409]\n ", "source_code": "\n\nclass MultiTaskElasticNetCV(RegressorMixin, LinearModelCV):\n \"\"\"Multi-task L1/L2 ElasticNet with built-in cross-validation.\n\n See glossary entry for :term:`cross-validation estimator`.\n\n The optimization objective for MultiTaskElasticNet is::\n\n (1 / (2 * n_samples)) * ||Y - XW||^Fro_2\n + alpha * l1_ratio * ||W||_21\n + 0.5 * alpha * (1 - l1_ratio) * ||W||_Fro^2\n\n Where::\n\n ||W||_21 = \\sum_i \\sqrt{\\sum_j w_{ij}^2}\n\n i.e. the sum of norm of each row.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.15\n\n Parameters\n ----------\n l1_ratio : float or list of float, default=0.5\n The ElasticNet mixing parameter, with 0 < l1_ratio <= 1.\n For l1_ratio = 1 the penalty is an L1/L2 penalty. For l1_ratio = 0 it\n is an L2 penalty.\n For ``0 < l1_ratio < 1``, the penalty is a combination of L1/L2 and L2.\n This parameter can be a list, in which case the different\n values are tested by cross-validation and the one giving the best\n prediction score is used. Note that a good choice of list of\n values for l1_ratio is often to put more values close to 1\n (i.e. Lasso) and less close to 0 (i.e. Ridge), as in ``[.1, .5, .7,\n .9, .95, .99, 1]``.\n\n eps : float, default=1e-3\n Length of the path. 
``eps=1e-3`` means that\n ``alpha_min / alpha_max = 1e-3``.\n\n n_alphas : int, default=100\n Number of alphas along the regularization path.\n\n alphas : array-like, default=None\n List of alphas where to compute the models.\n If not provided, set automatically.\n\n fit_intercept : bool, default=True\n Whether to calculate the intercept for this model. If set\n to false, no intercept will be used in calculations\n (i.e. data is expected to be centered).\n\n normalize : bool, default=False\n This parameter is ignored when ``fit_intercept`` is set to False.\n If True, the regressors X will be normalized before regression by\n subtracting the mean and dividing by the l2-norm.\n If you wish to standardize, please use\n :class:`~sklearn.preprocessing.StandardScaler` before calling ``fit``\n on an estimator with ``normalize=False``.\n\n .. deprecated:: 1.0\n ``normalize`` was deprecated in version 1.0 and will be removed in\n 1.2.\n\n max_iter : int, default=1000\n The maximum number of iterations.\n\n tol : float, default=1e-4\n The tolerance for the optimization: if the updates are\n smaller than ``tol``, the optimization code checks the\n dual gap for optimality and continues until it is smaller\n than ``tol``.\n\n cv : int, cross-validation generator or iterable, default=None\n Determines the cross-validation splitting strategy.\n Possible inputs for cv are:\n\n - None, to use the default 5-fold cross-validation,\n - int, to specify the number of folds.\n - :term:`CV splitter`,\n - An iterable yielding (train, test) splits as arrays of indices.\n\n For int/None inputs, :class:`KFold` is used.\n\n Refer :ref:`User Guide ` for the various\n cross-validation strategies that can be used here.\n\n .. versionchanged:: 0.22\n ``cv`` default value if None changed from 3-fold to 5-fold.\n\n copy_X : bool, default=True\n If ``True``, X will be copied; else, it may be overwritten.\n\n verbose : bool or int, default=0\n Amount of verbosity.\n\n n_jobs : int, default=None\n Number of CPUs to use during the cross validation. Note that this is\n used only if multiple values for l1_ratio are given.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n random_state : int, RandomState instance, default=None\n The seed of the pseudo random number generator that selects a random\n feature to update. Used when ``selection`` == 'random'.\n Pass an int for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n selection : {'cyclic', 'random'}, default='cyclic'\n If set to 'random', a random coefficient is updated every iteration\n rather than looping over features sequentially by default. 
This\n (setting to 'random') often leads to significantly faster convergence\n especially when tol is higher than 1e-4.\n\n Attributes\n ----------\n intercept_ : ndarray of shape (n_targets,)\n Independent term in decision function.\n\n coef_ : ndarray of shape (n_targets, n_features)\n Parameter vector (W in the cost function formula).\n Note that ``coef_`` stores the transpose of ``W``, ``W.T``.\n\n alpha_ : float\n The amount of penalization chosen by cross validation.\n\n mse_path_ : ndarray of shape (n_alphas, n_folds) or (n_l1_ratio, n_alphas, n_folds)\n Mean square error for the test set on each fold, varying alpha.\n\n alphas_ : ndarray of shape (n_alphas,) or (n_l1_ratio, n_alphas)\n The grid of alphas used for fitting, for each l1_ratio.\n\n l1_ratio_ : float\n Best l1_ratio obtained by cross-validation.\n\n n_iter_ : int\n Number of iterations run by the coordinate descent solver to reach\n the specified tolerance for the optimal alpha.\n\n dual_gap_ : float\n The dual gap at the end of the optimization for the optimal alpha.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n MultiTaskElasticNet : Multi-task L1/L2 ElasticNet with built-in cross-validation.\n ElasticNetCV : Elastic net model with best model selection by\n cross-validation.\n MultiTaskLassoCV : Multi-task Lasso model trained with L1/L2\n mixed-norm as regularizer.\n\n Notes\n -----\n The algorithm used to fit the model is coordinate descent.\n\n To avoid unnecessary memory duplication the X and y arguments of the fit\n method should be directly passed as Fortran-contiguous numpy arrays.\n\n Examples\n --------\n >>> from sklearn import linear_model\n >>> clf = linear_model.MultiTaskElasticNetCV(cv=3)\n >>> clf.fit([[0,0], [1, 1], [2, 2]],\n ... [[0, 0], [1, 1], [2, 2]])\n MultiTaskElasticNetCV(cv=3)\n >>> print(clf.coef_)\n [[0.52875032 0.46958558]\n [0.52875032 0.46958558]]\n >>> print(clf.intercept_)\n [0.00166409 0.00166409]\n \"\"\"\n path = staticmethod(enet_path)\n \n def __init__(self, *, l1_ratio=0.5, eps=0.001, n_alphas=100, alphas=None, fit_intercept=True, normalize='deprecated', max_iter=1000, tol=0.0001, cv=None, copy_X=True, verbose=0, n_jobs=None, random_state=None, selection='cyclic'):\n self.l1_ratio = l1_ratio\n self.eps = eps\n self.n_alphas = n_alphas\n self.alphas = alphas\n self.fit_intercept = fit_intercept\n self.normalize = normalize\n self.max_iter = max_iter\n self.tol = tol\n self.cv = cv\n self.copy_X = copy_X\n self.verbose = verbose\n self.n_jobs = n_jobs\n self.random_state = random_state\n self.selection = selection\n \n def _get_estimator(self):\n return MultiTaskElasticNet()\n \n def _is_multitask(self):\n return True\n \n def _more_tags(self):\n return {'multioutput_only': True}\n \n def fit(self, X, y):\n \"\"\"Fit MultiTaskElasticNet model with coordinate descent.\n\n Fit is on grid of alphas and best alpha estimated by cross-validation.\n\n Parameters\n ----------\n X : ndarray of shape (n_samples, n_features)\n Training data.\n y : ndarray of shape (n_samples, n_targets)\n Training target variable. 
Will be cast to X's dtype if necessary.\n\n Returns\n -------\n self : object\n Returns MultiTaskElasticNet instance.\n \"\"\"\n return super().fit(X, y)\n" }, @@ -23520,7 +23587,7 @@ "sklearn.linear_model._coordinate_descent.MultiTaskLasso.__init__" ], "is_public": true, - "description": "Multi-task Lasso model trained with L1/L2 mixed-norm as regularizer.\n\nThe optimization objective for Lasso is:: (1 / (2 * n_samples)) * ||Y - XW||^2_Fro + alpha * ||W||_21 Where:: ||W||_21 = \\sum_i \\sqrt{\\sum_j w_{ij}^2} i.e. the sum of norm of each row. Read more in the :ref:`User Guide `.", + "description": "Multi-task Lasso model trained with L1/L2 mixed-norm as regularizer.\n\nThe optimization objective for Lasso is::\n\n (1 / (2 * n_samples)) * ||Y - XW||^2_Fro + alpha * ||W||_21\n\nWhere::\n\n ||W||_21 = \\sum_i \\sqrt{\\sum_j w_{ij}^2}\n\ni.e. the sum of norm of each row.\n\nRead more in the :ref:`User Guide `.", "docstring": "Multi-task Lasso model trained with L1/L2 mixed-norm as regularizer.\n\n The optimization objective for Lasso is::\n\n (1 / (2 * n_samples)) * ||Y - XW||^2_Fro + alpha * ||W||_21\n\n Where::\n\n ||W||_21 = \\sum_i \\sqrt{\\sum_j w_{ij}^2}\n\n i.e. the sum of norm of each row.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n alpha : float, default=1.0\n Constant that multiplies the L1/L2 term. Defaults to 1.0.\n\n fit_intercept : bool, default=True\n Whether to calculate the intercept for this model. If set\n to false, no intercept will be used in calculations\n (i.e. data is expected to be centered).\n\n normalize : bool, default=False\n This parameter is ignored when ``fit_intercept`` is set to False.\n If True, the regressors X will be normalized before regression by\n subtracting the mean and dividing by the l2-norm.\n If you wish to standardize, please use\n :class:`~sklearn.preprocessing.StandardScaler` before calling ``fit``\n on an estimator with ``normalize=False``.\n\n .. deprecated:: 1.0\n ``normalize`` was deprecated in version 1.0 and will be removed in\n 1.2.\n\n copy_X : bool, default=True\n If ``True``, X will be copied; else, it may be overwritten.\n\n max_iter : int, default=1000\n The maximum number of iterations.\n\n tol : float, default=1e-4\n The tolerance for the optimization: if the updates are\n smaller than ``tol``, the optimization code checks the\n dual gap for optimality and continues until it is smaller\n than ``tol``.\n\n warm_start : bool, default=False\n When set to ``True``, reuse the solution of the previous call to fit as\n initialization, otherwise, just erase the previous solution.\n See :term:`the Glossary `.\n\n random_state : int, RandomState instance, default=None\n The seed of the pseudo random number generator that selects a random\n feature to update. Used when ``selection`` == 'random'.\n Pass an int for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n selection : {'cyclic', 'random'}, default='cyclic'\n If set to 'random', a random coefficient is updated every iteration\n rather than looping over features sequentially by default. 
This\n (setting to 'random') often leads to significantly faster convergence\n especially when tol is higher than 1e-4.\n\n Attributes\n ----------\n coef_ : ndarray of shape (n_targets, n_features)\n Parameter vector (W in the cost function formula).\n Note that ``coef_`` stores the transpose of ``W``, ``W.T``.\n\n intercept_ : ndarray of shape (n_targets,)\n Independent term in decision function.\n\n n_iter_ : int\n Number of iterations run by the coordinate descent solver to reach\n the specified tolerance.\n\n dual_gap_ : ndarray of shape (n_alphas,)\n The dual gaps at the end of the optimization for each alpha.\n\n eps_ : float\n The tolerance scaled scaled by the variance of the target `y`.\n\n sparse_coef_ : sparse matrix of shape (n_features,) or (n_targets, n_features)\n Sparse representation of the `coef_`.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n Lasso: Linear Model trained with L1 prior as regularizer (aka the Lasso).\n MultiTaskLasso: Multi-task L1/L2 Lasso with built-in cross-validation.\n MultiTaskElasticNet: Multi-task L1/L2 ElasticNet with built-in cross-validation.\n\n Notes\n -----\n The algorithm used to fit the model is coordinate descent.\n\n To avoid unnecessary memory duplication the X and y arguments of the fit\n method should be directly passed as Fortran-contiguous numpy arrays.\n\n Examples\n --------\n >>> from sklearn import linear_model\n >>> clf = linear_model.MultiTaskLasso(alpha=0.1)\n >>> clf.fit([[0, 1], [1, 2], [2, 4]], [[0, 0], [1, 1], [2, 3]])\n MultiTaskLasso(alpha=0.1)\n >>> print(clf.coef_)\n [[0. 0.60809415]\n [0. 0.94592424]]\n >>> print(clf.intercept_)\n [-0.41888636 -0.87382323]\n ", "source_code": "\n\nclass MultiTaskLasso(MultiTaskElasticNet):\n \"\"\"Multi-task Lasso model trained with L1/L2 mixed-norm as regularizer.\n\n The optimization objective for Lasso is::\n\n (1 / (2 * n_samples)) * ||Y - XW||^2_Fro + alpha * ||W||_21\n\n Where::\n\n ||W||_21 = \\sum_i \\sqrt{\\sum_j w_{ij}^2}\n\n i.e. the sum of norm of each row.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n alpha : float, default=1.0\n Constant that multiplies the L1/L2 term. Defaults to 1.0.\n\n fit_intercept : bool, default=True\n Whether to calculate the intercept for this model. If set\n to false, no intercept will be used in calculations\n (i.e. data is expected to be centered).\n\n normalize : bool, default=False\n This parameter is ignored when ``fit_intercept`` is set to False.\n If True, the regressors X will be normalized before regression by\n subtracting the mean and dividing by the l2-norm.\n If you wish to standardize, please use\n :class:`~sklearn.preprocessing.StandardScaler` before calling ``fit``\n on an estimator with ``normalize=False``.\n\n .. 
deprecated:: 1.0\n ``normalize`` was deprecated in version 1.0 and will be removed in\n 1.2.\n\n copy_X : bool, default=True\n If ``True``, X will be copied; else, it may be overwritten.\n\n max_iter : int, default=1000\n The maximum number of iterations.\n\n tol : float, default=1e-4\n The tolerance for the optimization: if the updates are\n smaller than ``tol``, the optimization code checks the\n dual gap for optimality and continues until it is smaller\n than ``tol``.\n\n warm_start : bool, default=False\n When set to ``True``, reuse the solution of the previous call to fit as\n initialization, otherwise, just erase the previous solution.\n See :term:`the Glossary `.\n\n random_state : int, RandomState instance, default=None\n The seed of the pseudo random number generator that selects a random\n feature to update. Used when ``selection`` == 'random'.\n Pass an int for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n selection : {'cyclic', 'random'}, default='cyclic'\n If set to 'random', a random coefficient is updated every iteration\n rather than looping over features sequentially by default. This\n (setting to 'random') often leads to significantly faster convergence\n especially when tol is higher than 1e-4.\n\n Attributes\n ----------\n coef_ : ndarray of shape (n_targets, n_features)\n Parameter vector (W in the cost function formula).\n Note that ``coef_`` stores the transpose of ``W``, ``W.T``.\n\n intercept_ : ndarray of shape (n_targets,)\n Independent term in decision function.\n\n n_iter_ : int\n Number of iterations run by the coordinate descent solver to reach\n the specified tolerance.\n\n dual_gap_ : ndarray of shape (n_alphas,)\n The dual gaps at the end of the optimization for each alpha.\n\n eps_ : float\n The tolerance scaled scaled by the variance of the target `y`.\n\n sparse_coef_ : sparse matrix of shape (n_features,) or (n_targets, n_features)\n Sparse representation of the `coef_`.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n Lasso: Linear Model trained with L1 prior as regularizer (aka the Lasso).\n MultiTaskLasso: Multi-task L1/L2 Lasso with built-in cross-validation.\n MultiTaskElasticNet: Multi-task L1/L2 ElasticNet with built-in cross-validation.\n\n Notes\n -----\n The algorithm used to fit the model is coordinate descent.\n\n To avoid unnecessary memory duplication the X and y arguments of the fit\n method should be directly passed as Fortran-contiguous numpy arrays.\n\n Examples\n --------\n >>> from sklearn import linear_model\n >>> clf = linear_model.MultiTaskLasso(alpha=0.1)\n >>> clf.fit([[0, 1], [1, 2], [2, 4]], [[0, 0], [1, 1], [2, 3]])\n MultiTaskLasso(alpha=0.1)\n >>> print(clf.coef_)\n [[0. 0.60809415]\n [0. 
0.94592424]]\n >>> print(clf.intercept_)\n [-0.41888636 -0.87382323]\n \"\"\"\n \n def __init__(self, alpha=1.0, *, fit_intercept=True, normalize='deprecated', copy_X=True, max_iter=1000, tol=0.0001, warm_start=False, random_state=None, selection='cyclic'):\n self.alpha = alpha\n self.fit_intercept = fit_intercept\n self.normalize = normalize\n self.max_iter = max_iter\n self.copy_X = copy_X\n self.tol = tol\n self.warm_start = warm_start\n self.l1_ratio = 1.0\n self.random_state = random_state\n self.selection = selection\n" }, @@ -23537,7 +23604,7 @@ "sklearn.linear_model._coordinate_descent.MultiTaskLassoCV.fit" ], "is_public": true, - "description": "Multi-task Lasso model trained with L1/L2 mixed-norm as regularizer.\n\nSee glossary entry for :term:`cross-validation estimator`. The optimization objective for MultiTaskLasso is:: (1 / (2 * n_samples)) * ||Y - XW||^Fro_2 + alpha * ||W||_21 Where:: ||W||_21 = \\sum_i \\sqrt{\\sum_j w_{ij}^2} i.e. the sum of norm of each row. Read more in the :ref:`User Guide `. .. versionadded:: 0.15", + "description": "Multi-task Lasso model trained with L1/L2 mixed-norm as regularizer.\n\nSee glossary entry for :term:`cross-validation estimator`.\n\nThe optimization objective for MultiTaskLasso is::\n\n (1 / (2 * n_samples)) * ||Y - XW||^Fro_2 + alpha * ||W||_21\n\nWhere::\n\n ||W||_21 = \\sum_i \\sqrt{\\sum_j w_{ij}^2}\n\ni.e. the sum of norm of each row.\n\nRead more in the :ref:`User Guide `.\n\n.. versionadded:: 0.15", "docstring": "Multi-task Lasso model trained with L1/L2 mixed-norm as regularizer.\n\n See glossary entry for :term:`cross-validation estimator`.\n\n The optimization objective for MultiTaskLasso is::\n\n (1 / (2 * n_samples)) * ||Y - XW||^Fro_2 + alpha * ||W||_21\n\n Where::\n\n ||W||_21 = \\sum_i \\sqrt{\\sum_j w_{ij}^2}\n\n i.e. the sum of norm of each row.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.15\n\n Parameters\n ----------\n eps : float, default=1e-3\n Length of the path. ``eps=1e-3`` means that\n ``alpha_min / alpha_max = 1e-3``.\n\n n_alphas : int, default=100\n Number of alphas along the regularization path.\n\n alphas : array-like, default=None\n List of alphas where to compute the models.\n If not provided, set automatically.\n\n fit_intercept : bool, default=True\n Whether to calculate the intercept for this model. If set\n to false, no intercept will be used in calculations\n (i.e. data is expected to be centered).\n\n normalize : bool, default=False\n This parameter is ignored when ``fit_intercept`` is set to False.\n If True, the regressors X will be normalized before regression by\n subtracting the mean and dividing by the l2-norm.\n If you wish to standardize, please use\n :class:`~sklearn.preprocessing.StandardScaler` before calling ``fit``\n on an estimator with ``normalize=False``.\n\n .. 
deprecated:: 1.0\n ``normalize`` was deprecated in version 1.0 and will be removed in\n 1.2.\n\n max_iter : int, default=1000\n The maximum number of iterations.\n\n tol : float, default=1e-4\n The tolerance for the optimization: if the updates are\n smaller than ``tol``, the optimization code checks the\n dual gap for optimality and continues until it is smaller\n than ``tol``.\n\n copy_X : bool, default=True\n If ``True``, X will be copied; else, it may be overwritten.\n\n cv : int, cross-validation generator or iterable, default=None\n Determines the cross-validation splitting strategy.\n Possible inputs for cv are:\n\n - None, to use the default 5-fold cross-validation,\n - int, to specify the number of folds.\n - :term:`CV splitter`,\n - An iterable yielding (train, test) splits as arrays of indices.\n\n For int/None inputs, :class:`KFold` is used.\n\n Refer :ref:`User Guide ` for the various\n cross-validation strategies that can be used here.\n\n .. versionchanged:: 0.22\n ``cv`` default value if None changed from 3-fold to 5-fold.\n\n verbose : bool or int, default=False\n Amount of verbosity.\n\n n_jobs : int, default=None\n Number of CPUs to use during the cross validation. Note that this is\n used only if multiple values for l1_ratio are given.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n random_state : int, RandomState instance, default=None\n The seed of the pseudo random number generator that selects a random\n feature to update. Used when ``selection`` == 'random'.\n Pass an int for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n selection : {'cyclic', 'random'}, default='cyclic'\n If set to 'random', a random coefficient is updated every iteration\n rather than looping over features sequentially by default. This\n (setting to 'random') often leads to significantly faster convergence\n especially when tol is higher than 1e-4.\n\n Attributes\n ----------\n intercept_ : ndarray of shape (n_targets,)\n Independent term in decision function.\n\n coef_ : ndarray of shape (n_targets, n_features)\n Parameter vector (W in the cost function formula).\n Note that ``coef_`` stores the transpose of ``W``, ``W.T``.\n\n alpha_ : float\n The amount of penalization chosen by cross validation.\n\n mse_path_ : ndarray of shape (n_alphas, n_folds)\n Mean square error for the test set on each fold, varying alpha.\n\n alphas_ : ndarray of shape (n_alphas,)\n The grid of alphas used for fitting.\n\n n_iter_ : int\n Number of iterations run by the coordinate descent solver to reach\n the specified tolerance for the optimal alpha.\n\n dual_gap_ : float\n The dual gap at the end of the optimization for the optimal alpha.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. 
versionadded:: 1.0\n\n See Also\n --------\n MultiTaskElasticNet : Multi-task ElasticNet model trained with L1/L2\n mixed-norm as regularizer.\n ElasticNetCV : Elastic net model with best model selection by\n cross-validation.\n MultiTaskElasticNetCV : Multi-task L1/L2 ElasticNet with built-in\n cross-validation.\n\n Notes\n -----\n The algorithm used to fit the model is coordinate descent.\n\n To avoid unnecessary memory duplication the X and y arguments of the fit\n method should be directly passed as Fortran-contiguous numpy arrays.\n\n Examples\n --------\n >>> from sklearn.linear_model import MultiTaskLassoCV\n >>> from sklearn.datasets import make_regression\n >>> from sklearn.metrics import r2_score\n >>> X, y = make_regression(n_targets=2, noise=4, random_state=0)\n >>> reg = MultiTaskLassoCV(cv=5, random_state=0).fit(X, y)\n >>> r2_score(y, reg.predict(X))\n 0.9994...\n >>> reg.alpha_\n 0.5713...\n >>> reg.predict(X[:1,])\n array([[153.7971..., 94.9015...]])\n ", "source_code": "\n\nclass MultiTaskLassoCV(RegressorMixin, LinearModelCV):\n \"\"\"Multi-task Lasso model trained with L1/L2 mixed-norm as regularizer.\n\n See glossary entry for :term:`cross-validation estimator`.\n\n The optimization objective for MultiTaskLasso is::\n\n (1 / (2 * n_samples)) * ||Y - XW||^Fro_2 + alpha * ||W||_21\n\n Where::\n\n ||W||_21 = \\sum_i \\sqrt{\\sum_j w_{ij}^2}\n\n i.e. the sum of norm of each row.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.15\n\n Parameters\n ----------\n eps : float, default=1e-3\n Length of the path. ``eps=1e-3`` means that\n ``alpha_min / alpha_max = 1e-3``.\n\n n_alphas : int, default=100\n Number of alphas along the regularization path.\n\n alphas : array-like, default=None\n List of alphas where to compute the models.\n If not provided, set automatically.\n\n fit_intercept : bool, default=True\n Whether to calculate the intercept for this model. If set\n to false, no intercept will be used in calculations\n (i.e. data is expected to be centered).\n\n normalize : bool, default=False\n This parameter is ignored when ``fit_intercept`` is set to False.\n If True, the regressors X will be normalized before regression by\n subtracting the mean and dividing by the l2-norm.\n If you wish to standardize, please use\n :class:`~sklearn.preprocessing.StandardScaler` before calling ``fit``\n on an estimator with ``normalize=False``.\n\n .. deprecated:: 1.0\n ``normalize`` was deprecated in version 1.0 and will be removed in\n 1.2.\n\n max_iter : int, default=1000\n The maximum number of iterations.\n\n tol : float, default=1e-4\n The tolerance for the optimization: if the updates are\n smaller than ``tol``, the optimization code checks the\n dual gap for optimality and continues until it is smaller\n than ``tol``.\n\n copy_X : bool, default=True\n If ``True``, X will be copied; else, it may be overwritten.\n\n cv : int, cross-validation generator or iterable, default=None\n Determines the cross-validation splitting strategy.\n Possible inputs for cv are:\n\n - None, to use the default 5-fold cross-validation,\n - int, to specify the number of folds.\n - :term:`CV splitter`,\n - An iterable yielding (train, test) splits as arrays of indices.\n\n For int/None inputs, :class:`KFold` is used.\n\n Refer :ref:`User Guide ` for the various\n cross-validation strategies that can be used here.\n\n .. 
versionchanged:: 0.22\n ``cv`` default value if None changed from 3-fold to 5-fold.\n\n verbose : bool or int, default=False\n Amount of verbosity.\n\n n_jobs : int, default=None\n Number of CPUs to use during the cross validation. Note that this is\n used only if multiple values for l1_ratio are given.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n random_state : int, RandomState instance, default=None\n The seed of the pseudo random number generator that selects a random\n feature to update. Used when ``selection`` == 'random'.\n Pass an int for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n selection : {'cyclic', 'random'}, default='cyclic'\n If set to 'random', a random coefficient is updated every iteration\n rather than looping over features sequentially by default. This\n (setting to 'random') often leads to significantly faster convergence\n especially when tol is higher than 1e-4.\n\n Attributes\n ----------\n intercept_ : ndarray of shape (n_targets,)\n Independent term in decision function.\n\n coef_ : ndarray of shape (n_targets, n_features)\n Parameter vector (W in the cost function formula).\n Note that ``coef_`` stores the transpose of ``W``, ``W.T``.\n\n alpha_ : float\n The amount of penalization chosen by cross validation.\n\n mse_path_ : ndarray of shape (n_alphas, n_folds)\n Mean square error for the test set on each fold, varying alpha.\n\n alphas_ : ndarray of shape (n_alphas,)\n The grid of alphas used for fitting.\n\n n_iter_ : int\n Number of iterations run by the coordinate descent solver to reach\n the specified tolerance for the optimal alpha.\n\n dual_gap_ : float\n The dual gap at the end of the optimization for the optimal alpha.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. 
versionadded:: 1.0\n\n See Also\n --------\n MultiTaskElasticNet : Multi-task ElasticNet model trained with L1/L2\n mixed-norm as regularizer.\n ElasticNetCV : Elastic net model with best model selection by\n cross-validation.\n MultiTaskElasticNetCV : Multi-task L1/L2 ElasticNet with built-in\n cross-validation.\n\n Notes\n -----\n The algorithm used to fit the model is coordinate descent.\n\n To avoid unnecessary memory duplication the X and y arguments of the fit\n method should be directly passed as Fortran-contiguous numpy arrays.\n\n Examples\n --------\n >>> from sklearn.linear_model import MultiTaskLassoCV\n >>> from sklearn.datasets import make_regression\n >>> from sklearn.metrics import r2_score\n >>> X, y = make_regression(n_targets=2, noise=4, random_state=0)\n >>> reg = MultiTaskLassoCV(cv=5, random_state=0).fit(X, y)\n >>> r2_score(y, reg.predict(X))\n 0.9994...\n >>> reg.alpha_\n 0.5713...\n >>> reg.predict(X[:1,])\n array([[153.7971..., 94.9015...]])\n \"\"\"\n path = staticmethod(lasso_path)\n \n def __init__(self, *, eps=0.001, n_alphas=100, alphas=None, fit_intercept=True, normalize='deprecated', max_iter=1000, tol=0.0001, copy_X=True, cv=None, verbose=False, n_jobs=None, random_state=None, selection='cyclic'):\n super().__init__(eps=eps, n_alphas=n_alphas, alphas=alphas, fit_intercept=fit_intercept, normalize=normalize, max_iter=max_iter, tol=tol, copy_X=copy_X, cv=cv, verbose=verbose, n_jobs=n_jobs, random_state=random_state, selection=selection)\n \n def _get_estimator(self):\n return MultiTaskLasso()\n \n def _is_multitask(self):\n return True\n \n def _more_tags(self):\n return {'multioutput_only': True}\n \n def fit(self, X, y):\n \"\"\"Fit MultiTaskLasso model with coordinate descent.\n\n Fit is on grid of alphas and best alpha estimated by cross-validation.\n\n Parameters\n ----------\n X : ndarray of shape (n_samples, n_features)\n Data.\n y : ndarray of shape (n_samples, n_targets)\n Target. Will be cast to X's dtype if necessary.\n\n Returns\n -------\n self : object\n Returns an instance of fitted model.\n \"\"\"\n return super().fit(X, y)\n" }, @@ -23552,7 +23619,7 @@ "sklearn.linear_model._glm.glm.GammaRegressor.family@setter" ], "is_public": true, - "description": "Generalized Linear Model with a Gamma distribution.\n\nThis regressor uses the 'log' link function. Read more in the :ref:`User Guide `. .. versionadded:: 0.23", + "description": "Generalized Linear Model with a Gamma distribution.\n\nThis regressor uses the 'log' link function.\n\nRead more in the :ref:`User Guide `.\n\n.. versionadded:: 0.23", "docstring": "Generalized Linear Model with a Gamma distribution.\n\n This regressor uses the 'log' link function.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.23\n\n Parameters\n ----------\n alpha : float, default=1\n Constant that multiplies the penalty term and thus determines the\n regularization strength. ``alpha = 0`` is equivalent to unpenalized\n GLMs. In this case, the design matrix `X` must have full column rank\n (no collinearities).\n\n fit_intercept : bool, default=True\n Specifies if a constant (a.k.a. bias or intercept) should be\n added to the linear predictor (X @ coef + intercept).\n\n max_iter : int, default=100\n The maximal number of iterations for the solver.\n\n tol : float, default=1e-4\n Stopping criterion. 
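A minimal usage sketch for the MultiTaskLassoCV entry above. The dataset shape, noise level, and the Fortran-contiguous copy are illustrative assumptions; the Fortran layout only follows the Notes section's advice about avoiding memory duplication.

import numpy as np
from sklearn.datasets import make_regression
from sklearn.linear_model import MultiTaskLassoCV

X, y = make_regression(n_samples=100, n_targets=3, noise=2.0, random_state=0)
# Fortran-contiguous input avoids an extra copy inside the coordinate descent solver.
X = np.asfortranarray(X)
reg = MultiTaskLassoCV(cv=5, random_state=0).fit(X, y)
print(reg.alpha_)            # penalty chosen by cross-validation
print(reg.mse_path_.shape)   # (n_alphas, n_folds), as documented above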
For the lbfgs solver,\n the iteration will stop when ``max{|g_j|, j = 1, ..., d} <= tol``\n where ``g_j`` is the j-th component of the gradient (derivative) of\n the objective function.\n\n warm_start : bool, default=False\n If set to ``True``, reuse the solution of the previous call to ``fit``\n as initialization for ``coef_`` and ``intercept_`` .\n\n verbose : int, default=0\n For the lbfgs solver set verbose to any positive number for verbosity.\n\n Attributes\n ----------\n coef_ : array of shape (n_features,)\n Estimated coefficients for the linear predictor (`X * coef_ +\n intercept_`) in the GLM.\n\n intercept_ : float\n Intercept (a.k.a. bias) added to linear predictor.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n n_iter_ : int\n Actual number of iterations used in the solver.\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n PoissonRegressor : Generalized Linear Model with a Poisson distribution.\n TweedieRegressor : Generalized Linear Model with a Tweedie distribution.\n\n Examples\n --------\n >>> from sklearn import linear_model\n >>> clf = linear_model.GammaRegressor()\n >>> X = [[1, 2], [2, 3], [3, 4], [4, 3]]\n >>> y = [19, 26, 33, 30]\n >>> clf.fit(X, y)\n GammaRegressor()\n >>> clf.score(X, y)\n 0.773...\n >>> clf.coef_\n array([0.072..., 0.066...])\n >>> clf.intercept_\n 2.896...\n >>> clf.predict([[1, 0], [2, 8]])\n array([19.483..., 35.795...])\n ", "source_code": "\n\nclass GammaRegressor(GeneralizedLinearRegressor):\n \"\"\"Generalized Linear Model with a Gamma distribution.\n\n This regressor uses the 'log' link function.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.23\n\n Parameters\n ----------\n alpha : float, default=1\n Constant that multiplies the penalty term and thus determines the\n regularization strength. ``alpha = 0`` is equivalent to unpenalized\n GLMs. In this case, the design matrix `X` must have full column rank\n (no collinearities).\n\n fit_intercept : bool, default=True\n Specifies if a constant (a.k.a. bias or intercept) should be\n added to the linear predictor (X @ coef + intercept).\n\n max_iter : int, default=100\n The maximal number of iterations for the solver.\n\n tol : float, default=1e-4\n Stopping criterion. For the lbfgs solver,\n the iteration will stop when ``max{|g_j|, j = 1, ..., d} <= tol``\n where ``g_j`` is the j-th component of the gradient (derivative) of\n the objective function.\n\n warm_start : bool, default=False\n If set to ``True``, reuse the solution of the previous call to ``fit``\n as initialization for ``coef_`` and ``intercept_`` .\n\n verbose : int, default=0\n For the lbfgs solver set verbose to any positive number for verbosity.\n\n Attributes\n ----------\n coef_ : array of shape (n_features,)\n Estimated coefficients for the linear predictor (`X * coef_ +\n intercept_`) in the GLM.\n\n intercept_ : float\n Intercept (a.k.a. bias) added to linear predictor.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n n_iter_ : int\n Actual number of iterations used in the solver.\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. 
versionadded:: 1.0\n\n See Also\n --------\n PoissonRegressor : Generalized Linear Model with a Poisson distribution.\n TweedieRegressor : Generalized Linear Model with a Tweedie distribution.\n\n Examples\n --------\n >>> from sklearn import linear_model\n >>> clf = linear_model.GammaRegressor()\n >>> X = [[1, 2], [2, 3], [3, 4], [4, 3]]\n >>> y = [19, 26, 33, 30]\n >>> clf.fit(X, y)\n GammaRegressor()\n >>> clf.score(X, y)\n 0.773...\n >>> clf.coef_\n array([0.072..., 0.066...])\n >>> clf.intercept_\n 2.896...\n >>> clf.predict([[1, 0], [2, 8]])\n array([19.483..., 35.795...])\n \"\"\"\n \n def __init__(self, *, alpha=1.0, fit_intercept=True, max_iter=100, tol=0.0001, warm_start=False, verbose=0):\n super().__init__(alpha=alpha, fit_intercept=fit_intercept, family='gamma', link='log', max_iter=max_iter, tol=tol, warm_start=warm_start, verbose=verbose)\n \n @property\n def family(self):\n \"\"\"Return the family of the regressor.\"\"\"\n return 'gamma'\n \n @family.setter\n def family(self, value):\n if value != 'gamma':\n raise ValueError(\"GammaRegressor.family must be 'gamma'!\")\n" }, @@ -23570,7 +23637,7 @@ "sklearn.linear_model._glm.glm.GeneralizedLinearRegressor._more_tags" ], "is_public": true, - "description": "Regression via a penalized Generalized Linear Model (GLM).\n\nGLMs based on a reproductive Exponential Dispersion Model (EDM) aim at fitting and predicting the mean of the target y as y_pred=h(X*w). Therefore, the fit minimizes the following objective function with L2 priors as regularizer:: 1/(2*sum(s)) * deviance(y, h(X*w); s) + 1/2 * alpha * |w|_2 with inverse link function h and s=sample_weight. The parameter ``alpha`` corresponds to the lambda parameter in glmnet. Read more in the :ref:`User Guide `. .. versionadded:: 0.23", + "description": "Regression via a penalized Generalized Linear Model (GLM).\n\nGLMs based on a reproductive Exponential Dispersion Model (EDM) aim at\nfitting and predicting the mean of the target y as y_pred=h(X*w).\nTherefore, the fit minimizes the following objective function with L2\npriors as regularizer::\n\n 1/(2*sum(s)) * deviance(y, h(X*w); s)\n + 1/2 * alpha * |w|_2\n\nwith inverse link function h and s=sample_weight.\nThe parameter ``alpha`` corresponds to the lambda parameter in glmnet.\n\nRead more in the :ref:`User Guide `.\n\n.. versionadded:: 0.23", "docstring": "Regression via a penalized Generalized Linear Model (GLM).\n\n GLMs based on a reproductive Exponential Dispersion Model (EDM) aim at\n fitting and predicting the mean of the target y as y_pred=h(X*w).\n Therefore, the fit minimizes the following objective function with L2\n priors as regularizer::\n\n 1/(2*sum(s)) * deviance(y, h(X*w); s)\n + 1/2 * alpha * |w|_2\n\n with inverse link function h and s=sample_weight.\n The parameter ``alpha`` corresponds to the lambda parameter in glmnet.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.23\n\n Parameters\n ----------\n alpha : float, default=1\n Constant that multiplies the penalty term and thus determines the\n regularization strength. ``alpha = 0`` is equivalent to unpenalized\n GLMs. In this case, the design matrix `X` must have full column rank\n (no collinearities).\n\n fit_intercept : bool, default=True\n Specifies if a constant (a.k.a. bias or intercept) should be\n added to the linear predictor (X @ coef + intercept).\n\n family : {'normal', 'poisson', 'gamma', 'inverse-gaussian'} or an ExponentialDispersionModel instance, default='normal'\n The distributional assumption of the GLM, i.e. 
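A hedged sketch of fitting the GammaRegressor documented above. The synthetic data is made up for illustration; the only requirement taken from the entry is that the targets be strictly positive, as the Gamma deviance demands.

import numpy as np
from sklearn.linear_model import GammaRegressor

rng = np.random.RandomState(0)
X = rng.uniform(size=(200, 2))
# Strictly positive targets with a multiplicative (log-link) structure.
y = np.exp(1.0 + X @ np.array([0.5, -0.25])) * rng.gamma(shape=2.0, scale=0.5, size=200)
model = GammaRegressor(alpha=1.0).fit(X, y)
print(model.coef_, model.intercept_)
print(model.score(X, y))  # D^2, the fraction of Gamma deviance explained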
which distribution from\n the EDM, specifies the loss function to be minimized.\n\n link : {'auto', 'identity', 'log'} or an instance of class BaseLink, default='auto'\n The link function of the GLM, i.e. mapping from linear predictor\n `X @ coeff + intercept` to prediction `y_pred`. Option 'auto' sets\n the link depending on the chosen family as follows:\n\n - 'identity' for Normal distribution\n - 'log' for Poisson, Gamma and Inverse Gaussian distributions\n\n solver : 'lbfgs', default='lbfgs'\n Algorithm to use in the optimization problem:\n\n 'lbfgs'\n Calls scipy's L-BFGS-B optimizer.\n\n max_iter : int, default=100\n The maximal number of iterations for the solver.\n\n tol : float, default=1e-4\n Stopping criterion. For the lbfgs solver,\n the iteration will stop when ``max{|g_j|, j = 1, ..., d} <= tol``\n where ``g_j`` is the j-th component of the gradient (derivative) of\n the objective function.\n\n warm_start : bool, default=False\n If set to ``True``, reuse the solution of the previous call to ``fit``\n as initialization for ``coef_`` and ``intercept_``.\n\n verbose : int, default=0\n For the lbfgs solver set verbose to any positive number for verbosity.\n\n Attributes\n ----------\n coef_ : array of shape (n_features,)\n Estimated coefficients for the linear predictor (`X @ coef_ +\n intercept_`) in the GLM.\n\n intercept_ : float\n Intercept (a.k.a. bias) added to linear predictor.\n\n n_iter_ : int\n Actual number of iterations used in the solver.\n ", "source_code": "\n\nclass GeneralizedLinearRegressor(RegressorMixin, BaseEstimator):\n \"\"\"Regression via a penalized Generalized Linear Model (GLM).\n\n GLMs based on a reproductive Exponential Dispersion Model (EDM) aim at\n fitting and predicting the mean of the target y as y_pred=h(X*w).\n Therefore, the fit minimizes the following objective function with L2\n priors as regularizer::\n\n 1/(2*sum(s)) * deviance(y, h(X*w); s)\n + 1/2 * alpha * |w|_2\n\n with inverse link function h and s=sample_weight.\n The parameter ``alpha`` corresponds to the lambda parameter in glmnet.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.23\n\n Parameters\n ----------\n alpha : float, default=1\n Constant that multiplies the penalty term and thus determines the\n regularization strength. ``alpha = 0`` is equivalent to unpenalized\n GLMs. In this case, the design matrix `X` must have full column rank\n (no collinearities).\n\n fit_intercept : bool, default=True\n Specifies if a constant (a.k.a. bias or intercept) should be\n added to the linear predictor (X @ coef + intercept).\n\n family : {'normal', 'poisson', 'gamma', 'inverse-gaussian'} or an ExponentialDispersionModel instance, default='normal'\n The distributional assumption of the GLM, i.e. which distribution from\n the EDM, specifies the loss function to be minimized.\n\n link : {'auto', 'identity', 'log'} or an instance of class BaseLink, default='auto'\n The link function of the GLM, i.e. mapping from linear predictor\n `X @ coeff + intercept` to prediction `y_pred`. Option 'auto' sets\n the link depending on the chosen family as follows:\n\n - 'identity' for Normal distribution\n - 'log' for Poisson, Gamma and Inverse Gaussian distributions\n\n solver : 'lbfgs', default='lbfgs'\n Algorithm to use in the optimization problem:\n\n 'lbfgs'\n Calls scipy's L-BFGS-B optimizer.\n\n max_iter : int, default=100\n The maximal number of iterations for the solver.\n\n tol : float, default=1e-4\n Stopping criterion. 
For the lbfgs solver,\n the iteration will stop when ``max{|g_j|, j = 1, ..., d} <= tol``\n where ``g_j`` is the j-th component of the gradient (derivative) of\n the objective function.\n\n warm_start : bool, default=False\n If set to ``True``, reuse the solution of the previous call to ``fit``\n as initialization for ``coef_`` and ``intercept_``.\n\n verbose : int, default=0\n For the lbfgs solver set verbose to any positive number for verbosity.\n\n Attributes\n ----------\n coef_ : array of shape (n_features,)\n Estimated coefficients for the linear predictor (`X @ coef_ +\n intercept_`) in the GLM.\n\n intercept_ : float\n Intercept (a.k.a. bias) added to linear predictor.\n\n n_iter_ : int\n Actual number of iterations used in the solver.\n \"\"\"\n \n def __init__(self, *, alpha=1.0, fit_intercept=True, family='normal', link='auto', solver='lbfgs', max_iter=100, tol=0.0001, warm_start=False, verbose=0):\n self.alpha = alpha\n self.fit_intercept = fit_intercept\n self.family = family\n self.link = link\n self.solver = solver\n self.max_iter = max_iter\n self.tol = tol\n self.warm_start = warm_start\n self.verbose = verbose\n \n def fit(self, X, y, sample_weight=None):\n \"\"\"Fit a Generalized Linear Model.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training data.\n\n y : array-like of shape (n_samples,)\n Target values.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n Returns\n -------\n self : object\n Fitted model.\n \"\"\"\n if isinstance(self.family, ExponentialDispersionModel):\n self._family_instance = self.family\n elif self.family in EDM_DISTRIBUTIONS:\n self._family_instance = EDM_DISTRIBUTIONS[self.family]()\n else:\n raise ValueError(\"The family must be an instance of class ExponentialDispersionModel or an element of ['normal', 'poisson', 'gamma', 'inverse-gaussian']; got (family={0})\".format(self.family))\n if isinstance(self.link, BaseLink):\n self._link_instance = self.link\n elif self.link == 'auto':\n if isinstance(self._family_instance, TweedieDistribution):\n if self._family_instance.power <= 0:\n self._link_instance = IdentityLink()\n if self._family_instance.power >= 1:\n self._link_instance = LogLink()\n else:\n raise ValueError(\"No default link known for the specified distribution family. Please set link manually, i.e. 
not to 'auto'; got (link='auto', family={})\".format(self.family))\n elif self.link == 'identity':\n self._link_instance = IdentityLink()\n elif self.link == 'log':\n self._link_instance = LogLink()\n else:\n raise ValueError(\"The link must be an instance of class Link or an element of ['auto', 'identity', 'log']; got (link={0})\".format(self.link))\n if not isinstance(self.alpha, numbers.Number) or self.alpha < 0:\n raise ValueError('Penalty term must be a non-negative number; got (alpha={0})'.format(self.alpha))\n if not isinstance(self.fit_intercept, bool):\n raise ValueError('The argument fit_intercept must be bool; got {0}'.format(self.fit_intercept))\n if self.solver not in ['lbfgs']:\n raise ValueError(\"GeneralizedLinearRegressor supports only solvers'lbfgs'; got {0}\".format(self.solver))\n solver = self.solver\n if not isinstance(self.max_iter, numbers.Integral) or self.max_iter <= 0:\n raise ValueError('Maximum number of iteration must be a positive integer; got (max_iter={0!r})'.format(self.max_iter))\n if not isinstance(self.tol, numbers.Number) or self.tol <= 0:\n raise ValueError('Tolerance for stopping criteria must be positive; got (tol={0!r})'.format(self.tol))\n if not isinstance(self.warm_start, bool):\n raise ValueError('The argument warm_start must be bool; got {0}'.format(self.warm_start))\n family = self._family_instance\n link = self._link_instance\n (X, y) = self._validate_data(X, y, accept_sparse=['csc', 'csr'], dtype=[np.float64, np.float32], y_numeric=True, multi_output=False)\n weights = _check_sample_weight(sample_weight, X)\n (_, n_features) = X.shape\n if not np.all(family.in_y_range(y)):\n raise ValueError('Some value(s) of y are out of the valid range for family {0}'.format(family.__class__.__name__))\n weights = weights / weights.sum()\n if self.warm_start and hasattr(self, 'coef_'):\n if self.fit_intercept:\n coef = np.concatenate((np.array([self.intercept_]), self.coef_))\n else:\n coef = self.coef_\n elif self.fit_intercept:\n coef = np.zeros(n_features + 1)\n coef[0] = link(np.average(y, weights=weights))\n else:\n coef = np.zeros(n_features)\n if solver == 'lbfgs':\n \n def func(coef, X, y, weights, alpha, family, link):\n (y_pred, devp) = _y_pred_deviance_derivative(coef, X, y, weights, family, link)\n dev = family.deviance(y, y_pred, weights)\n offset = 1 if self.fit_intercept else 0\n coef_scaled = alpha * coef[offset:]\n obj = 0.5 * dev + 0.5 * (coef[offset:] @ coef_scaled)\n objp = 0.5 * devp\n objp[offset:] += coef_scaled\n return obj, objp\n args = (X, y, weights, self.alpha, family, link)\n opt_res = scipy.optimize.minimize(func, coef, method='L-BFGS-B', jac=True, options={'maxiter': self.max_iter, 'iprint': (self.verbose > 0) - 1, 'gtol': self.tol, 'ftol': 1000.0 * np.finfo(float).eps}, args=args)\n self.n_iter_ = _check_optimize_result('lbfgs', opt_res)\n coef = opt_res.x\n if self.fit_intercept:\n self.intercept_ = coef[0]\n self.coef_ = coef[1:]\n else:\n self.intercept_ = 0.0\n self.coef_ = coef\n return self\n \n def _linear_predictor(self, X):\n \"\"\"Compute the linear_predictor = `X @ coef_ + intercept_`.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Samples.\n\n Returns\n -------\n y_pred : array of shape (n_samples,)\n Returns predicted values of linear predictor.\n \"\"\"\n check_is_fitted(self)\n X = self._validate_data(X, accept_sparse=['csr', 'csc', 'coo'], dtype=[np.float64, np.float32], ensure_2d=True, allow_nd=False, reset=False)\n return X @ self.coef_ + self.intercept_\n 
\n def predict(self, X):\n \"\"\"Predict using GLM with feature matrix X.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Samples.\n\n Returns\n -------\n y_pred : array of shape (n_samples,)\n Returns predicted values.\n \"\"\"\n eta = self._linear_predictor(X)\n y_pred = self._link_instance.inverse(eta)\n return y_pred\n \n def score(self, X, y, sample_weight=None):\n \"\"\"Compute D^2, the percentage of deviance explained.\n\n D^2 is a generalization of the coefficient of determination R^2.\n R^2 uses squared error and D^2 deviance. Note that those two are equal\n for ``family='normal'``.\n\n D^2 is defined as\n :math:`D^2 = 1-\\frac{D(y_{true},y_{pred})}{D_{null}}`,\n :math:`D_{null}` is the null deviance, i.e. the deviance of a model\n with intercept alone, which corresponds to :math:`y_{pred} = \\bar{y}`.\n The mean :math:`\\bar{y}` is averaged by sample_weight.\n Best possible score is 1.0 and it can be negative (because the model\n can be arbitrarily worse).\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Test samples.\n\n y : array-like of shape (n_samples,)\n True values of target.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n Returns\n -------\n score : float\n D^2 of self.predict(X) w.r.t. y.\n \"\"\"\n weights = _check_sample_weight(sample_weight, X)\n y_pred = self.predict(X)\n dev = self._family_instance.deviance(y, y_pred, weights=weights)\n y_mean = np.average(y, weights=weights)\n dev_null = self._family_instance.deviance(y, y_mean, weights=weights)\n return 1 - dev / dev_null\n \n def _more_tags(self):\n if hasattr(self, '_family_instance'):\n _family_instance = self._family_instance\n elif isinstance(self.family, ExponentialDispersionModel):\n _family_instance = self.family\n elif self.family in EDM_DISTRIBUTIONS:\n _family_instance = EDM_DISTRIBUTIONS[self.family]()\n else:\n raise ValueError\n return {'requires_positive_y': not _family_instance.in_y_range(-1.0)}\n" }, @@ -23585,7 +23652,7 @@ "sklearn.linear_model._glm.glm.PoissonRegressor.family@setter" ], "is_public": true, - "description": "Generalized Linear Model with a Poisson distribution.\n\nThis regressor uses the 'log' link function. Read more in the :ref:`User Guide `. .. versionadded:: 0.23", + "description": "Generalized Linear Model with a Poisson distribution.\n\nThis regressor uses the 'log' link function.\n\nRead more in the :ref:`User Guide `.\n\n.. versionadded:: 0.23", "docstring": "Generalized Linear Model with a Poisson distribution.\n\n This regressor uses the 'log' link function.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.23\n\n Parameters\n ----------\n alpha : float, default=1\n Constant that multiplies the penalty term and thus determines the\n regularization strength. ``alpha = 0`` is equivalent to unpenalized\n GLMs. In this case, the design matrix `X` must have full column rank\n (no collinearities).\n\n fit_intercept : bool, default=True\n Specifies if a constant (a.k.a. bias or intercept) should be\n added to the linear predictor (X @ coef + intercept).\n\n max_iter : int, default=100\n The maximal number of iterations for the solver.\n\n tol : float, default=1e-4\n Stopping criterion. 
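The entry above states the penalized deviance objective in prose; below is a sketch of fitting it through the base class directly. Importing `GeneralizedLinearRegressor` via the private `_glm` path mirrors this entry's qualified name but is an assumption that may not hold across scikit-learn versions, and the data and penalty are illustrative only.

import numpy as np
from sklearn.linear_model._glm import GeneralizedLinearRegressor

rng = np.random.RandomState(42)
X = rng.uniform(size=(300, 3))
# Poisson-distributed counts with a log-linear mean.
y = rng.poisson(lam=np.exp(X @ np.array([0.3, -0.2, 0.5])))
glm = GeneralizedLinearRegressor(family='poisson', link='log', alpha=0.01)
glm.fit(X, y)
print(glm.coef_, glm.intercept_)
print(glm.n_iter_)       # iterations used by the lbfgs solver
print(glm.score(X, y))   # D^2 as defined in the score() docstring above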
For the lbfgs solver,\n the iteration will stop when ``max{|g_j|, j = 1, ..., d} <= tol``\n where ``g_j`` is the j-th component of the gradient (derivative) of\n the objective function.\n\n warm_start : bool, default=False\n If set to ``True``, reuse the solution of the previous call to ``fit``\n as initialization for ``coef_`` and ``intercept_`` .\n\n verbose : int, default=0\n For the lbfgs solver set verbose to any positive number for verbosity.\n\n Attributes\n ----------\n coef_ : array of shape (n_features,)\n Estimated coefficients for the linear predictor (`X @ coef_ +\n intercept_`) in the GLM.\n\n intercept_ : float\n Intercept (a.k.a. bias) added to linear predictor.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n n_iter_ : int\n Actual number of iterations used in the solver.\n\n Examples\n ----------\n >>> from sklearn import linear_model\n >>> clf = linear_model.PoissonRegressor()\n >>> X = [[1, 2], [2, 3], [3, 4], [4, 3]]\n >>> y = [12, 17, 22, 21]\n >>> clf.fit(X, y)\n PoissonRegressor()\n >>> clf.score(X, y)\n 0.990...\n >>> clf.coef_\n array([0.121..., 0.158...])\n >>> clf.intercept_\n 2.088...\n >>> clf.predict([[1, 1], [3, 4]])\n array([10.676..., 21.875...])\n\n See Also\n ----------\n GeneralizedLinearRegressor : Generalized Linear Model with a Poisson\n distribution.\n ", "source_code": "\n\nclass PoissonRegressor(GeneralizedLinearRegressor):\n \"\"\"Generalized Linear Model with a Poisson distribution.\n\n This regressor uses the 'log' link function.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.23\n\n Parameters\n ----------\n alpha : float, default=1\n Constant that multiplies the penalty term and thus determines the\n regularization strength. ``alpha = 0`` is equivalent to unpenalized\n GLMs. In this case, the design matrix `X` must have full column rank\n (no collinearities).\n\n fit_intercept : bool, default=True\n Specifies if a constant (a.k.a. bias or intercept) should be\n added to the linear predictor (X @ coef + intercept).\n\n max_iter : int, default=100\n The maximal number of iterations for the solver.\n\n tol : float, default=1e-4\n Stopping criterion. For the lbfgs solver,\n the iteration will stop when ``max{|g_j|, j = 1, ..., d} <= tol``\n where ``g_j`` is the j-th component of the gradient (derivative) of\n the objective function.\n\n warm_start : bool, default=False\n If set to ``True``, reuse the solution of the previous call to ``fit``\n as initialization for ``coef_`` and ``intercept_`` .\n\n verbose : int, default=0\n For the lbfgs solver set verbose to any positive number for verbosity.\n\n Attributes\n ----------\n coef_ : array of shape (n_features,)\n Estimated coefficients for the linear predictor (`X @ coef_ +\n intercept_`) in the GLM.\n\n intercept_ : float\n Intercept (a.k.a. bias) added to linear predictor.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. 
versionadded:: 1.0\n\n n_iter_ : int\n Actual number of iterations used in the solver.\n\n Examples\n ----------\n >>> from sklearn import linear_model\n >>> clf = linear_model.PoissonRegressor()\n >>> X = [[1, 2], [2, 3], [3, 4], [4, 3]]\n >>> y = [12, 17, 22, 21]\n >>> clf.fit(X, y)\n PoissonRegressor()\n >>> clf.score(X, y)\n 0.990...\n >>> clf.coef_\n array([0.121..., 0.158...])\n >>> clf.intercept_\n 2.088...\n >>> clf.predict([[1, 1], [3, 4]])\n array([10.676..., 21.875...])\n\n See Also\n ----------\n GeneralizedLinearRegressor : Generalized Linear Model with a Poisson\n distribution.\n \"\"\"\n \n def __init__(self, *, alpha=1.0, fit_intercept=True, max_iter=100, tol=0.0001, warm_start=False, verbose=0):\n super().__init__(alpha=alpha, fit_intercept=fit_intercept, family='poisson', link='log', max_iter=max_iter, tol=tol, warm_start=warm_start, verbose=verbose)\n \n @property\n def family(self):\n \"\"\"Return the string `'poisson'`.\"\"\"\n return 'poisson'\n \n @family.setter\n def family(self, value):\n if value != 'poisson':\n raise ValueError(\"PoissonRegressor.family must be 'poisson'!\")\n" }, @@ -23600,7 +23667,7 @@ "sklearn.linear_model._glm.glm.TweedieRegressor.family@setter" ], "is_public": true, - "description": "Generalized Linear Model with a Tweedie distribution.\n\nThis estimator can be used to model different GLMs depending on the ``power`` parameter, which determines the underlying distribution. Read more in the :ref:`User Guide `. .. versionadded:: 0.23", + "description": "Generalized Linear Model with a Tweedie distribution.\n\nThis estimator can be used to model different GLMs depending on the\n``power`` parameter, which determines the underlying distribution.\n\nRead more in the :ref:`User Guide `.\n\n.. versionadded:: 0.23", "docstring": "Generalized Linear Model with a Tweedie distribution.\n\n This estimator can be used to model different GLMs depending on the\n ``power`` parameter, which determines the underlying distribution.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.23\n\n Parameters\n ----------\n power : float, default=0\n The power determines the underlying target distribution according\n to the following table:\n\n +-------+------------------------+\n | Power | Distribution |\n +=======+========================+\n | 0 | Normal |\n +-------+------------------------+\n | 1 | Poisson |\n +-------+------------------------+\n | (1,2) | Compound Poisson Gamma |\n +-------+------------------------+\n | 2 | Gamma |\n +-------+------------------------+\n | 3 | Inverse Gaussian |\n +-------+------------------------+\n\n For ``0 < power < 1``, no distribution exists.\n\n alpha : float, default=1\n Constant that multiplies the penalty term and thus determines the\n regularization strength. ``alpha = 0`` is equivalent to unpenalized\n GLMs. In this case, the design matrix `X` must have full column rank\n (no collinearities).\n\n fit_intercept : bool, default=True\n Specifies if a constant (a.k.a. bias or intercept) should be\n added to the linear predictor (X @ coef + intercept).\n\n link : {'auto', 'identity', 'log'}, default='auto'\n The link function of the GLM, i.e. mapping from linear predictor\n `X @ coeff + intercept` to prediction `y_pred`. 
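A minimal sketch for the PoissonRegressor entry: count-valued targets and the fixed 'log' link. The data, the `alpha` value, and the uniform sample weights are assumptions made only to exercise the `fit(X, y, sample_weight=...)` signature inherited from the base GLM.

import numpy as np
from sklearn.linear_model import PoissonRegressor

rng = np.random.RandomState(0)
X = rng.uniform(size=(150, 2))
y = rng.poisson(lam=np.exp(0.5 + X @ np.array([0.8, -0.3])))
weights = np.ones(len(y))  # uniform weights, purely illustrative
reg = PoissonRegressor(alpha=0.1).fit(X, y, sample_weight=weights)
print(reg.coef_, reg.intercept_)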
Option 'auto' sets\n the link depending on the chosen family as follows:\n\n - 'identity' for Normal distribution\n - 'log' for Poisson, Gamma and Inverse Gaussian distributions\n\n max_iter : int, default=100\n The maximal number of iterations for the solver.\n\n tol : float, default=1e-4\n Stopping criterion. For the lbfgs solver,\n the iteration will stop when ``max{|g_j|, j = 1, ..., d} <= tol``\n where ``g_j`` is the j-th component of the gradient (derivative) of\n the objective function.\n\n warm_start : bool, default=False\n If set to ``True``, reuse the solution of the previous call to ``fit``\n as initialization for ``coef_`` and ``intercept_`` .\n\n verbose : int, default=0\n For the lbfgs solver set verbose to any positive number for verbosity.\n\n Attributes\n ----------\n coef_ : array of shape (n_features,)\n Estimated coefficients for the linear predictor (`X @ coef_ +\n intercept_`) in the GLM.\n\n intercept_ : float\n Intercept (a.k.a. bias) added to linear predictor.\n\n n_iter_ : int\n Actual number of iterations used in the solver.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n PoissonRegressor : Generalized Linear Model with a Poisson distribution.\n GammaRegressor : Generalized Linear Model with a Gamma distribution.\n\n Examples\n ----------\n >>> from sklearn import linear_model\n >>> clf = linear_model.TweedieRegressor()\n >>> X = [[1, 2], [2, 3], [3, 4], [4, 3]]\n >>> y = [2, 3.5, 5, 5.5]\n >>> clf.fit(X, y)\n TweedieRegressor()\n >>> clf.score(X, y)\n 0.839...\n >>> clf.coef_\n array([0.599..., 0.299...])\n >>> clf.intercept_\n 1.600...\n >>> clf.predict([[1, 1], [3, 4]])\n array([2.500..., 4.599...])\n ", "source_code": "\n\nclass TweedieRegressor(GeneralizedLinearRegressor):\n \"\"\"Generalized Linear Model with a Tweedie distribution.\n\n This estimator can be used to model different GLMs depending on the\n ``power`` parameter, which determines the underlying distribution.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.23\n\n Parameters\n ----------\n power : float, default=0\n The power determines the underlying target distribution according\n to the following table:\n\n +-------+------------------------+\n | Power | Distribution |\n +=======+========================+\n | 0 | Normal |\n +-------+------------------------+\n | 1 | Poisson |\n +-------+------------------------+\n | (1,2) | Compound Poisson Gamma |\n +-------+------------------------+\n | 2 | Gamma |\n +-------+------------------------+\n | 3 | Inverse Gaussian |\n +-------+------------------------+\n\n For ``0 < power < 1``, no distribution exists.\n\n alpha : float, default=1\n Constant that multiplies the penalty term and thus determines the\n regularization strength. ``alpha = 0`` is equivalent to unpenalized\n GLMs. In this case, the design matrix `X` must have full column rank\n (no collinearities).\n\n fit_intercept : bool, default=True\n Specifies if a constant (a.k.a. bias or intercept) should be\n added to the linear predictor (X @ coef + intercept).\n\n link : {'auto', 'identity', 'log'}, default='auto'\n The link function of the GLM, i.e. mapping from linear predictor\n `X @ coeff + intercept` to prediction `y_pred`. 
Option 'auto' sets\n the link depending on the chosen family as follows:\n\n - 'identity' for Normal distribution\n - 'log' for Poisson, Gamma and Inverse Gaussian distributions\n\n max_iter : int, default=100\n The maximal number of iterations for the solver.\n\n tol : float, default=1e-4\n Stopping criterion. For the lbfgs solver,\n the iteration will stop when ``max{|g_j|, j = 1, ..., d} <= tol``\n where ``g_j`` is the j-th component of the gradient (derivative) of\n the objective function.\n\n warm_start : bool, default=False\n If set to ``True``, reuse the solution of the previous call to ``fit``\n as initialization for ``coef_`` and ``intercept_`` .\n\n verbose : int, default=0\n For the lbfgs solver set verbose to any positive number for verbosity.\n\n Attributes\n ----------\n coef_ : array of shape (n_features,)\n Estimated coefficients for the linear predictor (`X @ coef_ +\n intercept_`) in the GLM.\n\n intercept_ : float\n Intercept (a.k.a. bias) added to linear predictor.\n\n n_iter_ : int\n Actual number of iterations used in the solver.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n PoissonRegressor : Generalized Linear Model with a Poisson distribution.\n GammaRegressor : Generalized Linear Model with a Gamma distribution.\n\n Examples\n ----------\n >>> from sklearn import linear_model\n >>> clf = linear_model.TweedieRegressor()\n >>> X = [[1, 2], [2, 3], [3, 4], [4, 3]]\n >>> y = [2, 3.5, 5, 5.5]\n >>> clf.fit(X, y)\n TweedieRegressor()\n >>> clf.score(X, y)\n 0.839...\n >>> clf.coef_\n array([0.599..., 0.299...])\n >>> clf.intercept_\n 1.600...\n >>> clf.predict([[1, 1], [3, 4]])\n array([2.500..., 4.599...])\n \"\"\"\n \n def __init__(self, *, power=0.0, alpha=1.0, fit_intercept=True, link='auto', max_iter=100, tol=0.0001, warm_start=False, verbose=0):\n super().__init__(alpha=alpha, fit_intercept=fit_intercept, family=TweedieDistribution(power=power), link=link, max_iter=max_iter, tol=tol, warm_start=warm_start, verbose=verbose)\n \n @property\n def family(self):\n \"\"\"Return the family of the regressor.\"\"\"\n dist = TweedieDistribution(power=self.power)\n return dist\n \n @family.setter\n def family(self, value):\n if isinstance(value, TweedieDistribution):\n self.power = value.power\n else:\n raise TypeError('TweedieRegressor.family must be of type TweedieDistribution!')\n" }, @@ -23678,7 +23745,7 @@ "sklearn.linear_model._huber.HuberRegressor.fit" ], "is_public": true, - "description": "Linear regression model that is robust to outliers.\n\nThe Huber Regressor optimizes the squared loss for the samples where ``|(y - X'w) / sigma| < epsilon`` and the absolute loss for the samples where ``|(y - X'w) / sigma| > epsilon``, where w and sigma are parameters to be optimized. The parameter sigma makes sure that if y is scaled up or down by a certain factor, one does not need to rescale epsilon to achieve the same robustness. Note that this does not take into account the fact that the different features of X may be of different scales. This makes sure that the loss function is not heavily influenced by the outliers while not completely ignoring their effect. Read more in the :ref:`User Guide ` .. 
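A sketch of how the `power` parameter in the table above selects the target distribution. `power=1.5` (compound Poisson-Gamma) suits non-negative targets with exact zeros, e.g. claim amounts; the synthetic data below is an illustrative assumption.

import numpy as np
from sklearn.linear_model import TweedieRegressor

rng = np.random.RandomState(0)
X = rng.uniform(size=(300, 2))
# Non-negative target with a point mass at zero.
y = np.where(rng.uniform(size=300) < 0.3, 0.0, rng.gamma(shape=2.0, scale=1.0, size=300))
reg = TweedieRegressor(power=1.5, link='log', alpha=0.5).fit(X, y)
print(reg.coef_, reg.intercept_)
print(reg.family.power)  # the TweedieDistribution exposed by the family property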
versionadded:: 0.18", + "description": "Linear regression model that is robust to outliers.\n\nThe Huber Regressor optimizes the squared loss for the samples where\n``|(y - X'w) / sigma| < epsilon`` and the absolute loss for the samples\nwhere ``|(y - X'w) / sigma| > epsilon``, where w and sigma are parameters\nto be optimized. The parameter sigma makes sure that if y is scaled up\nor down by a certain factor, one does not need to rescale epsilon to\nachieve the same robustness. Note that this does not take into account\nthe fact that the different features of X may be of different scales.\n\nThis makes sure that the loss function is not heavily influenced by the\noutliers while not completely ignoring their effect.\n\nRead more in the :ref:`User Guide `\n\n.. versionadded:: 0.18", "docstring": "Linear regression model that is robust to outliers.\n\n The Huber Regressor optimizes the squared loss for the samples where\n ``|(y - X'w) / sigma| < epsilon`` and the absolute loss for the samples\n where ``|(y - X'w) / sigma| > epsilon``, where w and sigma are parameters\n to be optimized. The parameter sigma makes sure that if y is scaled up\n or down by a certain factor, one does not need to rescale epsilon to\n achieve the same robustness. Note that this does not take into account\n the fact that the different features of X may be of different scales.\n\n This makes sure that the loss function is not heavily influenced by the\n outliers while not completely ignoring their effect.\n\n Read more in the :ref:`User Guide `\n\n .. versionadded:: 0.18\n\n Parameters\n ----------\n epsilon : float, greater than 1.0, default=1.35\n The parameter epsilon controls the number of samples that should be\n classified as outliers. The smaller the epsilon, the more robust it is\n to outliers.\n\n max_iter : int, default=100\n Maximum number of iterations that\n ``scipy.optimize.minimize(method=\"L-BFGS-B\")`` should run for.\n\n alpha : float, default=0.0001\n Regularization parameter.\n\n warm_start : bool, default=False\n This is useful if the stored attributes of a previously used model\n has to be reused. If set to False, then the coefficients will\n be rewritten for every call to fit.\n See :term:`the Glossary `.\n\n fit_intercept : bool, default=True\n Whether or not to fit the intercept. This can be set to False\n if the data is already centered around the origin.\n\n tol : float, default=1e-05\n The iteration will stop when\n ``max{|proj g_i | i = 1, ..., n}`` <= ``tol``\n where pg_i is the i-th component of the projected gradient.\n\n Attributes\n ----------\n coef_ : array, shape (n_features,)\n Features got by optimizing the Huber loss.\n\n intercept_ : float\n Bias.\n\n scale_ : float\n The value by which ``|y - X'w - c|`` is scaled down.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n n_iter_ : int\n Number of iterations that\n ``scipy.optimize.minimize(method=\"L-BFGS-B\")`` has run for.\n\n .. versionchanged:: 0.20\n\n In SciPy <= 1.0.0 the number of lbfgs iterations may exceed\n ``max_iter``. 
``n_iter_`` will now report at most ``max_iter``.\n\n outliers_ : array, shape (n_samples,)\n A boolean mask which is set to True where the samples are identified\n as outliers.\n\n See Also\n --------\n RANSACRegressor : RANSAC (RANdom SAmple Consensus) algorithm.\n TheilSenRegressor : Theil-Sen Estimator robust multivariate regression model.\n SGDRegressor : Fitted by minimizing a regularized empirical loss with SGD.\n\n References\n ----------\n .. [1] Peter J. Huber, Elvezio M. Ronchetti, Robust Statistics\n Concomitant scale estimates, pg 172\n .. [2] Art B. Owen (2006), A robust hybrid of lasso and ridge regression.\n https://statweb.stanford.edu/~owen/reports/hhu.pdf\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.linear_model import HuberRegressor, LinearRegression\n >>> from sklearn.datasets import make_regression\n >>> rng = np.random.RandomState(0)\n >>> X, y, coef = make_regression(\n ... n_samples=200, n_features=2, noise=4.0, coef=True, random_state=0)\n >>> X[:4] = rng.uniform(10, 20, (4, 2))\n >>> y[:4] = rng.uniform(10, 20, 4)\n >>> huber = HuberRegressor().fit(X, y)\n >>> huber.score(X, y)\n -7.284...\n >>> huber.predict(X[:1,])\n array([806.7200...])\n >>> linear = LinearRegression().fit(X, y)\n >>> print(\"True coefficients:\", coef)\n True coefficients: [20.4923... 34.1698...]\n >>> print(\"Huber coefficients:\", huber.coef_)\n Huber coefficients: [17.7906... 31.0106...]\n >>> print(\"Linear Regression coefficients:\", linear.coef_)\n Linear Regression coefficients: [-1.9221... 7.0226...]\n ", "source_code": "\n\nclass HuberRegressor(LinearModel, RegressorMixin, BaseEstimator):\n \"\"\"Linear regression model that is robust to outliers.\n\n The Huber Regressor optimizes the squared loss for the samples where\n ``|(y - X'w) / sigma| < epsilon`` and the absolute loss for the samples\n where ``|(y - X'w) / sigma| > epsilon``, where w and sigma are parameters\n to be optimized. The parameter sigma makes sure that if y is scaled up\n or down by a certain factor, one does not need to rescale epsilon to\n achieve the same robustness. Note that this does not take into account\n the fact that the different features of X may be of different scales.\n\n This makes sure that the loss function is not heavily influenced by the\n outliers while not completely ignoring their effect.\n\n Read more in the :ref:`User Guide `\n\n .. versionadded:: 0.18\n\n Parameters\n ----------\n epsilon : float, greater than 1.0, default=1.35\n The parameter epsilon controls the number of samples that should be\n classified as outliers. The smaller the epsilon, the more robust it is\n to outliers.\n\n max_iter : int, default=100\n Maximum number of iterations that\n ``scipy.optimize.minimize(method=\"L-BFGS-B\")`` should run for.\n\n alpha : float, default=0.0001\n Regularization parameter.\n\n warm_start : bool, default=False\n This is useful if the stored attributes of a previously used model\n has to be reused. If set to False, then the coefficients will\n be rewritten for every call to fit.\n See :term:`the Glossary `.\n\n fit_intercept : bool, default=True\n Whether or not to fit the intercept. 
This can be set to False\n if the data is already centered around the origin.\n\n tol : float, default=1e-05\n The iteration will stop when\n ``max{|proj g_i | i = 1, ..., n}`` <= ``tol``\n where pg_i is the i-th component of the projected gradient.\n\n Attributes\n ----------\n coef_ : array, shape (n_features,)\n Features got by optimizing the Huber loss.\n\n intercept_ : float\n Bias.\n\n scale_ : float\n The value by which ``|y - X'w - c|`` is scaled down.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n n_iter_ : int\n Number of iterations that\n ``scipy.optimize.minimize(method=\"L-BFGS-B\")`` has run for.\n\n .. versionchanged:: 0.20\n\n In SciPy <= 1.0.0 the number of lbfgs iterations may exceed\n ``max_iter``. ``n_iter_`` will now report at most ``max_iter``.\n\n outliers_ : array, shape (n_samples,)\n A boolean mask which is set to True where the samples are identified\n as outliers.\n\n See Also\n --------\n RANSACRegressor : RANSAC (RANdom SAmple Consensus) algorithm.\n TheilSenRegressor : Theil-Sen Estimator robust multivariate regression model.\n SGDRegressor : Fitted by minimizing a regularized empirical loss with SGD.\n\n References\n ----------\n .. [1] Peter J. Huber, Elvezio M. Ronchetti, Robust Statistics\n Concomitant scale estimates, pg 172\n .. [2] Art B. Owen (2006), A robust hybrid of lasso and ridge regression.\n https://statweb.stanford.edu/~owen/reports/hhu.pdf\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.linear_model import HuberRegressor, LinearRegression\n >>> from sklearn.datasets import make_regression\n >>> rng = np.random.RandomState(0)\n >>> X, y, coef = make_regression(\n ... n_samples=200, n_features=2, noise=4.0, coef=True, random_state=0)\n >>> X[:4] = rng.uniform(10, 20, (4, 2))\n >>> y[:4] = rng.uniform(10, 20, 4)\n >>> huber = HuberRegressor().fit(X, y)\n >>> huber.score(X, y)\n -7.284...\n >>> huber.predict(X[:1,])\n array([806.7200...])\n >>> linear = LinearRegression().fit(X, y)\n >>> print(\"True coefficients:\", coef)\n True coefficients: [20.4923... 34.1698...]\n >>> print(\"Huber coefficients:\", huber.coef_)\n Huber coefficients: [17.7906... 31.0106...]\n >>> print(\"Linear Regression coefficients:\", linear.coef_)\n Linear Regression coefficients: [-1.9221... 
7.0226...]\n \"\"\"\n \n def __init__(self, *, epsilon=1.35, max_iter=100, alpha=0.0001, warm_start=False, fit_intercept=True, tol=1e-05):\n self.epsilon = epsilon\n self.max_iter = max_iter\n self.alpha = alpha\n self.warm_start = warm_start\n self.fit_intercept = fit_intercept\n self.tol = tol\n \n def fit(self, X, y, sample_weight=None):\n \"\"\"Fit the model according to the given training data.\n\n Parameters\n ----------\n X : array-like, shape (n_samples, n_features)\n Training vector, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\n y : array-like, shape (n_samples,)\n Target vector relative to X.\n\n sample_weight : array-like, shape (n_samples,)\n Weight given to each sample.\n\n Returns\n -------\n self : object\n Fitted `HuberRegressor` estimator.\n \"\"\"\n (X, y) = self._validate_data(X, y, copy=False, accept_sparse=['csr'], y_numeric=True, dtype=[np.float64, np.float32])\n sample_weight = _check_sample_weight(sample_weight, X)\n if self.epsilon < 1.0:\n raise ValueError('epsilon should be greater than or equal to 1.0, got %f' % self.epsilon)\n if self.warm_start and hasattr(self, 'coef_'):\n parameters = np.concatenate((self.coef_, [self.intercept_, self.scale_]))\n else:\n if self.fit_intercept:\n parameters = np.zeros(X.shape[1] + 2)\n else:\n parameters = np.zeros(X.shape[1] + 1)\n parameters[-1] = 1\n bounds = np.tile([-np.inf, np.inf], (parameters.shape[0], 1))\n bounds[-1][0] = np.finfo(np.float64).eps * 10\n opt_res = optimize.minimize(_huber_loss_and_gradient, parameters, method='L-BFGS-B', jac=True, args=(X, y, self.epsilon, self.alpha, sample_weight), options={'maxiter': self.max_iter, 'gtol': self.tol, 'iprint': -1}, bounds=bounds)\n parameters = opt_res.x\n if opt_res.status == 2:\n raise ValueError('HuberRegressor convergence failed: l-BFGS-b solver terminated with %s' % opt_res.message)\n self.n_iter_ = _check_optimize_result('lbfgs', opt_res, self.max_iter)\n self.scale_ = parameters[-1]\n if self.fit_intercept:\n self.intercept_ = parameters[-2]\n else:\n self.intercept_ = 0.0\n self.coef_ = parameters[:X.shape[1]]\n residual = np.abs(y - safe_sparse_dot(X, self.coef_) - self.intercept_)\n self.outliers_ = residual > self.scale_ * self.epsilon\n return self\n" }, @@ -23713,7 +23780,7 @@ "sklearn.linear_model._least_angle.LarsCV.fit" ], "is_public": true, - "description": "Cross-validated Least Angle Regression model.\n\nSee glossary entry for :term:`cross-validation estimator`. Read more in the :ref:`User Guide `.", + "description": "Cross-validated Least Angle Regression model.\n\nSee glossary entry for :term:`cross-validation estimator`.\n\nRead more in the :ref:`User Guide `.", "docstring": "Cross-validated Least Angle Regression model.\n\n See glossary entry for :term:`cross-validation estimator`.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n fit_intercept : bool, default=True\n Whether to calculate the intercept for this model. If set\n to false, no intercept will be used in calculations\n (i.e. 
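A short sketch contrasting the HuberRegressor described above with ordinary least squares on data containing a few gross outliers, and inspecting the `outliers_` mask; the slope, noise level, and number of corrupted points are made-up illustration values.

import numpy as np
from sklearn.linear_model import HuberRegressor, LinearRegression

rng = np.random.RandomState(0)
X = rng.normal(size=(100, 1))
y = 3.0 * X.ravel() + rng.normal(scale=0.5, size=100)
y[:5] += 50.0  # corrupt a handful of targets
huber = HuberRegressor(epsilon=1.35).fit(X, y)
ols = LinearRegression().fit(X, y)
print(huber.coef_, ols.coef_)   # the Huber fit stays much closer to the true slope of 3
print(huber.outliers_[:5])      # the corrupted samples are typically flagged as outliers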
data is expected to be centered).\n\n verbose : bool or int, default=False\n Sets the verbosity amount.\n\n max_iter : int, default=500\n Maximum number of iterations to perform.\n\n normalize : bool, default=True\n This parameter is ignored when ``fit_intercept`` is set to False.\n If True, the regressors X will be normalized before regression by\n subtracting the mean and dividing by the l2-norm.\n If you wish to standardize, please use\n :class:`~sklearn.preprocessing.StandardScaler` before calling ``fit``\n on an estimator with ``normalize=False``.\n\n .. deprecated:: 1.0\n ``normalize`` was deprecated in version 1.0. It will default\n to False in 1.2 and be removed in 1.4.\n\n precompute : bool, 'auto' or array-like , default='auto'\n Whether to use a precomputed Gram matrix to speed up\n calculations. If set to ``'auto'`` let us decide. The Gram matrix\n cannot be passed as argument since we will use only subsets of X.\n\n cv : int, cross-validation generator or an iterable, default=None\n Determines the cross-validation splitting strategy.\n Possible inputs for cv are:\n\n - None, to use the default 5-fold cross-validation,\n - integer, to specify the number of folds.\n - :term:`CV splitter`,\n - An iterable yielding (train, test) splits as arrays of indices.\n\n For integer/None inputs, :class:`KFold` is used.\n\n Refer :ref:`User Guide ` for the various\n cross-validation strategies that can be used here.\n\n .. versionchanged:: 0.22\n ``cv`` default value if None changed from 3-fold to 5-fold.\n\n max_n_alphas : int, default=1000\n The maximum number of points on the path used to compute the\n residuals in the cross-validation.\n\n n_jobs : int or None, default=None\n Number of CPUs to use during the cross validation.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n eps : float, default=np.finfo(float).eps\n The machine-precision regularization in the computation of the\n Cholesky diagonal factors. Increase this for very ill-conditioned\n systems. Unlike the ``tol`` parameter in some iterative\n optimization-based algorithms, this parameter does not control\n the tolerance of the optimization.\n\n copy_X : bool, default=True\n If ``True``, X will be copied; else, it may be overwritten.\n\n Attributes\n ----------\n active_ : list of length n_alphas or list of such lists\n Indices of active variables at the end of the path.\n If this is a list of lists, the outer list length is `n_targets`.\n\n coef_ : array-like of shape (n_features,)\n parameter vector (w in the formulation formula)\n\n intercept_ : float\n independent term in decision function\n\n coef_path_ : array-like of shape (n_features, n_alphas)\n the varying values of the coefficients along the path\n\n alpha_ : float\n the estimated regularization parameter alpha\n\n alphas_ : array-like of shape (n_alphas,)\n the different values of alpha along the path\n\n cv_alphas_ : array-like of shape (n_cv_alphas,)\n all the values of alpha along the path for the different folds\n\n mse_path_ : array-like of shape (n_folds, n_cv_alphas)\n the mean square error on left-out for each fold along the path\n (alpha values given by ``cv_alphas``)\n\n n_iter_ : array-like or int\n the number of iterations run by Lars with the optimal alpha.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. 
versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n lars_path : Compute Least Angle Regression or Lasso\n path using LARS algorithm.\n lasso_path : Compute Lasso path with coordinate descent.\n Lasso : Linear Model trained with L1 prior as\n regularizer (aka the Lasso).\n LassoCV : Lasso linear model with iterative fitting\n along a regularization path.\n LassoLars : Lasso model fit with Least Angle Regression a.k.a. Lars.\n LassoLarsIC : Lasso model fit with Lars using BIC\n or AIC for model selection.\n sklearn.decomposition.sparse_encode : Sparse coding.\n\n Examples\n --------\n >>> from sklearn.linear_model import LarsCV\n >>> from sklearn.datasets import make_regression\n >>> X, y = make_regression(n_samples=200, noise=4.0, random_state=0)\n >>> reg = LarsCV(cv=5, normalize=False).fit(X, y)\n >>> reg.score(X, y)\n 0.9996...\n >>> reg.alpha_\n 0.2961...\n >>> reg.predict(X[:1,])\n array([154.3996...])\n ", "source_code": "\n\nclass LarsCV(Lars):\n \"\"\"Cross-validated Least Angle Regression model.\n\n See glossary entry for :term:`cross-validation estimator`.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n fit_intercept : bool, default=True\n Whether to calculate the intercept for this model. If set\n to false, no intercept will be used in calculations\n (i.e. data is expected to be centered).\n\n verbose : bool or int, default=False\n Sets the verbosity amount.\n\n max_iter : int, default=500\n Maximum number of iterations to perform.\n\n normalize : bool, default=True\n This parameter is ignored when ``fit_intercept`` is set to False.\n If True, the regressors X will be normalized before regression by\n subtracting the mean and dividing by the l2-norm.\n If you wish to standardize, please use\n :class:`~sklearn.preprocessing.StandardScaler` before calling ``fit``\n on an estimator with ``normalize=False``.\n\n .. deprecated:: 1.0\n ``normalize`` was deprecated in version 1.0. It will default\n to False in 1.2 and be removed in 1.4.\n\n precompute : bool, 'auto' or array-like , default='auto'\n Whether to use a precomputed Gram matrix to speed up\n calculations. If set to ``'auto'`` let us decide. The Gram matrix\n cannot be passed as argument since we will use only subsets of X.\n\n cv : int, cross-validation generator or an iterable, default=None\n Determines the cross-validation splitting strategy.\n Possible inputs for cv are:\n\n - None, to use the default 5-fold cross-validation,\n - integer, to specify the number of folds.\n - :term:`CV splitter`,\n - An iterable yielding (train, test) splits as arrays of indices.\n\n For integer/None inputs, :class:`KFold` is used.\n\n Refer :ref:`User Guide ` for the various\n cross-validation strategies that can be used here.\n\n .. versionchanged:: 0.22\n ``cv`` default value if None changed from 3-fold to 5-fold.\n\n max_n_alphas : int, default=1000\n The maximum number of points on the path used to compute the\n residuals in the cross-validation.\n\n n_jobs : int or None, default=None\n Number of CPUs to use during the cross validation.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n eps : float, default=np.finfo(float).eps\n The machine-precision regularization in the computation of the\n Cholesky diagonal factors. 
Increase this for very ill-conditioned\n systems. Unlike the ``tol`` parameter in some iterative\n optimization-based algorithms, this parameter does not control\n the tolerance of the optimization.\n\n copy_X : bool, default=True\n If ``True``, X will be copied; else, it may be overwritten.\n\n Attributes\n ----------\n active_ : list of length n_alphas or list of such lists\n Indices of active variables at the end of the path.\n If this is a list of lists, the outer list length is `n_targets`.\n\n coef_ : array-like of shape (n_features,)\n parameter vector (w in the formulation formula)\n\n intercept_ : float\n independent term in decision function\n\n coef_path_ : array-like of shape (n_features, n_alphas)\n the varying values of the coefficients along the path\n\n alpha_ : float\n the estimated regularization parameter alpha\n\n alphas_ : array-like of shape (n_alphas,)\n the different values of alpha along the path\n\n cv_alphas_ : array-like of shape (n_cv_alphas,)\n all the values of alpha along the path for the different folds\n\n mse_path_ : array-like of shape (n_folds, n_cv_alphas)\n the mean square error on left-out for each fold along the path\n (alpha values given by ``cv_alphas``)\n\n n_iter_ : array-like or int\n the number of iterations run by Lars with the optimal alpha.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n lars_path : Compute Least Angle Regression or Lasso\n path using LARS algorithm.\n lasso_path : Compute Lasso path with coordinate descent.\n Lasso : Linear Model trained with L1 prior as\n regularizer (aka the Lasso).\n LassoCV : Lasso linear model with iterative fitting\n along a regularization path.\n LassoLars : Lasso model fit with Least Angle Regression a.k.a. 
Lars.\n LassoLarsIC : Lasso model fit with Lars using BIC\n or AIC for model selection.\n sklearn.decomposition.sparse_encode : Sparse coding.\n\n Examples\n --------\n >>> from sklearn.linear_model import LarsCV\n >>> from sklearn.datasets import make_regression\n >>> X, y = make_regression(n_samples=200, noise=4.0, random_state=0)\n >>> reg = LarsCV(cv=5, normalize=False).fit(X, y)\n >>> reg.score(X, y)\n 0.9996...\n >>> reg.alpha_\n 0.2961...\n >>> reg.predict(X[:1,])\n array([154.3996...])\n \"\"\"\n method = 'lar'\n \n def __init__(self, *, fit_intercept=True, verbose=False, max_iter=500, normalize='deprecated', precompute='auto', cv=None, max_n_alphas=1000, n_jobs=None, eps=np.finfo(float).eps, copy_X=True):\n self.max_iter = max_iter\n self.cv = cv\n self.max_n_alphas = max_n_alphas\n self.n_jobs = n_jobs\n super().__init__(fit_intercept=fit_intercept, verbose=verbose, normalize=normalize, precompute=precompute, n_nonzero_coefs=500, eps=eps, copy_X=copy_X, fit_path=True)\n \n def _more_tags(self):\n return {'multioutput': False}\n \n def fit(self, X, y):\n \"\"\"Fit the model using X, y as training data.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training data.\n\n y : array-like of shape (n_samples,)\n Target values.\n\n Returns\n -------\n self : object\n Returns an instance of self.\n \"\"\"\n _normalize = _deprecate_normalize(self.normalize, default=True, estimator_name=self.__class__.__name__)\n (X, y) = self._validate_data(X, y, y_numeric=True)\n X = as_float_array(X, copy=self.copy_X)\n y = as_float_array(y, copy=self.copy_X)\n cv = check_cv(self.cv, classifier=False)\n Gram = self.precompute\n if hasattr(Gram, '__array__'):\n warnings.warn('Parameter \"precompute\" cannot be an array in %s. Automatically switch to \"auto\" instead.' 
% self.__class__.__name__)\n Gram = 'auto'\n cv_paths = Parallel(n_jobs=self.n_jobs, verbose=self.verbose)((delayed(_lars_path_residues)(X[train], y[train], X[test], y[test], Gram=Gram, copy=False, method=self.method, verbose=max(0, self.verbose - 1), normalize=_normalize, fit_intercept=self.fit_intercept, max_iter=self.max_iter, eps=self.eps, positive=self.positive) for (train, test) in cv.split(X, y)))\n all_alphas = np.concatenate(list(zip(*cv_paths))[0])\n all_alphas = np.unique(all_alphas)\n stride = int(max(1, int(len(all_alphas) / float(self.max_n_alphas))))\n all_alphas = all_alphas[::stride]\n mse_path = np.empty((len(all_alphas), len(cv_paths)))\n for (index, (alphas, _, _, residues)) in enumerate(cv_paths):\n alphas = alphas[::-1]\n residues = residues[::-1]\n if alphas[0] != 0:\n alphas = np.r_[0, alphas]\n residues = np.r_[residues[0, np.newaxis], residues]\n if alphas[-1] != all_alphas[-1]:\n alphas = np.r_[alphas, all_alphas[-1]]\n residues = np.r_[residues, residues[-1, np.newaxis]]\n this_residues = interpolate.interp1d(alphas, residues, axis=0)(all_alphas)\n this_residues **= 2\n mse_path[:, index] = np.mean(this_residues, axis=-1)\n mask = np.all(np.isfinite(mse_path), axis=-1)\n all_alphas = all_alphas[mask]\n mse_path = mse_path[mask]\n i_best_alpha = np.argmin(mse_path.mean(axis=-1))\n best_alpha = all_alphas[i_best_alpha]\n self.alpha_ = best_alpha\n self.cv_alphas_ = all_alphas\n self.mse_path_ = mse_path\n self._fit(X, y, max_iter=self.max_iter, alpha=best_alpha, Xy=None, fit_path=True, normalize=_normalize)\n return self\n" }, @@ -23724,7 +23791,7 @@ "superclasses": ["Lars"], "methods": ["sklearn.linear_model._least_angle.LassoLars.__init__"], "is_public": true, - "description": "Lasso model fit with Least Angle Regression a.k.a. Lars.\n\nIt is a Linear Model trained with an L1 prior as regularizer. The optimization objective for Lasso is:: (1 / (2 * n_samples)) * ||y - Xw||^2_2 + alpha * ||w||_1 Read more in the :ref:`User Guide `.", + "description": "Lasso model fit with Least Angle Regression a.k.a. Lars.\n\nIt is a Linear Model trained with an L1 prior as regularizer.\n\nThe optimization objective for Lasso is::\n\n(1 / (2 * n_samples)) * ||y - Xw||^2_2 + alpha * ||w||_1\n\nRead more in the :ref:`User Guide `.", "docstring": "Lasso model fit with Least Angle Regression a.k.a. Lars.\n\n It is a Linear Model trained with an L1 prior as regularizer.\n\n The optimization objective for Lasso is::\n\n (1 / (2 * n_samples)) * ||y - Xw||^2_2 + alpha * ||w||_1\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n alpha : float, default=1.0\n Constant that multiplies the penalty term. Defaults to 1.0.\n ``alpha = 0`` is equivalent to an ordinary least square, solved\n by :class:`LinearRegression`. For numerical reasons, using\n ``alpha = 0`` with the LassoLars object is not advised and you\n should prefer the LinearRegression object.\n\n fit_intercept : bool, default=True\n Whether to calculate the intercept for this model. If set\n to false, no intercept will be used in calculations\n (i.e. 
data is expected to be centered).\n\n verbose : bool or int, default=False\n Sets the verbosity amount.\n\n normalize : bool, default=True\n This parameter is ignored when ``fit_intercept`` is set to False.\n If True, the regressors X will be normalized before regression by\n subtracting the mean and dividing by the l2-norm.\n If you wish to standardize, please use\n :class:`~sklearn.preprocessing.StandardScaler` before calling ``fit``\n on an estimator with ``normalize=False``.\n\n .. deprecated:: 1.0\n ``normalize`` was deprecated in version 1.0. It will default\n to False in 1.2 and be removed in 1.4.\n\n precompute : bool, 'auto' or array-like, default='auto'\n Whether to use a precomputed Gram matrix to speed up\n calculations. If set to ``'auto'`` let us decide. The Gram\n matrix can also be passed as argument.\n\n max_iter : int, default=500\n Maximum number of iterations to perform.\n\n eps : float, default=np.finfo(float).eps\n The machine-precision regularization in the computation of the\n Cholesky diagonal factors. Increase this for very ill-conditioned\n systems. Unlike the ``tol`` parameter in some iterative\n optimization-based algorithms, this parameter does not control\n the tolerance of the optimization.\n\n copy_X : bool, default=True\n If True, X will be copied; else, it may be overwritten.\n\n fit_path : bool, default=True\n If ``True`` the full path is stored in the ``coef_path_`` attribute.\n If you compute the solution for a large problem or many targets,\n setting ``fit_path`` to ``False`` will lead to a speedup, especially\n with a small alpha.\n\n positive : bool, default=False\n Restrict coefficients to be >= 0. Be aware that you might want to\n remove fit_intercept which is set True by default.\n Under the positive restriction the model coefficients will not converge\n to the ordinary-least-squares solution for small values of alpha.\n Only coefficients up to the smallest alpha value (``alphas_[alphas_ >\n 0.].min()`` when fit_path=True) reached by the stepwise Lars-Lasso\n algorithm are typically in congruence with the solution of the\n coordinate descent Lasso estimator.\n\n jitter : float, default=None\n Upper bound on a uniform noise parameter to be added to the\n `y` values, to satisfy the model's assumption of\n one-at-a-time computations. Might help with stability.\n\n .. versionadded:: 0.23\n\n random_state : int, RandomState instance or None, default=None\n Determines random number generation for jittering. Pass an int\n for reproducible output across multiple function calls.\n See :term:`Glossary `. Ignored if `jitter` is None.\n\n .. versionadded:: 0.23\n\n Attributes\n ----------\n alphas_ : array-like of shape (n_alphas + 1,) or list of such arrays\n Maximum of covariances (in absolute value) at each iteration.\n ``n_alphas`` is either ``max_iter``, ``n_features`` or the\n number of nodes in the path with ``alpha >= alpha_min``, whichever\n is smaller. If this is a list of array-like, the length of the outer\n list is `n_targets`.\n\n active_ : list of length n_alphas or list of such lists\n Indices of active variables at the end of the path.\n If this is a list of list, the length of the outer list is `n_targets`.\n\n coef_path_ : array-like of shape (n_features, n_alphas + 1) or list of such arrays\n If a list is passed it's expected to be one of n_targets such arrays.\n The varying values of the coefficients along the path. It is not\n present if the ``fit_path`` parameter is ``False``. 
If this is a list\n of array-like, the length of the outer list is `n_targets`.\n\n coef_ : array-like of shape (n_features,) or (n_targets, n_features)\n Parameter vector (w in the formulation formula).\n\n intercept_ : float or array-like of shape (n_targets,)\n Independent term in decision function.\n\n n_iter_ : array-like or int\n The number of iterations taken by lars_path to find the\n grid of alphas for each target.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n lars_path : Compute Least Angle Regression or Lasso\n path using LARS algorithm.\n lasso_path : Compute Lasso path with coordinate descent.\n Lasso : Linear Model trained with L1 prior as\n regularizer (aka the Lasso).\n LassoCV : Lasso linear model with iterative fitting\n along a regularization path.\n LassoLarsCV: Cross-validated Lasso, using the LARS algorithm.\n LassoLarsIC : Lasso model fit with Lars using BIC\n or AIC for model selection.\n sklearn.decomposition.sparse_encode : Sparse coding.\n\n Examples\n --------\n >>> from sklearn import linear_model\n >>> reg = linear_model.LassoLars(alpha=0.01, normalize=False)\n >>> reg.fit([[-1, 1], [0, 0], [1, 1]], [-1, 0, -1])\n LassoLars(alpha=0.01, normalize=False)\n >>> print(reg.coef_)\n [ 0. -0.955...]\n ", "source_code": "\n\nclass LassoLars(Lars):\n \"\"\"Lasso model fit with Least Angle Regression a.k.a. Lars.\n\n It is a Linear Model trained with an L1 prior as regularizer.\n\n The optimization objective for Lasso is::\n\n (1 / (2 * n_samples)) * ||y - Xw||^2_2 + alpha * ||w||_1\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n alpha : float, default=1.0\n Constant that multiplies the penalty term. Defaults to 1.0.\n ``alpha = 0`` is equivalent to an ordinary least square, solved\n by :class:`LinearRegression`. For numerical reasons, using\n ``alpha = 0`` with the LassoLars object is not advised and you\n should prefer the LinearRegression object.\n\n fit_intercept : bool, default=True\n Whether to calculate the intercept for this model. If set\n to false, no intercept will be used in calculations\n (i.e. data is expected to be centered).\n\n verbose : bool or int, default=False\n Sets the verbosity amount.\n\n normalize : bool, default=True\n This parameter is ignored when ``fit_intercept`` is set to False.\n If True, the regressors X will be normalized before regression by\n subtracting the mean and dividing by the l2-norm.\n If you wish to standardize, please use\n :class:`~sklearn.preprocessing.StandardScaler` before calling ``fit``\n on an estimator with ``normalize=False``.\n\n .. deprecated:: 1.0\n ``normalize`` was deprecated in version 1.0. It will default\n to False in 1.2 and be removed in 1.4.\n\n precompute : bool, 'auto' or array-like, default='auto'\n Whether to use a precomputed Gram matrix to speed up\n calculations. If set to ``'auto'`` let us decide. The Gram\n matrix can also be passed as argument.\n\n max_iter : int, default=500\n Maximum number of iterations to perform.\n\n eps : float, default=np.finfo(float).eps\n The machine-precision regularization in the computation of the\n Cholesky diagonal factors. Increase this for very ill-conditioned\n systems. 
Unlike the ``tol`` parameter in some iterative\n optimization-based algorithms, this parameter does not control\n the tolerance of the optimization.\n\n copy_X : bool, default=True\n If True, X will be copied; else, it may be overwritten.\n\n fit_path : bool, default=True\n If ``True`` the full path is stored in the ``coef_path_`` attribute.\n If you compute the solution for a large problem or many targets,\n setting ``fit_path`` to ``False`` will lead to a speedup, especially\n with a small alpha.\n\n positive : bool, default=False\n Restrict coefficients to be >= 0. Be aware that you might want to\n remove fit_intercept which is set True by default.\n Under the positive restriction the model coefficients will not converge\n to the ordinary-least-squares solution for small values of alpha.\n Only coefficients up to the smallest alpha value (``alphas_[alphas_ >\n 0.].min()`` when fit_path=True) reached by the stepwise Lars-Lasso\n algorithm are typically in congruence with the solution of the\n coordinate descent Lasso estimator.\n\n jitter : float, default=None\n Upper bound on a uniform noise parameter to be added to the\n `y` values, to satisfy the model's assumption of\n one-at-a-time computations. Might help with stability.\n\n .. versionadded:: 0.23\n\n random_state : int, RandomState instance or None, default=None\n Determines random number generation for jittering. Pass an int\n for reproducible output across multiple function calls.\n See :term:`Glossary `. Ignored if `jitter` is None.\n\n .. versionadded:: 0.23\n\n Attributes\n ----------\n alphas_ : array-like of shape (n_alphas + 1,) or list of such arrays\n Maximum of covariances (in absolute value) at each iteration.\n ``n_alphas`` is either ``max_iter``, ``n_features`` or the\n number of nodes in the path with ``alpha >= alpha_min``, whichever\n is smaller. If this is a list of array-like, the length of the outer\n list is `n_targets`.\n\n active_ : list of length n_alphas or list of such lists\n Indices of active variables at the end of the path.\n If this is a list of list, the length of the outer list is `n_targets`.\n\n coef_path_ : array-like of shape (n_features, n_alphas + 1) or list of such arrays\n If a list is passed it's expected to be one of n_targets such arrays.\n The varying values of the coefficients along the path. It is not\n present if the ``fit_path`` parameter is ``False``. If this is a list\n of array-like, the length of the outer list is `n_targets`.\n\n coef_ : array-like of shape (n_features,) or (n_targets, n_features)\n Parameter vector (w in the formulation formula).\n\n intercept_ : float or array-like of shape (n_targets,)\n Independent term in decision function.\n\n n_iter_ : array-like or int\n The number of iterations taken by lars_path to find the\n grid of alphas for each target.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. 
versionadded:: 1.0\n\n See Also\n --------\n lars_path : Compute Least Angle Regression or Lasso\n path using LARS algorithm.\n lasso_path : Compute Lasso path with coordinate descent.\n Lasso : Linear Model trained with L1 prior as\n regularizer (aka the Lasso).\n LassoCV : Lasso linear model with iterative fitting\n along a regularization path.\n LassoLarsCV: Cross-validated Lasso, using the LARS algorithm.\n LassoLarsIC : Lasso model fit with Lars using BIC\n or AIC for model selection.\n sklearn.decomposition.sparse_encode : Sparse coding.\n\n Examples\n --------\n >>> from sklearn import linear_model\n >>> reg = linear_model.LassoLars(alpha=0.01, normalize=False)\n >>> reg.fit([[-1, 1], [0, 0], [1, 1]], [-1, 0, -1])\n LassoLars(alpha=0.01, normalize=False)\n >>> print(reg.coef_)\n [ 0. -0.955...]\n \"\"\"\n method = 'lasso'\n \n def __init__(self, alpha=1.0, *, fit_intercept=True, verbose=False, normalize='deprecated', precompute='auto', max_iter=500, eps=np.finfo(float).eps, copy_X=True, fit_path=True, positive=False, jitter=None, random_state=None):\n self.alpha = alpha\n self.fit_intercept = fit_intercept\n self.max_iter = max_iter\n self.verbose = verbose\n self.normalize = normalize\n self.positive = positive\n self.precompute = precompute\n self.copy_X = copy_X\n self.eps = eps\n self.fit_path = fit_path\n self.jitter = jitter\n self.random_state = random_state\n" }, @@ -23737,7 +23804,7 @@ "sklearn.linear_model._least_angle.LassoLarsCV.__init__" ], "is_public": true, - "description": "Cross-validated Lasso, using the LARS algorithm.\n\nSee glossary entry for :term:`cross-validation estimator`. The optimization objective for Lasso is:: (1 / (2 * n_samples)) * ||y - Xw||^2_2 + alpha * ||w||_1 Read more in the :ref:`User Guide `.", + "description": "Cross-validated Lasso, using the LARS algorithm.\n\nSee glossary entry for :term:`cross-validation estimator`.\n\nThe optimization objective for Lasso is::\n\n(1 / (2 * n_samples)) * ||y - Xw||^2_2 + alpha * ||w||_1\n\nRead more in the :ref:`User Guide `.", "docstring": "Cross-validated Lasso, using the LARS algorithm.\n\n See glossary entry for :term:`cross-validation estimator`.\n\n The optimization objective for Lasso is::\n\n (1 / (2 * n_samples)) * ||y - Xw||^2_2 + alpha * ||w||_1\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n fit_intercept : bool, default=True\n Whether to calculate the intercept for this model. If set\n to false, no intercept will be used in calculations\n (i.e. data is expected to be centered).\n\n verbose : bool or int, default=False\n Sets the verbosity amount.\n\n max_iter : int, default=500\n Maximum number of iterations to perform.\n\n normalize : bool, default=True\n This parameter is ignored when ``fit_intercept`` is set to False.\n If True, the regressors X will be normalized before regression by\n subtracting the mean and dividing by the l2-norm.\n If you wish to standardize, please use\n :class:`~sklearn.preprocessing.StandardScaler` before calling ``fit``\n on an estimator with ``normalize=False``.\n\n .. deprecated:: 1.0\n ``normalize`` was deprecated in version 1.0. It will default\n to False in 1.2 and be removed in 1.4.\n\n precompute : bool or 'auto' , default='auto'\n Whether to use a precomputed Gram matrix to speed up\n calculations. If set to ``'auto'`` let us decide. 
The Gram matrix\n cannot be passed as argument since we will use only subsets of X.\n\n cv : int, cross-validation generator or an iterable, default=None\n Determines the cross-validation splitting strategy.\n Possible inputs for cv are:\n\n - None, to use the default 5-fold cross-validation,\n - integer, to specify the number of folds.\n - :term:`CV splitter`,\n - An iterable yielding (train, test) splits as arrays of indices.\n\n For integer/None inputs, :class:`KFold` is used.\n\n Refer :ref:`User Guide ` for the various\n cross-validation strategies that can be used here.\n\n .. versionchanged:: 0.22\n ``cv`` default value if None changed from 3-fold to 5-fold.\n\n max_n_alphas : int, default=1000\n The maximum number of points on the path used to compute the\n residuals in the cross-validation.\n\n n_jobs : int or None, default=None\n Number of CPUs to use during the cross validation.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n eps : float, default=np.finfo(float).eps\n The machine-precision regularization in the computation of the\n Cholesky diagonal factors. Increase this for very ill-conditioned\n systems. Unlike the ``tol`` parameter in some iterative\n optimization-based algorithms, this parameter does not control\n the tolerance of the optimization.\n\n copy_X : bool, default=True\n If True, X will be copied; else, it may be overwritten.\n\n positive : bool, default=False\n Restrict coefficients to be >= 0. Be aware that you might want to\n remove fit_intercept which is set True by default.\n Under the positive restriction the model coefficients do not converge\n to the ordinary-least-squares solution for small values of alpha.\n Only coefficients up to the smallest alpha value (``alphas_[alphas_ >\n 0.].min()`` when fit_path=True) reached by the stepwise Lars-Lasso\n algorithm are typically in congruence with the solution of the\n coordinate descent Lasso estimator.\n As a consequence using LassoLarsCV only makes sense for problems where\n a sparse solution is expected and/or reached.\n\n Attributes\n ----------\n coef_ : array-like of shape (n_features,)\n parameter vector (w in the formulation formula)\n\n intercept_ : float\n independent term in decision function.\n\n coef_path_ : array-like of shape (n_features, n_alphas)\n the varying values of the coefficients along the path\n\n alpha_ : float\n the estimated regularization parameter alpha\n\n alphas_ : array-like of shape (n_alphas,)\n the different values of alpha along the path\n\n cv_alphas_ : array-like of shape (n_cv_alphas,)\n all the values of alpha along the path for the different folds\n\n mse_path_ : array-like of shape (n_folds, n_cv_alphas)\n the mean square error on left-out for each fold along the path\n (alpha values given by ``cv_alphas``)\n\n n_iter_ : array-like or int\n the number of iterations run by Lars with the optimal alpha.\n\n active_ : list of int\n Indices of active variables at the end of the path.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. 
versionadded:: 1.0\n\n See Also\n --------\n lars_path : Compute Least Angle Regression or Lasso\n path using LARS algorithm.\n lasso_path : Compute Lasso path with coordinate descent.\n Lasso : Linear Model trained with L1 prior as\n regularizer (aka the Lasso).\n LassoCV : Lasso linear model with iterative fitting\n along a regularization path.\n LassoLars : Lasso model fit with Least Angle Regression a.k.a. Lars.\n LassoLarsIC : Lasso model fit with Lars using BIC\n or AIC for model selection.\n sklearn.decomposition.sparse_encode : Sparse coding.\n\n Notes\n -----\n The object solves the same problem as the LassoCV object. However,\n unlike the LassoCV, it find the relevant alphas values by itself.\n In general, because of this property, it will be more stable.\n However, it is more fragile to heavily multicollinear datasets.\n\n It is more efficient than the LassoCV if only a small number of\n features are selected compared to the total number, for instance if\n there are very few samples compared to the number of features.\n\n Examples\n --------\n >>> from sklearn.linear_model import LassoLarsCV\n >>> from sklearn.datasets import make_regression\n >>> X, y = make_regression(noise=4.0, random_state=0)\n >>> reg = LassoLarsCV(cv=5, normalize=False).fit(X, y)\n >>> reg.score(X, y)\n 0.9993...\n >>> reg.alpha_\n 0.3972...\n >>> reg.predict(X[:1,])\n array([-78.4831...])\n ", "source_code": "\n\nclass LassoLarsCV(LarsCV):\n \"\"\"Cross-validated Lasso, using the LARS algorithm.\n\n See glossary entry for :term:`cross-validation estimator`.\n\n The optimization objective for Lasso is::\n\n (1 / (2 * n_samples)) * ||y - Xw||^2_2 + alpha * ||w||_1\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n fit_intercept : bool, default=True\n Whether to calculate the intercept for this model. If set\n to false, no intercept will be used in calculations\n (i.e. data is expected to be centered).\n\n verbose : bool or int, default=False\n Sets the verbosity amount.\n\n max_iter : int, default=500\n Maximum number of iterations to perform.\n\n normalize : bool, default=True\n This parameter is ignored when ``fit_intercept`` is set to False.\n If True, the regressors X will be normalized before regression by\n subtracting the mean and dividing by the l2-norm.\n If you wish to standardize, please use\n :class:`~sklearn.preprocessing.StandardScaler` before calling ``fit``\n on an estimator with ``normalize=False``.\n\n .. deprecated:: 1.0\n ``normalize`` was deprecated in version 1.0. It will default\n to False in 1.2 and be removed in 1.4.\n\n precompute : bool or 'auto' , default='auto'\n Whether to use a precomputed Gram matrix to speed up\n calculations. If set to ``'auto'`` let us decide. The Gram matrix\n cannot be passed as argument since we will use only subsets of X.\n\n cv : int, cross-validation generator or an iterable, default=None\n Determines the cross-validation splitting strategy.\n Possible inputs for cv are:\n\n - None, to use the default 5-fold cross-validation,\n - integer, to specify the number of folds.\n - :term:`CV splitter`,\n - An iterable yielding (train, test) splits as arrays of indices.\n\n For integer/None inputs, :class:`KFold` is used.\n\n Refer :ref:`User Guide ` for the various\n cross-validation strategies that can be used here.\n\n .. 
versionchanged:: 0.22\n ``cv`` default value if None changed from 3-fold to 5-fold.\n\n max_n_alphas : int, default=1000\n The maximum number of points on the path used to compute the\n residuals in the cross-validation.\n\n n_jobs : int or None, default=None\n Number of CPUs to use during the cross validation.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n eps : float, default=np.finfo(float).eps\n The machine-precision regularization in the computation of the\n Cholesky diagonal factors. Increase this for very ill-conditioned\n systems. Unlike the ``tol`` parameter in some iterative\n optimization-based algorithms, this parameter does not control\n the tolerance of the optimization.\n\n copy_X : bool, default=True\n If True, X will be copied; else, it may be overwritten.\n\n positive : bool, default=False\n Restrict coefficients to be >= 0. Be aware that you might want to\n remove fit_intercept which is set True by default.\n Under the positive restriction the model coefficients do not converge\n to the ordinary-least-squares solution for small values of alpha.\n Only coefficients up to the smallest alpha value (``alphas_[alphas_ >\n 0.].min()`` when fit_path=True) reached by the stepwise Lars-Lasso\n algorithm are typically in congruence with the solution of the\n coordinate descent Lasso estimator.\n As a consequence using LassoLarsCV only makes sense for problems where\n a sparse solution is expected and/or reached.\n\n Attributes\n ----------\n coef_ : array-like of shape (n_features,)\n parameter vector (w in the formulation formula)\n\n intercept_ : float\n independent term in decision function.\n\n coef_path_ : array-like of shape (n_features, n_alphas)\n the varying values of the coefficients along the path\n\n alpha_ : float\n the estimated regularization parameter alpha\n\n alphas_ : array-like of shape (n_alphas,)\n the different values of alpha along the path\n\n cv_alphas_ : array-like of shape (n_cv_alphas,)\n all the values of alpha along the path for the different folds\n\n mse_path_ : array-like of shape (n_folds, n_cv_alphas)\n the mean square error on left-out for each fold along the path\n (alpha values given by ``cv_alphas``)\n\n n_iter_ : array-like or int\n the number of iterations run by Lars with the optimal alpha.\n\n active_ : list of int\n Indices of active variables at the end of the path.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n lars_path : Compute Least Angle Regression or Lasso\n path using LARS algorithm.\n lasso_path : Compute Lasso path with coordinate descent.\n Lasso : Linear Model trained with L1 prior as\n regularizer (aka the Lasso).\n LassoCV : Lasso linear model with iterative fitting\n along a regularization path.\n LassoLars : Lasso model fit with Least Angle Regression a.k.a. Lars.\n LassoLarsIC : Lasso model fit with Lars using BIC\n or AIC for model selection.\n sklearn.decomposition.sparse_encode : Sparse coding.\n\n Notes\n -----\n The object solves the same problem as the LassoCV object. 
However,\n unlike the LassoCV, it find the relevant alphas values by itself.\n In general, because of this property, it will be more stable.\n However, it is more fragile to heavily multicollinear datasets.\n\n It is more efficient than the LassoCV if only a small number of\n features are selected compared to the total number, for instance if\n there are very few samples compared to the number of features.\n\n Examples\n --------\n >>> from sklearn.linear_model import LassoLarsCV\n >>> from sklearn.datasets import make_regression\n >>> X, y = make_regression(noise=4.0, random_state=0)\n >>> reg = LassoLarsCV(cv=5, normalize=False).fit(X, y)\n >>> reg.score(X, y)\n 0.9993...\n >>> reg.alpha_\n 0.3972...\n >>> reg.predict(X[:1,])\n array([-78.4831...])\n \"\"\"\n method = 'lasso'\n \n def __init__(self, *, fit_intercept=True, verbose=False, max_iter=500, normalize='deprecated', precompute='auto', cv=None, max_n_alphas=1000, n_jobs=None, eps=np.finfo(float).eps, copy_X=True, positive=False):\n self.fit_intercept = fit_intercept\n self.verbose = verbose\n self.max_iter = max_iter\n self.normalize = normalize\n self.precompute = precompute\n self.cv = cv\n self.max_n_alphas = max_n_alphas\n self.n_jobs = n_jobs\n self.eps = eps\n self.copy_X = copy_X\n self.positive = positive\n" }, @@ -23749,12 +23816,13 @@ "methods": [ "sklearn.linear_model._least_angle.LassoLarsIC.__init__", "sklearn.linear_model._least_angle.LassoLarsIC._more_tags", - "sklearn.linear_model._least_angle.LassoLarsIC.fit" + "sklearn.linear_model._least_angle.LassoLarsIC.fit", + "sklearn.linear_model._least_angle.LassoLarsIC._estimate_noise_variance" ], "is_public": true, - "description": "Lasso model fit with Lars using BIC or AIC for model selection.\n\nThe optimization objective for Lasso is:: (1 / (2 * n_samples)) * ||y - Xw||^2_2 + alpha * ||w||_1 AIC is the Akaike information criterion and BIC is the Bayes Information criterion. Such criteria are useful to select the value of the regularization parameter by making a trade-off between the goodness of fit and the complexity of the model. A good model should explain well the data while being simple. Read more in the :ref:`User Guide `.", - "docstring": "Lasso model fit with Lars using BIC or AIC for model selection.\n\n The optimization objective for Lasso is::\n\n (1 / (2 * n_samples)) * ||y - Xw||^2_2 + alpha * ||w||_1\n\n AIC is the Akaike information criterion and BIC is the Bayes\n Information criterion. Such criteria are useful to select the value\n of the regularization parameter by making a trade-off between the\n goodness of fit and the complexity of the model. A good model should\n explain well the data while being simple.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n criterion : {'bic' , 'aic'}, default='aic'\n The type of criterion to use.\n\n fit_intercept : bool, default=True\n Whether to calculate the intercept for this model. If set\n to false, no intercept will be used in calculations\n (i.e. data is expected to be centered).\n\n verbose : bool or int, default=False\n Sets the verbosity amount.\n\n normalize : bool, default=True\n This parameter is ignored when ``fit_intercept`` is set to False.\n If True, the regressors X will be normalized before regression by\n subtracting the mean and dividing by the l2-norm.\n If you wish to standardize, please use\n :class:`~sklearn.preprocessing.StandardScaler` before calling ``fit``\n on an estimator with ``normalize=False``.\n\n .. 
deprecated:: 1.0\n ``normalize`` was deprecated in version 1.0. It will default\n to False in 1.2 and be removed in 1.4.\n\n precompute : bool, 'auto' or array-like, default='auto'\n Whether to use a precomputed Gram matrix to speed up\n calculations. If set to ``'auto'`` let us decide. The Gram\n matrix can also be passed as argument.\n\n max_iter : int, default=500\n Maximum number of iterations to perform. Can be used for\n early stopping.\n\n eps : float, default=np.finfo(float).eps\n The machine-precision regularization in the computation of the\n Cholesky diagonal factors. Increase this for very ill-conditioned\n systems. Unlike the ``tol`` parameter in some iterative\n optimization-based algorithms, this parameter does not control\n the tolerance of the optimization.\n\n copy_X : bool, default=True\n If True, X will be copied; else, it may be overwritten.\n\n positive : bool, default=False\n Restrict coefficients to be >= 0. Be aware that you might want to\n remove fit_intercept which is set True by default.\n Under the positive restriction the model coefficients do not converge\n to the ordinary-least-squares solution for small values of alpha.\n Only coefficients up to the smallest alpha value (``alphas_[alphas_ >\n 0.].min()`` when fit_path=True) reached by the stepwise Lars-Lasso\n algorithm are typically in congruence with the solution of the\n coordinate descent Lasso estimator.\n As a consequence using LassoLarsIC only makes sense for problems where\n a sparse solution is expected and/or reached.\n\n Attributes\n ----------\n coef_ : array-like of shape (n_features,)\n parameter vector (w in the formulation formula)\n\n intercept_ : float\n independent term in decision function.\n\n alpha_ : float\n the alpha parameter chosen by the information criterion\n\n alphas_ : array-like of shape (n_alphas + 1,) or list of such arrays\n Maximum of covariances (in absolute value) at each iteration.\n ``n_alphas`` is either ``max_iter``, ``n_features`` or the\n number of nodes in the path with ``alpha >= alpha_min``, whichever\n is smaller. If a list, it will be of length `n_targets`.\n\n n_iter_ : int\n number of iterations run by lars_path to find the grid of\n alphas.\n\n criterion_ : array-like of shape (n_alphas,)\n The value of the information criteria ('aic', 'bic') across all\n alphas. The alpha which has the smallest information criterion is\n chosen. This value is larger by a factor of ``n_samples`` compared to\n Eqns. 2.15 and 2.16 in (Zou et al, 2007).\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n lars_path : Compute Least Angle Regression or Lasso\n path using LARS algorithm.\n lasso_path : Compute Lasso path with coordinate descent.\n Lasso : Linear Model trained with L1 prior as\n regularizer (aka the Lasso).\n LassoCV : Lasso linear model with iterative fitting\n along a regularization path.\n LassoLars : Lasso model fit with Least Angle Regression a.k.a. Lars.\n LassoLarsCV: Cross-validated Lasso, using the LARS algorithm.\n sklearn.decomposition.sparse_encode : Sparse coding.\n\n Notes\n -----\n The estimation of the number of degrees of freedom is given by:\n\n \"On the degrees of freedom of the lasso\"\n Hui Zou, Trevor Hastie, and Robert Tibshirani\n Ann. Statist. 
Volume 35, Number 5 (2007), 2173-2192.\n\n https://en.wikipedia.org/wiki/Akaike_information_criterion\n https://en.wikipedia.org/wiki/Bayesian_information_criterion\n\n Examples\n --------\n >>> from sklearn import linear_model\n >>> reg = linear_model.LassoLarsIC(criterion='bic', normalize=False)\n >>> reg.fit([[-1, 1], [0, 0], [1, 1]], [-1.1111, 0, -1.1111])\n LassoLarsIC(criterion='bic', normalize=False)\n >>> print(reg.coef_)\n [ 0. -1.11...]\n ", - "source_code": "\n\nclass LassoLarsIC(LassoLars):\n \"\"\"Lasso model fit with Lars using BIC or AIC for model selection.\n\n The optimization objective for Lasso is::\n\n (1 / (2 * n_samples)) * ||y - Xw||^2_2 + alpha * ||w||_1\n\n AIC is the Akaike information criterion and BIC is the Bayes\n Information criterion. Such criteria are useful to select the value\n of the regularization parameter by making a trade-off between the\n goodness of fit and the complexity of the model. A good model should\n explain well the data while being simple.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n criterion : {'bic' , 'aic'}, default='aic'\n The type of criterion to use.\n\n fit_intercept : bool, default=True\n Whether to calculate the intercept for this model. If set\n to false, no intercept will be used in calculations\n (i.e. data is expected to be centered).\n\n verbose : bool or int, default=False\n Sets the verbosity amount.\n\n normalize : bool, default=True\n This parameter is ignored when ``fit_intercept`` is set to False.\n If True, the regressors X will be normalized before regression by\n subtracting the mean and dividing by the l2-norm.\n If you wish to standardize, please use\n :class:`~sklearn.preprocessing.StandardScaler` before calling ``fit``\n on an estimator with ``normalize=False``.\n\n .. deprecated:: 1.0\n ``normalize`` was deprecated in version 1.0. It will default\n to False in 1.2 and be removed in 1.4.\n\n precompute : bool, 'auto' or array-like, default='auto'\n Whether to use a precomputed Gram matrix to speed up\n calculations. If set to ``'auto'`` let us decide. The Gram\n matrix can also be passed as argument.\n\n max_iter : int, default=500\n Maximum number of iterations to perform. Can be used for\n early stopping.\n\n eps : float, default=np.finfo(float).eps\n The machine-precision regularization in the computation of the\n Cholesky diagonal factors. Increase this for very ill-conditioned\n systems. Unlike the ``tol`` parameter in some iterative\n optimization-based algorithms, this parameter does not control\n the tolerance of the optimization.\n\n copy_X : bool, default=True\n If True, X will be copied; else, it may be overwritten.\n\n positive : bool, default=False\n Restrict coefficients to be >= 0. 
Be aware that you might want to\n remove fit_intercept which is set True by default.\n Under the positive restriction the model coefficients do not converge\n to the ordinary-least-squares solution for small values of alpha.\n Only coefficients up to the smallest alpha value (``alphas_[alphas_ >\n 0.].min()`` when fit_path=True) reached by the stepwise Lars-Lasso\n algorithm are typically in congruence with the solution of the\n coordinate descent Lasso estimator.\n As a consequence using LassoLarsIC only makes sense for problems where\n a sparse solution is expected and/or reached.\n\n Attributes\n ----------\n coef_ : array-like of shape (n_features,)\n parameter vector (w in the formulation formula)\n\n intercept_ : float\n independent term in decision function.\n\n alpha_ : float\n the alpha parameter chosen by the information criterion\n\n alphas_ : array-like of shape (n_alphas + 1,) or list of such arrays\n Maximum of covariances (in absolute value) at each iteration.\n ``n_alphas`` is either ``max_iter``, ``n_features`` or the\n number of nodes in the path with ``alpha >= alpha_min``, whichever\n is smaller. If a list, it will be of length `n_targets`.\n\n n_iter_ : int\n number of iterations run by lars_path to find the grid of\n alphas.\n\n criterion_ : array-like of shape (n_alphas,)\n The value of the information criteria ('aic', 'bic') across all\n alphas. The alpha which has the smallest information criterion is\n chosen. This value is larger by a factor of ``n_samples`` compared to\n Eqns. 2.15 and 2.16 in (Zou et al, 2007).\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n lars_path : Compute Least Angle Regression or Lasso\n path using LARS algorithm.\n lasso_path : Compute Lasso path with coordinate descent.\n Lasso : Linear Model trained with L1 prior as\n regularizer (aka the Lasso).\n LassoCV : Lasso linear model with iterative fitting\n along a regularization path.\n LassoLars : Lasso model fit with Least Angle Regression a.k.a. Lars.\n LassoLarsCV: Cross-validated Lasso, using the LARS algorithm.\n sklearn.decomposition.sparse_encode : Sparse coding.\n\n Notes\n -----\n The estimation of the number of degrees of freedom is given by:\n\n \"On the degrees of freedom of the lasso\"\n Hui Zou, Trevor Hastie, and Robert Tibshirani\n Ann. Statist. Volume 35, Number 5 (2007), 2173-2192.\n\n https://en.wikipedia.org/wiki/Akaike_information_criterion\n https://en.wikipedia.org/wiki/Bayesian_information_criterion\n\n Examples\n --------\n >>> from sklearn import linear_model\n >>> reg = linear_model.LassoLarsIC(criterion='bic', normalize=False)\n >>> reg.fit([[-1, 1], [0, 0], [1, 1]], [-1.1111, 0, -1.1111])\n LassoLarsIC(criterion='bic', normalize=False)\n >>> print(reg.coef_)\n [ 0. 
-1.11...]\n \"\"\"\n \n def __init__(self, criterion='aic', *, fit_intercept=True, verbose=False, normalize='deprecated', precompute='auto', max_iter=500, eps=np.finfo(float).eps, copy_X=True, positive=False):\n self.criterion = criterion\n self.fit_intercept = fit_intercept\n self.positive = positive\n self.max_iter = max_iter\n self.verbose = verbose\n self.normalize = normalize\n self.copy_X = copy_X\n self.precompute = precompute\n self.eps = eps\n self.fit_path = True\n \n def _more_tags(self):\n return {'multioutput': False}\n \n def fit(self, X, y, copy_X=None):\n \"\"\"Fit the model using X, y as training data.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training data.\n\n y : array-like of shape (n_samples,)\n Target values. Will be cast to X's dtype if necessary.\n\n copy_X : bool, default=None\n If provided, this parameter will override the choice\n of copy_X made at instance creation.\n If ``True``, X will be copied; else, it may be overwritten.\n\n Returns\n -------\n self : object\n Returns an instance of self.\n \"\"\"\n _normalize = _deprecate_normalize(self.normalize, default=True, estimator_name=self.__class__.__name__)\n if copy_X is None:\n copy_X = self.copy_X\n (X, y) = self._validate_data(X, y, y_numeric=True)\n (X, y, Xmean, ymean, Xstd) = LinearModel._preprocess_data(X, y, self.fit_intercept, _normalize, copy_X)\n Gram = self.precompute\n (alphas_, _, coef_path_, self.n_iter_) = lars_path(X, y, Gram=Gram, copy_X=copy_X, copy_Gram=True, alpha_min=0.0, method='lasso', verbose=self.verbose, max_iter=self.max_iter, eps=self.eps, return_n_iter=True, positive=self.positive)\n n_samples = X.shape[0]\n if self.criterion == 'aic':\n K = 2\n elif self.criterion == 'bic':\n K = log(n_samples)\n else:\n raise ValueError('criterion should be either bic or aic')\n R = y[:, np.newaxis] - np.dot(X, coef_path_)\n mean_squared_error = np.mean(R**2, axis=0)\n sigma2 = np.var(y)\n df = np.zeros(coef_path_.shape[1], dtype=int)\n for (k, coef) in enumerate(coef_path_.T):\n mask = np.abs(coef) > np.finfo(coef.dtype).eps\n if not np.any(mask):\n continue\n df[k] = np.sum(mask)\n self.alphas_ = alphas_\n eps64 = np.finfo('float64').eps\n self.criterion_ = n_samples * mean_squared_error / (sigma2 + eps64) + K * df\n n_best = np.argmin(self.criterion_)\n self.alpha_ = alphas_[n_best]\n self.coef_ = coef_path_[:, n_best]\n self._set_intercept(Xmean, ymean, Xstd)\n return self\n" + "description": "Lasso model fit with Lars using BIC or AIC for model selection.\n\nThe optimization objective for Lasso is::\n\n(1 / (2 * n_samples)) * ||y - Xw||^2_2 + alpha * ||w||_1\n\nAIC is the Akaike information criterion [2]_ and BIC is the Bayes\nInformation criterion [3]_. Such criteria are useful to select the value\nof the regularization parameter by making a trade-off between the\ngoodness of fit and the complexity of the model. A good model should\nexplain well the data while being simple.\n\nRead more in the :ref:`User Guide `.", + "docstring": "Lasso model fit with Lars using BIC or AIC for model selection.\n\n The optimization objective for Lasso is::\n\n (1 / (2 * n_samples)) * ||y - Xw||^2_2 + alpha * ||w||_1\n\n AIC is the Akaike information criterion [2]_ and BIC is the Bayes\n Information criterion [3]_. Such criteria are useful to select the value\n of the regularization parameter by making a trade-off between the\n goodness of fit and the complexity of the model. 
A good model should\n explain well the data while being simple.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n criterion : {'aic', 'bic'}, default='aic'\n The type of criterion to use.\n\n fit_intercept : bool, default=True\n Whether to calculate the intercept for this model. If set\n to false, no intercept will be used in calculations\n (i.e. data is expected to be centered).\n\n verbose : bool or int, default=False\n Sets the verbosity amount.\n\n normalize : bool, default=True\n This parameter is ignored when ``fit_intercept`` is set to False.\n If True, the regressors X will be normalized before regression by\n subtracting the mean and dividing by the l2-norm.\n If you wish to standardize, please use\n :class:`~sklearn.preprocessing.StandardScaler` before calling ``fit``\n on an estimator with ``normalize=False``.\n\n .. deprecated:: 1.0\n ``normalize`` was deprecated in version 1.0. It will default\n to False in 1.2 and be removed in 1.4.\n\n precompute : bool, 'auto' or array-like, default='auto'\n Whether to use a precomputed Gram matrix to speed up\n calculations. If set to ``'auto'`` let us decide. The Gram\n matrix can also be passed as argument.\n\n max_iter : int, default=500\n Maximum number of iterations to perform. Can be used for\n early stopping.\n\n eps : float, default=np.finfo(float).eps\n The machine-precision regularization in the computation of the\n Cholesky diagonal factors. Increase this for very ill-conditioned\n systems. Unlike the ``tol`` parameter in some iterative\n optimization-based algorithms, this parameter does not control\n the tolerance of the optimization.\n\n copy_X : bool, default=True\n If True, X will be copied; else, it may be overwritten.\n\n positive : bool, default=False\n Restrict coefficients to be >= 0. Be aware that you might want to\n remove fit_intercept which is set True by default.\n Under the positive restriction the model coefficients do not converge\n to the ordinary-least-squares solution for small values of alpha.\n Only coefficients up to the smallest alpha value (``alphas_[alphas_ >\n 0.].min()`` when fit_path=True) reached by the stepwise Lars-Lasso\n algorithm are typically in congruence with the solution of the\n coordinate descent Lasso estimator.\n As a consequence using LassoLarsIC only makes sense for problems where\n a sparse solution is expected and/or reached.\n\n noise_variance : float, default=None\n The estimated noise variance of the data. If `None`, an unbiased\n estimate is computed by an OLS model. However, it is only possible\n in the case where `n_samples > n_features + fit_intercept`.\n\n .. versionadded:: 1.1\n\n Attributes\n ----------\n coef_ : array-like of shape (n_features,)\n parameter vector (w in the formulation formula)\n\n intercept_ : float\n independent term in decision function.\n\n alpha_ : float\n the alpha parameter chosen by the information criterion\n\n alphas_ : array-like of shape (n_alphas + 1,) or list of such arrays\n Maximum of covariances (in absolute value) at each iteration.\n ``n_alphas`` is either ``max_iter``, ``n_features`` or the\n number of nodes in the path with ``alpha >= alpha_min``, whichever\n is smaller. If a list, it will be of length `n_targets`.\n\n n_iter_ : int\n number of iterations run by lars_path to find the grid of\n alphas.\n\n criterion_ : array-like of shape (n_alphas,)\n The value of the information criteria ('aic', 'bic') across all\n alphas. 
The alpha which has the smallest information criterion is\n chosen, as specified in [1]_.\n\n noise_variance_ : float\n The estimated noise variance from the data used to compute the\n criterion.\n\n .. versionadded:: 1.1\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n lars_path : Compute Least Angle Regression or Lasso\n path using LARS algorithm.\n lasso_path : Compute Lasso path with coordinate descent.\n Lasso : Linear Model trained with L1 prior as\n regularizer (aka the Lasso).\n LassoCV : Lasso linear model with iterative fitting\n along a regularization path.\n LassoLars : Lasso model fit with Least Angle Regression a.k.a. Lars.\n LassoLarsCV: Cross-validated Lasso, using the LARS algorithm.\n sklearn.decomposition.sparse_encode : Sparse coding.\n\n Notes\n -----\n The number of degrees of freedom is computed as in [1]_.\n\n To have more details regarding the mathematical formulation of the\n AIC and BIC criteria, please refer to :ref:`User Guide `.\n\n References\n ----------\n .. [1] :arxiv:`Zou, Hui, Trevor Hastie, and Robert Tibshirani.\n \"On the degrees of freedom of the lasso.\"\n The Annals of Statistics 35.5 (2007): 2173-2192.\n <0712.0881>`\n\n .. [2] `Wikipedia entry on the Akaike information criterion\n `_\n\n .. [3] `Wikipedia entry on the Bayesian information criterion\n `_\n\n Examples\n --------\n >>> from sklearn import linear_model\n >>> reg = linear_model.LassoLarsIC(criterion='bic', normalize=False)\n >>> X = [[-2, 2], [-1, 1], [0, 0], [1, 1], [2, 2]]\n >>> y = [-2.2222, -1.1111, 0, -1.1111, -2.2222]\n >>> reg.fit(X, y)\n LassoLarsIC(criterion='bic', normalize=False)\n >>> print(reg.coef_)\n [ 0. -1.11...]\n ", + "source_code": "\n\nclass LassoLarsIC(LassoLars):\n \"\"\"Lasso model fit with Lars using BIC or AIC for model selection.\n\n The optimization objective for Lasso is::\n\n (1 / (2 * n_samples)) * ||y - Xw||^2_2 + alpha * ||w||_1\n\n AIC is the Akaike information criterion [2]_ and BIC is the Bayes\n Information criterion [3]_. Such criteria are useful to select the value\n of the regularization parameter by making a trade-off between the\n goodness of fit and the complexity of the model. A good model should\n explain well the data while being simple.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n criterion : {'aic', 'bic'}, default='aic'\n The type of criterion to use.\n\n fit_intercept : bool, default=True\n Whether to calculate the intercept for this model. If set\n to false, no intercept will be used in calculations\n (i.e. data is expected to be centered).\n\n verbose : bool or int, default=False\n Sets the verbosity amount.\n\n normalize : bool, default=True\n This parameter is ignored when ``fit_intercept`` is set to False.\n If True, the regressors X will be normalized before regression by\n subtracting the mean and dividing by the l2-norm.\n If you wish to standardize, please use\n :class:`~sklearn.preprocessing.StandardScaler` before calling ``fit``\n on an estimator with ``normalize=False``.\n\n .. deprecated:: 1.0\n ``normalize`` was deprecated in version 1.0. It will default\n to False in 1.2 and be removed in 1.4.\n\n precompute : bool, 'auto' or array-like, default='auto'\n Whether to use a precomputed Gram matrix to speed up\n calculations. 
If set to ``'auto'`` let us decide. The Gram\n matrix can also be passed as argument.\n\n max_iter : int, default=500\n Maximum number of iterations to perform. Can be used for\n early stopping.\n\n eps : float, default=np.finfo(float).eps\n The machine-precision regularization in the computation of the\n Cholesky diagonal factors. Increase this for very ill-conditioned\n systems. Unlike the ``tol`` parameter in some iterative\n optimization-based algorithms, this parameter does not control\n the tolerance of the optimization.\n\n copy_X : bool, default=True\n If True, X will be copied; else, it may be overwritten.\n\n positive : bool, default=False\n Restrict coefficients to be >= 0. Be aware that you might want to\n remove fit_intercept which is set True by default.\n Under the positive restriction the model coefficients do not converge\n to the ordinary-least-squares solution for small values of alpha.\n Only coefficients up to the smallest alpha value (``alphas_[alphas_ >\n 0.].min()`` when fit_path=True) reached by the stepwise Lars-Lasso\n algorithm are typically in congruence with the solution of the\n coordinate descent Lasso estimator.\n As a consequence using LassoLarsIC only makes sense for problems where\n a sparse solution is expected and/or reached.\n\n noise_variance : float, default=None\n The estimated noise variance of the data. If `None`, an unbiased\n estimate is computed by an OLS model. However, it is only possible\n in the case where `n_samples > n_features + fit_intercept`.\n\n .. versionadded:: 1.1\n\n Attributes\n ----------\n coef_ : array-like of shape (n_features,)\n parameter vector (w in the formulation formula)\n\n intercept_ : float\n independent term in decision function.\n\n alpha_ : float\n the alpha parameter chosen by the information criterion\n\n alphas_ : array-like of shape (n_alphas + 1,) or list of such arrays\n Maximum of covariances (in absolute value) at each iteration.\n ``n_alphas`` is either ``max_iter``, ``n_features`` or the\n number of nodes in the path with ``alpha >= alpha_min``, whichever\n is smaller. If a list, it will be of length `n_targets`.\n\n n_iter_ : int\n number of iterations run by lars_path to find the grid of\n alphas.\n\n criterion_ : array-like of shape (n_alphas,)\n The value of the information criteria ('aic', 'bic') across all\n alphas. The alpha which has the smallest information criterion is\n chosen, as specified in [1]_.\n\n noise_variance_ : float\n The estimated noise variance from the data used to compute the\n criterion.\n\n .. versionadded:: 1.1\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n lars_path : Compute Least Angle Regression or Lasso\n path using LARS algorithm.\n lasso_path : Compute Lasso path with coordinate descent.\n Lasso : Linear Model trained with L1 prior as\n regularizer (aka the Lasso).\n LassoCV : Lasso linear model with iterative fitting\n along a regularization path.\n LassoLars : Lasso model fit with Least Angle Regression a.k.a. 
Lars.\n LassoLarsCV: Cross-validated Lasso, using the LARS algorithm.\n sklearn.decomposition.sparse_encode : Sparse coding.\n\n Notes\n -----\n The number of degrees of freedom is computed as in [1]_.\n\n To have more details regarding the mathematical formulation of the\n AIC and BIC criteria, please refer to :ref:`User Guide `.\n\n References\n ----------\n .. [1] :arxiv:`Zou, Hui, Trevor Hastie, and Robert Tibshirani.\n \"On the degrees of freedom of the lasso.\"\n The Annals of Statistics 35.5 (2007): 2173-2192.\n <0712.0881>`\n\n .. [2] `Wikipedia entry on the Akaike information criterion\n `_\n\n .. [3] `Wikipedia entry on the Bayesian information criterion\n `_\n\n Examples\n --------\n >>> from sklearn import linear_model\n >>> reg = linear_model.LassoLarsIC(criterion='bic', normalize=False)\n >>> X = [[-2, 2], [-1, 1], [0, 0], [1, 1], [2, 2]]\n >>> y = [-2.2222, -1.1111, 0, -1.1111, -2.2222]\n >>> reg.fit(X, y)\n LassoLarsIC(criterion='bic', normalize=False)\n >>> print(reg.coef_)\n [ 0. -1.11...]\n \"\"\"\n \n def __init__(self, criterion='aic', *, fit_intercept=True, verbose=False, normalize='deprecated', precompute='auto', max_iter=500, eps=np.finfo(float).eps, copy_X=True, positive=False, noise_variance=None):\n self.criterion = criterion\n self.fit_intercept = fit_intercept\n self.positive = positive\n self.max_iter = max_iter\n self.verbose = verbose\n self.normalize = normalize\n self.copy_X = copy_X\n self.precompute = precompute\n self.eps = eps\n self.fit_path = True\n self.noise_variance = noise_variance\n \n def _more_tags(self):\n return {'multioutput': False}\n \n def fit(self, X, y, copy_X=None):\n \"\"\"Fit the model using X, y as training data.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training data.\n\n y : array-like of shape (n_samples,)\n Target values. 
Will be cast to X's dtype if necessary.\n\n copy_X : bool, default=None\n If provided, this parameter will override the choice\n of copy_X made at instance creation.\n If ``True``, X will be copied; else, it may be overwritten.\n\n Returns\n -------\n self : object\n Returns an instance of self.\n \"\"\"\n _normalize = _deprecate_normalize(self.normalize, default=True, estimator_name=self.__class__.__name__)\n if copy_X is None:\n copy_X = self.copy_X\n (X, y) = self._validate_data(X, y, y_numeric=True)\n (X, y, Xmean, ymean, Xstd) = LinearModel._preprocess_data(X, y, self.fit_intercept, _normalize, copy_X)\n Gram = self.precompute\n (alphas_, _, coef_path_, self.n_iter_) = lars_path(X, y, Gram=Gram, copy_X=copy_X, copy_Gram=True, alpha_min=0.0, method='lasso', verbose=self.verbose, max_iter=self.max_iter, eps=self.eps, return_n_iter=True, positive=self.positive)\n n_samples = X.shape[0]\n if self.criterion == 'aic':\n criterion_factor = 2\n elif self.criterion == 'bic':\n criterion_factor = log(n_samples)\n else:\n raise ValueError(f'criterion should be either bic or aic, got {self.criterion!r}')\n residuals = y[:, np.newaxis] - np.dot(X, coef_path_)\n residuals_sum_squares = np.sum(residuals**2, axis=0)\n degrees_of_freedom = np.zeros(coef_path_.shape[1], dtype=int)\n for (k, coef) in enumerate(coef_path_.T):\n mask = np.abs(coef) > np.finfo(coef.dtype).eps\n if not np.any(mask):\n continue\n degrees_of_freedom[k] = np.sum(mask)\n self.alphas_ = alphas_\n if self.noise_variance is None:\n self.noise_variance_ = self._estimate_noise_variance(X, y, positive=self.positive)\n else:\n self.noise_variance_ = self.noise_variance\n self.criterion_ = n_samples * np.log(2 * np.pi * self.noise_variance_) + residuals_sum_squares / self.noise_variance_ + criterion_factor * degrees_of_freedom\n n_best = np.argmin(self.criterion_)\n self.alpha_ = alphas_[n_best]\n self.coef_ = coef_path_[:, n_best]\n self._set_intercept(Xmean, ymean, Xstd)\n return self\n \n def _estimate_noise_variance(self, X, y, positive):\n \"\"\"Compute an estimate of the variance with an OLS model.\n\n Parameters\n ----------\n X : ndarray of shape (n_samples, n_features)\n Data to be fitted by the OLS model. We expect the data to be\n centered.\n\n y : ndarray of shape (n_samples,)\n Associated target.\n\n positive : bool, default=False\n Restrict coefficients to be >= 0. This should be inline with\n the `positive` parameter from `LassoLarsIC`.\n\n Returns\n -------\n noise_variance : float\n An estimator of the noise variance of an OLS model.\n \"\"\"\n if X.shape[0] <= X.shape[1] + self.fit_intercept:\n raise ValueError(f'You are using {self.__class__.__name__} in the case where the number of samples is smaller than the number of features. In this setting, getting a good estimate for the variance of the noise is not possible. Provide an estimate of the noise variance in the constructor.')\n ols_model = LinearRegression(positive=positive, fit_intercept=False)\n y_pred = ols_model.fit(X, y).predict(X)\n return np.sum((y - y_pred)**2) / (X.shape[0] - X.shape[1] - self.fit_intercept)\n" }, { "name": "LogisticRegression", @@ -23772,7 +23840,7 @@ "sklearn.linear_model._logistic.LogisticRegression.predict_log_proba" ], "is_public": true, - "description": "Logistic Regression (aka logit, MaxEnt) classifier.\n\nIn the multiclass case, the training algorithm uses the one-vs-rest (OvR) scheme if the 'multi_class' option is set to 'ovr', and uses the cross-entropy loss if the 'multi_class' option is set to 'multinomial'. 
(Currently the 'multinomial' option is supported only by the 'lbfgs', 'sag', 'saga' and 'newton-cg' solvers.) This class implements regularized logistic regression using the 'liblinear' library, 'newton-cg', 'sag', 'saga' and 'lbfgs' solvers. **Note that regularization is applied by default**. It can handle both dense and sparse input. Use C-ordered arrays or CSR matrices containing 64-bit floats for optimal performance; any other input format will be converted (and copied). The 'newton-cg', 'sag', and 'lbfgs' solvers support only L2 regularization with primal formulation, or no regularization. The 'liblinear' solver supports both L1 and L2 regularization, with a dual formulation only for the L2 penalty. The Elastic-Net regularization is only supported by the 'saga' solver. Read more in the :ref:`User Guide `.", + "description": "Logistic Regression (aka logit, MaxEnt) classifier.\n\nIn the multiclass case, the training algorithm uses the one-vs-rest (OvR)\nscheme if the 'multi_class' option is set to 'ovr', and uses the\ncross-entropy loss if the 'multi_class' option is set to 'multinomial'.\n(Currently the 'multinomial' option is supported only by the 'lbfgs',\n'sag', 'saga' and 'newton-cg' solvers.)\n\nThis class implements regularized logistic regression using the\n'liblinear' library, 'newton-cg', 'sag', 'saga' and 'lbfgs' solvers. **Note\nthat regularization is applied by default**. It can handle both dense\nand sparse input. Use C-ordered arrays or CSR matrices containing 64-bit\nfloats for optimal performance; any other input format will be converted\n(and copied).\n\nThe 'newton-cg', 'sag', and 'lbfgs' solvers support only L2 regularization\nwith primal formulation, or no regularization. The 'liblinear' solver\nsupports both L1 and L2 regularization, with a dual formulation only for\nthe L2 penalty. The Elastic-Net regularization is only supported by the\n'saga' solver.\n\nRead more in the :ref:`User Guide `.", "docstring": "\n Logistic Regression (aka logit, MaxEnt) classifier.\n\n In the multiclass case, the training algorithm uses the one-vs-rest (OvR)\n scheme if the 'multi_class' option is set to 'ovr', and uses the\n cross-entropy loss if the 'multi_class' option is set to 'multinomial'.\n (Currently the 'multinomial' option is supported only by the 'lbfgs',\n 'sag', 'saga' and 'newton-cg' solvers.)\n\n This class implements regularized logistic regression using the\n 'liblinear' library, 'newton-cg', 'sag', 'saga' and 'lbfgs' solvers. **Note\n that regularization is applied by default**. It can handle both dense\n and sparse input. Use C-ordered arrays or CSR matrices containing 64-bit\n floats for optimal performance; any other input format will be converted\n (and copied).\n\n The 'newton-cg', 'sag', and 'lbfgs' solvers support only L2 regularization\n with primal formulation, or no regularization. The 'liblinear' solver\n supports both L1 and L2 regularization, with a dual formulation only for\n the L2 penalty. The Elastic-Net regularization is only supported by the\n 'saga' solver.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n penalty : {'l1', 'l2', 'elasticnet', 'none'}, default='l2'\n Specify the norm of the penalty:\n\n - `'none'`: no penalty is added;\n - `'l2'`: add a L2 penalty term and it is the default choice;\n - `'l1'`: add a L1 penalty term;\n - `'elasticnet'`: both L1 and L2 penalty terms are added.\n\n .. warning::\n Some penalties may not work with some solvers. 
See the parameter\n `solver` below, to know the compatibility between the penalty and\n solver.\n\n .. versionadded:: 0.19\n l1 penalty with SAGA solver (allowing 'multinomial' + L1)\n\n dual : bool, default=False\n Dual or primal formulation. Dual formulation is only implemented for\n l2 penalty with liblinear solver. Prefer dual=False when\n n_samples > n_features.\n\n tol : float, default=1e-4\n Tolerance for stopping criteria.\n\n C : float, default=1.0\n Inverse of regularization strength; must be a positive float.\n Like in support vector machines, smaller values specify stronger\n regularization.\n\n fit_intercept : bool, default=True\n Specifies if a constant (a.k.a. bias or intercept) should be\n added to the decision function.\n\n intercept_scaling : float, default=1\n Useful only when the solver 'liblinear' is used\n and self.fit_intercept is set to True. In this case, x becomes\n [x, self.intercept_scaling],\n i.e. a \"synthetic\" feature with constant value equal to\n intercept_scaling is appended to the instance vector.\n The intercept becomes ``intercept_scaling * synthetic_feature_weight``.\n\n Note! the synthetic feature weight is subject to l1/l2 regularization\n as all other features.\n To lessen the effect of regularization on synthetic feature weight\n (and therefore on the intercept) intercept_scaling has to be increased.\n\n class_weight : dict or 'balanced', default=None\n Weights associated with classes in the form ``{class_label: weight}``.\n If not given, all classes are supposed to have weight one.\n\n The \"balanced\" mode uses the values of y to automatically adjust\n weights inversely proportional to class frequencies in the input data\n as ``n_samples / (n_classes * np.bincount(y))``.\n\n Note that these weights will be multiplied with sample_weight (passed\n through the fit method) if sample_weight is specified.\n\n .. versionadded:: 0.17\n *class_weight='balanced'*\n\n random_state : int, RandomState instance, default=None\n Used when ``solver`` == 'sag', 'saga' or 'liblinear' to shuffle the\n data. See :term:`Glossary ` for details.\n\n solver : {'newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'}, default='lbfgs'\n\n Algorithm to use in the optimization problem. Default is 'lbfgs'.\n To choose a solver, you might want to consider the following aspects:\n\n - For small datasets, 'liblinear' is a good choice, whereas 'sag'\n and 'saga' are faster for large ones;\n - For multiclass problems, only 'newton-cg', 'sag', 'saga' and\n 'lbfgs' handle multinomial loss;\n - 'liblinear' is limited to one-versus-rest schemes.\n\n .. warning::\n The choice of the algorithm depends on the penalty chosen:\n Supported penalties by solver:\n\n - 'newton-cg' - ['l2', 'none']\n - 'lbfgs' - ['l2', 'none']\n - 'liblinear' - ['l1', 'l2']\n - 'sag' - ['l2', 'none']\n - 'saga' - ['elasticnet', 'l1', 'l2', 'none']\n\n .. note::\n 'sag' and 'saga' fast convergence is only guaranteed on\n features with approximately the same scale. You can\n preprocess the data with a scaler from :mod:`sklearn.preprocessing`.\n\n .. seealso::\n Refer to the User Guide for more information regarding\n :class:`LogisticRegression` and more specifically the\n `Table `_\n summarazing solver/penalty supports.\n \n\n .. versionadded:: 0.17\n Stochastic Average Gradient descent solver.\n .. versionadded:: 0.19\n SAGA solver.\n .. 
versionchanged:: 0.22\n The default solver changed from 'liblinear' to 'lbfgs' in 0.22.\n\n max_iter : int, default=100\n Maximum number of iterations taken for the solvers to converge.\n\n multi_class : {'auto', 'ovr', 'multinomial'}, default='auto'\n If the option chosen is 'ovr', then a binary problem is fit for each\n label. For 'multinomial' the loss minimised is the multinomial loss fit\n across the entire probability distribution, *even when the data is\n binary*. 'multinomial' is unavailable when solver='liblinear'.\n 'auto' selects 'ovr' if the data is binary, or if solver='liblinear',\n and otherwise selects 'multinomial'.\n\n .. versionadded:: 0.18\n Stochastic Average Gradient descent solver for 'multinomial' case.\n .. versionchanged:: 0.22\n Default changed from 'ovr' to 'auto' in 0.22.\n\n verbose : int, default=0\n For the liblinear and lbfgs solvers set verbose to any positive\n number for verbosity.\n\n warm_start : bool, default=False\n When set to True, reuse the solution of the previous call to fit as\n initialization, otherwise, just erase the previous solution.\n Useless for liblinear solver. See :term:`the Glossary `.\n\n .. versionadded:: 0.17\n *warm_start* to support *lbfgs*, *newton-cg*, *sag*, *saga* solvers.\n\n n_jobs : int, default=None\n Number of CPU cores used when parallelizing over classes if\n multi_class='ovr'\". This parameter is ignored when the ``solver`` is\n set to 'liblinear' regardless of whether 'multi_class' is specified or\n not. ``None`` means 1 unless in a :obj:`joblib.parallel_backend`\n context. ``-1`` means using all processors.\n See :term:`Glossary ` for more details.\n\n l1_ratio : float, default=None\n The Elastic-Net mixing parameter, with ``0 <= l1_ratio <= 1``. Only\n used if ``penalty='elasticnet'``. Setting ``l1_ratio=0`` is equivalent\n to using ``penalty='l2'``, while setting ``l1_ratio=1`` is equivalent\n to using ``penalty='l1'``. For ``0 < l1_ratio <1``, the penalty is a\n combination of L1 and L2.\n\n Attributes\n ----------\n\n classes_ : ndarray of shape (n_classes, )\n A list of class labels known to the classifier.\n\n coef_ : ndarray of shape (1, n_features) or (n_classes, n_features)\n Coefficient of the features in the decision function.\n\n `coef_` is of shape (1, n_features) when the given problem is binary.\n In particular, when `multi_class='multinomial'`, `coef_` corresponds\n to outcome 1 (True) and `-coef_` corresponds to outcome 0 (False).\n\n intercept_ : ndarray of shape (1,) or (n_classes,)\n Intercept (a.k.a. bias) added to the decision function.\n\n If `fit_intercept` is set to False, the intercept is set to zero.\n `intercept_` is of shape (1,) when the given problem is binary.\n In particular, when `multi_class='multinomial'`, `intercept_`\n corresponds to outcome 1 (True) and `-intercept_` corresponds to\n outcome 0 (False).\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n n_iter_ : ndarray of shape (n_classes,) or (1, )\n Actual number of iterations for all classes. If binary or multinomial,\n it returns only 1 element. For liblinear solver, only the maximum\n number of iteration across all classes is given.\n\n .. versionchanged:: 0.20\n\n In SciPy <= 1.0.0 the number of lbfgs iterations may exceed\n ``max_iter``. 
``n_iter_`` will now report at most ``max_iter``.\n\n See Also\n --------\n SGDClassifier : Incrementally trained logistic regression (when given\n the parameter ``loss=\"log\"``).\n LogisticRegressionCV : Logistic regression with built-in cross validation.\n\n Notes\n -----\n The underlying C implementation uses a random number generator to\n select features when fitting the model. It is thus not uncommon,\n to have slightly different results for the same input data. If\n that happens, try with a smaller tol parameter.\n\n Predict output may not match that of standalone liblinear in certain\n cases. See :ref:`differences from liblinear `\n in the narrative documentation.\n\n References\n ----------\n\n L-BFGS-B -- Software for Large-scale Bound-constrained Optimization\n Ciyou Zhu, Richard Byrd, Jorge Nocedal and Jose Luis Morales.\n http://users.iems.northwestern.edu/~nocedal/lbfgsb.html\n\n LIBLINEAR -- A Library for Large Linear Classification\n https://www.csie.ntu.edu.tw/~cjlin/liblinear/\n\n SAG -- Mark Schmidt, Nicolas Le Roux, and Francis Bach\n Minimizing Finite Sums with the Stochastic Average Gradient\n https://hal.inria.fr/hal-00860051/document\n\n SAGA -- Defazio, A., Bach F. & Lacoste-Julien S. (2014).\n SAGA: A Fast Incremental Gradient Method With Support\n for Non-Strongly Convex Composite Objectives\n https://arxiv.org/abs/1407.0202\n\n Hsiang-Fu Yu, Fang-Lan Huang, Chih-Jen Lin (2011). Dual coordinate descent\n methods for logistic regression and maximum entropy models.\n Machine Learning 85(1-2):41-75.\n https://www.csie.ntu.edu.tw/~cjlin/papers/maxent_dual.pdf\n\n Examples\n --------\n >>> from sklearn.datasets import load_iris\n >>> from sklearn.linear_model import LogisticRegression\n >>> X, y = load_iris(return_X_y=True)\n >>> clf = LogisticRegression(random_state=0).fit(X, y)\n >>> clf.predict(X[:2, :])\n array([0, 0])\n >>> clf.predict_proba(X[:2, :])\n array([[9.8...e-01, 1.8...e-02, 1.4...e-08],\n [9.7...e-01, 2.8...e-02, ...e-08]])\n >>> clf.score(X, y)\n 0.97...\n ", "source_code": "\n\nclass LogisticRegression(LinearClassifierMixin, SparseCoefMixin, BaseEstimator):\n \"\"\"\n Logistic Regression (aka logit, MaxEnt) classifier.\n\n In the multiclass case, the training algorithm uses the one-vs-rest (OvR)\n scheme if the 'multi_class' option is set to 'ovr', and uses the\n cross-entropy loss if the 'multi_class' option is set to 'multinomial'.\n (Currently the 'multinomial' option is supported only by the 'lbfgs',\n 'sag', 'saga' and 'newton-cg' solvers.)\n\n This class implements regularized logistic regression using the\n 'liblinear' library, 'newton-cg', 'sag', 'saga' and 'lbfgs' solvers. **Note\n that regularization is applied by default**. It can handle both dense\n and sparse input. Use C-ordered arrays or CSR matrices containing 64-bit\n floats for optimal performance; any other input format will be converted\n (and copied).\n\n The 'newton-cg', 'sag', and 'lbfgs' solvers support only L2 regularization\n with primal formulation, or no regularization. The 'liblinear' solver\n supports both L1 and L2 regularization, with a dual formulation only for\n the L2 penalty. 
The Elastic-Net regularization is only supported by the\n 'saga' solver.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n penalty : {'l1', 'l2', 'elasticnet', 'none'}, default='l2'\n Specify the norm of the penalty:\n\n - `'none'`: no penalty is added;\n - `'l2'`: add a L2 penalty term and it is the default choice;\n - `'l1'`: add a L1 penalty term;\n - `'elasticnet'`: both L1 and L2 penalty terms are added.\n\n .. warning::\n Some penalties may not work with some solvers. See the parameter\n `solver` below, to know the compatibility between the penalty and\n solver.\n\n .. versionadded:: 0.19\n l1 penalty with SAGA solver (allowing 'multinomial' + L1)\n\n dual : bool, default=False\n Dual or primal formulation. Dual formulation is only implemented for\n l2 penalty with liblinear solver. Prefer dual=False when\n n_samples > n_features.\n\n tol : float, default=1e-4\n Tolerance for stopping criteria.\n\n C : float, default=1.0\n Inverse of regularization strength; must be a positive float.\n Like in support vector machines, smaller values specify stronger\n regularization.\n\n fit_intercept : bool, default=True\n Specifies if a constant (a.k.a. bias or intercept) should be\n added to the decision function.\n\n intercept_scaling : float, default=1\n Useful only when the solver 'liblinear' is used\n and self.fit_intercept is set to True. In this case, x becomes\n [x, self.intercept_scaling],\n i.e. a \"synthetic\" feature with constant value equal to\n intercept_scaling is appended to the instance vector.\n The intercept becomes ``intercept_scaling * synthetic_feature_weight``.\n\n Note! the synthetic feature weight is subject to l1/l2 regularization\n as all other features.\n To lessen the effect of regularization on synthetic feature weight\n (and therefore on the intercept) intercept_scaling has to be increased.\n\n class_weight : dict or 'balanced', default=None\n Weights associated with classes in the form ``{class_label: weight}``.\n If not given, all classes are supposed to have weight one.\n\n The \"balanced\" mode uses the values of y to automatically adjust\n weights inversely proportional to class frequencies in the input data\n as ``n_samples / (n_classes * np.bincount(y))``.\n\n Note that these weights will be multiplied with sample_weight (passed\n through the fit method) if sample_weight is specified.\n\n .. versionadded:: 0.17\n *class_weight='balanced'*\n\n random_state : int, RandomState instance, default=None\n Used when ``solver`` == 'sag', 'saga' or 'liblinear' to shuffle the\n data. See :term:`Glossary ` for details.\n\n solver : {'newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'}, default='lbfgs'\n\n Algorithm to use in the optimization problem. Default is 'lbfgs'.\n To choose a solver, you might want to consider the following aspects:\n\n - For small datasets, 'liblinear' is a good choice, whereas 'sag'\n and 'saga' are faster for large ones;\n - For multiclass problems, only 'newton-cg', 'sag', 'saga' and\n 'lbfgs' handle multinomial loss;\n - 'liblinear' is limited to one-versus-rest schemes.\n\n .. warning::\n The choice of the algorithm depends on the penalty chosen:\n Supported penalties by solver:\n\n - 'newton-cg' - ['l2', 'none']\n - 'lbfgs' - ['l2', 'none']\n - 'liblinear' - ['l1', 'l2']\n - 'sag' - ['l2', 'none']\n - 'saga' - ['elasticnet', 'l1', 'l2', 'none']\n\n .. note::\n 'sag' and 'saga' fast convergence is only guaranteed on\n features with approximately the same scale. 
You can\n preprocess the data with a scaler from :mod:`sklearn.preprocessing`.\n\n .. seealso::\n Refer to the User Guide for more information regarding\n :class:`LogisticRegression` and more specifically the\n `Table `_\n summarazing solver/penalty supports.\n \n\n .. versionadded:: 0.17\n Stochastic Average Gradient descent solver.\n .. versionadded:: 0.19\n SAGA solver.\n .. versionchanged:: 0.22\n The default solver changed from 'liblinear' to 'lbfgs' in 0.22.\n\n max_iter : int, default=100\n Maximum number of iterations taken for the solvers to converge.\n\n multi_class : {'auto', 'ovr', 'multinomial'}, default='auto'\n If the option chosen is 'ovr', then a binary problem is fit for each\n label. For 'multinomial' the loss minimised is the multinomial loss fit\n across the entire probability distribution, *even when the data is\n binary*. 'multinomial' is unavailable when solver='liblinear'.\n 'auto' selects 'ovr' if the data is binary, or if solver='liblinear',\n and otherwise selects 'multinomial'.\n\n .. versionadded:: 0.18\n Stochastic Average Gradient descent solver for 'multinomial' case.\n .. versionchanged:: 0.22\n Default changed from 'ovr' to 'auto' in 0.22.\n\n verbose : int, default=0\n For the liblinear and lbfgs solvers set verbose to any positive\n number for verbosity.\n\n warm_start : bool, default=False\n When set to True, reuse the solution of the previous call to fit as\n initialization, otherwise, just erase the previous solution.\n Useless for liblinear solver. See :term:`the Glossary `.\n\n .. versionadded:: 0.17\n *warm_start* to support *lbfgs*, *newton-cg*, *sag*, *saga* solvers.\n\n n_jobs : int, default=None\n Number of CPU cores used when parallelizing over classes if\n multi_class='ovr'\". This parameter is ignored when the ``solver`` is\n set to 'liblinear' regardless of whether 'multi_class' is specified or\n not. ``None`` means 1 unless in a :obj:`joblib.parallel_backend`\n context. ``-1`` means using all processors.\n See :term:`Glossary ` for more details.\n\n l1_ratio : float, default=None\n The Elastic-Net mixing parameter, with ``0 <= l1_ratio <= 1``. Only\n used if ``penalty='elasticnet'``. Setting ``l1_ratio=0`` is equivalent\n to using ``penalty='l2'``, while setting ``l1_ratio=1`` is equivalent\n to using ``penalty='l1'``. For ``0 < l1_ratio <1``, the penalty is a\n combination of L1 and L2.\n\n Attributes\n ----------\n\n classes_ : ndarray of shape (n_classes, )\n A list of class labels known to the classifier.\n\n coef_ : ndarray of shape (1, n_features) or (n_classes, n_features)\n Coefficient of the features in the decision function.\n\n `coef_` is of shape (1, n_features) when the given problem is binary.\n In particular, when `multi_class='multinomial'`, `coef_` corresponds\n to outcome 1 (True) and `-coef_` corresponds to outcome 0 (False).\n\n intercept_ : ndarray of shape (1,) or (n_classes,)\n Intercept (a.k.a. bias) added to the decision function.\n\n If `fit_intercept` is set to False, the intercept is set to zero.\n `intercept_` is of shape (1,) when the given problem is binary.\n In particular, when `multi_class='multinomial'`, `intercept_`\n corresponds to outcome 1 (True) and `-intercept_` corresponds to\n outcome 0 (False).\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. 
versionadded:: 1.0\n\n n_iter_ : ndarray of shape (n_classes,) or (1, )\n Actual number of iterations for all classes. If binary or multinomial,\n it returns only 1 element. For liblinear solver, only the maximum\n number of iteration across all classes is given.\n\n .. versionchanged:: 0.20\n\n In SciPy <= 1.0.0 the number of lbfgs iterations may exceed\n ``max_iter``. ``n_iter_`` will now report at most ``max_iter``.\n\n See Also\n --------\n SGDClassifier : Incrementally trained logistic regression (when given\n the parameter ``loss=\"log\"``).\n LogisticRegressionCV : Logistic regression with built-in cross validation.\n\n Notes\n -----\n The underlying C implementation uses a random number generator to\n select features when fitting the model. It is thus not uncommon,\n to have slightly different results for the same input data. If\n that happens, try with a smaller tol parameter.\n\n Predict output may not match that of standalone liblinear in certain\n cases. See :ref:`differences from liblinear `\n in the narrative documentation.\n\n References\n ----------\n\n L-BFGS-B -- Software for Large-scale Bound-constrained Optimization\n Ciyou Zhu, Richard Byrd, Jorge Nocedal and Jose Luis Morales.\n http://users.iems.northwestern.edu/~nocedal/lbfgsb.html\n\n LIBLINEAR -- A Library for Large Linear Classification\n https://www.csie.ntu.edu.tw/~cjlin/liblinear/\n\n SAG -- Mark Schmidt, Nicolas Le Roux, and Francis Bach\n Minimizing Finite Sums with the Stochastic Average Gradient\n https://hal.inria.fr/hal-00860051/document\n\n SAGA -- Defazio, A., Bach F. & Lacoste-Julien S. (2014).\n SAGA: A Fast Incremental Gradient Method With Support\n for Non-Strongly Convex Composite Objectives\n https://arxiv.org/abs/1407.0202\n\n Hsiang-Fu Yu, Fang-Lan Huang, Chih-Jen Lin (2011). 
Dual coordinate descent\n methods for logistic regression and maximum entropy models.\n Machine Learning 85(1-2):41-75.\n https://www.csie.ntu.edu.tw/~cjlin/papers/maxent_dual.pdf\n\n Examples\n --------\n >>> from sklearn.datasets import load_iris\n >>> from sklearn.linear_model import LogisticRegression\n >>> X, y = load_iris(return_X_y=True)\n >>> clf = LogisticRegression(random_state=0).fit(X, y)\n >>> clf.predict(X[:2, :])\n array([0, 0])\n >>> clf.predict_proba(X[:2, :])\n array([[9.8...e-01, 1.8...e-02, 1.4...e-08],\n [9.7...e-01, 2.8...e-02, ...e-08]])\n >>> clf.score(X, y)\n 0.97...\n \"\"\"\n \n def __init__(self, penalty='l2', *, dual=False, tol=0.0001, C=1.0, fit_intercept=True, intercept_scaling=1, class_weight=None, random_state=None, solver='lbfgs', max_iter=100, multi_class='auto', verbose=0, warm_start=False, n_jobs=None, l1_ratio=None):\n self.penalty = penalty\n self.dual = dual\n self.tol = tol\n self.C = C\n self.fit_intercept = fit_intercept\n self.intercept_scaling = intercept_scaling\n self.class_weight = class_weight\n self.random_state = random_state\n self.solver = solver\n self.max_iter = max_iter\n self.multi_class = multi_class\n self.verbose = verbose\n self.warm_start = warm_start\n self.n_jobs = n_jobs\n self.l1_ratio = l1_ratio\n \n def fit(self, X, y, sample_weight=None):\n \"\"\"\n Fit the model according to the given training data.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training vector, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\n y : array-like of shape (n_samples,)\n Target vector relative to X.\n\n sample_weight : array-like of shape (n_samples,) default=None\n Array of weights that are assigned to individual samples.\n If not provided, then each sample is given unit weight.\n\n .. versionadded:: 0.17\n *sample_weight* support to LogisticRegression.\n\n Returns\n -------\n self\n Fitted estimator.\n\n Notes\n -----\n The SAGA solver supports both float64 and float32 bit arrays.\n \"\"\"\n solver = _check_solver(self.solver, self.penalty, self.dual)\n if not isinstance(self.C, numbers.Number) or self.C < 0:\n raise ValueError('Penalty term must be positive; got (C=%r)' % self.C)\n if self.penalty == 'elasticnet':\n if not isinstance(self.l1_ratio, numbers.Number) or self.l1_ratio < 0 or self.l1_ratio > 1:\n raise ValueError('l1_ratio must be between 0 and 1; got (l1_ratio=%r)' % self.l1_ratio)\n elif self.l1_ratio is not None:\n warnings.warn(\"l1_ratio parameter is only used when penalty is 'elasticnet'. 
Got (penalty={})\".format(self.penalty))\n if self.penalty == 'none':\n if self.C != 1.0:\n warnings.warn(\"Setting penalty='none' will ignore the C and l1_ratio parameters\")\n C_ = np.inf\n penalty = 'l2'\n else:\n C_ = self.C\n penalty = self.penalty\n if not isinstance(self.max_iter, numbers.Number) or self.max_iter < 0:\n raise ValueError('Maximum number of iteration must be positive; got (max_iter=%r)' % self.max_iter)\n if not isinstance(self.tol, numbers.Number) or self.tol < 0:\n raise ValueError('Tolerance for stopping criteria must be positive; got (tol=%r)' % self.tol)\n if solver == 'lbfgs':\n _dtype = np.float64\n else:\n _dtype = [np.float64, np.float32]\n (X, y) = self._validate_data(X, y, accept_sparse='csr', dtype=_dtype, order='C', accept_large_sparse=solver not in ['liblinear', 'sag', 'saga'])\n check_classification_targets(y)\n self.classes_ = np.unique(y)\n multi_class = _check_multi_class(self.multi_class, solver, len(self.classes_))\n if solver == 'liblinear':\n if effective_n_jobs(self.n_jobs) != 1:\n warnings.warn(\"'n_jobs' > 1 does not have any effect when 'solver' is set to 'liblinear'. Got 'n_jobs' = {}.\".format(effective_n_jobs(self.n_jobs)))\n (self.coef_, self.intercept_, n_iter_) = _fit_liblinear(X, y, self.C, self.fit_intercept, self.intercept_scaling, self.class_weight, self.penalty, self.dual, self.verbose, self.max_iter, self.tol, self.random_state, sample_weight=sample_weight)\n self.n_iter_ = np.array([n_iter_])\n return self\n if solver in ['sag', 'saga']:\n max_squared_sum = row_norms(X, squared=True).max()\n else:\n max_squared_sum = None\n n_classes = len(self.classes_)\n classes_ = self.classes_\n if n_classes < 2:\n raise ValueError('This solver needs samples of at least 2 classes in the data, but the data contains only one class: %r' % classes_[0])\n if len(self.classes_) == 2:\n n_classes = 1\n classes_ = classes_[1:]\n if self.warm_start:\n warm_start_coef = getattr(self, 'coef_', None)\n else:\n warm_start_coef = None\n if warm_start_coef is not None and self.fit_intercept:\n warm_start_coef = np.append(warm_start_coef, self.intercept_[:, np.newaxis], axis=1)\n if multi_class == 'multinomial':\n classes_ = [None]\n warm_start_coef = [warm_start_coef]\n if warm_start_coef is None:\n warm_start_coef = [None] * n_classes\n path_func = delayed(_logistic_regression_path)\n if solver in ['sag', 'saga']:\n prefer = 'threads'\n else:\n prefer = 'processes'\n fold_coefs_ = Parallel(n_jobs=self.n_jobs, verbose=self.verbose, **_joblib_parallel_args(prefer=prefer))((path_func(X, y, pos_class=class_, Cs=[C_], l1_ratio=self.l1_ratio, fit_intercept=self.fit_intercept, tol=self.tol, verbose=self.verbose, solver=solver, multi_class=multi_class, max_iter=self.max_iter, class_weight=self.class_weight, check_input=False, random_state=self.random_state, coef=warm_start_coef_, penalty=penalty, max_squared_sum=max_squared_sum, sample_weight=sample_weight) for (class_, warm_start_coef_) in zip(classes_, warm_start_coef)))\n (fold_coefs_, _, n_iter_) = zip(*fold_coefs_)\n self.n_iter_ = np.asarray(n_iter_, dtype=np.int32)[:, 0]\n n_features = X.shape[1]\n if multi_class == 'multinomial':\n self.coef_ = fold_coefs_[0][0]\n else:\n self.coef_ = np.asarray(fold_coefs_)\n self.coef_ = self.coef_.reshape(n_classes, n_features + int(self.fit_intercept))\n if self.fit_intercept:\n self.intercept_ = self.coef_[:, -1]\n self.coef_ = self.coef_[:, :-1]\n else:\n self.intercept_ = np.zeros(n_classes)\n return self\n \n def predict_proba(self, X):\n \"\"\"\n Probability 
estimates.\n\n The returned estimates for all classes are ordered by the\n label of classes.\n\n For a multi_class problem, if multi_class is set to be \"multinomial\"\n the softmax function is used to find the predicted probability of\n each class.\n Else use a one-vs-rest approach, i.e calculate the probability\n of each class assuming it to be positive using the logistic function.\n and normalize these values across all the classes.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Vector to be scored, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\n Returns\n -------\n T : array-like of shape (n_samples, n_classes)\n Returns the probability of the sample for each class in the model,\n where classes are ordered as they are in ``self.classes_``.\n \"\"\"\n check_is_fitted(self)\n ovr = self.multi_class in ['ovr', 'warn'] or self.multi_class == 'auto' and (self.classes_.size <= 2 or self.solver == 'liblinear')\n if ovr:\n return super()._predict_proba_lr(X)\n else:\n decision = self.decision_function(X)\n if decision.ndim == 1:\n decision_2d = np.c_[-decision, decision]\n else:\n decision_2d = decision\n return softmax(decision_2d, copy=False)\n \n def predict_log_proba(self, X):\n \"\"\"\n Predict logarithm of probability estimates.\n\n The returned estimates for all classes are ordered by the\n label of classes.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Vector to be scored, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\n Returns\n -------\n T : array-like of shape (n_samples, n_classes)\n Returns the log-probability of the sample for each class in the\n model, where classes are ordered as they are in ``self.classes_``.\n \"\"\"\n return np.log(self.predict_proba(X))\n" }, @@ -23792,7 +23860,7 @@ "sklearn.linear_model._logistic.LogisticRegressionCV._more_tags" ], "is_public": true, - "description": "Logistic Regression CV (aka logit, MaxEnt) classifier.\n\nSee glossary entry for :term:`cross-validation estimator`. This class implements logistic regression using liblinear, newton-cg, sag of lbfgs optimizer. The newton-cg, sag and lbfgs solvers support only L2 regularization with primal formulation. The liblinear solver supports both L1 and L2 regularization, with a dual formulation only for the L2 penalty. Elastic-Net penalty is only supported by the saga solver. For the grid of `Cs` values and `l1_ratios` values, the best hyperparameter is selected by the cross-validator :class:`~sklearn.model_selection.StratifiedKFold`, but it can be changed using the :term:`cv` parameter. The 'newton-cg', 'sag', 'saga' and 'lbfgs' solvers can warm-start the coefficients (see :term:`Glossary`). Read more in the :ref:`User Guide `.", + "description": "Logistic Regression CV (aka logit, MaxEnt) classifier.\n\nSee glossary entry for :term:`cross-validation estimator`.\n\nThis class implements logistic regression using liblinear, newton-cg, sag\nof lbfgs optimizer. The newton-cg, sag and lbfgs solvers support only L2\nregularization with primal formulation. The liblinear solver supports both\nL1 and L2 regularization, with a dual formulation only for the L2 penalty.\nElastic-Net penalty is only supported by the saga solver.\n\nFor the grid of `Cs` values and `l1_ratios` values, the best hyperparameter\nis selected by the cross-validator\n:class:`~sklearn.model_selection.StratifiedKFold`, but it can be changed\nusing the :term:`cv` parameter. 
The 'newton-cg', 'sag', 'saga' and 'lbfgs'\nsolvers can warm-start the coefficients (see :term:`Glossary`).\n\nRead more in the :ref:`User Guide `.", "docstring": "Logistic Regression CV (aka logit, MaxEnt) classifier.\n\n See glossary entry for :term:`cross-validation estimator`.\n\n This class implements logistic regression using liblinear, newton-cg, sag\n of lbfgs optimizer. The newton-cg, sag and lbfgs solvers support only L2\n regularization with primal formulation. The liblinear solver supports both\n L1 and L2 regularization, with a dual formulation only for the L2 penalty.\n Elastic-Net penalty is only supported by the saga solver.\n\n For the grid of `Cs` values and `l1_ratios` values, the best hyperparameter\n is selected by the cross-validator\n :class:`~sklearn.model_selection.StratifiedKFold`, but it can be changed\n using the :term:`cv` parameter. The 'newton-cg', 'sag', 'saga' and 'lbfgs'\n solvers can warm-start the coefficients (see :term:`Glossary`).\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n Cs : int or list of floats, default=10\n Each of the values in Cs describes the inverse of regularization\n strength. If Cs is as an int, then a grid of Cs values are chosen\n in a logarithmic scale between 1e-4 and 1e4.\n Like in support vector machines, smaller values specify stronger\n regularization.\n\n fit_intercept : bool, default=True\n Specifies if a constant (a.k.a. bias or intercept) should be\n added to the decision function.\n\n cv : int or cross-validation generator, default=None\n The default cross-validation generator used is Stratified K-Folds.\n If an integer is provided, then it is the number of folds used.\n See the module :mod:`sklearn.model_selection` module for the\n list of possible cross-validation objects.\n\n .. versionchanged:: 0.22\n ``cv`` default value if None changed from 3-fold to 5-fold.\n\n dual : bool, default=False\n Dual or primal formulation. Dual formulation is only implemented for\n l2 penalty with liblinear solver. Prefer dual=False when\n n_samples > n_features.\n\n penalty : {'l1', 'l2', 'elasticnet'}, default='l2'\n Specify the norm of the penalty:\n\n - `'l2'`: add a L2 penalty term (used by default);\n - `'l1'`: add a L1 penalty term;\n - `'elasticnet'`: both L1 and L2 penalty terms are added.\n\n .. warning::\n Some penalties may not work with some solvers. See the parameter\n `solver` below, to know the compatibility between the penalty and\n solver.\n\n scoring : str or callable, default=None\n A string (see model evaluation documentation) or\n a scorer callable object / function with signature\n ``scorer(estimator, X, y)``. For a list of scoring functions\n that can be used, look at :mod:`sklearn.metrics`. The\n default scoring option used is 'accuracy'.\n\n solver : {'newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'}, default='lbfgs'\n\n Algorithm to use in the optimization problem. Default is 'lbfgs'.\n To choose a solver, you might want to consider the following aspects:\n\n - For small datasets, 'liblinear' is a good choice, whereas 'sag'\n and 'saga' are faster for large ones;\n - For multiclass problems, only 'newton-cg', 'sag', 'saga' and\n 'lbfgs' handle multinomial loss;\n - 'liblinear' might be slower in :class:`LogisticRegressionCV`\n because it does not handle warm-starting. 'liblinear' is\n limited to one-versus-rest schemes.\n\n .. 
warning::\n The choice of the algorithm depends on the penalty chosen:\n\n - 'newton-cg' - ['l2']\n - 'lbfgs' - ['l2']\n - 'liblinear' - ['l1', 'l2']\n - 'sag' - ['l2']\n - 'saga' - ['elasticnet', 'l1', 'l2']\n\n .. note::\n 'sag' and 'saga' fast convergence is only guaranteed on features\n with approximately the same scale. You can preprocess the data with\n a scaler from :mod:`sklearn.preprocessing`.\n\n .. versionadded:: 0.17\n Stochastic Average Gradient descent solver.\n .. versionadded:: 0.19\n SAGA solver.\n\n tol : float, default=1e-4\n Tolerance for stopping criteria.\n\n max_iter : int, default=100\n Maximum number of iterations of the optimization algorithm.\n\n class_weight : dict or 'balanced', default=None\n Weights associated with classes in the form ``{class_label: weight}``.\n If not given, all classes are supposed to have weight one.\n\n The \"balanced\" mode uses the values of y to automatically adjust\n weights inversely proportional to class frequencies in the input data\n as ``n_samples / (n_classes * np.bincount(y))``.\n\n Note that these weights will be multiplied with sample_weight (passed\n through the fit method) if sample_weight is specified.\n\n .. versionadded:: 0.17\n class_weight == 'balanced'\n\n n_jobs : int, default=None\n Number of CPU cores used during the cross-validation loop.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n verbose : int, default=0\n For the 'liblinear', 'sag' and 'lbfgs' solvers set verbose to any\n positive number for verbosity.\n\n refit : bool, default=True\n If set to True, the scores are averaged across all folds, and the\n coefs and the C that corresponds to the best score is taken, and a\n final refit is done using these parameters.\n Otherwise the coefs, intercepts and C that correspond to the\n best scores across folds are averaged.\n\n intercept_scaling : float, default=1\n Useful only when the solver 'liblinear' is used\n and self.fit_intercept is set to True. In this case, x becomes\n [x, self.intercept_scaling],\n i.e. a \"synthetic\" feature with constant value equal to\n intercept_scaling is appended to the instance vector.\n The intercept becomes ``intercept_scaling * synthetic_feature_weight``.\n\n Note! the synthetic feature weight is subject to l1/l2 regularization\n as all other features.\n To lessen the effect of regularization on synthetic feature weight\n (and therefore on the intercept) intercept_scaling has to be increased.\n\n multi_class : {'auto, 'ovr', 'multinomial'}, default='auto'\n If the option chosen is 'ovr', then a binary problem is fit for each\n label. For 'multinomial' the loss minimised is the multinomial loss fit\n across the entire probability distribution, *even when the data is\n binary*. 'multinomial' is unavailable when solver='liblinear'.\n 'auto' selects 'ovr' if the data is binary, or if solver='liblinear',\n and otherwise selects 'multinomial'.\n\n .. versionadded:: 0.18\n Stochastic Average Gradient descent solver for 'multinomial' case.\n .. versionchanged:: 0.22\n Default changed from 'ovr' to 'auto' in 0.22.\n\n random_state : int, RandomState instance, default=None\n Used when `solver='sag'`, 'saga' or 'liblinear' to shuffle the data.\n Note that this only applies to the solver and not the cross-validation\n generator. 
See :term:`Glossary ` for details.\n\n l1_ratios : list of float, default=None\n The list of Elastic-Net mixing parameter, with ``0 <= l1_ratio <= 1``.\n Only used if ``penalty='elasticnet'``. A value of 0 is equivalent to\n using ``penalty='l2'``, while 1 is equivalent to using\n ``penalty='l1'``. For ``0 < l1_ratio <1``, the penalty is a combination\n of L1 and L2.\n\n Attributes\n ----------\n classes_ : ndarray of shape (n_classes, )\n A list of class labels known to the classifier.\n\n coef_ : ndarray of shape (1, n_features) or (n_classes, n_features)\n Coefficient of the features in the decision function.\n\n `coef_` is of shape (1, n_features) when the given problem\n is binary.\n\n intercept_ : ndarray of shape (1,) or (n_classes,)\n Intercept (a.k.a. bias) added to the decision function.\n\n If `fit_intercept` is set to False, the intercept is set to zero.\n `intercept_` is of shape(1,) when the problem is binary.\n\n Cs_ : ndarray of shape (n_cs)\n Array of C i.e. inverse of regularization parameter values used\n for cross-validation.\n\n l1_ratios_ : ndarray of shape (n_l1_ratios)\n Array of l1_ratios used for cross-validation. If no l1_ratio is used\n (i.e. penalty is not 'elasticnet'), this is set to ``[None]``\n\n coefs_paths_ : ndarray of shape (n_folds, n_cs, n_features) or (n_folds, n_cs, n_features + 1)\n dict with classes as the keys, and the path of coefficients obtained\n during cross-validating across each fold and then across each Cs\n after doing an OvR for the corresponding class as values.\n If the 'multi_class' option is set to 'multinomial', then\n the coefs_paths are the coefficients corresponding to each class.\n Each dict value has shape ``(n_folds, n_cs, n_features)`` or\n ``(n_folds, n_cs, n_features + 1)`` depending on whether the\n intercept is fit or not. If ``penalty='elasticnet'``, the shape is\n ``(n_folds, n_cs, n_l1_ratios_, n_features)`` or\n ``(n_folds, n_cs, n_l1_ratios_, n_features + 1)``.\n\n scores_ : dict\n dict with classes as the keys, and the values as the\n grid of scores obtained during cross-validating each fold, after doing\n an OvR for the corresponding class. If the 'multi_class' option\n given is 'multinomial' then the same scores are repeated across\n all classes, since this is the multinomial class. Each dict value\n has shape ``(n_folds, n_cs`` or ``(n_folds, n_cs, n_l1_ratios)`` if\n ``penalty='elasticnet'``.\n\n C_ : ndarray of shape (n_classes,) or (n_classes - 1,)\n Array of C that maps to the best scores across every class. If refit is\n set to False, then for each class, the best C is the average of the\n C's that correspond to the best scores for each fold.\n `C_` is of shape(n_classes,) when the problem is binary.\n\n l1_ratio_ : ndarray of shape (n_classes,) or (n_classes - 1,)\n Array of l1_ratio that maps to the best scores across every class. If\n refit is set to False, then for each class, the best l1_ratio is the\n average of the l1_ratio's that correspond to the best scores for each\n fold. `l1_ratio_` is of shape(n_classes,) when the problem is binary.\n\n n_iter_ : ndarray of shape (n_classes, n_folds, n_cs) or (1, n_folds, n_cs)\n Actual number of iterations for all classes, folds and Cs.\n In the binary or multinomial cases, the first dimension is equal to 1.\n If ``penalty='elasticnet'``, the shape is ``(n_classes, n_folds,\n n_cs, n_l1_ratios)`` or ``(1, n_folds, n_cs, n_l1_ratios)``.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. 
versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n LogisticRegression : Logistic regression without tuning the\n hyperparameter `C`.\n\n Examples\n --------\n >>> from sklearn.datasets import load_iris\n >>> from sklearn.linear_model import LogisticRegressionCV\n >>> X, y = load_iris(return_X_y=True)\n >>> clf = LogisticRegressionCV(cv=5, random_state=0).fit(X, y)\n >>> clf.predict(X[:2, :])\n array([0, 0])\n >>> clf.predict_proba(X[:2, :]).shape\n (2, 3)\n >>> clf.score(X, y)\n 0.98...\n ", "source_code": "\n\nclass LogisticRegressionCV(LogisticRegression, LinearClassifierMixin, BaseEstimator):\n \"\"\"Logistic Regression CV (aka logit, MaxEnt) classifier.\n\n See glossary entry for :term:`cross-validation estimator`.\n\n This class implements logistic regression using liblinear, newton-cg, sag\n of lbfgs optimizer. The newton-cg, sag and lbfgs solvers support only L2\n regularization with primal formulation. The liblinear solver supports both\n L1 and L2 regularization, with a dual formulation only for the L2 penalty.\n Elastic-Net penalty is only supported by the saga solver.\n\n For the grid of `Cs` values and `l1_ratios` values, the best hyperparameter\n is selected by the cross-validator\n :class:`~sklearn.model_selection.StratifiedKFold`, but it can be changed\n using the :term:`cv` parameter. The 'newton-cg', 'sag', 'saga' and 'lbfgs'\n solvers can warm-start the coefficients (see :term:`Glossary`).\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n Cs : int or list of floats, default=10\n Each of the values in Cs describes the inverse of regularization\n strength. If Cs is as an int, then a grid of Cs values are chosen\n in a logarithmic scale between 1e-4 and 1e4.\n Like in support vector machines, smaller values specify stronger\n regularization.\n\n fit_intercept : bool, default=True\n Specifies if a constant (a.k.a. bias or intercept) should be\n added to the decision function.\n\n cv : int or cross-validation generator, default=None\n The default cross-validation generator used is Stratified K-Folds.\n If an integer is provided, then it is the number of folds used.\n See the module :mod:`sklearn.model_selection` module for the\n list of possible cross-validation objects.\n\n .. versionchanged:: 0.22\n ``cv`` default value if None changed from 3-fold to 5-fold.\n\n dual : bool, default=False\n Dual or primal formulation. Dual formulation is only implemented for\n l2 penalty with liblinear solver. Prefer dual=False when\n n_samples > n_features.\n\n penalty : {'l1', 'l2', 'elasticnet'}, default='l2'\n Specify the norm of the penalty:\n\n - `'l2'`: add a L2 penalty term (used by default);\n - `'l1'`: add a L1 penalty term;\n - `'elasticnet'`: both L1 and L2 penalty terms are added.\n\n .. warning::\n Some penalties may not work with some solvers. See the parameter\n `solver` below, to know the compatibility between the penalty and\n solver.\n\n scoring : str or callable, default=None\n A string (see model evaluation documentation) or\n a scorer callable object / function with signature\n ``scorer(estimator, X, y)``. For a list of scoring functions\n that can be used, look at :mod:`sklearn.metrics`. 
The\n default scoring option used is 'accuracy'.\n\n solver : {'newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'}, default='lbfgs'\n\n Algorithm to use in the optimization problem. Default is 'lbfgs'.\n To choose a solver, you might want to consider the following aspects:\n\n - For small datasets, 'liblinear' is a good choice, whereas 'sag'\n and 'saga' are faster for large ones;\n - For multiclass problems, only 'newton-cg', 'sag', 'saga' and\n 'lbfgs' handle multinomial loss;\n - 'liblinear' might be slower in :class:`LogisticRegressionCV`\n because it does not handle warm-starting. 'liblinear' is\n limited to one-versus-rest schemes.\n\n .. warning::\n The choice of the algorithm depends on the penalty chosen:\n\n - 'newton-cg' - ['l2']\n - 'lbfgs' - ['l2']\n - 'liblinear' - ['l1', 'l2']\n - 'sag' - ['l2']\n - 'saga' - ['elasticnet', 'l1', 'l2']\n\n .. note::\n 'sag' and 'saga' fast convergence is only guaranteed on features\n with approximately the same scale. You can preprocess the data with\n a scaler from :mod:`sklearn.preprocessing`.\n\n .. versionadded:: 0.17\n Stochastic Average Gradient descent solver.\n .. versionadded:: 0.19\n SAGA solver.\n\n tol : float, default=1e-4\n Tolerance for stopping criteria.\n\n max_iter : int, default=100\n Maximum number of iterations of the optimization algorithm.\n\n class_weight : dict or 'balanced', default=None\n Weights associated with classes in the form ``{class_label: weight}``.\n If not given, all classes are supposed to have weight one.\n\n The \"balanced\" mode uses the values of y to automatically adjust\n weights inversely proportional to class frequencies in the input data\n as ``n_samples / (n_classes * np.bincount(y))``.\n\n Note that these weights will be multiplied with sample_weight (passed\n through the fit method) if sample_weight is specified.\n\n .. versionadded:: 0.17\n class_weight == 'balanced'\n\n n_jobs : int, default=None\n Number of CPU cores used during the cross-validation loop.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n verbose : int, default=0\n For the 'liblinear', 'sag' and 'lbfgs' solvers set verbose to any\n positive number for verbosity.\n\n refit : bool, default=True\n If set to True, the scores are averaged across all folds, and the\n coefs and the C that corresponds to the best score is taken, and a\n final refit is done using these parameters.\n Otherwise the coefs, intercepts and C that correspond to the\n best scores across folds are averaged.\n\n intercept_scaling : float, default=1\n Useful only when the solver 'liblinear' is used\n and self.fit_intercept is set to True. In this case, x becomes\n [x, self.intercept_scaling],\n i.e. a \"synthetic\" feature with constant value equal to\n intercept_scaling is appended to the instance vector.\n The intercept becomes ``intercept_scaling * synthetic_feature_weight``.\n\n Note! the synthetic feature weight is subject to l1/l2 regularization\n as all other features.\n To lessen the effect of regularization on synthetic feature weight\n (and therefore on the intercept) intercept_scaling has to be increased.\n\n multi_class : {'auto, 'ovr', 'multinomial'}, default='auto'\n If the option chosen is 'ovr', then a binary problem is fit for each\n label. For 'multinomial' the loss minimised is the multinomial loss fit\n across the entire probability distribution, *even when the data is\n binary*. 
'multinomial' is unavailable when solver='liblinear'.\n 'auto' selects 'ovr' if the data is binary, or if solver='liblinear',\n and otherwise selects 'multinomial'.\n\n .. versionadded:: 0.18\n Stochastic Average Gradient descent solver for 'multinomial' case.\n .. versionchanged:: 0.22\n Default changed from 'ovr' to 'auto' in 0.22.\n\n random_state : int, RandomState instance, default=None\n Used when `solver='sag'`, 'saga' or 'liblinear' to shuffle the data.\n Note that this only applies to the solver and not the cross-validation\n generator. See :term:`Glossary ` for details.\n\n l1_ratios : list of float, default=None\n The list of Elastic-Net mixing parameter, with ``0 <= l1_ratio <= 1``.\n Only used if ``penalty='elasticnet'``. A value of 0 is equivalent to\n using ``penalty='l2'``, while 1 is equivalent to using\n ``penalty='l1'``. For ``0 < l1_ratio <1``, the penalty is a combination\n of L1 and L2.\n\n Attributes\n ----------\n classes_ : ndarray of shape (n_classes, )\n A list of class labels known to the classifier.\n\n coef_ : ndarray of shape (1, n_features) or (n_classes, n_features)\n Coefficient of the features in the decision function.\n\n `coef_` is of shape (1, n_features) when the given problem\n is binary.\n\n intercept_ : ndarray of shape (1,) or (n_classes,)\n Intercept (a.k.a. bias) added to the decision function.\n\n If `fit_intercept` is set to False, the intercept is set to zero.\n `intercept_` is of shape(1,) when the problem is binary.\n\n Cs_ : ndarray of shape (n_cs)\n Array of C i.e. inverse of regularization parameter values used\n for cross-validation.\n\n l1_ratios_ : ndarray of shape (n_l1_ratios)\n Array of l1_ratios used for cross-validation. If no l1_ratio is used\n (i.e. penalty is not 'elasticnet'), this is set to ``[None]``\n\n coefs_paths_ : ndarray of shape (n_folds, n_cs, n_features) or (n_folds, n_cs, n_features + 1)\n dict with classes as the keys, and the path of coefficients obtained\n during cross-validating across each fold and then across each Cs\n after doing an OvR for the corresponding class as values.\n If the 'multi_class' option is set to 'multinomial', then\n the coefs_paths are the coefficients corresponding to each class.\n Each dict value has shape ``(n_folds, n_cs, n_features)`` or\n ``(n_folds, n_cs, n_features + 1)`` depending on whether the\n intercept is fit or not. If ``penalty='elasticnet'``, the shape is\n ``(n_folds, n_cs, n_l1_ratios_, n_features)`` or\n ``(n_folds, n_cs, n_l1_ratios_, n_features + 1)``.\n\n scores_ : dict\n dict with classes as the keys, and the values as the\n grid of scores obtained during cross-validating each fold, after doing\n an OvR for the corresponding class. If the 'multi_class' option\n given is 'multinomial' then the same scores are repeated across\n all classes, since this is the multinomial class. Each dict value\n has shape ``(n_folds, n_cs`` or ``(n_folds, n_cs, n_l1_ratios)`` if\n ``penalty='elasticnet'``.\n\n C_ : ndarray of shape (n_classes,) or (n_classes - 1,)\n Array of C that maps to the best scores across every class. If refit is\n set to False, then for each class, the best C is the average of the\n C's that correspond to the best scores for each fold.\n `C_` is of shape(n_classes,) when the problem is binary.\n\n l1_ratio_ : ndarray of shape (n_classes,) or (n_classes - 1,)\n Array of l1_ratio that maps to the best scores across every class. 
If\n refit is set to False, then for each class, the best l1_ratio is the\n average of the l1_ratio's that correspond to the best scores for each\n fold. `l1_ratio_` is of shape(n_classes,) when the problem is binary.\n\n n_iter_ : ndarray of shape (n_classes, n_folds, n_cs) or (1, n_folds, n_cs)\n Actual number of iterations for all classes, folds and Cs.\n In the binary or multinomial cases, the first dimension is equal to 1.\n If ``penalty='elasticnet'``, the shape is ``(n_classes, n_folds,\n n_cs, n_l1_ratios)`` or ``(1, n_folds, n_cs, n_l1_ratios)``.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n LogisticRegression : Logistic regression without tuning the\n hyperparameter `C`.\n\n Examples\n --------\n >>> from sklearn.datasets import load_iris\n >>> from sklearn.linear_model import LogisticRegressionCV\n >>> X, y = load_iris(return_X_y=True)\n >>> clf = LogisticRegressionCV(cv=5, random_state=0).fit(X, y)\n >>> clf.predict(X[:2, :])\n array([0, 0])\n >>> clf.predict_proba(X[:2, :]).shape\n (2, 3)\n >>> clf.score(X, y)\n 0.98...\n \"\"\"\n \n def __init__(self, *, Cs=10, fit_intercept=True, cv=None, dual=False, penalty='l2', scoring=None, solver='lbfgs', tol=0.0001, max_iter=100, class_weight=None, n_jobs=None, verbose=0, refit=True, intercept_scaling=1.0, multi_class='auto', random_state=None, l1_ratios=None):\n self.Cs = Cs\n self.fit_intercept = fit_intercept\n self.cv = cv\n self.dual = dual\n self.penalty = penalty\n self.scoring = scoring\n self.tol = tol\n self.max_iter = max_iter\n self.class_weight = class_weight\n self.n_jobs = n_jobs\n self.verbose = verbose\n self.solver = solver\n self.refit = refit\n self.intercept_scaling = intercept_scaling\n self.multi_class = multi_class\n self.random_state = random_state\n self.l1_ratios = l1_ratios\n \n def fit(self, X, y, sample_weight=None):\n \"\"\"Fit the model according to the given training data.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training vector, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\n y : array-like of shape (n_samples,)\n Target vector relative to X.\n\n sample_weight : array-like of shape (n_samples,) default=None\n Array of weights that are assigned to individual samples.\n If not provided, then each sample is given unit weight.\n\n Returns\n -------\n self : object\n Fitted LogisticRegressionCV estimator.\n \"\"\"\n solver = _check_solver(self.solver, self.penalty, self.dual)\n if not isinstance(self.max_iter, numbers.Number) or self.max_iter < 0:\n raise ValueError('Maximum number of iteration must be positive; got (max_iter=%r)' % self.max_iter)\n if not isinstance(self.tol, numbers.Number) or self.tol < 0:\n raise ValueError('Tolerance for stopping criteria must be positive; got (tol=%r)' % self.tol)\n if self.penalty == 'elasticnet':\n if self.l1_ratios is None or len(self.l1_ratios) == 0 or any((not isinstance(l1_ratio, numbers.Number) or l1_ratio < 0 or l1_ratio > 1 for l1_ratio in self.l1_ratios)):\n raise ValueError('l1_ratios must be a list of numbers between 0 and 1; got (l1_ratios=%r)' % self.l1_ratios)\n l1_ratios_ = self.l1_ratios\n else:\n if self.l1_ratios is not None:\n warnings.warn(\"l1_ratios parameter is only used when 
penalty is 'elasticnet'. Got (penalty={})\".format(self.penalty))\n l1_ratios_ = [None]\n if self.penalty == 'none':\n raise ValueError(\"penalty='none' is not useful and not supported by LogisticRegressionCV.\")\n (X, y) = self._validate_data(X, y, accept_sparse='csr', dtype=np.float64, order='C', accept_large_sparse=solver not in ['liblinear', 'sag', 'saga'])\n check_classification_targets(y)\n class_weight = self.class_weight\n label_encoder = LabelEncoder().fit(y)\n y = label_encoder.transform(y)\n if isinstance(class_weight, dict):\n class_weight = {label_encoder.transform([cls])[0]: v for (cls, v) in class_weight.items()}\n classes = self.classes_ = label_encoder.classes_\n encoded_labels = label_encoder.transform(label_encoder.classes_)\n multi_class = _check_multi_class(self.multi_class, solver, len(classes))\n if solver in ['sag', 'saga']:\n max_squared_sum = row_norms(X, squared=True).max()\n else:\n max_squared_sum = None\n cv = check_cv(self.cv, y, classifier=True)\n folds = list(cv.split(X, y))\n n_classes = len(encoded_labels)\n if n_classes < 2:\n raise ValueError('This solver needs samples of at least 2 classes in the data, but the data contains only one class: %r' % classes[0])\n if n_classes == 2:\n n_classes = 1\n encoded_labels = encoded_labels[1:]\n classes = classes[1:]\n if multi_class == 'multinomial':\n iter_encoded_labels = iter_classes = [None]\n else:\n iter_encoded_labels = encoded_labels\n iter_classes = classes\n if class_weight == 'balanced':\n class_weight = compute_class_weight(class_weight, classes=np.arange(len(self.classes_)), y=y)\n class_weight = dict(enumerate(class_weight))\n path_func = delayed(_log_reg_scoring_path)\n if self.solver in ['sag', 'saga']:\n prefer = 'threads'\n else:\n prefer = 'processes'\n fold_coefs_ = Parallel(n_jobs=self.n_jobs, verbose=self.verbose, **_joblib_parallel_args(prefer=prefer))((path_func(X, y, train, test, pos_class=label, Cs=self.Cs, fit_intercept=self.fit_intercept, penalty=self.penalty, dual=self.dual, solver=solver, tol=self.tol, max_iter=self.max_iter, verbose=self.verbose, class_weight=class_weight, scoring=self.scoring, multi_class=multi_class, intercept_scaling=self.intercept_scaling, random_state=self.random_state, max_squared_sum=max_squared_sum, sample_weight=sample_weight, l1_ratio=l1_ratio) for label in iter_encoded_labels for (train, test) in folds for l1_ratio in l1_ratios_))\n (coefs_paths, Cs, scores, n_iter_) = zip(*fold_coefs_)\n self.Cs_ = Cs[0]\n if multi_class == 'multinomial':\n coefs_paths = np.reshape(coefs_paths, (len(folds), len(l1_ratios_) * len(self.Cs_), n_classes, -1))\n coefs_paths = np.swapaxes(coefs_paths, 0, 1)\n coefs_paths = np.swapaxes(coefs_paths, 0, 2)\n self.n_iter_ = np.reshape(n_iter_, (1, len(folds), len(self.Cs_) * len(l1_ratios_)))\n scores = np.tile(scores, (n_classes, 1, 1))\n else:\n coefs_paths = np.reshape(coefs_paths, (n_classes, len(folds), len(self.Cs_) * len(l1_ratios_), -1))\n self.n_iter_ = np.reshape(n_iter_, (n_classes, len(folds), len(self.Cs_) * len(l1_ratios_)))\n scores = np.reshape(scores, (n_classes, len(folds), -1))\n self.scores_ = dict(zip(classes, scores))\n self.coefs_paths_ = dict(zip(classes, coefs_paths))\n self.C_ = list()\n self.l1_ratio_ = list()\n self.coef_ = np.empty((n_classes, X.shape[1]))\n self.intercept_ = np.zeros(n_classes)\n for (index, (cls, encoded_label)) in enumerate(zip(iter_classes, iter_encoded_labels)):\n if multi_class == 'ovr':\n scores = self.scores_[cls]\n coefs_paths = self.coefs_paths_[cls]\n else:\n scores = 
scores[0]\n if self.refit:\n best_index = scores.sum(axis=0).argmax()\n best_index_C = best_index % len(self.Cs_)\n C_ = self.Cs_[best_index_C]\n self.C_.append(C_)\n best_index_l1 = best_index // len(self.Cs_)\n l1_ratio_ = l1_ratios_[best_index_l1]\n self.l1_ratio_.append(l1_ratio_)\n if multi_class == 'multinomial':\n coef_init = np.mean(coefs_paths[:, :, best_index, :], axis=1)\n else:\n coef_init = np.mean(coefs_paths[:, best_index, :], axis=0)\n (w, _, _) = _logistic_regression_path(X, y, pos_class=encoded_label, Cs=[C_], solver=solver, fit_intercept=self.fit_intercept, coef=coef_init, max_iter=self.max_iter, tol=self.tol, penalty=self.penalty, class_weight=class_weight, multi_class=multi_class, verbose=max(0, self.verbose - 1), random_state=self.random_state, check_input=False, max_squared_sum=max_squared_sum, sample_weight=sample_weight, l1_ratio=l1_ratio_)\n w = w[0]\n else:\n best_indices = np.argmax(scores, axis=1)\n if multi_class == 'ovr':\n w = np.mean([coefs_paths[i, best_indices[i], :] for i in range(len(folds))], axis=0)\n else:\n w = np.mean([coefs_paths[:, i, best_indices[i], :] for i in range(len(folds))], axis=0)\n best_indices_C = best_indices % len(self.Cs_)\n self.C_.append(np.mean(self.Cs_[best_indices_C]))\n if self.penalty == 'elasticnet':\n best_indices_l1 = best_indices // len(self.Cs_)\n self.l1_ratio_.append(np.mean(l1_ratios_[best_indices_l1]))\n else:\n self.l1_ratio_.append(None)\n if multi_class == 'multinomial':\n self.C_ = np.tile(self.C_, n_classes)\n self.l1_ratio_ = np.tile(self.l1_ratio_, n_classes)\n self.coef_ = w[:, :X.shape[1]]\n if self.fit_intercept:\n self.intercept_ = w[:, -1]\n else:\n self.coef_[index] = w[:X.shape[1]]\n if self.fit_intercept:\n self.intercept_[index] = w[-1]\n self.C_ = np.asarray(self.C_)\n self.l1_ratio_ = np.asarray(self.l1_ratio_)\n self.l1_ratios_ = np.asarray(l1_ratios_)\n if self.l1_ratios is not None:\n for (cls, coefs_path) in self.coefs_paths_.items():\n self.coefs_paths_[cls] = coefs_path.reshape((len(folds), self.l1_ratios_.size, self.Cs_.size, -1))\n self.coefs_paths_[cls] = np.transpose(self.coefs_paths_[cls], (0, 2, 1, 3))\n for (cls, score) in self.scores_.items():\n self.scores_[cls] = score.reshape((len(folds), self.l1_ratios_.size, self.Cs_.size))\n self.scores_[cls] = np.transpose(self.scores_[cls], (0, 2, 1))\n self.n_iter_ = self.n_iter_.reshape((-1, len(folds), self.l1_ratios_.size, self.Cs_.size))\n self.n_iter_ = np.transpose(self.n_iter_, (0, 1, 3, 2))\n return self\n \n def score(self, X, y, sample_weight=None):\n \"\"\"Score using the `scoring` option on the given test data and labels.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Test samples.\n\n y : array-like of shape (n_samples,)\n True labels for X.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n Returns\n -------\n score : float\n Score of self.predict(X) wrt. y.\n \"\"\"\n scoring = self.scoring or 'accuracy'\n scoring = get_scorer(scoring)\n return scoring(self, X, y, sample_weight=sample_weight)\n \n def _more_tags(self):\n return {'_xfail_checks': {'check_sample_weights_invariance': 'zero sample_weight is not equivalent to removing samples'}}\n" }, @@ -23824,7 +23892,7 @@ "sklearn.linear_model._omp.OrthogonalMatchingPursuitCV.fit" ], "is_public": true, - "description": "Cross-validated Orthogonal Matching Pursuit model (OMP).\n\nSee glossary entry for :term:`cross-validation estimator`. 
Read more in the :ref:`User Guide `.", + "description": "Cross-validated Orthogonal Matching Pursuit model (OMP).\n\nSee glossary entry for :term:`cross-validation estimator`.\n\nRead more in the :ref:`User Guide `.", "docstring": "Cross-validated Orthogonal Matching Pursuit model (OMP).\n\n See glossary entry for :term:`cross-validation estimator`.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n copy : bool, default=True\n Whether the design matrix X must be copied by the algorithm. A false\n value is only helpful if X is already Fortran-ordered, otherwise a\n copy is made anyway.\n\n fit_intercept : bool, default=True\n Whether to calculate the intercept for this model. If set\n to false, no intercept will be used in calculations\n (i.e. data is expected to be centered).\n\n normalize : bool, default=True\n This parameter is ignored when ``fit_intercept`` is set to False.\n If True, the regressors X will be normalized before regression by\n subtracting the mean and dividing by the l2-norm.\n If you wish to standardize, please use\n :class:`~sklearn.preprocessing.StandardScaler` before calling ``fit``\n on an estimator with ``normalize=False``.\n\n .. deprecated:: 1.0\n ``normalize`` was deprecated in version 1.0. It will default\n to False in 1.2 and be removed in 1.4.\n\n max_iter : int, default=None\n Maximum numbers of iterations to perform, therefore maximum features\n to include. 10% of ``n_features`` but at least 5 if available.\n\n cv : int, cross-validation generator or iterable, default=None\n Determines the cross-validation splitting strategy.\n Possible inputs for cv are:\n\n - None, to use the default 5-fold cross-validation,\n - integer, to specify the number of folds.\n - :term:`CV splitter`,\n - An iterable yielding (train, test) splits as arrays of indices.\n\n For integer/None inputs, :class:`KFold` is used.\n\n Refer :ref:`User Guide ` for the various\n cross-validation strategies that can be used here.\n\n .. versionchanged:: 0.22\n ``cv`` default value if None changed from 3-fold to 5-fold.\n\n n_jobs : int, default=None\n Number of CPUs to use during the cross validation.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n verbose : bool or int, default=False\n Sets the verbosity amount.\n\n Attributes\n ----------\n intercept_ : float or ndarray of shape (n_targets,)\n Independent term in decision function.\n\n coef_ : ndarray of shape (n_features,) or (n_targets, n_features)\n Parameter vector (w in the problem formulation).\n\n n_nonzero_coefs_ : int\n Estimated number of non-zero coefficients giving the best mean squared\n error over the cross-validation folds.\n\n n_iter_ : int or array-like\n Number of active features across every target for the model refit with\n the best hyperparameters got by cross-validating across all folds.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. 
versionadded:: 1.0\n\n See Also\n --------\n orthogonal_mp : Solves n_targets Orthogonal Matching Pursuit problems.\n orthogonal_mp_gram : Solves n_targets Orthogonal Matching Pursuit\n problems using only the Gram matrix X.T * X and the product X.T * y.\n lars_path : Compute Least Angle Regression or Lasso path using LARS algorithm.\n Lars : Least Angle Regression model a.k.a. LAR.\n LassoLars : Lasso model fit with Least Angle Regression a.k.a. Lars.\n OrthogonalMatchingPursuit : Orthogonal Matching Pursuit model (OMP).\n LarsCV : Cross-validated Least Angle Regression model.\n LassoLarsCV : Cross-validated Lasso model fit with Least Angle Regression.\n sklearn.decomposition.sparse_encode : Generic sparse coding.\n Each column of the result is the solution to a Lasso problem.\n\n Examples\n --------\n >>> from sklearn.linear_model import OrthogonalMatchingPursuitCV\n >>> from sklearn.datasets import make_regression\n >>> X, y = make_regression(n_features=100, n_informative=10,\n ... noise=4, random_state=0)\n >>> reg = OrthogonalMatchingPursuitCV(cv=5, normalize=False).fit(X, y)\n >>> reg.score(X, y)\n 0.9991...\n >>> reg.n_nonzero_coefs_\n 10\n >>> reg.predict(X[:1,])\n array([-78.3854...])\n ", "source_code": "\n\nclass OrthogonalMatchingPursuitCV(RegressorMixin, LinearModel):\n \"\"\"Cross-validated Orthogonal Matching Pursuit model (OMP).\n\n See glossary entry for :term:`cross-validation estimator`.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n copy : bool, default=True\n Whether the design matrix X must be copied by the algorithm. A false\n value is only helpful if X is already Fortran-ordered, otherwise a\n copy is made anyway.\n\n fit_intercept : bool, default=True\n Whether to calculate the intercept for this model. If set\n to false, no intercept will be used in calculations\n (i.e. data is expected to be centered).\n\n normalize : bool, default=True\n This parameter is ignored when ``fit_intercept`` is set to False.\n If True, the regressors X will be normalized before regression by\n subtracting the mean and dividing by the l2-norm.\n If you wish to standardize, please use\n :class:`~sklearn.preprocessing.StandardScaler` before calling ``fit``\n on an estimator with ``normalize=False``.\n\n .. deprecated:: 1.0\n ``normalize`` was deprecated in version 1.0. It will default\n to False in 1.2 and be removed in 1.4.\n\n max_iter : int, default=None\n Maximum numbers of iterations to perform, therefore maximum features\n to include. 10% of ``n_features`` but at least 5 if available.\n\n cv : int, cross-validation generator or iterable, default=None\n Determines the cross-validation splitting strategy.\n Possible inputs for cv are:\n\n - None, to use the default 5-fold cross-validation,\n - integer, to specify the number of folds.\n - :term:`CV splitter`,\n - An iterable yielding (train, test) splits as arrays of indices.\n\n For integer/None inputs, :class:`KFold` is used.\n\n Refer :ref:`User Guide ` for the various\n cross-validation strategies that can be used here.\n\n .. versionchanged:: 0.22\n ``cv`` default value if None changed from 3-fold to 5-fold.\n\n n_jobs : int, default=None\n Number of CPUs to use during the cross validation.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. 
See :term:`Glossary `\n for more details.\n\n verbose : bool or int, default=False\n Sets the verbosity amount.\n\n Attributes\n ----------\n intercept_ : float or ndarray of shape (n_targets,)\n Independent term in decision function.\n\n coef_ : ndarray of shape (n_features,) or (n_targets, n_features)\n Parameter vector (w in the problem formulation).\n\n n_nonzero_coefs_ : int\n Estimated number of non-zero coefficients giving the best mean squared\n error over the cross-validation folds.\n\n n_iter_ : int or array-like\n Number of active features across every target for the model refit with\n the best hyperparameters got by cross-validating across all folds.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n orthogonal_mp : Solves n_targets Orthogonal Matching Pursuit problems.\n orthogonal_mp_gram : Solves n_targets Orthogonal Matching Pursuit\n problems using only the Gram matrix X.T * X and the product X.T * y.\n lars_path : Compute Least Angle Regression or Lasso path using LARS algorithm.\n Lars : Least Angle Regression model a.k.a. LAR.\n LassoLars : Lasso model fit with Least Angle Regression a.k.a. Lars.\n OrthogonalMatchingPursuit : Orthogonal Matching Pursuit model (OMP).\n LarsCV : Cross-validated Least Angle Regression model.\n LassoLarsCV : Cross-validated Lasso model fit with Least Angle Regression.\n sklearn.decomposition.sparse_encode : Generic sparse coding.\n Each column of the result is the solution to a Lasso problem.\n\n Examples\n --------\n >>> from sklearn.linear_model import OrthogonalMatchingPursuitCV\n >>> from sklearn.datasets import make_regression\n >>> X, y = make_regression(n_features=100, n_informative=10,\n ... noise=4, random_state=0)\n >>> reg = OrthogonalMatchingPursuitCV(cv=5, normalize=False).fit(X, y)\n >>> reg.score(X, y)\n 0.9991...\n >>> reg.n_nonzero_coefs_\n 10\n >>> reg.predict(X[:1,])\n array([-78.3854...])\n \"\"\"\n \n def __init__(self, *, copy=True, fit_intercept=True, normalize='deprecated', max_iter=None, cv=None, n_jobs=None, verbose=False):\n self.copy = copy\n self.fit_intercept = fit_intercept\n self.normalize = normalize\n self.max_iter = max_iter\n self.cv = cv\n self.n_jobs = n_jobs\n self.verbose = verbose\n \n def fit(self, X, y):\n \"\"\"Fit the model using X, y as training data.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training data.\n\n y : array-like of shape (n_samples,)\n Target values. 
Will be cast to X's dtype if necessary.\n\n Returns\n -------\n self : object\n Returns an instance of self.\n \"\"\"\n _normalize = _deprecate_normalize(self.normalize, default=True, estimator_name=self.__class__.__name__)\n (X, y) = self._validate_data(X, y, y_numeric=True, ensure_min_features=2, estimator=self)\n X = as_float_array(X, copy=False, force_all_finite=False)\n cv = check_cv(self.cv, classifier=False)\n max_iter = min(max(int(0.1 * X.shape[1]), 5), X.shape[1]) if not self.max_iter else self.max_iter\n cv_paths = Parallel(n_jobs=self.n_jobs, verbose=self.verbose)((delayed(_omp_path_residues)(X[train], y[train], X[test], y[test], self.copy, self.fit_intercept, _normalize, max_iter) for (train, test) in cv.split(X)))\n min_early_stop = min((fold.shape[0] for fold in cv_paths))\n mse_folds = np.array([(fold[:min_early_stop]**2).mean(axis=1) for fold in cv_paths])\n best_n_nonzero_coefs = np.argmin(mse_folds.mean(axis=0)) + 1\n self.n_nonzero_coefs_ = best_n_nonzero_coefs\n omp = OrthogonalMatchingPursuit(n_nonzero_coefs=best_n_nonzero_coefs, fit_intercept=self.fit_intercept, normalize=_normalize)\n omp.fit(X, y)\n self.coef_ = omp.coef_\n self.intercept_ = omp.intercept_\n self.n_iter_ = omp.n_iter_\n return self\n" }, @@ -23866,8 +23934,8 @@ "methods": ["sklearn.linear_model._perceptron.Perceptron.__init__"], "is_public": true, "description": "Linear perceptron classifier.\n\nRead more in the :ref:`User Guide `.", - "docstring": "Linear perceptron classifier.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n\n penalty : {'l2','l1','elasticnet'}, default=None\n The penalty (aka regularization term) to be used.\n\n alpha : float, default=0.0001\n Constant that multiplies the regularization term if regularization is\n used.\n\n l1_ratio : float, default=0.15\n The Elastic Net mixing parameter, with `0 <= l1_ratio <= 1`.\n `l1_ratio=0` corresponds to L2 penalty, `l1_ratio=1` to L1.\n Only used if `penalty='elasticnet'`.\n\n .. versionadded:: 0.24\n\n fit_intercept : bool, default=True\n Whether the intercept should be estimated or not. If False, the\n data is assumed to be already centered.\n\n max_iter : int, default=1000\n The maximum number of passes over the training data (aka epochs).\n It only impacts the behavior in the ``fit`` method, and not the\n :meth:`partial_fit` method.\n\n .. versionadded:: 0.19\n\n tol : float, default=1e-3\n The stopping criterion. If it is not None, the iterations will stop\n when (loss > previous_loss - tol).\n\n .. versionadded:: 0.19\n\n shuffle : bool, default=True\n Whether or not the training data should be shuffled after each epoch.\n\n verbose : int, default=0\n The verbosity level.\n\n eta0 : double, default=1\n Constant by which the updates are multiplied.\n\n n_jobs : int, default=None\n The number of CPUs to use to do the OVA (One Versus All, for\n multi-class problems) computation.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n random_state : int, RandomState instance, default=None\n Used to shuffle the training data, when ``shuffle`` is set to\n ``True``. Pass an int for reproducible output across multiple\n function calls.\n See :term:`Glossary `.\n\n early_stopping : bool, default=False\n Whether to use early stopping to terminate training when validation.\n score is not improving. 
If set to True, it will automatically set aside\n a stratified fraction of training data as validation and terminate\n training when validation score is not improving by at least tol for\n n_iter_no_change consecutive epochs.\n\n .. versionadded:: 0.20\n\n validation_fraction : float, default=0.1\n The proportion of training data to set aside as validation set for\n early stopping. Must be between 0 and 1.\n Only used if early_stopping is True.\n\n .. versionadded:: 0.20\n\n n_iter_no_change : int, default=5\n Number of iterations with no improvement to wait before early stopping.\n\n .. versionadded:: 0.20\n\n class_weight : dict, {class_label: weight} or \"balanced\", default=None\n Preset for the class_weight fit parameter.\n\n Weights associated with classes. If not given, all classes\n are supposed to have weight one.\n\n The \"balanced\" mode uses the values of y to automatically adjust\n weights inversely proportional to class frequencies in the input data\n as ``n_samples / (n_classes * np.bincount(y))``.\n\n warm_start : bool, default=False\n When set to True, reuse the solution of the previous call to fit as\n initialization, otherwise, just erase the previous solution. See\n :term:`the Glossary `.\n\n Attributes\n ----------\n classes_ : ndarray of shape (n_classes,)\n The unique classes labels.\n\n coef_ : ndarray of shape (1, n_features) if n_classes == 2 else (n_classes, n_features)\n Weights assigned to the features.\n\n intercept_ : ndarray of shape (1,) if n_classes == 2 else (n_classes,)\n Constants in decision function.\n\n loss_function_ : concrete\u00a0LossFunction\n The function that determines the loss, or difference between the\n output of the algorithm and the target values.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n n_iter_ : int\n The actual number of iterations to reach the stopping criterion.\n For multiclass fits, it is the maximum over every binary fit.\n\n t_ : int\n Number of weight updates performed during training.\n Same as ``(n_iter_ * n_samples)``.\n\n See Also\n --------\n sklearn.linear_model.SGDClassifier : Linear classifiers\n (SVM, logistic regression, etc.) with SGD training.\n\n Notes\n -----\n ``Perceptron`` is a classification algorithm which shares the same\n underlying implementation with ``SGDClassifier``. 
In fact,\n ``Perceptron()`` is equivalent to `SGDClassifier(loss=\"perceptron\",\n eta0=1, learning_rate=\"constant\", penalty=None)`.\n\n References\n ----------\n https://en.wikipedia.org/wiki/Perceptron and references therein.\n\n Examples\n --------\n >>> from sklearn.datasets import load_digits\n >>> from sklearn.linear_model import Perceptron\n >>> X, y = load_digits(return_X_y=True)\n >>> clf = Perceptron(tol=1e-3, random_state=0)\n >>> clf.fit(X, y)\n Perceptron()\n >>> clf.score(X, y)\n 0.939...\n ", - "source_code": "\n\nclass Perceptron(BaseSGDClassifier):\n \"\"\"Linear perceptron classifier.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n\n penalty : {'l2','l1','elasticnet'}, default=None\n The penalty (aka regularization term) to be used.\n\n alpha : float, default=0.0001\n Constant that multiplies the regularization term if regularization is\n used.\n\n l1_ratio : float, default=0.15\n The Elastic Net mixing parameter, with `0 <= l1_ratio <= 1`.\n `l1_ratio=0` corresponds to L2 penalty, `l1_ratio=1` to L1.\n Only used if `penalty='elasticnet'`.\n\n .. versionadded:: 0.24\n\n fit_intercept : bool, default=True\n Whether the intercept should be estimated or not. If False, the\n data is assumed to be already centered.\n\n max_iter : int, default=1000\n The maximum number of passes over the training data (aka epochs).\n It only impacts the behavior in the ``fit`` method, and not the\n :meth:`partial_fit` method.\n\n .. versionadded:: 0.19\n\n tol : float, default=1e-3\n The stopping criterion. If it is not None, the iterations will stop\n when (loss > previous_loss - tol).\n\n .. versionadded:: 0.19\n\n shuffle : bool, default=True\n Whether or not the training data should be shuffled after each epoch.\n\n verbose : int, default=0\n The verbosity level.\n\n eta0 : double, default=1\n Constant by which the updates are multiplied.\n\n n_jobs : int, default=None\n The number of CPUs to use to do the OVA (One Versus All, for\n multi-class problems) computation.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n random_state : int, RandomState instance, default=None\n Used to shuffle the training data, when ``shuffle`` is set to\n ``True``. Pass an int for reproducible output across multiple\n function calls.\n See :term:`Glossary `.\n\n early_stopping : bool, default=False\n Whether to use early stopping to terminate training when validation.\n score is not improving. If set to True, it will automatically set aside\n a stratified fraction of training data as validation and terminate\n training when validation score is not improving by at least tol for\n n_iter_no_change consecutive epochs.\n\n .. versionadded:: 0.20\n\n validation_fraction : float, default=0.1\n The proportion of training data to set aside as validation set for\n early stopping. Must be between 0 and 1.\n Only used if early_stopping is True.\n\n .. versionadded:: 0.20\n\n n_iter_no_change : int, default=5\n Number of iterations with no improvement to wait before early stopping.\n\n .. versionadded:: 0.20\n\n class_weight : dict, {class_label: weight} or \"balanced\", default=None\n Preset for the class_weight fit parameter.\n\n Weights associated with classes. 
If not given, all classes\n are supposed to have weight one.\n\n The \"balanced\" mode uses the values of y to automatically adjust\n weights inversely proportional to class frequencies in the input data\n as ``n_samples / (n_classes * np.bincount(y))``.\n\n warm_start : bool, default=False\n When set to True, reuse the solution of the previous call to fit as\n initialization, otherwise, just erase the previous solution. See\n :term:`the Glossary `.\n\n Attributes\n ----------\n classes_ : ndarray of shape (n_classes,)\n The unique classes labels.\n\n coef_ : ndarray of shape (1, n_features) if n_classes == 2 else (n_classes, n_features)\n Weights assigned to the features.\n\n intercept_ : ndarray of shape (1,) if n_classes == 2 else (n_classes,)\n Constants in decision function.\n\n loss_function_ : concrete\u00a0LossFunction\n The function that determines the loss, or difference between the\n output of the algorithm and the target values.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n n_iter_ : int\n The actual number of iterations to reach the stopping criterion.\n For multiclass fits, it is the maximum over every binary fit.\n\n t_ : int\n Number of weight updates performed during training.\n Same as ``(n_iter_ * n_samples)``.\n\n See Also\n --------\n sklearn.linear_model.SGDClassifier : Linear classifiers\n (SVM, logistic regression, etc.) with SGD training.\n\n Notes\n -----\n ``Perceptron`` is a classification algorithm which shares the same\n underlying implementation with ``SGDClassifier``. In fact,\n ``Perceptron()`` is equivalent to `SGDClassifier(loss=\"perceptron\",\n eta0=1, learning_rate=\"constant\", penalty=None)`.\n\n References\n ----------\n https://en.wikipedia.org/wiki/Perceptron and references therein.\n\n Examples\n --------\n >>> from sklearn.datasets import load_digits\n >>> from sklearn.linear_model import Perceptron\n >>> X, y = load_digits(return_X_y=True)\n >>> clf = Perceptron(tol=1e-3, random_state=0)\n >>> clf.fit(X, y)\n Perceptron()\n >>> clf.score(X, y)\n 0.939...\n \"\"\"\n \n def __init__(self, *, penalty=None, alpha=0.0001, l1_ratio=0.15, fit_intercept=True, max_iter=1000, tol=0.001, shuffle=True, verbose=0, eta0=1.0, n_jobs=None, random_state=0, early_stopping=False, validation_fraction=0.1, n_iter_no_change=5, class_weight=None, warm_start=False):\n super().__init__(loss='perceptron', penalty=penalty, alpha=alpha, l1_ratio=l1_ratio, fit_intercept=fit_intercept, max_iter=max_iter, tol=tol, shuffle=shuffle, verbose=verbose, random_state=random_state, learning_rate='constant', eta0=eta0, early_stopping=early_stopping, validation_fraction=validation_fraction, n_iter_no_change=n_iter_no_change, power_t=0.5, warm_start=warm_start, class_weight=class_weight, n_jobs=n_jobs)\n" + "docstring": "Linear perceptron classifier.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n\n penalty : {'l2','l1','elasticnet'}, default=None\n The penalty (aka regularization term) to be used.\n\n alpha : float, default=0.0001\n Constant that multiplies the regularization term if regularization is\n used.\n\n l1_ratio : float, default=0.15\n The Elastic Net mixing parameter, with `0 <= l1_ratio <= 1`.\n `l1_ratio=0` corresponds to L2 penalty, `l1_ratio=1` to L1.\n Only used if `penalty='elasticnet'`.\n\n .. 
versionadded:: 0.24\n\n fit_intercept : bool, default=True\n Whether the intercept should be estimated or not. If False, the\n data is assumed to be already centered.\n\n max_iter : int, default=1000\n The maximum number of passes over the training data (aka epochs).\n It only impacts the behavior in the ``fit`` method, and not the\n :meth:`partial_fit` method.\n\n .. versionadded:: 0.19\n\n tol : float, default=1e-3\n The stopping criterion. If it is not None, the iterations will stop\n when (loss > previous_loss - tol).\n\n .. versionadded:: 0.19\n\n shuffle : bool, default=True\n Whether or not the training data should be shuffled after each epoch.\n\n verbose : int, default=0\n The verbosity level.\n\n eta0 : float, default=1\n Constant by which the updates are multiplied.\n\n n_jobs : int, default=None\n The number of CPUs to use to do the OVA (One Versus All, for\n multi-class problems) computation.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n random_state : int, RandomState instance, default=None\n Used to shuffle the training data, when ``shuffle`` is set to\n ``True``. Pass an int for reproducible output across multiple\n function calls.\n See :term:`Glossary `.\n\n early_stopping : bool, default=False\n Whether to use early stopping to terminate training when validation.\n score is not improving. If set to True, it will automatically set aside\n a stratified fraction of training data as validation and terminate\n training when validation score is not improving by at least tol for\n n_iter_no_change consecutive epochs.\n\n .. versionadded:: 0.20\n\n validation_fraction : float, default=0.1\n The proportion of training data to set aside as validation set for\n early stopping. Must be between 0 and 1.\n Only used if early_stopping is True.\n\n .. versionadded:: 0.20\n\n n_iter_no_change : int, default=5\n Number of iterations with no improvement to wait before early stopping.\n\n .. versionadded:: 0.20\n\n class_weight : dict, {class_label: weight} or \"balanced\", default=None\n Preset for the class_weight fit parameter.\n\n Weights associated with classes. If not given, all classes\n are supposed to have weight one.\n\n The \"balanced\" mode uses the values of y to automatically adjust\n weights inversely proportional to class frequencies in the input data\n as ``n_samples / (n_classes * np.bincount(y))``.\n\n warm_start : bool, default=False\n When set to True, reuse the solution of the previous call to fit as\n initialization, otherwise, just erase the previous solution. See\n :term:`the Glossary `.\n\n Attributes\n ----------\n classes_ : ndarray of shape (n_classes,)\n The unique classes labels.\n\n coef_ : ndarray of shape (1, n_features) if n_classes == 2 else (n_classes, n_features)\n Weights assigned to the features.\n\n intercept_ : ndarray of shape (1,) if n_classes == 2 else (n_classes,)\n Constants in decision function.\n\n loss_function_ : concrete\u00a0LossFunction\n The function that determines the loss, or difference between the\n output of the algorithm and the target values.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. 
versionadded:: 1.0\n\n n_iter_ : int\n The actual number of iterations to reach the stopping criterion.\n For multiclass fits, it is the maximum over every binary fit.\n\n t_ : int\n Number of weight updates performed during training.\n Same as ``(n_iter_ * n_samples)``.\n\n See Also\n --------\n sklearn.linear_model.SGDClassifier : Linear classifiers\n (SVM, logistic regression, etc.) with SGD training.\n\n Notes\n -----\n ``Perceptron`` is a classification algorithm which shares the same\n underlying implementation with ``SGDClassifier``. In fact,\n ``Perceptron()`` is equivalent to `SGDClassifier(loss=\"perceptron\",\n eta0=1, learning_rate=\"constant\", penalty=None)`.\n\n References\n ----------\n https://en.wikipedia.org/wiki/Perceptron and references therein.\n\n Examples\n --------\n >>> from sklearn.datasets import load_digits\n >>> from sklearn.linear_model import Perceptron\n >>> X, y = load_digits(return_X_y=True)\n >>> clf = Perceptron(tol=1e-3, random_state=0)\n >>> clf.fit(X, y)\n Perceptron()\n >>> clf.score(X, y)\n 0.939...\n ", + "source_code": "\n\nclass Perceptron(BaseSGDClassifier):\n \"\"\"Linear perceptron classifier.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n\n penalty : {'l2','l1','elasticnet'}, default=None\n The penalty (aka regularization term) to be used.\n\n alpha : float, default=0.0001\n Constant that multiplies the regularization term if regularization is\n used.\n\n l1_ratio : float, default=0.15\n The Elastic Net mixing parameter, with `0 <= l1_ratio <= 1`.\n `l1_ratio=0` corresponds to L2 penalty, `l1_ratio=1` to L1.\n Only used if `penalty='elasticnet'`.\n\n .. versionadded:: 0.24\n\n fit_intercept : bool, default=True\n Whether the intercept should be estimated or not. If False, the\n data is assumed to be already centered.\n\n max_iter : int, default=1000\n The maximum number of passes over the training data (aka epochs).\n It only impacts the behavior in the ``fit`` method, and not the\n :meth:`partial_fit` method.\n\n .. versionadded:: 0.19\n\n tol : float, default=1e-3\n The stopping criterion. If it is not None, the iterations will stop\n when (loss > previous_loss - tol).\n\n .. versionadded:: 0.19\n\n shuffle : bool, default=True\n Whether or not the training data should be shuffled after each epoch.\n\n verbose : int, default=0\n The verbosity level.\n\n eta0 : float, default=1\n Constant by which the updates are multiplied.\n\n n_jobs : int, default=None\n The number of CPUs to use to do the OVA (One Versus All, for\n multi-class problems) computation.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n random_state : int, RandomState instance, default=None\n Used to shuffle the training data, when ``shuffle`` is set to\n ``True``. Pass an int for reproducible output across multiple\n function calls.\n See :term:`Glossary `.\n\n early_stopping : bool, default=False\n Whether to use early stopping to terminate training when validation.\n score is not improving. If set to True, it will automatically set aside\n a stratified fraction of training data as validation and terminate\n training when validation score is not improving by at least tol for\n n_iter_no_change consecutive epochs.\n\n .. versionadded:: 0.20\n\n validation_fraction : float, default=0.1\n The proportion of training data to set aside as validation set for\n early stopping. Must be between 0 and 1.\n Only used if early_stopping is True.\n\n .. 
versionadded:: 0.20\n\n n_iter_no_change : int, default=5\n Number of iterations with no improvement to wait before early stopping.\n\n .. versionadded:: 0.20\n\n class_weight : dict, {class_label: weight} or \"balanced\", default=None\n Preset for the class_weight fit parameter.\n\n Weights associated with classes. If not given, all classes\n are supposed to have weight one.\n\n The \"balanced\" mode uses the values of y to automatically adjust\n weights inversely proportional to class frequencies in the input data\n as ``n_samples / (n_classes * np.bincount(y))``.\n\n warm_start : bool, default=False\n When set to True, reuse the solution of the previous call to fit as\n initialization, otherwise, just erase the previous solution. See\n :term:`the Glossary `.\n\n Attributes\n ----------\n classes_ : ndarray of shape (n_classes,)\n The unique classes labels.\n\n coef_ : ndarray of shape (1, n_features) if n_classes == 2 else (n_classes, n_features)\n Weights assigned to the features.\n\n intercept_ : ndarray of shape (1,) if n_classes == 2 else (n_classes,)\n Constants in decision function.\n\n loss_function_ : concrete\u00a0LossFunction\n The function that determines the loss, or difference between the\n output of the algorithm and the target values.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n n_iter_ : int\n The actual number of iterations to reach the stopping criterion.\n For multiclass fits, it is the maximum over every binary fit.\n\n t_ : int\n Number of weight updates performed during training.\n Same as ``(n_iter_ * n_samples)``.\n\n See Also\n --------\n sklearn.linear_model.SGDClassifier : Linear classifiers\n (SVM, logistic regression, etc.) with SGD training.\n\n Notes\n -----\n ``Perceptron`` is a classification algorithm which shares the same\n underlying implementation with ``SGDClassifier``. 
In fact,\n ``Perceptron()`` is equivalent to `SGDClassifier(loss=\"perceptron\",\n eta0=1, learning_rate=\"constant\", penalty=None)`.\n\n References\n ----------\n https://en.wikipedia.org/wiki/Perceptron and references therein.\n\n Examples\n --------\n >>> from sklearn.datasets import load_digits\n >>> from sklearn.linear_model import Perceptron\n >>> X, y = load_digits(return_X_y=True)\n >>> clf = Perceptron(tol=1e-3, random_state=0)\n >>> clf.fit(X, y)\n Perceptron()\n >>> clf.score(X, y)\n 0.939...\n \"\"\"\n \n def __init__(self, *, penalty=None, alpha=0.0001, l1_ratio=0.15, fit_intercept=True, max_iter=1000, tol=0.001, shuffle=True, verbose=0, eta0=1.0, n_jobs=None, random_state=0, early_stopping=False, validation_fraction=0.1, n_iter_no_change=5, class_weight=None, warm_start=False):\n super().__init__(loss='perceptron', penalty=penalty, alpha=alpha, l1_ratio=l1_ratio, fit_intercept=fit_intercept, max_iter=max_iter, tol=tol, shuffle=shuffle, verbose=verbose, random_state=random_state, learning_rate='constant', eta0=eta0, early_stopping=early_stopping, validation_fraction=validation_fraction, n_iter_no_change=n_iter_no_change, power_t=0.5, warm_start=warm_start, class_weight=class_weight, n_jobs=n_jobs)\n" }, { "name": "QuantileRegressor", @@ -23879,7 +23947,7 @@ "sklearn.linear_model._quantile.QuantileRegressor.fit" ], "is_public": true, - "description": "Linear regression model that predicts conditional quantiles.\n\nThe linear :class:`QuantileRegressor` optimizes the pinball loss for a desired `quantile` and is robust to outliers. This model uses an L1 regularization like :class:`~sklearn.linear_model.Lasso`. Read more in the :ref:`User Guide `. .. versionadded:: 1.0", + "description": "Linear regression model that predicts conditional quantiles.\n\nThe linear :class:`QuantileRegressor` optimizes the pinball loss for a\ndesired `quantile` and is robust to outliers.\n\nThis model uses an L1 regularization like\n:class:`~sklearn.linear_model.Lasso`.\n\nRead more in the :ref:`User Guide `.\n\n.. versionadded:: 1.0", "docstring": "Linear regression model that predicts conditional quantiles.\n\n The linear :class:`QuantileRegressor` optimizes the pinball loss for a\n desired `quantile` and is robust to outliers.\n\n This model uses an L1 regularization like\n :class:`~sklearn.linear_model.Lasso`.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 1.0\n\n Parameters\n ----------\n quantile : float, default=0.5\n The quantile that the model tries to predict. It must be strictly\n between 0 and 1. If 0.5 (default), the model predicts the 50%\n quantile, i.e. the median.\n\n alpha : float, default=1.0\n Regularization constant that multiplies the L1 penalty term.\n\n fit_intercept : bool, default=True\n Whether or not to fit the intercept.\n\n solver : {'highs-ds', 'highs-ipm', 'highs', 'interior-point', 'revised simplex'}, default='interior-point'\n Method used by :func:`scipy.optimize.linprog` to solve the linear\n programming formulation. Note that the highs methods are recommended\n for usage with `scipy>=1.6.0` because they are the fastest ones.\n\n solver_options : dict, default=None\n Additional parameters passed to :func:`scipy.optimize.linprog` as\n options. 
If `None` and if `solver='interior-point'`, then\n `{\"lstsq\": True}` is passed to :func:`scipy.optimize.linprog` for the\n sake of stability.\n\n Attributes\n ----------\n coef_ : array of shape (n_features,)\n Estimated coefficients for the features.\n\n intercept_ : float\n The intercept of the model, aka bias term.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n n_iter_ : int\n The actual number of iterations performed by the solver.\n\n See Also\n --------\n Lasso : The Lasso is a linear model that estimates sparse coefficients\n with l1 regularization.\n HuberRegressor : Linear regression model that is robust to outliers.\n\n Examples\n --------\n >>> from sklearn.linear_model import QuantileRegressor\n >>> import numpy as np\n >>> n_samples, n_features = 10, 2\n >>> rng = np.random.RandomState(0)\n >>> y = rng.randn(n_samples)\n >>> X = rng.randn(n_samples, n_features)\n >>> reg = QuantileRegressor(quantile=0.8).fit(X, y)\n >>> np.mean(y <= reg.predict(X))\n 0.8\n ", "source_code": "\n\nclass QuantileRegressor(LinearModel, RegressorMixin, BaseEstimator):\n \"\"\"Linear regression model that predicts conditional quantiles.\n\n The linear :class:`QuantileRegressor` optimizes the pinball loss for a\n desired `quantile` and is robust to outliers.\n\n This model uses an L1 regularization like\n :class:`~sklearn.linear_model.Lasso`.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 1.0\n\n Parameters\n ----------\n quantile : float, default=0.5\n The quantile that the model tries to predict. It must be strictly\n between 0 and 1. If 0.5 (default), the model predicts the 50%\n quantile, i.e. the median.\n\n alpha : float, default=1.0\n Regularization constant that multiplies the L1 penalty term.\n\n fit_intercept : bool, default=True\n Whether or not to fit the intercept.\n\n solver : {'highs-ds', 'highs-ipm', 'highs', 'interior-point', 'revised simplex'}, default='interior-point'\n Method used by :func:`scipy.optimize.linprog` to solve the linear\n programming formulation. Note that the highs methods are recommended\n for usage with `scipy>=1.6.0` because they are the fastest ones.\n\n solver_options : dict, default=None\n Additional parameters passed to :func:`scipy.optimize.linprog` as\n options. If `None` and if `solver='interior-point'`, then\n `{\"lstsq\": True}` is passed to :func:`scipy.optimize.linprog` for the\n sake of stability.\n\n Attributes\n ----------\n coef_ : array of shape (n_features,)\n Estimated coefficients for the features.\n\n intercept_ : float\n The intercept of the model, aka bias term.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. 
versionadded:: 1.0\n\n n_iter_ : int\n The actual number of iterations performed by the solver.\n\n See Also\n --------\n Lasso : The Lasso is a linear model that estimates sparse coefficients\n with l1 regularization.\n HuberRegressor : Linear regression model that is robust to outliers.\n\n Examples\n --------\n >>> from sklearn.linear_model import QuantileRegressor\n >>> import numpy as np\n >>> n_samples, n_features = 10, 2\n >>> rng = np.random.RandomState(0)\n >>> y = rng.randn(n_samples)\n >>> X = rng.randn(n_samples, n_features)\n >>> reg = QuantileRegressor(quantile=0.8).fit(X, y)\n >>> np.mean(y <= reg.predict(X))\n 0.8\n \"\"\"\n \n def __init__(self, *, quantile=0.5, alpha=1.0, fit_intercept=True, solver='interior-point', solver_options=None):\n self.quantile = quantile\n self.alpha = alpha\n self.fit_intercept = fit_intercept\n self.solver = solver\n self.solver_options = solver_options\n \n def fit(self, X, y, sample_weight=None):\n \"\"\"Fit the model according to the given training data.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training data.\n\n y : array-like of shape (n_samples,)\n Target values.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n Returns\n -------\n self : object\n Returns self.\n \"\"\"\n (X, y) = self._validate_data(X, y, accept_sparse=False, y_numeric=True, multi_output=False)\n sample_weight = _check_sample_weight(sample_weight, X)\n n_features = X.shape[1]\n n_params = n_features\n if self.fit_intercept:\n n_params += 1\n if self.alpha >= 0:\n alpha = np.sum(sample_weight) * self.alpha\n else:\n raise ValueError(f'Penalty alpha must be a non-negative number, got {self.alpha}')\n if self.quantile >= 1.0 or self.quantile <= 0.0:\n raise ValueError(f'Quantile should be strictly between 0.0 and 1.0, got {self.quantile}')\n if not isinstance(self.fit_intercept, bool):\n raise ValueError(f'The argument fit_intercept must be bool, got {self.fit_intercept}')\n if self.solver not in ('highs-ds', 'highs-ipm', 'highs', 'interior-point', 'revised simplex'):\n raise ValueError(f'Invalid value for argument solver, got {self.solver}')\n elif self.solver == 'revised simplex' and sp_version < parse_version('1.3.0'):\n raise ValueError(f\"Solver 'revised simplex' is only available with scipy>=1.3.0, got {sp_version}\")\n elif self.solver in ('highs-ds', 'highs-ipm', 'highs') and sp_version < parse_version('1.6.0'):\n raise ValueError(f'Solver {self.solver} is only available with scipy>=1.6.0, got {sp_version}')\n if self.solver_options is not None and not isinstance(self.solver_options, dict):\n raise ValueError(f'Invalid value for argument solver_options, must be None or a dictionary, got {self.solver_options}')\n if self.solver_options is None and self.solver == 'interior-point':\n solver_options = {'lstsq': True}\n else:\n solver_options = self.solver_options\n mask = sample_weight != 0\n n_mask = int(np.sum(mask))\n c = np.concatenate([np.full(2 * n_params, fill_value=alpha), sample_weight[mask] * self.quantile, sample_weight[mask] * (1 - self.quantile)])\n if self.fit_intercept:\n c[0] = 0\n c[n_params] = 0\n A_eq = np.concatenate([np.ones((n_mask, 1)), X[mask], -np.ones((n_mask, 1)), -X[mask], np.eye(n_mask), -np.eye(n_mask)], axis=1)\n else:\n A_eq = np.concatenate([X[mask], -X[mask], np.eye(n_mask), -np.eye(n_mask)], axis=1)\n b_eq = y[mask]\n result = linprog(c=c, A_eq=A_eq, b_eq=b_eq, method=self.solver, options=solver_options)\n solution = result.x\n if not result.success:\n 
failure = {1: 'Iteration limit reached.', 2: 'Problem appears to be infeasible.', 3: 'Problem appears to be unbounded.', 4: 'Numerical difficulties encountered.'}\n warnings.warn(f'Linear programming for QuantileRegressor did not succeed.\\nStatus is {result.status}: ' + failure.setdefault(result.status, 'unknown reason') + '\\n' + 'Result message of linprog:\\n' + result.message, ConvergenceWarning)\n params = solution[:n_params] - solution[n_params:2 * n_params]\n self.n_iter_ = result.nit\n if self.fit_intercept:\n self.coef_ = params[1:]\n self.intercept_ = params[0]\n else:\n self.coef_ = params\n self.intercept_ = 0.0\n return self\n" }, @@ -23901,7 +23969,7 @@ "sklearn.linear_model._ransac.RANSACRegressor._more_tags" ], "is_public": true, - "description": "RANSAC (RANdom SAmple Consensus) algorithm.\n\nRANSAC is an iterative algorithm for the robust estimation of parameters from a subset of inliers from the complete data set. Read more in the :ref:`User Guide `.", + "description": "RANSAC (RANdom SAmple Consensus) algorithm.\n\nRANSAC is an iterative algorithm for the robust estimation of parameters\nfrom a subset of inliers from the complete data set.\n\nRead more in the :ref:`User Guide `.", "docstring": "RANSAC (RANdom SAmple Consensus) algorithm.\n\n RANSAC is an iterative algorithm for the robust estimation of parameters\n from a subset of inliers from the complete data set.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n base_estimator : object, default=None\n Base estimator object which implements the following methods:\n\n * `fit(X, y)`: Fit model to given training data and target values.\n * `score(X, y)`: Returns the mean accuracy on the given test data,\n which is used for the stop criterion defined by `stop_score`.\n Additionally, the score is used to decide which of two equally\n large consensus sets is chosen as the better one.\n * `predict(X)`: Returns predicted values using the linear model,\n which is used to compute residual error using loss function.\n\n If `base_estimator` is None, then\n :class:`~sklearn.linear_model.LinearRegression` is used for\n target values of dtype float.\n\n Note that the current implementation only supports regression\n estimators.\n\n min_samples : int (>= 1) or float ([0, 1]), default=None\n Minimum number of samples chosen randomly from original data. Treated\n as an absolute number of samples for `min_samples >= 1`, treated as a\n relative number `ceil(min_samples * X.shape[0])` for\n `min_samples < 1`. This is typically chosen as the minimal number of\n samples necessary to estimate the given `base_estimator`. By default a\n ``sklearn.linear_model.LinearRegression()`` estimator is assumed and\n `min_samples` is chosen as ``X.shape[1] + 1``. This parameter is highly\n dependent upon the model, so if a `base_estimator` other than\n :class:`linear_model.LinearRegression` is used, the user is\n encouraged to provide a value.\n\n .. deprecated:: 1.0\n Not setting `min_samples` explicitly will raise an error in version\n 1.2 for models other than\n :class:`~sklearn.linear_model.LinearRegression`. To keep the old\n default behavior, set `min_samples=X.shape[1] + 1` explicitly.\n\n residual_threshold : float, default=None\n Maximum residual for a data sample to be classified as an inlier.\n By default the threshold is chosen as the MAD (median absolute\n deviation) of the target values `y`. 
Points whose residuals are\n strictly equal to the threshold are considered as inliers.\n\n is_data_valid : callable, default=None\n This function is called with the randomly selected data before the\n model is fitted to it: `is_data_valid(X, y)`. If its return value is\n False the current randomly chosen sub-sample is skipped.\n\n is_model_valid : callable, default=None\n This function is called with the estimated model and the randomly\n selected data: `is_model_valid(model, X, y)`. If its return value is\n False the current randomly chosen sub-sample is skipped.\n Rejecting samples with this function is computationally costlier than\n with `is_data_valid`. `is_model_valid` should therefore only be used if\n the estimated model is needed for making the rejection decision.\n\n max_trials : int, default=100\n Maximum number of iterations for random sample selection.\n\n max_skips : int, default=np.inf\n Maximum number of iterations that can be skipped due to finding zero\n inliers or invalid data defined by ``is_data_valid`` or invalid models\n defined by ``is_model_valid``.\n\n .. versionadded:: 0.19\n\n stop_n_inliers : int, default=np.inf\n Stop iteration if at least this number of inliers are found.\n\n stop_score : float, default=np.inf\n Stop iteration if score is greater equal than this threshold.\n\n stop_probability : float in range [0, 1], default=0.99\n RANSAC iteration stops if at least one outlier-free set of the training\n data is sampled in RANSAC. This requires to generate at least N\n samples (iterations)::\n\n N >= log(1 - probability) / log(1 - e**m)\n\n where the probability (confidence) is typically set to high value such\n as 0.99 (the default) and e is the current fraction of inliers w.r.t.\n the total number of samples.\n\n loss : str, callable, default='absolute_error'\n String inputs, 'absolute_error' and 'squared_error' are supported which\n find the absolute error and squared error per sample respectively.\n\n If ``loss`` is a callable, then it should be a function that takes\n two arrays as inputs, the true and predicted value and returns a 1-D\n array with the i-th value of the array corresponding to the loss\n on ``X[i]``.\n\n If the loss on a sample is greater than the ``residual_threshold``,\n then this sample is classified as an outlier.\n\n .. versionadded:: 0.18\n\n .. deprecated:: 1.0\n The loss 'squared_loss' was deprecated in v1.0 and will be removed\n in version 1.2. Use `loss='squared_error'` which is equivalent.\n\n .. deprecated:: 1.0\n The loss 'absolute_loss' was deprecated in v1.0 and will be removed\n in version 1.2. Use `loss='absolute_error'` which is equivalent.\n\n random_state : int, RandomState instance, default=None\n The generator used to initialize the centers.\n Pass an int for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n Attributes\n ----------\n estimator_ : object\n Best fitted model (copy of the `base_estimator` object).\n\n n_trials_ : int\n Number of random selection trials until one of the stop criteria is\n met. It is always ``<= max_trials``.\n\n inlier_mask_ : bool array of shape [n_samples]\n Boolean mask of inliers classified as ``True``.\n\n n_skips_no_inliers_ : int\n Number of iterations skipped due to finding zero inliers.\n\n .. versionadded:: 0.19\n\n n_skips_invalid_data_ : int\n Number of iterations skipped due to invalid data defined by\n ``is_data_valid``.\n\n .. 
versionadded:: 0.19\n\n n_skips_invalid_model_ : int\n Number of iterations skipped due to an invalid model defined by\n ``is_model_valid``.\n\n .. versionadded:: 0.19\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n HuberRegressor : Linear regression model that is robust to outliers.\n TheilSenRegressor : Theil-Sen Estimator robust multivariate regression model.\n SGDRegressor : Fitted by minimizing a regularized empirical loss with SGD.\n\n References\n ----------\n .. [1] https://en.wikipedia.org/wiki/RANSAC\n .. [2] https://www.sri.com/sites/default/files/publications/ransac-publication.pdf\n .. [3] http://www.bmva.org/bmvc/2009/Papers/Paper355/Paper355.pdf\n\n Examples\n --------\n >>> from sklearn.linear_model import RANSACRegressor\n >>> from sklearn.datasets import make_regression\n >>> X, y = make_regression(\n ... n_samples=200, n_features=2, noise=4.0, random_state=0)\n >>> reg = RANSACRegressor(random_state=0).fit(X, y)\n >>> reg.score(X, y)\n 0.9885...\n >>> reg.predict(X[:1,])\n array([-31.9417...])\n ", "source_code": "\n\nclass RANSACRegressor(MetaEstimatorMixin, RegressorMixin, MultiOutputMixin, BaseEstimator):\n \"\"\"RANSAC (RANdom SAmple Consensus) algorithm.\n\n RANSAC is an iterative algorithm for the robust estimation of parameters\n from a subset of inliers from the complete data set.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n base_estimator : object, default=None\n Base estimator object which implements the following methods:\n\n * `fit(X, y)`: Fit model to given training data and target values.\n * `score(X, y)`: Returns the mean accuracy on the given test data,\n which is used for the stop criterion defined by `stop_score`.\n Additionally, the score is used to decide which of two equally\n large consensus sets is chosen as the better one.\n * `predict(X)`: Returns predicted values using the linear model,\n which is used to compute residual error using loss function.\n\n If `base_estimator` is None, then\n :class:`~sklearn.linear_model.LinearRegression` is used for\n target values of dtype float.\n\n Note that the current implementation only supports regression\n estimators.\n\n min_samples : int (>= 1) or float ([0, 1]), default=None\n Minimum number of samples chosen randomly from original data. Treated\n as an absolute number of samples for `min_samples >= 1`, treated as a\n relative number `ceil(min_samples * X.shape[0])` for\n `min_samples < 1`. This is typically chosen as the minimal number of\n samples necessary to estimate the given `base_estimator`. By default a\n ``sklearn.linear_model.LinearRegression()`` estimator is assumed and\n `min_samples` is chosen as ``X.shape[1] + 1``. This parameter is highly\n dependent upon the model, so if a `base_estimator` other than\n :class:`linear_model.LinearRegression` is used, the user is\n encouraged to provide a value.\n\n .. deprecated:: 1.0\n Not setting `min_samples` explicitly will raise an error in version\n 1.2 for models other than\n :class:`~sklearn.linear_model.LinearRegression`. 
To keep the old\n default behavior, set `min_samples=X.shape[1] + 1` explicitly.\n\n residual_threshold : float, default=None\n Maximum residual for a data sample to be classified as an inlier.\n By default the threshold is chosen as the MAD (median absolute\n deviation) of the target values `y`. Points whose residuals are\n strictly equal to the threshold are considered as inliers.\n\n is_data_valid : callable, default=None\n This function is called with the randomly selected data before the\n model is fitted to it: `is_data_valid(X, y)`. If its return value is\n False the current randomly chosen sub-sample is skipped.\n\n is_model_valid : callable, default=None\n This function is called with the estimated model and the randomly\n selected data: `is_model_valid(model, X, y)`. If its return value is\n False the current randomly chosen sub-sample is skipped.\n Rejecting samples with this function is computationally costlier than\n with `is_data_valid`. `is_model_valid` should therefore only be used if\n the estimated model is needed for making the rejection decision.\n\n max_trials : int, default=100\n Maximum number of iterations for random sample selection.\n\n max_skips : int, default=np.inf\n Maximum number of iterations that can be skipped due to finding zero\n inliers or invalid data defined by ``is_data_valid`` or invalid models\n defined by ``is_model_valid``.\n\n .. versionadded:: 0.19\n\n stop_n_inliers : int, default=np.inf\n Stop iteration if at least this number of inliers are found.\n\n stop_score : float, default=np.inf\n Stop iteration if score is greater equal than this threshold.\n\n stop_probability : float in range [0, 1], default=0.99\n RANSAC iteration stops if at least one outlier-free set of the training\n data is sampled in RANSAC. This requires to generate at least N\n samples (iterations)::\n\n N >= log(1 - probability) / log(1 - e**m)\n\n where the probability (confidence) is typically set to high value such\n as 0.99 (the default) and e is the current fraction of inliers w.r.t.\n the total number of samples.\n\n loss : str, callable, default='absolute_error'\n String inputs, 'absolute_error' and 'squared_error' are supported which\n find the absolute error and squared error per sample respectively.\n\n If ``loss`` is a callable, then it should be a function that takes\n two arrays as inputs, the true and predicted value and returns a 1-D\n array with the i-th value of the array corresponding to the loss\n on ``X[i]``.\n\n If the loss on a sample is greater than the ``residual_threshold``,\n then this sample is classified as an outlier.\n\n .. versionadded:: 0.18\n\n .. deprecated:: 1.0\n The loss 'squared_loss' was deprecated in v1.0 and will be removed\n in version 1.2. Use `loss='squared_error'` which is equivalent.\n\n .. deprecated:: 1.0\n The loss 'absolute_loss' was deprecated in v1.0 and will be removed\n in version 1.2. Use `loss='absolute_error'` which is equivalent.\n\n random_state : int, RandomState instance, default=None\n The generator used to initialize the centers.\n Pass an int for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n Attributes\n ----------\n estimator_ : object\n Best fitted model (copy of the `base_estimator` object).\n\n n_trials_ : int\n Number of random selection trials until one of the stop criteria is\n met. 
It is always ``<= max_trials``.\n\n inlier_mask_ : bool array of shape [n_samples]\n Boolean mask of inliers classified as ``True``.\n\n n_skips_no_inliers_ : int\n Number of iterations skipped due to finding zero inliers.\n\n .. versionadded:: 0.19\n\n n_skips_invalid_data_ : int\n Number of iterations skipped due to invalid data defined by\n ``is_data_valid``.\n\n .. versionadded:: 0.19\n\n n_skips_invalid_model_ : int\n Number of iterations skipped due to an invalid model defined by\n ``is_model_valid``.\n\n .. versionadded:: 0.19\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n HuberRegressor : Linear regression model that is robust to outliers.\n TheilSenRegressor : Theil-Sen Estimator robust multivariate regression model.\n SGDRegressor : Fitted by minimizing a regularized empirical loss with SGD.\n\n References\n ----------\n .. [1] https://en.wikipedia.org/wiki/RANSAC\n .. [2] https://www.sri.com/sites/default/files/publications/ransac-publication.pdf\n .. [3] http://www.bmva.org/bmvc/2009/Papers/Paper355/Paper355.pdf\n\n Examples\n --------\n >>> from sklearn.linear_model import RANSACRegressor\n >>> from sklearn.datasets import make_regression\n >>> X, y = make_regression(\n ... n_samples=200, n_features=2, noise=4.0, random_state=0)\n >>> reg = RANSACRegressor(random_state=0).fit(X, y)\n >>> reg.score(X, y)\n 0.9885...\n >>> reg.predict(X[:1,])\n array([-31.9417...])\n \"\"\"\n \n def __init__(self, base_estimator=None, *, min_samples=None, residual_threshold=None, is_data_valid=None, is_model_valid=None, max_trials=100, max_skips=np.inf, stop_n_inliers=np.inf, stop_score=np.inf, stop_probability=0.99, loss='absolute_error', random_state=None):\n self.base_estimator = base_estimator\n self.min_samples = min_samples\n self.residual_threshold = residual_threshold\n self.is_data_valid = is_data_valid\n self.is_model_valid = is_model_valid\n self.max_trials = max_trials\n self.max_skips = max_skips\n self.stop_n_inliers = stop_n_inliers\n self.stop_score = stop_score\n self.stop_probability = stop_probability\n self.random_state = random_state\n self.loss = loss\n \n def fit(self, X, y, sample_weight=None):\n \"\"\"Fit estimator using RANSAC algorithm.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training data.\n\n y : array-like of shape (n_samples,) or (n_samples, n_targets)\n Target values.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Individual weights for each sample\n raises error if sample_weight is passed and base_estimator\n fit method does not support it.\n\n .. versionadded:: 0.18\n\n Returns\n -------\n self : object\n Fitted `RANSACRegressor` estimator.\n\n Raises\n ------\n ValueError\n If no valid consensus set could be found. 
This occurs if\n `is_data_valid` and `is_model_valid` return False for all\n `max_trials` randomly chosen sub-samples.\n \"\"\"\n check_X_params = dict(accept_sparse='csr', force_all_finite=False)\n check_y_params = dict(ensure_2d=False)\n (X, y) = self._validate_data(X, y, validate_separately=(check_X_params, check_y_params))\n check_consistent_length(X, y)\n if self.base_estimator is not None:\n base_estimator = clone(self.base_estimator)\n else:\n base_estimator = LinearRegression()\n if self.min_samples is None:\n if not isinstance(base_estimator, LinearRegression):\n warnings.warn(f'From version 1.2, `min_samples` needs to be explicitly set otherwise an error will be raised. To keep the current behavior, you need to set `min_samples` to `X.shape[1] + 1 that is {X.shape[1] + 1}', FutureWarning)\n min_samples = X.shape[1] + 1\n elif 0 < self.min_samples < 1:\n min_samples = np.ceil(self.min_samples * X.shape[0])\n elif self.min_samples >= 1:\n if self.min_samples % 1 != 0:\n raise ValueError('Absolute number of samples must be an integer value.')\n min_samples = self.min_samples\n else:\n raise ValueError('Value for `min_samples` must be scalar and positive.')\n if min_samples > X.shape[0]:\n raise ValueError('`min_samples` may not be larger than number of samples: n_samples = %d.' % X.shape[0])\n if self.stop_probability < 0 or self.stop_probability > 1:\n raise ValueError('`stop_probability` must be in range [0, 1].')\n if self.residual_threshold is None:\n residual_threshold = np.median(np.abs(y - np.median(y)))\n else:\n residual_threshold = self.residual_threshold\n if self.loss in ('absolute_error', 'absolute_loss'):\n if self.loss == 'absolute_loss':\n warnings.warn(\"The loss 'absolute_loss' was deprecated in v1.0 and will be removed in version 1.2. Use `loss='absolute_error'` which is equivalent.\", FutureWarning)\n if y.ndim == 1:\n loss_function = lambda y_true, y_pred: np.abs(y_true - y_pred)\n else:\n loss_function = lambda y_true, y_pred: np.sum(np.abs(y_true - y_pred), axis=1)\n elif self.loss in ('squared_error', 'squared_loss'):\n if self.loss == 'squared_loss':\n warnings.warn(\"The loss 'squared_loss' was deprecated in v1.0 and will be removed in version 1.2. Use `loss='squared_error'` which is equivalent.\", FutureWarning)\n if y.ndim == 1:\n loss_function = lambda y_true, y_pred: (y_true - y_pred)**2\n else:\n loss_function = lambda y_true, y_pred: np.sum((y_true - y_pred)**2, axis=1)\n elif callable(self.loss):\n loss_function = self.loss\n else:\n raise ValueError(\"loss should be 'absolute_error', 'squared_error' or a callable. Got %s. \" % self.loss)\n random_state = check_random_state(self.random_state)\n try:\n base_estimator.set_params(random_state=random_state)\n except ValueError:\n pass\n estimator_fit_has_sample_weight = has_fit_parameter(base_estimator, 'sample_weight')\n estimator_name = type(base_estimator).__name__\n if sample_weight is not None and not estimator_fit_has_sample_weight:\n raise ValueError('%s does not support sample_weight. Samples weights are only used for the calibration itself.' 
% estimator_name)\n if sample_weight is not None:\n sample_weight = _check_sample_weight(sample_weight, X)\n n_inliers_best = 1\n score_best = -np.inf\n inlier_mask_best = None\n X_inlier_best = None\n y_inlier_best = None\n inlier_best_idxs_subset = None\n self.n_skips_no_inliers_ = 0\n self.n_skips_invalid_data_ = 0\n self.n_skips_invalid_model_ = 0\n n_samples = X.shape[0]\n sample_idxs = np.arange(n_samples)\n self.n_trials_ = 0\n max_trials = self.max_trials\n while self.n_trials_ < max_trials:\n self.n_trials_ += 1\n if self.n_skips_no_inliers_ + self.n_skips_invalid_data_ + self.n_skips_invalid_model_ > self.max_skips:\n break\n subset_idxs = sample_without_replacement(n_samples, min_samples, random_state=random_state)\n X_subset = X[subset_idxs]\n y_subset = y[subset_idxs]\n if self.is_data_valid is not None and not self.is_data_valid(X_subset, y_subset):\n self.n_skips_invalid_data_ += 1\n continue\n if sample_weight is None:\n base_estimator.fit(X_subset, y_subset)\n else:\n base_estimator.fit(X_subset, y_subset, sample_weight=sample_weight[subset_idxs])\n if self.is_model_valid is not None and not self.is_model_valid(base_estimator, X_subset, y_subset):\n self.n_skips_invalid_model_ += 1\n continue\n y_pred = base_estimator.predict(X)\n residuals_subset = loss_function(y, y_pred)\n inlier_mask_subset = residuals_subset <= residual_threshold\n n_inliers_subset = np.sum(inlier_mask_subset)\n if n_inliers_subset < n_inliers_best:\n self.n_skips_no_inliers_ += 1\n continue\n inlier_idxs_subset = sample_idxs[inlier_mask_subset]\n X_inlier_subset = X[inlier_idxs_subset]\n y_inlier_subset = y[inlier_idxs_subset]\n score_subset = base_estimator.score(X_inlier_subset, y_inlier_subset)\n if n_inliers_subset == n_inliers_best and score_subset < score_best:\n continue\n n_inliers_best = n_inliers_subset\n score_best = score_subset\n inlier_mask_best = inlier_mask_subset\n X_inlier_best = X_inlier_subset\n y_inlier_best = y_inlier_subset\n inlier_best_idxs_subset = inlier_idxs_subset\n max_trials = min(max_trials, _dynamic_max_trials(n_inliers_best, n_samples, min_samples, self.stop_probability))\n if n_inliers_best >= self.stop_n_inliers or score_best >= self.stop_score:\n break\n if inlier_mask_best is None:\n if self.n_skips_no_inliers_ + self.n_skips_invalid_data_ + self.n_skips_invalid_model_ > self.max_skips:\n raise ValueError('RANSAC skipped more iterations than `max_skips` without finding a valid consensus set. Iterations were skipped because each randomly chosen sub-sample failed the passing criteria. See estimator attributes for diagnostics (n_skips*).')\n else:\n raise ValueError('RANSAC could not find a valid consensus set. All `max_trials` iterations were skipped because each randomly chosen sub-sample failed the passing criteria. See estimator attributes for diagnostics (n_skips*).')\n elif self.n_skips_no_inliers_ + self.n_skips_invalid_data_ + self.n_skips_invalid_model_ > self.max_skips:\n warnings.warn('RANSAC found a valid consensus set but exited early due to skipping more iterations than `max_skips`. 
See estimator attributes for diagnostics (n_skips*).', ConvergenceWarning)\n if sample_weight is None:\n base_estimator.fit(X_inlier_best, y_inlier_best)\n else:\n base_estimator.fit(X_inlier_best, y_inlier_best, sample_weight=sample_weight[inlier_best_idxs_subset])\n self.estimator_ = base_estimator\n self.inlier_mask_ = inlier_mask_best\n return self\n \n def predict(self, X):\n \"\"\"Predict using the estimated model.\n\n This is a wrapper for `estimator_.predict(X)`.\n\n Parameters\n ----------\n X : {array-like or sparse matrix} of shape (n_samples, n_features)\n Input data.\n\n Returns\n -------\n y : array, shape = [n_samples] or [n_samples, n_targets]\n Returns predicted values.\n \"\"\"\n check_is_fitted(self)\n X = self._validate_data(X, force_all_finite=False, accept_sparse=True, reset=False)\n return self.estimator_.predict(X)\n \n def score(self, X, y):\n \"\"\"Return the score of the prediction.\n\n This is a wrapper for `estimator_.score(X, y)`.\n\n Parameters\n ----------\n X : (array-like or sparse matrix} of shape (n_samples, n_features)\n Training data.\n\n y : array-like of shape (n_samples,) or (n_samples, n_targets)\n Target values.\n\n Returns\n -------\n z : float\n Score of the prediction.\n \"\"\"\n check_is_fitted(self)\n X = self._validate_data(X, force_all_finite=False, accept_sparse=True, reset=False)\n return self.estimator_.score(X, y)\n \n def _more_tags(self):\n return {'_xfail_checks': {'check_sample_weights_invariance': 'zero sample_weight is not equivalent to removing samples'}}\n" }, @@ -23919,7 +23987,7 @@ "sklearn.linear_model._ridge.Ridge.fit" ], "is_public": true, - "description": "Linear least squares with l2 regularization.\n\nMinimizes the objective function:: ||y - Xw||^2_2 + alpha * ||w||^2_2 This model solves a regression model where the loss function is the linear least squares function and regularization is given by the l2-norm. Also known as Ridge Regression or Tikhonov regularization. This estimator has built-in support for multi-variate regression (i.e., when y is a 2d-array of shape (n_samples, n_targets)). Read more in the :ref:`User Guide `.", + "description": "Linear least squares with l2 regularization.\n\nMinimizes the objective function::\n\n||y - Xw||^2_2 + alpha * ||w||^2_2\n\nThis model solves a regression model where the loss function is\nthe linear least squares function and regularization is given by\nthe l2-norm. Also known as Ridge Regression or Tikhonov regularization.\nThis estimator has built-in support for multi-variate regression\n(i.e., when y is a 2d-array of shape (n_samples, n_targets)).\n\nRead more in the :ref:`User Guide `.", "docstring": "Linear least squares with l2 regularization.\n\n Minimizes the objective function::\n\n ||y - Xw||^2_2 + alpha * ||w||^2_2\n\n This model solves a regression model where the loss function is\n the linear least squares function and regularization is given by\n the l2-norm. Also known as Ridge Regression or Tikhonov regularization.\n This estimator has built-in support for multi-variate regression\n (i.e., when y is a 2d-array of shape (n_samples, n_targets)).\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n alpha : {float, ndarray of shape (n_targets,)}, default=1.0\n Regularization strength; must be a positive float. Regularization\n improves the conditioning of the problem and reduces the variance of\n the estimates. 
Larger values specify stronger regularization.\n Alpha corresponds to ``1 / (2C)`` in other linear models such as\n :class:`~sklearn.linear_model.LogisticRegression` or\n :class:`~sklearn.svm.LinearSVC`. If an array is passed, penalties are\n assumed to be specific to the targets. Hence they must correspond in\n number.\n\n fit_intercept : bool, default=True\n Whether to fit the intercept for this model. If set\n to false, no intercept will be used in calculations\n (i.e. ``X`` and ``y`` are expected to be centered).\n\n normalize : bool, default=False\n This parameter is ignored when ``fit_intercept`` is set to False.\n If True, the regressors X will be normalized before regression by\n subtracting the mean and dividing by the l2-norm.\n If you wish to standardize, please use\n :class:`~sklearn.preprocessing.StandardScaler` before calling ``fit``\n on an estimator with ``normalize=False``.\n\n .. deprecated:: 1.0\n ``normalize`` was deprecated in version 1.0 and\n will be removed in 1.2.\n\n copy_X : bool, default=True\n If True, X will be copied; else, it may be overwritten.\n\n max_iter : int, default=None\n Maximum number of iterations for conjugate gradient solver.\n For 'sparse_cg' and 'lsqr' solvers, the default value is determined\n by scipy.sparse.linalg. For 'sag' solver, the default value is 1000.\n For 'lbfgs' solver, the default value is 15000.\n\n tol : float, default=1e-3\n Precision of the solution.\n\n solver : {'auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga', 'lbfgs'}, default='auto'\n Solver to use in the computational routines:\n\n - 'auto' chooses the solver automatically based on the type of data.\n\n - 'svd' uses a Singular Value Decomposition of X to compute the Ridge\n coefficients. More stable for singular matrices than 'cholesky'.\n\n - 'cholesky' uses the standard scipy.linalg.solve function to\n obtain a closed-form solution.\n\n - 'sparse_cg' uses the conjugate gradient solver as found in\n scipy.sparse.linalg.cg. As an iterative algorithm, this solver is\n more appropriate than 'cholesky' for large-scale data\n (possibility to set `tol` and `max_iter`).\n\n - 'lsqr' uses the dedicated regularized least-squares routine\n scipy.sparse.linalg.lsqr. It is the fastest and uses an iterative\n procedure.\n\n - 'sag' uses a Stochastic Average Gradient descent, and 'saga' uses\n its improved, unbiased version named SAGA. Both methods also use an\n iterative procedure, and are often faster than other solvers when\n both n_samples and n_features are large. Note that 'sag' and\n 'saga' fast convergence is only guaranteed on features with\n approximately the same scale. You can preprocess the data with a\n scaler from sklearn.preprocessing.\n\n - 'lbfgs' uses L-BFGS-B algorithm implemented in\n `scipy.optimize.minimize`. It can be used only when `positive`\n is True.\n\n All last six solvers support both dense and sparse data. However, only\n 'sag', 'sparse_cg', and 'lbfgs' support sparse input when `fit_intercept`\n is True.\n\n .. versionadded:: 0.17\n Stochastic Average Gradient descent solver.\n .. versionadded:: 0.19\n SAGA solver.\n\n positive : bool, default=False\n When set to ``True``, forces the coefficients to be positive.\n Only 'lbfgs' solver is supported in this case.\n\n random_state : int, RandomState instance, default=None\n Used when ``solver`` == 'sag' or 'saga' to shuffle the data.\n See :term:`Glossary ` for details.\n\n .. 
versionadded:: 0.17\n `random_state` to support Stochastic Average Gradient.\n\n Attributes\n ----------\n coef_ : ndarray of shape (n_features,) or (n_targets, n_features)\n Weight vector(s).\n\n intercept_ : float or ndarray of shape (n_targets,)\n Independent term in decision function. Set to 0.0 if\n ``fit_intercept = False``.\n\n n_iter_ : None or ndarray of shape (n_targets,)\n Actual number of iterations for each target. Available only for\n sag and lsqr solvers. Other solvers will return None.\n\n .. versionadded:: 0.17\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n RidgeClassifier : Ridge classifier.\n RidgeCV : Ridge regression with built-in cross validation.\n :class:`~sklearn.kernel_ridge.KernelRidge` : Kernel ridge regression\n combines ridge regression with the kernel trick.\n\n Examples\n --------\n >>> from sklearn.linear_model import Ridge\n >>> import numpy as np\n >>> n_samples, n_features = 10, 5\n >>> rng = np.random.RandomState(0)\n >>> y = rng.randn(n_samples)\n >>> X = rng.randn(n_samples, n_features)\n >>> clf = Ridge(alpha=1.0)\n >>> clf.fit(X, y)\n Ridge()\n ", "source_code": "\n\nclass Ridge(MultiOutputMixin, RegressorMixin, _BaseRidge):\n \"\"\"Linear least squares with l2 regularization.\n\n Minimizes the objective function::\n\n ||y - Xw||^2_2 + alpha * ||w||^2_2\n\n This model solves a regression model where the loss function is\n the linear least squares function and regularization is given by\n the l2-norm. Also known as Ridge Regression or Tikhonov regularization.\n This estimator has built-in support for multi-variate regression\n (i.e., when y is a 2d-array of shape (n_samples, n_targets)).\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n alpha : {float, ndarray of shape (n_targets,)}, default=1.0\n Regularization strength; must be a positive float. Regularization\n improves the conditioning of the problem and reduces the variance of\n the estimates. Larger values specify stronger regularization.\n Alpha corresponds to ``1 / (2C)`` in other linear models such as\n :class:`~sklearn.linear_model.LogisticRegression` or\n :class:`~sklearn.svm.LinearSVC`. If an array is passed, penalties are\n assumed to be specific to the targets. Hence they must correspond in\n number.\n\n fit_intercept : bool, default=True\n Whether to fit the intercept for this model. If set\n to false, no intercept will be used in calculations\n (i.e. ``X`` and ``y`` are expected to be centered).\n\n normalize : bool, default=False\n This parameter is ignored when ``fit_intercept`` is set to False.\n If True, the regressors X will be normalized before regression by\n subtracting the mean and dividing by the l2-norm.\n If you wish to standardize, please use\n :class:`~sklearn.preprocessing.StandardScaler` before calling ``fit``\n on an estimator with ``normalize=False``.\n\n .. deprecated:: 1.0\n ``normalize`` was deprecated in version 1.0 and\n will be removed in 1.2.\n\n copy_X : bool, default=True\n If True, X will be copied; else, it may be overwritten.\n\n max_iter : int, default=None\n Maximum number of iterations for conjugate gradient solver.\n For 'sparse_cg' and 'lsqr' solvers, the default value is determined\n by scipy.sparse.linalg. 
For 'sag' solver, the default value is 1000.\n For 'lbfgs' solver, the default value is 15000.\n\n tol : float, default=1e-3\n Precision of the solution.\n\n solver : {'auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga', 'lbfgs'}, default='auto'\n Solver to use in the computational routines:\n\n - 'auto' chooses the solver automatically based on the type of data.\n\n - 'svd' uses a Singular Value Decomposition of X to compute the Ridge\n coefficients. More stable for singular matrices than 'cholesky'.\n\n - 'cholesky' uses the standard scipy.linalg.solve function to\n obtain a closed-form solution.\n\n - 'sparse_cg' uses the conjugate gradient solver as found in\n scipy.sparse.linalg.cg. As an iterative algorithm, this solver is\n more appropriate than 'cholesky' for large-scale data\n (possibility to set `tol` and `max_iter`).\n\n - 'lsqr' uses the dedicated regularized least-squares routine\n scipy.sparse.linalg.lsqr. It is the fastest and uses an iterative\n procedure.\n\n - 'sag' uses a Stochastic Average Gradient descent, and 'saga' uses\n its improved, unbiased version named SAGA. Both methods also use an\n iterative procedure, and are often faster than other solvers when\n both n_samples and n_features are large. Note that 'sag' and\n 'saga' fast convergence is only guaranteed on features with\n approximately the same scale. You can preprocess the data with a\n scaler from sklearn.preprocessing.\n\n - 'lbfgs' uses L-BFGS-B algorithm implemented in\n `scipy.optimize.minimize`. It can be used only when `positive`\n is True.\n\n All last six solvers support both dense and sparse data. However, only\n 'sag', 'sparse_cg', and 'lbfgs' support sparse input when `fit_intercept`\n is True.\n\n .. versionadded:: 0.17\n Stochastic Average Gradient descent solver.\n .. versionadded:: 0.19\n SAGA solver.\n\n positive : bool, default=False\n When set to ``True``, forces the coefficients to be positive.\n Only 'lbfgs' solver is supported in this case.\n\n random_state : int, RandomState instance, default=None\n Used when ``solver`` == 'sag' or 'saga' to shuffle the data.\n See :term:`Glossary ` for details.\n\n .. versionadded:: 0.17\n `random_state` to support Stochastic Average Gradient.\n\n Attributes\n ----------\n coef_ : ndarray of shape (n_features,) or (n_targets, n_features)\n Weight vector(s).\n\n intercept_ : float or ndarray of shape (n_targets,)\n Independent term in decision function. Set to 0.0 if\n ``fit_intercept = False``.\n\n n_iter_ : None or ndarray of shape (n_targets,)\n Actual number of iterations for each target. Available only for\n sag and lsqr solvers. Other solvers will return None.\n\n .. versionadded:: 0.17\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. 
versionadded:: 1.0\n\n See Also\n --------\n RidgeClassifier : Ridge classifier.\n RidgeCV : Ridge regression with built-in cross validation.\n :class:`~sklearn.kernel_ridge.KernelRidge` : Kernel ridge regression\n combines ridge regression with the kernel trick.\n\n Examples\n --------\n >>> from sklearn.linear_model import Ridge\n >>> import numpy as np\n >>> n_samples, n_features = 10, 5\n >>> rng = np.random.RandomState(0)\n >>> y = rng.randn(n_samples)\n >>> X = rng.randn(n_samples, n_features)\n >>> clf = Ridge(alpha=1.0)\n >>> clf.fit(X, y)\n Ridge()\n \"\"\"\n \n def __init__(self, alpha=1.0, *, fit_intercept=True, normalize='deprecated', copy_X=True, max_iter=None, tol=0.001, solver='auto', positive=False, random_state=None):\n super().__init__(alpha=alpha, fit_intercept=fit_intercept, normalize=normalize, copy_X=copy_X, max_iter=max_iter, tol=tol, solver=solver, positive=positive, random_state=random_state)\n \n def fit(self, X, y, sample_weight=None):\n \"\"\"Fit Ridge regression model.\n\n Parameters\n ----------\n X : {ndarray, sparse matrix} of shape (n_samples, n_features)\n Training data.\n\n y : ndarray of shape (n_samples,) or (n_samples, n_targets)\n Target values.\n\n sample_weight : float or ndarray of shape (n_samples,), default=None\n Individual weights for each sample. If given a float, every sample\n will have the same weight.\n\n Returns\n -------\n self : object\n Fitted estimator.\n \"\"\"\n _accept_sparse = _get_valid_accept_sparse(sparse.issparse(X), self.solver)\n (X, y) = self._validate_data(X, y, accept_sparse=_accept_sparse, dtype=[np.float64, np.float32], multi_output=True, y_numeric=True)\n return super().fit(X, y, sample_weight=sample_weight)\n" }, @@ -23934,40 +24002,38 @@ ], "methods": [], "is_public": true, - "description": "Ridge regression with built-in cross-validation.\n\nSee glossary entry for :term:`cross-validation estimator`. By default, it performs efficient Leave-One-Out Cross-Validation. Read more in the :ref:`User Guide `.", - "docstring": "Ridge regression with built-in cross-validation.\n\n See glossary entry for :term:`cross-validation estimator`.\n\n By default, it performs efficient Leave-One-Out Cross-Validation.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n alphas : ndarray of shape (n_alphas,), default=(0.1, 1.0, 10.0)\n Array of alpha values to try.\n Regularization strength; must be a positive float. Regularization\n improves the conditioning of the problem and reduces the variance of\n the estimates. Larger values specify stronger regularization.\n Alpha corresponds to ``1 / (2C)`` in other linear models such as\n :class:`~sklearn.linear_model.LogisticRegression` or\n :class:`~sklearn.svm.LinearSVC`.\n If using Leave-One-Out cross-validation, alphas must be positive.\n\n fit_intercept : bool, default=True\n Whether to calculate the intercept for this model. If set\n to false, no intercept will be used in calculations\n (i.e. data is expected to be centered).\n\n normalize : bool, default=False\n This parameter is ignored when ``fit_intercept`` is set to False.\n If True, the regressors X will be normalized before regression by\n subtracting the mean and dividing by the l2-norm.\n If you wish to standardize, please use\n :class:`~sklearn.preprocessing.StandardScaler` before calling ``fit``\n on an estimator with ``normalize=False``.\n\n .. 
deprecated:: 1.0\n ``normalize`` was deprecated in version 1.0 and will be removed in\n 1.2.\n\n scoring : str, callable, default=None\n A string (see model evaluation documentation) or\n a scorer callable object / function with signature\n ``scorer(estimator, X, y)``.\n If None, the negative mean squared error if cv is 'auto' or None\n (i.e. when using leave-one-out cross-validation), and r2 score\n otherwise.\n\n cv : int, cross-validation generator or an iterable, default=None\n Determines the cross-validation splitting strategy.\n Possible inputs for cv are:\n\n - None, to use the efficient Leave-One-Out cross-validation\n - integer, to specify the number of folds.\n - :term:`CV splitter`,\n - An iterable yielding (train, test) splits as arrays of indices.\n\n For integer/None inputs, if ``y`` is binary or multiclass,\n :class:`~sklearn.model_selection.StratifiedKFold` is used, else,\n :class:`~sklearn.model_selection.KFold` is used.\n\n Refer :ref:`User Guide ` for the various\n cross-validation strategies that can be used here.\n\n gcv_mode : {'auto', 'svd', eigen'}, default='auto'\n Flag indicating which strategy to use when performing\n Leave-One-Out Cross-Validation. Options are::\n\n 'auto' : use 'svd' if n_samples > n_features, otherwise use 'eigen'\n 'svd' : force use of singular value decomposition of X when X is\n dense, eigenvalue decomposition of X^T.X when X is sparse.\n 'eigen' : force computation via eigendecomposition of X.X^T\n\n The 'auto' mode is the default and is intended to pick the cheaper\n option of the two depending on the shape of the training data.\n\n store_cv_values : bool, default=False\n Flag indicating if the cross-validation values corresponding to\n each alpha should be stored in the ``cv_values_`` attribute (see\n below). This flag is only compatible with ``cv=None`` (i.e. using\n Leave-One-Out Cross-Validation).\n\n alpha_per_target : bool, default=False\n Flag indicating whether to optimize the alpha value (picked from the\n `alphas` parameter list) for each target separately (for multi-output\n settings: multiple prediction targets). When set to `True`, after\n fitting, the `alpha_` attribute will contain a value for each target.\n When set to `False`, a single alpha is used for all targets.\n\n .. versionadded:: 0.24\n\n Attributes\n ----------\n cv_values_ : ndarray of shape (n_samples, n_alphas) or shape (n_samples, n_targets, n_alphas), optional\n Cross-validation values for each alpha (only available if\n ``store_cv_values=True`` and ``cv=None``). After ``fit()`` has been\n called, this attribute will contain the mean squared errors if\n `scoring is None` otherwise it will contain standardized per point\n prediction values.\n\n coef_ : ndarray of shape (n_features) or (n_targets, n_features)\n Weight vector(s).\n\n intercept_ : float or ndarray of shape (n_targets,)\n Independent term in decision function. Set to 0.0 if\n ``fit_intercept = False``.\n\n alpha_ : float or ndarray of shape (n_targets,)\n Estimated regularization parameter, or, if ``alpha_per_target=True``,\n the estimated regularization parameter for each target.\n\n best_score_ : float or ndarray of shape (n_targets,)\n Score of base estimator with best alpha, or, if\n ``alpha_per_target=True``, a score for each target.\n\n .. versionadded:: 0.23\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. 
Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n Ridge : Ridge regression.\n RidgeClassifier : Classifier based on ridge regression on {-1, 1} labels.\n RidgeClassifierCV : Ridge classifier with built-in cross validation.\n\n Examples\n --------\n >>> from sklearn.datasets import load_diabetes\n >>> from sklearn.linear_model import RidgeCV\n >>> X, y = load_diabetes(return_X_y=True)\n >>> clf = RidgeCV(alphas=[1e-3, 1e-2, 1e-1, 1]).fit(X, y)\n >>> clf.score(X, y)\n 0.5166...\n ", - "source_code": "\n\nclass RidgeCV(MultiOutputMixin, RegressorMixin, _BaseRidgeCV):\n \"\"\"Ridge regression with built-in cross-validation.\n\n See glossary entry for :term:`cross-validation estimator`.\n\n By default, it performs efficient Leave-One-Out Cross-Validation.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n alphas : ndarray of shape (n_alphas,), default=(0.1, 1.0, 10.0)\n Array of alpha values to try.\n Regularization strength; must be a positive float. Regularization\n improves the conditioning of the problem and reduces the variance of\n the estimates. Larger values specify stronger regularization.\n Alpha corresponds to ``1 / (2C)`` in other linear models such as\n :class:`~sklearn.linear_model.LogisticRegression` or\n :class:`~sklearn.svm.LinearSVC`.\n If using Leave-One-Out cross-validation, alphas must be positive.\n\n fit_intercept : bool, default=True\n Whether to calculate the intercept for this model. If set\n to false, no intercept will be used in calculations\n (i.e. data is expected to be centered).\n\n normalize : bool, default=False\n This parameter is ignored when ``fit_intercept`` is set to False.\n If True, the regressors X will be normalized before regression by\n subtracting the mean and dividing by the l2-norm.\n If you wish to standardize, please use\n :class:`~sklearn.preprocessing.StandardScaler` before calling ``fit``\n on an estimator with ``normalize=False``.\n\n .. deprecated:: 1.0\n ``normalize`` was deprecated in version 1.0 and will be removed in\n 1.2.\n\n scoring : str, callable, default=None\n A string (see model evaluation documentation) or\n a scorer callable object / function with signature\n ``scorer(estimator, X, y)``.\n If None, the negative mean squared error if cv is 'auto' or None\n (i.e. when using leave-one-out cross-validation), and r2 score\n otherwise.\n\n cv : int, cross-validation generator or an iterable, default=None\n Determines the cross-validation splitting strategy.\n Possible inputs for cv are:\n\n - None, to use the efficient Leave-One-Out cross-validation\n - integer, to specify the number of folds.\n - :term:`CV splitter`,\n - An iterable yielding (train, test) splits as arrays of indices.\n\n For integer/None inputs, if ``y`` is binary or multiclass,\n :class:`~sklearn.model_selection.StratifiedKFold` is used, else,\n :class:`~sklearn.model_selection.KFold` is used.\n\n Refer :ref:`User Guide ` for the various\n cross-validation strategies that can be used here.\n\n gcv_mode : {'auto', 'svd', eigen'}, default='auto'\n Flag indicating which strategy to use when performing\n Leave-One-Out Cross-Validation. 
Options are::\n\n 'auto' : use 'svd' if n_samples > n_features, otherwise use 'eigen'\n 'svd' : force use of singular value decomposition of X when X is\n dense, eigenvalue decomposition of X^T.X when X is sparse.\n 'eigen' : force computation via eigendecomposition of X.X^T\n\n The 'auto' mode is the default and is intended to pick the cheaper\n option of the two depending on the shape of the training data.\n\n store_cv_values : bool, default=False\n Flag indicating if the cross-validation values corresponding to\n each alpha should be stored in the ``cv_values_`` attribute (see\n below). This flag is only compatible with ``cv=None`` (i.e. using\n Leave-One-Out Cross-Validation).\n\n alpha_per_target : bool, default=False\n Flag indicating whether to optimize the alpha value (picked from the\n `alphas` parameter list) for each target separately (for multi-output\n settings: multiple prediction targets). When set to `True`, after\n fitting, the `alpha_` attribute will contain a value for each target.\n When set to `False`, a single alpha is used for all targets.\n\n .. versionadded:: 0.24\n\n Attributes\n ----------\n cv_values_ : ndarray of shape (n_samples, n_alphas) or shape (n_samples, n_targets, n_alphas), optional\n Cross-validation values for each alpha (only available if\n ``store_cv_values=True`` and ``cv=None``). After ``fit()`` has been\n called, this attribute will contain the mean squared errors if\n `scoring is None` otherwise it will contain standardized per point\n prediction values.\n\n coef_ : ndarray of shape (n_features) or (n_targets, n_features)\n Weight vector(s).\n\n intercept_ : float or ndarray of shape (n_targets,)\n Independent term in decision function. Set to 0.0 if\n ``fit_intercept = False``.\n\n alpha_ : float or ndarray of shape (n_targets,)\n Estimated regularization parameter, or, if ``alpha_per_target=True``,\n the estimated regularization parameter for each target.\n\n best_score_ : float or ndarray of shape (n_targets,)\n Score of base estimator with best alpha, or, if\n ``alpha_per_target=True``, a score for each target.\n\n .. versionadded:: 0.23\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n Ridge : Ridge regression.\n RidgeClassifier : Classifier based on ridge regression on {-1, 1} labels.\n RidgeClassifierCV : Ridge classifier with built-in cross validation.\n\n Examples\n --------\n >>> from sklearn.datasets import load_diabetes\n >>> from sklearn.linear_model import RidgeCV\n >>> X, y = load_diabetes(return_X_y=True)\n >>> clf = RidgeCV(alphas=[1e-3, 1e-2, 1e-1, 1]).fit(X, y)\n >>> clf.score(X, y)\n 0.5166...\n \"\"\"\n \n" + "description": "Ridge regression with built-in cross-validation.\n\nSee glossary entry for :term:`cross-validation estimator`.\n\nBy default, it performs efficient Leave-One-Out Cross-Validation.\n\nRead more in the :ref:`User Guide `.", + "docstring": "Ridge regression with built-in cross-validation.\n\n See glossary entry for :term:`cross-validation estimator`.\n\n By default, it performs efficient Leave-One-Out Cross-Validation.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n alphas : ndarray of shape (n_alphas,), default=(0.1, 1.0, 10.0)\n Array of alpha values to try.\n Regularization strength; must be a positive float. 
Regularization\n improves the conditioning of the problem and reduces the variance of\n the estimates. Larger values specify stronger regularization.\n Alpha corresponds to ``1 / (2C)`` in other linear models such as\n :class:`~sklearn.linear_model.LogisticRegression` or\n :class:`~sklearn.svm.LinearSVC`.\n If using Leave-One-Out cross-validation, alphas must be positive.\n\n fit_intercept : bool, default=True\n Whether to calculate the intercept for this model. If set\n to false, no intercept will be used in calculations\n (i.e. data is expected to be centered).\n\n normalize : bool, default=False\n This parameter is ignored when ``fit_intercept`` is set to False.\n If True, the regressors X will be normalized before regression by\n subtracting the mean and dividing by the l2-norm.\n If you wish to standardize, please use\n :class:`~sklearn.preprocessing.StandardScaler` before calling ``fit``\n on an estimator with ``normalize=False``.\n\n .. deprecated:: 1.0\n ``normalize`` was deprecated in version 1.0 and will be removed in\n 1.2.\n\n scoring : str, callable, default=None\n A string (see model evaluation documentation) or\n a scorer callable object / function with signature\n ``scorer(estimator, X, y)``.\n If None, the negative mean squared error if cv is 'auto' or None\n (i.e. when using leave-one-out cross-validation), and r2 score\n otherwise.\n\n cv : int, cross-validation generator or an iterable, default=None\n Determines the cross-validation splitting strategy.\n Possible inputs for cv are:\n\n - None, to use the efficient Leave-One-Out cross-validation\n - integer, to specify the number of folds.\n - :term:`CV splitter`,\n - An iterable yielding (train, test) splits as arrays of indices.\n\n For integer/None inputs, if ``y`` is binary or multiclass,\n :class:`~sklearn.model_selection.StratifiedKFold` is used, else,\n :class:`~sklearn.model_selection.KFold` is used.\n\n Refer :ref:`User Guide ` for the various\n cross-validation strategies that can be used here.\n\n gcv_mode : {'auto', 'svd', 'eigen'}, default='auto'\n Flag indicating which strategy to use when performing\n Leave-One-Out Cross-Validation. Options are::\n\n 'auto' : use 'svd' if n_samples > n_features, otherwise use 'eigen'\n 'svd' : force use of singular value decomposition of X when X is\n dense, eigenvalue decomposition of X^T.X when X is sparse.\n 'eigen' : force computation via eigendecomposition of X.X^T\n\n The 'auto' mode is the default and is intended to pick the cheaper\n option of the two depending on the shape of the training data.\n\n store_cv_values : bool, default=False\n Flag indicating if the cross-validation values corresponding to\n each alpha should be stored in the ``cv_values_`` attribute (see\n below). This flag is only compatible with ``cv=None`` (i.e. using\n Leave-One-Out Cross-Validation).\n\n alpha_per_target : bool, default=False\n Flag indicating whether to optimize the alpha value (picked from the\n `alphas` parameter list) for each target separately (for multi-output\n settings: multiple prediction targets). When set to `True`, after\n fitting, the `alpha_` attribute will contain a value for each target.\n When set to `False`, a single alpha is used for all targets.\n\n .. versionadded:: 0.24\n\n Attributes\n ----------\n cv_values_ : ndarray of shape (n_samples, n_alphas) or shape (n_samples, n_targets, n_alphas), optional\n Cross-validation values for each alpha (only available if\n ``store_cv_values=True`` and ``cv=None``). 
After ``fit()`` has been\n called, this attribute will contain the mean squared errors if\n `scoring is None` otherwise it will contain standardized per point\n prediction values.\n\n coef_ : ndarray of shape (n_features) or (n_targets, n_features)\n Weight vector(s).\n\n intercept_ : float or ndarray of shape (n_targets,)\n Independent term in decision function. Set to 0.0 if\n ``fit_intercept = False``.\n\n alpha_ : float or ndarray of shape (n_targets,)\n Estimated regularization parameter, or, if ``alpha_per_target=True``,\n the estimated regularization parameter for each target.\n\n best_score_ : float or ndarray of shape (n_targets,)\n Score of base estimator with best alpha, or, if\n ``alpha_per_target=True``, a score for each target.\n\n .. versionadded:: 0.23\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n Ridge : Ridge regression.\n RidgeClassifier : Classifier based on ridge regression on {-1, 1} labels.\n RidgeClassifierCV : Ridge classifier with built-in cross validation.\n\n Examples\n --------\n >>> from sklearn.datasets import load_diabetes\n >>> from sklearn.linear_model import RidgeCV\n >>> X, y = load_diabetes(return_X_y=True)\n >>> clf = RidgeCV(alphas=[1e-3, 1e-2, 1e-1, 1]).fit(X, y)\n >>> clf.score(X, y)\n 0.5166...\n ", + "source_code": "\n\nclass RidgeCV(MultiOutputMixin, RegressorMixin, _BaseRidgeCV):\n \"\"\"Ridge regression with built-in cross-validation.\n\n See glossary entry for :term:`cross-validation estimator`.\n\n By default, it performs efficient Leave-One-Out Cross-Validation.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n alphas : ndarray of shape (n_alphas,), default=(0.1, 1.0, 10.0)\n Array of alpha values to try.\n Regularization strength; must be a positive float. Regularization\n improves the conditioning of the problem and reduces the variance of\n the estimates. Larger values specify stronger regularization.\n Alpha corresponds to ``1 / (2C)`` in other linear models such as\n :class:`~sklearn.linear_model.LogisticRegression` or\n :class:`~sklearn.svm.LinearSVC`.\n If using Leave-One-Out cross-validation, alphas must be positive.\n\n fit_intercept : bool, default=True\n Whether to calculate the intercept for this model. If set\n to false, no intercept will be used in calculations\n (i.e. data is expected to be centered).\n\n normalize : bool, default=False\n This parameter is ignored when ``fit_intercept`` is set to False.\n If True, the regressors X will be normalized before regression by\n subtracting the mean and dividing by the l2-norm.\n If you wish to standardize, please use\n :class:`~sklearn.preprocessing.StandardScaler` before calling ``fit``\n on an estimator with ``normalize=False``.\n\n .. deprecated:: 1.0\n ``normalize`` was deprecated in version 1.0 and will be removed in\n 1.2.\n\n scoring : str, callable, default=None\n A string (see model evaluation documentation) or\n a scorer callable object / function with signature\n ``scorer(estimator, X, y)``.\n If None, the negative mean squared error if cv is 'auto' or None\n (i.e. 
when using leave-one-out cross-validation), and r2 score\n otherwise.\n\n cv : int, cross-validation generator or an iterable, default=None\n Determines the cross-validation splitting strategy.\n Possible inputs for cv are:\n\n - None, to use the efficient Leave-One-Out cross-validation\n - integer, to specify the number of folds.\n - :term:`CV splitter`,\n - An iterable yielding (train, test) splits as arrays of indices.\n\n For integer/None inputs, if ``y`` is binary or multiclass,\n :class:`~sklearn.model_selection.StratifiedKFold` is used, else,\n :class:`~sklearn.model_selection.KFold` is used.\n\n Refer :ref:`User Guide ` for the various\n cross-validation strategies that can be used here.\n\n gcv_mode : {'auto', 'svd', 'eigen'}, default='auto'\n Flag indicating which strategy to use when performing\n Leave-One-Out Cross-Validation. Options are::\n\n 'auto' : use 'svd' if n_samples > n_features, otherwise use 'eigen'\n 'svd' : force use of singular value decomposition of X when X is\n dense, eigenvalue decomposition of X^T.X when X is sparse.\n 'eigen' : force computation via eigendecomposition of X.X^T\n\n The 'auto' mode is the default and is intended to pick the cheaper\n option of the two depending on the shape of the training data.\n\n store_cv_values : bool, default=False\n Flag indicating if the cross-validation values corresponding to\n each alpha should be stored in the ``cv_values_`` attribute (see\n below). This flag is only compatible with ``cv=None`` (i.e. using\n Leave-One-Out Cross-Validation).\n\n alpha_per_target : bool, default=False\n Flag indicating whether to optimize the alpha value (picked from the\n `alphas` parameter list) for each target separately (for multi-output\n settings: multiple prediction targets). When set to `True`, after\n fitting, the `alpha_` attribute will contain a value for each target.\n When set to `False`, a single alpha is used for all targets.\n\n .. versionadded:: 0.24\n\n Attributes\n ----------\n cv_values_ : ndarray of shape (n_samples, n_alphas) or shape (n_samples, n_targets, n_alphas), optional\n Cross-validation values for each alpha (only available if\n ``store_cv_values=True`` and ``cv=None``). After ``fit()`` has been\n called, this attribute will contain the mean squared errors if\n `scoring is None` otherwise it will contain standardized per point\n prediction values.\n\n coef_ : ndarray of shape (n_features) or (n_targets, n_features)\n Weight vector(s).\n\n intercept_ : float or ndarray of shape (n_targets,)\n Independent term in decision function. Set to 0.0 if\n ``fit_intercept = False``.\n\n alpha_ : float or ndarray of shape (n_targets,)\n Estimated regularization parameter, or, if ``alpha_per_target=True``,\n the estimated regularization parameter for each target.\n\n best_score_ : float or ndarray of shape (n_targets,)\n Score of base estimator with best alpha, or, if\n ``alpha_per_target=True``, a score for each target.\n\n .. versionadded:: 0.23\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. 
versionadded:: 1.0\n\n See Also\n --------\n Ridge : Ridge regression.\n RidgeClassifier : Classifier based on ridge regression on {-1, 1} labels.\n RidgeClassifierCV : Ridge classifier with built-in cross validation.\n\n Examples\n --------\n >>> from sklearn.datasets import load_diabetes\n >>> from sklearn.linear_model import RidgeCV\n >>> X, y = load_diabetes(return_X_y=True)\n >>> clf = RidgeCV(alphas=[1e-3, 1e-2, 1e-1, 1]).fit(X, y)\n >>> clf.score(X, y)\n 0.5166...\n \"\"\"\n \n" }, { "name": "RidgeClassifier", "qname": "sklearn.linear_model._ridge.RidgeClassifier", "decorators": [], - "superclasses": ["LinearClassifierMixin", "_BaseRidge"], + "superclasses": ["_RidgeClassifierMixin", "_BaseRidge"], "methods": [ "sklearn.linear_model._ridge.RidgeClassifier.__init__", - "sklearn.linear_model._ridge.RidgeClassifier.fit", - "sklearn.linear_model._ridge.RidgeClassifier.classes_@getter" + "sklearn.linear_model._ridge.RidgeClassifier.fit" ], "is_public": true, - "description": "Classifier using Ridge regression.\n\nThis classifier first converts the target values into ``{-1, 1}`` and then treats the problem as a regression task (multi-output regression in the multiclass case). Read more in the :ref:`User Guide `.", - "docstring": "Classifier using Ridge regression.\n\n This classifier first converts the target values into ``{-1, 1}`` and\n then treats the problem as a regression task (multi-output regression in\n the multiclass case).\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n alpha : float, default=1.0\n Regularization strength; must be a positive float. Regularization\n improves the conditioning of the problem and reduces the variance of\n the estimates. Larger values specify stronger regularization.\n Alpha corresponds to ``1 / (2C)`` in other linear models such as\n :class:`~sklearn.linear_model.LogisticRegression` or\n :class:`~sklearn.svm.LinearSVC`.\n\n fit_intercept : bool, default=True\n Whether to calculate the intercept for this model. If set to false, no\n intercept will be used in calculations (e.g. data is expected to be\n already centered).\n\n normalize : bool, default=False\n This parameter is ignored when ``fit_intercept`` is set to False.\n If True, the regressors X will be normalized before regression by\n subtracting the mean and dividing by the l2-norm.\n If you wish to standardize, please use\n :class:`~sklearn.preprocessing.StandardScaler` before calling ``fit``\n on an estimator with ``normalize=False``.\n\n .. 
deprecated:: 1.0\n ``normalize`` was deprecated in version 1.0 and\n will be removed in 1.2.\n\n copy_X : bool, default=True\n If True, X will be copied; else, it may be overwritten.\n\n max_iter : int, default=None\n Maximum number of iterations for conjugate gradient solver.\n The default value is determined by scipy.sparse.linalg.\n\n tol : float, default=1e-3\n Precision of the solution.\n\n class_weight : dict or 'balanced', default=None\n Weights associated with classes in the form ``{class_label: weight}``.\n If not given, all classes are supposed to have weight one.\n\n The \"balanced\" mode uses the values of y to automatically adjust\n weights inversely proportional to class frequencies in the input data\n as ``n_samples / (n_classes * np.bincount(y))``.\n\n solver : {'auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga', 'lbfgs'}, default='auto'\n Solver to use in the computational routines:\n\n - 'auto' chooses the solver automatically based on the type of data.\n\n - 'svd' uses a Singular Value Decomposition of X to compute the Ridge\n coefficients. More stable for singular matrices than 'cholesky'.\n\n - 'cholesky' uses the standard scipy.linalg.solve function to\n obtain a closed-form solution.\n\n - 'sparse_cg' uses the conjugate gradient solver as found in\n scipy.sparse.linalg.cg. As an iterative algorithm, this solver is\n more appropriate than 'cholesky' for large-scale data\n (possibility to set `tol` and `max_iter`).\n\n - 'lsqr' uses the dedicated regularized least-squares routine\n scipy.sparse.linalg.lsqr. It is the fastest and uses an iterative\n procedure.\n\n - 'sag' uses a Stochastic Average Gradient descent, and 'saga' uses\n its unbiased and more flexible version named SAGA. Both methods\n use an iterative procedure, and are often faster than other solvers\n when both n_samples and n_features are large. Note that 'sag' and\n 'saga' fast convergence is only guaranteed on features with\n approximately the same scale. You can preprocess the data with a\n scaler from sklearn.preprocessing.\n\n .. versionadded:: 0.17\n Stochastic Average Gradient descent solver.\n .. versionadded:: 0.19\n SAGA solver.\n\n - 'lbfgs' uses L-BFGS-B algorithm implemented in\n `scipy.optimize.minimize`. It can be used only when `positive`\n is True.\n\n positive : bool, default=False\n When set to ``True``, forces the coefficients to be positive.\n Only 'lbfgs' solver is supported in this case.\n\n random_state : int, RandomState instance, default=None\n Used when ``solver`` == 'sag' or 'saga' to shuffle the data.\n See :term:`Glossary ` for details.\n\n Attributes\n ----------\n coef_ : ndarray of shape (1, n_features) or (n_classes, n_features)\n Coefficient of the features in the decision function.\n\n ``coef_`` is of shape (1, n_features) when the given problem is binary.\n\n intercept_ : float or ndarray of shape (n_targets,)\n Independent term in decision function. Set to 0.0 if\n ``fit_intercept = False``.\n\n n_iter_ : None or ndarray of shape (n_targets,)\n Actual number of iterations for each target. Available only for\n sag and lsqr solvers. Other solvers will return None.\n\n classes_ : ndarray of shape (n_classes,)\n The classes labels.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. 
versionadded:: 1.0\n\n See Also\n --------\n Ridge : Ridge regression.\n RidgeClassifierCV : Ridge classifier with built-in cross validation.\n\n Notes\n -----\n For multi-class classification, n_class classifiers are trained in\n a one-versus-all approach. Concretely, this is implemented by taking\n advantage of the multi-variate response support in Ridge.\n\n Examples\n --------\n >>> from sklearn.datasets import load_breast_cancer\n >>> from sklearn.linear_model import RidgeClassifier\n >>> X, y = load_breast_cancer(return_X_y=True)\n >>> clf = RidgeClassifier().fit(X, y)\n >>> clf.score(X, y)\n 0.9595...\n ", - "source_code": "\n\nclass RidgeClassifier(LinearClassifierMixin, _BaseRidge):\n \"\"\"Classifier using Ridge regression.\n\n This classifier first converts the target values into ``{-1, 1}`` and\n then treats the problem as a regression task (multi-output regression in\n the multiclass case).\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n alpha : float, default=1.0\n Regularization strength; must be a positive float. Regularization\n improves the conditioning of the problem and reduces the variance of\n the estimates. Larger values specify stronger regularization.\n Alpha corresponds to ``1 / (2C)`` in other linear models such as\n :class:`~sklearn.linear_model.LogisticRegression` or\n :class:`~sklearn.svm.LinearSVC`.\n\n fit_intercept : bool, default=True\n Whether to calculate the intercept for this model. If set to false, no\n intercept will be used in calculations (e.g. data is expected to be\n already centered).\n\n normalize : bool, default=False\n This parameter is ignored when ``fit_intercept`` is set to False.\n If True, the regressors X will be normalized before regression by\n subtracting the mean and dividing by the l2-norm.\n If you wish to standardize, please use\n :class:`~sklearn.preprocessing.StandardScaler` before calling ``fit``\n on an estimator with ``normalize=False``.\n\n .. deprecated:: 1.0\n ``normalize`` was deprecated in version 1.0 and\n will be removed in 1.2.\n\n copy_X : bool, default=True\n If True, X will be copied; else, it may be overwritten.\n\n max_iter : int, default=None\n Maximum number of iterations for conjugate gradient solver.\n The default value is determined by scipy.sparse.linalg.\n\n tol : float, default=1e-3\n Precision of the solution.\n\n class_weight : dict or 'balanced', default=None\n Weights associated with classes in the form ``{class_label: weight}``.\n If not given, all classes are supposed to have weight one.\n\n The \"balanced\" mode uses the values of y to automatically adjust\n weights inversely proportional to class frequencies in the input data\n as ``n_samples / (n_classes * np.bincount(y))``.\n\n solver : {'auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga', 'lbfgs'}, default='auto'\n Solver to use in the computational routines:\n\n - 'auto' chooses the solver automatically based on the type of data.\n\n - 'svd' uses a Singular Value Decomposition of X to compute the Ridge\n coefficients. More stable for singular matrices than 'cholesky'.\n\n - 'cholesky' uses the standard scipy.linalg.solve function to\n obtain a closed-form solution.\n\n - 'sparse_cg' uses the conjugate gradient solver as found in\n scipy.sparse.linalg.cg. As an iterative algorithm, this solver is\n more appropriate than 'cholesky' for large-scale data\n (possibility to set `tol` and `max_iter`).\n\n - 'lsqr' uses the dedicated regularized least-squares routine\n scipy.sparse.linalg.lsqr. 
It is the fastest and uses an iterative\n procedure.\n\n - 'sag' uses a Stochastic Average Gradient descent, and 'saga' uses\n its unbiased and more flexible version named SAGA. Both methods\n use an iterative procedure, and are often faster than other solvers\n when both n_samples and n_features are large. Note that 'sag' and\n 'saga' fast convergence is only guaranteed on features with\n approximately the same scale. You can preprocess the data with a\n scaler from sklearn.preprocessing.\n\n .. versionadded:: 0.17\n Stochastic Average Gradient descent solver.\n .. versionadded:: 0.19\n SAGA solver.\n\n - 'lbfgs' uses L-BFGS-B algorithm implemented in\n `scipy.optimize.minimize`. It can be used only when `positive`\n is True.\n\n positive : bool, default=False\n When set to ``True``, forces the coefficients to be positive.\n Only 'lbfgs' solver is supported in this case.\n\n random_state : int, RandomState instance, default=None\n Used when ``solver`` == 'sag' or 'saga' to shuffle the data.\n See :term:`Glossary ` for details.\n\n Attributes\n ----------\n coef_ : ndarray of shape (1, n_features) or (n_classes, n_features)\n Coefficient of the features in the decision function.\n\n ``coef_`` is of shape (1, n_features) when the given problem is binary.\n\n intercept_ : float or ndarray of shape (n_targets,)\n Independent term in decision function. Set to 0.0 if\n ``fit_intercept = False``.\n\n n_iter_ : None or ndarray of shape (n_targets,)\n Actual number of iterations for each target. Available only for\n sag and lsqr solvers. Other solvers will return None.\n\n classes_ : ndarray of shape (n_classes,)\n The classes labels.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n Ridge : Ridge regression.\n RidgeClassifierCV : Ridge classifier with built-in cross validation.\n\n Notes\n -----\n For multi-class classification, n_class classifiers are trained in\n a one-versus-all approach. Concretely, this is implemented by taking\n advantage of the multi-variate response support in Ridge.\n\n Examples\n --------\n >>> from sklearn.datasets import load_breast_cancer\n >>> from sklearn.linear_model import RidgeClassifier\n >>> X, y = load_breast_cancer(return_X_y=True)\n >>> clf = RidgeClassifier().fit(X, y)\n >>> clf.score(X, y)\n 0.9595...\n \"\"\"\n \n def __init__(self, alpha=1.0, *, fit_intercept=True, normalize='deprecated', copy_X=True, max_iter=None, tol=0.001, class_weight=None, solver='auto', positive=False, random_state=None):\n super().__init__(alpha=alpha, fit_intercept=fit_intercept, normalize=normalize, copy_X=copy_X, max_iter=max_iter, tol=tol, solver=solver, positive=positive, random_state=random_state)\n self.class_weight = class_weight\n \n def fit(self, X, y, sample_weight=None):\n \"\"\"Fit Ridge classifier model.\n\n Parameters\n ----------\n X : {ndarray, sparse matrix} of shape (n_samples, n_features)\n Training data.\n\n y : ndarray of shape (n_samples,)\n Target values.\n\n sample_weight : float or ndarray of shape (n_samples,), default=None\n Individual weights for each sample. If given a float, every sample\n will have the same weight.\n\n .. 
versionadded:: 0.17\n *sample_weight* support to Classifier.\n\n Returns\n -------\n self : object\n Instance of the estimator.\n \"\"\"\n _accept_sparse = _get_valid_accept_sparse(sparse.issparse(X), self.solver)\n (X, y) = self._validate_data(X, y, accept_sparse=_accept_sparse, multi_output=True, y_numeric=False)\n sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype)\n self._label_binarizer = LabelBinarizer(pos_label=1, neg_label=-1)\n Y = self._label_binarizer.fit_transform(y)\n if not self._label_binarizer.y_type_.startswith('multilabel'):\n y = column_or_1d(y, warn=True)\n else:\n raise ValueError(\"%s doesn't support multi-label classification\" % self.__class__.__name__)\n if self.class_weight:\n sample_weight = sample_weight * compute_sample_weight(self.class_weight, y)\n super().fit(X, Y, sample_weight=sample_weight)\n return self\n \n @property\n def classes_(self):\n \"\"\"Classes labels.\"\"\"\n return self._label_binarizer.classes_\n" + "description": "Classifier using Ridge regression.\n\nThis classifier first converts the target values into ``{-1, 1}`` and\nthen treats the problem as a regression task (multi-output regression in\nthe multiclass case).\n\nRead more in the :ref:`User Guide `.", + "docstring": "Classifier using Ridge regression.\n\n This classifier first converts the target values into ``{-1, 1}`` and\n then treats the problem as a regression task (multi-output regression in\n the multiclass case).\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n alpha : float, default=1.0\n Regularization strength; must be a positive float. Regularization\n improves the conditioning of the problem and reduces the variance of\n the estimates. Larger values specify stronger regularization.\n Alpha corresponds to ``1 / (2C)`` in other linear models such as\n :class:`~sklearn.linear_model.LogisticRegression` or\n :class:`~sklearn.svm.LinearSVC`.\n\n fit_intercept : bool, default=True\n Whether to calculate the intercept for this model. If set to false, no\n intercept will be used in calculations (e.g. data is expected to be\n already centered).\n\n normalize : bool, default=False\n This parameter is ignored when ``fit_intercept`` is set to False.\n If True, the regressors X will be normalized before regression by\n subtracting the mean and dividing by the l2-norm.\n If you wish to standardize, please use\n :class:`~sklearn.preprocessing.StandardScaler` before calling ``fit``\n on an estimator with ``normalize=False``.\n\n .. 
deprecated:: 1.0\n ``normalize`` was deprecated in version 1.0 and\n will be removed in 1.2.\n\n copy_X : bool, default=True\n If True, X will be copied; else, it may be overwritten.\n\n max_iter : int, default=None\n Maximum number of iterations for conjugate gradient solver.\n The default value is determined by scipy.sparse.linalg.\n\n tol : float, default=1e-3\n Precision of the solution.\n\n class_weight : dict or 'balanced', default=None\n Weights associated with classes in the form ``{class_label: weight}``.\n If not given, all classes are supposed to have weight one.\n\n The \"balanced\" mode uses the values of y to automatically adjust\n weights inversely proportional to class frequencies in the input data\n as ``n_samples / (n_classes * np.bincount(y))``.\n\n solver : {'auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga', 'lbfgs'}, default='auto'\n Solver to use in the computational routines:\n\n - 'auto' chooses the solver automatically based on the type of data.\n\n - 'svd' uses a Singular Value Decomposition of X to compute the Ridge\n coefficients. More stable for singular matrices than 'cholesky'.\n\n - 'cholesky' uses the standard scipy.linalg.solve function to\n obtain a closed-form solution.\n\n - 'sparse_cg' uses the conjugate gradient solver as found in\n scipy.sparse.linalg.cg. As an iterative algorithm, this solver is\n more appropriate than 'cholesky' for large-scale data\n (possibility to set `tol` and `max_iter`).\n\n - 'lsqr' uses the dedicated regularized least-squares routine\n scipy.sparse.linalg.lsqr. It is the fastest and uses an iterative\n procedure.\n\n - 'sag' uses a Stochastic Average Gradient descent, and 'saga' uses\n its unbiased and more flexible version named SAGA. Both methods\n use an iterative procedure, and are often faster than other solvers\n when both n_samples and n_features are large. Note that 'sag' and\n 'saga' fast convergence is only guaranteed on features with\n approximately the same scale. You can preprocess the data with a\n scaler from sklearn.preprocessing.\n\n .. versionadded:: 0.17\n Stochastic Average Gradient descent solver.\n .. versionadded:: 0.19\n SAGA solver.\n\n - 'lbfgs' uses L-BFGS-B algorithm implemented in\n `scipy.optimize.minimize`. It can be used only when `positive`\n is True.\n\n positive : bool, default=False\n When set to ``True``, forces the coefficients to be positive.\n Only 'lbfgs' solver is supported in this case.\n\n random_state : int, RandomState instance, default=None\n Used when ``solver`` == 'sag' or 'saga' to shuffle the data.\n See :term:`Glossary ` for details.\n\n Attributes\n ----------\n coef_ : ndarray of shape (1, n_features) or (n_classes, n_features)\n Coefficient of the features in the decision function.\n\n ``coef_`` is of shape (1, n_features) when the given problem is binary.\n\n intercept_ : float or ndarray of shape (n_targets,)\n Independent term in decision function. Set to 0.0 if\n ``fit_intercept = False``.\n\n n_iter_ : None or ndarray of shape (n_targets,)\n Actual number of iterations for each target. Available only for\n sag and lsqr solvers. Other solvers will return None.\n\n classes_ : ndarray of shape (n_classes,)\n The classes labels.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. 
versionadded:: 1.0\n\n See Also\n --------\n Ridge : Ridge regression.\n RidgeClassifierCV : Ridge classifier with built-in cross validation.\n\n Notes\n -----\n For multi-class classification, n_class classifiers are trained in\n a one-versus-all approach. Concretely, this is implemented by taking\n advantage of the multi-variate response support in Ridge.\n\n Examples\n --------\n >>> from sklearn.datasets import load_breast_cancer\n >>> from sklearn.linear_model import RidgeClassifier\n >>> X, y = load_breast_cancer(return_X_y=True)\n >>> clf = RidgeClassifier().fit(X, y)\n >>> clf.score(X, y)\n 0.9595...\n ", + "source_code": "\n\nclass RidgeClassifier(_RidgeClassifierMixin, _BaseRidge):\n \"\"\"Classifier using Ridge regression.\n\n This classifier first converts the target values into ``{-1, 1}`` and\n then treats the problem as a regression task (multi-output regression in\n the multiclass case).\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n alpha : float, default=1.0\n Regularization strength; must be a positive float. Regularization\n improves the conditioning of the problem and reduces the variance of\n the estimates. Larger values specify stronger regularization.\n Alpha corresponds to ``1 / (2C)`` in other linear models such as\n :class:`~sklearn.linear_model.LogisticRegression` or\n :class:`~sklearn.svm.LinearSVC`.\n\n fit_intercept : bool, default=True\n Whether to calculate the intercept for this model. If set to false, no\n intercept will be used in calculations (e.g. data is expected to be\n already centered).\n\n normalize : bool, default=False\n This parameter is ignored when ``fit_intercept`` is set to False.\n If True, the regressors X will be normalized before regression by\n subtracting the mean and dividing by the l2-norm.\n If you wish to standardize, please use\n :class:`~sklearn.preprocessing.StandardScaler` before calling ``fit``\n on an estimator with ``normalize=False``.\n\n .. deprecated:: 1.0\n ``normalize`` was deprecated in version 1.0 and\n will be removed in 1.2.\n\n copy_X : bool, default=True\n If True, X will be copied; else, it may be overwritten.\n\n max_iter : int, default=None\n Maximum number of iterations for conjugate gradient solver.\n The default value is determined by scipy.sparse.linalg.\n\n tol : float, default=1e-3\n Precision of the solution.\n\n class_weight : dict or 'balanced', default=None\n Weights associated with classes in the form ``{class_label: weight}``.\n If not given, all classes are supposed to have weight one.\n\n The \"balanced\" mode uses the values of y to automatically adjust\n weights inversely proportional to class frequencies in the input data\n as ``n_samples / (n_classes * np.bincount(y))``.\n\n solver : {'auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga', 'lbfgs'}, default='auto'\n Solver to use in the computational routines:\n\n - 'auto' chooses the solver automatically based on the type of data.\n\n - 'svd' uses a Singular Value Decomposition of X to compute the Ridge\n coefficients. More stable for singular matrices than 'cholesky'.\n\n - 'cholesky' uses the standard scipy.linalg.solve function to\n obtain a closed-form solution.\n\n - 'sparse_cg' uses the conjugate gradient solver as found in\n scipy.sparse.linalg.cg. As an iterative algorithm, this solver is\n more appropriate than 'cholesky' for large-scale data\n (possibility to set `tol` and `max_iter`).\n\n - 'lsqr' uses the dedicated regularized least-squares routine\n scipy.sparse.linalg.lsqr. 
It is the fastest and uses an iterative\n procedure.\n\n - 'sag' uses a Stochastic Average Gradient descent, and 'saga' uses\n its unbiased and more flexible version named SAGA. Both methods\n use an iterative procedure, and are often faster than other solvers\n when both n_samples and n_features are large. Note that 'sag' and\n 'saga' fast convergence is only guaranteed on features with\n approximately the same scale. You can preprocess the data with a\n scaler from sklearn.preprocessing.\n\n .. versionadded:: 0.17\n Stochastic Average Gradient descent solver.\n .. versionadded:: 0.19\n SAGA solver.\n\n - 'lbfgs' uses L-BFGS-B algorithm implemented in\n `scipy.optimize.minimize`. It can be used only when `positive`\n is True.\n\n positive : bool, default=False\n When set to ``True``, forces the coefficients to be positive.\n Only 'lbfgs' solver is supported in this case.\n\n random_state : int, RandomState instance, default=None\n Used when ``solver`` == 'sag' or 'saga' to shuffle the data.\n See :term:`Glossary ` for details.\n\n Attributes\n ----------\n coef_ : ndarray of shape (1, n_features) or (n_classes, n_features)\n Coefficient of the features in the decision function.\n\n ``coef_`` is of shape (1, n_features) when the given problem is binary.\n\n intercept_ : float or ndarray of shape (n_targets,)\n Independent term in decision function. Set to 0.0 if\n ``fit_intercept = False``.\n\n n_iter_ : None or ndarray of shape (n_targets,)\n Actual number of iterations for each target. Available only for\n sag and lsqr solvers. Other solvers will return None.\n\n classes_ : ndarray of shape (n_classes,)\n The classes labels.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n Ridge : Ridge regression.\n RidgeClassifierCV : Ridge classifier with built-in cross validation.\n\n Notes\n -----\n For multi-class classification, n_class classifiers are trained in\n a one-versus-all approach. Concretely, this is implemented by taking\n advantage of the multi-variate response support in Ridge.\n\n Examples\n --------\n >>> from sklearn.datasets import load_breast_cancer\n >>> from sklearn.linear_model import RidgeClassifier\n >>> X, y = load_breast_cancer(return_X_y=True)\n >>> clf = RidgeClassifier().fit(X, y)\n >>> clf.score(X, y)\n 0.9595...\n \"\"\"\n \n def __init__(self, alpha=1.0, *, fit_intercept=True, normalize='deprecated', copy_X=True, max_iter=None, tol=0.001, class_weight=None, solver='auto', positive=False, random_state=None):\n super().__init__(alpha=alpha, fit_intercept=fit_intercept, normalize=normalize, copy_X=copy_X, max_iter=max_iter, tol=tol, solver=solver, positive=positive, random_state=random_state)\n self.class_weight = class_weight\n \n def fit(self, X, y, sample_weight=None):\n \"\"\"Fit Ridge classifier model.\n\n Parameters\n ----------\n X : {ndarray, sparse matrix} of shape (n_samples, n_features)\n Training data.\n\n y : ndarray of shape (n_samples,)\n Target values.\n\n sample_weight : float or ndarray of shape (n_samples,), default=None\n Individual weights for each sample. If given a float, every sample\n will have the same weight.\n\n .. 
versionadded:: 0.17\n *sample_weight* support to RidgeClassifier.\n\n Returns\n -------\n self : object\n Instance of the estimator.\n \"\"\"\n (X, y, sample_weight, Y) = self._prepare_data(X, y, sample_weight, self.solver)\n super().fit(X, Y, sample_weight=sample_weight)\n return self\n" }, { "name": "RidgeClassifierCV", "qname": "sklearn.linear_model._ridge.RidgeClassifierCV", "decorators": [], - "superclasses": ["LinearClassifierMixin", "_BaseRidgeCV"], + "superclasses": ["_RidgeClassifierMixin", "_BaseRidgeCV"], "methods": [ "sklearn.linear_model._ridge.RidgeClassifierCV.__init__", "sklearn.linear_model._ridge.RidgeClassifierCV.fit", - "sklearn.linear_model._ridge.RidgeClassifierCV.classes_@getter", "sklearn.linear_model._ridge.RidgeClassifierCV._more_tags" ], "is_public": true, - "description": "Ridge classifier with built-in cross-validation.\n\nSee glossary entry for :term:`cross-validation estimator`. By default, it performs Leave-One-Out Cross-Validation. Currently, only the n_features > n_samples case is handled efficiently. Read more in the :ref:`User Guide `.", + "description": "Ridge classifier with built-in cross-validation.\n\nSee glossary entry for :term:`cross-validation estimator`.\n\nBy default, it performs Leave-One-Out Cross-Validation. Currently,\nonly the n_features > n_samples case is handled efficiently.\n\nRead more in the :ref:`User Guide `.", "docstring": "Ridge classifier with built-in cross-validation.\n\n See glossary entry for :term:`cross-validation estimator`.\n\n By default, it performs Leave-One-Out Cross-Validation. Currently,\n only the n_features > n_samples case is handled efficiently.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n alphas : ndarray of shape (n_alphas,), default=(0.1, 1.0, 10.0)\n Array of alpha values to try.\n Regularization strength; must be a positive float. Regularization\n improves the conditioning of the problem and reduces the variance of\n the estimates. Larger values specify stronger regularization.\n Alpha corresponds to ``1 / (2C)`` in other linear models such as\n :class:`~sklearn.linear_model.LogisticRegression` or\n :class:`~sklearn.svm.LinearSVC`.\n\n fit_intercept : bool, default=True\n Whether to calculate the intercept for this model. If set\n to false, no intercept will be used in calculations\n (i.e. data is expected to be centered).\n\n normalize : bool, default=False\n This parameter is ignored when ``fit_intercept`` is set to False.\n If True, the regressors X will be normalized before regression by\n subtracting the mean and dividing by the l2-norm.\n If you wish to standardize, please use\n :class:`~sklearn.preprocessing.StandardScaler` before calling ``fit``\n on an estimator with ``normalize=False``.\n\n .. 
deprecated:: 1.0\n ``normalize`` was deprecated in version 1.0 and\n will be removed in 1.2.\n\n scoring : str, callable, default=None\n A string (see model evaluation documentation) or\n a scorer callable object / function with signature\n ``scorer(estimator, X, y)``.\n\n cv : int, cross-validation generator or an iterable, default=None\n Determines the cross-validation splitting strategy.\n Possible inputs for cv are:\n\n - None, to use the efficient Leave-One-Out cross-validation\n - integer, to specify the number of folds.\n - :term:`CV splitter`,\n - An iterable yielding (train, test) splits as arrays of indices.\n\n Refer :ref:`User Guide ` for the various\n cross-validation strategies that can be used here.\n\n class_weight : dict or 'balanced', default=None\n Weights associated with classes in the form ``{class_label: weight}``.\n If not given, all classes are supposed to have weight one.\n\n The \"balanced\" mode uses the values of y to automatically adjust\n weights inversely proportional to class frequencies in the input data\n as ``n_samples / (n_classes * np.bincount(y))``.\n\n store_cv_values : bool, default=False\n Flag indicating if the cross-validation values corresponding to\n each alpha should be stored in the ``cv_values_`` attribute (see\n below). This flag is only compatible with ``cv=None`` (i.e. using\n Leave-One-Out Cross-Validation).\n\n Attributes\n ----------\n cv_values_ : ndarray of shape (n_samples, n_targets, n_alphas), optional\n Cross-validation values for each alpha (only if ``store_cv_values=True`` and\n ``cv=None``). After ``fit()`` has been called, this attribute will\n contain the mean squared errors if `scoring is None` otherwise it\n will contain standardized per point prediction values.\n\n coef_ : ndarray of shape (1, n_features) or (n_targets, n_features)\n Coefficient of the features in the decision function.\n\n ``coef_`` is of shape (1, n_features) when the given problem is binary.\n\n intercept_ : float or ndarray of shape (n_targets,)\n Independent term in decision function. Set to 0.0 if\n ``fit_intercept = False``.\n\n alpha_ : float\n Estimated regularization parameter.\n\n best_score_ : float\n Score of base estimator with best alpha.\n\n .. versionadded:: 0.23\n\n classes_ : ndarray of shape (n_classes,)\n The classes labels.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n Ridge : Ridge regression.\n RidgeClassifier : Ridge classifier.\n RidgeCV : Ridge regression with built-in cross validation.\n\n Notes\n -----\n For multi-class classification, n_class classifiers are trained in\n a one-versus-all approach. Concretely, this is implemented by taking\n advantage of the multi-variate response support in Ridge.\n\n Examples\n --------\n >>> from sklearn.datasets import load_breast_cancer\n >>> from sklearn.linear_model import RidgeClassifierCV\n >>> X, y = load_breast_cancer(return_X_y=True)\n >>> clf = RidgeClassifierCV(alphas=[1e-3, 1e-2, 1e-1, 1]).fit(X, y)\n >>> clf.score(X, y)\n 0.9630...\n ", - "source_code": "\n\nclass RidgeClassifierCV(LinearClassifierMixin, _BaseRidgeCV):\n \"\"\"Ridge classifier with built-in cross-validation.\n\n See glossary entry for :term:`cross-validation estimator`.\n\n By default, it performs Leave-One-Out Cross-Validation. 
Currently,\n only the n_features > n_samples case is handled efficiently.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n alphas : ndarray of shape (n_alphas,), default=(0.1, 1.0, 10.0)\n Array of alpha values to try.\n Regularization strength; must be a positive float. Regularization\n improves the conditioning of the problem and reduces the variance of\n the estimates. Larger values specify stronger regularization.\n Alpha corresponds to ``1 / (2C)`` in other linear models such as\n :class:`~sklearn.linear_model.LogisticRegression` or\n :class:`~sklearn.svm.LinearSVC`.\n\n fit_intercept : bool, default=True\n Whether to calculate the intercept for this model. If set\n to false, no intercept will be used in calculations\n (i.e. data is expected to be centered).\n\n normalize : bool, default=False\n This parameter is ignored when ``fit_intercept`` is set to False.\n If True, the regressors X will be normalized before regression by\n subtracting the mean and dividing by the l2-norm.\n If you wish to standardize, please use\n :class:`~sklearn.preprocessing.StandardScaler` before calling ``fit``\n on an estimator with ``normalize=False``.\n\n .. deprecated:: 1.0\n ``normalize`` was deprecated in version 1.0 and\n will be removed in 1.2.\n\n scoring : str, callable, default=None\n A string (see model evaluation documentation) or\n a scorer callable object / function with signature\n ``scorer(estimator, X, y)``.\n\n cv : int, cross-validation generator or an iterable, default=None\n Determines the cross-validation splitting strategy.\n Possible inputs for cv are:\n\n - None, to use the efficient Leave-One-Out cross-validation\n - integer, to specify the number of folds.\n - :term:`CV splitter`,\n - An iterable yielding (train, test) splits as arrays of indices.\n\n Refer :ref:`User Guide ` for the various\n cross-validation strategies that can be used here.\n\n class_weight : dict or 'balanced', default=None\n Weights associated with classes in the form ``{class_label: weight}``.\n If not given, all classes are supposed to have weight one.\n\n The \"balanced\" mode uses the values of y to automatically adjust\n weights inversely proportional to class frequencies in the input data\n as ``n_samples / (n_classes * np.bincount(y))``.\n\n store_cv_values : bool, default=False\n Flag indicating if the cross-validation values corresponding to\n each alpha should be stored in the ``cv_values_`` attribute (see\n below). This flag is only compatible with ``cv=None`` (i.e. using\n Leave-One-Out Cross-Validation).\n\n Attributes\n ----------\n cv_values_ : ndarray of shape (n_samples, n_targets, n_alphas), optional\n Cross-validation values for each alpha (only if ``store_cv_values=True`` and\n ``cv=None``). After ``fit()`` has been called, this attribute will\n contain the mean squared errors if `scoring is None` otherwise it\n will contain standardized per point prediction values.\n\n coef_ : ndarray of shape (1, n_features) or (n_targets, n_features)\n Coefficient of the features in the decision function.\n\n ``coef_`` is of shape (1, n_features) when the given problem is binary.\n\n intercept_ : float or ndarray of shape (n_targets,)\n Independent term in decision function. Set to 0.0 if\n ``fit_intercept = False``.\n\n alpha_ : float\n Estimated regularization parameter.\n\n best_score_ : float\n Score of base estimator with best alpha.\n\n .. 
versionadded:: 0.23\n\n classes_ : ndarray of shape (n_classes,)\n The classes labels.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n Ridge : Ridge regression.\n RidgeClassifier : Ridge classifier.\n RidgeCV : Ridge regression with built-in cross validation.\n\n Notes\n -----\n For multi-class classification, n_class classifiers are trained in\n a one-versus-all approach. Concretely, this is implemented by taking\n advantage of the multi-variate response support in Ridge.\n\n Examples\n --------\n >>> from sklearn.datasets import load_breast_cancer\n >>> from sklearn.linear_model import RidgeClassifierCV\n >>> X, y = load_breast_cancer(return_X_y=True)\n >>> clf = RidgeClassifierCV(alphas=[1e-3, 1e-2, 1e-1, 1]).fit(X, y)\n >>> clf.score(X, y)\n 0.9630...\n \"\"\"\n \n def __init__(self, alphas=(0.1, 1.0, 10.0), *, fit_intercept=True, normalize='deprecated', scoring=None, cv=None, class_weight=None, store_cv_values=False):\n super().__init__(alphas=alphas, fit_intercept=fit_intercept, normalize=normalize, scoring=scoring, cv=cv, store_cv_values=store_cv_values)\n self.class_weight = class_weight\n \n def fit(self, X, y, sample_weight=None):\n \"\"\"Fit Ridge classifier with cv.\n\n Parameters\n ----------\n X : ndarray of shape (n_samples, n_features)\n Training vectors, where `n_samples` is the number of samples\n and `n_features` is the number of features. When using GCV,\n will be cast to float64 if necessary.\n\n y : ndarray of shape (n_samples,)\n Target values. Will be cast to X's dtype if necessary.\n\n sample_weight : float or ndarray of shape (n_samples,), default=None\n Individual weights for each sample. If given a float, every sample\n will have the same weight.\n\n Returns\n -------\n self : object\n Fitted estimator.\n \"\"\"\n (X, y) = self._validate_data(X, y, accept_sparse=['csr', 'csc', 'coo'], multi_output=True, y_numeric=False)\n sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype)\n self._label_binarizer = LabelBinarizer(pos_label=1, neg_label=-1)\n Y = self._label_binarizer.fit_transform(y)\n if not self._label_binarizer.y_type_.startswith('multilabel'):\n y = column_or_1d(y, warn=True)\n if self.class_weight:\n sample_weight = sample_weight * compute_sample_weight(self.class_weight, y)\n target = Y if self.cv is None else y\n _BaseRidgeCV.fit(self, X, target, sample_weight=sample_weight)\n return self\n \n @property\n def classes_(self):\n \"\"\"Classes labels.\"\"\"\n return self._label_binarizer.classes_\n \n def _more_tags(self):\n return {'multilabel': True, '_xfail_checks': {'check_sample_weights_invariance': 'zero sample_weight is not equivalent to removing samples', 'check_classifiers_multilabel_output_format_predict': 'RidgeClassifierCV.predict outputs an array of shape (25,) instead of (25, 5)'}}\n" + "source_code": "\n\nclass RidgeClassifierCV(_RidgeClassifierMixin, _BaseRidgeCV):\n \"\"\"Ridge classifier with built-in cross-validation.\n\n See glossary entry for :term:`cross-validation estimator`.\n\n By default, it performs Leave-One-Out Cross-Validation. 
Currently,\n only the n_features > n_samples case is handled efficiently.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n alphas : ndarray of shape (n_alphas,), default=(0.1, 1.0, 10.0)\n Array of alpha values to try.\n Regularization strength; must be a positive float. Regularization\n improves the conditioning of the problem and reduces the variance of\n the estimates. Larger values specify stronger regularization.\n Alpha corresponds to ``1 / (2C)`` in other linear models such as\n :class:`~sklearn.linear_model.LogisticRegression` or\n :class:`~sklearn.svm.LinearSVC`.\n\n fit_intercept : bool, default=True\n Whether to calculate the intercept for this model. If set\n to false, no intercept will be used in calculations\n (i.e. data is expected to be centered).\n\n normalize : bool, default=False\n This parameter is ignored when ``fit_intercept`` is set to False.\n If True, the regressors X will be normalized before regression by\n subtracting the mean and dividing by the l2-norm.\n If you wish to standardize, please use\n :class:`~sklearn.preprocessing.StandardScaler` before calling ``fit``\n on an estimator with ``normalize=False``.\n\n .. deprecated:: 1.0\n ``normalize`` was deprecated in version 1.0 and\n will be removed in 1.2.\n\n scoring : str, callable, default=None\n A string (see model evaluation documentation) or\n a scorer callable object / function with signature\n ``scorer(estimator, X, y)``.\n\n cv : int, cross-validation generator or an iterable, default=None\n Determines the cross-validation splitting strategy.\n Possible inputs for cv are:\n\n - None, to use the efficient Leave-One-Out cross-validation\n - integer, to specify the number of folds.\n - :term:`CV splitter`,\n - An iterable yielding (train, test) splits as arrays of indices.\n\n Refer :ref:`User Guide ` for the various\n cross-validation strategies that can be used here.\n\n class_weight : dict or 'balanced', default=None\n Weights associated with classes in the form ``{class_label: weight}``.\n If not given, all classes are supposed to have weight one.\n\n The \"balanced\" mode uses the values of y to automatically adjust\n weights inversely proportional to class frequencies in the input data\n as ``n_samples / (n_classes * np.bincount(y))``.\n\n store_cv_values : bool, default=False\n Flag indicating if the cross-validation values corresponding to\n each alpha should be stored in the ``cv_values_`` attribute (see\n below). This flag is only compatible with ``cv=None`` (i.e. using\n Leave-One-Out Cross-Validation).\n\n Attributes\n ----------\n cv_values_ : ndarray of shape (n_samples, n_targets, n_alphas), optional\n Cross-validation values for each alpha (only if ``store_cv_values=True`` and\n ``cv=None``). After ``fit()`` has been called, this attribute will\n contain the mean squared errors if `scoring is None` otherwise it\n will contain standardized per point prediction values.\n\n coef_ : ndarray of shape (1, n_features) or (n_targets, n_features)\n Coefficient of the features in the decision function.\n\n ``coef_`` is of shape (1, n_features) when the given problem is binary.\n\n intercept_ : float or ndarray of shape (n_targets,)\n Independent term in decision function. Set to 0.0 if\n ``fit_intercept = False``.\n\n alpha_ : float\n Estimated regularization parameter.\n\n best_score_ : float\n Score of base estimator with best alpha.\n\n .. 
versionadded:: 0.23\n\n classes_ : ndarray of shape (n_classes,)\n The classes labels.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n Ridge : Ridge regression.\n RidgeClassifier : Ridge classifier.\n RidgeCV : Ridge regression with built-in cross validation.\n\n Notes\n -----\n For multi-class classification, n_class classifiers are trained in\n a one-versus-all approach. Concretely, this is implemented by taking\n advantage of the multi-variate response support in Ridge.\n\n Examples\n --------\n >>> from sklearn.datasets import load_breast_cancer\n >>> from sklearn.linear_model import RidgeClassifierCV\n >>> X, y = load_breast_cancer(return_X_y=True)\n >>> clf = RidgeClassifierCV(alphas=[1e-3, 1e-2, 1e-1, 1]).fit(X, y)\n >>> clf.score(X, y)\n 0.9630...\n \"\"\"\n \n def __init__(self, alphas=(0.1, 1.0, 10.0), *, fit_intercept=True, normalize='deprecated', scoring=None, cv=None, class_weight=None, store_cv_values=False):\n super().__init__(alphas=alphas, fit_intercept=fit_intercept, normalize=normalize, scoring=scoring, cv=cv, store_cv_values=store_cv_values)\n self.class_weight = class_weight\n \n def fit(self, X, y, sample_weight=None):\n \"\"\"Fit Ridge classifier with cv.\n\n Parameters\n ----------\n X : ndarray of shape (n_samples, n_features)\n Training vectors, where `n_samples` is the number of samples\n and `n_features` is the number of features. When using GCV,\n will be cast to float64 if necessary.\n\n y : ndarray of shape (n_samples,)\n Target values. Will be cast to X's dtype if necessary.\n\n sample_weight : float or ndarray of shape (n_samples,), default=None\n Individual weights for each sample. 
If given a float, every sample\n will have the same weight.\n\n Returns\n -------\n self : object\n Fitted estimator.\n \"\"\"\n (X, y, sample_weight, Y) = self._prepare_data(X, y, sample_weight, solver='eigen')\n target = Y if self.cv is None else y\n super().fit(X, target, sample_weight=sample_weight)\n return self\n \n def _more_tags(self):\n return {'multilabel': True, '_xfail_checks': {'check_sample_weights_invariance': 'zero sample_weight is not equivalent to removing samples'}}\n" }, { "name": "_BaseRidge", @@ -24007,7 +24073,7 @@ "sklearn.linear_model._ridge._IdentityClassifier.decision_function" ], "is_public": false, - "description": "Fake classifier which will directly output the prediction.\n\nWe inherit from LinearClassifierMixin to get the proper shape for the output `y`.", + "description": "Fake classifier which will directly output the prediction.\n\nWe inherit from LinearClassifierMixin to get the proper shape for the\noutput `y`.", "docstring": "Fake classifier which will directly output the prediction.\n\n We inherit from LinearClassifierMixin to get the proper shape for the\n output `y`.\n ", "source_code": "\n\nclass _IdentityClassifier(LinearClassifierMixin):\n \"\"\"Fake classifier which will directly output the prediction.\n\n We inherit from LinearClassifierMixin to get the proper shape for the\n output `y`.\n \"\"\"\n \n def __init__(self, classes):\n self.classes_ = classes\n \n def decision_function(self, y_predict):\n return y_predict\n" }, @@ -24025,6 +24091,22 @@ "docstring": "Fake regressor which will directly output the prediction.", "source_code": "\n\nclass _IdentityRegressor:\n \"\"\"Fake regressor which will directly output the prediction.\"\"\"\n \n def decision_function(self, y_predict):\n return y_predict\n \n def predict(self, y_predict):\n return y_predict\n" }, + { + "name": "_RidgeClassifierMixin", + "qname": "sklearn.linear_model._ridge._RidgeClassifierMixin", + "decorators": [], + "superclasses": ["LinearClassifierMixin"], + "methods": [ + "sklearn.linear_model._ridge._RidgeClassifierMixin._prepare_data", + "sklearn.linear_model._ridge._RidgeClassifierMixin.predict", + "sklearn.linear_model._ridge._RidgeClassifierMixin.classes_@getter", + "sklearn.linear_model._ridge._RidgeClassifierMixin._more_tags" + ], + "is_public": false, + "description": "", + "docstring": null, + "source_code": "\n\nclass _RidgeClassifierMixin(LinearClassifierMixin):\n \n def _prepare_data(self, X, y, sample_weight, solver):\n \"\"\"Validate `X` and `y` and binarize `y`.\n\n Parameters\n ----------\n X : {ndarray, sparse matrix} of shape (n_samples, n_features)\n Training data.\n\n y : ndarray of shape (n_samples,)\n Target values.\n\n sample_weight : float or ndarray of shape (n_samples,), default=None\n Individual weights for each sample. 
If given a float, every sample\n will have the same weight.\n\n solver : str\n The solver used in `Ridge` to know which sparse format to support.\n\n Returns\n -------\n X : {ndarray, sparse matrix} of shape (n_samples, n_features)\n Validated training data.\n\n y : ndarray of shape (n_samples,)\n Validated target values.\n\n sample_weight : ndarray of shape (n_samples,)\n Validated sample weights.\n\n Y : ndarray of shape (n_samples, n_classes)\n The binarized version of `y`.\n \"\"\"\n accept_sparse = _get_valid_accept_sparse(sparse.issparse(X), solver)\n (X, y) = self._validate_data(X, y, accept_sparse=accept_sparse, multi_output=True, y_numeric=False)\n self._label_binarizer = LabelBinarizer(pos_label=1, neg_label=-1)\n Y = self._label_binarizer.fit_transform(y)\n if not self._label_binarizer.y_type_.startswith('multilabel'):\n y = column_or_1d(y, warn=True)\n sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype)\n if self.class_weight:\n sample_weight = sample_weight * compute_sample_weight(self.class_weight, y)\n return X, y, sample_weight, Y\n \n def predict(self, X):\n \"\"\"Predict class labels for samples in `X`.\n\n Parameters\n ----------\n X : {array-like, spare matrix} of shape (n_samples, n_features)\n The data matrix for which we want to predict the targets.\n\n Returns\n -------\n y_pred : ndarray of shape (n_samples,) or (n_samples, n_outputs)\n Vector or matrix containing the predictions. In binary and\n multiclass problems, this is a vector containing `n_samples`. In\n a multilabel problem, it returns a matrix of shape\n `(n_samples, n_outputs)`.\n \"\"\"\n check_is_fitted(self, attributes=['_label_binarizer'])\n if self._label_binarizer.y_type_.startswith('multilabel'):\n scores = 2 * (self.decision_function(X) > 0) - 1\n return self._label_binarizer.inverse_transform(scores)\n return super().predict(X)\n \n @property\n def classes_(self):\n \"\"\"Classes labels.\"\"\"\n return self._label_binarizer.classes_\n \n def _more_tags(self):\n return {'multilabel': True}\n" + }, { "name": "_RidgeGCV", "qname": "sklearn.linear_model._ridge._RidgeGCV", @@ -24063,7 +24145,7 @@ "sklearn.linear_model._ridge._XT_CenterStackOp._matmat" ], "is_public": false, - "description": "Behaves as transposed centered and scaled X with an intercept column.\n\nThis operator behaves as np.hstack([X - sqrt_sw[:, None] * X_mean, sqrt_sw[:, None]]).T", + "description": "Behaves as transposed centered and scaled X with an intercept column.\n\nThis operator behaves as\nnp.hstack([X - sqrt_sw[:, None] * X_mean, sqrt_sw[:, None]]).T", "docstring": "Behaves as transposed centered and scaled X with an intercept column.\n\n This operator behaves as\n np.hstack([X - sqrt_sw[:, None] * X_mean, sqrt_sw[:, None]]).T\n ", "source_code": "\n\nclass _XT_CenterStackOp(sparse.linalg.LinearOperator):\n \"\"\"Behaves as transposed centered and scaled X with an intercept column.\n\n This operator behaves as\n np.hstack([X - sqrt_sw[:, None] * X_mean, sqrt_sw[:, None]]).T\n \"\"\"\n \n def __init__(self, X, X_mean, sqrt_sw):\n (n_samples, n_features) = X.shape\n super().__init__(X.dtype, (n_features + 1, n_samples))\n self.X = X\n self.X_mean = X_mean\n self.sqrt_sw = sqrt_sw\n \n def _matvec(self, v):\n v = v.ravel()\n n_features = self.shape[0]\n res = np.empty(n_features, dtype=self.X.dtype)\n res[:-1] = safe_sparse_dot(self.X.T, v, dense_output=True) - self.X_mean * self.sqrt_sw.dot(v)\n res[-1] = np.dot(v, self.sqrt_sw)\n return res\n \n def _matmat(self, v):\n n_features = self.shape[0]\n res = 
np.empty((n_features, v.shape[1]), dtype=self.X.dtype)\n res[:-1] = safe_sparse_dot(self.X.T, v, dense_output=True) - self.X_mean[:, None] * self.sqrt_sw.dot(v)\n res[-1] = np.dot(self.sqrt_sw, v)\n return res\n" }, @@ -24079,7 +24161,7 @@ "sklearn.linear_model._ridge._X_CenterStackOp._transpose" ], "is_public": false, - "description": "Behaves as centered and scaled X with an added intercept column.\n\nThis operator behaves as np.hstack([X - sqrt_sw[:, None] * X_mean, sqrt_sw[:, None]])", + "description": "Behaves as centered and scaled X with an added intercept column.\n\nThis operator behaves as\nnp.hstack([X - sqrt_sw[:, None] * X_mean, sqrt_sw[:, None]])", "docstring": "Behaves as centered and scaled X with an added intercept column.\n\n This operator behaves as\n np.hstack([X - sqrt_sw[:, None] * X_mean, sqrt_sw[:, None]])\n ", "source_code": "\n\nclass _X_CenterStackOp(sparse.linalg.LinearOperator):\n \"\"\"Behaves as centered and scaled X with an added intercept column.\n\n This operator behaves as\n np.hstack([X - sqrt_sw[:, None] * X_mean, sqrt_sw[:, None]])\n \"\"\"\n \n def __init__(self, X, X_mean, sqrt_sw):\n (n_samples, n_features) = X.shape\n super().__init__(X.dtype, (n_samples, n_features + 1))\n self.X = X\n self.X_mean = X_mean\n self.sqrt_sw = sqrt_sw\n \n def _matvec(self, v):\n v = v.ravel()\n return safe_sparse_dot(self.X, v[:-1], dense_output=True) - self.sqrt_sw * self.X_mean.dot(v[:-1]) + v[-1] * self.sqrt_sw\n \n def _matmat(self, v):\n return safe_sparse_dot(self.X, v[:-1], dense_output=True) - self.sqrt_sw[:, None] * self.X_mean.dot(v[:-1]) + v[-1] * self.sqrt_sw[:, None]\n \n def _transpose(self):\n return _XT_CenterStackOp(self.X, self.X_mean, self.sqrt_sw)\n" }, @@ -24156,9 +24238,9 @@ "sklearn.linear_model._stochastic_gradient.SGDClassifier._more_tags" ], "is_public": true, - "description": "Linear classifiers (SVM, logistic regression, etc.) with SGD training.\n\nThis estimator implements regularized linear models with stochastic gradient descent (SGD) learning: the gradient of the loss is estimated each sample at a time and the model is updated along the way with a decreasing strength schedule (aka learning rate). SGD allows minibatch (online/out-of-core) learning via the `partial_fit` method. For best results using the default learning rate schedule, the data should have zero mean and unit variance. This implementation works with data represented as dense or sparse arrays of floating point values for the features. The model it fits can be controlled with the loss parameter; by default, it fits a linear support vector machine (SVM). The regularizer is a penalty added to the loss function that shrinks model parameters towards the zero vector using either the squared euclidean norm L2 or the absolute norm L1 or a combination of both (Elastic Net). If the parameter update crosses the 0.0 value because of the regularizer, the update is truncated to 0.0 to allow for learning sparse models and achieve online feature selection. Read more in the :ref:`User Guide `.", - "docstring": "Linear classifiers (SVM, logistic regression, etc.) with SGD training.\n\n This estimator implements regularized linear models with stochastic\n gradient descent (SGD) learning: the gradient of the loss is estimated\n each sample at a time and the model is updated along the way with a\n decreasing strength schedule (aka learning rate). 
SGD allows minibatch\n (online/out-of-core) learning via the `partial_fit` method.\n For best results using the default learning rate schedule, the data should\n have zero mean and unit variance.\n\n This implementation works with data represented as dense or sparse arrays\n of floating point values for the features. The model it fits can be\n controlled with the loss parameter; by default, it fits a linear support\n vector machine (SVM).\n\n The regularizer is a penalty added to the loss function that shrinks model\n parameters towards the zero vector using either the squared euclidean norm\n L2 or the absolute norm L1 or a combination of both (Elastic Net). If the\n parameter update crosses the 0.0 value because of the regularizer, the\n update is truncated to 0.0 to allow for learning sparse models and achieve\n online feature selection.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n loss : str, default='hinge'\n The loss function to be used. Defaults to 'hinge', which gives a\n linear SVM.\n\n The possible options are 'hinge', 'log', 'modified_huber',\n 'squared_hinge', 'perceptron', or a regression loss: 'squared_error',\n 'huber', 'epsilon_insensitive', or 'squared_epsilon_insensitive'.\n\n The 'log' loss gives logistic regression, a probabilistic classifier.\n 'modified_huber' is another smooth loss that brings tolerance to\n outliers as well as probability estimates.\n 'squared_hinge' is like hinge but is quadratically penalized.\n 'perceptron' is the linear loss used by the perceptron algorithm.\n The other losses are designed for regression but can be useful in\n classification as well; see\n :class:`~sklearn.linear_model.SGDRegressor` for a description.\n\n More details about the losses formulas can be found in the\n :ref:`User Guide `.\n\n .. deprecated:: 1.0\n The loss 'squared_loss' was deprecated in v1.0 and will be removed\n in version 1.2. Use `loss='squared_error'` which is equivalent.\n\n penalty : {'l2', 'l1', 'elasticnet'}, default='l2'\n The penalty (aka regularization term) to be used. Defaults to 'l2'\n which is the standard regularizer for linear SVM models. 'l1' and\n 'elasticnet' might bring sparsity to the model (feature selection)\n not achievable with 'l2'.\n\n alpha : float, default=0.0001\n Constant that multiplies the regularization term. The higher the\n value, the stronger the regularization.\n Also used to compute the learning rate when set to `learning_rate` is\n set to 'optimal'.\n\n l1_ratio : float, default=0.15\n The Elastic Net mixing parameter, with 0 <= l1_ratio <= 1.\n l1_ratio=0 corresponds to L2 penalty, l1_ratio=1 to L1.\n Only used if `penalty` is 'elasticnet'.\n\n fit_intercept : bool, default=True\n Whether the intercept should be estimated or not. If False, the\n data is assumed to be already centered.\n\n max_iter : int, default=1000\n The maximum number of passes over the training data (aka epochs).\n It only impacts the behavior in the ``fit`` method, and not the\n :meth:`partial_fit` method.\n\n .. versionadded:: 0.19\n\n tol : float, default=1e-3\n The stopping criterion. If it is not None, training will stop\n when (loss > best_loss - tol) for ``n_iter_no_change`` consecutive\n epochs.\n Convergence is checked against the training loss or the\n validation loss depending on the `early_stopping` parameter.\n\n .. 
versionadded:: 0.19\n\n shuffle : bool, default=True\n Whether or not the training data should be shuffled after each epoch.\n\n verbose : int, default=0\n The verbosity level.\n\n epsilon : float, default=0.1\n Epsilon in the epsilon-insensitive loss functions; only if `loss` is\n 'huber', 'epsilon_insensitive', or 'squared_epsilon_insensitive'.\n For 'huber', determines the threshold at which it becomes less\n important to get the prediction exactly right.\n For epsilon-insensitive, any differences between the current prediction\n and the correct label are ignored if they are less than this threshold.\n\n n_jobs : int, default=None\n The number of CPUs to use to do the OVA (One Versus All, for\n multi-class problems) computation.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n random_state : int, RandomState instance, default=None\n Used for shuffling the data, when ``shuffle`` is set to ``True``.\n Pass an int for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n learning_rate : str, default='optimal'\n The learning rate schedule:\n\n - 'constant': `eta = eta0`\n - 'optimal': `eta = 1.0 / (alpha * (t + t0))`\n where t0 is chosen by a heuristic proposed by Leon Bottou.\n - 'invscaling': `eta = eta0 / pow(t, power_t)`\n - 'adaptive': eta = eta0, as long as the training keeps decreasing.\n Each time n_iter_no_change consecutive epochs fail to decrease the\n training loss by tol or fail to increase validation score by tol if\n early_stopping is True, the current learning rate is divided by 5.\n\n .. versionadded:: 0.20\n Added 'adaptive' option\n\n eta0 : double, default=0.0\n The initial learning rate for the 'constant', 'invscaling' or\n 'adaptive' schedules. The default value is 0.0 as eta0 is not used by\n the default schedule 'optimal'.\n\n power_t : double, default=0.5\n The exponent for inverse scaling learning rate [default 0.5].\n\n early_stopping : bool, default=False\n Whether to use early stopping to terminate training when validation\n score is not improving. If set to True, it will automatically set aside\n a stratified fraction of training data as validation and terminate\n training when validation score returned by the `score` method is not\n improving by at least tol for n_iter_no_change consecutive epochs.\n\n .. versionadded:: 0.20\n Added 'early_stopping' option\n\n validation_fraction : float, default=0.1\n The proportion of training data to set aside as validation set for\n early stopping. Must be between 0 and 1.\n Only used if `early_stopping` is True.\n\n .. versionadded:: 0.20\n Added 'validation_fraction' option\n\n n_iter_no_change : int, default=5\n Number of iterations with no improvement to wait before stopping\n fitting.\n Convergence is checked against the training loss or the\n validation loss depending on the `early_stopping` parameter.\n\n .. versionadded:: 0.20\n Added 'n_iter_no_change' option\n\n class_weight : dict, {class_label: weight} or \"balanced\", default=None\n Preset for the class_weight fit parameter.\n\n Weights associated with classes. 
If not given, all classes\n are supposed to have weight one.\n\n The \"balanced\" mode uses the values of y to automatically adjust\n weights inversely proportional to class frequencies in the input data\n as ``n_samples / (n_classes * np.bincount(y))``.\n\n warm_start : bool, default=False\n When set to True, reuse the solution of the previous call to fit as\n initialization, otherwise, just erase the previous solution.\n See :term:`the Glossary `.\n\n Repeatedly calling fit or partial_fit when warm_start is True can\n result in a different solution than when calling fit a single time\n because of the way the data is shuffled.\n If a dynamic learning rate is used, the learning rate is adapted\n depending on the number of samples already seen. Calling ``fit`` resets\n this counter, while ``partial_fit`` will result in increasing the\n existing counter.\n\n average : bool or int, default=False\n When set to True, computes the averaged SGD weights across all\n updates and stores the result in the ``coef_`` attribute. If set to\n an int greater than 1, averaging will begin once the total number of\n samples seen reaches `average`. So ``average=10`` will begin\n averaging after seeing 10 samples.\n\n Attributes\n ----------\n coef_ : ndarray of shape (1, n_features) if n_classes == 2 else (n_classes, n_features)\n Weights assigned to the features.\n\n intercept_ : ndarray of shape (1,) if n_classes == 2 else (n_classes,)\n Constants in decision function.\n\n n_iter_ : int\n The actual number of iterations before reaching the stopping criterion.\n For multiclass fits, it is the maximum over every binary fit.\n\n loss_function_ : concrete ``LossFunction``\n\n classes_ : array of shape (n_classes,)\n\n t_ : int\n Number of weight updates performed during training.\n Same as ``(n_iter_ * n_samples)``.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n sklearn.svm.LinearSVC : Linear support vector classification.\n LogisticRegression : Logistic regression.\n Perceptron : Inherits from SGDClassifier. ``Perceptron()`` is equivalent to\n ``SGDClassifier(loss=\"perceptron\", eta0=1, learning_rate=\"constant\",\n penalty=None)``.\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.linear_model import SGDClassifier\n >>> from sklearn.preprocessing import StandardScaler\n >>> from sklearn.pipeline import make_pipeline\n >>> X = np.array([[-1, -1], [-2, -1], [1, 1], [2, 1]])\n >>> Y = np.array([1, 1, 2, 2])\n >>> # Always scale the input. The most convenient way is to use a pipeline.\n >>> clf = make_pipeline(StandardScaler(),\n ... SGDClassifier(max_iter=1000, tol=1e-3))\n >>> clf.fit(X, Y)\n Pipeline(steps=[('standardscaler', StandardScaler()),\n ('sgdclassifier', SGDClassifier())])\n >>> print(clf.predict([[-0.8, -1]]))\n [1]\n ", - "source_code": "\n\nclass SGDClassifier(BaseSGDClassifier):\n \"\"\"Linear classifiers (SVM, logistic regression, etc.) with SGD training.\n\n This estimator implements regularized linear models with stochastic\n gradient descent (SGD) learning: the gradient of the loss is estimated\n each sample at a time and the model is updated along the way with a\n decreasing strength schedule (aka learning rate). 
SGD allows minibatch\n (online/out-of-core) learning via the `partial_fit` method.\n For best results using the default learning rate schedule, the data should\n have zero mean and unit variance.\n\n This implementation works with data represented as dense or sparse arrays\n of floating point values for the features. The model it fits can be\n controlled with the loss parameter; by default, it fits a linear support\n vector machine (SVM).\n\n The regularizer is a penalty added to the loss function that shrinks model\n parameters towards the zero vector using either the squared euclidean norm\n L2 or the absolute norm L1 or a combination of both (Elastic Net). If the\n parameter update crosses the 0.0 value because of the regularizer, the\n update is truncated to 0.0 to allow for learning sparse models and achieve\n online feature selection.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n loss : str, default='hinge'\n The loss function to be used. Defaults to 'hinge', which gives a\n linear SVM.\n\n The possible options are 'hinge', 'log', 'modified_huber',\n 'squared_hinge', 'perceptron', or a regression loss: 'squared_error',\n 'huber', 'epsilon_insensitive', or 'squared_epsilon_insensitive'.\n\n The 'log' loss gives logistic regression, a probabilistic classifier.\n 'modified_huber' is another smooth loss that brings tolerance to\n outliers as well as probability estimates.\n 'squared_hinge' is like hinge but is quadratically penalized.\n 'perceptron' is the linear loss used by the perceptron algorithm.\n The other losses are designed for regression but can be useful in\n classification as well; see\n :class:`~sklearn.linear_model.SGDRegressor` for a description.\n\n More details about the losses formulas can be found in the\n :ref:`User Guide `.\n\n .. deprecated:: 1.0\n The loss 'squared_loss' was deprecated in v1.0 and will be removed\n in version 1.2. Use `loss='squared_error'` which is equivalent.\n\n penalty : {'l2', 'l1', 'elasticnet'}, default='l2'\n The penalty (aka regularization term) to be used. Defaults to 'l2'\n which is the standard regularizer for linear SVM models. 'l1' and\n 'elasticnet' might bring sparsity to the model (feature selection)\n not achievable with 'l2'.\n\n alpha : float, default=0.0001\n Constant that multiplies the regularization term. The higher the\n value, the stronger the regularization.\n Also used to compute the learning rate when set to `learning_rate` is\n set to 'optimal'.\n\n l1_ratio : float, default=0.15\n The Elastic Net mixing parameter, with 0 <= l1_ratio <= 1.\n l1_ratio=0 corresponds to L2 penalty, l1_ratio=1 to L1.\n Only used if `penalty` is 'elasticnet'.\n\n fit_intercept : bool, default=True\n Whether the intercept should be estimated or not. If False, the\n data is assumed to be already centered.\n\n max_iter : int, default=1000\n The maximum number of passes over the training data (aka epochs).\n It only impacts the behavior in the ``fit`` method, and not the\n :meth:`partial_fit` method.\n\n .. versionadded:: 0.19\n\n tol : float, default=1e-3\n The stopping criterion. If it is not None, training will stop\n when (loss > best_loss - tol) for ``n_iter_no_change`` consecutive\n epochs.\n Convergence is checked against the training loss or the\n validation loss depending on the `early_stopping` parameter.\n\n .. 
versionadded:: 0.19\n\n shuffle : bool, default=True\n Whether or not the training data should be shuffled after each epoch.\n\n verbose : int, default=0\n The verbosity level.\n\n epsilon : float, default=0.1\n Epsilon in the epsilon-insensitive loss functions; only if `loss` is\n 'huber', 'epsilon_insensitive', or 'squared_epsilon_insensitive'.\n For 'huber', determines the threshold at which it becomes less\n important to get the prediction exactly right.\n For epsilon-insensitive, any differences between the current prediction\n and the correct label are ignored if they are less than this threshold.\n\n n_jobs : int, default=None\n The number of CPUs to use to do the OVA (One Versus All, for\n multi-class problems) computation.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n random_state : int, RandomState instance, default=None\n Used for shuffling the data, when ``shuffle`` is set to ``True``.\n Pass an int for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n learning_rate : str, default='optimal'\n The learning rate schedule:\n\n - 'constant': `eta = eta0`\n - 'optimal': `eta = 1.0 / (alpha * (t + t0))`\n where t0 is chosen by a heuristic proposed by Leon Bottou.\n - 'invscaling': `eta = eta0 / pow(t, power_t)`\n - 'adaptive': eta = eta0, as long as the training keeps decreasing.\n Each time n_iter_no_change consecutive epochs fail to decrease the\n training loss by tol or fail to increase validation score by tol if\n early_stopping is True, the current learning rate is divided by 5.\n\n .. versionadded:: 0.20\n Added 'adaptive' option\n\n eta0 : double, default=0.0\n The initial learning rate for the 'constant', 'invscaling' or\n 'adaptive' schedules. The default value is 0.0 as eta0 is not used by\n the default schedule 'optimal'.\n\n power_t : double, default=0.5\n The exponent for inverse scaling learning rate [default 0.5].\n\n early_stopping : bool, default=False\n Whether to use early stopping to terminate training when validation\n score is not improving. If set to True, it will automatically set aside\n a stratified fraction of training data as validation and terminate\n training when validation score returned by the `score` method is not\n improving by at least tol for n_iter_no_change consecutive epochs.\n\n .. versionadded:: 0.20\n Added 'early_stopping' option\n\n validation_fraction : float, default=0.1\n The proportion of training data to set aside as validation set for\n early stopping. Must be between 0 and 1.\n Only used if `early_stopping` is True.\n\n .. versionadded:: 0.20\n Added 'validation_fraction' option\n\n n_iter_no_change : int, default=5\n Number of iterations with no improvement to wait before stopping\n fitting.\n Convergence is checked against the training loss or the\n validation loss depending on the `early_stopping` parameter.\n\n .. versionadded:: 0.20\n Added 'n_iter_no_change' option\n\n class_weight : dict, {class_label: weight} or \"balanced\", default=None\n Preset for the class_weight fit parameter.\n\n Weights associated with classes. 
If not given, all classes\n are supposed to have weight one.\n\n The \"balanced\" mode uses the values of y to automatically adjust\n weights inversely proportional to class frequencies in the input data\n as ``n_samples / (n_classes * np.bincount(y))``.\n\n warm_start : bool, default=False\n When set to True, reuse the solution of the previous call to fit as\n initialization, otherwise, just erase the previous solution.\n See :term:`the Glossary `.\n\n Repeatedly calling fit or partial_fit when warm_start is True can\n result in a different solution than when calling fit a single time\n because of the way the data is shuffled.\n If a dynamic learning rate is used, the learning rate is adapted\n depending on the number of samples already seen. Calling ``fit`` resets\n this counter, while ``partial_fit`` will result in increasing the\n existing counter.\n\n average : bool or int, default=False\n When set to True, computes the averaged SGD weights across all\n updates and stores the result in the ``coef_`` attribute. If set to\n an int greater than 1, averaging will begin once the total number of\n samples seen reaches `average`. So ``average=10`` will begin\n averaging after seeing 10 samples.\n\n Attributes\n ----------\n coef_ : ndarray of shape (1, n_features) if n_classes == 2 else (n_classes, n_features)\n Weights assigned to the features.\n\n intercept_ : ndarray of shape (1,) if n_classes == 2 else (n_classes,)\n Constants in decision function.\n\n n_iter_ : int\n The actual number of iterations before reaching the stopping criterion.\n For multiclass fits, it is the maximum over every binary fit.\n\n loss_function_ : concrete ``LossFunction``\n\n classes_ : array of shape (n_classes,)\n\n t_ : int\n Number of weight updates performed during training.\n Same as ``(n_iter_ * n_samples)``.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n sklearn.svm.LinearSVC : Linear support vector classification.\n LogisticRegression : Logistic regression.\n Perceptron : Inherits from SGDClassifier. ``Perceptron()`` is equivalent to\n ``SGDClassifier(loss=\"perceptron\", eta0=1, learning_rate=\"constant\",\n penalty=None)``.\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.linear_model import SGDClassifier\n >>> from sklearn.preprocessing import StandardScaler\n >>> from sklearn.pipeline import make_pipeline\n >>> X = np.array([[-1, -1], [-2, -1], [1, 1], [2, 1]])\n >>> Y = np.array([1, 1, 2, 2])\n >>> # Always scale the input. The most convenient way is to use a pipeline.\n >>> clf = make_pipeline(StandardScaler(),\n ... 
SGDClassifier(max_iter=1000, tol=1e-3))\n >>> clf.fit(X, Y)\n Pipeline(steps=[('standardscaler', StandardScaler()),\n ('sgdclassifier', SGDClassifier())])\n >>> print(clf.predict([[-0.8, -1]]))\n [1]\n \"\"\"\n \n def __init__(self, loss='hinge', *, penalty='l2', alpha=0.0001, l1_ratio=0.15, fit_intercept=True, max_iter=1000, tol=0.001, shuffle=True, verbose=0, epsilon=DEFAULT_EPSILON, n_jobs=None, random_state=None, learning_rate='optimal', eta0=0.0, power_t=0.5, early_stopping=False, validation_fraction=0.1, n_iter_no_change=5, class_weight=None, warm_start=False, average=False):\n super().__init__(loss=loss, penalty=penalty, alpha=alpha, l1_ratio=l1_ratio, fit_intercept=fit_intercept, max_iter=max_iter, tol=tol, shuffle=shuffle, verbose=verbose, epsilon=epsilon, n_jobs=n_jobs, random_state=random_state, learning_rate=learning_rate, eta0=eta0, power_t=power_t, early_stopping=early_stopping, validation_fraction=validation_fraction, n_iter_no_change=n_iter_no_change, class_weight=class_weight, warm_start=warm_start, average=average)\n \n def _check_proba(self):\n if self.loss not in ('log', 'modified_huber'):\n raise AttributeError('probability estimates are not available for loss=%r' % self.loss)\n return True\n \n @available_if(_check_proba)\n def predict_proba(self, X):\n \"\"\"Probability estimates.\n\n This method is only available for log loss and modified Huber loss.\n\n Multiclass probability estimates are derived from binary (one-vs.-rest)\n estimates by simple normalization, as recommended by Zadrozny and\n Elkan.\n\n Binary probability estimates for loss=\"modified_huber\" are given by\n (clip(decision_function(X), -1, 1) + 1) / 2. For other loss functions\n it is necessary to perform proper probability calibration by wrapping\n the classifier with\n :class:`~sklearn.calibration.CalibratedClassifierCV` instead.\n\n Parameters\n ----------\n X : {array-like, sparse matrix}, shape (n_samples, n_features)\n Input data for prediction.\n\n Returns\n -------\n ndarray of shape (n_samples, n_classes)\n Returns the probability of the sample for each class in the model,\n where classes are ordered as they are in `self.classes_`.\n\n References\n ----------\n Zadrozny and Elkan, \"Transforming classifier scores into multiclass\n probability estimates\", SIGKDD'02,\n https://dl.acm.org/doi/pdf/10.1145/775047.775151\n\n The justification for the formula in the loss=\"modified_huber\"\n case is in the appendix B in:\n http://jmlr.csail.mit.edu/papers/volume2/zhang02c/zhang02c.pdf\n \"\"\"\n check_is_fitted(self)\n if self.loss == 'log':\n return self._predict_proba_lr(X)\n elif self.loss == 'modified_huber':\n binary = len(self.classes_) == 2\n scores = self.decision_function(X)\n if binary:\n prob2 = np.ones((scores.shape[0], 2))\n prob = prob2[:, 1]\n else:\n prob = scores\n np.clip(scores, -1, 1, prob)\n prob += 1.0\n prob /= 2.0\n if binary:\n prob2[:, 0] -= prob\n prob = prob2\n else:\n prob_sum = prob.sum(axis=1)\n all_zero = prob_sum == 0\n if np.any(all_zero):\n prob[all_zero, :] = 1\n prob_sum[all_zero] = len(self.classes_)\n prob /= prob_sum.reshape((prob.shape[0], -1))\n return prob\n else:\n raise NotImplementedError(\"predict_(log_)proba only supported when loss='log' or loss='modified_huber' (%r given)\" % self.loss)\n \n @available_if(_check_proba)\n def predict_log_proba(self, X):\n \"\"\"Log of probability estimates.\n\n This method is only available for log loss and modified Huber loss.\n\n When loss=\"modified_huber\", probability estimates may be hard zeros\n and ones, so 
taking the logarithm is not possible.\n\n See ``predict_proba`` for details.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Input data for prediction.\n\n Returns\n -------\n T : array-like, shape (n_samples, n_classes)\n Returns the log-probability of the sample for each class in the\n model, where classes are ordered as they are in\n `self.classes_`.\n \"\"\"\n return np.log(self.predict_proba(X))\n \n def _more_tags(self):\n return {'_xfail_checks': {'check_sample_weights_invariance': 'zero sample_weight is not equivalent to removing samples'}}\n" + "description": "Linear classifiers (SVM, logistic regression, etc.) with SGD training.\n\nThis estimator implements regularized linear models with stochastic\ngradient descent (SGD) learning: the gradient of the loss is estimated\neach sample at a time and the model is updated along the way with a\ndecreasing strength schedule (aka learning rate). SGD allows minibatch\n(online/out-of-core) learning via the `partial_fit` method.\nFor best results using the default learning rate schedule, the data should\nhave zero mean and unit variance.\n\nThis implementation works with data represented as dense or sparse arrays\nof floating point values for the features. The model it fits can be\ncontrolled with the loss parameter; by default, it fits a linear support\nvector machine (SVM).\n\nThe regularizer is a penalty added to the loss function that shrinks model\nparameters towards the zero vector using either the squared euclidean norm\nL2 or the absolute norm L1 or a combination of both (Elastic Net). If the\nparameter update crosses the 0.0 value because of the regularizer, the\nupdate is truncated to 0.0 to allow for learning sparse models and achieve\nonline feature selection.\n\nRead more in the :ref:`User Guide `.", + "docstring": "Linear classifiers (SVM, logistic regression, etc.) with SGD training.\n\n This estimator implements regularized linear models with stochastic\n gradient descent (SGD) learning: the gradient of the loss is estimated\n each sample at a time and the model is updated along the way with a\n decreasing strength schedule (aka learning rate). SGD allows minibatch\n (online/out-of-core) learning via the `partial_fit` method.\n For best results using the default learning rate schedule, the data should\n have zero mean and unit variance.\n\n This implementation works with data represented as dense or sparse arrays\n of floating point values for the features. The model it fits can be\n controlled with the loss parameter; by default, it fits a linear support\n vector machine (SVM).\n\n The regularizer is a penalty added to the loss function that shrinks model\n parameters towards the zero vector using either the squared euclidean norm\n L2 or the absolute norm L1 or a combination of both (Elastic Net). If the\n parameter update crosses the 0.0 value because of the regularizer, the\n update is truncated to 0.0 to allow for learning sparse models and achieve\n online feature selection.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n loss : str, default='hinge'\n The loss function to be used. 
Defaults to 'hinge', which gives a\n linear SVM.\n\n The possible options are 'hinge', 'log', 'modified_huber',\n 'squared_hinge', 'perceptron', or a regression loss: 'squared_error',\n 'huber', 'epsilon_insensitive', or 'squared_epsilon_insensitive'.\n\n The 'log' loss gives logistic regression, a probabilistic classifier.\n 'modified_huber' is another smooth loss that brings tolerance to\n outliers as well as probability estimates.\n 'squared_hinge' is like hinge but is quadratically penalized.\n 'perceptron' is the linear loss used by the perceptron algorithm.\n The other losses are designed for regression but can be useful in\n classification as well; see\n :class:`~sklearn.linear_model.SGDRegressor` for a description.\n\n More details about the losses formulas can be found in the\n :ref:`User Guide `.\n\n .. deprecated:: 1.0\n The loss 'squared_loss' was deprecated in v1.0 and will be removed\n in version 1.2. Use `loss='squared_error'` which is equivalent.\n\n penalty : {'l2', 'l1', 'elasticnet'}, default='l2'\n The penalty (aka regularization term) to be used. Defaults to 'l2'\n which is the standard regularizer for linear SVM models. 'l1' and\n 'elasticnet' might bring sparsity to the model (feature selection)\n not achievable with 'l2'.\n\n alpha : float, default=0.0001\n Constant that multiplies the regularization term. The higher the\n value, the stronger the regularization.\n Also used to compute the learning rate when set to `learning_rate` is\n set to 'optimal'.\n\n l1_ratio : float, default=0.15\n The Elastic Net mixing parameter, with 0 <= l1_ratio <= 1.\n l1_ratio=0 corresponds to L2 penalty, l1_ratio=1 to L1.\n Only used if `penalty` is 'elasticnet'.\n\n fit_intercept : bool, default=True\n Whether the intercept should be estimated or not. If False, the\n data is assumed to be already centered.\n\n max_iter : int, default=1000\n The maximum number of passes over the training data (aka epochs).\n It only impacts the behavior in the ``fit`` method, and not the\n :meth:`partial_fit` method.\n\n .. versionadded:: 0.19\n\n tol : float, default=1e-3\n The stopping criterion. If it is not None, training will stop\n when (loss > best_loss - tol) for ``n_iter_no_change`` consecutive\n epochs.\n Convergence is checked against the training loss or the\n validation loss depending on the `early_stopping` parameter.\n\n .. versionadded:: 0.19\n\n shuffle : bool, default=True\n Whether or not the training data should be shuffled after each epoch.\n\n verbose : int, default=0\n The verbosity level.\n\n epsilon : float, default=0.1\n Epsilon in the epsilon-insensitive loss functions; only if `loss` is\n 'huber', 'epsilon_insensitive', or 'squared_epsilon_insensitive'.\n For 'huber', determines the threshold at which it becomes less\n important to get the prediction exactly right.\n For epsilon-insensitive, any differences between the current prediction\n and the correct label are ignored if they are less than this threshold.\n\n n_jobs : int, default=None\n The number of CPUs to use to do the OVA (One Versus All, for\n multi-class problems) computation.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. 
See :term:`Glossary `\n for more details.\n\n random_state : int, RandomState instance, default=None\n Used for shuffling the data, when ``shuffle`` is set to ``True``.\n Pass an int for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n learning_rate : str, default='optimal'\n The learning rate schedule:\n\n - 'constant': `eta = eta0`\n - 'optimal': `eta = 1.0 / (alpha * (t + t0))`\n where t0 is chosen by a heuristic proposed by Leon Bottou.\n - 'invscaling': `eta = eta0 / pow(t, power_t)`\n - 'adaptive': eta = eta0, as long as the training keeps decreasing.\n Each time n_iter_no_change consecutive epochs fail to decrease the\n training loss by tol or fail to increase validation score by tol if\n early_stopping is True, the current learning rate is divided by 5.\n\n .. versionadded:: 0.20\n Added 'adaptive' option\n\n eta0 : float, default=0.0\n The initial learning rate for the 'constant', 'invscaling' or\n 'adaptive' schedules. The default value is 0.0 as eta0 is not used by\n the default schedule 'optimal'.\n\n power_t : float, default=0.5\n The exponent for inverse scaling learning rate [default 0.5].\n\n early_stopping : bool, default=False\n Whether to use early stopping to terminate training when validation\n score is not improving. If set to True, it will automatically set aside\n a stratified fraction of training data as validation and terminate\n training when validation score returned by the `score` method is not\n improving by at least tol for n_iter_no_change consecutive epochs.\n\n .. versionadded:: 0.20\n Added 'early_stopping' option\n\n validation_fraction : float, default=0.1\n The proportion of training data to set aside as validation set for\n early stopping. Must be between 0 and 1.\n Only used if `early_stopping` is True.\n\n .. versionadded:: 0.20\n Added 'validation_fraction' option\n\n n_iter_no_change : int, default=5\n Number of iterations with no improvement to wait before stopping\n fitting.\n Convergence is checked against the training loss or the\n validation loss depending on the `early_stopping` parameter.\n\n .. versionadded:: 0.20\n Added 'n_iter_no_change' option\n\n class_weight : dict, {class_label: weight} or \"balanced\", default=None\n Preset for the class_weight fit parameter.\n\n Weights associated with classes. If not given, all classes\n are supposed to have weight one.\n\n The \"balanced\" mode uses the values of y to automatically adjust\n weights inversely proportional to class frequencies in the input data\n as ``n_samples / (n_classes * np.bincount(y))``.\n\n warm_start : bool, default=False\n When set to True, reuse the solution of the previous call to fit as\n initialization, otherwise, just erase the previous solution.\n See :term:`the Glossary `.\n\n Repeatedly calling fit or partial_fit when warm_start is True can\n result in a different solution than when calling fit a single time\n because of the way the data is shuffled.\n If a dynamic learning rate is used, the learning rate is adapted\n depending on the number of samples already seen. Calling ``fit`` resets\n this counter, while ``partial_fit`` will result in increasing the\n existing counter.\n\n average : bool or int, default=False\n When set to True, computes the averaged SGD weights across all\n updates and stores the result in the ``coef_`` attribute. If set to\n an int greater than 1, averaging will begin once the total number of\n samples seen reaches `average`. 
So ``average=10`` will begin\n averaging after seeing 10 samples.\n\n Attributes\n ----------\n coef_ : ndarray of shape (1, n_features) if n_classes == 2 else (n_classes, n_features)\n Weights assigned to the features.\n\n intercept_ : ndarray of shape (1,) if n_classes == 2 else (n_classes,)\n Constants in decision function.\n\n n_iter_ : int\n The actual number of iterations before reaching the stopping criterion.\n For multiclass fits, it is the maximum over every binary fit.\n\n loss_function_ : concrete ``LossFunction``\n\n classes_ : array of shape (n_classes,)\n\n t_ : int\n Number of weight updates performed during training.\n Same as ``(n_iter_ * n_samples)``.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n sklearn.svm.LinearSVC : Linear support vector classification.\n LogisticRegression : Logistic regression.\n Perceptron : Inherits from SGDClassifier. ``Perceptron()`` is equivalent to\n ``SGDClassifier(loss=\"perceptron\", eta0=1, learning_rate=\"constant\",\n penalty=None)``.\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.linear_model import SGDClassifier\n >>> from sklearn.preprocessing import StandardScaler\n >>> from sklearn.pipeline import make_pipeline\n >>> X = np.array([[-1, -1], [-2, -1], [1, 1], [2, 1]])\n >>> Y = np.array([1, 1, 2, 2])\n >>> # Always scale the input. The most convenient way is to use a pipeline.\n >>> clf = make_pipeline(StandardScaler(),\n ... SGDClassifier(max_iter=1000, tol=1e-3))\n >>> clf.fit(X, Y)\n Pipeline(steps=[('standardscaler', StandardScaler()),\n ('sgdclassifier', SGDClassifier())])\n >>> print(clf.predict([[-0.8, -1]]))\n [1]\n ", + "source_code": "\n\nclass SGDClassifier(BaseSGDClassifier):\n \"\"\"Linear classifiers (SVM, logistic regression, etc.) with SGD training.\n\n This estimator implements regularized linear models with stochastic\n gradient descent (SGD) learning: the gradient of the loss is estimated\n each sample at a time and the model is updated along the way with a\n decreasing strength schedule (aka learning rate). SGD allows minibatch\n (online/out-of-core) learning via the `partial_fit` method.\n For best results using the default learning rate schedule, the data should\n have zero mean and unit variance.\n\n This implementation works with data represented as dense or sparse arrays\n of floating point values for the features. The model it fits can be\n controlled with the loss parameter; by default, it fits a linear support\n vector machine (SVM).\n\n The regularizer is a penalty added to the loss function that shrinks model\n parameters towards the zero vector using either the squared euclidean norm\n L2 or the absolute norm L1 or a combination of both (Elastic Net). If the\n parameter update crosses the 0.0 value because of the regularizer, the\n update is truncated to 0.0 to allow for learning sparse models and achieve\n online feature selection.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n loss : str, default='hinge'\n The loss function to be used. 
Defaults to 'hinge', which gives a\n linear SVM.\n\n The possible options are 'hinge', 'log', 'modified_huber',\n 'squared_hinge', 'perceptron', or a regression loss: 'squared_error',\n 'huber', 'epsilon_insensitive', or 'squared_epsilon_insensitive'.\n\n The 'log' loss gives logistic regression, a probabilistic classifier.\n 'modified_huber' is another smooth loss that brings tolerance to\n outliers as well as probability estimates.\n 'squared_hinge' is like hinge but is quadratically penalized.\n 'perceptron' is the linear loss used by the perceptron algorithm.\n The other losses are designed for regression but can be useful in\n classification as well; see\n :class:`~sklearn.linear_model.SGDRegressor` for a description.\n\n More details about the losses formulas can be found in the\n :ref:`User Guide `.\n\n .. deprecated:: 1.0\n The loss 'squared_loss' was deprecated in v1.0 and will be removed\n in version 1.2. Use `loss='squared_error'` which is equivalent.\n\n penalty : {'l2', 'l1', 'elasticnet'}, default='l2'\n The penalty (aka regularization term) to be used. Defaults to 'l2'\n which is the standard regularizer for linear SVM models. 'l1' and\n 'elasticnet' might bring sparsity to the model (feature selection)\n not achievable with 'l2'.\n\n alpha : float, default=0.0001\n Constant that multiplies the regularization term. The higher the\n value, the stronger the regularization.\n Also used to compute the learning rate when set to `learning_rate` is\n set to 'optimal'.\n\n l1_ratio : float, default=0.15\n The Elastic Net mixing parameter, with 0 <= l1_ratio <= 1.\n l1_ratio=0 corresponds to L2 penalty, l1_ratio=1 to L1.\n Only used if `penalty` is 'elasticnet'.\n\n fit_intercept : bool, default=True\n Whether the intercept should be estimated or not. If False, the\n data is assumed to be already centered.\n\n max_iter : int, default=1000\n The maximum number of passes over the training data (aka epochs).\n It only impacts the behavior in the ``fit`` method, and not the\n :meth:`partial_fit` method.\n\n .. versionadded:: 0.19\n\n tol : float, default=1e-3\n The stopping criterion. If it is not None, training will stop\n when (loss > best_loss - tol) for ``n_iter_no_change`` consecutive\n epochs.\n Convergence is checked against the training loss or the\n validation loss depending on the `early_stopping` parameter.\n\n .. versionadded:: 0.19\n\n shuffle : bool, default=True\n Whether or not the training data should be shuffled after each epoch.\n\n verbose : int, default=0\n The verbosity level.\n\n epsilon : float, default=0.1\n Epsilon in the epsilon-insensitive loss functions; only if `loss` is\n 'huber', 'epsilon_insensitive', or 'squared_epsilon_insensitive'.\n For 'huber', determines the threshold at which it becomes less\n important to get the prediction exactly right.\n For epsilon-insensitive, any differences between the current prediction\n and the correct label are ignored if they are less than this threshold.\n\n n_jobs : int, default=None\n The number of CPUs to use to do the OVA (One Versus All, for\n multi-class problems) computation.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. 
See :term:`Glossary `\n for more details.\n\n random_state : int, RandomState instance, default=None\n Used for shuffling the data, when ``shuffle`` is set to ``True``.\n Pass an int for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n learning_rate : str, default='optimal'\n The learning rate schedule:\n\n - 'constant': `eta = eta0`\n - 'optimal': `eta = 1.0 / (alpha * (t + t0))`\n where t0 is chosen by a heuristic proposed by Leon Bottou.\n - 'invscaling': `eta = eta0 / pow(t, power_t)`\n - 'adaptive': eta = eta0, as long as the training keeps decreasing.\n Each time n_iter_no_change consecutive epochs fail to decrease the\n training loss by tol or fail to increase validation score by tol if\n early_stopping is True, the current learning rate is divided by 5.\n\n .. versionadded:: 0.20\n Added 'adaptive' option\n\n eta0 : float, default=0.0\n The initial learning rate for the 'constant', 'invscaling' or\n 'adaptive' schedules. The default value is 0.0 as eta0 is not used by\n the default schedule 'optimal'.\n\n power_t : float, default=0.5\n The exponent for inverse scaling learning rate [default 0.5].\n\n early_stopping : bool, default=False\n Whether to use early stopping to terminate training when validation\n score is not improving. If set to True, it will automatically set aside\n a stratified fraction of training data as validation and terminate\n training when validation score returned by the `score` method is not\n improving by at least tol for n_iter_no_change consecutive epochs.\n\n .. versionadded:: 0.20\n Added 'early_stopping' option\n\n validation_fraction : float, default=0.1\n The proportion of training data to set aside as validation set for\n early stopping. Must be between 0 and 1.\n Only used if `early_stopping` is True.\n\n .. versionadded:: 0.20\n Added 'validation_fraction' option\n\n n_iter_no_change : int, default=5\n Number of iterations with no improvement to wait before stopping\n fitting.\n Convergence is checked against the training loss or the\n validation loss depending on the `early_stopping` parameter.\n\n .. versionadded:: 0.20\n Added 'n_iter_no_change' option\n\n class_weight : dict, {class_label: weight} or \"balanced\", default=None\n Preset for the class_weight fit parameter.\n\n Weights associated with classes. If not given, all classes\n are supposed to have weight one.\n\n The \"balanced\" mode uses the values of y to automatically adjust\n weights inversely proportional to class frequencies in the input data\n as ``n_samples / (n_classes * np.bincount(y))``.\n\n warm_start : bool, default=False\n When set to True, reuse the solution of the previous call to fit as\n initialization, otherwise, just erase the previous solution.\n See :term:`the Glossary `.\n\n Repeatedly calling fit or partial_fit when warm_start is True can\n result in a different solution than when calling fit a single time\n because of the way the data is shuffled.\n If a dynamic learning rate is used, the learning rate is adapted\n depending on the number of samples already seen. Calling ``fit`` resets\n this counter, while ``partial_fit`` will result in increasing the\n existing counter.\n\n average : bool or int, default=False\n When set to True, computes the averaged SGD weights across all\n updates and stores the result in the ``coef_`` attribute. If set to\n an int greater than 1, averaging will begin once the total number of\n samples seen reaches `average`. 
So ``average=10`` will begin\n averaging after seeing 10 samples.\n\n Attributes\n ----------\n coef_ : ndarray of shape (1, n_features) if n_classes == 2 else (n_classes, n_features)\n Weights assigned to the features.\n\n intercept_ : ndarray of shape (1,) if n_classes == 2 else (n_classes,)\n Constants in decision function.\n\n n_iter_ : int\n The actual number of iterations before reaching the stopping criterion.\n For multiclass fits, it is the maximum over every binary fit.\n\n loss_function_ : concrete ``LossFunction``\n\n classes_ : array of shape (n_classes,)\n\n t_ : int\n Number of weight updates performed during training.\n Same as ``(n_iter_ * n_samples)``.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n sklearn.svm.LinearSVC : Linear support vector classification.\n LogisticRegression : Logistic regression.\n Perceptron : Inherits from SGDClassifier. ``Perceptron()`` is equivalent to\n ``SGDClassifier(loss=\"perceptron\", eta0=1, learning_rate=\"constant\",\n penalty=None)``.\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.linear_model import SGDClassifier\n >>> from sklearn.preprocessing import StandardScaler\n >>> from sklearn.pipeline import make_pipeline\n >>> X = np.array([[-1, -1], [-2, -1], [1, 1], [2, 1]])\n >>> Y = np.array([1, 1, 2, 2])\n >>> # Always scale the input. The most convenient way is to use a pipeline.\n >>> clf = make_pipeline(StandardScaler(),\n ... SGDClassifier(max_iter=1000, tol=1e-3))\n >>> clf.fit(X, Y)\n Pipeline(steps=[('standardscaler', StandardScaler()),\n ('sgdclassifier', SGDClassifier())])\n >>> print(clf.predict([[-0.8, -1]]))\n [1]\n \"\"\"\n \n def __init__(self, loss='hinge', *, penalty='l2', alpha=0.0001, l1_ratio=0.15, fit_intercept=True, max_iter=1000, tol=0.001, shuffle=True, verbose=0, epsilon=DEFAULT_EPSILON, n_jobs=None, random_state=None, learning_rate='optimal', eta0=0.0, power_t=0.5, early_stopping=False, validation_fraction=0.1, n_iter_no_change=5, class_weight=None, warm_start=False, average=False):\n super().__init__(loss=loss, penalty=penalty, alpha=alpha, l1_ratio=l1_ratio, fit_intercept=fit_intercept, max_iter=max_iter, tol=tol, shuffle=shuffle, verbose=verbose, epsilon=epsilon, n_jobs=n_jobs, random_state=random_state, learning_rate=learning_rate, eta0=eta0, power_t=power_t, early_stopping=early_stopping, validation_fraction=validation_fraction, n_iter_no_change=n_iter_no_change, class_weight=class_weight, warm_start=warm_start, average=average)\n \n def _check_proba(self):\n if self.loss not in ('log', 'modified_huber'):\n raise AttributeError('probability estimates are not available for loss=%r' % self.loss)\n return True\n \n @available_if(_check_proba)\n def predict_proba(self, X):\n \"\"\"Probability estimates.\n\n This method is only available for log loss and modified Huber loss.\n\n Multiclass probability estimates are derived from binary (one-vs.-rest)\n estimates by simple normalization, as recommended by Zadrozny and\n Elkan.\n\n Binary probability estimates for loss=\"modified_huber\" are given by\n (clip(decision_function(X), -1, 1) + 1) / 2. 
For other loss functions\n it is necessary to perform proper probability calibration by wrapping\n the classifier with\n :class:`~sklearn.calibration.CalibratedClassifierCV` instead.\n\n Parameters\n ----------\n X : {array-like, sparse matrix}, shape (n_samples, n_features)\n Input data for prediction.\n\n Returns\n -------\n ndarray of shape (n_samples, n_classes)\n Returns the probability of the sample for each class in the model,\n where classes are ordered as they are in `self.classes_`.\n\n References\n ----------\n Zadrozny and Elkan, \"Transforming classifier scores into multiclass\n probability estimates\", SIGKDD'02,\n https://dl.acm.org/doi/pdf/10.1145/775047.775151\n\n The justification for the formula in the loss=\"modified_huber\"\n case is in the appendix B in:\n http://jmlr.csail.mit.edu/papers/volume2/zhang02c/zhang02c.pdf\n \"\"\"\n check_is_fitted(self)\n if self.loss == 'log':\n return self._predict_proba_lr(X)\n elif self.loss == 'modified_huber':\n binary = len(self.classes_) == 2\n scores = self.decision_function(X)\n if binary:\n prob2 = np.ones((scores.shape[0], 2))\n prob = prob2[:, 1]\n else:\n prob = scores\n np.clip(scores, -1, 1, prob)\n prob += 1.0\n prob /= 2.0\n if binary:\n prob2[:, 0] -= prob\n prob = prob2\n else:\n prob_sum = prob.sum(axis=1)\n all_zero = prob_sum == 0\n if np.any(all_zero):\n prob[all_zero, :] = 1\n prob_sum[all_zero] = len(self.classes_)\n prob /= prob_sum.reshape((prob.shape[0], -1))\n return prob\n else:\n raise NotImplementedError(\"predict_(log_)proba only supported when loss='log' or loss='modified_huber' (%r given)\" % self.loss)\n \n @available_if(_check_proba)\n def predict_log_proba(self, X):\n \"\"\"Log of probability estimates.\n\n This method is only available for log loss and modified Huber loss.\n\n When loss=\"modified_huber\", probability estimates may be hard zeros\n and ones, so taking the logarithm is not possible.\n\n See ``predict_proba`` for details.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Input data for prediction.\n\n Returns\n -------\n T : array-like, shape (n_samples, n_classes)\n Returns the log-probability of the sample for each class in the\n model, where classes are ordered as they are in\n `self.classes_`.\n \"\"\"\n return np.log(self.predict_proba(X))\n \n def _more_tags(self):\n return {'_xfail_checks': {'check_sample_weights_invariance': 'zero sample_weight is not equivalent to removing samples'}}\n" }, { "name": "SGDOneClassSVM", @@ -24179,9 +24261,9 @@ "sklearn.linear_model._stochastic_gradient.SGDOneClassSVM._more_tags" ], "is_public": true, - "description": "Solves linear One-Class SVM using Stochastic Gradient Descent.\n\nThis implementation is meant to be used with a kernel approximation technique (e.g. `sklearn.kernel_approximation.Nystroem`) to obtain results similar to `sklearn.svm.OneClassSVM` which uses a Gaussian kernel by default. Read more in the :ref:`User Guide `. .. versionadded:: 1.0", - "docstring": "Solves linear One-Class SVM using Stochastic Gradient Descent.\n\n This implementation is meant to be used with a kernel approximation\n technique (e.g. `sklearn.kernel_approximation.Nystroem`) to obtain results\n similar to `sklearn.svm.OneClassSVM` which uses a Gaussian kernel by\n default.\n\n Read more in the :ref:`User Guide `.\n\n .. 
versionadded:: 1.0\n\n Parameters\n ----------\n nu : float, optional\n The nu parameter of the One Class SVM: an upper bound on the\n fraction of training errors and a lower bound of the fraction of\n support vectors. Should be in the interval (0, 1]. By default 0.5\n will be taken.\n\n fit_intercept : bool\n Whether the intercept should be estimated or not. Defaults to True.\n\n max_iter : int, optional\n The maximum number of passes over the training data (aka epochs).\n It only impacts the behavior in the ``fit`` method, and not the\n `partial_fit`. Defaults to 1000.\n\n tol : float or None, optional\n The stopping criterion. If it is not None, the iterations will stop\n when (loss > previous_loss - tol). Defaults to 1e-3.\n\n shuffle : bool, optional\n Whether or not the training data should be shuffled after each epoch.\n Defaults to True.\n\n verbose : int, optional\n The verbosity level.\n\n random_state : int, RandomState instance or None, optional (default=None)\n The seed of the pseudo random number generator to use when shuffling\n the data. If int, random_state is the seed used by the random number\n generator; If RandomState instance, random_state is the random number\n generator; If None, the random number generator is the RandomState\n instance used by `np.random`.\n\n learning_rate : str, optional\n The learning rate schedule to use with `fit`. (If using `partial_fit`,\n learning rate must be controlled directly).\n\n 'constant':\n eta = eta0\n 'optimal': [default]\n eta = 1.0 / (alpha * (t + t0))\n where t0 is chosen by a heuristic proposed by Leon Bottou.\n 'invscaling':\n eta = eta0 / pow(t, power_t)\n 'adaptive':\n eta = eta0, as long as the training keeps decreasing.\n Each time n_iter_no_change consecutive epochs fail to decrease the\n training loss by tol or fail to increase validation score by tol if\n early_stopping is True, the current learning rate is divided by 5.\n\n eta0 : double\n The initial learning rate for the 'constant', 'invscaling' or\n 'adaptive' schedules. The default value is 0.0 as eta0 is not used by\n the default schedule 'optimal'.\n\n power_t : double\n The exponent for inverse scaling learning rate [default 0.5].\n\n warm_start : bool, optional\n When set to True, reuse the solution of the previous call to fit as\n initialization, otherwise, just erase the previous solution.\n See :term:`the Glossary `.\n\n Repeatedly calling fit or partial_fit when warm_start is True can\n result in a different solution than when calling fit a single time\n because of the way the data is shuffled.\n If a dynamic learning rate is used, the learning rate is adapted\n depending on the number of samples already seen. Calling ``fit`` resets\n this counter, while ``partial_fit`` will result in increasing the\n existing counter.\n\n average : bool or int, optional\n When set to True, computes the averaged SGD weights and stores the\n result in the ``coef_`` attribute. If set to an int greater than 1,\n averaging will begin once the total number of samples seen reaches\n average. 
So ``average=10`` will begin averaging after seeing 10\n samples.\n\n Attributes\n ----------\n coef_ : array, shape (1, n_features)\n Weights assigned to the features.\n\n offset_ : array, shape (1,)\n Offset used to define the decision function from the raw scores.\n We have the relation: decision_function = score_samples - offset.\n\n n_iter_ : int\n The actual number of iterations to reach the stopping criterion.\n\n t_ : int\n Number of weight updates performed during training.\n Same as ``(n_iter_ * n_samples)``.\n\n loss_function_ : concrete ``LossFunction``\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n sklearn.svm.OneClassSVM : Unsupervised Outlier Detection.\n\n Notes\n -----\n This estimator has a linear complexity in the number of training samples\n and is thus better suited than the `sklearn.svm.OneClassSVM`\n implementation for datasets with a large number of training samples (say\n > 10,000).\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn import linear_model\n >>> X = np.array([[-1, -1], [-2, -1], [1, 1], [2, 1]])\n >>> clf = linear_model.SGDOneClassSVM(random_state=42)\n >>> clf.fit(X)\n SGDOneClassSVM(random_state=42)\n\n >>> print(clf.predict([[4, 4]]))\n [1]\n ", - "source_code": "\n\nclass SGDOneClassSVM(BaseSGD, OutlierMixin):\n \"\"\"Solves linear One-Class SVM using Stochastic Gradient Descent.\n\n This implementation is meant to be used with a kernel approximation\n technique (e.g. `sklearn.kernel_approximation.Nystroem`) to obtain results\n similar to `sklearn.svm.OneClassSVM` which uses a Gaussian kernel by\n default.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 1.0\n\n Parameters\n ----------\n nu : float, optional\n The nu parameter of the One Class SVM: an upper bound on the\n fraction of training errors and a lower bound of the fraction of\n support vectors. Should be in the interval (0, 1]. By default 0.5\n will be taken.\n\n fit_intercept : bool\n Whether the intercept should be estimated or not. Defaults to True.\n\n max_iter : int, optional\n The maximum number of passes over the training data (aka epochs).\n It only impacts the behavior in the ``fit`` method, and not the\n `partial_fit`. Defaults to 1000.\n\n tol : float or None, optional\n The stopping criterion. If it is not None, the iterations will stop\n when (loss > previous_loss - tol). Defaults to 1e-3.\n\n shuffle : bool, optional\n Whether or not the training data should be shuffled after each epoch.\n Defaults to True.\n\n verbose : int, optional\n The verbosity level.\n\n random_state : int, RandomState instance or None, optional (default=None)\n The seed of the pseudo random number generator to use when shuffling\n the data. If int, random_state is the seed used by the random number\n generator; If RandomState instance, random_state is the random number\n generator; If None, the random number generator is the RandomState\n instance used by `np.random`.\n\n learning_rate : str, optional\n The learning rate schedule to use with `fit`. 
(If using `partial_fit`,\n learning rate must be controlled directly).\n\n 'constant':\n eta = eta0\n 'optimal': [default]\n eta = 1.0 / (alpha * (t + t0))\n where t0 is chosen by a heuristic proposed by Leon Bottou.\n 'invscaling':\n eta = eta0 / pow(t, power_t)\n 'adaptive':\n eta = eta0, as long as the training keeps decreasing.\n Each time n_iter_no_change consecutive epochs fail to decrease the\n training loss by tol or fail to increase validation score by tol if\n early_stopping is True, the current learning rate is divided by 5.\n\n eta0 : double\n The initial learning rate for the 'constant', 'invscaling' or\n 'adaptive' schedules. The default value is 0.0 as eta0 is not used by\n the default schedule 'optimal'.\n\n power_t : double\n The exponent for inverse scaling learning rate [default 0.5].\n\n warm_start : bool, optional\n When set to True, reuse the solution of the previous call to fit as\n initialization, otherwise, just erase the previous solution.\n See :term:`the Glossary `.\n\n Repeatedly calling fit or partial_fit when warm_start is True can\n result in a different solution than when calling fit a single time\n because of the way the data is shuffled.\n If a dynamic learning rate is used, the learning rate is adapted\n depending on the number of samples already seen. Calling ``fit`` resets\n this counter, while ``partial_fit`` will result in increasing the\n existing counter.\n\n average : bool or int, optional\n When set to True, computes the averaged SGD weights and stores the\n result in the ``coef_`` attribute. If set to an int greater than 1,\n averaging will begin once the total number of samples seen reaches\n average. So ``average=10`` will begin averaging after seeing 10\n samples.\n\n Attributes\n ----------\n coef_ : array, shape (1, n_features)\n Weights assigned to the features.\n\n offset_ : array, shape (1,)\n Offset used to define the decision function from the raw scores.\n We have the relation: decision_function = score_samples - offset.\n\n n_iter_ : int\n The actual number of iterations to reach the stopping criterion.\n\n t_ : int\n Number of weight updates performed during training.\n Same as ``(n_iter_ * n_samples)``.\n\n loss_function_ : concrete ``LossFunction``\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. 
versionadded:: 1.0\n\n See Also\n --------\n sklearn.svm.OneClassSVM : Unsupervised Outlier Detection.\n\n Notes\n -----\n This estimator has a linear complexity in the number of training samples\n and is thus better suited than the `sklearn.svm.OneClassSVM`\n implementation for datasets with a large number of training samples (say\n > 10,000).\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn import linear_model\n >>> X = np.array([[-1, -1], [-2, -1], [1, 1], [2, 1]])\n >>> clf = linear_model.SGDOneClassSVM(random_state=42)\n >>> clf.fit(X)\n SGDOneClassSVM(random_state=42)\n\n >>> print(clf.predict([[4, 4]]))\n [1]\n \"\"\"\n loss_functions = {'hinge': (Hinge, 1.0)}\n \n def __init__(self, nu=0.5, fit_intercept=True, max_iter=1000, tol=0.001, shuffle=True, verbose=0, random_state=None, learning_rate='optimal', eta0=0.0, power_t=0.5, warm_start=False, average=False):\n alpha = nu / 2\n self.nu = nu\n super(SGDOneClassSVM, self).__init__(loss='hinge', penalty='l2', alpha=alpha, C=1.0, l1_ratio=0, fit_intercept=fit_intercept, max_iter=max_iter, tol=tol, shuffle=shuffle, verbose=verbose, epsilon=DEFAULT_EPSILON, random_state=random_state, learning_rate=learning_rate, eta0=eta0, power_t=power_t, early_stopping=False, validation_fraction=0.1, n_iter_no_change=5, warm_start=warm_start, average=average)\n \n def _validate_params(self, for_partial_fit=False):\n \"\"\"Validate input params.\"\"\"\n if not 0 < self.nu <= 1:\n raise ValueError('nu must be in (0, 1], got nu=%f' % self.nu)\n super(SGDOneClassSVM, self)._validate_params(for_partial_fit=for_partial_fit)\n \n def _fit_one_class(self, X, alpha, C, sample_weight, learning_rate, max_iter):\n \"\"\"Uses SGD implementation with X and y=np.ones(n_samples).\"\"\"\n n_samples = X.shape[0]\n y = np.ones(n_samples, dtype=np.float64, order='C')\n (dataset, offset_decay) = make_dataset(X, y, sample_weight)\n penalty_type = self._get_penalty_type(self.penalty)\n learning_rate_type = self._get_learning_rate_type(learning_rate)\n validation_mask = self._make_validation_split(y)\n validation_score_cb = self._make_validation_score_cb(validation_mask, X, y, sample_weight)\n random_state = check_random_state(self.random_state)\n seed = random_state.randint(0, np.iinfo(np.int32).max)\n tol = self.tol if self.tol is not None else -np.inf\n one_class = 1\n pos_weight = 1\n neg_weight = 1\n if self.average:\n coef = self._standard_coef\n intercept = self._standard_intercept\n average_coef = self._average_coef\n average_intercept = self._average_intercept\n else:\n coef = self.coef_\n intercept = 1 - self.offset_\n average_coef = None\n average_intercept = [0]\n (coef, intercept, average_coef, average_intercept, self.n_iter_) = _plain_sgd(coef, intercept[0], average_coef, average_intercept[0], self.loss_function_, penalty_type, alpha, C, self.l1_ratio, dataset, validation_mask, self.early_stopping, validation_score_cb, int(self.n_iter_no_change), max_iter, tol, int(self.fit_intercept), int(self.verbose), int(self.shuffle), seed, neg_weight, pos_weight, learning_rate_type, self.eta0, self.power_t, one_class, self.t_, offset_decay, self.average)\n self.t_ += self.n_iter_ * n_samples\n if self.average > 0:\n self._average_intercept = np.atleast_1d(average_intercept)\n self._standard_intercept = np.atleast_1d(intercept)\n if self.average <= self.t_ - 1.0:\n self.coef_ = average_coef\n self.offset_ = 1 - np.atleast_1d(average_intercept)\n else:\n self.coef_ = coef\n self.offset_ = 1 - np.atleast_1d(intercept)\n else:\n self.offset_ = 1 - 
np.atleast_1d(intercept)\n \n def _partial_fit(self, X, alpha, C, loss, learning_rate, max_iter, sample_weight, coef_init, offset_init):\n first_call = getattr(self, 'coef_', None) is None\n X = self._validate_data(X, None, accept_sparse='csr', dtype=np.float64, order='C', accept_large_sparse=False, reset=first_call)\n n_features = X.shape[1]\n sample_weight = _check_sample_weight(sample_weight, X)\n if getattr(self, 'coef_', None) is None or coef_init is not None:\n self._allocate_parameter_mem(1, n_features, coef_init, offset_init, 1)\n elif n_features != self.coef_.shape[-1]:\n raise ValueError('Number of features %d does not match previous data %d.' % (n_features, self.coef_.shape[-1]))\n if self.average and getattr(self, '_average_coef', None) is None:\n self._average_coef = np.zeros(n_features, dtype=np.float64, order='C')\n self._average_intercept = np.zeros(1, dtype=np.float64, order='C')\n self.loss_function_ = self._get_loss_function(loss)\n if not hasattr(self, 't_'):\n self.t_ = 1.0\n self._fit_one_class(X, alpha=alpha, C=C, learning_rate=learning_rate, sample_weight=sample_weight, max_iter=max_iter)\n return self\n \n def partial_fit(self, X, y=None, sample_weight=None):\n \"\"\"Fit linear One-Class SVM with Stochastic Gradient Descent.\n\n Parameters\n ----------\n X : {array-like, sparse matrix}, shape (n_samples, n_features)\n Subset of the training data.\n y : Ignored\n Not used, present for API consistency by convention.\n\n sample_weight : array-like, shape (n_samples,), optional\n Weights applied to individual samples.\n If not provided, uniform weights are assumed.\n\n Returns\n -------\n self : object\n Returns a fitted instance of self.\n \"\"\"\n alpha = self.nu / 2\n self._validate_params(for_partial_fit=True)\n return self._partial_fit(X, alpha, C=1.0, loss=self.loss, learning_rate=self.learning_rate, max_iter=1, sample_weight=sample_weight, coef_init=None, offset_init=None)\n \n def _fit(self, X, alpha, C, loss, learning_rate, coef_init=None, offset_init=None, sample_weight=None):\n self._validate_params()\n if self.warm_start and hasattr(self, 'coef_'):\n if coef_init is None:\n coef_init = self.coef_\n if offset_init is None:\n offset_init = self.offset_\n else:\n self.coef_ = None\n self.offset_ = None\n self.t_ = 1.0\n self._partial_fit(X, alpha, C, loss, learning_rate, self.max_iter, sample_weight, coef_init, offset_init)\n if self.tol is not None and self.tol > -np.inf and self.n_iter_ == self.max_iter:\n warnings.warn('Maximum number of iteration reached before convergence. Consider increasing max_iter to improve the fit.', ConvergenceWarning)\n return self\n \n def fit(self, X, y=None, coef_init=None, offset_init=None, sample_weight=None):\n \"\"\"Fit linear One-Class SVM with Stochastic Gradient Descent.\n\n This solves an equivalent optimization problem of the\n One-Class SVM primal optimization problem and returns a weight vector\n w and an offset rho such that the decision function is given by\n - rho.\n\n Parameters\n ----------\n X : {array-like, sparse matrix}, shape (n_samples, n_features)\n Training data.\n y : Ignored\n Not used, present for API consistency by convention.\n\n coef_init : array, shape (n_classes, n_features)\n The initial coefficients to warm-start the optimization.\n\n offset_init : array, shape (n_classes,)\n The initial offset to warm-start the optimization.\n\n sample_weight : array-like, shape (n_samples,), optional\n Weights applied to individual samples.\n If not provided, uniform weights are assumed. 
These weights will\n be multiplied with class_weight (passed through the\n constructor) if class_weight is specified.\n\n Returns\n -------\n self : object\n Returns a fitted instance of self.\n \"\"\"\n alpha = self.nu / 2\n self._fit(X, alpha=alpha, C=1.0, loss=self.loss, learning_rate=self.learning_rate, coef_init=coef_init, offset_init=offset_init, sample_weight=sample_weight)\n return self\n \n def decision_function(self, X):\n \"\"\"Signed distance to the separating hyperplane.\n\n Signed distance is positive for an inlier and negative for an\n outlier.\n\n Parameters\n ----------\n X : {array-like, sparse matrix}, shape (n_samples, n_features)\n Testing data.\n\n Returns\n -------\n dec : array-like, shape (n_samples,)\n Decision function values of the samples.\n \"\"\"\n check_is_fitted(self, 'coef_')\n X = self._validate_data(X, accept_sparse='csr', reset=False)\n decisions = safe_sparse_dot(X, self.coef_.T, dense_output=True) - self.offset_\n return decisions.ravel()\n \n def score_samples(self, X):\n \"\"\"Raw scoring function of the samples.\n\n Parameters\n ----------\n X : {array-like, sparse matrix}, shape (n_samples, n_features)\n Testing data.\n\n Returns\n -------\n score_samples : array-like, shape (n_samples,)\n Unshiffted scoring function values of the samples.\n \"\"\"\n score_samples = self.decision_function(X) + self.offset_\n return score_samples\n \n def predict(self, X):\n \"\"\"Return labels (1 inlier, -1 outlier) of the samples.\n\n Parameters\n ----------\n X : {array-like, sparse matrix}, shape (n_samples, n_features)\n Testing data.\n\n Returns\n -------\n y : array, shape (n_samples,)\n Labels of the samples.\n \"\"\"\n y = (self.decision_function(X) >= 0).astype(np.int32)\n y[y == 0] = -1\n return y\n \n def _more_tags(self):\n return {'_xfail_checks': {'check_sample_weights_invariance': 'zero sample_weight is not equivalent to removing samples'}}\n" + "description": "Solves linear One-Class SVM using Stochastic Gradient Descent.\n\nThis implementation is meant to be used with a kernel approximation\ntechnique (e.g. `sklearn.kernel_approximation.Nystroem`) to obtain results\nsimilar to `sklearn.svm.OneClassSVM` which uses a Gaussian kernel by\ndefault.\n\nRead more in the :ref:`User Guide `.\n\n.. versionadded:: 1.0", + "docstring": "Solves linear One-Class SVM using Stochastic Gradient Descent.\n\n This implementation is meant to be used with a kernel approximation\n technique (e.g. `sklearn.kernel_approximation.Nystroem`) to obtain results\n similar to `sklearn.svm.OneClassSVM` which uses a Gaussian kernel by\n default.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 1.0\n\n Parameters\n ----------\n nu : float, default=0.5\n The nu parameter of the One Class SVM: an upper bound on the\n fraction of training errors and a lower bound of the fraction of\n support vectors. Should be in the interval (0, 1]. By default 0.5\n will be taken.\n\n fit_intercept : bool, default=True\n Whether the intercept should be estimated or not. Defaults to True.\n\n max_iter : int, default=1000\n The maximum number of passes over the training data (aka epochs).\n It only impacts the behavior in the ``fit`` method, and not the\n `partial_fit`. Defaults to 1000.\n\n tol : float or None, default=1e-3\n The stopping criterion. If it is not None, the iterations will stop\n when (loss > previous_loss - tol). 
Defaults to 1e-3.\n\n shuffle : bool, default=True\n Whether or not the training data should be shuffled after each epoch.\n Defaults to True.\n\n verbose : int, default=0\n The verbosity level.\n\n random_state : int, RandomState instance or None, default=None\n The seed of the pseudo random number generator to use when shuffling\n the data. If int, random_state is the seed used by the random number\n generator; If RandomState instance, random_state is the random number\n generator; If None, the random number generator is the RandomState\n instance used by `np.random`.\n\n learning_rate : {'constant', 'optimal', 'invscaling', 'adaptive'}, default='optimal'\n The learning rate schedule to use with `fit`. (If using `partial_fit`,\n learning rate must be controlled directly).\n\n - 'constant': `eta = eta0`\n - 'optimal': `eta = 1.0 / (alpha * (t + t0))`\n where t0 is chosen by a heuristic proposed by Leon Bottou.\n - 'invscaling': `eta = eta0 / pow(t, power_t)`\n - 'adaptive': eta = eta0, as long as the training keeps decreasing.\n Each time n_iter_no_change consecutive epochs fail to decrease the\n training loss by tol or fail to increase validation score by tol if\n early_stopping is True, the current learning rate is divided by 5.\n\n eta0 : float, default=0.0\n The initial learning rate for the 'constant', 'invscaling' or\n 'adaptive' schedules. The default value is 0.0 as eta0 is not used by\n the default schedule 'optimal'.\n\n power_t : float, default=0.5\n The exponent for inverse scaling learning rate [default 0.5].\n\n warm_start : bool, default=False\n When set to True, reuse the solution of the previous call to fit as\n initialization, otherwise, just erase the previous solution.\n See :term:`the Glossary `.\n\n Repeatedly calling fit or partial_fit when warm_start is True can\n result in a different solution than when calling fit a single time\n because of the way the data is shuffled.\n If a dynamic learning rate is used, the learning rate is adapted\n depending on the number of samples already seen. Calling ``fit`` resets\n this counter, while ``partial_fit`` will result in increasing the\n existing counter.\n\n average : bool or int, default=False\n When set to True, computes the averaged SGD weights and stores the\n result in the ``coef_`` attribute. If set to an int greater than 1,\n averaging will begin once the total number of samples seen reaches\n average. So ``average=10`` will begin averaging after seeing 10\n samples.\n\n Attributes\n ----------\n coef_ : ndarray of shape (1, n_features)\n Weights assigned to the features.\n\n offset_ : ndarray of shape (1,)\n Offset used to define the decision function from the raw scores.\n We have the relation: decision_function = score_samples - offset.\n\n n_iter_ : int\n The actual number of iterations to reach the stopping criterion.\n\n t_ : int\n Number of weight updates performed during training.\n Same as ``(n_iter_ * n_samples)``.\n\n loss_function_ : concrete ``LossFunction``\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. 
versionadded:: 1.0\n\n See Also\n --------\n sklearn.svm.OneClassSVM : Unsupervised Outlier Detection.\n\n Notes\n -----\n This estimator has a linear complexity in the number of training samples\n and is thus better suited than the `sklearn.svm.OneClassSVM`\n implementation for datasets with a large number of training samples (say\n > 10,000).\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn import linear_model\n >>> X = np.array([[-1, -1], [-2, -1], [1, 1], [2, 1]])\n >>> clf = linear_model.SGDOneClassSVM(random_state=42)\n >>> clf.fit(X)\n SGDOneClassSVM(random_state=42)\n\n >>> print(clf.predict([[4, 4]]))\n [1]\n ", + "source_code": "\n\nclass SGDOneClassSVM(BaseSGD, OutlierMixin):\n \"\"\"Solves linear One-Class SVM using Stochastic Gradient Descent.\n\n This implementation is meant to be used with a kernel approximation\n technique (e.g. `sklearn.kernel_approximation.Nystroem`) to obtain results\n similar to `sklearn.svm.OneClassSVM` which uses a Gaussian kernel by\n default.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 1.0\n\n Parameters\n ----------\n nu : float, default=0.5\n The nu parameter of the One Class SVM: an upper bound on the\n fraction of training errors and a lower bound of the fraction of\n support vectors. Should be in the interval (0, 1]. By default 0.5\n will be taken.\n\n fit_intercept : bool, default=True\n Whether the intercept should be estimated or not. Defaults to True.\n\n max_iter : int, default=1000\n The maximum number of passes over the training data (aka epochs).\n It only impacts the behavior in the ``fit`` method, and not the\n `partial_fit`. Defaults to 1000.\n\n tol : float or None, default=1e-3\n The stopping criterion. If it is not None, the iterations will stop\n when (loss > previous_loss - tol). Defaults to 1e-3.\n\n shuffle : bool, default=True\n Whether or not the training data should be shuffled after each epoch.\n Defaults to True.\n\n verbose : int, default=0\n The verbosity level.\n\n random_state : int, RandomState instance or None, default=None\n The seed of the pseudo random number generator to use when shuffling\n the data. If int, random_state is the seed used by the random number\n generator; If RandomState instance, random_state is the random number\n generator; If None, the random number generator is the RandomState\n instance used by `np.random`.\n\n learning_rate : {'constant', 'optimal', 'invscaling', 'adaptive'}, default='optimal'\n The learning rate schedule to use with `fit`. (If using `partial_fit`,\n learning rate must be controlled directly).\n\n - 'constant': `eta = eta0`\n - 'optimal': `eta = 1.0 / (alpha * (t + t0))`\n where t0 is chosen by a heuristic proposed by Leon Bottou.\n - 'invscaling': `eta = eta0 / pow(t, power_t)`\n - 'adaptive': eta = eta0, as long as the training keeps decreasing.\n Each time n_iter_no_change consecutive epochs fail to decrease the\n training loss by tol or fail to increase validation score by tol if\n early_stopping is True, the current learning rate is divided by 5.\n\n eta0 : float, default=0.0\n The initial learning rate for the 'constant', 'invscaling' or\n 'adaptive' schedules. 
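The description above suggests pairing SGDOneClassSVM with a kernel approximation such as sklearn.kernel_approximation.Nystroem to approximate a kernelized sklearn.svm.OneClassSVM. A minimal illustrative sketch of that pairing (the gamma, nu and n_components values are arbitrary choices made for this example, not values taken from the data above):

    import numpy as np
    from sklearn.kernel_approximation import Nystroem
    from sklearn.linear_model import SGDOneClassSVM
    from sklearn.pipeline import make_pipeline

    rng = np.random.RandomState(42)
    X_train = 0.3 * rng.randn(200, 2)             # inliers clustered near the origin
    X_test = np.array([[0.1, -0.2], [4.0, 4.0]])  # one inlier-like point, one far-away point

    # Nystroem maps the data into an approximate RBF feature space, so the linear
    # one-class SVM trained on top of it behaves similarly to an RBF OneClassSVM.
    clf = make_pipeline(
        Nystroem(gamma=0.5, n_components=100, random_state=42),
        SGDOneClassSVM(nu=0.05, random_state=42),
    )
    clf.fit(X_train)
    print(clf.predict(X_test))                    # typically [ 1 -1]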
The default value is 0.0 as eta0 is not used by\n the default schedule 'optimal'.\n\n power_t : float, default=0.5\n The exponent for inverse scaling learning rate [default 0.5].\n\n warm_start : bool, default=False\n When set to True, reuse the solution of the previous call to fit as\n initialization, otherwise, just erase the previous solution.\n See :term:`the Glossary `.\n\n Repeatedly calling fit or partial_fit when warm_start is True can\n result in a different solution than when calling fit a single time\n because of the way the data is shuffled.\n If a dynamic learning rate is used, the learning rate is adapted\n depending on the number of samples already seen. Calling ``fit`` resets\n this counter, while ``partial_fit`` will result in increasing the\n existing counter.\n\n average : bool or int, default=False\n When set to True, computes the averaged SGD weights and stores the\n result in the ``coef_`` attribute. If set to an int greater than 1,\n averaging will begin once the total number of samples seen reaches\n average. So ``average=10`` will begin averaging after seeing 10\n samples.\n\n Attributes\n ----------\n coef_ : ndarray of shape (1, n_features)\n Weights assigned to the features.\n\n offset_ : ndarray of shape (1,)\n Offset used to define the decision function from the raw scores.\n We have the relation: decision_function = score_samples - offset.\n\n n_iter_ : int\n The actual number of iterations to reach the stopping criterion.\n\n t_ : int\n Number of weight updates performed during training.\n Same as ``(n_iter_ * n_samples)``.\n\n loss_function_ : concrete ``LossFunction``\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. 
versionadded:: 1.0\n\n See Also\n --------\n sklearn.svm.OneClassSVM : Unsupervised Outlier Detection.\n\n Notes\n -----\n This estimator has a linear complexity in the number of training samples\n and is thus better suited than the `sklearn.svm.OneClassSVM`\n implementation for datasets with a large number of training samples (say\n > 10,000).\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn import linear_model\n >>> X = np.array([[-1, -1], [-2, -1], [1, 1], [2, 1]])\n >>> clf = linear_model.SGDOneClassSVM(random_state=42)\n >>> clf.fit(X)\n SGDOneClassSVM(random_state=42)\n\n >>> print(clf.predict([[4, 4]]))\n [1]\n \"\"\"\n loss_functions = {'hinge': (Hinge, 1.0)}\n \n def __init__(self, nu=0.5, fit_intercept=True, max_iter=1000, tol=0.001, shuffle=True, verbose=0, random_state=None, learning_rate='optimal', eta0=0.0, power_t=0.5, warm_start=False, average=False):\n alpha = nu / 2\n self.nu = nu\n super(SGDOneClassSVM, self).__init__(loss='hinge', penalty='l2', alpha=alpha, C=1.0, l1_ratio=0, fit_intercept=fit_intercept, max_iter=max_iter, tol=tol, shuffle=shuffle, verbose=verbose, epsilon=DEFAULT_EPSILON, random_state=random_state, learning_rate=learning_rate, eta0=eta0, power_t=power_t, early_stopping=False, validation_fraction=0.1, n_iter_no_change=5, warm_start=warm_start, average=average)\n \n def _validate_params(self, for_partial_fit=False):\n \"\"\"Validate input params.\"\"\"\n if not 0 < self.nu <= 1:\n raise ValueError('nu must be in (0, 1], got nu=%f' % self.nu)\n super(SGDOneClassSVM, self)._validate_params(for_partial_fit=for_partial_fit)\n \n def _fit_one_class(self, X, alpha, C, sample_weight, learning_rate, max_iter):\n \"\"\"Uses SGD implementation with X and y=np.ones(n_samples).\"\"\"\n n_samples = X.shape[0]\n y = np.ones(n_samples, dtype=np.float64, order='C')\n (dataset, offset_decay) = make_dataset(X, y, sample_weight)\n penalty_type = self._get_penalty_type(self.penalty)\n learning_rate_type = self._get_learning_rate_type(learning_rate)\n validation_mask = self._make_validation_split(y)\n validation_score_cb = self._make_validation_score_cb(validation_mask, X, y, sample_weight)\n random_state = check_random_state(self.random_state)\n seed = random_state.randint(0, np.iinfo(np.int32).max)\n tol = self.tol if self.tol is not None else -np.inf\n one_class = 1\n pos_weight = 1\n neg_weight = 1\n if self.average:\n coef = self._standard_coef\n intercept = self._standard_intercept\n average_coef = self._average_coef\n average_intercept = self._average_intercept\n else:\n coef = self.coef_\n intercept = 1 - self.offset_\n average_coef = None\n average_intercept = [0]\n (coef, intercept, average_coef, average_intercept, self.n_iter_) = _plain_sgd(coef, intercept[0], average_coef, average_intercept[0], self.loss_function_, penalty_type, alpha, C, self.l1_ratio, dataset, validation_mask, self.early_stopping, validation_score_cb, int(self.n_iter_no_change), max_iter, tol, int(self.fit_intercept), int(self.verbose), int(self.shuffle), seed, neg_weight, pos_weight, learning_rate_type, self.eta0, self.power_t, one_class, self.t_, offset_decay, self.average)\n self.t_ += self.n_iter_ * n_samples\n if self.average > 0:\n self._average_intercept = np.atleast_1d(average_intercept)\n self._standard_intercept = np.atleast_1d(intercept)\n if self.average <= self.t_ - 1.0:\n self.coef_ = average_coef\n self.offset_ = 1 - np.atleast_1d(average_intercept)\n else:\n self.coef_ = coef\n self.offset_ = 1 - np.atleast_1d(intercept)\n else:\n self.offset_ = 1 - 
np.atleast_1d(intercept)\n \n def _partial_fit(self, X, alpha, C, loss, learning_rate, max_iter, sample_weight, coef_init, offset_init):\n first_call = getattr(self, 'coef_', None) is None\n X = self._validate_data(X, None, accept_sparse='csr', dtype=np.float64, order='C', accept_large_sparse=False, reset=first_call)\n n_features = X.shape[1]\n sample_weight = _check_sample_weight(sample_weight, X)\n if getattr(self, 'coef_', None) is None or coef_init is not None:\n self._allocate_parameter_mem(1, n_features, coef_init, offset_init, 1)\n elif n_features != self.coef_.shape[-1]:\n raise ValueError('Number of features %d does not match previous data %d.' % (n_features, self.coef_.shape[-1]))\n if self.average and getattr(self, '_average_coef', None) is None:\n self._average_coef = np.zeros(n_features, dtype=np.float64, order='C')\n self._average_intercept = np.zeros(1, dtype=np.float64, order='C')\n self.loss_function_ = self._get_loss_function(loss)\n if not hasattr(self, 't_'):\n self.t_ = 1.0\n self._fit_one_class(X, alpha=alpha, C=C, learning_rate=learning_rate, sample_weight=sample_weight, max_iter=max_iter)\n return self\n \n def partial_fit(self, X, y=None, sample_weight=None):\n \"\"\"Fit linear One-Class SVM with Stochastic Gradient Descent.\n\n Parameters\n ----------\n X : {array-like, sparse matrix}, shape (n_samples, n_features)\n Subset of the training data.\n y : Ignored\n Not used, present for API consistency by convention.\n\n sample_weight : array-like, shape (n_samples,), optional\n Weights applied to individual samples.\n If not provided, uniform weights are assumed.\n\n Returns\n -------\n self : object\n Returns a fitted instance of self.\n \"\"\"\n alpha = self.nu / 2\n self._validate_params(for_partial_fit=True)\n return self._partial_fit(X, alpha, C=1.0, loss=self.loss, learning_rate=self.learning_rate, max_iter=1, sample_weight=sample_weight, coef_init=None, offset_init=None)\n \n def _fit(self, X, alpha, C, loss, learning_rate, coef_init=None, offset_init=None, sample_weight=None):\n self._validate_params()\n if self.warm_start and hasattr(self, 'coef_'):\n if coef_init is None:\n coef_init = self.coef_\n if offset_init is None:\n offset_init = self.offset_\n else:\n self.coef_ = None\n self.offset_ = None\n self.t_ = 1.0\n self._partial_fit(X, alpha, C, loss, learning_rate, self.max_iter, sample_weight, coef_init, offset_init)\n if self.tol is not None and self.tol > -np.inf and self.n_iter_ == self.max_iter:\n warnings.warn('Maximum number of iteration reached before convergence. Consider increasing max_iter to improve the fit.', ConvergenceWarning)\n return self\n \n def fit(self, X, y=None, coef_init=None, offset_init=None, sample_weight=None):\n \"\"\"Fit linear One-Class SVM with Stochastic Gradient Descent.\n\n This solves an equivalent optimization problem of the\n One-Class SVM primal optimization problem and returns a weight vector\n w and an offset rho such that the decision function is given by\n - rho.\n\n Parameters\n ----------\n X : {array-like, sparse matrix}, shape (n_samples, n_features)\n Training data.\n y : Ignored\n Not used, present for API consistency by convention.\n\n coef_init : array, shape (n_classes, n_features)\n The initial coefficients to warm-start the optimization.\n\n offset_init : array, shape (n_classes,)\n The initial offset to warm-start the optimization.\n\n sample_weight : array-like, shape (n_samples,), optional\n Weights applied to individual samples.\n If not provided, uniform weights are assumed. 
These weights will\n be multiplied with class_weight (passed through the\n constructor) if class_weight is specified.\n\n Returns\n -------\n self : object\n Returns a fitted instance of self.\n \"\"\"\n alpha = self.nu / 2\n self._fit(X, alpha=alpha, C=1.0, loss=self.loss, learning_rate=self.learning_rate, coef_init=coef_init, offset_init=offset_init, sample_weight=sample_weight)\n return self\n \n def decision_function(self, X):\n \"\"\"Signed distance to the separating hyperplane.\n\n Signed distance is positive for an inlier and negative for an\n outlier.\n\n Parameters\n ----------\n X : {array-like, sparse matrix}, shape (n_samples, n_features)\n Testing data.\n\n Returns\n -------\n dec : array-like, shape (n_samples,)\n Decision function values of the samples.\n \"\"\"\n check_is_fitted(self, 'coef_')\n X = self._validate_data(X, accept_sparse='csr', reset=False)\n decisions = safe_sparse_dot(X, self.coef_.T, dense_output=True) - self.offset_\n return decisions.ravel()\n \n def score_samples(self, X):\n \"\"\"Raw scoring function of the samples.\n\n Parameters\n ----------\n X : {array-like, sparse matrix}, shape (n_samples, n_features)\n Testing data.\n\n Returns\n -------\n score_samples : array-like, shape (n_samples,)\n Unshiffted scoring function values of the samples.\n \"\"\"\n score_samples = self.decision_function(X) + self.offset_\n return score_samples\n \n def predict(self, X):\n \"\"\"Return labels (1 inlier, -1 outlier) of the samples.\n\n Parameters\n ----------\n X : {array-like, sparse matrix}, shape (n_samples, n_features)\n Testing data.\n\n Returns\n -------\n y : array, shape (n_samples,)\n Labels of the samples.\n \"\"\"\n y = (self.decision_function(X) >= 0).astype(np.int32)\n y[y == 0] = -1\n return y\n \n def _more_tags(self):\n return {'_xfail_checks': {'check_sample_weights_invariance': 'zero sample_weight is not equivalent to removing samples'}}\n" }, { "name": "SGDRegressor", @@ -24193,9 +24275,9 @@ "sklearn.linear_model._stochastic_gradient.SGDRegressor._more_tags" ], "is_public": true, - "description": "Linear model fitted by minimizing a regularized empirical loss with SGD.\n\nSGD stands for Stochastic Gradient Descent: the gradient of the loss is estimated each sample at a time and the model is updated along the way with a decreasing strength schedule (aka learning rate). The regularizer is a penalty added to the loss function that shrinks model parameters towards the zero vector using either the squared euclidean norm L2 or the absolute norm L1 or a combination of both (Elastic Net). If the parameter update crosses the 0.0 value because of the regularizer, the update is truncated to 0.0 to allow for learning sparse models and achieve online feature selection. This implementation works with data represented as dense numpy arrays of floating point values for the features. Read more in the :ref:`User Guide `.", - "docstring": "Linear model fitted by minimizing a regularized empirical loss with SGD.\n\n SGD stands for Stochastic Gradient Descent: the gradient of the loss is\n estimated each sample at a time and the model is updated along the way with\n a decreasing strength schedule (aka learning rate).\n\n The regularizer is a penalty added to the loss function that shrinks model\n parameters towards the zero vector using either the squared euclidean norm\n L2 or the absolute norm L1 or a combination of both (Elastic Net). 
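As a quick check of the relations stated in the SGDOneClassSVM entry above (decision_function equals score_samples minus offset_, and predict returns the sign as +1 for inliers and -1 for outliers), here is an illustrative sketch reusing the toy data from its Examples section:

    import numpy as np
    from sklearn.linear_model import SGDOneClassSVM

    X = np.array([[-1, -1], [-2, -1], [1, 1], [2, 1]])
    clf = SGDOneClassSVM(random_state=42).fit(X)

    scores = clf.score_samples(X)           # raw (unshifted) scores
    decisions = clf.decision_function(X)    # scores shifted by the learned offset_
    assert np.allclose(decisions, scores - clf.offset_)
    assert np.array_equal(clf.predict(X), np.where(decisions >= 0, 1, -1))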
If the\n parameter update crosses the 0.0 value because of the regularizer, the\n update is truncated to 0.0 to allow for learning sparse models and achieve\n online feature selection.\n\n This implementation works with data represented as dense numpy arrays of\n floating point values for the features.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n loss : str, default='squared_error'\n The loss function to be used. The possible values are 'squared_error',\n 'huber', 'epsilon_insensitive', or 'squared_epsilon_insensitive'\n\n The 'squared_error' refers to the ordinary least squares fit.\n 'huber' modifies 'squared_error' to focus less on getting outliers\n correct by switching from squared to linear loss past a distance of\n epsilon. 'epsilon_insensitive' ignores errors less than epsilon and is\n linear past that; this is the loss function used in SVR.\n 'squared_epsilon_insensitive' is the same but becomes squared loss past\n a tolerance of epsilon.\n\n More details about the losses formulas can be found in the\n :ref:`User Guide `.\n\n .. deprecated:: 1.0\n The loss 'squared_loss' was deprecated in v1.0 and will be removed\n in version 1.2. Use `loss='squared_error'` which is equivalent.\n\n penalty : {'l2', 'l1', 'elasticnet'}, default='l2'\n The penalty (aka regularization term) to be used. Defaults to 'l2'\n which is the standard regularizer for linear SVM models. 'l1' and\n 'elasticnet' might bring sparsity to the model (feature selection)\n not achievable with 'l2'.\n\n alpha : float, default=0.0001\n Constant that multiplies the regularization term. The higher the\n value, the stronger the regularization.\n Also used to compute the learning rate when set to `learning_rate` is\n set to 'optimal'.\n\n l1_ratio : float, default=0.15\n The Elastic Net mixing parameter, with 0 <= l1_ratio <= 1.\n l1_ratio=0 corresponds to L2 penalty, l1_ratio=1 to L1.\n Only used if `penalty` is 'elasticnet'.\n\n fit_intercept : bool, default=True\n Whether the intercept should be estimated or not. If False, the\n data is assumed to be already centered.\n\n max_iter : int, default=1000\n The maximum number of passes over the training data (aka epochs).\n It only impacts the behavior in the ``fit`` method, and not the\n :meth:`partial_fit` method.\n\n .. versionadded:: 0.19\n\n tol : float, default=1e-3\n The stopping criterion. If it is not None, training will stop\n when (loss > best_loss - tol) for ``n_iter_no_change`` consecutive\n epochs.\n Convergence is checked against the training loss or the\n validation loss depending on the `early_stopping` parameter.\n\n .. 
versionadded:: 0.19\n\n shuffle : bool, default=True\n Whether or not the training data should be shuffled after each epoch.\n\n verbose : int, default=0\n The verbosity level.\n\n epsilon : float, default=0.1\n Epsilon in the epsilon-insensitive loss functions; only if `loss` is\n 'huber', 'epsilon_insensitive', or 'squared_epsilon_insensitive'.\n For 'huber', determines the threshold at which it becomes less\n important to get the prediction exactly right.\n For epsilon-insensitive, any differences between the current prediction\n and the correct label are ignored if they are less than this threshold.\n\n random_state : int, RandomState instance, default=None\n Used for shuffling the data, when ``shuffle`` is set to ``True``.\n Pass an int for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n learning_rate : str, default='invscaling'\n The learning rate schedule:\n\n - 'constant': `eta = eta0`\n - 'optimal': `eta = 1.0 / (alpha * (t + t0))`\n where t0 is chosen by a heuristic proposed by Leon Bottou.\n - 'invscaling': `eta = eta0 / pow(t, power_t)`\n - 'adaptive': eta = eta0, as long as the training keeps decreasing.\n Each time n_iter_no_change consecutive epochs fail to decrease the\n training loss by tol or fail to increase validation score by tol if\n early_stopping is True, the current learning rate is divided by 5.\n\n .. versionadded:: 0.20\n Added 'adaptive' option\n\n eta0 : double, default=0.01\n The initial learning rate for the 'constant', 'invscaling' or\n 'adaptive' schedules. The default value is 0.01.\n\n power_t : double, default=0.25\n The exponent for inverse scaling learning rate.\n\n early_stopping : bool, default=False\n Whether to use early stopping to terminate training when validation\n score is not improving. If set to True, it will automatically set aside\n a fraction of training data as validation and terminate\n training when validation score returned by the `score` method is not\n improving by at least `tol` for `n_iter_no_change` consecutive\n epochs.\n\n .. versionadded:: 0.20\n Added 'early_stopping' option\n\n validation_fraction : float, default=0.1\n The proportion of training data to set aside as validation set for\n early stopping. Must be between 0 and 1.\n Only used if `early_stopping` is True.\n\n .. versionadded:: 0.20\n Added 'validation_fraction' option\n\n n_iter_no_change : int, default=5\n Number of iterations with no improvement to wait before stopping\n fitting.\n Convergence is checked against the training loss or the\n validation loss depending on the `early_stopping` parameter.\n\n .. versionadded:: 0.20\n Added 'n_iter_no_change' option\n\n warm_start : bool, default=False\n When set to True, reuse the solution of the previous call to fit as\n initialization, otherwise, just erase the previous solution.\n See :term:`the Glossary `.\n\n Repeatedly calling fit or partial_fit when warm_start is True can\n result in a different solution than when calling fit a single time\n because of the way the data is shuffled.\n If a dynamic learning rate is used, the learning rate is adapted\n depending on the number of samples already seen. Calling ``fit`` resets\n this counter, while ``partial_fit`` will result in increasing the\n existing counter.\n\n average : bool or int, default=False\n When set to True, computes the averaged SGD weights across all\n updates and stores the result in the ``coef_`` attribute. 
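The warm_start notes above describe how t_ counts weight updates: calling fit resets the counter, while partial_fit keeps accumulating it. An illustrative sketch of streaming data through partial_fit (the data and chunking are arbitrary choices for this example):

    import numpy as np
    from sklearn.linear_model import SGDRegressor

    rng = np.random.RandomState(0)
    X = rng.randn(100, 5)
    y = X @ np.array([1.0, -2.0, 0.5, 0.0, 3.0]) + 0.1 * rng.randn(100)

    reg = SGDRegressor(random_state=0)
    for chunk in np.array_split(np.arange(100), 5):
        reg.partial_fit(X[chunk], y[chunk])  # one pass over each chunk; counter keeps growing
    print(reg.t_)                            # total number of weight updates so far
    reg.fit(X, y)                            # a fresh fit resets the counter before training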
If set to\n an int greater than 1, averaging will begin once the total number of\n samples seen reaches `average`. So ``average=10`` will begin\n averaging after seeing 10 samples.\n\n Attributes\n ----------\n coef_ : ndarray of shape (n_features,)\n Weights assigned to the features.\n\n intercept_ : ndarray of shape (1,)\n The intercept term.\n\n n_iter_ : int\n The actual number of iterations before reaching the stopping criterion.\n\n t_ : int\n Number of weight updates performed during training.\n Same as ``(n_iter_ * n_samples)``.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n HuberRegressor : Linear regression model that is robust to outliers.\n Lars : Least Angle Regression model.\n Lasso : Linear Model trained with L1 prior as regularizer.\n RANSACRegressor : RANSAC (RANdom SAmple Consensus) algorithm.\n Ridge : Linear least squares with l2 regularization.\n sklearn.svm.SVR : Epsilon-Support Vector Regression.\n TheilSenRegressor : Theil-Sen Estimator robust multivariate regression model.\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.linear_model import SGDRegressor\n >>> from sklearn.pipeline import make_pipeline\n >>> from sklearn.preprocessing import StandardScaler\n >>> n_samples, n_features = 10, 5\n >>> rng = np.random.RandomState(0)\n >>> y = rng.randn(n_samples)\n >>> X = rng.randn(n_samples, n_features)\n >>> # Always scale the input. The most convenient way is to use a pipeline.\n >>> reg = make_pipeline(StandardScaler(),\n ... SGDRegressor(max_iter=1000, tol=1e-3))\n >>> reg.fit(X, y)\n Pipeline(steps=[('standardscaler', StandardScaler()),\n ('sgdregressor', SGDRegressor())])\n ", - "source_code": "\n\nclass SGDRegressor(BaseSGDRegressor):\n \"\"\"Linear model fitted by minimizing a regularized empirical loss with SGD.\n\n SGD stands for Stochastic Gradient Descent: the gradient of the loss is\n estimated each sample at a time and the model is updated along the way with\n a decreasing strength schedule (aka learning rate).\n\n The regularizer is a penalty added to the loss function that shrinks model\n parameters towards the zero vector using either the squared euclidean norm\n L2 or the absolute norm L1 or a combination of both (Elastic Net). If the\n parameter update crosses the 0.0 value because of the regularizer, the\n update is truncated to 0.0 to allow for learning sparse models and achieve\n online feature selection.\n\n This implementation works with data represented as dense numpy arrays of\n floating point values for the features.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n loss : str, default='squared_error'\n The loss function to be used. The possible values are 'squared_error',\n 'huber', 'epsilon_insensitive', or 'squared_epsilon_insensitive'\n\n The 'squared_error' refers to the ordinary least squares fit.\n 'huber' modifies 'squared_error' to focus less on getting outliers\n correct by switching from squared to linear loss past a distance of\n epsilon. 
'epsilon_insensitive' ignores errors less than epsilon and is\n linear past that; this is the loss function used in SVR.\n 'squared_epsilon_insensitive' is the same but becomes squared loss past\n a tolerance of epsilon.\n\n More details about the losses formulas can be found in the\n :ref:`User Guide `.\n\n .. deprecated:: 1.0\n The loss 'squared_loss' was deprecated in v1.0 and will be removed\n in version 1.2. Use `loss='squared_error'` which is equivalent.\n\n penalty : {'l2', 'l1', 'elasticnet'}, default='l2'\n The penalty (aka regularization term) to be used. Defaults to 'l2'\n which is the standard regularizer for linear SVM models. 'l1' and\n 'elasticnet' might bring sparsity to the model (feature selection)\n not achievable with 'l2'.\n\n alpha : float, default=0.0001\n Constant that multiplies the regularization term. The higher the\n value, the stronger the regularization.\n Also used to compute the learning rate when set to `learning_rate` is\n set to 'optimal'.\n\n l1_ratio : float, default=0.15\n The Elastic Net mixing parameter, with 0 <= l1_ratio <= 1.\n l1_ratio=0 corresponds to L2 penalty, l1_ratio=1 to L1.\n Only used if `penalty` is 'elasticnet'.\n\n fit_intercept : bool, default=True\n Whether the intercept should be estimated or not. If False, the\n data is assumed to be already centered.\n\n max_iter : int, default=1000\n The maximum number of passes over the training data (aka epochs).\n It only impacts the behavior in the ``fit`` method, and not the\n :meth:`partial_fit` method.\n\n .. versionadded:: 0.19\n\n tol : float, default=1e-3\n The stopping criterion. If it is not None, training will stop\n when (loss > best_loss - tol) for ``n_iter_no_change`` consecutive\n epochs.\n Convergence is checked against the training loss or the\n validation loss depending on the `early_stopping` parameter.\n\n .. versionadded:: 0.19\n\n shuffle : bool, default=True\n Whether or not the training data should be shuffled after each epoch.\n\n verbose : int, default=0\n The verbosity level.\n\n epsilon : float, default=0.1\n Epsilon in the epsilon-insensitive loss functions; only if `loss` is\n 'huber', 'epsilon_insensitive', or 'squared_epsilon_insensitive'.\n For 'huber', determines the threshold at which it becomes less\n important to get the prediction exactly right.\n For epsilon-insensitive, any differences between the current prediction\n and the correct label are ignored if they are less than this threshold.\n\n random_state : int, RandomState instance, default=None\n Used for shuffling the data, when ``shuffle`` is set to ``True``.\n Pass an int for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n learning_rate : str, default='invscaling'\n The learning rate schedule:\n\n - 'constant': `eta = eta0`\n - 'optimal': `eta = 1.0 / (alpha * (t + t0))`\n where t0 is chosen by a heuristic proposed by Leon Bottou.\n - 'invscaling': `eta = eta0 / pow(t, power_t)`\n - 'adaptive': eta = eta0, as long as the training keeps decreasing.\n Each time n_iter_no_change consecutive epochs fail to decrease the\n training loss by tol or fail to increase validation score by tol if\n early_stopping is True, the current learning rate is divided by 5.\n\n .. versionadded:: 0.20\n Added 'adaptive' option\n\n eta0 : double, default=0.01\n The initial learning rate for the 'constant', 'invscaling' or\n 'adaptive' schedules. 
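The learning_rate options described above are simple closed-form schedules, except 'adaptive', which keeps eta = eta0 and divides it by 5 whenever progress stalls for n_iter_no_change epochs. An illustrative sketch of the closed-form ones; t0 in the 'optimal' schedule is chosen internally by a heuristic, so here it is passed in explicitly purely for illustration:

    def learning_rate(schedule, t, eta0=0.01, alpha=0.0001, power_t=0.25, t0=1000.0):
        # Return eta at step t for the closed-form schedules described above.
        if schedule == "constant":
            return eta0
        if schedule == "optimal":
            return 1.0 / (alpha * (t + t0))   # t0 is an internal heuristic of the library
        if schedule == "invscaling":
            return eta0 / pow(t, power_t)
        raise ValueError(f"unsupported schedule: {schedule!r}")

    for t in (1, 10, 100):
        print(t, learning_rate("invscaling", t), learning_rate("optimal", t))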
The default value is 0.01.\n\n power_t : double, default=0.25\n The exponent for inverse scaling learning rate.\n\n early_stopping : bool, default=False\n Whether to use early stopping to terminate training when validation\n score is not improving. If set to True, it will automatically set aside\n a fraction of training data as validation and terminate\n training when validation score returned by the `score` method is not\n improving by at least `tol` for `n_iter_no_change` consecutive\n epochs.\n\n .. versionadded:: 0.20\n Added 'early_stopping' option\n\n validation_fraction : float, default=0.1\n The proportion of training data to set aside as validation set for\n early stopping. Must be between 0 and 1.\n Only used if `early_stopping` is True.\n\n .. versionadded:: 0.20\n Added 'validation_fraction' option\n\n n_iter_no_change : int, default=5\n Number of iterations with no improvement to wait before stopping\n fitting.\n Convergence is checked against the training loss or the\n validation loss depending on the `early_stopping` parameter.\n\n .. versionadded:: 0.20\n Added 'n_iter_no_change' option\n\n warm_start : bool, default=False\n When set to True, reuse the solution of the previous call to fit as\n initialization, otherwise, just erase the previous solution.\n See :term:`the Glossary `.\n\n Repeatedly calling fit or partial_fit when warm_start is True can\n result in a different solution than when calling fit a single time\n because of the way the data is shuffled.\n If a dynamic learning rate is used, the learning rate is adapted\n depending on the number of samples already seen. Calling ``fit`` resets\n this counter, while ``partial_fit`` will result in increasing the\n existing counter.\n\n average : bool or int, default=False\n When set to True, computes the averaged SGD weights across all\n updates and stores the result in the ``coef_`` attribute. If set to\n an int greater than 1, averaging will begin once the total number of\n samples seen reaches `average`. So ``average=10`` will begin\n averaging after seeing 10 samples.\n\n Attributes\n ----------\n coef_ : ndarray of shape (n_features,)\n Weights assigned to the features.\n\n intercept_ : ndarray of shape (1,)\n The intercept term.\n\n n_iter_ : int\n The actual number of iterations before reaching the stopping criterion.\n\n t_ : int\n Number of weight updates performed during training.\n Same as ``(n_iter_ * n_samples)``.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. 
versionadded:: 1.0\n\n See Also\n --------\n HuberRegressor : Linear regression model that is robust to outliers.\n Lars : Least Angle Regression model.\n Lasso : Linear Model trained with L1 prior as regularizer.\n RANSACRegressor : RANSAC (RANdom SAmple Consensus) algorithm.\n Ridge : Linear least squares with l2 regularization.\n sklearn.svm.SVR : Epsilon-Support Vector Regression.\n TheilSenRegressor : Theil-Sen Estimator robust multivariate regression model.\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.linear_model import SGDRegressor\n >>> from sklearn.pipeline import make_pipeline\n >>> from sklearn.preprocessing import StandardScaler\n >>> n_samples, n_features = 10, 5\n >>> rng = np.random.RandomState(0)\n >>> y = rng.randn(n_samples)\n >>> X = rng.randn(n_samples, n_features)\n >>> # Always scale the input. The most convenient way is to use a pipeline.\n >>> reg = make_pipeline(StandardScaler(),\n ... SGDRegressor(max_iter=1000, tol=1e-3))\n >>> reg.fit(X, y)\n Pipeline(steps=[('standardscaler', StandardScaler()),\n ('sgdregressor', SGDRegressor())])\n \"\"\"\n \n def __init__(self, loss='squared_error', *, penalty='l2', alpha=0.0001, l1_ratio=0.15, fit_intercept=True, max_iter=1000, tol=0.001, shuffle=True, verbose=0, epsilon=DEFAULT_EPSILON, random_state=None, learning_rate='invscaling', eta0=0.01, power_t=0.25, early_stopping=False, validation_fraction=0.1, n_iter_no_change=5, warm_start=False, average=False):\n super().__init__(loss=loss, penalty=penalty, alpha=alpha, l1_ratio=l1_ratio, fit_intercept=fit_intercept, max_iter=max_iter, tol=tol, shuffle=shuffle, verbose=verbose, epsilon=epsilon, random_state=random_state, learning_rate=learning_rate, eta0=eta0, power_t=power_t, early_stopping=early_stopping, validation_fraction=validation_fraction, n_iter_no_change=n_iter_no_change, warm_start=warm_start, average=average)\n \n def _more_tags(self):\n return {'_xfail_checks': {'check_sample_weights_invariance': 'zero sample_weight is not equivalent to removing samples'}}\n" + "description": "Linear model fitted by minimizing a regularized empirical loss with SGD.\n\nSGD stands for Stochastic Gradient Descent: the gradient of the loss is\nestimated each sample at a time and the model is updated along the way with\na decreasing strength schedule (aka learning rate).\n\nThe regularizer is a penalty added to the loss function that shrinks model\nparameters towards the zero vector using either the squared euclidean norm\nL2 or the absolute norm L1 or a combination of both (Elastic Net). If the\nparameter update crosses the 0.0 value because of the regularizer, the\nupdate is truncated to 0.0 to allow for learning sparse models and achieve\nonline feature selection.\n\nThis implementation works with data represented as dense numpy arrays of\nfloating point values for the features.\n\nRead more in the :ref:`User Guide `.", + "docstring": "Linear model fitted by minimizing a regularized empirical loss with SGD.\n\n SGD stands for Stochastic Gradient Descent: the gradient of the loss is\n estimated each sample at a time and the model is updated along the way with\n a decreasing strength schedule (aka learning rate).\n\n The regularizer is a penalty added to the loss function that shrinks model\n parameters towards the zero vector using either the squared euclidean norm\n L2 or the absolute norm L1 or a combination of both (Elastic Net). 
If the\n parameter update crosses the 0.0 value because of the regularizer, the\n update is truncated to 0.0 to allow for learning sparse models and achieve\n online feature selection.\n\n This implementation works with data represented as dense numpy arrays of\n floating point values for the features.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n loss : str, default='squared_error'\n The loss function to be used. The possible values are 'squared_error',\n 'huber', 'epsilon_insensitive', or 'squared_epsilon_insensitive'\n\n The 'squared_error' refers to the ordinary least squares fit.\n 'huber' modifies 'squared_error' to focus less on getting outliers\n correct by switching from squared to linear loss past a distance of\n epsilon. 'epsilon_insensitive' ignores errors less than epsilon and is\n linear past that; this is the loss function used in SVR.\n 'squared_epsilon_insensitive' is the same but becomes squared loss past\n a tolerance of epsilon.\n\n More details about the losses formulas can be found in the\n :ref:`User Guide `.\n\n .. deprecated:: 1.0\n The loss 'squared_loss' was deprecated in v1.0 and will be removed\n in version 1.2. Use `loss='squared_error'` which is equivalent.\n\n penalty : {'l2', 'l1', 'elasticnet'}, default='l2'\n The penalty (aka regularization term) to be used. Defaults to 'l2'\n which is the standard regularizer for linear SVM models. 'l1' and\n 'elasticnet' might bring sparsity to the model (feature selection)\n not achievable with 'l2'.\n\n alpha : float, default=0.0001\n Constant that multiplies the regularization term. The higher the\n value, the stronger the regularization.\n Also used to compute the learning rate when set to `learning_rate` is\n set to 'optimal'.\n\n l1_ratio : float, default=0.15\n The Elastic Net mixing parameter, with 0 <= l1_ratio <= 1.\n l1_ratio=0 corresponds to L2 penalty, l1_ratio=1 to L1.\n Only used if `penalty` is 'elasticnet'.\n\n fit_intercept : bool, default=True\n Whether the intercept should be estimated or not. If False, the\n data is assumed to be already centered.\n\n max_iter : int, default=1000\n The maximum number of passes over the training data (aka epochs).\n It only impacts the behavior in the ``fit`` method, and not the\n :meth:`partial_fit` method.\n\n .. versionadded:: 0.19\n\n tol : float, default=1e-3\n The stopping criterion. If it is not None, training will stop\n when (loss > best_loss - tol) for ``n_iter_no_change`` consecutive\n epochs.\n Convergence is checked against the training loss or the\n validation loss depending on the `early_stopping` parameter.\n\n .. 
versionadded:: 0.19\n\n shuffle : bool, default=True\n Whether or not the training data should be shuffled after each epoch.\n\n verbose : int, default=0\n The verbosity level.\n\n epsilon : float, default=0.1\n Epsilon in the epsilon-insensitive loss functions; only if `loss` is\n 'huber', 'epsilon_insensitive', or 'squared_epsilon_insensitive'.\n For 'huber', determines the threshold at which it becomes less\n important to get the prediction exactly right.\n For epsilon-insensitive, any differences between the current prediction\n and the correct label are ignored if they are less than this threshold.\n\n random_state : int, RandomState instance, default=None\n Used for shuffling the data, when ``shuffle`` is set to ``True``.\n Pass an int for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n learning_rate : str, default='invscaling'\n The learning rate schedule:\n\n - 'constant': `eta = eta0`\n - 'optimal': `eta = 1.0 / (alpha * (t + t0))`\n where t0 is chosen by a heuristic proposed by Leon Bottou.\n - 'invscaling': `eta = eta0 / pow(t, power_t)`\n - 'adaptive': eta = eta0, as long as the training keeps decreasing.\n Each time n_iter_no_change consecutive epochs fail to decrease the\n training loss by tol or fail to increase validation score by tol if\n early_stopping is True, the current learning rate is divided by 5.\n\n .. versionadded:: 0.20\n Added 'adaptive' option\n\n eta0 : float, default=0.01\n The initial learning rate for the 'constant', 'invscaling' or\n 'adaptive' schedules. The default value is 0.01.\n\n power_t : float, default=0.25\n The exponent for inverse scaling learning rate.\n\n early_stopping : bool, default=False\n Whether to use early stopping to terminate training when validation\n score is not improving. If set to True, it will automatically set aside\n a fraction of training data as validation and terminate\n training when validation score returned by the `score` method is not\n improving by at least `tol` for `n_iter_no_change` consecutive\n epochs.\n\n .. versionadded:: 0.20\n Added 'early_stopping' option\n\n validation_fraction : float, default=0.1\n The proportion of training data to set aside as validation set for\n early stopping. Must be between 0 and 1.\n Only used if `early_stopping` is True.\n\n .. versionadded:: 0.20\n Added 'validation_fraction' option\n\n n_iter_no_change : int, default=5\n Number of iterations with no improvement to wait before stopping\n fitting.\n Convergence is checked against the training loss or the\n validation loss depending on the `early_stopping` parameter.\n\n .. versionadded:: 0.20\n Added 'n_iter_no_change' option\n\n warm_start : bool, default=False\n When set to True, reuse the solution of the previous call to fit as\n initialization, otherwise, just erase the previous solution.\n See :term:`the Glossary `.\n\n Repeatedly calling fit or partial_fit when warm_start is True can\n result in a different solution than when calling fit a single time\n because of the way the data is shuffled.\n If a dynamic learning rate is used, the learning rate is adapted\n depending on the number of samples already seen. Calling ``fit`` resets\n this counter, while ``partial_fit`` will result in increasing the\n existing counter.\n\n average : bool or int, default=False\n When set to True, computes the averaged SGD weights across all\n updates and stores the result in the ``coef_`` attribute. 
If set to\n an int greater than 1, averaging will begin once the total number of\n samples seen reaches `average`. So ``average=10`` will begin\n averaging after seeing 10 samples.\n\n Attributes\n ----------\n coef_ : ndarray of shape (n_features,)\n Weights assigned to the features.\n\n intercept_ : ndarray of shape (1,)\n The intercept term.\n\n n_iter_ : int\n The actual number of iterations before reaching the stopping criterion.\n\n t_ : int\n Number of weight updates performed during training.\n Same as ``(n_iter_ * n_samples)``.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n HuberRegressor : Linear regression model that is robust to outliers.\n Lars : Least Angle Regression model.\n Lasso : Linear Model trained with L1 prior as regularizer.\n RANSACRegressor : RANSAC (RANdom SAmple Consensus) algorithm.\n Ridge : Linear least squares with l2 regularization.\n sklearn.svm.SVR : Epsilon-Support Vector Regression.\n TheilSenRegressor : Theil-Sen Estimator robust multivariate regression model.\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.linear_model import SGDRegressor\n >>> from sklearn.pipeline import make_pipeline\n >>> from sklearn.preprocessing import StandardScaler\n >>> n_samples, n_features = 10, 5\n >>> rng = np.random.RandomState(0)\n >>> y = rng.randn(n_samples)\n >>> X = rng.randn(n_samples, n_features)\n >>> # Always scale the input. The most convenient way is to use a pipeline.\n >>> reg = make_pipeline(StandardScaler(),\n ... SGDRegressor(max_iter=1000, tol=1e-3))\n >>> reg.fit(X, y)\n Pipeline(steps=[('standardscaler', StandardScaler()),\n ('sgdregressor', SGDRegressor())])\n ", + "source_code": "\n\nclass SGDRegressor(BaseSGDRegressor):\n \"\"\"Linear model fitted by minimizing a regularized empirical loss with SGD.\n\n SGD stands for Stochastic Gradient Descent: the gradient of the loss is\n estimated each sample at a time and the model is updated along the way with\n a decreasing strength schedule (aka learning rate).\n\n The regularizer is a penalty added to the loss function that shrinks model\n parameters towards the zero vector using either the squared euclidean norm\n L2 or the absolute norm L1 or a combination of both (Elastic Net). If the\n parameter update crosses the 0.0 value because of the regularizer, the\n update is truncated to 0.0 to allow for learning sparse models and achieve\n online feature selection.\n\n This implementation works with data represented as dense numpy arrays of\n floating point values for the features.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n loss : str, default='squared_error'\n The loss function to be used. The possible values are 'squared_error',\n 'huber', 'epsilon_insensitive', or 'squared_epsilon_insensitive'\n\n The 'squared_error' refers to the ordinary least squares fit.\n 'huber' modifies 'squared_error' to focus less on getting outliers\n correct by switching from squared to linear loss past a distance of\n epsilon. 
'epsilon_insensitive' ignores errors less than epsilon and is\n linear past that; this is the loss function used in SVR.\n 'squared_epsilon_insensitive' is the same but becomes squared loss past\n a tolerance of epsilon.\n\n More details about the losses formulas can be found in the\n :ref:`User Guide `.\n\n .. deprecated:: 1.0\n The loss 'squared_loss' was deprecated in v1.0 and will be removed\n in version 1.2. Use `loss='squared_error'` which is equivalent.\n\n penalty : {'l2', 'l1', 'elasticnet'}, default='l2'\n The penalty (aka regularization term) to be used. Defaults to 'l2'\n which is the standard regularizer for linear SVM models. 'l1' and\n 'elasticnet' might bring sparsity to the model (feature selection)\n not achievable with 'l2'.\n\n alpha : float, default=0.0001\n Constant that multiplies the regularization term. The higher the\n value, the stronger the regularization.\n Also used to compute the learning rate when set to `learning_rate` is\n set to 'optimal'.\n\n l1_ratio : float, default=0.15\n The Elastic Net mixing parameter, with 0 <= l1_ratio <= 1.\n l1_ratio=0 corresponds to L2 penalty, l1_ratio=1 to L1.\n Only used if `penalty` is 'elasticnet'.\n\n fit_intercept : bool, default=True\n Whether the intercept should be estimated or not. If False, the\n data is assumed to be already centered.\n\n max_iter : int, default=1000\n The maximum number of passes over the training data (aka epochs).\n It only impacts the behavior in the ``fit`` method, and not the\n :meth:`partial_fit` method.\n\n .. versionadded:: 0.19\n\n tol : float, default=1e-3\n The stopping criterion. If it is not None, training will stop\n when (loss > best_loss - tol) for ``n_iter_no_change`` consecutive\n epochs.\n Convergence is checked against the training loss or the\n validation loss depending on the `early_stopping` parameter.\n\n .. versionadded:: 0.19\n\n shuffle : bool, default=True\n Whether or not the training data should be shuffled after each epoch.\n\n verbose : int, default=0\n The verbosity level.\n\n epsilon : float, default=0.1\n Epsilon in the epsilon-insensitive loss functions; only if `loss` is\n 'huber', 'epsilon_insensitive', or 'squared_epsilon_insensitive'.\n For 'huber', determines the threshold at which it becomes less\n important to get the prediction exactly right.\n For epsilon-insensitive, any differences between the current prediction\n and the correct label are ignored if they are less than this threshold.\n\n random_state : int, RandomState instance, default=None\n Used for shuffling the data, when ``shuffle`` is set to ``True``.\n Pass an int for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n learning_rate : str, default='invscaling'\n The learning rate schedule:\n\n - 'constant': `eta = eta0`\n - 'optimal': `eta = 1.0 / (alpha * (t + t0))`\n where t0 is chosen by a heuristic proposed by Leon Bottou.\n - 'invscaling': `eta = eta0 / pow(t, power_t)`\n - 'adaptive': eta = eta0, as long as the training keeps decreasing.\n Each time n_iter_no_change consecutive epochs fail to decrease the\n training loss by tol or fail to increase validation score by tol if\n early_stopping is True, the current learning rate is divided by 5.\n\n .. versionadded:: 0.20\n Added 'adaptive' option\n\n eta0 : float, default=0.01\n The initial learning rate for the 'constant', 'invscaling' or\n 'adaptive' schedules. 
The default value is 0.01.\n\n power_t : float, default=0.25\n The exponent for inverse scaling learning rate.\n\n early_stopping : bool, default=False\n Whether to use early stopping to terminate training when validation\n score is not improving. If set to True, it will automatically set aside\n a fraction of training data as validation and terminate\n training when validation score returned by the `score` method is not\n improving by at least `tol` for `n_iter_no_change` consecutive\n epochs.\n\n .. versionadded:: 0.20\n Added 'early_stopping' option\n\n validation_fraction : float, default=0.1\n The proportion of training data to set aside as validation set for\n early stopping. Must be between 0 and 1.\n Only used if `early_stopping` is True.\n\n .. versionadded:: 0.20\n Added 'validation_fraction' option\n\n n_iter_no_change : int, default=5\n Number of iterations with no improvement to wait before stopping\n fitting.\n Convergence is checked against the training loss or the\n validation loss depending on the `early_stopping` parameter.\n\n .. versionadded:: 0.20\n Added 'n_iter_no_change' option\n\n warm_start : bool, default=False\n When set to True, reuse the solution of the previous call to fit as\n initialization, otherwise, just erase the previous solution.\n See :term:`the Glossary `.\n\n Repeatedly calling fit or partial_fit when warm_start is True can\n result in a different solution than when calling fit a single time\n because of the way the data is shuffled.\n If a dynamic learning rate is used, the learning rate is adapted\n depending on the number of samples already seen. Calling ``fit`` resets\n this counter, while ``partial_fit`` will result in increasing the\n existing counter.\n\n average : bool or int, default=False\n When set to True, computes the averaged SGD weights across all\n updates and stores the result in the ``coef_`` attribute. If set to\n an int greater than 1, averaging will begin once the total number of\n samples seen reaches `average`. So ``average=10`` will begin\n averaging after seeing 10 samples.\n\n Attributes\n ----------\n coef_ : ndarray of shape (n_features,)\n Weights assigned to the features.\n\n intercept_ : ndarray of shape (1,)\n The intercept term.\n\n n_iter_ : int\n The actual number of iterations before reaching the stopping criterion.\n\n t_ : int\n Number of weight updates performed during training.\n Same as ``(n_iter_ * n_samples)``.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. 
versionadded:: 1.0\n\n See Also\n --------\n HuberRegressor : Linear regression model that is robust to outliers.\n Lars : Least Angle Regression model.\n Lasso : Linear Model trained with L1 prior as regularizer.\n RANSACRegressor : RANSAC (RANdom SAmple Consensus) algorithm.\n Ridge : Linear least squares with l2 regularization.\n sklearn.svm.SVR : Epsilon-Support Vector Regression.\n TheilSenRegressor : Theil-Sen Estimator robust multivariate regression model.\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.linear_model import SGDRegressor\n >>> from sklearn.pipeline import make_pipeline\n >>> from sklearn.preprocessing import StandardScaler\n >>> n_samples, n_features = 10, 5\n >>> rng = np.random.RandomState(0)\n >>> y = rng.randn(n_samples)\n >>> X = rng.randn(n_samples, n_features)\n >>> # Always scale the input. The most convenient way is to use a pipeline.\n >>> reg = make_pipeline(StandardScaler(),\n ... SGDRegressor(max_iter=1000, tol=1e-3))\n >>> reg.fit(X, y)\n Pipeline(steps=[('standardscaler', StandardScaler()),\n ('sgdregressor', SGDRegressor())])\n \"\"\"\n \n def __init__(self, loss='squared_error', *, penalty='l2', alpha=0.0001, l1_ratio=0.15, fit_intercept=True, max_iter=1000, tol=0.001, shuffle=True, verbose=0, epsilon=DEFAULT_EPSILON, random_state=None, learning_rate='invscaling', eta0=0.01, power_t=0.25, early_stopping=False, validation_fraction=0.1, n_iter_no_change=5, warm_start=False, average=False):\n super().__init__(loss=loss, penalty=penalty, alpha=alpha, l1_ratio=l1_ratio, fit_intercept=fit_intercept, max_iter=max_iter, tol=tol, shuffle=shuffle, verbose=verbose, epsilon=epsilon, random_state=random_state, learning_rate=learning_rate, eta0=eta0, power_t=power_t, early_stopping=early_stopping, validation_fraction=validation_fraction, n_iter_no_change=n_iter_no_change, warm_start=warm_start, average=average)\n \n def _more_tags(self):\n return {'_xfail_checks': {'check_sample_weights_invariance': 'zero sample_weight is not equivalent to removing samples'}}\n" }, { "name": "_ValidationScoreCallback", @@ -24222,9 +24304,9 @@ "sklearn.linear_model._theil_sen.TheilSenRegressor.fit" ], "is_public": true, - "description": "Theil-Sen Estimator: robust multivariate regression model.\n\nThe algorithm calculates least square solutions on subsets with size n_subsamples of the samples in X. Any value of n_subsamples between the number of features and samples leads to an estimator with a compromise between robustness and efficiency. Since the number of least square solutions is \"n_samples choose n_subsamples\", it can be extremely large and can therefore be limited with max_subpopulation. If this limit is reached, the subsets are chosen randomly. In a final step, the spatial median (or L1 median) is calculated of all least square solutions. Read more in the :ref:`User Guide `.", - "docstring": "Theil-Sen Estimator: robust multivariate regression model.\n\n The algorithm calculates least square solutions on subsets with size\n n_subsamples of the samples in X. Any value of n_subsamples between the\n number of features and samples leads to an estimator with a compromise\n between robustness and efficiency. Since the number of least square\n solutions is \"n_samples choose n_subsamples\", it can be extremely large\n and can therefore be limited with max_subpopulation. If this limit is\n reached, the subsets are chosen randomly. 
In a final step, the spatial\n median (or L1 median) is calculated of all least square solutions.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n fit_intercept : bool, default=True\n Whether to calculate the intercept for this model. If set\n to false, no intercept will be used in calculations.\n\n copy_X : bool, default=True\n If True, X will be copied; else, it may be overwritten.\n\n max_subpopulation : int, default=1e4\n Instead of computing with a set of cardinality 'n choose k', where n is\n the number of samples and k is the number of subsamples (at least\n number of features), consider only a stochastic subpopulation of a\n given maximal size if 'n choose k' is larger than max_subpopulation.\n For other than small problem sizes this parameter will determine\n memory usage and runtime if n_subsamples is not changed.\n\n n_subsamples : int, default=None\n Number of samples to calculate the parameters. This is at least the\n number of features (plus 1 if fit_intercept=True) and the number of\n samples as a maximum. A lower number leads to a higher breakdown\n point and a low efficiency while a high number leads to a low\n breakdown point and a high efficiency. If None, take the\n minimum number of subsamples leading to maximal robustness.\n If n_subsamples is set to n_samples, Theil-Sen is identical to least\n squares.\n\n max_iter : int, default=300\n Maximum number of iterations for the calculation of spatial median.\n\n tol : float, default=1.e-3\n Tolerance when calculating spatial median.\n\n random_state : int, RandomState instance or None, default=None\n A random number generator instance to define the state of the random\n permutations generator. Pass an int for reproducible output across\n multiple function calls.\n See :term:`Glossary `.\n\n n_jobs : int, default=None\n Number of CPUs to use during the cross validation.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n verbose : bool, default=False\n Verbose mode when fitting the model.\n\n Attributes\n ----------\n coef_ : ndarray of shape (n_features,)\n Coefficients of the regression model (median of distribution).\n\n intercept_ : float\n Estimated intercept of regression model.\n\n breakdown_ : float\n Approximated breakdown point.\n\n n_iter_ : int\n Number of iterations needed for the spatial median.\n\n n_subpopulation_ : int\n Number of combinations taken into account from 'n choose k', where n is\n the number of samples and k is the number of subsamples.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n HuberRegressor : Linear regression model that is robust to outliers.\n RANSACRegressor : RANSAC (RANdom SAmple Consensus) algorithm.\n SGDRegressor : Fitted by minimizing a regularized empirical loss with SGD.\n\n References\n ----------\n - Theil-Sen Estimators in a Multiple Linear Regression Model, 2009\n Xin Dang, Hanxiang Peng, Xueqin Wang and Heping Zhang\n http://home.olemiss.edu/~xdang/papers/MTSE.pdf\n\n Examples\n --------\n >>> from sklearn.linear_model import TheilSenRegressor\n >>> from sklearn.datasets import make_regression\n >>> X, y = make_regression(\n ... 
n_samples=200, n_features=2, noise=4.0, random_state=0)\n >>> reg = TheilSenRegressor(random_state=0).fit(X, y)\n >>> reg.score(X, y)\n 0.9884...\n >>> reg.predict(X[:1,])\n array([-31.5871...])\n ", - "source_code": "\n\nclass TheilSenRegressor(RegressorMixin, LinearModel):\n \"\"\"Theil-Sen Estimator: robust multivariate regression model.\n\n The algorithm calculates least square solutions on subsets with size\n n_subsamples of the samples in X. Any value of n_subsamples between the\n number of features and samples leads to an estimator with a compromise\n between robustness and efficiency. Since the number of least square\n solutions is \"n_samples choose n_subsamples\", it can be extremely large\n and can therefore be limited with max_subpopulation. If this limit is\n reached, the subsets are chosen randomly. In a final step, the spatial\n median (or L1 median) is calculated of all least square solutions.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n fit_intercept : bool, default=True\n Whether to calculate the intercept for this model. If set\n to false, no intercept will be used in calculations.\n\n copy_X : bool, default=True\n If True, X will be copied; else, it may be overwritten.\n\n max_subpopulation : int, default=1e4\n Instead of computing with a set of cardinality 'n choose k', where n is\n the number of samples and k is the number of subsamples (at least\n number of features), consider only a stochastic subpopulation of a\n given maximal size if 'n choose k' is larger than max_subpopulation.\n For other than small problem sizes this parameter will determine\n memory usage and runtime if n_subsamples is not changed.\n\n n_subsamples : int, default=None\n Number of samples to calculate the parameters. This is at least the\n number of features (plus 1 if fit_intercept=True) and the number of\n samples as a maximum. A lower number leads to a higher breakdown\n point and a low efficiency while a high number leads to a low\n breakdown point and a high efficiency. If None, take the\n minimum number of subsamples leading to maximal robustness.\n If n_subsamples is set to n_samples, Theil-Sen is identical to least\n squares.\n\n max_iter : int, default=300\n Maximum number of iterations for the calculation of spatial median.\n\n tol : float, default=1.e-3\n Tolerance when calculating spatial median.\n\n random_state : int, RandomState instance or None, default=None\n A random number generator instance to define the state of the random\n permutations generator. Pass an int for reproducible output across\n multiple function calls.\n See :term:`Glossary `.\n\n n_jobs : int, default=None\n Number of CPUs to use during the cross validation.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n verbose : bool, default=False\n Verbose mode when fitting the model.\n\n Attributes\n ----------\n coef_ : ndarray of shape (n_features,)\n Coefficients of the regression model (median of distribution).\n\n intercept_ : float\n Estimated intercept of regression model.\n\n breakdown_ : float\n Approximated breakdown point.\n\n n_iter_ : int\n Number of iterations needed for the spatial median.\n\n n_subpopulation_ : int\n Number of combinations taken into account from 'n choose k', where n is\n the number of samples and k is the number of subsamples.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. 
versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n HuberRegressor : Linear regression model that is robust to outliers.\n RANSACRegressor : RANSAC (RANdom SAmple Consensus) algorithm.\n SGDRegressor : Fitted by minimizing a regularized empirical loss with SGD.\n\n References\n ----------\n - Theil-Sen Estimators in a Multiple Linear Regression Model, 2009\n Xin Dang, Hanxiang Peng, Xueqin Wang and Heping Zhang\n http://home.olemiss.edu/~xdang/papers/MTSE.pdf\n\n Examples\n --------\n >>> from sklearn.linear_model import TheilSenRegressor\n >>> from sklearn.datasets import make_regression\n >>> X, y = make_regression(\n ... n_samples=200, n_features=2, noise=4.0, random_state=0)\n >>> reg = TheilSenRegressor(random_state=0).fit(X, y)\n >>> reg.score(X, y)\n 0.9884...\n >>> reg.predict(X[:1,])\n array([-31.5871...])\n \"\"\"\n \n def __init__(self, *, fit_intercept=True, copy_X=True, max_subpopulation=10000.0, n_subsamples=None, max_iter=300, tol=0.001, random_state=None, n_jobs=None, verbose=False):\n self.fit_intercept = fit_intercept\n self.copy_X = copy_X\n self.max_subpopulation = int(max_subpopulation)\n self.n_subsamples = n_subsamples\n self.max_iter = max_iter\n self.tol = tol\n self.random_state = random_state\n self.n_jobs = n_jobs\n self.verbose = verbose\n \n def _check_subparams(self, n_samples, n_features):\n n_subsamples = self.n_subsamples\n if self.fit_intercept:\n n_dim = n_features + 1\n else:\n n_dim = n_features\n if n_subsamples is not None:\n if n_subsamples > n_samples:\n raise ValueError('Invalid parameter since n_subsamples > n_samples ({0} > {1}).'.format(n_subsamples, n_samples))\n if n_samples >= n_features:\n if n_dim > n_subsamples:\n plus_1 = '+1' if self.fit_intercept else ''\n raise ValueError('Invalid parameter since n_features{0} > n_subsamples ({1} > {2}).'.format(plus_1, n_dim, n_samples))\n elif n_subsamples != n_samples:\n raise ValueError('Invalid parameter since n_subsamples != n_samples ({0} != {1}) while n_samples < n_features.'.format(n_subsamples, n_samples))\n else:\n n_subsamples = min(n_dim, n_samples)\n if self.max_subpopulation <= 0:\n raise ValueError('Subpopulation must be strictly positive ({0} <= 0).'.format(self.max_subpopulation))\n all_combinations = max(1, np.rint(binom(n_samples, n_subsamples)))\n n_subpopulation = int(min(self.max_subpopulation, all_combinations))\n return n_subsamples, n_subpopulation\n \n def fit(self, X, y):\n \"\"\"Fit linear model.\n\n Parameters\n ----------\n X : ndarray of shape (n_samples, n_features)\n Training data.\n y : ndarray of shape (n_samples,)\n Target values.\n\n Returns\n -------\n self : returns an instance of self.\n Fitted `TheilSenRegressor` estimator.\n \"\"\"\n random_state = check_random_state(self.random_state)\n (X, y) = self._validate_data(X, y, y_numeric=True)\n (n_samples, n_features) = X.shape\n (n_subsamples, self.n_subpopulation_) = self._check_subparams(n_samples, n_features)\n self.breakdown_ = _breakdown_point(n_samples, n_subsamples)\n if self.verbose:\n print('Breakdown point: {0}'.format(self.breakdown_))\n print('Number of samples: {0}'.format(n_samples))\n tol_outliers = int(self.breakdown_ * n_samples)\n print('Tolerable outliers: {0}'.format(tol_outliers))\n print('Number of subpopulations: {0}'.format(self.n_subpopulation_))\n if np.rint(binom(n_samples, n_subsamples)) <= 
self.max_subpopulation:\n indices = list(combinations(range(n_samples), n_subsamples))\n else:\n indices = [random_state.choice(n_samples, size=n_subsamples, replace=False) for _ in range(self.n_subpopulation_)]\n n_jobs = effective_n_jobs(self.n_jobs)\n index_list = np.array_split(indices, n_jobs)\n weights = Parallel(n_jobs=n_jobs, verbose=self.verbose)((delayed(_lstsq)(X, y, index_list[job], self.fit_intercept) for job in range(n_jobs)))\n weights = np.vstack(weights)\n (self.n_iter_, coefs) = _spatial_median(weights, max_iter=self.max_iter, tol=self.tol)\n if self.fit_intercept:\n self.intercept_ = coefs[0]\n self.coef_ = coefs[1:]\n else:\n self.intercept_ = 0.0\n self.coef_ = coefs\n return self\n" + "description": "Theil-Sen Estimator: robust multivariate regression model.\n\nThe algorithm calculates least square solutions on subsets with size\nn_subsamples of the samples in X. Any value of n_subsamples between the\nnumber of features and samples leads to an estimator with a compromise\nbetween robustness and efficiency. Since the number of least square\nsolutions is \"n_samples choose n_subsamples\", it can be extremely large\nand can therefore be limited with max_subpopulation. If this limit is\nreached, the subsets are chosen randomly. In a final step, the spatial\nmedian (or L1 median) is calculated of all least square solutions.\n\nRead more in the :ref:`User Guide `.", + "docstring": "Theil-Sen Estimator: robust multivariate regression model.\n\n The algorithm calculates least square solutions on subsets with size\n n_subsamples of the samples in X. Any value of n_subsamples between the\n number of features and samples leads to an estimator with a compromise\n between robustness and efficiency. Since the number of least square\n solutions is \"n_samples choose n_subsamples\", it can be extremely large\n and can therefore be limited with max_subpopulation. If this limit is\n reached, the subsets are chosen randomly. In a final step, the spatial\n median (or L1 median) is calculated of all least square solutions.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n fit_intercept : bool, default=True\n Whether to calculate the intercept for this model. If set\n to false, no intercept will be used in calculations.\n\n copy_X : bool, default=True\n If True, X will be copied; else, it may be overwritten.\n\n max_subpopulation : int, default=1e4\n Instead of computing with a set of cardinality 'n choose k', where n is\n the number of samples and k is the number of subsamples (at least\n number of features), consider only a stochastic subpopulation of a\n given maximal size if 'n choose k' is larger than max_subpopulation.\n For other than small problem sizes this parameter will determine\n memory usage and runtime if n_subsamples is not changed.\n\n n_subsamples : int, default=None\n Number of samples to calculate the parameters. This is at least the\n number of features (plus 1 if fit_intercept=True) and the number of\n samples as a maximum. A lower number leads to a higher breakdown\n point and a low efficiency while a high number leads to a low\n breakdown point and a high efficiency. 
If None, take the\n minimum number of subsamples leading to maximal robustness.\n If n_subsamples is set to n_samples, Theil-Sen is identical to least\n squares.\n\n max_iter : int, default=300\n Maximum number of iterations for the calculation of spatial median.\n\n tol : float, default=1e-3\n Tolerance when calculating spatial median.\n\n random_state : int, RandomState instance or None, default=None\n A random number generator instance to define the state of the random\n permutations generator. Pass an int for reproducible output across\n multiple function calls.\n See :term:`Glossary `.\n\n n_jobs : int, default=None\n Number of CPUs to use during the cross validation.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n verbose : bool, default=False\n Verbose mode when fitting the model.\n\n Attributes\n ----------\n coef_ : ndarray of shape (n_features,)\n Coefficients of the regression model (median of distribution).\n\n intercept_ : float\n Estimated intercept of regression model.\n\n breakdown_ : float\n Approximated breakdown point.\n\n n_iter_ : int\n Number of iterations needed for the spatial median.\n\n n_subpopulation_ : int\n Number of combinations taken into account from 'n choose k', where n is\n the number of samples and k is the number of subsamples.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n HuberRegressor : Linear regression model that is robust to outliers.\n RANSACRegressor : RANSAC (RANdom SAmple Consensus) algorithm.\n SGDRegressor : Fitted by minimizing a regularized empirical loss with SGD.\n\n References\n ----------\n - Theil-Sen Estimators in a Multiple Linear Regression Model, 2009\n Xin Dang, Hanxiang Peng, Xueqin Wang and Heping Zhang\n http://home.olemiss.edu/~xdang/papers/MTSE.pdf\n\n Examples\n --------\n >>> from sklearn.linear_model import TheilSenRegressor\n >>> from sklearn.datasets import make_regression\n >>> X, y = make_regression(\n ... n_samples=200, n_features=2, noise=4.0, random_state=0)\n >>> reg = TheilSenRegressor(random_state=0).fit(X, y)\n >>> reg.score(X, y)\n 0.9884...\n >>> reg.predict(X[:1,])\n array([-31.5871...])\n ", + "source_code": "\n\nclass TheilSenRegressor(RegressorMixin, LinearModel):\n \"\"\"Theil-Sen Estimator: robust multivariate regression model.\n\n The algorithm calculates least square solutions on subsets with size\n n_subsamples of the samples in X. Any value of n_subsamples between the\n number of features and samples leads to an estimator with a compromise\n between robustness and efficiency. Since the number of least square\n solutions is \"n_samples choose n_subsamples\", it can be extremely large\n and can therefore be limited with max_subpopulation. If this limit is\n reached, the subsets are chosen randomly. In a final step, the spatial\n median (or L1 median) is calculated of all least square solutions.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n fit_intercept : bool, default=True\n Whether to calculate the intercept for this model. 
If set\n to false, no intercept will be used in calculations.\n\n copy_X : bool, default=True\n If True, X will be copied; else, it may be overwritten.\n\n max_subpopulation : int, default=1e4\n Instead of computing with a set of cardinality 'n choose k', where n is\n the number of samples and k is the number of subsamples (at least\n number of features), consider only a stochastic subpopulation of a\n given maximal size if 'n choose k' is larger than max_subpopulation.\n For other than small problem sizes this parameter will determine\n memory usage and runtime if n_subsamples is not changed.\n\n n_subsamples : int, default=None\n Number of samples to calculate the parameters. This is at least the\n number of features (plus 1 if fit_intercept=True) and the number of\n samples as a maximum. A lower number leads to a higher breakdown\n point and a low efficiency while a high number leads to a low\n breakdown point and a high efficiency. If None, take the\n minimum number of subsamples leading to maximal robustness.\n If n_subsamples is set to n_samples, Theil-Sen is identical to least\n squares.\n\n max_iter : int, default=300\n Maximum number of iterations for the calculation of spatial median.\n\n tol : float, default=1e-3\n Tolerance when calculating spatial median.\n\n random_state : int, RandomState instance or None, default=None\n A random number generator instance to define the state of the random\n permutations generator. Pass an int for reproducible output across\n multiple function calls.\n See :term:`Glossary `.\n\n n_jobs : int, default=None\n Number of CPUs to use during the cross validation.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n verbose : bool, default=False\n Verbose mode when fitting the model.\n\n Attributes\n ----------\n coef_ : ndarray of shape (n_features,)\n Coefficients of the regression model (median of distribution).\n\n intercept_ : float\n Estimated intercept of regression model.\n\n breakdown_ : float\n Approximated breakdown point.\n\n n_iter_ : int\n Number of iterations needed for the spatial median.\n\n n_subpopulation_ : int\n Number of combinations taken into account from 'n choose k', where n is\n the number of samples and k is the number of subsamples.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n HuberRegressor : Linear regression model that is robust to outliers.\n RANSACRegressor : RANSAC (RANdom SAmple Consensus) algorithm.\n SGDRegressor : Fitted by minimizing a regularized empirical loss with SGD.\n\n References\n ----------\n - Theil-Sen Estimators in a Multiple Linear Regression Model, 2009\n Xin Dang, Hanxiang Peng, Xueqin Wang and Heping Zhang\n http://home.olemiss.edu/~xdang/papers/MTSE.pdf\n\n Examples\n --------\n >>> from sklearn.linear_model import TheilSenRegressor\n >>> from sklearn.datasets import make_regression\n >>> X, y = make_regression(\n ... 
n_samples=200, n_features=2, noise=4.0, random_state=0)\n >>> reg = TheilSenRegressor(random_state=0).fit(X, y)\n >>> reg.score(X, y)\n 0.9884...\n >>> reg.predict(X[:1,])\n array([-31.5871...])\n \"\"\"\n \n def __init__(self, *, fit_intercept=True, copy_X=True, max_subpopulation=10000.0, n_subsamples=None, max_iter=300, tol=0.001, random_state=None, n_jobs=None, verbose=False):\n self.fit_intercept = fit_intercept\n self.copy_X = copy_X\n self.max_subpopulation = int(max_subpopulation)\n self.n_subsamples = n_subsamples\n self.max_iter = max_iter\n self.tol = tol\n self.random_state = random_state\n self.n_jobs = n_jobs\n self.verbose = verbose\n \n def _check_subparams(self, n_samples, n_features):\n n_subsamples = self.n_subsamples\n if self.fit_intercept:\n n_dim = n_features + 1\n else:\n n_dim = n_features\n if n_subsamples is not None:\n if n_subsamples > n_samples:\n raise ValueError('Invalid parameter since n_subsamples > n_samples ({0} > {1}).'.format(n_subsamples, n_samples))\n if n_samples >= n_features:\n if n_dim > n_subsamples:\n plus_1 = '+1' if self.fit_intercept else ''\n raise ValueError('Invalid parameter since n_features{0} > n_subsamples ({1} > {2}).'.format(plus_1, n_dim, n_samples))\n elif n_subsamples != n_samples:\n raise ValueError('Invalid parameter since n_subsamples != n_samples ({0} != {1}) while n_samples < n_features.'.format(n_subsamples, n_samples))\n else:\n n_subsamples = min(n_dim, n_samples)\n if self.max_subpopulation <= 0:\n raise ValueError('Subpopulation must be strictly positive ({0} <= 0).'.format(self.max_subpopulation))\n all_combinations = max(1, np.rint(binom(n_samples, n_subsamples)))\n n_subpopulation = int(min(self.max_subpopulation, all_combinations))\n return n_subsamples, n_subpopulation\n \n def fit(self, X, y):\n \"\"\"Fit linear model.\n\n Parameters\n ----------\n X : ndarray of shape (n_samples, n_features)\n Training data.\n y : ndarray of shape (n_samples,)\n Target values.\n\n Returns\n -------\n self : returns an instance of self.\n Fitted `TheilSenRegressor` estimator.\n \"\"\"\n random_state = check_random_state(self.random_state)\n (X, y) = self._validate_data(X, y, y_numeric=True)\n (n_samples, n_features) = X.shape\n (n_subsamples, self.n_subpopulation_) = self._check_subparams(n_samples, n_features)\n self.breakdown_ = _breakdown_point(n_samples, n_subsamples)\n if self.verbose:\n print('Breakdown point: {0}'.format(self.breakdown_))\n print('Number of samples: {0}'.format(n_samples))\n tol_outliers = int(self.breakdown_ * n_samples)\n print('Tolerable outliers: {0}'.format(tol_outliers))\n print('Number of subpopulations: {0}'.format(self.n_subpopulation_))\n if np.rint(binom(n_samples, n_subsamples)) <= self.max_subpopulation:\n indices = list(combinations(range(n_samples), n_subsamples))\n else:\n indices = [random_state.choice(n_samples, size=n_subsamples, replace=False) for _ in range(self.n_subpopulation_)]\n n_jobs = effective_n_jobs(self.n_jobs)\n index_list = np.array_split(indices, n_jobs)\n weights = Parallel(n_jobs=n_jobs, verbose=self.verbose)((delayed(_lstsq)(X, y, index_list[job], self.fit_intercept) for job in range(n_jobs)))\n weights = np.vstack(weights)\n (self.n_iter_, coefs) = _spatial_median(weights, max_iter=self.max_iter, tol=self.tol)\n if self.fit_intercept:\n self.intercept_ = coefs[0]\n self.coef_ = coefs[1:]\n else:\n self.intercept_ = 0.0\n self.coef_ = coefs\n return self\n" }, { "name": "Isomap", @@ -24240,9 +24322,9 @@ "sklearn.manifold._isomap.Isomap.transform" ], "is_public": true, - 
"description": "Isomap Embedding.\n\nNon-linear dimensionality reduction through Isometric Mapping Read more in the :ref:`User Guide `.", + "description": "Isomap Embedding.\n\nNon-linear dimensionality reduction through Isometric Mapping\n\nRead more in the :ref:`User Guide `.", "docstring": "Isomap Embedding.\n\n Non-linear dimensionality reduction through Isometric Mapping\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n n_neighbors : int, default=5\n Number of neighbors to consider for each point.\n\n n_components : int, default=2\n Number of coordinates for the manifold.\n\n eigen_solver : {'auto', 'arpack', 'dense'}, default='auto'\n 'auto' : Attempt to choose the most efficient solver\n for the given problem.\n\n 'arpack' : Use Arnoldi decomposition to find the eigenvalues\n and eigenvectors.\n\n 'dense' : Use a direct solver (i.e. LAPACK)\n for the eigenvalue decomposition.\n\n tol : float, default=0\n Convergence tolerance passed to arpack or lobpcg.\n not used if eigen_solver == 'dense'.\n\n max_iter : int, default=None\n Maximum number of iterations for the arpack solver.\n not used if eigen_solver == 'dense'.\n\n path_method : {'auto', 'FW', 'D'}, default='auto'\n Method to use in finding shortest path.\n\n 'auto' : attempt to choose the best algorithm automatically.\n\n 'FW' : Floyd-Warshall algorithm.\n\n 'D' : Dijkstra's algorithm.\n\n neighbors_algorithm : {'auto', 'brute', 'kd_tree', 'ball_tree'}, default='auto'\n Algorithm to use for nearest neighbors search,\n passed to neighbors.NearestNeighbors instance.\n\n n_jobs : int or None, default=None\n The number of parallel jobs to run.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n metric : str, or callable, default=\"minkowski\"\n The metric to use when calculating distance between instances in a\n feature array. If metric is a string or callable, it must be one of\n the options allowed by :func:`sklearn.metrics.pairwise_distances` for\n its metric parameter.\n If metric is \"precomputed\", X is assumed to be a distance matrix and\n must be square. X may be a :term:`Glossary `.\n\n .. versionadded:: 0.22\n\n p : int, default=2\n Parameter for the Minkowski metric from\n sklearn.metrics.pairwise.pairwise_distances. When p = 1, this is\n equivalent to using manhattan_distance (l1), and euclidean_distance\n (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.\n\n .. versionadded:: 0.22\n\n metric_params : dict, default=None\n Additional keyword arguments for the metric function.\n\n .. versionadded:: 0.22\n\n Attributes\n ----------\n embedding_ : array-like, shape (n_samples, n_components)\n Stores the embedding vectors.\n\n kernel_pca_ : object\n :class:`~sklearn.decomposition.KernelPCA` object used to implement the\n embedding.\n\n nbrs_ : sklearn.neighbors.NearestNeighbors instance\n Stores nearest neighbors instance, including BallTree or KDtree\n if applicable.\n\n dist_matrix_ : array-like, shape (n_samples, n_samples)\n Stores the geodesic distance matrix of training data.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. 
versionadded:: 1.0\n\n See Also\n --------\n sklearn.decomposition.PCA : Principal component analysis that is a linear\n dimensionality reduction method.\n sklearn.decomposition.KernelPCA : Non-linear dimensionality reduction using\n kernels and PCA.\n MDS : Manifold learning using multidimensional scaling.\n TSNE : T-distributed Stochastic Neighbor Embedding.\n LocallyLinearEmbedding : Manifold learning using Locally Linear Embedding.\n SpectralEmbedding : Spectral embedding for non-linear dimensionality.\n\n References\n ----------\n\n .. [1] Tenenbaum, J.B.; De Silva, V.; & Langford, J.C. A global geometric\n framework for nonlinear dimensionality reduction. Science 290 (5500)\n\n Examples\n --------\n >>> from sklearn.datasets import load_digits\n >>> from sklearn.manifold import Isomap\n >>> X, _ = load_digits(return_X_y=True)\n >>> X.shape\n (1797, 64)\n >>> embedding = Isomap(n_components=2)\n >>> X_transformed = embedding.fit_transform(X[:100])\n >>> X_transformed.shape\n (100, 2)\n ", - "source_code": "\n\nclass Isomap(TransformerMixin, BaseEstimator):\n \"\"\"Isomap Embedding.\n\n Non-linear dimensionality reduction through Isometric Mapping\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n n_neighbors : int, default=5\n Number of neighbors to consider for each point.\n\n n_components : int, default=2\n Number of coordinates for the manifold.\n\n eigen_solver : {'auto', 'arpack', 'dense'}, default='auto'\n 'auto' : Attempt to choose the most efficient solver\n for the given problem.\n\n 'arpack' : Use Arnoldi decomposition to find the eigenvalues\n and eigenvectors.\n\n 'dense' : Use a direct solver (i.e. LAPACK)\n for the eigenvalue decomposition.\n\n tol : float, default=0\n Convergence tolerance passed to arpack or lobpcg.\n not used if eigen_solver == 'dense'.\n\n max_iter : int, default=None\n Maximum number of iterations for the arpack solver.\n not used if eigen_solver == 'dense'.\n\n path_method : {'auto', 'FW', 'D'}, default='auto'\n Method to use in finding shortest path.\n\n 'auto' : attempt to choose the best algorithm automatically.\n\n 'FW' : Floyd-Warshall algorithm.\n\n 'D' : Dijkstra's algorithm.\n\n neighbors_algorithm : {'auto', 'brute', 'kd_tree', 'ball_tree'}, default='auto'\n Algorithm to use for nearest neighbors search,\n passed to neighbors.NearestNeighbors instance.\n\n n_jobs : int or None, default=None\n The number of parallel jobs to run.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n metric : str, or callable, default=\"minkowski\"\n The metric to use when calculating distance between instances in a\n feature array. If metric is a string or callable, it must be one of\n the options allowed by :func:`sklearn.metrics.pairwise_distances` for\n its metric parameter.\n If metric is \"precomputed\", X is assumed to be a distance matrix and\n must be square. X may be a :term:`Glossary `.\n\n .. versionadded:: 0.22\n\n p : int, default=2\n Parameter for the Minkowski metric from\n sklearn.metrics.pairwise.pairwise_distances. When p = 1, this is\n equivalent to using manhattan_distance (l1), and euclidean_distance\n (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.\n\n .. versionadded:: 0.22\n\n metric_params : dict, default=None\n Additional keyword arguments for the metric function.\n\n .. 
versionadded:: 0.22\n\n Attributes\n ----------\n embedding_ : array-like, shape (n_samples, n_components)\n Stores the embedding vectors.\n\n kernel_pca_ : object\n :class:`~sklearn.decomposition.KernelPCA` object used to implement the\n embedding.\n\n nbrs_ : sklearn.neighbors.NearestNeighbors instance\n Stores nearest neighbors instance, including BallTree or KDtree\n if applicable.\n\n dist_matrix_ : array-like, shape (n_samples, n_samples)\n Stores the geodesic distance matrix of training data.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n sklearn.decomposition.PCA : Principal component analysis that is a linear\n dimensionality reduction method.\n sklearn.decomposition.KernelPCA : Non-linear dimensionality reduction using\n kernels and PCA.\n MDS : Manifold learning using multidimensional scaling.\n TSNE : T-distributed Stochastic Neighbor Embedding.\n LocallyLinearEmbedding : Manifold learning using Locally Linear Embedding.\n SpectralEmbedding : Spectral embedding for non-linear dimensionality.\n\n References\n ----------\n\n .. [1] Tenenbaum, J.B.; De Silva, V.; & Langford, J.C. A global geometric\n framework for nonlinear dimensionality reduction. Science 290 (5500)\n\n Examples\n --------\n >>> from sklearn.datasets import load_digits\n >>> from sklearn.manifold import Isomap\n >>> X, _ = load_digits(return_X_y=True)\n >>> X.shape\n (1797, 64)\n >>> embedding = Isomap(n_components=2)\n >>> X_transformed = embedding.fit_transform(X[:100])\n >>> X_transformed.shape\n (100, 2)\n \"\"\"\n \n def __init__(self, *, n_neighbors=5, n_components=2, eigen_solver='auto', tol=0, max_iter=None, path_method='auto', neighbors_algorithm='auto', n_jobs=None, metric='minkowski', p=2, metric_params=None):\n self.n_neighbors = n_neighbors\n self.n_components = n_components\n self.eigen_solver = eigen_solver\n self.tol = tol\n self.max_iter = max_iter\n self.path_method = path_method\n self.neighbors_algorithm = neighbors_algorithm\n self.n_jobs = n_jobs\n self.metric = metric\n self.p = p\n self.metric_params = metric_params\n \n def _fit_transform(self, X):\n self.nbrs_ = NearestNeighbors(n_neighbors=self.n_neighbors, algorithm=self.neighbors_algorithm, metric=self.metric, p=self.p, metric_params=self.metric_params, n_jobs=self.n_jobs)\n self.nbrs_.fit(X)\n self.n_features_in_ = self.nbrs_.n_features_in_\n if hasattr(self.nbrs_, 'feature_names_in_'):\n self.feature_names_in_ = self.nbrs_.feature_names_in_\n self.kernel_pca_ = KernelPCA(n_components=self.n_components, kernel='precomputed', eigen_solver=self.eigen_solver, tol=self.tol, max_iter=self.max_iter, n_jobs=self.n_jobs)\n kng = kneighbors_graph(self.nbrs_, self.n_neighbors, metric=self.metric, p=self.p, metric_params=self.metric_params, mode='distance', n_jobs=self.n_jobs)\n (n_connected_components, labels) = connected_components(kng)\n if n_connected_components > 1:\n if self.metric == 'precomputed':\n raise RuntimeError(f\"The number of connected components of the neighbors graph is {n_connected_components} > 1. The graph cannot be completed with metric='precomputed', and Isomap cannot befitted. Increase the number of neighbors to avoid this issue.\")\n warnings.warn(f'The number of connected components of the neighbors graph is {n_connected_components} > 1. 
Completing the graph to fit Isomap might be slow. Increase the number of neighbors to avoid this issue.', stacklevel=2)\n kng = _fix_connected_components(X=self.nbrs_._fit_X, graph=kng, n_connected_components=n_connected_components, component_labels=labels, mode='distance', metric=self.nbrs_.effective_metric_, **self.nbrs_.effective_metric_params_)\n if parse_version(scipy.__version__) < parse_version('1.3.2'):\n kng.data += 1e-15\n self.dist_matrix_ = shortest_path(kng, method=self.path_method, directed=False)\n G = self.dist_matrix_**2\n G *= -0.5\n self.embedding_ = self.kernel_pca_.fit_transform(G)\n \n def reconstruction_error(self):\n \"\"\"Compute the reconstruction error for the embedding.\n\n Returns\n -------\n reconstruction_error : float\n Reconstruction error.\n\n Notes\n -----\n The cost function of an isomap embedding is\n\n ``E = frobenius_norm[K(D) - K(D_fit)] / n_samples``\n\n Where D is the matrix of distances for the input data X,\n D_fit is the matrix of distances for the output embedding X_fit,\n and K is the isomap kernel:\n\n ``K(D) = -0.5 * (I - 1/n_samples) * D^2 * (I - 1/n_samples)``\n \"\"\"\n G = -0.5 * self.dist_matrix_**2\n G_center = KernelCenterer().fit_transform(G)\n evals = self.kernel_pca_.eigenvalues_\n return np.sqrt(np.sum(G_center**2) - np.sum(evals**2)) / G.shape[0]\n \n def fit(self, X, y=None):\n \"\"\"Compute the embedding vectors for data X.\n\n Parameters\n ----------\n X : {array-like, sparse graph, BallTree, KDTree, NearestNeighbors}\n Sample data, shape = (n_samples, n_features), in the form of a\n numpy array, sparse graph, precomputed tree, or NearestNeighbors\n object.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n Returns\n -------\n self : object\n Returns a fitted instance of self.\n \"\"\"\n self._fit_transform(X)\n return self\n \n def fit_transform(self, X, y=None):\n \"\"\"Fit the model from data in X and transform X.\n\n Parameters\n ----------\n X : {array-like, sparse graph, BallTree, KDTree}\n Training vector, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n Returns\n -------\n X_new : array-like, shape (n_samples, n_components)\n X transformed in the new space.\n \"\"\"\n self._fit_transform(X)\n return self.embedding_\n \n def transform(self, X):\n \"\"\"Transform X.\n\n This is implemented by linking the points X into the graph of geodesic\n distances of the training data. 
First the `n_neighbors` nearest\n neighbors of X are found in the training data, and from these the\n shortest geodesic distances from each point in X to each point in\n the training data are computed in order to construct the kernel.\n The embedding of X is the projection of this kernel onto the\n embedding vectors of the training set.\n\n Parameters\n ----------\n X : array-like, shape (n_queries, n_features)\n If neighbors_algorithm='precomputed', X is assumed to be a\n distance matrix or a sparse graph of shape\n (n_queries, n_samples_fit).\n\n Returns\n -------\n X_new : array-like, shape (n_queries, n_components)\n X transformed in the new space.\n \"\"\"\n check_is_fitted(self)\n (distances, indices) = self.nbrs_.kneighbors(X, return_distance=True)\n n_samples_fit = self.nbrs_.n_samples_fit_\n n_queries = distances.shape[0]\n G_X = np.zeros((n_queries, n_samples_fit))\n for i in range(n_queries):\n G_X[i] = np.min(self.dist_matrix_[indices[i]] + distances[i][:, None], 0)\n G_X **= 2\n G_X *= -0.5\n return self.kernel_pca_.transform(G_X)\n" + "source_code": "\n\nclass Isomap(TransformerMixin, BaseEstimator):\n \"\"\"Isomap Embedding.\n\n Non-linear dimensionality reduction through Isometric Mapping\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n n_neighbors : int, default=5\n Number of neighbors to consider for each point.\n\n n_components : int, default=2\n Number of coordinates for the manifold.\n\n eigen_solver : {'auto', 'arpack', 'dense'}, default='auto'\n 'auto' : Attempt to choose the most efficient solver\n for the given problem.\n\n 'arpack' : Use Arnoldi decomposition to find the eigenvalues\n and eigenvectors.\n\n 'dense' : Use a direct solver (i.e. LAPACK)\n for the eigenvalue decomposition.\n\n tol : float, default=0\n Convergence tolerance passed to arpack or lobpcg.\n not used if eigen_solver == 'dense'.\n\n max_iter : int, default=None\n Maximum number of iterations for the arpack solver.\n not used if eigen_solver == 'dense'.\n\n path_method : {'auto', 'FW', 'D'}, default='auto'\n Method to use in finding shortest path.\n\n 'auto' : attempt to choose the best algorithm automatically.\n\n 'FW' : Floyd-Warshall algorithm.\n\n 'D' : Dijkstra's algorithm.\n\n neighbors_algorithm : {'auto', 'brute', 'kd_tree', 'ball_tree'}, default='auto'\n Algorithm to use for nearest neighbors search,\n passed to neighbors.NearestNeighbors instance.\n\n n_jobs : int or None, default=None\n The number of parallel jobs to run.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n metric : str, or callable, default=\"minkowski\"\n The metric to use when calculating distance between instances in a\n feature array. If metric is a string or callable, it must be one of\n the options allowed by :func:`sklearn.metrics.pairwise_distances` for\n its metric parameter.\n If metric is \"precomputed\", X is assumed to be a distance matrix and\n must be square. X may be a :term:`Glossary `.\n\n .. versionadded:: 0.22\n\n p : int, default=2\n Parameter for the Minkowski metric from\n sklearn.metrics.pairwise.pairwise_distances. When p = 1, this is\n equivalent to using manhattan_distance (l1), and euclidean_distance\n (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.\n\n .. versionadded:: 0.22\n\n metric_params : dict, default=None\n Additional keyword arguments for the metric function.\n\n .. 
versionadded:: 0.22\n\n Attributes\n ----------\n embedding_ : array-like, shape (n_samples, n_components)\n Stores the embedding vectors.\n\n kernel_pca_ : object\n :class:`~sklearn.decomposition.KernelPCA` object used to implement the\n embedding.\n\n nbrs_ : sklearn.neighbors.NearestNeighbors instance\n Stores nearest neighbors instance, including BallTree or KDtree\n if applicable.\n\n dist_matrix_ : array-like, shape (n_samples, n_samples)\n Stores the geodesic distance matrix of training data.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n sklearn.decomposition.PCA : Principal component analysis that is a linear\n dimensionality reduction method.\n sklearn.decomposition.KernelPCA : Non-linear dimensionality reduction using\n kernels and PCA.\n MDS : Manifold learning using multidimensional scaling.\n TSNE : T-distributed Stochastic Neighbor Embedding.\n LocallyLinearEmbedding : Manifold learning using Locally Linear Embedding.\n SpectralEmbedding : Spectral embedding for non-linear dimensionality.\n\n References\n ----------\n\n .. [1] Tenenbaum, J.B.; De Silva, V.; & Langford, J.C. A global geometric\n framework for nonlinear dimensionality reduction. Science 290 (5500)\n\n Examples\n --------\n >>> from sklearn.datasets import load_digits\n >>> from sklearn.manifold import Isomap\n >>> X, _ = load_digits(return_X_y=True)\n >>> X.shape\n (1797, 64)\n >>> embedding = Isomap(n_components=2)\n >>> X_transformed = embedding.fit_transform(X[:100])\n >>> X_transformed.shape\n (100, 2)\n \"\"\"\n \n def __init__(self, *, n_neighbors=5, n_components=2, eigen_solver='auto', tol=0, max_iter=None, path_method='auto', neighbors_algorithm='auto', n_jobs=None, metric='minkowski', p=2, metric_params=None):\n self.n_neighbors = n_neighbors\n self.n_components = n_components\n self.eigen_solver = eigen_solver\n self.tol = tol\n self.max_iter = max_iter\n self.path_method = path_method\n self.neighbors_algorithm = neighbors_algorithm\n self.n_jobs = n_jobs\n self.metric = metric\n self.p = p\n self.metric_params = metric_params\n \n def _fit_transform(self, X):\n self.nbrs_ = NearestNeighbors(n_neighbors=self.n_neighbors, algorithm=self.neighbors_algorithm, metric=self.metric, p=self.p, metric_params=self.metric_params, n_jobs=self.n_jobs)\n self.nbrs_.fit(X)\n self.n_features_in_ = self.nbrs_.n_features_in_\n if hasattr(self.nbrs_, 'feature_names_in_'):\n self.feature_names_in_ = self.nbrs_.feature_names_in_\n self.kernel_pca_ = KernelPCA(n_components=self.n_components, kernel='precomputed', eigen_solver=self.eigen_solver, tol=self.tol, max_iter=self.max_iter, n_jobs=self.n_jobs)\n kng = kneighbors_graph(self.nbrs_, self.n_neighbors, metric=self.metric, p=self.p, metric_params=self.metric_params, mode='distance', n_jobs=self.n_jobs)\n (n_connected_components, labels) = connected_components(kng)\n if n_connected_components > 1:\n if self.metric == 'precomputed' and issparse(X):\n raise RuntimeError(f\"The number of connected components of the neighbors graph is {n_connected_components} > 1. The graph cannot be completed with metric='precomputed', and Isomap cannot befitted. 
Increase the number of neighbors to avoid this issue, or precompute the full distance matrix instead of passing a sparse neighbors graph.\")\n warnings.warn(f'The number of connected components of the neighbors graph is {n_connected_components} > 1. Completing the graph to fit Isomap might be slow. Increase the number of neighbors to avoid this issue.', stacklevel=2)\n kng = _fix_connected_components(X=self.nbrs_._fit_X, graph=kng, n_connected_components=n_connected_components, component_labels=labels, mode='distance', metric=self.nbrs_.effective_metric_, **self.nbrs_.effective_metric_params_)\n if parse_version(scipy.__version__) < parse_version('1.3.2'):\n kng.data += 1e-15\n self.dist_matrix_ = shortest_path(kng, method=self.path_method, directed=False)\n G = self.dist_matrix_**2\n G *= -0.5\n self.embedding_ = self.kernel_pca_.fit_transform(G)\n \n def reconstruction_error(self):\n \"\"\"Compute the reconstruction error for the embedding.\n\n Returns\n -------\n reconstruction_error : float\n Reconstruction error.\n\n Notes\n -----\n The cost function of an isomap embedding is\n\n ``E = frobenius_norm[K(D) - K(D_fit)] / n_samples``\n\n Where D is the matrix of distances for the input data X,\n D_fit is the matrix of distances for the output embedding X_fit,\n and K is the isomap kernel:\n\n ``K(D) = -0.5 * (I - 1/n_samples) * D^2 * (I - 1/n_samples)``\n \"\"\"\n G = -0.5 * self.dist_matrix_**2\n G_center = KernelCenterer().fit_transform(G)\n evals = self.kernel_pca_.eigenvalues_\n return np.sqrt(np.sum(G_center**2) - np.sum(evals**2)) / G.shape[0]\n \n def fit(self, X, y=None):\n \"\"\"Compute the embedding vectors for data X.\n\n Parameters\n ----------\n X : {array-like, sparse graph, BallTree, KDTree, NearestNeighbors}\n Sample data, shape = (n_samples, n_features), in the form of a\n numpy array, sparse graph, precomputed tree, or NearestNeighbors\n object.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n Returns\n -------\n self : object\n Returns a fitted instance of self.\n \"\"\"\n self._fit_transform(X)\n return self\n \n def fit_transform(self, X, y=None):\n \"\"\"Fit the model from data in X and transform X.\n\n Parameters\n ----------\n X : {array-like, sparse graph, BallTree, KDTree}\n Training vector, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n Returns\n -------\n X_new : array-like, shape (n_samples, n_components)\n X transformed in the new space.\n \"\"\"\n self._fit_transform(X)\n return self.embedding_\n \n def transform(self, X):\n \"\"\"Transform X.\n\n This is implemented by linking the points X into the graph of geodesic\n distances of the training data. 
First the `n_neighbors` nearest\n neighbors of X are found in the training data, and from these the\n shortest geodesic distances from each point in X to each point in\n the training data are computed in order to construct the kernel.\n The embedding of X is the projection of this kernel onto the\n embedding vectors of the training set.\n\n Parameters\n ----------\n X : array-like, shape (n_queries, n_features)\n If neighbors_algorithm='precomputed', X is assumed to be a\n distance matrix or a sparse graph of shape\n (n_queries, n_samples_fit).\n\n Returns\n -------\n X_new : array-like, shape (n_queries, n_components)\n X transformed in the new space.\n \"\"\"\n check_is_fitted(self)\n (distances, indices) = self.nbrs_.kneighbors(X, return_distance=True)\n n_samples_fit = self.nbrs_.n_samples_fit_\n n_queries = distances.shape[0]\n G_X = np.zeros((n_queries, n_samples_fit))\n for i in range(n_queries):\n G_X[i] = np.min(self.dist_matrix_[indices[i]] + distances[i][:, None], 0)\n G_X **= 2\n G_X *= -0.5\n return self.kernel_pca_.transform(G_X)\n" }, { "name": "LocallyLinearEmbedding", @@ -24296,9 +24378,9 @@ "sklearn.manifold._spectral_embedding.SpectralEmbedding.fit_transform" ], "is_public": true, - "description": "Spectral embedding for non-linear dimensionality reduction.\n\nForms an affinity matrix given by the specified function and applies spectral decomposition to the corresponding graph laplacian. The resulting transformation is given by the value of the eigenvectors for each data point. Note : Laplacian Eigenmaps is the actual algorithm implemented here. Read more in the :ref:`User Guide `.", - "docstring": "Spectral embedding for non-linear dimensionality reduction.\n\n Forms an affinity matrix given by the specified function and\n applies spectral decomposition to the corresponding graph laplacian.\n The resulting transformation is given by the value of the\n eigenvectors for each data point.\n\n Note : Laplacian Eigenmaps is the actual algorithm implemented here.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n n_components : int, default=2\n The dimension of the projected subspace.\n\n affinity : {'nearest_neighbors', 'rbf', 'precomputed', 'precomputed_nearest_neighbors'} or callable, default='nearest_neighbors'\n How to construct the affinity matrix.\n - 'nearest_neighbors' : construct the affinity matrix by computing a\n graph of nearest neighbors.\n - 'rbf' : construct the affinity matrix by computing a radial basis\n function (RBF) kernel.\n - 'precomputed' : interpret ``X`` as a precomputed affinity matrix.\n - 'precomputed_nearest_neighbors' : interpret ``X`` as a sparse graph\n of precomputed nearest neighbors, and constructs the affinity matrix\n by selecting the ``n_neighbors`` nearest neighbors.\n - callable : use passed in function as affinity\n the function takes in data matrix (n_samples, n_features)\n and return affinity matrix (n_samples, n_samples).\n\n gamma : float, default=None\n Kernel coefficient for rbf kernel. If None, gamma will be set to\n 1/n_features.\n\n random_state : int, RandomState instance or None, default=None\n A pseudo random number generator used for the initialization\n of the lobpcg eigen vectors decomposition when `eigen_solver ==\n 'amg'`, and for the K-Means initialization. Use an int to make\n the results deterministic across calls (See\n :term:`Glossary `).\n\n .. 
note::\n When using `eigen_solver == 'amg'`,\n it is necessary to also fix the global numpy seed with\n `np.random.seed(int)` to get deterministic results. See\n https://github.com/pyamg/pyamg/issues/139 for further\n information.\n\n eigen_solver : {'arpack', 'lobpcg', 'amg'}, default=None\n The eigenvalue decomposition strategy to use. AMG requires pyamg\n to be installed. It can be faster on very large, sparse problems.\n If None, then ``'arpack'`` is used.\n\n n_neighbors : int, default=None\n Number of nearest neighbors for nearest_neighbors graph building.\n If None, n_neighbors will be set to max(n_samples/10, 1).\n\n n_jobs : int, default=None\n The number of parallel jobs to run.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n Attributes\n ----------\n embedding_ : ndarray of shape (n_samples, n_components)\n Spectral embedding of the training matrix.\n\n affinity_matrix_ : ndarray of shape (n_samples, n_samples)\n Affinity_matrix constructed from samples or precomputed.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n n_neighbors_ : int\n Number of nearest neighbors effectively used.\n\n Examples\n --------\n >>> from sklearn.datasets import load_digits\n >>> from sklearn.manifold import SpectralEmbedding\n >>> X, _ = load_digits(return_X_y=True)\n >>> X.shape\n (1797, 64)\n >>> embedding = SpectralEmbedding(n_components=2)\n >>> X_transformed = embedding.fit_transform(X[:100])\n >>> X_transformed.shape\n (100, 2)\n\n References\n ----------\n\n - A Tutorial on Spectral Clustering, 2007\n Ulrike von Luxburg\n http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.165.9323\n\n - On Spectral Clustering: Analysis and an algorithm, 2001\n Andrew Y. Ng, Michael I. 
Jordan, Yair Weiss\n http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.19.8100\n\n - Normalized cuts and image segmentation, 2000\n Jianbo Shi, Jitendra Malik\n http://citeseer.ist.psu.edu/viewdoc/summary?doi=10.1.1.160.2324\n ", - "source_code": "\n\nclass SpectralEmbedding(BaseEstimator):\n \"\"\"Spectral embedding for non-linear dimensionality reduction.\n\n Forms an affinity matrix given by the specified function and\n applies spectral decomposition to the corresponding graph laplacian.\n The resulting transformation is given by the value of the\n eigenvectors for each data point.\n\n Note : Laplacian Eigenmaps is the actual algorithm implemented here.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n n_components : int, default=2\n The dimension of the projected subspace.\n\n affinity : {'nearest_neighbors', 'rbf', 'precomputed', 'precomputed_nearest_neighbors'} or callable, default='nearest_neighbors'\n How to construct the affinity matrix.\n - 'nearest_neighbors' : construct the affinity matrix by computing a\n graph of nearest neighbors.\n - 'rbf' : construct the affinity matrix by computing a radial basis\n function (RBF) kernel.\n - 'precomputed' : interpret ``X`` as a precomputed affinity matrix.\n - 'precomputed_nearest_neighbors' : interpret ``X`` as a sparse graph\n of precomputed nearest neighbors, and constructs the affinity matrix\n by selecting the ``n_neighbors`` nearest neighbors.\n - callable : use passed in function as affinity\n the function takes in data matrix (n_samples, n_features)\n and return affinity matrix (n_samples, n_samples).\n\n gamma : float, default=None\n Kernel coefficient for rbf kernel. If None, gamma will be set to\n 1/n_features.\n\n random_state : int, RandomState instance or None, default=None\n A pseudo random number generator used for the initialization\n of the lobpcg eigen vectors decomposition when `eigen_solver ==\n 'amg'`, and for the K-Means initialization. Use an int to make\n the results deterministic across calls (See\n :term:`Glossary `).\n\n .. note::\n When using `eigen_solver == 'amg'`,\n it is necessary to also fix the global numpy seed with\n `np.random.seed(int)` to get deterministic results. See\n https://github.com/pyamg/pyamg/issues/139 for further\n information.\n\n eigen_solver : {'arpack', 'lobpcg', 'amg'}, default=None\n The eigenvalue decomposition strategy to use. AMG requires pyamg\n to be installed. It can be faster on very large, sparse problems.\n If None, then ``'arpack'`` is used.\n\n n_neighbors : int, default=None\n Number of nearest neighbors for nearest_neighbors graph building.\n If None, n_neighbors will be set to max(n_samples/10, 1).\n\n n_jobs : int, default=None\n The number of parallel jobs to run.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n Attributes\n ----------\n embedding_ : ndarray of shape (n_samples, n_components)\n Spectral embedding of the training matrix.\n\n affinity_matrix_ : ndarray of shape (n_samples, n_samples)\n Affinity_matrix constructed from samples or precomputed.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. 
versionadded:: 1.0\n\n n_neighbors_ : int\n Number of nearest neighbors effectively used.\n\n Examples\n --------\n >>> from sklearn.datasets import load_digits\n >>> from sklearn.manifold import SpectralEmbedding\n >>> X, _ = load_digits(return_X_y=True)\n >>> X.shape\n (1797, 64)\n >>> embedding = SpectralEmbedding(n_components=2)\n >>> X_transformed = embedding.fit_transform(X[:100])\n >>> X_transformed.shape\n (100, 2)\n\n References\n ----------\n\n - A Tutorial on Spectral Clustering, 2007\n Ulrike von Luxburg\n http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.165.9323\n\n - On Spectral Clustering: Analysis and an algorithm, 2001\n Andrew Y. Ng, Michael I. Jordan, Yair Weiss\n http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.19.8100\n\n - Normalized cuts and image segmentation, 2000\n Jianbo Shi, Jitendra Malik\n http://citeseer.ist.psu.edu/viewdoc/summary?doi=10.1.1.160.2324\n \"\"\"\n \n def __init__(self, n_components=2, *, affinity='nearest_neighbors', gamma=None, random_state=None, eigen_solver=None, n_neighbors=None, n_jobs=None):\n self.n_components = n_components\n self.affinity = affinity\n self.gamma = gamma\n self.random_state = random_state\n self.eigen_solver = eigen_solver\n self.n_neighbors = n_neighbors\n self.n_jobs = n_jobs\n \n def _more_tags(self):\n return {'pairwise': self.affinity in ['precomputed', 'precomputed_nearest_neighbors']}\n \n @deprecated('Attribute `_pairwise` was deprecated in version 0.24 and will be removed in 1.1 (renaming of 0.26).')\n @property\n def _pairwise(self):\n return self.affinity in ['precomputed', 'precomputed_nearest_neighbors']\n \n def _get_affinity_matrix(self, X, Y=None):\n \"\"\"Calculate the affinity matrix from data\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training vector, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n If affinity is \"precomputed\"\n X : array-like of shape (n_samples, n_samples),\n Interpret X as precomputed adjacency graph computed from\n samples.\n\n Y: Ignored\n\n Returns\n -------\n affinity_matrix of shape (n_samples, n_samples)\n \"\"\"\n if self.affinity == 'precomputed':\n self.affinity_matrix_ = X\n return self.affinity_matrix_\n if self.affinity == 'precomputed_nearest_neighbors':\n estimator = NearestNeighbors(n_neighbors=self.n_neighbors, n_jobs=self.n_jobs, metric='precomputed').fit(X)\n connectivity = estimator.kneighbors_graph(X=X, mode='connectivity')\n self.affinity_matrix_ = 0.5 * (connectivity + connectivity.T)\n return self.affinity_matrix_\n if self.affinity == 'nearest_neighbors':\n if sparse.issparse(X):\n warnings.warn('Nearest neighbors affinity currently does not support sparse input, falling back to rbf affinity')\n self.affinity = 'rbf'\n else:\n self.n_neighbors_ = self.n_neighbors if self.n_neighbors is not None else max(int(X.shape[0] / 10), 1)\n self.affinity_matrix_ = kneighbors_graph(X, self.n_neighbors_, include_self=True, n_jobs=self.n_jobs)\n self.affinity_matrix_ = 0.5 * (self.affinity_matrix_ + self.affinity_matrix_.T)\n return self.affinity_matrix_\n if self.affinity == 'rbf':\n self.gamma_ = self.gamma if self.gamma is not None else 1.0 / X.shape[1]\n self.affinity_matrix_ = rbf_kernel(X, gamma=self.gamma_)\n return self.affinity_matrix_\n self.affinity_matrix_ = self.affinity(X)\n return self.affinity_matrix_\n \n def fit(self, X, y=None):\n \"\"\"Fit the model from data in X.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, 
n_features)\n Training vector, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n If affinity is \"precomputed\"\n X : {array-like, sparse matrix}, shape (n_samples, n_samples),\n Interpret X as precomputed adjacency graph computed from\n samples.\n\n y : Ignored\n\n Returns\n -------\n self : object\n Returns the instance itself.\n \"\"\"\n X = self._validate_data(X, accept_sparse='csr', ensure_min_samples=2, estimator=self)\n random_state = check_random_state(self.random_state)\n if isinstance(self.affinity, str):\n if self.affinity not in {'nearest_neighbors', 'rbf', 'precomputed', 'precomputed_nearest_neighbors'}:\n raise ValueError(\"%s is not a valid affinity. Expected 'precomputed', 'rbf', 'nearest_neighbors' or a callable.\" % self.affinity)\n elif not callable(self.affinity):\n raise ValueError(\"'affinity' is expected to be an affinity name or a callable. Got: %s\" % self.affinity)\n affinity_matrix = self._get_affinity_matrix(X)\n self.embedding_ = spectral_embedding(affinity_matrix, n_components=self.n_components, eigen_solver=self.eigen_solver, random_state=random_state)\n return self\n \n def fit_transform(self, X, y=None):\n \"\"\"Fit the model from data in X and transform X.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training vector, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n If affinity is \"precomputed\"\n X : {array-like, sparse matrix} of shape (n_samples, n_samples),\n Interpret X as precomputed adjacency graph computed from\n samples.\n\n y : Ignored\n\n Returns\n -------\n X_new : array-like of shape (n_samples, n_components)\n \"\"\"\n self.fit(X)\n return self.embedding_\n" + "description": "Spectral embedding for non-linear dimensionality reduction.\n\nForms an affinity matrix given by the specified function and\napplies spectral decomposition to the corresponding graph laplacian.\nThe resulting transformation is given by the value of the\neigenvectors for each data point.\n\nNote : Laplacian Eigenmaps is the actual algorithm implemented here.\n\nRead more in the :ref:`User Guide `.", + "docstring": "Spectral embedding for non-linear dimensionality reduction.\n\n Forms an affinity matrix given by the specified function and\n applies spectral decomposition to the corresponding graph laplacian.\n The resulting transformation is given by the value of the\n eigenvectors for each data point.\n\n Note : Laplacian Eigenmaps is the actual algorithm implemented here.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n n_components : int, default=2\n The dimension of the projected subspace.\n\n affinity : {'nearest_neighbors', 'rbf', 'precomputed', 'precomputed_nearest_neighbors'} or callable, default='nearest_neighbors'\n How to construct the affinity matrix.\n - 'nearest_neighbors' : construct the affinity matrix by computing a\n graph of nearest neighbors.\n - 'rbf' : construct the affinity matrix by computing a radial basis\n function (RBF) kernel.\n - 'precomputed' : interpret ``X`` as a precomputed affinity matrix.\n - 'precomputed_nearest_neighbors' : interpret ``X`` as a sparse graph\n of precomputed nearest neighbors, and constructs the affinity matrix\n by selecting the ``n_neighbors`` nearest neighbors.\n - callable : use passed in function as affinity\n the function takes in data matrix (n_samples, n_features)\n and return affinity matrix (n_samples, n_samples).\n\n gamma : float, default=None\n 
Kernel coefficient for rbf kernel. If None, gamma will be set to\n 1/n_features.\n\n random_state : int, RandomState instance or None, default=None\n A pseudo random number generator used for the initialization\n of the lobpcg eigen vectors decomposition when `eigen_solver ==\n 'amg'`, and for the K-Means initialization. Use an int to make\n the results deterministic across calls (See\n :term:`Glossary `).\n\n .. note::\n When using `eigen_solver == 'amg'`,\n it is necessary to also fix the global numpy seed with\n `np.random.seed(int)` to get deterministic results. See\n https://github.com/pyamg/pyamg/issues/139 for further\n information.\n\n eigen_solver : {'arpack', 'lobpcg', 'amg'}, default=None\n The eigenvalue decomposition strategy to use. AMG requires pyamg\n to be installed. It can be faster on very large, sparse problems.\n If None, then ``'arpack'`` is used.\n\n n_neighbors : int, default=None\n Number of nearest neighbors for nearest_neighbors graph building.\n If None, n_neighbors will be set to max(n_samples/10, 1).\n\n n_jobs : int, default=None\n The number of parallel jobs to run.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n Attributes\n ----------\n embedding_ : ndarray of shape (n_samples, n_components)\n Spectral embedding of the training matrix.\n\n affinity_matrix_ : ndarray of shape (n_samples, n_samples)\n Affinity_matrix constructed from samples or precomputed.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n n_neighbors_ : int\n Number of nearest neighbors effectively used.\n\n See Also\n --------\n Isomap : Non-linear dimensionality reduction through Isometric Mapping.\n\n References\n ----------\n\n - A Tutorial on Spectral Clustering, 2007\n Ulrike von Luxburg\n http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.165.9323\n\n - On Spectral Clustering: Analysis and an algorithm, 2001\n Andrew Y. Ng, Michael I. 
Jordan, Yair Weiss\n http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.19.8100\n\n - Normalized cuts and image segmentation, 2000\n Jianbo Shi, Jitendra Malik\n http://citeseer.ist.psu.edu/viewdoc/summary?doi=10.1.1.160.2324\n\n Examples\n --------\n >>> from sklearn.datasets import load_digits\n >>> from sklearn.manifold import SpectralEmbedding\n >>> X, _ = load_digits(return_X_y=True)\n >>> X.shape\n (1797, 64)\n >>> embedding = SpectralEmbedding(n_components=2)\n >>> X_transformed = embedding.fit_transform(X[:100])\n >>> X_transformed.shape\n (100, 2)\n ", + "source_code": "\n\nclass SpectralEmbedding(BaseEstimator):\n \"\"\"Spectral embedding for non-linear dimensionality reduction.\n\n Forms an affinity matrix given by the specified function and\n applies spectral decomposition to the corresponding graph laplacian.\n The resulting transformation is given by the value of the\n eigenvectors for each data point.\n\n Note : Laplacian Eigenmaps is the actual algorithm implemented here.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n n_components : int, default=2\n The dimension of the projected subspace.\n\n affinity : {'nearest_neighbors', 'rbf', 'precomputed', 'precomputed_nearest_neighbors'} or callable, default='nearest_neighbors'\n How to construct the affinity matrix.\n - 'nearest_neighbors' : construct the affinity matrix by computing a\n graph of nearest neighbors.\n - 'rbf' : construct the affinity matrix by computing a radial basis\n function (RBF) kernel.\n - 'precomputed' : interpret ``X`` as a precomputed affinity matrix.\n - 'precomputed_nearest_neighbors' : interpret ``X`` as a sparse graph\n of precomputed nearest neighbors, and constructs the affinity matrix\n by selecting the ``n_neighbors`` nearest neighbors.\n - callable : use passed in function as affinity\n the function takes in data matrix (n_samples, n_features)\n and return affinity matrix (n_samples, n_samples).\n\n gamma : float, default=None\n Kernel coefficient for rbf kernel. If None, gamma will be set to\n 1/n_features.\n\n random_state : int, RandomState instance or None, default=None\n A pseudo random number generator used for the initialization\n of the lobpcg eigen vectors decomposition when `eigen_solver ==\n 'amg'`, and for the K-Means initialization. Use an int to make\n the results deterministic across calls (See\n :term:`Glossary `).\n\n .. note::\n When using `eigen_solver == 'amg'`,\n it is necessary to also fix the global numpy seed with\n `np.random.seed(int)` to get deterministic results. See\n https://github.com/pyamg/pyamg/issues/139 for further\n information.\n\n eigen_solver : {'arpack', 'lobpcg', 'amg'}, default=None\n The eigenvalue decomposition strategy to use. AMG requires pyamg\n to be installed. It can be faster on very large, sparse problems.\n If None, then ``'arpack'`` is used.\n\n n_neighbors : int, default=None\n Number of nearest neighbors for nearest_neighbors graph building.\n If None, n_neighbors will be set to max(n_samples/10, 1).\n\n n_jobs : int, default=None\n The number of parallel jobs to run.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. 
See :term:`Glossary `\n for more details.\n\n Attributes\n ----------\n embedding_ : ndarray of shape (n_samples, n_components)\n Spectral embedding of the training matrix.\n\n affinity_matrix_ : ndarray of shape (n_samples, n_samples)\n Affinity_matrix constructed from samples or precomputed.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n n_neighbors_ : int\n Number of nearest neighbors effectively used.\n\n See Also\n --------\n Isomap : Non-linear dimensionality reduction through Isometric Mapping.\n\n References\n ----------\n\n - A Tutorial on Spectral Clustering, 2007\n Ulrike von Luxburg\n http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.165.9323\n\n - On Spectral Clustering: Analysis and an algorithm, 2001\n Andrew Y. Ng, Michael I. Jordan, Yair Weiss\n http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.19.8100\n\n - Normalized cuts and image segmentation, 2000\n Jianbo Shi, Jitendra Malik\n http://citeseer.ist.psu.edu/viewdoc/summary?doi=10.1.1.160.2324\n\n Examples\n --------\n >>> from sklearn.datasets import load_digits\n >>> from sklearn.manifold import SpectralEmbedding\n >>> X, _ = load_digits(return_X_y=True)\n >>> X.shape\n (1797, 64)\n >>> embedding = SpectralEmbedding(n_components=2)\n >>> X_transformed = embedding.fit_transform(X[:100])\n >>> X_transformed.shape\n (100, 2)\n \"\"\"\n \n def __init__(self, n_components=2, *, affinity='nearest_neighbors', gamma=None, random_state=None, eigen_solver=None, n_neighbors=None, n_jobs=None):\n self.n_components = n_components\n self.affinity = affinity\n self.gamma = gamma\n self.random_state = random_state\n self.eigen_solver = eigen_solver\n self.n_neighbors = n_neighbors\n self.n_jobs = n_jobs\n \n def _more_tags(self):\n return {'pairwise': self.affinity in ['precomputed', 'precomputed_nearest_neighbors']}\n \n @deprecated('Attribute `_pairwise` was deprecated in version 0.24 and will be removed in 1.1 (renaming of 0.26).')\n @property\n def _pairwise(self):\n return self.affinity in ['precomputed', 'precomputed_nearest_neighbors']\n \n def _get_affinity_matrix(self, X, Y=None):\n \"\"\"Calculate the affinity matrix from data\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training vector, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n If affinity is \"precomputed\"\n X : array-like of shape (n_samples, n_samples),\n Interpret X as precomputed adjacency graph computed from\n samples.\n\n Y: Ignored\n\n Returns\n -------\n affinity_matrix of shape (n_samples, n_samples)\n \"\"\"\n if self.affinity == 'precomputed':\n self.affinity_matrix_ = X\n return self.affinity_matrix_\n if self.affinity == 'precomputed_nearest_neighbors':\n estimator = NearestNeighbors(n_neighbors=self.n_neighbors, n_jobs=self.n_jobs, metric='precomputed').fit(X)\n connectivity = estimator.kneighbors_graph(X=X, mode='connectivity')\n self.affinity_matrix_ = 0.5 * (connectivity + connectivity.T)\n return self.affinity_matrix_\n if self.affinity == 'nearest_neighbors':\n if sparse.issparse(X):\n warnings.warn('Nearest neighbors affinity currently does not support sparse input, falling back to rbf affinity')\n self.affinity = 'rbf'\n else:\n self.n_neighbors_ = self.n_neighbors if self.n_neighbors is not None else 
max(int(X.shape[0] / 10), 1)\n self.affinity_matrix_ = kneighbors_graph(X, self.n_neighbors_, include_self=True, n_jobs=self.n_jobs)\n self.affinity_matrix_ = 0.5 * (self.affinity_matrix_ + self.affinity_matrix_.T)\n return self.affinity_matrix_\n if self.affinity == 'rbf':\n self.gamma_ = self.gamma if self.gamma is not None else 1.0 / X.shape[1]\n self.affinity_matrix_ = rbf_kernel(X, gamma=self.gamma_)\n return self.affinity_matrix_\n self.affinity_matrix_ = self.affinity(X)\n return self.affinity_matrix_\n \n def fit(self, X, y=None):\n \"\"\"Fit the model from data in X.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training vector, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n If affinity is \"precomputed\"\n X : {array-like, sparse matrix}, shape (n_samples, n_samples),\n Interpret X as precomputed adjacency graph computed from\n samples.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n Returns\n -------\n self : object\n Returns the instance itself.\n \"\"\"\n X = self._validate_data(X, accept_sparse='csr', ensure_min_samples=2, estimator=self)\n random_state = check_random_state(self.random_state)\n if isinstance(self.affinity, str):\n if self.affinity not in {'nearest_neighbors', 'rbf', 'precomputed', 'precomputed_nearest_neighbors'}:\n raise ValueError(\"%s is not a valid affinity. Expected 'precomputed', 'rbf', 'nearest_neighbors' or a callable.\" % self.affinity)\n elif not callable(self.affinity):\n raise ValueError(\"'affinity' is expected to be an affinity name or a callable. Got: %s\" % self.affinity)\n affinity_matrix = self._get_affinity_matrix(X)\n self.embedding_ = spectral_embedding(affinity_matrix, n_components=self.n_components, eigen_solver=self.eigen_solver, random_state=random_state)\n return self\n \n def fit_transform(self, X, y=None):\n \"\"\"Fit the model from data in X and transform X.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training vector, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n If affinity is \"precomputed\"\n X : {array-like, sparse matrix} of shape (n_samples, n_samples),\n Interpret X as precomputed adjacency graph computed from\n samples.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n Returns\n -------\n X_new : array-like of shape (n_samples, n_components)\n Spectral embedding of the training matrix.\n \"\"\"\n self.fit(X)\n return self.embedding_\n" }, { "name": "TSNE", @@ -24313,7 +24395,7 @@ "sklearn.manifold._t_sne.TSNE.fit" ], "is_public": true, - "description": "T-distributed Stochastic Neighbor Embedding.\n\nt-SNE [1] is a tool to visualize high-dimensional data. It converts similarities between data points to joint probabilities and tries to minimize the Kullback-Leibler divergence between the joint probabilities of the low-dimensional embedding and the high-dimensional data. t-SNE has a cost function that is not convex, i.e. with different initializations we can get different results. It is highly recommended to use another dimensionality reduction method (e.g. PCA for dense data or TruncatedSVD for sparse data) to reduce the number of dimensions to a reasonable amount (e.g. 50) if the number of features is very high. This will suppress some noise and speed up the computation of pairwise distances between samples. For more tips see Laurens van der Maaten's FAQ [2]. 
Read more in the :ref:`User Guide `.", + "description": "T-distributed Stochastic Neighbor Embedding.\n\nt-SNE [1] is a tool to visualize high-dimensional data. It converts\nsimilarities between data points to joint probabilities and tries\nto minimize the Kullback-Leibler divergence between the joint\nprobabilities of the low-dimensional embedding and the\nhigh-dimensional data. t-SNE has a cost function that is not convex,\ni.e. with different initializations we can get different results.\n\nIt is highly recommended to use another dimensionality reduction\nmethod (e.g. PCA for dense data or TruncatedSVD for sparse data)\nto reduce the number of dimensions to a reasonable amount (e.g. 50)\nif the number of features is very high. This will suppress some\nnoise and speed up the computation of pairwise distances between\nsamples. For more tips see Laurens van der Maaten's FAQ [2].\n\nRead more in the :ref:`User Guide `.", "docstring": "T-distributed Stochastic Neighbor Embedding.\n\n t-SNE [1] is a tool to visualize high-dimensional data. It converts\n similarities between data points to joint probabilities and tries\n to minimize the Kullback-Leibler divergence between the joint\n probabilities of the low-dimensional embedding and the\n high-dimensional data. t-SNE has a cost function that is not convex,\n i.e. with different initializations we can get different results.\n\n It is highly recommended to use another dimensionality reduction\n method (e.g. PCA for dense data or TruncatedSVD for sparse data)\n to reduce the number of dimensions to a reasonable amount (e.g. 50)\n if the number of features is very high. This will suppress some\n noise and speed up the computation of pairwise distances between\n samples. For more tips see Laurens van der Maaten's FAQ [2].\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n n_components : int, default=2\n Dimension of the embedded space.\n\n perplexity : float, default=30.0\n The perplexity is related to the number of nearest neighbors that\n is used in other manifold learning algorithms. Larger datasets\n usually require a larger perplexity. Consider selecting a value\n between 5 and 50. Different values can result in significantly\n different results.\n\n early_exaggeration : float, default=12.0\n Controls how tight natural clusters in the original space are in\n the embedded space and how much space will be between them. For\n larger values, the space between natural clusters will be larger\n in the embedded space. Again, the choice of this parameter is not\n very critical. If the cost function increases during initial\n optimization, the early exaggeration factor or the learning rate\n might be too high.\n\n learning_rate : float or 'auto', default=200.0\n The learning rate for t-SNE is usually in the range [10.0, 1000.0]. If\n the learning rate is too high, the data may look like a 'ball' with any\n point approximately equidistant from its nearest neighbours. If the\n learning rate is too low, most points may look compressed in a dense\n cloud with few outliers. If the cost function gets stuck in a bad local\n minimum increasing the learning rate may help.\n Note that many other t-SNE implementations (bhtsne, FIt-SNE, openTSNE,\n etc.) use a definition of learning_rate that is 4 times smaller than\n ours. So our learning_rate=200 corresponds to learning_rate=800 in\n those other implementations. 
The 'auto' option sets the learning_rate\n to `max(N / early_exaggeration / 4, 50)` where N is the sample size,\n following [4] and [5]. This will become default in 1.2.\n\n n_iter : int, default=1000\n Maximum number of iterations for the optimization. Should be at\n least 250.\n\n n_iter_without_progress : int, default=300\n Maximum number of iterations without progress before we abort the\n optimization, used after 250 initial iterations with early\n exaggeration. Note that progress is only checked every 50 iterations so\n this value is rounded to the next multiple of 50.\n\n .. versionadded:: 0.17\n parameter *n_iter_without_progress* to control stopping criteria.\n\n min_grad_norm : float, default=1e-7\n If the gradient norm is below this threshold, the optimization will\n be stopped.\n\n metric : str or callable, default='euclidean'\n The metric to use when calculating distance between instances in a\n feature array. If metric is a string, it must be one of the options\n allowed by scipy.spatial.distance.pdist for its metric parameter, or\n a metric listed in pairwise.PAIRWISE_DISTANCE_FUNCTIONS.\n If metric is \"precomputed\", X is assumed to be a distance matrix.\n Alternatively, if metric is a callable function, it is called on each\n pair of instances (rows) and the resulting value recorded. The callable\n should take two arrays from X as input and return a value indicating\n the distance between them. The default is \"euclidean\" which is\n interpreted as squared euclidean distance.\n\n init : {'random', 'pca'} or ndarray of shape (n_samples, n_components), default='random'\n Initialization of embedding. Possible options are 'random', 'pca',\n and a numpy array of shape (n_samples, n_components).\n PCA initialization cannot be used with precomputed distances and is\n usually more globally stable than random initialization. `init='pca'`\n will become default in 1.2.\n\n verbose : int, default=0\n Verbosity level.\n\n random_state : int, RandomState instance or None, default=None\n Determines the random number generator. Pass an int for reproducible\n results across multiple function calls. Note that different\n initializations might result in different local minima of the cost\n function. See :term:`Glossary `.\n\n method : str, default='barnes_hut'\n By default the gradient calculation algorithm uses Barnes-Hut\n approximation running in O(NlogN) time. method='exact'\n will run on the slower, but exact, algorithm in O(N^2) time. The\n exact algorithm should be used when nearest-neighbor errors need\n to be better than 3%. However, the exact method cannot scale to\n millions of examples.\n\n .. versionadded:: 0.17\n Approximate optimization *method* via the Barnes-Hut.\n\n angle : float, default=0.5\n Only used if method='barnes_hut'\n This is the trade-off between speed and accuracy for Barnes-Hut T-SNE.\n 'angle' is the angular size (referred to as theta in [3]) of a distant\n node as measured from a point. If this size is below 'angle' then it is\n used as a summary node of all points contained within it.\n This method is not very sensitive to changes in this parameter\n in the range of 0.2 - 0.8. Angle less than 0.2 has quickly increasing\n computation time and angle greater 0.8 has quickly increasing error.\n\n n_jobs : int, default=None\n The number of parallel jobs to run for neighbors search. 
This parameter\n has no impact when ``metric=\"precomputed\"`` or\n (``metric=\"euclidean\"`` and ``method=\"exact\"``).\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n .. versionadded:: 0.22\n\n square_distances : True or 'legacy', default='legacy'\n Whether TSNE should square the distance values. ``'legacy'`` means\n that distance values are squared only when ``metric=\"euclidean\"``.\n ``True`` means that distance values are squared for all metrics.\n\n .. versionadded:: 0.24\n Added to provide backward compatibility during deprecation of\n legacy squaring behavior.\n .. deprecated:: 0.24\n Legacy squaring behavior was deprecated in 0.24. The ``'legacy'``\n value will be removed in 1.1 (renaming of 0.26), at which point the\n default value will change to ``True``.\n\n Attributes\n ----------\n embedding_ : array-like of shape (n_samples, n_components)\n Stores the embedding vectors.\n\n kl_divergence_ : float\n Kullback-Leibler divergence after optimization.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n n_iter_ : int\n Number of iterations run.\n\n See Also\n --------\n sklearn.decomposition.PCA : Principal component analysis that is a linear\n dimensionality reduction method.\n sklearn.decomposition.KernelPCA : Non-linear dimensionality reduction using\n kernels and PCA.\n MDS : Manifold learning using multidimensional scaling.\n Isomap : Manifold learning based on Isometric Mapping.\n LocallyLinearEmbedding : Manifold learning using Locally Linear Embedding.\n SpectralEmbedding : Spectral embedding for non-linear dimensionality.\n\n References\n ----------\n\n [1] van der Maaten, L.J.P.; Hinton, G.E. Visualizing High-Dimensional Data\n Using t-SNE. Journal of Machine Learning Research 9:2579-2605, 2008.\n\n [2] van der Maaten, L.J.P. t-Distributed Stochastic Neighbor Embedding\n https://lvdmaaten.github.io/tsne/\n\n [3] L.J.P. van der Maaten. Accelerating t-SNE using Tree-Based Algorithms.\n Journal of Machine Learning Research 15(Oct):3221-3245, 2014.\n https://lvdmaaten.github.io/publications/papers/JMLR_2014.pdf\n\n [4] Belkina, A. C., Ciccolella, C. O., Anno, R., Halpert, R., Spidlen, J.,\n & Snyder-Cappione, J. E. (2019). Automated optimized parameters for\n T-distributed stochastic neighbor embedding improve visualization\n and analysis of large datasets. Nature Communications, 10(1), 1-12.\n\n [5] Kobak, D., & Berens, P. (2019). The art of using t-SNE for single-cell\n transcriptomics. Nature Communications, 10(1), 1-14.\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.manifold import TSNE\n >>> X = np.array([[0, 0, 0], [0, 1, 1], [1, 0, 1], [1, 1, 1]])\n >>> X_embedded = TSNE(n_components=2, learning_rate='auto',\n ... init='random').fit_transform(X)\n >>> X_embedded.shape\n (4, 2)\n ", "source_code": "\n\nclass TSNE(BaseEstimator):\n \"\"\"T-distributed Stochastic Neighbor Embedding.\n\n t-SNE [1] is a tool to visualize high-dimensional data. It converts\n similarities between data points to joint probabilities and tries\n to minimize the Kullback-Leibler divergence between the joint\n probabilities of the low-dimensional embedding and the\n high-dimensional data. 
t-SNE has a cost function that is not convex,\n i.e. with different initializations we can get different results.\n\n It is highly recommended to use another dimensionality reduction\n method (e.g. PCA for dense data or TruncatedSVD for sparse data)\n to reduce the number of dimensions to a reasonable amount (e.g. 50)\n if the number of features is very high. This will suppress some\n noise and speed up the computation of pairwise distances between\n samples. For more tips see Laurens van der Maaten's FAQ [2].\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n n_components : int, default=2\n Dimension of the embedded space.\n\n perplexity : float, default=30.0\n The perplexity is related to the number of nearest neighbors that\n is used in other manifold learning algorithms. Larger datasets\n usually require a larger perplexity. Consider selecting a value\n between 5 and 50. Different values can result in significantly\n different results.\n\n early_exaggeration : float, default=12.0\n Controls how tight natural clusters in the original space are in\n the embedded space and how much space will be between them. For\n larger values, the space between natural clusters will be larger\n in the embedded space. Again, the choice of this parameter is not\n very critical. If the cost function increases during initial\n optimization, the early exaggeration factor or the learning rate\n might be too high.\n\n learning_rate : float or 'auto', default=200.0\n The learning rate for t-SNE is usually in the range [10.0, 1000.0]. If\n the learning rate is too high, the data may look like a 'ball' with any\n point approximately equidistant from its nearest neighbours. If the\n learning rate is too low, most points may look compressed in a dense\n cloud with few outliers. If the cost function gets stuck in a bad local\n minimum increasing the learning rate may help.\n Note that many other t-SNE implementations (bhtsne, FIt-SNE, openTSNE,\n etc.) use a definition of learning_rate that is 4 times smaller than\n ours. So our learning_rate=200 corresponds to learning_rate=800 in\n those other implementations. The 'auto' option sets the learning_rate\n to `max(N / early_exaggeration / 4, 50)` where N is the sample size,\n following [4] and [5]. This will become default in 1.2.\n\n n_iter : int, default=1000\n Maximum number of iterations for the optimization. Should be at\n least 250.\n\n n_iter_without_progress : int, default=300\n Maximum number of iterations without progress before we abort the\n optimization, used after 250 initial iterations with early\n exaggeration. Note that progress is only checked every 50 iterations so\n this value is rounded to the next multiple of 50.\n\n .. versionadded:: 0.17\n parameter *n_iter_without_progress* to control stopping criteria.\n\n min_grad_norm : float, default=1e-7\n If the gradient norm is below this threshold, the optimization will\n be stopped.\n\n metric : str or callable, default='euclidean'\n The metric to use when calculating distance between instances in a\n feature array. If metric is a string, it must be one of the options\n allowed by scipy.spatial.distance.pdist for its metric parameter, or\n a metric listed in pairwise.PAIRWISE_DISTANCE_FUNCTIONS.\n If metric is \"precomputed\", X is assumed to be a distance matrix.\n Alternatively, if metric is a callable function, it is called on each\n pair of instances (rows) and the resulting value recorded. 
The callable\n should take two arrays from X as input and return a value indicating\n the distance between them. The default is \"euclidean\" which is\n interpreted as squared euclidean distance.\n\n init : {'random', 'pca'} or ndarray of shape (n_samples, n_components), default='random'\n Initialization of embedding. Possible options are 'random', 'pca',\n and a numpy array of shape (n_samples, n_components).\n PCA initialization cannot be used with precomputed distances and is\n usually more globally stable than random initialization. `init='pca'`\n will become default in 1.2.\n\n verbose : int, default=0\n Verbosity level.\n\n random_state : int, RandomState instance or None, default=None\n Determines the random number generator. Pass an int for reproducible\n results across multiple function calls. Note that different\n initializations might result in different local minima of the cost\n function. See :term:`Glossary `.\n\n method : str, default='barnes_hut'\n By default the gradient calculation algorithm uses Barnes-Hut\n approximation running in O(NlogN) time. method='exact'\n will run on the slower, but exact, algorithm in O(N^2) time. The\n exact algorithm should be used when nearest-neighbor errors need\n to be better than 3%. However, the exact method cannot scale to\n millions of examples.\n\n .. versionadded:: 0.17\n Approximate optimization *method* via the Barnes-Hut.\n\n angle : float, default=0.5\n Only used if method='barnes_hut'\n This is the trade-off between speed and accuracy for Barnes-Hut T-SNE.\n 'angle' is the angular size (referred to as theta in [3]) of a distant\n node as measured from a point. If this size is below 'angle' then it is\n used as a summary node of all points contained within it.\n This method is not very sensitive to changes in this parameter\n in the range of 0.2 - 0.8. Angle less than 0.2 has quickly increasing\n computation time and angle greater 0.8 has quickly increasing error.\n\n n_jobs : int, default=None\n The number of parallel jobs to run for neighbors search. This parameter\n has no impact when ``metric=\"precomputed\"`` or\n (``metric=\"euclidean\"`` and ``method=\"exact\"``).\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n .. versionadded:: 0.22\n\n square_distances : True or 'legacy', default='legacy'\n Whether TSNE should square the distance values. ``'legacy'`` means\n that distance values are squared only when ``metric=\"euclidean\"``.\n ``True`` means that distance values are squared for all metrics.\n\n .. versionadded:: 0.24\n Added to provide backward compatibility during deprecation of\n legacy squaring behavior.\n .. deprecated:: 0.24\n Legacy squaring behavior was deprecated in 0.24. The ``'legacy'``\n value will be removed in 1.1 (renaming of 0.26), at which point the\n default value will change to ``True``.\n\n Attributes\n ----------\n embedding_ : array-like of shape (n_samples, n_components)\n Stores the embedding vectors.\n\n kl_divergence_ : float\n Kullback-Leibler divergence after optimization.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. 
versionadded:: 1.0\n\n n_iter_ : int\n Number of iterations run.\n\n See Also\n --------\n sklearn.decomposition.PCA : Principal component analysis that is a linear\n dimensionality reduction method.\n sklearn.decomposition.KernelPCA : Non-linear dimensionality reduction using\n kernels and PCA.\n MDS : Manifold learning using multidimensional scaling.\n Isomap : Manifold learning based on Isometric Mapping.\n LocallyLinearEmbedding : Manifold learning using Locally Linear Embedding.\n SpectralEmbedding : Spectral embedding for non-linear dimensionality.\n\n References\n ----------\n\n [1] van der Maaten, L.J.P.; Hinton, G.E. Visualizing High-Dimensional Data\n Using t-SNE. Journal of Machine Learning Research 9:2579-2605, 2008.\n\n [2] van der Maaten, L.J.P. t-Distributed Stochastic Neighbor Embedding\n https://lvdmaaten.github.io/tsne/\n\n [3] L.J.P. van der Maaten. Accelerating t-SNE using Tree-Based Algorithms.\n Journal of Machine Learning Research 15(Oct):3221-3245, 2014.\n https://lvdmaaten.github.io/publications/papers/JMLR_2014.pdf\n\n [4] Belkina, A. C., Ciccolella, C. O., Anno, R., Halpert, R., Spidlen, J.,\n & Snyder-Cappione, J. E. (2019). Automated optimized parameters for\n T-distributed stochastic neighbor embedding improve visualization\n and analysis of large datasets. Nature Communications, 10(1), 1-12.\n\n [5] Kobak, D., & Berens, P. (2019). The art of using t-SNE for single-cell\n transcriptomics. Nature Communications, 10(1), 1-14.\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.manifold import TSNE\n >>> X = np.array([[0, 0, 0], [0, 1, 1], [1, 0, 1], [1, 1, 1]])\n >>> X_embedded = TSNE(n_components=2, learning_rate='auto',\n ... init='random').fit_transform(X)\n >>> X_embedded.shape\n (4, 2)\n \"\"\"\n _EXPLORATION_N_ITER = 250\n _N_ITER_CHECK = 50\n \n def __init__(self, n_components=2, *, perplexity=30.0, early_exaggeration=12.0, learning_rate='warn', n_iter=1000, n_iter_without_progress=300, min_grad_norm=1e-07, metric='euclidean', init='warn', verbose=0, random_state=None, method='barnes_hut', angle=0.5, n_jobs=None, square_distances='legacy'):\n self.n_components = n_components\n self.perplexity = perplexity\n self.early_exaggeration = early_exaggeration\n self.learning_rate = learning_rate\n self.n_iter = n_iter\n self.n_iter_without_progress = n_iter_without_progress\n self.min_grad_norm = min_grad_norm\n self.metric = metric\n self.init = init\n self.verbose = verbose\n self.random_state = random_state\n self.method = method\n self.angle = angle\n self.n_jobs = n_jobs\n self.square_distances = square_distances\n \n def _fit(self, X, skip_num_points=0):\n \"\"\"Private function to fit the model using X as training data.\"\"\"\n if isinstance(self.init, str) and self.init == 'warn':\n warnings.warn(\"The default initialization in TSNE will change from 'random' to 'pca' in 1.2.\", FutureWarning)\n self._init = 'random'\n else:\n self._init = self.init\n if self.learning_rate == 'warn':\n warnings.warn(\"The default learning rate in TSNE will change from 200.0 to 'auto' in 1.2.\", FutureWarning)\n self._learning_rate = 200.0\n else:\n self._learning_rate = self.learning_rate\n if isinstance(self._init, str) and self._init == 'pca' and issparse(X):\n raise TypeError('PCA initialization is currently not supported with the sparse input matrix. 
Use init=\"random\" instead.')\n if self.method not in ['barnes_hut', 'exact']:\n raise ValueError(\"'method' must be 'barnes_hut' or 'exact'\")\n if self.angle < 0.0 or self.angle > 1.0:\n raise ValueError(\"'angle' must be between 0.0 - 1.0\")\n if self.square_distances not in [True, 'legacy']:\n raise ValueError(\"'square_distances' must be True or 'legacy'.\")\n if self._learning_rate == 'auto':\n self._learning_rate = X.shape[0] / self.early_exaggeration / 4\n self._learning_rate = np.maximum(self._learning_rate, 50)\n elif not self._learning_rate > 0:\n raise ValueError(\"'learning_rate' must be a positive number or 'auto'.\")\n if self.metric != 'euclidean' and self.square_distances is not True:\n warnings.warn(\"'square_distances' has been introduced in 0.24 to help phase out legacy squaring behavior. The 'legacy' setting will be removed in 1.1 (renaming of 0.26), and the default setting will be changed to True. In 1.3, 'square_distances' will be removed altogether, and distances will be squared by default. Set 'square_distances'=True to silence this warning.\", FutureWarning)\n if self.method == 'barnes_hut':\n X = self._validate_data(X, accept_sparse=['csr'], ensure_min_samples=2, dtype=[np.float32, np.float64])\n else:\n X = self._validate_data(X, accept_sparse=['csr', 'csc', 'coo'], dtype=[np.float32, np.float64])\n if self.metric == 'precomputed':\n if isinstance(self._init, str) and self._init == 'pca':\n raise ValueError('The parameter init=\"pca\" cannot be used with metric=\"precomputed\".')\n if X.shape[0] != X.shape[1]:\n raise ValueError('X should be a square distance matrix')\n check_non_negative(X, \"TSNE.fit(). With metric='precomputed', X should contain positive distances.\")\n if self.method == 'exact' and issparse(X):\n raise TypeError('TSNE with method=\"exact\" does not accept sparse precomputed distance matrix. 
Use method=\"barnes_hut\" or provide the dense distance matrix.')\n if self.method == 'barnes_hut' and self.n_components > 3:\n raise ValueError(\"'n_components' should be inferior to 4 for the barnes_hut algorithm as it relies on quad-tree or oct-tree.\")\n random_state = check_random_state(self.random_state)\n if self.early_exaggeration < 1.0:\n raise ValueError('early_exaggeration must be at least 1, but is {}'.format(self.early_exaggeration))\n if self.n_iter < 250:\n raise ValueError('n_iter should be at least 250')\n n_samples = X.shape[0]\n neighbors_nn = None\n if self.method == 'exact':\n if self.metric == 'precomputed':\n distances = X\n else:\n if self.verbose:\n print('[t-SNE] Computing pairwise distances...')\n if self.metric == 'euclidean':\n distances = pairwise_distances(X, metric=self.metric, squared=True)\n else:\n distances = pairwise_distances(X, metric=self.metric, n_jobs=self.n_jobs)\n if np.any(distances < 0):\n raise ValueError('All distances should be positive, the metric given is not correct')\n if self.metric != 'euclidean' and self.square_distances is True:\n distances **= 2\n P = _joint_probabilities(distances, self.perplexity, self.verbose)\n assert np.all(np.isfinite(P)), 'All probabilities should be finite'\n assert np.all(P >= 0), 'All probabilities should be non-negative'\n assert np.all(P <= 1), 'All probabilities should be less or then equal to one'\n else:\n n_neighbors = min(n_samples - 1, int(3.0 * self.perplexity + 1))\n if self.verbose:\n print('[t-SNE] Computing {} nearest neighbors...'.format(n_neighbors))\n knn = NearestNeighbors(algorithm='auto', n_jobs=self.n_jobs, n_neighbors=n_neighbors, metric=self.metric)\n t0 = time()\n knn.fit(X)\n duration = time() - t0\n if self.verbose:\n print('[t-SNE] Indexed {} samples in {:.3f}s...'.format(n_samples, duration))\n t0 = time()\n distances_nn = knn.kneighbors_graph(mode='distance')\n duration = time() - t0\n if self.verbose:\n print('[t-SNE] Computed neighbors for {} samples in {:.3f}s...'.format(n_samples, duration))\n del knn\n if self.square_distances is True or self.metric == 'euclidean':\n distances_nn.data **= 2\n P = _joint_probabilities_nn(distances_nn, self.perplexity, self.verbose)\n if isinstance(self._init, np.ndarray):\n X_embedded = self._init\n elif self._init == 'pca':\n pca = PCA(n_components=self.n_components, svd_solver='randomized', random_state=random_state)\n X_embedded = pca.fit_transform(X).astype(np.float32, copy=False)\n warnings.warn('The PCA initialization in TSNE will change to have the standard deviation of PC1 equal to 1e-4 in 1.2. 
This will ensure better convergence.', FutureWarning)\n elif self._init == 'random':\n X_embedded = 0.0001 * random_state.randn(n_samples, self.n_components).astype(np.float32)\n else:\n raise ValueError(\"'init' must be 'pca', 'random', or a numpy array\")\n degrees_of_freedom = max(self.n_components - 1, 1)\n return self._tsne(P, degrees_of_freedom, n_samples, X_embedded=X_embedded, neighbors=neighbors_nn, skip_num_points=skip_num_points)\n \n def _tsne(self, P, degrees_of_freedom, n_samples, X_embedded, neighbors=None, skip_num_points=0):\n \"\"\"Runs t-SNE.\"\"\"\n params = X_embedded.ravel()\n opt_args = {'it': 0, 'n_iter_check': self._N_ITER_CHECK, 'min_grad_norm': self.min_grad_norm, 'learning_rate': self._learning_rate, 'verbose': self.verbose, 'kwargs': dict(skip_num_points=skip_num_points), 'args': [P, degrees_of_freedom, n_samples, self.n_components], 'n_iter_without_progress': self._EXPLORATION_N_ITER, 'n_iter': self._EXPLORATION_N_ITER, 'momentum': 0.5}\n if self.method == 'barnes_hut':\n obj_func = _kl_divergence_bh\n opt_args['kwargs']['angle'] = self.angle\n opt_args['kwargs']['verbose'] = self.verbose\n opt_args['kwargs']['num_threads'] = _openmp_effective_n_threads()\n else:\n obj_func = _kl_divergence\n P *= self.early_exaggeration\n (params, kl_divergence, it) = _gradient_descent(obj_func, params, **opt_args)\n if self.verbose:\n print('[t-SNE] KL divergence after %d iterations with early exaggeration: %f' % (it + 1, kl_divergence))\n P /= self.early_exaggeration\n remaining = self.n_iter - self._EXPLORATION_N_ITER\n if it < self._EXPLORATION_N_ITER or remaining > 0:\n opt_args['n_iter'] = self.n_iter\n opt_args['it'] = it + 1\n opt_args['momentum'] = 0.8\n opt_args['n_iter_without_progress'] = self.n_iter_without_progress\n (params, kl_divergence, it) = _gradient_descent(obj_func, params, **opt_args)\n self.n_iter_ = it\n if self.verbose:\n print('[t-SNE] KL divergence after %d iterations: %f' % (it + 1, kl_divergence))\n X_embedded = params.reshape(n_samples, self.n_components)\n self.kl_divergence_ = kl_divergence\n return X_embedded\n \n def fit_transform(self, X, y=None):\n \"\"\"Fit X into an embedded space and return that transformed output.\n\n Parameters\n ----------\n X : ndarray of shape (n_samples, n_features) or (n_samples, n_samples)\n If the metric is 'precomputed' X must be a square distance\n matrix. Otherwise it contains a sample per row. If the method\n is 'exact', X may be a sparse matrix of type 'csr', 'csc'\n or 'coo'. If the method is 'barnes_hut' and the metric is\n 'precomputed', X may be a precomputed sparse graph.\n\n y : None\n Ignored.\n\n Returns\n -------\n X_new : ndarray of shape (n_samples, n_components)\n Embedding of the training data in low-dimensional space.\n \"\"\"\n embedding = self._fit(X)\n self.embedding_ = embedding\n return self.embedding_\n \n def fit(self, X, y=None):\n \"\"\"Fit X into an embedded space.\n\n Parameters\n ----------\n X : ndarray of shape (n_samples, n_features) or (n_samples, n_samples)\n If the metric is 'precomputed' X must be a square distance\n matrix. Otherwise it contains a sample per row. If the method\n is 'exact', X may be a sparse matrix of type 'csr', 'csc'\n or 'coo'. 
If the method is 'barnes_hut' and the metric is\n 'precomputed', X may be a precomputed sparse graph.\n\n y : None\n Ignored.\n\n Returns\n -------\n X_new : array of shape (n_samples, n_components)\n Embedding of the training data in low-dimensional space.\n \"\"\"\n self.fit_transform(X)\n return self\n" }, @@ -24329,7 +24411,7 @@ "sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay.from_predictions" ], "is_public": true, - "description": "Confusion Matrix visualization.\n\nIt is recommend to use :func:`~sklearn.metrics.ConfusionMatrixDisplay.from_estimator` or :func:`~sklearn.metrics.ConfusionMatrixDisplay.from_predictions` to create a :class:`ConfusionMatrixDisplay`. All parameters are stored as attributes. Read more in the :ref:`User Guide `.", + "description": "Confusion Matrix visualization.\n\nIt is recommend to use\n:func:`~sklearn.metrics.ConfusionMatrixDisplay.from_estimator` or\n:func:`~sklearn.metrics.ConfusionMatrixDisplay.from_predictions` to\ncreate a :class:`ConfusionMatrixDisplay`. All parameters are stored as\nattributes.\n\nRead more in the :ref:`User Guide `.", "docstring": "Confusion Matrix visualization.\n\n It is recommend to use\n :func:`~sklearn.metrics.ConfusionMatrixDisplay.from_estimator` or\n :func:`~sklearn.metrics.ConfusionMatrixDisplay.from_predictions` to\n create a :class:`ConfusionMatrixDisplay`. All parameters are stored as\n attributes.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n confusion_matrix : ndarray of shape (n_classes, n_classes)\n Confusion matrix.\n\n display_labels : ndarray of shape (n_classes,), default=None\n Display labels for plot. If None, display labels are set from 0 to\n `n_classes - 1`.\n\n Attributes\n ----------\n im_ : matplotlib AxesImage\n Image representing the confusion matrix.\n\n text_ : ndarray of shape (n_classes, n_classes), dtype=matplotlib Text, or None\n Array of matplotlib axes. `None` if `include_values` is false.\n\n ax_ : matplotlib Axes\n Axes with confusion matrix.\n\n figure_ : matplotlib Figure\n Figure containing the confusion matrix.\n\n See Also\n --------\n confusion_matrix : Compute Confusion Matrix to evaluate the accuracy of a\n classification.\n ConfusionMatrixDisplay.from_estimator : Plot the confusion matrix\n given an estimator, the data, and the label.\n ConfusionMatrixDisplay.from_predictions : Plot the confusion matrix\n given the true and predicted labels.\n\n Examples\n --------\n >>> import matplotlib.pyplot as plt\n >>> from sklearn.datasets import make_classification\n >>> from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay\n >>> from sklearn.model_selection import train_test_split\n >>> from sklearn.svm import SVC\n >>> X, y = make_classification(random_state=0)\n >>> X_train, X_test, y_train, y_test = train_test_split(X, y,\n ... random_state=0)\n >>> clf = SVC(random_state=0)\n >>> clf.fit(X_train, y_train)\n SVC(random_state=0)\n >>> predictions = clf.predict(X_test)\n >>> cm = confusion_matrix(y_test, predictions, labels=clf.classes_)\n >>> disp = ConfusionMatrixDisplay(confusion_matrix=cm,\n ... display_labels=clf.classes_)\n >>> disp.plot()\n <...>\n >>> plt.show()\n ", "source_code": "\n\nclass ConfusionMatrixDisplay:\n \"\"\"Confusion Matrix visualization.\n\n It is recommend to use\n :func:`~sklearn.metrics.ConfusionMatrixDisplay.from_estimator` or\n :func:`~sklearn.metrics.ConfusionMatrixDisplay.from_predictions` to\n create a :class:`ConfusionMatrixDisplay`. 
All parameters are stored as\n attributes.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n confusion_matrix : ndarray of shape (n_classes, n_classes)\n Confusion matrix.\n\n display_labels : ndarray of shape (n_classes,), default=None\n Display labels for plot. If None, display labels are set from 0 to\n `n_classes - 1`.\n\n Attributes\n ----------\n im_ : matplotlib AxesImage\n Image representing the confusion matrix.\n\n text_ : ndarray of shape (n_classes, n_classes), dtype=matplotlib Text, or None\n Array of matplotlib axes. `None` if `include_values` is false.\n\n ax_ : matplotlib Axes\n Axes with confusion matrix.\n\n figure_ : matplotlib Figure\n Figure containing the confusion matrix.\n\n See Also\n --------\n confusion_matrix : Compute Confusion Matrix to evaluate the accuracy of a\n classification.\n ConfusionMatrixDisplay.from_estimator : Plot the confusion matrix\n given an estimator, the data, and the label.\n ConfusionMatrixDisplay.from_predictions : Plot the confusion matrix\n given the true and predicted labels.\n\n Examples\n --------\n >>> import matplotlib.pyplot as plt\n >>> from sklearn.datasets import make_classification\n >>> from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay\n >>> from sklearn.model_selection import train_test_split\n >>> from sklearn.svm import SVC\n >>> X, y = make_classification(random_state=0)\n >>> X_train, X_test, y_train, y_test = train_test_split(X, y,\n ... random_state=0)\n >>> clf = SVC(random_state=0)\n >>> clf.fit(X_train, y_train)\n SVC(random_state=0)\n >>> predictions = clf.predict(X_test)\n >>> cm = confusion_matrix(y_test, predictions, labels=clf.classes_)\n >>> disp = ConfusionMatrixDisplay(confusion_matrix=cm,\n ... display_labels=clf.classes_)\n >>> disp.plot()\n <...>\n >>> plt.show()\n \"\"\"\n \n def __init__(self, confusion_matrix, *, display_labels=None):\n self.confusion_matrix = confusion_matrix\n self.display_labels = display_labels\n \n def plot(self, *, include_values=True, cmap='viridis', xticks_rotation='horizontal', values_format=None, ax=None, colorbar=True):\n \"\"\"Plot visualization.\n\n Parameters\n ----------\n include_values : bool, default=True\n Includes values in confusion matrix.\n\n cmap : str or matplotlib Colormap, default='viridis'\n Colormap recognized by matplotlib.\n\n xticks_rotation : {'vertical', 'horizontal'} or float, default='horizontal'\n Rotation of xtick labels.\n\n values_format : str, default=None\n Format specification for values in confusion matrix. If `None`,\n the format specification is 'd' or '.2g' whichever is shorter.\n\n ax : matplotlib axes, default=None\n Axes object to plot on. 
If `None`, a new figure and axes is\n created.\n\n colorbar : bool, default=True\n Whether or not to add a colorbar to the plot.\n\n Returns\n -------\n display : :class:`~sklearn.metrics.ConfusionMatrixDisplay`\n \"\"\"\n check_matplotlib_support('ConfusionMatrixDisplay.plot')\n import matplotlib.pyplot as plt\n if ax is None:\n (fig, ax) = plt.subplots()\n else:\n fig = ax.figure\n cm = self.confusion_matrix\n n_classes = cm.shape[0]\n self.im_ = ax.imshow(cm, interpolation='nearest', cmap=cmap)\n self.text_ = None\n (cmap_min, cmap_max) = (self.im_.cmap(0), self.im_.cmap(1.0))\n if include_values:\n self.text_ = np.empty_like(cm, dtype=object)\n thresh = (cm.max() + cm.min()) / 2.0\n for (i, j) in product(range(n_classes), range(n_classes)):\n color = cmap_max if cm[i, j] < thresh else cmap_min\n if values_format is None:\n text_cm = format(cm[i, j], '.2g')\n if cm.dtype.kind != 'f':\n text_d = format(cm[i, j], 'd')\n if len(text_d) < len(text_cm):\n text_cm = text_d\n else:\n text_cm = format(cm[i, j], values_format)\n self.text_[i, j] = ax.text(j, i, text_cm, ha='center', va='center', color=color)\n if self.display_labels is None:\n display_labels = np.arange(n_classes)\n else:\n display_labels = self.display_labels\n if colorbar:\n fig.colorbar(self.im_, ax=ax)\n ax.set(xticks=np.arange(n_classes), yticks=np.arange(n_classes), xticklabels=display_labels, yticklabels=display_labels, ylabel='True label', xlabel='Predicted label')\n ax.set_ylim((n_classes - 0.5, -0.5))\n plt.setp(ax.get_xticklabels(), rotation=xticks_rotation)\n self.figure_ = fig\n self.ax_ = ax\n return self\n \n @classmethod\n def from_estimator(cls, estimator, X, y, *, labels=None, sample_weight=None, normalize=None, display_labels=None, include_values=True, xticks_rotation='horizontal', values_format=None, cmap='viridis', ax=None, colorbar=True):\n \"\"\"Plot Confusion Matrix given an estimator and some data.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 1.0\n\n Parameters\n ----------\n estimator : estimator instance\n Fitted classifier or a fitted :class:`~sklearn.pipeline.Pipeline`\n in which the last estimator is a classifier.\n\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Input values.\n\n y : array-like of shape (n_samples,)\n Target values.\n\n labels : array-like of shape (n_classes,), default=None\n List of labels to index the confusion matrix. This may be used to\n reorder or select a subset of labels. If `None` is given, those\n that appear at least once in `y_true` or `y_pred` are used in\n sorted order.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n normalize : {'true', 'pred', 'all'}, default=None\n Either to normalize the counts display in the matrix:\n\n - if `'true'`, the confusion matrix is normalized over the true\n conditions (e.g. rows);\n - if `'pred'`, the confusion matrix is normalized over the\n predicted conditions (e.g. columns);\n - if `'all'`, the confusion matrix is normalized by the total\n number of samples;\n - if `None` (default), the confusion matrix will not be normalized.\n\n display_labels : array-like of shape (n_classes,), default=None\n Target names used for plotting. 
By default, `labels` will be used\n if it is defined, otherwise the unique labels of `y_true` and\n `y_pred` will be used.\n\n include_values : bool, default=True\n Includes values in confusion matrix.\n\n xticks_rotation : {'vertical', 'horizontal'} or float, default='horizontal'\n Rotation of xtick labels.\n\n values_format : str, default=None\n Format specification for values in confusion matrix. If `None`, the\n format specification is 'd' or '.2g' whichever is shorter.\n\n cmap : str or matplotlib Colormap, default='viridis'\n Colormap recognized by matplotlib.\n\n ax : matplotlib Axes, default=None\n Axes object to plot on. If `None`, a new figure and axes is\n created.\n\n colorbar : bool, default=True\n Whether or not to add a colorbar to the plot.\n\n Returns\n -------\n display : :class:`~sklearn.metrics.ConfusionMatrixDisplay`\n\n See Also\n --------\n ConfusionMatrixDisplay.from_predictions : Plot the confusion matrix\n given the true and predicted labels.\n\n Examples\n --------\n >>> import matplotlib.pyplot as plt\n >>> from sklearn.datasets import make_classification\n >>> from sklearn.metrics import ConfusionMatrixDisplay\n >>> from sklearn.model_selection import train_test_split\n >>> from sklearn.svm import SVC\n >>> X, y = make_classification(random_state=0)\n >>> X_train, X_test, y_train, y_test = train_test_split(\n ... X, y, random_state=0)\n >>> clf = SVC(random_state=0)\n >>> clf.fit(X_train, y_train)\n SVC(random_state=0)\n >>> ConfusionMatrixDisplay.from_estimator(\n ... clf, X_test, y_test)\n <...>\n >>> plt.show()\n \"\"\"\n method_name = f'{cls.__name__}.from_estimator'\n check_matplotlib_support(method_name)\n if not is_classifier(estimator):\n raise ValueError(f'{method_name} only supports classifiers')\n y_pred = estimator.predict(X)\n return cls.from_predictions(y, y_pred, sample_weight=sample_weight, labels=labels, normalize=normalize, display_labels=display_labels, include_values=include_values, cmap=cmap, ax=ax, xticks_rotation=xticks_rotation, values_format=values_format, colorbar=colorbar)\n \n @classmethod\n def from_predictions(cls, y_true, y_pred, *, labels=None, sample_weight=None, normalize=None, display_labels=None, include_values=True, xticks_rotation='horizontal', values_format=None, cmap='viridis', ax=None, colorbar=True):\n \"\"\"Plot Confusion Matrix given true and predicted labels.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.24\n\n Parameters\n ----------\n y_true : array-like of shape (n_samples,)\n True labels.\n\n y_pred : array-like of shape (n_samples,)\n The predicted labels given by the method `predict` of an\n classifier.\n\n labels : array-like of shape (n_classes,), default=None\n List of labels to index the confusion matrix. This may be used to\n reorder or select a subset of labels. If `None` is given, those\n that appear at least once in `y_true` or `y_pred` are used in\n sorted order.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n normalize : {'true', 'pred', 'all'}, default=None\n Either to normalize the counts display in the matrix:\n\n - if `'true'`, the confusion matrix is normalized over the true\n conditions (e.g. rows);\n - if `'pred'`, the confusion matrix is normalized over the\n predicted conditions (e.g. 
columns);\n - if `'all'`, the confusion matrix is normalized by the total\n number of samples;\n - if `None` (default), the confusion matrix will not be normalized.\n\n display_labels : array-like of shape (n_classes,), default=None\n Target names used for plotting. By default, `labels` will be used\n if it is defined, otherwise the unique labels of `y_true` and\n `y_pred` will be used.\n\n include_values : bool, default=True\n Includes values in confusion matrix.\n\n xticks_rotation : {'vertical', 'horizontal'} or float, default='horizontal'\n Rotation of xtick labels.\n\n values_format : str, default=None\n Format specification for values in confusion matrix. If `None`, the\n format specification is 'd' or '.2g' whichever is shorter.\n\n cmap : str or matplotlib Colormap, default='viridis'\n Colormap recognized by matplotlib.\n\n ax : matplotlib Axes, default=None\n Axes object to plot on. If `None`, a new figure and axes is\n created.\n\n colorbar : bool, default=True\n Whether or not to add a colorbar to the plot.\n\n Returns\n -------\n display : :class:`~sklearn.metrics.ConfusionMatrixDisplay`\n\n See Also\n --------\n ConfusionMatrixDisplay.from_estimator : Plot the confusion matrix\n given an estimator, the data, and the label.\n\n Examples\n --------\n >>> import matplotlib.pyplot as plt\n >>> from sklearn.datasets import make_classification\n >>> from sklearn.metrics import ConfusionMatrixDisplay\n >>> from sklearn.model_selection import train_test_split\n >>> from sklearn.svm import SVC\n >>> X, y = make_classification(random_state=0)\n >>> X_train, X_test, y_train, y_test = train_test_split(\n ... X, y, random_state=0)\n >>> clf = SVC(random_state=0)\n >>> clf.fit(X_train, y_train)\n SVC(random_state=0)\n >>> y_pred = clf.predict(X_test)\n >>> ConfusionMatrixDisplay.from_predictions(\n ... y_test, y_pred)\n <...>\n >>> plt.show()\n \"\"\"\n check_matplotlib_support(f'{cls.__name__}.from_predictions')\n if display_labels is None:\n if labels is None:\n display_labels = unique_labels(y_true, y_pred)\n else:\n display_labels = labels\n cm = confusion_matrix(y_true, y_pred, sample_weight=sample_weight, labels=labels, normalize=normalize)\n disp = cls(confusion_matrix=cm, display_labels=display_labels)\n return disp.plot(include_values=include_values, cmap=cmap, ax=ax, xticks_rotation=xticks_rotation, values_format=values_format, colorbar=colorbar)\n" }, @@ -24345,7 +24427,7 @@ "sklearn.metrics._plot.det_curve.DetCurveDisplay.plot" ], "is_public": true, - "description": "DET curve visualization.\n\nIt is recommend to use :func:`~sklearn.metrics.DetCurveDisplay.from_estimator` or :func:`~sklearn.metrics.DetCurveDisplay.from_predictions` to create a visualizer. All parameters are stored as attributes. Read more in the :ref:`User Guide `. .. versionadded:: 0.24", + "description": "DET curve visualization.\n\nIt is recommend to use :func:`~sklearn.metrics.DetCurveDisplay.from_estimator`\nor :func:`~sklearn.metrics.DetCurveDisplay.from_predictions` to create a\nvisualizer. All parameters are stored as attributes.\n\nRead more in the :ref:`User Guide `.\n\n.. versionadded:: 0.24", "docstring": "DET curve visualization.\n\n It is recommend to use :func:`~sklearn.metrics.DetCurveDisplay.from_estimator`\n or :func:`~sklearn.metrics.DetCurveDisplay.from_predictions` to create a\n visualizer. All parameters are stored as attributes.\n\n Read more in the :ref:`User Guide `.\n\n .. 
versionadded:: 0.24\n\n Parameters\n ----------\n fpr : ndarray\n False positive rate.\n\n fnr : ndarray\n False negative rate.\n\n estimator_name : str, default=None\n Name of estimator. If None, the estimator name is not shown.\n\n pos_label : str or int, default=None\n The label of the positive class.\n\n Attributes\n ----------\n line_ : matplotlib Artist\n DET Curve.\n\n ax_ : matplotlib Axes\n Axes with DET Curve.\n\n figure_ : matplotlib Figure\n Figure containing the curve.\n\n See Also\n --------\n det_curve : Compute error rates for different probability thresholds.\n DetCurveDisplay.from_estimator : Plot DET curve given an estimator and\n some data.\n DetCurveDisplay.from_predictions : Plot DET curve given the true and\n predicted labels.\n\n Examples\n --------\n >>> import matplotlib.pyplot as plt\n >>> from sklearn.datasets import make_classification\n >>> from sklearn.metrics import det_curve, DetCurveDisplay\n >>> from sklearn.model_selection import train_test_split\n >>> from sklearn.svm import SVC\n >>> X, y = make_classification(n_samples=1000, random_state=0)\n >>> X_train, X_test, y_train, y_test = train_test_split(\n ... X, y, test_size=0.4, random_state=0)\n >>> clf = SVC(random_state=0).fit(X_train, y_train)\n >>> y_pred = clf.decision_function(X_test)\n >>> fpr, fnr, _ = det_curve(y_test, y_pred)\n >>> display = DetCurveDisplay(\n ... fpr=fpr, fnr=fnr, estimator_name=\"SVC\"\n ... )\n >>> display.plot()\n <...>\n >>> plt.show()\n ", "source_code": "\n\nclass DetCurveDisplay:\n \"\"\"DET curve visualization.\n\n It is recommend to use :func:`~sklearn.metrics.DetCurveDisplay.from_estimator`\n or :func:`~sklearn.metrics.DetCurveDisplay.from_predictions` to create a\n visualizer. All parameters are stored as attributes.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.24\n\n Parameters\n ----------\n fpr : ndarray\n False positive rate.\n\n fnr : ndarray\n False negative rate.\n\n estimator_name : str, default=None\n Name of estimator. If None, the estimator name is not shown.\n\n pos_label : str or int, default=None\n The label of the positive class.\n\n Attributes\n ----------\n line_ : matplotlib Artist\n DET Curve.\n\n ax_ : matplotlib Axes\n Axes with DET Curve.\n\n figure_ : matplotlib Figure\n Figure containing the curve.\n\n See Also\n --------\n det_curve : Compute error rates for different probability thresholds.\n DetCurveDisplay.from_estimator : Plot DET curve given an estimator and\n some data.\n DetCurveDisplay.from_predictions : Plot DET curve given the true and\n predicted labels.\n\n Examples\n --------\n >>> import matplotlib.pyplot as plt\n >>> from sklearn.datasets import make_classification\n >>> from sklearn.metrics import det_curve, DetCurveDisplay\n >>> from sklearn.model_selection import train_test_split\n >>> from sklearn.svm import SVC\n >>> X, y = make_classification(n_samples=1000, random_state=0)\n >>> X_train, X_test, y_train, y_test = train_test_split(\n ... X, y, test_size=0.4, random_state=0)\n >>> clf = SVC(random_state=0).fit(X_train, y_train)\n >>> y_pred = clf.decision_function(X_test)\n >>> fpr, fnr, _ = det_curve(y_test, y_pred)\n >>> display = DetCurveDisplay(\n ... fpr=fpr, fnr=fnr, estimator_name=\"SVC\"\n ... 
)\n >>> display.plot()\n <...>\n >>> plt.show()\n \"\"\"\n \n def __init__(self, *, fpr, fnr, estimator_name=None, pos_label=None):\n self.fpr = fpr\n self.fnr = fnr\n self.estimator_name = estimator_name\n self.pos_label = pos_label\n \n @classmethod\n def from_estimator(cls, estimator, X, y, *, sample_weight=None, response_method='auto', pos_label=None, name=None, ax=None, **kwargs):\n \"\"\"Plot DET curve given an estimator and data.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 1.0\n\n Parameters\n ----------\n estimator : estimator instance\n Fitted classifier or a fitted :class:`~sklearn.pipeline.Pipeline`\n in which the last estimator is a classifier.\n\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Input values.\n\n y : array-like of shape (n_samples,)\n Target values.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n response_method : {'predict_proba', 'decision_function', 'auto'} default='auto'\n Specifies whether to use :term:`predict_proba` or\n :term:`decision_function` as the predicted target response. If set\n to 'auto', :term:`predict_proba` is tried first and if it does not\n exist :term:`decision_function` is tried next.\n\n pos_label : str or int, default=None\n The label of the positive class. When `pos_label=None`, if `y_true`\n is in {-1, 1} or {0, 1}, `pos_label` is set to 1, otherwise an\n error will be raised.\n\n name : str, default=None\n Name of DET curve for labeling. If `None`, use the name of the\n estimator.\n\n ax : matplotlib axes, default=None\n Axes object to plot on. If `None`, a new figure and axes is\n created.\n\n **kwargs : dict\n Additional keywords arguments passed to matplotlib `plot` function.\n\n Returns\n -------\n display : :class:`~sklearn.metrics.DetCurveDisplay`\n Object that stores computed values.\n\n See Also\n --------\n det_curve : Compute error rates for different probability thresholds.\n DetCurveDisplay.from_predictions : Plot DET curve given the true and\n predicted labels.\n plot_roc_curve : Plot Receiver operating characteristic (ROC) curve.\n\n Examples\n --------\n >>> import matplotlib.pyplot as plt\n >>> from sklearn.datasets import make_classification\n >>> from sklearn.metrics import DetCurveDisplay\n >>> from sklearn.model_selection import train_test_split\n >>> from sklearn.svm import SVC\n >>> X, y = make_classification(n_samples=1000, random_state=0)\n >>> X_train, X_test, y_train, y_test = train_test_split(\n ... X, y, test_size=0.4, random_state=0)\n >>> clf = SVC(random_state=0).fit(X_train, y_train)\n >>> DetCurveDisplay.from_estimator(\n ... clf, X_test, y_test)\n <...>\n >>> plt.show()\n \"\"\"\n check_matplotlib_support(f'{cls.__name__}.from_estimator')\n name = estimator.__class__.__name__ if name is None else name\n (y_pred, pos_label) = _get_response(X, estimator, response_method, pos_label=pos_label)\n return cls.from_predictions(y_true=y, y_pred=y_pred, sample_weight=sample_weight, name=name, ax=ax, pos_label=pos_label, **kwargs)\n \n @classmethod\n def from_predictions(cls, y_true, y_pred, *, sample_weight=None, pos_label=None, name=None, ax=None, **kwargs):\n \"\"\"Plot DET curve given the true and\n predicted labels.\n\n Read more in the :ref:`User Guide `.\n\n .. 
versionadded:: 1.0\n\n Parameters\n ----------\n y_true : array-like of shape (n_samples,)\n True labels.\n\n y_pred : array-like of shape (n_samples,)\n Target scores, can either be probability estimates of the positive\n class, confidence values, or non-thresholded measure of decisions\n (as returned by `decision_function` on some classifiers).\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n pos_label : str or int, default=None\n The label of the positive class. When `pos_label=None`, if `y_true`\n is in {-1, 1} or {0, 1}, `pos_label` is set to 1, otherwise an\n error will be raised.\n\n name : str, default=None\n Name of DET curve for labeling. If `None`, name will be set to\n `\"Classifier\"`.\n\n ax : matplotlib axes, default=None\n Axes object to plot on. If `None`, a new figure and axes is\n created.\n\n **kwargs : dict\n Additional keywords arguments passed to matplotlib `plot` function.\n\n Returns\n -------\n display : :class:`~sklearn.metrics.DetCurveDisplay`\n Object that stores computed values.\n\n See Also\n --------\n det_curve : Compute error rates for different probability thresholds.\n DetCurveDisplay.from_estimator : Plot DET curve given an estimator and\n some data.\n plot_roc_curve : Plot Receiver operating characteristic (ROC) curve.\n\n Examples\n --------\n >>> import matplotlib.pyplot as plt\n >>> from sklearn.datasets import make_classification\n >>> from sklearn.metrics import DetCurveDisplay\n >>> from sklearn.model_selection import train_test_split\n >>> from sklearn.svm import SVC\n >>> X, y = make_classification(n_samples=1000, random_state=0)\n >>> X_train, X_test, y_train, y_test = train_test_split(\n ... X, y, test_size=0.4, random_state=0)\n >>> clf = SVC(random_state=0).fit(X_train, y_train)\n >>> y_pred = clf.decision_function(X_test)\n >>> DetCurveDisplay.from_predictions(\n ... y_test, y_pred)\n <...>\n >>> plt.show()\n \"\"\"\n check_matplotlib_support(f'{cls.__name__}.from_predictions')\n (fpr, fnr, _) = det_curve(y_true, y_pred, pos_label=pos_label, sample_weight=sample_weight)\n pos_label = _check_pos_label_consistency(pos_label, y_true)\n name = 'Classifier' if name is None else name\n viz = DetCurveDisplay(fpr=fpr, fnr=fnr, estimator_name=name, pos_label=pos_label)\n return viz.plot(ax=ax, name=name, **kwargs)\n \n def plot(self, ax=None, *, name=None, **kwargs):\n \"\"\"Plot visualization.\n\n Parameters\n ----------\n ax : matplotlib axes, default=None\n Axes object to plot on. If `None`, a new figure and axes is\n created.\n\n name : str, default=None\n Name of DET curve for labeling. 
If `None`, use `estimator_name` if\n it is not `None`, otherwise no labeling is shown.\n\n **kwargs : dict\n Additional keywords arguments passed to matplotlib `plot` function.\n\n Returns\n -------\n display : :class:`~sklearn.metrics.plot.DetCurveDisplay`\n Object that stores computed values.\n \"\"\"\n check_matplotlib_support('DetCurveDisplay.plot')\n name = self.estimator_name if name is None else name\n line_kwargs = {} if name is None else {'label': name}\n line_kwargs.update(**kwargs)\n import matplotlib.pyplot as plt\n if ax is None:\n (_, ax) = plt.subplots()\n (self.line_, ) = ax.plot(sp.stats.norm.ppf(self.fpr), sp.stats.norm.ppf(self.fnr), **line_kwargs)\n info_pos_label = f' (Positive label: {self.pos_label})' if self.pos_label is not None else ''\n xlabel = 'False Positive Rate' + info_pos_label\n ylabel = 'False Negative Rate' + info_pos_label\n ax.set(xlabel=xlabel, ylabel=ylabel)\n if 'label' in line_kwargs:\n ax.legend(loc='lower right')\n ticks = [0.001, 0.01, 0.05, 0.2, 0.5, 0.8, 0.95, 0.99, 0.999]\n tick_locations = sp.stats.norm.ppf(ticks)\n tick_labels = ['{:.0%}'.format(s) if (100 * s).is_integer() else '{:.1%}'.format(s) for s in ticks]\n ax.set_xticks(tick_locations)\n ax.set_xticklabels(tick_labels)\n ax.set_xlim(-3, 3)\n ax.set_yticks(tick_locations)\n ax.set_yticklabels(tick_labels)\n ax.set_ylim(-3, 3)\n self.ax_ = ax\n self.figure_ = ax.figure\n return self\n" }, @@ -24361,7 +24443,7 @@ "sklearn.metrics._plot.precision_recall_curve.PrecisionRecallDisplay.from_predictions" ], "is_public": true, - "description": "Precision Recall visualization.\n\nIt is recommend to use :func:`~sklearn.metrics.PrecisionRecallDisplay.from_estimator` or :func:`~sklearn.metrics.PrecisionRecallDisplay.from_predictions` to create a :class:`~sklearn.metrics.PredictionRecallDisplay`. All parameters are stored as attributes. Read more in the :ref:`User Guide `.", + "description": "Precision Recall visualization.\n\nIt is recommend to use\n:func:`~sklearn.metrics.PrecisionRecallDisplay.from_estimator` or\n:func:`~sklearn.metrics.PrecisionRecallDisplay.from_predictions` to create\na :class:`~sklearn.metrics.PredictionRecallDisplay`. All parameters are\nstored as attributes.\n\nRead more in the :ref:`User Guide `.", "docstring": "Precision Recall visualization.\n\n It is recommend to use\n :func:`~sklearn.metrics.PrecisionRecallDisplay.from_estimator` or\n :func:`~sklearn.metrics.PrecisionRecallDisplay.from_predictions` to create\n a :class:`~sklearn.metrics.PredictionRecallDisplay`. All parameters are\n stored as attributes.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n -----------\n precision : ndarray\n Precision values.\n\n recall : ndarray\n Recall values.\n\n average_precision : float, default=None\n Average precision. If None, the average precision is not shown.\n\n estimator_name : str, default=None\n Name of estimator. If None, then the estimator name is not shown.\n\n pos_label : str or int, default=None\n The class considered as the positive class. If None, the class will not\n be shown in the legend.\n\n .. 
versionadded:: 0.24\n\n Attributes\n ----------\n line_ : matplotlib Artist\n Precision recall curve.\n\n ax_ : matplotlib Axes\n Axes with precision recall curve.\n\n figure_ : matplotlib Figure\n Figure containing the curve.\n\n See Also\n --------\n precision_recall_curve : Compute precision-recall pairs for different\n probability thresholds.\n PrecisionRecallDisplay.from_estimator : Plot Precision Recall Curve given\n a binary classifier.\n PrecisionRecallDisplay.from_predictions : Plot Precision Recall Curve\n using predictions from a binary classifier.\n\n Examples\n --------\n >>> import matplotlib.pyplot as plt\n >>> from sklearn.datasets import make_classification\n >>> from sklearn.metrics import (precision_recall_curve,\n ... PrecisionRecallDisplay)\n >>> from sklearn.model_selection import train_test_split\n >>> from sklearn.svm import SVC\n >>> X, y = make_classification(random_state=0)\n >>> X_train, X_test, y_train, y_test = train_test_split(X, y,\n ... random_state=0)\n >>> clf = SVC(random_state=0)\n >>> clf.fit(X_train, y_train)\n SVC(random_state=0)\n >>> predictions = clf.predict(X_test)\n >>> precision, recall, _ = precision_recall_curve(y_test, predictions)\n >>> disp = PrecisionRecallDisplay(precision=precision, recall=recall)\n >>> disp.plot()\n <...>\n >>> plt.show()\n ", "source_code": "\n\nclass PrecisionRecallDisplay:\n \"\"\"Precision Recall visualization.\n\n It is recommend to use\n :func:`~sklearn.metrics.PrecisionRecallDisplay.from_estimator` or\n :func:`~sklearn.metrics.PrecisionRecallDisplay.from_predictions` to create\n a :class:`~sklearn.metrics.PredictionRecallDisplay`. All parameters are\n stored as attributes.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n -----------\n precision : ndarray\n Precision values.\n\n recall : ndarray\n Recall values.\n\n average_precision : float, default=None\n Average precision. If None, the average precision is not shown.\n\n estimator_name : str, default=None\n Name of estimator. If None, then the estimator name is not shown.\n\n pos_label : str or int, default=None\n The class considered as the positive class. If None, the class will not\n be shown in the legend.\n\n .. versionadded:: 0.24\n\n Attributes\n ----------\n line_ : matplotlib Artist\n Precision recall curve.\n\n ax_ : matplotlib Axes\n Axes with precision recall curve.\n\n figure_ : matplotlib Figure\n Figure containing the curve.\n\n See Also\n --------\n precision_recall_curve : Compute precision-recall pairs for different\n probability thresholds.\n PrecisionRecallDisplay.from_estimator : Plot Precision Recall Curve given\n a binary classifier.\n PrecisionRecallDisplay.from_predictions : Plot Precision Recall Curve\n using predictions from a binary classifier.\n\n Examples\n --------\n >>> import matplotlib.pyplot as plt\n >>> from sklearn.datasets import make_classification\n >>> from sklearn.metrics import (precision_recall_curve,\n ... PrecisionRecallDisplay)\n >>> from sklearn.model_selection import train_test_split\n >>> from sklearn.svm import SVC\n >>> X, y = make_classification(random_state=0)\n >>> X_train, X_test, y_train, y_test = train_test_split(X, y,\n ... 
random_state=0)\n >>> clf = SVC(random_state=0)\n >>> clf.fit(X_train, y_train)\n SVC(random_state=0)\n >>> predictions = clf.predict(X_test)\n >>> precision, recall, _ = precision_recall_curve(y_test, predictions)\n >>> disp = PrecisionRecallDisplay(precision=precision, recall=recall)\n >>> disp.plot()\n <...>\n >>> plt.show()\n \"\"\"\n \n def __init__(self, precision, recall, *, average_precision=None, estimator_name=None, pos_label=None):\n self.estimator_name = estimator_name\n self.precision = precision\n self.recall = recall\n self.average_precision = average_precision\n self.pos_label = pos_label\n \n def plot(self, ax=None, *, name=None, **kwargs):\n \"\"\"Plot visualization.\n\n Extra keyword arguments will be passed to matplotlib's `plot`.\n\n Parameters\n ----------\n ax : Matplotlib Axes, default=None\n Axes object to plot on. If `None`, a new figure and axes is\n created.\n\n name : str, default=None\n Name of precision recall curve for labeling. If `None`, use\n `estimator_name` if not `None`, otherwise no labeling is shown.\n\n **kwargs : dict\n Keyword arguments to be passed to matplotlib's `plot`.\n\n Returns\n -------\n display : :class:`~sklearn.metrics.PrecisionRecallDisplay`\n Object that stores computed values.\n \"\"\"\n check_matplotlib_support('PrecisionRecallDisplay.plot')\n name = self.estimator_name if name is None else name\n line_kwargs = {'drawstyle': 'steps-post'}\n if self.average_precision is not None and name is not None:\n line_kwargs['label'] = f'{name} (AP = {self.average_precision:0.2f})'\n elif self.average_precision is not None:\n line_kwargs['label'] = f'AP = {self.average_precision:0.2f}'\n elif name is not None:\n line_kwargs['label'] = name\n line_kwargs.update(**kwargs)\n import matplotlib.pyplot as plt\n if ax is None:\n (fig, ax) = plt.subplots()\n (self.line_, ) = ax.plot(self.recall, self.precision, **line_kwargs)\n info_pos_label = f' (Positive label: {self.pos_label})' if self.pos_label is not None else ''\n xlabel = 'Recall' + info_pos_label\n ylabel = 'Precision' + info_pos_label\n ax.set(xlabel=xlabel, ylabel=ylabel)\n if 'label' in line_kwargs:\n ax.legend(loc='lower left')\n self.ax_ = ax\n self.figure_ = ax.figure\n return self\n \n @classmethod\n def from_estimator(cls, estimator, X, y, *, sample_weight=None, pos_label=None, response_method='auto', name=None, ax=None, **kwargs):\n \"\"\"Plot precision-recall curve given an estimator and some data.\n\n Parameters\n ----------\n estimator : estimator instance\n Fitted classifier or a fitted :class:`~sklearn.pipeline.Pipeline`\n in which the last estimator is a classifier.\n\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Input values.\n\n y : array-like of shape (n_samples,)\n Target values.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n pos_label : str or int, default=None\n The class considered as the positive class when computing the\n precision and recall metrics. By default, `estimators.classes_[1]`\n is considered as the positive class.\n\n response_method : {'predict_proba', 'decision_function', 'auto'}, default='auto'\n Specifies whether to use :term:`predict_proba` or\n :term:`decision_function` as the target response. If set to 'auto',\n :term:`predict_proba` is tried first and if it does not exist\n :term:`decision_function` is tried next.\n\n name : str, default=None\n Name for labeling curve. If `None`, no name is used.\n\n ax : matplotlib axes, default=None\n Axes object to plot on. 
If `None`, a new figure and axes is created.\n\n **kwargs : dict\n Keyword arguments to be passed to matplotlib's `plot`.\n\n Returns\n -------\n display : :class:`~sklearn.metrics.PrecisionRecallDisplay`\n\n See Also\n --------\n PrecisionRecallDisplay.from_predictions : Plot precision-recall curve\n using estimated probabilities or output of decision function.\n\n Examples\n --------\n >>> import matplotlib.pyplot as plt\n >>> from sklearn.datasets import make_classification\n >>> from sklearn.metrics import PrecisionRecallDisplay\n >>> from sklearn.model_selection import train_test_split\n >>> from sklearn.linear_model import LogisticRegression\n >>> X, y = make_classification(random_state=0)\n >>> X_train, X_test, y_train, y_test = train_test_split(\n ... X, y, random_state=0)\n >>> clf = LogisticRegression()\n >>> clf.fit(X_train, y_train)\n LogisticRegression()\n >>> PrecisionRecallDisplay.from_estimator(\n ... clf, X_test, y_test)\n <...>\n >>> plt.show()\n \"\"\"\n method_name = f'{cls.__name__}.from_estimator'\n check_matplotlib_support(method_name)\n if not is_classifier(estimator):\n raise ValueError(f'{method_name} only supports classifiers')\n (y_pred, pos_label) = _get_response(X, estimator, response_method, pos_label=pos_label)\n name = name if name is not None else estimator.__class__.__name__\n return cls.from_predictions(y, y_pred, sample_weight=sample_weight, name=name, pos_label=pos_label, ax=ax, **kwargs)\n \n @classmethod\n def from_predictions(cls, y_true, y_pred, *, sample_weight=None, pos_label=None, name=None, ax=None, **kwargs):\n \"\"\"Plot precision-recall curve given binary class predictions.\n\n Parameters\n ----------\n y_true : array-like of shape (n_samples,)\n True binary labels.\n\n y_pred : array-like of shape (n_samples,)\n Estimated probabilities or output of decision function.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n pos_label : str or int, default=None\n The class considered as the positive class when computing the\n precision and recall metrics.\n\n name : str, default=None\n Name for labeling curve. If `None`, name will be set to\n `\"Classifier\"`.\n\n ax : matplotlib axes, default=None\n Axes object to plot on. If `None`, a new figure and axes is created.\n\n **kwargs : dict\n Keyword arguments to be passed to matplotlib's `plot`.\n\n Returns\n -------\n display : :class:`~sklearn.metrics.PrecisionRecallDisplay`\n\n See Also\n --------\n PrecisionRecallDisplay.from_estimator : Plot precision-recall curve\n using an estimator.\n\n Examples\n --------\n >>> import matplotlib.pyplot as plt\n >>> from sklearn.datasets import make_classification\n >>> from sklearn.metrics import PrecisionRecallDisplay\n >>> from sklearn.model_selection import train_test_split\n >>> from sklearn.linear_model import LogisticRegression\n >>> X, y = make_classification(random_state=0)\n >>> X_train, X_test, y_train, y_test = train_test_split(\n ... X, y, random_state=0)\n >>> clf = LogisticRegression()\n >>> clf.fit(X_train, y_train)\n LogisticRegression()\n >>> y_pred = clf.predict_proba(X_test)[:, 1]\n >>> PrecisionRecallDisplay.from_predictions(\n ... 
y_test, y_pred)\n <...>\n >>> plt.show()\n \"\"\"\n check_matplotlib_support(f'{cls.__name__}.from_predictions')\n check_consistent_length(y_true, y_pred, sample_weight)\n pos_label = _check_pos_label_consistency(pos_label, y_true)\n (precision, recall, _) = precision_recall_curve(y_true, y_pred, pos_label=pos_label, sample_weight=sample_weight)\n average_precision = average_precision_score(y_true, y_pred, pos_label=pos_label, sample_weight=sample_weight)\n name = name if name is not None else 'Classifier'\n viz = PrecisionRecallDisplay(precision=precision, recall=recall, average_precision=average_precision, estimator_name=name, pos_label=pos_label)\n return viz.plot(ax=ax, name=name, **kwargs)\n" }, @@ -24377,7 +24459,7 @@ "sklearn.metrics._plot.roc_curve.RocCurveDisplay.from_predictions" ], "is_public": true, - "description": "ROC Curve visualization.\n\nIt is recommend to use :func:`~sklearn.metrics.RocCurveDisplay.from_estimator` or :func:`~sklearn.metrics.RocCurveDisplay.from_predictions` to create a :class:`~sklearn.metrics.RocCurveDisplay`. All parameters are stored as attributes. Read more in the :ref:`User Guide `.", + "description": "ROC Curve visualization.\n\nIt is recommend to use\n:func:`~sklearn.metrics.RocCurveDisplay.from_estimator` or\n:func:`~sklearn.metrics.RocCurveDisplay.from_predictions` to create\na :class:`~sklearn.metrics.RocCurveDisplay`. All parameters are\nstored as attributes.\n\nRead more in the :ref:`User Guide `.", "docstring": "ROC Curve visualization.\n\n It is recommend to use\n :func:`~sklearn.metrics.RocCurveDisplay.from_estimator` or\n :func:`~sklearn.metrics.RocCurveDisplay.from_predictions` to create\n a :class:`~sklearn.metrics.RocCurveDisplay`. All parameters are\n stored as attributes.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n fpr : ndarray\n False positive rate.\n\n tpr : ndarray\n True positive rate.\n\n roc_auc : float, default=None\n Area under ROC curve. If None, the roc_auc score is not shown.\n\n estimator_name : str, default=None\n Name of estimator. If None, the estimator name is not shown.\n\n pos_label : str or int, default=None\n The class considered as the positive class when computing the roc auc\n metrics. By default, `estimators.classes_[1]` is considered\n as the positive class.\n\n .. versionadded:: 0.24\n\n Attributes\n ----------\n line_ : matplotlib Artist\n ROC Curve.\n\n ax_ : matplotlib Axes\n Axes with ROC Curve.\n\n figure_ : matplotlib Figure\n Figure containing the curve.\n\n See Also\n --------\n roc_curve : Compute Receiver operating characteristic (ROC) curve.\n RocCurveDisplay.from_estimator : Plot Receiver Operating Characteristic\n (ROC) curve given an estimator and some data.\n RocCurveDisplay.from_predictions : Plot Receiver Operating Characteristic\n (ROC) curve given the true and predicted values.\n roc_auc_score : Compute the area under the ROC curve.\n\n Examples\n --------\n >>> import matplotlib.pyplot as plt\n >>> import numpy as np\n >>> from sklearn import metrics\n >>> y = np.array([0, 0, 1, 1])\n >>> pred = np.array([0.1, 0.4, 0.35, 0.8])\n >>> fpr, tpr, thresholds = metrics.roc_curve(y, pred)\n >>> roc_auc = metrics.auc(fpr, tpr)\n >>> display = metrics.RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=roc_auc,\n ... 
estimator_name='example estimator')\n >>> display.plot()\n <...>\n >>> plt.show()\n ", "source_code": "\n\nclass RocCurveDisplay:\n \"\"\"ROC Curve visualization.\n\n It is recommend to use\n :func:`~sklearn.metrics.RocCurveDisplay.from_estimator` or\n :func:`~sklearn.metrics.RocCurveDisplay.from_predictions` to create\n a :class:`~sklearn.metrics.RocCurveDisplay`. All parameters are\n stored as attributes.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n fpr : ndarray\n False positive rate.\n\n tpr : ndarray\n True positive rate.\n\n roc_auc : float, default=None\n Area under ROC curve. If None, the roc_auc score is not shown.\n\n estimator_name : str, default=None\n Name of estimator. If None, the estimator name is not shown.\n\n pos_label : str or int, default=None\n The class considered as the positive class when computing the roc auc\n metrics. By default, `estimators.classes_[1]` is considered\n as the positive class.\n\n .. versionadded:: 0.24\n\n Attributes\n ----------\n line_ : matplotlib Artist\n ROC Curve.\n\n ax_ : matplotlib Axes\n Axes with ROC Curve.\n\n figure_ : matplotlib Figure\n Figure containing the curve.\n\n See Also\n --------\n roc_curve : Compute Receiver operating characteristic (ROC) curve.\n RocCurveDisplay.from_estimator : Plot Receiver Operating Characteristic\n (ROC) curve given an estimator and some data.\n RocCurveDisplay.from_predictions : Plot Receiver Operating Characteristic\n (ROC) curve given the true and predicted values.\n roc_auc_score : Compute the area under the ROC curve.\n\n Examples\n --------\n >>> import matplotlib.pyplot as plt\n >>> import numpy as np\n >>> from sklearn import metrics\n >>> y = np.array([0, 0, 1, 1])\n >>> pred = np.array([0.1, 0.4, 0.35, 0.8])\n >>> fpr, tpr, thresholds = metrics.roc_curve(y, pred)\n >>> roc_auc = metrics.auc(fpr, tpr)\n >>> display = metrics.RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=roc_auc,\n ... estimator_name='example estimator')\n >>> display.plot()\n <...>\n >>> plt.show()\n \"\"\"\n \n def __init__(self, *, fpr, tpr, roc_auc=None, estimator_name=None, pos_label=None):\n self.estimator_name = estimator_name\n self.fpr = fpr\n self.tpr = tpr\n self.roc_auc = roc_auc\n self.pos_label = pos_label\n \n def plot(self, ax=None, *, name=None, **kwargs):\n \"\"\"Plot visualization\n\n Extra keyword arguments will be passed to matplotlib's ``plot``.\n\n Parameters\n ----------\n ax : matplotlib axes, default=None\n Axes object to plot on. If `None`, a new figure and axes is\n created.\n\n name : str, default=None\n Name of ROC Curve for labeling. 
If `None`, use `estimator_name` if\n not `None`, otherwise no labeling is shown.\n\n Returns\n -------\n display : :class:`~sklearn.metrics.plot.RocCurveDisplay`\n Object that stores computed values.\n \"\"\"\n check_matplotlib_support('RocCurveDisplay.plot')\n name = self.estimator_name if name is None else name\n line_kwargs = {}\n if self.roc_auc is not None and name is not None:\n line_kwargs['label'] = f'{name} (AUC = {self.roc_auc:0.2f})'\n elif self.roc_auc is not None:\n line_kwargs['label'] = f'AUC = {self.roc_auc:0.2f}'\n elif name is not None:\n line_kwargs['label'] = name\n line_kwargs.update(**kwargs)\n import matplotlib.pyplot as plt\n if ax is None:\n (fig, ax) = plt.subplots()\n (self.line_, ) = ax.plot(self.fpr, self.tpr, **line_kwargs)\n info_pos_label = f' (Positive label: {self.pos_label})' if self.pos_label is not None else ''\n xlabel = 'False Positive Rate' + info_pos_label\n ylabel = 'True Positive Rate' + info_pos_label\n ax.set(xlabel=xlabel, ylabel=ylabel)\n if 'label' in line_kwargs:\n ax.legend(loc='lower right')\n self.ax_ = ax\n self.figure_ = ax.figure\n return self\n \n @classmethod\n def from_estimator(cls, estimator, X, y, *, sample_weight=None, drop_intermediate=True, response_method='auto', pos_label=None, name=None, ax=None, **kwargs):\n \"\"\"Create a ROC Curve display from an estimator.\n\n Parameters\n ----------\n estimator : estimator instance\n Fitted classifier or a fitted :class:`~sklearn.pipeline.Pipeline`\n in which the last estimator is a classifier.\n\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Input values.\n\n y : array-like of shape (n_samples,)\n Target values.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n drop_intermediate : bool, default=True\n Whether to drop some suboptimal thresholds which would not appear\n on a plotted ROC curve. This is useful in order to create lighter\n ROC curves.\n\n response_method : {'predict_proba', 'decision_function', 'auto'} default='auto'\n Specifies whether to use :term:`predict_proba` or\n :term:`decision_function` as the target response. If set to 'auto',\n :term:`predict_proba` is tried first and if it does not exist\n :term:`decision_function` is tried next.\n\n pos_label : str or int, default=None\n The class considered as the positive class when computing the roc auc\n metrics. By default, `estimators.classes_[1]` is considered\n as the positive class.\n\n name : str, default=None\n Name of ROC Curve for labeling. If `None`, use the name of the\n estimator.\n\n ax : matplotlib axes, default=None\n Axes object to plot on. If `None`, a new figure and axes is created.\n\n **kwargs : dict\n Keyword arguments to be passed to matplotlib's `plot`.\n\n Returns\n -------\n display : :class:`~sklearn.metrics.plot.RocCurveDisplay`\n The ROC Curve display.\n\n See Also\n --------\n roc_curve : Compute Receiver operating characteristic (ROC) curve.\n RocCurveDisplay.from_predictions : ROC Curve visualization given the\n probabilities of scores of a classifier.\n roc_auc_score : Compute the area under the ROC curve.\n\n Examples\n --------\n >>> import matplotlib.pyplot as plt\n >>> from sklearn.datasets import make_classification\n >>> from sklearn.metrics import RocCurveDisplay\n >>> from sklearn.model_selection import train_test_split\n >>> from sklearn.svm import SVC\n >>> X, y = make_classification(random_state=0)\n >>> X_train, X_test, y_train, y_test = train_test_split(\n ... 
X, y, random_state=0)\n >>> clf = SVC(random_state=0).fit(X_train, y_train)\n >>> RocCurveDisplay.from_estimator(\n ... clf, X_test, y_test)\n <...>\n >>> plt.show()\n \"\"\"\n check_matplotlib_support(f'{cls.__name__}.from_estimator')\n name = estimator.__class__.__name__ if name is None else name\n (y_pred, pos_label) = _get_response(X, estimator, response_method=response_method, pos_label=pos_label)\n return cls.from_predictions(y_true=y, y_pred=y_pred, sample_weight=sample_weight, drop_intermediate=drop_intermediate, name=name, ax=ax, pos_label=pos_label, **kwargs)\n \n @classmethod\n def from_predictions(cls, y_true, y_pred, *, sample_weight=None, drop_intermediate=True, pos_label=None, name=None, ax=None, **kwargs):\n \"\"\"Plot ROC curve given the true and predicted values.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 1.0\n\n Parameters\n ----------\n y_true : array-like of shape (n_samples,)\n True labels.\n\n y_pred : array-like of shape (n_samples,)\n Target scores, can either be probability estimates of the positive\n class, confidence values, or non-thresholded measure of decisions\n (as returned by \u201cdecision_function\u201d on some classifiers).\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n drop_intermediate : bool, default=True\n Whether to drop some suboptimal thresholds which would not appear\n on a plotted ROC curve. This is useful in order to create lighter\n ROC curves.\n\n pos_label : str or int, default=None\n The label of the positive class. When `pos_label=None`, if `y_true`\n is in {-1, 1} or {0, 1}, `pos_label` is set to 1, otherwise an\n error will be raised.\n\n name : str, default=None\n Name of ROC curve for labeling. If `None`, name will be set to\n `\"Classifier\"`.\n\n ax : matplotlib axes, default=None\n Axes object to plot on. If `None`, a new figure and axes is\n created.\n\n **kwargs : dict\n Additional keywords arguments passed to matplotlib `plot` function.\n\n Returns\n -------\n display : :class:`~sklearn.metrics.DetCurveDisplay`\n Object that stores computed values.\n\n See Also\n --------\n roc_curve : Compute Receiver operating characteristic (ROC) curve.\n RocCurveDisplay.from_estimator : ROC Curve visualization given an\n estimator and some data.\n roc_auc_score : Compute the area under the ROC curve.\n\n Examples\n --------\n >>> import matplotlib.pyplot as plt\n >>> from sklearn.datasets import make_classification\n >>> from sklearn.metrics import RocCurveDisplay\n >>> from sklearn.model_selection import train_test_split\n >>> from sklearn.svm import SVC\n >>> X, y = make_classification(random_state=0)\n >>> X_train, X_test, y_train, y_test = train_test_split(\n ... X, y, random_state=0)\n >>> clf = SVC(random_state=0).fit(X_train, y_train)\n >>> y_pred = clf.decision_function(X_test)\n >>> RocCurveDisplay.from_predictions(\n ... 
y_test, y_pred)\n <...>\n >>> plt.show()\n \"\"\"\n check_matplotlib_support(f'{cls.__name__}.from_predictions')\n (fpr, tpr, _) = roc_curve(y_true, y_pred, pos_label=pos_label, sample_weight=sample_weight, drop_intermediate=drop_intermediate)\n roc_auc = auc(fpr, tpr)\n name = 'Classifier' if name is None else name\n pos_label = _check_pos_label_consistency(pos_label, y_true)\n viz = RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=roc_auc, estimator_name=name, pos_label=pos_label)\n return viz.plot(ax=ax, name=name, **kwargs)\n" }, @@ -24410,7 +24492,7 @@ "sklearn.metrics._scorer._MultimetricScorer._use_cache" ], "is_public": false, - "description": "Callable for multimetric scoring used to avoid repeated calls to `predict_proba`, `predict`, and `decision_function`.\n\n`_MultimetricScorer` will return a dictionary of scores corresponding to the scorers in the dictionary. Note that `_MultimetricScorer` can be created with a dictionary with one key (i.e. only one actual scorer).", + "description": "Callable for multimetric scoring used to avoid repeated calls\nto `predict_proba`, `predict`, and `decision_function`.\n\n`_MultimetricScorer` will return a dictionary of scores corresponding to\nthe scorers in the dictionary. Note that `_MultimetricScorer` can be\ncreated with a dictionary with one key (i.e. only one actual scorer).", "docstring": "Callable for multimetric scoring used to avoid repeated calls\n to `predict_proba`, `predict`, and `decision_function`.\n\n `_MultimetricScorer` will return a dictionary of scores corresponding to\n the scorers in the dictionary. Note that `_MultimetricScorer` can be\n created with a dictionary with one key (i.e. only one actual scorer).\n\n Parameters\n ----------\n scorers : dict\n Dictionary mapping names to callable scorers.\n ", "source_code": "\n\nclass _MultimetricScorer:\n \"\"\"Callable for multimetric scoring used to avoid repeated calls\n to `predict_proba`, `predict`, and `decision_function`.\n\n `_MultimetricScorer` will return a dictionary of scores corresponding to\n the scorers in the dictionary. Note that `_MultimetricScorer` can be\n created with a dictionary with one key (i.e. 
only one actual scorer).\n\n Parameters\n ----------\n scorers : dict\n Dictionary mapping names to callable scorers.\n \"\"\"\n \n def __init__(self, **scorers):\n self._scorers = scorers\n \n def __call__(self, estimator, *args, **kwargs):\n \"\"\"Evaluate predicted target values.\"\"\"\n scores = {}\n cache = {} if self._use_cache(estimator) else None\n cached_call = partial(_cached_call, cache)\n for (name, scorer) in self._scorers.items():\n if isinstance(scorer, _BaseScorer):\n score = scorer._score(cached_call, estimator, *args, **kwargs)\n else:\n score = scorer(estimator, *args, **kwargs)\n scores[name] = score\n return scores\n \n def _use_cache(self, estimator):\n \"\"\"Return True if using a cache is beneficial.\n\n Caching may be beneficial when one of these conditions holds:\n - `_ProbaScorer` will be called twice.\n - `_PredictScorer` will be called twice.\n - `_ThresholdScorer` will be called twice.\n - `_ThresholdScorer` and `_PredictScorer` are called and\n estimator is a regressor.\n - `_ThresholdScorer` and `_ProbaScorer` are called and\n estimator does not have a `decision_function` attribute.\n\n \"\"\"\n if len(self._scorers) == 1:\n return False\n counter = Counter([type(v) for v in self._scorers.values()])\n if any((counter[known_type] > 1 for known_type in [_PredictScorer, _ProbaScorer, _ThresholdScorer])):\n return True\n if counter[_ThresholdScorer]:\n if is_regressor(estimator) and counter[_PredictScorer]:\n return True\n elif counter[_ProbaScorer] and not hasattr(estimator, 'decision_function'):\n return True\n return False\n" }, @@ -24484,7 +24566,7 @@ "sklearn.mixture._base.BaseMixture._print_verbose_msg_init_end" ], "is_public": false, - "description": "Base class for mixture models.\n\nThis abstract class specifies an interface for all mixture classes and provides basic common methods for mixture models.", + "description": "Base class for mixture models.\n\nThis abstract class specifies an interface for all mixture classes and\nprovides basic common methods for mixture models.", "docstring": "Base class for mixture models.\n\n This abstract class specifies an interface for all mixture classes and\n provides basic common methods for mixture models.\n ", "source_code": "\n\nclass BaseMixture(DensityMixin, BaseEstimator, metaclass=ABCMeta):\n \"\"\"Base class for mixture models.\n\n This abstract class specifies an interface for all mixture classes and\n provides basic common methods for mixture models.\n \"\"\"\n \n def __init__(self, n_components, tol, reg_covar, max_iter, n_init, init_params, random_state, warm_start, verbose, verbose_interval):\n self.n_components = n_components\n self.tol = tol\n self.reg_covar = reg_covar\n self.max_iter = max_iter\n self.n_init = n_init\n self.init_params = init_params\n self.random_state = random_state\n self.warm_start = warm_start\n self.verbose = verbose\n self.verbose_interval = verbose_interval\n \n def _check_initial_parameters(self, X):\n \"\"\"Check values of the basic parameters.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n \"\"\"\n if self.n_components < 1:\n raise ValueError(\"Invalid value for 'n_components': %d Estimation requires at least one component\" % self.n_components)\n if self.tol < 0.0:\n raise ValueError(\"Invalid value for 'tol': %.5f Tolerance used by the EM must be non-negative\" % self.tol)\n if self.n_init < 1:\n raise ValueError(\"Invalid value for 'n_init': %d Estimation requires at least one run\" % self.n_init)\n if self.max_iter < 1:\n raise 
ValueError(\"Invalid value for 'max_iter': %d Estimation requires at least one iteration\" % self.max_iter)\n if self.reg_covar < 0.0:\n raise ValueError(\"Invalid value for 'reg_covar': %.5f regularization on covariance must be non-negative\" % self.reg_covar)\n self._check_parameters(X)\n \n @abstractmethod\n def _check_parameters(self, X):\n \"\"\"Check initial parameters of the derived class.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n \"\"\"\n pass\n \n def _initialize_parameters(self, X, random_state):\n \"\"\"Initialize the model parameters.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n\n random_state : RandomState\n A random number generator instance that controls the random seed\n used for the method chosen to initialize the parameters.\n \"\"\"\n (n_samples, _) = X.shape\n if self.init_params == 'kmeans':\n resp = np.zeros((n_samples, self.n_components))\n label = cluster.KMeans(n_clusters=self.n_components, n_init=1, random_state=random_state).fit(X).labels_\n resp[np.arange(n_samples), label] = 1\n elif self.init_params == 'random':\n resp = random_state.rand(n_samples, self.n_components)\n resp /= resp.sum(axis=1)[:, np.newaxis]\n else:\n raise ValueError(\"Unimplemented initialization method '%s'\" % self.init_params)\n self._initialize(X, resp)\n \n @abstractmethod\n def _initialize(self, X, resp):\n \"\"\"Initialize the model parameters of the derived class.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n\n resp : array-like of shape (n_samples, n_components)\n \"\"\"\n pass\n \n def fit(self, X, y=None):\n \"\"\"Estimate model parameters with the EM algorithm.\n\n The method fits the model ``n_init`` times and sets the parameters with\n which the model has the largest likelihood or lower bound. Within each\n trial, the method iterates between E-step and M-step for ``max_iter``\n times until the change of likelihood or lower bound is less than\n ``tol``, otherwise, a ``ConvergenceWarning`` is raised.\n If ``warm_start`` is ``True``, then ``n_init`` is ignored and a single\n initialization is performed upon the first call. Upon consecutive\n calls, training starts where it left off.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n List of n_features-dimensional data points. Each row\n corresponds to a single data point.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n Returns\n -------\n self : object\n The fitted mixture.\n \"\"\"\n self.fit_predict(X, y)\n return self\n \n def fit_predict(self, X, y=None):\n \"\"\"Estimate model parameters using X and predict the labels for X.\n\n The method fits the model n_init times and sets the parameters with\n which the model has the largest likelihood or lower bound. Within each\n trial, the method iterates between E-step and M-step for `max_iter`\n times until the change of likelihood or lower bound is less than\n `tol`, otherwise, a :class:`~sklearn.exceptions.ConvergenceWarning` is\n raised. After fitting, it predicts the most probable label for the\n input data points.\n\n .. versionadded:: 0.20\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n List of n_features-dimensional data points. 
Each row\n corresponds to a single data point.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n Returns\n -------\n labels : array, shape (n_samples,)\n Component labels.\n \"\"\"\n X = self._validate_data(X, dtype=[np.float64, np.float32], ensure_min_samples=2)\n if X.shape[0] < self.n_components:\n raise ValueError(f'Expected n_samples >= n_components but got n_components = {self.n_components}, n_samples = {X.shape[0]}')\n self._check_initial_parameters(X)\n do_init = not (self.warm_start and hasattr(self, 'converged_'))\n n_init = self.n_init if do_init else 1\n max_lower_bound = -np.inf\n self.converged_ = False\n random_state = check_random_state(self.random_state)\n (n_samples, _) = X.shape\n for init in range(n_init):\n self._print_verbose_msg_init_beg(init)\n if do_init:\n self._initialize_parameters(X, random_state)\n lower_bound = -np.inf if do_init else self.lower_bound_\n for n_iter in range(1, self.max_iter + 1):\n prev_lower_bound = lower_bound\n (log_prob_norm, log_resp) = self._e_step(X)\n self._m_step(X, log_resp)\n lower_bound = self._compute_lower_bound(log_resp, log_prob_norm)\n change = lower_bound - prev_lower_bound\n self._print_verbose_msg_iter_end(n_iter, change)\n if abs(change) < self.tol:\n self.converged_ = True\n break\n self._print_verbose_msg_init_end(lower_bound)\n if lower_bound > max_lower_bound or max_lower_bound == -np.inf:\n max_lower_bound = lower_bound\n best_params = self._get_parameters()\n best_n_iter = n_iter\n if not self.converged_:\n warnings.warn('Initialization %d did not converge. Try different init parameters, or increase max_iter, tol or check for degenerate data.' % (init + 1), ConvergenceWarning)\n self._set_parameters(best_params)\n self.n_iter_ = best_n_iter\n self.lower_bound_ = max_lower_bound\n (_, log_resp) = self._e_step(X)\n return log_resp.argmax(axis=1)\n \n def _e_step(self, X):\n \"\"\"E step.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n\n Returns\n -------\n log_prob_norm : float\n Mean of the logarithms of the probabilities of each sample in X\n\n log_responsibility : array, shape (n_samples, n_components)\n Logarithm of the posterior probabilities (or responsibilities) of\n the point of each sample in X.\n \"\"\"\n (log_prob_norm, log_resp) = self._estimate_log_prob_resp(X)\n return np.mean(log_prob_norm), log_resp\n \n @abstractmethod\n def _m_step(self, X, log_resp):\n \"\"\"M step.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n\n log_resp : array-like of shape (n_samples, n_components)\n Logarithm of the posterior probabilities (or responsibilities) of\n the point of each sample in X.\n \"\"\"\n pass\n \n @abstractmethod\n def _get_parameters(self):\n pass\n \n @abstractmethod\n def _set_parameters(self, params):\n pass\n \n def score_samples(self, X):\n \"\"\"Compute the log-likelihood of each sample.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n List of n_features-dimensional data points. 
Each row\n corresponds to a single data point.\n\n Returns\n -------\n log_prob : array, shape (n_samples,)\n Log-likelihood of each sample in `X` under the current model.\n \"\"\"\n check_is_fitted(self)\n X = self._validate_data(X, reset=False)\n return logsumexp(self._estimate_weighted_log_prob(X), axis=1)\n \n def score(self, X, y=None):\n \"\"\"Compute the per-sample average log-likelihood of the given data X.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_dimensions)\n List of n_features-dimensional data points. Each row\n corresponds to a single data point.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n Returns\n -------\n log_likelihood : float\n Log-likelihood of `X` under the Gaussian mixture model.\n \"\"\"\n return self.score_samples(X).mean()\n \n def predict(self, X):\n \"\"\"Predict the labels for the data samples in X using trained model.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n List of n_features-dimensional data points. Each row\n corresponds to a single data point.\n\n Returns\n -------\n labels : array, shape (n_samples,)\n Component labels.\n \"\"\"\n check_is_fitted(self)\n X = self._validate_data(X, reset=False)\n return self._estimate_weighted_log_prob(X).argmax(axis=1)\n \n def predict_proba(self, X):\n \"\"\"Evaluate the components' density for each sample.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n List of n_features-dimensional data points. Each row\n corresponds to a single data point.\n\n Returns\n -------\n resp : array, shape (n_samples, n_components)\n Density of each Gaussian component for each sample in X.\n \"\"\"\n check_is_fitted(self)\n X = self._validate_data(X, reset=False)\n (_, log_resp) = self._estimate_log_prob_resp(X)\n return np.exp(log_resp)\n \n def sample(self, n_samples=1):\n \"\"\"Generate random samples from the fitted Gaussian distribution.\n\n Parameters\n ----------\n n_samples : int, default=1\n Number of samples to generate.\n\n Returns\n -------\n X : array, shape (n_samples, n_features)\n Randomly generated sample.\n\n y : array, shape (nsamples,)\n Component labels.\n \"\"\"\n check_is_fitted(self)\n if n_samples < 1:\n raise ValueError(\"Invalid value for 'n_samples': %d . 
The sampling requires at least one sample.\" % self.n_components)\n (_, n_features) = self.means_.shape\n rng = check_random_state(self.random_state)\n n_samples_comp = rng.multinomial(n_samples, self.weights_)\n if self.covariance_type == 'full':\n X = np.vstack([rng.multivariate_normal(mean, covariance, int(sample)) for (mean, covariance, sample) in zip(self.means_, self.covariances_, n_samples_comp)])\n elif self.covariance_type == 'tied':\n X = np.vstack([rng.multivariate_normal(mean, self.covariances_, int(sample)) for (mean, sample) in zip(self.means_, n_samples_comp)])\n else:\n X = np.vstack([mean + rng.randn(sample, n_features) * np.sqrt(covariance) for (mean, covariance, sample) in zip(self.means_, self.covariances_, n_samples_comp)])\n y = np.concatenate([np.full(sample, j, dtype=int) for (j, sample) in enumerate(n_samples_comp)])\n return X, y\n \n def _estimate_weighted_log_prob(self, X):\n \"\"\"Estimate the weighted log-probabilities, log P(X | Z) + log weights.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n\n Returns\n -------\n weighted_log_prob : array, shape (n_samples, n_component)\n \"\"\"\n return self._estimate_log_prob(X) + self._estimate_log_weights()\n \n @abstractmethod\n def _estimate_log_weights(self):\n \"\"\"Estimate log-weights in EM algorithm, E[ log pi ] in VB algorithm.\n\n Returns\n -------\n log_weight : array, shape (n_components, )\n \"\"\"\n pass\n \n @abstractmethod\n def _estimate_log_prob(self, X):\n \"\"\"Estimate the log-probabilities log P(X | Z).\n\n Compute the log-probabilities per each component for each sample.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n\n Returns\n -------\n log_prob : array, shape (n_samples, n_component)\n \"\"\"\n pass\n \n def _estimate_log_prob_resp(self, X):\n \"\"\"Estimate log probabilities and responsibilities for each sample.\n\n Compute the log probabilities, weighted log probabilities per\n component and responsibilities for each sample in X with respect to\n the current state of the model.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n\n Returns\n -------\n log_prob_norm : array, shape (n_samples,)\n log p(X)\n\n log_responsibilities : array, shape (n_samples, n_components)\n logarithm of the responsibilities\n \"\"\"\n weighted_log_prob = self._estimate_weighted_log_prob(X)\n log_prob_norm = logsumexp(weighted_log_prob, axis=1)\n with np.errstate(under='ignore'):\n log_resp = weighted_log_prob - log_prob_norm[:, np.newaxis]\n return log_prob_norm, log_resp\n \n def _print_verbose_msg_init_beg(self, n_init):\n \"\"\"Print verbose message on initialization.\"\"\"\n if self.verbose == 1:\n print('Initialization %d' % n_init)\n elif self.verbose >= 2:\n print('Initialization %d' % n_init)\n self._init_prev_time = time()\n self._iter_prev_time = self._init_prev_time\n \n def _print_verbose_msg_iter_end(self, n_iter, diff_ll):\n \"\"\"Print verbose message on initialization.\"\"\"\n if n_iter % self.verbose_interval == 0:\n if self.verbose == 1:\n print(' Iteration %d' % n_iter)\n elif self.verbose >= 2:\n cur_time = time()\n print(' Iteration %d\\t time lapse %.5fs\\t ll change %.5f' % (n_iter, cur_time - self._iter_prev_time, diff_ll))\n self._iter_prev_time = cur_time\n \n def _print_verbose_msg_init_end(self, ll):\n \"\"\"Print verbose message on the end of iteration.\"\"\"\n if self.verbose == 1:\n print('Initialization converged: %s' % self.converged_)\n elif self.verbose >= 2:\n print('Initialization 
converged: %s\\t time lapse %.5fs\\t ll %.5f' % (self.converged_, time() - self._init_prev_time, ll))\n" }, @@ -24516,7 +24598,7 @@ "sklearn.mixture._bayesian_mixture.BayesianGaussianMixture._set_parameters" ], "is_public": true, - "description": "Variational Bayesian estimation of a Gaussian mixture.\n\nThis class allows to infer an approximate posterior distribution over the parameters of a Gaussian mixture distribution. The effective number of components can be inferred from the data. This class implements two types of prior for the weights distribution: a finite mixture model with Dirichlet distribution and an infinite mixture model with the Dirichlet Process. In practice Dirichlet Process inference algorithm is approximated and uses a truncated distribution with a fixed maximum number of components (called the Stick-breaking representation). The number of components actually used almost always depends on the data. .. versionadded:: 0.18 Read more in the :ref:`User Guide `.", + "description": "Variational Bayesian estimation of a Gaussian mixture.\n\nThis class allows to infer an approximate posterior distribution over the\nparameters of a Gaussian mixture distribution. The effective number of\ncomponents can be inferred from the data.\n\nThis class implements two types of prior for the weights distribution: a\nfinite mixture model with Dirichlet distribution and an infinite mixture\nmodel with the Dirichlet Process. In practice Dirichlet Process inference\nalgorithm is approximated and uses a truncated distribution with a fixed\nmaximum number of components (called the Stick-breaking representation).\nThe number of components actually used almost always depends on the data.\n\n.. versionadded:: 0.18\n\nRead more in the :ref:`User Guide `.", "docstring": "Variational Bayesian estimation of a Gaussian mixture.\n\n This class allows to infer an approximate posterior distribution over the\n parameters of a Gaussian mixture distribution. The effective number of\n components can be inferred from the data.\n\n This class implements two types of prior for the weights distribution: a\n finite mixture model with Dirichlet distribution and an infinite mixture\n model with the Dirichlet Process. In practice Dirichlet Process inference\n algorithm is approximated and uses a truncated distribution with a fixed\n maximum number of components (called the Stick-breaking representation).\n The number of components actually used almost always depends on the data.\n\n .. versionadded:: 0.18\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n n_components : int, default=1\n The number of mixture components. Depending on the data and the value\n of the `weight_concentration_prior` the model can decide to not use\n all the components by setting some component `weights_` to values very\n close to zero. The number of effective components is therefore smaller\n than n_components.\n\n covariance_type : {'full', 'tied', 'diag', 'spherical'}, default='full'\n String describing the type of covariance parameters to use.\n Must be one of::\n\n 'full' (each component has its own general covariance matrix),\n 'tied' (all components share the same general covariance matrix),\n 'diag' (each component has its own diagonal covariance matrix),\n 'spherical' (each component has its own single variance).\n\n tol : float, default=1e-3\n The convergence threshold. 
EM iterations will stop when the\n lower bound average gain on the likelihood (of the training data with\n respect to the model) is below this threshold.\n\n reg_covar : float, default=1e-6\n Non-negative regularization added to the diagonal of covariance.\n Allows to assure that the covariance matrices are all positive.\n\n max_iter : int, default=100\n The number of EM iterations to perform.\n\n n_init : int, default=1\n The number of initializations to perform. The result with the highest\n lower bound value on the likelihood is kept.\n\n init_params : {'kmeans', 'random'}, default='kmeans'\n The method used to initialize the weights, the means and the\n covariances.\n Must be one of::\n\n 'kmeans' : responsibilities are initialized using kmeans.\n 'random' : responsibilities are initialized randomly.\n\n weight_concentration_prior_type : str, default='dirichlet_process'\n String describing the type of the weight concentration prior.\n Must be one of::\n\n 'dirichlet_process' (using the Stick-breaking representation),\n 'dirichlet_distribution' (can favor more uniform weights).\n\n weight_concentration_prior : float or None, default=None\n The dirichlet concentration of each component on the weight\n distribution (Dirichlet). This is commonly called gamma in the\n literature. The higher concentration puts more mass in\n the center and will lead to more components being active, while a lower\n concentration parameter will lead to more mass at the edge of the\n mixture weights simplex. The value of the parameter must be greater\n than 0. If it is None, it's set to ``1. / n_components``.\n\n mean_precision_prior : float or None, default=None\n The precision prior on the mean distribution (Gaussian).\n Controls the extent of where means can be placed. Larger\n values concentrate the cluster means around `mean_prior`.\n The value of the parameter must be greater than 0.\n If it is None, it is set to 1.\n\n mean_prior : array-like, shape (n_features,), default=None\n The prior on the mean distribution (Gaussian).\n If it is None, it is set to the mean of X.\n\n degrees_of_freedom_prior : float or None, default=None\n The prior of the number of degrees of freedom on the covariance\n distributions (Wishart). If it is None, it's set to `n_features`.\n\n covariance_prior : float or array-like, default=None\n The prior on the covariance distribution (Wishart).\n If it is None, the emiprical covariance prior is initialized using the\n covariance of X. The shape depends on `covariance_type`::\n\n (n_features, n_features) if 'full',\n (n_features, n_features) if 'tied',\n (n_features) if 'diag',\n float if 'spherical'\n\n random_state : int, RandomState instance or None, default=None\n Controls the random seed given to the method chosen to initialize the\n parameters (see `init_params`).\n In addition, it controls the generation of random samples from the\n fitted distribution (see the method `sample`).\n Pass an int for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n warm_start : bool, default=False\n If 'warm_start' is True, the solution of the last fitting is used as\n initialization for the next call of fit(). This can speed up\n convergence when fit is called several times on similar problems.\n See :term:`the Glossary `.\n\n verbose : int, default=0\n Enable verbose output. If 1 then it prints the current\n initialization and each iteration step. 
If greater than 1 then\n it prints also the log probability and the time needed\n for each step.\n\n verbose_interval : int, default=10\n Number of iteration done before the next print.\n\n Attributes\n ----------\n weights_ : array-like of shape (n_components,)\n The weights of each mixture components.\n\n means_ : array-like of shape (n_components, n_features)\n The mean of each mixture component.\n\n covariances_ : array-like\n The covariance of each mixture component.\n The shape depends on `covariance_type`::\n\n (n_components,) if 'spherical',\n (n_features, n_features) if 'tied',\n (n_components, n_features) if 'diag',\n (n_components, n_features, n_features) if 'full'\n\n precisions_ : array-like\n The precision matrices for each component in the mixture. A precision\n matrix is the inverse of a covariance matrix. A covariance matrix is\n symmetric positive definite so the mixture of Gaussian can be\n equivalently parameterized by the precision matrices. Storing the\n precision matrices instead of the covariance matrices makes it more\n efficient to compute the log-likelihood of new samples at test time.\n The shape depends on ``covariance_type``::\n\n (n_components,) if 'spherical',\n (n_features, n_features) if 'tied',\n (n_components, n_features) if 'diag',\n (n_components, n_features, n_features) if 'full'\n\n precisions_cholesky_ : array-like\n The cholesky decomposition of the precision matrices of each mixture\n component. A precision matrix is the inverse of a covariance matrix.\n A covariance matrix is symmetric positive definite so the mixture of\n Gaussian can be equivalently parameterized by the precision matrices.\n Storing the precision matrices instead of the covariance matrices makes\n it more efficient to compute the log-likelihood of new samples at test\n time. The shape depends on ``covariance_type``::\n\n (n_components,) if 'spherical',\n (n_features, n_features) if 'tied',\n (n_components, n_features) if 'diag',\n (n_components, n_features, n_features) if 'full'\n\n converged_ : bool\n True when convergence was reached in fit(), False otherwise.\n\n n_iter_ : int\n Number of step used by the best fit of inference to reach the\n convergence.\n\n lower_bound_ : float\n Lower bound value on the likelihood (of the training data with\n respect to the model) of the best fit of inference.\n\n weight_concentration_prior_ : tuple or float\n The dirichlet concentration of each component on the weight\n distribution (Dirichlet). 
The type depends on\n ``weight_concentration_prior_type``::\n\n (float, float) if 'dirichlet_process' (Beta parameters),\n float if 'dirichlet_distribution' (Dirichlet parameters).\n\n The higher concentration puts more mass in\n the center and will lead to more components being active, while a lower\n concentration parameter will lead to more mass at the edge of the\n simplex.\n\n weight_concentration_ : array-like of shape (n_components,)\n The dirichlet concentration of each component on the weight\n distribution (Dirichlet).\n\n mean_precision_prior_ : float\n The precision prior on the mean distribution (Gaussian).\n Controls the extent of where means can be placed.\n Larger values concentrate the cluster means around `mean_prior`.\n If mean_precision_prior is set to None, `mean_precision_prior_` is set\n to 1.\n\n mean_precision_ : array-like of shape (n_components,)\n The precision of each components on the mean distribution (Gaussian).\n\n mean_prior_ : array-like of shape (n_features,)\n The prior on the mean distribution (Gaussian).\n\n degrees_of_freedom_prior_ : float\n The prior of the number of degrees of freedom on the covariance\n distributions (Wishart).\n\n degrees_of_freedom_ : array-like of shape (n_components,)\n The number of degrees of freedom of each components in the model.\n\n covariance_prior_ : float or array-like\n The prior on the covariance distribution (Wishart).\n The shape depends on `covariance_type`::\n\n (n_features, n_features) if 'full',\n (n_features, n_features) if 'tied',\n (n_features) if 'diag',\n float if 'spherical'\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n GaussianMixture : Finite Gaussian mixture fit with EM.\n\n References\n ----------\n\n .. [1] `Bishop, Christopher M. (2006). \"Pattern recognition and machine\n learning\". Vol. 4 No. 4. New York: Springer.\n `_\n\n .. [2] `Hagai Attias. (2000). \"A Variational Bayesian Framework for\n Graphical Models\". In Advances in Neural Information Processing\n Systems 12.\n `_\n\n .. [3] `Blei, David M. and Michael I. Jordan. (2006). \"Variational\n inference for Dirichlet process mixtures\". Bayesian analysis 1.1\n `_\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.mixture import BayesianGaussianMixture\n >>> X = np.array([[1, 2], [1, 4], [1, 0], [4, 2], [12, 4], [10, 7]])\n >>> bgm = BayesianGaussianMixture(n_components=2, random_state=42).fit(X)\n >>> bgm.means_\n array([[2.49... , 2.29...],\n [8.45..., 4.52... ]])\n >>> bgm.predict([[0, 0], [9, 3]])\n array([0, 1])\n ", "source_code": "\n\nclass BayesianGaussianMixture(BaseMixture):\n \"\"\"Variational Bayesian estimation of a Gaussian mixture.\n\n This class allows to infer an approximate posterior distribution over the\n parameters of a Gaussian mixture distribution. The effective number of\n components can be inferred from the data.\n\n This class implements two types of prior for the weights distribution: a\n finite mixture model with Dirichlet distribution and an infinite mixture\n model with the Dirichlet Process. 
In practice Dirichlet Process inference\n algorithm is approximated and uses a truncated distribution with a fixed\n maximum number of components (called the Stick-breaking representation).\n The number of components actually used almost always depends on the data.\n\n .. versionadded:: 0.18\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n n_components : int, default=1\n The number of mixture components. Depending on the data and the value\n of the `weight_concentration_prior` the model can decide to not use\n all the components by setting some component `weights_` to values very\n close to zero. The number of effective components is therefore smaller\n than n_components.\n\n covariance_type : {'full', 'tied', 'diag', 'spherical'}, default='full'\n String describing the type of covariance parameters to use.\n Must be one of::\n\n 'full' (each component has its own general covariance matrix),\n 'tied' (all components share the same general covariance matrix),\n 'diag' (each component has its own diagonal covariance matrix),\n 'spherical' (each component has its own single variance).\n\n tol : float, default=1e-3\n The convergence threshold. EM iterations will stop when the\n lower bound average gain on the likelihood (of the training data with\n respect to the model) is below this threshold.\n\n reg_covar : float, default=1e-6\n Non-negative regularization added to the diagonal of covariance.\n Allows to assure that the covariance matrices are all positive.\n\n max_iter : int, default=100\n The number of EM iterations to perform.\n\n n_init : int, default=1\n The number of initializations to perform. The result with the highest\n lower bound value on the likelihood is kept.\n\n init_params : {'kmeans', 'random'}, default='kmeans'\n The method used to initialize the weights, the means and the\n covariances.\n Must be one of::\n\n 'kmeans' : responsibilities are initialized using kmeans.\n 'random' : responsibilities are initialized randomly.\n\n weight_concentration_prior_type : str, default='dirichlet_process'\n String describing the type of the weight concentration prior.\n Must be one of::\n\n 'dirichlet_process' (using the Stick-breaking representation),\n 'dirichlet_distribution' (can favor more uniform weights).\n\n weight_concentration_prior : float or None, default=None\n The dirichlet concentration of each component on the weight\n distribution (Dirichlet). This is commonly called gamma in the\n literature. The higher concentration puts more mass in\n the center and will lead to more components being active, while a lower\n concentration parameter will lead to more mass at the edge of the\n mixture weights simplex. The value of the parameter must be greater\n than 0. If it is None, it's set to ``1. / n_components``.\n\n mean_precision_prior : float or None, default=None\n The precision prior on the mean distribution (Gaussian).\n Controls the extent of where means can be placed. Larger\n values concentrate the cluster means around `mean_prior`.\n The value of the parameter must be greater than 0.\n If it is None, it is set to 1.\n\n mean_prior : array-like, shape (n_features,), default=None\n The prior on the mean distribution (Gaussian).\n If it is None, it is set to the mean of X.\n\n degrees_of_freedom_prior : float or None, default=None\n The prior of the number of degrees of freedom on the covariance\n distributions (Wishart). 
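The parameter text above notes that, with a Dirichlet-process prior, the model can push most component `weights_` towards zero, so the effective number of components is inferred from the data rather than fixed by `n_components`. A minimal sketch of that behaviour using the public scikit-learn API (the synthetic clusters and the `weight_concentration_prior` value are illustrative assumptions, not taken from the diffed file):

```python
import numpy as np
from sklearn.mixture import BayesianGaussianMixture

rng = np.random.RandomState(0)
# two well-separated clusters, but the mixture is over-specified with 6 components
X = np.vstack([rng.normal(mean, 0.5, size=(200, 2))
               for mean in ((0.0, 0.0), (5.0, 5.0))])

bgm = BayesianGaussianMixture(
    n_components=6,
    weight_concentration_prior_type="dirichlet_process",
    weight_concentration_prior=1e-3,  # small gamma keeps few components active
    random_state=0,
).fit(X)

# most of the six mixture weights collapse towards zero
print(np.round(bgm.weights_, 3))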
If it is None, it's set to `n_features`.\n\n covariance_prior : float or array-like, default=None\n The prior on the covariance distribution (Wishart).\n If it is None, the emiprical covariance prior is initialized using the\n covariance of X. The shape depends on `covariance_type`::\n\n (n_features, n_features) if 'full',\n (n_features, n_features) if 'tied',\n (n_features) if 'diag',\n float if 'spherical'\n\n random_state : int, RandomState instance or None, default=None\n Controls the random seed given to the method chosen to initialize the\n parameters (see `init_params`).\n In addition, it controls the generation of random samples from the\n fitted distribution (see the method `sample`).\n Pass an int for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n warm_start : bool, default=False\n If 'warm_start' is True, the solution of the last fitting is used as\n initialization for the next call of fit(). This can speed up\n convergence when fit is called several times on similar problems.\n See :term:`the Glossary `.\n\n verbose : int, default=0\n Enable verbose output. If 1 then it prints the current\n initialization and each iteration step. If greater than 1 then\n it prints also the log probability and the time needed\n for each step.\n\n verbose_interval : int, default=10\n Number of iteration done before the next print.\n\n Attributes\n ----------\n weights_ : array-like of shape (n_components,)\n The weights of each mixture components.\n\n means_ : array-like of shape (n_components, n_features)\n The mean of each mixture component.\n\n covariances_ : array-like\n The covariance of each mixture component.\n The shape depends on `covariance_type`::\n\n (n_components,) if 'spherical',\n (n_features, n_features) if 'tied',\n (n_components, n_features) if 'diag',\n (n_components, n_features, n_features) if 'full'\n\n precisions_ : array-like\n The precision matrices for each component in the mixture. A precision\n matrix is the inverse of a covariance matrix. A covariance matrix is\n symmetric positive definite so the mixture of Gaussian can be\n equivalently parameterized by the precision matrices. Storing the\n precision matrices instead of the covariance matrices makes it more\n efficient to compute the log-likelihood of new samples at test time.\n The shape depends on ``covariance_type``::\n\n (n_components,) if 'spherical',\n (n_features, n_features) if 'tied',\n (n_components, n_features) if 'diag',\n (n_components, n_features, n_features) if 'full'\n\n precisions_cholesky_ : array-like\n The cholesky decomposition of the precision matrices of each mixture\n component. A precision matrix is the inverse of a covariance matrix.\n A covariance matrix is symmetric positive definite so the mixture of\n Gaussian can be equivalently parameterized by the precision matrices.\n Storing the precision matrices instead of the covariance matrices makes\n it more efficient to compute the log-likelihood of new samples at test\n time. 
The shape depends on ``covariance_type``::\n\n (n_components,) if 'spherical',\n (n_features, n_features) if 'tied',\n (n_components, n_features) if 'diag',\n (n_components, n_features, n_features) if 'full'\n\n converged_ : bool\n True when convergence was reached in fit(), False otherwise.\n\n n_iter_ : int\n Number of step used by the best fit of inference to reach the\n convergence.\n\n lower_bound_ : float\n Lower bound value on the likelihood (of the training data with\n respect to the model) of the best fit of inference.\n\n weight_concentration_prior_ : tuple or float\n The dirichlet concentration of each component on the weight\n distribution (Dirichlet). The type depends on\n ``weight_concentration_prior_type``::\n\n (float, float) if 'dirichlet_process' (Beta parameters),\n float if 'dirichlet_distribution' (Dirichlet parameters).\n\n The higher concentration puts more mass in\n the center and will lead to more components being active, while a lower\n concentration parameter will lead to more mass at the edge of the\n simplex.\n\n weight_concentration_ : array-like of shape (n_components,)\n The dirichlet concentration of each component on the weight\n distribution (Dirichlet).\n\n mean_precision_prior_ : float\n The precision prior on the mean distribution (Gaussian).\n Controls the extent of where means can be placed.\n Larger values concentrate the cluster means around `mean_prior`.\n If mean_precision_prior is set to None, `mean_precision_prior_` is set\n to 1.\n\n mean_precision_ : array-like of shape (n_components,)\n The precision of each components on the mean distribution (Gaussian).\n\n mean_prior_ : array-like of shape (n_features,)\n The prior on the mean distribution (Gaussian).\n\n degrees_of_freedom_prior_ : float\n The prior of the number of degrees of freedom on the covariance\n distributions (Wishart).\n\n degrees_of_freedom_ : array-like of shape (n_components,)\n The number of degrees of freedom of each components in the model.\n\n covariance_prior_ : float or array-like\n The prior on the covariance distribution (Wishart).\n The shape depends on `covariance_type`::\n\n (n_features, n_features) if 'full',\n (n_features, n_features) if 'tied',\n (n_features) if 'diag',\n float if 'spherical'\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n GaussianMixture : Finite Gaussian mixture fit with EM.\n\n References\n ----------\n\n .. [1] `Bishop, Christopher M. (2006). \"Pattern recognition and machine\n learning\". Vol. 4 No. 4. New York: Springer.\n `_\n\n .. [2] `Hagai Attias. (2000). \"A Variational Bayesian Framework for\n Graphical Models\". In Advances in Neural Information Processing\n Systems 12.\n `_\n\n .. [3] `Blei, David M. and Michael I. Jordan. (2006). \"Variational\n inference for Dirichlet process mixtures\". Bayesian analysis 1.1\n `_\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.mixture import BayesianGaussianMixture\n >>> X = np.array([[1, 2], [1, 4], [1, 0], [4, 2], [12, 4], [10, 7]])\n >>> bgm = BayesianGaussianMixture(n_components=2, random_state=42).fit(X)\n >>> bgm.means_\n array([[2.49... , 2.29...],\n [8.45..., 4.52... 
]])\n >>> bgm.predict([[0, 0], [9, 3]])\n array([0, 1])\n \"\"\"\n \n def __init__(self, *, n_components=1, covariance_type='full', tol=0.001, reg_covar=1e-06, max_iter=100, n_init=1, init_params='kmeans', weight_concentration_prior_type='dirichlet_process', weight_concentration_prior=None, mean_precision_prior=None, mean_prior=None, degrees_of_freedom_prior=None, covariance_prior=None, random_state=None, warm_start=False, verbose=0, verbose_interval=10):\n super().__init__(n_components=n_components, tol=tol, reg_covar=reg_covar, max_iter=max_iter, n_init=n_init, init_params=init_params, random_state=random_state, warm_start=warm_start, verbose=verbose, verbose_interval=verbose_interval)\n self.covariance_type = covariance_type\n self.weight_concentration_prior_type = weight_concentration_prior_type\n self.weight_concentration_prior = weight_concentration_prior\n self.mean_precision_prior = mean_precision_prior\n self.mean_prior = mean_prior\n self.degrees_of_freedom_prior = degrees_of_freedom_prior\n self.covariance_prior = covariance_prior\n \n def _check_parameters(self, X):\n \"\"\"Check that the parameters are well defined.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n \"\"\"\n if self.covariance_type not in ['spherical', 'tied', 'diag', 'full']:\n raise ValueError(\"Invalid value for 'covariance_type': %s 'covariance_type' should be in ['spherical', 'tied', 'diag', 'full']\" % self.covariance_type)\n if self.weight_concentration_prior_type not in ['dirichlet_process', 'dirichlet_distribution']:\n raise ValueError(\"Invalid value for 'weight_concentration_prior_type': %s 'weight_concentration_prior_type' should be in ['dirichlet_process', 'dirichlet_distribution']\" % self.weight_concentration_prior_type)\n self._check_weights_parameters()\n self._check_means_parameters(X)\n self._check_precision_parameters(X)\n self._checkcovariance_prior_parameter(X)\n \n def _check_weights_parameters(self):\n \"\"\"Check the parameter of the Dirichlet distribution.\"\"\"\n if self.weight_concentration_prior is None:\n self.weight_concentration_prior_ = 1.0 / self.n_components\n elif self.weight_concentration_prior > 0.0:\n self.weight_concentration_prior_ = self.weight_concentration_prior\n else:\n raise ValueError(\"The parameter 'weight_concentration_prior' should be greater than 0., but got %.3f.\" % self.weight_concentration_prior)\n \n def _check_means_parameters(self, X):\n \"\"\"Check the parameters of the Gaussian distribution.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n \"\"\"\n (_, n_features) = X.shape\n if self.mean_precision_prior is None:\n self.mean_precision_prior_ = 1.0\n elif self.mean_precision_prior > 0.0:\n self.mean_precision_prior_ = self.mean_precision_prior\n else:\n raise ValueError(\"The parameter 'mean_precision_prior' should be greater than 0., but got %.3f.\" % self.mean_precision_prior)\n if self.mean_prior is None:\n self.mean_prior_ = X.mean(axis=0)\n else:\n self.mean_prior_ = check_array(self.mean_prior, dtype=[np.float64, np.float32], ensure_2d=False)\n _check_shape(self.mean_prior_, (n_features, ), 'means')\n \n def _check_precision_parameters(self, X):\n \"\"\"Check the prior parameters of the precision distribution.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n \"\"\"\n (_, n_features) = X.shape\n if self.degrees_of_freedom_prior is None:\n self.degrees_of_freedom_prior_ = n_features\n elif self.degrees_of_freedom_prior > n_features - 1.0:\n 
self.degrees_of_freedom_prior_ = self.degrees_of_freedom_prior\n else:\n raise ValueError(\"The parameter 'degrees_of_freedom_prior' should be greater than %d, but got %.3f.\" % (n_features - 1, self.degrees_of_freedom_prior))\n \n def _checkcovariance_prior_parameter(self, X):\n \"\"\"Check the `covariance_prior_`.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n \"\"\"\n (_, n_features) = X.shape\n if self.covariance_prior is None:\n self.covariance_prior_ = {'full': np.atleast_2d(np.cov(X.T)), 'tied': np.atleast_2d(np.cov(X.T)), 'diag': np.var(X, axis=0, ddof=1), 'spherical': np.var(X, axis=0, ddof=1).mean()}[self.covariance_type]\n elif self.covariance_type in ['full', 'tied']:\n self.covariance_prior_ = check_array(self.covariance_prior, dtype=[np.float64, np.float32], ensure_2d=False)\n _check_shape(self.covariance_prior_, (n_features, n_features), '%s covariance_prior' % self.covariance_type)\n _check_precision_matrix(self.covariance_prior_, self.covariance_type)\n elif self.covariance_type == 'diag':\n self.covariance_prior_ = check_array(self.covariance_prior, dtype=[np.float64, np.float32], ensure_2d=False)\n _check_shape(self.covariance_prior_, (n_features, ), '%s covariance_prior' % self.covariance_type)\n _check_precision_positivity(self.covariance_prior_, self.covariance_type)\n elif self.covariance_prior > 0.0:\n self.covariance_prior_ = self.covariance_prior\n else:\n raise ValueError(\"The parameter 'spherical covariance_prior' should be greater than 0., but got %.3f.\" % self.covariance_prior)\n \n def _initialize(self, X, resp):\n \"\"\"Initialization of the mixture parameters.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n\n resp : array-like of shape (n_samples, n_components)\n \"\"\"\n (nk, xk, sk) = _estimate_gaussian_parameters(X, resp, self.reg_covar, self.covariance_type)\n self._estimate_weights(nk)\n self._estimate_means(nk, xk)\n self._estimate_precisions(nk, xk, sk)\n \n def _estimate_weights(self, nk):\n \"\"\"Estimate the parameters of the Dirichlet distribution.\n\n Parameters\n ----------\n nk : array-like of shape (n_components,)\n \"\"\"\n if self.weight_concentration_prior_type == 'dirichlet_process':\n self.weight_concentration_ = (1.0 + nk, self.weight_concentration_prior_ + np.hstack((np.cumsum(nk[::-1])[-2::-1], 0)))\n else:\n self.weight_concentration_ = self.weight_concentration_prior_ + nk\n \n def _estimate_means(self, nk, xk):\n \"\"\"Estimate the parameters of the Gaussian distribution.\n\n Parameters\n ----------\n nk : array-like of shape (n_components,)\n\n xk : array-like of shape (n_components, n_features)\n \"\"\"\n self.mean_precision_ = self.mean_precision_prior_ + nk\n self.means_ = (self.mean_precision_prior_ * self.mean_prior_ + nk[:, np.newaxis] * xk) / self.mean_precision_[:, np.newaxis]\n \n def _estimate_precisions(self, nk, xk, sk):\n \"\"\"Estimate the precisions parameters of the precision distribution.\n\n Parameters\n ----------\n nk : array-like of shape (n_components,)\n\n xk : array-like of shape (n_components, n_features)\n\n sk : array-like\n The shape depends of `covariance_type`:\n 'full' : (n_components, n_features, n_features)\n 'tied' : (n_features, n_features)\n 'diag' : (n_components, n_features)\n 'spherical' : (n_components,)\n \"\"\"\n {'full': self._estimate_wishart_full, 'tied': self._estimate_wishart_tied, 'diag': self._estimate_wishart_diag, 'spherical': self._estimate_wishart_spherical}[self.covariance_type](nk, xk, sk)\n 
self.precisions_cholesky_ = _compute_precision_cholesky(self.covariances_, self.covariance_type)\n \n def _estimate_wishart_full(self, nk, xk, sk):\n \"\"\"Estimate the full Wishart distribution parameters.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n\n nk : array-like of shape (n_components,)\n\n xk : array-like of shape (n_components, n_features)\n\n sk : array-like of shape (n_components, n_features, n_features)\n \"\"\"\n (_, n_features) = xk.shape\n self.degrees_of_freedom_ = self.degrees_of_freedom_prior_ + nk\n self.covariances_ = np.empty((self.n_components, n_features, n_features))\n for k in range(self.n_components):\n diff = xk[k] - self.mean_prior_\n self.covariances_[k] = self.covariance_prior_ + nk[k] * sk[k] + nk[k] * self.mean_precision_prior_ / self.mean_precision_[k] * np.outer(diff, diff)\n self.covariances_ /= self.degrees_of_freedom_[:, np.newaxis, np.newaxis]\n \n def _estimate_wishart_tied(self, nk, xk, sk):\n \"\"\"Estimate the tied Wishart distribution parameters.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n\n nk : array-like of shape (n_components,)\n\n xk : array-like of shape (n_components, n_features)\n\n sk : array-like of shape (n_features, n_features)\n \"\"\"\n (_, n_features) = xk.shape\n self.degrees_of_freedom_ = self.degrees_of_freedom_prior_ + nk.sum() / self.n_components\n diff = xk - self.mean_prior_\n self.covariances_ = self.covariance_prior_ + sk * nk.sum() / self.n_components + self.mean_precision_prior_ / self.n_components * np.dot(nk / self.mean_precision_ * diff.T, diff)\n self.covariances_ /= self.degrees_of_freedom_\n \n def _estimate_wishart_diag(self, nk, xk, sk):\n \"\"\"Estimate the diag Wishart distribution parameters.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n\n nk : array-like of shape (n_components,)\n\n xk : array-like of shape (n_components, n_features)\n\n sk : array-like of shape (n_components, n_features)\n \"\"\"\n (_, n_features) = xk.shape\n self.degrees_of_freedom_ = self.degrees_of_freedom_prior_ + nk\n diff = xk - self.mean_prior_\n self.covariances_ = self.covariance_prior_ + nk[:, np.newaxis] * (sk + (self.mean_precision_prior_ / self.mean_precision_)[:, np.newaxis] * np.square(diff))\n self.covariances_ /= self.degrees_of_freedom_[:, np.newaxis]\n \n def _estimate_wishart_spherical(self, nk, xk, sk):\n \"\"\"Estimate the spherical Wishart distribution parameters.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n\n nk : array-like of shape (n_components,)\n\n xk : array-like of shape (n_components, n_features)\n\n sk : array-like of shape (n_components,)\n \"\"\"\n (_, n_features) = xk.shape\n self.degrees_of_freedom_ = self.degrees_of_freedom_prior_ + nk\n diff = xk - self.mean_prior_\n self.covariances_ = self.covariance_prior_ + nk * (sk + self.mean_precision_prior_ / self.mean_precision_ * np.mean(np.square(diff), 1))\n self.covariances_ /= self.degrees_of_freedom_\n \n def _m_step(self, X, log_resp):\n \"\"\"M step.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n\n log_resp : array-like of shape (n_samples, n_components)\n Logarithm of the posterior probabilities (or responsibilities) of\n the point of each sample in X.\n \"\"\"\n (n_samples, _) = X.shape\n (nk, xk, sk) = _estimate_gaussian_parameters(X, np.exp(log_resp), self.reg_covar, self.covariance_type)\n self._estimate_weights(nk)\n self._estimate_means(nk, xk)\n self._estimate_precisions(nk, 
xk, sk)\n \n def _estimate_log_weights(self):\n if self.weight_concentration_prior_type == 'dirichlet_process':\n digamma_sum = digamma(self.weight_concentration_[0] + self.weight_concentration_[1])\n digamma_a = digamma(self.weight_concentration_[0])\n digamma_b = digamma(self.weight_concentration_[1])\n return digamma_a - digamma_sum + np.hstack((0, np.cumsum(digamma_b - digamma_sum)[:-1]))\n else:\n return digamma(self.weight_concentration_) - digamma(np.sum(self.weight_concentration_))\n \n def _estimate_log_prob(self, X):\n (_, n_features) = X.shape\n log_gauss = _estimate_log_gaussian_prob(X, self.means_, self.precisions_cholesky_, self.covariance_type) - 0.5 * n_features * np.log(self.degrees_of_freedom_)\n log_lambda = n_features * np.log(2.0) + np.sum(digamma(0.5 * (self.degrees_of_freedom_ - np.arange(0, n_features)[:, np.newaxis])), 0)\n return log_gauss + 0.5 * (log_lambda - n_features / self.mean_precision_)\n \n def _compute_lower_bound(self, log_resp, log_prob_norm):\n \"\"\"Estimate the lower bound of the model.\n\n The lower bound on the likelihood (of the training data with respect to\n the model) is used to detect the convergence and has to increase at\n each iteration.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n\n log_resp : array, shape (n_samples, n_components)\n Logarithm of the posterior probabilities (or responsibilities) of\n the point of each sample in X.\n\n log_prob_norm : float\n Logarithm of the probability of each sample in X.\n\n Returns\n -------\n lower_bound : float\n \"\"\"\n (n_features, ) = self.mean_prior_.shape\n log_det_precisions_chol = _compute_log_det_cholesky(self.precisions_cholesky_, self.covariance_type, n_features) - 0.5 * n_features * np.log(self.degrees_of_freedom_)\n if self.covariance_type == 'tied':\n log_wishart = self.n_components * np.float64(_log_wishart_norm(self.degrees_of_freedom_, log_det_precisions_chol, n_features))\n else:\n log_wishart = np.sum(_log_wishart_norm(self.degrees_of_freedom_, log_det_precisions_chol, n_features))\n if self.weight_concentration_prior_type == 'dirichlet_process':\n log_norm_weight = -np.sum(betaln(self.weight_concentration_[0], self.weight_concentration_[1]))\n else:\n log_norm_weight = _log_dirichlet_norm(self.weight_concentration_)\n return -np.sum(np.exp(log_resp) * log_resp) - log_wishart - log_norm_weight - 0.5 * n_features * np.sum(np.log(self.mean_precision_))\n \n def _get_parameters(self):\n return self.weight_concentration_, self.mean_precision_, self.means_, self.degrees_of_freedom_, self.covariances_, self.precisions_cholesky_\n \n def _set_parameters(self, params):\n (self.weight_concentration_, self.mean_precision_, self.means_, self.degrees_of_freedom_, self.covariances_, self.precisions_cholesky_) = params\n if self.weight_concentration_prior_type == 'dirichlet_process':\n weight_dirichlet_sum = self.weight_concentration_[0] + self.weight_concentration_[1]\n tmp = self.weight_concentration_[1] / weight_dirichlet_sum\n self.weights_ = self.weight_concentration_[0] / weight_dirichlet_sum * np.hstack((1, np.cumprod(tmp[:-1])))\n self.weights_ /= np.sum(self.weights_)\n else:\n self.weights_ = self.weight_concentration_ / np.sum(self.weight_concentration_)\n if self.covariance_type == 'full':\n self.precisions_ = np.array([np.dot(prec_chol, prec_chol.T) for prec_chol in self.precisions_cholesky_])\n elif self.covariance_type == 'tied':\n self.precisions_ = np.dot(self.precisions_cholesky_, self.precisions_cholesky_.T)\n else:\n self.precisions_ = 
self.precisions_cholesky_**2\n" }, @@ -24540,9 +24622,9 @@ "sklearn.mixture._gaussian_mixture.GaussianMixture.aic" ], "is_public": true, - "description": "Gaussian Mixture.\n\nRepresentation of a Gaussian mixture model probability distribution. This class allows to estimate the parameters of a Gaussian mixture distribution. Read more in the :ref:`User Guide `. .. versionadded:: 0.18", - "docstring": "Gaussian Mixture.\n\n Representation of a Gaussian mixture model probability distribution.\n This class allows to estimate the parameters of a Gaussian mixture\n distribution.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.18\n\n Parameters\n ----------\n n_components : int, default=1\n The number of mixture components.\n\n covariance_type : {'full', 'tied', 'diag', 'spherical'}, default='full'\n String describing the type of covariance parameters to use.\n Must be one of:\n\n 'full'\n each component has its own general covariance matrix\n 'tied'\n all components share the same general covariance matrix\n 'diag'\n each component has its own diagonal covariance matrix\n 'spherical'\n each component has its own single variance\n\n tol : float, default=1e-3\n The convergence threshold. EM iterations will stop when the\n lower bound average gain is below this threshold.\n\n reg_covar : float, default=1e-6\n Non-negative regularization added to the diagonal of covariance.\n Allows to assure that the covariance matrices are all positive.\n\n max_iter : int, default=100\n The number of EM iterations to perform.\n\n n_init : int, default=1\n The number of initializations to perform. The best results are kept.\n\n init_params : {'kmeans', 'random'}, default='kmeans'\n The method used to initialize the weights, the means and the\n precisions.\n Must be one of::\n\n 'kmeans' : responsibilities are initialized using kmeans.\n 'random' : responsibilities are initialized randomly.\n\n weights_init : array-like of shape (n_components, ), default=None\n The user-provided initial weights.\n If it is None, weights are initialized using the `init_params` method.\n\n means_init : array-like of shape (n_components, n_features), default=None\n The user-provided initial means,\n If it is None, means are initialized using the `init_params` method.\n\n precisions_init : array-like, default=None\n The user-provided initial precisions (inverse of the covariance\n matrices).\n If it is None, precisions are initialized using the 'init_params'\n method.\n The shape depends on 'covariance_type'::\n\n (n_components,) if 'spherical',\n (n_features, n_features) if 'tied',\n (n_components, n_features) if 'diag',\n (n_components, n_features, n_features) if 'full'\n\n random_state : int, RandomState instance or None, default=None\n Controls the random seed given to the method chosen to initialize the\n parameters (see `init_params`).\n In addition, it controls the generation of random samples from the\n fitted distribution (see the method `sample`).\n Pass an int for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n warm_start : bool, default=False\n If 'warm_start' is True, the solution of the last fitting is used as\n initialization for the next call of fit(). This can speed up\n convergence when fit is called several times on similar problems.\n In that case, 'n_init' is ignored and only a single initialization\n occurs upon the first call.\n See :term:`the Glossary `.\n\n verbose : int, default=0\n Enable verbose output. 
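The `_set_parameters` logic quoted just above recovers the mixture weights from the per-component Beta parameters of the stick-breaking (Dirichlet-process) representation. A self-contained NumPy sketch of that same computation, with illustrative parameter values (the function name is mine, not part of the file):

```python
import numpy as np

def stick_breaking_weights(alpha, beta):
    # alpha/beta are the per-component Beta parameters stored in
    # weight_concentration_ when weight_concentration_prior_type is
    # 'dirichlet_process'; this mirrors the _set_parameters logic above.
    alpha = np.asarray(alpha, dtype=float)
    beta = np.asarray(beta, dtype=float)
    total = alpha + beta
    remaining = beta / total                      # expected unused stick length
    weights = alpha / total * np.hstack((1.0, np.cumprod(remaining[:-1])))
    return weights / weights.sum()                # renormalise, as in the source

# e.g. three components where most of the mass stays on the first stick
print(stick_breaking_weights([5.0, 2.0, 1.0], [3.0, 2.0, 2.0]))
```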
If 1 then it prints the current\n initialization and each iteration step. If greater than 1 then\n it prints also the log probability and the time needed\n for each step.\n\n verbose_interval : int, default=10\n Number of iteration done before the next print.\n\n Attributes\n ----------\n weights_ : array-like of shape (n_components,)\n The weights of each mixture components.\n\n means_ : array-like of shape (n_components, n_features)\n The mean of each mixture component.\n\n covariances_ : array-like\n The covariance of each mixture component.\n The shape depends on `covariance_type`::\n\n (n_components,) if 'spherical',\n (n_features, n_features) if 'tied',\n (n_components, n_features) if 'diag',\n (n_components, n_features, n_features) if 'full'\n\n precisions_ : array-like\n The precision matrices for each component in the mixture. A precision\n matrix is the inverse of a covariance matrix. A covariance matrix is\n symmetric positive definite so the mixture of Gaussian can be\n equivalently parameterized by the precision matrices. Storing the\n precision matrices instead of the covariance matrices makes it more\n efficient to compute the log-likelihood of new samples at test time.\n The shape depends on `covariance_type`::\n\n (n_components,) if 'spherical',\n (n_features, n_features) if 'tied',\n (n_components, n_features) if 'diag',\n (n_components, n_features, n_features) if 'full'\n\n precisions_cholesky_ : array-like\n The cholesky decomposition of the precision matrices of each mixture\n component. A precision matrix is the inverse of a covariance matrix.\n A covariance matrix is symmetric positive definite so the mixture of\n Gaussian can be equivalently parameterized by the precision matrices.\n Storing the precision matrices instead of the covariance matrices makes\n it more efficient to compute the log-likelihood of new samples at test\n time. The shape depends on `covariance_type`::\n\n (n_components,) if 'spherical',\n (n_features, n_features) if 'tied',\n (n_components, n_features) if 'diag',\n (n_components, n_features, n_features) if 'full'\n\n converged_ : bool\n True when convergence was reached in fit(), False otherwise.\n\n n_iter_ : int\n Number of step used by the best fit of EM to reach the convergence.\n\n lower_bound_ : float\n Lower bound value on the log-likelihood (of the training data with\n respect to the model) of the best fit of EM.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n BayesianGaussianMixture : Gaussian mixture model fit with a variational\n inference.\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.mixture import GaussianMixture\n >>> X = np.array([[1, 2], [1, 4], [1, 0], [10, 2], [10, 4], [10, 0]])\n >>> gm = GaussianMixture(n_components=2, random_state=0).fit(X)\n >>> gm.means_\n array([[10., 2.],\n [ 1., 2.]])\n >>> gm.predict([[0, 0], [12, 3]])\n array([1, 0])\n ", - "source_code": "\n\nclass GaussianMixture(BaseMixture):\n \"\"\"Gaussian Mixture.\n\n Representation of a Gaussian mixture model probability distribution.\n This class allows to estimate the parameters of a Gaussian mixture\n distribution.\n\n Read more in the :ref:`User Guide `.\n\n .. 
versionadded:: 0.18\n\n Parameters\n ----------\n n_components : int, default=1\n The number of mixture components.\n\n covariance_type : {'full', 'tied', 'diag', 'spherical'}, default='full'\n String describing the type of covariance parameters to use.\n Must be one of:\n\n 'full'\n each component has its own general covariance matrix\n 'tied'\n all components share the same general covariance matrix\n 'diag'\n each component has its own diagonal covariance matrix\n 'spherical'\n each component has its own single variance\n\n tol : float, default=1e-3\n The convergence threshold. EM iterations will stop when the\n lower bound average gain is below this threshold.\n\n reg_covar : float, default=1e-6\n Non-negative regularization added to the diagonal of covariance.\n Allows to assure that the covariance matrices are all positive.\n\n max_iter : int, default=100\n The number of EM iterations to perform.\n\n n_init : int, default=1\n The number of initializations to perform. The best results are kept.\n\n init_params : {'kmeans', 'random'}, default='kmeans'\n The method used to initialize the weights, the means and the\n precisions.\n Must be one of::\n\n 'kmeans' : responsibilities are initialized using kmeans.\n 'random' : responsibilities are initialized randomly.\n\n weights_init : array-like of shape (n_components, ), default=None\n The user-provided initial weights.\n If it is None, weights are initialized using the `init_params` method.\n\n means_init : array-like of shape (n_components, n_features), default=None\n The user-provided initial means,\n If it is None, means are initialized using the `init_params` method.\n\n precisions_init : array-like, default=None\n The user-provided initial precisions (inverse of the covariance\n matrices).\n If it is None, precisions are initialized using the 'init_params'\n method.\n The shape depends on 'covariance_type'::\n\n (n_components,) if 'spherical',\n (n_features, n_features) if 'tied',\n (n_components, n_features) if 'diag',\n (n_components, n_features, n_features) if 'full'\n\n random_state : int, RandomState instance or None, default=None\n Controls the random seed given to the method chosen to initialize the\n parameters (see `init_params`).\n In addition, it controls the generation of random samples from the\n fitted distribution (see the method `sample`).\n Pass an int for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n warm_start : bool, default=False\n If 'warm_start' is True, the solution of the last fitting is used as\n initialization for the next call of fit(). This can speed up\n convergence when fit is called several times on similar problems.\n In that case, 'n_init' is ignored and only a single initialization\n occurs upon the first call.\n See :term:`the Glossary `.\n\n verbose : int, default=0\n Enable verbose output. If 1 then it prints the current\n initialization and each iteration step. 
If greater than 1 then\n it prints also the log probability and the time needed\n for each step.\n\n verbose_interval : int, default=10\n Number of iteration done before the next print.\n\n Attributes\n ----------\n weights_ : array-like of shape (n_components,)\n The weights of each mixture components.\n\n means_ : array-like of shape (n_components, n_features)\n The mean of each mixture component.\n\n covariances_ : array-like\n The covariance of each mixture component.\n The shape depends on `covariance_type`::\n\n (n_components,) if 'spherical',\n (n_features, n_features) if 'tied',\n (n_components, n_features) if 'diag',\n (n_components, n_features, n_features) if 'full'\n\n precisions_ : array-like\n The precision matrices for each component in the mixture. A precision\n matrix is the inverse of a covariance matrix. A covariance matrix is\n symmetric positive definite so the mixture of Gaussian can be\n equivalently parameterized by the precision matrices. Storing the\n precision matrices instead of the covariance matrices makes it more\n efficient to compute the log-likelihood of new samples at test time.\n The shape depends on `covariance_type`::\n\n (n_components,) if 'spherical',\n (n_features, n_features) if 'tied',\n (n_components, n_features) if 'diag',\n (n_components, n_features, n_features) if 'full'\n\n precisions_cholesky_ : array-like\n The cholesky decomposition of the precision matrices of each mixture\n component. A precision matrix is the inverse of a covariance matrix.\n A covariance matrix is symmetric positive definite so the mixture of\n Gaussian can be equivalently parameterized by the precision matrices.\n Storing the precision matrices instead of the covariance matrices makes\n it more efficient to compute the log-likelihood of new samples at test\n time. The shape depends on `covariance_type`::\n\n (n_components,) if 'spherical',\n (n_features, n_features) if 'tied',\n (n_components, n_features) if 'diag',\n (n_components, n_features, n_features) if 'full'\n\n converged_ : bool\n True when convergence was reached in fit(), False otherwise.\n\n n_iter_ : int\n Number of step used by the best fit of EM to reach the convergence.\n\n lower_bound_ : float\n Lower bound value on the log-likelihood (of the training data with\n respect to the model) of the best fit of EM.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. 
versionadded:: 1.0\n\n See Also\n --------\n BayesianGaussianMixture : Gaussian mixture model fit with a variational\n inference.\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.mixture import GaussianMixture\n >>> X = np.array([[1, 2], [1, 4], [1, 0], [10, 2], [10, 4], [10, 0]])\n >>> gm = GaussianMixture(n_components=2, random_state=0).fit(X)\n >>> gm.means_\n array([[10., 2.],\n [ 1., 2.]])\n >>> gm.predict([[0, 0], [12, 3]])\n array([1, 0])\n \"\"\"\n \n def __init__(self, n_components=1, *, covariance_type='full', tol=0.001, reg_covar=1e-06, max_iter=100, n_init=1, init_params='kmeans', weights_init=None, means_init=None, precisions_init=None, random_state=None, warm_start=False, verbose=0, verbose_interval=10):\n super().__init__(n_components=n_components, tol=tol, reg_covar=reg_covar, max_iter=max_iter, n_init=n_init, init_params=init_params, random_state=random_state, warm_start=warm_start, verbose=verbose, verbose_interval=verbose_interval)\n self.covariance_type = covariance_type\n self.weights_init = weights_init\n self.means_init = means_init\n self.precisions_init = precisions_init\n \n def _check_parameters(self, X):\n \"\"\"Check the Gaussian mixture parameters are well defined.\"\"\"\n (_, n_features) = X.shape\n if self.covariance_type not in ['spherical', 'tied', 'diag', 'full']:\n raise ValueError(\"Invalid value for 'covariance_type': %s 'covariance_type' should be in ['spherical', 'tied', 'diag', 'full']\" % self.covariance_type)\n if self.weights_init is not None:\n self.weights_init = _check_weights(self.weights_init, self.n_components)\n if self.means_init is not None:\n self.means_init = _check_means(self.means_init, self.n_components, n_features)\n if self.precisions_init is not None:\n self.precisions_init = _check_precisions(self.precisions_init, self.covariance_type, self.n_components, n_features)\n \n def _initialize(self, X, resp):\n \"\"\"Initialization of the Gaussian mixture parameters.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n\n resp : array-like of shape (n_samples, n_components)\n \"\"\"\n (n_samples, _) = X.shape\n (weights, means, covariances) = _estimate_gaussian_parameters(X, resp, self.reg_covar, self.covariance_type)\n weights /= n_samples\n self.weights_ = weights if self.weights_init is None else self.weights_init\n self.means_ = means if self.means_init is None else self.means_init\n if self.precisions_init is None:\n self.covariances_ = covariances\n self.precisions_cholesky_ = _compute_precision_cholesky(covariances, self.covariance_type)\n elif self.covariance_type == 'full':\n self.precisions_cholesky_ = np.array([linalg.cholesky(prec_init, lower=True) for prec_init in self.precisions_init])\n elif self.covariance_type == 'tied':\n self.precisions_cholesky_ = linalg.cholesky(self.precisions_init, lower=True)\n else:\n self.precisions_cholesky_ = self.precisions_init\n \n def _m_step(self, X, log_resp):\n \"\"\"M step.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n\n log_resp : array-like of shape (n_samples, n_components)\n Logarithm of the posterior probabilities (or responsibilities) of\n the point of each sample in X.\n \"\"\"\n (n_samples, _) = X.shape\n (self.weights_, self.means_, self.covariances_) = _estimate_gaussian_parameters(X, np.exp(log_resp), self.reg_covar, self.covariance_type)\n self.weights_ /= n_samples\n self.precisions_cholesky_ = _compute_precision_cholesky(self.covariances_, self.covariance_type)\n \n def 
_estimate_log_prob(self, X):\n return _estimate_log_gaussian_prob(X, self.means_, self.precisions_cholesky_, self.covariance_type)\n \n def _estimate_log_weights(self):\n return np.log(self.weights_)\n \n def _compute_lower_bound(self, _, log_prob_norm):\n return log_prob_norm\n \n def _get_parameters(self):\n return self.weights_, self.means_, self.covariances_, self.precisions_cholesky_\n \n def _set_parameters(self, params):\n (self.weights_, self.means_, self.covariances_, self.precisions_cholesky_) = params\n (_, n_features) = self.means_.shape\n if self.covariance_type == 'full':\n self.precisions_ = np.empty(self.precisions_cholesky_.shape)\n for (k, prec_chol) in enumerate(self.precisions_cholesky_):\n self.precisions_[k] = np.dot(prec_chol, prec_chol.T)\n elif self.covariance_type == 'tied':\n self.precisions_ = np.dot(self.precisions_cholesky_, self.precisions_cholesky_.T)\n else:\n self.precisions_ = self.precisions_cholesky_**2\n \n def _n_parameters(self):\n \"\"\"Return the number of free parameters in the model.\"\"\"\n (_, n_features) = self.means_.shape\n if self.covariance_type == 'full':\n cov_params = self.n_components * n_features * (n_features + 1) / 2.0\n elif self.covariance_type == 'diag':\n cov_params = self.n_components * n_features\n elif self.covariance_type == 'tied':\n cov_params = n_features * (n_features + 1) / 2.0\n elif self.covariance_type == 'spherical':\n cov_params = self.n_components\n mean_params = n_features * self.n_components\n return int(cov_params + mean_params + self.n_components - 1)\n \n def bic(self, X):\n \"\"\"Bayesian information criterion for the current model on the input X.\n\n Parameters\n ----------\n X : array of shape (n_samples, n_dimensions)\n The input samples.\n\n Returns\n -------\n bic : float\n The lower the better.\n \"\"\"\n return -2 * self.score(X) * X.shape[0] + self._n_parameters() * np.log(X.shape[0])\n \n def aic(self, X):\n \"\"\"Akaike information criterion for the current model on the input X.\n\n Parameters\n ----------\n X : array of shape (n_samples, n_dimensions)\n The input samples.\n\n Returns\n -------\n aic : float\n The lower the better.\n \"\"\"\n return -2 * self.score(X) * X.shape[0] + 2 * self._n_parameters()\n" + "description": "Gaussian Mixture.\n\nRepresentation of a Gaussian mixture model probability distribution.\nThis class allows to estimate the parameters of a Gaussian mixture\ndistribution.\n\nRead more in the :ref:`User Guide `.\n\n.. versionadded:: 0.18", + "docstring": "Gaussian Mixture.\n\n Representation of a Gaussian mixture model probability distribution.\n This class allows to estimate the parameters of a Gaussian mixture\n distribution.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.18\n\n Parameters\n ----------\n n_components : int, default=1\n The number of mixture components.\n\n covariance_type : {'full', 'tied', 'diag', 'spherical'}, default='full'\n String describing the type of covariance parameters to use.\n Must be one of:\n\n - 'full': each component has its own general covariance matrix.\n - 'tied': all components share the same general covariance matrix.\n - 'diag': each component has its own diagonal covariance matrix.\n - 'spherical': each component has its own single variance.\n\n tol : float, default=1e-3\n The convergence threshold. 
EM iterations will stop when the\n lower bound average gain is below this threshold.\n\n reg_covar : float, default=1e-6\n Non-negative regularization added to the diagonal of covariance.\n Allows to assure that the covariance matrices are all positive.\n\n max_iter : int, default=100\n The number of EM iterations to perform.\n\n n_init : int, default=1\n The number of initializations to perform. The best results are kept.\n\n init_params : {'kmeans', 'random'}, default='kmeans'\n The method used to initialize the weights, the means and the\n precisions.\n Must be one of::\n\n 'kmeans' : responsibilities are initialized using kmeans.\n 'random' : responsibilities are initialized randomly.\n\n weights_init : array-like of shape (n_components, ), default=None\n The user-provided initial weights.\n If it is None, weights are initialized using the `init_params` method.\n\n means_init : array-like of shape (n_components, n_features), default=None\n The user-provided initial means,\n If it is None, means are initialized using the `init_params` method.\n\n precisions_init : array-like, default=None\n The user-provided initial precisions (inverse of the covariance\n matrices).\n If it is None, precisions are initialized using the 'init_params'\n method.\n The shape depends on 'covariance_type'::\n\n (n_components,) if 'spherical',\n (n_features, n_features) if 'tied',\n (n_components, n_features) if 'diag',\n (n_components, n_features, n_features) if 'full'\n\n random_state : int, RandomState instance or None, default=None\n Controls the random seed given to the method chosen to initialize the\n parameters (see `init_params`).\n In addition, it controls the generation of random samples from the\n fitted distribution (see the method `sample`).\n Pass an int for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n warm_start : bool, default=False\n If 'warm_start' is True, the solution of the last fitting is used as\n initialization for the next call of fit(). This can speed up\n convergence when fit is called several times on similar problems.\n In that case, 'n_init' is ignored and only a single initialization\n occurs upon the first call.\n See :term:`the Glossary `.\n\n verbose : int, default=0\n Enable verbose output. If 1 then it prints the current\n initialization and each iteration step. If greater than 1 then\n it prints also the log probability and the time needed\n for each step.\n\n verbose_interval : int, default=10\n Number of iteration done before the next print.\n\n Attributes\n ----------\n weights_ : array-like of shape (n_components,)\n The weights of each mixture components.\n\n means_ : array-like of shape (n_components, n_features)\n The mean of each mixture component.\n\n covariances_ : array-like\n The covariance of each mixture component.\n The shape depends on `covariance_type`::\n\n (n_components,) if 'spherical',\n (n_features, n_features) if 'tied',\n (n_components, n_features) if 'diag',\n (n_components, n_features, n_features) if 'full'\n\n precisions_ : array-like\n The precision matrices for each component in the mixture. A precision\n matrix is the inverse of a covariance matrix. A covariance matrix is\n symmetric positive definite so the mixture of Gaussian can be\n equivalently parameterized by the precision matrices. 
Storing the\n precision matrices instead of the covariance matrices makes it more\n efficient to compute the log-likelihood of new samples at test time.\n The shape depends on `covariance_type`::\n\n (n_components,) if 'spherical',\n (n_features, n_features) if 'tied',\n (n_components, n_features) if 'diag',\n (n_components, n_features, n_features) if 'full'\n\n precisions_cholesky_ : array-like\n The cholesky decomposition of the precision matrices of each mixture\n component. A precision matrix is the inverse of a covariance matrix.\n A covariance matrix is symmetric positive definite so the mixture of\n Gaussian can be equivalently parameterized by the precision matrices.\n Storing the precision matrices instead of the covariance matrices makes\n it more efficient to compute the log-likelihood of new samples at test\n time. The shape depends on `covariance_type`::\n\n (n_components,) if 'spherical',\n (n_features, n_features) if 'tied',\n (n_components, n_features) if 'diag',\n (n_components, n_features, n_features) if 'full'\n\n converged_ : bool\n True when convergence was reached in fit(), False otherwise.\n\n n_iter_ : int\n Number of step used by the best fit of EM to reach the convergence.\n\n lower_bound_ : float\n Lower bound value on the log-likelihood (of the training data with\n respect to the model) of the best fit of EM.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n BayesianGaussianMixture : Gaussian mixture model fit with a variational\n inference.\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.mixture import GaussianMixture\n >>> X = np.array([[1, 2], [1, 4], [1, 0], [10, 2], [10, 4], [10, 0]])\n >>> gm = GaussianMixture(n_components=2, random_state=0).fit(X)\n >>> gm.means_\n array([[10., 2.],\n [ 1., 2.]])\n >>> gm.predict([[0, 0], [12, 3]])\n array([1, 0])\n ", + "source_code": "\n\nclass GaussianMixture(BaseMixture):\n \"\"\"Gaussian Mixture.\n\n Representation of a Gaussian mixture model probability distribution.\n This class allows to estimate the parameters of a Gaussian mixture\n distribution.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.18\n\n Parameters\n ----------\n n_components : int, default=1\n The number of mixture components.\n\n covariance_type : {'full', 'tied', 'diag', 'spherical'}, default='full'\n String describing the type of covariance parameters to use.\n Must be one of:\n\n - 'full': each component has its own general covariance matrix.\n - 'tied': all components share the same general covariance matrix.\n - 'diag': each component has its own diagonal covariance matrix.\n - 'spherical': each component has its own single variance.\n\n tol : float, default=1e-3\n The convergence threshold. EM iterations will stop when the\n lower bound average gain is below this threshold.\n\n reg_covar : float, default=1e-6\n Non-negative regularization added to the diagonal of covariance.\n Allows to assure that the covariance matrices are all positive.\n\n max_iter : int, default=100\n The number of EM iterations to perform.\n\n n_init : int, default=1\n The number of initializations to perform. 
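The `precisions_` and `precisions_cholesky_` attributes described above are an alternative parameterisation: each precision matrix is the inverse of the corresponding covariance matrix, rebuilt from its Cholesky factor. A quick check of that relationship on synthetic data (the data itself is an illustrative assumption):

```python
import numpy as np
from sklearn.mixture import GaussianMixture

rng = np.random.RandomState(0)
X = np.vstack([rng.normal(mean, 1.0, size=(300, 2))
               for mean in ((0.0, 0.0), (6.0, 6.0))])

gm = GaussianMixture(n_components=2, covariance_type="full", random_state=0).fit(X)

# with covariance_type='full', precisions_ has shape
# (n_components, n_features, n_features) and equals inv(covariances_)
for cov, prec in zip(gm.covariances_, gm.precisions_):
    assert np.allclose(prec, np.linalg.inv(cov))
```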
The best results are kept.\n\n init_params : {'kmeans', 'random'}, default='kmeans'\n The method used to initialize the weights, the means and the\n precisions.\n Must be one of::\n\n 'kmeans' : responsibilities are initialized using kmeans.\n 'random' : responsibilities are initialized randomly.\n\n weights_init : array-like of shape (n_components, ), default=None\n The user-provided initial weights.\n If it is None, weights are initialized using the `init_params` method.\n\n means_init : array-like of shape (n_components, n_features), default=None\n The user-provided initial means,\n If it is None, means are initialized using the `init_params` method.\n\n precisions_init : array-like, default=None\n The user-provided initial precisions (inverse of the covariance\n matrices).\n If it is None, precisions are initialized using the 'init_params'\n method.\n The shape depends on 'covariance_type'::\n\n (n_components,) if 'spherical',\n (n_features, n_features) if 'tied',\n (n_components, n_features) if 'diag',\n (n_components, n_features, n_features) if 'full'\n\n random_state : int, RandomState instance or None, default=None\n Controls the random seed given to the method chosen to initialize the\n parameters (see `init_params`).\n In addition, it controls the generation of random samples from the\n fitted distribution (see the method `sample`).\n Pass an int for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n warm_start : bool, default=False\n If 'warm_start' is True, the solution of the last fitting is used as\n initialization for the next call of fit(). This can speed up\n convergence when fit is called several times on similar problems.\n In that case, 'n_init' is ignored and only a single initialization\n occurs upon the first call.\n See :term:`the Glossary `.\n\n verbose : int, default=0\n Enable verbose output. If 1 then it prints the current\n initialization and each iteration step. If greater than 1 then\n it prints also the log probability and the time needed\n for each step.\n\n verbose_interval : int, default=10\n Number of iteration done before the next print.\n\n Attributes\n ----------\n weights_ : array-like of shape (n_components,)\n The weights of each mixture components.\n\n means_ : array-like of shape (n_components, n_features)\n The mean of each mixture component.\n\n covariances_ : array-like\n The covariance of each mixture component.\n The shape depends on `covariance_type`::\n\n (n_components,) if 'spherical',\n (n_features, n_features) if 'tied',\n (n_components, n_features) if 'diag',\n (n_components, n_features, n_features) if 'full'\n\n precisions_ : array-like\n The precision matrices for each component in the mixture. A precision\n matrix is the inverse of a covariance matrix. A covariance matrix is\n symmetric positive definite so the mixture of Gaussian can be\n equivalently parameterized by the precision matrices. Storing the\n precision matrices instead of the covariance matrices makes it more\n efficient to compute the log-likelihood of new samples at test time.\n The shape depends on `covariance_type`::\n\n (n_components,) if 'spherical',\n (n_features, n_features) if 'tied',\n (n_components, n_features) if 'diag',\n (n_components, n_features, n_features) if 'full'\n\n precisions_cholesky_ : array-like\n The cholesky decomposition of the precision matrices of each mixture\n component. 
A precision matrix is the inverse of a covariance matrix.\n A covariance matrix is symmetric positive definite so the mixture of\n Gaussian can be equivalently parameterized by the precision matrices.\n Storing the precision matrices instead of the covariance matrices makes\n it more efficient to compute the log-likelihood of new samples at test\n time. The shape depends on `covariance_type`::\n\n (n_components,) if 'spherical',\n (n_features, n_features) if 'tied',\n (n_components, n_features) if 'diag',\n (n_components, n_features, n_features) if 'full'\n\n converged_ : bool\n True when convergence was reached in fit(), False otherwise.\n\n n_iter_ : int\n Number of step used by the best fit of EM to reach the convergence.\n\n lower_bound_ : float\n Lower bound value on the log-likelihood (of the training data with\n respect to the model) of the best fit of EM.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n BayesianGaussianMixture : Gaussian mixture model fit with a variational\n inference.\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.mixture import GaussianMixture\n >>> X = np.array([[1, 2], [1, 4], [1, 0], [10, 2], [10, 4], [10, 0]])\n >>> gm = GaussianMixture(n_components=2, random_state=0).fit(X)\n >>> gm.means_\n array([[10., 2.],\n [ 1., 2.]])\n >>> gm.predict([[0, 0], [12, 3]])\n array([1, 0])\n \"\"\"\n \n def __init__(self, n_components=1, *, covariance_type='full', tol=0.001, reg_covar=1e-06, max_iter=100, n_init=1, init_params='kmeans', weights_init=None, means_init=None, precisions_init=None, random_state=None, warm_start=False, verbose=0, verbose_interval=10):\n super().__init__(n_components=n_components, tol=tol, reg_covar=reg_covar, max_iter=max_iter, n_init=n_init, init_params=init_params, random_state=random_state, warm_start=warm_start, verbose=verbose, verbose_interval=verbose_interval)\n self.covariance_type = covariance_type\n self.weights_init = weights_init\n self.means_init = means_init\n self.precisions_init = precisions_init\n \n def _check_parameters(self, X):\n \"\"\"Check the Gaussian mixture parameters are well defined.\"\"\"\n (_, n_features) = X.shape\n if self.covariance_type not in ['spherical', 'tied', 'diag', 'full']:\n raise ValueError(\"Invalid value for 'covariance_type': %s 'covariance_type' should be in ['spherical', 'tied', 'diag', 'full']\" % self.covariance_type)\n if self.weights_init is not None:\n self.weights_init = _check_weights(self.weights_init, self.n_components)\n if self.means_init is not None:\n self.means_init = _check_means(self.means_init, self.n_components, n_features)\n if self.precisions_init is not None:\n self.precisions_init = _check_precisions(self.precisions_init, self.covariance_type, self.n_components, n_features)\n \n def _initialize(self, X, resp):\n \"\"\"Initialization of the Gaussian mixture parameters.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n\n resp : array-like of shape (n_samples, n_components)\n \"\"\"\n (n_samples, _) = X.shape\n (weights, means, covariances) = _estimate_gaussian_parameters(X, resp, self.reg_covar, self.covariance_type)\n weights /= n_samples\n self.weights_ = weights if self.weights_init is None else self.weights_init\n self.means_ = means if self.means_init is 
None else self.means_init\n if self.precisions_init is None:\n self.covariances_ = covariances\n self.precisions_cholesky_ = _compute_precision_cholesky(covariances, self.covariance_type)\n elif self.covariance_type == 'full':\n self.precisions_cholesky_ = np.array([linalg.cholesky(prec_init, lower=True) for prec_init in self.precisions_init])\n elif self.covariance_type == 'tied':\n self.precisions_cholesky_ = linalg.cholesky(self.precisions_init, lower=True)\n else:\n self.precisions_cholesky_ = np.sqrt(self.precisions_init)\n \n def _m_step(self, X, log_resp):\n \"\"\"M step.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n\n log_resp : array-like of shape (n_samples, n_components)\n Logarithm of the posterior probabilities (or responsibilities) of\n the point of each sample in X.\n \"\"\"\n (n_samples, _) = X.shape\n (self.weights_, self.means_, self.covariances_) = _estimate_gaussian_parameters(X, np.exp(log_resp), self.reg_covar, self.covariance_type)\n self.weights_ /= n_samples\n self.precisions_cholesky_ = _compute_precision_cholesky(self.covariances_, self.covariance_type)\n \n def _estimate_log_prob(self, X):\n return _estimate_log_gaussian_prob(X, self.means_, self.precisions_cholesky_, self.covariance_type)\n \n def _estimate_log_weights(self):\n return np.log(self.weights_)\n \n def _compute_lower_bound(self, _, log_prob_norm):\n return log_prob_norm\n \n def _get_parameters(self):\n return self.weights_, self.means_, self.covariances_, self.precisions_cholesky_\n \n def _set_parameters(self, params):\n (self.weights_, self.means_, self.covariances_, self.precisions_cholesky_) = params\n (_, n_features) = self.means_.shape\n if self.covariance_type == 'full':\n self.precisions_ = np.empty(self.precisions_cholesky_.shape)\n for (k, prec_chol) in enumerate(self.precisions_cholesky_):\n self.precisions_[k] = np.dot(prec_chol, prec_chol.T)\n elif self.covariance_type == 'tied':\n self.precisions_ = np.dot(self.precisions_cholesky_, self.precisions_cholesky_.T)\n else:\n self.precisions_ = self.precisions_cholesky_**2\n \n def _n_parameters(self):\n \"\"\"Return the number of free parameters in the model.\"\"\"\n (_, n_features) = self.means_.shape\n if self.covariance_type == 'full':\n cov_params = self.n_components * n_features * (n_features + 1) / 2.0\n elif self.covariance_type == 'diag':\n cov_params = self.n_components * n_features\n elif self.covariance_type == 'tied':\n cov_params = n_features * (n_features + 1) / 2.0\n elif self.covariance_type == 'spherical':\n cov_params = self.n_components\n mean_params = n_features * self.n_components\n return int(cov_params + mean_params + self.n_components - 1)\n \n def bic(self, X):\n \"\"\"Bayesian information criterion for the current model on the input X.\n\n You can refer to this :ref:`mathematical section ` for more\n details regarding the formulation of the BIC used.\n\n Parameters\n ----------\n X : array of shape (n_samples, n_dimensions)\n The input samples.\n\n Returns\n -------\n bic : float\n The lower the better.\n \"\"\"\n return -2 * self.score(X) * X.shape[0] + self._n_parameters() * np.log(X.shape[0])\n \n def aic(self, X):\n \"\"\"Akaike information criterion for the current model on the input X.\n\n You can refer to this :ref:`mathematical section ` for more\n details regarding the formulation of the AIC used.\n\n Parameters\n ----------\n X : array of shape (n_samples, n_dimensions)\n The input samples.\n\n Returns\n -------\n aic : float\n The lower the better.\n \"\"\"\n return 
-2 * self.score(X) * X.shape[0] + 2 * self._n_parameters()\n" }, { "name": "BaseSearchCV", @@ -24585,7 +24667,7 @@ "sklearn.model_selection._search.GridSearchCV._run_search" ], "is_public": true, - "description": "Exhaustive search over specified parameter values for an estimator.\n\nImportant members are fit, predict. GridSearchCV implements a \"fit\" and a \"score\" method. It also implements \"score_samples\", \"predict\", \"predict_proba\", \"decision_function\", \"transform\" and \"inverse_transform\" if they are implemented in the estimator used. The parameters of the estimator used to apply these methods are optimized by cross-validated grid-search over a parameter grid. Read more in the :ref:`User Guide `.", + "description": "Exhaustive search over specified parameter values for an estimator.\n\nImportant members are fit, predict.\n\nGridSearchCV implements a \"fit\" and a \"score\" method.\nIt also implements \"score_samples\", \"predict\", \"predict_proba\",\n\"decision_function\", \"transform\" and \"inverse_transform\" if they are\nimplemented in the estimator used.\n\nThe parameters of the estimator used to apply these methods are optimized\nby cross-validated grid-search over a parameter grid.\n\nRead more in the :ref:`User Guide `.", "docstring": "Exhaustive search over specified parameter values for an estimator.\n\n Important members are fit, predict.\n\n GridSearchCV implements a \"fit\" and a \"score\" method.\n It also implements \"score_samples\", \"predict\", \"predict_proba\",\n \"decision_function\", \"transform\" and \"inverse_transform\" if they are\n implemented in the estimator used.\n\n The parameters of the estimator used to apply these methods are optimized\n by cross-validated grid-search over a parameter grid.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n estimator : estimator object\n This is assumed to implement the scikit-learn estimator interface.\n Either estimator needs to provide a ``score`` function,\n or ``scoring`` must be passed.\n\n param_grid : dict or list of dictionaries\n Dictionary with parameters names (`str`) as keys and lists of\n parameter settings to try as values, or a list of such\n dictionaries, in which case the grids spanned by each dictionary\n in the list are explored. This enables searching over any sequence\n of parameter settings.\n\n scoring : str, callable, list, tuple or dict, default=None\n Strategy to evaluate the performance of the cross-validated model on\n the test set.\n\n If `scoring` represents a single score, one can use:\n\n - a single string (see :ref:`scoring_parameter`);\n - a callable (see :ref:`scoring`) that returns a single value.\n\n If `scoring` represents multiple scores, one can use:\n\n - a list or tuple of unique strings;\n - a callable returning a dictionary where the keys are the metric\n names and the values are the metric scores;\n - a dictionary with metric names as keys and callables a values.\n\n See :ref:`multimetric_grid_search` for an example.\n\n n_jobs : int, default=None\n Number of jobs to run in parallel.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n .. 
versionchanged:: v0.20\n `n_jobs` default changed from 1 to None\n\n refit : bool, str, or callable, default=True\n Refit an estimator using the best found parameters on the whole\n dataset.\n\n For multiple metric evaluation, this needs to be a `str` denoting the\n scorer that would be used to find the best parameters for refitting\n the estimator at the end.\n\n Where there are considerations other than maximum score in\n choosing a best estimator, ``refit`` can be set to a function which\n returns the selected ``best_index_`` given ``cv_results_``. In that\n case, the ``best_estimator_`` and ``best_params_`` will be set\n according to the returned ``best_index_`` while the ``best_score_``\n attribute will not be available.\n\n The refitted estimator is made available at the ``best_estimator_``\n attribute and permits using ``predict`` directly on this\n ``GridSearchCV`` instance.\n\n Also for multiple metric evaluation, the attributes ``best_index_``,\n ``best_score_`` and ``best_params_`` will only be available if\n ``refit`` is set and all of them will be determined w.r.t this specific\n scorer.\n\n See ``scoring`` parameter to know more about multiple metric\n evaluation.\n\n .. versionchanged:: 0.20\n Support for callable added.\n\n cv : int, cross-validation generator or an iterable, default=None\n Determines the cross-validation splitting strategy.\n Possible inputs for cv are:\n\n - None, to use the default 5-fold cross validation,\n - integer, to specify the number of folds in a `(Stratified)KFold`,\n - :term:`CV splitter`,\n - An iterable yielding (train, test) splits as arrays of indices.\n\n For integer/None inputs, if the estimator is a classifier and ``y`` is\n either binary or multiclass, :class:`StratifiedKFold` is used. In all\n other cases, :class:`KFold` is used. These splitters are instantiated\n with `shuffle=False` so the splits will be the same across calls.\n\n Refer :ref:`User Guide ` for the various\n cross-validation strategies that can be used here.\n\n .. versionchanged:: 0.22\n ``cv`` default value if None changed from 3-fold to 5-fold.\n\n verbose : int\n Controls the verbosity: the higher, the more messages.\n\n - >1 : the computation time for each fold and parameter candidate is\n displayed;\n - >2 : the score is also displayed;\n - >3 : the fold and candidate parameter indexes are also displayed\n together with the starting time of the computation.\n\n pre_dispatch : int, or str, default='2*n_jobs'\n Controls the number of jobs that get dispatched during parallel\n execution. Reducing this number can be useful to avoid an\n explosion of memory consumption when more jobs get dispatched\n than CPUs can process. This parameter can be:\n\n - None, in which case all the jobs are immediately\n created and spawned. Use this for lightweight and\n fast-running jobs, to avoid delays due to on-demand\n spawning of the jobs\n\n - An int, giving the exact number of total jobs that are\n spawned\n\n - A str, giving an expression as a function of n_jobs,\n as in '2*n_jobs'\n\n error_score : 'raise' or numeric, default=np.nan\n Value to assign to the score if an error occurs in estimator fitting.\n If set to 'raise', the error is raised. If a numeric value is given,\n FitFailedWarning is raised. 
This parameter does not affect the refit\n step, which will always raise the error.\n\n return_train_score : bool, default=False\n If ``False``, the ``cv_results_`` attribute will not include training\n scores.\n Computing training scores is used to get insights on how different\n parameter settings impact the overfitting/underfitting trade-off.\n However computing the scores on the training set can be computationally\n expensive and is not strictly required to select the parameters that\n yield the best generalization performance.\n\n .. versionadded:: 0.19\n\n .. versionchanged:: 0.21\n Default value was changed from ``True`` to ``False``\n\n Attributes\n ----------\n cv_results_ : dict of numpy (masked) ndarrays\n A dict with keys as column headers and values as columns, that can be\n imported into a pandas ``DataFrame``.\n\n For instance the below given table\n\n +------------+-----------+------------+-----------------+---+---------+\n |param_kernel|param_gamma|param_degree|split0_test_score|...|rank_t...|\n +============+===========+============+=================+===+=========+\n | 'poly' | -- | 2 | 0.80 |...| 2 |\n +------------+-----------+------------+-----------------+---+---------+\n | 'poly' | -- | 3 | 0.70 |...| 4 |\n +------------+-----------+------------+-----------------+---+---------+\n | 'rbf' | 0.1 | -- | 0.80 |...| 3 |\n +------------+-----------+------------+-----------------+---+---------+\n | 'rbf' | 0.2 | -- | 0.93 |...| 1 |\n +------------+-----------+------------+-----------------+---+---------+\n\n will be represented by a ``cv_results_`` dict of::\n\n {\n 'param_kernel': masked_array(data = ['poly', 'poly', 'rbf', 'rbf'],\n mask = [False False False False]...)\n 'param_gamma': masked_array(data = [-- -- 0.1 0.2],\n mask = [ True True False False]...),\n 'param_degree': masked_array(data = [2.0 3.0 -- --],\n mask = [False False True True]...),\n 'split0_test_score' : [0.80, 0.70, 0.80, 0.93],\n 'split1_test_score' : [0.82, 0.50, 0.70, 0.78],\n 'mean_test_score' : [0.81, 0.60, 0.75, 0.85],\n 'std_test_score' : [0.01, 0.10, 0.05, 0.08],\n 'rank_test_score' : [2, 4, 3, 1],\n 'split0_train_score' : [0.80, 0.92, 0.70, 0.93],\n 'split1_train_score' : [0.82, 0.55, 0.70, 0.87],\n 'mean_train_score' : [0.81, 0.74, 0.70, 0.90],\n 'std_train_score' : [0.01, 0.19, 0.00, 0.03],\n 'mean_fit_time' : [0.73, 0.63, 0.43, 0.49],\n 'std_fit_time' : [0.01, 0.02, 0.01, 0.01],\n 'mean_score_time' : [0.01, 0.06, 0.04, 0.04],\n 'std_score_time' : [0.00, 0.00, 0.00, 0.01],\n 'params' : [{'kernel': 'poly', 'degree': 2}, ...],\n }\n\n NOTE\n\n The key ``'params'`` is used to store a list of parameter\n settings dicts for all the parameter candidates.\n\n The ``mean_fit_time``, ``std_fit_time``, ``mean_score_time`` and\n ``std_score_time`` are all in seconds.\n\n For multi-metric evaluation, the scores for all the scorers are\n available in the ``cv_results_`` dict at the keys ending with that\n scorer's name (``'_'``) instead of ``'_score'`` shown\n above. ('split0_test_precision', 'mean_train_precision' etc.)\n\n best_estimator_ : estimator\n Estimator that was chosen by the search, i.e. estimator\n which gave highest score (or smallest loss if specified)\n on the left out data. 
Not available if ``refit=False``.\n\n See ``refit`` parameter for more information on allowed values.\n\n best_score_ : float\n Mean cross-validated score of the best_estimator\n\n For multi-metric evaluation, this is present only if ``refit`` is\n specified.\n\n This attribute is not available if ``refit`` is a function.\n\n best_params_ : dict\n Parameter setting that gave the best results on the hold out data.\n\n For multi-metric evaluation, this is present only if ``refit`` is\n specified.\n\n best_index_ : int\n The index (of the ``cv_results_`` arrays) which corresponds to the best\n candidate parameter setting.\n\n The dict at ``search.cv_results_['params'][search.best_index_]`` gives\n the parameter setting for the best model, that gives the highest\n mean score (``search.best_score_``).\n\n For multi-metric evaluation, this is present only if ``refit`` is\n specified.\n\n scorer_ : function or a dict\n Scorer function used on the held out data to choose the best\n parameters for the model.\n\n For multi-metric evaluation, this attribute holds the validated\n ``scoring`` dict which maps the scorer key to the scorer callable.\n\n n_splits_ : int\n The number of cross-validation splits (folds/iterations).\n\n refit_time_ : float\n Seconds used for refitting the best model on the whole dataset.\n\n This is present only if ``refit`` is not False.\n\n .. versionadded:: 0.20\n\n multimetric_ : bool\n Whether or not the scorers compute several metrics.\n\n classes_ : ndarray of shape (n_classes,)\n The classes labels. This is present only if ``refit`` is specified and\n the underlying estimator is a classifier.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`. Only defined if\n `best_estimator_` is defined (see the documentation for the `refit`\n parameter for more details) and that `best_estimator_` exposes\n `n_features_in_` when fit.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Only defined if\n `best_estimator_` is defined (see the documentation for the `refit`\n parameter for more details) and that `best_estimator_` exposes\n `feature_names_in_` when fit.\n\n .. versionadded:: 1.0\n\n Notes\n -----\n The parameters selected are those that maximize the score of the left out\n data, unless an explicit score is passed in which case it is used instead.\n\n If `n_jobs` was set to a value higher than one, the data is copied for each\n point in the grid (and not `n_jobs` times). This is done for efficiency\n reasons if individual jobs take very little time, but may raise errors if\n the dataset is large and not enough memory is available. A workaround in\n this case is to set `pre_dispatch`. Then, the memory is copied only\n `pre_dispatch` many times. 
A reasonable value for `pre_dispatch` is `2 *\n n_jobs`.\n\n See Also\n ---------\n ParameterGrid : Generates all the combinations of a hyperparameter grid.\n train_test_split : Utility function to split the data into a development\n set usable for fitting a GridSearchCV instance and an evaluation set\n for its final evaluation.\n sklearn.metrics.make_scorer : Make a scorer from a performance metric or\n loss function.\n\n Examples\n --------\n >>> from sklearn import svm, datasets\n >>> from sklearn.model_selection import GridSearchCV\n >>> iris = datasets.load_iris()\n >>> parameters = {'kernel':('linear', 'rbf'), 'C':[1, 10]}\n >>> svc = svm.SVC()\n >>> clf = GridSearchCV(svc, parameters)\n >>> clf.fit(iris.data, iris.target)\n GridSearchCV(estimator=SVC(),\n param_grid={'C': [1, 10], 'kernel': ('linear', 'rbf')})\n >>> sorted(clf.cv_results_.keys())\n ['mean_fit_time', 'mean_score_time', 'mean_test_score',...\n 'param_C', 'param_kernel', 'params',...\n 'rank_test_score', 'split0_test_score',...\n 'split2_test_score', ...\n 'std_fit_time', 'std_score_time', 'std_test_score']\n ", "source_code": "\n\nclass GridSearchCV(BaseSearchCV):\n \"\"\"Exhaustive search over specified parameter values for an estimator.\n\n Important members are fit, predict.\n\n GridSearchCV implements a \"fit\" and a \"score\" method.\n It also implements \"score_samples\", \"predict\", \"predict_proba\",\n \"decision_function\", \"transform\" and \"inverse_transform\" if they are\n implemented in the estimator used.\n\n The parameters of the estimator used to apply these methods are optimized\n by cross-validated grid-search over a parameter grid.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n estimator : estimator object\n This is assumed to implement the scikit-learn estimator interface.\n Either estimator needs to provide a ``score`` function,\n or ``scoring`` must be passed.\n\n param_grid : dict or list of dictionaries\n Dictionary with parameters names (`str`) as keys and lists of\n parameter settings to try as values, or a list of such\n dictionaries, in which case the grids spanned by each dictionary\n in the list are explored. This enables searching over any sequence\n of parameter settings.\n\n scoring : str, callable, list, tuple or dict, default=None\n Strategy to evaluate the performance of the cross-validated model on\n the test set.\n\n If `scoring` represents a single score, one can use:\n\n - a single string (see :ref:`scoring_parameter`);\n - a callable (see :ref:`scoring`) that returns a single value.\n\n If `scoring` represents multiple scores, one can use:\n\n - a list or tuple of unique strings;\n - a callable returning a dictionary where the keys are the metric\n names and the values are the metric scores;\n - a dictionary with metric names as keys and callables a values.\n\n See :ref:`multimetric_grid_search` for an example.\n\n n_jobs : int, default=None\n Number of jobs to run in parallel.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n .. 
versionchanged:: v0.20\n `n_jobs` default changed from 1 to None\n\n refit : bool, str, or callable, default=True\n Refit an estimator using the best found parameters on the whole\n dataset.\n\n For multiple metric evaluation, this needs to be a `str` denoting the\n scorer that would be used to find the best parameters for refitting\n the estimator at the end.\n\n Where there are considerations other than maximum score in\n choosing a best estimator, ``refit`` can be set to a function which\n returns the selected ``best_index_`` given ``cv_results_``. In that\n case, the ``best_estimator_`` and ``best_params_`` will be set\n according to the returned ``best_index_`` while the ``best_score_``\n attribute will not be available.\n\n The refitted estimator is made available at the ``best_estimator_``\n attribute and permits using ``predict`` directly on this\n ``GridSearchCV`` instance.\n\n Also for multiple metric evaluation, the attributes ``best_index_``,\n ``best_score_`` and ``best_params_`` will only be available if\n ``refit`` is set and all of them will be determined w.r.t this specific\n scorer.\n\n See ``scoring`` parameter to know more about multiple metric\n evaluation.\n\n .. versionchanged:: 0.20\n Support for callable added.\n\n cv : int, cross-validation generator or an iterable, default=None\n Determines the cross-validation splitting strategy.\n Possible inputs for cv are:\n\n - None, to use the default 5-fold cross validation,\n - integer, to specify the number of folds in a `(Stratified)KFold`,\n - :term:`CV splitter`,\n - An iterable yielding (train, test) splits as arrays of indices.\n\n For integer/None inputs, if the estimator is a classifier and ``y`` is\n either binary or multiclass, :class:`StratifiedKFold` is used. In all\n other cases, :class:`KFold` is used. These splitters are instantiated\n with `shuffle=False` so the splits will be the same across calls.\n\n Refer :ref:`User Guide ` for the various\n cross-validation strategies that can be used here.\n\n .. versionchanged:: 0.22\n ``cv`` default value if None changed from 3-fold to 5-fold.\n\n verbose : int\n Controls the verbosity: the higher, the more messages.\n\n - >1 : the computation time for each fold and parameter candidate is\n displayed;\n - >2 : the score is also displayed;\n - >3 : the fold and candidate parameter indexes are also displayed\n together with the starting time of the computation.\n\n pre_dispatch : int, or str, default='2*n_jobs'\n Controls the number of jobs that get dispatched during parallel\n execution. Reducing this number can be useful to avoid an\n explosion of memory consumption when more jobs get dispatched\n than CPUs can process. This parameter can be:\n\n - None, in which case all the jobs are immediately\n created and spawned. Use this for lightweight and\n fast-running jobs, to avoid delays due to on-demand\n spawning of the jobs\n\n - An int, giving the exact number of total jobs that are\n spawned\n\n - A str, giving an expression as a function of n_jobs,\n as in '2*n_jobs'\n\n error_score : 'raise' or numeric, default=np.nan\n Value to assign to the score if an error occurs in estimator fitting.\n If set to 'raise', the error is raised. If a numeric value is given,\n FitFailedWarning is raised. 
This parameter does not affect the refit\n step, which will always raise the error.\n\n return_train_score : bool, default=False\n If ``False``, the ``cv_results_`` attribute will not include training\n scores.\n Computing training scores is used to get insights on how different\n parameter settings impact the overfitting/underfitting trade-off.\n However computing the scores on the training set can be computationally\n expensive and is not strictly required to select the parameters that\n yield the best generalization performance.\n\n .. versionadded:: 0.19\n\n .. versionchanged:: 0.21\n Default value was changed from ``True`` to ``False``\n\n Attributes\n ----------\n cv_results_ : dict of numpy (masked) ndarrays\n A dict with keys as column headers and values as columns, that can be\n imported into a pandas ``DataFrame``.\n\n For instance the below given table\n\n +------------+-----------+------------+-----------------+---+---------+\n |param_kernel|param_gamma|param_degree|split0_test_score|...|rank_t...|\n +============+===========+============+=================+===+=========+\n | 'poly' | -- | 2 | 0.80 |...| 2 |\n +------------+-----------+------------+-----------------+---+---------+\n | 'poly' | -- | 3 | 0.70 |...| 4 |\n +------------+-----------+------------+-----------------+---+---------+\n | 'rbf' | 0.1 | -- | 0.80 |...| 3 |\n +------------+-----------+------------+-----------------+---+---------+\n | 'rbf' | 0.2 | -- | 0.93 |...| 1 |\n +------------+-----------+------------+-----------------+---+---------+\n\n will be represented by a ``cv_results_`` dict of::\n\n {\n 'param_kernel': masked_array(data = ['poly', 'poly', 'rbf', 'rbf'],\n mask = [False False False False]...)\n 'param_gamma': masked_array(data = [-- -- 0.1 0.2],\n mask = [ True True False False]...),\n 'param_degree': masked_array(data = [2.0 3.0 -- --],\n mask = [False False True True]...),\n 'split0_test_score' : [0.80, 0.70, 0.80, 0.93],\n 'split1_test_score' : [0.82, 0.50, 0.70, 0.78],\n 'mean_test_score' : [0.81, 0.60, 0.75, 0.85],\n 'std_test_score' : [0.01, 0.10, 0.05, 0.08],\n 'rank_test_score' : [2, 4, 3, 1],\n 'split0_train_score' : [0.80, 0.92, 0.70, 0.93],\n 'split1_train_score' : [0.82, 0.55, 0.70, 0.87],\n 'mean_train_score' : [0.81, 0.74, 0.70, 0.90],\n 'std_train_score' : [0.01, 0.19, 0.00, 0.03],\n 'mean_fit_time' : [0.73, 0.63, 0.43, 0.49],\n 'std_fit_time' : [0.01, 0.02, 0.01, 0.01],\n 'mean_score_time' : [0.01, 0.06, 0.04, 0.04],\n 'std_score_time' : [0.00, 0.00, 0.00, 0.01],\n 'params' : [{'kernel': 'poly', 'degree': 2}, ...],\n }\n\n NOTE\n\n The key ``'params'`` is used to store a list of parameter\n settings dicts for all the parameter candidates.\n\n The ``mean_fit_time``, ``std_fit_time``, ``mean_score_time`` and\n ``std_score_time`` are all in seconds.\n\n For multi-metric evaluation, the scores for all the scorers are\n available in the ``cv_results_`` dict at the keys ending with that\n scorer's name (``'_'``) instead of ``'_score'`` shown\n above. ('split0_test_precision', 'mean_train_precision' etc.)\n\n best_estimator_ : estimator\n Estimator that was chosen by the search, i.e. estimator\n which gave highest score (or smallest loss if specified)\n on the left out data. 
Not available if ``refit=False``.\n\n See ``refit`` parameter for more information on allowed values.\n\n best_score_ : float\n Mean cross-validated score of the best_estimator\n\n For multi-metric evaluation, this is present only if ``refit`` is\n specified.\n\n This attribute is not available if ``refit`` is a function.\n\n best_params_ : dict\n Parameter setting that gave the best results on the hold out data.\n\n For multi-metric evaluation, this is present only if ``refit`` is\n specified.\n\n best_index_ : int\n The index (of the ``cv_results_`` arrays) which corresponds to the best\n candidate parameter setting.\n\n The dict at ``search.cv_results_['params'][search.best_index_]`` gives\n the parameter setting for the best model, that gives the highest\n mean score (``search.best_score_``).\n\n For multi-metric evaluation, this is present only if ``refit`` is\n specified.\n\n scorer_ : function or a dict\n Scorer function used on the held out data to choose the best\n parameters for the model.\n\n For multi-metric evaluation, this attribute holds the validated\n ``scoring`` dict which maps the scorer key to the scorer callable.\n\n n_splits_ : int\n The number of cross-validation splits (folds/iterations).\n\n refit_time_ : float\n Seconds used for refitting the best model on the whole dataset.\n\n This is present only if ``refit`` is not False.\n\n .. versionadded:: 0.20\n\n multimetric_ : bool\n Whether or not the scorers compute several metrics.\n\n classes_ : ndarray of shape (n_classes,)\n The classes labels. This is present only if ``refit`` is specified and\n the underlying estimator is a classifier.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`. Only defined if\n `best_estimator_` is defined (see the documentation for the `refit`\n parameter for more details) and that `best_estimator_` exposes\n `n_features_in_` when fit.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Only defined if\n `best_estimator_` is defined (see the documentation for the `refit`\n parameter for more details) and that `best_estimator_` exposes\n `feature_names_in_` when fit.\n\n .. versionadded:: 1.0\n\n Notes\n -----\n The parameters selected are those that maximize the score of the left out\n data, unless an explicit score is passed in which case it is used instead.\n\n If `n_jobs` was set to a value higher than one, the data is copied for each\n point in the grid (and not `n_jobs` times). This is done for efficiency\n reasons if individual jobs take very little time, but may raise errors if\n the dataset is large and not enough memory is available. A workaround in\n this case is to set `pre_dispatch`. Then, the memory is copied only\n `pre_dispatch` many times. 
A reasonable value for `pre_dispatch` is `2 *\n n_jobs`.\n\n See Also\n ---------\n ParameterGrid : Generates all the combinations of a hyperparameter grid.\n train_test_split : Utility function to split the data into a development\n set usable for fitting a GridSearchCV instance and an evaluation set\n for its final evaluation.\n sklearn.metrics.make_scorer : Make a scorer from a performance metric or\n loss function.\n\n Examples\n --------\n >>> from sklearn import svm, datasets\n >>> from sklearn.model_selection import GridSearchCV\n >>> iris = datasets.load_iris()\n >>> parameters = {'kernel':('linear', 'rbf'), 'C':[1, 10]}\n >>> svc = svm.SVC()\n >>> clf = GridSearchCV(svc, parameters)\n >>> clf.fit(iris.data, iris.target)\n GridSearchCV(estimator=SVC(),\n param_grid={'C': [1, 10], 'kernel': ('linear', 'rbf')})\n >>> sorted(clf.cv_results_.keys())\n ['mean_fit_time', 'mean_score_time', 'mean_test_score',...\n 'param_C', 'param_kernel', 'params',...\n 'rank_test_score', 'split0_test_score',...\n 'split2_test_score', ...\n 'std_fit_time', 'std_score_time', 'std_test_score']\n \"\"\"\n _required_parameters = ['estimator', 'param_grid']\n \n def __init__(self, estimator, param_grid, *, scoring=None, n_jobs=None, refit=True, cv=None, verbose=0, pre_dispatch='2*n_jobs', error_score=np.nan, return_train_score=False):\n super().__init__(estimator=estimator, scoring=scoring, n_jobs=n_jobs, refit=refit, cv=cv, verbose=verbose, pre_dispatch=pre_dispatch, error_score=error_score, return_train_score=return_train_score)\n self.param_grid = param_grid\n _check_param_grid(param_grid)\n \n def _run_search(self, evaluate_candidates):\n \"\"\"Search all candidates in param_grid\"\"\"\n evaluate_candidates(ParameterGrid(self.param_grid))\n" }, @@ -24601,7 +24683,7 @@ "sklearn.model_selection._search.ParameterGrid.__getitem__" ], "is_public": true, - "description": "Grid of parameters with a discrete number of values for each.\n\nCan be used to iterate over parameter value combinations with the Python built-in function iter. The order of the generated parameter combinations is deterministic. Read more in the :ref:`User Guide `.", + "description": "Grid of parameters with a discrete number of values for each.\n\nCan be used to iterate over parameter value combinations with the\nPython built-in function iter.\nThe order of the generated parameter combinations is deterministic.\n\nRead more in the :ref:`User Guide `.", "docstring": "Grid of parameters with a discrete number of values for each.\n\n Can be used to iterate over parameter value combinations with the\n Python built-in function iter.\n The order of the generated parameter combinations is deterministic.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n param_grid : dict of str to sequence, or sequence of such\n The parameter grid to explore, as a dictionary mapping estimator\n parameters to sequences of allowed values.\n\n An empty dict signifies default parameters.\n\n A sequence of dicts signifies a sequence of grids to search, and is\n useful to avoid exploring parameter combinations that make no sense\n or have no effect. See the examples below.\n\n Examples\n --------\n >>> from sklearn.model_selection import ParameterGrid\n >>> param_grid = {'a': [1, 2], 'b': [True, False]}\n >>> list(ParameterGrid(param_grid)) == (\n ... [{'a': 1, 'b': True}, {'a': 1, 'b': False},\n ... 
{'a': 2, 'b': True}, {'a': 2, 'b': False}])\n True\n\n >>> grid = [{'kernel': ['linear']}, {'kernel': ['rbf'], 'gamma': [1, 10]}]\n >>> list(ParameterGrid(grid)) == [{'kernel': 'linear'},\n ... {'kernel': 'rbf', 'gamma': 1},\n ... {'kernel': 'rbf', 'gamma': 10}]\n True\n >>> ParameterGrid(grid)[1] == {'kernel': 'rbf', 'gamma': 1}\n True\n\n See Also\n --------\n GridSearchCV : Uses :class:`ParameterGrid` to perform a full parallelized\n parameter search.\n ", "source_code": "\n\nclass ParameterGrid:\n \"\"\"Grid of parameters with a discrete number of values for each.\n\n Can be used to iterate over parameter value combinations with the\n Python built-in function iter.\n The order of the generated parameter combinations is deterministic.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n param_grid : dict of str to sequence, or sequence of such\n The parameter grid to explore, as a dictionary mapping estimator\n parameters to sequences of allowed values.\n\n An empty dict signifies default parameters.\n\n A sequence of dicts signifies a sequence of grids to search, and is\n useful to avoid exploring parameter combinations that make no sense\n or have no effect. See the examples below.\n\n Examples\n --------\n >>> from sklearn.model_selection import ParameterGrid\n >>> param_grid = {'a': [1, 2], 'b': [True, False]}\n >>> list(ParameterGrid(param_grid)) == (\n ... [{'a': 1, 'b': True}, {'a': 1, 'b': False},\n ... {'a': 2, 'b': True}, {'a': 2, 'b': False}])\n True\n\n >>> grid = [{'kernel': ['linear']}, {'kernel': ['rbf'], 'gamma': [1, 10]}]\n >>> list(ParameterGrid(grid)) == [{'kernel': 'linear'},\n ... {'kernel': 'rbf', 'gamma': 1},\n ... {'kernel': 'rbf', 'gamma': 10}]\n True\n >>> ParameterGrid(grid)[1] == {'kernel': 'rbf', 'gamma': 1}\n True\n\n See Also\n --------\n GridSearchCV : Uses :class:`ParameterGrid` to perform a full parallelized\n parameter search.\n \"\"\"\n \n def __init__(self, param_grid):\n if not isinstance(param_grid, (Mapping, Iterable)):\n raise TypeError('Parameter grid is not a dict or a list ({!r})'.format(param_grid))\n if isinstance(param_grid, Mapping):\n param_grid = [param_grid]\n for grid in param_grid:\n if not isinstance(grid, dict):\n raise TypeError('Parameter grid is not a dict ({!r})'.format(grid))\n for key in grid:\n if not isinstance(grid[key], Iterable):\n raise TypeError('Parameter grid value is not iterable (key={!r}, value={!r})'.format(key, grid[key]))\n self.param_grid = param_grid\n \n def __iter__(self):\n \"\"\"Iterate over the points in the grid.\n\n Returns\n -------\n params : iterator over dict of str to any\n Yields dictionaries mapping each estimator parameter to one of its\n allowed values.\n \"\"\"\n for p in self.param_grid:\n items = sorted(p.items())\n if not items:\n yield {}\n else:\n (keys, values) = zip(*items)\n for v in product(*values):\n params = dict(zip(keys, v))\n yield params\n \n def __len__(self):\n \"\"\"Number of points on the grid.\"\"\"\n product = partial(reduce, operator.mul)\n return sum((product((len(v) for v in p.values())) if p else 1 for p in self.param_grid))\n \n def __getitem__(self, ind):\n \"\"\"Get the parameters that would be ``ind``th in iteration\n\n Parameters\n ----------\n ind : int\n The iteration index\n\n Returns\n -------\n params : dict of str to any\n Equal to list(self)[ind]\n \"\"\"\n for sub_grid in self.param_grid:\n if not sub_grid:\n if ind == 0:\n return {}\n else:\n ind -= 1\n continue\n (keys, values_lists) = zip(*sorted(sub_grid.items())[::-1])\n sizes = 
[len(v_list) for v_list in values_lists]\n total = np.product(sizes)\n if ind >= total:\n ind -= total\n else:\n out = {}\n for (key, v_list, n) in zip(keys, values_lists, sizes):\n (ind, offset) = divmod(ind, n)\n out[key] = v_list[offset]\n return out\n raise IndexError('ParameterGrid index out of range')\n" }, @@ -24617,7 +24699,7 @@ "sklearn.model_selection._search.ParameterSampler.__len__" ], "is_public": true, - "description": "Generator on parameters sampled from given distributions.\n\nNon-deterministic iterable over random candidate combinations for hyper- parameter search. If all parameters are presented as a list, sampling without replacement is performed. If at least one parameter is given as a distribution, sampling with replacement is used. It is highly recommended to use continuous distributions for continuous parameters. Read more in the :ref:`User Guide `.", + "description": "Generator on parameters sampled from given distributions.\n\nNon-deterministic iterable over random candidate combinations for hyper-\nparameter search. If all parameters are presented as a list,\nsampling without replacement is performed. If at least one parameter\nis given as a distribution, sampling with replacement is used.\nIt is highly recommended to use continuous distributions for continuous\nparameters.\n\nRead more in the :ref:`User Guide `.", "docstring": "Generator on parameters sampled from given distributions.\n\n Non-deterministic iterable over random candidate combinations for hyper-\n parameter search. If all parameters are presented as a list,\n sampling without replacement is performed. If at least one parameter\n is given as a distribution, sampling with replacement is used.\n It is highly recommended to use continuous distributions for continuous\n parameters.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n param_distributions : dict\n Dictionary with parameters names (`str`) as keys and distributions\n or lists of parameters to try. Distributions must provide a ``rvs``\n method for sampling (such as those from scipy.stats.distributions).\n If a list is given, it is sampled uniformly.\n If a list of dicts is given, first a dict is sampled uniformly, and\n then a parameter is sampled using that dict as above.\n\n n_iter : int\n Number of parameter settings that are produced.\n\n random_state : int, RandomState instance or None, default=None\n Pseudo random number generator state used for random uniform sampling\n from lists of possible values instead of scipy.stats distributions.\n Pass an int for reproducible output across multiple\n function calls.\n See :term:`Glossary `.\n\n Returns\n -------\n params : dict of str to any\n **Yields** dictionaries mapping each estimator parameter to\n as sampled value.\n\n Examples\n --------\n >>> from sklearn.model_selection import ParameterSampler\n >>> from scipy.stats.distributions import expon\n >>> import numpy as np\n >>> rng = np.random.RandomState(0)\n >>> param_grid = {'a':[1, 2], 'b': expon()}\n >>> param_list = list(ParameterSampler(param_grid, n_iter=4,\n ... random_state=rng))\n >>> rounded_list = [dict((k, round(v, 6)) for (k, v) in d.items())\n ... for d in param_list]\n >>> rounded_list == [{'b': 0.89856, 'a': 1},\n ... {'b': 0.923223, 'a': 1},\n ... {'b': 1.878964, 'a': 2},\n ... 
{'b': 1.038159, 'a': 2}]\n True\n ", "source_code": "\n\nclass ParameterSampler:\n \"\"\"Generator on parameters sampled from given distributions.\n\n Non-deterministic iterable over random candidate combinations for hyper-\n parameter search. If all parameters are presented as a list,\n sampling without replacement is performed. If at least one parameter\n is given as a distribution, sampling with replacement is used.\n It is highly recommended to use continuous distributions for continuous\n parameters.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n param_distributions : dict\n Dictionary with parameters names (`str`) as keys and distributions\n or lists of parameters to try. Distributions must provide a ``rvs``\n method for sampling (such as those from scipy.stats.distributions).\n If a list is given, it is sampled uniformly.\n If a list of dicts is given, first a dict is sampled uniformly, and\n then a parameter is sampled using that dict as above.\n\n n_iter : int\n Number of parameter settings that are produced.\n\n random_state : int, RandomState instance or None, default=None\n Pseudo random number generator state used for random uniform sampling\n from lists of possible values instead of scipy.stats distributions.\n Pass an int for reproducible output across multiple\n function calls.\n See :term:`Glossary `.\n\n Returns\n -------\n params : dict of str to any\n **Yields** dictionaries mapping each estimator parameter to\n as sampled value.\n\n Examples\n --------\n >>> from sklearn.model_selection import ParameterSampler\n >>> from scipy.stats.distributions import expon\n >>> import numpy as np\n >>> rng = np.random.RandomState(0)\n >>> param_grid = {'a':[1, 2], 'b': expon()}\n >>> param_list = list(ParameterSampler(param_grid, n_iter=4,\n ... random_state=rng))\n >>> rounded_list = [dict((k, round(v, 6)) for (k, v) in d.items())\n ... for d in param_list]\n >>> rounded_list == [{'b': 0.89856, 'a': 1},\n ... {'b': 0.923223, 'a': 1},\n ... {'b': 1.878964, 'a': 2},\n ... {'b': 1.038159, 'a': 2}]\n True\n \"\"\"\n \n def __init__(self, param_distributions, n_iter, *, random_state=None):\n if not isinstance(param_distributions, (Mapping, Iterable)):\n raise TypeError('Parameter distribution is not a dict or a list ({!r})'.format(param_distributions))\n if isinstance(param_distributions, Mapping):\n param_distributions = [param_distributions]\n for dist in param_distributions:\n if not isinstance(dist, dict):\n raise TypeError('Parameter distribution is not a dict ({!r})'.format(dist))\n for key in dist:\n if not isinstance(dist[key], Iterable) and not hasattr(dist[key], 'rvs'):\n raise TypeError('Parameter value is not iterable or distribution (key={!r}, value={!r})'.format(key, dist[key]))\n self.n_iter = n_iter\n self.random_state = random_state\n self.param_distributions = param_distributions\n \n def _is_all_lists(self):\n return all((all((not hasattr(v, 'rvs') for v in dist.values())) for dist in self.param_distributions))\n \n def __iter__(self):\n rng = check_random_state(self.random_state)\n if self._is_all_lists():\n param_grid = ParameterGrid(self.param_distributions)\n grid_size = len(param_grid)\n n_iter = self.n_iter\n if grid_size < n_iter:\n warnings.warn('The total space of parameters %d is smaller than n_iter=%d. Running %d iterations. For exhaustive searches, use GridSearchCV.' 
% (grid_size, self.n_iter, grid_size), UserWarning)\n n_iter = grid_size\n for i in sample_without_replacement(grid_size, n_iter, random_state=rng):\n yield param_grid[i]\n else:\n for _ in range(self.n_iter):\n dist = rng.choice(self.param_distributions)\n items = sorted(dist.items())\n params = dict()\n for (k, v) in items:\n if hasattr(v, 'rvs'):\n params[k] = v.rvs(random_state=rng)\n else:\n params[k] = v[rng.randint(len(v))]\n yield params\n \n def __len__(self):\n \"\"\"Number of points that will be sampled.\"\"\"\n if self._is_all_lists():\n grid_size = len(ParameterGrid(self.param_distributions))\n return min(self.n_iter, grid_size)\n else:\n return self.n_iter\n" }, @@ -24631,7 +24713,7 @@ "sklearn.model_selection._search.RandomizedSearchCV._run_search" ], "is_public": true, - "description": "Randomized search on hyper parameters.\n\nRandomizedSearchCV implements a \"fit\" and a \"score\" method. It also implements \"score_samples\", \"predict\", \"predict_proba\", \"decision_function\", \"transform\" and \"inverse_transform\" if they are implemented in the estimator used. The parameters of the estimator used to apply these methods are optimized by cross-validated search over parameter settings. In contrast to GridSearchCV, not all parameter values are tried out, but rather a fixed number of parameter settings is sampled from the specified distributions. The number of parameter settings that are tried is given by n_iter. If all parameters are presented as a list, sampling without replacement is performed. If at least one parameter is given as a distribution, sampling with replacement is used. It is highly recommended to use continuous distributions for continuous parameters. Read more in the :ref:`User Guide `. .. versionadded:: 0.14", + "description": "Randomized search on hyper parameters.\n\nRandomizedSearchCV implements a \"fit\" and a \"score\" method.\nIt also implements \"score_samples\", \"predict\", \"predict_proba\",\n\"decision_function\", \"transform\" and \"inverse_transform\" if they are\nimplemented in the estimator used.\n\nThe parameters of the estimator used to apply these methods are optimized\nby cross-validated search over parameter settings.\n\nIn contrast to GridSearchCV, not all parameter values are tried out, but\nrather a fixed number of parameter settings is sampled from the specified\ndistributions. The number of parameter settings that are tried is\ngiven by n_iter.\n\nIf all parameters are presented as a list,\nsampling without replacement is performed. If at least one parameter\nis given as a distribution, sampling with replacement is used.\nIt is highly recommended to use continuous distributions for continuous\nparameters.\n\nRead more in the :ref:`User Guide `.\n\n.. versionadded:: 0.14", "docstring": "Randomized search on hyper parameters.\n\n RandomizedSearchCV implements a \"fit\" and a \"score\" method.\n It also implements \"score_samples\", \"predict\", \"predict_proba\",\n \"decision_function\", \"transform\" and \"inverse_transform\" if they are\n implemented in the estimator used.\n\n The parameters of the estimator used to apply these methods are optimized\n by cross-validated search over parameter settings.\n\n In contrast to GridSearchCV, not all parameter values are tried out, but\n rather a fixed number of parameter settings is sampled from the specified\n distributions. The number of parameter settings that are tried is\n given by n_iter.\n\n If all parameters are presented as a list,\n sampling without replacement is performed. 
If at least one parameter\n is given as a distribution, sampling with replacement is used.\n It is highly recommended to use continuous distributions for continuous\n parameters.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.14\n\n Parameters\n ----------\n estimator : estimator object\n A object of that type is instantiated for each grid point.\n This is assumed to implement the scikit-learn estimator interface.\n Either estimator needs to provide a ``score`` function,\n or ``scoring`` must be passed.\n\n param_distributions : dict or list of dicts\n Dictionary with parameters names (`str`) as keys and distributions\n or lists of parameters to try. Distributions must provide a ``rvs``\n method for sampling (such as those from scipy.stats.distributions).\n If a list is given, it is sampled uniformly.\n If a list of dicts is given, first a dict is sampled uniformly, and\n then a parameter is sampled using that dict as above.\n\n n_iter : int, default=10\n Number of parameter settings that are sampled. n_iter trades\n off runtime vs quality of the solution.\n\n scoring : str, callable, list, tuple or dict, default=None\n Strategy to evaluate the performance of the cross-validated model on\n the test set.\n\n If `scoring` represents a single score, one can use:\n\n - a single string (see :ref:`scoring_parameter`);\n - a callable (see :ref:`scoring`) that returns a single value.\n\n If `scoring` represents multiple scores, one can use:\n\n - a list or tuple of unique strings;\n - a callable returning a dictionary where the keys are the metric\n names and the values are the metric scores;\n - a dictionary with metric names as keys and callables a values.\n\n See :ref:`multimetric_grid_search` for an example.\n\n If None, the estimator's score method is used.\n\n n_jobs : int, default=None\n Number of jobs to run in parallel.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n .. versionchanged:: v0.20\n `n_jobs` default changed from 1 to None\n\n refit : bool, str, or callable, default=True\n Refit an estimator using the best found parameters on the whole\n dataset.\n\n For multiple metric evaluation, this needs to be a `str` denoting the\n scorer that would be used to find the best parameters for refitting\n the estimator at the end.\n\n Where there are considerations other than maximum score in\n choosing a best estimator, ``refit`` can be set to a function which\n returns the selected ``best_index_`` given the ``cv_results``. In that\n case, the ``best_estimator_`` and ``best_params_`` will be set\n according to the returned ``best_index_`` while the ``best_score_``\n attribute will not be available.\n\n The refitted estimator is made available at the ``best_estimator_``\n attribute and permits using ``predict`` directly on this\n ``RandomizedSearchCV`` instance.\n\n Also for multiple metric evaluation, the attributes ``best_index_``,\n ``best_score_`` and ``best_params_`` will only be available if\n ``refit`` is set and all of them will be determined w.r.t this specific\n scorer.\n\n See ``scoring`` parameter to know more about multiple metric\n evaluation.\n\n .. 
versionchanged:: 0.20\n Support for callable added.\n\n cv : int, cross-validation generator or an iterable, default=None\n Determines the cross-validation splitting strategy.\n Possible inputs for cv are:\n\n - None, to use the default 5-fold cross validation,\n - integer, to specify the number of folds in a `(Stratified)KFold`,\n - :term:`CV splitter`,\n - An iterable yielding (train, test) splits as arrays of indices.\n\n For integer/None inputs, if the estimator is a classifier and ``y`` is\n either binary or multiclass, :class:`StratifiedKFold` is used. In all\n other cases, :class:`KFold` is used. These splitters are instantiated\n with `shuffle=False` so the splits will be the same across calls.\n\n Refer :ref:`User Guide ` for the various\n cross-validation strategies that can be used here.\n\n .. versionchanged:: 0.22\n ``cv`` default value if None changed from 3-fold to 5-fold.\n\n verbose : int\n Controls the verbosity: the higher, the more messages.\n\n pre_dispatch : int, or str, default='2*n_jobs'\n Controls the number of jobs that get dispatched during parallel\n execution. Reducing this number can be useful to avoid an\n explosion of memory consumption when more jobs get dispatched\n than CPUs can process. This parameter can be:\n\n - None, in which case all the jobs are immediately\n created and spawned. Use this for lightweight and\n fast-running jobs, to avoid delays due to on-demand\n spawning of the jobs\n\n - An int, giving the exact number of total jobs that are\n spawned\n\n - A str, giving an expression as a function of n_jobs,\n as in '2*n_jobs'\n\n random_state : int, RandomState instance or None, default=None\n Pseudo random number generator state used for random uniform sampling\n from lists of possible values instead of scipy.stats distributions.\n Pass an int for reproducible output across multiple\n function calls.\n See :term:`Glossary `.\n\n error_score : 'raise' or numeric, default=np.nan\n Value to assign to the score if an error occurs in estimator fitting.\n If set to 'raise', the error is raised. If a numeric value is given,\n FitFailedWarning is raised. This parameter does not affect the refit\n step, which will always raise the error.\n\n return_train_score : bool, default=False\n If ``False``, the ``cv_results_`` attribute will not include training\n scores.\n Computing training scores is used to get insights on how different\n parameter settings impact the overfitting/underfitting trade-off.\n However computing the scores on the training set can be computationally\n expensive and is not strictly required to select the parameters that\n yield the best generalization performance.\n\n .. versionadded:: 0.19\n\n .. 
versionchanged:: 0.21\n Default value was changed from ``True`` to ``False``\n\n Attributes\n ----------\n cv_results_ : dict of numpy (masked) ndarrays\n A dict with keys as column headers and values as columns, that can be\n imported into a pandas ``DataFrame``.\n\n For instance the below given table\n\n +--------------+-------------+-------------------+---+---------------+\n | param_kernel | param_gamma | split0_test_score |...|rank_test_score|\n +==============+=============+===================+===+===============+\n | 'rbf' | 0.1 | 0.80 |...| 1 |\n +--------------+-------------+-------------------+---+---------------+\n | 'rbf' | 0.2 | 0.84 |...| 3 |\n +--------------+-------------+-------------------+---+---------------+\n | 'rbf' | 0.3 | 0.70 |...| 2 |\n +--------------+-------------+-------------------+---+---------------+\n\n will be represented by a ``cv_results_`` dict of::\n\n {\n 'param_kernel' : masked_array(data = ['rbf', 'rbf', 'rbf'],\n mask = False),\n 'param_gamma' : masked_array(data = [0.1 0.2 0.3], mask = False),\n 'split0_test_score' : [0.80, 0.84, 0.70],\n 'split1_test_score' : [0.82, 0.50, 0.70],\n 'mean_test_score' : [0.81, 0.67, 0.70],\n 'std_test_score' : [0.01, 0.24, 0.00],\n 'rank_test_score' : [1, 3, 2],\n 'split0_train_score' : [0.80, 0.92, 0.70],\n 'split1_train_score' : [0.82, 0.55, 0.70],\n 'mean_train_score' : [0.81, 0.74, 0.70],\n 'std_train_score' : [0.01, 0.19, 0.00],\n 'mean_fit_time' : [0.73, 0.63, 0.43],\n 'std_fit_time' : [0.01, 0.02, 0.01],\n 'mean_score_time' : [0.01, 0.06, 0.04],\n 'std_score_time' : [0.00, 0.00, 0.00],\n 'params' : [{'kernel' : 'rbf', 'gamma' : 0.1}, ...],\n }\n\n NOTE\n\n The key ``'params'`` is used to store a list of parameter\n settings dicts for all the parameter candidates.\n\n The ``mean_fit_time``, ``std_fit_time``, ``mean_score_time`` and\n ``std_score_time`` are all in seconds.\n\n For multi-metric evaluation, the scores for all the scorers are\n available in the ``cv_results_`` dict at the keys ending with that\n scorer's name (``'_'``) instead of ``'_score'`` shown\n above. ('split0_test_precision', 'mean_train_precision' etc.)\n\n best_estimator_ : estimator\n Estimator that was chosen by the search, i.e. estimator\n which gave highest score (or smallest loss if specified)\n on the left out data. Not available if ``refit=False``.\n\n For multi-metric evaluation, this attribute is present only if\n ``refit`` is specified.\n\n See ``refit`` parameter for more information on allowed values.\n\n best_score_ : float\n Mean cross-validated score of the best_estimator.\n\n For multi-metric evaluation, this is not available if ``refit`` is\n ``False``. See ``refit`` parameter for more information.\n\n This attribute is not available if ``refit`` is a function.\n\n best_params_ : dict\n Parameter setting that gave the best results on the hold out data.\n\n For multi-metric evaluation, this is not available if ``refit`` is\n ``False``. See ``refit`` parameter for more information.\n\n best_index_ : int\n The index (of the ``cv_results_`` arrays) which corresponds to the best\n candidate parameter setting.\n\n The dict at ``search.cv_results_['params'][search.best_index_]`` gives\n the parameter setting for the best model, that gives the highest\n mean score (``search.best_score_``).\n\n For multi-metric evaluation, this is not available if ``refit`` is\n ``False``. 
See ``refit`` parameter for more information.\n\n scorer_ : function or a dict\n Scorer function used on the held out data to choose the best\n parameters for the model.\n\n For multi-metric evaluation, this attribute holds the validated\n ``scoring`` dict which maps the scorer key to the scorer callable.\n\n n_splits_ : int\n The number of cross-validation splits (folds/iterations).\n\n refit_time_ : float\n Seconds used for refitting the best model on the whole dataset.\n\n This is present only if ``refit`` is not False.\n\n .. versionadded:: 0.20\n\n multimetric_ : bool\n Whether or not the scorers compute several metrics.\n\n classes_ : ndarray of shape (n_classes,)\n The classes labels. This is present only if ``refit`` is specified and\n the underlying estimator is a classifier.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`. Only defined if\n `best_estimator_` is defined (see the documentation for the `refit`\n parameter for more details) and that `best_estimator_` exposes\n `n_features_in_` when fit.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Only defined if\n `best_estimator_` is defined (see the documentation for the `refit`\n parameter for more details) and that `best_estimator_` exposes\n `feature_names_in_` when fit.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n GridSearchCV : Does exhaustive search over a grid of parameters.\n ParameterSampler : A generator over parameter settings, constructed from\n param_distributions.\n\n Notes\n -----\n The parameters selected are those that maximize the score of the held-out\n data, according to the scoring parameter.\n\n If `n_jobs` was set to a value higher than one, the data is copied for each\n parameter setting(and not `n_jobs` times). This is done for efficiency\n reasons if individual jobs take very little time, but may raise errors if\n the dataset is large and not enough memory is available. A workaround in\n this case is to set `pre_dispatch`. Then, the memory is copied only\n `pre_dispatch` many times. A reasonable value for `pre_dispatch` is `2 *\n n_jobs`.\n\n Examples\n --------\n >>> from sklearn.datasets import load_iris\n >>> from sklearn.linear_model import LogisticRegression\n >>> from sklearn.model_selection import RandomizedSearchCV\n >>> from scipy.stats import uniform\n >>> iris = load_iris()\n >>> logistic = LogisticRegression(solver='saga', tol=1e-2, max_iter=200,\n ... random_state=0)\n >>> distributions = dict(C=uniform(loc=0, scale=4),\n ... penalty=['l2', 'l1'])\n >>> clf = RandomizedSearchCV(logistic, distributions, random_state=0)\n >>> search = clf.fit(iris.data, iris.target)\n >>> search.best_params_\n {'C': 2..., 'penalty': 'l1'}\n ", "source_code": "\n\nclass RandomizedSearchCV(BaseSearchCV):\n \"\"\"Randomized search on hyper parameters.\n\n RandomizedSearchCV implements a \"fit\" and a \"score\" method.\n It also implements \"score_samples\", \"predict\", \"predict_proba\",\n \"decision_function\", \"transform\" and \"inverse_transform\" if they are\n implemented in the estimator used.\n\n The parameters of the estimator used to apply these methods are optimized\n by cross-validated search over parameter settings.\n\n In contrast to GridSearchCV, not all parameter values are tried out, but\n rather a fixed number of parameter settings is sampled from the specified\n distributions. 
The number of parameter settings that are tried is\n given by n_iter.\n\n If all parameters are presented as a list,\n sampling without replacement is performed. If at least one parameter\n is given as a distribution, sampling with replacement is used.\n It is highly recommended to use continuous distributions for continuous\n parameters.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.14\n\n Parameters\n ----------\n estimator : estimator object\n A object of that type is instantiated for each grid point.\n This is assumed to implement the scikit-learn estimator interface.\n Either estimator needs to provide a ``score`` function,\n or ``scoring`` must be passed.\n\n param_distributions : dict or list of dicts\n Dictionary with parameters names (`str`) as keys and distributions\n or lists of parameters to try. Distributions must provide a ``rvs``\n method for sampling (such as those from scipy.stats.distributions).\n If a list is given, it is sampled uniformly.\n If a list of dicts is given, first a dict is sampled uniformly, and\n then a parameter is sampled using that dict as above.\n\n n_iter : int, default=10\n Number of parameter settings that are sampled. n_iter trades\n off runtime vs quality of the solution.\n\n scoring : str, callable, list, tuple or dict, default=None\n Strategy to evaluate the performance of the cross-validated model on\n the test set.\n\n If `scoring` represents a single score, one can use:\n\n - a single string (see :ref:`scoring_parameter`);\n - a callable (see :ref:`scoring`) that returns a single value.\n\n If `scoring` represents multiple scores, one can use:\n\n - a list or tuple of unique strings;\n - a callable returning a dictionary where the keys are the metric\n names and the values are the metric scores;\n - a dictionary with metric names as keys and callables a values.\n\n See :ref:`multimetric_grid_search` for an example.\n\n If None, the estimator's score method is used.\n\n n_jobs : int, default=None\n Number of jobs to run in parallel.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n .. versionchanged:: v0.20\n `n_jobs` default changed from 1 to None\n\n refit : bool, str, or callable, default=True\n Refit an estimator using the best found parameters on the whole\n dataset.\n\n For multiple metric evaluation, this needs to be a `str` denoting the\n scorer that would be used to find the best parameters for refitting\n the estimator at the end.\n\n Where there are considerations other than maximum score in\n choosing a best estimator, ``refit`` can be set to a function which\n returns the selected ``best_index_`` given the ``cv_results``. In that\n case, the ``best_estimator_`` and ``best_params_`` will be set\n according to the returned ``best_index_`` while the ``best_score_``\n attribute will not be available.\n\n The refitted estimator is made available at the ``best_estimator_``\n attribute and permits using ``predict`` directly on this\n ``RandomizedSearchCV`` instance.\n\n Also for multiple metric evaluation, the attributes ``best_index_``,\n ``best_score_`` and ``best_params_`` will only be available if\n ``refit`` is set and all of them will be determined w.r.t this specific\n scorer.\n\n See ``scoring`` parameter to know more about multiple metric\n evaluation.\n\n .. 
versionchanged:: 0.20\n Support for callable added.\n\n cv : int, cross-validation generator or an iterable, default=None\n Determines the cross-validation splitting strategy.\n Possible inputs for cv are:\n\n - None, to use the default 5-fold cross validation,\n - integer, to specify the number of folds in a `(Stratified)KFold`,\n - :term:`CV splitter`,\n - An iterable yielding (train, test) splits as arrays of indices.\n\n For integer/None inputs, if the estimator is a classifier and ``y`` is\n either binary or multiclass, :class:`StratifiedKFold` is used. In all\n other cases, :class:`KFold` is used. These splitters are instantiated\n with `shuffle=False` so the splits will be the same across calls.\n\n Refer :ref:`User Guide ` for the various\n cross-validation strategies that can be used here.\n\n .. versionchanged:: 0.22\n ``cv`` default value if None changed from 3-fold to 5-fold.\n\n verbose : int\n Controls the verbosity: the higher, the more messages.\n\n pre_dispatch : int, or str, default='2*n_jobs'\n Controls the number of jobs that get dispatched during parallel\n execution. Reducing this number can be useful to avoid an\n explosion of memory consumption when more jobs get dispatched\n than CPUs can process. This parameter can be:\n\n - None, in which case all the jobs are immediately\n created and spawned. Use this for lightweight and\n fast-running jobs, to avoid delays due to on-demand\n spawning of the jobs\n\n - An int, giving the exact number of total jobs that are\n spawned\n\n - A str, giving an expression as a function of n_jobs,\n as in '2*n_jobs'\n\n random_state : int, RandomState instance or None, default=None\n Pseudo random number generator state used for random uniform sampling\n from lists of possible values instead of scipy.stats distributions.\n Pass an int for reproducible output across multiple\n function calls.\n See :term:`Glossary `.\n\n error_score : 'raise' or numeric, default=np.nan\n Value to assign to the score if an error occurs in estimator fitting.\n If set to 'raise', the error is raised. If a numeric value is given,\n FitFailedWarning is raised. This parameter does not affect the refit\n step, which will always raise the error.\n\n return_train_score : bool, default=False\n If ``False``, the ``cv_results_`` attribute will not include training\n scores.\n Computing training scores is used to get insights on how different\n parameter settings impact the overfitting/underfitting trade-off.\n However computing the scores on the training set can be computationally\n expensive and is not strictly required to select the parameters that\n yield the best generalization performance.\n\n .. versionadded:: 0.19\n\n .. 
versionchanged:: 0.21\n Default value was changed from ``True`` to ``False``\n\n Attributes\n ----------\n cv_results_ : dict of numpy (masked) ndarrays\n A dict with keys as column headers and values as columns, that can be\n imported into a pandas ``DataFrame``.\n\n For instance the below given table\n\n +--------------+-------------+-------------------+---+---------------+\n | param_kernel | param_gamma | split0_test_score |...|rank_test_score|\n +==============+=============+===================+===+===============+\n | 'rbf' | 0.1 | 0.80 |...| 1 |\n +--------------+-------------+-------------------+---+---------------+\n | 'rbf' | 0.2 | 0.84 |...| 3 |\n +--------------+-------------+-------------------+---+---------------+\n | 'rbf' | 0.3 | 0.70 |...| 2 |\n +--------------+-------------+-------------------+---+---------------+\n\n will be represented by a ``cv_results_`` dict of::\n\n {\n 'param_kernel' : masked_array(data = ['rbf', 'rbf', 'rbf'],\n mask = False),\n 'param_gamma' : masked_array(data = [0.1 0.2 0.3], mask = False),\n 'split0_test_score' : [0.80, 0.84, 0.70],\n 'split1_test_score' : [0.82, 0.50, 0.70],\n 'mean_test_score' : [0.81, 0.67, 0.70],\n 'std_test_score' : [0.01, 0.24, 0.00],\n 'rank_test_score' : [1, 3, 2],\n 'split0_train_score' : [0.80, 0.92, 0.70],\n 'split1_train_score' : [0.82, 0.55, 0.70],\n 'mean_train_score' : [0.81, 0.74, 0.70],\n 'std_train_score' : [0.01, 0.19, 0.00],\n 'mean_fit_time' : [0.73, 0.63, 0.43],\n 'std_fit_time' : [0.01, 0.02, 0.01],\n 'mean_score_time' : [0.01, 0.06, 0.04],\n 'std_score_time' : [0.00, 0.00, 0.00],\n 'params' : [{'kernel' : 'rbf', 'gamma' : 0.1}, ...],\n }\n\n NOTE\n\n The key ``'params'`` is used to store a list of parameter\n settings dicts for all the parameter candidates.\n\n The ``mean_fit_time``, ``std_fit_time``, ``mean_score_time`` and\n ``std_score_time`` are all in seconds.\n\n For multi-metric evaluation, the scores for all the scorers are\n available in the ``cv_results_`` dict at the keys ending with that\n scorer's name (``'_'``) instead of ``'_score'`` shown\n above. ('split0_test_precision', 'mean_train_precision' etc.)\n\n best_estimator_ : estimator\n Estimator that was chosen by the search, i.e. estimator\n which gave highest score (or smallest loss if specified)\n on the left out data. Not available if ``refit=False``.\n\n For multi-metric evaluation, this attribute is present only if\n ``refit`` is specified.\n\n See ``refit`` parameter for more information on allowed values.\n\n best_score_ : float\n Mean cross-validated score of the best_estimator.\n\n For multi-metric evaluation, this is not available if ``refit`` is\n ``False``. See ``refit`` parameter for more information.\n\n This attribute is not available if ``refit`` is a function.\n\n best_params_ : dict\n Parameter setting that gave the best results on the hold out data.\n\n For multi-metric evaluation, this is not available if ``refit`` is\n ``False``. See ``refit`` parameter for more information.\n\n best_index_ : int\n The index (of the ``cv_results_`` arrays) which corresponds to the best\n candidate parameter setting.\n\n The dict at ``search.cv_results_['params'][search.best_index_]`` gives\n the parameter setting for the best model, that gives the highest\n mean score (``search.best_score_``).\n\n For multi-metric evaluation, this is not available if ``refit`` is\n ``False``. 
See ``refit`` parameter for more information.\n\n scorer_ : function or a dict\n Scorer function used on the held out data to choose the best\n parameters for the model.\n\n For multi-metric evaluation, this attribute holds the validated\n ``scoring`` dict which maps the scorer key to the scorer callable.\n\n n_splits_ : int\n The number of cross-validation splits (folds/iterations).\n\n refit_time_ : float\n Seconds used for refitting the best model on the whole dataset.\n\n This is present only if ``refit`` is not False.\n\n .. versionadded:: 0.20\n\n multimetric_ : bool\n Whether or not the scorers compute several metrics.\n\n classes_ : ndarray of shape (n_classes,)\n The classes labels. This is present only if ``refit`` is specified and\n the underlying estimator is a classifier.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`. Only defined if\n `best_estimator_` is defined (see the documentation for the `refit`\n parameter for more details) and that `best_estimator_` exposes\n `n_features_in_` when fit.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Only defined if\n `best_estimator_` is defined (see the documentation for the `refit`\n parameter for more details) and that `best_estimator_` exposes\n `feature_names_in_` when fit.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n GridSearchCV : Does exhaustive search over a grid of parameters.\n ParameterSampler : A generator over parameter settings, constructed from\n param_distributions.\n\n Notes\n -----\n The parameters selected are those that maximize the score of the held-out\n data, according to the scoring parameter.\n\n If `n_jobs` was set to a value higher than one, the data is copied for each\n parameter setting(and not `n_jobs` times). This is done for efficiency\n reasons if individual jobs take very little time, but may raise errors if\n the dataset is large and not enough memory is available. A workaround in\n this case is to set `pre_dispatch`. Then, the memory is copied only\n `pre_dispatch` many times. A reasonable value for `pre_dispatch` is `2 *\n n_jobs`.\n\n Examples\n --------\n >>> from sklearn.datasets import load_iris\n >>> from sklearn.linear_model import LogisticRegression\n >>> from sklearn.model_selection import RandomizedSearchCV\n >>> from scipy.stats import uniform\n >>> iris = load_iris()\n >>> logistic = LogisticRegression(solver='saga', tol=1e-2, max_iter=200,\n ... random_state=0)\n >>> distributions = dict(C=uniform(loc=0, scale=4),\n ... 
penalty=['l2', 'l1'])\n >>> clf = RandomizedSearchCV(logistic, distributions, random_state=0)\n >>> search = clf.fit(iris.data, iris.target)\n >>> search.best_params_\n {'C': 2..., 'penalty': 'l1'}\n \"\"\"\n _required_parameters = ['estimator', 'param_distributions']\n \n def __init__(self, estimator, param_distributions, *, n_iter=10, scoring=None, n_jobs=None, refit=True, cv=None, verbose=0, pre_dispatch='2*n_jobs', random_state=None, error_score=np.nan, return_train_score=False):\n self.param_distributions = param_distributions\n self.n_iter = n_iter\n self.random_state = random_state\n super().__init__(estimator=estimator, scoring=scoring, n_jobs=n_jobs, refit=refit, cv=cv, verbose=verbose, pre_dispatch=pre_dispatch, error_score=error_score, return_train_score=return_train_score)\n \n def _run_search(self, evaluate_candidates):\n \"\"\"Search n_iter candidates from param_distributions\"\"\"\n evaluate_candidates(ParameterSampler(self.param_distributions, self.n_iter, random_state=self.random_state))\n" }, @@ -24650,7 +24732,7 @@ "sklearn.model_selection._search_successive_halving.BaseSuccessiveHalving._more_tags" ], "is_public": false, - "description": "Implements successive halving.\n\nRef: Almost optimal exploration in multi-armed bandits, ICML 13 Zohar Karnin, Tomer Koren, Oren Somekh", + "description": "Implements successive halving.\n\nRef:\nAlmost optimal exploration in multi-armed bandits, ICML 13\nZohar Karnin, Tomer Koren, Oren Somekh", "docstring": "Implements successive halving.\n\n Ref:\n Almost optimal exploration in multi-armed bandits, ICML 13\n Zohar Karnin, Tomer Koren, Oren Somekh\n ", "source_code": "\n\nclass BaseSuccessiveHalving(BaseSearchCV):\n \"\"\"Implements successive halving.\n\n Ref:\n Almost optimal exploration in multi-armed bandits, ICML 13\n Zohar Karnin, Tomer Koren, Oren Somekh\n \"\"\"\n \n def __init__(self, estimator, *, scoring=None, n_jobs=None, refit=True, cv=5, verbose=0, random_state=None, error_score=np.nan, return_train_score=True, max_resources='auto', min_resources='exhaust', resource='n_samples', factor=3, aggressive_elimination=False):\n super().__init__(estimator, scoring=scoring, n_jobs=n_jobs, refit=refit, cv=cv, verbose=verbose, error_score=error_score, return_train_score=return_train_score)\n self.random_state = random_state\n self.max_resources = max_resources\n self.resource = resource\n self.factor = factor\n self.min_resources = min_resources\n self.aggressive_elimination = aggressive_elimination\n \n def _check_input_parameters(self, X, y, groups):\n if self.scoring is not None and not (isinstance(self.scoring, str) or callable(self.scoring)):\n raise ValueError('scoring parameter must be a string, a callable or None. Multimetric scoring is not supported.')\n if not _yields_constant_splits(self._checked_cv_orig):\n raise ValueError('The cv parameter must yield consistent folds across calls to split(). 
Set its random_state to an int, or set shuffle=False.')\n if self.resource != 'n_samples' and self.resource not in self.estimator.get_params():\n raise ValueError(f'Cannot use resource={self.resource} which is not supported by estimator {self.estimator.__class__.__name__}')\n if isinstance(self.max_resources, str) and self.max_resources != 'auto':\n raise ValueError(\"max_resources must be either 'auto' or a positive integer\")\n if self.max_resources != 'auto' and (not isinstance(self.max_resources, Integral) or self.max_resources <= 0):\n raise ValueError(\"max_resources must be either 'auto' or a positive integer\")\n if self.min_resources not in ('smallest', 'exhaust') and (not isinstance(self.min_resources, Integral) or self.min_resources <= 0):\n raise ValueError(\"min_resources must be either 'smallest', 'exhaust', or a positive integer no greater than max_resources.\")\n if isinstance(self, HalvingRandomSearchCV):\n if self.min_resources == self.n_candidates == 'exhaust':\n raise ValueError(\"n_candidates and min_resources cannot be both set to 'exhaust'.\")\n if self.n_candidates != 'exhaust' and (not isinstance(self.n_candidates, Integral) or self.n_candidates <= 0):\n raise ValueError(\"n_candidates must be either 'exhaust' or a positive integer\")\n self.min_resources_ = self.min_resources\n if self.min_resources_ in ('smallest', 'exhaust'):\n if self.resource == 'n_samples':\n n_splits = self._checked_cv_orig.get_n_splits(X, y, groups)\n magic_factor = 2\n self.min_resources_ = n_splits * magic_factor\n if is_classifier(self.estimator):\n y = self._validate_data(X='no_validation', y=y)\n check_classification_targets(y)\n n_classes = np.unique(y).shape[0]\n self.min_resources_ *= n_classes\n else:\n self.min_resources_ = 1\n self.max_resources_ = self.max_resources\n if self.max_resources_ == 'auto':\n if not self.resource == 'n_samples':\n raise ValueError(\"max_resources can only be 'auto' if resource='n_samples'\")\n self.max_resources_ = _num_samples(X)\n if self.min_resources_ > self.max_resources_:\n raise ValueError(f'min_resources_={self.min_resources_} is greater than max_resources_={self.max_resources_}.')\n if self.min_resources_ == 0:\n raise ValueError(f'min_resources_={self.min_resources_}: you might have passed an empty dataset X.')\n if not isinstance(self.refit, bool):\n raise ValueError(f'refit is expected to be a boolean. Got {type(self.refit)} instead.')\n \n @staticmethod\n def _select_best_index(refit, refit_metric, results):\n \"\"\"Custom refit callable to return the index of the best candidate.\n\n We want the best candidate out of the last iteration. 
By default\n BaseSearchCV would return the best candidate out of all iterations.\n\n Currently, we only support for a single metric thus `refit` and\n `refit_metric` are not required.\n \"\"\"\n last_iter = np.max(results['iter'])\n last_iter_indices = np.flatnonzero(results['iter'] == last_iter)\n best_idx = np.argmax(results['mean_test_score'][last_iter_indices])\n return last_iter_indices[best_idx]\n \n def fit(self, X, y=None, groups=None, **fit_params):\n \"\"\"Run fit with all sets of parameters.\n\n Parameters\n ----------\n\n X : array-like, shape (n_samples, n_features)\n Training vector, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\n y : array-like, shape (n_samples,) or (n_samples, n_output), optional\n Target relative to X for classification or regression;\n None for unsupervised learning.\n\n groups : array-like of shape (n_samples,), default=None\n Group labels for the samples used while splitting the dataset into\n train/test set. Only used in conjunction with a \"Group\" :term:`cv`\n instance (e.g., :class:`~sklearn.model_selection.GroupKFold`).\n\n **fit_params : dict of string -> object\n Parameters passed to the ``fit`` method of the estimator.\n\n Returns\n -------\n self : object\n Instance of fitted estimator.\n \"\"\"\n self._checked_cv_orig = check_cv(self.cv, y, classifier=is_classifier(self.estimator))\n self._check_input_parameters(X=X, y=y, groups=groups)\n self._n_samples_orig = _num_samples(X)\n super().fit(X, y=y, groups=groups, **fit_params)\n self.best_score_ = self.cv_results_['mean_test_score'][self.best_index_]\n return self\n \n def _run_search(self, evaluate_candidates):\n candidate_params = self._generate_candidate_params()\n if self.resource != 'n_samples' and any((self.resource in candidate for candidate in candidate_params)):\n raise ValueError(f'Cannot use parameter {self.resource} as the resource since it is part of the searched parameters.')\n n_required_iterations = 1 + floor(log(len(candidate_params), self.factor))\n if self.min_resources == 'exhaust':\n last_iteration = n_required_iterations - 1\n self.min_resources_ = max(self.min_resources_, self.max_resources_ // self.factor**last_iteration)\n n_possible_iterations = 1 + floor(log(self.max_resources_ // self.min_resources_, self.factor))\n if self.aggressive_elimination:\n n_iterations = n_required_iterations\n else:\n n_iterations = min(n_possible_iterations, n_required_iterations)\n if self.verbose:\n print(f'n_iterations: {n_iterations}')\n print(f'n_required_iterations: {n_required_iterations}')\n print(f'n_possible_iterations: {n_possible_iterations}')\n print(f'min_resources_: {self.min_resources_}')\n print(f'max_resources_: {self.max_resources_}')\n print(f'aggressive_elimination: {self.aggressive_elimination}')\n print(f'factor: {self.factor}')\n self.n_resources_ = []\n self.n_candidates_ = []\n for itr in range(n_iterations):\n power = itr\n if self.aggressive_elimination:\n power = max(0, itr - n_required_iterations + n_possible_iterations)\n n_resources = int(self.factor**power * self.min_resources_)\n n_resources = min(n_resources, self.max_resources_)\n self.n_resources_.append(n_resources)\n n_candidates = len(candidate_params)\n self.n_candidates_.append(n_candidates)\n if self.verbose:\n print('-' * 10)\n print(f'iter: {itr}')\n print(f'n_candidates: {n_candidates}')\n print(f'n_resources: {n_resources}')\n if self.resource == 'n_samples':\n cv = _SubsampleMetaSplitter(base_cv=self._checked_cv_orig, fraction=n_resources / 
self._n_samples_orig, subsample_test=True, random_state=self.random_state)\n else:\n candidate_params = [c.copy() for c in candidate_params]\n for candidate in candidate_params:\n candidate[self.resource] = n_resources\n cv = self._checked_cv_orig\n more_results = {'iter': [itr] * n_candidates, 'n_resources': [n_resources] * n_candidates}\n results = evaluate_candidates(candidate_params, cv, more_results=more_results)\n n_candidates_to_keep = ceil(n_candidates / self.factor)\n candidate_params = _top_k(results, n_candidates_to_keep, itr)\n self.n_remaining_candidates_ = len(candidate_params)\n self.n_required_iterations_ = n_required_iterations\n self.n_possible_iterations_ = n_possible_iterations\n self.n_iterations_ = n_iterations\n \n @abstractmethod\n def _generate_candidate_params(self):\n pass\n \n def _more_tags(self):\n tags = deepcopy(super()._more_tags())\n tags['_xfail_checks'].update({'check_fit2d_1sample': 'Fail during parameter check since min/max resources requires more samples'})\n return tags\n" }, @@ -24664,7 +24746,7 @@ "sklearn.model_selection._search_successive_halving.HalvingGridSearchCV._generate_candidate_params" ], "is_public": true, - "description": "Search over specified parameter values with successive halving.\n\nThe search strategy starts evaluating all the candidates with a small amount of resources and iteratively selects the best candidates, using more and more resources. Read more in the :ref:`User guide `. .. note:: This estimator is still **experimental** for now: the predictions and the API might change without any deprecation cycle. To use it, you need to explicitly import ``enable_halving_search_cv``:: >>> # explicitly require this experimental feature >>> from sklearn.experimental import enable_halving_search_cv # noqa >>> # now you can import normally from model_selection >>> from sklearn.model_selection import HalvingGridSearchCV", + "description": "Search over specified parameter values with successive halving.\n\nThe search strategy starts evaluating all the candidates with a small\namount of resources and iteratively selects the best candidates, using\nmore and more resources.\n\nRead more in the :ref:`User guide `.\n\n.. note::\n\n This estimator is still **experimental** for now: the predictions\n and the API might change without any deprecation cycle. To use it,\n you need to explicitly import ``enable_halving_search_cv``::\n\n >>> # explicitly require this experimental feature\n >>> from sklearn.experimental import enable_halving_search_cv # noqa\n >>> # now you can import normally from model_selection\n >>> from sklearn.model_selection import HalvingGridSearchCV", "docstring": "Search over specified parameter values with successive halving.\n\n The search strategy starts evaluating all the candidates with a small\n amount of resources and iteratively selects the best candidates, using\n more and more resources.\n\n Read more in the :ref:`User guide `.\n\n .. note::\n\n This estimator is still **experimental** for now: the predictions\n and the API might change without any deprecation cycle. 
To use it,\n you need to explicitly import ``enable_halving_search_cv``::\n\n >>> # explicitly require this experimental feature\n >>> from sklearn.experimental import enable_halving_search_cv # noqa\n >>> # now you can import normally from model_selection\n >>> from sklearn.model_selection import HalvingGridSearchCV\n\n Parameters\n ----------\n estimator : estimator object\n This is assumed to implement the scikit-learn estimator interface.\n Either estimator needs to provide a ``score`` function,\n or ``scoring`` must be passed.\n\n param_grid : dict or list of dictionaries\n Dictionary with parameters names (string) as keys and lists of\n parameter settings to try as values, or a list of such\n dictionaries, in which case the grids spanned by each dictionary\n in the list are explored. This enables searching over any sequence\n of parameter settings.\n\n factor : int or float, default=3\n The 'halving' parameter, which determines the proportion of candidates\n that are selected for each subsequent iteration. For example,\n ``factor=3`` means that only one third of the candidates are selected.\n\n resource : ``'n_samples'`` or str, default='n_samples'\n Defines the resource that increases with each iteration. By default,\n the resource is the number of samples. It can also be set to any\n parameter of the base estimator that accepts positive integer\n values, e.g. 'n_iterations' or 'n_estimators' for a gradient\n boosting estimator. In this case ``max_resources`` cannot be 'auto'\n and must be set explicitly.\n\n max_resources : int, default='auto'\n The maximum amount of resource that any candidate is allowed to use\n for a given iteration. By default, this is set to ``n_samples`` when\n ``resource='n_samples'`` (default), else an error is raised.\n\n min_resources : {'exhaust', 'smallest'} or int, default='exhaust'\n The minimum amount of resource that any candidate is allowed to use\n for a given iteration. Equivalently, this defines the amount of\n resources `r0` that are allocated for each candidate at the first\n iteration.\n\n - 'smallest' is a heuristic that sets `r0` to a small value:\n\n - ``n_splits * 2`` when ``resource='n_samples'`` for a regression\n problem\n - ``n_classes * n_splits * 2`` when ``resource='n_samples'`` for a\n classification problem\n - ``1`` when ``resource != 'n_samples'``\n\n - 'exhaust' will set `r0` such that the **last** iteration uses as\n much resources as possible. Namely, the last iteration will use the\n highest value smaller than ``max_resources`` that is a multiple of\n both ``min_resources`` and ``factor``. In general, using 'exhaust'\n leads to a more accurate estimator, but is slightly more time\n consuming.\n\n Note that the amount of resources used at each iteration is always a\n multiple of ``min_resources``.\n\n aggressive_elimination : bool, default=False\n This is only relevant in cases where there isn't enough resources to\n reduce the remaining candidates to at most `factor` after the last\n iteration. If ``True``, then the search process will 'replay' the\n first iteration for as long as needed until the number of candidates\n is small enough. This is ``False`` by default, which means that the\n last iteration may evaluate more than ``factor`` candidates. 
See\n :ref:`aggressive_elimination` for more details.\n\n cv : int, cross-validation generator or iterable, default=5\n Determines the cross-validation splitting strategy.\n Possible inputs for cv are:\n\n - integer, to specify the number of folds in a `(Stratified)KFold`,\n - :term:`CV splitter`,\n - An iterable yielding (train, test) splits as arrays of indices.\n\n For integer/None inputs, if the estimator is a classifier and ``y`` is\n either binary or multiclass, :class:`StratifiedKFold` is used. In all\n other cases, :class:`KFold` is used. These splitters are instantiated\n with `shuffle=False` so the splits will be the same across calls.\n\n Refer :ref:`User Guide ` for the various\n cross-validation strategies that can be used here.\n\n .. note::\n Due to implementation details, the folds produced by `cv` must be\n the same across multiple calls to `cv.split()`. For\n built-in `scikit-learn` iterators, this can be achieved by\n deactivating shuffling (`shuffle=False`), or by setting the\n `cv`'s `random_state` parameter to an integer.\n\n scoring : str, callable, or None, default=None\n A single string (see :ref:`scoring_parameter`) or a callable\n (see :ref:`scoring`) to evaluate the predictions on the test set.\n If None, the estimator's score method is used.\n\n refit : bool, default=True\n If True, refit an estimator using the best found parameters on the\n whole dataset.\n\n The refitted estimator is made available at the ``best_estimator_``\n attribute and permits using ``predict`` directly on this\n ``HalvingGridSearchCV`` instance.\n\n error_score : 'raise' or numeric\n Value to assign to the score if an error occurs in estimator fitting.\n If set to 'raise', the error is raised. If a numeric value is given,\n FitFailedWarning is raised. This parameter does not affect the refit\n step, which will always raise the error. Default is ``np.nan``.\n\n return_train_score : bool, default=False\n If ``False``, the ``cv_results_`` attribute will not include training\n scores.\n Computing training scores is used to get insights on how different\n parameter settings impact the overfitting/underfitting trade-off.\n However computing the scores on the training set can be computationally\n expensive and is not strictly required to select the parameters that\n yield the best generalization performance.\n\n random_state : int, RandomState instance or None, default=None\n Pseudo random number generator state used for subsampling the dataset\n when `resources != 'n_samples'`. Ignored otherwise.\n Pass an int for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n n_jobs : int or None, default=None\n Number of jobs to run in parallel.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n verbose : int\n Controls the verbosity: the higher, the more messages.\n\n Attributes\n ----------\n n_resources_ : list of int\n The amount of resources used at each iteration.\n\n n_candidates_ : list of int\n The number of candidate parameters that were evaluated at each\n iteration.\n\n n_remaining_candidates_ : int\n The number of candidate parameters that are left after the last\n iteration. It corresponds to `ceil(n_candidates[-1] / factor)`\n\n max_resources_ : int\n The maximum number of resources that any candidate is allowed to use\n for a given iteration. 
Note that since the number of resources used\n at each iteration must be a multiple of ``min_resources_``, the\n actual number of resources used at the last iteration may be smaller\n than ``max_resources_``.\n\n min_resources_ : int\n The amount of resources that are allocated for each candidate at the\n first iteration.\n\n n_iterations_ : int\n The actual number of iterations that were run. This is equal to\n ``n_required_iterations_`` if ``aggressive_elimination`` is ``True``.\n Else, this is equal to ``min(n_possible_iterations_,\n n_required_iterations_)``.\n\n n_possible_iterations_ : int\n The number of iterations that are possible starting with\n ``min_resources_`` resources and without exceeding\n ``max_resources_``.\n\n n_required_iterations_ : int\n The number of iterations that are required to end up with less than\n ``factor`` candidates at the last iteration, starting with\n ``min_resources_`` resources. This will be smaller than\n ``n_possible_iterations_`` when there isn't enough resources.\n\n cv_results_ : dict of numpy (masked) ndarrays\n A dict with keys as column headers and values as columns, that can be\n imported into a pandas ``DataFrame``. It contains lots of information\n for analysing the results of a search.\n Please refer to the :ref:`User guide`\n for details.\n\n best_estimator_ : estimator or dict\n Estimator that was chosen by the search, i.e. estimator\n which gave highest score (or smallest loss if specified)\n on the left out data. Not available if ``refit=False``.\n\n best_score_ : float\n Mean cross-validated score of the best_estimator.\n\n best_params_ : dict\n Parameter setting that gave the best results on the hold out data.\n\n best_index_ : int\n The index (of the ``cv_results_`` arrays) which corresponds to the best\n candidate parameter setting.\n\n The dict at ``search.cv_results_['params'][search.best_index_]`` gives\n the parameter setting for the best model, that gives the highest\n mean score (``search.best_score_``).\n\n scorer_ : function or a dict\n Scorer function used on the held out data to choose the best\n parameters for the model.\n\n n_splits_ : int\n The number of cross-validation splits (folds/iterations).\n\n refit_time_ : float\n Seconds used for refitting the best model on the whole dataset.\n\n This is present only if ``refit`` is not False.\n\n multimetric_ : bool\n Whether or not the scorers compute several metrics.\n\n classes_ : ndarray of shape (n_classes,)\n The classes labels. This is present only if ``refit`` is specified and\n the underlying estimator is a classifier.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`. Only defined if\n `best_estimator_` is defined (see the documentation for the `refit`\n parameter for more details) and that `best_estimator_` exposes\n `n_features_in_` when fit.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Only defined if\n `best_estimator_` is defined (see the documentation for the `refit`\n parameter for more details) and that `best_estimator_` exposes\n `feature_names_in_` when fit.\n\n .. 
versionadded:: 1.0\n\n See Also\n --------\n :class:`HalvingRandomSearchCV`:\n Random search over a set of parameters using successive halving.\n\n Notes\n -----\n The parameters selected are those that maximize the score of the held-out\n data, according to the scoring parameter.\n\n Examples\n --------\n\n >>> from sklearn.datasets import load_iris\n >>> from sklearn.ensemble import RandomForestClassifier\n >>> from sklearn.experimental import enable_halving_search_cv # noqa\n >>> from sklearn.model_selection import HalvingGridSearchCV\n ...\n >>> X, y = load_iris(return_X_y=True)\n >>> clf = RandomForestClassifier(random_state=0)\n ...\n >>> param_grid = {\"max_depth\": [3, None],\n ... \"min_samples_split\": [5, 10]}\n >>> search = HalvingGridSearchCV(clf, param_grid, resource='n_estimators',\n ... max_resources=10,\n ... random_state=0).fit(X, y)\n >>> search.best_params_ # doctest: +SKIP\n {'max_depth': None, 'min_samples_split': 10, 'n_estimators': 9}\n ", "source_code": "\n\nclass HalvingGridSearchCV(BaseSuccessiveHalving):\n \"\"\"Search over specified parameter values with successive halving.\n\n The search strategy starts evaluating all the candidates with a small\n amount of resources and iteratively selects the best candidates, using\n more and more resources.\n\n Read more in the :ref:`User guide `.\n\n .. note::\n\n This estimator is still **experimental** for now: the predictions\n and the API might change without any deprecation cycle. To use it,\n you need to explicitly import ``enable_halving_search_cv``::\n\n >>> # explicitly require this experimental feature\n >>> from sklearn.experimental import enable_halving_search_cv # noqa\n >>> # now you can import normally from model_selection\n >>> from sklearn.model_selection import HalvingGridSearchCV\n\n Parameters\n ----------\n estimator : estimator object\n This is assumed to implement the scikit-learn estimator interface.\n Either estimator needs to provide a ``score`` function,\n or ``scoring`` must be passed.\n\n param_grid : dict or list of dictionaries\n Dictionary with parameters names (string) as keys and lists of\n parameter settings to try as values, or a list of such\n dictionaries, in which case the grids spanned by each dictionary\n in the list are explored. This enables searching over any sequence\n of parameter settings.\n\n factor : int or float, default=3\n The 'halving' parameter, which determines the proportion of candidates\n that are selected for each subsequent iteration. For example,\n ``factor=3`` means that only one third of the candidates are selected.\n\n resource : ``'n_samples'`` or str, default='n_samples'\n Defines the resource that increases with each iteration. By default,\n the resource is the number of samples. It can also be set to any\n parameter of the base estimator that accepts positive integer\n values, e.g. 'n_iterations' or 'n_estimators' for a gradient\n boosting estimator. In this case ``max_resources`` cannot be 'auto'\n and must be set explicitly.\n\n max_resources : int, default='auto'\n The maximum amount of resource that any candidate is allowed to use\n for a given iteration. By default, this is set to ``n_samples`` when\n ``resource='n_samples'`` (default), else an error is raised.\n\n min_resources : {'exhaust', 'smallest'} or int, default='exhaust'\n The minimum amount of resource that any candidate is allowed to use\n for a given iteration. 
Equivalently, this defines the amount of\n resources `r0` that are allocated for each candidate at the first\n iteration.\n\n - 'smallest' is a heuristic that sets `r0` to a small value:\n\n - ``n_splits * 2`` when ``resource='n_samples'`` for a regression\n problem\n - ``n_classes * n_splits * 2`` when ``resource='n_samples'`` for a\n classification problem\n - ``1`` when ``resource != 'n_samples'``\n\n - 'exhaust' will set `r0` such that the **last** iteration uses as\n much resources as possible. Namely, the last iteration will use the\n highest value smaller than ``max_resources`` that is a multiple of\n both ``min_resources`` and ``factor``. In general, using 'exhaust'\n leads to a more accurate estimator, but is slightly more time\n consuming.\n\n Note that the amount of resources used at each iteration is always a\n multiple of ``min_resources``.\n\n aggressive_elimination : bool, default=False\n This is only relevant in cases where there isn't enough resources to\n reduce the remaining candidates to at most `factor` after the last\n iteration. If ``True``, then the search process will 'replay' the\n first iteration for as long as needed until the number of candidates\n is small enough. This is ``False`` by default, which means that the\n last iteration may evaluate more than ``factor`` candidates. See\n :ref:`aggressive_elimination` for more details.\n\n cv : int, cross-validation generator or iterable, default=5\n Determines the cross-validation splitting strategy.\n Possible inputs for cv are:\n\n - integer, to specify the number of folds in a `(Stratified)KFold`,\n - :term:`CV splitter`,\n - An iterable yielding (train, test) splits as arrays of indices.\n\n For integer/None inputs, if the estimator is a classifier and ``y`` is\n either binary or multiclass, :class:`StratifiedKFold` is used. In all\n other cases, :class:`KFold` is used. These splitters are instantiated\n with `shuffle=False` so the splits will be the same across calls.\n\n Refer :ref:`User Guide ` for the various\n cross-validation strategies that can be used here.\n\n .. note::\n Due to implementation details, the folds produced by `cv` must be\n the same across multiple calls to `cv.split()`. For\n built-in `scikit-learn` iterators, this can be achieved by\n deactivating shuffling (`shuffle=False`), or by setting the\n `cv`'s `random_state` parameter to an integer.\n\n scoring : str, callable, or None, default=None\n A single string (see :ref:`scoring_parameter`) or a callable\n (see :ref:`scoring`) to evaluate the predictions on the test set.\n If None, the estimator's score method is used.\n\n refit : bool, default=True\n If True, refit an estimator using the best found parameters on the\n whole dataset.\n\n The refitted estimator is made available at the ``best_estimator_``\n attribute and permits using ``predict`` directly on this\n ``HalvingGridSearchCV`` instance.\n\n error_score : 'raise' or numeric\n Value to assign to the score if an error occurs in estimator fitting.\n If set to 'raise', the error is raised. If a numeric value is given,\n FitFailedWarning is raised. This parameter does not affect the refit\n step, which will always raise the error. 
Default is ``np.nan``.\n\n return_train_score : bool, default=False\n If ``False``, the ``cv_results_`` attribute will not include training\n scores.\n Computing training scores is used to get insights on how different\n parameter settings impact the overfitting/underfitting trade-off.\n However computing the scores on the training set can be computationally\n expensive and is not strictly required to select the parameters that\n yield the best generalization performance.\n\n random_state : int, RandomState instance or None, default=None\n Pseudo random number generator state used for subsampling the dataset\n when `resources != 'n_samples'`. Ignored otherwise.\n Pass an int for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n n_jobs : int or None, default=None\n Number of jobs to run in parallel.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n verbose : int\n Controls the verbosity: the higher, the more messages.\n\n Attributes\n ----------\n n_resources_ : list of int\n The amount of resources used at each iteration.\n\n n_candidates_ : list of int\n The number of candidate parameters that were evaluated at each\n iteration.\n\n n_remaining_candidates_ : int\n The number of candidate parameters that are left after the last\n iteration. It corresponds to `ceil(n_candidates[-1] / factor)`\n\n max_resources_ : int\n The maximum number of resources that any candidate is allowed to use\n for a given iteration. Note that since the number of resources used\n at each iteration must be a multiple of ``min_resources_``, the\n actual number of resources used at the last iteration may be smaller\n than ``max_resources_``.\n\n min_resources_ : int\n The amount of resources that are allocated for each candidate at the\n first iteration.\n\n n_iterations_ : int\n The actual number of iterations that were run. This is equal to\n ``n_required_iterations_`` if ``aggressive_elimination`` is ``True``.\n Else, this is equal to ``min(n_possible_iterations_,\n n_required_iterations_)``.\n\n n_possible_iterations_ : int\n The number of iterations that are possible starting with\n ``min_resources_`` resources and without exceeding\n ``max_resources_``.\n\n n_required_iterations_ : int\n The number of iterations that are required to end up with less than\n ``factor`` candidates at the last iteration, starting with\n ``min_resources_`` resources. This will be smaller than\n ``n_possible_iterations_`` when there isn't enough resources.\n\n cv_results_ : dict of numpy (masked) ndarrays\n A dict with keys as column headers and values as columns, that can be\n imported into a pandas ``DataFrame``. It contains lots of information\n for analysing the results of a search.\n Please refer to the :ref:`User guide`\n for details.\n\n best_estimator_ : estimator or dict\n Estimator that was chosen by the search, i.e. estimator\n which gave highest score (or smallest loss if specified)\n on the left out data. 
Not available if ``refit=False``.\n\n best_score_ : float\n Mean cross-validated score of the best_estimator.\n\n best_params_ : dict\n Parameter setting that gave the best results on the hold out data.\n\n best_index_ : int\n The index (of the ``cv_results_`` arrays) which corresponds to the best\n candidate parameter setting.\n\n The dict at ``search.cv_results_['params'][search.best_index_]`` gives\n the parameter setting for the best model, that gives the highest\n mean score (``search.best_score_``).\n\n scorer_ : function or a dict\n Scorer function used on the held out data to choose the best\n parameters for the model.\n\n n_splits_ : int\n The number of cross-validation splits (folds/iterations).\n\n refit_time_ : float\n Seconds used for refitting the best model on the whole dataset.\n\n This is present only if ``refit`` is not False.\n\n multimetric_ : bool\n Whether or not the scorers compute several metrics.\n\n classes_ : ndarray of shape (n_classes,)\n The classes labels. This is present only if ``refit`` is specified and\n the underlying estimator is a classifier.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`. Only defined if\n `best_estimator_` is defined (see the documentation for the `refit`\n parameter for more details) and that `best_estimator_` exposes\n `n_features_in_` when fit.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Only defined if\n `best_estimator_` is defined (see the documentation for the `refit`\n parameter for more details) and that `best_estimator_` exposes\n `feature_names_in_` when fit.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n :class:`HalvingRandomSearchCV`:\n Random search over a set of parameters using successive halving.\n\n Notes\n -----\n The parameters selected are those that maximize the score of the held-out\n data, according to the scoring parameter.\n\n Examples\n --------\n\n >>> from sklearn.datasets import load_iris\n >>> from sklearn.ensemble import RandomForestClassifier\n >>> from sklearn.experimental import enable_halving_search_cv # noqa\n >>> from sklearn.model_selection import HalvingGridSearchCV\n ...\n >>> X, y = load_iris(return_X_y=True)\n >>> clf = RandomForestClassifier(random_state=0)\n ...\n >>> param_grid = {\"max_depth\": [3, None],\n ... \"min_samples_split\": [5, 10]}\n >>> search = HalvingGridSearchCV(clf, param_grid, resource='n_estimators',\n ... max_resources=10,\n ... 
random_state=0).fit(X, y)\n >>> search.best_params_ # doctest: +SKIP\n {'max_depth': None, 'min_samples_split': 10, 'n_estimators': 9}\n \"\"\"\n _required_parameters = ['estimator', 'param_grid']\n \n def __init__(self, estimator, param_grid, *, factor=3, resource='n_samples', max_resources='auto', min_resources='exhaust', aggressive_elimination=False, cv=5, scoring=None, refit=True, error_score=np.nan, return_train_score=True, random_state=None, n_jobs=None, verbose=0):\n super().__init__(estimator, scoring=scoring, n_jobs=n_jobs, refit=refit, verbose=verbose, cv=cv, random_state=random_state, error_score=error_score, return_train_score=return_train_score, max_resources=max_resources, resource=resource, factor=factor, min_resources=min_resources, aggressive_elimination=aggressive_elimination)\n self.param_grid = param_grid\n _check_param_grid(self.param_grid)\n \n def _generate_candidate_params(self):\n return ParameterGrid(self.param_grid)\n" }, @@ -24678,7 +24760,7 @@ "sklearn.model_selection._search_successive_halving.HalvingRandomSearchCV._generate_candidate_params" ], "is_public": true, - "description": "Randomized search on hyper parameters.\n\nThe search strategy starts evaluating all the candidates with a small amount of resources and iteratively selects the best candidates, using more and more resources. The candidates are sampled at random from the parameter space and the number of sampled candidates is determined by ``n_candidates``. Read more in the :ref:`User guide`. .. note:: This estimator is still **experimental** for now: the predictions and the API might change without any deprecation cycle. To use it, you need to explicitly import ``enable_halving_search_cv``:: >>> # explicitly require this experimental feature >>> from sklearn.experimental import enable_halving_search_cv # noqa >>> # now you can import normally from model_selection >>> from sklearn.model_selection import HalvingRandomSearchCV", + "description": "Randomized search on hyper parameters.\n\nThe search strategy starts evaluating all the candidates with a small\namount of resources and iteratively selects the best candidates, using more\nand more resources.\n\nThe candidates are sampled at random from the parameter space and the\nnumber of sampled candidates is determined by ``n_candidates``.\n\nRead more in the :ref:`User guide`.\n\n.. note::\n\n This estimator is still **experimental** for now: the predictions\n and the API might change without any deprecation cycle. To use it,\n you need to explicitly import ``enable_halving_search_cv``::\n\n >>> # explicitly require this experimental feature\n >>> from sklearn.experimental import enable_halving_search_cv # noqa\n >>> # now you can import normally from model_selection\n >>> from sklearn.model_selection import HalvingRandomSearchCV", "docstring": "Randomized search on hyper parameters.\n\n The search strategy starts evaluating all the candidates with a small\n amount of resources and iteratively selects the best candidates, using more\n and more resources.\n\n The candidates are sampled at random from the parameter space and the\n number of sampled candidates is determined by ``n_candidates``.\n\n Read more in the :ref:`User guide`.\n\n .. note::\n\n This estimator is still **experimental** for now: the predictions\n and the API might change without any deprecation cycle. 
To use it,\n you need to explicitly import ``enable_halving_search_cv``::\n\n >>> # explicitly require this experimental feature\n >>> from sklearn.experimental import enable_halving_search_cv # noqa\n >>> # now you can import normally from model_selection\n >>> from sklearn.model_selection import HalvingRandomSearchCV\n\n Parameters\n ----------\n estimator : estimator object\n This is assumed to implement the scikit-learn estimator interface.\n Either estimator needs to provide a ``score`` function,\n or ``scoring`` must be passed.\n\n param_distributions : dict\n Dictionary with parameters names (string) as keys and distributions\n or lists of parameters to try. Distributions must provide a ``rvs``\n method for sampling (such as those from scipy.stats.distributions).\n If a list is given, it is sampled uniformly.\n\n n_candidates : int, default='exhaust'\n The number of candidate parameters to sample, at the first\n iteration. Using 'exhaust' will sample enough candidates so that the\n last iteration uses as many resources as possible, based on\n `min_resources`, `max_resources` and `factor`. In this case,\n `min_resources` cannot be 'exhaust'.\n\n factor : int or float, default=3\n The 'halving' parameter, which determines the proportion of candidates\n that are selected for each subsequent iteration. For example,\n ``factor=3`` means that only one third of the candidates are selected.\n\n resource : ``'n_samples'`` or str, default='n_samples'\n Defines the resource that increases with each iteration. By default,\n the resource is the number of samples. It can also be set to any\n parameter of the base estimator that accepts positive integer\n values, e.g. 'n_iterations' or 'n_estimators' for a gradient\n boosting estimator. In this case ``max_resources`` cannot be 'auto'\n and must be set explicitly.\n\n max_resources : int, default='auto'\n The maximum number of resources that any candidate is allowed to use\n for a given iteration. By default, this is set ``n_samples`` when\n ``resource='n_samples'`` (default), else an error is raised.\n\n min_resources : {'exhaust', 'smallest'} or int, default='smallest'\n The minimum amount of resource that any candidate is allowed to use\n for a given iteration. Equivalently, this defines the amount of\n resources `r0` that are allocated for each candidate at the first\n iteration.\n\n - 'smallest' is a heuristic that sets `r0` to a small value:\n\n - ``n_splits * 2`` when ``resource='n_samples'`` for a regression\n problem\n - ``n_classes * n_splits * 2`` when ``resource='n_samples'`` for a\n classification problem\n - ``1`` when ``resource != 'n_samples'``\n\n - 'exhaust' will set `r0` such that the **last** iteration uses as\n much resources as possible. Namely, the last iteration will use the\n highest value smaller than ``max_resources`` that is a multiple of\n both ``min_resources`` and ``factor``. In general, using 'exhaust'\n leads to a more accurate estimator, but is slightly more time\n consuming. 'exhaust' isn't available when `n_candidates='exhaust'`.\n\n Note that the amount of resources used at each iteration is always a\n multiple of ``min_resources``.\n\n aggressive_elimination : bool, default=False\n This is only relevant in cases where there isn't enough resources to\n reduce the remaining candidates to at most `factor` after the last\n iteration. If ``True``, then the search process will 'replay' the\n first iteration for as long as needed until the number of candidates\n is small enough. 
This is ``False`` by default, which means that the\n last iteration may evaluate more than ``factor`` candidates. See\n :ref:`aggressive_elimination` for more details.\n\n cv : int, cross-validation generator or an iterable, default=5\n Determines the cross-validation splitting strategy.\n Possible inputs for cv are:\n\n - integer, to specify the number of folds in a `(Stratified)KFold`,\n - :term:`CV splitter`,\n - An iterable yielding (train, test) splits as arrays of indices.\n\n For integer/None inputs, if the estimator is a classifier and ``y`` is\n either binary or multiclass, :class:`StratifiedKFold` is used. In all\n other cases, :class:`KFold` is used. These splitters are instantiated\n with `shuffle=False` so the splits will be the same across calls.\n\n Refer :ref:`User Guide ` for the various\n cross-validation strategies that can be used here.\n\n .. note::\n Due to implementation details, the folds produced by `cv` must be\n the same across multiple calls to `cv.split()`. For\n built-in `scikit-learn` iterators, this can be achieved by\n deactivating shuffling (`shuffle=False`), or by setting the\n `cv`'s `random_state` parameter to an integer.\n\n scoring : str, callable, or None, default=None\n A single string (see :ref:`scoring_parameter`) or a callable\n (see :ref:`scoring`) to evaluate the predictions on the test set.\n If None, the estimator's score method is used.\n\n refit : bool, default=True\n If True, refit an estimator using the best found parameters on the\n whole dataset.\n\n The refitted estimator is made available at the ``best_estimator_``\n attribute and permits using ``predict`` directly on this\n ``HalvingRandomSearchCV`` instance.\n\n error_score : 'raise' or numeric\n Value to assign to the score if an error occurs in estimator fitting.\n If set to 'raise', the error is raised. If a numeric value is given,\n FitFailedWarning is raised. This parameter does not affect the refit\n step, which will always raise the error. Default is ``np.nan``.\n\n return_train_score : bool, default=False\n If ``False``, the ``cv_results_`` attribute will not include training\n scores.\n Computing training scores is used to get insights on how different\n parameter settings impact the overfitting/underfitting trade-off.\n However computing the scores on the training set can be computationally\n expensive and is not strictly required to select the parameters that\n yield the best generalization performance.\n\n random_state : int, RandomState instance or None, default=None\n Pseudo random number generator state used for subsampling the dataset\n when `resources != 'n_samples'`. Also used for random uniform\n sampling from lists of possible values instead of scipy.stats\n distributions.\n Pass an int for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n n_jobs : int or None, default=None\n Number of jobs to run in parallel.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n verbose : int\n Controls the verbosity: the higher, the more messages.\n\n Attributes\n ----------\n n_resources_ : list of int\n The amount of resources used at each iteration.\n\n n_candidates_ : list of int\n The number of candidate parameters that were evaluated at each\n iteration.\n\n n_remaining_candidates_ : int\n The number of candidate parameters that are left after the last\n iteration. 
It corresponds to `ceil(n_candidates[-1] / factor)`\n\n max_resources_ : int\n The maximum number of resources that any candidate is allowed to use\n for a given iteration. Note that since the number of resources used at\n each iteration must be a multiple of ``min_resources_``, the actual\n number of resources used at the last iteration may be smaller than\n ``max_resources_``.\n\n min_resources_ : int\n The amount of resources that are allocated for each candidate at the\n first iteration.\n\n n_iterations_ : int\n The actual number of iterations that were run. This is equal to\n ``n_required_iterations_`` if ``aggressive_elimination`` is ``True``.\n Else, this is equal to ``min(n_possible_iterations_,\n n_required_iterations_)``.\n\n n_possible_iterations_ : int\n The number of iterations that are possible starting with\n ``min_resources_`` resources and without exceeding\n ``max_resources_``.\n\n n_required_iterations_ : int\n The number of iterations that are required to end up with less than\n ``factor`` candidates at the last iteration, starting with\n ``min_resources_`` resources. This will be smaller than\n ``n_possible_iterations_`` when there isn't enough resources.\n\n cv_results_ : dict of numpy (masked) ndarrays\n A dict with keys as column headers and values as columns, that can be\n imported into a pandas ``DataFrame``. It contains lots of information\n for analysing the results of a search.\n Please refer to the :ref:`User guide`\n for details.\n\n best_estimator_ : estimator or dict\n Estimator that was chosen by the search, i.e. estimator\n which gave highest score (or smallest loss if specified)\n on the left out data. Not available if ``refit=False``.\n\n best_score_ : float\n Mean cross-validated score of the best_estimator.\n\n best_params_ : dict\n Parameter setting that gave the best results on the hold out data.\n\n best_index_ : int\n The index (of the ``cv_results_`` arrays) which corresponds to the best\n candidate parameter setting.\n\n The dict at ``search.cv_results_['params'][search.best_index_]`` gives\n the parameter setting for the best model, that gives the highest\n mean score (``search.best_score_``).\n\n scorer_ : function or a dict\n Scorer function used on the held out data to choose the best\n parameters for the model.\n\n n_splits_ : int\n The number of cross-validation splits (folds/iterations).\n\n refit_time_ : float\n Seconds used for refitting the best model on the whole dataset.\n\n This is present only if ``refit`` is not False.\n\n multimetric_ : bool\n Whether or not the scorers compute several metrics.\n\n classes_ : ndarray of shape (n_classes,)\n The classes labels. This is present only if ``refit`` is specified and\n the underlying estimator is a classifier.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`. Only defined if\n `best_estimator_` is defined (see the documentation for the `refit`\n parameter for more details) and that `best_estimator_` exposes\n `n_features_in_` when fit.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Only defined if\n `best_estimator_` is defined (see the documentation for the `refit`\n parameter for more details) and that `best_estimator_` exposes\n `feature_names_in_` when fit.\n\n .. 
versionadded:: 1.0\n\n See Also\n --------\n :class:`HalvingGridSearchCV`:\n Search over a grid of parameters using successive halving.\n\n Notes\n -----\n The parameters selected are those that maximize the score of the held-out\n data, according to the scoring parameter.\n\n Examples\n --------\n\n >>> from sklearn.datasets import load_iris\n >>> from sklearn.ensemble import RandomForestClassifier\n >>> from sklearn.experimental import enable_halving_search_cv # noqa\n >>> from sklearn.model_selection import HalvingRandomSearchCV\n >>> from scipy.stats import randint\n >>> import numpy as np\n ...\n >>> X, y = load_iris(return_X_y=True)\n >>> clf = RandomForestClassifier(random_state=0)\n >>> np.random.seed(0)\n ...\n >>> param_distributions = {\"max_depth\": [3, None],\n ... \"min_samples_split\": randint(2, 11)}\n >>> search = HalvingRandomSearchCV(clf, param_distributions,\n ... resource='n_estimators',\n ... max_resources=10,\n ... random_state=0).fit(X, y)\n >>> search.best_params_ # doctest: +SKIP\n {'max_depth': None, 'min_samples_split': 10, 'n_estimators': 9}\n ", "source_code": "\n\nclass HalvingRandomSearchCV(BaseSuccessiveHalving):\n \"\"\"Randomized search on hyper parameters.\n\n The search strategy starts evaluating all the candidates with a small\n amount of resources and iteratively selects the best candidates, using more\n and more resources.\n\n The candidates are sampled at random from the parameter space and the\n number of sampled candidates is determined by ``n_candidates``.\n\n Read more in the :ref:`User guide`.\n\n .. note::\n\n This estimator is still **experimental** for now: the predictions\n and the API might change without any deprecation cycle. To use it,\n you need to explicitly import ``enable_halving_search_cv``::\n\n >>> # explicitly require this experimental feature\n >>> from sklearn.experimental import enable_halving_search_cv # noqa\n >>> # now you can import normally from model_selection\n >>> from sklearn.model_selection import HalvingRandomSearchCV\n\n Parameters\n ----------\n estimator : estimator object\n This is assumed to implement the scikit-learn estimator interface.\n Either estimator needs to provide a ``score`` function,\n or ``scoring`` must be passed.\n\n param_distributions : dict\n Dictionary with parameters names (string) as keys and distributions\n or lists of parameters to try. Distributions must provide a ``rvs``\n method for sampling (such as those from scipy.stats.distributions).\n If a list is given, it is sampled uniformly.\n\n n_candidates : int, default='exhaust'\n The number of candidate parameters to sample, at the first\n iteration. Using 'exhaust' will sample enough candidates so that the\n last iteration uses as many resources as possible, based on\n `min_resources`, `max_resources` and `factor`. In this case,\n `min_resources` cannot be 'exhaust'.\n\n factor : int or float, default=3\n The 'halving' parameter, which determines the proportion of candidates\n that are selected for each subsequent iteration. For example,\n ``factor=3`` means that only one third of the candidates are selected.\n\n resource : ``'n_samples'`` or str, default='n_samples'\n Defines the resource that increases with each iteration. By default,\n the resource is the number of samples. It can also be set to any\n parameter of the base estimator that accepts positive integer\n values, e.g. 'n_iterations' or 'n_estimators' for a gradient\n boosting estimator. 
In this case ``max_resources`` cannot be 'auto'\n and must be set explicitly.\n\n max_resources : int, default='auto'\n The maximum number of resources that any candidate is allowed to use\n for a given iteration. By default, this is set ``n_samples`` when\n ``resource='n_samples'`` (default), else an error is raised.\n\n min_resources : {'exhaust', 'smallest'} or int, default='smallest'\n The minimum amount of resource that any candidate is allowed to use\n for a given iteration. Equivalently, this defines the amount of\n resources `r0` that are allocated for each candidate at the first\n iteration.\n\n - 'smallest' is a heuristic that sets `r0` to a small value:\n\n - ``n_splits * 2`` when ``resource='n_samples'`` for a regression\n problem\n - ``n_classes * n_splits * 2`` when ``resource='n_samples'`` for a\n classification problem\n - ``1`` when ``resource != 'n_samples'``\n\n - 'exhaust' will set `r0` such that the **last** iteration uses as\n much resources as possible. Namely, the last iteration will use the\n highest value smaller than ``max_resources`` that is a multiple of\n both ``min_resources`` and ``factor``. In general, using 'exhaust'\n leads to a more accurate estimator, but is slightly more time\n consuming. 'exhaust' isn't available when `n_candidates='exhaust'`.\n\n Note that the amount of resources used at each iteration is always a\n multiple of ``min_resources``.\n\n aggressive_elimination : bool, default=False\n This is only relevant in cases where there isn't enough resources to\n reduce the remaining candidates to at most `factor` after the last\n iteration. If ``True``, then the search process will 'replay' the\n first iteration for as long as needed until the number of candidates\n is small enough. This is ``False`` by default, which means that the\n last iteration may evaluate more than ``factor`` candidates. See\n :ref:`aggressive_elimination` for more details.\n\n cv : int, cross-validation generator or an iterable, default=5\n Determines the cross-validation splitting strategy.\n Possible inputs for cv are:\n\n - integer, to specify the number of folds in a `(Stratified)KFold`,\n - :term:`CV splitter`,\n - An iterable yielding (train, test) splits as arrays of indices.\n\n For integer/None inputs, if the estimator is a classifier and ``y`` is\n either binary or multiclass, :class:`StratifiedKFold` is used. In all\n other cases, :class:`KFold` is used. These splitters are instantiated\n with `shuffle=False` so the splits will be the same across calls.\n\n Refer :ref:`User Guide ` for the various\n cross-validation strategies that can be used here.\n\n .. note::\n Due to implementation details, the folds produced by `cv` must be\n the same across multiple calls to `cv.split()`. 
For\n built-in `scikit-learn` iterators, this can be achieved by\n deactivating shuffling (`shuffle=False`), or by setting the\n `cv`'s `random_state` parameter to an integer.\n\n scoring : str, callable, or None, default=None\n A single string (see :ref:`scoring_parameter`) or a callable\n (see :ref:`scoring`) to evaluate the predictions on the test set.\n If None, the estimator's score method is used.\n\n refit : bool, default=True\n If True, refit an estimator using the best found parameters on the\n whole dataset.\n\n The refitted estimator is made available at the ``best_estimator_``\n attribute and permits using ``predict`` directly on this\n ``HalvingRandomSearchCV`` instance.\n\n error_score : 'raise' or numeric\n Value to assign to the score if an error occurs in estimator fitting.\n If set to 'raise', the error is raised. If a numeric value is given,\n FitFailedWarning is raised. This parameter does not affect the refit\n step, which will always raise the error. Default is ``np.nan``.\n\n return_train_score : bool, default=False\n If ``False``, the ``cv_results_`` attribute will not include training\n scores.\n Computing training scores is used to get insights on how different\n parameter settings impact the overfitting/underfitting trade-off.\n However computing the scores on the training set can be computationally\n expensive and is not strictly required to select the parameters that\n yield the best generalization performance.\n\n random_state : int, RandomState instance or None, default=None\n Pseudo random number generator state used for subsampling the dataset\n when `resources != 'n_samples'`. Also used for random uniform\n sampling from lists of possible values instead of scipy.stats\n distributions.\n Pass an int for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n n_jobs : int or None, default=None\n Number of jobs to run in parallel.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n verbose : int\n Controls the verbosity: the higher, the more messages.\n\n Attributes\n ----------\n n_resources_ : list of int\n The amount of resources used at each iteration.\n\n n_candidates_ : list of int\n The number of candidate parameters that were evaluated at each\n iteration.\n\n n_remaining_candidates_ : int\n The number of candidate parameters that are left after the last\n iteration. It corresponds to `ceil(n_candidates[-1] / factor)`\n\n max_resources_ : int\n The maximum number of resources that any candidate is allowed to use\n for a given iteration. Note that since the number of resources used at\n each iteration must be a multiple of ``min_resources_``, the actual\n number of resources used at the last iteration may be smaller than\n ``max_resources_``.\n\n min_resources_ : int\n The amount of resources that are allocated for each candidate at the\n first iteration.\n\n n_iterations_ : int\n The actual number of iterations that were run. 
This is equal to\n ``n_required_iterations_`` if ``aggressive_elimination`` is ``True``.\n Else, this is equal to ``min(n_possible_iterations_,\n n_required_iterations_)``.\n\n n_possible_iterations_ : int\n The number of iterations that are possible starting with\n ``min_resources_`` resources and without exceeding\n ``max_resources_``.\n\n n_required_iterations_ : int\n The number of iterations that are required to end up with less than\n ``factor`` candidates at the last iteration, starting with\n ``min_resources_`` resources. This will be smaller than\n ``n_possible_iterations_`` when there isn't enough resources.\n\n cv_results_ : dict of numpy (masked) ndarrays\n A dict with keys as column headers and values as columns, that can be\n imported into a pandas ``DataFrame``. It contains lots of information\n for analysing the results of a search.\n Please refer to the :ref:`User guide`\n for details.\n\n best_estimator_ : estimator or dict\n Estimator that was chosen by the search, i.e. estimator\n which gave highest score (or smallest loss if specified)\n on the left out data. Not available if ``refit=False``.\n\n best_score_ : float\n Mean cross-validated score of the best_estimator.\n\n best_params_ : dict\n Parameter setting that gave the best results on the hold out data.\n\n best_index_ : int\n The index (of the ``cv_results_`` arrays) which corresponds to the best\n candidate parameter setting.\n\n The dict at ``search.cv_results_['params'][search.best_index_]`` gives\n the parameter setting for the best model, that gives the highest\n mean score (``search.best_score_``).\n\n scorer_ : function or a dict\n Scorer function used on the held out data to choose the best\n parameters for the model.\n\n n_splits_ : int\n The number of cross-validation splits (folds/iterations).\n\n refit_time_ : float\n Seconds used for refitting the best model on the whole dataset.\n\n This is present only if ``refit`` is not False.\n\n multimetric_ : bool\n Whether or not the scorers compute several metrics.\n\n classes_ : ndarray of shape (n_classes,)\n The classes labels. This is present only if ``refit`` is specified and\n the underlying estimator is a classifier.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`. Only defined if\n `best_estimator_` is defined (see the documentation for the `refit`\n parameter for more details) and that `best_estimator_` exposes\n `n_features_in_` when fit.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Only defined if\n `best_estimator_` is defined (see the documentation for the `refit`\n parameter for more details) and that `best_estimator_` exposes\n `feature_names_in_` when fit.\n\n .. 
versionadded:: 1.0\n\n See Also\n --------\n :class:`HalvingGridSearchCV`:\n Search over a grid of parameters using successive halving.\n\n Notes\n -----\n The parameters selected are those that maximize the score of the held-out\n data, according to the scoring parameter.\n\n Examples\n --------\n\n >>> from sklearn.datasets import load_iris\n >>> from sklearn.ensemble import RandomForestClassifier\n >>> from sklearn.experimental import enable_halving_search_cv # noqa\n >>> from sklearn.model_selection import HalvingRandomSearchCV\n >>> from scipy.stats import randint\n >>> import numpy as np\n ...\n >>> X, y = load_iris(return_X_y=True)\n >>> clf = RandomForestClassifier(random_state=0)\n >>> np.random.seed(0)\n ...\n >>> param_distributions = {\"max_depth\": [3, None],\n ... \"min_samples_split\": randint(2, 11)}\n >>> search = HalvingRandomSearchCV(clf, param_distributions,\n ... resource='n_estimators',\n ... max_resources=10,\n ... random_state=0).fit(X, y)\n >>> search.best_params_ # doctest: +SKIP\n {'max_depth': None, 'min_samples_split': 10, 'n_estimators': 9}\n \"\"\"\n _required_parameters = ['estimator', 'param_distributions']\n \n def __init__(self, estimator, param_distributions, *, n_candidates='exhaust', factor=3, resource='n_samples', max_resources='auto', min_resources='smallest', aggressive_elimination=False, cv=5, scoring=None, refit=True, error_score=np.nan, return_train_score=True, random_state=None, n_jobs=None, verbose=0):\n super().__init__(estimator, scoring=scoring, n_jobs=n_jobs, refit=refit, verbose=verbose, cv=cv, random_state=random_state, error_score=error_score, return_train_score=return_train_score, max_resources=max_resources, resource=resource, factor=factor, min_resources=min_resources, aggressive_elimination=aggressive_elimination)\n self.param_distributions = param_distributions\n self.n_candidates = n_candidates\n \n def _generate_candidate_params(self):\n n_candidates_first_iter = self.n_candidates\n if n_candidates_first_iter == 'exhaust':\n n_candidates_first_iter = self.max_resources_ // self.min_resources_\n return ParameterSampler(self.param_distributions, n_candidates_first_iter, random_state=self.random_state)\n" }, @@ -24741,7 +24823,7 @@ "sklearn.model_selection._split.GroupKFold.split" ], "is_public": true, - "description": "K-fold iterator variant with non-overlapping groups.\n\nThe same group will not appear in two different folds (the number of distinct groups has to be at least equal to the number of folds). The folds are approximately balanced in the sense that the number of distinct groups is approximately the same in each fold. Read more in the :ref:`User Guide `.", + "description": "K-fold iterator variant with non-overlapping groups.\n\nThe same group will not appear in two different folds (the number of\ndistinct groups has to be at least equal to the number of folds).\n\nThe folds are approximately balanced in the sense that the number of\ndistinct groups is approximately the same in each fold.\n\nRead more in the :ref:`User Guide `.", "docstring": "K-fold iterator variant with non-overlapping groups.\n\n The same group will not appear in two different folds (the number of\n distinct groups has to be at least equal to the number of folds).\n\n The folds are approximately balanced in the sense that the number of\n distinct groups is approximately the same in each fold.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n n_splits : int, default=5\n Number of folds. Must be at least 2.\n\n .. 
versionchanged:: 0.22\n ``n_splits`` default value changed from 3 to 5.\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.model_selection import GroupKFold\n >>> X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])\n >>> y = np.array([1, 2, 3, 4])\n >>> groups = np.array([0, 0, 2, 2])\n >>> group_kfold = GroupKFold(n_splits=2)\n >>> group_kfold.get_n_splits(X, y, groups)\n 2\n >>> print(group_kfold)\n GroupKFold(n_splits=2)\n >>> for train_index, test_index in group_kfold.split(X, y, groups):\n ... print(\"TRAIN:\", train_index, \"TEST:\", test_index)\n ... X_train, X_test = X[train_index], X[test_index]\n ... y_train, y_test = y[train_index], y[test_index]\n ... print(X_train, X_test, y_train, y_test)\n ...\n TRAIN: [0 1] TEST: [2 3]\n [[1 2]\n [3 4]] [[5 6]\n [7 8]] [1 2] [3 4]\n TRAIN: [2 3] TEST: [0 1]\n [[5 6]\n [7 8]] [[1 2]\n [3 4]] [3 4] [1 2]\n\n See Also\n --------\n LeaveOneGroupOut : For splitting the data according to explicit\n domain-specific stratification of the dataset.\n ", "source_code": "\n\nclass GroupKFold(_BaseKFold):\n \"\"\"K-fold iterator variant with non-overlapping groups.\n\n The same group will not appear in two different folds (the number of\n distinct groups has to be at least equal to the number of folds).\n\n The folds are approximately balanced in the sense that the number of\n distinct groups is approximately the same in each fold.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n n_splits : int, default=5\n Number of folds. Must be at least 2.\n\n .. versionchanged:: 0.22\n ``n_splits`` default value changed from 3 to 5.\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.model_selection import GroupKFold\n >>> X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])\n >>> y = np.array([1, 2, 3, 4])\n >>> groups = np.array([0, 0, 2, 2])\n >>> group_kfold = GroupKFold(n_splits=2)\n >>> group_kfold.get_n_splits(X, y, groups)\n 2\n >>> print(group_kfold)\n GroupKFold(n_splits=2)\n >>> for train_index, test_index in group_kfold.split(X, y, groups):\n ... print(\"TRAIN:\", train_index, \"TEST:\", test_index)\n ... X_train, X_test = X[train_index], X[test_index]\n ... y_train, y_test = y[train_index], y[test_index]\n ... print(X_train, X_test, y_train, y_test)\n ...\n TRAIN: [0 1] TEST: [2 3]\n [[1 2]\n [3 4]] [[5 6]\n [7 8]] [1 2] [3 4]\n TRAIN: [2 3] TEST: [0 1]\n [[5 6]\n [7 8]] [[1 2]\n [3 4]] [3 4] [1 2]\n\n See Also\n --------\n LeaveOneGroupOut : For splitting the data according to explicit\n domain-specific stratification of the dataset.\n \"\"\"\n \n def __init__(self, n_splits=5):\n super().__init__(n_splits, shuffle=False, random_state=None)\n \n def _iter_test_indices(self, X, y, groups):\n if groups is None:\n raise ValueError(\"The 'groups' parameter should not be None.\")\n groups = check_array(groups, ensure_2d=False, dtype=None)\n (unique_groups, groups) = np.unique(groups, return_inverse=True)\n n_groups = len(unique_groups)\n if self.n_splits > n_groups:\n raise ValueError('Cannot have number of splits n_splits=%d greater than the number of groups: %d.' 
% (self.n_splits, n_groups))\n n_samples_per_group = np.bincount(groups)\n indices = np.argsort(n_samples_per_group)[::-1]\n n_samples_per_group = n_samples_per_group[indices]\n n_samples_per_fold = np.zeros(self.n_splits)\n group_to_fold = np.zeros(len(unique_groups))\n for (group_index, weight) in enumerate(n_samples_per_group):\n lightest_fold = np.argmin(n_samples_per_fold)\n n_samples_per_fold[lightest_fold] += weight\n group_to_fold[indices[group_index]] = lightest_fold\n indices = group_to_fold[groups]\n for f in range(self.n_splits):\n yield np.where(indices == f)[0]\n \n def split(self, X, y=None, groups=None):\n \"\"\"Generate indices to split data into training and test set.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training data, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n y : array-like of shape (n_samples,), default=None\n The target variable for supervised learning problems.\n\n groups : array-like of shape (n_samples,)\n Group labels for the samples used while splitting the dataset into\n train/test set.\n\n Yields\n ------\n train : ndarray\n The training set indices for that split.\n\n test : ndarray\n The testing set indices for that split.\n \"\"\"\n return super().split(X, y, groups)\n" }, @@ -24756,7 +24838,7 @@ "sklearn.model_selection._split.GroupShuffleSplit.split" ], "is_public": true, - "description": "Shuffle-Group(s)-Out cross-validation iterator\n\nProvides randomized train/test indices to split data according to a third-party provided group. This group information can be used to encode arbitrary domain specific stratifications of the samples as integers. For instance the groups could be the year of collection of the samples and thus allow for cross-validation against time-based splits. The difference between LeavePGroupsOut and GroupShuffleSplit is that the former generates splits using all subsets of size ``p`` unique groups, whereas GroupShuffleSplit generates a user-determined number of random test splits, each with a user-determined fraction of unique groups. For example, a less computationally intensive alternative to ``LeavePGroupsOut(p=10)`` would be ``GroupShuffleSplit(test_size=10, n_splits=100)``. Note: The parameters ``test_size`` and ``train_size`` refer to groups, and not to samples, as in ShuffleSplit. Read more in the :ref:`User Guide `.", + "description": "Shuffle-Group(s)-Out cross-validation iterator\n\nProvides randomized train/test indices to split data according to a\nthird-party provided group. 
This group information can be used to encode\narbitrary domain specific stratifications of the samples as integers.\n\nFor instance the groups could be the year of collection of the samples\nand thus allow for cross-validation against time-based splits.\n\nThe difference between LeavePGroupsOut and GroupShuffleSplit is that\nthe former generates splits using all subsets of size ``p`` unique groups,\nwhereas GroupShuffleSplit generates a user-determined number of random\ntest splits, each with a user-determined fraction of unique groups.\n\nFor example, a less computationally intensive alternative to\n``LeavePGroupsOut(p=10)`` would be\n``GroupShuffleSplit(test_size=10, n_splits=100)``.\n\nNote: The parameters ``test_size`` and ``train_size`` refer to groups, and\nnot to samples, as in ShuffleSplit.\n\nRead more in the :ref:`User Guide `.", "docstring": "Shuffle-Group(s)-Out cross-validation iterator\n\n Provides randomized train/test indices to split data according to a\n third-party provided group. This group information can be used to encode\n arbitrary domain specific stratifications of the samples as integers.\n\n For instance the groups could be the year of collection of the samples\n and thus allow for cross-validation against time-based splits.\n\n The difference between LeavePGroupsOut and GroupShuffleSplit is that\n the former generates splits using all subsets of size ``p`` unique groups,\n whereas GroupShuffleSplit generates a user-determined number of random\n test splits, each with a user-determined fraction of unique groups.\n\n For example, a less computationally intensive alternative to\n ``LeavePGroupsOut(p=10)`` would be\n ``GroupShuffleSplit(test_size=10, n_splits=100)``.\n\n Note: The parameters ``test_size`` and ``train_size`` refer to groups, and\n not to samples, as in ShuffleSplit.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n n_splits : int, default=5\n Number of re-shuffling & splitting iterations.\n\n test_size : float, int, default=0.2\n If float, should be between 0.0 and 1.0 and represent the proportion\n of groups to include in the test split (rounded up). If int,\n represents the absolute number of test groups. If None, the value is\n set to the complement of the train size.\n The default will change in version 0.21. It will remain 0.2 only\n if ``train_size`` is unspecified, otherwise it will complement\n the specified ``train_size``.\n\n train_size : float or int, default=None\n If float, should be between 0.0 and 1.0 and represent the\n proportion of the groups to include in the train split. If\n int, represents the absolute number of train groups. If None,\n the value is automatically set to the complement of the test size.\n\n random_state : int, RandomState instance or None, default=None\n Controls the randomness of the training and testing indices produced.\n Pass an int for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.model_selection import GroupShuffleSplit\n >>> X = np.ones(shape=(8, 2))\n >>> y = np.ones(shape=(8, 1))\n >>> groups = np.array([1, 1, 2, 2, 2, 3, 3, 3])\n >>> print(groups.shape)\n (8,)\n >>> gss = GroupShuffleSplit(n_splits=2, train_size=.7, random_state=42)\n >>> gss.get_n_splits()\n 2\n >>> for train_idx, test_idx in gss.split(X, y, groups):\n ... 
print(\"TRAIN:\", train_idx, \"TEST:\", test_idx)\n TRAIN: [2 3 4 5 6 7] TEST: [0 1]\n TRAIN: [0 1 5 6 7] TEST: [2 3 4]\n ", "source_code": "\n\nclass GroupShuffleSplit(ShuffleSplit):\n \"\"\"Shuffle-Group(s)-Out cross-validation iterator\n\n Provides randomized train/test indices to split data according to a\n third-party provided group. This group information can be used to encode\n arbitrary domain specific stratifications of the samples as integers.\n\n For instance the groups could be the year of collection of the samples\n and thus allow for cross-validation against time-based splits.\n\n The difference between LeavePGroupsOut and GroupShuffleSplit is that\n the former generates splits using all subsets of size ``p`` unique groups,\n whereas GroupShuffleSplit generates a user-determined number of random\n test splits, each with a user-determined fraction of unique groups.\n\n For example, a less computationally intensive alternative to\n ``LeavePGroupsOut(p=10)`` would be\n ``GroupShuffleSplit(test_size=10, n_splits=100)``.\n\n Note: The parameters ``test_size`` and ``train_size`` refer to groups, and\n not to samples, as in ShuffleSplit.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n n_splits : int, default=5\n Number of re-shuffling & splitting iterations.\n\n test_size : float, int, default=0.2\n If float, should be between 0.0 and 1.0 and represent the proportion\n of groups to include in the test split (rounded up). If int,\n represents the absolute number of test groups. If None, the value is\n set to the complement of the train size.\n The default will change in version 0.21. It will remain 0.2 only\n if ``train_size`` is unspecified, otherwise it will complement\n the specified ``train_size``.\n\n train_size : float or int, default=None\n If float, should be between 0.0 and 1.0 and represent the\n proportion of the groups to include in the train split. If\n int, represents the absolute number of train groups. If None,\n the value is automatically set to the complement of the test size.\n\n random_state : int, RandomState instance or None, default=None\n Controls the randomness of the training and testing indices produced.\n Pass an int for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.model_selection import GroupShuffleSplit\n >>> X = np.ones(shape=(8, 2))\n >>> y = np.ones(shape=(8, 1))\n >>> groups = np.array([1, 1, 2, 2, 2, 3, 3, 3])\n >>> print(groups.shape)\n (8,)\n >>> gss = GroupShuffleSplit(n_splits=2, train_size=.7, random_state=42)\n >>> gss.get_n_splits()\n 2\n >>> for train_idx, test_idx in gss.split(X, y, groups):\n ... 
print(\"TRAIN:\", train_idx, \"TEST:\", test_idx)\n TRAIN: [2 3 4 5 6 7] TEST: [0 1]\n TRAIN: [0 1 5 6 7] TEST: [2 3 4]\n \"\"\"\n \n def __init__(self, n_splits=5, *, test_size=None, train_size=None, random_state=None):\n super().__init__(n_splits=n_splits, test_size=test_size, train_size=train_size, random_state=random_state)\n self._default_test_size = 0.2\n \n def _iter_indices(self, X, y, groups):\n if groups is None:\n raise ValueError(\"The 'groups' parameter should not be None.\")\n groups = check_array(groups, ensure_2d=False, dtype=None)\n (classes, group_indices) = np.unique(groups, return_inverse=True)\n for (group_train, group_test) in super()._iter_indices(X=classes):\n train = np.flatnonzero(np.in1d(group_indices, group_train))\n test = np.flatnonzero(np.in1d(group_indices, group_test))\n yield (train, test)\n \n def split(self, X, y=None, groups=None):\n \"\"\"Generate indices to split data into training and test set.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training data, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n y : array-like of shape (n_samples,), default=None\n The target variable for supervised learning problems.\n\n groups : array-like of shape (n_samples,)\n Group labels for the samples used while splitting the dataset into\n train/test set.\n\n Yields\n ------\n train : ndarray\n The training set indices for that split.\n\n test : ndarray\n The testing set indices for that split.\n\n Notes\n -----\n Randomized CV splitters may return different results for each call of\n split. You can make the results identical by setting `random_state`\n to an integer.\n \"\"\"\n return super().split(X, y, groups)\n" }, @@ -24770,7 +24852,7 @@ "sklearn.model_selection._split.KFold._iter_test_indices" ], "is_public": true, - "description": "K-Folds cross-validator\n\nProvides train/test indices to split data in train/test sets. Split dataset into k consecutive folds (without shuffling by default). Each fold is then used once as a validation while the k - 1 remaining folds form the training set. Read more in the :ref:`User Guide `.", + "description": "K-Folds cross-validator\n\nProvides train/test indices to split data in train/test sets. Split\ndataset into k consecutive folds (without shuffling by default).\n\nEach fold is then used once as a validation while the k - 1 remaining\nfolds form the training set.\n\nRead more in the :ref:`User Guide `.", "docstring": "K-Folds cross-validator\n\n Provides train/test indices to split data in train/test sets. Split\n dataset into k consecutive folds (without shuffling by default).\n\n Each fold is then used once as a validation while the k - 1 remaining\n folds form the training set.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n n_splits : int, default=5\n Number of folds. Must be at least 2.\n\n .. versionchanged:: 0.22\n ``n_splits`` default value changed from 3 to 5.\n\n shuffle : bool, default=False\n Whether to shuffle the data before splitting into batches.\n Note that the samples within each split will not be shuffled.\n\n random_state : int, RandomState instance or None, default=None\n When `shuffle` is True, `random_state` affects the ordering of the\n indices, which controls the randomness of each fold. 
Otherwise, this\n parameter has no effect.\n Pass an int for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.model_selection import KFold\n >>> X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]])\n >>> y = np.array([1, 2, 3, 4])\n >>> kf = KFold(n_splits=2)\n >>> kf.get_n_splits(X)\n 2\n >>> print(kf)\n KFold(n_splits=2, random_state=None, shuffle=False)\n >>> for train_index, test_index in kf.split(X):\n ... print(\"TRAIN:\", train_index, \"TEST:\", test_index)\n ... X_train, X_test = X[train_index], X[test_index]\n ... y_train, y_test = y[train_index], y[test_index]\n TRAIN: [2 3] TEST: [0 1]\n TRAIN: [0 1] TEST: [2 3]\n\n Notes\n -----\n The first ``n_samples % n_splits`` folds have size\n ``n_samples // n_splits + 1``, other folds have size\n ``n_samples // n_splits``, where ``n_samples`` is the number of samples.\n\n Randomized CV splitters may return different results for each call of\n split. You can make the results identical by setting `random_state`\n to an integer.\n\n See Also\n --------\n StratifiedKFold : Takes group information into account to avoid building\n folds with imbalanced class distributions (for binary or multiclass\n classification tasks).\n\n GroupKFold : K-fold iterator variant with non-overlapping groups.\n\n RepeatedKFold : Repeats K-Fold n times.\n ", "source_code": "\n\nclass KFold(_BaseKFold):\n \"\"\"K-Folds cross-validator\n\n Provides train/test indices to split data in train/test sets. Split\n dataset into k consecutive folds (without shuffling by default).\n\n Each fold is then used once as a validation while the k - 1 remaining\n folds form the training set.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n n_splits : int, default=5\n Number of folds. Must be at least 2.\n\n .. versionchanged:: 0.22\n ``n_splits`` default value changed from 3 to 5.\n\n shuffle : bool, default=False\n Whether to shuffle the data before splitting into batches.\n Note that the samples within each split will not be shuffled.\n\n random_state : int, RandomState instance or None, default=None\n When `shuffle` is True, `random_state` affects the ordering of the\n indices, which controls the randomness of each fold. Otherwise, this\n parameter has no effect.\n Pass an int for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.model_selection import KFold\n >>> X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]])\n >>> y = np.array([1, 2, 3, 4])\n >>> kf = KFold(n_splits=2)\n >>> kf.get_n_splits(X)\n 2\n >>> print(kf)\n KFold(n_splits=2, random_state=None, shuffle=False)\n >>> for train_index, test_index in kf.split(X):\n ... print(\"TRAIN:\", train_index, \"TEST:\", test_index)\n ... X_train, X_test = X[train_index], X[test_index]\n ... y_train, y_test = y[train_index], y[test_index]\n TRAIN: [2 3] TEST: [0 1]\n TRAIN: [0 1] TEST: [2 3]\n\n Notes\n -----\n The first ``n_samples % n_splits`` folds have size\n ``n_samples // n_splits + 1``, other folds have size\n ``n_samples // n_splits``, where ``n_samples`` is the number of samples.\n\n Randomized CV splitters may return different results for each call of\n split. 
You can make the results identical by setting `random_state`\n to an integer.\n\n See Also\n --------\n StratifiedKFold : Takes group information into account to avoid building\n folds with imbalanced class distributions (for binary or multiclass\n classification tasks).\n\n GroupKFold : K-fold iterator variant with non-overlapping groups.\n\n RepeatedKFold : Repeats K-Fold n times.\n \"\"\"\n \n def __init__(self, n_splits=5, *, shuffle=False, random_state=None):\n super().__init__(n_splits=n_splits, shuffle=shuffle, random_state=random_state)\n \n def _iter_test_indices(self, X, y=None, groups=None):\n n_samples = _num_samples(X)\n indices = np.arange(n_samples)\n if self.shuffle:\n check_random_state(self.random_state).shuffle(indices)\n n_splits = self.n_splits\n fold_sizes = np.full(n_splits, n_samples // n_splits, dtype=int)\n fold_sizes[:n_samples % n_splits] += 1\n current = 0\n for fold_size in fold_sizes:\n (start, stop) = (current, current + fold_size)\n yield indices[start:stop]\n current = stop\n" }, @@ -24785,7 +24867,7 @@ "sklearn.model_selection._split.LeaveOneGroupOut.split" ], "is_public": true, - "description": "Leave One Group Out cross-validator\n\nProvides train/test indices to split data according to a third-party provided group. This group information can be used to encode arbitrary domain specific stratifications of the samples as integers. For instance the groups could be the year of collection of the samples and thus allow for cross-validation against time-based splits. Read more in the :ref:`User Guide `.", + "description": "Leave One Group Out cross-validator\n\nProvides train/test indices to split data according to a third-party\nprovided group. This group information can be used to encode arbitrary\ndomain specific stratifications of the samples as integers.\n\nFor instance the groups could be the year of collection of the samples\nand thus allow for cross-validation against time-based splits.\n\nRead more in the :ref:`User Guide `.", "docstring": "Leave One Group Out cross-validator\n\n Provides train/test indices to split data according to a third-party\n provided group. This group information can be used to encode arbitrary\n domain specific stratifications of the samples as integers.\n\n For instance the groups could be the year of collection of the samples\n and thus allow for cross-validation against time-based splits.\n\n Read more in the :ref:`User Guide `.\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.model_selection import LeaveOneGroupOut\n >>> X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])\n >>> y = np.array([1, 2, 1, 2])\n >>> groups = np.array([1, 1, 2, 2])\n >>> logo = LeaveOneGroupOut()\n >>> logo.get_n_splits(X, y, groups)\n 2\n >>> logo.get_n_splits(groups=groups) # 'groups' is always required\n 2\n >>> print(logo)\n LeaveOneGroupOut()\n >>> for train_index, test_index in logo.split(X, y, groups):\n ... print(\"TRAIN:\", train_index, \"TEST:\", test_index)\n ... X_train, X_test = X[train_index], X[test_index]\n ... y_train, y_test = y[train_index], y[test_index]\n ... print(X_train, X_test, y_train, y_test)\n TRAIN: [2 3] TEST: [0 1]\n [[5 6]\n [7 8]] [[1 2]\n [3 4]] [1 2] [1 2]\n TRAIN: [0 1] TEST: [2 3]\n [[1 2]\n [3 4]] [[5 6]\n [7 8]] [1 2] [1 2]\n\n ", "source_code": "\n\nclass LeaveOneGroupOut(BaseCrossValidator):\n \"\"\"Leave One Group Out cross-validator\n\n Provides train/test indices to split data according to a third-party\n provided group. 
This group information can be used to encode arbitrary\n domain specific stratifications of the samples as integers.\n\n For instance the groups could be the year of collection of the samples\n and thus allow for cross-validation against time-based splits.\n\n Read more in the :ref:`User Guide `.\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.model_selection import LeaveOneGroupOut\n >>> X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])\n >>> y = np.array([1, 2, 1, 2])\n >>> groups = np.array([1, 1, 2, 2])\n >>> logo = LeaveOneGroupOut()\n >>> logo.get_n_splits(X, y, groups)\n 2\n >>> logo.get_n_splits(groups=groups) # 'groups' is always required\n 2\n >>> print(logo)\n LeaveOneGroupOut()\n >>> for train_index, test_index in logo.split(X, y, groups):\n ... print(\"TRAIN:\", train_index, \"TEST:\", test_index)\n ... X_train, X_test = X[train_index], X[test_index]\n ... y_train, y_test = y[train_index], y[test_index]\n ... print(X_train, X_test, y_train, y_test)\n TRAIN: [2 3] TEST: [0 1]\n [[5 6]\n [7 8]] [[1 2]\n [3 4]] [1 2] [1 2]\n TRAIN: [0 1] TEST: [2 3]\n [[1 2]\n [3 4]] [[5 6]\n [7 8]] [1 2] [1 2]\n\n \"\"\"\n \n def _iter_test_masks(self, X, y, groups):\n if groups is None:\n raise ValueError(\"The 'groups' parameter should not be None.\")\n groups = check_array(groups, copy=True, ensure_2d=False, dtype=None)\n unique_groups = np.unique(groups)\n if len(unique_groups) <= 1:\n raise ValueError('The groups parameter contains fewer than 2 unique groups (%s). LeaveOneGroupOut expects at least 2.' % unique_groups)\n for i in unique_groups:\n yield groups == i\n \n def get_n_splits(self, X=None, y=None, groups=None):\n \"\"\"Returns the number of splitting iterations in the cross-validator\n\n Parameters\n ----------\n X : object\n Always ignored, exists for compatibility.\n\n y : object\n Always ignored, exists for compatibility.\n\n groups : array-like of shape (n_samples,)\n Group labels for the samples used while splitting the dataset into\n train/test set. This 'groups' parameter must always be specified to\n calculate the number of splits, though the other parameters can be\n omitted.\n\n Returns\n -------\n n_splits : int\n Returns the number of splitting iterations in the cross-validator.\n \"\"\"\n if groups is None:\n raise ValueError(\"The 'groups' parameter should not be None.\")\n groups = check_array(groups, ensure_2d=False, dtype=None)\n return len(np.unique(groups))\n \n def split(self, X, y=None, groups=None):\n \"\"\"Generate indices to split data into training and test set.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training data, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n y : array-like of shape (n_samples,), default=None\n The target variable for supervised learning problems.\n\n groups : array-like of shape (n_samples,)\n Group labels for the samples used while splitting the dataset into\n train/test set.\n\n Yields\n ------\n train : ndarray\n The training set indices for that split.\n\n test : ndarray\n The testing set indices for that split.\n \"\"\"\n return super().split(X, y, groups)\n" }, @@ -24799,7 +24881,7 @@ "sklearn.model_selection._split.LeaveOneOut.get_n_splits" ], "is_public": true, - "description": "Leave-One-Out cross-validator\n\nProvides train/test indices to split data in train/test sets. Each sample is used once as a test set (singleton) while the remaining samples form the training set. 
Note: ``LeaveOneOut()`` is equivalent to ``KFold(n_splits=n)`` and ``LeavePOut(p=1)`` where ``n`` is the number of samples. Due to the high number of test sets (which is the same as the number of samples) this cross-validation method can be very costly. For large datasets one should favor :class:`KFold`, :class:`ShuffleSplit` or :class:`StratifiedKFold`. Read more in the :ref:`User Guide `.", + "description": "Leave-One-Out cross-validator\n\nProvides train/test indices to split data in train/test sets. Each\nsample is used once as a test set (singleton) while the remaining\nsamples form the training set.\n\nNote: ``LeaveOneOut()`` is equivalent to ``KFold(n_splits=n)`` and\n``LeavePOut(p=1)`` where ``n`` is the number of samples.\n\nDue to the high number of test sets (which is the same as the\nnumber of samples) this cross-validation method can be very costly.\nFor large datasets one should favor :class:`KFold`, :class:`ShuffleSplit`\nor :class:`StratifiedKFold`.\n\nRead more in the :ref:`User Guide `.", "docstring": "Leave-One-Out cross-validator\n\n Provides train/test indices to split data in train/test sets. Each\n sample is used once as a test set (singleton) while the remaining\n samples form the training set.\n\n Note: ``LeaveOneOut()`` is equivalent to ``KFold(n_splits=n)`` and\n ``LeavePOut(p=1)`` where ``n`` is the number of samples.\n\n Due to the high number of test sets (which is the same as the\n number of samples) this cross-validation method can be very costly.\n For large datasets one should favor :class:`KFold`, :class:`ShuffleSplit`\n or :class:`StratifiedKFold`.\n\n Read more in the :ref:`User Guide `.\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.model_selection import LeaveOneOut\n >>> X = np.array([[1, 2], [3, 4]])\n >>> y = np.array([1, 2])\n >>> loo = LeaveOneOut()\n >>> loo.get_n_splits(X)\n 2\n >>> print(loo)\n LeaveOneOut()\n >>> for train_index, test_index in loo.split(X):\n ... print(\"TRAIN:\", train_index, \"TEST:\", test_index)\n ... X_train, X_test = X[train_index], X[test_index]\n ... y_train, y_test = y[train_index], y[test_index]\n ... print(X_train, X_test, y_train, y_test)\n TRAIN: [1] TEST: [0]\n [[3 4]] [[1 2]] [2] [1]\n TRAIN: [0] TEST: [1]\n [[1 2]] [[3 4]] [1] [2]\n\n See Also\n --------\n LeaveOneGroupOut : For splitting the data according to explicit,\n domain-specific stratification of the dataset.\n GroupKFold : K-fold iterator variant with non-overlapping groups.\n ", "source_code": "\n\nclass LeaveOneOut(BaseCrossValidator):\n \"\"\"Leave-One-Out cross-validator\n\n Provides train/test indices to split data in train/test sets. Each\n sample is used once as a test set (singleton) while the remaining\n samples form the training set.\n\n Note: ``LeaveOneOut()`` is equivalent to ``KFold(n_splits=n)`` and\n ``LeavePOut(p=1)`` where ``n`` is the number of samples.\n\n Due to the high number of test sets (which is the same as the\n number of samples) this cross-validation method can be very costly.\n For large datasets one should favor :class:`KFold`, :class:`ShuffleSplit`\n or :class:`StratifiedKFold`.\n\n Read more in the :ref:`User Guide `.\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.model_selection import LeaveOneOut\n >>> X = np.array([[1, 2], [3, 4]])\n >>> y = np.array([1, 2])\n >>> loo = LeaveOneOut()\n >>> loo.get_n_splits(X)\n 2\n >>> print(loo)\n LeaveOneOut()\n >>> for train_index, test_index in loo.split(X):\n ... print(\"TRAIN:\", train_index, \"TEST:\", test_index)\n ... 
X_train, X_test = X[train_index], X[test_index]\n ... y_train, y_test = y[train_index], y[test_index]\n ... print(X_train, X_test, y_train, y_test)\n TRAIN: [1] TEST: [0]\n [[3 4]] [[1 2]] [2] [1]\n TRAIN: [0] TEST: [1]\n [[1 2]] [[3 4]] [1] [2]\n\n See Also\n --------\n LeaveOneGroupOut : For splitting the data according to explicit,\n domain-specific stratification of the dataset.\n GroupKFold : K-fold iterator variant with non-overlapping groups.\n \"\"\"\n \n def _iter_test_indices(self, X, y=None, groups=None):\n n_samples = _num_samples(X)\n if n_samples <= 1:\n raise ValueError('Cannot perform LeaveOneOut with n_samples={}.'.format(n_samples))\n return range(n_samples)\n \n def get_n_splits(self, X, y=None, groups=None):\n \"\"\"Returns the number of splitting iterations in the cross-validator\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training data, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n y : object\n Always ignored, exists for compatibility.\n\n groups : object\n Always ignored, exists for compatibility.\n\n Returns\n -------\n n_splits : int\n Returns the number of splitting iterations in the cross-validator.\n \"\"\"\n if X is None:\n raise ValueError(\"The 'X' parameter should not be None.\")\n return _num_samples(X)\n" }, @@ -24815,7 +24897,7 @@ "sklearn.model_selection._split.LeavePGroupsOut.split" ], "is_public": true, - "description": "Leave P Group(s) Out cross-validator\n\nProvides train/test indices to split data according to a third-party provided group. This group information can be used to encode arbitrary domain specific stratifications of the samples as integers. For instance the groups could be the year of collection of the samples and thus allow for cross-validation against time-based splits. The difference between LeavePGroupsOut and LeaveOneGroupOut is that the former builds the test sets with all the samples assigned to ``p`` different values of the groups while the latter uses samples all assigned the same groups. Read more in the :ref:`User Guide `.", + "description": "Leave P Group(s) Out cross-validator\n\nProvides train/test indices to split data according to a third-party\nprovided group. This group information can be used to encode arbitrary\ndomain specific stratifications of the samples as integers.\n\nFor instance the groups could be the year of collection of the samples\nand thus allow for cross-validation against time-based splits.\n\nThe difference between LeavePGroupsOut and LeaveOneGroupOut is that\nthe former builds the test sets with all the samples assigned to\n``p`` different values of the groups while the latter uses samples\nall assigned the same groups.\n\nRead more in the :ref:`User Guide `.", "docstring": "Leave P Group(s) Out cross-validator\n\n Provides train/test indices to split data according to a third-party\n provided group. 
This group information can be used to encode arbitrary\n domain specific stratifications of the samples as integers.\n\n For instance the groups could be the year of collection of the samples\n and thus allow for cross-validation against time-based splits.\n\n The difference between LeavePGroupsOut and LeaveOneGroupOut is that\n the former builds the test sets with all the samples assigned to\n ``p`` different values of the groups while the latter uses samples\n all assigned the same groups.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n n_groups : int\n Number of groups (``p``) to leave out in the test split.\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.model_selection import LeavePGroupsOut\n >>> X = np.array([[1, 2], [3, 4], [5, 6]])\n >>> y = np.array([1, 2, 1])\n >>> groups = np.array([1, 2, 3])\n >>> lpgo = LeavePGroupsOut(n_groups=2)\n >>> lpgo.get_n_splits(X, y, groups)\n 3\n >>> lpgo.get_n_splits(groups=groups) # 'groups' is always required\n 3\n >>> print(lpgo)\n LeavePGroupsOut(n_groups=2)\n >>> for train_index, test_index in lpgo.split(X, y, groups):\n ... print(\"TRAIN:\", train_index, \"TEST:\", test_index)\n ... X_train, X_test = X[train_index], X[test_index]\n ... y_train, y_test = y[train_index], y[test_index]\n ... print(X_train, X_test, y_train, y_test)\n TRAIN: [2] TEST: [0 1]\n [[5 6]] [[1 2]\n [3 4]] [1] [1 2]\n TRAIN: [1] TEST: [0 2]\n [[3 4]] [[1 2]\n [5 6]] [2] [1 1]\n TRAIN: [0] TEST: [1 2]\n [[1 2]] [[3 4]\n [5 6]] [1] [2 1]\n\n See Also\n --------\n GroupKFold : K-fold iterator variant with non-overlapping groups.\n ", "source_code": "\n\nclass LeavePGroupsOut(BaseCrossValidator):\n \"\"\"Leave P Group(s) Out cross-validator\n\n Provides train/test indices to split data according to a third-party\n provided group. This group information can be used to encode arbitrary\n domain specific stratifications of the samples as integers.\n\n For instance the groups could be the year of collection of the samples\n and thus allow for cross-validation against time-based splits.\n\n The difference between LeavePGroupsOut and LeaveOneGroupOut is that\n the former builds the test sets with all the samples assigned to\n ``p`` different values of the groups while the latter uses samples\n all assigned the same groups.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n n_groups : int\n Number of groups (``p``) to leave out in the test split.\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.model_selection import LeavePGroupsOut\n >>> X = np.array([[1, 2], [3, 4], [5, 6]])\n >>> y = np.array([1, 2, 1])\n >>> groups = np.array([1, 2, 3])\n >>> lpgo = LeavePGroupsOut(n_groups=2)\n >>> lpgo.get_n_splits(X, y, groups)\n 3\n >>> lpgo.get_n_splits(groups=groups) # 'groups' is always required\n 3\n >>> print(lpgo)\n LeavePGroupsOut(n_groups=2)\n >>> for train_index, test_index in lpgo.split(X, y, groups):\n ... print(\"TRAIN:\", train_index, \"TEST:\", test_index)\n ... X_train, X_test = X[train_index], X[test_index]\n ... y_train, y_test = y[train_index], y[test_index]\n ... 
print(X_train, X_test, y_train, y_test)\n TRAIN: [2] TEST: [0 1]\n [[5 6]] [[1 2]\n [3 4]] [1] [1 2]\n TRAIN: [1] TEST: [0 2]\n [[3 4]] [[1 2]\n [5 6]] [2] [1 1]\n TRAIN: [0] TEST: [1 2]\n [[1 2]] [[3 4]\n [5 6]] [1] [2 1]\n\n See Also\n --------\n GroupKFold : K-fold iterator variant with non-overlapping groups.\n \"\"\"\n \n def __init__(self, n_groups):\n self.n_groups = n_groups\n \n def _iter_test_masks(self, X, y, groups):\n if groups is None:\n raise ValueError(\"The 'groups' parameter should not be None.\")\n groups = check_array(groups, copy=True, ensure_2d=False, dtype=None)\n unique_groups = np.unique(groups)\n if self.n_groups >= len(unique_groups):\n raise ValueError('The groups parameter contains fewer than (or equal to) n_groups (%d) numbers of unique groups (%s). LeavePGroupsOut expects that at least n_groups + 1 (%d) unique groups be present' % (self.n_groups, unique_groups, self.n_groups + 1))\n combi = combinations(range(len(unique_groups)), self.n_groups)\n for indices in combi:\n test_index = np.zeros(_num_samples(X), dtype=bool)\n for l in unique_groups[np.array(indices)]:\n test_index[groups == l] = True\n yield test_index\n \n def get_n_splits(self, X=None, y=None, groups=None):\n \"\"\"Returns the number of splitting iterations in the cross-validator\n\n Parameters\n ----------\n X : object\n Always ignored, exists for compatibility.\n\n y : object\n Always ignored, exists for compatibility.\n\n groups : array-like of shape (n_samples,)\n Group labels for the samples used while splitting the dataset into\n train/test set. This 'groups' parameter must always be specified to\n calculate the number of splits, though the other parameters can be\n omitted.\n\n Returns\n -------\n n_splits : int\n Returns the number of splitting iterations in the cross-validator.\n \"\"\"\n if groups is None:\n raise ValueError(\"The 'groups' parameter should not be None.\")\n groups = check_array(groups, ensure_2d=False, dtype=None)\n return int(comb(len(np.unique(groups)), self.n_groups, exact=True))\n \n def split(self, X, y=None, groups=None):\n \"\"\"Generate indices to split data into training and test set.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training data, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n y : array-like of shape (n_samples,), default=None\n The target variable for supervised learning problems.\n\n groups : array-like of shape (n_samples,)\n Group labels for the samples used while splitting the dataset into\n train/test set.\n\n Yields\n ------\n train : ndarray\n The training set indices for that split.\n\n test : ndarray\n The testing set indices for that split.\n \"\"\"\n return super().split(X, y, groups)\n" }, @@ -24830,7 +24912,7 @@ "sklearn.model_selection._split.LeavePOut.get_n_splits" ], "is_public": true, - "description": "Leave-P-Out cross-validator\n\nProvides train/test indices to split data in train/test sets. This results in testing on all distinct samples of size p, while the remaining n - p samples form the training set in each iteration. Note: ``LeavePOut(p)`` is NOT equivalent to ``KFold(n_splits=n_samples // p)`` which creates non-overlapping test sets. Due to the high number of iterations which grows combinatorically with the number of samples this cross-validation method can be very costly. For large datasets one should favor :class:`KFold`, :class:`StratifiedKFold` or :class:`ShuffleSplit`. 
Read more in the :ref:`User Guide `.", + "description": "Leave-P-Out cross-validator\n\nProvides train/test indices to split data in train/test sets. This results\nin testing on all distinct samples of size p, while the remaining n - p\nsamples form the training set in each iteration.\n\nNote: ``LeavePOut(p)`` is NOT equivalent to\n``KFold(n_splits=n_samples // p)`` which creates non-overlapping test sets.\n\nDue to the high number of iterations which grows combinatorically with the\nnumber of samples this cross-validation method can be very costly. For\nlarge datasets one should favor :class:`KFold`, :class:`StratifiedKFold`\nor :class:`ShuffleSplit`.\n\nRead more in the :ref:`User Guide `.", "docstring": "Leave-P-Out cross-validator\n\n Provides train/test indices to split data in train/test sets. This results\n in testing on all distinct samples of size p, while the remaining n - p\n samples form the training set in each iteration.\n\n Note: ``LeavePOut(p)`` is NOT equivalent to\n ``KFold(n_splits=n_samples // p)`` which creates non-overlapping test sets.\n\n Due to the high number of iterations which grows combinatorically with the\n number of samples this cross-validation method can be very costly. For\n large datasets one should favor :class:`KFold`, :class:`StratifiedKFold`\n or :class:`ShuffleSplit`.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n p : int\n Size of the test sets. Must be strictly less than the number of\n samples.\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.model_selection import LeavePOut\n >>> X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])\n >>> y = np.array([1, 2, 3, 4])\n >>> lpo = LeavePOut(2)\n >>> lpo.get_n_splits(X)\n 6\n >>> print(lpo)\n LeavePOut(p=2)\n >>> for train_index, test_index in lpo.split(X):\n ... print(\"TRAIN:\", train_index, \"TEST:\", test_index)\n ... X_train, X_test = X[train_index], X[test_index]\n ... y_train, y_test = y[train_index], y[test_index]\n TRAIN: [2 3] TEST: [0 1]\n TRAIN: [1 3] TEST: [0 2]\n TRAIN: [1 2] TEST: [0 3]\n TRAIN: [0 3] TEST: [1 2]\n TRAIN: [0 2] TEST: [1 3]\n TRAIN: [0 1] TEST: [2 3]\n ", "source_code": "\n\nclass LeavePOut(BaseCrossValidator):\n \"\"\"Leave-P-Out cross-validator\n\n Provides train/test indices to split data in train/test sets. This results\n in testing on all distinct samples of size p, while the remaining n - p\n samples form the training set in each iteration.\n\n Note: ``LeavePOut(p)`` is NOT equivalent to\n ``KFold(n_splits=n_samples // p)`` which creates non-overlapping test sets.\n\n Due to the high number of iterations which grows combinatorically with the\n number of samples this cross-validation method can be very costly. For\n large datasets one should favor :class:`KFold`, :class:`StratifiedKFold`\n or :class:`ShuffleSplit`.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n p : int\n Size of the test sets. Must be strictly less than the number of\n samples.\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.model_selection import LeavePOut\n >>> X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])\n >>> y = np.array([1, 2, 3, 4])\n >>> lpo = LeavePOut(2)\n >>> lpo.get_n_splits(X)\n 6\n >>> print(lpo)\n LeavePOut(p=2)\n >>> for train_index, test_index in lpo.split(X):\n ... print(\"TRAIN:\", train_index, \"TEST:\", test_index)\n ... X_train, X_test = X[train_index], X[test_index]\n ... 
y_train, y_test = y[train_index], y[test_index]\n TRAIN: [2 3] TEST: [0 1]\n TRAIN: [1 3] TEST: [0 2]\n TRAIN: [1 2] TEST: [0 3]\n TRAIN: [0 3] TEST: [1 2]\n TRAIN: [0 2] TEST: [1 3]\n TRAIN: [0 1] TEST: [2 3]\n \"\"\"\n \n def __init__(self, p):\n self.p = p\n \n def _iter_test_indices(self, X, y=None, groups=None):\n n_samples = _num_samples(X)\n if n_samples <= self.p:\n raise ValueError('p={} must be strictly less than the number of samples={}'.format(self.p, n_samples))\n for combination in combinations(range(n_samples), self.p):\n yield np.array(combination)\n \n def get_n_splits(self, X, y=None, groups=None):\n \"\"\"Returns the number of splitting iterations in the cross-validator\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training data, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n y : object\n Always ignored, exists for compatibility.\n\n groups : object\n Always ignored, exists for compatibility.\n \"\"\"\n if X is None:\n raise ValueError(\"The 'X' parameter should not be None.\")\n return int(comb(_num_samples(X), self.p, exact=True))\n" }, @@ -24846,7 +24928,7 @@ "sklearn.model_selection._split.PredefinedSplit.get_n_splits" ], "is_public": true, - "description": "Predefined split cross-validator\n\nProvides train/test indices to split data into train/test sets using a predefined scheme specified by the user with the ``test_fold`` parameter. Read more in the :ref:`User Guide `. .. versionadded:: 0.16", + "description": "Predefined split cross-validator\n\nProvides train/test indices to split data into train/test sets using a\npredefined scheme specified by the user with the ``test_fold`` parameter.\n\nRead more in the :ref:`User Guide `.\n\n.. versionadded:: 0.16", "docstring": "Predefined split cross-validator\n\n Provides train/test indices to split data into train/test sets using a\n predefined scheme specified by the user with the ``test_fold`` parameter.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.16\n\n Parameters\n ----------\n test_fold : array-like of shape (n_samples,)\n The entry ``test_fold[i]`` represents the index of the test set that\n sample ``i`` belongs to. It is possible to exclude sample ``i`` from\n any test set (i.e. include sample ``i`` in every training set) by\n setting ``test_fold[i]`` equal to -1.\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.model_selection import PredefinedSplit\n >>> X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]])\n >>> y = np.array([0, 0, 1, 1])\n >>> test_fold = [0, 1, -1, 1]\n >>> ps = PredefinedSplit(test_fold)\n >>> ps.get_n_splits()\n 2\n >>> print(ps)\n PredefinedSplit(test_fold=array([ 0, 1, -1, 1]))\n >>> for train_index, test_index in ps.split():\n ... print(\"TRAIN:\", train_index, \"TEST:\", test_index)\n ... X_train, X_test = X[train_index], X[test_index]\n ... y_train, y_test = y[train_index], y[test_index]\n TRAIN: [1 2 3] TEST: [0]\n TRAIN: [0 2] TEST: [1 3]\n ", "source_code": "\n\nclass PredefinedSplit(BaseCrossValidator):\n \"\"\"Predefined split cross-validator\n\n Provides train/test indices to split data into train/test sets using a\n predefined scheme specified by the user with the ``test_fold`` parameter.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.16\n\n Parameters\n ----------\n test_fold : array-like of shape (n_samples,)\n The entry ``test_fold[i]`` represents the index of the test set that\n sample ``i`` belongs to. 
It is possible to exclude sample ``i`` from\n any test set (i.e. include sample ``i`` in every training set) by\n setting ``test_fold[i]`` equal to -1.\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.model_selection import PredefinedSplit\n >>> X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]])\n >>> y = np.array([0, 0, 1, 1])\n >>> test_fold = [0, 1, -1, 1]\n >>> ps = PredefinedSplit(test_fold)\n >>> ps.get_n_splits()\n 2\n >>> print(ps)\n PredefinedSplit(test_fold=array([ 0, 1, -1, 1]))\n >>> for train_index, test_index in ps.split():\n ... print(\"TRAIN:\", train_index, \"TEST:\", test_index)\n ... X_train, X_test = X[train_index], X[test_index]\n ... y_train, y_test = y[train_index], y[test_index]\n TRAIN: [1 2 3] TEST: [0]\n TRAIN: [0 2] TEST: [1 3]\n \"\"\"\n \n def __init__(self, test_fold):\n self.test_fold = np.array(test_fold, dtype=int)\n self.test_fold = column_or_1d(self.test_fold)\n self.unique_folds = np.unique(self.test_fold)\n self.unique_folds = self.unique_folds[self.unique_folds != -1]\n \n def split(self, X=None, y=None, groups=None):\n \"\"\"Generate indices to split data into training and test set.\n\n Parameters\n ----------\n X : object\n Always ignored, exists for compatibility.\n\n y : object\n Always ignored, exists for compatibility.\n\n groups : object\n Always ignored, exists for compatibility.\n\n Yields\n ------\n train : ndarray\n The training set indices for that split.\n\n test : ndarray\n The testing set indices for that split.\n \"\"\"\n ind = np.arange(len(self.test_fold))\n for test_index in self._iter_test_masks():\n train_index = ind[np.logical_not(test_index)]\n test_index = ind[test_index]\n yield (train_index, test_index)\n \n def _iter_test_masks(self):\n \"\"\"Generates boolean masks corresponding to test sets.\"\"\"\n for f in self.unique_folds:\n test_index = np.where(self.test_fold == f)[0]\n test_mask = np.zeros(len(self.test_fold), dtype=bool)\n test_mask[test_index] = True\n yield test_mask\n \n def get_n_splits(self, X=None, y=None, groups=None):\n \"\"\"Returns the number of splitting iterations in the cross-validator\n\n Parameters\n ----------\n X : object\n Always ignored, exists for compatibility.\n\n y : object\n Always ignored, exists for compatibility.\n\n groups : object\n Always ignored, exists for compatibility.\n\n Returns\n -------\n n_splits : int\n Returns the number of splitting iterations in the cross-validator.\n \"\"\"\n return len(self.unique_folds)\n" }, @@ -24859,7 +24941,7 @@ "sklearn.model_selection._split.RepeatedKFold.__init__" ], "is_public": true, - "description": "Repeated K-Fold cross validator.\n\nRepeats K-Fold n times with different randomization in each repetition. Read more in the :ref:`User Guide `.", + "description": "Repeated K-Fold cross validator.\n\nRepeats K-Fold n times with different randomization in each repetition.\n\nRead more in the :ref:`User Guide `.", "docstring": "Repeated K-Fold cross validator.\n\n Repeats K-Fold n times with different randomization in each repetition.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n n_splits : int, default=5\n Number of folds. 
Must be at least 2.\n\n n_repeats : int, default=10\n Number of times cross-validator needs to be repeated.\n\n random_state : int, RandomState instance or None, default=None\n Controls the randomness of each repeated cross-validation instance.\n Pass an int for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.model_selection import RepeatedKFold\n >>> X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]])\n >>> y = np.array([0, 0, 1, 1])\n >>> rkf = RepeatedKFold(n_splits=2, n_repeats=2, random_state=2652124)\n >>> for train_index, test_index in rkf.split(X):\n ... print(\"TRAIN:\", train_index, \"TEST:\", test_index)\n ... X_train, X_test = X[train_index], X[test_index]\n ... y_train, y_test = y[train_index], y[test_index]\n ...\n TRAIN: [0 1] TEST: [2 3]\n TRAIN: [2 3] TEST: [0 1]\n TRAIN: [1 2] TEST: [0 3]\n TRAIN: [0 3] TEST: [1 2]\n\n Notes\n -----\n Randomized CV splitters may return different results for each call of\n split. You can make the results identical by setting `random_state`\n to an integer.\n\n See Also\n --------\n RepeatedStratifiedKFold : Repeats Stratified K-Fold n times.\n ", "source_code": "\n\nclass RepeatedKFold(_RepeatedSplits):\n \"\"\"Repeated K-Fold cross validator.\n\n Repeats K-Fold n times with different randomization in each repetition.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n n_splits : int, default=5\n Number of folds. Must be at least 2.\n\n n_repeats : int, default=10\n Number of times cross-validator needs to be repeated.\n\n random_state : int, RandomState instance or None, default=None\n Controls the randomness of each repeated cross-validation instance.\n Pass an int for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.model_selection import RepeatedKFold\n >>> X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]])\n >>> y = np.array([0, 0, 1, 1])\n >>> rkf = RepeatedKFold(n_splits=2, n_repeats=2, random_state=2652124)\n >>> for train_index, test_index in rkf.split(X):\n ... print(\"TRAIN:\", train_index, \"TEST:\", test_index)\n ... X_train, X_test = X[train_index], X[test_index]\n ... y_train, y_test = y[train_index], y[test_index]\n ...\n TRAIN: [0 1] TEST: [2 3]\n TRAIN: [2 3] TEST: [0 1]\n TRAIN: [1 2] TEST: [0 3]\n TRAIN: [0 3] TEST: [1 2]\n\n Notes\n -----\n Randomized CV splitters may return different results for each call of\n split. You can make the results identical by setting `random_state`\n to an integer.\n\n See Also\n --------\n RepeatedStratifiedKFold : Repeats Stratified K-Fold n times.\n \"\"\"\n \n def __init__(self, *, n_splits=5, n_repeats=10, random_state=None):\n super().__init__(KFold, n_repeats=n_repeats, random_state=random_state, n_splits=n_splits)\n" }, @@ -24872,7 +24954,7 @@ "sklearn.model_selection._split.RepeatedStratifiedKFold.__init__" ], "is_public": true, - "description": "Repeated Stratified K-Fold cross validator.\n\nRepeats Stratified K-Fold n times with different randomization in each repetition. 
Read more in the :ref:`User Guide `.", + "description": "Repeated Stratified K-Fold cross validator.\n\nRepeats Stratified K-Fold n times with different randomization in each\nrepetition.\n\nRead more in the :ref:`User Guide `.", "docstring": "Repeated Stratified K-Fold cross validator.\n\n Repeats Stratified K-Fold n times with different randomization in each\n repetition.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n n_splits : int, default=5\n Number of folds. Must be at least 2.\n\n n_repeats : int, default=10\n Number of times cross-validator needs to be repeated.\n\n random_state : int, RandomState instance or None, default=None\n Controls the generation of the random states for each repetition.\n Pass an int for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.model_selection import RepeatedStratifiedKFold\n >>> X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]])\n >>> y = np.array([0, 0, 1, 1])\n >>> rskf = RepeatedStratifiedKFold(n_splits=2, n_repeats=2,\n ... random_state=36851234)\n >>> for train_index, test_index in rskf.split(X, y):\n ... print(\"TRAIN:\", train_index, \"TEST:\", test_index)\n ... X_train, X_test = X[train_index], X[test_index]\n ... y_train, y_test = y[train_index], y[test_index]\n ...\n TRAIN: [1 2] TEST: [0 3]\n TRAIN: [0 3] TEST: [1 2]\n TRAIN: [1 3] TEST: [0 2]\n TRAIN: [0 2] TEST: [1 3]\n\n Notes\n -----\n Randomized CV splitters may return different results for each call of\n split. You can make the results identical by setting `random_state`\n to an integer.\n\n See Also\n --------\n RepeatedKFold : Repeats K-Fold n times.\n ", "source_code": "\n\nclass RepeatedStratifiedKFold(_RepeatedSplits):\n \"\"\"Repeated Stratified K-Fold cross validator.\n\n Repeats Stratified K-Fold n times with different randomization in each\n repetition.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n n_splits : int, default=5\n Number of folds. Must be at least 2.\n\n n_repeats : int, default=10\n Number of times cross-validator needs to be repeated.\n\n random_state : int, RandomState instance or None, default=None\n Controls the generation of the random states for each repetition.\n Pass an int for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.model_selection import RepeatedStratifiedKFold\n >>> X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]])\n >>> y = np.array([0, 0, 1, 1])\n >>> rskf = RepeatedStratifiedKFold(n_splits=2, n_repeats=2,\n ... random_state=36851234)\n >>> for train_index, test_index in rskf.split(X, y):\n ... print(\"TRAIN:\", train_index, \"TEST:\", test_index)\n ... X_train, X_test = X[train_index], X[test_index]\n ... y_train, y_test = y[train_index], y[test_index]\n ...\n TRAIN: [1 2] TEST: [0 3]\n TRAIN: [0 3] TEST: [1 2]\n TRAIN: [1 3] TEST: [0 2]\n TRAIN: [0 2] TEST: [1 3]\n\n Notes\n -----\n Randomized CV splitters may return different results for each call of\n split. 
You can make the results identical by setting `random_state`\n to an integer.\n\n See Also\n --------\n RepeatedKFold : Repeats K-Fold n times.\n \"\"\"\n \n def __init__(self, *, n_splits=5, n_repeats=10, random_state=None):\n super().__init__(StratifiedKFold, n_repeats=n_repeats, random_state=random_state, n_splits=n_splits)\n" }, @@ -24886,7 +24968,7 @@ "sklearn.model_selection._split.ShuffleSplit._iter_indices" ], "is_public": true, - "description": "Random permutation cross-validator\n\nYields indices to split data into training and test sets. Note: contrary to other cross-validation strategies, random splits do not guarantee that all folds will be different, although this is still very likely for sizeable datasets. Read more in the :ref:`User Guide `.", + "description": "Random permutation cross-validator\n\nYields indices to split data into training and test sets.\n\nNote: contrary to other cross-validation strategies, random splits\ndo not guarantee that all folds will be different, although this is\nstill very likely for sizeable datasets.\n\nRead more in the :ref:`User Guide `.", "docstring": "Random permutation cross-validator\n\n Yields indices to split data into training and test sets.\n\n Note: contrary to other cross-validation strategies, random splits\n do not guarantee that all folds will be different, although this is\n still very likely for sizeable datasets.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n n_splits : int, default=10\n Number of re-shuffling & splitting iterations.\n\n test_size : float or int, default=None\n If float, should be between 0.0 and 1.0 and represent the proportion\n of the dataset to include in the test split. If int, represents the\n absolute number of test samples. If None, the value is set to the\n complement of the train size. If ``train_size`` is also None, it will\n be set to 0.1.\n\n train_size : float or int, default=None\n If float, should be between 0.0 and 1.0 and represent the\n proportion of the dataset to include in the train split. If\n int, represents the absolute number of train samples. If None,\n the value is automatically set to the complement of the test size.\n\n random_state : int, RandomState instance or None, default=None\n Controls the randomness of the training and testing indices produced.\n Pass an int for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.model_selection import ShuffleSplit\n >>> X = np.array([[1, 2], [3, 4], [5, 6], [7, 8], [3, 4], [5, 6]])\n >>> y = np.array([1, 2, 1, 2, 1, 2])\n >>> rs = ShuffleSplit(n_splits=5, test_size=.25, random_state=0)\n >>> rs.get_n_splits(X)\n 5\n >>> print(rs)\n ShuffleSplit(n_splits=5, random_state=0, test_size=0.25, train_size=None)\n >>> for train_index, test_index in rs.split(X):\n ... print(\"TRAIN:\", train_index, \"TEST:\", test_index)\n TRAIN: [1 3 0 4] TEST: [5 2]\n TRAIN: [4 0 2 5] TEST: [1 3]\n TRAIN: [1 2 4 0] TEST: [3 5]\n TRAIN: [3 4 1 0] TEST: [5 2]\n TRAIN: [3 5 1 0] TEST: [2 4]\n >>> rs = ShuffleSplit(n_splits=5, train_size=0.5, test_size=.25,\n ... random_state=0)\n >>> for train_index, test_index in rs.split(X):\n ... 
print(\"TRAIN:\", train_index, \"TEST:\", test_index)\n TRAIN: [1 3 0] TEST: [5 2]\n TRAIN: [4 0 2] TEST: [1 3]\n TRAIN: [1 2 4] TEST: [3 5]\n TRAIN: [3 4 1] TEST: [5 2]\n TRAIN: [3 5 1] TEST: [2 4]\n ", "source_code": "\n\nclass ShuffleSplit(BaseShuffleSplit):\n \"\"\"Random permutation cross-validator\n\n Yields indices to split data into training and test sets.\n\n Note: contrary to other cross-validation strategies, random splits\n do not guarantee that all folds will be different, although this is\n still very likely for sizeable datasets.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n n_splits : int, default=10\n Number of re-shuffling & splitting iterations.\n\n test_size : float or int, default=None\n If float, should be between 0.0 and 1.0 and represent the proportion\n of the dataset to include in the test split. If int, represents the\n absolute number of test samples. If None, the value is set to the\n complement of the train size. If ``train_size`` is also None, it will\n be set to 0.1.\n\n train_size : float or int, default=None\n If float, should be between 0.0 and 1.0 and represent the\n proportion of the dataset to include in the train split. If\n int, represents the absolute number of train samples. If None,\n the value is automatically set to the complement of the test size.\n\n random_state : int, RandomState instance or None, default=None\n Controls the randomness of the training and testing indices produced.\n Pass an int for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.model_selection import ShuffleSplit\n >>> X = np.array([[1, 2], [3, 4], [5, 6], [7, 8], [3, 4], [5, 6]])\n >>> y = np.array([1, 2, 1, 2, 1, 2])\n >>> rs = ShuffleSplit(n_splits=5, test_size=.25, random_state=0)\n >>> rs.get_n_splits(X)\n 5\n >>> print(rs)\n ShuffleSplit(n_splits=5, random_state=0, test_size=0.25, train_size=None)\n >>> for train_index, test_index in rs.split(X):\n ... print(\"TRAIN:\", train_index, \"TEST:\", test_index)\n TRAIN: [1 3 0 4] TEST: [5 2]\n TRAIN: [4 0 2 5] TEST: [1 3]\n TRAIN: [1 2 4 0] TEST: [3 5]\n TRAIN: [3 4 1 0] TEST: [5 2]\n TRAIN: [3 5 1 0] TEST: [2 4]\n >>> rs = ShuffleSplit(n_splits=5, train_size=0.5, test_size=.25,\n ... random_state=0)\n >>> for train_index, test_index in rs.split(X):\n ... 
print(\"TRAIN:\", train_index, \"TEST:\", test_index)\n TRAIN: [1 3 0] TEST: [5 2]\n TRAIN: [4 0 2] TEST: [1 3]\n TRAIN: [1 2 4] TEST: [3 5]\n TRAIN: [3 4 1] TEST: [5 2]\n TRAIN: [3 5 1] TEST: [2 4]\n \"\"\"\n \n def __init__(self, n_splits=10, *, test_size=None, train_size=None, random_state=None):\n super().__init__(n_splits=n_splits, test_size=test_size, train_size=train_size, random_state=random_state)\n self._default_test_size = 0.1\n \n def _iter_indices(self, X, y=None, groups=None):\n n_samples = _num_samples(X)\n (n_train, n_test) = _validate_shuffle_split(n_samples, self.test_size, self.train_size, default_test_size=self._default_test_size)\n rng = check_random_state(self.random_state)\n for i in range(self.n_splits):\n permutation = rng.permutation(n_samples)\n ind_test = permutation[:n_test]\n ind_train = permutation[n_test:n_test + n_train]\n yield (ind_train, ind_test)\n" }, @@ -24901,7 +24983,7 @@ "sklearn.model_selection._split.StratifiedGroupKFold._find_best_fold" ], "is_public": true, - "description": "Stratified K-Folds iterator variant with non-overlapping groups.\n\nThis cross-validation object is a variation of StratifiedKFold attempts to return stratified folds with non-overlapping groups. The folds are made by preserving the percentage of samples for each class. The same group will not appear in two different folds (the number of distinct groups has to be at least equal to the number of folds). The difference between GroupKFold and StratifiedGroupKFold is that the former attempts to create balanced folds such that the number of distinct groups is approximately the same in each fold, whereas StratifiedGroupKFold attempts to create folds which preserve the percentage of samples for each class as much as possible given the constraint of non-overlapping groups between splits. Read more in the :ref:`User Guide `.", + "description": "Stratified K-Folds iterator variant with non-overlapping groups.\n\nThis cross-validation object is a variation of StratifiedKFold attempts to\nreturn stratified folds with non-overlapping groups. The folds are made by\npreserving the percentage of samples for each class.\n\nThe same group will not appear in two different folds (the number of\ndistinct groups has to be at least equal to the number of folds).\n\nThe difference between GroupKFold and StratifiedGroupKFold is that\nthe former attempts to create balanced folds such that the number of\ndistinct groups is approximately the same in each fold, whereas\nStratifiedGroupKFold attempts to create folds which preserve the\npercentage of samples for each class as much as possible given the\nconstraint of non-overlapping groups between splits.\n\nRead more in the :ref:`User Guide `.", "docstring": "Stratified K-Folds iterator variant with non-overlapping groups.\n\n This cross-validation object is a variation of StratifiedKFold attempts to\n return stratified folds with non-overlapping groups. 
The folds are made by\n preserving the percentage of samples for each class.\n\n The same group will not appear in two different folds (the number of\n distinct groups has to be at least equal to the number of folds).\n\n The difference between GroupKFold and StratifiedGroupKFold is that\n the former attempts to create balanced folds such that the number of\n distinct groups is approximately the same in each fold, whereas\n StratifiedGroupKFold attempts to create folds which preserve the\n percentage of samples for each class as much as possible given the\n constraint of non-overlapping groups between splits.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n n_splits : int, default=5\n Number of folds. Must be at least 2.\n\n shuffle : bool, default=False\n Whether to shuffle each class's samples before splitting into batches.\n Note that the samples within each split will not be shuffled.\n This implementation can only shuffle groups that have approximately the\n same y distribution, no global shuffle will be performed.\n\n random_state : int or RandomState instance, default=None\n When `shuffle` is True, `random_state` affects the ordering of the\n indices, which controls the randomness of each fold for each class.\n Otherwise, leave `random_state` as `None`.\n Pass an int for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.model_selection import StratifiedGroupKFold\n >>> X = np.ones((17, 2))\n >>> y = np.array([0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0])\n >>> groups = np.array([1, 1, 2, 2, 3, 3, 3, 4, 5, 5, 5, 5, 6, 6, 7, 8, 8])\n >>> cv = StratifiedGroupKFold(n_splits=3)\n >>> for train_idxs, test_idxs in cv.split(X, y, groups):\n ... print(\"TRAIN:\", groups[train_idxs])\n ... print(\" \", y[train_idxs])\n ... print(\" TEST:\", groups[test_idxs])\n ... print(\" \", y[test_idxs])\n TRAIN: [1 1 2 2 4 5 5 5 5 8 8]\n [0 0 1 1 1 0 0 0 0 0 0]\n TEST: [3 3 3 6 6 7]\n [1 1 1 0 0 0]\n TRAIN: [3 3 3 4 5 5 5 5 6 6 7]\n [1 1 1 1 0 0 0 0 0 0 0]\n TEST: [1 1 2 2 8 8]\n [0 0 1 1 0 0]\n TRAIN: [1 1 2 2 3 3 3 6 6 7 8 8]\n [0 0 1 1 1 1 1 0 0 0 0 0]\n TEST: [4 5 5 5 5]\n [1 0 0 0 0]\n\n Notes\n -----\n The implementation is designed to:\n\n * Mimic the behavior of StratifiedKFold as much as possible for trivial\n groups (e.g. when each group contains only one sample).\n * Be invariant to class label: relabelling ``y = [\"Happy\", \"Sad\"]`` to\n ``y = [1, 0]`` should not change the indices generated.\n * Stratify based on samples as much as possible while keeping\n non-overlapping groups constraint. That means that in some cases when\n there is a small number of groups containing a large number of samples\n the stratification will not be possible and the behavior will be close\n to GroupKFold.\n\n See also\n --------\n StratifiedKFold: Takes class information into account to build folds which\n retain class distributions (for binary or multiclass classification\n tasks).\n\n GroupKFold: K-fold iterator variant with non-overlapping groups.\n ", "source_code": "\n\nclass StratifiedGroupKFold(_BaseKFold):\n \"\"\"Stratified K-Folds iterator variant with non-overlapping groups.\n\n This cross-validation object is a variation of StratifiedKFold attempts to\n return stratified folds with non-overlapping groups. 
The folds are made by\n preserving the percentage of samples for each class.\n\n The same group will not appear in two different folds (the number of\n distinct groups has to be at least equal to the number of folds).\n\n The difference between GroupKFold and StratifiedGroupKFold is that\n the former attempts to create balanced folds such that the number of\n distinct groups is approximately the same in each fold, whereas\n StratifiedGroupKFold attempts to create folds which preserve the\n percentage of samples for each class as much as possible given the\n constraint of non-overlapping groups between splits.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n n_splits : int, default=5\n Number of folds. Must be at least 2.\n\n shuffle : bool, default=False\n Whether to shuffle each class's samples before splitting into batches.\n Note that the samples within each split will not be shuffled.\n This implementation can only shuffle groups that have approximately the\n same y distribution, no global shuffle will be performed.\n\n random_state : int or RandomState instance, default=None\n When `shuffle` is True, `random_state` affects the ordering of the\n indices, which controls the randomness of each fold for each class.\n Otherwise, leave `random_state` as `None`.\n Pass an int for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.model_selection import StratifiedGroupKFold\n >>> X = np.ones((17, 2))\n >>> y = np.array([0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0])\n >>> groups = np.array([1, 1, 2, 2, 3, 3, 3, 4, 5, 5, 5, 5, 6, 6, 7, 8, 8])\n >>> cv = StratifiedGroupKFold(n_splits=3)\n >>> for train_idxs, test_idxs in cv.split(X, y, groups):\n ... print(\"TRAIN:\", groups[train_idxs])\n ... print(\" \", y[train_idxs])\n ... print(\" TEST:\", groups[test_idxs])\n ... print(\" \", y[test_idxs])\n TRAIN: [1 1 2 2 4 5 5 5 5 8 8]\n [0 0 1 1 1 0 0 0 0 0 0]\n TEST: [3 3 3 6 6 7]\n [1 1 1 0 0 0]\n TRAIN: [3 3 3 4 5 5 5 5 6 6 7]\n [1 1 1 1 0 0 0 0 0 0 0]\n TEST: [1 1 2 2 8 8]\n [0 0 1 1 0 0]\n TRAIN: [1 1 2 2 3 3 3 6 6 7 8 8]\n [0 0 1 1 1 1 1 0 0 0 0 0]\n TEST: [4 5 5 5 5]\n [1 0 0 0 0]\n\n Notes\n -----\n The implementation is designed to:\n\n * Mimic the behavior of StratifiedKFold as much as possible for trivial\n groups (e.g. when each group contains only one sample).\n * Be invariant to class label: relabelling ``y = [\"Happy\", \"Sad\"]`` to\n ``y = [1, 0]`` should not change the indices generated.\n * Stratify based on samples as much as possible while keeping\n non-overlapping groups constraint. 
That means that in some cases when\n there is a small number of groups containing a large number of samples\n the stratification will not be possible and the behavior will be close\n to GroupKFold.\n\n See also\n --------\n StratifiedKFold: Takes class information into account to build folds which\n retain class distributions (for binary or multiclass classification\n tasks).\n\n GroupKFold: K-fold iterator variant with non-overlapping groups.\n \"\"\"\n \n def __init__(self, n_splits=5, shuffle=False, random_state=None):\n super().__init__(n_splits=n_splits, shuffle=shuffle, random_state=random_state)\n \n def _iter_test_indices(self, X, y, groups):\n rng = check_random_state(self.random_state)\n y = np.asarray(y)\n type_of_target_y = type_of_target(y)\n allowed_target_types = ('binary', 'multiclass')\n if type_of_target_y not in allowed_target_types:\n raise ValueError('Supported target types are: {}. Got {!r} instead.'.format(allowed_target_types, type_of_target_y))\n y = column_or_1d(y)\n (_, y_inv, y_cnt) = np.unique(y, return_inverse=True, return_counts=True)\n if np.all(self.n_splits > y_cnt):\n raise ValueError('n_splits=%d cannot be greater than the number of members in each class.' % self.n_splits)\n n_smallest_class = np.min(y_cnt)\n if self.n_splits > n_smallest_class:\n warnings.warn('The least populated class in y has only %d members, which is less than n_splits=%d.' % (n_smallest_class, self.n_splits), UserWarning)\n n_classes = len(y_cnt)\n (_, groups_inv, groups_cnt) = np.unique(groups, return_inverse=True, return_counts=True)\n y_counts_per_group = np.zeros((len(groups_cnt), n_classes))\n for (class_idx, group_idx) in zip(y_inv, groups_inv):\n y_counts_per_group[group_idx, class_idx] += 1\n y_counts_per_fold = np.zeros((self.n_splits, n_classes))\n groups_per_fold = defaultdict(set)\n if self.shuffle:\n rng.shuffle(y_counts_per_group)\n sorted_groups_idx = np.argsort(-np.std(y_counts_per_group, axis=1), kind='mergesort')\n for group_idx in sorted_groups_idx:\n group_y_counts = y_counts_per_group[group_idx]\n best_fold = self._find_best_fold(y_counts_per_fold=y_counts_per_fold, y_cnt=y_cnt, group_y_counts=group_y_counts)\n y_counts_per_fold[best_fold] += group_y_counts\n groups_per_fold[best_fold].add(group_idx)\n for i in range(self.n_splits):\n test_indices = [idx for (idx, group_idx) in enumerate(groups_inv) if group_idx in groups_per_fold[i]]\n yield test_indices\n \n def _find_best_fold(self, y_counts_per_fold, y_cnt, group_y_counts):\n best_fold = None\n min_eval = np.inf\n min_samples_in_fold = np.inf\n for i in range(self.n_splits):\n y_counts_per_fold[i] += group_y_counts\n std_per_class = np.std(y_counts_per_fold / y_cnt.reshape(1, -1), axis=0)\n y_counts_per_fold[i] -= group_y_counts\n fold_eval = np.mean(std_per_class)\n samples_in_fold = np.sum(y_counts_per_fold[i])\n is_current_fold_better = fold_eval < min_eval or np.isclose(fold_eval, min_eval) and samples_in_fold < min_samples_in_fold\n if is_current_fold_better:\n min_eval = fold_eval\n min_samples_in_fold = samples_in_fold\n best_fold = i\n return best_fold\n" }, @@ -24917,7 +24999,7 @@ "sklearn.model_selection._split.StratifiedKFold.split" ], "is_public": true, - "description": "Stratified K-Folds cross-validator.\n\nProvides train/test indices to split data in train/test sets. This cross-validation object is a variation of KFold that returns stratified folds. The folds are made by preserving the percentage of samples for each class. 
Read more in the :ref:`User Guide `.", + "description": "Stratified K-Folds cross-validator.\n\nProvides train/test indices to split data in train/test sets.\n\nThis cross-validation object is a variation of KFold that returns\nstratified folds. The folds are made by preserving the percentage of\nsamples for each class.\n\nRead more in the :ref:`User Guide `.", "docstring": "Stratified K-Folds cross-validator.\n\n Provides train/test indices to split data in train/test sets.\n\n This cross-validation object is a variation of KFold that returns\n stratified folds. The folds are made by preserving the percentage of\n samples for each class.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n n_splits : int, default=5\n Number of folds. Must be at least 2.\n\n .. versionchanged:: 0.22\n ``n_splits`` default value changed from 3 to 5.\n\n shuffle : bool, default=False\n Whether to shuffle each class's samples before splitting into batches.\n Note that the samples within each split will not be shuffled.\n\n random_state : int, RandomState instance or None, default=None\n When `shuffle` is True, `random_state` affects the ordering of the\n indices, which controls the randomness of each fold for each class.\n Otherwise, leave `random_state` as `None`.\n Pass an int for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.model_selection import StratifiedKFold\n >>> X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]])\n >>> y = np.array([0, 0, 1, 1])\n >>> skf = StratifiedKFold(n_splits=2)\n >>> skf.get_n_splits(X, y)\n 2\n >>> print(skf)\n StratifiedKFold(n_splits=2, random_state=None, shuffle=False)\n >>> for train_index, test_index in skf.split(X, y):\n ... print(\"TRAIN:\", train_index, \"TEST:\", test_index)\n ... X_train, X_test = X[train_index], X[test_index]\n ... y_train, y_test = y[train_index], y[test_index]\n TRAIN: [1 3] TEST: [0 2]\n TRAIN: [0 2] TEST: [1 3]\n\n Notes\n -----\n The implementation is designed to:\n\n * Generate test sets such that all contain the same distribution of\n classes, or as close as possible.\n * Be invariant to class label: relabelling ``y = [\"Happy\", \"Sad\"]`` to\n ``y = [1, 0]`` should not change the indices generated.\n * Preserve order dependencies in the dataset ordering, when\n ``shuffle=False``: all samples from class k in some test set were\n contiguous in y, or separated in y by samples from classes other than k.\n * Generate test sets where the smallest and largest differ by at most one\n sample.\n\n .. versionchanged:: 0.22\n The previous implementation did not follow the last constraint.\n\n See Also\n --------\n RepeatedStratifiedKFold : Repeats Stratified K-Fold n times.\n ", "source_code": "\n\nclass StratifiedKFold(_BaseKFold):\n \"\"\"Stratified K-Folds cross-validator.\n\n Provides train/test indices to split data in train/test sets.\n\n This cross-validation object is a variation of KFold that returns\n stratified folds. The folds are made by preserving the percentage of\n samples for each class.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n n_splits : int, default=5\n Number of folds. Must be at least 2.\n\n .. 
versionchanged:: 0.22\n ``n_splits`` default value changed from 3 to 5.\n\n shuffle : bool, default=False\n Whether to shuffle each class's samples before splitting into batches.\n Note that the samples within each split will not be shuffled.\n\n random_state : int, RandomState instance or None, default=None\n When `shuffle` is True, `random_state` affects the ordering of the\n indices, which controls the randomness of each fold for each class.\n Otherwise, leave `random_state` as `None`.\n Pass an int for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.model_selection import StratifiedKFold\n >>> X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]])\n >>> y = np.array([0, 0, 1, 1])\n >>> skf = StratifiedKFold(n_splits=2)\n >>> skf.get_n_splits(X, y)\n 2\n >>> print(skf)\n StratifiedKFold(n_splits=2, random_state=None, shuffle=False)\n >>> for train_index, test_index in skf.split(X, y):\n ... print(\"TRAIN:\", train_index, \"TEST:\", test_index)\n ... X_train, X_test = X[train_index], X[test_index]\n ... y_train, y_test = y[train_index], y[test_index]\n TRAIN: [1 3] TEST: [0 2]\n TRAIN: [0 2] TEST: [1 3]\n\n Notes\n -----\n The implementation is designed to:\n\n * Generate test sets such that all contain the same distribution of\n classes, or as close as possible.\n * Be invariant to class label: relabelling ``y = [\"Happy\", \"Sad\"]`` to\n ``y = [1, 0]`` should not change the indices generated.\n * Preserve order dependencies in the dataset ordering, when\n ``shuffle=False``: all samples from class k in some test set were\n contiguous in y, or separated in y by samples from classes other than k.\n * Generate test sets where the smallest and largest differ by at most one\n sample.\n\n .. versionchanged:: 0.22\n The previous implementation did not follow the last constraint.\n\n See Also\n --------\n RepeatedStratifiedKFold : Repeats Stratified K-Fold n times.\n \"\"\"\n \n def __init__(self, n_splits=5, *, shuffle=False, random_state=None):\n super().__init__(n_splits=n_splits, shuffle=shuffle, random_state=random_state)\n \n def _make_test_folds(self, X, y=None):\n rng = check_random_state(self.random_state)\n y = np.asarray(y)\n type_of_target_y = type_of_target(y)\n allowed_target_types = ('binary', 'multiclass')\n if type_of_target_y not in allowed_target_types:\n raise ValueError('Supported target types are: {}. Got {!r} instead.'.format(allowed_target_types, type_of_target_y))\n y = column_or_1d(y)\n (_, y_idx, y_inv) = np.unique(y, return_index=True, return_inverse=True)\n (_, class_perm) = np.unique(y_idx, return_inverse=True)\n y_encoded = class_perm[y_inv]\n n_classes = len(y_idx)\n y_counts = np.bincount(y_encoded)\n min_groups = np.min(y_counts)\n if np.all(self.n_splits > y_counts):\n raise ValueError('n_splits=%d cannot be greater than the number of members in each class.' % self.n_splits)\n if self.n_splits > min_groups:\n warnings.warn('The least populated class in y has only %d members, which is less than n_splits=%d.' 
% (min_groups, self.n_splits), UserWarning)\n y_order = np.sort(y_encoded)\n allocation = np.asarray([np.bincount(y_order[i::self.n_splits], minlength=n_classes) for i in range(self.n_splits)])\n test_folds = np.empty(len(y), dtype='i')\n for k in range(n_classes):\n folds_for_class = np.arange(self.n_splits).repeat(allocation[:, k])\n if self.shuffle:\n rng.shuffle(folds_for_class)\n test_folds[y_encoded == k] = folds_for_class\n return test_folds\n \n def _iter_test_masks(self, X, y=None, groups=None):\n test_folds = self._make_test_folds(X, y)\n for i in range(self.n_splits):\n yield test_folds == i\n \n def split(self, X, y, groups=None):\n \"\"\"Generate indices to split data into training and test set.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training data, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n Note that providing ``y`` is sufficient to generate the splits and\n hence ``np.zeros(n_samples)`` may be used as a placeholder for\n ``X`` instead of actual training data.\n\n y : array-like of shape (n_samples,)\n The target variable for supervised learning problems.\n Stratification is done based on the y labels.\n\n groups : object\n Always ignored, exists for compatibility.\n\n Yields\n ------\n train : ndarray\n The training set indices for that split.\n\n test : ndarray\n The testing set indices for that split.\n\n Notes\n -----\n Randomized CV splitters may return different results for each call of\n split. You can make the results identical by setting `random_state`\n to an integer.\n \"\"\"\n y = check_array(y, ensure_2d=False, dtype=None)\n return super().split(X, y, groups)\n" }, @@ -24932,7 +25014,7 @@ "sklearn.model_selection._split.StratifiedShuffleSplit.split" ], "is_public": true, - "description": "Stratified ShuffleSplit cross-validator\n\nProvides train/test indices to split data in train/test sets. This cross-validation object is a merge of StratifiedKFold and ShuffleSplit, which returns stratified randomized folds. The folds are made by preserving the percentage of samples for each class. Note: like the ShuffleSplit strategy, stratified random splits do not guarantee that all folds will be different, although this is still very likely for sizeable datasets. Read more in the :ref:`User Guide `.", + "description": "Stratified ShuffleSplit cross-validator\n\nProvides train/test indices to split data in train/test sets.\n\nThis cross-validation object is a merge of StratifiedKFold and\nShuffleSplit, which returns stratified randomized folds. The folds\nare made by preserving the percentage of samples for each class.\n\nNote: like the ShuffleSplit strategy, stratified random splits\ndo not guarantee that all folds will be different, although this is\nstill very likely for sizeable datasets.\n\nRead more in the :ref:`User Guide `.", "docstring": "Stratified ShuffleSplit cross-validator\n\n Provides train/test indices to split data in train/test sets.\n\n This cross-validation object is a merge of StratifiedKFold and\n ShuffleSplit, which returns stratified randomized folds. 
The folds\n are made by preserving the percentage of samples for each class.\n\n Note: like the ShuffleSplit strategy, stratified random splits\n do not guarantee that all folds will be different, although this is\n still very likely for sizeable datasets.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n n_splits : int, default=10\n Number of re-shuffling & splitting iterations.\n\n test_size : float or int, default=None\n If float, should be between 0.0 and 1.0 and represent the proportion\n of the dataset to include in the test split. If int, represents the\n absolute number of test samples. If None, the value is set to the\n complement of the train size. If ``train_size`` is also None, it will\n be set to 0.1.\n\n train_size : float or int, default=None\n If float, should be between 0.0 and 1.0 and represent the\n proportion of the dataset to include in the train split. If\n int, represents the absolute number of train samples. If None,\n the value is automatically set to the complement of the test size.\n\n random_state : int, RandomState instance or None, default=None\n Controls the randomness of the training and testing indices produced.\n Pass an int for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.model_selection import StratifiedShuffleSplit\n >>> X = np.array([[1, 2], [3, 4], [1, 2], [3, 4], [1, 2], [3, 4]])\n >>> y = np.array([0, 0, 0, 1, 1, 1])\n >>> sss = StratifiedShuffleSplit(n_splits=5, test_size=0.5, random_state=0)\n >>> sss.get_n_splits(X, y)\n 5\n >>> print(sss)\n StratifiedShuffleSplit(n_splits=5, random_state=0, ...)\n >>> for train_index, test_index in sss.split(X, y):\n ... print(\"TRAIN:\", train_index, \"TEST:\", test_index)\n ... X_train, X_test = X[train_index], X[test_index]\n ... y_train, y_test = y[train_index], y[test_index]\n TRAIN: [5 2 3] TEST: [4 1 0]\n TRAIN: [5 1 4] TEST: [0 2 3]\n TRAIN: [5 0 2] TEST: [4 3 1]\n TRAIN: [4 1 0] TEST: [2 3 5]\n TRAIN: [0 5 1] TEST: [3 4 2]\n ", "source_code": "\n\nclass StratifiedShuffleSplit(BaseShuffleSplit):\n \"\"\"Stratified ShuffleSplit cross-validator\n\n Provides train/test indices to split data in train/test sets.\n\n This cross-validation object is a merge of StratifiedKFold and\n ShuffleSplit, which returns stratified randomized folds. The folds\n are made by preserving the percentage of samples for each class.\n\n Note: like the ShuffleSplit strategy, stratified random splits\n do not guarantee that all folds will be different, although this is\n still very likely for sizeable datasets.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n n_splits : int, default=10\n Number of re-shuffling & splitting iterations.\n\n test_size : float or int, default=None\n If float, should be between 0.0 and 1.0 and represent the proportion\n of the dataset to include in the test split. If int, represents the\n absolute number of test samples. If None, the value is set to the\n complement of the train size. If ``train_size`` is also None, it will\n be set to 0.1.\n\n train_size : float or int, default=None\n If float, should be between 0.0 and 1.0 and represent the\n proportion of the dataset to include in the train split. If\n int, represents the absolute number of train samples. 
If None,\n the value is automatically set to the complement of the test size.\n\n random_state : int, RandomState instance or None, default=None\n Controls the randomness of the training and testing indices produced.\n Pass an int for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.model_selection import StratifiedShuffleSplit\n >>> X = np.array([[1, 2], [3, 4], [1, 2], [3, 4], [1, 2], [3, 4]])\n >>> y = np.array([0, 0, 0, 1, 1, 1])\n >>> sss = StratifiedShuffleSplit(n_splits=5, test_size=0.5, random_state=0)\n >>> sss.get_n_splits(X, y)\n 5\n >>> print(sss)\n StratifiedShuffleSplit(n_splits=5, random_state=0, ...)\n >>> for train_index, test_index in sss.split(X, y):\n ... print(\"TRAIN:\", train_index, \"TEST:\", test_index)\n ... X_train, X_test = X[train_index], X[test_index]\n ... y_train, y_test = y[train_index], y[test_index]\n TRAIN: [5 2 3] TEST: [4 1 0]\n TRAIN: [5 1 4] TEST: [0 2 3]\n TRAIN: [5 0 2] TEST: [4 3 1]\n TRAIN: [4 1 0] TEST: [2 3 5]\n TRAIN: [0 5 1] TEST: [3 4 2]\n \"\"\"\n \n def __init__(self, n_splits=10, *, test_size=None, train_size=None, random_state=None):\n super().__init__(n_splits=n_splits, test_size=test_size, train_size=train_size, random_state=random_state)\n self._default_test_size = 0.1\n \n def _iter_indices(self, X, y, groups=None):\n n_samples = _num_samples(X)\n y = check_array(y, ensure_2d=False, dtype=None)\n (n_train, n_test) = _validate_shuffle_split(n_samples, self.test_size, self.train_size, default_test_size=self._default_test_size)\n if y.ndim == 2:\n y = np.array([' '.join(row.astype('str')) for row in y])\n (classes, y_indices) = np.unique(y, return_inverse=True)\n n_classes = classes.shape[0]\n class_counts = np.bincount(y_indices)\n if np.min(class_counts) < 2:\n raise ValueError('The least populated class in y has only 1 member, which is too few. 
The minimum number of groups for any class cannot be less than 2.')\n if n_train < n_classes:\n raise ValueError('The train_size = %d should be greater or equal to the number of classes = %d' % (n_train, n_classes))\n if n_test < n_classes:\n raise ValueError('The test_size = %d should be greater or equal to the number of classes = %d' % (n_test, n_classes))\n class_indices = np.split(np.argsort(y_indices, kind='mergesort'), np.cumsum(class_counts)[:-1])\n rng = check_random_state(self.random_state)\n for _ in range(self.n_splits):\n n_i = _approximate_mode(class_counts, n_train, rng)\n class_counts_remaining = class_counts - n_i\n t_i = _approximate_mode(class_counts_remaining, n_test, rng)\n train = []\n test = []\n for i in range(n_classes):\n permutation = rng.permutation(class_counts[i])\n perm_indices_class_i = class_indices[i].take(permutation, mode='clip')\n train.extend(perm_indices_class_i[:n_i[i]])\n test.extend(perm_indices_class_i[n_i[i]:n_i[i] + t_i[i]])\n train = rng.permutation(train)\n test = rng.permutation(test)\n yield (train, test)\n \n def split(self, X, y, groups=None):\n \"\"\"Generate indices to split data into training and test set.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training data, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n Note that providing ``y`` is sufficient to generate the splits and\n hence ``np.zeros(n_samples)`` may be used as a placeholder for\n ``X`` instead of actual training data.\n\n y : array-like of shape (n_samples,) or (n_samples, n_labels)\n The target variable for supervised learning problems.\n Stratification is done based on the y labels.\n\n groups : object\n Always ignored, exists for compatibility.\n\n Yields\n ------\n train : ndarray\n The training set indices for that split.\n\n test : ndarray\n The testing set indices for that split.\n\n Notes\n -----\n Randomized CV splitters may return different results for each call of\n split. You can make the results identical by setting `random_state`\n to an integer.\n \"\"\"\n y = check_array(y, ensure_2d=False, dtype=None)\n return super().split(X, y, groups)\n" }, @@ -24946,7 +25028,7 @@ "sklearn.model_selection._split.TimeSeriesSplit.split" ], "is_public": true, - "description": "Time Series cross-validator\n\nProvides train/test indices to split time series data samples that are observed at fixed time intervals, in train/test sets. In each split, test indices must be higher than before, and thus shuffling in cross validator is inappropriate. This cross-validation object is a variation of :class:`KFold`. In the kth split, it returns first k folds as train set and the (k+1)th fold as test set. Note that unlike standard cross-validation methods, successive training sets are supersets of those that come before them. Read more in the :ref:`User Guide `. .. versionadded:: 0.18", + "description": "Time Series cross-validator\n\nProvides train/test indices to split time series data samples\nthat are observed at fixed time intervals, in train/test sets.\nIn each split, test indices must be higher than before, and thus shuffling\nin cross validator is inappropriate.\n\nThis cross-validation object is a variation of :class:`KFold`.\nIn the kth split, it returns first k folds as train set and the\n(k+1)th fold as test set.\n\nNote that unlike standard cross-validation methods, successive\ntraining sets are supersets of those that come before them.\n\nRead more in the :ref:`User Guide `.\n\n.. 
versionadded:: 0.18", "docstring": "Time Series cross-validator\n\n Provides train/test indices to split time series data samples\n that are observed at fixed time intervals, in train/test sets.\n In each split, test indices must be higher than before, and thus shuffling\n in cross validator is inappropriate.\n\n This cross-validation object is a variation of :class:`KFold`.\n In the kth split, it returns first k folds as train set and the\n (k+1)th fold as test set.\n\n Note that unlike standard cross-validation methods, successive\n training sets are supersets of those that come before them.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.18\n\n Parameters\n ----------\n n_splits : int, default=5\n Number of splits. Must be at least 2.\n\n .. versionchanged:: 0.22\n ``n_splits`` default value changed from 3 to 5.\n\n max_train_size : int, default=None\n Maximum size for a single training set.\n\n test_size : int, default=None\n Used to limit the size of the test set. Defaults to\n ``n_samples // (n_splits + 1)``, which is the maximum allowed value\n with ``gap=0``.\n\n .. versionadded:: 0.24\n\n gap : int, default=0\n Number of samples to exclude from the end of each train set before\n the test set.\n\n .. versionadded:: 0.24\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.model_selection import TimeSeriesSplit\n >>> X = np.array([[1, 2], [3, 4], [1, 2], [3, 4], [1, 2], [3, 4]])\n >>> y = np.array([1, 2, 3, 4, 5, 6])\n >>> tscv = TimeSeriesSplit()\n >>> print(tscv)\n TimeSeriesSplit(gap=0, max_train_size=None, n_splits=5, test_size=None)\n >>> for train_index, test_index in tscv.split(X):\n ... print(\"TRAIN:\", train_index, \"TEST:\", test_index)\n ... X_train, X_test = X[train_index], X[test_index]\n ... y_train, y_test = y[train_index], y[test_index]\n TRAIN: [0] TEST: [1]\n TRAIN: [0 1] TEST: [2]\n TRAIN: [0 1 2] TEST: [3]\n TRAIN: [0 1 2 3] TEST: [4]\n TRAIN: [0 1 2 3 4] TEST: [5]\n >>> # Fix test_size to 2 with 12 samples\n >>> X = np.random.randn(12, 2)\n >>> y = np.random.randint(0, 2, 12)\n >>> tscv = TimeSeriesSplit(n_splits=3, test_size=2)\n >>> for train_index, test_index in tscv.split(X):\n ... print(\"TRAIN:\", train_index, \"TEST:\", test_index)\n ... X_train, X_test = X[train_index], X[test_index]\n ... y_train, y_test = y[train_index], y[test_index]\n TRAIN: [0 1 2 3 4 5] TEST: [6 7]\n TRAIN: [0 1 2 3 4 5 6 7] TEST: [8 9]\n TRAIN: [0 1 2 3 4 5 6 7 8 9] TEST: [10 11]\n >>> # Add in a 2 period gap\n >>> tscv = TimeSeriesSplit(n_splits=3, test_size=2, gap=2)\n >>> for train_index, test_index in tscv.split(X):\n ... print(\"TRAIN:\", train_index, \"TEST:\", test_index)\n ... X_train, X_test = X[train_index], X[test_index]\n ... 
y_train, y_test = y[train_index], y[test_index]\n TRAIN: [0 1 2 3] TEST: [6 7]\n TRAIN: [0 1 2 3 4 5] TEST: [8 9]\n TRAIN: [0 1 2 3 4 5 6 7] TEST: [10 11]\n\n Notes\n -----\n The training set has size ``i * n_samples // (n_splits + 1)\n + n_samples % (n_splits + 1)`` in the ``i`` th split,\n with a test set of size ``n_samples//(n_splits + 1)`` by default,\n where ``n_samples`` is the number of samples.\n ", "source_code": "\n\nclass TimeSeriesSplit(_BaseKFold):\n \"\"\"Time Series cross-validator\n\n Provides train/test indices to split time series data samples\n that are observed at fixed time intervals, in train/test sets.\n In each split, test indices must be higher than before, and thus shuffling\n in cross validator is inappropriate.\n\n This cross-validation object is a variation of :class:`KFold`.\n In the kth split, it returns first k folds as train set and the\n (k+1)th fold as test set.\n\n Note that unlike standard cross-validation methods, successive\n training sets are supersets of those that come before them.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.18\n\n Parameters\n ----------\n n_splits : int, default=5\n Number of splits. Must be at least 2.\n\n .. versionchanged:: 0.22\n ``n_splits`` default value changed from 3 to 5.\n\n max_train_size : int, default=None\n Maximum size for a single training set.\n\n test_size : int, default=None\n Used to limit the size of the test set. Defaults to\n ``n_samples // (n_splits + 1)``, which is the maximum allowed value\n with ``gap=0``.\n\n .. versionadded:: 0.24\n\n gap : int, default=0\n Number of samples to exclude from the end of each train set before\n the test set.\n\n .. versionadded:: 0.24\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.model_selection import TimeSeriesSplit\n >>> X = np.array([[1, 2], [3, 4], [1, 2], [3, 4], [1, 2], [3, 4]])\n >>> y = np.array([1, 2, 3, 4, 5, 6])\n >>> tscv = TimeSeriesSplit()\n >>> print(tscv)\n TimeSeriesSplit(gap=0, max_train_size=None, n_splits=5, test_size=None)\n >>> for train_index, test_index in tscv.split(X):\n ... print(\"TRAIN:\", train_index, \"TEST:\", test_index)\n ... X_train, X_test = X[train_index], X[test_index]\n ... y_train, y_test = y[train_index], y[test_index]\n TRAIN: [0] TEST: [1]\n TRAIN: [0 1] TEST: [2]\n TRAIN: [0 1 2] TEST: [3]\n TRAIN: [0 1 2 3] TEST: [4]\n TRAIN: [0 1 2 3 4] TEST: [5]\n >>> # Fix test_size to 2 with 12 samples\n >>> X = np.random.randn(12, 2)\n >>> y = np.random.randint(0, 2, 12)\n >>> tscv = TimeSeriesSplit(n_splits=3, test_size=2)\n >>> for train_index, test_index in tscv.split(X):\n ... print(\"TRAIN:\", train_index, \"TEST:\", test_index)\n ... X_train, X_test = X[train_index], X[test_index]\n ... y_train, y_test = y[train_index], y[test_index]\n TRAIN: [0 1 2 3 4 5] TEST: [6 7]\n TRAIN: [0 1 2 3 4 5 6 7] TEST: [8 9]\n TRAIN: [0 1 2 3 4 5 6 7 8 9] TEST: [10 11]\n >>> # Add in a 2 period gap\n >>> tscv = TimeSeriesSplit(n_splits=3, test_size=2, gap=2)\n >>> for train_index, test_index in tscv.split(X):\n ... print(\"TRAIN:\", train_index, \"TEST:\", test_index)\n ... X_train, X_test = X[train_index], X[test_index]\n ... 
y_train, y_test = y[train_index], y[test_index]\n TRAIN: [0 1 2 3] TEST: [6 7]\n TRAIN: [0 1 2 3 4 5] TEST: [8 9]\n TRAIN: [0 1 2 3 4 5 6 7] TEST: [10 11]\n\n Notes\n -----\n The training set has size ``i * n_samples // (n_splits + 1)\n + n_samples % (n_splits + 1)`` in the ``i`` th split,\n with a test set of size ``n_samples//(n_splits + 1)`` by default,\n where ``n_samples`` is the number of samples.\n \"\"\"\n \n def __init__(self, n_splits=5, *, max_train_size=None, test_size=None, gap=0):\n super().__init__(n_splits, shuffle=False, random_state=None)\n self.max_train_size = max_train_size\n self.test_size = test_size\n self.gap = gap\n \n def split(self, X, y=None, groups=None):\n \"\"\"Generate indices to split data into training and test set.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training data, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n y : array-like of shape (n_samples,)\n Always ignored, exists for compatibility.\n\n groups : array-like of shape (n_samples,)\n Always ignored, exists for compatibility.\n\n Yields\n ------\n train : ndarray\n The training set indices for that split.\n\n test : ndarray\n The testing set indices for that split.\n \"\"\"\n (X, y, groups) = indexable(X, y, groups)\n n_samples = _num_samples(X)\n n_splits = self.n_splits\n n_folds = n_splits + 1\n gap = self.gap\n test_size = self.test_size if self.test_size is not None else n_samples // n_folds\n if n_folds > n_samples:\n raise ValueError(f'Cannot have number of folds={n_folds} greater than the number of samples={n_samples}.')\n if n_samples - gap - test_size * n_splits <= 0:\n raise ValueError(f'Too many splits={n_splits} for number of samples={n_samples} with test_size={test_size} and gap={gap}.')\n indices = np.arange(n_samples)\n test_starts = range(n_samples - n_splits * test_size, n_samples, test_size)\n for test_start in test_starts:\n train_end = test_start - gap\n if self.max_train_size and self.max_train_size < train_end:\n yield (indices[train_end - self.max_train_size:train_end], indices[test_start:test_start + test_size])\n else:\n yield (indices[:train_end], indices[test_start:test_start + test_size])\n" }, @@ -24992,7 +25074,7 @@ "sklearn.model_selection._split._RepeatedSplits.__repr__" ], "is_public": false, - "description": "Repeated splits for an arbitrary randomized CV splitter.\n\nRepeats splits for cross-validators n times with different randomization in each repetition.", + "description": "Repeated splits for an arbitrary randomized CV splitter.\n\nRepeats splits for cross-validators n times with different randomization\nin each repetition.", "docstring": "Repeated splits for an arbitrary randomized CV splitter.\n\n Repeats splits for cross-validators n times with different randomization\n in each repetition.\n\n Parameters\n ----------\n cv : callable\n Cross-validator class.\n\n n_repeats : int, default=10\n Number of times cross-validator needs to be repeated.\n\n random_state : int, RandomState instance or None, default=None\n Passes `random_state` to the arbitrary repeating cross validator.\n Pass an int for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n **cvargs : additional params\n Constructor parameters for cv. 
Must not contain random_state\n and shuffle.\n ", "source_code": "\n\nclass _RepeatedSplits(metaclass=ABCMeta):\n \"\"\"Repeated splits for an arbitrary randomized CV splitter.\n\n Repeats splits for cross-validators n times with different randomization\n in each repetition.\n\n Parameters\n ----------\n cv : callable\n Cross-validator class.\n\n n_repeats : int, default=10\n Number of times cross-validator needs to be repeated.\n\n random_state : int, RandomState instance or None, default=None\n Passes `random_state` to the arbitrary repeating cross validator.\n Pass an int for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n **cvargs : additional params\n Constructor parameters for cv. Must not contain random_state\n and shuffle.\n \"\"\"\n \n def __init__(self, cv, *, n_repeats=10, random_state=None, **cvargs):\n if not isinstance(n_repeats, numbers.Integral):\n raise ValueError('Number of repetitions must be of Integral type.')\n if n_repeats <= 0:\n raise ValueError('Number of repetitions must be greater than 0.')\n if any((key in cvargs for key in ('random_state', 'shuffle'))):\n raise ValueError('cvargs must not contain random_state or shuffle.')\n self.cv = cv\n self.n_repeats = n_repeats\n self.random_state = random_state\n self.cvargs = cvargs\n \n def split(self, X, y=None, groups=None):\n \"\"\"Generates indices to split data into training and test set.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training data, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n y : array-like of shape (n_samples,)\n The target variable for supervised learning problems.\n\n groups : array-like of shape (n_samples,), default=None\n Group labels for the samples used while splitting the dataset into\n train/test set.\n\n Yields\n ------\n train : ndarray\n The training set indices for that split.\n\n test : ndarray\n The testing set indices for that split.\n \"\"\"\n n_repeats = self.n_repeats\n rng = check_random_state(self.random_state)\n for idx in range(n_repeats):\n cv = self.cv(random_state=rng, shuffle=True, **self.cvargs)\n for (train_index, test_index) in cv.split(X, y, groups):\n yield (train_index, test_index)\n \n def get_n_splits(self, X=None, y=None, groups=None):\n \"\"\"Returns the number of splitting iterations in the cross-validator\n\n Parameters\n ----------\n X : object\n Always ignored, exists for compatibility.\n ``np.zeros(n_samples)`` may be used as a placeholder.\n\n y : object\n Always ignored, exists for compatibility.\n ``np.zeros(n_samples)`` may be used as a placeholder.\n\n groups : array-like of shape (n_samples,), default=None\n Group labels for the samples used while splitting the dataset into\n train/test set.\n\n Returns\n -------\n n_splits : int\n Returns the number of splitting iterations in the cross-validator.\n \"\"\"\n rng = check_random_state(self.random_state)\n cv = self.cv(random_state=rng, shuffle=True, **self.cvargs)\n return cv.get_n_splits(X, y, groups) * self.n_repeats\n \n def __repr__(self):\n return _build_repr(self)\n" }, @@ -25016,7 +25098,7 @@ "sklearn.multiclass.OneVsOneClassifier._more_tags" ], "is_public": true, - "description": "One-vs-one multiclass strategy.\n\nThis strategy consists in fitting one classifier per class pair. At prediction time, the class which received the most votes is selected. 
Since it requires to fit `n_classes * (n_classes - 1) / 2` classifiers, this method is usually slower than one-vs-the-rest, due to its O(n_classes^2) complexity. However, this method may be advantageous for algorithms such as kernel algorithms which don't scale well with `n_samples`. This is because each individual learning problem only involves a small subset of the data whereas, with one-vs-the-rest, the complete dataset is used `n_classes` times. Read more in the :ref:`User Guide `.", + "description": "One-vs-one multiclass strategy.\n\nThis strategy consists in fitting one classifier per class pair.\nAt prediction time, the class which received the most votes is selected.\nSince it requires to fit `n_classes * (n_classes - 1) / 2` classifiers,\nthis method is usually slower than one-vs-the-rest, due to its\nO(n_classes^2) complexity. However, this method may be advantageous for\nalgorithms such as kernel algorithms which don't scale well with\n`n_samples`. This is because each individual learning problem only involves\na small subset of the data whereas, with one-vs-the-rest, the complete\ndataset is used `n_classes` times.\n\nRead more in the :ref:`User Guide `.", "docstring": "One-vs-one multiclass strategy.\n\n This strategy consists in fitting one classifier per class pair.\n At prediction time, the class which received the most votes is selected.\n Since it requires to fit `n_classes * (n_classes - 1) / 2` classifiers,\n this method is usually slower than one-vs-the-rest, due to its\n O(n_classes^2) complexity. However, this method may be advantageous for\n algorithms such as kernel algorithms which don't scale well with\n `n_samples`. This is because each individual learning problem only involves\n a small subset of the data whereas, with one-vs-the-rest, the complete\n dataset is used `n_classes` times.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n estimator : estimator object\n An estimator object implementing :term:`fit` and one of\n :term:`decision_function` or :term:`predict_proba`.\n\n n_jobs : int, default=None\n The number of jobs to use for the computation: the `n_classes * (\n n_classes - 1) / 2` OVO problems are computed in parallel.\n\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n Attributes\n ----------\n estimators_ : list of ``n_classes * (n_classes - 1) / 2`` estimators\n Estimators used for predictions.\n\n classes_ : numpy array of shape [n_classes]\n Array containing labels.\n\n n_classes_ : int\n Number of classes.\n\n pairwise_indices_ : list, length = ``len(estimators_)``, or ``None``\n Indices of samples used when training the estimators.\n ``None`` when ``estimator``'s `pairwise` tag is False.\n\n .. deprecated:: 0.24\n\n The _pairwise attribute is deprecated in 0.24. From 1.1\n (renaming of 0.25) and onward, `pairwise_indices_` will use the\n pairwise estimator tag instead.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. 
versionadded:: 1.0\n\n See Also\n --------\n OneVsRestClassifier : One-vs-all multiclass strategy.\n\n Examples\n --------\n >>> from sklearn.datasets import load_iris\n >>> from sklearn.model_selection import train_test_split\n >>> from sklearn.multiclass import OneVsOneClassifier\n >>> from sklearn.svm import LinearSVC\n >>> X, y = load_iris(return_X_y=True)\n >>> X_train, X_test, y_train, y_test = train_test_split(\n ... X, y, test_size=0.33, shuffle=True, random_state=0)\n >>> clf = OneVsOneClassifier(\n ... LinearSVC(random_state=0)).fit(X_train, y_train)\n >>> clf.predict(X_test[:10])\n array([2, 1, 0, 2, 0, 2, 0, 1, 1, 1])\n ", "source_code": "\n\nclass OneVsOneClassifier(MetaEstimatorMixin, ClassifierMixin, BaseEstimator):\n \"\"\"One-vs-one multiclass strategy.\n\n This strategy consists in fitting one classifier per class pair.\n At prediction time, the class which received the most votes is selected.\n Since it requires to fit `n_classes * (n_classes - 1) / 2` classifiers,\n this method is usually slower than one-vs-the-rest, due to its\n O(n_classes^2) complexity. However, this method may be advantageous for\n algorithms such as kernel algorithms which don't scale well with\n `n_samples`. This is because each individual learning problem only involves\n a small subset of the data whereas, with one-vs-the-rest, the complete\n dataset is used `n_classes` times.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n estimator : estimator object\n An estimator object implementing :term:`fit` and one of\n :term:`decision_function` or :term:`predict_proba`.\n\n n_jobs : int, default=None\n The number of jobs to use for the computation: the `n_classes * (\n n_classes - 1) / 2` OVO problems are computed in parallel.\n\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n Attributes\n ----------\n estimators_ : list of ``n_classes * (n_classes - 1) / 2`` estimators\n Estimators used for predictions.\n\n classes_ : numpy array of shape [n_classes]\n Array containing labels.\n\n n_classes_ : int\n Number of classes.\n\n pairwise_indices_ : list, length = ``len(estimators_)``, or ``None``\n Indices of samples used when training the estimators.\n ``None`` when ``estimator``'s `pairwise` tag is False.\n\n .. deprecated:: 0.24\n\n The _pairwise attribute is deprecated in 0.24. From 1.1\n (renaming of 0.25) and onward, `pairwise_indices_` will use the\n pairwise estimator tag instead.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n OneVsRestClassifier : One-vs-all multiclass strategy.\n\n Examples\n --------\n >>> from sklearn.datasets import load_iris\n >>> from sklearn.model_selection import train_test_split\n >>> from sklearn.multiclass import OneVsOneClassifier\n >>> from sklearn.svm import LinearSVC\n >>> X, y = load_iris(return_X_y=True)\n >>> X_train, X_test, y_train, y_test = train_test_split(\n ... X, y, test_size=0.33, shuffle=True, random_state=0)\n >>> clf = OneVsOneClassifier(\n ... 
LinearSVC(random_state=0)).fit(X_train, y_train)\n >>> clf.predict(X_test[:10])\n array([2, 1, 0, 2, 0, 2, 0, 1, 1, 1])\n \"\"\"\n \n def __init__(self, estimator, *, n_jobs=None):\n self.estimator = estimator\n self.n_jobs = n_jobs\n \n def fit(self, X, y):\n \"\"\"Fit underlying estimators.\n\n Parameters\n ----------\n X : (sparse) array-like of shape (n_samples, n_features)\n Data.\n\n y : array-like of shape (n_samples,)\n Multi-class targets.\n\n Returns\n -------\n self : object\n The fitted underlying estimator.\n \"\"\"\n (X, y) = self._validate_data(X, y, accept_sparse=['csr', 'csc'], force_all_finite=False)\n check_classification_targets(y)\n self.classes_ = np.unique(y)\n if len(self.classes_) == 1:\n raise ValueError('OneVsOneClassifier can not be fit when only one class is present.')\n n_classes = self.classes_.shape[0]\n estimators_indices = list(zip(*Parallel(n_jobs=self.n_jobs)((delayed(_fit_ovo_binary)(self.estimator, X, y, self.classes_[i], self.classes_[j]) for i in range(n_classes) for j in range(i + 1, n_classes)))))\n self.estimators_ = estimators_indices[0]\n pairwise = _is_pairwise(self)\n self.pairwise_indices_ = estimators_indices[1] if pairwise else None\n return self\n \n @available_if(_estimators_has('partial_fit'))\n def partial_fit(self, X, y, classes=None):\n \"\"\"Partially fit underlying estimators.\n\n Should be used when memory is inefficient to train all data. Chunks\n of data can be passed in several iteration, where the first call\n should have an array of all target variables.\n\n Parameters\n ----------\n X : (sparse) array-like of shape (n_samples, n_features)\n Data.\n\n y : array-like of shape (n_samples,)\n Multi-class targets.\n\n classes : array, shape (n_classes, )\n Classes across all calls to partial_fit.\n Can be obtained via `np.unique(y_all)`, where y_all is the\n target vector of the entire dataset.\n This argument is only required in the first call of partial_fit\n and can be omitted in the subsequent calls.\n\n Returns\n -------\n self : object\n The partially fitted underlying estimator.\n \"\"\"\n first_call = _check_partial_fit_first_call(self, classes)\n if first_call:\n self.estimators_ = [clone(self.estimator) for _ in range(self.n_classes_ * (self.n_classes_ - 1) // 2)]\n if len(np.setdiff1d(y, self.classes_)):\n raise ValueError('Mini-batch contains {0} while it must be subset of {1}'.format(np.unique(y), self.classes_))\n (X, y) = self._validate_data(X, y, accept_sparse=['csr', 'csc'], force_all_finite=False, reset=first_call)\n check_classification_targets(y)\n combinations = itertools.combinations(range(self.n_classes_), 2)\n self.estimators_ = Parallel(n_jobs=self.n_jobs)((delayed(_partial_fit_ovo_binary)(estimator, X, y, self.classes_[i], self.classes_[j]) for (estimator, (i, j)) in zip(self.estimators_, combinations)))\n self.pairwise_indices_ = None\n if hasattr(self.estimators_[0], 'n_features_in_'):\n self.n_features_in_ = self.estimators_[0].n_features_in_\n return self\n \n def predict(self, X):\n \"\"\"Estimate the best class label for each sample in X.\n\n This is implemented as ``argmax(decision_function(X), axis=1)`` which\n will return the label of the class with most votes by estimators\n predicting the outcome of a decision for each possible class pair.\n\n Parameters\n ----------\n X : (sparse) array-like of shape (n_samples, n_features)\n Data.\n\n Returns\n -------\n y : numpy array of shape [n_samples]\n Predicted multi-class targets.\n \"\"\"\n Y = self.decision_function(X)\n if self.n_classes_ == 2:\n 
return self.classes_[(Y > 0).astype(int)]\n return self.classes_[Y.argmax(axis=1)]\n \n def decision_function(self, X):\n \"\"\"Decision function for the OneVsOneClassifier.\n\n The decision values for the samples are computed by adding the\n normalized sum of pair-wise classification confidence levels to the\n votes in order to disambiguate between the decision values when the\n votes for all the classes are equal leading to a tie.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Input data.\n\n Returns\n -------\n Y : array-like of shape (n_samples, n_classes) or (n_samples,)\n Result of calling `decision_function` on the final estimator.\n\n .. versionchanged:: 0.19\n output shape changed to ``(n_samples,)`` to conform to\n scikit-learn conventions for binary classification.\n \"\"\"\n check_is_fitted(self)\n X = self._validate_data(X, accept_sparse=True, force_all_finite=False, reset=False)\n indices = self.pairwise_indices_\n if indices is None:\n Xs = [X] * len(self.estimators_)\n else:\n Xs = [X[:, idx] for idx in indices]\n predictions = np.vstack([est.predict(Xi) for (est, Xi) in zip(self.estimators_, Xs)]).T\n confidences = np.vstack([_predict_binary(est, Xi) for (est, Xi) in zip(self.estimators_, Xs)]).T\n Y = _ovr_decision_function(predictions, confidences, len(self.classes_))\n if self.n_classes_ == 2:\n return Y[:, 1]\n return Y\n \n @property\n def n_classes_(self):\n \"\"\"Number of classes.\"\"\"\n return len(self.classes_)\n \n @deprecated('Attribute `_pairwise` was deprecated in version 0.24 and will be removed in 1.1 (renaming of 0.26).')\n @property\n def _pairwise(self):\n \"\"\"Indicate if wrapped estimator is using a precomputed Gram matrix\"\"\"\n return getattr(self.estimator, '_pairwise', False)\n \n def _more_tags(self):\n \"\"\"Indicate if wrapped estimator is using a precomputed Gram matrix\"\"\"\n return {'pairwise': _safe_tags(self.estimator, key='pairwise')}\n" }, @@ -25045,7 +25127,7 @@ "sklearn.multiclass.OneVsRestClassifier._more_tags" ], "is_public": true, - "description": "One-vs-the-rest (OvR) multiclass strategy.\n\nAlso known as one-vs-all, this strategy consists in fitting one classifier per class. For each classifier, the class is fitted against all the other classes. In addition to its computational efficiency (only `n_classes` classifiers are needed), one advantage of this approach is its interpretability. Since each class is represented by one and one classifier only, it is possible to gain knowledge about the class by inspecting its corresponding classifier. This is the most commonly used strategy for multiclass classification and is a fair default choice. OneVsRestClassifier can also be used for multilabel classification. To use this feature, provide an indicator matrix for the target `y` when calling `.fit`. In other words, the target labels should be formatted as a 2D binary (0/1) matrix, where [i, j] == 1 indicates the presence of label j in sample i. This estimator uses the binary relevance method to perform multilabel classification, which involves training one binary classifier independently for each label. Read more in the :ref:`User Guide `.", + "description": "One-vs-the-rest (OvR) multiclass strategy.\n\nAlso known as one-vs-all, this strategy consists in fitting one classifier\nper class. For each classifier, the class is fitted against all the other\nclasses. 
In addition to its computational efficiency (only `n_classes`\nclassifiers are needed), one advantage of this approach is its\ninterpretability. Since each class is represented by one and one classifier\nonly, it is possible to gain knowledge about the class by inspecting its\ncorresponding classifier. This is the most commonly used strategy for\nmulticlass classification and is a fair default choice.\n\nOneVsRestClassifier can also be used for multilabel classification. To use\nthis feature, provide an indicator matrix for the target `y` when calling\n`.fit`. In other words, the target labels should be formatted as a 2D\nbinary (0/1) matrix, where [i, j] == 1 indicates the presence of label j\nin sample i. This estimator uses the binary relevance method to perform\nmultilabel classification, which involves training one binary classifier\nindependently for each label.\n\nRead more in the :ref:`User Guide `.", "docstring": "One-vs-the-rest (OvR) multiclass strategy.\n\n Also known as one-vs-all, this strategy consists in fitting one classifier\n per class. For each classifier, the class is fitted against all the other\n classes. In addition to its computational efficiency (only `n_classes`\n classifiers are needed), one advantage of this approach is its\n interpretability. Since each class is represented by one and one classifier\n only, it is possible to gain knowledge about the class by inspecting its\n corresponding classifier. This is the most commonly used strategy for\n multiclass classification and is a fair default choice.\n\n OneVsRestClassifier can also be used for multilabel classification. To use\n this feature, provide an indicator matrix for the target `y` when calling\n `.fit`. In other words, the target labels should be formatted as a 2D\n binary (0/1) matrix, where [i, j] == 1 indicates the presence of label j\n in sample i. This estimator uses the binary relevance method to perform\n multilabel classification, which involves training one binary classifier\n independently for each label.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n estimator : estimator object\n An estimator object implementing :term:`fit` and one of\n :term:`decision_function` or :term:`predict_proba`.\n\n n_jobs : int, default=None\n The number of jobs to use for the computation: the `n_classes`\n one-vs-rest problems are computed in parallel.\n\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n .. versionchanged:: v0.20\n `n_jobs` default changed from 1 to None\n\n Attributes\n ----------\n estimators_ : list of `n_classes` estimators\n Estimators used for predictions.\n\n coef_ : ndarray of shape (1, n_features) or (n_classes, n_features)\n Coefficient of the features in the decision function. This attribute\n exists only if the ``estimators_`` defines ``coef_``.\n\n .. deprecated:: 0.24\n This attribute is deprecated in 0.24 and will\n be removed in 1.1 (renaming of 0.26). If you use this attribute\n in :class:`~sklearn.feature_selection.RFE` or\n :class:`~sklearn.feature_selection.SelectFromModel`,\n you may pass a callable to the `importance_getter`\n parameter that extracts feature the importances\n from `estimators_`.\n\n intercept_ : ndarray of shape (1, 1) or (n_classes, 1)\n If ``y`` is binary, the shape is ``(1, 1)`` else ``(n_classes, 1)``\n This attribute exists only if the ``estimators_`` defines\n ``intercept_``.\n\n .. 
deprecated:: 0.24\n This attribute is deprecated in 0.24 and will\n be removed in 1.1 (renaming of 0.26). If you use this attribute\n in :class:`~sklearn.feature_selection.RFE` or\n :class:`~sklearn.feature_selection.SelectFromModel`,\n you may pass a callable to the `importance_getter`\n parameter that extracts feature the importances\n from `estimators_`.\n\n classes_ : array, shape = [`n_classes`]\n Class labels.\n\n n_classes_ : int\n Number of classes.\n\n label_binarizer_ : LabelBinarizer object\n Object used to transform multiclass labels to binary labels and\n vice-versa.\n\n multilabel_ : boolean\n Whether a OneVsRestClassifier is a multilabel classifier.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`. Only defined if the\n underlying estimator exposes such an attribute when fit.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Only defined if the\n underlying estimator exposes such an attribute when fit.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n MultiOutputClassifier : Alternate way of extending an estimator for\n multilabel classification.\n sklearn.preprocessing.MultiLabelBinarizer : Transform iterable of iterables\n to binary indicator matrix.\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.multiclass import OneVsRestClassifier\n >>> from sklearn.svm import SVC\n >>> X = np.array([\n ... [10, 10],\n ... [8, 10],\n ... [-5, 5.5],\n ... [-5.4, 5.5],\n ... [-20, -20],\n ... [-15, -20]\n ... ])\n >>> y = np.array([0, 0, 1, 1, 2, 2])\n >>> clf = OneVsRestClassifier(SVC()).fit(X, y)\n >>> clf.predict([[-19, -20], [9, 9], [-5, 5]])\n array([2, 0, 1])\n ", "source_code": "\n\nclass OneVsRestClassifier(MultiOutputMixin, ClassifierMixin, MetaEstimatorMixin, BaseEstimator):\n \"\"\"One-vs-the-rest (OvR) multiclass strategy.\n\n Also known as one-vs-all, this strategy consists in fitting one classifier\n per class. For each classifier, the class is fitted against all the other\n classes. In addition to its computational efficiency (only `n_classes`\n classifiers are needed), one advantage of this approach is its\n interpretability. Since each class is represented by one and one classifier\n only, it is possible to gain knowledge about the class by inspecting its\n corresponding classifier. This is the most commonly used strategy for\n multiclass classification and is a fair default choice.\n\n OneVsRestClassifier can also be used for multilabel classification. To use\n this feature, provide an indicator matrix for the target `y` when calling\n `.fit`. In other words, the target labels should be formatted as a 2D\n binary (0/1) matrix, where [i, j] == 1 indicates the presence of label j\n in sample i. This estimator uses the binary relevance method to perform\n multilabel classification, which involves training one binary classifier\n independently for each label.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n estimator : estimator object\n An estimator object implementing :term:`fit` and one of\n :term:`decision_function` or :term:`predict_proba`.\n\n n_jobs : int, default=None\n The number of jobs to use for the computation: the `n_classes`\n one-vs-rest problems are computed in parallel.\n\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n .. 
versionchanged:: v0.20\n `n_jobs` default changed from 1 to None\n\n Attributes\n ----------\n estimators_ : list of `n_classes` estimators\n Estimators used for predictions.\n\n coef_ : ndarray of shape (1, n_features) or (n_classes, n_features)\n Coefficient of the features in the decision function. This attribute\n exists only if the ``estimators_`` defines ``coef_``.\n\n .. deprecated:: 0.24\n This attribute is deprecated in 0.24 and will\n be removed in 1.1 (renaming of 0.26). If you use this attribute\n in :class:`~sklearn.feature_selection.RFE` or\n :class:`~sklearn.feature_selection.SelectFromModel`,\n you may pass a callable to the `importance_getter`\n parameter that extracts feature the importances\n from `estimators_`.\n\n intercept_ : ndarray of shape (1, 1) or (n_classes, 1)\n If ``y`` is binary, the shape is ``(1, 1)`` else ``(n_classes, 1)``\n This attribute exists only if the ``estimators_`` defines\n ``intercept_``.\n\n .. deprecated:: 0.24\n This attribute is deprecated in 0.24 and will\n be removed in 1.1 (renaming of 0.26). If you use this attribute\n in :class:`~sklearn.feature_selection.RFE` or\n :class:`~sklearn.feature_selection.SelectFromModel`,\n you may pass a callable to the `importance_getter`\n parameter that extracts feature the importances\n from `estimators_`.\n\n classes_ : array, shape = [`n_classes`]\n Class labels.\n\n n_classes_ : int\n Number of classes.\n\n label_binarizer_ : LabelBinarizer object\n Object used to transform multiclass labels to binary labels and\n vice-versa.\n\n multilabel_ : boolean\n Whether a OneVsRestClassifier is a multilabel classifier.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`. Only defined if the\n underlying estimator exposes such an attribute when fit.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Only defined if the\n underlying estimator exposes such an attribute when fit.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n MultiOutputClassifier : Alternate way of extending an estimator for\n multilabel classification.\n sklearn.preprocessing.MultiLabelBinarizer : Transform iterable of iterables\n to binary indicator matrix.\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.multiclass import OneVsRestClassifier\n >>> from sklearn.svm import SVC\n >>> X = np.array([\n ... [10, 10],\n ... [8, 10],\n ... [-5, 5.5],\n ... [-5.4, 5.5],\n ... [-20, -20],\n ... [-15, -20]\n ... ])\n >>> y = np.array([0, 0, 1, 1, 2, 2])\n >>> clf = OneVsRestClassifier(SVC()).fit(X, y)\n >>> clf.predict([[-19, -20], [9, 9], [-5, 5]])\n array([2, 0, 1])\n \"\"\"\n \n def __init__(self, estimator, *, n_jobs=None):\n self.estimator = estimator\n self.n_jobs = n_jobs\n \n def fit(self, X, y):\n \"\"\"Fit underlying estimators.\n\n Parameters\n ----------\n X : (sparse) array-like of shape (n_samples, n_features)\n Data.\n\n y : (sparse) array-like of shape (n_samples,) or (n_samples, n_classes)\n Multi-class targets. 
An indicator matrix turns on multilabel\n classification.\n\n Returns\n -------\n self : object\n Instance of fitted estimator.\n \"\"\"\n self.label_binarizer_ = LabelBinarizer(sparse_output=True)\n Y = self.label_binarizer_.fit_transform(y)\n Y = Y.tocsc()\n self.classes_ = self.label_binarizer_.classes_\n columns = (col.toarray().ravel() for col in Y.T)\n self.estimators_ = Parallel(n_jobs=self.n_jobs)((delayed(_fit_binary)(self.estimator, X, column, classes=['not %s' % self.label_binarizer_.classes_[i], self.label_binarizer_.classes_[i]]) for (i, column) in enumerate(columns)))\n if hasattr(self.estimators_[0], 'n_features_in_'):\n self.n_features_in_ = self.estimators_[0].n_features_in_\n if hasattr(self.estimators_[0], 'feature_names_in_'):\n self.feature_names_in_ = self.estimators_[0].feature_names_in_\n return self\n \n @available_if(_estimators_has('partial_fit'))\n def partial_fit(self, X, y, classes=None):\n \"\"\"Partially fit underlying estimators.\n\n Should be used when memory is inefficient to train all data.\n Chunks of data can be passed in several iteration.\n\n Parameters\n ----------\n X : (sparse) array-like of shape (n_samples, n_features)\n Data.\n\n y : (sparse) array-like of shape (n_samples,) or (n_samples, n_classes)\n Multi-class targets. An indicator matrix turns on multilabel\n classification.\n\n classes : array, shape (n_classes, )\n Classes across all calls to partial_fit.\n Can be obtained via `np.unique(y_all)`, where y_all is the\n target vector of the entire dataset.\n This argument is only required in the first call of partial_fit\n and can be omitted in the subsequent calls.\n\n Returns\n -------\n self : object\n Instance of partially fitted estimator.\n \"\"\"\n if _check_partial_fit_first_call(self, classes):\n if not hasattr(self.estimator, 'partial_fit'):\n raise ValueError(\"Base estimator {0}, doesn't have partial_fit method\".format(self.estimator))\n self.estimators_ = [clone(self.estimator) for _ in range(self.n_classes_)]\n self.label_binarizer_ = LabelBinarizer(sparse_output=True)\n self.label_binarizer_.fit(self.classes_)\n if len(np.setdiff1d(y, self.classes_)):\n raise ValueError(('Mini-batch contains {0} while classes ' + 'must be subset of {1}').format(np.unique(y), self.classes_))\n Y = self.label_binarizer_.transform(y)\n Y = Y.tocsc()\n columns = (col.toarray().ravel() for col in Y.T)\n self.estimators_ = Parallel(n_jobs=self.n_jobs)((delayed(_partial_fit_binary)(estimator, X, column) for (estimator, column) in zip(self.estimators_, columns)))\n if hasattr(self.estimators_[0], 'n_features_in_'):\n self.n_features_in_ = self.estimators_[0].n_features_in_\n return self\n \n def predict(self, X):\n \"\"\"Predict multi-class targets using underlying estimators.\n\n Parameters\n ----------\n X : (sparse) array-like of shape (n_samples, n_features)\n Data.\n\n Returns\n -------\n y : (sparse) array-like of shape (n_samples,) or (n_samples, n_classes)\n Predicted multi-class targets.\n \"\"\"\n check_is_fitted(self)\n n_samples = _num_samples(X)\n if self.label_binarizer_.y_type_ == 'multiclass':\n maxima = np.empty(n_samples, dtype=float)\n maxima.fill(-np.inf)\n argmaxima = np.zeros(n_samples, dtype=int)\n for (i, e) in enumerate(self.estimators_):\n pred = _predict_binary(e, X)\n np.maximum(maxima, pred, out=maxima)\n argmaxima[maxima == pred] = i\n return self.classes_[argmaxima]\n else:\n if hasattr(self.estimators_[0], 'decision_function') and is_classifier(self.estimators_[0]):\n thresh = 0\n else:\n thresh = 0.5\n indices = 
array.array('i')\n indptr = array.array('i', [0])\n for e in self.estimators_:\n indices.extend(np.where(_predict_binary(e, X) > thresh)[0])\n indptr.append(len(indices))\n data = np.ones(len(indices), dtype=int)\n indicator = sp.csc_matrix((data, indices, indptr), shape=(n_samples, len(self.estimators_)))\n return self.label_binarizer_.inverse_transform(indicator)\n \n @available_if(_estimators_has('predict_proba'))\n def predict_proba(self, X):\n \"\"\"Probability estimates.\n\n The returned estimates for all classes are ordered by label of classes.\n\n Note that in the multilabel case, each sample can have any number of\n labels. This returns the marginal probability that the given sample has\n the label in question. For example, it is entirely consistent that two\n labels both have a 90% probability of applying to a given sample.\n\n In the single label multiclass case, the rows of the returned matrix\n sum to 1.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Input data.\n\n Returns\n -------\n T : (sparse) array-like of shape (n_samples, n_classes)\n Returns the probability of the sample for each class in the model,\n where classes are ordered as they are in `self.classes_`.\n \"\"\"\n check_is_fitted(self)\n Y = np.array([e.predict_proba(X)[:, 1] for e in self.estimators_]).T\n if len(self.estimators_) == 1:\n Y = np.concatenate((1 - Y, Y), axis=1)\n if not self.multilabel_:\n Y /= np.sum(Y, axis=1)[:, np.newaxis]\n return Y\n \n @available_if(_estimators_has('decision_function'))\n def decision_function(self, X):\n \"\"\"Decision function for the OneVsRestClassifier.\n\n Return the distance of each sample from the decision boundary for each\n class. This can only be used with estimators which implement the\n `decision_function` method.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Input data.\n\n Returns\n -------\n T : array-like of shape (n_samples, n_classes) or (n_samples,) for binary classification.\n Result of calling `decision_function` on the final estimator.\n\n .. versionchanged:: 0.19\n output shape changed to ``(n_samples,)`` to conform to\n scikit-learn conventions for binary classification.\n \"\"\"\n check_is_fitted(self)\n if len(self.estimators_) == 1:\n return self.estimators_[0].decision_function(X)\n return np.array([est.decision_function(X).ravel() for est in self.estimators_]).T\n \n @property\n def multilabel_(self):\n \"\"\"Whether this is a multilabel classifier.\"\"\"\n return self.label_binarizer_.y_type_.startswith('multilabel')\n \n @property\n def n_classes_(self):\n \"\"\"Number of classes.\"\"\"\n return len(self.classes_)\n \n @deprecated('Attribute `coef_` was deprecated in version 0.24 and will be removed in 1.1 (renaming of 0.26). If you observe this warning while using RFE or SelectFromModel, use the importance_getter parameter instead.')\n @property\n def coef_(self):\n check_is_fitted(self)\n if not hasattr(self.estimators_[0], 'coef_'):\n raise AttributeError(\"Base estimator doesn't have a coef_ attribute.\")\n coefs = [e.coef_ for e in self.estimators_]\n if sp.issparse(coefs[0]):\n return sp.vstack(coefs)\n return np.vstack(coefs)\n \n @deprecated('Attribute `intercept_` was deprecated in version 0.24 and will be removed in 1.1 (renaming of 0.26). 
If you observe this warning while using RFE or SelectFromModel, use the importance_getter parameter instead.')\n @property\n def intercept_(self):\n check_is_fitted(self)\n if not hasattr(self.estimators_[0], 'intercept_'):\n raise AttributeError(\"Base estimator doesn't have an intercept_ attribute.\")\n return np.array([e.intercept_.ravel() for e in self.estimators_])\n \n @deprecated('Attribute `_pairwise` was deprecated in version 0.24 and will be removed in 1.1 (renaming of 0.26).')\n @property\n def _pairwise(self):\n \"\"\"Indicate if wrapped estimator is using a precomputed Gram matrix\"\"\"\n return getattr(self.estimator, '_pairwise', False)\n \n def _more_tags(self):\n \"\"\"Indicate if wrapped estimator is using a precomputed Gram matrix\"\"\"\n return {'pairwise': _safe_tags(self.estimator, key='pairwise')}\n" }, @@ -25064,9 +25146,9 @@ "sklearn.multiclass.OutputCodeClassifier.predict" ], "is_public": true, - "description": "(Error-Correcting) Output-Code multiclass strategy.\n\nOutput-code based strategies consist in representing each class with a binary code (an array of 0s and 1s). At fitting time, one binary classifier per bit in the code book is fitted. At prediction time, the classifiers are used to project new points in the class space and the class closest to the points is chosen. The main advantage of these strategies is that the number of classifiers used can be controlled by the user, either for compressing the model (0 < code_size < 1) or for making the model more robust to errors (code_size > 1). See the documentation for more details. Read more in the :ref:`User Guide `.", - "docstring": "(Error-Correcting) Output-Code multiclass strategy.\n\n Output-code based strategies consist in representing each class with a\n binary code (an array of 0s and 1s). At fitting time, one binary\n classifier per bit in the code book is fitted. At prediction time, the\n classifiers are used to project new points in the class space and the class\n closest to the points is chosen. The main advantage of these strategies is\n that the number of classifiers used can be controlled by the user, either\n for compressing the model (0 < code_size < 1) or for making the model more\n robust to errors (code_size > 1). See the documentation for more details.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n estimator : estimator object\n An estimator object implementing :term:`fit` and one of\n :term:`decision_function` or :term:`predict_proba`.\n\n code_size : float\n Percentage of the number of classes to be used to create the code book.\n A number between 0 and 1 will require fewer classifiers than\n one-vs-the-rest. A number greater than 1 will require more classifiers\n than one-vs-the-rest.\n\n random_state : int, RandomState instance, default=None\n The generator used to initialize the codebook.\n Pass an int for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n n_jobs : int, default=None\n The number of jobs to use for the computation: the multiclass problems\n are computed in parallel.\n\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. 
See :term:`Glossary `\n for more details.\n\n Attributes\n ----------\n estimators_ : list of `int(n_classes * code_size)` estimators\n Estimators used for predictions.\n\n classes_ : ndarray of shape (n_classes,)\n Array containing labels.\n\n code_book_ : ndarray of shape (n_classes, code_size)\n Binary array containing the code of each class.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`. Only defined if the\n underlying estimator exposes such an attribute when fit.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Only defined if the\n underlying estimator exposes such an attribute when fit.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n OneVsRestClassifier : One-vs-all multiclass strategy.\n OneVsOneClassifier : One-vs-one multiclass strategy.\n\n References\n ----------\n\n .. [1] \"Solving multiclass learning problems via error-correcting output\n codes\",\n Dietterich T., Bakiri G.,\n Journal of Artificial Intelligence Research 2,\n 1995.\n\n .. [2] \"The error coding method and PICTs\",\n James G., Hastie T.,\n Journal of Computational and Graphical statistics 7,\n 1998.\n\n .. [3] \"The Elements of Statistical Learning\",\n Hastie T., Tibshirani R., Friedman J., page 606 (second-edition)\n 2008.\n\n Examples\n --------\n >>> from sklearn.multiclass import OutputCodeClassifier\n >>> from sklearn.ensemble import RandomForestClassifier\n >>> from sklearn.datasets import make_classification\n >>> X, y = make_classification(n_samples=100, n_features=4,\n ... n_informative=2, n_redundant=0,\n ... random_state=0, shuffle=False)\n >>> clf = OutputCodeClassifier(\n ... estimator=RandomForestClassifier(random_state=0),\n ... random_state=0).fit(X, y)\n >>> clf.predict([[0, 0, 0, 0]])\n array([1])\n ", - "source_code": "\n\nclass OutputCodeClassifier(MetaEstimatorMixin, ClassifierMixin, BaseEstimator):\n \"\"\"(Error-Correcting) Output-Code multiclass strategy.\n\n Output-code based strategies consist in representing each class with a\n binary code (an array of 0s and 1s). At fitting time, one binary\n classifier per bit in the code book is fitted. At prediction time, the\n classifiers are used to project new points in the class space and the class\n closest to the points is chosen. The main advantage of these strategies is\n that the number of classifiers used can be controlled by the user, either\n for compressing the model (0 < code_size < 1) or for making the model more\n robust to errors (code_size > 1). See the documentation for more details.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n estimator : estimator object\n An estimator object implementing :term:`fit` and one of\n :term:`decision_function` or :term:`predict_proba`.\n\n code_size : float\n Percentage of the number of classes to be used to create the code book.\n A number between 0 and 1 will require fewer classifiers than\n one-vs-the-rest. A number greater than 1 will require more classifiers\n than one-vs-the-rest.\n\n random_state : int, RandomState instance, default=None\n The generator used to initialize the codebook.\n Pass an int for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n n_jobs : int, default=None\n The number of jobs to use for the computation: the multiclass problems\n are computed in parallel.\n\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. 
See :term:`Glossary `\n for more details.\n\n Attributes\n ----------\n estimators_ : list of `int(n_classes * code_size)` estimators\n Estimators used for predictions.\n\n classes_ : ndarray of shape (n_classes,)\n Array containing labels.\n\n code_book_ : ndarray of shape (n_classes, code_size)\n Binary array containing the code of each class.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`. Only defined if the\n underlying estimator exposes such an attribute when fit.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Only defined if the\n underlying estimator exposes such an attribute when fit.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n OneVsRestClassifier : One-vs-all multiclass strategy.\n OneVsOneClassifier : One-vs-one multiclass strategy.\n\n References\n ----------\n\n .. [1] \"Solving multiclass learning problems via error-correcting output\n codes\",\n Dietterich T., Bakiri G.,\n Journal of Artificial Intelligence Research 2,\n 1995.\n\n .. [2] \"The error coding method and PICTs\",\n James G., Hastie T.,\n Journal of Computational and Graphical statistics 7,\n 1998.\n\n .. [3] \"The Elements of Statistical Learning\",\n Hastie T., Tibshirani R., Friedman J., page 606 (second-edition)\n 2008.\n\n Examples\n --------\n >>> from sklearn.multiclass import OutputCodeClassifier\n >>> from sklearn.ensemble import RandomForestClassifier\n >>> from sklearn.datasets import make_classification\n >>> X, y = make_classification(n_samples=100, n_features=4,\n ... n_informative=2, n_redundant=0,\n ... random_state=0, shuffle=False)\n >>> clf = OutputCodeClassifier(\n ... estimator=RandomForestClassifier(random_state=0),\n ... random_state=0).fit(X, y)\n >>> clf.predict([[0, 0, 0, 0]])\n array([1])\n \"\"\"\n \n def __init__(self, estimator, *, code_size=1.5, random_state=None, n_jobs=None):\n self.estimator = estimator\n self.code_size = code_size\n self.random_state = random_state\n self.n_jobs = n_jobs\n \n def fit(self, X, y):\n \"\"\"Fit underlying estimators.\n\n Parameters\n ----------\n X : (sparse) array-like of shape (n_samples, n_features)\n Data.\n\n y : array-like of shape (n_samples,)\n Multi-class targets.\n\n Returns\n -------\n self : object\n Returns a fitted instance of self.\n \"\"\"\n y = self._validate_data(X='no_validation', y=y)\n if self.code_size <= 0:\n raise ValueError('code_size should be greater than 0, got {0}'.format(self.code_size))\n _check_estimator(self.estimator)\n random_state = check_random_state(self.random_state)\n check_classification_targets(y)\n self.classes_ = np.unique(y)\n n_classes = self.classes_.shape[0]\n if n_classes == 0:\n raise ValueError('OutputCodeClassifier can not be fit when no class is present.')\n code_size_ = int(n_classes * self.code_size)\n self.code_book_ = random_state.random_sample((n_classes, code_size_))\n self.code_book_[self.code_book_ > 0.5] = 1\n if hasattr(self.estimator, 'decision_function'):\n self.code_book_[self.code_book_ != 1] = -1\n else:\n self.code_book_[self.code_book_ != 1] = 0\n classes_index = {c: i for (i, c) in enumerate(self.classes_)}\n Y = np.array([self.code_book_[classes_index[y[i]]] for i in range(_num_samples(y))], dtype=int)\n self.estimators_ = Parallel(n_jobs=self.n_jobs)((delayed(_fit_binary)(self.estimator, X, Y[:, i]) for i in range(Y.shape[1])))\n if hasattr(self.estimators_[0], 'n_features_in_'):\n self.n_features_in_ = self.estimators_[0].n_features_in_\n if 
hasattr(self.estimators_[0], 'feature_names_in_'):\n self.feature_names_in_ = self.estimators_[0].feature_names_in_\n return self\n \n def predict(self, X):\n \"\"\"Predict multi-class targets using underlying estimators.\n\n Parameters\n ----------\n X : (sparse) array-like of shape (n_samples, n_features)\n Data.\n\n Returns\n -------\n y : ndarray of shape (n_samples,)\n Predicted multi-class targets.\n \"\"\"\n check_is_fitted(self)\n Y = np.array([_predict_binary(e, X) for e in self.estimators_]).T\n pred = euclidean_distances(Y, self.code_book_).argmin(axis=1)\n return self.classes_[pred]\n" + "description": "(Error-Correcting) Output-Code multiclass strategy.\n\nOutput-code based strategies consist in representing each class with a\nbinary code (an array of 0s and 1s). At fitting time, one binary\nclassifier per bit in the code book is fitted. At prediction time, the\nclassifiers are used to project new points in the class space and the class\nclosest to the points is chosen. The main advantage of these strategies is\nthat the number of classifiers used can be controlled by the user, either\nfor compressing the model (0 < code_size < 1) or for making the model more\nrobust to errors (code_size > 1). See the documentation for more details.\n\nRead more in the :ref:`User Guide `.", + "docstring": "(Error-Correcting) Output-Code multiclass strategy.\n\n Output-code based strategies consist in representing each class with a\n binary code (an array of 0s and 1s). At fitting time, one binary\n classifier per bit in the code book is fitted. At prediction time, the\n classifiers are used to project new points in the class space and the class\n closest to the points is chosen. The main advantage of these strategies is\n that the number of classifiers used can be controlled by the user, either\n for compressing the model (0 < code_size < 1) or for making the model more\n robust to errors (code_size > 1). See the documentation for more details.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n estimator : estimator object\n An estimator object implementing :term:`fit` and one of\n :term:`decision_function` or :term:`predict_proba`.\n\n code_size : float, default=1.5\n Percentage of the number of classes to be used to create the code book.\n A number between 0 and 1 will require fewer classifiers than\n one-vs-the-rest. A number greater than 1 will require more classifiers\n than one-vs-the-rest.\n\n random_state : int, RandomState instance, default=None\n The generator used to initialize the codebook.\n Pass an int for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n n_jobs : int, default=None\n The number of jobs to use for the computation: the multiclass problems\n are computed in parallel.\n\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n Attributes\n ----------\n estimators_ : list of `int(n_classes * code_size)` estimators\n Estimators used for predictions.\n\n classes_ : ndarray of shape (n_classes,)\n Array containing labels.\n\n code_book_ : ndarray of shape (n_classes, code_size)\n Binary array containing the code of each class.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`. Only defined if the\n underlying estimator exposes such an attribute when fit.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. 
Only defined if the\n underlying estimator exposes such an attribute when fit.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n OneVsRestClassifier : One-vs-all multiclass strategy.\n OneVsOneClassifier : One-vs-one multiclass strategy.\n\n References\n ----------\n\n .. [1] \"Solving multiclass learning problems via error-correcting output\n codes\",\n Dietterich T., Bakiri G.,\n Journal of Artificial Intelligence Research 2,\n 1995.\n\n .. [2] \"The error coding method and PICTs\",\n James G., Hastie T.,\n Journal of Computational and Graphical statistics 7,\n 1998.\n\n .. [3] \"The Elements of Statistical Learning\",\n Hastie T., Tibshirani R., Friedman J., page 606 (second-edition)\n 2008.\n\n Examples\n --------\n >>> from sklearn.multiclass import OutputCodeClassifier\n >>> from sklearn.ensemble import RandomForestClassifier\n >>> from sklearn.datasets import make_classification\n >>> X, y = make_classification(n_samples=100, n_features=4,\n ... n_informative=2, n_redundant=0,\n ... random_state=0, shuffle=False)\n >>> clf = OutputCodeClassifier(\n ... estimator=RandomForestClassifier(random_state=0),\n ... random_state=0).fit(X, y)\n >>> clf.predict([[0, 0, 0, 0]])\n array([1])\n ", + "source_code": "\n\nclass OutputCodeClassifier(MetaEstimatorMixin, ClassifierMixin, BaseEstimator):\n \"\"\"(Error-Correcting) Output-Code multiclass strategy.\n\n Output-code based strategies consist in representing each class with a\n binary code (an array of 0s and 1s). At fitting time, one binary\n classifier per bit in the code book is fitted. At prediction time, the\n classifiers are used to project new points in the class space and the class\n closest to the points is chosen. The main advantage of these strategies is\n that the number of classifiers used can be controlled by the user, either\n for compressing the model (0 < code_size < 1) or for making the model more\n robust to errors (code_size > 1). See the documentation for more details.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n estimator : estimator object\n An estimator object implementing :term:`fit` and one of\n :term:`decision_function` or :term:`predict_proba`.\n\n code_size : float, default=1.5\n Percentage of the number of classes to be used to create the code book.\n A number between 0 and 1 will require fewer classifiers than\n one-vs-the-rest. A number greater than 1 will require more classifiers\n than one-vs-the-rest.\n\n random_state : int, RandomState instance, default=None\n The generator used to initialize the codebook.\n Pass an int for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n n_jobs : int, default=None\n The number of jobs to use for the computation: the multiclass problems\n are computed in parallel.\n\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n Attributes\n ----------\n estimators_ : list of `int(n_classes * code_size)` estimators\n Estimators used for predictions.\n\n classes_ : ndarray of shape (n_classes,)\n Array containing labels.\n\n code_book_ : ndarray of shape (n_classes, code_size)\n Binary array containing the code of each class.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`. Only defined if the\n underlying estimator exposes such an attribute when fit.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. 
Only defined if the\n underlying estimator exposes such an attribute when fit.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n OneVsRestClassifier : One-vs-all multiclass strategy.\n OneVsOneClassifier : One-vs-one multiclass strategy.\n\n References\n ----------\n\n .. [1] \"Solving multiclass learning problems via error-correcting output\n codes\",\n Dietterich T., Bakiri G.,\n Journal of Artificial Intelligence Research 2,\n 1995.\n\n .. [2] \"The error coding method and PICTs\",\n James G., Hastie T.,\n Journal of Computational and Graphical statistics 7,\n 1998.\n\n .. [3] \"The Elements of Statistical Learning\",\n Hastie T., Tibshirani R., Friedman J., page 606 (second-edition)\n 2008.\n\n Examples\n --------\n >>> from sklearn.multiclass import OutputCodeClassifier\n >>> from sklearn.ensemble import RandomForestClassifier\n >>> from sklearn.datasets import make_classification\n >>> X, y = make_classification(n_samples=100, n_features=4,\n ... n_informative=2, n_redundant=0,\n ... random_state=0, shuffle=False)\n >>> clf = OutputCodeClassifier(\n ... estimator=RandomForestClassifier(random_state=0),\n ... random_state=0).fit(X, y)\n >>> clf.predict([[0, 0, 0, 0]])\n array([1])\n \"\"\"\n \n def __init__(self, estimator, *, code_size=1.5, random_state=None, n_jobs=None):\n self.estimator = estimator\n self.code_size = code_size\n self.random_state = random_state\n self.n_jobs = n_jobs\n \n def fit(self, X, y):\n \"\"\"Fit underlying estimators.\n\n Parameters\n ----------\n X : (sparse) array-like of shape (n_samples, n_features)\n Data.\n\n y : array-like of shape (n_samples,)\n Multi-class targets.\n\n Returns\n -------\n self : object\n Returns a fitted instance of self.\n \"\"\"\n y = self._validate_data(X='no_validation', y=y)\n if self.code_size <= 0:\n raise ValueError('code_size should be greater than 0, got {0}'.format(self.code_size))\n _check_estimator(self.estimator)\n random_state = check_random_state(self.random_state)\n check_classification_targets(y)\n self.classes_ = np.unique(y)\n n_classes = self.classes_.shape[0]\n if n_classes == 0:\n raise ValueError('OutputCodeClassifier can not be fit when no class is present.')\n code_size_ = int(n_classes * self.code_size)\n self.code_book_ = random_state.random_sample((n_classes, code_size_))\n self.code_book_[self.code_book_ > 0.5] = 1\n if hasattr(self.estimator, 'decision_function'):\n self.code_book_[self.code_book_ != 1] = -1\n else:\n self.code_book_[self.code_book_ != 1] = 0\n classes_index = {c: i for (i, c) in enumerate(self.classes_)}\n Y = np.array([self.code_book_[classes_index[y[i]]] for i in range(_num_samples(y))], dtype=int)\n self.estimators_ = Parallel(n_jobs=self.n_jobs)((delayed(_fit_binary)(self.estimator, X, Y[:, i]) for i in range(Y.shape[1])))\n if hasattr(self.estimators_[0], 'n_features_in_'):\n self.n_features_in_ = self.estimators_[0].n_features_in_\n if hasattr(self.estimators_[0], 'feature_names_in_'):\n self.feature_names_in_ = self.estimators_[0].feature_names_in_\n return self\n \n def predict(self, X):\n \"\"\"Predict multi-class targets using underlying estimators.\n\n Parameters\n ----------\n X : (sparse) array-like of shape (n_samples, n_features)\n Data.\n\n Returns\n -------\n y : ndarray of shape (n_samples,)\n Predicted multi-class targets.\n \"\"\"\n check_is_fitted(self)\n Y = np.array([_predict_binary(e, X) for e in self.estimators_]).T\n pred = euclidean_distances(Y, self.code_book_).argmin(axis=1)\n return self.classes_[pred]\n" }, { "name": "_ConstantPredictor", @@ 
-25100,7 +25182,7 @@ "sklearn.multioutput.ClassifierChain._more_tags" ], "is_public": true, - "description": "A multi-label model that arranges binary classifiers into a chain.\n\nEach model makes a prediction in the order specified by the chain using all of the available features provided to the model plus the predictions of models that are earlier in the chain. Read more in the :ref:`User Guide `. .. versionadded:: 0.19", + "description": "A multi-label model that arranges binary classifiers into a chain.\n\nEach model makes a prediction in the order specified by the chain using\nall of the available features provided to the model plus the predictions\nof models that are earlier in the chain.\n\nRead more in the :ref:`User Guide `.\n\n.. versionadded:: 0.19", "docstring": "A multi-label model that arranges binary classifiers into a chain.\n\n Each model makes a prediction in the order specified by the chain using\n all of the available features provided to the model plus the predictions\n of models that are earlier in the chain.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.19\n\n Parameters\n ----------\n base_estimator : estimator\n The base estimator from which the classifier chain is built.\n\n order : array-like of shape (n_outputs,) or 'random', default=None\n If `None`, the order will be determined by the order of columns in\n the label matrix Y.::\n\n order = [0, 1, 2, ..., Y.shape[1] - 1]\n\n The order of the chain can be explicitly set by providing a list of\n integers. For example, for a chain of length 5.::\n\n order = [1, 3, 2, 4, 0]\n\n means that the first model in the chain will make predictions for\n column 1 in the Y matrix, the second model will make predictions\n for column 3, etc.\n\n If order is `random` a random ordering will be used.\n\n cv : int, cross-validation generator or an iterable, default=None\n Determines whether to use cross validated predictions or true\n labels for the results of previous estimators in the chain.\n Possible inputs for cv are:\n\n - None, to use true labels when fitting,\n - integer, to specify the number of folds in a (Stratified)KFold,\n - :term:`CV splitter`,\n - An iterable yielding (train, test) splits as arrays of indices.\n\n random_state : int, RandomState instance or None, optional (default=None)\n If ``order='random'``, determines random number generation for the\n chain order.\n In addition, it controls the random seed given at each `base_estimator`\n at each chaining iteration. Thus, it is only used when `base_estimator`\n exposes a `random_state`.\n Pass an int for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n Attributes\n ----------\n classes_ : list\n A list of arrays of length ``len(estimators_)`` containing the\n class labels for each estimator in the chain.\n\n estimators_ : list\n A list of clones of base_estimator.\n\n order_ : list\n The order of labels in the classifier chain.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`. Only defined if the\n underlying `base_estimator` exposes such an attribute when fit.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. 
versionadded:: 1.0\n\n See Also\n --------\n RegressorChain : Equivalent for regression.\n MultioutputClassifier : Classifies each output independently rather than\n chaining.\n\n References\n ----------\n Jesse Read, Bernhard Pfahringer, Geoff Holmes, Eibe Frank, \"Classifier\n Chains for Multi-label Classification\", 2009.\n\n Examples\n --------\n >>> from sklearn.datasets import make_multilabel_classification\n >>> from sklearn.linear_model import LogisticRegression\n >>> from sklearn.model_selection import train_test_split\n >>> from sklearn.multioutput import ClassifierChain\n >>> X, Y = make_multilabel_classification(\n ... n_samples=12, n_classes=3, random_state=0\n ... )\n >>> X_train, X_test, Y_train, Y_test = train_test_split(\n ... X, Y, random_state=0\n ... )\n >>> base_lr = LogisticRegression(solver='lbfgs', random_state=0)\n >>> chain = ClassifierChain(base_lr, order='random', random_state=0)\n >>> chain.fit(X_train, Y_train).predict(X_test)\n array([[1., 1., 0.],\n [1., 0., 0.],\n [0., 1., 0.]])\n >>> chain.predict_proba(X_test)\n array([[0.8387..., 0.9431..., 0.4576...],\n [0.8878..., 0.3684..., 0.2640...],\n [0.0321..., 0.9935..., 0.0625...]])\n ", "source_code": "\n\nclass ClassifierChain(MetaEstimatorMixin, ClassifierMixin, _BaseChain):\n \"\"\"A multi-label model that arranges binary classifiers into a chain.\n\n Each model makes a prediction in the order specified by the chain using\n all of the available features provided to the model plus the predictions\n of models that are earlier in the chain.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.19\n\n Parameters\n ----------\n base_estimator : estimator\n The base estimator from which the classifier chain is built.\n\n order : array-like of shape (n_outputs,) or 'random', default=None\n If `None`, the order will be determined by the order of columns in\n the label matrix Y.::\n\n order = [0, 1, 2, ..., Y.shape[1] - 1]\n\n The order of the chain can be explicitly set by providing a list of\n integers. For example, for a chain of length 5.::\n\n order = [1, 3, 2, 4, 0]\n\n means that the first model in the chain will make predictions for\n column 1 in the Y matrix, the second model will make predictions\n for column 3, etc.\n\n If order is `random` a random ordering will be used.\n\n cv : int, cross-validation generator or an iterable, default=None\n Determines whether to use cross validated predictions or true\n labels for the results of previous estimators in the chain.\n Possible inputs for cv are:\n\n - None, to use true labels when fitting,\n - integer, to specify the number of folds in a (Stratified)KFold,\n - :term:`CV splitter`,\n - An iterable yielding (train, test) splits as arrays of indices.\n\n random_state : int, RandomState instance or None, optional (default=None)\n If ``order='random'``, determines random number generation for the\n chain order.\n In addition, it controls the random seed given at each `base_estimator`\n at each chaining iteration. Thus, it is only used when `base_estimator`\n exposes a `random_state`.\n Pass an int for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n Attributes\n ----------\n classes_ : list\n A list of arrays of length ``len(estimators_)`` containing the\n class labels for each estimator in the chain.\n\n estimators_ : list\n A list of clones of base_estimator.\n\n order_ : list\n The order of labels in the classifier chain.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`. 
Only defined if the\n underlying `base_estimator` exposes such an attribute when fit.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n RegressorChain : Equivalent for regression.\n MultioutputClassifier : Classifies each output independently rather than\n chaining.\n\n References\n ----------\n Jesse Read, Bernhard Pfahringer, Geoff Holmes, Eibe Frank, \"Classifier\n Chains for Multi-label Classification\", 2009.\n\n Examples\n --------\n >>> from sklearn.datasets import make_multilabel_classification\n >>> from sklearn.linear_model import LogisticRegression\n >>> from sklearn.model_selection import train_test_split\n >>> from sklearn.multioutput import ClassifierChain\n >>> X, Y = make_multilabel_classification(\n ... n_samples=12, n_classes=3, random_state=0\n ... )\n >>> X_train, X_test, Y_train, Y_test = train_test_split(\n ... X, Y, random_state=0\n ... )\n >>> base_lr = LogisticRegression(solver='lbfgs', random_state=0)\n >>> chain = ClassifierChain(base_lr, order='random', random_state=0)\n >>> chain.fit(X_train, Y_train).predict(X_test)\n array([[1., 1., 0.],\n [1., 0., 0.],\n [0., 1., 0.]])\n >>> chain.predict_proba(X_test)\n array([[0.8387..., 0.9431..., 0.4576...],\n [0.8878..., 0.3684..., 0.2640...],\n [0.0321..., 0.9935..., 0.0625...]])\n \"\"\"\n \n def fit(self, X, Y):\n \"\"\"Fit the model to data matrix X and targets Y.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input data.\n\n Y : array-like of shape (n_samples, n_classes)\n The target values.\n\n Returns\n -------\n self : object\n Class instance.\n \"\"\"\n super().fit(X, Y)\n self.classes_ = [estimator.classes_ for (chain_idx, estimator) in enumerate(self.estimators_)]\n return self\n \n @_available_if_base_estimator_has('predict_proba')\n def predict_proba(self, X):\n \"\"\"Predict probability estimates.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input data.\n\n Returns\n -------\n Y_prob : array-like of shape (n_samples, n_classes)\n The predicted probabilities.\n \"\"\"\n X = self._validate_data(X, accept_sparse=True, reset=False)\n Y_prob_chain = np.zeros((X.shape[0], len(self.estimators_)))\n Y_pred_chain = np.zeros((X.shape[0], len(self.estimators_)))\n for (chain_idx, estimator) in enumerate(self.estimators_):\n previous_predictions = Y_pred_chain[:, :chain_idx]\n if sp.issparse(X):\n X_aug = sp.hstack((X, previous_predictions))\n else:\n X_aug = np.hstack((X, previous_predictions))\n Y_prob_chain[:, chain_idx] = estimator.predict_proba(X_aug)[:, 1]\n Y_pred_chain[:, chain_idx] = estimator.predict(X_aug)\n inv_order = np.empty_like(self.order_)\n inv_order[self.order_] = np.arange(len(self.order_))\n Y_prob = Y_prob_chain[:, inv_order]\n return Y_prob\n \n @_available_if_base_estimator_has('decision_function')\n def decision_function(self, X):\n \"\"\"Evaluate the decision_function of the models in the chain.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The input data.\n\n Returns\n -------\n Y_decision : array-like of shape (n_samples, n_classes)\n Returns the decision function of the sample for each model\n in the chain.\n \"\"\"\n X = self._validate_data(X, accept_sparse=True, reset=False)\n Y_decision_chain = np.zeros((X.shape[0], len(self.estimators_)))\n 
Y_pred_chain = np.zeros((X.shape[0], len(self.estimators_)))\n for (chain_idx, estimator) in enumerate(self.estimators_):\n previous_predictions = Y_pred_chain[:, :chain_idx]\n if sp.issparse(X):\n X_aug = sp.hstack((X, previous_predictions))\n else:\n X_aug = np.hstack((X, previous_predictions))\n Y_decision_chain[:, chain_idx] = estimator.decision_function(X_aug)\n Y_pred_chain[:, chain_idx] = estimator.predict(X_aug)\n inv_order = np.empty_like(self.order_)\n inv_order[self.order_] = np.arange(len(self.order_))\n Y_decision = Y_decision_chain[:, inv_order]\n return Y_decision\n \n def _more_tags(self):\n return {'_skip_test': True, 'multioutput_only': True}\n" }, @@ -25118,7 +25200,7 @@ "sklearn.multioutput.MultiOutputClassifier._more_tags" ], "is_public": true, - "description": "Multi target classification.\n\nThis strategy consists of fitting one classifier per target. This is a simple strategy for extending classifiers that do not natively support multi-target classification.", + "description": "Multi target classification.\n\nThis strategy consists of fitting one classifier per target. This is a\nsimple strategy for extending classifiers that do not natively support\nmulti-target classification.", "docstring": "Multi target classification.\n\n This strategy consists of fitting one classifier per target. This is a\n simple strategy for extending classifiers that do not natively support\n multi-target classification.\n\n Parameters\n ----------\n estimator : estimator object\n An estimator object implementing :term:`fit`, :term:`score` and\n :term:`predict_proba`.\n\n n_jobs : int or None, optional (default=None)\n The number of jobs to run in parallel.\n :meth:`fit`, :meth:`predict` and :meth:`partial_fit` (if supported\n by the passed estimator) will be parallelized for each target.\n\n When individual estimators are fast to train or predict,\n using ``n_jobs > 1`` can result in slower performance due\n to the parallelism overhead.\n\n ``None`` means `1` unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all available processes / threads.\n See :term:`Glossary ` for more details.\n\n .. versionchanged:: 0.20\n `n_jobs` default changed from `1` to `None`.\n\n Attributes\n ----------\n classes_ : ndarray of shape (n_classes,)\n Class labels.\n\n estimators_ : list of ``n_output`` estimators\n Estimators used for predictions.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`. Only defined if the\n underlying `estimator` exposes such an attribute when fit.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Only defined if the\n underlying estimators expose such an attribute when fit.\n\n .. 
versionadded:: 1.0\n\n See Also\n --------\n ClassifierChain : A multi-label model that arranges binary classifiers\n into a chain.\n MultiOutputRegressor : Fits one regressor per target variable.\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.datasets import make_multilabel_classification\n >>> from sklearn.multioutput import MultiOutputClassifier\n >>> from sklearn.neighbors import KNeighborsClassifier\n >>> X, y = make_multilabel_classification(n_classes=3, random_state=0)\n >>> clf = MultiOutputClassifier(KNeighborsClassifier()).fit(X, y)\n >>> clf.predict(X[-2:])\n array([[1, 1, 0], [1, 1, 1]])\n ", "source_code": "\n\nclass MultiOutputClassifier(ClassifierMixin, _MultiOutputEstimator):\n \"\"\"Multi target classification.\n\n This strategy consists of fitting one classifier per target. This is a\n simple strategy for extending classifiers that do not natively support\n multi-target classification.\n\n Parameters\n ----------\n estimator : estimator object\n An estimator object implementing :term:`fit`, :term:`score` and\n :term:`predict_proba`.\n\n n_jobs : int or None, optional (default=None)\n The number of jobs to run in parallel.\n :meth:`fit`, :meth:`predict` and :meth:`partial_fit` (if supported\n by the passed estimator) will be parallelized for each target.\n\n When individual estimators are fast to train or predict,\n using ``n_jobs > 1`` can result in slower performance due\n to the parallelism overhead.\n\n ``None`` means `1` unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all available processes / threads.\n See :term:`Glossary ` for more details.\n\n .. versionchanged:: 0.20\n `n_jobs` default changed from `1` to `None`.\n\n Attributes\n ----------\n classes_ : ndarray of shape (n_classes,)\n Class labels.\n\n estimators_ : list of ``n_output`` estimators\n Estimators used for predictions.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`. Only defined if the\n underlying `estimator` exposes such an attribute when fit.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Only defined if the\n underlying estimators expose such an attribute when fit.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n ClassifierChain : A multi-label model that arranges binary classifiers\n into a chain.\n MultiOutputRegressor : Fits one regressor per target variable.\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.datasets import make_multilabel_classification\n >>> from sklearn.multioutput import MultiOutputClassifier\n >>> from sklearn.neighbors import KNeighborsClassifier\n >>> X, y = make_multilabel_classification(n_classes=3, random_state=0)\n >>> clf = MultiOutputClassifier(KNeighborsClassifier()).fit(X, y)\n >>> clf.predict(X[-2:])\n array([[1, 1, 0], [1, 1, 1]])\n \"\"\"\n \n def __init__(self, estimator, *, n_jobs=None):\n super().__init__(estimator, n_jobs=n_jobs)\n \n def fit(self, X, Y, sample_weight=None, **fit_params):\n \"\"\"Fit the model to data matrix X and targets Y.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input data.\n\n Y : array-like of shape (n_samples, n_classes)\n The target values.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights. 
If `None`, then samples are equally weighted.\n Only supported if the underlying classifier supports sample\n weights.\n\n **fit_params : dict of string -> object\n Parameters passed to the ``estimator.fit`` method of each step.\n\n .. versionadded:: 0.23\n\n Returns\n -------\n self : object\n Returns a fitted instance.\n \"\"\"\n super().fit(X, Y, sample_weight, **fit_params)\n self.classes_ = [estimator.classes_ for estimator in self.estimators_]\n return self\n \n def _check_predict_proba(self):\n if hasattr(self, 'estimators_'):\n [getattr(est, 'predict_proba') for est in self.estimators_]\n return True\n getattr(self.estimator, 'predict_proba')\n return True\n \n @available_if(_check_predict_proba)\n def predict_proba(self, X):\n \"\"\"Return prediction probabilities for each class of each output.\n\n This method will raise a ``ValueError`` if any of the\n estimators do not have ``predict_proba``.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The input data.\n\n Returns\n -------\n p : array of shape (n_samples, n_classes), or a list of n_outputs such arrays if n_outputs > 1.\n The class probabilities of the input samples. The order of the\n classes corresponds to that in the attribute :term:`classes_`.\n\n .. versionchanged:: 0.19\n This function now returns a list of arrays where the length of\n the list is ``n_outputs``, and each array is (``n_samples``,\n ``n_classes``) for that particular output.\n \"\"\"\n check_is_fitted(self)\n results = [estimator.predict_proba(X) for estimator in self.estimators_]\n return results\n \n def score(self, X, y):\n \"\"\"Return the mean accuracy on the given test data and labels.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Test samples.\n\n y : array-like of shape (n_samples, n_outputs)\n True values for X.\n\n Returns\n -------\n scores : float\n Mean accuracy of predicted target versus true target.\n \"\"\"\n check_is_fitted(self)\n n_outputs_ = len(self.estimators_)\n if y.ndim == 1:\n raise ValueError('y must have at least two dimensions for multi target classification but has only one')\n if y.shape[1] != n_outputs_:\n raise ValueError('The number of outputs of Y for fit {0} and score {1} should be same'.format(n_outputs_, y.shape[1]))\n y_pred = self.predict(X)\n return np.mean(np.all(y == y_pred, axis=1))\n \n def _more_tags(self):\n return {'_skip_test': True}\n" }, @@ -25132,9 +25214,9 @@ "sklearn.multioutput.MultiOutputRegressor.partial_fit" ], "is_public": true, - "description": "Multi target regression.\n\nThis strategy consists of fitting one regressor per target. This is a simple strategy for extending regressors that do not natively support multi-target regression. .. versionadded:: 0.18", - "docstring": "Multi target regression.\n\n This strategy consists of fitting one regressor per target. This is a\n simple strategy for extending regressors that do not natively support\n multi-target regression.\n\n .. 
versionadded:: 0.18\n\n Parameters\n ----------\n estimator : estimator object\n An estimator object implementing :term:`fit` and :term:`predict`.\n\n n_jobs : int or None, optional (default=None)\n The number of jobs to run in parallel.\n :meth:`fit`, :meth:`predict` and :meth:`partial_fit` (if supported\n by the passed estimator) will be parallelized for each target.\n\n When individual estimators are fast to train or predict,\n using ``n_jobs > 1`` can result in slower performance due\n to the parallelism overhead.\n\n ``None`` means `1` unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all available processes / threads.\n See :term:`Glossary ` for more details.\n\n .. versionchanged:: 0.20\n `n_jobs` default changed from `1` to `None`.\n\n Attributes\n ----------\n estimators_ : list of ``n_output`` estimators\n Estimators used for predictions.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`. Only defined if the\n underlying `estimator` exposes such an attribute when fit.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Only defined if the\n underlying estimators expose such an attribute when fit.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n RegressorChain : A multi-label model that arranges regressions into a\n chain.\n MultiOutputClassifier : Classifies each output independently rather than\n chaining.\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.datasets import load_linnerud\n >>> from sklearn.multioutput import MultiOutputRegressor\n >>> from sklearn.linear_model import Ridge\n >>> X, y = load_linnerud(return_X_y=True)\n >>> clf = MultiOutputRegressor(Ridge(random_state=123)).fit(X, y)\n >>> clf.predict(X[[0]])\n array([[176..., 35..., 57...]])\n ", - "source_code": "\n\nclass MultiOutputRegressor(RegressorMixin, _MultiOutputEstimator):\n \"\"\"Multi target regression.\n\n This strategy consists of fitting one regressor per target. This is a\n simple strategy for extending regressors that do not natively support\n multi-target regression.\n\n .. versionadded:: 0.18\n\n Parameters\n ----------\n estimator : estimator object\n An estimator object implementing :term:`fit` and :term:`predict`.\n\n n_jobs : int or None, optional (default=None)\n The number of jobs to run in parallel.\n :meth:`fit`, :meth:`predict` and :meth:`partial_fit` (if supported\n by the passed estimator) will be parallelized for each target.\n\n When individual estimators are fast to train or predict,\n using ``n_jobs > 1`` can result in slower performance due\n to the parallelism overhead.\n\n ``None`` means `1` unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all available processes / threads.\n See :term:`Glossary ` for more details.\n\n .. versionchanged:: 0.20\n `n_jobs` default changed from `1` to `None`.\n\n Attributes\n ----------\n estimators_ : list of ``n_output`` estimators\n Estimators used for predictions.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`. Only defined if the\n underlying `estimator` exposes such an attribute when fit.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Only defined if the\n underlying estimators expose such an attribute when fit.\n\n .. 
versionadded:: 1.0\n\n See Also\n --------\n RegressorChain : A multi-label model that arranges regressions into a\n chain.\n MultiOutputClassifier : Classifies each output independently rather than\n chaining.\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.datasets import load_linnerud\n >>> from sklearn.multioutput import MultiOutputRegressor\n >>> from sklearn.linear_model import Ridge\n >>> X, y = load_linnerud(return_X_y=True)\n >>> clf = MultiOutputRegressor(Ridge(random_state=123)).fit(X, y)\n >>> clf.predict(X[[0]])\n array([[176..., 35..., 57...]])\n \"\"\"\n \n def __init__(self, estimator, *, n_jobs=None):\n super().__init__(estimator, n_jobs=n_jobs)\n \n @_available_if_estimator_has('partial_fit')\n def partial_fit(self, X, y, sample_weight=None):\n \"\"\"Incrementally fit the model to data, for each output variable.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input data.\n\n y : {array-like, sparse matrix} of shape (n_samples, n_outputs)\n Multi-output targets.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights. If `None`, then samples are equally weighted.\n Only supported if the underlying regressor supports sample\n weights.\n\n Returns\n -------\n self : object\n Returns a fitted instance.\n \"\"\"\n super().partial_fit(X, y, sample_weight=sample_weight)\n" + "description": "Multi target regression.\n\nThis strategy consists of fitting one regressor per target. This is a\nsimple strategy for extending regressors that do not natively support\nmulti-target regression.\n\n.. versionadded:: 0.18", + "docstring": "Multi target regression.\n\n This strategy consists of fitting one regressor per target. This is a\n simple strategy for extending regressors that do not natively support\n multi-target regression.\n\n .. versionadded:: 0.18\n\n Parameters\n ----------\n estimator : estimator object\n An estimator object implementing :term:`fit` and :term:`predict`.\n\n n_jobs : int or None, optional (default=None)\n The number of jobs to run in parallel.\n :meth:`fit`, :meth:`predict` and :meth:`partial_fit` (if supported\n by the passed estimator) will be parallelized for each target.\n\n When individual estimators are fast to train or predict,\n using ``n_jobs > 1`` can result in slower performance due\n to the parallelism overhead.\n\n ``None`` means `1` unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all available processes / threads.\n See :term:`Glossary ` for more details.\n\n .. versionchanged:: 0.20\n `n_jobs` default changed from `1` to `None`.\n\n Attributes\n ----------\n estimators_ : list of ``n_output`` estimators\n Estimators used for predictions.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`. Only defined if the\n underlying `estimator` exposes such an attribute when fit.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Only defined if the\n underlying estimators expose such an attribute when fit.\n\n .. 
versionadded:: 1.0\n\n See Also\n --------\n RegressorChain : A multi-label model that arranges regressions into a\n chain.\n MultiOutputClassifier : Classifies each output independently rather than\n chaining.\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.datasets import load_linnerud\n >>> from sklearn.multioutput import MultiOutputRegressor\n >>> from sklearn.linear_model import Ridge\n >>> X, y = load_linnerud(return_X_y=True)\n >>> regr = MultiOutputRegressor(Ridge(random_state=123)).fit(X, y)\n >>> regr.predict(X[[0]])\n array([[176..., 35..., 57...]])\n ", + "source_code": "\n\nclass MultiOutputRegressor(RegressorMixin, _MultiOutputEstimator):\n \"\"\"Multi target regression.\n\n This strategy consists of fitting one regressor per target. This is a\n simple strategy for extending regressors that do not natively support\n multi-target regression.\n\n .. versionadded:: 0.18\n\n Parameters\n ----------\n estimator : estimator object\n An estimator object implementing :term:`fit` and :term:`predict`.\n\n n_jobs : int or None, optional (default=None)\n The number of jobs to run in parallel.\n :meth:`fit`, :meth:`predict` and :meth:`partial_fit` (if supported\n by the passed estimator) will be parallelized for each target.\n\n When individual estimators are fast to train or predict,\n using ``n_jobs > 1`` can result in slower performance due\n to the parallelism overhead.\n\n ``None`` means `1` unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all available processes / threads.\n See :term:`Glossary ` for more details.\n\n .. versionchanged:: 0.20\n `n_jobs` default changed from `1` to `None`.\n\n Attributes\n ----------\n estimators_ : list of ``n_output`` estimators\n Estimators used for predictions.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`. Only defined if the\n underlying `estimator` exposes such an attribute when fit.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Only defined if the\n underlying estimators expose such an attribute when fit.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n RegressorChain : A multi-label model that arranges regressions into a\n chain.\n MultiOutputClassifier : Classifies each output independently rather than\n chaining.\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.datasets import load_linnerud\n >>> from sklearn.multioutput import MultiOutputRegressor\n >>> from sklearn.linear_model import Ridge\n >>> X, y = load_linnerud(return_X_y=True)\n >>> regr = MultiOutputRegressor(Ridge(random_state=123)).fit(X, y)\n >>> regr.predict(X[[0]])\n array([[176..., 35..., 57...]])\n \"\"\"\n \n def __init__(self, estimator, *, n_jobs=None):\n super().__init__(estimator, n_jobs=n_jobs)\n \n @_available_if_estimator_has('partial_fit')\n def partial_fit(self, X, y, sample_weight=None):\n \"\"\"Incrementally fit the model to data, for each output variable.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input data.\n\n y : {array-like, sparse matrix} of shape (n_samples, n_outputs)\n Multi-output targets.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights. 
If `None`, then samples are equally weighted.\n Only supported if the underlying regressor supports sample\n weights.\n\n Returns\n -------\n self : object\n Returns a fitted instance.\n \"\"\"\n super().partial_fit(X, y, sample_weight=sample_weight)\n" }, { "name": "RegressorChain", @@ -25150,7 +25232,7 @@ "sklearn.multioutput.RegressorChain._more_tags" ], "is_public": true, - "description": "A multi-label model that arranges regressions into a chain.\n\nEach model makes a prediction in the order specified by the chain using all of the available features provided to the model plus the predictions of models that are earlier in the chain. Read more in the :ref:`User Guide `. .. versionadded:: 0.20", + "description": "A multi-label model that arranges regressions into a chain.\n\nEach model makes a prediction in the order specified by the chain using\nall of the available features provided to the model plus the predictions\nof models that are earlier in the chain.\n\nRead more in the :ref:`User Guide `.\n\n.. versionadded:: 0.20", "docstring": "A multi-label model that arranges regressions into a chain.\n\n Each model makes a prediction in the order specified by the chain using\n all of the available features provided to the model plus the predictions\n of models that are earlier in the chain.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.20\n\n Parameters\n ----------\n base_estimator : estimator\n The base estimator from which the classifier chain is built.\n\n order : array-like of shape (n_outputs,) or 'random', default=None\n If `None`, the order will be determined by the order of columns in\n the label matrix Y.::\n\n order = [0, 1, 2, ..., Y.shape[1] - 1]\n\n The order of the chain can be explicitly set by providing a list of\n integers. For example, for a chain of length 5.::\n\n order = [1, 3, 2, 4, 0]\n\n means that the first model in the chain will make predictions for\n column 1 in the Y matrix, the second model will make predictions\n for column 3, etc.\n\n If order is 'random' a random ordering will be used.\n\n cv : int, cross-validation generator or an iterable, default=None\n Determines whether to use cross validated predictions or true\n labels for the results of previous estimators in the chain.\n Possible inputs for cv are:\n\n - None, to use true labels when fitting,\n - integer, to specify the number of folds in a (Stratified)KFold,\n - :term:`CV splitter`,\n - An iterable yielding (train, test) splits as arrays of indices.\n\n random_state : int, RandomState instance or None, optional (default=None)\n If ``order='random'``, determines random number generation for the\n chain order.\n In addition, it controls the random seed given at each `base_estimator`\n at each chaining iteration. Thus, it is only used when `base_estimator`\n exposes a `random_state`.\n Pass an int for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n Attributes\n ----------\n estimators_ : list\n A list of clones of base_estimator.\n\n order_ : list\n The order of labels in the classifier chain.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`. Only defined if the\n underlying `base_estimator` exposes such an attribute when fit.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. 
versionadded:: 1.0\n\n See Also\n --------\n ClassifierChain : Equivalent for classification.\n MultiOutputRegressor : Learns each output independently rather than\n chaining.\n\n Examples\n --------\n >>> from sklearn.multioutput import RegressorChain\n >>> from sklearn.linear_model import LogisticRegression\n >>> logreg = LogisticRegression(solver='lbfgs',multi_class='multinomial')\n >>> X, Y = [[1, 0], [0, 1], [1, 1]], [[0, 2], [1, 1], [2, 0]]\n >>> chain = RegressorChain(base_estimator=logreg, order=[0, 1]).fit(X, Y)\n >>> chain.predict(X)\n array([[0., 2.],\n [1., 1.],\n [2., 0.]])\n ", "source_code": "\n\nclass RegressorChain(MetaEstimatorMixin, RegressorMixin, _BaseChain):\n \"\"\"A multi-label model that arranges regressions into a chain.\n\n Each model makes a prediction in the order specified by the chain using\n all of the available features provided to the model plus the predictions\n of models that are earlier in the chain.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.20\n\n Parameters\n ----------\n base_estimator : estimator\n The base estimator from which the classifier chain is built.\n\n order : array-like of shape (n_outputs,) or 'random', default=None\n If `None`, the order will be determined by the order of columns in\n the label matrix Y.::\n\n order = [0, 1, 2, ..., Y.shape[1] - 1]\n\n The order of the chain can be explicitly set by providing a list of\n integers. For example, for a chain of length 5.::\n\n order = [1, 3, 2, 4, 0]\n\n means that the first model in the chain will make predictions for\n column 1 in the Y matrix, the second model will make predictions\n for column 3, etc.\n\n If order is 'random' a random ordering will be used.\n\n cv : int, cross-validation generator or an iterable, default=None\n Determines whether to use cross validated predictions or true\n labels for the results of previous estimators in the chain.\n Possible inputs for cv are:\n\n - None, to use true labels when fitting,\n - integer, to specify the number of folds in a (Stratified)KFold,\n - :term:`CV splitter`,\n - An iterable yielding (train, test) splits as arrays of indices.\n\n random_state : int, RandomState instance or None, optional (default=None)\n If ``order='random'``, determines random number generation for the\n chain order.\n In addition, it controls the random seed given at each `base_estimator`\n at each chaining iteration. Thus, it is only used when `base_estimator`\n exposes a `random_state`.\n Pass an int for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n Attributes\n ----------\n estimators_ : list\n A list of clones of base_estimator.\n\n order_ : list\n The order of labels in the classifier chain.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`. Only defined if the\n underlying `base_estimator` exposes such an attribute when fit.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. 
versionadded:: 1.0\n\n See Also\n --------\n ClassifierChain : Equivalent for classification.\n MultiOutputRegressor : Learns each output independently rather than\n chaining.\n\n Examples\n --------\n >>> from sklearn.multioutput import RegressorChain\n >>> from sklearn.linear_model import LogisticRegression\n >>> logreg = LogisticRegression(solver='lbfgs',multi_class='multinomial')\n >>> X, Y = [[1, 0], [0, 1], [1, 1]], [[0, 2], [1, 1], [2, 0]]\n >>> chain = RegressorChain(base_estimator=logreg, order=[0, 1]).fit(X, Y)\n >>> chain.predict(X)\n array([[0., 2.],\n [1., 1.],\n [2., 0.]])\n \"\"\"\n \n def fit(self, X, Y, **fit_params):\n \"\"\"Fit the model to data matrix X and targets Y.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input data.\n\n Y : array-like of shape (n_samples, n_classes)\n The target values.\n\n **fit_params : dict of string -> object\n Parameters passed to the `fit` method at each step\n of the regressor chain.\n\n .. versionadded:: 0.23\n\n Returns\n -------\n self : object\n Returns a fitted instance.\n \"\"\"\n super().fit(X, Y, **fit_params)\n return self\n \n def _more_tags(self):\n return {'multioutput_only': True}\n" }, @@ -25200,7 +25282,7 @@ "sklearn.naive_bayes.BernoulliNB._joint_log_likelihood" ], "is_public": true, - "description": "Naive Bayes classifier for multivariate Bernoulli models.\n\nLike MultinomialNB, this classifier is suitable for discrete data. The difference is that while MultinomialNB works with occurrence counts, BernoulliNB is designed for binary/boolean features. Read more in the :ref:`User Guide `.", + "description": "Naive Bayes classifier for multivariate Bernoulli models.\n\nLike MultinomialNB, this classifier is suitable for discrete data. The\ndifference is that while MultinomialNB works with occurrence counts,\nBernoulliNB is designed for binary/boolean features.\n\nRead more in the :ref:`User Guide `.", "docstring": "Naive Bayes classifier for multivariate Bernoulli models.\n\n Like MultinomialNB, this classifier is suitable for discrete data. The\n difference is that while MultinomialNB works with occurrence counts,\n BernoulliNB is designed for binary/boolean features.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n alpha : float, default=1.0\n Additive (Laplace/Lidstone) smoothing parameter\n (0 for no smoothing).\n\n binarize : float or None, default=0.0\n Threshold for binarizing (mapping to booleans) of sample features.\n If None, input is presumed to already consist of binary vectors.\n\n fit_prior : bool, default=True\n Whether to learn class prior probabilities or not.\n If false, a uniform prior will be used.\n\n class_prior : array-like of shape (n_classes,), default=None\n Prior probabilities of the classes. If specified the priors are not\n adjusted according to the data.\n\n Attributes\n ----------\n class_count_ : ndarray of shape (n_classes,)\n Number of samples encountered for each class during fitting. This\n value is weighted by the sample weight when provided.\n\n class_log_prior_ : ndarray of shape (n_classes,)\n Log probability of each class (smoothed).\n\n classes_ : ndarray of shape (n_classes,)\n Class labels known to the classifier\n\n coef_ : ndarray of shape (n_classes, n_features)\n Mirrors ``feature_log_prob_`` for interpreting `BernoulliNB`\n as a linear model.\n\n feature_count_ : ndarray of shape (n_classes, n_features)\n Number of samples encountered for each (class, feature)\n during fitting. 
This value is weighted by the sample weight when\n provided.\n\n feature_log_prob_ : ndarray of shape (n_classes, n_features)\n Empirical log probability of features given a class, P(x_i|y).\n\n intercept_ : ndarray of shape (n_classes,)\n Mirrors ``class_log_prior_`` for interpreting `BernoulliNB`\n as a linear model.\n\n n_features_ : int\n Number of features of each sample.\n\n .. deprecated:: 1.0\n Attribute `n_features_` was deprecated in version 1.0 and will be\n removed in 1.2. Use `n_features_in_` instead.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n CategoricalNB : Naive Bayes classifier for categorical features.\n ComplementNB : The Complement Naive Bayes classifier\n described in Rennie et al. (2003).\n GaussianNB : Gaussian Naive Bayes (GaussianNB).\n MultinomialNB : Naive Bayes classifier for multinomial models.\n\n References\n ----------\n C.D. Manning, P. Raghavan and H. Schuetze (2008). Introduction to\n Information Retrieval. Cambridge University Press, pp. 234-265.\n https://nlp.stanford.edu/IR-book/html/htmledition/the-bernoulli-model-1.html\n\n A. McCallum and K. Nigam (1998). A comparison of event models for naive\n Bayes text classification. Proc. AAAI/ICML-98 Workshop on Learning for\n Text Categorization, pp. 41-48.\n\n V. Metsis, I. Androutsopoulos and G. Paliouras (2006). Spam filtering with\n naive Bayes -- Which naive Bayes? 3rd Conf. on Email and Anti-Spam (CEAS).\n\n Examples\n --------\n >>> import numpy as np\n >>> rng = np.random.RandomState(1)\n >>> X = rng.randint(5, size=(6, 100))\n >>> Y = np.array([1, 2, 3, 4, 4, 5])\n >>> from sklearn.naive_bayes import BernoulliNB\n >>> clf = BernoulliNB()\n >>> clf.fit(X, Y)\n BernoulliNB()\n >>> print(clf.predict(X[2:3]))\n [3]\n ", "source_code": "\n\nclass BernoulliNB(_BaseDiscreteNB):\n \"\"\"Naive Bayes classifier for multivariate Bernoulli models.\n\n Like MultinomialNB, this classifier is suitable for discrete data. The\n difference is that while MultinomialNB works with occurrence counts,\n BernoulliNB is designed for binary/boolean features.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n alpha : float, default=1.0\n Additive (Laplace/Lidstone) smoothing parameter\n (0 for no smoothing).\n\n binarize : float or None, default=0.0\n Threshold for binarizing (mapping to booleans) of sample features.\n If None, input is presumed to already consist of binary vectors.\n\n fit_prior : bool, default=True\n Whether to learn class prior probabilities or not.\n If false, a uniform prior will be used.\n\n class_prior : array-like of shape (n_classes,), default=None\n Prior probabilities of the classes. If specified the priors are not\n adjusted according to the data.\n\n Attributes\n ----------\n class_count_ : ndarray of shape (n_classes,)\n Number of samples encountered for each class during fitting. 
This\n value is weighted by the sample weight when provided.\n\n class_log_prior_ : ndarray of shape (n_classes,)\n Log probability of each class (smoothed).\n\n classes_ : ndarray of shape (n_classes,)\n Class labels known to the classifier\n\n coef_ : ndarray of shape (n_classes, n_features)\n Mirrors ``feature_log_prob_`` for interpreting `BernoulliNB`\n as a linear model.\n\n feature_count_ : ndarray of shape (n_classes, n_features)\n Number of samples encountered for each (class, feature)\n during fitting. This value is weighted by the sample weight when\n provided.\n\n feature_log_prob_ : ndarray of shape (n_classes, n_features)\n Empirical log probability of features given a class, P(x_i|y).\n\n intercept_ : ndarray of shape (n_classes,)\n Mirrors ``class_log_prior_`` for interpreting `BernoulliNB`\n as a linear model.\n\n n_features_ : int\n Number of features of each sample.\n\n .. deprecated:: 1.0\n Attribute `n_features_` was deprecated in version 1.0 and will be\n removed in 1.2. Use `n_features_in_` instead.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n CategoricalNB : Naive Bayes classifier for categorical features.\n ComplementNB : The Complement Naive Bayes classifier\n described in Rennie et al. (2003).\n GaussianNB : Gaussian Naive Bayes (GaussianNB).\n MultinomialNB : Naive Bayes classifier for multinomial models.\n\n References\n ----------\n C.D. Manning, P. Raghavan and H. Schuetze (2008). Introduction to\n Information Retrieval. Cambridge University Press, pp. 234-265.\n https://nlp.stanford.edu/IR-book/html/htmledition/the-bernoulli-model-1.html\n\n A. McCallum and K. Nigam (1998). A comparison of event models for naive\n Bayes text classification. Proc. AAAI/ICML-98 Workshop on Learning for\n Text Categorization, pp. 41-48.\n\n V. Metsis, I. Androutsopoulos and G. Paliouras (2006). Spam filtering with\n naive Bayes -- Which naive Bayes? 3rd Conf. 
on Email and Anti-Spam (CEAS).\n\n Examples\n --------\n >>> import numpy as np\n >>> rng = np.random.RandomState(1)\n >>> X = rng.randint(5, size=(6, 100))\n >>> Y = np.array([1, 2, 3, 4, 4, 5])\n >>> from sklearn.naive_bayes import BernoulliNB\n >>> clf = BernoulliNB()\n >>> clf.fit(X, Y)\n BernoulliNB()\n >>> print(clf.predict(X[2:3]))\n [3]\n \"\"\"\n \n def __init__(self, *, alpha=1.0, binarize=0.0, fit_prior=True, class_prior=None):\n self.alpha = alpha\n self.binarize = binarize\n self.fit_prior = fit_prior\n self.class_prior = class_prior\n \n def _check_X(self, X):\n \"\"\"Validate X, used only in predict* methods.\"\"\"\n X = super()._check_X(X)\n if self.binarize is not None:\n X = binarize(X, threshold=self.binarize)\n return X\n \n def _check_X_y(self, X, y, reset=True):\n (X, y) = super()._check_X_y(X, y, reset=reset)\n if self.binarize is not None:\n X = binarize(X, threshold=self.binarize)\n return X, y\n \n def _count(self, X, Y):\n \"\"\"Count and smooth feature occurrences.\"\"\"\n self.feature_count_ += safe_sparse_dot(Y.T, X)\n self.class_count_ += Y.sum(axis=0)\n \n def _update_feature_log_prob(self, alpha):\n \"\"\"Apply smoothing to raw counts and recompute log probabilities\"\"\"\n smoothed_fc = self.feature_count_ + alpha\n smoothed_cc = self.class_count_ + alpha * 2\n self.feature_log_prob_ = np.log(smoothed_fc) - np.log(smoothed_cc.reshape(-1, 1))\n \n def _joint_log_likelihood(self, X):\n \"\"\"Calculate the posterior log probability of the samples X\"\"\"\n n_features = self.feature_log_prob_.shape[1]\n n_features_X = X.shape[1]\n if n_features_X != n_features:\n raise ValueError('Expected input with %d features, got %d instead' % (n_features, n_features_X))\n neg_prob = np.log(1 - np.exp(self.feature_log_prob_))\n jll = safe_sparse_dot(X, (self.feature_log_prob_ - neg_prob).T)\n jll += self.class_log_prior_ + neg_prob.sum(axis=1)\n return jll\n" }, @@ -25223,7 +25305,7 @@ "sklearn.naive_bayes.CategoricalNB._joint_log_likelihood" ], "is_public": true, - "description": "Naive Bayes classifier for categorical features.\n\nThe categorical Naive Bayes classifier is suitable for classification with discrete features that are categorically distributed. The categories of each feature are drawn from a categorical distribution. Read more in the :ref:`User Guide `.", + "description": "Naive Bayes classifier for categorical features.\n\nThe categorical Naive Bayes classifier is suitable for classification with\ndiscrete features that are categorically distributed. The categories of\neach feature are drawn from a categorical distribution.\n\nRead more in the :ref:`User Guide `.", "docstring": "Naive Bayes classifier for categorical features.\n\n The categorical Naive Bayes classifier is suitable for classification with\n discrete features that are categorically distributed. The categories of\n each feature are drawn from a categorical distribution.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n alpha : float, default=1.0\n Additive (Laplace/Lidstone) smoothing parameter\n (0 for no smoothing).\n\n fit_prior : bool, default=True\n Whether to learn class prior probabilities or not.\n If false, a uniform prior will be used.\n\n class_prior : array-like of shape (n_classes,), default=None\n Prior probabilities of the classes. 
If specified the priors are not\n adjusted according to the data.\n\n min_categories : int or array-like of shape (n_features,), default=None\n Minimum number of categories per feature.\n\n - integer: Sets the minimum number of categories per feature to\n `n_categories` for each features.\n - array-like: shape (n_features,) where `n_categories[i]` holds the\n minimum number of categories for the ith column of the input.\n - None (default): Determines the number of categories automatically\n from the training data.\n\n .. versionadded:: 0.24\n\n Attributes\n ----------\n category_count_ : list of arrays of shape (n_features,)\n Holds arrays of shape (n_classes, n_categories of respective feature)\n for each feature. Each array provides the number of samples\n encountered for each class and category of the specific feature.\n\n class_count_ : ndarray of shape (n_classes,)\n Number of samples encountered for each class during fitting. This\n value is weighted by the sample weight when provided.\n\n class_log_prior_ : ndarray of shape (n_classes,)\n Smoothed empirical log probability for each class.\n\n classes_ : ndarray of shape (n_classes,)\n Class labels known to the classifier\n\n feature_log_prob_ : list of arrays of shape (n_features,)\n Holds arrays of shape (n_classes, n_categories of respective feature)\n for each feature. Each array provides the empirical log probability\n of categories given the respective feature and class, ``P(x_i|y)``.\n\n n_features_ : int\n Number of features of each sample.\n\n .. deprecated:: 1.0\n Attribute `n_features_` was deprecated in version 1.0 and will be\n removed in 1.2. Use `n_features_in_` instead.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n n_categories_ : ndarray of shape (n_features,), dtype=np.int64\n Number of categories for each feature. This value is\n inferred from the data or set by the minimum number of categories.\n\n .. versionadded:: 0.24\n\n See Also\n --------\n BernoulliNB : Naive Bayes classifier for multivariate Bernoulli models.\n ComplementNB : Complement Naive Bayes classifier.\n GaussianNB : Gaussian Naive Bayes.\n MultinomialNB : Naive Bayes classifier for multinomial models.\n\n Examples\n --------\n >>> import numpy as np\n >>> rng = np.random.RandomState(1)\n >>> X = rng.randint(5, size=(6, 100))\n >>> y = np.array([1, 2, 3, 4, 5, 6])\n >>> from sklearn.naive_bayes import CategoricalNB\n >>> clf = CategoricalNB()\n >>> clf.fit(X, y)\n CategoricalNB()\n >>> print(clf.predict(X[2:3]))\n [3]\n ", "source_code": "\n\nclass CategoricalNB(_BaseDiscreteNB):\n \"\"\"Naive Bayes classifier for categorical features.\n\n The categorical Naive Bayes classifier is suitable for classification with\n discrete features that are categorically distributed. The categories of\n each feature are drawn from a categorical distribution.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n alpha : float, default=1.0\n Additive (Laplace/Lidstone) smoothing parameter\n (0 for no smoothing).\n\n fit_prior : bool, default=True\n Whether to learn class prior probabilities or not.\n If false, a uniform prior will be used.\n\n class_prior : array-like of shape (n_classes,), default=None\n Prior probabilities of the classes. 
If specified the priors are not\n adjusted according to the data.\n\n min_categories : int or array-like of shape (n_features,), default=None\n Minimum number of categories per feature.\n\n - integer: Sets the minimum number of categories per feature to\n `n_categories` for each features.\n - array-like: shape (n_features,) where `n_categories[i]` holds the\n minimum number of categories for the ith column of the input.\n - None (default): Determines the number of categories automatically\n from the training data.\n\n .. versionadded:: 0.24\n\n Attributes\n ----------\n category_count_ : list of arrays of shape (n_features,)\n Holds arrays of shape (n_classes, n_categories of respective feature)\n for each feature. Each array provides the number of samples\n encountered for each class and category of the specific feature.\n\n class_count_ : ndarray of shape (n_classes,)\n Number of samples encountered for each class during fitting. This\n value is weighted by the sample weight when provided.\n\n class_log_prior_ : ndarray of shape (n_classes,)\n Smoothed empirical log probability for each class.\n\n classes_ : ndarray of shape (n_classes,)\n Class labels known to the classifier\n\n feature_log_prob_ : list of arrays of shape (n_features,)\n Holds arrays of shape (n_classes, n_categories of respective feature)\n for each feature. Each array provides the empirical log probability\n of categories given the respective feature and class, ``P(x_i|y)``.\n\n n_features_ : int\n Number of features of each sample.\n\n .. deprecated:: 1.0\n Attribute `n_features_` was deprecated in version 1.0 and will be\n removed in 1.2. Use `n_features_in_` instead.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n n_categories_ : ndarray of shape (n_features,), dtype=np.int64\n Number of categories for each feature. This value is\n inferred from the data or set by the minimum number of categories.\n\n .. versionadded:: 0.24\n\n See Also\n --------\n BernoulliNB : Naive Bayes classifier for multivariate Bernoulli models.\n ComplementNB : Complement Naive Bayes classifier.\n GaussianNB : Gaussian Naive Bayes.\n MultinomialNB : Naive Bayes classifier for multinomial models.\n\n Examples\n --------\n >>> import numpy as np\n >>> rng = np.random.RandomState(1)\n >>> X = rng.randint(5, size=(6, 100))\n >>> y = np.array([1, 2, 3, 4, 5, 6])\n >>> from sklearn.naive_bayes import CategoricalNB\n >>> clf = CategoricalNB()\n >>> clf.fit(X, y)\n CategoricalNB()\n >>> print(clf.predict(X[2:3]))\n [3]\n \"\"\"\n \n def __init__(self, *, alpha=1.0, fit_prior=True, class_prior=None, min_categories=None):\n self.alpha = alpha\n self.fit_prior = fit_prior\n self.class_prior = class_prior\n self.min_categories = min_categories\n \n def fit(self, X, y, sample_weight=None):\n \"\"\"Fit Naive Bayes classifier according to X, y.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training vectors, where `n_samples` is the number of samples and\n `n_features` is the number of features. Here, each feature of X is\n assumed to be from a different categorical distribution.\n It is further assumed that all categories of each feature are\n represented by the numbers 0, ..., n - 1, where n refers to the\n total number of categories for the given feature. 
This can, for\n instance, be achieved with the help of OrdinalEncoder.\n\n y : array-like of shape (n_samples,)\n Target values.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Weights applied to individual samples (1. for unweighted).\n\n Returns\n -------\n self : object\n Returns the instance itself.\n \"\"\"\n return super().fit(X, y, sample_weight=sample_weight)\n \n def partial_fit(self, X, y, classes=None, sample_weight=None):\n \"\"\"Incremental fit on a batch of samples.\n\n This method is expected to be called several times consecutively\n on different chunks of a dataset so as to implement out-of-core\n or online learning.\n\n This is especially useful when the whole dataset is too big to fit in\n memory at once.\n\n This method has some performance overhead hence it is better to call\n partial_fit on chunks of data that are as large as possible\n (as long as fitting in the memory budget) to hide the overhead.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training vectors, where `n_samples` is the number of samples and\n `n_features` is the number of features. Here, each feature of X is\n assumed to be from a different categorical distribution.\n It is further assumed that all categories of each feature are\n represented by the numbers 0, ..., n - 1, where n refers to the\n total number of categories for the given feature. This can, for\n instance, be achieved with the help of OrdinalEncoder.\n\n y : array-like of shape (n_samples,)\n Target values.\n\n classes : array-like of shape (n_classes,), default=None\n List of all the classes that can possibly appear in the y vector.\n\n Must be provided at the first call to partial_fit, can be omitted\n in subsequent calls.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Weights applied to individual samples (1. for unweighted).\n\n Returns\n -------\n self : object\n Returns the instance itself.\n \"\"\"\n return super().partial_fit(X, y, classes, sample_weight=sample_weight)\n \n def _more_tags(self):\n return {'requires_positive_X': True}\n \n def _check_X(self, X):\n \"\"\"Validate X, used only in predict* methods.\"\"\"\n X = self._validate_data(X, dtype='int', accept_sparse=False, force_all_finite=True, reset=False)\n check_non_negative(X, 'CategoricalNB (input X)')\n return X\n \n def _check_X_y(self, X, y, reset=True):\n (X, y) = self._validate_data(X, y, dtype='int', accept_sparse=False, force_all_finite=True, reset=reset)\n check_non_negative(X, 'CategoricalNB (input X)')\n return X, y\n \n def _init_counters(self, n_classes, n_features):\n self.class_count_ = np.zeros(n_classes, dtype=np.float64)\n self.category_count_ = [np.zeros((n_classes, 0)) for _ in range(n_features)]\n \n @staticmethod\n def _validate_n_categories(X, min_categories):\n n_categories_X = X.max(axis=0) + 1\n min_categories_ = np.array(min_categories)\n if min_categories is not None:\n if not np.issubdtype(min_categories_.dtype, np.signedinteger):\n raise ValueError(f\"'min_categories' should have integral type. Got {min_categories_.dtype} instead.\")\n n_categories_ = np.maximum(n_categories_X, min_categories_, dtype=np.int64)\n if n_categories_.shape != n_categories_X.shape:\n raise ValueError(f\"'min_categories' should have shape ({X.shape[1]},) when an array-like is provided. 
Got {min_categories_.shape} instead.\")\n return n_categories_\n else:\n return n_categories_X\n \n def _count(self, X, Y):\n \n def _update_cat_count_dims(cat_count, highest_feature):\n diff = highest_feature + 1 - cat_count.shape[1]\n if diff > 0:\n return np.pad(cat_count, [(0, 0), (0, diff)], 'constant')\n return cat_count\n \n def _update_cat_count(X_feature, Y, cat_count, n_classes):\n for j in range(n_classes):\n mask = Y[:, j].astype(bool)\n if Y.dtype.type == np.int64:\n weights = None\n else:\n weights = Y[mask, j]\n counts = np.bincount(X_feature[mask], weights=weights)\n indices = np.nonzero(counts)[0]\n cat_count[j, indices] += counts[indices]\n self.class_count_ += Y.sum(axis=0)\n self.n_categories_ = self._validate_n_categories(X, self.min_categories)\n for i in range(self.n_features_in_):\n X_feature = X[:, i]\n self.category_count_[i] = _update_cat_count_dims(self.category_count_[i], self.n_categories_[i] - 1)\n _update_cat_count(X_feature, Y, self.category_count_[i], self.class_count_.shape[0])\n \n def _update_feature_log_prob(self, alpha):\n feature_log_prob = []\n for i in range(self.n_features_in_):\n smoothed_cat_count = self.category_count_[i] + alpha\n smoothed_class_count = smoothed_cat_count.sum(axis=1)\n feature_log_prob.append(np.log(smoothed_cat_count) - np.log(smoothed_class_count.reshape(-1, 1)))\n self.feature_log_prob_ = feature_log_prob\n \n def _joint_log_likelihood(self, X):\n self._check_n_features(X, reset=False)\n jll = np.zeros((X.shape[0], self.class_count_.shape[0]))\n for i in range(self.n_features_in_):\n indices = X[:, i]\n jll += self.feature_log_prob_[i][:, indices].T\n total_ll = jll + self.class_log_prior_\n return total_ll\n" }, @@ -25240,7 +25322,7 @@ "sklearn.naive_bayes.ComplementNB._joint_log_likelihood" ], "is_public": true, - "description": "The Complement Naive Bayes classifier described in Rennie et al. (2003).\n\nThe Complement Naive Bayes classifier was designed to correct the \"severe assumptions\" made by the standard Multinomial Naive Bayes classifier. It is particularly suited for imbalanced data sets. Read more in the :ref:`User Guide `. .. versionadded:: 0.20", + "description": "The Complement Naive Bayes classifier described in Rennie et al. (2003).\n\nThe Complement Naive Bayes classifier was designed to correct the \"severe\nassumptions\" made by the standard Multinomial Naive Bayes classifier. It is\nparticularly suited for imbalanced data sets.\n\nRead more in the :ref:`User Guide `.\n\n.. versionadded:: 0.20", "docstring": "The Complement Naive Bayes classifier described in Rennie et al. (2003).\n\n The Complement Naive Bayes classifier was designed to correct the \"severe\n assumptions\" made by the standard Multinomial Naive Bayes classifier. It is\n particularly suited for imbalanced data sets.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.20\n\n Parameters\n ----------\n alpha : float, default=1.0\n Additive (Laplace/Lidstone) smoothing parameter (0 for no smoothing).\n\n fit_prior : bool, default=True\n Only used in edge case with a single class in the training set.\n\n class_prior : array-like of shape (n_classes,), default=None\n Prior probabilities of the classes. Not used.\n\n norm : bool, default=False\n Whether or not a second normalization of the weights is performed. 
The\n default behavior mirrors the implementations found in Mahout and Weka,\n which do not follow the full algorithm described in Table 9 of the\n paper.\n\n Attributes\n ----------\n class_count_ : ndarray of shape (n_classes,)\n Number of samples encountered for each class during fitting. This\n value is weighted by the sample weight when provided.\n\n class_log_prior_ : ndarray of shape (n_classes,)\n Smoothed empirical log probability for each class. Only used in edge\n case with a single class in the training set.\n\n classes_ : ndarray of shape (n_classes,)\n Class labels known to the classifier\n\n coef_ : ndarray of shape (n_classes, n_features)\n Mirrors ``feature_log_prob_`` for interpreting `ComplementNB`\n as a linear model.\n\n .. deprecated:: 0.24\n ``coef_`` is deprecated in 0.24 and will be removed in 1.1\n (renaming of 0.26).\n\n feature_all_ : ndarray of shape (n_features,)\n Number of samples encountered for each feature during fitting. This\n value is weighted by the sample weight when provided.\n\n feature_count_ : ndarray of shape (n_classes, n_features)\n Number of samples encountered for each (class, feature) during fitting.\n This value is weighted by the sample weight when provided.\n\n feature_log_prob_ : ndarray of shape (n_classes, n_features)\n Empirical weights for class complements.\n\n intercept_ : ndarray of shape (n_classes,)\n Mirrors ``class_log_prior_`` for interpreting `ComplementNB`\n as a linear model.\n\n .. deprecated:: 0.24\n ``coef_`` is deprecated in 0.24 and will be removed in 1.1\n (renaming of 0.26).\n\n n_features_ : int\n Number of features of each sample.\n\n .. deprecated:: 1.0\n Attribute `n_features_` was deprecated in version 1.0 and will be\n removed in 1.2. Use `n_features_in_` instead.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n BernoulliNB : Naive Bayes classifier for multivariate Bernoulli models.\n CategoricalNB : Naive Bayes classifier for categorical features.\n GaussianNB : Gaussian Naive Bayes.\n MultinomialNB : Naive Bayes classifier for multinomial models.\n\n References\n ----------\n Rennie, J. D., Shih, L., Teevan, J., & Karger, D. R. (2003).\n Tackling the poor assumptions of naive bayes text classifiers. In ICML\n (Vol. 3, pp. 616-623).\n https://people.csail.mit.edu/jrennie/papers/icml03-nb.pdf\n\n Examples\n --------\n >>> import numpy as np\n >>> rng = np.random.RandomState(1)\n >>> X = rng.randint(5, size=(6, 100))\n >>> y = np.array([1, 2, 3, 4, 5, 6])\n >>> from sklearn.naive_bayes import ComplementNB\n >>> clf = ComplementNB()\n >>> clf.fit(X, y)\n ComplementNB()\n >>> print(clf.predict(X[2:3]))\n [3]\n ", "source_code": "\n\nclass ComplementNB(_BaseDiscreteNB):\n \"\"\"The Complement Naive Bayes classifier described in Rennie et al. (2003).\n\n The Complement Naive Bayes classifier was designed to correct the \"severe\n assumptions\" made by the standard Multinomial Naive Bayes classifier. It is\n particularly suited for imbalanced data sets.\n\n Read more in the :ref:`User Guide `.\n\n .. 
versionadded:: 0.20\n\n Parameters\n ----------\n alpha : float, default=1.0\n Additive (Laplace/Lidstone) smoothing parameter (0 for no smoothing).\n\n fit_prior : bool, default=True\n Only used in edge case with a single class in the training set.\n\n class_prior : array-like of shape (n_classes,), default=None\n Prior probabilities of the classes. Not used.\n\n norm : bool, default=False\n Whether or not a second normalization of the weights is performed. The\n default behavior mirrors the implementations found in Mahout and Weka,\n which do not follow the full algorithm described in Table 9 of the\n paper.\n\n Attributes\n ----------\n class_count_ : ndarray of shape (n_classes,)\n Number of samples encountered for each class during fitting. This\n value is weighted by the sample weight when provided.\n\n class_log_prior_ : ndarray of shape (n_classes,)\n Smoothed empirical log probability for each class. Only used in edge\n case with a single class in the training set.\n\n classes_ : ndarray of shape (n_classes,)\n Class labels known to the classifier\n\n coef_ : ndarray of shape (n_classes, n_features)\n Mirrors ``feature_log_prob_`` for interpreting `ComplementNB`\n as a linear model.\n\n .. deprecated:: 0.24\n ``coef_`` is deprecated in 0.24 and will be removed in 1.1\n (renaming of 0.26).\n\n feature_all_ : ndarray of shape (n_features,)\n Number of samples encountered for each feature during fitting. This\n value is weighted by the sample weight when provided.\n\n feature_count_ : ndarray of shape (n_classes, n_features)\n Number of samples encountered for each (class, feature) during fitting.\n This value is weighted by the sample weight when provided.\n\n feature_log_prob_ : ndarray of shape (n_classes, n_features)\n Empirical weights for class complements.\n\n intercept_ : ndarray of shape (n_classes,)\n Mirrors ``class_log_prior_`` for interpreting `ComplementNB`\n as a linear model.\n\n .. deprecated:: 0.24\n ``coef_`` is deprecated in 0.24 and will be removed in 1.1\n (renaming of 0.26).\n\n n_features_ : int\n Number of features of each sample.\n\n .. deprecated:: 1.0\n Attribute `n_features_` was deprecated in version 1.0 and will be\n removed in 1.2. Use `n_features_in_` instead.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n BernoulliNB : Naive Bayes classifier for multivariate Bernoulli models.\n CategoricalNB : Naive Bayes classifier for categorical features.\n GaussianNB : Gaussian Naive Bayes.\n MultinomialNB : Naive Bayes classifier for multinomial models.\n\n References\n ----------\n Rennie, J. D., Shih, L., Teevan, J., & Karger, D. R. (2003).\n Tackling the poor assumptions of naive bayes text classifiers. In ICML\n (Vol. 3, pp. 
616-623).\n https://people.csail.mit.edu/jrennie/papers/icml03-nb.pdf\n\n Examples\n --------\n >>> import numpy as np\n >>> rng = np.random.RandomState(1)\n >>> X = rng.randint(5, size=(6, 100))\n >>> y = np.array([1, 2, 3, 4, 5, 6])\n >>> from sklearn.naive_bayes import ComplementNB\n >>> clf = ComplementNB()\n >>> clf.fit(X, y)\n ComplementNB()\n >>> print(clf.predict(X[2:3]))\n [3]\n \"\"\"\n \n def __init__(self, *, alpha=1.0, fit_prior=True, class_prior=None, norm=False):\n self.alpha = alpha\n self.fit_prior = fit_prior\n self.class_prior = class_prior\n self.norm = norm\n \n def _more_tags(self):\n return {'requires_positive_X': True}\n \n def _count(self, X, Y):\n \"\"\"Count feature occurrences.\"\"\"\n check_non_negative(X, 'ComplementNB (input X)')\n self.feature_count_ += safe_sparse_dot(Y.T, X)\n self.class_count_ += Y.sum(axis=0)\n self.feature_all_ = self.feature_count_.sum(axis=0)\n \n def _update_feature_log_prob(self, alpha):\n \"\"\"Apply smoothing to raw counts and compute the weights.\"\"\"\n comp_count = self.feature_all_ + alpha - self.feature_count_\n logged = np.log(comp_count / comp_count.sum(axis=1, keepdims=True))\n if self.norm:\n summed = logged.sum(axis=1, keepdims=True)\n feature_log_prob = logged / summed\n else:\n feature_log_prob = -logged\n self.feature_log_prob_ = feature_log_prob\n \n def _joint_log_likelihood(self, X):\n \"\"\"Calculate the class scores for the samples in X.\"\"\"\n jll = safe_sparse_dot(X, self.feature_log_prob_.T)\n if len(self.classes_) == 1:\n jll += self.class_log_prior_\n return jll\n" }, @@ -25260,7 +25342,7 @@ "sklearn.naive_bayes.GaussianNB.sigma_@getter" ], "is_public": true, - "description": "Gaussian Naive Bayes (GaussianNB).\n\nCan perform online updates to model parameters via :meth:`partial_fit`. For details on algorithm used to update feature means and variance online, see Stanford CS tech report STAN-CS-79-773 by Chan, Golub, and LeVeque: http://i.stanford.edu/pub/cstr/reports/cs/tr/79/773/CS-TR-79-773.pdf Read more in the :ref:`User Guide `.", + "description": "Gaussian Naive Bayes (GaussianNB).\n\nCan perform online updates to model parameters via :meth:`partial_fit`.\nFor details on algorithm used to update feature means and variance online,\nsee Stanford CS tech report STAN-CS-79-773 by Chan, Golub, and LeVeque:\n\n http://i.stanford.edu/pub/cstr/reports/cs/tr/79/773/CS-TR-79-773.pdf\n\nRead more in the :ref:`User Guide `.", "docstring": "\n Gaussian Naive Bayes (GaussianNB).\n\n Can perform online updates to model parameters via :meth:`partial_fit`.\n For details on algorithm used to update feature means and variance online,\n see Stanford CS tech report STAN-CS-79-773 by Chan, Golub, and LeVeque:\n\n http://i.stanford.edu/pub/cstr/reports/cs/tr/79/773/CS-TR-79-773.pdf\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n priors : array-like of shape (n_classes,)\n Prior probabilities of the classes. If specified the priors are not\n adjusted according to the data.\n\n var_smoothing : float, default=1e-9\n Portion of the largest variance of all features that is added to\n variances for calculation stability.\n\n .. 
versionadded:: 0.20\n\n Attributes\n ----------\n class_count_ : ndarray of shape (n_classes,)\n number of training samples observed in each class.\n\n class_prior_ : ndarray of shape (n_classes,)\n probability of each class.\n\n classes_ : ndarray of shape (n_classes,)\n class labels known to the classifier.\n\n epsilon_ : float\n absolute additive value to variances.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n sigma_ : ndarray of shape (n_classes, n_features)\n Variance of each feature per class.\n\n .. deprecated:: 1.0\n `sigma_` is deprecated in 1.0 and will be removed in 1.2.\n Use `var_` instead.\n\n var_ : ndarray of shape (n_classes, n_features)\n Variance of each feature per class.\n\n .. versionadded:: 1.0\n\n theta_ : ndarray of shape (n_classes, n_features)\n mean of each feature per class.\n\n See Also\n --------\n BernoulliNB : Naive Bayes classifier for multivariate Bernoulli models.\n CategoricalNB : Naive Bayes classifier for categorical features.\n ComplementNB : Complement Naive Bayes classifier.\n MultinomialNB : Naive Bayes classifier for multinomial models.\n\n Examples\n --------\n >>> import numpy as np\n >>> X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])\n >>> Y = np.array([1, 1, 1, 2, 2, 2])\n >>> from sklearn.naive_bayes import GaussianNB\n >>> clf = GaussianNB()\n >>> clf.fit(X, Y)\n GaussianNB()\n >>> print(clf.predict([[-0.8, -1]]))\n [1]\n >>> clf_pf = GaussianNB()\n >>> clf_pf.partial_fit(X, Y, np.unique(Y))\n GaussianNB()\n >>> print(clf_pf.predict([[-0.8, -1]]))\n [1]\n ", "source_code": "\n\nclass GaussianNB(_BaseNB):\n \"\"\"\n Gaussian Naive Bayes (GaussianNB).\n\n Can perform online updates to model parameters via :meth:`partial_fit`.\n For details on algorithm used to update feature means and variance online,\n see Stanford CS tech report STAN-CS-79-773 by Chan, Golub, and LeVeque:\n\n http://i.stanford.edu/pub/cstr/reports/cs/tr/79/773/CS-TR-79-773.pdf\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n priors : array-like of shape (n_classes,)\n Prior probabilities of the classes. If specified the priors are not\n adjusted according to the data.\n\n var_smoothing : float, default=1e-9\n Portion of the largest variance of all features that is added to\n variances for calculation stability.\n\n .. versionadded:: 0.20\n\n Attributes\n ----------\n class_count_ : ndarray of shape (n_classes,)\n number of training samples observed in each class.\n\n class_prior_ : ndarray of shape (n_classes,)\n probability of each class.\n\n classes_ : ndarray of shape (n_classes,)\n class labels known to the classifier.\n\n epsilon_ : float\n absolute additive value to variances.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n sigma_ : ndarray of shape (n_classes, n_features)\n Variance of each feature per class.\n\n .. deprecated:: 1.0\n `sigma_` is deprecated in 1.0 and will be removed in 1.2.\n Use `var_` instead.\n\n var_ : ndarray of shape (n_classes, n_features)\n Variance of each feature per class.\n\n .. 
versionadded:: 1.0\n\n theta_ : ndarray of shape (n_classes, n_features)\n mean of each feature per class.\n\n See Also\n --------\n BernoulliNB : Naive Bayes classifier for multivariate Bernoulli models.\n CategoricalNB : Naive Bayes classifier for categorical features.\n ComplementNB : Complement Naive Bayes classifier.\n MultinomialNB : Naive Bayes classifier for multinomial models.\n\n Examples\n --------\n >>> import numpy as np\n >>> X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])\n >>> Y = np.array([1, 1, 1, 2, 2, 2])\n >>> from sklearn.naive_bayes import GaussianNB\n >>> clf = GaussianNB()\n >>> clf.fit(X, Y)\n GaussianNB()\n >>> print(clf.predict([[-0.8, -1]]))\n [1]\n >>> clf_pf = GaussianNB()\n >>> clf_pf.partial_fit(X, Y, np.unique(Y))\n GaussianNB()\n >>> print(clf_pf.predict([[-0.8, -1]]))\n [1]\n \"\"\"\n \n def __init__(self, *, priors=None, var_smoothing=1e-09):\n self.priors = priors\n self.var_smoothing = var_smoothing\n \n def fit(self, X, y, sample_weight=None):\n \"\"\"Fit Gaussian Naive Bayes according to X, y.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training vectors, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n y : array-like of shape (n_samples,)\n Target values.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Weights applied to individual samples (1. for unweighted).\n\n .. versionadded:: 0.17\n Gaussian Naive Bayes supports fitting with *sample_weight*.\n\n Returns\n -------\n self : object\n Returns the instance itself.\n \"\"\"\n y = self._validate_data(y=y)\n return self._partial_fit(X, y, np.unique(y), _refit=True, sample_weight=sample_weight)\n \n def _check_X(self, X):\n \"\"\"Validate X, used only in predict* methods.\"\"\"\n return self._validate_data(X, reset=False)\n \n @staticmethod\n def _update_mean_variance(n_past, mu, var, X, sample_weight=None):\n \"\"\"Compute online update of Gaussian mean and variance.\n\n Given starting sample count, mean, and variance, a new set of\n points X, and optionally sample weights, return the updated mean and\n variance. (NB - each dimension (column) in X is treated as independent\n -- you get variance, not covariance).\n\n Can take scalar mean and variance, or vector mean and variance to\n simultaneously update a number of independent Gaussians.\n\n See Stanford CS tech report STAN-CS-79-773 by Chan, Golub, and LeVeque:\n\n http://i.stanford.edu/pub/cstr/reports/cs/tr/79/773/CS-TR-79-773.pdf\n\n Parameters\n ----------\n n_past : int\n Number of samples represented in old mean and variance. If sample\n weights were given, this should contain the sum of sample\n weights represented in old mean and variance.\n\n mu : array-like of shape (number of Gaussians,)\n Means for Gaussians in original set.\n\n var : array-like of shape (number of Gaussians,)\n Variances for Gaussians in original set.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Weights applied to individual samples (1. 
for unweighted).\n\n Returns\n -------\n total_mu : array-like of shape (number of Gaussians,)\n Updated mean for each Gaussian over the combined set.\n\n total_var : array-like of shape (number of Gaussians,)\n Updated variance for each Gaussian over the combined set.\n \"\"\"\n if X.shape[0] == 0:\n return mu, var\n if sample_weight is not None:\n n_new = float(sample_weight.sum())\n new_mu = np.average(X, axis=0, weights=sample_weight)\n new_var = np.average((X - new_mu)**2, axis=0, weights=sample_weight)\n else:\n n_new = X.shape[0]\n new_var = np.var(X, axis=0)\n new_mu = np.mean(X, axis=0)\n if n_past == 0:\n return new_mu, new_var\n n_total = float(n_past + n_new)\n total_mu = (n_new * new_mu + n_past * mu) / n_total\n old_ssd = n_past * var\n new_ssd = n_new * new_var\n total_ssd = old_ssd + new_ssd + n_new * n_past / n_total * (mu - new_mu)**2\n total_var = total_ssd / n_total\n return total_mu, total_var\n \n def partial_fit(self, X, y, classes=None, sample_weight=None):\n \"\"\"Incremental fit on a batch of samples.\n\n This method is expected to be called several times consecutively\n on different chunks of a dataset so as to implement out-of-core\n or online learning.\n\n This is especially useful when the whole dataset is too big to fit in\n memory at once.\n\n This method has some performance and numerical stability overhead,\n hence it is better to call partial_fit on chunks of data that are\n as large as possible (as long as fitting in the memory budget) to\n hide the overhead.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training vectors, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\n y : array-like of shape (n_samples,)\n Target values.\n\n classes : array-like of shape (n_classes,), default=None\n List of all the classes that can possibly appear in the y vector.\n\n Must be provided at the first call to partial_fit, can be omitted\n in subsequent calls.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Weights applied to individual samples (1. for unweighted).\n\n .. versionadded:: 0.17\n\n Returns\n -------\n self : object\n Returns the instance itself.\n \"\"\"\n return self._partial_fit(X, y, classes, _refit=False, sample_weight=sample_weight)\n \n def _partial_fit(self, X, y, classes=None, _refit=False, sample_weight=None):\n \"\"\"Actual implementation of Gaussian NB fitting.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training vectors, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\n y : array-like of shape (n_samples,)\n Target values.\n\n classes : array-like of shape (n_classes,), default=None\n List of all the classes that can possibly appear in the y vector.\n\n Must be provided at the first call to partial_fit, can be omitted\n in subsequent calls.\n\n _refit : bool, default=False\n If true, act as though this were the first time we called\n _partial_fit (ie, throw away any past fitting and start over).\n\n sample_weight : array-like of shape (n_samples,), default=None\n Weights applied to individual samples (1. 
for unweighted).\n\n Returns\n -------\n self : object\n \"\"\"\n if _refit:\n self.classes_ = None\n first_call = _check_partial_fit_first_call(self, classes)\n (X, y) = self._validate_data(X, y, reset=first_call)\n if sample_weight is not None:\n sample_weight = _check_sample_weight(sample_weight, X)\n self.epsilon_ = self.var_smoothing * np.var(X, axis=0).max()\n if first_call:\n n_features = X.shape[1]\n n_classes = len(self.classes_)\n self.theta_ = np.zeros((n_classes, n_features))\n self.var_ = np.zeros((n_classes, n_features))\n self.class_count_ = np.zeros(n_classes, dtype=np.float64)\n if self.priors is not None:\n priors = np.asarray(self.priors)\n if len(priors) != n_classes:\n raise ValueError('Number of priors must match number of classes.')\n if not np.isclose(priors.sum(), 1.0):\n raise ValueError('The sum of the priors should be 1.')\n if (priors < 0).any():\n raise ValueError('Priors must be non-negative.')\n self.class_prior_ = priors\n else:\n self.class_prior_ = np.zeros(len(self.classes_), dtype=np.float64)\n else:\n if X.shape[1] != self.theta_.shape[1]:\n msg = 'Number of features %d does not match previous data %d.'\n raise ValueError(msg % (X.shape[1], self.theta_.shape[1]))\n self.var_[:, :] -= self.epsilon_\n classes = self.classes_\n unique_y = np.unique(y)\n unique_y_in_classes = np.in1d(unique_y, classes)\n if not np.all(unique_y_in_classes):\n raise ValueError('The target label(s) %s in y do not exist in the initial classes %s' % (unique_y[~unique_y_in_classes], classes))\n for y_i in unique_y:\n i = classes.searchsorted(y_i)\n X_i = X[y == y_i, :]\n if sample_weight is not None:\n sw_i = sample_weight[y == y_i]\n N_i = sw_i.sum()\n else:\n sw_i = None\n N_i = X_i.shape[0]\n (new_theta, new_sigma) = self._update_mean_variance(self.class_count_[i], self.theta_[i, :], self.var_[i, :], X_i, sw_i)\n self.theta_[i, :] = new_theta\n self.var_[i, :] = new_sigma\n self.class_count_[i] += N_i\n self.var_[:, :] += self.epsilon_\n if self.priors is None:\n self.class_prior_ = self.class_count_ / self.class_count_.sum()\n return self\n \n def _joint_log_likelihood(self, X):\n joint_log_likelihood = []\n for i in range(np.size(self.classes_)):\n jointi = np.log(self.class_prior_[i])\n n_ij = -0.5 * np.sum(np.log(2.0 * np.pi * self.var_[i, :]))\n n_ij -= 0.5 * np.sum((X - self.theta_[i, :])**2 / self.var_[i, :], 1)\n joint_log_likelihood.append(jointi + n_ij)\n joint_log_likelihood = np.array(joint_log_likelihood).T\n return joint_log_likelihood\n \n @deprecated('Attribute `sigma_` was deprecated in 1.0 and will be removed in1.2. Use `var_` instead.')\n @property\n def sigma_(self):\n return self.var_\n" }, @@ -25277,7 +25359,7 @@ "sklearn.naive_bayes.MultinomialNB._joint_log_likelihood" ], "is_public": true, - "description": "Naive Bayes classifier for multinomial models.\n\nThe multinomial Naive Bayes classifier is suitable for classification with discrete features (e.g., word counts for text classification). The multinomial distribution normally requires integer feature counts. However, in practice, fractional counts such as tf-idf may also work. Read more in the :ref:`User Guide `.", + "description": "Naive Bayes classifier for multinomial models.\n\nThe multinomial Naive Bayes classifier is suitable for classification with\ndiscrete features (e.g., word counts for text classification). The\nmultinomial distribution normally requires integer feature counts. 
However,\nin practice, fractional counts such as tf-idf may also work.\n\nRead more in the :ref:`User Guide `.", "docstring": "\n Naive Bayes classifier for multinomial models.\n\n The multinomial Naive Bayes classifier is suitable for classification with\n discrete features (e.g., word counts for text classification). The\n multinomial distribution normally requires integer feature counts. However,\n in practice, fractional counts such as tf-idf may also work.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n alpha : float, default=1.0\n Additive (Laplace/Lidstone) smoothing parameter\n (0 for no smoothing).\n\n fit_prior : bool, default=True\n Whether to learn class prior probabilities or not.\n If false, a uniform prior will be used.\n\n class_prior : array-like of shape (n_classes,), default=None\n Prior probabilities of the classes. If specified the priors are not\n adjusted according to the data.\n\n Attributes\n ----------\n class_count_ : ndarray of shape (n_classes,)\n Number of samples encountered for each class during fitting. This\n value is weighted by the sample weight when provided.\n\n class_log_prior_ : ndarray of shape (n_classes,)\n Smoothed empirical log probability for each class.\n\n classes_ : ndarray of shape (n_classes,)\n Class labels known to the classifier\n\n coef_ : ndarray of shape (n_classes, n_features)\n Mirrors ``feature_log_prob_`` for interpreting `MultinomialNB`\n as a linear model.\n\n .. deprecated:: 0.24\n ``coef_`` is deprecated in 0.24 and will be removed in 1.1\n (renaming of 0.26).\n\n feature_count_ : ndarray of shape (n_classes, n_features)\n Number of samples encountered for each (class, feature)\n during fitting. This value is weighted by the sample weight when\n provided.\n\n feature_log_prob_ : ndarray of shape (n_classes, n_features)\n Empirical log probability of features\n given a class, ``P(x_i|y)``.\n\n intercept_ : ndarray of shape (n_classes,)\n Mirrors ``class_log_prior_`` for interpreting `MultinomialNB`\n as a linear model.\n\n .. deprecated:: 0.24\n ``intercept_`` is deprecated in 0.24 and will be removed in 1.1\n (renaming of 0.26).\n\n n_features_ : int\n Number of features of each sample.\n\n .. deprecated:: 1.0\n Attribute `n_features_` was deprecated in version 1.0 and will be\n removed in 1.2. Use `n_features_in_` instead.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n BernoulliNB : Naive Bayes classifier for multivariate Bernoulli models.\n CategoricalNB : Naive Bayes classifier for categorical features.\n ComplementNB : Complement Naive Bayes classifier.\n GaussianNB : Gaussian Naive Bayes.\n\n Notes\n -----\n For the rationale behind the names `coef_` and `intercept_`, i.e.\n naive Bayes as a linear classifier, see J. Rennie et al. (2003),\n Tackling the poor assumptions of naive Bayes text classifiers, ICML.\n\n References\n ----------\n C.D. Manning, P. Raghavan and H. Schuetze (2008). Introduction to\n Information Retrieval. Cambridge University Press, pp. 
234-265.\n https://nlp.stanford.edu/IR-book/html/htmledition/naive-bayes-text-classification-1.html\n\n Examples\n --------\n >>> import numpy as np\n >>> rng = np.random.RandomState(1)\n >>> X = rng.randint(5, size=(6, 100))\n >>> y = np.array([1, 2, 3, 4, 5, 6])\n >>> from sklearn.naive_bayes import MultinomialNB\n >>> clf = MultinomialNB()\n >>> clf.fit(X, y)\n MultinomialNB()\n >>> print(clf.predict(X[2:3]))\n [3]\n ", "source_code": "\n\nclass MultinomialNB(_BaseDiscreteNB):\n \"\"\"\n Naive Bayes classifier for multinomial models.\n\n The multinomial Naive Bayes classifier is suitable for classification with\n discrete features (e.g., word counts for text classification). The\n multinomial distribution normally requires integer feature counts. However,\n in practice, fractional counts such as tf-idf may also work.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n alpha : float, default=1.0\n Additive (Laplace/Lidstone) smoothing parameter\n (0 for no smoothing).\n\n fit_prior : bool, default=True\n Whether to learn class prior probabilities or not.\n If false, a uniform prior will be used.\n\n class_prior : array-like of shape (n_classes,), default=None\n Prior probabilities of the classes. If specified the priors are not\n adjusted according to the data.\n\n Attributes\n ----------\n class_count_ : ndarray of shape (n_classes,)\n Number of samples encountered for each class during fitting. This\n value is weighted by the sample weight when provided.\n\n class_log_prior_ : ndarray of shape (n_classes,)\n Smoothed empirical log probability for each class.\n\n classes_ : ndarray of shape (n_classes,)\n Class labels known to the classifier\n\n coef_ : ndarray of shape (n_classes, n_features)\n Mirrors ``feature_log_prob_`` for interpreting `MultinomialNB`\n as a linear model.\n\n .. deprecated:: 0.24\n ``coef_`` is deprecated in 0.24 and will be removed in 1.1\n (renaming of 0.26).\n\n feature_count_ : ndarray of shape (n_classes, n_features)\n Number of samples encountered for each (class, feature)\n during fitting. This value is weighted by the sample weight when\n provided.\n\n feature_log_prob_ : ndarray of shape (n_classes, n_features)\n Empirical log probability of features\n given a class, ``P(x_i|y)``.\n\n intercept_ : ndarray of shape (n_classes,)\n Mirrors ``class_log_prior_`` for interpreting `MultinomialNB`\n as a linear model.\n\n .. deprecated:: 0.24\n ``intercept_`` is deprecated in 0.24 and will be removed in 1.1\n (renaming of 0.26).\n\n n_features_ : int\n Number of features of each sample.\n\n .. deprecated:: 1.0\n Attribute `n_features_` was deprecated in version 1.0 and will be\n removed in 1.2. Use `n_features_in_` instead.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n BernoulliNB : Naive Bayes classifier for multivariate Bernoulli models.\n CategoricalNB : Naive Bayes classifier for categorical features.\n ComplementNB : Complement Naive Bayes classifier.\n GaussianNB : Gaussian Naive Bayes.\n\n Notes\n -----\n For the rationale behind the names `coef_` and `intercept_`, i.e.\n naive Bayes as a linear classifier, see J. Rennie et al. (2003),\n Tackling the poor assumptions of naive Bayes text classifiers, ICML.\n\n References\n ----------\n C.D. Manning, P. Raghavan and H. 
Schuetze (2008). Introduction to\n Information Retrieval. Cambridge University Press, pp. 234-265.\n https://nlp.stanford.edu/IR-book/html/htmledition/naive-bayes-text-classification-1.html\n\n Examples\n --------\n >>> import numpy as np\n >>> rng = np.random.RandomState(1)\n >>> X = rng.randint(5, size=(6, 100))\n >>> y = np.array([1, 2, 3, 4, 5, 6])\n >>> from sklearn.naive_bayes import MultinomialNB\n >>> clf = MultinomialNB()\n >>> clf.fit(X, y)\n MultinomialNB()\n >>> print(clf.predict(X[2:3]))\n [3]\n \"\"\"\n \n def __init__(self, *, alpha=1.0, fit_prior=True, class_prior=None):\n self.alpha = alpha\n self.fit_prior = fit_prior\n self.class_prior = class_prior\n \n def _more_tags(self):\n return {'requires_positive_X': True}\n \n def _count(self, X, Y):\n \"\"\"Count and smooth feature occurrences.\"\"\"\n check_non_negative(X, 'MultinomialNB (input X)')\n self.feature_count_ += safe_sparse_dot(Y.T, X)\n self.class_count_ += Y.sum(axis=0)\n \n def _update_feature_log_prob(self, alpha):\n \"\"\"Apply smoothing to raw counts and recompute log probabilities\"\"\"\n smoothed_fc = self.feature_count_ + alpha\n smoothed_cc = smoothed_fc.sum(axis=1)\n self.feature_log_prob_ = np.log(smoothed_fc) - np.log(smoothed_cc.reshape(-1, 1))\n \n def _joint_log_likelihood(self, X):\n \"\"\"Calculate the posterior log probability of the samples X\"\"\"\n return safe_sparse_dot(X, self.feature_log_prob_.T) + self.class_log_prior_\n" }, @@ -25300,7 +25382,7 @@ "sklearn.naive_bayes._BaseDiscreteNB.n_features_@getter" ], "is_public": false, - "description": "Abstract base class for naive Bayes on discrete/categorical data\n\nAny estimator based on this class should provide: __init__ _joint_log_likelihood(X) as per _BaseNB", + "description": "Abstract base class for naive Bayes on discrete/categorical data\n\nAny estimator based on this class should provide:\n\n__init__\n_joint_log_likelihood(X) as per _BaseNB", "docstring": "Abstract base class for naive Bayes on discrete/categorical data\n\n Any estimator based on this class should provide:\n\n __init__\n _joint_log_likelihood(X) as per _BaseNB\n ", "source_code": "\n\nclass _BaseDiscreteNB(_BaseNB):\n \"\"\"Abstract base class for naive Bayes on discrete/categorical data\n\n Any estimator based on this class should provide:\n\n __init__\n _joint_log_likelihood(X) as per _BaseNB\n \"\"\"\n \n def _check_X(self, X):\n \"\"\"Validate X, used only in predict* methods.\"\"\"\n return self._validate_data(X, accept_sparse='csr', reset=False)\n \n def _check_X_y(self, X, y, reset=True):\n \"\"\"Validate X and y in fit methods.\"\"\"\n return self._validate_data(X, y, accept_sparse='csr', reset=reset)\n \n def _update_class_log_prior(self, class_prior=None):\n n_classes = len(self.classes_)\n if class_prior is not None:\n if len(class_prior) != n_classes:\n raise ValueError('Number of priors must match number of classes.')\n self.class_log_prior_ = np.log(class_prior)\n elif self.fit_prior:\n with warnings.catch_warnings():\n warnings.simplefilter('ignore', RuntimeWarning)\n log_class_count = np.log(self.class_count_)\n self.class_log_prior_ = log_class_count - np.log(self.class_count_.sum())\n else:\n self.class_log_prior_ = np.full(n_classes, -np.log(n_classes))\n \n def _check_alpha(self):\n if np.min(self.alpha) < 0:\n raise ValueError('Smoothing parameter alpha = %.1e. alpha should be > 0.' 
% np.min(self.alpha))\n if isinstance(self.alpha, np.ndarray):\n if not self.alpha.shape[0] == self.n_features_in_:\n raise ValueError('alpha should be a scalar or a numpy array with shape [n_features]')\n if np.min(self.alpha) < _ALPHA_MIN:\n warnings.warn('alpha too small will result in numeric errors, setting alpha = %.1e' % _ALPHA_MIN)\n return np.maximum(self.alpha, _ALPHA_MIN)\n return self.alpha\n \n def partial_fit(self, X, y, classes=None, sample_weight=None):\n \"\"\"Incremental fit on a batch of samples.\n\n This method is expected to be called several times consecutively\n on different chunks of a dataset so as to implement out-of-core\n or online learning.\n\n This is especially useful when the whole dataset is too big to fit in\n memory at once.\n\n This method has some performance overhead hence it is better to call\n partial_fit on chunks of data that are as large as possible\n (as long as fitting in the memory budget) to hide the overhead.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training vectors, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\n y : array-like of shape (n_samples,)\n Target values.\n\n classes : array-like of shape (n_classes,), default=None\n List of all the classes that can possibly appear in the y vector.\n\n Must be provided at the first call to partial_fit, can be omitted\n in subsequent calls.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Weights applied to individual samples (1. for unweighted).\n\n Returns\n -------\n self : object\n Returns the instance itself.\n \"\"\"\n first_call = not hasattr(self, 'classes_')\n (X, y) = self._check_X_y(X, y, reset=first_call)\n (_, n_features) = X.shape\n if _check_partial_fit_first_call(self, classes):\n n_classes = len(classes)\n self._init_counters(n_classes, n_features)\n Y = label_binarize(y, classes=self.classes_)\n if Y.shape[1] == 1:\n if len(self.classes_) == 2:\n Y = np.concatenate((1 - Y, Y), axis=1)\n else:\n Y = np.ones_like(Y)\n if X.shape[0] != Y.shape[0]:\n msg = 'X.shape[0]=%d and y.shape[0]=%d are incompatible.'\n raise ValueError(msg % (X.shape[0], y.shape[0]))\n Y = Y.astype(np.float64, copy=False)\n if sample_weight is not None:\n sample_weight = _check_sample_weight(sample_weight, X)\n sample_weight = np.atleast_2d(sample_weight)\n Y *= sample_weight.T\n class_prior = self.class_prior\n self._count(X, Y)\n alpha = self._check_alpha()\n self._update_feature_log_prob(alpha)\n self._update_class_log_prior(class_prior=class_prior)\n return self\n \n def fit(self, X, y, sample_weight=None):\n \"\"\"Fit Naive Bayes classifier according to X, y.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training vectors, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\n y : array-like of shape (n_samples,)\n Target values.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Weights applied to individual samples (1. 
for unweighted).\n\n Returns\n -------\n self : object\n Returns the instance itself.\n \"\"\"\n (X, y) = self._check_X_y(X, y)\n (_, n_features) = X.shape\n labelbin = LabelBinarizer()\n Y = labelbin.fit_transform(y)\n self.classes_ = labelbin.classes_\n if Y.shape[1] == 1:\n if len(self.classes_) == 2:\n Y = np.concatenate((1 - Y, Y), axis=1)\n else:\n Y = np.ones_like(Y)\n if sample_weight is not None:\n Y = Y.astype(np.float64, copy=False)\n sample_weight = _check_sample_weight(sample_weight, X)\n sample_weight = np.atleast_2d(sample_weight)\n Y *= sample_weight.T\n class_prior = self.class_prior\n n_classes = Y.shape[1]\n self._init_counters(n_classes, n_features)\n self._count(X, Y)\n alpha = self._check_alpha()\n self._update_feature_log_prob(alpha)\n self._update_class_log_prior(class_prior=class_prior)\n return self\n \n def _init_counters(self, n_classes, n_features):\n self.class_count_ = np.zeros(n_classes, dtype=np.float64)\n self.feature_count_ = np.zeros((n_classes, n_features), dtype=np.float64)\n \n @deprecated('Attribute `coef_` was deprecated in version 0.24 and will be removed in 1.1 (renaming of 0.26).')\n @property\n def coef_(self):\n return self.feature_log_prob_[1:] if len(self.classes_) == 2 else self.feature_log_prob_\n \n @deprecated('Attribute `intercept_` was deprecated in version 0.24 and will be removed in 1.1 (renaming of 0.26).')\n @property\n def intercept_(self):\n return self.class_log_prior_[1:] if len(self.classes_) == 2 else self.class_log_prior_\n \n def _more_tags(self):\n return {'poor_score': True}\n \n @deprecated('Attribute `n_features_` was deprecated in version 1.0 and will be removed in 1.2. Use `n_features_in_` instead.')\n @property\n def n_features_(self):\n return self.n_features_in_\n" }, @@ -25351,7 +25433,7 @@ "is_public": false, "description": "Base class for nearest neighbors estimators.", "docstring": "Base class for nearest neighbors estimators.", - "source_code": "\n\nclass NeighborsBase(MultiOutputMixin, BaseEstimator, metaclass=ABCMeta):\n \"\"\"Base class for nearest neighbors estimators.\"\"\"\n \n @abstractmethod\n def __init__(self, n_neighbors=None, radius=None, algorithm='auto', leaf_size=30, metric='minkowski', p=2, metric_params=None, n_jobs=None):\n self.n_neighbors = n_neighbors\n self.radius = radius\n self.algorithm = algorithm\n self.leaf_size = leaf_size\n self.metric = metric\n self.metric_params = metric_params\n self.p = p\n self.n_jobs = n_jobs\n \n def _check_algorithm_metric(self):\n if self.algorithm not in ['auto', 'brute', 'kd_tree', 'ball_tree']:\n raise ValueError(\"unrecognized algorithm: '%s'\" % self.algorithm)\n if self.algorithm == 'auto':\n if self.metric == 'precomputed':\n alg_check = 'brute'\n elif callable(self.metric) or self.metric in VALID_METRICS['ball_tree']:\n alg_check = 'ball_tree'\n else:\n alg_check = 'brute'\n else:\n alg_check = self.algorithm\n if callable(self.metric):\n if self.algorithm == 'kd_tree':\n raise ValueError(\"kd_tree does not support callable metric '%s'Function call overhead will resultin very poor performance.\" % self.metric)\n elif self.metric not in VALID_METRICS[alg_check]:\n raise ValueError(\"Metric '%s' not valid. Use sorted(sklearn.neighbors.VALID_METRICS['%s']) to get valid options. Metric can also be a callable function.\" % (self.metric, alg_check))\n if self.metric_params is not None and 'p' in self.metric_params:\n if self.p is not None:\n warnings.warn('Parameter p is found in metric_params. 
The corresponding parameter from __init__ is ignored.', SyntaxWarning, stacklevel=3)\n effective_p = self.metric_params['p']\n else:\n effective_p = self.p\n if self.metric in ['wminkowski', 'minkowski'] and effective_p < 1:\n raise ValueError('p must be greater or equal to one for minkowski metric')\n \n def _fit(self, X, y=None):\n if self._get_tags()['requires_y']:\n if not isinstance(X, (KDTree, BallTree, NeighborsBase)):\n (X, y) = self._validate_data(X, y, accept_sparse='csr', multi_output=True)\n if is_classifier(self):\n if y.ndim == 1 or y.ndim == 2 and y.shape[1] == 1:\n if y.ndim != 1:\n warnings.warn('A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().', DataConversionWarning, stacklevel=2)\n self.outputs_2d_ = False\n y = y.reshape((-1, 1))\n else:\n self.outputs_2d_ = True\n check_classification_targets(y)\n self.classes_ = []\n self._y = np.empty(y.shape, dtype=int)\n for k in range(self._y.shape[1]):\n (classes, self._y[:, k]) = np.unique(y[:, k], return_inverse=True)\n self.classes_.append(classes)\n if not self.outputs_2d_:\n self.classes_ = self.classes_[0]\n self._y = self._y.ravel()\n else:\n self._y = y\n elif not isinstance(X, (KDTree, BallTree, NeighborsBase)):\n X = self._validate_data(X, accept_sparse='csr')\n self._check_algorithm_metric()\n if self.metric_params is None:\n self.effective_metric_params_ = {}\n else:\n self.effective_metric_params_ = self.metric_params.copy()\n effective_p = self.effective_metric_params_.get('p', self.p)\n if self.metric in ['wminkowski', 'minkowski']:\n self.effective_metric_params_['p'] = effective_p\n self.effective_metric_ = self.metric\n if self.metric == 'minkowski':\n p = self.effective_metric_params_.pop('p', 2)\n if p < 1:\n raise ValueError('p must be greater or equal to one for minkowski metric')\n elif p == 1:\n self.effective_metric_ = 'manhattan'\n elif p == 2:\n self.effective_metric_ = 'euclidean'\n elif p == np.inf:\n self.effective_metric_ = 'chebyshev'\n else:\n self.effective_metric_params_['p'] = p\n if isinstance(X, NeighborsBase):\n self._fit_X = X._fit_X\n self._tree = X._tree\n self._fit_method = X._fit_method\n self.n_samples_fit_ = X.n_samples_fit_\n return self\n elif isinstance(X, BallTree):\n self._fit_X = X.data\n self._tree = X\n self._fit_method = 'ball_tree'\n self.n_samples_fit_ = X.data.shape[0]\n return self\n elif isinstance(X, KDTree):\n self._fit_X = X.data\n self._tree = X\n self._fit_method = 'kd_tree'\n self.n_samples_fit_ = X.data.shape[0]\n return self\n if self.metric == 'precomputed':\n X = _check_precomputed(X)\n if X.shape[0] != X.shape[1]:\n raise ValueError('Precomputed matrix must be square. Input is a {}x{} matrix.'.format(X.shape[0], X.shape[1]))\n self.n_features_in_ = X.shape[1]\n n_samples = X.shape[0]\n if n_samples == 0:\n raise ValueError('n_samples must be greater than 0')\n if issparse(X):\n if self.algorithm not in ('auto', 'brute'):\n warnings.warn('cannot use tree with sparse input: using brute force')\n if self.effective_metric_ not in VALID_METRICS_SPARSE['brute'] and not callable(self.effective_metric_):\n raise ValueError(\"Metric '%s' not valid for sparse input. Use sorted(sklearn.neighbors.VALID_METRICS_SPARSE['brute']) to get valid options. 
Metric can also be a callable function.\" % self.effective_metric_)\n self._fit_X = X.copy()\n self._tree = None\n self._fit_method = 'brute'\n self.n_samples_fit_ = X.shape[0]\n return self\n self._fit_method = self.algorithm\n self._fit_X = X\n self.n_samples_fit_ = X.shape[0]\n if self._fit_method == 'auto':\n if self.metric == 'precomputed' or self._fit_X.shape[1] > 15 or self.n_neighbors is not None and self.n_neighbors >= self._fit_X.shape[0] // 2:\n self._fit_method = 'brute'\n elif self.effective_metric_ in VALID_METRICS['kd_tree']:\n self._fit_method = 'kd_tree'\n elif callable(self.effective_metric_) or self.effective_metric_ in VALID_METRICS['ball_tree']:\n self._fit_method = 'ball_tree'\n else:\n self._fit_method = 'brute'\n if self._fit_method == 'ball_tree':\n self._tree = BallTree(X, self.leaf_size, metric=self.effective_metric_, **self.effective_metric_params_)\n elif self._fit_method == 'kd_tree':\n self._tree = KDTree(X, self.leaf_size, metric=self.effective_metric_, **self.effective_metric_params_)\n elif self._fit_method == 'brute':\n self._tree = None\n else:\n raise ValueError(\"algorithm = '%s' not recognized\" % self.algorithm)\n if self.n_neighbors is not None:\n if self.n_neighbors <= 0:\n raise ValueError('Expected n_neighbors > 0. Got %d' % self.n_neighbors)\n elif not isinstance(self.n_neighbors, numbers.Integral):\n raise TypeError('n_neighbors does not take %s value, enter integer value' % type(self.n_neighbors))\n return self\n \n def _more_tags(self):\n return {'pairwise': self.metric == 'precomputed'}\n \n @deprecated('Attribute `_pairwise` was deprecated in version 0.24 and will be removed in 1.1 (renaming of 0.26).')\n @property\n def _pairwise(self):\n return self.metric == 'precomputed'\n" + "source_code": "\n\nclass NeighborsBase(MultiOutputMixin, BaseEstimator, metaclass=ABCMeta):\n \"\"\"Base class for nearest neighbors estimators.\"\"\"\n \n @abstractmethod\n def __init__(self, n_neighbors=None, radius=None, algorithm='auto', leaf_size=30, metric='minkowski', p=2, metric_params=None, n_jobs=None):\n self.n_neighbors = n_neighbors\n self.radius = radius\n self.algorithm = algorithm\n self.leaf_size = leaf_size\n self.metric = metric\n self.metric_params = metric_params\n self.p = p\n self.n_jobs = n_jobs\n \n def _check_algorithm_metric(self):\n if self.algorithm not in ['auto', 'brute', 'kd_tree', 'ball_tree']:\n raise ValueError(\"unrecognized algorithm: '%s'\" % self.algorithm)\n if self.algorithm == 'auto':\n if self.metric == 'precomputed':\n alg_check = 'brute'\n elif callable(self.metric) or self.metric in VALID_METRICS['ball_tree']:\n alg_check = 'ball_tree'\n else:\n alg_check = 'brute'\n else:\n alg_check = self.algorithm\n if callable(self.metric):\n if self.algorithm == 'kd_tree':\n raise ValueError(\"kd_tree does not support callable metric '%s'Function call overhead will resultin very poor performance.\" % self.metric)\n elif self.metric not in VALID_METRICS[alg_check]:\n raise ValueError(\"Metric '%s' not valid. Use sorted(sklearn.neighbors.VALID_METRICS['%s']) to get valid options. Metric can also be a callable function.\" % (self.metric, alg_check))\n if self.metric_params is not None and 'p' in self.metric_params:\n if self.p is not None:\n warnings.warn('Parameter p is found in metric_params. 
The corresponding parameter from __init__ is ignored.', SyntaxWarning, stacklevel=3)\n effective_p = self.metric_params['p']\n else:\n effective_p = self.p\n if self.metric in ['wminkowski', 'minkowski'] and effective_p < 1:\n raise ValueError('p must be greater or equal to one for minkowski metric')\n \n def _fit(self, X, y=None):\n if self._get_tags()['requires_y']:\n if not isinstance(X, (KDTree, BallTree, NeighborsBase)):\n (X, y) = self._validate_data(X, y, accept_sparse='csr', multi_output=True)\n if is_classifier(self):\n if y.ndim == 1 or y.ndim == 2 and y.shape[1] == 1:\n if y.ndim != 1:\n warnings.warn('A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().', DataConversionWarning, stacklevel=2)\n self.outputs_2d_ = False\n y = y.reshape((-1, 1))\n else:\n self.outputs_2d_ = True\n check_classification_targets(y)\n self.classes_ = []\n self._y = np.empty(y.shape, dtype=int)\n for k in range(self._y.shape[1]):\n (classes, self._y[:, k]) = np.unique(y[:, k], return_inverse=True)\n self.classes_.append(classes)\n if not self.outputs_2d_:\n self.classes_ = self.classes_[0]\n self._y = self._y.ravel()\n else:\n self._y = y\n elif not isinstance(X, (KDTree, BallTree, NeighborsBase)):\n X = self._validate_data(X, accept_sparse='csr')\n self._check_algorithm_metric()\n if self.metric_params is None:\n self.effective_metric_params_ = {}\n else:\n self.effective_metric_params_ = self.metric_params.copy()\n effective_p = self.effective_metric_params_.get('p', self.p)\n if self.metric in ['wminkowski', 'minkowski']:\n self.effective_metric_params_['p'] = effective_p\n self.effective_metric_ = self.metric\n if self.metric == 'minkowski':\n p = self.effective_metric_params_.pop('p', 2)\n w = self.effective_metric_params_.pop('w', None)\n if p < 1:\n raise ValueError('p must be greater or equal to one for minkowski metric')\n elif p == 1 and w is None:\n self.effective_metric_ = 'manhattan'\n elif p == 2 and w is None:\n self.effective_metric_ = 'euclidean'\n elif p == np.inf and w is None:\n self.effective_metric_ = 'chebyshev'\n else:\n self.effective_metric_params_['p'] = p\n self.effective_metric_params_['w'] = w\n if isinstance(X, NeighborsBase):\n self._fit_X = X._fit_X\n self._tree = X._tree\n self._fit_method = X._fit_method\n self.n_samples_fit_ = X.n_samples_fit_\n return self\n elif isinstance(X, BallTree):\n self._fit_X = X.data\n self._tree = X\n self._fit_method = 'ball_tree'\n self.n_samples_fit_ = X.data.shape[0]\n return self\n elif isinstance(X, KDTree):\n self._fit_X = X.data\n self._tree = X\n self._fit_method = 'kd_tree'\n self.n_samples_fit_ = X.data.shape[0]\n return self\n if self.metric == 'precomputed':\n X = _check_precomputed(X)\n if X.shape[0] != X.shape[1]:\n raise ValueError('Precomputed matrix must be square. Input is a {}x{} matrix.'.format(X.shape[0], X.shape[1]))\n self.n_features_in_ = X.shape[1]\n n_samples = X.shape[0]\n if n_samples == 0:\n raise ValueError('n_samples must be greater than 0')\n if issparse(X):\n if self.algorithm not in ('auto', 'brute'):\n warnings.warn('cannot use tree with sparse input: using brute force')\n if self.effective_metric_ not in VALID_METRICS_SPARSE['brute'] and not callable(self.effective_metric_):\n raise ValueError(\"Metric '%s' not valid for sparse input. Use sorted(sklearn.neighbors.VALID_METRICS_SPARSE['brute']) to get valid options. 
Metric can also be a callable function.\" % self.effective_metric_)\n self._fit_X = X.copy()\n self._tree = None\n self._fit_method = 'brute'\n self.n_samples_fit_ = X.shape[0]\n return self\n self._fit_method = self.algorithm\n self._fit_X = X\n self.n_samples_fit_ = X.shape[0]\n if self._fit_method == 'auto':\n if self.metric == 'precomputed' or self._fit_X.shape[1] > 15 or self.n_neighbors is not None and self.n_neighbors >= self._fit_X.shape[0] // 2:\n self._fit_method = 'brute'\n elif self.effective_metric_ in VALID_METRICS['kd_tree']:\n self._fit_method = 'kd_tree'\n elif callable(self.effective_metric_) or self.effective_metric_ in VALID_METRICS['ball_tree']:\n self._fit_method = 'ball_tree'\n else:\n self._fit_method = 'brute'\n if self._fit_method == 'ball_tree':\n self._tree = BallTree(X, self.leaf_size, metric=self.effective_metric_, **self.effective_metric_params_)\n elif self._fit_method == 'kd_tree':\n self._tree = KDTree(X, self.leaf_size, metric=self.effective_metric_, **self.effective_metric_params_)\n elif self._fit_method == 'brute':\n self._tree = None\n else:\n raise ValueError(\"algorithm = '%s' not recognized\" % self.algorithm)\n if self.n_neighbors is not None:\n if self.n_neighbors <= 0:\n raise ValueError('Expected n_neighbors > 0. Got %d' % self.n_neighbors)\n elif not isinstance(self.n_neighbors, numbers.Integral):\n raise TypeError('n_neighbors does not take %s value, enter integer value' % type(self.n_neighbors))\n return self\n \n def _more_tags(self):\n return {'pairwise': self.metric == 'precomputed'}\n \n @deprecated('Attribute `_pairwise` was deprecated in version 0.24 and will be removed in 1.1 (renaming of 0.26).')\n @property\n def _pairwise(self):\n return self.metric == 'precomputed'\n" }, { "name": "RadiusNeighborsMixin", @@ -25386,8 +25468,8 @@ ], "is_public": true, "description": "Classifier implementing the k-nearest neighbors vote.\n\nRead more in the :ref:`User Guide `.", - "docstring": "Classifier implementing the k-nearest neighbors vote.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n n_neighbors : int, default=5\n Number of neighbors to use by default for :meth:`kneighbors` queries.\n\n weights : {'uniform', 'distance'} or callable, default='uniform'\n Weight function used in prediction. Possible values:\n\n - 'uniform' : uniform weights. All points in each neighborhood\n are weighted equally.\n - 'distance' : weight points by the inverse of their distance.\n in this case, closer neighbors of a query point will have a\n greater influence than neighbors which are further away.\n - [callable] : a user-defined function which accepts an\n array of distances, and returns an array of the same shape\n containing the weights.\n\n algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto'\n Algorithm used to compute the nearest neighbors:\n\n - 'ball_tree' will use :class:`BallTree`\n - 'kd_tree' will use :class:`KDTree`\n - 'brute' will use a brute-force search.\n - 'auto' will attempt to decide the most appropriate algorithm\n based on the values passed to :meth:`fit` method.\n\n Note: fitting on sparse input will override the setting of\n this parameter, using brute force.\n\n leaf_size : int, default=30\n Leaf size passed to BallTree or KDTree. This can affect the\n speed of the construction and query, as well as the memory\n required to store the tree. The optimal value depends on the\n nature of the problem.\n\n p : int, default=2\n Power parameter for the Minkowski metric. 
When p = 1, this is\n equivalent to using manhattan_distance (l1), and euclidean_distance\n (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.\n\n metric : str or callable, default='minkowski'\n The distance metric to use for the tree. The default metric is\n minkowski, and with p=2 is equivalent to the standard Euclidean\n metric. See the documentation of :class:`DistanceMetric` for a\n list of available metrics.\n If metric is \"precomputed\", X is assumed to be a distance matrix and\n must be square during fit. X may be a :term:`sparse graph`,\n in which case only \"nonzero\" elements may be considered neighbors.\n\n metric_params : dict, default=None\n Additional keyword arguments for the metric function.\n\n n_jobs : int, default=None\n The number of parallel jobs to run for neighbors search.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n Doesn't affect :meth:`fit` method.\n\n Attributes\n ----------\n classes_ : array of shape (n_classes,)\n Class labels known to the classifier\n\n effective_metric_ : str or callble\n The distance metric used. It will be same as the `metric` parameter\n or a synonym of it, e.g. 'euclidean' if the `metric` parameter set to\n 'minkowski' and `p` parameter set to 2.\n\n effective_metric_params_ : dict\n Additional keyword arguments for the metric function. For most metrics\n will be same with `metric_params` parameter, but may also contain the\n `p` parameter value if the `effective_metric_` attribute is set to\n 'minkowski'.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n n_samples_fit_ : int\n Number of samples in the fitted data.\n\n outputs_2d_ : bool\n False when `y`'s shape is (n_samples, ) or (n_samples, 1) during fit\n otherwise True.\n\n See Also\n --------\n RadiusNeighborsClassifier: Classifier based on neighbors within a fixed radius.\n KNeighborsRegressor: Regression based on k-nearest neighbors.\n RadiusNeighborsRegressor: Regression based on neighbors within a fixed radius.\n NearestNeighbors: Unsupervised learner for implementing neighbor searches.\n\n Notes\n -----\n See :ref:`Nearest Neighbors ` in the online documentation\n for a discussion of the choice of ``algorithm`` and ``leaf_size``.\n\n .. warning::\n\n Regarding the Nearest Neighbors algorithms, if it is found that two\n neighbors, neighbor `k+1` and `k`, have identical distances\n but different labels, the results will depend on the ordering of the\n training data.\n\n https://en.wikipedia.org/wiki/K-nearest_neighbor_algorithm\n\n Examples\n --------\n >>> X = [[0], [1], [2], [3]]\n >>> y = [0, 0, 1, 1]\n >>> from sklearn.neighbors import KNeighborsClassifier\n >>> neigh = KNeighborsClassifier(n_neighbors=3)\n >>> neigh.fit(X, y)\n KNeighborsClassifier(...)\n >>> print(neigh.predict([[1.1]]))\n [0]\n >>> print(neigh.predict_proba([[0.9]]))\n [[0.666... 
0.333...]]\n ", - "source_code": "\n\nclass KNeighborsClassifier(KNeighborsMixin, ClassifierMixin, NeighborsBase):\n \"\"\"Classifier implementing the k-nearest neighbors vote.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n n_neighbors : int, default=5\n Number of neighbors to use by default for :meth:`kneighbors` queries.\n\n weights : {'uniform', 'distance'} or callable, default='uniform'\n Weight function used in prediction. Possible values:\n\n - 'uniform' : uniform weights. All points in each neighborhood\n are weighted equally.\n - 'distance' : weight points by the inverse of their distance.\n in this case, closer neighbors of a query point will have a\n greater influence than neighbors which are further away.\n - [callable] : a user-defined function which accepts an\n array of distances, and returns an array of the same shape\n containing the weights.\n\n algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto'\n Algorithm used to compute the nearest neighbors:\n\n - 'ball_tree' will use :class:`BallTree`\n - 'kd_tree' will use :class:`KDTree`\n - 'brute' will use a brute-force search.\n - 'auto' will attempt to decide the most appropriate algorithm\n based on the values passed to :meth:`fit` method.\n\n Note: fitting on sparse input will override the setting of\n this parameter, using brute force.\n\n leaf_size : int, default=30\n Leaf size passed to BallTree or KDTree. This can affect the\n speed of the construction and query, as well as the memory\n required to store the tree. The optimal value depends on the\n nature of the problem.\n\n p : int, default=2\n Power parameter for the Minkowski metric. When p = 1, this is\n equivalent to using manhattan_distance (l1), and euclidean_distance\n (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.\n\n metric : str or callable, default='minkowski'\n The distance metric to use for the tree. The default metric is\n minkowski, and with p=2 is equivalent to the standard Euclidean\n metric. See the documentation of :class:`DistanceMetric` for a\n list of available metrics.\n If metric is \"precomputed\", X is assumed to be a distance matrix and\n must be square during fit. X may be a :term:`sparse graph`,\n in which case only \"nonzero\" elements may be considered neighbors.\n\n metric_params : dict, default=None\n Additional keyword arguments for the metric function.\n\n n_jobs : int, default=None\n The number of parallel jobs to run for neighbors search.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n Doesn't affect :meth:`fit` method.\n\n Attributes\n ----------\n classes_ : array of shape (n_classes,)\n Class labels known to the classifier\n\n effective_metric_ : str or callble\n The distance metric used. It will be same as the `metric` parameter\n or a synonym of it, e.g. 'euclidean' if the `metric` parameter set to\n 'minkowski' and `p` parameter set to 2.\n\n effective_metric_params_ : dict\n Additional keyword arguments for the metric function. For most metrics\n will be same with `metric_params` parameter, but may also contain the\n `p` parameter value if the `effective_metric_` attribute is set to\n 'minkowski'.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. 
Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n n_samples_fit_ : int\n Number of samples in the fitted data.\n\n outputs_2d_ : bool\n False when `y`'s shape is (n_samples, ) or (n_samples, 1) during fit\n otherwise True.\n\n See Also\n --------\n RadiusNeighborsClassifier: Classifier based on neighbors within a fixed radius.\n KNeighborsRegressor: Regression based on k-nearest neighbors.\n RadiusNeighborsRegressor: Regression based on neighbors within a fixed radius.\n NearestNeighbors: Unsupervised learner for implementing neighbor searches.\n\n Notes\n -----\n See :ref:`Nearest Neighbors ` in the online documentation\n for a discussion of the choice of ``algorithm`` and ``leaf_size``.\n\n .. warning::\n\n Regarding the Nearest Neighbors algorithms, if it is found that two\n neighbors, neighbor `k+1` and `k`, have identical distances\n but different labels, the results will depend on the ordering of the\n training data.\n\n https://en.wikipedia.org/wiki/K-nearest_neighbor_algorithm\n\n Examples\n --------\n >>> X = [[0], [1], [2], [3]]\n >>> y = [0, 0, 1, 1]\n >>> from sklearn.neighbors import KNeighborsClassifier\n >>> neigh = KNeighborsClassifier(n_neighbors=3)\n >>> neigh.fit(X, y)\n KNeighborsClassifier(...)\n >>> print(neigh.predict([[1.1]]))\n [0]\n >>> print(neigh.predict_proba([[0.9]]))\n [[0.666... 0.333...]]\n \"\"\"\n \n def __init__(self, n_neighbors=5, *, weights='uniform', algorithm='auto', leaf_size=30, p=2, metric='minkowski', metric_params=None, n_jobs=None):\n super().__init__(n_neighbors=n_neighbors, algorithm=algorithm, leaf_size=leaf_size, metric=metric, p=p, metric_params=metric_params, n_jobs=n_jobs)\n self.weights = weights\n \n def fit(self, X, y):\n \"\"\"Fit the k-nearest neighbors classifier from the training dataset.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features) or (n_samples, n_samples) if metric='precomputed'\n Training data.\n\n y : {array-like, sparse matrix} of shape (n_samples,) or (n_samples, n_outputs)\n Target values.\n\n Returns\n -------\n self : KNeighborsClassifier\n The fitted k-nearest neighbors classifier.\n \"\"\"\n self.weights = _check_weights(self.weights)\n return self._fit(X, y)\n \n def predict(self, X):\n \"\"\"Predict the class labels for the provided data.\n\n Parameters\n ----------\n X : array-like of shape (n_queries, n_features), or (n_queries, n_indexed) if metric == 'precomputed'\n Test samples.\n\n Returns\n -------\n y : ndarray of shape (n_queries,) or (n_queries, n_outputs)\n Class labels for each data sample.\n \"\"\"\n (neigh_dist, neigh_ind) = self.kneighbors(X)\n classes_ = self.classes_\n _y = self._y\n if not self.outputs_2d_:\n _y = self._y.reshape((-1, 1))\n classes_ = [self.classes_]\n n_outputs = len(classes_)\n n_queries = _num_samples(X)\n weights = _get_weights(neigh_dist, self.weights)\n y_pred = np.empty((n_queries, n_outputs), dtype=classes_[0].dtype)\n for (k, classes_k) in enumerate(classes_):\n if weights is None:\n (mode, _) = stats.mode(_y[neigh_ind, k], axis=1)\n else:\n (mode, _) = weighted_mode(_y[neigh_ind, k], weights, axis=1)\n mode = np.asarray(mode.ravel(), dtype=np.intp)\n y_pred[:, k] = classes_k.take(mode)\n if not self.outputs_2d_:\n y_pred = y_pred.ravel()\n return y_pred\n \n def predict_proba(self, X):\n \"\"\"Return probability estimates for the test data X.\n\n Parameters\n ----------\n X : array-like of shape (n_queries, n_features), or (n_queries, n_indexed) if metric == 'precomputed'\n Test 
samples.\n\n Returns\n -------\n p : ndarray of shape (n_queries, n_classes), or a list of n_outputs of such arrays if n_outputs > 1.\n The class probabilities of the input samples. Classes are ordered\n by lexicographic order.\n \"\"\"\n (neigh_dist, neigh_ind) = self.kneighbors(X)\n classes_ = self.classes_\n _y = self._y\n if not self.outputs_2d_:\n _y = self._y.reshape((-1, 1))\n classes_ = [self.classes_]\n n_queries = _num_samples(X)\n weights = _get_weights(neigh_dist, self.weights)\n if weights is None:\n weights = np.ones_like(neigh_ind)\n all_rows = np.arange(n_queries)\n probabilities = []\n for (k, classes_k) in enumerate(classes_):\n pred_labels = _y[:, k][neigh_ind]\n proba_k = np.zeros((n_queries, classes_k.size))\n for (i, idx) in enumerate(pred_labels.T):\n proba_k[all_rows, idx] += weights[:, i]\n normalizer = proba_k.sum(axis=1)[:, np.newaxis]\n normalizer[normalizer == 0.0] = 1.0\n proba_k /= normalizer\n probabilities.append(proba_k)\n if not self.outputs_2d_:\n probabilities = probabilities[0]\n return probabilities\n \n def _more_tags(self):\n return {'multilabel': True}\n" + "docstring": "Classifier implementing the k-nearest neighbors vote.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n n_neighbors : int, default=5\n Number of neighbors to use by default for :meth:`kneighbors` queries.\n\n weights : {'uniform', 'distance'} or callable, default='uniform'\n Weight function used in prediction. Possible values:\n\n - 'uniform' : uniform weights. All points in each neighborhood\n are weighted equally.\n - 'distance' : weight points by the inverse of their distance.\n in this case, closer neighbors of a query point will have a\n greater influence than neighbors which are further away.\n - [callable] : a user-defined function which accepts an\n array of distances, and returns an array of the same shape\n containing the weights.\n\n algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto'\n Algorithm used to compute the nearest neighbors:\n\n - 'ball_tree' will use :class:`BallTree`\n - 'kd_tree' will use :class:`KDTree`\n - 'brute' will use a brute-force search.\n - 'auto' will attempt to decide the most appropriate algorithm\n based on the values passed to :meth:`fit` method.\n\n Note: fitting on sparse input will override the setting of\n this parameter, using brute force.\n\n leaf_size : int, default=30\n Leaf size passed to BallTree or KDTree. This can affect the\n speed of the construction and query, as well as the memory\n required to store the tree. The optimal value depends on the\n nature of the problem.\n\n p : int, default=2\n Power parameter for the Minkowski metric. When p = 1, this is\n equivalent to using manhattan_distance (l1), and euclidean_distance\n (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.\n\n metric : str or callable, default='minkowski'\n The distance metric to use for the tree. The default metric is\n minkowski, and with p=2 is equivalent to the standard Euclidean\n metric. For a list of available metrics, see the documentation of\n :class:`~sklearn.metrics.DistanceMetric`.\n If metric is \"precomputed\", X is assumed to be a distance matrix and\n must be square during fit. 
X may be a :term:`sparse graph`,\n in which case only \"nonzero\" elements may be considered neighbors.\n\n metric_params : dict, default=None\n Additional keyword arguments for the metric function.\n\n n_jobs : int, default=None\n The number of parallel jobs to run for neighbors search.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n Doesn't affect :meth:`fit` method.\n\n Attributes\n ----------\n classes_ : array of shape (n_classes,)\n Class labels known to the classifier\n\n effective_metric_ : str or callble\n The distance metric used. It will be same as the `metric` parameter\n or a synonym of it, e.g. 'euclidean' if the `metric` parameter set to\n 'minkowski' and `p` parameter set to 2.\n\n effective_metric_params_ : dict\n Additional keyword arguments for the metric function. For most metrics\n will be same with `metric_params` parameter, but may also contain the\n `p` parameter value if the `effective_metric_` attribute is set to\n 'minkowski'.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n n_samples_fit_ : int\n Number of samples in the fitted data.\n\n outputs_2d_ : bool\n False when `y`'s shape is (n_samples, ) or (n_samples, 1) during fit\n otherwise True.\n\n See Also\n --------\n RadiusNeighborsClassifier: Classifier based on neighbors within a fixed radius.\n KNeighborsRegressor: Regression based on k-nearest neighbors.\n RadiusNeighborsRegressor: Regression based on neighbors within a fixed radius.\n NearestNeighbors: Unsupervised learner for implementing neighbor searches.\n\n Notes\n -----\n See :ref:`Nearest Neighbors ` in the online documentation\n for a discussion of the choice of ``algorithm`` and ``leaf_size``.\n\n .. warning::\n\n Regarding the Nearest Neighbors algorithms, if it is found that two\n neighbors, neighbor `k+1` and `k`, have identical distances\n but different labels, the results will depend on the ordering of the\n training data.\n\n https://en.wikipedia.org/wiki/K-nearest_neighbor_algorithm\n\n Examples\n --------\n >>> X = [[0], [1], [2], [3]]\n >>> y = [0, 0, 1, 1]\n >>> from sklearn.neighbors import KNeighborsClassifier\n >>> neigh = KNeighborsClassifier(n_neighbors=3)\n >>> neigh.fit(X, y)\n KNeighborsClassifier(...)\n >>> print(neigh.predict([[1.1]]))\n [0]\n >>> print(neigh.predict_proba([[0.9]]))\n [[0.666... 0.333...]]\n ", + "source_code": "\n\nclass KNeighborsClassifier(KNeighborsMixin, ClassifierMixin, NeighborsBase):\n \"\"\"Classifier implementing the k-nearest neighbors vote.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n n_neighbors : int, default=5\n Number of neighbors to use by default for :meth:`kneighbors` queries.\n\n weights : {'uniform', 'distance'} or callable, default='uniform'\n Weight function used in prediction. Possible values:\n\n - 'uniform' : uniform weights. 
All points in each neighborhood\n are weighted equally.\n - 'distance' : weight points by the inverse of their distance.\n in this case, closer neighbors of a query point will have a\n greater influence than neighbors which are further away.\n - [callable] : a user-defined function which accepts an\n array of distances, and returns an array of the same shape\n containing the weights.\n\n algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto'\n Algorithm used to compute the nearest neighbors:\n\n - 'ball_tree' will use :class:`BallTree`\n - 'kd_tree' will use :class:`KDTree`\n - 'brute' will use a brute-force search.\n - 'auto' will attempt to decide the most appropriate algorithm\n based on the values passed to :meth:`fit` method.\n\n Note: fitting on sparse input will override the setting of\n this parameter, using brute force.\n\n leaf_size : int, default=30\n Leaf size passed to BallTree or KDTree. This can affect the\n speed of the construction and query, as well as the memory\n required to store the tree. The optimal value depends on the\n nature of the problem.\n\n p : int, default=2\n Power parameter for the Minkowski metric. When p = 1, this is\n equivalent to using manhattan_distance (l1), and euclidean_distance\n (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.\n\n metric : str or callable, default='minkowski'\n The distance metric to use for the tree. The default metric is\n minkowski, and with p=2 is equivalent to the standard Euclidean\n metric. For a list of available metrics, see the documentation of\n :class:`~sklearn.metrics.DistanceMetric`.\n If metric is \"precomputed\", X is assumed to be a distance matrix and\n must be square during fit. X may be a :term:`sparse graph`,\n in which case only \"nonzero\" elements may be considered neighbors.\n\n metric_params : dict, default=None\n Additional keyword arguments for the metric function.\n\n n_jobs : int, default=None\n The number of parallel jobs to run for neighbors search.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n Doesn't affect :meth:`fit` method.\n\n Attributes\n ----------\n classes_ : array of shape (n_classes,)\n Class labels known to the classifier\n\n effective_metric_ : str or callble\n The distance metric used. It will be same as the `metric` parameter\n or a synonym of it, e.g. 'euclidean' if the `metric` parameter set to\n 'minkowski' and `p` parameter set to 2.\n\n effective_metric_params_ : dict\n Additional keyword arguments for the metric function. For most metrics\n will be same with `metric_params` parameter, but may also contain the\n `p` parameter value if the `effective_metric_` attribute is set to\n 'minkowski'.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. 
versionadded:: 1.0\n\n n_samples_fit_ : int\n Number of samples in the fitted data.\n\n outputs_2d_ : bool\n False when `y`'s shape is (n_samples, ) or (n_samples, 1) during fit\n otherwise True.\n\n See Also\n --------\n RadiusNeighborsClassifier: Classifier based on neighbors within a fixed radius.\n KNeighborsRegressor: Regression based on k-nearest neighbors.\n RadiusNeighborsRegressor: Regression based on neighbors within a fixed radius.\n NearestNeighbors: Unsupervised learner for implementing neighbor searches.\n\n Notes\n -----\n See :ref:`Nearest Neighbors ` in the online documentation\n for a discussion of the choice of ``algorithm`` and ``leaf_size``.\n\n .. warning::\n\n Regarding the Nearest Neighbors algorithms, if it is found that two\n neighbors, neighbor `k+1` and `k`, have identical distances\n but different labels, the results will depend on the ordering of the\n training data.\n\n https://en.wikipedia.org/wiki/K-nearest_neighbor_algorithm\n\n Examples\n --------\n >>> X = [[0], [1], [2], [3]]\n >>> y = [0, 0, 1, 1]\n >>> from sklearn.neighbors import KNeighborsClassifier\n >>> neigh = KNeighborsClassifier(n_neighbors=3)\n >>> neigh.fit(X, y)\n KNeighborsClassifier(...)\n >>> print(neigh.predict([[1.1]]))\n [0]\n >>> print(neigh.predict_proba([[0.9]]))\n [[0.666... 0.333...]]\n \"\"\"\n \n def __init__(self, n_neighbors=5, *, weights='uniform', algorithm='auto', leaf_size=30, p=2, metric='minkowski', metric_params=None, n_jobs=None):\n super().__init__(n_neighbors=n_neighbors, algorithm=algorithm, leaf_size=leaf_size, metric=metric, p=p, metric_params=metric_params, n_jobs=n_jobs)\n self.weights = weights\n \n def fit(self, X, y):\n \"\"\"Fit the k-nearest neighbors classifier from the training dataset.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features) or (n_samples, n_samples) if metric='precomputed'\n Training data.\n\n y : {array-like, sparse matrix} of shape (n_samples,) or (n_samples, n_outputs)\n Target values.\n\n Returns\n -------\n self : KNeighborsClassifier\n The fitted k-nearest neighbors classifier.\n \"\"\"\n self.weights = _check_weights(self.weights)\n return self._fit(X, y)\n \n def predict(self, X):\n \"\"\"Predict the class labels for the provided data.\n\n Parameters\n ----------\n X : array-like of shape (n_queries, n_features), or (n_queries, n_indexed) if metric == 'precomputed'\n Test samples.\n\n Returns\n -------\n y : ndarray of shape (n_queries,) or (n_queries, n_outputs)\n Class labels for each data sample.\n \"\"\"\n (neigh_dist, neigh_ind) = self.kneighbors(X)\n classes_ = self.classes_\n _y = self._y\n if not self.outputs_2d_:\n _y = self._y.reshape((-1, 1))\n classes_ = [self.classes_]\n n_outputs = len(classes_)\n n_queries = _num_samples(X)\n weights = _get_weights(neigh_dist, self.weights)\n y_pred = np.empty((n_queries, n_outputs), dtype=classes_[0].dtype)\n for (k, classes_k) in enumerate(classes_):\n if weights is None:\n (mode, _) = stats.mode(_y[neigh_ind, k], axis=1)\n else:\n (mode, _) = weighted_mode(_y[neigh_ind, k], weights, axis=1)\n mode = np.asarray(mode.ravel(), dtype=np.intp)\n y_pred[:, k] = classes_k.take(mode)\n if not self.outputs_2d_:\n y_pred = y_pred.ravel()\n return y_pred\n \n def predict_proba(self, X):\n \"\"\"Return probability estimates for the test data X.\n\n Parameters\n ----------\n X : array-like of shape (n_queries, n_features), or (n_queries, n_indexed) if metric == 'precomputed'\n Test samples.\n\n Returns\n -------\n p : ndarray of shape (n_queries, 
n_classes), or a list of n_outputs of such arrays if n_outputs > 1.\n The class probabilities of the input samples. Classes are ordered\n by lexicographic order.\n \"\"\"\n (neigh_dist, neigh_ind) = self.kneighbors(X)\n classes_ = self.classes_\n _y = self._y\n if not self.outputs_2d_:\n _y = self._y.reshape((-1, 1))\n classes_ = [self.classes_]\n n_queries = _num_samples(X)\n weights = _get_weights(neigh_dist, self.weights)\n if weights is None:\n weights = np.ones_like(neigh_ind)\n all_rows = np.arange(n_queries)\n probabilities = []\n for (k, classes_k) in enumerate(classes_):\n pred_labels = _y[:, k][neigh_ind]\n proba_k = np.zeros((n_queries, classes_k.size))\n for (i, idx) in enumerate(pred_labels.T):\n proba_k[all_rows, idx] += weights[:, i]\n normalizer = proba_k.sum(axis=1)[:, np.newaxis]\n normalizer[normalizer == 0.0] = 1.0\n proba_k /= normalizer\n probabilities.append(proba_k)\n if not self.outputs_2d_:\n probabilities = probabilities[0]\n return probabilities\n \n def _more_tags(self):\n return {'multilabel': True}\n" }, { "name": "RadiusNeighborsClassifier", @@ -25407,8 +25489,22 @@ ], "is_public": true, "description": "Classifier implementing a vote among neighbors within a given radius.\n\nRead more in the :ref:`User Guide `.", - "docstring": "Classifier implementing a vote among neighbors within a given radius.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n radius : float, default=1.0\n Range of parameter space to use by default for :meth:`radius_neighbors`\n queries.\n\n weights : {'uniform', 'distance'} or callable, default='uniform'\n Weight function used in prediction. Possible values:\n\n - 'uniform' : uniform weights. All points in each neighborhood\n are weighted equally.\n - 'distance' : weight points by the inverse of their distance.\n in this case, closer neighbors of a query point will have a\n greater influence than neighbors which are further away.\n - [callable] : a user-defined function which accepts an\n array of distances, and returns an array of the same shape\n containing the weights.\n\n Uniform weights are used by default.\n\n algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto'\n Algorithm used to compute the nearest neighbors:\n\n - 'ball_tree' will use :class:`BallTree`\n - 'kd_tree' will use :class:`KDTree`\n - 'brute' will use a brute-force search.\n - 'auto' will attempt to decide the most appropriate algorithm\n based on the values passed to :meth:`fit` method.\n\n Note: fitting on sparse input will override the setting of\n this parameter, using brute force.\n\n leaf_size : int, default=30\n Leaf size passed to BallTree or KDTree. This can affect the\n speed of the construction and query, as well as the memory\n required to store the tree. The optimal value depends on the\n nature of the problem.\n\n p : int, default=2\n Power parameter for the Minkowski metric. When p = 1, this is\n equivalent to using manhattan_distance (l1), and euclidean_distance\n (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.\n\n metric : str or callable, default='minkowski'\n Distance metric to use for the tree. The default metric is\n minkowski, and with p=2 is equivalent to the standard Euclidean\n metric. See the documentation of :class:`DistanceMetric` for a\n list of available metrics.\n If metric is \"precomputed\", X is assumed to be a distance matrix and\n must be square during fit. 
X may be a :term:`sparse graph`,\n in which case only \"nonzero\" elements may be considered neighbors.\n\n outlier_label : {manual label, 'most_frequent'}, default=None\n Label for outlier samples (samples with no neighbors in given radius).\n\n - manual label: str or int label (should be the same type as y)\n or list of manual labels if multi-output is used.\n - 'most_frequent' : assign the most frequent label of y to outliers.\n - None : when any outlier is detected, ValueError will be raised.\n\n metric_params : dict, default=None\n Additional keyword arguments for the metric function.\n\n n_jobs : int, default=None\n The number of parallel jobs to run for neighbors search.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n **kwargs : dict\n Additional keyword arguments passed to the constructor.\n\n .. deprecated:: 1.0\n The RadiusNeighborsClassifier class will not longer accept extra\n keyword parameters in 1.2 since they are unused.\n\n Attributes\n ----------\n classes_ : ndarray of shape (n_classes,)\n Class labels known to the classifier.\n\n effective_metric_ : str or callable\n The distance metric used. It will be same as the `metric` parameter\n or a synonym of it, e.g. 'euclidean' if the `metric` parameter set to\n 'minkowski' and `p` parameter set to 2.\n\n effective_metric_params_ : dict\n Additional keyword arguments for the metric function. For most metrics\n will be same with `metric_params` parameter, but may also contain the\n `p` parameter value if the `effective_metric_` attribute is set to\n 'minkowski'.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. 
versionadded:: 1.0\n\n n_samples_fit_ : int\n Number of samples in the fitted data.\n\n outlier_label_ : int or array-like of shape (n_class,)\n Label which is given for outlier samples (samples with no neighbors\n on given radius).\n\n outputs_2d_ : bool\n False when `y`'s shape is (n_samples, ) or (n_samples, 1) during fit\n otherwise True.\n\n See Also\n --------\n KNeighborsClassifier : Classifier implementing the k-nearest neighbors\n vote.\n RadiusNeighborsRegressor : Regression based on neighbors within a\n fixed radius.\n KNeighborsRegressor : Regression based on k-nearest neighbors.\n NearestNeighbors : Unsupervised learner for implementing neighbor\n searches.\n\n Notes\n -----\n See :ref:`Nearest Neighbors ` in the online documentation\n for a discussion of the choice of ``algorithm`` and ``leaf_size``.\n\n https://en.wikipedia.org/wiki/K-nearest_neighbor_algorithm\n\n Examples\n --------\n >>> X = [[0], [1], [2], [3]]\n >>> y = [0, 0, 1, 1]\n >>> from sklearn.neighbors import RadiusNeighborsClassifier\n >>> neigh = RadiusNeighborsClassifier(radius=1.0)\n >>> neigh.fit(X, y)\n RadiusNeighborsClassifier(...)\n >>> print(neigh.predict([[1.5]]))\n [0]\n >>> print(neigh.predict_proba([[1.0]]))\n [[0.66666667 0.33333333]]\n ", - "source_code": "\n\nclass RadiusNeighborsClassifier(RadiusNeighborsMixin, ClassifierMixin, NeighborsBase):\n \"\"\"Classifier implementing a vote among neighbors within a given radius.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n radius : float, default=1.0\n Range of parameter space to use by default for :meth:`radius_neighbors`\n queries.\n\n weights : {'uniform', 'distance'} or callable, default='uniform'\n Weight function used in prediction. Possible values:\n\n - 'uniform' : uniform weights. All points in each neighborhood\n are weighted equally.\n - 'distance' : weight points by the inverse of their distance.\n in this case, closer neighbors of a query point will have a\n greater influence than neighbors which are further away.\n - [callable] : a user-defined function which accepts an\n array of distances, and returns an array of the same shape\n containing the weights.\n\n Uniform weights are used by default.\n\n algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto'\n Algorithm used to compute the nearest neighbors:\n\n - 'ball_tree' will use :class:`BallTree`\n - 'kd_tree' will use :class:`KDTree`\n - 'brute' will use a brute-force search.\n - 'auto' will attempt to decide the most appropriate algorithm\n based on the values passed to :meth:`fit` method.\n\n Note: fitting on sparse input will override the setting of\n this parameter, using brute force.\n\n leaf_size : int, default=30\n Leaf size passed to BallTree or KDTree. This can affect the\n speed of the construction and query, as well as the memory\n required to store the tree. The optimal value depends on the\n nature of the problem.\n\n p : int, default=2\n Power parameter for the Minkowski metric. When p = 1, this is\n equivalent to using manhattan_distance (l1), and euclidean_distance\n (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.\n\n metric : str or callable, default='minkowski'\n Distance metric to use for the tree. The default metric is\n minkowski, and with p=2 is equivalent to the standard Euclidean\n metric. See the documentation of :class:`DistanceMetric` for a\n list of available metrics.\n If metric is \"precomputed\", X is assumed to be a distance matrix and\n must be square during fit. 
X may be a :term:`sparse graph`,\n in which case only \"nonzero\" elements may be considered neighbors.\n\n outlier_label : {manual label, 'most_frequent'}, default=None\n Label for outlier samples (samples with no neighbors in given radius).\n\n - manual label: str or int label (should be the same type as y)\n or list of manual labels if multi-output is used.\n - 'most_frequent' : assign the most frequent label of y to outliers.\n - None : when any outlier is detected, ValueError will be raised.\n\n metric_params : dict, default=None\n Additional keyword arguments for the metric function.\n\n n_jobs : int, default=None\n The number of parallel jobs to run for neighbors search.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n **kwargs : dict\n Additional keyword arguments passed to the constructor.\n\n .. deprecated:: 1.0\n The RadiusNeighborsClassifier class will not longer accept extra\n keyword parameters in 1.2 since they are unused.\n\n Attributes\n ----------\n classes_ : ndarray of shape (n_classes,)\n Class labels known to the classifier.\n\n effective_metric_ : str or callable\n The distance metric used. It will be same as the `metric` parameter\n or a synonym of it, e.g. 'euclidean' if the `metric` parameter set to\n 'minkowski' and `p` parameter set to 2.\n\n effective_metric_params_ : dict\n Additional keyword arguments for the metric function. For most metrics\n will be same with `metric_params` parameter, but may also contain the\n `p` parameter value if the `effective_metric_` attribute is set to\n 'minkowski'.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n n_samples_fit_ : int\n Number of samples in the fitted data.\n\n outlier_label_ : int or array-like of shape (n_class,)\n Label which is given for outlier samples (samples with no neighbors\n on given radius).\n\n outputs_2d_ : bool\n False when `y`'s shape is (n_samples, ) or (n_samples, 1) during fit\n otherwise True.\n\n See Also\n --------\n KNeighborsClassifier : Classifier implementing the k-nearest neighbors\n vote.\n RadiusNeighborsRegressor : Regression based on neighbors within a\n fixed radius.\n KNeighborsRegressor : Regression based on k-nearest neighbors.\n NearestNeighbors : Unsupervised learner for implementing neighbor\n searches.\n\n Notes\n -----\n See :ref:`Nearest Neighbors ` in the online documentation\n for a discussion of the choice of ``algorithm`` and ``leaf_size``.\n\n https://en.wikipedia.org/wiki/K-nearest_neighbor_algorithm\n\n Examples\n --------\n >>> X = [[0], [1], [2], [3]]\n >>> y = [0, 0, 1, 1]\n >>> from sklearn.neighbors import RadiusNeighborsClassifier\n >>> neigh = RadiusNeighborsClassifier(radius=1.0)\n >>> neigh.fit(X, y)\n RadiusNeighborsClassifier(...)\n >>> print(neigh.predict([[1.5]]))\n [0]\n >>> print(neigh.predict_proba([[1.0]]))\n [[0.66666667 0.33333333]]\n \"\"\"\n \n def __init__(self, radius=1.0, *, weights='uniform', algorithm='auto', leaf_size=30, p=2, metric='minkowski', outlier_label=None, metric_params=None, n_jobs=None, **kwargs):\n if len(kwargs) > 0:\n warnings.warn(f'Passing additional keyword parameters has no effect and is deprecated in 1.0. An error will be raised from 1.2 and beyond. 
The ignored keyword parameter(s) are: {kwargs.keys()}.', FutureWarning)\n super().__init__(radius=radius, algorithm=algorithm, leaf_size=leaf_size, metric=metric, p=p, metric_params=metric_params, n_jobs=n_jobs)\n self.weights = weights\n self.outlier_label = outlier_label\n \n def fit(self, X, y):\n \"\"\"Fit the radius neighbors classifier from the training dataset.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features) or (n_samples, n_samples) if metric='precomputed'\n Training data.\n\n y : {array-like, sparse matrix} of shape (n_samples,) or (n_samples, n_outputs)\n Target values.\n\n Returns\n -------\n self : RadiusNeighborsClassifier\n The fitted radius neighbors classifier.\n \"\"\"\n self.weights = _check_weights(self.weights)\n self._fit(X, y)\n classes_ = self.classes_\n _y = self._y\n if not self.outputs_2d_:\n _y = self._y.reshape((-1, 1))\n classes_ = [self.classes_]\n if self.outlier_label is None:\n outlier_label_ = None\n elif self.outlier_label == 'most_frequent':\n outlier_label_ = []\n for (k, classes_k) in enumerate(classes_):\n label_count = np.bincount(_y[:, k])\n outlier_label_.append(classes_k[label_count.argmax()])\n else:\n if _is_arraylike(self.outlier_label) and not isinstance(self.outlier_label, str):\n if len(self.outlier_label) != len(classes_):\n raise ValueError('The length of outlier_label: {} is inconsistent with the output length: {}'.format(self.outlier_label, len(classes_)))\n outlier_label_ = self.outlier_label\n else:\n outlier_label_ = [self.outlier_label] * len(classes_)\n for (classes, label) in zip(classes_, outlier_label_):\n if _is_arraylike(label) and not isinstance(label, str):\n raise TypeError('The outlier_label of classes {} is supposed to be a scalar, got {}.'.format(classes, label))\n if np.append(classes, label).dtype != classes.dtype:\n raise TypeError('The dtype of outlier_label {} is inconsistent with classes {} in y.'.format(label, classes))\n self.outlier_label_ = outlier_label_\n return self\n \n def predict(self, X):\n \"\"\"Predict the class labels for the provided data.\n\n Parameters\n ----------\n X : array-like of shape (n_queries, n_features), or (n_queries, n_indexed) if metric == 'precomputed'\n Test samples.\n\n Returns\n -------\n y : ndarray of shape (n_queries,) or (n_queries, n_outputs)\n Class labels for each data sample.\n \"\"\"\n probs = self.predict_proba(X)\n classes_ = self.classes_\n if not self.outputs_2d_:\n probs = [probs]\n classes_ = [self.classes_]\n n_outputs = len(classes_)\n n_queries = probs[0].shape[0]\n y_pred = np.empty((n_queries, n_outputs), dtype=classes_[0].dtype)\n for (k, prob) in enumerate(probs):\n max_prob_index = prob.argmax(axis=1)\n y_pred[:, k] = classes_[k].take(max_prob_index)\n outlier_zero_probs = (prob == 0).all(axis=1)\n if outlier_zero_probs.any():\n zero_prob_index = np.flatnonzero(outlier_zero_probs)\n y_pred[zero_prob_index, k] = self.outlier_label_[k]\n if not self.outputs_2d_:\n y_pred = y_pred.ravel()\n return y_pred\n \n def predict_proba(self, X):\n \"\"\"Return probability estimates for the test data X.\n\n Parameters\n ----------\n X : array-like of shape (n_queries, n_features), or (n_queries, n_indexed) if metric == 'precomputed'\n Test samples.\n\n Returns\n -------\n p : ndarray of shape (n_queries, n_classes), or a list of n_outputs of such arrays if n_outputs > 1.\n The class probabilities of the input samples. 
Classes are ordered\n by lexicographic order.\n \"\"\"\n n_queries = _num_samples(X)\n (neigh_dist, neigh_ind) = self.radius_neighbors(X)\n outlier_mask = np.zeros(n_queries, dtype=bool)\n outlier_mask[:] = [len(nind) == 0 for nind in neigh_ind]\n outliers = np.flatnonzero(outlier_mask)\n inliers = np.flatnonzero(~outlier_mask)\n classes_ = self.classes_\n _y = self._y\n if not self.outputs_2d_:\n _y = self._y.reshape((-1, 1))\n classes_ = [self.classes_]\n if self.outlier_label_ is None and outliers.size > 0:\n raise ValueError('No neighbors found for test samples %r, you can try using larger radius, giving a label for outliers, or considering removing them from your dataset.' % outliers)\n weights = _get_weights(neigh_dist, self.weights)\n if weights is not None:\n weights = weights[inliers]\n probabilities = []\n for (k, classes_k) in enumerate(classes_):\n pred_labels = np.zeros(len(neigh_ind), dtype=object)\n pred_labels[:] = [_y[ind, k] for ind in neigh_ind]\n proba_k = np.zeros((n_queries, classes_k.size))\n proba_inl = np.zeros((len(inliers), classes_k.size))\n if weights is None:\n for (i, idx) in enumerate(pred_labels[inliers]):\n proba_inl[i, :] = np.bincount(idx, minlength=classes_k.size)\n else:\n for (i, idx) in enumerate(pred_labels[inliers]):\n proba_inl[i, :] = np.bincount(idx, weights[i], minlength=classes_k.size)\n proba_k[inliers, :] = proba_inl\n if outliers.size > 0:\n _outlier_label = self.outlier_label_[k]\n label_index = np.flatnonzero(classes_k == _outlier_label)\n if label_index.size == 1:\n proba_k[outliers, label_index[0]] = 1.0\n else:\n warnings.warn('Outlier label {} is not in training classes. All class probabilities of outliers will be assigned with 0.'.format(self.outlier_label_[k]))\n normalizer = proba_k.sum(axis=1)[:, np.newaxis]\n normalizer[normalizer == 0.0] = 1.0\n proba_k /= normalizer\n probabilities.append(proba_k)\n if not self.outputs_2d_:\n probabilities = probabilities[0]\n return probabilities\n \n def _more_tags(self):\n return {'multilabel': True}\n" + "docstring": "Classifier implementing a vote among neighbors within a given radius.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n radius : float, default=1.0\n Range of parameter space to use by default for :meth:`radius_neighbors`\n queries.\n\n weights : {'uniform', 'distance'} or callable, default='uniform'\n Weight function used in prediction. Possible values:\n\n - 'uniform' : uniform weights. All points in each neighborhood\n are weighted equally.\n - 'distance' : weight points by the inverse of their distance.\n in this case, closer neighbors of a query point will have a\n greater influence than neighbors which are further away.\n - [callable] : a user-defined function which accepts an\n array of distances, and returns an array of the same shape\n containing the weights.\n\n Uniform weights are used by default.\n\n algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto'\n Algorithm used to compute the nearest neighbors:\n\n - 'ball_tree' will use :class:`BallTree`\n - 'kd_tree' will use :class:`KDTree`\n - 'brute' will use a brute-force search.\n - 'auto' will attempt to decide the most appropriate algorithm\n based on the values passed to :meth:`fit` method.\n\n Note: fitting on sparse input will override the setting of\n this parameter, using brute force.\n\n leaf_size : int, default=30\n Leaf size passed to BallTree or KDTree. This can affect the\n speed of the construction and query, as well as the memory\n required to store the tree. 
The optimal value depends on the\n nature of the problem.\n\n p : int, default=2\n Power parameter for the Minkowski metric. When p = 1, this is\n equivalent to using manhattan_distance (l1), and euclidean_distance\n (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.\n\n metric : str or callable, default='minkowski'\n Distance metric to use for the tree. The default metric is\n minkowski, and with p=2 is equivalent to the standard Euclidean\n metric. For a list of available metrics, see the documentation of\n :class:`~sklearn.metrics.DistanceMetric`.\n If metric is \"precomputed\", X is assumed to be a distance matrix and\n must be square during fit. X may be a :term:`sparse graph`,\n in which case only \"nonzero\" elements may be considered neighbors.\n\n outlier_label : {manual label, 'most_frequent'}, default=None\n Label for outlier samples (samples with no neighbors in given radius).\n\n - manual label: str or int label (should be the same type as y)\n or list of manual labels if multi-output is used.\n - 'most_frequent' : assign the most frequent label of y to outliers.\n - None : when any outlier is detected, ValueError will be raised.\n\n metric_params : dict, default=None\n Additional keyword arguments for the metric function.\n\n n_jobs : int, default=None\n The number of parallel jobs to run for neighbors search.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n **kwargs : dict\n Additional keyword arguments passed to the constructor.\n\n .. deprecated:: 1.0\n The RadiusNeighborsClassifier class will not longer accept extra\n keyword parameters in 1.2 since they are unused.\n\n Attributes\n ----------\n classes_ : ndarray of shape (n_classes,)\n Class labels known to the classifier.\n\n effective_metric_ : str or callable\n The distance metric used. It will be same as the `metric` parameter\n or a synonym of it, e.g. 'euclidean' if the `metric` parameter set to\n 'minkowski' and `p` parameter set to 2.\n\n effective_metric_params_ : dict\n Additional keyword arguments for the metric function. For most metrics\n will be same with `metric_params` parameter, but may also contain the\n `p` parameter value if the `effective_metric_` attribute is set to\n 'minkowski'.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. 
versionadded:: 1.0\n\n n_samples_fit_ : int\n Number of samples in the fitted data.\n\n outlier_label_ : int or array-like of shape (n_class,)\n Label which is given for outlier samples (samples with no neighbors\n on given radius).\n\n outputs_2d_ : bool\n False when `y`'s shape is (n_samples, ) or (n_samples, 1) during fit\n otherwise True.\n\n See Also\n --------\n KNeighborsClassifier : Classifier implementing the k-nearest neighbors\n vote.\n RadiusNeighborsRegressor : Regression based on neighbors within a\n fixed radius.\n KNeighborsRegressor : Regression based on k-nearest neighbors.\n NearestNeighbors : Unsupervised learner for implementing neighbor\n searches.\n\n Notes\n -----\n See :ref:`Nearest Neighbors ` in the online documentation\n for a discussion of the choice of ``algorithm`` and ``leaf_size``.\n\n https://en.wikipedia.org/wiki/K-nearest_neighbor_algorithm\n\n Examples\n --------\n >>> X = [[0], [1], [2], [3]]\n >>> y = [0, 0, 1, 1]\n >>> from sklearn.neighbors import RadiusNeighborsClassifier\n >>> neigh = RadiusNeighborsClassifier(radius=1.0)\n >>> neigh.fit(X, y)\n RadiusNeighborsClassifier(...)\n >>> print(neigh.predict([[1.5]]))\n [0]\n >>> print(neigh.predict_proba([[1.0]]))\n [[0.66666667 0.33333333]]\n ", + "source_code": "\n\nclass RadiusNeighborsClassifier(RadiusNeighborsMixin, ClassifierMixin, NeighborsBase):\n \"\"\"Classifier implementing a vote among neighbors within a given radius.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n radius : float, default=1.0\n Range of parameter space to use by default for :meth:`radius_neighbors`\n queries.\n\n weights : {'uniform', 'distance'} or callable, default='uniform'\n Weight function used in prediction. Possible values:\n\n - 'uniform' : uniform weights. All points in each neighborhood\n are weighted equally.\n - 'distance' : weight points by the inverse of their distance.\n in this case, closer neighbors of a query point will have a\n greater influence than neighbors which are further away.\n - [callable] : a user-defined function which accepts an\n array of distances, and returns an array of the same shape\n containing the weights.\n\n Uniform weights are used by default.\n\n algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto'\n Algorithm used to compute the nearest neighbors:\n\n - 'ball_tree' will use :class:`BallTree`\n - 'kd_tree' will use :class:`KDTree`\n - 'brute' will use a brute-force search.\n - 'auto' will attempt to decide the most appropriate algorithm\n based on the values passed to :meth:`fit` method.\n\n Note: fitting on sparse input will override the setting of\n this parameter, using brute force.\n\n leaf_size : int, default=30\n Leaf size passed to BallTree or KDTree. This can affect the\n speed of the construction and query, as well as the memory\n required to store the tree. The optimal value depends on the\n nature of the problem.\n\n p : int, default=2\n Power parameter for the Minkowski metric. When p = 1, this is\n equivalent to using manhattan_distance (l1), and euclidean_distance\n (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.\n\n metric : str or callable, default='minkowski'\n Distance metric to use for the tree. The default metric is\n minkowski, and with p=2 is equivalent to the standard Euclidean\n metric. For a list of available metrics, see the documentation of\n :class:`~sklearn.metrics.DistanceMetric`.\n If metric is \"precomputed\", X is assumed to be a distance matrix and\n must be square during fit. 
X may be a :term:`sparse graph`,\n in which case only \"nonzero\" elements may be considered neighbors.\n\n outlier_label : {manual label, 'most_frequent'}, default=None\n Label for outlier samples (samples with no neighbors in given radius).\n\n - manual label: str or int label (should be the same type as y)\n or list of manual labels if multi-output is used.\n - 'most_frequent' : assign the most frequent label of y to outliers.\n - None : when any outlier is detected, ValueError will be raised.\n\n metric_params : dict, default=None\n Additional keyword arguments for the metric function.\n\n n_jobs : int, default=None\n The number of parallel jobs to run for neighbors search.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n **kwargs : dict\n Additional keyword arguments passed to the constructor.\n\n .. deprecated:: 1.0\n The RadiusNeighborsClassifier class will not longer accept extra\n keyword parameters in 1.2 since they are unused.\n\n Attributes\n ----------\n classes_ : ndarray of shape (n_classes,)\n Class labels known to the classifier.\n\n effective_metric_ : str or callable\n The distance metric used. It will be same as the `metric` parameter\n or a synonym of it, e.g. 'euclidean' if the `metric` parameter set to\n 'minkowski' and `p` parameter set to 2.\n\n effective_metric_params_ : dict\n Additional keyword arguments for the metric function. For most metrics\n will be same with `metric_params` parameter, but may also contain the\n `p` parameter value if the `effective_metric_` attribute is set to\n 'minkowski'.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n n_samples_fit_ : int\n Number of samples in the fitted data.\n\n outlier_label_ : int or array-like of shape (n_class,)\n Label which is given for outlier samples (samples with no neighbors\n on given radius).\n\n outputs_2d_ : bool\n False when `y`'s shape is (n_samples, ) or (n_samples, 1) during fit\n otherwise True.\n\n See Also\n --------\n KNeighborsClassifier : Classifier implementing the k-nearest neighbors\n vote.\n RadiusNeighborsRegressor : Regression based on neighbors within a\n fixed radius.\n KNeighborsRegressor : Regression based on k-nearest neighbors.\n NearestNeighbors : Unsupervised learner for implementing neighbor\n searches.\n\n Notes\n -----\n See :ref:`Nearest Neighbors ` in the online documentation\n for a discussion of the choice of ``algorithm`` and ``leaf_size``.\n\n https://en.wikipedia.org/wiki/K-nearest_neighbor_algorithm\n\n Examples\n --------\n >>> X = [[0], [1], [2], [3]]\n >>> y = [0, 0, 1, 1]\n >>> from sklearn.neighbors import RadiusNeighborsClassifier\n >>> neigh = RadiusNeighborsClassifier(radius=1.0)\n >>> neigh.fit(X, y)\n RadiusNeighborsClassifier(...)\n >>> print(neigh.predict([[1.5]]))\n [0]\n >>> print(neigh.predict_proba([[1.0]]))\n [[0.66666667 0.33333333]]\n \"\"\"\n \n def __init__(self, radius=1.0, *, weights='uniform', algorithm='auto', leaf_size=30, p=2, metric='minkowski', outlier_label=None, metric_params=None, n_jobs=None, **kwargs):\n if len(kwargs) > 0:\n warnings.warn(f'Passing additional keyword parameters has no effect and is deprecated in 1.0. An error will be raised from 1.2 and beyond. 
The ignored keyword parameter(s) are: {kwargs.keys()}.', FutureWarning)\n super().__init__(radius=radius, algorithm=algorithm, leaf_size=leaf_size, metric=metric, p=p, metric_params=metric_params, n_jobs=n_jobs)\n self.weights = weights\n self.outlier_label = outlier_label\n \n def fit(self, X, y):\n \"\"\"Fit the radius neighbors classifier from the training dataset.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features) or (n_samples, n_samples) if metric='precomputed'\n Training data.\n\n y : {array-like, sparse matrix} of shape (n_samples,) or (n_samples, n_outputs)\n Target values.\n\n Returns\n -------\n self : RadiusNeighborsClassifier\n The fitted radius neighbors classifier.\n \"\"\"\n self.weights = _check_weights(self.weights)\n self._fit(X, y)\n classes_ = self.classes_\n _y = self._y\n if not self.outputs_2d_:\n _y = self._y.reshape((-1, 1))\n classes_ = [self.classes_]\n if self.outlier_label is None:\n outlier_label_ = None\n elif self.outlier_label == 'most_frequent':\n outlier_label_ = []\n for (k, classes_k) in enumerate(classes_):\n label_count = np.bincount(_y[:, k])\n outlier_label_.append(classes_k[label_count.argmax()])\n else:\n if _is_arraylike(self.outlier_label) and not isinstance(self.outlier_label, str):\n if len(self.outlier_label) != len(classes_):\n raise ValueError('The length of outlier_label: {} is inconsistent with the output length: {}'.format(self.outlier_label, len(classes_)))\n outlier_label_ = self.outlier_label\n else:\n outlier_label_ = [self.outlier_label] * len(classes_)\n for (classes, label) in zip(classes_, outlier_label_):\n if _is_arraylike(label) and not isinstance(label, str):\n raise TypeError('The outlier_label of classes {} is supposed to be a scalar, got {}.'.format(classes, label))\n if np.append(classes, label).dtype != classes.dtype:\n raise TypeError('The dtype of outlier_label {} is inconsistent with classes {} in y.'.format(label, classes))\n self.outlier_label_ = outlier_label_\n return self\n \n def predict(self, X):\n \"\"\"Predict the class labels for the provided data.\n\n Parameters\n ----------\n X : array-like of shape (n_queries, n_features), or (n_queries, n_indexed) if metric == 'precomputed'\n Test samples.\n\n Returns\n -------\n y : ndarray of shape (n_queries,) or (n_queries, n_outputs)\n Class labels for each data sample.\n \"\"\"\n probs = self.predict_proba(X)\n classes_ = self.classes_\n if not self.outputs_2d_:\n probs = [probs]\n classes_ = [self.classes_]\n n_outputs = len(classes_)\n n_queries = probs[0].shape[0]\n y_pred = np.empty((n_queries, n_outputs), dtype=classes_[0].dtype)\n for (k, prob) in enumerate(probs):\n max_prob_index = prob.argmax(axis=1)\n y_pred[:, k] = classes_[k].take(max_prob_index)\n outlier_zero_probs = (prob == 0).all(axis=1)\n if outlier_zero_probs.any():\n zero_prob_index = np.flatnonzero(outlier_zero_probs)\n y_pred[zero_prob_index, k] = self.outlier_label_[k]\n if not self.outputs_2d_:\n y_pred = y_pred.ravel()\n return y_pred\n \n def predict_proba(self, X):\n \"\"\"Return probability estimates for the test data X.\n\n Parameters\n ----------\n X : array-like of shape (n_queries, n_features), or (n_queries, n_indexed) if metric == 'precomputed'\n Test samples.\n\n Returns\n -------\n p : ndarray of shape (n_queries, n_classes), or a list of n_outputs of such arrays if n_outputs > 1.\n The class probabilities of the input samples. 
Classes are ordered\n by lexicographic order.\n \"\"\"\n n_queries = _num_samples(X)\n (neigh_dist, neigh_ind) = self.radius_neighbors(X)\n outlier_mask = np.zeros(n_queries, dtype=bool)\n outlier_mask[:] = [len(nind) == 0 for nind in neigh_ind]\n outliers = np.flatnonzero(outlier_mask)\n inliers = np.flatnonzero(~outlier_mask)\n classes_ = self.classes_\n _y = self._y\n if not self.outputs_2d_:\n _y = self._y.reshape((-1, 1))\n classes_ = [self.classes_]\n if self.outlier_label_ is None and outliers.size > 0:\n raise ValueError('No neighbors found for test samples %r, you can try using larger radius, giving a label for outliers, or considering removing them from your dataset.' % outliers)\n weights = _get_weights(neigh_dist, self.weights)\n if weights is not None:\n weights = weights[inliers]\n probabilities = []\n for (k, classes_k) in enumerate(classes_):\n pred_labels = np.zeros(len(neigh_ind), dtype=object)\n pred_labels[:] = [_y[ind, k] for ind in neigh_ind]\n proba_k = np.zeros((n_queries, classes_k.size))\n proba_inl = np.zeros((len(inliers), classes_k.size))\n if weights is None:\n for (i, idx) in enumerate(pred_labels[inliers]):\n proba_inl[i, :] = np.bincount(idx, minlength=classes_k.size)\n else:\n for (i, idx) in enumerate(pred_labels[inliers]):\n proba_inl[i, :] = np.bincount(idx, weights[i], minlength=classes_k.size)\n proba_k[inliers, :] = proba_inl\n if outliers.size > 0:\n _outlier_label = self.outlier_label_[k]\n label_index = np.flatnonzero(classes_k == _outlier_label)\n if label_index.size == 1:\n proba_k[outliers, label_index[0]] = 1.0\n else:\n warnings.warn('Outlier label {} is not in training classes. All class probabilities of outliers will be assigned with 0.'.format(self.outlier_label_[k]))\n normalizer = proba_k.sum(axis=1)[:, np.newaxis]\n normalizer[normalizer == 0.0] = 1.0\n proba_k /= normalizer\n probabilities.append(proba_k)\n if not self.outputs_2d_:\n probabilities = probabilities[0]\n return probabilities\n \n def _more_tags(self):\n return {'multilabel': True}\n" + }, + { + "name": "DistanceMetric", + "qname": "sklearn.neighbors._distance_metric.DistanceMetric", + "decorators": [], + "superclasses": ["_DistanceMetric"], + "methods": [ + "sklearn.neighbors._distance_metric.DistanceMetric._warn", + "sklearn.neighbors._distance_metric.DistanceMetric.get_metric" + ], + "is_public": true, + "description": "", + "docstring": null, + "source_code": "\n\nclass DistanceMetric(_DistanceMetric):\n \n @classmethod\n def _warn(cls):\n warnings.warn('sklearn.neighbors.DistanceMetric has been moved to sklearn.metrics.DistanceMetric in 1.0. This import path will be removed in 1.3', category=FutureWarning)\n \n @classmethod\n def get_metric(cls, metric, **kwargs):\n DistanceMetric._warn()\n return _DistanceMetric.get_metric(metric, **kwargs)\n" }, { "name": "KNeighborsTransformer", @@ -25427,7 +25523,7 @@ "sklearn.neighbors._graph.KNeighborsTransformer._more_tags" ], "is_public": true, - "description": "Transform X into a (weighted) graph of k nearest neighbors.\n\nThe transformed data is a sparse graph as returned by kneighbors_graph. Read more in the :ref:`User Guide `. .. versionadded:: 0.22", + "description": "Transform X into a (weighted) graph of k nearest neighbors.\n\nThe transformed data is a sparse graph as returned by kneighbors_graph.\n\nRead more in the :ref:`User Guide `.\n\n.. 
versionadded:: 0.22", "docstring": "Transform X into a (weighted) graph of k nearest neighbors.\n\n The transformed data is a sparse graph as returned by kneighbors_graph.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.22\n\n Parameters\n ----------\n mode : {'distance', 'connectivity'}, default='distance'\n Type of returned matrix: 'connectivity' will return the connectivity\n matrix with ones and zeros, and 'distance' will return the distances\n between neighbors according to the given metric.\n\n n_neighbors : int, default=5\n Number of neighbors for each sample in the transformed sparse graph.\n For compatibility reasons, as each sample is considered as its own\n neighbor, one extra neighbor will be computed when mode == 'distance'.\n In this case, the sparse graph contains (n_neighbors + 1) neighbors.\n\n algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto'\n Algorithm used to compute the nearest neighbors:\n\n - 'ball_tree' will use :class:`BallTree`\n - 'kd_tree' will use :class:`KDTree`\n - 'brute' will use a brute-force search.\n - 'auto' will attempt to decide the most appropriate algorithm\n based on the values passed to :meth:`fit` method.\n\n Note: fitting on sparse input will override the setting of\n this parameter, using brute force.\n\n leaf_size : int, default=30\n Leaf size passed to BallTree or KDTree. This can affect the\n speed of the construction and query, as well as the memory\n required to store the tree. The optimal value depends on the\n nature of the problem.\n\n metric : str or callable, default='minkowski'\n Metric to use for distance computation. Any metric from scikit-learn\n or scipy.spatial.distance can be used.\n\n If metric is a callable function, it is called on each\n pair of instances (rows) and the resulting value recorded. The callable\n should take two arrays as input and return one value indicating the\n distance between them. This works for Scipy's metrics, but is less\n efficient than passing the metric name as a string.\n\n Distance matrices are not supported.\n\n Valid values for metric are:\n\n - from scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2',\n 'manhattan']\n\n - from scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev',\n 'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski',\n 'mahalanobis', 'minkowski', 'rogerstanimoto', 'russellrao',\n 'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean',\n 'yule']\n\n See the documentation for scipy.spatial.distance for details on these\n metrics.\n\n p : int, default=2\n Parameter for the Minkowski metric from\n sklearn.metrics.pairwise.pairwise_distances. When p = 1, this is\n equivalent to using manhattan_distance (l1), and euclidean_distance\n (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.\n\n metric_params : dict, default=None\n Additional keyword arguments for the metric function.\n\n n_jobs : int, default=1\n The number of parallel jobs to run for neighbors search.\n If ``-1``, then the number of jobs is set to the number of CPU cores.\n\n Attributes\n ----------\n effective_metric_ : str or callable\n The distance metric used. It will be same as the `metric` parameter\n or a synonym of it, e.g. 'euclidean' if the `metric` parameter set to\n 'minkowski' and `p` parameter set to 2.\n\n effective_metric_params_ : dict\n Additional keyword arguments for the metric function. 
For most metrics\n will be same with `metric_params` parameter, but may also contain the\n `p` parameter value if the `effective_metric_` attribute is set to\n 'minkowski'.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n n_samples_fit_ : int\n Number of samples in the fitted data.\n\n See Also\n --------\n kneighbors_graph : Compute the weighted graph of k-neighbors for\n points in X.\n RadiusNeighborsTransformer : Transform X into a weighted graph of\n neighbors nearer than a radius.\n\n Examples\n --------\n >>> from sklearn.datasets import load_wine\n >>> from sklearn.neighbors import KNeighborsTransformer\n >>> X, _ = load_wine(return_X_y=True)\n >>> X.shape\n (178, 13)\n >>> transformer = KNeighborsTransformer(n_neighbors=5, mode='distance')\n >>> X_dist_graph = transformer.fit_transform(X)\n >>> X_dist_graph.shape\n (178, 178)\n ", "source_code": "\n\nclass KNeighborsTransformer(KNeighborsMixin, TransformerMixin, NeighborsBase):\n \"\"\"Transform X into a (weighted) graph of k nearest neighbors.\n\n The transformed data is a sparse graph as returned by kneighbors_graph.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.22\n\n Parameters\n ----------\n mode : {'distance', 'connectivity'}, default='distance'\n Type of returned matrix: 'connectivity' will return the connectivity\n matrix with ones and zeros, and 'distance' will return the distances\n between neighbors according to the given metric.\n\n n_neighbors : int, default=5\n Number of neighbors for each sample in the transformed sparse graph.\n For compatibility reasons, as each sample is considered as its own\n neighbor, one extra neighbor will be computed when mode == 'distance'.\n In this case, the sparse graph contains (n_neighbors + 1) neighbors.\n\n algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto'\n Algorithm used to compute the nearest neighbors:\n\n - 'ball_tree' will use :class:`BallTree`\n - 'kd_tree' will use :class:`KDTree`\n - 'brute' will use a brute-force search.\n - 'auto' will attempt to decide the most appropriate algorithm\n based on the values passed to :meth:`fit` method.\n\n Note: fitting on sparse input will override the setting of\n this parameter, using brute force.\n\n leaf_size : int, default=30\n Leaf size passed to BallTree or KDTree. This can affect the\n speed of the construction and query, as well as the memory\n required to store the tree. The optimal value depends on the\n nature of the problem.\n\n metric : str or callable, default='minkowski'\n Metric to use for distance computation. Any metric from scikit-learn\n or scipy.spatial.distance can be used.\n\n If metric is a callable function, it is called on each\n pair of instances (rows) and the resulting value recorded. The callable\n should take two arrays as input and return one value indicating the\n distance between them. 
This works for Scipy's metrics, but is less\n efficient than passing the metric name as a string.\n\n Distance matrices are not supported.\n\n Valid values for metric are:\n\n - from scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2',\n 'manhattan']\n\n - from scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev',\n 'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski',\n 'mahalanobis', 'minkowski', 'rogerstanimoto', 'russellrao',\n 'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean',\n 'yule']\n\n See the documentation for scipy.spatial.distance for details on these\n metrics.\n\n p : int, default=2\n Parameter for the Minkowski metric from\n sklearn.metrics.pairwise.pairwise_distances. When p = 1, this is\n equivalent to using manhattan_distance (l1), and euclidean_distance\n (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.\n\n metric_params : dict, default=None\n Additional keyword arguments for the metric function.\n\n n_jobs : int, default=1\n The number of parallel jobs to run for neighbors search.\n If ``-1``, then the number of jobs is set to the number of CPU cores.\n\n Attributes\n ----------\n effective_metric_ : str or callable\n The distance metric used. It will be same as the `metric` parameter\n or a synonym of it, e.g. 'euclidean' if the `metric` parameter set to\n 'minkowski' and `p` parameter set to 2.\n\n effective_metric_params_ : dict\n Additional keyword arguments for the metric function. For most metrics\n will be same with `metric_params` parameter, but may also contain the\n `p` parameter value if the `effective_metric_` attribute is set to\n 'minkowski'.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. 
versionadded:: 1.0\n\n n_samples_fit_ : int\n Number of samples in the fitted data.\n\n See Also\n --------\n kneighbors_graph : Compute the weighted graph of k-neighbors for\n points in X.\n RadiusNeighborsTransformer : Transform X into a weighted graph of\n neighbors nearer than a radius.\n\n Examples\n --------\n >>> from sklearn.datasets import load_wine\n >>> from sklearn.neighbors import KNeighborsTransformer\n >>> X, _ = load_wine(return_X_y=True)\n >>> X.shape\n (178, 13)\n >>> transformer = KNeighborsTransformer(n_neighbors=5, mode='distance')\n >>> X_dist_graph = transformer.fit_transform(X)\n >>> X_dist_graph.shape\n (178, 178)\n \"\"\"\n \n def __init__(self, *, mode='distance', n_neighbors=5, algorithm='auto', leaf_size=30, metric='minkowski', p=2, metric_params=None, n_jobs=1):\n super(KNeighborsTransformer, self).__init__(n_neighbors=n_neighbors, radius=None, algorithm=algorithm, leaf_size=leaf_size, metric=metric, p=p, metric_params=metric_params, n_jobs=n_jobs)\n self.mode = mode\n \n def fit(self, X, y=None):\n \"\"\"Fit the k-nearest neighbors transformer from the training dataset.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features) or (n_samples, n_samples) if metric='precomputed'\n Training data.\n y : Ignored\n Not used, present for API consistency by convention.\n\n Returns\n -------\n self : KNeighborsTransformer\n The fitted k-nearest neighbors transformer.\n \"\"\"\n return self._fit(X)\n \n def transform(self, X):\n \"\"\"Compute the (weighted) graph of Neighbors for points in X.\n\n Parameters\n ----------\n X : array-like of shape (n_samples_transform, n_features)\n Sample data.\n\n Returns\n -------\n Xt : sparse matrix of shape (n_samples_transform, n_samples_fit)\n Xt[i, j] is assigned the weight of edge that connects i to j.\n Only the neighbors have an explicit value.\n The diagonal is always explicit.\n The matrix is of CSR format.\n \"\"\"\n check_is_fitted(self)\n add_one = self.mode == 'distance'\n return self.kneighbors_graph(X, mode=self.mode, n_neighbors=self.n_neighbors + add_one)\n \n def fit_transform(self, X, y=None):\n \"\"\"Fit to data, then transform it.\n\n Fits transformer to X and y with optional parameters fit_params\n and returns a transformed version of X.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training set.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n Returns\n -------\n Xt : sparse matrix of shape (n_samples, n_samples)\n Xt[i, j] is assigned the weight of edge that connects i to j.\n Only the neighbors have an explicit value.\n The diagonal is always explicit.\n The matrix is of CSR format.\n \"\"\"\n return self.fit(X).transform(X)\n \n def _more_tags(self):\n return {'_xfail_checks': {'check_methods_sample_order_invariance': 'check is not applicable.'}}\n" }, @@ -25448,7 +25544,7 @@ "sklearn.neighbors._graph.RadiusNeighborsTransformer._more_tags" ], "is_public": true, - "description": "Transform X into a (weighted) graph of neighbors nearer than a radius.\n\nThe transformed data is a sparse graph as returned by `radius_neighbors_graph`. Read more in the :ref:`User Guide `. .. versionadded:: 0.22", + "description": "Transform X into a (weighted) graph of neighbors nearer than a radius.\n\nThe transformed data is a sparse graph as returned by\n`radius_neighbors_graph`.\n\nRead more in the :ref:`User Guide `.\n\n.. 
versionadded:: 0.22", "docstring": "Transform X into a (weighted) graph of neighbors nearer than a radius.\n\n The transformed data is a sparse graph as returned by\n `radius_neighbors_graph`.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.22\n\n Parameters\n ----------\n mode : {'distance', 'connectivity'}, default='distance'\n Type of returned matrix: 'connectivity' will return the connectivity\n matrix with ones and zeros, and 'distance' will return the distances\n between neighbors according to the given metric.\n\n radius : float, default=1.0\n Radius of neighborhood in the transformed sparse graph.\n\n algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto'\n Algorithm used to compute the nearest neighbors:\n\n - 'ball_tree' will use :class:`BallTree`\n - 'kd_tree' will use :class:`KDTree`\n - 'brute' will use a brute-force search.\n - 'auto' will attempt to decide the most appropriate algorithm\n based on the values passed to :meth:`fit` method.\n\n Note: fitting on sparse input will override the setting of\n this parameter, using brute force.\n\n leaf_size : int, default=30\n Leaf size passed to BallTree or KDTree. This can affect the\n speed of the construction and query, as well as the memory\n required to store the tree. The optimal value depends on the\n nature of the problem.\n\n metric : str or callable, default='minkowski'\n Metric to use for distance computation. Any metric from scikit-learn\n or scipy.spatial.distance can be used.\n\n If metric is a callable function, it is called on each\n pair of instances (rows) and the resulting value recorded. The callable\n should take two arrays as input and return one value indicating the\n distance between them. This works for Scipy's metrics, but is less\n efficient than passing the metric name as a string.\n\n Distance matrices are not supported.\n\n Valid values for metric are:\n\n - from scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2',\n 'manhattan']\n\n - from scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev',\n 'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski',\n 'mahalanobis', 'minkowski', 'rogerstanimoto', 'russellrao',\n 'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean',\n 'yule']\n\n See the documentation for scipy.spatial.distance for details on these\n metrics.\n\n p : int, default=2\n Parameter for the Minkowski metric from\n sklearn.metrics.pairwise.pairwise_distances. When p = 1, this is\n equivalent to using manhattan_distance (l1), and euclidean_distance\n (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.\n\n metric_params : dict, default=None\n Additional keyword arguments for the metric function.\n\n n_jobs : int, default=1\n The number of parallel jobs to run for neighbors search.\n If ``-1``, then the number of jobs is set to the number of CPU cores.\n\n Attributes\n ----------\n effective_metric_ : str or callable\n The distance metric used. It will be same as the `metric` parameter\n or a synonym of it, e.g. 'euclidean' if the `metric` parameter set to\n 'minkowski' and `p` parameter set to 2.\n\n effective_metric_params_ : dict\n Additional keyword arguments for the metric function. For most metrics\n will be same with `metric_params` parameter, but may also contain the\n `p` parameter value if the `effective_metric_` attribute is set to\n 'minkowski'.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. 
versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n n_samples_fit_ : int\n Number of samples in the fitted data.\n\n See Also\n --------\n kneighbors_graph : Compute the weighted graph of k-neighbors for\n points in X.\n KNeighborsTransformer : Transform X into a weighted graph of k\n nearest neighbors.\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.datasets import load_wine\n >>> from sklearn.cluster import DBSCAN\n >>> from sklearn.neighbors import RadiusNeighborsTransformer\n >>> from sklearn.pipeline import make_pipeline\n >>> X, _ = load_wine(return_X_y=True)\n >>> estimator = make_pipeline(\n ... RadiusNeighborsTransformer(radius=42.0, mode='distance'),\n ... DBSCAN(eps=25.0, metric='precomputed'))\n >>> X_clustered = estimator.fit_predict(X)\n >>> clusters, counts = np.unique(X_clustered, return_counts=True)\n >>> print(counts)\n [ 29 15 111 11 12]\n ", "source_code": "\n\nclass RadiusNeighborsTransformer(RadiusNeighborsMixin, TransformerMixin, NeighborsBase):\n \"\"\"Transform X into a (weighted) graph of neighbors nearer than a radius.\n\n The transformed data is a sparse graph as returned by\n `radius_neighbors_graph`.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.22\n\n Parameters\n ----------\n mode : {'distance', 'connectivity'}, default='distance'\n Type of returned matrix: 'connectivity' will return the connectivity\n matrix with ones and zeros, and 'distance' will return the distances\n between neighbors according to the given metric.\n\n radius : float, default=1.0\n Radius of neighborhood in the transformed sparse graph.\n\n algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto'\n Algorithm used to compute the nearest neighbors:\n\n - 'ball_tree' will use :class:`BallTree`\n - 'kd_tree' will use :class:`KDTree`\n - 'brute' will use a brute-force search.\n - 'auto' will attempt to decide the most appropriate algorithm\n based on the values passed to :meth:`fit` method.\n\n Note: fitting on sparse input will override the setting of\n this parameter, using brute force.\n\n leaf_size : int, default=30\n Leaf size passed to BallTree or KDTree. This can affect the\n speed of the construction and query, as well as the memory\n required to store the tree. The optimal value depends on the\n nature of the problem.\n\n metric : str or callable, default='minkowski'\n Metric to use for distance computation. Any metric from scikit-learn\n or scipy.spatial.distance can be used.\n\n If metric is a callable function, it is called on each\n pair of instances (rows) and the resulting value recorded. The callable\n should take two arrays as input and return one value indicating the\n distance between them. 
This works for Scipy's metrics, but is less\n efficient than passing the metric name as a string.\n\n Distance matrices are not supported.\n\n Valid values for metric are:\n\n - from scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2',\n 'manhattan']\n\n - from scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev',\n 'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski',\n 'mahalanobis', 'minkowski', 'rogerstanimoto', 'russellrao',\n 'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean',\n 'yule']\n\n See the documentation for scipy.spatial.distance for details on these\n metrics.\n\n p : int, default=2\n Parameter for the Minkowski metric from\n sklearn.metrics.pairwise.pairwise_distances. When p = 1, this is\n equivalent to using manhattan_distance (l1), and euclidean_distance\n (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.\n\n metric_params : dict, default=None\n Additional keyword arguments for the metric function.\n\n n_jobs : int, default=1\n The number of parallel jobs to run for neighbors search.\n If ``-1``, then the number of jobs is set to the number of CPU cores.\n\n Attributes\n ----------\n effective_metric_ : str or callable\n The distance metric used. It will be same as the `metric` parameter\n or a synonym of it, e.g. 'euclidean' if the `metric` parameter set to\n 'minkowski' and `p` parameter set to 2.\n\n effective_metric_params_ : dict\n Additional keyword arguments for the metric function. For most metrics\n will be same with `metric_params` parameter, but may also contain the\n `p` parameter value if the `effective_metric_` attribute is set to\n 'minkowski'.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n n_samples_fit_ : int\n Number of samples in the fitted data.\n\n See Also\n --------\n kneighbors_graph : Compute the weighted graph of k-neighbors for\n points in X.\n KNeighborsTransformer : Transform X into a weighted graph of k\n nearest neighbors.\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.datasets import load_wine\n >>> from sklearn.cluster import DBSCAN\n >>> from sklearn.neighbors import RadiusNeighborsTransformer\n >>> from sklearn.pipeline import make_pipeline\n >>> X, _ = load_wine(return_X_y=True)\n >>> estimator = make_pipeline(\n ... RadiusNeighborsTransformer(radius=42.0, mode='distance'),\n ... 
DBSCAN(eps=25.0, metric='precomputed'))\n >>> X_clustered = estimator.fit_predict(X)\n >>> clusters, counts = np.unique(X_clustered, return_counts=True)\n >>> print(counts)\n [ 29 15 111 11 12]\n \"\"\"\n \n def __init__(self, *, mode='distance', radius=1.0, algorithm='auto', leaf_size=30, metric='minkowski', p=2, metric_params=None, n_jobs=1):\n super(RadiusNeighborsTransformer, self).__init__(n_neighbors=None, radius=radius, algorithm=algorithm, leaf_size=leaf_size, metric=metric, p=p, metric_params=metric_params, n_jobs=n_jobs)\n self.mode = mode\n \n def fit(self, X, y=None):\n \"\"\"Fit the radius neighbors transformer from the training dataset.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features) or (n_samples, n_samples) if metric='precomputed'\n Training data.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n Returns\n -------\n self : RadiusNeighborsTransformer\n The fitted radius neighbors transformer.\n \"\"\"\n return self._fit(X)\n \n def transform(self, X):\n \"\"\"Compute the (weighted) graph of Neighbors for points in X.\n\n Parameters\n ----------\n X : array-like of shape (n_samples_transform, n_features)\n Sample data.\n\n Returns\n -------\n Xt : sparse matrix of shape (n_samples_transform, n_samples_fit)\n Xt[i, j] is assigned the weight of edge that connects i to j.\n Only the neighbors have an explicit value.\n The diagonal is always explicit.\n The matrix is of CSR format.\n \"\"\"\n check_is_fitted(self)\n return self.radius_neighbors_graph(X, mode=self.mode, sort_results=True)\n \n def fit_transform(self, X, y=None):\n \"\"\"Fit to data, then transform it.\n\n Fits transformer to X and y with optional parameters fit_params\n and returns a transformed version of X.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training set.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n Returns\n -------\n Xt : sparse matrix of shape (n_samples, n_samples)\n Xt[i, j] is assigned the weight of edge that connects i to j.\n Only the neighbors have an explicit value.\n The diagonal is always explicit.\n The matrix is of CSR format.\n \"\"\"\n return self.fit(X).transform(X)\n \n def _more_tags(self):\n return {'_xfail_checks': {'check_methods_sample_order_invariance': 'check is not applicable.'}}\n" }, @@ -25495,7 +25591,7 @@ "sklearn.neighbors._lof.LocalOutlierFactor._local_reachability_density" ], "is_public": true, - "description": "Unsupervised Outlier Detection using the Local Outlier Factor (LOF).\n\nThe anomaly score of each sample is called the Local Outlier Factor. It measures the local deviation of the density of a given sample with respect to its neighbors. It is local in that the anomaly score depends on how isolated the object is with respect to the surrounding neighborhood. More precisely, locality is given by k-nearest neighbors, whose distance is used to estimate the local density. By comparing the local density of a sample to the local densities of its neighbors, one can identify samples that have a substantially lower density than their neighbors. These are considered outliers. .. 
versionadded:: 0.19", + "description": "Unsupervised Outlier Detection using the Local Outlier Factor (LOF).\n\nThe anomaly score of each sample is called the Local Outlier Factor.\nIt measures the local deviation of the density of a given sample with respect\nto its neighbors.\nIt is local in that the anomaly score depends on how isolated the object\nis with respect to the surrounding neighborhood.\nMore precisely, locality is given by k-nearest neighbors, whose distance\nis used to estimate the local density.\nBy comparing the local density of a sample to the local densities of its\nneighbors, one can identify samples that have a substantially lower density\nthan their neighbors. These are considered outliers.\n\n.. versionadded:: 0.19", "docstring": "Unsupervised Outlier Detection using the Local Outlier Factor (LOF).\n\n The anomaly score of each sample is called the Local Outlier Factor.\n It measures the local deviation of the density of a given sample with respect\n to its neighbors.\n It is local in that the anomaly score depends on how isolated the object\n is with respect to the surrounding neighborhood.\n More precisely, locality is given by k-nearest neighbors, whose distance\n is used to estimate the local density.\n By comparing the local density of a sample to the local densities of its\n neighbors, one can identify samples that have a substantially lower density\n than their neighbors. These are considered outliers.\n\n .. versionadded:: 0.19\n\n Parameters\n ----------\n n_neighbors : int, default=20\n Number of neighbors to use by default for :meth:`kneighbors` queries.\n If n_neighbors is larger than the number of samples provided,\n all samples will be used.\n\n algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto'\n Algorithm used to compute the nearest neighbors:\n\n - 'ball_tree' will use :class:`BallTree`\n - 'kd_tree' will use :class:`KDTree`\n - 'brute' will use a brute-force search.\n - 'auto' will attempt to decide the most appropriate algorithm\n based on the values passed to :meth:`fit` method.\n\n Note: fitting on sparse input will override the setting of\n this parameter, using brute force.\n\n leaf_size : int, default=30\n Leaf is size passed to :class:`BallTree` or :class:`KDTree`. This can\n affect the speed of the construction and query, as well as the memory\n required to store the tree. The optimal value depends on the\n nature of the problem.\n\n metric : str or callable, default='minkowski'\n The metric is used for distance computation. Any metric from scikit-learn\n or scipy.spatial.distance can be used.\n\n If metric is \"precomputed\", X is assumed to be a distance matrix and\n must be square. X may be a sparse matrix, in which case only \"nonzero\"\n elements may be considered neighbors.\n\n If metric is a callable function, it is called on each\n pair of instances (rows) and the resulting value recorded. The callable\n should take two arrays as input and return one value indicating the\n distance between them. 
This works for Scipy's metrics, but is less\n efficient than passing the metric name as a string.\n\n Valid values for metric are:\n\n - from scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2',\n 'manhattan']\n\n - from scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev',\n 'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski',\n 'mahalanobis', 'minkowski', 'rogerstanimoto', 'russellrao',\n 'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean',\n 'yule']\n\n See the documentation for scipy.spatial.distance for details on these\n metrics:\n https://docs.scipy.org/doc/scipy/reference/spatial.distance.html.\n\n p : int, default=2\n Parameter for the Minkowski metric from\n :func:`sklearn.metrics.pairwise.pairwise_distances`. When p = 1, this\n is equivalent to using manhattan_distance (l1), and euclidean_distance\n (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.\n\n metric_params : dict, default=None\n Additional keyword arguments for the metric function.\n\n contamination : 'auto' or float, default='auto'\n The amount of contamination of the data set, i.e. the proportion\n of outliers in the data set. When fitting this is used to define the\n threshold on the scores of the samples.\n\n - if 'auto', the threshold is determined as in the\n original paper,\n - if a float, the contamination should be in the range (0, 0.5].\n\n .. versionchanged:: 0.22\n The default value of ``contamination`` changed from 0.1\n to ``'auto'``.\n\n novelty : bool, default=False\n By default, LocalOutlierFactor is only meant to be used for outlier\n detection (novelty=False). Set novelty to True if you want to use\n LocalOutlierFactor for novelty detection. In this case be aware that\n you should only use predict, decision_function and score_samples\n on new unseen data and not on the training set.\n\n .. versionadded:: 0.20\n\n n_jobs : int, default=None\n The number of parallel jobs to run for neighbors search.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n Attributes\n ----------\n negative_outlier_factor_ : ndarray of shape (n_samples,)\n The opposite LOF of the training samples. The higher, the more normal.\n Inliers tend to have a LOF score close to 1\n (``negative_outlier_factor_`` close to -1), while outliers tend to have\n a larger LOF score.\n\n The local outlier factor (LOF) of a sample captures its\n supposed 'degree of abnormality'.\n It is the average of the ratio of the local reachability density of\n a sample and those of its k-nearest neighbors.\n\n n_neighbors_ : int\n The actual number of neighbors used for :meth:`kneighbors` queries.\n\n offset_ : float\n Offset used to obtain binary labels from the raw scores.\n Observations having a negative_outlier_factor smaller than `offset_`\n are detected as abnormal.\n The offset is set to -1.5 (inliers score around -1), except when a\n contamination parameter different than \"auto\" is provided. In that\n case, the offset is defined in such a way we obtain the expected\n number of outliers in training.\n\n .. versionadded:: 0.20\n\n effective_metric_ : str\n The effective metric used for the distance computation.\n\n effective_metric_params_ : dict\n The effective additional keyword arguments for the metric function.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. 
versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n n_samples_fit_ : int\n It is the number of samples in the fitted data.\n\n See also\n ----------\n sklearn.svm.OneClassSVM: Unsupervised Outlier Detection using\n Support Vector Machine.\n\n References\n ----------\n .. [1] Breunig, M. M., Kriegel, H. P., Ng, R. T., & Sander, J. (2000, May).\n LOF: identifying density-based local outliers. In ACM sigmod record.\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.neighbors import LocalOutlierFactor\n >>> X = [[-1.1], [0.2], [101.1], [0.3]]\n >>> clf = LocalOutlierFactor(n_neighbors=2)\n >>> clf.fit_predict(X)\n array([ 1, 1, -1, 1])\n >>> clf.negative_outlier_factor_\n array([ -0.9821..., -1.0370..., -73.3697..., -0.9821...])\n ", "source_code": "\n\nclass LocalOutlierFactor(KNeighborsMixin, OutlierMixin, NeighborsBase):\n \"\"\"Unsupervised Outlier Detection using the Local Outlier Factor (LOF).\n\n The anomaly score of each sample is called the Local Outlier Factor.\n It measures the local deviation of the density of a given sample with respect\n to its neighbors.\n It is local in that the anomaly score depends on how isolated the object\n is with respect to the surrounding neighborhood.\n More precisely, locality is given by k-nearest neighbors, whose distance\n is used to estimate the local density.\n By comparing the local density of a sample to the local densities of its\n neighbors, one can identify samples that have a substantially lower density\n than their neighbors. These are considered outliers.\n\n .. versionadded:: 0.19\n\n Parameters\n ----------\n n_neighbors : int, default=20\n Number of neighbors to use by default for :meth:`kneighbors` queries.\n If n_neighbors is larger than the number of samples provided,\n all samples will be used.\n\n algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto'\n Algorithm used to compute the nearest neighbors:\n\n - 'ball_tree' will use :class:`BallTree`\n - 'kd_tree' will use :class:`KDTree`\n - 'brute' will use a brute-force search.\n - 'auto' will attempt to decide the most appropriate algorithm\n based on the values passed to :meth:`fit` method.\n\n Note: fitting on sparse input will override the setting of\n this parameter, using brute force.\n\n leaf_size : int, default=30\n Leaf is size passed to :class:`BallTree` or :class:`KDTree`. This can\n affect the speed of the construction and query, as well as the memory\n required to store the tree. The optimal value depends on the\n nature of the problem.\n\n metric : str or callable, default='minkowski'\n The metric is used for distance computation. Any metric from scikit-learn\n or scipy.spatial.distance can be used.\n\n If metric is \"precomputed\", X is assumed to be a distance matrix and\n must be square. X may be a sparse matrix, in which case only \"nonzero\"\n elements may be considered neighbors.\n\n If metric is a callable function, it is called on each\n pair of instances (rows) and the resulting value recorded. The callable\n should take two arrays as input and return one value indicating the\n distance between them. 
This works for Scipy's metrics, but is less\n efficient than passing the metric name as a string.\n\n Valid values for metric are:\n\n - from scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2',\n 'manhattan']\n\n - from scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev',\n 'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski',\n 'mahalanobis', 'minkowski', 'rogerstanimoto', 'russellrao',\n 'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean',\n 'yule']\n\n See the documentation for scipy.spatial.distance for details on these\n metrics:\n https://docs.scipy.org/doc/scipy/reference/spatial.distance.html.\n\n p : int, default=2\n Parameter for the Minkowski metric from\n :func:`sklearn.metrics.pairwise.pairwise_distances`. When p = 1, this\n is equivalent to using manhattan_distance (l1), and euclidean_distance\n (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.\n\n metric_params : dict, default=None\n Additional keyword arguments for the metric function.\n\n contamination : 'auto' or float, default='auto'\n The amount of contamination of the data set, i.e. the proportion\n of outliers in the data set. When fitting this is used to define the\n threshold on the scores of the samples.\n\n - if 'auto', the threshold is determined as in the\n original paper,\n - if a float, the contamination should be in the range (0, 0.5].\n\n .. versionchanged:: 0.22\n The default value of ``contamination`` changed from 0.1\n to ``'auto'``.\n\n novelty : bool, default=False\n By default, LocalOutlierFactor is only meant to be used for outlier\n detection (novelty=False). Set novelty to True if you want to use\n LocalOutlierFactor for novelty detection. In this case be aware that\n you should only use predict, decision_function and score_samples\n on new unseen data and not on the training set.\n\n .. versionadded:: 0.20\n\n n_jobs : int, default=None\n The number of parallel jobs to run for neighbors search.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n Attributes\n ----------\n negative_outlier_factor_ : ndarray of shape (n_samples,)\n The opposite LOF of the training samples. The higher, the more normal.\n Inliers tend to have a LOF score close to 1\n (``negative_outlier_factor_`` close to -1), while outliers tend to have\n a larger LOF score.\n\n The local outlier factor (LOF) of a sample captures its\n supposed 'degree of abnormality'.\n It is the average of the ratio of the local reachability density of\n a sample and those of its k-nearest neighbors.\n\n n_neighbors_ : int\n The actual number of neighbors used for :meth:`kneighbors` queries.\n\n offset_ : float\n Offset used to obtain binary labels from the raw scores.\n Observations having a negative_outlier_factor smaller than `offset_`\n are detected as abnormal.\n The offset is set to -1.5 (inliers score around -1), except when a\n contamination parameter different than \"auto\" is provided. In that\n case, the offset is defined in such a way we obtain the expected\n number of outliers in training.\n\n .. versionadded:: 0.20\n\n effective_metric_ : str\n The effective metric used for the distance computation.\n\n effective_metric_params_ : dict\n The effective additional keyword arguments for the metric function.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. 
versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n n_samples_fit_ : int\n It is the number of samples in the fitted data.\n\n See also\n ----------\n sklearn.svm.OneClassSVM: Unsupervised Outlier Detection using\n Support Vector Machine.\n\n References\n ----------\n .. [1] Breunig, M. M., Kriegel, H. P., Ng, R. T., & Sander, J. (2000, May).\n LOF: identifying density-based local outliers. In ACM sigmod record.\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.neighbors import LocalOutlierFactor\n >>> X = [[-1.1], [0.2], [101.1], [0.3]]\n >>> clf = LocalOutlierFactor(n_neighbors=2)\n >>> clf.fit_predict(X)\n array([ 1, 1, -1, 1])\n >>> clf.negative_outlier_factor_\n array([ -0.9821..., -1.0370..., -73.3697..., -0.9821...])\n \"\"\"\n \n def __init__(self, n_neighbors=20, *, algorithm='auto', leaf_size=30, metric='minkowski', p=2, metric_params=None, contamination='auto', novelty=False, n_jobs=None):\n super().__init__(n_neighbors=n_neighbors, algorithm=algorithm, leaf_size=leaf_size, metric=metric, p=p, metric_params=metric_params, n_jobs=n_jobs)\n self.contamination = contamination\n self.novelty = novelty\n \n def _check_novelty_fit_predict(self):\n if self.novelty:\n msg = 'fit_predict is not available when novelty=True. Use novelty=False if you want to predict on the training set.'\n raise AttributeError(msg)\n return True\n \n @available_if(_check_novelty_fit_predict)\n def fit_predict(self, X, y=None):\n \"\"\"Fit the model to the training set X and return the labels.\n\n **Not available for novelty detection (when novelty is set to True).**\n Label is 1 for an inlier and -1 for an outlier according to the LOF\n score and the contamination parameter.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features), default=None\n The query sample or samples to compute the Local Outlier Factor\n w.r.t. to the training samples.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n Returns\n -------\n is_inlier : ndarray of shape (n_samples,)\n Returns -1 for anomalies/outliers and 1 for inliers.\n \"\"\"\n return self.fit(X)._predict()\n \n def fit(self, X, y=None):\n \"\"\"Fit the local outlier factor detector from the training dataset.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features) or (n_samples, n_samples) if metric='precomputed'\n Training data.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n Returns\n -------\n self : LocalOutlierFactor\n The fitted local outlier factor detector.\n \"\"\"\n self._fit(X)\n if self.contamination != 'auto':\n if not 0.0 < self.contamination <= 0.5:\n raise ValueError('contamination must be in (0, 0.5], got: %f' % self.contamination)\n n_samples = self.n_samples_fit_\n if self.n_neighbors > n_samples:\n warnings.warn('n_neighbors (%s) is greater than the total number of samples (%s). n_neighbors will be set to (n_samples - 1) for estimation.' 
% (self.n_neighbors, n_samples))\n self.n_neighbors_ = max(1, min(self.n_neighbors, n_samples - 1))\n (self._distances_fit_X_, _neighbors_indices_fit_X_) = self.kneighbors(n_neighbors=self.n_neighbors_)\n self._lrd = self._local_reachability_density(self._distances_fit_X_, _neighbors_indices_fit_X_)\n lrd_ratios_array = self._lrd[_neighbors_indices_fit_X_] / self._lrd[:, np.newaxis]\n self.negative_outlier_factor_ = -np.mean(lrd_ratios_array, axis=1)\n if self.contamination == 'auto':\n self.offset_ = -1.5\n else:\n self.offset_ = np.percentile(self.negative_outlier_factor_, 100.0 * self.contamination)\n return self\n \n def _check_novelty_predict(self):\n if not self.novelty:\n msg = 'predict is not available when novelty=False, use fit_predict if you want to predict on training data. Use novelty=True if you want to use LOF for novelty detection and predict on new unseen data.'\n raise AttributeError(msg)\n return True\n \n @available_if(_check_novelty_predict)\n def predict(self, X=None):\n \"\"\"Predict the labels (1 inlier, -1 outlier) of X according to LOF.\n\n **Only available for novelty detection (when novelty is set to True).**\n This method allows to generalize prediction to *new observations* (not\n in the training set).\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The query sample or samples to compute the Local Outlier Factor\n w.r.t. to the training samples.\n\n Returns\n -------\n is_inlier : ndarray of shape (n_samples,)\n Returns -1 for anomalies/outliers and +1 for inliers.\n \"\"\"\n return self._predict(X)\n \n def _predict(self, X=None):\n \"\"\"Predict the labels (1 inlier, -1 outlier) of X according to LOF.\n\n If X is None, returns the same as fit_predict(X_train).\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features), default=None\n The query sample or samples to compute the Local Outlier Factor\n w.r.t. to the training samples. If None, makes prediction on the\n training data without considering them as their own neighbors.\n\n Returns\n -------\n is_inlier : ndarray of shape (n_samples,)\n Returns -1 for anomalies/outliers and +1 for inliers.\n \"\"\"\n check_is_fitted(self)\n if X is not None:\n X = check_array(X, accept_sparse='csr')\n is_inlier = np.ones(X.shape[0], dtype=int)\n is_inlier[self.decision_function(X) < 0] = -1\n else:\n is_inlier = np.ones(self.n_samples_fit_, dtype=int)\n is_inlier[self.negative_outlier_factor_ < self.offset_] = -1\n return is_inlier\n \n def _check_novelty_decision_function(self):\n if not self.novelty:\n msg = 'decision_function is not available when novelty=False. Use novelty=True if you want to use LOF for novelty detection and compute decision_function for new unseen data. Note that the opposite LOF of the training samples is always available by considering the negative_outlier_factor_ attribute.'\n raise AttributeError(msg)\n return True\n \n @available_if(_check_novelty_decision_function)\n def decision_function(self, X):\n \"\"\"Shifted opposite of the Local Outlier Factor of X.\n\n Bigger is better, i.e. 
large values correspond to inliers.\n\n **Only available for novelty detection (when novelty is set to True).**\n The shift offset allows a zero threshold for being an outlier.\n The argument X is supposed to contain *new data*: if X contains a\n point from training, it considers the later in its own neighborhood.\n Also, the samples in X are not considered in the neighborhood of any\n point.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The query sample or samples to compute the Local Outlier Factor\n w.r.t. the training samples.\n\n Returns\n -------\n shifted_opposite_lof_scores : ndarray of shape (n_samples,)\n The shifted opposite of the Local Outlier Factor of each input\n samples. The lower, the more abnormal. Negative scores represent\n outliers, positive scores represent inliers.\n \"\"\"\n return self.score_samples(X) - self.offset_\n \n def _check_novelty_score_samples(self):\n if not self.novelty:\n msg = 'score_samples is not available when novelty=False. The scores of the training samples are always available through the negative_outlier_factor_ attribute. Use novelty=True if you want to use LOF for novelty detection and compute score_samples for new unseen data.'\n raise AttributeError(msg)\n return True\n \n @available_if(_check_novelty_score_samples)\n def score_samples(self, X):\n \"\"\"Opposite of the Local Outlier Factor of X.\n\n It is the opposite as bigger is better, i.e. large values correspond\n to inliers.\n\n **Only available for novelty detection (when novelty is set to True).**\n The argument X is supposed to contain *new data*: if X contains a\n point from training, it considers the later in its own neighborhood.\n Also, the samples in X are not considered in the neighborhood of any\n point.\n The score_samples on training data is available by considering the\n the ``negative_outlier_factor_`` attribute.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The query sample or samples to compute the Local Outlier Factor\n w.r.t. 
the training samples.\n\n Returns\n -------\n opposite_lof_scores : ndarray of shape (n_samples,)\n The opposite of the Local Outlier Factor of each input samples.\n The lower, the more abnormal.\n \"\"\"\n check_is_fitted(self)\n X = check_array(X, accept_sparse='csr')\n (distances_X, neighbors_indices_X) = self.kneighbors(X, n_neighbors=self.n_neighbors_)\n X_lrd = self._local_reachability_density(distances_X, neighbors_indices_X)\n lrd_ratios_array = self._lrd[neighbors_indices_X] / X_lrd[:, np.newaxis]\n return -np.mean(lrd_ratios_array, axis=1)\n \n def _local_reachability_density(self, distances_X, neighbors_indices):\n \"\"\"The local reachability density (LRD)\n\n The LRD of a sample is the inverse of the average reachability\n distance of its k-nearest neighbors.\n\n Parameters\n ----------\n distances_X : ndarray of shape (n_queries, self.n_neighbors)\n Distances to the neighbors (in the training samples `self._fit_X`)\n of each query point to compute the LRD.\n\n neighbors_indices : ndarray of shape (n_queries, self.n_neighbors)\n Neighbors indices (of each query point) among training samples\n self._fit_X.\n\n Returns\n -------\n local_reachability_density : ndarray of shape (n_queries,)\n The local reachability density of each sample.\n \"\"\"\n dist_k = self._distances_fit_X_[neighbors_indices, self.n_neighbors_ - 1]\n reach_dist_array = np.maximum(distances_X, dist_k)\n return 1.0 / (np.mean(reach_dist_array, axis=1) + 1e-10)\n" }, @@ -25515,7 +25611,7 @@ "sklearn.neighbors._nca.NeighborhoodComponentsAnalysis._more_tags" ], "is_public": true, - "description": "Neighborhood Components Analysis.\n\nNeighborhood Component Analysis (NCA) is a machine learning algorithm for metric learning. It learns a linear transformation in a supervised fashion to improve the classification accuracy of a stochastic nearest neighbors rule in the transformed space. Read more in the :ref:`User Guide `.", + "description": "Neighborhood Components Analysis.\n\nNeighborhood Component Analysis (NCA) is a machine learning algorithm for\nmetric learning. It learns a linear transformation in a supervised fashion\nto improve the classification accuracy of a stochastic nearest neighbors\nrule in the transformed space.\n\nRead more in the :ref:`User Guide `.", "docstring": "Neighborhood Components Analysis.\n\n Neighborhood Component Analysis (NCA) is a machine learning algorithm for\n metric learning. It learns a linear transformation in a supervised fashion\n to improve the classification accuracy of a stochastic nearest neighbors\n rule in the transformed space.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n n_components : int, default=None\n Preferred dimensionality of the projected space.\n If None it will be set to `n_features`.\n\n init : {'auto', 'pca', 'lda', 'identity', 'random'} or ndarray of shape (n_features_a, n_features_b), default='auto'\n Initialization of the linear transformation. Possible options are\n `'auto'`, `'pca'`, `'lda'`, `'identity'`, `'random'`, and a numpy\n array of shape `(n_features_a, n_features_b)`.\n\n - `'auto'`\n Depending on `n_components`, the most reasonable initialization\n will be chosen. If `n_components <= n_classes` we use `'lda'`, as\n it uses labels information. If not, but\n `n_components < min(n_features, n_samples)`, we use `'pca'`, as\n it projects data in meaningful directions (those of higher\n variance). 
Otherwise, we just use `'identity'`.\n\n - `'pca'`\n `n_components` principal components of the inputs passed\n to :meth:`fit` will be used to initialize the transformation.\n (See :class:`~sklearn.decomposition.PCA`)\n\n - `'lda'`\n `min(n_components, n_classes)` most discriminative\n components of the inputs passed to :meth:`fit` will be used to\n initialize the transformation. (If `n_components > n_classes`,\n the rest of the components will be zero.) (See\n :class:`~sklearn.discriminant_analysis.LinearDiscriminantAnalysis`)\n\n - `'identity'`\n If `n_components` is strictly smaller than the\n dimensionality of the inputs passed to :meth:`fit`, the identity\n matrix will be truncated to the first `n_components` rows.\n\n - `'random'`\n The initial transformation will be a random array of shape\n `(n_components, n_features)`. Each value is sampled from the\n standard normal distribution.\n\n - numpy array\n `n_features_b` must match the dimensionality of the inputs passed\n to :meth:`fit` and n_features_a must be less than or equal to that.\n If `n_components` is not `None`, `n_features_a` must match it.\n\n warm_start : bool, default=False\n If `True` and :meth:`fit` has been called before, the solution of the\n previous call to :meth:`fit` is used as the initial linear\n transformation (`n_components` and `init` will be ignored).\n\n max_iter : int, default=50\n Maximum number of iterations in the optimization.\n\n tol : float, default=1e-5\n Convergence tolerance for the optimization.\n\n callback : callable, default=None\n If not `None`, this function is called after every iteration of the\n optimizer, taking as arguments the current solution (flattened\n transformation matrix) and the number of iterations. This might be\n useful in case one wants to examine or store the transformation\n found after each iteration.\n\n verbose : int, default=0\n If 0, no progress messages will be printed.\n If 1, progress messages will be printed to stdout.\n If > 1, progress messages will be printed and the `disp`\n parameter of :func:`scipy.optimize.minimize` will be set to\n `verbose - 2`.\n\n random_state : int or numpy.RandomState, default=None\n A pseudo random number generator object or a seed for it if int. If\n `init='random'`, `random_state` is used to initialize the random\n transformation. If `init='pca'`, `random_state` is passed as an\n argument to PCA when initializing the transformation. Pass an int\n for reproducible results across multiple function calls.\n See :term:`Glossary `.\n\n Attributes\n ----------\n components_ : ndarray of shape (n_components, n_features)\n The linear transformation learned during fitting.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n n_iter_ : int\n Counts the number of iterations performed by the optimizer.\n\n random_state_ : numpy.RandomState\n Pseudo random number generator object used during initialization.\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n sklearn.discriminant_analysis.LinearDiscriminantAnalysis : Linear\n Discriminant Analysis.\n sklearn.decomposition.PCA : Principal component analysis (PCA).\n\n References\n ----------\n .. [1] J. Goldberger, G. Hinton, S. Roweis, R. Salakhutdinov.\n \"Neighbourhood Components Analysis\". Advances in Neural Information\n Processing Systems. 
17, 513-520, 2005.\n http://www.cs.nyu.edu/~roweis/papers/ncanips.pdf\n\n .. [2] Wikipedia entry on Neighborhood Components Analysis\n https://en.wikipedia.org/wiki/Neighbourhood_components_analysis\n\n Examples\n --------\n >>> from sklearn.neighbors import NeighborhoodComponentsAnalysis\n >>> from sklearn.neighbors import KNeighborsClassifier\n >>> from sklearn.datasets import load_iris\n >>> from sklearn.model_selection import train_test_split\n >>> X, y = load_iris(return_X_y=True)\n >>> X_train, X_test, y_train, y_test = train_test_split(X, y,\n ... stratify=y, test_size=0.7, random_state=42)\n >>> nca = NeighborhoodComponentsAnalysis(random_state=42)\n >>> nca.fit(X_train, y_train)\n NeighborhoodComponentsAnalysis(...)\n >>> knn = KNeighborsClassifier(n_neighbors=3)\n >>> knn.fit(X_train, y_train)\n KNeighborsClassifier(...)\n >>> print(knn.score(X_test, y_test))\n 0.933333...\n >>> knn.fit(nca.transform(X_train), y_train)\n KNeighborsClassifier(...)\n >>> print(knn.score(nca.transform(X_test), y_test))\n 0.961904...\n ", "source_code": "\n\nclass NeighborhoodComponentsAnalysis(TransformerMixin, BaseEstimator):\n \"\"\"Neighborhood Components Analysis.\n\n Neighborhood Component Analysis (NCA) is a machine learning algorithm for\n metric learning. It learns a linear transformation in a supervised fashion\n to improve the classification accuracy of a stochastic nearest neighbors\n rule in the transformed space.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n n_components : int, default=None\n Preferred dimensionality of the projected space.\n If None it will be set to `n_features`.\n\n init : {'auto', 'pca', 'lda', 'identity', 'random'} or ndarray of shape (n_features_a, n_features_b), default='auto'\n Initialization of the linear transformation. Possible options are\n `'auto'`, `'pca'`, `'lda'`, `'identity'`, `'random'`, and a numpy\n array of shape `(n_features_a, n_features_b)`.\n\n - `'auto'`\n Depending on `n_components`, the most reasonable initialization\n will be chosen. If `n_components <= n_classes` we use `'lda'`, as\n it uses labels information. If not, but\n `n_components < min(n_features, n_samples)`, we use `'pca'`, as\n it projects data in meaningful directions (those of higher\n variance). Otherwise, we just use `'identity'`.\n\n - `'pca'`\n `n_components` principal components of the inputs passed\n to :meth:`fit` will be used to initialize the transformation.\n (See :class:`~sklearn.decomposition.PCA`)\n\n - `'lda'`\n `min(n_components, n_classes)` most discriminative\n components of the inputs passed to :meth:`fit` will be used to\n initialize the transformation. (If `n_components > n_classes`,\n the rest of the components will be zero.) (See\n :class:`~sklearn.discriminant_analysis.LinearDiscriminantAnalysis`)\n\n - `'identity'`\n If `n_components` is strictly smaller than the\n dimensionality of the inputs passed to :meth:`fit`, the identity\n matrix will be truncated to the first `n_components` rows.\n\n - `'random'`\n The initial transformation will be a random array of shape\n `(n_components, n_features)`. 
Each value is sampled from the\n standard normal distribution.\n\n - numpy array\n `n_features_b` must match the dimensionality of the inputs passed\n to :meth:`fit` and n_features_a must be less than or equal to that.\n If `n_components` is not `None`, `n_features_a` must match it.\n\n warm_start : bool, default=False\n If `True` and :meth:`fit` has been called before, the solution of the\n previous call to :meth:`fit` is used as the initial linear\n transformation (`n_components` and `init` will be ignored).\n\n max_iter : int, default=50\n Maximum number of iterations in the optimization.\n\n tol : float, default=1e-5\n Convergence tolerance for the optimization.\n\n callback : callable, default=None\n If not `None`, this function is called after every iteration of the\n optimizer, taking as arguments the current solution (flattened\n transformation matrix) and the number of iterations. This might be\n useful in case one wants to examine or store the transformation\n found after each iteration.\n\n verbose : int, default=0\n If 0, no progress messages will be printed.\n If 1, progress messages will be printed to stdout.\n If > 1, progress messages will be printed and the `disp`\n parameter of :func:`scipy.optimize.minimize` will be set to\n `verbose - 2`.\n\n random_state : int or numpy.RandomState, default=None\n A pseudo random number generator object or a seed for it if int. If\n `init='random'`, `random_state` is used to initialize the random\n transformation. If `init='pca'`, `random_state` is passed as an\n argument to PCA when initializing the transformation. Pass an int\n for reproducible results across multiple function calls.\n See :term:`Glossary `.\n\n Attributes\n ----------\n components_ : ndarray of shape (n_components, n_features)\n The linear transformation learned during fitting.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n n_iter_ : int\n Counts the number of iterations performed by the optimizer.\n\n random_state_ : numpy.RandomState\n Pseudo random number generator object used during initialization.\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n sklearn.discriminant_analysis.LinearDiscriminantAnalysis : Linear\n Discriminant Analysis.\n sklearn.decomposition.PCA : Principal component analysis (PCA).\n\n References\n ----------\n .. [1] J. Goldberger, G. Hinton, S. Roweis, R. Salakhutdinov.\n \"Neighbourhood Components Analysis\". Advances in Neural Information\n Processing Systems. 17, 513-520, 2005.\n http://www.cs.nyu.edu/~roweis/papers/ncanips.pdf\n\n .. [2] Wikipedia entry on Neighborhood Components Analysis\n https://en.wikipedia.org/wiki/Neighbourhood_components_analysis\n\n Examples\n --------\n >>> from sklearn.neighbors import NeighborhoodComponentsAnalysis\n >>> from sklearn.neighbors import KNeighborsClassifier\n >>> from sklearn.datasets import load_iris\n >>> from sklearn.model_selection import train_test_split\n >>> X, y = load_iris(return_X_y=True)\n >>> X_train, X_test, y_train, y_test = train_test_split(X, y,\n ... 
stratify=y, test_size=0.7, random_state=42)\n >>> nca = NeighborhoodComponentsAnalysis(random_state=42)\n >>> nca.fit(X_train, y_train)\n NeighborhoodComponentsAnalysis(...)\n >>> knn = KNeighborsClassifier(n_neighbors=3)\n >>> knn.fit(X_train, y_train)\n KNeighborsClassifier(...)\n >>> print(knn.score(X_test, y_test))\n 0.933333...\n >>> knn.fit(nca.transform(X_train), y_train)\n KNeighborsClassifier(...)\n >>> print(knn.score(nca.transform(X_test), y_test))\n 0.961904...\n \"\"\"\n \n def __init__(self, n_components=None, *, init='auto', warm_start=False, max_iter=50, tol=1e-05, callback=None, verbose=0, random_state=None):\n self.n_components = n_components\n self.init = init\n self.warm_start = warm_start\n self.max_iter = max_iter\n self.tol = tol\n self.callback = callback\n self.verbose = verbose\n self.random_state = random_state\n \n def fit(self, X, y):\n \"\"\"Fit the model according to the given training data.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The training samples.\n\n y : array-like of shape (n_samples,)\n The corresponding training labels.\n\n Returns\n -------\n self : object\n Fitted estimator.\n \"\"\"\n (X, y, init) = self._validate_params(X, y)\n self.random_state_ = check_random_state(self.random_state)\n t_train = time.time()\n same_class_mask = y[:, np.newaxis] == y[np.newaxis, :]\n transformation = self._initialize(X, y, init)\n disp = self.verbose - 2 if self.verbose > 1 else -1\n optimizer_params = {'method': 'L-BFGS-B', 'fun': self._loss_grad_lbfgs, 'args': (X, same_class_mask, -1.0), 'jac': True, 'x0': transformation, 'tol': self.tol, 'options': dict(maxiter=self.max_iter, disp=disp), 'callback': self._callback}\n self.n_iter_ = 0\n opt_result = minimize(**optimizer_params)\n self.components_ = opt_result.x.reshape(-1, X.shape[1])\n t_train = time.time() - t_train\n if self.verbose:\n cls_name = self.__class__.__name__\n if not opt_result.success:\n warn('[{}] NCA did not converge: {}'.format(cls_name, opt_result.message), ConvergenceWarning)\n print('[{}] Training took {:8.2f}s.'.format(cls_name, t_train))\n return self\n \n def transform(self, X):\n \"\"\"Apply the learned transformation to the given data.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Data samples.\n\n Returns\n -------\n X_embedded: ndarray of shape (n_samples, n_components)\n The data samples transformed.\n\n Raises\n ------\n NotFittedError\n If :meth:`fit` has not been called before.\n \"\"\"\n check_is_fitted(self)\n X = self._validate_data(X, reset=False)\n return np.dot(X, self.components_.T)\n \n def _validate_params(self, X, y):\n \"\"\"Validate parameters as soon as :meth:`fit` is called.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The training samples.\n\n y : array-like of shape (n_samples,)\n The corresponding training labels.\n\n Returns\n -------\n X : ndarray of shape (n_samples, n_features)\n The validated training samples.\n\n y : ndarray of shape (n_samples,)\n The validated training labels, encoded to be integers in\n the `range(0, n_classes)`.\n\n init : str or ndarray of shape (n_features_a, n_features_b)\n The validated initialization of the linear transformation.\n\n Raises\n -------\n TypeError\n If a parameter is not an instance of the desired type.\n\n ValueError\n If a parameter's value violates its legal value range or if the\n combination of two or more given parameters is incompatible.\n \"\"\"\n (X, y) = self._validate_data(X, y, 
ensure_min_samples=2)\n check_classification_targets(y)\n y = LabelEncoder().fit_transform(y)\n if self.n_components is not None:\n check_scalar(self.n_components, 'n_components', numbers.Integral, min_val=1)\n if self.n_components > X.shape[1]:\n raise ValueError('The preferred dimensionality of the projected space `n_components` ({}) cannot be greater than the given data dimensionality ({})!'.format(self.n_components, X.shape[1]))\n check_scalar(self.warm_start, 'warm_start', bool)\n if self.warm_start and hasattr(self, 'components_'):\n if self.components_.shape[1] != X.shape[1]:\n raise ValueError('The new inputs dimensionality ({}) does not match the input dimensionality of the previously learned transformation ({}).'.format(X.shape[1], self.components_.shape[1]))\n check_scalar(self.max_iter, 'max_iter', numbers.Integral, min_val=1)\n check_scalar(self.tol, 'tol', numbers.Real, min_val=0.0)\n check_scalar(self.verbose, 'verbose', numbers.Integral, min_val=0)\n if self.callback is not None:\n if not callable(self.callback):\n raise ValueError('`callback` is not callable.')\n init = self.init\n if isinstance(init, np.ndarray):\n init = check_array(init)\n if init.shape[1] != X.shape[1]:\n raise ValueError('The input dimensionality ({}) of the given linear transformation `init` must match the dimensionality of the given inputs `X` ({}).'.format(init.shape[1], X.shape[1]))\n if init.shape[0] > init.shape[1]:\n raise ValueError('The output dimensionality ({}) of the given linear transformation `init` cannot be greater than its input dimensionality ({}).'.format(init.shape[0], init.shape[1]))\n if self.n_components is not None:\n if self.n_components != init.shape[0]:\n raise ValueError('The preferred dimensionality of the projected space `n_components` ({}) does not match the output dimensionality of the given linear transformation `init` ({})!'.format(self.n_components, init.shape[0]))\n elif init in ['auto', 'pca', 'lda', 'identity', 'random']:\n pass\n else:\n raise ValueError(\"`init` must be 'auto', 'pca', 'lda', 'identity', 'random' or a numpy array of shape (n_components, n_features).\")\n return X, y, init\n \n def _initialize(self, X, y, init):\n \"\"\"Initialize the transformation.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The training samples.\n\n y : array-like of shape (n_samples,)\n The training labels.\n\n init : str or ndarray of shape (n_features_a, n_features_b)\n The validated initialization of the linear transformation.\n\n Returns\n -------\n transformation : ndarray of shape (n_components, n_features)\n The initialized linear transformation.\n\n \"\"\"\n transformation = init\n if self.warm_start and hasattr(self, 'components_'):\n transformation = self.components_\n elif isinstance(init, np.ndarray):\n pass\n else:\n (n_samples, n_features) = X.shape\n n_components = self.n_components or n_features\n if init == 'auto':\n n_classes = len(np.unique(y))\n if n_components <= min(n_features, n_classes - 1):\n init = 'lda'\n elif n_components < min(n_features, n_samples):\n init = 'pca'\n else:\n init = 'identity'\n if init == 'identity':\n transformation = np.eye(n_components, X.shape[1])\n elif init == 'random':\n transformation = self.random_state_.randn(n_components, X.shape[1])\n elif init in {'pca', 'lda'}:\n init_time = time.time()\n if init == 'pca':\n pca = PCA(n_components=n_components, random_state=self.random_state_)\n if self.verbose:\n print('Finding principal components... 
', end='')\n sys.stdout.flush()\n pca.fit(X)\n transformation = pca.components_\n elif init == 'lda':\n from ..discriminant_analysis import LinearDiscriminantAnalysis\n lda = LinearDiscriminantAnalysis(n_components=n_components)\n if self.verbose:\n print('Finding most discriminative components... ', end='')\n sys.stdout.flush()\n lda.fit(X, y)\n transformation = lda.scalings_.T[:n_components]\n if self.verbose:\n print('done in {:5.2f}s'.format(time.time() - init_time))\n return transformation\n \n def _callback(self, transformation):\n \"\"\"Called after each iteration of the optimizer.\n\n Parameters\n ----------\n transformation : ndarray of shape (n_components * n_features,)\n The solution computed by the optimizer in this iteration.\n \"\"\"\n if self.callback is not None:\n self.callback(transformation, self.n_iter_)\n self.n_iter_ += 1\n \n def _loss_grad_lbfgs(self, transformation, X, same_class_mask, sign=1.0):\n \"\"\"Compute the loss and the loss gradient w.r.t. `transformation`.\n\n Parameters\n ----------\n transformation : ndarray of shape (n_components * n_features,)\n The raveled linear transformation on which to compute loss and\n evaluate gradient.\n\n X : ndarray of shape (n_samples, n_features)\n The training samples.\n\n same_class_mask : ndarray of shape (n_samples, n_samples)\n A mask where `mask[i, j] == 1` if `X[i]` and `X[j]` belong\n to the same class, and `0` otherwise.\n\n Returns\n -------\n loss : float\n The loss computed for the given transformation.\n\n gradient : ndarray of shape (n_components * n_features,)\n The new (flattened) gradient of the loss.\n \"\"\"\n if self.n_iter_ == 0:\n self.n_iter_ += 1\n if self.verbose:\n header_fields = ['Iteration', 'Objective Value', 'Time(s)']\n header_fmt = '{:>10} {:>20} {:>10}'\n header = header_fmt.format(*header_fields)\n cls_name = self.__class__.__name__\n print('[{}]'.format(cls_name))\n print('[{}] {}\\n[{}] {}'.format(cls_name, header, cls_name, '-' * len(header)))\n t_funcall = time.time()\n transformation = transformation.reshape(-1, X.shape[1])\n X_embedded = np.dot(X, transformation.T)\n p_ij = pairwise_distances(X_embedded, squared=True)\n np.fill_diagonal(p_ij, np.inf)\n p_ij = softmax(-p_ij)\n masked_p_ij = p_ij * same_class_mask\n p = np.sum(masked_p_ij, axis=1, keepdims=True)\n loss = np.sum(p)\n weighted_p_ij = masked_p_ij - p_ij * p\n weighted_p_ij_sym = weighted_p_ij + weighted_p_ij.T\n np.fill_diagonal(weighted_p_ij_sym, -weighted_p_ij.sum(axis=0))\n gradient = 2 * X_embedded.T.dot(weighted_p_ij_sym).dot(X)\n if self.verbose:\n t_funcall = time.time() - t_funcall\n values_fmt = '[{}] {:>10} {:>20.6e} {:>10.2f}'\n print(values_fmt.format(self.__class__.__name__, self.n_iter_, loss, t_funcall))\n sys.stdout.flush()\n return sign * loss, sign * gradient.ravel()\n \n def _more_tags(self):\n return {'requires_y': True}\n" }, @@ -25530,9 +25626,9 @@ "sklearn.neighbors._nearest_centroid.NearestCentroid.predict" ], "is_public": true, - "description": "Nearest centroid classifier.\n\nEach class is represented by its centroid, with test samples classified to the class with the nearest centroid. Read more in the :ref:`User Guide `.", - "docstring": "Nearest centroid classifier.\n\n Each class is represented by its centroid, with test samples classified to\n the class with the nearest centroid.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n metric : str or callable\n The metric to use when calculating distance between instances in a\n feature array. 
If metric is a string or callable, it must be one of\n the options allowed by\n :func:`~sklearn.metrics.pairwise_distances` for its metric\n parameter. The centroids for the samples corresponding to each class is\n the point from which the sum of the distances (according to the metric)\n of all samples that belong to that particular class are minimized.\n If the `\"manhattan\"` metric is provided, this centroid is the median\n and for all other metrics, the centroid is now set to be the mean.\n\n .. versionchanged:: 0.19\n `metric='precomputed'` was deprecated and now raises an error\n\n shrink_threshold : float, default=None\n Threshold for shrinking centroids to remove features.\n\n Attributes\n ----------\n centroids_ : array-like of shape (n_classes, n_features)\n Centroid of each class.\n\n classes_ : array of shape (n_classes,)\n The unique classes labels.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n KNeighborsClassifier : Nearest neighbors classifier.\n\n Notes\n -----\n When used for text classification with tf-idf vectors, this classifier is\n also known as the Rocchio classifier.\n\n References\n ----------\n Tibshirani, R., Hastie, T., Narasimhan, B., & Chu, G. (2002). Diagnosis of\n multiple cancer types by shrunken centroids of gene expression. Proceedings\n of the National Academy of Sciences of the United States of America,\n 99(10), 6567-6572. The National Academy of Sciences.\n\n Examples\n --------\n >>> from sklearn.neighbors import NearestCentroid\n >>> import numpy as np\n >>> X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])\n >>> y = np.array([1, 1, 1, 2, 2, 2])\n >>> clf = NearestCentroid()\n >>> clf.fit(X, y)\n NearestCentroid()\n >>> print(clf.predict([[-0.8, -1]]))\n [1]\n ", - "source_code": "\n\nclass NearestCentroid(ClassifierMixin, BaseEstimator):\n \"\"\"Nearest centroid classifier.\n\n Each class is represented by its centroid, with test samples classified to\n the class with the nearest centroid.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n metric : str or callable\n The metric to use when calculating distance between instances in a\n feature array. If metric is a string or callable, it must be one of\n the options allowed by\n :func:`~sklearn.metrics.pairwise_distances` for its metric\n parameter. The centroids for the samples corresponding to each class is\n the point from which the sum of the distances (according to the metric)\n of all samples that belong to that particular class are minimized.\n If the `\"manhattan\"` metric is provided, this centroid is the median\n and for all other metrics, the centroid is now set to be the mean.\n\n .. versionchanged:: 0.19\n `metric='precomputed'` was deprecated and now raises an error\n\n shrink_threshold : float, default=None\n Threshold for shrinking centroids to remove features.\n\n Attributes\n ----------\n centroids_ : array-like of shape (n_classes, n_features)\n Centroid of each class.\n\n classes_ : array of shape (n_classes,)\n The unique classes labels.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. 
Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n KNeighborsClassifier : Nearest neighbors classifier.\n\n Notes\n -----\n When used for text classification with tf-idf vectors, this classifier is\n also known as the Rocchio classifier.\n\n References\n ----------\n Tibshirani, R., Hastie, T., Narasimhan, B., & Chu, G. (2002). Diagnosis of\n multiple cancer types by shrunken centroids of gene expression. Proceedings\n of the National Academy of Sciences of the United States of America,\n 99(10), 6567-6572. The National Academy of Sciences.\n\n Examples\n --------\n >>> from sklearn.neighbors import NearestCentroid\n >>> import numpy as np\n >>> X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])\n >>> y = np.array([1, 1, 1, 2, 2, 2])\n >>> clf = NearestCentroid()\n >>> clf.fit(X, y)\n NearestCentroid()\n >>> print(clf.predict([[-0.8, -1]]))\n [1]\n \"\"\"\n \n def __init__(self, metric='euclidean', *, shrink_threshold=None):\n self.metric = metric\n self.shrink_threshold = shrink_threshold\n \n def fit(self, X, y):\n \"\"\"\n Fit the NearestCentroid model according to the given training data.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training vector, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n Note that centroid shrinking cannot be used with sparse matrices.\n y : array-like of shape (n_samples,)\n Target values.\n\n Returns\n -------\n self : object\n Fitted estimator.\n \"\"\"\n if self.metric == 'precomputed':\n raise ValueError('Precomputed is not supported.')\n if self.metric == 'manhattan':\n (X, y) = self._validate_data(X, y, accept_sparse=['csc'])\n else:\n (X, y) = self._validate_data(X, y, accept_sparse=['csr', 'csc'])\n is_X_sparse = sp.issparse(X)\n if is_X_sparse and self.shrink_threshold:\n raise ValueError('threshold shrinking not supported for sparse input')\n check_classification_targets(y)\n (n_samples, n_features) = X.shape\n le = LabelEncoder()\n y_ind = le.fit_transform(y)\n self.classes_ = classes = le.classes_\n n_classes = classes.size\n if n_classes < 2:\n raise ValueError('The number of classes has to be greater than one; got %d class' % n_classes)\n self.centroids_ = np.empty((n_classes, n_features), dtype=np.float64)\n nk = np.zeros(n_classes)\n for cur_class in range(n_classes):\n center_mask = y_ind == cur_class\n nk[cur_class] = np.sum(center_mask)\n if is_X_sparse:\n center_mask = np.where(center_mask)[0]\n if self.metric == 'manhattan':\n if not is_X_sparse:\n self.centroids_[cur_class] = np.median(X[center_mask], axis=0)\n else:\n self.centroids_[cur_class] = csc_median_axis_0(X[center_mask])\n else:\n if self.metric != 'euclidean':\n warnings.warn('Averaging for metrics other than euclidean and manhattan not supported. The average is set to be the mean.')\n self.centroids_[cur_class] = X[center_mask].mean(axis=0)\n if self.shrink_threshold:\n if np.all(np.ptp(X, axis=0) == 0):\n raise ValueError('All features have zero variance. 
Division by zero.')\n dataset_centroid_ = np.mean(X, axis=0)\n m = np.sqrt(1.0 / nk - 1.0 / n_samples)\n variance = (X - self.centroids_[y_ind])**2\n variance = variance.sum(axis=0)\n s = np.sqrt(variance / (n_samples - n_classes))\n s += np.median(s)\n mm = m.reshape(len(m), 1)\n ms = mm * s\n deviation = (self.centroids_ - dataset_centroid_) / ms\n signs = np.sign(deviation)\n deviation = np.abs(deviation) - self.shrink_threshold\n np.clip(deviation, 0, None, out=deviation)\n deviation *= signs\n msd = ms * deviation\n self.centroids_ = dataset_centroid_[np.newaxis, :] + msd\n return self\n \n def predict(self, X):\n \"\"\"Perform classification on an array of test vectors `X`.\n\n The predicted class `C` for each sample in `X` is returned.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Test samples.\n\n Returns\n -------\n C : ndarray of shape (n_samples,)\n The predicted classes.\n\n Notes\n -----\n If the metric constructor parameter is `\"precomputed\"`, `X` is assumed\n to be the distance matrix between the data to be predicted and\n `self.centroids_`.\n \"\"\"\n check_is_fitted(self)\n X = self._validate_data(X, accept_sparse='csr', reset=False)\n return self.classes_[pairwise_distances(X, self.centroids_, metric=self.metric).argmin(axis=1)]\n" + "description": "Nearest centroid classifier.\n\nEach class is represented by its centroid, with test samples classified to\nthe class with the nearest centroid.\n\nRead more in the :ref:`User Guide `.", + "docstring": "Nearest centroid classifier.\n\n Each class is represented by its centroid, with test samples classified to\n the class with the nearest centroid.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n metric : str or callable, default=\"euclidian\"\n The metric to use when calculating distance between instances in a\n feature array. If metric is a string or callable, it must be one of\n the options allowed by\n :func:`~sklearn.metrics.pairwise_distances` for its metric\n parameter. The centroids for the samples corresponding to each class is\n the point from which the sum of the distances (according to the metric)\n of all samples that belong to that particular class are minimized.\n If the `\"manhattan\"` metric is provided, this centroid is the median\n and for all other metrics, the centroid is now set to be the mean.\n\n .. versionchanged:: 0.19\n `metric='precomputed'` was deprecated and now raises an error\n\n shrink_threshold : float, default=None\n Threshold for shrinking centroids to remove features.\n\n Attributes\n ----------\n centroids_ : array-like of shape (n_classes, n_features)\n Centroid of each class.\n\n classes_ : array of shape (n_classes,)\n The unique classes labels.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n KNeighborsClassifier : Nearest neighbors classifier.\n\n Notes\n -----\n When used for text classification with tf-idf vectors, this classifier is\n also known as the Rocchio classifier.\n\n References\n ----------\n Tibshirani, R., Hastie, T., Narasimhan, B., & Chu, G. (2002). Diagnosis of\n multiple cancer types by shrunken centroids of gene expression. Proceedings\n of the National Academy of Sciences of the United States of America,\n 99(10), 6567-6572. 
The National Academy of Sciences.\n\n Examples\n --------\n >>> from sklearn.neighbors import NearestCentroid\n >>> import numpy as np\n >>> X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])\n >>> y = np.array([1, 1, 1, 2, 2, 2])\n >>> clf = NearestCentroid()\n >>> clf.fit(X, y)\n NearestCentroid()\n >>> print(clf.predict([[-0.8, -1]]))\n [1]\n ", + "source_code": "\n\nclass NearestCentroid(ClassifierMixin, BaseEstimator):\n \"\"\"Nearest centroid classifier.\n\n Each class is represented by its centroid, with test samples classified to\n the class with the nearest centroid.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n metric : str or callable, default=\"euclidian\"\n The metric to use when calculating distance between instances in a\n feature array. If metric is a string or callable, it must be one of\n the options allowed by\n :func:`~sklearn.metrics.pairwise_distances` for its metric\n parameter. The centroids for the samples corresponding to each class is\n the point from which the sum of the distances (according to the metric)\n of all samples that belong to that particular class are minimized.\n If the `\"manhattan\"` metric is provided, this centroid is the median\n and for all other metrics, the centroid is now set to be the mean.\n\n .. versionchanged:: 0.19\n `metric='precomputed'` was deprecated and now raises an error\n\n shrink_threshold : float, default=None\n Threshold for shrinking centroids to remove features.\n\n Attributes\n ----------\n centroids_ : array-like of shape (n_classes, n_features)\n Centroid of each class.\n\n classes_ : array of shape (n_classes,)\n The unique classes labels.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n KNeighborsClassifier : Nearest neighbors classifier.\n\n Notes\n -----\n When used for text classification with tf-idf vectors, this classifier is\n also known as the Rocchio classifier.\n\n References\n ----------\n Tibshirani, R., Hastie, T., Narasimhan, B., & Chu, G. (2002). Diagnosis of\n multiple cancer types by shrunken centroids of gene expression. Proceedings\n of the National Academy of Sciences of the United States of America,\n 99(10), 6567-6572. 
The National Academy of Sciences.\n\n Examples\n --------\n >>> from sklearn.neighbors import NearestCentroid\n >>> import numpy as np\n >>> X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])\n >>> y = np.array([1, 1, 1, 2, 2, 2])\n >>> clf = NearestCentroid()\n >>> clf.fit(X, y)\n NearestCentroid()\n >>> print(clf.predict([[-0.8, -1]]))\n [1]\n \"\"\"\n \n def __init__(self, metric='euclidean', *, shrink_threshold=None):\n self.metric = metric\n self.shrink_threshold = shrink_threshold\n \n def fit(self, X, y):\n \"\"\"\n Fit the NearestCentroid model according to the given training data.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training vector, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n Note that centroid shrinking cannot be used with sparse matrices.\n y : array-like of shape (n_samples,)\n Target values.\n\n Returns\n -------\n self : object\n Fitted estimator.\n \"\"\"\n if self.metric == 'precomputed':\n raise ValueError('Precomputed is not supported.')\n if self.metric == 'manhattan':\n (X, y) = self._validate_data(X, y, accept_sparse=['csc'])\n else:\n (X, y) = self._validate_data(X, y, accept_sparse=['csr', 'csc'])\n is_X_sparse = sp.issparse(X)\n if is_X_sparse and self.shrink_threshold:\n raise ValueError('threshold shrinking not supported for sparse input')\n check_classification_targets(y)\n (n_samples, n_features) = X.shape\n le = LabelEncoder()\n y_ind = le.fit_transform(y)\n self.classes_ = classes = le.classes_\n n_classes = classes.size\n if n_classes < 2:\n raise ValueError('The number of classes has to be greater than one; got %d class' % n_classes)\n self.centroids_ = np.empty((n_classes, n_features), dtype=np.float64)\n nk = np.zeros(n_classes)\n for cur_class in range(n_classes):\n center_mask = y_ind == cur_class\n nk[cur_class] = np.sum(center_mask)\n if is_X_sparse:\n center_mask = np.where(center_mask)[0]\n if self.metric == 'manhattan':\n if not is_X_sparse:\n self.centroids_[cur_class] = np.median(X[center_mask], axis=0)\n else:\n self.centroids_[cur_class] = csc_median_axis_0(X[center_mask])\n else:\n if self.metric != 'euclidean':\n warnings.warn('Averaging for metrics other than euclidean and manhattan not supported. The average is set to be the mean.')\n self.centroids_[cur_class] = X[center_mask].mean(axis=0)\n if self.shrink_threshold:\n if np.all(np.ptp(X, axis=0) == 0):\n raise ValueError('All features have zero variance. 
Division by zero.')\n dataset_centroid_ = np.mean(X, axis=0)\n m = np.sqrt(1.0 / nk - 1.0 / n_samples)\n variance = (X - self.centroids_[y_ind])**2\n variance = variance.sum(axis=0)\n s = np.sqrt(variance / (n_samples - n_classes))\n s += np.median(s)\n mm = m.reshape(len(m), 1)\n ms = mm * s\n deviation = (self.centroids_ - dataset_centroid_) / ms\n signs = np.sign(deviation)\n deviation = np.abs(deviation) - self.shrink_threshold\n np.clip(deviation, 0, None, out=deviation)\n deviation *= signs\n msd = ms * deviation\n self.centroids_ = dataset_centroid_[np.newaxis, :] + msd\n return self\n \n def predict(self, X):\n \"\"\"Perform classification on an array of test vectors `X`.\n\n The predicted class `C` for each sample in `X` is returned.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Test samples.\n\n Returns\n -------\n C : ndarray of shape (n_samples,)\n The predicted classes.\n\n Notes\n -----\n If the metric constructor parameter is `\"precomputed\"`, `X` is assumed\n to be the distance matrix between the data to be predicted and\n `self.centroids_`.\n \"\"\"\n check_is_fitted(self)\n X = self._validate_data(X, accept_sparse='csr', reset=False)\n return self.classes_[pairwise_distances(X, self.centroids_, metric=self.metric).argmin(axis=1)]\n" }, { "name": "KNeighborsRegressor", @@ -25551,9 +25647,9 @@ "sklearn.neighbors._regression.KNeighborsRegressor.predict" ], "is_public": true, - "description": "Regression based on k-nearest neighbors.\n\nThe target is predicted by local interpolation of the targets associated of the nearest neighbors in the training set. Read more in the :ref:`User Guide `. .. versionadded:: 0.9", - "docstring": "Regression based on k-nearest neighbors.\n\n The target is predicted by local interpolation of the targets\n associated of the nearest neighbors in the training set.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.9\n\n Parameters\n ----------\n n_neighbors : int, default=5\n Number of neighbors to use by default for :meth:`kneighbors` queries.\n\n weights : {'uniform', 'distance'} or callable, default='uniform'\n Weight function used in prediction. Possible values:\n\n - 'uniform' : uniform weights. All points in each neighborhood\n are weighted equally.\n - 'distance' : weight points by the inverse of their distance.\n in this case, closer neighbors of a query point will have a\n greater influence than neighbors which are further away.\n - [callable] : a user-defined function which accepts an\n array of distances, and returns an array of the same shape\n containing the weights.\n\n Uniform weights are used by default.\n\n algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto'\n Algorithm used to compute the nearest neighbors:\n\n - 'ball_tree' will use :class:`BallTree`\n - 'kd_tree' will use :class:`KDTree`\n - 'brute' will use a brute-force search.\n - 'auto' will attempt to decide the most appropriate algorithm\n based on the values passed to :meth:`fit` method.\n\n Note: fitting on sparse input will override the setting of\n this parameter, using brute force.\n\n leaf_size : int, default=30\n Leaf size passed to BallTree or KDTree. This can affect the\n speed of the construction and query, as well as the memory\n required to store the tree. The optimal value depends on the\n nature of the problem.\n\n p : int, default=2\n Power parameter for the Minkowski metric. 
When p = 1, this is\n equivalent to using manhattan_distance (l1), and euclidean_distance\n (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.\n\n metric : str or callable, default='minkowski'\n The distance metric to use for the tree. The default metric is\n minkowski, and with p=2 is equivalent to the standard Euclidean\n metric. See the documentation of :class:`DistanceMetric` for a\n list of available metrics.\n If metric is \"precomputed\", X is assumed to be a distance matrix and\n must be square during fit. X may be a :term:`sparse graph`,\n in which case only \"nonzero\" elements may be considered neighbors.\n\n metric_params : dict, default=None\n Additional keyword arguments for the metric function.\n\n n_jobs : int, default=None\n The number of parallel jobs to run for neighbors search.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n Doesn't affect :meth:`fit` method.\n\n Attributes\n ----------\n effective_metric_ : str or callable\n The distance metric to use. It will be same as the `metric` parameter\n or a synonym of it, e.g. 'euclidean' if the `metric` parameter set to\n 'minkowski' and `p` parameter set to 2.\n\n effective_metric_params_ : dict\n Additional keyword arguments for the metric function. For most metrics\n will be same with `metric_params` parameter, but may also contain the\n `p` parameter value if the `effective_metric_` attribute is set to\n 'minkowski'.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n n_samples_fit_ : int\n Number of samples in the fitted data.\n\n See Also\n --------\n NearestNeighbors : Unsupervised learner for implementing neighbor searches.\n RadiusNeighborsRegressor : Regression based on neighbors within a fixed radius.\n KNeighborsClassifier : Classifier implementing the k-nearest neighbors vote.\n RadiusNeighborsClassifier : Classifier implementing\n a vote among neighbors within a given radius.\n\n Notes\n -----\n See :ref:`Nearest Neighbors ` in the online documentation\n for a discussion of the choice of ``algorithm`` and ``leaf_size``.\n\n .. warning::\n\n Regarding the Nearest Neighbors algorithms, if it is found that two\n neighbors, neighbor `k+1` and `k`, have identical distances but\n different labels, the results will depend on the ordering of the\n training data.\n\n https://en.wikipedia.org/wiki/K-nearest_neighbor_algorithm\n\n Examples\n --------\n >>> X = [[0], [1], [2], [3]]\n >>> y = [0, 0, 1, 1]\n >>> from sklearn.neighbors import KNeighborsRegressor\n >>> neigh = KNeighborsRegressor(n_neighbors=2)\n >>> neigh.fit(X, y)\n KNeighborsRegressor(...)\n >>> print(neigh.predict([[1.5]]))\n [0.5]\n ", - "source_code": "\n\nclass KNeighborsRegressor(KNeighborsMixin, RegressorMixin, NeighborsBase):\n \"\"\"Regression based on k-nearest neighbors.\n\n The target is predicted by local interpolation of the targets\n associated of the nearest neighbors in the training set.\n\n Read more in the :ref:`User Guide `.\n\n .. 
versionadded:: 0.9\n\n Parameters\n ----------\n n_neighbors : int, default=5\n Number of neighbors to use by default for :meth:`kneighbors` queries.\n\n weights : {'uniform', 'distance'} or callable, default='uniform'\n Weight function used in prediction. Possible values:\n\n - 'uniform' : uniform weights. All points in each neighborhood\n are weighted equally.\n - 'distance' : weight points by the inverse of their distance.\n in this case, closer neighbors of a query point will have a\n greater influence than neighbors which are further away.\n - [callable] : a user-defined function which accepts an\n array of distances, and returns an array of the same shape\n containing the weights.\n\n Uniform weights are used by default.\n\n algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto'\n Algorithm used to compute the nearest neighbors:\n\n - 'ball_tree' will use :class:`BallTree`\n - 'kd_tree' will use :class:`KDTree`\n - 'brute' will use a brute-force search.\n - 'auto' will attempt to decide the most appropriate algorithm\n based on the values passed to :meth:`fit` method.\n\n Note: fitting on sparse input will override the setting of\n this parameter, using brute force.\n\n leaf_size : int, default=30\n Leaf size passed to BallTree or KDTree. This can affect the\n speed of the construction and query, as well as the memory\n required to store the tree. The optimal value depends on the\n nature of the problem.\n\n p : int, default=2\n Power parameter for the Minkowski metric. When p = 1, this is\n equivalent to using manhattan_distance (l1), and euclidean_distance\n (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.\n\n metric : str or callable, default='minkowski'\n The distance metric to use for the tree. The default metric is\n minkowski, and with p=2 is equivalent to the standard Euclidean\n metric. See the documentation of :class:`DistanceMetric` for a\n list of available metrics.\n If metric is \"precomputed\", X is assumed to be a distance matrix and\n must be square during fit. X may be a :term:`sparse graph`,\n in which case only \"nonzero\" elements may be considered neighbors.\n\n metric_params : dict, default=None\n Additional keyword arguments for the metric function.\n\n n_jobs : int, default=None\n The number of parallel jobs to run for neighbors search.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n Doesn't affect :meth:`fit` method.\n\n Attributes\n ----------\n effective_metric_ : str or callable\n The distance metric to use. It will be same as the `metric` parameter\n or a synonym of it, e.g. 'euclidean' if the `metric` parameter set to\n 'minkowski' and `p` parameter set to 2.\n\n effective_metric_params_ : dict\n Additional keyword arguments for the metric function. For most metrics\n will be same with `metric_params` parameter, but may also contain the\n `p` parameter value if the `effective_metric_` attribute is set to\n 'minkowski'.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. 
versionadded:: 1.0\n\n n_samples_fit_ : int\n Number of samples in the fitted data.\n\n See Also\n --------\n NearestNeighbors : Unsupervised learner for implementing neighbor searches.\n RadiusNeighborsRegressor : Regression based on neighbors within a fixed radius.\n KNeighborsClassifier : Classifier implementing the k-nearest neighbors vote.\n RadiusNeighborsClassifier : Classifier implementing\n a vote among neighbors within a given radius.\n\n Notes\n -----\n See :ref:`Nearest Neighbors ` in the online documentation\n for a discussion of the choice of ``algorithm`` and ``leaf_size``.\n\n .. warning::\n\n Regarding the Nearest Neighbors algorithms, if it is found that two\n neighbors, neighbor `k+1` and `k`, have identical distances but\n different labels, the results will depend on the ordering of the\n training data.\n\n https://en.wikipedia.org/wiki/K-nearest_neighbor_algorithm\n\n Examples\n --------\n >>> X = [[0], [1], [2], [3]]\n >>> y = [0, 0, 1, 1]\n >>> from sklearn.neighbors import KNeighborsRegressor\n >>> neigh = KNeighborsRegressor(n_neighbors=2)\n >>> neigh.fit(X, y)\n KNeighborsRegressor(...)\n >>> print(neigh.predict([[1.5]]))\n [0.5]\n \"\"\"\n \n def __init__(self, n_neighbors=5, *, weights='uniform', algorithm='auto', leaf_size=30, p=2, metric='minkowski', metric_params=None, n_jobs=None):\n super().__init__(n_neighbors=n_neighbors, algorithm=algorithm, leaf_size=leaf_size, metric=metric, p=p, metric_params=metric_params, n_jobs=n_jobs)\n self.weights = weights\n \n def _more_tags(self):\n return {'pairwise': self.metric == 'precomputed'}\n \n @deprecated('Attribute `_pairwise` was deprecated in version 0.24 and will be removed in 1.1 (renaming of 0.26).')\n @property\n def _pairwise(self):\n return self.metric == 'precomputed'\n \n def fit(self, X, y):\n \"\"\"Fit the k-nearest neighbors regressor from the training dataset.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features) or (n_samples, n_samples) if metric='precomputed'\n Training data.\n\n y : {array-like, sparse matrix} of shape (n_samples,) or (n_samples, n_outputs)\n Target values.\n\n Returns\n -------\n self : KNeighborsRegressor\n The fitted k-nearest neighbors regressor.\n \"\"\"\n self.weights = _check_weights(self.weights)\n return self._fit(X, y)\n \n def predict(self, X):\n \"\"\"Predict the target for the provided data.\n\n Parameters\n ----------\n X : array-like of shape (n_queries, n_features), or (n_queries, n_indexed) if metric == 'precomputed'\n Test samples.\n\n Returns\n -------\n y : ndarray of shape (n_queries,) or (n_queries, n_outputs), dtype=int\n Target values.\n \"\"\"\n (neigh_dist, neigh_ind) = self.kneighbors(X)\n weights = _get_weights(neigh_dist, self.weights)\n _y = self._y\n if _y.ndim == 1:\n _y = _y.reshape((-1, 1))\n if weights is None:\n y_pred = np.mean(_y[neigh_ind], axis=1)\n else:\n y_pred = np.empty((X.shape[0], _y.shape[1]), dtype=np.float64)\n denom = np.sum(weights, axis=1)\n for j in range(_y.shape[1]):\n num = np.sum(_y[neigh_ind, j] * weights, axis=1)\n y_pred[:, j] = num / denom\n if self._y.ndim == 1:\n y_pred = y_pred.ravel()\n return y_pred\n" + "description": "Regression based on k-nearest neighbors.\n\nThe target is predicted by local interpolation of the targets\nassociated of the nearest neighbors in the training set.\n\nRead more in the :ref:`User Guide `.\n\n.. 
versionadded:: 0.9", + "docstring": "Regression based on k-nearest neighbors.\n\n The target is predicted by local interpolation of the targets\n associated of the nearest neighbors in the training set.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.9\n\n Parameters\n ----------\n n_neighbors : int, default=5\n Number of neighbors to use by default for :meth:`kneighbors` queries.\n\n weights : {'uniform', 'distance'} or callable, default='uniform'\n Weight function used in prediction. Possible values:\n\n - 'uniform' : uniform weights. All points in each neighborhood\n are weighted equally.\n - 'distance' : weight points by the inverse of their distance.\n in this case, closer neighbors of a query point will have a\n greater influence than neighbors which are further away.\n - [callable] : a user-defined function which accepts an\n array of distances, and returns an array of the same shape\n containing the weights.\n\n Uniform weights are used by default.\n\n algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto'\n Algorithm used to compute the nearest neighbors:\n\n - 'ball_tree' will use :class:`BallTree`\n - 'kd_tree' will use :class:`KDTree`\n - 'brute' will use a brute-force search.\n - 'auto' will attempt to decide the most appropriate algorithm\n based on the values passed to :meth:`fit` method.\n\n Note: fitting on sparse input will override the setting of\n this parameter, using brute force.\n\n leaf_size : int, default=30\n Leaf size passed to BallTree or KDTree. This can affect the\n speed of the construction and query, as well as the memory\n required to store the tree. The optimal value depends on the\n nature of the problem.\n\n p : int, default=2\n Power parameter for the Minkowski metric. When p = 1, this is\n equivalent to using manhattan_distance (l1), and euclidean_distance\n (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.\n\n metric : str or callable, default='minkowski'\n The distance metric to use for the tree. The default metric is\n minkowski, and with p=2 is equivalent to the standard Euclidean\n metric. See the documentation of :class:`DistanceMetric` for a\n list of available metrics.\n If metric is \"precomputed\", X is assumed to be a distance matrix and\n must be square during fit. X may be a :term:`sparse graph`,\n in which case only \"nonzero\" elements may be considered neighbors.\n\n metric_params : dict, default=None\n Additional keyword arguments for the metric function.\n\n n_jobs : int, default=None\n The number of parallel jobs to run for neighbors search.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n Doesn't affect :meth:`fit` method.\n\n Attributes\n ----------\n effective_metric_ : str or callable\n The distance metric to use. It will be same as the `metric` parameter\n or a synonym of it, e.g. 'euclidean' if the `metric` parameter set to\n 'minkowski' and `p` parameter set to 2.\n\n effective_metric_params_ : dict\n Additional keyword arguments for the metric function. For most metrics\n will be same with `metric_params` parameter, but may also contain the\n `p` parameter value if the `effective_metric_` attribute is set to\n 'minkowski'.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. 
Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n n_samples_fit_ : int\n Number of samples in the fitted data.\n\n See Also\n --------\n NearestNeighbors : Unsupervised learner for implementing neighbor searches.\n RadiusNeighborsRegressor : Regression based on neighbors within a fixed radius.\n KNeighborsClassifier : Classifier implementing the k-nearest neighbors vote.\n RadiusNeighborsClassifier : Classifier implementing\n a vote among neighbors within a given radius.\n\n Notes\n -----\n See :ref:`Nearest Neighbors ` in the online documentation\n for a discussion of the choice of ``algorithm`` and ``leaf_size``.\n\n .. warning::\n\n Regarding the Nearest Neighbors algorithms, if it is found that two\n neighbors, neighbor `k+1` and `k`, have identical distances but\n different labels, the results will depend on the ordering of the\n training data.\n\n https://en.wikipedia.org/wiki/K-nearest_neighbors_algorithm\n\n Examples\n --------\n >>> X = [[0], [1], [2], [3]]\n >>> y = [0, 0, 1, 1]\n >>> from sklearn.neighbors import KNeighborsRegressor\n >>> neigh = KNeighborsRegressor(n_neighbors=2)\n >>> neigh.fit(X, y)\n KNeighborsRegressor(...)\n >>> print(neigh.predict([[1.5]]))\n [0.5]\n ", + "source_code": "\n\nclass KNeighborsRegressor(KNeighborsMixin, RegressorMixin, NeighborsBase):\n \"\"\"Regression based on k-nearest neighbors.\n\n The target is predicted by local interpolation of the targets\n associated of the nearest neighbors in the training set.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.9\n\n Parameters\n ----------\n n_neighbors : int, default=5\n Number of neighbors to use by default for :meth:`kneighbors` queries.\n\n weights : {'uniform', 'distance'} or callable, default='uniform'\n Weight function used in prediction. Possible values:\n\n - 'uniform' : uniform weights. All points in each neighborhood\n are weighted equally.\n - 'distance' : weight points by the inverse of their distance.\n in this case, closer neighbors of a query point will have a\n greater influence than neighbors which are further away.\n - [callable] : a user-defined function which accepts an\n array of distances, and returns an array of the same shape\n containing the weights.\n\n Uniform weights are used by default.\n\n algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto'\n Algorithm used to compute the nearest neighbors:\n\n - 'ball_tree' will use :class:`BallTree`\n - 'kd_tree' will use :class:`KDTree`\n - 'brute' will use a brute-force search.\n - 'auto' will attempt to decide the most appropriate algorithm\n based on the values passed to :meth:`fit` method.\n\n Note: fitting on sparse input will override the setting of\n this parameter, using brute force.\n\n leaf_size : int, default=30\n Leaf size passed to BallTree or KDTree. This can affect the\n speed of the construction and query, as well as the memory\n required to store the tree. The optimal value depends on the\n nature of the problem.\n\n p : int, default=2\n Power parameter for the Minkowski metric. When p = 1, this is\n equivalent to using manhattan_distance (l1), and euclidean_distance\n (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.\n\n metric : str or callable, default='minkowski'\n The distance metric to use for the tree. The default metric is\n minkowski, and with p=2 is equivalent to the standard Euclidean\n metric. 
See the documentation of :class:`DistanceMetric` for a\n list of available metrics.\n If metric is \"precomputed\", X is assumed to be a distance matrix and\n must be square during fit. X may be a :term:`sparse graph`,\n in which case only \"nonzero\" elements may be considered neighbors.\n\n metric_params : dict, default=None\n Additional keyword arguments for the metric function.\n\n n_jobs : int, default=None\n The number of parallel jobs to run for neighbors search.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n Doesn't affect :meth:`fit` method.\n\n Attributes\n ----------\n effective_metric_ : str or callable\n The distance metric to use. It will be same as the `metric` parameter\n or a synonym of it, e.g. 'euclidean' if the `metric` parameter set to\n 'minkowski' and `p` parameter set to 2.\n\n effective_metric_params_ : dict\n Additional keyword arguments for the metric function. For most metrics\n will be same with `metric_params` parameter, but may also contain the\n `p` parameter value if the `effective_metric_` attribute is set to\n 'minkowski'.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n n_samples_fit_ : int\n Number of samples in the fitted data.\n\n See Also\n --------\n NearestNeighbors : Unsupervised learner for implementing neighbor searches.\n RadiusNeighborsRegressor : Regression based on neighbors within a fixed radius.\n KNeighborsClassifier : Classifier implementing the k-nearest neighbors vote.\n RadiusNeighborsClassifier : Classifier implementing\n a vote among neighbors within a given radius.\n\n Notes\n -----\n See :ref:`Nearest Neighbors ` in the online documentation\n for a discussion of the choice of ``algorithm`` and ``leaf_size``.\n\n .. 
warning::\n\n Regarding the Nearest Neighbors algorithms, if it is found that two\n neighbors, neighbor `k+1` and `k`, have identical distances but\n different labels, the results will depend on the ordering of the\n training data.\n\n https://en.wikipedia.org/wiki/K-nearest_neighbors_algorithm\n\n Examples\n --------\n >>> X = [[0], [1], [2], [3]]\n >>> y = [0, 0, 1, 1]\n >>> from sklearn.neighbors import KNeighborsRegressor\n >>> neigh = KNeighborsRegressor(n_neighbors=2)\n >>> neigh.fit(X, y)\n KNeighborsRegressor(...)\n >>> print(neigh.predict([[1.5]]))\n [0.5]\n \"\"\"\n \n def __init__(self, n_neighbors=5, *, weights='uniform', algorithm='auto', leaf_size=30, p=2, metric='minkowski', metric_params=None, n_jobs=None):\n super().__init__(n_neighbors=n_neighbors, algorithm=algorithm, leaf_size=leaf_size, metric=metric, p=p, metric_params=metric_params, n_jobs=n_jobs)\n self.weights = weights\n \n def _more_tags(self):\n return {'pairwise': self.metric == 'precomputed'}\n \n @deprecated('Attribute `_pairwise` was deprecated in version 0.24 and will be removed in 1.1 (renaming of 0.26).')\n @property\n def _pairwise(self):\n return self.metric == 'precomputed'\n \n def fit(self, X, y):\n \"\"\"Fit the k-nearest neighbors regressor from the training dataset.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features) or (n_samples, n_samples) if metric='precomputed'\n Training data.\n\n y : {array-like, sparse matrix} of shape (n_samples,) or (n_samples, n_outputs)\n Target values.\n\n Returns\n -------\n self : KNeighborsRegressor\n The fitted k-nearest neighbors regressor.\n \"\"\"\n self.weights = _check_weights(self.weights)\n return self._fit(X, y)\n \n def predict(self, X):\n \"\"\"Predict the target for the provided data.\n\n Parameters\n ----------\n X : array-like of shape (n_queries, n_features), or (n_queries, n_indexed) if metric == 'precomputed'\n Test samples.\n\n Returns\n -------\n y : ndarray of shape (n_queries,) or (n_queries, n_outputs), dtype=int\n Target values.\n \"\"\"\n (neigh_dist, neigh_ind) = self.kneighbors(X)\n weights = _get_weights(neigh_dist, self.weights)\n _y = self._y\n if _y.ndim == 1:\n _y = _y.reshape((-1, 1))\n if weights is None:\n y_pred = np.mean(_y[neigh_ind], axis=1)\n else:\n y_pred = np.empty((X.shape[0], _y.shape[1]), dtype=np.float64)\n denom = np.sum(weights, axis=1)\n for j in range(_y.shape[1]):\n num = np.sum(_y[neigh_ind, j] * weights, axis=1)\n y_pred[:, j] = num / denom\n if self._y.ndim == 1:\n y_pred = y_pred.ravel()\n return y_pred\n" }, { "name": "RadiusNeighborsRegressor", @@ -25570,7 +25666,7 @@ "sklearn.neighbors._regression.RadiusNeighborsRegressor.predict" ], "is_public": true, - "description": "Regression based on neighbors within a fixed radius.\n\nThe target is predicted by local interpolation of the targets associated of the nearest neighbors in the training set. Read more in the :ref:`User Guide `. .. versionadded:: 0.9", + "description": "Regression based on neighbors within a fixed radius.\n\nThe target is predicted by local interpolation of the targets\nassociated of the nearest neighbors in the training set.\n\nRead more in the :ref:`User Guide `.\n\n.. versionadded:: 0.9", "docstring": "Regression based on neighbors within a fixed radius.\n\n The target is predicted by local interpolation of the targets\n associated of the nearest neighbors in the training set.\n\n Read more in the :ref:`User Guide `.\n\n .. 
versionadded:: 0.9\n\n Parameters\n ----------\n radius : float, default=1.0\n Range of parameter space to use by default for :meth:`radius_neighbors`\n queries.\n\n weights : {'uniform', 'distance'} or callable, default='uniform'\n Weight function used in prediction. Possible values:\n\n - 'uniform' : uniform weights. All points in each neighborhood\n are weighted equally.\n - 'distance' : weight points by the inverse of their distance.\n in this case, closer neighbors of a query point will have a\n greater influence than neighbors which are further away.\n - [callable] : a user-defined function which accepts an\n array of distances, and returns an array of the same shape\n containing the weights.\n\n Uniform weights are used by default.\n\n algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto'\n Algorithm used to compute the nearest neighbors:\n\n - 'ball_tree' will use :class:`BallTree`\n - 'kd_tree' will use :class:`KDTree`\n - 'brute' will use a brute-force search.\n - 'auto' will attempt to decide the most appropriate algorithm\n based on the values passed to :meth:`fit` method.\n\n Note: fitting on sparse input will override the setting of\n this parameter, using brute force.\n\n leaf_size : int, default=30\n Leaf size passed to BallTree or KDTree. This can affect the\n speed of the construction and query, as well as the memory\n required to store the tree. The optimal value depends on the\n nature of the problem.\n\n p : int, default=2\n Power parameter for the Minkowski metric. When p = 1, this is\n equivalent to using manhattan_distance (l1), and euclidean_distance\n (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.\n\n metric : str or callable, default='minkowski'\n The distance metric to use for the tree. The default metric is\n minkowski, and with p=2 is equivalent to the standard Euclidean\n metric. See the documentation of :class:`DistanceMetric` for a\n list of available metrics.\n If metric is \"precomputed\", X is assumed to be a distance matrix and\n must be square during fit. X may be a :term:`sparse graph`,\n in which case only \"nonzero\" elements may be considered neighbors.\n\n metric_params : dict, default=None\n Additional keyword arguments for the metric function.\n\n n_jobs : int, default=None\n The number of parallel jobs to run for neighbors search.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n Attributes\n ----------\n effective_metric_ : str or callable\n The distance metric to use. It will be same as the `metric` parameter\n or a synonym of it, e.g. 'euclidean' if the `metric` parameter set to\n 'minkowski' and `p` parameter set to 2.\n\n effective_metric_params_ : dict\n Additional keyword arguments for the metric function. For most metrics\n will be same with `metric_params` parameter, but may also contain the\n `p` parameter value if the `effective_metric_` attribute is set to\n 'minkowski'.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. 
versionadded:: 1.0\n\n n_samples_fit_ : int\n Number of samples in the fitted data.\n\n See Also\n --------\n NearestNeighbors : Regression based on nearest neighbors.\n KNeighborsRegressor : Regression based on k-nearest neighbors.\n KNeighborsClassifier : Classifier based on the k-nearest neighbors.\n RadiusNeighborsClassifier : Classifier based on neighbors within a given radius.\n\n Notes\n -----\n See :ref:`Nearest Neighbors ` in the online documentation\n for a discussion of the choice of ``algorithm`` and ``leaf_size``.\n\n https://en.wikipedia.org/wiki/K-nearest_neighbor_algorithm\n\n Examples\n --------\n >>> X = [[0], [1], [2], [3]]\n >>> y = [0, 0, 1, 1]\n >>> from sklearn.neighbors import RadiusNeighborsRegressor\n >>> neigh = RadiusNeighborsRegressor(radius=1.0)\n >>> neigh.fit(X, y)\n RadiusNeighborsRegressor(...)\n >>> print(neigh.predict([[1.5]]))\n [0.5]\n ", "source_code": "\n\nclass RadiusNeighborsRegressor(RadiusNeighborsMixin, RegressorMixin, NeighborsBase):\n \"\"\"Regression based on neighbors within a fixed radius.\n\n The target is predicted by local interpolation of the targets\n associated of the nearest neighbors in the training set.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.9\n\n Parameters\n ----------\n radius : float, default=1.0\n Range of parameter space to use by default for :meth:`radius_neighbors`\n queries.\n\n weights : {'uniform', 'distance'} or callable, default='uniform'\n Weight function used in prediction. Possible values:\n\n - 'uniform' : uniform weights. All points in each neighborhood\n are weighted equally.\n - 'distance' : weight points by the inverse of their distance.\n in this case, closer neighbors of a query point will have a\n greater influence than neighbors which are further away.\n - [callable] : a user-defined function which accepts an\n array of distances, and returns an array of the same shape\n containing the weights.\n\n Uniform weights are used by default.\n\n algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto'\n Algorithm used to compute the nearest neighbors:\n\n - 'ball_tree' will use :class:`BallTree`\n - 'kd_tree' will use :class:`KDTree`\n - 'brute' will use a brute-force search.\n - 'auto' will attempt to decide the most appropriate algorithm\n based on the values passed to :meth:`fit` method.\n\n Note: fitting on sparse input will override the setting of\n this parameter, using brute force.\n\n leaf_size : int, default=30\n Leaf size passed to BallTree or KDTree. This can affect the\n speed of the construction and query, as well as the memory\n required to store the tree. The optimal value depends on the\n nature of the problem.\n\n p : int, default=2\n Power parameter for the Minkowski metric. When p = 1, this is\n equivalent to using manhattan_distance (l1), and euclidean_distance\n (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.\n\n metric : str or callable, default='minkowski'\n The distance metric to use for the tree. The default metric is\n minkowski, and with p=2 is equivalent to the standard Euclidean\n metric. See the documentation of :class:`DistanceMetric` for a\n list of available metrics.\n If metric is \"precomputed\", X is assumed to be a distance matrix and\n must be square during fit. 
X may be a :term:`sparse graph`,\n in which case only \"nonzero\" elements may be considered neighbors.\n\n metric_params : dict, default=None\n Additional keyword arguments for the metric function.\n\n n_jobs : int, default=None\n The number of parallel jobs to run for neighbors search.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n Attributes\n ----------\n effective_metric_ : str or callable\n The distance metric to use. It will be same as the `metric` parameter\n or a synonym of it, e.g. 'euclidean' if the `metric` parameter set to\n 'minkowski' and `p` parameter set to 2.\n\n effective_metric_params_ : dict\n Additional keyword arguments for the metric function. For most metrics\n will be same with `metric_params` parameter, but may also contain the\n `p` parameter value if the `effective_metric_` attribute is set to\n 'minkowski'.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n n_samples_fit_ : int\n Number of samples in the fitted data.\n\n See Also\n --------\n NearestNeighbors : Regression based on nearest neighbors.\n KNeighborsRegressor : Regression based on k-nearest neighbors.\n KNeighborsClassifier : Classifier based on the k-nearest neighbors.\n RadiusNeighborsClassifier : Classifier based on neighbors within a given radius.\n\n Notes\n -----\n See :ref:`Nearest Neighbors ` in the online documentation\n for a discussion of the choice of ``algorithm`` and ``leaf_size``.\n\n https://en.wikipedia.org/wiki/K-nearest_neighbor_algorithm\n\n Examples\n --------\n >>> X = [[0], [1], [2], [3]]\n >>> y = [0, 0, 1, 1]\n >>> from sklearn.neighbors import RadiusNeighborsRegressor\n >>> neigh = RadiusNeighborsRegressor(radius=1.0)\n >>> neigh.fit(X, y)\n RadiusNeighborsRegressor(...)\n >>> print(neigh.predict([[1.5]]))\n [0.5]\n \"\"\"\n \n def __init__(self, radius=1.0, *, weights='uniform', algorithm='auto', leaf_size=30, p=2, metric='minkowski', metric_params=None, n_jobs=None):\n super().__init__(radius=radius, algorithm=algorithm, leaf_size=leaf_size, p=p, metric=metric, metric_params=metric_params, n_jobs=n_jobs)\n self.weights = weights\n \n def fit(self, X, y):\n \"\"\"Fit the radius neighbors regressor from the training dataset.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features) or (n_samples, n_samples) if metric='precomputed'\n Training data.\n\n y : {array-like, sparse matrix} of shape (n_samples,) or (n_samples, n_outputs)\n Target values.\n\n Returns\n -------\n self : RadiusNeighborsRegressor\n The fitted radius neighbors regressor.\n \"\"\"\n self.weights = _check_weights(self.weights)\n return self._fit(X, y)\n \n def predict(self, X):\n \"\"\"Predict the target for the provided data.\n\n Parameters\n ----------\n X : array-like of shape (n_queries, n_features), or (n_queries, n_indexed) if metric == 'precomputed'\n Test samples.\n\n Returns\n -------\n y : ndarray of shape (n_queries,) or (n_queries, n_outputs), dtype=double\n Target values.\n \"\"\"\n (neigh_dist, neigh_ind) = self.radius_neighbors(X)\n weights = _get_weights(neigh_dist, self.weights)\n _y = self._y\n if _y.ndim == 1:\n _y = _y.reshape((-1, 1))\n empty_obs = np.full_like(_y[0], np.nan)\n if weights is None:\n 
y_pred = np.array([np.mean(_y[ind, :], axis=0) if len(ind) else empty_obs for (i, ind) in enumerate(neigh_ind)])\n else:\n y_pred = np.array([np.average(_y[ind, :], axis=0, weights=weights[i]) if len(ind) else empty_obs for (i, ind) in enumerate(neigh_ind)])\n if np.any(np.isnan(y_pred)):\n empty_warning_msg = 'One or more samples have no neighbors within specified radius; predicting NaN.'\n warnings.warn(empty_warning_msg)\n if self._y.ndim == 1:\n y_pred = y_pred.ravel()\n return y_pred\n" }, @@ -25588,9 +25684,9 @@ "sklearn.neighbors._unsupervised.NearestNeighbors.fit" ], "is_public": true, - "description": "Unsupervised learner for implementing neighbor searches.\n\nRead more in the :ref:`User Guide `. .. versionadded:: 0.9", - "docstring": "Unsupervised learner for implementing neighbor searches.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.9\n\n Parameters\n ----------\n n_neighbors : int, default=5\n Number of neighbors to use by default for :meth:`kneighbors` queries.\n\n radius : float, default=1.0\n Range of parameter space to use by default for :meth:`radius_neighbors`\n queries.\n\n algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto'\n Algorithm used to compute the nearest neighbors:\n\n - 'ball_tree' will use :class:`BallTree`\n - 'kd_tree' will use :class:`KDTree`\n - 'brute' will use a brute-force search.\n - 'auto' will attempt to decide the most appropriate algorithm\n based on the values passed to :meth:`fit` method.\n\n Note: fitting on sparse input will override the setting of\n this parameter, using brute force.\n\n leaf_size : int, default=30\n Leaf size passed to BallTree or KDTree. This can affect the\n speed of the construction and query, as well as the memory\n required to store the tree. The optimal value depends on the\n nature of the problem.\n\n metric : str or callable, default='minkowski'\n The distance metric to use for the tree. The default metric is\n minkowski, and with p=2 is equivalent to the standard Euclidean\n metric. See the documentation of :class:`DistanceMetric` for a\n list of available metrics.\n If metric is \"precomputed\", X is assumed to be a distance matrix and\n must be square during fit. X may be a :term:`sparse graph`,\n in which case only \"nonzero\" elements may be considered neighbors.\n\n p : int, default=2\n Parameter for the Minkowski metric from\n sklearn.metrics.pairwise.pairwise_distances. When p = 1, this is\n equivalent to using manhattan_distance (l1), and euclidean_distance\n (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.\n\n metric_params : dict, default=None\n Additional keyword arguments for the metric function.\n\n n_jobs : int, default=None\n The number of parallel jobs to run for neighbors search.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n Attributes\n ----------\n effective_metric_ : str\n Metric used to compute distances to neighbors.\n\n effective_metric_params_ : dict\n Parameters for the metric used to compute distances to neighbors.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. 
versionadded:: 1.0\n\n n_samples_fit_ : int\n Number of samples in the fitted data.\n\n See Also\n --------\n KNeighborsClassifier : Classifier implementing the k-nearest neighbors\n vote.\n RadiusNeighborsClassifier : Classifier implementing a vote among neighbors\n within a given radius.\n KNeighborsRegressor : Regression based on k-nearest neighbors.\n RadiusNeighborsRegressor : Regression based on neighbors within a fixed\n radius.\n BallTree : Space partitioning data structure for organizing points in a\n multi-dimensional space, used for nearest neighbor search.\n\n Notes\n -----\n See :ref:`Nearest Neighbors ` in the online documentation\n for a discussion of the choice of ``algorithm`` and ``leaf_size``.\n\n https://en.wikipedia.org/wiki/K-nearest_neighbors_algorithm\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.neighbors import NearestNeighbors\n >>> samples = [[0, 0, 2], [1, 0, 0], [0, 0, 1]]\n\n >>> neigh = NearestNeighbors(n_neighbors=2, radius=0.4)\n >>> neigh.fit(samples)\n NearestNeighbors(...)\n\n >>> neigh.kneighbors([[0, 0, 1.3]], 2, return_distance=False)\n array([[2, 0]]...)\n\n >>> nbrs = neigh.radius_neighbors(\n ... [[0, 0, 1.3]], 0.4, return_distance=False\n ... )\n >>> np.asarray(nbrs[0][0])\n array(2)\n ", - "source_code": "\n\nclass NearestNeighbors(KNeighborsMixin, RadiusNeighborsMixin, NeighborsBase):\n \"\"\"Unsupervised learner for implementing neighbor searches.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.9\n\n Parameters\n ----------\n n_neighbors : int, default=5\n Number of neighbors to use by default for :meth:`kneighbors` queries.\n\n radius : float, default=1.0\n Range of parameter space to use by default for :meth:`radius_neighbors`\n queries.\n\n algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto'\n Algorithm used to compute the nearest neighbors:\n\n - 'ball_tree' will use :class:`BallTree`\n - 'kd_tree' will use :class:`KDTree`\n - 'brute' will use a brute-force search.\n - 'auto' will attempt to decide the most appropriate algorithm\n based on the values passed to :meth:`fit` method.\n\n Note: fitting on sparse input will override the setting of\n this parameter, using brute force.\n\n leaf_size : int, default=30\n Leaf size passed to BallTree or KDTree. This can affect the\n speed of the construction and query, as well as the memory\n required to store the tree. The optimal value depends on the\n nature of the problem.\n\n metric : str or callable, default='minkowski'\n The distance metric to use for the tree. The default metric is\n minkowski, and with p=2 is equivalent to the standard Euclidean\n metric. See the documentation of :class:`DistanceMetric` for a\n list of available metrics.\n If metric is \"precomputed\", X is assumed to be a distance matrix and\n must be square during fit. X may be a :term:`sparse graph`,\n in which case only \"nonzero\" elements may be considered neighbors.\n\n p : int, default=2\n Parameter for the Minkowski metric from\n sklearn.metrics.pairwise.pairwise_distances. When p = 1, this is\n equivalent to using manhattan_distance (l1), and euclidean_distance\n (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.\n\n metric_params : dict, default=None\n Additional keyword arguments for the metric function.\n\n n_jobs : int, default=None\n The number of parallel jobs to run for neighbors search.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. 
See :term:`Glossary `\n for more details.\n\n Attributes\n ----------\n effective_metric_ : str\n Metric used to compute distances to neighbors.\n\n effective_metric_params_ : dict\n Parameters for the metric used to compute distances to neighbors.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n n_samples_fit_ : int\n Number of samples in the fitted data.\n\n See Also\n --------\n KNeighborsClassifier : Classifier implementing the k-nearest neighbors\n vote.\n RadiusNeighborsClassifier : Classifier implementing a vote among neighbors\n within a given radius.\n KNeighborsRegressor : Regression based on k-nearest neighbors.\n RadiusNeighborsRegressor : Regression based on neighbors within a fixed\n radius.\n BallTree : Space partitioning data structure for organizing points in a\n multi-dimensional space, used for nearest neighbor search.\n\n Notes\n -----\n See :ref:`Nearest Neighbors ` in the online documentation\n for a discussion of the choice of ``algorithm`` and ``leaf_size``.\n\n https://en.wikipedia.org/wiki/K-nearest_neighbors_algorithm\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.neighbors import NearestNeighbors\n >>> samples = [[0, 0, 2], [1, 0, 0], [0, 0, 1]]\n\n >>> neigh = NearestNeighbors(n_neighbors=2, radius=0.4)\n >>> neigh.fit(samples)\n NearestNeighbors(...)\n\n >>> neigh.kneighbors([[0, 0, 1.3]], 2, return_distance=False)\n array([[2, 0]]...)\n\n >>> nbrs = neigh.radius_neighbors(\n ... [[0, 0, 1.3]], 0.4, return_distance=False\n ... )\n >>> np.asarray(nbrs[0][0])\n array(2)\n \"\"\"\n \n def __init__(self, *, n_neighbors=5, radius=1.0, algorithm='auto', leaf_size=30, metric='minkowski', p=2, metric_params=None, n_jobs=None):\n super().__init__(n_neighbors=n_neighbors, radius=radius, algorithm=algorithm, leaf_size=leaf_size, metric=metric, p=p, metric_params=metric_params, n_jobs=n_jobs)\n \n def fit(self, X, y=None):\n \"\"\"Fit the nearest neighbors estimator from the training dataset.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features) or (n_samples, n_samples) if metric='precomputed'\n Training data.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n Returns\n -------\n self : NearestNeighbors\n The fitted nearest neighbors estimator.\n \"\"\"\n return self._fit(X)\n" + "description": "Unsupervised learner for implementing neighbor searches.\n\nRead more in the :ref:`User Guide `.\n\n.. versionadded:: 0.9", + "docstring": "Unsupervised learner for implementing neighbor searches.\n\n Read more in the :ref:`User Guide `.\n\n .. 
versionadded:: 0.9\n\n Parameters\n ----------\n n_neighbors : int, default=5\n Number of neighbors to use by default for :meth:`kneighbors` queries.\n\n radius : float, default=1.0\n Range of parameter space to use by default for :meth:`radius_neighbors`\n queries.\n\n algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto'\n Algorithm used to compute the nearest neighbors:\n\n - 'ball_tree' will use :class:`BallTree`\n - 'kd_tree' will use :class:`KDTree`\n - 'brute' will use a brute-force search.\n - 'auto' will attempt to decide the most appropriate algorithm\n based on the values passed to :meth:`fit` method.\n\n Note: fitting on sparse input will override the setting of\n this parameter, using brute force.\n\n leaf_size : int, default=30\n Leaf size passed to BallTree or KDTree. This can affect the\n speed of the construction and query, as well as the memory\n required to store the tree. The optimal value depends on the\n nature of the problem.\n\n metric : str or callable, default='minkowski'\n The distance metric to use for the tree. The default metric is\n minkowski, and with p=2 is equivalent to the standard Euclidean\n metric. For a list of available metrics, see the documentation of\n :class:`~sklearn.metrics.DistanceMetric`.\n If metric is \"precomputed\", X is assumed to be a distance matrix and\n must be square during fit. X may be a :term:`sparse graph`,\n in which case only \"nonzero\" elements may be considered neighbors.\n\n p : int, default=2\n Parameter for the Minkowski metric from\n sklearn.metrics.pairwise.pairwise_distances. When p = 1, this is\n equivalent to using manhattan_distance (l1), and euclidean_distance\n (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.\n\n metric_params : dict, default=None\n Additional keyword arguments for the metric function.\n\n n_jobs : int, default=None\n The number of parallel jobs to run for neighbors search.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n Attributes\n ----------\n effective_metric_ : str\n Metric used to compute distances to neighbors.\n\n effective_metric_params_ : dict\n Parameters for the metric used to compute distances to neighbors.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. 
versionadded:: 1.0\n\n n_samples_fit_ : int\n Number of samples in the fitted data.\n\n See Also\n --------\n KNeighborsClassifier : Classifier implementing the k-nearest neighbors\n vote.\n RadiusNeighborsClassifier : Classifier implementing a vote among neighbors\n within a given radius.\n KNeighborsRegressor : Regression based on k-nearest neighbors.\n RadiusNeighborsRegressor : Regression based on neighbors within a fixed\n radius.\n BallTree : Space partitioning data structure for organizing points in a\n multi-dimensional space, used for nearest neighbor search.\n\n Notes\n -----\n See :ref:`Nearest Neighbors ` in the online documentation\n for a discussion of the choice of ``algorithm`` and ``leaf_size``.\n\n https://en.wikipedia.org/wiki/K-nearest_neighbors_algorithm\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.neighbors import NearestNeighbors\n >>> samples = [[0, 0, 2], [1, 0, 0], [0, 0, 1]]\n\n >>> neigh = NearestNeighbors(n_neighbors=2, radius=0.4)\n >>> neigh.fit(samples)\n NearestNeighbors(...)\n\n >>> neigh.kneighbors([[0, 0, 1.3]], 2, return_distance=False)\n array([[2, 0]]...)\n\n >>> nbrs = neigh.radius_neighbors(\n ... [[0, 0, 1.3]], 0.4, return_distance=False\n ... )\n >>> np.asarray(nbrs[0][0])\n array(2)\n ", + "source_code": "\n\nclass NearestNeighbors(KNeighborsMixin, RadiusNeighborsMixin, NeighborsBase):\n \"\"\"Unsupervised learner for implementing neighbor searches.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.9\n\n Parameters\n ----------\n n_neighbors : int, default=5\n Number of neighbors to use by default for :meth:`kneighbors` queries.\n\n radius : float, default=1.0\n Range of parameter space to use by default for :meth:`radius_neighbors`\n queries.\n\n algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto'\n Algorithm used to compute the nearest neighbors:\n\n - 'ball_tree' will use :class:`BallTree`\n - 'kd_tree' will use :class:`KDTree`\n - 'brute' will use a brute-force search.\n - 'auto' will attempt to decide the most appropriate algorithm\n based on the values passed to :meth:`fit` method.\n\n Note: fitting on sparse input will override the setting of\n this parameter, using brute force.\n\n leaf_size : int, default=30\n Leaf size passed to BallTree or KDTree. This can affect the\n speed of the construction and query, as well as the memory\n required to store the tree. The optimal value depends on the\n nature of the problem.\n\n metric : str or callable, default='minkowski'\n The distance metric to use for the tree. The default metric is\n minkowski, and with p=2 is equivalent to the standard Euclidean\n metric. For a list of available metrics, see the documentation of\n :class:`~sklearn.metrics.DistanceMetric`.\n If metric is \"precomputed\", X is assumed to be a distance matrix and\n must be square during fit. X may be a :term:`sparse graph`,\n in which case only \"nonzero\" elements may be considered neighbors.\n\n p : int, default=2\n Parameter for the Minkowski metric from\n sklearn.metrics.pairwise.pairwise_distances. When p = 1, this is\n equivalent to using manhattan_distance (l1), and euclidean_distance\n (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.\n\n metric_params : dict, default=None\n Additional keyword arguments for the metric function.\n\n n_jobs : int, default=None\n The number of parallel jobs to run for neighbors search.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. 
See :term:`Glossary `\n for more details.\n\n Attributes\n ----------\n effective_metric_ : str\n Metric used to compute distances to neighbors.\n\n effective_metric_params_ : dict\n Parameters for the metric used to compute distances to neighbors.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n n_samples_fit_ : int\n Number of samples in the fitted data.\n\n See Also\n --------\n KNeighborsClassifier : Classifier implementing the k-nearest neighbors\n vote.\n RadiusNeighborsClassifier : Classifier implementing a vote among neighbors\n within a given radius.\n KNeighborsRegressor : Regression based on k-nearest neighbors.\n RadiusNeighborsRegressor : Regression based on neighbors within a fixed\n radius.\n BallTree : Space partitioning data structure for organizing points in a\n multi-dimensional space, used for nearest neighbor search.\n\n Notes\n -----\n See :ref:`Nearest Neighbors ` in the online documentation\n for a discussion of the choice of ``algorithm`` and ``leaf_size``.\n\n https://en.wikipedia.org/wiki/K-nearest_neighbors_algorithm\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.neighbors import NearestNeighbors\n >>> samples = [[0, 0, 2], [1, 0, 0], [0, 0, 1]]\n\n >>> neigh = NearestNeighbors(n_neighbors=2, radius=0.4)\n >>> neigh.fit(samples)\n NearestNeighbors(...)\n\n >>> neigh.kneighbors([[0, 0, 1.3]], 2, return_distance=False)\n array([[2, 0]]...)\n\n >>> nbrs = neigh.radius_neighbors(\n ... [[0, 0, 1.3]], 0.4, return_distance=False\n ... )\n >>> np.asarray(nbrs[0][0])\n array(2)\n \"\"\"\n \n def __init__(self, *, n_neighbors=5, radius=1.0, algorithm='auto', leaf_size=30, metric='minkowski', p=2, metric_params=None, n_jobs=None):\n super().__init__(n_neighbors=n_neighbors, radius=radius, algorithm=algorithm, leaf_size=leaf_size, metric=metric, p=p, metric_params=metric_params, n_jobs=n_jobs)\n \n def fit(self, X, y=None):\n \"\"\"Fit the nearest neighbors estimator from the training dataset.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features) or (n_samples, n_samples) if metric='precomputed'\n Training data.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n Returns\n -------\n self : NearestNeighbors\n The fitted nearest neighbors estimator.\n \"\"\"\n return self._fit(X)\n" }, { "name": "BaseMultilayerPerceptron", @@ -25617,7 +25713,7 @@ "sklearn.neural_network._multilayer_perceptron.BaseMultilayerPerceptron.partial_fit" ], "is_public": false, - "description": "Base class for MLP classification and regression.\n\nWarning: This class should not be used directly. Use derived classes instead. .. versionadded:: 0.18", + "description": "Base class for MLP classification and regression.\n\nWarning: This class should not be used directly.\nUse derived classes instead.\n\n.. versionadded:: 0.18", "docstring": "Base class for MLP classification and regression.\n\n Warning: This class should not be used directly.\n Use derived classes instead.\n\n .. versionadded:: 0.18\n ", "source_code": "\n\nclass BaseMultilayerPerceptron(BaseEstimator, metaclass=ABCMeta):\n \"\"\"Base class for MLP classification and regression.\n\n Warning: This class should not be used directly.\n Use derived classes instead.\n\n .. 
versionadded:: 0.18\n \"\"\"\n \n @abstractmethod\n def __init__(self, hidden_layer_sizes, activation, solver, alpha, batch_size, learning_rate, learning_rate_init, power_t, max_iter, loss, shuffle, random_state, tol, verbose, warm_start, momentum, nesterovs_momentum, early_stopping, validation_fraction, beta_1, beta_2, epsilon, n_iter_no_change, max_fun):\n self.activation = activation\n self.solver = solver\n self.alpha = alpha\n self.batch_size = batch_size\n self.learning_rate = learning_rate\n self.learning_rate_init = learning_rate_init\n self.power_t = power_t\n self.max_iter = max_iter\n self.loss = loss\n self.hidden_layer_sizes = hidden_layer_sizes\n self.shuffle = shuffle\n self.random_state = random_state\n self.tol = tol\n self.verbose = verbose\n self.warm_start = warm_start\n self.momentum = momentum\n self.nesterovs_momentum = nesterovs_momentum\n self.early_stopping = early_stopping\n self.validation_fraction = validation_fraction\n self.beta_1 = beta_1\n self.beta_2 = beta_2\n self.epsilon = epsilon\n self.n_iter_no_change = n_iter_no_change\n self.max_fun = max_fun\n \n def _unpack(self, packed_parameters):\n \"\"\"Extract the coefficients and intercepts from packed_parameters.\"\"\"\n for i in range(self.n_layers_ - 1):\n (start, end, shape) = self._coef_indptr[i]\n self.coefs_[i] = np.reshape(packed_parameters[start:end], shape)\n (start, end) = self._intercept_indptr[i]\n self.intercepts_[i] = packed_parameters[start:end]\n \n def _forward_pass(self, activations):\n \"\"\"Perform a forward pass on the network by computing the values\n of the neurons in the hidden layers and the output layer.\n\n Parameters\n ----------\n activations : list, length = n_layers - 1\n The ith element of the list holds the values of the ith layer.\n \"\"\"\n hidden_activation = ACTIVATIONS[self.activation]\n for i in range(self.n_layers_ - 1):\n activations[i + 1] = safe_sparse_dot(activations[i], self.coefs_[i])\n activations[i + 1] += self.intercepts_[i]\n if i + 1 != self.n_layers_ - 1:\n hidden_activation(activations[i + 1])\n output_activation = ACTIVATIONS[self.out_activation_]\n output_activation(activations[i + 1])\n return activations\n \n def _forward_pass_fast(self, X):\n \"\"\"Predict using the trained model\n\n This is the same as _forward_pass but does not record the activations\n of all layers and only returns the last layer's activation.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input data.\n\n Returns\n -------\n y_pred : ndarray of shape (n_samples,) or (n_samples, n_outputs)\n The decision function of the samples for each class in the model.\n \"\"\"\n X = self._validate_data(X, accept_sparse=['csr', 'csc'], reset=False)\n activation = X\n hidden_activation = ACTIVATIONS[self.activation]\n for i in range(self.n_layers_ - 1):\n activation = safe_sparse_dot(activation, self.coefs_[i])\n activation += self.intercepts_[i]\n if i != self.n_layers_ - 2:\n hidden_activation(activation)\n output_activation = ACTIVATIONS[self.out_activation_]\n output_activation(activation)\n return activation\n \n def _compute_loss_grad(self, layer, n_samples, activations, deltas, coef_grads, intercept_grads):\n \"\"\"Compute the gradient of loss with respect to coefs and intercept for\n specified layer.\n\n This function does backpropagation for the specified one layer.\n \"\"\"\n coef_grads[layer] = safe_sparse_dot(activations[layer].T, deltas[layer])\n coef_grads[layer] += self.alpha * self.coefs_[layer]\n coef_grads[layer] /= 
n_samples\n intercept_grads[layer] = np.mean(deltas[layer], 0)\n \n def _loss_grad_lbfgs(self, packed_coef_inter, X, y, activations, deltas, coef_grads, intercept_grads):\n \"\"\"Compute the MLP loss function and its corresponding derivatives\n with respect to the different parameters given in the initialization.\n\n Returned gradients are packed in a single vector so it can be used\n in lbfgs\n\n Parameters\n ----------\n packed_coef_inter : ndarray\n A vector comprising the flattened coefficients and intercepts.\n\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input data.\n\n y : ndarray of shape (n_samples,)\n The target values.\n\n activations : list, length = n_layers - 1\n The ith element of the list holds the values of the ith layer.\n\n deltas : list, length = n_layers - 1\n The ith element of the list holds the difference between the\n activations of the i + 1 layer and the backpropagated error.\n More specifically, deltas are gradients of loss with respect to z\n in each layer, where z = wx + b is the value of a particular layer\n before passing through the activation function\n\n coef_grads : list, length = n_layers - 1\n The ith element contains the amount of change used to update the\n coefficient parameters of the ith layer in an iteration.\n\n intercept_grads : list, length = n_layers - 1\n The ith element contains the amount of change used to update the\n intercept parameters of the ith layer in an iteration.\n\n Returns\n -------\n loss : float\n grad : array-like, shape (number of nodes of all layers,)\n \"\"\"\n self._unpack(packed_coef_inter)\n (loss, coef_grads, intercept_grads) = self._backprop(X, y, activations, deltas, coef_grads, intercept_grads)\n grad = _pack(coef_grads, intercept_grads)\n return loss, grad\n \n def _backprop(self, X, y, activations, deltas, coef_grads, intercept_grads):\n \"\"\"Compute the MLP loss function and its corresponding derivatives\n with respect to each parameter: weights and bias vectors.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input data.\n\n y : ndarray of shape (n_samples,)\n The target values.\n\n activations : list, length = n_layers - 1\n The ith element of the list holds the values of the ith layer.\n\n deltas : list, length = n_layers - 1\n The ith element of the list holds the difference between the\n activations of the i + 1 layer and the backpropagated error.\n More specifically, deltas are gradients of loss with respect to z\n in each layer, where z = wx + b is the value of a particular layer\n before passing through the activation function\n\n coef_grads : list, length = n_layers - 1\n The ith element contains the amount of change used to update the\n coefficient parameters of the ith layer in an iteration.\n\n intercept_grads : list, length = n_layers - 1\n The ith element contains the amount of change used to update the\n intercept parameters of the ith layer in an iteration.\n\n Returns\n -------\n loss : float\n coef_grads : list, length = n_layers - 1\n intercept_grads : list, length = n_layers - 1\n \"\"\"\n n_samples = X.shape[0]\n activations = self._forward_pass(activations)\n loss_func_name = self.loss\n if loss_func_name == 'log_loss' and self.out_activation_ == 'logistic':\n loss_func_name = 'binary_log_loss'\n loss = LOSS_FUNCTIONS[loss_func_name](y, activations[-1])\n values = 0\n for s in self.coefs_:\n s = s.ravel()\n values += np.dot(s, s)\n loss += 0.5 * self.alpha * values / n_samples\n last = self.n_layers_ - 2\n 
deltas[last] = activations[-1] - y\n self._compute_loss_grad(last, n_samples, activations, deltas, coef_grads, intercept_grads)\n inplace_derivative = DERIVATIVES[self.activation]\n for i in range(self.n_layers_ - 2, 0, -1):\n deltas[i - 1] = safe_sparse_dot(deltas[i], self.coefs_[i].T)\n inplace_derivative(activations[i], deltas[i - 1])\n self._compute_loss_grad(i - 1, n_samples, activations, deltas, coef_grads, intercept_grads)\n return loss, coef_grads, intercept_grads\n \n def _initialize(self, y, layer_units, dtype):\n self.n_iter_ = 0\n self.t_ = 0\n self.n_outputs_ = y.shape[1]\n self.n_layers_ = len(layer_units)\n if not is_classifier(self):\n self.out_activation_ = 'identity'\n elif self._label_binarizer.y_type_ == 'multiclass':\n self.out_activation_ = 'softmax'\n else:\n self.out_activation_ = 'logistic'\n self.coefs_ = []\n self.intercepts_ = []\n for i in range(self.n_layers_ - 1):\n (coef_init, intercept_init) = self._init_coef(layer_units[i], layer_units[i + 1], dtype)\n self.coefs_.append(coef_init)\n self.intercepts_.append(intercept_init)\n if self.solver in _STOCHASTIC_SOLVERS:\n self.loss_curve_ = []\n self._no_improvement_count = 0\n if self.early_stopping:\n self.validation_scores_ = []\n self.best_validation_score_ = -np.inf\n else:\n self.best_loss_ = np.inf\n \n def _init_coef(self, fan_in, fan_out, dtype):\n factor = 6.0\n if self.activation == 'logistic':\n factor = 2.0\n init_bound = np.sqrt(factor / (fan_in + fan_out))\n coef_init = self._random_state.uniform(-init_bound, init_bound, (fan_in, fan_out))\n intercept_init = self._random_state.uniform(-init_bound, init_bound, fan_out)\n coef_init = coef_init.astype(dtype, copy=False)\n intercept_init = intercept_init.astype(dtype, copy=False)\n return coef_init, intercept_init\n \n def _fit(self, X, y, incremental=False):\n hidden_layer_sizes = self.hidden_layer_sizes\n if not hasattr(hidden_layer_sizes, '__iter__'):\n hidden_layer_sizes = [hidden_layer_sizes]\n hidden_layer_sizes = list(hidden_layer_sizes)\n self._validate_hyperparameters()\n if np.any(np.array(hidden_layer_sizes) <= 0):\n raise ValueError('hidden_layer_sizes must be > 0, got %s.' % hidden_layer_sizes)\n first_pass = not hasattr(self, 'coefs_') or not self.warm_start and not incremental\n (X, y) = self._validate_input(X, y, incremental, reset=first_pass)\n (n_samples, n_features) = X.shape\n if y.ndim == 1:\n y = y.reshape((-1, 1))\n self.n_outputs_ = y.shape[1]\n layer_units = [n_features] + hidden_layer_sizes + [self.n_outputs_]\n self._random_state = check_random_state(self.random_state)\n if first_pass:\n self._initialize(y, layer_units, X.dtype)\n activations = [X] + [None] * (len(layer_units) - 1)\n deltas = [None] * (len(activations) - 1)\n coef_grads = [np.empty((n_fan_in_, n_fan_out_), dtype=X.dtype) for (n_fan_in_, n_fan_out_) in zip(layer_units[:-1], layer_units[1:])]\n intercept_grads = [np.empty(n_fan_out_, dtype=X.dtype) for n_fan_out_ in layer_units[1:]]\n if self.solver in _STOCHASTIC_SOLVERS:\n self._fit_stochastic(X, y, activations, deltas, coef_grads, intercept_grads, layer_units, incremental)\n elif self.solver == 'lbfgs':\n self._fit_lbfgs(X, y, activations, deltas, coef_grads, intercept_grads, layer_units)\n return self\n \n def _validate_hyperparameters(self):\n if not isinstance(self.shuffle, bool):\n raise ValueError('shuffle must be either True or False, got %s.' % self.shuffle)\n if self.max_iter <= 0:\n raise ValueError('max_iter must be > 0, got %s.' 
% self.max_iter)\n if self.max_fun <= 0:\n raise ValueError('max_fun must be > 0, got %s.' % self.max_fun)\n if self.alpha < 0.0:\n raise ValueError('alpha must be >= 0, got %s.' % self.alpha)\n if self.learning_rate in ['constant', 'invscaling', 'adaptive'] and self.learning_rate_init <= 0.0:\n raise ValueError('learning_rate_init must be > 0, got %s.' % self.learning_rate)\n if self.momentum > 1 or self.momentum < 0:\n raise ValueError('momentum must be >= 0 and <= 1, got %s' % self.momentum)\n if not isinstance(self.nesterovs_momentum, bool):\n raise ValueError('nesterovs_momentum must be either True or False, got %s.' % self.nesterovs_momentum)\n if not isinstance(self.early_stopping, bool):\n raise ValueError('early_stopping must be either True or False, got %s.' % self.early_stopping)\n if self.validation_fraction < 0 or self.validation_fraction >= 1:\n raise ValueError('validation_fraction must be >= 0 and < 1, got %s' % self.validation_fraction)\n if self.beta_1 < 0 or self.beta_1 >= 1:\n raise ValueError('beta_1 must be >= 0 and < 1, got %s' % self.beta_1)\n if self.beta_2 < 0 or self.beta_2 >= 1:\n raise ValueError('beta_2 must be >= 0 and < 1, got %s' % self.beta_2)\n if self.epsilon <= 0.0:\n raise ValueError('epsilon must be > 0, got %s.' % self.epsilon)\n if self.n_iter_no_change <= 0:\n raise ValueError('n_iter_no_change must be > 0, got %s.' % self.n_iter_no_change)\n if self.activation not in ACTIVATIONS:\n raise ValueError(\"The activation '%s' is not supported. Supported activations are %s.\" % (self.activation, list(sorted(ACTIVATIONS))))\n if self.learning_rate not in ['constant', 'invscaling', 'adaptive']:\n raise ValueError('learning rate %s is not supported. ' % self.learning_rate)\n supported_solvers = _STOCHASTIC_SOLVERS + ['lbfgs']\n if self.solver not in supported_solvers:\n raise ValueError('The solver %s is not supported. 
Expected one of: %s' % (self.solver, ', '.join(supported_solvers)))\n \n def _fit_lbfgs(self, X, y, activations, deltas, coef_grads, intercept_grads, layer_units):\n self._coef_indptr = []\n self._intercept_indptr = []\n start = 0\n for i in range(self.n_layers_ - 1):\n (n_fan_in, n_fan_out) = (layer_units[i], layer_units[i + 1])\n end = start + n_fan_in * n_fan_out\n self._coef_indptr.append((start, end, (n_fan_in, n_fan_out)))\n start = end\n for i in range(self.n_layers_ - 1):\n end = start + layer_units[i + 1]\n self._intercept_indptr.append((start, end))\n start = end\n packed_coef_inter = _pack(self.coefs_, self.intercepts_)\n if self.verbose is True or self.verbose >= 1:\n iprint = 1\n else:\n iprint = -1\n opt_res = scipy.optimize.minimize(self._loss_grad_lbfgs, packed_coef_inter, method='L-BFGS-B', jac=True, options={'maxfun': self.max_fun, 'maxiter': self.max_iter, 'iprint': iprint, 'gtol': self.tol}, args=(X, y, activations, deltas, coef_grads, intercept_grads))\n self.n_iter_ = _check_optimize_result('lbfgs', opt_res, self.max_iter)\n self.loss_ = opt_res.fun\n self._unpack(opt_res.x)\n \n def _fit_stochastic(self, X, y, activations, deltas, coef_grads, intercept_grads, layer_units, incremental):\n params = self.coefs_ + self.intercepts_\n if not incremental or not hasattr(self, '_optimizer'):\n if self.solver == 'sgd':\n self._optimizer = SGDOptimizer(params, self.learning_rate_init, self.learning_rate, self.momentum, self.nesterovs_momentum, self.power_t)\n elif self.solver == 'adam':\n self._optimizer = AdamOptimizer(params, self.learning_rate_init, self.beta_1, self.beta_2, self.epsilon)\n early_stopping = self.early_stopping and not incremental\n if early_stopping:\n should_stratify = is_classifier(self) and self.n_outputs_ == 1\n stratify = y if should_stratify else None\n (X, X_val, y, y_val) = train_test_split(X, y, random_state=self._random_state, test_size=self.validation_fraction, stratify=stratify)\n if is_classifier(self):\n y_val = self._label_binarizer.inverse_transform(y_val)\n else:\n X_val = None\n y_val = None\n n_samples = X.shape[0]\n sample_idx = np.arange(n_samples, dtype=int)\n if self.batch_size == 'auto':\n batch_size = min(200, n_samples)\n else:\n if self.batch_size < 1 or self.batch_size > n_samples:\n warnings.warn('Got `batch_size` less than 1 or larger than sample size. 
It is going to be clipped')\n batch_size = np.clip(self.batch_size, 1, n_samples)\n try:\n for it in range(self.max_iter):\n if self.shuffle:\n sample_idx = shuffle(sample_idx, random_state=self._random_state)\n accumulated_loss = 0.0\n for batch_slice in gen_batches(n_samples, batch_size):\n if self.shuffle:\n X_batch = _safe_indexing(X, sample_idx[batch_slice])\n y_batch = y[sample_idx[batch_slice]]\n else:\n X_batch = X[batch_slice]\n y_batch = y[batch_slice]\n activations[0] = X_batch\n (batch_loss, coef_grads, intercept_grads) = self._backprop(X_batch, y_batch, activations, deltas, coef_grads, intercept_grads)\n accumulated_loss += batch_loss * (batch_slice.stop - batch_slice.start)\n grads = coef_grads + intercept_grads\n self._optimizer.update_params(params, grads)\n self.n_iter_ += 1\n self.loss_ = accumulated_loss / X.shape[0]\n self.t_ += n_samples\n self.loss_curve_.append(self.loss_)\n if self.verbose:\n print('Iteration %d, loss = %.8f' % (self.n_iter_, self.loss_))\n self._update_no_improvement_count(early_stopping, X_val, y_val)\n self._optimizer.iteration_ends(self.t_)\n if self._no_improvement_count > self.n_iter_no_change:\n if early_stopping:\n msg = 'Validation score did not improve more than tol=%f for %d consecutive epochs.' % (self.tol, self.n_iter_no_change)\n else:\n msg = 'Training loss did not improve more than tol=%f for %d consecutive epochs.' % (self.tol, self.n_iter_no_change)\n is_stopping = self._optimizer.trigger_stopping(msg, self.verbose)\n if is_stopping:\n break\n else:\n self._no_improvement_count = 0\n if incremental:\n break\n if self.n_iter_ == self.max_iter:\n warnings.warn(\"Stochastic Optimizer: Maximum iterations (%d) reached and the optimization hasn't converged yet.\" % self.max_iter, ConvergenceWarning)\n except KeyboardInterrupt:\n warnings.warn('Training interrupted by user.')\n if early_stopping:\n self.coefs_ = self._best_coefs\n self.intercepts_ = self._best_intercepts\n \n def _update_no_improvement_count(self, early_stopping, X_val, y_val):\n if early_stopping:\n self.validation_scores_.append(self.score(X_val, y_val))\n if self.verbose:\n print('Validation score: %f' % self.validation_scores_[-1])\n last_valid_score = self.validation_scores_[-1]\n if last_valid_score < self.best_validation_score_ + self.tol:\n self._no_improvement_count += 1\n else:\n self._no_improvement_count = 0\n if last_valid_score > self.best_validation_score_:\n self.best_validation_score_ = last_valid_score\n self._best_coefs = [c.copy() for c in self.coefs_]\n self._best_intercepts = [i.copy() for i in self.intercepts_]\n else:\n if self.loss_curve_[-1] > self.best_loss_ - self.tol:\n self._no_improvement_count += 1\n else:\n self._no_improvement_count = 0\n if self.loss_curve_[-1] < self.best_loss_:\n self.best_loss_ = self.loss_curve_[-1]\n \n def fit(self, X, y):\n \"\"\"Fit the model to data matrix X and target(s) y.\n\n Parameters\n ----------\n X : ndarray or sparse matrix of shape (n_samples, n_features)\n The input data.\n\n y : ndarray of shape (n_samples,) or (n_samples, n_outputs)\n The target values (class labels in classification, real numbers in\n regression).\n\n Returns\n -------\n self : object\n Returns a trained MLP model.\n \"\"\"\n return self._fit(X, y, incremental=False)\n \n def _check_solver(self):\n if self.solver not in _STOCHASTIC_SOLVERS:\n raise AttributeError('partial_fit is only available for stochastic optimizers. %s is not stochastic.' 
% self.solver)\n return True\n \n @available_if(_check_solver)\n def partial_fit(self, X, y):\n \"\"\"Update the model with a single iteration over the given data.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input data.\n\n y : ndarray of shape (n_samples,)\n The target values.\n\n Returns\n -------\n self : object\n Trained MLP model.\n \"\"\"\n return self._fit(X, y, incremental=True)\n" }, @@ -25636,9 +25732,9 @@ "sklearn.neural_network._multilayer_perceptron.MLPClassifier._more_tags" ], "is_public": true, - "description": "Multi-layer Perceptron classifier.\n\nThis model optimizes the log-loss function using LBFGS or stochastic gradient descent. .. versionadded:: 0.18", - "docstring": "Multi-layer Perceptron classifier.\n\n This model optimizes the log-loss function using LBFGS or stochastic\n gradient descent.\n\n .. versionadded:: 0.18\n\n Parameters\n ----------\n hidden_layer_sizes : tuple, length = n_layers - 2, default=(100,)\n The ith element represents the number of neurons in the ith\n hidden layer.\n\n activation : {'identity', 'logistic', 'tanh', 'relu'}, default='relu'\n Activation function for the hidden layer.\n\n - 'identity', no-op activation, useful to implement linear bottleneck,\n returns f(x) = x\n\n - 'logistic', the logistic sigmoid function,\n returns f(x) = 1 / (1 + exp(-x)).\n\n - 'tanh', the hyperbolic tan function,\n returns f(x) = tanh(x).\n\n - 'relu', the rectified linear unit function,\n returns f(x) = max(0, x)\n\n solver : {'lbfgs', 'sgd', 'adam'}, default='adam'\n The solver for weight optimization.\n\n - 'lbfgs' is an optimizer in the family of quasi-Newton methods.\n\n - 'sgd' refers to stochastic gradient descent.\n\n - 'adam' refers to a stochastic gradient-based optimizer proposed\n by Kingma, Diederik, and Jimmy Ba\n\n Note: The default solver 'adam' works pretty well on relatively\n large datasets (with thousands of training samples or more) in terms of\n both training time and validation score.\n For small datasets, however, 'lbfgs' can converge faster and perform\n better.\n\n alpha : float, default=0.0001\n L2 penalty (regularization term) parameter.\n\n batch_size : int, default='auto'\n Size of minibatches for stochastic optimizers.\n If the solver is 'lbfgs', the classifier will not use minibatch.\n When set to \"auto\", `batch_size=min(200, n_samples)`.\n\n learning_rate : {'constant', 'invscaling', 'adaptive'}, default='constant'\n Learning rate schedule for weight updates.\n\n - 'constant' is a constant learning rate given by\n 'learning_rate_init'.\n\n - 'invscaling' gradually decreases the learning rate at each\n time step 't' using an inverse scaling exponent of 'power_t'.\n effective_learning_rate = learning_rate_init / pow(t, power_t)\n\n - 'adaptive' keeps the learning rate constant to\n 'learning_rate_init' as long as training loss keeps decreasing.\n Each time two consecutive epochs fail to decrease training loss by at\n least tol, or fail to increase validation score by at least tol if\n 'early_stopping' is on, the current learning rate is divided by 5.\n\n Only used when ``solver='sgd'``.\n\n learning_rate_init : double, default=0.001\n The initial learning rate used. It controls the step-size\n in updating the weights. Only used when solver='sgd' or 'adam'.\n\n power_t : double, default=0.5\n The exponent for inverse scaling learning rate.\n It is used in updating effective learning rate when the learning_rate\n is set to 'invscaling'. 
Only used when solver='sgd'.\n\n max_iter : int, default=200\n Maximum number of iterations. The solver iterates until convergence\n (determined by 'tol') or this number of iterations. For stochastic\n solvers ('sgd', 'adam'), note that this determines the number of epochs\n (how many times each data point will be used), not the number of\n gradient steps.\n\n shuffle : bool, default=True\n Whether to shuffle samples in each iteration. Only used when\n solver='sgd' or 'adam'.\n\n random_state : int, RandomState instance, default=None\n Determines random number generation for weights and bias\n initialization, train-test split if early stopping is used, and batch\n sampling when solver='sgd' or 'adam'.\n Pass an int for reproducible results across multiple function calls.\n See :term:`Glossary `.\n\n tol : float, default=1e-4\n Tolerance for the optimization. When the loss or score is not improving\n by at least ``tol`` for ``n_iter_no_change`` consecutive iterations,\n unless ``learning_rate`` is set to 'adaptive', convergence is\n considered to be reached and training stops.\n\n verbose : bool, default=False\n Whether to print progress messages to stdout.\n\n warm_start : bool, default=False\n When set to True, reuse the solution of the previous\n call to fit as initialization, otherwise, just erase the\n previous solution. See :term:`the Glossary `.\n\n momentum : float, default=0.9\n Momentum for gradient descent update. Should be between 0 and 1. Only\n used when solver='sgd'.\n\n nesterovs_momentum : bool, default=True\n Whether to use Nesterov's momentum. Only used when solver='sgd' and\n momentum > 0.\n\n early_stopping : bool, default=False\n Whether to use early stopping to terminate training when validation\n score is not improving. If set to true, it will automatically set\n aside 10% of training data as validation and terminate training when\n validation score is not improving by at least tol for\n ``n_iter_no_change`` consecutive epochs. The split is stratified,\n except in a multilabel setting.\n If early stopping is False, then the training stops when the training\n loss does not improve by more than tol for n_iter_no_change consecutive\n passes over the training set.\n Only effective when solver='sgd' or 'adam'.\n\n validation_fraction : float, default=0.1\n The proportion of training data to set aside as validation set for\n early stopping. Must be between 0 and 1.\n Only used if early_stopping is True.\n\n beta_1 : float, default=0.9\n Exponential decay rate for estimates of first moment vector in adam,\n should be in [0, 1). Only used when solver='adam'.\n\n beta_2 : float, default=0.999\n Exponential decay rate for estimates of second moment vector in adam,\n should be in [0, 1). Only used when solver='adam'.\n\n epsilon : float, default=1e-8\n Value for numerical stability in adam. Only used when solver='adam'.\n\n n_iter_no_change : int, default=10\n Maximum number of epochs to not meet ``tol`` improvement.\n Only effective when solver='sgd' or 'adam'.\n\n .. versionadded:: 0.20\n\n max_fun : int, default=15000\n Only used when solver='lbfgs'. Maximum number of loss function calls.\n The solver iterates until convergence (determined by 'tol'), number\n of iterations reaches max_iter, or this number of loss function calls.\n Note that number of loss function calls will be greater than or equal\n to the number of iterations for the `MLPClassifier`.\n\n .. 
versionadded:: 0.22\n\n Attributes\n ----------\n classes_ : ndarray or list of ndarray of shape (n_classes,)\n Class labels for each output.\n\n loss_ : float\n The current loss computed with the loss function.\n\n best_loss_ : float\n The minimum loss reached by the solver throughout fitting.\n\n loss_curve_ : list of shape (`n_iter_`,)\n The ith element in the list represents the loss at the ith iteration.\n\n t_ : int\n The number of training samples seen by the solver during fitting.\n\n coefs_ : list of shape (n_layers - 1,)\n The ith element in the list represents the weight matrix corresponding\n to layer i.\n\n intercepts_ : list of shape (n_layers - 1,)\n The ith element in the list represents the bias vector corresponding to\n layer i + 1.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n n_iter_ : int\n The number of iterations the solver has run.\n\n n_layers_ : int\n Number of layers.\n\n n_outputs_ : int\n Number of outputs.\n\n out_activation_ : str\n Name of the output activation function.\n\n See Also\n --------\n MLPRegressor : Multi-layer Perceptron regressor.\n BernoulliRBM : Bernoulli Restricted Boltzmann Machine (RBM).\n\n Notes\n -----\n MLPClassifier trains iteratively since at each time step\n the partial derivatives of the loss function with respect to the model\n parameters are computed to update the parameters.\n\n It can also have a regularization term added to the loss function\n that shrinks model parameters to prevent overfitting.\n\n This implementation works with data represented as dense numpy arrays or\n sparse scipy arrays of floating point values.\n\n References\n ----------\n Hinton, Geoffrey E.\n \"Connectionist learning procedures.\" Artificial intelligence 40.1\n (1989): 185-234.\n\n Glorot, Xavier, and Yoshua Bengio. \"Understanding the difficulty of\n training deep feedforward neural networks.\" International Conference\n on Artificial Intelligence and Statistics. 2010.\n\n He, Kaiming, et al. \"Delving deep into rectifiers: Surpassing human-level\n performance on imagenet classification.\" arXiv preprint\n arXiv:1502.01852 (2015).\n\n Kingma, Diederik, and Jimmy Ba. \"Adam: A method for stochastic\n optimization.\" arXiv preprint arXiv:1412.6980 (2014).\n\n Examples\n --------\n >>> from sklearn.neural_network import MLPClassifier\n >>> from sklearn.datasets import make_classification\n >>> from sklearn.model_selection import train_test_split\n >>> X, y = make_classification(n_samples=100, random_state=1)\n >>> X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y,\n ... random_state=1)\n >>> clf = MLPClassifier(random_state=1, max_iter=300).fit(X_train, y_train)\n >>> clf.predict_proba(X_test[:1])\n array([[0.038..., 0.961...]])\n >>> clf.predict(X_test[:5, :])\n array([1, 0, 1, 0, 1])\n >>> clf.score(X_test, y_test)\n 0.8...\n ", - "source_code": "\n\nclass MLPClassifier(ClassifierMixin, BaseMultilayerPerceptron):\n \"\"\"Multi-layer Perceptron classifier.\n\n This model optimizes the log-loss function using LBFGS or stochastic\n gradient descent.\n\n .. 
versionadded:: 0.18\n\n Parameters\n ----------\n hidden_layer_sizes : tuple, length = n_layers - 2, default=(100,)\n The ith element represents the number of neurons in the ith\n hidden layer.\n\n activation : {'identity', 'logistic', 'tanh', 'relu'}, default='relu'\n Activation function for the hidden layer.\n\n - 'identity', no-op activation, useful to implement linear bottleneck,\n returns f(x) = x\n\n - 'logistic', the logistic sigmoid function,\n returns f(x) = 1 / (1 + exp(-x)).\n\n - 'tanh', the hyperbolic tan function,\n returns f(x) = tanh(x).\n\n - 'relu', the rectified linear unit function,\n returns f(x) = max(0, x)\n\n solver : {'lbfgs', 'sgd', 'adam'}, default='adam'\n The solver for weight optimization.\n\n - 'lbfgs' is an optimizer in the family of quasi-Newton methods.\n\n - 'sgd' refers to stochastic gradient descent.\n\n - 'adam' refers to a stochastic gradient-based optimizer proposed\n by Kingma, Diederik, and Jimmy Ba\n\n Note: The default solver 'adam' works pretty well on relatively\n large datasets (with thousands of training samples or more) in terms of\n both training time and validation score.\n For small datasets, however, 'lbfgs' can converge faster and perform\n better.\n\n alpha : float, default=0.0001\n L2 penalty (regularization term) parameter.\n\n batch_size : int, default='auto'\n Size of minibatches for stochastic optimizers.\n If the solver is 'lbfgs', the classifier will not use minibatch.\n When set to \"auto\", `batch_size=min(200, n_samples)`.\n\n learning_rate : {'constant', 'invscaling', 'adaptive'}, default='constant'\n Learning rate schedule for weight updates.\n\n - 'constant' is a constant learning rate given by\n 'learning_rate_init'.\n\n - 'invscaling' gradually decreases the learning rate at each\n time step 't' using an inverse scaling exponent of 'power_t'.\n effective_learning_rate = learning_rate_init / pow(t, power_t)\n\n - 'adaptive' keeps the learning rate constant to\n 'learning_rate_init' as long as training loss keeps decreasing.\n Each time two consecutive epochs fail to decrease training loss by at\n least tol, or fail to increase validation score by at least tol if\n 'early_stopping' is on, the current learning rate is divided by 5.\n\n Only used when ``solver='sgd'``.\n\n learning_rate_init : double, default=0.001\n The initial learning rate used. It controls the step-size\n in updating the weights. Only used when solver='sgd' or 'adam'.\n\n power_t : double, default=0.5\n The exponent for inverse scaling learning rate.\n It is used in updating effective learning rate when the learning_rate\n is set to 'invscaling'. Only used when solver='sgd'.\n\n max_iter : int, default=200\n Maximum number of iterations. The solver iterates until convergence\n (determined by 'tol') or this number of iterations. For stochastic\n solvers ('sgd', 'adam'), note that this determines the number of epochs\n (how many times each data point will be used), not the number of\n gradient steps.\n\n shuffle : bool, default=True\n Whether to shuffle samples in each iteration. Only used when\n solver='sgd' or 'adam'.\n\n random_state : int, RandomState instance, default=None\n Determines random number generation for weights and bias\n initialization, train-test split if early stopping is used, and batch\n sampling when solver='sgd' or 'adam'.\n Pass an int for reproducible results across multiple function calls.\n See :term:`Glossary `.\n\n tol : float, default=1e-4\n Tolerance for the optimization. 
When the loss or score is not improving\n by at least ``tol`` for ``n_iter_no_change`` consecutive iterations,\n unless ``learning_rate`` is set to 'adaptive', convergence is\n considered to be reached and training stops.\n\n verbose : bool, default=False\n Whether to print progress messages to stdout.\n\n warm_start : bool, default=False\n When set to True, reuse the solution of the previous\n call to fit as initialization, otherwise, just erase the\n previous solution. See :term:`the Glossary `.\n\n momentum : float, default=0.9\n Momentum for gradient descent update. Should be between 0 and 1. Only\n used when solver='sgd'.\n\n nesterovs_momentum : bool, default=True\n Whether to use Nesterov's momentum. Only used when solver='sgd' and\n momentum > 0.\n\n early_stopping : bool, default=False\n Whether to use early stopping to terminate training when validation\n score is not improving. If set to true, it will automatically set\n aside 10% of training data as validation and terminate training when\n validation score is not improving by at least tol for\n ``n_iter_no_change`` consecutive epochs. The split is stratified,\n except in a multilabel setting.\n If early stopping is False, then the training stops when the training\n loss does not improve by more than tol for n_iter_no_change consecutive\n passes over the training set.\n Only effective when solver='sgd' or 'adam'.\n\n validation_fraction : float, default=0.1\n The proportion of training data to set aside as validation set for\n early stopping. Must be between 0 and 1.\n Only used if early_stopping is True.\n\n beta_1 : float, default=0.9\n Exponential decay rate for estimates of first moment vector in adam,\n should be in [0, 1). Only used when solver='adam'.\n\n beta_2 : float, default=0.999\n Exponential decay rate for estimates of second moment vector in adam,\n should be in [0, 1). Only used when solver='adam'.\n\n epsilon : float, default=1e-8\n Value for numerical stability in adam. Only used when solver='adam'.\n\n n_iter_no_change : int, default=10\n Maximum number of epochs to not meet ``tol`` improvement.\n Only effective when solver='sgd' or 'adam'.\n\n .. versionadded:: 0.20\n\n max_fun : int, default=15000\n Only used when solver='lbfgs'. Maximum number of loss function calls.\n The solver iterates until convergence (determined by 'tol'), number\n of iterations reaches max_iter, or this number of loss function calls.\n Note that number of loss function calls will be greater than or equal\n to the number of iterations for the `MLPClassifier`.\n\n .. versionadded:: 0.22\n\n Attributes\n ----------\n classes_ : ndarray or list of ndarray of shape (n_classes,)\n Class labels for each output.\n\n loss_ : float\n The current loss computed with the loss function.\n\n best_loss_ : float\n The minimum loss reached by the solver throughout fitting.\n\n loss_curve_ : list of shape (`n_iter_`,)\n The ith element in the list represents the loss at the ith iteration.\n\n t_ : int\n The number of training samples seen by the solver during fitting.\n\n coefs_ : list of shape (n_layers - 1,)\n The ith element in the list represents the weight matrix corresponding\n to layer i.\n\n intercepts_ : list of shape (n_layers - 1,)\n The ith element in the list represents the bias vector corresponding to\n layer i + 1.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. 
Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n n_iter_ : int\n The number of iterations the solver has run.\n\n n_layers_ : int\n Number of layers.\n\n n_outputs_ : int\n Number of outputs.\n\n out_activation_ : str\n Name of the output activation function.\n\n See Also\n --------\n MLPRegressor : Multi-layer Perceptron regressor.\n BernoulliRBM : Bernoulli Restricted Boltzmann Machine (RBM).\n\n Notes\n -----\n MLPClassifier trains iteratively since at each time step\n the partial derivatives of the loss function with respect to the model\n parameters are computed to update the parameters.\n\n It can also have a regularization term added to the loss function\n that shrinks model parameters to prevent overfitting.\n\n This implementation works with data represented as dense numpy arrays or\n sparse scipy arrays of floating point values.\n\n References\n ----------\n Hinton, Geoffrey E.\n \"Connectionist learning procedures.\" Artificial intelligence 40.1\n (1989): 185-234.\n\n Glorot, Xavier, and Yoshua Bengio. \"Understanding the difficulty of\n training deep feedforward neural networks.\" International Conference\n on Artificial Intelligence and Statistics. 2010.\n\n He, Kaiming, et al. \"Delving deep into rectifiers: Surpassing human-level\n performance on imagenet classification.\" arXiv preprint\n arXiv:1502.01852 (2015).\n\n Kingma, Diederik, and Jimmy Ba. \"Adam: A method for stochastic\n optimization.\" arXiv preprint arXiv:1412.6980 (2014).\n\n Examples\n --------\n >>> from sklearn.neural_network import MLPClassifier\n >>> from sklearn.datasets import make_classification\n >>> from sklearn.model_selection import train_test_split\n >>> X, y = make_classification(n_samples=100, random_state=1)\n >>> X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y,\n ... 
random_state=1)\n >>> clf = MLPClassifier(random_state=1, max_iter=300).fit(X_train, y_train)\n >>> clf.predict_proba(X_test[:1])\n array([[0.038..., 0.961...]])\n >>> clf.predict(X_test[:5, :])\n array([1, 0, 1, 0, 1])\n >>> clf.score(X_test, y_test)\n 0.8...\n \"\"\"\n \n def __init__(self, hidden_layer_sizes=(100, ), activation='relu', *, solver='adam', alpha=0.0001, batch_size='auto', learning_rate='constant', learning_rate_init=0.001, power_t=0.5, max_iter=200, shuffle=True, random_state=None, tol=0.0001, verbose=False, warm_start=False, momentum=0.9, nesterovs_momentum=True, early_stopping=False, validation_fraction=0.1, beta_1=0.9, beta_2=0.999, epsilon=1e-08, n_iter_no_change=10, max_fun=15000):\n super().__init__(hidden_layer_sizes=hidden_layer_sizes, activation=activation, solver=solver, alpha=alpha, batch_size=batch_size, learning_rate=learning_rate, learning_rate_init=learning_rate_init, power_t=power_t, max_iter=max_iter, loss='log_loss', shuffle=shuffle, random_state=random_state, tol=tol, verbose=verbose, warm_start=warm_start, momentum=momentum, nesterovs_momentum=nesterovs_momentum, early_stopping=early_stopping, validation_fraction=validation_fraction, beta_1=beta_1, beta_2=beta_2, epsilon=epsilon, n_iter_no_change=n_iter_no_change, max_fun=max_fun)\n \n def _validate_input(self, X, y, incremental, reset):\n (X, y) = self._validate_data(X, y, accept_sparse=['csr', 'csc'], multi_output=True, dtype=(np.float64, np.float32), reset=reset)\n if y.ndim == 2 and y.shape[1] == 1:\n y = column_or_1d(y, warn=True)\n if not hasattr(self, 'classes_') or not self.warm_start and not incremental:\n self._label_binarizer = LabelBinarizer()\n self._label_binarizer.fit(y)\n self.classes_ = self._label_binarizer.classes_\n else:\n classes = unique_labels(y)\n if self.warm_start:\n if set(classes) != set(self.classes_):\n raise ValueError(f'warm_start can only be used where `y` has the same classes as in the previous call to fit. Previously got {self.classes_}, `y` has {classes}')\n elif len(np.setdiff1d(classes, self.classes_, assume_unique=True)):\n raise ValueError(f\"`y` has classes not in `self.classes_`. `self.classes_` has {self.classes_}. 
'y' has {classes}.\")\n y = self._label_binarizer.transform(y).astype(bool)\n return X, y\n \n def predict(self, X):\n \"\"\"Predict using the multi-layer perceptron classifier.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input data.\n\n Returns\n -------\n y : ndarray, shape (n_samples,) or (n_samples, n_classes)\n The predicted classes.\n \"\"\"\n check_is_fitted(self)\n y_pred = self._forward_pass_fast(X)\n if self.n_outputs_ == 1:\n y_pred = y_pred.ravel()\n return self._label_binarizer.inverse_transform(y_pred)\n \n @available_if(lambda est: est._check_solver())\n def partial_fit(self, X, y, classes=None):\n \"\"\"Update the model with a single iteration over the given data.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input data.\n\n y : array-like of shape (n_samples,)\n The target values.\n\n classes : array of shape (n_classes,), default=None\n Classes across all calls to partial_fit.\n Can be obtained via `np.unique(y_all)`, where y_all is the\n target vector of the entire dataset.\n This argument is required for the first call to partial_fit\n and can be omitted in the subsequent calls.\n Note that y doesn't need to contain all labels in `classes`.\n\n Returns\n -------\n self : object\n Trained MLP model.\n \"\"\"\n if _check_partial_fit_first_call(self, classes):\n self._label_binarizer = LabelBinarizer()\n if type_of_target(y).startswith('multilabel'):\n self._label_binarizer.fit(y)\n else:\n self._label_binarizer.fit(classes)\n super().partial_fit(X, y)\n return self\n \n def predict_log_proba(self, X):\n \"\"\"Return the log of probability estimates.\n\n Parameters\n ----------\n X : ndarray of shape (n_samples, n_features)\n The input data.\n\n Returns\n -------\n log_y_prob : ndarray of shape (n_samples, n_classes)\n The predicted log-probability of the sample for each class\n in the model, where classes are ordered as they are in\n `self.classes_`. Equivalent to `log(predict_proba(X))`.\n \"\"\"\n y_prob = self.predict_proba(X)\n return np.log(y_prob, out=y_prob)\n \n def predict_proba(self, X):\n \"\"\"Probability estimates.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input data.\n\n Returns\n -------\n y_prob : ndarray of shape (n_samples, n_classes)\n The predicted probability of the sample for each class in the\n model, where classes are ordered as they are in `self.classes_`.\n \"\"\"\n check_is_fitted(self)\n y_pred = self._forward_pass_fast(X)\n if self.n_outputs_ == 1:\n y_pred = y_pred.ravel()\n if y_pred.ndim == 1:\n return np.vstack([1 - y_pred, y_pred]).T\n else:\n return y_pred\n \n def _more_tags(self):\n return {'multilabel': True}\n" + "description": "Multi-layer Perceptron classifier.\n\nThis model optimizes the log-loss function using LBFGS or stochastic\ngradient descent.\n\n.. versionadded:: 0.18", + "docstring": "Multi-layer Perceptron classifier.\n\n This model optimizes the log-loss function using LBFGS or stochastic\n gradient descent.\n\n .. 
versionadded:: 0.18\n\n Parameters\n ----------\n hidden_layer_sizes : tuple, length = n_layers - 2, default=(100,)\n The ith element represents the number of neurons in the ith\n hidden layer.\n\n activation : {'identity', 'logistic', 'tanh', 'relu'}, default='relu'\n Activation function for the hidden layer.\n\n - 'identity', no-op activation, useful to implement linear bottleneck,\n returns f(x) = x\n\n - 'logistic', the logistic sigmoid function,\n returns f(x) = 1 / (1 + exp(-x)).\n\n - 'tanh', the hyperbolic tan function,\n returns f(x) = tanh(x).\n\n - 'relu', the rectified linear unit function,\n returns f(x) = max(0, x)\n\n solver : {'lbfgs', 'sgd', 'adam'}, default='adam'\n The solver for weight optimization.\n\n - 'lbfgs' is an optimizer in the family of quasi-Newton methods.\n\n - 'sgd' refers to stochastic gradient descent.\n\n - 'adam' refers to a stochastic gradient-based optimizer proposed\n by Kingma, Diederik, and Jimmy Ba\n\n Note: The default solver 'adam' works pretty well on relatively\n large datasets (with thousands of training samples or more) in terms of\n both training time and validation score.\n For small datasets, however, 'lbfgs' can converge faster and perform\n better.\n\n alpha : float, default=0.0001\n L2 penalty (regularization term) parameter.\n\n batch_size : int, default='auto'\n Size of minibatches for stochastic optimizers.\n If the solver is 'lbfgs', the classifier will not use minibatch.\n When set to \"auto\", `batch_size=min(200, n_samples)`.\n\n learning_rate : {'constant', 'invscaling', 'adaptive'}, default='constant'\n Learning rate schedule for weight updates.\n\n - 'constant' is a constant learning rate given by\n 'learning_rate_init'.\n\n - 'invscaling' gradually decreases the learning rate at each\n time step 't' using an inverse scaling exponent of 'power_t'.\n effective_learning_rate = learning_rate_init / pow(t, power_t)\n\n - 'adaptive' keeps the learning rate constant to\n 'learning_rate_init' as long as training loss keeps decreasing.\n Each time two consecutive epochs fail to decrease training loss by at\n least tol, or fail to increase validation score by at least tol if\n 'early_stopping' is on, the current learning rate is divided by 5.\n\n Only used when ``solver='sgd'``.\n\n learning_rate_init : float, default=0.001\n The initial learning rate used. It controls the step-size\n in updating the weights. Only used when solver='sgd' or 'adam'.\n\n power_t : float, default=0.5\n The exponent for inverse scaling learning rate.\n It is used in updating effective learning rate when the learning_rate\n is set to 'invscaling'. Only used when solver='sgd'.\n\n max_iter : int, default=200\n Maximum number of iterations. The solver iterates until convergence\n (determined by 'tol') or this number of iterations. For stochastic\n solvers ('sgd', 'adam'), note that this determines the number of epochs\n (how many times each data point will be used), not the number of\n gradient steps.\n\n shuffle : bool, default=True\n Whether to shuffle samples in each iteration. Only used when\n solver='sgd' or 'adam'.\n\n random_state : int, RandomState instance, default=None\n Determines random number generation for weights and bias\n initialization, train-test split if early stopping is used, and batch\n sampling when solver='sgd' or 'adam'.\n Pass an int for reproducible results across multiple function calls.\n See :term:`Glossary `.\n\n tol : float, default=1e-4\n Tolerance for the optimization. 
When the loss or score is not improving\n by at least ``tol`` for ``n_iter_no_change`` consecutive iterations,\n unless ``learning_rate`` is set to 'adaptive', convergence is\n considered to be reached and training stops.\n\n verbose : bool, default=False\n Whether to print progress messages to stdout.\n\n warm_start : bool, default=False\n When set to True, reuse the solution of the previous\n call to fit as initialization, otherwise, just erase the\n previous solution. See :term:`the Glossary `.\n\n momentum : float, default=0.9\n Momentum for gradient descent update. Should be between 0 and 1. Only\n used when solver='sgd'.\n\n nesterovs_momentum : bool, default=True\n Whether to use Nesterov's momentum. Only used when solver='sgd' and\n momentum > 0.\n\n early_stopping : bool, default=False\n Whether to use early stopping to terminate training when validation\n score is not improving. If set to true, it will automatically set\n aside 10% of training data as validation and terminate training when\n validation score is not improving by at least tol for\n ``n_iter_no_change`` consecutive epochs. The split is stratified,\n except in a multilabel setting.\n If early stopping is False, then the training stops when the training\n loss does not improve by more than tol for n_iter_no_change consecutive\n passes over the training set.\n Only effective when solver='sgd' or 'adam'.\n\n validation_fraction : float, default=0.1\n The proportion of training data to set aside as validation set for\n early stopping. Must be between 0 and 1.\n Only used if early_stopping is True.\n\n beta_1 : float, default=0.9\n Exponential decay rate for estimates of first moment vector in adam,\n should be in [0, 1). Only used when solver='adam'.\n\n beta_2 : float, default=0.999\n Exponential decay rate for estimates of second moment vector in adam,\n should be in [0, 1). Only used when solver='adam'.\n\n epsilon : float, default=1e-8\n Value for numerical stability in adam. Only used when solver='adam'.\n\n n_iter_no_change : int, default=10\n Maximum number of epochs to not meet ``tol`` improvement.\n Only effective when solver='sgd' or 'adam'.\n\n .. versionadded:: 0.20\n\n max_fun : int, default=15000\n Only used when solver='lbfgs'. Maximum number of loss function calls.\n The solver iterates until convergence (determined by 'tol'), number\n of iterations reaches max_iter, or this number of loss function calls.\n Note that number of loss function calls will be greater than or equal\n to the number of iterations for the `MLPClassifier`.\n\n .. versionadded:: 0.22\n\n Attributes\n ----------\n classes_ : ndarray or list of ndarray of shape (n_classes,)\n Class labels for each output.\n\n loss_ : float\n The current loss computed with the loss function.\n\n best_loss_ : float\n The minimum loss reached by the solver throughout fitting.\n\n loss_curve_ : list of shape (`n_iter_`,)\n The ith element in the list represents the loss at the ith iteration.\n\n t_ : int\n The number of training samples seen by the solver during fitting.\n\n coefs_ : list of shape (n_layers - 1,)\n The ith element in the list represents the weight matrix corresponding\n to layer i.\n\n intercepts_ : list of shape (n_layers - 1,)\n The ith element in the list represents the bias vector corresponding to\n layer i + 1.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. 
Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n n_iter_ : int\n The number of iterations the solver has run.\n\n n_layers_ : int\n Number of layers.\n\n n_outputs_ : int\n Number of outputs.\n\n out_activation_ : str\n Name of the output activation function.\n\n See Also\n --------\n MLPRegressor : Multi-layer Perceptron regressor.\n BernoulliRBM : Bernoulli Restricted Boltzmann Machine (RBM).\n\n Notes\n -----\n MLPClassifier trains iteratively since at each time step\n the partial derivatives of the loss function with respect to the model\n parameters are computed to update the parameters.\n\n It can also have a regularization term added to the loss function\n that shrinks model parameters to prevent overfitting.\n\n This implementation works with data represented as dense numpy arrays or\n sparse scipy arrays of floating point values.\n\n References\n ----------\n Hinton, Geoffrey E.\n \"Connectionist learning procedures.\" Artificial intelligence 40.1\n (1989): 185-234.\n\n Glorot, Xavier, and Yoshua Bengio. \"Understanding the difficulty of\n training deep feedforward neural networks.\" International Conference\n on Artificial Intelligence and Statistics. 2010.\n\n He, Kaiming, et al. \"Delving deep into rectifiers: Surpassing human-level\n performance on imagenet classification.\" arXiv preprint\n arXiv:1502.01852 (2015).\n\n Kingma, Diederik, and Jimmy Ba. \"Adam: A method for stochastic\n optimization.\" arXiv preprint arXiv:1412.6980 (2014).\n\n Examples\n --------\n >>> from sklearn.neural_network import MLPClassifier\n >>> from sklearn.datasets import make_classification\n >>> from sklearn.model_selection import train_test_split\n >>> X, y = make_classification(n_samples=100, random_state=1)\n >>> X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y,\n ... random_state=1)\n >>> clf = MLPClassifier(random_state=1, max_iter=300).fit(X_train, y_train)\n >>> clf.predict_proba(X_test[:1])\n array([[0.038..., 0.961...]])\n >>> clf.predict(X_test[:5, :])\n array([1, 0, 1, 0, 1])\n >>> clf.score(X_test, y_test)\n 0.8...\n ", + "source_code": "\n\nclass MLPClassifier(ClassifierMixin, BaseMultilayerPerceptron):\n \"\"\"Multi-layer Perceptron classifier.\n\n This model optimizes the log-loss function using LBFGS or stochastic\n gradient descent.\n\n .. 
versionadded:: 0.18\n\n Parameters\n ----------\n hidden_layer_sizes : tuple, length = n_layers - 2, default=(100,)\n The ith element represents the number of neurons in the ith\n hidden layer.\n\n activation : {'identity', 'logistic', 'tanh', 'relu'}, default='relu'\n Activation function for the hidden layer.\n\n - 'identity', no-op activation, useful to implement linear bottleneck,\n returns f(x) = x\n\n - 'logistic', the logistic sigmoid function,\n returns f(x) = 1 / (1 + exp(-x)).\n\n - 'tanh', the hyperbolic tan function,\n returns f(x) = tanh(x).\n\n - 'relu', the rectified linear unit function,\n returns f(x) = max(0, x)\n\n solver : {'lbfgs', 'sgd', 'adam'}, default='adam'\n The solver for weight optimization.\n\n - 'lbfgs' is an optimizer in the family of quasi-Newton methods.\n\n - 'sgd' refers to stochastic gradient descent.\n\n - 'adam' refers to a stochastic gradient-based optimizer proposed\n by Kingma, Diederik, and Jimmy Ba\n\n Note: The default solver 'adam' works pretty well on relatively\n large datasets (with thousands of training samples or more) in terms of\n both training time and validation score.\n For small datasets, however, 'lbfgs' can converge faster and perform\n better.\n\n alpha : float, default=0.0001\n L2 penalty (regularization term) parameter.\n\n batch_size : int, default='auto'\n Size of minibatches for stochastic optimizers.\n If the solver is 'lbfgs', the classifier will not use minibatch.\n When set to \"auto\", `batch_size=min(200, n_samples)`.\n\n learning_rate : {'constant', 'invscaling', 'adaptive'}, default='constant'\n Learning rate schedule for weight updates.\n\n - 'constant' is a constant learning rate given by\n 'learning_rate_init'.\n\n - 'invscaling' gradually decreases the learning rate at each\n time step 't' using an inverse scaling exponent of 'power_t'.\n effective_learning_rate = learning_rate_init / pow(t, power_t)\n\n - 'adaptive' keeps the learning rate constant to\n 'learning_rate_init' as long as training loss keeps decreasing.\n Each time two consecutive epochs fail to decrease training loss by at\n least tol, or fail to increase validation score by at least tol if\n 'early_stopping' is on, the current learning rate is divided by 5.\n\n Only used when ``solver='sgd'``.\n\n learning_rate_init : float, default=0.001\n The initial learning rate used. It controls the step-size\n in updating the weights. Only used when solver='sgd' or 'adam'.\n\n power_t : float, default=0.5\n The exponent for inverse scaling learning rate.\n It is used in updating effective learning rate when the learning_rate\n is set to 'invscaling'. Only used when solver='sgd'.\n\n max_iter : int, default=200\n Maximum number of iterations. The solver iterates until convergence\n (determined by 'tol') or this number of iterations. For stochastic\n solvers ('sgd', 'adam'), note that this determines the number of epochs\n (how many times each data point will be used), not the number of\n gradient steps.\n\n shuffle : bool, default=True\n Whether to shuffle samples in each iteration. Only used when\n solver='sgd' or 'adam'.\n\n random_state : int, RandomState instance, default=None\n Determines random number generation for weights and bias\n initialization, train-test split if early stopping is used, and batch\n sampling when solver='sgd' or 'adam'.\n Pass an int for reproducible results across multiple function calls.\n See :term:`Glossary `.\n\n tol : float, default=1e-4\n Tolerance for the optimization. 
When the loss or score is not improving\n by at least ``tol`` for ``n_iter_no_change`` consecutive iterations,\n unless ``learning_rate`` is set to 'adaptive', convergence is\n considered to be reached and training stops.\n\n verbose : bool, default=False\n Whether to print progress messages to stdout.\n\n warm_start : bool, default=False\n When set to True, reuse the solution of the previous\n call to fit as initialization, otherwise, just erase the\n previous solution. See :term:`the Glossary `.\n\n momentum : float, default=0.9\n Momentum for gradient descent update. Should be between 0 and 1. Only\n used when solver='sgd'.\n\n nesterovs_momentum : bool, default=True\n Whether to use Nesterov's momentum. Only used when solver='sgd' and\n momentum > 0.\n\n early_stopping : bool, default=False\n Whether to use early stopping to terminate training when validation\n score is not improving. If set to true, it will automatically set\n aside 10% of training data as validation and terminate training when\n validation score is not improving by at least tol for\n ``n_iter_no_change`` consecutive epochs. The split is stratified,\n except in a multilabel setting.\n If early stopping is False, then the training stops when the training\n loss does not improve by more than tol for n_iter_no_change consecutive\n passes over the training set.\n Only effective when solver='sgd' or 'adam'.\n\n validation_fraction : float, default=0.1\n The proportion of training data to set aside as validation set for\n early stopping. Must be between 0 and 1.\n Only used if early_stopping is True.\n\n beta_1 : float, default=0.9\n Exponential decay rate for estimates of first moment vector in adam,\n should be in [0, 1). Only used when solver='adam'.\n\n beta_2 : float, default=0.999\n Exponential decay rate for estimates of second moment vector in adam,\n should be in [0, 1). Only used when solver='adam'.\n\n epsilon : float, default=1e-8\n Value for numerical stability in adam. Only used when solver='adam'.\n\n n_iter_no_change : int, default=10\n Maximum number of epochs to not meet ``tol`` improvement.\n Only effective when solver='sgd' or 'adam'.\n\n .. versionadded:: 0.20\n\n max_fun : int, default=15000\n Only used when solver='lbfgs'. Maximum number of loss function calls.\n The solver iterates until convergence (determined by 'tol'), number\n of iterations reaches max_iter, or this number of loss function calls.\n Note that number of loss function calls will be greater than or equal\n to the number of iterations for the `MLPClassifier`.\n\n .. versionadded:: 0.22\n\n Attributes\n ----------\n classes_ : ndarray or list of ndarray of shape (n_classes,)\n Class labels for each output.\n\n loss_ : float\n The current loss computed with the loss function.\n\n best_loss_ : float\n The minimum loss reached by the solver throughout fitting.\n\n loss_curve_ : list of shape (`n_iter_`,)\n The ith element in the list represents the loss at the ith iteration.\n\n t_ : int\n The number of training samples seen by the solver during fitting.\n\n coefs_ : list of shape (n_layers - 1,)\n The ith element in the list represents the weight matrix corresponding\n to layer i.\n\n intercepts_ : list of shape (n_layers - 1,)\n The ith element in the list represents the bias vector corresponding to\n layer i + 1.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. 
Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n n_iter_ : int\n The number of iterations the solver has run.\n\n n_layers_ : int\n Number of layers.\n\n n_outputs_ : int\n Number of outputs.\n\n out_activation_ : str\n Name of the output activation function.\n\n See Also\n --------\n MLPRegressor : Multi-layer Perceptron regressor.\n BernoulliRBM : Bernoulli Restricted Boltzmann Machine (RBM).\n\n Notes\n -----\n MLPClassifier trains iteratively since at each time step\n the partial derivatives of the loss function with respect to the model\n parameters are computed to update the parameters.\n\n It can also have a regularization term added to the loss function\n that shrinks model parameters to prevent overfitting.\n\n This implementation works with data represented as dense numpy arrays or\n sparse scipy arrays of floating point values.\n\n References\n ----------\n Hinton, Geoffrey E.\n \"Connectionist learning procedures.\" Artificial intelligence 40.1\n (1989): 185-234.\n\n Glorot, Xavier, and Yoshua Bengio. \"Understanding the difficulty of\n training deep feedforward neural networks.\" International Conference\n on Artificial Intelligence and Statistics. 2010.\n\n He, Kaiming, et al. \"Delving deep into rectifiers: Surpassing human-level\n performance on imagenet classification.\" arXiv preprint\n arXiv:1502.01852 (2015).\n\n Kingma, Diederik, and Jimmy Ba. \"Adam: A method for stochastic\n optimization.\" arXiv preprint arXiv:1412.6980 (2014).\n\n Examples\n --------\n >>> from sklearn.neural_network import MLPClassifier\n >>> from sklearn.datasets import make_classification\n >>> from sklearn.model_selection import train_test_split\n >>> X, y = make_classification(n_samples=100, random_state=1)\n >>> X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y,\n ... 
random_state=1)\n >>> clf = MLPClassifier(random_state=1, max_iter=300).fit(X_train, y_train)\n >>> clf.predict_proba(X_test[:1])\n array([[0.038..., 0.961...]])\n >>> clf.predict(X_test[:5, :])\n array([1, 0, 1, 0, 1])\n >>> clf.score(X_test, y_test)\n 0.8...\n \"\"\"\n \n def __init__(self, hidden_layer_sizes=(100, ), activation='relu', *, solver='adam', alpha=0.0001, batch_size='auto', learning_rate='constant', learning_rate_init=0.001, power_t=0.5, max_iter=200, shuffle=True, random_state=None, tol=0.0001, verbose=False, warm_start=False, momentum=0.9, nesterovs_momentum=True, early_stopping=False, validation_fraction=0.1, beta_1=0.9, beta_2=0.999, epsilon=1e-08, n_iter_no_change=10, max_fun=15000):\n super().__init__(hidden_layer_sizes=hidden_layer_sizes, activation=activation, solver=solver, alpha=alpha, batch_size=batch_size, learning_rate=learning_rate, learning_rate_init=learning_rate_init, power_t=power_t, max_iter=max_iter, loss='log_loss', shuffle=shuffle, random_state=random_state, tol=tol, verbose=verbose, warm_start=warm_start, momentum=momentum, nesterovs_momentum=nesterovs_momentum, early_stopping=early_stopping, validation_fraction=validation_fraction, beta_1=beta_1, beta_2=beta_2, epsilon=epsilon, n_iter_no_change=n_iter_no_change, max_fun=max_fun)\n \n def _validate_input(self, X, y, incremental, reset):\n (X, y) = self._validate_data(X, y, accept_sparse=['csr', 'csc'], multi_output=True, dtype=(np.float64, np.float32), reset=reset)\n if y.ndim == 2 and y.shape[1] == 1:\n y = column_or_1d(y, warn=True)\n if not hasattr(self, 'classes_') or not self.warm_start and not incremental:\n self._label_binarizer = LabelBinarizer()\n self._label_binarizer.fit(y)\n self.classes_ = self._label_binarizer.classes_\n else:\n classes = unique_labels(y)\n if self.warm_start:\n if set(classes) != set(self.classes_):\n raise ValueError(f'warm_start can only be used where `y` has the same classes as in the previous call to fit. Previously got {self.classes_}, `y` has {classes}')\n elif len(np.setdiff1d(classes, self.classes_, assume_unique=True)):\n raise ValueError(f\"`y` has classes not in `self.classes_`. `self.classes_` has {self.classes_}. 
'y' has {classes}.\")\n y = self._label_binarizer.transform(y).astype(bool)\n return X, y\n \n def predict(self, X):\n \"\"\"Predict using the multi-layer perceptron classifier.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input data.\n\n Returns\n -------\n y : ndarray, shape (n_samples,) or (n_samples, n_classes)\n The predicted classes.\n \"\"\"\n check_is_fitted(self)\n y_pred = self._forward_pass_fast(X)\n if self.n_outputs_ == 1:\n y_pred = y_pred.ravel()\n return self._label_binarizer.inverse_transform(y_pred)\n \n @available_if(lambda est: est._check_solver())\n def partial_fit(self, X, y, classes=None):\n \"\"\"Update the model with a single iteration over the given data.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input data.\n\n y : array-like of shape (n_samples,)\n The target values.\n\n classes : array of shape (n_classes,), default=None\n Classes across all calls to partial_fit.\n Can be obtained via `np.unique(y_all)`, where y_all is the\n target vector of the entire dataset.\n This argument is required for the first call to partial_fit\n and can be omitted in the subsequent calls.\n Note that y doesn't need to contain all labels in `classes`.\n\n Returns\n -------\n self : object\n Trained MLP model.\n \"\"\"\n if _check_partial_fit_first_call(self, classes):\n self._label_binarizer = LabelBinarizer()\n if type_of_target(y).startswith('multilabel'):\n self._label_binarizer.fit(y)\n else:\n self._label_binarizer.fit(classes)\n super().partial_fit(X, y)\n return self\n \n def predict_log_proba(self, X):\n \"\"\"Return the log of probability estimates.\n\n Parameters\n ----------\n X : ndarray of shape (n_samples, n_features)\n The input data.\n\n Returns\n -------\n log_y_prob : ndarray of shape (n_samples, n_classes)\n The predicted log-probability of the sample for each class\n in the model, where classes are ordered as they are in\n `self.classes_`. Equivalent to `log(predict_proba(X))`.\n \"\"\"\n y_prob = self.predict_proba(X)\n return np.log(y_prob, out=y_prob)\n \n def predict_proba(self, X):\n \"\"\"Probability estimates.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input data.\n\n Returns\n -------\n y_prob : ndarray of shape (n_samples, n_classes)\n The predicted probability of the sample for each class in the\n model, where classes are ordered as they are in `self.classes_`.\n \"\"\"\n check_is_fitted(self)\n y_pred = self._forward_pass_fast(X)\n if self.n_outputs_ == 1:\n y_pred = y_pred.ravel()\n if y_pred.ndim == 1:\n return np.vstack([1 - y_pred, y_pred]).T\n else:\n return y_pred\n \n def _more_tags(self):\n return {'multilabel': True}\n" }, { "name": "MLPRegressor", @@ -25651,9 +25747,9 @@ "sklearn.neural_network._multilayer_perceptron.MLPRegressor._validate_input" ], "is_public": true, - "description": "Multi-layer Perceptron regressor.\n\nThis model optimizes the squared error using LBFGS or stochastic gradient descent. .. versionadded:: 0.18", - "docstring": "Multi-layer Perceptron regressor.\n\n This model optimizes the squared error using LBFGS or stochastic gradient\n descent.\n\n .. 
versionadded:: 0.18\n\n Parameters\n ----------\n hidden_layer_sizes : tuple, length = n_layers - 2, default=(100,)\n The ith element represents the number of neurons in the ith\n hidden layer.\n\n activation : {'identity', 'logistic', 'tanh', 'relu'}, default='relu'\n Activation function for the hidden layer.\n\n - 'identity', no-op activation, useful to implement linear bottleneck,\n returns f(x) = x\n\n - 'logistic', the logistic sigmoid function,\n returns f(x) = 1 / (1 + exp(-x)).\n\n - 'tanh', the hyperbolic tan function,\n returns f(x) = tanh(x).\n\n - 'relu', the rectified linear unit function,\n returns f(x) = max(0, x)\n\n solver : {'lbfgs', 'sgd', 'adam'}, default='adam'\n The solver for weight optimization.\n\n - 'lbfgs' is an optimizer in the family of quasi-Newton methods.\n\n - 'sgd' refers to stochastic gradient descent.\n\n - 'adam' refers to a stochastic gradient-based optimizer proposed by\n Kingma, Diederik, and Jimmy Ba\n\n Note: The default solver 'adam' works pretty well on relatively\n large datasets (with thousands of training samples or more) in terms of\n both training time and validation score.\n For small datasets, however, 'lbfgs' can converge faster and perform\n better.\n\n alpha : float, default=0.0001\n L2 penalty (regularization term) parameter.\n\n batch_size : int, default='auto'\n Size of minibatches for stochastic optimizers.\n If the solver is 'lbfgs', the classifier will not use minibatch.\n When set to \"auto\", `batch_size=min(200, n_samples)`.\n\n learning_rate : {'constant', 'invscaling', 'adaptive'}, default='constant'\n Learning rate schedule for weight updates.\n\n - 'constant' is a constant learning rate given by\n 'learning_rate_init'.\n\n - 'invscaling' gradually decreases the learning rate ``learning_rate_``\n at each time step 't' using an inverse scaling exponent of 'power_t'.\n effective_learning_rate = learning_rate_init / pow(t, power_t)\n\n - 'adaptive' keeps the learning rate constant to\n 'learning_rate_init' as long as training loss keeps decreasing.\n Each time two consecutive epochs fail to decrease training loss by at\n least tol, or fail to increase validation score by at least tol if\n 'early_stopping' is on, the current learning rate is divided by 5.\n\n Only used when solver='sgd'.\n\n learning_rate_init : double, default=0.001\n The initial learning rate used. It controls the step-size\n in updating the weights. Only used when solver='sgd' or 'adam'.\n\n power_t : double, default=0.5\n The exponent for inverse scaling learning rate.\n It is used in updating effective learning rate when the learning_rate\n is set to 'invscaling'. Only used when solver='sgd'.\n\n max_iter : int, default=200\n Maximum number of iterations. The solver iterates until convergence\n (determined by 'tol') or this number of iterations. For stochastic\n solvers ('sgd', 'adam'), note that this determines the number of epochs\n (how many times each data point will be used), not the number of\n gradient steps.\n\n shuffle : bool, default=True\n Whether to shuffle samples in each iteration. Only used when\n solver='sgd' or 'adam'.\n\n random_state : int, RandomState instance, default=None\n Determines random number generation for weights and bias\n initialization, train-test split if early stopping is used, and batch\n sampling when solver='sgd' or 'adam'.\n Pass an int for reproducible results across multiple function calls.\n See :term:`Glossary `.\n\n tol : float, default=1e-4\n Tolerance for the optimization. 
When the loss or score is not improving\n by at least ``tol`` for ``n_iter_no_change`` consecutive iterations,\n unless ``learning_rate`` is set to 'adaptive', convergence is\n considered to be reached and training stops.\n\n verbose : bool, default=False\n Whether to print progress messages to stdout.\n\n warm_start : bool, default=False\n When set to True, reuse the solution of the previous\n call to fit as initialization, otherwise, just erase the\n previous solution. See :term:`the Glossary `.\n\n momentum : float, default=0.9\n Momentum for gradient descent update. Should be between 0 and 1. Only\n used when solver='sgd'.\n\n nesterovs_momentum : bool, default=True\n Whether to use Nesterov's momentum. Only used when solver='sgd' and\n momentum > 0.\n\n early_stopping : bool, default=False\n Whether to use early stopping to terminate training when validation\n score is not improving. If set to true, it will automatically set\n aside 10% of training data as validation and terminate training when\n validation score is not improving by at least ``tol`` for\n ``n_iter_no_change`` consecutive epochs.\n Only effective when solver='sgd' or 'adam'.\n\n validation_fraction : float, default=0.1\n The proportion of training data to set aside as validation set for\n early stopping. Must be between 0 and 1.\n Only used if early_stopping is True.\n\n beta_1 : float, default=0.9\n Exponential decay rate for estimates of first moment vector in adam,\n should be in [0, 1). Only used when solver='adam'.\n\n beta_2 : float, default=0.999\n Exponential decay rate for estimates of second moment vector in adam,\n should be in [0, 1). Only used when solver='adam'.\n\n epsilon : float, default=1e-8\n Value for numerical stability in adam. Only used when solver='adam'.\n\n n_iter_no_change : int, default=10\n Maximum number of epochs to not meet ``tol`` improvement.\n Only effective when solver='sgd' or 'adam'.\n\n .. versionadded:: 0.20\n\n max_fun : int, default=15000\n Only used when solver='lbfgs'. Maximum number of function calls.\n The solver iterates until convergence (determined by 'tol'), number\n of iterations reaches max_iter, or this number of function calls.\n Note that number of function calls will be greater than or equal to\n the number of iterations for the MLPRegressor.\n\n .. versionadded:: 0.22\n\n Attributes\n ----------\n loss_ : float\n The current loss computed with the loss function.\n\n best_loss_ : float\n The minimum loss reached by the solver throughout fitting.\n\n loss_curve_ : list of shape (`n_iter_`,)\n Loss value evaluated at the end of each training step.\n The ith element in the list represents the loss at the ith iteration.\n\n t_ : int\n The number of training samples seen by the solver during fitting.\n Mathematically equals `n_iters * X.shape[0]`, it means\n `time_step` and it is used by optimizer's learning rate scheduler.\n\n coefs_ : list of shape (n_layers - 1,)\n The ith element in the list represents the weight matrix corresponding\n to layer i.\n\n intercepts_ : list of shape (n_layers - 1,)\n The ith element in the list represents the bias vector corresponding to\n layer i + 1.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. 
versionadded:: 1.0\n\n n_iter_ : int\n The number of iterations the solver has run.\n\n n_layers_ : int\n Number of layers.\n\n n_outputs_ : int\n Number of outputs.\n\n out_activation_ : str\n Name of the output activation function.\n\n See Also\n --------\n BernoulliRBM : Bernoulli Restricted Boltzmann Machine (RBM).\n MLPClassifier : Multi-layer Perceptron classifier.\n sklearn.linear_model.SGDRegressor : Linear model fitted by minimizing\n a regularized empirical loss with SGD.\n\n Notes\n -----\n MLPRegressor trains iteratively since at each time step\n the partial derivatives of the loss function with respect to the model\n parameters are computed to update the parameters.\n\n It can also have a regularization term added to the loss function\n that shrinks model parameters to prevent overfitting.\n\n This implementation works with data represented as dense and sparse numpy\n arrays of floating point values.\n\n References\n ----------\n Hinton, Geoffrey E.\n \"Connectionist learning procedures.\" Artificial intelligence 40.1\n (1989): 185-234.\n\n Glorot, Xavier, and Yoshua Bengio. \"Understanding the difficulty of\n training deep feedforward neural networks.\" International Conference\n on Artificial Intelligence and Statistics. 2010.\n\n He, Kaiming, et al. \"Delving deep into rectifiers: Surpassing human-level\n performance on imagenet classification.\" arXiv preprint\n arXiv:1502.01852 (2015).\n\n Kingma, Diederik, and Jimmy Ba. \"Adam: A method for stochastic\n optimization.\" arXiv preprint arXiv:1412.6980 (2014).\n\n Examples\n --------\n >>> from sklearn.neural_network import MLPRegressor\n >>> from sklearn.datasets import make_regression\n >>> from sklearn.model_selection import train_test_split\n >>> X, y = make_regression(n_samples=200, random_state=1)\n >>> X_train, X_test, y_train, y_test = train_test_split(X, y,\n ... random_state=1)\n >>> regr = MLPRegressor(random_state=1, max_iter=500).fit(X_train, y_train)\n >>> regr.predict(X_test[:2])\n array([-0.9..., -7.1...])\n >>> regr.score(X_test, y_test)\n 0.4...\n ", - "source_code": "\n\nclass MLPRegressor(RegressorMixin, BaseMultilayerPerceptron):\n \"\"\"Multi-layer Perceptron regressor.\n\n This model optimizes the squared error using LBFGS or stochastic gradient\n descent.\n\n .. 
versionadded:: 0.18\n\n Parameters\n ----------\n hidden_layer_sizes : tuple, length = n_layers - 2, default=(100,)\n The ith element represents the number of neurons in the ith\n hidden layer.\n\n activation : {'identity', 'logistic', 'tanh', 'relu'}, default='relu'\n Activation function for the hidden layer.\n\n - 'identity', no-op activation, useful to implement linear bottleneck,\n returns f(x) = x\n\n - 'logistic', the logistic sigmoid function,\n returns f(x) = 1 / (1 + exp(-x)).\n\n - 'tanh', the hyperbolic tan function,\n returns f(x) = tanh(x).\n\n - 'relu', the rectified linear unit function,\n returns f(x) = max(0, x)\n\n solver : {'lbfgs', 'sgd', 'adam'}, default='adam'\n The solver for weight optimization.\n\n - 'lbfgs' is an optimizer in the family of quasi-Newton methods.\n\n - 'sgd' refers to stochastic gradient descent.\n\n - 'adam' refers to a stochastic gradient-based optimizer proposed by\n Kingma, Diederik, and Jimmy Ba\n\n Note: The default solver 'adam' works pretty well on relatively\n large datasets (with thousands of training samples or more) in terms of\n both training time and validation score.\n For small datasets, however, 'lbfgs' can converge faster and perform\n better.\n\n alpha : float, default=0.0001\n L2 penalty (regularization term) parameter.\n\n batch_size : int, default='auto'\n Size of minibatches for stochastic optimizers.\n If the solver is 'lbfgs', the classifier will not use minibatch.\n When set to \"auto\", `batch_size=min(200, n_samples)`.\n\n learning_rate : {'constant', 'invscaling', 'adaptive'}, default='constant'\n Learning rate schedule for weight updates.\n\n - 'constant' is a constant learning rate given by\n 'learning_rate_init'.\n\n - 'invscaling' gradually decreases the learning rate ``learning_rate_``\n at each time step 't' using an inverse scaling exponent of 'power_t'.\n effective_learning_rate = learning_rate_init / pow(t, power_t)\n\n - 'adaptive' keeps the learning rate constant to\n 'learning_rate_init' as long as training loss keeps decreasing.\n Each time two consecutive epochs fail to decrease training loss by at\n least tol, or fail to increase validation score by at least tol if\n 'early_stopping' is on, the current learning rate is divided by 5.\n\n Only used when solver='sgd'.\n\n learning_rate_init : double, default=0.001\n The initial learning rate used. It controls the step-size\n in updating the weights. Only used when solver='sgd' or 'adam'.\n\n power_t : double, default=0.5\n The exponent for inverse scaling learning rate.\n It is used in updating effective learning rate when the learning_rate\n is set to 'invscaling'. Only used when solver='sgd'.\n\n max_iter : int, default=200\n Maximum number of iterations. The solver iterates until convergence\n (determined by 'tol') or this number of iterations. For stochastic\n solvers ('sgd', 'adam'), note that this determines the number of epochs\n (how many times each data point will be used), not the number of\n gradient steps.\n\n shuffle : bool, default=True\n Whether to shuffle samples in each iteration. Only used when\n solver='sgd' or 'adam'.\n\n random_state : int, RandomState instance, default=None\n Determines random number generation for weights and bias\n initialization, train-test split if early stopping is used, and batch\n sampling when solver='sgd' or 'adam'.\n Pass an int for reproducible results across multiple function calls.\n See :term:`Glossary `.\n\n tol : float, default=1e-4\n Tolerance for the optimization. 
When the loss or score is not improving\n by at least ``tol`` for ``n_iter_no_change`` consecutive iterations,\n unless ``learning_rate`` is set to 'adaptive', convergence is\n considered to be reached and training stops.\n\n verbose : bool, default=False\n Whether to print progress messages to stdout.\n\n warm_start : bool, default=False\n When set to True, reuse the solution of the previous\n call to fit as initialization, otherwise, just erase the\n previous solution. See :term:`the Glossary `.\n\n momentum : float, default=0.9\n Momentum for gradient descent update. Should be between 0 and 1. Only\n used when solver='sgd'.\n\n nesterovs_momentum : bool, default=True\n Whether to use Nesterov's momentum. Only used when solver='sgd' and\n momentum > 0.\n\n early_stopping : bool, default=False\n Whether to use early stopping to terminate training when validation\n score is not improving. If set to true, it will automatically set\n aside 10% of training data as validation and terminate training when\n validation score is not improving by at least ``tol`` for\n ``n_iter_no_change`` consecutive epochs.\n Only effective when solver='sgd' or 'adam'.\n\n validation_fraction : float, default=0.1\n The proportion of training data to set aside as validation set for\n early stopping. Must be between 0 and 1.\n Only used if early_stopping is True.\n\n beta_1 : float, default=0.9\n Exponential decay rate for estimates of first moment vector in adam,\n should be in [0, 1). Only used when solver='adam'.\n\n beta_2 : float, default=0.999\n Exponential decay rate for estimates of second moment vector in adam,\n should be in [0, 1). Only used when solver='adam'.\n\n epsilon : float, default=1e-8\n Value for numerical stability in adam. Only used when solver='adam'.\n\n n_iter_no_change : int, default=10\n Maximum number of epochs to not meet ``tol`` improvement.\n Only effective when solver='sgd' or 'adam'.\n\n .. versionadded:: 0.20\n\n max_fun : int, default=15000\n Only used when solver='lbfgs'. Maximum number of function calls.\n The solver iterates until convergence (determined by 'tol'), number\n of iterations reaches max_iter, or this number of function calls.\n Note that number of function calls will be greater than or equal to\n the number of iterations for the MLPRegressor.\n\n .. versionadded:: 0.22\n\n Attributes\n ----------\n loss_ : float\n The current loss computed with the loss function.\n\n best_loss_ : float\n The minimum loss reached by the solver throughout fitting.\n\n loss_curve_ : list of shape (`n_iter_`,)\n Loss value evaluated at the end of each training step.\n The ith element in the list represents the loss at the ith iteration.\n\n t_ : int\n The number of training samples seen by the solver during fitting.\n Mathematically equals `n_iters * X.shape[0]`, it means\n `time_step` and it is used by optimizer's learning rate scheduler.\n\n coefs_ : list of shape (n_layers - 1,)\n The ith element in the list represents the weight matrix corresponding\n to layer i.\n\n intercepts_ : list of shape (n_layers - 1,)\n The ith element in the list represents the bias vector corresponding to\n layer i + 1.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. 
versionadded:: 1.0\n\n n_iter_ : int\n The number of iterations the solver has run.\n\n n_layers_ : int\n Number of layers.\n\n n_outputs_ : int\n Number of outputs.\n\n out_activation_ : str\n Name of the output activation function.\n\n See Also\n --------\n BernoulliRBM : Bernoulli Restricted Boltzmann Machine (RBM).\n MLPClassifier : Multi-layer Perceptron classifier.\n sklearn.linear_model.SGDRegressor : Linear model fitted by minimizing\n a regularized empirical loss with SGD.\n\n Notes\n -----\n MLPRegressor trains iteratively since at each time step\n the partial derivatives of the loss function with respect to the model\n parameters are computed to update the parameters.\n\n It can also have a regularization term added to the loss function\n that shrinks model parameters to prevent overfitting.\n\n This implementation works with data represented as dense and sparse numpy\n arrays of floating point values.\n\n References\n ----------\n Hinton, Geoffrey E.\n \"Connectionist learning procedures.\" Artificial intelligence 40.1\n (1989): 185-234.\n\n Glorot, Xavier, and Yoshua Bengio. \"Understanding the difficulty of\n training deep feedforward neural networks.\" International Conference\n on Artificial Intelligence and Statistics. 2010.\n\n He, Kaiming, et al. \"Delving deep into rectifiers: Surpassing human-level\n performance on imagenet classification.\" arXiv preprint\n arXiv:1502.01852 (2015).\n\n Kingma, Diederik, and Jimmy Ba. \"Adam: A method for stochastic\n optimization.\" arXiv preprint arXiv:1412.6980 (2014).\n\n Examples\n --------\n >>> from sklearn.neural_network import MLPRegressor\n >>> from sklearn.datasets import make_regression\n >>> from sklearn.model_selection import train_test_split\n >>> X, y = make_regression(n_samples=200, random_state=1)\n >>> X_train, X_test, y_train, y_test = train_test_split(X, y,\n ... 
random_state=1)\n >>> regr = MLPRegressor(random_state=1, max_iter=500).fit(X_train, y_train)\n >>> regr.predict(X_test[:2])\n array([-0.9..., -7.1...])\n >>> regr.score(X_test, y_test)\n 0.4...\n \"\"\"\n \n def __init__(self, hidden_layer_sizes=(100, ), activation='relu', *, solver='adam', alpha=0.0001, batch_size='auto', learning_rate='constant', learning_rate_init=0.001, power_t=0.5, max_iter=200, shuffle=True, random_state=None, tol=0.0001, verbose=False, warm_start=False, momentum=0.9, nesterovs_momentum=True, early_stopping=False, validation_fraction=0.1, beta_1=0.9, beta_2=0.999, epsilon=1e-08, n_iter_no_change=10, max_fun=15000):\n super().__init__(hidden_layer_sizes=hidden_layer_sizes, activation=activation, solver=solver, alpha=alpha, batch_size=batch_size, learning_rate=learning_rate, learning_rate_init=learning_rate_init, power_t=power_t, max_iter=max_iter, loss='squared_error', shuffle=shuffle, random_state=random_state, tol=tol, verbose=verbose, warm_start=warm_start, momentum=momentum, nesterovs_momentum=nesterovs_momentum, early_stopping=early_stopping, validation_fraction=validation_fraction, beta_1=beta_1, beta_2=beta_2, epsilon=epsilon, n_iter_no_change=n_iter_no_change, max_fun=max_fun)\n \n def predict(self, X):\n \"\"\"Predict using the multi-layer perceptron model.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input data.\n\n Returns\n -------\n y : ndarray of shape (n_samples, n_outputs)\n The predicted values.\n \"\"\"\n check_is_fitted(self)\n y_pred = self._forward_pass_fast(X)\n if y_pred.shape[1] == 1:\n return y_pred.ravel()\n return y_pred\n \n def _validate_input(self, X, y, incremental, reset):\n (X, y) = self._validate_data(X, y, accept_sparse=['csr', 'csc'], multi_output=True, y_numeric=True, dtype=(np.float64, np.float32), reset=reset)\n if y.ndim == 2 and y.shape[1] == 1:\n y = column_or_1d(y, warn=True)\n return X, y\n" + "description": "Multi-layer Perceptron regressor.\n\nThis model optimizes the squared error using LBFGS or stochastic gradient\ndescent.\n\n.. versionadded:: 0.18", + "docstring": "Multi-layer Perceptron regressor.\n\n This model optimizes the squared error using LBFGS or stochastic gradient\n descent.\n\n .. 
versionadded:: 0.18\n\n Parameters\n ----------\n hidden_layer_sizes : tuple, length = n_layers - 2, default=(100,)\n The ith element represents the number of neurons in the ith\n hidden layer.\n\n activation : {'identity', 'logistic', 'tanh', 'relu'}, default='relu'\n Activation function for the hidden layer.\n\n - 'identity', no-op activation, useful to implement linear bottleneck,\n returns f(x) = x\n\n - 'logistic', the logistic sigmoid function,\n returns f(x) = 1 / (1 + exp(-x)).\n\n - 'tanh', the hyperbolic tan function,\n returns f(x) = tanh(x).\n\n - 'relu', the rectified linear unit function,\n returns f(x) = max(0, x)\n\n solver : {'lbfgs', 'sgd', 'adam'}, default='adam'\n The solver for weight optimization.\n\n - 'lbfgs' is an optimizer in the family of quasi-Newton methods.\n\n - 'sgd' refers to stochastic gradient descent.\n\n - 'adam' refers to a stochastic gradient-based optimizer proposed by\n Kingma, Diederik, and Jimmy Ba\n\n Note: The default solver 'adam' works pretty well on relatively\n large datasets (with thousands of training samples or more) in terms of\n both training time and validation score.\n For small datasets, however, 'lbfgs' can converge faster and perform\n better.\n\n alpha : float, default=0.0001\n L2 penalty (regularization term) parameter.\n\n batch_size : int, default='auto'\n Size of minibatches for stochastic optimizers.\n If the solver is 'lbfgs', the classifier will not use minibatch.\n When set to \"auto\", `batch_size=min(200, n_samples)`.\n\n learning_rate : {'constant', 'invscaling', 'adaptive'}, default='constant'\n Learning rate schedule for weight updates.\n\n - 'constant' is a constant learning rate given by\n 'learning_rate_init'.\n\n - 'invscaling' gradually decreases the learning rate ``learning_rate_``\n at each time step 't' using an inverse scaling exponent of 'power_t'.\n effective_learning_rate = learning_rate_init / pow(t, power_t)\n\n - 'adaptive' keeps the learning rate constant to\n 'learning_rate_init' as long as training loss keeps decreasing.\n Each time two consecutive epochs fail to decrease training loss by at\n least tol, or fail to increase validation score by at least tol if\n 'early_stopping' is on, the current learning rate is divided by 5.\n\n Only used when solver='sgd'.\n\n learning_rate_init : float, default=0.001\n The initial learning rate used. It controls the step-size\n in updating the weights. Only used when solver='sgd' or 'adam'.\n\n power_t : float, default=0.5\n The exponent for inverse scaling learning rate.\n It is used in updating effective learning rate when the learning_rate\n is set to 'invscaling'. Only used when solver='sgd'.\n\n max_iter : int, default=200\n Maximum number of iterations. The solver iterates until convergence\n (determined by 'tol') or this number of iterations. For stochastic\n solvers ('sgd', 'adam'), note that this determines the number of epochs\n (how many times each data point will be used), not the number of\n gradient steps.\n\n shuffle : bool, default=True\n Whether to shuffle samples in each iteration. Only used when\n solver='sgd' or 'adam'.\n\n random_state : int, RandomState instance, default=None\n Determines random number generation for weights and bias\n initialization, train-test split if early stopping is used, and batch\n sampling when solver='sgd' or 'adam'.\n Pass an int for reproducible results across multiple function calls.\n See :term:`Glossary `.\n\n tol : float, default=1e-4\n Tolerance for the optimization. 
When the loss or score is not improving\n by at least ``tol`` for ``n_iter_no_change`` consecutive iterations,\n unless ``learning_rate`` is set to 'adaptive', convergence is\n considered to be reached and training stops.\n\n verbose : bool, default=False\n Whether to print progress messages to stdout.\n\n warm_start : bool, default=False\n When set to True, reuse the solution of the previous\n call to fit as initialization, otherwise, just erase the\n previous solution. See :term:`the Glossary `.\n\n momentum : float, default=0.9\n Momentum for gradient descent update. Should be between 0 and 1. Only\n used when solver='sgd'.\n\n nesterovs_momentum : bool, default=True\n Whether to use Nesterov's momentum. Only used when solver='sgd' and\n momentum > 0.\n\n early_stopping : bool, default=False\n Whether to use early stopping to terminate training when validation\n score is not improving. If set to true, it will automatically set\n aside 10% of training data as validation and terminate training when\n validation score is not improving by at least ``tol`` for\n ``n_iter_no_change`` consecutive epochs.\n Only effective when solver='sgd' or 'adam'.\n\n validation_fraction : float, default=0.1\n The proportion of training data to set aside as validation set for\n early stopping. Must be between 0 and 1.\n Only used if early_stopping is True.\n\n beta_1 : float, default=0.9\n Exponential decay rate for estimates of first moment vector in adam,\n should be in [0, 1). Only used when solver='adam'.\n\n beta_2 : float, default=0.999\n Exponential decay rate for estimates of second moment vector in adam,\n should be in [0, 1). Only used when solver='adam'.\n\n epsilon : float, default=1e-8\n Value for numerical stability in adam. Only used when solver='adam'.\n\n n_iter_no_change : int, default=10\n Maximum number of epochs to not meet ``tol`` improvement.\n Only effective when solver='sgd' or 'adam'.\n\n .. versionadded:: 0.20\n\n max_fun : int, default=15000\n Only used when solver='lbfgs'. Maximum number of function calls.\n The solver iterates until convergence (determined by 'tol'), number\n of iterations reaches max_iter, or this number of function calls.\n Note that number of function calls will be greater than or equal to\n the number of iterations for the MLPRegressor.\n\n .. versionadded:: 0.22\n\n Attributes\n ----------\n loss_ : float\n The current loss computed with the loss function.\n\n best_loss_ : float\n The minimum loss reached by the solver throughout fitting.\n\n loss_curve_ : list of shape (`n_iter_`,)\n Loss value evaluated at the end of each training step.\n The ith element in the list represents the loss at the ith iteration.\n\n t_ : int\n The number of training samples seen by the solver during fitting.\n Mathematically equals `n_iters * X.shape[0]`, it means\n `time_step` and it is used by optimizer's learning rate scheduler.\n\n coefs_ : list of shape (n_layers - 1,)\n The ith element in the list represents the weight matrix corresponding\n to layer i.\n\n intercepts_ : list of shape (n_layers - 1,)\n The ith element in the list represents the bias vector corresponding to\n layer i + 1.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. 
versionadded:: 1.0\n\n n_iter_ : int\n The number of iterations the solver has run.\n\n n_layers_ : int\n Number of layers.\n\n n_outputs_ : int\n Number of outputs.\n\n out_activation_ : str\n Name of the output activation function.\n\n See Also\n --------\n BernoulliRBM : Bernoulli Restricted Boltzmann Machine (RBM).\n MLPClassifier : Multi-layer Perceptron classifier.\n sklearn.linear_model.SGDRegressor : Linear model fitted by minimizing\n a regularized empirical loss with SGD.\n\n Notes\n -----\n MLPRegressor trains iteratively since at each time step\n the partial derivatives of the loss function with respect to the model\n parameters are computed to update the parameters.\n\n It can also have a regularization term added to the loss function\n that shrinks model parameters to prevent overfitting.\n\n This implementation works with data represented as dense and sparse numpy\n arrays of floating point values.\n\n References\n ----------\n Hinton, Geoffrey E.\n \"Connectionist learning procedures.\" Artificial intelligence 40.1\n (1989): 185-234.\n\n Glorot, Xavier, and Yoshua Bengio. \"Understanding the difficulty of\n training deep feedforward neural networks.\" International Conference\n on Artificial Intelligence and Statistics. 2010.\n\n He, Kaiming, et al. \"Delving deep into rectifiers: Surpassing human-level\n performance on imagenet classification.\" arXiv preprint\n arXiv:1502.01852 (2015).\n\n Kingma, Diederik, and Jimmy Ba. \"Adam: A method for stochastic\n optimization.\" arXiv preprint arXiv:1412.6980 (2014).\n\n Examples\n --------\n >>> from sklearn.neural_network import MLPRegressor\n >>> from sklearn.datasets import make_regression\n >>> from sklearn.model_selection import train_test_split\n >>> X, y = make_regression(n_samples=200, random_state=1)\n >>> X_train, X_test, y_train, y_test = train_test_split(X, y,\n ... random_state=1)\n >>> regr = MLPRegressor(random_state=1, max_iter=500).fit(X_train, y_train)\n >>> regr.predict(X_test[:2])\n array([-0.9..., -7.1...])\n >>> regr.score(X_test, y_test)\n 0.4...\n ", + "source_code": "\n\nclass MLPRegressor(RegressorMixin, BaseMultilayerPerceptron):\n \"\"\"Multi-layer Perceptron regressor.\n\n This model optimizes the squared error using LBFGS or stochastic gradient\n descent.\n\n .. 
versionadded:: 0.18\n\n Parameters\n ----------\n hidden_layer_sizes : tuple, length = n_layers - 2, default=(100,)\n The ith element represents the number of neurons in the ith\n hidden layer.\n\n activation : {'identity', 'logistic', 'tanh', 'relu'}, default='relu'\n Activation function for the hidden layer.\n\n - 'identity', no-op activation, useful to implement linear bottleneck,\n returns f(x) = x\n\n - 'logistic', the logistic sigmoid function,\n returns f(x) = 1 / (1 + exp(-x)).\n\n - 'tanh', the hyperbolic tan function,\n returns f(x) = tanh(x).\n\n - 'relu', the rectified linear unit function,\n returns f(x) = max(0, x)\n\n solver : {'lbfgs', 'sgd', 'adam'}, default='adam'\n The solver for weight optimization.\n\n - 'lbfgs' is an optimizer in the family of quasi-Newton methods.\n\n - 'sgd' refers to stochastic gradient descent.\n\n - 'adam' refers to a stochastic gradient-based optimizer proposed by\n Kingma, Diederik, and Jimmy Ba\n\n Note: The default solver 'adam' works pretty well on relatively\n large datasets (with thousands of training samples or more) in terms of\n both training time and validation score.\n For small datasets, however, 'lbfgs' can converge faster and perform\n better.\n\n alpha : float, default=0.0001\n L2 penalty (regularization term) parameter.\n\n batch_size : int, default='auto'\n Size of minibatches for stochastic optimizers.\n If the solver is 'lbfgs', the classifier will not use minibatch.\n When set to \"auto\", `batch_size=min(200, n_samples)`.\n\n learning_rate : {'constant', 'invscaling', 'adaptive'}, default='constant'\n Learning rate schedule for weight updates.\n\n - 'constant' is a constant learning rate given by\n 'learning_rate_init'.\n\n - 'invscaling' gradually decreases the learning rate ``learning_rate_``\n at each time step 't' using an inverse scaling exponent of 'power_t'.\n effective_learning_rate = learning_rate_init / pow(t, power_t)\n\n - 'adaptive' keeps the learning rate constant to\n 'learning_rate_init' as long as training loss keeps decreasing.\n Each time two consecutive epochs fail to decrease training loss by at\n least tol, or fail to increase validation score by at least tol if\n 'early_stopping' is on, the current learning rate is divided by 5.\n\n Only used when solver='sgd'.\n\n learning_rate_init : float, default=0.001\n The initial learning rate used. It controls the step-size\n in updating the weights. Only used when solver='sgd' or 'adam'.\n\n power_t : float, default=0.5\n The exponent for inverse scaling learning rate.\n It is used in updating effective learning rate when the learning_rate\n is set to 'invscaling'. Only used when solver='sgd'.\n\n max_iter : int, default=200\n Maximum number of iterations. The solver iterates until convergence\n (determined by 'tol') or this number of iterations. For stochastic\n solvers ('sgd', 'adam'), note that this determines the number of epochs\n (how many times each data point will be used), not the number of\n gradient steps.\n\n shuffle : bool, default=True\n Whether to shuffle samples in each iteration. Only used when\n solver='sgd' or 'adam'.\n\n random_state : int, RandomState instance, default=None\n Determines random number generation for weights and bias\n initialization, train-test split if early stopping is used, and batch\n sampling when solver='sgd' or 'adam'.\n Pass an int for reproducible results across multiple function calls.\n See :term:`Glossary `.\n\n tol : float, default=1e-4\n Tolerance for the optimization. 
When the loss or score is not improving\n by at least ``tol`` for ``n_iter_no_change`` consecutive iterations,\n unless ``learning_rate`` is set to 'adaptive', convergence is\n considered to be reached and training stops.\n\n verbose : bool, default=False\n Whether to print progress messages to stdout.\n\n warm_start : bool, default=False\n When set to True, reuse the solution of the previous\n call to fit as initialization, otherwise, just erase the\n previous solution. See :term:`the Glossary `.\n\n momentum : float, default=0.9\n Momentum for gradient descent update. Should be between 0 and 1. Only\n used when solver='sgd'.\n\n nesterovs_momentum : bool, default=True\n Whether to use Nesterov's momentum. Only used when solver='sgd' and\n momentum > 0.\n\n early_stopping : bool, default=False\n Whether to use early stopping to terminate training when validation\n score is not improving. If set to true, it will automatically set\n aside 10% of training data as validation and terminate training when\n validation score is not improving by at least ``tol`` for\n ``n_iter_no_change`` consecutive epochs.\n Only effective when solver='sgd' or 'adam'.\n\n validation_fraction : float, default=0.1\n The proportion of training data to set aside as validation set for\n early stopping. Must be between 0 and 1.\n Only used if early_stopping is True.\n\n beta_1 : float, default=0.9\n Exponential decay rate for estimates of first moment vector in adam,\n should be in [0, 1). Only used when solver='adam'.\n\n beta_2 : float, default=0.999\n Exponential decay rate for estimates of second moment vector in adam,\n should be in [0, 1). Only used when solver='adam'.\n\n epsilon : float, default=1e-8\n Value for numerical stability in adam. Only used when solver='adam'.\n\n n_iter_no_change : int, default=10\n Maximum number of epochs to not meet ``tol`` improvement.\n Only effective when solver='sgd' or 'adam'.\n\n .. versionadded:: 0.20\n\n max_fun : int, default=15000\n Only used when solver='lbfgs'. Maximum number of function calls.\n The solver iterates until convergence (determined by 'tol'), number\n of iterations reaches max_iter, or this number of function calls.\n Note that number of function calls will be greater than or equal to\n the number of iterations for the MLPRegressor.\n\n .. versionadded:: 0.22\n\n Attributes\n ----------\n loss_ : float\n The current loss computed with the loss function.\n\n best_loss_ : float\n The minimum loss reached by the solver throughout fitting.\n\n loss_curve_ : list of shape (`n_iter_`,)\n Loss value evaluated at the end of each training step.\n The ith element in the list represents the loss at the ith iteration.\n\n t_ : int\n The number of training samples seen by the solver during fitting.\n Mathematically equals `n_iters * X.shape[0]`, it means\n `time_step` and it is used by optimizer's learning rate scheduler.\n\n coefs_ : list of shape (n_layers - 1,)\n The ith element in the list represents the weight matrix corresponding\n to layer i.\n\n intercepts_ : list of shape (n_layers - 1,)\n The ith element in the list represents the bias vector corresponding to\n layer i + 1.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. 
versionadded:: 1.0\n\n n_iter_ : int\n The number of iterations the solver has run.\n\n n_layers_ : int\n Number of layers.\n\n n_outputs_ : int\n Number of outputs.\n\n out_activation_ : str\n Name of the output activation function.\n\n See Also\n --------\n BernoulliRBM : Bernoulli Restricted Boltzmann Machine (RBM).\n MLPClassifier : Multi-layer Perceptron classifier.\n sklearn.linear_model.SGDRegressor : Linear model fitted by minimizing\n a regularized empirical loss with SGD.\n\n Notes\n -----\n MLPRegressor trains iteratively since at each time step\n the partial derivatives of the loss function with respect to the model\n parameters are computed to update the parameters.\n\n It can also have a regularization term added to the loss function\n that shrinks model parameters to prevent overfitting.\n\n This implementation works with data represented as dense and sparse numpy\n arrays of floating point values.\n\n References\n ----------\n Hinton, Geoffrey E.\n \"Connectionist learning procedures.\" Artificial intelligence 40.1\n (1989): 185-234.\n\n Glorot, Xavier, and Yoshua Bengio. \"Understanding the difficulty of\n training deep feedforward neural networks.\" International Conference\n on Artificial Intelligence and Statistics. 2010.\n\n He, Kaiming, et al. \"Delving deep into rectifiers: Surpassing human-level\n performance on imagenet classification.\" arXiv preprint\n arXiv:1502.01852 (2015).\n\n Kingma, Diederik, and Jimmy Ba. \"Adam: A method for stochastic\n optimization.\" arXiv preprint arXiv:1412.6980 (2014).\n\n Examples\n --------\n >>> from sklearn.neural_network import MLPRegressor\n >>> from sklearn.datasets import make_regression\n >>> from sklearn.model_selection import train_test_split\n >>> X, y = make_regression(n_samples=200, random_state=1)\n >>> X_train, X_test, y_train, y_test = train_test_split(X, y,\n ... 
random_state=1)\n >>> regr = MLPRegressor(random_state=1, max_iter=500).fit(X_train, y_train)\n >>> regr.predict(X_test[:2])\n array([-0.9..., -7.1...])\n >>> regr.score(X_test, y_test)\n 0.4...\n \"\"\"\n \n def __init__(self, hidden_layer_sizes=(100, ), activation='relu', *, solver='adam', alpha=0.0001, batch_size='auto', learning_rate='constant', learning_rate_init=0.001, power_t=0.5, max_iter=200, shuffle=True, random_state=None, tol=0.0001, verbose=False, warm_start=False, momentum=0.9, nesterovs_momentum=True, early_stopping=False, validation_fraction=0.1, beta_1=0.9, beta_2=0.999, epsilon=1e-08, n_iter_no_change=10, max_fun=15000):\n super().__init__(hidden_layer_sizes=hidden_layer_sizes, activation=activation, solver=solver, alpha=alpha, batch_size=batch_size, learning_rate=learning_rate, learning_rate_init=learning_rate_init, power_t=power_t, max_iter=max_iter, loss='squared_error', shuffle=shuffle, random_state=random_state, tol=tol, verbose=verbose, warm_start=warm_start, momentum=momentum, nesterovs_momentum=nesterovs_momentum, early_stopping=early_stopping, validation_fraction=validation_fraction, beta_1=beta_1, beta_2=beta_2, epsilon=epsilon, n_iter_no_change=n_iter_no_change, max_fun=max_fun)\n \n def predict(self, X):\n \"\"\"Predict using the multi-layer perceptron model.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input data.\n\n Returns\n -------\n y : ndarray of shape (n_samples, n_outputs)\n The predicted values.\n \"\"\"\n check_is_fitted(self)\n y_pred = self._forward_pass_fast(X)\n if y_pred.shape[1] == 1:\n return y_pred.ravel()\n return y_pred\n \n def _validate_input(self, X, y, incremental, reset):\n (X, y) = self._validate_data(X, y, accept_sparse=['csr', 'csc'], multi_output=True, y_numeric=True, dtype=(np.float64, np.float32), reset=reset)\n if y.ndim == 2 and y.shape[1] == 1:\n y = column_or_1d(y, warn=True)\n return X, y\n" }, { "name": "BernoulliRBM", @@ -25675,7 +25771,7 @@ "sklearn.neural_network._rbm.BernoulliRBM._more_tags" ], "is_public": true, - "description": "Bernoulli Restricted Boltzmann Machine (RBM).\n\nA Restricted Boltzmann Machine with binary visible units and binary hidden units. Parameters are estimated using Stochastic Maximum Likelihood (SML), also known as Persistent Contrastive Divergence (PCD) [2]. The time complexity of this implementation is ``O(d ** 2)`` assuming d ~ n_features ~ n_components. Read more in the :ref:`User Guide `.", + "description": "Bernoulli Restricted Boltzmann Machine (RBM).\n\nA Restricted Boltzmann Machine with binary visible units and\nbinary hidden units. Parameters are estimated using Stochastic Maximum\nLikelihood (SML), also known as Persistent Contrastive Divergence (PCD)\n[2].\n\nThe time complexity of this implementation is ``O(d ** 2)`` assuming\nd ~ n_features ~ n_components.\n\nRead more in the :ref:`User Guide `.", "docstring": "Bernoulli Restricted Boltzmann Machine (RBM).\n\n A Restricted Boltzmann Machine with binary visible units and\n binary hidden units. Parameters are estimated using Stochastic Maximum\n Likelihood (SML), also known as Persistent Contrastive Divergence (PCD)\n [2].\n\n The time complexity of this implementation is ``O(d ** 2)`` assuming\n d ~ n_features ~ n_components.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n n_components : int, default=256\n Number of binary hidden units.\n\n learning_rate : float, default=0.1\n The learning rate for weight updates. 
It is *highly* recommended\n to tune this hyper-parameter. Reasonable values are in the\n 10**[0., -3.] range.\n\n batch_size : int, default=10\n Number of examples per minibatch.\n\n n_iter : int, default=10\n Number of iterations/sweeps over the training dataset to perform\n during training.\n\n verbose : int, default=0\n The verbosity level. The default, zero, means silent mode. Range\n of values is [0, inf].\n\n random_state : int, RandomState instance or None, default=None\n Determines random number generation for:\n\n - Gibbs sampling from visible and hidden layers.\n\n - Initializing components, sampling from layers during fit.\n\n - Corrupting the data when scoring samples.\n\n Pass an int for reproducible results across multiple function calls.\n See :term:`Glossary `.\n\n Attributes\n ----------\n intercept_hidden_ : array-like of shape (n_components,)\n Biases of the hidden units.\n\n intercept_visible_ : array-like of shape (n_features,)\n Biases of the visible units.\n\n components_ : array-like of shape (n_components, n_features)\n Weight matrix, where `n_features` is the number of\n visible units and `n_components` is the number of hidden units.\n\n h_samples_ : array-like of shape (batch_size, n_components)\n Hidden Activation sampled from the model distribution,\n where `batch_size` is the number of examples per minibatch and\n `n_components` is the number of hidden units.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n sklearn.neural_network.MLPRegressor : Multi-layer Perceptron regressor.\n sklearn.neural_network.MLPClassifier : Multi-layer Perceptron classifier.\n sklearn.decomposition.PCA : An unsupervised linear dimensionality\n reduction model.\n\n References\n ----------\n\n [1] Hinton, G. E., Osindero, S. and Teh, Y. A fast learning algorithm for\n deep belief nets. Neural Computation 18, pp 1527-1554.\n https://www.cs.toronto.edu/~hinton/absps/fastnc.pdf\n\n [2] Tieleman, T. Training Restricted Boltzmann Machines using\n Approximations to the Likelihood Gradient. International Conference\n on Machine Learning (ICML) 2008\n\n Examples\n --------\n\n >>> import numpy as np\n >>> from sklearn.neural_network import BernoulliRBM\n >>> X = np.array([[0, 0, 0], [0, 1, 1], [1, 0, 1], [1, 1, 1]])\n >>> model = BernoulliRBM(n_components=2)\n >>> model.fit(X)\n BernoulliRBM(n_components=2)\n ", "source_code": "\n\nclass BernoulliRBM(TransformerMixin, BaseEstimator):\n \"\"\"Bernoulli Restricted Boltzmann Machine (RBM).\n\n A Restricted Boltzmann Machine with binary visible units and\n binary hidden units. Parameters are estimated using Stochastic Maximum\n Likelihood (SML), also known as Persistent Contrastive Divergence (PCD)\n [2].\n\n The time complexity of this implementation is ``O(d ** 2)`` assuming\n d ~ n_features ~ n_components.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n n_components : int, default=256\n Number of binary hidden units.\n\n learning_rate : float, default=0.1\n The learning rate for weight updates. It is *highly* recommended\n to tune this hyper-parameter. Reasonable values are in the\n 10**[0., -3.] 
range.\n\n batch_size : int, default=10\n Number of examples per minibatch.\n\n n_iter : int, default=10\n Number of iterations/sweeps over the training dataset to perform\n during training.\n\n verbose : int, default=0\n The verbosity level. The default, zero, means silent mode. Range\n of values is [0, inf].\n\n random_state : int, RandomState instance or None, default=None\n Determines random number generation for:\n\n - Gibbs sampling from visible and hidden layers.\n\n - Initializing components, sampling from layers during fit.\n\n - Corrupting the data when scoring samples.\n\n Pass an int for reproducible results across multiple function calls.\n See :term:`Glossary `.\n\n Attributes\n ----------\n intercept_hidden_ : array-like of shape (n_components,)\n Biases of the hidden units.\n\n intercept_visible_ : array-like of shape (n_features,)\n Biases of the visible units.\n\n components_ : array-like of shape (n_components, n_features)\n Weight matrix, where `n_features` is the number of\n visible units and `n_components` is the number of hidden units.\n\n h_samples_ : array-like of shape (batch_size, n_components)\n Hidden Activation sampled from the model distribution,\n where `batch_size` is the number of examples per minibatch and\n `n_components` is the number of hidden units.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n sklearn.neural_network.MLPRegressor : Multi-layer Perceptron regressor.\n sklearn.neural_network.MLPClassifier : Multi-layer Perceptron classifier.\n sklearn.decomposition.PCA : An unsupervised linear dimensionality\n reduction model.\n\n References\n ----------\n\n [1] Hinton, G. E., Osindero, S. and Teh, Y. A fast learning algorithm for\n deep belief nets. Neural Computation 18, pp 1527-1554.\n https://www.cs.toronto.edu/~hinton/absps/fastnc.pdf\n\n [2] Tieleman, T. Training Restricted Boltzmann Machines using\n Approximations to the Likelihood Gradient. 
International Conference\n on Machine Learning (ICML) 2008\n\n Examples\n --------\n\n >>> import numpy as np\n >>> from sklearn.neural_network import BernoulliRBM\n >>> X = np.array([[0, 0, 0], [0, 1, 1], [1, 0, 1], [1, 1, 1]])\n >>> model = BernoulliRBM(n_components=2)\n >>> model.fit(X)\n BernoulliRBM(n_components=2)\n \"\"\"\n \n def __init__(self, n_components=256, *, learning_rate=0.1, batch_size=10, n_iter=10, verbose=0, random_state=None):\n self.n_components = n_components\n self.learning_rate = learning_rate\n self.batch_size = batch_size\n self.n_iter = n_iter\n self.verbose = verbose\n self.random_state = random_state\n \n def transform(self, X):\n \"\"\"Compute the hidden layer activation probabilities, P(h=1|v=X).\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The data to be transformed.\n\n Returns\n -------\n h : ndarray of shape (n_samples, n_components)\n Latent representations of the data.\n \"\"\"\n check_is_fitted(self)\n X = self._validate_data(X, accept_sparse='csr', reset=False, dtype=(np.float64, np.float32))\n return self._mean_hiddens(X)\n \n def _mean_hiddens(self, v):\n \"\"\"Computes the probabilities P(h=1|v).\n\n Parameters\n ----------\n v : ndarray of shape (n_samples, n_features)\n Values of the visible layer.\n\n Returns\n -------\n h : ndarray of shape (n_samples, n_components)\n Corresponding mean field values for the hidden layer.\n \"\"\"\n p = safe_sparse_dot(v, self.components_.T)\n p += self.intercept_hidden_\n return expit(p, out=p)\n \n def _sample_hiddens(self, v, rng):\n \"\"\"Sample from the distribution P(h|v).\n\n Parameters\n ----------\n v : ndarray of shape (n_samples, n_features)\n Values of the visible layer to sample from.\n\n rng : RandomState instance\n Random number generator to use.\n\n Returns\n -------\n h : ndarray of shape (n_samples, n_components)\n Values of the hidden layer.\n \"\"\"\n p = self._mean_hiddens(v)\n return rng.random_sample(size=p.shape) < p\n \n def _sample_visibles(self, h, rng):\n \"\"\"Sample from the distribution P(v|h).\n\n Parameters\n ----------\n h : ndarray of shape (n_samples, n_components)\n Values of the hidden layer to sample from.\n\n rng : RandomState instance\n Random number generator to use.\n\n Returns\n -------\n v : ndarray of shape (n_samples, n_features)\n Values of the visible layer.\n \"\"\"\n p = np.dot(h, self.components_)\n p += self.intercept_visible_\n expit(p, out=p)\n return rng.random_sample(size=p.shape) < p\n \n def _free_energy(self, v):\n \"\"\"Computes the free energy F(v) = - log sum_h exp(-E(v,h)).\n\n Parameters\n ----------\n v : ndarray of shape (n_samples, n_features)\n Values of the visible layer.\n\n Returns\n -------\n free_energy : ndarray of shape (n_samples,)\n The value of the free energy.\n \"\"\"\n return -safe_sparse_dot(v, self.intercept_visible_) - np.logaddexp(0, safe_sparse_dot(v, self.components_.T) + self.intercept_hidden_).sum(axis=1)\n \n def gibbs(self, v):\n \"\"\"Perform one Gibbs sampling step.\n\n Parameters\n ----------\n v : ndarray of shape (n_samples, n_features)\n Values of the visible layer to start from.\n\n Returns\n -------\n v_new : ndarray of shape (n_samples, n_features)\n Values of the visible layer after one Gibbs step.\n \"\"\"\n check_is_fitted(self)\n if not hasattr(self, 'random_state_'):\n self.random_state_ = check_random_state(self.random_state)\n h_ = self._sample_hiddens(v, self.random_state_)\n v_ = self._sample_visibles(h_, self.random_state_)\n return v_\n \n def 
partial_fit(self, X, y=None):\n \"\"\"Fit the model to the partial segment of the data X.\n\n Parameters\n ----------\n X : ndarray of shape (n_samples, n_features)\n Training data.\n\n y : array-like of shape (n_samples,) or (n_samples, n_outputs), default=None\n Target values (None for unsupervised transformations).\n\n Returns\n -------\n self : BernoulliRBM\n The fitted model.\n \"\"\"\n first_pass = not hasattr(self, 'components_')\n X = self._validate_data(X, accept_sparse='csr', dtype=np.float64, reset=first_pass)\n if not hasattr(self, 'random_state_'):\n self.random_state_ = check_random_state(self.random_state)\n if not hasattr(self, 'components_'):\n self.components_ = np.asarray(self.random_state_.normal(0, 0.01, (self.n_components, X.shape[1])), order='F')\n if not hasattr(self, 'intercept_hidden_'):\n self.intercept_hidden_ = np.zeros(self.n_components)\n if not hasattr(self, 'intercept_visible_'):\n self.intercept_visible_ = np.zeros(X.shape[1])\n if not hasattr(self, 'h_samples_'):\n self.h_samples_ = np.zeros((self.batch_size, self.n_components))\n self._fit(X, self.random_state_)\n \n def _fit(self, v_pos, rng):\n \"\"\"Inner fit for one mini-batch.\n\n Adjust the parameters to maximize the likelihood of v using\n Stochastic Maximum Likelihood (SML).\n\n Parameters\n ----------\n v_pos : ndarray of shape (n_samples, n_features)\n The data to use for training.\n\n rng : RandomState instance\n Random number generator to use for sampling.\n \"\"\"\n h_pos = self._mean_hiddens(v_pos)\n v_neg = self._sample_visibles(self.h_samples_, rng)\n h_neg = self._mean_hiddens(v_neg)\n lr = float(self.learning_rate) / v_pos.shape[0]\n update = safe_sparse_dot(v_pos.T, h_pos, dense_output=True).T\n update -= np.dot(h_neg.T, v_neg)\n self.components_ += lr * update\n self.intercept_hidden_ += lr * (h_pos.sum(axis=0) - h_neg.sum(axis=0))\n self.intercept_visible_ += lr * (np.asarray(v_pos.sum(axis=0)).squeeze() - v_neg.sum(axis=0))\n h_neg[rng.uniform(size=h_neg.shape) < h_neg] = 1.0\n self.h_samples_ = np.floor(h_neg, h_neg)\n \n def score_samples(self, X):\n \"\"\"Compute the pseudo-likelihood of X.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Values of the visible layer. 
Must be all-boolean (not checked).\n\n Returns\n -------\n pseudo_likelihood : ndarray of shape (n_samples,)\n Value of the pseudo-likelihood (proxy for likelihood).\n\n Notes\n -----\n This method is not deterministic: it computes a quantity called the\n free energy on X, then on a randomly corrupted version of X, and\n returns the log of the logistic function of the difference.\n \"\"\"\n check_is_fitted(self)\n v = self._validate_data(X, accept_sparse='csr', reset=False)\n rng = check_random_state(self.random_state)\n ind = (np.arange(v.shape[0]), rng.randint(0, v.shape[1], v.shape[0]))\n if sp.issparse(v):\n data = -2 * v[ind] + 1\n v_ = v + sp.csr_matrix((data.A.ravel(), ind), shape=v.shape)\n else:\n v_ = v.copy()\n v_[ind] = 1 - v_[ind]\n fe = self._free_energy(v)\n fe_ = self._free_energy(v_)\n return v.shape[1] * log_logistic(fe_ - fe)\n \n def fit(self, X, y=None):\n \"\"\"Fit the model to the data X.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training data.\n\n y : array-like of shape (n_samples,) or (n_samples, n_outputs), default=None\n Target values (None for unsupervised transformations).\n\n Returns\n -------\n self : BernoulliRBM\n The fitted model.\n \"\"\"\n X = self._validate_data(X, accept_sparse='csr', dtype=(np.float64, np.float32))\n n_samples = X.shape[0]\n rng = check_random_state(self.random_state)\n self.components_ = np.asarray(rng.normal(0, 0.01, (self.n_components, X.shape[1])), order='F', dtype=X.dtype)\n self.intercept_hidden_ = np.zeros(self.n_components, dtype=X.dtype)\n self.intercept_visible_ = np.zeros(X.shape[1], dtype=X.dtype)\n self.h_samples_ = np.zeros((self.batch_size, self.n_components), dtype=X.dtype)\n n_batches = int(np.ceil(float(n_samples) / self.batch_size))\n batch_slices = list(gen_even_slices(n_batches * self.batch_size, n_batches, n_samples=n_samples))\n verbose = self.verbose\n begin = time.time()\n for iteration in range(1, self.n_iter + 1):\n for batch_slice in batch_slices:\n self._fit(X[batch_slice], rng)\n if verbose:\n end = time.time()\n print('[%s] Iteration %d, pseudo-likelihood = %.2f, time = %.2fs' % (type(self).__name__, iteration, self.score_samples(X).mean(), end - begin))\n begin = end\n return self\n \n def _more_tags(self):\n return {'_xfail_checks': {'check_methods_subset_invariance': 'fails for the decision_function method', 'check_methods_sample_order_invariance': 'fails for the score_samples method'}}\n" }, @@ -25750,7 +25846,7 @@ "sklearn.pipeline.FeatureUnion._sk_visual_block_" ], "is_public": true, - "description": "Concatenates results of multiple transformer objects.\n\nThis estimator applies a list of transformer objects in parallel to the input data, then concatenates the results. This is useful to combine several feature extraction mechanisms into a single transformer. Parameters of the transformers may be set using its name and the parameter name separated by a '__'. A transformer may be replaced entirely by setting the parameter with its name to another transformer, or removed by setting to 'drop'. Read more in the :ref:`User Guide `. .. versionadded:: 0.13", + "description": "Concatenates results of multiple transformer objects.\n\nThis estimator applies a list of transformer objects in parallel to the\ninput data, then concatenates the results. This is useful to combine\nseveral feature extraction mechanisms into a single transformer.\n\nParameters of the transformers may be set using its name and the parameter\nname separated by a '__'. 
A transformer may be replaced entirely by\nsetting the parameter with its name to another transformer,\nor removed by setting to 'drop'.\n\nRead more in the :ref:`User Guide `.\n\n.. versionadded:: 0.13", "docstring": "Concatenates results of multiple transformer objects.\n\n This estimator applies a list of transformer objects in parallel to the\n input data, then concatenates the results. This is useful to combine\n several feature extraction mechanisms into a single transformer.\n\n Parameters of the transformers may be set using its name and the parameter\n name separated by a '__'. A transformer may be replaced entirely by\n setting the parameter with its name to another transformer,\n or removed by setting to 'drop'.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.13\n\n Parameters\n ----------\n transformer_list : list of tuple\n List of tuple containing `(str, transformer)`. The first element\n of the tuple is name affected to the transformer while the\n second element is a scikit-learn transformer instance.\n The transformer instance can also be `\"drop\"` for it to be\n ignored.\n\n .. versionchanged:: 0.22\n Deprecated `None` as a transformer in favor of 'drop'.\n\n n_jobs : int, default=None\n Number of jobs to run in parallel.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n .. versionchanged:: v0.20\n `n_jobs` default changed from 1 to None\n\n transformer_weights : dict, default=None\n Multiplicative weights for features per transformer.\n Keys are transformer names, values the weights.\n Raises ValueError if key not present in ``transformer_list``.\n\n verbose : bool, default=False\n If True, the time elapsed while fitting each transformer will be\n printed as it is completed.\n\n Attributes\n ----------\n n_features_in_ : int\n Number of features seen during :term:`fit`. Only defined if the\n underlying first transformer in `transformer_list` exposes such an\n attribute when fit.\n\n .. versionadded:: 0.24\n\n See Also\n --------\n make_union : Convenience function for simplified feature union\n construction.\n\n Examples\n --------\n >>> from sklearn.pipeline import FeatureUnion\n >>> from sklearn.decomposition import PCA, TruncatedSVD\n >>> union = FeatureUnion([(\"pca\", PCA(n_components=1)),\n ... (\"svd\", TruncatedSVD(n_components=2))])\n >>> X = [[0., 1., 3], [2., 2., 5]]\n >>> union.fit_transform(X)\n array([[ 1.5 , 3.0..., 0.8...],\n [-1.5 , 5.7..., -0.4...]])\n ", "source_code": "\n\nclass FeatureUnion(TransformerMixin, _BaseComposition):\n \"\"\"Concatenates results of multiple transformer objects.\n\n This estimator applies a list of transformer objects in parallel to the\n input data, then concatenates the results. This is useful to combine\n several feature extraction mechanisms into a single transformer.\n\n Parameters of the transformers may be set using its name and the parameter\n name separated by a '__'. A transformer may be replaced entirely by\n setting the parameter with its name to another transformer,\n or removed by setting to 'drop'.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.13\n\n Parameters\n ----------\n transformer_list : list of tuple\n List of tuple containing `(str, transformer)`. The first element\n of the tuple is name affected to the transformer while the\n second element is a scikit-learn transformer instance.\n The transformer instance can also be `\"drop\"` for it to be\n ignored.\n\n .. 
versionchanged:: 0.22\n Deprecated `None` as a transformer in favor of 'drop'.\n\n n_jobs : int, default=None\n Number of jobs to run in parallel.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n .. versionchanged:: v0.20\n `n_jobs` default changed from 1 to None\n\n transformer_weights : dict, default=None\n Multiplicative weights for features per transformer.\n Keys are transformer names, values the weights.\n Raises ValueError if key not present in ``transformer_list``.\n\n verbose : bool, default=False\n If True, the time elapsed while fitting each transformer will be\n printed as it is completed.\n\n Attributes\n ----------\n n_features_in_ : int\n Number of features seen during :term:`fit`. Only defined if the\n underlying first transformer in `transformer_list` exposes such an\n attribute when fit.\n\n .. versionadded:: 0.24\n\n See Also\n --------\n make_union : Convenience function for simplified feature union\n construction.\n\n Examples\n --------\n >>> from sklearn.pipeline import FeatureUnion\n >>> from sklearn.decomposition import PCA, TruncatedSVD\n >>> union = FeatureUnion([(\"pca\", PCA(n_components=1)),\n ... (\"svd\", TruncatedSVD(n_components=2))])\n >>> X = [[0., 1., 3], [2., 2., 5]]\n >>> union.fit_transform(X)\n array([[ 1.5 , 3.0..., 0.8...],\n [-1.5 , 5.7..., -0.4...]])\n \"\"\"\n _required_parameters = ['transformer_list']\n \n def __init__(self, transformer_list, *, n_jobs=None, transformer_weights=None, verbose=False):\n self.transformer_list = transformer_list\n self.n_jobs = n_jobs\n self.transformer_weights = transformer_weights\n self.verbose = verbose\n self._validate_transformers()\n \n def get_params(self, deep=True):\n \"\"\"Get parameters for this estimator.\n\n Returns the parameters given in the constructor as well as the\n estimators contained within the `transformer_list` of the\n `FeatureUnion`.\n\n Parameters\n ----------\n deep : bool, default=True\n If True, will return the parameters for this estimator and\n contained subobjects that are estimators.\n\n Returns\n -------\n params : mapping of string to any\n Parameter names mapped to their values.\n \"\"\"\n return self._get_params('transformer_list', deep=deep)\n \n def set_params(self, **kwargs):\n \"\"\"Set the parameters of this estimator.\n\n Valid parameter keys can be listed with ``get_params()``. Note that\n you can directly set the parameters of the estimators contained in\n `tranformer_list`.\n\n Parameters\n ----------\n **kwargs : dict\n Parameters of this estimator or parameters of estimators contained\n in `transform_list`. Parameters of the transformers may be set\n using its name and the parameter name separated by a '__'.\n\n Returns\n -------\n self : object\n FeatureUnion class instance.\n \"\"\"\n self._set_params('transformer_list', **kwargs)\n return self\n \n def _validate_transformers(self):\n (names, transformers) = zip(*self.transformer_list)\n self._validate_names(names)\n for t in transformers:\n if t == 'drop':\n continue\n if not (hasattr(t, 'fit') or hasattr(t, 'fit_transform')) or not hasattr(t, 'transform'):\n raise TypeError(\"All estimators should implement fit and transform. 
'%s' (type %s) doesn't\" % (t, type(t)))\n \n def _validate_transformer_weights(self):\n if not self.transformer_weights:\n return\n transformer_names = set((name for (name, _) in self.transformer_list))\n for name in self.transformer_weights:\n if name not in transformer_names:\n raise ValueError(f'Attempting to weight transformer \"{name}\", but it is not present in transformer_list.')\n \n def _iter(self):\n \"\"\"\n Generate (name, trans, weight) tuples excluding None and\n 'drop' transformers.\n \"\"\"\n get_weight = (self.transformer_weights or {}).get\n return ((name, trans, get_weight(name)) for (name, trans) in self.transformer_list if trans != 'drop')\n \n @deprecated('get_feature_names is deprecated in 1.0 and will be removed in 1.2. Please use get_feature_names_out instead.')\n def get_feature_names(self):\n \"\"\"Get feature names from all transformers.\n\n Returns\n -------\n feature_names : list of strings\n Names of the features produced by transform.\n \"\"\"\n feature_names = []\n for (name, trans, weight) in self._iter():\n if not hasattr(trans, 'get_feature_names'):\n raise AttributeError('Transformer %s (type %s) does not provide get_feature_names.' % (str(name), type(trans).__name__))\n feature_names.extend([name + '__' + f for f in trans.get_feature_names()])\n return feature_names\n \n def get_feature_names_out(self, input_features=None):\n \"\"\"Get output feature names for transformation.\n\n Parameters\n ----------\n input_features : array-like of str or None, default=None\n Input features.\n\n Returns\n -------\n feature_names_out : ndarray of str objects\n Transformed feature names.\n \"\"\"\n feature_names = []\n for (name, trans, _) in self._iter():\n if not hasattr(trans, 'get_feature_names_out'):\n raise AttributeError('Transformer %s (type %s) does not provide get_feature_names_out.' % (str(name), type(trans).__name__))\n feature_names.extend([f'{name}__{f}' for f in trans.get_feature_names_out(input_features)])\n return np.asarray(feature_names, dtype=object)\n \n def fit(self, X, y=None, **fit_params):\n \"\"\"Fit all transformers using X.\n\n Parameters\n ----------\n X : iterable or array-like, depending on transformers\n Input data, used to fit transformers.\n\n y : array-like of shape (n_samples, n_outputs), default=None\n Targets for supervised learning.\n\n **fit_params : dict, default=None\n Parameters to pass to the fit method of the estimator.\n\n Returns\n -------\n self : object\n FeatureUnion class instance.\n \"\"\"\n transformers = self._parallel_func(X, y, fit_params, _fit_one)\n if not transformers:\n return self\n self._update_transformer_list(transformers)\n return self\n \n def fit_transform(self, X, y=None, **fit_params):\n \"\"\"Fit all transformers, transform the data and concatenate results.\n\n Parameters\n ----------\n X : iterable or array-like, depending on transformers\n Input data to be transformed.\n\n y : array-like of shape (n_samples, n_outputs), default=None\n Targets for supervised learning.\n\n **fit_params : dict, default=None\n Parameters to pass to the fit method of the estimator.\n\n Returns\n -------\n X_t : array-like or sparse matrix of shape (n_samples, sum_n_components)\n The `hstack` of results of transformers. 
`sum_n_components` is the\n sum of `n_components` (output dimension) over transformers.\n \"\"\"\n results = self._parallel_func(X, y, fit_params, _fit_transform_one)\n if not results:\n return np.zeros((X.shape[0], 0))\n (Xs, transformers) = zip(*results)\n self._update_transformer_list(transformers)\n return self._hstack(Xs)\n \n def _log_message(self, name, idx, total):\n if not self.verbose:\n return None\n return '(step %d of %d) Processing %s' % (idx, total, name)\n \n def _parallel_func(self, X, y, fit_params, func):\n \"\"\"Runs func in parallel on X and y\"\"\"\n self.transformer_list = list(self.transformer_list)\n self._validate_transformers()\n self._validate_transformer_weights()\n transformers = list(self._iter())\n return Parallel(n_jobs=self.n_jobs)((delayed(func)(transformer, X, y, weight, message_clsname='FeatureUnion', message=self._log_message(name, idx, len(transformers)), **fit_params) for (idx, (name, transformer, weight)) in enumerate(transformers, 1)))\n \n def transform(self, X):\n \"\"\"Transform X separately by each transformer, concatenate results.\n\n Parameters\n ----------\n X : iterable or array-like, depending on transformers\n Input data to be transformed.\n\n Returns\n -------\n X_t : array-like or sparse matrix of shape (n_samples, sum_n_components)\n The `hstack` of results of transformers. `sum_n_components` is the\n sum of `n_components` (output dimension) over transformers.\n \"\"\"\n Xs = Parallel(n_jobs=self.n_jobs)((delayed(_transform_one)(trans, X, None, weight) for (name, trans, weight) in self._iter()))\n if not Xs:\n return np.zeros((X.shape[0], 0))\n return self._hstack(Xs)\n \n def _hstack(self, Xs):\n if any((sparse.issparse(f) for f in Xs)):\n Xs = sparse.hstack(Xs).tocsr()\n else:\n Xs = np.hstack(Xs)\n return Xs\n \n def _update_transformer_list(self, transformers):\n transformers = iter(transformers)\n self.transformer_list[:] = [(name, old if old == 'drop' else next(transformers)) for (name, old) in self.transformer_list]\n \n @property\n def n_features_in_(self):\n \"\"\"Number of features seen during :term:`fit`.\"\"\"\n return self.transformer_list[0][1].n_features_in_\n \n def _sk_visual_block_(self):\n (names, transformers) = zip(*self.transformer_list)\n return _VisualBlock('parallel', transformers, names=names)\n" }, @@ -25796,7 +25892,7 @@ "sklearn.pipeline.Pipeline._sk_visual_block_" ], "is_public": true, - "description": "Pipeline of transforms with a final estimator.\n\nSequentially apply a list of transforms and a final estimator. Intermediate steps of the pipeline must be 'transforms', that is, they must implement `fit` and `transform` methods. The final estimator only needs to implement `fit`. The transformers in the pipeline can be cached using ``memory`` argument. The purpose of the pipeline is to assemble several steps that can be cross-validated together while setting different parameters. For this, it enables setting parameters of the various steps using their names and the parameter name separated by a `'__'`, as in the example below. A step's estimator may be replaced entirely by setting the parameter with its name to another estimator, or a transformer removed by setting it to `'passthrough'` or `None`. Read more in the :ref:`User Guide `. .. 
versionadded:: 0.5", + "description": "Pipeline of transforms with a final estimator.\n\nSequentially apply a list of transforms and a final estimator.\nIntermediate steps of the pipeline must be 'transforms', that is, they\nmust implement `fit` and `transform` methods.\nThe final estimator only needs to implement `fit`.\nThe transformers in the pipeline can be cached using ``memory`` argument.\n\nThe purpose of the pipeline is to assemble several steps that can be\ncross-validated together while setting different parameters. For this, it\nenables setting parameters of the various steps using their names and the\nparameter name separated by a `'__'`, as in the example below. A step's\nestimator may be replaced entirely by setting the parameter with its name\nto another estimator, or a transformer removed by setting it to\n`'passthrough'` or `None`.\n\nRead more in the :ref:`User Guide `.\n\n.. versionadded:: 0.5", "docstring": "\n Pipeline of transforms with a final estimator.\n\n Sequentially apply a list of transforms and a final estimator.\n Intermediate steps of the pipeline must be 'transforms', that is, they\n must implement `fit` and `transform` methods.\n The final estimator only needs to implement `fit`.\n The transformers in the pipeline can be cached using ``memory`` argument.\n\n The purpose of the pipeline is to assemble several steps that can be\n cross-validated together while setting different parameters. For this, it\n enables setting parameters of the various steps using their names and the\n parameter name separated by a `'__'`, as in the example below. A step's\n estimator may be replaced entirely by setting the parameter with its name\n to another estimator, or a transformer removed by setting it to\n `'passthrough'` or `None`.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.5\n\n Parameters\n ----------\n steps : list of tuple\n List of (name, transform) tuples (implementing `fit`/`transform`) that\n are chained, in the order in which they are chained, with the last\n object an estimator.\n\n memory : str or object with the joblib.Memory interface, default=None\n Used to cache the fitted transformers of the pipeline. By default,\n no caching is performed. If a string is given, it is the path to\n the caching directory. Enabling caching triggers a clone of\n the transformers before fitting. Therefore, the transformer\n instance given to the pipeline cannot be inspected\n directly. Use the attribute ``named_steps`` or ``steps`` to\n inspect estimators within the pipeline. Caching the\n transformers is advantageous when fitting is time consuming.\n\n verbose : bool, default=False\n If True, the time elapsed while fitting each step will be printed as it\n is completed.\n\n Attributes\n ----------\n named_steps : :class:`~sklearn.utils.Bunch`\n Dictionary-like object, with the following attributes.\n Read-only attribute to access any step parameter by user given name.\n Keys are step names and values are steps parameters.\n\n classes_ : ndarray of shape (n_classes,)\n The classes labels. Only exist if the last step of the pipeline is a\n classifier.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`. Only defined if the\n underlying first estimator in `steps` exposes such an attribute\n when fit.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Only defined if the\n underlying estimator exposes such an attribute when fit.\n\n .. 
versionadded:: 1.0\n\n See Also\n --------\n make_pipeline : Convenience function for simplified pipeline construction.\n\n Examples\n --------\n >>> from sklearn.svm import SVC\n >>> from sklearn.preprocessing import StandardScaler\n >>> from sklearn.datasets import make_classification\n >>> from sklearn.model_selection import train_test_split\n >>> from sklearn.pipeline import Pipeline\n >>> X, y = make_classification(random_state=0)\n >>> X_train, X_test, y_train, y_test = train_test_split(X, y,\n ... random_state=0)\n >>> pipe = Pipeline([('scaler', StandardScaler()), ('svc', SVC())])\n >>> # The pipeline can be used as any other estimator\n >>> # and avoids leaking the test set into the train set\n >>> pipe.fit(X_train, y_train)\n Pipeline(steps=[('scaler', StandardScaler()), ('svc', SVC())])\n >>> pipe.score(X_test, y_test)\n 0.88\n ", "source_code": "\n\nclass Pipeline(_BaseComposition):\n \"\"\"\n Pipeline of transforms with a final estimator.\n\n Sequentially apply a list of transforms and a final estimator.\n Intermediate steps of the pipeline must be 'transforms', that is, they\n must implement `fit` and `transform` methods.\n The final estimator only needs to implement `fit`.\n The transformers in the pipeline can be cached using ``memory`` argument.\n\n The purpose of the pipeline is to assemble several steps that can be\n cross-validated together while setting different parameters. For this, it\n enables setting parameters of the various steps using their names and the\n parameter name separated by a `'__'`, as in the example below. A step's\n estimator may be replaced entirely by setting the parameter with its name\n to another estimator, or a transformer removed by setting it to\n `'passthrough'` or `None`.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.5\n\n Parameters\n ----------\n steps : list of tuple\n List of (name, transform) tuples (implementing `fit`/`transform`) that\n are chained, in the order in which they are chained, with the last\n object an estimator.\n\n memory : str or object with the joblib.Memory interface, default=None\n Used to cache the fitted transformers of the pipeline. By default,\n no caching is performed. If a string is given, it is the path to\n the caching directory. Enabling caching triggers a clone of\n the transformers before fitting. Therefore, the transformer\n instance given to the pipeline cannot be inspected\n directly. Use the attribute ``named_steps`` or ``steps`` to\n inspect estimators within the pipeline. Caching the\n transformers is advantageous when fitting is time consuming.\n\n verbose : bool, default=False\n If True, the time elapsed while fitting each step will be printed as it\n is completed.\n\n Attributes\n ----------\n named_steps : :class:`~sklearn.utils.Bunch`\n Dictionary-like object, with the following attributes.\n Read-only attribute to access any step parameter by user given name.\n Keys are step names and values are steps parameters.\n\n classes_ : ndarray of shape (n_classes,)\n The classes labels. Only exist if the last step of the pipeline is a\n classifier.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`. Only defined if the\n underlying first estimator in `steps` exposes such an attribute\n when fit.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Only defined if the\n underlying estimator exposes such an attribute when fit.\n\n .. 
versionadded:: 1.0\n\n See Also\n --------\n make_pipeline : Convenience function for simplified pipeline construction.\n\n Examples\n --------\n >>> from sklearn.svm import SVC\n >>> from sklearn.preprocessing import StandardScaler\n >>> from sklearn.datasets import make_classification\n >>> from sklearn.model_selection import train_test_split\n >>> from sklearn.pipeline import Pipeline\n >>> X, y = make_classification(random_state=0)\n >>> X_train, X_test, y_train, y_test = train_test_split(X, y,\n ... random_state=0)\n >>> pipe = Pipeline([('scaler', StandardScaler()), ('svc', SVC())])\n >>> # The pipeline can be used as any other estimator\n >>> # and avoids leaking the test set into the train set\n >>> pipe.fit(X_train, y_train)\n Pipeline(steps=[('scaler', StandardScaler()), ('svc', SVC())])\n >>> pipe.score(X_test, y_test)\n 0.88\n \"\"\"\n _required_parameters = ['steps']\n \n def __init__(self, steps, *, memory=None, verbose=False):\n self.steps = steps\n self.memory = memory\n self.verbose = verbose\n self._validate_steps()\n \n def get_params(self, deep=True):\n \"\"\"Get parameters for this estimator.\n\n Returns the parameters given in the constructor as well as the\n estimators contained within the `steps` of the `Pipeline`.\n\n Parameters\n ----------\n deep : bool, default=True\n If True, will return the parameters for this estimator and\n contained subobjects that are estimators.\n\n Returns\n -------\n params : mapping of string to any\n Parameter names mapped to their values.\n \"\"\"\n return self._get_params('steps', deep=deep)\n \n def set_params(self, **kwargs):\n \"\"\"Set the parameters of this estimator.\n\n Valid parameter keys can be listed with ``get_params()``. Note that\n you can directly set the parameters of the estimators contained in\n `steps`.\n\n Parameters\n ----------\n **kwargs : dict\n Parameters of this estimator or parameters of estimators contained\n in `steps`. Parameters of the steps may be set using its name and\n the parameter name separated by a '__'.\n\n Returns\n -------\n self : object\n Pipeline class instance.\n \"\"\"\n self._set_params('steps', **kwargs)\n return self\n \n def _validate_steps(self):\n (names, estimators) = zip(*self.steps)\n self._validate_names(names)\n transformers = estimators[:-1]\n estimator = estimators[-1]\n for t in transformers:\n if t is None or t == 'passthrough':\n continue\n if not (hasattr(t, 'fit') or hasattr(t, 'fit_transform')) or not hasattr(t, 'transform'):\n raise TypeError(\"All intermediate steps should be transformers and implement fit and transform or be the string 'passthrough' '%s' (type %s) doesn't\" % (t, type(t)))\n if estimator is not None and estimator != 'passthrough' and not hasattr(estimator, 'fit'):\n raise TypeError(\"Last step of Pipeline should implement fit or be the string 'passthrough'. 
'%s' (type %s) doesn't\" % (estimator, type(estimator)))\n \n def _iter(self, with_final=True, filter_passthrough=True):\n \"\"\"\n Generate (idx, (name, trans)) tuples from self.steps\n\n When filter_passthrough is True, 'passthrough' and None transformers\n are filtered out.\n \"\"\"\n stop = len(self.steps)\n if not with_final:\n stop -= 1\n for (idx, (name, trans)) in enumerate(islice(self.steps, 0, stop)):\n if not filter_passthrough:\n yield (idx, name, trans)\n elif trans is not None and trans != 'passthrough':\n yield (idx, name, trans)\n \n def __len__(self):\n \"\"\"\n Returns the length of the Pipeline\n \"\"\"\n return len(self.steps)\n \n def __getitem__(self, ind):\n \"\"\"Returns a sub-pipeline or a single estimator in the pipeline\n\n Indexing with an integer will return an estimator; using a slice\n returns another Pipeline instance which copies a slice of this\n Pipeline. This copy is shallow: modifying (or fitting) estimators in\n the sub-pipeline will affect the larger pipeline and vice-versa.\n However, replacing a value in `step` will not affect a copy.\n \"\"\"\n if isinstance(ind, slice):\n if ind.step not in (1, None):\n raise ValueError('Pipeline slicing only supports a step of 1')\n return self.__class__(self.steps[ind], memory=self.memory, verbose=self.verbose)\n try:\n (name, est) = self.steps[ind]\n except TypeError:\n return self.named_steps[ind]\n return est\n \n @property\n def _estimator_type(self):\n return self.steps[-1][1]._estimator_type\n \n @property\n def named_steps(self):\n \"\"\"Access the steps by name.\n\n Read-only attribute to access any step by given name.\n Keys are steps names and values are the steps objects.\"\"\"\n return Bunch(**dict(self.steps))\n \n @property\n def _final_estimator(self):\n estimator = self.steps[-1][1]\n return 'passthrough' if estimator is None else estimator\n \n def _log_message(self, step_idx):\n if not self.verbose:\n return None\n (name, _) = self.steps[step_idx]\n return '(step %d of %d) Processing %s' % (step_idx + 1, len(self.steps), name)\n \n def _check_fit_params(self, **fit_params):\n fit_params_steps = {name: {} for (name, step) in self.steps if step is not None}\n for (pname, pval) in fit_params.items():\n if '__' not in pname:\n raise ValueError('Pipeline.fit does not accept the {} parameter. You can pass parameters to specific steps of your pipeline using the stepname__parameter format, e.g. 
`Pipeline.fit(X, y, logisticregression__sample_weight=sample_weight)`.'.format(pname))\n (step, param) = pname.split('__', 1)\n fit_params_steps[step][param] = pval\n return fit_params_steps\n \n def _fit(self, X, y=None, **fit_params_steps):\n self.steps = list(self.steps)\n self._validate_steps()\n memory = check_memory(self.memory)\n fit_transform_one_cached = memory.cache(_fit_transform_one)\n for (step_idx, name, transformer) in self._iter(with_final=False, filter_passthrough=False):\n if transformer is None or transformer == 'passthrough':\n with _print_elapsed_time('Pipeline', self._log_message(step_idx)):\n continue\n if hasattr(memory, 'location'):\n if memory.location is None:\n cloned_transformer = transformer\n else:\n cloned_transformer = clone(transformer)\n elif hasattr(memory, 'cachedir'):\n if memory.cachedir is None:\n cloned_transformer = transformer\n else:\n cloned_transformer = clone(transformer)\n else:\n cloned_transformer = clone(transformer)\n (X, fitted_transformer) = fit_transform_one_cached(cloned_transformer, X, y, None, message_clsname='Pipeline', message=self._log_message(step_idx), **fit_params_steps[name])\n self.steps[step_idx] = (name, fitted_transformer)\n return X\n \n def fit(self, X, y=None, **fit_params):\n \"\"\"Fit the model.\n\n Fit all the transformers one after the other and transform the\n data. Finally, fit the transformed data using the final estimator.\n\n Parameters\n ----------\n X : iterable\n Training data. Must fulfill input requirements of first step of the\n pipeline.\n\n y : iterable, default=None\n Training targets. Must fulfill label requirements for all steps of\n the pipeline.\n\n **fit_params : dict of string -> object\n Parameters passed to the ``fit`` method of each step, where\n each parameter name is prefixed such that parameter ``p`` for step\n ``s`` has key ``s__p``.\n\n Returns\n -------\n self : object\n Pipeline with fitted steps.\n \"\"\"\n fit_params_steps = self._check_fit_params(**fit_params)\n Xt = self._fit(X, y, **fit_params_steps)\n with _print_elapsed_time('Pipeline', self._log_message(len(self.steps) - 1)):\n if self._final_estimator != 'passthrough':\n fit_params_last_step = fit_params_steps[self.steps[-1][0]]\n self._final_estimator.fit(Xt, y, **fit_params_last_step)\n return self\n \n def fit_transform(self, X, y=None, **fit_params):\n \"\"\"Fit the model and transform with the final estimator.\n\n Fits all the transformers one after the other and transform the\n data. Then uses `fit_transform` on transformed data with the final\n estimator.\n\n Parameters\n ----------\n X : iterable\n Training data. Must fulfill input requirements of first step of the\n pipeline.\n\n y : iterable, default=None\n Training targets. 
Must fulfill label requirements for all steps of\n the pipeline.\n\n **fit_params : dict of string -> object\n Parameters passed to the ``fit`` method of each step, where\n each parameter name is prefixed such that parameter ``p`` for step\n ``s`` has key ``s__p``.\n\n Returns\n -------\n Xt : ndarray of shape (n_samples, n_transformed_features)\n Transformed samples.\n \"\"\"\n fit_params_steps = self._check_fit_params(**fit_params)\n Xt = self._fit(X, y, **fit_params_steps)\n last_step = self._final_estimator\n with _print_elapsed_time('Pipeline', self._log_message(len(self.steps) - 1)):\n if last_step == 'passthrough':\n return Xt\n fit_params_last_step = fit_params_steps[self.steps[-1][0]]\n if hasattr(last_step, 'fit_transform'):\n return last_step.fit_transform(Xt, y, **fit_params_last_step)\n else:\n return last_step.fit(Xt, y, **fit_params_last_step).transform(Xt)\n \n @available_if(_final_estimator_has('predict'))\n def predict(self, X, **predict_params):\n \"\"\"Transform the data, and apply `predict` with the final estimator.\n\n Call `transform` of each transformer in the pipeline. The transformed\n data are finally passed to the final estimator that calls `predict`\n method. Only valid if the final estimator implements `predict`.\n\n Parameters\n ----------\n X : iterable\n Data to predict on. Must fulfill input requirements of first step\n of the pipeline.\n\n **predict_params : dict of string -> object\n Parameters to the ``predict`` called at the end of all\n transformations in the pipeline. Note that while this may be\n used to return uncertainties from some models with return_std\n or return_cov, uncertainties that are generated by the\n transformations in the pipeline are not propagated to the\n final estimator.\n\n .. versionadded:: 0.20\n\n Returns\n -------\n y_pred : ndarray\n Result of calling `predict` on the final estimator.\n \"\"\"\n Xt = X\n for (_, name, transform) in self._iter(with_final=False):\n Xt = transform.transform(Xt)\n return self.steps[-1][1].predict(Xt, **predict_params)\n \n @available_if(_final_estimator_has('fit_predict'))\n def fit_predict(self, X, y=None, **fit_params):\n \"\"\"Transform the data, and apply `fit_predict` with the final estimator.\n\n Call `fit_transform` of each transformer in the pipeline. The\n transformed data are finally passed to the final estimator that calls\n `fit_predict` method. Only valid if the final estimator implements\n `fit_predict`.\n\n Parameters\n ----------\n X : iterable\n Training data. Must fulfill input requirements of first step of\n the pipeline.\n\n y : iterable, default=None\n Training targets. 
Must fulfill label requirements for all steps\n of the pipeline.\n\n **fit_params : dict of string -> object\n Parameters passed to the ``fit`` method of each step, where\n each parameter name is prefixed such that parameter ``p`` for step\n ``s`` has key ``s__p``.\n\n Returns\n -------\n y_pred : ndarray\n Result of calling `fit_predict` on the final estimator.\n \"\"\"\n fit_params_steps = self._check_fit_params(**fit_params)\n Xt = self._fit(X, y, **fit_params_steps)\n fit_params_last_step = fit_params_steps[self.steps[-1][0]]\n with _print_elapsed_time('Pipeline', self._log_message(len(self.steps) - 1)):\n y_pred = self.steps[-1][1].fit_predict(Xt, y, **fit_params_last_step)\n return y_pred\n \n @available_if(_final_estimator_has('predict_proba'))\n def predict_proba(self, X, **predict_proba_params):\n \"\"\"Transform the data, and apply `predict_proba` with the final estimator.\n\n Call `transform` of each transformer in the pipeline. The transformed\n data are finally passed to the final estimator that calls\n `predict_proba` method. Only valid if the final estimator implements\n `predict_proba`.\n\n Parameters\n ----------\n X : iterable\n Data to predict on. Must fulfill input requirements of first step\n of the pipeline.\n\n **predict_proba_params : dict of string -> object\n Parameters to the `predict_proba` called at the end of all\n transformations in the pipeline.\n\n Returns\n -------\n y_proba : ndarray of shape (n_samples, n_classes)\n Result of calling `predict_proba` on the final estimator.\n \"\"\"\n Xt = X\n for (_, name, transform) in self._iter(with_final=False):\n Xt = transform.transform(Xt)\n return self.steps[-1][1].predict_proba(Xt, **predict_proba_params)\n \n @available_if(_final_estimator_has('decision_function'))\n def decision_function(self, X):\n \"\"\"Transform the data, and apply `decision_function` with the final estimator.\n\n Call `transform` of each transformer in the pipeline. The transformed\n data are finally passed to the final estimator that calls\n `decision_function` method. Only valid if the final estimator\n implements `decision_function`.\n\n Parameters\n ----------\n X : iterable\n Data to predict on. Must fulfill input requirements of first step\n of the pipeline.\n\n Returns\n -------\n y_score : ndarray of shape (n_samples, n_classes)\n Result of calling `decision_function` on the final estimator.\n \"\"\"\n Xt = X\n for (_, name, transform) in self._iter(with_final=False):\n Xt = transform.transform(Xt)\n return self.steps[-1][1].decision_function(Xt)\n \n @available_if(_final_estimator_has('score_samples'))\n def score_samples(self, X):\n \"\"\"Transform the data, and apply `score_samples` with the final estimator.\n\n Call `transform` of each transformer in the pipeline. The transformed\n data are finally passed to the final estimator that calls\n `score_samples` method. Only valid if the final estimator implements\n `score_samples`.\n\n Parameters\n ----------\n X : iterable\n Data to predict on. 
Must fulfill input requirements of first step\n of the pipeline.\n\n Returns\n -------\n y_score : ndarray of shape (n_samples,)\n Result of calling `score_samples` on the final estimator.\n \"\"\"\n Xt = X\n for (_, _, transformer) in self._iter(with_final=False):\n Xt = transformer.transform(Xt)\n return self.steps[-1][1].score_samples(Xt)\n \n @available_if(_final_estimator_has('predict_log_proba'))\n def predict_log_proba(self, X, **predict_log_proba_params):\n \"\"\"Transform the data, and apply `predict_log_proba` with the final estimator.\n\n Call `transform` of each transformer in the pipeline. The transformed\n data are finally passed to the final estimator that calls\n `predict_log_proba` method. Only valid if the final estimator\n implements `predict_log_proba`.\n\n Parameters\n ----------\n X : iterable\n Data to predict on. Must fulfill input requirements of first step\n of the pipeline.\n\n **predict_log_proba_params : dict of string -> object\n Parameters to the ``predict_log_proba`` called at the end of all\n transformations in the pipeline.\n\n Returns\n -------\n y_log_proba : ndarray of shape (n_samples, n_classes)\n Result of calling `predict_log_proba` on the final estimator.\n \"\"\"\n Xt = X\n for (_, name, transform) in self._iter(with_final=False):\n Xt = transform.transform(Xt)\n return self.steps[-1][1].predict_log_proba(Xt, **predict_log_proba_params)\n \n def _can_transform(self):\n return self._final_estimator == 'passthrough' or hasattr(self._final_estimator, 'transform')\n \n @available_if(_can_transform)\n def transform(self, X):\n \"\"\"Transform the data, and apply `transform` with the final estimator.\n\n Call `transform` of each transformer in the pipeline. The transformed\n data are finally passed to the final estimator that calls\n `transform` method. Only valid if the final estimator\n implements `transform`.\n\n This also works where final estimator is `None` in which case all prior\n transformations are applied.\n\n Parameters\n ----------\n X : iterable\n Data to transform. Must fulfill input requirements of first step\n of the pipeline.\n\n Returns\n -------\n Xt : ndarray of shape (n_samples, n_transformed_features)\n Transformed data.\n \"\"\"\n Xt = X\n for (_, _, transform) in self._iter():\n Xt = transform.transform(Xt)\n return Xt\n \n def _can_inverse_transform(self):\n return all((hasattr(t, 'inverse_transform') for (_, _, t) in self._iter()))\n \n @available_if(_can_inverse_transform)\n def inverse_transform(self, Xt):\n \"\"\"Apply `inverse_transform` for each step in a reverse order.\n\n All estimators in the pipeline must support `inverse_transform`.\n\n Parameters\n ----------\n Xt : array-like of shape (n_samples, n_transformed_features)\n Data samples, where ``n_samples`` is the number of samples and\n ``n_features`` is the number of features. Must fulfill\n input requirements of last step of pipeline's\n ``inverse_transform`` method.\n\n Returns\n -------\n Xt : ndarray of shape (n_samples, n_features)\n Inverse transformed data, that is, data in the original feature\n space.\n \"\"\"\n reverse_iter = reversed(list(self._iter()))\n for (_, _, transform) in reverse_iter:\n Xt = transform.inverse_transform(Xt)\n return Xt\n \n @available_if(_final_estimator_has('score'))\n def score(self, X, y=None, sample_weight=None):\n \"\"\"Transform the data, and apply `score` with the final estimator.\n\n Call `transform` of each transformer in the pipeline. 
The transformed\n data are finally passed to the final estimator that calls\n `score` method. Only valid if the final estimator implements `score`.\n\n Parameters\n ----------\n X : iterable\n Data to predict on. Must fulfill input requirements of first step\n of the pipeline.\n\n y : iterable, default=None\n Targets used for scoring. Must fulfill label requirements for all\n steps of the pipeline.\n\n sample_weight : array-like, default=None\n If not None, this argument is passed as ``sample_weight`` keyword\n argument to the ``score`` method of the final estimator.\n\n Returns\n -------\n score : float\n Result of calling `score` on the final estimator.\n \"\"\"\n Xt = X\n for (_, name, transform) in self._iter(with_final=False):\n Xt = transform.transform(Xt)\n score_params = {}\n if sample_weight is not None:\n score_params['sample_weight'] = sample_weight\n return self.steps[-1][1].score(Xt, y, **score_params)\n \n @property\n def classes_(self):\n \"\"\"The classes labels. Only exist if the last step is a classifier.\"\"\"\n return self.steps[-1][1].classes_\n \n def _more_tags(self):\n return {'pairwise': _safe_tags(self.steps[0][1], 'pairwise')}\n \n @deprecated('Attribute `_pairwise` was deprecated in version 0.24 and will be removed in 1.1 (renaming of 0.26).')\n @property\n def _pairwise(self):\n return getattr(self.steps[0][1], '_pairwise', False)\n \n def get_feature_names_out(self, input_features=None):\n \"\"\"Get output feature names for transformation.\n\n Transform input features using the pipeline.\n\n Parameters\n ----------\n input_features : array-like of str or None, default=None\n Input features.\n\n Returns\n -------\n feature_names_out : ndarray of str objects\n Transformed feature names.\n \"\"\"\n feature_names_out = input_features\n for (_, name, transform) in self._iter():\n if not hasattr(transform, 'get_feature_names_out'):\n raise AttributeError('Estimator {} does not provide get_feature_names_out. Did you mean to call pipeline[:-1].get_feature_names_out()?'.format(name))\n feature_names_out = transform.get_feature_names_out(feature_names_out)\n return feature_names_out\n \n @property\n def n_features_in_(self):\n \"\"\"Number of features seen during first step `fit` method.\"\"\"\n return self.steps[0][1].n_features_in_\n \n @property\n def feature_names_in_(self):\n \"\"\"Names of features seen during first step `fit` method.\"\"\"\n return self.steps[0][1].feature_names_in_\n \n def __sklearn_is_fitted__(self):\n \"\"\"Indicate whether pipeline has been fit.\"\"\"\n try:\n check_is_fitted(self.steps[-1][1])\n return True\n except NotFittedError:\n return False\n \n def _sk_visual_block_(self):\n (_, estimators) = zip(*self.steps)\n \n def _get_name(name, est):\n if est is None or est == 'passthrough':\n return f'{name}: passthrough'\n return f'{name}: {est.__class__.__name__}'\n names = [_get_name(name, est) for (name, est) in self.steps]\n name_details = [str(est) for est in estimators]\n return _VisualBlock('serial', estimators, names=names, name_details=name_details, dash_wrapped=False)\n" }, @@ -25812,7 +25908,7 @@ "sklearn.preprocessing._data.Binarizer._more_tags" ], "is_public": true, - "description": "Binarize data (set feature values to 0 or 1) according to a threshold.\n\nValues greater than the threshold map to 1, while values less than or equal to the threshold map to 0. With the default threshold of 0, only positive values map to 1. 
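A minimal sketch of the threshold behaviour just described, assuming only NumPy and scikit-learn; the input values and the 0.5 threshold below are arbitrary illustrative choices, not taken from the recorded entry:

import numpy as np
from sklearn.preprocessing import Binarizer, binarize

X = np.array([[1.0, -1.0, 2.0],
              [2.0,  0.0, 0.0],
              [0.0,  1.0, -1.0]])

# Values strictly greater than the threshold become 1, the rest become 0.
print(Binarizer(threshold=0.5).fit_transform(X))

# The stateless module-level helper gives the same result as the estimator.
print(np.array_equal(binarize(X, threshold=0.5),
                     Binarizer(threshold=0.5).fit_transform(X)))  # True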
Binarization is a common operation on text count data where the analyst can decide to only consider the presence or absence of a feature rather than a quantified number of occurrences for instance. It can also be used as a pre-processing step for estimators that consider boolean random variables (e.g. modelled using the Bernoulli distribution in a Bayesian setting). Read more in the :ref:`User Guide `.", + "description": "Binarize data (set feature values to 0 or 1) according to a threshold.\n\nValues greater than the threshold map to 1, while values less than\nor equal to the threshold map to 0. With the default threshold of 0,\nonly positive values map to 1.\n\nBinarization is a common operation on text count data where the\nanalyst can decide to only consider the presence or absence of a\nfeature rather than a quantified number of occurrences for instance.\n\nIt can also be used as a pre-processing step for estimators that\nconsider boolean random variables (e.g. modelled using the Bernoulli\ndistribution in a Bayesian setting).\n\nRead more in the :ref:`User Guide `.", "docstring": "Binarize data (set feature values to 0 or 1) according to a threshold.\n\n Values greater than the threshold map to 1, while values less than\n or equal to the threshold map to 0. With the default threshold of 0,\n only positive values map to 1.\n\n Binarization is a common operation on text count data where the\n analyst can decide to only consider the presence or absence of a\n feature rather than a quantified number of occurrences for instance.\n\n It can also be used as a pre-processing step for estimators that\n consider boolean random variables (e.g. modelled using the Bernoulli\n distribution in a Bayesian setting).\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n threshold : float, default=0.0\n Feature values below or equal to this are replaced by 0, above it by 1.\n Threshold may not be less than 0 for operations on sparse matrices.\n\n copy : bool, default=True\n Set to False to perform inplace binarization and avoid a copy (if\n the input is already a numpy array or a scipy.sparse CSR matrix).\n\n Attributes\n ----------\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n binarize : Equivalent function without the estimator API.\n KBinsDiscretizer : Bin continuous data into intervals.\n OneHotEncoder : Encode categorical features as a one-hot numeric array.\n\n Notes\n -----\n If the input is a sparse matrix, only the non-zero values are subject\n to update by the Binarizer class.\n\n This estimator is stateless (besides constructor parameters), the\n fit method does nothing but is useful when used in a pipeline.\n\n Examples\n --------\n >>> from sklearn.preprocessing import Binarizer\n >>> X = [[ 1., -1., 2.],\n ... [ 2., 0., 0.],\n ... [ 0., 1., -1.]]\n >>> transformer = Binarizer().fit(X) # fit does nothing.\n >>> transformer\n Binarizer()\n >>> transformer.transform(X)\n array([[1., 0., 1.],\n [1., 0., 0.],\n [0., 1., 0.]])\n ", "source_code": "\n\nclass Binarizer(TransformerMixin, BaseEstimator):\n \"\"\"Binarize data (set feature values to 0 or 1) according to a threshold.\n\n Values greater than the threshold map to 1, while values less than\n or equal to the threshold map to 0. 
With the default threshold of 0,\n only positive values map to 1.\n\n Binarization is a common operation on text count data where the\n analyst can decide to only consider the presence or absence of a\n feature rather than a quantified number of occurrences for instance.\n\n It can also be used as a pre-processing step for estimators that\n consider boolean random variables (e.g. modelled using the Bernoulli\n distribution in a Bayesian setting).\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n threshold : float, default=0.0\n Feature values below or equal to this are replaced by 0, above it by 1.\n Threshold may not be less than 0 for operations on sparse matrices.\n\n copy : bool, default=True\n Set to False to perform inplace binarization and avoid a copy (if\n the input is already a numpy array or a scipy.sparse CSR matrix).\n\n Attributes\n ----------\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n binarize : Equivalent function without the estimator API.\n KBinsDiscretizer : Bin continuous data into intervals.\n OneHotEncoder : Encode categorical features as a one-hot numeric array.\n\n Notes\n -----\n If the input is a sparse matrix, only the non-zero values are subject\n to update by the Binarizer class.\n\n This estimator is stateless (besides constructor parameters), the\n fit method does nothing but is useful when used in a pipeline.\n\n Examples\n --------\n >>> from sklearn.preprocessing import Binarizer\n >>> X = [[ 1., -1., 2.],\n ... [ 2., 0., 0.],\n ... [ 0., 1., -1.]]\n >>> transformer = Binarizer().fit(X) # fit does nothing.\n >>> transformer\n Binarizer()\n >>> transformer.transform(X)\n array([[1., 0., 1.],\n [1., 0., 0.],\n [0., 1., 0.]])\n \"\"\"\n \n def __init__(self, *, threshold=0.0, copy=True):\n self.threshold = threshold\n self.copy = copy\n \n def fit(self, X, y=None):\n \"\"\"Do nothing and return the estimator unchanged.\n\n This method is just there to implement the usual API and hence\n work in pipelines.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The data.\n\n y : None\n Ignored.\n\n Returns\n -------\n self : object\n Fitted transformer.\n \"\"\"\n self._validate_data(X, accept_sparse='csr')\n return self\n \n def transform(self, X, copy=None):\n \"\"\"Binarize each element of X.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The data to binarize, element by element.\n scipy.sparse matrices should be in CSR format to avoid an\n un-necessary copy.\n\n copy : bool\n Copy the input X or not.\n\n Returns\n -------\n X_tr : {ndarray, sparse matrix} of shape (n_samples, n_features)\n Transformed array.\n \"\"\"\n copy = copy if copy is not None else self.copy\n X = self._validate_data(X, accept_sparse=['csr', 'csc'], copy=copy, reset=False)\n return binarize(X, threshold=self.threshold, copy=False)\n \n def _more_tags(self):\n return {'stateless': True}\n" }, @@ -25829,7 +25925,7 @@ "sklearn.preprocessing._data.KernelCenterer._pairwise@getter" ], "is_public": true, - "description": "Center an arbitrary kernel matrix :math:`K`.\n\nLet define a kernel :math:`K` such that: .. math:: K(X, Y) = \\phi(X) . 
\\phi(Y)^{T} :math:`\\phi(X)` is a function mapping of rows of :math:`X` to a Hilbert space and :math:`K` is of shape `(n_samples, n_samples)`. This class allows to compute :math:`\\tilde{K}(X, Y)` such that: .. math:: \\tilde{K(X, Y)} = \\tilde{\\phi}(X) . \\tilde{\\phi}(Y)^{T} :math:`\\tilde{\\phi}(X)` is the centered mapped data in the Hilbert space. `KernelCenterer` centers the features without explicitly computing the mapping :math:`\\phi(\\cdot)`. Working with centered kernels is sometime expected when dealing with algebra computation such as eigendecomposition for :class:`~sklearn.decomposition.KernelPCA` for instance. Read more in the :ref:`User Guide `.", + "description": "Center an arbitrary kernel matrix :math:`K`.\n\nLet define a kernel :math:`K` such that:\n\n.. math::\n K(X, Y) = \\phi(X) . \\phi(Y)^{T}\n\n:math:`\\phi(X)` is a function mapping of rows of :math:`X` to a\nHilbert space and :math:`K` is of shape `(n_samples, n_samples)`.\n\nThis class allows to compute :math:`\\tilde{K}(X, Y)` such that:\n\n.. math::\n \\tilde{K(X, Y)} = \\tilde{\\phi}(X) . \\tilde{\\phi}(Y)^{T}\n\n:math:`\\tilde{\\phi}(X)` is the centered mapped data in the Hilbert\nspace.\n\n`KernelCenterer` centers the features without explicitly computing the\nmapping :math:`\\phi(\\cdot)`. Working with centered kernels is sometime\nexpected when dealing with algebra computation such as eigendecomposition\nfor :class:`~sklearn.decomposition.KernelPCA` for instance.\n\nRead more in the :ref:`User Guide `.", "docstring": "Center an arbitrary kernel matrix :math:`K`.\n\n Let define a kernel :math:`K` such that:\n\n .. math::\n K(X, Y) = \\phi(X) . \\phi(Y)^{T}\n\n :math:`\\phi(X)` is a function mapping of rows of :math:`X` to a\n Hilbert space and :math:`K` is of shape `(n_samples, n_samples)`.\n\n This class allows to compute :math:`\\tilde{K}(X, Y)` such that:\n\n .. math::\n \\tilde{K(X, Y)} = \\tilde{\\phi}(X) . \\tilde{\\phi}(Y)^{T}\n\n :math:`\\tilde{\\phi}(X)` is the centered mapped data in the Hilbert\n space.\n\n `KernelCenterer` centers the features without explicitly computing the\n mapping :math:`\\phi(\\cdot)`. Working with centered kernels is sometime\n expected when dealing with algebra computation such as eigendecomposition\n for :class:`~sklearn.decomposition.KernelPCA` for instance.\n\n Read more in the :ref:`User Guide `.\n\n Attributes\n ----------\n K_fit_rows_ : ndarray of shape (n_samples,)\n Average of each column of kernel matrix.\n\n K_fit_all_ : float\n Average of kernel matrix.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n sklearn.kernel_approximation.Nystroem : Approximate a kernel map\n using a subset of the training data.\n\n References\n ----------\n .. [1] `Sch\u00f6lkopf, Bernhard, Alexander Smola, and Klaus-Robert M\u00fcller.\n \"Nonlinear component analysis as a kernel eigenvalue problem.\"\n Neural computation 10.5 (1998): 1299-1319.\n `_\n\n Examples\n --------\n >>> from sklearn.preprocessing import KernelCenterer\n >>> from sklearn.metrics.pairwise import pairwise_kernels\n >>> X = [[ 1., -2., 2.],\n ... [ -2., 1., 3.],\n ... 
[ 4., 1., -2.]]\n >>> K = pairwise_kernels(X, metric='linear')\n >>> K\n array([[ 9., 2., -2.],\n [ 2., 14., -13.],\n [ -2., -13., 21.]])\n >>> transformer = KernelCenterer().fit(K)\n >>> transformer\n KernelCenterer()\n >>> transformer.transform(K)\n array([[ 5., 0., -5.],\n [ 0., 14., -14.],\n [ -5., -14., 19.]])\n ", "source_code": "\n\nclass KernelCenterer(TransformerMixin, BaseEstimator):\n \"\"\"Center an arbitrary kernel matrix :math:`K`.\n\n Let define a kernel :math:`K` such that:\n\n .. math::\n K(X, Y) = \\phi(X) . \\phi(Y)^{T}\n\n :math:`\\phi(X)` is a function mapping of rows of :math:`X` to a\n Hilbert space and :math:`K` is of shape `(n_samples, n_samples)`.\n\n This class allows to compute :math:`\\tilde{K}(X, Y)` such that:\n\n .. math::\n \\tilde{K(X, Y)} = \\tilde{\\phi}(X) . \\tilde{\\phi}(Y)^{T}\n\n :math:`\\tilde{\\phi}(X)` is the centered mapped data in the Hilbert\n space.\n\n `KernelCenterer` centers the features without explicitly computing the\n mapping :math:`\\phi(\\cdot)`. Working with centered kernels is sometime\n expected when dealing with algebra computation such as eigendecomposition\n for :class:`~sklearn.decomposition.KernelPCA` for instance.\n\n Read more in the :ref:`User Guide `.\n\n Attributes\n ----------\n K_fit_rows_ : ndarray of shape (n_samples,)\n Average of each column of kernel matrix.\n\n K_fit_all_ : float\n Average of kernel matrix.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n sklearn.kernel_approximation.Nystroem : Approximate a kernel map\n using a subset of the training data.\n\n References\n ----------\n .. [1] `Sch\u00f6lkopf, Bernhard, Alexander Smola, and Klaus-Robert M\u00fcller.\n \"Nonlinear component analysis as a kernel eigenvalue problem.\"\n Neural computation 10.5 (1998): 1299-1319.\n `_\n\n Examples\n --------\n >>> from sklearn.preprocessing import KernelCenterer\n >>> from sklearn.metrics.pairwise import pairwise_kernels\n >>> X = [[ 1., -2., 2.],\n ... [ -2., 1., 3.],\n ... [ 4., 1., -2.]]\n >>> K = pairwise_kernels(X, metric='linear')\n >>> K\n array([[ 9., 2., -2.],\n [ 2., 14., -13.],\n [ -2., -13., 21.]])\n >>> transformer = KernelCenterer().fit(K)\n >>> transformer\n KernelCenterer()\n >>> transformer.transform(K)\n array([[ 5., 0., -5.],\n [ 0., 14., -14.],\n [ -5., -14., 19.]])\n \"\"\"\n \n def __init__(self):\n pass\n \n def fit(self, K, y=None):\n \"\"\"Fit KernelCenterer.\n\n Parameters\n ----------\n K : ndarray of shape (n_samples, n_samples)\n Kernel matrix.\n\n y : None\n Ignored.\n\n Returns\n -------\n self : object\n Returns the instance itself.\n \"\"\"\n K = self._validate_data(K, dtype=FLOAT_DTYPES)\n if K.shape[0] != K.shape[1]:\n raise ValueError('Kernel matrix must be a square matrix. 
Input is a {}x{} matrix.'.format(K.shape[0], K.shape[1]))\n n_samples = K.shape[0]\n self.K_fit_rows_ = np.sum(K, axis=0) / n_samples\n self.K_fit_all_ = self.K_fit_rows_.sum() / n_samples\n return self\n \n def transform(self, K, copy=True):\n \"\"\"Center kernel matrix.\n\n Parameters\n ----------\n K : ndarray of shape (n_samples1, n_samples2)\n Kernel matrix.\n\n copy : bool, default=True\n Set to False to perform inplace computation.\n\n Returns\n -------\n K_new : ndarray of shape (n_samples1, n_samples2)\n Returns the instance itself.\n \"\"\"\n check_is_fitted(self)\n K = self._validate_data(K, copy=copy, dtype=FLOAT_DTYPES, reset=False)\n K_pred_cols = (np.sum(K, axis=1) / self.K_fit_rows_.shape[0])[:, np.newaxis]\n K -= self.K_fit_rows_\n K -= K_pred_cols\n K += self.K_fit_all_\n return K\n \n def _more_tags(self):\n return {'pairwise': True}\n \n @deprecated('Attribute `_pairwise` was deprecated in version 0.24 and will be removed in 1.1.')\n @property\n def _pairwise(self):\n return True\n" }, @@ -25852,7 +25948,7 @@ "sklearn.preprocessing._data.MaxAbsScaler._more_tags" ], "is_public": true, - "description": "Scale each feature by its maximum absolute value.\n\nThis estimator scales and translates each feature individually such that the maximal absolute value of each feature in the training set will be 1.0. It does not shift/center the data, and thus does not destroy any sparsity. This scaler can also be applied to sparse CSR or CSC matrices. .. versionadded:: 0.17", + "description": "Scale each feature by its maximum absolute value.\n\nThis estimator scales and translates each feature individually such\nthat the maximal absolute value of each feature in the\ntraining set will be 1.0. It does not shift/center the data, and\nthus does not destroy any sparsity.\n\nThis scaler can also be applied to sparse CSR or CSC matrices.\n\n.. versionadded:: 0.17", "docstring": "Scale each feature by its maximum absolute value.\n\n This estimator scales and translates each feature individually such\n that the maximal absolute value of each feature in the\n training set will be 1.0. It does not shift/center the data, and\n thus does not destroy any sparsity.\n\n This scaler can also be applied to sparse CSR or CSC matrices.\n\n .. versionadded:: 0.17\n\n Parameters\n ----------\n copy : bool, default=True\n Set to False to perform inplace scaling and avoid a copy (if the input\n is already a numpy array).\n\n Attributes\n ----------\n scale_ : ndarray of shape (n_features,)\n Per feature relative scaling of the data.\n\n .. versionadded:: 0.17\n *scale_* attribute.\n\n max_abs_ : ndarray of shape (n_features,)\n Per feature maximum absolute value.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n n_samples_seen_ : int\n The number of samples processed by the estimator. 
Will be reset on\n new calls to fit, but increments across ``partial_fit`` calls.\n\n See Also\n --------\n maxabs_scale : Equivalent function without the estimator API.\n\n Notes\n -----\n NaNs are treated as missing values: disregarded in fit, and maintained in\n transform.\n\n For a comparison of the different scalers, transformers, and normalizers,\n see :ref:`examples/preprocessing/plot_all_scaling.py\n `.\n\n Examples\n --------\n >>> from sklearn.preprocessing import MaxAbsScaler\n >>> X = [[ 1., -1., 2.],\n ... [ 2., 0., 0.],\n ... [ 0., 1., -1.]]\n >>> transformer = MaxAbsScaler().fit(X)\n >>> transformer\n MaxAbsScaler()\n >>> transformer.transform(X)\n array([[ 0.5, -1. , 1. ],\n [ 1. , 0. , 0. ],\n [ 0. , 1. , -0.5]])\n ", "source_code": "\n\nclass MaxAbsScaler(_OneToOneFeatureMixin, TransformerMixin, BaseEstimator):\n \"\"\"Scale each feature by its maximum absolute value.\n\n This estimator scales and translates each feature individually such\n that the maximal absolute value of each feature in the\n training set will be 1.0. It does not shift/center the data, and\n thus does not destroy any sparsity.\n\n This scaler can also be applied to sparse CSR or CSC matrices.\n\n .. versionadded:: 0.17\n\n Parameters\n ----------\n copy : bool, default=True\n Set to False to perform inplace scaling and avoid a copy (if the input\n is already a numpy array).\n\n Attributes\n ----------\n scale_ : ndarray of shape (n_features,)\n Per feature relative scaling of the data.\n\n .. versionadded:: 0.17\n *scale_* attribute.\n\n max_abs_ : ndarray of shape (n_features,)\n Per feature maximum absolute value.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n n_samples_seen_ : int\n The number of samples processed by the estimator. Will be reset on\n new calls to fit, but increments across ``partial_fit`` calls.\n\n See Also\n --------\n maxabs_scale : Equivalent function without the estimator API.\n\n Notes\n -----\n NaNs are treated as missing values: disregarded in fit, and maintained in\n transform.\n\n For a comparison of the different scalers, transformers, and normalizers,\n see :ref:`examples/preprocessing/plot_all_scaling.py\n `.\n\n Examples\n --------\n >>> from sklearn.preprocessing import MaxAbsScaler\n >>> X = [[ 1., -1., 2.],\n ... [ 2., 0., 0.],\n ... [ 0., 1., -1.]]\n >>> transformer = MaxAbsScaler().fit(X)\n >>> transformer\n MaxAbsScaler()\n >>> transformer.transform(X)\n array([[ 0.5, -1. , 1. ],\n [ 1. , 0. , 0. ],\n [ 0. , 1. 
, -0.5]])\n \"\"\"\n \n def __init__(self, *, copy=True):\n self.copy = copy\n \n def _reset(self):\n \"\"\"Reset internal data-dependent state of the scaler, if necessary.\n\n __init__ parameters are not touched.\n \"\"\"\n if hasattr(self, 'scale_'):\n del self.scale_\n del self.n_samples_seen_\n del self.max_abs_\n \n def fit(self, X, y=None):\n \"\"\"Compute the maximum absolute value to be used for later scaling.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The data used to compute the per-feature minimum and maximum\n used for later scaling along the features axis.\n\n y : None\n Ignored.\n\n Returns\n -------\n self : object\n Fitted scaler.\n \"\"\"\n self._reset()\n return self.partial_fit(X, y)\n \n def partial_fit(self, X, y=None):\n \"\"\"Online computation of max absolute value of X for later scaling.\n\n All of X is processed as a single batch. This is intended for cases\n when :meth:`fit` is not feasible due to very large number of\n `n_samples` or because X is read from a continuous stream.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The data used to compute the mean and standard deviation\n used for later scaling along the features axis.\n\n y : None\n Ignored.\n\n Returns\n -------\n self : object\n Fitted scaler.\n \"\"\"\n first_pass = not hasattr(self, 'n_samples_seen_')\n X = self._validate_data(X, reset=first_pass, accept_sparse=('csr', 'csc'), estimator=self, dtype=FLOAT_DTYPES, force_all_finite='allow-nan')\n if sparse.issparse(X):\n (mins, maxs) = min_max_axis(X, axis=0, ignore_nan=True)\n max_abs = np.maximum(np.abs(mins), np.abs(maxs))\n else:\n max_abs = np.nanmax(np.abs(X), axis=0)\n if first_pass:\n self.n_samples_seen_ = X.shape[0]\n else:\n max_abs = np.maximum(self.max_abs_, max_abs)\n self.n_samples_seen_ += X.shape[0]\n self.max_abs_ = max_abs\n self.scale_ = _handle_zeros_in_scale(max_abs, copy=True)\n return self\n \n def transform(self, X):\n \"\"\"Scale the data.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The data that should be scaled.\n\n Returns\n -------\n X_tr : {ndarray, sparse matrix} of shape (n_samples, n_features)\n Transformed array.\n \"\"\"\n check_is_fitted(self)\n X = self._validate_data(X, accept_sparse=('csr', 'csc'), copy=self.copy, reset=False, estimator=self, dtype=FLOAT_DTYPES, force_all_finite='allow-nan')\n if sparse.issparse(X):\n inplace_column_scale(X, 1.0 / self.scale_)\n else:\n X /= self.scale_\n return X\n \n def inverse_transform(self, X):\n \"\"\"Scale back the data to the original representation.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The data that should be transformed back.\n\n Returns\n -------\n X_tr : {ndarray, sparse matrix} of shape (n_samples, n_features)\n Transformed array.\n \"\"\"\n check_is_fitted(self)\n X = check_array(X, accept_sparse=('csr', 'csc'), copy=self.copy, estimator=self, dtype=FLOAT_DTYPES, force_all_finite='allow-nan')\n if sparse.issparse(X):\n inplace_column_scale(X, self.scale_)\n else:\n X *= self.scale_\n return X\n \n def _more_tags(self):\n return {'allow_nan': True}\n" }, @@ -25875,7 +25971,7 @@ "sklearn.preprocessing._data.MinMaxScaler._more_tags" ], "is_public": true, - "description": "Transform features by scaling each feature to a given range.\n\nThis estimator scales and translates each feature individually such that it is in the given range on the training set, 
e.g. between zero and one. The transformation is given by:: X_std = (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0)) X_scaled = X_std * (max - min) + min where min, max = feature_range. This transformation is often used as an alternative to zero mean, unit variance scaling. Read more in the :ref:`User Guide `.", + "description": "Transform features by scaling each feature to a given range.\n\nThis estimator scales and translates each feature individually such\nthat it is in the given range on the training set, e.g. between\nzero and one.\n\nThe transformation is given by::\n\n X_std = (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0))\n X_scaled = X_std * (max - min) + min\n\nwhere min, max = feature_range.\n\nThis transformation is often used as an alternative to zero mean,\nunit variance scaling.\n\nRead more in the :ref:`User Guide `.", "docstring": "Transform features by scaling each feature to a given range.\n\n This estimator scales and translates each feature individually such\n that it is in the given range on the training set, e.g. between\n zero and one.\n\n The transformation is given by::\n\n X_std = (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0))\n X_scaled = X_std * (max - min) + min\n\n where min, max = feature_range.\n\n This transformation is often used as an alternative to zero mean,\n unit variance scaling.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n feature_range : tuple (min, max), default=(0, 1)\n Desired range of transformed data.\n\n copy : bool, default=True\n Set to False to perform inplace row normalization and avoid a\n copy (if the input is already a numpy array).\n\n clip : bool, default=False\n Set to True to clip transformed values of held-out data to\n provided `feature range`.\n\n .. versionadded:: 0.24\n\n Attributes\n ----------\n min_ : ndarray of shape (n_features,)\n Per feature adjustment for minimum. Equivalent to\n ``min - X.min(axis=0) * self.scale_``\n\n scale_ : ndarray of shape (n_features,)\n Per feature relative scaling of the data. Equivalent to\n ``(max - min) / (X.max(axis=0) - X.min(axis=0))``\n\n .. versionadded:: 0.17\n *scale_* attribute.\n\n data_min_ : ndarray of shape (n_features,)\n Per feature minimum seen in the data\n\n .. versionadded:: 0.17\n *data_min_*\n\n data_max_ : ndarray of shape (n_features,)\n Per feature maximum seen in the data\n\n .. versionadded:: 0.17\n *data_max_*\n\n data_range_ : ndarray of shape (n_features,)\n Per feature range ``(data_max_ - data_min_)`` seen in the data\n\n .. versionadded:: 0.17\n *data_range_*\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n n_samples_seen_ : int\n The number of samples processed by the estimator.\n It will be reset on new calls to fit, but increments across\n ``partial_fit`` calls.\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. 
versionadded:: 1.0\n\n See Also\n --------\n minmax_scale : Equivalent function without the estimator API.\n\n Notes\n -----\n NaNs are treated as missing values: disregarded in fit, and maintained in\n transform.\n\n For a comparison of the different scalers, transformers, and normalizers,\n see :ref:`examples/preprocessing/plot_all_scaling.py\n `.\n\n Examples\n --------\n >>> from sklearn.preprocessing import MinMaxScaler\n >>> data = [[-1, 2], [-0.5, 6], [0, 10], [1, 18]]\n >>> scaler = MinMaxScaler()\n >>> print(scaler.fit(data))\n MinMaxScaler()\n >>> print(scaler.data_max_)\n [ 1. 18.]\n >>> print(scaler.transform(data))\n [[0. 0. ]\n [0.25 0.25]\n [0.5 0.5 ]\n [1. 1. ]]\n >>> print(scaler.transform([[2, 2]]))\n [[1.5 0. ]]\n ", "source_code": "\n\nclass MinMaxScaler(_OneToOneFeatureMixin, TransformerMixin, BaseEstimator):\n \"\"\"Transform features by scaling each feature to a given range.\n\n This estimator scales and translates each feature individually such\n that it is in the given range on the training set, e.g. between\n zero and one.\n\n The transformation is given by::\n\n X_std = (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0))\n X_scaled = X_std * (max - min) + min\n\n where min, max = feature_range.\n\n This transformation is often used as an alternative to zero mean,\n unit variance scaling.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n feature_range : tuple (min, max), default=(0, 1)\n Desired range of transformed data.\n\n copy : bool, default=True\n Set to False to perform inplace row normalization and avoid a\n copy (if the input is already a numpy array).\n\n clip : bool, default=False\n Set to True to clip transformed values of held-out data to\n provided `feature range`.\n\n .. versionadded:: 0.24\n\n Attributes\n ----------\n min_ : ndarray of shape (n_features,)\n Per feature adjustment for minimum. Equivalent to\n ``min - X.min(axis=0) * self.scale_``\n\n scale_ : ndarray of shape (n_features,)\n Per feature relative scaling of the data. Equivalent to\n ``(max - min) / (X.max(axis=0) - X.min(axis=0))``\n\n .. versionadded:: 0.17\n *scale_* attribute.\n\n data_min_ : ndarray of shape (n_features,)\n Per feature minimum seen in the data\n\n .. versionadded:: 0.17\n *data_min_*\n\n data_max_ : ndarray of shape (n_features,)\n Per feature maximum seen in the data\n\n .. versionadded:: 0.17\n *data_max_*\n\n data_range_ : ndarray of shape (n_features,)\n Per feature range ``(data_max_ - data_min_)`` seen in the data\n\n .. versionadded:: 0.17\n *data_range_*\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n n_samples_seen_ : int\n The number of samples processed by the estimator.\n It will be reset on new calls to fit, but increments across\n ``partial_fit`` calls.\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. 
versionadded:: 1.0\n\n See Also\n --------\n minmax_scale : Equivalent function without the estimator API.\n\n Notes\n -----\n NaNs are treated as missing values: disregarded in fit, and maintained in\n transform.\n\n For a comparison of the different scalers, transformers, and normalizers,\n see :ref:`examples/preprocessing/plot_all_scaling.py\n `.\n\n Examples\n --------\n >>> from sklearn.preprocessing import MinMaxScaler\n >>> data = [[-1, 2], [-0.5, 6], [0, 10], [1, 18]]\n >>> scaler = MinMaxScaler()\n >>> print(scaler.fit(data))\n MinMaxScaler()\n >>> print(scaler.data_max_)\n [ 1. 18.]\n >>> print(scaler.transform(data))\n [[0. 0. ]\n [0.25 0.25]\n [0.5 0.5 ]\n [1. 1. ]]\n >>> print(scaler.transform([[2, 2]]))\n [[1.5 0. ]]\n \"\"\"\n \n def __init__(self, feature_range=(0, 1), *, copy=True, clip=False):\n self.feature_range = feature_range\n self.copy = copy\n self.clip = clip\n \n def _reset(self):\n \"\"\"Reset internal data-dependent state of the scaler, if necessary.\n\n __init__ parameters are not touched.\n \"\"\"\n if hasattr(self, 'scale_'):\n del self.scale_\n del self.min_\n del self.n_samples_seen_\n del self.data_min_\n del self.data_max_\n del self.data_range_\n \n def fit(self, X, y=None):\n \"\"\"Compute the minimum and maximum to be used for later scaling.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The data used to compute the per-feature minimum and maximum\n used for later scaling along the features axis.\n\n y : None\n Ignored.\n\n Returns\n -------\n self : object\n Fitted scaler.\n \"\"\"\n self._reset()\n return self.partial_fit(X, y)\n \n def partial_fit(self, X, y=None):\n \"\"\"Online computation of min and max on X for later scaling.\n\n All of X is processed as a single batch. This is intended for cases\n when :meth:`fit` is not feasible due to very large number of\n `n_samples` or because X is read from a continuous stream.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The data used to compute the mean and standard deviation\n used for later scaling along the features axis.\n\n y : None\n Ignored.\n\n Returns\n -------\n self : object\n Fitted scaler.\n \"\"\"\n feature_range = self.feature_range\n if feature_range[0] >= feature_range[1]:\n raise ValueError('Minimum of desired feature range must be smaller than maximum. Got %s.' % str(feature_range))\n if sparse.issparse(X):\n raise TypeError('MinMaxScaler does not support sparse input. 
Consider using MaxAbsScaler instead.')\n first_pass = not hasattr(self, 'n_samples_seen_')\n X = self._validate_data(X, reset=first_pass, estimator=self, dtype=FLOAT_DTYPES, force_all_finite='allow-nan')\n data_min = np.nanmin(X, axis=0)\n data_max = np.nanmax(X, axis=0)\n if first_pass:\n self.n_samples_seen_ = X.shape[0]\n else:\n data_min = np.minimum(self.data_min_, data_min)\n data_max = np.maximum(self.data_max_, data_max)\n self.n_samples_seen_ += X.shape[0]\n data_range = data_max - data_min\n self.scale_ = (feature_range[1] - feature_range[0]) / _handle_zeros_in_scale(data_range, copy=True)\n self.min_ = feature_range[0] - data_min * self.scale_\n self.data_min_ = data_min\n self.data_max_ = data_max\n self.data_range_ = data_range\n return self\n \n def transform(self, X):\n \"\"\"Scale features of X according to feature_range.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Input data that will be transformed.\n\n Returns\n -------\n Xt : ndarray of shape (n_samples, n_features)\n Transformed data.\n \"\"\"\n check_is_fitted(self)\n X = self._validate_data(X, copy=self.copy, dtype=FLOAT_DTYPES, force_all_finite='allow-nan', reset=False)\n X *= self.scale_\n X += self.min_\n if self.clip:\n np.clip(X, self.feature_range[0], self.feature_range[1], out=X)\n return X\n \n def inverse_transform(self, X):\n \"\"\"Undo the scaling of X according to feature_range.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Input data that will be transformed. It cannot be sparse.\n\n Returns\n -------\n Xt : ndarray of shape (n_samples, n_features)\n Transformed data.\n \"\"\"\n check_is_fitted(self)\n X = check_array(X, copy=self.copy, dtype=FLOAT_DTYPES, force_all_finite='allow-nan')\n X -= self.min_\n X /= self.scale_\n return X\n \n def _more_tags(self):\n return {'allow_nan': True}\n" }, @@ -25891,7 +25987,7 @@ "sklearn.preprocessing._data.Normalizer._more_tags" ], "is_public": true, - "description": "Normalize samples individually to unit norm.\n\nEach sample (i.e. each row of the data matrix) with at least one non zero component is rescaled independently of other samples so that its norm (l1, l2 or inf) equals one. This transformer is able to work both with dense numpy arrays and scipy.sparse matrix (use CSR format if you want to avoid the burden of a copy / conversion). Scaling inputs to unit norms is a common operation for text classification or clustering for instance. For instance the dot product of two l2-normalized TF-IDF vectors is the cosine similarity of the vectors and is the base similarity metric for the Vector Space Model commonly used by the Information Retrieval community. Read more in the :ref:`User Guide `.", + "description": "Normalize samples individually to unit norm.\n\nEach sample (i.e. each row of the data matrix) with at least one\nnon zero component is rescaled independently of other samples so\nthat its norm (l1, l2 or inf) equals one.\n\nThis transformer is able to work both with dense numpy arrays and\nscipy.sparse matrix (use CSR format if you want to avoid the burden of\na copy / conversion).\n\nScaling inputs to unit norms is a common operation for text\nclassification or clustering for instance. 
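A minimal sketch of this per-row rescaling, assuming only NumPy and scikit-learn; the matrix below is an arbitrary illustrative choice:

import numpy as np
from sklearn.preprocessing import Normalizer

X = np.array([[4.0, 1.0, 2.0, 2.0],
              [1.0, 3.0, 9.0, 3.0],
              [5.0, 7.0, 5.0, 1.0]])

# Each row is divided by its own l2 norm, independently of the other rows;
# fit is a no-op because the estimator is stateless.
Xn = Normalizer(norm="l2").fit_transform(X)
print(np.linalg.norm(Xn, axis=1))  # ~ [1. 1. 1.]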
For instance the dot\nproduct of two l2-normalized TF-IDF vectors is the cosine similarity\nof the vectors and is the base similarity metric for the Vector\nSpace Model commonly used by the Information Retrieval community.\n\nRead more in the :ref:`User Guide `.", "docstring": "Normalize samples individually to unit norm.\n\n Each sample (i.e. each row of the data matrix) with at least one\n non zero component is rescaled independently of other samples so\n that its norm (l1, l2 or inf) equals one.\n\n This transformer is able to work both with dense numpy arrays and\n scipy.sparse matrix (use CSR format if you want to avoid the burden of\n a copy / conversion).\n\n Scaling inputs to unit norms is a common operation for text\n classification or clustering for instance. For instance the dot\n product of two l2-normalized TF-IDF vectors is the cosine similarity\n of the vectors and is the base similarity metric for the Vector\n Space Model commonly used by the Information Retrieval community.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n norm : {'l1', 'l2', 'max'}, default='l2'\n The norm to use to normalize each non zero sample. If norm='max'\n is used, values will be rescaled by the maximum of the absolute\n values.\n\n copy : bool, default=True\n Set to False to perform inplace row normalization and avoid a\n copy (if the input is already a numpy array or a scipy.sparse\n CSR matrix).\n\n Attributes\n ----------\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n normalize : Equivalent function without the estimator API.\n\n Notes\n -----\n This estimator is stateless (besides constructor parameters), the\n fit method does nothing but is useful when used in a pipeline.\n\n For a comparison of the different scalers, transformers, and normalizers,\n see :ref:`examples/preprocessing/plot_all_scaling.py\n `.\n\n Examples\n --------\n >>> from sklearn.preprocessing import Normalizer\n >>> X = [[4, 1, 2, 2],\n ... [1, 3, 9, 3],\n ... [5, 7, 5, 1]]\n >>> transformer = Normalizer().fit(X) # fit does nothing.\n >>> transformer\n Normalizer()\n >>> transformer.transform(X)\n array([[0.8, 0.2, 0.4, 0.4],\n [0.1, 0.3, 0.9, 0.3],\n [0.5, 0.7, 0.5, 0.1]])\n ", "source_code": "\n\nclass Normalizer(TransformerMixin, BaseEstimator):\n \"\"\"Normalize samples individually to unit norm.\n\n Each sample (i.e. each row of the data matrix) with at least one\n non zero component is rescaled independently of other samples so\n that its norm (l1, l2 or inf) equals one.\n\n This transformer is able to work both with dense numpy arrays and\n scipy.sparse matrix (use CSR format if you want to avoid the burden of\n a copy / conversion).\n\n Scaling inputs to unit norms is a common operation for text\n classification or clustering for instance. For instance the dot\n product of two l2-normalized TF-IDF vectors is the cosine similarity\n of the vectors and is the base similarity metric for the Vector\n Space Model commonly used by the Information Retrieval community.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n norm : {'l1', 'l2', 'max'}, default='l2'\n The norm to use to normalize each non zero sample. 
If norm='max'\n is used, values will be rescaled by the maximum of the absolute\n values.\n\n copy : bool, default=True\n Set to False to perform inplace row normalization and avoid a\n copy (if the input is already a numpy array or a scipy.sparse\n CSR matrix).\n\n Attributes\n ----------\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n normalize : Equivalent function without the estimator API.\n\n Notes\n -----\n This estimator is stateless (besides constructor parameters), the\n fit method does nothing but is useful when used in a pipeline.\n\n For a comparison of the different scalers, transformers, and normalizers,\n see :ref:`examples/preprocessing/plot_all_scaling.py\n `.\n\n Examples\n --------\n >>> from sklearn.preprocessing import Normalizer\n >>> X = [[4, 1, 2, 2],\n ... [1, 3, 9, 3],\n ... [5, 7, 5, 1]]\n >>> transformer = Normalizer().fit(X) # fit does nothing.\n >>> transformer\n Normalizer()\n >>> transformer.transform(X)\n array([[0.8, 0.2, 0.4, 0.4],\n [0.1, 0.3, 0.9, 0.3],\n [0.5, 0.7, 0.5, 0.1]])\n \"\"\"\n \n def __init__(self, norm='l2', *, copy=True):\n self.norm = norm\n self.copy = copy\n \n def fit(self, X, y=None):\n \"\"\"Do nothing and return the estimator unchanged.\n\n This method is just there to implement the usual API and hence\n work in pipelines.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The data to estimate the normalization parameters.\n\n y : Ignored\n Not used, present here for API consistency by convention.\n\n Returns\n -------\n self : object\n Fitted transformer.\n \"\"\"\n self._validate_data(X, accept_sparse='csr')\n return self\n \n def transform(self, X, copy=None):\n \"\"\"Scale each non zero row of X to unit norm.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The data to normalize, row by row. scipy.sparse matrices should be\n in CSR format to avoid an un-necessary copy.\n\n copy : bool, default=None\n Copy the input X or not.\n\n Returns\n -------\n X_tr : {ndarray, sparse matrix} of shape (n_samples, n_features)\n Transformed array.\n \"\"\"\n copy = copy if copy is not None else self.copy\n X = self._validate_data(X, accept_sparse='csr', reset=False)\n return normalize(X, norm=self.norm, axis=1, copy=copy)\n \n def _more_tags(self):\n return {'stateless': True}\n" }, @@ -25920,7 +26016,7 @@ "sklearn.preprocessing._data.PowerTransformer._more_tags" ], "is_public": true, - "description": "Apply a power transform featurewise to make data more Gaussian-like.\n\nPower transforms are a family of parametric, monotonic transformations that are applied to make data more Gaussian-like. This is useful for modeling issues related to heteroscedasticity (non-constant variance), or other situations where normality is desired. Currently, PowerTransformer supports the Box-Cox transform and the Yeo-Johnson transform. The optimal parameter for stabilizing variance and minimizing skewness is estimated through maximum likelihood. Box-Cox requires input data to be strictly positive, while Yeo-Johnson supports both positive or negative data. By default, zero-mean, unit-variance normalization is applied to the transformed data. Read more in the :ref:`User Guide `. .. 
versionadded:: 0.20", + "description": "Apply a power transform featurewise to make data more Gaussian-like.\n\nPower transforms are a family of parametric, monotonic transformations\nthat are applied to make data more Gaussian-like. This is useful for\nmodeling issues related to heteroscedasticity (non-constant variance),\nor other situations where normality is desired.\n\nCurrently, PowerTransformer supports the Box-Cox transform and the\nYeo-Johnson transform. The optimal parameter for stabilizing variance and\nminimizing skewness is estimated through maximum likelihood.\n\nBox-Cox requires input data to be strictly positive, while Yeo-Johnson\nsupports both positive or negative data.\n\nBy default, zero-mean, unit-variance normalization is applied to the\ntransformed data.\n\nRead more in the :ref:`User Guide `.\n\n.. versionadded:: 0.20", "docstring": "Apply a power transform featurewise to make data more Gaussian-like.\n\n Power transforms are a family of parametric, monotonic transformations\n that are applied to make data more Gaussian-like. This is useful for\n modeling issues related to heteroscedasticity (non-constant variance),\n or other situations where normality is desired.\n\n Currently, PowerTransformer supports the Box-Cox transform and the\n Yeo-Johnson transform. The optimal parameter for stabilizing variance and\n minimizing skewness is estimated through maximum likelihood.\n\n Box-Cox requires input data to be strictly positive, while Yeo-Johnson\n supports both positive or negative data.\n\n By default, zero-mean, unit-variance normalization is applied to the\n transformed data.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.20\n\n Parameters\n ----------\n method : {'yeo-johnson', 'box-cox'}, default='yeo-johnson'\n The power transform method. Available methods are:\n\n - 'yeo-johnson' [1]_, works with positive and negative values\n - 'box-cox' [2]_, only works with strictly positive values\n\n standardize : bool, default=True\n Set to True to apply zero-mean, unit-variance normalization to the\n transformed output.\n\n copy : bool, default=True\n Set to False to perform inplace computation during transformation.\n\n Attributes\n ----------\n lambdas_ : ndarray of float of shape (n_features,)\n The parameters of the power transformation for the selected features.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n power_transform : Equivalent function without the estimator API.\n\n QuantileTransformer : Maps data to a standard normal distribution with\n the parameter `output_distribution='normal'`.\n\n Notes\n -----\n NaNs are treated as missing values: disregarded in ``fit``, and maintained\n in ``transform``.\n\n For a comparison of the different scalers, transformers, and normalizers,\n see :ref:`examples/preprocessing/plot_all_scaling.py\n `.\n\n References\n ----------\n\n .. [1] I.K. Yeo and R.A. Johnson, \"A new family of power transformations to\n improve normality or symmetry.\" Biometrika, 87(4), pp.954-959,\n (2000).\n\n .. [2] G.E.P. Box and D.R. 
Cox, \"An Analysis of Transformations\", Journal\n of the Royal Statistical Society B, 26, 211-252 (1964).\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.preprocessing import PowerTransformer\n >>> pt = PowerTransformer()\n >>> data = [[1, 2], [3, 2], [4, 5]]\n >>> print(pt.fit(data))\n PowerTransformer()\n >>> print(pt.lambdas_)\n [ 1.386... -3.100...]\n >>> print(pt.transform(data))\n [[-1.316... -0.707...]\n [ 0.209... -0.707...]\n [ 1.106... 1.414...]]\n ", "source_code": "\n\nclass PowerTransformer(_OneToOneFeatureMixin, TransformerMixin, BaseEstimator):\n \"\"\"Apply a power transform featurewise to make data more Gaussian-like.\n\n Power transforms are a family of parametric, monotonic transformations\n that are applied to make data more Gaussian-like. This is useful for\n modeling issues related to heteroscedasticity (non-constant variance),\n or other situations where normality is desired.\n\n Currently, PowerTransformer supports the Box-Cox transform and the\n Yeo-Johnson transform. The optimal parameter for stabilizing variance and\n minimizing skewness is estimated through maximum likelihood.\n\n Box-Cox requires input data to be strictly positive, while Yeo-Johnson\n supports both positive or negative data.\n\n By default, zero-mean, unit-variance normalization is applied to the\n transformed data.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.20\n\n Parameters\n ----------\n method : {'yeo-johnson', 'box-cox'}, default='yeo-johnson'\n The power transform method. Available methods are:\n\n - 'yeo-johnson' [1]_, works with positive and negative values\n - 'box-cox' [2]_, only works with strictly positive values\n\n standardize : bool, default=True\n Set to True to apply zero-mean, unit-variance normalization to the\n transformed output.\n\n copy : bool, default=True\n Set to False to perform inplace computation during transformation.\n\n Attributes\n ----------\n lambdas_ : ndarray of float of shape (n_features,)\n The parameters of the power transformation for the selected features.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n power_transform : Equivalent function without the estimator API.\n\n QuantileTransformer : Maps data to a standard normal distribution with\n the parameter `output_distribution='normal'`.\n\n Notes\n -----\n NaNs are treated as missing values: disregarded in ``fit``, and maintained\n in ``transform``.\n\n For a comparison of the different scalers, transformers, and normalizers,\n see :ref:`examples/preprocessing/plot_all_scaling.py\n `.\n\n References\n ----------\n\n .. [1] I.K. Yeo and R.A. Johnson, \"A new family of power transformations to\n improve normality or symmetry.\" Biometrika, 87(4), pp.954-959,\n (2000).\n\n .. [2] G.E.P. Box and D.R. Cox, \"An Analysis of Transformations\", Journal\n of the Royal Statistical Society B, 26, 211-252 (1964).\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.preprocessing import PowerTransformer\n >>> pt = PowerTransformer()\n >>> data = [[1, 2], [3, 2], [4, 5]]\n >>> print(pt.fit(data))\n PowerTransformer()\n >>> print(pt.lambdas_)\n [ 1.386... -3.100...]\n >>> print(pt.transform(data))\n [[-1.316... -0.707...]\n [ 0.209... -0.707...]\n [ 1.106... 
1.414...]]\n \"\"\"\n \n def __init__(self, method='yeo-johnson', *, standardize=True, copy=True):\n self.method = method\n self.standardize = standardize\n self.copy = copy\n \n def fit(self, X, y=None):\n \"\"\"Estimate the optimal parameter lambda for each feature.\n\n The optimal lambda parameter for minimizing skewness is estimated on\n each feature independently using maximum likelihood.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The data used to estimate the optimal transformation parameters.\n\n y : None\n Ignored.\n\n Returns\n -------\n self : object\n Fitted transformer.\n \"\"\"\n self._fit(X, y=y, force_transform=False)\n return self\n \n def fit_transform(self, X, y=None):\n \"\"\"Fit `PowerTransformer` to `X`, then transform `X`.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The data used to estimate the optimal transformation parameters\n and to be transformed using a power transformation.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n Returns\n -------\n X_new : ndarray of shape (n_samples, n_features)\n Transformed data.\n \"\"\"\n return self._fit(X, y, force_transform=True)\n \n def _fit(self, X, y=None, force_transform=False):\n X = self._check_input(X, in_fit=True, check_positive=True, check_method=True)\n if not self.copy and not force_transform:\n X = X.copy()\n optim_function = {'box-cox': self._box_cox_optimize, 'yeo-johnson': self._yeo_johnson_optimize}[self.method]\n with np.errstate(invalid='ignore'):\n self.lambdas_ = np.array([optim_function(col) for col in X.T])\n if self.standardize or force_transform:\n transform_function = {'box-cox': boxcox, 'yeo-johnson': self._yeo_johnson_transform}[self.method]\n for (i, lmbda) in enumerate(self.lambdas_):\n with np.errstate(invalid='ignore'):\n X[:, i] = transform_function(X[:, i], lmbda)\n if self.standardize:\n self._scaler = StandardScaler(copy=False)\n if force_transform:\n X = self._scaler.fit_transform(X)\n else:\n self._scaler.fit(X)\n return X\n \n def transform(self, X):\n \"\"\"Apply the power transform to each feature using the fitted lambdas.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The data to be transformed using a power transformation.\n\n Returns\n -------\n X_trans : ndarray of shape (n_samples, n_features)\n The transformed data.\n \"\"\"\n check_is_fitted(self)\n X = self._check_input(X, in_fit=False, check_positive=True, check_shape=True)\n transform_function = {'box-cox': boxcox, 'yeo-johnson': self._yeo_johnson_transform}[self.method]\n for (i, lmbda) in enumerate(self.lambdas_):\n with np.errstate(invalid='ignore'):\n X[:, i] = transform_function(X[:, i], lmbda)\n if self.standardize:\n X = self._scaler.transform(X)\n return X\n \n def inverse_transform(self, X):\n \"\"\"Apply the inverse power transformation using the fitted lambdas.\n\n The inverse of the Box-Cox transformation is given by::\n\n if lambda_ == 0:\n X = exp(X_trans)\n else:\n X = (X_trans * lambda_ + 1) ** (1 / lambda_)\n\n The inverse of the Yeo-Johnson transformation is given by::\n\n if X >= 0 and lambda_ == 0:\n X = exp(X_trans) - 1\n elif X >= 0 and lambda_ != 0:\n X = (X_trans * lambda_ + 1) ** (1 / lambda_) - 1\n elif X < 0 and lambda_ != 2:\n X = 1 - (-(2 - lambda_) * X_trans + 1) ** (1 / (2 - lambda_))\n elif X < 0 and lambda_ == 2:\n X = 1 - exp(-X_trans)\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The transformed data.\n\n Returns\n -------\n X : 
ndarray of shape (n_samples, n_features)\n The original data.\n \"\"\"\n check_is_fitted(self)\n X = self._check_input(X, in_fit=False, check_shape=True)\n if self.standardize:\n X = self._scaler.inverse_transform(X)\n inv_fun = {'box-cox': self._box_cox_inverse_tranform, 'yeo-johnson': self._yeo_johnson_inverse_transform}[self.method]\n for (i, lmbda) in enumerate(self.lambdas_):\n with np.errstate(invalid='ignore'):\n X[:, i] = inv_fun(X[:, i], lmbda)\n return X\n \n def _box_cox_inverse_tranform(self, x, lmbda):\n \"\"\"Return inverse-transformed input x following Box-Cox inverse\n transform with parameter lambda.\n \"\"\"\n if lmbda == 0:\n x_inv = np.exp(x)\n else:\n x_inv = (x * lmbda + 1)**(1 / lmbda)\n return x_inv\n \n def _yeo_johnson_inverse_transform(self, x, lmbda):\n \"\"\"Return inverse-transformed input x following Yeo-Johnson inverse\n transform with parameter lambda.\n \"\"\"\n x_inv = np.zeros_like(x)\n pos = x >= 0\n if abs(lmbda) < np.spacing(1.0):\n x_inv[pos] = np.exp(x[pos]) - 1\n else:\n x_inv[pos] = np.power(x[pos] * lmbda + 1, 1 / lmbda) - 1\n if abs(lmbda - 2) > np.spacing(1.0):\n x_inv[~pos] = 1 - np.power(-(2 - lmbda) * x[~pos] + 1, 1 / (2 - lmbda))\n else:\n x_inv[~pos] = 1 - np.exp(-x[~pos])\n return x_inv\n \n def _yeo_johnson_transform(self, x, lmbda):\n \"\"\"Return transformed input x following Yeo-Johnson transform with\n parameter lambda.\n \"\"\"\n out = np.zeros_like(x)\n pos = x >= 0\n if abs(lmbda) < np.spacing(1.0):\n out[pos] = np.log1p(x[pos])\n else:\n out[pos] = (np.power(x[pos] + 1, lmbda) - 1) / lmbda\n if abs(lmbda - 2) > np.spacing(1.0):\n out[~pos] = -(np.power(-x[~pos] + 1, 2 - lmbda) - 1) / (2 - lmbda)\n else:\n out[~pos] = -np.log1p(-x[~pos])\n return out\n \n def _box_cox_optimize(self, x):\n \"\"\"Find and return optimal lambda parameter of the Box-Cox transform by\n MLE, for observed data x.\n\n We here use scipy builtins which uses the brent optimizer.\n \"\"\"\n (_, lmbda) = stats.boxcox(x[~np.isnan(x)], lmbda=None)\n return lmbda\n \n def _yeo_johnson_optimize(self, x):\n \"\"\"Find and return optimal lambda parameter of the Yeo-Johnson\n transform by MLE, for observed data x.\n\n Like for Box-Cox, MLE is done via the brent optimizer.\n \"\"\"\n \n def _neg_log_likelihood(lmbda):\n \"\"\"Return the negative log likelihood of the observed data x as a\n function of lambda.\"\"\"\n x_trans = self._yeo_johnson_transform(x, lmbda)\n n_samples = x.shape[0]\n loglike = -n_samples / 2 * np.log(x_trans.var())\n loglike += (lmbda - 1) * (np.sign(x) * np.log1p(np.abs(x))).sum()\n return -loglike\n x = x[~np.isnan(x)]\n return optimize.brent(_neg_log_likelihood, brack=(-2, 2))\n \n def _check_input(self, X, in_fit, check_positive=False, check_shape=False, check_method=False):\n \"\"\"Validate the input before fit and transform.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n\n in_fit : bool\n Whether or not `_check_input` is called from `fit` or other\n methods, e.g. 
`predict`, `transform`, etc.\n\n check_positive : bool, default=False\n If True, check that all data is positive and non-zero (only if\n ``self.method=='box-cox'``).\n\n check_shape : bool, default=False\n If True, check that n_features matches the length of self.lambdas_\n\n check_method : bool, default=False\n If True, check that the transformation method is valid.\n \"\"\"\n X = self._validate_data(X, ensure_2d=True, dtype=FLOAT_DTYPES, copy=self.copy, force_all_finite='allow-nan', reset=in_fit)\n with np.warnings.catch_warnings():\n np.warnings.filterwarnings('ignore', 'All-NaN (slice|axis) encountered')\n if check_positive and self.method == 'box-cox' and np.nanmin(X) <= 0:\n raise ValueError('The Box-Cox transformation can only be applied to strictly positive data')\n if check_shape and not X.shape[1] == len(self.lambdas_):\n raise ValueError('Input data has a different number of features than fitting data. Should have {n}, data has {m}'.format(n=len(self.lambdas_), m=X.shape[1]))\n valid_methods = ('box-cox', 'yeo-johnson')\n if check_method and self.method not in valid_methods:\n raise ValueError(\"'method' must be one of {}, got {} instead.\".format(valid_methods, self.method))\n return X\n \n def _more_tags(self):\n return {'allow_nan': True}\n" }, @@ -25946,7 +26042,7 @@ "sklearn.preprocessing._data.QuantileTransformer._more_tags" ], "is_public": true, - "description": "Transform features using quantiles information.\n\nThis method transforms the features to follow a uniform or a normal distribution. Therefore, for a given feature, this transformation tends to spread out the most frequent values. It also reduces the impact of (marginal) outliers: this is therefore a robust preprocessing scheme. The transformation is applied on each feature independently. First an estimate of the cumulative distribution function of a feature is used to map the original values to a uniform distribution. The obtained values are then mapped to the desired output distribution using the associated quantile function. Features values of new/unseen data that fall below or above the fitted range will be mapped to the bounds of the output distribution. Note that this transform is non-linear. It may distort linear correlations between variables measured at the same scale but renders variables measured at different scales more directly comparable. Read more in the :ref:`User Guide `. .. versionadded:: 0.19", + "description": "Transform features using quantiles information.\n\nThis method transforms the features to follow a uniform or a normal\ndistribution. Therefore, for a given feature, this transformation tends\nto spread out the most frequent values. It also reduces the impact of\n(marginal) outliers: this is therefore a robust preprocessing scheme.\n\nThe transformation is applied on each feature independently. First an\nestimate of the cumulative distribution function of a feature is\nused to map the original values to a uniform distribution. The obtained\nvalues are then mapped to the desired output distribution using the\nassociated quantile function. Features values of new/unseen data that fall\nbelow or above the fitted range will be mapped to the bounds of the output\ndistribution. Note that this transform is non-linear. It may distort linear\ncorrelations between variables measured at the same scale but renders\nvariables measured at different scales more directly comparable.\n\nRead more in the :ref:`User Guide `.\n\n.. 
versionadded:: 0.19", "docstring": "Transform features using quantiles information.\n\n This method transforms the features to follow a uniform or a normal\n distribution. Therefore, for a given feature, this transformation tends\n to spread out the most frequent values. It also reduces the impact of\n (marginal) outliers: this is therefore a robust preprocessing scheme.\n\n The transformation is applied on each feature independently. First an\n estimate of the cumulative distribution function of a feature is\n used to map the original values to a uniform distribution. The obtained\n values are then mapped to the desired output distribution using the\n associated quantile function. Features values of new/unseen data that fall\n below or above the fitted range will be mapped to the bounds of the output\n distribution. Note that this transform is non-linear. It may distort linear\n correlations between variables measured at the same scale but renders\n variables measured at different scales more directly comparable.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.19\n\n Parameters\n ----------\n n_quantiles : int, default=1000 or n_samples\n Number of quantiles to be computed. It corresponds to the number\n of landmarks used to discretize the cumulative distribution function.\n If n_quantiles is larger than the number of samples, n_quantiles is set\n to the number of samples as a larger number of quantiles does not give\n a better approximation of the cumulative distribution function\n estimator.\n\n output_distribution : {'uniform', 'normal'}, default='uniform'\n Marginal distribution for the transformed data. The choices are\n 'uniform' (default) or 'normal'.\n\n ignore_implicit_zeros : bool, default=False\n Only applies to sparse matrices. If True, the sparse entries of the\n matrix are discarded to compute the quantile statistics. If False,\n these entries are treated as zeros.\n\n subsample : int, default=1e5\n Maximum number of samples used to estimate the quantiles for\n computational efficiency. Note that the subsampling procedure may\n differ for value-identical sparse and dense matrices.\n\n random_state : int, RandomState instance or None, default=None\n Determines random number generation for subsampling and smoothing\n noise.\n Please see ``subsample`` for more details.\n Pass an int for reproducible results across multiple function calls.\n See :term:`Glossary `.\n\n copy : bool, default=True\n Set to False to perform inplace transformation and avoid a copy (if the\n input is already a numpy array).\n\n Attributes\n ----------\n n_quantiles_ : int\n The actual number of quantiles used to discretize the cumulative\n distribution function.\n\n quantiles_ : ndarray of shape (n_quantiles, n_features)\n The values corresponding the quantiles of reference.\n\n references_ : ndarray of shape (n_quantiles, )\n Quantiles of references.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. 
versionadded:: 1.0\n\n See Also\n --------\n quantile_transform : Equivalent function without the estimator API.\n PowerTransformer : Perform mapping to a normal distribution using a power\n transform.\n StandardScaler : Perform standardization that is faster, but less robust\n to outliers.\n RobustScaler : Perform robust standardization that removes the influence\n of outliers but does not put outliers and inliers on the same scale.\n\n Notes\n -----\n NaNs are treated as missing values: disregarded in fit, and maintained in\n transform.\n\n For a comparison of the different scalers, transformers, and normalizers,\n see :ref:`examples/preprocessing/plot_all_scaling.py\n `.\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.preprocessing import QuantileTransformer\n >>> rng = np.random.RandomState(0)\n >>> X = np.sort(rng.normal(loc=0.5, scale=0.25, size=(25, 1)), axis=0)\n >>> qt = QuantileTransformer(n_quantiles=10, random_state=0)\n >>> qt.fit_transform(X)\n array([...])\n ", "source_code": "\n\nclass QuantileTransformer(_OneToOneFeatureMixin, TransformerMixin, BaseEstimator):\n \"\"\"Transform features using quantiles information.\n\n This method transforms the features to follow a uniform or a normal\n distribution. Therefore, for a given feature, this transformation tends\n to spread out the most frequent values. It also reduces the impact of\n (marginal) outliers: this is therefore a robust preprocessing scheme.\n\n The transformation is applied on each feature independently. First an\n estimate of the cumulative distribution function of a feature is\n used to map the original values to a uniform distribution. The obtained\n values are then mapped to the desired output distribution using the\n associated quantile function. Features values of new/unseen data that fall\n below or above the fitted range will be mapped to the bounds of the output\n distribution. Note that this transform is non-linear. It may distort linear\n correlations between variables measured at the same scale but renders\n variables measured at different scales more directly comparable.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.19\n\n Parameters\n ----------\n n_quantiles : int, default=1000 or n_samples\n Number of quantiles to be computed. It corresponds to the number\n of landmarks used to discretize the cumulative distribution function.\n If n_quantiles is larger than the number of samples, n_quantiles is set\n to the number of samples as a larger number of quantiles does not give\n a better approximation of the cumulative distribution function\n estimator.\n\n output_distribution : {'uniform', 'normal'}, default='uniform'\n Marginal distribution for the transformed data. The choices are\n 'uniform' (default) or 'normal'.\n\n ignore_implicit_zeros : bool, default=False\n Only applies to sparse matrices. If True, the sparse entries of the\n matrix are discarded to compute the quantile statistics. If False,\n these entries are treated as zeros.\n\n subsample : int, default=1e5\n Maximum number of samples used to estimate the quantiles for\n computational efficiency. 
Note that the subsampling procedure may\n differ for value-identical sparse and dense matrices.\n\n random_state : int, RandomState instance or None, default=None\n Determines random number generation for subsampling and smoothing\n noise.\n Please see ``subsample`` for more details.\n Pass an int for reproducible results across multiple function calls.\n See :term:`Glossary `.\n\n copy : bool, default=True\n Set to False to perform inplace transformation and avoid a copy (if the\n input is already a numpy array).\n\n Attributes\n ----------\n n_quantiles_ : int\n The actual number of quantiles used to discretize the cumulative\n distribution function.\n\n quantiles_ : ndarray of shape (n_quantiles, n_features)\n The values corresponding the quantiles of reference.\n\n references_ : ndarray of shape (n_quantiles, )\n Quantiles of references.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n quantile_transform : Equivalent function without the estimator API.\n PowerTransformer : Perform mapping to a normal distribution using a power\n transform.\n StandardScaler : Perform standardization that is faster, but less robust\n to outliers.\n RobustScaler : Perform robust standardization that removes the influence\n of outliers but does not put outliers and inliers on the same scale.\n\n Notes\n -----\n NaNs are treated as missing values: disregarded in fit, and maintained in\n transform.\n\n For a comparison of the different scalers, transformers, and normalizers,\n see :ref:`examples/preprocessing/plot_all_scaling.py\n `.\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.preprocessing import QuantileTransformer\n >>> rng = np.random.RandomState(0)\n >>> X = np.sort(rng.normal(loc=0.5, scale=0.25, size=(25, 1)), axis=0)\n >>> qt = QuantileTransformer(n_quantiles=10, random_state=0)\n >>> qt.fit_transform(X)\n array([...])\n \"\"\"\n \n def __init__(self, *, n_quantiles=1000, output_distribution='uniform', ignore_implicit_zeros=False, subsample=int(100000.0), random_state=None, copy=True):\n self.n_quantiles = n_quantiles\n self.output_distribution = output_distribution\n self.ignore_implicit_zeros = ignore_implicit_zeros\n self.subsample = subsample\n self.random_state = random_state\n self.copy = copy\n \n def _dense_fit(self, X, random_state):\n \"\"\"Compute percentiles for dense matrices.\n\n Parameters\n ----------\n X : ndarray of shape (n_samples, n_features)\n The data used to scale along the features axis.\n \"\"\"\n if self.ignore_implicit_zeros:\n warnings.warn(\"'ignore_implicit_zeros' takes effect only with sparse matrix. 
This parameter has no effect.\")\n (n_samples, n_features) = X.shape\n references = self.references_ * 100\n self.quantiles_ = []\n for col in X.T:\n if self.subsample < n_samples:\n subsample_idx = random_state.choice(n_samples, size=self.subsample, replace=False)\n col = col.take(subsample_idx, mode='clip')\n self.quantiles_.append(np.nanpercentile(col, references))\n self.quantiles_ = np.transpose(self.quantiles_)\n self.quantiles_ = np.maximum.accumulate(self.quantiles_)\n \n def _sparse_fit(self, X, random_state):\n \"\"\"Compute percentiles for sparse matrices.\n\n Parameters\n ----------\n X : sparse matrix of shape (n_samples, n_features)\n The data used to scale along the features axis. The sparse matrix\n needs to be nonnegative. If a sparse matrix is provided,\n it will be converted into a sparse ``csc_matrix``.\n \"\"\"\n (n_samples, n_features) = X.shape\n references = self.references_ * 100\n self.quantiles_ = []\n for feature_idx in range(n_features):\n column_nnz_data = X.data[X.indptr[feature_idx]:X.indptr[feature_idx + 1]]\n if len(column_nnz_data) > self.subsample:\n column_subsample = self.subsample * len(column_nnz_data) // n_samples\n if self.ignore_implicit_zeros:\n column_data = np.zeros(shape=column_subsample, dtype=X.dtype)\n else:\n column_data = np.zeros(shape=self.subsample, dtype=X.dtype)\n column_data[:column_subsample] = random_state.choice(column_nnz_data, size=column_subsample, replace=False)\n else:\n if self.ignore_implicit_zeros:\n column_data = np.zeros(shape=len(column_nnz_data), dtype=X.dtype)\n else:\n column_data = np.zeros(shape=n_samples, dtype=X.dtype)\n column_data[:len(column_nnz_data)] = column_nnz_data\n if not column_data.size:\n self.quantiles_.append([0] * len(references))\n else:\n self.quantiles_.append(np.nanpercentile(column_data, references))\n self.quantiles_ = np.transpose(self.quantiles_)\n self.quantiles_ = np.maximum.accumulate(self.quantiles_)\n \n def fit(self, X, y=None):\n \"\"\"Compute the quantiles used for transforming.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The data used to scale along the features axis. If a sparse\n matrix is provided, it will be converted into a sparse\n ``csc_matrix``. Additionally, the sparse matrix needs to be\n nonnegative if `ignore_implicit_zeros` is False.\n\n y : None\n Ignored.\n\n Returns\n -------\n self : object\n Fitted transformer.\n \"\"\"\n if self.n_quantiles <= 0:\n raise ValueError(\"Invalid value for 'n_quantiles': %d. The number of quantiles must be at least one.\" % self.n_quantiles)\n if self.subsample <= 0:\n raise ValueError(\"Invalid value for 'subsample': %d. The number of subsamples must be at least one.\" % self.subsample)\n if self.n_quantiles > self.subsample:\n raise ValueError('The number of quantiles cannot be greater than the number of samples used. Got {} quantiles and {} samples.'.format(self.n_quantiles, self.subsample))\n X = self._check_inputs(X, in_fit=True, copy=False)\n n_samples = X.shape[0]\n if self.n_quantiles > n_samples:\n warnings.warn('n_quantiles (%s) is greater than the total number of samples (%s). n_quantiles is set to n_samples.' 
% (self.n_quantiles, n_samples))\n self.n_quantiles_ = max(1, min(self.n_quantiles, n_samples))\n rng = check_random_state(self.random_state)\n self.references_ = np.linspace(0, 1, self.n_quantiles_, endpoint=True)\n if sparse.issparse(X):\n self._sparse_fit(X, rng)\n else:\n self._dense_fit(X, rng)\n return self\n \n def _transform_col(self, X_col, quantiles, inverse):\n \"\"\"Private function to transform a single feature.\"\"\"\n output_distribution = self.output_distribution\n if not inverse:\n lower_bound_x = quantiles[0]\n upper_bound_x = quantiles[-1]\n lower_bound_y = 0\n upper_bound_y = 1\n else:\n lower_bound_x = 0\n upper_bound_x = 1\n lower_bound_y = quantiles[0]\n upper_bound_y = quantiles[-1]\n with np.errstate(invalid='ignore'):\n if output_distribution == 'normal':\n X_col = stats.norm.cdf(X_col)\n with np.errstate(invalid='ignore'):\n if output_distribution == 'normal':\n lower_bounds_idx = X_col - BOUNDS_THRESHOLD < lower_bound_x\n upper_bounds_idx = X_col + BOUNDS_THRESHOLD > upper_bound_x\n if output_distribution == 'uniform':\n lower_bounds_idx = X_col == lower_bound_x\n upper_bounds_idx = X_col == upper_bound_x\n isfinite_mask = ~np.isnan(X_col)\n X_col_finite = X_col[isfinite_mask]\n if not inverse:\n X_col[isfinite_mask] = 0.5 * (np.interp(X_col_finite, quantiles, self.references_) - np.interp(-X_col_finite, -quantiles[::-1], -self.references_[::-1]))\n else:\n X_col[isfinite_mask] = np.interp(X_col_finite, self.references_, quantiles)\n X_col[upper_bounds_idx] = upper_bound_y\n X_col[lower_bounds_idx] = lower_bound_y\n if not inverse:\n with np.errstate(invalid='ignore'):\n if output_distribution == 'normal':\n X_col = stats.norm.ppf(X_col)\n clip_min = stats.norm.ppf(BOUNDS_THRESHOLD - np.spacing(1))\n clip_max = stats.norm.ppf(1 - (BOUNDS_THRESHOLD - np.spacing(1)))\n X_col = np.clip(X_col, clip_min, clip_max)\n return X_col\n \n def _check_inputs(self, X, in_fit, accept_sparse_negative=False, copy=False):\n \"\"\"Check inputs before fit and transform.\"\"\"\n X = self._validate_data(X, reset=in_fit, accept_sparse='csc', copy=copy, dtype=FLOAT_DTYPES, force_all_finite='allow-nan')\n with np.errstate(invalid='ignore'):\n if not accept_sparse_negative and not self.ignore_implicit_zeros and sparse.issparse(X) and np.any(X.data < 0):\n raise ValueError('QuantileTransformer only accepts non-negative sparse matrices.')\n if self.output_distribution not in ('normal', 'uniform'):\n raise ValueError(\"'output_distribution' has to be either 'normal' or 'uniform'. Got '{}' instead.\".format(self.output_distribution))\n return X\n \n def _transform(self, X, inverse=False):\n \"\"\"Forward and inverse transform.\n\n Parameters\n ----------\n X : ndarray of shape (n_samples, n_features)\n The data used to scale along the features axis.\n\n inverse : bool, default=False\n If False, apply forward transform. 
If True, apply\n inverse transform.\n\n Returns\n -------\n X : ndarray of shape (n_samples, n_features)\n Projected data.\n \"\"\"\n if sparse.issparse(X):\n for feature_idx in range(X.shape[1]):\n column_slice = slice(X.indptr[feature_idx], X.indptr[feature_idx + 1])\n X.data[column_slice] = self._transform_col(X.data[column_slice], self.quantiles_[:, feature_idx], inverse)\n else:\n for feature_idx in range(X.shape[1]):\n X[:, feature_idx] = self._transform_col(X[:, feature_idx], self.quantiles_[:, feature_idx], inverse)\n return X\n \n def transform(self, X):\n \"\"\"Feature-wise transformation of the data.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The data used to scale along the features axis. If a sparse\n matrix is provided, it will be converted into a sparse\n ``csc_matrix``. Additionally, the sparse matrix needs to be\n nonnegative if `ignore_implicit_zeros` is False.\n\n Returns\n -------\n Xt : {ndarray, sparse matrix} of shape (n_samples, n_features)\n The projected data.\n \"\"\"\n check_is_fitted(self)\n X = self._check_inputs(X, in_fit=False, copy=self.copy)\n return self._transform(X, inverse=False)\n \n def inverse_transform(self, X):\n \"\"\"Back-projection to the original space.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The data used to scale along the features axis. If a sparse\n matrix is provided, it will be converted into a sparse\n ``csc_matrix``. Additionally, the sparse matrix needs to be\n nonnegative if `ignore_implicit_zeros` is False.\n\n Returns\n -------\n Xt : {ndarray, sparse matrix} of (n_samples, n_features)\n The projected data.\n \"\"\"\n check_is_fitted(self)\n X = self._check_inputs(X, in_fit=False, accept_sparse_negative=True, copy=self.copy)\n return self._transform(X, inverse=True)\n \n def _more_tags(self):\n return {'allow_nan': True}\n" }, @@ -25967,7 +26063,7 @@ "sklearn.preprocessing._data.RobustScaler._more_tags" ], "is_public": true, - "description": "Scale features using statistics that are robust to outliers.\n\nThis Scaler removes the median and scales the data according to the quantile range (defaults to IQR: Interquartile Range). The IQR is the range between the 1st quartile (25th quantile) and the 3rd quartile (75th quantile). Centering and scaling happen independently on each feature by computing the relevant statistics on the samples in the training set. Median and interquartile range are then stored to be used on later data using the :meth:`transform` method. Standardization of a dataset is a common requirement for many machine learning estimators. Typically this is done by removing the mean and scaling to unit variance. However, outliers can often influence the sample mean / variance in a negative way. In such cases, the median and the interquartile range often give better results. .. versionadded:: 0.17 Read more in the :ref:`User Guide `.", + "description": "Scale features using statistics that are robust to outliers.\n\nThis Scaler removes the median and scales the data according to\nthe quantile range (defaults to IQR: Interquartile Range).\nThe IQR is the range between the 1st quartile (25th quantile)\nand the 3rd quartile (75th quantile).\n\nCentering and scaling happen independently on each feature by\ncomputing the relevant statistics on the samples in the training\nset. 
Median and interquartile range are then stored to be used on\nlater data using the :meth:`transform` method.\n\nStandardization of a dataset is a common requirement for many\nmachine learning estimators. Typically this is done by removing the mean\nand scaling to unit variance. However, outliers can often influence the\nsample mean / variance in a negative way. In such cases, the median and\nthe interquartile range often give better results.\n\n.. versionadded:: 0.17\n\nRead more in the :ref:`User Guide `.", "docstring": "Scale features using statistics that are robust to outliers.\n\n This Scaler removes the median and scales the data according to\n the quantile range (defaults to IQR: Interquartile Range).\n The IQR is the range between the 1st quartile (25th quantile)\n and the 3rd quartile (75th quantile).\n\n Centering and scaling happen independently on each feature by\n computing the relevant statistics on the samples in the training\n set. Median and interquartile range are then stored to be used on\n later data using the :meth:`transform` method.\n\n Standardization of a dataset is a common requirement for many\n machine learning estimators. Typically this is done by removing the mean\n and scaling to unit variance. However, outliers can often influence the\n sample mean / variance in a negative way. In such cases, the median and\n the interquartile range often give better results.\n\n .. versionadded:: 0.17\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n with_centering : bool, default=True\n If `True`, center the data before scaling.\n This will cause :meth:`transform` to raise an exception when attempted\n on sparse matrices, because centering them entails building a dense\n matrix which in common use cases is likely to be too large to fit in\n memory.\n\n with_scaling : bool, default=True\n If `True`, scale the data to interquartile range.\n\n quantile_range : tuple (q_min, q_max), 0.0 < q_min < q_max < 100.0, default=(25.0, 75.0)\n Quantile range used to calculate `scale_`. By default this is equal to\n the IQR, i.e., `q_min` is the first quantile and `q_max` is the third\n quantile.\n\n .. versionadded:: 0.18\n\n copy : bool, default=True\n If `False`, try to avoid a copy and do inplace scaling instead.\n This is not guaranteed to always work inplace; e.g. if the data is\n not a NumPy array or scipy.sparse CSR matrix, a copy may still be\n returned.\n\n unit_variance : bool, default=False\n If `True`, scale data so that normally distributed features have a\n variance of 1. In general, if the difference between the x-values of\n `q_max` and `q_min` for a standard normal distribution is greater\n than 1, the dataset will be scaled down. If less than 1, the dataset\n will be scaled up.\n\n .. versionadded:: 0.24\n\n Attributes\n ----------\n center_ : array of floats\n The median value for each feature in the training set.\n\n scale_ : array of floats\n The (scaled) interquartile range for each feature in the training set.\n\n .. versionadded:: 0.17\n *scale_* attribute.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. 
versionadded:: 1.0\n\n See Also\n --------\n robust_scale : Equivalent function without the estimator API.\n sklearn.decomposition.PCA : Further removes the linear correlation across\n features with 'whiten=True'.\n\n Notes\n -----\n For a comparison of the different scalers, transformers, and normalizers,\n see :ref:`examples/preprocessing/plot_all_scaling.py\n `.\n\n https://en.wikipedia.org/wiki/Median\n https://en.wikipedia.org/wiki/Interquartile_range\n\n Examples\n --------\n >>> from sklearn.preprocessing import RobustScaler\n >>> X = [[ 1., -2., 2.],\n ... [ -2., 1., 3.],\n ... [ 4., 1., -2.]]\n >>> transformer = RobustScaler().fit(X)\n >>> transformer\n RobustScaler()\n >>> transformer.transform(X)\n array([[ 0. , -2. , 0. ],\n [-1. , 0. , 0.4],\n [ 1. , 0. , -1.6]])\n ", "source_code": "\n\nclass RobustScaler(_OneToOneFeatureMixin, TransformerMixin, BaseEstimator):\n \"\"\"Scale features using statistics that are robust to outliers.\n\n This Scaler removes the median and scales the data according to\n the quantile range (defaults to IQR: Interquartile Range).\n The IQR is the range between the 1st quartile (25th quantile)\n and the 3rd quartile (75th quantile).\n\n Centering and scaling happen independently on each feature by\n computing the relevant statistics on the samples in the training\n set. Median and interquartile range are then stored to be used on\n later data using the :meth:`transform` method.\n\n Standardization of a dataset is a common requirement for many\n machine learning estimators. Typically this is done by removing the mean\n and scaling to unit variance. However, outliers can often influence the\n sample mean / variance in a negative way. In such cases, the median and\n the interquartile range often give better results.\n\n .. versionadded:: 0.17\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n with_centering : bool, default=True\n If `True`, center the data before scaling.\n This will cause :meth:`transform` to raise an exception when attempted\n on sparse matrices, because centering them entails building a dense\n matrix which in common use cases is likely to be too large to fit in\n memory.\n\n with_scaling : bool, default=True\n If `True`, scale the data to interquartile range.\n\n quantile_range : tuple (q_min, q_max), 0.0 < q_min < q_max < 100.0, default=(25.0, 75.0)\n Quantile range used to calculate `scale_`. By default this is equal to\n the IQR, i.e., `q_min` is the first quantile and `q_max` is the third\n quantile.\n\n .. versionadded:: 0.18\n\n copy : bool, default=True\n If `False`, try to avoid a copy and do inplace scaling instead.\n This is not guaranteed to always work inplace; e.g. if the data is\n not a NumPy array or scipy.sparse CSR matrix, a copy may still be\n returned.\n\n unit_variance : bool, default=False\n If `True`, scale data so that normally distributed features have a\n variance of 1. In general, if the difference between the x-values of\n `q_max` and `q_min` for a standard normal distribution is greater\n than 1, the dataset will be scaled down. If less than 1, the dataset\n will be scaled up.\n\n .. versionadded:: 0.24\n\n Attributes\n ----------\n center_ : array of floats\n The median value for each feature in the training set.\n\n scale_ : array of floats\n The (scaled) interquartile range for each feature in the training set.\n\n .. versionadded:: 0.17\n *scale_* attribute.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. 
versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n robust_scale : Equivalent function without the estimator API.\n sklearn.decomposition.PCA : Further removes the linear correlation across\n features with 'whiten=True'.\n\n Notes\n -----\n For a comparison of the different scalers, transformers, and normalizers,\n see :ref:`examples/preprocessing/plot_all_scaling.py\n `.\n\n https://en.wikipedia.org/wiki/Median\n https://en.wikipedia.org/wiki/Interquartile_range\n\n Examples\n --------\n >>> from sklearn.preprocessing import RobustScaler\n >>> X = [[ 1., -2., 2.],\n ... [ -2., 1., 3.],\n ... [ 4., 1., -2.]]\n >>> transformer = RobustScaler().fit(X)\n >>> transformer\n RobustScaler()\n >>> transformer.transform(X)\n array([[ 0. , -2. , 0. ],\n [-1. , 0. , 0.4],\n [ 1. , 0. , -1.6]])\n \"\"\"\n \n def __init__(self, *, with_centering=True, with_scaling=True, quantile_range=(25.0, 75.0), copy=True, unit_variance=False):\n self.with_centering = with_centering\n self.with_scaling = with_scaling\n self.quantile_range = quantile_range\n self.unit_variance = unit_variance\n self.copy = copy\n \n def fit(self, X, y=None):\n \"\"\"Compute the median and quantiles to be used for scaling.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The data used to compute the median and quantiles\n used for later scaling along the features axis.\n\n y : Ignored\n Not used, present here for API consistency by convention.\n\n Returns\n -------\n self : object\n Fitted scaler.\n \"\"\"\n X = self._validate_data(X, accept_sparse='csc', estimator=self, dtype=FLOAT_DTYPES, force_all_finite='allow-nan')\n (q_min, q_max) = self.quantile_range\n if not 0 <= q_min <= q_max <= 100:\n raise ValueError('Invalid quantile range: %s' % str(self.quantile_range))\n if self.with_centering:\n if sparse.issparse(X):\n raise ValueError('Cannot center sparse matrices: use `with_centering=False` instead. 
See docstring for motivation and alternatives.')\n self.center_ = np.nanmedian(X, axis=0)\n else:\n self.center_ = None\n if self.with_scaling:\n quantiles = []\n for feature_idx in range(X.shape[1]):\n if sparse.issparse(X):\n column_nnz_data = X.data[X.indptr[feature_idx]:X.indptr[feature_idx + 1]]\n column_data = np.zeros(shape=X.shape[0], dtype=X.dtype)\n column_data[:len(column_nnz_data)] = column_nnz_data\n else:\n column_data = X[:, feature_idx]\n quantiles.append(np.nanpercentile(column_data, self.quantile_range))\n quantiles = np.transpose(quantiles)\n self.scale_ = quantiles[1] - quantiles[0]\n self.scale_ = _handle_zeros_in_scale(self.scale_, copy=False)\n if self.unit_variance:\n adjust = stats.norm.ppf(q_max / 100.0) - stats.norm.ppf(q_min / 100.0)\n self.scale_ = self.scale_ / adjust\n else:\n self.scale_ = None\n return self\n \n def transform(self, X):\n \"\"\"Center and scale the data.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The data used to scale along the specified axis.\n\n Returns\n -------\n X_tr : {ndarray, sparse matrix} of shape (n_samples, n_features)\n Transformed array.\n \"\"\"\n check_is_fitted(self)\n X = self._validate_data(X, accept_sparse=('csr', 'csc'), copy=self.copy, estimator=self, dtype=FLOAT_DTYPES, reset=False, force_all_finite='allow-nan')\n if sparse.issparse(X):\n if self.with_scaling:\n inplace_column_scale(X, 1.0 / self.scale_)\n else:\n if self.with_centering:\n X -= self.center_\n if self.with_scaling:\n X /= self.scale_\n return X\n \n def inverse_transform(self, X):\n \"\"\"Scale back the data to the original representation.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The rescaled data to be transformed back.\n\n Returns\n -------\n X_tr : {ndarray, sparse matrix} of shape (n_samples, n_features)\n Transformed array.\n \"\"\"\n check_is_fitted(self)\n X = check_array(X, accept_sparse=('csr', 'csc'), copy=self.copy, estimator=self, dtype=FLOAT_DTYPES, force_all_finite='allow-nan')\n if sparse.issparse(X):\n if self.with_scaling:\n inplace_column_scale(X, self.scale_)\n else:\n if self.with_scaling:\n X *= self.scale_\n if self.with_centering:\n X += self.center_\n return X\n \n def _more_tags(self):\n return {'allow_nan': True}\n" }, @@ -25990,7 +26086,7 @@ "sklearn.preprocessing._data.StandardScaler._more_tags" ], "is_public": true, - "description": "Standardize features by removing the mean and scaling to unit variance.\n\nThe standard score of a sample `x` is calculated as: z = (x - u) / s where `u` is the mean of the training samples or zero if `with_mean=False`, and `s` is the standard deviation of the training samples or one if `with_std=False`. Centering and scaling happen independently on each feature by computing the relevant statistics on the samples in the training set. Mean and standard deviation are then stored to be used on later data using :meth:`transform`. Standardization of a dataset is a common requirement for many machine learning estimators: they might behave badly if the individual features do not more or less look like standard normally distributed data (e.g. Gaussian with 0 mean and unit variance). For instance many elements used in the objective function of a learning algorithm (such as the RBF kernel of Support Vector Machines or the L1 and L2 regularizers of linear models) assume that all features are centered around 0 and have variance in the same order. 
If a feature has a variance that is orders of magnitude larger that others, it might dominate the objective function and make the estimator unable to learn from other features correctly as expected. This scaler can also be applied to sparse CSR or CSC matrices by passing `with_mean=False` to avoid breaking the sparsity structure of the data. Read more in the :ref:`User Guide `.", + "description": "Standardize features by removing the mean and scaling to unit variance.\n\nThe standard score of a sample `x` is calculated as:\n\n z = (x - u) / s\n\nwhere `u` is the mean of the training samples or zero if `with_mean=False`,\nand `s` is the standard deviation of the training samples or one if\n`with_std=False`.\n\nCentering and scaling happen independently on each feature by computing\nthe relevant statistics on the samples in the training set. Mean and\nstandard deviation are then stored to be used on later data using\n:meth:`transform`.\n\nStandardization of a dataset is a common requirement for many\nmachine learning estimators: they might behave badly if the\nindividual features do not more or less look like standard normally\ndistributed data (e.g. Gaussian with 0 mean and unit variance).\n\nFor instance many elements used in the objective function of\na learning algorithm (such as the RBF kernel of Support Vector\nMachines or the L1 and L2 regularizers of linear models) assume that\nall features are centered around 0 and have variance in the same\norder. If a feature has a variance that is orders of magnitude larger\nthat others, it might dominate the objective function and make the\nestimator unable to learn from other features correctly as expected.\n\nThis scaler can also be applied to sparse CSR or CSC matrices by passing\n`with_mean=False` to avoid breaking the sparsity structure of the data.\n\nRead more in the :ref:`User Guide `.", "docstring": "Standardize features by removing the mean and scaling to unit variance.\n\n The standard score of a sample `x` is calculated as:\n\n z = (x - u) / s\n\n where `u` is the mean of the training samples or zero if `with_mean=False`,\n and `s` is the standard deviation of the training samples or one if\n `with_std=False`.\n\n Centering and scaling happen independently on each feature by computing\n the relevant statistics on the samples in the training set. Mean and\n standard deviation are then stored to be used on later data using\n :meth:`transform`.\n\n Standardization of a dataset is a common requirement for many\n machine learning estimators: they might behave badly if the\n individual features do not more or less look like standard normally\n distributed data (e.g. Gaussian with 0 mean and unit variance).\n\n For instance many elements used in the objective function of\n a learning algorithm (such as the RBF kernel of Support Vector\n Machines or the L1 and L2 regularizers of linear models) assume that\n all features are centered around 0 and have variance in the same\n order. If a feature has a variance that is orders of magnitude larger\n that others, it might dominate the objective function and make the\n estimator unable to learn from other features correctly as expected.\n\n This scaler can also be applied to sparse CSR or CSC matrices by passing\n `with_mean=False` to avoid breaking the sparsity structure of the data.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n copy : bool, default=True\n If False, try to avoid a copy and do inplace scaling instead.\n This is not guaranteed to always work inplace; e.g. 
if the data is\n not a NumPy array or scipy.sparse CSR matrix, a copy may still be\n returned.\n\n with_mean : bool, default=True\n If True, center the data before scaling.\n This does not work (and will raise an exception) when attempted on\n sparse matrices, because centering them entails building a dense\n matrix which in common use cases is likely to be too large to fit in\n memory.\n\n with_std : bool, default=True\n If True, scale the data to unit variance (or equivalently,\n unit standard deviation).\n\n Attributes\n ----------\n scale_ : ndarray of shape (n_features,) or None\n Per feature relative scaling of the data to achieve zero mean and unit\n variance. Generally this is calculated using `np.sqrt(var_)`. If a\n variance is zero, we can't achieve unit variance, and the data is left\n as-is, giving a scaling factor of 1. `scale_` is equal to `None`\n when `with_std=False`.\n\n .. versionadded:: 0.17\n *scale_*\n\n mean_ : ndarray of shape (n_features,) or None\n The mean value for each feature in the training set.\n Equal to ``None`` when ``with_mean=False``.\n\n var_ : ndarray of shape (n_features,) or None\n The variance for each feature in the training set. Used to compute\n `scale_`. Equal to ``None`` when ``with_std=False``.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n n_samples_seen_ : int or ndarray of shape (n_features,)\n The number of samples processed by the estimator for each feature.\n If there are no missing samples, the ``n_samples_seen`` will be an\n integer, otherwise it will be an array of dtype int. If\n `sample_weights` are used it will be a float (if no missing data)\n or an array of dtype float that sums the weights seen so far.\n Will be reset on new calls to fit, but increments across\n ``partial_fit`` calls.\n\n See Also\n --------\n scale : Equivalent function without the estimator API.\n\n :class:`~sklearn.decomposition.PCA` : Further removes the linear\n correlation across features with 'whiten=True'.\n\n Notes\n -----\n NaNs are treated as missing values: disregarded in fit, and maintained in\n transform.\n\n We use a biased estimator for the standard deviation, equivalent to\n `numpy.std(x, ddof=0)`. Note that the choice of `ddof` is unlikely to\n affect model performance.\n\n For a comparison of the different scalers, transformers, and normalizers,\n see :ref:`examples/preprocessing/plot_all_scaling.py\n `.\n\n Examples\n --------\n >>> from sklearn.preprocessing import StandardScaler\n >>> data = [[0, 0], [0, 0], [1, 1], [1, 1]]\n >>> scaler = StandardScaler()\n >>> print(scaler.fit(data))\n StandardScaler()\n >>> print(scaler.mean_)\n [0.5 0.5]\n >>> print(scaler.transform(data))\n [[-1. -1.]\n [-1. -1.]\n [ 1. 1.]\n [ 1. 1.]]\n >>> print(scaler.transform([[2, 2]]))\n [[3. 
3.]]\n ", "source_code": "\n\nclass StandardScaler(_OneToOneFeatureMixin, TransformerMixin, BaseEstimator):\n \"\"\"Standardize features by removing the mean and scaling to unit variance.\n\n The standard score of a sample `x` is calculated as:\n\n z = (x - u) / s\n\n where `u` is the mean of the training samples or zero if `with_mean=False`,\n and `s` is the standard deviation of the training samples or one if\n `with_std=False`.\n\n Centering and scaling happen independently on each feature by computing\n the relevant statistics on the samples in the training set. Mean and\n standard deviation are then stored to be used on later data using\n :meth:`transform`.\n\n Standardization of a dataset is a common requirement for many\n machine learning estimators: they might behave badly if the\n individual features do not more or less look like standard normally\n distributed data (e.g. Gaussian with 0 mean and unit variance).\n\n For instance many elements used in the objective function of\n a learning algorithm (such as the RBF kernel of Support Vector\n Machines or the L1 and L2 regularizers of linear models) assume that\n all features are centered around 0 and have variance in the same\n order. If a feature has a variance that is orders of magnitude larger\n that others, it might dominate the objective function and make the\n estimator unable to learn from other features correctly as expected.\n\n This scaler can also be applied to sparse CSR or CSC matrices by passing\n `with_mean=False` to avoid breaking the sparsity structure of the data.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n copy : bool, default=True\n If False, try to avoid a copy and do inplace scaling instead.\n This is not guaranteed to always work inplace; e.g. if the data is\n not a NumPy array or scipy.sparse CSR matrix, a copy may still be\n returned.\n\n with_mean : bool, default=True\n If True, center the data before scaling.\n This does not work (and will raise an exception) when attempted on\n sparse matrices, because centering them entails building a dense\n matrix which in common use cases is likely to be too large to fit in\n memory.\n\n with_std : bool, default=True\n If True, scale the data to unit variance (or equivalently,\n unit standard deviation).\n\n Attributes\n ----------\n scale_ : ndarray of shape (n_features,) or None\n Per feature relative scaling of the data to achieve zero mean and unit\n variance. Generally this is calculated using `np.sqrt(var_)`. If a\n variance is zero, we can't achieve unit variance, and the data is left\n as-is, giving a scaling factor of 1. `scale_` is equal to `None`\n when `with_std=False`.\n\n .. versionadded:: 0.17\n *scale_*\n\n mean_ : ndarray of shape (n_features,) or None\n The mean value for each feature in the training set.\n Equal to ``None`` when ``with_mean=False``.\n\n var_ : ndarray of shape (n_features,) or None\n The variance for each feature in the training set. Used to compute\n `scale_`. Equal to ``None`` when ``with_std=False``.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. 
versionadded:: 1.0\n\n n_samples_seen_ : int or ndarray of shape (n_features,)\n The number of samples processed by the estimator for each feature.\n If there are no missing samples, the ``n_samples_seen`` will be an\n integer, otherwise it will be an array of dtype int. If\n `sample_weights` are used it will be a float (if no missing data)\n or an array of dtype float that sums the weights seen so far.\n Will be reset on new calls to fit, but increments across\n ``partial_fit`` calls.\n\n See Also\n --------\n scale : Equivalent function without the estimator API.\n\n :class:`~sklearn.decomposition.PCA` : Further removes the linear\n correlation across features with 'whiten=True'.\n\n Notes\n -----\n NaNs are treated as missing values: disregarded in fit, and maintained in\n transform.\n\n We use a biased estimator for the standard deviation, equivalent to\n `numpy.std(x, ddof=0)`. Note that the choice of `ddof` is unlikely to\n affect model performance.\n\n For a comparison of the different scalers, transformers, and normalizers,\n see :ref:`examples/preprocessing/plot_all_scaling.py\n `.\n\n Examples\n --------\n >>> from sklearn.preprocessing import StandardScaler\n >>> data = [[0, 0], [0, 0], [1, 1], [1, 1]]\n >>> scaler = StandardScaler()\n >>> print(scaler.fit(data))\n StandardScaler()\n >>> print(scaler.mean_)\n [0.5 0.5]\n >>> print(scaler.transform(data))\n [[-1. -1.]\n [-1. -1.]\n [ 1. 1.]\n [ 1. 1.]]\n >>> print(scaler.transform([[2, 2]]))\n [[3. 3.]]\n \"\"\"\n \n def __init__(self, *, copy=True, with_mean=True, with_std=True):\n self.with_mean = with_mean\n self.with_std = with_std\n self.copy = copy\n \n def _reset(self):\n \"\"\"Reset internal data-dependent state of the scaler, if necessary.\n\n __init__ parameters are not touched.\n \"\"\"\n if hasattr(self, 'scale_'):\n del self.scale_\n del self.n_samples_seen_\n del self.mean_\n del self.var_\n \n def fit(self, X, y=None, sample_weight=None):\n \"\"\"Compute the mean and std to be used for later scaling.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The data used to compute the mean and standard deviation\n used for later scaling along the features axis.\n\n y : None\n Ignored.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Individual weights for each sample.\n\n .. versionadded:: 0.24\n parameter *sample_weight* support to StandardScaler.\n\n Returns\n -------\n self : object\n Fitted scaler.\n \"\"\"\n self._reset()\n return self.partial_fit(X, y, sample_weight)\n \n def partial_fit(self, X, y=None, sample_weight=None):\n \"\"\"Online computation of mean and std on X for later scaling.\n\n All of X is processed as a single batch. This is intended for cases\n when :meth:`fit` is not feasible due to very large number of\n `n_samples` or because X is read from a continuous stream.\n\n The algorithm for incremental mean and std is given in Equation 1.5a,b\n in Chan, Tony F., Gene H. Golub, and Randall J. LeVeque. \"Algorithms\n for computing the sample variance: Analysis and recommendations.\"\n The American Statistician 37.3 (1983): 242-247:\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The data used to compute the mean and standard deviation\n used for later scaling along the features axis.\n\n y : None\n Ignored.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Individual weights for each sample.\n\n .. 
versionadded:: 0.24\n parameter *sample_weight* support to StandardScaler.\n\n Returns\n -------\n self : object\n Fitted scaler.\n \"\"\"\n first_call = not hasattr(self, 'n_samples_seen_')\n X = self._validate_data(X, accept_sparse=('csr', 'csc'), estimator=self, dtype=FLOAT_DTYPES, force_all_finite='allow-nan', reset=first_call)\n n_features = X.shape[1]\n if sample_weight is not None:\n sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype)\n dtype = np.int64 if sample_weight is None else X.dtype\n if not hasattr(self, 'n_samples_seen_'):\n self.n_samples_seen_ = np.zeros(n_features, dtype=dtype)\n elif np.size(self.n_samples_seen_) == 1:\n self.n_samples_seen_ = np.repeat(self.n_samples_seen_, X.shape[1])\n self.n_samples_seen_ = self.n_samples_seen_.astype(dtype, copy=False)\n if sparse.issparse(X):\n if self.with_mean:\n raise ValueError('Cannot center sparse matrices: pass `with_mean=False` instead. See docstring for motivation and alternatives.')\n sparse_constructor = sparse.csr_matrix if X.format == 'csr' else sparse.csc_matrix\n if self.with_std:\n if not hasattr(self, 'scale_'):\n (self.mean_, self.var_, self.n_samples_seen_) = mean_variance_axis(X, axis=0, weights=sample_weight, return_sum_weights=True)\n else:\n (self.mean_, self.var_, self.n_samples_seen_) = incr_mean_variance_axis(X, axis=0, last_mean=self.mean_, last_var=self.var_, last_n=self.n_samples_seen_, weights=sample_weight)\n self.mean_ = self.mean_.astype(np.float64, copy=False)\n self.var_ = self.var_.astype(np.float64, copy=False)\n else:\n self.mean_ = None\n self.var_ = None\n weights = _check_sample_weight(sample_weight, X)\n sum_weights_nan = weights @ sparse_constructor((np.isnan(X.data), X.indices, X.indptr), shape=X.shape)\n self.n_samples_seen_ += (np.sum(weights) - sum_weights_nan).astype(dtype)\n else:\n if not hasattr(self, 'scale_'):\n self.mean_ = 0.0\n if self.with_std:\n self.var_ = 0.0\n else:\n self.var_ = None\n if not self.with_mean and not self.with_std:\n self.mean_ = None\n self.var_ = None\n self.n_samples_seen_ += X.shape[0] - np.isnan(X).sum(axis=0)\n else:\n (self.mean_, self.var_, self.n_samples_seen_) = _incremental_mean_and_var(X, self.mean_, self.var_, self.n_samples_seen_, sample_weight=sample_weight)\n if np.ptp(self.n_samples_seen_) == 0:\n self.n_samples_seen_ = self.n_samples_seen_[0]\n if self.with_std:\n constant_mask = _is_constant_feature(self.var_, self.mean_, self.n_samples_seen_)\n self.scale_ = _handle_zeros_in_scale(np.sqrt(self.var_), copy=False, constant_mask=constant_mask)\n else:\n self.scale_ = None\n return self\n \n def transform(self, X, copy=None):\n \"\"\"Perform standardization by centering and scaling.\n\n Parameters\n ----------\n X : {array-like, sparse matrix of shape (n_samples, n_features)\n The data used to scale along the features axis.\n copy : bool, default=None\n Copy the input X or not.\n\n Returns\n -------\n X_tr : {ndarray, sparse matrix} of shape (n_samples, n_features)\n Transformed array.\n \"\"\"\n check_is_fitted(self)\n copy = copy if copy is not None else self.copy\n X = self._validate_data(X, reset=False, accept_sparse='csr', copy=copy, estimator=self, dtype=FLOAT_DTYPES, force_all_finite='allow-nan')\n if sparse.issparse(X):\n if self.with_mean:\n raise ValueError('Cannot center sparse matrices: pass `with_mean=False` instead. 
See docstring for motivation and alternatives.')\n if self.scale_ is not None:\n inplace_column_scale(X, 1 / self.scale_)\n else:\n if self.with_mean:\n X -= self.mean_\n if self.with_std:\n X /= self.scale_\n return X\n \n def inverse_transform(self, X, copy=None):\n \"\"\"Scale back the data to the original representation.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The data used to scale along the features axis.\n copy : bool, default=None\n Copy the input X or not.\n\n Returns\n -------\n X_tr : {ndarray, sparse matrix} of shape (n_samples, n_features)\n Transformed array.\n \"\"\"\n check_is_fitted(self)\n copy = copy if copy is not None else self.copy\n X = check_array(X, accept_sparse='csr', copy=copy, estimator=self, dtype=FLOAT_DTYPES, force_all_finite='allow-nan')\n if sparse.issparse(X):\n if self.with_mean:\n raise ValueError('Cannot uncenter sparse matrices: pass `with_mean=False` instead See docstring for motivation and alternatives.')\n if self.scale_ is not None:\n inplace_column_scale(X, self.scale_)\n else:\n if self.with_std:\n X *= self.scale_\n if self.with_mean:\n X += self.mean_\n return X\n \n def _more_tags(self):\n return {'allow_nan': True, 'preserves_dtype': [np.float64, np.float32]}\n" }, @@ -26008,9 +26104,9 @@ "sklearn.preprocessing._discretization.KBinsDiscretizer.get_feature_names_out" ], "is_public": true, - "description": "Bin continuous data into intervals.\n\nRead more in the :ref:`User Guide `. .. versionadded:: 0.20", - "docstring": "\n Bin continuous data into intervals.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.20\n\n Parameters\n ----------\n n_bins : int or array-like of shape (n_features,), default=5\n The number of bins to produce. Raises ValueError if ``n_bins < 2``.\n\n encode : {'onehot', 'onehot-dense', 'ordinal'}, default='onehot'\n Method used to encode the transformed result.\n\n onehot\n Encode the transformed result with one-hot encoding\n and return a sparse matrix. Ignored features are always\n stacked to the right.\n onehot-dense\n Encode the transformed result with one-hot encoding\n and return a dense array. Ignored features are always\n stacked to the right.\n ordinal\n Return the bin identifier encoded as an integer value.\n\n strategy : {'uniform', 'quantile', 'kmeans'}, default='quantile'\n Strategy used to define the widths of the bins.\n\n uniform\n All bins in each feature have identical widths.\n quantile\n All bins in each feature have the same number of points.\n kmeans\n Values in each bin have the same nearest center of a 1D k-means\n cluster.\n\n dtype : {np.float32, np.float64}, default=None\n The desired data-type for the output. If None, output dtype is\n consistent with input dtype. Only np.float32 and np.float64 are\n supported.\n\n .. versionadded:: 0.24\n\n Attributes\n ----------\n bin_edges_ : ndarray of ndarray of shape (n_features,)\n The edges of each bin. Contain arrays of varying shapes ``(n_bins_, )``\n Ignored features will have empty arrays.\n\n n_bins_ : ndarray of shape (n_features,), dtype=np.int_\n Number of bins per feature. Bins whose width are too small\n (i.e., <= 1e-8) are removed with a warning.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. 
versionadded:: 1.0\n\n See Also\n --------\n Binarizer : Class used to bin values as ``0`` or\n ``1`` based on a parameter ``threshold``.\n\n Notes\n -----\n In bin edges for feature ``i``, the first and last values are used only for\n ``inverse_transform``. During transform, bin edges are extended to::\n\n np.concatenate([-np.inf, bin_edges_[i][1:-1], np.inf])\n\n You can combine ``KBinsDiscretizer`` with\n :class:`~sklearn.compose.ColumnTransformer` if you only want to preprocess\n part of the features.\n\n ``KBinsDiscretizer`` might produce constant features (e.g., when\n ``encode = 'onehot'`` and certain bins do not contain any data).\n These features can be removed with feature selection algorithms\n (e.g., :class:`~sklearn.feature_selection.VarianceThreshold`).\n\n Examples\n --------\n >>> from sklearn.preprocessing import KBinsDiscretizer\n >>> X = [[-2, 1, -4, -1],\n ... [-1, 2, -3, -0.5],\n ... [ 0, 3, -2, 0.5],\n ... [ 1, 4, -1, 2]]\n >>> est = KBinsDiscretizer(n_bins=3, encode='ordinal', strategy='uniform')\n >>> est.fit(X)\n KBinsDiscretizer(...)\n >>> Xt = est.transform(X)\n >>> Xt # doctest: +SKIP\n array([[ 0., 0., 0., 0.],\n [ 1., 1., 1., 0.],\n [ 2., 2., 2., 1.],\n [ 2., 2., 2., 2.]])\n\n Sometimes it may be useful to convert the data back into the original\n feature space. The ``inverse_transform`` function converts the binned\n data into the original feature space. Each value will be equal to the mean\n of the two bin edges.\n\n >>> est.bin_edges_[0]\n array([-2., -1., 0., 1.])\n >>> est.inverse_transform(Xt)\n array([[-1.5, 1.5, -3.5, -0.5],\n [-0.5, 2.5, -2.5, -0.5],\n [ 0.5, 3.5, -1.5, 0.5],\n [ 0.5, 3.5, -1.5, 1.5]])\n ", - "source_code": "\n\nclass KBinsDiscretizer(TransformerMixin, BaseEstimator):\n \"\"\"\n Bin continuous data into intervals.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.20\n\n Parameters\n ----------\n n_bins : int or array-like of shape (n_features,), default=5\n The number of bins to produce. Raises ValueError if ``n_bins < 2``.\n\n encode : {'onehot', 'onehot-dense', 'ordinal'}, default='onehot'\n Method used to encode the transformed result.\n\n onehot\n Encode the transformed result with one-hot encoding\n and return a sparse matrix. Ignored features are always\n stacked to the right.\n onehot-dense\n Encode the transformed result with one-hot encoding\n and return a dense array. Ignored features are always\n stacked to the right.\n ordinal\n Return the bin identifier encoded as an integer value.\n\n strategy : {'uniform', 'quantile', 'kmeans'}, default='quantile'\n Strategy used to define the widths of the bins.\n\n uniform\n All bins in each feature have identical widths.\n quantile\n All bins in each feature have the same number of points.\n kmeans\n Values in each bin have the same nearest center of a 1D k-means\n cluster.\n\n dtype : {np.float32, np.float64}, default=None\n The desired data-type for the output. If None, output dtype is\n consistent with input dtype. Only np.float32 and np.float64 are\n supported.\n\n .. versionadded:: 0.24\n\n Attributes\n ----------\n bin_edges_ : ndarray of ndarray of shape (n_features,)\n The edges of each bin. Contain arrays of varying shapes ``(n_bins_, )``\n Ignored features will have empty arrays.\n\n n_bins_ : ndarray of shape (n_features,), dtype=np.int_\n Number of bins per feature. Bins whose width are too small\n (i.e., <= 1e-8) are removed with a warning.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. 
versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n Binarizer : Class used to bin values as ``0`` or\n ``1`` based on a parameter ``threshold``.\n\n Notes\n -----\n In bin edges for feature ``i``, the first and last values are used only for\n ``inverse_transform``. During transform, bin edges are extended to::\n\n np.concatenate([-np.inf, bin_edges_[i][1:-1], np.inf])\n\n You can combine ``KBinsDiscretizer`` with\n :class:`~sklearn.compose.ColumnTransformer` if you only want to preprocess\n part of the features.\n\n ``KBinsDiscretizer`` might produce constant features (e.g., when\n ``encode = 'onehot'`` and certain bins do not contain any data).\n These features can be removed with feature selection algorithms\n (e.g., :class:`~sklearn.feature_selection.VarianceThreshold`).\n\n Examples\n --------\n >>> from sklearn.preprocessing import KBinsDiscretizer\n >>> X = [[-2, 1, -4, -1],\n ... [-1, 2, -3, -0.5],\n ... [ 0, 3, -2, 0.5],\n ... [ 1, 4, -1, 2]]\n >>> est = KBinsDiscretizer(n_bins=3, encode='ordinal', strategy='uniform')\n >>> est.fit(X)\n KBinsDiscretizer(...)\n >>> Xt = est.transform(X)\n >>> Xt # doctest: +SKIP\n array([[ 0., 0., 0., 0.],\n [ 1., 1., 1., 0.],\n [ 2., 2., 2., 1.],\n [ 2., 2., 2., 2.]])\n\n Sometimes it may be useful to convert the data back into the original\n feature space. The ``inverse_transform`` function converts the binned\n data into the original feature space. Each value will be equal to the mean\n of the two bin edges.\n\n >>> est.bin_edges_[0]\n array([-2., -1., 0., 1.])\n >>> est.inverse_transform(Xt)\n array([[-1.5, 1.5, -3.5, -0.5],\n [-0.5, 2.5, -2.5, -0.5],\n [ 0.5, 3.5, -1.5, 0.5],\n [ 0.5, 3.5, -1.5, 1.5]])\n \"\"\"\n \n def __init__(self, n_bins=5, *, encode='onehot', strategy='quantile', dtype=None):\n self.n_bins = n_bins\n self.encode = encode\n self.strategy = strategy\n self.dtype = dtype\n \n def fit(self, X, y=None):\n \"\"\"\n Fit the estimator.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Data to be discretized.\n\n y : None\n Ignored. This parameter exists only for compatibility with\n :class:`~sklearn.pipeline.Pipeline`.\n\n Returns\n -------\n self : object\n Returns the instance itself.\n \"\"\"\n X = self._validate_data(X, dtype='numeric')\n supported_dtype = (np.float64, np.float32)\n if self.dtype in supported_dtype:\n output_dtype = self.dtype\n elif self.dtype is None:\n output_dtype = X.dtype\n else:\n raise ValueError(f\"Valid options for 'dtype' are {supported_dtype + (None, )}. Got dtype={self.dtype} instead.\")\n valid_encode = ('onehot', 'onehot-dense', 'ordinal')\n if self.encode not in valid_encode:\n raise ValueError(\"Valid options for 'encode' are {}. Got encode={!r} instead.\".format(valid_encode, self.encode))\n valid_strategy = ('uniform', 'quantile', 'kmeans')\n if self.strategy not in valid_strategy:\n raise ValueError(\"Valid options for 'strategy' are {}. Got strategy={!r} instead.\".format(valid_strategy, self.strategy))\n n_features = X.shape[1]\n n_bins = self._validate_n_bins(n_features)\n bin_edges = np.zeros(n_features, dtype=object)\n for jj in range(n_features):\n column = X[:, jj]\n (col_min, col_max) = (column.min(), column.max())\n if col_min == col_max:\n warnings.warn('Feature %d is constant and will be replaced with 0.' 
% jj)\n n_bins[jj] = 1\n bin_edges[jj] = np.array([-np.inf, np.inf])\n continue\n if self.strategy == 'uniform':\n bin_edges[jj] = np.linspace(col_min, col_max, n_bins[jj] + 1)\n elif self.strategy == 'quantile':\n quantiles = np.linspace(0, 100, n_bins[jj] + 1)\n bin_edges[jj] = np.asarray(np.percentile(column, quantiles))\n elif self.strategy == 'kmeans':\n from ..cluster import KMeans\n uniform_edges = np.linspace(col_min, col_max, n_bins[jj] + 1)\n init = (uniform_edges[1:] + uniform_edges[:-1])[:, None] * 0.5\n km = KMeans(n_clusters=n_bins[jj], init=init, n_init=1, algorithm='full')\n centers = km.fit(column[:, None]).cluster_centers_[:, 0]\n centers.sort()\n bin_edges[jj] = (centers[1:] + centers[:-1]) * 0.5\n bin_edges[jj] = np.r_[col_min, bin_edges[jj], col_max]\n if self.strategy in ('quantile', 'kmeans'):\n mask = np.ediff1d(bin_edges[jj], to_begin=np.inf) > 1e-08\n bin_edges[jj] = bin_edges[jj][mask]\n if len(bin_edges[jj]) - 1 != n_bins[jj]:\n warnings.warn('Bins whose width are too small (i.e., <= 1e-8) in feature %d are removed. Consider decreasing the number of bins.' % jj)\n n_bins[jj] = len(bin_edges[jj]) - 1\n self.bin_edges_ = bin_edges\n self.n_bins_ = n_bins\n if 'onehot' in self.encode:\n self._encoder = OneHotEncoder(categories=[np.arange(i) for i in self.n_bins_], sparse=self.encode == 'onehot', dtype=output_dtype)\n self._encoder.fit(np.zeros((1, len(self.n_bins_))))\n return self\n \n def _validate_n_bins(self, n_features):\n \"\"\"Returns n_bins_, the number of bins per feature.\"\"\"\n orig_bins = self.n_bins\n if isinstance(orig_bins, numbers.Number):\n if not isinstance(orig_bins, numbers.Integral):\n raise ValueError('{} received an invalid n_bins type. Received {}, expected int.'.format(KBinsDiscretizer.__name__, type(orig_bins).__name__))\n if orig_bins < 2:\n raise ValueError('{} received an invalid number of bins. Received {}, expected at least 2.'.format(KBinsDiscretizer.__name__, orig_bins))\n return np.full(n_features, orig_bins, dtype=int)\n n_bins = check_array(orig_bins, dtype=int, copy=True, ensure_2d=False)\n if n_bins.ndim > 1 or n_bins.shape[0] != n_features:\n raise ValueError('n_bins must be a scalar or array of shape (n_features,).')\n bad_nbins_value = (n_bins < 2) | (n_bins != orig_bins)\n violating_indices = np.where(bad_nbins_value)[0]\n if violating_indices.shape[0] > 0:\n indices = ', '.join((str(i) for i in violating_indices))\n raise ValueError('{} received an invalid number of bins at indices {}. Number of bins must be at least 2, and must be an int.'.format(KBinsDiscretizer.__name__, indices))\n return n_bins\n \n def transform(self, X):\n \"\"\"\n Discretize the data.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Data to be discretized.\n\n Returns\n -------\n Xt : {ndarray, sparse matrix}, dtype={np.float32, np.float64}\n Data in the binned space. 
Will be a sparse matrix if\n `self.encode='onehot'` and ndarray otherwise.\n \"\"\"\n check_is_fitted(self)\n dtype = (np.float64, np.float32) if self.dtype is None else self.dtype\n Xt = self._validate_data(X, copy=True, dtype=dtype, reset=False)\n bin_edges = self.bin_edges_\n for jj in range(Xt.shape[1]):\n rtol = 1e-05\n atol = 1e-08\n eps = atol + rtol * np.abs(Xt[:, jj])\n Xt[:, jj] = np.digitize(Xt[:, jj] + eps, bin_edges[jj][1:])\n np.clip(Xt, 0, self.n_bins_ - 1, out=Xt)\n if self.encode == 'ordinal':\n return Xt\n dtype_init = None\n if 'onehot' in self.encode:\n dtype_init = self._encoder.dtype\n self._encoder.dtype = Xt.dtype\n try:\n Xt_enc = self._encoder.transform(Xt)\n finally:\n self._encoder.dtype = dtype_init\n return Xt_enc\n \n def inverse_transform(self, Xt):\n \"\"\"\n Transform discretized data back to original feature space.\n\n Note that this function does not regenerate the original data\n due to discretization rounding.\n\n Parameters\n ----------\n Xt : array-like of shape (n_samples, n_features)\n Transformed data in the binned space.\n\n Returns\n -------\n Xinv : ndarray, dtype={np.float32, np.float64}\n Data in the original feature space.\n \"\"\"\n check_is_fitted(self)\n if 'onehot' in self.encode:\n Xt = self._encoder.inverse_transform(Xt)\n Xinv = check_array(Xt, copy=True, dtype=(np.float64, np.float32))\n n_features = self.n_bins_.shape[0]\n if Xinv.shape[1] != n_features:\n raise ValueError('Incorrect number of features. Expecting {}, received {}.'.format(n_features, Xinv.shape[1]))\n for jj in range(n_features):\n bin_edges = self.bin_edges_[jj]\n bin_centers = (bin_edges[1:] + bin_edges[:-1]) * 0.5\n Xinv[:, jj] = bin_centers[np.int_(Xinv[:, jj])]\n return Xinv\n \n def get_feature_names_out(self, input_features=None):\n \"\"\"Get output feature names.\n\n Parameters\n ----------\n input_features : array-like of str or None, default=None\n Input features.\n\n - If `input_features` is `None`, then `feature_names_in_` is\n used as feature names in. If `feature_names_in_` is not defined,\n then names are generated: `[x0, x1, ..., x(n_features_in_)]`.\n - If `input_features` is an array-like, then `input_features` must\n match `feature_names_in_` if `feature_names_in_` is defined.\n\n Returns\n -------\n feature_names_out : ndarray of str objects\n Transformed feature names.\n \"\"\"\n input_features = _check_feature_names_in(self, input_features)\n return self._encoder.get_feature_names_out(input_features)\n" + "description": "Bin continuous data into intervals.\n\nRead more in the :ref:`User Guide `.\n\n.. versionadded:: 0.20", + "docstring": "\n Bin continuous data into intervals.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.20\n\n Parameters\n ----------\n n_bins : int or array-like of shape (n_features,), default=5\n The number of bins to produce. Raises ValueError if ``n_bins < 2``.\n\n encode : {'onehot', 'onehot-dense', 'ordinal'}, default='onehot'\n Method used to encode the transformed result.\n\n - 'onehot': Encode the transformed result with one-hot encoding\n and return a sparse matrix. Ignored features are always\n stacked to the right.\n - 'onehot-dense': Encode the transformed result with one-hot encoding\n and return a dense array. 
Ignored features are always\n stacked to the right.\n - 'ordinal': Return the bin identifier encoded as an integer value.\n\n strategy : {'uniform', 'quantile', 'kmeans'}, default='quantile'\n Strategy used to define the widths of the bins.\n\n - 'uniform': All bins in each feature have identical widths.\n - 'quantile': All bins in each feature have the same number of points.\n - 'kmeans': Values in each bin have the same nearest center of a 1D\n k-means cluster.\n\n dtype : {np.float32, np.float64}, default=None\n The desired data-type for the output. If None, output dtype is\n consistent with input dtype. Only np.float32 and np.float64 are\n supported.\n\n .. versionadded:: 0.24\n\n Attributes\n ----------\n bin_edges_ : ndarray of ndarray of shape (n_features,)\n The edges of each bin. Contain arrays of varying shapes ``(n_bins_, )``\n Ignored features will have empty arrays.\n\n n_bins_ : ndarray of shape (n_features,), dtype=np.int_\n Number of bins per feature. Bins whose width are too small\n (i.e., <= 1e-8) are removed with a warning.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n Binarizer : Class used to bin values as ``0`` or\n ``1`` based on a parameter ``threshold``.\n\n Notes\n -----\n In bin edges for feature ``i``, the first and last values are used only for\n ``inverse_transform``. During transform, bin edges are extended to::\n\n np.concatenate([-np.inf, bin_edges_[i][1:-1], np.inf])\n\n You can combine ``KBinsDiscretizer`` with\n :class:`~sklearn.compose.ColumnTransformer` if you only want to preprocess\n part of the features.\n\n ``KBinsDiscretizer`` might produce constant features (e.g., when\n ``encode = 'onehot'`` and certain bins do not contain any data).\n These features can be removed with feature selection algorithms\n (e.g., :class:`~sklearn.feature_selection.VarianceThreshold`).\n\n Examples\n --------\n >>> from sklearn.preprocessing import KBinsDiscretizer\n >>> X = [[-2, 1, -4, -1],\n ... [-1, 2, -3, -0.5],\n ... [ 0, 3, -2, 0.5],\n ... [ 1, 4, -1, 2]]\n >>> est = KBinsDiscretizer(n_bins=3, encode='ordinal', strategy='uniform')\n >>> est.fit(X)\n KBinsDiscretizer(...)\n >>> Xt = est.transform(X)\n >>> Xt # doctest: +SKIP\n array([[ 0., 0., 0., 0.],\n [ 1., 1., 1., 0.],\n [ 2., 2., 2., 1.],\n [ 2., 2., 2., 2.]])\n\n Sometimes it may be useful to convert the data back into the original\n feature space. The ``inverse_transform`` function converts the binned\n data into the original feature space. Each value will be equal to the mean\n of the two bin edges.\n\n >>> est.bin_edges_[0]\n array([-2., -1., 0., 1.])\n >>> est.inverse_transform(Xt)\n array([[-1.5, 1.5, -3.5, -0.5],\n [-0.5, 2.5, -2.5, -0.5],\n [ 0.5, 3.5, -1.5, 0.5],\n [ 0.5, 3.5, -1.5, 1.5]])\n ", + "source_code": "\n\nclass KBinsDiscretizer(TransformerMixin, BaseEstimator):\n \"\"\"\n Bin continuous data into intervals.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.20\n\n Parameters\n ----------\n n_bins : int or array-like of shape (n_features,), default=5\n The number of bins to produce. 
Raises ValueError if ``n_bins < 2``.\n\n encode : {'onehot', 'onehot-dense', 'ordinal'}, default='onehot'\n Method used to encode the transformed result.\n\n - 'onehot': Encode the transformed result with one-hot encoding\n and return a sparse matrix. Ignored features are always\n stacked to the right.\n - 'onehot-dense': Encode the transformed result with one-hot encoding\n and return a dense array. Ignored features are always\n stacked to the right.\n - 'ordinal': Return the bin identifier encoded as an integer value.\n\n strategy : {'uniform', 'quantile', 'kmeans'}, default='quantile'\n Strategy used to define the widths of the bins.\n\n - 'uniform': All bins in each feature have identical widths.\n - 'quantile': All bins in each feature have the same number of points.\n - 'kmeans': Values in each bin have the same nearest center of a 1D\n k-means cluster.\n\n dtype : {np.float32, np.float64}, default=None\n The desired data-type for the output. If None, output dtype is\n consistent with input dtype. Only np.float32 and np.float64 are\n supported.\n\n .. versionadded:: 0.24\n\n Attributes\n ----------\n bin_edges_ : ndarray of ndarray of shape (n_features,)\n The edges of each bin. Contain arrays of varying shapes ``(n_bins_, )``\n Ignored features will have empty arrays.\n\n n_bins_ : ndarray of shape (n_features,), dtype=np.int_\n Number of bins per feature. Bins whose width are too small\n (i.e., <= 1e-8) are removed with a warning.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n Binarizer : Class used to bin values as ``0`` or\n ``1`` based on a parameter ``threshold``.\n\n Notes\n -----\n In bin edges for feature ``i``, the first and last values are used only for\n ``inverse_transform``. During transform, bin edges are extended to::\n\n np.concatenate([-np.inf, bin_edges_[i][1:-1], np.inf])\n\n You can combine ``KBinsDiscretizer`` with\n :class:`~sklearn.compose.ColumnTransformer` if you only want to preprocess\n part of the features.\n\n ``KBinsDiscretizer`` might produce constant features (e.g., when\n ``encode = 'onehot'`` and certain bins do not contain any data).\n These features can be removed with feature selection algorithms\n (e.g., :class:`~sklearn.feature_selection.VarianceThreshold`).\n\n Examples\n --------\n >>> from sklearn.preprocessing import KBinsDiscretizer\n >>> X = [[-2, 1, -4, -1],\n ... [-1, 2, -3, -0.5],\n ... [ 0, 3, -2, 0.5],\n ... [ 1, 4, -1, 2]]\n >>> est = KBinsDiscretizer(n_bins=3, encode='ordinal', strategy='uniform')\n >>> est.fit(X)\n KBinsDiscretizer(...)\n >>> Xt = est.transform(X)\n >>> Xt # doctest: +SKIP\n array([[ 0., 0., 0., 0.],\n [ 1., 1., 1., 0.],\n [ 2., 2., 2., 1.],\n [ 2., 2., 2., 2.]])\n\n Sometimes it may be useful to convert the data back into the original\n feature space. The ``inverse_transform`` function converts the binned\n data into the original feature space. 
Each value will be equal to the mean\n of the two bin edges.\n\n >>> est.bin_edges_[0]\n array([-2., -1., 0., 1.])\n >>> est.inverse_transform(Xt)\n array([[-1.5, 1.5, -3.5, -0.5],\n [-0.5, 2.5, -2.5, -0.5],\n [ 0.5, 3.5, -1.5, 0.5],\n [ 0.5, 3.5, -1.5, 1.5]])\n \"\"\"\n \n def __init__(self, n_bins=5, *, encode='onehot', strategy='quantile', dtype=None):\n self.n_bins = n_bins\n self.encode = encode\n self.strategy = strategy\n self.dtype = dtype\n \n def fit(self, X, y=None):\n \"\"\"\n Fit the estimator.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Data to be discretized.\n\n y : None\n Ignored. This parameter exists only for compatibility with\n :class:`~sklearn.pipeline.Pipeline`.\n\n Returns\n -------\n self : object\n Returns the instance itself.\n \"\"\"\n X = self._validate_data(X, dtype='numeric')\n supported_dtype = (np.float64, np.float32)\n if self.dtype in supported_dtype:\n output_dtype = self.dtype\n elif self.dtype is None:\n output_dtype = X.dtype\n else:\n raise ValueError(f\"Valid options for 'dtype' are {supported_dtype + (None, )}. Got dtype={self.dtype} instead.\")\n valid_encode = ('onehot', 'onehot-dense', 'ordinal')\n if self.encode not in valid_encode:\n raise ValueError(\"Valid options for 'encode' are {}. Got encode={!r} instead.\".format(valid_encode, self.encode))\n valid_strategy = ('uniform', 'quantile', 'kmeans')\n if self.strategy not in valid_strategy:\n raise ValueError(\"Valid options for 'strategy' are {}. Got strategy={!r} instead.\".format(valid_strategy, self.strategy))\n n_features = X.shape[1]\n n_bins = self._validate_n_bins(n_features)\n bin_edges = np.zeros(n_features, dtype=object)\n for jj in range(n_features):\n column = X[:, jj]\n (col_min, col_max) = (column.min(), column.max())\n if col_min == col_max:\n warnings.warn('Feature %d is constant and will be replaced with 0.' % jj)\n n_bins[jj] = 1\n bin_edges[jj] = np.array([-np.inf, np.inf])\n continue\n if self.strategy == 'uniform':\n bin_edges[jj] = np.linspace(col_min, col_max, n_bins[jj] + 1)\n elif self.strategy == 'quantile':\n quantiles = np.linspace(0, 100, n_bins[jj] + 1)\n bin_edges[jj] = np.asarray(np.percentile(column, quantiles))\n elif self.strategy == 'kmeans':\n from ..cluster import KMeans\n uniform_edges = np.linspace(col_min, col_max, n_bins[jj] + 1)\n init = (uniform_edges[1:] + uniform_edges[:-1])[:, None] * 0.5\n km = KMeans(n_clusters=n_bins[jj], init=init, n_init=1, algorithm='full')\n centers = km.fit(column[:, None]).cluster_centers_[:, 0]\n centers.sort()\n bin_edges[jj] = (centers[1:] + centers[:-1]) * 0.5\n bin_edges[jj] = np.r_[col_min, bin_edges[jj], col_max]\n if self.strategy in ('quantile', 'kmeans'):\n mask = np.ediff1d(bin_edges[jj], to_begin=np.inf) > 1e-08\n bin_edges[jj] = bin_edges[jj][mask]\n if len(bin_edges[jj]) - 1 != n_bins[jj]:\n warnings.warn('Bins whose width are too small (i.e., <= 1e-8) in feature %d are removed. Consider decreasing the number of bins.' 
% jj)\n n_bins[jj] = len(bin_edges[jj]) - 1\n self.bin_edges_ = bin_edges\n self.n_bins_ = n_bins\n if 'onehot' in self.encode:\n self._encoder = OneHotEncoder(categories=[np.arange(i) for i in self.n_bins_], sparse=self.encode == 'onehot', dtype=output_dtype)\n self._encoder.fit(np.zeros((1, len(self.n_bins_))))\n return self\n \n def _validate_n_bins(self, n_features):\n \"\"\"Returns n_bins_, the number of bins per feature.\"\"\"\n orig_bins = self.n_bins\n if isinstance(orig_bins, numbers.Number):\n if not isinstance(orig_bins, numbers.Integral):\n raise ValueError('{} received an invalid n_bins type. Received {}, expected int.'.format(KBinsDiscretizer.__name__, type(orig_bins).__name__))\n if orig_bins < 2:\n raise ValueError('{} received an invalid number of bins. Received {}, expected at least 2.'.format(KBinsDiscretizer.__name__, orig_bins))\n return np.full(n_features, orig_bins, dtype=int)\n n_bins = check_array(orig_bins, dtype=int, copy=True, ensure_2d=False)\n if n_bins.ndim > 1 or n_bins.shape[0] != n_features:\n raise ValueError('n_bins must be a scalar or array of shape (n_features,).')\n bad_nbins_value = (n_bins < 2) | (n_bins != orig_bins)\n violating_indices = np.where(bad_nbins_value)[0]\n if violating_indices.shape[0] > 0:\n indices = ', '.join((str(i) for i in violating_indices))\n raise ValueError('{} received an invalid number of bins at indices {}. Number of bins must be at least 2, and must be an int.'.format(KBinsDiscretizer.__name__, indices))\n return n_bins\n \n def transform(self, X):\n \"\"\"\n Discretize the data.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Data to be discretized.\n\n Returns\n -------\n Xt : {ndarray, sparse matrix}, dtype={np.float32, np.float64}\n Data in the binned space. Will be a sparse matrix if\n `self.encode='onehot'` and ndarray otherwise.\n \"\"\"\n check_is_fitted(self)\n dtype = (np.float64, np.float32) if self.dtype is None else self.dtype\n Xt = self._validate_data(X, copy=True, dtype=dtype, reset=False)\n bin_edges = self.bin_edges_\n for jj in range(Xt.shape[1]):\n rtol = 1e-05\n atol = 1e-08\n eps = atol + rtol * np.abs(Xt[:, jj])\n Xt[:, jj] = np.digitize(Xt[:, jj] + eps, bin_edges[jj][1:])\n np.clip(Xt, 0, self.n_bins_ - 1, out=Xt)\n if self.encode == 'ordinal':\n return Xt\n dtype_init = None\n if 'onehot' in self.encode:\n dtype_init = self._encoder.dtype\n self._encoder.dtype = Xt.dtype\n try:\n Xt_enc = self._encoder.transform(Xt)\n finally:\n self._encoder.dtype = dtype_init\n return Xt_enc\n \n def inverse_transform(self, Xt):\n \"\"\"\n Transform discretized data back to original feature space.\n\n Note that this function does not regenerate the original data\n due to discretization rounding.\n\n Parameters\n ----------\n Xt : array-like of shape (n_samples, n_features)\n Transformed data in the binned space.\n\n Returns\n -------\n Xinv : ndarray, dtype={np.float32, np.float64}\n Data in the original feature space.\n \"\"\"\n check_is_fitted(self)\n if 'onehot' in self.encode:\n Xt = self._encoder.inverse_transform(Xt)\n Xinv = check_array(Xt, copy=True, dtype=(np.float64, np.float32))\n n_features = self.n_bins_.shape[0]\n if Xinv.shape[1] != n_features:\n raise ValueError('Incorrect number of features. 
Expecting {}, received {}.'.format(n_features, Xinv.shape[1]))\n for jj in range(n_features):\n bin_edges = self.bin_edges_[jj]\n bin_centers = (bin_edges[1:] + bin_edges[:-1]) * 0.5\n Xinv[:, jj] = bin_centers[np.int_(Xinv[:, jj])]\n return Xinv\n \n def get_feature_names_out(self, input_features=None):\n \"\"\"Get output feature names.\n\n Parameters\n ----------\n input_features : array-like of str or None, default=None\n Input features.\n\n - If `input_features` is `None`, then `feature_names_in_` is\n used as feature names in. If `feature_names_in_` is not defined,\n then names are generated: `[x0, x1, ..., x(n_features_in_)]`.\n - If `input_features` is an array-like, then `input_features` must\n match `feature_names_in_` if `feature_names_in_` is defined.\n\n Returns\n -------\n feature_names_out : ndarray of str objects\n Transformed feature names.\n \"\"\"\n input_features = _check_feature_names_in(self, input_features)\n return self._encoder.get_feature_names_out(input_features)\n" }, { "name": "OneHotEncoder", @@ -26029,9 +26125,9 @@ "sklearn.preprocessing._encoders.OneHotEncoder.get_feature_names_out" ], "is_public": true, - "description": "Encode categorical features as a one-hot numeric array.\n\nThe input to this transformer should be an array-like of integers or strings, denoting the values taken on by categorical (discrete) features. The features are encoded using a one-hot (aka 'one-of-K' or 'dummy') encoding scheme. This creates a binary column for each category and returns a sparse matrix or dense array (depending on the ``sparse`` parameter) By default, the encoder derives the categories based on the unique values in each feature. Alternatively, you can also specify the `categories` manually. This encoding is needed for feeding categorical data to many scikit-learn estimators, notably linear models and SVMs with the standard kernels. Note: a one-hot encoding of y labels should use a LabelBinarizer instead. Read more in the :ref:`User Guide `.", - "docstring": "\n Encode categorical features as a one-hot numeric array.\n\n The input to this transformer should be an array-like of integers or\n strings, denoting the values taken on by categorical (discrete) features.\n The features are encoded using a one-hot (aka 'one-of-K' or 'dummy')\n encoding scheme. This creates a binary column for each category and\n returns a sparse matrix or dense array (depending on the ``sparse``\n parameter)\n\n By default, the encoder derives the categories based on the unique values\n in each feature. Alternatively, you can also specify the `categories`\n manually.\n\n This encoding is needed for feeding categorical data to many scikit-learn\n estimators, notably linear models and SVMs with the standard kernels.\n\n Note: a one-hot encoding of y labels should use a LabelBinarizer\n instead.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n categories : 'auto' or a list of array-like, default='auto'\n Categories (unique values) per feature:\n\n - 'auto' : Determine categories automatically from the training data.\n - list : ``categories[i]`` holds the categories expected in the ith\n column. The passed categories should not mix strings and numeric\n values within a single feature, and should be sorted in case of\n numeric values.\n\n The used categories can be found in the ``categories_`` attribute.\n\n .. 
versionadded:: 0.20\n\n drop : {'first', 'if_binary'} or a array-like of shape (n_features,), default=None\n Specifies a methodology to use to drop one of the categories per\n feature. This is useful in situations where perfectly collinear\n features cause problems, such as when feeding the resulting data\n into a neural network or an unregularized regression.\n\n However, dropping one category breaks the symmetry of the original\n representation and can therefore induce a bias in downstream models,\n for instance for penalized linear classification or regression models.\n\n - None : retain all features (the default).\n - 'first' : drop the first category in each feature. If only one\n category is present, the feature will be dropped entirely.\n - 'if_binary' : drop the first category in each feature with two\n categories. Features with 1 or more than 2 categories are\n left intact.\n - array : ``drop[i]`` is the category in feature ``X[:, i]`` that\n should be dropped.\n\n .. versionadded:: 0.21\n The parameter `drop` was added in 0.21.\n\n .. versionchanged:: 0.23\n The option `drop='if_binary'` was added in 0.23.\n\n sparse : bool, default=True\n Will return sparse matrix if set True else will return an array.\n\n dtype : number type, default=float\n Desired dtype of output.\n\n handle_unknown : {'error', 'ignore'}, default='error'\n Whether to raise an error or ignore if an unknown categorical feature\n is present during transform (default is to raise). When this parameter\n is set to 'ignore' and an unknown category is encountered during\n transform, the resulting one-hot encoded columns for this feature\n will be all zeros. In the inverse transform, an unknown category\n will be denoted as None.\n\n Attributes\n ----------\n categories_ : list of arrays\n The categories of each feature determined during fitting\n (in order of the features in X and corresponding with the output\n of ``transform``). This includes the category specified in ``drop``\n (if any).\n\n drop_idx_ : array of shape (n_features,)\n - ``drop_idx_[i]`` is\u00a0the index in ``categories_[i]`` of the category\n to be dropped for each feature.\n - ``drop_idx_[i] = None`` if no category is to be dropped from the\n feature with index ``i``, e.g. when `drop='if_binary'` and the\n feature isn't binary.\n - ``drop_idx_ = None`` if all the transformed features will be\n retained.\n\n .. versionchanged:: 0.23\n Added the possibility to contain `None` values.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 1.0\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n OrdinalEncoder : Performs an ordinal (integer)\n encoding of the categorical features.\n sklearn.feature_extraction.DictVectorizer : Performs a one-hot encoding of\n dictionary items (also handles string-valued features).\n sklearn.feature_extraction.FeatureHasher : Performs an approximate one-hot\n encoding of dictionary items or strings.\n LabelBinarizer : Binarizes labels in a one-vs-all\n fashion.\n MultiLabelBinarizer : Transforms between iterable of\n iterables and a multilabel format, e.g. 
a (samples x classes) binary\n matrix indicating the presence of a class label.\n\n Examples\n --------\n Given a dataset with two features, we let the encoder find the unique\n values per feature and transform the data to a binary one-hot encoding.\n\n >>> from sklearn.preprocessing import OneHotEncoder\n\n One can discard categories not seen during `fit`:\n\n >>> enc = OneHotEncoder(handle_unknown='ignore')\n >>> X = [['Male', 1], ['Female', 3], ['Female', 2]]\n >>> enc.fit(X)\n OneHotEncoder(handle_unknown='ignore')\n >>> enc.categories_\n [array(['Female', 'Male'], dtype=object), array([1, 2, 3], dtype=object)]\n >>> enc.transform([['Female', 1], ['Male', 4]]).toarray()\n array([[1., 0., 1., 0., 0.],\n [0., 1., 0., 0., 0.]])\n >>> enc.inverse_transform([[0, 1, 1, 0, 0], [0, 0, 0, 1, 0]])\n array([['Male', 1],\n [None, 2]], dtype=object)\n >>> enc.get_feature_names_out(['gender', 'group'])\n array(['gender_Female', 'gender_Male', 'group_1', 'group_2', 'group_3'], ...)\n\n One can always drop the first column for each feature:\n\n >>> drop_enc = OneHotEncoder(drop='first').fit(X)\n >>> drop_enc.categories_\n [array(['Female', 'Male'], dtype=object), array([1, 2, 3], dtype=object)]\n >>> drop_enc.transform([['Female', 1], ['Male', 2]]).toarray()\n array([[0., 0., 0.],\n [1., 1., 0.]])\n\n Or drop a column for feature only having 2 categories:\n\n >>> drop_binary_enc = OneHotEncoder(drop='if_binary').fit(X)\n >>> drop_binary_enc.transform([['Female', 1], ['Male', 2]]).toarray()\n array([[0., 1., 0., 0.],\n [1., 0., 1., 0.]])\n ", - "source_code": "\n\nclass OneHotEncoder(_BaseEncoder):\n \"\"\"\n Encode categorical features as a one-hot numeric array.\n\n The input to this transformer should be an array-like of integers or\n strings, denoting the values taken on by categorical (discrete) features.\n The features are encoded using a one-hot (aka 'one-of-K' or 'dummy')\n encoding scheme. This creates a binary column for each category and\n returns a sparse matrix or dense array (depending on the ``sparse``\n parameter)\n\n By default, the encoder derives the categories based on the unique values\n in each feature. Alternatively, you can also specify the `categories`\n manually.\n\n This encoding is needed for feeding categorical data to many scikit-learn\n estimators, notably linear models and SVMs with the standard kernels.\n\n Note: a one-hot encoding of y labels should use a LabelBinarizer\n instead.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n categories : 'auto' or a list of array-like, default='auto'\n Categories (unique values) per feature:\n\n - 'auto' : Determine categories automatically from the training data.\n - list : ``categories[i]`` holds the categories expected in the ith\n column. The passed categories should not mix strings and numeric\n values within a single feature, and should be sorted in case of\n numeric values.\n\n The used categories can be found in the ``categories_`` attribute.\n\n .. versionadded:: 0.20\n\n drop : {'first', 'if_binary'} or a array-like of shape (n_features,), default=None\n Specifies a methodology to use to drop one of the categories per\n feature. 
This is useful in situations where perfectly collinear\n features cause problems, such as when feeding the resulting data\n into a neural network or an unregularized regression.\n\n However, dropping one category breaks the symmetry of the original\n representation and can therefore induce a bias in downstream models,\n for instance for penalized linear classification or regression models.\n\n - None : retain all features (the default).\n - 'first' : drop the first category in each feature. If only one\n category is present, the feature will be dropped entirely.\n - 'if_binary' : drop the first category in each feature with two\n categories. Features with 1 or more than 2 categories are\n left intact.\n - array : ``drop[i]`` is the category in feature ``X[:, i]`` that\n should be dropped.\n\n .. versionadded:: 0.21\n The parameter `drop` was added in 0.21.\n\n .. versionchanged:: 0.23\n The option `drop='if_binary'` was added in 0.23.\n\n sparse : bool, default=True\n Will return sparse matrix if set True else will return an array.\n\n dtype : number type, default=float\n Desired dtype of output.\n\n handle_unknown : {'error', 'ignore'}, default='error'\n Whether to raise an error or ignore if an unknown categorical feature\n is present during transform (default is to raise). When this parameter\n is set to 'ignore' and an unknown category is encountered during\n transform, the resulting one-hot encoded columns for this feature\n will be all zeros. In the inverse transform, an unknown category\n will be denoted as None.\n\n Attributes\n ----------\n categories_ : list of arrays\n The categories of each feature determined during fitting\n (in order of the features in X and corresponding with the output\n of ``transform``). This includes the category specified in ``drop``\n (if any).\n\n drop_idx_ : array of shape (n_features,)\n - ``drop_idx_[i]`` is\u00a0the index in ``categories_[i]`` of the category\n to be dropped for each feature.\n - ``drop_idx_[i] = None`` if no category is to be dropped from the\n feature with index ``i``, e.g. when `drop='if_binary'` and the\n feature isn't binary.\n - ``drop_idx_ = None`` if all the transformed features will be\n retained.\n\n .. versionchanged:: 0.23\n Added the possibility to contain `None` values.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 1.0\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n OrdinalEncoder : Performs an ordinal (integer)\n encoding of the categorical features.\n sklearn.feature_extraction.DictVectorizer : Performs a one-hot encoding of\n dictionary items (also handles string-valued features).\n sklearn.feature_extraction.FeatureHasher : Performs an approximate one-hot\n encoding of dictionary items or strings.\n LabelBinarizer : Binarizes labels in a one-vs-all\n fashion.\n MultiLabelBinarizer : Transforms between iterable of\n iterables and a multilabel format, e.g. 
a (samples x classes) binary\n matrix indicating the presence of a class label.\n\n Examples\n --------\n Given a dataset with two features, we let the encoder find the unique\n values per feature and transform the data to a binary one-hot encoding.\n\n >>> from sklearn.preprocessing import OneHotEncoder\n\n One can discard categories not seen during `fit`:\n\n >>> enc = OneHotEncoder(handle_unknown='ignore')\n >>> X = [['Male', 1], ['Female', 3], ['Female', 2]]\n >>> enc.fit(X)\n OneHotEncoder(handle_unknown='ignore')\n >>> enc.categories_\n [array(['Female', 'Male'], dtype=object), array([1, 2, 3], dtype=object)]\n >>> enc.transform([['Female', 1], ['Male', 4]]).toarray()\n array([[1., 0., 1., 0., 0.],\n [0., 1., 0., 0., 0.]])\n >>> enc.inverse_transform([[0, 1, 1, 0, 0], [0, 0, 0, 1, 0]])\n array([['Male', 1],\n [None, 2]], dtype=object)\n >>> enc.get_feature_names_out(['gender', 'group'])\n array(['gender_Female', 'gender_Male', 'group_1', 'group_2', 'group_3'], ...)\n\n One can always drop the first column for each feature:\n\n >>> drop_enc = OneHotEncoder(drop='first').fit(X)\n >>> drop_enc.categories_\n [array(['Female', 'Male'], dtype=object), array([1, 2, 3], dtype=object)]\n >>> drop_enc.transform([['Female', 1], ['Male', 2]]).toarray()\n array([[0., 0., 0.],\n [1., 1., 0.]])\n\n Or drop a column for feature only having 2 categories:\n\n >>> drop_binary_enc = OneHotEncoder(drop='if_binary').fit(X)\n >>> drop_binary_enc.transform([['Female', 1], ['Male', 2]]).toarray()\n array([[0., 1., 0., 0.],\n [1., 0., 1., 0.]])\n \"\"\"\n \n def __init__(self, *, categories='auto', drop=None, sparse=True, dtype=np.float64, handle_unknown='error'):\n self.categories = categories\n self.sparse = sparse\n self.dtype = dtype\n self.handle_unknown = handle_unknown\n self.drop = drop\n \n def _validate_keywords(self):\n if self.handle_unknown not in ('error', 'ignore'):\n msg = \"handle_unknown should be either 'error' or 'ignore', got {0}.\".format(self.handle_unknown)\n raise ValueError(msg)\n \n def _compute_drop_idx(self):\n if self.drop is None:\n return None\n elif isinstance(self.drop, str):\n if self.drop == 'first':\n return np.zeros(len(self.categories_), dtype=object)\n elif self.drop == 'if_binary':\n return np.array([0 if len(cats) == 2 else None for cats in self.categories_], dtype=object)\n else:\n msg = \"Wrong input for parameter `drop`. Expected 'first', 'if_binary', None or array of objects, got {}\"\n raise ValueError(msg.format(type(self.drop)))\n else:\n try:\n drop_array = np.asarray(self.drop, dtype=object)\n droplen = len(drop_array)\n except (ValueError, TypeError):\n msg = \"Wrong input for parameter `drop`. 
Expected 'first', 'if_binary', None or array of objects, got {}\"\n raise ValueError(msg.format(type(drop_array)))\n if droplen != len(self.categories_):\n msg = '`drop` should have length equal to the number of features ({}), got {}'\n raise ValueError(msg.format(len(self.categories_), droplen))\n missing_drops = []\n drop_indices = []\n for (col_idx, (val, cat_list)) in enumerate(zip(drop_array, self.categories_)):\n if not is_scalar_nan(val):\n drop_idx = np.where(cat_list == val)[0]\n if drop_idx.size:\n drop_indices.append(drop_idx[0])\n else:\n missing_drops.append((col_idx, val))\n continue\n for (cat_idx, cat) in enumerate(cat_list):\n if is_scalar_nan(cat):\n drop_indices.append(cat_idx)\n break\n else:\n missing_drops.append((col_idx, val))\n if any(missing_drops):\n msg = 'The following categories were supposed to be dropped, but were not found in the training data.\\n{}'.format('\\n'.join(['Category: {}, Feature: {}'.format(c, v) for (c, v) in missing_drops]))\n raise ValueError(msg)\n return np.array(drop_indices, dtype=object)\n \n def fit(self, X, y=None):\n \"\"\"\n Fit OneHotEncoder to X.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The data to determine the categories of each feature.\n\n y : None\n Ignored. This parameter exists only for compatibility with\n :class:`~sklearn.pipeline.Pipeline`.\n\n Returns\n -------\n self\n Fitted encoder.\n \"\"\"\n self._validate_keywords()\n self._fit(X, handle_unknown=self.handle_unknown, force_all_finite='allow-nan')\n self.drop_idx_ = self._compute_drop_idx()\n return self\n \n def fit_transform(self, X, y=None):\n \"\"\"\n Fit OneHotEncoder to X, then transform X.\n\n Equivalent to fit(X).transform(X) but more convenient.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The data to encode.\n\n y : None\n Ignored. This parameter exists only for compatibility with\n :class:`~sklearn.pipeline.Pipeline`.\n\n Returns\n -------\n X_out : {ndarray, sparse matrix} of shape (n_samples, n_encoded_features)\n Transformed input. If `sparse=True`, a sparse matrix will be\n returned.\n \"\"\"\n self._validate_keywords()\n return super().fit_transform(X, y)\n \n def transform(self, X):\n \"\"\"\n Transform X using one-hot encoding.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The data to encode.\n\n Returns\n -------\n X_out : {ndarray, sparse matrix} of shape (n_samples, n_encoded_features)\n Transformed input. 
If `sparse=True`, a sparse matrix will be\n returned.\n \"\"\"\n check_is_fitted(self)\n warn_on_unknown = self.handle_unknown == 'ignore' and self.drop is not None\n (X_int, X_mask) = self._transform(X, handle_unknown=self.handle_unknown, force_all_finite='allow-nan', warn_on_unknown=warn_on_unknown)\n (n_samples, n_features) = X_int.shape\n if self.drop_idx_ is not None:\n to_drop = self.drop_idx_.copy()\n keep_cells = X_int != to_drop\n n_values = []\n for (i, cats) in enumerate(self.categories_):\n n_cats = len(cats)\n if to_drop[i] is None:\n to_drop[i] = n_cats\n n_values.append(n_cats)\n else:\n n_values.append(n_cats - 1)\n to_drop = to_drop.reshape(1, -1)\n X_int[X_int > to_drop] -= 1\n X_mask &= keep_cells\n else:\n n_values = [len(cats) for cats in self.categories_]\n mask = X_mask.ravel()\n feature_indices = np.cumsum([0] + n_values)\n indices = (X_int + feature_indices[:-1]).ravel()[mask]\n indptr = np.empty(n_samples + 1, dtype=int)\n indptr[0] = 0\n np.sum(X_mask, axis=1, out=indptr[1:])\n np.cumsum(indptr[1:], out=indptr[1:])\n data = np.ones(indptr[-1])\n out = sparse.csr_matrix((data, indices, indptr), shape=(n_samples, feature_indices[-1]), dtype=self.dtype)\n if not self.sparse:\n return out.toarray()\n else:\n return out\n \n def inverse_transform(self, X):\n \"\"\"\n Convert the data back to the original representation.\n\n When unknown categories are encountered (all zeros in the\n one-hot encoding), ``None`` is used to represent this category. If the\n feature with the unknown category has a dropped caregory, the dropped\n category will be its inverse.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_encoded_features)\n The transformed data.\n\n Returns\n -------\n X_tr : ndarray of shape (n_samples, n_features)\n Inverse transformed array.\n \"\"\"\n check_is_fitted(self)\n X = check_array(X, accept_sparse='csr')\n (n_samples, _) = X.shape\n n_features = len(self.categories_)\n if self.drop_idx_ is None:\n n_transformed_features = sum((len(cats) for cats in self.categories_))\n else:\n n_transformed_features = sum((len(cats) - 1 if to_drop is not None else len(cats) for (cats, to_drop) in zip(self.categories_, self.drop_idx_)))\n msg = 'Shape of the passed X data is not correct. 
Expected {0} columns, got {1}.'\n if X.shape[1] != n_transformed_features:\n raise ValueError(msg.format(n_transformed_features, X.shape[1]))\n dt = np.find_common_type([cat.dtype for cat in self.categories_], [])\n X_tr = np.empty((n_samples, n_features), dtype=dt)\n j = 0\n found_unknown = {}\n for i in range(n_features):\n if self.drop_idx_ is None or self.drop_idx_[i] is None:\n cats = self.categories_[i]\n else:\n cats = np.delete(self.categories_[i], self.drop_idx_[i])\n n_categories = len(cats)\n if n_categories == 0:\n X_tr[:, i] = self.categories_[i][self.drop_idx_[i]]\n j += n_categories\n continue\n sub = X[:, j:j + n_categories]\n labels = np.asarray(sub.argmax(axis=1)).flatten()\n X_tr[:, i] = cats[labels]\n if self.handle_unknown == 'ignore':\n unknown = np.asarray(sub.sum(axis=1) == 0).flatten()\n if unknown.any():\n if self.drop_idx_ is None or self.drop_idx_[i] is None:\n found_unknown[i] = unknown\n else:\n X_tr[unknown, i] = self.categories_[i][self.drop_idx_[i]]\n else:\n dropped = np.asarray(sub.sum(axis=1) == 0).flatten()\n if dropped.any():\n if self.drop_idx_ is None:\n all_zero_samples = np.flatnonzero(dropped)\n raise ValueError(f\"Samples {all_zero_samples} can not be inverted when drop=None and handle_unknown='error' because they contain all zeros\")\n X_tr[dropped, i] = self.categories_[i][self.drop_idx_[i]]\n j += n_categories\n if found_unknown:\n if X_tr.dtype != object:\n X_tr = X_tr.astype(object)\n for (idx, mask) in found_unknown.items():\n X_tr[mask, idx] = None\n return X_tr\n \n @deprecated('get_feature_names is deprecated in 1.0 and will be removed in 1.2. Please use get_feature_names_out instead.')\n def get_feature_names(self, input_features=None):\n \"\"\"Return feature names for output features.\n\n Parameters\n ----------\n input_features : list of str of shape (n_features,)\n String names for input features if available. By default,\n \"x0\", \"x1\", ... \"xn_features\" is used.\n\n Returns\n -------\n output_feature_names : ndarray of shape (n_output_features,)\n Array of feature names.\n \"\"\"\n check_is_fitted(self)\n cats = self.categories_\n if input_features is None:\n input_features = ['x%d' % i for i in range(len(cats))]\n elif len(input_features) != len(self.categories_):\n raise ValueError('input_features should have length equal to number of features ({}), got {}'.format(len(self.categories_), len(input_features)))\n feature_names = []\n for i in range(len(cats)):\n names = [input_features[i] + '_' + str(t) for t in cats[i]]\n if self.drop_idx_ is not None and self.drop_idx_[i] is not None:\n names.pop(self.drop_idx_[i])\n feature_names.extend(names)\n return np.array(feature_names, dtype=object)\n \n def get_feature_names_out(self, input_features=None):\n \"\"\"Get output feature names for transformation.\n\n Parameters\n ----------\n input_features : array-like of str or None, default=None\n Input features.\n\n - If `input_features` is `None`, then `feature_names_in_` is\n used as feature names in. 
If `feature_names_in_` is not defined,\n then names are generated: `[x0, x1, ..., x(n_features_in_)]`.\n - If `input_features` is an array-like, then `input_features` must\n match `feature_names_in_` if `feature_names_in_` is defined.\n\n Returns\n -------\n feature_names_out : ndarray of str objects\n Transformed feature names.\n \"\"\"\n check_is_fitted(self)\n cats = self.categories_\n input_features = _check_feature_names_in(self, input_features)\n feature_names = []\n for i in range(len(cats)):\n names = [input_features[i] + '_' + str(t) for t in cats[i]]\n if self.drop_idx_ is not None and self.drop_idx_[i] is not None:\n names.pop(self.drop_idx_[i])\n feature_names.extend(names)\n return np.asarray(feature_names, dtype=object)\n" + "description": "Encode categorical features as a one-hot numeric array.\n\nThe input to this transformer should be an array-like of integers or\nstrings, denoting the values taken on by categorical (discrete) features.\nThe features are encoded using a one-hot (aka 'one-of-K' or 'dummy')\nencoding scheme. This creates a binary column for each category and\nreturns a sparse matrix or dense array (depending on the ``sparse``\nparameter)\n\nBy default, the encoder derives the categories based on the unique values\nin each feature. Alternatively, you can also specify the `categories`\nmanually.\n\nThis encoding is needed for feeding categorical data to many scikit-learn\nestimators, notably linear models and SVMs with the standard kernels.\n\nNote: a one-hot encoding of y labels should use a LabelBinarizer\ninstead.\n\nRead more in the :ref:`User Guide `.", + "docstring": "\n Encode categorical features as a one-hot numeric array.\n\n The input to this transformer should be an array-like of integers or\n strings, denoting the values taken on by categorical (discrete) features.\n The features are encoded using a one-hot (aka 'one-of-K' or 'dummy')\n encoding scheme. This creates a binary column for each category and\n returns a sparse matrix or dense array (depending on the ``sparse``\n parameter)\n\n By default, the encoder derives the categories based on the unique values\n in each feature. Alternatively, you can also specify the `categories`\n manually.\n\n This encoding is needed for feeding categorical data to many scikit-learn\n estimators, notably linear models and SVMs with the standard kernels.\n\n Note: a one-hot encoding of y labels should use a LabelBinarizer\n instead.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n categories : 'auto' or a list of array-like, default='auto'\n Categories (unique values) per feature:\n\n - 'auto' : Determine categories automatically from the training data.\n - list : ``categories[i]`` holds the categories expected in the ith\n column. The passed categories should not mix strings and numeric\n values within a single feature, and should be sorted in case of\n numeric values.\n\n The used categories can be found in the ``categories_`` attribute.\n\n .. versionadded:: 0.20\n\n drop : {'first', 'if_binary'} or an array-like of shape (n_features,), default=None\n Specifies a methodology to use to drop one of the categories per\n feature. 
This is useful in situations where perfectly collinear\n features cause problems, such as when feeding the resulting data\n into a neural network or an unregularized regression.\n\n However, dropping one category breaks the symmetry of the original\n representation and can therefore induce a bias in downstream models,\n for instance for penalized linear classification or regression models.\n\n - None : retain all features (the default).\n - 'first' : drop the first category in each feature. If only one\n category is present, the feature will be dropped entirely.\n - 'if_binary' : drop the first category in each feature with two\n categories. Features with 1 or more than 2 categories are\n left intact.\n - array : ``drop[i]`` is the category in feature ``X[:, i]`` that\n should be dropped.\n\n .. versionadded:: 0.21\n The parameter `drop` was added in 0.21.\n\n .. versionchanged:: 0.23\n The option `drop='if_binary'` was added in 0.23.\n\n sparse : bool, default=True\n Will return sparse matrix if set True else will return an array.\n\n dtype : number type, default=float\n Desired dtype of output.\n\n handle_unknown : {'error', 'ignore'}, default='error'\n Whether to raise an error or ignore if an unknown categorical feature\n is present during transform (default is to raise). When this parameter\n is set to 'ignore' and an unknown category is encountered during\n transform, the resulting one-hot encoded columns for this feature\n will be all zeros. In the inverse transform, an unknown category\n will be denoted as None.\n\n Attributes\n ----------\n categories_ : list of arrays\n The categories of each feature determined during fitting\n (in order of the features in X and corresponding with the output\n of ``transform``). This includes the category specified in ``drop``\n (if any).\n\n drop_idx_ : array of shape (n_features,)\n - ``drop_idx_[i]`` is\u00a0the index in ``categories_[i]`` of the category\n to be dropped for each feature.\n - ``drop_idx_[i] = None`` if no category is to be dropped from the\n feature with index ``i``, e.g. when `drop='if_binary'` and the\n feature isn't binary.\n - ``drop_idx_ = None`` if all the transformed features will be\n retained.\n\n .. versionchanged:: 0.23\n Added the possibility to contain `None` values.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 1.0\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n OrdinalEncoder : Performs an ordinal (integer)\n encoding of the categorical features.\n sklearn.feature_extraction.DictVectorizer : Performs a one-hot encoding of\n dictionary items (also handles string-valued features).\n sklearn.feature_extraction.FeatureHasher : Performs an approximate one-hot\n encoding of dictionary items or strings.\n LabelBinarizer : Binarizes labels in a one-vs-all\n fashion.\n MultiLabelBinarizer : Transforms between iterable of\n iterables and a multilabel format, e.g. 
a (samples x classes) binary\n matrix indicating the presence of a class label.\n\n Examples\n --------\n Given a dataset with two features, we let the encoder find the unique\n values per feature and transform the data to a binary one-hot encoding.\n\n >>> from sklearn.preprocessing import OneHotEncoder\n\n One can discard categories not seen during `fit`:\n\n >>> enc = OneHotEncoder(handle_unknown='ignore')\n >>> X = [['Male', 1], ['Female', 3], ['Female', 2]]\n >>> enc.fit(X)\n OneHotEncoder(handle_unknown='ignore')\n >>> enc.categories_\n [array(['Female', 'Male'], dtype=object), array([1, 2, 3], dtype=object)]\n >>> enc.transform([['Female', 1], ['Male', 4]]).toarray()\n array([[1., 0., 1., 0., 0.],\n [0., 1., 0., 0., 0.]])\n >>> enc.inverse_transform([[0, 1, 1, 0, 0], [0, 0, 0, 1, 0]])\n array([['Male', 1],\n [None, 2]], dtype=object)\n >>> enc.get_feature_names_out(['gender', 'group'])\n array(['gender_Female', 'gender_Male', 'group_1', 'group_2', 'group_3'], ...)\n\n One can always drop the first column for each feature:\n\n >>> drop_enc = OneHotEncoder(drop='first').fit(X)\n >>> drop_enc.categories_\n [array(['Female', 'Male'], dtype=object), array([1, 2, 3], dtype=object)]\n >>> drop_enc.transform([['Female', 1], ['Male', 2]]).toarray()\n array([[0., 0., 0.],\n [1., 1., 0.]])\n\n Or drop a column for feature only having 2 categories:\n\n >>> drop_binary_enc = OneHotEncoder(drop='if_binary').fit(X)\n >>> drop_binary_enc.transform([['Female', 1], ['Male', 2]]).toarray()\n array([[0., 1., 0., 0.],\n [1., 0., 1., 0.]])\n ", + "source_code": "\n\nclass OneHotEncoder(_BaseEncoder):\n \"\"\"\n Encode categorical features as a one-hot numeric array.\n\n The input to this transformer should be an array-like of integers or\n strings, denoting the values taken on by categorical (discrete) features.\n The features are encoded using a one-hot (aka 'one-of-K' or 'dummy')\n encoding scheme. This creates a binary column for each category and\n returns a sparse matrix or dense array (depending on the ``sparse``\n parameter)\n\n By default, the encoder derives the categories based on the unique values\n in each feature. Alternatively, you can also specify the `categories`\n manually.\n\n This encoding is needed for feeding categorical data to many scikit-learn\n estimators, notably linear models and SVMs with the standard kernels.\n\n Note: a one-hot encoding of y labels should use a LabelBinarizer\n instead.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n categories : 'auto' or a list of array-like, default='auto'\n Categories (unique values) per feature:\n\n - 'auto' : Determine categories automatically from the training data.\n - list : ``categories[i]`` holds the categories expected in the ith\n column. The passed categories should not mix strings and numeric\n values within a single feature, and should be sorted in case of\n numeric values.\n\n The used categories can be found in the ``categories_`` attribute.\n\n .. versionadded:: 0.20\n\n drop : {'first', 'if_binary'} or an array-like of shape (n_features,), default=None\n Specifies a methodology to use to drop one of the categories per\n feature. 
This is useful in situations where perfectly collinear\n features cause problems, such as when feeding the resulting data\n into a neural network or an unregularized regression.\n\n However, dropping one category breaks the symmetry of the original\n representation and can therefore induce a bias in downstream models,\n for instance for penalized linear classification or regression models.\n\n - None : retain all features (the default).\n - 'first' : drop the first category in each feature. If only one\n category is present, the feature will be dropped entirely.\n - 'if_binary' : drop the first category in each feature with two\n categories. Features with 1 or more than 2 categories are\n left intact.\n - array : ``drop[i]`` is the category in feature ``X[:, i]`` that\n should be dropped.\n\n .. versionadded:: 0.21\n The parameter `drop` was added in 0.21.\n\n .. versionchanged:: 0.23\n The option `drop='if_binary'` was added in 0.23.\n\n sparse : bool, default=True\n Will return sparse matrix if set True else will return an array.\n\n dtype : number type, default=float\n Desired dtype of output.\n\n handle_unknown : {'error', 'ignore'}, default='error'\n Whether to raise an error or ignore if an unknown categorical feature\n is present during transform (default is to raise). When this parameter\n is set to 'ignore' and an unknown category is encountered during\n transform, the resulting one-hot encoded columns for this feature\n will be all zeros. In the inverse transform, an unknown category\n will be denoted as None.\n\n Attributes\n ----------\n categories_ : list of arrays\n The categories of each feature determined during fitting\n (in order of the features in X and corresponding with the output\n of ``transform``). This includes the category specified in ``drop``\n (if any).\n\n drop_idx_ : array of shape (n_features,)\n - ``drop_idx_[i]`` is\u00a0the index in ``categories_[i]`` of the category\n to be dropped for each feature.\n - ``drop_idx_[i] = None`` if no category is to be dropped from the\n feature with index ``i``, e.g. when `drop='if_binary'` and the\n feature isn't binary.\n - ``drop_idx_ = None`` if all the transformed features will be\n retained.\n\n .. versionchanged:: 0.23\n Added the possibility to contain `None` values.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 1.0\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n OrdinalEncoder : Performs an ordinal (integer)\n encoding of the categorical features.\n sklearn.feature_extraction.DictVectorizer : Performs a one-hot encoding of\n dictionary items (also handles string-valued features).\n sklearn.feature_extraction.FeatureHasher : Performs an approximate one-hot\n encoding of dictionary items or strings.\n LabelBinarizer : Binarizes labels in a one-vs-all\n fashion.\n MultiLabelBinarizer : Transforms between iterable of\n iterables and a multilabel format, e.g. 
a (samples x classes) binary\n matrix indicating the presence of a class label.\n\n Examples\n --------\n Given a dataset with two features, we let the encoder find the unique\n values per feature and transform the data to a binary one-hot encoding.\n\n >>> from sklearn.preprocessing import OneHotEncoder\n\n One can discard categories not seen during `fit`:\n\n >>> enc = OneHotEncoder(handle_unknown='ignore')\n >>> X = [['Male', 1], ['Female', 3], ['Female', 2]]\n >>> enc.fit(X)\n OneHotEncoder(handle_unknown='ignore')\n >>> enc.categories_\n [array(['Female', 'Male'], dtype=object), array([1, 2, 3], dtype=object)]\n >>> enc.transform([['Female', 1], ['Male', 4]]).toarray()\n array([[1., 0., 1., 0., 0.],\n [0., 1., 0., 0., 0.]])\n >>> enc.inverse_transform([[0, 1, 1, 0, 0], [0, 0, 0, 1, 0]])\n array([['Male', 1],\n [None, 2]], dtype=object)\n >>> enc.get_feature_names_out(['gender', 'group'])\n array(['gender_Female', 'gender_Male', 'group_1', 'group_2', 'group_3'], ...)\n\n One can always drop the first column for each feature:\n\n >>> drop_enc = OneHotEncoder(drop='first').fit(X)\n >>> drop_enc.categories_\n [array(['Female', 'Male'], dtype=object), array([1, 2, 3], dtype=object)]\n >>> drop_enc.transform([['Female', 1], ['Male', 2]]).toarray()\n array([[0., 0., 0.],\n [1., 1., 0.]])\n\n Or drop a column for feature only having 2 categories:\n\n >>> drop_binary_enc = OneHotEncoder(drop='if_binary').fit(X)\n >>> drop_binary_enc.transform([['Female', 1], ['Male', 2]]).toarray()\n array([[0., 1., 0., 0.],\n [1., 0., 1., 0.]])\n \"\"\"\n \n def __init__(self, *, categories='auto', drop=None, sparse=True, dtype=np.float64, handle_unknown='error'):\n self.categories = categories\n self.sparse = sparse\n self.dtype = dtype\n self.handle_unknown = handle_unknown\n self.drop = drop\n \n def _validate_keywords(self):\n if self.handle_unknown not in ('error', 'ignore'):\n msg = \"handle_unknown should be either 'error' or 'ignore', got {0}.\".format(self.handle_unknown)\n raise ValueError(msg)\n \n def _compute_drop_idx(self):\n if self.drop is None:\n return None\n elif isinstance(self.drop, str):\n if self.drop == 'first':\n return np.zeros(len(self.categories_), dtype=object)\n elif self.drop == 'if_binary':\n return np.array([0 if len(cats) == 2 else None for cats in self.categories_], dtype=object)\n else:\n msg = \"Wrong input for parameter `drop`. Expected 'first', 'if_binary', None or array of objects, got {}\"\n raise ValueError(msg.format(type(self.drop)))\n else:\n try:\n drop_array = np.asarray(self.drop, dtype=object)\n droplen = len(drop_array)\n except (ValueError, TypeError):\n msg = \"Wrong input for parameter `drop`. 
Expected 'first', 'if_binary', None or array of objects, got {}\"\n raise ValueError(msg.format(type(drop_array)))\n if droplen != len(self.categories_):\n msg = '`drop` should have length equal to the number of features ({}), got {}'\n raise ValueError(msg.format(len(self.categories_), droplen))\n missing_drops = []\n drop_indices = []\n for (col_idx, (val, cat_list)) in enumerate(zip(drop_array, self.categories_)):\n if not is_scalar_nan(val):\n drop_idx = np.where(cat_list == val)[0]\n if drop_idx.size:\n drop_indices.append(drop_idx[0])\n else:\n missing_drops.append((col_idx, val))\n continue\n for (cat_idx, cat) in enumerate(cat_list):\n if is_scalar_nan(cat):\n drop_indices.append(cat_idx)\n break\n else:\n missing_drops.append((col_idx, val))\n if any(missing_drops):\n msg = 'The following categories were supposed to be dropped, but were not found in the training data.\\n{}'.format('\\n'.join(['Category: {}, Feature: {}'.format(c, v) for (c, v) in missing_drops]))\n raise ValueError(msg)\n return np.array(drop_indices, dtype=object)\n \n def fit(self, X, y=None):\n \"\"\"\n Fit OneHotEncoder to X.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The data to determine the categories of each feature.\n\n y : None\n Ignored. This parameter exists only for compatibility with\n :class:`~sklearn.pipeline.Pipeline`.\n\n Returns\n -------\n self\n Fitted encoder.\n \"\"\"\n self._validate_keywords()\n self._fit(X, handle_unknown=self.handle_unknown, force_all_finite='allow-nan')\n self.drop_idx_ = self._compute_drop_idx()\n return self\n \n def fit_transform(self, X, y=None):\n \"\"\"\n Fit OneHotEncoder to X, then transform X.\n\n Equivalent to fit(X).transform(X) but more convenient.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The data to encode.\n\n y : None\n Ignored. This parameter exists only for compatibility with\n :class:`~sklearn.pipeline.Pipeline`.\n\n Returns\n -------\n X_out : {ndarray, sparse matrix} of shape (n_samples, n_encoded_features)\n Transformed input. If `sparse=True`, a sparse matrix will be\n returned.\n \"\"\"\n self._validate_keywords()\n return super().fit_transform(X, y)\n \n def transform(self, X):\n \"\"\"\n Transform X using one-hot encoding.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The data to encode.\n\n Returns\n -------\n X_out : {ndarray, sparse matrix} of shape (n_samples, n_encoded_features)\n Transformed input. 
If `sparse=True`, a sparse matrix will be\n returned.\n \"\"\"\n check_is_fitted(self)\n warn_on_unknown = self.handle_unknown == 'ignore' and self.drop is not None\n (X_int, X_mask) = self._transform(X, handle_unknown=self.handle_unknown, force_all_finite='allow-nan', warn_on_unknown=warn_on_unknown)\n (n_samples, n_features) = X_int.shape\n if self.drop_idx_ is not None:\n to_drop = self.drop_idx_.copy()\n keep_cells = X_int != to_drop\n n_values = []\n for (i, cats) in enumerate(self.categories_):\n n_cats = len(cats)\n if to_drop[i] is None:\n to_drop[i] = n_cats\n n_values.append(n_cats)\n else:\n n_values.append(n_cats - 1)\n to_drop = to_drop.reshape(1, -1)\n X_int[X_int > to_drop] -= 1\n X_mask &= keep_cells\n else:\n n_values = [len(cats) for cats in self.categories_]\n mask = X_mask.ravel()\n feature_indices = np.cumsum([0] + n_values)\n indices = (X_int + feature_indices[:-1]).ravel()[mask]\n indptr = np.empty(n_samples + 1, dtype=int)\n indptr[0] = 0\n np.sum(X_mask, axis=1, out=indptr[1:], dtype=indptr.dtype)\n np.cumsum(indptr[1:], out=indptr[1:])\n data = np.ones(indptr[-1])\n out = sparse.csr_matrix((data, indices, indptr), shape=(n_samples, feature_indices[-1]), dtype=self.dtype)\n if not self.sparse:\n return out.toarray()\n else:\n return out\n \n def inverse_transform(self, X):\n \"\"\"\n Convert the data back to the original representation.\n\n When unknown categories are encountered (all zeros in the\n one-hot encoding), ``None`` is used to represent this category. If the\n feature with the unknown category has a dropped category, the dropped\n category will be its inverse.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_encoded_features)\n The transformed data.\n\n Returns\n -------\n X_tr : ndarray of shape (n_samples, n_features)\n Inverse transformed array.\n \"\"\"\n check_is_fitted(self)\n X = check_array(X, accept_sparse='csr')\n (n_samples, _) = X.shape\n n_features = len(self.categories_)\n if self.drop_idx_ is None:\n n_transformed_features = sum((len(cats) for cats in self.categories_))\n else:\n n_transformed_features = sum((len(cats) - 1 if to_drop is not None else len(cats) for (cats, to_drop) in zip(self.categories_, self.drop_idx_)))\n msg = 'Shape of the passed X data is not correct. 
Expected {0} columns, got {1}.'\n if X.shape[1] != n_transformed_features:\n raise ValueError(msg.format(n_transformed_features, X.shape[1]))\n dt = np.find_common_type([cat.dtype for cat in self.categories_], [])\n X_tr = np.empty((n_samples, n_features), dtype=dt)\n j = 0\n found_unknown = {}\n for i in range(n_features):\n if self.drop_idx_ is None or self.drop_idx_[i] is None:\n cats = self.categories_[i]\n else:\n cats = np.delete(self.categories_[i], self.drop_idx_[i])\n n_categories = len(cats)\n if n_categories == 0:\n X_tr[:, i] = self.categories_[i][self.drop_idx_[i]]\n j += n_categories\n continue\n sub = X[:, j:j + n_categories]\n labels = np.asarray(sub.argmax(axis=1)).flatten()\n X_tr[:, i] = cats[labels]\n if self.handle_unknown == 'ignore':\n unknown = np.asarray(sub.sum(axis=1) == 0).flatten()\n if unknown.any():\n if self.drop_idx_ is None or self.drop_idx_[i] is None:\n found_unknown[i] = unknown\n else:\n X_tr[unknown, i] = self.categories_[i][self.drop_idx_[i]]\n else:\n dropped = np.asarray(sub.sum(axis=1) == 0).flatten()\n if dropped.any():\n if self.drop_idx_ is None:\n all_zero_samples = np.flatnonzero(dropped)\n raise ValueError(f\"Samples {all_zero_samples} can not be inverted when drop=None and handle_unknown='error' because they contain all zeros\")\n X_tr[dropped, i] = self.categories_[i][self.drop_idx_[i]]\n j += n_categories\n if found_unknown:\n if X_tr.dtype != object:\n X_tr = X_tr.astype(object)\n for (idx, mask) in found_unknown.items():\n X_tr[mask, idx] = None\n return X_tr\n \n @deprecated('get_feature_names is deprecated in 1.0 and will be removed in 1.2. Please use get_feature_names_out instead.')\n def get_feature_names(self, input_features=None):\n \"\"\"Return feature names for output features.\n\n Parameters\n ----------\n input_features : list of str of shape (n_features,)\n String names for input features if available. By default,\n \"x0\", \"x1\", ... \"xn_features\" is used.\n\n Returns\n -------\n output_feature_names : ndarray of shape (n_output_features,)\n Array of feature names.\n \"\"\"\n check_is_fitted(self)\n cats = self.categories_\n if input_features is None:\n input_features = ['x%d' % i for i in range(len(cats))]\n elif len(input_features) != len(self.categories_):\n raise ValueError('input_features should have length equal to number of features ({}), got {}'.format(len(self.categories_), len(input_features)))\n feature_names = []\n for i in range(len(cats)):\n names = [input_features[i] + '_' + str(t) for t in cats[i]]\n if self.drop_idx_ is not None and self.drop_idx_[i] is not None:\n names.pop(self.drop_idx_[i])\n feature_names.extend(names)\n return np.array(feature_names, dtype=object)\n \n def get_feature_names_out(self, input_features=None):\n \"\"\"Get output feature names for transformation.\n\n Parameters\n ----------\n input_features : array-like of str or None, default=None\n Input features.\n\n - If `input_features` is `None`, then `feature_names_in_` is\n used as feature names in. 
If `feature_names_in_` is not defined,\n then names are generated: `[x0, x1, ..., x(n_features_in_)]`.\n - If `input_features` is an array-like, then `input_features` must\n match `feature_names_in_` if `feature_names_in_` is defined.\n\n Returns\n -------\n feature_names_out : ndarray of str objects\n Transformed feature names.\n \"\"\"\n check_is_fitted(self)\n cats = self.categories_\n input_features = _check_feature_names_in(self, input_features)\n feature_names = []\n for i in range(len(cats)):\n names = [input_features[i] + '_' + str(t) for t in cats[i]]\n if self.drop_idx_ is not None and self.drop_idx_[i] is not None:\n names.pop(self.drop_idx_[i])\n feature_names.extend(names)\n return np.asarray(feature_names, dtype=object)\n" }, { "name": "OrdinalEncoder", @@ -26045,7 +26141,7 @@ "sklearn.preprocessing._encoders.OrdinalEncoder.inverse_transform" ], "is_public": true, - "description": "Encode categorical features as an integer array.\n\nThe input to this transformer should be an array-like of integers or strings, denoting the values taken on by categorical (discrete) features. The features are converted to ordinal integers. This results in a single column of integers (0 to n_categories - 1) per feature. Read more in the :ref:`User Guide `. .. versionadded:: 0.20", + "description": "Encode categorical features as an integer array.\n\nThe input to this transformer should be an array-like of integers or\nstrings, denoting the values taken on by categorical (discrete) features.\nThe features are converted to ordinal integers. This results in\na single column of integers (0 to n_categories - 1) per feature.\n\nRead more in the :ref:`User Guide `.\n\n.. versionadded:: 0.20", "docstring": "\n Encode categorical features as an integer array.\n\n The input to this transformer should be an array-like of integers or\n strings, denoting the values taken on by categorical (discrete) features.\n The features are converted to ordinal integers. This results in\n a single column of integers (0 to n_categories - 1) per feature.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.20\n\n Parameters\n ----------\n categories : 'auto' or a list of array-like, default='auto'\n Categories (unique values) per feature:\n\n - 'auto' : Determine categories automatically from the training data.\n - list : ``categories[i]`` holds the categories expected in the ith\n column. The passed categories should not mix strings and numeric\n values, and should be sorted in case of numeric values.\n\n The used categories can be found in the ``categories_`` attribute.\n\n dtype : number type, default np.float64\n Desired dtype of output.\n\n handle_unknown : {'error', 'use_encoded_value'}, default='error'\n When set to 'error' an error will be raised in case an unknown\n categorical feature is present during transform. When set to\n 'use_encoded_value', the encoded value of unknown categories will be\n set to the value given for the parameter `unknown_value`. In\n :meth:`inverse_transform`, an unknown category will be denoted as None.\n\n .. versionadded:: 0.24\n\n unknown_value : int or np.nan, default=None\n When the parameter handle_unknown is set to 'use_encoded_value', this\n parameter is required and will set the encoded value of unknown\n categories. It has to be distinct from the values used to encode any of\n the categories in `fit`. If set to np.nan, the `dtype` parameter must\n be a float dtype.\n\n .. 
versionadded:: 0.24\n\n Attributes\n ----------\n categories_ : list of arrays\n The categories of each feature determined during ``fit`` (in order of\n the features in X and corresponding with the output of ``transform``).\n This does not include categories that weren't seen during ``fit``.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 1.0\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n OneHotEncoder : Performs a one-hot encoding of categorical features.\n LabelEncoder : Encodes target labels with values between 0 and\n ``n_classes-1``.\n\n Examples\n --------\n Given a dataset with two features, we let the encoder find the unique\n values per feature and transform the data to an ordinal encoding.\n\n >>> from sklearn.preprocessing import OrdinalEncoder\n >>> enc = OrdinalEncoder()\n >>> X = [['Male', 1], ['Female', 3], ['Female', 2]]\n >>> enc.fit(X)\n OrdinalEncoder()\n >>> enc.categories_\n [array(['Female', 'Male'], dtype=object), array([1, 2, 3], dtype=object)]\n >>> enc.transform([['Female', 3], ['Male', 1]])\n array([[0., 2.],\n [1., 0.]])\n\n >>> enc.inverse_transform([[1, 0], [0, 1]])\n array([['Male', 1],\n ['Female', 2]], dtype=object)\n ", "source_code": "\n\nclass OrdinalEncoder(_BaseEncoder):\n \"\"\"\n Encode categorical features as an integer array.\n\n The input to this transformer should be an array-like of integers or\n strings, denoting the values taken on by categorical (discrete) features.\n The features are converted to ordinal integers. This results in\n a single column of integers (0 to n_categories - 1) per feature.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.20\n\n Parameters\n ----------\n categories : 'auto' or a list of array-like, default='auto'\n Categories (unique values) per feature:\n\n - 'auto' : Determine categories automatically from the training data.\n - list : ``categories[i]`` holds the categories expected in the ith\n column. The passed categories should not mix strings and numeric\n values, and should be sorted in case of numeric values.\n\n The used categories can be found in the ``categories_`` attribute.\n\n dtype : number type, default np.float64\n Desired dtype of output.\n\n handle_unknown : {'error', 'use_encoded_value'}, default='error'\n When set to 'error' an error will be raised in case an unknown\n categorical feature is present during transform. When set to\n 'use_encoded_value', the encoded value of unknown categories will be\n set to the value given for the parameter `unknown_value`. In\n :meth:`inverse_transform`, an unknown category will be denoted as None.\n\n .. versionadded:: 0.24\n\n unknown_value : int or np.nan, default=None\n When the parameter handle_unknown is set to 'use_encoded_value', this\n parameter is required and will set the encoded value of unknown\n categories. It has to be distinct from the values used to encode any of\n the categories in `fit`. If set to np.nan, the `dtype` parameter must\n be a float dtype.\n\n .. 
versionadded:: 0.24\n\n Attributes\n ----------\n categories_ : list of arrays\n The categories of each feature determined during ``fit`` (in order of\n the features in X and corresponding with the output of ``transform``).\n This does not include categories that weren't seen during ``fit``.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 1.0\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n OneHotEncoder : Performs a one-hot encoding of categorical features.\n LabelEncoder : Encodes target labels with values between 0 and\n ``n_classes-1``.\n\n Examples\n --------\n Given a dataset with two features, we let the encoder find the unique\n values per feature and transform the data to an ordinal encoding.\n\n >>> from sklearn.preprocessing import OrdinalEncoder\n >>> enc = OrdinalEncoder()\n >>> X = [['Male', 1], ['Female', 3], ['Female', 2]]\n >>> enc.fit(X)\n OrdinalEncoder()\n >>> enc.categories_\n [array(['Female', 'Male'], dtype=object), array([1, 2, 3], dtype=object)]\n >>> enc.transform([['Female', 3], ['Male', 1]])\n array([[0., 2.],\n [1., 0.]])\n\n >>> enc.inverse_transform([[1, 0], [0, 1]])\n array([['Male', 1],\n ['Female', 2]], dtype=object)\n \"\"\"\n \n def __init__(self, *, categories='auto', dtype=np.float64, handle_unknown='error', unknown_value=None):\n self.categories = categories\n self.dtype = dtype\n self.handle_unknown = handle_unknown\n self.unknown_value = unknown_value\n \n def fit(self, X, y=None):\n \"\"\"\n Fit the OrdinalEncoder to X.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The data to determine the categories of each feature.\n\n y : None\n Ignored. This parameter exists only for compatibility with\n :class:`~sklearn.pipeline.Pipeline`.\n\n Returns\n -------\n self : object\n Fitted encoder.\n \"\"\"\n handle_unknown_strategies = ('error', 'use_encoded_value')\n if self.handle_unknown not in handle_unknown_strategies:\n raise ValueError(f\"handle_unknown should be either 'error' or 'use_encoded_value', got {self.handle_unknown}.\")\n if self.handle_unknown == 'use_encoded_value':\n if is_scalar_nan(self.unknown_value):\n if np.dtype(self.dtype).kind != 'f':\n raise ValueError(f'When unknown_value is np.nan, the dtype parameter should be a float dtype. 
Got {self.dtype}.')\n elif not isinstance(self.unknown_value, numbers.Integral):\n raise TypeError(f\"unknown_value should be an integer or np.nan when handle_unknown is 'use_encoded_value', got {self.unknown_value}.\")\n elif self.unknown_value is not None:\n raise TypeError(f\"unknown_value should only be set when handle_unknown is 'use_encoded_value', got {self.unknown_value}.\")\n self._fit(X, handle_unknown=self.handle_unknown, force_all_finite='allow-nan')\n if self.handle_unknown == 'use_encoded_value':\n for feature_cats in self.categories_:\n if 0 <= self.unknown_value < len(feature_cats):\n raise ValueError(f'The used value for unknown_value {self.unknown_value} is one of the values already used for encoding the seen categories.')\n self._missing_indices = {}\n for (cat_idx, categories_for_idx) in enumerate(self.categories_):\n for (i, cat) in enumerate(categories_for_idx):\n if is_scalar_nan(cat):\n self._missing_indices[cat_idx] = i\n continue\n if np.dtype(self.dtype).kind != 'f' and self._missing_indices:\n raise ValueError(f'There are missing values in features {list(self._missing_indices)}. For OrdinalEncoder to passthrough missing values, the dtype parameter must be a float')\n return self\n \n def transform(self, X):\n \"\"\"\n Transform X to ordinal codes.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The data to encode.\n\n Returns\n -------\n X_out : ndarray of shape (n_samples, n_features)\n Transformed input.\n \"\"\"\n (X_int, X_mask) = self._transform(X, handle_unknown=self.handle_unknown, force_all_finite='allow-nan')\n X_trans = X_int.astype(self.dtype, copy=False)\n for (cat_idx, missing_idx) in self._missing_indices.items():\n X_missing_mask = X_int[:, cat_idx] == missing_idx\n X_trans[X_missing_mask, cat_idx] = np.nan\n if self.handle_unknown == 'use_encoded_value':\n X_trans[~X_mask] = self.unknown_value\n return X_trans\n \n def inverse_transform(self, X):\n \"\"\"\n Convert the data back to the original representation.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_encoded_features)\n The transformed data.\n\n Returns\n -------\n X_tr : ndarray of shape (n_samples, n_features)\n Inverse transformed array.\n \"\"\"\n check_is_fitted(self)\n X = check_array(X, force_all_finite='allow-nan')\n (n_samples, _) = X.shape\n n_features = len(self.categories_)\n msg = 'Shape of the passed X data is not correct. 
Expected {0} columns, got {1}.'\n if X.shape[1] != n_features:\n raise ValueError(msg.format(n_features, X.shape[1]))\n dt = np.find_common_type([cat.dtype for cat in self.categories_], [])\n X_tr = np.empty((n_samples, n_features), dtype=dt)\n found_unknown = {}\n for i in range(n_features):\n labels = X[:, i].astype('int64', copy=False)\n if i in self._missing_indices:\n X_i_mask = _get_mask(X[:, i], np.nan)\n labels[X_i_mask] = self._missing_indices[i]\n if self.handle_unknown == 'use_encoded_value':\n unknown_labels = labels == self.unknown_value\n X_tr[:, i] = self.categories_[i][np.where(unknown_labels, 0, labels)]\n found_unknown[i] = unknown_labels\n else:\n X_tr[:, i] = self.categories_[i][labels]\n if found_unknown:\n X_tr = X_tr.astype(object, copy=False)\n for (idx, mask) in found_unknown.items():\n X_tr[mask, idx] = None\n return X_tr\n" }, @@ -26062,9 +26158,9 @@ "sklearn.preprocessing._encoders._BaseEncoder._more_tags" ], "is_public": false, - "description": "Base class for encoders that includes the code to categorize and transform the input features.", + "description": "Base class for encoders that includes the code to categorize and\ntransform the input features.", "docstring": "\n Base class for encoders that includes the code to categorize and\n transform the input features.\n\n ", - "source_code": "\n\nclass _BaseEncoder(TransformerMixin, BaseEstimator):\n \"\"\"\n Base class for encoders that includes the code to categorize and\n transform the input features.\n\n \"\"\"\n \n def _check_X(self, X, force_all_finite=True):\n \"\"\"\n Perform custom check_array:\n - convert list of strings to object dtype\n - check for missing values for object dtype data (check_array does\n not do that)\n - return list of features (arrays): this list of features is\n constructed feature by feature to preserve the data types\n of pandas DataFrame columns, as otherwise information is lost\n and cannot be used, eg for the `categories_` attribute.\n\n \"\"\"\n if not (hasattr(X, 'iloc') and getattr(X, 'ndim', 0) == 2):\n X_temp = check_array(X, dtype=None, force_all_finite=force_all_finite)\n if not hasattr(X, 'dtype') and np.issubdtype(X_temp.dtype, np.str_):\n X = check_array(X, dtype=object, force_all_finite=force_all_finite)\n else:\n X = X_temp\n needs_validation = False\n else:\n needs_validation = force_all_finite\n (n_samples, n_features) = X.shape\n X_columns = []\n for i in range(n_features):\n Xi = self._get_feature(X, feature_idx=i)\n Xi = check_array(Xi, ensure_2d=False, dtype=None, force_all_finite=needs_validation)\n X_columns.append(Xi)\n return X_columns, n_samples, n_features\n \n def _get_feature(self, X, feature_idx):\n if hasattr(X, 'iloc'):\n return X.iloc[:, feature_idx]\n return X[:, feature_idx]\n \n def _fit(self, X, handle_unknown='error', force_all_finite=True):\n self._check_n_features(X, reset=True)\n self._check_feature_names(X, reset=True)\n (X_list, n_samples, n_features) = self._check_X(X, force_all_finite=force_all_finite)\n self.n_features_in_ = n_features\n if self.categories != 'auto':\n if len(self.categories) != n_features:\n raise ValueError('Shape mismatch: if categories is an array, it has to be of shape (n_features,).')\n self.categories_ = []\n for i in range(n_features):\n Xi = X_list[i]\n if self.categories == 'auto':\n cats = _unique(Xi)\n else:\n cats = np.array(self.categories[i], dtype=Xi.dtype)\n if Xi.dtype.kind not in 'OUS':\n sorted_cats = np.sort(cats)\n error_msg = 'Unsorted categories are not supported for numerical categories'\n 
stop_idx = -1 if np.isnan(sorted_cats[-1]) else None\n if np.any(sorted_cats[:stop_idx] != cats[:stop_idx]) or np.isnan(sorted_cats[-1]) and not np.isnan(sorted_cats[-1]):\n raise ValueError(error_msg)\n if handle_unknown == 'error':\n diff = _check_unknown(Xi, cats)\n if diff:\n msg = 'Found unknown categories {0} in column {1} during fit'.format(diff, i)\n raise ValueError(msg)\n self.categories_.append(cats)\n \n def _transform(self, X, handle_unknown='error', force_all_finite=True, warn_on_unknown=False):\n self._check_feature_names(X, reset=False)\n self._check_n_features(X, reset=False)\n (X_list, n_samples, n_features) = self._check_X(X, force_all_finite=force_all_finite)\n X_int = np.zeros((n_samples, n_features), dtype=int)\n X_mask = np.ones((n_samples, n_features), dtype=bool)\n columns_with_unknown = []\n for i in range(n_features):\n Xi = X_list[i]\n (diff, valid_mask) = _check_unknown(Xi, self.categories_[i], return_mask=True)\n if not np.all(valid_mask):\n if handle_unknown == 'error':\n msg = 'Found unknown categories {0} in column {1} during transform'.format(diff, i)\n raise ValueError(msg)\n else:\n if warn_on_unknown:\n columns_with_unknown.append(i)\n X_mask[:, i] = valid_mask\n if self.categories_[i].dtype.kind in ('U', 'S') and self.categories_[i].itemsize > Xi.itemsize:\n Xi = Xi.astype(self.categories_[i].dtype)\n elif self.categories_[i].dtype.kind == 'O' and Xi.dtype.kind == 'U':\n Xi = Xi.astype('O')\n else:\n Xi = Xi.copy()\n Xi[~valid_mask] = self.categories_[i][0]\n X_int[:, i] = _encode(Xi, uniques=self.categories_[i], check_unknown=False)\n if columns_with_unknown:\n warnings.warn(f'Found unknown categories in columns {columns_with_unknown} during transform. These unknown categories will be encoded as all zeros', UserWarning)\n return X_int, X_mask\n \n def _more_tags(self):\n return {'X_types': ['categorical']}\n" + "source_code": "\n\nclass _BaseEncoder(TransformerMixin, BaseEstimator):\n \"\"\"\n Base class for encoders that includes the code to categorize and\n transform the input features.\n\n \"\"\"\n \n def _check_X(self, X, force_all_finite=True):\n \"\"\"\n Perform custom check_array:\n - convert list of strings to object dtype\n - check for missing values for object dtype data (check_array does\n not do that)\n - return list of features (arrays): this list of features is\n constructed feature by feature to preserve the data types\n of pandas DataFrame columns, as otherwise information is lost\n and cannot be used, e.g. 
for the `categories_` attribute.\n\n \"\"\"\n if not (hasattr(X, 'iloc') and getattr(X, 'ndim', 0) == 2):\n X_temp = check_array(X, dtype=None, force_all_finite=force_all_finite)\n if not hasattr(X, 'dtype') and np.issubdtype(X_temp.dtype, np.str_):\n X = check_array(X, dtype=object, force_all_finite=force_all_finite)\n else:\n X = X_temp\n needs_validation = False\n else:\n needs_validation = force_all_finite\n (n_samples, n_features) = X.shape\n X_columns = []\n for i in range(n_features):\n Xi = self._get_feature(X, feature_idx=i)\n Xi = check_array(Xi, ensure_2d=False, dtype=None, force_all_finite=needs_validation)\n X_columns.append(Xi)\n return X_columns, n_samples, n_features\n \n def _get_feature(self, X, feature_idx):\n if hasattr(X, 'iloc'):\n return X.iloc[:, feature_idx]\n return X[:, feature_idx]\n \n def _fit(self, X, handle_unknown='error', force_all_finite=True):\n self._check_n_features(X, reset=True)\n self._check_feature_names(X, reset=True)\n (X_list, n_samples, n_features) = self._check_X(X, force_all_finite=force_all_finite)\n self.n_features_in_ = n_features\n if self.categories != 'auto':\n if len(self.categories) != n_features:\n raise ValueError('Shape mismatch: if categories is an array, it has to be of shape (n_features,).')\n self.categories_ = []\n for i in range(n_features):\n Xi = X_list[i]\n if self.categories == 'auto':\n cats = _unique(Xi)\n else:\n cats = np.array(self.categories[i], dtype=Xi.dtype)\n if Xi.dtype.kind not in 'OUS':\n sorted_cats = np.sort(cats)\n error_msg = 'Unsorted categories are not supported for numerical categories'\n stop_idx = -1 if np.isnan(sorted_cats[-1]) else None\n if np.any(sorted_cats[:stop_idx] != cats[:stop_idx]) or np.isnan(sorted_cats[-1]) and not np.isnan(sorted_cats[-1]):\n raise ValueError(error_msg)\n if handle_unknown == 'error':\n diff = _check_unknown(Xi, cats)\n if diff:\n msg = 'Found unknown categories {0} in column {1} during fit'.format(diff, i)\n raise ValueError(msg)\n self.categories_.append(cats)\n \n def _transform(self, X, handle_unknown='error', force_all_finite=True, warn_on_unknown=False):\n self._check_feature_names(X, reset=False)\n self._check_n_features(X, reset=False)\n (X_list, n_samples, n_features) = self._check_X(X, force_all_finite=force_all_finite)\n X_int = np.zeros((n_samples, n_features), dtype=int)\n X_mask = np.ones((n_samples, n_features), dtype=bool)\n columns_with_unknown = []\n for i in range(n_features):\n Xi = X_list[i]\n (diff, valid_mask) = _check_unknown(Xi, self.categories_[i], return_mask=True)\n if not np.all(valid_mask):\n if handle_unknown == 'error':\n msg = 'Found unknown categories {0} in column {1} during transform'.format(diff, i)\n raise ValueError(msg)\n else:\n if warn_on_unknown:\n columns_with_unknown.append(i)\n X_mask[:, i] = valid_mask\n if self.categories_[i].dtype.kind in ('U', 'S') and self.categories_[i].itemsize > Xi.itemsize:\n Xi = Xi.astype(self.categories_[i].dtype)\n elif self.categories_[i].dtype.kind == 'O' and Xi.dtype.kind == 'U':\n Xi = Xi.astype('O')\n else:\n Xi = Xi.copy()\n Xi[~valid_mask] = self.categories_[i][0]\n X_int[:, i] = _encode(Xi, uniques=self.categories_[i], check_unknown=False)\n if columns_with_unknown:\n warnings.warn(f'Found unknown categories in columns {columns_with_unknown} during transform. 
These unknown categories will be encoded as all zeros', UserWarning)\n return X_int, X_mask\n \n def _more_tags(self):\n return {'X_types': ['categorical']}\n" }, { "name": "FunctionTransformer", @@ -26083,7 +26179,7 @@ "sklearn.preprocessing._function_transformer.FunctionTransformer._more_tags" ], "is_public": true, - "description": "Constructs a transformer from an arbitrary callable.\n\nA FunctionTransformer forwards its X (and optionally y) arguments to a user-defined function or function object and returns the result of this function. This is useful for stateless transformations such as taking the log of frequencies, doing custom scaling, etc. Note: If a lambda is used as the function, then the resulting transformer will not be pickleable. .. versionadded:: 0.17 Read more in the :ref:`User Guide `.", + "description": "Constructs a transformer from an arbitrary callable.\n\nA FunctionTransformer forwards its X (and optionally y) arguments to a\nuser-defined function or function object and returns the result of this\nfunction. This is useful for stateless transformations such as taking the\nlog of frequencies, doing custom scaling, etc.\n\nNote: If a lambda is used as the function, then the resulting\ntransformer will not be pickleable.\n\n.. versionadded:: 0.17\n\nRead more in the :ref:`User Guide `.", "docstring": "Constructs a transformer from an arbitrary callable.\n\n A FunctionTransformer forwards its X (and optionally y) arguments to a\n user-defined function or function object and returns the result of this\n function. This is useful for stateless transformations such as taking the\n log of frequencies, doing custom scaling, etc.\n\n Note: If a lambda is used as the function, then the resulting\n transformer will not be pickleable.\n\n .. versionadded:: 0.17\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n func : callable, default=None\n The callable to use for the transformation. This will be passed\n the same arguments as transform, with args and kwargs forwarded.\n If func is None, then func will be the identity function.\n\n inverse_func : callable, default=None\n The callable to use for the inverse transformation. This will be\n passed the same arguments as inverse transform, with args and\n kwargs forwarded. If inverse_func is None, then inverse_func\n will be the identity function.\n\n validate : bool, default=False\n Indicate that the input X array should be checked before calling\n ``func``. The possibilities are:\n\n - If False, there is no input validation.\n - If True, then X will be converted to a 2-dimensional NumPy array or\n sparse matrix. If the conversion is not possible an exception is\n raised.\n\n .. versionchanged:: 0.22\n The default of ``validate`` changed from True to False.\n\n accept_sparse : bool, default=False\n Indicate that func accepts a sparse matrix as input. If validate is\n False, this has no effect. Otherwise, if accept_sparse is false,\n sparse matrix inputs will cause an exception to be raised.\n\n check_inverse : bool, default=True\n Whether to check that or ``func`` followed by ``inverse_func`` leads to\n the original inputs. It can be used for a sanity check, raising a\n warning when the condition is not fulfilled.\n\n .. versionadded:: 0.20\n\n kw_args : dict, default=None\n Dictionary of additional keyword arguments to pass to func.\n\n .. versionadded:: 0.18\n\n inv_kw_args : dict, default=None\n Dictionary of additional keyword arguments to pass to inverse_func.\n\n .. 
versionadded:: 0.18\n\n Attributes\n ----------\n n_features_in_ : int\n Number of features seen during :term:`fit`. Defined only when\n `validate=True`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `validate=True`\n and `X` has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n MaxAbsScaler : Scale each feature by its maximum absolute value.\n StandardScaler : Standardize features by removing the mean and\n scaling to unit variance.\n LabelBinarizer : Binarize labels in a one-vs-all fashion.\n MultiLabelBinarizer : Transform between iterable of iterables\n and a multilabel format.\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.preprocessing import FunctionTransformer\n >>> transformer = FunctionTransformer(np.log1p)\n >>> X = np.array([[0, 1], [2, 3]])\n >>> transformer.transform(X)\n array([[0. , 0.6931...],\n [1.0986..., 1.3862...]])\n ", "source_code": "\n\nclass FunctionTransformer(TransformerMixin, BaseEstimator):\n \"\"\"Constructs a transformer from an arbitrary callable.\n\n A FunctionTransformer forwards its X (and optionally y) arguments to a\n user-defined function or function object and returns the result of this\n function. This is useful for stateless transformations such as taking the\n log of frequencies, doing custom scaling, etc.\n\n Note: If a lambda is used as the function, then the resulting\n transformer will not be pickleable.\n\n .. versionadded:: 0.17\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n func : callable, default=None\n The callable to use for the transformation. This will be passed\n the same arguments as transform, with args and kwargs forwarded.\n If func is None, then func will be the identity function.\n\n inverse_func : callable, default=None\n The callable to use for the inverse transformation. This will be\n passed the same arguments as inverse transform, with args and\n kwargs forwarded. If inverse_func is None, then inverse_func\n will be the identity function.\n\n validate : bool, default=False\n Indicate that the input X array should be checked before calling\n ``func``. The possibilities are:\n\n - If False, there is no input validation.\n - If True, then X will be converted to a 2-dimensional NumPy array or\n sparse matrix. If the conversion is not possible an exception is\n raised.\n\n .. versionchanged:: 0.22\n The default of ``validate`` changed from True to False.\n\n accept_sparse : bool, default=False\n Indicate that func accepts a sparse matrix as input. If validate is\n False, this has no effect. Otherwise, if accept_sparse is false,\n sparse matrix inputs will cause an exception to be raised.\n\n check_inverse : bool, default=True\n Whether to check that or ``func`` followed by ``inverse_func`` leads to\n the original inputs. It can be used for a sanity check, raising a\n warning when the condition is not fulfilled.\n\n .. versionadded:: 0.20\n\n kw_args : dict, default=None\n Dictionary of additional keyword arguments to pass to func.\n\n .. versionadded:: 0.18\n\n inv_kw_args : dict, default=None\n Dictionary of additional keyword arguments to pass to inverse_func.\n\n .. versionadded:: 0.18\n\n Attributes\n ----------\n n_features_in_ : int\n Number of features seen during :term:`fit`. Defined only when\n `validate=True`.\n\n .. 
versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `validate=True`\n and `X` has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n MaxAbsScaler : Scale each feature by its maximum absolute value.\n StandardScaler : Standardize features by removing the mean and\n scaling to unit variance.\n LabelBinarizer : Binarize labels in a one-vs-all fashion.\n MultiLabelBinarizer : Transform between iterable of iterables\n and a multilabel format.\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.preprocessing import FunctionTransformer\n >>> transformer = FunctionTransformer(np.log1p)\n >>> X = np.array([[0, 1], [2, 3]])\n >>> transformer.transform(X)\n array([[0. , 0.6931...],\n [1.0986..., 1.3862...]])\n \"\"\"\n \n def __init__(self, func=None, inverse_func=None, *, validate=False, accept_sparse=False, check_inverse=True, kw_args=None, inv_kw_args=None):\n self.func = func\n self.inverse_func = inverse_func\n self.validate = validate\n self.accept_sparse = accept_sparse\n self.check_inverse = check_inverse\n self.kw_args = kw_args\n self.inv_kw_args = inv_kw_args\n \n def _check_input(self, X, *, reset):\n if self.validate:\n return self._validate_data(X, accept_sparse=self.accept_sparse, reset=reset)\n return X\n \n def _check_inverse_transform(self, X):\n \"\"\"Check that func and inverse_func are the inverse.\"\"\"\n idx_selected = slice(None, None, max(1, X.shape[0] // 100))\n X_round_trip = self.inverse_transform(self.transform(X[idx_selected]))\n if not _allclose_dense_sparse(X[idx_selected], X_round_trip):\n warnings.warn(\"The provided functions are not strictly inverse of each other. If you are sure you want to proceed regardless, set 'check_inverse=False'.\", UserWarning)\n \n def fit(self, X, y=None):\n \"\"\"Fit transformer by checking X.\n\n If ``validate`` is ``True``, ``X`` will be checked.\n\n Parameters\n ----------\n X : array-like, shape (n_samples, n_features)\n Input array.\n\n y : Ignored\n Not used, present here for API consistency by convention.\n\n Returns\n -------\n self : object\n FunctionTransformer class instance.\n \"\"\"\n X = self._check_input(X, reset=True)\n if self.check_inverse and not (self.func is None or self.inverse_func is None):\n self._check_inverse_transform(X)\n return self\n \n def transform(self, X):\n \"\"\"Transform X using the forward function.\n\n Parameters\n ----------\n X : array-like, shape (n_samples, n_features)\n Input array.\n\n Returns\n -------\n X_out : array-like, shape (n_samples, n_features)\n Transformed input.\n \"\"\"\n X = self._check_input(X, reset=False)\n return self._transform(X, func=self.func, kw_args=self.kw_args)\n \n def inverse_transform(self, X):\n \"\"\"Transform X using the inverse function.\n\n Parameters\n ----------\n X : array-like, shape (n_samples, n_features)\n Input array.\n\n Returns\n -------\n X_out : array-like, shape (n_samples, n_features)\n Transformed input.\n \"\"\"\n if self.validate:\n X = check_array(X, accept_sparse=self.accept_sparse)\n return self._transform(X, func=self.inverse_func, kw_args=self.inv_kw_args)\n \n def _transform(self, X, func=None, kw_args=None):\n if func is None:\n func = _identity\n return func(X, **kw_args if kw_args else {})\n \n def __sklearn_is_fitted__(self):\n \"\"\"Return True since FunctionTransfomer is stateless.\"\"\"\n return True\n \n def _more_tags(self):\n return {'no_validation': not self.validate, 'stateless': 
True}\n" }, @@ -26101,7 +26197,7 @@ "sklearn.preprocessing._label.LabelBinarizer._more_tags" ], "is_public": true, - "description": "Binarize labels in a one-vs-all fashion.\n\nSeveral regression and binary classification algorithms are available in scikit-learn. A simple way to extend these algorithms to the multi-class classification case is to use the so-called one-vs-all scheme. At learning time, this simply consists in learning one regressor or binary classifier per class. In doing so, one needs to convert multi-class labels to binary labels (belong or does not belong to the class). LabelBinarizer makes this process easy with the transform method. At prediction time, one assigns the class for which the corresponding model gave the greatest confidence. LabelBinarizer makes this easy with the inverse_transform method. Read more in the :ref:`User Guide `.", + "description": "Binarize labels in a one-vs-all fashion.\n\nSeveral regression and binary classification algorithms are\navailable in scikit-learn. A simple way to extend these algorithms\nto the multi-class classification case is to use the so-called\none-vs-all scheme.\n\nAt learning time, this simply consists in learning one regressor\nor binary classifier per class. In doing so, one needs to convert\nmulti-class labels to binary labels (belong or does not belong\nto the class). LabelBinarizer makes this process easy with the\ntransform method.\n\nAt prediction time, one assigns the class for which the corresponding\nmodel gave the greatest confidence. LabelBinarizer makes this easy\nwith the inverse_transform method.\n\nRead more in the :ref:`User Guide `.", "docstring": "Binarize labels in a one-vs-all fashion.\n\n Several regression and binary classification algorithms are\n available in scikit-learn. A simple way to extend these algorithms\n to the multi-class classification case is to use the so-called\n one-vs-all scheme.\n\n At learning time, this simply consists in learning one regressor\n or binary classifier per class. In doing so, one needs to convert\n multi-class labels to binary labels (belong or does not belong\n to the class). LabelBinarizer makes this process easy with the\n transform method.\n\n At prediction time, one assigns the class for which the corresponding\n model gave the greatest confidence. LabelBinarizer makes this easy\n with the inverse_transform method.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n neg_label : int, default=0\n Value with which negative labels must be encoded.\n\n pos_label : int, default=1\n Value with which positive labels must be encoded.\n\n sparse_output : bool, default=False\n True if the returned array from transform is desired to be in sparse\n CSR format.\n\n Attributes\n ----------\n classes_ : ndarray of shape (n_classes,)\n Holds the label for each class.\n\n y_type_ : str\n Represents the type of the target data as evaluated by\n utils.multiclass.type_of_target. 
Possible type are 'continuous',\n 'continuous-multioutput', 'binary', 'multiclass',\n 'multiclass-multioutput', 'multilabel-indicator', and 'unknown'.\n\n sparse_input_ : bool\n True if the input data to transform is given as a sparse matrix, False\n otherwise.\n\n See Also\n --------\n label_binarize : Function to perform the transform operation of\n LabelBinarizer with fixed classes.\n OneHotEncoder : Encode categorical features using a one-hot aka one-of-K\n scheme.\n\n Examples\n --------\n >>> from sklearn import preprocessing\n >>> lb = preprocessing.LabelBinarizer()\n >>> lb.fit([1, 2, 6, 4, 2])\n LabelBinarizer()\n >>> lb.classes_\n array([1, 2, 4, 6])\n >>> lb.transform([1, 6])\n array([[1, 0, 0, 0],\n [0, 0, 0, 1]])\n\n Binary targets transform to a column vector\n\n >>> lb = preprocessing.LabelBinarizer()\n >>> lb.fit_transform(['yes', 'no', 'no', 'yes'])\n array([[1],\n [0],\n [0],\n [1]])\n\n Passing a 2D matrix for multilabel classification\n\n >>> import numpy as np\n >>> lb.fit(np.array([[0, 1, 1], [1, 0, 0]]))\n LabelBinarizer()\n >>> lb.classes_\n array([0, 1, 2])\n >>> lb.transform([0, 1, 2, 1])\n array([[1, 0, 0],\n [0, 1, 0],\n [0, 0, 1],\n [0, 1, 0]])\n ", "source_code": "\n\nclass LabelBinarizer(TransformerMixin, BaseEstimator):\n \"\"\"Binarize labels in a one-vs-all fashion.\n\n Several regression and binary classification algorithms are\n available in scikit-learn. A simple way to extend these algorithms\n to the multi-class classification case is to use the so-called\n one-vs-all scheme.\n\n At learning time, this simply consists in learning one regressor\n or binary classifier per class. In doing so, one needs to convert\n multi-class labels to binary labels (belong or does not belong\n to the class). LabelBinarizer makes this process easy with the\n transform method.\n\n At prediction time, one assigns the class for which the corresponding\n model gave the greatest confidence. LabelBinarizer makes this easy\n with the inverse_transform method.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n neg_label : int, default=0\n Value with which negative labels must be encoded.\n\n pos_label : int, default=1\n Value with which positive labels must be encoded.\n\n sparse_output : bool, default=False\n True if the returned array from transform is desired to be in sparse\n CSR format.\n\n Attributes\n ----------\n classes_ : ndarray of shape (n_classes,)\n Holds the label for each class.\n\n y_type_ : str\n Represents the type of the target data as evaluated by\n utils.multiclass.type_of_target. 
Possible type are 'continuous',\n 'continuous-multioutput', 'binary', 'multiclass',\n 'multiclass-multioutput', 'multilabel-indicator', and 'unknown'.\n\n sparse_input_ : bool\n True if the input data to transform is given as a sparse matrix, False\n otherwise.\n\n See Also\n --------\n label_binarize : Function to perform the transform operation of\n LabelBinarizer with fixed classes.\n OneHotEncoder : Encode categorical features using a one-hot aka one-of-K\n scheme.\n\n Examples\n --------\n >>> from sklearn import preprocessing\n >>> lb = preprocessing.LabelBinarizer()\n >>> lb.fit([1, 2, 6, 4, 2])\n LabelBinarizer()\n >>> lb.classes_\n array([1, 2, 4, 6])\n >>> lb.transform([1, 6])\n array([[1, 0, 0, 0],\n [0, 0, 0, 1]])\n\n Binary targets transform to a column vector\n\n >>> lb = preprocessing.LabelBinarizer()\n >>> lb.fit_transform(['yes', 'no', 'no', 'yes'])\n array([[1],\n [0],\n [0],\n [1]])\n\n Passing a 2D matrix for multilabel classification\n\n >>> import numpy as np\n >>> lb.fit(np.array([[0, 1, 1], [1, 0, 0]]))\n LabelBinarizer()\n >>> lb.classes_\n array([0, 1, 2])\n >>> lb.transform([0, 1, 2, 1])\n array([[1, 0, 0],\n [0, 1, 0],\n [0, 0, 1],\n [0, 1, 0]])\n \"\"\"\n \n def __init__(self, *, neg_label=0, pos_label=1, sparse_output=False):\n if neg_label >= pos_label:\n raise ValueError('neg_label={0} must be strictly less than pos_label={1}.'.format(neg_label, pos_label))\n if sparse_output and (pos_label == 0 or neg_label != 0):\n raise ValueError('Sparse binarization is only supported with non zero pos_label and zero neg_label, got pos_label={0} and neg_label={1}'.format(pos_label, neg_label))\n self.neg_label = neg_label\n self.pos_label = pos_label\n self.sparse_output = sparse_output\n \n def fit(self, y):\n \"\"\"Fit label binarizer.\n\n Parameters\n ----------\n y : ndarray of shape (n_samples,) or (n_samples, n_classes)\n Target values. The 2-d matrix should only contain 0 and 1,\n represents multilabel classification.\n\n Returns\n -------\n self : object\n Returns the instance itself.\n \"\"\"\n self.y_type_ = type_of_target(y)\n if 'multioutput' in self.y_type_:\n raise ValueError('Multioutput target data is not supported with label binarization')\n if _num_samples(y) == 0:\n raise ValueError('y has 0 samples: %r' % y)\n self.sparse_input_ = sp.issparse(y)\n self.classes_ = unique_labels(y)\n return self\n \n def fit_transform(self, y):\n \"\"\"Fit label binarizer/transform multi-class labels to binary labels.\n\n The output of transform is sometimes referred to as\n the 1-of-K coding scheme.\n\n Parameters\n ----------\n y : {ndarray, sparse matrix} of shape (n_samples,) or (n_samples, n_classes)\n Target values. The 2-d matrix should only contain 0 and 1,\n represents multilabel classification. Sparse matrix can be\n CSR, CSC, COO, DOK, or LIL.\n\n Returns\n -------\n Y : {ndarray, sparse matrix} of shape (n_samples, n_classes)\n Shape will be (n_samples, 1) for binary problems. Sparse matrix\n will be of CSR format.\n \"\"\"\n return self.fit(y).transform(y)\n \n def transform(self, y):\n \"\"\"Transform multi-class labels to binary labels.\n\n The output of transform is sometimes referred to by some authors as\n the 1-of-K coding scheme.\n\n Parameters\n ----------\n y : {array, sparse matrix} of shape (n_samples,) or (n_samples, n_classes)\n Target values. The 2-d matrix should only contain 0 and 1,\n represents multilabel classification. 
Sparse matrix can be\n CSR, CSC, COO, DOK, or LIL.\n\n Returns\n -------\n Y : {ndarray, sparse matrix} of shape (n_samples, n_classes)\n Shape will be (n_samples, 1) for binary problems. Sparse matrix\n will be of CSR format.\n \"\"\"\n check_is_fitted(self)\n y_is_multilabel = type_of_target(y).startswith('multilabel')\n if y_is_multilabel and not self.y_type_.startswith('multilabel'):\n raise ValueError('The object was not fitted with multilabel input.')\n return label_binarize(y, classes=self.classes_, pos_label=self.pos_label, neg_label=self.neg_label, sparse_output=self.sparse_output)\n \n def inverse_transform(self, Y, threshold=None):\n \"\"\"Transform binary labels back to multi-class labels.\n\n Parameters\n ----------\n Y : {ndarray, sparse matrix} of shape (n_samples, n_classes)\n Target values. All sparse matrices are converted to CSR before\n inverse transformation.\n\n threshold : float, default=None\n Threshold used in the binary and multi-label cases.\n\n Use 0 when ``Y`` contains the output of decision_function\n (classifier).\n Use 0.5 when ``Y`` contains the output of predict_proba.\n\n If None, the threshold is assumed to be half way between\n neg_label and pos_label.\n\n Returns\n -------\n y : {ndarray, sparse matrix} of shape (n_samples,)\n Target values. Sparse matrix will be of CSR format.\n\n Notes\n -----\n In the case when the binary labels are fractional\n (probabilistic), inverse_transform chooses the class with the\n greatest value. Typically, this allows to use the output of a\n linear model's decision_function method directly as the input\n of inverse_transform.\n \"\"\"\n check_is_fitted(self)\n if threshold is None:\n threshold = (self.pos_label + self.neg_label) / 2.0\n if self.y_type_ == 'multiclass':\n y_inv = _inverse_binarize_multiclass(Y, self.classes_)\n else:\n y_inv = _inverse_binarize_thresholding(Y, self.y_type_, self.classes_, threshold)\n if self.sparse_input_:\n y_inv = sp.csr_matrix(y_inv)\n elif sp.issparse(y_inv):\n y_inv = y_inv.toarray()\n return y_inv\n \n def _more_tags(self):\n return {'X_types': ['1dlabels']}\n" }, @@ -26118,7 +26214,7 @@ "sklearn.preprocessing._label.LabelEncoder._more_tags" ], "is_public": true, - "description": "Encode target labels with value between 0 and n_classes-1.\n\nThis transformer should be used to encode target values, *i.e.* `y`, and not the input `X`. Read more in the :ref:`User Guide `. .. versionadded:: 0.12", + "description": "Encode target labels with value between 0 and n_classes-1.\n\nThis transformer should be used to encode target values, *i.e.* `y`, and\nnot the input `X`.\n\nRead more in the :ref:`User Guide `.\n\n.. versionadded:: 0.12", "docstring": "Encode target labels with value between 0 and n_classes-1.\n\n This transformer should be used to encode target values, *i.e.* `y`, and\n not the input `X`.\n\n Read more in the :ref:`User Guide `.\n\n .. 
versionadded:: 0.12\n\n Attributes\n ----------\n classes_ : ndarray of shape (n_classes,)\n Holds the label for each class.\n\n See Also\n --------\n OrdinalEncoder : Encode categorical features using an ordinal encoding\n scheme.\n OneHotEncoder : Encode categorical features as a one-hot numeric array.\n\n Examples\n --------\n `LabelEncoder` can be used to normalize labels.\n\n >>> from sklearn import preprocessing\n >>> le = preprocessing.LabelEncoder()\n >>> le.fit([1, 2, 2, 6])\n LabelEncoder()\n >>> le.classes_\n array([1, 2, 6])\n >>> le.transform([1, 1, 2, 6])\n array([0, 0, 1, 2]...)\n >>> le.inverse_transform([0, 0, 1, 2])\n array([1, 1, 2, 6])\n\n It can also be used to transform non-numerical labels (as long as they are\n hashable and comparable) to numerical labels.\n\n >>> le = preprocessing.LabelEncoder()\n >>> le.fit([\"paris\", \"paris\", \"tokyo\", \"amsterdam\"])\n LabelEncoder()\n >>> list(le.classes_)\n ['amsterdam', 'paris', 'tokyo']\n >>> le.transform([\"tokyo\", \"tokyo\", \"paris\"])\n array([2, 2, 1]...)\n >>> list(le.inverse_transform([2, 2, 1]))\n ['tokyo', 'tokyo', 'paris']\n ", "source_code": "\n\nclass LabelEncoder(TransformerMixin, BaseEstimator):\n \"\"\"Encode target labels with value between 0 and n_classes-1.\n\n This transformer should be used to encode target values, *i.e.* `y`, and\n not the input `X`.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.12\n\n Attributes\n ----------\n classes_ : ndarray of shape (n_classes,)\n Holds the label for each class.\n\n See Also\n --------\n OrdinalEncoder : Encode categorical features using an ordinal encoding\n scheme.\n OneHotEncoder : Encode categorical features as a one-hot numeric array.\n\n Examples\n --------\n `LabelEncoder` can be used to normalize labels.\n\n >>> from sklearn import preprocessing\n >>> le = preprocessing.LabelEncoder()\n >>> le.fit([1, 2, 2, 6])\n LabelEncoder()\n >>> le.classes_\n array([1, 2, 6])\n >>> le.transform([1, 1, 2, 6])\n array([0, 0, 1, 2]...)\n >>> le.inverse_transform([0, 0, 1, 2])\n array([1, 1, 2, 6])\n\n It can also be used to transform non-numerical labels (as long as they are\n hashable and comparable) to numerical labels.\n\n >>> le = preprocessing.LabelEncoder()\n >>> le.fit([\"paris\", \"paris\", \"tokyo\", \"amsterdam\"])\n LabelEncoder()\n >>> list(le.classes_)\n ['amsterdam', 'paris', 'tokyo']\n >>> le.transform([\"tokyo\", \"tokyo\", \"paris\"])\n array([2, 2, 1]...)\n >>> list(le.inverse_transform([2, 2, 1]))\n ['tokyo', 'tokyo', 'paris']\n \"\"\"\n \n def fit(self, y):\n \"\"\"Fit label encoder.\n\n Parameters\n ----------\n y : array-like of shape (n_samples,)\n Target values.\n\n Returns\n -------\n self : returns an instance of self.\n Fitted label encoder.\n \"\"\"\n y = column_or_1d(y, warn=True)\n self.classes_ = _unique(y)\n return self\n \n def fit_transform(self, y):\n \"\"\"Fit label encoder and return encoded labels.\n\n Parameters\n ----------\n y : array-like of shape (n_samples,)\n Target values.\n\n Returns\n -------\n y : array-like of shape (n_samples,)\n Encoded labels.\n \"\"\"\n y = column_or_1d(y, warn=True)\n (self.classes_, y) = _unique(y, return_inverse=True)\n return y\n \n def transform(self, y):\n \"\"\"Transform labels to normalized encoding.\n\n Parameters\n ----------\n y : array-like of shape (n_samples,)\n Target values.\n\n Returns\n -------\n y : array-like of shape (n_samples,)\n Labels as normalized encodings.\n \"\"\"\n check_is_fitted(self)\n y = column_or_1d(y, warn=True)\n if _num_samples(y) == 0:\n 
return np.array([])\n return _encode(y, uniques=self.classes_)\n \n def inverse_transform(self, y):\n \"\"\"Transform labels back to original encoding.\n\n Parameters\n ----------\n y : ndarray of shape (n_samples,)\n Target values.\n\n Returns\n -------\n y : ndarray of shape (n_samples,)\n Original encoding.\n \"\"\"\n check_is_fitted(self)\n y = column_or_1d(y, warn=True)\n if _num_samples(y) == 0:\n return np.array([])\n diff = np.setdiff1d(y, np.arange(len(self.classes_)))\n if len(diff):\n raise ValueError('y contains previously unseen labels: %s' % str(diff))\n y = np.asarray(y)\n return self.classes_[y]\n \n def _more_tags(self):\n return {'X_types': ['1dlabels']}\n" }, @@ -26138,7 +26234,7 @@ "sklearn.preprocessing._label.MultiLabelBinarizer._more_tags" ], "is_public": true, - "description": "Transform between iterable of iterables and a multilabel format.\n\nAlthough a list of sets or tuples is a very intuitive format for multilabel data, it is unwieldy to process. This transformer converts between this intuitive format and the supported multilabel format: a (samples x classes) binary matrix indicating the presence of a class label.", + "description": "Transform between iterable of iterables and a multilabel format.\n\nAlthough a list of sets or tuples is a very intuitive format for multilabel\ndata, it is unwieldy to process. This transformer converts between this\nintuitive format and the supported multilabel format: a (samples x classes)\nbinary matrix indicating the presence of a class label.", "docstring": "Transform between iterable of iterables and a multilabel format.\n\n Although a list of sets or tuples is a very intuitive format for multilabel\n data, it is unwieldy to process. This transformer converts between this\n intuitive format and the supported multilabel format: a (samples x classes)\n binary matrix indicating the presence of a class label.\n\n Parameters\n ----------\n classes : array-like of shape (n_classes,), default=None\n Indicates an ordering for the class labels.\n All entries should be unique (cannot contain duplicate classes).\n\n sparse_output : bool, default=False\n Set to True if output binary array is desired in CSR sparse format.\n\n Attributes\n ----------\n classes_ : ndarray of shape (n_classes,)\n A copy of the `classes` parameter when provided.\n Otherwise it corresponds to the sorted set of classes found\n when fitting.\n\n See Also\n --------\n OneHotEncoder : Encode categorical features using a one-hot aka one-of-K\n scheme.\n\n Examples\n --------\n >>> from sklearn.preprocessing import MultiLabelBinarizer\n >>> mlb = MultiLabelBinarizer()\n >>> mlb.fit_transform([(1, 2), (3,)])\n array([[1, 1, 0],\n [0, 0, 1]])\n >>> mlb.classes_\n array([1, 2, 3])\n\n >>> mlb.fit_transform([{'sci-fi', 'thriller'}, {'comedy'}])\n array([[0, 1, 1],\n [1, 0, 0]])\n >>> list(mlb.classes_)\n ['comedy', 'sci-fi', 'thriller']\n\n A common mistake is to pass in a list, which leads to the following issue:\n\n >>> mlb = MultiLabelBinarizer()\n >>> mlb.fit(['sci-fi', 'thriller', 'comedy'])\n MultiLabelBinarizer()\n >>> mlb.classes_\n array(['-', 'c', 'd', 'e', 'f', 'h', 'i', 'l', 'm', 'o', 'r', 's', 't',\n 'y'], dtype=object)\n\n To correct this, the list of labels should be passed in as:\n\n >>> mlb = MultiLabelBinarizer()\n >>> mlb.fit([['sci-fi', 'thriller', 'comedy']])\n MultiLabelBinarizer()\n >>> mlb.classes_\n array(['comedy', 'sci-fi', 'thriller'], dtype=object)\n ", "source_code": "\n\nclass MultiLabelBinarizer(TransformerMixin, BaseEstimator):\n 
\"\"\"Transform between iterable of iterables and a multilabel format.\n\n Although a list of sets or tuples is a very intuitive format for multilabel\n data, it is unwieldy to process. This transformer converts between this\n intuitive format and the supported multilabel format: a (samples x classes)\n binary matrix indicating the presence of a class label.\n\n Parameters\n ----------\n classes : array-like of shape (n_classes,), default=None\n Indicates an ordering for the class labels.\n All entries should be unique (cannot contain duplicate classes).\n\n sparse_output : bool, default=False\n Set to True if output binary array is desired in CSR sparse format.\n\n Attributes\n ----------\n classes_ : ndarray of shape (n_classes,)\n A copy of the `classes` parameter when provided.\n Otherwise it corresponds to the sorted set of classes found\n when fitting.\n\n See Also\n --------\n OneHotEncoder : Encode categorical features using a one-hot aka one-of-K\n scheme.\n\n Examples\n --------\n >>> from sklearn.preprocessing import MultiLabelBinarizer\n >>> mlb = MultiLabelBinarizer()\n >>> mlb.fit_transform([(1, 2), (3,)])\n array([[1, 1, 0],\n [0, 0, 1]])\n >>> mlb.classes_\n array([1, 2, 3])\n\n >>> mlb.fit_transform([{'sci-fi', 'thriller'}, {'comedy'}])\n array([[0, 1, 1],\n [1, 0, 0]])\n >>> list(mlb.classes_)\n ['comedy', 'sci-fi', 'thriller']\n\n A common mistake is to pass in a list, which leads to the following issue:\n\n >>> mlb = MultiLabelBinarizer()\n >>> mlb.fit(['sci-fi', 'thriller', 'comedy'])\n MultiLabelBinarizer()\n >>> mlb.classes_\n array(['-', 'c', 'd', 'e', 'f', 'h', 'i', 'l', 'm', 'o', 'r', 's', 't',\n 'y'], dtype=object)\n\n To correct this, the list of labels should be passed in as:\n\n >>> mlb = MultiLabelBinarizer()\n >>> mlb.fit([['sci-fi', 'thriller', 'comedy']])\n MultiLabelBinarizer()\n >>> mlb.classes_\n array(['comedy', 'sci-fi', 'thriller'], dtype=object)\n \"\"\"\n \n def __init__(self, *, classes=None, sparse_output=False):\n self.classes = classes\n self.sparse_output = sparse_output\n \n def fit(self, y):\n \"\"\"Fit the label sets binarizer, storing :term:`classes_`.\n\n Parameters\n ----------\n y : iterable of iterables\n A set of labels (any orderable and hashable object) for each\n sample. If the `classes` parameter is set, `y` will not be\n iterated.\n\n Returns\n -------\n self : object\n Fitted estimator.\n \"\"\"\n self._cached_dict = None\n if self.classes is None:\n classes = sorted(set(itertools.chain.from_iterable(y)))\n elif len(set(self.classes)) < len(self.classes):\n raise ValueError('The classes argument contains duplicate classes. Remove these duplicates before passing them to MultiLabelBinarizer.')\n else:\n classes = self.classes\n dtype = int if all((isinstance(c, int) for c in classes)) else object\n self.classes_ = np.empty(len(classes), dtype=dtype)\n self.classes_[:] = classes\n return self\n \n def fit_transform(self, y):\n \"\"\"Fit the label sets binarizer and transform the given label sets.\n\n Parameters\n ----------\n y : iterable of iterables\n A set of labels (any orderable and hashable object) for each\n sample. If the `classes` parameter is set, `y` will not be\n iterated.\n\n Returns\n -------\n y_indicator : {ndarray, sparse matrix} of shape (n_samples, n_classes)\n A matrix such that `y_indicator[i, j] = 1` iff `classes_[j]`\n is in `y[i]`, and 0 otherwise. 
Sparse matrix will be of CSR\n format.\n \"\"\"\n self._cached_dict = None\n if self.classes is not None:\n return self.fit(y).transform(y)\n class_mapping = defaultdict(int)\n class_mapping.default_factory = class_mapping.__len__\n yt = self._transform(y, class_mapping)\n tmp = sorted(class_mapping, key=class_mapping.get)\n dtype = int if all((isinstance(c, int) for c in tmp)) else object\n class_mapping = np.empty(len(tmp), dtype=dtype)\n class_mapping[:] = tmp\n (self.classes_, inverse) = np.unique(class_mapping, return_inverse=True)\n yt.indices = np.array(inverse[yt.indices], dtype=yt.indices.dtype, copy=False)\n if not self.sparse_output:\n yt = yt.toarray()\n return yt\n \n def transform(self, y):\n \"\"\"Transform the given label sets.\n\n Parameters\n ----------\n y : iterable of iterables\n A set of labels (any orderable and hashable object) for each\n sample. If the `classes` parameter is set, `y` will not be\n iterated.\n\n Returns\n -------\n y_indicator : array or CSR matrix, shape (n_samples, n_classes)\n A matrix such that `y_indicator[i, j] = 1` iff `classes_[j]` is in\n `y[i]`, and 0 otherwise.\n \"\"\"\n check_is_fitted(self)\n class_to_index = self._build_cache()\n yt = self._transform(y, class_to_index)\n if not self.sparse_output:\n yt = yt.toarray()\n return yt\n \n def _build_cache(self):\n if self._cached_dict is None:\n self._cached_dict = dict(zip(self.classes_, range(len(self.classes_))))\n return self._cached_dict\n \n def _transform(self, y, class_mapping):\n \"\"\"Transforms the label sets with a given mapping.\n\n Parameters\n ----------\n y : iterable of iterables\n A set of labels (any orderable and hashable object) for each\n sample. If the `classes` parameter is set, `y` will not be\n iterated.\n\n class_mapping : Mapping\n Maps from label to column index in label indicator matrix.\n\n Returns\n -------\n y_indicator : sparse matrix of shape (n_samples, n_classes)\n Label indicator matrix. Will be of CSR format.\n \"\"\"\n indices = array.array('i')\n indptr = array.array('i', [0])\n unknown = set()\n for labels in y:\n index = set()\n for label in labels:\n try:\n index.add(class_mapping[label])\n except KeyError:\n unknown.add(label)\n indices.extend(index)\n indptr.append(len(indices))\n if unknown:\n warnings.warn('unknown class(es) {0} will be ignored'.format(sorted(unknown, key=str)))\n data = np.ones(len(indices), dtype=int)\n return sp.csr_matrix((data, indices, indptr), shape=(len(indptr) - 1, len(class_mapping)))\n \n def inverse_transform(self, yt):\n \"\"\"Transform the given indicator matrix into label sets.\n\n Parameters\n ----------\n yt : {ndarray, sparse matrix} of shape (n_samples, n_classes)\n A matrix containing only 1s ands 0s.\n\n Returns\n -------\n y : list of tuples\n The set of labels for each sample such that `y[i]` consists of\n `classes_[j]` for each `yt[i, j] == 1`.\n \"\"\"\n check_is_fitted(self)\n if yt.shape[1] != len(self.classes_):\n raise ValueError('Expected indicator for {0} classes, but got {1}'.format(len(self.classes_), yt.shape[1]))\n if sp.issparse(yt):\n yt = yt.tocsr()\n if len(yt.data) != 0 and len(np.setdiff1d(yt.data, [0, 1])) > 0:\n raise ValueError('Expected only 0s and 1s in label indicator.')\n return [tuple(self.classes_.take(yt.indices[start:end])) for (start, end) in zip(yt.indptr[:-1], yt.indptr[1:])]\n else:\n unexpected = np.setdiff1d(yt, [0, 1])\n if len(unexpected) > 0:\n raise ValueError('Expected only 0s and 1s in label indicator. 
Also got {0}'.format(unexpected))\n return [tuple(self.classes_.compress(indicators)) for indicators in yt]\n \n def _more_tags(self):\n return {'X_types': ['2dlabels']}\n" }, @@ -26159,7 +26255,7 @@ "sklearn.preprocessing._polynomial.PolynomialFeatures.n_input_features_@getter" ], "is_public": true, - "description": "Generate polynomial and interaction features.\n\nGenerate a new feature matrix consisting of all polynomial combinations of the features with degree less than or equal to the specified degree. For example, if an input sample is two dimensional and of the form [a, b], the degree-2 polynomial features are [1, a, b, a^2, ab, b^2]. Read more in the :ref:`User Guide `.", + "description": "Generate polynomial and interaction features.\n\nGenerate a new feature matrix consisting of all polynomial combinations\nof the features with degree less than or equal to the specified degree.\nFor example, if an input sample is two dimensional and of the form\n[a, b], the degree-2 polynomial features are [1, a, b, a^2, ab, b^2].\n\nRead more in the :ref:`User Guide `.", "docstring": "Generate polynomial and interaction features.\n\n Generate a new feature matrix consisting of all polynomial combinations\n of the features with degree less than or equal to the specified degree.\n For example, if an input sample is two dimensional and of the form\n [a, b], the degree-2 polynomial features are [1, a, b, a^2, ab, b^2].\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n degree : int or tuple (min_degree, max_degree), default=2\n If a single int is given, it specifies the maximal degree of the\n polynomial features. If a tuple `(min_degree, max_degree)` is passed,\n then `min_degree` is the minimum and `max_degree` is the maximum\n polynomial degree of the generated features. Note that `min_degree=0`\n and `min_degree=1` are equivalent as outputting the degree zero term is\n determined by `include_bias`.\n\n interaction_only : bool, default=False\n If `True`, only interaction features are produced: features that are\n products of at most `degree` *distinct* input features, i.e. terms with\n power of 2 or higher of the same input feature are excluded:\n\n - included: `x[0]`, `x[1]`, `x[0] * x[1]`, etc.\n - excluded: `x[0] ** 2`, `x[0] ** 2 * x[1]`, etc.\n\n include_bias : bool, default=True\n If `True` (default), then include a bias column, the feature in which\n all polynomial powers are zero (i.e. a column of ones - acts as an\n intercept term in a linear model).\n\n order : {'C', 'F'}, default='C'\n Order of output array in the dense case. `'F'` order is faster to\n compute, but may slow down subsequent estimators.\n\n .. versionadded:: 0.21\n\n Attributes\n ----------\n powers_ : ndarray of shape (`n_output_features_`, `n_features_in_`)\n `powers_[i, j]` is the exponent of the jth input in the ith output.\n\n n_input_features_ : int\n The total number of input features.\n\n .. deprecated:: 1.0\n This attribute is deprecated in 1.0 and will be removed in 1.2.\n Refer to `n_features_in_` instead.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n n_output_features_ : int\n The total number of polynomial output features. 
The number of output\n features is computed by iterating over all suitably sized combinations\n of input features.\n\n See Also\n --------\n SplineTransformer : Transformer that generates univariate B-spline bases\n for features.\n\n Notes\n -----\n Be aware that the number of features in the output array scales\n polynomially in the number of features of the input array, and\n exponentially in the degree. High degrees can cause overfitting.\n\n See :ref:`examples/linear_model/plot_polynomial_interpolation.py\n `\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.preprocessing import PolynomialFeatures\n >>> X = np.arange(6).reshape(3, 2)\n >>> X\n array([[0, 1],\n [2, 3],\n [4, 5]])\n >>> poly = PolynomialFeatures(2)\n >>> poly.fit_transform(X)\n array([[ 1., 0., 1., 0., 0., 1.],\n [ 1., 2., 3., 4., 6., 9.],\n [ 1., 4., 5., 16., 20., 25.]])\n >>> poly = PolynomialFeatures(interaction_only=True)\n >>> poly.fit_transform(X)\n array([[ 1., 0., 1., 0.],\n [ 1., 2., 3., 6.],\n [ 1., 4., 5., 20.]])\n ", "source_code": "\n\nclass PolynomialFeatures(TransformerMixin, BaseEstimator):\n \"\"\"Generate polynomial and interaction features.\n\n Generate a new feature matrix consisting of all polynomial combinations\n of the features with degree less than or equal to the specified degree.\n For example, if an input sample is two dimensional and of the form\n [a, b], the degree-2 polynomial features are [1, a, b, a^2, ab, b^2].\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n degree : int or tuple (min_degree, max_degree), default=2\n If a single int is given, it specifies the maximal degree of the\n polynomial features. If a tuple `(min_degree, max_degree)` is passed,\n then `min_degree` is the minimum and `max_degree` is the maximum\n polynomial degree of the generated features. Note that `min_degree=0`\n and `min_degree=1` are equivalent as outputting the degree zero term is\n determined by `include_bias`.\n\n interaction_only : bool, default=False\n If `True`, only interaction features are produced: features that are\n products of at most `degree` *distinct* input features, i.e. terms with\n power of 2 or higher of the same input feature are excluded:\n\n - included: `x[0]`, `x[1]`, `x[0] * x[1]`, etc.\n - excluded: `x[0] ** 2`, `x[0] ** 2 * x[1]`, etc.\n\n include_bias : bool, default=True\n If `True` (default), then include a bias column, the feature in which\n all polynomial powers are zero (i.e. a column of ones - acts as an\n intercept term in a linear model).\n\n order : {'C', 'F'}, default='C'\n Order of output array in the dense case. `'F'` order is faster to\n compute, but may slow down subsequent estimators.\n\n .. versionadded:: 0.21\n\n Attributes\n ----------\n powers_ : ndarray of shape (`n_output_features_`, `n_features_in_`)\n `powers_[i, j]` is the exponent of the jth input in the ith output.\n\n n_input_features_ : int\n The total number of input features.\n\n .. deprecated:: 1.0\n This attribute is deprecated in 1.0 and will be removed in 1.2.\n Refer to `n_features_in_` instead.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n n_output_features_ : int\n The total number of polynomial output features. 
The number of output\n features is computed by iterating over all suitably sized combinations\n of input features.\n\n See Also\n --------\n SplineTransformer : Transformer that generates univariate B-spline bases\n for features.\n\n Notes\n -----\n Be aware that the number of features in the output array scales\n polynomially in the number of features of the input array, and\n exponentially in the degree. High degrees can cause overfitting.\n\n See :ref:`examples/linear_model/plot_polynomial_interpolation.py\n `\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.preprocessing import PolynomialFeatures\n >>> X = np.arange(6).reshape(3, 2)\n >>> X\n array([[0, 1],\n [2, 3],\n [4, 5]])\n >>> poly = PolynomialFeatures(2)\n >>> poly.fit_transform(X)\n array([[ 1., 0., 1., 0., 0., 1.],\n [ 1., 2., 3., 4., 6., 9.],\n [ 1., 4., 5., 16., 20., 25.]])\n >>> poly = PolynomialFeatures(interaction_only=True)\n >>> poly.fit_transform(X)\n array([[ 1., 0., 1., 0.],\n [ 1., 2., 3., 6.],\n [ 1., 4., 5., 20.]])\n \"\"\"\n \n def __init__(self, degree=2, *, interaction_only=False, include_bias=True, order='C'):\n self.degree = degree\n self.interaction_only = interaction_only\n self.include_bias = include_bias\n self.order = order\n \n @staticmethod\n def _combinations(n_features, min_degree, max_degree, interaction_only, include_bias):\n comb = combinations if interaction_only else combinations_w_r\n start = max(1, min_degree)\n iter = chain.from_iterable((comb(range(n_features), i) for i in range(start, max_degree + 1)))\n if include_bias:\n iter = chain(comb(range(n_features), 0), iter)\n return iter\n \n @staticmethod\n def _num_combinations(n_features, min_degree, max_degree, interaction_only, include_bias):\n \"\"\"Calculate number of terms in polynomial expansion\n\n This should be equivalent to counting the number of terms returned by\n _combinations(...) but much faster.\n \"\"\"\n if interaction_only:\n combinations = sum([comb(n_features, i, exact=True) for i in range(max(1, min_degree), min(max_degree, n_features) + 1)])\n else:\n combinations = comb(n_features + max_degree, max_degree, exact=True) - 1\n if min_degree > 0:\n d = min_degree - 1\n combinations -= comb(n_features + d, d, exact=True) - 1\n if include_bias:\n combinations += 1\n return combinations\n \n @property\n def powers_(self):\n \"\"\"Exponent for each of the inputs in the output.\"\"\"\n check_is_fitted(self)\n combinations = self._combinations(n_features=self.n_features_in_, min_degree=self._min_degree, max_degree=self._max_degree, interaction_only=self.interaction_only, include_bias=self.include_bias)\n return np.vstack([np.bincount(c, minlength=self.n_features_in_) for c in combinations])\n \n @deprecated('get_feature_names is deprecated in 1.0 and will be removed in 1.2. Please use get_feature_names_out instead.')\n def get_feature_names(self, input_features=None):\n \"\"\"Return feature names for output features.\n\n Parameters\n ----------\n input_features : list of str of shape (n_features,), default=None\n String names for input features if available. By default,\n \"x0\", \"x1\", ... 
\"xn_features\" is used.\n\n Returns\n -------\n output_feature_names : list of str of shape (n_output_features,)\n Transformed feature names.\n \"\"\"\n powers = self.powers_\n if input_features is None:\n input_features = ['x%d' % i for i in range(powers.shape[1])]\n feature_names = []\n for row in powers:\n inds = np.where(row)[0]\n if len(inds):\n name = ' '.join(('%s^%d' % (input_features[ind], exp) if exp != 1 else input_features[ind] for (ind, exp) in zip(inds, row[inds])))\n else:\n name = '1'\n feature_names.append(name)\n return feature_names\n \n def get_feature_names_out(self, input_features=None):\n \"\"\"Get output feature names for transformation.\n\n Parameters\n ----------\n input_features : array-like of str or None, default=None\n Input features.\n\n - If `input_features is None`, then `feature_names_in_` is\n used as feature names in. If `feature_names_in_` is not defined,\n then names are generated: `[x0, x1, ..., x(n_features_in_)]`.\n - If `input_features` is an array-like, then `input_features` must\n match `feature_names_in_` if `feature_names_in_` is defined.\n\n Returns\n -------\n feature_names_out : ndarray of str objects\n Transformed feature names.\n \"\"\"\n powers = self.powers_\n input_features = _check_feature_names_in(self, input_features)\n feature_names = []\n for row in powers:\n inds = np.where(row)[0]\n if len(inds):\n name = ' '.join(('%s^%d' % (input_features[ind], exp) if exp != 1 else input_features[ind] for (ind, exp) in zip(inds, row[inds])))\n else:\n name = '1'\n feature_names.append(name)\n return np.asarray(feature_names, dtype=object)\n \n def fit(self, X, y=None):\n \"\"\"\n Compute number of output features.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The data.\n\n y : Ignored\n Not used, present here for API consistency by convention.\n\n Returns\n -------\n self : object\n Fitted transformer.\n \"\"\"\n (_, n_features) = self._validate_data(X, accept_sparse=True).shape\n if isinstance(self.degree, numbers.Integral):\n if self.degree < 0:\n raise ValueError(f'degree must be a non-negative integer, got {self.degree}.')\n self._min_degree = 0\n self._max_degree = self.degree\n elif isinstance(self.degree, collections.abc.Iterable) and len(self.degree) == 2:\n (self._min_degree, self._max_degree) = self.degree\n if not (isinstance(self._min_degree, numbers.Integral) and isinstance(self._max_degree, numbers.Integral) and self._min_degree >= 0 and self._min_degree <= self._max_degree):\n raise ValueError(f'degree=(min_degree, max_degree) must be non-negative integers that fulfil min_degree <= max_degree, got {self.degree}.')\n else:\n raise ValueError(f'degree must be a non-negative int or tuple (min_degree, max_degree), got {self.degree}.')\n self.n_output_features_ = self._num_combinations(n_features=n_features, min_degree=self._min_degree, max_degree=self._max_degree, interaction_only=self.interaction_only, include_bias=self.include_bias)\n self._n_out_full = self._num_combinations(n_features=n_features, min_degree=0, max_degree=self._max_degree, interaction_only=self.interaction_only, include_bias=self.include_bias)\n return self\n \n def transform(self, X):\n \"\"\"Transform data to polynomial features.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The data to transform, row by row.\n\n Prefer CSR over CSC for sparse input (for speed), but CSC is\n required if the degree is 4 or higher. 
If the degree is less than\n 4 and the input format is CSC, it will be converted to CSR, have\n its polynomial features generated, then converted back to CSC.\n\n If the degree is 2 or 3, the method described in \"Leveraging\n Sparsity to Speed Up Polynomial Feature Expansions of CSR Matrices\n Using K-Simplex Numbers\" by Andrew Nystrom and John Hughes is\n used, which is much faster than the method used on CSC input. For\n this reason, a CSC input will be converted to CSR, and the output\n will be converted back to CSC prior to being returned, hence the\n preference of CSR.\n\n Returns\n -------\n XP : {ndarray, sparse matrix} of shape (n_samples, NP)\n The matrix of features, where `NP` is the number of polynomial\n features generated from the combination of inputs. If a sparse\n matrix is provided, it will be converted into a sparse\n `csr_matrix`.\n \"\"\"\n check_is_fitted(self)\n X = self._validate_data(X, order='F', dtype=FLOAT_DTYPES, reset=False, accept_sparse=('csr', 'csc'))\n (n_samples, n_features) = X.shape\n if sparse.isspmatrix_csr(X):\n if self._max_degree > 3:\n return self.transform(X.tocsc()).tocsr()\n to_stack = []\n if self.include_bias:\n to_stack.append(sparse.csc_matrix(np.ones(shape=(n_samples, 1), dtype=X.dtype)))\n if self._min_degree <= 1:\n to_stack.append(X)\n for deg in range(max(2, self._min_degree), self._max_degree + 1):\n Xp_next = _csr_polynomial_expansion(X.data, X.indices, X.indptr, X.shape[1], self.interaction_only, deg)\n if Xp_next is None:\n break\n to_stack.append(Xp_next)\n if len(to_stack) == 0:\n XP = sparse.csr_matrix((n_samples, 0), dtype=X.dtype)\n else:\n XP = sparse.hstack(to_stack, format='csr')\n elif sparse.isspmatrix_csc(X) and self._max_degree < 4:\n return self.transform(X.tocsr()).tocsc()\n elif sparse.isspmatrix(X):\n combinations = self._combinations(n_features=n_features, min_degree=self._min_degree, max_degree=self._max_degree, interaction_only=self.interaction_only, include_bias=self.include_bias)\n columns = []\n for combi in combinations:\n if combi:\n out_col = 1\n for col_idx in combi:\n out_col = X[:, col_idx].multiply(out_col)\n columns.append(out_col)\n else:\n bias = sparse.csc_matrix(np.ones((X.shape[0], 1)))\n columns.append(bias)\n XP = sparse.hstack(columns, dtype=X.dtype).tocsc()\n else:\n XP = np.empty(shape=(n_samples, self._n_out_full), dtype=X.dtype, order=self.order)\n if self.include_bias:\n XP[:, 0] = 1\n current_col = 1\n else:\n current_col = 0\n XP[:, current_col:current_col + n_features] = X\n index = list(range(current_col, current_col + n_features))\n current_col += n_features\n index.append(current_col)\n for _ in range(2, self._max_degree + 1):\n new_index = []\n end = index[-1]\n for feature_idx in range(n_features):\n start = index[feature_idx]\n new_index.append(current_col)\n if self.interaction_only:\n start += index[feature_idx + 1] - index[feature_idx]\n next_col = current_col + end - start\n if next_col <= current_col:\n break\n np.multiply(XP[:, start:end], X[:, feature_idx:feature_idx + 1], out=XP[:, current_col:next_col], casting='no')\n current_col = next_col\n new_index.append(current_col)\n index = new_index\n if self._min_degree > 1:\n (n_XP, n_Xout) = (self._n_out_full, self.n_output_features_)\n if self.include_bias:\n Xout = np.empty(shape=(n_samples, n_Xout), dtype=XP.dtype, order=self.order)\n Xout[:, 0] = 1\n Xout[:, 1:] = XP[:, n_XP - n_Xout + 1:]\n else:\n Xout = XP[:, n_XP - n_Xout:].copy()\n XP = Xout\n return XP\n \n @deprecated('The attribute `n_input_features_` was 
deprecated in version 1.0 and will be removed in 1.2.')\n @property\n def n_input_features_(self):\n return self.n_features_in_\n" }, @@ -26177,7 +26273,7 @@ "sklearn.preprocessing._polynomial.SplineTransformer.transform" ], "is_public": true, - "description": "Generate univariate B-spline bases for features.\n\nGenerate a new feature matrix consisting of `n_splines=n_knots + degree - 1` (`n_knots - 1` for `extrapolation=\"periodic\"`) spline basis functions (B-splines) of polynomial order=`degree` for each feature. Read more in the :ref:`User Guide `. .. versionadded:: 1.0", + "description": "Generate univariate B-spline bases for features.\n\nGenerate a new feature matrix consisting of\n`n_splines=n_knots + degree - 1` (`n_knots - 1` for\n`extrapolation=\"periodic\"`) spline basis functions\n(B-splines) of polynomial order=`degree` for each feature.\n\nRead more in the :ref:`User Guide `.\n\n.. versionadded:: 1.0", "docstring": "Generate univariate B-spline bases for features.\n\n Generate a new feature matrix consisting of\n `n_splines=n_knots + degree - 1` (`n_knots - 1` for\n `extrapolation=\"periodic\"`) spline basis functions\n (B-splines) of polynomial order=`degree` for each feature.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 1.0\n\n Parameters\n ----------\n n_knots : int, default=5\n Number of knots of the splines if `knots` equals one of\n {'uniform', 'quantile'}. Must be larger or equal 2. Ignored if `knots`\n is array-like.\n\n degree : int, default=3\n The polynomial degree of the spline basis. Must be a non-negative\n integer.\n\n knots : {'uniform', 'quantile'} or array-like of shape (n_knots, n_features), default='uniform'\n Set knot positions such that first knot <= features <= last knot.\n\n - If 'uniform', `n_knots` number of knots are distributed uniformly\n from min to max values of the features.\n - If 'quantile', they are distributed uniformly along the quantiles of\n the features.\n - If an array-like is given, it directly specifies the sorted knot\n positions including the boundary knots. Note that, internally,\n `degree` number of knots are added before the first knot, the same\n after the last knot.\n\n extrapolation : {'error', 'constant', 'linear', 'continue', 'periodic'}, default='constant'\n If 'error', values outside the min and max values of the training\n features raises a `ValueError`. If 'constant', the value of the\n splines at minimum and maximum value of the features is used as\n constant extrapolation. If 'linear', a linear extrapolation is used.\n If 'continue', the splines are extrapolated as is, i.e. option\n `extrapolate=True` in :class:`scipy.interpolate.BSpline`. If\n 'periodic', periodic splines with a periodicity equal to the distance\n between the first and last knot are used. Periodic splines enforce\n equal function values and derivatives at the first and last knot.\n For example, this makes it possible to avoid introducing an arbitrary\n jump between Dec 31st and Jan 1st in spline features derived from a\n naturally periodic \"day-of-year\" input feature. In this case it is\n recommended to manually set the knot values to control the period.\n\n include_bias : bool, default=True\n If True (default), then the last spline element inside the data range\n of a feature is dropped. As B-splines sum to one over the spline basis\n functions for each data point, they implicitly include a bias term,\n i.e. a column of ones. 
It acts as an intercept term in a linear models.\n\n order : {'C', 'F'}, default='C'\n Order of output array. 'F' order is faster to compute, but may slow\n down subsequent estimators.\n\n Attributes\n ----------\n bsplines_ : list of shape (n_features,)\n List of BSplines objects, one for each feature.\n\n n_features_in_ : int\n The total number of input features.\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n n_features_out_ : int\n The total number of output features, which is computed as\n `n_features * n_splines`, where `n_splines` is\n the number of bases elements of the B-splines,\n `n_knots + degree - 1` for non-periodic splines and\n `n_knots - 1` for periodic ones.\n If `include_bias=False`, then it is only\n `n_features * (n_splines - 1)`.\n\n See Also\n --------\n KBinsDiscretizer : Transformer that bins continuous data into intervals.\n\n PolynomialFeatures : Transformer that generates polynomial and interaction\n features.\n\n Notes\n -----\n High degrees and a high number of knots can cause overfitting.\n\n See :ref:`examples/linear_model/plot_polynomial_interpolation.py\n `.\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.preprocessing import SplineTransformer\n >>> X = np.arange(6).reshape(6, 1)\n >>> spline = SplineTransformer(degree=2, n_knots=3)\n >>> spline.fit_transform(X)\n array([[0.5 , 0.5 , 0. , 0. ],\n [0.18, 0.74, 0.08, 0. ],\n [0.02, 0.66, 0.32, 0. ],\n [0. , 0.32, 0.66, 0.02],\n [0. , 0.08, 0.74, 0.18],\n [0. , 0. , 0.5 , 0.5 ]])\n ", "source_code": "\n\nclass SplineTransformer(TransformerMixin, BaseEstimator):\n \"\"\"Generate univariate B-spline bases for features.\n\n Generate a new feature matrix consisting of\n `n_splines=n_knots + degree - 1` (`n_knots - 1` for\n `extrapolation=\"periodic\"`) spline basis functions\n (B-splines) of polynomial order=`degree` for each feature.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 1.0\n\n Parameters\n ----------\n n_knots : int, default=5\n Number of knots of the splines if `knots` equals one of\n {'uniform', 'quantile'}. Must be larger or equal 2. Ignored if `knots`\n is array-like.\n\n degree : int, default=3\n The polynomial degree of the spline basis. Must be a non-negative\n integer.\n\n knots : {'uniform', 'quantile'} or array-like of shape (n_knots, n_features), default='uniform'\n Set knot positions such that first knot <= features <= last knot.\n\n - If 'uniform', `n_knots` number of knots are distributed uniformly\n from min to max values of the features.\n - If 'quantile', they are distributed uniformly along the quantiles of\n the features.\n - If an array-like is given, it directly specifies the sorted knot\n positions including the boundary knots. Note that, internally,\n `degree` number of knots are added before the first knot, the same\n after the last knot.\n\n extrapolation : {'error', 'constant', 'linear', 'continue', 'periodic'}, default='constant'\n If 'error', values outside the min and max values of the training\n features raises a `ValueError`. If 'constant', the value of the\n splines at minimum and maximum value of the features is used as\n constant extrapolation. If 'linear', a linear extrapolation is used.\n If 'continue', the splines are extrapolated as is, i.e. option\n `extrapolate=True` in :class:`scipy.interpolate.BSpline`. 
If\n 'periodic', periodic splines with a periodicity equal to the distance\n between the first and last knot are used. Periodic splines enforce\n equal function values and derivatives at the first and last knot.\n For example, this makes it possible to avoid introducing an arbitrary\n jump between Dec 31st and Jan 1st in spline features derived from a\n naturally periodic \"day-of-year\" input feature. In this case it is\n recommended to manually set the knot values to control the period.\n\n include_bias : bool, default=True\n If True (default), then the last spline element inside the data range\n of a feature is dropped. As B-splines sum to one over the spline basis\n functions for each data point, they implicitly include a bias term,\n i.e. a column of ones. It acts as an intercept term in a linear models.\n\n order : {'C', 'F'}, default='C'\n Order of output array. 'F' order is faster to compute, but may slow\n down subsequent estimators.\n\n Attributes\n ----------\n bsplines_ : list of shape (n_features,)\n List of BSplines objects, one for each feature.\n\n n_features_in_ : int\n The total number of input features.\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n n_features_out_ : int\n The total number of output features, which is computed as\n `n_features * n_splines`, where `n_splines` is\n the number of bases elements of the B-splines,\n `n_knots + degree - 1` for non-periodic splines and\n `n_knots - 1` for periodic ones.\n If `include_bias=False`, then it is only\n `n_features * (n_splines - 1)`.\n\n See Also\n --------\n KBinsDiscretizer : Transformer that bins continuous data into intervals.\n\n PolynomialFeatures : Transformer that generates polynomial and interaction\n features.\n\n Notes\n -----\n High degrees and a high number of knots can cause overfitting.\n\n See :ref:`examples/linear_model/plot_polynomial_interpolation.py\n `.\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.preprocessing import SplineTransformer\n >>> X = np.arange(6).reshape(6, 1)\n >>> spline = SplineTransformer(degree=2, n_knots=3)\n >>> spline.fit_transform(X)\n array([[0.5 , 0.5 , 0. , 0. ],\n [0.18, 0.74, 0.08, 0. ],\n [0.02, 0.66, 0.32, 0. ],\n [0. , 0.32, 0.66, 0.02],\n [0. , 0.08, 0.74, 0.18],\n [0. , 0. , 0.5 , 0.5 ]])\n \"\"\"\n \n def __init__(self, n_knots=5, degree=3, *, knots='uniform', extrapolation='constant', include_bias=True, order='C'):\n self.n_knots = n_knots\n self.degree = degree\n self.knots = knots\n self.extrapolation = extrapolation\n self.include_bias = include_bias\n self.order = order\n \n @staticmethod\n def _get_base_knot_positions(X, n_knots=10, knots='uniform', sample_weight=None):\n \"\"\"Calculate base knot positions.\n\n Base knots such that first knot <= feature <= last knot. 
For the\n B-spline construction with scipy.interpolate.BSpline, 2*degree knots\n beyond the base interval are added.\n\n Returns\n -------\n knots : ndarray of shape (n_knots, n_features), dtype=np.float64\n Knot positions (points) of base interval.\n \"\"\"\n if knots == 'quantile':\n percentiles = 100 * np.linspace(start=0, stop=1, num=n_knots, dtype=np.float64)\n if sample_weight is None:\n knots = np.percentile(X, percentiles, axis=0)\n else:\n knots = np.array([_weighted_percentile(X, sample_weight, percentile) for percentile in percentiles])\n else:\n mask = slice(None, None, 1) if sample_weight is None else sample_weight > 0\n x_min = np.amin(X[mask], axis=0)\n x_max = np.amax(X[mask], axis=0)\n knots = linspace(start=x_min, stop=x_max, num=n_knots, endpoint=True, dtype=np.float64)\n return knots\n \n @deprecated('get_feature_names is deprecated in 1.0 and will be removed in 1.2. Please use get_feature_names_out instead.')\n def get_feature_names(self, input_features=None):\n \"\"\"Return feature names for output features.\n\n Parameters\n ----------\n input_features : list of str of shape (n_features,), default=None\n String names for input features if available. By default,\n \"x0\", \"x1\", ... \"xn_features\" is used.\n\n Returns\n -------\n output_feature_names : list of str of shape (n_output_features,)\n Transformed feature names.\n \"\"\"\n n_splines = self.bsplines_[0].c.shape[0]\n if input_features is None:\n input_features = ['x%d' % i for i in range(self.n_features_in_)]\n feature_names = []\n for i in range(self.n_features_in_):\n for j in range(n_splines - 1 + self.include_bias):\n feature_names.append(f'{input_features[i]}_sp_{j}')\n return feature_names\n \n def get_feature_names_out(self, input_features=None):\n \"\"\"Get output feature names for transformation.\n\n Parameters\n ----------\n input_features : array-like of str or None, default=None\n Input features.\n\n - If `input_features` is `None`, then `feature_names_in_` is\n used as feature names in. If `feature_names_in_` is not defined,\n then names are generated: `[x0, x1, ..., x(n_features_in_)]`.\n - If `input_features` is an array-like, then `input_features` must\n match `feature_names_in_` if `feature_names_in_` is defined.\n\n Returns\n -------\n feature_names_out : ndarray of str objects\n Transformed feature names.\n \"\"\"\n n_splines = self.bsplines_[0].c.shape[0]\n input_features = _check_feature_names_in(self, input_features)\n feature_names = []\n for i in range(self.n_features_in_):\n for j in range(n_splines - 1 + self.include_bias):\n feature_names.append(f'{input_features[i]}_sp_{j}')\n return np.asarray(feature_names, dtype=object)\n \n def fit(self, X, y=None, sample_weight=None):\n \"\"\"Compute knot positions of splines.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The data.\n\n y : None\n Ignored.\n\n sample_weight : array-like of shape (n_samples,), default = None\n Individual weights for each sample. Used to calculate quantiles if\n `knots=\"quantile\"`. 
For `knots=\"uniform\"`, zero weighted\n observations are ignored for finding the min and max of `X`.\n\n Returns\n -------\n self : object\n Fitted transformer.\n \"\"\"\n X = self._validate_data(X, reset=True, accept_sparse=False, ensure_min_samples=2, ensure_2d=True)\n if sample_weight is not None:\n sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype)\n (_, n_features) = X.shape\n if not (isinstance(self.degree, numbers.Integral) and self.degree >= 0):\n raise ValueError(f'degree must be a non-negative integer, got {self.degree}.')\n if isinstance(self.knots, str) and self.knots in ['uniform', 'quantile']:\n if not (isinstance(self.n_knots, numbers.Integral) and self.n_knots >= 2):\n raise ValueError(f'n_knots must be a positive integer >= 2, got: {self.n_knots}')\n base_knots = self._get_base_knot_positions(X, n_knots=self.n_knots, knots=self.knots, sample_weight=sample_weight)\n else:\n base_knots = check_array(self.knots, dtype=np.float64)\n if base_knots.shape[0] < 2:\n raise ValueError('Number of knots, knots.shape[0], must be >= 2.')\n elif base_knots.shape[1] != n_features:\n raise ValueError('knots.shape[1] == n_features is violated.')\n elif not np.all(np.diff(base_knots, axis=0) > 0):\n raise ValueError('knots must be sorted without duplicates.')\n if self.extrapolation not in ('error', 'constant', 'linear', 'continue', 'periodic'):\n raise ValueError(\"extrapolation must be one of 'error', 'constant', 'linear', 'continue' or 'periodic'.\")\n if not isinstance(self.include_bias, (bool, np.bool_)):\n raise ValueError('include_bias must be bool.')\n n_knots = base_knots.shape[0]\n if self.extrapolation == 'periodic' and n_knots <= self.degree:\n raise ValueError(f'Periodic splines require degree < n_knots. Got n_knots={n_knots} and degree={self.degree}.')\n if self.extrapolation != 'periodic':\n n_splines = n_knots + self.degree - 1\n else:\n n_splines = n_knots - 1\n degree = self.degree\n n_out = n_features * n_splines\n if self.extrapolation == 'periodic':\n period = base_knots[-1] - base_knots[0]\n knots = np.r_[base_knots[-(degree + 1):-1] - period, base_knots, base_knots[1:degree + 1] + period]\n else:\n dist_min = base_knots[1] - base_knots[0]\n dist_max = base_knots[-1] - base_knots[-2]\n knots = np.r_[linspace(base_knots[0] - degree * dist_min, base_knots[0] - dist_min, num=degree), base_knots, linspace(base_knots[-1] + dist_max, base_knots[-1] + degree * dist_max, num=degree)]\n coef = np.eye(n_splines, dtype=np.float64)\n if self.extrapolation == 'periodic':\n coef = np.concatenate((coef, coef[:degree, :]))\n extrapolate = self.extrapolation in ['periodic', 'continue']\n bsplines = [BSpline.construct_fast(knots[:, i], coef, self.degree, extrapolate=extrapolate) for i in range(n_features)]\n self.bsplines_ = bsplines\n self.n_features_out_ = n_out - n_features * (1 - self.include_bias)\n return self\n \n def transform(self, X):\n \"\"\"Transform each feature data to B-splines.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The data to transform.\n\n Returns\n -------\n XBS : ndarray of shape (n_samples, n_features * n_splines)\n The matrix of features, where n_splines is the number of bases\n elements of the B-splines, n_knots + degree - 1.\n \"\"\"\n check_is_fitted(self)\n X = self._validate_data(X, reset=False, accept_sparse=False, ensure_2d=True)\n (n_samples, n_features) = X.shape\n n_splines = self.bsplines_[0].c.shape[1]\n degree = self.degree\n n_out = self.n_features_out_ + n_features * (1 - 
self.include_bias)\n if X.dtype in FLOAT_DTYPES:\n dtype = X.dtype\n else:\n dtype = np.float64\n XBS = np.zeros((n_samples, n_out), dtype=dtype, order=self.order)\n for i in range(n_features):\n spl = self.bsplines_[i]\n if self.extrapolation in ('continue', 'error', 'periodic'):\n if self.extrapolation == 'periodic':\n n = spl.t.size - spl.k - 1\n x = spl.t[spl.k] + (X[:, i] - spl.t[spl.k]) % (spl.t[n] - spl.t[spl.k])\n else:\n x = X[:, i]\n XBS[:, i * n_splines:(i + 1) * n_splines] = spl(x)\n else:\n xmin = spl.t[degree]\n xmax = spl.t[-degree - 1]\n mask = (xmin <= X[:, i]) & (X[:, i] <= xmax)\n XBS[mask, i * n_splines:(i + 1) * n_splines] = spl(X[mask, i])\n if self.extrapolation == 'error':\n if np.any(np.isnan(XBS[:, i * n_splines:(i + 1) * n_splines])):\n raise ValueError('X contains values beyond the limits of the knots.')\n elif self.extrapolation == 'constant':\n f_min = spl(xmin)\n f_max = spl(xmax)\n mask = X[:, i] < xmin\n if np.any(mask):\n XBS[mask, i * n_splines:i * n_splines + degree] = f_min[:degree]\n mask = X[:, i] > xmax\n if np.any(mask):\n XBS[mask, (i + 1) * n_splines - degree:(i + 1) * n_splines] = f_max[-degree:]\n elif self.extrapolation == 'linear':\n (f_min, f_max) = (spl(xmin), spl(xmax))\n (fp_min, fp_max) = (spl(xmin, nu=1), spl(xmax, nu=1))\n if degree <= 1:\n degree += 1\n for j in range(degree):\n mask = X[:, i] < xmin\n if np.any(mask):\n XBS[mask, i * n_splines + j] = f_min[j] + (X[mask, i] - xmin) * fp_min[j]\n mask = X[:, i] > xmax\n if np.any(mask):\n k = n_splines - 1 - j\n XBS[mask, i * n_splines + k] = f_max[k] + (X[mask, i] - xmax) * fp_max[k]\n if self.include_bias:\n return XBS\n else:\n indices = [j for j in range(XBS.shape[1]) if (j + 1) % n_splines != 0]\n return XBS[:, indices]\n" }, @@ -26193,7 +26289,7 @@ "sklearn.random_projection.BaseRandomProjection.transform" ], "is_public": true, - "description": "Base class for random projections.\n\nWarning: This class should not be used directly. Use derived classes instead.", + "description": "Base class for random projections.\n\nWarning: This class should not be used directly.\nUse derived classes instead.", "docstring": "Base class for random projections.\n\n Warning: This class should not be used directly.\n Use derived classes instead.\n ", "source_code": "\n\nclass BaseRandomProjection(TransformerMixin, BaseEstimator, metaclass=ABCMeta):\n \"\"\"Base class for random projections.\n\n Warning: This class should not be used directly.\n Use derived classes instead.\n \"\"\"\n \n @abstractmethod\n def __init__(self, n_components='auto', *, eps=0.1, dense_output=False, random_state=None):\n self.n_components = n_components\n self.eps = eps\n self.dense_output = dense_output\n self.random_state = random_state\n \n @abstractmethod\n def _make_random_matrix(self, n_components, n_features):\n \"\"\"Generate the random projection matrix.\n\n Parameters\n ----------\n n_components : int,\n Dimensionality of the target projection space.\n\n n_features : int,\n Dimensionality of the original source space.\n\n Returns\n -------\n components : {ndarray, sparse matrix} of shape (n_components, n_features)\n The generated random matrix. 
Sparse matrix will be of CSR format.\n\n \"\"\"\n \n \n def fit(self, X, y=None):\n \"\"\"Generate a sparse random projection matrix.\n\n Parameters\n ----------\n X : {ndarray, sparse matrix} of shape (n_samples, n_features)\n Training set: only the shape is used to find optimal random\n matrix dimensions based on the theory referenced in the\n afore mentioned papers.\n\n y : Ignored\n Not used, present here for API consistency by convention.\n\n Returns\n -------\n self : object\n BaseRandomProjection class instance.\n \"\"\"\n X = self._validate_data(X, accept_sparse=['csr', 'csc'])\n (n_samples, n_features) = X.shape\n if self.n_components == 'auto':\n self.n_components_ = johnson_lindenstrauss_min_dim(n_samples=n_samples, eps=self.eps)\n if self.n_components_ <= 0:\n raise ValueError('eps=%f and n_samples=%d lead to a target dimension of %d which is invalid' % (self.eps, n_samples, self.n_components_))\n elif self.n_components_ > n_features:\n raise ValueError('eps=%f and n_samples=%d lead to a target dimension of %d which is larger than the original space with n_features=%d' % (self.eps, n_samples, self.n_components_, n_features))\n else:\n if self.n_components <= 0:\n raise ValueError('n_components must be greater than 0, got %s' % self.n_components)\n elif self.n_components > n_features:\n warnings.warn('The number of components is higher than the number of features: n_features < n_components (%s < %s).The dimensionality of the problem will not be reduced.' % (n_features, self.n_components), DataDimensionalityWarning)\n self.n_components_ = self.n_components\n self.components_ = self._make_random_matrix(self.n_components_, n_features)\n assert self.components_.shape == (self.n_components_, n_features), 'An error has occurred the self.components_ matrix has not the proper shape.'\n return self\n \n def transform(self, X):\n \"\"\"Project the data by using matrix product with the random matrix.\n\n Parameters\n ----------\n X : {ndarray, sparse matrix} of shape (n_samples, n_features)\n The input data to project into a smaller dimensional space.\n\n Returns\n -------\n X_new : {ndarray, sparse matrix} of shape (n_samples, n_components)\n Projected array.\n \"\"\"\n check_is_fitted(self)\n X = self._validate_data(X, accept_sparse=['csr', 'csc'], reset=False)\n if X.shape[1] != self.components_.shape[1]:\n raise ValueError('Impossible to perform projection:X at fit stage had a different number of features. (%s != %s)' % (X.shape[1], self.components_.shape[1]))\n X_new = safe_sparse_dot(X, self.components_.T, dense_output=self.dense_output)\n return X_new\n" }, @@ -26207,9 +26303,9 @@ "sklearn.random_projection.GaussianRandomProjection._make_random_matrix" ], "is_public": true, - "description": "Reduce dimensionality through Gaussian random projection.\n\nThe components of the random matrix are drawn from N(0, 1 / n_components). Read more in the :ref:`User Guide `. .. versionadded:: 0.13", - "docstring": "Reduce dimensionality through Gaussian random projection.\n\n The components of the random matrix are drawn from N(0, 1 / n_components).\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.13\n\n Parameters\n ----------\n n_components : int or 'auto', default='auto'\n Dimensionality of the target projection space.\n\n n_components can be automatically adjusted according to the\n number of samples in the dataset and the bound given by the\n Johnson-Lindenstrauss lemma. 
In that case the quality of the\n embedding is controlled by the ``eps`` parameter.\n\n It should be noted that Johnson-Lindenstrauss lemma can yield\n very conservative estimated of the required number of components\n as it makes no assumption on the structure of the dataset.\n\n eps : float, default=0.1\n Parameter to control the quality of the embedding according to\n the Johnson-Lindenstrauss lemma when `n_components` is set to\n 'auto'. The value should be strictly positive.\n\n Smaller values lead to better embedding and higher number of\n dimensions (n_components) in the target projection space.\n\n random_state : int, RandomState instance or None, default=None\n Controls the pseudo random number generator used to generate the\n projection matrix at fit time.\n Pass an int for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n Attributes\n ----------\n n_components_ : int\n Concrete number of components computed when n_components=\"auto\".\n\n components_ : ndarray of shape (n_components, n_features)\n Random matrix used for the projection.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n SparseRandomProjection : Reduce dimensionality through sparse\n random projection.\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.random_projection import GaussianRandomProjection\n >>> rng = np.random.RandomState(42)\n >>> X = rng.rand(100, 10000)\n >>> transformer = GaussianRandomProjection(random_state=rng)\n >>> X_new = transformer.fit_transform(X)\n >>> X_new.shape\n (100, 3947)\n ", - "source_code": "\n\nclass GaussianRandomProjection(BaseRandomProjection):\n \"\"\"Reduce dimensionality through Gaussian random projection.\n\n The components of the random matrix are drawn from N(0, 1 / n_components).\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.13\n\n Parameters\n ----------\n n_components : int or 'auto', default='auto'\n Dimensionality of the target projection space.\n\n n_components can be automatically adjusted according to the\n number of samples in the dataset and the bound given by the\n Johnson-Lindenstrauss lemma. In that case the quality of the\n embedding is controlled by the ``eps`` parameter.\n\n It should be noted that Johnson-Lindenstrauss lemma can yield\n very conservative estimated of the required number of components\n as it makes no assumption on the structure of the dataset.\n\n eps : float, default=0.1\n Parameter to control the quality of the embedding according to\n the Johnson-Lindenstrauss lemma when `n_components` is set to\n 'auto'. 
The value should be strictly positive.\n\n Smaller values lead to better embedding and higher number of\n dimensions (n_components) in the target projection space.\n\n random_state : int, RandomState instance or None, default=None\n Controls the pseudo random number generator used to generate the\n projection matrix at fit time.\n Pass an int for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n Attributes\n ----------\n n_components_ : int\n Concrete number of components computed when n_components=\"auto\".\n\n components_ : ndarray of shape (n_components, n_features)\n Random matrix used for the projection.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n SparseRandomProjection : Reduce dimensionality through sparse\n random projection.\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.random_projection import GaussianRandomProjection\n >>> rng = np.random.RandomState(42)\n >>> X = rng.rand(100, 10000)\n >>> transformer = GaussianRandomProjection(random_state=rng)\n >>> X_new = transformer.fit_transform(X)\n >>> X_new.shape\n (100, 3947)\n \"\"\"\n \n def __init__(self, n_components='auto', *, eps=0.1, random_state=None):\n super().__init__(n_components=n_components, eps=eps, dense_output=True, random_state=random_state)\n \n def _make_random_matrix(self, n_components, n_features):\n \"\"\" Generate the random projection matrix.\n\n Parameters\n ----------\n n_components : int,\n Dimensionality of the target projection space.\n\n n_features : int,\n Dimensionality of the original source space.\n\n Returns\n -------\n components : {ndarray, sparse matrix} of shape (n_components, n_features)\n The generated random matrix. Sparse matrix will be of CSR format.\n\n \"\"\"\n random_state = check_random_state(self.random_state)\n return _gaussian_random_matrix(n_components, n_features, random_state=random_state)\n" + "description": "Reduce dimensionality through Gaussian random projection.\n\nThe components of the random matrix are drawn from N(0, 1 / n_components).\n\nRead more in the :ref:`User Guide `.\n\n.. versionadded:: 0.13", + "docstring": "Reduce dimensionality through Gaussian random projection.\n\n The components of the random matrix are drawn from N(0, 1 / n_components).\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.13\n\n Parameters\n ----------\n n_components : int or 'auto', default='auto'\n Dimensionality of the target projection space.\n\n n_components can be automatically adjusted according to the\n number of samples in the dataset and the bound given by the\n Johnson-Lindenstrauss lemma. In that case the quality of the\n embedding is controlled by the ``eps`` parameter.\n\n It should be noted that Johnson-Lindenstrauss lemma can yield\n very conservative estimated of the required number of components\n as it makes no assumption on the structure of the dataset.\n\n eps : float, default=0.1\n Parameter to control the quality of the embedding according to\n the Johnson-Lindenstrauss lemma when `n_components` is set to\n 'auto'. 
The value should be strictly positive.\n\n Smaller values lead to better embedding and higher number of\n dimensions (n_components) in the target projection space.\n\n random_state : int, RandomState instance or None, default=None\n Controls the pseudo random number generator used to generate the\n projection matrix at fit time.\n Pass an int for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n Attributes\n ----------\n n_components_ : int\n Concrete number of components computed when n_components=\"auto\".\n\n components_ : ndarray of shape (n_components, n_features)\n Random matrix used for the projection.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n SparseRandomProjection : Reduce dimensionality through sparse\n random projection.\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.random_projection import GaussianRandomProjection\n >>> rng = np.random.RandomState(42)\n >>> X = rng.rand(25, 3000)\n >>> transformer = GaussianRandomProjection(random_state=rng)\n >>> X_new = transformer.fit_transform(X)\n >>> X_new.shape\n (25, 2759)\n ", + "source_code": "\n\nclass GaussianRandomProjection(BaseRandomProjection):\n \"\"\"Reduce dimensionality through Gaussian random projection.\n\n The components of the random matrix are drawn from N(0, 1 / n_components).\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.13\n\n Parameters\n ----------\n n_components : int or 'auto', default='auto'\n Dimensionality of the target projection space.\n\n n_components can be automatically adjusted according to the\n number of samples in the dataset and the bound given by the\n Johnson-Lindenstrauss lemma. In that case the quality of the\n embedding is controlled by the ``eps`` parameter.\n\n It should be noted that Johnson-Lindenstrauss lemma can yield\n very conservative estimated of the required number of components\n as it makes no assumption on the structure of the dataset.\n\n eps : float, default=0.1\n Parameter to control the quality of the embedding according to\n the Johnson-Lindenstrauss lemma when `n_components` is set to\n 'auto'. The value should be strictly positive.\n\n Smaller values lead to better embedding and higher number of\n dimensions (n_components) in the target projection space.\n\n random_state : int, RandomState instance or None, default=None\n Controls the pseudo random number generator used to generate the\n projection matrix at fit time.\n Pass an int for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n Attributes\n ----------\n n_components_ : int\n Concrete number of components computed when n_components=\"auto\".\n\n components_ : ndarray of shape (n_components, n_features)\n Random matrix used for the projection.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. 
versionadded:: 1.0\n\n See Also\n --------\n SparseRandomProjection : Reduce dimensionality through sparse\n random projection.\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.random_projection import GaussianRandomProjection\n >>> rng = np.random.RandomState(42)\n >>> X = rng.rand(25, 3000)\n >>> transformer = GaussianRandomProjection(random_state=rng)\n >>> X_new = transformer.fit_transform(X)\n >>> X_new.shape\n (25, 2759)\n \"\"\"\n \n def __init__(self, n_components='auto', *, eps=0.1, random_state=None):\n super().__init__(n_components=n_components, eps=eps, dense_output=True, random_state=random_state)\n \n def _make_random_matrix(self, n_components, n_features):\n \"\"\" Generate the random projection matrix.\n\n Parameters\n ----------\n n_components : int,\n Dimensionality of the target projection space.\n\n n_features : int,\n Dimensionality of the original source space.\n\n Returns\n -------\n components : {ndarray, sparse matrix} of shape (n_components, n_features)\n The generated random matrix. Sparse matrix will be of CSR format.\n\n \"\"\"\n random_state = check_random_state(self.random_state)\n return _gaussian_random_matrix(n_components, n_features, random_state=random_state)\n" }, { "name": "SparseRandomProjection", @@ -26221,9 +26317,9 @@ "sklearn.random_projection.SparseRandomProjection._make_random_matrix" ], "is_public": true, - "description": "Reduce dimensionality through sparse random projection.\n\nSparse random matrix is an alternative to dense random projection matrix that guarantees similar embedding quality while being much more memory efficient and allowing faster computation of the projected data. If we note `s = 1 / density` the components of the random matrix are drawn from: - -sqrt(s) / sqrt(n_components) with probability 1 / 2s - 0 with probability 1 - 1 / s - +sqrt(s) / sqrt(n_components) with probability 1 / 2s Read more in the :ref:`User Guide `. .. versionadded:: 0.13", - "docstring": "Reduce dimensionality through sparse random projection.\n\n Sparse random matrix is an alternative to dense random\n projection matrix that guarantees similar embedding quality while being\n much more memory efficient and allowing faster computation of the\n projected data.\n\n If we note `s = 1 / density` the components of the random matrix are\n drawn from:\n\n - -sqrt(s) / sqrt(n_components) with probability 1 / 2s\n - 0 with probability 1 - 1 / s\n - +sqrt(s) / sqrt(n_components) with probability 1 / 2s\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.13\n\n Parameters\n ----------\n n_components : int or 'auto', default='auto'\n Dimensionality of the target projection space.\n\n n_components can be automatically adjusted according to the\n number of samples in the dataset and the bound given by the\n Johnson-Lindenstrauss lemma. 
In that case the quality of the\n embedding is controlled by the ``eps`` parameter.\n\n It should be noted that Johnson-Lindenstrauss lemma can yield\n very conservative estimated of the required number of components\n as it makes no assumption on the structure of the dataset.\n\n density : float or 'auto', default='auto'\n Ratio in the range (0, 1] of non-zero component in the random\n projection matrix.\n\n If density = 'auto', the value is set to the minimum density\n as recommended by Ping Li et al.: 1 / sqrt(n_features).\n\n Use density = 1 / 3.0 if you want to reproduce the results from\n Achlioptas, 2001.\n\n eps : float, default=0.1\n Parameter to control the quality of the embedding according to\n the Johnson-Lindenstrauss lemma when n_components is set to\n 'auto'. This value should be strictly positive.\n\n Smaller values lead to better embedding and higher number of\n dimensions (n_components) in the target projection space.\n\n dense_output : bool, default=False\n If True, ensure that the output of the random projection is a\n dense numpy array even if the input and random projection matrix\n are both sparse. In practice, if the number of components is\n small the number of zero components in the projected data will\n be very small and it will be more CPU and memory efficient to\n use a dense representation.\n\n If False, the projected data uses a sparse representation if\n the input is sparse.\n\n random_state : int, RandomState instance or None, default=None\n Controls the pseudo random number generator used to generate the\n projection matrix at fit time.\n Pass an int for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n Attributes\n ----------\n n_components_ : int\n Concrete number of components computed when n_components=\"auto\".\n\n components_ : sparse matrix of shape (n_components, n_features)\n Random matrix used for the projection. Sparse matrix will be of CSR\n format.\n\n density_ : float in range 0.0 - 1.0\n Concrete density computed from when density = \"auto\".\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n GaussianRandomProjection : Reduce dimensionality through Gaussian\n random projection.\n\n References\n ----------\n\n .. [1] Ping Li, T. Hastie and K. W. Church, 2006,\n \"Very Sparse Random Projections\".\n https://web.stanford.edu/~hastie/Papers/Ping/KDD06_rp.pdf\n\n .. [2] D. 
Achlioptas, 2001, \"Database-friendly random projections\",\n https://users.soe.ucsc.edu/~optas/papers/jl.pdf\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.random_projection import SparseRandomProjection\n >>> rng = np.random.RandomState(42)\n >>> X = rng.rand(100, 10000)\n >>> transformer = SparseRandomProjection(random_state=rng)\n >>> X_new = transformer.fit_transform(X)\n >>> X_new.shape\n (100, 3947)\n >>> # very few components are non-zero\n >>> np.mean(transformer.components_ != 0)\n 0.0100...\n ", - "source_code": "\n\nclass SparseRandomProjection(BaseRandomProjection):\n \"\"\"Reduce dimensionality through sparse random projection.\n\n Sparse random matrix is an alternative to dense random\n projection matrix that guarantees similar embedding quality while being\n much more memory efficient and allowing faster computation of the\n projected data.\n\n If we note `s = 1 / density` the components of the random matrix are\n drawn from:\n\n - -sqrt(s) / sqrt(n_components) with probability 1 / 2s\n - 0 with probability 1 - 1 / s\n - +sqrt(s) / sqrt(n_components) with probability 1 / 2s\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.13\n\n Parameters\n ----------\n n_components : int or 'auto', default='auto'\n Dimensionality of the target projection space.\n\n n_components can be automatically adjusted according to the\n number of samples in the dataset and the bound given by the\n Johnson-Lindenstrauss lemma. In that case the quality of the\n embedding is controlled by the ``eps`` parameter.\n\n It should be noted that Johnson-Lindenstrauss lemma can yield\n very conservative estimated of the required number of components\n as it makes no assumption on the structure of the dataset.\n\n density : float or 'auto', default='auto'\n Ratio in the range (0, 1] of non-zero component in the random\n projection matrix.\n\n If density = 'auto', the value is set to the minimum density\n as recommended by Ping Li et al.: 1 / sqrt(n_features).\n\n Use density = 1 / 3.0 if you want to reproduce the results from\n Achlioptas, 2001.\n\n eps : float, default=0.1\n Parameter to control the quality of the embedding according to\n the Johnson-Lindenstrauss lemma when n_components is set to\n 'auto'. This value should be strictly positive.\n\n Smaller values lead to better embedding and higher number of\n dimensions (n_components) in the target projection space.\n\n dense_output : bool, default=False\n If True, ensure that the output of the random projection is a\n dense numpy array even if the input and random projection matrix\n are both sparse. In practice, if the number of components is\n small the number of zero components in the projected data will\n be very small and it will be more CPU and memory efficient to\n use a dense representation.\n\n If False, the projected data uses a sparse representation if\n the input is sparse.\n\n random_state : int, RandomState instance or None, default=None\n Controls the pseudo random number generator used to generate the\n projection matrix at fit time.\n Pass an int for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n Attributes\n ----------\n n_components_ : int\n Concrete number of components computed when n_components=\"auto\".\n\n components_ : sparse matrix of shape (n_components, n_features)\n Random matrix used for the projection. 
Sparse matrix will be of CSR\n format.\n\n density_ : float in range 0.0 - 1.0\n Concrete density computed from when density = \"auto\".\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n GaussianRandomProjection : Reduce dimensionality through Gaussian\n random projection.\n\n References\n ----------\n\n .. [1] Ping Li, T. Hastie and K. W. Church, 2006,\n \"Very Sparse Random Projections\".\n https://web.stanford.edu/~hastie/Papers/Ping/KDD06_rp.pdf\n\n .. [2] D. Achlioptas, 2001, \"Database-friendly random projections\",\n https://users.soe.ucsc.edu/~optas/papers/jl.pdf\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.random_projection import SparseRandomProjection\n >>> rng = np.random.RandomState(42)\n >>> X = rng.rand(100, 10000)\n >>> transformer = SparseRandomProjection(random_state=rng)\n >>> X_new = transformer.fit_transform(X)\n >>> X_new.shape\n (100, 3947)\n >>> # very few components are non-zero\n >>> np.mean(transformer.components_ != 0)\n 0.0100...\n \"\"\"\n \n def __init__(self, n_components='auto', *, density='auto', eps=0.1, dense_output=False, random_state=None):\n super().__init__(n_components=n_components, eps=eps, dense_output=dense_output, random_state=random_state)\n self.density = density\n \n def _make_random_matrix(self, n_components, n_features):\n \"\"\" Generate the random projection matrix\n\n Parameters\n ----------\n n_components : int\n Dimensionality of the target projection space.\n\n n_features : int\n Dimensionality of the original source space.\n\n Returns\n -------\n components : {ndarray, sparse matrix} of shape (n_components, n_features)\n The generated random matrix. Sparse matrix will be of CSR format.\n\n \"\"\"\n random_state = check_random_state(self.random_state)\n self.density_ = _check_density(self.density, n_features)\n return _sparse_random_matrix(n_components, n_features, density=self.density_, random_state=random_state)\n" + "description": "Reduce dimensionality through sparse random projection.\n\nSparse random matrix is an alternative to dense random\nprojection matrix that guarantees similar embedding quality while being\nmuch more memory efficient and allowing faster computation of the\nprojected data.\n\nIf we note `s = 1 / density` the components of the random matrix are\ndrawn from:\n\n - -sqrt(s) / sqrt(n_components) with probability 1 / 2s\n - 0 with probability 1 - 1 / s\n - +sqrt(s) / sqrt(n_components) with probability 1 / 2s\n\nRead more in the :ref:`User Guide `.\n\n.. versionadded:: 0.13", + "docstring": "Reduce dimensionality through sparse random projection.\n\n Sparse random matrix is an alternative to dense random\n projection matrix that guarantees similar embedding quality while being\n much more memory efficient and allowing faster computation of the\n projected data.\n\n If we note `s = 1 / density` the components of the random matrix are\n drawn from:\n\n - -sqrt(s) / sqrt(n_components) with probability 1 / 2s\n - 0 with probability 1 - 1 / s\n - +sqrt(s) / sqrt(n_components) with probability 1 / 2s\n\n Read more in the :ref:`User Guide `.\n\n .. 
versionadded:: 0.13\n\n Parameters\n ----------\n n_components : int or 'auto', default='auto'\n Dimensionality of the target projection space.\n\n n_components can be automatically adjusted according to the\n number of samples in the dataset and the bound given by the\n Johnson-Lindenstrauss lemma. In that case the quality of the\n embedding is controlled by the ``eps`` parameter.\n\n It should be noted that Johnson-Lindenstrauss lemma can yield\n very conservative estimated of the required number of components\n as it makes no assumption on the structure of the dataset.\n\n density : float or 'auto', default='auto'\n Ratio in the range (0, 1] of non-zero component in the random\n projection matrix.\n\n If density = 'auto', the value is set to the minimum density\n as recommended by Ping Li et al.: 1 / sqrt(n_features).\n\n Use density = 1 / 3.0 if you want to reproduce the results from\n Achlioptas, 2001.\n\n eps : float, default=0.1\n Parameter to control the quality of the embedding according to\n the Johnson-Lindenstrauss lemma when n_components is set to\n 'auto'. This value should be strictly positive.\n\n Smaller values lead to better embedding and higher number of\n dimensions (n_components) in the target projection space.\n\n dense_output : bool, default=False\n If True, ensure that the output of the random projection is a\n dense numpy array even if the input and random projection matrix\n are both sparse. In practice, if the number of components is\n small the number of zero components in the projected data will\n be very small and it will be more CPU and memory efficient to\n use a dense representation.\n\n If False, the projected data uses a sparse representation if\n the input is sparse.\n\n random_state : int, RandomState instance or None, default=None\n Controls the pseudo random number generator used to generate the\n projection matrix at fit time.\n Pass an int for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n Attributes\n ----------\n n_components_ : int\n Concrete number of components computed when n_components=\"auto\".\n\n components_ : sparse matrix of shape (n_components, n_features)\n Random matrix used for the projection. Sparse matrix will be of CSR\n format.\n\n density_ : float in range 0.0 - 1.0\n Concrete density computed from when density = \"auto\".\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n GaussianRandomProjection : Reduce dimensionality through Gaussian\n random projection.\n\n References\n ----------\n\n .. [1] Ping Li, T. Hastie and K. W. Church, 2006,\n \"Very Sparse Random Projections\".\n https://web.stanford.edu/~hastie/Papers/Ping/KDD06_rp.pdf\n\n .. [2] D. 
Achlioptas, 2001, \"Database-friendly random projections\",\n https://users.soe.ucsc.edu/~optas/papers/jl.pdf\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.random_projection import SparseRandomProjection\n >>> rng = np.random.RandomState(42)\n >>> X = rng.rand(25, 3000)\n >>> transformer = SparseRandomProjection(random_state=rng)\n >>> X_new = transformer.fit_transform(X)\n >>> X_new.shape\n (25, 2759)\n >>> # very few components are non-zero\n >>> np.mean(transformer.components_ != 0)\n 0.0182...\n ", + "source_code": "\n\nclass SparseRandomProjection(BaseRandomProjection):\n \"\"\"Reduce dimensionality through sparse random projection.\n\n Sparse random matrix is an alternative to dense random\n projection matrix that guarantees similar embedding quality while being\n much more memory efficient and allowing faster computation of the\n projected data.\n\n If we note `s = 1 / density` the components of the random matrix are\n drawn from:\n\n - -sqrt(s) / sqrt(n_components) with probability 1 / 2s\n - 0 with probability 1 - 1 / s\n - +sqrt(s) / sqrt(n_components) with probability 1 / 2s\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.13\n\n Parameters\n ----------\n n_components : int or 'auto', default='auto'\n Dimensionality of the target projection space.\n\n n_components can be automatically adjusted according to the\n number of samples in the dataset and the bound given by the\n Johnson-Lindenstrauss lemma. In that case the quality of the\n embedding is controlled by the ``eps`` parameter.\n\n It should be noted that Johnson-Lindenstrauss lemma can yield\n very conservative estimated of the required number of components\n as it makes no assumption on the structure of the dataset.\n\n density : float or 'auto', default='auto'\n Ratio in the range (0, 1] of non-zero component in the random\n projection matrix.\n\n If density = 'auto', the value is set to the minimum density\n as recommended by Ping Li et al.: 1 / sqrt(n_features).\n\n Use density = 1 / 3.0 if you want to reproduce the results from\n Achlioptas, 2001.\n\n eps : float, default=0.1\n Parameter to control the quality of the embedding according to\n the Johnson-Lindenstrauss lemma when n_components is set to\n 'auto'. This value should be strictly positive.\n\n Smaller values lead to better embedding and higher number of\n dimensions (n_components) in the target projection space.\n\n dense_output : bool, default=False\n If True, ensure that the output of the random projection is a\n dense numpy array even if the input and random projection matrix\n are both sparse. In practice, if the number of components is\n small the number of zero components in the projected data will\n be very small and it will be more CPU and memory efficient to\n use a dense representation.\n\n If False, the projected data uses a sparse representation if\n the input is sparse.\n\n random_state : int, RandomState instance or None, default=None\n Controls the pseudo random number generator used to generate the\n projection matrix at fit time.\n Pass an int for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n Attributes\n ----------\n n_components_ : int\n Concrete number of components computed when n_components=\"auto\".\n\n components_ : sparse matrix of shape (n_components, n_features)\n Random matrix used for the projection. 
Sparse matrix will be of CSR\n format.\n\n density_ : float in range 0.0 - 1.0\n Concrete density computed from when density = \"auto\".\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n See Also\n --------\n GaussianRandomProjection : Reduce dimensionality through Gaussian\n random projection.\n\n References\n ----------\n\n .. [1] Ping Li, T. Hastie and K. W. Church, 2006,\n \"Very Sparse Random Projections\".\n https://web.stanford.edu/~hastie/Papers/Ping/KDD06_rp.pdf\n\n .. [2] D. Achlioptas, 2001, \"Database-friendly random projections\",\n https://users.soe.ucsc.edu/~optas/papers/jl.pdf\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.random_projection import SparseRandomProjection\n >>> rng = np.random.RandomState(42)\n >>> X = rng.rand(25, 3000)\n >>> transformer = SparseRandomProjection(random_state=rng)\n >>> X_new = transformer.fit_transform(X)\n >>> X_new.shape\n (25, 2759)\n >>> # very few components are non-zero\n >>> np.mean(transformer.components_ != 0)\n 0.0182...\n \"\"\"\n \n def __init__(self, n_components='auto', *, density='auto', eps=0.1, dense_output=False, random_state=None):\n super().__init__(n_components=n_components, eps=eps, dense_output=dense_output, random_state=random_state)\n self.density = density\n \n def _make_random_matrix(self, n_components, n_features):\n \"\"\" Generate the random projection matrix\n\n Parameters\n ----------\n n_components : int\n Dimensionality of the target projection space.\n\n n_features : int\n Dimensionality of the original source space.\n\n Returns\n -------\n components : {ndarray, sparse matrix} of shape (n_components, n_features)\n The generated random matrix. Sparse matrix will be of CSR format.\n\n \"\"\"\n random_state = check_random_state(self.random_state)\n self.density_ = _check_density(self.density, n_features)\n return _sparse_random_matrix(n_components, n_features, density=self.density_, random_state=random_state)\n" }, { "name": "BaseLabelPropagation", @@ -26268,7 +26364,7 @@ "sklearn.semi_supervised._label_propagation.LabelSpreading._build_graph" ], "is_public": true, - "description": "LabelSpreading model for semi-supervised learning.\n\nThis model is similar to the basic Label Propagation algorithm, but uses affinity matrix based on the normalized graph Laplacian and soft clamping across the labels. Read more in the :ref:`User Guide `.", + "description": "LabelSpreading model for semi-supervised learning.\n\nThis model is similar to the basic Label Propagation algorithm,\nbut uses affinity matrix based on the normalized graph Laplacian\nand soft clamping across the labels.\n\nRead more in the :ref:`User Guide `.", "docstring": "LabelSpreading model for semi-supervised learning.\n\n This model is similar to the basic Label Propagation algorithm,\n but uses affinity matrix based on the normalized graph Laplacian\n and soft clamping across the labels.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n kernel : {'knn', 'rbf'} or callable, default='rbf'\n String identifier for kernel function to use or the kernel function\n itself. Only 'rbf' and 'knn' strings are valid inputs. 
The function\n passed should take two inputs, each of shape (n_samples, n_features),\n and return a (n_samples, n_samples) shaped weight matrix.\n\n gamma : float, default=20\n Parameter for rbf kernel.\n\n n_neighbors : int, default=7\n Parameter for knn kernel which is a strictly positive integer.\n\n alpha : float, default=0.2\n Clamping factor. A value in (0, 1) that specifies the relative amount\n that an instance should adopt the information from its neighbors as\n opposed to its initial label.\n alpha=0 means keeping the initial label information; alpha=1 means\n replacing all initial information.\n\n max_iter : int, default=30\n Maximum number of iterations allowed.\n\n tol : float, default=1e-3\n Convergence tolerance: threshold to consider the system at steady\n state.\n\n n_jobs : int, default=None\n The number of parallel jobs to run.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n Attributes\n ----------\n X_ : ndarray of shape (n_samples, n_features)\n Input array.\n\n classes_ : ndarray of shape (n_classes,)\n The distinct labels used in classifying instances.\n\n label_distributions_ : ndarray of shape (n_samples, n_classes)\n Categorical distribution for each item.\n\n transduction_ : ndarray of shape (n_samples,)\n Label assigned to each item via the transduction.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n n_iter_ : int\n Number of iterations run.\n\n See Also\n --------\n LabelPropagation : Unregularized graph based semi-supervised learning.\n\n References\n ----------\n Dengyong Zhou, Olivier Bousquet, Thomas Navin Lal, Jason Weston,\n Bernhard Schoelkopf. Learning with local and global consistency (2004)\n http://citeseer.ist.psu.edu/viewdoc/summary?doi=10.1.1.115.3219\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn import datasets\n >>> from sklearn.semi_supervised import LabelSpreading\n >>> label_prop_model = LabelSpreading()\n >>> iris = datasets.load_iris()\n >>> rng = np.random.RandomState(42)\n >>> random_unlabeled_points = rng.rand(len(iris.target)) < 0.3\n >>> labels = np.copy(iris.target)\n >>> labels[random_unlabeled_points] = -1\n >>> label_prop_model.fit(iris.data, labels)\n LabelSpreading(...)\n ", "source_code": "\n\nclass LabelSpreading(BaseLabelPropagation):\n \"\"\"LabelSpreading model for semi-supervised learning.\n\n This model is similar to the basic Label Propagation algorithm,\n but uses affinity matrix based on the normalized graph Laplacian\n and soft clamping across the labels.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n kernel : {'knn', 'rbf'} or callable, default='rbf'\n String identifier for kernel function to use or the kernel function\n itself. Only 'rbf' and 'knn' strings are valid inputs. The function\n passed should take two inputs, each of shape (n_samples, n_features),\n and return a (n_samples, n_samples) shaped weight matrix.\n\n gamma : float, default=20\n Parameter for rbf kernel.\n\n n_neighbors : int, default=7\n Parameter for knn kernel which is a strictly positive integer.\n\n alpha : float, default=0.2\n Clamping factor. 
A value in (0, 1) that specifies the relative amount\n that an instance should adopt the information from its neighbors as\n opposed to its initial label.\n alpha=0 means keeping the initial label information; alpha=1 means\n replacing all initial information.\n\n max_iter : int, default=30\n Maximum number of iterations allowed.\n\n tol : float, default=1e-3\n Convergence tolerance: threshold to consider the system at steady\n state.\n\n n_jobs : int, default=None\n The number of parallel jobs to run.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n Attributes\n ----------\n X_ : ndarray of shape (n_samples, n_features)\n Input array.\n\n classes_ : ndarray of shape (n_classes,)\n The distinct labels used in classifying instances.\n\n label_distributions_ : ndarray of shape (n_samples, n_classes)\n Categorical distribution for each item.\n\n transduction_ : ndarray of shape (n_samples,)\n Label assigned to each item via the transduction.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n n_iter_ : int\n Number of iterations run.\n\n See Also\n --------\n LabelPropagation : Unregularized graph based semi-supervised learning.\n\n References\n ----------\n Dengyong Zhou, Olivier Bousquet, Thomas Navin Lal, Jason Weston,\n Bernhard Schoelkopf. Learning with local and global consistency (2004)\n http://citeseer.ist.psu.edu/viewdoc/summary?doi=10.1.1.115.3219\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn import datasets\n >>> from sklearn.semi_supervised import LabelSpreading\n >>> label_prop_model = LabelSpreading()\n >>> iris = datasets.load_iris()\n >>> rng = np.random.RandomState(42)\n >>> random_unlabeled_points = rng.rand(len(iris.target)) < 0.3\n >>> labels = np.copy(iris.target)\n >>> labels[random_unlabeled_points] = -1\n >>> label_prop_model.fit(iris.data, labels)\n LabelSpreading(...)\n \"\"\"\n _variant = 'spreading'\n \n def __init__(self, kernel='rbf', *, gamma=20, n_neighbors=7, alpha=0.2, max_iter=30, tol=0.001, n_jobs=None):\n super().__init__(kernel=kernel, gamma=gamma, n_neighbors=n_neighbors, alpha=alpha, max_iter=max_iter, tol=tol, n_jobs=n_jobs)\n \n def _build_graph(self):\n \"\"\"Graph matrix for Label Spreading computes the graph laplacian\"\"\"\n if self.kernel == 'knn':\n self.nn_fit = None\n n_samples = self.X_.shape[0]\n affinity_matrix = self._get_kernel(self.X_)\n laplacian = csgraph.laplacian(affinity_matrix, normed=True)\n laplacian = -laplacian\n if sparse.isspmatrix(laplacian):\n diag_mask = laplacian.row == laplacian.col\n laplacian.data[diag_mask] = 0.0\n else:\n laplacian.flat[::n_samples + 1] = 0.0\n return laplacian\n" }, @@ -26287,7 +26383,7 @@ "sklearn.semi_supervised._self_training.SelfTrainingClassifier.score" ], "is_public": true, - "description": "Self-training classifier.\n\nThis class allows a given supervised classifier to function as a semi-supervised classifier, allowing it to learn from unlabeled data. It does this by iteratively predicting pseudo-labels for the unlabeled data and adding them to the training set. The classifier will continue iterating until either max_iter is reached, or no pseudo-labels were added to the training set in the previous iteration. 
Read more in the :ref:`User Guide `.", + "description": "Self-training classifier.\n\nThis class allows a given supervised classifier to function as a\nsemi-supervised classifier, allowing it to learn from unlabeled data. It\ndoes this by iteratively predicting pseudo-labels for the unlabeled data\nand adding them to the training set.\n\nThe classifier will continue iterating until either max_iter is reached, or\nno pseudo-labels were added to the training set in the previous iteration.\n\nRead more in the :ref:`User Guide `.", "docstring": "Self-training classifier.\n\n This class allows a given supervised classifier to function as a\n semi-supervised classifier, allowing it to learn from unlabeled data. It\n does this by iteratively predicting pseudo-labels for the unlabeled data\n and adding them to the training set.\n\n The classifier will continue iterating until either max_iter is reached, or\n no pseudo-labels were added to the training set in the previous iteration.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n base_estimator : estimator object\n An estimator object implementing `fit` and `predict_proba`.\n Invoking the `fit` method will fit a clone of the passed estimator,\n which will be stored in the `base_estimator_` attribute.\n\n threshold : float, default=0.75\n The decision threshold for use with `criterion='threshold'`.\n Should be in [0, 1). When using the `'threshold'` criterion, a\n :ref:`well calibrated classifier ` should be used.\n\n criterion : {'threshold', 'k_best'}, default='threshold'\n The selection criterion used to select which labels to add to the\n training set. If `'threshold'`, pseudo-labels with prediction\n probabilities above `threshold` are added to the dataset. If `'k_best'`,\n the `k_best` pseudo-labels with highest prediction probabilities are\n added to the dataset. When using the 'threshold' criterion, a\n :ref:`well calibrated classifier ` should be used.\n\n k_best : int, default=10\n The amount of samples to add in each iteration. Only used when\n `criterion='k_best'`.\n\n max_iter : int or None, default=10\n Maximum number of iterations allowed. Should be greater than or equal\n to 0. If it is `None`, the classifier will continue to predict labels\n until no new pseudo-labels are added, or all unlabeled samples have\n been labeled.\n\n verbose : bool, default=False\n Enable verbose output.\n\n Attributes\n ----------\n base_estimator_ : estimator object\n The fitted estimator.\n\n classes_ : ndarray or list of ndarray of shape (n_classes,)\n Class labels for each output. (Taken from the trained\n `base_estimator_`).\n\n transduction_ : ndarray of shape (n_samples,)\n The labels used for the final fit of the classifier, including\n pseudo-labels added during fit.\n\n labeled_iter_ : ndarray of shape (n_samples,)\n The iteration in which each sample was labeled. When a sample has\n iteration 0, the sample was already labeled in the original dataset.\n When a sample has iteration -1, the sample was not labeled in any\n iteration.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. 
versionadded:: 1.0\n\n n_iter_ : int\n The number of rounds of self-training, that is the number of times the\n base estimator is fitted on relabeled variants of the training set.\n\n termination_condition_ : {'max_iter', 'no_change', 'all_labeled'}\n The reason that fitting was stopped.\n\n - `'max_iter'`: `n_iter_` reached `max_iter`.\n - `'no_change'`: no new labels were predicted.\n - `'all_labeled'`: all unlabeled samples were labeled before `max_iter`\n was reached.\n\n See Also\n --------\n LabelPropagation : Label propagation classifier.\n LabelSpreading : Label spreading model for semi-supervised learning.\n\n References\n ----------\n David Yarowsky. 1995. Unsupervised word sense disambiguation rivaling\n supervised methods. In Proceedings of the 33rd annual meeting on\n Association for Computational Linguistics (ACL '95). Association for\n Computational Linguistics, Stroudsburg, PA, USA, 189-196. DOI:\n https://doi.org/10.3115/981658.981684\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn import datasets\n >>> from sklearn.semi_supervised import SelfTrainingClassifier\n >>> from sklearn.svm import SVC\n >>> rng = np.random.RandomState(42)\n >>> iris = datasets.load_iris()\n >>> random_unlabeled_points = rng.rand(iris.target.shape[0]) < 0.3\n >>> iris.target[random_unlabeled_points] = -1\n >>> svc = SVC(probability=True, gamma=\"auto\")\n >>> self_training_model = SelfTrainingClassifier(svc)\n >>> self_training_model.fit(iris.data, iris.target)\n SelfTrainingClassifier(...)\n ", "source_code": "\n\nclass SelfTrainingClassifier(MetaEstimatorMixin, BaseEstimator):\n \"\"\"Self-training classifier.\n\n This class allows a given supervised classifier to function as a\n semi-supervised classifier, allowing it to learn from unlabeled data. It\n does this by iteratively predicting pseudo-labels for the unlabeled data\n and adding them to the training set.\n\n The classifier will continue iterating until either max_iter is reached, or\n no pseudo-labels were added to the training set in the previous iteration.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n base_estimator : estimator object\n An estimator object implementing `fit` and `predict_proba`.\n Invoking the `fit` method will fit a clone of the passed estimator,\n which will be stored in the `base_estimator_` attribute.\n\n threshold : float, default=0.75\n The decision threshold for use with `criterion='threshold'`.\n Should be in [0, 1). When using the `'threshold'` criterion, a\n :ref:`well calibrated classifier ` should be used.\n\n criterion : {'threshold', 'k_best'}, default='threshold'\n The selection criterion used to select which labels to add to the\n training set. If `'threshold'`, pseudo-labels with prediction\n probabilities above `threshold` are added to the dataset. If `'k_best'`,\n the `k_best` pseudo-labels with highest prediction probabilities are\n added to the dataset. When using the 'threshold' criterion, a\n :ref:`well calibrated classifier ` should be used.\n\n k_best : int, default=10\n The amount of samples to add in each iteration. Only used when\n `criterion='k_best'`.\n\n max_iter : int or None, default=10\n Maximum number of iterations allowed. Should be greater than or equal\n to 0. 
If it is `None`, the classifier will continue to predict labels\n until no new pseudo-labels are added, or all unlabeled samples have\n been labeled.\n\n verbose : bool, default=False\n Enable verbose output.\n\n Attributes\n ----------\n base_estimator_ : estimator object\n The fitted estimator.\n\n classes_ : ndarray or list of ndarray of shape (n_classes,)\n Class labels for each output. (Taken from the trained\n `base_estimator_`).\n\n transduction_ : ndarray of shape (n_samples,)\n The labels used for the final fit of the classifier, including\n pseudo-labels added during fit.\n\n labeled_iter_ : ndarray of shape (n_samples,)\n The iteration in which each sample was labeled. When a sample has\n iteration 0, the sample was already labeled in the original dataset.\n When a sample has iteration -1, the sample was not labeled in any\n iteration.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n n_iter_ : int\n The number of rounds of self-training, that is the number of times the\n base estimator is fitted on relabeled variants of the training set.\n\n termination_condition_ : {'max_iter', 'no_change', 'all_labeled'}\n The reason that fitting was stopped.\n\n - `'max_iter'`: `n_iter_` reached `max_iter`.\n - `'no_change'`: no new labels were predicted.\n - `'all_labeled'`: all unlabeled samples were labeled before `max_iter`\n was reached.\n\n See Also\n --------\n LabelPropagation : Label propagation classifier.\n LabelSpreading : Label spreading model for semi-supervised learning.\n\n References\n ----------\n David Yarowsky. 1995. Unsupervised word sense disambiguation rivaling\n supervised methods. In Proceedings of the 33rd annual meeting on\n Association for Computational Linguistics (ACL '95). Association for\n Computational Linguistics, Stroudsburg, PA, USA, 189-196. DOI:\n https://doi.org/10.3115/981658.981684\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn import datasets\n >>> from sklearn.semi_supervised import SelfTrainingClassifier\n >>> from sklearn.svm import SVC\n >>> rng = np.random.RandomState(42)\n >>> iris = datasets.load_iris()\n >>> random_unlabeled_points = rng.rand(iris.target.shape[0]) < 0.3\n >>> iris.target[random_unlabeled_points] = -1\n >>> svc = SVC(probability=True, gamma=\"auto\")\n >>> self_training_model = SelfTrainingClassifier(svc)\n >>> self_training_model.fit(iris.data, iris.target)\n SelfTrainingClassifier(...)\n \"\"\"\n _estimator_type = 'classifier'\n \n def __init__(self, base_estimator, threshold=0.75, criterion='threshold', k_best=10, max_iter=10, verbose=False):\n self.base_estimator = base_estimator\n self.threshold = threshold\n self.criterion = criterion\n self.k_best = k_best\n self.max_iter = max_iter\n self.verbose = verbose\n \n def fit(self, X, y):\n \"\"\"\n Fit self-training classifier using `X`, `y` as training data.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Array representing the data.\n\n y : {array-like, sparse matrix} of shape (n_samples,)\n Array representing the labels. 
Unlabeled samples should have the\n label -1.\n\n Returns\n -------\n self : object\n Fitted estimator.\n \"\"\"\n (X, y) = self._validate_data(X, y, accept_sparse=['csr', 'csc', 'lil', 'dok'], force_all_finite=False)\n if self.base_estimator is None:\n raise ValueError('base_estimator cannot be None!')\n self.base_estimator_ = clone(self.base_estimator)\n if self.max_iter is not None and self.max_iter < 0:\n raise ValueError(f'max_iter must be >= 0 or None, got {self.max_iter}')\n if not 0 <= self.threshold < 1:\n raise ValueError(f'threshold must be in [0,1), got {self.threshold}')\n if self.criterion not in ['threshold', 'k_best']:\n raise ValueError(f\"criterion must be either 'threshold' or 'k_best', got {self.criterion}.\")\n if y.dtype.kind in ['U', 'S']:\n raise ValueError('y has dtype string. If you wish to predict on string targets, use dtype object, and use -1 as the label for unlabeled samples.')\n has_label = y != -1\n if np.all(has_label):\n warnings.warn('y contains no unlabeled samples', UserWarning)\n if self.criterion == 'k_best' and self.k_best > X.shape[0] - np.sum(has_label):\n warnings.warn('k_best is larger than the amount of unlabeled samples. All unlabeled samples will be labeled in the first iteration', UserWarning)\n self.transduction_ = np.copy(y)\n self.labeled_iter_ = np.full_like(y, -1)\n self.labeled_iter_[has_label] = 0\n self.n_iter_ = 0\n while not np.all(has_label) and (self.max_iter is None or self.n_iter_ < self.max_iter):\n self.n_iter_ += 1\n self.base_estimator_.fit(X[safe_mask(X, has_label)], self.transduction_[has_label])\n _validate_estimator(self.base_estimator_)\n prob = self.base_estimator_.predict_proba(X[safe_mask(X, ~has_label)])\n pred = self.base_estimator_.classes_[np.argmax(prob, axis=1)]\n max_proba = np.max(prob, axis=1)\n if self.criterion == 'threshold':\n selected = max_proba > self.threshold\n else:\n n_to_select = min(self.k_best, max_proba.shape[0])\n if n_to_select == max_proba.shape[0]:\n selected = np.ones_like(max_proba, dtype=bool)\n else:\n selected = np.argpartition(-max_proba, n_to_select)[:n_to_select]\n selected_full = np.nonzero(~has_label)[0][selected]\n self.transduction_[selected_full] = pred[selected]\n has_label[selected_full] = True\n self.labeled_iter_[selected_full] = self.n_iter_\n if selected_full.shape[0] == 0:\n self.termination_condition_ = 'no_change'\n break\n if self.verbose:\n print(f'End of iteration {self.n_iter_}, added {selected_full.shape[0]} new labels.')\n if self.n_iter_ == self.max_iter:\n self.termination_condition_ = 'max_iter'\n if np.all(has_label):\n self.termination_condition_ = 'all_labeled'\n self.base_estimator_.fit(X[safe_mask(X, has_label)], self.transduction_[has_label])\n self.classes_ = self.base_estimator_.classes_\n return self\n \n @if_delegate_has_method(delegate='base_estimator')\n def predict(self, X):\n \"\"\"Predict the classes of `X`.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Array representing the data.\n\n Returns\n -------\n y : ndarray of shape (n_samples,)\n Array with predicted labels.\n \"\"\"\n check_is_fitted(self)\n X = self._validate_data(X, accept_sparse=True, force_all_finite=False, reset=False)\n return self.base_estimator_.predict(X)\n \n def predict_proba(self, X):\n \"\"\"Predict probability for each possible outcome.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Array representing the data.\n\n Returns\n -------\n y : ndarray of shape (n_samples, 
n_features)\n Array with prediction probabilities.\n \"\"\"\n check_is_fitted(self)\n X = self._validate_data(X, accept_sparse=True, force_all_finite=False, reset=False)\n return self.base_estimator_.predict_proba(X)\n \n @if_delegate_has_method(delegate='base_estimator')\n def decision_function(self, X):\n \"\"\"Call decision function of the `base_estimator`.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Array representing the data.\n\n Returns\n -------\n y : ndarray of shape (n_samples, n_features)\n Result of the decision function of the `base_estimator`.\n \"\"\"\n check_is_fitted(self)\n X = self._validate_data(X, accept_sparse=True, force_all_finite=False, reset=False)\n return self.base_estimator_.decision_function(X)\n \n @if_delegate_has_method(delegate='base_estimator')\n def predict_log_proba(self, X):\n \"\"\"Predict log probability for each possible outcome.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Array representing the data.\n\n Returns\n -------\n y : ndarray of shape (n_samples, n_features)\n Array with log prediction probabilities.\n \"\"\"\n check_is_fitted(self)\n X = self._validate_data(X, accept_sparse=True, force_all_finite=False, reset=False)\n return self.base_estimator_.predict_log_proba(X)\n \n @if_delegate_has_method(delegate='base_estimator')\n def score(self, X, y):\n \"\"\"Call score on the `base_estimator`.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Array representing the data.\n\n y : array-like of shape (n_samples,)\n Array representing the labels.\n\n Returns\n -------\n score : float\n Result of calling score on the `base_estimator`.\n \"\"\"\n check_is_fitted(self)\n X = self._validate_data(X, accept_sparse=True, force_all_finite=False, reset=False)\n return self.base_estimator_.score(X, y)\n" }, @@ -26318,7 +26414,7 @@ "sklearn.svm._base.BaseLibSVM.n_support_@getter" ], "is_public": false, - "description": "Base class for estimators that use libsvm as backing library.\n\nThis implements support vector machine classification and regression. Parameter documentation is in the derived `SVC` class.", + "description": "Base class for estimators that use libsvm as backing library.\n\nThis implements support vector machine classification and regression.\n\nParameter documentation is in the derived `SVC` class.", "docstring": "Base class for estimators that use libsvm as backing library.\n\n This implements support vector machine classification and regression.\n\n Parameter documentation is in the derived `SVC` class.\n ", "source_code": "\n\nclass BaseLibSVM(BaseEstimator, metaclass=ABCMeta):\n \"\"\"Base class for estimators that use libsvm as backing library.\n\n This implements support vector machine classification and regression.\n\n Parameter documentation is in the derived `SVC` class.\n \"\"\"\n _sparse_kernels = ['linear', 'poly', 'rbf', 'sigmoid', 'precomputed']\n \n @abstractmethod\n def __init__(self, kernel, degree, gamma, coef0, tol, C, nu, epsilon, shrinking, probability, cache_size, class_weight, verbose, max_iter, random_state):\n if self._impl not in LIBSVM_IMPL:\n raise ValueError('impl should be one of %s, %s was given' % (LIBSVM_IMPL, self._impl))\n if gamma == 0:\n msg = \"The gamma value of 0.0 is invalid. 
Use 'auto' to set gamma to a value of 1 / n_features.\"\n raise ValueError(msg)\n self.kernel = kernel\n self.degree = degree\n self.gamma = gamma\n self.coef0 = coef0\n self.tol = tol\n self.C = C\n self.nu = nu\n self.epsilon = epsilon\n self.shrinking = shrinking\n self.probability = probability\n self.cache_size = cache_size\n self.class_weight = class_weight\n self.verbose = verbose\n self.max_iter = max_iter\n self.random_state = random_state\n \n def _more_tags(self):\n return {'pairwise': self.kernel == 'precomputed'}\n \n @deprecated('Attribute `_pairwise` was deprecated in version 0.24 and will be removed in 1.1 (renaming of 0.26).')\n @property\n def _pairwise(self):\n return self.kernel == 'precomputed'\n \n def fit(self, X, y, sample_weight=None):\n \"\"\"Fit the SVM model according to the given training data.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features) or (n_samples, n_samples)\n Training vectors, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n For kernel=\"precomputed\", the expected shape of X is\n (n_samples, n_samples).\n\n y : array-like of shape (n_samples,)\n Target values (class labels in classification, real numbers in\n regression).\n\n sample_weight : array-like of shape (n_samples,), default=None\n Per-sample weights. Rescale C per sample. Higher weights\n force the classifier to put more emphasis on these points.\n\n Returns\n -------\n self : object\n Fitted estimator.\n\n Notes\n -----\n If X and y are not C-ordered and contiguous arrays of np.float64 and\n X is not a scipy.sparse.csr_matrix, X and/or y may be copied.\n\n If X is a dense array, then the other methods will not support sparse\n matrices as input.\n \"\"\"\n rnd = check_random_state(self.random_state)\n sparse = sp.isspmatrix(X)\n if sparse and self.kernel == 'precomputed':\n raise TypeError('Sparse precomputed kernels are not supported.')\n self._sparse = sparse and not callable(self.kernel)\n if hasattr(self, 'decision_function_shape'):\n if self.decision_function_shape not in ('ovr', 'ovo'):\n raise ValueError(f\"decision_function_shape must be either 'ovr' or 'ovo', got {self.decision_function_shape}.\")\n if callable(self.kernel):\n check_consistent_length(X, y)\n else:\n (X, y) = self._validate_data(X, y, dtype=np.float64, order='C', accept_sparse='csr', accept_large_sparse=False)\n y = self._validate_targets(y)\n sample_weight = np.asarray([] if sample_weight is None else sample_weight, dtype=np.float64)\n solver_type = LIBSVM_IMPL.index(self._impl)\n n_samples = _num_samples(X)\n if solver_type != 2 and n_samples != y.shape[0]:\n raise ValueError('X and y have incompatible shapes.\\n' + 'X has %s samples, but y has %s.' % (n_samples, y.shape[0]))\n if self.kernel == 'precomputed' and n_samples != X.shape[1]:\n raise ValueError('Precomputed matrix must be a square matrix. Input is a {}x{} matrix.'.format(X.shape[0], X.shape[1]))\n if sample_weight.shape[0] > 0 and sample_weight.shape[0] != n_samples:\n raise ValueError('sample_weight and X have incompatible shapes: %r vs %r\\nNote: Sparse matrices cannot be indexed w/boolean masks (use `indices=True` in CV).' 
% (sample_weight.shape, X.shape))\n kernel = 'precomputed' if callable(self.kernel) else self.kernel\n if kernel == 'precomputed':\n self._gamma = 0.0\n elif isinstance(self.gamma, str):\n if self.gamma == 'scale':\n X_var = X.multiply(X).mean() - X.mean()**2 if sparse else X.var()\n self._gamma = 1.0 / (X.shape[1] * X_var) if X_var != 0 else 1.0\n elif self.gamma == 'auto':\n self._gamma = 1.0 / X.shape[1]\n else:\n raise ValueError(\"When 'gamma' is a string, it should be either 'scale' or 'auto'. Got '{}' instead.\".format(self.gamma))\n else:\n self._gamma = self.gamma\n fit = self._sparse_fit if self._sparse else self._dense_fit\n if self.verbose:\n print('[LibSVM]', end='')\n seed = rnd.randint(np.iinfo('i').max)\n fit(X, y, sample_weight, solver_type, kernel, random_seed=seed)\n self.shape_fit_ = X.shape if hasattr(X, 'shape') else (n_samples, )\n self._intercept_ = self.intercept_.copy()\n self._dual_coef_ = self.dual_coef_\n if self._impl in ['c_svc', 'nu_svc'] and len(self.classes_) == 2:\n self.intercept_ *= -1\n self.dual_coef_ = -self.dual_coef_\n return self\n \n def _validate_targets(self, y):\n \"\"\"Validation of y and class_weight.\n\n Default implementation for SVR and one-class; overridden in BaseSVC.\n \"\"\"\n self.class_weight_ = np.empty(0)\n return column_or_1d(y, warn=True).astype(np.float64, copy=False)\n \n def _warn_from_fit_status(self):\n assert self.fit_status_ in (0, 1)\n if self.fit_status_ == 1:\n warnings.warn('Solver terminated early (max_iter=%i). Consider pre-processing your data with StandardScaler or MinMaxScaler.' % self.max_iter, ConvergenceWarning)\n \n def _dense_fit(self, X, y, sample_weight, solver_type, kernel, random_seed):\n if callable(self.kernel):\n self.__Xfit = X\n X = self._compute_kernel(X)\n if X.shape[0] != X.shape[1]:\n raise ValueError('X.shape[0] should be equal to X.shape[1]')\n libsvm.set_verbosity_wrap(self.verbose)\n (self.support_, self.support_vectors_, self._n_support, self.dual_coef_, self.intercept_, self._probA, self._probB, self.fit_status_) = libsvm.fit(X, y, svm_type=solver_type, sample_weight=sample_weight, class_weight=self.class_weight_, kernel=kernel, C=self.C, nu=self.nu, probability=self.probability, degree=self.degree, shrinking=self.shrinking, tol=self.tol, cache_size=self.cache_size, coef0=self.coef0, gamma=self._gamma, epsilon=self.epsilon, max_iter=self.max_iter, random_seed=random_seed)\n self._warn_from_fit_status()\n \n def _sparse_fit(self, X, y, sample_weight, solver_type, kernel, random_seed):\n X.data = np.asarray(X.data, dtype=np.float64, order='C')\n X.sort_indices()\n kernel_type = self._sparse_kernels.index(kernel)\n libsvm_sparse.set_verbosity_wrap(self.verbose)\n (self.support_, self.support_vectors_, dual_coef_data, self.intercept_, self._n_support, self._probA, self._probB, self.fit_status_) = libsvm_sparse.libsvm_sparse_train(X.shape[1], X.data, X.indices, X.indptr, y, solver_type, kernel_type, self.degree, self._gamma, self.coef0, self.tol, self.C, self.class_weight_, sample_weight, self.nu, self.cache_size, self.epsilon, int(self.shrinking), int(self.probability), self.max_iter, random_seed)\n self._warn_from_fit_status()\n if hasattr(self, 'classes_'):\n n_class = len(self.classes_) - 1\n else:\n n_class = 1\n n_SV = self.support_vectors_.shape[0]\n dual_coef_indices = np.tile(np.arange(n_SV), n_class)\n if not n_SV:\n self.dual_coef_ = sp.csr_matrix([])\n else:\n dual_coef_indptr = np.arange(0, dual_coef_indices.size + 1, dual_coef_indices.size / n_class)\n self.dual_coef_ = 
sp.csr_matrix((dual_coef_data, dual_coef_indices, dual_coef_indptr), (n_class, n_SV))\n \n def predict(self, X):\n \"\"\"Perform regression on samples in X.\n\n For an one-class model, +1 (inlier) or -1 (outlier) is returned.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n For kernel=\"precomputed\", the expected shape of X is\n (n_samples_test, n_samples_train).\n\n Returns\n -------\n y_pred : ndarray of shape (n_samples,)\n The predicted values.\n \"\"\"\n X = self._validate_for_predict(X)\n predict = self._sparse_predict if self._sparse else self._dense_predict\n return predict(X)\n \n def _dense_predict(self, X):\n X = self._compute_kernel(X)\n if X.ndim == 1:\n X = check_array(X, order='C', accept_large_sparse=False)\n kernel = self.kernel\n if callable(self.kernel):\n kernel = 'precomputed'\n if X.shape[1] != self.shape_fit_[0]:\n raise ValueError('X.shape[1] = %d should be equal to %d, the number of samples at training time' % (X.shape[1], self.shape_fit_[0]))\n svm_type = LIBSVM_IMPL.index(self._impl)\n return libsvm.predict(X, self.support_, self.support_vectors_, self._n_support, self._dual_coef_, self._intercept_, self._probA, self._probB, svm_type=svm_type, kernel=kernel, degree=self.degree, coef0=self.coef0, gamma=self._gamma, cache_size=self.cache_size)\n \n def _sparse_predict(self, X):\n kernel = self.kernel\n if callable(kernel):\n kernel = 'precomputed'\n kernel_type = self._sparse_kernels.index(kernel)\n C = 0.0\n return libsvm_sparse.libsvm_sparse_predict(X.data, X.indices, X.indptr, self.support_vectors_.data, self.support_vectors_.indices, self.support_vectors_.indptr, self._dual_coef_.data, self._intercept_, LIBSVM_IMPL.index(self._impl), kernel_type, self.degree, self._gamma, self.coef0, self.tol, C, self.class_weight_, self.nu, self.epsilon, self.shrinking, self.probability, self._n_support, self._probA, self._probB)\n \n def _compute_kernel(self, X):\n \"\"\"Return the data transformed by a callable kernel\"\"\"\n if callable(self.kernel):\n kernel = self.kernel(X, self.__Xfit)\n if sp.issparse(kernel):\n kernel = kernel.toarray()\n X = np.asarray(kernel, dtype=np.float64, order='C')\n return X\n \n def _decision_function(self, X):\n \"\"\"Evaluates the decision function for the samples in X.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n\n Returns\n -------\n X : array-like of shape (n_samples, n_class * (n_class-1) / 2)\n Returns the decision function of the sample for each class\n in the model.\n \"\"\"\n X = self._validate_for_predict(X)\n X = self._compute_kernel(X)\n if self._sparse:\n dec_func = self._sparse_decision_function(X)\n else:\n dec_func = self._dense_decision_function(X)\n if self._impl in ['c_svc', 'nu_svc'] and len(self.classes_) == 2:\n return -dec_func.ravel()\n return dec_func\n \n def _dense_decision_function(self, X):\n X = check_array(X, dtype=np.float64, order='C', accept_large_sparse=False)\n kernel = self.kernel\n if callable(kernel):\n kernel = 'precomputed'\n return libsvm.decision_function(X, self.support_, self.support_vectors_, self._n_support, self._dual_coef_, self._intercept_, self._probA, self._probB, svm_type=LIBSVM_IMPL.index(self._impl), kernel=kernel, degree=self.degree, cache_size=self.cache_size, coef0=self.coef0, gamma=self._gamma)\n \n def _sparse_decision_function(self, X):\n X.data = np.asarray(X.data, dtype=np.float64, order='C')\n kernel = self.kernel\n if hasattr(kernel, '__call__'):\n kernel = 'precomputed'\n kernel_type = 
self._sparse_kernels.index(kernel)\n return libsvm_sparse.libsvm_sparse_decision_function(X.data, X.indices, X.indptr, self.support_vectors_.data, self.support_vectors_.indices, self.support_vectors_.indptr, self._dual_coef_.data, self._intercept_, LIBSVM_IMPL.index(self._impl), kernel_type, self.degree, self._gamma, self.coef0, self.tol, self.C, self.class_weight_, self.nu, self.epsilon, self.shrinking, self.probability, self._n_support, self._probA, self._probB)\n \n def _validate_for_predict(self, X):\n check_is_fitted(self)\n if not callable(self.kernel):\n X = self._validate_data(X, accept_sparse='csr', dtype=np.float64, order='C', accept_large_sparse=False, reset=False)\n if self._sparse and not sp.isspmatrix(X):\n X = sp.csr_matrix(X)\n if self._sparse:\n X.sort_indices()\n if sp.issparse(X) and not self._sparse and not callable(self.kernel):\n raise ValueError('cannot use sparse input in %r trained on dense data' % type(self).__name__)\n if self.kernel == 'precomputed':\n if X.shape[1] != self.shape_fit_[0]:\n raise ValueError('X.shape[1] = %d should be equal to %d, the number of samples at training time' % (X.shape[1], self.shape_fit_[0]))\n sv = self.support_vectors_\n if not self._sparse and sv.size > 0 and self.n_support_.sum() != sv.shape[0]:\n raise ValueError(f'The internal representation of {self.__class__.__name__} was altered')\n return X\n \n @property\n def coef_(self):\n \"\"\"Weights assigned to the features when `kernel=\"linear\"`.\n\n Returns\n -------\n ndarray of shape (n_features, n_classes)\n \"\"\"\n if self.kernel != 'linear':\n raise AttributeError('coef_ is only available when using a linear kernel')\n coef = self._get_coef()\n if sp.issparse(coef):\n coef.data.flags.writeable = False\n else:\n coef.flags.writeable = False\n return coef\n \n def _get_coef(self):\n return safe_sparse_dot(self._dual_coef_, self.support_vectors_)\n \n @property\n def n_support_(self):\n \"\"\"Number of support vectors for each class.\"\"\"\n try:\n check_is_fitted(self)\n except NotFittedError:\n raise AttributeError\n svm_type = LIBSVM_IMPL.index(self._impl)\n if svm_type in (0, 1):\n return self._n_support\n else:\n return np.array([self._n_support[0]])\n" }, @@ -26361,7 +26457,7 @@ "sklearn.svm._classes.LinearSVC._more_tags" ], "is_public": true, - "description": "Linear Support Vector Classification.\n\nSimilar to SVC with parameter kernel='linear', but implemented in terms of liblinear rather than libsvm, so it has more flexibility in the choice of penalties and loss functions and should scale better to large numbers of samples. This class supports both dense and sparse input and the multiclass support is handled according to a one-vs-the-rest scheme. 
Read more in the :ref:`User Guide `.", + "description": "Linear Support Vector Classification.\n\nSimilar to SVC with parameter kernel='linear', but implemented in terms of\nliblinear rather than libsvm, so it has more flexibility in the choice of\npenalties and loss functions and should scale better to large numbers of\nsamples.\n\nThis class supports both dense and sparse input and the multiclass support\nis handled according to a one-vs-the-rest scheme.\n\nRead more in the :ref:`User Guide `.", "docstring": "Linear Support Vector Classification.\n\n Similar to SVC with parameter kernel='linear', but implemented in terms of\n liblinear rather than libsvm, so it has more flexibility in the choice of\n penalties and loss functions and should scale better to large numbers of\n samples.\n\n This class supports both dense and sparse input and the multiclass support\n is handled according to a one-vs-the-rest scheme.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n penalty : {'l1', 'l2'}, default='l2'\n Specifies the norm used in the penalization. The 'l2'\n penalty is the standard used in SVC. The 'l1' leads to ``coef_``\n vectors that are sparse.\n\n loss : {'hinge', 'squared_hinge'}, default='squared_hinge'\n Specifies the loss function. 'hinge' is the standard SVM loss\n (used e.g. by the SVC class) while 'squared_hinge' is the\n square of the hinge loss. The combination of ``penalty='l1'``\n and ``loss='hinge'`` is not supported.\n\n dual : bool, default=True\n Select the algorithm to either solve the dual or primal\n optimization problem. Prefer dual=False when n_samples > n_features.\n\n tol : float, default=1e-4\n Tolerance for stopping criteria.\n\n C : float, default=1.0\n Regularization parameter. The strength of the regularization is\n inversely proportional to C. Must be strictly positive.\n\n multi_class : {'ovr', 'crammer_singer'}, default='ovr'\n Determines the multi-class strategy if `y` contains more than\n two classes.\n ``\"ovr\"`` trains n_classes one-vs-rest classifiers, while\n ``\"crammer_singer\"`` optimizes a joint objective over all classes.\n While `crammer_singer` is interesting from a theoretical perspective\n as it is consistent, it is seldom used in practice as it rarely leads\n to better accuracy and is more expensive to compute.\n If ``\"crammer_singer\"`` is chosen, the options loss, penalty and dual\n will be ignored.\n\n fit_intercept : bool, default=True\n Whether to calculate the intercept for this model. If set\n to false, no intercept will be used in calculations\n (i.e. data is expected to be already centered).\n\n intercept_scaling : float, default=1\n When self.fit_intercept is True, instance vector x becomes\n ``[x, self.intercept_scaling]``,\n i.e. a \"synthetic\" feature with constant value equals to\n intercept_scaling is appended to the instance vector.\n The intercept becomes intercept_scaling * synthetic feature weight\n Note! the synthetic feature weight is subject to l1/l2 regularization\n as all other features.\n To lessen the effect of regularization on synthetic feature weight\n (and therefore on the intercept) intercept_scaling has to be increased.\n\n class_weight : dict or 'balanced', default=None\n Set the parameter C of class i to ``class_weight[i]*C`` for\n SVC. 
If not given, all classes are supposed to have\n weight one.\n The \"balanced\" mode uses the values of y to automatically adjust\n weights inversely proportional to class frequencies in the input data\n as ``n_samples / (n_classes * np.bincount(y))``.\n\n verbose : int, default=0\n Enable verbose output. Note that this setting takes advantage of a\n per-process runtime setting in liblinear that, if enabled, may not work\n properly in a multithreaded context.\n\n random_state : int, RandomState instance or None, default=None\n Controls the pseudo random number generation for shuffling the data for\n the dual coordinate descent (if ``dual=True``). When ``dual=False`` the\n underlying implementation of :class:`LinearSVC` is not random and\n ``random_state`` has no effect on the results.\n Pass an int for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n max_iter : int, default=1000\n The maximum number of iterations to be run.\n\n Attributes\n ----------\n coef_ : ndarray of shape (1, n_features) if n_classes == 2 else (n_classes, n_features)\n Weights assigned to the features (coefficients in the primal\n problem).\n\n ``coef_`` is a readonly property derived from ``raw_coef_`` that\n follows the internal memory layout of liblinear.\n\n intercept_ : ndarray of shape (1,) if n_classes == 2 else (n_classes,)\n Constants in decision function.\n\n classes_ : ndarray of shape (n_classes,)\n The unique classes labels.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n n_iter_ : int\n Maximum number of iterations run across all classes.\n\n See Also\n --------\n SVC : Implementation of Support Vector Machine classifier using libsvm:\n the kernel can be non-linear but its SMO algorithm does not\n scale to large number of samples as LinearSVC does.\n\n Furthermore SVC multi-class mode is implemented using one\n vs one scheme while LinearSVC uses one vs the rest. It is\n possible to implement one vs the rest with SVC by using the\n :class:`~sklearn.multiclass.OneVsRestClassifier` wrapper.\n\n Finally SVC can fit dense data without memory copy if the input\n is C-contiguous. Sparse data will still incur memory copy though.\n\n sklearn.linear_model.SGDClassifier : SGDClassifier can optimize the same\n cost function as LinearSVC\n by adjusting the penalty and loss parameters. In addition it requires\n less memory, allows incremental (online) learning, and implements\n various loss functions and regularization regimes.\n\n Notes\n -----\n The underlying C implementation uses a random number generator to\n select features when fitting the model. It is thus not uncommon\n to have slightly different results for the same input data. If\n that happens, try with a smaller ``tol`` parameter.\n\n The underlying implementation, liblinear, uses a sparse internal\n representation for the data that will incur a memory copy.\n\n Predict output may not match that of standalone liblinear in certain\n cases. 
See :ref:`differences from liblinear `\n in the narrative documentation.\n\n References\n ----------\n `LIBLINEAR: A Library for Large Linear Classification\n `__\n\n Examples\n --------\n >>> from sklearn.svm import LinearSVC\n >>> from sklearn.pipeline import make_pipeline\n >>> from sklearn.preprocessing import StandardScaler\n >>> from sklearn.datasets import make_classification\n >>> X, y = make_classification(n_features=4, random_state=0)\n >>> clf = make_pipeline(StandardScaler(),\n ... LinearSVC(random_state=0, tol=1e-5))\n >>> clf.fit(X, y)\n Pipeline(steps=[('standardscaler', StandardScaler()),\n ('linearsvc', LinearSVC(random_state=0, tol=1e-05))])\n\n >>> print(clf.named_steps['linearsvc'].coef_)\n [[0.141... 0.526... 0.679... 0.493...]]\n\n >>> print(clf.named_steps['linearsvc'].intercept_)\n [0.1693...]\n >>> print(clf.predict([[0, 0, 0, 0]]))\n [1]\n ", "source_code": "\n\nclass LinearSVC(LinearClassifierMixin, SparseCoefMixin, BaseEstimator):\n \"\"\"Linear Support Vector Classification.\n\n Similar to SVC with parameter kernel='linear', but implemented in terms of\n liblinear rather than libsvm, so it has more flexibility in the choice of\n penalties and loss functions and should scale better to large numbers of\n samples.\n\n This class supports both dense and sparse input and the multiclass support\n is handled according to a one-vs-the-rest scheme.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n penalty : {'l1', 'l2'}, default='l2'\n Specifies the norm used in the penalization. The 'l2'\n penalty is the standard used in SVC. The 'l1' leads to ``coef_``\n vectors that are sparse.\n\n loss : {'hinge', 'squared_hinge'}, default='squared_hinge'\n Specifies the loss function. 'hinge' is the standard SVM loss\n (used e.g. by the SVC class) while 'squared_hinge' is the\n square of the hinge loss. The combination of ``penalty='l1'``\n and ``loss='hinge'`` is not supported.\n\n dual : bool, default=True\n Select the algorithm to either solve the dual or primal\n optimization problem. Prefer dual=False when n_samples > n_features.\n\n tol : float, default=1e-4\n Tolerance for stopping criteria.\n\n C : float, default=1.0\n Regularization parameter. The strength of the regularization is\n inversely proportional to C. Must be strictly positive.\n\n multi_class : {'ovr', 'crammer_singer'}, default='ovr'\n Determines the multi-class strategy if `y` contains more than\n two classes.\n ``\"ovr\"`` trains n_classes one-vs-rest classifiers, while\n ``\"crammer_singer\"`` optimizes a joint objective over all classes.\n While `crammer_singer` is interesting from a theoretical perspective\n as it is consistent, it is seldom used in practice as it rarely leads\n to better accuracy and is more expensive to compute.\n If ``\"crammer_singer\"`` is chosen, the options loss, penalty and dual\n will be ignored.\n\n fit_intercept : bool, default=True\n Whether to calculate the intercept for this model. If set\n to false, no intercept will be used in calculations\n (i.e. data is expected to be already centered).\n\n intercept_scaling : float, default=1\n When self.fit_intercept is True, instance vector x becomes\n ``[x, self.intercept_scaling]``,\n i.e. a \"synthetic\" feature with constant value equals to\n intercept_scaling is appended to the instance vector.\n The intercept becomes intercept_scaling * synthetic feature weight\n Note! 
the synthetic feature weight is subject to l1/l2 regularization\n as all other features.\n To lessen the effect of regularization on synthetic feature weight\n (and therefore on the intercept) intercept_scaling has to be increased.\n\n class_weight : dict or 'balanced', default=None\n Set the parameter C of class i to ``class_weight[i]*C`` for\n SVC. If not given, all classes are supposed to have\n weight one.\n The \"balanced\" mode uses the values of y to automatically adjust\n weights inversely proportional to class frequencies in the input data\n as ``n_samples / (n_classes * np.bincount(y))``.\n\n verbose : int, default=0\n Enable verbose output. Note that this setting takes advantage of a\n per-process runtime setting in liblinear that, if enabled, may not work\n properly in a multithreaded context.\n\n random_state : int, RandomState instance or None, default=None\n Controls the pseudo random number generation for shuffling the data for\n the dual coordinate descent (if ``dual=True``). When ``dual=False`` the\n underlying implementation of :class:`LinearSVC` is not random and\n ``random_state`` has no effect on the results.\n Pass an int for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n max_iter : int, default=1000\n The maximum number of iterations to be run.\n\n Attributes\n ----------\n coef_ : ndarray of shape (1, n_features) if n_classes == 2 else (n_classes, n_features)\n Weights assigned to the features (coefficients in the primal\n problem).\n\n ``coef_`` is a readonly property derived from ``raw_coef_`` that\n follows the internal memory layout of liblinear.\n\n intercept_ : ndarray of shape (1,) if n_classes == 2 else (n_classes,)\n Constants in decision function.\n\n classes_ : ndarray of shape (n_classes,)\n The unique classes labels.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n n_iter_ : int\n Maximum number of iterations run across all classes.\n\n See Also\n --------\n SVC : Implementation of Support Vector Machine classifier using libsvm:\n the kernel can be non-linear but its SMO algorithm does not\n scale to large number of samples as LinearSVC does.\n\n Furthermore SVC multi-class mode is implemented using one\n vs one scheme while LinearSVC uses one vs the rest. It is\n possible to implement one vs the rest with SVC by using the\n :class:`~sklearn.multiclass.OneVsRestClassifier` wrapper.\n\n Finally SVC can fit dense data without memory copy if the input\n is C-contiguous. Sparse data will still incur memory copy though.\n\n sklearn.linear_model.SGDClassifier : SGDClassifier can optimize the same\n cost function as LinearSVC\n by adjusting the penalty and loss parameters. In addition it requires\n less memory, allows incremental (online) learning, and implements\n various loss functions and regularization regimes.\n\n Notes\n -----\n The underlying C implementation uses a random number generator to\n select features when fitting the model. It is thus not uncommon\n to have slightly different results for the same input data. 
If\n that happens, try with a smaller ``tol`` parameter.\n\n The underlying implementation, liblinear, uses a sparse internal\n representation for the data that will incur a memory copy.\n\n Predict output may not match that of standalone liblinear in certain\n cases. See :ref:`differences from liblinear `\n in the narrative documentation.\n\n References\n ----------\n `LIBLINEAR: A Library for Large Linear Classification\n `__\n\n Examples\n --------\n >>> from sklearn.svm import LinearSVC\n >>> from sklearn.pipeline import make_pipeline\n >>> from sklearn.preprocessing import StandardScaler\n >>> from sklearn.datasets import make_classification\n >>> X, y = make_classification(n_features=4, random_state=0)\n >>> clf = make_pipeline(StandardScaler(),\n ... LinearSVC(random_state=0, tol=1e-5))\n >>> clf.fit(X, y)\n Pipeline(steps=[('standardscaler', StandardScaler()),\n ('linearsvc', LinearSVC(random_state=0, tol=1e-05))])\n\n >>> print(clf.named_steps['linearsvc'].coef_)\n [[0.141... 0.526... 0.679... 0.493...]]\n\n >>> print(clf.named_steps['linearsvc'].intercept_)\n [0.1693...]\n >>> print(clf.predict([[0, 0, 0, 0]]))\n [1]\n \"\"\"\n \n def __init__(self, penalty='l2', loss='squared_hinge', *, dual=True, tol=0.0001, C=1.0, multi_class='ovr', fit_intercept=True, intercept_scaling=1, class_weight=None, verbose=0, random_state=None, max_iter=1000):\n self.dual = dual\n self.tol = tol\n self.C = C\n self.multi_class = multi_class\n self.fit_intercept = fit_intercept\n self.intercept_scaling = intercept_scaling\n self.class_weight = class_weight\n self.verbose = verbose\n self.random_state = random_state\n self.max_iter = max_iter\n self.penalty = penalty\n self.loss = loss\n \n def fit(self, X, y, sample_weight=None):\n \"\"\"Fit the model according to the given training data.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training vector, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\n y : array-like of shape (n_samples,)\n Target vector relative to X.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Array of weights that are assigned to individual\n samples. If not provided,\n then each sample is given unit weight.\n\n .. 
versionadded:: 0.18\n\n Returns\n -------\n self : object\n An instance of the estimator.\n \"\"\"\n if self.C < 0:\n raise ValueError('Penalty term must be positive; got (C=%r)' % self.C)\n (X, y) = self._validate_data(X, y, accept_sparse='csr', dtype=np.float64, order='C', accept_large_sparse=False)\n check_classification_targets(y)\n self.classes_ = np.unique(y)\n (self.coef_, self.intercept_, self.n_iter_) = _fit_liblinear(X, y, self.C, self.fit_intercept, self.intercept_scaling, self.class_weight, self.penalty, self.dual, self.verbose, self.max_iter, self.tol, self.random_state, self.multi_class, self.loss, sample_weight=sample_weight)\n if self.multi_class == 'crammer_singer' and len(self.classes_) == 2:\n self.coef_ = (self.coef_[1] - self.coef_[0]).reshape(1, -1)\n if self.fit_intercept:\n intercept = self.intercept_[1] - self.intercept_[0]\n self.intercept_ = np.array([intercept])\n return self\n \n def _more_tags(self):\n return {'_xfail_checks': {'check_sample_weights_invariance': 'zero sample_weight is not equivalent to removing samples'}}\n" }, @@ -26376,7 +26472,7 @@ "sklearn.svm._classes.LinearSVR._more_tags" ], "is_public": true, - "description": "Linear Support Vector Regression.\n\nSimilar to SVR with parameter kernel='linear', but implemented in terms of liblinear rather than libsvm, so it has more flexibility in the choice of penalties and loss functions and should scale better to large numbers of samples. This class supports both dense and sparse input. Read more in the :ref:`User Guide `. .. versionadded:: 0.16", + "description": "Linear Support Vector Regression.\n\nSimilar to SVR with parameter kernel='linear', but implemented in terms of\nliblinear rather than libsvm, so it has more flexibility in the choice of\npenalties and loss functions and should scale better to large numbers of\nsamples.\n\nThis class supports both dense and sparse input.\n\nRead more in the :ref:`User Guide `.\n\n.. versionadded:: 0.16", "docstring": "Linear Support Vector Regression.\n\n Similar to SVR with parameter kernel='linear', but implemented in terms of\n liblinear rather than libsvm, so it has more flexibility in the choice of\n penalties and loss functions and should scale better to large numbers of\n samples.\n\n This class supports both dense and sparse input.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.16\n\n Parameters\n ----------\n epsilon : float, default=0.0\n Epsilon parameter in the epsilon-insensitive loss function. Note\n that the value of this parameter depends on the scale of the target\n variable y. If unsure, set ``epsilon=0``.\n\n tol : float, default=1e-4\n Tolerance for stopping criteria.\n\n C : float, default=1.0\n Regularization parameter. The strength of the regularization is\n inversely proportional to C. Must be strictly positive.\n\n loss : {'epsilon_insensitive', 'squared_epsilon_insensitive'}, default='epsilon_insensitive'\n Specifies the loss function. The epsilon-insensitive loss\n (standard SVR) is the L1 loss, while the squared epsilon-insensitive\n loss ('squared_epsilon_insensitive') is the L2 loss.\n\n fit_intercept : bool, default=True\n Whether to calculate the intercept for this model. If set\n to false, no intercept will be used in calculations\n (i.e. data is expected to be already centered).\n\n intercept_scaling : float, default=1.0\n When self.fit_intercept is True, instance vector x becomes\n [x, self.intercept_scaling],\n i.e. 
a \"synthetic\" feature with constant value equals to\n intercept_scaling is appended to the instance vector.\n The intercept becomes intercept_scaling * synthetic feature weight\n Note! the synthetic feature weight is subject to l1/l2 regularization\n as all other features.\n To lessen the effect of regularization on synthetic feature weight\n (and therefore on the intercept) intercept_scaling has to be increased.\n\n dual : bool, default=True\n Select the algorithm to either solve the dual or primal\n optimization problem. Prefer dual=False when n_samples > n_features.\n\n verbose : int, default=0\n Enable verbose output. Note that this setting takes advantage of a\n per-process runtime setting in liblinear that, if enabled, may not work\n properly in a multithreaded context.\n\n random_state : int, RandomState instance or None, default=None\n Controls the pseudo random number generation for shuffling the data.\n Pass an int for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n max_iter : int, default=1000\n The maximum number of iterations to be run.\n\n Attributes\n ----------\n coef_ : ndarray of shape (n_features) if n_classes == 2 else (n_classes, n_features)\n Weights assigned to the features (coefficients in the primal\n problem).\n\n `coef_` is a readonly property derived from `raw_coef_` that\n follows the internal memory layout of liblinear.\n\n intercept_ : ndarray of shape (1) if n_classes == 2 else (n_classes)\n Constants in decision function.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n n_iter_ : int\n Maximum number of iterations run across all classes.\n\n See Also\n --------\n LinearSVC : Implementation of Support Vector Machine classifier using the\n same library as this class (liblinear).\n\n SVR : Implementation of Support Vector Machine regression using libsvm:\n the kernel can be non-linear but its SMO algorithm does not\n scale to large number of samples as LinearSVC does.\n\n sklearn.linear_model.SGDRegressor : SGDRegressor can optimize the same cost\n function as LinearSVR\n by adjusting the penalty and loss parameters. In addition it requires\n less memory, allows incremental (online) learning, and implements\n various loss functions and regularization regimes.\n\n Examples\n --------\n >>> from sklearn.svm import LinearSVR\n >>> from sklearn.pipeline import make_pipeline\n >>> from sklearn.preprocessing import StandardScaler\n >>> from sklearn.datasets import make_regression\n >>> X, y = make_regression(n_features=4, random_state=0)\n >>> regr = make_pipeline(StandardScaler(),\n ... LinearSVR(random_state=0, tol=1e-5))\n >>> regr.fit(X, y)\n Pipeline(steps=[('standardscaler', StandardScaler()),\n ('linearsvr', LinearSVR(random_state=0, tol=1e-05))])\n\n >>> print(regr.named_steps['linearsvr'].coef_)\n [18.582... 27.023... 44.357... 
64.522...]\n >>> print(regr.named_steps['linearsvr'].intercept_)\n [-4...]\n >>> print(regr.predict([[0, 0, 0, 0]]))\n [-2.384...]\n ", "source_code": "\n\nclass LinearSVR(RegressorMixin, LinearModel):\n \"\"\"Linear Support Vector Regression.\n\n Similar to SVR with parameter kernel='linear', but implemented in terms of\n liblinear rather than libsvm, so it has more flexibility in the choice of\n penalties and loss functions and should scale better to large numbers of\n samples.\n\n This class supports both dense and sparse input.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.16\n\n Parameters\n ----------\n epsilon : float, default=0.0\n Epsilon parameter in the epsilon-insensitive loss function. Note\n that the value of this parameter depends on the scale of the target\n variable y. If unsure, set ``epsilon=0``.\n\n tol : float, default=1e-4\n Tolerance for stopping criteria.\n\n C : float, default=1.0\n Regularization parameter. The strength of the regularization is\n inversely proportional to C. Must be strictly positive.\n\n loss : {'epsilon_insensitive', 'squared_epsilon_insensitive'}, default='epsilon_insensitive'\n Specifies the loss function. The epsilon-insensitive loss\n (standard SVR) is the L1 loss, while the squared epsilon-insensitive\n loss ('squared_epsilon_insensitive') is the L2 loss.\n\n fit_intercept : bool, default=True\n Whether to calculate the intercept for this model. If set\n to false, no intercept will be used in calculations\n (i.e. data is expected to be already centered).\n\n intercept_scaling : float, default=1.0\n When self.fit_intercept is True, instance vector x becomes\n [x, self.intercept_scaling],\n i.e. a \"synthetic\" feature with constant value equals to\n intercept_scaling is appended to the instance vector.\n The intercept becomes intercept_scaling * synthetic feature weight\n Note! the synthetic feature weight is subject to l1/l2 regularization\n as all other features.\n To lessen the effect of regularization on synthetic feature weight\n (and therefore on the intercept) intercept_scaling has to be increased.\n\n dual : bool, default=True\n Select the algorithm to either solve the dual or primal\n optimization problem. Prefer dual=False when n_samples > n_features.\n\n verbose : int, default=0\n Enable verbose output. Note that this setting takes advantage of a\n per-process runtime setting in liblinear that, if enabled, may not work\n properly in a multithreaded context.\n\n random_state : int, RandomState instance or None, default=None\n Controls the pseudo random number generation for shuffling the data.\n Pass an int for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n max_iter : int, default=1000\n The maximum number of iterations to be run.\n\n Attributes\n ----------\n coef_ : ndarray of shape (n_features) if n_classes == 2 else (n_classes, n_features)\n Weights assigned to the features (coefficients in the primal\n problem).\n\n `coef_` is a readonly property derived from `raw_coef_` that\n follows the internal memory layout of liblinear.\n\n intercept_ : ndarray of shape (1) if n_classes == 2 else (n_classes)\n Constants in decision function.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. 
versionadded:: 1.0\n\n n_iter_ : int\n Maximum number of iterations run across all classes.\n\n See Also\n --------\n LinearSVC : Implementation of Support Vector Machine classifier using the\n same library as this class (liblinear).\n\n SVR : Implementation of Support Vector Machine regression using libsvm:\n the kernel can be non-linear but its SMO algorithm does not\n scale to large number of samples as LinearSVC does.\n\n sklearn.linear_model.SGDRegressor : SGDRegressor can optimize the same cost\n function as LinearSVR\n by adjusting the penalty and loss parameters. In addition it requires\n less memory, allows incremental (online) learning, and implements\n various loss functions and regularization regimes.\n\n Examples\n --------\n >>> from sklearn.svm import LinearSVR\n >>> from sklearn.pipeline import make_pipeline\n >>> from sklearn.preprocessing import StandardScaler\n >>> from sklearn.datasets import make_regression\n >>> X, y = make_regression(n_features=4, random_state=0)\n >>> regr = make_pipeline(StandardScaler(),\n ... LinearSVR(random_state=0, tol=1e-5))\n >>> regr.fit(X, y)\n Pipeline(steps=[('standardscaler', StandardScaler()),\n ('linearsvr', LinearSVR(random_state=0, tol=1e-05))])\n\n >>> print(regr.named_steps['linearsvr'].coef_)\n [18.582... 27.023... 44.357... 64.522...]\n >>> print(regr.named_steps['linearsvr'].intercept_)\n [-4...]\n >>> print(regr.predict([[0, 0, 0, 0]]))\n [-2.384...]\n \"\"\"\n \n def __init__(self, *, epsilon=0.0, tol=0.0001, C=1.0, loss='epsilon_insensitive', fit_intercept=True, intercept_scaling=1.0, dual=True, verbose=0, random_state=None, max_iter=1000):\n self.tol = tol\n self.C = C\n self.epsilon = epsilon\n self.fit_intercept = fit_intercept\n self.intercept_scaling = intercept_scaling\n self.verbose = verbose\n self.random_state = random_state\n self.max_iter = max_iter\n self.dual = dual\n self.loss = loss\n \n def fit(self, X, y, sample_weight=None):\n \"\"\"Fit the model according to the given training data.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training vector, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\n y : array-like of shape (n_samples,)\n Target vector relative to X.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Array of weights that are assigned to individual\n samples. If not provided,\n then each sample is given unit weight.\n\n .. versionadded:: 0.18\n\n Returns\n -------\n self : object\n An instance of the estimator.\n \"\"\"\n if self.C < 0:\n raise ValueError('Penalty term must be positive; got (C=%r)' % self.C)\n (X, y) = self._validate_data(X, y, accept_sparse='csr', dtype=np.float64, order='C', accept_large_sparse=False)\n penalty = 'l2'\n (self.coef_, self.intercept_, self.n_iter_) = _fit_liblinear(X, y, self.C, self.fit_intercept, self.intercept_scaling, None, penalty, self.dual, self.verbose, self.max_iter, self.tol, self.random_state, loss=self.loss, epsilon=self.epsilon, sample_weight=sample_weight)\n self.coef_ = self.coef_.ravel()\n return self\n \n def _more_tags(self):\n return {'_xfail_checks': {'check_sample_weights_invariance': 'zero sample_weight is not equivalent to removing samples'}}\n" }, @@ -26390,9 +26486,9 @@ "sklearn.svm._classes.NuSVC._more_tags" ], "is_public": true, - "description": "Nu-Support Vector Classification.\n\nSimilar to SVC but uses a parameter to control the number of support vectors. The implementation is based on libsvm. 
Read more in the :ref:`User Guide `.", - "docstring": "Nu-Support Vector Classification.\n\n Similar to SVC but uses a parameter to control the number of support\n vectors.\n\n The implementation is based on libsvm.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n nu : float, default=0.5\n An upper bound on the fraction of margin errors (see :ref:`User Guide\n `) and a lower bound of the fraction of support vectors.\n Should be in the interval (0, 1].\n\n kernel : {'linear', 'poly', 'rbf', 'sigmoid', 'precomputed'}, default='rbf'\n Specifies the kernel type to be used in the algorithm.\n It must be one of 'linear', 'poly', 'rbf', 'sigmoid', 'precomputed' or\n a callable.\n If none is given, 'rbf' will be used. If a callable is given it is\n used to precompute the kernel matrix.\n\n degree : int, default=3\n Degree of the polynomial kernel function ('poly').\n Ignored by all other kernels.\n\n gamma : {'scale', 'auto'} or float, default='scale'\n Kernel coefficient for 'rbf', 'poly' and 'sigmoid'.\n\n - if ``gamma='scale'`` (default) is passed then it uses\n 1 / (n_features * X.var()) as value of gamma,\n - if 'auto', uses 1 / n_features.\n\n .. versionchanged:: 0.22\n The default value of ``gamma`` changed from 'auto' to 'scale'.\n\n coef0 : float, default=0.0\n Independent term in kernel function.\n It is only significant in 'poly' and 'sigmoid'.\n\n shrinking : bool, default=True\n Whether to use the shrinking heuristic.\n See the :ref:`User Guide `.\n\n probability : bool, default=False\n Whether to enable probability estimates. This must be enabled prior\n to calling `fit`, will slow down that method as it internally uses\n 5-fold cross-validation, and `predict_proba` may be inconsistent with\n `predict`. Read more in the :ref:`User Guide `.\n\n tol : float, default=1e-3\n Tolerance for stopping criterion.\n\n cache_size : float, default=200\n Specify the size of the kernel cache (in MB).\n\n class_weight : {dict, 'balanced'}, default=None\n Set the parameter C of class i to class_weight[i]*C for\n SVC. If not given, all classes are supposed to have\n weight one. The \"balanced\" mode uses the values of y to automatically\n adjust weights inversely proportional to class frequencies as\n ``n_samples / (n_classes * np.bincount(y))``.\n\n verbose : bool, default=False\n Enable verbose output. Note that this setting takes advantage of a\n per-process runtime setting in libsvm that, if enabled, may not work\n properly in a multithreaded context.\n\n max_iter : int, default=-1\n Hard limit on iterations within solver, or -1 for no limit.\n\n decision_function_shape : {'ovo', 'ovr'}, default='ovr'\n Whether to return a one-vs-rest ('ovr') decision function of shape\n (n_samples, n_classes) as all other classifiers, or the original\n one-vs-one ('ovo') decision function of libsvm which has shape\n (n_samples, n_classes * (n_classes - 1) / 2). However, one-vs-one\n ('ovo') is always used as multi-class strategy. The parameter is\n ignored for binary classification.\n\n .. versionchanged:: 0.19\n decision_function_shape is 'ovr' by default.\n\n .. versionadded:: 0.17\n *decision_function_shape='ovr'* is recommended.\n\n .. versionchanged:: 0.17\n Deprecated *decision_function_shape='ovo' and None*.\n\n break_ties : bool, default=False\n If true, ``decision_function_shape='ovr'``, and number of classes > 2,\n :term:`predict` will break ties according to the confidence values of\n :term:`decision_function`; otherwise the first class among the tied\n classes is returned. 
Please note that breaking ties comes at a\n relatively high computational cost compared to a simple predict.\n\n .. versionadded:: 0.22\n\n random_state : int, RandomState instance or None, default=None\n Controls the pseudo random number generation for shuffling the data for\n probability estimates. Ignored when `probability` is False.\n Pass an int for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n Attributes\n ----------\n class_weight_ : ndarray of shape (n_classes,)\n Multipliers of parameter C of each class.\n Computed based on the ``class_weight`` parameter.\n\n classes_ : ndarray of shape (n_classes,)\n The unique classes labels.\n\n coef_ : ndarray of shape (n_classes * (n_classes -1) / 2, n_features)\n Weights assigned to the features (coefficients in the primal\n problem). This is only available in the case of a linear kernel.\n\n `coef_` is readonly property derived from `dual_coef_` and\n `support_vectors_`.\n\n dual_coef_ : ndarray of shape (n_classes - 1, n_SV)\n Dual coefficients of the support vector in the decision\n function (see :ref:`sgd_mathematical_formulation`), multiplied by\n their targets.\n For multiclass, coefficient for all 1-vs-1 classifiers.\n The layout of the coefficients in the multiclass case is somewhat\n non-trivial. See the :ref:`multi-class section of the User Guide\n ` for details.\n\n fit_status_ : int\n 0 if correctly fitted, 1 if the algorithm did not converge.\n\n intercept_ : ndarray of shape (n_classes * (n_classes - 1) / 2,)\n Constants in decision function.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n support_ : ndarray of shape (n_SV,)\n Indices of support vectors.\n\n support_vectors_ : ndarray of shape (n_SV, n_features)\n Support vectors.\n\n n_support_ : ndarray of shape (n_classes,), dtype=int32\n Number of support vectors for each class.\n\n fit_status_ : int\n 0 if correctly fitted, 1 if the algorithm did not converge.\n\n probA_ : ndarray of shape (n_classes * (n_classes - 1) / 2,)\n probB_ : ndarray of shape (n_classes * (n_classes - 1) / 2,)\n If `probability=True`, it corresponds to the parameters learned in\n Platt scaling to produce probability estimates from decision values.\n If `probability=False`, it's an empty array. Platt scaling uses the\n logistic function\n ``1 / (1 + exp(decision_value * probA_ + probB_))``\n where ``probA_`` and ``probB_`` are learned from the dataset [2]_. For\n more information on the multiclass case and training procedure see\n section 8 of [1]_.\n\n shape_fit_ : tuple of int of shape (n_dimensions_of_X,)\n Array dimensions of training vector ``X``.\n\n See Also\n --------\n SVC : Support Vector Machine for classification using libsvm.\n\n LinearSVC : Scalable linear Support Vector Machine for classification using\n liblinear.\n\n References\n ----------\n .. [1] `LIBSVM: A Library for Support Vector Machines\n `_\n\n .. [2] `Platt, John (1999). 
\"Probabilistic outputs for support vector\n machines and comparison to regularizedlikelihood methods.\"\n `_\n\n Examples\n --------\n >>> import numpy as np\n >>> X = np.array([[-1, -1], [-2, -1], [1, 1], [2, 1]])\n >>> y = np.array([1, 1, 2, 2])\n >>> from sklearn.pipeline import make_pipeline\n >>> from sklearn.preprocessing import StandardScaler\n >>> from sklearn.svm import NuSVC\n >>> clf = make_pipeline(StandardScaler(), NuSVC())\n >>> clf.fit(X, y)\n Pipeline(steps=[('standardscaler', StandardScaler()), ('nusvc', NuSVC())])\n >>> print(clf.predict([[-0.8, -1]]))\n [1]\n ", - "source_code": "\n\nclass NuSVC(BaseSVC):\n \"\"\"Nu-Support Vector Classification.\n\n Similar to SVC but uses a parameter to control the number of support\n vectors.\n\n The implementation is based on libsvm.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n nu : float, default=0.5\n An upper bound on the fraction of margin errors (see :ref:`User Guide\n `) and a lower bound of the fraction of support vectors.\n Should be in the interval (0, 1].\n\n kernel : {'linear', 'poly', 'rbf', 'sigmoid', 'precomputed'}, default='rbf'\n Specifies the kernel type to be used in the algorithm.\n It must be one of 'linear', 'poly', 'rbf', 'sigmoid', 'precomputed' or\n a callable.\n If none is given, 'rbf' will be used. If a callable is given it is\n used to precompute the kernel matrix.\n\n degree : int, default=3\n Degree of the polynomial kernel function ('poly').\n Ignored by all other kernels.\n\n gamma : {'scale', 'auto'} or float, default='scale'\n Kernel coefficient for 'rbf', 'poly' and 'sigmoid'.\n\n - if ``gamma='scale'`` (default) is passed then it uses\n 1 / (n_features * X.var()) as value of gamma,\n - if 'auto', uses 1 / n_features.\n\n .. versionchanged:: 0.22\n The default value of ``gamma`` changed from 'auto' to 'scale'.\n\n coef0 : float, default=0.0\n Independent term in kernel function.\n It is only significant in 'poly' and 'sigmoid'.\n\n shrinking : bool, default=True\n Whether to use the shrinking heuristic.\n See the :ref:`User Guide `.\n\n probability : bool, default=False\n Whether to enable probability estimates. This must be enabled prior\n to calling `fit`, will slow down that method as it internally uses\n 5-fold cross-validation, and `predict_proba` may be inconsistent with\n `predict`. Read more in the :ref:`User Guide `.\n\n tol : float, default=1e-3\n Tolerance for stopping criterion.\n\n cache_size : float, default=200\n Specify the size of the kernel cache (in MB).\n\n class_weight : {dict, 'balanced'}, default=None\n Set the parameter C of class i to class_weight[i]*C for\n SVC. If not given, all classes are supposed to have\n weight one. The \"balanced\" mode uses the values of y to automatically\n adjust weights inversely proportional to class frequencies as\n ``n_samples / (n_classes * np.bincount(y))``.\n\n verbose : bool, default=False\n Enable verbose output. Note that this setting takes advantage of a\n per-process runtime setting in libsvm that, if enabled, may not work\n properly in a multithreaded context.\n\n max_iter : int, default=-1\n Hard limit on iterations within solver, or -1 for no limit.\n\n decision_function_shape : {'ovo', 'ovr'}, default='ovr'\n Whether to return a one-vs-rest ('ovr') decision function of shape\n (n_samples, n_classes) as all other classifiers, or the original\n one-vs-one ('ovo') decision function of libsvm which has shape\n (n_samples, n_classes * (n_classes - 1) / 2). 
However, one-vs-one\n ('ovo') is always used as multi-class strategy. The parameter is\n ignored for binary classification.\n\n .. versionchanged:: 0.19\n decision_function_shape is 'ovr' by default.\n\n .. versionadded:: 0.17\n *decision_function_shape='ovr'* is recommended.\n\n .. versionchanged:: 0.17\n Deprecated *decision_function_shape='ovo' and None*.\n\n break_ties : bool, default=False\n If true, ``decision_function_shape='ovr'``, and number of classes > 2,\n :term:`predict` will break ties according to the confidence values of\n :term:`decision_function`; otherwise the first class among the tied\n classes is returned. Please note that breaking ties comes at a\n relatively high computational cost compared to a simple predict.\n\n .. versionadded:: 0.22\n\n random_state : int, RandomState instance or None, default=None\n Controls the pseudo random number generation for shuffling the data for\n probability estimates. Ignored when `probability` is False.\n Pass an int for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n Attributes\n ----------\n class_weight_ : ndarray of shape (n_classes,)\n Multipliers of parameter C of each class.\n Computed based on the ``class_weight`` parameter.\n\n classes_ : ndarray of shape (n_classes,)\n The unique classes labels.\n\n coef_ : ndarray of shape (n_classes * (n_classes -1) / 2, n_features)\n Weights assigned to the features (coefficients in the primal\n problem). This is only available in the case of a linear kernel.\n\n `coef_` is readonly property derived from `dual_coef_` and\n `support_vectors_`.\n\n dual_coef_ : ndarray of shape (n_classes - 1, n_SV)\n Dual coefficients of the support vector in the decision\n function (see :ref:`sgd_mathematical_formulation`), multiplied by\n their targets.\n For multiclass, coefficient for all 1-vs-1 classifiers.\n The layout of the coefficients in the multiclass case is somewhat\n non-trivial. See the :ref:`multi-class section of the User Guide\n ` for details.\n\n fit_status_ : int\n 0 if correctly fitted, 1 if the algorithm did not converge.\n\n intercept_ : ndarray of shape (n_classes * (n_classes - 1) / 2,)\n Constants in decision function.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n support_ : ndarray of shape (n_SV,)\n Indices of support vectors.\n\n support_vectors_ : ndarray of shape (n_SV, n_features)\n Support vectors.\n\n n_support_ : ndarray of shape (n_classes,), dtype=int32\n Number of support vectors for each class.\n\n fit_status_ : int\n 0 if correctly fitted, 1 if the algorithm did not converge.\n\n probA_ : ndarray of shape (n_classes * (n_classes - 1) / 2,)\n probB_ : ndarray of shape (n_classes * (n_classes - 1) / 2,)\n If `probability=True`, it corresponds to the parameters learned in\n Platt scaling to produce probability estimates from decision values.\n If `probability=False`, it's an empty array. Platt scaling uses the\n logistic function\n ``1 / (1 + exp(decision_value * probA_ + probB_))``\n where ``probA_`` and ``probB_`` are learned from the dataset [2]_. 
For\n more information on the multiclass case and training procedure see\n section 8 of [1]_.\n\n shape_fit_ : tuple of int of shape (n_dimensions_of_X,)\n Array dimensions of training vector ``X``.\n\n See Also\n --------\n SVC : Support Vector Machine for classification using libsvm.\n\n LinearSVC : Scalable linear Support Vector Machine for classification using\n liblinear.\n\n References\n ----------\n .. [1] `LIBSVM: A Library for Support Vector Machines\n `_\n\n .. [2] `Platt, John (1999). \"Probabilistic outputs for support vector\n machines and comparison to regularizedlikelihood methods.\"\n `_\n\n Examples\n --------\n >>> import numpy as np\n >>> X = np.array([[-1, -1], [-2, -1], [1, 1], [2, 1]])\n >>> y = np.array([1, 1, 2, 2])\n >>> from sklearn.pipeline import make_pipeline\n >>> from sklearn.preprocessing import StandardScaler\n >>> from sklearn.svm import NuSVC\n >>> clf = make_pipeline(StandardScaler(), NuSVC())\n >>> clf.fit(X, y)\n Pipeline(steps=[('standardscaler', StandardScaler()), ('nusvc', NuSVC())])\n >>> print(clf.predict([[-0.8, -1]]))\n [1]\n \"\"\"\n _impl = 'nu_svc'\n \n def __init__(self, *, nu=0.5, kernel='rbf', degree=3, gamma='scale', coef0=0.0, shrinking=True, probability=False, tol=0.001, cache_size=200, class_weight=None, verbose=False, max_iter=-1, decision_function_shape='ovr', break_ties=False, random_state=None):\n super().__init__(kernel=kernel, degree=degree, gamma=gamma, coef0=coef0, tol=tol, C=0.0, nu=nu, shrinking=shrinking, probability=probability, cache_size=cache_size, class_weight=class_weight, verbose=verbose, max_iter=max_iter, decision_function_shape=decision_function_shape, break_ties=break_ties, random_state=random_state)\n \n def _more_tags(self):\n return {'_xfail_checks': {'check_methods_subset_invariance': 'fails for the decision_function method', 'check_class_weight_classifiers': 'class_weight is ignored.', 'check_sample_weights_invariance': 'zero sample_weight is not equivalent to removing samples'}}\n" + "description": "Nu-Support Vector Classification.\n\nSimilar to SVC but uses a parameter to control the number of support\nvectors.\n\nThe implementation is based on libsvm.\n\nRead more in the :ref:`User Guide `.", + "docstring": "Nu-Support Vector Classification.\n\n Similar to SVC but uses a parameter to control the number of support\n vectors.\n\n The implementation is based on libsvm.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n nu : float, default=0.5\n An upper bound on the fraction of margin errors (see :ref:`User Guide\n `) and a lower bound of the fraction of support vectors.\n Should be in the interval (0, 1].\n\n kernel : {'linear', 'poly', 'rbf', 'sigmoid', 'precomputed'} or callable, default='rbf'\n Specifies the kernel type to be used in the algorithm.\n If none is given, 'rbf' will be used. If a callable is given it is\n used to precompute the kernel matrix.\n\n degree : int, default=3\n Degree of the polynomial kernel function ('poly').\n Ignored by all other kernels.\n\n gamma : {'scale', 'auto'} or float, default='scale'\n Kernel coefficient for 'rbf', 'poly' and 'sigmoid'.\n\n - if ``gamma='scale'`` (default) is passed then it uses\n 1 / (n_features * X.var()) as value of gamma,\n - if 'auto', uses 1 / n_features.\n\n .. 
versionchanged:: 0.22\n The default value of ``gamma`` changed from 'auto' to 'scale'.\n\n coef0 : float, default=0.0\n Independent term in kernel function.\n It is only significant in 'poly' and 'sigmoid'.\n\n shrinking : bool, default=True\n Whether to use the shrinking heuristic.\n See the :ref:`User Guide `.\n\n probability : bool, default=False\n Whether to enable probability estimates. This must be enabled prior\n to calling `fit`, will slow down that method as it internally uses\n 5-fold cross-validation, and `predict_proba` may be inconsistent with\n `predict`. Read more in the :ref:`User Guide `.\n\n tol : float, default=1e-3\n Tolerance for stopping criterion.\n\n cache_size : float, default=200\n Specify the size of the kernel cache (in MB).\n\n class_weight : {dict, 'balanced'}, default=None\n Set the parameter C of class i to class_weight[i]*C for\n SVC. If not given, all classes are supposed to have\n weight one. The \"balanced\" mode uses the values of y to automatically\n adjust weights inversely proportional to class frequencies as\n ``n_samples / (n_classes * np.bincount(y))``.\n\n verbose : bool, default=False\n Enable verbose output. Note that this setting takes advantage of a\n per-process runtime setting in libsvm that, if enabled, may not work\n properly in a multithreaded context.\n\n max_iter : int, default=-1\n Hard limit on iterations within solver, or -1 for no limit.\n\n decision_function_shape : {'ovo', 'ovr'}, default='ovr'\n Whether to return a one-vs-rest ('ovr') decision function of shape\n (n_samples, n_classes) as all other classifiers, or the original\n one-vs-one ('ovo') decision function of libsvm which has shape\n (n_samples, n_classes * (n_classes - 1) / 2). However, one-vs-one\n ('ovo') is always used as multi-class strategy. The parameter is\n ignored for binary classification.\n\n .. versionchanged:: 0.19\n decision_function_shape is 'ovr' by default.\n\n .. versionadded:: 0.17\n *decision_function_shape='ovr'* is recommended.\n\n .. versionchanged:: 0.17\n Deprecated *decision_function_shape='ovo' and None*.\n\n break_ties : bool, default=False\n If true, ``decision_function_shape='ovr'``, and number of classes > 2,\n :term:`predict` will break ties according to the confidence values of\n :term:`decision_function`; otherwise the first class among the tied\n classes is returned. Please note that breaking ties comes at a\n relatively high computational cost compared to a simple predict.\n\n .. versionadded:: 0.22\n\n random_state : int, RandomState instance or None, default=None\n Controls the pseudo random number generation for shuffling the data for\n probability estimates. Ignored when `probability` is False.\n Pass an int for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n Attributes\n ----------\n class_weight_ : ndarray of shape (n_classes,)\n Multipliers of parameter C of each class.\n Computed based on the ``class_weight`` parameter.\n\n classes_ : ndarray of shape (n_classes,)\n The unique classes labels.\n\n coef_ : ndarray of shape (n_classes * (n_classes -1) / 2, n_features)\n Weights assigned to the features (coefficients in the primal\n problem). 
This is only available in the case of a linear kernel.\n\n `coef_` is readonly property derived from `dual_coef_` and\n `support_vectors_`.\n\n dual_coef_ : ndarray of shape (n_classes - 1, n_SV)\n Dual coefficients of the support vector in the decision\n function (see :ref:`sgd_mathematical_formulation`), multiplied by\n their targets.\n For multiclass, coefficient for all 1-vs-1 classifiers.\n The layout of the coefficients in the multiclass case is somewhat\n non-trivial. See the :ref:`multi-class section of the User Guide\n ` for details.\n\n fit_status_ : int\n 0 if correctly fitted, 1 if the algorithm did not converge.\n\n intercept_ : ndarray of shape (n_classes * (n_classes - 1) / 2,)\n Constants in decision function.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n support_ : ndarray of shape (n_SV,)\n Indices of support vectors.\n\n support_vectors_ : ndarray of shape (n_SV, n_features)\n Support vectors.\n\n n_support_ : ndarray of shape (n_classes,), dtype=int32\n Number of support vectors for each class.\n\n fit_status_ : int\n 0 if correctly fitted, 1 if the algorithm did not converge.\n\n probA_ : ndarray of shape (n_classes * (n_classes - 1) / 2,)\n probB_ : ndarray of shape (n_classes * (n_classes - 1) / 2,)\n If `probability=True`, it corresponds to the parameters learned in\n Platt scaling to produce probability estimates from decision values.\n If `probability=False`, it's an empty array. Platt scaling uses the\n logistic function\n ``1 / (1 + exp(decision_value * probA_ + probB_))``\n where ``probA_`` and ``probB_`` are learned from the dataset [2]_. For\n more information on the multiclass case and training procedure see\n section 8 of [1]_.\n\n shape_fit_ : tuple of int of shape (n_dimensions_of_X,)\n Array dimensions of training vector ``X``.\n\n See Also\n --------\n SVC : Support Vector Machine for classification using libsvm.\n\n LinearSVC : Scalable linear Support Vector Machine for classification using\n liblinear.\n\n References\n ----------\n .. [1] `LIBSVM: A Library for Support Vector Machines\n `_\n\n .. [2] `Platt, John (1999). 
\"Probabilistic outputs for support vector\n machines and comparison to regularizedlikelihood methods.\"\n `_\n\n Examples\n --------\n >>> import numpy as np\n >>> X = np.array([[-1, -1], [-2, -1], [1, 1], [2, 1]])\n >>> y = np.array([1, 1, 2, 2])\n >>> from sklearn.pipeline import make_pipeline\n >>> from sklearn.preprocessing import StandardScaler\n >>> from sklearn.svm import NuSVC\n >>> clf = make_pipeline(StandardScaler(), NuSVC())\n >>> clf.fit(X, y)\n Pipeline(steps=[('standardscaler', StandardScaler()), ('nusvc', NuSVC())])\n >>> print(clf.predict([[-0.8, -1]]))\n [1]\n ", + "source_code": "\n\nclass NuSVC(BaseSVC):\n \"\"\"Nu-Support Vector Classification.\n\n Similar to SVC but uses a parameter to control the number of support\n vectors.\n\n The implementation is based on libsvm.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n nu : float, default=0.5\n An upper bound on the fraction of margin errors (see :ref:`User Guide\n `) and a lower bound of the fraction of support vectors.\n Should be in the interval (0, 1].\n\n kernel : {'linear', 'poly', 'rbf', 'sigmoid', 'precomputed'} or callable, default='rbf'\n Specifies the kernel type to be used in the algorithm.\n If none is given, 'rbf' will be used. If a callable is given it is\n used to precompute the kernel matrix.\n\n degree : int, default=3\n Degree of the polynomial kernel function ('poly').\n Ignored by all other kernels.\n\n gamma : {'scale', 'auto'} or float, default='scale'\n Kernel coefficient for 'rbf', 'poly' and 'sigmoid'.\n\n - if ``gamma='scale'`` (default) is passed then it uses\n 1 / (n_features * X.var()) as value of gamma,\n - if 'auto', uses 1 / n_features.\n\n .. versionchanged:: 0.22\n The default value of ``gamma`` changed from 'auto' to 'scale'.\n\n coef0 : float, default=0.0\n Independent term in kernel function.\n It is only significant in 'poly' and 'sigmoid'.\n\n shrinking : bool, default=True\n Whether to use the shrinking heuristic.\n See the :ref:`User Guide `.\n\n probability : bool, default=False\n Whether to enable probability estimates. This must be enabled prior\n to calling `fit`, will slow down that method as it internally uses\n 5-fold cross-validation, and `predict_proba` may be inconsistent with\n `predict`. Read more in the :ref:`User Guide `.\n\n tol : float, default=1e-3\n Tolerance for stopping criterion.\n\n cache_size : float, default=200\n Specify the size of the kernel cache (in MB).\n\n class_weight : {dict, 'balanced'}, default=None\n Set the parameter C of class i to class_weight[i]*C for\n SVC. If not given, all classes are supposed to have\n weight one. The \"balanced\" mode uses the values of y to automatically\n adjust weights inversely proportional to class frequencies as\n ``n_samples / (n_classes * np.bincount(y))``.\n\n verbose : bool, default=False\n Enable verbose output. Note that this setting takes advantage of a\n per-process runtime setting in libsvm that, if enabled, may not work\n properly in a multithreaded context.\n\n max_iter : int, default=-1\n Hard limit on iterations within solver, or -1 for no limit.\n\n decision_function_shape : {'ovo', 'ovr'}, default='ovr'\n Whether to return a one-vs-rest ('ovr') decision function of shape\n (n_samples, n_classes) as all other classifiers, or the original\n one-vs-one ('ovo') decision function of libsvm which has shape\n (n_samples, n_classes * (n_classes - 1) / 2). However, one-vs-one\n ('ovo') is always used as multi-class strategy. The parameter is\n ignored for binary classification.\n\n .. 
versionchanged:: 0.19\n decision_function_shape is 'ovr' by default.\n\n .. versionadded:: 0.17\n *decision_function_shape='ovr'* is recommended.\n\n .. versionchanged:: 0.17\n Deprecated *decision_function_shape='ovo' and None*.\n\n break_ties : bool, default=False\n If true, ``decision_function_shape='ovr'``, and number of classes > 2,\n :term:`predict` will break ties according to the confidence values of\n :term:`decision_function`; otherwise the first class among the tied\n classes is returned. Please note that breaking ties comes at a\n relatively high computational cost compared to a simple predict.\n\n .. versionadded:: 0.22\n\n random_state : int, RandomState instance or None, default=None\n Controls the pseudo random number generation for shuffling the data for\n probability estimates. Ignored when `probability` is False.\n Pass an int for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n Attributes\n ----------\n class_weight_ : ndarray of shape (n_classes,)\n Multipliers of parameter C of each class.\n Computed based on the ``class_weight`` parameter.\n\n classes_ : ndarray of shape (n_classes,)\n The unique classes labels.\n\n coef_ : ndarray of shape (n_classes * (n_classes -1) / 2, n_features)\n Weights assigned to the features (coefficients in the primal\n problem). This is only available in the case of a linear kernel.\n\n `coef_` is readonly property derived from `dual_coef_` and\n `support_vectors_`.\n\n dual_coef_ : ndarray of shape (n_classes - 1, n_SV)\n Dual coefficients of the support vector in the decision\n function (see :ref:`sgd_mathematical_formulation`), multiplied by\n their targets.\n For multiclass, coefficient for all 1-vs-1 classifiers.\n The layout of the coefficients in the multiclass case is somewhat\n non-trivial. See the :ref:`multi-class section of the User Guide\n ` for details.\n\n fit_status_ : int\n 0 if correctly fitted, 1 if the algorithm did not converge.\n\n intercept_ : ndarray of shape (n_classes * (n_classes - 1) / 2,)\n Constants in decision function.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n support_ : ndarray of shape (n_SV,)\n Indices of support vectors.\n\n support_vectors_ : ndarray of shape (n_SV, n_features)\n Support vectors.\n\n n_support_ : ndarray of shape (n_classes,), dtype=int32\n Number of support vectors for each class.\n\n fit_status_ : int\n 0 if correctly fitted, 1 if the algorithm did not converge.\n\n probA_ : ndarray of shape (n_classes * (n_classes - 1) / 2,)\n probB_ : ndarray of shape (n_classes * (n_classes - 1) / 2,)\n If `probability=True`, it corresponds to the parameters learned in\n Platt scaling to produce probability estimates from decision values.\n If `probability=False`, it's an empty array. Platt scaling uses the\n logistic function\n ``1 / (1 + exp(decision_value * probA_ + probB_))``\n where ``probA_`` and ``probB_`` are learned from the dataset [2]_. 
For\n more information on the multiclass case and training procedure see\n section 8 of [1]_.\n\n shape_fit_ : tuple of int of shape (n_dimensions_of_X,)\n Array dimensions of training vector ``X``.\n\n See Also\n --------\n SVC : Support Vector Machine for classification using libsvm.\n\n LinearSVC : Scalable linear Support Vector Machine for classification using\n liblinear.\n\n References\n ----------\n .. [1] `LIBSVM: A Library for Support Vector Machines\n `_\n\n .. [2] `Platt, John (1999). \"Probabilistic outputs for support vector\n machines and comparison to regularizedlikelihood methods.\"\n `_\n\n Examples\n --------\n >>> import numpy as np\n >>> X = np.array([[-1, -1], [-2, -1], [1, 1], [2, 1]])\n >>> y = np.array([1, 1, 2, 2])\n >>> from sklearn.pipeline import make_pipeline\n >>> from sklearn.preprocessing import StandardScaler\n >>> from sklearn.svm import NuSVC\n >>> clf = make_pipeline(StandardScaler(), NuSVC())\n >>> clf.fit(X, y)\n Pipeline(steps=[('standardscaler', StandardScaler()), ('nusvc', NuSVC())])\n >>> print(clf.predict([[-0.8, -1]]))\n [1]\n \"\"\"\n _impl = 'nu_svc'\n \n def __init__(self, *, nu=0.5, kernel='rbf', degree=3, gamma='scale', coef0=0.0, shrinking=True, probability=False, tol=0.001, cache_size=200, class_weight=None, verbose=False, max_iter=-1, decision_function_shape='ovr', break_ties=False, random_state=None):\n super().__init__(kernel=kernel, degree=degree, gamma=gamma, coef0=coef0, tol=tol, C=0.0, nu=nu, shrinking=shrinking, probability=probability, cache_size=cache_size, class_weight=class_weight, verbose=verbose, max_iter=max_iter, decision_function_shape=decision_function_shape, break_ties=break_ties, random_state=random_state)\n \n def _more_tags(self):\n return {'_xfail_checks': {'check_methods_subset_invariance': 'fails for the decision_function method', 'check_class_weight_classifiers': 'class_weight is ignored.', 'check_sample_weights_invariance': 'zero sample_weight is not equivalent to removing samples'}}\n" }, { "name": "NuSVR", @@ -26404,9 +26500,9 @@ "sklearn.svm._classes.NuSVR._more_tags" ], "is_public": true, - "description": "Nu Support Vector Regression.\n\nSimilar to NuSVC, for regression, uses a parameter nu to control the number of support vectors. However, unlike NuSVC, where nu replaces C, here nu replaces the parameter epsilon of epsilon-SVR. The implementation is based on libsvm. Read more in the :ref:`User Guide `.", - "docstring": "Nu Support Vector Regression.\n\n Similar to NuSVC, for regression, uses a parameter nu to control\n the number of support vectors. However, unlike NuSVC, where nu\n replaces C, here nu replaces the parameter epsilon of epsilon-SVR.\n\n The implementation is based on libsvm.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n nu : float, default=0.5\n An upper bound on the fraction of training errors and a lower bound of\n the fraction of support vectors. Should be in the interval (0, 1]. By\n default 0.5 will be taken.\n\n C : float, default=1.0\n Penalty parameter C of the error term.\n\n kernel : {'linear', 'poly', 'rbf', 'sigmoid', 'precomputed'}, default='rbf'\n Specifies the kernel type to be used in the algorithm.\n It must be one of 'linear', 'poly', 'rbf', 'sigmoid', 'precomputed' or\n a callable.\n If none is given, 'rbf' will be used. 
If a callable is given it is\n used to precompute the kernel matrix.\n\n degree : int, default=3\n Degree of the polynomial kernel function ('poly').\n Ignored by all other kernels.\n\n gamma : {'scale', 'auto'} or float, default='scale'\n Kernel coefficient for 'rbf', 'poly' and 'sigmoid'.\n\n - if ``gamma='scale'`` (default) is passed then it uses\n 1 / (n_features * X.var()) as value of gamma,\n - if 'auto', uses 1 / n_features.\n\n .. versionchanged:: 0.22\n The default value of ``gamma`` changed from 'auto' to 'scale'.\n\n coef0 : float, default=0.0\n Independent term in kernel function.\n It is only significant in 'poly' and 'sigmoid'.\n\n shrinking : bool, default=True\n Whether to use the shrinking heuristic.\n See the :ref:`User Guide `.\n\n tol : float, default=1e-3\n Tolerance for stopping criterion.\n\n cache_size : float, default=200\n Specify the size of the kernel cache (in MB).\n\n verbose : bool, default=False\n Enable verbose output. Note that this setting takes advantage of a\n per-process runtime setting in libsvm that, if enabled, may not work\n properly in a multithreaded context.\n\n max_iter : int, default=-1\n Hard limit on iterations within solver, or -1 for no limit.\n\n Attributes\n ----------\n class_weight_ : ndarray of shape (n_classes,)\n Multipliers of parameter C for each class.\n Computed based on the ``class_weight`` parameter.\n\n coef_ : ndarray of shape (1, n_features)\n Weights assigned to the features (coefficients in the primal\n problem). This is only available in the case of a linear kernel.\n\n `coef_` is readonly property derived from `dual_coef_` and\n `support_vectors_`.\n\n dual_coef_ : ndarray of shape (1, n_SV)\n Coefficients of the support vector in the decision function.\n\n fit_status_ : int\n 0 if correctly fitted, 1 otherwise (will raise warning)\n\n intercept_ : ndarray of shape (1,)\n Constants in decision function.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n n_support_ : ndarray of shape (n_classes,), dtype=int32\n Number of support vectors for each class.\n\n shape_fit_ : tuple of int of shape (n_dimensions_of_X,)\n Array dimensions of training vector ``X``.\n\n support_ : ndarray of shape (n_SV,)\n Indices of support vectors.\n\n support_vectors_ : ndarray of shape (n_SV, n_features)\n Support vectors.\n\n See Also\n --------\n NuSVC : Support Vector Machine for classification implemented with libsvm\n with a parameter to control the number of support vectors.\n\n SVR : Epsilon Support Vector Machine for regression implemented with\n libsvm.\n\n References\n ----------\n .. [1] `LIBSVM: A Library for Support Vector Machines\n `_\n\n .. [2] `Platt, John (1999). 
\"Probabilistic outputs for support vector\n machines and comparison to regularizedlikelihood methods.\"\n `_\n\n Examples\n --------\n >>> from sklearn.svm import NuSVR\n >>> from sklearn.pipeline import make_pipeline\n >>> from sklearn.preprocessing import StandardScaler\n >>> import numpy as np\n >>> n_samples, n_features = 10, 5\n >>> np.random.seed(0)\n >>> y = np.random.randn(n_samples)\n >>> X = np.random.randn(n_samples, n_features)\n >>> regr = make_pipeline(StandardScaler(), NuSVR(C=1.0, nu=0.1))\n >>> regr.fit(X, y)\n Pipeline(steps=[('standardscaler', StandardScaler()),\n ('nusvr', NuSVR(nu=0.1))])\n ", - "source_code": "\n\nclass NuSVR(RegressorMixin, BaseLibSVM):\n \"\"\"Nu Support Vector Regression.\n\n Similar to NuSVC, for regression, uses a parameter nu to control\n the number of support vectors. However, unlike NuSVC, where nu\n replaces C, here nu replaces the parameter epsilon of epsilon-SVR.\n\n The implementation is based on libsvm.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n nu : float, default=0.5\n An upper bound on the fraction of training errors and a lower bound of\n the fraction of support vectors. Should be in the interval (0, 1]. By\n default 0.5 will be taken.\n\n C : float, default=1.0\n Penalty parameter C of the error term.\n\n kernel : {'linear', 'poly', 'rbf', 'sigmoid', 'precomputed'}, default='rbf'\n Specifies the kernel type to be used in the algorithm.\n It must be one of 'linear', 'poly', 'rbf', 'sigmoid', 'precomputed' or\n a callable.\n If none is given, 'rbf' will be used. If a callable is given it is\n used to precompute the kernel matrix.\n\n degree : int, default=3\n Degree of the polynomial kernel function ('poly').\n Ignored by all other kernels.\n\n gamma : {'scale', 'auto'} or float, default='scale'\n Kernel coefficient for 'rbf', 'poly' and 'sigmoid'.\n\n - if ``gamma='scale'`` (default) is passed then it uses\n 1 / (n_features * X.var()) as value of gamma,\n - if 'auto', uses 1 / n_features.\n\n .. versionchanged:: 0.22\n The default value of ``gamma`` changed from 'auto' to 'scale'.\n\n coef0 : float, default=0.0\n Independent term in kernel function.\n It is only significant in 'poly' and 'sigmoid'.\n\n shrinking : bool, default=True\n Whether to use the shrinking heuristic.\n See the :ref:`User Guide `.\n\n tol : float, default=1e-3\n Tolerance for stopping criterion.\n\n cache_size : float, default=200\n Specify the size of the kernel cache (in MB).\n\n verbose : bool, default=False\n Enable verbose output. Note that this setting takes advantage of a\n per-process runtime setting in libsvm that, if enabled, may not work\n properly in a multithreaded context.\n\n max_iter : int, default=-1\n Hard limit on iterations within solver, or -1 for no limit.\n\n Attributes\n ----------\n class_weight_ : ndarray of shape (n_classes,)\n Multipliers of parameter C for each class.\n Computed based on the ``class_weight`` parameter.\n\n coef_ : ndarray of shape (1, n_features)\n Weights assigned to the features (coefficients in the primal\n problem). This is only available in the case of a linear kernel.\n\n `coef_` is readonly property derived from `dual_coef_` and\n `support_vectors_`.\n\n dual_coef_ : ndarray of shape (1, n_SV)\n Coefficients of the support vector in the decision function.\n\n fit_status_ : int\n 0 if correctly fitted, 1 otherwise (will raise warning)\n\n intercept_ : ndarray of shape (1,)\n Constants in decision function.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. 
versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n n_support_ : ndarray of shape (n_classes,), dtype=int32\n Number of support vectors for each class.\n\n shape_fit_ : tuple of int of shape (n_dimensions_of_X,)\n Array dimensions of training vector ``X``.\n\n support_ : ndarray of shape (n_SV,)\n Indices of support vectors.\n\n support_vectors_ : ndarray of shape (n_SV, n_features)\n Support vectors.\n\n See Also\n --------\n NuSVC : Support Vector Machine for classification implemented with libsvm\n with a parameter to control the number of support vectors.\n\n SVR : Epsilon Support Vector Machine for regression implemented with\n libsvm.\n\n References\n ----------\n .. [1] `LIBSVM: A Library for Support Vector Machines\n `_\n\n .. [2] `Platt, John (1999). \"Probabilistic outputs for support vector\n machines and comparison to regularizedlikelihood methods.\"\n `_\n\n Examples\n --------\n >>> from sklearn.svm import NuSVR\n >>> from sklearn.pipeline import make_pipeline\n >>> from sklearn.preprocessing import StandardScaler\n >>> import numpy as np\n >>> n_samples, n_features = 10, 5\n >>> np.random.seed(0)\n >>> y = np.random.randn(n_samples)\n >>> X = np.random.randn(n_samples, n_features)\n >>> regr = make_pipeline(StandardScaler(), NuSVR(C=1.0, nu=0.1))\n >>> regr.fit(X, y)\n Pipeline(steps=[('standardscaler', StandardScaler()),\n ('nusvr', NuSVR(nu=0.1))])\n \"\"\"\n _impl = 'nu_svr'\n \n def __init__(self, *, nu=0.5, C=1.0, kernel='rbf', degree=3, gamma='scale', coef0=0.0, shrinking=True, tol=0.001, cache_size=200, verbose=False, max_iter=-1):\n super().__init__(kernel=kernel, degree=degree, gamma=gamma, coef0=coef0, tol=tol, C=C, nu=nu, epsilon=0.0, shrinking=shrinking, probability=False, cache_size=cache_size, class_weight=None, verbose=verbose, max_iter=max_iter, random_state=None)\n \n def _more_tags(self):\n return {'_xfail_checks': {'check_sample_weights_invariance': 'zero sample_weight is not equivalent to removing samples'}}\n" + "description": "Nu Support Vector Regression.\n\nSimilar to NuSVC, for regression, uses a parameter nu to control\nthe number of support vectors. However, unlike NuSVC, where nu\nreplaces C, here nu replaces the parameter epsilon of epsilon-SVR.\n\nThe implementation is based on libsvm.\n\nRead more in the :ref:`User Guide `.", + "docstring": "Nu Support Vector Regression.\n\n Similar to NuSVC, for regression, uses a parameter nu to control\n the number of support vectors. However, unlike NuSVC, where nu\n replaces C, here nu replaces the parameter epsilon of epsilon-SVR.\n\n The implementation is based on libsvm.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n nu : float, default=0.5\n An upper bound on the fraction of training errors and a lower bound of\n the fraction of support vectors. Should be in the interval (0, 1]. By\n default 0.5 will be taken.\n\n C : float, default=1.0\n Penalty parameter C of the error term.\n\n kernel : {'linear', 'poly', 'rbf', 'sigmoid', 'precomputed'} or callable, default='rbf'\n Specifies the kernel type to be used in the algorithm.\n If none is given, 'rbf' will be used. 
If a callable is given it is\n used to precompute the kernel matrix.\n\n degree : int, default=3\n Degree of the polynomial kernel function ('poly').\n Ignored by all other kernels.\n\n gamma : {'scale', 'auto'} or float, default='scale'\n Kernel coefficient for 'rbf', 'poly' and 'sigmoid'.\n\n - if ``gamma='scale'`` (default) is passed then it uses\n 1 / (n_features * X.var()) as value of gamma,\n - if 'auto', uses 1 / n_features.\n\n .. versionchanged:: 0.22\n The default value of ``gamma`` changed from 'auto' to 'scale'.\n\n coef0 : float, default=0.0\n Independent term in kernel function.\n It is only significant in 'poly' and 'sigmoid'.\n\n shrinking : bool, default=True\n Whether to use the shrinking heuristic.\n See the :ref:`User Guide `.\n\n tol : float, default=1e-3\n Tolerance for stopping criterion.\n\n cache_size : float, default=200\n Specify the size of the kernel cache (in MB).\n\n verbose : bool, default=False\n Enable verbose output. Note that this setting takes advantage of a\n per-process runtime setting in libsvm that, if enabled, may not work\n properly in a multithreaded context.\n\n max_iter : int, default=-1\n Hard limit on iterations within solver, or -1 for no limit.\n\n Attributes\n ----------\n class_weight_ : ndarray of shape (n_classes,)\n Multipliers of parameter C for each class.\n Computed based on the ``class_weight`` parameter.\n\n coef_ : ndarray of shape (1, n_features)\n Weights assigned to the features (coefficients in the primal\n problem). This is only available in the case of a linear kernel.\n\n `coef_` is readonly property derived from `dual_coef_` and\n `support_vectors_`.\n\n dual_coef_ : ndarray of shape (1, n_SV)\n Coefficients of the support vector in the decision function.\n\n fit_status_ : int\n 0 if correctly fitted, 1 otherwise (will raise warning)\n\n intercept_ : ndarray of shape (1,)\n Constants in decision function.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n n_support_ : ndarray of shape (n_classes,), dtype=int32\n Number of support vectors for each class.\n\n shape_fit_ : tuple of int of shape (n_dimensions_of_X,)\n Array dimensions of training vector ``X``.\n\n support_ : ndarray of shape (n_SV,)\n Indices of support vectors.\n\n support_vectors_ : ndarray of shape (n_SV, n_features)\n Support vectors.\n\n See Also\n --------\n NuSVC : Support Vector Machine for classification implemented with libsvm\n with a parameter to control the number of support vectors.\n\n SVR : Epsilon Support Vector Machine for regression implemented with\n libsvm.\n\n References\n ----------\n .. [1] `LIBSVM: A Library for Support Vector Machines\n `_\n\n .. [2] `Platt, John (1999). 
\"Probabilistic outputs for support vector\n machines and comparison to regularizedlikelihood methods.\"\n `_\n\n Examples\n --------\n >>> from sklearn.svm import NuSVR\n >>> from sklearn.pipeline import make_pipeline\n >>> from sklearn.preprocessing import StandardScaler\n >>> import numpy as np\n >>> n_samples, n_features = 10, 5\n >>> np.random.seed(0)\n >>> y = np.random.randn(n_samples)\n >>> X = np.random.randn(n_samples, n_features)\n >>> regr = make_pipeline(StandardScaler(), NuSVR(C=1.0, nu=0.1))\n >>> regr.fit(X, y)\n Pipeline(steps=[('standardscaler', StandardScaler()),\n ('nusvr', NuSVR(nu=0.1))])\n ", + "source_code": "\n\nclass NuSVR(RegressorMixin, BaseLibSVM):\n \"\"\"Nu Support Vector Regression.\n\n Similar to NuSVC, for regression, uses a parameter nu to control\n the number of support vectors. However, unlike NuSVC, where nu\n replaces C, here nu replaces the parameter epsilon of epsilon-SVR.\n\n The implementation is based on libsvm.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n nu : float, default=0.5\n An upper bound on the fraction of training errors and a lower bound of\n the fraction of support vectors. Should be in the interval (0, 1]. By\n default 0.5 will be taken.\n\n C : float, default=1.0\n Penalty parameter C of the error term.\n\n kernel : {'linear', 'poly', 'rbf', 'sigmoid', 'precomputed'} or callable, default='rbf'\n Specifies the kernel type to be used in the algorithm.\n If none is given, 'rbf' will be used. If a callable is given it is\n used to precompute the kernel matrix.\n\n degree : int, default=3\n Degree of the polynomial kernel function ('poly').\n Ignored by all other kernels.\n\n gamma : {'scale', 'auto'} or float, default='scale'\n Kernel coefficient for 'rbf', 'poly' and 'sigmoid'.\n\n - if ``gamma='scale'`` (default) is passed then it uses\n 1 / (n_features * X.var()) as value of gamma,\n - if 'auto', uses 1 / n_features.\n\n .. versionchanged:: 0.22\n The default value of ``gamma`` changed from 'auto' to 'scale'.\n\n coef0 : float, default=0.0\n Independent term in kernel function.\n It is only significant in 'poly' and 'sigmoid'.\n\n shrinking : bool, default=True\n Whether to use the shrinking heuristic.\n See the :ref:`User Guide `.\n\n tol : float, default=1e-3\n Tolerance for stopping criterion.\n\n cache_size : float, default=200\n Specify the size of the kernel cache (in MB).\n\n verbose : bool, default=False\n Enable verbose output. Note that this setting takes advantage of a\n per-process runtime setting in libsvm that, if enabled, may not work\n properly in a multithreaded context.\n\n max_iter : int, default=-1\n Hard limit on iterations within solver, or -1 for no limit.\n\n Attributes\n ----------\n class_weight_ : ndarray of shape (n_classes,)\n Multipliers of parameter C for each class.\n Computed based on the ``class_weight`` parameter.\n\n coef_ : ndarray of shape (1, n_features)\n Weights assigned to the features (coefficients in the primal\n problem). This is only available in the case of a linear kernel.\n\n `coef_` is readonly property derived from `dual_coef_` and\n `support_vectors_`.\n\n dual_coef_ : ndarray of shape (1, n_SV)\n Coefficients of the support vector in the decision function.\n\n fit_status_ : int\n 0 if correctly fitted, 1 otherwise (will raise warning)\n\n intercept_ : ndarray of shape (1,)\n Constants in decision function.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. 
versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n n_support_ : ndarray of shape (n_classes,), dtype=int32\n Number of support vectors for each class.\n\n shape_fit_ : tuple of int of shape (n_dimensions_of_X,)\n Array dimensions of training vector ``X``.\n\n support_ : ndarray of shape (n_SV,)\n Indices of support vectors.\n\n support_vectors_ : ndarray of shape (n_SV, n_features)\n Support vectors.\n\n See Also\n --------\n NuSVC : Support Vector Machine for classification implemented with libsvm\n with a parameter to control the number of support vectors.\n\n SVR : Epsilon Support Vector Machine for regression implemented with\n libsvm.\n\n References\n ----------\n .. [1] `LIBSVM: A Library for Support Vector Machines\n `_\n\n .. [2] `Platt, John (1999). \"Probabilistic outputs for support vector\n machines and comparison to regularizedlikelihood methods.\"\n `_\n\n Examples\n --------\n >>> from sklearn.svm import NuSVR\n >>> from sklearn.pipeline import make_pipeline\n >>> from sklearn.preprocessing import StandardScaler\n >>> import numpy as np\n >>> n_samples, n_features = 10, 5\n >>> np.random.seed(0)\n >>> y = np.random.randn(n_samples)\n >>> X = np.random.randn(n_samples, n_features)\n >>> regr = make_pipeline(StandardScaler(), NuSVR(C=1.0, nu=0.1))\n >>> regr.fit(X, y)\n Pipeline(steps=[('standardscaler', StandardScaler()),\n ('nusvr', NuSVR(nu=0.1))])\n \"\"\"\n _impl = 'nu_svr'\n \n def __init__(self, *, nu=0.5, C=1.0, kernel='rbf', degree=3, gamma='scale', coef0=0.0, shrinking=True, tol=0.001, cache_size=200, verbose=False, max_iter=-1):\n super().__init__(kernel=kernel, degree=degree, gamma=gamma, coef0=coef0, tol=tol, C=C, nu=nu, epsilon=0.0, shrinking=shrinking, probability=False, cache_size=cache_size, class_weight=None, verbose=verbose, max_iter=max_iter, random_state=None)\n \n def _more_tags(self):\n return {'_xfail_checks': {'check_sample_weights_invariance': 'zero sample_weight is not equivalent to removing samples'}}\n" }, { "name": "OneClassSVM", @@ -26422,9 +26518,9 @@ "sklearn.svm._classes.OneClassSVM._more_tags" ], "is_public": true, - "description": "Unsupervised Outlier Detection.\n\nEstimate the support of a high-dimensional distribution. The implementation is based on libsvm. Read more in the :ref:`User Guide `.", - "docstring": "Unsupervised Outlier Detection.\n\n Estimate the support of a high-dimensional distribution.\n\n The implementation is based on libsvm.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n kernel : {'linear', 'poly', 'rbf', 'sigmoid', 'precomputed'}, default='rbf'\n Specifies the kernel type to be used in the algorithm.\n It must be one of 'linear', 'poly', 'rbf', 'sigmoid', 'precomputed' or\n a callable.\n If none is given, 'rbf' will be used. If a callable is given it is\n used to precompute the kernel matrix.\n\n degree : int, default=3\n Degree of the polynomial kernel function ('poly').\n Ignored by all other kernels.\n\n gamma : {'scale', 'auto'} or float, default='scale'\n Kernel coefficient for 'rbf', 'poly' and 'sigmoid'.\n\n - if ``gamma='scale'`` (default) is passed then it uses\n 1 / (n_features * X.var()) as value of gamma,\n - if 'auto', uses 1 / n_features.\n\n .. 
versionchanged:: 0.22\n The default value of ``gamma`` changed from 'auto' to 'scale'.\n\n coef0 : float, default=0.0\n Independent term in kernel function.\n It is only significant in 'poly' and 'sigmoid'.\n\n tol : float, default=1e-3\n Tolerance for stopping criterion.\n\n nu : float, default=0.5\n An upper bound on the fraction of training\n errors and a lower bound of the fraction of support\n vectors. Should be in the interval (0, 1]. By default 0.5\n will be taken.\n\n shrinking : bool, default=True\n Whether to use the shrinking heuristic.\n See the :ref:`User Guide `.\n\n cache_size : float, default=200\n Specify the size of the kernel cache (in MB).\n\n verbose : bool, default=False\n Enable verbose output. Note that this setting takes advantage of a\n per-process runtime setting in libsvm that, if enabled, may not work\n properly in a multithreaded context.\n\n max_iter : int, default=-1\n Hard limit on iterations within solver, or -1 for no limit.\n\n Attributes\n ----------\n class_weight_ : ndarray of shape (n_classes,)\n Multipliers of parameter C for each class.\n Computed based on the ``class_weight`` parameter.\n\n coef_ : ndarray of shape (1, n_features)\n Weights assigned to the features (coefficients in the primal\n problem). This is only available in the case of a linear kernel.\n\n `coef_` is readonly property derived from `dual_coef_` and\n `support_vectors_`.\n\n dual_coef_ : ndarray of shape (1, n_SV)\n Coefficients of the support vectors in the decision function.\n\n fit_status_ : int\n 0 if correctly fitted, 1 otherwise (will raise warning)\n\n intercept_ : ndarray of shape (1,)\n Constant in the decision function.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n n_support_ : ndarray of shape (n_classes,), dtype=int32\n Number of support vectors for each class.\n\n offset_ : float\n Offset used to define the decision function from the raw scores.\n We have the relation: decision_function = score_samples - `offset_`.\n The offset is the opposite of `intercept_` and is provided for\n consistency with other outlier detection algorithms.\n\n .. 
versionadded:: 0.20\n\n shape_fit_ : tuple of int of shape (n_dimensions_of_X,)\n Array dimensions of training vector ``X``.\n\n support_ : ndarray of shape (n_SV,)\n Indices of support vectors.\n\n support_vectors_ : ndarray of shape (n_SV, n_features)\n Support vectors.\n\n See Also\n --------\n sklearn.linear_model.SGDOneClassSVM : Solves linear One-Class SVM using\n Stochastic Gradient Descent.\n sklearn.neighbors.LocalOutlierFactor : Unsupervised Outlier Detection using\n Local Outlier Factor (LOF).\n sklearn.ensemble.IsolationForest : Isolation Forest Algorithm.\n\n Examples\n --------\n >>> from sklearn.svm import OneClassSVM\n >>> X = [[0], [0.44], [0.45], [0.46], [1]]\n >>> clf = OneClassSVM(gamma='auto').fit(X)\n >>> clf.predict(X)\n array([-1, 1, 1, 1, -1])\n >>> clf.score_samples(X)\n array([1.7798..., 2.0547..., 2.0556..., 2.0561..., 1.7332...])\n ", - "source_code": "\n\nclass OneClassSVM(OutlierMixin, BaseLibSVM):\n \"\"\"Unsupervised Outlier Detection.\n\n Estimate the support of a high-dimensional distribution.\n\n The implementation is based on libsvm.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n kernel : {'linear', 'poly', 'rbf', 'sigmoid', 'precomputed'}, default='rbf'\n Specifies the kernel type to be used in the algorithm.\n It must be one of 'linear', 'poly', 'rbf', 'sigmoid', 'precomputed' or\n a callable.\n If none is given, 'rbf' will be used. If a callable is given it is\n used to precompute the kernel matrix.\n\n degree : int, default=3\n Degree of the polynomial kernel function ('poly').\n Ignored by all other kernels.\n\n gamma : {'scale', 'auto'} or float, default='scale'\n Kernel coefficient for 'rbf', 'poly' and 'sigmoid'.\n\n - if ``gamma='scale'`` (default) is passed then it uses\n 1 / (n_features * X.var()) as value of gamma,\n - if 'auto', uses 1 / n_features.\n\n .. versionchanged:: 0.22\n The default value of ``gamma`` changed from 'auto' to 'scale'.\n\n coef0 : float, default=0.0\n Independent term in kernel function.\n It is only significant in 'poly' and 'sigmoid'.\n\n tol : float, default=1e-3\n Tolerance for stopping criterion.\n\n nu : float, default=0.5\n An upper bound on the fraction of training\n errors and a lower bound of the fraction of support\n vectors. Should be in the interval (0, 1]. By default 0.5\n will be taken.\n\n shrinking : bool, default=True\n Whether to use the shrinking heuristic.\n See the :ref:`User Guide `.\n\n cache_size : float, default=200\n Specify the size of the kernel cache (in MB).\n\n verbose : bool, default=False\n Enable verbose output. Note that this setting takes advantage of a\n per-process runtime setting in libsvm that, if enabled, may not work\n properly in a multithreaded context.\n\n max_iter : int, default=-1\n Hard limit on iterations within solver, or -1 for no limit.\n\n Attributes\n ----------\n class_weight_ : ndarray of shape (n_classes,)\n Multipliers of parameter C for each class.\n Computed based on the ``class_weight`` parameter.\n\n coef_ : ndarray of shape (1, n_features)\n Weights assigned to the features (coefficients in the primal\n problem). 
This is only available in the case of a linear kernel.\n\n `coef_` is readonly property derived from `dual_coef_` and\n `support_vectors_`.\n\n dual_coef_ : ndarray of shape (1, n_SV)\n Coefficients of the support vectors in the decision function.\n\n fit_status_ : int\n 0 if correctly fitted, 1 otherwise (will raise warning)\n\n intercept_ : ndarray of shape (1,)\n Constant in the decision function.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n n_support_ : ndarray of shape (n_classes,), dtype=int32\n Number of support vectors for each class.\n\n offset_ : float\n Offset used to define the decision function from the raw scores.\n We have the relation: decision_function = score_samples - `offset_`.\n The offset is the opposite of `intercept_` and is provided for\n consistency with other outlier detection algorithms.\n\n .. versionadded:: 0.20\n\n shape_fit_ : tuple of int of shape (n_dimensions_of_X,)\n Array dimensions of training vector ``X``.\n\n support_ : ndarray of shape (n_SV,)\n Indices of support vectors.\n\n support_vectors_ : ndarray of shape (n_SV, n_features)\n Support vectors.\n\n See Also\n --------\n sklearn.linear_model.SGDOneClassSVM : Solves linear One-Class SVM using\n Stochastic Gradient Descent.\n sklearn.neighbors.LocalOutlierFactor : Unsupervised Outlier Detection using\n Local Outlier Factor (LOF).\n sklearn.ensemble.IsolationForest : Isolation Forest Algorithm.\n\n Examples\n --------\n >>> from sklearn.svm import OneClassSVM\n >>> X = [[0], [0.44], [0.45], [0.46], [1]]\n >>> clf = OneClassSVM(gamma='auto').fit(X)\n >>> clf.predict(X)\n array([-1, 1, 1, 1, -1])\n >>> clf.score_samples(X)\n array([1.7798..., 2.0547..., 2.0556..., 2.0561..., 1.7332...])\n \"\"\"\n _impl = 'one_class'\n \n def __init__(self, *, kernel='rbf', degree=3, gamma='scale', coef0=0.0, tol=0.001, nu=0.5, shrinking=True, cache_size=200, verbose=False, max_iter=-1):\n super().__init__(kernel, degree, gamma, coef0, tol, 0.0, nu, 0.0, shrinking, False, cache_size, None, verbose, max_iter, random_state=None)\n \n def fit(self, X, y=None, sample_weight=None, **params):\n \"\"\"Detect the soft boundary of the set of samples X.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Set of samples, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Per-sample weights. Rescale C per sample. Higher weights\n force the classifier to put more emphasis on these points.\n\n **params : dict\n Additional fit parameters.\n\n .. deprecated:: 1.0\n The `fit` method will not longer accept extra keyword\n parameters in 1.2. These keyword parameters were\n already discarded.\n\n Returns\n -------\n self : object\n Fitted estimator.\n\n Notes\n -----\n If X is not a C-ordered contiguous array it is copied.\n \"\"\"\n if len(params) > 0:\n warnings.warn(f'Passing additional keyword parameters has no effect and is deprecated in 1.0. An error will be raised from 1.2 and beyond. 
The ignored keyword parameter(s) are: {params.keys()}.', FutureWarning)\n super().fit(X, np.ones(_num_samples(X)), sample_weight=sample_weight)\n self.offset_ = -self._intercept_\n return self\n \n def decision_function(self, X):\n \"\"\"Signed distance to the separating hyperplane.\n\n Signed distance is positive for an inlier and negative for an outlier.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The data matrix.\n\n Returns\n -------\n dec : ndarray of shape (n_samples,)\n Returns the decision function of the samples.\n \"\"\"\n dec = self._decision_function(X).ravel()\n return dec\n \n def score_samples(self, X):\n \"\"\"Raw scoring function of the samples.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The data matrix.\n\n Returns\n -------\n score_samples : ndarray of shape (n_samples,)\n Returns the (unshifted) scoring function of the samples.\n \"\"\"\n return self.decision_function(X) + self.offset_\n \n def predict(self, X):\n \"\"\"Perform classification on samples in X.\n\n For a one-class model, +1 or -1 is returned.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features) or (n_samples_test, n_samples_train)\n For kernel=\"precomputed\", the expected shape of X is\n (n_samples_test, n_samples_train).\n\n Returns\n -------\n y_pred : ndarray of shape (n_samples,)\n Class labels for samples in X.\n \"\"\"\n y = super().predict(X)\n return np.asarray(y, dtype=np.intp)\n \n def _more_tags(self):\n return {'_xfail_checks': {'check_sample_weights_invariance': 'zero sample_weight is not equivalent to removing samples'}}\n" + "description": "Unsupervised Outlier Detection.\n\nEstimate the support of a high-dimensional distribution.\n\nThe implementation is based on libsvm.\n\nRead more in the :ref:`User Guide `.", + "docstring": "Unsupervised Outlier Detection.\n\n Estimate the support of a high-dimensional distribution.\n\n The implementation is based on libsvm.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n kernel : {'linear', 'poly', 'rbf', 'sigmoid', 'precomputed'} or callable, default='rbf'\n Specifies the kernel type to be used in the algorithm.\n If none is given, 'rbf' will be used. If a callable is given it is\n used to precompute the kernel matrix.\n\n degree : int, default=3\n Degree of the polynomial kernel function ('poly').\n Ignored by all other kernels.\n\n gamma : {'scale', 'auto'} or float, default='scale'\n Kernel coefficient for 'rbf', 'poly' and 'sigmoid'.\n\n - if ``gamma='scale'`` (default) is passed then it uses\n 1 / (n_features * X.var()) as value of gamma,\n - if 'auto', uses 1 / n_features.\n\n .. versionchanged:: 0.22\n The default value of ``gamma`` changed from 'auto' to 'scale'.\n\n coef0 : float, default=0.0\n Independent term in kernel function.\n It is only significant in 'poly' and 'sigmoid'.\n\n tol : float, default=1e-3\n Tolerance for stopping criterion.\n\n nu : float, default=0.5\n An upper bound on the fraction of training\n errors and a lower bound of the fraction of support\n vectors. Should be in the interval (0, 1]. By default 0.5\n will be taken.\n\n shrinking : bool, default=True\n Whether to use the shrinking heuristic.\n See the :ref:`User Guide `.\n\n cache_size : float, default=200\n Specify the size of the kernel cache (in MB).\n\n verbose : bool, default=False\n Enable verbose output. 
Note that this setting takes advantage of a\n per-process runtime setting in libsvm that, if enabled, may not work\n properly in a multithreaded context.\n\n max_iter : int, default=-1\n Hard limit on iterations within solver, or -1 for no limit.\n\n Attributes\n ----------\n class_weight_ : ndarray of shape (n_classes,)\n Multipliers of parameter C for each class.\n Computed based on the ``class_weight`` parameter.\n\n coef_ : ndarray of shape (1, n_features)\n Weights assigned to the features (coefficients in the primal\n problem). This is only available in the case of a linear kernel.\n\n `coef_` is readonly property derived from `dual_coef_` and\n `support_vectors_`.\n\n dual_coef_ : ndarray of shape (1, n_SV)\n Coefficients of the support vectors in the decision function.\n\n fit_status_ : int\n 0 if correctly fitted, 1 otherwise (will raise warning)\n\n intercept_ : ndarray of shape (1,)\n Constant in the decision function.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n n_support_ : ndarray of shape (n_classes,), dtype=int32\n Number of support vectors for each class.\n\n offset_ : float\n Offset used to define the decision function from the raw scores.\n We have the relation: decision_function = score_samples - `offset_`.\n The offset is the opposite of `intercept_` and is provided for\n consistency with other outlier detection algorithms.\n\n .. versionadded:: 0.20\n\n shape_fit_ : tuple of int of shape (n_dimensions_of_X,)\n Array dimensions of training vector ``X``.\n\n support_ : ndarray of shape (n_SV,)\n Indices of support vectors.\n\n support_vectors_ : ndarray of shape (n_SV, n_features)\n Support vectors.\n\n See Also\n --------\n sklearn.linear_model.SGDOneClassSVM : Solves linear One-Class SVM using\n Stochastic Gradient Descent.\n sklearn.neighbors.LocalOutlierFactor : Unsupervised Outlier Detection using\n Local Outlier Factor (LOF).\n sklearn.ensemble.IsolationForest : Isolation Forest Algorithm.\n\n Examples\n --------\n >>> from sklearn.svm import OneClassSVM\n >>> X = [[0], [0.44], [0.45], [0.46], [1]]\n >>> clf = OneClassSVM(gamma='auto').fit(X)\n >>> clf.predict(X)\n array([-1, 1, 1, 1, -1])\n >>> clf.score_samples(X)\n array([1.7798..., 2.0547..., 2.0556..., 2.0561..., 1.7332...])\n ", + "source_code": "\n\nclass OneClassSVM(OutlierMixin, BaseLibSVM):\n \"\"\"Unsupervised Outlier Detection.\n\n Estimate the support of a high-dimensional distribution.\n\n The implementation is based on libsvm.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n kernel : {'linear', 'poly', 'rbf', 'sigmoid', 'precomputed'} or callable, default='rbf'\n Specifies the kernel type to be used in the algorithm.\n If none is given, 'rbf' will be used. If a callable is given it is\n used to precompute the kernel matrix.\n\n degree : int, default=3\n Degree of the polynomial kernel function ('poly').\n Ignored by all other kernels.\n\n gamma : {'scale', 'auto'} or float, default='scale'\n Kernel coefficient for 'rbf', 'poly' and 'sigmoid'.\n\n - if ``gamma='scale'`` (default) is passed then it uses\n 1 / (n_features * X.var()) as value of gamma,\n - if 'auto', uses 1 / n_features.\n\n .. 
versionchanged:: 0.22\n The default value of ``gamma`` changed from 'auto' to 'scale'.\n\n coef0 : float, default=0.0\n Independent term in kernel function.\n It is only significant in 'poly' and 'sigmoid'.\n\n tol : float, default=1e-3\n Tolerance for stopping criterion.\n\n nu : float, default=0.5\n An upper bound on the fraction of training\n errors and a lower bound of the fraction of support\n vectors. Should be in the interval (0, 1]. By default 0.5\n will be taken.\n\n shrinking : bool, default=True\n Whether to use the shrinking heuristic.\n See the :ref:`User Guide `.\n\n cache_size : float, default=200\n Specify the size of the kernel cache (in MB).\n\n verbose : bool, default=False\n Enable verbose output. Note that this setting takes advantage of a\n per-process runtime setting in libsvm that, if enabled, may not work\n properly in a multithreaded context.\n\n max_iter : int, default=-1\n Hard limit on iterations within solver, or -1 for no limit.\n\n Attributes\n ----------\n class_weight_ : ndarray of shape (n_classes,)\n Multipliers of parameter C for each class.\n Computed based on the ``class_weight`` parameter.\n\n coef_ : ndarray of shape (1, n_features)\n Weights assigned to the features (coefficients in the primal\n problem). This is only available in the case of a linear kernel.\n\n `coef_` is readonly property derived from `dual_coef_` and\n `support_vectors_`.\n\n dual_coef_ : ndarray of shape (1, n_SV)\n Coefficients of the support vectors in the decision function.\n\n fit_status_ : int\n 0 if correctly fitted, 1 otherwise (will raise warning)\n\n intercept_ : ndarray of shape (1,)\n Constant in the decision function.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n n_support_ : ndarray of shape (n_classes,), dtype=int32\n Number of support vectors for each class.\n\n offset_ : float\n Offset used to define the decision function from the raw scores.\n We have the relation: decision_function = score_samples - `offset_`.\n The offset is the opposite of `intercept_` and is provided for\n consistency with other outlier detection algorithms.\n\n .. 
versionadded:: 0.20\n\n shape_fit_ : tuple of int of shape (n_dimensions_of_X,)\n Array dimensions of training vector ``X``.\n\n support_ : ndarray of shape (n_SV,)\n Indices of support vectors.\n\n support_vectors_ : ndarray of shape (n_SV, n_features)\n Support vectors.\n\n See Also\n --------\n sklearn.linear_model.SGDOneClassSVM : Solves linear One-Class SVM using\n Stochastic Gradient Descent.\n sklearn.neighbors.LocalOutlierFactor : Unsupervised Outlier Detection using\n Local Outlier Factor (LOF).\n sklearn.ensemble.IsolationForest : Isolation Forest Algorithm.\n\n Examples\n --------\n >>> from sklearn.svm import OneClassSVM\n >>> X = [[0], [0.44], [0.45], [0.46], [1]]\n >>> clf = OneClassSVM(gamma='auto').fit(X)\n >>> clf.predict(X)\n array([-1, 1, 1, 1, -1])\n >>> clf.score_samples(X)\n array([1.7798..., 2.0547..., 2.0556..., 2.0561..., 1.7332...])\n \"\"\"\n _impl = 'one_class'\n \n def __init__(self, *, kernel='rbf', degree=3, gamma='scale', coef0=0.0, tol=0.001, nu=0.5, shrinking=True, cache_size=200, verbose=False, max_iter=-1):\n super().__init__(kernel, degree, gamma, coef0, tol, 0.0, nu, 0.0, shrinking, False, cache_size, None, verbose, max_iter, random_state=None)\n \n def fit(self, X, y=None, sample_weight=None, **params):\n \"\"\"Detect the soft boundary of the set of samples X.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Set of samples, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Per-sample weights. Rescale C per sample. Higher weights\n force the classifier to put more emphasis on these points.\n\n **params : dict\n Additional fit parameters.\n\n .. deprecated:: 1.0\n The `fit` method will not longer accept extra keyword\n parameters in 1.2. These keyword parameters were\n already discarded.\n\n Returns\n -------\n self : object\n Fitted estimator.\n\n Notes\n -----\n If X is not a C-ordered contiguous array it is copied.\n \"\"\"\n if len(params) > 0:\n warnings.warn(f'Passing additional keyword parameters has no effect and is deprecated in 1.0. An error will be raised from 1.2 and beyond. 
The ignored keyword parameter(s) are: {params.keys()}.', FutureWarning)\n super().fit(X, np.ones(_num_samples(X)), sample_weight=sample_weight)\n self.offset_ = -self._intercept_\n return self\n \n def decision_function(self, X):\n \"\"\"Signed distance to the separating hyperplane.\n\n Signed distance is positive for an inlier and negative for an outlier.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The data matrix.\n\n Returns\n -------\n dec : ndarray of shape (n_samples,)\n Returns the decision function of the samples.\n \"\"\"\n dec = self._decision_function(X).ravel()\n return dec\n \n def score_samples(self, X):\n \"\"\"Raw scoring function of the samples.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The data matrix.\n\n Returns\n -------\n score_samples : ndarray of shape (n_samples,)\n Returns the (unshifted) scoring function of the samples.\n \"\"\"\n return self.decision_function(X) + self.offset_\n \n def predict(self, X):\n \"\"\"Perform classification on samples in X.\n\n For a one-class model, +1 or -1 is returned.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features) or (n_samples_test, n_samples_train)\n For kernel=\"precomputed\", the expected shape of X is\n (n_samples_test, n_samples_train).\n\n Returns\n -------\n y_pred : ndarray of shape (n_samples,)\n Class labels for samples in X.\n \"\"\"\n y = super().predict(X)\n return np.asarray(y, dtype=np.intp)\n \n def _more_tags(self):\n return {'_xfail_checks': {'check_sample_weights_invariance': 'zero sample_weight is not equivalent to removing samples'}}\n" }, { "name": "SVC", @@ -26436,9 +26532,9 @@ "sklearn.svm._classes.SVC._more_tags" ], "is_public": true, - "description": "C-Support Vector Classification.\n\nThe implementation is based on libsvm. The fit time scales at least quadratically with the number of samples and may be impractical beyond tens of thousands of samples. For large datasets consider using :class:`~sklearn.svm.LinearSVC` or :class:`~sklearn.linear_model.SGDClassifier` instead, possibly after a :class:`~sklearn.kernel_approximation.Nystroem` transformer. The multiclass support is handled according to a one-vs-one scheme. For details on the precise mathematical formulation of the provided kernel functions and how `gamma`, `coef0` and `degree` affect each other, see the corresponding section in the narrative documentation: :ref:`svm_kernels`. Read more in the :ref:`User Guide `.", - "docstring": "C-Support Vector Classification.\n\n The implementation is based on libsvm. The fit time scales at least\n quadratically with the number of samples and may be impractical\n beyond tens of thousands of samples. For large datasets\n consider using :class:`~sklearn.svm.LinearSVC` or\n :class:`~sklearn.linear_model.SGDClassifier` instead, possibly after a\n :class:`~sklearn.kernel_approximation.Nystroem` transformer.\n\n The multiclass support is handled according to a one-vs-one scheme.\n\n For details on the precise mathematical formulation of the provided\n kernel functions and how `gamma`, `coef0` and `degree` affect each\n other, see the corresponding section in the narrative documentation:\n :ref:`svm_kernels`.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n C : float, default=1.0\n Regularization parameter. The strength of the regularization is\n inversely proportional to C. Must be strictly positive. 
The penalty\n is a squared l2 penalty.\n\n kernel : {'linear', 'poly', 'rbf', 'sigmoid', 'precomputed'}, default='rbf'\n Specifies the kernel type to be used in the algorithm.\n It must be one of 'linear', 'poly', 'rbf', 'sigmoid', 'precomputed' or\n a callable.\n If none is given, 'rbf' will be used. If a callable is given it is\n used to pre-compute the kernel matrix from data matrices; that matrix\n should be an array of shape ``(n_samples, n_samples)``.\n\n degree : int, default=3\n Degree of the polynomial kernel function ('poly').\n Ignored by all other kernels.\n\n gamma : {'scale', 'auto'} or float, default='scale'\n Kernel coefficient for 'rbf', 'poly' and 'sigmoid'.\n\n - if ``gamma='scale'`` (default) is passed then it uses\n 1 / (n_features * X.var()) as value of gamma,\n - if 'auto', uses 1 / n_features.\n\n .. versionchanged:: 0.22\n The default value of ``gamma`` changed from 'auto' to 'scale'.\n\n coef0 : float, default=0.0\n Independent term in kernel function.\n It is only significant in 'poly' and 'sigmoid'.\n\n shrinking : bool, default=True\n Whether to use the shrinking heuristic.\n See the :ref:`User Guide `.\n\n probability : bool, default=False\n Whether to enable probability estimates. This must be enabled prior\n to calling `fit`, will slow down that method as it internally uses\n 5-fold cross-validation, and `predict_proba` may be inconsistent with\n `predict`. Read more in the :ref:`User Guide `.\n\n tol : float, default=1e-3\n Tolerance for stopping criterion.\n\n cache_size : float, default=200\n Specify the size of the kernel cache (in MB).\n\n class_weight : dict or 'balanced', default=None\n Set the parameter C of class i to class_weight[i]*C for\n SVC. If not given, all classes are supposed to have\n weight one.\n The \"balanced\" mode uses the values of y to automatically adjust\n weights inversely proportional to class frequencies in the input data\n as ``n_samples / (n_classes * np.bincount(y))``.\n\n verbose : bool, default=False\n Enable verbose output. Note that this setting takes advantage of a\n per-process runtime setting in libsvm that, if enabled, may not work\n properly in a multithreaded context.\n\n max_iter : int, default=-1\n Hard limit on iterations within solver, or -1 for no limit.\n\n decision_function_shape : {'ovo', 'ovr'}, default='ovr'\n Whether to return a one-vs-rest ('ovr') decision function of shape\n (n_samples, n_classes) as all other classifiers, or the original\n one-vs-one ('ovo') decision function of libsvm which has shape\n (n_samples, n_classes * (n_classes - 1) / 2). However, one-vs-one\n ('ovo') is always used as multi-class strategy. The parameter is\n ignored for binary classification.\n\n .. versionchanged:: 0.19\n decision_function_shape is 'ovr' by default.\n\n .. versionadded:: 0.17\n *decision_function_shape='ovr'* is recommended.\n\n .. versionchanged:: 0.17\n Deprecated *decision_function_shape='ovo' and None*.\n\n break_ties : bool, default=False\n If true, ``decision_function_shape='ovr'``, and number of classes > 2,\n :term:`predict` will break ties according to the confidence values of\n :term:`decision_function`; otherwise the first class among the tied\n classes is returned. Please note that breaking ties comes at a\n relatively high computational cost compared to a simple predict.\n\n .. versionadded:: 0.22\n\n random_state : int, RandomState instance or None, default=None\n Controls the pseudo random number generation for shuffling the data for\n probability estimates. 
Ignored when `probability` is False.\n Pass an int for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n Attributes\n ----------\n class_weight_ : ndarray of shape (n_classes,)\n Multipliers of parameter C for each class.\n Computed based on the ``class_weight`` parameter.\n\n classes_ : ndarray of shape (n_classes,)\n The classes labels.\n\n coef_ : ndarray of shape (n_classes * (n_classes - 1) / 2, n_features)\n Weights assigned to the features (coefficients in the primal\n problem). This is only available in the case of a linear kernel.\n\n `coef_` is a readonly property derived from `dual_coef_` and\n `support_vectors_`.\n\n dual_coef_ : ndarray of shape (n_classes -1, n_SV)\n Dual coefficients of the support vector in the decision\n function (see :ref:`sgd_mathematical_formulation`), multiplied by\n their targets.\n For multiclass, coefficient for all 1-vs-1 classifiers.\n The layout of the coefficients in the multiclass case is somewhat\n non-trivial. See the :ref:`multi-class section of the User Guide\n ` for details.\n\n fit_status_ : int\n 0 if correctly fitted, 1 otherwise (will raise warning)\n\n intercept_ : ndarray of shape (n_classes * (n_classes - 1) / 2,)\n Constants in decision function.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n support_ : ndarray of shape (n_SV)\n Indices of support vectors.\n\n support_vectors_ : ndarray of shape (n_SV, n_features)\n Support vectors.\n\n n_support_ : ndarray of shape (n_classes,), dtype=int32\n Number of support vectors for each class.\n\n probA_ : ndarray of shape (n_classes * (n_classes - 1) / 2)\n probB_ : ndarray of shape (n_classes * (n_classes - 1) / 2)\n If `probability=True`, it corresponds to the parameters learned in\n Platt scaling to produce probability estimates from decision values.\n If `probability=False`, it's an empty array. Platt scaling uses the\n logistic function\n ``1 / (1 + exp(decision_value * probA_ + probB_))``\n where ``probA_`` and ``probB_`` are learned from the dataset [2]_. For\n more information on the multiclass case and training procedure see\n section 8 of [1]_.\n\n shape_fit_ : tuple of int of shape (n_dimensions_of_X,)\n Array dimensions of training vector ``X``.\n\n See Also\n --------\n SVR : Support Vector Machine for Regression implemented using libsvm.\n\n LinearSVC : Scalable Linear Support Vector Machine for classification\n implemented using liblinear. Check the See Also section of\n LinearSVC for more comparison element.\n\n References\n ----------\n .. [1] `LIBSVM: A Library for Support Vector Machines\n `_\n\n .. [2] `Platt, John (1999). 
\"Probabilistic outputs for support vector\n machines and comparison to regularizedlikelihood methods.\"\n `_\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.pipeline import make_pipeline\n >>> from sklearn.preprocessing import StandardScaler\n >>> X = np.array([[-1, -1], [-2, -1], [1, 1], [2, 1]])\n >>> y = np.array([1, 1, 2, 2])\n >>> from sklearn.svm import SVC\n >>> clf = make_pipeline(StandardScaler(), SVC(gamma='auto'))\n >>> clf.fit(X, y)\n Pipeline(steps=[('standardscaler', StandardScaler()),\n ('svc', SVC(gamma='auto'))])\n\n >>> print(clf.predict([[-0.8, -1]]))\n [1]\n ", - "source_code": "\n\nclass SVC(BaseSVC):\n \"\"\"C-Support Vector Classification.\n\n The implementation is based on libsvm. The fit time scales at least\n quadratically with the number of samples and may be impractical\n beyond tens of thousands of samples. For large datasets\n consider using :class:`~sklearn.svm.LinearSVC` or\n :class:`~sklearn.linear_model.SGDClassifier` instead, possibly after a\n :class:`~sklearn.kernel_approximation.Nystroem` transformer.\n\n The multiclass support is handled according to a one-vs-one scheme.\n\n For details on the precise mathematical formulation of the provided\n kernel functions and how `gamma`, `coef0` and `degree` affect each\n other, see the corresponding section in the narrative documentation:\n :ref:`svm_kernels`.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n C : float, default=1.0\n Regularization parameter. The strength of the regularization is\n inversely proportional to C. Must be strictly positive. The penalty\n is a squared l2 penalty.\n\n kernel : {'linear', 'poly', 'rbf', 'sigmoid', 'precomputed'}, default='rbf'\n Specifies the kernel type to be used in the algorithm.\n It must be one of 'linear', 'poly', 'rbf', 'sigmoid', 'precomputed' or\n a callable.\n If none is given, 'rbf' will be used. If a callable is given it is\n used to pre-compute the kernel matrix from data matrices; that matrix\n should be an array of shape ``(n_samples, n_samples)``.\n\n degree : int, default=3\n Degree of the polynomial kernel function ('poly').\n Ignored by all other kernels.\n\n gamma : {'scale', 'auto'} or float, default='scale'\n Kernel coefficient for 'rbf', 'poly' and 'sigmoid'.\n\n - if ``gamma='scale'`` (default) is passed then it uses\n 1 / (n_features * X.var()) as value of gamma,\n - if 'auto', uses 1 / n_features.\n\n .. versionchanged:: 0.22\n The default value of ``gamma`` changed from 'auto' to 'scale'.\n\n coef0 : float, default=0.0\n Independent term in kernel function.\n It is only significant in 'poly' and 'sigmoid'.\n\n shrinking : bool, default=True\n Whether to use the shrinking heuristic.\n See the :ref:`User Guide `.\n\n probability : bool, default=False\n Whether to enable probability estimates. This must be enabled prior\n to calling `fit`, will slow down that method as it internally uses\n 5-fold cross-validation, and `predict_proba` may be inconsistent with\n `predict`. Read more in the :ref:`User Guide `.\n\n tol : float, default=1e-3\n Tolerance for stopping criterion.\n\n cache_size : float, default=200\n Specify the size of the kernel cache (in MB).\n\n class_weight : dict or 'balanced', default=None\n Set the parameter C of class i to class_weight[i]*C for\n SVC. 
If not given, all classes are supposed to have\n weight one.\n The \"balanced\" mode uses the values of y to automatically adjust\n weights inversely proportional to class frequencies in the input data\n as ``n_samples / (n_classes * np.bincount(y))``.\n\n verbose : bool, default=False\n Enable verbose output. Note that this setting takes advantage of a\n per-process runtime setting in libsvm that, if enabled, may not work\n properly in a multithreaded context.\n\n max_iter : int, default=-1\n Hard limit on iterations within solver, or -1 for no limit.\n\n decision_function_shape : {'ovo', 'ovr'}, default='ovr'\n Whether to return a one-vs-rest ('ovr') decision function of shape\n (n_samples, n_classes) as all other classifiers, or the original\n one-vs-one ('ovo') decision function of libsvm which has shape\n (n_samples, n_classes * (n_classes - 1) / 2). However, one-vs-one\n ('ovo') is always used as multi-class strategy. The parameter is\n ignored for binary classification.\n\n .. versionchanged:: 0.19\n decision_function_shape is 'ovr' by default.\n\n .. versionadded:: 0.17\n *decision_function_shape='ovr'* is recommended.\n\n .. versionchanged:: 0.17\n Deprecated *decision_function_shape='ovo' and None*.\n\n break_ties : bool, default=False\n If true, ``decision_function_shape='ovr'``, and number of classes > 2,\n :term:`predict` will break ties according to the confidence values of\n :term:`decision_function`; otherwise the first class among the tied\n classes is returned. Please note that breaking ties comes at a\n relatively high computational cost compared to a simple predict.\n\n .. versionadded:: 0.22\n\n random_state : int, RandomState instance or None, default=None\n Controls the pseudo random number generation for shuffling the data for\n probability estimates. Ignored when `probability` is False.\n Pass an int for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n Attributes\n ----------\n class_weight_ : ndarray of shape (n_classes,)\n Multipliers of parameter C for each class.\n Computed based on the ``class_weight`` parameter.\n\n classes_ : ndarray of shape (n_classes,)\n The classes labels.\n\n coef_ : ndarray of shape (n_classes * (n_classes - 1) / 2, n_features)\n Weights assigned to the features (coefficients in the primal\n problem). This is only available in the case of a linear kernel.\n\n `coef_` is a readonly property derived from `dual_coef_` and\n `support_vectors_`.\n\n dual_coef_ : ndarray of shape (n_classes -1, n_SV)\n Dual coefficients of the support vector in the decision\n function (see :ref:`sgd_mathematical_formulation`), multiplied by\n their targets.\n For multiclass, coefficient for all 1-vs-1 classifiers.\n The layout of the coefficients in the multiclass case is somewhat\n non-trivial. See the :ref:`multi-class section of the User Guide\n ` for details.\n\n fit_status_ : int\n 0 if correctly fitted, 1 otherwise (will raise warning)\n\n intercept_ : ndarray of shape (n_classes * (n_classes - 1) / 2,)\n Constants in decision function.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. 
versionadded:: 1.0\n\n support_ : ndarray of shape (n_SV)\n Indices of support vectors.\n\n support_vectors_ : ndarray of shape (n_SV, n_features)\n Support vectors.\n\n n_support_ : ndarray of shape (n_classes,), dtype=int32\n Number of support vectors for each class.\n\n probA_ : ndarray of shape (n_classes * (n_classes - 1) / 2)\n probB_ : ndarray of shape (n_classes * (n_classes - 1) / 2)\n If `probability=True`, it corresponds to the parameters learned in\n Platt scaling to produce probability estimates from decision values.\n If `probability=False`, it's an empty array. Platt scaling uses the\n logistic function\n ``1 / (1 + exp(decision_value * probA_ + probB_))``\n where ``probA_`` and ``probB_`` are learned from the dataset [2]_. For\n more information on the multiclass case and training procedure see\n section 8 of [1]_.\n\n shape_fit_ : tuple of int of shape (n_dimensions_of_X,)\n Array dimensions of training vector ``X``.\n\n See Also\n --------\n SVR : Support Vector Machine for Regression implemented using libsvm.\n\n LinearSVC : Scalable Linear Support Vector Machine for classification\n implemented using liblinear. Check the See Also section of\n LinearSVC for more comparison element.\n\n References\n ----------\n .. [1] `LIBSVM: A Library for Support Vector Machines\n `_\n\n .. [2] `Platt, John (1999). \"Probabilistic outputs for support vector\n machines and comparison to regularizedlikelihood methods.\"\n `_\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.pipeline import make_pipeline\n >>> from sklearn.preprocessing import StandardScaler\n >>> X = np.array([[-1, -1], [-2, -1], [1, 1], [2, 1]])\n >>> y = np.array([1, 1, 2, 2])\n >>> from sklearn.svm import SVC\n >>> clf = make_pipeline(StandardScaler(), SVC(gamma='auto'))\n >>> clf.fit(X, y)\n Pipeline(steps=[('standardscaler', StandardScaler()),\n ('svc', SVC(gamma='auto'))])\n\n >>> print(clf.predict([[-0.8, -1]]))\n [1]\n \"\"\"\n _impl = 'c_svc'\n \n def __init__(self, *, C=1.0, kernel='rbf', degree=3, gamma='scale', coef0=0.0, shrinking=True, probability=False, tol=0.001, cache_size=200, class_weight=None, verbose=False, max_iter=-1, decision_function_shape='ovr', break_ties=False, random_state=None):\n super().__init__(kernel=kernel, degree=degree, gamma=gamma, coef0=coef0, tol=tol, C=C, nu=0.0, shrinking=shrinking, probability=probability, cache_size=cache_size, class_weight=class_weight, verbose=verbose, max_iter=max_iter, decision_function_shape=decision_function_shape, break_ties=break_ties, random_state=random_state)\n \n def _more_tags(self):\n return {'_xfail_checks': {'check_sample_weights_invariance': 'zero sample_weight is not equivalent to removing samples'}}\n" + "description": "C-Support Vector Classification.\n\nThe implementation is based on libsvm. The fit time scales at least\nquadratically with the number of samples and may be impractical\nbeyond tens of thousands of samples. 
For large datasets\nconsider using :class:`~sklearn.svm.LinearSVC` or\n:class:`~sklearn.linear_model.SGDClassifier` instead, possibly after a\n:class:`~sklearn.kernel_approximation.Nystroem` transformer.\n\nThe multiclass support is handled according to a one-vs-one scheme.\n\nFor details on the precise mathematical formulation of the provided\nkernel functions and how `gamma`, `coef0` and `degree` affect each\nother, see the corresponding section in the narrative documentation:\n:ref:`svm_kernels`.\n\nRead more in the :ref:`User Guide `.", + "docstring": "C-Support Vector Classification.\n\n The implementation is based on libsvm. The fit time scales at least\n quadratically with the number of samples and may be impractical\n beyond tens of thousands of samples. For large datasets\n consider using :class:`~sklearn.svm.LinearSVC` or\n :class:`~sklearn.linear_model.SGDClassifier` instead, possibly after a\n :class:`~sklearn.kernel_approximation.Nystroem` transformer.\n\n The multiclass support is handled according to a one-vs-one scheme.\n\n For details on the precise mathematical formulation of the provided\n kernel functions and how `gamma`, `coef0` and `degree` affect each\n other, see the corresponding section in the narrative documentation:\n :ref:`svm_kernels`.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n C : float, default=1.0\n Regularization parameter. The strength of the regularization is\n inversely proportional to C. Must be strictly positive. The penalty\n is a squared l2 penalty.\n\n kernel : {'linear', 'poly', 'rbf', 'sigmoid', 'precomputed'} or callable, default='rbf'\n Specifies the kernel type to be used in the algorithm.\n If none is given, 'rbf' will be used. If a callable is given it is\n used to pre-compute the kernel matrix from data matrices; that matrix\n should be an array of shape ``(n_samples, n_samples)``.\n\n degree : int, default=3\n Degree of the polynomial kernel function ('poly').\n Ignored by all other kernels.\n\n gamma : {'scale', 'auto'} or float, default='scale'\n Kernel coefficient for 'rbf', 'poly' and 'sigmoid'.\n\n - if ``gamma='scale'`` (default) is passed then it uses\n 1 / (n_features * X.var()) as value of gamma,\n - if 'auto', uses 1 / n_features.\n\n .. versionchanged:: 0.22\n The default value of ``gamma`` changed from 'auto' to 'scale'.\n\n coef0 : float, default=0.0\n Independent term in kernel function.\n It is only significant in 'poly' and 'sigmoid'.\n\n shrinking : bool, default=True\n Whether to use the shrinking heuristic.\n See the :ref:`User Guide `.\n\n probability : bool, default=False\n Whether to enable probability estimates. This must be enabled prior\n to calling `fit`, will slow down that method as it internally uses\n 5-fold cross-validation, and `predict_proba` may be inconsistent with\n `predict`. Read more in the :ref:`User Guide `.\n\n tol : float, default=1e-3\n Tolerance for stopping criterion.\n\n cache_size : float, default=200\n Specify the size of the kernel cache (in MB).\n\n class_weight : dict or 'balanced', default=None\n Set the parameter C of class i to class_weight[i]*C for\n SVC. If not given, all classes are supposed to have\n weight one.\n The \"balanced\" mode uses the values of y to automatically adjust\n weights inversely proportional to class frequencies in the input data\n as ``n_samples / (n_classes * np.bincount(y))``.\n\n verbose : bool, default=False\n Enable verbose output. 
Note that this setting takes advantage of a\n per-process runtime setting in libsvm that, if enabled, may not work\n properly in a multithreaded context.\n\n max_iter : int, default=-1\n Hard limit on iterations within solver, or -1 for no limit.\n\n decision_function_shape : {'ovo', 'ovr'}, default='ovr'\n Whether to return a one-vs-rest ('ovr') decision function of shape\n (n_samples, n_classes) as all other classifiers, or the original\n one-vs-one ('ovo') decision function of libsvm which has shape\n (n_samples, n_classes * (n_classes - 1) / 2). However, one-vs-one\n ('ovo') is always used as multi-class strategy. The parameter is\n ignored for binary classification.\n\n .. versionchanged:: 0.19\n decision_function_shape is 'ovr' by default.\n\n .. versionadded:: 0.17\n *decision_function_shape='ovr'* is recommended.\n\n .. versionchanged:: 0.17\n Deprecated *decision_function_shape='ovo' and None*.\n\n break_ties : bool, default=False\n If true, ``decision_function_shape='ovr'``, and number of classes > 2,\n :term:`predict` will break ties according to the confidence values of\n :term:`decision_function`; otherwise the first class among the tied\n classes is returned. Please note that breaking ties comes at a\n relatively high computational cost compared to a simple predict.\n\n .. versionadded:: 0.22\n\n random_state : int, RandomState instance or None, default=None\n Controls the pseudo random number generation for shuffling the data for\n probability estimates. Ignored when `probability` is False.\n Pass an int for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n Attributes\n ----------\n class_weight_ : ndarray of shape (n_classes,)\n Multipliers of parameter C for each class.\n Computed based on the ``class_weight`` parameter.\n\n classes_ : ndarray of shape (n_classes,)\n The classes labels.\n\n coef_ : ndarray of shape (n_classes * (n_classes - 1) / 2, n_features)\n Weights assigned to the features (coefficients in the primal\n problem). This is only available in the case of a linear kernel.\n\n `coef_` is a readonly property derived from `dual_coef_` and\n `support_vectors_`.\n\n dual_coef_ : ndarray of shape (n_classes -1, n_SV)\n Dual coefficients of the support vector in the decision\n function (see :ref:`sgd_mathematical_formulation`), multiplied by\n their targets.\n For multiclass, coefficient for all 1-vs-1 classifiers.\n The layout of the coefficients in the multiclass case is somewhat\n non-trivial. See the :ref:`multi-class section of the User Guide\n ` for details.\n\n fit_status_ : int\n 0 if correctly fitted, 1 otherwise (will raise warning)\n\n intercept_ : ndarray of shape (n_classes * (n_classes - 1) / 2,)\n Constants in decision function.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. 
versionadded:: 1.0\n\n support_ : ndarray of shape (n_SV)\n Indices of support vectors.\n\n support_vectors_ : ndarray of shape (n_SV, n_features)\n Support vectors.\n\n n_support_ : ndarray of shape (n_classes,), dtype=int32\n Number of support vectors for each class.\n\n probA_ : ndarray of shape (n_classes * (n_classes - 1) / 2)\n probB_ : ndarray of shape (n_classes * (n_classes - 1) / 2)\n If `probability=True`, it corresponds to the parameters learned in\n Platt scaling to produce probability estimates from decision values.\n If `probability=False`, it's an empty array. Platt scaling uses the\n logistic function\n ``1 / (1 + exp(decision_value * probA_ + probB_))``\n where ``probA_`` and ``probB_`` are learned from the dataset [2]_. For\n more information on the multiclass case and training procedure see\n section 8 of [1]_.\n\n shape_fit_ : tuple of int of shape (n_dimensions_of_X,)\n Array dimensions of training vector ``X``.\n\n See Also\n --------\n SVR : Support Vector Machine for Regression implemented using libsvm.\n\n LinearSVC : Scalable Linear Support Vector Machine for classification\n implemented using liblinear. Check the See Also section of\n LinearSVC for more comparison element.\n\n References\n ----------\n .. [1] `LIBSVM: A Library for Support Vector Machines\n `_\n\n .. [2] `Platt, John (1999). \"Probabilistic outputs for support vector\n machines and comparison to regularizedlikelihood methods.\"\n `_\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.pipeline import make_pipeline\n >>> from sklearn.preprocessing import StandardScaler\n >>> X = np.array([[-1, -1], [-2, -1], [1, 1], [2, 1]])\n >>> y = np.array([1, 1, 2, 2])\n >>> from sklearn.svm import SVC\n >>> clf = make_pipeline(StandardScaler(), SVC(gamma='auto'))\n >>> clf.fit(X, y)\n Pipeline(steps=[('standardscaler', StandardScaler()),\n ('svc', SVC(gamma='auto'))])\n\n >>> print(clf.predict([[-0.8, -1]]))\n [1]\n ", + "source_code": "\n\nclass SVC(BaseSVC):\n \"\"\"C-Support Vector Classification.\n\n The implementation is based on libsvm. The fit time scales at least\n quadratically with the number of samples and may be impractical\n beyond tens of thousands of samples. For large datasets\n consider using :class:`~sklearn.svm.LinearSVC` or\n :class:`~sklearn.linear_model.SGDClassifier` instead, possibly after a\n :class:`~sklearn.kernel_approximation.Nystroem` transformer.\n\n The multiclass support is handled according to a one-vs-one scheme.\n\n For details on the precise mathematical formulation of the provided\n kernel functions and how `gamma`, `coef0` and `degree` affect each\n other, see the corresponding section in the narrative documentation:\n :ref:`svm_kernels`.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n C : float, default=1.0\n Regularization parameter. The strength of the regularization is\n inversely proportional to C. Must be strictly positive. The penalty\n is a squared l2 penalty.\n\n kernel : {'linear', 'poly', 'rbf', 'sigmoid', 'precomputed'} or callable, default='rbf'\n Specifies the kernel type to be used in the algorithm.\n If none is given, 'rbf' will be used. 
If a callable is given it is\n used to pre-compute the kernel matrix from data matrices; that matrix\n should be an array of shape ``(n_samples, n_samples)``.\n\n degree : int, default=3\n Degree of the polynomial kernel function ('poly').\n Ignored by all other kernels.\n\n gamma : {'scale', 'auto'} or float, default='scale'\n Kernel coefficient for 'rbf', 'poly' and 'sigmoid'.\n\n - if ``gamma='scale'`` (default) is passed then it uses\n 1 / (n_features * X.var()) as value of gamma,\n - if 'auto', uses 1 / n_features.\n\n .. versionchanged:: 0.22\n The default value of ``gamma`` changed from 'auto' to 'scale'.\n\n coef0 : float, default=0.0\n Independent term in kernel function.\n It is only significant in 'poly' and 'sigmoid'.\n\n shrinking : bool, default=True\n Whether to use the shrinking heuristic.\n See the :ref:`User Guide `.\n\n probability : bool, default=False\n Whether to enable probability estimates. This must be enabled prior\n to calling `fit`, will slow down that method as it internally uses\n 5-fold cross-validation, and `predict_proba` may be inconsistent with\n `predict`. Read more in the :ref:`User Guide `.\n\n tol : float, default=1e-3\n Tolerance for stopping criterion.\n\n cache_size : float, default=200\n Specify the size of the kernel cache (in MB).\n\n class_weight : dict or 'balanced', default=None\n Set the parameter C of class i to class_weight[i]*C for\n SVC. If not given, all classes are supposed to have\n weight one.\n The \"balanced\" mode uses the values of y to automatically adjust\n weights inversely proportional to class frequencies in the input data\n as ``n_samples / (n_classes * np.bincount(y))``.\n\n verbose : bool, default=False\n Enable verbose output. Note that this setting takes advantage of a\n per-process runtime setting in libsvm that, if enabled, may not work\n properly in a multithreaded context.\n\n max_iter : int, default=-1\n Hard limit on iterations within solver, or -1 for no limit.\n\n decision_function_shape : {'ovo', 'ovr'}, default='ovr'\n Whether to return a one-vs-rest ('ovr') decision function of shape\n (n_samples, n_classes) as all other classifiers, or the original\n one-vs-one ('ovo') decision function of libsvm which has shape\n (n_samples, n_classes * (n_classes - 1) / 2). However, one-vs-one\n ('ovo') is always used as multi-class strategy. The parameter is\n ignored for binary classification.\n\n .. versionchanged:: 0.19\n decision_function_shape is 'ovr' by default.\n\n .. versionadded:: 0.17\n *decision_function_shape='ovr'* is recommended.\n\n .. versionchanged:: 0.17\n Deprecated *decision_function_shape='ovo' and None*.\n\n break_ties : bool, default=False\n If true, ``decision_function_shape='ovr'``, and number of classes > 2,\n :term:`predict` will break ties according to the confidence values of\n :term:`decision_function`; otherwise the first class among the tied\n classes is returned. Please note that breaking ties comes at a\n relatively high computational cost compared to a simple predict.\n\n .. versionadded:: 0.22\n\n random_state : int, RandomState instance or None, default=None\n Controls the pseudo random number generation for shuffling the data for\n probability estimates. 
Ignored when `probability` is False.\n Pass an int for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n Attributes\n ----------\n class_weight_ : ndarray of shape (n_classes,)\n Multipliers of parameter C for each class.\n Computed based on the ``class_weight`` parameter.\n\n classes_ : ndarray of shape (n_classes,)\n The classes labels.\n\n coef_ : ndarray of shape (n_classes * (n_classes - 1) / 2, n_features)\n Weights assigned to the features (coefficients in the primal\n problem). This is only available in the case of a linear kernel.\n\n `coef_` is a readonly property derived from `dual_coef_` and\n `support_vectors_`.\n\n dual_coef_ : ndarray of shape (n_classes -1, n_SV)\n Dual coefficients of the support vector in the decision\n function (see :ref:`sgd_mathematical_formulation`), multiplied by\n their targets.\n For multiclass, coefficient for all 1-vs-1 classifiers.\n The layout of the coefficients in the multiclass case is somewhat\n non-trivial. See the :ref:`multi-class section of the User Guide\n ` for details.\n\n fit_status_ : int\n 0 if correctly fitted, 1 otherwise (will raise warning)\n\n intercept_ : ndarray of shape (n_classes * (n_classes - 1) / 2,)\n Constants in decision function.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n support_ : ndarray of shape (n_SV)\n Indices of support vectors.\n\n support_vectors_ : ndarray of shape (n_SV, n_features)\n Support vectors.\n\n n_support_ : ndarray of shape (n_classes,), dtype=int32\n Number of support vectors for each class.\n\n probA_ : ndarray of shape (n_classes * (n_classes - 1) / 2)\n probB_ : ndarray of shape (n_classes * (n_classes - 1) / 2)\n If `probability=True`, it corresponds to the parameters learned in\n Platt scaling to produce probability estimates from decision values.\n If `probability=False`, it's an empty array. Platt scaling uses the\n logistic function\n ``1 / (1 + exp(decision_value * probA_ + probB_))``\n where ``probA_`` and ``probB_`` are learned from the dataset [2]_. For\n more information on the multiclass case and training procedure see\n section 8 of [1]_.\n\n shape_fit_ : tuple of int of shape (n_dimensions_of_X,)\n Array dimensions of training vector ``X``.\n\n See Also\n --------\n SVR : Support Vector Machine for Regression implemented using libsvm.\n\n LinearSVC : Scalable Linear Support Vector Machine for classification\n implemented using liblinear. Check the See Also section of\n LinearSVC for more comparison element.\n\n References\n ----------\n .. [1] `LIBSVM: A Library for Support Vector Machines\n `_\n\n .. [2] `Platt, John (1999). 
\"Probabilistic outputs for support vector\n machines and comparison to regularizedlikelihood methods.\"\n `_\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.pipeline import make_pipeline\n >>> from sklearn.preprocessing import StandardScaler\n >>> X = np.array([[-1, -1], [-2, -1], [1, 1], [2, 1]])\n >>> y = np.array([1, 1, 2, 2])\n >>> from sklearn.svm import SVC\n >>> clf = make_pipeline(StandardScaler(), SVC(gamma='auto'))\n >>> clf.fit(X, y)\n Pipeline(steps=[('standardscaler', StandardScaler()),\n ('svc', SVC(gamma='auto'))])\n\n >>> print(clf.predict([[-0.8, -1]]))\n [1]\n \"\"\"\n _impl = 'c_svc'\n \n def __init__(self, *, C=1.0, kernel='rbf', degree=3, gamma='scale', coef0=0.0, shrinking=True, probability=False, tol=0.001, cache_size=200, class_weight=None, verbose=False, max_iter=-1, decision_function_shape='ovr', break_ties=False, random_state=None):\n super().__init__(kernel=kernel, degree=degree, gamma=gamma, coef0=coef0, tol=tol, C=C, nu=0.0, shrinking=shrinking, probability=probability, cache_size=cache_size, class_weight=class_weight, verbose=verbose, max_iter=max_iter, decision_function_shape=decision_function_shape, break_ties=break_ties, random_state=random_state)\n \n def _more_tags(self):\n return {'_xfail_checks': {'check_sample_weights_invariance': 'zero sample_weight is not equivalent to removing samples'}}\n" }, { "name": "SVR", @@ -26450,9 +26546,9 @@ "sklearn.svm._classes.SVR._more_tags" ], "is_public": true, - "description": "Epsilon-Support Vector Regression.\n\nThe free parameters in the model are C and epsilon. The implementation is based on libsvm. The fit time complexity is more than quadratic with the number of samples which makes it hard to scale to datasets with more than a couple of 10000 samples. For large datasets consider using :class:`~sklearn.svm.LinearSVR` or :class:`~sklearn.linear_model.SGDRegressor` instead, possibly after a :class:`~sklearn.kernel_approximation.Nystroem` transformer. Read more in the :ref:`User Guide `.", - "docstring": "Epsilon-Support Vector Regression.\n\n The free parameters in the model are C and epsilon.\n\n The implementation is based on libsvm. The fit time complexity\n is more than quadratic with the number of samples which makes it hard\n to scale to datasets with more than a couple of 10000 samples. For large\n datasets consider using :class:`~sklearn.svm.LinearSVR` or\n :class:`~sklearn.linear_model.SGDRegressor` instead, possibly after a\n :class:`~sklearn.kernel_approximation.Nystroem` transformer.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n kernel : {'linear', 'poly', 'rbf', 'sigmoid', 'precomputed'}, default='rbf'\n Specifies the kernel type to be used in the algorithm.\n It must be one of 'linear', 'poly', 'rbf', 'sigmoid', 'precomputed' or\n a callable.\n If none is given, 'rbf' will be used. If a callable is given it is\n used to precompute the kernel matrix.\n\n degree : int, default=3\n Degree of the polynomial kernel function ('poly').\n Ignored by all other kernels.\n\n gamma : {'scale', 'auto'} or float, default='scale'\n Kernel coefficient for 'rbf', 'poly' and 'sigmoid'.\n\n - if ``gamma='scale'`` (default) is passed then it uses\n 1 / (n_features * X.var()) as value of gamma,\n - if 'auto', uses 1 / n_features.\n\n .. 
versionchanged:: 0.22\n The default value of ``gamma`` changed from 'auto' to 'scale'.\n\n coef0 : float, default=0.0\n Independent term in kernel function.\n It is only significant in 'poly' and 'sigmoid'.\n\n tol : float, default=1e-3\n Tolerance for stopping criterion.\n\n C : float, default=1.0\n Regularization parameter. The strength of the regularization is\n inversely proportional to C. Must be strictly positive.\n The penalty is a squared l2 penalty.\n\n epsilon : float, default=0.1\n Epsilon in the epsilon-SVR model. It specifies the epsilon-tube\n within which no penalty is associated in the training loss function\n with points predicted within a distance epsilon from the actual\n value.\n\n shrinking : bool, default=True\n Whether to use the shrinking heuristic.\n See the :ref:`User Guide `.\n\n cache_size : float, default=200\n Specify the size of the kernel cache (in MB).\n\n verbose : bool, default=False\n Enable verbose output. Note that this setting takes advantage of a\n per-process runtime setting in libsvm that, if enabled, may not work\n properly in a multithreaded context.\n\n max_iter : int, default=-1\n Hard limit on iterations within solver, or -1 for no limit.\n\n Attributes\n ----------\n class_weight_ : ndarray of shape (n_classes,)\n Multipliers of parameter C for each class.\n Computed based on the ``class_weight`` parameter.\n\n coef_ : ndarray of shape (1, n_features)\n Weights assigned to the features (coefficients in the primal\n problem). This is only available in the case of a linear kernel.\n\n `coef_` is readonly property derived from `dual_coef_` and\n `support_vectors_`.\n\n dual_coef_ : ndarray of shape (1, n_SV)\n Coefficients of the support vector in the decision function.\n\n fit_status_ : int\n 0 if correctly fitted, 1 otherwise (will raise warning)\n\n intercept_ : ndarray of shape (1,)\n Constants in decision function.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n n_support_ : ndarray of shape (n_classes,), dtype=int32\n Number of support vectors for each class.\n\n shape_fit_ : tuple of int of shape (n_dimensions_of_X,)\n Array dimensions of training vector ``X``.\n\n support_ : ndarray of shape (n_SV,)\n Indices of support vectors.\n\n support_vectors_ : ndarray of shape (n_SV, n_features)\n Support vectors.\n\n See Also\n --------\n NuSVR : Support Vector Machine for regression implemented using libsvm\n using a parameter to control the number of support vectors.\n\n LinearSVR : Scalable Linear Support Vector Machine for regression\n implemented using liblinear.\n\n References\n ----------\n .. [1] `LIBSVM: A Library for Support Vector Machines\n `_\n\n .. [2] `Platt, John (1999). 
\"Probabilistic outputs for support vector\n machines and comparison to regularizedlikelihood methods.\"\n `_\n\n Examples\n --------\n >>> from sklearn.svm import SVR\n >>> from sklearn.pipeline import make_pipeline\n >>> from sklearn.preprocessing import StandardScaler\n >>> import numpy as np\n >>> n_samples, n_features = 10, 5\n >>> rng = np.random.RandomState(0)\n >>> y = rng.randn(n_samples)\n >>> X = rng.randn(n_samples, n_features)\n >>> regr = make_pipeline(StandardScaler(), SVR(C=1.0, epsilon=0.2))\n >>> regr.fit(X, y)\n Pipeline(steps=[('standardscaler', StandardScaler()),\n ('svr', SVR(epsilon=0.2))])\n ", - "source_code": "\n\nclass SVR(RegressorMixin, BaseLibSVM):\n \"\"\"Epsilon-Support Vector Regression.\n\n The free parameters in the model are C and epsilon.\n\n The implementation is based on libsvm. The fit time complexity\n is more than quadratic with the number of samples which makes it hard\n to scale to datasets with more than a couple of 10000 samples. For large\n datasets consider using :class:`~sklearn.svm.LinearSVR` or\n :class:`~sklearn.linear_model.SGDRegressor` instead, possibly after a\n :class:`~sklearn.kernel_approximation.Nystroem` transformer.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n kernel : {'linear', 'poly', 'rbf', 'sigmoid', 'precomputed'}, default='rbf'\n Specifies the kernel type to be used in the algorithm.\n It must be one of 'linear', 'poly', 'rbf', 'sigmoid', 'precomputed' or\n a callable.\n If none is given, 'rbf' will be used. If a callable is given it is\n used to precompute the kernel matrix.\n\n degree : int, default=3\n Degree of the polynomial kernel function ('poly').\n Ignored by all other kernels.\n\n gamma : {'scale', 'auto'} or float, default='scale'\n Kernel coefficient for 'rbf', 'poly' and 'sigmoid'.\n\n - if ``gamma='scale'`` (default) is passed then it uses\n 1 / (n_features * X.var()) as value of gamma,\n - if 'auto', uses 1 / n_features.\n\n .. versionchanged:: 0.22\n The default value of ``gamma`` changed from 'auto' to 'scale'.\n\n coef0 : float, default=0.0\n Independent term in kernel function.\n It is only significant in 'poly' and 'sigmoid'.\n\n tol : float, default=1e-3\n Tolerance for stopping criterion.\n\n C : float, default=1.0\n Regularization parameter. The strength of the regularization is\n inversely proportional to C. Must be strictly positive.\n The penalty is a squared l2 penalty.\n\n epsilon : float, default=0.1\n Epsilon in the epsilon-SVR model. It specifies the epsilon-tube\n within which no penalty is associated in the training loss function\n with points predicted within a distance epsilon from the actual\n value.\n\n shrinking : bool, default=True\n Whether to use the shrinking heuristic.\n See the :ref:`User Guide `.\n\n cache_size : float, default=200\n Specify the size of the kernel cache (in MB).\n\n verbose : bool, default=False\n Enable verbose output. Note that this setting takes advantage of a\n per-process runtime setting in libsvm that, if enabled, may not work\n properly in a multithreaded context.\n\n max_iter : int, default=-1\n Hard limit on iterations within solver, or -1 for no limit.\n\n Attributes\n ----------\n class_weight_ : ndarray of shape (n_classes,)\n Multipliers of parameter C for each class.\n Computed based on the ``class_weight`` parameter.\n\n coef_ : ndarray of shape (1, n_features)\n Weights assigned to the features (coefficients in the primal\n problem). 
This is only available in the case of a linear kernel.\n\n `coef_` is readonly property derived from `dual_coef_` and\n `support_vectors_`.\n\n dual_coef_ : ndarray of shape (1, n_SV)\n Coefficients of the support vector in the decision function.\n\n fit_status_ : int\n 0 if correctly fitted, 1 otherwise (will raise warning)\n\n intercept_ : ndarray of shape (1,)\n Constants in decision function.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n n_support_ : ndarray of shape (n_classes,), dtype=int32\n Number of support vectors for each class.\n\n shape_fit_ : tuple of int of shape (n_dimensions_of_X,)\n Array dimensions of training vector ``X``.\n\n support_ : ndarray of shape (n_SV,)\n Indices of support vectors.\n\n support_vectors_ : ndarray of shape (n_SV, n_features)\n Support vectors.\n\n See Also\n --------\n NuSVR : Support Vector Machine for regression implemented using libsvm\n using a parameter to control the number of support vectors.\n\n LinearSVR : Scalable Linear Support Vector Machine for regression\n implemented using liblinear.\n\n References\n ----------\n .. [1] `LIBSVM: A Library for Support Vector Machines\n `_\n\n .. [2] `Platt, John (1999). \"Probabilistic outputs for support vector\n machines and comparison to regularizedlikelihood methods.\"\n `_\n\n Examples\n --------\n >>> from sklearn.svm import SVR\n >>> from sklearn.pipeline import make_pipeline\n >>> from sklearn.preprocessing import StandardScaler\n >>> import numpy as np\n >>> n_samples, n_features = 10, 5\n >>> rng = np.random.RandomState(0)\n >>> y = rng.randn(n_samples)\n >>> X = rng.randn(n_samples, n_features)\n >>> regr = make_pipeline(StandardScaler(), SVR(C=1.0, epsilon=0.2))\n >>> regr.fit(X, y)\n Pipeline(steps=[('standardscaler', StandardScaler()),\n ('svr', SVR(epsilon=0.2))])\n \"\"\"\n _impl = 'epsilon_svr'\n \n def __init__(self, *, kernel='rbf', degree=3, gamma='scale', coef0=0.0, tol=0.001, C=1.0, epsilon=0.1, shrinking=True, cache_size=200, verbose=False, max_iter=-1):\n super().__init__(kernel=kernel, degree=degree, gamma=gamma, coef0=coef0, tol=tol, C=C, nu=0.0, epsilon=epsilon, verbose=verbose, shrinking=shrinking, probability=False, cache_size=cache_size, class_weight=None, max_iter=max_iter, random_state=None)\n \n def _more_tags(self):\n return {'_xfail_checks': {'check_sample_weights_invariance': 'zero sample_weight is not equivalent to removing samples'}}\n" + "description": "Epsilon-Support Vector Regression.\n\nThe free parameters in the model are C and epsilon.\n\nThe implementation is based on libsvm. The fit time complexity\nis more than quadratic with the number of samples which makes it hard\nto scale to datasets with more than a couple of 10000 samples. For large\ndatasets consider using :class:`~sklearn.svm.LinearSVR` or\n:class:`~sklearn.linear_model.SGDRegressor` instead, possibly after a\n:class:`~sklearn.kernel_approximation.Nystroem` transformer.\n\nRead more in the :ref:`User Guide `.", + "docstring": "Epsilon-Support Vector Regression.\n\n The free parameters in the model are C and epsilon.\n\n The implementation is based on libsvm. The fit time complexity\n is more than quadratic with the number of samples which makes it hard\n to scale to datasets with more than a couple of 10000 samples. 
For large\n datasets consider using :class:`~sklearn.svm.LinearSVR` or\n :class:`~sklearn.linear_model.SGDRegressor` instead, possibly after a\n :class:`~sklearn.kernel_approximation.Nystroem` transformer.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n kernel : {'linear', 'poly', 'rbf', 'sigmoid', 'precomputed'} or callable, default='rbf'\n Specifies the kernel type to be used in the algorithm.\n If none is given, 'rbf' will be used. If a callable is given it is\n used to precompute the kernel matrix.\n\n degree : int, default=3\n Degree of the polynomial kernel function ('poly').\n Ignored by all other kernels.\n\n gamma : {'scale', 'auto'} or float, default='scale'\n Kernel coefficient for 'rbf', 'poly' and 'sigmoid'.\n\n - if ``gamma='scale'`` (default) is passed then it uses\n 1 / (n_features * X.var()) as value of gamma,\n - if 'auto', uses 1 / n_features.\n\n .. versionchanged:: 0.22\n The default value of ``gamma`` changed from 'auto' to 'scale'.\n\n coef0 : float, default=0.0\n Independent term in kernel function.\n It is only significant in 'poly' and 'sigmoid'.\n\n tol : float, default=1e-3\n Tolerance for stopping criterion.\n\n C : float, default=1.0\n Regularization parameter. The strength of the regularization is\n inversely proportional to C. Must be strictly positive.\n The penalty is a squared l2 penalty.\n\n epsilon : float, default=0.1\n Epsilon in the epsilon-SVR model. It specifies the epsilon-tube\n within which no penalty is associated in the training loss function\n with points predicted within a distance epsilon from the actual\n value.\n\n shrinking : bool, default=True\n Whether to use the shrinking heuristic.\n See the :ref:`User Guide `.\n\n cache_size : float, default=200\n Specify the size of the kernel cache (in MB).\n\n verbose : bool, default=False\n Enable verbose output. Note that this setting takes advantage of a\n per-process runtime setting in libsvm that, if enabled, may not work\n properly in a multithreaded context.\n\n max_iter : int, default=-1\n Hard limit on iterations within solver, or -1 for no limit.\n\n Attributes\n ----------\n class_weight_ : ndarray of shape (n_classes,)\n Multipliers of parameter C for each class.\n Computed based on the ``class_weight`` parameter.\n\n coef_ : ndarray of shape (1, n_features)\n Weights assigned to the features (coefficients in the primal\n problem). This is only available in the case of a linear kernel.\n\n `coef_` is readonly property derived from `dual_coef_` and\n `support_vectors_`.\n\n dual_coef_ : ndarray of shape (1, n_SV)\n Coefficients of the support vector in the decision function.\n\n fit_status_ : int\n 0 if correctly fitted, 1 otherwise (will raise warning)\n\n intercept_ : ndarray of shape (1,)\n Constants in decision function.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. 
versionadded:: 1.0\n\n n_support_ : ndarray of shape (n_classes,), dtype=int32\n Number of support vectors for each class.\n\n shape_fit_ : tuple of int of shape (n_dimensions_of_X,)\n Array dimensions of training vector ``X``.\n\n support_ : ndarray of shape (n_SV,)\n Indices of support vectors.\n\n support_vectors_ : ndarray of shape (n_SV, n_features)\n Support vectors.\n\n See Also\n --------\n NuSVR : Support Vector Machine for regression implemented using libsvm\n using a parameter to control the number of support vectors.\n\n LinearSVR : Scalable Linear Support Vector Machine for regression\n implemented using liblinear.\n\n References\n ----------\n .. [1] `LIBSVM: A Library for Support Vector Machines\n `_\n\n .. [2] `Platt, John (1999). \"Probabilistic outputs for support vector\n machines and comparison to regularizedlikelihood methods.\"\n `_\n\n Examples\n --------\n >>> from sklearn.svm import SVR\n >>> from sklearn.pipeline import make_pipeline\n >>> from sklearn.preprocessing import StandardScaler\n >>> import numpy as np\n >>> n_samples, n_features = 10, 5\n >>> rng = np.random.RandomState(0)\n >>> y = rng.randn(n_samples)\n >>> X = rng.randn(n_samples, n_features)\n >>> regr = make_pipeline(StandardScaler(), SVR(C=1.0, epsilon=0.2))\n >>> regr.fit(X, y)\n Pipeline(steps=[('standardscaler', StandardScaler()),\n ('svr', SVR(epsilon=0.2))])\n ", + "source_code": "\n\nclass SVR(RegressorMixin, BaseLibSVM):\n \"\"\"Epsilon-Support Vector Regression.\n\n The free parameters in the model are C and epsilon.\n\n The implementation is based on libsvm. The fit time complexity\n is more than quadratic with the number of samples which makes it hard\n to scale to datasets with more than a couple of 10000 samples. For large\n datasets consider using :class:`~sklearn.svm.LinearSVR` or\n :class:`~sklearn.linear_model.SGDRegressor` instead, possibly after a\n :class:`~sklearn.kernel_approximation.Nystroem` transformer.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n kernel : {'linear', 'poly', 'rbf', 'sigmoid', 'precomputed'} or callable, default='rbf'\n Specifies the kernel type to be used in the algorithm.\n If none is given, 'rbf' will be used. If a callable is given it is\n used to precompute the kernel matrix.\n\n degree : int, default=3\n Degree of the polynomial kernel function ('poly').\n Ignored by all other kernels.\n\n gamma : {'scale', 'auto'} or float, default='scale'\n Kernel coefficient for 'rbf', 'poly' and 'sigmoid'.\n\n - if ``gamma='scale'`` (default) is passed then it uses\n 1 / (n_features * X.var()) as value of gamma,\n - if 'auto', uses 1 / n_features.\n\n .. versionchanged:: 0.22\n The default value of ``gamma`` changed from 'auto' to 'scale'.\n\n coef0 : float, default=0.0\n Independent term in kernel function.\n It is only significant in 'poly' and 'sigmoid'.\n\n tol : float, default=1e-3\n Tolerance for stopping criterion.\n\n C : float, default=1.0\n Regularization parameter. The strength of the regularization is\n inversely proportional to C. Must be strictly positive.\n The penalty is a squared l2 penalty.\n\n epsilon : float, default=0.1\n Epsilon in the epsilon-SVR model. 
It specifies the epsilon-tube\n within which no penalty is associated in the training loss function\n with points predicted within a distance epsilon from the actual\n value.\n\n shrinking : bool, default=True\n Whether to use the shrinking heuristic.\n See the :ref:`User Guide `.\n\n cache_size : float, default=200\n Specify the size of the kernel cache (in MB).\n\n verbose : bool, default=False\n Enable verbose output. Note that this setting takes advantage of a\n per-process runtime setting in libsvm that, if enabled, may not work\n properly in a multithreaded context.\n\n max_iter : int, default=-1\n Hard limit on iterations within solver, or -1 for no limit.\n\n Attributes\n ----------\n class_weight_ : ndarray of shape (n_classes,)\n Multipliers of parameter C for each class.\n Computed based on the ``class_weight`` parameter.\n\n coef_ : ndarray of shape (1, n_features)\n Weights assigned to the features (coefficients in the primal\n problem). This is only available in the case of a linear kernel.\n\n `coef_` is readonly property derived from `dual_coef_` and\n `support_vectors_`.\n\n dual_coef_ : ndarray of shape (1, n_SV)\n Coefficients of the support vector in the decision function.\n\n fit_status_ : int\n 0 if correctly fitted, 1 otherwise (will raise warning)\n\n intercept_ : ndarray of shape (1,)\n Constants in decision function.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n n_support_ : ndarray of shape (n_classes,), dtype=int32\n Number of support vectors for each class.\n\n shape_fit_ : tuple of int of shape (n_dimensions_of_X,)\n Array dimensions of training vector ``X``.\n\n support_ : ndarray of shape (n_SV,)\n Indices of support vectors.\n\n support_vectors_ : ndarray of shape (n_SV, n_features)\n Support vectors.\n\n See Also\n --------\n NuSVR : Support Vector Machine for regression implemented using libsvm\n using a parameter to control the number of support vectors.\n\n LinearSVR : Scalable Linear Support Vector Machine for regression\n implemented using liblinear.\n\n References\n ----------\n .. [1] `LIBSVM: A Library for Support Vector Machines\n `_\n\n .. [2] `Platt, John (1999). 
\"Probabilistic outputs for support vector\n machines and comparison to regularizedlikelihood methods.\"\n `_\n\n Examples\n --------\n >>> from sklearn.svm import SVR\n >>> from sklearn.pipeline import make_pipeline\n >>> from sklearn.preprocessing import StandardScaler\n >>> import numpy as np\n >>> n_samples, n_features = 10, 5\n >>> rng = np.random.RandomState(0)\n >>> y = rng.randn(n_samples)\n >>> X = rng.randn(n_samples, n_features)\n >>> regr = make_pipeline(StandardScaler(), SVR(C=1.0, epsilon=0.2))\n >>> regr.fit(X, y)\n Pipeline(steps=[('standardscaler', StandardScaler()),\n ('svr', SVR(epsilon=0.2))])\n \"\"\"\n _impl = 'epsilon_svr'\n \n def __init__(self, *, kernel='rbf', degree=3, gamma='scale', coef0=0.0, tol=0.001, C=1.0, epsilon=0.1, shrinking=True, cache_size=200, verbose=False, max_iter=-1):\n super().__init__(kernel=kernel, degree=degree, gamma=gamma, coef0=coef0, tol=tol, C=C, nu=0.0, epsilon=epsilon, verbose=verbose, shrinking=shrinking, probability=False, cache_size=cache_size, class_weight=None, max_iter=max_iter, random_state=None)\n \n def _more_tags(self):\n return {'_xfail_checks': {'check_sample_weights_invariance': 'zero sample_weight is not equivalent to removing samples'}}\n" }, { "name": "BaseDecisionTree", @@ -26473,7 +26569,7 @@ "sklearn.tree._classes.BaseDecisionTree.feature_importances_@getter" ], "is_public": true, - "description": "Base class for decision trees.\n\nWarning: This class should not be used directly. Use derived classes instead.", + "description": "Base class for decision trees.\n\nWarning: This class should not be used directly.\nUse derived classes instead.", "docstring": "Base class for decision trees.\n\n Warning: This class should not be used directly.\n Use derived classes instead.\n ", "source_code": "\n\nclass BaseDecisionTree(MultiOutputMixin, BaseEstimator, metaclass=ABCMeta):\n \"\"\"Base class for decision trees.\n\n Warning: This class should not be used directly.\n Use derived classes instead.\n \"\"\"\n \n @abstractmethod\n def __init__(self, *, criterion, splitter, max_depth, min_samples_split, min_samples_leaf, min_weight_fraction_leaf, max_features, max_leaf_nodes, random_state, min_impurity_decrease, class_weight=None, ccp_alpha=0.0):\n self.criterion = criterion\n self.splitter = splitter\n self.max_depth = max_depth\n self.min_samples_split = min_samples_split\n self.min_samples_leaf = min_samples_leaf\n self.min_weight_fraction_leaf = min_weight_fraction_leaf\n self.max_features = max_features\n self.max_leaf_nodes = max_leaf_nodes\n self.random_state = random_state\n self.min_impurity_decrease = min_impurity_decrease\n self.class_weight = class_weight\n self.ccp_alpha = ccp_alpha\n \n def get_depth(self):\n \"\"\"Return the depth of the decision tree.\n\n The depth of a tree is the maximum distance between the root\n and any leaf.\n\n Returns\n -------\n self.tree_.max_depth : int\n The maximum depth of the tree.\n \"\"\"\n check_is_fitted(self)\n return self.tree_.max_depth\n \n def get_n_leaves(self):\n \"\"\"Return the number of leaves of the decision tree.\n\n Returns\n -------\n self.tree_.n_leaves : int\n Number of leaves.\n \"\"\"\n check_is_fitted(self)\n return self.tree_.n_leaves\n \n def fit(self, X, y, sample_weight=None, check_input=True, X_idx_sorted='deprecated'):\n random_state = check_random_state(self.random_state)\n if self.ccp_alpha < 0.0:\n raise ValueError('ccp_alpha must be greater than or equal to 0')\n if check_input:\n check_X_params = dict(dtype=DTYPE, accept_sparse='csc')\n 
check_y_params = dict(ensure_2d=False, dtype=None)\n (X, y) = self._validate_data(X, y, validate_separately=(check_X_params, check_y_params))\n if issparse(X):\n X.sort_indices()\n if X.indices.dtype != np.intc or X.indptr.dtype != np.intc:\n raise ValueError('No support for np.int64 index based sparse matrices')\n if self.criterion == 'poisson':\n if np.any(y < 0):\n raise ValueError('Some value(s) of y are negative which is not allowed for Poisson regression.')\n if np.sum(y) <= 0:\n raise ValueError('Sum of y is not positive which is necessary for Poisson regression.')\n (n_samples, self.n_features_in_) = X.shape\n is_classification = is_classifier(self)\n y = np.atleast_1d(y)\n expanded_class_weight = None\n if y.ndim == 1:\n y = np.reshape(y, (-1, 1))\n self.n_outputs_ = y.shape[1]\n if is_classification:\n check_classification_targets(y)\n y = np.copy(y)\n self.classes_ = []\n self.n_classes_ = []\n if self.class_weight is not None:\n y_original = np.copy(y)\n y_encoded = np.zeros(y.shape, dtype=int)\n for k in range(self.n_outputs_):\n (classes_k, y_encoded[:, k]) = np.unique(y[:, k], return_inverse=True)\n self.classes_.append(classes_k)\n self.n_classes_.append(classes_k.shape[0])\n y = y_encoded\n if self.class_weight is not None:\n expanded_class_weight = compute_sample_weight(self.class_weight, y_original)\n self.n_classes_ = np.array(self.n_classes_, dtype=np.intp)\n if getattr(y, 'dtype', None) != DOUBLE or not y.flags.contiguous:\n y = np.ascontiguousarray(y, dtype=DOUBLE)\n max_depth = np.iinfo(np.int32).max if self.max_depth is None else self.max_depth\n max_leaf_nodes = -1 if self.max_leaf_nodes is None else self.max_leaf_nodes\n if isinstance(self.min_samples_leaf, numbers.Integral):\n if not 1 <= self.min_samples_leaf:\n raise ValueError('min_samples_leaf must be at least 1 or in (0, 0.5], got %s' % self.min_samples_leaf)\n min_samples_leaf = self.min_samples_leaf\n else:\n if not 0.0 < self.min_samples_leaf <= 0.5:\n raise ValueError('min_samples_leaf must be at least 1 or in (0, 0.5], got %s' % self.min_samples_leaf)\n min_samples_leaf = int(ceil(self.min_samples_leaf * n_samples))\n if isinstance(self.min_samples_split, numbers.Integral):\n if not 2 <= self.min_samples_split:\n raise ValueError('min_samples_split must be an integer greater than 1 or a float in (0.0, 1.0]; got the integer %s' % self.min_samples_split)\n min_samples_split = self.min_samples_split\n else:\n if not 0.0 < self.min_samples_split <= 1.0:\n raise ValueError('min_samples_split must be an integer greater than 1 or a float in (0.0, 1.0]; got the float %s' % self.min_samples_split)\n min_samples_split = int(ceil(self.min_samples_split * n_samples))\n min_samples_split = max(2, min_samples_split)\n min_samples_split = max(min_samples_split, 2 * min_samples_leaf)\n if isinstance(self.max_features, str):\n if self.max_features == 'auto':\n if is_classification:\n max_features = max(1, int(np.sqrt(self.n_features_in_)))\n else:\n max_features = self.n_features_in_\n elif self.max_features == 'sqrt':\n max_features = max(1, int(np.sqrt(self.n_features_in_)))\n elif self.max_features == 'log2':\n max_features = max(1, int(np.log2(self.n_features_in_)))\n else:\n raise ValueError(\"Invalid value for max_features. 
Allowed string values are 'auto', 'sqrt' or 'log2'.\")\n elif self.max_features is None:\n max_features = self.n_features_in_\n elif isinstance(self.max_features, numbers.Integral):\n max_features = self.max_features\n elif self.max_features > 0.0:\n max_features = max(1, int(self.max_features * self.n_features_in_))\n else:\n max_features = 0\n self.max_features_ = max_features\n if len(y) != n_samples:\n raise ValueError('Number of labels=%d does not match number of samples=%d' % (len(y), n_samples))\n if not 0 <= self.min_weight_fraction_leaf <= 0.5:\n raise ValueError('min_weight_fraction_leaf must in [0, 0.5]')\n if max_depth <= 0:\n raise ValueError('max_depth must be greater than zero. ')\n if not 0 < max_features <= self.n_features_in_:\n raise ValueError('max_features must be in (0, n_features]')\n if not isinstance(max_leaf_nodes, numbers.Integral):\n raise ValueError('max_leaf_nodes must be integral number but was %r' % max_leaf_nodes)\n if -1 < max_leaf_nodes < 2:\n raise ValueError('max_leaf_nodes {0} must be either None or larger than 1'.format(max_leaf_nodes))\n if sample_weight is not None:\n sample_weight = _check_sample_weight(sample_weight, X, DOUBLE)\n if expanded_class_weight is not None:\n if sample_weight is not None:\n sample_weight = sample_weight * expanded_class_weight\n else:\n sample_weight = expanded_class_weight\n if sample_weight is None:\n min_weight_leaf = self.min_weight_fraction_leaf * n_samples\n else:\n min_weight_leaf = self.min_weight_fraction_leaf * np.sum(sample_weight)\n if self.min_impurity_decrease < 0.0:\n raise ValueError('min_impurity_decrease must be greater than or equal to 0')\n if X_idx_sorted != 'deprecated':\n warnings.warn(\"The parameter 'X_idx_sorted' is deprecated and has no effect. It will be removed in 1.1 (renaming of 0.26). You can suppress this warning by not passing any value to the 'X_idx_sorted' parameter.\", FutureWarning)\n criterion = self.criterion\n if not isinstance(criterion, Criterion):\n if is_classification:\n criterion = CRITERIA_CLF[self.criterion](self.n_outputs_, self.n_classes_)\n else:\n criterion = CRITERIA_REG[self.criterion](self.n_outputs_, n_samples)\n if self.criterion == 'mse':\n warnings.warn(\"Criterion 'mse' was deprecated in v1.0 and will be removed in version 1.2. Use `criterion='squared_error'` which is equivalent.\", FutureWarning)\n elif self.criterion == 'mae':\n warnings.warn(\"Criterion 'mae' was deprecated in v1.0 and will be removed in version 1.2. 
Use `criterion='absolute_error'` which is equivalent.\", FutureWarning)\n else:\n criterion = copy.deepcopy(criterion)\n SPLITTERS = SPARSE_SPLITTERS if issparse(X) else DENSE_SPLITTERS\n splitter = self.splitter\n if not isinstance(self.splitter, Splitter):\n splitter = SPLITTERS[self.splitter](criterion, self.max_features_, min_samples_leaf, min_weight_leaf, random_state)\n if is_classifier(self):\n self.tree_ = Tree(self.n_features_in_, self.n_classes_, self.n_outputs_)\n else:\n self.tree_ = Tree(self.n_features_in_, np.array([1] * self.n_outputs_, dtype=np.intp), self.n_outputs_)\n if max_leaf_nodes < 0:\n builder = DepthFirstTreeBuilder(splitter, min_samples_split, min_samples_leaf, min_weight_leaf, max_depth, self.min_impurity_decrease)\n else:\n builder = BestFirstTreeBuilder(splitter, min_samples_split, min_samples_leaf, min_weight_leaf, max_depth, max_leaf_nodes, self.min_impurity_decrease)\n builder.build(self.tree_, X, y, sample_weight)\n if self.n_outputs_ == 1 and is_classifier(self):\n self.n_classes_ = self.n_classes_[0]\n self.classes_ = self.classes_[0]\n self._prune_tree()\n return self\n \n def _validate_X_predict(self, X, check_input):\n \"\"\"Validate the training data on predict (probabilities).\"\"\"\n if check_input:\n X = self._validate_data(X, dtype=DTYPE, accept_sparse='csr', reset=False)\n if issparse(X) and (X.indices.dtype != np.intc or X.indptr.dtype != np.intc):\n raise ValueError('No support for np.int64 index based sparse matrices')\n else:\n self._check_n_features(X, reset=False)\n return X\n \n def predict(self, X, check_input=True):\n \"\"\"Predict class or regression value for X.\n\n For a classification model, the predicted class for each sample in X is\n returned. For a regression model, the predicted value based on X is\n returned.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input samples. Internally, it will be converted to\n ``dtype=np.float32`` and if a sparse matrix is provided\n to a sparse ``csr_matrix``.\n\n check_input : bool, default=True\n Allow to bypass several input checking.\n Don't use this parameter unless you know what you do.\n\n Returns\n -------\n y : array-like of shape (n_samples,) or (n_samples, n_outputs)\n The predicted classes, or the predict values.\n \"\"\"\n check_is_fitted(self)\n X = self._validate_X_predict(X, check_input)\n proba = self.tree_.predict(X)\n n_samples = X.shape[0]\n if is_classifier(self):\n if self.n_outputs_ == 1:\n return self.classes_.take(np.argmax(proba, axis=1), axis=0)\n else:\n class_type = self.classes_[0].dtype\n predictions = np.zeros((n_samples, self.n_outputs_), dtype=class_type)\n for k in range(self.n_outputs_):\n predictions[:, k] = self.classes_[k].take(np.argmax(proba[:, k], axis=1), axis=0)\n return predictions\n elif self.n_outputs_ == 1:\n return proba[:, 0]\n else:\n return proba[:, :, 0]\n \n def apply(self, X, check_input=True):\n \"\"\"Return the index of the leaf that each sample is predicted as.\n\n .. versionadded:: 0.17\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input samples. 
Internally, it will be converted to\n ``dtype=np.float32`` and if a sparse matrix is provided\n to a sparse ``csr_matrix``.\n\n check_input : bool, default=True\n Allow to bypass several input checking.\n Don't use this parameter unless you know what you do.\n\n Returns\n -------\n X_leaves : array-like of shape (n_samples,)\n For each datapoint x in X, return the index of the leaf x\n ends up in. Leaves are numbered within\n ``[0; self.tree_.node_count)``, possibly with gaps in the\n numbering.\n \"\"\"\n check_is_fitted(self)\n X = self._validate_X_predict(X, check_input)\n return self.tree_.apply(X)\n \n def decision_path(self, X, check_input=True):\n \"\"\"Return the decision path in the tree.\n\n .. versionadded:: 0.18\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input samples. Internally, it will be converted to\n ``dtype=np.float32`` and if a sparse matrix is provided\n to a sparse ``csr_matrix``.\n\n check_input : bool, default=True\n Allow to bypass several input checking.\n Don't use this parameter unless you know what you do.\n\n Returns\n -------\n indicator : sparse matrix of shape (n_samples, n_nodes)\n Return a node indicator CSR matrix where non zero elements\n indicates that the samples goes through the nodes.\n \"\"\"\n X = self._validate_X_predict(X, check_input)\n return self.tree_.decision_path(X)\n \n def _prune_tree(self):\n \"\"\"Prune tree using Minimal Cost-Complexity Pruning.\"\"\"\n check_is_fitted(self)\n if self.ccp_alpha < 0.0:\n raise ValueError('ccp_alpha must be greater than or equal to 0')\n if self.ccp_alpha == 0.0:\n return\n if is_classifier(self):\n n_classes = np.atleast_1d(self.n_classes_)\n pruned_tree = Tree(self.n_features_in_, n_classes, self.n_outputs_)\n else:\n pruned_tree = Tree(self.n_features_in_, np.array([1] * self.n_outputs_, dtype=np.intp), self.n_outputs_)\n _build_pruned_tree_ccp(pruned_tree, self.tree_, self.ccp_alpha)\n self.tree_ = pruned_tree\n \n def cost_complexity_pruning_path(self, X, y, sample_weight=None):\n \"\"\"Compute the pruning path during Minimal Cost-Complexity Pruning.\n\n See :ref:`minimal_cost_complexity_pruning` for details on the pruning\n process.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The training input samples. Internally, it will be converted to\n ``dtype=np.float32`` and if a sparse matrix is provided\n to a sparse ``csc_matrix``.\n\n y : array-like of shape (n_samples,) or (n_samples, n_outputs)\n The target values (class labels) as integers or strings.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights. If None, then samples are equally weighted. Splits\n that would create child nodes with net zero or negative weight are\n ignored while searching for a split in each node. 
Splits are also\n ignored if they would result in any single class carrying a\n negative weight in either child node.\n\n Returns\n -------\n ccp_path : :class:`~sklearn.utils.Bunch`\n Dictionary-like object, with the following attributes.\n\n ccp_alphas : ndarray\n Effective alphas of subtree during pruning.\n\n impurities : ndarray\n Sum of the impurities of the subtree leaves for the\n corresponding alpha value in ``ccp_alphas``.\n \"\"\"\n est = clone(self).set_params(ccp_alpha=0.0)\n est.fit(X, y, sample_weight=sample_weight)\n return Bunch(**ccp_pruning_path(est.tree_))\n \n @property\n def feature_importances_(self):\n \"\"\"Return the feature importances.\n\n The importance of a feature is computed as the (normalized) total\n reduction of the criterion brought by that feature.\n It is also known as the Gini importance.\n\n Warning: impurity-based feature importances can be misleading for\n high cardinality features (many unique values). See\n :func:`sklearn.inspection.permutation_importance` as an alternative.\n\n Returns\n -------\n feature_importances_ : ndarray of shape (n_features,)\n Normalized total reduction of criteria by feature\n (Gini importance).\n \"\"\"\n check_is_fitted(self)\n return self.tree_.compute_feature_importances()\n" }, @@ -26518,7 +26614,7 @@ "superclasses": ["DecisionTreeClassifier"], "methods": ["sklearn.tree._classes.ExtraTreeClassifier.__init__"], "is_public": true, - "description": "An extremely randomized tree classifier.\n\nExtra-trees differ from classic decision trees in the way they are built. When looking for the best split to separate the samples of a node into two groups, random splits are drawn for each of the `max_features` randomly selected features and the best split among those is chosen. When `max_features` is set 1, this amounts to building a totally random decision tree. Warning: Extra-trees should only be used within ensemble methods. Read more in the :ref:`User Guide `.", + "description": "An extremely randomized tree classifier.\n\nExtra-trees differ from classic decision trees in the way they are built.\nWhen looking for the best split to separate the samples of a node into two\ngroups, random splits are drawn for each of the `max_features` randomly\nselected features and the best split among those is chosen. When\n`max_features` is set 1, this amounts to building a totally random\ndecision tree.\n\nWarning: Extra-trees should only be used within ensemble methods.\n\nRead more in the :ref:`User Guide `.", "docstring": "An extremely randomized tree classifier.\n\n Extra-trees differ from classic decision trees in the way they are built.\n When looking for the best split to separate the samples of a node into two\n groups, random splits are drawn for each of the `max_features` randomly\n selected features and the best split among those is chosen. When\n `max_features` is set 1, this amounts to building a totally random\n decision tree.\n\n Warning: Extra-trees should only be used within ensemble methods.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n criterion : {\"gini\", \"entropy\"}, default=\"gini\"\n The function to measure the quality of a split. Supported criteria are\n \"gini\" for the Gini impurity and \"entropy\" for the information gain.\n\n splitter : {\"random\", \"best\"}, default=\"random\"\n The strategy used to choose the split at each node. 
Supported\n strategies are \"best\" to choose the best split and \"random\" to choose\n the best random split.\n\n max_depth : int, default=None\n The maximum depth of the tree. If None, then nodes are expanded until\n all leaves are pure or until all leaves contain less than\n min_samples_split samples.\n\n min_samples_split : int or float, default=2\n The minimum number of samples required to split an internal node:\n\n - If int, then consider `min_samples_split` as the minimum number.\n - If float, then `min_samples_split` is a fraction and\n `ceil(min_samples_split * n_samples)` are the minimum\n number of samples for each split.\n\n .. versionchanged:: 0.18\n Added float values for fractions.\n\n min_samples_leaf : int or float, default=1\n The minimum number of samples required to be at a leaf node.\n A split point at any depth will only be considered if it leaves at\n least ``min_samples_leaf`` training samples in each of the left and\n right branches. This may have the effect of smoothing the model,\n especially in regression.\n\n - If int, then consider `min_samples_leaf` as the minimum number.\n - If float, then `min_samples_leaf` is a fraction and\n `ceil(min_samples_leaf * n_samples)` are the minimum\n number of samples for each node.\n\n .. versionchanged:: 0.18\n Added float values for fractions.\n\n min_weight_fraction_leaf : float, default=0.0\n The minimum weighted fraction of the sum total of weights (of all\n the input samples) required to be at a leaf node. Samples have\n equal weight when sample_weight is not provided.\n\n max_features : int, float, {\"auto\", \"sqrt\", \"log2\"} or None, default=\"auto\"\n The number of features to consider when looking for the best split:\n\n - If int, then consider `max_features` features at each split.\n - If float, then `max_features` is a fraction and\n `int(max_features * n_features)` features are considered at each\n split.\n - If \"auto\", then `max_features=sqrt(n_features)`.\n - If \"sqrt\", then `max_features=sqrt(n_features)`.\n - If \"log2\", then `max_features=log2(n_features)`.\n - If None, then `max_features=n_features`.\n\n Note: the search for a split does not stop until at least one\n valid partition of the node samples is found, even if it requires to\n effectively inspect more than ``max_features`` features.\n\n random_state : int, RandomState instance or None, default=None\n Used to pick randomly the `max_features` used at each split.\n See :term:`Glossary ` for details.\n\n max_leaf_nodes : int, default=None\n Grow a tree with ``max_leaf_nodes`` in best-first fashion.\n Best nodes are defined as relative reduction in impurity.\n If None then unlimited number of leaf nodes.\n\n min_impurity_decrease : float, default=0.0\n A node will be split if this split induces a decrease of the impurity\n greater than or equal to this value.\n\n The weighted impurity decrease equation is the following::\n\n N_t / N * (impurity - N_t_R / N_t * right_impurity\n - N_t_L / N_t * left_impurity)\n\n where ``N`` is the total number of samples, ``N_t`` is the number of\n samples at the current node, ``N_t_L`` is the number of samples in the\n left child, and ``N_t_R`` is the number of samples in the right child.\n\n ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum,\n if ``sample_weight`` is passed.\n\n .. 
versionadded:: 0.19\n\n class_weight : dict, list of dict or \"balanced\", default=None\n Weights associated with classes in the form ``{class_label: weight}``.\n If None, all classes are supposed to have weight one. For\n multi-output problems, a list of dicts can be provided in the same\n order as the columns of y.\n\n Note that for multioutput (including multilabel) weights should be\n defined for each class of every column in its own dict. For example,\n for four-class multilabel classification weights should be\n [{0: 1, 1: 1}, {0: 1, 1: 5}, {0: 1, 1: 1}, {0: 1, 1: 1}] instead of\n [{1:1}, {2:5}, {3:1}, {4:1}].\n\n The \"balanced\" mode uses the values of y to automatically adjust\n weights inversely proportional to class frequencies in the input data\n as ``n_samples / (n_classes * np.bincount(y))``\n\n For multi-output, the weights of each column of y will be multiplied.\n\n Note that these weights will be multiplied with sample_weight (passed\n through the fit method) if sample_weight is specified.\n\n ccp_alpha : non-negative float, default=0.0\n Complexity parameter used for Minimal Cost-Complexity Pruning. The\n subtree with the largest cost complexity that is smaller than\n ``ccp_alpha`` will be chosen. By default, no pruning is performed. See\n :ref:`minimal_cost_complexity_pruning` for details.\n\n .. versionadded:: 0.22\n\n Attributes\n ----------\n classes_ : ndarray of shape (n_classes,) or list of ndarray\n The classes labels (single output problem),\n or a list of arrays of class labels (multi-output problem).\n\n max_features_ : int\n The inferred value of max_features.\n\n n_classes_ : int or list of int\n The number of classes (for single output problems),\n or a list containing the number of classes for each\n output (for multi-output problems).\n\n feature_importances_ : ndarray of shape (n_features,)\n The impurity-based feature importances.\n The higher, the more important the feature.\n The importance of a feature is computed as the (normalized)\n total reduction of the criterion brought by that feature. It is also\n known as the Gini importance.\n\n Warning: impurity-based feature importances can be misleading for\n high cardinality features (many unique values). See\n :func:`sklearn.inspection.permutation_importance` as an alternative.\n\n n_features_ : int\n The number of features when ``fit`` is performed.\n\n .. deprecated:: 1.0\n `n_features_` is deprecated in 1.0 and will be removed in\n 1.2. Use `n_features_in_` instead.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n n_outputs_ : int\n The number of outputs when ``fit`` is performed.\n\n tree_ : Tree instance\n The underlying Tree object. 
Please refer to\n ``help(sklearn.tree._tree.Tree)`` for attributes of Tree object and\n :ref:`sphx_glr_auto_examples_tree_plot_unveil_tree_structure.py`\n for basic usage of these attributes.\n\n See Also\n --------\n ExtraTreeRegressor : An extremely randomized tree regressor.\n sklearn.ensemble.ExtraTreesClassifier : An extra-trees classifier.\n sklearn.ensemble.ExtraTreesRegressor : An extra-trees regressor.\n sklearn.ensemble.RandomForestClassifier : A random forest classifier.\n sklearn.ensemble.RandomForestRegressor : A random forest regressor.\n sklearn.ensemble.RandomTreesEmbedding : An ensemble of\n totally random trees.\n\n Notes\n -----\n The default values for the parameters controlling the size of the trees\n (e.g. ``max_depth``, ``min_samples_leaf``, etc.) lead to fully grown and\n unpruned trees which can potentially be very large on some data sets. To\n reduce memory consumption, the complexity and size of the trees should be\n controlled by setting those parameter values.\n\n References\n ----------\n\n .. [1] P. Geurts, D. Ernst., and L. Wehenkel, \"Extremely randomized trees\",\n Machine Learning, 63(1), 3-42, 2006.\n\n Examples\n --------\n >>> from sklearn.datasets import load_iris\n >>> from sklearn.model_selection import train_test_split\n >>> from sklearn.ensemble import BaggingClassifier\n >>> from sklearn.tree import ExtraTreeClassifier\n >>> X, y = load_iris(return_X_y=True)\n >>> X_train, X_test, y_train, y_test = train_test_split(\n ... X, y, random_state=0)\n >>> extra_tree = ExtraTreeClassifier(random_state=0)\n >>> cls = BaggingClassifier(extra_tree, random_state=0).fit(\n ... X_train, y_train)\n >>> cls.score(X_test, y_test)\n 0.8947...\n ", "source_code": "\n\nclass ExtraTreeClassifier(DecisionTreeClassifier):\n \"\"\"An extremely randomized tree classifier.\n\n Extra-trees differ from classic decision trees in the way they are built.\n When looking for the best split to separate the samples of a node into two\n groups, random splits are drawn for each of the `max_features` randomly\n selected features and the best split among those is chosen. When\n `max_features` is set 1, this amounts to building a totally random\n decision tree.\n\n Warning: Extra-trees should only be used within ensemble methods.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n criterion : {\"gini\", \"entropy\"}, default=\"gini\"\n The function to measure the quality of a split. Supported criteria are\n \"gini\" for the Gini impurity and \"entropy\" for the information gain.\n\n splitter : {\"random\", \"best\"}, default=\"random\"\n The strategy used to choose the split at each node. Supported\n strategies are \"best\" to choose the best split and \"random\" to choose\n the best random split.\n\n max_depth : int, default=None\n The maximum depth of the tree. If None, then nodes are expanded until\n all leaves are pure or until all leaves contain less than\n min_samples_split samples.\n\n min_samples_split : int or float, default=2\n The minimum number of samples required to split an internal node:\n\n - If int, then consider `min_samples_split` as the minimum number.\n - If float, then `min_samples_split` is a fraction and\n `ceil(min_samples_split * n_samples)` are the minimum\n number of samples for each split.\n\n .. 
versionchanged:: 0.18\n Added float values for fractions.\n\n min_samples_leaf : int or float, default=1\n The minimum number of samples required to be at a leaf node.\n A split point at any depth will only be considered if it leaves at\n least ``min_samples_leaf`` training samples in each of the left and\n right branches. This may have the effect of smoothing the model,\n especially in regression.\n\n - If int, then consider `min_samples_leaf` as the minimum number.\n - If float, then `min_samples_leaf` is a fraction and\n `ceil(min_samples_leaf * n_samples)` are the minimum\n number of samples for each node.\n\n .. versionchanged:: 0.18\n Added float values for fractions.\n\n min_weight_fraction_leaf : float, default=0.0\n The minimum weighted fraction of the sum total of weights (of all\n the input samples) required to be at a leaf node. Samples have\n equal weight when sample_weight is not provided.\n\n max_features : int, float, {\"auto\", \"sqrt\", \"log2\"} or None, default=\"auto\"\n The number of features to consider when looking for the best split:\n\n - If int, then consider `max_features` features at each split.\n - If float, then `max_features` is a fraction and\n `int(max_features * n_features)` features are considered at each\n split.\n - If \"auto\", then `max_features=sqrt(n_features)`.\n - If \"sqrt\", then `max_features=sqrt(n_features)`.\n - If \"log2\", then `max_features=log2(n_features)`.\n - If None, then `max_features=n_features`.\n\n Note: the search for a split does not stop until at least one\n valid partition of the node samples is found, even if it requires to\n effectively inspect more than ``max_features`` features.\n\n random_state : int, RandomState instance or None, default=None\n Used to pick randomly the `max_features` used at each split.\n See :term:`Glossary ` for details.\n\n max_leaf_nodes : int, default=None\n Grow a tree with ``max_leaf_nodes`` in best-first fashion.\n Best nodes are defined as relative reduction in impurity.\n If None then unlimited number of leaf nodes.\n\n min_impurity_decrease : float, default=0.0\n A node will be split if this split induces a decrease of the impurity\n greater than or equal to this value.\n\n The weighted impurity decrease equation is the following::\n\n N_t / N * (impurity - N_t_R / N_t * right_impurity\n - N_t_L / N_t * left_impurity)\n\n where ``N`` is the total number of samples, ``N_t`` is the number of\n samples at the current node, ``N_t_L`` is the number of samples in the\n left child, and ``N_t_R`` is the number of samples in the right child.\n\n ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum,\n if ``sample_weight`` is passed.\n\n .. versionadded:: 0.19\n\n class_weight : dict, list of dict or \"balanced\", default=None\n Weights associated with classes in the form ``{class_label: weight}``.\n If None, all classes are supposed to have weight one. For\n multi-output problems, a list of dicts can be provided in the same\n order as the columns of y.\n\n Note that for multioutput (including multilabel) weights should be\n defined for each class of every column in its own dict. 
For example,\n for four-class multilabel classification weights should be\n [{0: 1, 1: 1}, {0: 1, 1: 5}, {0: 1, 1: 1}, {0: 1, 1: 1}] instead of\n [{1:1}, {2:5}, {3:1}, {4:1}].\n\n The \"balanced\" mode uses the values of y to automatically adjust\n weights inversely proportional to class frequencies in the input data\n as ``n_samples / (n_classes * np.bincount(y))``\n\n For multi-output, the weights of each column of y will be multiplied.\n\n Note that these weights will be multiplied with sample_weight (passed\n through the fit method) if sample_weight is specified.\n\n ccp_alpha : non-negative float, default=0.0\n Complexity parameter used for Minimal Cost-Complexity Pruning. The\n subtree with the largest cost complexity that is smaller than\n ``ccp_alpha`` will be chosen. By default, no pruning is performed. See\n :ref:`minimal_cost_complexity_pruning` for details.\n\n .. versionadded:: 0.22\n\n Attributes\n ----------\n classes_ : ndarray of shape (n_classes,) or list of ndarray\n The classes labels (single output problem),\n or a list of arrays of class labels (multi-output problem).\n\n max_features_ : int\n The inferred value of max_features.\n\n n_classes_ : int or list of int\n The number of classes (for single output problems),\n or a list containing the number of classes for each\n output (for multi-output problems).\n\n feature_importances_ : ndarray of shape (n_features,)\n The impurity-based feature importances.\n The higher, the more important the feature.\n The importance of a feature is computed as the (normalized)\n total reduction of the criterion brought by that feature. It is also\n known as the Gini importance.\n\n Warning: impurity-based feature importances can be misleading for\n high cardinality features (many unique values). See\n :func:`sklearn.inspection.permutation_importance` as an alternative.\n\n n_features_ : int\n The number of features when ``fit`` is performed.\n\n .. deprecated:: 1.0\n `n_features_` is deprecated in 1.0 and will be removed in\n 1.2. Use `n_features_in_` instead.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n n_outputs_ : int\n The number of outputs when ``fit`` is performed.\n\n tree_ : Tree instance\n The underlying Tree object. Please refer to\n ``help(sklearn.tree._tree.Tree)`` for attributes of Tree object and\n :ref:`sphx_glr_auto_examples_tree_plot_unveil_tree_structure.py`\n for basic usage of these attributes.\n\n See Also\n --------\n ExtraTreeRegressor : An extremely randomized tree regressor.\n sklearn.ensemble.ExtraTreesClassifier : An extra-trees classifier.\n sklearn.ensemble.ExtraTreesRegressor : An extra-trees regressor.\n sklearn.ensemble.RandomForestClassifier : A random forest classifier.\n sklearn.ensemble.RandomForestRegressor : A random forest regressor.\n sklearn.ensemble.RandomTreesEmbedding : An ensemble of\n totally random trees.\n\n Notes\n -----\n The default values for the parameters controlling the size of the trees\n (e.g. ``max_depth``, ``min_samples_leaf``, etc.) lead to fully grown and\n unpruned trees which can potentially be very large on some data sets. To\n reduce memory consumption, the complexity and size of the trees should be\n controlled by setting those parameter values.\n\n References\n ----------\n\n .. [1] P. Geurts, D. Ernst., and L. 
Wehenkel, \"Extremely randomized trees\",\n Machine Learning, 63(1), 3-42, 2006.\n\n Examples\n --------\n >>> from sklearn.datasets import load_iris\n >>> from sklearn.model_selection import train_test_split\n >>> from sklearn.ensemble import BaggingClassifier\n >>> from sklearn.tree import ExtraTreeClassifier\n >>> X, y = load_iris(return_X_y=True)\n >>> X_train, X_test, y_train, y_test = train_test_split(\n ... X, y, random_state=0)\n >>> extra_tree = ExtraTreeClassifier(random_state=0)\n >>> cls = BaggingClassifier(extra_tree, random_state=0).fit(\n ... X_train, y_train)\n >>> cls.score(X_test, y_test)\n 0.8947...\n \"\"\"\n \n def __init__(self, *, criterion='gini', splitter='random', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features='auto', random_state=None, max_leaf_nodes=None, min_impurity_decrease=0.0, class_weight=None, ccp_alpha=0.0):\n super().__init__(criterion=criterion, splitter=splitter, max_depth=max_depth, min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf, min_weight_fraction_leaf=min_weight_fraction_leaf, max_features=max_features, max_leaf_nodes=max_leaf_nodes, class_weight=class_weight, min_impurity_decrease=min_impurity_decrease, random_state=random_state, ccp_alpha=ccp_alpha)\n" }, @@ -26529,7 +26625,7 @@ "superclasses": ["DecisionTreeRegressor"], "methods": ["sklearn.tree._classes.ExtraTreeRegressor.__init__"], "is_public": true, - "description": "An extremely randomized tree regressor.\n\nExtra-trees differ from classic decision trees in the way they are built. When looking for the best split to separate the samples of a node into two groups, random splits are drawn for each of the `max_features` randomly selected features and the best split among those is chosen. When `max_features` is set 1, this amounts to building a totally random decision tree. Warning: Extra-trees should only be used within ensemble methods. Read more in the :ref:`User Guide `.", + "description": "An extremely randomized tree regressor.\n\nExtra-trees differ from classic decision trees in the way they are built.\nWhen looking for the best split to separate the samples of a node into two\ngroups, random splits are drawn for each of the `max_features` randomly\nselected features and the best split among those is chosen. When\n`max_features` is set 1, this amounts to building a totally random\ndecision tree.\n\nWarning: Extra-trees should only be used within ensemble methods.\n\nRead more in the :ref:`User Guide `.", "docstring": "An extremely randomized tree regressor.\n\n Extra-trees differ from classic decision trees in the way they are built.\n When looking for the best split to separate the samples of a node into two\n groups, random splits are drawn for each of the `max_features` randomly\n selected features and the best split among those is chosen. When\n `max_features` is set 1, this amounts to building a totally random\n decision tree.\n\n Warning: Extra-trees should only be used within ensemble methods.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n criterion : {\"squared_error\", \"friedman_mse\"}, default=\"squared_error\"\n The function to measure the quality of a split. Supported criteria\n are \"squared_error\" for the mean squared error, which is equal to\n variance reduction as feature selection criterion and \"mae\" for the\n mean absolute error.\n\n .. versionadded:: 0.18\n Mean Absolute Error (MAE) criterion.\n\n .. versionadded:: 0.24\n Poisson deviance criterion.\n\n .. 
deprecated:: 1.0\n Criterion \"mse\" was deprecated in v1.0 and will be removed in\n version 1.2. Use `criterion=\"squared_error\"` which is equivalent.\n\n .. deprecated:: 1.0\n Criterion \"mae\" was deprecated in v1.0 and will be removed in\n version 1.2. Use `criterion=\"absolute_error\"` which is equivalent.\n\n splitter : {\"random\", \"best\"}, default=\"random\"\n The strategy used to choose the split at each node. Supported\n strategies are \"best\" to choose the best split and \"random\" to choose\n the best random split.\n\n max_depth : int, default=None\n The maximum depth of the tree. If None, then nodes are expanded until\n all leaves are pure or until all leaves contain less than\n min_samples_split samples.\n\n min_samples_split : int or float, default=2\n The minimum number of samples required to split an internal node:\n\n - If int, then consider `min_samples_split` as the minimum number.\n - If float, then `min_samples_split` is a fraction and\n `ceil(min_samples_split * n_samples)` are the minimum\n number of samples for each split.\n\n .. versionchanged:: 0.18\n Added float values for fractions.\n\n min_samples_leaf : int or float, default=1\n The minimum number of samples required to be at a leaf node.\n A split point at any depth will only be considered if it leaves at\n least ``min_samples_leaf`` training samples in each of the left and\n right branches. This may have the effect of smoothing the model,\n especially in regression.\n\n - If int, then consider `min_samples_leaf` as the minimum number.\n - If float, then `min_samples_leaf` is a fraction and\n `ceil(min_samples_leaf * n_samples)` are the minimum\n number of samples for each node.\n\n .. versionchanged:: 0.18\n Added float values for fractions.\n\n min_weight_fraction_leaf : float, default=0.0\n The minimum weighted fraction of the sum total of weights (of all\n the input samples) required to be at a leaf node. 
Samples have\n equal weight when sample_weight is not provided.\n\n max_features : int, float, {\"auto\", \"sqrt\", \"log2\"} or None, default=\"auto\"\n The number of features to consider when looking for the best split:\n\n - If int, then consider `max_features` features at each split.\n - If float, then `max_features` is a fraction and\n `int(max_features * n_features)` features are considered at each\n split.\n - If \"auto\", then `max_features=n_features`.\n - If \"sqrt\", then `max_features=sqrt(n_features)`.\n - If \"log2\", then `max_features=log2(n_features)`.\n - If None, then `max_features=n_features`.\n\n Note: the search for a split does not stop until at least one\n valid partition of the node samples is found, even if it requires to\n effectively inspect more than ``max_features`` features.\n\n random_state : int, RandomState instance or None, default=None\n Used to pick randomly the `max_features` used at each split.\n See :term:`Glossary ` for details.\n\n min_impurity_decrease : float, default=0.0\n A node will be split if this split induces a decrease of the impurity\n greater than or equal to this value.\n\n The weighted impurity decrease equation is the following::\n\n N_t / N * (impurity - N_t_R / N_t * right_impurity\n - N_t_L / N_t * left_impurity)\n\n where ``N`` is the total number of samples, ``N_t`` is the number of\n samples at the current node, ``N_t_L`` is the number of samples in the\n left child, and ``N_t_R`` is the number of samples in the right child.\n\n ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum,\n if ``sample_weight`` is passed.\n\n .. versionadded:: 0.19\n\n max_leaf_nodes : int, default=None\n Grow a tree with ``max_leaf_nodes`` in best-first fashion.\n Best nodes are defined as relative reduction in impurity.\n If None then unlimited number of leaf nodes.\n\n ccp_alpha : non-negative float, default=0.0\n Complexity parameter used for Minimal Cost-Complexity Pruning. The\n subtree with the largest cost complexity that is smaller than\n ``ccp_alpha`` will be chosen. By default, no pruning is performed. See\n :ref:`minimal_cost_complexity_pruning` for details.\n\n .. versionadded:: 0.22\n\n Attributes\n ----------\n max_features_ : int\n The inferred value of max_features.\n\n n_features_ : int\n The number of features when ``fit`` is performed.\n\n .. deprecated:: 1.0\n `n_features_` is deprecated in 1.0 and will be removed in\n 1.2. Use `n_features_in_` instead.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n feature_importances_ : ndarray of shape (n_features,)\n Return impurity-based feature importances (the higher, the more\n important the feature).\n\n Warning: impurity-based feature importances can be misleading for\n high cardinality features (many unique values). See\n :func:`sklearn.inspection.permutation_importance` as an alternative.\n\n n_outputs_ : int\n The number of outputs when ``fit`` is performed.\n\n tree_ : Tree instance\n The underlying Tree object. 
Please refer to\n ``help(sklearn.tree._tree.Tree)`` for attributes of Tree object and\n :ref:`sphx_glr_auto_examples_tree_plot_unveil_tree_structure.py`\n for basic usage of these attributes.\n\n See Also\n --------\n ExtraTreeClassifier : An extremely randomized tree classifier.\n sklearn.ensemble.ExtraTreesClassifier : An extra-trees classifier.\n sklearn.ensemble.ExtraTreesRegressor : An extra-trees regressor.\n\n Notes\n -----\n The default values for the parameters controlling the size of the trees\n (e.g. ``max_depth``, ``min_samples_leaf``, etc.) lead to fully grown and\n unpruned trees which can potentially be very large on some data sets. To\n reduce memory consumption, the complexity and size of the trees should be\n controlled by setting those parameter values.\n\n References\n ----------\n\n .. [1] P. Geurts, D. Ernst., and L. Wehenkel, \"Extremely randomized trees\",\n Machine Learning, 63(1), 3-42, 2006.\n\n Examples\n --------\n >>> from sklearn.datasets import load_diabetes\n >>> from sklearn.model_selection import train_test_split\n >>> from sklearn.ensemble import BaggingRegressor\n >>> from sklearn.tree import ExtraTreeRegressor\n >>> X, y = load_diabetes(return_X_y=True)\n >>> X_train, X_test, y_train, y_test = train_test_split(\n ... X, y, random_state=0)\n >>> extra_tree = ExtraTreeRegressor(random_state=0)\n >>> reg = BaggingRegressor(extra_tree, random_state=0).fit(\n ... X_train, y_train)\n >>> reg.score(X_test, y_test)\n 0.33...\n ", "source_code": "\n\nclass ExtraTreeRegressor(DecisionTreeRegressor):\n \"\"\"An extremely randomized tree regressor.\n\n Extra-trees differ from classic decision trees in the way they are built.\n When looking for the best split to separate the samples of a node into two\n groups, random splits are drawn for each of the `max_features` randomly\n selected features and the best split among those is chosen. When\n `max_features` is set 1, this amounts to building a totally random\n decision tree.\n\n Warning: Extra-trees should only be used within ensemble methods.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n criterion : {\"squared_error\", \"friedman_mse\"}, default=\"squared_error\"\n The function to measure the quality of a split. Supported criteria\n are \"squared_error\" for the mean squared error, which is equal to\n variance reduction as feature selection criterion and \"mae\" for the\n mean absolute error.\n\n .. versionadded:: 0.18\n Mean Absolute Error (MAE) criterion.\n\n .. versionadded:: 0.24\n Poisson deviance criterion.\n\n .. deprecated:: 1.0\n Criterion \"mse\" was deprecated in v1.0 and will be removed in\n version 1.2. Use `criterion=\"squared_error\"` which is equivalent.\n\n .. deprecated:: 1.0\n Criterion \"mae\" was deprecated in v1.0 and will be removed in\n version 1.2. Use `criterion=\"absolute_error\"` which is equivalent.\n\n splitter : {\"random\", \"best\"}, default=\"random\"\n The strategy used to choose the split at each node. Supported\n strategies are \"best\" to choose the best split and \"random\" to choose\n the best random split.\n\n max_depth : int, default=None\n The maximum depth of the tree. 
If None, then nodes are expanded until\n all leaves are pure or until all leaves contain less than\n min_samples_split samples.\n\n min_samples_split : int or float, default=2\n The minimum number of samples required to split an internal node:\n\n - If int, then consider `min_samples_split` as the minimum number.\n - If float, then `min_samples_split` is a fraction and\n `ceil(min_samples_split * n_samples)` are the minimum\n number of samples for each split.\n\n .. versionchanged:: 0.18\n Added float values for fractions.\n\n min_samples_leaf : int or float, default=1\n The minimum number of samples required to be at a leaf node.\n A split point at any depth will only be considered if it leaves at\n least ``min_samples_leaf`` training samples in each of the left and\n right branches. This may have the effect of smoothing the model,\n especially in regression.\n\n - If int, then consider `min_samples_leaf` as the minimum number.\n - If float, then `min_samples_leaf` is a fraction and\n `ceil(min_samples_leaf * n_samples)` are the minimum\n number of samples for each node.\n\n .. versionchanged:: 0.18\n Added float values for fractions.\n\n min_weight_fraction_leaf : float, default=0.0\n The minimum weighted fraction of the sum total of weights (of all\n the input samples) required to be at a leaf node. Samples have\n equal weight when sample_weight is not provided.\n\n max_features : int, float, {\"auto\", \"sqrt\", \"log2\"} or None, default=\"auto\"\n The number of features to consider when looking for the best split:\n\n - If int, then consider `max_features` features at each split.\n - If float, then `max_features` is a fraction and\n `int(max_features * n_features)` features are considered at each\n split.\n - If \"auto\", then `max_features=n_features`.\n - If \"sqrt\", then `max_features=sqrt(n_features)`.\n - If \"log2\", then `max_features=log2(n_features)`.\n - If None, then `max_features=n_features`.\n\n Note: the search for a split does not stop until at least one\n valid partition of the node samples is found, even if it requires to\n effectively inspect more than ``max_features`` features.\n\n random_state : int, RandomState instance or None, default=None\n Used to pick randomly the `max_features` used at each split.\n See :term:`Glossary ` for details.\n\n min_impurity_decrease : float, default=0.0\n A node will be split if this split induces a decrease of the impurity\n greater than or equal to this value.\n\n The weighted impurity decrease equation is the following::\n\n N_t / N * (impurity - N_t_R / N_t * right_impurity\n - N_t_L / N_t * left_impurity)\n\n where ``N`` is the total number of samples, ``N_t`` is the number of\n samples at the current node, ``N_t_L`` is the number of samples in the\n left child, and ``N_t_R`` is the number of samples in the right child.\n\n ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum,\n if ``sample_weight`` is passed.\n\n .. versionadded:: 0.19\n\n max_leaf_nodes : int, default=None\n Grow a tree with ``max_leaf_nodes`` in best-first fashion.\n Best nodes are defined as relative reduction in impurity.\n If None then unlimited number of leaf nodes.\n\n ccp_alpha : non-negative float, default=0.0\n Complexity parameter used for Minimal Cost-Complexity Pruning. The\n subtree with the largest cost complexity that is smaller than\n ``ccp_alpha`` will be chosen. By default, no pruning is performed. See\n :ref:`minimal_cost_complexity_pruning` for details.\n\n .. 
versionadded:: 0.22\n\n Attributes\n ----------\n max_features_ : int\n The inferred value of max_features.\n\n n_features_ : int\n The number of features when ``fit`` is performed.\n\n .. deprecated:: 1.0\n `n_features_` is deprecated in 1.0 and will be removed in\n 1.2. Use `n_features_in_` instead.\n\n n_features_in_ : int\n Number of features seen during :term:`fit`.\n\n .. versionadded:: 0.24\n\n feature_names_in_ : ndarray of shape (`n_features_in_`,)\n Names of features seen during :term:`fit`. Defined only when `X`\n has feature names that are all strings.\n\n .. versionadded:: 1.0\n\n feature_importances_ : ndarray of shape (n_features,)\n Return impurity-based feature importances (the higher, the more\n important the feature).\n\n Warning: impurity-based feature importances can be misleading for\n high cardinality features (many unique values). See\n :func:`sklearn.inspection.permutation_importance` as an alternative.\n\n n_outputs_ : int\n The number of outputs when ``fit`` is performed.\n\n tree_ : Tree instance\n The underlying Tree object. Please refer to\n ``help(sklearn.tree._tree.Tree)`` for attributes of Tree object and\n :ref:`sphx_glr_auto_examples_tree_plot_unveil_tree_structure.py`\n for basic usage of these attributes.\n\n See Also\n --------\n ExtraTreeClassifier : An extremely randomized tree classifier.\n sklearn.ensemble.ExtraTreesClassifier : An extra-trees classifier.\n sklearn.ensemble.ExtraTreesRegressor : An extra-trees regressor.\n\n Notes\n -----\n The default values for the parameters controlling the size of the trees\n (e.g. ``max_depth``, ``min_samples_leaf``, etc.) lead to fully grown and\n unpruned trees which can potentially be very large on some data sets. To\n reduce memory consumption, the complexity and size of the trees should be\n controlled by setting those parameter values.\n\n References\n ----------\n\n .. [1] P. Geurts, D. Ernst., and L. Wehenkel, \"Extremely randomized trees\",\n Machine Learning, 63(1), 3-42, 2006.\n\n Examples\n --------\n >>> from sklearn.datasets import load_diabetes\n >>> from sklearn.model_selection import train_test_split\n >>> from sklearn.ensemble import BaggingRegressor\n >>> from sklearn.tree import ExtraTreeRegressor\n >>> X, y = load_diabetes(return_X_y=True)\n >>> X_train, X_test, y_train, y_test = train_test_split(\n ... X, y, random_state=0)\n >>> extra_tree = ExtraTreeRegressor(random_state=0)\n >>> reg = BaggingRegressor(extra_tree, random_state=0).fit(\n ... 
X_train, y_train)\n >>> reg.score(X_test, y_test)\n 0.33...\n \"\"\"\n \n def __init__(self, *, criterion='squared_error', splitter='random', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features='auto', random_state=None, min_impurity_decrease=0.0, max_leaf_nodes=None, ccp_alpha=0.0):\n super().__init__(criterion=criterion, splitter=splitter, max_depth=max_depth, min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf, min_weight_fraction_leaf=min_weight_fraction_leaf, max_features=max_features, max_leaf_nodes=max_leaf_nodes, min_impurity_decrease=min_impurity_decrease, random_state=random_state, ccp_alpha=ccp_alpha)\n" }, @@ -26591,7 +26687,7 @@ "is_public": false, "description": "", "docstring": null, - "source_code": "\n\nclass _MPLTreeExporter(_BaseTreeExporter):\n \n def __init__(self, max_depth=None, feature_names=None, class_names=None, label='all', filled=False, impurity=True, node_ids=False, proportion=False, rounded=False, precision=3, fontsize=None):\n super().__init__(max_depth=max_depth, feature_names=feature_names, class_names=class_names, label=label, filled=filled, impurity=impurity, node_ids=node_ids, proportion=proportion, rounded=rounded, precision=precision)\n self.fontsize = fontsize\n if isinstance(precision, Integral):\n if precision < 0:\n raise ValueError(\"'precision' should be greater or equal to 0. Got {} instead.\".format(precision))\n else:\n raise ValueError(\"'precision' should be an integer. Got {} instead.\".format(type(precision)))\n self.ranks = {'leaves': []}\n self.colors = {'bounds': None}\n self.characters = ['#', '[', ']', '<=', '\\n', '', '']\n self.bbox_args = dict()\n if self.rounded:\n self.bbox_args['boxstyle'] = 'round'\n self.arrow_args = dict(arrowstyle='<-')\n \n def _make_tree(self, node_id, et, criterion, depth=0):\n name = self.node_to_str(et, node_id, criterion=criterion)\n if et.children_left[node_id] != _tree.TREE_LEAF and (self.max_depth is None or depth <= self.max_depth):\n children = [self._make_tree(et.children_left[node_id], et, criterion, depth=depth + 1), self._make_tree(et.children_right[node_id], et, criterion, depth=depth + 1)]\n else:\n return Tree(name, node_id)\n return Tree(name, node_id, *children)\n \n def export(self, decision_tree, ax=None):\n import matplotlib.pyplot as plt\n from matplotlib.text import Annotation\n if ax is None:\n ax = plt.gca()\n ax.clear()\n ax.set_axis_off()\n my_tree = self._make_tree(0, decision_tree.tree_, decision_tree.criterion)\n draw_tree = buchheim(my_tree)\n (max_x, max_y) = draw_tree.max_extents() + 1\n ax_width = ax.get_window_extent().width\n ax_height = ax.get_window_extent().height\n scale_x = ax_width / max_x\n scale_y = ax_height / max_y\n self.recurse(draw_tree, decision_tree.tree_, ax, scale_x, scale_y, ax_height)\n anns = [ann for ann in ax.get_children() if isinstance(ann, Annotation)]\n renderer = ax.figure.canvas.get_renderer()\n for ann in anns:\n ann.update_bbox_position_size(renderer)\n if self.fontsize is None:\n extents = [ann.get_bbox_patch().get_window_extent() for ann in anns]\n max_width = max([extent.width for extent in extents])\n max_height = max([extent.height for extent in extents])\n size = anns[0].get_fontsize() * min(scale_x / max_width, scale_y / max_height)\n for ann in anns:\n ann.set_fontsize(size)\n return anns\n \n def recurse(self, node, tree, ax, scale_x, scale_y, height, depth=0):\n import matplotlib.pyplot as plt\n kwargs = dict(bbox=self.bbox_args.copy(), ha='center', va='center', 
zorder=100 - 10 * depth, xycoords='axes points', arrowprops=self.arrow_args.copy())\n kwargs['arrowprops']['edgecolor'] = plt.rcParams['text.color']\n if self.fontsize is not None:\n kwargs['fontsize'] = self.fontsize\n xy = ((node.x + 0.5) * scale_x, height - (node.y + 0.5) * scale_y)\n if self.max_depth is None or depth <= self.max_depth:\n if self.filled:\n kwargs['bbox']['fc'] = self.get_fill_color(tree, node.tree.node_id)\n else:\n kwargs['bbox']['fc'] = ax.get_facecolor()\n if node.parent is None:\n ax.annotate(node.tree.label, xy, **kwargs)\n else:\n xy_parent = ((node.parent.x + 0.5) * scale_x, height - (node.parent.y + 0.5) * scale_y)\n ax.annotate(node.tree.label, xy_parent, xy, **kwargs)\n for child in node.children:\n self.recurse(child, tree, ax, scale_x, scale_y, height, depth=depth + 1)\n else:\n xy_parent = ((node.parent.x + 0.5) * scale_x, height - (node.parent.y + 0.5) * scale_y)\n kwargs['bbox']['fc'] = 'grey'\n ax.annotate('\\n (...) \\n', xy_parent, xy, **kwargs)\n" + "source_code": "\n\nclass _MPLTreeExporter(_BaseTreeExporter):\n \n def __init__(self, max_depth=None, feature_names=None, class_names=None, label='all', filled=False, impurity=True, node_ids=False, proportion=False, rounded=False, precision=3, fontsize=None):\n super().__init__(max_depth=max_depth, feature_names=feature_names, class_names=class_names, label=label, filled=filled, impurity=impurity, node_ids=node_ids, proportion=proportion, rounded=rounded, precision=precision)\n self.fontsize = fontsize\n if isinstance(precision, Integral):\n if precision < 0:\n raise ValueError(\"'precision' should be greater or equal to 0. Got {} instead.\".format(precision))\n else:\n raise ValueError(\"'precision' should be an integer. Got {} instead.\".format(type(precision)))\n self.ranks = {'leaves': []}\n self.colors = {'bounds': None}\n self.characters = ['#', '[', ']', '<=', '\\n', '', '']\n self.bbox_args = dict()\n if self.rounded:\n self.bbox_args['boxstyle'] = 'round'\n self.arrow_args = dict(arrowstyle='<-')\n \n def _make_tree(self, node_id, et, criterion, depth=0):\n name = self.node_to_str(et, node_id, criterion=criterion)\n if et.children_left[node_id] != _tree.TREE_LEAF and (self.max_depth is None or depth <= self.max_depth):\n children = [self._make_tree(et.children_left[node_id], et, criterion, depth=depth + 1), self._make_tree(et.children_right[node_id], et, criterion, depth=depth + 1)]\n else:\n return Tree(name, node_id)\n return Tree(name, node_id, *children)\n \n def export(self, decision_tree, ax=None):\n import matplotlib.pyplot as plt\n from matplotlib.text import Annotation\n if ax is None:\n ax = plt.gca()\n ax.clear()\n ax.set_axis_off()\n my_tree = self._make_tree(0, decision_tree.tree_, decision_tree.criterion)\n draw_tree = buchheim(my_tree)\n (max_x, max_y) = draw_tree.max_extents() + 1\n ax_width = ax.get_window_extent().width\n ax_height = ax.get_window_extent().height\n scale_x = ax_width / max_x\n scale_y = ax_height / max_y\n self.recurse(draw_tree, decision_tree.tree_, ax, max_x, max_y)\n anns = [ann for ann in ax.get_children() if isinstance(ann, Annotation)]\n renderer = ax.figure.canvas.get_renderer()\n for ann in anns:\n ann.update_bbox_position_size(renderer)\n if self.fontsize is None:\n extents = [ann.get_bbox_patch().get_window_extent() for ann in anns]\n max_width = max([extent.width for extent in extents])\n max_height = max([extent.height for extent in extents])\n size = anns[0].get_fontsize() * min(scale_x / max_width, scale_y / max_height)\n for ann in anns:\n 
ann.set_fontsize(size)\n return anns\n \n def recurse(self, node, tree, ax, max_x, max_y, depth=0):\n import matplotlib.pyplot as plt\n kwargs = dict(bbox=self.bbox_args.copy(), ha='center', va='center', zorder=100 - 10 * depth, xycoords='axes fraction', arrowprops=self.arrow_args.copy())\n kwargs['arrowprops']['edgecolor'] = plt.rcParams['text.color']\n if self.fontsize is not None:\n kwargs['fontsize'] = self.fontsize\n xy = ((node.x + 0.5) / max_x, (max_y - node.y - 0.5) / max_y)\n if self.max_depth is None or depth <= self.max_depth:\n if self.filled:\n kwargs['bbox']['fc'] = self.get_fill_color(tree, node.tree.node_id)\n else:\n kwargs['bbox']['fc'] = ax.get_facecolor()\n if node.parent is None:\n ax.annotate(node.tree.label, xy, **kwargs)\n else:\n xy_parent = ((node.parent.x + 0.5) / max_x, (max_y - node.parent.y - 0.5) / max_y)\n ax.annotate(node.tree.label, xy_parent, xy, **kwargs)\n for child in node.children:\n self.recurse(child, tree, ax, max_x, max_y, depth=depth + 1)\n else:\n xy_parent = ((node.parent.x + 0.5) / max_x, (max_y - node.parent.y - 0.5) / max_y)\n kwargs['bbox']['fc'] = 'grey'\n ax.annotate('\\n (...) \\n', xy_parent, xy, **kwargs)\n" }, { "name": "DrawTree", @@ -26637,7 +26733,7 @@ "sklearn.utils.Bunch.__setstate__" ], "is_public": true, - "description": "Container object exposing keys as attributes.\n\nBunch objects are sometimes used as an output for functions and methods. They extend dictionaries by enabling values to be accessed by key, `bunch[\"value_key\"]`, or by an attribute, `bunch.value_key`.", + "description": "Container object exposing keys as attributes.\n\nBunch objects are sometimes used as an output for functions and methods.\nThey extend dictionaries by enabling values to be accessed by key,\n`bunch[\"value_key\"]`, or by an attribute, `bunch.value_key`.", "docstring": "Container object exposing keys as attributes.\n\n Bunch objects are sometimes used as an output for functions and methods.\n They extend dictionaries by enabling values to be accessed by key,\n `bunch[\"value_key\"]`, or by an attribute, `bunch.value_key`.\n\n Examples\n --------\n >>> from sklearn.utils import Bunch\n >>> b = Bunch(a=1, b=2)\n >>> b['b']\n 2\n >>> b.b\n 2\n >>> b.a = 3\n >>> b['a']\n 3\n >>> b.c = 6\n >>> b['c']\n 6\n ", "source_code": "\n\nclass Bunch(dict):\n \"\"\"Container object exposing keys as attributes.\n\n Bunch objects are sometimes used as an output for functions and methods.\n They extend dictionaries by enabling values to be accessed by key,\n `bunch[\"value_key\"]`, or by an attribute, `bunch.value_key`.\n\n Examples\n --------\n >>> from sklearn.utils import Bunch\n >>> b = Bunch(a=1, b=2)\n >>> b['b']\n 2\n >>> b.b\n 2\n >>> b.a = 3\n >>> b['a']\n 3\n >>> b.c = 6\n >>> b['c']\n 6\n \"\"\"\n \n def __init__(self, **kwargs):\n super().__init__(kwargs)\n \n def __setattr__(self, key, value):\n self[key] = value\n \n def __dir__(self):\n return self.keys()\n \n def __getattr__(self, key):\n try:\n return self[key]\n except KeyError:\n raise AttributeError(key)\n \n def __setstate__(self, state):\n pass\n" }, @@ -26710,7 +26806,7 @@ "sklearn.utils._mocking.CheckingClassifier._more_tags" ], "is_public": false, - "description": "Dummy classifier to test pipelining and meta-estimators.\n\nChecks some property of `X` and `y`in fit / predict. This allows testing whether pipelines / cross-validation or metaestimators changed the input. 
Can also be used to check if `fit_params` are passed correctly, and to force a certain score to be returned.", + "description": "Dummy classifier to test pipelining and meta-estimators.\n\nChecks some property of `X` and `y`in fit / predict.\nThis allows testing whether pipelines / cross-validation or metaestimators\nchanged the input.\n\nCan also be used to check if `fit_params` are passed correctly, and\nto force a certain score to be returned.", "docstring": "Dummy classifier to test pipelining and meta-estimators.\n\n Checks some property of `X` and `y`in fit / predict.\n This allows testing whether pipelines / cross-validation or metaestimators\n changed the input.\n\n Can also be used to check if `fit_params` are passed correctly, and\n to force a certain score to be returned.\n\n Parameters\n ----------\n check_y, check_X : callable, default=None\n The callable used to validate `X` and `y`. These callable should return\n a bool where `False` will trigger an `AssertionError`.\n\n check_y_params, check_X_params : dict, default=None\n The optional parameters to pass to `check_X` and `check_y`.\n\n methods_to_check : \"all\" or list of str, default=\"all\"\n The methods in which the checks should be applied. By default,\n all checks will be done on all methods (`fit`, `predict`,\n `predict_proba`, `decision_function` and `score`).\n\n foo_param : int, default=0\n A `foo` param. When `foo > 1`, the output of :meth:`score` will be 1\n otherwise it is 0.\n\n expected_fit_params : list of str, default=None\n A list of the expected parameters given when calling `fit`.\n\n Attributes\n ----------\n classes_ : int\n The classes seen during `fit`.\n\n n_features_in_ : int\n The number of features seen during `fit`.\n\n Examples\n --------\n >>> from sklearn.utils._mocking import CheckingClassifier\n\n This helper allow to assert to specificities regarding `X` or `y`. In this\n case we expect `check_X` or `check_y` to return a boolean.\n\n >>> from sklearn.datasets import load_iris\n >>> X, y = load_iris(return_X_y=True)\n >>> clf = CheckingClassifier(check_X=lambda x: x.shape == (150, 4))\n >>> clf.fit(X, y)\n CheckingClassifier(...)\n\n We can also provide a check which might raise an error. In this case, we\n expect `check_X` to return `X` and `check_y` to return `y`.\n\n >>> from sklearn.utils import check_array\n >>> clf = CheckingClassifier(check_X=check_array)\n >>> clf.fit(X, y)\n CheckingClassifier(...)\n ", "source_code": "\n\nclass CheckingClassifier(ClassifierMixin, BaseEstimator):\n \"\"\"Dummy classifier to test pipelining and meta-estimators.\n\n Checks some property of `X` and `y`in fit / predict.\n This allows testing whether pipelines / cross-validation or metaestimators\n changed the input.\n\n Can also be used to check if `fit_params` are passed correctly, and\n to force a certain score to be returned.\n\n Parameters\n ----------\n check_y, check_X : callable, default=None\n The callable used to validate `X` and `y`. These callable should return\n a bool where `False` will trigger an `AssertionError`.\n\n check_y_params, check_X_params : dict, default=None\n The optional parameters to pass to `check_X` and `check_y`.\n\n methods_to_check : \"all\" or list of str, default=\"all\"\n The methods in which the checks should be applied. By default,\n all checks will be done on all methods (`fit`, `predict`,\n `predict_proba`, `decision_function` and `score`).\n\n foo_param : int, default=0\n A `foo` param. 
When `foo > 1`, the output of :meth:`score` will be 1\n otherwise it is 0.\n\n expected_fit_params : list of str, default=None\n A list of the expected parameters given when calling `fit`.\n\n Attributes\n ----------\n classes_ : int\n The classes seen during `fit`.\n\n n_features_in_ : int\n The number of features seen during `fit`.\n\n Examples\n --------\n >>> from sklearn.utils._mocking import CheckingClassifier\n\n This helper allow to assert to specificities regarding `X` or `y`. In this\n case we expect `check_X` or `check_y` to return a boolean.\n\n >>> from sklearn.datasets import load_iris\n >>> X, y = load_iris(return_X_y=True)\n >>> clf = CheckingClassifier(check_X=lambda x: x.shape == (150, 4))\n >>> clf.fit(X, y)\n CheckingClassifier(...)\n\n We can also provide a check which might raise an error. In this case, we\n expect `check_X` to return `X` and `check_y` to return `y`.\n\n >>> from sklearn.utils import check_array\n >>> clf = CheckingClassifier(check_X=check_array)\n >>> clf.fit(X, y)\n CheckingClassifier(...)\n \"\"\"\n \n def __init__(self, *, check_y=None, check_y_params=None, check_X=None, check_X_params=None, methods_to_check='all', foo_param=0, expected_fit_params=None):\n self.check_y = check_y\n self.check_y_params = check_y_params\n self.check_X = check_X\n self.check_X_params = check_X_params\n self.methods_to_check = methods_to_check\n self.foo_param = foo_param\n self.expected_fit_params = expected_fit_params\n \n def _check_X_y(self, X, y=None, should_be_fitted=True):\n \"\"\"Validate X and y and make extra check.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The data set.\n y : array-like of shape (n_samples), default=None\n The corresponding target, by default None.\n should_be_fitted : bool, default=True\n Whether or not the classifier should be already fitted.\n By default True.\n\n Returns\n -------\n X, y\n \"\"\"\n if should_be_fitted:\n check_is_fitted(self)\n if self.check_X is not None:\n params = {} if self.check_X_params is None else self.check_X_params\n checked_X = self.check_X(X, **params)\n if isinstance(checked_X, (bool, np.bool_)):\n assert checked_X\n else:\n X = checked_X\n if y is not None and self.check_y is not None:\n params = {} if self.check_y_params is None else self.check_y_params\n checked_y = self.check_y(y, **params)\n if isinstance(checked_y, (bool, np.bool_)):\n assert checked_y\n else:\n y = checked_y\n return X, y\n \n def fit(self, X, y, **fit_params):\n \"\"\"Fit classifier.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training vector, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\n y : array-like of shape (n_samples, n_outputs) or (n_samples,), default=None\n Target relative to X for classification or regression;\n None for unsupervised learning.\n\n **fit_params : dict of string -> object\n Parameters passed to the ``fit`` method of the estimator\n\n Returns\n -------\n self\n \"\"\"\n assert _num_samples(X) == _num_samples(y)\n if self.methods_to_check == 'all' or 'fit' in self.methods_to_check:\n (X, y) = self._check_X_y(X, y, should_be_fitted=False)\n self.n_features_in_ = np.shape(X)[1]\n self.classes_ = np.unique(check_array(y, ensure_2d=False, allow_nd=True))\n if self.expected_fit_params:\n missing = set(self.expected_fit_params) - set(fit_params)\n if missing:\n raise AssertionError(f'Expected fit parameter(s) {list(missing)} not seen.')\n for (key, value) in fit_params.items():\n if 
_num_samples(value) != _num_samples(X):\n raise AssertionError(f'Fit parameter {key} has length {_num_samples(value)}; expected {_num_samples(X)}.')\n return self\n \n def predict(self, X):\n \"\"\"Predict the first class seen in `classes_`.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The input data.\n\n Returns\n -------\n preds : ndarray of shape (n_samples,)\n Predictions of the first class seens in `classes_`.\n \"\"\"\n if self.methods_to_check == 'all' or 'predict' in self.methods_to_check:\n (X, y) = self._check_X_y(X)\n return self.classes_[np.zeros(_num_samples(X), dtype=int)]\n \n def predict_proba(self, X):\n \"\"\"Predict probabilities for each class.\n\n Here, the dummy classifier will provide a probability of 1 for the\n first class of `classes_` and 0 otherwise.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The input data.\n\n Returns\n -------\n proba : ndarray of shape (n_samples, n_classes)\n The probabilities for each sample and class.\n \"\"\"\n if self.methods_to_check == 'all' or 'predict_proba' in self.methods_to_check:\n (X, y) = self._check_X_y(X)\n proba = np.zeros((_num_samples(X), len(self.classes_)))\n proba[:, 0] = 1\n return proba\n \n def decision_function(self, X):\n \"\"\"Confidence score.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The input data.\n\n Returns\n -------\n decision : ndarray of shape (n_samples,) if n_classes == 2 else (n_samples, n_classes)\n Confidence score.\n \"\"\"\n if self.methods_to_check == 'all' or 'decision_function' in self.methods_to_check:\n (X, y) = self._check_X_y(X)\n if len(self.classes_) == 2:\n return np.zeros(_num_samples(X))\n else:\n decision = np.zeros((_num_samples(X), len(self.classes_)))\n decision[:, 0] = 1\n return decision\n \n def score(self, X=None, Y=None):\n \"\"\"Fake score.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Input data, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\n Y : array-like of shape (n_samples, n_output) or (n_samples,)\n Target relative to X for classification or regression;\n None for unsupervised learning.\n\n Returns\n -------\n score : float\n Either 0 or 1 depending of `foo_param` (i.e. `foo_param > 1 =>\n score=1` otherwise `score=0`).\n \"\"\"\n if self.methods_to_check == 'all' or 'score' in self.methods_to_check:\n self._check_X_y(X, Y)\n if self.foo_param > 1:\n score = 1.0\n else:\n score = 0.0\n return score\n \n def _more_tags(self):\n return {'_skip_test': True, 'X_types': ['1dlabel']}\n" }, @@ -26787,7 +26883,7 @@ "sklearn.utils._pprint._EstimatorPrettyPrinter._pprint_key_val_tuple" ], "is_public": false, - "description": "Pretty Printer class for estimator objects.\n\nThis extends the pprint.PrettyPrinter class, because: - we need estimators to be printed with their parameters, e.g. Estimator(param1=value1, ...) which is not supported by default. - the 'compact' parameter of PrettyPrinter is ignored for dicts, which may lead to very long representations that we want to avoid. 
Quick overview of pprint.PrettyPrinter (see also https://stackoverflow.com/questions/49565047/pprint-with-hex-numbers): - the entry point is the _format() method which calls format() (overridden here) - format() directly calls _safe_repr() for a first try at rendering the object - _safe_repr formats the whole object recursively, only calling itself, not caring about line length or anything - back to _format(), if the output string is too long, _format() then calls the appropriate _pprint_TYPE() method (e.g. _pprint_list()) depending on the type of the object. This where the line length and the compact parameters are taken into account. - those _pprint_TYPE() methods will internally use the format() method for rendering the nested objects of an object (e.g. the elements of a list) In the end, everything has to be implemented twice: in _safe_repr and in the custom _pprint_TYPE methods. Unfortunately PrettyPrinter is really not straightforward to extend (especially when we want a compact output), so the code is a bit convoluted. This class overrides: - format() to support the changed_only parameter - _safe_repr to support printing of estimators (for when they fit on a single line) - _format_dict_items so that dict are correctly 'compacted' - _format_items so that ellipsis is used on long lists and tuples When estimators cannot be printed on a single line, the builtin _format() will call _pprint_estimator() because it was registered to do so (see _dispatch[BaseEstimator.__repr__] = _pprint_estimator). both _format_dict_items() and _pprint_estimator() use the _format_params_or_dict_items() method that will format parameters and key-value pairs respecting the compact parameter. This method needs another subroutine _pprint_key_val_tuple() used when a parameter or a key-value pair is too long to fit on a single line. This subroutine is called in _format() and is registered as well in the _dispatch dict (just like _pprint_estimator). We had to create the two classes KeyValTuple and KeyValTupleParam for this.", + "description": "Pretty Printer class for estimator objects.\n\nThis extends the pprint.PrettyPrinter class, because:\n- we need estimators to be printed with their parameters, e.g.\n Estimator(param1=value1, ...) which is not supported by default.\n- the 'compact' parameter of PrettyPrinter is ignored for dicts, which\n may lead to very long representations that we want to avoid.\n\nQuick overview of pprint.PrettyPrinter (see also\nhttps://stackoverflow.com/questions/49565047/pprint-with-hex-numbers):\n\n- the entry point is the _format() method which calls format() (overridden\n here)\n- format() directly calls _safe_repr() for a first try at rendering the\n object\n- _safe_repr formats the whole object recursively, only calling itself,\n not caring about line length or anything\n- back to _format(), if the output string is too long, _format() then calls\n the appropriate _pprint_TYPE() method (e.g. _pprint_list()) depending on\n the type of the object. This where the line length and the compact\n parameters are taken into account.\n- those _pprint_TYPE() methods will internally use the format() method for\n rendering the nested objects of an object (e.g. the elements of a list)\n\nIn the end, everything has to be implemented twice: in _safe_repr and in\nthe custom _pprint_TYPE methods. 
Unfortunately PrettyPrinter is really not\nstraightforward to extend (especially when we want a compact output), so\nthe code is a bit convoluted.\n\nThis class overrides:\n- format() to support the changed_only parameter\n- _safe_repr to support printing of estimators (for when they fit on a\n single line)\n- _format_dict_items so that dict are correctly 'compacted'\n- _format_items so that ellipsis is used on long lists and tuples\n\nWhen estimators cannot be printed on a single line, the builtin _format()\nwill call _pprint_estimator() because it was registered to do so (see\n_dispatch[BaseEstimator.__repr__] = _pprint_estimator).\n\nboth _format_dict_items() and _pprint_estimator() use the\n_format_params_or_dict_items() method that will format parameters and\nkey-value pairs respecting the compact parameter. This method needs another\nsubroutine _pprint_key_val_tuple() used when a parameter or a key-value\npair is too long to fit on a single line. This subroutine is called in\n_format() and is registered as well in the _dispatch dict (just like\n_pprint_estimator). We had to create the two classes KeyValTuple and\nKeyValTupleParam for this.", "docstring": "Pretty Printer class for estimator objects.\n\n This extends the pprint.PrettyPrinter class, because:\n - we need estimators to be printed with their parameters, e.g.\n Estimator(param1=value1, ...) which is not supported by default.\n - the 'compact' parameter of PrettyPrinter is ignored for dicts, which\n may lead to very long representations that we want to avoid.\n\n Quick overview of pprint.PrettyPrinter (see also\n https://stackoverflow.com/questions/49565047/pprint-with-hex-numbers):\n\n - the entry point is the _format() method which calls format() (overridden\n here)\n - format() directly calls _safe_repr() for a first try at rendering the\n object\n - _safe_repr formats the whole object recursively, only calling itself,\n not caring about line length or anything\n - back to _format(), if the output string is too long, _format() then calls\n the appropriate _pprint_TYPE() method (e.g. _pprint_list()) depending on\n the type of the object. This where the line length and the compact\n parameters are taken into account.\n - those _pprint_TYPE() methods will internally use the format() method for\n rendering the nested objects of an object (e.g. the elements of a list)\n\n In the end, everything has to be implemented twice: in _safe_repr and in\n the custom _pprint_TYPE methods. Unfortunately PrettyPrinter is really not\n straightforward to extend (especially when we want a compact output), so\n the code is a bit convoluted.\n\n This class overrides:\n - format() to support the changed_only parameter\n - _safe_repr to support printing of estimators (for when they fit on a\n single line)\n - _format_dict_items so that dict are correctly 'compacted'\n - _format_items so that ellipsis is used on long lists and tuples\n\n When estimators cannot be printed on a single line, the builtin _format()\n will call _pprint_estimator() because it was registered to do so (see\n _dispatch[BaseEstimator.__repr__] = _pprint_estimator).\n\n both _format_dict_items() and _pprint_estimator() use the\n _format_params_or_dict_items() method that will format parameters and\n key-value pairs respecting the compact parameter. This method needs another\n subroutine _pprint_key_val_tuple() used when a parameter or a key-value\n pair is too long to fit on a single line. 
This subroutine is called in\n _format() and is registered as well in the _dispatch dict (just like\n _pprint_estimator). We had to create the two classes KeyValTuple and\n KeyValTupleParam for this.\n ", "source_code": "\n\nclass _EstimatorPrettyPrinter(pprint.PrettyPrinter):\n \"\"\"Pretty Printer class for estimator objects.\n\n This extends the pprint.PrettyPrinter class, because:\n - we need estimators to be printed with their parameters, e.g.\n Estimator(param1=value1, ...) which is not supported by default.\n - the 'compact' parameter of PrettyPrinter is ignored for dicts, which\n may lead to very long representations that we want to avoid.\n\n Quick overview of pprint.PrettyPrinter (see also\n https://stackoverflow.com/questions/49565047/pprint-with-hex-numbers):\n\n - the entry point is the _format() method which calls format() (overridden\n here)\n - format() directly calls _safe_repr() for a first try at rendering the\n object\n - _safe_repr formats the whole object recursively, only calling itself,\n not caring about line length or anything\n - back to _format(), if the output string is too long, _format() then calls\n the appropriate _pprint_TYPE() method (e.g. _pprint_list()) depending on\n the type of the object. This where the line length and the compact\n parameters are taken into account.\n - those _pprint_TYPE() methods will internally use the format() method for\n rendering the nested objects of an object (e.g. the elements of a list)\n\n In the end, everything has to be implemented twice: in _safe_repr and in\n the custom _pprint_TYPE methods. Unfortunately PrettyPrinter is really not\n straightforward to extend (especially when we want a compact output), so\n the code is a bit convoluted.\n\n This class overrides:\n - format() to support the changed_only parameter\n - _safe_repr to support printing of estimators (for when they fit on a\n single line)\n - _format_dict_items so that dict are correctly 'compacted'\n - _format_items so that ellipsis is used on long lists and tuples\n\n When estimators cannot be printed on a single line, the builtin _format()\n will call _pprint_estimator() because it was registered to do so (see\n _dispatch[BaseEstimator.__repr__] = _pprint_estimator).\n\n both _format_dict_items() and _pprint_estimator() use the\n _format_params_or_dict_items() method that will format parameters and\n key-value pairs respecting the compact parameter. This method needs another\n subroutine _pprint_key_val_tuple() used when a parameter or a key-value\n pair is too long to fit on a single line. This subroutine is called in\n _format() and is registered as well in the _dispatch dict (just like\n _pprint_estimator). 
We had to create the two classes KeyValTuple and\n KeyValTupleParam for this.\n \"\"\"\n \n def __init__(self, indent=1, width=80, depth=None, stream=None, *, compact=False, indent_at_name=True, n_max_elements_to_show=None):\n super().__init__(indent, width, depth, stream, compact=compact)\n self._indent_at_name = indent_at_name\n if self._indent_at_name:\n self._indent_per_level = 1\n self._changed_only = get_config()['print_changed_only']\n self.n_max_elements_to_show = n_max_elements_to_show\n \n def format(self, object, context, maxlevels, level):\n return _safe_repr(object, context, maxlevels, level, changed_only=self._changed_only)\n \n def _pprint_estimator(self, object, stream, indent, allowance, context, level):\n stream.write(object.__class__.__name__ + '(')\n if self._indent_at_name:\n indent += len(object.__class__.__name__)\n if self._changed_only:\n params = _changed_params(object)\n else:\n params = object.get_params(deep=False)\n params = OrderedDict(((name, val) for (name, val) in sorted(params.items())))\n self._format_params(params.items(), stream, indent, allowance + 1, context, level)\n stream.write(')')\n \n def _format_dict_items(self, items, stream, indent, allowance, context, level):\n return self._format_params_or_dict_items(items, stream, indent, allowance, context, level, is_dict=True)\n \n def _format_params(self, items, stream, indent, allowance, context, level):\n return self._format_params_or_dict_items(items, stream, indent, allowance, context, level, is_dict=False)\n \n def _format_params_or_dict_items(self, object, stream, indent, allowance, context, level, is_dict):\n \"\"\"Format dict items or parameters respecting the compact=True\n parameter. For some reason, the builtin rendering of dict items doesn't\n respect compact=True and will use one line per key-value if all cannot\n fit in a single line.\n Dict items will be rendered as <'key': value> while params will be\n rendered as . The implementation is mostly copy/pasting from\n the builtin _format_items().\n This also adds ellipsis if the number of items is greater than\n self.n_max_elements_to_show.\n \"\"\"\n write = stream.write\n indent += self._indent_per_level\n delimnl = ',\\n' + ' ' * indent\n delim = ''\n width = max_width = self._width - indent + 1\n it = iter(object)\n try:\n next_ent = next(it)\n except StopIteration:\n return\n last = False\n n_items = 0\n while not last:\n if n_items == self.n_max_elements_to_show:\n write(', ...')\n break\n n_items += 1\n ent = next_ent\n try:\n next_ent = next(it)\n except StopIteration:\n last = True\n max_width -= allowance\n width -= allowance\n if self._compact:\n (k, v) = ent\n krepr = self._repr(k, context, level)\n vrepr = self._repr(v, context, level)\n if not is_dict:\n krepr = krepr.strip(\"'\")\n middle = ': ' if is_dict else '='\n rep = krepr + middle + vrepr\n w = len(rep) + 2\n if width < w:\n width = max_width\n if delim:\n delim = delimnl\n if width >= w:\n width -= w\n write(delim)\n delim = ', '\n write(rep)\n continue\n write(delim)\n delim = delimnl\n class_ = KeyValTuple if is_dict else KeyValTupleParam\n self._format(class_(ent), stream, indent, allowance if last else 1, context, level)\n \n def _format_items(self, items, stream, indent, allowance, context, level):\n \"\"\"Format the items of an iterable (list, tuple...). 
Same as the\n built-in _format_items, with support for ellipsis if the number of\n elements is greater than self.n_max_elements_to_show.\n \"\"\"\n write = stream.write\n indent += self._indent_per_level\n if self._indent_per_level > 1:\n write((self._indent_per_level - 1) * ' ')\n delimnl = ',\\n' + ' ' * indent\n delim = ''\n width = max_width = self._width - indent + 1\n it = iter(items)\n try:\n next_ent = next(it)\n except StopIteration:\n return\n last = False\n n_items = 0\n while not last:\n if n_items == self.n_max_elements_to_show:\n write(', ...')\n break\n n_items += 1\n ent = next_ent\n try:\n next_ent = next(it)\n except StopIteration:\n last = True\n max_width -= allowance\n width -= allowance\n if self._compact:\n rep = self._repr(ent, context, level)\n w = len(rep) + 2\n if width < w:\n width = max_width\n if delim:\n delim = delimnl\n if width >= w:\n width -= w\n write(delim)\n delim = ', '\n write(rep)\n continue\n write(delim)\n delim = delimnl\n self._format(ent, stream, indent, allowance if last else 1, context, level)\n \n def _pprint_key_val_tuple(self, object, stream, indent, allowance, context, level):\n \"\"\"Pretty printing for key-value tuples from dict or parameters.\"\"\"\n (k, v) = object\n rep = self._repr(k, context, level)\n if isinstance(object, KeyValTupleParam):\n rep = rep.strip(\"'\")\n middle = '='\n else:\n middle = ': '\n stream.write(rep)\n stream.write(middle)\n self._format(v, stream, indent + len(rep) + len(middle), allowance, context, level)\n _dispatch = pprint.PrettyPrinter._dispatch.copy()\n _dispatch[BaseEstimator.__repr__] = _pprint_estimator\n _dispatch[KeyValTuple.__repr__] = _pprint_key_val_tuple\n" }, @@ -26806,7 +26902,7 @@ "sklearn.utils._testing.MinimalClassifier.score" ], "is_public": false, - "description": "Minimal classifier implementation with inheriting from BaseEstimator.\n\nThis estimator should be tested with: * `check_estimator` in `test_estimator_checks.py`; * within a `Pipeline` in `test_pipeline.py`; * within a `SearchCV` in `test_search.py`.", + "description": "Minimal classifier implementation with inheriting from BaseEstimator.\n\nThis estimator should be tested with:\n\n* `check_estimator` in `test_estimator_checks.py`;\n* within a `Pipeline` in `test_pipeline.py`;\n* within a `SearchCV` in `test_search.py`.", "docstring": "Minimal classifier implementation with inheriting from BaseEstimator.\n\n This estimator should be tested with:\n\n * `check_estimator` in `test_estimator_checks.py`;\n * within a `Pipeline` in `test_pipeline.py`;\n * within a `SearchCV` in `test_search.py`.\n ", "source_code": "\n\nclass MinimalClassifier:\n \"\"\"Minimal classifier implementation with inheriting from BaseEstimator.\n\n This estimator should be tested with:\n\n * `check_estimator` in `test_estimator_checks.py`;\n * within a `Pipeline` in `test_pipeline.py`;\n * within a `SearchCV` in `test_search.py`.\n \"\"\"\n _estimator_type = 'classifier'\n \n def __init__(self, param=None):\n self.param = param\n \n def get_params(self, deep=True):\n return {'param': self.param}\n \n def set_params(self, **params):\n for (key, value) in params.items():\n setattr(self, key, value)\n return self\n \n def fit(self, X, y):\n (X, y) = check_X_y(X, y)\n check_classification_targets(y)\n (self.classes_, counts) = np.unique(y, return_counts=True)\n self._most_frequent_class_idx = counts.argmax()\n return self\n \n def predict_proba(self, X):\n check_is_fitted(self)\n X = check_array(X)\n proba_shape = (X.shape[0], self.classes_.size)\n y_proba = 
np.zeros(shape=proba_shape, dtype=np.float64)\n y_proba[:, self._most_frequent_class_idx] = 1.0\n return y_proba\n \n def predict(self, X):\n y_proba = self.predict_proba(X)\n y_pred = y_proba.argmax(axis=1)\n return self.classes_[y_pred]\n \n def score(self, X, y):\n from sklearn.metrics import accuracy_score\n return accuracy_score(y, self.predict(X))\n" }, @@ -26824,7 +26920,7 @@ "sklearn.utils._testing.MinimalRegressor.score" ], "is_public": false, - "description": "Minimal regressor implementation with inheriting from BaseEstimator.\n\nThis estimator should be tested with: * `check_estimator` in `test_estimator_checks.py`; * within a `Pipeline` in `test_pipeline.py`; * within a `SearchCV` in `test_search.py`.", + "description": "Minimal regressor implementation with inheriting from BaseEstimator.\n\nThis estimator should be tested with:\n\n* `check_estimator` in `test_estimator_checks.py`;\n* within a `Pipeline` in `test_pipeline.py`;\n* within a `SearchCV` in `test_search.py`.", "docstring": "Minimal regressor implementation with inheriting from BaseEstimator.\n\n This estimator should be tested with:\n\n * `check_estimator` in `test_estimator_checks.py`;\n * within a `Pipeline` in `test_pipeline.py`;\n * within a `SearchCV` in `test_search.py`.\n ", "source_code": "\n\nclass MinimalRegressor:\n \"\"\"Minimal regressor implementation with inheriting from BaseEstimator.\n\n This estimator should be tested with:\n\n * `check_estimator` in `test_estimator_checks.py`;\n * within a `Pipeline` in `test_pipeline.py`;\n * within a `SearchCV` in `test_search.py`.\n \"\"\"\n _estimator_type = 'regressor'\n \n def __init__(self, param=None):\n self.param = param\n \n def get_params(self, deep=True):\n return {'param': self.param}\n \n def set_params(self, **params):\n for (key, value) in params.items():\n setattr(self, key, value)\n return self\n \n def fit(self, X, y):\n (X, y) = check_X_y(X, y)\n self.is_fitted_ = True\n self._mean = np.mean(y)\n return self\n \n def predict(self, X):\n check_is_fitted(self)\n X = check_array(X)\n return np.ones(shape=(X.shape[0], )) * self._mean\n \n def score(self, X, y):\n from sklearn.metrics import r2_score\n return r2_score(y, self.predict(X))\n" }, @@ -26842,7 +26938,7 @@ "sklearn.utils._testing.MinimalTransformer.fit_transform" ], "is_public": false, - "description": "Minimal transformer implementation with inheriting from BaseEstimator.\n\nThis estimator should be tested with: * `check_estimator` in `test_estimator_checks.py`; * within a `Pipeline` in `test_pipeline.py`; * within a `SearchCV` in `test_search.py`.", + "description": "Minimal transformer implementation with inheriting from\nBaseEstimator.\n\nThis estimator should be tested with:\n\n* `check_estimator` in `test_estimator_checks.py`;\n* within a `Pipeline` in `test_pipeline.py`;\n* within a `SearchCV` in `test_search.py`.", "docstring": "Minimal transformer implementation with inheriting from\n BaseEstimator.\n\n This estimator should be tested with:\n\n * `check_estimator` in `test_estimator_checks.py`;\n * within a `Pipeline` in `test_pipeline.py`;\n * within a `SearchCV` in `test_search.py`.\n ", "source_code": "\n\nclass MinimalTransformer:\n \"\"\"Minimal transformer implementation with inheriting from\n BaseEstimator.\n\n This estimator should be tested with:\n\n * `check_estimator` in `test_estimator_checks.py`;\n * within a `Pipeline` in `test_pipeline.py`;\n * within a `SearchCV` in `test_search.py`.\n \"\"\"\n \n def __init__(self, param=None):\n self.param = param\n \n def 
get_params(self, deep=True):\n return {'param': self.param}\n \n def set_params(self, **params):\n for (key, value) in params.items():\n setattr(self, key, value)\n return self\n \n def fit(self, X, y=None):\n check_array(X)\n self.is_fitted_ = True\n return self\n \n def transform(self, X, y=None):\n check_is_fitted(self)\n X = check_array(X)\n return X\n \n def fit_transform(self, X, y=None):\n return self.fit(X, y).transform(X, y)\n" }, @@ -26874,7 +26970,7 @@ "sklearn.utils._testing._IgnoreWarnings.__exit__" ], "is_public": false, - "description": "Improved and simplified Python warnings context manager and decorator.\n\nThis class allows the user to ignore the warnings raised by a function. Copied from Python 2.7.5 and modified as required.", + "description": "Improved and simplified Python warnings context manager and decorator.\n\nThis class allows the user to ignore the warnings raised by a function.\nCopied from Python 2.7.5 and modified as required.", "docstring": "Improved and simplified Python warnings context manager and decorator.\n\n This class allows the user to ignore the warnings raised by a function.\n Copied from Python 2.7.5 and modified as required.\n\n Parameters\n ----------\n category : tuple of warning class, default=Warning\n The category to filter. By default, all the categories will be muted.\n\n ", "source_code": "\n\nclass _IgnoreWarnings:\n \"\"\"Improved and simplified Python warnings context manager and decorator.\n\n This class allows the user to ignore the warnings raised by a function.\n Copied from Python 2.7.5 and modified as required.\n\n Parameters\n ----------\n category : tuple of warning class, default=Warning\n The category to filter. By default, all the categories will be muted.\n\n \"\"\"\n \n def __init__(self, category):\n self._record = True\n self._module = sys.modules['warnings']\n self._entered = False\n self.log = []\n self.category = category\n \n def __call__(self, fn):\n \"\"\"Decorator to catch and hide warnings without visual nesting.\"\"\"\n \n @wraps(fn)\n def wrapper(*args, **kwargs):\n with warnings.catch_warnings():\n warnings.simplefilter('ignore', self.category)\n return fn(*args, **kwargs)\n return wrapper\n \n def __repr__(self):\n args = []\n if self._record:\n args.append('record=True')\n if self._module is not sys.modules['warnings']:\n args.append('module=%r' % self._module)\n name = type(self).__name__\n return '%s(%s)' % (name, ', '.join(args))\n \n def __enter__(self):\n if self._entered:\n raise RuntimeError('Cannot enter %r twice' % self)\n self._entered = True\n self._filters = self._module.filters\n self._module.filters = self._filters[:]\n self._showwarning = self._module.showwarning\n warnings.simplefilter('ignore', self.category)\n \n def __exit__(self, *exc_info):\n if not self._entered:\n raise RuntimeError('Cannot exit %r without entering first' % self)\n self._module.filters = self._filters\n self._module.showwarning = self._showwarning\n self.log[:] = []\n" }, @@ -26906,7 +27002,7 @@ "sklearn.utils.deprecation.deprecated._update_doc" ], "is_public": true, - "description": "Decorator to mark a function or class as deprecated.\n\nIssue a warning when the function is called/the class is instantiated and adds a warning to the docstring. The optional extra argument will be appended to the deprecation message and the docstring. Note: to use this with the default value for extra, put in an empty of parentheses: >>> from sklearn.utils import deprecated >>> deprecated() >>> @deprecated() ... 
def some_function(): pass", + "description": "Decorator to mark a function or class as deprecated.\n\nIssue a warning when the function is called/the class is instantiated and\nadds a warning to the docstring.\n\nThe optional extra argument will be appended to the deprecation message\nand the docstring. Note: to use this with the default value for extra, put\nin an empty of parentheses:\n\n>>> from sklearn.utils import deprecated\n>>> deprecated()\n\n\n>>> @deprecated()\n... def some_function(): pass", "docstring": "Decorator to mark a function or class as deprecated.\n\n Issue a warning when the function is called/the class is instantiated and\n adds a warning to the docstring.\n\n The optional extra argument will be appended to the deprecation message\n and the docstring. Note: to use this with the default value for extra, put\n in an empty of parentheses:\n\n >>> from sklearn.utils import deprecated\n >>> deprecated()\n \n\n >>> @deprecated()\n ... def some_function(): pass\n\n Parameters\n ----------\n extra : str, default=''\n To be added to the deprecation messages.\n ", "source_code": "\n\nclass deprecated:\n \"\"\"Decorator to mark a function or class as deprecated.\n\n Issue a warning when the function is called/the class is instantiated and\n adds a warning to the docstring.\n\n The optional extra argument will be appended to the deprecation message\n and the docstring. Note: to use this with the default value for extra, put\n in an empty of parentheses:\n\n >>> from sklearn.utils import deprecated\n >>> deprecated()\n \n\n >>> @deprecated()\n ... def some_function(): pass\n\n Parameters\n ----------\n extra : str, default=''\n To be added to the deprecation messages.\n \"\"\"\n \n def __init__(self, extra=''):\n self.extra = extra\n \n def __call__(self, obj):\n \"\"\"Call method\n\n Parameters\n ----------\n obj : object\n \"\"\"\n if isinstance(obj, type):\n return self._decorate_class(obj)\n elif isinstance(obj, property):\n return self._decorate_property(obj)\n else:\n return self._decorate_fun(obj)\n \n def _decorate_class(self, cls):\n msg = 'Class %s is deprecated' % cls.__name__\n if self.extra:\n msg += '; %s' % self.extra\n init = cls.__init__\n \n def wrapped(*args, **kwargs):\n warnings.warn(msg, category=FutureWarning)\n return init(*args, **kwargs)\n cls.__init__ = wrapped\n wrapped.__name__ = '__init__'\n wrapped.__doc__ = self._update_doc(init.__doc__)\n wrapped.deprecated_original = init\n return cls\n \n def _decorate_fun(self, fun):\n \"\"\"Decorate function fun\"\"\"\n msg = 'Function %s is deprecated' % fun.__name__\n if self.extra:\n msg += '; %s' % self.extra\n \n @functools.wraps(fun)\n def wrapped(*args, **kwargs):\n warnings.warn(msg, category=FutureWarning)\n return fun(*args, **kwargs)\n wrapped.__doc__ = self._update_doc(wrapped.__doc__)\n wrapped.__wrapped__ = fun\n return wrapped\n \n def _decorate_property(self, prop):\n msg = self.extra\n \n @property\n @functools.wraps(prop)\n def wrapped(*args, **kwargs):\n warnings.warn(msg, category=FutureWarning)\n return prop.fget(*args, **kwargs)\n wrapped.__doc__ = self._update_doc(wrapped.__doc__)\n return wrapped\n \n def _update_doc(self, olddoc):\n newdoc = 'DEPRECATED'\n if self.extra:\n newdoc = '%s: %s' % (newdoc, self.extra)\n if olddoc:\n newdoc = '%s\\n\\n %s' % (newdoc, olddoc)\n return newdoc\n" }, @@ -26960,7 +27056,7 @@ "sklearn.utils.metaestimators._AvailableIfDescriptor.__get__" ], "is_public": false, - "description": "Implements a conditional property using the descriptor 
protocol.\n\nUsing this class to create a decorator will raise an ``AttributeError`` if check(self) returns a falsey value. Note that if check raises an error this will also result in hasattr returning false. See https://docs.python.org/3/howto/descriptor.html for an explanation of descriptors.", + "description": "Implements a conditional property using the descriptor protocol.\n\nUsing this class to create a decorator will raise an ``AttributeError``\nif check(self) returns a falsey value. Note that if check raises an error\nthis will also result in hasattr returning false.\n\nSee https://docs.python.org/3/howto/descriptor.html for an explanation of\ndescriptors.", "docstring": "Implements a conditional property using the descriptor protocol.\n\n Using this class to create a decorator will raise an ``AttributeError``\n if check(self) returns a falsey value. Note that if check raises an error\n this will also result in hasattr returning false.\n\n See https://docs.python.org/3/howto/descriptor.html for an explanation of\n descriptors.\n ", "source_code": "\n\nclass _AvailableIfDescriptor:\n \"\"\"Implements a conditional property using the descriptor protocol.\n\n Using this class to create a decorator will raise an ``AttributeError``\n if check(self) returns a falsey value. Note that if check raises an error\n this will also result in hasattr returning false.\n\n See https://docs.python.org/3/howto/descriptor.html for an explanation of\n descriptors.\n \"\"\"\n \n def __init__(self, fn, check, attribute_name):\n self.fn = fn\n self.check = check\n self.attribute_name = attribute_name\n update_wrapper(self, fn)\n \n def __get__(self, obj, owner=None):\n attr_err = AttributeError(f'This {repr(owner.__name__)} has no attribute {repr(self.attribute_name)}')\n if obj is not None:\n if not self.check(obj):\n raise attr_err\n out = lambda *args, **kwargs: self.fn(obj, *args, **kwargs)\n else:\n \n def fn(*args, **kwargs):\n if not self.check(args[0]):\n raise attr_err\n return self.fn(*args, **kwargs)\n out = lambda *args, **kwargs: fn(*args, **kwargs)\n update_wrapper(out, self.fn)\n return out\n" }, @@ -26991,7 +27087,7 @@ "sklearn.utils.metaestimators._IffHasAttrDescriptor._check" ], "is_public": false, - "description": "Implements a conditional property using the descriptor protocol.\n\nUsing this class to create a decorator will raise an ``AttributeError`` if none of the delegates (specified in ``delegate_names``) is an attribute of the base object or the first found delegate does not have an attribute ``attribute_name``. This allows ducktyping of the decorated method based on ``delegate.attribute_name``. Here ``delegate`` is the first item in ``delegate_names`` for which ``hasattr(object, delegate) is True``. See https://docs.python.org/3/howto/descriptor.html for an explanation of descriptors.", + "description": "Implements a conditional property using the descriptor protocol.\n\nUsing this class to create a decorator will raise an ``AttributeError``\nif none of the delegates (specified in ``delegate_names``) is an attribute\nof the base object or the first found delegate does not have an attribute\n``attribute_name``.\n\nThis allows ducktyping of the decorated method based on\n``delegate.attribute_name``. 
Here ``delegate`` is the first item in\n``delegate_names`` for which ``hasattr(object, delegate) is True``.\n\nSee https://docs.python.org/3/howto/descriptor.html for an explanation of\ndescriptors.", "docstring": "Implements a conditional property using the descriptor protocol.\n\n Using this class to create a decorator will raise an ``AttributeError``\n if none of the delegates (specified in ``delegate_names``) is an attribute\n of the base object or the first found delegate does not have an attribute\n ``attribute_name``.\n\n This allows ducktyping of the decorated method based on\n ``delegate.attribute_name``. Here ``delegate`` is the first item in\n ``delegate_names`` for which ``hasattr(object, delegate) is True``.\n\n See https://docs.python.org/3/howto/descriptor.html for an explanation of\n descriptors.\n ", "source_code": "\n\nclass _IffHasAttrDescriptor(_AvailableIfDescriptor):\n \"\"\"Implements a conditional property using the descriptor protocol.\n\n Using this class to create a decorator will raise an ``AttributeError``\n if none of the delegates (specified in ``delegate_names``) is an attribute\n of the base object or the first found delegate does not have an attribute\n ``attribute_name``.\n\n This allows ducktyping of the decorated method based on\n ``delegate.attribute_name``. Here ``delegate`` is the first item in\n ``delegate_names`` for which ``hasattr(object, delegate) is True``.\n\n See https://docs.python.org/3/howto/descriptor.html for an explanation of\n descriptors.\n \"\"\"\n \n def __init__(self, fn, delegate_names, attribute_name):\n super().__init__(fn, self._check, attribute_name)\n self.delegate_names = delegate_names\n \n def _check(self, obj):\n delegate = None\n for delegate_name in self.delegate_names:\n try:\n delegate = attrgetter(delegate_name)(obj)\n break\n except AttributeError:\n continue\n if delegate is None:\n return False\n getattr(delegate, self.attribute_name)\n return True\n" }, @@ -27023,13 +27119,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef raise_build_error(e):\n local_dir = os.path.split(__file__)[0]\n msg = STANDARD_MSG\n if local_dir == 'sklearn/__check_build':\n msg = INPLACE_MSG\n dir_content = list()\n for (i, filename) in enumerate(os.listdir(local_dir)):\n if (i + 1) % 3:\n dir_content.append(filename.ljust(26))\n else:\n dir_content.append(filename + '\\n')\n raise ImportError('%s\\n___________________________________________________________________________\\nContents of %s:\\n%s\\n___________________________________________________________________________\\nIt seems that scikit-learn has not been built correctly.\\n\\nIf you have installed scikit-learn from source, please do not forget\\nto build the package before using it: run `python setup.py install` or\\n`make` in the source directory.\\n%s' % (e, local_dir, ''.join(dir_content).strip(), msg))" }, { @@ -27047,7 +27144,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "top_path", @@ -27057,13 +27155,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef configuration(parent_package='', top_path=None):\n from numpy.distutils.misc_util import Configuration\n config = Configuration('__check_build', parent_package, top_path)\n 
config.add_extension('_check_build', sources=['_check_build.pyx'], include_dirs=[numpy.get_include()])\n return config" }, { @@ -27076,7 +27175,7 @@ "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _check_cython_version():\n message = 'Please install Cython with a version >= {0} in order to build a scikit-learn from source.'.format(CYTHON_MIN_VERSION)\n try:\n import Cython\n except ModuleNotFoundError as e:\n raise ModuleNotFoundError(message) from e\n if LooseVersion(Cython.__version__) < CYTHON_MIN_VERSION:\n message += ' The current version of Cython is {} installed in {}.'.format(Cython.__version__, Cython.__path__)\n raise ValueError(message)" }, { @@ -27094,7 +27193,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "config", @@ -27104,14 +27204,15 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Check that a recent Cython is available and cythonize extensions", "docstring": "Check that a recent Cython is available and cythonize extensions", - "source_code": "\ndef cythonize_extensions(top_path, config):\n \"\"\"Check that a recent Cython is available and cythonize extensions\"\"\"\n _check_cython_version()\n from Cython.Build import cythonize\n basic_check_build()\n sklearn._OPENMP_SUPPORTED = check_openmp_support()\n n_jobs = 1\n with contextlib.suppress(ImportError):\n import joblib\n if LooseVersion(joblib.__version__) > LooseVersion('0.13.0'):\n n_jobs = joblib.cpu_count()\n config.ext_modules = cythonize(config.ext_modules, nthreads=n_jobs, compile_time_env={'SKLEARN_OPENMP_PARALLELISM_ENABLED': sklearn._OPENMP_SUPPORTED}, compiler_directives={'language_level': 3})" + "source_code": "\ndef cythonize_extensions(top_path, config):\n \"\"\"Check that a recent Cython is available and cythonize extensions\"\"\"\n _check_cython_version()\n from Cython.Build import cythonize\n basic_check_build()\n sklearn._OPENMP_SUPPORTED = check_openmp_support()\n n_jobs = 1\n with contextlib.suppress(ImportError):\n import joblib\n if LooseVersion(joblib.__version__) > LooseVersion('0.13.0'):\n n_jobs = joblib.cpu_count()\n cython_enable_debug_directives = os.environ.get('SKLEARN_ENABLE_DEBUG_CYTHON_DIRECTIVES', '0') != '0'\n config.ext_modules = cythonize(config.ext_modules, nthreads=n_jobs, compile_time_env={'SKLEARN_OPENMP_PARALLELISM_ENABLED': sklearn._OPENMP_SUPPORTED}, compiler_directives={'language_level': 3, 'boundscheck': cython_enable_debug_directives, 'wraparound': False, 'initializedcheck': False, 'nonecheck': False, 'cdivision': True})" }, { "name": "gen_from_templates", @@ -27128,7 +27229,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -27165,13 +27267,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef get_openmp_flag(compiler):\n if hasattr(compiler, 'compiler'):\n compiler = compiler.compiler[0]\n else:\n compiler = compiler.__class__.__name__\n if sys.platform == 'win32' and ('icc' in compiler or 'icl' in compiler):\n return ['/Qopenmp']\n elif sys.platform == 'win32':\n return ['/openmp']\n elif sys.platform in ('darwin', 'linux') and 'icc' in compiler:\n return ['-qopenmp']\n elif sys.platform == 'darwin' and 'openmp' in os.getenv('CPPFLAGS', ''):\n return []\n return ['-fopenmp']" }, 
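Note on the `cythonize_extensions` change above: the new source_code string reads the `SKLEARN_ENABLE_DEBUG_CYTHON_DIRECTIVES` environment variable and maps it onto Cython compiler directives, keeping bounds checking off for regular builds and re-enabling it for debug builds. A minimal sketch of that pattern, for illustration only (it is not part of this data file; the dictionary below is copied from the source_code string shown above):

import os

# Any value other than "0" re-enables bounds checking, matching the logic
# in the cythonize_extensions source_code entry above.
debug = os.environ.get("SKLEARN_ENABLE_DEBUG_CYTHON_DIRECTIVES", "0") != "0"

compiler_directives = {
    "language_level": 3,
    "boundscheck": debug,        # True only when the debug variable is set
    "wraparound": False,
    "initializedcheck": False,
    "nonecheck": False,
    "cdivision": True,
}
print(compiler_directives)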
{ @@ -27183,8 +27286,8 @@ "parameters": [], "results": [], "is_public": false, - "description": "Get a compiler equivalent to the one that will be used to build sklearn\n\nHandles compiler specified as follows: - python setup.py build_ext --compiler= - CC= python setup.py build_ext", - "docstring": "Get a compiler equivalent to the one that will be used to build sklearn\n\nHandles compiler specified as follows:\n - python setup.py build_ext --compiler=\n - CC= python setup.py build_ext", + "description": "Get a compiler equivalent to the one that will be used to build sklearn\n\nHandles compiler specified as follows:\n - python setup.py build_ext --compiler=\n - CC= python setup.py build_ext", + "docstring": "Get a compiler equivalent to the one that will be used to build sklearn\n\n Handles compiler specified as follows:\n - python setup.py build_ext --compiler=\n - CC= python setup.py build_ext\n ", "source_code": "\ndef _get_compiler():\n \"\"\"Get a compiler equivalent to the one that will be used to build sklearn\n\n Handles compiler specified as follows:\n - python setup.py build_ext --compiler=\n - CC= python setup.py build_ext\n \"\"\"\n dist = Distribution({'script_name': os.path.basename(sys.argv[0]), 'script_args': sys.argv[1:], 'cmdclass': {'config_cc': config_cc}})\n dist.parse_config_files()\n dist.parse_command_line()\n cmd_opts = dist.command_options.get('build_ext')\n if cmd_opts is not None and 'compiler' in cmd_opts:\n compiler = cmd_opts['compiler'][1]\n else:\n compiler = None\n ccompiler = new_compiler(compiler=compiler)\n customize_compiler(ccompiler)\n return ccompiler" }, { @@ -27215,7 +27318,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "extra_preargs", @@ -27225,7 +27329,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "extra_postargs", @@ -27235,7 +27340,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -27253,8 +27359,8 @@ "parameters": [], "results": [], "is_public": false, - "description": "Get a threadlocal **mutable** configuration. If the configuration does not exist, copy the default global configuration.", - "docstring": "Get a threadlocal **mutable** configuration. If the configuration\ndoes not exist, copy the default global configuration.", + "description": "Get a threadlocal **mutable** configuration. If the configuration\ndoes not exist, copy the default global configuration.", + "docstring": "Get a threadlocal **mutable** configuration. If the configuration\n does not exist, copy the default global configuration.", "source_code": "\ndef _get_threadlocal_config():\n \"\"\"Get a threadlocal **mutable** configuration. If the configuration\n does not exist, copy the default global configuration.\"\"\"\n if not hasattr(_threadlocal, 'global_config'):\n _threadlocal.global_config = _global_config.copy()\n return _threadlocal.global_config" }, { @@ -27263,12 +27369,60 @@ "qname": "sklearn._config.config_context", "unique_qname": "sklearn._config.config_context", "decorators": ["contextmanager"], - "parameters": [], + "parameters": [ + { + "name": "assume_finite", + "default_value": "None", + "is_public": true, + "assigned_by": "NAME_ONLY", + "docstring": { + "type": "bool, default=None", + "description": "If True, validation for finiteness will be skipped,\nsaving time, but leading to potential crashes. If\nFalse, validation for finiteness will be performed,\navoiding error. 
If None, the existing value won't change.\nThe default value is False." + }, + "refined_type": {} + }, + { + "name": "working_memory", + "default_value": "None", + "is_public": true, + "assigned_by": "NAME_ONLY", + "docstring": { + "type": "int, default=None", + "description": "If set, scikit-learn will attempt to limit the size of temporary arrays\nto this number of MiB (per job when parallelised), often saving both\ncomputation time and memory on expensive operations that can be\nperformed in chunks. If None, the existing value won't change.\nThe default value is 1024." + }, + "refined_type": {} + }, + { + "name": "print_changed_only", + "default_value": "None", + "is_public": true, + "assigned_by": "NAME_ONLY", + "docstring": { + "type": "bool, default=None", + "description": "If True, only the parameters that were set to non-default\nvalues will be printed when printing an estimator. For example,\n``print(SVC())`` while True will only print 'SVC()', but would print\n'SVC(C=1.0, cache_size=200, ...)' with all the non-changed parameters\nwhen False. If None, the existing value won't change.\nThe default value is True.\n\n.. versionchanged:: 0.23\n Default changed from False to True." + }, + "refined_type": {} + }, + { + "name": "display", + "default_value": "None", + "is_public": true, + "assigned_by": "NAME_ONLY", + "docstring": { + "type": "{'text', 'diagram'}, default=None", + "description": "If 'diagram', estimators will be displayed as a diagram in a Jupyter\nlab or notebook context. If 'text', estimators will be displayed as\ntext. If None, the existing value won't change.\nThe default value is 'text'.\n\n.. versionadded:: 0.23" + }, + "refined_type": { + "kind": "EnumType", + "values": ["text", "diagram"] + } + } + ], "results": [], "is_public": true, - "description": "Context manager for global scikit-learn configuration", - "docstring": "Context manager for global scikit-learn configuration\n\nParameters\n----------\nassume_finite : bool, default=False\n If True, validation for finiteness will be skipped,\n saving time, but leading to potential crashes. If\n False, validation for finiteness will be performed,\n avoiding error. Global default: False.\n\nworking_memory : int, default=1024\n If set, scikit-learn will attempt to limit the size of temporary arrays\n to this number of MiB (per job when parallelised), often saving both\n computation time and memory on expensive operations that can be\n performed in chunks. Global default: 1024.\n\nprint_changed_only : bool, default=True\n If True, only the parameters that were set to non-default\n values will be printed when printing an estimator. For example,\n ``print(SVC())`` while True will only print 'SVC()', but would print\n 'SVC(C=1.0, cache_size=200, ...)' with all the non-changed parameters\n when False. Default is True.\n\n .. versionchanged:: 0.23\n Default changed from False to True.\n\ndisplay : {'text', 'diagram'}, default='text'\n If 'diagram', estimators will be displayed as a diagram in a Jupyter\n lab or notebook context. If 'text', estimators will be displayed as\n text. Default is 'text'.\n\n .. versionadded:: 0.23\n\nNotes\n-----\nAll settings, not just those presently modified, will be returned to\ntheir previous values when the context manager is exited.\n\nExamples\n--------\n>>> import sklearn\n>>> from sklearn.utils.validation import assert_all_finite\n>>> with sklearn.config_context(assume_finite=True):\n... assert_all_finite([float('nan')])\n>>> with sklearn.config_context(assume_finite=True):\n... 
with sklearn.config_context(assume_finite=False):\n... assert_all_finite([float('nan')])\nTraceback (most recent call last):\n...\nValueError: Input contains NaN, ...\n\nSee Also\n--------\nset_config : Set global scikit-learn configuration.\nget_config : Retrieve current values of the global configuration.", - "source_code": "\n@contextmanager\ndef config_context(**new_config):\n \"\"\"Context manager for global scikit-learn configuration\n\n Parameters\n ----------\n assume_finite : bool, default=False\n If True, validation for finiteness will be skipped,\n saving time, but leading to potential crashes. If\n False, validation for finiteness will be performed,\n avoiding error. Global default: False.\n\n working_memory : int, default=1024\n If set, scikit-learn will attempt to limit the size of temporary arrays\n to this number of MiB (per job when parallelised), often saving both\n computation time and memory on expensive operations that can be\n performed in chunks. Global default: 1024.\n\n print_changed_only : bool, default=True\n If True, only the parameters that were set to non-default\n values will be printed when printing an estimator. For example,\n ``print(SVC())`` while True will only print 'SVC()', but would print\n 'SVC(C=1.0, cache_size=200, ...)' with all the non-changed parameters\n when False. Default is True.\n\n .. versionchanged:: 0.23\n Default changed from False to True.\n\n display : {'text', 'diagram'}, default='text'\n If 'diagram', estimators will be displayed as a diagram in a Jupyter\n lab or notebook context. If 'text', estimators will be displayed as\n text. Default is 'text'.\n\n .. versionadded:: 0.23\n\n Notes\n -----\n All settings, not just those presently modified, will be returned to\n their previous values when the context manager is exited.\n\n Examples\n --------\n >>> import sklearn\n >>> from sklearn.utils.validation import assert_all_finite\n >>> with sklearn.config_context(assume_finite=True):\n ... assert_all_finite([float('nan')])\n >>> with sklearn.config_context(assume_finite=True):\n ... with sklearn.config_context(assume_finite=False):\n ... assert_all_finite([float('nan')])\n Traceback (most recent call last):\n ...\n ValueError: Input contains NaN, ...\n\n See Also\n --------\n set_config : Set global scikit-learn configuration.\n get_config : Retrieve current values of the global configuration.\n \"\"\"\n old_config = get_config()\n set_config(**new_config)\n try:\n yield\n finally:\n set_config(**old_config)" + "description": "Context manager for global scikit-learn configuration.", + "docstring": "Context manager for global scikit-learn configuration.\n\n Parameters\n ----------\n assume_finite : bool, default=None\n If True, validation for finiteness will be skipped,\n saving time, but leading to potential crashes. If\n False, validation for finiteness will be performed,\n avoiding error. If None, the existing value won't change.\n The default value is False.\n\n working_memory : int, default=None\n If set, scikit-learn will attempt to limit the size of temporary arrays\n to this number of MiB (per job when parallelised), often saving both\n computation time and memory on expensive operations that can be\n performed in chunks. If None, the existing value won't change.\n The default value is 1024.\n\n print_changed_only : bool, default=None\n If True, only the parameters that were set to non-default\n values will be printed when printing an estimator. 
For example,\n ``print(SVC())`` while True will only print 'SVC()', but would print\n 'SVC(C=1.0, cache_size=200, ...)' with all the non-changed parameters\n when False. If None, the existing value won't change.\n The default value is True.\n\n .. versionchanged:: 0.23\n Default changed from False to True.\n\n display : {'text', 'diagram'}, default=None\n If 'diagram', estimators will be displayed as a diagram in a Jupyter\n lab or notebook context. If 'text', estimators will be displayed as\n text. If None, the existing value won't change.\n The default value is 'text'.\n\n .. versionadded:: 0.23\n\n Yields\n ------\n None.\n\n See Also\n --------\n set_config : Set global scikit-learn configuration.\n get_config : Retrieve current values of the global configuration.\n\n Notes\n -----\n All settings, not just those presently modified, will be returned to\n their previous values when the context manager is exited.\n\n Examples\n --------\n >>> import sklearn\n >>> from sklearn.utils.validation import assert_all_finite\n >>> with sklearn.config_context(assume_finite=True):\n ... assert_all_finite([float('nan')])\n >>> with sklearn.config_context(assume_finite=True):\n ... with sklearn.config_context(assume_finite=False):\n ... assert_all_finite([float('nan')])\n Traceback (most recent call last):\n ...\n ValueError: Input contains NaN...\n ", + "source_code": "\n@contextmanager\ndef config_context(*, assume_finite=None, working_memory=None, print_changed_only=None, display=None):\n \"\"\"Context manager for global scikit-learn configuration.\n\n Parameters\n ----------\n assume_finite : bool, default=None\n If True, validation for finiteness will be skipped,\n saving time, but leading to potential crashes. If\n False, validation for finiteness will be performed,\n avoiding error. If None, the existing value won't change.\n The default value is False.\n\n working_memory : int, default=None\n If set, scikit-learn will attempt to limit the size of temporary arrays\n to this number of MiB (per job when parallelised), often saving both\n computation time and memory on expensive operations that can be\n performed in chunks. If None, the existing value won't change.\n The default value is 1024.\n\n print_changed_only : bool, default=None\n If True, only the parameters that were set to non-default\n values will be printed when printing an estimator. For example,\n ``print(SVC())`` while True will only print 'SVC()', but would print\n 'SVC(C=1.0, cache_size=200, ...)' with all the non-changed parameters\n when False. If None, the existing value won't change.\n The default value is True.\n\n .. versionchanged:: 0.23\n Default changed from False to True.\n\n display : {'text', 'diagram'}, default=None\n If 'diagram', estimators will be displayed as a diagram in a Jupyter\n lab or notebook context. If 'text', estimators will be displayed as\n text. If None, the existing value won't change.\n The default value is 'text'.\n\n .. versionadded:: 0.23\n\n Yields\n ------\n None.\n\n See Also\n --------\n set_config : Set global scikit-learn configuration.\n get_config : Retrieve current values of the global configuration.\n\n Notes\n -----\n All settings, not just those presently modified, will be returned to\n their previous values when the context manager is exited.\n\n Examples\n --------\n >>> import sklearn\n >>> from sklearn.utils.validation import assert_all_finite\n >>> with sklearn.config_context(assume_finite=True):\n ... 
assert_all_finite([float('nan')])\n >>> with sklearn.config_context(assume_finite=True):\n ... with sklearn.config_context(assume_finite=False):\n ... assert_all_finite([float('nan')])\n Traceback (most recent call last):\n ...\n ValueError: Input contains NaN...\n \"\"\"\n old_config = get_config()\n set_config(assume_finite=assume_finite, working_memory=working_memory, print_changed_only=print_changed_only, display=display)\n try:\n yield\n finally:\n set_config(**old_config)" }, { "name": "get_config", @@ -27279,9 +27433,9 @@ "parameters": [], "results": [], "is_public": true, - "description": "Retrieve current values for configuration set by :func:`set_config`", - "docstring": "Retrieve current values for configuration set by :func:`set_config`\n\nReturns\n-------\nconfig : dict\n Keys are parameter names that can be passed to :func:`set_config`.\n\nSee Also\n--------\nconfig_context : Context manager for global scikit-learn configuration.\nset_config : Set global scikit-learn configuration.", - "source_code": "\ndef get_config():\n \"\"\"Retrieve current values for configuration set by :func:`set_config`\n\n Returns\n -------\n config : dict\n Keys are parameter names that can be passed to :func:`set_config`.\n\n See Also\n --------\n config_context : Context manager for global scikit-learn configuration.\n set_config : Set global scikit-learn configuration.\n \"\"\"\n return _get_threadlocal_config().copy()" + "description": "Retrieve current values for configuration set by :func:`set_config`.", + "docstring": "Retrieve current values for configuration set by :func:`set_config`.\n\n Returns\n -------\n config : dict\n Keys are parameter names that can be passed to :func:`set_config`.\n\n See Also\n --------\n config_context : Context manager for global scikit-learn configuration.\n set_config : Set global scikit-learn configuration.\n ", + "source_code": "\ndef get_config():\n \"\"\"Retrieve current values for configuration set by :func:`set_config`.\n\n Returns\n -------\n config : dict\n Keys are parameter names that can be passed to :func:`set_config`.\n\n See Also\n --------\n config_context : Context manager for global scikit-learn configuration.\n set_config : Set global scikit-learn configuration.\n \"\"\"\n return _get_threadlocal_config().copy()" }, { "name": "set_config", @@ -27298,7 +27452,8 @@ "docstring": { "type": "bool, default=None", "description": "If True, validation for finiteness will be skipped,\nsaving time, but leading to potential crashes. If\nFalse, validation for finiteness will be performed,\navoiding error. Global default: False.\n\n.. versionadded:: 0.19" - } + }, + "refined_type": {} }, { "name": "working_memory", @@ -27308,7 +27463,8 @@ "docstring": { "type": "int, default=None", "description": "If set, scikit-learn will attempt to limit the size of temporary arrays\nto this number of MiB (per job when parallelised), often saving both\ncomputation time and memory on expensive operations that can be\nperformed in chunks. Global default: 1024.\n\n.. versionadded:: 0.20" - } + }, + "refined_type": {} }, { "name": "print_changed_only", @@ -27318,7 +27474,8 @@ "docstring": { "type": "bool, default=None", "description": "If True, only the parameters that were set to non-default\nvalues will be printed when printing an estimator. For example,\n``print(SVC())`` while True will only print 'SVC()' while the default\nbehaviour would be to print 'SVC(C=1.0, cache_size=200, ...)' with\nall the non-changed parameters.\n\n.. 
versionadded:: 0.21" - } + }, + "refined_type": {} }, { "name": "display", @@ -27328,13 +27485,17 @@ "docstring": { "type": "{'text', 'diagram'}, default=None", "description": "If 'diagram', estimators will be displayed as a diagram in a Jupyter\nlab or notebook context. If 'text', estimators will be displayed as\ntext. Default is 'text'.\n\n.. versionadded:: 0.23" + }, + "refined_type": { + "kind": "EnumType", + "values": ["text", "diagram"] } } ], "results": [], "is_public": true, "description": "Set global scikit-learn configuration\n\n.. versionadded:: 0.19", - "docstring": "Set global scikit-learn configuration\n\n.. versionadded:: 0.19\n\nParameters\n----------\nassume_finite : bool, default=None\n If True, validation for finiteness will be skipped,\n saving time, but leading to potential crashes. If\n False, validation for finiteness will be performed,\n avoiding error. Global default: False.\n\n .. versionadded:: 0.19\n\nworking_memory : int, default=None\n If set, scikit-learn will attempt to limit the size of temporary arrays\n to this number of MiB (per job when parallelised), often saving both\n computation time and memory on expensive operations that can be\n performed in chunks. Global default: 1024.\n\n .. versionadded:: 0.20\n\nprint_changed_only : bool, default=None\n If True, only the parameters that were set to non-default\n values will be printed when printing an estimator. For example,\n ``print(SVC())`` while True will only print 'SVC()' while the default\n behaviour would be to print 'SVC(C=1.0, cache_size=200, ...)' with\n all the non-changed parameters.\n\n .. versionadded:: 0.21\n\ndisplay : {'text', 'diagram'}, default=None\n If 'diagram', estimators will be displayed as a diagram in a Jupyter\n lab or notebook context. If 'text', estimators will be displayed as\n text. Default is 'text'.\n\n .. versionadded:: 0.23\n\nSee Also\n--------\nconfig_context : Context manager for global scikit-learn configuration.\nget_config : Retrieve current values of the global configuration.", + "docstring": "Set global scikit-learn configuration\n\n .. versionadded:: 0.19\n\n Parameters\n ----------\n assume_finite : bool, default=None\n If True, validation for finiteness will be skipped,\n saving time, but leading to potential crashes. If\n False, validation for finiteness will be performed,\n avoiding error. Global default: False.\n\n .. versionadded:: 0.19\n\n working_memory : int, default=None\n If set, scikit-learn will attempt to limit the size of temporary arrays\n to this number of MiB (per job when parallelised), often saving both\n computation time and memory on expensive operations that can be\n performed in chunks. Global default: 1024.\n\n .. versionadded:: 0.20\n\n print_changed_only : bool, default=None\n If True, only the parameters that were set to non-default\n values will be printed when printing an estimator. For example,\n ``print(SVC())`` while True will only print 'SVC()' while the default\n behaviour would be to print 'SVC(C=1.0, cache_size=200, ...)' with\n all the non-changed parameters.\n\n .. versionadded:: 0.21\n\n display : {'text', 'diagram'}, default=None\n If 'diagram', estimators will be displayed as a diagram in a Jupyter\n lab or notebook context. If 'text', estimators will be displayed as\n text. Default is 'text'.\n\n .. 
versionadded:: 0.23\n\n See Also\n --------\n config_context : Context manager for global scikit-learn configuration.\n get_config : Retrieve current values of the global configuration.\n ", "source_code": "\ndef set_config(assume_finite=None, working_memory=None, print_changed_only=None, display=None):\n \"\"\"Set global scikit-learn configuration\n\n .. versionadded:: 0.19\n\n Parameters\n ----------\n assume_finite : bool, default=None\n If True, validation for finiteness will be skipped,\n saving time, but leading to potential crashes. If\n False, validation for finiteness will be performed,\n avoiding error. Global default: False.\n\n .. versionadded:: 0.19\n\n working_memory : int, default=None\n If set, scikit-learn will attempt to limit the size of temporary arrays\n to this number of MiB (per job when parallelised), often saving both\n computation time and memory on expensive operations that can be\n performed in chunks. Global default: 1024.\n\n .. versionadded:: 0.20\n\n print_changed_only : bool, default=None\n If True, only the parameters that were set to non-default\n values will be printed when printing an estimator. For example,\n ``print(SVC())`` while True will only print 'SVC()' while the default\n behaviour would be to print 'SVC(C=1.0, cache_size=200, ...)' with\n all the non-changed parameters.\n\n .. versionadded:: 0.21\n\n display : {'text', 'diagram'}, default=None\n If 'diagram', estimators will be displayed as a diagram in a Jupyter\n lab or notebook context. If 'text', estimators will be displayed as\n text. Default is 'text'.\n\n .. versionadded:: 0.23\n\n See Also\n --------\n config_context : Context manager for global scikit-learn configuration.\n get_config : Retrieve current values of the global configuration.\n \"\"\"\n local_config = _get_threadlocal_config()\n if assume_finite is not None:\n local_config['assume_finite'] = assume_finite\n if working_memory is not None:\n local_config['working_memory'] = working_memory\n if print_changed_only is not None:\n local_config['print_changed_only'] = print_changed_only\n if display is not None:\n local_config['display'] = display" }, { @@ -27352,7 +27513,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -27362,7 +27524,8 @@ "docstring": { "type": "array of shape (n_samples,)", "description": "Target values." - } + }, + "refined_type": {} }, { "name": "y_pred", @@ -27372,7 +27535,8 @@ "docstring": { "type": "array of shape (n_samples,)", "description": "Predicted mean." - } + }, + "refined_type": {} }, { "name": "weights", @@ -27382,13 +27546,17 @@ "docstring": { "type": "{int, array of shape (n_samples,)}, default=1", "description": "Weights or exposure to which variance is inverse proportional." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } } ], "results": [], "is_public": false, - "description": "Compute the deviance.\n\nThe deviance is a weighted sum of the per sample unit deviances, :math:`D = \\sum_i s_i \\cdot d(y_i, y_\\textrm{pred}_i)` with weights :math:`s_i` and unit deviance :math:`d(y,y_\\textrm{pred})`. 
In terms of the log-likelihood it is :math:`D = -2\\phi\\cdot \\left(loglike(y,y_\\textrm{pred},\\frac{phi}{s}) - loglike(y,y,\\frac{phi}{s})\\right)`.", - "docstring": "Compute the deviance.\n\nThe deviance is a weighted sum of the per sample unit deviances,\n:math:`D = \\sum_i s_i \\cdot d(y_i, y_\\textrm{pred}_i)`\nwith weights :math:`s_i` and unit deviance\n:math:`d(y,y_\\textrm{pred})`.\nIn terms of the log-likelihood it is :math:`D = -2\\phi\\cdot\n\\left(loglike(y,y_\\textrm{pred},\\frac{phi}{s})\n- loglike(y,y,\\frac{phi}{s})\\right)`.\n\nParameters\n----------\ny : array of shape (n_samples,)\n Target values.\n\ny_pred : array of shape (n_samples,)\n Predicted mean.\n\nweights : {int, array of shape (n_samples,)}, default=1\n Weights or exposure to which variance is inverse proportional.", + "description": "Compute the deviance.\n\nThe deviance is a weighted sum of the per sample unit deviances,\n:math:`D = \\sum_i s_i \\cdot d(y_i, y_\\textrm{pred}_i)`\nwith weights :math:`s_i` and unit deviance\n:math:`d(y,y_\\textrm{pred})`.\nIn terms of the log-likelihood it is :math:`D = -2\\phi\\cdot\n\\left(loglike(y,y_\\textrm{pred},\\frac{phi}{s})\n- loglike(y,y,\\frac{phi}{s})\\right)`.", + "docstring": "Compute the deviance.\n\n The deviance is a weighted sum of the per sample unit deviances,\n :math:`D = \\sum_i s_i \\cdot d(y_i, y_\\textrm{pred}_i)`\n with weights :math:`s_i` and unit deviance\n :math:`d(y,y_\\textrm{pred})`.\n In terms of the log-likelihood it is :math:`D = -2\\phi\\cdot\n \\left(loglike(y,y_\\textrm{pred},\\frac{phi}{s})\n - loglike(y,y,\\frac{phi}{s})\\right)`.\n\n Parameters\n ----------\n y : array of shape (n_samples,)\n Target values.\n\n y_pred : array of shape (n_samples,)\n Predicted mean.\n\n weights : {int, array of shape (n_samples,)}, default=1\n Weights or exposure to which variance is inverse proportional.\n ", "source_code": "\ndef deviance(self, y, y_pred, weights=1):\n \"\"\"Compute the deviance.\n\n The deviance is a weighted sum of the per sample unit deviances,\n :math:`D = \\sum_i s_i \\cdot d(y_i, y_\\textrm{pred}_i)`\n with weights :math:`s_i` and unit deviance\n :math:`d(y,y_\\textrm{pred})`.\n In terms of the log-likelihood it is :math:`D = -2\\phi\\cdot\n \\left(loglike(y,y_\\textrm{pred},\\frac{phi}{s})\n - loglike(y,y,\\frac{phi}{s})\\right)`.\n\n Parameters\n ----------\n y : array of shape (n_samples,)\n Target values.\n\n y_pred : array of shape (n_samples,)\n Predicted mean.\n\n weights : {int, array of shape (n_samples,)}, default=1\n Weights or exposure to which variance is inverse proportional.\n \"\"\"\n return np.sum(weights * self.unit_deviance(y, y_pred))" }, { @@ -27406,7 +27574,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -27416,7 +27585,8 @@ "docstring": { "type": "array, shape (n_samples,)", "description": "Target values." - } + }, + "refined_type": {} }, { "name": "y_pred", @@ -27426,7 +27596,8 @@ "docstring": { "type": "array, shape (n_samples,)", "description": "Predicted mean." - } + }, + "refined_type": {} }, { "name": "weights", @@ -27436,13 +27607,17 @@ "docstring": { "type": "{int, array of shape (n_samples,)}, default=1", "description": "Weights or exposure to which variance is inverse proportional." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } } ], "results": [], "is_public": false, - "description": "Compute the derivative of the deviance w.r.t. 
y_pred.\n\nIt gives :math:`\\frac{\\partial}{\\partial y_\\textrm{pred}} D(y, \\y_\\textrm{pred}; weights)`.", - "docstring": "Compute the derivative of the deviance w.r.t. y_pred.\n\nIt gives :math:`\\frac{\\partial}{\\partial y_\\textrm{pred}}\nD(y, \\y_\\textrm{pred}; weights)`.\n\nParameters\n----------\ny : array, shape (n_samples,)\n Target values.\n\ny_pred : array, shape (n_samples,)\n Predicted mean.\n\nweights : {int, array of shape (n_samples,)}, default=1\n Weights or exposure to which variance is inverse proportional.", + "description": "Compute the derivative of the deviance w.r.t. y_pred.\n\nIt gives :math:`\\frac{\\partial}{\\partial y_\\textrm{pred}}\nD(y, \\y_\\textrm{pred}; weights)`.", + "docstring": "Compute the derivative of the deviance w.r.t. y_pred.\n\n It gives :math:`\\frac{\\partial}{\\partial y_\\textrm{pred}}\n D(y, \\y_\\textrm{pred}; weights)`.\n\n Parameters\n ----------\n y : array, shape (n_samples,)\n Target values.\n\n y_pred : array, shape (n_samples,)\n Predicted mean.\n\n weights : {int, array of shape (n_samples,)}, default=1\n Weights or exposure to which variance is inverse proportional.\n ", "source_code": "\ndef deviance_derivative(self, y, y_pred, weights=1):\n \"\"\"Compute the derivative of the deviance w.r.t. y_pred.\n\n It gives :math:`\\frac{\\partial}{\\partial y_\\textrm{pred}}\n D(y, \\y_\\textrm{pred}; weights)`.\n\n Parameters\n ----------\n y : array, shape (n_samples,)\n Target values.\n\n y_pred : array, shape (n_samples,)\n Predicted mean.\n\n weights : {int, array of shape (n_samples,)}, default=1\n Weights or exposure to which variance is inverse proportional.\n \"\"\"\n return weights * self.unit_deviance_derivative(y, y_pred)" }, { @@ -27460,7 +27635,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -27470,13 +27646,14 @@ "docstring": { "type": "array of shape (n_samples,)", "description": "Target values." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Returns ``True`` if y is in the valid range of Y~EDM.", - "docstring": "Returns ``True`` if y is in the valid range of Y~EDM.\n\nParameters\n----------\ny : array of shape (n_samples,)\n Target values.", + "docstring": "Returns ``True`` if y is in the valid range of Y~EDM.\n\n Parameters\n ----------\n y : array of shape (n_samples,)\n Target values.\n ", "source_code": "\ndef in_y_range(self, y):\n \"\"\"Returns ``True`` if y is in the valid range of Y~EDM.\n\n Parameters\n ----------\n y : array of shape (n_samples,)\n Target values.\n \"\"\"\n if not isinstance(self._lower_bound, DistributionBoundary):\n raise TypeError('_lower_bound attribute must be of type DistributionBoundary')\n if self._lower_bound.inclusive:\n return np.greater_equal(y, self._lower_bound.value)\n else:\n return np.greater(y, self._lower_bound.value)" }, { @@ -27494,7 +27671,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -27504,7 +27682,8 @@ "docstring": { "type": "array of shape (n_samples,)", "description": "Target values." - } + }, + "refined_type": {} }, { "name": "y_pred", @@ -27514,7 +27693,8 @@ "docstring": { "type": "array of shape (n_samples,)", "description": "Predicted mean." - } + }, + "refined_type": {} }, { "name": "check_input", @@ -27524,13 +27704,14 @@ "docstring": { "type": "bool, default=False", "description": "If True raise an exception on invalid y or y_pred values, otherwise\nthey will be propagated as NaN." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Compute the unit deviance.\n\nThe unit_deviance :math:`d(y,y_\\textrm{pred})` can be defined by the log-likelihood as :math:`d(y,y_\\textrm{pred}) = -2\\phi\\cdot \\left(loglike(y,y_\\textrm{pred},\\phi) - loglike(y,y,\\phi)\\right).`", - "docstring": "Compute the unit deviance.\n\nThe unit_deviance :math:`d(y,y_\\textrm{pred})` can be defined by the\nlog-likelihood as\n:math:`d(y,y_\\textrm{pred}) = -2\\phi\\cdot\n\\left(loglike(y,y_\\textrm{pred},\\phi) - loglike(y,y,\\phi)\\right).`\n\nParameters\n----------\ny : array of shape (n_samples,)\n Target values.\n\ny_pred : array of shape (n_samples,)\n Predicted mean.\n\ncheck_input : bool, default=False\n If True raise an exception on invalid y or y_pred values, otherwise\n they will be propagated as NaN.\nReturns\n-------\ndeviance: array of shape (n_samples,)\n Computed deviance", + "description": "Compute the unit deviance.\n\nThe unit_deviance :math:`d(y,y_\\textrm{pred})` can be defined by the\nlog-likelihood as\n:math:`d(y,y_\\textrm{pred}) = -2\\phi\\cdot\n\\left(loglike(y,y_\\textrm{pred},\\phi) - loglike(y,y,\\phi)\\right).`", + "docstring": "Compute the unit deviance.\n\n The unit_deviance :math:`d(y,y_\\textrm{pred})` can be defined by the\n log-likelihood as\n :math:`d(y,y_\\textrm{pred}) = -2\\phi\\cdot\n \\left(loglike(y,y_\\textrm{pred},\\phi) - loglike(y,y,\\phi)\\right).`\n\n Parameters\n ----------\n y : array of shape (n_samples,)\n Target values.\n\n y_pred : array of shape (n_samples,)\n Predicted mean.\n\n check_input : bool, default=False\n If True raise an exception on invalid y or y_pred values, otherwise\n they will be propagated as NaN.\n Returns\n -------\n deviance: array of shape (n_samples,)\n Computed deviance\n ", "source_code": "\n@abstractmethod\ndef unit_deviance(self, y, y_pred, check_input=False):\n \"\"\"Compute the unit deviance.\n\n The unit_deviance :math:`d(y,y_\\textrm{pred})` can be defined by the\n log-likelihood as\n :math:`d(y,y_\\textrm{pred}) = -2\\phi\\cdot\n \\left(loglike(y,y_\\textrm{pred},\\phi) - loglike(y,y,\\phi)\\right).`\n\n Parameters\n ----------\n y : array of shape (n_samples,)\n Target values.\n\n y_pred : array of shape (n_samples,)\n Predicted mean.\n\n check_input : bool, default=False\n If True raise an exception on invalid y or y_pred values, otherwise\n they will be propagated as NaN.\n Returns\n -------\n deviance: array of shape (n_samples,)\n Computed deviance\n \"\"\"\n " }, { @@ -27548,7 +27729,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -27558,7 +27740,8 @@ "docstring": { "type": "array of shape (n_samples,)", "description": "Target values." - } + }, + "refined_type": {} }, { "name": "y_pred", @@ -27568,13 +27751,14 @@ "docstring": { "type": "array of shape (n_samples,)", "description": "Predicted mean." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Compute the derivative of the unit deviance w.r.t. y_pred.\n\nThe derivative of the unit deviance is given by :math:`\\frac{\\partial}{\\partialy_\\textrm{pred}}d(y,y_\\textrm{pred}) = -2\\frac{y-y_\\textrm{pred}}{v(y_\\textrm{pred})}` with unit variance :math:`v(y_\\textrm{pred})`.", - "docstring": "Compute the derivative of the unit deviance w.r.t. 
y_pred.\n\nThe derivative of the unit deviance is given by\n:math:`\\frac{\\partial}{\\partialy_\\textrm{pred}}d(y,y_\\textrm{pred})\n = -2\\frac{y-y_\\textrm{pred}}{v(y_\\textrm{pred})}`\nwith unit variance :math:`v(y_\\textrm{pred})`.\n\nParameters\n----------\ny : array of shape (n_samples,)\n Target values.\n\ny_pred : array of shape (n_samples,)\n Predicted mean.", + "description": "Compute the derivative of the unit deviance w.r.t. y_pred.\n\nThe derivative of the unit deviance is given by\n:math:`\\frac{\\partial}{\\partialy_\\textrm{pred}}d(y,y_\\textrm{pred})\n = -2\\frac{y-y_\\textrm{pred}}{v(y_\\textrm{pred})}`\nwith unit variance :math:`v(y_\\textrm{pred})`.", + "docstring": "Compute the derivative of the unit deviance w.r.t. y_pred.\n\n The derivative of the unit deviance is given by\n :math:`\\frac{\\partial}{\\partialy_\\textrm{pred}}d(y,y_\\textrm{pred})\n = -2\\frac{y-y_\\textrm{pred}}{v(y_\\textrm{pred})}`\n with unit variance :math:`v(y_\\textrm{pred})`.\n\n Parameters\n ----------\n y : array of shape (n_samples,)\n Target values.\n\n y_pred : array of shape (n_samples,)\n Predicted mean.\n ", "source_code": "\ndef unit_deviance_derivative(self, y, y_pred):\n \"\"\"Compute the derivative of the unit deviance w.r.t. y_pred.\n\n The derivative of the unit deviance is given by\n :math:`\\frac{\\partial}{\\partialy_\\textrm{pred}}d(y,y_\\textrm{pred})\n = -2\\frac{y-y_\\textrm{pred}}{v(y_\\textrm{pred})}`\n with unit variance :math:`v(y_\\textrm{pred})`.\n\n Parameters\n ----------\n y : array of shape (n_samples,)\n Target values.\n\n y_pred : array of shape (n_samples,)\n Predicted mean.\n \"\"\"\n return -2 * (y - y_pred) / self.unit_variance(y_pred)" }, { @@ -27592,7 +27776,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y_pred", @@ -27602,13 +27787,14 @@ "docstring": { "type": "array of shape (n_samples,)", "description": "Predicted mean." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Compute the unit variance function.\n\nThe unit variance :math:`v(y_\\textrm{pred})` determines the variance as a function of the mean :math:`y_\\textrm{pred}` by :math:`\\mathrm{Var}[Y_i] = \\phi/s_i*v(y_\\textrm{pred}_i)`. It can also be derived from the unit deviance :math:`d(y,y_\\textrm{pred})` as .. math:: v(y_\\textrm{pred}) = \\frac{2}{ \\frac{\\partial^2 d(y,y_\\textrm{pred})}{ \\partialy_\\textrm{pred}^2}}\\big|_{y=y_\\textrm{pred}} See also :func:`variance`.", - "docstring": "Compute the unit variance function.\n\nThe unit variance :math:`v(y_\\textrm{pred})` determines the variance as\na function of the mean :math:`y_\\textrm{pred}` by\n:math:`\\mathrm{Var}[Y_i] = \\phi/s_i*v(y_\\textrm{pred}_i)`.\nIt can also be derived from the unit deviance\n:math:`d(y,y_\\textrm{pred})` as\n\n.. math:: v(y_\\textrm{pred}) = \\frac{2}{\n \\frac{\\partial^2 d(y,y_\\textrm{pred})}{\n \\partialy_\\textrm{pred}^2}}\\big|_{y=y_\\textrm{pred}}\n\nSee also :func:`variance`.\n\nParameters\n----------\ny_pred : array of shape (n_samples,)\n Predicted mean.", + "description": "Compute the unit variance function.\n\nThe unit variance :math:`v(y_\\textrm{pred})` determines the variance as\na function of the mean :math:`y_\\textrm{pred}` by\n:math:`\\mathrm{Var}[Y_i] = \\phi/s_i*v(y_\\textrm{pred}_i)`.\nIt can also be derived from the unit deviance\n:math:`d(y,y_\\textrm{pred})` as\n\n.. 
math:: v(y_\\textrm{pred}) = \\frac{2}{\n \\frac{\\partial^2 d(y,y_\\textrm{pred})}{\n \\partialy_\\textrm{pred}^2}}\\big|_{y=y_\\textrm{pred}}\n\nSee also :func:`variance`.", + "docstring": "Compute the unit variance function.\n\n The unit variance :math:`v(y_\\textrm{pred})` determines the variance as\n a function of the mean :math:`y_\\textrm{pred}` by\n :math:`\\mathrm{Var}[Y_i] = \\phi/s_i*v(y_\\textrm{pred}_i)`.\n It can also be derived from the unit deviance\n :math:`d(y,y_\\textrm{pred})` as\n\n .. math:: v(y_\\textrm{pred}) = \\frac{2}{\n \\frac{\\partial^2 d(y,y_\\textrm{pred})}{\n \\partialy_\\textrm{pred}^2}}\\big|_{y=y_\\textrm{pred}}\n\n See also :func:`variance`.\n\n Parameters\n ----------\n y_pred : array of shape (n_samples,)\n Predicted mean.\n ", "source_code": "\n@abstractmethod\ndef unit_variance(self, y_pred):\n \"\"\"Compute the unit variance function.\n\n The unit variance :math:`v(y_\\textrm{pred})` determines the variance as\n a function of the mean :math:`y_\\textrm{pred}` by\n :math:`\\mathrm{Var}[Y_i] = \\phi/s_i*v(y_\\textrm{pred}_i)`.\n It can also be derived from the unit deviance\n :math:`d(y,y_\\textrm{pred})` as\n\n .. math:: v(y_\\textrm{pred}) = \\frac{2}{\n \\frac{\\partial^2 d(y,y_\\textrm{pred})}{\n \\partialy_\\textrm{pred}^2}}\\big|_{y=y_\\textrm{pred}}\n\n See also :func:`variance`.\n\n Parameters\n ----------\n y_pred : array of shape (n_samples,)\n Predicted mean.\n \"\"\"\n " }, { @@ -27626,13 +27812,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self):\n super().__init__(power=2)" }, { @@ -27650,13 +27837,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self):\n super().__init__(power=3)" }, { @@ -27674,13 +27862,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self):\n super().__init__(power=0)" }, { @@ -27698,13 +27887,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self):\n super().__init__(power=1)" }, { @@ -27722,7 +27912,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "power", @@ -27732,13 +27923,17 @@ "docstring": { "type": "float, default=0", "description": "The variance power of the `unit_variance`\n:math:`v(y_\\textrm{pred}) = y_\\textrm{pred}^{power}`.\nFor ``0=1.')\n elif 1 <= power < 2:\n self._lower_bound = DistributionBoundary(0, inclusive=True)\n elif power >= 2:\n self._lower_bound = DistributionBoundary(0, inclusive=False)\n else:\n raise ValueError\n self._power = power" }, { @@ -27814,7 +28012,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -27824,7 +28023,8 @@ "docstring": { "type": "array of shape (n_samples,)", "description": "Target values." - } + }, + "refined_type": {} }, { "name": "y_pred", @@ -27834,7 +28034,8 @@ "docstring": { "type": "array of shape (n_samples,)", "description": "Predicted mean." 
- } + }, + "refined_type": {} }, { "name": "check_input", @@ -27844,13 +28045,14 @@ "docstring": { "type": "bool, default=False", "description": "If True raise an exception on invalid y or y_pred values, otherwise\nthey will be propagated as NaN." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Compute the unit deviance.\n\nThe unit_deviance :math:`d(y,y_\\textrm{pred})` can be defined by the log-likelihood as :math:`d(y,y_\\textrm{pred}) = -2\\phi\\cdot \\left(loglike(y,y_\\textrm{pred},\\phi) - loglike(y,y,\\phi)\\right).`", - "docstring": "Compute the unit deviance.\n\nThe unit_deviance :math:`d(y,y_\\textrm{pred})` can be defined by the\nlog-likelihood as\n:math:`d(y,y_\\textrm{pred}) = -2\\phi\\cdot\n\\left(loglike(y,y_\\textrm{pred},\\phi) - loglike(y,y,\\phi)\\right).`\n\nParameters\n----------\ny : array of shape (n_samples,)\n Target values.\n\ny_pred : array of shape (n_samples,)\n Predicted mean.\n\ncheck_input : bool, default=False\n If True raise an exception on invalid y or y_pred values, otherwise\n they will be propagated as NaN.\nReturns\n-------\ndeviance: array of shape (n_samples,)\n Computed deviance", + "description": "Compute the unit deviance.\n\nThe unit_deviance :math:`d(y,y_\\textrm{pred})` can be defined by the\nlog-likelihood as\n:math:`d(y,y_\\textrm{pred}) = -2\\phi\\cdot\n\\left(loglike(y,y_\\textrm{pred},\\phi) - loglike(y,y,\\phi)\\right).`", + "docstring": "Compute the unit deviance.\n\n The unit_deviance :math:`d(y,y_\\textrm{pred})` can be defined by the\n log-likelihood as\n :math:`d(y,y_\\textrm{pred}) = -2\\phi\\cdot\n \\left(loglike(y,y_\\textrm{pred},\\phi) - loglike(y,y,\\phi)\\right).`\n\n Parameters\n ----------\n y : array of shape (n_samples,)\n Target values.\n\n y_pred : array of shape (n_samples,)\n Predicted mean.\n\n check_input : bool, default=False\n If True raise an exception on invalid y or y_pred values, otherwise\n they will be propagated as NaN.\n Returns\n -------\n deviance: array of shape (n_samples,)\n Computed deviance\n ", "source_code": "\ndef unit_deviance(self, y, y_pred, check_input=False):\n \"\"\"Compute the unit deviance.\n\n The unit_deviance :math:`d(y,y_\\textrm{pred})` can be defined by the\n log-likelihood as\n :math:`d(y,y_\\textrm{pred}) = -2\\phi\\cdot\n \\left(loglike(y,y_\\textrm{pred},\\phi) - loglike(y,y,\\phi)\\right).`\n\n Parameters\n ----------\n y : array of shape (n_samples,)\n Target values.\n\n y_pred : array of shape (n_samples,)\n Predicted mean.\n\n check_input : bool, default=False\n If True raise an exception on invalid y or y_pred values, otherwise\n they will be propagated as NaN.\n Returns\n -------\n deviance: array of shape (n_samples,)\n Computed deviance\n \"\"\"\n p = self.power\n if check_input:\n message = 'Mean Tweedie deviance error with power={} can only be used on '.format(p)\n if p < 0:\n if (y_pred <= 0).any():\n raise ValueError(message + 'strictly positive y_pred.')\n elif p == 0:\n pass\n elif 0 < p < 1:\n raise ValueError('Tweedie deviance is only defined for power<=0 and power>=1.')\n elif 1 <= p < 2:\n if (y < 0).any() or (y_pred <= 0).any():\n raise ValueError(message + 'non-negative y and strictly positive y_pred.')\n elif p >= 2:\n if (y <= 0).any() or (y_pred <= 0).any():\n raise ValueError(message + 'strictly positive y and y_pred.')\n else:\n raise ValueError\n if p < 0:\n dev = 2 * (np.power(np.maximum(y, 0), 2 - p) / ((1 - p) * (2 - p)) - y * np.power(y_pred, 1 - p) / (1 - p) + np.power(y_pred, 2 - p) / (2 - p))\n elif p == 
0:\n dev = (y - y_pred)**2\n elif p < 1:\n raise ValueError('Tweedie deviance is only defined for power<=0 and power>=1.')\n elif p == 1:\n dev = 2 * (xlogy(y, y / y_pred) - y + y_pred)\n elif p == 2:\n dev = 2 * (np.log(y_pred / y) + y / y_pred - 1)\n else:\n dev = 2 * (np.power(y, 2 - p) / ((1 - p) * (2 - p)) - y * np.power(y_pred, 1 - p) / (1 - p) + np.power(y_pred, 2 - p) / (2 - p))\n return dev" }, { @@ -27868,7 +28070,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y_pred", @@ -27878,13 +28081,14 @@ "docstring": { "type": "array of shape (n_samples,)", "description": "Predicted mean." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Compute the unit variance of a Tweedie distribution v(y_ extrm{pred})=y_ extrm{pred}**power.", - "docstring": "Compute the unit variance of a Tweedie distribution\nv(y_ extrm{pred})=y_ extrm{pred}**power.\n\nParameters\n----------\ny_pred : array of shape (n_samples,)\n Predicted mean.", + "description": "Compute the unit variance of a Tweedie distribution\nv(y_ extrm{pred})=y_ extrm{pred}**power.", + "docstring": "Compute the unit variance of a Tweedie distribution\n v(y_\textrm{pred})=y_\textrm{pred}**power.\n\n Parameters\n ----------\n y_pred : array of shape (n_samples,)\n Predicted mean.\n ", "source_code": "\ndef unit_variance(self, y_pred):\n \"\"\"Compute the unit variance of a Tweedie distribution\n v(y_\textrm{pred})=y_\textrm{pred}**power.\n\n Parameters\n ----------\n y_pred : array of shape (n_samples,)\n Predicted mean.\n \"\"\"\n return np.power(y_pred, self.power)" }, { @@ -27902,13 +28106,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __getstate__(self):\n try:\n state = super().__getstate__()\n except AttributeError:\n state = self.__dict__.copy()\n if type(self).__module__.startswith('sklearn.'):\n return dict(state.items(), _sklearn_version=__version__)\n else:\n return state" }, { @@ -27926,7 +28131,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "N_CHAR_MAX", @@ -27936,13 +28142,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __repr__(self, N_CHAR_MAX=700):\n from .utils._pprint import _EstimatorPrettyPrinter\n N_MAX_ELEMENTS_TO_SHOW = 30\n pp = _EstimatorPrettyPrinter(compact=True, indent=1, indent_at_name=True, n_max_elements_to_show=N_MAX_ELEMENTS_TO_SHOW)\n repr_ = pp.pformat(self)\n n_nonblank = len(''.join(repr_.split()))\n if n_nonblank > N_CHAR_MAX:\n lim = N_CHAR_MAX // 2\n regex = '^(\\\\s*\\\\S){%d}' % lim\n left_lim = re.match(regex, repr_).end()\n right_lim = re.match(regex, repr_[::-1]).end()\n if '\\n' in repr_[left_lim:-right_lim]:\n regex += '[^\\\\n]*\\\\n'\n right_lim = re.match(regex, repr_[::-1]).end()\n ellipsis = '...'\n if left_lim + len(ellipsis) < len(repr_) - right_lim:\n repr_ = repr_[:left_lim] + '...' 
+ repr_[-right_lim:]\n return repr_" }, { @@ -27960,7 +28167,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "state", @@ -27970,13 +28178,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __setstate__(self, state):\n if type(self).__module__.startswith('sklearn.'):\n pickle_version = state.pop('_sklearn_version', 'pre-0.18')\n if pickle_version != __version__:\n warnings.warn('Trying to unpickle estimator {0} from version {1} when using version {2}. This might lead to breaking code or invalid results. Use at your own risk. For more info please refer to:\\nhttps://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations'.format(self.__class__.__name__, pickle_version, __version__), UserWarning)\n try:\n super().__setstate__(state)\n except AttributeError:\n self.__dict__.update(state)" }, { @@ -27994,7 +28203,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -28004,6 +28214,10 @@ "docstring": { "type": "{ndarray, dataframe} of shape (n_samples, n_features)", "description": "The input samples." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -28014,13 +28228,14 @@ "docstring": { "type": "bool", "description": "Whether to reset the `feature_names_in_` attribute.\nIf False, the input will be checked for consistency with\nfeature names of data provided when reset was last True.\n.. note::\n It is recommended to call `reset=True` in `fit` and in the first\n call to `partial_fit`. All other methods that validate `X`\n should set `reset=False`." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Set or check the `feature_names_in_` attribute.\n\n.. versionadded:: 1.0", - "docstring": "Set or check the `feature_names_in_` attribute.\n\n.. versionadded:: 1.0\n\nParameters\n----------\nX : {ndarray, dataframe} of shape (n_samples, n_features)\n The input samples.\n\nreset : bool\n Whether to reset the `feature_names_in_` attribute.\n If False, the input will be checked for consistency with\n feature names of data provided when reset was last True.\n .. note::\n It is recommended to call `reset=True` in `fit` and in the first\n call to `partial_fit`. All other methods that validate `X`\n should set `reset=False`.", + "docstring": "Set or check the `feature_names_in_` attribute.\n\n .. versionadded:: 1.0\n\n Parameters\n ----------\n X : {ndarray, dataframe} of shape (n_samples, n_features)\n The input samples.\n\n reset : bool\n Whether to reset the `feature_names_in_` attribute.\n If False, the input will be checked for consistency with\n feature names of data provided when reset was last True.\n .. note::\n It is recommended to call `reset=True` in `fit` and in the first\n call to `partial_fit`. All other methods that validate `X`\n should set `reset=False`.\n ", "source_code": "\ndef _check_feature_names(self, X, *, reset):\n \"\"\"Set or check the `feature_names_in_` attribute.\n\n .. versionadded:: 1.0\n\n Parameters\n ----------\n X : {ndarray, dataframe} of shape (n_samples, n_features)\n The input samples.\n\n reset : bool\n Whether to reset the `feature_names_in_` attribute.\n If False, the input will be checked for consistency with\n feature names of data provided when reset was last True.\n .. 
note::\n It is recommended to call `reset=True` in `fit` and in the first\n call to `partial_fit`. All other methods that validate `X`\n should set `reset=False`.\n \"\"\"\n if reset:\n feature_names_in = _get_feature_names(X)\n if feature_names_in is not None:\n self.feature_names_in_ = feature_names_in\n elif hasattr(self, 'feature_names_in_'):\n delattr(self, 'feature_names_in_')\n return\n fitted_feature_names = getattr(self, 'feature_names_in_', None)\n X_feature_names = _get_feature_names(X)\n if fitted_feature_names is None and X_feature_names is None:\n return\n if X_feature_names is not None and fitted_feature_names is None:\n warnings.warn(f'X has feature names, but {self.__class__.__name__} was fitted without feature names')\n return\n if X_feature_names is None and fitted_feature_names is not None:\n warnings.warn(f'X does not have valid feature names, but {self.__class__.__name__} was fitted with feature names')\n return\n if len(fitted_feature_names) != len(X_feature_names) or np.any(fitted_feature_names != X_feature_names):\n message = 'The feature names should match those that were passed during fit. Starting version 1.2, an error will be raised.\\n'\n fitted_feature_names_set = set(fitted_feature_names)\n X_feature_names_set = set(X_feature_names)\n unexpected_names = sorted(X_feature_names_set - fitted_feature_names_set)\n missing_names = sorted(fitted_feature_names_set - X_feature_names_set)\n \n def add_names(names):\n output = ''\n max_n_names = 5\n for (i, name) in enumerate(names):\n if i >= max_n_names:\n output += '- ...\\n'\n break\n output += f'- {name}\\n'\n return output\n if unexpected_names:\n message += 'Feature names unseen at fit time:\\n'\n message += add_names(unexpected_names)\n if missing_names:\n message += 'Feature names seen at fit time, yet now missing:\\n'\n message += add_names(missing_names)\n if not missing_names and not missing_names:\n message += 'Feature names must be in the same order as they were in fit.\\n'\n warnings.warn(message, FutureWarning)" }, { @@ -28038,7 +28253,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -28048,6 +28264,10 @@ "docstring": { "type": "{ndarray, sparse matrix} of shape (n_samples, n_features)", "description": "The input samples." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -28058,13 +28278,14 @@ "docstring": { "type": "bool", "description": "If True, the `n_features_in_` attribute is set to `X.shape[1]`.\nIf False and the attribute exists, then check that it is equal to\n`X.shape[1]`. If False and the attribute does *not* exist, then\nthe check is skipped.\n.. note::\n It is recommended to call reset=True in `fit` and in the first\n call to `partial_fit`. All other methods that validate `X`\n should set `reset=False`." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Set the `n_features_in_` attribute, or check against it.", - "docstring": "Set the `n_features_in_` attribute, or check against it.\n\nParameters\n----------\nX : {ndarray, sparse matrix} of shape (n_samples, n_features)\n The input samples.\nreset : bool\n If True, the `n_features_in_` attribute is set to `X.shape[1]`.\n If False and the attribute exists, then check that it is equal to\n `X.shape[1]`. If False and the attribute does *not* exist, then\n the check is skipped.\n .. note::\n It is recommended to call reset=True in `fit` and in the first\n call to `partial_fit`. 
All other methods that validate `X`\n should set `reset=False`.", + "docstring": "Set the `n_features_in_` attribute, or check against it.\n\n Parameters\n ----------\n X : {ndarray, sparse matrix} of shape (n_samples, n_features)\n The input samples.\n reset : bool\n If True, the `n_features_in_` attribute is set to `X.shape[1]`.\n If False and the attribute exists, then check that it is equal to\n `X.shape[1]`. If False and the attribute does *not* exist, then\n the check is skipped.\n .. note::\n It is recommended to call reset=True in `fit` and in the first\n call to `partial_fit`. All other methods that validate `X`\n should set `reset=False`.\n ", "source_code": "\ndef _check_n_features(self, X, reset):\n \"\"\"Set the `n_features_in_` attribute, or check against it.\n\n Parameters\n ----------\n X : {ndarray, sparse matrix} of shape (n_samples, n_features)\n The input samples.\n reset : bool\n If True, the `n_features_in_` attribute is set to `X.shape[1]`.\n If False and the attribute exists, then check that it is equal to\n `X.shape[1]`. If False and the attribute does *not* exist, then\n the check is skipped.\n .. note::\n It is recommended to call reset=True in `fit` and in the first\n call to `partial_fit`. All other methods that validate `X`\n should set `reset=False`.\n \"\"\"\n try:\n n_features = _num_features(X)\n except TypeError as e:\n if not reset and hasattr(self, 'n_features_in_'):\n raise ValueError(f'X does not contain any features, but {self.__class__.__name__} is expecting {self.n_features_in_} features') from e\n return\n if reset:\n self.n_features_in_ = n_features\n return\n if not hasattr(self, 'n_features_in_'):\n return\n if n_features != self.n_features_in_:\n raise ValueError(f'X has {n_features} features, but {self.__class__.__name__} is expecting {self.n_features_in_} features as input.')" }, { @@ -28082,7 +28303,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -28106,13 +28328,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _get_tags(self):\n collected_tags = {}\n for base_class in reversed(inspect.getmro(self.__class__)):\n if hasattr(base_class, '_more_tags'):\n more_tags = base_class._more_tags(self)\n collected_tags.update(more_tags)\n return collected_tags" }, { @@ -28130,13 +28353,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _more_tags(self):\n return _DEFAULT_TAGS" }, { @@ -28154,13 +28378,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "HTML representation of estimator.\n\nThis is redundant with the logic of `_repr_mimebundle_`. The latter should be favorted in the long term, `_repr_html_` is only implemented for consumers who do not interpret `_repr_mimbundle_`.", - "docstring": "HTML representation of estimator.\n\nThis is redundant with the logic of `_repr_mimebundle_`. The latter\nshould be favorted in the long term, `_repr_html_` is only\nimplemented for consumers who do not interpret `_repr_mimbundle_`.", + "description": "HTML representation of estimator.\n\nThis is redundant with the logic of `_repr_mimebundle_`. 
The latter\nshould be favorted in the long term, `_repr_html_` is only\nimplemented for consumers who do not interpret `_repr_mimbundle_`.", + "docstring": "HTML representation of estimator.\n\n This is redundant with the logic of `_repr_mimebundle_`. The latter\n should be favorted in the long term, `_repr_html_` is only\n implemented for consumers who do not interpret `_repr_mimbundle_`.\n ", "source_code": "\n@property\ndef _repr_html_(self):\n \"\"\"HTML representation of estimator.\n\n This is redundant with the logic of `_repr_mimebundle_`. The latter\n should be favorted in the long term, `_repr_html_` is only\n implemented for consumers who do not interpret `_repr_mimbundle_`.\n \"\"\"\n if get_config()['display'] != 'diagram':\n raise AttributeError(\"_repr_html_ is only defined when the 'display' configuration option is set to 'diagram'\")\n return self._repr_html_inner" }, { @@ -28178,13 +28403,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "This function is returned by the @property `_repr_html_` to make `hasattr(estimator, \"_repr_html_\") return `True` or `False` depending on `get_config()[\"display\"]`.", - "docstring": "This function is returned by the @property `_repr_html_` to make\n`hasattr(estimator, \"_repr_html_\") return `True` or `False` depending\non `get_config()[\"display\"]`.", + "description": "This function is returned by the @property `_repr_html_` to make\n`hasattr(estimator, \"_repr_html_\") return `True` or `False` depending\non `get_config()[\"display\"]`.", + "docstring": "This function is returned by the @property `_repr_html_` to make\n `hasattr(estimator, \"_repr_html_\") return `True` or `False` depending\n on `get_config()[\"display\"]`.\n ", "source_code": "\ndef _repr_html_inner(self):\n \"\"\"This function is returned by the @property `_repr_html_` to make\n `hasattr(estimator, \"_repr_html_\") return `True` or `False` depending\n on `get_config()[\"display\"]`.\n \"\"\"\n return estimator_html_repr(self)" }, { @@ -28202,7 +28428,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -28226,7 +28453,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -28236,6 +28464,10 @@ "docstring": { "type": "{array-like, sparse matrix, dataframe} of shape (n_samples, n_features), default='no validation'", "description": "The input samples.\nIf `'no_validation'`, no validation is performed on `X`. This is\nuseful for meta-estimator which can delegate input validation to\ntheir underlying estimator(s). In that case `y` must be passed and\nthe only accepted `check_params` are `multi_output` and\n`y_numeric`." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -28246,7 +28478,8 @@ "docstring": { "type": "array-like of shape (n_samples,), default='no_validation'", "description": "The targets.\n\n- If `None`, `check_array` is called on `X`. If the estimator's\n requires_y tag is True, then an error will be raised.\n- If `'no_validation'`, `check_array` is called on `X` and the\n estimator's requires_y tag is ignored. This is a default\n placeholder and is never meant to be explicitly set. In that case\n `X` must be passed.\n- Otherwise, only `y` with `_check_y` or both `X` and `y` are\n checked with either `check_array` or `check_X_y` depending on\n `validate_separately`." 
- } + }, + "refined_type": {} }, { "name": "reset", @@ -28256,7 +28489,8 @@ "docstring": { "type": "bool, default=True", "description": "Whether to reset the `n_features_in_` attribute.\nIf False, the input will be checked for consistency with data\nprovided when reset was last True.\n.. note::\n It is recommended to call reset=True in `fit` and in the first\n call to `partial_fit`. All other methods that validate `X`\n should set `reset=False`." - } + }, + "refined_type": {} }, { "name": "validate_separately", @@ -28266,13 +28500,14 @@ "docstring": { "type": "False or tuple of dicts, default=False", "description": "Only used if y is not None.\nIf False, call validate_X_y(). Else, it must be a tuple of kwargs\nto be used for calling check_array() on X and y respectively." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Validate input data and set or check the `n_features_in_` attribute.", - "docstring": "Validate input data and set or check the `n_features_in_` attribute.\n\nParameters\n----------\nX : {array-like, sparse matrix, dataframe} of shape (n_samples, n_features), default='no validation'\n The input samples.\n If `'no_validation'`, no validation is performed on `X`. This is\n useful for meta-estimator which can delegate input validation to\n their underlying estimator(s). In that case `y` must be passed and\n the only accepted `check_params` are `multi_output` and\n `y_numeric`.\n\ny : array-like of shape (n_samples,), default='no_validation'\n The targets.\n\n - If `None`, `check_array` is called on `X`. If the estimator's\n requires_y tag is True, then an error will be raised.\n - If `'no_validation'`, `check_array` is called on `X` and the\n estimator's requires_y tag is ignored. This is a default\n placeholder and is never meant to be explicitly set. In that case\n `X` must be passed.\n - Otherwise, only `y` with `_check_y` or both `X` and `y` are\n checked with either `check_array` or `check_X_y` depending on\n `validate_separately`.\n\nreset : bool, default=True\n Whether to reset the `n_features_in_` attribute.\n If False, the input will be checked for consistency with data\n provided when reset was last True.\n .. note::\n It is recommended to call reset=True in `fit` and in the first\n call to `partial_fit`. All other methods that validate `X`\n should set `reset=False`.\nvalidate_separately : False or tuple of dicts, default=False\n Only used if y is not None.\n If False, call validate_X_y(). Else, it must be a tuple of kwargs\n to be used for calling check_array() on X and y respectively.\n**check_params : kwargs\n Parameters passed to :func:`sklearn.utils.check_array` or\n :func:`sklearn.utils.check_X_y`. Ignored if validate_separately\n is not False.\n\nReturns\n-------\nout : {ndarray, sparse matrix} or tuple of these\n The validated input. A tuple is returned if both `X` and `y` are\n validated.", + "docstring": "Validate input data and set or check the `n_features_in_` attribute.\n\n Parameters\n ----------\n X : {array-like, sparse matrix, dataframe} of shape (n_samples, n_features), default='no validation'\n The input samples.\n If `'no_validation'`, no validation is performed on `X`. This is\n useful for meta-estimator which can delegate input validation to\n their underlying estimator(s). In that case `y` must be passed and\n the only accepted `check_params` are `multi_output` and\n `y_numeric`.\n\n y : array-like of shape (n_samples,), default='no_validation'\n The targets.\n\n - If `None`, `check_array` is called on `X`. 
If the estimator's\n requires_y tag is True, then an error will be raised.\n - If `'no_validation'`, `check_array` is called on `X` and the\n estimator's requires_y tag is ignored. This is a default\n placeholder and is never meant to be explicitly set. In that case\n `X` must be passed.\n - Otherwise, only `y` with `_check_y` or both `X` and `y` are\n checked with either `check_array` or `check_X_y` depending on\n `validate_separately`.\n\n reset : bool, default=True\n Whether to reset the `n_features_in_` attribute.\n If False, the input will be checked for consistency with data\n provided when reset was last True.\n .. note::\n It is recommended to call reset=True in `fit` and in the first\n call to `partial_fit`. All other methods that validate `X`\n should set `reset=False`.\n validate_separately : False or tuple of dicts, default=False\n Only used if y is not None.\n If False, call validate_X_y(). Else, it must be a tuple of kwargs\n to be used for calling check_array() on X and y respectively.\n **check_params : kwargs\n Parameters passed to :func:`sklearn.utils.check_array` or\n :func:`sklearn.utils.check_X_y`. Ignored if validate_separately\n is not False.\n\n Returns\n -------\n out : {ndarray, sparse matrix} or tuple of these\n The validated input. A tuple is returned if both `X` and `y` are\n validated.\n ", "source_code": "\ndef _validate_data(self, X='no_validation', y='no_validation', reset=True, validate_separately=False, **check_params):\n \"\"\"Validate input data and set or check the `n_features_in_` attribute.\n\n Parameters\n ----------\n X : {array-like, sparse matrix, dataframe} of shape (n_samples, n_features), default='no validation'\n The input samples.\n If `'no_validation'`, no validation is performed on `X`. This is\n useful for meta-estimator which can delegate input validation to\n their underlying estimator(s). In that case `y` must be passed and\n the only accepted `check_params` are `multi_output` and\n `y_numeric`.\n\n y : array-like of shape (n_samples,), default='no_validation'\n The targets.\n\n - If `None`, `check_array` is called on `X`. If the estimator's\n requires_y tag is True, then an error will be raised.\n - If `'no_validation'`, `check_array` is called on `X` and the\n estimator's requires_y tag is ignored. This is a default\n placeholder and is never meant to be explicitly set. In that case\n `X` must be passed.\n - Otherwise, only `y` with `_check_y` or both `X` and `y` are\n checked with either `check_array` or `check_X_y` depending on\n `validate_separately`.\n\n reset : bool, default=True\n Whether to reset the `n_features_in_` attribute.\n If False, the input will be checked for consistency with data\n provided when reset was last True.\n .. note::\n It is recommended to call reset=True in `fit` and in the first\n call to `partial_fit`. All other methods that validate `X`\n should set `reset=False`.\n validate_separately : False or tuple of dicts, default=False\n Only used if y is not None.\n If False, call validate_X_y(). Else, it must be a tuple of kwargs\n to be used for calling check_array() on X and y respectively.\n **check_params : kwargs\n Parameters passed to :func:`sklearn.utils.check_array` or\n :func:`sklearn.utils.check_X_y`. Ignored if validate_separately\n is not False.\n\n Returns\n -------\n out : {ndarray, sparse matrix} or tuple of these\n The validated input. 
A tuple is returned if both `X` and `y` are\n validated.\n \"\"\"\n self._check_feature_names(X, reset=reset)\n if y is None and self._get_tags()['requires_y']:\n raise ValueError(f'This {self.__class__.__name__} estimator requires y to be passed, but the target y is None.')\n no_val_X = isinstance(X, str) and X == 'no_validation'\n no_val_y = y is None or isinstance(y, str) and y == 'no_validation'\n if no_val_X and no_val_y:\n raise ValueError('Validation should be done on X, y or both.')\n elif not no_val_X and no_val_y:\n X = check_array(X, **check_params)\n out = X\n elif no_val_X and not no_val_y:\n y = _check_y(y, **check_params)\n out = y\n else:\n if validate_separately:\n (check_X_params, check_y_params) = validate_separately\n X = check_array(X, **check_X_params)\n y = check_array(y, **check_y_params)\n else:\n (X, y) = check_X_y(X, y, **check_params)\n out = (X, y)\n if not no_val_X and check_params.get('ensure_2d', True):\n self._check_n_features(X, reset=reset)\n return out" }, { @@ -28290,7 +28525,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "deep", @@ -28300,13 +28536,14 @@ "docstring": { "type": "bool, default=True", "description": "If True, will return the parameters for this estimator and\ncontained subobjects that are estimators." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Get parameters for this estimator.", - "docstring": "Get parameters for this estimator.\n\nParameters\n----------\ndeep : bool, default=True\n If True, will return the parameters for this estimator and\n contained subobjects that are estimators.\n\nReturns\n-------\nparams : dict\n Parameter names mapped to their values.", + "docstring": "\n Get parameters for this estimator.\n\n Parameters\n ----------\n deep : bool, default=True\n If True, will return the parameters for this estimator and\n contained subobjects that are estimators.\n\n Returns\n -------\n params : dict\n Parameter names mapped to their values.\n ", "source_code": "\ndef get_params(self, deep=True):\n \"\"\"\n Get parameters for this estimator.\n\n Parameters\n ----------\n deep : bool, default=True\n If True, will return the parameters for this estimator and\n contained subobjects that are estimators.\n\n Returns\n -------\n params : dict\n Parameter names mapped to their values.\n \"\"\"\n out = dict()\n for key in self._get_param_names():\n value = getattr(self, key)\n if deep and hasattr(value, 'get_params'):\n deep_items = value.get_params().items()\n out.update(((key + '__' + k, val) for (k, val) in deep_items))\n out[key] = value\n return out" }, { @@ -28324,13 +28561,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Set the parameters of this estimator.\n\nThe method works on simple estimators as well as on nested objects (such as :class:`~sklearn.pipeline.Pipeline`). The latter have parameters of the form ``__`` so that it's possible to update each component of a nested object.", - "docstring": "Set the parameters of this estimator.\n\nThe method works on simple estimators as well as on nested objects\n(such as :class:`~sklearn.pipeline.Pipeline`). 
The latter have\nparameters of the form ``__`` so that it's\npossible to update each component of a nested object.\n\nParameters\n----------\n**params : dict\n Estimator parameters.\n\nReturns\n-------\nself : estimator instance\n Estimator instance.", + "description": "Set the parameters of this estimator.\n\nThe method works on simple estimators as well as on nested objects\n(such as :class:`~sklearn.pipeline.Pipeline`). The latter have\nparameters of the form ``__`` so that it's\npossible to update each component of a nested object.", + "docstring": "\n Set the parameters of this estimator.\n\n The method works on simple estimators as well as on nested objects\n (such as :class:`~sklearn.pipeline.Pipeline`). The latter have\n parameters of the form ``__`` so that it's\n possible to update each component of a nested object.\n\n Parameters\n ----------\n **params : dict\n Estimator parameters.\n\n Returns\n -------\n self : estimator instance\n Estimator instance.\n ", "source_code": "\ndef set_params(self, **params):\n \"\"\"\n Set the parameters of this estimator.\n\n The method works on simple estimators as well as on nested objects\n (such as :class:`~sklearn.pipeline.Pipeline`). The latter have\n parameters of the form ``__`` so that it's\n possible to update each component of a nested object.\n\n Parameters\n ----------\n **params : dict\n Estimator parameters.\n\n Returns\n -------\n self : estimator instance\n Estimator instance.\n \"\"\"\n if not params:\n return self\n valid_params = self.get_params(deep=True)\n nested_params = defaultdict(dict)\n for (key, value) in params.items():\n (key, delim, sub_key) = key.partition('__')\n if key not in valid_params:\n raise ValueError('Invalid parameter %s for estimator %s. Check the list of available parameters with `estimator.get_params().keys()`.' % (key, self))\n if delim:\n nested_params[key][sub_key] = value\n else:\n setattr(self, key, value)\n valid_params[key] = value\n for (key, sub_params) in nested_params.items():\n valid_params[key].set_params(**sub_params)\n return self" }, { @@ -28348,13 +28586,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Convenient way to get row and column indicators together.\n\nReturns the ``rows_`` and ``columns_`` members.", - "docstring": "Convenient way to get row and column indicators together.\n\nReturns the ``rows_`` and ``columns_`` members.", + "docstring": "Convenient way to get row and column indicators together.\n\n Returns the ``rows_`` and ``columns_`` members.\n ", "source_code": "\n@property\ndef biclusters_(self):\n \"\"\"Convenient way to get row and column indicators together.\n\n Returns the ``rows_`` and ``columns_`` members.\n \"\"\"\n return self.rows_, self.columns_" }, { @@ -28372,7 +28611,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "i", @@ -28382,13 +28622,14 @@ "docstring": { "type": "int", "description": "The index of the cluster." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Row and column indices of the `i`'th bicluster.\n\nOnly works if ``rows_`` and ``columns_`` attributes exist.", - "docstring": "Row and column indices of the `i`'th bicluster.\n\nOnly works if ``rows_`` and ``columns_`` attributes exist.\n\nParameters\n----------\ni : int\n The index of the cluster.\n\nReturns\n-------\nrow_ind : ndarray, dtype=np.intp\n Indices of rows in the dataset that belong to the bicluster.\ncol_ind : ndarray, dtype=np.intp\n Indices of columns in the dataset that belong to the bicluster.", + "docstring": "Row and column indices of the `i`'th bicluster.\n\n Only works if ``rows_`` and ``columns_`` attributes exist.\n\n Parameters\n ----------\n i : int\n The index of the cluster.\n\n Returns\n -------\n row_ind : ndarray, dtype=np.intp\n Indices of rows in the dataset that belong to the bicluster.\n col_ind : ndarray, dtype=np.intp\n Indices of columns in the dataset that belong to the bicluster.\n ", "source_code": "\ndef get_indices(self, i):\n \"\"\"Row and column indices of the `i`'th bicluster.\n\n Only works if ``rows_`` and ``columns_`` attributes exist.\n\n Parameters\n ----------\n i : int\n The index of the cluster.\n\n Returns\n -------\n row_ind : ndarray, dtype=np.intp\n Indices of rows in the dataset that belong to the bicluster.\n col_ind : ndarray, dtype=np.intp\n Indices of columns in the dataset that belong to the bicluster.\n \"\"\"\n rows = self.rows_[i]\n columns = self.columns_[i]\n return np.nonzero(rows)[0], np.nonzero(columns)[0]" }, { @@ -28406,7 +28647,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "i", @@ -28416,13 +28658,14 @@ "docstring": { "type": "int", "description": "The index of the cluster." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Shape of the `i`'th bicluster.", - "docstring": "Shape of the `i`'th bicluster.\n\nParameters\n----------\ni : int\n The index of the cluster.\n\nReturns\n-------\nn_rows : int\n Number of rows in the bicluster.\n\nn_cols : int\n Number of columns in the bicluster.", + "docstring": "Shape of the `i`'th bicluster.\n\n Parameters\n ----------\n i : int\n The index of the cluster.\n\n Returns\n -------\n n_rows : int\n Number of rows in the bicluster.\n\n n_cols : int\n Number of columns in the bicluster.\n ", "source_code": "\ndef get_shape(self, i):\n \"\"\"Shape of the `i`'th bicluster.\n\n Parameters\n ----------\n i : int\n The index of the cluster.\n\n Returns\n -------\n n_rows : int\n Number of rows in the bicluster.\n\n n_cols : int\n Number of columns in the bicluster.\n \"\"\"\n indices = self.get_indices(i)\n return tuple((len(i) for i in indices))" }, { @@ -28440,7 +28683,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "i", @@ -28450,7 +28694,8 @@ "docstring": { "type": "int", "description": "The index of the cluster." - } + }, + "refined_type": {} }, { "name": "data", @@ -28460,13 +28705,14 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "The data." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Return the submatrix corresponding to bicluster `i`.", - "docstring": "Return the submatrix corresponding to bicluster `i`.\n\nParameters\n----------\ni : int\n The index of the cluster.\ndata : array-like of shape (n_samples, n_features)\n The data.\n\nReturns\n-------\nsubmatrix : ndarray of shape (n_rows, n_cols)\n The submatrix corresponding to bicluster `i`.\n\nNotes\n-----\nWorks with sparse matrices. Only works if ``rows_`` and\n``columns_`` attributes exist.", + "docstring": "Return the submatrix corresponding to bicluster `i`.\n\n Parameters\n ----------\n i : int\n The index of the cluster.\n data : array-like of shape (n_samples, n_features)\n The data.\n\n Returns\n -------\n submatrix : ndarray of shape (n_rows, n_cols)\n The submatrix corresponding to bicluster `i`.\n\n Notes\n -----\n Works with sparse matrices. Only works if ``rows_`` and\n ``columns_`` attributes exist.\n ", "source_code": "\ndef get_submatrix(self, i, data):\n \"\"\"Return the submatrix corresponding to bicluster `i`.\n\n Parameters\n ----------\n i : int\n The index of the cluster.\n data : array-like of shape (n_samples, n_features)\n The data.\n\n Returns\n -------\n submatrix : ndarray of shape (n_rows, n_cols)\n The submatrix corresponding to bicluster `i`.\n\n Notes\n -----\n Works with sparse matrices. Only works if ``rows_`` and\n ``columns_`` attributes exist.\n \"\"\"\n from .utils.validation import check_array\n data = check_array(data, accept_sparse='csr')\n (row_ind, col_ind) = self.get_indices(i)\n return data[row_ind[:, np.newaxis], col_ind]" }, { @@ -28484,13 +28730,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _more_tags(self):\n return {'requires_y': True}" }, { @@ -28508,7 +28755,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -28518,7 +28766,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "Test samples." - } + }, + "refined_type": {} }, { "name": "y", @@ -28528,7 +28777,8 @@ "docstring": { "type": "array-like of shape (n_samples,) or (n_samples, n_outputs)", "description": "True labels for `X`." - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -28538,13 +28788,14 @@ "docstring": { "type": "array-like of shape (n_samples,), default=None", "description": "Sample weights." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Return the mean accuracy on the given test data and labels.\n\nIn multi-label classification, this is the subset accuracy which is a harsh metric since you require for each sample that each label set be correctly predicted.", - "docstring": "Return the mean accuracy on the given test data and labels.\n\nIn multi-label classification, this is the subset accuracy\nwhich is a harsh metric since you require for each sample that\neach label set be correctly predicted.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n Test samples.\n\ny : array-like of shape (n_samples,) or (n_samples, n_outputs)\n True labels for `X`.\n\nsample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\nReturns\n-------\nscore : float\n Mean accuracy of ``self.predict(X)`` wrt. 
`y`.", + "description": "Return the mean accuracy on the given test data and labels.\n\nIn multi-label classification, this is the subset accuracy\nwhich is a harsh metric since you require for each sample that\neach label set be correctly predicted.", + "docstring": "\n Return the mean accuracy on the given test data and labels.\n\n In multi-label classification, this is the subset accuracy\n which is a harsh metric since you require for each sample that\n each label set be correctly predicted.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Test samples.\n\n y : array-like of shape (n_samples,) or (n_samples, n_outputs)\n True labels for `X`.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n Returns\n -------\n score : float\n Mean accuracy of ``self.predict(X)`` wrt. `y`.\n ", "source_code": "\ndef score(self, X, y, sample_weight=None):\n \"\"\"\n Return the mean accuracy on the given test data and labels.\n\n In multi-label classification, this is the subset accuracy\n which is a harsh metric since you require for each sample that\n each label set be correctly predicted.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Test samples.\n\n y : array-like of shape (n_samples,) or (n_samples, n_outputs)\n True labels for `X`.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n Returns\n -------\n score : float\n Mean accuracy of ``self.predict(X)`` wrt. `y`.\n \"\"\"\n from .metrics import accuracy_score\n return accuracy_score(y, self.predict(X), sample_weight=sample_weight)" }, { @@ -28562,13 +28813,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _more_tags(self):\n return {'preserves_dtype': []}" }, { @@ -28586,7 +28838,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -28596,7 +28849,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "Input data." - } + }, + "refined_type": {} }, { "name": "y", @@ -28606,13 +28860,14 @@ "docstring": { "type": "Ignored", "description": "Not used, present for API consistency by convention." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Perform clustering on `X` and returns cluster labels.", - "docstring": "Perform clustering on `X` and returns cluster labels.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n Input data.\n\ny : Ignored\n Not used, present for API consistency by convention.\n\nReturns\n-------\nlabels : ndarray of shape (n_samples,), dtype=np.int64\n Cluster labels.", + "docstring": "\n Perform clustering on `X` and returns cluster labels.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Input data.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n Returns\n -------\n labels : ndarray of shape (n_samples,), dtype=np.int64\n Cluster labels.\n ", "source_code": "\ndef fit_predict(self, X, y=None):\n \"\"\"\n Perform clustering on `X` and returns cluster labels.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Input data.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n Returns\n -------\n labels : ndarray of shape (n_samples,), dtype=np.int64\n Cluster labels.\n \"\"\"\n self.fit(X)\n return self.labels_" }, { @@ -28630,7 +28885,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -28640,7 +28896,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "Test samples." - } + }, + "refined_type": {} }, { "name": "y", @@ -28650,13 +28907,14 @@ "docstring": { "type": "Ignored", "description": "Not used, present for API consistency by convention." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Return the score of the model on the data `X`.", - "docstring": "Return the score of the model on the data `X`.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n Test samples.\n\ny : Ignored\n Not used, present for API consistency by convention.\n\nReturns\n-------\nscore : float", + "docstring": "Return the score of the model on the data `X`.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Test samples.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n Returns\n -------\n score : float\n ", "source_code": "\ndef score(self, X, y=None):\n \"\"\"Return the score of the model on the data `X`.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Test samples.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n Returns\n -------\n score : float\n \"\"\"\n pass" }, { @@ -28674,13 +28932,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _more_tags(self):\n return {'multioutput': True}" }, { @@ -28698,7 +28957,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -28708,6 +28968,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "The input samples." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -28718,13 +28982,14 @@ "docstring": { "type": "Ignored", "description": "Not used, present for API consistency by convention." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Perform fit on X and returns labels for X.\n\nReturns -1 for outliers and 1 for inliers.", - "docstring": "Perform fit on X and returns labels for X.\n\nReturns -1 for outliers and 1 for inliers.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input samples.\n\ny : Ignored\n Not used, present for API consistency by convention.\n\nReturns\n-------\ny : ndarray of shape (n_samples,)\n 1 for inliers, -1 for outliers.", + "docstring": "Perform fit on X and returns labels for X.\n\n Returns -1 for outliers and 1 for inliers.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input samples.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n Returns\n -------\n y : ndarray of shape (n_samples,)\n 1 for inliers, -1 for outliers.\n ", "source_code": "\ndef fit_predict(self, X, y=None):\n \"\"\"Perform fit on X and returns labels for X.\n\n Returns -1 for outliers and 1 for inliers.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input samples.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n Returns\n -------\n y : ndarray of shape (n_samples,)\n 1 for inliers, -1 for outliers.\n \"\"\"\n return self.fit(X).predict(X)" }, { @@ -28742,13 +29007,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _more_tags(self):\n return {'requires_y': True}" }, { @@ -28766,7 +29032,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -28776,7 +29043,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "Test samples. For some estimators this may be a precomputed\nkernel matrix or a list of generic objects instead with shape\n``(n_samples, n_samples_fitted)``, where ``n_samples_fitted``\nis the number of samples used in the fitting for the estimator." - } + }, + "refined_type": {} }, { "name": "y", @@ -28786,7 +29054,8 @@ "docstring": { "type": "array-like of shape (n_samples,) or (n_samples, n_outputs)", "description": "True values for `X`." - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -28796,13 +29065,14 @@ "docstring": { "type": "array-like of shape (n_samples,), default=None", "description": "Sample weights." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Return the coefficient of determination of the prediction.\n\nThe coefficient of determination :math:`R^2` is defined as :math:`(1 - \\frac{u}{v})`, where :math:`u` is the residual sum of squares ``((y_true - y_pred)** 2).sum()`` and :math:`v` is the total sum of squares ``((y_true - y_true.mean()) ** 2).sum()``. The best possible score is 1.0 and it can be negative (because the model can be arbitrarily worse). 
A constant model that always predicts the expected value of `y`, disregarding the input features, would get a :math:`R^2` score of 0.0.", - "docstring": "Return the coefficient of determination of the prediction.\n\nThe coefficient of determination :math:`R^2` is defined as\n:math:`(1 - \\frac{u}{v})`, where :math:`u` is the residual\nsum of squares ``((y_true - y_pred)** 2).sum()`` and :math:`v`\nis the total sum of squares ``((y_true - y_true.mean()) ** 2).sum()``.\nThe best possible score is 1.0 and it can be negative (because the\nmodel can be arbitrarily worse). A constant model that always predicts\nthe expected value of `y`, disregarding the input features, would get\na :math:`R^2` score of 0.0.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n Test samples. For some estimators this may be a precomputed\n kernel matrix or a list of generic objects instead with shape\n ``(n_samples, n_samples_fitted)``, where ``n_samples_fitted``\n is the number of samples used in the fitting for the estimator.\n\ny : array-like of shape (n_samples,) or (n_samples, n_outputs)\n True values for `X`.\n\nsample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\nReturns\n-------\nscore : float\n :math:`R^2` of ``self.predict(X)`` wrt. `y`.\n\nNotes\n-----\nThe :math:`R^2` score used when calling ``score`` on a regressor uses\n``multioutput='uniform_average'`` from version 0.23 to keep consistent\nwith default value of :func:`~sklearn.metrics.r2_score`.\nThis influences the ``score`` method of all the multioutput\nregressors (except for\n:class:`~sklearn.multioutput.MultiOutputRegressor`).", + "description": "Return the coefficient of determination of the prediction.\n\nThe coefficient of determination :math:`R^2` is defined as\n:math:`(1 - \\frac{u}{v})`, where :math:`u` is the residual\nsum of squares ``((y_true - y_pred)** 2).sum()`` and :math:`v`\nis the total sum of squares ``((y_true - y_true.mean()) ** 2).sum()``.\nThe best possible score is 1.0 and it can be negative (because the\nmodel can be arbitrarily worse). A constant model that always predicts\nthe expected value of `y`, disregarding the input features, would get\na :math:`R^2` score of 0.0.", + "docstring": "Return the coefficient of determination of the prediction.\n\n The coefficient of determination :math:`R^2` is defined as\n :math:`(1 - \\frac{u}{v})`, where :math:`u` is the residual\n sum of squares ``((y_true - y_pred)** 2).sum()`` and :math:`v`\n is the total sum of squares ``((y_true - y_true.mean()) ** 2).sum()``.\n The best possible score is 1.0 and it can be negative (because the\n model can be arbitrarily worse). A constant model that always predicts\n the expected value of `y`, disregarding the input features, would get\n a :math:`R^2` score of 0.0.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Test samples. For some estimators this may be a precomputed\n kernel matrix or a list of generic objects instead with shape\n ``(n_samples, n_samples_fitted)``, where ``n_samples_fitted``\n is the number of samples used in the fitting for the estimator.\n\n y : array-like of shape (n_samples,) or (n_samples, n_outputs)\n True values for `X`.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n Returns\n -------\n score : float\n :math:`R^2` of ``self.predict(X)`` wrt. 
`y`.\n\n Notes\n -----\n The :math:`R^2` score used when calling ``score`` on a regressor uses\n ``multioutput='uniform_average'`` from version 0.23 to keep consistent\n with default value of :func:`~sklearn.metrics.r2_score`.\n This influences the ``score`` method of all the multioutput\n regressors (except for\n :class:`~sklearn.multioutput.MultiOutputRegressor`).\n ", "source_code": "\ndef score(self, X, y, sample_weight=None):\n \"\"\"Return the coefficient of determination of the prediction.\n\n The coefficient of determination :math:`R^2` is defined as\n :math:`(1 - \\frac{u}{v})`, where :math:`u` is the residual\n sum of squares ``((y_true - y_pred)** 2).sum()`` and :math:`v`\n is the total sum of squares ``((y_true - y_true.mean()) ** 2).sum()``.\n The best possible score is 1.0 and it can be negative (because the\n model can be arbitrarily worse). A constant model that always predicts\n the expected value of `y`, disregarding the input features, would get\n a :math:`R^2` score of 0.0.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Test samples. For some estimators this may be a precomputed\n kernel matrix or a list of generic objects instead with shape\n ``(n_samples, n_samples_fitted)``, where ``n_samples_fitted``\n is the number of samples used in the fitting for the estimator.\n\n y : array-like of shape (n_samples,) or (n_samples, n_outputs)\n True values for `X`.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n Returns\n -------\n score : float\n :math:`R^2` of ``self.predict(X)`` wrt. `y`.\n\n Notes\n -----\n The :math:`R^2` score used when calling ``score`` on a regressor uses\n ``multioutput='uniform_average'`` from version 0.23 to keep consistent\n with default value of :func:`~sklearn.metrics.r2_score`.\n This influences the ``score`` method of all the multioutput\n regressors (except for\n :class:`~sklearn.multioutput.MultiOutputRegressor`).\n \"\"\"\n from .metrics import r2_score\n y_pred = self.predict(X)\n return r2_score(y, y_pred, sample_weight=sample_weight)" }, { @@ -28820,7 +29090,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -28830,7 +29101,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "Input samples." - } + }, + "refined_type": {} }, { "name": "y", @@ -28838,15 +29110,16 @@ "is_public": true, "assigned_by": "POSITION_OR_NAME", "docstring": { - "type": " array-like of shape (n_samples,) or (n_samples, n_outputs), default=None", + "type": "array-like of shape (n_samples,) or (n_samples, n_outputs), default=None", "description": "Target values (None for unsupervised transformations)." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Fit to data, then transform it.\n\nFits transformer to `X` and `y` with optional parameters `fit_params` and returns a transformed version of `X`.", - "docstring": "Fit to data, then transform it.\n\nFits transformer to `X` and `y` with optional parameters `fit_params`\nand returns a transformed version of `X`.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n Input samples.\n\ny : array-like of shape (n_samples,) or (n_samples, n_outputs), default=None\n Target values (None for unsupervised transformations).\n\n**fit_params : dict\n Additional fit parameters.\n\nReturns\n-------\nX_new : ndarray array of shape (n_samples, n_features_new)\n Transformed array.", + "description": "Fit to data, then transform it.\n\nFits transformer to `X` and `y` with optional parameters `fit_params`\nand returns a transformed version of `X`.", + "docstring": "\n Fit to data, then transform it.\n\n Fits transformer to `X` and `y` with optional parameters `fit_params`\n and returns a transformed version of `X`.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Input samples.\n\n y : array-like of shape (n_samples,) or (n_samples, n_outputs), default=None\n Target values (None for unsupervised transformations).\n\n **fit_params : dict\n Additional fit parameters.\n\n Returns\n -------\n X_new : ndarray array of shape (n_samples, n_features_new)\n Transformed array.\n ", "source_code": "\ndef fit_transform(self, X, y=None, **fit_params):\n \"\"\"\n Fit to data, then transform it.\n\n Fits transformer to `X` and `y` with optional parameters `fit_params`\n and returns a transformed version of `X`.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Input samples.\n\n y : array-like of shape (n_samples,) or (n_samples, n_outputs), default=None\n Target values (None for unsupervised transformations).\n\n **fit_params : dict\n Additional fit parameters.\n\n Returns\n -------\n X_new : ndarray array of shape (n_samples, n_features_new)\n Transformed array.\n \"\"\"\n if y is None:\n return self.fit(X, **fit_params).transform(X)\n else:\n return self.fit(X, y, **fit_params).transform(X)" }, { @@ -28864,7 +29137,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "input_features", @@ -28874,13 +29148,14 @@ "docstring": { "type": "array-like of str or None, default=None", "description": "Input features.\n\n- If `input_features` is `None`, then `feature_names_in_` is\n used as feature names in. If `feature_names_in_` is not defined,\n then names are generated: `[x0, x1, ..., x(n_features_in_)]`.\n- If `input_features` is an array-like, then `input_features` must\n match `feature_names_in_` if `feature_names_in_` is defined." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Get output feature names for transformation.", - "docstring": "Get output feature names for transformation.\n\nParameters\n----------\ninput_features : array-like of str or None, default=None\n Input features.\n\n - If `input_features` is `None`, then `feature_names_in_` is\n used as feature names in. 
If `feature_names_in_` is not defined,\n then names are generated: `[x0, x1, ..., x(n_features_in_)]`.\n - If `input_features` is an array-like, then `input_features` must\n match `feature_names_in_` if `feature_names_in_` is defined.\n\nReturns\n-------\nfeature_names_out : ndarray of str objects\n Same as input features.", + "docstring": "Get output feature names for transformation.\n\n Parameters\n ----------\n input_features : array-like of str or None, default=None\n Input features.\n\n - If `input_features` is `None`, then `feature_names_in_` is\n used as feature names in. If `feature_names_in_` is not defined,\n then names are generated: `[x0, x1, ..., x(n_features_in_)]`.\n - If `input_features` is an array-like, then `input_features` must\n match `feature_names_in_` if `feature_names_in_` is defined.\n\n Returns\n -------\n feature_names_out : ndarray of str objects\n Same as input features.\n ", "source_code": "\ndef get_feature_names_out(self, input_features=None):\n \"\"\"Get output feature names for transformation.\n\n Parameters\n ----------\n input_features : array-like of str or None, default=None\n Input features.\n\n - If `input_features` is `None`, then `feature_names_in_` is\n used as feature names in. If `feature_names_in_` is not defined,\n then names are generated: `[x0, x1, ..., x(n_features_in_)]`.\n - If `input_features` is an array-like, then `input_features` must\n match `feature_names_in_` if `feature_names_in_` is defined.\n\n Returns\n -------\n feature_names_out : ndarray of str objects\n Same as input features.\n \"\"\"\n return _check_feature_names_in(self, input_features)" }, { @@ -28898,13 +29173,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _more_tags(self):\n return {'non_deterministic': _IS_32BIT or platform.machine().startswith(('ppc', 'powerpc'))}" }, { @@ -28922,13 +29198,14 @@ "docstring": { "type": "object", "description": "Estimator object to test." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Returns True if estimator is pairwise.\n\n- If the `_pairwise` attribute and the tag are present and consistent, then use the value and not issue a warning. - If the `_pairwise` attribute and the tag are present and not consistent, use the `_pairwise` value and issue a deprecation warning. 
- If only the `_pairwise` attribute is present and it is not False, issue a deprecation warning and use the `_pairwise` value.", - "docstring": "Returns True if estimator is pairwise.\n\n- If the `_pairwise` attribute and the tag are present and consistent,\n then use the value and not issue a warning.\n- If the `_pairwise` attribute and the tag are present and not\n consistent, use the `_pairwise` value and issue a deprecation\n warning.\n- If only the `_pairwise` attribute is present and it is not False,\n issue a deprecation warning and use the `_pairwise` value.\n\nParameters\n----------\nestimator : object\n Estimator object to test.\n\nReturns\n-------\nout : bool\n True if the estimator is pairwise and False otherwise.", + "description": "Returns True if estimator is pairwise.\n\n- If the `_pairwise` attribute and the tag are present and consistent,\n then use the value and not issue a warning.\n- If the `_pairwise` attribute and the tag are present and not\n consistent, use the `_pairwise` value and issue a deprecation\n warning.\n- If only the `_pairwise` attribute is present and it is not False,\n issue a deprecation warning and use the `_pairwise` value.", + "docstring": "Returns True if estimator is pairwise.\n\n - If the `_pairwise` attribute and the tag are present and consistent,\n then use the value and not issue a warning.\n - If the `_pairwise` attribute and the tag are present and not\n consistent, use the `_pairwise` value and issue a deprecation\n warning.\n - If only the `_pairwise` attribute is present and it is not False,\n issue a deprecation warning and use the `_pairwise` value.\n\n Parameters\n ----------\n estimator : object\n Estimator object to test.\n\n Returns\n -------\n out : bool\n True if the estimator is pairwise and False otherwise.\n ", "source_code": "\ndef _is_pairwise(estimator):\n \"\"\"Returns True if estimator is pairwise.\n\n - If the `_pairwise` attribute and the tag are present and consistent,\n then use the value and not issue a warning.\n - If the `_pairwise` attribute and the tag are present and not\n consistent, use the `_pairwise` value and issue a deprecation\n warning.\n - If only the `_pairwise` attribute is present and it is not False,\n issue a deprecation warning and use the `_pairwise` value.\n\n Parameters\n ----------\n estimator : object\n Estimator object to test.\n\n Returns\n -------\n out : bool\n True if the estimator is pairwise and False otherwise.\n \"\"\"\n with warnings.catch_warnings():\n warnings.filterwarnings('ignore', category=FutureWarning)\n has_pairwise_attribute = hasattr(estimator, '_pairwise')\n pairwise_attribute = getattr(estimator, '_pairwise', False)\n pairwise_tag = _safe_tags(estimator, key='pairwise')\n if has_pairwise_attribute:\n if pairwise_attribute != pairwise_tag:\n warnings.warn('_pairwise was deprecated in 0.24 and will be removed in 1.1 (renaming of 0.26). Set the estimator tags of your estimator instead', FutureWarning)\n return pairwise_attribute\n return pairwise_tag" }, { @@ -28946,7 +29223,8 @@ "docstring": { "type": "dict", "description": "The dictionary to pretty print" - } + }, + "refined_type": {} }, { "name": "offset", @@ -28956,7 +29234,8 @@ "docstring": { "type": "int, default=0", "description": "The offset in characters to add at the begin of each line." 
- } + }, + "refined_type": {} }, { "name": "printer", @@ -28966,13 +29245,14 @@ "docstring": { "type": "callable, default=repr", "description": "The function to convert entries to strings, typically\nthe builtin str or repr" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Pretty print the dictionary 'params'", - "docstring": "Pretty print the dictionary 'params'\n\nParameters\n----------\nparams : dict\n The dictionary to pretty print\n\noffset : int, default=0\n The offset in characters to add at the begin of each line.\n\nprinter : callable, default=repr\n The function to convert entries to strings, typically\n the builtin str or repr", + "docstring": "Pretty print the dictionary 'params'\n\n Parameters\n ----------\n params : dict\n The dictionary to pretty print\n\n offset : int, default=0\n The offset in characters to add at the begin of each line.\n\n printer : callable, default=repr\n The function to convert entries to strings, typically\n the builtin str or repr\n\n ", "source_code": "\ndef _pprint(params, offset=0, printer=repr):\n \"\"\"Pretty print the dictionary 'params'\n\n Parameters\n ----------\n params : dict\n The dictionary to pretty print\n\n offset : int, default=0\n The offset in characters to add at the begin of each line.\n\n printer : callable, default=repr\n The function to convert entries to strings, typically\n the builtin str or repr\n\n \"\"\"\n options = np.get_printoptions()\n np.set_printoptions(precision=5, threshold=64, edgeitems=2)\n params_list = list()\n this_line_length = offset\n line_sep = ',\\n' + (1 + offset // 2) * ' '\n for (i, (k, v)) in enumerate(sorted(params.items())):\n if type(v) is float:\n this_repr = '%s=%s' % (k, str(v))\n else:\n this_repr = '%s=%s' % (k, printer(v))\n if len(this_repr) > 500:\n this_repr = this_repr[:300] + '...' + this_repr[-100:]\n if i > 0:\n if this_line_length + len(this_repr) >= 75 or '\\n' in this_repr:\n params_list.append(line_sep)\n this_line_length = len(line_sep)\n else:\n params_list.append(', ')\n this_line_length += 2\n params_list.append(this_repr)\n this_line_length += len(this_repr)\n np.set_printoptions(**options)\n lines = ''.join(params_list)\n lines = '\\n'.join((l.rstrip(' ') for l in lines.split('\\n')))\n return lines" }, { @@ -28990,6 +29270,10 @@ "docstring": { "type": "{list, tuple, set} of estimator instance or a single estimator instance", "description": "The estimator or group of estimators to be cloned." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -29000,14 +29284,15 @@ "docstring": { "type": "bool, default=True", "description": "If safe is False, clone will fall back to a deep copy on objects\nthat are not estimators." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Constructs a new unfitted estimator with the same parameters.\n\nClone does a deep copy of the model in an estimator without actually copying attached data. It yields a new estimator with the same parameters that has not been fitted on any data. If the estimator's `random_state` parameter is an integer (or if the estimator doesn't have a `random_state` parameter), an *exact clone* is returned: the clone and the original estimator will give the exact same results. Otherwise, *statistical clone* is returned: the clone might yield different results from the original estimator. 
More details can be found in :ref:`randomness`.", - "docstring": "Constructs a new unfitted estimator with the same parameters.\n\nClone does a deep copy of the model in an estimator\nwithout actually copying attached data. It yields a new estimator\nwith the same parameters that has not been fitted on any data.\n\nIf the estimator's `random_state` parameter is an integer (or if the\nestimator doesn't have a `random_state` parameter), an *exact clone* is\nreturned: the clone and the original estimator will give the exact same\nresults. Otherwise, *statistical clone* is returned: the clone might\nyield different results from the original estimator. More details can be\nfound in :ref:`randomness`.\n\nParameters\n----------\nestimator : {list, tuple, set} of estimator instance or a single estimator instance\n The estimator or group of estimators to be cloned.\n\nsafe : bool, default=True\n If safe is False, clone will fall back to a deep copy on objects\n that are not estimators.", - "source_code": "\ndef clone(estimator, *, safe=True):\n \"\"\"Constructs a new unfitted estimator with the same parameters.\n\n Clone does a deep copy of the model in an estimator\n without actually copying attached data. It yields a new estimator\n with the same parameters that has not been fitted on any data.\n\n If the estimator's `random_state` parameter is an integer (or if the\n estimator doesn't have a `random_state` parameter), an *exact clone* is\n returned: the clone and the original estimator will give the exact same\n results. Otherwise, *statistical clone* is returned: the clone might\n yield different results from the original estimator. More details can be\n found in :ref:`randomness`.\n\n Parameters\n ----------\n estimator : {list, tuple, set} of estimator instance or a single estimator instance\n The estimator or group of estimators to be cloned.\n\n safe : bool, default=True\n If safe is False, clone will fall back to a deep copy on objects\n that are not estimators.\n\n \"\"\"\n estimator_type = type(estimator)\n if estimator_type in (list, tuple, set, frozenset):\n return estimator_type([clone(e, safe=safe) for e in estimator])\n elif not hasattr(estimator, 'get_params') or isinstance(estimator, type):\n if not safe:\n return copy.deepcopy(estimator)\n elif isinstance(estimator, type):\n raise TypeError('Cannot clone object. ' + 'You should provide an instance of ' + 'scikit-learn estimator instead of a class.')\n else:\n raise TypeError(\"Cannot clone object '%s' (type %s): it does not seem to be a scikit-learn estimator as it does not implement a 'get_params' method.\" % (repr(estimator), type(estimator)))\n klass = estimator.__class__\n new_object_params = estimator.get_params(deep=False)\n for (name, param) in new_object_params.items():\n new_object_params[name] = clone(param, safe=False)\n new_object = klass(**new_object_params)\n params_set = new_object.get_params(deep=False)\n for name in new_object_params:\n param1 = new_object_params[name]\n param2 = params_set[name]\n if param1 is not param2:\n raise RuntimeError('Cannot clone object %s, as the constructor either does not set or modifies parameter %s' % (estimator, name))\n return new_object" + "description": "Construct a new unfitted estimator with the same parameters.\n\nClone does a deep copy of the model in an estimator\nwithout actually copying attached data. 
It returns a new estimator\nwith the same parameters that has not been fitted on any data.", + "docstring": "Construct a new unfitted estimator with the same parameters.\n\n Clone does a deep copy of the model in an estimator\n without actually copying attached data. It returns a new estimator\n with the same parameters that has not been fitted on any data.\n\n Parameters\n ----------\n estimator : {list, tuple, set} of estimator instance or a single estimator instance\n The estimator or group of estimators to be cloned.\n safe : bool, default=True\n If safe is False, clone will fall back to a deep copy on objects\n that are not estimators.\n\n Returns\n -------\n estimator : object\n The deep copy of the input, an estimator if input is an estimator.\n\n Notes\n -----\n If the estimator's `random_state` parameter is an integer (or if the\n estimator doesn't have a `random_state` parameter), an *exact clone* is\n returned: the clone and the original estimator will give the exact same\n results. Otherwise, *statistical clone* is returned: the clone might\n return different results from the original estimator. More details can be\n found in :ref:`randomness`.\n ", + "source_code": "\ndef clone(estimator, *, safe=True):\n \"\"\"Construct a new unfitted estimator with the same parameters.\n\n Clone does a deep copy of the model in an estimator\n without actually copying attached data. It returns a new estimator\n with the same parameters that has not been fitted on any data.\n\n Parameters\n ----------\n estimator : {list, tuple, set} of estimator instance or a single estimator instance\n The estimator or group of estimators to be cloned.\n safe : bool, default=True\n If safe is False, clone will fall back to a deep copy on objects\n that are not estimators.\n\n Returns\n -------\n estimator : object\n The deep copy of the input, an estimator if input is an estimator.\n\n Notes\n -----\n If the estimator's `random_state` parameter is an integer (or if the\n estimator doesn't have a `random_state` parameter), an *exact clone* is\n returned: the clone and the original estimator will give the exact same\n results. Otherwise, *statistical clone* is returned: the clone might\n return different results from the original estimator. More details can be\n found in :ref:`randomness`.\n \"\"\"\n estimator_type = type(estimator)\n if estimator_type in (list, tuple, set, frozenset):\n return estimator_type([clone(e, safe=safe) for e in estimator])\n elif not hasattr(estimator, 'get_params') or isinstance(estimator, type):\n if not safe:\n return copy.deepcopy(estimator)\n elif isinstance(estimator, type):\n raise TypeError('Cannot clone object. 
' + 'You should provide an instance of ' + 'scikit-learn estimator instead of a class.')\n else:\n raise TypeError(\"Cannot clone object '%s' (type %s): it does not seem to be a scikit-learn estimator as it does not implement a 'get_params' method.\" % (repr(estimator), type(estimator)))\n klass = estimator.__class__\n new_object_params = estimator.get_params(deep=False)\n for (name, param) in new_object_params.items():\n new_object_params[name] = clone(param, safe=False)\n new_object = klass(**new_object_params)\n params_set = new_object.get_params(deep=False)\n for name in new_object_params:\n param1 = new_object_params[name]\n param2 = params_set[name]\n if param1 is not param2:\n raise RuntimeError('Cannot clone object %s, as the constructor either does not set or modifies parameter %s' % (estimator, name))\n return new_object" }, { "name": "is_classifier", @@ -29024,13 +29309,14 @@ "docstring": { "type": "object", "description": "Estimator object to test." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Return True if the given estimator is (probably) a classifier.", - "docstring": "Return True if the given estimator is (probably) a classifier.\n\nParameters\n----------\nestimator : object\n Estimator object to test.\n\nReturns\n-------\nout : bool\n True if estimator is a classifier and False otherwise.", + "docstring": "Return True if the given estimator is (probably) a classifier.\n\n Parameters\n ----------\n estimator : object\n Estimator object to test.\n\n Returns\n -------\n out : bool\n True if estimator is a classifier and False otherwise.\n ", "source_code": "\ndef is_classifier(estimator):\n \"\"\"Return True if the given estimator is (probably) a classifier.\n\n Parameters\n ----------\n estimator : object\n Estimator object to test.\n\n Returns\n -------\n out : bool\n True if estimator is a classifier and False otherwise.\n \"\"\"\n return getattr(estimator, '_estimator_type', None) == 'classifier'" }, { @@ -29048,13 +29334,14 @@ "docstring": { "type": "estimator instance", "description": "Estimator object to test." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Return True if the given estimator is (probably) an outlier detector.", - "docstring": "Return True if the given estimator is (probably) an outlier detector.\n\nParameters\n----------\nestimator : estimator instance\n Estimator object to test.\n\nReturns\n-------\nout : bool\n True if estimator is an outlier detector and False otherwise.", + "docstring": "Return True if the given estimator is (probably) an outlier detector.\n\n Parameters\n ----------\n estimator : estimator instance\n Estimator object to test.\n\n Returns\n -------\n out : bool\n True if estimator is an outlier detector and False otherwise.\n ", "source_code": "\ndef is_outlier_detector(estimator):\n \"\"\"Return True if the given estimator is (probably) an outlier detector.\n\n Parameters\n ----------\n estimator : estimator instance\n Estimator object to test.\n\n Returns\n -------\n out : bool\n True if estimator is an outlier detector and False otherwise.\n \"\"\"\n return getattr(estimator, '_estimator_type', None) == 'outlier_detector'" }, { @@ -29072,13 +29359,14 @@ "docstring": { "type": "estimator instance", "description": "Estimator object to test." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Return True if the given estimator is (probably) a regressor.", - "docstring": "Return True if the given estimator is (probably) a regressor.\n\nParameters\n----------\nestimator : estimator instance\n Estimator object to test.\n\nReturns\n-------\nout : bool\n True if estimator is a regressor and False otherwise.", + "docstring": "Return True if the given estimator is (probably) a regressor.\n\n Parameters\n ----------\n estimator : estimator instance\n Estimator object to test.\n\n Returns\n -------\n out : bool\n True if estimator is a regressor and False otherwise.\n ", "source_code": "\ndef is_regressor(estimator):\n \"\"\"Return True if the given estimator is (probably) a regressor.\n\n Parameters\n ----------\n estimator : estimator instance\n Estimator object to test.\n\n Returns\n -------\n out : bool\n True if estimator is a regressor and False otherwise.\n \"\"\"\n return getattr(estimator, '_estimator_type', None) == 'regressor'" }, { @@ -29096,7 +29384,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "base_estimator", @@ -29106,7 +29395,8 @@ "docstring": { "type": "estimator instance, default=None", "description": "The classifier whose output need to be calibrated to provide more\naccurate `predict_proba` outputs. The default classifier is\na :class:`~sklearn.svm.LinearSVC`." - } + }, + "refined_type": {} }, { "name": "method", @@ -29116,6 +29406,10 @@ "docstring": { "type": "{'sigmoid', 'isotonic'}, default='sigmoid'", "description": "The method to use for calibration. Can be 'sigmoid' which\ncorresponds to Platt's method (i.e. a logistic regression model) or\n'isotonic' which is a non-parametric approach. It is not advised to\nuse isotonic calibration with too few calibration samples\n``(<<1000)`` since it tends to overfit." + }, + "refined_type": { + "kind": "EnumType", + "values": ["sigmoid", "isotonic"] } }, { @@ -29126,7 +29420,8 @@ "docstring": { "type": "int, cross-validation generator, iterable or \"prefit\", default=None", "description": "Determines the cross-validation splitting strategy.\nPossible inputs for cv are:\n\n- None, to use the default 5-fold cross-validation,\n- integer, to specify the number of folds.\n- :term:`CV splitter`,\n- An iterable yielding (train, test) splits as arrays of indices.\n\nFor integer/None inputs, if ``y`` is binary or multiclass,\n:class:`~sklearn.model_selection.StratifiedKFold` is used. If ``y`` is\nneither binary nor multiclass, :class:`~sklearn.model_selection.KFold`\nis used.\n\nRefer to the :ref:`User Guide ` for the various\ncross-validation strategies that can be used here.\n\nIf \"prefit\" is passed, it is assumed that `base_estimator` has been\nfitted already and all data is used for calibration.\n\n.. versionchanged:: 0.22\n ``cv`` default value if None changed from 3-fold to 5-fold." - } + }, + "refined_type": {} }, { "name": "n_jobs", @@ -29136,7 +29431,8 @@ "docstring": { "type": "int, default=None", "description": "Number of jobs to run in parallel.\n``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n``-1`` means using all processors.\n\nBase estimator clones are fitted in parallel across cross-validation\niterations. Therefore parallelism happens only when `cv != \"prefit\"`.\n\nSee :term:`Glossary ` for more details.\n\n.. 
versionadded:: 0.24" - } + }, + "refined_type": {} }, { "name": "ensemble", @@ -29146,13 +29442,14 @@ "docstring": { "type": "bool, default=True", "description": "Determines how the calibrator is fitted when `cv` is not `'prefit'`.\nIgnored if `cv='prefit'`.\n\nIf `True`, the `base_estimator` is fitted using training data and\ncalibrated using testing data, for each `cv` fold. The final estimator\nis an ensemble of `n_cv` fitted classifier and calibrator pairs, where\n`n_cv` is the number of cross-validation folds. The output is the\naverage predicted probabilities of all pairs.\n\nIf `False`, `cv` is used to compute unbiased predictions, via\n:func:`~sklearn.model_selection.cross_val_predict`, which are then\nused for calibration. At prediction time, the classifier used is the\n`base_estimator` trained on all the data.\nNote that this method is also internally implemented in\n:mod:`sklearn.svm` estimators with the `probabilities=True` parameter.\n\n.. versionadded:: 0.24" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, base_estimator=None, *, method='sigmoid', cv=None, n_jobs=None, ensemble=True):\n self.base_estimator = base_estimator\n self.method = method\n self.cv = cv\n self.n_jobs = n_jobs\n self.ensemble = ensemble" }, { @@ -29170,13 +29467,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _more_tags(self):\n return {'_xfail_checks': {'check_sample_weights_invariance': 'Due to the cross-validation and sample ordering, removing a sample is not strictly equal to putting is weight to zero. Specific unit tests are added for CalibratedClassifierCV specifically.'}}" }, { @@ -29194,7 +29492,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -29204,7 +29503,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "Training data." - } + }, + "refined_type": {} }, { "name": "y", @@ -29214,7 +29514,8 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "Target values." - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -29224,13 +29525,14 @@ "docstring": { "type": "array-like of shape (n_samples,), default=None", "description": "Sample weights. If None, then samples are equally weighted." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Fit the calibrated model.", - "docstring": "Fit the calibrated model.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n Training data.\n\ny : array-like of shape (n_samples,)\n Target values.\n\nsample_weight : array-like of shape (n_samples,), default=None\n Sample weights. If None, then samples are equally weighted.\n\nReturns\n-------\nself : object\n Returns an instance of self.", + "docstring": "Fit the calibrated model.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training data.\n\n y : array-like of shape (n_samples,)\n Target values.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights. 
If None, then samples are equally weighted.\n\n Returns\n -------\n self : object\n Returns an instance of self.\n ", "source_code": "\ndef fit(self, X, y, sample_weight=None):\n \"\"\"Fit the calibrated model.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training data.\n\n y : array-like of shape (n_samples,)\n Target values.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights. If None, then samples are equally weighted.\n\n Returns\n -------\n self : object\n Returns an instance of self.\n \"\"\"\n check_classification_targets(y)\n (X, y) = indexable(X, y)\n if sample_weight is not None:\n sample_weight = _check_sample_weight(sample_weight, X)\n if self.base_estimator is None:\n base_estimator = LinearSVC(random_state=0)\n else:\n base_estimator = self.base_estimator\n self.calibrated_classifiers_ = []\n if self.cv == 'prefit':\n check_is_fitted(self.base_estimator, attributes=['classes_'])\n self.classes_ = self.base_estimator.classes_\n (pred_method, method_name) = _get_prediction_method(base_estimator)\n n_classes = len(self.classes_)\n predictions = _compute_predictions(pred_method, method_name, X, n_classes)\n calibrated_classifier = _fit_calibrator(base_estimator, predictions, y, self.classes_, self.method, sample_weight)\n self.calibrated_classifiers_.append(calibrated_classifier)\n else:\n label_encoder_ = LabelEncoder().fit(y)\n self.classes_ = label_encoder_.classes_\n n_classes = len(self.classes_)\n fit_parameters = signature(base_estimator.fit).parameters\n supports_sw = 'sample_weight' in fit_parameters\n if sample_weight is not None and not supports_sw:\n estimator_name = type(base_estimator).__name__\n warnings.warn(f'Since {estimator_name} does not appear to accept sample_weight, sample weights will only be used for the calibration itself. This can be caused by a limitation of the current scikit-learn API. See the following issue for more details: https://github.com/scikit-learn/scikit-learn/issues/21134. 
Be warned that the result of the calibration is likely to be incorrect.')\n if isinstance(self.cv, int):\n n_folds = self.cv\n elif hasattr(self.cv, 'n_splits'):\n n_folds = self.cv.n_splits\n else:\n n_folds = None\n if n_folds and np.any([np.sum(y == class_) < n_folds for class_ in self.classes_]):\n raise ValueError(f'Requesting {n_folds}-fold cross-validation but provided less than {n_folds} examples for at least one class.')\n cv = check_cv(self.cv, y, classifier=True)\n if self.ensemble:\n parallel = Parallel(n_jobs=self.n_jobs)\n self.calibrated_classifiers_ = parallel((delayed(_fit_classifier_calibrator_pair)(clone(base_estimator), X, y, train=train, test=test, method=self.method, classes=self.classes_, supports_sw=supports_sw, sample_weight=sample_weight) for (train, test) in cv.split(X, y)))\n else:\n this_estimator = clone(base_estimator)\n (_, method_name) = _get_prediction_method(this_estimator)\n fit_params = {'sample_weight': sample_weight} if sample_weight is not None and supports_sw else None\n pred_method = partial(cross_val_predict, estimator=this_estimator, X=X, y=y, cv=cv, method=method_name, n_jobs=self.n_jobs, fit_params=fit_params)\n predictions = _compute_predictions(pred_method, method_name, X, n_classes)\n if sample_weight is not None and supports_sw:\n this_estimator.fit(X, y, sample_weight)\n else:\n this_estimator.fit(X, y)\n calibrated_classifier = _fit_calibrator(this_estimator, predictions, y, self.classes_, self.method, sample_weight)\n self.calibrated_classifiers_.append(calibrated_classifier)\n first_clf = self.calibrated_classifiers_[0].base_estimator\n if hasattr(first_clf, 'n_features_in_'):\n self.n_features_in_ = first_clf.n_features_in_\n if hasattr(first_clf, 'feature_names_in_'):\n self.feature_names_in_ = first_clf.feature_names_in_\n return self" }, { @@ -29248,7 +29550,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -29258,13 +29561,14 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "The samples, as accepted by `base_estimator.predict`." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Predict the target of new samples.\n\nThe predicted class is the class that has the highest probability, and can thus be different from the prediction of the uncalibrated classifier.", - "docstring": "Predict the target of new samples.\n\nThe predicted class is the class that has the highest probability,\nand can thus be different from the prediction of the uncalibrated classifier.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n The samples, as accepted by `base_estimator.predict`.\n\nReturns\n-------\nC : ndarray of shape (n_samples,)\n The predicted class.", + "description": "Predict the target of new samples.\n\nThe predicted class is the class that has the highest probability,\nand can thus be different from the prediction of the uncalibrated classifier.", + "docstring": "Predict the target of new samples.\n\n The predicted class is the class that has the highest probability,\n and can thus be different from the prediction of the uncalibrated classifier.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The samples, as accepted by `base_estimator.predict`.\n\n Returns\n -------\n C : ndarray of shape (n_samples,)\n The predicted class.\n ", "source_code": "\ndef predict(self, X):\n \"\"\"Predict the target of new samples.\n\n The predicted class is the class that has the highest probability,\n and can thus be different from the prediction of the uncalibrated classifier.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The samples, as accepted by `base_estimator.predict`.\n\n Returns\n -------\n C : ndarray of shape (n_samples,)\n The predicted class.\n \"\"\"\n check_is_fitted(self)\n return self.classes_[np.argmax(self.predict_proba(X), axis=1)]" }, { @@ -29282,7 +29586,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -29292,13 +29597,14 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "The samples, as accepted by `base_estimator.predict_proba`." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Calibrated probabilities of classification.\n\nThis function returns calibrated probabilities of classification according to each class on an array of test vectors X.", - "docstring": "Calibrated probabilities of classification.\n\nThis function returns calibrated probabilities of classification\naccording to each class on an array of test vectors X.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n The samples, as accepted by `base_estimator.predict_proba`.\n\nReturns\n-------\nC : ndarray of shape (n_samples, n_classes)\n The predicted probas.", + "description": "Calibrated probabilities of classification.\n\nThis function returns calibrated probabilities of classification\naccording to each class on an array of test vectors X.", + "docstring": "Calibrated probabilities of classification.\n\n This function returns calibrated probabilities of classification\n according to each class on an array of test vectors X.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The samples, as accepted by `base_estimator.predict_proba`.\n\n Returns\n -------\n C : ndarray of shape (n_samples, n_classes)\n The predicted probas.\n ", "source_code": "\ndef predict_proba(self, X):\n \"\"\"Calibrated probabilities of classification.\n\n This function returns calibrated probabilities of classification\n according to each class on an array of test vectors X.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The samples, as accepted by `base_estimator.predict_proba`.\n\n Returns\n -------\n C : ndarray of shape (n_samples, n_classes)\n The predicted probas.\n \"\"\"\n check_is_fitted(self)\n mean_proba = np.zeros((_num_samples(X), len(self.classes_)))\n for calibrated_classifier in self.calibrated_classifiers_:\n proba = calibrated_classifier.predict_proba(X)\n mean_proba += proba\n mean_proba /= len(self.calibrated_classifiers_)\n return mean_proba" }, { @@ -29316,7 +29622,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "prob_true", @@ -29326,7 +29633,8 @@ "docstring": { "type": "ndarray of shape (n_bins,)", "description": "The proportion of samples whose class is the positive class (fraction\nof positives), in each bin." - } + }, + "refined_type": {} }, { "name": "prob_pred", @@ -29336,7 +29644,8 @@ "docstring": { "type": "ndarray of shape (n_bins,)", "description": "The mean predicted probability in each bin." - } + }, + "refined_type": {} }, { "name": "y_prob", @@ -29346,7 +29655,8 @@ "docstring": { "type": "ndarray of shape (n_samples,)", "description": "Probability estimates for the positive class, for each sample." - } + }, + "refined_type": {} }, { "name": "estimator_name", @@ -29356,13 +29666,14 @@ "docstring": { "type": "str, default=None", "description": "Name of estimator. If None, the estimator name is not shown." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, prob_true, prob_pred, y_prob, *, estimator_name=None):\n self.prob_true = prob_true\n self.prob_pred = prob_pred\n self.y_prob = y_prob\n self.estimator_name = estimator_name" }, { @@ -29380,7 +29691,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "estimator", @@ -29390,7 +29702,8 @@ "docstring": { "type": "estimator instance", "description": "Fitted classifier or a fitted :class:`~sklearn.pipeline.Pipeline`\nin which the last estimator is a classifier. The classifier must\nhave a :term:`predict_proba` method." - } + }, + "refined_type": {} }, { "name": "X", @@ -29400,6 +29713,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "Input values." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -29410,7 +29727,8 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "Binary target values." - } + }, + "refined_type": {} }, { "name": "n_bins", @@ -29420,7 +29738,8 @@ "docstring": { "type": "int, default=5", "description": "Number of bins to discretize the [0, 1] interval into when\ncalculating the calibration curve. A bigger number requires more\ndata." - } + }, + "refined_type": {} }, { "name": "strategy", @@ -29430,6 +29749,10 @@ "docstring": { "type": "{'uniform', 'quantile'}, default='uniform'", "description": "Strategy used to define the widths of the bins.\n\n- `'uniform'`: The bins have identical widths.\n- `'quantile'`: The bins have the same number of samples and depend\n on predicted probabilities." + }, + "refined_type": { + "kind": "EnumType", + "values": ["quantile", "uniform"] } }, { @@ -29440,7 +29763,8 @@ "docstring": { "type": "str, default=None", "description": "Name for labeling curve. If `None`, the name of the estimator is\nused." - } + }, + "refined_type": {} }, { "name": "ref_line", @@ -29450,7 +29774,8 @@ "docstring": { "type": "bool, default=True", "description": "If `True`, plots a reference line representing a perfectly\ncalibrated classifier." - } + }, + "refined_type": {} }, { "name": "ax", @@ -29460,13 +29785,14 @@ "docstring": { "type": "matplotlib axes, default=None", "description": "Axes object to plot on. If `None`, a new figure and axes is\ncreated." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Plot calibration curve using a binary classifier and data.\n\nA calibration curve, also known as a reliability diagram, uses inputs from a binary classifier and plots the average predicted probability for each bin against the fraction of positive classes, on the y-axis. Extra keyword arguments will be passed to :func:`matplotlib.pyplot.plot`. Read more about calibration in the :ref:`User Guide ` and more about the scikit-learn visualization API in :ref:`visualizations`. .. versionadded:: 1.0", - "docstring": "Plot calibration curve using a binary classifier and data.\n\nA calibration curve, also known as a reliability diagram, uses inputs\nfrom a binary classifier and plots the average predicted probability\nfor each bin against the fraction of positive classes, on the\ny-axis.\n\nExtra keyword arguments will be passed to\n:func:`matplotlib.pyplot.plot`.\n\nRead more about calibration in the :ref:`User Guide ` and\nmore about the scikit-learn visualization API in :ref:`visualizations`.\n\n.. 
versionadded:: 1.0\n\nParameters\n----------\nestimator : estimator instance\n Fitted classifier or a fitted :class:`~sklearn.pipeline.Pipeline`\n in which the last estimator is a classifier. The classifier must\n have a :term:`predict_proba` method.\n\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n Input values.\n\ny : array-like of shape (n_samples,)\n Binary target values.\n\nn_bins : int, default=5\n Number of bins to discretize the [0, 1] interval into when\n calculating the calibration curve. A bigger number requires more\n data.\n\nstrategy : {'uniform', 'quantile'}, default='uniform'\n Strategy used to define the widths of the bins.\n\n - `'uniform'`: The bins have identical widths.\n - `'quantile'`: The bins have the same number of samples and depend\n on predicted probabilities.\n\nname : str, default=None\n Name for labeling curve. If `None`, the name of the estimator is\n used.\n\nref_line : bool, default=True\n If `True`, plots a reference line representing a perfectly\n calibrated classifier.\n\nax : matplotlib axes, default=None\n Axes object to plot on. If `None`, a new figure and axes is\n created.\n\n**kwargs : dict\n Keyword arguments to be passed to :func:`matplotlib.pyplot.plot`.\n\nReturns\n-------\ndisplay : :class:`~sklearn.calibration.CalibrationDisplay`.\n Object that stores computed values.\n\nSee Also\n--------\nCalibrationDisplay.from_predictions : Plot calibration curve using true\n and predicted labels.\n\nExamples\n--------\n>>> import matplotlib.pyplot as plt\n>>> from sklearn.datasets import make_classification\n>>> from sklearn.model_selection import train_test_split\n>>> from sklearn.linear_model import LogisticRegression\n>>> from sklearn.calibration import CalibrationDisplay\n>>> X, y = make_classification(random_state=0)\n>>> X_train, X_test, y_train, y_test = train_test_split(\n... X, y, random_state=0)\n>>> clf = LogisticRegression(random_state=0)\n>>> clf.fit(X_train, y_train)\nLogisticRegression(random_state=0)\n>>> disp = CalibrationDisplay.from_estimator(clf, X_test, y_test)\n>>> plt.show()", + "description": "Plot calibration curve using a binary classifier and data.\n\nA calibration curve, also known as a reliability diagram, uses inputs\nfrom a binary classifier and plots the average predicted probability\nfor each bin against the fraction of positive classes, on the\ny-axis.\n\nExtra keyword arguments will be passed to\n:func:`matplotlib.pyplot.plot`.\n\nRead more about calibration in the :ref:`User Guide ` and\nmore about the scikit-learn visualization API in :ref:`visualizations`.\n\n.. versionadded:: 1.0", + "docstring": "Plot calibration curve using a binary classifier and data.\n\n A calibration curve, also known as a reliability diagram, uses inputs\n from a binary classifier and plots the average predicted probability\n for each bin against the fraction of positive classes, on the\n y-axis.\n\n Extra keyword arguments will be passed to\n :func:`matplotlib.pyplot.plot`.\n\n Read more about calibration in the :ref:`User Guide ` and\n more about the scikit-learn visualization API in :ref:`visualizations`.\n\n .. versionadded:: 1.0\n\n Parameters\n ----------\n estimator : estimator instance\n Fitted classifier or a fitted :class:`~sklearn.pipeline.Pipeline`\n in which the last estimator is a classifier. 
The classifier must\n have a :term:`predict_proba` method.\n\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Input values.\n\n y : array-like of shape (n_samples,)\n Binary target values.\n\n n_bins : int, default=5\n Number of bins to discretize the [0, 1] interval into when\n calculating the calibration curve. A bigger number requires more\n data.\n\n strategy : {'uniform', 'quantile'}, default='uniform'\n Strategy used to define the widths of the bins.\n\n - `'uniform'`: The bins have identical widths.\n - `'quantile'`: The bins have the same number of samples and depend\n on predicted probabilities.\n\n name : str, default=None\n Name for labeling curve. If `None`, the name of the estimator is\n used.\n\n ref_line : bool, default=True\n If `True`, plots a reference line representing a perfectly\n calibrated classifier.\n\n ax : matplotlib axes, default=None\n Axes object to plot on. If `None`, a new figure and axes is\n created.\n\n **kwargs : dict\n Keyword arguments to be passed to :func:`matplotlib.pyplot.plot`.\n\n Returns\n -------\n display : :class:`~sklearn.calibration.CalibrationDisplay`.\n Object that stores computed values.\n\n See Also\n --------\n CalibrationDisplay.from_predictions : Plot calibration curve using true\n and predicted labels.\n\n Examples\n --------\n >>> import matplotlib.pyplot as plt\n >>> from sklearn.datasets import make_classification\n >>> from sklearn.model_selection import train_test_split\n >>> from sklearn.linear_model import LogisticRegression\n >>> from sklearn.calibration import CalibrationDisplay\n >>> X, y = make_classification(random_state=0)\n >>> X_train, X_test, y_train, y_test = train_test_split(\n ... X, y, random_state=0)\n >>> clf = LogisticRegression(random_state=0)\n >>> clf.fit(X_train, y_train)\n LogisticRegression(random_state=0)\n >>> disp = CalibrationDisplay.from_estimator(clf, X_test, y_test)\n >>> plt.show()\n ", "source_code": "\n@classmethod\ndef from_estimator(cls, estimator, X, y, *, n_bins=5, strategy='uniform', name=None, ref_line=True, ax=None, **kwargs):\n \"\"\"Plot calibration curve using a binary classifier and data.\n\n A calibration curve, also known as a reliability diagram, uses inputs\n from a binary classifier and plots the average predicted probability\n for each bin against the fraction of positive classes, on the\n y-axis.\n\n Extra keyword arguments will be passed to\n :func:`matplotlib.pyplot.plot`.\n\n Read more about calibration in the :ref:`User Guide ` and\n more about the scikit-learn visualization API in :ref:`visualizations`.\n\n .. versionadded:: 1.0\n\n Parameters\n ----------\n estimator : estimator instance\n Fitted classifier or a fitted :class:`~sklearn.pipeline.Pipeline`\n in which the last estimator is a classifier. The classifier must\n have a :term:`predict_proba` method.\n\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Input values.\n\n y : array-like of shape (n_samples,)\n Binary target values.\n\n n_bins : int, default=5\n Number of bins to discretize the [0, 1] interval into when\n calculating the calibration curve. A bigger number requires more\n data.\n\n strategy : {'uniform', 'quantile'}, default='uniform'\n Strategy used to define the widths of the bins.\n\n - `'uniform'`: The bins have identical widths.\n - `'quantile'`: The bins have the same number of samples and depend\n on predicted probabilities.\n\n name : str, default=None\n Name for labeling curve. 
If `None`, the name of the estimator is\n used.\n\n ref_line : bool, default=True\n If `True`, plots a reference line representing a perfectly\n calibrated classifier.\n\n ax : matplotlib axes, default=None\n Axes object to plot on. If `None`, a new figure and axes is\n created.\n\n **kwargs : dict\n Keyword arguments to be passed to :func:`matplotlib.pyplot.plot`.\n\n Returns\n -------\n display : :class:`~sklearn.calibration.CalibrationDisplay`.\n Object that stores computed values.\n\n See Also\n --------\n CalibrationDisplay.from_predictions : Plot calibration curve using true\n and predicted labels.\n\n Examples\n --------\n >>> import matplotlib.pyplot as plt\n >>> from sklearn.datasets import make_classification\n >>> from sklearn.model_selection import train_test_split\n >>> from sklearn.linear_model import LogisticRegression\n >>> from sklearn.calibration import CalibrationDisplay\n >>> X, y = make_classification(random_state=0)\n >>> X_train, X_test, y_train, y_test = train_test_split(\n ... X, y, random_state=0)\n >>> clf = LogisticRegression(random_state=0)\n >>> clf.fit(X_train, y_train)\n LogisticRegression(random_state=0)\n >>> disp = CalibrationDisplay.from_estimator(clf, X_test, y_test)\n >>> plt.show()\n \"\"\"\n method_name = f'{cls.__name__}.from_estimator'\n check_matplotlib_support(method_name)\n if not is_classifier(estimator):\n raise ValueError(\"'estimator' should be a fitted classifier.\")\n (y_prob, _) = _get_response(X, estimator, response_method='predict_proba', pos_label=None)\n name = name if name is not None else estimator.__class__.__name__\n return cls.from_predictions(y, y_prob, n_bins=n_bins, strategy=strategy, name=name, ref_line=ref_line, ax=ax, **kwargs)" }, { @@ -29484,7 +29810,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y_true", @@ -29494,7 +29821,8 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "True labels." - } + }, + "refined_type": {} }, { "name": "y_prob", @@ -29504,7 +29832,8 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "The predicted probabilities of the positive class." - } + }, + "refined_type": {} }, { "name": "n_bins", @@ -29514,7 +29843,8 @@ "docstring": { "type": "int, default=5", "description": "Number of bins to discretize the [0, 1] interval into when\ncalculating the calibration curve. A bigger number requires more\ndata." - } + }, + "refined_type": {} }, { "name": "strategy", @@ -29524,6 +29854,10 @@ "docstring": { "type": "{'uniform', 'quantile'}, default='uniform'", "description": "Strategy used to define the widths of the bins.\n\n- `'uniform'`: The bins have identical widths.\n- `'quantile'`: The bins have the same number of samples and depend\n on predicted probabilities." + }, + "refined_type": { + "kind": "EnumType", + "values": ["quantile", "uniform"] } }, { @@ -29534,7 +29868,8 @@ "docstring": { "type": "str, default=None", "description": "Name for labeling curve." - } + }, + "refined_type": {} }, { "name": "ref_line", @@ -29544,7 +29879,8 @@ "docstring": { "type": "bool, default=True", "description": "If `True`, plots a reference line representing a perfectly\ncalibrated classifier." - } + }, + "refined_type": {} }, { "name": "ax", @@ -29554,13 +29890,14 @@ "docstring": { "type": "matplotlib axes, default=None", "description": "Axes object to plot on. If `None`, a new figure and axes is\ncreated." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Plot calibration curve using true labels and predicted probabilities.\n\nCalibration curve, also known as reliability diagram, uses inputs from a binary classifier and plots the average predicted probability for each bin against the fraction of positive classes, on the y-axis. Extra keyword arguments will be passed to :func:`matplotlib.pyplot.plot`. Read more about calibration in the :ref:`User Guide ` and more about the scikit-learn visualization API in :ref:`visualizations`. .. versionadded:: 1.0", - "docstring": "Plot calibration curve using true labels and predicted probabilities.\n\nCalibration curve, also known as reliability diagram, uses inputs\nfrom a binary classifier and plots the average predicted probability\nfor each bin against the fraction of positive classes, on the\ny-axis.\n\nExtra keyword arguments will be passed to\n:func:`matplotlib.pyplot.plot`.\n\nRead more about calibration in the :ref:`User Guide ` and\nmore about the scikit-learn visualization API in :ref:`visualizations`.\n\n.. versionadded:: 1.0\n\nParameters\n----------\ny_true : array-like of shape (n_samples,)\n True labels.\n\ny_prob : array-like of shape (n_samples,)\n The predicted probabilities of the positive class.\n\nn_bins : int, default=5\n Number of bins to discretize the [0, 1] interval into when\n calculating the calibration curve. A bigger number requires more\n data.\n\nstrategy : {'uniform', 'quantile'}, default='uniform'\n Strategy used to define the widths of the bins.\n\n - `'uniform'`: The bins have identical widths.\n - `'quantile'`: The bins have the same number of samples and depend\n on predicted probabilities.\n\nname : str, default=None\n Name for labeling curve.\n\nref_line : bool, default=True\n If `True`, plots a reference line representing a perfectly\n calibrated classifier.\n\nax : matplotlib axes, default=None\n Axes object to plot on. If `None`, a new figure and axes is\n created.\n\n**kwargs : dict\n Keyword arguments to be passed to :func:`matplotlib.pyplot.plot`.\n\nReturns\n-------\ndisplay : :class:`~sklearn.calibration.CalibrationDisplay`.\n Object that stores computed values.\n\nSee Also\n--------\nCalibrationDisplay.from_estimator : Plot calibration curve using an\n estimator and data.\n\nExamples\n--------\n>>> import matplotlib.pyplot as plt\n>>> from sklearn.datasets import make_classification\n>>> from sklearn.model_selection import train_test_split\n>>> from sklearn.linear_model import LogisticRegression\n>>> from sklearn.calibration import CalibrationDisplay\n>>> X, y = make_classification(random_state=0)\n>>> X_train, X_test, y_train, y_test = train_test_split(\n... X, y, random_state=0)\n>>> clf = LogisticRegression(random_state=0)\n>>> clf.fit(X_train, y_train)\nLogisticRegression(random_state=0)\n>>> y_prob = clf.predict_proba(X_test)[:, 1]\n>>> disp = CalibrationDisplay.from_predictions(y_test, y_prob)\n>>> plt.show()", + "description": "Plot calibration curve using true labels and predicted probabilities.\n\nCalibration curve, also known as reliability diagram, uses inputs\nfrom a binary classifier and plots the average predicted probability\nfor each bin against the fraction of positive classes, on the\ny-axis.\n\nExtra keyword arguments will be passed to\n:func:`matplotlib.pyplot.plot`.\n\nRead more about calibration in the :ref:`User Guide ` and\nmore about the scikit-learn visualization API in :ref:`visualizations`.\n\n.. 
versionadded:: 1.0", + "docstring": "Plot calibration curve using true labels and predicted probabilities.\n\n Calibration curve, also known as reliability diagram, uses inputs\n from a binary classifier and plots the average predicted probability\n for each bin against the fraction of positive classes, on the\n y-axis.\n\n Extra keyword arguments will be passed to\n :func:`matplotlib.pyplot.plot`.\n\n Read more about calibration in the :ref:`User Guide ` and\n more about the scikit-learn visualization API in :ref:`visualizations`.\n\n .. versionadded:: 1.0\n\n Parameters\n ----------\n y_true : array-like of shape (n_samples,)\n True labels.\n\n y_prob : array-like of shape (n_samples,)\n The predicted probabilities of the positive class.\n\n n_bins : int, default=5\n Number of bins to discretize the [0, 1] interval into when\n calculating the calibration curve. A bigger number requires more\n data.\n\n strategy : {'uniform', 'quantile'}, default='uniform'\n Strategy used to define the widths of the bins.\n\n - `'uniform'`: The bins have identical widths.\n - `'quantile'`: The bins have the same number of samples and depend\n on predicted probabilities.\n\n name : str, default=None\n Name for labeling curve.\n\n ref_line : bool, default=True\n If `True`, plots a reference line representing a perfectly\n calibrated classifier.\n\n ax : matplotlib axes, default=None\n Axes object to plot on. If `None`, a new figure and axes is\n created.\n\n **kwargs : dict\n Keyword arguments to be passed to :func:`matplotlib.pyplot.plot`.\n\n Returns\n -------\n display : :class:`~sklearn.calibration.CalibrationDisplay`.\n Object that stores computed values.\n\n See Also\n --------\n CalibrationDisplay.from_estimator : Plot calibration curve using an\n estimator and data.\n\n Examples\n --------\n >>> import matplotlib.pyplot as plt\n >>> from sklearn.datasets import make_classification\n >>> from sklearn.model_selection import train_test_split\n >>> from sklearn.linear_model import LogisticRegression\n >>> from sklearn.calibration import CalibrationDisplay\n >>> X, y = make_classification(random_state=0)\n >>> X_train, X_test, y_train, y_test = train_test_split(\n ... X, y, random_state=0)\n >>> clf = LogisticRegression(random_state=0)\n >>> clf.fit(X_train, y_train)\n LogisticRegression(random_state=0)\n >>> y_prob = clf.predict_proba(X_test)[:, 1]\n >>> disp = CalibrationDisplay.from_predictions(y_test, y_prob)\n >>> plt.show()\n ", "source_code": "\n@classmethod\ndef from_predictions(cls, y_true, y_prob, *, n_bins=5, strategy='uniform', name=None, ref_line=True, ax=None, **kwargs):\n \"\"\"Plot calibration curve using true labels and predicted probabilities.\n\n Calibration curve, also known as reliability diagram, uses inputs\n from a binary classifier and plots the average predicted probability\n for each bin against the fraction of positive classes, on the\n y-axis.\n\n Extra keyword arguments will be passed to\n :func:`matplotlib.pyplot.plot`.\n\n Read more about calibration in the :ref:`User Guide ` and\n more about the scikit-learn visualization API in :ref:`visualizations`.\n\n .. versionadded:: 1.0\n\n Parameters\n ----------\n y_true : array-like of shape (n_samples,)\n True labels.\n\n y_prob : array-like of shape (n_samples,)\n The predicted probabilities of the positive class.\n\n n_bins : int, default=5\n Number of bins to discretize the [0, 1] interval into when\n calculating the calibration curve. 
A bigger number requires more\n data.\n\n strategy : {'uniform', 'quantile'}, default='uniform'\n Strategy used to define the widths of the bins.\n\n - `'uniform'`: The bins have identical widths.\n - `'quantile'`: The bins have the same number of samples and depend\n on predicted probabilities.\n\n name : str, default=None\n Name for labeling curve.\n\n ref_line : bool, default=True\n If `True`, plots a reference line representing a perfectly\n calibrated classifier.\n\n ax : matplotlib axes, default=None\n Axes object to plot on. If `None`, a new figure and axes is\n created.\n\n **kwargs : dict\n Keyword arguments to be passed to :func:`matplotlib.pyplot.plot`.\n\n Returns\n -------\n display : :class:`~sklearn.calibration.CalibrationDisplay`.\n Object that stores computed values.\n\n See Also\n --------\n CalibrationDisplay.from_estimator : Plot calibration curve using an\n estimator and data.\n\n Examples\n --------\n >>> import matplotlib.pyplot as plt\n >>> from sklearn.datasets import make_classification\n >>> from sklearn.model_selection import train_test_split\n >>> from sklearn.linear_model import LogisticRegression\n >>> from sklearn.calibration import CalibrationDisplay\n >>> X, y = make_classification(random_state=0)\n >>> X_train, X_test, y_train, y_test = train_test_split(\n ... X, y, random_state=0)\n >>> clf = LogisticRegression(random_state=0)\n >>> clf.fit(X_train, y_train)\n LogisticRegression(random_state=0)\n >>> y_prob = clf.predict_proba(X_test)[:, 1]\n >>> disp = CalibrationDisplay.from_predictions(y_test, y_prob)\n >>> plt.show()\n \"\"\"\n method_name = f'{cls.__name__}.from_estimator'\n check_matplotlib_support(method_name)\n (prob_true, prob_pred) = calibration_curve(y_true, y_prob, n_bins=n_bins, strategy=strategy)\n name = name if name is not None else 'Classifier'\n disp = cls(prob_true=prob_true, prob_pred=prob_pred, y_prob=y_prob, estimator_name=name)\n return disp.plot(ax=ax, ref_line=ref_line, **kwargs)" }, { @@ -29578,7 +29915,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "ax", @@ -29588,7 +29926,8 @@ "docstring": { "type": "Matplotlib Axes, default=None", "description": "Axes object to plot on. If `None`, a new figure and axes is\ncreated." - } + }, + "refined_type": {} }, { "name": "name", @@ -29598,7 +29937,8 @@ "docstring": { "type": "str, default=None", "description": "Name for labeling curve. If `None`, use `estimator_name` if\nnot `None`, otherwise no labeling is shown." - } + }, + "refined_type": {} }, { "name": "ref_line", @@ -29608,14 +29948,15 @@ "docstring": { "type": "bool, default=True", "description": "If `True`, plots a reference line representing a perfectly\ncalibrated classifier." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Plot visualization.\n\nExtra keyword arguments will be passed to :func:`matplotlib.pyplot.plot`.", - "docstring": "Plot visualization.\n\nExtra keyword arguments will be passed to\n:func:`matplotlib.pyplot.plot`.\n\nParameters\n----------\nax : Matplotlib Axes, default=None\n Axes object to plot on. If `None`, a new figure and axes is\n created.\n\nname : str, default=None\n Name for labeling curve. 
If `None`, use `estimator_name` if\n not `None`, otherwise no labeling is shown.\n\nref_line : bool, default=True\n If `True`, plots a reference line representing a perfectly\n calibrated classifier.\n\n**kwargs : dict\n Keyword arguments to be passed to :func:`matplotlib.pyplot.plot`.\n\nReturns\n-------\ndisplay : :class:`~sklearn.calibration.CalibrationDisplay`\n Object that stores computed values.", - "source_code": "\ndef plot(self, *, ax=None, name=None, ref_line=True, **kwargs):\n \"\"\"Plot visualization.\n\n Extra keyword arguments will be passed to\n :func:`matplotlib.pyplot.plot`.\n\n Parameters\n ----------\n ax : Matplotlib Axes, default=None\n Axes object to plot on. If `None`, a new figure and axes is\n created.\n\n name : str, default=None\n Name for labeling curve. If `None`, use `estimator_name` if\n not `None`, otherwise no labeling is shown.\n\n ref_line : bool, default=True\n If `True`, plots a reference line representing a perfectly\n calibrated classifier.\n\n **kwargs : dict\n Keyword arguments to be passed to :func:`matplotlib.pyplot.plot`.\n\n Returns\n -------\n display : :class:`~sklearn.calibration.CalibrationDisplay`\n Object that stores computed values.\n \"\"\"\n check_matplotlib_support('CalibrationDisplay.plot')\n import matplotlib.pyplot as plt\n if ax is None:\n (fig, ax) = plt.subplots()\n name = self.estimator_name if name is None else name\n line_kwargs = {}\n if name is not None:\n line_kwargs['label'] = name\n line_kwargs.update(**kwargs)\n ref_line_label = 'Perfectly calibrated'\n existing_ref_line = ref_line_label in ax.get_legend_handles_labels()[1]\n if ref_line and not existing_ref_line:\n ax.plot([0, 1], [0, 1], 'k:', label=ref_line_label)\n self.line_ = ax.plot(self.prob_pred, self.prob_true, 's-', **line_kwargs)[0]\n if 'label' in line_kwargs:\n ax.legend(loc='lower right')\n ax.set(xlabel='Mean predicted probability', ylabel='Fraction of positives')\n self.ax_ = ax\n self.figure_ = ax.figure\n return self" + "description": "Plot visualization.\n\nExtra keyword arguments will be passed to\n:func:`matplotlib.pyplot.plot`.", + "docstring": "Plot visualization.\n\n Extra keyword arguments will be passed to\n :func:`matplotlib.pyplot.plot`.\n\n Parameters\n ----------\n ax : Matplotlib Axes, default=None\n Axes object to plot on. If `None`, a new figure and axes is\n created.\n\n name : str, default=None\n Name for labeling curve. If `None`, use `estimator_name` if\n not `None`, otherwise no labeling is shown.\n\n ref_line : bool, default=True\n If `True`, plots a reference line representing a perfectly\n calibrated classifier.\n\n **kwargs : dict\n Keyword arguments to be passed to :func:`matplotlib.pyplot.plot`.\n\n Returns\n -------\n display : :class:`~sklearn.calibration.CalibrationDisplay`\n Object that stores computed values.\n ", + "source_code": "\ndef plot(self, *, ax=None, name=None, ref_line=True, **kwargs):\n \"\"\"Plot visualization.\n\n Extra keyword arguments will be passed to\n :func:`matplotlib.pyplot.plot`.\n\n Parameters\n ----------\n ax : Matplotlib Axes, default=None\n Axes object to plot on. If `None`, a new figure and axes is\n created.\n\n name : str, default=None\n Name for labeling curve. 
If `None`, use `estimator_name` if\n not `None`, otherwise no labeling is shown.\n\n ref_line : bool, default=True\n If `True`, plots a reference line representing a perfectly\n calibrated classifier.\n\n **kwargs : dict\n Keyword arguments to be passed to :func:`matplotlib.pyplot.plot`.\n\n Returns\n -------\n display : :class:`~sklearn.calibration.CalibrationDisplay`\n Object that stores computed values.\n \"\"\"\n check_matplotlib_support('CalibrationDisplay.plot')\n import matplotlib.pyplot as plt\n if ax is None:\n (fig, ax) = plt.subplots()\n name = self.estimator_name if name is None else name\n line_kwargs = {}\n if name is not None:\n line_kwargs['label'] = name\n line_kwargs.update(**kwargs)\n ref_line_label = 'Perfectly calibrated'\n existing_ref_line = ref_line_label in ax.get_legend_handles_labels()[1]\n if ref_line and not existing_ref_line:\n ax.plot([0, 1], [0, 1], 'k:', label=ref_line_label)\n self.line_ = ax.plot(self.prob_pred, self.prob_true, 's-', **line_kwargs)[0]\n ax.legend(loc='lower right')\n ax.set(xlabel='Mean predicted probability', ylabel='Fraction of positives')\n self.ax_ = ax\n self.figure_ = ax.figure\n return self" }, { "name": "__init__", @@ -29632,7 +29973,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "base_estimator", @@ -29642,7 +29984,8 @@ "docstring": { "type": "estimator instance", "description": "Fitted classifier." - } + }, + "refined_type": {} }, { "name": "calibrators", @@ -29652,7 +29995,8 @@ "docstring": { "type": "list of fitted estimator instances", "description": "List of fitted calibrators (either 'IsotonicRegression' or\n'_SigmoidCalibration'). The number of calibrators equals the number of\nclasses. However, if there are 2 classes, the list contains only one\nfitted calibrator." - } + }, + "refined_type": {} }, { "name": "classes", @@ -29662,7 +30006,8 @@ "docstring": { "type": "array-like of shape (n_classes,)", "description": "All the prediction classes." - } + }, + "refined_type": {} }, { "name": "method", @@ -29672,13 +30017,17 @@ "docstring": { "type": "{'sigmoid', 'isotonic'}, default='sigmoid'", "description": "The method to use for calibration. Can be 'sigmoid' which\ncorresponds to Platt's method or 'isotonic' which is a\nnon-parametric approach based on isotonic regression." + }, + "refined_type": { + "kind": "EnumType", + "values": ["sigmoid", "isotonic"] } } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, base_estimator, calibrators, *, classes, method='sigmoid'):\n self.base_estimator = base_estimator\n self.calibrators = calibrators\n self.classes = classes\n self.method = method" }, { @@ -29699,13 +30048,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@deprecated('`calibrators_` is deprecated in 0.24 and will be removed in 1.1(renaming of 0.26). Use `calibrators` instead.')\n@property\ndef calibrators_(self):\n return self.calibrators" }, { @@ -29723,7 +30073,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -29733,13 +30084,14 @@ "docstring": { "type": "ndarray of shape (n_samples, n_features)", "description": "The sample data." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Calculate calibrated probabilities.\n\nCalculates classification calibrated probabilities for each class, in a one-vs-all manner, for `X`.", - "docstring": "Calculate calibrated probabilities.\n\nCalculates classification calibrated probabilities\nfor each class, in a one-vs-all manner, for `X`.\n\nParameters\n----------\nX : ndarray of shape (n_samples, n_features)\n The sample data.\n\nReturns\n-------\nproba : array, shape (n_samples, n_classes)\n The predicted probabilities. Can be exact zeros.", + "description": "Calculate calibrated probabilities.\n\nCalculates classification calibrated probabilities\nfor each class, in a one-vs-all manner, for `X`.", + "docstring": "Calculate calibrated probabilities.\n\n Calculates classification calibrated probabilities\n for each class, in a one-vs-all manner, for `X`.\n\n Parameters\n ----------\n X : ndarray of shape (n_samples, n_features)\n The sample data.\n\n Returns\n -------\n proba : array, shape (n_samples, n_classes)\n The predicted probabilities. Can be exact zeros.\n ", "source_code": "\ndef predict_proba(self, X):\n \"\"\"Calculate calibrated probabilities.\n\n Calculates classification calibrated probabilities\n for each class, in a one-vs-all manner, for `X`.\n\n Parameters\n ----------\n X : ndarray of shape (n_samples, n_features)\n The sample data.\n\n Returns\n -------\n proba : array, shape (n_samples, n_classes)\n The predicted probabilities. Can be exact zeros.\n \"\"\"\n n_classes = len(self.classes)\n (pred_method, method_name) = _get_prediction_method(self.base_estimator)\n predictions = _compute_predictions(pred_method, method_name, X, n_classes)\n label_encoder = LabelEncoder().fit(self.classes)\n pos_class_indices = label_encoder.transform(self.base_estimator.classes_)\n proba = np.zeros((_num_samples(X), n_classes))\n for (class_idx, this_pred, calibrator) in zip(pos_class_indices, predictions.T, self.calibrators):\n if n_classes == 2:\n class_idx += 1\n proba[:, class_idx] = calibrator.predict(this_pred)\n if n_classes == 2:\n proba[:, 0] = 1.0 - proba[:, 1]\n else:\n denominator = np.sum(proba, axis=1)[:, np.newaxis]\n uniform_proba = np.full_like(proba, 1 / n_classes)\n proba = np.divide(proba, denominator, out=uniform_proba, where=denominator != 0)\n proba[(1.0 < proba) & (proba <= 1.0 + 1e-05)] = 1.0\n return proba" }, { @@ -29757,7 +30109,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -29767,7 +30120,8 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "Training data." - } + }, + "refined_type": {} }, { "name": "y", @@ -29777,7 +30131,8 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "Training target." - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -29787,13 +30142,14 @@ "docstring": { "type": "array-like of shape (n_samples,), default=None", "description": "Sample weights. If None, then samples are equally weighted." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Fit the model using X, y as training data.", - "docstring": "Fit the model using X, y as training data.\n\nParameters\n----------\nX : array-like of shape (n_samples,)\n Training data.\n\ny : array-like of shape (n_samples,)\n Training target.\n\nsample_weight : array-like of shape (n_samples,), default=None\n Sample weights. 
If None, then samples are equally weighted.\n\nReturns\n-------\nself : object\n Returns an instance of self.", + "docstring": "Fit the model using X, y as training data.\n\n Parameters\n ----------\n X : array-like of shape (n_samples,)\n Training data.\n\n y : array-like of shape (n_samples,)\n Training target.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights. If None, then samples are equally weighted.\n\n Returns\n -------\n self : object\n Returns an instance of self.\n ", "source_code": "\ndef fit(self, X, y, sample_weight=None):\n \"\"\"Fit the model using X, y as training data.\n\n Parameters\n ----------\n X : array-like of shape (n_samples,)\n Training data.\n\n y : array-like of shape (n_samples,)\n Training target.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights. If None, then samples are equally weighted.\n\n Returns\n -------\n self : object\n Returns an instance of self.\n \"\"\"\n X = column_or_1d(X)\n y = column_or_1d(y)\n (X, y) = indexable(X, y)\n (self.a_, self.b_) = _sigmoid_calibration(X, y, sample_weight)\n return self" }, { @@ -29811,7 +30167,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "T", @@ -29821,13 +30178,14 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "Data to predict from." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Predict new data by linear interpolation.", - "docstring": "Predict new data by linear interpolation.\n\nParameters\n----------\nT : array-like of shape (n_samples,)\n Data to predict from.\n\nReturns\n-------\nT_ : ndarray of shape (n_samples,)\n The predicted data.", + "docstring": "Predict new data by linear interpolation.\n\n Parameters\n ----------\n T : array-like of shape (n_samples,)\n Data to predict from.\n\n Returns\n -------\n T_ : ndarray of shape (n_samples,)\n The predicted data.\n ", "source_code": "\ndef predict(self, T):\n \"\"\"Predict new data by linear interpolation.\n\n Parameters\n ----------\n T : array-like of shape (n_samples,)\n Data to predict from.\n\n Returns\n -------\n T_ : ndarray of shape (n_samples,)\n The predicted data.\n \"\"\"\n T = column_or_1d(T)\n return expit(-(self.a_ * T + self.b_))" }, { @@ -29845,7 +30203,8 @@ "docstring": { "type": "callable", "description": "Prediction method." - } + }, + "refined_type": {} }, { "name": "method_name", @@ -29855,7 +30214,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -29865,7 +30225,8 @@ "docstring": { "type": "array-like or None", "description": "Data used to obtain predictions." - } + }, + "refined_type": {} }, { "name": "n_classes", @@ -29875,13 +30236,14 @@ "docstring": { "type": "int", "description": "Number of classes present." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Return predictions for `X` and reshape binary outputs to shape (n_samples, 1).", - "docstring": "Return predictions for `X` and reshape binary outputs to shape\n(n_samples, 1).\n\nParameters\n----------\npred_method : callable\n Prediction method.\n\nmethod_name: str\n Name of the prediction method\n\nX : array-like or None\n Data used to obtain predictions.\n\nn_classes : int\n Number of classes present.\n\nReturns\n-------\npredictions : array-like, shape (X.shape[0], len(clf.classes_))\n The predictions. 
Note if there are 2 classes, array is of shape\n (X.shape[0], 1).", + "description": "Return predictions for `X` and reshape binary outputs to shape\n(n_samples, 1).", + "docstring": "Return predictions for `X` and reshape binary outputs to shape\n (n_samples, 1).\n\n Parameters\n ----------\n pred_method : callable\n Prediction method.\n\n method_name: str\n Name of the prediction method\n\n X : array-like or None\n Data used to obtain predictions.\n\n n_classes : int\n Number of classes present.\n\n Returns\n -------\n predictions : array-like, shape (X.shape[0], len(clf.classes_))\n The predictions. Note if there are 2 classes, array is of shape\n (X.shape[0], 1).\n ", "source_code": "\ndef _compute_predictions(pred_method, method_name, X, n_classes):\n \"\"\"Return predictions for `X` and reshape binary outputs to shape\n (n_samples, 1).\n\n Parameters\n ----------\n pred_method : callable\n Prediction method.\n\n method_name: str\n Name of the prediction method\n\n X : array-like or None\n Data used to obtain predictions.\n\n n_classes : int\n Number of classes present.\n\n Returns\n -------\n predictions : array-like, shape (X.shape[0], len(clf.classes_))\n The predictions. Note if there are 2 classes, array is of shape\n (X.shape[0], 1).\n \"\"\"\n predictions = pred_method(X=X)\n if method_name == 'decision_function':\n if predictions.ndim == 1:\n predictions = predictions[:, np.newaxis]\n elif method_name == 'predict_proba':\n if n_classes == 2:\n predictions = predictions[:, 1:]\n else:\n raise ValueError(f'Invalid prediction method: {method_name}')\n return predictions" }, { @@ -29899,7 +30261,8 @@ "docstring": { "type": "estimator instance", "description": "Fitted classifier." - } + }, + "refined_type": {} }, { "name": "predictions", @@ -29909,7 +30272,8 @@ "docstring": { "type": "array-like, shape (n_samples, n_classes) or (n_samples, 1) when binary.", "description": "Raw predictions returned by the un-calibrated base classifier." - } + }, + "refined_type": {} }, { "name": "y", @@ -29919,7 +30283,8 @@ "docstring": { "type": "array-like, shape (n_samples,)", "description": "The targets." - } + }, + "refined_type": {} }, { "name": "classes", @@ -29929,7 +30294,8 @@ "docstring": { "type": "ndarray, shape (n_classes,)", "description": "All the prediction classes." - } + }, + "refined_type": {} }, { "name": "method", @@ -29939,6 +30305,10 @@ "docstring": { "type": "{'sigmoid', 'isotonic'}", "description": "The method to use for calibration." + }, + "refined_type": { + "kind": "EnumType", + "values": ["sigmoid", "isotonic"] } }, { @@ -29949,13 +30319,14 @@ "docstring": { "type": "ndarray, shape (n_samples,), default=None", "description": "Sample weights. If None, then samples are equally weighted." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Fit calibrator(s) and return a `_CalibratedClassifier` instance.\n\n`n_classes` (i.e. `len(clf.classes_)`) calibrators are fitted. However, if `n_classes` equals 2, one calibrator is fitted.", - "docstring": "Fit calibrator(s) and return a `_CalibratedClassifier`\ninstance.\n\n`n_classes` (i.e. 
`len(clf.classes_)`) calibrators are fitted.\nHowever, if `n_classes` equals 2, one calibrator is fitted.\n\nParameters\n----------\nclf : estimator instance\n Fitted classifier.\n\npredictions : array-like, shape (n_samples, n_classes) or (n_samples, 1) when binary.\n Raw predictions returned by the un-calibrated base classifier.\n\ny : array-like, shape (n_samples,)\n The targets.\n\nclasses : ndarray, shape (n_classes,)\n All the prediction classes.\n\nmethod : {'sigmoid', 'isotonic'}\n The method to use for calibration.\n\nsample_weight : ndarray, shape (n_samples,), default=None\n Sample weights. If None, then samples are equally weighted.\n\nReturns\n-------\npipeline : _CalibratedClassifier instance", + "description": "Fit calibrator(s) and return a `_CalibratedClassifier`\ninstance.\n\n`n_classes` (i.e. `len(clf.classes_)`) calibrators are fitted.\nHowever, if `n_classes` equals 2, one calibrator is fitted.", + "docstring": "Fit calibrator(s) and return a `_CalibratedClassifier`\n instance.\n\n `n_classes` (i.e. `len(clf.classes_)`) calibrators are fitted.\n However, if `n_classes` equals 2, one calibrator is fitted.\n\n Parameters\n ----------\n clf : estimator instance\n Fitted classifier.\n\n predictions : array-like, shape (n_samples, n_classes) or (n_samples, 1) when binary.\n Raw predictions returned by the un-calibrated base classifier.\n\n y : array-like, shape (n_samples,)\n The targets.\n\n classes : ndarray, shape (n_classes,)\n All the prediction classes.\n\n method : {'sigmoid', 'isotonic'}\n The method to use for calibration.\n\n sample_weight : ndarray, shape (n_samples,), default=None\n Sample weights. If None, then samples are equally weighted.\n\n Returns\n -------\n pipeline : _CalibratedClassifier instance\n ", "source_code": "\ndef _fit_calibrator(clf, predictions, y, classes, method, sample_weight=None):\n \"\"\"Fit calibrator(s) and return a `_CalibratedClassifier`\n instance.\n\n `n_classes` (i.e. `len(clf.classes_)`) calibrators are fitted.\n However, if `n_classes` equals 2, one calibrator is fitted.\n\n Parameters\n ----------\n clf : estimator instance\n Fitted classifier.\n\n predictions : array-like, shape (n_samples, n_classes) or (n_samples, 1) when binary.\n Raw predictions returned by the un-calibrated base classifier.\n\n y : array-like, shape (n_samples,)\n The targets.\n\n classes : ndarray, shape (n_classes,)\n All the prediction classes.\n\n method : {'sigmoid', 'isotonic'}\n The method to use for calibration.\n\n sample_weight : ndarray, shape (n_samples,), default=None\n Sample weights. If None, then samples are equally weighted.\n\n Returns\n -------\n pipeline : _CalibratedClassifier instance\n \"\"\"\n Y = label_binarize(y, classes=classes)\n label_encoder = LabelEncoder().fit(classes)\n pos_class_indices = label_encoder.transform(clf.classes_)\n calibrators = []\n for (class_idx, this_pred) in zip(pos_class_indices, predictions.T):\n if method == 'isotonic':\n calibrator = IsotonicRegression(out_of_bounds='clip')\n elif method == 'sigmoid':\n calibrator = _SigmoidCalibration()\n else:\n raise ValueError(f\"'method' should be one of: 'sigmoid' or 'isotonic'. Got {method}.\")\n calibrator.fit(this_pred, Y[:, class_idx], sample_weight)\n calibrators.append(calibrator)\n pipeline = _CalibratedClassifier(clf, calibrators, method=method, classes=classes)\n return pipeline" }, { @@ -29973,7 +30344,8 @@ "docstring": { "type": "estimator instance", "description": "Cloned base estimator." 
- } + }, + "refined_type": {} }, { "name": "X", @@ -29983,7 +30355,8 @@ "docstring": { "type": "array-like, shape (n_samples, n_features)", "description": "Sample data." - } + }, + "refined_type": {} }, { "name": "y", @@ -29993,7 +30366,8 @@ "docstring": { "type": "array-like, shape (n_samples,)", "description": "Targets." - } + }, + "refined_type": {} }, { "name": "train", @@ -30003,7 +30377,8 @@ "docstring": { "type": "ndarray, shape (n_train_indicies,)", "description": "Indices of the training subset." - } + }, + "refined_type": {} }, { "name": "test", @@ -30013,7 +30388,8 @@ "docstring": { "type": "ndarray, shape (n_test_indicies,)", "description": "Indices of the testing subset." - } + }, + "refined_type": {} }, { "name": "supports_sw", @@ -30023,7 +30399,8 @@ "docstring": { "type": "bool", "description": "Whether or not the `estimator` supports sample weights." - } + }, + "refined_type": {} }, { "name": "method", @@ -30033,6 +30410,10 @@ "docstring": { "type": "{'sigmoid', 'isotonic'}", "description": "Method to use for calibration." + }, + "refined_type": { + "kind": "EnumType", + "values": ["sigmoid", "isotonic"] } }, { @@ -30043,7 +30424,8 @@ "docstring": { "type": "ndarray, shape (n_classes,)", "description": "The target classes." - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -30053,13 +30435,14 @@ "docstring": { "type": "array-like, default=None", "description": "Sample weights for `X`." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Fit a classifier/calibration pair on a given train/test split.\n\nFit the classifier on the train set, compute its predictions on the test set and use the predictions as input to fit the calibrator along with the test labels.", - "docstring": "Fit a classifier/calibration pair on a given train/test split.\n\nFit the classifier on the train set, compute its predictions on the test\nset and use the predictions as input to fit the calibrator along with the\ntest labels.\n\nParameters\n----------\nestimator : estimator instance\n Cloned base estimator.\n\nX : array-like, shape (n_samples, n_features)\n Sample data.\n\ny : array-like, shape (n_samples,)\n Targets.\n\ntrain : ndarray, shape (n_train_indicies,)\n Indices of the training subset.\n\ntest : ndarray, shape (n_test_indicies,)\n Indices of the testing subset.\n\nsupports_sw : bool\n Whether or not the `estimator` supports sample weights.\n\nmethod : {'sigmoid', 'isotonic'}\n Method to use for calibration.\n\nclasses : ndarray, shape (n_classes,)\n The target classes.\n\nsample_weight : array-like, default=None\n Sample weights for `X`.\n\nReturns\n-------\ncalibrated_classifier : _CalibratedClassifier instance", + "description": "Fit a classifier/calibration pair on a given train/test split.\n\nFit the classifier on the train set, compute its predictions on the test\nset and use the predictions as input to fit the calibrator along with the\ntest labels.", + "docstring": "Fit a classifier/calibration pair on a given train/test split.\n\n Fit the classifier on the train set, compute its predictions on the test\n set and use the predictions as input to fit the calibrator along with the\n test labels.\n\n Parameters\n ----------\n estimator : estimator instance\n Cloned base estimator.\n\n X : array-like, shape (n_samples, n_features)\n Sample data.\n\n y : array-like, shape (n_samples,)\n Targets.\n\n train : ndarray, shape (n_train_indicies,)\n Indices of the training subset.\n\n test : ndarray, shape (n_test_indicies,)\n Indices of the 
testing subset.\n\n supports_sw : bool\n Whether or not the `estimator` supports sample weights.\n\n method : {'sigmoid', 'isotonic'}\n Method to use for calibration.\n\n classes : ndarray, shape (n_classes,)\n The target classes.\n\n sample_weight : array-like, default=None\n Sample weights for `X`.\n\n Returns\n -------\n calibrated_classifier : _CalibratedClassifier instance\n ", "source_code": "\ndef _fit_classifier_calibrator_pair(estimator, X, y, train, test, supports_sw, method, classes, sample_weight=None):\n \"\"\"Fit a classifier/calibration pair on a given train/test split.\n\n Fit the classifier on the train set, compute its predictions on the test\n set and use the predictions as input to fit the calibrator along with the\n test labels.\n\n Parameters\n ----------\n estimator : estimator instance\n Cloned base estimator.\n\n X : array-like, shape (n_samples, n_features)\n Sample data.\n\n y : array-like, shape (n_samples,)\n Targets.\n\n train : ndarray, shape (n_train_indicies,)\n Indices of the training subset.\n\n test : ndarray, shape (n_test_indicies,)\n Indices of the testing subset.\n\n supports_sw : bool\n Whether or not the `estimator` supports sample weights.\n\n method : {'sigmoid', 'isotonic'}\n Method to use for calibration.\n\n classes : ndarray, shape (n_classes,)\n The target classes.\n\n sample_weight : array-like, default=None\n Sample weights for `X`.\n\n Returns\n -------\n calibrated_classifier : _CalibratedClassifier instance\n \"\"\"\n (X_train, y_train) = (_safe_indexing(X, train), _safe_indexing(y, train))\n (X_test, y_test) = (_safe_indexing(X, test), _safe_indexing(y, test))\n if supports_sw and sample_weight is not None:\n sw_train = _safe_indexing(sample_weight, train)\n sw_test = _safe_indexing(sample_weight, test)\n else:\n sw_train = None\n sw_test = None\n if supports_sw:\n estimator.fit(X_train, y_train, sample_weight=sw_train)\n else:\n estimator.fit(X_train, y_train)\n n_classes = len(classes)\n (pred_method, method_name) = _get_prediction_method(estimator)\n predictions = _compute_predictions(pred_method, method_name, X_test, n_classes)\n calibrated_classifier = _fit_calibrator(estimator, predictions, y_test, classes, method, sample_weight=sw_test)\n return calibrated_classifier" }, { @@ -30077,13 +30460,14 @@ "docstring": { "type": "Estimator instance", "description": "Fitted classifier to obtain the prediction method from." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Return prediction method.\n\n`decision_function` method of `clf` returned, if it exists, otherwise `predict_proba` method returned.", - "docstring": "Return prediction method.\n\n`decision_function` method of `clf` returned, if it\nexists, otherwise `predict_proba` method returned.\n\nParameters\n----------\nclf : Estimator instance\n Fitted classifier to obtain the prediction method from.\n\nReturns\n-------\nprediction_method : callable\n The prediction method.\nmethod_name : str\n The name of the prediction method.", + "description": "Return prediction method.\n\n`decision_function` method of `clf` returned, if it\nexists, otherwise `predict_proba` method returned.", + "docstring": "Return prediction method.\n\n `decision_function` method of `clf` returned, if it\n exists, otherwise `predict_proba` method returned.\n\n Parameters\n ----------\n clf : Estimator instance\n Fitted classifier to obtain the prediction method from.\n\n Returns\n -------\n prediction_method : callable\n The prediction method.\n method_name : str\n The name of the prediction method.\n ", "source_code": "\ndef _get_prediction_method(clf):\n \"\"\"Return prediction method.\n\n `decision_function` method of `clf` returned, if it\n exists, otherwise `predict_proba` method returned.\n\n Parameters\n ----------\n clf : Estimator instance\n Fitted classifier to obtain the prediction method from.\n\n Returns\n -------\n prediction_method : callable\n The prediction method.\n method_name : str\n The name of the prediction method.\n \"\"\"\n if hasattr(clf, 'decision_function'):\n method = getattr(clf, 'decision_function')\n return method, 'decision_function'\n elif hasattr(clf, 'predict_proba'):\n method = getattr(clf, 'predict_proba')\n return method, 'predict_proba'\n else:\n raise RuntimeError(\"'base_estimator' has no 'decision_function' or 'predict_proba' method.\")" }, { @@ -30101,7 +30485,8 @@ "docstring": { "type": "ndarray of shape (n_samples,)", "description": "The decision function or predict proba for the samples." - } + }, + "refined_type": {} }, { "name": "y", @@ -30111,7 +30496,8 @@ "docstring": { "type": "ndarray of shape (n_samples,)", "description": "The targets." - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -30121,13 +30507,14 @@ "docstring": { "type": "array-like of shape (n_samples,), default=None", "description": "Sample weights. If None, then samples are equally weighted." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Probability Calibration with sigmoid method (Platt 2000)", - "docstring": "Probability Calibration with sigmoid method (Platt 2000)\n\nParameters\n----------\npredictions : ndarray of shape (n_samples,)\n The decision function or predict proba for the samples.\n\ny : ndarray of shape (n_samples,)\n The targets.\n\nsample_weight : array-like of shape (n_samples,), default=None\n Sample weights. If None, then samples are equally weighted.\n\nReturns\n-------\na : float\n The slope.\n\nb : float\n The intercept.\n\nReferences\n----------\nPlatt, \"Probabilistic Outputs for Support Vector Machines\"", + "docstring": "Probability Calibration with sigmoid method (Platt 2000)\n\n Parameters\n ----------\n predictions : ndarray of shape (n_samples,)\n The decision function or predict proba for the samples.\n\n y : ndarray of shape (n_samples,)\n The targets.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights. 
If None, then samples are equally weighted.\n\n Returns\n -------\n a : float\n The slope.\n\n b : float\n The intercept.\n\n References\n ----------\n Platt, \"Probabilistic Outputs for Support Vector Machines\"\n ", "source_code": "\ndef _sigmoid_calibration(predictions, y, sample_weight=None):\n \"\"\"Probability Calibration with sigmoid method (Platt 2000)\n\n Parameters\n ----------\n predictions : ndarray of shape (n_samples,)\n The decision function or predict proba for the samples.\n\n y : ndarray of shape (n_samples,)\n The targets.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights. If None, then samples are equally weighted.\n\n Returns\n -------\n a : float\n The slope.\n\n b : float\n The intercept.\n\n References\n ----------\n Platt, \"Probabilistic Outputs for Support Vector Machines\"\n \"\"\"\n predictions = column_or_1d(predictions)\n y = column_or_1d(y)\n F = predictions\n mask_negative_samples = y <= 0\n if sample_weight is not None:\n prior0 = sample_weight[mask_negative_samples].sum()\n prior1 = sample_weight[~mask_negative_samples].sum()\n else:\n prior0 = float(np.sum(mask_negative_samples))\n prior1 = y.shape[0] - prior0\n T = np.zeros_like(y, dtype=np.float64)\n T[y > 0] = (prior1 + 1.0) / (prior1 + 2.0)\n T[y <= 0] = 1.0 / (prior0 + 2.0)\n T1 = 1.0 - T\n \n def objective(AB):\n P = expit(-(AB[0] * F + AB[1]))\n loss = -(xlogy(T, P) + xlogy(T1, 1.0 - P))\n if sample_weight is not None:\n return (sample_weight * loss).sum()\n else:\n return loss.sum()\n \n def grad(AB):\n P = expit(-(AB[0] * F + AB[1]))\n TEP_minus_T1P = T - P\n if sample_weight is not None:\n TEP_minus_T1P *= sample_weight\n dA = np.dot(TEP_minus_T1P, F)\n dB = np.sum(TEP_minus_T1P)\n return np.array([dA, dB])\n AB0 = np.array([0.0, log((prior0 + 1.0) / (prior1 + 1.0))])\n AB_ = fmin_bfgs(objective, AB0, fprime=grad, disp=False)\n return AB_[0], AB_[1]" }, { @@ -30145,7 +30532,8 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "True targets." - } + }, + "refined_type": {} }, { "name": "y_prob", @@ -30155,7 +30543,8 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "Probabilities of the positive class." - } + }, + "refined_type": {} }, { "name": "normalize", @@ -30165,7 +30554,8 @@ "docstring": { "type": "bool, default=False", "description": "Whether y_prob needs to be normalized into the [0, 1] interval, i.e.\nis not a proper probability. If True, the smallest value in y_prob\nis linearly mapped onto 0 and the largest one onto 1." - } + }, + "refined_type": {} }, { "name": "n_bins", @@ -30175,7 +30565,8 @@ "docstring": { "type": "int, default=5", "description": "Number of bins to discretize the [0, 1] interval. A bigger number\nrequires more data. Bins with no samples (i.e. without\ncorresponding values in `y_prob`) will not be returned, thus the\nreturned arrays may have less than `n_bins` values." - } + }, + "refined_type": {} }, { "name": "strategy", @@ -30185,13 +30576,17 @@ "docstring": { "type": "{'uniform', 'quantile'}, default='uniform'", "description": "Strategy used to define the widths of the bins.\n\nuniform\n The bins have identical widths.\nquantile\n The bins have the same number of samples and depend on `y_prob`." 
+ }, + "refined_type": { + "kind": "EnumType", + "values": ["quantile", "uniform"] } } ], "results": [], "is_public": true, - "description": "Compute true and predicted probabilities for a calibration curve.\n\nThe method assumes the inputs come from a binary classifier, and discretize the [0, 1] interval into bins. Calibration curves may also be referred to as reliability diagrams. Read more in the :ref:`User Guide `.", - "docstring": "Compute true and predicted probabilities for a calibration curve.\n\nThe method assumes the inputs come from a binary classifier, and\ndiscretize the [0, 1] interval into bins.\n\nCalibration curves may also be referred to as reliability diagrams.\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\ny_true : array-like of shape (n_samples,)\n True targets.\n\ny_prob : array-like of shape (n_samples,)\n Probabilities of the positive class.\n\nnormalize : bool, default=False\n Whether y_prob needs to be normalized into the [0, 1] interval, i.e.\n is not a proper probability. If True, the smallest value in y_prob\n is linearly mapped onto 0 and the largest one onto 1.\n\nn_bins : int, default=5\n Number of bins to discretize the [0, 1] interval. A bigger number\n requires more data. Bins with no samples (i.e. without\n corresponding values in `y_prob`) will not be returned, thus the\n returned arrays may have less than `n_bins` values.\n\nstrategy : {'uniform', 'quantile'}, default='uniform'\n Strategy used to define the widths of the bins.\n\n uniform\n The bins have identical widths.\n quantile\n The bins have the same number of samples and depend on `y_prob`.\n\nReturns\n-------\nprob_true : ndarray of shape (n_bins,) or smaller\n The proportion of samples whose class is the positive class, in each\n bin (fraction of positives).\n\nprob_pred : ndarray of shape (n_bins,) or smaller\n The mean predicted probability in each bin.\n\nReferences\n----------\nAlexandru Niculescu-Mizil and Rich Caruana (2005) Predicting Good\nProbabilities With Supervised Learning, in Proceedings of the 22nd\nInternational Conference on Machine Learning (ICML).\nSee section 4 (Qualitative Analysis of Predictions).\n\nExamples\n--------\n>>> import numpy as np\n>>> from sklearn.calibration import calibration_curve\n>>> y_true = np.array([0, 0, 0, 0, 1, 1, 1, 1, 1])\n>>> y_pred = np.array([0.1, 0.2, 0.3, 0.4, 0.65, 0.7, 0.8, 0.9, 1.])\n>>> prob_true, prob_pred = calibration_curve(y_true, y_pred, n_bins=3)\n>>> prob_true\narray([0. , 0.5, 1. ])\n>>> prob_pred\narray([0.2 , 0.525, 0.85 ])", + "description": "Compute true and predicted probabilities for a calibration curve.\n\nThe method assumes the inputs come from a binary classifier, and\ndiscretize the [0, 1] interval into bins.\n\nCalibration curves may also be referred to as reliability diagrams.\n\nRead more in the :ref:`User Guide `.", + "docstring": "Compute true and predicted probabilities for a calibration curve.\n\n The method assumes the inputs come from a binary classifier, and\n discretize the [0, 1] interval into bins.\n\n Calibration curves may also be referred to as reliability diagrams.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n y_true : array-like of shape (n_samples,)\n True targets.\n\n y_prob : array-like of shape (n_samples,)\n Probabilities of the positive class.\n\n normalize : bool, default=False\n Whether y_prob needs to be normalized into the [0, 1] interval, i.e.\n is not a proper probability. 
If True, the smallest value in y_prob\n is linearly mapped onto 0 and the largest one onto 1.\n\n n_bins : int, default=5\n Number of bins to discretize the [0, 1] interval. A bigger number\n requires more data. Bins with no samples (i.e. without\n corresponding values in `y_prob`) will not be returned, thus the\n returned arrays may have less than `n_bins` values.\n\n strategy : {'uniform', 'quantile'}, default='uniform'\n Strategy used to define the widths of the bins.\n\n uniform\n The bins have identical widths.\n quantile\n The bins have the same number of samples and depend on `y_prob`.\n\n Returns\n -------\n prob_true : ndarray of shape (n_bins,) or smaller\n The proportion of samples whose class is the positive class, in each\n bin (fraction of positives).\n\n prob_pred : ndarray of shape (n_bins,) or smaller\n The mean predicted probability in each bin.\n\n References\n ----------\n Alexandru Niculescu-Mizil and Rich Caruana (2005) Predicting Good\n Probabilities With Supervised Learning, in Proceedings of the 22nd\n International Conference on Machine Learning (ICML).\n See section 4 (Qualitative Analysis of Predictions).\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.calibration import calibration_curve\n >>> y_true = np.array([0, 0, 0, 0, 1, 1, 1, 1, 1])\n >>> y_pred = np.array([0.1, 0.2, 0.3, 0.4, 0.65, 0.7, 0.8, 0.9, 1.])\n >>> prob_true, prob_pred = calibration_curve(y_true, y_pred, n_bins=3)\n >>> prob_true\n array([0. , 0.5, 1. ])\n >>> prob_pred\n array([0.2 , 0.525, 0.85 ])\n ", "source_code": "\ndef calibration_curve(y_true, y_prob, *, normalize=False, n_bins=5, strategy='uniform'):\n \"\"\"Compute true and predicted probabilities for a calibration curve.\n\n The method assumes the inputs come from a binary classifier, and\n discretize the [0, 1] interval into bins.\n\n Calibration curves may also be referred to as reliability diagrams.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n y_true : array-like of shape (n_samples,)\n True targets.\n\n y_prob : array-like of shape (n_samples,)\n Probabilities of the positive class.\n\n normalize : bool, default=False\n Whether y_prob needs to be normalized into the [0, 1] interval, i.e.\n is not a proper probability. If True, the smallest value in y_prob\n is linearly mapped onto 0 and the largest one onto 1.\n\n n_bins : int, default=5\n Number of bins to discretize the [0, 1] interval. A bigger number\n requires more data. Bins with no samples (i.e. 
without\n corresponding values in `y_prob`) will not be returned, thus the\n returned arrays may have less than `n_bins` values.\n\n strategy : {'uniform', 'quantile'}, default='uniform'\n Strategy used to define the widths of the bins.\n\n uniform\n The bins have identical widths.\n quantile\n The bins have the same number of samples and depend on `y_prob`.\n\n Returns\n -------\n prob_true : ndarray of shape (n_bins,) or smaller\n The proportion of samples whose class is the positive class, in each\n bin (fraction of positives).\n\n prob_pred : ndarray of shape (n_bins,) or smaller\n The mean predicted probability in each bin.\n\n References\n ----------\n Alexandru Niculescu-Mizil and Rich Caruana (2005) Predicting Good\n Probabilities With Supervised Learning, in Proceedings of the 22nd\n International Conference on Machine Learning (ICML).\n See section 4 (Qualitative Analysis of Predictions).\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.calibration import calibration_curve\n >>> y_true = np.array([0, 0, 0, 0, 1, 1, 1, 1, 1])\n >>> y_pred = np.array([0.1, 0.2, 0.3, 0.4, 0.65, 0.7, 0.8, 0.9, 1.])\n >>> prob_true, prob_pred = calibration_curve(y_true, y_pred, n_bins=3)\n >>> prob_true\n array([0. , 0.5, 1. ])\n >>> prob_pred\n array([0.2 , 0.525, 0.85 ])\n \"\"\"\n y_true = column_or_1d(y_true)\n y_prob = column_or_1d(y_prob)\n check_consistent_length(y_true, y_prob)\n if normalize:\n y_prob = (y_prob - y_prob.min()) / (y_prob.max() - y_prob.min())\n elif y_prob.min() < 0 or y_prob.max() > 1:\n raise ValueError('y_prob has values outside [0, 1] and normalize is set to False.')\n labels = np.unique(y_true)\n if len(labels) > 2:\n raise ValueError('Only binary classification is supported. Provided labels %s.' % labels)\n y_true = label_binarize(y_true, classes=labels)[:, 0]\n if strategy == 'quantile':\n quantiles = np.linspace(0, 1, n_bins + 1)\n bins = np.percentile(y_prob, quantiles * 100)\n bins[-1] = bins[-1] + 1e-08\n elif strategy == 'uniform':\n bins = np.linspace(0.0, 1.0 + 1e-08, n_bins + 1)\n else:\n raise ValueError(\"Invalid entry to 'strategy' input. Strategy must be either 'quantile' or 'uniform'.\")\n binids = np.digitize(y_prob, bins) - 1\n bin_sums = np.bincount(binids, weights=y_prob, minlength=len(bins))\n bin_true = np.bincount(binids, weights=y_true, minlength=len(bins))\n bin_total = np.bincount(binids, minlength=len(bins))\n nonzero = bin_total != 0\n prob_true = bin_true[nonzero] / bin_total[nonzero]\n prob_pred = bin_sums[nonzero] / bin_total[nonzero]\n return prob_true, prob_pred" }, { @@ -30209,7 +30604,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "damping", @@ -30219,6 +30615,14 @@ "docstring": { "type": "float, default=0.5", "description": "Damping factor in the range `[0.5, 1.0)` is the extent to\nwhich the current value is maintained relative to\nincoming values (weighted 1 - damping). This in order\nto avoid numerical oscillations when updating these\nvalues (messages)." + }, + "refined_type": { + "kind": "BoundaryType", + "base_type": "float", + "min": 0.5, + "max": 1.0, + "min_inclusive": true, + "max_inclusive": false } }, { @@ -30229,7 +30633,8 @@ "docstring": { "type": "int, default=200", "description": "Maximum number of iterations." - } + }, + "refined_type": {} }, { "name": "convergence_iter", @@ -30239,7 +30644,8 @@ "docstring": { "type": "int, default=15", "description": "Number of iterations with no change in the number\nof estimated clusters that stops the convergence." 
- } + }, + "refined_type": {} }, { "name": "copy", @@ -30249,7 +30655,8 @@ "docstring": { "type": "bool, default=True", "description": "Make a copy of input data." - } + }, + "refined_type": {} }, { "name": "preference", @@ -30259,7 +30666,8 @@ "docstring": { "type": "array-like of shape (n_samples,) or float, default=None", "description": "Preferences for each point - points with larger values of\npreferences are more likely to be chosen as exemplars. The number\nof exemplars, ie of clusters, is influenced by the input\npreferences value. If the preferences are not passed as arguments,\nthey will be set to the median of the input similarities." - } + }, + "refined_type": {} }, { "name": "affinity", @@ -30269,6 +30677,10 @@ "docstring": { "type": "{'euclidean', 'precomputed'}, default='euclidean'", "description": "Which affinity to use. At the moment 'precomputed' and\n``euclidean`` are supported. 'euclidean' uses the\nnegative squared euclidean distance between points." + }, + "refined_type": { + "kind": "EnumType", + "values": ["euclidean", "precomputed"] } }, { @@ -30279,7 +30691,8 @@ "docstring": { "type": "bool, default=False", "description": "Whether to be verbose." - } + }, + "refined_type": {} }, { "name": "random_state", @@ -30289,13 +30702,14 @@ "docstring": { "type": "int, RandomState instance or None, default=None", "description": "Pseudo-random number generator to control the starting state.\nUse an int for reproducible results across function calls.\nSee the :term:`Glossary `.\n\n.. versionadded:: 0.23\n this parameter was previously hardcoded as 0." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, *, damping=0.5, max_iter=200, convergence_iter=15, copy=True, preference=None, affinity='euclidean', verbose=False, random_state=None):\n self.damping = damping\n self.max_iter = max_iter\n self.convergence_iter = convergence_iter\n self.copy = copy\n self.verbose = verbose\n self.preference = preference\n self.affinity = affinity\n self.random_state = random_state" }, { @@ -30313,13 +30727,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _more_tags(self):\n return {'pairwise': self.affinity == 'precomputed'}" }, { @@ -30340,13 +30755,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@deprecated('Attribute `_pairwise` was deprecated in version 0.24 and will be removed in 1.1 (renaming of 0.26).')\n@property\ndef _pairwise(self):\n return self.affinity == 'precomputed'" }, { @@ -30364,7 +30780,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -30374,6 +30791,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features), or array-like of shape (n_samples, n_samples)", "description": "Training instances to cluster, or similarities / affinities between\ninstances if ``affinity='precomputed'``. If a sparse feature matrix\nis provided, it will be converted into a sparse ``csr_matrix``." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -30384,13 +30805,14 @@ "docstring": { "type": "Ignored", "description": "Not used, present here for API consistency by convention." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Fit the clustering from features, or affinity matrix.", - "docstring": "Fit the clustering from features, or affinity matrix.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features), or array-like of shape (n_samples, n_samples)\n Training instances to cluster, or similarities / affinities between\n instances if ``affinity='precomputed'``. If a sparse feature matrix\n is provided, it will be converted into a sparse ``csr_matrix``.\n\ny : Ignored\n Not used, present here for API consistency by convention.\n\nReturns\n-------\nself\n Returns the instance itself.", + "docstring": "Fit the clustering from features, or affinity matrix.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features), or array-like of shape (n_samples, n_samples)\n Training instances to cluster, or similarities / affinities between\n instances if ``affinity='precomputed'``. If a sparse feature matrix\n is provided, it will be converted into a sparse ``csr_matrix``.\n\n y : Ignored\n Not used, present here for API consistency by convention.\n\n Returns\n -------\n self\n Returns the instance itself.\n ", "source_code": "\ndef fit(self, X, y=None):\n \"\"\"Fit the clustering from features, or affinity matrix.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features), or array-like of shape (n_samples, n_samples)\n Training instances to cluster, or similarities / affinities between\n instances if ``affinity='precomputed'``. If a sparse feature matrix\n is provided, it will be converted into a sparse ``csr_matrix``.\n\n y : Ignored\n Not used, present here for API consistency by convention.\n\n Returns\n -------\n self\n Returns the instance itself.\n \"\"\"\n if self.affinity == 'precomputed':\n accept_sparse = False\n else:\n accept_sparse = 'csr'\n X = self._validate_data(X, accept_sparse=accept_sparse)\n if self.affinity == 'precomputed':\n self.affinity_matrix_ = X\n elif self.affinity == 'euclidean':\n self.affinity_matrix_ = -euclidean_distances(X, squared=True)\n else:\n raise ValueError(\"Affinity must be 'precomputed' or 'euclidean'. Got %s instead\" % str(self.affinity))\n check_scalar(self.damping, 'damping', target_type=numbers.Real, min_val=0.5, max_val=1, include_boundaries='left')\n check_scalar(self.max_iter, 'max_iter', target_type=numbers.Integral, min_val=1)\n check_scalar(self.convergence_iter, 'convergence_iter', target_type=numbers.Integral, min_val=1)\n (self.cluster_centers_indices_, self.labels_, self.n_iter_) = affinity_propagation(self.affinity_matrix_, preference=self.preference, max_iter=self.max_iter, convergence_iter=self.convergence_iter, damping=self.damping, copy=self.copy, verbose=self.verbose, return_n_iter=True, random_state=self.random_state)\n if self.affinity != 'precomputed':\n self.cluster_centers_ = X[self.cluster_centers_indices_].copy()\n return self" }, { @@ -30408,7 +30830,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -30418,6 +30841,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features), or array-like of shape (n_samples, n_samples)", "description": "Training instances to cluster, or similarities / affinities between\ninstances if ``affinity='precomputed'``. If a sparse feature matrix\nis provided, it will be converted into a sparse ``csr_matrix``." 
+ }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -30428,13 +30855,14 @@ "docstring": { "type": "Ignored", "description": "Not used, present here for API consistency by convention." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Fit clustering from features/affinity matrix; return cluster labels.", - "docstring": "Fit clustering from features/affinity matrix; return cluster labels.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features), or array-like of shape (n_samples, n_samples)\n Training instances to cluster, or similarities / affinities between\n instances if ``affinity='precomputed'``. If a sparse feature matrix\n is provided, it will be converted into a sparse ``csr_matrix``.\n\ny : Ignored\n Not used, present here for API consistency by convention.\n\nReturns\n-------\nlabels : ndarray of shape (n_samples,)\n Cluster labels.", + "docstring": "Fit clustering from features/affinity matrix; return cluster labels.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features), or array-like of shape (n_samples, n_samples)\n Training instances to cluster, or similarities / affinities between\n instances if ``affinity='precomputed'``. If a sparse feature matrix\n is provided, it will be converted into a sparse ``csr_matrix``.\n\n y : Ignored\n Not used, present here for API consistency by convention.\n\n Returns\n -------\n labels : ndarray of shape (n_samples,)\n Cluster labels.\n ", "source_code": "\ndef fit_predict(self, X, y=None):\n \"\"\"Fit clustering from features/affinity matrix; return cluster labels.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features), or array-like of shape (n_samples, n_samples)\n Training instances to cluster, or similarities / affinities between\n instances if ``affinity='precomputed'``. If a sparse feature matrix\n is provided, it will be converted into a sparse ``csr_matrix``.\n\n y : Ignored\n Not used, present here for API consistency by convention.\n\n Returns\n -------\n labels : ndarray of shape (n_samples,)\n Cluster labels.\n \"\"\"\n return super().fit_predict(X, y)" }, { @@ -30452,7 +30880,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -30462,13 +30891,17 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "New data to predict. If a sparse matrix is provided, it will be\nconverted into a sparse ``csr_matrix``." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } } ], "results": [], "is_public": true, "description": "Predict the closest cluster each sample in X belongs to.", - "docstring": "Predict the closest cluster each sample in X belongs to.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n New data to predict. If a sparse matrix is provided, it will be\n converted into a sparse ``csr_matrix``.\n\nReturns\n-------\nlabels : ndarray of shape (n_samples,)\n Cluster labels.", + "docstring": "Predict the closest cluster each sample in X belongs to.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n New data to predict. 
If a sparse matrix is provided, it will be\n converted into a sparse ``csr_matrix``.\n\n Returns\n -------\n labels : ndarray of shape (n_samples,)\n Cluster labels.\n ", "source_code": "\ndef predict(self, X):\n \"\"\"Predict the closest cluster each sample in X belongs to.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n New data to predict. If a sparse matrix is provided, it will be\n converted into a sparse ``csr_matrix``.\n\n Returns\n -------\n labels : ndarray of shape (n_samples,)\n Cluster labels.\n \"\"\"\n check_is_fitted(self)\n X = self._validate_data(X, reset=False, accept_sparse='csr')\n if not hasattr(self, 'cluster_centers_'):\n raise ValueError(\"Predict method is not supported when affinity='precomputed'.\")\n if self.cluster_centers_.shape[0] > 0:\n with config_context(assume_finite=True):\n return pairwise_distances_argmin(X, self.cluster_centers_)\n else:\n warnings.warn(\"This model does not have any cluster centers because affinity propagation did not converge. Labeling every sample as '-1'.\", ConvergenceWarning)\n return np.array([-1] * X.shape[0])" }, { @@ -30486,7 +30919,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "preference", @@ -30496,13 +30930,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _equal_similarities_and_preferences(S, preference):\n \n def all_equal_preferences():\n return np.all(preference == preference.flat[0])\n \n def all_equal_similarities():\n mask = np.ones(S.shape, dtype=bool)\n np.fill_diagonal(mask, 0)\n return np.all(S[mask].flat == S[mask].flat[0])\n return all_equal_preferences() and all_equal_similarities()" }, { @@ -30520,7 +30955,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_samples)", "description": "Matrix of similarities between points." - } + }, + "refined_type": {} }, { "name": "preference", @@ -30530,7 +30966,8 @@ "docstring": { "type": "array-like of shape (n_samples,) or float, default=None", "description": "Preferences for each point - points with larger values of\npreferences are more likely to be chosen as exemplars. The number of\nexemplars, i.e. of clusters, is influenced by the input preferences\nvalue. If the preferences are not passed as arguments, they will be\nset to the median of the input similarities (resulting in a moderate\nnumber of clusters). For a smaller amount of clusters, this can be set\nto the minimum value of the similarities." - } + }, + "refined_type": {} }, { "name": "convergence_iter", @@ -30540,7 +30977,8 @@ "docstring": { "type": "int, default=15", "description": "Number of iterations with no change in the number\nof estimated clusters that stops the convergence." - } + }, + "refined_type": {} }, { "name": "max_iter", @@ -30549,8 +30987,9 @@ "assigned_by": "NAME_ONLY", "docstring": { "type": "int, default=200", - "description": "Maximum number of iterations" - } + "description": "Maximum number of iterations." + }, + "refined_type": {} }, { "name": "damping", @@ -30560,7 +30999,8 @@ "docstring": { "type": "float, default=0.5", "description": "Damping factor between 0.5 and 1." - } + }, + "refined_type": {} }, { "name": "copy", @@ -30570,7 +31010,8 @@ "docstring": { "type": "bool, default=True", "description": "If copy is False, the affinity matrix is modified inplace by the\nalgorithm, for memory efficiency." 
- } + }, + "refined_type": {} }, { "name": "verbose", @@ -30580,7 +31021,8 @@ "docstring": { "type": "bool, default=False", "description": "The verbosity level." - } + }, + "refined_type": {} }, { "name": "return_n_iter", @@ -30590,7 +31032,8 @@ "docstring": { "type": "bool, default=False", "description": "Whether or not to return the number of iterations." - } + }, + "refined_type": {} }, { "name": "random_state", @@ -30600,14 +31043,15 @@ "docstring": { "type": "int, RandomState instance or None, default=None", "description": "Pseudo-random number generator to control the starting state.\nUse an int for reproducible results across function calls.\nSee the :term:`Glossary `.\n\n.. versionadded:: 0.23\n this parameter was previously hardcoded as 0." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Perform Affinity Propagation Clustering of data.\n\nRead more in the :ref:`User Guide `.", - "docstring": "Perform Affinity Propagation Clustering of data.\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\n\nS : array-like of shape (n_samples, n_samples)\n Matrix of similarities between points.\n\npreference : array-like of shape (n_samples,) or float, default=None\n Preferences for each point - points with larger values of\n preferences are more likely to be chosen as exemplars. The number of\n exemplars, i.e. of clusters, is influenced by the input preferences\n value. If the preferences are not passed as arguments, they will be\n set to the median of the input similarities (resulting in a moderate\n number of clusters). For a smaller amount of clusters, this can be set\n to the minimum value of the similarities.\n\nconvergence_iter : int, default=15\n Number of iterations with no change in the number\n of estimated clusters that stops the convergence.\n\nmax_iter : int, default=200\n Maximum number of iterations\n\ndamping : float, default=0.5\n Damping factor between 0.5 and 1.\n\ncopy : bool, default=True\n If copy is False, the affinity matrix is modified inplace by the\n algorithm, for memory efficiency.\n\nverbose : bool, default=False\n The verbosity level.\n\nreturn_n_iter : bool, default=False\n Whether or not to return the number of iterations.\n\nrandom_state : int, RandomState instance or None, default=None\n Pseudo-random number generator to control the starting state.\n Use an int for reproducible results across function calls.\n See the :term:`Glossary `.\n\n .. versionadded:: 0.23\n this parameter was previously hardcoded as 0.\n\nReturns\n-------\n\ncluster_centers_indices : ndarray of shape (n_clusters,)\n Index of clusters centers.\n\nlabels : ndarray of shape (n_samples,)\n Cluster labels for each point.\n\nn_iter : int\n Number of iterations run. Returned only if `return_n_iter` is\n set to True.\n\nNotes\n-----\nFor an example, see :ref:`examples/cluster/plot_affinity_propagation.py\n`.\n\nWhen the algorithm does not converge, it returns an empty array as\n``cluster_center_indices`` and ``-1`` as label for each training sample.\n\nWhen all training samples have equal similarities and equal preferences,\nthe assignment of cluster centers and labels depends on the preference.\nIf the preference is smaller than the similarities, a single cluster center\nand label ``0`` for every sample will be returned. Otherwise, every\ntraining sample becomes its own cluster center and is assigned a unique\nlabel.\n\nReferences\n----------\nBrendan J. 
Frey and Delbert Dueck, \"Clustering by Passing Messages\nBetween Data Points\", Science Feb. 2007", - "source_code": "\ndef affinity_propagation(S, *, preference=None, convergence_iter=15, max_iter=200, damping=0.5, copy=True, verbose=False, return_n_iter=False, random_state=None):\n \"\"\"Perform Affinity Propagation Clustering of data.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n\n S : array-like of shape (n_samples, n_samples)\n Matrix of similarities between points.\n\n preference : array-like of shape (n_samples,) or float, default=None\n Preferences for each point - points with larger values of\n preferences are more likely to be chosen as exemplars. The number of\n exemplars, i.e. of clusters, is influenced by the input preferences\n value. If the preferences are not passed as arguments, they will be\n set to the median of the input similarities (resulting in a moderate\n number of clusters). For a smaller amount of clusters, this can be set\n to the minimum value of the similarities.\n\n convergence_iter : int, default=15\n Number of iterations with no change in the number\n of estimated clusters that stops the convergence.\n\n max_iter : int, default=200\n Maximum number of iterations\n\n damping : float, default=0.5\n Damping factor between 0.5 and 1.\n\n copy : bool, default=True\n If copy is False, the affinity matrix is modified inplace by the\n algorithm, for memory efficiency.\n\n verbose : bool, default=False\n The verbosity level.\n\n return_n_iter : bool, default=False\n Whether or not to return the number of iterations.\n\n random_state : int, RandomState instance or None, default=None\n Pseudo-random number generator to control the starting state.\n Use an int for reproducible results across function calls.\n See the :term:`Glossary `.\n\n .. versionadded:: 0.23\n this parameter was previously hardcoded as 0.\n\n Returns\n -------\n\n cluster_centers_indices : ndarray of shape (n_clusters,)\n Index of clusters centers.\n\n labels : ndarray of shape (n_samples,)\n Cluster labels for each point.\n\n n_iter : int\n Number of iterations run. Returned only if `return_n_iter` is\n set to True.\n\n Notes\n -----\n For an example, see :ref:`examples/cluster/plot_affinity_propagation.py\n `.\n\n When the algorithm does not converge, it returns an empty array as\n ``cluster_center_indices`` and ``-1`` as label for each training sample.\n\n When all training samples have equal similarities and equal preferences,\n the assignment of cluster centers and labels depends on the preference.\n If the preference is smaller than the similarities, a single cluster center\n and label ``0`` for every sample will be returned. Otherwise, every\n training sample becomes its own cluster center and is assigned a unique\n label.\n\n References\n ----------\n Brendan J. Frey and Delbert Dueck, \"Clustering by Passing Messages\n Between Data Points\", Science Feb. 2007\n \"\"\"\n S = as_float_array(S, copy=copy)\n n_samples = S.shape[0]\n if S.shape[0] != S.shape[1]:\n raise ValueError('S must be a square array (shape=%s)' % repr(S.shape))\n if preference is None:\n preference = np.median(S)\n preference = np.array(preference)\n if n_samples == 1 or _equal_similarities_and_preferences(S, preference):\n warnings.warn('All samples have mutually equal similarities. 
Returning arbitrary cluster center(s).')\n if preference.flat[0] >= S.flat[n_samples - 1]:\n return (np.arange(n_samples), np.arange(n_samples), 0) if return_n_iter else (np.arange(n_samples), np.arange(n_samples))\n else:\n return (np.array([0]), np.array([0] * n_samples), 0) if return_n_iter else (np.array([0]), np.array([0] * n_samples))\n random_state = check_random_state(random_state)\n S.flat[::n_samples + 1] = preference\n A = np.zeros((n_samples, n_samples))\n R = np.zeros((n_samples, n_samples))\n tmp = np.zeros((n_samples, n_samples))\n S += (np.finfo(S.dtype).eps * S + np.finfo(S.dtype).tiny * 100) * random_state.randn(n_samples, n_samples)\n e = np.zeros((n_samples, convergence_iter))\n ind = np.arange(n_samples)\n for it in range(max_iter):\n np.add(A, S, tmp)\n I = np.argmax(tmp, axis=1)\n Y = tmp[ind, I]\n tmp[ind, I] = -np.inf\n Y2 = np.max(tmp, axis=1)\n np.subtract(S, Y[:, None], tmp)\n tmp[ind, I] = S[ind, I] - Y2\n tmp *= 1 - damping\n R *= damping\n R += tmp\n np.maximum(R, 0, tmp)\n tmp.flat[::n_samples + 1] = R.flat[::n_samples + 1]\n tmp -= np.sum(tmp, axis=0)\n dA = np.diag(tmp).copy()\n tmp.clip(0, np.inf, tmp)\n tmp.flat[::n_samples + 1] = dA\n tmp *= 1 - damping\n A *= damping\n A -= tmp\n E = np.diag(A) + np.diag(R) > 0\n e[:, it % convergence_iter] = E\n K = np.sum(E, axis=0)\n if it >= convergence_iter:\n se = np.sum(e, axis=1)\n unconverged = np.sum((se == convergence_iter) + (se == 0)) != n_samples\n if not unconverged and K > 0 or it == max_iter:\n never_converged = False\n if verbose:\n print('Converged after %d iterations.' % it)\n break\n else:\n never_converged = True\n if verbose:\n print('Did not converge')\n I = np.flatnonzero(E)\n K = I.size\n if K > 0 and not never_converged:\n c = np.argmax(S[:, I], axis=1)\n c[I] = np.arange(K)\n for k in range(K):\n ii = np.where(c == k)[0]\n j = np.argmax(np.sum(S[ii[:, np.newaxis], ii], axis=0))\n I[k] = ii[j]\n c = np.argmax(S[:, I], axis=1)\n c[I] = np.arange(K)\n labels = I[c]\n cluster_centers_indices = np.unique(labels)\n labels = np.searchsorted(cluster_centers_indices, labels)\n else:\n warnings.warn('Affinity propagation did not converge, this model will not have any cluster centers.', ConvergenceWarning)\n labels = np.array([-1] * n_samples)\n cluster_centers_indices = []\n if return_n_iter:\n return cluster_centers_indices, labels, it + 1\n else:\n return cluster_centers_indices, labels" + "docstring": "Perform Affinity Propagation Clustering of data.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n\n S : array-like of shape (n_samples, n_samples)\n Matrix of similarities between points.\n\n preference : array-like of shape (n_samples,) or float, default=None\n Preferences for each point - points with larger values of\n preferences are more likely to be chosen as exemplars. The number of\n exemplars, i.e. of clusters, is influenced by the input preferences\n value. If the preferences are not passed as arguments, they will be\n set to the median of the input similarities (resulting in a moderate\n number of clusters). 
For a smaller amount of clusters, this can be set\n to the minimum value of the similarities.\n\n convergence_iter : int, default=15\n Number of iterations with no change in the number\n of estimated clusters that stops the convergence.\n\n max_iter : int, default=200\n Maximum number of iterations.\n\n damping : float, default=0.5\n Damping factor between 0.5 and 1.\n\n copy : bool, default=True\n If copy is False, the affinity matrix is modified inplace by the\n algorithm, for memory efficiency.\n\n verbose : bool, default=False\n The verbosity level.\n\n return_n_iter : bool, default=False\n Whether or not to return the number of iterations.\n\n random_state : int, RandomState instance or None, default=None\n Pseudo-random number generator to control the starting state.\n Use an int for reproducible results across function calls.\n See the :term:`Glossary `.\n\n .. versionadded:: 0.23\n this parameter was previously hardcoded as 0.\n\n Returns\n -------\n\n cluster_centers_indices : ndarray of shape (n_clusters,)\n Index of clusters centers.\n\n labels : ndarray of shape (n_samples,)\n Cluster labels for each point.\n\n n_iter : int\n Number of iterations run. Returned only if `return_n_iter` is\n set to True.\n\n Notes\n -----\n For an example, see :ref:`examples/cluster/plot_affinity_propagation.py\n `.\n\n When the algorithm does not converge, it returns an empty array as\n ``cluster_center_indices`` and ``-1`` as label for each training sample.\n\n When all training samples have equal similarities and equal preferences,\n the assignment of cluster centers and labels depends on the preference.\n If the preference is smaller than the similarities, a single cluster center\n and label ``0`` for every sample will be returned. Otherwise, every\n training sample becomes its own cluster center and is assigned a unique\n label.\n\n References\n ----------\n Brendan J. Frey and Delbert Dueck, \"Clustering by Passing Messages\n Between Data Points\", Science Feb. 2007\n ", + "source_code": "\ndef affinity_propagation(S, *, preference=None, convergence_iter=15, max_iter=200, damping=0.5, copy=True, verbose=False, return_n_iter=False, random_state=None):\n \"\"\"Perform Affinity Propagation Clustering of data.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n\n S : array-like of shape (n_samples, n_samples)\n Matrix of similarities between points.\n\n preference : array-like of shape (n_samples,) or float, default=None\n Preferences for each point - points with larger values of\n preferences are more likely to be chosen as exemplars. The number of\n exemplars, i.e. of clusters, is influenced by the input preferences\n value. If the preferences are not passed as arguments, they will be\n set to the median of the input similarities (resulting in a moderate\n number of clusters). 
For a smaller amount of clusters, this can be set\n to the minimum value of the similarities.\n\n convergence_iter : int, default=15\n Number of iterations with no change in the number\n of estimated clusters that stops the convergence.\n\n max_iter : int, default=200\n Maximum number of iterations.\n\n damping : float, default=0.5\n Damping factor between 0.5 and 1.\n\n copy : bool, default=True\n If copy is False, the affinity matrix is modified inplace by the\n algorithm, for memory efficiency.\n\n verbose : bool, default=False\n The verbosity level.\n\n return_n_iter : bool, default=False\n Whether or not to return the number of iterations.\n\n random_state : int, RandomState instance or None, default=None\n Pseudo-random number generator to control the starting state.\n Use an int for reproducible results across function calls.\n See the :term:`Glossary `.\n\n .. versionadded:: 0.23\n this parameter was previously hardcoded as 0.\n\n Returns\n -------\n\n cluster_centers_indices : ndarray of shape (n_clusters,)\n Index of clusters centers.\n\n labels : ndarray of shape (n_samples,)\n Cluster labels for each point.\n\n n_iter : int\n Number of iterations run. Returned only if `return_n_iter` is\n set to True.\n\n Notes\n -----\n For an example, see :ref:`examples/cluster/plot_affinity_propagation.py\n `.\n\n When the algorithm does not converge, it returns an empty array as\n ``cluster_center_indices`` and ``-1`` as label for each training sample.\n\n When all training samples have equal similarities and equal preferences,\n the assignment of cluster centers and labels depends on the preference.\n If the preference is smaller than the similarities, a single cluster center\n and label ``0`` for every sample will be returned. Otherwise, every\n training sample becomes its own cluster center and is assigned a unique\n label.\n\n References\n ----------\n Brendan J. Frey and Delbert Dueck, \"Clustering by Passing Messages\n Between Data Points\", Science Feb. 2007\n \"\"\"\n S = as_float_array(S, copy=copy)\n n_samples = S.shape[0]\n if S.shape[0] != S.shape[1]:\n raise ValueError('S must be a square array (shape=%s)' % repr(S.shape))\n if preference is None:\n preference = np.median(S)\n preference = np.array(preference)\n if n_samples == 1 or _equal_similarities_and_preferences(S, preference):\n warnings.warn('All samples have mutually equal similarities. 
Returning arbitrary cluster center(s).')\n if preference.flat[0] >= S.flat[n_samples - 1]:\n return (np.arange(n_samples), np.arange(n_samples), 0) if return_n_iter else (np.arange(n_samples), np.arange(n_samples))\n else:\n return (np.array([0]), np.array([0] * n_samples), 0) if return_n_iter else (np.array([0]), np.array([0] * n_samples))\n random_state = check_random_state(random_state)\n S.flat[::n_samples + 1] = preference\n A = np.zeros((n_samples, n_samples))\n R = np.zeros((n_samples, n_samples))\n tmp = np.zeros((n_samples, n_samples))\n S += (np.finfo(S.dtype).eps * S + np.finfo(S.dtype).tiny * 100) * random_state.randn(n_samples, n_samples)\n e = np.zeros((n_samples, convergence_iter))\n ind = np.arange(n_samples)\n for it in range(max_iter):\n np.add(A, S, tmp)\n I = np.argmax(tmp, axis=1)\n Y = tmp[ind, I]\n tmp[ind, I] = -np.inf\n Y2 = np.max(tmp, axis=1)\n np.subtract(S, Y[:, None], tmp)\n tmp[ind, I] = S[ind, I] - Y2\n tmp *= 1 - damping\n R *= damping\n R += tmp\n np.maximum(R, 0, tmp)\n tmp.flat[::n_samples + 1] = R.flat[::n_samples + 1]\n tmp -= np.sum(tmp, axis=0)\n dA = np.diag(tmp).copy()\n tmp.clip(0, np.inf, tmp)\n tmp.flat[::n_samples + 1] = dA\n tmp *= 1 - damping\n A *= damping\n A -= tmp\n E = np.diag(A) + np.diag(R) > 0\n e[:, it % convergence_iter] = E\n K = np.sum(E, axis=0)\n if it >= convergence_iter:\n se = np.sum(e, axis=1)\n unconverged = np.sum((se == convergence_iter) + (se == 0)) != n_samples\n if not unconverged and K > 0 or it == max_iter:\n never_converged = False\n if verbose:\n print('Converged after %d iterations.' % it)\n break\n else:\n never_converged = True\n if verbose:\n print('Did not converge')\n I = np.flatnonzero(E)\n K = I.size\n if K > 0 and not never_converged:\n c = np.argmax(S[:, I], axis=1)\n c[I] = np.arange(K)\n for k in range(K):\n ii = np.where(c == k)[0]\n j = np.argmax(np.sum(S[ii[:, np.newaxis], ii], axis=0))\n I[k] = ii[j]\n c = np.argmax(S[:, I], axis=1)\n c[I] = np.arange(K)\n labels = I[c]\n cluster_centers_indices = np.unique(labels)\n labels = np.searchsorted(cluster_centers_indices, labels)\n else:\n warnings.warn('Affinity propagation did not converge, this model will not have any cluster centers.', ConvergenceWarning)\n labels = np.array([-1] * n_samples)\n cluster_centers_indices = []\n if return_n_iter:\n return cluster_centers_indices, labels, it + 1\n else:\n return cluster_centers_indices, labels" }, { "name": "__init__", @@ -30624,7 +31068,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_clusters", @@ -30634,7 +31079,8 @@ "docstring": { "type": "int or None, default=2", "description": "The number of clusters to find. It must be ``None`` if\n``distance_threshold`` is not ``None``." - } + }, + "refined_type": {} }, { "name": "affinity", @@ -30644,7 +31090,8 @@ "docstring": { "type": "str or callable, default='euclidean'", "description": "Metric used to compute the linkage. Can be \"euclidean\", \"l1\", \"l2\",\n\"manhattan\", \"cosine\", or \"precomputed\".\nIf linkage is \"ward\", only \"euclidean\" is accepted.\nIf \"precomputed\", a distance matrix (instead of a similarity matrix)\nis needed as input for the fit method." - } + }, + "refined_type": {} }, { "name": "memory", @@ -30654,7 +31101,8 @@ "docstring": { "type": "str or object with the joblib.Memory interface, default=None", "description": "Used to cache the output of the computation of the tree.\nBy default, no caching is done. If a string is given, it is the\npath to the caching directory." 
- } + }, + "refined_type": {} }, { "name": "connectivity", @@ -30664,7 +31112,8 @@ "docstring": { "type": "array-like or callable, default=None", "description": "Connectivity matrix. Defines for each sample the neighboring\nsamples following a given structure of the data.\nThis can be a connectivity matrix itself or a callable that transforms\nthe data into a connectivity matrix, such as derived from\n`kneighbors_graph`. Default is ``None``, i.e, the\nhierarchical clustering algorithm is unstructured." - } + }, + "refined_type": {} }, { "name": "compute_full_tree", @@ -30674,7 +31123,8 @@ "docstring": { "type": "'auto' or bool, default='auto'", "description": "Stop early the construction of the tree at ``n_clusters``. This is\nuseful to decrease computation time if the number of clusters is not\nsmall compared to the number of samples. This option is useful only\nwhen specifying a connectivity matrix. Note also that when varying the\nnumber of clusters and using caching, it may be advantageous to compute\nthe full tree. It must be ``True`` if ``distance_threshold`` is not\n``None``. By default `compute_full_tree` is \"auto\", which is equivalent\nto `True` when `distance_threshold` is not `None` or that `n_clusters`\nis inferior to the maximum between 100 or `0.02 * n_samples`.\nOtherwise, \"auto\" is equivalent to `False`." - } + }, + "refined_type": {} }, { "name": "linkage", @@ -30684,6 +31134,10 @@ "docstring": { "type": "{'ward', 'complete', 'average', 'single'}, default='ward'", "description": "Which linkage criterion to use. The linkage criterion determines which\ndistance to use between sets of observation. The algorithm will merge\nthe pairs of cluster that minimize this criterion.\n\n- 'ward' minimizes the variance of the clusters being merged.\n- 'average' uses the average of the distances of each observation of\n the two sets.\n- 'complete' or 'maximum' linkage uses the maximum distances between\n all observations of the two sets.\n- 'single' uses the minimum of the distances between all observations\n of the two sets.\n\n.. versionadded:: 0.20\n Added the 'single' option" + }, + "refined_type": { + "kind": "EnumType", + "values": ["complete", "average", "ward", "single"] } }, { @@ -30694,7 +31148,8 @@ "docstring": { "type": "float, default=None", "description": "The linkage distance threshold above which, clusters will not be\nmerged. If not ``None``, ``n_clusters`` must be ``None`` and\n``compute_full_tree`` must be ``True``.\n\n.. versionadded:: 0.21" - } + }, + "refined_type": {} }, { "name": "compute_distances", @@ -30704,13 +31159,14 @@ "docstring": { "type": "bool, default=False", "description": "Computes distances between clusters even if `distance_threshold` is not\nused. This can be used to make dendrogram visualization, but introduces\na computational and memory overhead.\n\n.. 
versionadded:: 0.24" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, n_clusters=2, *, affinity='euclidean', memory=None, connectivity=None, compute_full_tree='auto', linkage='ward', distance_threshold=None, compute_distances=False):\n self.n_clusters = n_clusters\n self.distance_threshold = distance_threshold\n self.memory = memory\n self.connectivity = connectivity\n self.compute_full_tree = compute_full_tree\n self.linkage = linkage\n self.affinity = affinity\n self.compute_distances = compute_distances" }, { @@ -30728,7 +31184,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -30738,13 +31195,14 @@ "docstring": { "type": "ndarray of shape (n_samples, n_features) or (n_samples, n_samples)", "description": "Training instances to cluster, or distances between instances if\n``affinity='precomputed'``." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Fit without validation", - "docstring": "Fit without validation\n\nParameters\n----------\nX : ndarray of shape (n_samples, n_features) or (n_samples, n_samples)\n Training instances to cluster, or distances between instances if\n ``affinity='precomputed'``.\n\nReturns\n-------\nself : object\n Returns the fitted instance.", + "docstring": "Fit without validation\n\n Parameters\n ----------\n X : ndarray of shape (n_samples, n_features) or (n_samples, n_samples)\n Training instances to cluster, or distances between instances if\n ``affinity='precomputed'``.\n\n Returns\n -------\n self : object\n Returns the fitted instance.\n ", "source_code": "\ndef _fit(self, X):\n \"\"\"Fit without validation\n\n Parameters\n ----------\n X : ndarray of shape (n_samples, n_features) or (n_samples, n_samples)\n Training instances to cluster, or distances between instances if\n ``affinity='precomputed'``.\n\n Returns\n -------\n self : object\n Returns the fitted instance.\n \"\"\"\n memory = check_memory(self.memory)\n if self.n_clusters is not None and self.n_clusters <= 0:\n raise ValueError('n_clusters should be an integer greater than 0. %s was provided.' % str(self.n_clusters))\n if not (self.n_clusters is None) ^ (self.distance_threshold is None):\n raise ValueError('Exactly one of n_clusters and distance_threshold has to be set, and the other needs to be None.')\n if self.distance_threshold is not None and not self.compute_full_tree:\n raise ValueError('compute_full_tree must be True if distance_threshold is set.')\n if self.linkage == 'ward' and self.affinity != 'euclidean':\n raise ValueError('%s was provided as affinity. Ward can only work with euclidean distances.' % (self.affinity, ))\n if self.linkage not in _TREE_BUILDERS:\n raise ValueError('Unknown linkage type %s. 
Valid options are %s' % (self.linkage, _TREE_BUILDERS.keys()))\n tree_builder = _TREE_BUILDERS[self.linkage]\n connectivity = self.connectivity\n if self.connectivity is not None:\n if callable(self.connectivity):\n connectivity = self.connectivity(X)\n connectivity = check_array(connectivity, accept_sparse=['csr', 'coo', 'lil'])\n n_samples = len(X)\n compute_full_tree = self.compute_full_tree\n if self.connectivity is None:\n compute_full_tree = True\n if compute_full_tree == 'auto':\n if self.distance_threshold is not None:\n compute_full_tree = True\n else:\n compute_full_tree = self.n_clusters < max(100, 0.02 * n_samples)\n n_clusters = self.n_clusters\n if compute_full_tree:\n n_clusters = None\n kwargs = {}\n if self.linkage != 'ward':\n kwargs['linkage'] = self.linkage\n kwargs['affinity'] = self.affinity\n distance_threshold = self.distance_threshold\n return_distance = distance_threshold is not None or self.compute_distances\n out = memory.cache(tree_builder)(X, connectivity=connectivity, n_clusters=n_clusters, return_distance=return_distance, **kwargs)\n (self.children_, self.n_connected_components_, self.n_leaves_, parents) = out[:4]\n if return_distance:\n self.distances_ = out[-1]\n if self.distance_threshold is not None:\n self.n_clusters_ = np.count_nonzero(self.distances_ >= distance_threshold) + 1\n else:\n self.n_clusters_ = self.n_clusters\n if compute_full_tree:\n self.labels_ = _hc_cut(self.n_clusters_, self.children_, self.n_leaves_)\n else:\n labels = _hierarchical.hc_get_heads(parents, copy=False)\n labels = np.copy(labels[:n_samples])\n self.labels_ = np.searchsorted(np.unique(labels), labels)\n return self" }, { @@ -30762,7 +31220,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -30772,7 +31231,8 @@ "docstring": { "type": "array-like, shape (n_samples, n_features) or (n_samples, n_samples)", "description": "Training instances to cluster, or distances between instances if\n``affinity='precomputed'``." - } + }, + "refined_type": {} }, { "name": "y", @@ -30782,13 +31242,14 @@ "docstring": { "type": "Ignored", "description": "Not used, present here for API consistency by convention." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Fit the hierarchical clustering from features, or distance matrix.", - "docstring": "Fit the hierarchical clustering from features, or distance matrix.\n\nParameters\n----------\nX : array-like, shape (n_samples, n_features) or (n_samples, n_samples)\n Training instances to cluster, or distances between instances if\n ``affinity='precomputed'``.\n\ny : Ignored\n Not used, present here for API consistency by convention.\n\nReturns\n-------\nself : object\n Returns the fitted instance.", + "docstring": "Fit the hierarchical clustering from features, or distance matrix.\n\n Parameters\n ----------\n X : array-like, shape (n_samples, n_features) or (n_samples, n_samples)\n Training instances to cluster, or distances between instances if\n ``affinity='precomputed'``.\n\n y : Ignored\n Not used, present here for API consistency by convention.\n\n Returns\n -------\n self : object\n Returns the fitted instance.\n ", "source_code": "\ndef fit(self, X, y=None):\n \"\"\"Fit the hierarchical clustering from features, or distance matrix.\n\n Parameters\n ----------\n X : array-like, shape (n_samples, n_features) or (n_samples, n_samples)\n Training instances to cluster, or distances between instances if\n ``affinity='precomputed'``.\n\n y : Ignored\n Not used, present here for API consistency by convention.\n\n Returns\n -------\n self : object\n Returns the fitted instance.\n \"\"\"\n X = self._validate_data(X, ensure_min_samples=2, estimator=self)\n return self._fit(X)" }, { @@ -30806,7 +31267,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -30816,7 +31278,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features) or (n_samples, n_samples)", "description": "Training instances to cluster, or distances between instances if\n``affinity='precomputed'``." - } + }, + "refined_type": {} }, { "name": "y", @@ -30826,13 +31289,14 @@ "docstring": { "type": "Ignored", "description": "Not used, present here for API consistency by convention." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Fit and return the result of each sample's clustering assignment.\n\nIn addition to fitting, this method also return the result of the clustering assignment for each sample in the training set.", - "docstring": "Fit and return the result of each sample's clustering assignment.\n\nIn addition to fitting, this method also return the result of the\nclustering assignment for each sample in the training set.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features) or (n_samples, n_samples)\n Training instances to cluster, or distances between instances if\n ``affinity='precomputed'``.\n\ny : Ignored\n Not used, present here for API consistency by convention.\n\nReturns\n-------\nlabels : ndarray of shape (n_samples,)\n Cluster labels.", + "description": "Fit and return the result of each sample's clustering assignment.\n\nIn addition to fitting, this method also return the result of the\nclustering assignment for each sample in the training set.", + "docstring": "Fit and return the result of each sample's clustering assignment.\n\n In addition to fitting, this method also return the result of the\n clustering assignment for each sample in the training set.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features) or (n_samples, n_samples)\n Training instances to cluster, or distances between instances if\n ``affinity='precomputed'``.\n\n y : Ignored\n Not used, present here for API consistency by convention.\n\n Returns\n -------\n labels : ndarray of shape (n_samples,)\n Cluster labels.\n ", "source_code": "\ndef fit_predict(self, X, y=None):\n \"\"\"Fit and return the result of each sample's clustering assignment.\n\n In addition to fitting, this method also return the result of the\n clustering assignment for each sample in the training set.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features) or (n_samples, n_samples)\n Training instances to cluster, or distances between instances if\n ``affinity='precomputed'``.\n\n y : Ignored\n Not used, present here for API consistency by convention.\n\n Returns\n -------\n labels : ndarray of shape (n_samples,)\n Cluster labels.\n \"\"\"\n return super().fit_predict(X, y)" }, { @@ -30850,7 +31314,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_clusters", @@ -30860,7 +31325,8 @@ "docstring": { "type": "int, default=2", "description": "The number of clusters to find. It must be ``None`` if\n``distance_threshold`` is not ``None``." - } + }, + "refined_type": {} }, { "name": "affinity", @@ -30870,7 +31336,8 @@ "docstring": { "type": "str or callable, default='euclidean'", "description": "Metric used to compute the linkage. Can be \"euclidean\", \"l1\", \"l2\",\n\"manhattan\", \"cosine\", or 'precomputed'.\nIf linkage is \"ward\", only \"euclidean\" is accepted." - } + }, + "refined_type": {} }, { "name": "memory", @@ -30880,7 +31347,8 @@ "docstring": { "type": "str or object with the joblib.Memory interface, default=None", "description": "Used to cache the output of the computation of the tree.\nBy default, no caching is done. If a string is given, it is the\npath to the caching directory." - } + }, + "refined_type": {} }, { "name": "connectivity", @@ -30890,7 +31358,8 @@ "docstring": { "type": "array-like or callable, default=None", "description": "Connectivity matrix. 
Defines for each feature the neighboring\nfeatures following a given structure of the data.\nThis can be a connectivity matrix itself or a callable that transforms\nthe data into a connectivity matrix, such as derived from\n`kneighbors_graph`. Default is `None`, i.e, the\nhierarchical clustering algorithm is unstructured." - } + }, + "refined_type": {} }, { "name": "compute_full_tree", @@ -30900,7 +31369,8 @@ "docstring": { "type": "'auto' or bool, default='auto'", "description": "Stop early the construction of the tree at `n_clusters`. This is useful\nto decrease computation time if the number of clusters is not small\ncompared to the number of features. This option is useful only when\nspecifying a connectivity matrix. Note also that when varying the\nnumber of clusters and using caching, it may be advantageous to compute\nthe full tree. It must be ``True`` if ``distance_threshold`` is not\n``None``. By default `compute_full_tree` is \"auto\", which is equivalent\nto `True` when `distance_threshold` is not `None` or that `n_clusters`\nis inferior to the maximum between 100 or `0.02 * n_samples`.\nOtherwise, \"auto\" is equivalent to `False`." - } + }, + "refined_type": {} }, { "name": "linkage", @@ -30910,6 +31380,10 @@ "docstring": { "type": "{\"ward\", \"complete\", \"average\", \"single\"}, default=\"ward\"", "description": "Which linkage criterion to use. The linkage criterion determines which\ndistance to use between sets of features. The algorithm will merge\nthe pairs of cluster that minimize this criterion.\n\n- \"ward\" minimizes the variance of the clusters being merged.\n- \"complete\" or maximum linkage uses the maximum distances between\n all features of the two sets.\n- \"average\" uses the average of the distances of each feature of\n the two sets.\n- \"single\" uses the minimum of the distances between all features\n of the two sets." + }, + "refined_type": { + "kind": "EnumType", + "values": ["complete", "average", "ward", "single"] } }, { @@ -30920,7 +31394,8 @@ "docstring": { "type": "callable, default=np.mean", "description": "This combines the values of agglomerated features into a single\nvalue, and should accept an array of shape [M, N] and the keyword\nargument `axis=1`, and reduce it to an array of size [M]." - } + }, + "refined_type": {} }, { "name": "distance_threshold", @@ -30930,7 +31405,8 @@ "docstring": { "type": "float, default=None", "description": "The linkage distance threshold above which, clusters will not be\nmerged. If not ``None``, ``n_clusters`` must be ``None`` and\n``compute_full_tree`` must be ``True``.\n\n.. versionadded:: 0.21" - } + }, + "refined_type": {} }, { "name": "compute_distances", @@ -30940,13 +31416,14 @@ "docstring": { "type": "bool, default=False", "description": "Computes distances between clusters even if `distance_threshold` is not\nused. This can be used to make dendrogram visualization, but introduces\na computational and memory overhead.\n\n.. 
versionadded:: 0.24" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, n_clusters=2, *, affinity='euclidean', memory=None, connectivity=None, compute_full_tree='auto', linkage='ward', pooling_func=np.mean, distance_threshold=None, compute_distances=False):\n super().__init__(n_clusters=n_clusters, memory=memory, connectivity=connectivity, compute_full_tree=compute_full_tree, linkage=linkage, affinity=affinity, distance_threshold=distance_threshold, compute_distances=compute_distances)\n self.pooling_func = pooling_func" }, { @@ -30964,7 +31441,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -30974,7 +31452,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "The data." - } + }, + "refined_type": {} }, { "name": "y", @@ -30984,13 +31463,14 @@ "docstring": { "type": "Ignored", "description": "Not used, present here for API consistency by convention." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Fit the hierarchical clustering on the data.", - "docstring": "Fit the hierarchical clustering on the data.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n The data.\n\ny : Ignored\n Not used, present here for API consistency by convention.\n\nReturns\n-------\nself : object\n Returns the transformer.", + "docstring": "Fit the hierarchical clustering on the data.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The data.\n\n y : Ignored\n Not used, present here for API consistency by convention.\n\n Returns\n -------\n self : object\n Returns the transformer.\n ", "source_code": "\ndef fit(self, X, y=None):\n \"\"\"Fit the hierarchical clustering on the data.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The data.\n\n y : Ignored\n Not used, present here for API consistency by convention.\n\n Returns\n -------\n self : object\n Returns the transformer.\n \"\"\"\n X = self._validate_data(X, ensure_min_features=2, estimator=self)\n super()._fit(X.T)\n return self" }, { @@ -31008,7 +31488,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -31027,7 +31508,7 @@ "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _average_linkage(*args, **kwargs):\n kwargs['linkage'] = 'average'\n return linkage_tree(*args, **kwargs)" }, { @@ -31040,7 +31521,7 @@ "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _complete_linkage(*args, **kwargs):\n kwargs['linkage'] = 'complete'\n return linkage_tree(*args, **kwargs)" }, { @@ -31058,7 +31539,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "Feature matrix representing `n_samples` samples to be clustered." - } + }, + "refined_type": {} }, { "name": "connectivity", @@ -31068,7 +31550,8 @@ "docstring": { "type": "sparse matrix, default=None", "description": "Connectivity matrix. Defines for each sample the neighboring samples\nfollowing a given structure of the data. The matrix is assumed to\nbe symmetric and only the upper triangular half is used.\nDefault is `None`, i.e, the Ward algorithm is unstructured." 
- } + }, + "refined_type": {} }, { "name": "affinity", @@ -31078,13 +31561,17 @@ "docstring": { "type": "{\"euclidean\", \"precomputed\"}, default=\"euclidean\"", "description": "Which affinity to use. At the moment `precomputed` and\n``euclidean`` are supported. `euclidean` uses the\nnegative squared Euclidean distance between points." + }, + "refined_type": { + "kind": "EnumType", + "values": ["euclidean", "precomputed"] } } ], "results": [], "is_public": false, - "description": "Fixes the connectivity matrix.\n\nThe different steps are: - copies it - makes it symmetric - converts it to LIL if necessary - completes it if necessary.", - "docstring": "Fixes the connectivity matrix.\n\nThe different steps are:\n\n- copies it\n- makes it symmetric\n- converts it to LIL if necessary\n- completes it if necessary.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n Feature matrix representing `n_samples` samples to be clustered.\n\nconnectivity : sparse matrix, default=None\n Connectivity matrix. Defines for each sample the neighboring samples\n following a given structure of the data. The matrix is assumed to\n be symmetric and only the upper triangular half is used.\n Default is `None`, i.e, the Ward algorithm is unstructured.\n\naffinity : {\"euclidean\", \"precomputed\"}, default=\"euclidean\"\n Which affinity to use. At the moment `precomputed` and\n ``euclidean`` are supported. `euclidean` uses the\n negative squared Euclidean distance between points.\n\nReturns\n-------\nconnectivity : sparse matrix\n The fixed connectivity matrix.\n\nn_connected_components : int\n The number of connected components in the graph.", + "description": "Fixes the connectivity matrix.\n\nThe different steps are:\n\n- copies it\n- makes it symmetric\n- converts it to LIL if necessary\n- completes it if necessary.", + "docstring": "\n Fixes the connectivity matrix.\n\n The different steps are:\n\n - copies it\n - makes it symmetric\n - converts it to LIL if necessary\n - completes it if necessary.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Feature matrix representing `n_samples` samples to be clustered.\n\n connectivity : sparse matrix, default=None\n Connectivity matrix. Defines for each sample the neighboring samples\n following a given structure of the data. The matrix is assumed to\n be symmetric and only the upper triangular half is used.\n Default is `None`, i.e, the Ward algorithm is unstructured.\n\n affinity : {\"euclidean\", \"precomputed\"}, default=\"euclidean\"\n Which affinity to use. At the moment `precomputed` and\n ``euclidean`` are supported. `euclidean` uses the\n negative squared Euclidean distance between points.\n\n Returns\n -------\n connectivity : sparse matrix\n The fixed connectivity matrix.\n\n n_connected_components : int\n The number of connected components in the graph.\n ", "source_code": "\ndef _fix_connectivity(X, connectivity, affinity):\n \"\"\"\n Fixes the connectivity matrix.\n\n The different steps are:\n\n - copies it\n - makes it symmetric\n - converts it to LIL if necessary\n - completes it if necessary.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Feature matrix representing `n_samples` samples to be clustered.\n\n connectivity : sparse matrix, default=None\n Connectivity matrix. Defines for each sample the neighboring samples\n following a given structure of the data. 
The matrix is assumed to\n be symmetric and only the upper triangular half is used.\n Default is `None`, i.e, the Ward algorithm is unstructured.\n\n affinity : {\"euclidean\", \"precomputed\"}, default=\"euclidean\"\n Which affinity to use. At the moment `precomputed` and\n ``euclidean`` are supported. `euclidean` uses the\n negative squared Euclidean distance between points.\n\n Returns\n -------\n connectivity : sparse matrix\n The fixed connectivity matrix.\n\n n_connected_components : int\n The number of connected components in the graph.\n \"\"\"\n n_samples = X.shape[0]\n if connectivity.shape[0] != n_samples or connectivity.shape[1] != n_samples:\n raise ValueError('Wrong shape for connectivity matrix: %s when X is %s' % (connectivity.shape, X.shape))\n connectivity = connectivity + connectivity.T\n if not sparse.isspmatrix_lil(connectivity):\n if not sparse.isspmatrix(connectivity):\n connectivity = sparse.lil_matrix(connectivity)\n else:\n connectivity = connectivity.tolil()\n (n_connected_components, labels) = connected_components(connectivity)\n if n_connected_components > 1:\n warnings.warn('the number of connected components of the connectivity matrix is %d > 1. Completing it to avoid stopping the tree early.' % n_connected_components, stacklevel=2)\n connectivity = _fix_connected_components(X=X, graph=connectivity, n_connected_components=n_connected_components, component_labels=labels, metric=affinity, mode='connectivity')\n return connectivity, n_connected_components" }, { @@ -31102,7 +31589,8 @@ "docstring": { "type": "int or ndarray", "description": "The number of clusters to form." - } + }, + "refined_type": {} }, { "name": "children", @@ -31112,7 +31600,8 @@ "docstring": { "type": "ndarray of shape (n_nodes-1, 2)", "description": "The children of each non-leaf node. Values less than `n_samples`\ncorrespond to leaves of the tree which are the original samples.\nA node `i` greater than or equal to `n_samples` is a non-leaf\nnode and has children `children_[i - n_samples]`. Alternatively\nat the i-th iteration, children[i][0] and children[i][1]\nare merged to form node `n_samples + i`." - } + }, + "refined_type": {} }, { "name": "n_leaves", @@ -31122,13 +31611,14 @@ "docstring": { "type": "int", "description": "Number of leaves of the tree." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Function cutting the ward tree for a given number of clusters.", - "docstring": "Function cutting the ward tree for a given number of clusters.\n\nParameters\n----------\nn_clusters : int or ndarray\n The number of clusters to form.\n\nchildren : ndarray of shape (n_nodes-1, 2)\n The children of each non-leaf node. Values less than `n_samples`\n correspond to leaves of the tree which are the original samples.\n A node `i` greater than or equal to `n_samples` is a non-leaf\n node and has children `children_[i - n_samples]`. Alternatively\n at the i-th iteration, children[i][0] and children[i][1]\n are merged to form node `n_samples + i`.\n\nn_leaves : int\n Number of leaves of the tree.\n\nReturns\n-------\nlabels : array [n_samples]\n Cluster labels for each point.", + "docstring": "Function cutting the ward tree for a given number of clusters.\n\n Parameters\n ----------\n n_clusters : int or ndarray\n The number of clusters to form.\n\n children : ndarray of shape (n_nodes-1, 2)\n The children of each non-leaf node. 
Values less than `n_samples`\n correspond to leaves of the tree which are the original samples.\n A node `i` greater than or equal to `n_samples` is a non-leaf\n node and has children `children_[i - n_samples]`. Alternatively\n at the i-th iteration, children[i][0] and children[i][1]\n are merged to form node `n_samples + i`.\n\n n_leaves : int\n Number of leaves of the tree.\n\n Returns\n -------\n labels : array [n_samples]\n Cluster labels for each point.\n ", "source_code": "\ndef _hc_cut(n_clusters, children, n_leaves):\n \"\"\"Function cutting the ward tree for a given number of clusters.\n\n Parameters\n ----------\n n_clusters : int or ndarray\n The number of clusters to form.\n\n children : ndarray of shape (n_nodes-1, 2)\n The children of each non-leaf node. Values less than `n_samples`\n correspond to leaves of the tree which are the original samples.\n A node `i` greater than or equal to `n_samples` is a non-leaf\n node and has children `children_[i - n_samples]`. Alternatively\n at the i-th iteration, children[i][0] and children[i][1]\n are merged to form node `n_samples + i`.\n\n n_leaves : int\n Number of leaves of the tree.\n\n Returns\n -------\n labels : array [n_samples]\n Cluster labels for each point.\n \"\"\"\n if n_clusters > n_leaves:\n raise ValueError('Cannot extract more clusters than samples: %s clusters where given for a tree with %s leaves.' % (n_clusters, n_leaves))\n nodes = [-(max(children[-1]) + 1)]\n for _ in range(n_clusters - 1):\n these_children = children[-nodes[0] - n_leaves]\n heappush(nodes, -these_children[0])\n heappushpop(nodes, -these_children[1])\n label = np.zeros(n_leaves, dtype=np.intp)\n for (i, node) in enumerate(nodes):\n label[_hierarchical._hc_get_descendent(-node, children, n_leaves)] = i\n return label" }, { @@ -31141,7 +31631,7 @@ "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _single_linkage(*args, **kwargs):\n kwargs['linkage'] = 'single'\n return linkage_tree(*args, **kwargs)" }, { @@ -31159,7 +31649,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_samples", @@ -31169,7 +31660,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_nodes", @@ -31179,7 +31671,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_clusters", @@ -31189,7 +31682,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_connected_components", @@ -31199,7 +31693,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "return_distance", @@ -31209,13 +31704,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Perform single linkage clustering on sparse data via the minimum spanning tree from scipy.sparse.csgraph, then using union-find to label. 
The parent array is then generated by walking through the tree.", - "docstring": "Perform single linkage clustering on sparse data via the minimum\nspanning tree from scipy.sparse.csgraph, then using union-find to label.\nThe parent array is then generated by walking through the tree.", + "description": "Perform single linkage clustering on sparse data via the minimum\nspanning tree from scipy.sparse.csgraph, then using union-find to label.\nThe parent array is then generated by walking through the tree.", + "docstring": "\n Perform single linkage clustering on sparse data via the minimum\n spanning tree from scipy.sparse.csgraph, then using union-find to label.\n The parent array is then generated by walking through the tree.\n ", "source_code": "\ndef _single_linkage_tree(connectivity, n_samples, n_nodes, n_clusters, n_connected_components, return_distance):\n \"\"\"\n Perform single linkage clustering on sparse data via the minimum\n spanning tree from scipy.sparse.csgraph, then using union-find to label.\n The parent array is then generated by walking through the tree.\n \"\"\"\n from scipy.sparse.csgraph import minimum_spanning_tree\n connectivity = connectivity.astype('float64', **_astype_copy_false(connectivity))\n epsilon_value = np.finfo(dtype=connectivity.data.dtype).eps\n connectivity.data[connectivity.data == 0] = epsilon_value\n mst = minimum_spanning_tree(connectivity.tocsr())\n mst = mst.tocoo()\n mst.data[mst.data == epsilon_value] = 0\n mst_array = np.vstack([mst.row, mst.col, mst.data]).T\n mst_array = mst_array[np.argsort(mst_array.T[2], kind='mergesort'), :]\n single_linkage_tree = _hierarchical._single_linkage_label(mst_array)\n children_ = single_linkage_tree[:, :2].astype(int)\n parent = np.arange(n_nodes, dtype=np.intp)\n for (i, (left, right)) in enumerate(children_, n_samples):\n if n_clusters is not None and i >= n_nodes:\n break\n if left < n_nodes:\n parent[left] = i\n if right < n_nodes:\n parent[right] = i\n if return_distance:\n distances = single_linkage_tree[:, 2]\n return children_, n_connected_components, n_samples, parent, distances\n return children_, n_connected_components, n_samples, parent" }, { @@ -31233,7 +31729,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "Feature matrix representing `n_samples` samples to be clustered." - } + }, + "refined_type": {} }, { "name": "connectivity", @@ -31243,7 +31740,8 @@ "docstring": { "type": "sparse matrix, default=None", "description": "Connectivity matrix. Defines for each sample the neighboring samples\nfollowing a given structure of the data. The matrix is assumed to\nbe symmetric and only the upper triangular half is used.\nDefault is `None`, i.e, the Ward algorithm is unstructured." - } + }, + "refined_type": {} }, { "name": "n_clusters", @@ -31253,7 +31751,8 @@ "docstring": { "type": "int, default=None", "description": "Stop early the construction of the tree at `n_clusters`. This is\nuseful to decrease computation time if the number of clusters is\nnot small compared to the number of samples. In this case, the\ncomplete tree is not computed, thus the 'children' output is of\nlimited use, and the 'parents' output should rather be used.\nThis option is valid only when specifying a connectivity matrix." - } + }, + "refined_type": {} }, { "name": "linkage", @@ -31263,6 +31762,10 @@ "docstring": { "type": "{\"average\", \"complete\", \"single\"}, default=\"complete\"", "description": "Which linkage criteria to use. 
The linkage criterion determines which\ndistance to use between sets of observation.\n - \"average\" uses the average of the distances of each observation of\n the two sets.\n - \"complete\" or maximum linkage uses the maximum distances between\n all observations of the two sets.\n - \"single\" uses the minimum of the distances between all\n observations of the two sets." + }, + "refined_type": { + "kind": "EnumType", + "values": ["complete", "average", "single"] } }, { @@ -31272,8 +31775,9 @@ "assigned_by": "POSITION_OR_NAME", "docstring": { "type": "str or callable, default='euclidean'", - "description": "which metric to use. Can be 'euclidean', 'manhattan', or any\ndistance known to paired distance (see metric.pairwise)." - } + "description": "Which metric to use. Can be 'euclidean', 'manhattan', or any\ndistance known to paired distance (see metric.pairwise)." + }, + "refined_type": {} }, { "name": "return_distance", @@ -31282,15 +31786,16 @@ "assigned_by": "POSITION_OR_NAME", "docstring": { "type": "bool, default=False", - "description": "whether or not to return the distances between the clusters." - } + "description": "Whether or not to return the distances between the clusters." + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Linkage agglomerative clustering based on a Feature matrix.\n\nThe inertia matrix uses a Heapq-based representation. This is the structured version, that takes into account some topological structure between samples. Read more in the :ref:`User Guide `.", - "docstring": "Linkage agglomerative clustering based on a Feature matrix.\n\nThe inertia matrix uses a Heapq-based representation.\n\nThis is the structured version, that takes into account some topological\nstructure between samples.\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n Feature matrix representing `n_samples` samples to be clustered.\n\nconnectivity : sparse matrix, default=None\n Connectivity matrix. Defines for each sample the neighboring samples\n following a given structure of the data. The matrix is assumed to\n be symmetric and only the upper triangular half is used.\n Default is `None`, i.e, the Ward algorithm is unstructured.\n\nn_clusters : int, default=None\n Stop early the construction of the tree at `n_clusters`. This is\n useful to decrease computation time if the number of clusters is\n not small compared to the number of samples. In this case, the\n complete tree is not computed, thus the 'children' output is of\n limited use, and the 'parents' output should rather be used.\n This option is valid only when specifying a connectivity matrix.\n\nlinkage : {\"average\", \"complete\", \"single\"}, default=\"complete\"\n Which linkage criteria to use. The linkage criterion determines which\n distance to use between sets of observation.\n - \"average\" uses the average of the distances of each observation of\n the two sets.\n - \"complete\" or maximum linkage uses the maximum distances between\n all observations of the two sets.\n - \"single\" uses the minimum of the distances between all\n observations of the two sets.\n\naffinity : str or callable, default='euclidean'\n which metric to use. 
Can be 'euclidean', 'manhattan', or any\n distance known to paired distance (see metric.pairwise).\n\nreturn_distance : bool, default=False\n whether or not to return the distances between the clusters.\n\nReturns\n-------\nchildren : ndarray of shape (n_nodes-1, 2)\n The children of each non-leaf node. Values less than `n_samples`\n correspond to leaves of the tree which are the original samples.\n A node `i` greater than or equal to `n_samples` is a non-leaf\n node and has children `children_[i - n_samples]`. Alternatively\n at the i-th iteration, children[i][0] and children[i][1]\n are merged to form node `n_samples + i`.\n\nn_connected_components : int\n The number of connected components in the graph.\n\nn_leaves : int\n The number of leaves in the tree.\n\nparents : ndarray of shape (n_nodes, ) or None\n The parent of each node. Only returned when a connectivity matrix\n is specified, elsewhere 'None' is returned.\n\ndistances : ndarray of shape (n_nodes-1,)\n Returned when `return_distance` is set to `True`.\n\n distances[i] refers to the distance between children[i][0] and\n children[i][1] when they are merged.\n\nSee Also\n--------\nward_tree : Hierarchical clustering with ward linkage.", - "source_code": "\ndef linkage_tree(X, connectivity=None, n_clusters=None, linkage='complete', affinity='euclidean', return_distance=False):\n \"\"\"Linkage agglomerative clustering based on a Feature matrix.\n\n The inertia matrix uses a Heapq-based representation.\n\n This is the structured version, that takes into account some topological\n structure between samples.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Feature matrix representing `n_samples` samples to be clustered.\n\n connectivity : sparse matrix, default=None\n Connectivity matrix. Defines for each sample the neighboring samples\n following a given structure of the data. The matrix is assumed to\n be symmetric and only the upper triangular half is used.\n Default is `None`, i.e, the Ward algorithm is unstructured.\n\n n_clusters : int, default=None\n Stop early the construction of the tree at `n_clusters`. This is\n useful to decrease computation time if the number of clusters is\n not small compared to the number of samples. In this case, the\n complete tree is not computed, thus the 'children' output is of\n limited use, and the 'parents' output should rather be used.\n This option is valid only when specifying a connectivity matrix.\n\n linkage : {\"average\", \"complete\", \"single\"}, default=\"complete\"\n Which linkage criteria to use. The linkage criterion determines which\n distance to use between sets of observation.\n - \"average\" uses the average of the distances of each observation of\n the two sets.\n - \"complete\" or maximum linkage uses the maximum distances between\n all observations of the two sets.\n - \"single\" uses the minimum of the distances between all\n observations of the two sets.\n\n affinity : str or callable, default='euclidean'\n which metric to use. Can be 'euclidean', 'manhattan', or any\n distance known to paired distance (see metric.pairwise).\n\n return_distance : bool, default=False\n whether or not to return the distances between the clusters.\n\n Returns\n -------\n children : ndarray of shape (n_nodes-1, 2)\n The children of each non-leaf node. 
Values less than `n_samples`\n correspond to leaves of the tree which are the original samples.\n A node `i` greater than or equal to `n_samples` is a non-leaf\n node and has children `children_[i - n_samples]`. Alternatively\n at the i-th iteration, children[i][0] and children[i][1]\n are merged to form node `n_samples + i`.\n\n n_connected_components : int\n The number of connected components in the graph.\n\n n_leaves : int\n The number of leaves in the tree.\n\n parents : ndarray of shape (n_nodes, ) or None\n The parent of each node. Only returned when a connectivity matrix\n is specified, elsewhere 'None' is returned.\n\n distances : ndarray of shape (n_nodes-1,)\n Returned when `return_distance` is set to `True`.\n\n distances[i] refers to the distance between children[i][0] and\n children[i][1] when they are merged.\n\n See Also\n --------\n ward_tree : Hierarchical clustering with ward linkage.\n \"\"\"\n X = np.asarray(X)\n if X.ndim == 1:\n X = np.reshape(X, (-1, 1))\n (n_samples, n_features) = X.shape\n linkage_choices = {'complete': _hierarchical.max_merge, 'average': _hierarchical.average_merge, 'single': None}\n try:\n join_func = linkage_choices[linkage]\n except KeyError as e:\n raise ValueError('Unknown linkage option, linkage should be one of %s, but %s was given' % (linkage_choices.keys(), linkage)) from e\n if affinity == 'cosine' and np.any(~np.any(X, axis=1)):\n raise ValueError('Cosine affinity cannot be used when X contains zero vectors')\n if connectivity is None:\n from scipy.cluster import hierarchy\n if n_clusters is not None:\n warnings.warn('Partial build of the tree is implemented only for structured clustering (i.e. with explicit connectivity). The algorithm will build the full tree and only retain the lower branches required for the specified number of clusters', stacklevel=2)\n if affinity == 'precomputed':\n if X.shape[0] != X.shape[1]:\n raise ValueError('Distance matrix should be square, Got matrix of shape {X.shape}')\n (i, j) = np.triu_indices(X.shape[0], k=1)\n X = X[i, j]\n elif affinity == 'l2':\n affinity = 'euclidean'\n elif affinity in ('l1', 'manhattan'):\n affinity = 'cityblock'\n elif callable(affinity):\n X = affinity(X)\n (i, j) = np.triu_indices(X.shape[0], k=1)\n X = X[i, j]\n if linkage == 'single' and affinity != 'precomputed' and not callable(affinity) and affinity in METRIC_MAPPING:\n dist_metric = DistanceMetric.get_metric(affinity)\n X = np.ascontiguousarray(X, dtype=np.double)\n mst = _hierarchical.mst_linkage_core(X, dist_metric)\n mst = mst[np.argsort(mst.T[2], kind='mergesort'), :]\n out = _hierarchical.single_linkage_label(mst)\n else:\n out = hierarchy.linkage(X, method=linkage, metric=affinity)\n children_ = out[:, :2].astype(int, copy=False)\n if return_distance:\n distances = out[:, 2]\n return children_, 1, n_samples, None, distances\n return children_, 1, n_samples, None\n (connectivity, n_connected_components) = _fix_connectivity(X, connectivity, affinity=affinity)\n connectivity = connectivity.tocoo()\n diag_mask = connectivity.row != connectivity.col\n connectivity.row = connectivity.row[diag_mask]\n connectivity.col = connectivity.col[diag_mask]\n connectivity.data = connectivity.data[diag_mask]\n del diag_mask\n if affinity == 'precomputed':\n distances = X[connectivity.row, connectivity.col].astype('float64', **_astype_copy_false(X))\n else:\n distances = paired_distances(X[connectivity.row], X[connectivity.col], metric=affinity)\n connectivity.data = distances\n if n_clusters is None:\n n_nodes = 2 * n_samples - 
1\n else:\n assert n_clusters <= n_samples\n n_nodes = 2 * n_samples - n_clusters\n if linkage == 'single':\n return _single_linkage_tree(connectivity, n_samples, n_nodes, n_clusters, n_connected_components, return_distance)\n if return_distance:\n distances = np.empty(n_nodes - n_samples)\n A = np.empty(n_nodes, dtype=object)\n inertia = list()\n connectivity = connectivity.tolil()\n for (ind, (data, row)) in enumerate(zip(connectivity.data, connectivity.rows)):\n A[ind] = IntFloatDict(np.asarray(row, dtype=np.intp), np.asarray(data, dtype=np.float64))\n inertia.extend((_hierarchical.WeightedEdge(d, ind, r) for (r, d) in zip(row, data) if r < ind))\n del connectivity\n heapify(inertia)\n parent = np.arange(n_nodes, dtype=np.intp)\n used_node = np.ones(n_nodes, dtype=np.intp)\n children = []\n for k in range(n_samples, n_nodes):\n while True:\n edge = heappop(inertia)\n if used_node[edge.a] and used_node[edge.b]:\n break\n i = edge.a\n j = edge.b\n if return_distance:\n distances[k - n_samples] = edge.weight\n parent[i] = parent[j] = k\n children.append((i, j))\n n_i = used_node[i]\n n_j = used_node[j]\n used_node[k] = n_i + n_j\n used_node[i] = used_node[j] = False\n coord_col = join_func(A[i], A[j], used_node, n_i, n_j)\n for (col, d) in coord_col:\n A[col].append(k, d)\n heappush(inertia, _hierarchical.WeightedEdge(d, k, col))\n A[k] = coord_col\n A[i] = A[j] = 0\n n_leaves = n_samples\n children = np.array(children)[:, ::-1]\n if return_distance:\n return children, n_connected_components, n_leaves, parent, distances\n return children, n_connected_components, n_leaves, parent" + "description": "Linkage agglomerative clustering based on a Feature matrix.\n\nThe inertia matrix uses a Heapq-based representation.\n\nThis is the structured version, that takes into account some topological\nstructure between samples.\n\nRead more in the :ref:`User Guide `.", + "docstring": "Linkage agglomerative clustering based on a Feature matrix.\n\n The inertia matrix uses a Heapq-based representation.\n\n This is the structured version, that takes into account some topological\n structure between samples.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Feature matrix representing `n_samples` samples to be clustered.\n\n connectivity : sparse matrix, default=None\n Connectivity matrix. Defines for each sample the neighboring samples\n following a given structure of the data. The matrix is assumed to\n be symmetric and only the upper triangular half is used.\n Default is `None`, i.e, the Ward algorithm is unstructured.\n\n n_clusters : int, default=None\n Stop early the construction of the tree at `n_clusters`. This is\n useful to decrease computation time if the number of clusters is\n not small compared to the number of samples. In this case, the\n complete tree is not computed, thus the 'children' output is of\n limited use, and the 'parents' output should rather be used.\n This option is valid only when specifying a connectivity matrix.\n\n linkage : {\"average\", \"complete\", \"single\"}, default=\"complete\"\n Which linkage criteria to use. 
The linkage criterion determines which\n distance to use between sets of observation.\n - \"average\" uses the average of the distances of each observation of\n the two sets.\n - \"complete\" or maximum linkage uses the maximum distances between\n all observations of the two sets.\n - \"single\" uses the minimum of the distances between all\n observations of the two sets.\n\n affinity : str or callable, default='euclidean'\n Which metric to use. Can be 'euclidean', 'manhattan', or any\n distance known to paired distance (see metric.pairwise).\n\n return_distance : bool, default=False\n Whether or not to return the distances between the clusters.\n\n Returns\n -------\n children : ndarray of shape (n_nodes-1, 2)\n The children of each non-leaf node. Values less than `n_samples`\n correspond to leaves of the tree which are the original samples.\n A node `i` greater than or equal to `n_samples` is a non-leaf\n node and has children `children_[i - n_samples]`. Alternatively\n at the i-th iteration, children[i][0] and children[i][1]\n are merged to form node `n_samples + i`.\n\n n_connected_components : int\n The number of connected components in the graph.\n\n n_leaves : int\n The number of leaves in the tree.\n\n parents : ndarray of shape (n_nodes, ) or None\n The parent of each node. Only returned when a connectivity matrix\n is specified, elsewhere 'None' is returned.\n\n distances : ndarray of shape (n_nodes-1,)\n Returned when `return_distance` is set to `True`.\n\n distances[i] refers to the distance between children[i][0] and\n children[i][1] when they are merged.\n\n See Also\n --------\n ward_tree : Hierarchical clustering with ward linkage.\n ", + "source_code": "\ndef linkage_tree(X, connectivity=None, n_clusters=None, linkage='complete', affinity='euclidean', return_distance=False):\n \"\"\"Linkage agglomerative clustering based on a Feature matrix.\n\n The inertia matrix uses a Heapq-based representation.\n\n This is the structured version, that takes into account some topological\n structure between samples.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Feature matrix representing `n_samples` samples to be clustered.\n\n connectivity : sparse matrix, default=None\n Connectivity matrix. Defines for each sample the neighboring samples\n following a given structure of the data. The matrix is assumed to\n be symmetric and only the upper triangular half is used.\n Default is `None`, i.e, the Ward algorithm is unstructured.\n\n n_clusters : int, default=None\n Stop early the construction of the tree at `n_clusters`. This is\n useful to decrease computation time if the number of clusters is\n not small compared to the number of samples. In this case, the\n complete tree is not computed, thus the 'children' output is of\n limited use, and the 'parents' output should rather be used.\n This option is valid only when specifying a connectivity matrix.\n\n linkage : {\"average\", \"complete\", \"single\"}, default=\"complete\"\n Which linkage criteria to use. The linkage criterion determines which\n distance to use between sets of observation.\n - \"average\" uses the average of the distances of each observation of\n the two sets.\n - \"complete\" or maximum linkage uses the maximum distances between\n all observations of the two sets.\n - \"single\" uses the minimum of the distances between all\n observations of the two sets.\n\n affinity : str or callable, default='euclidean'\n Which metric to use. 
Can be 'euclidean', 'manhattan', or any\n distance known to paired distance (see metric.pairwise).\n\n return_distance : bool, default=False\n Whether or not to return the distances between the clusters.\n\n Returns\n -------\n children : ndarray of shape (n_nodes-1, 2)\n The children of each non-leaf node. Values less than `n_samples`\n correspond to leaves of the tree which are the original samples.\n A node `i` greater than or equal to `n_samples` is a non-leaf\n node and has children `children_[i - n_samples]`. Alternatively\n at the i-th iteration, children[i][0] and children[i][1]\n are merged to form node `n_samples + i`.\n\n n_connected_components : int\n The number of connected components in the graph.\n\n n_leaves : int\n The number of leaves in the tree.\n\n parents : ndarray of shape (n_nodes, ) or None\n The parent of each node. Only returned when a connectivity matrix\n is specified, elsewhere 'None' is returned.\n\n distances : ndarray of shape (n_nodes-1,)\n Returned when `return_distance` is set to `True`.\n\n distances[i] refers to the distance between children[i][0] and\n children[i][1] when they are merged.\n\n See Also\n --------\n ward_tree : Hierarchical clustering with ward linkage.\n \"\"\"\n X = np.asarray(X)\n if X.ndim == 1:\n X = np.reshape(X, (-1, 1))\n (n_samples, n_features) = X.shape\n linkage_choices = {'complete': _hierarchical.max_merge, 'average': _hierarchical.average_merge, 'single': None}\n try:\n join_func = linkage_choices[linkage]\n except KeyError as e:\n raise ValueError('Unknown linkage option, linkage should be one of %s, but %s was given' % (linkage_choices.keys(), linkage)) from e\n if affinity == 'cosine' and np.any(~np.any(X, axis=1)):\n raise ValueError('Cosine affinity cannot be used when X contains zero vectors')\n if connectivity is None:\n from scipy.cluster import hierarchy\n if n_clusters is not None:\n warnings.warn('Partial build of the tree is implemented only for structured clustering (i.e. with explicit connectivity). 
The algorithm will build the full tree and only retain the lower branches required for the specified number of clusters', stacklevel=2)\n if affinity == 'precomputed':\n if X.shape[0] != X.shape[1]:\n raise ValueError(f'Distance matrix should be square, got matrix of shape {X.shape}')\n (i, j) = np.triu_indices(X.shape[0], k=1)\n X = X[i, j]\n elif affinity == 'l2':\n affinity = 'euclidean'\n elif affinity in ('l1', 'manhattan'):\n affinity = 'cityblock'\n elif callable(affinity):\n X = affinity(X)\n (i, j) = np.triu_indices(X.shape[0], k=1)\n X = X[i, j]\n if linkage == 'single' and affinity != 'precomputed' and not callable(affinity) and affinity in METRIC_MAPPING:\n dist_metric = DistanceMetric.get_metric(affinity)\n X = np.ascontiguousarray(X, dtype=np.double)\n mst = _hierarchical.mst_linkage_core(X, dist_metric)\n mst = mst[np.argsort(mst.T[2], kind='mergesort'), :]\n out = _hierarchical.single_linkage_label(mst)\n else:\n out = hierarchy.linkage(X, method=linkage, metric=affinity)\n children_ = out[:, :2].astype(int, copy=False)\n if return_distance:\n distances = out[:, 2]\n return children_, 1, n_samples, None, distances\n return children_, 1, n_samples, None\n (connectivity, n_connected_components) = _fix_connectivity(X, connectivity, affinity=affinity)\n connectivity = connectivity.tocoo()\n diag_mask = connectivity.row != connectivity.col\n connectivity.row = connectivity.row[diag_mask]\n connectivity.col = connectivity.col[diag_mask]\n connectivity.data = connectivity.data[diag_mask]\n del diag_mask\n if affinity == 'precomputed':\n distances = X[connectivity.row, connectivity.col].astype('float64', **_astype_copy_false(X))\n else:\n distances = paired_distances(X[connectivity.row], X[connectivity.col], metric=affinity)\n connectivity.data = distances\n if n_clusters is None:\n n_nodes = 2 * n_samples - 1\n else:\n assert n_clusters <= n_samples\n n_nodes = 2 * n_samples - n_clusters\n if linkage == 'single':\n return _single_linkage_tree(connectivity, n_samples, n_nodes, n_clusters, n_connected_components, return_distance)\n if return_distance:\n distances = np.empty(n_nodes - n_samples)\n A = np.empty(n_nodes, dtype=object)\n inertia = list()\n connectivity = connectivity.tolil()\n for (ind, (data, row)) in enumerate(zip(connectivity.data, connectivity.rows)):\n A[ind] = IntFloatDict(np.asarray(row, dtype=np.intp), np.asarray(data, dtype=np.float64))\n inertia.extend((_hierarchical.WeightedEdge(d, ind, r) for (r, d) in zip(row, data) if r < ind))\n del connectivity\n heapify(inertia)\n parent = np.arange(n_nodes, dtype=np.intp)\n used_node = np.ones(n_nodes, dtype=np.intp)\n children = []\n for k in range(n_samples, n_nodes):\n while True:\n edge = heappop(inertia)\n if used_node[edge.a] and used_node[edge.b]:\n break\n i = edge.a\n j = edge.b\n if return_distance:\n distances[k - n_samples] = edge.weight\n parent[i] = parent[j] = k\n children.append((i, j))\n n_i = used_node[i]\n n_j = used_node[j]\n used_node[k] = n_i + n_j\n used_node[i] = used_node[j] = False\n coord_col = join_func(A[i], A[j], used_node, n_i, n_j)\n for (col, d) in coord_col:\n A[col].append(k, d)\n heappush(inertia, _hierarchical.WeightedEdge(d, k, col))\n A[k] = coord_col\n A[i] = A[j] = 0\n n_leaves = n_samples\n children = np.array(children)[:, ::-1]\n if return_distance:\n return children, n_connected_components, n_leaves, parent, distances\n return children, n_connected_components, n_leaves, parent" }, { "name": "ward_tree", @@ -31307,7 +31812,8 @@ "docstring": { "type": "array-like of shape 
(n_samples, n_features)", "description": "Feature matrix representing `n_samples` samples to be clustered." - } + }, + "refined_type": {} }, { "name": "connectivity", @@ -31317,7 +31823,8 @@ "docstring": { "type": "sparse matrix, default=None", "description": "Connectivity matrix. Defines for each sample the neighboring samples\nfollowing a given structure of the data. The matrix is assumed to\nbe symmetric and only the upper triangular half is used.\nDefault is None, i.e, the Ward algorithm is unstructured." - } + }, + "refined_type": {} }, { "name": "n_clusters", @@ -31327,7 +31834,8 @@ "docstring": { "type": "int, default=None", "description": "`n_clusters` should be less than `n_samples`. Stop early the\nconstruction of the tree at `n_clusters.` This is useful to decrease\ncomputation time if the number of clusters is not small compared to the\nnumber of samples. In this case, the complete tree is not computed, thus\nthe 'children' output is of limited use, and the 'parents' output should\nrather be used. This option is valid only when specifying a connectivity\nmatrix." - } + }, + "refined_type": {} }, { "name": "return_distance", @@ -31337,13 +31845,14 @@ "docstring": { "type": "bool, default=False", "description": "If `True`, return the distance between the clusters." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Ward clustering based on a Feature matrix.\n\nRecursively merges the pair of clusters that minimally increases within-cluster variance. The inertia matrix uses a Heapq-based representation. This is the structured version, that takes into account some topological structure between samples. Read more in the :ref:`User Guide `.", - "docstring": "Ward clustering based on a Feature matrix.\n\nRecursively merges the pair of clusters that minimally increases\nwithin-cluster variance.\n\nThe inertia matrix uses a Heapq-based representation.\n\nThis is the structured version, that takes into account some topological\nstructure between samples.\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n Feature matrix representing `n_samples` samples to be clustered.\n\nconnectivity : sparse matrix, default=None\n Connectivity matrix. Defines for each sample the neighboring samples\n following a given structure of the data. The matrix is assumed to\n be symmetric and only the upper triangular half is used.\n Default is None, i.e, the Ward algorithm is unstructured.\n\nn_clusters : int, default=None\n `n_clusters` should be less than `n_samples`. Stop early the\n construction of the tree at `n_clusters.` This is useful to decrease\n computation time if the number of clusters is not small compared to the\n number of samples. In this case, the complete tree is not computed, thus\n the 'children' output is of limited use, and the 'parents' output should\n rather be used. This option is valid only when specifying a connectivity\n matrix.\n\nreturn_distance : bool, default=False\n If `True`, return the distance between the clusters.\n\nReturns\n-------\nchildren : ndarray of shape (n_nodes-1, 2)\n The children of each non-leaf node. Values less than `n_samples`\n correspond to leaves of the tree which are the original samples.\n A node `i` greater than or equal to `n_samples` is a non-leaf\n node and has children `children_[i - n_samples]`. 
Alternatively\n at the i-th iteration, children[i][0] and children[i][1]\n are merged to form node `n_samples + i`.\n\nn_connected_components : int\n The number of connected components in the graph.\n\nn_leaves : int\n The number of leaves in the tree.\n\nparents : ndarray of shape (n_nodes,) or None\n The parent of each node. Only returned when a connectivity matrix\n is specified, elsewhere 'None' is returned.\n\ndistances : ndarray of shape (n_nodes-1,)\n Only returned if `return_distance` is set to `True` (for compatibility).\n The distances between the centers of the nodes. `distances[i]`\n corresponds to a weighted Euclidean distance between\n the nodes `children[i, 1]` and `children[i, 2]`. If the nodes refer to\n leaves of the tree, then `distances[i]` is their unweighted Euclidean\n distance. Distances are updated in the following way\n (from scipy.hierarchy.linkage):\n\n The new entry :math:`d(u,v)` is computed as follows,\n\n .. math::\n\n d(u,v) = \\sqrt{\\frac{|v|+|s|}\n {T}d(v,s)^2\n + \\frac{|v|+|t|}\n {T}d(v,t)^2\n - \\frac{|v|}\n {T}d(s,t)^2}\n\n where :math:`u` is the newly joined cluster consisting of\n clusters :math:`s` and :math:`t`, :math:`v` is an unused\n cluster in the forest, :math:`T=|v|+|s|+|t|`, and\n :math:`|*|` is the cardinality of its argument. This is also\n known as the incremental algorithm.", + "description": "Ward clustering based on a Feature matrix.\n\nRecursively merges the pair of clusters that minimally increases\nwithin-cluster variance.\n\nThe inertia matrix uses a Heapq-based representation.\n\nThis is the structured version, that takes into account some topological\nstructure between samples.\n\nRead more in the :ref:`User Guide `.", + "docstring": "Ward clustering based on a Feature matrix.\n\n Recursively merges the pair of clusters that minimally increases\n within-cluster variance.\n\n The inertia matrix uses a Heapq-based representation.\n\n This is the structured version, that takes into account some topological\n structure between samples.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Feature matrix representing `n_samples` samples to be clustered.\n\n connectivity : sparse matrix, default=None\n Connectivity matrix. Defines for each sample the neighboring samples\n following a given structure of the data. The matrix is assumed to\n be symmetric and only the upper triangular half is used.\n Default is None, i.e, the Ward algorithm is unstructured.\n\n n_clusters : int, default=None\n `n_clusters` should be less than `n_samples`. Stop early the\n construction of the tree at `n_clusters.` This is useful to decrease\n computation time if the number of clusters is not small compared to the\n number of samples. In this case, the complete tree is not computed, thus\n the 'children' output is of limited use, and the 'parents' output should\n rather be used. This option is valid only when specifying a connectivity\n matrix.\n\n return_distance : bool, default=False\n If `True`, return the distance between the clusters.\n\n Returns\n -------\n children : ndarray of shape (n_nodes-1, 2)\n The children of each non-leaf node. Values less than `n_samples`\n correspond to leaves of the tree which are the original samples.\n A node `i` greater than or equal to `n_samples` is a non-leaf\n node and has children `children_[i - n_samples]`. 
Alternatively\n at the i-th iteration, children[i][0] and children[i][1]\n are merged to form node `n_samples + i`.\n\n n_connected_components : int\n The number of connected components in the graph.\n\n n_leaves : int\n The number of leaves in the tree.\n\n parents : ndarray of shape (n_nodes,) or None\n The parent of each node. Only returned when a connectivity matrix\n is specified, elsewhere 'None' is returned.\n\n distances : ndarray of shape (n_nodes-1,)\n Only returned if `return_distance` is set to `True` (for compatibility).\n The distances between the centers of the nodes. `distances[i]`\n corresponds to a weighted Euclidean distance between\n the nodes `children[i, 1]` and `children[i, 2]`. If the nodes refer to\n leaves of the tree, then `distances[i]` is their unweighted Euclidean\n distance. Distances are updated in the following way\n (from scipy.hierarchy.linkage):\n\n The new entry :math:`d(u,v)` is computed as follows,\n\n .. math::\n\n d(u,v) = \\sqrt{\\frac{|v|+|s|}\n {T}d(v,s)^2\n + \\frac{|v|+|t|}\n {T}d(v,t)^2\n - \\frac{|v|}\n {T}d(s,t)^2}\n\n where :math:`u` is the newly joined cluster consisting of\n clusters :math:`s` and :math:`t`, :math:`v` is an unused\n cluster in the forest, :math:`T=|v|+|s|+|t|`, and\n :math:`|*|` is the cardinality of its argument. This is also\n known as the incremental algorithm.\n ", "source_code": "\ndef ward_tree(X, *, connectivity=None, n_clusters=None, return_distance=False):\n \"\"\"Ward clustering based on a Feature matrix.\n\n Recursively merges the pair of clusters that minimally increases\n within-cluster variance.\n\n The inertia matrix uses a Heapq-based representation.\n\n This is the structured version, that takes into account some topological\n structure between samples.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Feature matrix representing `n_samples` samples to be clustered.\n\n connectivity : sparse matrix, default=None\n Connectivity matrix. Defines for each sample the neighboring samples\n following a given structure of the data. The matrix is assumed to\n be symmetric and only the upper triangular half is used.\n Default is None, i.e, the Ward algorithm is unstructured.\n\n n_clusters : int, default=None\n `n_clusters` should be less than `n_samples`. Stop early the\n construction of the tree at `n_clusters.` This is useful to decrease\n computation time if the number of clusters is not small compared to the\n number of samples. In this case, the complete tree is not computed, thus\n the 'children' output is of limited use, and the 'parents' output should\n rather be used. This option is valid only when specifying a connectivity\n matrix.\n\n return_distance : bool, default=False\n If `True`, return the distance between the clusters.\n\n Returns\n -------\n children : ndarray of shape (n_nodes-1, 2)\n The children of each non-leaf node. Values less than `n_samples`\n correspond to leaves of the tree which are the original samples.\n A node `i` greater than or equal to `n_samples` is a non-leaf\n node and has children `children_[i - n_samples]`. Alternatively\n at the i-th iteration, children[i][0] and children[i][1]\n are merged to form node `n_samples + i`.\n\n n_connected_components : int\n The number of connected components in the graph.\n\n n_leaves : int\n The number of leaves in the tree.\n\n parents : ndarray of shape (n_nodes,) or None\n The parent of each node. 
Only returned when a connectivity matrix\n is specified, elsewhere 'None' is returned.\n\n distances : ndarray of shape (n_nodes-1,)\n Only returned if `return_distance` is set to `True` (for compatibility).\n The distances between the centers of the nodes. `distances[i]`\n corresponds to a weighted Euclidean distance between\n the nodes `children[i, 1]` and `children[i, 2]`. If the nodes refer to\n leaves of the tree, then `distances[i]` is their unweighted Euclidean\n distance. Distances are updated in the following way\n (from scipy.hierarchy.linkage):\n\n The new entry :math:`d(u,v)` is computed as follows,\n\n .. math::\n\n d(u,v) = \\sqrt{\\frac{|v|+|s|}\n {T}d(v,s)^2\n + \\frac{|v|+|t|}\n {T}d(v,t)^2\n - \\frac{|v|}\n {T}d(s,t)^2}\n\n where :math:`u` is the newly joined cluster consisting of\n clusters :math:`s` and :math:`t`, :math:`v` is an unused\n cluster in the forest, :math:`T=|v|+|s|+|t|`, and\n :math:`|*|` is the cardinality of its argument. This is also\n known as the incremental algorithm.\n \"\"\"\n X = np.asarray(X)\n if X.ndim == 1:\n X = np.reshape(X, (-1, 1))\n (n_samples, n_features) = X.shape\n if connectivity is None:\n from scipy.cluster import hierarchy\n if n_clusters is not None:\n warnings.warn('Partial build of the tree is implemented only for structured clustering (i.e. with explicit connectivity). The algorithm will build the full tree and only retain the lower branches required for the specified number of clusters', stacklevel=2)\n X = np.require(X, requirements='W')\n out = hierarchy.ward(X)\n children_ = out[:, :2].astype(np.intp)\n if return_distance:\n distances = out[:, 2]\n return children_, 1, n_samples, None, distances\n else:\n return children_, 1, n_samples, None\n (connectivity, n_connected_components) = _fix_connectivity(X, connectivity, affinity='euclidean')\n if n_clusters is None:\n n_nodes = 2 * n_samples - 1\n else:\n if n_clusters > n_samples:\n raise ValueError('Cannot provide more clusters than samples. %i n_clusters was asked, and there are %i samples.' 
% (n_clusters, n_samples))\n n_nodes = 2 * n_samples - n_clusters\n coord_row = []\n coord_col = []\n A = []\n for (ind, row) in enumerate(connectivity.rows):\n A.append(row)\n row = [i for i in row if i < ind]\n coord_row.extend(len(row) * [ind])\n coord_col.extend(row)\n coord_row = np.array(coord_row, dtype=np.intp, order='C')\n coord_col = np.array(coord_col, dtype=np.intp, order='C')\n moments_1 = np.zeros(n_nodes, order='C')\n moments_1[:n_samples] = 1\n moments_2 = np.zeros((n_nodes, n_features), order='C')\n moments_2[:n_samples] = X\n inertia = np.empty(len(coord_row), dtype=np.float64, order='C')\n _hierarchical.compute_ward_dist(moments_1, moments_2, coord_row, coord_col, inertia)\n inertia = list(zip(inertia, coord_row, coord_col))\n heapify(inertia)\n parent = np.arange(n_nodes, dtype=np.intp)\n used_node = np.ones(n_nodes, dtype=bool)\n children = []\n if return_distance:\n distances = np.empty(n_nodes - n_samples)\n not_visited = np.empty(n_nodes, dtype=np.int8, order='C')\n for k in range(n_samples, n_nodes):\n while True:\n (inert, i, j) = heappop(inertia)\n if used_node[i] and used_node[j]:\n break\n (parent[i], parent[j]) = (k, k)\n children.append((i, j))\n used_node[i] = used_node[j] = False\n if return_distance:\n distances[k - n_samples] = inert\n moments_1[k] = moments_1[i] + moments_1[j]\n moments_2[k] = moments_2[i] + moments_2[j]\n coord_col = []\n not_visited.fill(1)\n not_visited[k] = 0\n _hierarchical._get_parents(A[i], coord_col, parent, not_visited)\n _hierarchical._get_parents(A[j], coord_col, parent, not_visited)\n [A[col].append(k) for col in coord_col]\n A.append(coord_col)\n coord_col = np.array(coord_col, dtype=np.intp, order='C')\n coord_row = np.empty(coord_col.shape, dtype=np.intp, order='C')\n coord_row.fill(k)\n n_additions = len(coord_row)\n ini = np.empty(n_additions, dtype=np.float64, order='C')\n _hierarchical.compute_ward_dist(moments_1, moments_2, coord_row, coord_col, ini)\n [heappush(inertia, (ini[idx], k, coord_col[idx])) for idx in range(n_additions)]\n n_leaves = n_samples\n children = [c[::-1] for c in children]\n children = np.array(children)\n if return_distance:\n distances = np.sqrt(2.0 * distances)\n return children, n_connected_components, n_leaves, parent, distances\n else:\n return children, n_connected_components, n_leaves, parent" }, { @@ -31361,7 +31870,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_clusters", @@ -31371,7 +31881,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "svd_method", @@ -31381,7 +31892,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_svd_vecs", @@ -31391,7 +31903,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "mini_batch", @@ -31401,7 +31914,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "init", @@ -31411,7 +31925,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_init", @@ -31421,7 +31936,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "random_state", @@ -31431,13 +31947,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@abstractmethod\ndef __init__(self, n_clusters=3, svd_method='randomized', n_svd_vecs=None, mini_batch=False, 
init='k-means++', n_init=10, random_state=None):\n self.n_clusters = n_clusters\n self.svd_method = svd_method\n self.n_svd_vecs = n_svd_vecs\n self.mini_batch = mini_batch\n self.init = init\n self.n_init = n_init\n self.random_state = random_state" }, { @@ -31455,13 +31972,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _check_parameters(self):\n legal_svd_methods = ('randomized', 'arpack')\n if self.svd_method not in legal_svd_methods:\n raise ValueError(\"Unknown SVD method: '{0}'. svd_method must be one of {1}.\".format(self.svd_method, legal_svd_methods))" }, { @@ -31479,7 +31997,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "data", @@ -31489,7 +32008,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_clusters", @@ -31499,13 +32019,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _k_means(self, data, n_clusters):\n if self.mini_batch:\n model = MiniBatchKMeans(n_clusters, init=self.init, n_init=self.n_init, random_state=self.random_state)\n else:\n model = KMeans(n_clusters, init=self.init, n_init=self.n_init, random_state=self.random_state)\n model.fit(data)\n centroid = model.cluster_centers_\n labels = model.labels_\n return centroid, labels" }, { @@ -31523,13 +32044,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _more_tags(self):\n return {'_xfail_checks': {'check_estimators_dtypes': 'raises nan error', 'check_fit2d_1sample': '_scale_normalize fails', 'check_fit2d_1feature': 'raises apply_along_axis error', 'check_estimator_sparse_data': 'does not fail gracefully', 'check_methods_subset_invariance': 'empty array passed inside', 'check_dont_overwrite_parameters': 'empty array passed inside', 'check_fit2d_predict1d': 'empty array passed inside'}}" }, { @@ -31547,7 +32069,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "array", @@ -31557,7 +32080,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_components", @@ -31567,7 +32091,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_discard", @@ -31577,14 +32102,15 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Returns first `n_components` left and right singular vectors u and v, discarding the first `n_discard`.", - "docstring": "Returns first `n_components` left and right singular\nvectors u and v, discarding the first `n_discard`.", - "source_code": "\ndef _svd(self, array, n_components, n_discard):\n \"\"\"Returns first `n_components` left and right singular\n vectors u and v, discarding the first `n_discard`.\n\n \"\"\"\n if self.svd_method == 'randomized':\n kwargs = {}\n if self.n_svd_vecs is not None:\n kwargs['n_oversamples'] = self.n_svd_vecs\n (u, _, vt) = randomized_svd(array, n_components, random_state=self.random_state, **kwargs)\n elif self.svd_method == 'arpack':\n (u, _, vt) = svds(array, k=n_components, ncv=self.n_svd_vecs)\n if np.any(np.isnan(vt)):\n A = 
safe_sparse_dot(array.T, array)\n random_state = check_random_state(self.random_state)\n v0 = random_state.uniform(-1, 1, A.shape[0])\n (_, v) = eigsh(A, ncv=self.n_svd_vecs, v0=v0)\n vt = v.T\n if np.any(np.isnan(u)):\n A = safe_sparse_dot(array, array.T)\n random_state = check_random_state(self.random_state)\n v0 = random_state.uniform(-1, 1, A.shape[0])\n (_, u) = eigsh(A, ncv=self.n_svd_vecs, v0=v0)\n assert_all_finite(u)\n assert_all_finite(vt)\n u = u[:, n_discard:]\n vt = vt[n_discard:]\n return u, vt.T" + "description": "Returns first `n_components` left and right singular\nvectors u and v, discarding the first `n_discard`.", + "docstring": "Returns first `n_components` left and right singular\n vectors u and v, discarding the first `n_discard`.\n ", + "source_code": "\ndef _svd(self, array, n_components, n_discard):\n \"\"\"Returns first `n_components` left and right singular\n vectors u and v, discarding the first `n_discard`.\n \"\"\"\n if self.svd_method == 'randomized':\n kwargs = {}\n if self.n_svd_vecs is not None:\n kwargs['n_oversamples'] = self.n_svd_vecs\n (u, _, vt) = randomized_svd(array, n_components, random_state=self.random_state, **kwargs)\n elif self.svd_method == 'arpack':\n (u, _, vt) = svds(array, k=n_components, ncv=self.n_svd_vecs)\n if np.any(np.isnan(vt)):\n A = safe_sparse_dot(array.T, array)\n random_state = check_random_state(self.random_state)\n v0 = random_state.uniform(-1, 1, A.shape[0])\n (_, v) = eigsh(A, ncv=self.n_svd_vecs, v0=v0)\n vt = v.T\n if np.any(np.isnan(u)):\n A = safe_sparse_dot(array, array.T)\n random_state = check_random_state(self.random_state)\n v0 = random_state.uniform(-1, 1, A.shape[0])\n (_, u) = eigsh(A, ncv=self.n_svd_vecs, v0=v0)\n assert_all_finite(u)\n assert_all_finite(vt)\n u = u[:, n_discard:]\n vt = vt[n_discard:]\n return u, vt.T" }, { "name": "fit", @@ -31601,7 +32127,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -31611,7 +32138,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "Training data." - } + }, + "refined_type": {} }, { "name": "y", @@ -31621,13 +32149,14 @@ "docstring": { "type": "Ignored", "description": "Not used, present for API consistency by convention." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Create a biclustering for X.", - "docstring": "Create a biclustering for X.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n Training data.\n\ny : Ignored\n Not used, present for API consistency by convention.\n\nReturns\n-------\nself : object\n SpectralBiclustering instance.", + "docstring": "Create a biclustering for X.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training data.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n Returns\n -------\n self : object\n SpectralBiclustering instance.\n ", "source_code": "\ndef fit(self, X, y=None):\n \"\"\"Create a biclustering for X.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training data.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n Returns\n -------\n self : object\n SpectralBiclustering instance.\n \"\"\"\n X = self._validate_data(X, accept_sparse='csr', dtype=np.float64)\n self._check_parameters()\n self._fit(X)\n return self" }, { @@ -31645,7 +32174,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_clusters", @@ -31655,7 +32185,8 @@ "docstring": { "type": "int or tuple (n_row_clusters, n_column_clusters), default=3", "description": "The number of row and column clusters in the checkerboard\nstructure." - } + }, + "refined_type": {} }, { "name": "method", @@ -31665,6 +32196,10 @@ "docstring": { "type": "{'bistochastic', 'scale', 'log'}, default='bistochastic'", "description": "Method of normalizing and converting singular vectors into\nbiclusters. May be one of 'scale', 'bistochastic', or 'log'.\nThe authors recommend using 'log'. If the data is sparse,\nhowever, log normalization will not work, which is why the\ndefault is 'bistochastic'.\n\n.. warning::\n if `method='log'`, the data must be sparse." + }, + "refined_type": { + "kind": "EnumType", + "values": ["bistochastic", "scale", "log"] } }, { @@ -31675,7 +32210,8 @@ "docstring": { "type": "int, default=6", "description": "Number of singular vectors to check." - } + }, + "refined_type": {} }, { "name": "n_best", @@ -31685,7 +32221,8 @@ "docstring": { "type": "int, default=3", "description": "Number of best singular vectors to which to project the data\nfor clustering." - } + }, + "refined_type": {} }, { "name": "svd_method", @@ -31695,6 +32232,10 @@ "docstring": { "type": "{'randomized', 'arpack'}, default='randomized'", "description": "Selects the algorithm for finding singular vectors. May be\n'randomized' or 'arpack'. If 'randomized', uses\n:func:`~sklearn.utils.extmath.randomized_svd`, which may be faster\nfor large matrices. If 'arpack', uses\n`scipy.sparse.linalg.svds`, which is more accurate, but\npossibly slower in some cases." + }, + "refined_type": { + "kind": "EnumType", + "values": ["randomized", "arpack"] } }, { @@ -31705,7 +32246,8 @@ "docstring": { "type": "int, default=None", "description": "Number of vectors to use in calculating the SVD. Corresponds\nto `ncv` when `svd_method=arpack` and `n_oversamples` when\n`svd_method` is 'randomized`." - } + }, + "refined_type": {} }, { "name": "mini_batch", @@ -31715,7 +32257,8 @@ "docstring": { "type": "bool, default=False", "description": "Whether to use mini-batch k-means, which is faster but may get\ndifferent results." 
- } + }, + "refined_type": {} }, { "name": "init", @@ -31725,6 +32268,10 @@ "docstring": { "type": "{'k-means++', 'random'} or ndarray of (n_clusters, n_features), default='k-means++'", "description": "Method for initialization of k-means algorithm; defaults to\n'k-means++'." + }, + "refined_type": { + "kind": "EnumType", + "values": ["random", "k-means++"] } }, { @@ -31735,7 +32282,8 @@ "docstring": { "type": "int, default=10", "description": "Number of random initializations that are tried with the\nk-means algorithm.\n\nIf mini-batch k-means is used, the best initialization is\nchosen and the algorithm runs once. Otherwise, the algorithm\nis run for each initialization and the best solution chosen." - } + }, + "refined_type": {} }, { "name": "random_state", @@ -31745,13 +32293,14 @@ "docstring": { "type": "int, RandomState instance, default=None", "description": "Used for randomizing the singular value decomposition and the k-means\ninitialization. Use an int to make the randomness deterministic.\nSee :term:`Glossary `." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, n_clusters=3, *, method='bistochastic', n_components=6, n_best=3, svd_method='randomized', n_svd_vecs=None, mini_batch=False, init='k-means++', n_init=10, random_state=None):\n super().__init__(n_clusters, svd_method, n_svd_vecs, mini_batch, init, n_init, random_state)\n self.method = method\n self.n_components = n_components\n self.n_best = n_best" }, { @@ -31769,13 +32318,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _check_parameters(self):\n super()._check_parameters()\n legal_methods = ('bistochastic', 'scale', 'log')\n if self.method not in legal_methods:\n raise ValueError(\"Unknown method: '{0}'. method must be one of {1}.\".format(self.method, legal_methods))\n try:\n int(self.n_clusters)\n except TypeError:\n try:\n (r, c) = self.n_clusters\n int(r)\n int(c)\n except (ValueError, TypeError) as e:\n raise ValueError('Incorrect parameter n_clusters has value: {}. 
It should either be a single integer or an iterable with two integers: (n_row_clusters, n_column_clusters)') from e\n if self.n_components < 1:\n raise ValueError('Parameter n_components must be greater than 0, but its value is {}'.format(self.n_components))\n if self.n_best < 1:\n raise ValueError('Parameter n_best must be greater than 0, but its value is {}'.format(self.n_best))\n if self.n_best > self.n_components:\n raise ValueError('n_best cannot be larger than n_components, but {} > {}'.format(self.n_best, self.n_components))" }, { @@ -31793,7 +32343,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -31803,13 +32354,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _fit(self, X):\n n_sv = self.n_components\n if self.method == 'bistochastic':\n normalized_data = _bistochastic_normalize(X)\n n_sv += 1\n elif self.method == 'scale':\n (normalized_data, _, _) = _scale_normalize(X)\n n_sv += 1\n elif self.method == 'log':\n normalized_data = _log_normalize(X)\n n_discard = 0 if self.method == 'log' else 1\n (u, v) = self._svd(normalized_data, n_sv, n_discard)\n ut = u.T\n vt = v.T\n try:\n (n_row_clusters, n_col_clusters) = self.n_clusters\n except TypeError:\n n_row_clusters = n_col_clusters = self.n_clusters\n best_ut = self._fit_best_piecewise(ut, self.n_best, n_row_clusters)\n best_vt = self._fit_best_piecewise(vt, self.n_best, n_col_clusters)\n self.row_labels_ = self._project_and_cluster(X, best_vt.T, n_row_clusters)\n self.column_labels_ = self._project_and_cluster(X.T, best_ut.T, n_col_clusters)\n self.rows_ = np.vstack([self.row_labels_ == label for label in range(n_row_clusters) for _ in range(n_col_clusters)])\n self.columns_ = np.vstack([self.column_labels_ == label for _ in range(n_row_clusters) for label in range(n_col_clusters)])" }, { @@ -31827,7 +32379,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "vectors", @@ -31837,7 +32390,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_best", @@ -31847,7 +32401,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_clusters", @@ -31857,13 +32412,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Find the ``n_best`` vectors that are best approximated by piecewise constant vectors.\n\nThe piecewise vectors are found by k-means; the best is chosen according to Euclidean distance.", - "docstring": "Find the ``n_best`` vectors that are best approximated by piecewise\nconstant vectors.\n\nThe piecewise vectors are found by k-means; the best is chosen\naccording to Euclidean distance.", + "description": "Find the ``n_best`` vectors that are best approximated by piecewise\nconstant vectors.\n\nThe piecewise vectors are found by k-means; the best is chosen\naccording to Euclidean distance.", + "docstring": "Find the ``n_best`` vectors that are best approximated by piecewise\n constant vectors.\n\n The piecewise vectors are found by k-means; the best is chosen\n according to Euclidean distance.\n\n ", "source_code": "\ndef _fit_best_piecewise(self, vectors, n_best, n_clusters):\n \"\"\"Find the ``n_best`` vectors that are best approximated by piecewise\n constant vectors.\n\n The piecewise vectors are found by k-means; the 
best is chosen\n according to Euclidean distance.\n\n \"\"\"\n \n def make_piecewise(v):\n (centroid, labels) = self._k_means(v.reshape(-1, 1), n_clusters)\n return centroid[labels].ravel()\n piecewise_vectors = np.apply_along_axis(make_piecewise, axis=1, arr=vectors)\n dists = np.apply_along_axis(norm, axis=1, arr=vectors - piecewise_vectors)\n result = vectors[np.argsort(dists)[:n_best]]\n return result" }, { @@ -31881,7 +32437,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "data", @@ -31891,7 +32448,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "vectors", @@ -31901,7 +32459,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_clusters", @@ -31911,7 +32470,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -31935,7 +32495,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_clusters", @@ -31945,7 +32506,8 @@ "docstring": { "type": "int, default=3", "description": "The number of biclusters to find." - } + }, + "refined_type": {} }, { "name": "svd_method", @@ -31955,6 +32517,10 @@ "docstring": { "type": "{'randomized', 'arpack'}, default='randomized'", "description": "Selects the algorithm for finding singular vectors. May be\n'randomized' or 'arpack'. If 'randomized', use\n:func:`sklearn.utils.extmath.randomized_svd`, which may be faster\nfor large matrices. If 'arpack', use\n:func:`scipy.sparse.linalg.svds`, which is more accurate, but\npossibly slower in some cases." + }, + "refined_type": { + "kind": "EnumType", + "values": ["randomized", "arpack"] } }, { @@ -31965,7 +32531,8 @@ "docstring": { "type": "int, default=None", "description": "Number of vectors to use in calculating the SVD. Corresponds\nto `ncv` when `svd_method=arpack` and `n_oversamples` when\n`svd_method` is 'randomized`." - } + }, + "refined_type": {} }, { "name": "mini_batch", @@ -31975,7 +32542,8 @@ "docstring": { "type": "bool, default=False", "description": "Whether to use mini-batch k-means, which is faster but may get\ndifferent results." - } + }, + "refined_type": {} }, { "name": "init", @@ -31985,7 +32553,8 @@ "docstring": { "type": "{'k-means++', 'random', or ndarray of shape (n_clusters, n_features), default='k-means++'", "description": "Method for initialization of k-means algorithm; defaults to\n'k-means++'." - } + }, + "refined_type": {} }, { "name": "n_init", @@ -31995,7 +32564,8 @@ "docstring": { "type": "int, default=10", "description": "Number of random initializations that are tried with the\nk-means algorithm.\n\nIf mini-batch k-means is used, the best initialization is\nchosen and the algorithm runs once. Otherwise, the algorithm\nis run for each initialization and the best solution chosen." - } + }, + "refined_type": {} }, { "name": "random_state", @@ -32005,13 +32575,14 @@ "docstring": { "type": "int, RandomState instance, default=None", "description": "Used for randomizing the singular value decomposition and the k-means\ninitialization. Use an int to make the randomness deterministic.\nSee :term:`Glossary `." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, n_clusters=3, *, svd_method='randomized', n_svd_vecs=None, mini_batch=False, init='k-means++', n_init=10, random_state=None):\n super().__init__(n_clusters, svd_method, n_svd_vecs, mini_batch, init, n_init, random_state)" }, { @@ -32029,7 +32600,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -32039,13 +32611,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _fit(self, X):\n (normalized_data, row_diag, col_diag) = _scale_normalize(X)\n n_sv = 1 + int(np.ceil(np.log2(self.n_clusters)))\n (u, v) = self._svd(normalized_data, n_sv, n_discard=1)\n z = np.vstack((row_diag[:, np.newaxis] * u, col_diag[:, np.newaxis] * v))\n (_, labels) = self._k_means(z, self.n_clusters)\n n_rows = X.shape[0]\n self.row_labels_ = labels[:n_rows]\n self.column_labels_ = labels[n_rows:]\n self.rows_ = np.vstack([self.row_labels_ == c for c in range(self.n_clusters)])\n self.columns_ = np.vstack([self.column_labels_ == c for c in range(self.n_clusters)])" }, { @@ -32063,7 +32636,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "max_iter", @@ -32073,7 +32647,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "tol", @@ -32083,14 +32658,15 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Normalize rows and columns of ``X`` simultaneously so that all rows sum to one constant and all columns sum to a different constant.", - "docstring": "Normalize rows and columns of ``X`` simultaneously so that all\nrows sum to one constant and all columns sum to a different\nconstant.", - "source_code": "\ndef _bistochastic_normalize(X, max_iter=1000, tol=1e-05):\n \"\"\"Normalize rows and columns of ``X`` simultaneously so that all\n rows sum to one constant and all columns sum to a different\n constant.\n\n \"\"\"\n X = make_nonnegative(X)\n X_scaled = X\n for _ in range(max_iter):\n (X_new, _, _) = _scale_normalize(X_scaled)\n if issparse(X):\n dist = norm(X_scaled.data - X.data)\n else:\n dist = norm(X_scaled - X_new)\n X_scaled = X_new\n if dist is not None and dist < tol:\n break\n return X_scaled" + "description": "Normalize rows and columns of ``X`` simultaneously so that all\nrows sum to one constant and all columns sum to a different\nconstant.", + "docstring": "Normalize rows and columns of ``X`` simultaneously so that all\n rows sum to one constant and all columns sum to a different\n constant.\n ", + "source_code": "\ndef _bistochastic_normalize(X, max_iter=1000, tol=1e-05):\n \"\"\"Normalize rows and columns of ``X`` simultaneously so that all\n rows sum to one constant and all columns sum to a different\n constant.\n \"\"\"\n X = make_nonnegative(X)\n X_scaled = X\n for _ in range(max_iter):\n (X_new, _, _) = _scale_normalize(X_scaled)\n if issparse(X):\n dist = norm(X_scaled.data - X.data)\n else:\n dist = norm(X_scaled - X_new)\n X_scaled = X_new\n if dist is not None and dist < tol:\n break\n return X_scaled" }, { "name": "_log_normalize", @@ -32107,7 +32683,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -32131,14 +32708,15 @@ 
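For orientation, the hunks above carry the full `SpectralCoclustering` constructor signature (`n_clusters`, `svd_method`, `n_svd_vecs`, `mini_batch`, `init`, `n_init`, `random_state`) and its `_fit` step, which sets `row_labels_` and `column_labels_`. A minimal usage sketch of that documented surface, assuming a small hypothetical non-negative toy matrix that is not part of this metadata, could look like:

import numpy as np
from sklearn.cluster import SpectralCoclustering

# Hypothetical non-negative toy matrix (rows x columns); not taken from the metadata above.
rng = np.random.RandomState(0)
X = rng.rand(20, 30)

model = SpectralCoclustering(n_clusters=3, svd_method="randomized", random_state=0)
model.fit(X)

# Attributes assigned by _fit as shown in the hunk above.
print(model.row_labels_.shape)     # one cluster label per row -> (20,)
print(model.column_labels_.shape)  # one cluster label per column -> (30,)

`SpectralBiclustering`, documented in the preceding hunks, exposes the same fit-then-read-labels pattern with the extra `method`, `n_components`, and `n_best` parameters.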
"docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Normalize ``X`` by scaling rows and columns independently.\n\nReturns the normalized matrix and the row and column scaling factors.", - "docstring": "Normalize ``X`` by scaling rows and columns independently.\n\nReturns the normalized matrix and the row and column scaling\nfactors.", - "source_code": "\ndef _scale_normalize(X):\n \"\"\"Normalize ``X`` by scaling rows and columns independently.\n\n Returns the normalized matrix and the row and column scaling\n factors.\n\n \"\"\"\n X = make_nonnegative(X)\n row_diag = np.asarray(1.0 / np.sqrt(X.sum(axis=1))).squeeze()\n col_diag = np.asarray(1.0 / np.sqrt(X.sum(axis=0))).squeeze()\n row_diag = np.where(np.isnan(row_diag), 0, row_diag)\n col_diag = np.where(np.isnan(col_diag), 0, col_diag)\n if issparse(X):\n (n_rows, n_cols) = X.shape\n r = dia_matrix((row_diag, [0]), shape=(n_rows, n_rows))\n c = dia_matrix((col_diag, [0]), shape=(n_cols, n_cols))\n an = r * X * c\n else:\n an = row_diag[:, np.newaxis] * X * col_diag\n return an, row_diag, col_diag" + "description": "Normalize ``X`` by scaling rows and columns independently.\n\nReturns the normalized matrix and the row and column scaling\nfactors.", + "docstring": "Normalize ``X`` by scaling rows and columns independently.\n\n Returns the normalized matrix and the row and column scaling\n factors.\n ", + "source_code": "\ndef _scale_normalize(X):\n \"\"\"Normalize ``X`` by scaling rows and columns independently.\n\n Returns the normalized matrix and the row and column scaling\n factors.\n \"\"\"\n X = make_nonnegative(X)\n row_diag = np.asarray(1.0 / np.sqrt(X.sum(axis=1))).squeeze()\n col_diag = np.asarray(1.0 / np.sqrt(X.sum(axis=0))).squeeze()\n row_diag = np.where(np.isnan(row_diag), 0, row_diag)\n col_diag = np.where(np.isnan(col_diag), 0, col_diag)\n if issparse(X):\n (n_rows, n_cols) = X.shape\n r = dia_matrix((row_diag, [0]), shape=(n_rows, n_rows))\n c = dia_matrix((col_diag, [0]), shape=(n_cols, n_cols))\n an = r * X * c\n else:\n an = row_diag[:, np.newaxis] * X * col_diag\n return an, row_diag, col_diag" }, { "name": "__init__", @@ -32155,7 +32733,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "threshold", @@ -32165,7 +32744,8 @@ "docstring": { "type": "float, default=0.5", "description": "The radius of the subcluster obtained by merging a new sample and the\nclosest subcluster should be lesser than the threshold. Otherwise a new\nsubcluster is started. Setting this value to be very low promotes\nsplitting and vice-versa." - } + }, + "refined_type": {} }, { "name": "branching_factor", @@ -32175,7 +32755,8 @@ "docstring": { "type": "int, default=50", "description": "Maximum number of CF subclusters in each node. If a new samples enters\nsuch that the number of subclusters exceed the branching_factor then\nthat node is split into two nodes with the subclusters redistributed\nin each. The parent subcluster of that node is removed and two new\nsubclusters are added as parents of the 2 split nodes." 
- } + }, + "refined_type": {} }, { "name": "n_clusters", @@ -32185,7 +32766,8 @@ "docstring": { "type": "int, instance of sklearn.cluster model, default=3", "description": "Number of clusters after the final clustering step, which treats the\nsubclusters from the leaves as new samples.\n\n- `None` : the final clustering step is not performed and the\n subclusters are returned as they are.\n\n- :mod:`sklearn.cluster` Estimator : If a model is provided, the model\n is fit treating the subclusters as new samples and the initial data\n is mapped to the label of the closest subcluster.\n\n- `int` : the model fit is :class:`AgglomerativeClustering` with\n `n_clusters` set to be equal to the int." - } + }, + "refined_type": {} }, { "name": "compute_labels", @@ -32195,7 +32777,8 @@ "docstring": { "type": "bool, default=True", "description": "Whether or not to compute labels for each fit." - } + }, + "refined_type": {} }, { "name": "copy", @@ -32205,13 +32788,14 @@ "docstring": { "type": "bool, default=True", "description": "Whether or not to make a copy of the given data. If set to False,\nthe initial data will be overwritten." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, *, threshold=0.5, branching_factor=50, n_clusters=3, compute_labels=True, copy=True):\n self.threshold = threshold\n self.branching_factor = branching_factor\n self.n_clusters = n_clusters\n self.compute_labels = compute_labels\n self.copy = copy" }, { @@ -32229,7 +32813,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -32239,13 +32824,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _check_fit(self, X):\n check_is_fitted(self)\n if hasattr(self, 'subcluster_centers_') and X.shape[1] != self.subcluster_centers_.shape[1]:\n raise ValueError('Training data and predicted data do not have same number of features.')" }, { @@ -32263,7 +32849,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -32273,7 +32860,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "partial", @@ -32283,13 +32871,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _fit(self, X, partial):\n has_root = getattr(self, 'root_', None)\n first_call = not (partial and has_root)\n X = self._validate_data(X, accept_sparse='csr', copy=self.copy, reset=first_call)\n threshold = self.threshold\n branching_factor = self.branching_factor\n if branching_factor <= 1:\n raise ValueError('Branching_factor should be greater than one.')\n (n_samples, n_features) = X.shape\n if first_call:\n self.root_ = _CFNode(threshold=threshold, branching_factor=branching_factor, is_leaf=True, n_features=n_features)\n self.dummy_leaf_ = _CFNode(threshold=threshold, branching_factor=branching_factor, is_leaf=True, n_features=n_features)\n self.dummy_leaf_.next_leaf_ = self.root_\n self.root_.prev_leaf_ = self.dummy_leaf_\n if not sparse.issparse(X):\n iter_func = iter\n else:\n iter_func = _iterate_sparse_X\n for sample in iter_func(X):\n subcluster = _CFSubcluster(linear_sum=sample)\n split = self.root_.insert_cf_subcluster(subcluster)\n 
if split:\n (new_subcluster1, new_subcluster2) = _split_node(self.root_, threshold, branching_factor)\n del self.root_\n self.root_ = _CFNode(threshold=threshold, branching_factor=branching_factor, is_leaf=False, n_features=n_features)\n self.root_.append_subcluster(new_subcluster1)\n self.root_.append_subcluster(new_subcluster2)\n centroids = np.concatenate([leaf.centroids_ for leaf in self._get_leaves()])\n self.subcluster_centers_ = centroids\n self._global_clustering(X)\n return self" }, { @@ -32307,13 +32896,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Retrieve the leaves of the CF Node.", - "docstring": "Retrieve the leaves of the CF Node.\n\nReturns\n-------\nleaves : list of shape (n_leaves,)\n List of the leaf nodes.", + "docstring": "\n Retrieve the leaves of the CF Node.\n\n Returns\n -------\n leaves : list of shape (n_leaves,)\n List of the leaf nodes.\n ", "source_code": "\ndef _get_leaves(self):\n \"\"\"\n Retrieve the leaves of the CF Node.\n\n Returns\n -------\n leaves : list of shape (n_leaves,)\n List of the leaf nodes.\n \"\"\"\n leaf_ptr = self.dummy_leaf_.next_leaf_\n leaves = []\n while leaf_ptr is not None:\n leaves.append(leaf_ptr)\n leaf_ptr = leaf_ptr.next_leaf_\n return leaves" }, { @@ -32331,7 +32921,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -32341,14 +32932,51 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Global clustering for the subclusters obtained after fitting", - "docstring": "Global clustering for the subclusters obtained after fitting", - "source_code": "\ndef _global_clustering(self, X=None):\n \"\"\"\n Global clustering for the subclusters obtained after fitting\n \"\"\"\n clusterer = self.n_clusters\n centroids = self.subcluster_centers_\n compute_labels = X is not None and self.compute_labels\n not_enough_centroids = False\n if isinstance(clusterer, numbers.Integral):\n clusterer = AgglomerativeClustering(n_clusters=self.n_clusters)\n if len(centroids) < self.n_clusters:\n not_enough_centroids = True\n elif clusterer is not None and not hasattr(clusterer, 'fit_predict'):\n raise ValueError('n_clusters should be an instance of ClusterMixin or an int')\n self._subcluster_norms = row_norms(self.subcluster_centers_, squared=True)\n if clusterer is None or not_enough_centroids:\n self.subcluster_labels_ = np.arange(len(centroids))\n if not_enough_centroids:\n warnings.warn('Number of subclusters found (%d) by BIRCH is less than (%d). Decrease the threshold.' 
% (len(centroids), self.n_clusters), ConvergenceWarning)\n else:\n self.subcluster_labels_ = clusterer.fit_predict(self.subcluster_centers_)\n if compute_labels:\n self.labels_ = self.predict(X)" + "docstring": "\n Global clustering for the subclusters obtained after fitting\n ", + "source_code": "\ndef _global_clustering(self, X=None):\n \"\"\"\n Global clustering for the subclusters obtained after fitting\n \"\"\"\n clusterer = self.n_clusters\n centroids = self.subcluster_centers_\n compute_labels = X is not None and self.compute_labels\n not_enough_centroids = False\n if isinstance(clusterer, numbers.Integral):\n clusterer = AgglomerativeClustering(n_clusters=self.n_clusters)\n if len(centroids) < self.n_clusters:\n not_enough_centroids = True\n elif clusterer is not None and not hasattr(clusterer, 'fit_predict'):\n raise ValueError('n_clusters should be an instance of ClusterMixin or an int')\n self._subcluster_norms = row_norms(self.subcluster_centers_, squared=True)\n if clusterer is None or not_enough_centroids:\n self.subcluster_labels_ = np.arange(len(centroids))\n if not_enough_centroids:\n warnings.warn('Number of subclusters found (%d) by BIRCH is less than (%d). Decrease the threshold.' % (len(centroids), self.n_clusters), ConvergenceWarning)\n else:\n self.subcluster_labels_ = clusterer.fit_predict(self.subcluster_centers_)\n if compute_labels:\n self.labels_ = self._predict(X)" + }, + { + "name": "_predict", + "unique_name": "_predict", + "qname": "sklearn.cluster._birch.Birch._predict", + "unique_qname": "sklearn.cluster._birch.Birch._predict", + "decorators": [], + "parameters": [ + { + "name": "self", + "default_value": null, + "is_public": false, + "assigned_by": "POSITION_OR_NAME", + "docstring": { + "type": "", + "description": "" + }, + "refined_type": {} + }, + { + "name": "X", + "default_value": null, + "is_public": false, + "assigned_by": "POSITION_OR_NAME", + "docstring": { + "type": "", + "description": "" + }, + "refined_type": {} + } + ], + "results": [], + "is_public": false, + "description": "Predict data using the ``centroids_`` of subclusters.", + "docstring": "Predict data using the ``centroids_`` of subclusters.", + "source_code": "\ndef _predict(self, X):\n \"\"\"Predict data using the ``centroids_`` of subclusters.\"\"\"\n kwargs = {'Y_norm_squared': self._subcluster_norms}\n with config_context(assume_finite=True):\n argmin = pairwise_distances_argmin(X, self.subcluster_centers_, metric_kwargs=kwargs)\n return self.subcluster_labels_[argmin]" }, { "name": "fit", @@ -32365,7 +32993,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -32375,6 +33004,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "Input data." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -32385,13 +33018,14 @@ "docstring": { "type": "Ignored", "description": "Not used, present here for API consistency by convention." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Build a CF Tree for the input data.", - "docstring": "Build a CF Tree for the input data.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n Input data.\n\ny : Ignored\n Not used, present here for API consistency by convention.\n\nReturns\n-------\nself\n Fitted estimator.", + "docstring": "\n Build a CF Tree for the input data.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Input data.\n\n y : Ignored\n Not used, present here for API consistency by convention.\n\n Returns\n -------\n self\n Fitted estimator.\n ", "source_code": "\ndef fit(self, X, y=None):\n \"\"\"\n Build a CF Tree for the input data.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Input data.\n\n y : Ignored\n Not used, present here for API consistency by convention.\n\n Returns\n -------\n self\n Fitted estimator.\n \"\"\"\n (self._deprecated_fit, self._deprecated_partial_fit) = (True, False)\n return self._fit(X, partial=False)" }, { @@ -32412,13 +33046,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@deprecated('`fit_` is deprecated in 1.0 and will be removed in 1.2.')\n@property\ndef fit_(self):\n return self._deprecated_fit" }, { @@ -32436,7 +33071,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -32446,6 +33082,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features), default=None", "description": "Input data. If X is not provided, only the global clustering\nstep is done." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -32456,13 +33096,14 @@ "docstring": { "type": "Ignored", "description": "Not used, present here for API consistency by convention." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Online learning. Prevents rebuilding of CFTree from scratch.", - "docstring": "Online learning. Prevents rebuilding of CFTree from scratch.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features), default=None\n Input data. If X is not provided, only the global clustering\n step is done.\n\ny : Ignored\n Not used, present here for API consistency by convention.\n\nReturns\n-------\nself\n Fitted estimator.", + "docstring": "\n Online learning. Prevents rebuilding of CFTree from scratch.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features), default=None\n Input data. If X is not provided, only the global clustering\n step is done.\n\n y : Ignored\n Not used, present here for API consistency by convention.\n\n Returns\n -------\n self\n Fitted estimator.\n ", "source_code": "\ndef partial_fit(self, X=None, y=None):\n \"\"\"\n Online learning. Prevents rebuilding of CFTree from scratch.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features), default=None\n Input data. 
If X is not provided, only the global clustering\n step is done.\n\n y : Ignored\n Not used, present here for API consistency by convention.\n\n Returns\n -------\n self\n Fitted estimator.\n \"\"\"\n (self._deprecated_partial_fit, self._deprecated_fit) = (True, False)\n if X is None:\n self._global_clustering()\n return self\n else:\n return self._fit(X, partial=True)" }, { @@ -32483,13 +33124,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@deprecated('`partial_fit_` is deprecated in 1.0 and will be removed in 1.2.')\n@property\ndef partial_fit_(self):\n return self._deprecated_partial_fit" }, { @@ -32507,7 +33149,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -32517,14 +33160,18 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "Input data." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } } ], "results": [], "is_public": true, "description": "Predict data using the ``centroids_`` of subclusters.\n\nAvoid computation of the row norms of X.", - "docstring": "Predict data using the ``centroids_`` of subclusters.\n\nAvoid computation of the row norms of X.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n Input data.\n\nReturns\n-------\nlabels : ndarray of shape(n_samples,)\n Labelled data.", - "source_code": "\ndef predict(self, X):\n \"\"\"\n Predict data using the ``centroids_`` of subclusters.\n\n Avoid computation of the row norms of X.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Input data.\n\n Returns\n -------\n labels : ndarray of shape(n_samples,)\n Labelled data.\n \"\"\"\n check_is_fitted(self)\n X = self._validate_data(X, accept_sparse='csr', reset=False)\n kwargs = {'Y_norm_squared': self._subcluster_norms}\n with config_context(assume_finite=True):\n argmin = pairwise_distances_argmin(X, self.subcluster_centers_, metric_kwargs=kwargs)\n return self.subcluster_labels_[argmin]" + "docstring": "\n Predict data using the ``centroids_`` of subclusters.\n\n Avoid computation of the row norms of X.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Input data.\n\n Returns\n -------\n labels : ndarray of shape(n_samples,)\n Labelled data.\n ", + "source_code": "\ndef predict(self, X):\n \"\"\"\n Predict data using the ``centroids_`` of subclusters.\n\n Avoid computation of the row norms of X.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Input data.\n\n Returns\n -------\n labels : ndarray of shape(n_samples,)\n Labelled data.\n \"\"\"\n check_is_fitted(self)\n X = self._validate_data(X, accept_sparse='csr', reset=False)\n return self._predict(X)" }, { "name": "transform", @@ -32541,7 +33188,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -32551,13 +33199,17 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "Input data." 
+ }, + "refined_type": { + "kind": "EnumType", + "values": [] } } ], "results": [], "is_public": true, - "description": "Transform X into subcluster centroids dimension.\n\nEach dimension represents the distance from the sample point to each cluster centroid.", - "docstring": "Transform X into subcluster centroids dimension.\n\nEach dimension represents the distance from the sample point to each\ncluster centroid.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n Input data.\n\nReturns\n-------\nX_trans : {array-like, sparse matrix} of shape (n_samples, n_clusters)\n Transformed data.", + "description": "Transform X into subcluster centroids dimension.\n\nEach dimension represents the distance from the sample point to each\ncluster centroid.", + "docstring": "\n Transform X into subcluster centroids dimension.\n\n Each dimension represents the distance from the sample point to each\n cluster centroid.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Input data.\n\n Returns\n -------\n X_trans : {array-like, sparse matrix} of shape (n_samples, n_clusters)\n Transformed data.\n ", "source_code": "\ndef transform(self, X):\n \"\"\"\n Transform X into subcluster centroids dimension.\n\n Each dimension represents the distance from the sample point to each\n cluster centroid.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Input data.\n\n Returns\n -------\n X_trans : {array-like, sparse matrix} of shape (n_samples, n_clusters)\n Transformed data.\n \"\"\"\n check_is_fitted(self)\n self._validate_data(X, accept_sparse='csr', reset=False)\n with config_context(assume_finite=True):\n return euclidean_distances(X, self.subcluster_centers_)" }, { @@ -32575,7 +33227,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "threshold", @@ -32585,7 +33238,8 @@ "docstring": { "type": "float", "description": "Threshold needed for a new subcluster to enter a CFSubcluster." - } + }, + "refined_type": {} }, { "name": "branching_factor", @@ -32595,7 +33249,8 @@ "docstring": { "type": "int", "description": "Maximum number of CF subclusters in each node." - } + }, + "refined_type": {} }, { "name": "is_leaf", @@ -32605,7 +33260,8 @@ "docstring": { "type": "bool", "description": "We need to know if the CFNode is a leaf or not, in order to\nretrieve the final subclusters." - } + }, + "refined_type": {} }, { "name": "n_features", @@ -32615,13 +33271,14 @@ "docstring": { "type": "int", "description": "The number of features." 
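The `Birch` entries above cover the public `fit`, `partial_fit`, `predict`, and `transform` methods plus the `threshold`, `branching_factor`, and `n_clusters` constructor parameters. A minimal sketch of that surface, assuming hypothetical two-blob toy data rather than anything in this metadata, might be:

import numpy as np
from sklearn.cluster import Birch

# Hypothetical toy data: two well-separated blobs; not taken from the metadata above.
rng = np.random.RandomState(42)
X = np.vstack([rng.normal(0, 1, size=(50, 2)),
               rng.normal(10, 1, size=(50, 2))])

brc = Birch(threshold=0.5, branching_factor=50, n_clusters=2)
brc.fit(X)                  # builds the CF tree, then runs the global clustering step

labels = brc.predict(X)     # labels assigned via the subcluster centroids (see predict above)
dists = brc.transform(X)    # distance from each sample to every subcluster centroid
print(labels.shape, dists.shape)

Passing `n_clusters=None` skips the global clustering step, and `partial_fit` can be called repeatedly to grow the CF tree online, as described in the `partial_fit` docstring above.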
- } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, *, threshold, branching_factor, is_leaf, n_features):\n self.threshold = threshold\n self.branching_factor = branching_factor\n self.is_leaf = is_leaf\n self.n_features = n_features\n self.subclusters_ = []\n self.init_centroids_ = np.zeros((branching_factor + 1, n_features))\n self.init_sq_norm_ = np.zeros(branching_factor + 1)\n self.squared_norm_ = []\n self.prev_leaf_ = None\n self.next_leaf_ = None" }, { @@ -32639,7 +33296,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "subcluster", @@ -32649,13 +33307,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef append_subcluster(self, subcluster):\n n_samples = len(self.subclusters_)\n self.subclusters_.append(subcluster)\n self.init_centroids_[n_samples] = subcluster.centroid_\n self.init_sq_norm_[n_samples] = subcluster.sq_norm_\n self.centroids_ = self.init_centroids_[:n_samples + 1, :]\n self.squared_norm_ = self.init_sq_norm_[:n_samples + 1]" }, { @@ -32673,7 +33332,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "subcluster", @@ -32683,7 +33343,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -32707,7 +33368,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "subcluster", @@ -32717,7 +33379,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "new_subcluster1", @@ -32727,7 +33390,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "new_subcluster2", @@ -32737,13 +33401,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Remove a subcluster from a node and update it with the split subclusters.", - "docstring": "Remove a subcluster from a node and update it with the\nsplit subclusters.", + "description": "Remove a subcluster from a node and update it with the\nsplit subclusters.", + "docstring": "Remove a subcluster from a node and update it with the\n split subclusters.\n ", "source_code": "\ndef update_split_subclusters(self, subcluster, new_subcluster1, new_subcluster2):\n \"\"\"Remove a subcluster from a node and update it with the\n split subclusters.\n \"\"\"\n ind = self.subclusters_.index(subcluster)\n self.subclusters_[ind] = new_subcluster1\n self.init_centroids_[ind] = new_subcluster1.centroid_\n self.init_sq_norm_[ind] = new_subcluster1.sq_norm_\n self.append_subcluster(new_subcluster2)" }, { @@ -32761,7 +33426,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "linear_sum", @@ -32771,13 +33437,14 @@ "docstring": { "type": "ndarray of shape (n_features,), default=None", "description": "Sample. This is kept optional to allow initialization of empty\nsubclusters." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, *, linear_sum=None):\n if linear_sum is None:\n self.n_samples_ = 0\n self.squared_sum_ = 0.0\n self.centroid_ = self.linear_sum_ = 0\n else:\n self.n_samples_ = 1\n self.centroid_ = self.linear_sum_ = linear_sum\n self.squared_sum_ = self.sq_norm_ = np.dot(self.linear_sum_, self.linear_sum_)\n self.child_ = None" }, { @@ -32795,7 +33462,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "nominee_cluster", @@ -32805,7 +33473,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "threshold", @@ -32815,13 +33484,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Check if a cluster is worthy enough to be merged. If yes then merge.", - "docstring": "Check if a cluster is worthy enough to be merged. If\nyes then merge.", + "description": "Check if a cluster is worthy enough to be merged. If\nyes then merge.", + "docstring": "Check if a cluster is worthy enough to be merged. If\n yes then merge.\n ", "source_code": "\ndef merge_subcluster(self, nominee_cluster, threshold):\n \"\"\"Check if a cluster is worthy enough to be merged. If\n yes then merge.\n \"\"\"\n new_ss = self.squared_sum_ + nominee_cluster.squared_sum_\n new_ls = self.linear_sum_ + nominee_cluster.linear_sum_\n new_n = self.n_samples_ + nominee_cluster.n_samples_\n new_centroid = 1 / new_n * new_ls\n new_sq_norm = np.dot(new_centroid, new_centroid)\n sq_radius = new_ss / new_n - new_sq_norm\n if sq_radius <= threshold**2:\n (self.n_samples_, self.linear_sum_, self.squared_sum_, self.centroid_, self.sq_norm_) = (new_n, new_ls, new_ss, new_centroid, new_sq_norm)\n return True\n return False" }, { @@ -32839,7 +33509,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -32863,7 +33534,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "subcluster", @@ -32873,13 +33545,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef update(self, subcluster):\n self.n_samples_ += subcluster.n_samples_\n self.linear_sum_ += subcluster.linear_sum_\n self.squared_sum_ += subcluster.squared_sum_\n self.centroid_ = self.linear_sum_ / self.n_samples_\n self.sq_norm_ = np.dot(self.centroid_, self.centroid_)" }, { @@ -32897,13 +33570,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "This little hack returns a densified row when iterating over a sparse matrix, instead of constructing a sparse matrix for every row that is expensive.", - "docstring": "This little hack returns a densified row when iterating over a sparse\nmatrix, instead of constructing a sparse matrix for every row that is\nexpensive.", + "description": "This little hack returns a densified row when iterating over a sparse\nmatrix, instead of constructing a sparse matrix for every row that is\nexpensive.", + "docstring": "This little hack returns a densified row when iterating over a sparse\n matrix, instead of constructing a sparse matrix for every row that is\n expensive.\n ", "source_code": "\ndef _iterate_sparse_X(X):\n \"\"\"This 
little hack returns a densified row when iterating over a sparse\n matrix, instead of constructing a sparse matrix for every row that is\n expensive.\n \"\"\"\n n_samples = X.shape[0]\n X_indices = X.indices\n X_data = X.data\n X_indptr = X.indptr\n for i in range(n_samples):\n row = np.zeros(X.shape[1])\n (startptr, endptr) = (X_indptr[i], X_indptr[i + 1])\n nonzero_indices = X_indices[startptr:endptr]\n row[nonzero_indices] = X_data[startptr:endptr]\n yield row" }, { @@ -32921,7 +33595,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "threshold", @@ -32931,7 +33606,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "branching_factor", @@ -32941,13 +33617,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "The node has to be split if there is no place for a new subcluster in the node. 1. Two empty nodes and two empty subclusters are initialized. 2. The pair of distant subclusters are found. 3. The properties of the empty subclusters and nodes are updated according to the nearest distance between the subclusters to the pair of distant subclusters. 4. The two nodes are set as children to the two subclusters.", - "docstring": "The node has to be split if there is no place for a new subcluster\nin the node.\n1. Two empty nodes and two empty subclusters are initialized.\n2. The pair of distant subclusters are found.\n3. The properties of the empty subclusters and nodes are updated\n according to the nearest distance between the subclusters to the\n pair of distant subclusters.\n4. The two nodes are set as children to the two subclusters.", + "description": "The node has to be split if there is no place for a new subcluster\nin the node.\n1. Two empty nodes and two empty subclusters are initialized.\n2. The pair of distant subclusters are found.\n3. The properties of the empty subclusters and nodes are updated\n according to the nearest distance between the subclusters to the\n pair of distant subclusters.\n4. The two nodes are set as children to the two subclusters.", + "docstring": "The node has to be split if there is no place for a new subcluster\n in the node.\n 1. Two empty nodes and two empty subclusters are initialized.\n 2. The pair of distant subclusters are found.\n 3. The properties of the empty subclusters and nodes are updated\n according to the nearest distance between the subclusters to the\n pair of distant subclusters.\n 4. The two nodes are set as children to the two subclusters.\n ", "source_code": "\ndef _split_node(node, threshold, branching_factor):\n \"\"\"The node has to be split if there is no place for a new subcluster\n in the node.\n 1. Two empty nodes and two empty subclusters are initialized.\n 2. The pair of distant subclusters are found.\n 3. The properties of the empty subclusters and nodes are updated\n according to the nearest distance between the subclusters to the\n pair of distant subclusters.\n 4. 
The two nodes are set as children to the two subclusters.\n \"\"\"\n new_subcluster1 = _CFSubcluster()\n new_subcluster2 = _CFSubcluster()\n new_node1 = _CFNode(threshold=threshold, branching_factor=branching_factor, is_leaf=node.is_leaf, n_features=node.n_features)\n new_node2 = _CFNode(threshold=threshold, branching_factor=branching_factor, is_leaf=node.is_leaf, n_features=node.n_features)\n new_subcluster1.child_ = new_node1\n new_subcluster2.child_ = new_node2\n if node.is_leaf:\n if node.prev_leaf_ is not None:\n node.prev_leaf_.next_leaf_ = new_node1\n new_node1.prev_leaf_ = node.prev_leaf_\n new_node1.next_leaf_ = new_node2\n new_node2.prev_leaf_ = new_node1\n new_node2.next_leaf_ = node.next_leaf_\n if node.next_leaf_ is not None:\n node.next_leaf_.prev_leaf_ = new_node2\n dist = euclidean_distances(node.centroids_, Y_norm_squared=node.squared_norm_, squared=True)\n n_clusters = dist.shape[0]\n farthest_idx = np.unravel_index(dist.argmax(), (n_clusters, n_clusters))\n (node1_dist, node2_dist) = dist[farthest_idx, ]\n node1_closer = node1_dist < node2_dist\n for (idx, subcluster) in enumerate(node.subclusters_):\n if node1_closer[idx]:\n new_node1.append_subcluster(subcluster)\n new_subcluster1.update(subcluster)\n else:\n new_node2.append_subcluster(subcluster)\n new_subcluster2.update(subcluster)\n return new_subcluster1, new_subcluster2" }, { @@ -32965,7 +33642,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "eps", @@ -32975,7 +33653,8 @@ "docstring": { "type": "float, default=0.5", "description": "The maximum distance between two samples for one to be considered\nas in the neighborhood of the other. This is not a maximum bound\non the distances of points within a cluster. This is the most\nimportant DBSCAN parameter to choose appropriately for your data set\nand distance function." - } + }, + "refined_type": {} }, { "name": "min_samples", @@ -32985,7 +33664,8 @@ "docstring": { "type": "int, default=5", "description": "The number of samples (or total weight) in a neighborhood for a point\nto be considered as a core point. This includes the point itself." - } + }, + "refined_type": {} }, { "name": "metric", @@ -32995,7 +33675,8 @@ "docstring": { "type": "str, or callable, default='euclidean'", "description": "The metric to use when calculating distance between instances in a\nfeature array. If metric is a string or callable, it must be one of\nthe options allowed by :func:`sklearn.metrics.pairwise_distances` for\nits metric parameter.\nIf metric is \"precomputed\", X is assumed to be a distance matrix and\nmust be square. X may be a :term:`Glossary `, in which\ncase only \"nonzero\" elements may be considered neighbors for DBSCAN.\n\n.. versionadded:: 0.17\n metric *precomputed* to accept precomputed sparse matrix." - } + }, + "refined_type": {} }, { "name": "metric_params", @@ -33005,7 +33686,8 @@ "docstring": { "type": "dict, default=None", "description": "Additional keyword arguments for the metric function.\n\n.. versionadded:: 0.19" - } + }, + "refined_type": {} }, { "name": "algorithm", @@ -33015,6 +33697,10 @@ "docstring": { "type": "{'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto'", "description": "The algorithm to be used by the NearestNeighbors module\nto compute pointwise distances and find nearest neighbors.\nSee NearestNeighbors module documentation for details." 
+ }, + "refined_type": { + "kind": "EnumType", + "values": ["auto", "kd_tree", "brute", "ball_tree"] } }, { @@ -33025,7 +33711,8 @@ "docstring": { "type": "int, default=30", "description": "Leaf size passed to BallTree or cKDTree. This can affect the speed\nof the construction and query, as well as the memory required\nto store the tree. The optimal value depends\non the nature of the problem." - } + }, + "refined_type": {} }, { "name": "p", @@ -33035,7 +33722,8 @@ "docstring": { "type": "float, default=None", "description": "The power of the Minkowski metric to be used to calculate distance\nbetween points. If None, then ``p=2`` (equivalent to the Euclidean\ndistance)." - } + }, + "refined_type": {} }, { "name": "n_jobs", @@ -33045,13 +33733,14 @@ "docstring": { "type": "int, default=None", "description": "The number of parallel jobs to run.\n``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n``-1`` means using all processors. See :term:`Glossary `\nfor more details." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, eps=0.5, *, min_samples=5, metric='euclidean', metric_params=None, algorithm='auto', leaf_size=30, p=None, n_jobs=None):\n self.eps = eps\n self.min_samples = min_samples\n self.metric = metric\n self.metric_params = metric_params\n self.algorithm = algorithm\n self.leaf_size = leaf_size\n self.p = p\n self.n_jobs = n_jobs" }, { @@ -33069,7 +33758,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -33079,6 +33769,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features), or (n_samples, n_samples)", "description": "Training instances to cluster, or distances between instances if\n``metric='precomputed'``. If a sparse matrix is provided, it will\nbe converted into a sparse ``csr_matrix``." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -33089,7 +33783,8 @@ "docstring": { "type": "Ignored", "description": "Not used, present here for API consistency by convention." - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -33099,13 +33794,14 @@ "docstring": { "type": "array-like of shape (n_samples,), default=None", "description": "Weight of each sample, such that a sample with a weight of at least\n``min_samples`` is by itself a core sample; a sample with a\nnegative weight may inhibit its eps-neighbor from being core.\nNote that weights are absolute, and default to 1." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Perform DBSCAN clustering from features, or distance matrix.", - "docstring": "Perform DBSCAN clustering from features, or distance matrix.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features), or (n_samples, n_samples)\n Training instances to cluster, or distances between instances if\n ``metric='precomputed'``. 
If a sparse matrix is provided, it will\n be converted into a sparse ``csr_matrix``.\n\ny : Ignored\n Not used, present here for API consistency by convention.\n\nsample_weight : array-like of shape (n_samples,), default=None\n Weight of each sample, such that a sample with a weight of at least\n ``min_samples`` is by itself a core sample; a sample with a\n negative weight may inhibit its eps-neighbor from being core.\n Note that weights are absolute, and default to 1.\n\nReturns\n-------\nself : object\n Returns a fitted instance of self.", + "docstring": "Perform DBSCAN clustering from features, or distance matrix.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features), or (n_samples, n_samples)\n Training instances to cluster, or distances between instances if\n ``metric='precomputed'``. If a sparse matrix is provided, it will\n be converted into a sparse ``csr_matrix``.\n\n y : Ignored\n Not used, present here for API consistency by convention.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Weight of each sample, such that a sample with a weight of at least\n ``min_samples`` is by itself a core sample; a sample with a\n negative weight may inhibit its eps-neighbor from being core.\n Note that weights are absolute, and default to 1.\n\n Returns\n -------\n self : object\n Returns a fitted instance of self.\n ", "source_code": "\ndef fit(self, X, y=None, sample_weight=None):\n \"\"\"Perform DBSCAN clustering from features, or distance matrix.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features), or (n_samples, n_samples)\n Training instances to cluster, or distances between instances if\n ``metric='precomputed'``. If a sparse matrix is provided, it will\n be converted into a sparse ``csr_matrix``.\n\n y : Ignored\n Not used, present here for API consistency by convention.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Weight of each sample, such that a sample with a weight of at least\n ``min_samples`` is by itself a core sample; a sample with a\n negative weight may inhibit its eps-neighbor from being core.\n Note that weights are absolute, and default to 1.\n\n Returns\n -------\n self : object\n Returns a fitted instance of self.\n \"\"\"\n X = self._validate_data(X, accept_sparse='csr')\n if not self.eps > 0.0:\n raise ValueError('eps must be positive.')\n if sample_weight is not None:\n sample_weight = _check_sample_weight(sample_weight, X)\n if self.metric == 'precomputed' and sparse.issparse(X):\n with warnings.catch_warnings():\n warnings.simplefilter('ignore', sparse.SparseEfficiencyWarning)\n X.setdiag(X.diagonal())\n neighbors_model = NearestNeighbors(radius=self.eps, algorithm=self.algorithm, leaf_size=self.leaf_size, metric=self.metric, metric_params=self.metric_params, p=self.p, n_jobs=self.n_jobs)\n neighbors_model.fit(X)\n neighborhoods = neighbors_model.radius_neighbors(X, return_distance=False)\n if sample_weight is None:\n n_neighbors = np.array([len(neighbors) for neighbors in neighborhoods])\n else:\n n_neighbors = np.array([np.sum(sample_weight[neighbors]) for neighbors in neighborhoods])\n labels = np.full(X.shape[0], -1, dtype=np.intp)\n core_samples = np.asarray(n_neighbors >= self.min_samples, dtype=np.uint8)\n dbscan_inner(core_samples, neighborhoods, labels)\n self.core_sample_indices_ = np.where(core_samples)[0]\n self.labels_ = labels\n if len(self.core_sample_indices_):\n self.components_ = X[self.core_sample_indices_].copy()\n 
else:\n self.components_ = np.empty((0, X.shape[1]))\n return self" }, { @@ -33123,7 +33819,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -33133,6 +33830,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features), or (n_samples, n_samples)", "description": "Training instances to cluster, or distances between instances if\n``metric='precomputed'``. If a sparse matrix is provided, it will\nbe converted into a sparse ``csr_matrix``." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -33143,7 +33844,8 @@ "docstring": { "type": "Ignored", "description": "Not used, present here for API consistency by convention." - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -33153,13 +33855,14 @@ "docstring": { "type": "array-like of shape (n_samples,), default=None", "description": "Weight of each sample, such that a sample with a weight of at least\n``min_samples`` is by itself a core sample; a sample with a\nnegative weight may inhibit its eps-neighbor from being core.\nNote that weights are absolute, and default to 1." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Compute clusters from a data or distance matrix and predict labels.", - "docstring": "Compute clusters from a data or distance matrix and predict labels.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features), or (n_samples, n_samples)\n Training instances to cluster, or distances between instances if\n ``metric='precomputed'``. If a sparse matrix is provided, it will\n be converted into a sparse ``csr_matrix``.\n\ny : Ignored\n Not used, present here for API consistency by convention.\n\nsample_weight : array-like of shape (n_samples,), default=None\n Weight of each sample, such that a sample with a weight of at least\n ``min_samples`` is by itself a core sample; a sample with a\n negative weight may inhibit its eps-neighbor from being core.\n Note that weights are absolute, and default to 1.\n\nReturns\n-------\nlabels : ndarray of shape (n_samples,)\n Cluster labels. Noisy samples are given the label -1.", + "docstring": "Compute clusters from a data or distance matrix and predict labels.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features), or (n_samples, n_samples)\n Training instances to cluster, or distances between instances if\n ``metric='precomputed'``. If a sparse matrix is provided, it will\n be converted into a sparse ``csr_matrix``.\n\n y : Ignored\n Not used, present here for API consistency by convention.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Weight of each sample, such that a sample with a weight of at least\n ``min_samples`` is by itself a core sample; a sample with a\n negative weight may inhibit its eps-neighbor from being core.\n Note that weights are absolute, and default to 1.\n\n Returns\n -------\n labels : ndarray of shape (n_samples,)\n Cluster labels. Noisy samples are given the label -1.\n ", "source_code": "\ndef fit_predict(self, X, y=None, sample_weight=None):\n \"\"\"Compute clusters from a data or distance matrix and predict labels.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features), or (n_samples, n_samples)\n Training instances to cluster, or distances between instances if\n ``metric='precomputed'``. 
If a sparse matrix is provided, it will\n be converted into a sparse ``csr_matrix``.\n\n y : Ignored\n Not used, present here for API consistency by convention.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Weight of each sample, such that a sample with a weight of at least\n ``min_samples`` is by itself a core sample; a sample with a\n negative weight may inhibit its eps-neighbor from being core.\n Note that weights are absolute, and default to 1.\n\n Returns\n -------\n labels : ndarray of shape (n_samples,)\n Cluster labels. Noisy samples are given the label -1.\n \"\"\"\n self.fit(X, sample_weight=sample_weight)\n return self.labels_" }, { @@ -33177,6 +33880,10 @@ "docstring": { "type": "{array-like, sparse (CSR) matrix} of shape (n_samples, n_features) or (n_samples, n_samples)", "description": "A feature array, or array of distances between samples if\n``metric='precomputed'``." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -33187,7 +33894,8 @@ "docstring": { "type": "float, default=0.5", "description": "The maximum distance between two samples for one to be considered\nas in the neighborhood of the other. This is not a maximum bound\non the distances of points within a cluster. This is the most\nimportant DBSCAN parameter to choose appropriately for your data set\nand distance function." - } + }, + "refined_type": {} }, { "name": "min_samples", @@ -33197,7 +33905,8 @@ "docstring": { "type": "int, default=5", "description": "The number of samples (or total weight) in a neighborhood for a point\nto be considered as a core point. This includes the point itself." - } + }, + "refined_type": {} }, { "name": "metric", @@ -33207,7 +33916,8 @@ "docstring": { "type": "str or callable, default='minkowski'", "description": "The metric to use when calculating distance between instances in a\nfeature array. If metric is a string or callable, it must be one of\nthe options allowed by :func:`sklearn.metrics.pairwise_distances` for\nits metric parameter.\nIf metric is \"precomputed\", X is assumed to be a distance matrix and\nmust be square during fit.\nX may be a :term:`sparse graph `,\nin which case only \"nonzero\" elements may be considered neighbors." - } + }, + "refined_type": {} }, { "name": "metric_params", @@ -33217,7 +33927,8 @@ "docstring": { "type": "dict, default=None", "description": "Additional keyword arguments for the metric function.\n\n.. versionadded:: 0.19" - } + }, + "refined_type": {} }, { "name": "algorithm", @@ -33227,6 +33938,10 @@ "docstring": { "type": "{'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto'", "description": "The algorithm to be used by the NearestNeighbors module\nto compute pointwise distances and find nearest neighbors.\nSee NearestNeighbors module documentation for details." + }, + "refined_type": { + "kind": "EnumType", + "values": ["auto", "kd_tree", "brute", "ball_tree"] } }, { @@ -33237,7 +33952,8 @@ "docstring": { "type": "int, default=30", "description": "Leaf size passed to BallTree or cKDTree. This can affect the speed\nof the construction and query, as well as the memory required\nto store the tree. The optimal value depends\non the nature of the problem." - } + }, + "refined_type": {} }, { "name": "p", @@ -33247,7 +33963,8 @@ "docstring": { "type": "float, default=2", "description": "The power of the Minkowski metric to be used to calculate distance\nbetween points." 
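The `DBSCAN` entries above document `fit` and `fit_predict` together with `eps`, `min_samples`, and `metric`, and show that `fit` populates `labels_` and `core_sample_indices_`. A minimal sketch of that estimator API, assuming hypothetical toy data not present in this metadata, could be:

import numpy as np
from sklearn.cluster import DBSCAN

# Hypothetical toy data with two dense blobs; not taken from the metadata above.
rng = np.random.RandomState(0)
X = np.vstack([rng.normal(0, 0.2, size=(40, 2)),
               rng.normal(5, 0.2, size=(40, 2))])

db = DBSCAN(eps=0.5, min_samples=5, metric="euclidean")
labels = db.fit_predict(X)            # cluster labels; noise samples get -1

print(np.unique(labels))              # e.g. [0 1], plus -1 if any noise is found
print(db.core_sample_indices_[:5])    # indices of core samples found during fit

The module-level `dbscan` function documented in the following hunks offers the same algorithm as a one-shot call returning the core-sample indices and the label array directly.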
- } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -33257,7 +33974,8 @@ "docstring": { "type": "array-like of shape (n_samples,), default=None", "description": "Weight of each sample, such that a sample with a weight of at least\n``min_samples`` is by itself a core sample; a sample with negative\nweight may inhibit its eps-neighbor from being core.\nNote that weights are absolute, and default to 1." - } + }, + "refined_type": {} }, { "name": "n_jobs", @@ -33267,13 +33985,14 @@ "docstring": { "type": "int, default=None", "description": "The number of parallel jobs to run for neighbors search. ``None`` means\n1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means\nusing all processors. See :term:`Glossary ` for more details.\nIf precomputed distance are used, parallel execution is not available\nand thus n_jobs will have no effect." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Perform DBSCAN clustering from vector array or distance matrix.\n\nRead more in the :ref:`User Guide `.", - "docstring": "Perform DBSCAN clustering from vector array or distance matrix.\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\nX : {array-like, sparse (CSR) matrix} of shape (n_samples, n_features) or (n_samples, n_samples)\n A feature array, or array of distances between samples if\n ``metric='precomputed'``.\n\neps : float, default=0.5\n The maximum distance between two samples for one to be considered\n as in the neighborhood of the other. This is not a maximum bound\n on the distances of points within a cluster. This is the most\n important DBSCAN parameter to choose appropriately for your data set\n and distance function.\n\nmin_samples : int, default=5\n The number of samples (or total weight) in a neighborhood for a point\n to be considered as a core point. This includes the point itself.\n\nmetric : str or callable, default='minkowski'\n The metric to use when calculating distance between instances in a\n feature array. If metric is a string or callable, it must be one of\n the options allowed by :func:`sklearn.metrics.pairwise_distances` for\n its metric parameter.\n If metric is \"precomputed\", X is assumed to be a distance matrix and\n must be square during fit.\n X may be a :term:`sparse graph `,\n in which case only \"nonzero\" elements may be considered neighbors.\n\nmetric_params : dict, default=None\n Additional keyword arguments for the metric function.\n\n .. versionadded:: 0.19\n\nalgorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto'\n The algorithm to be used by the NearestNeighbors module\n to compute pointwise distances and find nearest neighbors.\n See NearestNeighbors module documentation for details.\n\nleaf_size : int, default=30\n Leaf size passed to BallTree or cKDTree. This can affect the speed\n of the construction and query, as well as the memory required\n to store the tree. The optimal value depends\n on the nature of the problem.\n\np : float, default=2\n The power of the Minkowski metric to be used to calculate distance\n between points.\n\nsample_weight : array-like of shape (n_samples,), default=None\n Weight of each sample, such that a sample with a weight of at least\n ``min_samples`` is by itself a core sample; a sample with negative\n weight may inhibit its eps-neighbor from being core.\n Note that weights are absolute, and default to 1.\n\nn_jobs : int, default=None\n The number of parallel jobs to run for neighbors search. 
``None`` means\n 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means\n using all processors. See :term:`Glossary ` for more details.\n If precomputed distance are used, parallel execution is not available\n and thus n_jobs will have no effect.\n\nReturns\n-------\ncore_samples : ndarray of shape (n_core_samples,)\n Indices of core samples.\n\nlabels : ndarray of shape (n_samples,)\n Cluster labels for each point. Noisy samples are given the label -1.\n\nSee Also\n--------\nDBSCAN : An estimator interface for this clustering algorithm.\nOPTICS : A similar estimator interface clustering at multiple values of\n eps. Our implementation is optimized for memory usage.\n\nNotes\n-----\nFor an example, see :ref:`examples/cluster/plot_dbscan.py\n`.\n\nThis implementation bulk-computes all neighborhood queries, which increases\nthe memory complexity to O(n.d) where d is the average number of neighbors,\nwhile original DBSCAN had memory complexity O(n). It may attract a higher\nmemory complexity when querying these nearest neighborhoods, depending\non the ``algorithm``.\n\nOne way to avoid the query complexity is to pre-compute sparse\nneighborhoods in chunks using\n:func:`NearestNeighbors.radius_neighbors_graph\n` with\n``mode='distance'``, then using ``metric='precomputed'`` here.\n\nAnother way to reduce memory and computation time is to remove\n(near-)duplicate points and use ``sample_weight`` instead.\n\n:func:`cluster.optics ` provides a similar\nclustering with lower memory usage.\n\nReferences\n----------\nEster, M., H. P. Kriegel, J. Sander, and X. Xu, \"A Density-Based\nAlgorithm for Discovering Clusters in Large Spatial Databases with Noise\".\nIn: Proceedings of the 2nd International Conference on Knowledge Discovery\nand Data Mining, Portland, OR, AAAI Press, pp. 226-231. 1996\n\nSchubert, E., Sander, J., Ester, M., Kriegel, H. P., & Xu, X. (2017).\nDBSCAN revisited, revisited: why and how you should (still) use DBSCAN.\nACM Transactions on Database Systems (TODS), 42(3), 19.", + "docstring": "Perform DBSCAN clustering from vector array or distance matrix.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n X : {array-like, sparse (CSR) matrix} of shape (n_samples, n_features) or (n_samples, n_samples)\n A feature array, or array of distances between samples if\n ``metric='precomputed'``.\n\n eps : float, default=0.5\n The maximum distance between two samples for one to be considered\n as in the neighborhood of the other. This is not a maximum bound\n on the distances of points within a cluster. This is the most\n important DBSCAN parameter to choose appropriately for your data set\n and distance function.\n\n min_samples : int, default=5\n The number of samples (or total weight) in a neighborhood for a point\n to be considered as a core point. This includes the point itself.\n\n metric : str or callable, default='minkowski'\n The metric to use when calculating distance between instances in a\n feature array. If metric is a string or callable, it must be one of\n the options allowed by :func:`sklearn.metrics.pairwise_distances` for\n its metric parameter.\n If metric is \"precomputed\", X is assumed to be a distance matrix and\n must be square during fit.\n X may be a :term:`sparse graph `,\n in which case only \"nonzero\" elements may be considered neighbors.\n\n metric_params : dict, default=None\n Additional keyword arguments for the metric function.\n\n .. 
versionadded:: 0.19\n\n algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto'\n The algorithm to be used by the NearestNeighbors module\n to compute pointwise distances and find nearest neighbors.\n See NearestNeighbors module documentation for details.\n\n leaf_size : int, default=30\n Leaf size passed to BallTree or cKDTree. This can affect the speed\n of the construction and query, as well as the memory required\n to store the tree. The optimal value depends\n on the nature of the problem.\n\n p : float, default=2\n The power of the Minkowski metric to be used to calculate distance\n between points.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Weight of each sample, such that a sample with a weight of at least\n ``min_samples`` is by itself a core sample; a sample with negative\n weight may inhibit its eps-neighbor from being core.\n Note that weights are absolute, and default to 1.\n\n n_jobs : int, default=None\n The number of parallel jobs to run for neighbors search. ``None`` means\n 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means\n using all processors. See :term:`Glossary ` for more details.\n If precomputed distance are used, parallel execution is not available\n and thus n_jobs will have no effect.\n\n Returns\n -------\n core_samples : ndarray of shape (n_core_samples,)\n Indices of core samples.\n\n labels : ndarray of shape (n_samples,)\n Cluster labels for each point. Noisy samples are given the label -1.\n\n See Also\n --------\n DBSCAN : An estimator interface for this clustering algorithm.\n OPTICS : A similar estimator interface clustering at multiple values of\n eps. Our implementation is optimized for memory usage.\n\n Notes\n -----\n For an example, see :ref:`examples/cluster/plot_dbscan.py\n `.\n\n This implementation bulk-computes all neighborhood queries, which increases\n the memory complexity to O(n.d) where d is the average number of neighbors,\n while original DBSCAN had memory complexity O(n). It may attract a higher\n memory complexity when querying these nearest neighborhoods, depending\n on the ``algorithm``.\n\n One way to avoid the query complexity is to pre-compute sparse\n neighborhoods in chunks using\n :func:`NearestNeighbors.radius_neighbors_graph\n ` with\n ``mode='distance'``, then using ``metric='precomputed'`` here.\n\n Another way to reduce memory and computation time is to remove\n (near-)duplicate points and use ``sample_weight`` instead.\n\n :func:`cluster.optics ` provides a similar\n clustering with lower memory usage.\n\n References\n ----------\n Ester, M., H. P. Kriegel, J. Sander, and X. Xu, \"A Density-Based\n Algorithm for Discovering Clusters in Large Spatial Databases with Noise\".\n In: Proceedings of the 2nd International Conference on Knowledge Discovery\n and Data Mining, Portland, OR, AAAI Press, pp. 226-231. 1996\n\n Schubert, E., Sander, J., Ester, M., Kriegel, H. P., & Xu, X. 
(2017).\n DBSCAN revisited, revisited: why and how you should (still) use DBSCAN.\n ACM Transactions on Database Systems (TODS), 42(3), 19.\n ", "source_code": "\ndef dbscan(X, eps=0.5, *, min_samples=5, metric='minkowski', metric_params=None, algorithm='auto', leaf_size=30, p=2, sample_weight=None, n_jobs=None):\n \"\"\"Perform DBSCAN clustering from vector array or distance matrix.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n X : {array-like, sparse (CSR) matrix} of shape (n_samples, n_features) or (n_samples, n_samples)\n A feature array, or array of distances between samples if\n ``metric='precomputed'``.\n\n eps : float, default=0.5\n The maximum distance between two samples for one to be considered\n as in the neighborhood of the other. This is not a maximum bound\n on the distances of points within a cluster. This is the most\n important DBSCAN parameter to choose appropriately for your data set\n and distance function.\n\n min_samples : int, default=5\n The number of samples (or total weight) in a neighborhood for a point\n to be considered as a core point. This includes the point itself.\n\n metric : str or callable, default='minkowski'\n The metric to use when calculating distance between instances in a\n feature array. If metric is a string or callable, it must be one of\n the options allowed by :func:`sklearn.metrics.pairwise_distances` for\n its metric parameter.\n If metric is \"precomputed\", X is assumed to be a distance matrix and\n must be square during fit.\n X may be a :term:`sparse graph `,\n in which case only \"nonzero\" elements may be considered neighbors.\n\n metric_params : dict, default=None\n Additional keyword arguments for the metric function.\n\n .. versionadded:: 0.19\n\n algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto'\n The algorithm to be used by the NearestNeighbors module\n to compute pointwise distances and find nearest neighbors.\n See NearestNeighbors module documentation for details.\n\n leaf_size : int, default=30\n Leaf size passed to BallTree or cKDTree. This can affect the speed\n of the construction and query, as well as the memory required\n to store the tree. The optimal value depends\n on the nature of the problem.\n\n p : float, default=2\n The power of the Minkowski metric to be used to calculate distance\n between points.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Weight of each sample, such that a sample with a weight of at least\n ``min_samples`` is by itself a core sample; a sample with negative\n weight may inhibit its eps-neighbor from being core.\n Note that weights are absolute, and default to 1.\n\n n_jobs : int, default=None\n The number of parallel jobs to run for neighbors search. ``None`` means\n 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means\n using all processors. See :term:`Glossary ` for more details.\n If precomputed distance are used, parallel execution is not available\n and thus n_jobs will have no effect.\n\n Returns\n -------\n core_samples : ndarray of shape (n_core_samples,)\n Indices of core samples.\n\n labels : ndarray of shape (n_samples,)\n Cluster labels for each point. Noisy samples are given the label -1.\n\n See Also\n --------\n DBSCAN : An estimator interface for this clustering algorithm.\n OPTICS : A similar estimator interface clustering at multiple values of\n eps. 
Our implementation is optimized for memory usage.\n\n Notes\n -----\n For an example, see :ref:`examples/cluster/plot_dbscan.py\n `.\n\n This implementation bulk-computes all neighborhood queries, which increases\n the memory complexity to O(n.d) where d is the average number of neighbors,\n while original DBSCAN had memory complexity O(n). It may attract a higher\n memory complexity when querying these nearest neighborhoods, depending\n on the ``algorithm``.\n\n One way to avoid the query complexity is to pre-compute sparse\n neighborhoods in chunks using\n :func:`NearestNeighbors.radius_neighbors_graph\n ` with\n ``mode='distance'``, then using ``metric='precomputed'`` here.\n\n Another way to reduce memory and computation time is to remove\n (near-)duplicate points and use ``sample_weight`` instead.\n\n :func:`cluster.optics ` provides a similar\n clustering with lower memory usage.\n\n References\n ----------\n Ester, M., H. P. Kriegel, J. Sander, and X. Xu, \"A Density-Based\n Algorithm for Discovering Clusters in Large Spatial Databases with Noise\".\n In: Proceedings of the 2nd International Conference on Knowledge Discovery\n and Data Mining, Portland, OR, AAAI Press, pp. 226-231. 1996\n\n Schubert, E., Sander, J., Ester, M., Kriegel, H. P., & Xu, X. (2017).\n DBSCAN revisited, revisited: why and how you should (still) use DBSCAN.\n ACM Transactions on Database Systems (TODS), 42(3), 19.\n \"\"\"\n est = DBSCAN(eps=eps, min_samples=min_samples, metric=metric, metric_params=metric_params, algorithm=algorithm, leaf_size=leaf_size, p=p, n_jobs=n_jobs)\n est.fit(X, sample_weight=sample_weight)\n return est.core_sample_indices_, est.labels_" }, { @@ -33291,7 +34010,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "Xred", @@ -33301,13 +34021,14 @@ "docstring": { "type": "array-like of shape (n_samples, n_clusters) or (n_clusters,)", "description": "The values to be assigned to each cluster of samples." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Inverse the transformation and return a vector of size `n_features`.", - "docstring": "Inverse the transformation and return a vector of size `n_features`.\n\nParameters\n----------\nXred : array-like of shape (n_samples, n_clusters) or (n_clusters,)\n The values to be assigned to each cluster of samples.\n\nReturns\n-------\nX : ndarray of shape (n_samples, n_features) or (n_features,)\n A vector of size `n_samples` with the values of `Xred` assigned to\n each of the cluster of samples.", + "docstring": "\n Inverse the transformation and return a vector of size `n_features`.\n\n Parameters\n ----------\n Xred : array-like of shape (n_samples, n_clusters) or (n_clusters,)\n The values to be assigned to each cluster of samples.\n\n Returns\n -------\n X : ndarray of shape (n_samples, n_features) or (n_features,)\n A vector of size `n_samples` with the values of `Xred` assigned to\n each of the cluster of samples.\n ", "source_code": "\ndef inverse_transform(self, Xred):\n \"\"\"\n Inverse the transformation and return a vector of size `n_features`.\n\n Parameters\n ----------\n Xred : array-like of shape (n_samples, n_clusters) or (n_clusters,)\n The values to be assigned to each cluster of samples.\n\n Returns\n -------\n X : ndarray of shape (n_samples, n_features) or (n_features,)\n A vector of size `n_samples` with the values of `Xred` assigned to\n each of the cluster of samples.\n \"\"\"\n check_is_fitted(self)\n (unil, inverse) = np.unique(self.labels_, return_inverse=True)\n return Xred[..., inverse]" }, { @@ -33325,7 +34046,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -33335,13 +34057,14 @@ "docstring": { "type": "array-like of shape (n_samples, n_features) or (n_samples, n_samples)", "description": "A M by N array of M observations in N dimensions or a length\nM array of M one-dimensional observations." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Transform a new matrix using the built clustering.", - "docstring": "Transform a new matrix using the built clustering.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features) or (n_samples, n_samples)\n A M by N array of M observations in N dimensions or a length\n M array of M one-dimensional observations.\n\nReturns\n-------\nY : ndarray of shape (n_samples, n_clusters) or (n_clusters,)\n The pooled values for each feature cluster.", + "docstring": "\n Transform a new matrix using the built clustering.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features) or (n_samples, n_samples)\n A M by N array of M observations in N dimensions or a length\n M array of M one-dimensional observations.\n\n Returns\n -------\n Y : ndarray of shape (n_samples, n_clusters) or (n_clusters,)\n The pooled values for each feature cluster.\n ", "source_code": "\ndef transform(self, X):\n \"\"\"\n Transform a new matrix using the built clustering.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features) or (n_samples, n_samples)\n A M by N array of M observations in N dimensions or a length\n M array of M one-dimensional observations.\n\n Returns\n -------\n Y : ndarray of shape (n_samples, n_clusters) or (n_clusters,)\n The pooled values for each feature cluster.\n \"\"\"\n check_is_fitted(self)\n X = self._validate_data(X, reset=False)\n if self.pooling_func == np.mean and not issparse(X):\n size = np.bincount(self.labels_)\n n_samples = X.shape[0]\n nX = np.array([np.bincount(self.labels_, X[i, :]) / size for i in range(n_samples)])\n else:\n nX = [self.pooling_func(X[:, self.labels_ == l], axis=1) for l in np.unique(self.labels_)]\n nX = np.array(nX).T\n return nX" }, { @@ -33359,7 +34082,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_clusters", @@ -33369,7 +34093,8 @@ "docstring": { "type": "int, default=8", "description": "The number of clusters to form as well as the number of\ncentroids to generate." - } + }, + "refined_type": {} }, { "name": "init", @@ -33379,6 +34104,10 @@ "docstring": { "type": "{'k-means++', 'random'}, callable or array-like of shape (n_clusters, n_features), default='k-means++'", "description": "Method for initialization:\n\n'k-means++' : selects initial cluster centers for k-mean\nclustering in a smart way to speed up convergence. See section\nNotes in k_init for more details.\n\n'random': choose `n_clusters` observations (rows) at random from data\nfor the initial centroids.\n\nIf an array is passed, it should be of shape (n_clusters, n_features)\nand gives the initial centers.\n\nIf a callable is passed, it should take arguments X, n_clusters and a\nrandom state and return an initialization." + }, + "refined_type": { + "kind": "EnumType", + "values": ["random", "k-means++"] } }, { @@ -33389,7 +34118,8 @@ "docstring": { "type": "int, default=10", "description": "Number of time the k-means algorithm will be run with different\ncentroid seeds. The final results will be the best output of\nn_init consecutive runs in terms of inertia." - } + }, + "refined_type": {} }, { "name": "max_iter", @@ -33399,7 +34129,8 @@ "docstring": { "type": "int, default=300", "description": "Maximum number of iterations of the k-means algorithm for a\nsingle run." 
- } + }, + "refined_type": {} }, { "name": "tol", @@ -33409,7 +34140,8 @@ "docstring": { "type": "float, default=1e-4", "description": "Relative tolerance with regards to Frobenius norm of the difference\nin the cluster centers of two consecutive iterations to declare\nconvergence." - } + }, + "refined_type": {} }, { "name": "verbose", @@ -33419,7 +34151,8 @@ "docstring": { "type": "int, default=0", "description": "Verbosity mode." - } + }, + "refined_type": {} }, { "name": "random_state", @@ -33429,7 +34162,8 @@ "docstring": { "type": "int, RandomState instance or None, default=None", "description": "Determines random number generation for centroid initialization. Use\nan int to make the randomness deterministic.\nSee :term:`Glossary `." - } + }, + "refined_type": {} }, { "name": "copy_x", @@ -33439,7 +34173,8 @@ "docstring": { "type": "bool, default=True", "description": "When pre-computing distances it is more numerically accurate to center\nthe data first. If copy_x is True (default), then the original data is\nnot modified. If False, the original data is modified, and put back\nbefore the function returns, but small numerical differences may be\nintroduced by subtracting and then adding the data mean. Note that if\nthe original data is not C-contiguous, a copy will be made even if\ncopy_x is False. If the original data is sparse, but not in CSR format,\na copy will be made even if copy_x is False." - } + }, + "refined_type": {} }, { "name": "algorithm", @@ -33449,13 +34184,17 @@ "docstring": { "type": "{\"auto\", \"full\", \"elkan\"}, default=\"auto\"", "description": "K-means algorithm to use. The classical EM-style algorithm is \"full\".\nThe \"elkan\" variation is more efficient on data with well-defined\nclusters, by using the triangle inequality. However it's more memory\nintensive due to the allocation of an extra array of shape\n(n_samples, n_clusters).\n\nFor now \"auto\" (kept for backward compatibility) chooses \"elkan\" but it\nmight change in the future for a better heuristic.\n\n.. 
versionchanged:: 0.18\n Added Elkan algorithm" + }, + "refined_type": { + "kind": "EnumType", + "values": ["auto", "full", "elkan"] } } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, n_clusters=8, *, init='k-means++', n_init=10, max_iter=300, tol=0.0001, verbose=0, random_state=None, copy_x=True, algorithm='auto'):\n self.n_clusters = n_clusters\n self.init = init\n self.max_iter = max_iter\n self.tol = tol\n self.n_init = n_init\n self.verbose = verbose\n self.random_state = random_state\n self.copy_x = copy_x\n self.algorithm = algorithm" }, { @@ -33473,7 +34212,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -33483,7 +34223,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_samples", @@ -33493,7 +34234,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -33517,7 +34259,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -33527,13 +34270,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _check_params(self, X):\n if self.n_init <= 0:\n raise ValueError(f'n_init should be > 0, got {self.n_init} instead.')\n self._n_init = self.n_init\n if self.max_iter <= 0:\n raise ValueError(f'max_iter should be > 0, got {self.max_iter} instead.')\n if X.shape[0] < self.n_clusters:\n raise ValueError(f'n_samples={X.shape[0]} should be >= n_clusters={self.n_clusters}.')\n self._tol = _tolerance(X, self.tol)\n if self.algorithm not in ('auto', 'full', 'elkan'):\n raise ValueError(f\"Algorithm must be 'auto', 'full' or 'elkan', got {self.algorithm} instead.\")\n self._algorithm = self.algorithm\n if self._algorithm == 'auto':\n self._algorithm = 'full' if self.n_clusters == 1 else 'elkan'\n if self._algorithm == 'elkan' and self.n_clusters == 1:\n warnings.warn(\"algorithm='elkan' doesn't make sense for a single cluster. Using 'full' instead.\", RuntimeWarning)\n self._algorithm = 'full'\n if not (hasattr(self.init, '__array__') or callable(self.init) or isinstance(self.init, str) and self.init in ['k-means++', 'random']):\n raise ValueError(f\"init should be either 'k-means++', 'random', a ndarray or a callable, got '{self.init}' instead.\")\n if hasattr(self.init, '__array__') and self._n_init != 1:\n warnings.warn(f'Explicit initial center position passed: performing only one init in {self.__class__.__name__} instead of n_init={self._n_init}.', RuntimeWarning, stacklevel=2)\n self._n_init = 1" }, { @@ -33551,7 +34295,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -33561,13 +34306,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _check_test_data(self, X):\n X = self._validate_data(X, accept_sparse='csr', reset=False, dtype=[np.float64, np.float32], order='C', accept_large_sparse=False)\n return X" }, { @@ -33585,7 +34331,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -33595,6 +34342,10 @@ "docstring": { "type": "{ndarray, sparse matrix} of shape (n_samples, n_features)", "description": "The input samples." 
+ }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -33605,7 +34356,8 @@ "docstring": { "type": "ndarray of shape (n_samples,)", "description": "Squared euclidean norm of each data point. Pass it if you have it\nat hands already to avoid it being recomputed here." - } + }, + "refined_type": {} }, { "name": "init", @@ -33615,6 +34367,10 @@ "docstring": { "type": "{'k-means++', 'random'}, callable or ndarray of shape (n_clusters, n_features)", "description": "Method for initialization." + }, + "refined_type": { + "kind": "EnumType", + "values": ["random", "k-means++"] } }, { @@ -33625,7 +34381,8 @@ "docstring": { "type": "RandomState instance", "description": "Determines random number generation for centroid initialization.\nSee :term:`Glossary `." - } + }, + "refined_type": {} }, { "name": "init_size", @@ -33635,13 +34392,14 @@ "docstring": { "type": "int, default=None", "description": "Number of samples to randomly sample for speeding up the\ninitialization (sometimes at the expense of accuracy)." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Compute the initial centroids.", - "docstring": "Compute the initial centroids.\n\nParameters\n----------\nX : {ndarray, sparse matrix} of shape (n_samples, n_features)\n The input samples.\n\nx_squared_norms : ndarray of shape (n_samples,)\n Squared euclidean norm of each data point. Pass it if you have it\n at hands already to avoid it being recomputed here.\n\ninit : {'k-means++', 'random'}, callable or ndarray of shape (n_clusters, n_features)\n Method for initialization.\n\nrandom_state : RandomState instance\n Determines random number generation for centroid initialization.\n See :term:`Glossary `.\n\ninit_size : int, default=None\n Number of samples to randomly sample for speeding up the\n initialization (sometimes at the expense of accuracy).\n\nReturns\n-------\ncenters : ndarray of shape (n_clusters, n_features)", + "docstring": "Compute the initial centroids.\n\n Parameters\n ----------\n X : {ndarray, sparse matrix} of shape (n_samples, n_features)\n The input samples.\n\n x_squared_norms : ndarray of shape (n_samples,)\n Squared euclidean norm of each data point. Pass it if you have it\n at hands already to avoid it being recomputed here.\n\n init : {'k-means++', 'random'}, callable or ndarray of shape (n_clusters, n_features)\n Method for initialization.\n\n random_state : RandomState instance\n Determines random number generation for centroid initialization.\n See :term:`Glossary `.\n\n init_size : int, default=None\n Number of samples to randomly sample for speeding up the\n initialization (sometimes at the expense of accuracy).\n\n Returns\n -------\n centers : ndarray of shape (n_clusters, n_features)\n ", "source_code": "\ndef _init_centroids(self, X, x_squared_norms, init, random_state, init_size=None):\n \"\"\"Compute the initial centroids.\n\n Parameters\n ----------\n X : {ndarray, sparse matrix} of shape (n_samples, n_features)\n The input samples.\n\n x_squared_norms : ndarray of shape (n_samples,)\n Squared euclidean norm of each data point. 
Pass it if you have it\n at hands already to avoid it being recomputed here.\n\n init : {'k-means++', 'random'}, callable or ndarray of shape (n_clusters, n_features)\n Method for initialization.\n\n random_state : RandomState instance\n Determines random number generation for centroid initialization.\n See :term:`Glossary `.\n\n init_size : int, default=None\n Number of samples to randomly sample for speeding up the\n initialization (sometimes at the expense of accuracy).\n\n Returns\n -------\n centers : ndarray of shape (n_clusters, n_features)\n \"\"\"\n n_samples = X.shape[0]\n n_clusters = self.n_clusters\n if init_size is not None and init_size < n_samples:\n init_indices = random_state.randint(0, n_samples, init_size)\n X = X[init_indices]\n x_squared_norms = x_squared_norms[init_indices]\n n_samples = X.shape[0]\n if isinstance(init, str) and init == 'k-means++':\n (centers, _) = _kmeans_plusplus(X, n_clusters, random_state=random_state, x_squared_norms=x_squared_norms)\n elif isinstance(init, str) and init == 'random':\n seeds = random_state.permutation(n_samples)[:n_clusters]\n centers = X[seeds]\n elif hasattr(init, '__array__'):\n centers = init\n elif callable(init):\n centers = init(X, n_clusters, random_state=random_state)\n centers = check_array(centers, dtype=X.dtype, copy=False, order='C')\n self._validate_center_shape(X, centers)\n if sp.issparse(centers):\n centers = centers.toarray()\n return centers" }, { @@ -33659,13 +34417,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _more_tags(self):\n return {'_xfail_checks': {'check_sample_weights_invariance': 'zero sample_weight is not equivalent to removing samples'}}" }, { @@ -33683,7 +34442,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -33693,7 +34453,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -33717,7 +34478,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -33727,7 +34489,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "centers", @@ -33737,7 +34500,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -33761,7 +34525,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -33771,6 +34536,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "Training instances to cluster. It must be noted that the data\nwill be converted to C ordering, which will cause a memory\ncopy if the given data is not C-contiguous.\nIf a sparse matrix is passed, a copy will be made if it's not in\nCSR format." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -33781,7 +34550,8 @@ "docstring": { "type": "Ignored", "description": "Not used, present here for API consistency by convention." - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -33791,13 +34561,14 @@ "docstring": { "type": "array-like of shape (n_samples,), default=None", "description": "The weights for each observation in X. If None, all observations\nare assigned equal weight.\n\n.. 
versionadded:: 0.20" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Compute k-means clustering.", - "docstring": "Compute k-means clustering.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training instances to cluster. It must be noted that the data\n will be converted to C ordering, which will cause a memory\n copy if the given data is not C-contiguous.\n If a sparse matrix is passed, a copy will be made if it's not in\n CSR format.\n\ny : Ignored\n Not used, present here for API consistency by convention.\n\nsample_weight : array-like of shape (n_samples,), default=None\n The weights for each observation in X. If None, all observations\n are assigned equal weight.\n\n .. versionadded:: 0.20\n\nReturns\n-------\nself : object\n Fitted estimator.", + "docstring": "Compute k-means clustering.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training instances to cluster. It must be noted that the data\n will be converted to C ordering, which will cause a memory\n copy if the given data is not C-contiguous.\n If a sparse matrix is passed, a copy will be made if it's not in\n CSR format.\n\n y : Ignored\n Not used, present here for API consistency by convention.\n\n sample_weight : array-like of shape (n_samples,), default=None\n The weights for each observation in X. If None, all observations\n are assigned equal weight.\n\n .. versionadded:: 0.20\n\n Returns\n -------\n self : object\n Fitted estimator.\n ", "source_code": "\ndef fit(self, X, y=None, sample_weight=None):\n \"\"\"Compute k-means clustering.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training instances to cluster. It must be noted that the data\n will be converted to C ordering, which will cause a memory\n copy if the given data is not C-contiguous.\n If a sparse matrix is passed, a copy will be made if it's not in\n CSR format.\n\n y : Ignored\n Not used, present here for API consistency by convention.\n\n sample_weight : array-like of shape (n_samples,), default=None\n The weights for each observation in X. If None, all observations\n are assigned equal weight.\n\n .. 
versionadded:: 0.20\n\n Returns\n -------\n self : object\n Fitted estimator.\n \"\"\"\n X = self._validate_data(X, accept_sparse='csr', dtype=[np.float64, np.float32], order='C', copy=self.copy_x, accept_large_sparse=False)\n self._check_params(X)\n random_state = check_random_state(self.random_state)\n sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype)\n self._n_threads = _openmp_effective_n_threads()\n init = self.init\n if hasattr(init, '__array__'):\n init = check_array(init, dtype=X.dtype, copy=True, order='C')\n self._validate_center_shape(X, init)\n if not sp.issparse(X):\n X_mean = X.mean(axis=0)\n X -= X_mean\n if hasattr(init, '__array__'):\n init -= X_mean\n x_squared_norms = row_norms(X, squared=True)\n if self._algorithm == 'full':\n kmeans_single = _kmeans_single_lloyd\n self._check_mkl_vcomp(X, X.shape[0])\n else:\n kmeans_single = _kmeans_single_elkan\n (best_inertia, best_labels) = (None, None)\n for i in range(self._n_init):\n centers_init = self._init_centroids(X, x_squared_norms=x_squared_norms, init=init, random_state=random_state)\n if self.verbose:\n print('Initialization complete')\n (labels, inertia, centers, n_iter_) = kmeans_single(X, sample_weight, centers_init, max_iter=self.max_iter, verbose=self.verbose, tol=self._tol, x_squared_norms=x_squared_norms, n_threads=self._n_threads)\n if best_inertia is None or inertia < best_inertia and not _is_same_clustering(labels, best_labels, self.n_clusters):\n best_labels = labels\n best_centers = centers\n best_inertia = inertia\n best_n_iter = n_iter_\n if not sp.issparse(X):\n if not self.copy_x:\n X += X_mean\n best_centers += X_mean\n distinct_clusters = len(set(best_labels))\n if distinct_clusters < self.n_clusters:\n warnings.warn('Number of distinct clusters ({}) found smaller than n_clusters ({}). Possibly due to duplicate points in X.'.format(distinct_clusters, self.n_clusters), ConvergenceWarning, stacklevel=2)\n self.cluster_centers_ = best_centers\n self.labels_ = best_labels\n self.inertia_ = best_inertia\n self.n_iter_ = best_n_iter\n return self" }, { @@ -33815,7 +34586,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -33825,6 +34597,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "New data to transform." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -33835,7 +34611,8 @@ "docstring": { "type": "Ignored", "description": "Not used, present here for API consistency by convention." - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -33845,13 +34622,14 @@ "docstring": { "type": "array-like of shape (n_samples,), default=None", "description": "The weights for each observation in X. If None, all observations\nare assigned equal weight." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Compute cluster centers and predict cluster index for each sample.\n\nConvenience method; equivalent to calling fit(X) followed by predict(X).", - "docstring": "Compute cluster centers and predict cluster index for each sample.\n\nConvenience method; equivalent to calling fit(X) followed by\npredict(X).\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n New data to transform.\n\ny : Ignored\n Not used, present here for API consistency by convention.\n\nsample_weight : array-like of shape (n_samples,), default=None\n The weights for each observation in X. 
If None, all observations\n are assigned equal weight.\n\nReturns\n-------\nlabels : ndarray of shape (n_samples,)\n Index of the cluster each sample belongs to.", + "description": "Compute cluster centers and predict cluster index for each sample.\n\nConvenience method; equivalent to calling fit(X) followed by\npredict(X).", + "docstring": "Compute cluster centers and predict cluster index for each sample.\n\n Convenience method; equivalent to calling fit(X) followed by\n predict(X).\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n New data to transform.\n\n y : Ignored\n Not used, present here for API consistency by convention.\n\n sample_weight : array-like of shape (n_samples,), default=None\n The weights for each observation in X. If None, all observations\n are assigned equal weight.\n\n Returns\n -------\n labels : ndarray of shape (n_samples,)\n Index of the cluster each sample belongs to.\n ", "source_code": "\ndef fit_predict(self, X, y=None, sample_weight=None):\n \"\"\"Compute cluster centers and predict cluster index for each sample.\n\n Convenience method; equivalent to calling fit(X) followed by\n predict(X).\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n New data to transform.\n\n y : Ignored\n Not used, present here for API consistency by convention.\n\n sample_weight : array-like of shape (n_samples,), default=None\n The weights for each observation in X. If None, all observations\n are assigned equal weight.\n\n Returns\n -------\n labels : ndarray of shape (n_samples,)\n Index of the cluster each sample belongs to.\n \"\"\"\n return self.fit(X, sample_weight=sample_weight).labels_" }, { @@ -33869,7 +34647,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -33879,6 +34658,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "New data to transform." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -33889,7 +34672,8 @@ "docstring": { "type": "Ignored", "description": "Not used, present here for API consistency by convention." - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -33899,13 +34683,14 @@ "docstring": { "type": "array-like of shape (n_samples,), default=None", "description": "The weights for each observation in X. If None, all observations\nare assigned equal weight." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Compute clustering and transform X to cluster-distance space.\n\nEquivalent to fit(X).transform(X), but more efficiently implemented.", - "docstring": "Compute clustering and transform X to cluster-distance space.\n\nEquivalent to fit(X).transform(X), but more efficiently implemented.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n New data to transform.\n\ny : Ignored\n Not used, present here for API consistency by convention.\n\nsample_weight : array-like of shape (n_samples,), default=None\n The weights for each observation in X. 
If None, all observations\n are assigned equal weight.\n\nReturns\n-------\nX_new : ndarray of shape (n_samples, n_clusters)\n X transformed in the new space.", + "docstring": "Compute clustering and transform X to cluster-distance space.\n\n Equivalent to fit(X).transform(X), but more efficiently implemented.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n New data to transform.\n\n y : Ignored\n Not used, present here for API consistency by convention.\n\n sample_weight : array-like of shape (n_samples,), default=None\n The weights for each observation in X. If None, all observations\n are assigned equal weight.\n\n Returns\n -------\n X_new : ndarray of shape (n_samples, n_clusters)\n X transformed in the new space.\n ", "source_code": "\ndef fit_transform(self, X, y=None, sample_weight=None):\n \"\"\"Compute clustering and transform X to cluster-distance space.\n\n Equivalent to fit(X).transform(X), but more efficiently implemented.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n New data to transform.\n\n y : Ignored\n Not used, present here for API consistency by convention.\n\n sample_weight : array-like of shape (n_samples,), default=None\n The weights for each observation in X. If None, all observations\n are assigned equal weight.\n\n Returns\n -------\n X_new : ndarray of shape (n_samples, n_clusters)\n X transformed in the new space.\n \"\"\"\n return self.fit(X, sample_weight=sample_weight)._transform(X)" }, { @@ -33923,7 +34708,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -33933,6 +34719,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "New data to predict." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -33943,13 +34733,14 @@ "docstring": { "type": "array-like of shape (n_samples,), default=None", "description": "The weights for each observation in X. If None, all observations\nare assigned equal weight." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Predict the closest cluster each sample in X belongs to.\n\nIn the vector quantization literature, `cluster_centers_` is called the code book and each value returned by `predict` is the index of the closest code in the code book.", - "docstring": "Predict the closest cluster each sample in X belongs to.\n\nIn the vector quantization literature, `cluster_centers_` is called\nthe code book and each value returned by `predict` is the index of\nthe closest code in the code book.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n New data to predict.\n\nsample_weight : array-like of shape (n_samples,), default=None\n The weights for each observation in X. 
If None, all observations\n are assigned equal weight.\n\nReturns\n-------\nlabels : ndarray of shape (n_samples,)\n Index of the cluster each sample belongs to.", + "description": "Predict the closest cluster each sample in X belongs to.\n\nIn the vector quantization literature, `cluster_centers_` is called\nthe code book and each value returned by `predict` is the index of\nthe closest code in the code book.", + "docstring": "Predict the closest cluster each sample in X belongs to.\n\n In the vector quantization literature, `cluster_centers_` is called\n the code book and each value returned by `predict` is the index of\n the closest code in the code book.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n New data to predict.\n\n sample_weight : array-like of shape (n_samples,), default=None\n The weights for each observation in X. If None, all observations\n are assigned equal weight.\n\n Returns\n -------\n labels : ndarray of shape (n_samples,)\n Index of the cluster each sample belongs to.\n ", "source_code": "\ndef predict(self, X, sample_weight=None):\n \"\"\"Predict the closest cluster each sample in X belongs to.\n\n In the vector quantization literature, `cluster_centers_` is called\n the code book and each value returned by `predict` is the index of\n the closest code in the code book.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n New data to predict.\n\n sample_weight : array-like of shape (n_samples,), default=None\n The weights for each observation in X. If None, all observations\n are assigned equal weight.\n\n Returns\n -------\n labels : ndarray of shape (n_samples,)\n Index of the cluster each sample belongs to.\n \"\"\"\n check_is_fitted(self)\n X = self._check_test_data(X)\n x_squared_norms = row_norms(X, squared=True)\n sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype)\n return _labels_inertia_threadpool_limit(X, sample_weight, x_squared_norms, self.cluster_centers_, self._n_threads)[0]" }, { @@ -33967,7 +34758,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -33977,6 +34769,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "New data." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -33987,7 +34783,8 @@ "docstring": { "type": "Ignored", "description": "Not used, present here for API consistency by convention." - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -33997,13 +34794,14 @@ "docstring": { "type": "array-like of shape (n_samples,), default=None", "description": "The weights for each observation in X. If None, all observations\nare assigned equal weight." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Opposite of the value of X on the K-means objective.", - "docstring": "Opposite of the value of X on the K-means objective.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n New data.\n\ny : Ignored\n Not used, present here for API consistency by convention.\n\nsample_weight : array-like of shape (n_samples,), default=None\n The weights for each observation in X. 
If None, all observations\n are assigned equal weight.\n\nReturns\n-------\nscore : float\n Opposite of the value of X on the K-means objective.", + "docstring": "Opposite of the value of X on the K-means objective.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n New data.\n\n y : Ignored\n Not used, present here for API consistency by convention.\n\n sample_weight : array-like of shape (n_samples,), default=None\n The weights for each observation in X. If None, all observations\n are assigned equal weight.\n\n Returns\n -------\n score : float\n Opposite of the value of X on the K-means objective.\n ", "source_code": "\ndef score(self, X, y=None, sample_weight=None):\n \"\"\"Opposite of the value of X on the K-means objective.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n New data.\n\n y : Ignored\n Not used, present here for API consistency by convention.\n\n sample_weight : array-like of shape (n_samples,), default=None\n The weights for each observation in X. If None, all observations\n are assigned equal weight.\n\n Returns\n -------\n score : float\n Opposite of the value of X on the K-means objective.\n \"\"\"\n check_is_fitted(self)\n X = self._check_test_data(X)\n x_squared_norms = row_norms(X, squared=True)\n sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype)\n return -_labels_inertia_threadpool_limit(X, sample_weight, x_squared_norms, self.cluster_centers_, self._n_threads)[1]" }, { @@ -34021,7 +34819,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -34031,13 +34830,17 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "New data to transform." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } } ], "results": [], "is_public": true, - "description": "Transform X to a cluster-distance space.\n\nIn the new space, each dimension is the distance to the cluster centers. Note that even if X is sparse, the array returned by `transform` will typically be dense.", - "docstring": "Transform X to a cluster-distance space.\n\nIn the new space, each dimension is the distance to the cluster\ncenters. Note that even if X is sparse, the array returned by\n`transform` will typically be dense.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n New data to transform.\n\nReturns\n-------\nX_new : ndarray of shape (n_samples, n_clusters)\n X transformed in the new space.", + "description": "Transform X to a cluster-distance space.\n\nIn the new space, each dimension is the distance to the cluster\ncenters. Note that even if X is sparse, the array returned by\n`transform` will typically be dense.", + "docstring": "Transform X to a cluster-distance space.\n\n In the new space, each dimension is the distance to the cluster\n centers. Note that even if X is sparse, the array returned by\n `transform` will typically be dense.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n New data to transform.\n\n Returns\n -------\n X_new : ndarray of shape (n_samples, n_clusters)\n X transformed in the new space.\n ", "source_code": "\ndef transform(self, X):\n \"\"\"Transform X to a cluster-distance space.\n\n In the new space, each dimension is the distance to the cluster\n centers. 
Note that even if X is sparse, the array returned by\n `transform` will typically be dense.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n New data to transform.\n\n Returns\n -------\n X_new : ndarray of shape (n_samples, n_clusters)\n X transformed in the new space.\n \"\"\"\n check_is_fitted(self)\n X = self._check_test_data(X)\n return self._transform(X)" }, { @@ -34055,7 +34858,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_clusters", @@ -34065,7 +34869,8 @@ "docstring": { "type": "int, default=8", "description": "The number of clusters to form as well as the number of\ncentroids to generate." - } + }, + "refined_type": {} }, { "name": "init", @@ -34075,6 +34880,10 @@ "docstring": { "type": "{'k-means++', 'random'}, callable or array-like of shape (n_clusters, n_features), default='k-means++'", "description": "Method for initialization:\n\n'k-means++' : selects initial cluster centers for k-mean\nclustering in a smart way to speed up convergence. See section\nNotes in k_init for more details.\n\n'random': choose `n_clusters` observations (rows) at random from data\nfor the initial centroids.\n\nIf an array is passed, it should be of shape (n_clusters, n_features)\nand gives the initial centers.\n\nIf a callable is passed, it should take arguments X, n_clusters and a\nrandom state and return an initialization." + }, + "refined_type": { + "kind": "EnumType", + "values": ["random", "k-means++"] } }, { @@ -34085,7 +34894,8 @@ "docstring": { "type": "int, default=100", "description": "Maximum number of iterations over the complete dataset before\nstopping independently of any early stopping criterion heuristics." - } + }, + "refined_type": {} }, { "name": "batch_size", @@ -34095,7 +34905,8 @@ "docstring": { "type": "int, default=1024", "description": "Size of the mini batches.\nFor faster compuations, you can set the ``batch_size`` greater than\n256 * number of cores to enable parallelism on all cores.\n\n.. versionchanged:: 1.0\n `batch_size` default changed from 100 to 1024." - } + }, + "refined_type": {} }, { "name": "verbose", @@ -34105,7 +34916,8 @@ "docstring": { "type": "int, default=0", "description": "Verbosity mode." - } + }, + "refined_type": {} }, { "name": "compute_labels", @@ -34115,7 +34927,8 @@ "docstring": { "type": "bool, default=True", "description": "Compute label assignment and inertia for the complete dataset\nonce the minibatch optimization has converged in fit." - } + }, + "refined_type": {} }, { "name": "random_state", @@ -34125,7 +34938,8 @@ "docstring": { "type": "int, RandomState instance or None, default=None", "description": "Determines random number generation for centroid initialization and\nrandom reassignment. Use an int to make the randomness deterministic.\nSee :term:`Glossary `." - } + }, + "refined_type": {} }, { "name": "tol", @@ -34135,7 +34949,8 @@ "docstring": { "type": "float, default=0.0", "description": "Control early stopping based on the relative center changes as\nmeasured by a smoothed, variance-normalized of the mean center\nsquared position changes. This early stopping heuristics is\ncloser to the one used for the batch variant of the algorithms\nbut induces a slight computational and memory overhead over the\ninertia heuristic.\n\nTo disable convergence detection based on normalized center\nchange, set tol to 0.0 (default)." 
- } + }, + "refined_type": {} }, { "name": "max_no_improvement", @@ -34145,7 +34960,8 @@ "docstring": { "type": "int, default=10", "description": "Control early stopping based on the consecutive number of mini\nbatches that does not yield an improvement on the smoothed inertia.\n\nTo disable convergence detection based on inertia, set\nmax_no_improvement to None." - } + }, + "refined_type": {} }, { "name": "init_size", @@ -34155,7 +34971,8 @@ "docstring": { "type": "int, default=None", "description": "Number of samples to randomly sample for speeding up the\ninitialization (sometimes at the expense of accuracy): the\nonly algorithm is initialized by running a batch KMeans on a\nrandom subset of the data. This needs to be larger than n_clusters.\n\nIf `None`, the heuristic is `init_size = 3 * batch_size` if\n`3 * batch_size < n_clusters`, else `init_size = 3 * n_clusters`." - } + }, + "refined_type": {} }, { "name": "n_init", @@ -34165,7 +34982,8 @@ "docstring": { "type": "int, default=3", "description": "Number of random initializations that are tried.\nIn contrast to KMeans, the algorithm is only run once, using the\nbest of the ``n_init`` initializations as measured by inertia." - } + }, + "refined_type": {} }, { "name": "reassignment_ratio", @@ -34175,13 +34993,14 @@ "docstring": { "type": "float, default=0.01", "description": "Control the fraction of the maximum number of counts for a center to\nbe reassigned. A higher value means that low count centers are more\neasily reassigned, which means that the model will take longer to\nconverge, but should converge in a better clustering. However, too high\na value may cause convergence issues, especially with a small batch\nsize." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, n_clusters=8, *, init='k-means++', max_iter=100, batch_size=1024, verbose=0, compute_labels=True, random_state=None, tol=0.0, max_no_improvement=10, init_size=None, n_init=3, reassignment_ratio=0.01):\n super().__init__(n_clusters=n_clusters, init=init, max_iter=max_iter, verbose=verbose, random_state=random_state, tol=tol, n_init=n_init)\n self.max_no_improvement = max_no_improvement\n self.batch_size = batch_size\n self.compute_labels = compute_labels\n self.init_size = init_size\n self.reassignment_ratio = reassignment_ratio" }, { @@ -34199,7 +35018,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -34209,13 +35029,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _check_params(self, X):\n super()._check_params(X)\n if self.max_no_improvement is not None and self.max_no_improvement < 0:\n raise ValueError(f'max_no_improvement should be >= 0, got {self.max_no_improvement} instead.')\n if self.batch_size <= 0:\n raise ValueError(f'batch_size should be > 0, got {self.batch_size} instead.')\n self._batch_size = min(self.batch_size, X.shape[0])\n if self.init_size is not None and self.init_size <= 0:\n raise ValueError(f'init_size should be > 0, got {self.init_size} instead.')\n self._init_size = self.init_size\n if self._init_size is None:\n self._init_size = 3 * self._batch_size\n if self._init_size < self.n_clusters:\n self._init_size = 3 * self.n_clusters\n elif self._init_size < self.n_clusters:\n warnings.warn(f'init_size={self._init_size} should be 
larger than n_clusters={self.n_clusters}. Setting it to min(3*n_clusters, n_samples)', RuntimeWarning, stacklevel=2)\n self._init_size = 3 * self.n_clusters\n self._init_size = min(self._init_size, X.shape[0])\n if self.reassignment_ratio < 0:\n raise ValueError(f'reassignment_ratio should be >= 0, got {self.reassignment_ratio} instead.')" }, { @@ -34233,7 +35054,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "step", @@ -34243,7 +35065,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_steps", @@ -34253,7 +35076,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_samples", @@ -34263,7 +35087,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "centers_squared_diff", @@ -34273,7 +35098,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "batch_inertia", @@ -34283,7 +35109,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -34307,13 +35134,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _more_tags(self):\n return {'_xfail_checks': {'check_sample_weights_invariance': 'zero sample_weight is not equivalent to removing samples'}}" }, { @@ -34331,13 +35159,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Check if a random reassignment needs to be done.\n\nDo random reassignments each time 10 * n_clusters samples have been processed. If there are empty clusters we always want to reassign.", - "docstring": "Check if a random reassignment needs to be done.\n\nDo random reassignments each time 10 * n_clusters samples have been\nprocessed.\n\nIf there are empty clusters we always want to reassign.", + "description": "Check if a random reassignment needs to be done.\n\nDo random reassignments each time 10 * n_clusters samples have been\nprocessed.\n\nIf there are empty clusters we always want to reassign.", + "docstring": "Check if a random reassignment needs to be done.\n\n Do random reassignments each time 10 * n_clusters samples have been\n processed.\n\n If there are empty clusters we always want to reassign.\n ", "source_code": "\ndef _random_reassign(self):\n \"\"\"Check if a random reassignment needs to be done.\n\n Do random reassignments each time 10 * n_clusters samples have been\n processed.\n\n If there are empty clusters we always want to reassign.\n \"\"\"\n self._n_since_last_reassign += self._batch_size\n if (self._counts == 0).any() or self._n_since_last_reassign >= 10 * self.n_clusters:\n self._n_since_last_reassign = 0\n return True\n return False" }, { @@ -34358,13 +35187,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@deprecated('The attribute `counts_` is deprecated in 0.24 and will be removed in 1.1 (renaming of 0.26).')\n@property\ndef counts_(self):\n return self._counts" }, { @@ -34382,7 +35212,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -34392,6 +35223,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "Training 
instances to cluster. It must be noted that the data\nwill be converted to C ordering, which will cause a memory copy\nif the given data is not C-contiguous.\nIf a sparse matrix is passed, a copy will be made if it's not in\nCSR format." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -34402,7 +35237,8 @@ "docstring": { "type": "Ignored", "description": "Not used, present here for API consistency by convention." - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -34412,13 +35248,14 @@ "docstring": { "type": "array-like of shape (n_samples,), default=None", "description": "The weights for each observation in X. If None, all observations\nare assigned equal weight.\n\n.. versionadded:: 0.20" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Compute the centroids on X by chunking it into mini-batches.", - "docstring": "Compute the centroids on X by chunking it into mini-batches.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training instances to cluster. It must be noted that the data\n will be converted to C ordering, which will cause a memory copy\n if the given data is not C-contiguous.\n If a sparse matrix is passed, a copy will be made if it's not in\n CSR format.\n\ny : Ignored\n Not used, present here for API consistency by convention.\n\nsample_weight : array-like of shape (n_samples,), default=None\n The weights for each observation in X. If None, all observations\n are assigned equal weight.\n\n .. versionadded:: 0.20\n\nReturns\n-------\nself : object\n Fitted estimator.", + "docstring": "Compute the centroids on X by chunking it into mini-batches.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training instances to cluster. It must be noted that the data\n will be converted to C ordering, which will cause a memory copy\n if the given data is not C-contiguous.\n If a sparse matrix is passed, a copy will be made if it's not in\n CSR format.\n\n y : Ignored\n Not used, present here for API consistency by convention.\n\n sample_weight : array-like of shape (n_samples,), default=None\n The weights for each observation in X. If None, all observations\n are assigned equal weight.\n\n .. versionadded:: 0.20\n\n Returns\n -------\n self : object\n Fitted estimator.\n ", "source_code": "\ndef fit(self, X, y=None, sample_weight=None):\n \"\"\"Compute the centroids on X by chunking it into mini-batches.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training instances to cluster. It must be noted that the data\n will be converted to C ordering, which will cause a memory copy\n if the given data is not C-contiguous.\n If a sparse matrix is passed, a copy will be made if it's not in\n CSR format.\n\n y : Ignored\n Not used, present here for API consistency by convention.\n\n sample_weight : array-like of shape (n_samples,), default=None\n The weights for each observation in X. If None, all observations\n are assigned equal weight.\n\n .. 
versionadded:: 0.20\n\n Returns\n -------\n self : object\n Fitted estimator.\n \"\"\"\n X = self._validate_data(X, accept_sparse='csr', dtype=[np.float64, np.float32], order='C', accept_large_sparse=False)\n self._check_params(X)\n random_state = check_random_state(self.random_state)\n sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype)\n self._n_threads = _openmp_effective_n_threads()\n (n_samples, n_features) = X.shape\n init = self.init\n if hasattr(init, '__array__'):\n init = check_array(init, dtype=X.dtype, copy=True, order='C')\n self._validate_center_shape(X, init)\n self._check_mkl_vcomp(X, self._batch_size)\n x_squared_norms = row_norms(X, squared=True)\n validation_indices = random_state.randint(0, n_samples, self._init_size)\n X_valid = X[validation_indices]\n sample_weight_valid = sample_weight[validation_indices]\n x_squared_norms_valid = x_squared_norms[validation_indices]\n best_inertia = None\n for init_idx in range(self._n_init):\n if self.verbose:\n print(f'Init {init_idx + 1}/{self._n_init} with method {init}')\n cluster_centers = self._init_centroids(X, x_squared_norms=x_squared_norms, init=init, random_state=random_state, init_size=self._init_size)\n (_, inertia) = _labels_inertia_threadpool_limit(X_valid, sample_weight_valid, x_squared_norms_valid, cluster_centers, n_threads=self._n_threads)\n if self.verbose:\n print(f'Inertia for init {init_idx + 1}/{self._n_init}: {inertia}')\n if best_inertia is None or inertia < best_inertia:\n init_centers = cluster_centers\n best_inertia = inertia\n centers = init_centers\n centers_new = np.empty_like(centers)\n self._counts = np.zeros(self.n_clusters, dtype=X.dtype)\n self._ewa_inertia = None\n self._ewa_inertia_min = None\n self._no_improvement = 0\n self._n_since_last_reassign = 0\n n_steps = self.max_iter * n_samples // self._batch_size\n with threadpool_limits(limits=1, user_api='blas'):\n for i in range(n_steps):\n minibatch_indices = random_state.randint(0, n_samples, self._batch_size)\n batch_inertia = _mini_batch_step(X=X[minibatch_indices], x_squared_norms=x_squared_norms[minibatch_indices], sample_weight=sample_weight[minibatch_indices], centers=centers, centers_new=centers_new, weight_sums=self._counts, random_state=random_state, random_reassign=self._random_reassign(), reassignment_ratio=self.reassignment_ratio, verbose=self.verbose, n_threads=self._n_threads)\n if self._tol > 0.0:\n centers_squared_diff = np.sum((centers_new - centers)**2)\n else:\n centers_squared_diff = 0\n (centers, centers_new) = (centers_new, centers)\n if self._mini_batch_convergence(i, n_steps, n_samples, centers_squared_diff, batch_inertia):\n break\n self.cluster_centers_ = centers\n self.n_steps_ = i + 1\n self.n_iter_ = int(np.ceil((i + 1) * self._batch_size / n_samples))\n if self.compute_labels:\n (self.labels_, self.inertia_) = _labels_inertia_threadpool_limit(X, sample_weight, x_squared_norms, self.cluster_centers_, n_threads=self._n_threads)\n else:\n self.inertia_ = self._ewa_inertia * n_samples\n return self" }, { @@ -34439,13 +35276,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@deprecated('The attribute `init_size_` is deprecated in 0.24 and will be removed in 1.1 (renaming of 0.26).')\n@property\ndef init_size_(self):\n return self._init_size" }, { @@ -34463,7 +35301,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": 
"X", @@ -34473,6 +35312,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "Training instances to cluster. It must be noted that the data\nwill be converted to C ordering, which will cause a memory copy\nif the given data is not C-contiguous.\nIf a sparse matrix is passed, a copy will be made if it's not in\nCSR format." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -34483,7 +35326,8 @@ "docstring": { "type": "Ignored", "description": "Not used, present here for API consistency by convention." - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -34493,13 +35337,14 @@ "docstring": { "type": "array-like of shape (n_samples,), default=None", "description": "The weights for each observation in X. If None, all observations\nare assigned equal weight." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Update k means estimate on a single mini-batch X.", - "docstring": "Update k means estimate on a single mini-batch X.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training instances to cluster. It must be noted that the data\n will be converted to C ordering, which will cause a memory copy\n if the given data is not C-contiguous.\n If a sparse matrix is passed, a copy will be made if it's not in\n CSR format.\n\ny : Ignored\n Not used, present here for API consistency by convention.\n\nsample_weight : array-like of shape (n_samples,), default=None\n The weights for each observation in X. If None, all observations\n are assigned equal weight.\n\nReturns\n-------\nself : object\n Return updated estimator.", + "docstring": "Update k means estimate on a single mini-batch X.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training instances to cluster. It must be noted that the data\n will be converted to C ordering, which will cause a memory copy\n if the given data is not C-contiguous.\n If a sparse matrix is passed, a copy will be made if it's not in\n CSR format.\n\n y : Ignored\n Not used, present here for API consistency by convention.\n\n sample_weight : array-like of shape (n_samples,), default=None\n The weights for each observation in X. If None, all observations\n are assigned equal weight.\n\n Returns\n -------\n self : object\n Return updated estimator.\n ", "source_code": "\ndef partial_fit(self, X, y=None, sample_weight=None):\n \"\"\"Update k means estimate on a single mini-batch X.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training instances to cluster. It must be noted that the data\n will be converted to C ordering, which will cause a memory copy\n if the given data is not C-contiguous.\n If a sparse matrix is passed, a copy will be made if it's not in\n CSR format.\n\n y : Ignored\n Not used, present here for API consistency by convention.\n\n sample_weight : array-like of shape (n_samples,), default=None\n The weights for each observation in X. 
If None, all observations\n are assigned equal weight.\n\n Returns\n -------\n self : object\n Return updated estimator.\n \"\"\"\n has_centers = hasattr(self, 'cluster_centers_')\n X = self._validate_data(X, accept_sparse='csr', dtype=[np.float64, np.float32], order='C', accept_large_sparse=False, reset=not has_centers)\n self._random_state = getattr(self, '_random_state', check_random_state(self.random_state))\n sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype)\n self.n_steps_ = getattr(self, 'n_steps_', 0)\n x_squared_norms = row_norms(X, squared=True)\n if not has_centers:\n self._check_params(X)\n self._n_threads = _openmp_effective_n_threads()\n init = self.init\n if hasattr(init, '__array__'):\n init = check_array(init, dtype=X.dtype, copy=True, order='C')\n self._validate_center_shape(X, init)\n self._check_mkl_vcomp(X, X.shape[0])\n self.cluster_centers_ = self._init_centroids(X, x_squared_norms=x_squared_norms, init=init, random_state=self._random_state, init_size=self._init_size)\n self._counts = np.zeros(self.n_clusters, dtype=X.dtype)\n self._n_since_last_reassign = 0\n with threadpool_limits(limits=1, user_api='blas'):\n _mini_batch_step(X, x_squared_norms=x_squared_norms, sample_weight=sample_weight, centers=self.cluster_centers_, centers_new=self.cluster_centers_, weight_sums=self._counts, random_state=self._random_state, random_reassign=self._random_reassign(), reassignment_ratio=self.reassignment_ratio, verbose=self.verbose, n_threads=self._n_threads)\n if self.compute_labels:\n (self.labels_, self.inertia_) = _labels_inertia_threadpool_limit(X, sample_weight, x_squared_norms, self.cluster_centers_, n_threads=self._n_threads)\n self.n_steps_ += 1\n return self" }, { @@ -34517,7 +35362,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -34527,6 +35373,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "New data to predict." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -34537,13 +35387,14 @@ "docstring": { "type": "array-like of shape (n_samples,), default=None", "description": "The weights for each observation in X. If None, all observations\nare assigned equal weight." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Predict the closest cluster each sample in X belongs to.\n\nIn the vector quantization literature, `cluster_centers_` is called the code book and each value returned by `predict` is the index of the closest code in the code book.", - "docstring": "Predict the closest cluster each sample in X belongs to.\n\nIn the vector quantization literature, `cluster_centers_` is called\nthe code book and each value returned by `predict` is the index of\nthe closest code in the code book.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n New data to predict.\n\nsample_weight : array-like of shape (n_samples,), default=None\n The weights for each observation in X. 
If None, all observations\n are assigned equal weight.\n\nReturns\n-------\nlabels : ndarray of shape (n_samples,)\n Index of the cluster each sample belongs to.", + "description": "Predict the closest cluster each sample in X belongs to.\n\nIn the vector quantization literature, `cluster_centers_` is called\nthe code book and each value returned by `predict` is the index of\nthe closest code in the code book.", + "docstring": "Predict the closest cluster each sample in X belongs to.\n\n In the vector quantization literature, `cluster_centers_` is called\n the code book and each value returned by `predict` is the index of\n the closest code in the code book.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n New data to predict.\n\n sample_weight : array-like of shape (n_samples,), default=None\n The weights for each observation in X. If None, all observations\n are assigned equal weight.\n\n Returns\n -------\n labels : ndarray of shape (n_samples,)\n Index of the cluster each sample belongs to.\n ", "source_code": "\ndef predict(self, X, sample_weight=None):\n \"\"\"Predict the closest cluster each sample in X belongs to.\n\n In the vector quantization literature, `cluster_centers_` is called\n the code book and each value returned by `predict` is the index of\n the closest code in the code book.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n New data to predict.\n\n sample_weight : array-like of shape (n_samples,), default=None\n The weights for each observation in X. If None, all observations\n are assigned equal weight.\n\n Returns\n -------\n labels : ndarray of shape (n_samples,)\n Index of the cluster each sample belongs to.\n \"\"\"\n check_is_fitted(self)\n X = self._check_test_data(X)\n x_squared_norms = row_norms(X, squared=True)\n sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype)\n (labels, _) = _labels_inertia_threadpool_limit(X, sample_weight, x_squared_norms, self.cluster_centers_, n_threads=self._n_threads)\n return labels" }, { @@ -34564,13 +35415,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@deprecated('The attribute `random_state_` is deprecated in 0.24 and will be removed in 1.1 (renaming of 0.26).')\n@property\ndef random_state_(self):\n return getattr(self, '_random_state', None)" }, { @@ -34588,6 +35440,10 @@ "docstring": { "type": "{ndarray, sparse matrix} of shape (n_samples, n_features)", "description": "The data to pick seeds for." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -34598,7 +35454,8 @@ "docstring": { "type": "int", "description": "The number of seeds to choose." - } + }, + "refined_type": {} }, { "name": "x_squared_norms", @@ -34608,7 +35465,8 @@ "docstring": { "type": "ndarray of shape (n_samples,)", "description": "Squared Euclidean norm of each data point." - } + }, + "refined_type": {} }, { "name": "random_state", @@ -34618,7 +35476,8 @@ "docstring": { "type": "RandomState instance", "description": "The generator used to initialize the centers.\nSee :term:`Glossary `." 
- } + }, + "refined_type": {} }, { "name": "n_local_trials", @@ -34628,13 +35487,14 @@ "docstring": { "type": "int, default=None", "description": "The number of seeding trials for each center (except the first),\nof which the one reducing inertia the most is greedily chosen.\nSet to None to make the number of trials depend logarithmically\non the number of seeds (2+log(k)); this is the default." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Computational component for initialization of n_clusters by k-means++. Prior validation of data is assumed.", - "docstring": "Computational component for initialization of n_clusters by\nk-means++. Prior validation of data is assumed.\n\nParameters\n----------\nX : {ndarray, sparse matrix} of shape (n_samples, n_features)\n The data to pick seeds for.\n\nn_clusters : int\n The number of seeds to choose.\n\nx_squared_norms : ndarray of shape (n_samples,)\n Squared Euclidean norm of each data point.\n\nrandom_state : RandomState instance\n The generator used to initialize the centers.\n See :term:`Glossary `.\n\nn_local_trials : int, default=None\n The number of seeding trials for each center (except the first),\n of which the one reducing inertia the most is greedily chosen.\n Set to None to make the number of trials depend logarithmically\n on the number of seeds (2+log(k)); this is the default.\n\nReturns\n-------\ncenters : ndarray of shape (n_clusters, n_features)\n The initial centers for k-means.\n\nindices : ndarray of shape (n_clusters,)\n The index location of the chosen centers in the data array X. For a\n given index and center, X[index] = center.", + "description": "Computational component for initialization of n_clusters by\nk-means++. Prior validation of data is assumed.", + "docstring": "Computational component for initialization of n_clusters by\n k-means++. Prior validation of data is assumed.\n\n Parameters\n ----------\n X : {ndarray, sparse matrix} of shape (n_samples, n_features)\n The data to pick seeds for.\n\n n_clusters : int\n The number of seeds to choose.\n\n x_squared_norms : ndarray of shape (n_samples,)\n Squared Euclidean norm of each data point.\n\n random_state : RandomState instance\n The generator used to initialize the centers.\n See :term:`Glossary `.\n\n n_local_trials : int, default=None\n The number of seeding trials for each center (except the first),\n of which the one reducing inertia the most is greedily chosen.\n Set to None to make the number of trials depend logarithmically\n on the number of seeds (2+log(k)); this is the default.\n\n Returns\n -------\n centers : ndarray of shape (n_clusters, n_features)\n The initial centers for k-means.\n\n indices : ndarray of shape (n_clusters,)\n The index location of the chosen centers in the data array X. For a\n given index and center, X[index] = center.\n ", "source_code": "\ndef _kmeans_plusplus(X, n_clusters, x_squared_norms, random_state, n_local_trials=None):\n \"\"\"Computational component for initialization of n_clusters by\n k-means++. 
Prior validation of data is assumed.\n\n Parameters\n ----------\n X : {ndarray, sparse matrix} of shape (n_samples, n_features)\n The data to pick seeds for.\n\n n_clusters : int\n The number of seeds to choose.\n\n x_squared_norms : ndarray of shape (n_samples,)\n Squared Euclidean norm of each data point.\n\n random_state : RandomState instance\n The generator used to initialize the centers.\n See :term:`Glossary `.\n\n n_local_trials : int, default=None\n The number of seeding trials for each center (except the first),\n of which the one reducing inertia the most is greedily chosen.\n Set to None to make the number of trials depend logarithmically\n on the number of seeds (2+log(k)); this is the default.\n\n Returns\n -------\n centers : ndarray of shape (n_clusters, n_features)\n The initial centers for k-means.\n\n indices : ndarray of shape (n_clusters,)\n The index location of the chosen centers in the data array X. For a\n given index and center, X[index] = center.\n \"\"\"\n (n_samples, n_features) = X.shape\n centers = np.empty((n_clusters, n_features), dtype=X.dtype)\n if n_local_trials is None:\n n_local_trials = 2 + int(np.log(n_clusters))\n center_id = random_state.randint(n_samples)\n indices = np.full(n_clusters, -1, dtype=int)\n if sp.issparse(X):\n centers[0] = X[center_id].toarray()\n else:\n centers[0] = X[center_id]\n indices[0] = center_id\n closest_dist_sq = _euclidean_distances(centers[0, np.newaxis], X, Y_norm_squared=x_squared_norms, squared=True)\n current_pot = closest_dist_sq.sum()\n for c in range(1, n_clusters):\n rand_vals = random_state.random_sample(n_local_trials) * current_pot\n candidate_ids = np.searchsorted(stable_cumsum(closest_dist_sq), rand_vals)\n np.clip(candidate_ids, None, closest_dist_sq.size - 1, out=candidate_ids)\n distance_to_candidates = _euclidean_distances(X[candidate_ids], X, Y_norm_squared=x_squared_norms, squared=True)\n np.minimum(closest_dist_sq, distance_to_candidates, out=distance_to_candidates)\n candidates_pot = distance_to_candidates.sum(axis=1)\n best_candidate = np.argmin(candidates_pot)\n current_pot = candidates_pot[best_candidate]\n closest_dist_sq = distance_to_candidates[best_candidate]\n best_candidate = candidate_ids[best_candidate]\n if sp.issparse(X):\n centers[c] = X[best_candidate].toarray()\n else:\n centers[c] = X[best_candidate]\n indices[c] = best_candidate\n return centers, indices" }, { @@ -34652,6 +35512,10 @@ "docstring": { "type": "{ndarray, sparse matrix} of shape (n_samples, n_features)", "description": "The observations to cluster. If sparse matrix, must be in CSR format." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -34662,7 +35526,8 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "The weights for each observation in X." - } + }, + "refined_type": {} }, { "name": "centers_init", @@ -34672,7 +35537,8 @@ "docstring": { "type": "ndarray of shape (n_clusters, n_features)", "description": "The initial centers." - } + }, + "refined_type": {} }, { "name": "max_iter", @@ -34682,7 +35548,8 @@ "docstring": { "type": "int, default=300", "description": "Maximum number of iterations of the k-means algorithm to run." - } + }, + "refined_type": {} }, { "name": "verbose", @@ -34692,7 +35559,8 @@ "docstring": { "type": "bool, default=False", "description": "Verbosity mode." - } + }, + "refined_type": {} }, { "name": "x_squared_norms", @@ -34702,7 +35570,8 @@ "docstring": { "type": "array-like, default=None", "description": "Precomputed x_squared_norms." 
- } + }, + "refined_type": {} }, { "name": "tol", @@ -34712,7 +35581,8 @@ "docstring": { "type": "float, default=1e-4", "description": "Relative tolerance with regards to Frobenius norm of the difference\nin the cluster centers of two consecutive iterations to declare\nconvergence.\nIt's not advised to set `tol=0` since convergence might never be\ndeclared due to rounding errors. Use a very small number instead." - } + }, + "refined_type": {} }, { "name": "n_threads", @@ -34722,13 +35592,14 @@ "docstring": { "type": "int, default=1", "description": "The number of OpenMP threads to use for the computation. Parallelism is\nsample-wise on the main cython loop which assigns each sample to its\nclosest center." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "A single run of k-means elkan, assumes preparation completed prior.", - "docstring": "A single run of k-means elkan, assumes preparation completed prior.\n\nParameters\n----------\nX : {ndarray, sparse matrix} of shape (n_samples, n_features)\n The observations to cluster. If sparse matrix, must be in CSR format.\n\nsample_weight : array-like of shape (n_samples,)\n The weights for each observation in X.\n\ncenters_init : ndarray of shape (n_clusters, n_features)\n The initial centers.\n\nmax_iter : int, default=300\n Maximum number of iterations of the k-means algorithm to run.\n\nverbose : bool, default=False\n Verbosity mode.\n\nx_squared_norms : array-like, default=None\n Precomputed x_squared_norms.\n\ntol : float, default=1e-4\n Relative tolerance with regards to Frobenius norm of the difference\n in the cluster centers of two consecutive iterations to declare\n convergence.\n It's not advised to set `tol=0` since convergence might never be\n declared due to rounding errors. Use a very small number instead.\n\nn_threads : int, default=1\n The number of OpenMP threads to use for the computation. Parallelism is\n sample-wise on the main cython loop which assigns each sample to its\n closest center.\n\nReturns\n-------\ncentroid : ndarray of shape (n_clusters, n_features)\n Centroids found at the last iteration of k-means.\n\nlabel : ndarray of shape (n_samples,)\n label[i] is the code or index of the centroid the\n i'th observation is closest to.\n\ninertia : float\n The final value of the inertia criterion (sum of squared distances to\n the closest centroid for all observations in the training set).\n\nn_iter : int\n Number of iterations run.", + "docstring": "A single run of k-means elkan, assumes preparation completed prior.\n\n Parameters\n ----------\n X : {ndarray, sparse matrix} of shape (n_samples, n_features)\n The observations to cluster. If sparse matrix, must be in CSR format.\n\n sample_weight : array-like of shape (n_samples,)\n The weights for each observation in X.\n\n centers_init : ndarray of shape (n_clusters, n_features)\n The initial centers.\n\n max_iter : int, default=300\n Maximum number of iterations of the k-means algorithm to run.\n\n verbose : bool, default=False\n Verbosity mode.\n\n x_squared_norms : array-like, default=None\n Precomputed x_squared_norms.\n\n tol : float, default=1e-4\n Relative tolerance with regards to Frobenius norm of the difference\n in the cluster centers of two consecutive iterations to declare\n convergence.\n It's not advised to set `tol=0` since convergence might never be\n declared due to rounding errors. Use a very small number instead.\n\n n_threads : int, default=1\n The number of OpenMP threads to use for the computation. 
Parallelism is\n sample-wise on the main cython loop which assigns each sample to its\n closest center.\n\n Returns\n -------\n centroid : ndarray of shape (n_clusters, n_features)\n Centroids found at the last iteration of k-means.\n\n label : ndarray of shape (n_samples,)\n label[i] is the code or index of the centroid the\n i'th observation is closest to.\n\n inertia : float\n The final value of the inertia criterion (sum of squared distances to\n the closest centroid for all observations in the training set).\n\n n_iter : int\n Number of iterations run.\n ", "source_code": "\ndef _kmeans_single_elkan(X, sample_weight, centers_init, max_iter=300, verbose=False, x_squared_norms=None, tol=0.0001, n_threads=1):\n \"\"\"A single run of k-means elkan, assumes preparation completed prior.\n\n Parameters\n ----------\n X : {ndarray, sparse matrix} of shape (n_samples, n_features)\n The observations to cluster. If sparse matrix, must be in CSR format.\n\n sample_weight : array-like of shape (n_samples,)\n The weights for each observation in X.\n\n centers_init : ndarray of shape (n_clusters, n_features)\n The initial centers.\n\n max_iter : int, default=300\n Maximum number of iterations of the k-means algorithm to run.\n\n verbose : bool, default=False\n Verbosity mode.\n\n x_squared_norms : array-like, default=None\n Precomputed x_squared_norms.\n\n tol : float, default=1e-4\n Relative tolerance with regards to Frobenius norm of the difference\n in the cluster centers of two consecutive iterations to declare\n convergence.\n It's not advised to set `tol=0` since convergence might never be\n declared due to rounding errors. Use a very small number instead.\n\n n_threads : int, default=1\n The number of OpenMP threads to use for the computation. Parallelism is\n sample-wise on the main cython loop which assigns each sample to its\n closest center.\n\n Returns\n -------\n centroid : ndarray of shape (n_clusters, n_features)\n Centroids found at the last iteration of k-means.\n\n label : ndarray of shape (n_samples,)\n label[i] is the code or index of the centroid the\n i'th observation is closest to.\n\n inertia : float\n The final value of the inertia criterion (sum of squared distances to\n the closest centroid for all observations in the training set).\n\n n_iter : int\n Number of iterations run.\n \"\"\"\n n_samples = X.shape[0]\n n_clusters = centers_init.shape[0]\n centers = centers_init\n centers_new = np.zeros_like(centers)\n weight_in_clusters = np.zeros(n_clusters, dtype=X.dtype)\n labels = np.full(n_samples, -1, dtype=np.int32)\n labels_old = labels.copy()\n center_half_distances = euclidean_distances(centers) / 2\n distance_next_center = np.partition(np.asarray(center_half_distances), kth=1, axis=0)[1]\n upper_bounds = np.zeros(n_samples, dtype=X.dtype)\n lower_bounds = np.zeros((n_samples, n_clusters), dtype=X.dtype)\n center_shift = np.zeros(n_clusters, dtype=X.dtype)\n if sp.issparse(X):\n init_bounds = init_bounds_sparse\n elkan_iter = elkan_iter_chunked_sparse\n _inertia = _inertia_sparse\n else:\n init_bounds = init_bounds_dense\n elkan_iter = elkan_iter_chunked_dense\n _inertia = _inertia_dense\n init_bounds(X, centers, center_half_distances, labels, upper_bounds, lower_bounds)\n strict_convergence = False\n for i in range(max_iter):\n elkan_iter(X, sample_weight, centers, centers_new, weight_in_clusters, center_half_distances, distance_next_center, upper_bounds, lower_bounds, labels, center_shift, n_threads)\n center_half_distances = euclidean_distances(centers_new) / 2\n 
distance_next_center = np.partition(np.asarray(center_half_distances), kth=1, axis=0)[1]\n if verbose:\n inertia = _inertia(X, sample_weight, centers, labels, n_threads)\n print(f'Iteration {i}, inertia {inertia}')\n (centers, centers_new) = (centers_new, centers)\n if np.array_equal(labels, labels_old):\n if verbose:\n print(f'Converged at iteration {i}: strict convergence.')\n strict_convergence = True\n break\n else:\n center_shift_tot = (center_shift**2).sum()\n if center_shift_tot <= tol:\n if verbose:\n print(f'Converged at iteration {i}: center shift {center_shift_tot} within tolerance {tol}.')\n break\n labels_old[:] = labels\n if not strict_convergence:\n elkan_iter(X, sample_weight, centers, centers, weight_in_clusters, center_half_distances, distance_next_center, upper_bounds, lower_bounds, labels, center_shift, n_threads, update_centers=False)\n inertia = _inertia(X, sample_weight, centers, labels, n_threads)\n return labels, inertia, centers, i + 1" }, { @@ -34746,6 +35617,10 @@ "docstring": { "type": "{ndarray, sparse matrix} of shape (n_samples, n_features)", "description": "The observations to cluster. If sparse matrix, must be in CSR format." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -34756,7 +35631,8 @@ "docstring": { "type": "ndarray of shape (n_samples,)", "description": "The weights for each observation in X." - } + }, + "refined_type": {} }, { "name": "centers_init", @@ -34766,7 +35642,8 @@ "docstring": { "type": "ndarray of shape (n_clusters, n_features)", "description": "The initial centers." - } + }, + "refined_type": {} }, { "name": "max_iter", @@ -34776,7 +35653,8 @@ "docstring": { "type": "int, default=300", "description": "Maximum number of iterations of the k-means algorithm to run." - } + }, + "refined_type": {} }, { "name": "verbose", @@ -34786,7 +35664,8 @@ "docstring": { "type": "bool, default=False", "description": "Verbosity mode" - } + }, + "refined_type": {} }, { "name": "x_squared_norms", @@ -34796,7 +35675,8 @@ "docstring": { "type": "ndarray of shape (n_samples,), default=None", "description": "Precomputed x_squared_norms." - } + }, + "refined_type": {} }, { "name": "tol", @@ -34806,7 +35686,8 @@ "docstring": { "type": "float, default=1e-4", "description": "Relative tolerance with regards to Frobenius norm of the difference\nin the cluster centers of two consecutive iterations to declare\nconvergence.\nIt's not advised to set `tol=0` since convergence might never be\ndeclared due to rounding errors. Use a very small number instead." - } + }, + "refined_type": {} }, { "name": "n_threads", @@ -34816,13 +35697,14 @@ "docstring": { "type": "int, default=1", "description": "The number of OpenMP threads to use for the computation. Parallelism is\nsample-wise on the main cython loop which assigns each sample to its\nclosest center." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "A single run of k-means lloyd, assumes preparation completed prior.", - "docstring": "A single run of k-means lloyd, assumes preparation completed prior.\n\nParameters\n----------\nX : {ndarray, sparse matrix} of shape (n_samples, n_features)\n The observations to cluster. 
If sparse matrix, must be in CSR format.\n\nsample_weight : ndarray of shape (n_samples,)\n The weights for each observation in X.\n\ncenters_init : ndarray of shape (n_clusters, n_features)\n The initial centers.\n\nmax_iter : int, default=300\n Maximum number of iterations of the k-means algorithm to run.\n\nverbose : bool, default=False\n Verbosity mode\n\nx_squared_norms : ndarray of shape (n_samples,), default=None\n Precomputed x_squared_norms.\n\ntol : float, default=1e-4\n Relative tolerance with regards to Frobenius norm of the difference\n in the cluster centers of two consecutive iterations to declare\n convergence.\n It's not advised to set `tol=0` since convergence might never be\n declared due to rounding errors. Use a very small number instead.\n\nn_threads : int, default=1\n The number of OpenMP threads to use for the computation. Parallelism is\n sample-wise on the main cython loop which assigns each sample to its\n closest center.\n\nReturns\n-------\ncentroid : ndarray of shape (n_clusters, n_features)\n Centroids found at the last iteration of k-means.\n\nlabel : ndarray of shape (n_samples,)\n label[i] is the code or index of the centroid the\n i'th observation is closest to.\n\ninertia : float\n The final value of the inertia criterion (sum of squared distances to\n the closest centroid for all observations in the training set).\n\nn_iter : int\n Number of iterations run.", + "docstring": "A single run of k-means lloyd, assumes preparation completed prior.\n\n Parameters\n ----------\n X : {ndarray, sparse matrix} of shape (n_samples, n_features)\n The observations to cluster. If sparse matrix, must be in CSR format.\n\n sample_weight : ndarray of shape (n_samples,)\n The weights for each observation in X.\n\n centers_init : ndarray of shape (n_clusters, n_features)\n The initial centers.\n\n max_iter : int, default=300\n Maximum number of iterations of the k-means algorithm to run.\n\n verbose : bool, default=False\n Verbosity mode\n\n x_squared_norms : ndarray of shape (n_samples,), default=None\n Precomputed x_squared_norms.\n\n tol : float, default=1e-4\n Relative tolerance with regards to Frobenius norm of the difference\n in the cluster centers of two consecutive iterations to declare\n convergence.\n It's not advised to set `tol=0` since convergence might never be\n declared due to rounding errors. Use a very small number instead.\n\n n_threads : int, default=1\n The number of OpenMP threads to use for the computation. Parallelism is\n sample-wise on the main cython loop which assigns each sample to its\n closest center.\n\n Returns\n -------\n centroid : ndarray of shape (n_clusters, n_features)\n Centroids found at the last iteration of k-means.\n\n label : ndarray of shape (n_samples,)\n label[i] is the code or index of the centroid the\n i'th observation is closest to.\n\n inertia : float\n The final value of the inertia criterion (sum of squared distances to\n the closest centroid for all observations in the training set).\n\n n_iter : int\n Number of iterations run.\n ", "source_code": "\ndef _kmeans_single_lloyd(X, sample_weight, centers_init, max_iter=300, verbose=False, x_squared_norms=None, tol=0.0001, n_threads=1):\n \"\"\"A single run of k-means lloyd, assumes preparation completed prior.\n\n Parameters\n ----------\n X : {ndarray, sparse matrix} of shape (n_samples, n_features)\n The observations to cluster. 
If sparse matrix, must be in CSR format.\n\n sample_weight : ndarray of shape (n_samples,)\n The weights for each observation in X.\n\n centers_init : ndarray of shape (n_clusters, n_features)\n The initial centers.\n\n max_iter : int, default=300\n Maximum number of iterations of the k-means algorithm to run.\n\n verbose : bool, default=False\n Verbosity mode\n\n x_squared_norms : ndarray of shape (n_samples,), default=None\n Precomputed x_squared_norms.\n\n tol : float, default=1e-4\n Relative tolerance with regards to Frobenius norm of the difference\n in the cluster centers of two consecutive iterations to declare\n convergence.\n It's not advised to set `tol=0` since convergence might never be\n declared due to rounding errors. Use a very small number instead.\n\n n_threads : int, default=1\n The number of OpenMP threads to use for the computation. Parallelism is\n sample-wise on the main cython loop which assigns each sample to its\n closest center.\n\n Returns\n -------\n centroid : ndarray of shape (n_clusters, n_features)\n Centroids found at the last iteration of k-means.\n\n label : ndarray of shape (n_samples,)\n label[i] is the code or index of the centroid the\n i'th observation is closest to.\n\n inertia : float\n The final value of the inertia criterion (sum of squared distances to\n the closest centroid for all observations in the training set).\n\n n_iter : int\n Number of iterations run.\n \"\"\"\n n_clusters = centers_init.shape[0]\n centers = centers_init\n centers_new = np.zeros_like(centers)\n labels = np.full(X.shape[0], -1, dtype=np.int32)\n labels_old = labels.copy()\n weight_in_clusters = np.zeros(n_clusters, dtype=X.dtype)\n center_shift = np.zeros(n_clusters, dtype=X.dtype)\n if sp.issparse(X):\n lloyd_iter = lloyd_iter_chunked_sparse\n _inertia = _inertia_sparse\n else:\n lloyd_iter = lloyd_iter_chunked_dense\n _inertia = _inertia_dense\n strict_convergence = False\n with threadpool_limits(limits=1, user_api='blas'):\n for i in range(max_iter):\n lloyd_iter(X, sample_weight, x_squared_norms, centers, centers_new, weight_in_clusters, labels, center_shift, n_threads)\n if verbose:\n inertia = _inertia(X, sample_weight, centers, labels, n_threads)\n print(f'Iteration {i}, inertia {inertia}.')\n (centers, centers_new) = (centers_new, centers)\n if np.array_equal(labels, labels_old):\n if verbose:\n print(f'Converged at iteration {i}: strict convergence.')\n strict_convergence = True\n break\n else:\n center_shift_tot = (center_shift**2).sum()\n if center_shift_tot <= tol:\n if verbose:\n print(f'Converged at iteration {i}: center shift {center_shift_tot} within tolerance {tol}.')\n break\n labels_old[:] = labels\n if not strict_convergence:\n lloyd_iter(X, sample_weight, x_squared_norms, centers, centers, weight_in_clusters, labels, center_shift, n_threads, update_centers=False)\n inertia = _inertia(X, sample_weight, centers, labels, n_threads)\n return labels, inertia, centers, i + 1" }, { @@ -34840,6 +35722,10 @@ "docstring": { "type": "{ndarray, sparse matrix} of shape (n_samples, n_features)", "description": "The input samples to assign to the labels. If sparse matrix, must\nbe in CSR format." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -34850,7 +35736,8 @@ "docstring": { "type": "ndarray of shape (n_samples,)", "description": "The weights for each observation in X." 
- } + }, + "refined_type": {} }, { "name": "x_squared_norms", @@ -34860,7 +35747,8 @@ "docstring": { "type": "ndarray of shape (n_samples,)", "description": "Precomputed squared euclidean norm of each data point, to speed up\ncomputations." - } + }, + "refined_type": {} }, { "name": "centers", @@ -34870,7 +35758,8 @@ "docstring": { "type": "ndarray of shape (n_clusters, n_features)", "description": "The cluster centers." - } + }, + "refined_type": {} }, { "name": "n_threads", @@ -34880,13 +35769,14 @@ "docstring": { "type": "int, default=1", "description": "The number of OpenMP threads to use for the computation. Parallelism is\nsample-wise on the main cython loop which assigns each sample to its\nclosest center." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "E step of the K-means EM algorithm.\n\nCompute the labels and the inertia of the given samples and centers.", - "docstring": "E step of the K-means EM algorithm.\n\nCompute the labels and the inertia of the given samples and centers.\n\nParameters\n----------\nX : {ndarray, sparse matrix} of shape (n_samples, n_features)\n The input samples to assign to the labels. If sparse matrix, must\n be in CSR format.\n\nsample_weight : ndarray of shape (n_samples,)\n The weights for each observation in X.\n\nx_squared_norms : ndarray of shape (n_samples,)\n Precomputed squared euclidean norm of each data point, to speed up\n computations.\n\ncenters : ndarray of shape (n_clusters, n_features)\n The cluster centers.\n\nn_threads : int, default=1\n The number of OpenMP threads to use for the computation. Parallelism is\n sample-wise on the main cython loop which assigns each sample to its\n closest center.\n\nReturns\n-------\nlabels : ndarray of shape (n_samples,)\n The resulting assignment.\n\ninertia : float\n Sum of squared distances of samples to their closest cluster center.", + "docstring": "E step of the K-means EM algorithm.\n\n Compute the labels and the inertia of the given samples and centers.\n\n Parameters\n ----------\n X : {ndarray, sparse matrix} of shape (n_samples, n_features)\n The input samples to assign to the labels. If sparse matrix, must\n be in CSR format.\n\n sample_weight : ndarray of shape (n_samples,)\n The weights for each observation in X.\n\n x_squared_norms : ndarray of shape (n_samples,)\n Precomputed squared euclidean norm of each data point, to speed up\n computations.\n\n centers : ndarray of shape (n_clusters, n_features)\n The cluster centers.\n\n n_threads : int, default=1\n The number of OpenMP threads to use for the computation. Parallelism is\n sample-wise on the main cython loop which assigns each sample to its\n closest center.\n\n Returns\n -------\n labels : ndarray of shape (n_samples,)\n The resulting assignment.\n\n inertia : float\n Sum of squared distances of samples to their closest cluster center.\n ", "source_code": "\ndef _labels_inertia(X, sample_weight, x_squared_norms, centers, n_threads=1):\n \"\"\"E step of the K-means EM algorithm.\n\n Compute the labels and the inertia of the given samples and centers.\n\n Parameters\n ----------\n X : {ndarray, sparse matrix} of shape (n_samples, n_features)\n The input samples to assign to the labels. 
If sparse matrix, must\n be in CSR format.\n\n sample_weight : ndarray of shape (n_samples,)\n The weights for each observation in X.\n\n x_squared_norms : ndarray of shape (n_samples,)\n Precomputed squared euclidean norm of each data point, to speed up\n computations.\n\n centers : ndarray of shape (n_clusters, n_features)\n The cluster centers.\n\n n_threads : int, default=1\n The number of OpenMP threads to use for the computation. Parallelism is\n sample-wise on the main cython loop which assigns each sample to its\n closest center.\n\n Returns\n -------\n labels : ndarray of shape (n_samples,)\n The resulting assignment.\n\n inertia : float\n Sum of squared distances of samples to their closest cluster center.\n \"\"\"\n n_samples = X.shape[0]\n n_clusters = centers.shape[0]\n labels = np.full(n_samples, -1, dtype=np.int32)\n weight_in_clusters = np.zeros(n_clusters, dtype=centers.dtype)\n center_shift = np.zeros_like(weight_in_clusters)\n if sp.issparse(X):\n _labels = lloyd_iter_chunked_sparse\n _inertia = _inertia_sparse\n else:\n _labels = lloyd_iter_chunked_dense\n _inertia = _inertia_dense\n X = ReadonlyArrayWrapper(X)\n _labels(X, sample_weight, x_squared_norms, centers, centers, weight_in_clusters, labels, center_shift, n_threads, update_centers=False)\n inertia = _inertia(X, sample_weight, centers, labels, n_threads)\n return labels, inertia" }, { @@ -34904,7 +35794,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -34914,7 +35805,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "x_squared_norms", @@ -34924,7 +35816,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "centers", @@ -34934,7 +35827,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_threads", @@ -34944,7 +35838,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -34968,6 +35863,10 @@ "docstring": { "type": "{ndarray, sparse matrix} of shape (n_samples, n_features)", "description": "The original data array. If sparse, must be in CSR format." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -34978,7 +35877,8 @@ "docstring": { "type": "ndarray of shape (n_samples,)", "description": "Squared euclidean norm of each data point." - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -34988,7 +35888,8 @@ "docstring": { "type": "ndarray of shape (n_samples,)", "description": "The weights for each observation in X." - } + }, + "refined_type": {} }, { "name": "centers", @@ -34998,7 +35899,8 @@ "docstring": { "type": "ndarray of shape (n_clusters, n_features)", "description": "The cluster centers before the current iteration" - } + }, + "refined_type": {} }, { "name": "centers_new", @@ -35008,7 +35910,8 @@ "docstring": { "type": "ndarray of shape (n_clusters, n_features)", "description": "The cluster centers after the current iteration. Modified in-place." - } + }, + "refined_type": {} }, { "name": "weight_sums", @@ -35018,7 +35921,8 @@ "docstring": { "type": "ndarray of shape (n_clusters,)", "description": "The vector in which we keep track of the numbers of points in a\ncluster. This array is modified in place." - } + }, + "refined_type": {} }, { "name": "random_state", @@ -35028,7 +35932,8 @@ "docstring": { "type": "RandomState instance", "description": "Determines random number generation for low count centers reassignment.\nSee :term:`Glossary `." 
- } + }, + "refined_type": {} }, { "name": "random_reassign", @@ -35038,7 +35943,8 @@ "docstring": { "type": "boolean, default=False", "description": "If True, centers with very low counts are randomly reassigned\nto observations." - } + }, + "refined_type": {} }, { "name": "reassignment_ratio", @@ -35048,7 +35954,8 @@ "docstring": { "type": "float, default=0.01", "description": "Control the fraction of the maximum number of counts for a\ncenter to be reassigned. A higher value means that low count\ncenters are more likely to be reassigned, which means that the\nmodel will take longer to converge, but should converge in a\nbetter clustering." - } + }, + "refined_type": {} }, { "name": "verbose", @@ -35058,7 +35965,8 @@ "docstring": { "type": "bool, default=False", "description": "Controls the verbosity." - } + }, + "refined_type": {} }, { "name": "n_threads", @@ -35068,13 +35976,14 @@ "docstring": { "type": "int, default=1", "description": "The number of OpenMP threads to use for the computation." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Incremental update of the centers for the Minibatch K-Means algorithm.", - "docstring": "Incremental update of the centers for the Minibatch K-Means algorithm.\n\nParameters\n----------\n\nX : {ndarray, sparse matrix} of shape (n_samples, n_features)\n The original data array. If sparse, must be in CSR format.\n\nx_squared_norms : ndarray of shape (n_samples,)\n Squared euclidean norm of each data point.\n\nsample_weight : ndarray of shape (n_samples,)\n The weights for each observation in X.\n\ncenters : ndarray of shape (n_clusters, n_features)\n The cluster centers before the current iteration\n\ncenters_new : ndarray of shape (n_clusters, n_features)\n The cluster centers after the current iteration. Modified in-place.\n\nweight_sums : ndarray of shape (n_clusters,)\n The vector in which we keep track of the numbers of points in a\n cluster. This array is modified in place.\n\nrandom_state : RandomState instance\n Determines random number generation for low count centers reassignment.\n See :term:`Glossary `.\n\nrandom_reassign : boolean, default=False\n If True, centers with very low counts are randomly reassigned\n to observations.\n\nreassignment_ratio : float, default=0.01\n Control the fraction of the maximum number of counts for a\n center to be reassigned. A higher value means that low count\n centers are more likely to be reassigned, which means that the\n model will take longer to converge, but should converge in a\n better clustering.\n\nverbose : bool, default=False\n Controls the verbosity.\n\nn_threads : int, default=1\n The number of OpenMP threads to use for the computation.\n\nReturns\n-------\ninertia : float\n Sum of squared distances of samples to their closest cluster center.\n The inertia is computed after finding the labels and before updating\n the centers.", + "docstring": "Incremental update of the centers for the Minibatch K-Means algorithm.\n\n Parameters\n ----------\n\n X : {ndarray, sparse matrix} of shape (n_samples, n_features)\n The original data array. 
If sparse, must be in CSR format.\n\n x_squared_norms : ndarray of shape (n_samples,)\n Squared euclidean norm of each data point.\n\n sample_weight : ndarray of shape (n_samples,)\n The weights for each observation in X.\n\n centers : ndarray of shape (n_clusters, n_features)\n The cluster centers before the current iteration\n\n centers_new : ndarray of shape (n_clusters, n_features)\n The cluster centers after the current iteration. Modified in-place.\n\n weight_sums : ndarray of shape (n_clusters,)\n The vector in which we keep track of the numbers of points in a\n cluster. This array is modified in place.\n\n random_state : RandomState instance\n Determines random number generation for low count centers reassignment.\n See :term:`Glossary `.\n\n random_reassign : boolean, default=False\n If True, centers with very low counts are randomly reassigned\n to observations.\n\n reassignment_ratio : float, default=0.01\n Control the fraction of the maximum number of counts for a\n center to be reassigned. A higher value means that low count\n centers are more likely to be reassigned, which means that the\n model will take longer to converge, but should converge in a\n better clustering.\n\n verbose : bool, default=False\n Controls the verbosity.\n\n n_threads : int, default=1\n The number of OpenMP threads to use for the computation.\n\n Returns\n -------\n inertia : float\n Sum of squared distances of samples to their closest cluster center.\n The inertia is computed after finding the labels and before updating\n the centers.\n ", "source_code": "\ndef _mini_batch_step(X, x_squared_norms, sample_weight, centers, centers_new, weight_sums, random_state, random_reassign=False, reassignment_ratio=0.01, verbose=False, n_threads=1):\n \"\"\"Incremental update of the centers for the Minibatch K-Means algorithm.\n\n Parameters\n ----------\n\n X : {ndarray, sparse matrix} of shape (n_samples, n_features)\n The original data array. If sparse, must be in CSR format.\n\n x_squared_norms : ndarray of shape (n_samples,)\n Squared euclidean norm of each data point.\n\n sample_weight : ndarray of shape (n_samples,)\n The weights for each observation in X.\n\n centers : ndarray of shape (n_clusters, n_features)\n The cluster centers before the current iteration\n\n centers_new : ndarray of shape (n_clusters, n_features)\n The cluster centers after the current iteration. Modified in-place.\n\n weight_sums : ndarray of shape (n_clusters,)\n The vector in which we keep track of the numbers of points in a\n cluster. This array is modified in place.\n\n random_state : RandomState instance\n Determines random number generation for low count centers reassignment.\n See :term:`Glossary `.\n\n random_reassign : boolean, default=False\n If True, centers with very low counts are randomly reassigned\n to observations.\n\n reassignment_ratio : float, default=0.01\n Control the fraction of the maximum number of counts for a\n center to be reassigned. 
A higher value means that low count\n centers are more likely to be reassigned, which means that the\n model will take longer to converge, but should converge in a\n better clustering.\n\n verbose : bool, default=False\n Controls the verbosity.\n\n n_threads : int, default=1\n The number of OpenMP threads to use for the computation.\n\n Returns\n -------\n inertia : float\n Sum of squared distances of samples to their closest cluster center.\n The inertia is computed after finding the labels and before updating\n the centers.\n \"\"\"\n (labels, inertia) = _labels_inertia(X, sample_weight, x_squared_norms, centers, n_threads=n_threads)\n if sp.issparse(X):\n _minibatch_update_sparse(X, sample_weight, centers, centers_new, weight_sums, labels, n_threads)\n else:\n _minibatch_update_dense(ReadonlyArrayWrapper(X), sample_weight, centers, centers_new, weight_sums, labels, n_threads)\n if random_reassign and reassignment_ratio > 0:\n to_reassign = weight_sums < reassignment_ratio * weight_sums.max()\n if to_reassign.sum() > 0.5 * X.shape[0]:\n indices_dont_reassign = np.argsort(weight_sums)[int(0.5 * X.shape[0]):]\n to_reassign[indices_dont_reassign] = False\n n_reassigns = to_reassign.sum()\n if n_reassigns:\n new_centers = random_state.choice(X.shape[0], replace=False, size=n_reassigns)\n if verbose:\n print(f'[MiniBatchKMeans] Reassigning {n_reassigns} cluster centers.')\n if sp.issparse(X):\n assign_rows_csr(X, new_centers.astype(np.intp, copy=False), np.where(to_reassign)[0].astype(np.intp, copy=False), centers_new)\n else:\n centers_new[to_reassign] = X[new_centers]\n weight_sums[to_reassign] = np.min(weight_sums[~to_reassign])\n return inertia" }, { @@ -35092,7 +36001,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "tol", @@ -35102,7 +36012,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -35126,6 +36037,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "The observations to cluster. It must be noted that the data\nwill be converted to C ordering, which will cause a memory copy\nif the given data is not C-contiguous." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -35136,7 +36051,8 @@ "docstring": { "type": "int", "description": "The number of clusters to form as well as the number of\ncentroids to generate." - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -35145,8 +36061,9 @@ "assigned_by": "NAME_ONLY", "docstring": { "type": "array-like of shape (n_samples,), default=None", - "description": "The weights for each observation in X. If None, all observations\nare assigned equal weight." - } + "description": "The weights for each observation in `X`. If `None`, all observations\nare assigned equal weight." + }, + "refined_type": {} }, { "name": "init", @@ -35155,7 +36072,11 @@ "assigned_by": "NAME_ONLY", "docstring": { "type": "{'k-means++', 'random'}, callable or array-like of shape (n_clusters, n_features), default='k-means++'", - "description": "Method for initialization:\n\n'k-means++' : selects initial cluster centers for k-mean\nclustering in a smart way to speed up convergence. 
See section\nNotes in k_init for more details.\n\n'random': choose `n_clusters` observations (rows) at random from data\nfor the initial centroids.\n\nIf an array is passed, it should be of shape (n_clusters, n_features)\nand gives the initial centers.\n\nIf a callable is passed, it should take arguments X, n_clusters and a\nrandom state and return an initialization." + "description": "Method for initialization:\n\n- `'k-means++'` : selects initial cluster centers for k-mean\n clustering in a smart way to speed up convergence. See section\n Notes in k_init for more details.\n- `'random'`: choose `n_clusters` observations (rows) at random from data\n for the initial centroids.\n- If an array is passed, it should be of shape `(n_clusters, n_features)`\n and gives the initial centers.\n- If a callable is passed, it should take arguments `X`, `n_clusters` and a\n random state and return an initialization." + }, + "refined_type": { + "kind": "EnumType", + "values": ["random", "k-means++"] } }, { @@ -35165,8 +36086,9 @@ "assigned_by": "NAME_ONLY", "docstring": { "type": "int, default=10", - "description": "Number of time the k-means algorithm will be run with different\ncentroid seeds. The final results will be the best output of\nn_init consecutive runs in terms of inertia." - } + "description": "Number of time the k-means algorithm will be run with different\ncentroid seeds. The final results will be the best output of\n`n_init` consecutive runs in terms of inertia." + }, + "refined_type": {} }, { "name": "max_iter", @@ -35176,7 +36098,8 @@ "docstring": { "type": "int, default=300", "description": "Maximum number of iterations of the k-means algorithm to run." - } + }, + "refined_type": {} }, { "name": "verbose", @@ -35186,7 +36109,8 @@ "docstring": { "type": "bool, default=False", "description": "Verbosity mode." - } + }, + "refined_type": {} }, { "name": "tol", @@ -35196,7 +36120,8 @@ "docstring": { "type": "float, default=1e-4", "description": "Relative tolerance with regards to Frobenius norm of the difference\nin the cluster centers of two consecutive iterations to declare\nconvergence." - } + }, + "refined_type": {} }, { "name": "random_state", @@ -35206,7 +36131,8 @@ "docstring": { "type": "int, RandomState instance or None, default=None", "description": "Determines random number generation for centroid initialization. Use\nan int to make the randomness deterministic.\nSee :term:`Glossary `." - } + }, + "refined_type": {} }, { "name": "copy_x", @@ -35215,8 +36141,9 @@ "assigned_by": "NAME_ONLY", "docstring": { "type": "bool, default=True", - "description": "When pre-computing distances it is more numerically accurate to center\nthe data first. If copy_x is True (default), then the original data is\nnot modified. If False, the original data is modified, and put back\nbefore the function returns, but small numerical differences may be\nintroduced by subtracting and then adding the data mean. Note that if\nthe original data is not C-contiguous, a copy will be made even if\ncopy_x is False. If the original data is sparse, but not in CSR format,\na copy will be made even if copy_x is False." - } + "description": "When pre-computing distances it is more numerically accurate to center\nthe data first. If `copy_x` is True (default), then the original data is\nnot modified. If False, the original data is modified, and put back\nbefore the function returns, but small numerical differences may be\nintroduced by subtracting and then adding the data mean. 
Note that if\nthe original data is not C-contiguous, a copy will be made even if\n`copy_x` is False. If the original data is sparse, but not in CSR format,\na copy will be made even if `copy_x` is False." + }, + "refined_type": {} }, { "name": "algorithm", @@ -35225,7 +36152,11 @@ "assigned_by": "NAME_ONLY", "docstring": { "type": "{\"auto\", \"full\", \"elkan\"}, default=\"auto\"", - "description": "K-means algorithm to use. The classical EM-style algorithm is \"full\".\nThe \"elkan\" variation is more efficient on data with well-defined\nclusters, by using the triangle inequality. However it's more memory\nintensive due to the allocation of an extra array of shape\n(n_samples, n_clusters).\n\nFor now \"auto\" (kept for backward compatibility) chooses \"elkan\" but it\nmight change in the future for a better heuristic." + "description": "K-means algorithm to use. The classical EM-style algorithm is `\"full\"`.\nThe `\"elkan\"` variation is more efficient on data with well-defined\nclusters, by using the triangle inequality. However it's more memory\nintensive due to the allocation of an extra array of shape\n`(n_samples, n_clusters)`.\n\nFor now `\"auto\"` (kept for backward compatibility) chooses `\"elkan\"` but it\nmight change in the future for a better heuristic." + }, + "refined_type": { + "kind": "EnumType", + "values": ["auto", "full", "elkan"] } }, { @@ -35236,14 +36167,15 @@ "docstring": { "type": "bool, default=False", "description": "Whether or not to return the number of iterations." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "K-means clustering algorithm.\n\nRead more in the :ref:`User Guide `.", - "docstring": "K-means clustering algorithm.\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n The observations to cluster. It must be noted that the data\n will be converted to C ordering, which will cause a memory copy\n if the given data is not C-contiguous.\n\nn_clusters : int\n The number of clusters to form as well as the number of\n centroids to generate.\n\nsample_weight : array-like of shape (n_samples,), default=None\n The weights for each observation in X. If None, all observations\n are assigned equal weight.\n\ninit : {'k-means++', 'random'}, callable or array-like of shape (n_clusters, n_features), default='k-means++'\n Method for initialization:\n\n 'k-means++' : selects initial cluster centers for k-mean\n clustering in a smart way to speed up convergence. See section\n Notes in k_init for more details.\n\n 'random': choose `n_clusters` observations (rows) at random from data\n for the initial centroids.\n\n If an array is passed, it should be of shape (n_clusters, n_features)\n and gives the initial centers.\n\n If a callable is passed, it should take arguments X, n_clusters and a\n random state and return an initialization.\n\nn_init : int, default=10\n Number of time the k-means algorithm will be run with different\n centroid seeds. 
The final results will be the best output of\n n_init consecutive runs in terms of inertia.\n\nmax_iter : int, default=300\n Maximum number of iterations of the k-means algorithm to run.\n\nverbose : bool, default=False\n Verbosity mode.\n\ntol : float, default=1e-4\n Relative tolerance with regards to Frobenius norm of the difference\n in the cluster centers of two consecutive iterations to declare\n convergence.\n\nrandom_state : int, RandomState instance or None, default=None\n Determines random number generation for centroid initialization. Use\n an int to make the randomness deterministic.\n See :term:`Glossary `.\n\ncopy_x : bool, default=True\n When pre-computing distances it is more numerically accurate to center\n the data first. If copy_x is True (default), then the original data is\n not modified. If False, the original data is modified, and put back\n before the function returns, but small numerical differences may be\n introduced by subtracting and then adding the data mean. Note that if\n the original data is not C-contiguous, a copy will be made even if\n copy_x is False. If the original data is sparse, but not in CSR format,\n a copy will be made even if copy_x is False.\n\nalgorithm : {\"auto\", \"full\", \"elkan\"}, default=\"auto\"\n K-means algorithm to use. The classical EM-style algorithm is \"full\".\n The \"elkan\" variation is more efficient on data with well-defined\n clusters, by using the triangle inequality. However it's more memory\n intensive due to the allocation of an extra array of shape\n (n_samples, n_clusters).\n\n For now \"auto\" (kept for backward compatibility) chooses \"elkan\" but it\n might change in the future for a better heuristic.\n\nreturn_n_iter : bool, default=False\n Whether or not to return the number of iterations.\n\nReturns\n-------\ncentroid : ndarray of shape (n_clusters, n_features)\n Centroids found at the last iteration of k-means.\n\nlabel : ndarray of shape (n_samples,)\n label[i] is the code or index of the centroid the\n i'th observation is closest to.\n\ninertia : float\n The final value of the inertia criterion (sum of squared distances to\n the closest centroid for all observations in the training set).\n\nbest_n_iter : int\n Number of iterations corresponding to the best results.\n Returned only if `return_n_iter` is set to True.", - "source_code": "\ndef k_means(X, n_clusters, *, sample_weight=None, init='k-means++', n_init=10, max_iter=300, verbose=False, tol=0.0001, random_state=None, copy_x=True, algorithm='auto', return_n_iter=False):\n \"\"\"K-means clustering algorithm.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The observations to cluster. It must be noted that the data\n will be converted to C ordering, which will cause a memory copy\n if the given data is not C-contiguous.\n\n n_clusters : int\n The number of clusters to form as well as the number of\n centroids to generate.\n\n sample_weight : array-like of shape (n_samples,), default=None\n The weights for each observation in X. If None, all observations\n are assigned equal weight.\n\n init : {'k-means++', 'random'}, callable or array-like of shape (n_clusters, n_features), default='k-means++'\n Method for initialization:\n\n 'k-means++' : selects initial cluster centers for k-mean\n clustering in a smart way to speed up convergence. 
See section\n Notes in k_init for more details.\n\n 'random': choose `n_clusters` observations (rows) at random from data\n for the initial centroids.\n\n If an array is passed, it should be of shape (n_clusters, n_features)\n and gives the initial centers.\n\n If a callable is passed, it should take arguments X, n_clusters and a\n random state and return an initialization.\n\n n_init : int, default=10\n Number of time the k-means algorithm will be run with different\n centroid seeds. The final results will be the best output of\n n_init consecutive runs in terms of inertia.\n\n max_iter : int, default=300\n Maximum number of iterations of the k-means algorithm to run.\n\n verbose : bool, default=False\n Verbosity mode.\n\n tol : float, default=1e-4\n Relative tolerance with regards to Frobenius norm of the difference\n in the cluster centers of two consecutive iterations to declare\n convergence.\n\n random_state : int, RandomState instance or None, default=None\n Determines random number generation for centroid initialization. Use\n an int to make the randomness deterministic.\n See :term:`Glossary `.\n\n copy_x : bool, default=True\n When pre-computing distances it is more numerically accurate to center\n the data first. If copy_x is True (default), then the original data is\n not modified. If False, the original data is modified, and put back\n before the function returns, but small numerical differences may be\n introduced by subtracting and then adding the data mean. Note that if\n the original data is not C-contiguous, a copy will be made even if\n copy_x is False. If the original data is sparse, but not in CSR format,\n a copy will be made even if copy_x is False.\n\n algorithm : {\"auto\", \"full\", \"elkan\"}, default=\"auto\"\n K-means algorithm to use. The classical EM-style algorithm is \"full\".\n The \"elkan\" variation is more efficient on data with well-defined\n clusters, by using the triangle inequality. However it's more memory\n intensive due to the allocation of an extra array of shape\n (n_samples, n_clusters).\n\n For now \"auto\" (kept for backward compatibility) chooses \"elkan\" but it\n might change in the future for a better heuristic.\n\n return_n_iter : bool, default=False\n Whether or not to return the number of iterations.\n\n Returns\n -------\n centroid : ndarray of shape (n_clusters, n_features)\n Centroids found at the last iteration of k-means.\n\n label : ndarray of shape (n_samples,)\n label[i] is the code or index of the centroid the\n i'th observation is closest to.\n\n inertia : float\n The final value of the inertia criterion (sum of squared distances to\n the closest centroid for all observations in the training set).\n\n best_n_iter : int\n Number of iterations corresponding to the best results.\n Returned only if `return_n_iter` is set to True.\n \"\"\"\n est = KMeans(n_clusters=n_clusters, init=init, n_init=n_init, max_iter=max_iter, verbose=verbose, tol=tol, random_state=random_state, copy_x=copy_x, algorithm=algorithm).fit(X, sample_weight=sample_weight)\n if return_n_iter:\n return est.cluster_centers_, est.labels_, est.inertia_, est.n_iter_\n else:\n return est.cluster_centers_, est.labels_, est.inertia_" + "description": "Perform K-means clustering algorithm.\n\nRead more in the :ref:`User Guide `.", + "docstring": "Perform K-means clustering algorithm.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The observations to cluster. 
It must be noted that the data\n will be converted to C ordering, which will cause a memory copy\n if the given data is not C-contiguous.\n\n n_clusters : int\n The number of clusters to form as well as the number of\n centroids to generate.\n\n sample_weight : array-like of shape (n_samples,), default=None\n The weights for each observation in `X`. If `None`, all observations\n are assigned equal weight.\n\n init : {'k-means++', 'random'}, callable or array-like of shape (n_clusters, n_features), default='k-means++'\n Method for initialization:\n\n - `'k-means++'` : selects initial cluster centers for k-mean\n clustering in a smart way to speed up convergence. See section\n Notes in k_init for more details.\n - `'random'`: choose `n_clusters` observations (rows) at random from data\n for the initial centroids.\n - If an array is passed, it should be of shape `(n_clusters, n_features)`\n and gives the initial centers.\n - If a callable is passed, it should take arguments `X`, `n_clusters` and a\n random state and return an initialization.\n\n n_init : int, default=10\n Number of time the k-means algorithm will be run with different\n centroid seeds. The final results will be the best output of\n `n_init` consecutive runs in terms of inertia.\n\n max_iter : int, default=300\n Maximum number of iterations of the k-means algorithm to run.\n\n verbose : bool, default=False\n Verbosity mode.\n\n tol : float, default=1e-4\n Relative tolerance with regards to Frobenius norm of the difference\n in the cluster centers of two consecutive iterations to declare\n convergence.\n\n random_state : int, RandomState instance or None, default=None\n Determines random number generation for centroid initialization. Use\n an int to make the randomness deterministic.\n See :term:`Glossary `.\n\n copy_x : bool, default=True\n When pre-computing distances it is more numerically accurate to center\n the data first. If `copy_x` is True (default), then the original data is\n not modified. If False, the original data is modified, and put back\n before the function returns, but small numerical differences may be\n introduced by subtracting and then adding the data mean. Note that if\n the original data is not C-contiguous, a copy will be made even if\n `copy_x` is False. If the original data is sparse, but not in CSR format,\n a copy will be made even if `copy_x` is False.\n\n algorithm : {\"auto\", \"full\", \"elkan\"}, default=\"auto\"\n K-means algorithm to use. The classical EM-style algorithm is `\"full\"`.\n The `\"elkan\"` variation is more efficient on data with well-defined\n clusters, by using the triangle inequality. 
However it's more memory\n intensive due to the allocation of an extra array of shape\n `(n_samples, n_clusters)`.\n\n For now `\"auto\"` (kept for backward compatibility) chooses `\"elkan\"` but it\n might change in the future for a better heuristic.\n\n return_n_iter : bool, default=False\n Whether or not to return the number of iterations.\n\n Returns\n -------\n centroid : ndarray of shape (n_clusters, n_features)\n Centroids found at the last iteration of k-means.\n\n label : ndarray of shape (n_samples,)\n The `label[i]` is the code or index of the centroid the\n i'th observation is closest to.\n\n inertia : float\n The final value of the inertia criterion (sum of squared distances to\n the closest centroid for all observations in the training set).\n\n best_n_iter : int\n Number of iterations corresponding to the best results.\n Returned only if `return_n_iter` is set to True.\n ", + "source_code": "\ndef k_means(X, n_clusters, *, sample_weight=None, init='k-means++', n_init=10, max_iter=300, verbose=False, tol=0.0001, random_state=None, copy_x=True, algorithm='auto', return_n_iter=False):\n \"\"\"Perform K-means clustering algorithm.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The observations to cluster. It must be noted that the data\n will be converted to C ordering, which will cause a memory copy\n if the given data is not C-contiguous.\n\n n_clusters : int\n The number of clusters to form as well as the number of\n centroids to generate.\n\n sample_weight : array-like of shape (n_samples,), default=None\n The weights for each observation in `X`. If `None`, all observations\n are assigned equal weight.\n\n init : {'k-means++', 'random'}, callable or array-like of shape (n_clusters, n_features), default='k-means++'\n Method for initialization:\n\n - `'k-means++'` : selects initial cluster centers for k-mean\n clustering in a smart way to speed up convergence. See section\n Notes in k_init for more details.\n - `'random'`: choose `n_clusters` observations (rows) at random from data\n for the initial centroids.\n - If an array is passed, it should be of shape `(n_clusters, n_features)`\n and gives the initial centers.\n - If a callable is passed, it should take arguments `X`, `n_clusters` and a\n random state and return an initialization.\n\n n_init : int, default=10\n Number of time the k-means algorithm will be run with different\n centroid seeds. The final results will be the best output of\n `n_init` consecutive runs in terms of inertia.\n\n max_iter : int, default=300\n Maximum number of iterations of the k-means algorithm to run.\n\n verbose : bool, default=False\n Verbosity mode.\n\n tol : float, default=1e-4\n Relative tolerance with regards to Frobenius norm of the difference\n in the cluster centers of two consecutive iterations to declare\n convergence.\n\n random_state : int, RandomState instance or None, default=None\n Determines random number generation for centroid initialization. Use\n an int to make the randomness deterministic.\n See :term:`Glossary `.\n\n copy_x : bool, default=True\n When pre-computing distances it is more numerically accurate to center\n the data first. If `copy_x` is True (default), then the original data is\n not modified. If False, the original data is modified, and put back\n before the function returns, but small numerical differences may be\n introduced by subtracting and then adding the data mean. 
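A minimal usage sketch of the k_means functional interface documented here; the small toy array is illustrative.

import numpy as np
from sklearn.cluster import k_means

X = np.array([[1.0, 2.0], [1.0, 4.0], [1.0, 0.0],
              [10.0, 2.0], [10.0, 4.0], [10.0, 0.0]])

# Returns the centroids, the label of each sample and the final inertia.
centroids, labels, inertia = k_means(X, n_clusters=2, random_state=0)

# With return_n_iter=True the iteration count of the best run is appended.
centroids, labels, inertia, n_iter = k_means(
    X, n_clusters=2, random_state=0, return_n_iter=True)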
Note that if\n the original data is not C-contiguous, a copy will be made even if\n `copy_x` is False. If the original data is sparse, but not in CSR format,\n a copy will be made even if `copy_x` is False.\n\n algorithm : {\"auto\", \"full\", \"elkan\"}, default=\"auto\"\n K-means algorithm to use. The classical EM-style algorithm is `\"full\"`.\n The `\"elkan\"` variation is more efficient on data with well-defined\n clusters, by using the triangle inequality. However it's more memory\n intensive due to the allocation of an extra array of shape\n `(n_samples, n_clusters)`.\n\n For now `\"auto\"` (kept for backward compatibility) chooses `\"elkan\"` but it\n might change in the future for a better heuristic.\n\n return_n_iter : bool, default=False\n Whether or not to return the number of iterations.\n\n Returns\n -------\n centroid : ndarray of shape (n_clusters, n_features)\n Centroids found at the last iteration of k-means.\n\n label : ndarray of shape (n_samples,)\n The `label[i]` is the code or index of the centroid the\n i'th observation is closest to.\n\n inertia : float\n The final value of the inertia criterion (sum of squared distances to\n the closest centroid for all observations in the training set).\n\n best_n_iter : int\n Number of iterations corresponding to the best results.\n Returned only if `return_n_iter` is set to True.\n \"\"\"\n est = KMeans(n_clusters=n_clusters, init=init, n_init=n_init, max_iter=max_iter, verbose=verbose, tol=tol, random_state=random_state, copy_x=copy_x, algorithm=algorithm).fit(X, sample_weight=sample_weight)\n if return_n_iter:\n return est.cluster_centers_, est.labels_, est.inertia_, est.n_iter_\n else:\n return est.cluster_centers_, est.labels_, est.inertia_" }, { "name": "kmeans_plusplus", @@ -35260,6 +36192,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "The data to pick seeds from." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -35270,7 +36206,8 @@ "docstring": { "type": "int", "description": "The number of centroids to initialize" - } + }, + "refined_type": {} }, { "name": "x_squared_norms", @@ -35280,7 +36217,8 @@ "docstring": { "type": "array-like of shape (n_samples,), default=None", "description": "Squared Euclidean norm of each data point." - } + }, + "refined_type": {} }, { "name": "random_state", @@ -35290,7 +36228,8 @@ "docstring": { "type": "int or RandomState instance, default=None", "description": "Determines random number generation for centroid initialization. Pass\nan int for reproducible output across multiple function calls.\nSee :term:`Glossary `." - } + }, + "refined_type": {} }, { "name": "n_local_trials", @@ -35300,13 +36239,14 @@ "docstring": { "type": "int, default=None", "description": "The number of seeding trials for each center (except the first),\nof which the one reducing inertia the most is greedily chosen.\nSet to None to make the number of trials depend logarithmically\non the number of seeds (2+log(k))." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Init n_clusters seeds according to k-means++\n\n.. versionadded:: 0.24", - "docstring": "Init n_clusters seeds according to k-means++\n\n.. 
versionadded:: 0.24\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n The data to pick seeds from.\n\nn_clusters : int\n The number of centroids to initialize\n\nx_squared_norms : array-like of shape (n_samples,), default=None\n Squared Euclidean norm of each data point.\n\nrandom_state : int or RandomState instance, default=None\n Determines random number generation for centroid initialization. Pass\n an int for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\nn_local_trials : int, default=None\n The number of seeding trials for each center (except the first),\n of which the one reducing inertia the most is greedily chosen.\n Set to None to make the number of trials depend logarithmically\n on the number of seeds (2+log(k)).\n\nReturns\n-------\ncenters : ndarray of shape (n_clusters, n_features)\n The initial centers for k-means.\n\nindices : ndarray of shape (n_clusters,)\n The index location of the chosen centers in the data array X. For a\n given index and center, X[index] = center.\n\nNotes\n-----\nSelects initial cluster centers for k-mean clustering in a smart way\nto speed up convergence. see: Arthur, D. and Vassilvitskii, S.\n\"k-means++: the advantages of careful seeding\". ACM-SIAM symposium\non Discrete algorithms. 2007\n\nExamples\n--------\n\n>>> from sklearn.cluster import kmeans_plusplus\n>>> import numpy as np\n>>> X = np.array([[1, 2], [1, 4], [1, 0],\n... [10, 2], [10, 4], [10, 0]])\n>>> centers, indices = kmeans_plusplus(X, n_clusters=2, random_state=0)\n>>> centers\narray([[10, 4],\n [ 1, 0]])\n>>> indices\narray([4, 2])", + "docstring": "Init n_clusters seeds according to k-means++\n\n .. versionadded:: 0.24\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The data to pick seeds from.\n\n n_clusters : int\n The number of centroids to initialize\n\n x_squared_norms : array-like of shape (n_samples,), default=None\n Squared Euclidean norm of each data point.\n\n random_state : int or RandomState instance, default=None\n Determines random number generation for centroid initialization. Pass\n an int for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n n_local_trials : int, default=None\n The number of seeding trials for each center (except the first),\n of which the one reducing inertia the most is greedily chosen.\n Set to None to make the number of trials depend logarithmically\n on the number of seeds (2+log(k)).\n\n Returns\n -------\n centers : ndarray of shape (n_clusters, n_features)\n The initial centers for k-means.\n\n indices : ndarray of shape (n_clusters,)\n The index location of the chosen centers in the data array X. For a\n given index and center, X[index] = center.\n\n Notes\n -----\n Selects initial cluster centers for k-mean clustering in a smart way\n to speed up convergence. see: Arthur, D. and Vassilvitskii, S.\n \"k-means++: the advantages of careful seeding\". ACM-SIAM symposium\n on Discrete algorithms. 2007\n\n Examples\n --------\n\n >>> from sklearn.cluster import kmeans_plusplus\n >>> import numpy as np\n >>> X = np.array([[1, 2], [1, 4], [1, 0],\n ... 
[10, 2], [10, 4], [10, 0]])\n >>> centers, indices = kmeans_plusplus(X, n_clusters=2, random_state=0)\n >>> centers\n array([[10, 4],\n [ 1, 0]])\n >>> indices\n array([4, 2])\n ", "source_code": "\ndef kmeans_plusplus(X, n_clusters, *, x_squared_norms=None, random_state=None, n_local_trials=None):\n \"\"\"Init n_clusters seeds according to k-means++\n\n .. versionadded:: 0.24\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The data to pick seeds from.\n\n n_clusters : int\n The number of centroids to initialize\n\n x_squared_norms : array-like of shape (n_samples,), default=None\n Squared Euclidean norm of each data point.\n\n random_state : int or RandomState instance, default=None\n Determines random number generation for centroid initialization. Pass\n an int for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n n_local_trials : int, default=None\n The number of seeding trials for each center (except the first),\n of which the one reducing inertia the most is greedily chosen.\n Set to None to make the number of trials depend logarithmically\n on the number of seeds (2+log(k)).\n\n Returns\n -------\n centers : ndarray of shape (n_clusters, n_features)\n The initial centers for k-means.\n\n indices : ndarray of shape (n_clusters,)\n The index location of the chosen centers in the data array X. For a\n given index and center, X[index] = center.\n\n Notes\n -----\n Selects initial cluster centers for k-mean clustering in a smart way\n to speed up convergence. see: Arthur, D. and Vassilvitskii, S.\n \"k-means++: the advantages of careful seeding\". ACM-SIAM symposium\n on Discrete algorithms. 2007\n\n Examples\n --------\n\n >>> from sklearn.cluster import kmeans_plusplus\n >>> import numpy as np\n >>> X = np.array([[1, 2], [1, 4], [1, 0],\n ... [10, 2], [10, 4], [10, 0]])\n >>> centers, indices = kmeans_plusplus(X, n_clusters=2, random_state=0)\n >>> centers\n array([[10, 4],\n [ 1, 0]])\n >>> indices\n array([4, 2])\n \"\"\"\n check_array(X, accept_sparse='csr', dtype=[np.float64, np.float32])\n if X.shape[0] < n_clusters:\n raise ValueError(f'n_samples={X.shape[0]} should be >= n_clusters={n_clusters}.')\n if x_squared_norms is None:\n x_squared_norms = row_norms(X, squared=True)\n else:\n x_squared_norms = check_array(x_squared_norms, dtype=X.dtype, ensure_2d=False)\n if x_squared_norms.shape[0] != X.shape[0]:\n raise ValueError(f'The length of x_squared_norms {x_squared_norms.shape[0]} should be equal to the length of n_samples {X.shape[0]}.')\n if n_local_trials is not None and n_local_trials < 1:\n raise ValueError(f'n_local_trials is set to {n_local_trials} but should be an integer value greater than zero.')\n random_state = check_random_state(random_state)\n (centers, indices) = _kmeans_plusplus(X, n_clusters, x_squared_norms, random_state, n_local_trials)\n return centers, indices" }, { @@ -35324,7 +36264,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "bandwidth", @@ -35334,7 +36275,8 @@ "docstring": { "type": "float, default=None", "description": "Bandwidth used in the RBF kernel.\n\nIf not given, the bandwidth is estimated using\nsklearn.cluster.estimate_bandwidth; see the documentation for that\nfunction for hints on scalability (see also the Notes, below)." 
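Beyond the doctest shown in the docstring, one common pattern is to hand the seeds returned by kmeans_plusplus to KMeans as an explicit array initialization; a short sketch with illustrative data:

import numpy as np
from sklearn.cluster import KMeans, kmeans_plusplus

X = np.array([[1, 2], [1, 4], [1, 0],
              [10, 2], [10, 4], [10, 0]], dtype=float)

centers, indices = kmeans_plusplus(X, n_clusters=2, random_state=0)

# Reuse the k-means++ seeds as a fixed array initialization;
# n_init=1 since there is nothing left to randomize in the init step.
km = KMeans(n_clusters=2, init=centers, n_init=1, random_state=0).fit(X)
print(km.cluster_centers_)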
- } + }, + "refined_type": {} }, { "name": "seeds", @@ -35344,7 +36286,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features), default=None", "description": "Seeds used to initialize kernels. If not set,\nthe seeds are calculated by clustering.get_bin_seeds\nwith bandwidth as the grid size and default values for\nother parameters." - } + }, + "refined_type": {} }, { "name": "bin_seeding", @@ -35354,7 +36297,8 @@ "docstring": { "type": "bool, default=False", "description": "If true, initial kernel locations are not locations of all\npoints, but rather the location of the discretized version of\npoints, where points are binned onto a grid whose coarseness\ncorresponds to the bandwidth. Setting this option to True will speed\nup the algorithm because fewer seeds will be initialized.\nThe default value is False.\nIgnored if seeds argument is not None." - } + }, + "refined_type": {} }, { "name": "min_bin_freq", @@ -35364,7 +36308,8 @@ "docstring": { "type": "int, default=1", "description": "To speed up the algorithm, accept only those bins with at least\nmin_bin_freq points as seeds." - } + }, + "refined_type": {} }, { "name": "cluster_all", @@ -35374,7 +36319,8 @@ "docstring": { "type": "bool, default=True", "description": "If true, then all points are clustered, even those orphans that are\nnot within any kernel. Orphans are assigned to the nearest kernel.\nIf false, then orphans are given cluster label -1." - } + }, + "refined_type": {} }, { "name": "n_jobs", @@ -35384,7 +36330,8 @@ "docstring": { "type": "int, default=None", "description": "The number of jobs to use for the computation. This works by computing\neach of the n_init runs in parallel.\n\n``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n``-1`` means using all processors. See :term:`Glossary `\nfor more details." - } + }, + "refined_type": {} }, { "name": "max_iter", @@ -35394,13 +36341,14 @@ "docstring": { "type": "int, default=300", "description": "Maximum number of iterations, per seed point before the clustering\noperation terminates (for that seed point), if has not converged yet.\n\n.. versionadded:: 0.22" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, *, bandwidth=None, seeds=None, bin_seeding=False, min_bin_freq=1, cluster_all=True, n_jobs=None, max_iter=300):\n self.bandwidth = bandwidth\n self.seeds = seeds\n self.bin_seeding = bin_seeding\n self.cluster_all = cluster_all\n self.min_bin_freq = min_bin_freq\n self.n_jobs = n_jobs\n self.max_iter = max_iter" }, { @@ -35418,7 +36366,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -35428,7 +36377,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "Samples to cluster." - } + }, + "refined_type": {} }, { "name": "y", @@ -35438,13 +36388,14 @@ "docstring": { "type": "Ignored", "description": "Not used, present for API consistency by convention." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Perform clustering.", - "docstring": "Perform clustering.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n Samples to cluster.\n\ny : Ignored\n Not used, present for API consistency by convention.\n\nReturns\n-------\nself : object\n Fitted instance.", + "docstring": "Perform clustering.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Samples to cluster.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n Returns\n -------\n self : object\n Fitted instance.\n ", "source_code": "\ndef fit(self, X, y=None):\n \"\"\"Perform clustering.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Samples to cluster.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n Returns\n -------\n self : object\n Fitted instance.\n \"\"\"\n X = self._validate_data(X)\n bandwidth = self.bandwidth\n if bandwidth is None:\n bandwidth = estimate_bandwidth(X, n_jobs=self.n_jobs)\n elif bandwidth <= 0:\n raise ValueError('bandwidth needs to be greater than zero or None, got %f' % bandwidth)\n seeds = self.seeds\n if seeds is None:\n if self.bin_seeding:\n seeds = get_bin_seeds(X, bandwidth, self.min_bin_freq)\n else:\n seeds = X\n (n_samples, n_features) = X.shape\n center_intensity_dict = {}\n nbrs = NearestNeighbors(radius=bandwidth, n_jobs=1).fit(X)\n all_res = Parallel(n_jobs=self.n_jobs)((delayed(_mean_shift_single_seed)(seed, X, nbrs, self.max_iter) for seed in seeds))\n for i in range(len(seeds)):\n if all_res[i][1]:\n center_intensity_dict[all_res[i][0]] = all_res[i][1]\n self.n_iter_ = max([x[2] for x in all_res])\n if not center_intensity_dict:\n raise ValueError('No point was within bandwidth=%f of any seed. Try a different seeding strategy or increase the bandwidth.' % bandwidth)\n sorted_by_intensity = sorted(center_intensity_dict.items(), key=lambda tup: (tup[1], tup[0]), reverse=True)\n sorted_centers = np.array([tup[0] for tup in sorted_by_intensity])\n unique = np.ones(len(sorted_centers), dtype=bool)\n nbrs = NearestNeighbors(radius=bandwidth, n_jobs=self.n_jobs).fit(sorted_centers)\n for (i, center) in enumerate(sorted_centers):\n if unique[i]:\n neighbor_idxs = nbrs.radius_neighbors([center], return_distance=False)[0]\n unique[neighbor_idxs] = 0\n unique[i] = 1\n cluster_centers = sorted_centers[unique]\n nbrs = NearestNeighbors(n_neighbors=1, n_jobs=self.n_jobs).fit(cluster_centers)\n labels = np.zeros(n_samples, dtype=int)\n (distances, idxs) = nbrs.kneighbors(X)\n if self.cluster_all:\n labels = idxs.flatten()\n else:\n labels.fill(-1)\n bool_selector = distances.flatten() <= bandwidth\n labels[bool_selector] = idxs.flatten()[bool_selector]\n (self.cluster_centers_, self.labels_) = (cluster_centers, labels)\n return self" }, { @@ -35462,7 +36413,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -35472,13 +36424,14 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "New data to predict." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Predict the closest cluster each sample in X belongs to.", - "docstring": "Predict the closest cluster each sample in X belongs to.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n New data to predict.\n\nReturns\n-------\nlabels : ndarray of shape (n_samples,)\n Index of the cluster each sample belongs to.", + "docstring": "Predict the closest cluster each sample in X belongs to.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n New data to predict.\n\n Returns\n -------\n labels : ndarray of shape (n_samples,)\n Index of the cluster each sample belongs to.\n ", "source_code": "\ndef predict(self, X):\n \"\"\"Predict the closest cluster each sample in X belongs to.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n New data to predict.\n\n Returns\n -------\n labels : ndarray of shape (n_samples,)\n Index of the cluster each sample belongs to.\n \"\"\"\n check_is_fitted(self)\n X = self._validate_data(X, reset=False)\n with config_context(assume_finite=True):\n return pairwise_distances_argmin(X, self.cluster_centers_)" }, { @@ -35496,7 +36449,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -35506,7 +36460,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "nbrs", @@ -35516,7 +36471,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "max_iter", @@ -35526,13 +36482,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _mean_shift_single_seed(my_mean, X, nbrs, max_iter):\n bandwidth = nbrs.get_params()['radius']\n stop_thresh = 0.001 * bandwidth\n completed_iterations = 0\n while True:\n i_nbrs = nbrs.radius_neighbors([my_mean], bandwidth, return_distance=False)[0]\n points_within = X[i_nbrs]\n if len(points_within) == 0:\n break\n my_old_mean = my_mean\n my_mean = np.mean(points_within, axis=0)\n if np.linalg.norm(my_mean - my_old_mean) < stop_thresh or completed_iterations == max_iter:\n break\n completed_iterations += 1\n return tuple(my_mean), len(points_within), completed_iterations" }, { @@ -35550,7 +36507,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "Input points." - } + }, + "refined_type": {} }, { "name": "quantile", @@ -35559,8 +36517,9 @@ "assigned_by": "NAME_ONLY", "docstring": { "type": "float, default=0.3", - "description": "should be between [0, 1]\n0.5 means that the median of all pairwise distances is used." - } + "description": "Should be between [0, 1]\n0.5 means that the median of all pairwise distances is used." + }, + "refined_type": {} }, { "name": "n_samples", @@ -35570,7 +36529,8 @@ "docstring": { "type": "int, default=None", "description": "The number of samples to use. If not given, all samples are used." - } + }, + "refined_type": {} }, { "name": "random_state", @@ -35580,7 +36540,8 @@ "docstring": { "type": "int, RandomState instance, default=None", "description": "The generator used to randomly select the samples from input points\nfor bandwidth estimation. Use an int to make the randomness\ndeterministic.\nSee :term:`Glossary `." 
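A small end-to-end sketch of the MeanShift fit/predict cycle documented here, combined with estimate_bandwidth; the data and parameter choices are illustrative.

import numpy as np
from sklearn.cluster import MeanShift, estimate_bandwidth

rng = np.random.RandomState(42)
X = np.vstack([rng.normal(0.0, 1.0, size=(150, 2)),
               rng.normal(6.0, 1.0, size=(150, 2))])

# Estimate the kernel bandwidth from a subsample, then cluster.
bw = estimate_bandwidth(X, quantile=0.2, n_samples=100, random_state=42)
ms = MeanShift(bandwidth=bw, bin_seeding=True).fit(X)

print(ms.cluster_centers_)
print(ms.predict(np.array([[0.5, 0.5], [6.2, 5.8]])))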
- } + }, + "refined_type": {} }, { "name": "n_jobs", @@ -35590,14 +36551,15 @@ "docstring": { "type": "int, default=None", "description": "The number of parallel jobs to run for neighbors search.\n``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n``-1`` means using all processors. See :term:`Glossary `\nfor more details." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Estimate the bandwidth to use with the mean-shift algorithm.\n\nThat this function takes time at least quadratic in n_samples. For large datasets, it's wise to set that parameter to a small value.", - "docstring": "Estimate the bandwidth to use with the mean-shift algorithm.\n\nThat this function takes time at least quadratic in n_samples. For large\ndatasets, it's wise to set that parameter to a small value.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n Input points.\n\nquantile : float, default=0.3\n should be between [0, 1]\n 0.5 means that the median of all pairwise distances is used.\n\nn_samples : int, default=None\n The number of samples to use. If not given, all samples are used.\n\nrandom_state : int, RandomState instance, default=None\n The generator used to randomly select the samples from input points\n for bandwidth estimation. Use an int to make the randomness\n deterministic.\n See :term:`Glossary `.\n\nn_jobs : int, default=None\n The number of parallel jobs to run for neighbors search.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\nReturns\n-------\nbandwidth : float\n The bandwidth parameter.", - "source_code": "\ndef estimate_bandwidth(X, *, quantile=0.3, n_samples=None, random_state=0, n_jobs=None):\n \"\"\"Estimate the bandwidth to use with the mean-shift algorithm.\n\n That this function takes time at least quadratic in n_samples. For large\n datasets, it's wise to set that parameter to a small value.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Input points.\n\n quantile : float, default=0.3\n should be between [0, 1]\n 0.5 means that the median of all pairwise distances is used.\n\n n_samples : int, default=None\n The number of samples to use. If not given, all samples are used.\n\n random_state : int, RandomState instance, default=None\n The generator used to randomly select the samples from input points\n for bandwidth estimation. Use an int to make the randomness\n deterministic.\n See :term:`Glossary `.\n\n n_jobs : int, default=None\n The number of parallel jobs to run for neighbors search.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. 
See :term:`Glossary `\n for more details.\n\n Returns\n -------\n bandwidth : float\n The bandwidth parameter.\n \"\"\"\n X = check_array(X)\n random_state = check_random_state(random_state)\n if n_samples is not None:\n idx = random_state.permutation(X.shape[0])[:n_samples]\n X = X[idx]\n n_neighbors = int(X.shape[0] * quantile)\n if n_neighbors < 1:\n n_neighbors = 1\n nbrs = NearestNeighbors(n_neighbors=n_neighbors, n_jobs=n_jobs)\n nbrs.fit(X)\n bandwidth = 0.0\n for batch in gen_batches(len(X), 500):\n (d, _) = nbrs.kneighbors(X[batch, :], return_distance=True)\n bandwidth += np.max(d, axis=1).sum()\n return bandwidth / X.shape[0]" + "description": "Estimate the bandwidth to use with the mean-shift algorithm.\n\nThat this function takes time at least quadratic in n_samples. For large\ndatasets, it's wise to set that parameter to a small value.", + "docstring": "Estimate the bandwidth to use with the mean-shift algorithm.\n\n That this function takes time at least quadratic in n_samples. For large\n datasets, it's wise to set that parameter to a small value.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Input points.\n\n quantile : float, default=0.3\n Should be between [0, 1]\n 0.5 means that the median of all pairwise distances is used.\n\n n_samples : int, default=None\n The number of samples to use. If not given, all samples are used.\n\n random_state : int, RandomState instance, default=None\n The generator used to randomly select the samples from input points\n for bandwidth estimation. Use an int to make the randomness\n deterministic.\n See :term:`Glossary `.\n\n n_jobs : int, default=None\n The number of parallel jobs to run for neighbors search.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n Returns\n -------\n bandwidth : float\n The bandwidth parameter.\n ", + "source_code": "\ndef estimate_bandwidth(X, *, quantile=0.3, n_samples=None, random_state=0, n_jobs=None):\n \"\"\"Estimate the bandwidth to use with the mean-shift algorithm.\n\n That this function takes time at least quadratic in n_samples. For large\n datasets, it's wise to set that parameter to a small value.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Input points.\n\n quantile : float, default=0.3\n Should be between [0, 1]\n 0.5 means that the median of all pairwise distances is used.\n\n n_samples : int, default=None\n The number of samples to use. If not given, all samples are used.\n\n random_state : int, RandomState instance, default=None\n The generator used to randomly select the samples from input points\n for bandwidth estimation. Use an int to make the randomness\n deterministic.\n See :term:`Glossary `.\n\n n_jobs : int, default=None\n The number of parallel jobs to run for neighbors search.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. 
See :term:`Glossary `\n for more details.\n\n Returns\n -------\n bandwidth : float\n The bandwidth parameter.\n \"\"\"\n X = check_array(X)\n random_state = check_random_state(random_state)\n if n_samples is not None:\n idx = random_state.permutation(X.shape[0])[:n_samples]\n X = X[idx]\n n_neighbors = int(X.shape[0] * quantile)\n if n_neighbors < 1:\n n_neighbors = 1\n nbrs = NearestNeighbors(n_neighbors=n_neighbors, n_jobs=n_jobs)\n nbrs.fit(X)\n bandwidth = 0.0\n for batch in gen_batches(len(X), 500):\n (d, _) = nbrs.kneighbors(X[batch, :], return_distance=True)\n bandwidth += np.max(d, axis=1).sum()\n return bandwidth / X.shape[0]" }, { "name": "get_bin_seeds", @@ -35614,7 +36576,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "Input points, the same points that will be used in mean_shift." - } + }, + "refined_type": {} }, { "name": "bin_size", @@ -35624,7 +36587,8 @@ "docstring": { "type": "float", "description": "Controls the coarseness of the binning. Smaller values lead\nto more seeding (which is computationally more expensive). If you're\nnot sure how to set this, set it to the value of the bandwidth used\nin clustering.mean_shift." - } + }, + "refined_type": {} }, { "name": "min_bin_freq", @@ -35634,14 +36598,15 @@ "docstring": { "type": "int, default=1", "description": "Only bins with at least min_bin_freq will be selected as seeds.\nRaising this value decreases the number of seeds found, which\nmakes mean_shift computationally cheaper." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Finds seeds for mean_shift.\n\nFinds seeds by first binning data onto a grid whose lines are spaced bin_size apart, and then choosing those bins with at least min_bin_freq points.", - "docstring": "Finds seeds for mean_shift.\n\nFinds seeds by first binning data onto a grid whose lines are\nspaced bin_size apart, and then choosing those bins with at least\nmin_bin_freq points.\n\nParameters\n----------\n\nX : array-like of shape (n_samples, n_features)\n Input points, the same points that will be used in mean_shift.\n\nbin_size : float\n Controls the coarseness of the binning. Smaller values lead\n to more seeding (which is computationally more expensive). If you're\n not sure how to set this, set it to the value of the bandwidth used\n in clustering.mean_shift.\n\nmin_bin_freq : int, default=1\n Only bins with at least min_bin_freq will be selected as seeds.\n Raising this value decreases the number of seeds found, which\n makes mean_shift computationally cheaper.\n\nReturns\n-------\nbin_seeds : array-like of shape (n_samples, n_features)\n Points used as initial kernel positions in clustering.mean_shift.", - "source_code": "\ndef get_bin_seeds(X, bin_size, min_bin_freq=1):\n \"\"\"Finds seeds for mean_shift.\n\n Finds seeds by first binning data onto a grid whose lines are\n spaced bin_size apart, and then choosing those bins with at least\n min_bin_freq points.\n\n Parameters\n ----------\n\n X : array-like of shape (n_samples, n_features)\n Input points, the same points that will be used in mean_shift.\n\n bin_size : float\n Controls the coarseness of the binning. Smaller values lead\n to more seeding (which is computationally more expensive). 
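The binning behaviour of get_bin_seeds can be seen on a tiny illustrative example: points are rounded onto a grid of spacing bin_size, and only bins holding at least min_bin_freq points survive as seeds.

import numpy as np
from sklearn.cluster import get_bin_seeds

X = np.array([[1.0, 1.0], [1.1, 1.1], [1.2, 1.2],   # three points in one bin
              [5.0, 5.0], [5.1, 5.1],                # two points in another
              [9.0, 0.0]])                           # isolated point

# With bin_size=1.0 and min_bin_freq=2 the isolated point's bin is dropped,
# leaving two seeds at roughly [1, 1] and [5, 5].
seeds = get_bin_seeds(X, bin_size=1.0, min_bin_freq=2)
print(seeds)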
If you're\n not sure how to set this, set it to the value of the bandwidth used\n in clustering.mean_shift.\n\n min_bin_freq : int, default=1\n Only bins with at least min_bin_freq will be selected as seeds.\n Raising this value decreases the number of seeds found, which\n makes mean_shift computationally cheaper.\n\n Returns\n -------\n bin_seeds : array-like of shape (n_samples, n_features)\n Points used as initial kernel positions in clustering.mean_shift.\n \"\"\"\n if bin_size == 0:\n return X\n bin_sizes = defaultdict(int)\n for point in X:\n binned_point = np.round(point / bin_size)\n bin_sizes[tuple(binned_point)] += 1\n bin_seeds = np.array([point for (point, freq) in bin_sizes.items() if freq >= min_bin_freq], dtype=np.float32)\n if len(bin_seeds) == len(X):\n warnings.warn('Binning data failed with provided bin_size=%f, using data points as seeds.' % bin_size)\n return X\n bin_seeds = bin_seeds * bin_size\n return bin_seeds" + "description": "Find seeds for mean_shift.\n\nFinds seeds by first binning data onto a grid whose lines are\nspaced bin_size apart, and then choosing those bins with at least\nmin_bin_freq points.", + "docstring": "Find seeds for mean_shift.\n\n Finds seeds by first binning data onto a grid whose lines are\n spaced bin_size apart, and then choosing those bins with at least\n min_bin_freq points.\n\n Parameters\n ----------\n\n X : array-like of shape (n_samples, n_features)\n Input points, the same points that will be used in mean_shift.\n\n bin_size : float\n Controls the coarseness of the binning. Smaller values lead\n to more seeding (which is computationally more expensive). If you're\n not sure how to set this, set it to the value of the bandwidth used\n in clustering.mean_shift.\n\n min_bin_freq : int, default=1\n Only bins with at least min_bin_freq will be selected as seeds.\n Raising this value decreases the number of seeds found, which\n makes mean_shift computationally cheaper.\n\n Returns\n -------\n bin_seeds : array-like of shape (n_samples, n_features)\n Points used as initial kernel positions in clustering.mean_shift.\n ", + "source_code": "\ndef get_bin_seeds(X, bin_size, min_bin_freq=1):\n \"\"\"Find seeds for mean_shift.\n\n Finds seeds by first binning data onto a grid whose lines are\n spaced bin_size apart, and then choosing those bins with at least\n min_bin_freq points.\n\n Parameters\n ----------\n\n X : array-like of shape (n_samples, n_features)\n Input points, the same points that will be used in mean_shift.\n\n bin_size : float\n Controls the coarseness of the binning. Smaller values lead\n to more seeding (which is computationally more expensive). If you're\n not sure how to set this, set it to the value of the bandwidth used\n in clustering.mean_shift.\n\n min_bin_freq : int, default=1\n Only bins with at least min_bin_freq will be selected as seeds.\n Raising this value decreases the number of seeds found, which\n makes mean_shift computationally cheaper.\n\n Returns\n -------\n bin_seeds : array-like of shape (n_samples, n_features)\n Points used as initial kernel positions in clustering.mean_shift.\n \"\"\"\n if bin_size == 0:\n return X\n bin_sizes = defaultdict(int)\n for point in X:\n binned_point = np.round(point / bin_size)\n bin_sizes[tuple(binned_point)] += 1\n bin_seeds = np.array([point for (point, freq) in bin_sizes.items() if freq >= min_bin_freq], dtype=np.float32)\n if len(bin_seeds) == len(X):\n warnings.warn('Binning data failed with provided bin_size=%f, using data points as seeds.' 
% bin_size)\n return X\n bin_seeds = bin_seeds * bin_size\n return bin_seeds" }, { "name": "mean_shift", @@ -35658,7 +36623,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "Input data." - } + }, + "refined_type": {} }, { "name": "bandwidth", @@ -35668,7 +36634,8 @@ "docstring": { "type": "float, default=None", "description": "Kernel bandwidth.\n\nIf bandwidth is not given, it is determined using a heuristic based on\nthe median of all pairwise distances. This will take quadratic time in\nthe number of samples. The sklearn.cluster.estimate_bandwidth function\ncan be used to do this more efficiently." - } + }, + "refined_type": {} }, { "name": "seeds", @@ -35678,7 +36645,8 @@ "docstring": { "type": "array-like of shape (n_seeds, n_features) or None", "description": "Point used as initial kernel locations. If None and bin_seeding=False,\neach data point is used as a seed. If None and bin_seeding=True,\nsee bin_seeding." - } + }, + "refined_type": {} }, { "name": "bin_seeding", @@ -35688,7 +36656,8 @@ "docstring": { "type": "bool, default=False", "description": "If true, initial kernel locations are not locations of all\npoints, but rather the location of the discretized version of\npoints, where points are binned onto a grid whose coarseness\ncorresponds to the bandwidth. Setting this option to True will speed\nup the algorithm because fewer seeds will be initialized.\nIgnored if seeds argument is not None." - } + }, + "refined_type": {} }, { "name": "min_bin_freq", @@ -35698,7 +36667,8 @@ "docstring": { "type": "int, default=1", "description": "To speed up the algorithm, accept only those bins with at least\nmin_bin_freq points as seeds." - } + }, + "refined_type": {} }, { "name": "cluster_all", @@ -35708,7 +36678,8 @@ "docstring": { "type": "bool, default=True", "description": "If true, then all points are clustered, even those orphans that are\nnot within any kernel. Orphans are assigned to the nearest kernel.\nIf false, then orphans are given cluster label -1." - } + }, + "refined_type": {} }, { "name": "max_iter", @@ -35718,7 +36689,8 @@ "docstring": { "type": "int, default=300", "description": "Maximum number of iterations, per seed point before the clustering\noperation terminates (for that seed point), if has not converged yet." - } + }, + "refined_type": {} }, { "name": "n_jobs", @@ -35728,14 +36700,15 @@ "docstring": { "type": "int, default=None", "description": "The number of jobs to use for the computation. This works by computing\neach of the n_init runs in parallel.\n\n``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n``-1`` means using all processors. See :term:`Glossary `\nfor more details.\n\n.. versionadded:: 0.17\n Parallel Execution using *n_jobs*." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Perform mean shift clustering of data using a flat kernel.\n\nRead more in the :ref:`User Guide `.", - "docstring": "Perform mean shift clustering of data using a flat kernel.\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\n\nX : array-like of shape (n_samples, n_features)\n Input data.\n\nbandwidth : float, default=None\n Kernel bandwidth.\n\n If bandwidth is not given, it is determined using a heuristic based on\n the median of all pairwise distances. This will take quadratic time in\n the number of samples. 
The sklearn.cluster.estimate_bandwidth function\n can be used to do this more efficiently.\n\nseeds : array-like of shape (n_seeds, n_features) or None\n Point used as initial kernel locations. If None and bin_seeding=False,\n each data point is used as a seed. If None and bin_seeding=True,\n see bin_seeding.\n\nbin_seeding : bool, default=False\n If true, initial kernel locations are not locations of all\n points, but rather the location of the discretized version of\n points, where points are binned onto a grid whose coarseness\n corresponds to the bandwidth. Setting this option to True will speed\n up the algorithm because fewer seeds will be initialized.\n Ignored if seeds argument is not None.\n\nmin_bin_freq : int, default=1\n To speed up the algorithm, accept only those bins with at least\n min_bin_freq points as seeds.\n\ncluster_all : bool, default=True\n If true, then all points are clustered, even those orphans that are\n not within any kernel. Orphans are assigned to the nearest kernel.\n If false, then orphans are given cluster label -1.\n\nmax_iter : int, default=300\n Maximum number of iterations, per seed point before the clustering\n operation terminates (for that seed point), if has not converged yet.\n\nn_jobs : int, default=None\n The number of jobs to use for the computation. This works by computing\n each of the n_init runs in parallel.\n\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n .. versionadded:: 0.17\n Parallel Execution using *n_jobs*.\n\nReturns\n-------\n\ncluster_centers : ndarray of shape (n_clusters, n_features)\n Coordinates of cluster centers.\n\nlabels : ndarray of shape (n_samples,)\n Cluster labels for each point.\n\nNotes\n-----\nFor an example, see :ref:`examples/cluster/plot_mean_shift.py\n`.", - "source_code": "\ndef mean_shift(X, *, bandwidth=None, seeds=None, bin_seeding=False, min_bin_freq=1, cluster_all=True, max_iter=300, n_jobs=None):\n \"\"\"Perform mean shift clustering of data using a flat kernel.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n\n X : array-like of shape (n_samples, n_features)\n Input data.\n\n bandwidth : float, default=None\n Kernel bandwidth.\n\n If bandwidth is not given, it is determined using a heuristic based on\n the median of all pairwise distances. This will take quadratic time in\n the number of samples. The sklearn.cluster.estimate_bandwidth function\n can be used to do this more efficiently.\n\n seeds : array-like of shape (n_seeds, n_features) or None\n Point used as initial kernel locations. If None and bin_seeding=False,\n each data point is used as a seed. If None and bin_seeding=True,\n see bin_seeding.\n\n bin_seeding : bool, default=False\n If true, initial kernel locations are not locations of all\n points, but rather the location of the discretized version of\n points, where points are binned onto a grid whose coarseness\n corresponds to the bandwidth. Setting this option to True will speed\n up the algorithm because fewer seeds will be initialized.\n Ignored if seeds argument is not None.\n\n min_bin_freq : int, default=1\n To speed up the algorithm, accept only those bins with at least\n min_bin_freq points as seeds.\n\n cluster_all : bool, default=True\n If true, then all points are clustered, even those orphans that are\n not within any kernel. 
Orphans are assigned to the nearest kernel.\n If false, then orphans are given cluster label -1.\n\n max_iter : int, default=300\n Maximum number of iterations, per seed point before the clustering\n operation terminates (for that seed point), if has not converged yet.\n\n n_jobs : int, default=None\n The number of jobs to use for the computation. This works by computing\n each of the n_init runs in parallel.\n\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n .. versionadded:: 0.17\n Parallel Execution using *n_jobs*.\n\n Returns\n -------\n\n cluster_centers : ndarray of shape (n_clusters, n_features)\n Coordinates of cluster centers.\n\n labels : ndarray of shape (n_samples,)\n Cluster labels for each point.\n\n Notes\n -----\n For an example, see :ref:`examples/cluster/plot_mean_shift.py\n `.\n\n \"\"\"\n model = MeanShift(bandwidth=bandwidth, seeds=seeds, min_bin_freq=min_bin_freq, bin_seeding=bin_seeding, cluster_all=cluster_all, n_jobs=n_jobs, max_iter=max_iter).fit(X)\n return model.cluster_centers_, model.labels_" + "docstring": "Perform mean shift clustering of data using a flat kernel.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n\n X : array-like of shape (n_samples, n_features)\n Input data.\n\n bandwidth : float, default=None\n Kernel bandwidth.\n\n If bandwidth is not given, it is determined using a heuristic based on\n the median of all pairwise distances. This will take quadratic time in\n the number of samples. The sklearn.cluster.estimate_bandwidth function\n can be used to do this more efficiently.\n\n seeds : array-like of shape (n_seeds, n_features) or None\n Point used as initial kernel locations. If None and bin_seeding=False,\n each data point is used as a seed. If None and bin_seeding=True,\n see bin_seeding.\n\n bin_seeding : bool, default=False\n If true, initial kernel locations are not locations of all\n points, but rather the location of the discretized version of\n points, where points are binned onto a grid whose coarseness\n corresponds to the bandwidth. Setting this option to True will speed\n up the algorithm because fewer seeds will be initialized.\n Ignored if seeds argument is not None.\n\n min_bin_freq : int, default=1\n To speed up the algorithm, accept only those bins with at least\n min_bin_freq points as seeds.\n\n cluster_all : bool, default=True\n If true, then all points are clustered, even those orphans that are\n not within any kernel. Orphans are assigned to the nearest kernel.\n If false, then orphans are given cluster label -1.\n\n max_iter : int, default=300\n Maximum number of iterations, per seed point before the clustering\n operation terminates (for that seed point), if has not converged yet.\n\n n_jobs : int, default=None\n The number of jobs to use for the computation. This works by computing\n each of the n_init runs in parallel.\n\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n .. 
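For completeness, a minimal sketch of the mean_shift functional interface on assumed toy data; when bandwidth is omitted it is estimated internally via estimate_bandwidth.

import numpy as np
from sklearn.cluster import mean_shift

rng = np.random.RandomState(0)
X = np.vstack([rng.normal(-3.0, 0.5, size=(80, 2)),
               rng.normal(3.0, 0.5, size=(80, 2))])

# bin_seeding=True speeds up seeding by starting from binned points.
cluster_centers, labels = mean_shift(X, bin_seeding=True)
print(cluster_centers.shape, np.unique(labels))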
versionadded:: 0.17\n Parallel Execution using *n_jobs*.\n\n Returns\n -------\n\n cluster_centers : ndarray of shape (n_clusters, n_features)\n Coordinates of cluster centers.\n\n labels : ndarray of shape (n_samples,)\n Cluster labels for each point.\n\n Notes\n -----\n For an example, see :ref:`examples/cluster/plot_mean_shift.py\n `.\n ", + "source_code": "\ndef mean_shift(X, *, bandwidth=None, seeds=None, bin_seeding=False, min_bin_freq=1, cluster_all=True, max_iter=300, n_jobs=None):\n \"\"\"Perform mean shift clustering of data using a flat kernel.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n\n X : array-like of shape (n_samples, n_features)\n Input data.\n\n bandwidth : float, default=None\n Kernel bandwidth.\n\n If bandwidth is not given, it is determined using a heuristic based on\n the median of all pairwise distances. This will take quadratic time in\n the number of samples. The sklearn.cluster.estimate_bandwidth function\n can be used to do this more efficiently.\n\n seeds : array-like of shape (n_seeds, n_features) or None\n Point used as initial kernel locations. If None and bin_seeding=False,\n each data point is used as a seed. If None and bin_seeding=True,\n see bin_seeding.\n\n bin_seeding : bool, default=False\n If true, initial kernel locations are not locations of all\n points, but rather the location of the discretized version of\n points, where points are binned onto a grid whose coarseness\n corresponds to the bandwidth. Setting this option to True will speed\n up the algorithm because fewer seeds will be initialized.\n Ignored if seeds argument is not None.\n\n min_bin_freq : int, default=1\n To speed up the algorithm, accept only those bins with at least\n min_bin_freq points as seeds.\n\n cluster_all : bool, default=True\n If true, then all points are clustered, even those orphans that are\n not within any kernel. Orphans are assigned to the nearest kernel.\n If false, then orphans are given cluster label -1.\n\n max_iter : int, default=300\n Maximum number of iterations, per seed point before the clustering\n operation terminates (for that seed point), if has not converged yet.\n\n n_jobs : int, default=None\n The number of jobs to use for the computation. This works by computing\n each of the n_init runs in parallel.\n\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n .. versionadded:: 0.17\n Parallel Execution using *n_jobs*.\n\n Returns\n -------\n\n cluster_centers : ndarray of shape (n_clusters, n_features)\n Coordinates of cluster centers.\n\n labels : ndarray of shape (n_samples,)\n Cluster labels for each point.\n\n Notes\n -----\n For an example, see :ref:`examples/cluster/plot_mean_shift.py\n `.\n \"\"\"\n model = MeanShift(bandwidth=bandwidth, seeds=seeds, min_bin_freq=min_bin_freq, bin_seeding=bin_seeding, cluster_all=cluster_all, n_jobs=n_jobs, max_iter=max_iter).fit(X)\n return model.cluster_centers_, model.labels_" }, { "name": "__init__", @@ -35752,7 +36725,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "min_samples", @@ -35762,7 +36736,8 @@ "docstring": { "type": "int > 1 or float between 0 and 1, default=5", "description": "The number of samples in a neighborhood for a point to be considered as\na core point. Also, up and down steep regions can't have more than\n``min_samples`` consecutive non-steep points. 
Expressed as an absolute\nnumber or a fraction of the number of samples (rounded to be at least\n2)." - } + }, + "refined_type": {} }, { "name": "max_eps", @@ -35772,7 +36747,8 @@ "docstring": { "type": "float, default=np.inf", "description": "The maximum distance between two samples for one to be considered as\nin the neighborhood of the other. Default value of ``np.inf`` will\nidentify clusters across all scales; reducing ``max_eps`` will result\nin shorter run times." - } + }, + "refined_type": {} }, { "name": "metric", @@ -35782,7 +36758,8 @@ "docstring": { "type": "str or callable, default='minkowski'", "description": "Metric to use for distance computation. Any metric from scikit-learn\nor scipy.spatial.distance can be used.\n\nIf metric is a callable function, it is called on each\npair of instances (rows) and the resulting value recorded. The callable\nshould take two arrays as input and return one value indicating the\ndistance between them. This works for Scipy's metrics, but is less\nefficient than passing the metric name as a string. If metric is\n\"precomputed\", X is assumed to be a distance matrix and must be square.\n\nValid values for metric are:\n\n- from scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2',\n 'manhattan']\n\n- from scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev',\n 'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski',\n 'mahalanobis', 'minkowski', 'rogerstanimoto', 'russellrao',\n 'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean',\n 'yule']\n\nSee the documentation for scipy.spatial.distance for details on these\nmetrics." - } + }, + "refined_type": {} }, { "name": "p", @@ -35792,7 +36769,8 @@ "docstring": { "type": "int, default=2", "description": "Parameter for the Minkowski metric from\n:class:`~sklearn.metrics.pairwise_distances`. When p = 1, this is\nequivalent to using manhattan_distance (l1), and euclidean_distance\n(l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used." - } + }, + "refined_type": {} }, { "name": "metric_params", @@ -35802,7 +36780,8 @@ "docstring": { "type": "dict, default=None", "description": "Additional keyword arguments for the metric function." - } + }, + "refined_type": {} }, { "name": "cluster_method", @@ -35812,7 +36791,8 @@ "docstring": { "type": "str, default='xi'", "description": "The extraction method used to extract clusters using the calculated\nreachability and ordering. Possible values are \"xi\" and \"dbscan\"." - } + }, + "refined_type": {} }, { "name": "eps", @@ -35822,7 +36802,8 @@ "docstring": { "type": "float, default=None", "description": "The maximum distance between two samples for one to be considered as\nin the neighborhood of the other. By default it assumes the same value\nas ``max_eps``.\nUsed only when ``cluster_method='dbscan'``." - } + }, + "refined_type": {} }, { "name": "xi", @@ -35832,7 +36813,8 @@ "docstring": { "type": "float between 0 and 1, default=0.05", "description": "Determines the minimum steepness on the reachability plot that\nconstitutes a cluster boundary. For example, an upwards point in the\nreachability plot is defined by the ratio from one point to its\nsuccessor being at most 1-xi.\nUsed only when ``cluster_method='xi'``." - } + }, + "refined_type": {} }, { "name": "predecessor_correction", @@ -35842,7 +36824,8 @@ "docstring": { "type": "bool, default=True", "description": "Correct clusters according to the predecessors calculated by OPTICS\n[2]_. 
This parameter has minimal effect on most datasets.\nUsed only when ``cluster_method='xi'``." - } + }, + "refined_type": {} }, { "name": "min_cluster_size", @@ -35852,7 +36835,8 @@ "docstring": { "type": "int > 1 or float between 0 and 1, default=None", "description": "Minimum number of samples in an OPTICS cluster, expressed as an\nabsolute number or a fraction of the number of samples (rounded to be\nat least 2). If ``None``, the value of ``min_samples`` is used instead.\nUsed only when ``cluster_method='xi'``." - } + }, + "refined_type": {} }, { "name": "algorithm", @@ -35862,6 +36846,10 @@ "docstring": { "type": "{'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto'", "description": "Algorithm used to compute the nearest neighbors:\n\n- 'ball_tree' will use :class:`BallTree`\n- 'kd_tree' will use :class:`KDTree`\n- 'brute' will use a brute-force search.\n- 'auto' will attempt to decide the most appropriate algorithm\n based on the values passed to :meth:`fit` method. (default)\n\nNote: fitting on sparse input will override the setting of\nthis parameter, using brute force." + }, + "refined_type": { + "kind": "EnumType", + "values": ["auto", "kd_tree", "brute", "ball_tree"] } }, { @@ -35872,7 +36860,8 @@ "docstring": { "type": "int, default=30", "description": "Leaf size passed to :class:`BallTree` or :class:`KDTree`. This can\naffect the speed of the construction and query, as well as the memory\nrequired to store the tree. The optimal value depends on the\nnature of the problem." - } + }, + "refined_type": {} }, { "name": "memory", @@ -35882,7 +36871,8 @@ "docstring": { "type": "str or object with the joblib.Memory interface, default=None", "description": "Used to cache the output of the computation of the tree.\nBy default, no caching is done. If a string is given, it is the\npath to the caching directory." - } + }, + "refined_type": {} }, { "name": "n_jobs", @@ -35892,13 +36882,14 @@ "docstring": { "type": "int, default=None", "description": "The number of parallel jobs to run for neighbors search.\n``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n``-1`` means using all processors. See :term:`Glossary `\nfor more details." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, *, min_samples=5, max_eps=np.inf, metric='minkowski', p=2, metric_params=None, cluster_method='xi', eps=None, xi=0.05, predecessor_correction=True, min_cluster_size=None, algorithm='auto', leaf_size=30, memory=None, n_jobs=None):\n self.max_eps = max_eps\n self.min_samples = min_samples\n self.min_cluster_size = min_cluster_size\n self.algorithm = algorithm\n self.metric = metric\n self.metric_params = metric_params\n self.p = p\n self.leaf_size = leaf_size\n self.cluster_method = cluster_method\n self.eps = eps\n self.xi = xi\n self.predecessor_correction = predecessor_correction\n self.memory = memory\n self.n_jobs = n_jobs" }, { @@ -35916,7 +36907,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -35926,7 +36918,8 @@ "docstring": { "type": "ndarray of shape (n_samples, n_features), or (n_samples, n_samples) if metric=\u2019precomputed\u2019", "description": "A feature array, or array of distances between samples if\nmetric='precomputed'." - } + }, + "refined_type": {} }, { "name": "y", @@ -35936,13 +36929,14 @@ "docstring": { "type": "Ignored", "description": "Not used, present for API consistency by convention." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Perform OPTICS clustering.\n\nExtracts an ordered list of points and reachability distances, and performs initial clustering using ``max_eps`` distance specified at OPTICS object instantiation.", - "docstring": "Perform OPTICS clustering.\n\nExtracts an ordered list of points and reachability distances, and\nperforms initial clustering using ``max_eps`` distance specified at\nOPTICS object instantiation.\n\nParameters\n----------\nX : ndarray of shape (n_samples, n_features), or (n_samples, n_samples) if metric=\u2019precomputed\u2019\n A feature array, or array of distances between samples if\n metric='precomputed'.\n\ny : Ignored\n Not used, present for API consistency by convention.\n\nReturns\n-------\nself : object\n Returns a fitted instance of self.", + "description": "Perform OPTICS clustering.\n\nExtracts an ordered list of points and reachability distances, and\nperforms initial clustering using ``max_eps`` distance specified at\nOPTICS object instantiation.", + "docstring": "Perform OPTICS clustering.\n\n Extracts an ordered list of points and reachability distances, and\n performs initial clustering using ``max_eps`` distance specified at\n OPTICS object instantiation.\n\n Parameters\n ----------\n X : ndarray of shape (n_samples, n_features), or (n_samples, n_samples) if metric=\u2019precomputed\u2019\n A feature array, or array of distances between samples if\n metric='precomputed'.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n Returns\n -------\n self : object\n Returns a fitted instance of self.\n ", "source_code": "\ndef fit(self, X, y=None):\n \"\"\"Perform OPTICS clustering.\n\n Extracts an ordered list of points and reachability distances, and\n performs initial clustering using ``max_eps`` distance specified at\n OPTICS object instantiation.\n\n Parameters\n ----------\n X : ndarray of shape (n_samples, n_features), or (n_samples, n_samples) if metric=\u2019precomputed\u2019\n A feature array, or array of distances between samples if\n metric='precomputed'.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n Returns\n -------\n self : object\n Returns a fitted instance of self.\n \"\"\"\n dtype = bool if self.metric in PAIRWISE_BOOLEAN_FUNCTIONS else float\n if dtype == bool and X.dtype != bool:\n msg = f'Data will be converted to boolean for metric {self.metric}, to avoid this warning, you may convert the data prior to calling fit.'\n warnings.warn(msg, DataConversionWarning)\n X = self._validate_data(X, dtype=dtype)\n memory = check_memory(self.memory)\n if self.cluster_method not in ['dbscan', 'xi']:\n raise ValueError(\"cluster_method should be one of 'dbscan' or 'xi' but is %s\" % self.cluster_method)\n (self.ordering_, self.core_distances_, self.reachability_, self.predecessor_) = memory.cache(compute_optics_graph)(X=X, min_samples=self.min_samples, algorithm=self.algorithm, leaf_size=self.leaf_size, metric=self.metric, metric_params=self.metric_params, p=self.p, n_jobs=self.n_jobs, max_eps=self.max_eps)\n if self.cluster_method == 'xi':\n (labels_, clusters_) = cluster_optics_xi(reachability=self.reachability_, predecessor=self.predecessor_, ordering=self.ordering_, min_samples=self.min_samples, min_cluster_size=self.min_cluster_size, xi=self.xi, predecessor_correction=self.predecessor_correction)\n self.cluster_hierarchy_ = clusters_\n elif self.cluster_method == 'dbscan':\n if self.eps is None:\n eps = self.max_eps\n 
else:\n eps = self.eps\n if eps > self.max_eps:\n raise ValueError('Specify an epsilon smaller than %s. Got %s.' % (self.max_eps, eps))\n labels_ = cluster_optics_dbscan(reachability=self.reachability_, core_distances=self.core_distances_, ordering=self.ordering_, eps=eps)\n self.labels_ = labels_\n return self" }, { @@ -35960,7 +36954,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "The data." - } + }, + "refined_type": {} }, { "name": "neighbors", @@ -35970,7 +36965,8 @@ "docstring": { "type": "NearestNeighbors instance", "description": "The fitted nearest neighbors estimator." - } + }, + "refined_type": {} }, { "name": "min_samples", @@ -35980,7 +36976,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "working_memory", @@ -35990,13 +36987,14 @@ "docstring": { "type": "int, default=None", "description": "The sought maximum memory for temporary distance matrix chunks.\nWhen None (default), the value of\n``sklearn.get_config()['working_memory']`` is used." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Compute the k-th nearest neighbor of each sample.\n\nEquivalent to neighbors.kneighbors(X, self.min_samples)[0][:, -1] but with more memory efficiency.", - "docstring": "Compute the k-th nearest neighbor of each sample.\n\nEquivalent to neighbors.kneighbors(X, self.min_samples)[0][:, -1]\nbut with more memory efficiency.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n The data.\nneighbors : NearestNeighbors instance\n The fitted nearest neighbors estimator.\nworking_memory : int, default=None\n The sought maximum memory for temporary distance matrix chunks.\n When None (default), the value of\n ``sklearn.get_config()['working_memory']`` is used.\n\nReturns\n-------\ncore_distances : ndarray of shape (n_samples,)\n Distance at which each sample becomes a core point.\n Points which will never be core have a distance of inf.", + "description": "Compute the k-th nearest neighbor of each sample.\n\nEquivalent to neighbors.kneighbors(X, self.min_samples)[0][:, -1]\nbut with more memory efficiency.", + "docstring": "Compute the k-th nearest neighbor of each sample.\n\n Equivalent to neighbors.kneighbors(X, self.min_samples)[0][:, -1]\n but with more memory efficiency.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The data.\n neighbors : NearestNeighbors instance\n The fitted nearest neighbors estimator.\n working_memory : int, default=None\n The sought maximum memory for temporary distance matrix chunks.\n When None (default), the value of\n ``sklearn.get_config()['working_memory']`` is used.\n\n Returns\n -------\n core_distances : ndarray of shape (n_samples,)\n Distance at which each sample becomes a core point.\n Points which will never be core have a distance of inf.\n ", "source_code": "\ndef _compute_core_distances_(X, neighbors, min_samples, working_memory):\n \"\"\"Compute the k-th nearest neighbor of each sample.\n\n Equivalent to neighbors.kneighbors(X, self.min_samples)[0][:, -1]\n but with more memory efficiency.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The data.\n neighbors : NearestNeighbors instance\n The fitted nearest neighbors estimator.\n working_memory : int, default=None\n The sought maximum memory for temporary distance matrix chunks.\n When None (default), the value of\n ``sklearn.get_config()['working_memory']`` is used.\n\n Returns\n -------\n 
core_distances : ndarray of shape (n_samples,)\n Distance at which each sample becomes a core point.\n Points which will never be core have a distance of inf.\n \"\"\"\n n_samples = X.shape[0]\n core_distances = np.empty(n_samples)\n core_distances.fill(np.nan)\n chunk_n_rows = get_chunk_n_rows(row_bytes=16 * min_samples, max_n_rows=n_samples, working_memory=working_memory)\n slices = gen_batches(n_samples, chunk_n_rows)\n for sl in slices:\n core_distances[sl] = neighbors.kneighbors(X[sl], min_samples)[0][:, -1]\n return core_distances" }, { @@ -36014,7 +37012,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "predecessor_plot", @@ -36024,7 +37023,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "ordering", @@ -36034,7 +37034,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "s", @@ -36044,7 +37045,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "e", @@ -36054,13 +37056,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Correct for predecessors.\n\nApplies Algorithm 2 of [1]_. Input parameters are ordered by the computer OPTICS ordering. .. [1] Schubert, Erich, Michael Gertz. \"Improving the Cluster Structure Extracted from OPTICS Plots.\" Proc. of the Conference \"Lernen, Wissen, Daten, Analysen\" (LWDA) (2018): 318-329.", - "docstring": "Correct for predecessors.\n\nApplies Algorithm 2 of [1]_.\n\nInput parameters are ordered by the computer OPTICS ordering.\n\n.. [1] Schubert, Erich, Michael Gertz.\n \"Improving the Cluster Structure Extracted from OPTICS Plots.\" Proc. of\n the Conference \"Lernen, Wissen, Daten, Analysen\" (LWDA) (2018): 318-329.", + "description": "Correct for predecessors.\n\nApplies Algorithm 2 of [1]_.\n\nInput parameters are ordered by the computer OPTICS ordering.\n\n.. [1] Schubert, Erich, Michael Gertz.\n \"Improving the Cluster Structure Extracted from OPTICS Plots.\" Proc. of\n the Conference \"Lernen, Wissen, Daten, Analysen\" (LWDA) (2018): 318-329.", + "docstring": "Correct for predecessors.\n\n Applies Algorithm 2 of [1]_.\n\n Input parameters are ordered by the computer OPTICS ordering.\n\n .. [1] Schubert, Erich, Michael Gertz.\n \"Improving the Cluster Structure Extracted from OPTICS Plots.\" Proc. of\n the Conference \"Lernen, Wissen, Daten, Analysen\" (LWDA) (2018): 318-329.\n ", "source_code": "\ndef _correct_predecessor(reachability_plot, predecessor_plot, ordering, s, e):\n \"\"\"Correct for predecessors.\n\n Applies Algorithm 2 of [1]_.\n\n Input parameters are ordered by the computer OPTICS ordering.\n\n .. [1] Schubert, Erich, Michael Gertz.\n \"Improving the Cluster Structure Extracted from OPTICS Plots.\" Proc. of\n the Conference \"Lernen, Wissen, Daten, Analysen\" (LWDA) (2018): 318-329.\n \"\"\"\n while s < e:\n if reachability_plot[s] > reachability_plot[e]:\n return s, e\n p_e = ordering[predecessor_plot[e]]\n for i in range(s, e):\n if p_e == ordering[i]:\n return s, e\n e -= 1\n return None, None" }, { @@ -36078,7 +37081,8 @@ "docstring": { "type": "ndarray of shape (n_samples,), dtype=bool", "description": "True if the point is steep downward (upward)." 
- } + }, + "refined_type": {} }, { "name": "xward_point", @@ -36088,7 +37092,8 @@ "docstring": { "type": "ndarray of shape (n_samples,), dtype=bool", "description": "True if the point is an upward (respectively downward) point." - } + }, + "refined_type": {} }, { "name": "start", @@ -36098,7 +37103,8 @@ "docstring": { "type": "int", "description": "The start of the xward region." - } + }, + "refined_type": {} }, { "name": "min_samples", @@ -36108,13 +37114,14 @@ "docstring": { "type": "int", "description": "The same as the min_samples given to OPTICS. Up and down steep\nregions can't have more then ``min_samples`` consecutive non-steep\npoints." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Extend the area until it's maximal.\n\nIt's the same function for both upward and downward reagions, depending on the given input parameters. Assuming: - steep_{upward/downward}: bool array indicating whether a point is a steep {upward/downward}; - upward/downward: bool array indicating whether a point is upward/downward; To extend an upward reagion, ``steep_point=steep_upward`` and ``xward_point=downward`` are expected, and to extend a downward region, ``steep_point=steep_downward`` and ``xward_point=upward``.", - "docstring": "Extend the area until it's maximal.\n\nIt's the same function for both upward and downward reagions, depending on\nthe given input parameters. Assuming:\n\n - steep_{upward/downward}: bool array indicating whether a point is a\n steep {upward/downward};\n - upward/downward: bool array indicating whether a point is\n upward/downward;\n\nTo extend an upward reagion, ``steep_point=steep_upward`` and\n``xward_point=downward`` are expected, and to extend a downward region,\n``steep_point=steep_downward`` and ``xward_point=upward``.\n\nParameters\n----------\nsteep_point : ndarray of shape (n_samples,), dtype=bool\n True if the point is steep downward (upward).\n\nxward_point : ndarray of shape (n_samples,), dtype=bool\n True if the point is an upward (respectively downward) point.\n\nstart : int\n The start of the xward region.\n\nmin_samples : int\n The same as the min_samples given to OPTICS. Up and down steep\n regions can't have more then ``min_samples`` consecutive non-steep\n points.\n\nReturns\n-------\nindex : int\n The current index iterating over all the samples, i.e. where we are up\n to in our search.\n\nend : int\n The end of the region, which can be behind the index. The region\n includes the ``end`` index.", + "description": "Extend the area until it's maximal.\n\nIt's the same function for both upward and downward reagions, depending on\nthe given input parameters. Assuming:\n\n - steep_{upward/downward}: bool array indicating whether a point is a\n steep {upward/downward};\n - upward/downward: bool array indicating whether a point is\n upward/downward;\n\nTo extend an upward reagion, ``steep_point=steep_upward`` and\n``xward_point=downward`` are expected, and to extend a downward region,\n``steep_point=steep_downward`` and ``xward_point=upward``.", + "docstring": "Extend the area until it's maximal.\n\n It's the same function for both upward and downward reagions, depending on\n the given input parameters. 
Assuming:\n\n - steep_{upward/downward}: bool array indicating whether a point is a\n steep {upward/downward};\n - upward/downward: bool array indicating whether a point is\n upward/downward;\n\n To extend an upward reagion, ``steep_point=steep_upward`` and\n ``xward_point=downward`` are expected, and to extend a downward region,\n ``steep_point=steep_downward`` and ``xward_point=upward``.\n\n Parameters\n ----------\n steep_point : ndarray of shape (n_samples,), dtype=bool\n True if the point is steep downward (upward).\n\n xward_point : ndarray of shape (n_samples,), dtype=bool\n True if the point is an upward (respectively downward) point.\n\n start : int\n The start of the xward region.\n\n min_samples : int\n The same as the min_samples given to OPTICS. Up and down steep\n regions can't have more then ``min_samples`` consecutive non-steep\n points.\n\n Returns\n -------\n index : int\n The current index iterating over all the samples, i.e. where we are up\n to in our search.\n\n end : int\n The end of the region, which can be behind the index. The region\n includes the ``end`` index.\n ", "source_code": "\ndef _extend_region(steep_point, xward_point, start, min_samples):\n \"\"\"Extend the area until it's maximal.\n\n It's the same function for both upward and downward reagions, depending on\n the given input parameters. Assuming:\n\n - steep_{upward/downward}: bool array indicating whether a point is a\n steep {upward/downward};\n - upward/downward: bool array indicating whether a point is\n upward/downward;\n\n To extend an upward reagion, ``steep_point=steep_upward`` and\n ``xward_point=downward`` are expected, and to extend a downward region,\n ``steep_point=steep_downward`` and ``xward_point=upward``.\n\n Parameters\n ----------\n steep_point : ndarray of shape (n_samples,), dtype=bool\n True if the point is steep downward (upward).\n\n xward_point : ndarray of shape (n_samples,), dtype=bool\n True if the point is an upward (respectively downward) point.\n\n start : int\n The start of the xward region.\n\n min_samples : int\n The same as the min_samples given to OPTICS. Up and down steep\n regions can't have more then ``min_samples`` consecutive non-steep\n points.\n\n Returns\n -------\n index : int\n The current index iterating over all the samples, i.e. where we are up\n to in our search.\n\n end : int\n The end of the region, which can be behind the index. The region\n includes the ``end`` index.\n \"\"\"\n n_samples = len(steep_point)\n non_xward_points = 0\n index = start\n end = start\n while index < n_samples:\n if steep_point[index]:\n non_xward_points = 0\n end = index\n elif not xward_point[index]:\n non_xward_points += 1\n if non_xward_points > min_samples:\n break\n else:\n return end\n index += 1\n return end" }, { @@ -36132,7 +37139,8 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "The ordering of points calculated by OPTICS" - } + }, + "refined_type": {} }, { "name": "clusters", @@ -36142,13 +37150,14 @@ "docstring": { "type": "array-like of shape (n_clusters, 2)", "description": "List of clusters i.e. (start, end) tuples,\nas returned by `_xi_cluster`." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Extracts the labels from the clusters returned by `_xi_cluster`. 
We rely on the fact that clusters are stored with the smaller clusters coming before the larger ones.", - "docstring": "Extracts the labels from the clusters returned by `_xi_cluster`.\nWe rely on the fact that clusters are stored\nwith the smaller clusters coming before the larger ones.\n\nParameters\n----------\nordering : array-like of shape (n_samples,)\n The ordering of points calculated by OPTICS\n\nclusters : array-like of shape (n_clusters, 2)\n List of clusters i.e. (start, end) tuples,\n as returned by `_xi_cluster`.\n\nReturns\n-------\nlabels : ndarray of shape (n_samples,)", + "description": "Extracts the labels from the clusters returned by `_xi_cluster`.\nWe rely on the fact that clusters are stored\nwith the smaller clusters coming before the larger ones.", + "docstring": "Extracts the labels from the clusters returned by `_xi_cluster`.\n We rely on the fact that clusters are stored\n with the smaller clusters coming before the larger ones.\n\n Parameters\n ----------\n ordering : array-like of shape (n_samples,)\n The ordering of points calculated by OPTICS\n\n clusters : array-like of shape (n_clusters, 2)\n List of clusters i.e. (start, end) tuples,\n as returned by `_xi_cluster`.\n\n Returns\n -------\n labels : ndarray of shape (n_samples,)\n ", "source_code": "\ndef _extract_xi_labels(ordering, clusters):\n \"\"\"Extracts the labels from the clusters returned by `_xi_cluster`.\n We rely on the fact that clusters are stored\n with the smaller clusters coming before the larger ones.\n\n Parameters\n ----------\n ordering : array-like of shape (n_samples,)\n The ordering of points calculated by OPTICS\n\n clusters : array-like of shape (n_clusters, 2)\n List of clusters i.e. (start, end) tuples,\n as returned by `_xi_cluster`.\n\n Returns\n -------\n labels : ndarray of shape (n_samples,)\n \"\"\"\n labels = np.full(len(ordering), -1, dtype=int)\n label = 0\n for c in clusters:\n if not np.any(labels[c[0]:c[1] + 1] != -1):\n labels[c[0]:c[1] + 1] = label\n label += 1\n labels[ordering] = labels.copy()\n return labels" }, { @@ -36166,7 +37175,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "reachability_", @@ -36176,7 +37186,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "predecessor_", @@ -36186,7 +37197,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "point_index", @@ -36196,7 +37208,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "processed", @@ -36206,7 +37219,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -36216,7 +37230,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "nbrs", @@ -36226,7 +37241,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "metric", @@ -36236,7 +37252,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "metric_params", @@ -36246,7 +37263,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "p", @@ -36256,7 +37274,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "max_eps", @@ -36266,13 +37285,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef 
_set_reach_dist(core_distances_, reachability_, predecessor_, point_index, processed, X, nbrs, metric, metric_params, p, max_eps):\n P = X[point_index:point_index + 1]\n indices = nbrs.radius_neighbors(P, radius=max_eps, return_distance=False)[0]\n unproc = np.compress(~np.take(processed, indices), indices)\n if not unproc.size:\n return\n if metric == 'precomputed':\n dists = X[point_index, unproc]\n else:\n _params = dict() if metric_params is None else metric_params.copy()\n if metric == 'minkowski' and 'p' not in _params:\n _params['p'] = p\n dists = pairwise_distances(P, np.take(X, unproc, axis=0), metric=metric, n_jobs=None, **_params).ravel()\n rdists = np.maximum(dists, core_distances_[point_index])\n np.around(rdists, decimals=np.finfo(rdists.dtype).precision, out=rdists)\n improved = np.where(rdists < np.take(reachability_, unproc))\n reachability_[unproc[improved]] = rdists[improved]\n predecessor_[unproc[improved]] = point_index" }, { @@ -36290,7 +37310,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "mib", @@ -36300,7 +37321,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "xi_complement", @@ -36310,7 +37332,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "reachability_plot", @@ -36320,13 +37343,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Update steep down areas (SDAs) using the new maximum in between (mib) value, and the given complement of xi, i.e. ``1 - xi``.", - "docstring": "Update steep down areas (SDAs) using the new maximum in between (mib)\nvalue, and the given complement of xi, i.e. ``1 - xi``.", + "description": "Update steep down areas (SDAs) using the new maximum in between (mib)\nvalue, and the given complement of xi, i.e. ``1 - xi``.", + "docstring": "Update steep down areas (SDAs) using the new maximum in between (mib)\n value, and the given complement of xi, i.e. ``1 - xi``.\n ", "source_code": "\ndef _update_filter_sdas(sdas, mib, xi_complement, reachability_plot):\n \"\"\"Update steep down areas (SDAs) using the new maximum in between (mib)\n value, and the given complement of xi, i.e. ``1 - xi``.\n \"\"\"\n if np.isinf(mib):\n return []\n res = [sda for sda in sdas if mib <= reachability_plot[sda['start']] * xi_complement]\n for sda in res:\n sda['mib'] = max(sda['mib'], mib)\n return res" }, { @@ -36344,7 +37368,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_samples", @@ -36354,7 +37379,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "param_name", @@ -36364,13 +37390,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _validate_size(size, n_samples, param_name):\n if size <= 0 or size != int(size) and size > 1:\n raise ValueError('%s must be a positive integer or a float between 0 and 1. Got %r' % (param_name, size))\n elif size > n_samples:\n raise ValueError('%s must be no greater than the number of samples (%d). Got %d' % (param_name, n_samples, size))" }, { @@ -36388,7 +37415,8 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "The reachability plot, i.e. reachability ordered according to\nthe calculated ordering, all computed by OPTICS." 
- } + }, + "refined_type": {} }, { "name": "predecessor_plot", @@ -36398,7 +37426,8 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "Predecessors ordered according to the calculated ordering." - } + }, + "refined_type": {} }, { "name": "ordering", @@ -36408,7 +37437,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "xi", @@ -36418,7 +37448,8 @@ "docstring": { "type": "float, between 0 and 1", "description": "Determines the minimum steepness on the reachability plot that\nconstitutes a cluster boundary. For example, an upwards point in the\nreachability plot is defined by the ratio from one point to its\nsuccessor being at most 1-xi." - } + }, + "refined_type": {} }, { "name": "min_samples", @@ -36428,7 +37459,8 @@ "docstring": { "type": "int > 1", "description": "The same as the min_samples given to OPTICS. Up and down steep regions\ncan't have more then ``min_samples`` consecutive non-steep points." - } + }, + "refined_type": {} }, { "name": "min_cluster_size", @@ -36438,7 +37470,8 @@ "docstring": { "type": "int > 1", "description": "Minimum number of samples in an OPTICS cluster." - } + }, + "refined_type": {} }, { "name": "predecessor_correction", @@ -36448,13 +37481,14 @@ "docstring": { "type": "bool", "description": "Correct clusters based on the calculated predecessors." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Automatically extract clusters according to the Xi-steep method.\n\nThis is rouphly an implementation of Figure 19 of the OPTICS paper.", - "docstring": "Automatically extract clusters according to the Xi-steep method.\n\nThis is rouphly an implementation of Figure 19 of the OPTICS paper.\n\nParameters\n----------\nreachability_plot : array-like of shape (n_samples,)\n The reachability plot, i.e. reachability ordered according to\n the calculated ordering, all computed by OPTICS.\n\npredecessor_plot : array-like of shape (n_samples,)\n Predecessors ordered according to the calculated ordering.\n\nxi : float, between 0 and 1\n Determines the minimum steepness on the reachability plot that\n constitutes a cluster boundary. For example, an upwards point in the\n reachability plot is defined by the ratio from one point to its\n successor being at most 1-xi.\n\nmin_samples : int > 1\n The same as the min_samples given to OPTICS. Up and down steep regions\n can't have more then ``min_samples`` consecutive non-steep points.\n\nmin_cluster_size : int > 1\n Minimum number of samples in an OPTICS cluster.\n\npredecessor_correction : bool\n Correct clusters based on the calculated predecessors.\n\nReturns\n-------\nclusters : ndarray of shape (n_clusters, 2)\n The list of clusters in the form of [start, end] in each row, with all\n indices inclusive. The clusters are ordered in a way that larger\n clusters encompassing smaller clusters come after those smaller\n clusters.", + "docstring": "Automatically extract clusters according to the Xi-steep method.\n\n This is rouphly an implementation of Figure 19 of the OPTICS paper.\n\n Parameters\n ----------\n reachability_plot : array-like of shape (n_samples,)\n The reachability plot, i.e. reachability ordered according to\n the calculated ordering, all computed by OPTICS.\n\n predecessor_plot : array-like of shape (n_samples,)\n Predecessors ordered according to the calculated ordering.\n\n xi : float, between 0 and 1\n Determines the minimum steepness on the reachability plot that\n constitutes a cluster boundary. 
For example, an upwards point in the\n reachability plot is defined by the ratio from one point to its\n successor being at most 1-xi.\n\n min_samples : int > 1\n The same as the min_samples given to OPTICS. Up and down steep regions\n can't have more then ``min_samples`` consecutive non-steep points.\n\n min_cluster_size : int > 1\n Minimum number of samples in an OPTICS cluster.\n\n predecessor_correction : bool\n Correct clusters based on the calculated predecessors.\n\n Returns\n -------\n clusters : ndarray of shape (n_clusters, 2)\n The list of clusters in the form of [start, end] in each row, with all\n indices inclusive. The clusters are ordered in a way that larger\n clusters encompassing smaller clusters come after those smaller\n clusters.\n ", "source_code": "\ndef _xi_cluster(reachability_plot, predecessor_plot, ordering, xi, min_samples, min_cluster_size, predecessor_correction):\n \"\"\"Automatically extract clusters according to the Xi-steep method.\n\n This is rouphly an implementation of Figure 19 of the OPTICS paper.\n\n Parameters\n ----------\n reachability_plot : array-like of shape (n_samples,)\n The reachability plot, i.e. reachability ordered according to\n the calculated ordering, all computed by OPTICS.\n\n predecessor_plot : array-like of shape (n_samples,)\n Predecessors ordered according to the calculated ordering.\n\n xi : float, between 0 and 1\n Determines the minimum steepness on the reachability plot that\n constitutes a cluster boundary. For example, an upwards point in the\n reachability plot is defined by the ratio from one point to its\n successor being at most 1-xi.\n\n min_samples : int > 1\n The same as the min_samples given to OPTICS. Up and down steep regions\n can't have more then ``min_samples`` consecutive non-steep points.\n\n min_cluster_size : int > 1\n Minimum number of samples in an OPTICS cluster.\n\n predecessor_correction : bool\n Correct clusters based on the calculated predecessors.\n\n Returns\n -------\n clusters : ndarray of shape (n_clusters, 2)\n The list of clusters in the form of [start, end] in each row, with all\n indices inclusive. 
The clusters are ordered in a way that larger\n clusters encompassing smaller clusters come after those smaller\n clusters.\n \"\"\"\n reachability_plot = np.hstack((reachability_plot, np.inf))\n xi_complement = 1 - xi\n sdas = []\n clusters = []\n index = 0\n mib = 0.0\n with np.errstate(invalid='ignore'):\n ratio = reachability_plot[:-1] / reachability_plot[1:]\n steep_upward = ratio <= xi_complement\n steep_downward = ratio >= 1 / xi_complement\n downward = ratio > 1\n upward = ratio < 1\n for steep_index in iter(np.flatnonzero(steep_upward | steep_downward)):\n if steep_index < index:\n continue\n mib = max(mib, np.max(reachability_plot[index:steep_index + 1]))\n if steep_downward[steep_index]:\n sdas = _update_filter_sdas(sdas, mib, xi_complement, reachability_plot)\n D_start = steep_index\n D_end = _extend_region(steep_downward, upward, D_start, min_samples)\n D = {'start': D_start, 'end': D_end, 'mib': 0.0}\n sdas.append(D)\n index = D_end + 1\n mib = reachability_plot[index]\n else:\n sdas = _update_filter_sdas(sdas, mib, xi_complement, reachability_plot)\n U_start = steep_index\n U_end = _extend_region(steep_upward, downward, U_start, min_samples)\n index = U_end + 1\n mib = reachability_plot[index]\n U_clusters = []\n for D in sdas:\n c_start = D['start']\n c_end = U_end\n if reachability_plot[c_end + 1] * xi_complement < D['mib']:\n continue\n D_max = reachability_plot[D['start']]\n if D_max * xi_complement >= reachability_plot[c_end + 1]:\n while reachability_plot[c_start + 1] > reachability_plot[c_end + 1] and c_start < D['end']:\n c_start += 1\n elif reachability_plot[c_end + 1] * xi_complement >= D_max:\n while reachability_plot[c_end - 1] > D_max and c_end > U_start:\n c_end -= 1\n if predecessor_correction:\n (c_start, c_end) = _correct_predecessor(reachability_plot, predecessor_plot, ordering, c_start, c_end)\n if c_start is None:\n continue\n if c_end - c_start + 1 < min_cluster_size:\n continue\n if c_start > D['end']:\n continue\n if c_end < U_start:\n continue\n U_clusters.append((c_start, c_end))\n U_clusters.reverse()\n clusters.extend(U_clusters)\n return np.array(clusters)" }, { @@ -36471,8 +37505,9 @@ "assigned_by": "NAME_ONLY", "docstring": { "type": "array of shape (n_samples,)", - "description": "Reachability distances calculated by OPTICS (``reachability_``)" - } + "description": "Reachability distances calculated by OPTICS (``reachability_``)." + }, + "refined_type": {} }, { "name": "core_distances", @@ -36481,8 +37516,9 @@ "assigned_by": "NAME_ONLY", "docstring": { "type": "array of shape (n_samples,)", - "description": "Distances at which points become core (``core_distances_``)" - } + "description": "Distances at which points become core (``core_distances_``)." + }, + "refined_type": {} }, { "name": "ordering", @@ -36491,8 +37527,9 @@ "assigned_by": "NAME_ONLY", "docstring": { "type": "array of shape (n_samples,)", - "description": "OPTICS ordered point indices (``ordering_``)" - } + "description": "OPTICS ordered point indices (``ordering_``)." + }, + "refined_type": {} }, { "name": "eps", @@ -36502,14 +37539,15 @@ "docstring": { "type": "float", "description": "DBSCAN ``eps`` parameter. Must be set to < ``max_eps``. Results\nwill be close to DBSCAN algorithm if ``eps`` and ``max_eps`` are close\nto one another." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Performs DBSCAN extraction for an arbitrary epsilon.\n\nExtracting the clusters runs in linear time. 
Note that this results in ``labels_`` which are close to a :class:`~sklearn.cluster.DBSCAN` with similar settings and ``eps``, only if ``eps`` is close to ``max_eps``.", - "docstring": "Performs DBSCAN extraction for an arbitrary epsilon.\n\nExtracting the clusters runs in linear time. Note that this results in\n``labels_`` which are close to a :class:`~sklearn.cluster.DBSCAN` with\nsimilar settings and ``eps``, only if ``eps`` is close to ``max_eps``.\n\nParameters\n----------\nreachability : array of shape (n_samples,)\n Reachability distances calculated by OPTICS (``reachability_``)\n\ncore_distances : array of shape (n_samples,)\n Distances at which points become core (``core_distances_``)\n\nordering : array of shape (n_samples,)\n OPTICS ordered point indices (``ordering_``)\n\neps : float\n DBSCAN ``eps`` parameter. Must be set to < ``max_eps``. Results\n will be close to DBSCAN algorithm if ``eps`` and ``max_eps`` are close\n to one another.\n\nReturns\n-------\nlabels_ : array of shape (n_samples,)\n The estimated labels.", - "source_code": "\ndef cluster_optics_dbscan(*, reachability, core_distances, ordering, eps):\n \"\"\"Performs DBSCAN extraction for an arbitrary epsilon.\n\n Extracting the clusters runs in linear time. Note that this results in\n ``labels_`` which are close to a :class:`~sklearn.cluster.DBSCAN` with\n similar settings and ``eps``, only if ``eps`` is close to ``max_eps``.\n\n Parameters\n ----------\n reachability : array of shape (n_samples,)\n Reachability distances calculated by OPTICS (``reachability_``)\n\n core_distances : array of shape (n_samples,)\n Distances at which points become core (``core_distances_``)\n\n ordering : array of shape (n_samples,)\n OPTICS ordered point indices (``ordering_``)\n\n eps : float\n DBSCAN ``eps`` parameter. Must be set to < ``max_eps``. Results\n will be close to DBSCAN algorithm if ``eps`` and ``max_eps`` are close\n to one another.\n\n Returns\n -------\n labels_ : array of shape (n_samples,)\n The estimated labels.\n\n \"\"\"\n n_samples = len(core_distances)\n labels = np.zeros(n_samples, dtype=int)\n far_reach = reachability > eps\n near_core = core_distances <= eps\n labels[ordering] = np.cumsum(far_reach[ordering] & near_core[ordering]) - 1\n labels[far_reach & ~near_core] = -1\n return labels" + "description": "Perform DBSCAN extraction for an arbitrary epsilon.\n\nExtracting the clusters runs in linear time. Note that this results in\n``labels_`` which are close to a :class:`~sklearn.cluster.DBSCAN` with\nsimilar settings and ``eps``, only if ``eps`` is close to ``max_eps``.", + "docstring": "Perform DBSCAN extraction for an arbitrary epsilon.\n\n Extracting the clusters runs in linear time. Note that this results in\n ``labels_`` which are close to a :class:`~sklearn.cluster.DBSCAN` with\n similar settings and ``eps``, only if ``eps`` is close to ``max_eps``.\n\n Parameters\n ----------\n reachability : array of shape (n_samples,)\n Reachability distances calculated by OPTICS (``reachability_``).\n\n core_distances : array of shape (n_samples,)\n Distances at which points become core (``core_distances_``).\n\n ordering : array of shape (n_samples,)\n OPTICS ordered point indices (``ordering_``).\n\n eps : float\n DBSCAN ``eps`` parameter. Must be set to < ``max_eps``. 
Results\n will be close to DBSCAN algorithm if ``eps`` and ``max_eps`` are close\n to one another.\n\n Returns\n -------\n labels_ : array of shape (n_samples,)\n The estimated labels.\n ", + "source_code": "\ndef cluster_optics_dbscan(*, reachability, core_distances, ordering, eps):\n \"\"\"Perform DBSCAN extraction for an arbitrary epsilon.\n\n Extracting the clusters runs in linear time. Note that this results in\n ``labels_`` which are close to a :class:`~sklearn.cluster.DBSCAN` with\n similar settings and ``eps``, only if ``eps`` is close to ``max_eps``.\n\n Parameters\n ----------\n reachability : array of shape (n_samples,)\n Reachability distances calculated by OPTICS (``reachability_``).\n\n core_distances : array of shape (n_samples,)\n Distances at which points become core (``core_distances_``).\n\n ordering : array of shape (n_samples,)\n OPTICS ordered point indices (``ordering_``).\n\n eps : float\n DBSCAN ``eps`` parameter. Must be set to < ``max_eps``. Results\n will be close to DBSCAN algorithm if ``eps`` and ``max_eps`` are close\n to one another.\n\n Returns\n -------\n labels_ : array of shape (n_samples,)\n The estimated labels.\n \"\"\"\n n_samples = len(core_distances)\n labels = np.zeros(n_samples, dtype=int)\n far_reach = reachability > eps\n near_core = core_distances <= eps\n labels[ordering] = np.cumsum(far_reach[ordering] & near_core[ordering]) - 1\n labels[far_reach & ~near_core] = -1\n return labels" }, { "name": "cluster_optics_xi", @@ -36526,7 +37564,8 @@ "docstring": { "type": "ndarray of shape (n_samples,)", "description": "Reachability distances calculated by OPTICS (`reachability_`)" - } + }, + "refined_type": {} }, { "name": "predecessor", @@ -36536,7 +37575,8 @@ "docstring": { "type": "ndarray of shape (n_samples,)", "description": "Predecessors calculated by OPTICS." - } + }, + "refined_type": {} }, { "name": "ordering", @@ -36546,7 +37586,8 @@ "docstring": { "type": "ndarray of shape (n_samples,)", "description": "OPTICS ordered point indices (`ordering_`)" - } + }, + "refined_type": {} }, { "name": "min_samples", @@ -36556,7 +37597,8 @@ "docstring": { "type": "int > 1 or float between 0 and 1", "description": "The same as the min_samples given to OPTICS. Up and down steep regions\ncan't have more then ``min_samples`` consecutive non-steep points.\nExpressed as an absolute number or a fraction of the number of samples\n(rounded to be at least 2)." - } + }, + "refined_type": {} }, { "name": "min_cluster_size", @@ -36566,7 +37608,8 @@ "docstring": { "type": "int > 1 or float between 0 and 1, default=None", "description": "Minimum number of samples in an OPTICS cluster, expressed as an\nabsolute number or a fraction of the number of samples (rounded to be\nat least 2). If ``None``, the value of ``min_samples`` is used instead." - } + }, + "refined_type": {} }, { "name": "xi", @@ -36576,7 +37619,8 @@ "docstring": { "type": "float between 0 and 1, default=0.05", "description": "Determines the minimum steepness on the reachability plot that\nconstitutes a cluster boundary. For example, an upwards point in the\nreachability plot is defined by the ratio from one point to its\nsuccessor being at most 1-xi." - } + }, + "refined_type": {} }, { "name": "predecessor_correction", @@ -36586,13 +37630,14 @@ "docstring": { "type": "bool, default=True", "description": "Correct clusters based on the calculated predecessors." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Automatically extract clusters according to the Xi-steep method.", - "docstring": "Automatically extract clusters according to the Xi-steep method.\n\nParameters\n----------\nreachability : ndarray of shape (n_samples,)\n Reachability distances calculated by OPTICS (`reachability_`)\n\npredecessor : ndarray of shape (n_samples,)\n Predecessors calculated by OPTICS.\n\nordering : ndarray of shape (n_samples,)\n OPTICS ordered point indices (`ordering_`)\n\nmin_samples : int > 1 or float between 0 and 1\n The same as the min_samples given to OPTICS. Up and down steep regions\n can't have more then ``min_samples`` consecutive non-steep points.\n Expressed as an absolute number or a fraction of the number of samples\n (rounded to be at least 2).\n\nmin_cluster_size : int > 1 or float between 0 and 1, default=None\n Minimum number of samples in an OPTICS cluster, expressed as an\n absolute number or a fraction of the number of samples (rounded to be\n at least 2). If ``None``, the value of ``min_samples`` is used instead.\n\nxi : float between 0 and 1, default=0.05\n Determines the minimum steepness on the reachability plot that\n constitutes a cluster boundary. For example, an upwards point in the\n reachability plot is defined by the ratio from one point to its\n successor being at most 1-xi.\n\npredecessor_correction : bool, default=True\n Correct clusters based on the calculated predecessors.\n\nReturns\n-------\nlabels : ndarray of shape (n_samples,)\n The labels assigned to samples. Points which are not included\n in any cluster are labeled as -1.\n\nclusters : ndarray of shape (n_clusters, 2)\n The list of clusters in the form of ``[start, end]`` in each row, with\n all indices inclusive. The clusters are ordered according to ``(end,\n -start)`` (ascending) so that larger clusters encompassing smaller\n clusters come after such nested smaller clusters. Since ``labels`` does\n not reflect the hierarchy, usually ``len(clusters) >\n np.unique(labels)``.", + "docstring": "Automatically extract clusters according to the Xi-steep method.\n\n Parameters\n ----------\n reachability : ndarray of shape (n_samples,)\n Reachability distances calculated by OPTICS (`reachability_`)\n\n predecessor : ndarray of shape (n_samples,)\n Predecessors calculated by OPTICS.\n\n ordering : ndarray of shape (n_samples,)\n OPTICS ordered point indices (`ordering_`)\n\n min_samples : int > 1 or float between 0 and 1\n The same as the min_samples given to OPTICS. Up and down steep regions\n can't have more then ``min_samples`` consecutive non-steep points.\n Expressed as an absolute number or a fraction of the number of samples\n (rounded to be at least 2).\n\n min_cluster_size : int > 1 or float between 0 and 1, default=None\n Minimum number of samples in an OPTICS cluster, expressed as an\n absolute number or a fraction of the number of samples (rounded to be\n at least 2). If ``None``, the value of ``min_samples`` is used instead.\n\n xi : float between 0 and 1, default=0.05\n Determines the minimum steepness on the reachability plot that\n constitutes a cluster boundary. For example, an upwards point in the\n reachability plot is defined by the ratio from one point to its\n successor being at most 1-xi.\n\n predecessor_correction : bool, default=True\n Correct clusters based on the calculated predecessors.\n\n Returns\n -------\n labels : ndarray of shape (n_samples,)\n The labels assigned to samples. 
Points which are not included\n in any cluster are labeled as -1.\n\n clusters : ndarray of shape (n_clusters, 2)\n The list of clusters in the form of ``[start, end]`` in each row, with\n all indices inclusive. The clusters are ordered according to ``(end,\n -start)`` (ascending) so that larger clusters encompassing smaller\n clusters come after such nested smaller clusters. Since ``labels`` does\n not reflect the hierarchy, usually ``len(clusters) >\n np.unique(labels)``.\n ", "source_code": "\ndef cluster_optics_xi(*, reachability, predecessor, ordering, min_samples, min_cluster_size=None, xi=0.05, predecessor_correction=True):\n \"\"\"Automatically extract clusters according to the Xi-steep method.\n\n Parameters\n ----------\n reachability : ndarray of shape (n_samples,)\n Reachability distances calculated by OPTICS (`reachability_`)\n\n predecessor : ndarray of shape (n_samples,)\n Predecessors calculated by OPTICS.\n\n ordering : ndarray of shape (n_samples,)\n OPTICS ordered point indices (`ordering_`)\n\n min_samples : int > 1 or float between 0 and 1\n The same as the min_samples given to OPTICS. Up and down steep regions\n can't have more then ``min_samples`` consecutive non-steep points.\n Expressed as an absolute number or a fraction of the number of samples\n (rounded to be at least 2).\n\n min_cluster_size : int > 1 or float between 0 and 1, default=None\n Minimum number of samples in an OPTICS cluster, expressed as an\n absolute number or a fraction of the number of samples (rounded to be\n at least 2). If ``None``, the value of ``min_samples`` is used instead.\n\n xi : float between 0 and 1, default=0.05\n Determines the minimum steepness on the reachability plot that\n constitutes a cluster boundary. For example, an upwards point in the\n reachability plot is defined by the ratio from one point to its\n successor being at most 1-xi.\n\n predecessor_correction : bool, default=True\n Correct clusters based on the calculated predecessors.\n\n Returns\n -------\n labels : ndarray of shape (n_samples,)\n The labels assigned to samples. Points which are not included\n in any cluster are labeled as -1.\n\n clusters : ndarray of shape (n_clusters, 2)\n The list of clusters in the form of ``[start, end]`` in each row, with\n all indices inclusive. The clusters are ordered according to ``(end,\n -start)`` (ascending) so that larger clusters encompassing smaller\n clusters come after such nested smaller clusters. 
Since ``labels`` does\n not reflect the hierarchy, usually ``len(clusters) >\n np.unique(labels)``.\n \"\"\"\n n_samples = len(reachability)\n _validate_size(min_samples, n_samples, 'min_samples')\n if min_samples <= 1:\n min_samples = max(2, int(min_samples * n_samples))\n if min_cluster_size is None:\n min_cluster_size = min_samples\n _validate_size(min_cluster_size, n_samples, 'min_cluster_size')\n if min_cluster_size <= 1:\n min_cluster_size = max(2, int(min_cluster_size * n_samples))\n clusters = _xi_cluster(reachability[ordering], predecessor[ordering], ordering, xi, min_samples, min_cluster_size, predecessor_correction)\n labels = _extract_xi_labels(ordering, clusters)\n return labels, clusters" }, { @@ -36610,7 +37655,8 @@ "docstring": { "type": "ndarray of shape (n_samples, n_features), or (n_samples, n_samples) if metric=\u2019precomputed\u2019.", "description": "A feature array, or array of distances between samples if\nmetric='precomputed'" - } + }, + "refined_type": {} }, { "name": "min_samples", @@ -36620,7 +37666,8 @@ "docstring": { "type": "int > 1 or float between 0 and 1", "description": "The number of samples in a neighborhood for a point to be considered\nas a core point. Expressed as an absolute number or a fraction of the\nnumber of samples (rounded to be at least 2)." - } + }, + "refined_type": {} }, { "name": "max_eps", @@ -36630,7 +37677,8 @@ "docstring": { "type": "float, default=np.inf", "description": "The maximum distance between two samples for one to be considered as\nin the neighborhood of the other. Default value of ``np.inf`` will\nidentify clusters across all scales; reducing ``max_eps`` will result\nin shorter run times." - } + }, + "refined_type": {} }, { "name": "metric", @@ -36640,7 +37688,8 @@ "docstring": { "type": "str or callable, default='minkowski'", "description": "Metric to use for distance computation. Any metric from scikit-learn\nor scipy.spatial.distance can be used.\n\nIf metric is a callable function, it is called on each\npair of instances (rows) and the resulting value recorded. The callable\nshould take two arrays as input and return one value indicating the\ndistance between them. This works for Scipy's metrics, but is less\nefficient than passing the metric name as a string. If metric is\n\"precomputed\", X is assumed to be a distance matrix and must be square.\n\nValid values for metric are:\n\n- from scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2',\n 'manhattan']\n\n- from scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev',\n 'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski',\n 'mahalanobis', 'minkowski', 'rogerstanimoto', 'russellrao',\n 'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean',\n 'yule']\n\nSee the documentation for scipy.spatial.distance for details on these\nmetrics." - } + }, + "refined_type": {} }, { "name": "p", @@ -36650,7 +37699,8 @@ "docstring": { "type": "int, default=2", "description": "Parameter for the Minkowski metric from\n:class:`~sklearn.metrics.pairwise_distances`. When p = 1, this is\nequivalent to using manhattan_distance (l1), and euclidean_distance\n(l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used." - } + }, + "refined_type": {} }, { "name": "metric_params", @@ -36660,7 +37710,8 @@ "docstring": { "type": "dict, default=None", "description": "Additional keyword arguments for the metric function." 
- } + }, + "refined_type": {} }, { "name": "algorithm", @@ -36670,6 +37721,10 @@ "docstring": { "type": "{'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto'", "description": "Algorithm used to compute the nearest neighbors:\n\n- 'ball_tree' will use :class:`BallTree`\n- 'kd_tree' will use :class:`KDTree`\n- 'brute' will use a brute-force search.\n- 'auto' will attempt to decide the most appropriate algorithm\n based on the values passed to :meth:`fit` method. (default)\n\nNote: fitting on sparse input will override the setting of\nthis parameter, using brute force." + }, + "refined_type": { + "kind": "EnumType", + "values": ["auto", "kd_tree", "brute", "ball_tree"] } }, { @@ -36680,7 +37735,8 @@ "docstring": { "type": "int, default=30", "description": "Leaf size passed to :class:`BallTree` or :class:`KDTree`. This can\naffect the speed of the construction and query, as well as the memory\nrequired to store the tree. The optimal value depends on the\nnature of the problem." - } + }, + "refined_type": {} }, { "name": "n_jobs", @@ -36690,14 +37746,15 @@ "docstring": { "type": "int, default=None", "description": "The number of parallel jobs to run for neighbors search.\n``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n``-1`` means using all processors. See :term:`Glossary `\nfor more details." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Computes the OPTICS reachability graph.\n\nRead more in the :ref:`User Guide `.", - "docstring": "Computes the OPTICS reachability graph.\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\nX : ndarray of shape (n_samples, n_features), or (n_samples, n_samples) if metric=\u2019precomputed\u2019.\n A feature array, or array of distances between samples if\n metric='precomputed'\n\nmin_samples : int > 1 or float between 0 and 1\n The number of samples in a neighborhood for a point to be considered\n as a core point. Expressed as an absolute number or a fraction of the\n number of samples (rounded to be at least 2).\n\nmax_eps : float, default=np.inf\n The maximum distance between two samples for one to be considered as\n in the neighborhood of the other. Default value of ``np.inf`` will\n identify clusters across all scales; reducing ``max_eps`` will result\n in shorter run times.\n\nmetric : str or callable, default='minkowski'\n Metric to use for distance computation. Any metric from scikit-learn\n or scipy.spatial.distance can be used.\n\n If metric is a callable function, it is called on each\n pair of instances (rows) and the resulting value recorded. The callable\n should take two arrays as input and return one value indicating the\n distance between them. This works for Scipy's metrics, but is less\n efficient than passing the metric name as a string. If metric is\n \"precomputed\", X is assumed to be a distance matrix and must be square.\n\n Valid values for metric are:\n\n - from scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2',\n 'manhattan']\n\n - from scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev',\n 'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski',\n 'mahalanobis', 'minkowski', 'rogerstanimoto', 'russellrao',\n 'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean',\n 'yule']\n\n See the documentation for scipy.spatial.distance for details on these\n metrics.\n\np : int, default=2\n Parameter for the Minkowski metric from\n :class:`~sklearn.metrics.pairwise_distances`. 
When p = 1, this is\n equivalent to using manhattan_distance (l1), and euclidean_distance\n (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.\n\nmetric_params : dict, default=None\n Additional keyword arguments for the metric function.\n\nalgorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto'\n Algorithm used to compute the nearest neighbors:\n\n - 'ball_tree' will use :class:`BallTree`\n - 'kd_tree' will use :class:`KDTree`\n - 'brute' will use a brute-force search.\n - 'auto' will attempt to decide the most appropriate algorithm\n based on the values passed to :meth:`fit` method. (default)\n\n Note: fitting on sparse input will override the setting of\n this parameter, using brute force.\n\nleaf_size : int, default=30\n Leaf size passed to :class:`BallTree` or :class:`KDTree`. This can\n affect the speed of the construction and query, as well as the memory\n required to store the tree. The optimal value depends on the\n nature of the problem.\n\nn_jobs : int, default=None\n The number of parallel jobs to run for neighbors search.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\nReturns\n-------\nordering_ : array of shape (n_samples,)\n The cluster ordered list of sample indices.\n\ncore_distances_ : array of shape (n_samples,)\n Distance at which each sample becomes a core point, indexed by object\n order. Points which will never be core have a distance of inf. Use\n ``clust.core_distances_[clust.ordering_]`` to access in cluster order.\n\nreachability_ : array of shape (n_samples,)\n Reachability distances per sample, indexed by object order. Use\n ``clust.reachability_[clust.ordering_]`` to access in cluster order.\n\npredecessor_ : array of shape (n_samples,)\n Point that a sample was reached from, indexed by object order.\n Seed points have a predecessor of -1.\n\nReferences\n----------\n.. [1] Ankerst, Mihael, Markus M. Breunig, Hans-Peter Kriegel,\n and J\u00f6rg Sander. \"OPTICS: ordering points to identify the clustering\n structure.\" ACM SIGMOD Record 28, no. 2 (1999): 49-60.", - "source_code": "\ndef compute_optics_graph(X, *, min_samples, max_eps, metric, p, metric_params, algorithm, leaf_size, n_jobs):\n \"\"\"Computes the OPTICS reachability graph.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n X : ndarray of shape (n_samples, n_features), or (n_samples, n_samples) if metric=\u2019precomputed\u2019.\n A feature array, or array of distances between samples if\n metric='precomputed'\n\n min_samples : int > 1 or float between 0 and 1\n The number of samples in a neighborhood for a point to be considered\n as a core point. Expressed as an absolute number or a fraction of the\n number of samples (rounded to be at least 2).\n\n max_eps : float, default=np.inf\n The maximum distance between two samples for one to be considered as\n in the neighborhood of the other. Default value of ``np.inf`` will\n identify clusters across all scales; reducing ``max_eps`` will result\n in shorter run times.\n\n metric : str or callable, default='minkowski'\n Metric to use for distance computation. Any metric from scikit-learn\n or scipy.spatial.distance can be used.\n\n If metric is a callable function, it is called on each\n pair of instances (rows) and the resulting value recorded. The callable\n should take two arrays as input and return one value indicating the\n distance between them. 
This works for Scipy's metrics, but is less\n efficient than passing the metric name as a string. If metric is\n \"precomputed\", X is assumed to be a distance matrix and must be square.\n\n Valid values for metric are:\n\n - from scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2',\n 'manhattan']\n\n - from scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev',\n 'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski',\n 'mahalanobis', 'minkowski', 'rogerstanimoto', 'russellrao',\n 'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean',\n 'yule']\n\n See the documentation for scipy.spatial.distance for details on these\n metrics.\n\n p : int, default=2\n Parameter for the Minkowski metric from\n :class:`~sklearn.metrics.pairwise_distances`. When p = 1, this is\n equivalent to using manhattan_distance (l1), and euclidean_distance\n (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.\n\n metric_params : dict, default=None\n Additional keyword arguments for the metric function.\n\n algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto'\n Algorithm used to compute the nearest neighbors:\n\n - 'ball_tree' will use :class:`BallTree`\n - 'kd_tree' will use :class:`KDTree`\n - 'brute' will use a brute-force search.\n - 'auto' will attempt to decide the most appropriate algorithm\n based on the values passed to :meth:`fit` method. (default)\n\n Note: fitting on sparse input will override the setting of\n this parameter, using brute force.\n\n leaf_size : int, default=30\n Leaf size passed to :class:`BallTree` or :class:`KDTree`. This can\n affect the speed of the construction and query, as well as the memory\n required to store the tree. The optimal value depends on the\n nature of the problem.\n\n n_jobs : int, default=None\n The number of parallel jobs to run for neighbors search.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n Returns\n -------\n ordering_ : array of shape (n_samples,)\n The cluster ordered list of sample indices.\n\n core_distances_ : array of shape (n_samples,)\n Distance at which each sample becomes a core point, indexed by object\n order. Points which will never be core have a distance of inf. Use\n ``clust.core_distances_[clust.ordering_]`` to access in cluster order.\n\n reachability_ : array of shape (n_samples,)\n Reachability distances per sample, indexed by object order. Use\n ``clust.reachability_[clust.ordering_]`` to access in cluster order.\n\n predecessor_ : array of shape (n_samples,)\n Point that a sample was reached from, indexed by object order.\n Seed points have a predecessor of -1.\n\n References\n ----------\n .. [1] Ankerst, Mihael, Markus M. Breunig, Hans-Peter Kriegel,\n and J\u00f6rg Sander. \"OPTICS: ordering points to identify the clustering\n structure.\" ACM SIGMOD Record 28, no. 
2 (1999): 49-60.\n \"\"\"\n n_samples = X.shape[0]\n _validate_size(min_samples, n_samples, 'min_samples')\n if min_samples <= 1:\n min_samples = max(2, int(min_samples * n_samples))\n reachability_ = np.empty(n_samples)\n reachability_.fill(np.inf)\n predecessor_ = np.empty(n_samples, dtype=int)\n predecessor_.fill(-1)\n nbrs = NearestNeighbors(n_neighbors=min_samples, algorithm=algorithm, leaf_size=leaf_size, metric=metric, metric_params=metric_params, p=p, n_jobs=n_jobs)\n nbrs.fit(X)\n core_distances_ = _compute_core_distances_(X=X, neighbors=nbrs, min_samples=min_samples, working_memory=None)\n core_distances_[core_distances_ > max_eps] = np.inf\n np.around(core_distances_, decimals=np.finfo(core_distances_.dtype).precision, out=core_distances_)\n processed = np.zeros(X.shape[0], dtype=bool)\n ordering = np.zeros(X.shape[0], dtype=int)\n for ordering_idx in range(X.shape[0]):\n index = np.where(processed == 0)[0]\n point = index[np.argmin(reachability_[index])]\n processed[point] = True\n ordering[ordering_idx] = point\n if core_distances_[point] != np.inf:\n _set_reach_dist(core_distances_=core_distances_, reachability_=reachability_, predecessor_=predecessor_, point_index=point, processed=processed, X=X, nbrs=nbrs, metric=metric, metric_params=metric_params, p=p, max_eps=max_eps)\n if np.all(np.isinf(reachability_)):\n warnings.warn('All reachability values are inf. Set a larger max_eps or all data will be considered outliers.', UserWarning)\n return ordering, core_distances_, reachability_, predecessor_" + "description": "Compute the OPTICS reachability graph.\n\nRead more in the :ref:`User Guide `.", + "docstring": "Compute the OPTICS reachability graph.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n X : ndarray of shape (n_samples, n_features), or (n_samples, n_samples) if metric=\u2019precomputed\u2019.\n A feature array, or array of distances between samples if\n metric='precomputed'\n\n min_samples : int > 1 or float between 0 and 1\n The number of samples in a neighborhood for a point to be considered\n as a core point. Expressed as an absolute number or a fraction of the\n number of samples (rounded to be at least 2).\n\n max_eps : float, default=np.inf\n The maximum distance between two samples for one to be considered as\n in the neighborhood of the other. Default value of ``np.inf`` will\n identify clusters across all scales; reducing ``max_eps`` will result\n in shorter run times.\n\n metric : str or callable, default='minkowski'\n Metric to use for distance computation. Any metric from scikit-learn\n or scipy.spatial.distance can be used.\n\n If metric is a callable function, it is called on each\n pair of instances (rows) and the resulting value recorded. The callable\n should take two arrays as input and return one value indicating the\n distance between them. This works for Scipy's metrics, but is less\n efficient than passing the metric name as a string. 
If metric is\n \"precomputed\", X is assumed to be a distance matrix and must be square.\n\n Valid values for metric are:\n\n - from scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2',\n 'manhattan']\n\n - from scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev',\n 'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski',\n 'mahalanobis', 'minkowski', 'rogerstanimoto', 'russellrao',\n 'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean',\n 'yule']\n\n See the documentation for scipy.spatial.distance for details on these\n metrics.\n\n p : int, default=2\n Parameter for the Minkowski metric from\n :class:`~sklearn.metrics.pairwise_distances`. When p = 1, this is\n equivalent to using manhattan_distance (l1), and euclidean_distance\n (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.\n\n metric_params : dict, default=None\n Additional keyword arguments for the metric function.\n\n algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto'\n Algorithm used to compute the nearest neighbors:\n\n - 'ball_tree' will use :class:`BallTree`\n - 'kd_tree' will use :class:`KDTree`\n - 'brute' will use a brute-force search.\n - 'auto' will attempt to decide the most appropriate algorithm\n based on the values passed to :meth:`fit` method. (default)\n\n Note: fitting on sparse input will override the setting of\n this parameter, using brute force.\n\n leaf_size : int, default=30\n Leaf size passed to :class:`BallTree` or :class:`KDTree`. This can\n affect the speed of the construction and query, as well as the memory\n required to store the tree. The optimal value depends on the\n nature of the problem.\n\n n_jobs : int, default=None\n The number of parallel jobs to run for neighbors search.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n Returns\n -------\n ordering_ : array of shape (n_samples,)\n The cluster ordered list of sample indices.\n\n core_distances_ : array of shape (n_samples,)\n Distance at which each sample becomes a core point, indexed by object\n order. Points which will never be core have a distance of inf. Use\n ``clust.core_distances_[clust.ordering_]`` to access in cluster order.\n\n reachability_ : array of shape (n_samples,)\n Reachability distances per sample, indexed by object order. Use\n ``clust.reachability_[clust.ordering_]`` to access in cluster order.\n\n predecessor_ : array of shape (n_samples,)\n Point that a sample was reached from, indexed by object order.\n Seed points have a predecessor of -1.\n\n References\n ----------\n .. [1] Ankerst, Mihael, Markus M. Breunig, Hans-Peter Kriegel,\n and J\u00f6rg Sander. \"OPTICS: ordering points to identify the clustering\n structure.\" ACM SIGMOD Record 28, no. 2 (1999): 49-60.\n ", + "source_code": "\ndef compute_optics_graph(X, *, min_samples, max_eps, metric, p, metric_params, algorithm, leaf_size, n_jobs):\n \"\"\"Compute the OPTICS reachability graph.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n X : ndarray of shape (n_samples, n_features), or (n_samples, n_samples) if metric=\u2019precomputed\u2019.\n A feature array, or array of distances between samples if\n metric='precomputed'\n\n min_samples : int > 1 or float between 0 and 1\n The number of samples in a neighborhood for a point to be considered\n as a core point. 
Expressed as an absolute number or a fraction of the\n number of samples (rounded to be at least 2).\n\n max_eps : float, default=np.inf\n The maximum distance between two samples for one to be considered as\n in the neighborhood of the other. Default value of ``np.inf`` will\n identify clusters across all scales; reducing ``max_eps`` will result\n in shorter run times.\n\n metric : str or callable, default='minkowski'\n Metric to use for distance computation. Any metric from scikit-learn\n or scipy.spatial.distance can be used.\n\n If metric is a callable function, it is called on each\n pair of instances (rows) and the resulting value recorded. The callable\n should take two arrays as input and return one value indicating the\n distance between them. This works for Scipy's metrics, but is less\n efficient than passing the metric name as a string. If metric is\n \"precomputed\", X is assumed to be a distance matrix and must be square.\n\n Valid values for metric are:\n\n - from scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2',\n 'manhattan']\n\n - from scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev',\n 'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski',\n 'mahalanobis', 'minkowski', 'rogerstanimoto', 'russellrao',\n 'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean',\n 'yule']\n\n See the documentation for scipy.spatial.distance for details on these\n metrics.\n\n p : int, default=2\n Parameter for the Minkowski metric from\n :class:`~sklearn.metrics.pairwise_distances`. When p = 1, this is\n equivalent to using manhattan_distance (l1), and euclidean_distance\n (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.\n\n metric_params : dict, default=None\n Additional keyword arguments for the metric function.\n\n algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto'\n Algorithm used to compute the nearest neighbors:\n\n - 'ball_tree' will use :class:`BallTree`\n - 'kd_tree' will use :class:`KDTree`\n - 'brute' will use a brute-force search.\n - 'auto' will attempt to decide the most appropriate algorithm\n based on the values passed to :meth:`fit` method. (default)\n\n Note: fitting on sparse input will override the setting of\n this parameter, using brute force.\n\n leaf_size : int, default=30\n Leaf size passed to :class:`BallTree` or :class:`KDTree`. This can\n affect the speed of the construction and query, as well as the memory\n required to store the tree. The optimal value depends on the\n nature of the problem.\n\n n_jobs : int, default=None\n The number of parallel jobs to run for neighbors search.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n Returns\n -------\n ordering_ : array of shape (n_samples,)\n The cluster ordered list of sample indices.\n\n core_distances_ : array of shape (n_samples,)\n Distance at which each sample becomes a core point, indexed by object\n order. Points which will never be core have a distance of inf. Use\n ``clust.core_distances_[clust.ordering_]`` to access in cluster order.\n\n reachability_ : array of shape (n_samples,)\n Reachability distances per sample, indexed by object order. Use\n ``clust.reachability_[clust.ordering_]`` to access in cluster order.\n\n predecessor_ : array of shape (n_samples,)\n Point that a sample was reached from, indexed by object order.\n Seed points have a predecessor of -1.\n\n References\n ----------\n .. [1] Ankerst, Mihael, Markus M. 
Breunig, Hans-Peter Kriegel,\n and J\u00f6rg Sander. \"OPTICS: ordering points to identify the clustering\n structure.\" ACM SIGMOD Record 28, no. 2 (1999): 49-60.\n \"\"\"\n n_samples = X.shape[0]\n _validate_size(min_samples, n_samples, 'min_samples')\n if min_samples <= 1:\n min_samples = max(2, int(min_samples * n_samples))\n reachability_ = np.empty(n_samples)\n reachability_.fill(np.inf)\n predecessor_ = np.empty(n_samples, dtype=int)\n predecessor_.fill(-1)\n nbrs = NearestNeighbors(n_neighbors=min_samples, algorithm=algorithm, leaf_size=leaf_size, metric=metric, metric_params=metric_params, p=p, n_jobs=n_jobs)\n nbrs.fit(X)\n core_distances_ = _compute_core_distances_(X=X, neighbors=nbrs, min_samples=min_samples, working_memory=None)\n core_distances_[core_distances_ > max_eps] = np.inf\n np.around(core_distances_, decimals=np.finfo(core_distances_.dtype).precision, out=core_distances_)\n processed = np.zeros(X.shape[0], dtype=bool)\n ordering = np.zeros(X.shape[0], dtype=int)\n for ordering_idx in range(X.shape[0]):\n index = np.where(processed == 0)[0]\n point = index[np.argmin(reachability_[index])]\n processed[point] = True\n ordering[ordering_idx] = point\n if core_distances_[point] != np.inf:\n _set_reach_dist(core_distances_=core_distances_, reachability_=reachability_, predecessor_=predecessor_, point_index=point, processed=processed, X=X, nbrs=nbrs, metric=metric, metric_params=metric_params, p=p, max_eps=max_eps)\n if np.all(np.isinf(reachability_)):\n warnings.warn('All reachability values are inf. Set a larger max_eps or all data will be considered outliers.', UserWarning)\n return ordering, core_distances_, reachability_, predecessor_" }, { "name": "__init__", @@ -36714,7 +37771,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_clusters", @@ -36724,7 +37782,8 @@ "docstring": { "type": "int, default=8", "description": "The dimension of the projection subspace." - } + }, + "refined_type": {} }, { "name": "eigen_solver", @@ -36734,6 +37793,10 @@ "docstring": { "type": "{'arpack', 'lobpcg', 'amg'}, default=None", "description": "The eigenvalue decomposition strategy to use. AMG requires pyamg\nto be installed. It can be faster on very large, sparse problems,\nbut may also lead to instabilities. If None, then ``'arpack'`` is\nused. See [4]_ for more details regarding `'lobpcg'`." + }, + "refined_type": { + "kind": "EnumType", + "values": ["lobpcg", "amg", "arpack"] } }, { @@ -36744,7 +37807,8 @@ "docstring": { "type": "int, default=n_clusters", "description": "Number of eigenvectors to use for the spectral embedding." - } + }, + "refined_type": {} }, { "name": "random_state", @@ -36754,7 +37818,8 @@ "docstring": { "type": "int, RandomState instance, default=None", "description": "A pseudo random number generator used for the initialization\nof the lobpcg eigenvectors decomposition when `eigen_solver ==\n'amg'`, and for the K-Means initialization. Use an int to make\nthe results deterministic across calls (See\n:term:`Glossary `).\n\n.. note::\n When using `eigen_solver == 'amg'`,\n it is necessary to also fix the global numpy seed with\n `np.random.seed(int)` to get deterministic results. See\n https://github.com/pyamg/pyamg/issues/139 for further\n information." - } + }, + "refined_type": {} }, { "name": "n_init", @@ -36764,7 +37829,8 @@ "docstring": { "type": "int, default=10", "description": "Number of time the k-means algorithm will be run with different\ncentroid seeds. 
The final results will be the best output of n_init\nconsecutive runs in terms of inertia. Only used if\n``assign_labels='kmeans'``." - } + }, + "refined_type": {} }, { "name": "gamma", @@ -36774,7 +37840,8 @@ "docstring": { "type": "float, default=1.0", "description": "Kernel coefficient for rbf, poly, sigmoid, laplacian and chi2 kernels.\nIgnored for ``affinity='nearest_neighbors'``." - } + }, + "refined_type": {} }, { "name": "affinity", @@ -36784,7 +37851,8 @@ "docstring": { "type": "str or callable, default='rbf'", "description": "How to construct the affinity matrix.\n - 'nearest_neighbors': construct the affinity matrix by computing a\n graph of nearest neighbors.\n - 'rbf': construct the affinity matrix using a radial basis function\n (RBF) kernel.\n - 'precomputed': interpret ``X`` as a precomputed affinity matrix,\n where larger values indicate greater similarity between instances.\n - 'precomputed_nearest_neighbors': interpret ``X`` as a sparse graph\n of precomputed distances, and construct a binary affinity matrix\n from the ``n_neighbors`` nearest neighbors of each instance.\n - one of the kernels supported by\n :func:`~sklearn.metrics.pairwise_kernels`.\n\nOnly kernels that produce similarity scores (non-negative values that\nincrease with similarity) should be used. This property is not checked\nby the clustering algorithm." - } + }, + "refined_type": {} }, { "name": "n_neighbors", @@ -36794,7 +37862,8 @@ "docstring": { "type": "int, default=10", "description": "Number of neighbors to use when constructing the affinity matrix using\nthe nearest neighbors method. Ignored for ``affinity='rbf'``." - } + }, + "refined_type": {} }, { "name": "eigen_tol", @@ -36804,7 +37873,8 @@ "docstring": { "type": "float, default=0.0", "description": "Stopping criterion for eigendecomposition of the Laplacian matrix\nwhen ``eigen_solver='arpack'``." - } + }, + "refined_type": {} }, { "name": "assign_labels", @@ -36814,6 +37884,10 @@ "docstring": { "type": "{'kmeans', 'discretize'}, default='kmeans'", "description": "The strategy for assigning labels in the embedding space. There are two\nways to assign labels after the Laplacian embedding. k-means is a\npopular choice, but it can be sensitive to initialization.\nDiscretization is another approach which is less sensitive to random\ninitialization [3]_." + }, + "refined_type": { + "kind": "EnumType", + "values": ["discretize", "kmeans"] } }, { @@ -36824,7 +37898,8 @@ "docstring": { "type": "float, default=3", "description": "Degree of the polynomial kernel. Ignored by other kernels." - } + }, + "refined_type": {} }, { "name": "coef0", @@ -36834,7 +37909,8 @@ "docstring": { "type": "float, default=1", "description": "Zero coefficient for polynomial and sigmoid kernels.\nIgnored by other kernels." - } + }, + "refined_type": {} }, { "name": "kernel_params", @@ -36844,7 +37920,8 @@ "docstring": { "type": "dict of str to any, default=None", "description": "Parameters (keyword arguments) and values for kernel passed as\ncallable object. Ignored by other kernels." - } + }, + "refined_type": {} }, { "name": "n_jobs", @@ -36854,7 +37931,8 @@ "docstring": { "type": "int, default=None", "description": "The number of parallel jobs to run when `affinity='nearest_neighbors'`\nor `affinity='precomputed_nearest_neighbors'`. The neighbors search\nwill be done in parallel.\n``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n``-1`` means using all processors. See :term:`Glossary `\nfor more details." 
- } + }, + "refined_type": {} }, { "name": "verbose", @@ -36864,13 +37942,14 @@ "docstring": { "type": "bool, default=False", "description": "Verbosity mode.\n\n.. versionadded:: 0.24" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, n_clusters=8, *, eigen_solver=None, n_components=None, random_state=None, n_init=10, gamma=1.0, affinity='rbf', n_neighbors=10, eigen_tol=0.0, assign_labels='kmeans', degree=3, coef0=1, kernel_params=None, n_jobs=None, verbose=False):\n self.n_clusters = n_clusters\n self.eigen_solver = eigen_solver\n self.n_components = n_components\n self.random_state = random_state\n self.n_init = n_init\n self.gamma = gamma\n self.affinity = affinity\n self.n_neighbors = n_neighbors\n self.eigen_tol = eigen_tol\n self.assign_labels = assign_labels\n self.degree = degree\n self.coef0 = coef0\n self.kernel_params = kernel_params\n self.n_jobs = n_jobs\n self.verbose = verbose" }, { @@ -36888,13 +37967,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _more_tags(self):\n return {'pairwise': self.affinity in ['precomputed', 'precomputed_nearest_neighbors']}" }, { @@ -36915,13 +37995,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@deprecated('Attribute `_pairwise` was deprecated in version 0.24 and will be removed in 1.1 (renaming of 0.26).')\n@property\ndef _pairwise(self):\n return self.affinity in ['precomputed', 'precomputed_nearest_neighbors']" }, { @@ -36939,7 +38020,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -36949,6 +38031,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features) or (n_samples, n_samples)", "description": "Training instances to cluster, similarities / affinities between\ninstances if ``affinity='precomputed'``, or distances between\ninstances if ``affinity='precomputed_nearest_neighbors``. If a\nsparse matrix is provided in a format other than ``csr_matrix``,\n``csc_matrix``, or ``coo_matrix``, it will be converted into a\nsparse ``csr_matrix``." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -36959,13 +38045,14 @@ "docstring": { "type": "Ignored", "description": "Not used, present here for API consistency by convention." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Perform spectral clustering from features, or affinity matrix.", - "docstring": "Perform spectral clustering from features, or affinity matrix.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features) or (n_samples, n_samples)\n Training instances to cluster, similarities / affinities between\n instances if ``affinity='precomputed'``, or distances between\n instances if ``affinity='precomputed_nearest_neighbors``. 
If a\n sparse matrix is provided in a format other than ``csr_matrix``,\n ``csc_matrix``, or ``coo_matrix``, it will be converted into a\n sparse ``csr_matrix``.\n\ny : Ignored\n Not used, present here for API consistency by convention.\n\nReturns\n-------\nself : object\n A fitted instance of the estimator.", + "docstring": "Perform spectral clustering from features, or affinity matrix.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features) or (n_samples, n_samples)\n Training instances to cluster, similarities / affinities between\n instances if ``affinity='precomputed'``, or distances between\n instances if ``affinity='precomputed_nearest_neighbors``. If a\n sparse matrix is provided in a format other than ``csr_matrix``,\n ``csc_matrix``, or ``coo_matrix``, it will be converted into a\n sparse ``csr_matrix``.\n\n y : Ignored\n Not used, present here for API consistency by convention.\n\n Returns\n -------\n self : object\n A fitted instance of the estimator.\n ", "source_code": "\ndef fit(self, X, y=None):\n \"\"\"Perform spectral clustering from features, or affinity matrix.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features) or (n_samples, n_samples)\n Training instances to cluster, similarities / affinities between\n instances if ``affinity='precomputed'``, or distances between\n instances if ``affinity='precomputed_nearest_neighbors``. If a\n sparse matrix is provided in a format other than ``csr_matrix``,\n ``csc_matrix``, or ``coo_matrix``, it will be converted into a\n sparse ``csr_matrix``.\n\n y : Ignored\n Not used, present here for API consistency by convention.\n\n Returns\n -------\n self : object\n A fitted instance of the estimator.\n \"\"\"\n X = self._validate_data(X, accept_sparse=['csr', 'csc', 'coo'], dtype=np.float64, ensure_min_samples=2)\n allow_squared = self.affinity in ['precomputed', 'precomputed_nearest_neighbors']\n if X.shape[0] == X.shape[1] and not allow_squared:\n warnings.warn('The spectral clustering API has changed. ``fit``now constructs an affinity matrix from data. 
To use a custom affinity matrix, set ``affinity=precomputed``.')\n if self.affinity == 'nearest_neighbors':\n connectivity = kneighbors_graph(X, n_neighbors=self.n_neighbors, include_self=True, n_jobs=self.n_jobs)\n self.affinity_matrix_ = 0.5 * (connectivity + connectivity.T)\n elif self.affinity == 'precomputed_nearest_neighbors':\n estimator = NearestNeighbors(n_neighbors=self.n_neighbors, n_jobs=self.n_jobs, metric='precomputed').fit(X)\n connectivity = estimator.kneighbors_graph(X=X, mode='connectivity')\n self.affinity_matrix_ = 0.5 * (connectivity + connectivity.T)\n elif self.affinity == 'precomputed':\n self.affinity_matrix_ = X\n else:\n params = self.kernel_params\n if params is None:\n params = {}\n if not callable(self.affinity):\n params['gamma'] = self.gamma\n params['degree'] = self.degree\n params['coef0'] = self.coef0\n self.affinity_matrix_ = pairwise_kernels(X, metric=self.affinity, filter_params=True, **params)\n random_state = check_random_state(self.random_state)\n self.labels_ = spectral_clustering(self.affinity_matrix_, n_clusters=self.n_clusters, n_components=self.n_components, eigen_solver=self.eigen_solver, random_state=random_state, n_init=self.n_init, eigen_tol=self.eigen_tol, assign_labels=self.assign_labels, verbose=self.verbose)\n return self" }, { @@ -36983,7 +38070,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -36993,6 +38081,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features) or (n_samples, n_samples)", "description": "Training instances to cluster, similarities / affinities between\ninstances if ``affinity='precomputed'``, or distances between\ninstances if ``affinity='precomputed_nearest_neighbors``. If a\nsparse matrix is provided in a format other than ``csr_matrix``,\n``csc_matrix``, or ``coo_matrix``, it will be converted into a\nsparse ``csr_matrix``." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -37003,13 +38095,14 @@ "docstring": { "type": "Ignored", "description": "Not used, present here for API consistency by convention." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Perform spectral clustering on `X` and return cluster labels.", - "docstring": "Perform spectral clustering on `X` and return cluster labels.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features) or (n_samples, n_samples)\n Training instances to cluster, similarities / affinities between\n instances if ``affinity='precomputed'``, or distances between\n instances if ``affinity='precomputed_nearest_neighbors``. If a\n sparse matrix is provided in a format other than ``csr_matrix``,\n ``csc_matrix``, or ``coo_matrix``, it will be converted into a\n sparse ``csr_matrix``.\n\ny : Ignored\n Not used, present here for API consistency by convention.\n\nReturns\n-------\nlabels : ndarray of shape (n_samples,)\n Cluster labels.", + "docstring": "Perform spectral clustering on `X` and return cluster labels.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features) or (n_samples, n_samples)\n Training instances to cluster, similarities / affinities between\n instances if ``affinity='precomputed'``, or distances between\n instances if ``affinity='precomputed_nearest_neighbors``. 
If a\n sparse matrix is provided in a format other than ``csr_matrix``,\n ``csc_matrix``, or ``coo_matrix``, it will be converted into a\n sparse ``csr_matrix``.\n\n y : Ignored\n Not used, present here for API consistency by convention.\n\n Returns\n -------\n labels : ndarray of shape (n_samples,)\n Cluster labels.\n ", "source_code": "\ndef fit_predict(self, X, y=None):\n \"\"\"Perform spectral clustering on `X` and return cluster labels.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features) or (n_samples, n_samples)\n Training instances to cluster, similarities / affinities between\n instances if ``affinity='precomputed'``, or distances between\n instances if ``affinity='precomputed_nearest_neighbors``. If a\n sparse matrix is provided in a format other than ``csr_matrix``,\n ``csc_matrix``, or ``coo_matrix``, it will be converted into a\n sparse ``csr_matrix``.\n\n y : Ignored\n Not used, present here for API consistency by convention.\n\n Returns\n -------\n labels : ndarray of shape (n_samples,)\n Cluster labels.\n \"\"\"\n return super().fit_predict(X, y)" }, { @@ -37027,7 +38120,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_clusters)", "description": "The embedding space of the samples." - } + }, + "refined_type": {} }, { "name": "copy", @@ -37037,7 +38131,8 @@ "docstring": { "type": "bool, default=True", "description": "Whether to copy vectors, or perform in-place normalization." - } + }, + "refined_type": {} }, { "name": "max_svd_restarts", @@ -37047,7 +38142,8 @@ "docstring": { "type": "int, default=30", "description": "Maximum number of attempts to restart SVD if convergence fails" - } + }, + "refined_type": {} }, { "name": "n_iter_max", @@ -37057,7 +38153,8 @@ "docstring": { "type": "int, default=30", "description": "Maximum number of iterations to attempt in rotation and partition\nmatrix search if machine precision convergence is not reached" - } + }, + "refined_type": {} }, { "name": "random_state", @@ -37067,14 +38164,15 @@ "docstring": { "type": "int, RandomState instance, default=None", "description": "Determines random number generation for rotation matrix initialization.\nUse an int to make the randomness deterministic.\nSee :term:`Glossary `." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Search for a partition matrix which is closest to the eigenvector embedding.\n\nThis implementation was proposed in [1]_.", - "docstring": "Search for a partition matrix which is closest to the eigenvector embedding.\n\nThis implementation was proposed in [1]_.\n\nParameters\n----------\nvectors : array-like of shape (n_samples, n_clusters)\n The embedding space of the samples.\n\ncopy : bool, default=True\n Whether to copy vectors, or perform in-place normalization.\n\nmax_svd_restarts : int, default=30\n Maximum number of attempts to restart SVD if convergence fails\n\nn_iter_max : int, default=30\n Maximum number of iterations to attempt in rotation and partition\n matrix search if machine precision convergence is not reached\n\nrandom_state : int, RandomState instance, default=None\n Determines random number generation for rotation matrix initialization.\n Use an int to make the randomness deterministic.\n See :term:`Glossary `.\n\nReturns\n-------\nlabels : array of integers, shape: n_samples\n The labels of the clusters.\n\nReferences\n----------\n\n.. [1] `Multiclass spectral clustering, 2003\n Stella X. 
Yu, Jianbo Shi\n `_\n\nNotes\n-----\n\nThe eigenvector embedding is used to iteratively search for the\nclosest discrete partition. First, the eigenvector embedding is\nnormalized to the space of partition matrices. An optimal discrete\npartition matrix closest to this normalized embedding multiplied by\nan initial rotation is calculated. Fixing this discrete partition\nmatrix, an optimal rotation matrix is calculated. These two\ncalculations are performed until convergence. The discrete partition\nmatrix is returned as the clustering solution. Used in spectral\nclustering, this method tends to be faster and more robust to random\ninitialization than k-means.", - "source_code": "\ndef discretize(vectors, *, copy=True, max_svd_restarts=30, n_iter_max=20, random_state=None):\n \"\"\"Search for a partition matrix which is closest to the eigenvector embedding.\n\n This implementation was proposed in [1]_.\n\n Parameters\n ----------\n vectors : array-like of shape (n_samples, n_clusters)\n The embedding space of the samples.\n\n copy : bool, default=True\n Whether to copy vectors, or perform in-place normalization.\n\n max_svd_restarts : int, default=30\n Maximum number of attempts to restart SVD if convergence fails\n\n n_iter_max : int, default=30\n Maximum number of iterations to attempt in rotation and partition\n matrix search if machine precision convergence is not reached\n\n random_state : int, RandomState instance, default=None\n Determines random number generation for rotation matrix initialization.\n Use an int to make the randomness deterministic.\n See :term:`Glossary `.\n\n Returns\n -------\n labels : array of integers, shape: n_samples\n The labels of the clusters.\n\n References\n ----------\n\n .. [1] `Multiclass spectral clustering, 2003\n Stella X. Yu, Jianbo Shi\n `_\n\n Notes\n -----\n\n The eigenvector embedding is used to iteratively search for the\n closest discrete partition. First, the eigenvector embedding is\n normalized to the space of partition matrices. An optimal discrete\n partition matrix closest to this normalized embedding multiplied by\n an initial rotation is calculated. Fixing this discrete partition\n matrix, an optimal rotation matrix is calculated. These two\n calculations are performed until convergence. The discrete partition\n matrix is returned as the clustering solution. 
Used in spectral\n clustering, this method tends to be faster and more robust to random\n initialization than k-means.\n\n \"\"\"\n from scipy.sparse import csc_matrix\n from scipy.linalg import LinAlgError\n random_state = check_random_state(random_state)\n vectors = as_float_array(vectors, copy=copy)\n eps = np.finfo(float).eps\n (n_samples, n_components) = vectors.shape\n norm_ones = np.sqrt(n_samples)\n for i in range(vectors.shape[1]):\n vectors[:, i] = vectors[:, i] / np.linalg.norm(vectors[:, i]) * norm_ones\n if vectors[0, i] != 0:\n vectors[:, i] = -1 * vectors[:, i] * np.sign(vectors[0, i])\n vectors = vectors / np.sqrt((vectors**2).sum(axis=1))[:, np.newaxis]\n svd_restarts = 0\n has_converged = False\n while svd_restarts < max_svd_restarts and not has_converged:\n rotation = np.zeros((n_components, n_components))\n rotation[:, 0] = vectors[random_state.randint(n_samples), :].T\n c = np.zeros(n_samples)\n for j in range(1, n_components):\n c += np.abs(np.dot(vectors, rotation[:, j - 1]))\n rotation[:, j] = vectors[c.argmin(), :].T\n last_objective_value = 0.0\n n_iter = 0\n while not has_converged:\n n_iter += 1\n t_discrete = np.dot(vectors, rotation)\n labels = t_discrete.argmax(axis=1)\n vectors_discrete = csc_matrix((np.ones(len(labels)), (np.arange(0, n_samples), labels)), shape=(n_samples, n_components))\n t_svd = vectors_discrete.T * vectors\n try:\n (U, S, Vh) = np.linalg.svd(t_svd)\n svd_restarts += 1\n except LinAlgError:\n print('SVD did not converge, randomizing and trying again')\n break\n ncut_value = 2.0 * (n_samples - S.sum())\n if abs(ncut_value - last_objective_value) < eps or n_iter > n_iter_max:\n has_converged = True\n else:\n last_objective_value = ncut_value\n rotation = np.dot(Vh.T, U.T)\n if not has_converged:\n raise LinAlgError('SVD did not converge')\n return labels" + "docstring": "Search for a partition matrix which is closest to the eigenvector embedding.\n\n This implementation was proposed in [1]_.\n\n Parameters\n ----------\n vectors : array-like of shape (n_samples, n_clusters)\n The embedding space of the samples.\n\n copy : bool, default=True\n Whether to copy vectors, or perform in-place normalization.\n\n max_svd_restarts : int, default=30\n Maximum number of attempts to restart SVD if convergence fails\n\n n_iter_max : int, default=30\n Maximum number of iterations to attempt in rotation and partition\n matrix search if machine precision convergence is not reached\n\n random_state : int, RandomState instance, default=None\n Determines random number generation for rotation matrix initialization.\n Use an int to make the randomness deterministic.\n See :term:`Glossary `.\n\n Returns\n -------\n labels : array of integers, shape: n_samples\n The labels of the clusters.\n\n References\n ----------\n\n .. [1] `Multiclass spectral clustering, 2003\n Stella X. Yu, Jianbo Shi\n `_\n\n Notes\n -----\n\n The eigenvector embedding is used to iteratively search for the\n closest discrete partition. First, the eigenvector embedding is\n normalized to the space of partition matrices. An optimal discrete\n partition matrix closest to this normalized embedding multiplied by\n an initial rotation is calculated. Fixing this discrete partition\n matrix, an optimal rotation matrix is calculated. These two\n calculations are performed until convergence. The discrete partition\n matrix is returned as the clustering solution. 
Used in spectral\n clustering, this method tends to be faster and more robust to random\n initialization than k-means.\n\n ", + "source_code": "\ndef discretize(vectors, *, copy=True, max_svd_restarts=30, n_iter_max=20, random_state=None):\n \"\"\"Search for a partition matrix which is closest to the eigenvector embedding.\n\n This implementation was proposed in [1]_.\n\n Parameters\n ----------\n vectors : array-like of shape (n_samples, n_clusters)\n The embedding space of the samples.\n\n copy : bool, default=True\n Whether to copy vectors, or perform in-place normalization.\n\n max_svd_restarts : int, default=30\n Maximum number of attempts to restart SVD if convergence fails\n\n n_iter_max : int, default=30\n Maximum number of iterations to attempt in rotation and partition\n matrix search if machine precision convergence is not reached\n\n random_state : int, RandomState instance, default=None\n Determines random number generation for rotation matrix initialization.\n Use an int to make the randomness deterministic.\n See :term:`Glossary `.\n\n Returns\n -------\n labels : array of integers, shape: n_samples\n The labels of the clusters.\n\n References\n ----------\n\n .. [1] `Multiclass spectral clustering, 2003\n Stella X. Yu, Jianbo Shi\n `_\n\n Notes\n -----\n\n The eigenvector embedding is used to iteratively search for the\n closest discrete partition. First, the eigenvector embedding is\n normalized to the space of partition matrices. An optimal discrete\n partition matrix closest to this normalized embedding multiplied by\n an initial rotation is calculated. Fixing this discrete partition\n matrix, an optimal rotation matrix is calculated. These two\n calculations are performed until convergence. The discrete partition\n matrix is returned as the clustering solution. 
Used in spectral\n clustering, this method tends to be faster and more robust to random\n initialization than k-means.\n\n \"\"\"\n from scipy.sparse import csc_matrix\n from scipy.linalg import LinAlgError\n random_state = check_random_state(random_state)\n vectors = as_float_array(vectors, copy=copy)\n eps = np.finfo(float).eps\n (n_samples, n_components) = vectors.shape\n norm_ones = np.sqrt(n_samples)\n for i in range(vectors.shape[1]):\n vectors[:, i] = vectors[:, i] / np.linalg.norm(vectors[:, i]) * norm_ones\n if vectors[0, i] != 0:\n vectors[:, i] = -1 * vectors[:, i] * np.sign(vectors[0, i])\n vectors = vectors / np.sqrt((vectors**2).sum(axis=1))[:, np.newaxis]\n svd_restarts = 0\n has_converged = False\n while svd_restarts < max_svd_restarts and not has_converged:\n rotation = np.zeros((n_components, n_components))\n rotation[:, 0] = vectors[random_state.randint(n_samples), :].T\n c = np.zeros(n_samples)\n for j in range(1, n_components):\n c += np.abs(np.dot(vectors, rotation[:, j - 1]))\n rotation[:, j] = vectors[c.argmin(), :].T\n last_objective_value = 0.0\n n_iter = 0\n while not has_converged:\n n_iter += 1\n t_discrete = np.dot(vectors, rotation)\n labels = t_discrete.argmax(axis=1)\n vectors_discrete = csc_matrix((np.ones(len(labels)), (np.arange(0, n_samples), labels)), shape=(n_samples, n_components))\n t_svd = vectors_discrete.T * vectors\n try:\n (U, S, Vh) = np.linalg.svd(t_svd)\n except LinAlgError:\n svd_restarts += 1\n print('SVD did not converge, randomizing and trying again')\n break\n ncut_value = 2.0 * (n_samples - S.sum())\n if abs(ncut_value - last_objective_value) < eps or n_iter > n_iter_max:\n has_converged = True\n else:\n last_objective_value = ncut_value\n rotation = np.dot(Vh.T, U.T)\n if not has_converged:\n raise LinAlgError('SVD did not converge')\n return labels" }, { "name": "spectral_clustering", @@ -37091,6 +38189,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_samples)", "description": "The affinity matrix describing the relationship of the samples to\nembed. **Must be symmetric**.\n\nPossible examples:\n - adjacency matrix of a graph,\n - heat kernel of the pairwise distance matrix of the samples,\n - symmetric k-nearest neighbours connectivity matrix of the samples." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -37101,7 +38203,8 @@ "docstring": { "type": "int, default=None", "description": "Number of clusters to extract." - } + }, + "refined_type": {} }, { "name": "n_components", @@ -37111,7 +38214,8 @@ "docstring": { "type": "int, default=n_clusters", "description": "Number of eigenvectors to use for the spectral embedding" - } + }, + "refined_type": {} }, { "name": "eigen_solver", @@ -37121,6 +38225,10 @@ "docstring": { "type": "{None, 'arpack', 'lobpcg', or 'amg'}", "description": "The eigenvalue decomposition strategy to use. AMG requires pyamg\nto be installed. It can be faster on very large, sparse problems,\nbut may also lead to instabilities. If None, then ``'arpack'`` is\nused. See [4]_ for more details regarding `'lobpcg'`." + }, + "refined_type": { + "kind": "EnumType", + "values": ["lobpcg", "amg", "arpack"] } }, { @@ -37131,7 +38239,8 @@ "docstring": { "type": "int, RandomState instance, default=None", "description": "A pseudo random number generator used for the initialization\nof the lobpcg eigenvectors decomposition when `eigen_solver ==\n'amg'`, and for the K-Means initialization. 
Use an int to make\nthe results deterministic across calls (See\n:term:`Glossary `).\n\n.. note::\n When using `eigen_solver == 'amg'`,\n it is necessary to also fix the global numpy seed with\n `np.random.seed(int)` to get deterministic results. See\n https://github.com/pyamg/pyamg/issues/139 for further\n information." - } + }, + "refined_type": {} }, { "name": "n_init", @@ -37141,7 +38250,8 @@ "docstring": { "type": "int, default=10", "description": "Number of time the k-means algorithm will be run with different\ncentroid seeds. The final results will be the best output of n_init\nconsecutive runs in terms of inertia. Only used if\n``assign_labels='kmeans'``." - } + }, + "refined_type": {} }, { "name": "eigen_tol", @@ -37151,7 +38261,8 @@ "docstring": { "type": "float, default=0.0", "description": "Stopping criterion for eigendecomposition of the Laplacian matrix\nwhen using arpack eigen_solver." - } + }, + "refined_type": {} }, { "name": "assign_labels", @@ -37161,6 +38272,10 @@ "docstring": { "type": "{'kmeans', 'discretize'}, default='kmeans'", "description": "The strategy to use to assign labels in the embedding\nspace. There are two ways to assign labels after the Laplacian\nembedding. k-means can be applied and is a popular choice. But it can\nalso be sensitive to initialization. Discretization is another\napproach which is less sensitive to random initialization [3]_." + }, + "refined_type": { + "kind": "EnumType", + "values": ["discretize", "kmeans"] } }, { @@ -37171,13 +38286,14 @@ "docstring": { "type": "bool, default=False", "description": "Verbosity mode.\n\n.. versionadded:: 0.24" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Apply clustering to a projection of the normalized Laplacian.\n\nIn practice Spectral Clustering is very useful when the structure of the individual clusters is highly non-convex or more generally when a measure of the center and spread of the cluster is not a suitable description of the complete cluster. For instance, when clusters are nested circles on the 2D plane. If affinity is the adjacency matrix of a graph, this method can be used to find normalized graph cuts [1]_, [2]_. Read more in the :ref:`User Guide `.", - "docstring": "Apply clustering to a projection of the normalized Laplacian.\n\nIn practice Spectral Clustering is very useful when the structure of\nthe individual clusters is highly non-convex or more generally when\na measure of the center and spread of the cluster is not a suitable\ndescription of the complete cluster. For instance, when clusters are\nnested circles on the 2D plane.\n\nIf affinity is the adjacency matrix of a graph, this method can be\nused to find normalized graph cuts [1]_, [2]_.\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\naffinity : {array-like, sparse matrix} of shape (n_samples, n_samples)\n The affinity matrix describing the relationship of the samples to\n embed. **Must be symmetric**.\n\n Possible examples:\n - adjacency matrix of a graph,\n - heat kernel of the pairwise distance matrix of the samples,\n - symmetric k-nearest neighbours connectivity matrix of the samples.\n\nn_clusters : int, default=None\n Number of clusters to extract.\n\nn_components : int, default=n_clusters\n Number of eigenvectors to use for the spectral embedding\n\neigen_solver : {None, 'arpack', 'lobpcg', or 'amg'}\n The eigenvalue decomposition strategy to use. AMG requires pyamg\n to be installed. 
It can be faster on very large, sparse problems,\n but may also lead to instabilities. If None, then ``'arpack'`` is\n used. See [4]_ for more details regarding `'lobpcg'`.\n\nrandom_state : int, RandomState instance, default=None\n A pseudo random number generator used for the initialization\n of the lobpcg eigenvectors decomposition when `eigen_solver ==\n 'amg'`, and for the K-Means initialization. Use an int to make\n the results deterministic across calls (See\n :term:`Glossary `).\n\n .. note::\n When using `eigen_solver == 'amg'`,\n it is necessary to also fix the global numpy seed with\n `np.random.seed(int)` to get deterministic results. See\n https://github.com/pyamg/pyamg/issues/139 for further\n information.\n\nn_init : int, default=10\n Number of time the k-means algorithm will be run with different\n centroid seeds. The final results will be the best output of n_init\n consecutive runs in terms of inertia. Only used if\n ``assign_labels='kmeans'``.\n\neigen_tol : float, default=0.0\n Stopping criterion for eigendecomposition of the Laplacian matrix\n when using arpack eigen_solver.\n\nassign_labels : {'kmeans', 'discretize'}, default='kmeans'\n The strategy to use to assign labels in the embedding\n space. There are two ways to assign labels after the Laplacian\n embedding. k-means can be applied and is a popular choice. But it can\n also be sensitive to initialization. Discretization is another\n approach which is less sensitive to random initialization [3]_.\n\nverbose : bool, default=False\n Verbosity mode.\n\n .. versionadded:: 0.24\n\nReturns\n-------\nlabels : array of integers, shape: n_samples\n The labels of the clusters.\n\nReferences\n----------\n\n.. [1] `Normalized cuts and image segmentation, 2000\n Jianbo Shi, Jitendra Malik\n `_\n\n.. [2] `A Tutorial on Spectral Clustering, 2007\n Ulrike von Luxburg\n `_\n\n.. [3] `Multiclass spectral clustering, 2003\n Stella X. Yu, Jianbo Shi\n `_\n\n.. [4] `Toward the Optimal Preconditioned Eigensolver:\n Locally Optimal Block Preconditioned Conjugate Gradient Method, 2001.\n A. V. Knyazev\n SIAM Journal on Scientific Computing 23, no. 2, pp. 517-541.\n `_\n\nNotes\n-----\nThe graph should contain only one connect component, elsewhere\nthe results make little sense.\n\nThis algorithm solves the normalized cut for k=2: it is a\nnormalized spectral clustering.", + "description": "Apply clustering to a projection of the normalized Laplacian.\n\nIn practice Spectral Clustering is very useful when the structure of\nthe individual clusters is highly non-convex or more generally when\na measure of the center and spread of the cluster is not a suitable\ndescription of the complete cluster. For instance, when clusters are\nnested circles on the 2D plane.\n\nIf affinity is the adjacency matrix of a graph, this method can be\nused to find normalized graph cuts [1]_, [2]_.\n\nRead more in the :ref:`User Guide `.", + "docstring": "Apply clustering to a projection of the normalized Laplacian.\n\n In practice Spectral Clustering is very useful when the structure of\n the individual clusters is highly non-convex or more generally when\n a measure of the center and spread of the cluster is not a suitable\n description of the complete cluster. 
For instance, when clusters are\n nested circles on the 2D plane.\n\n If affinity is the adjacency matrix of a graph, this method can be\n used to find normalized graph cuts [1]_, [2]_.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n affinity : {array-like, sparse matrix} of shape (n_samples, n_samples)\n The affinity matrix describing the relationship of the samples to\n embed. **Must be symmetric**.\n\n Possible examples:\n - adjacency matrix of a graph,\n - heat kernel of the pairwise distance matrix of the samples,\n - symmetric k-nearest neighbours connectivity matrix of the samples.\n\n n_clusters : int, default=None\n Number of clusters to extract.\n\n n_components : int, default=n_clusters\n Number of eigenvectors to use for the spectral embedding\n\n eigen_solver : {None, 'arpack', 'lobpcg', or 'amg'}\n The eigenvalue decomposition strategy to use. AMG requires pyamg\n to be installed. It can be faster on very large, sparse problems,\n but may also lead to instabilities. If None, then ``'arpack'`` is\n used. See [4]_ for more details regarding `'lobpcg'`.\n\n random_state : int, RandomState instance, default=None\n A pseudo random number generator used for the initialization\n of the lobpcg eigenvectors decomposition when `eigen_solver ==\n 'amg'`, and for the K-Means initialization. Use an int to make\n the results deterministic across calls (See\n :term:`Glossary `).\n\n .. note::\n When using `eigen_solver == 'amg'`,\n it is necessary to also fix the global numpy seed with\n `np.random.seed(int)` to get deterministic results. See\n https://github.com/pyamg/pyamg/issues/139 for further\n information.\n\n n_init : int, default=10\n Number of time the k-means algorithm will be run with different\n centroid seeds. The final results will be the best output of n_init\n consecutive runs in terms of inertia. Only used if\n ``assign_labels='kmeans'``.\n\n eigen_tol : float, default=0.0\n Stopping criterion for eigendecomposition of the Laplacian matrix\n when using arpack eigen_solver.\n\n assign_labels : {'kmeans', 'discretize'}, default='kmeans'\n The strategy to use to assign labels in the embedding\n space. There are two ways to assign labels after the Laplacian\n embedding. k-means can be applied and is a popular choice. But it can\n also be sensitive to initialization. Discretization is another\n approach which is less sensitive to random initialization [3]_.\n\n verbose : bool, default=False\n Verbosity mode.\n\n .. versionadded:: 0.24\n\n Returns\n -------\n labels : array of integers, shape: n_samples\n The labels of the clusters.\n\n References\n ----------\n\n .. [1] `Normalized cuts and image segmentation, 2000\n Jianbo Shi, Jitendra Malik\n `_\n\n .. [2] `A Tutorial on Spectral Clustering, 2007\n Ulrike von Luxburg\n `_\n\n .. [3] `Multiclass spectral clustering, 2003\n Stella X. Yu, Jianbo Shi\n `_\n\n .. [4] `Toward the Optimal Preconditioned Eigensolver:\n Locally Optimal Block Preconditioned Conjugate Gradient Method, 2001.\n A. V. Knyazev\n SIAM Journal on Scientific Computing 23, no. 2, pp. 
517-541.\n `_\n\n Notes\n -----\n The graph should contain only one connect component, elsewhere\n the results make little sense.\n\n This algorithm solves the normalized cut for k=2: it is a\n normalized spectral clustering.\n ", "source_code": "\ndef spectral_clustering(affinity, *, n_clusters=8, n_components=None, eigen_solver=None, random_state=None, n_init=10, eigen_tol=0.0, assign_labels='kmeans', verbose=False):\n \"\"\"Apply clustering to a projection of the normalized Laplacian.\n\n In practice Spectral Clustering is very useful when the structure of\n the individual clusters is highly non-convex or more generally when\n a measure of the center and spread of the cluster is not a suitable\n description of the complete cluster. For instance, when clusters are\n nested circles on the 2D plane.\n\n If affinity is the adjacency matrix of a graph, this method can be\n used to find normalized graph cuts [1]_, [2]_.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n affinity : {array-like, sparse matrix} of shape (n_samples, n_samples)\n The affinity matrix describing the relationship of the samples to\n embed. **Must be symmetric**.\n\n Possible examples:\n - adjacency matrix of a graph,\n - heat kernel of the pairwise distance matrix of the samples,\n - symmetric k-nearest neighbours connectivity matrix of the samples.\n\n n_clusters : int, default=None\n Number of clusters to extract.\n\n n_components : int, default=n_clusters\n Number of eigenvectors to use for the spectral embedding\n\n eigen_solver : {None, 'arpack', 'lobpcg', or 'amg'}\n The eigenvalue decomposition strategy to use. AMG requires pyamg\n to be installed. It can be faster on very large, sparse problems,\n but may also lead to instabilities. If None, then ``'arpack'`` is\n used. See [4]_ for more details regarding `'lobpcg'`.\n\n random_state : int, RandomState instance, default=None\n A pseudo random number generator used for the initialization\n of the lobpcg eigenvectors decomposition when `eigen_solver ==\n 'amg'`, and for the K-Means initialization. Use an int to make\n the results deterministic across calls (See\n :term:`Glossary `).\n\n .. note::\n When using `eigen_solver == 'amg'`,\n it is necessary to also fix the global numpy seed with\n `np.random.seed(int)` to get deterministic results. See\n https://github.com/pyamg/pyamg/issues/139 for further\n information.\n\n n_init : int, default=10\n Number of time the k-means algorithm will be run with different\n centroid seeds. The final results will be the best output of n_init\n consecutive runs in terms of inertia. Only used if\n ``assign_labels='kmeans'``.\n\n eigen_tol : float, default=0.0\n Stopping criterion for eigendecomposition of the Laplacian matrix\n when using arpack eigen_solver.\n\n assign_labels : {'kmeans', 'discretize'}, default='kmeans'\n The strategy to use to assign labels in the embedding\n space. There are two ways to assign labels after the Laplacian\n embedding. k-means can be applied and is a popular choice. But it can\n also be sensitive to initialization. Discretization is another\n approach which is less sensitive to random initialization [3]_.\n\n verbose : bool, default=False\n Verbosity mode.\n\n .. versionadded:: 0.24\n\n Returns\n -------\n labels : array of integers, shape: n_samples\n The labels of the clusters.\n\n References\n ----------\n\n .. [1] `Normalized cuts and image segmentation, 2000\n Jianbo Shi, Jitendra Malik\n `_\n\n .. 
[2] `A Tutorial on Spectral Clustering, 2007\n Ulrike von Luxburg\n `_\n\n .. [3] `Multiclass spectral clustering, 2003\n Stella X. Yu, Jianbo Shi\n `_\n\n .. [4] `Toward the Optimal Preconditioned Eigensolver:\n Locally Optimal Block Preconditioned Conjugate Gradient Method, 2001.\n A. V. Knyazev\n SIAM Journal on Scientific Computing 23, no. 2, pp. 517-541.\n `_\n\n Notes\n -----\n The graph should contain only one connect component, elsewhere\n the results make little sense.\n\n This algorithm solves the normalized cut for k=2: it is a\n normalized spectral clustering.\n \"\"\"\n if assign_labels not in ('kmeans', 'discretize'):\n raise ValueError(\"The 'assign_labels' parameter should be 'kmeans' or 'discretize', but '%s' was given\" % assign_labels)\n if isinstance(affinity, np.matrix):\n raise TypeError('spectral_clustering does not support passing in affinity as an np.matrix. Please convert to a numpy array with np.asarray. For more information see: https://numpy.org/doc/stable/reference/generated/numpy.matrix.html')\n random_state = check_random_state(random_state)\n n_components = n_clusters if n_components is None else n_components\n maps = spectral_embedding(affinity, n_components=n_components, eigen_solver=eigen_solver, random_state=random_state, eigen_tol=eigen_tol, drop_first=False)\n if verbose:\n print(f'Computing label assignment using {assign_labels}')\n if assign_labels == 'kmeans':\n (_, labels, _) = k_means(maps, n_clusters, random_state=random_state, n_init=n_init, verbose=verbose)\n else:\n labels = discretize(maps, random_state=random_state)\n return labels" }, { @@ -37195,7 +38311,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "top_path", @@ -37205,13 +38322,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef configuration(parent_package='', top_path=None):\n from numpy.distutils.misc_util import Configuration\n libraries = []\n if os.name == 'posix':\n libraries.append('m')\n config = Configuration('cluster', parent_package, top_path)\n config.add_extension('_dbscan_inner', sources=['_dbscan_inner.pyx'], include_dirs=[numpy.get_include()], language='c++')\n config.add_extension('_hierarchical_fast', sources=['_hierarchical_fast.pyx'], language='c++', include_dirs=[numpy.get_include()], libraries=libraries)\n config.add_extension('_k_means_common', sources=['_k_means_common.pyx'], include_dirs=[numpy.get_include()], libraries=libraries)\n config.add_extension('_k_means_lloyd', sources=['_k_means_lloyd.pyx'], include_dirs=[numpy.get_include()], libraries=libraries)\n config.add_extension('_k_means_elkan', sources=['_k_means_elkan.pyx'], include_dirs=[numpy.get_include()], libraries=libraries)\n config.add_extension('_k_means_minibatch', sources=['_k_means_minibatch.pyx'], include_dirs=[numpy.get_include()], libraries=libraries)\n config.add_subpackage('tests')\n return config" }, { @@ -37229,7 +38347,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "transformers", @@ -37239,6 +38358,10 @@ "docstring": { "type": "list of tuples", "description": "List of (name, transformer, columns) tuples specifying the\ntransformer objects to be applied to subsets of the data.\n\nname : str\n Like in Pipeline and FeatureUnion, this allows the transformer and\n its parameters to be set using ``set_params`` and searched in grid\n search.\ntransformer 
: {'drop', 'passthrough'} or estimator\n Estimator must support :term:`fit` and :term:`transform`.\n Special-cased strings 'drop' and 'passthrough' are accepted as\n well, to indicate to drop the columns or to pass them through\n untransformed, respectively.\ncolumns : str, array-like of str, int, array-like of int, array-like of bool, slice or callable\n Indexes the data on its second axis. Integers are interpreted as\n positional columns, while strings can reference DataFrame columns\n by name. A scalar string or int should be used where\n ``transformer`` expects X to be a 1d array-like (vector),\n otherwise a 2d array will be passed to the transformer.\n A callable is passed the input data `X` and can return any of the\n above. To select multiple columns by name or dtype, you can use\n :obj:`make_column_selector`." + }, + "refined_type": { + "kind": "EnumType", + "values": ["passthrough", "drop"] } }, { @@ -37249,6 +38372,10 @@ "docstring": { "type": "{'drop', 'passthrough'} or estimator, default='drop'", "description": "By default, only the specified columns in `transformers` are\ntransformed and combined in the output, and the non-specified\ncolumns are dropped. (default of ``'drop'``).\nBy specifying ``remainder='passthrough'``, all remaining columns that\nwere not specified in `transformers` will be automatically passed\nthrough. This subset of columns is concatenated with the output of\nthe transformers.\nBy setting ``remainder`` to be an estimator, the remaining\nnon-specified columns will use the ``remainder`` estimator. The\nestimator must support :term:`fit` and :term:`transform`.\nNote that using this feature requires that the DataFrame columns\ninput at :term:`fit` and :term:`transform` have identical order." + }, + "refined_type": { + "kind": "EnumType", + "values": ["passthrough", "drop"] } }, { @@ -37259,7 +38386,8 @@ "docstring": { "type": "float, default=0.3", "description": "If the output of the different transformers contains sparse matrices,\nthese will be stacked as a sparse matrix if the overall density is\nlower than this value. Use ``sparse_threshold=0`` to always return\ndense. When the transformed output consists of all dense data, the\nstacked result will be dense, and this keyword will be ignored." - } + }, + "refined_type": {} }, { "name": "n_jobs", @@ -37269,7 +38397,8 @@ "docstring": { "type": "int, default=None", "description": "Number of jobs to run in parallel.\n``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n``-1`` means using all processors. See :term:`Glossary `\nfor more details." - } + }, + "refined_type": {} }, { "name": "transformer_weights", @@ -37279,7 +38408,8 @@ "docstring": { "type": "dict, default=None", "description": "Multiplicative weights for features per transformer. The output of the\ntransformer is multiplied by these weights. Keys are transformer names,\nvalues the weights." - } + }, + "refined_type": {} }, { "name": "verbose", @@ -37289,7 +38419,8 @@ "docstring": { "type": "bool, default=False", "description": "If True, the time elapsed while fitting each transformer will be\nprinted as it is completed." 
- } + }, + "refined_type": {} }, { "name": "verbose_feature_names_out", @@ -37299,13 +38430,14 @@ "docstring": { "type": "bool, default=True", "description": "If True, :meth:`get_feature_names_out` will prefix all feature names\nwith the name of the transformer that generated that feature.\nIf False, :meth:`get_feature_names_out` will not prefix any feature\nnames and will error if feature names are not unique.\n\n.. versionadded:: 1.0" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, transformers, *, remainder='drop', sparse_threshold=0.3, n_jobs=None, transformer_weights=None, verbose=False, verbose_feature_names_out=True):\n self.transformers = transformers\n self.remainder = remainder\n self.sparse_threshold = sparse_threshold\n self.n_jobs = n_jobs\n self.transformer_weights = transformer_weights\n self.verbose = verbose\n self.verbose_feature_names_out = verbose_feature_names_out" }, { @@ -37323,7 +38455,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -37333,7 +38466,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -37343,7 +38477,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "func", @@ -37353,7 +38488,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "fitted", @@ -37363,7 +38499,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "column_as_strings", @@ -37373,13 +38510,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Private function to fit and/or transform on demand.\n\nReturn value (transformers and/or transformed X data) depends on the passed function. 
``fitted=True`` ensures the fitted transformers are used.", - "docstring": "Private function to fit and/or transform on demand.\n\nReturn value (transformers and/or transformed X data) depends\non the passed function.\n``fitted=True`` ensures the fitted transformers are used.", + "description": "Private function to fit and/or transform on demand.\n\nReturn value (transformers and/or transformed X data) depends\non the passed function.\n``fitted=True`` ensures the fitted transformers are used.", + "docstring": "\n Private function to fit and/or transform on demand.\n\n Return value (transformers and/or transformed X data) depends\n on the passed function.\n ``fitted=True`` ensures the fitted transformers are used.\n ", "source_code": "\ndef _fit_transform(self, X, y, func, fitted=False, column_as_strings=False):\n \"\"\"\n Private function to fit and/or transform on demand.\n\n Return value (transformers and/or transformed X data) depends\n on the passed function.\n ``fitted=True`` ensures the fitted transformers are used.\n \"\"\"\n transformers = list(self._iter(fitted=fitted, replace_strings=True, column_as_strings=column_as_strings))\n try:\n return Parallel(n_jobs=self.n_jobs)((delayed(func)(transformer=clone(trans) if not fitted else trans, X=_safe_indexing(X, column, axis=1), y=y, weight=weight, message_clsname='ColumnTransformer', message=self._log_message(name, idx, len(transformers))) for (idx, (name, trans, column, weight)) in enumerate(transformers, 1)))\n except ValueError as e:\n if 'Expected 2D array, got 1D array instead' in str(e):\n raise ValueError(_ERR_MSG_1DCOLUMN) from e\n else:\n raise" }, { @@ -37397,7 +38535,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "name", @@ -37407,7 +38546,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "trans", @@ -37417,7 +38557,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "column", @@ -37427,7 +38568,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "feature_names_in", @@ -37437,13 +38579,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Gets feature names of transformer.\n\nUsed in conjunction with self._iter(fitted=True) in get_feature_names_out.", - "docstring": "Gets feature names of transformer.\n\nUsed in conjunction with self._iter(fitted=True) in get_feature_names_out.", + "docstring": "Gets feature names of transformer.\n\n Used in conjunction with self._iter(fitted=True) in get_feature_names_out.\n ", "source_code": "\ndef _get_feature_name_out_for_transformer(self, name, trans, column, feature_names_in):\n \"\"\"Gets feature names of transformer.\n\n Used in conjunction with self._iter(fitted=True) in get_feature_names_out.\n \"\"\"\n if trans == 'drop' or _is_empty_column_selection(column):\n return\n elif trans == 'passthrough':\n if not isinstance(column, slice) and all((isinstance(col, str) for col in column)):\n return column\n else:\n return feature_names_in[column]\n if not hasattr(trans, 'get_feature_names_out'):\n raise AttributeError(f'Transformer {name} (type {type(trans).__name__}) does not provide get_feature_names_out.')\n if isinstance(column, Iterable) and not all((isinstance(col, str) for col in column)):\n column = _safe_indexing(feature_names_in, column)\n return trans.get_feature_names_out(column)" }, { @@ -37461,7 +38604,8 @@ "docstring": { 
"type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "Xs", @@ -37471,13 +38615,17 @@ "docstring": { "type": "list of {array-like, sparse matrix, dataframe}", "description": "" + }, + "refined_type": { + "kind": "EnumType", + "values": [] } } ], "results": [], "is_public": false, - "description": "Stacks Xs horizontally.\n\nThis allows subclasses to control the stacking behavior, while reusing everything else from ColumnTransformer.", - "docstring": "Stacks Xs horizontally.\n\nThis allows subclasses to control the stacking behavior, while reusing\neverything else from ColumnTransformer.\n\nParameters\n----------\nXs : list of {array-like, sparse matrix, dataframe}", + "description": "Stacks Xs horizontally.\n\nThis allows subclasses to control the stacking behavior, while reusing\neverything else from ColumnTransformer.", + "docstring": "Stacks Xs horizontally.\n\n This allows subclasses to control the stacking behavior, while reusing\n everything else from ColumnTransformer.\n\n Parameters\n ----------\n Xs : list of {array-like, sparse matrix, dataframe}\n ", "source_code": "\ndef _hstack(self, Xs):\n \"\"\"Stacks Xs horizontally.\n\n This allows subclasses to control the stacking behavior, while reusing\n everything else from ColumnTransformer.\n\n Parameters\n ----------\n Xs : list of {array-like, sparse matrix, dataframe}\n \"\"\"\n if self.sparse_output_:\n try:\n converted_Xs = [check_array(X, accept_sparse=True, force_all_finite=False) for X in Xs]\n except ValueError as e:\n raise ValueError('For a sparse output, all columns should be a numeric or convertible to a numeric.') from e\n return sparse.hstack(converted_Xs).tocsr()\n else:\n Xs = [f.toarray() if sparse.issparse(f) else f for f in Xs]\n return np.hstack(Xs)" }, { @@ -37495,7 +38643,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "fitted", @@ -37505,7 +38654,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "replace_strings", @@ -37515,7 +38665,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "column_as_strings", @@ -37525,13 +38676,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Generate (name, trans, column, weight) tuples.\n\nIf fitted=True, use the fitted transformers, else use the user specified transformers updated with converted column names and potentially appended with transformer for remainder.", - "docstring": "Generate (name, trans, column, weight) tuples.\n\nIf fitted=True, use the fitted transformers, else use the\nuser specified transformers updated with converted column names\nand potentially appended with transformer for remainder.", + "description": "Generate (name, trans, column, weight) tuples.\n\nIf fitted=True, use the fitted transformers, else use the\nuser specified transformers updated with converted column names\nand potentially appended with transformer for remainder.", + "docstring": "\n Generate (name, trans, column, weight) tuples.\n\n If fitted=True, use the fitted transformers, else use the\n user specified transformers updated with converted column names\n and potentially appended with transformer for remainder.\n\n ", "source_code": "\ndef _iter(self, fitted=False, replace_strings=False, column_as_strings=False):\n \"\"\"\n Generate (name, trans, column, weight) tuples.\n\n If fitted=True, use the fitted transformers, else use the\n user specified 
transformers updated with converted column names\n and potentially appended with transformer for remainder.\n\n \"\"\"\n if fitted:\n transformers = self.transformers_\n else:\n transformers = [(name, trans, column) for ((name, trans, _), column) in zip(self.transformers, self._columns)]\n if self._remainder[2]:\n transformers = chain(transformers, [self._remainder])\n get_weight = (self.transformer_weights or {}).get\n for (name, trans, columns) in transformers:\n if replace_strings:\n if trans == 'passthrough':\n trans = FunctionTransformer(accept_sparse=True, check_inverse=False)\n elif trans == 'drop':\n continue\n elif _is_empty_column_selection(columns):\n continue\n if column_as_strings:\n columns_is_scalar = np.isscalar(columns)\n indices = self._transformer_to_input_indices[name]\n columns = self.feature_names_in_[indices]\n if columns_is_scalar:\n columns = columns[0]\n yield (name, trans, columns, get_weight(name))" }, { @@ -37549,7 +38701,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "name", @@ -37559,7 +38712,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "idx", @@ -37569,7 +38723,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "total", @@ -37579,13 +38734,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _log_message(self, name, idx, total):\n if not self.verbose:\n return None\n return '(%d of %d) Processing %s' % (idx, total, name)" }, { @@ -37603,7 +38759,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "Xs", @@ -37613,13 +38770,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Record which transformer produced which column.", - "docstring": "Record which transformer produced which column.", + "docstring": "\n Record which transformer produced which column.\n ", "source_code": "\ndef _record_output_indices(self, Xs):\n \"\"\"\n Record which transformer produced which column.\n \"\"\"\n idx = 0\n self.output_indices_ = {}\n for (transformer_idx, (name, _, _, _)) in enumerate(self._iter(fitted=True, replace_strings=True)):\n n_columns = Xs[transformer_idx].shape[1]\n self.output_indices_[name] = slice(idx, idx + n_columns)\n idx += n_columns\n all_names = [t[0] for t in self.transformers] + ['remainder']\n for name in all_names:\n if name not in self.output_indices_:\n self.output_indices_[name] = slice(0, 0)" }, { @@ -37637,13 +38795,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _sk_visual_block_(self):\n if isinstance(self.remainder, str) and self.remainder == 'drop':\n transformers = self.transformers\n elif hasattr(self, '_remainder'):\n remainder_columns = self._remainder[2]\n if hasattr(self, 'feature_names_in_') and remainder_columns and not all((isinstance(col, str) for col in remainder_columns)):\n remainder_columns = self.feature_names_in_[remainder_columns].tolist()\n transformers = chain(self.transformers, [('remainder', self.remainder, remainder_columns)])\n else:\n transformers = chain(self.transformers, [('remainder', self.remainder, '')])\n (names, transformers, name_details) = 
zip(*transformers)\n return _VisualBlock('parallel', transformers, names=names, name_details=name_details)" }, { @@ -37661,13 +38820,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Internal list of transformer only containing the name and transformers, dropping the columns. This is for the implementation of get_params via BaseComposition._get_params which expects lists of tuples of len 2.", - "docstring": "Internal list of transformer only containing the name and\ntransformers, dropping the columns. This is for the implementation\nof get_params via BaseComposition._get_params which expects lists\nof tuples of len 2.", + "description": "Internal list of transformer only containing the name and\ntransformers, dropping the columns. This is for the implementation\nof get_params via BaseComposition._get_params which expects lists\nof tuples of len 2.", + "docstring": "\n Internal list of transformer only containing the name and\n transformers, dropping the columns. This is for the implementation\n of get_params via BaseComposition._get_params which expects lists\n of tuples of len 2.\n ", "source_code": "\n@property\ndef _transformers(self):\n \"\"\"\n Internal list of transformer only containing the name and\n transformers, dropping the columns. This is for the implementation\n of get_params via BaseComposition._get_params which expects lists\n of tuples of len 2.\n \"\"\"\n return [(name, trans) for (name, trans, _) in self.transformers]" }, { @@ -37685,7 +38845,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "value", @@ -37695,13 +38856,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@_transformers.setter\ndef _transformers(self, value):\n self.transformers = [(name, trans, col) for ((name, trans), (_, _, col)) in zip(value, self.transformers)]" }, { @@ -37719,7 +38881,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "transformers", @@ -37729,13 +38892,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _update_fitted_transformers(self, transformers):\n fitted_transformers = iter(transformers)\n transformers_ = []\n for (name, old, column, _) in self._iter():\n if old == 'drop':\n trans = 'drop'\n elif old == 'passthrough':\n next(fitted_transformers)\n trans = 'passthrough'\n elif _is_empty_column_selection(column):\n trans = old\n else:\n trans = next(fitted_transformers)\n transformers_.append((name, trans, column))\n assert not list(fitted_transformers)\n self.transformers_ = transformers_" }, { @@ -37753,7 +38917,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -37763,13 +38928,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Converts callable column specifications.", - "docstring": "Converts callable column specifications.", + "docstring": "\n Converts callable column specifications.\n ", "source_code": "\ndef _validate_column_callables(self, X):\n \"\"\"\n Converts callable column specifications.\n \"\"\"\n all_columns = []\n transformer_to_input_indices = {}\n for (name, _, 
columns) in self.transformers:\n if callable(columns):\n columns = columns(X)\n all_columns.append(columns)\n transformer_to_input_indices[name] = _get_column_indices(X, columns)\n self._columns = all_columns\n self._transformer_to_input_indices = transformer_to_input_indices" }, { @@ -37787,7 +38953,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "result", @@ -37797,13 +38964,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Ensure that the output of each transformer is 2D. Otherwise hstack can raise an error or produce incorrect results.", - "docstring": "Ensure that the output of each transformer is 2D. Otherwise\nhstack can raise an error or produce incorrect results.", + "description": "Ensure that the output of each transformer is 2D. Otherwise\nhstack can raise an error or produce incorrect results.", + "docstring": "\n Ensure that the output of each transformer is 2D. Otherwise\n hstack can raise an error or produce incorrect results.\n ", "source_code": "\ndef _validate_output(self, result):\n \"\"\"\n Ensure that the output of each transformer is 2D. Otherwise\n hstack can raise an error or produce incorrect results.\n \"\"\"\n names = [name for (name, _, _, _) in self._iter(fitted=True, replace_strings=True)]\n for (Xs, name) in zip(result, names):\n if not getattr(Xs, 'ndim', 0) == 2:\n raise ValueError(\"The output of the '{0}' transformer should be 2D (scipy matrix, array, or pandas DataFrame).\".format(name))" }, { @@ -37821,7 +38989,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -37831,13 +39000,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Validates ``remainder`` and defines ``_remainder`` targeting the remaining columns.", - "docstring": "Validates ``remainder`` and defines ``_remainder`` targeting\nthe remaining columns.", + "description": "Validates ``remainder`` and defines ``_remainder`` targeting\nthe remaining columns.", + "docstring": "\n Validates ``remainder`` and defines ``_remainder`` targeting\n the remaining columns.\n ", "source_code": "\ndef _validate_remainder(self, X):\n \"\"\"\n Validates ``remainder`` and defines ``_remainder`` targeting\n the remaining columns.\n \"\"\"\n is_transformer = (hasattr(self.remainder, 'fit') or hasattr(self.remainder, 'fit_transform')) and hasattr(self.remainder, 'transform')\n if self.remainder not in ('drop', 'passthrough') and not is_transformer:\n raise ValueError(\"The remainder keyword needs to be one of 'drop', 'passthrough', or estimator. 
'%s' was passed instead\" % self.remainder)\n self._n_features = X.shape[1]\n cols = set(chain(*self._transformer_to_input_indices.values()))\n remaining = sorted(set(range(self._n_features)) - cols)\n self._remainder = ('remainder', self.remainder, remaining)\n self._transformer_to_input_indices['remainder'] = remaining" }, { @@ -37855,13 +39025,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _validate_transformers(self):\n if not self.transformers:\n return\n (names, transformers, _) = zip(*self.transformers)\n self._validate_names(names)\n for t in transformers:\n if t in ('drop', 'passthrough'):\n continue\n if not (hasattr(t, 'fit') or hasattr(t, 'fit_transform')) or not hasattr(t, 'transform'):\n raise TypeError(\"All estimators should implement fit and transform, or can be 'drop' or 'passthrough' specifiers. '%s' (type %s) doesn't.\" % (t, type(t)))" }, { @@ -37879,7 +39050,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -37889,6 +39061,10 @@ "docstring": { "type": "{array-like, dataframe} of shape (n_samples, n_features)", "description": "Input data, of which specified subsets are used to fit the\ntransformers." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -37899,13 +39075,14 @@ "docstring": { "type": "array-like of shape (n_samples,...), default=None", "description": "Targets for supervised learning." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Fit all transformers using X.", - "docstring": "Fit all transformers using X.\n\nParameters\n----------\nX : {array-like, dataframe} of shape (n_samples, n_features)\n Input data, of which specified subsets are used to fit the\n transformers.\n\ny : array-like of shape (n_samples,...), default=None\n Targets for supervised learning.\n\nReturns\n-------\nself : ColumnTransformer\n This estimator.", + "docstring": "Fit all transformers using X.\n\n Parameters\n ----------\n X : {array-like, dataframe} of shape (n_samples, n_features)\n Input data, of which specified subsets are used to fit the\n transformers.\n\n y : array-like of shape (n_samples,...), default=None\n Targets for supervised learning.\n\n Returns\n -------\n self : ColumnTransformer\n This estimator.\n ", "source_code": "\ndef fit(self, X, y=None):\n \"\"\"Fit all transformers using X.\n\n Parameters\n ----------\n X : {array-like, dataframe} of shape (n_samples, n_features)\n Input data, of which specified subsets are used to fit the\n transformers.\n\n y : array-like of shape (n_samples,...), default=None\n Targets for supervised learning.\n\n Returns\n -------\n self : ColumnTransformer\n This estimator.\n \"\"\"\n self.fit_transform(X, y=y)\n return self" }, { @@ -37923,7 +39100,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -37933,6 +39111,10 @@ "docstring": { "type": "{array-like, dataframe} of shape (n_samples, n_features)", "description": "Input data, of which specified subsets are used to fit the\ntransformers." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -37943,13 +39125,14 @@ "docstring": { "type": "array-like of shape (n_samples,), default=None", "description": "Targets for supervised learning." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Fit all transformers, transform the data and concatenate results.", - "docstring": "Fit all transformers, transform the data and concatenate results.\n\nParameters\n----------\nX : {array-like, dataframe} of shape (n_samples, n_features)\n Input data, of which specified subsets are used to fit the\n transformers.\n\ny : array-like of shape (n_samples,), default=None\n Targets for supervised learning.\n\nReturns\n-------\nX_t : {array-like, sparse matrix} of shape (n_samples, sum_n_components)\n Horizontally stacked results of transformers. sum_n_components is the\n sum of n_components (output dimension) over transformers. If\n any result is a sparse matrix, everything will be converted to\n sparse matrices.", + "docstring": "Fit all transformers, transform the data and concatenate results.\n\n Parameters\n ----------\n X : {array-like, dataframe} of shape (n_samples, n_features)\n Input data, of which specified subsets are used to fit the\n transformers.\n\n y : array-like of shape (n_samples,), default=None\n Targets for supervised learning.\n\n Returns\n -------\n X_t : {array-like, sparse matrix} of shape (n_samples, sum_n_components)\n Horizontally stacked results of transformers. sum_n_components is the\n sum of n_components (output dimension) over transformers. If\n any result is a sparse matrix, everything will be converted to\n sparse matrices.\n ", "source_code": "\ndef fit_transform(self, X, y=None):\n \"\"\"Fit all transformers, transform the data and concatenate results.\n\n Parameters\n ----------\n X : {array-like, dataframe} of shape (n_samples, n_features)\n Input data, of which specified subsets are used to fit the\n transformers.\n\n y : array-like of shape (n_samples,), default=None\n Targets for supervised learning.\n\n Returns\n -------\n X_t : {array-like, sparse matrix} of shape (n_samples, sum_n_components)\n Horizontally stacked results of transformers. sum_n_components is the\n sum of n_components (output dimension) over transformers. 
If\n any result is a sparse matrix, everything will be converted to\n sparse matrices.\n \"\"\"\n self._check_feature_names(X, reset=True)\n X = _check_X(X)\n self._check_n_features(X, reset=True)\n self._validate_transformers()\n self._validate_column_callables(X)\n self._validate_remainder(X)\n result = self._fit_transform(X, y, _fit_transform_one)\n if not result:\n self._update_fitted_transformers([])\n return np.zeros((X.shape[0], 0))\n (Xs, transformers) = zip(*result)\n if any((sparse.issparse(X) for X in Xs)):\n nnz = sum((X.nnz if sparse.issparse(X) else X.size for X in Xs))\n total = sum((X.shape[0] * X.shape[1] if sparse.issparse(X) else X.size for X in Xs))\n density = nnz / total\n self.sparse_output_ = density < self.sparse_threshold\n else:\n self.sparse_output_ = False\n self._update_fitted_transformers(transformers)\n self._validate_output(Xs)\n self._record_output_indices(Xs)\n return self._hstack(list(Xs))" }, { @@ -37969,13 +39152,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Get feature names from all transformers.", - "docstring": "Get feature names from all transformers.\n\nReturns\n-------\nfeature_names : list of strings\n Names of the features produced by transform.", + "docstring": "Get feature names from all transformers.\n\n Returns\n -------\n feature_names : list of strings\n Names of the features produced by transform.\n ", "source_code": "\n@deprecated('get_feature_names is deprecated in 1.0 and will be removed in 1.2. Please use get_feature_names_out instead.')\ndef get_feature_names(self):\n \"\"\"Get feature names from all transformers.\n\n Returns\n -------\n feature_names : list of strings\n Names of the features produced by transform.\n \"\"\"\n check_is_fitted(self)\n feature_names = []\n for (name, trans, column, _) in self._iter(fitted=True):\n if trans == 'drop' or _is_empty_column_selection(column):\n continue\n if trans == 'passthrough':\n if hasattr(self, 'feature_names_in_'):\n if not isinstance(column, slice) and all((isinstance(col, str) for col in column)):\n feature_names.extend(column)\n else:\n feature_names.extend(self.feature_names_in_[column])\n else:\n indices = np.arange(self._n_features)\n feature_names.extend(['x%d' % i for i in indices[column]])\n continue\n if not hasattr(trans, 'get_feature_names'):\n raise AttributeError('Transformer %s (type %s) does not provide get_feature_names.' % (str(name), type(trans).__name__))\n feature_names.extend([f'{name}__{f}' for f in trans.get_feature_names()])\n return feature_names" }, { @@ -37993,7 +39177,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "input_features", @@ -38003,13 +39188,14 @@ "docstring": { "type": "array-like of str or None, default=None", "description": "Input features.\n\n- If `input_features` is `None`, then `feature_names_in_` is\n used as feature names in. If `feature_names_in_` is not defined,\n then names are generated: `[x0, x1, ..., x(n_features_in_)]`.\n- If `input_features` is an array-like, then `input_features` must\n match `feature_names_in_` if `feature_names_in_` is defined." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Get output feature names for transformation.", - "docstring": "Get output feature names for transformation.\n\nParameters\n----------\ninput_features : array-like of str or None, default=None\n Input features.\n\n - If `input_features` is `None`, then `feature_names_in_` is\n used as feature names in. If `feature_names_in_` is not defined,\n then names are generated: `[x0, x1, ..., x(n_features_in_)]`.\n - If `input_features` is an array-like, then `input_features` must\n match `feature_names_in_` if `feature_names_in_` is defined.\n\nReturns\n-------\nfeature_names_out : ndarray of str objects\n Transformed feature names.", + "docstring": "Get output feature names for transformation.\n\n Parameters\n ----------\n input_features : array-like of str or None, default=None\n Input features.\n\n - If `input_features` is `None`, then `feature_names_in_` is\n used as feature names in. If `feature_names_in_` is not defined,\n then names are generated: `[x0, x1, ..., x(n_features_in_)]`.\n - If `input_features` is an array-like, then `input_features` must\n match `feature_names_in_` if `feature_names_in_` is defined.\n\n Returns\n -------\n feature_names_out : ndarray of str objects\n Transformed feature names.\n ", "source_code": "\ndef get_feature_names_out(self, input_features=None):\n \"\"\"Get output feature names for transformation.\n\n Parameters\n ----------\n input_features : array-like of str or None, default=None\n Input features.\n\n - If `input_features` is `None`, then `feature_names_in_` is\n used as feature names in. If `feature_names_in_` is not defined,\n then names are generated: `[x0, x1, ..., x(n_features_in_)]`.\n - If `input_features` is an array-like, then `input_features` must\n match `feature_names_in_` if `feature_names_in_` is defined.\n\n Returns\n -------\n feature_names_out : ndarray of str objects\n Transformed feature names.\n \"\"\"\n check_is_fitted(self)\n input_features = _check_feature_names_in(self, input_features)\n transformer_with_feature_names_out = []\n for (name, trans, column, _) in self._iter(fitted=True):\n feature_names_out = self._get_feature_name_out_for_transformer(name, trans, column, input_features)\n if feature_names_out is None:\n continue\n transformer_with_feature_names_out.append((name, feature_names_out))\n if not transformer_with_feature_names_out:\n return np.array([], dtype=object)\n if self.verbose_feature_names_out:\n names = list(chain.from_iterable(((f'{name}__{i}' for i in feature_names_out) for (name, feature_names_out) in transformer_with_feature_names_out)))\n return np.asarray(names, dtype=object)\n feature_names_count = Counter(chain.from_iterable((s for (_, s) in transformer_with_feature_names_out)))\n top_6_overlap = [name for (name, count) in feature_names_count.most_common(6) if count > 1]\n top_6_overlap.sort()\n if top_6_overlap:\n if len(top_6_overlap) == 6:\n names_repr = str(top_6_overlap[:5])[:-1] + ', ...]'\n else:\n names_repr = str(top_6_overlap)\n raise ValueError(f'Output feature names: {names_repr} are not unique. 
Please set verbose_feature_names_out=True to add prefixes to feature names')\n return np.concatenate([name for (_, name) in transformer_with_feature_names_out])" }, { @@ -38027,7 +39213,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "deep", @@ -38037,13 +39224,14 @@ "docstring": { "type": "bool, default=True", "description": "If True, will return the parameters for this estimator and\ncontained subobjects that are estimators." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Get parameters for this estimator.\n\nReturns the parameters given in the constructor as well as the estimators contained within the `transformers` of the `ColumnTransformer`.", - "docstring": "Get parameters for this estimator.\n\nReturns the parameters given in the constructor as well as the\nestimators contained within the `transformers` of the\n`ColumnTransformer`.\n\nParameters\n----------\ndeep : bool, default=True\n If True, will return the parameters for this estimator and\n contained subobjects that are estimators.\n\nReturns\n-------\nparams : dict\n Parameter names mapped to their values.", + "description": "Get parameters for this estimator.\n\nReturns the parameters given in the constructor as well as the\nestimators contained within the `transformers` of the\n`ColumnTransformer`.", + "docstring": "Get parameters for this estimator.\n\n Returns the parameters given in the constructor as well as the\n estimators contained within the `transformers` of the\n `ColumnTransformer`.\n\n Parameters\n ----------\n deep : bool, default=True\n If True, will return the parameters for this estimator and\n contained subobjects that are estimators.\n\n Returns\n -------\n params : dict\n Parameter names mapped to their values.\n ", "source_code": "\ndef get_params(self, deep=True):\n \"\"\"Get parameters for this estimator.\n\n Returns the parameters given in the constructor as well as the\n estimators contained within the `transformers` of the\n `ColumnTransformer`.\n\n Parameters\n ----------\n deep : bool, default=True\n If True, will return the parameters for this estimator and\n contained subobjects that are estimators.\n\n Returns\n -------\n params : dict\n Parameter names mapped to their values.\n \"\"\"\n return self._get_params('_transformers', deep=deep)" }, { @@ -38061,13 +39249,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Access the fitted transformer by name.\n\nRead-only attribute to access any transformer by given name. 
Keys are transformer names and values are the fitted transformer objects.", - "docstring": "Access the fitted transformer by name.\n\nRead-only attribute to access any transformer by given name.\nKeys are transformer names and values are the fitted transformer\nobjects.", + "description": "Access the fitted transformer by name.\n\nRead-only attribute to access any transformer by given name.\nKeys are transformer names and values are the fitted transformer\nobjects.", + "docstring": "Access the fitted transformer by name.\n\n Read-only attribute to access any transformer by given name.\n Keys are transformer names and values are the fitted transformer\n objects.\n ", "source_code": "\n@property\ndef named_transformers_(self):\n \"\"\"Access the fitted transformer by name.\n\n Read-only attribute to access any transformer by given name.\n Keys are transformer names and values are the fitted transformer\n objects.\n \"\"\"\n return Bunch(**{name: trans for (name, trans, _) in self.transformers_})" }, { @@ -38085,13 +39274,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Set the parameters of this estimator.\n\nValid parameter keys can be listed with ``get_params()``. Note that you can directly set the parameters of the estimators contained in `transformers` of `ColumnTransformer`.", - "docstring": "Set the parameters of this estimator.\n\nValid parameter keys can be listed with ``get_params()``. Note that you\ncan directly set the parameters of the estimators contained in\n`transformers` of `ColumnTransformer`.\n\nParameters\n----------\n**kwargs : dict\n Estimator parameters.\n\nReturns\n-------\nself : ColumnTransformer\n This estimator.", + "description": "Set the parameters of this estimator.\n\nValid parameter keys can be listed with ``get_params()``. Note that you\ncan directly set the parameters of the estimators contained in\n`transformers` of `ColumnTransformer`.", + "docstring": "Set the parameters of this estimator.\n\n Valid parameter keys can be listed with ``get_params()``. Note that you\n can directly set the parameters of the estimators contained in\n `transformers` of `ColumnTransformer`.\n\n Parameters\n ----------\n **kwargs : dict\n Estimator parameters.\n\n Returns\n -------\n self : ColumnTransformer\n This estimator.\n ", "source_code": "\ndef set_params(self, **kwargs):\n \"\"\"Set the parameters of this estimator.\n\n Valid parameter keys can be listed with ``get_params()``. Note that you\n can directly set the parameters of the estimators contained in\n `transformers` of `ColumnTransformer`.\n\n Parameters\n ----------\n **kwargs : dict\n Estimator parameters.\n\n Returns\n -------\n self : ColumnTransformer\n This estimator.\n \"\"\"\n self._set_params('_transformers', **kwargs)\n return self" }, { @@ -38109,7 +39299,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -38119,13 +39310,17 @@ "docstring": { "type": "{array-like, dataframe} of shape (n_samples, n_features)", "description": "The data to be transformed by subset." 
+ }, + "refined_type": { + "kind": "EnumType", + "values": [] } } ], "results": [], "is_public": true, "description": "Transform X separately by each transformer, concatenate results.", - "docstring": "Transform X separately by each transformer, concatenate results.\n\nParameters\n----------\nX : {array-like, dataframe} of shape (n_samples, n_features)\n The data to be transformed by subset.\n\nReturns\n-------\nX_t : {array-like, sparse matrix} of shape (n_samples, sum_n_components)\n Horizontally stacked results of transformers. sum_n_components is the\n sum of n_components (output dimension) over transformers. If\n any result is a sparse matrix, everything will be converted to\n sparse matrices.", + "docstring": "Transform X separately by each transformer, concatenate results.\n\n Parameters\n ----------\n X : {array-like, dataframe} of shape (n_samples, n_features)\n The data to be transformed by subset.\n\n Returns\n -------\n X_t : {array-like, sparse matrix} of shape (n_samples, sum_n_components)\n Horizontally stacked results of transformers. sum_n_components is the\n sum of n_components (output dimension) over transformers. If\n any result is a sparse matrix, everything will be converted to\n sparse matrices.\n ", "source_code": "\ndef transform(self, X):\n \"\"\"Transform X separately by each transformer, concatenate results.\n\n Parameters\n ----------\n X : {array-like, dataframe} of shape (n_samples, n_features)\n The data to be transformed by subset.\n\n Returns\n -------\n X_t : {array-like, sparse matrix} of shape (n_samples, sum_n_components)\n Horizontally stacked results of transformers. sum_n_components is the\n sum of n_components (output dimension) over transformers. If\n any result is a sparse matrix, everything will be converted to\n sparse matrices.\n \"\"\"\n check_is_fitted(self)\n X = _check_X(X)\n fit_dataframe_and_transform_dataframe = hasattr(self, 'feature_names_in_') and hasattr(X, 'columns')\n if fit_dataframe_and_transform_dataframe:\n named_transformers = self.named_transformers_\n non_dropped_indices = [ind for (name, ind) in self._transformer_to_input_indices.items() if name in named_transformers and isinstance(named_transformers[name], str) and named_transformers[name] != 'drop']\n all_indices = set(chain(*non_dropped_indices))\n all_names = set((self.feature_names_in_[ind] for ind in all_indices))\n diff = all_names - set(X.columns)\n if diff:\n raise ValueError(f'columns are missing: {diff}')\n else:\n self._check_n_features(X, reset=False)\n Xs = self._fit_transform(X, None, _transform_one, fitted=True, column_as_strings=fit_dataframe_and_transform_dataframe)\n self._validate_output(Xs)\n if not Xs:\n return np.zeros((X.shape[0], 0))\n return self._hstack(list(Xs))" }, { @@ -38143,7 +39338,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -38167,13 +39363,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Construct (name, trans, column) tuples from list", - "docstring": "Construct (name, trans, column) tuples from list", + "docstring": "\n Construct (name, trans, column) tuples from list\n\n ", "source_code": "\ndef _get_transformer_list(estimators):\n \"\"\"\n Construct (name, trans, column) tuples from list\n\n \"\"\"\n (transformers, columns) = zip(*estimators)\n (names, _) = zip(*_name_estimators(transformers))\n transformer_list = list(zip(names, transformers, columns))\n return transformer_list" }, { @@ 
-38191,13 +39388,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Return True if the column selection is empty (empty list or all-False boolean array).", - "docstring": "Return True if the column selection is empty (empty list or all-False\nboolean array).", + "description": "Return True if the column selection is empty (empty list or all-False\nboolean array).", + "docstring": "\n Return True if the column selection is empty (empty list or all-False\n boolean array).\n\n ", "source_code": "\ndef _is_empty_column_selection(column):\n \"\"\"\n Return True if the column selection is empty (empty list or all-False\n boolean array).\n\n \"\"\"\n if hasattr(column, 'dtype') and np.issubdtype(column.dtype, np.bool_):\n return not column.any()\n elif hasattr(column, '__len__'):\n return len(column) == 0 or all((isinstance(col, bool) for col in column)) and not any(column)\n else:\n return False" }, { @@ -38215,7 +39413,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "df", @@ -38225,13 +39424,14 @@ "docstring": { "type": "dataframe of shape (n_features, n_samples)", "description": "DataFrame to select columns from." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Callable for column selection to be used by a :class:`ColumnTransformer`.", - "docstring": "Callable for column selection to be used by a\n:class:`ColumnTransformer`.\n\nParameters\n----------\ndf : dataframe of shape (n_features, n_samples)\n DataFrame to select columns from.", + "description": "Callable for column selection to be used by a\n:class:`ColumnTransformer`.", + "docstring": "Callable for column selection to be used by a\n :class:`ColumnTransformer`.\n\n Parameters\n ----------\n df : dataframe of shape (n_features, n_samples)\n DataFrame to select columns from.\n ", "source_code": "\ndef __call__(self, df):\n \"\"\"Callable for column selection to be used by a\n :class:`ColumnTransformer`.\n\n Parameters\n ----------\n df : dataframe of shape (n_features, n_samples)\n DataFrame to select columns from.\n \"\"\"\n if not hasattr(df, 'iloc'):\n raise ValueError('make_column_selector can only be applied to pandas dataframes')\n df_row = df.iloc[:1]\n if self.dtype_include is not None or self.dtype_exclude is not None:\n df_row = df_row.select_dtypes(include=self.dtype_include, exclude=self.dtype_exclude)\n cols = df_row.columns\n if self.pattern is not None:\n cols = cols[cols.str.contains(self.pattern, regex=True)]\n return cols.tolist()" }, { @@ -38249,7 +39449,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "pattern", @@ -38259,7 +39460,8 @@ "docstring": { "type": "str, default=None", "description": "Name of columns containing this regex pattern will be included. If\nNone, column selection will not be selected based on pattern." - } + }, + "refined_type": {} }, { "name": "dtype_include", @@ -38269,7 +39471,8 @@ "docstring": { "type": "column dtype or list of column dtypes, default=None", "description": "A selection of dtypes to include. For more details, see\n:meth:`pandas.DataFrame.select_dtypes`." - } + }, + "refined_type": {} }, { "name": "dtype_exclude", @@ -38279,13 +39482,14 @@ "docstring": { "type": "column dtype or list of column dtypes, default=None", "description": "A selection of dtypes to exclude. For more details, see\n:meth:`pandas.DataFrame.select_dtypes`." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, pattern=None, *, dtype_include=None, dtype_exclude=None):\n self.pattern = pattern\n self.dtype_include = dtype_include\n self.dtype_exclude = dtype_exclude" }, { @@ -38303,6 +39507,10 @@ "docstring": { "type": "{'drop', 'passthrough'} or estimator, default='drop'", "description": "By default, only the specified columns in `transformers` are\ntransformed and combined in the output, and the non-specified\ncolumns are dropped. (default of ``'drop'``).\nBy specifying ``remainder='passthrough'``, all remaining columns that\nwere not specified in `transformers` will be automatically passed\nthrough. This subset of columns is concatenated with the output of\nthe transformers.\nBy setting ``remainder`` to be an estimator, the remaining\nnon-specified columns will use the ``remainder`` estimator. The\nestimator must support :term:`fit` and :term:`transform`." + }, + "refined_type": { + "kind": "EnumType", + "values": ["passthrough", "drop"] } }, { @@ -38313,7 +39521,8 @@ "docstring": { "type": "float, default=0.3", "description": "If the transformed output consists of a mix of sparse and dense data,\nit will be stacked as a sparse matrix if the density is lower than this\nvalue. Use ``sparse_threshold=0`` to always return dense.\nWhen the transformed output consists of all sparse or all dense data,\nthe stacked result will be sparse or dense, respectively, and this\nkeyword will be ignored." - } + }, + "refined_type": {} }, { "name": "n_jobs", @@ -38323,7 +39532,8 @@ "docstring": { "type": "int, default=None", "description": "Number of jobs to run in parallel.\n``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n``-1`` means using all processors. See :term:`Glossary `\nfor more details." - } + }, + "refined_type": {} }, { "name": "verbose", @@ -38333,7 +39543,8 @@ "docstring": { "type": "bool, default=False", "description": "If True, the time elapsed while fitting each transformer will be\nprinted as it is completed." - } + }, + "refined_type": {} }, { "name": "verbose_feature_names_out", @@ -38343,13 +39554,14 @@ "docstring": { "type": "bool, default=True", "description": "If True, :meth:`get_feature_names_out` will prefix all feature names\nwith the name of the transformer that generated that feature.\nIf False, :meth:`get_feature_names_out` will not prefix any feature\nnames and will error if feature names are not unique.\n\n.. versionadded:: 1.0" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Construct a ColumnTransformer from the given transformers.\n\nThis is a shorthand for the ColumnTransformer constructor; it does not require, and does not permit, naming the transformers. Instead, they will be given names automatically based on their types. It also does not allow weighting with ``transformer_weights``. Read more in the :ref:`User Guide `.", - "docstring": "Construct a ColumnTransformer from the given transformers.\n\nThis is a shorthand for the ColumnTransformer constructor; it does not\nrequire, and does not permit, naming the transformers. Instead, they will\nbe given names automatically based on their types. 
It also does not allow\nweighting with ``transformer_weights``.\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\n*transformers : tuples\n Tuples of the form (transformer, columns) specifying the\n transformer objects to be applied to subsets of the data.\n\n transformer : {'drop', 'passthrough'} or estimator\n Estimator must support :term:`fit` and :term:`transform`.\n Special-cased strings 'drop' and 'passthrough' are accepted as\n well, to indicate to drop the columns or to pass them through\n untransformed, respectively.\n columns : str, array-like of str, int, array-like of int, slice, array-like of bool or callable\n Indexes the data on its second axis. Integers are interpreted as\n positional columns, while strings can reference DataFrame columns\n by name. A scalar string or int should be used where\n ``transformer`` expects X to be a 1d array-like (vector),\n otherwise a 2d array will be passed to the transformer.\n A callable is passed the input data `X` and can return any of the\n above. To select multiple columns by name or dtype, you can use\n :obj:`make_column_selector`.\n\nremainder : {'drop', 'passthrough'} or estimator, default='drop'\n By default, only the specified columns in `transformers` are\n transformed and combined in the output, and the non-specified\n columns are dropped. (default of ``'drop'``).\n By specifying ``remainder='passthrough'``, all remaining columns that\n were not specified in `transformers` will be automatically passed\n through. This subset of columns is concatenated with the output of\n the transformers.\n By setting ``remainder`` to be an estimator, the remaining\n non-specified columns will use the ``remainder`` estimator. The\n estimator must support :term:`fit` and :term:`transform`.\n\nsparse_threshold : float, default=0.3\n If the transformed output consists of a mix of sparse and dense data,\n it will be stacked as a sparse matrix if the density is lower than this\n value. Use ``sparse_threshold=0`` to always return dense.\n When the transformed output consists of all sparse or all dense data,\n the stacked result will be sparse or dense, respectively, and this\n keyword will be ignored.\n\nn_jobs : int, default=None\n Number of jobs to run in parallel.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\nverbose : bool, default=False\n If True, the time elapsed while fitting each transformer will be\n printed as it is completed.\n\nverbose_feature_names_out : bool, default=True\n If True, :meth:`get_feature_names_out` will prefix all feature names\n with the name of the transformer that generated that feature.\n If False, :meth:`get_feature_names_out` will not prefix any feature\n names and will error if feature names are not unique.\n\n .. versionadded:: 1.0\n\nReturns\n-------\nct : ColumnTransformer\n\nSee Also\n--------\nColumnTransformer : Class that allows combining the\n outputs of multiple transformer objects used on column subsets\n of the data into a single feature space.\n\nExamples\n--------\n>>> from sklearn.preprocessing import StandardScaler, OneHotEncoder\n>>> from sklearn.compose import make_column_transformer\n>>> make_column_transformer(\n... (StandardScaler(), ['numerical_column']),\n... 
(OneHotEncoder(), ['categorical_column']))\nColumnTransformer(transformers=[('standardscaler', StandardScaler(...),\n ['numerical_column']),\n ('onehotencoder', OneHotEncoder(...),\n ['categorical_column'])])", + "description": "Construct a ColumnTransformer from the given transformers.\n\nThis is a shorthand for the ColumnTransformer constructor; it does not\nrequire, and does not permit, naming the transformers. Instead, they will\nbe given names automatically based on their types. It also does not allow\nweighting with ``transformer_weights``.\n\nRead more in the :ref:`User Guide `.", + "docstring": "Construct a ColumnTransformer from the given transformers.\n\n This is a shorthand for the ColumnTransformer constructor; it does not\n require, and does not permit, naming the transformers. Instead, they will\n be given names automatically based on their types. It also does not allow\n weighting with ``transformer_weights``.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n *transformers : tuples\n Tuples of the form (transformer, columns) specifying the\n transformer objects to be applied to subsets of the data.\n\n transformer : {'drop', 'passthrough'} or estimator\n Estimator must support :term:`fit` and :term:`transform`.\n Special-cased strings 'drop' and 'passthrough' are accepted as\n well, to indicate to drop the columns or to pass them through\n untransformed, respectively.\n columns : str, array-like of str, int, array-like of int, slice, array-like of bool or callable\n Indexes the data on its second axis. Integers are interpreted as\n positional columns, while strings can reference DataFrame columns\n by name. A scalar string or int should be used where\n ``transformer`` expects X to be a 1d array-like (vector),\n otherwise a 2d array will be passed to the transformer.\n A callable is passed the input data `X` and can return any of the\n above. To select multiple columns by name or dtype, you can use\n :obj:`make_column_selector`.\n\n remainder : {'drop', 'passthrough'} or estimator, default='drop'\n By default, only the specified columns in `transformers` are\n transformed and combined in the output, and the non-specified\n columns are dropped. (default of ``'drop'``).\n By specifying ``remainder='passthrough'``, all remaining columns that\n were not specified in `transformers` will be automatically passed\n through. This subset of columns is concatenated with the output of\n the transformers.\n By setting ``remainder`` to be an estimator, the remaining\n non-specified columns will use the ``remainder`` estimator. The\n estimator must support :term:`fit` and :term:`transform`.\n\n sparse_threshold : float, default=0.3\n If the transformed output consists of a mix of sparse and dense data,\n it will be stacked as a sparse matrix if the density is lower than this\n value. Use ``sparse_threshold=0`` to always return dense.\n When the transformed output consists of all sparse or all dense data,\n the stacked result will be sparse or dense, respectively, and this\n keyword will be ignored.\n\n n_jobs : int, default=None\n Number of jobs to run in parallel.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. 
See :term:`Glossary `\n for more details.\n\n verbose : bool, default=False\n If True, the time elapsed while fitting each transformer will be\n printed as it is completed.\n\n verbose_feature_names_out : bool, default=True\n If True, :meth:`get_feature_names_out` will prefix all feature names\n with the name of the transformer that generated that feature.\n If False, :meth:`get_feature_names_out` will not prefix any feature\n names and will error if feature names are not unique.\n\n .. versionadded:: 1.0\n\n Returns\n -------\n ct : ColumnTransformer\n\n See Also\n --------\n ColumnTransformer : Class that allows combining the\n outputs of multiple transformer objects used on column subsets\n of the data into a single feature space.\n\n Examples\n --------\n >>> from sklearn.preprocessing import StandardScaler, OneHotEncoder\n >>> from sklearn.compose import make_column_transformer\n >>> make_column_transformer(\n ... (StandardScaler(), ['numerical_column']),\n ... (OneHotEncoder(), ['categorical_column']))\n ColumnTransformer(transformers=[('standardscaler', StandardScaler(...),\n ['numerical_column']),\n ('onehotencoder', OneHotEncoder(...),\n ['categorical_column'])])\n\n ", "source_code": "\ndef make_column_transformer(*transformers, remainder='drop', sparse_threshold=0.3, n_jobs=None, verbose=False, verbose_feature_names_out=True):\n \"\"\"Construct a ColumnTransformer from the given transformers.\n\n This is a shorthand for the ColumnTransformer constructor; it does not\n require, and does not permit, naming the transformers. Instead, they will\n be given names automatically based on their types. It also does not allow\n weighting with ``transformer_weights``.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n *transformers : tuples\n Tuples of the form (transformer, columns) specifying the\n transformer objects to be applied to subsets of the data.\n\n transformer : {'drop', 'passthrough'} or estimator\n Estimator must support :term:`fit` and :term:`transform`.\n Special-cased strings 'drop' and 'passthrough' are accepted as\n well, to indicate to drop the columns or to pass them through\n untransformed, respectively.\n columns : str, array-like of str, int, array-like of int, slice, array-like of bool or callable\n Indexes the data on its second axis. Integers are interpreted as\n positional columns, while strings can reference DataFrame columns\n by name. A scalar string or int should be used where\n ``transformer`` expects X to be a 1d array-like (vector),\n otherwise a 2d array will be passed to the transformer.\n A callable is passed the input data `X` and can return any of the\n above. To select multiple columns by name or dtype, you can use\n :obj:`make_column_selector`.\n\n remainder : {'drop', 'passthrough'} or estimator, default='drop'\n By default, only the specified columns in `transformers` are\n transformed and combined in the output, and the non-specified\n columns are dropped. (default of ``'drop'``).\n By specifying ``remainder='passthrough'``, all remaining columns that\n were not specified in `transformers` will be automatically passed\n through. This subset of columns is concatenated with the output of\n the transformers.\n By setting ``remainder`` to be an estimator, the remaining\n non-specified columns will use the ``remainder`` estimator. 
The\n estimator must support :term:`fit` and :term:`transform`.\n\n sparse_threshold : float, default=0.3\n If the transformed output consists of a mix of sparse and dense data,\n it will be stacked as a sparse matrix if the density is lower than this\n value. Use ``sparse_threshold=0`` to always return dense.\n When the transformed output consists of all sparse or all dense data,\n the stacked result will be sparse or dense, respectively, and this\n keyword will be ignored.\n\n n_jobs : int, default=None\n Number of jobs to run in parallel.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n verbose : bool, default=False\n If True, the time elapsed while fitting each transformer will be\n printed as it is completed.\n\n verbose_feature_names_out : bool, default=True\n If True, :meth:`get_feature_names_out` will prefix all feature names\n with the name of the transformer that generated that feature.\n If False, :meth:`get_feature_names_out` will not prefix any feature\n names and will error if feature names are not unique.\n\n .. versionadded:: 1.0\n\n Returns\n -------\n ct : ColumnTransformer\n\n See Also\n --------\n ColumnTransformer : Class that allows combining the\n outputs of multiple transformer objects used on column subsets\n of the data into a single feature space.\n\n Examples\n --------\n >>> from sklearn.preprocessing import StandardScaler, OneHotEncoder\n >>> from sklearn.compose import make_column_transformer\n >>> make_column_transformer(\n ... (StandardScaler(), ['numerical_column']),\n ... (OneHotEncoder(), ['categorical_column']))\n ColumnTransformer(transformers=[('standardscaler', StandardScaler(...),\n ['numerical_column']),\n ('onehotencoder', OneHotEncoder(...),\n ['categorical_column'])])\n\n \"\"\"\n transformer_list = _get_transformer_list(transformers)\n return ColumnTransformer(transformer_list, n_jobs=n_jobs, remainder=remainder, sparse_threshold=sparse_threshold, verbose=verbose, verbose_feature_names_out=verbose_feature_names_out)" }, { @@ -38367,7 +39579,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "regressor", @@ -38377,7 +39590,8 @@ "docstring": { "type": "object, default=None", "description": "Regressor object such as derived from\n:class:`~sklearn.base.RegressorMixin`. This regressor will\nautomatically be cloned each time prior to fitting. If `regressor is\nNone`, :class:`~sklearn.linear_model.LinearRegression` is created and used." - } + }, + "refined_type": {} }, { "name": "transformer", @@ -38387,7 +39601,8 @@ "docstring": { "type": "object, default=None", "description": "Estimator object such as derived from\n:class:`~sklearn.base.TransformerMixin`. Cannot be set at the same time\nas `func` and `inverse_func`. If `transformer is None` as well as\n`func` and `inverse_func`, the transformer will be an identity\ntransformer. Note that the transformer will be cloned during fitting.\nAlso, the transformer is restricting `y` to be a numpy array." - } + }, + "refined_type": {} }, { "name": "func", @@ -38397,7 +39612,8 @@ "docstring": { "type": "function, default=None", "description": "Function to apply to `y` before passing to :meth:`fit`. Cannot be set\nat the same time as `transformer`. The function needs to return a\n2-dimensional array. If `func is None`, the function used will be the\nidentity function." 
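Building on the note above that :obj:`make_column_selector` can stand in for an explicit column list, a small hedged sketch follows; the DataFrame below is an assumption, not taken from the data file.

# Illustrative sketch; the toy DataFrame is an assumption.
import pandas as pd
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.preprocessing import OneHotEncoder, StandardScaler

df = pd.DataFrame({"amount": [10.0, 20.0, 15.0], "currency": ["EUR", "USD", "EUR"]})
ct = make_column_transformer(
    (StandardScaler(), make_column_selector(dtype_include="number")),
    (OneHotEncoder(), make_column_selector(dtype_include=object)),
    remainder="drop",  # the documented default
)
X = ct.fit_transform(df)  # scaled 'amount' plus one-hot 'currency' columns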
- } + }, + "refined_type": {} }, { "name": "inverse_func", @@ -38407,7 +39623,8 @@ "docstring": { "type": "function, default=None", "description": "Function to apply to the prediction of the regressor. Cannot be set at\nthe same time as `transformer`. The function needs to return a\n2-dimensional array. The inverse function is used to return\npredictions to the same space of the original training labels." - } + }, + "refined_type": {} }, { "name": "check_inverse", @@ -38417,13 +39634,14 @@ "docstring": { "type": "bool, default=True", "description": "Whether to check that `transform` followed by `inverse_transform`\nor `func` followed by `inverse_func` leads to the original targets." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, regressor=None, *, transformer=None, func=None, inverse_func=None, check_inverse=True):\n self.regressor = regressor\n self.transformer = transformer\n self.func = func\n self.inverse_func = inverse_func\n self.check_inverse = check_inverse" }, { @@ -38441,7 +39659,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -38451,13 +39670,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Check transformer and fit transformer.\n\nCreate the default transformer, fit it and make additional inverse check on a subset (optional).", - "docstring": "Check transformer and fit transformer.\n\nCreate the default transformer, fit it and make additional inverse\ncheck on a subset (optional).", + "description": "Check transformer and fit transformer.\n\nCreate the default transformer, fit it and make additional inverse\ncheck on a subset (optional).", + "docstring": "Check transformer and fit transformer.\n\n Create the default transformer, fit it and make additional inverse\n check on a subset (optional).\n\n ", "source_code": "\ndef _fit_transformer(self, y):\n \"\"\"Check transformer and fit transformer.\n\n Create the default transformer, fit it and make additional inverse\n check on a subset (optional).\n\n \"\"\"\n if self.transformer is not None and (self.func is not None or self.inverse_func is not None):\n raise ValueError(\"'transformer' and functions 'func'/'inverse_func' cannot both be set.\")\n elif self.transformer is not None:\n self.transformer_ = clone(self.transformer)\n else:\n if self.func is not None and self.inverse_func is None:\n raise ValueError(\"When 'func' is provided, 'inverse_func' must also be provided\")\n self.transformer_ = FunctionTransformer(func=self.func, inverse_func=self.inverse_func, validate=True, check_inverse=self.check_inverse)\n self.transformer_.fit(y)\n if self.check_inverse:\n idx_selected = slice(None, None, max(1, y.shape[0] // 10))\n y_sel = _safe_indexing(y, idx_selected)\n y_sel_t = self.transformer_.transform(y_sel)\n if not np.allclose(y_sel, self.transformer_.inverse_transform(y_sel_t)):\n warnings.warn(\"The provided functions or transformer are not strictly inverse of each other. 
If you are sure you want to proceed regardless, set 'check_inverse=False'\", UserWarning)" }, { @@ -38475,13 +39695,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _more_tags(self):\n regressor = self.regressor\n if regressor is None:\n from ..linear_model import LinearRegression\n regressor = LinearRegression()\n return {'poor_score': True, 'multioutput': _safe_tags(regressor, key='multioutput')}" }, { @@ -38499,7 +39720,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -38509,6 +39731,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "Training vector, where `n_samples` is the number of samples and\n`n_features` is the number of features." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -38519,13 +39745,14 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "Target values." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Fit the model according to the given training data.", - "docstring": "Fit the model according to the given training data.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training vector, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\ny : array-like of shape (n_samples,)\n Target values.\n\n**fit_params : dict\n Parameters passed to the `fit` method of the underlying\n regressor.\n\nReturns\n-------\nself : object\n Fitted estimator.", + "docstring": "Fit the model according to the given training data.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training vector, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\n y : array-like of shape (n_samples,)\n Target values.\n\n **fit_params : dict\n Parameters passed to the `fit` method of the underlying\n regressor.\n\n Returns\n -------\n self : object\n Fitted estimator.\n ", "source_code": "\ndef fit(self, X, y, **fit_params):\n \"\"\"Fit the model according to the given training data.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training vector, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\n y : array-like of shape (n_samples,)\n Target values.\n\n **fit_params : dict\n Parameters passed to the `fit` method of the underlying\n regressor.\n\n Returns\n -------\n self : object\n Fitted estimator.\n \"\"\"\n y = check_array(y, accept_sparse=False, force_all_finite=True, ensure_2d=False, dtype='numeric', allow_nd=True)\n self._training_dim = y.ndim\n if y.ndim == 1:\n y_2d = y.reshape(-1, 1)\n else:\n y_2d = y\n self._fit_transformer(y_2d)\n y_trans = self.transformer_.transform(y_2d)\n if y_trans.ndim == 2 and y_trans.shape[1] == 1:\n y_trans = y_trans.squeeze(axis=1)\n if self.regressor is None:\n from ..linear_model import LinearRegression\n self.regressor_ = LinearRegression()\n else:\n self.regressor_ = clone(self.regressor)\n self.regressor_.fit(X, y_trans, **fit_params)\n if hasattr(self.regressor_, 'feature_names_in_'):\n self.feature_names_in_ = self.regressor_.feature_names_in_\n return self" }, { @@ -38543,7 +39770,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], 
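A minimal sketch of the `func`/`inverse_func` path described by the parameters above, assuming a strictly positive toy target so that `np.log` and `np.exp` are exact inverses; the data is invented for illustration.

# Minimal sketch; the toy data is an assumption.
import numpy as np
from sklearn.compose import TransformedTargetRegressor
from sklearn.linear_model import LinearRegression

X = np.arange(1, 11, dtype=float).reshape(-1, 1)
y = np.exp(0.3 * X).ravel()  # strictly positive, so log/exp invert each other
reg = TransformedTargetRegressor(regressor=LinearRegression(),
                                 func=np.log, inverse_func=np.exp)
reg.fit(X, y)            # the regressor is fit on log(y)
y_pred = reg.predict(X)  # predictions are mapped back through exp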
"results": [], @@ -38567,7 +39795,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -38577,13 +39806,17 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "Samples." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } } ], "results": [], "is_public": true, - "description": "Predict using the base regressor, applying inverse.\n\nThe regressor is used to predict and the `inverse_func` or `inverse_transform` is applied before returning the prediction.", - "docstring": "Predict using the base regressor, applying inverse.\n\nThe regressor is used to predict and the `inverse_func` or\n`inverse_transform` is applied before returning the prediction.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n Samples.\n\n**predict_params : dict of str -> object\n Parameters passed to the `predict` method of the underlying\n regressor.\n\nReturns\n-------\ny_hat : ndarray of shape (n_samples,)\n Predicted values.", + "description": "Predict using the base regressor, applying inverse.\n\nThe regressor is used to predict and the `inverse_func` or\n`inverse_transform` is applied before returning the prediction.", + "docstring": "Predict using the base regressor, applying inverse.\n\n The regressor is used to predict and the `inverse_func` or\n `inverse_transform` is applied before returning the prediction.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Samples.\n\n **predict_params : dict of str -> object\n Parameters passed to the `predict` method of the underlying\n regressor.\n\n Returns\n -------\n y_hat : ndarray of shape (n_samples,)\n Predicted values.\n ", "source_code": "\ndef predict(self, X, **predict_params):\n \"\"\"Predict using the base regressor, applying inverse.\n\n The regressor is used to predict and the `inverse_func` or\n `inverse_transform` is applied before returning the prediction.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Samples.\n\n **predict_params : dict of str -> object\n Parameters passed to the `predict` method of the underlying\n regressor.\n\n Returns\n -------\n y_hat : ndarray of shape (n_samples,)\n Predicted values.\n \"\"\"\n check_is_fitted(self)\n pred = self.regressor_.predict(X, **predict_params)\n if pred.ndim == 1:\n pred_trans = self.transformer_.inverse_transform(pred.reshape(-1, 1))\n else:\n pred_trans = self.transformer_.inverse_transform(pred)\n if self._training_dim == 1 and pred_trans.ndim == 2 and pred_trans.shape[1] == 1:\n pred_trans = pred_trans.squeeze(axis=1)\n return pred_trans" }, { @@ -38601,7 +39834,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -38619,8 +39853,8 @@ "parameters": [], "results": [], "is_public": true, - "description": "Setup and teardown fixture for matplotlib.\n\nThis fixture checks if we can import matplotlib. If not, the tests will be skipped. Otherwise, we close the figures before and after running the functions.", - "docstring": "Setup and teardown fixture for matplotlib.\n\nThis fixture checks if we can import matplotlib. If not, the tests will be\nskipped. 
Otherwise, we close the figures before and after running the\nfunctions.\n\nReturns\n-------\npyplot : module\n The ``matplotlib.pyplot`` module.", + "description": "Setup and teardown fixture for matplotlib.\n\nThis fixture checks if we can import matplotlib. If not, the tests will be\nskipped. Otherwise, we close the figures before and after running the\nfunctions.", + "docstring": "Setup and teardown fixture for matplotlib.\n\n This fixture checks if we can import matplotlib. If not, the tests will be\n skipped. Otherwise, we close the figures before and after running the\n functions.\n\n Returns\n -------\n pyplot : module\n The ``matplotlib.pyplot`` module.\n ", "source_code": "\n@pytest.fixture(scope='function')\ndef pyplot():\n \"\"\"Setup and teardown fixture for matplotlib.\n\n This fixture checks if we can import matplotlib. If not, the tests will be\n skipped. Otherwise, we close the figures before and after running the\n functions.\n\n Returns\n -------\n pyplot : module\n The ``matplotlib.pyplot`` module.\n \"\"\"\n pyplot = pytest.importorskip('matplotlib.pyplot')\n pyplot.close('all')\n yield pyplot\n pyplot.close('all')" }, { @@ -38638,7 +39872,8 @@ "docstring": { "type": "pytest config", "description": "" - } + }, + "refined_type": {} }, { "name": "items", @@ -38648,13 +39883,14 @@ "docstring": { "type": "list of collected items", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Called after collect is completed.", - "docstring": "Called after collect is completed.\n\nParameters\n----------\nconfig : pytest config\nitems : list of collected items", + "docstring": "Called after collect is completed.\n\n Parameters\n ----------\n config : pytest config\n items : list of collected items\n ", "source_code": "\ndef pytest_collection_modifyitems(config, items):\n \"\"\"Called after collect is completed.\n\n Parameters\n ----------\n config : pytest config\n items : list of collected items\n \"\"\"\n run_network_tests = environ.get('SKLEARN_SKIP_NETWORK_TESTS', '1') == '0'\n skip_network = pytest.mark.skip(reason='test is enabled when SKLEARN_SKIP_NETWORK_TESTS=0')\n dataset_features_set = set(dataset_fetchers)\n datasets_to_download = set()\n for item in items:\n if not hasattr(item, 'fixturenames'):\n continue\n item_fixtures = set(item.fixturenames)\n dataset_to_fetch = item_fixtures & dataset_features_set\n if not dataset_to_fetch:\n continue\n if run_network_tests:\n datasets_to_download |= dataset_to_fetch\n else:\n item.add_marker(skip_network)\n worker_id = environ.get('PYTEST_XDIST_WORKER', 'gw0')\n if worker_id == 'gw0' and run_network_tests:\n for name in datasets_to_download:\n dataset_fetchers[name]()\n for item in items:\n if item.name.endswith(('_hash.FeatureHasher', 'text.HashingVectorizer')) and platform.python_implementation() == 'PyPy':\n marker = pytest.mark.skip(reason='FeatureHasher is not compatible with PyPy')\n item.add_marker(marker)\n elif item.name.endswith('GradientBoostingClassifier') and platform.machine() == 'aarch64':\n marker = pytest.mark.xfail(reason='know failure. 
See https://github.com/scikit-learn/scikit-learn/issues/17797')\n item.add_marker(marker)\n skip_doctests = False\n try:\n import matplotlib\n except ImportError:\n skip_doctests = True\n reason = 'matplotlib is required to run the doctests'\n try:\n if np_version < parse_version('1.14'):\n reason = 'doctests are only run for numpy >= 1.14'\n skip_doctests = True\n elif _IS_32BIT:\n reason = 'doctest are only run when the default numpy int is 64 bits.'\n skip_doctests = True\n elif sys.platform.startswith('win32'):\n reason = 'doctests are not run for Windows because numpy arrays repr is inconsistent across platforms.'\n skip_doctests = True\n except ImportError:\n pass\n for item in items:\n if isinstance(item, DoctestItem):\n item.dtest.globs = {}\n if skip_doctests:\n skip_marker = pytest.mark.skip(reason=reason)\n for item in items:\n if isinstance(item, DoctestItem):\n if item.name != 'sklearn._config.config_context':\n item.add_marker(skip_marker)\n elif not _pilutil.pillow_installed:\n skip_marker = pytest.mark.skip(reason='pillow (or PIL) not installed!')\n for item in items:\n if item.name in ['sklearn.feature_extraction.image.PatchExtractor', 'sklearn.feature_extraction.image.extract_patches_2d']:\n item.add_marker(skip_marker)" }, { @@ -38672,13 +39908,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef pytest_configure(config):\n try:\n import matplotlib\n matplotlib.use('agg')\n except ImportError:\n pass" }, { @@ -38696,13 +39933,14 @@ "docstring": { "type": "pytest item", "description": "item to be processed" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Set the number of openmp threads based on the number of workers xdist is using to prevent oversubscription.", - "docstring": "Set the number of openmp threads based on the number of workers\nxdist is using to prevent oversubscription.\n\nParameters\n----------\nitem : pytest item\n item to be processed", + "description": "Set the number of openmp threads based on the number of workers\nxdist is using to prevent oversubscription.", + "docstring": "Set the number of openmp threads based on the number of workers\n xdist is using to prevent oversubscription.\n\n Parameters\n ----------\n item : pytest item\n item to be processed\n ", "source_code": "\ndef pytest_runtest_setup(item):\n \"\"\"Set the number of openmp threads based on the number of workers\n xdist is using to prevent oversubscription.\n\n Parameters\n ----------\n item : pytest item\n item to be processed\n \"\"\"\n xdist_worker_count = environ.get('PYTEST_XDIST_WORKER_COUNT')\n if xdist_worker_count is None:\n return\n else:\n xdist_worker_count = int(xdist_worker_count)\n openmp_threads = _openmp_effective_n_threads()\n threads_per_worker = max(openmp_threads // xdist_worker_count, 1)\n threadpool_limits(threads_per_worker, user_api='openmp')" }, { @@ -38720,7 +39958,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "store_precision", @@ -38730,7 +39969,8 @@ "docstring": { "type": "bool, default=True", "description": "Specify if the estimated precision is stored." 
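A tiny worked illustration of the thread-budgeting arithmetic in `pytest_runtest_setup` above; the worker and thread counts are assumed values, not measurements.

# Assumed counts, illustrating max(openmp_threads // xdist_worker_count, 1).
openmp_threads = 8      # e.g. what _openmp_effective_n_threads() could report
xdist_worker_count = 3  # e.g. PYTEST_XDIST_WORKER_COUNT
threads_per_worker = max(openmp_threads // xdist_worker_count, 1)
print(threads_per_worker)  # 2 -> three workers use at most 6 of the 8 threads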
- } + }, + "refined_type": {} }, { "name": "assume_centered", @@ -38740,7 +39980,8 @@ "docstring": { "type": "bool, default=False", "description": "If True, the support of robust location and covariance estimates\nis computed, and a covariance estimate is recomputed from it,\nwithout centering the data.\nUseful to work with data whose mean is significantly equal to\nzero but is not exactly zero.\nIf False, the robust location and covariance are directly computed\nwith the FastMCD algorithm without additional treatment." - } + }, + "refined_type": {} }, { "name": "support_fraction", @@ -38750,7 +39991,8 @@ "docstring": { "type": "float, default=None", "description": "The proportion of points to be included in the support of the raw\nMCD estimate. If None, the minimum value of support_fraction will\nbe used within the algorithm: `[n_sample + n_features + 1] / 2`.\nRange is (0, 1)." - } + }, + "refined_type": {} }, { "name": "contamination", @@ -38760,7 +40002,8 @@ "docstring": { "type": "float, default=0.1", "description": "The amount of contamination of the data set, i.e. the proportion\nof outliers in the data set. Range is (0, 0.5]." - } + }, + "refined_type": {} }, { "name": "random_state", @@ -38770,13 +40013,14 @@ "docstring": { "type": "int, RandomState instance or None, default=None", "description": "Determines the pseudo random number generator for shuffling\nthe data. Pass an int for reproducible results across multiple function\ncalls. See :term:`Glossary `." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, *, store_precision=True, assume_centered=False, support_fraction=None, contamination=0.1, random_state=None):\n super().__init__(store_precision=store_precision, assume_centered=assume_centered, support_fraction=support_fraction, random_state=random_state)\n self.contamination = contamination" }, { @@ -38794,7 +40038,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -38804,13 +40049,14 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "The data matrix." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Compute the decision function of the given observations.", - "docstring": "Compute the decision function of the given observations.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n The data matrix.\n\nReturns\n-------\ndecision : ndarray of shape (n_samples,)\n Decision function of the samples.\n It is equal to the shifted Mahalanobis distances.\n The threshold for being an outlier is 0, which ensures a\n compatibility with other outlier detection algorithms.", + "docstring": "Compute the decision function of the given observations.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The data matrix.\n\n Returns\n -------\n decision : ndarray of shape (n_samples,)\n Decision function of the samples.\n It is equal to the shifted Mahalanobis distances.\n The threshold for being an outlier is 0, which ensures a\n compatibility with other outlier detection algorithms.\n ", "source_code": "\ndef decision_function(self, X):\n \"\"\"Compute the decision function of the given observations.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The data matrix.\n\n Returns\n -------\n decision : ndarray of shape (n_samples,)\n Decision function of the samples.\n It is equal to the shifted Mahalanobis distances.\n The threshold for being an outlier is 0, which ensures a\n compatibility with other outlier detection algorithms.\n \"\"\"\n check_is_fitted(self)\n negative_mahal_dist = self.score_samples(X)\n return negative_mahal_dist - self.offset_" }, { @@ -38828,7 +40074,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -38838,6 +40085,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "Training data." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -38848,13 +40099,14 @@ "docstring": { "type": "Ignored", "description": "Not used, present for API consistency by convention." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Fit the EllipticEnvelope model.", - "docstring": "Fit the EllipticEnvelope model.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training data.\n\ny : Ignored\n Not used, present for API consistency by convention.\n\nReturns\n-------\nself : object\n Returns the instance itself.", + "docstring": "Fit the EllipticEnvelope model.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training data.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n Returns\n -------\n self : object\n Returns the instance itself.\n ", "source_code": "\ndef fit(self, X, y=None):\n \"\"\"Fit the EllipticEnvelope model.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training data.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n Returns\n -------\n self : object\n Returns the instance itself.\n \"\"\"\n if self.contamination != 'auto':\n if not 0.0 < self.contamination <= 0.5:\n raise ValueError('contamination must be in (0, 0.5], got: %f' % self.contamination)\n super().fit(X)\n self.offset_ = np.percentile(-self.dist_, 100.0 * self.contamination)\n return self" }, { @@ -38872,7 +40124,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -38882,13 +40135,14 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "The data matrix." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Predict labels (1 inlier, -1 outlier) of X according to fitted model.", - "docstring": "Predict labels (1 inlier, -1 outlier) of X according to fitted model.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n The data matrix.\n\nReturns\n-------\nis_inlier : ndarray of shape (n_samples,)\n Returns -1 for anomalies/outliers and +1 for inliers.", + "docstring": "\n Predict labels (1 inlier, -1 outlier) of X according to fitted model.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The data matrix.\n\n Returns\n -------\n is_inlier : ndarray of shape (n_samples,)\n Returns -1 for anomalies/outliers and +1 for inliers.\n ", "source_code": "\ndef predict(self, X):\n \"\"\"\n Predict labels (1 inlier, -1 outlier) of X according to fitted model.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The data matrix.\n\n Returns\n -------\n is_inlier : ndarray of shape (n_samples,)\n Returns -1 for anomalies/outliers and +1 for inliers.\n \"\"\"\n values = self.decision_function(X)\n is_inlier = np.full(values.shape[0], -1, dtype=int)\n is_inlier[values >= 0] = 1\n return is_inlier" }, { @@ -38906,7 +40160,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -38916,7 +40171,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "Test samples." - } + }, + "refined_type": {} }, { "name": "y", @@ -38926,7 +40182,8 @@ "docstring": { "type": "array-like of shape (n_samples,) or (n_samples, n_outputs)", "description": "True labels for X." - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -38936,13 +40193,14 @@ "docstring": { "type": "array-like of shape (n_samples,), default=None", "description": "Sample weights." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Return the mean accuracy on the given test data and labels.\n\nIn multi-label classification, this is the subset accuracy which is a harsh metric since you require for each sample that each label set be correctly predicted.", - "docstring": "Return the mean accuracy on the given test data and labels.\n\nIn multi-label classification, this is the subset accuracy\nwhich is a harsh metric since you require for each sample that\neach label set be correctly predicted.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n Test samples.\n\ny : array-like of shape (n_samples,) or (n_samples, n_outputs)\n True labels for X.\n\nsample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\nReturns\n-------\nscore : float\n Mean accuracy of self.predict(X) w.r.t. y.", + "description": "Return the mean accuracy on the given test data and labels.\n\nIn multi-label classification, this is the subset accuracy\nwhich is a harsh metric since you require for each sample that\neach label set be correctly predicted.", + "docstring": "Return the mean accuracy on the given test data and labels.\n\n In multi-label classification, this is the subset accuracy\n which is a harsh metric since you require for each sample that\n each label set be correctly predicted.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Test samples.\n\n y : array-like of shape (n_samples,) or (n_samples, n_outputs)\n True labels for X.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n Returns\n -------\n score : float\n Mean accuracy of self.predict(X) w.r.t. y.\n ", "source_code": "\ndef score(self, X, y, sample_weight=None):\n \"\"\"Return the mean accuracy on the given test data and labels.\n\n In multi-label classification, this is the subset accuracy\n which is a harsh metric since you require for each sample that\n each label set be correctly predicted.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Test samples.\n\n y : array-like of shape (n_samples,) or (n_samples, n_outputs)\n True labels for X.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n Returns\n -------\n score : float\n Mean accuracy of self.predict(X) w.r.t. y.\n \"\"\"\n return accuracy_score(y, self.predict(X), sample_weight=sample_weight)" }, { @@ -38960,7 +40218,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -38970,13 +40229,14 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "The data matrix." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Compute the negative Mahalanobis distances.", - "docstring": "Compute the negative Mahalanobis distances.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n The data matrix.\n\nReturns\n-------\nnegative_mahal_distances : array-like of shape (n_samples,)\n Opposite of the Mahalanobis distances.", + "docstring": "Compute the negative Mahalanobis distances.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The data matrix.\n\n Returns\n -------\n negative_mahal_distances : array-like of shape (n_samples,)\n Opposite of the Mahalanobis distances.\n ", "source_code": "\ndef score_samples(self, X):\n \"\"\"Compute the negative Mahalanobis distances.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The data matrix.\n\n Returns\n -------\n negative_mahal_distances : array-like of shape (n_samples,)\n Opposite of the Mahalanobis distances.\n \"\"\"\n check_is_fitted(self)\n return -self.mahalanobis(X)" }, { @@ -38994,7 +40254,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "store_precision", @@ -39004,7 +40265,8 @@ "docstring": { "type": "bool, default=True", "description": "Specifies if the estimated precision is stored." - } + }, + "refined_type": {} }, { "name": "assume_centered", @@ -39014,13 +40276,14 @@ "docstring": { "type": "bool, default=False", "description": "If True, data are not centered before computation.\nUseful when working with data whose mean is almost, but not exactly\nzero.\nIf False (default), data are centered before computation." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, *, store_precision=True, assume_centered=False):\n self.store_precision = store_precision\n self.assume_centered = assume_centered" }, { @@ -39038,7 +40301,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "covariance", @@ -39048,13 +40312,14 @@ "docstring": { "type": "array-like of shape (n_features, n_features)", "description": "Estimated covariance matrix to be stored, and from which precision\nis computed." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Saves the covariance and precision estimates\n\nStorage is done accordingly to `self.store_precision`. 
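A short, hedged usage sketch tying together the `fit`, `predict` and `decision_function` methods documented above; the synthetic Gaussian sample is an assumption.

# Synthetic data is an assumption; illustrates the methods documented above.
import numpy as np
from sklearn.covariance import EllipticEnvelope

rng = np.random.RandomState(0)
X = rng.normal(size=(100, 2))
est = EllipticEnvelope(contamination=0.1, random_state=0).fit(X)
labels = est.predict(X)              # +1 for inliers, -1 for outliers
decision = est.decision_function(X)  # equals score_samples(X) - est.offset_
assert np.allclose(decision, est.score_samples(X) - est.offset_)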
Precision stored only if invertible.", - "docstring": "Saves the covariance and precision estimates\n\nStorage is done accordingly to `self.store_precision`.\nPrecision stored only if invertible.\n\nParameters\n----------\ncovariance : array-like of shape (n_features, n_features)\n Estimated covariance matrix to be stored, and from which precision\n is computed.", + "description": "Saves the covariance and precision estimates\n\nStorage is done accordingly to `self.store_precision`.\nPrecision stored only if invertible.", + "docstring": "Saves the covariance and precision estimates\n\n Storage is done accordingly to `self.store_precision`.\n Precision stored only if invertible.\n\n Parameters\n ----------\n covariance : array-like of shape (n_features, n_features)\n Estimated covariance matrix to be stored, and from which precision\n is computed.\n ", "source_code": "\ndef _set_covariance(self, covariance):\n \"\"\"Saves the covariance and precision estimates\n\n Storage is done accordingly to `self.store_precision`.\n Precision stored only if invertible.\n\n Parameters\n ----------\n covariance : array-like of shape (n_features, n_features)\n Estimated covariance matrix to be stored, and from which precision\n is computed.\n \"\"\"\n covariance = check_array(covariance)\n self.covariance_ = covariance\n if self.store_precision:\n self.precision_ = linalg.pinvh(covariance, check_finite=False)\n else:\n self.precision_ = None" }, { @@ -39072,7 +40337,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "comp_cov", @@ -39082,7 +40348,8 @@ "docstring": { "type": "array-like of shape (n_features, n_features)", "description": "The covariance to compare with." - } + }, + "refined_type": {} }, { "name": "norm", @@ -39092,6 +40359,10 @@ "docstring": { "type": "{\"frobenius\", \"spectral\"}, default=\"frobenius\"", "description": "The type of norm used to compute the error. Available error types:\n- 'frobenius' (default): sqrt(tr(A^t.A))\n- 'spectral': sqrt(max(eigenvalues(A^t.A))\nwhere A is the error ``(comp_cov - self.covariance_)``." + }, + "refined_type": { + "kind": "EnumType", + "values": ["frobenius", "spectral"] } }, { @@ -39102,7 +40373,8 @@ "docstring": { "type": "bool, default=True", "description": "If True (default), the squared error norm is divided by n_features.\nIf False, the squared error norm is not rescaled." - } + }, + "refined_type": {} }, { "name": "squared", @@ -39112,13 +40384,14 @@ "docstring": { "type": "bool, default=True", "description": "Whether to compute the squared error norm or the error norm.\nIf True (default), the squared error norm is returned.\nIf False, the error norm is returned." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Compute the Mean Squared Error between two covariance estimators.", - "docstring": "Compute the Mean Squared Error between two covariance estimators.\n\nParameters\n----------\ncomp_cov : array-like of shape (n_features, n_features)\n The covariance to compare with.\n\nnorm : {\"frobenius\", \"spectral\"}, default=\"frobenius\"\n The type of norm used to compute the error. 
Available error types:\n - 'frobenius' (default): sqrt(tr(A^t.A))\n - 'spectral': sqrt(max(eigenvalues(A^t.A))\n where A is the error ``(comp_cov - self.covariance_)``.\n\nscaling : bool, default=True\n If True (default), the squared error norm is divided by n_features.\n If False, the squared error norm is not rescaled.\n\nsquared : bool, default=True\n Whether to compute the squared error norm or the error norm.\n If True (default), the squared error norm is returned.\n If False, the error norm is returned.\n\nReturns\n-------\nresult : float\n The Mean Squared Error (in the sense of the Frobenius norm) between\n `self` and `comp_cov` covariance estimators.", + "docstring": "Compute the Mean Squared Error between two covariance estimators.\n\n Parameters\n ----------\n comp_cov : array-like of shape (n_features, n_features)\n The covariance to compare with.\n\n norm : {\"frobenius\", \"spectral\"}, default=\"frobenius\"\n The type of norm used to compute the error. Available error types:\n - 'frobenius' (default): sqrt(tr(A^t.A))\n - 'spectral': sqrt(max(eigenvalues(A^t.A))\n where A is the error ``(comp_cov - self.covariance_)``.\n\n scaling : bool, default=True\n If True (default), the squared error norm is divided by n_features.\n If False, the squared error norm is not rescaled.\n\n squared : bool, default=True\n Whether to compute the squared error norm or the error norm.\n If True (default), the squared error norm is returned.\n If False, the error norm is returned.\n\n Returns\n -------\n result : float\n The Mean Squared Error (in the sense of the Frobenius norm) between\n `self` and `comp_cov` covariance estimators.\n ", "source_code": "\ndef error_norm(self, comp_cov, norm='frobenius', scaling=True, squared=True):\n \"\"\"Compute the Mean Squared Error between two covariance estimators.\n\n Parameters\n ----------\n comp_cov : array-like of shape (n_features, n_features)\n The covariance to compare with.\n\n norm : {\"frobenius\", \"spectral\"}, default=\"frobenius\"\n The type of norm used to compute the error. Available error types:\n - 'frobenius' (default): sqrt(tr(A^t.A))\n - 'spectral': sqrt(max(eigenvalues(A^t.A))\n where A is the error ``(comp_cov - self.covariance_)``.\n\n scaling : bool, default=True\n If True (default), the squared error norm is divided by n_features.\n If False, the squared error norm is not rescaled.\n\n squared : bool, default=True\n Whether to compute the squared error norm or the error norm.\n If True (default), the squared error norm is returned.\n If False, the error norm is returned.\n\n Returns\n -------\n result : float\n The Mean Squared Error (in the sense of the Frobenius norm) between\n `self` and `comp_cov` covariance estimators.\n \"\"\"\n error = comp_cov - self.covariance_\n if norm == 'frobenius':\n squared_norm = np.sum(error**2)\n elif norm == 'spectral':\n squared_norm = np.amax(linalg.svdvals(np.dot(error.T, error)))\n else:\n raise NotImplementedError('Only spectral and frobenius norms are implemented')\n if scaling:\n squared_norm = squared_norm / error.shape[0]\n if squared:\n result = squared_norm\n else:\n result = np.sqrt(squared_norm)\n return result" }, { @@ -39136,7 +40409,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -39146,7 +40420,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "Training data, where `n_samples` is the number of samples and\n`n_features` is the number of features." 
- } + }, + "refined_type": {} }, { "name": "y", @@ -39156,13 +40431,14 @@ "docstring": { "type": "Ignored", "description": "Not used, present for API consistency by convention." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Fit the maximum liklihood covariance estimator to X.", - "docstring": "Fit the maximum liklihood covariance estimator to X.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n Training data, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\ny : Ignored\n Not used, present for API consistency by convention.\n\nReturns\n-------\nself : object\n Returns the instance itself.", + "docstring": "Fit the maximum liklihood covariance estimator to X.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training data, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n Returns\n -------\n self : object\n Returns the instance itself.\n ", "source_code": "\ndef fit(self, X, y=None):\n \"\"\"Fit the maximum liklihood covariance estimator to X.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training data, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n Returns\n -------\n self : object\n Returns the instance itself.\n \"\"\"\n X = self._validate_data(X)\n if self.assume_centered:\n self.location_ = np.zeros(X.shape[1])\n else:\n self.location_ = X.mean(0)\n covariance = empirical_covariance(X, assume_centered=self.assume_centered)\n self._set_covariance(covariance)\n return self" }, { @@ -39180,13 +40456,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Getter for the precision matrix.", - "docstring": "Getter for the precision matrix.\n\nReturns\n-------\nprecision_ : array-like of shape (n_features, n_features)\n The precision matrix associated to the current covariance object.", + "docstring": "Getter for the precision matrix.\n\n Returns\n -------\n precision_ : array-like of shape (n_features, n_features)\n The precision matrix associated to the current covariance object.\n ", "source_code": "\ndef get_precision(self):\n \"\"\"Getter for the precision matrix.\n\n Returns\n -------\n precision_ : array-like of shape (n_features, n_features)\n The precision matrix associated to the current covariance object.\n \"\"\"\n if self.store_precision:\n precision = self.precision_\n else:\n precision = linalg.pinvh(self.covariance_, check_finite=False)\n return precision" }, { @@ -39204,7 +40481,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -39214,13 +40492,14 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "The observations, the Mahalanobis distances of the which we\ncompute. Observations are assumed to be drawn from the same\ndistribution than the data used in fit." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Compute the squared Mahalanobis distances of given observations.", - "docstring": "Compute the squared Mahalanobis distances of given observations.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n The observations, the Mahalanobis distances of the which we\n compute. Observations are assumed to be drawn from the same\n distribution than the data used in fit.\n\nReturns\n-------\ndist : ndarray of shape (n_samples,)\n Squared Mahalanobis distances of the observations.", + "docstring": "Compute the squared Mahalanobis distances of given observations.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The observations, the Mahalanobis distances of the which we\n compute. Observations are assumed to be drawn from the same\n distribution than the data used in fit.\n\n Returns\n -------\n dist : ndarray of shape (n_samples,)\n Squared Mahalanobis distances of the observations.\n ", "source_code": "\ndef mahalanobis(self, X):\n \"\"\"Compute the squared Mahalanobis distances of given observations.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The observations, the Mahalanobis distances of the which we\n compute. Observations are assumed to be drawn from the same\n distribution than the data used in fit.\n\n Returns\n -------\n dist : ndarray of shape (n_samples,)\n Squared Mahalanobis distances of the observations.\n \"\"\"\n X = self._validate_data(X, reset=False)\n precision = self.get_precision()\n with config_context(assume_finite=True):\n dist = pairwise_distances(X, self.location_[np.newaxis, :], metric='mahalanobis', VI=precision)\n return np.reshape(dist, (len(X), ))**2" }, { @@ -39238,7 +40517,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X_test", @@ -39248,7 +40528,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "Test data of which we compute the likelihood, where `n_samples` is\nthe number of samples and `n_features` is the number of features.\n`X_test` is assumed to be drawn from the same distribution than\nthe data used in fit (including centering)." - } + }, + "refined_type": {} }, { "name": "y", @@ -39258,13 +40539,14 @@ "docstring": { "type": "Ignored", "description": "Not used, present for API consistency by convention." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Compute the log-likelihood of `X_test` under the estimated Gaussian model.\n\nThe Gaussian model is defined by its mean and covariance matrix which are represented respectively by `self.location_` and `self.covariance_`.", - "docstring": "Compute the log-likelihood of `X_test` under the estimated Gaussian model.\n\nThe Gaussian model is defined by its mean and covariance matrix which are\nrepresented respectively by `self.location_` and `self.covariance_`.\n\nParameters\n----------\nX_test : array-like of shape (n_samples, n_features)\n Test data of which we compute the likelihood, where `n_samples` is\n the number of samples and `n_features` is the number of features.\n `X_test` is assumed to be drawn from the same distribution than\n the data used in fit (including centering).\n\ny : Ignored\n Not used, present for API consistency by convention.\n\nReturns\n-------\nres : float\n The log-likelihood of `X_test` with `self.location_` and `self.covariance_`\n as estimators of the Gaussian model mean and covariance matrix respectively.", + "description": "Compute the log-likelihood of `X_test` under the estimated Gaussian model.\n\nThe Gaussian model is defined by its mean and covariance matrix which are\nrepresented respectively by `self.location_` and `self.covariance_`.", + "docstring": "Compute the log-likelihood of `X_test` under the estimated Gaussian model.\n\n The Gaussian model is defined by its mean and covariance matrix which are\n represented respectively by `self.location_` and `self.covariance_`.\n\n Parameters\n ----------\n X_test : array-like of shape (n_samples, n_features)\n Test data of which we compute the likelihood, where `n_samples` is\n the number of samples and `n_features` is the number of features.\n `X_test` is assumed to be drawn from the same distribution than\n the data used in fit (including centering).\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n Returns\n -------\n res : float\n The log-likelihood of `X_test` with `self.location_` and `self.covariance_`\n as estimators of the Gaussian model mean and covariance matrix respectively.\n ", "source_code": "\ndef score(self, X_test, y=None):\n \"\"\"Compute the log-likelihood of `X_test` under the estimated Gaussian model.\n\n The Gaussian model is defined by its mean and covariance matrix which are\n represented respectively by `self.location_` and `self.covariance_`.\n\n Parameters\n ----------\n X_test : array-like of shape (n_samples, n_features)\n Test data of which we compute the likelihood, where `n_samples` is\n the number of samples and `n_features` is the number of features.\n `X_test` is assumed to be drawn from the same distribution than\n the data used in fit (including centering).\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n Returns\n -------\n res : float\n The log-likelihood of `X_test` with `self.location_` and `self.covariance_`\n as estimators of the Gaussian model mean and covariance matrix respectively.\n \"\"\"\n X_test = self._validate_data(X_test, reset=False)\n test_cov = empirical_covariance(X_test - self.location_, assume_centered=True)\n res = log_likelihood(test_cov, self.get_precision())\n return res" }, { @@ -39281,8 +40563,9 @@ "assigned_by": "POSITION_OR_NAME", "docstring": { "type": "ndarray of shape (n_samples, n_features)", - "description": "Data from which to compute the covariance estimate" - } + "description": "Data from which to 
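A brief usage sketch of the estimator's `fit`, `mahalanobis` and `error_norm` methods documented above; the synthetic data and the identity reference covariance are assumptions.

# Synthetic data is an assumption; exercises fit / mahalanobis / error_norm.
import numpy as np
from sklearn.covariance import EmpiricalCovariance

rng = np.random.RandomState(42)
X = rng.multivariate_normal(mean=[0.0, 0.0], cov=[[1.0, 0.3], [0.3, 1.0]], size=500)
cov = EmpiricalCovariance().fit(X)
d2 = cov.mahalanobis(X)          # squared Mahalanobis distances, shape (500,)
mse = cov.error_norm(np.eye(2))  # Frobenius error against an identity reference
ll = cov.score(X)                # log-likelihood of X under the fitted Gaussian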
compute the covariance estimate." + }, + "refined_type": {} }, { "name": "assume_centered", @@ -39291,15 +40574,16 @@ "assigned_by": "NAME_ONLY", "docstring": { "type": "bool, default=False", - "description": "If True, data will not be centered before computation.\nUseful when working with data whose mean is almost, but not exactly\nzero.\nIf False, data will be centered before computation." - } + "description": "If `True`, data will not be centered before computation.\nUseful when working with data whose mean is almost, but not exactly\nzero.\nIf `False`, data will be centered before computation." + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Computes the Maximum likelihood covariance estimator", - "docstring": "Computes the Maximum likelihood covariance estimator\n\n\nParameters\n----------\nX : ndarray of shape (n_samples, n_features)\n Data from which to compute the covariance estimate\n\nassume_centered : bool, default=False\n If True, data will not be centered before computation.\n Useful when working with data whose mean is almost, but not exactly\n zero.\n If False, data will be centered before computation.\n\nReturns\n-------\ncovariance : ndarray of shape (n_features, n_features)\n Empirical covariance (Maximum Likelihood Estimator).\n\nExamples\n--------\n>>> from sklearn.covariance import empirical_covariance\n>>> X = [[1,1,1],[1,1,1],[1,1,1],\n... [0,0,0],[0,0,0],[0,0,0]]\n>>> empirical_covariance(X)\narray([[0.25, 0.25, 0.25],\n [0.25, 0.25, 0.25],\n [0.25, 0.25, 0.25]])", - "source_code": "\ndef empirical_covariance(X, *, assume_centered=False):\n \"\"\"Computes the Maximum likelihood covariance estimator\n\n\n Parameters\n ----------\n X : ndarray of shape (n_samples, n_features)\n Data from which to compute the covariance estimate\n\n assume_centered : bool, default=False\n If True, data will not be centered before computation.\n Useful when working with data whose mean is almost, but not exactly\n zero.\n If False, data will be centered before computation.\n\n Returns\n -------\n covariance : ndarray of shape (n_features, n_features)\n Empirical covariance (Maximum Likelihood Estimator).\n\n Examples\n --------\n >>> from sklearn.covariance import empirical_covariance\n >>> X = [[1,1,1],[1,1,1],[1,1,1],\n ... [0,0,0],[0,0,0],[0,0,0]]\n >>> empirical_covariance(X)\n array([[0.25, 0.25, 0.25],\n [0.25, 0.25, 0.25],\n [0.25, 0.25, 0.25]])\n \"\"\"\n X = np.asarray(X)\n if X.ndim == 1:\n X = np.reshape(X, (1, -1))\n if X.shape[0] == 1:\n warnings.warn('Only one sample available. 
You may want to reshape your data array')\n if assume_centered:\n covariance = np.dot(X.T, X) / X.shape[0]\n else:\n covariance = np.cov(X.T, bias=1)\n if covariance.ndim == 0:\n covariance = np.array([[covariance]])\n return covariance" + "description": "Compute the Maximum likelihood covariance estimator.", + "docstring": "Compute the Maximum likelihood covariance estimator.\n\n Parameters\n ----------\n X : ndarray of shape (n_samples, n_features)\n Data from which to compute the covariance estimate.\n\n assume_centered : bool, default=False\n If `True`, data will not be centered before computation.\n Useful when working with data whose mean is almost, but not exactly\n zero.\n If `False`, data will be centered before computation.\n\n Returns\n -------\n covariance : ndarray of shape (n_features, n_features)\n Empirical covariance (Maximum Likelihood Estimator).\n\n Examples\n --------\n >>> from sklearn.covariance import empirical_covariance\n >>> X = [[1,1,1],[1,1,1],[1,1,1],\n ... [0,0,0],[0,0,0],[0,0,0]]\n >>> empirical_covariance(X)\n array([[0.25, 0.25, 0.25],\n [0.25, 0.25, 0.25],\n [0.25, 0.25, 0.25]])\n ", + "source_code": "\ndef empirical_covariance(X, *, assume_centered=False):\n \"\"\"Compute the Maximum likelihood covariance estimator.\n\n Parameters\n ----------\n X : ndarray of shape (n_samples, n_features)\n Data from which to compute the covariance estimate.\n\n assume_centered : bool, default=False\n If `True`, data will not be centered before computation.\n Useful when working with data whose mean is almost, but not exactly\n zero.\n If `False`, data will be centered before computation.\n\n Returns\n -------\n covariance : ndarray of shape (n_features, n_features)\n Empirical covariance (Maximum Likelihood Estimator).\n\n Examples\n --------\n >>> from sklearn.covariance import empirical_covariance\n >>> X = [[1,1,1],[1,1,1],[1,1,1],\n ... [0,0,0],[0,0,0],[0,0,0]]\n >>> empirical_covariance(X)\n array([[0.25, 0.25, 0.25],\n [0.25, 0.25, 0.25],\n [0.25, 0.25, 0.25]])\n \"\"\"\n X = np.asarray(X)\n if X.ndim == 1:\n X = np.reshape(X, (1, -1))\n if X.shape[0] == 1:\n warnings.warn('Only one sample available. You may want to reshape your data array')\n if assume_centered:\n covariance = np.dot(X.T, X) / X.shape[0]\n else:\n covariance = np.cov(X.T, bias=1)\n if covariance.ndim == 0:\n covariance = np.array([[covariance]])\n return covariance" }, { "name": "log_likelihood", @@ -39316,7 +40600,8 @@ "docstring": { "type": "ndarray of shape (n_features, n_features)", "description": "Maximum Likelihood Estimator of covariance." - } + }, + "refined_type": {} }, { "name": "precision", @@ -39326,14 +40611,15 @@ "docstring": { "type": "ndarray of shape (n_features, n_features)", "description": "The precision matrix of the covariance model to be tested." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Computes the sample mean of the log_likelihood under a covariance model\n\ncomputes the empirical expected log-likelihood (accounting for the normalization terms and scaling), allowing for universal comparison (beyond this software package)", - "docstring": "Computes the sample mean of the log_likelihood under a covariance model\n\ncomputes the empirical expected log-likelihood (accounting for the\nnormalization terms and scaling), allowing for universal comparison (beyond\nthis software package)\n\nParameters\n----------\nemp_cov : ndarray of shape (n_features, n_features)\n Maximum Likelihood Estimator of covariance.\n\nprecision : ndarray of shape (n_features, n_features)\n The precision matrix of the covariance model to be tested.\n\nReturns\n-------\nlog_likelihood_ : float\n Sample mean of the log-likelihood.", - "source_code": "\ndef log_likelihood(emp_cov, precision):\n \"\"\"Computes the sample mean of the log_likelihood under a covariance model\n\n computes the empirical expected log-likelihood (accounting for the\n normalization terms and scaling), allowing for universal comparison (beyond\n this software package)\n\n Parameters\n ----------\n emp_cov : ndarray of shape (n_features, n_features)\n Maximum Likelihood Estimator of covariance.\n\n precision : ndarray of shape (n_features, n_features)\n The precision matrix of the covariance model to be tested.\n\n Returns\n -------\n log_likelihood_ : float\n Sample mean of the log-likelihood.\n \"\"\"\n p = precision.shape[0]\n log_likelihood_ = -np.sum(emp_cov * precision) + fast_logdet(precision)\n log_likelihood_ -= p * np.log(2 * np.pi)\n log_likelihood_ /= 2.0\n return log_likelihood_" + "description": "Compute the sample mean of the log_likelihood under a covariance model.\n\nComputes the empirical expected log-likelihood, allowing for universal\ncomparison (beyond this software package), and accounts for normalization\nterms and scaling.", + "docstring": "Compute the sample mean of the log_likelihood under a covariance model.\n\n Computes the empirical expected log-likelihood, allowing for universal\n comparison (beyond this software package), and accounts for normalization\n terms and scaling.\n\n Parameters\n ----------\n emp_cov : ndarray of shape (n_features, n_features)\n Maximum Likelihood Estimator of covariance.\n\n precision : ndarray of shape (n_features, n_features)\n The precision matrix of the covariance model to be tested.\n\n Returns\n -------\n log_likelihood_ : float\n Sample mean of the log-likelihood.\n ", + "source_code": "\ndef log_likelihood(emp_cov, precision):\n \"\"\"Compute the sample mean of the log_likelihood under a covariance model.\n\n Computes the empirical expected log-likelihood, allowing for universal\n comparison (beyond this software package), and accounts for normalization\n terms and scaling.\n\n Parameters\n ----------\n emp_cov : ndarray of shape (n_features, n_features)\n Maximum Likelihood Estimator of covariance.\n\n precision : ndarray of shape (n_features, n_features)\n The precision matrix of the covariance model to be tested.\n\n Returns\n -------\n log_likelihood_ : float\n Sample mean of the log-likelihood.\n \"\"\"\n p = precision.shape[0]\n log_likelihood_ = -np.sum(emp_cov * precision) + fast_logdet(precision)\n log_likelihood_ -= p * np.log(2 * np.pi)\n log_likelihood_ /= 2.0\n return log_likelihood_" }, { "name": "__init__", @@ -39350,7 +40636,8 @@ "docstring": { "type": "", 
"description": "" - } + }, + "refined_type": {} }, { "name": "alpha", @@ -39360,7 +40647,8 @@ "docstring": { "type": "float, default=0.01", "description": "The regularization parameter: the higher alpha, the more\nregularization, the sparser the inverse covariance.\nRange is (0, inf]." - } + }, + "refined_type": {} }, { "name": "mode", @@ -39370,6 +40658,10 @@ "docstring": { "type": "{'cd', 'lars'}, default='cd'", "description": "The Lasso solver to use: coordinate descent or LARS. Use LARS for\nvery sparse underlying graphs, where p > n. Elsewhere prefer cd\nwhich is more numerically stable." + }, + "refined_type": { + "kind": "EnumType", + "values": ["cd", "lars"] } }, { @@ -39380,7 +40672,8 @@ "docstring": { "type": "float, default=1e-4", "description": "The tolerance to declare convergence: if the dual gap goes below\nthis value, iterations are stopped. Range is (0, inf]." - } + }, + "refined_type": {} }, { "name": "enet_tol", @@ -39390,7 +40683,8 @@ "docstring": { "type": "float, default=1e-4", "description": "The tolerance for the elastic net solver used to calculate the descent\ndirection. This parameter controls the accuracy of the search direction\nfor a given column update, not of the overall parameter estimate. Only\nused for mode='cd'. Range is (0, inf]." - } + }, + "refined_type": {} }, { "name": "max_iter", @@ -39400,7 +40694,8 @@ "docstring": { "type": "int, default=100", "description": "The maximum number of iterations." - } + }, + "refined_type": {} }, { "name": "verbose", @@ -39410,7 +40705,8 @@ "docstring": { "type": "bool, default=False", "description": "If verbose is True, the objective function and dual gap are\nplotted at each iteration." - } + }, + "refined_type": {} }, { "name": "assume_centered", @@ -39420,13 +40716,14 @@ "docstring": { "type": "bool, default=False", "description": "If True, data are not centered before computation.\nUseful when working with data whose mean is almost, but not exactly\nzero.\nIf False, data are centered before computation." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, alpha=0.01, *, mode='cd', tol=0.0001, enet_tol=0.0001, max_iter=100, verbose=False, assume_centered=False):\n super().__init__(assume_centered=assume_centered)\n self.alpha = alpha\n self.mode = mode\n self.tol = tol\n self.enet_tol = enet_tol\n self.max_iter = max_iter\n self.verbose = verbose" }, { @@ -39444,7 +40741,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -39454,7 +40752,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "Data from which to compute the covariance estimate." - } + }, + "refined_type": {} }, { "name": "y", @@ -39464,13 +40763,14 @@ "docstring": { "type": "Ignored", "description": "Not used, present for API consistency by convention." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Fit the GraphicalLasso model to X.", - "docstring": "Fit the GraphicalLasso model to X.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n Data from which to compute the covariance estimate.\n\ny : Ignored\n Not used, present for API consistency by convention.\n\nReturns\n-------\nself : object\n Returns the instance itself.", + "docstring": "Fit the GraphicalLasso model to X.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Data from which to compute the covariance estimate.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n Returns\n -------\n self : object\n Returns the instance itself.\n ", "source_code": "\ndef fit(self, X, y=None):\n \"\"\"Fit the GraphicalLasso model to X.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Data from which to compute the covariance estimate.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n Returns\n -------\n self : object\n Returns the instance itself.\n \"\"\"\n X = self._validate_data(X, ensure_min_features=2, ensure_min_samples=2, estimator=self)\n if self.assume_centered:\n self.location_ = np.zeros(X.shape[1])\n else:\n self.location_ = X.mean(0)\n emp_cov = empirical_covariance(X, assume_centered=self.assume_centered)\n (self.covariance_, self.precision_, self.n_iter_) = graphical_lasso(emp_cov, alpha=self.alpha, mode=self.mode, tol=self.tol, enet_tol=self.enet_tol, max_iter=self.max_iter, verbose=self.verbose, return_n_iter=True)\n return self" }, { @@ -39488,7 +40788,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "alphas", @@ -39498,7 +40799,8 @@ "docstring": { "type": "int or array-like of shape (n_alphas,), dtype=float, default=4", "description": "If an integer is given, it fixes the number of points on the\ngrids of alpha to be used. If a list is given, it gives the\ngrid to be used. See the notes in the class docstring for\nmore details. Range is (0, inf] when floats given." - } + }, + "refined_type": {} }, { "name": "n_refinements", @@ -39508,7 +40810,8 @@ "docstring": { "type": "int, default=4", "description": "The number of times the grid is refined. Not used if explicit\nvalues of alphas are passed. Range is [1, inf)." - } + }, + "refined_type": {} }, { "name": "cv", @@ -39518,7 +40821,8 @@ "docstring": { "type": "int, cross-validation generator or iterable, default=None", "description": "Determines the cross-validation splitting strategy.\nPossible inputs for cv are:\n\n- None, to use the default 5-fold cross-validation,\n- integer, to specify the number of folds.\n- :term:`CV splitter`,\n- An iterable yielding (train, test) splits as arrays of indices.\n\nFor integer/None inputs :class:`KFold` is used.\n\nRefer :ref:`User Guide ` for the various\ncross-validation strategies that can be used here.\n\n.. versionchanged:: 0.20\n ``cv`` default value if None changed from 3-fold to 5-fold." - } + }, + "refined_type": {} }, { "name": "tol", @@ -39528,7 +40832,8 @@ "docstring": { "type": "float, default=1e-4", "description": "The tolerance to declare convergence: if the dual gap goes below\nthis value, iterations are stopped. Range is (0, inf]." - } + }, + "refined_type": {} }, { "name": "enet_tol", @@ -39538,7 +40843,8 @@ "docstring": { "type": "float, default=1e-4", "description": "The tolerance for the elastic net solver used to calculate the descent\ndirection. 
This parameter controls the accuracy of the search direction\nfor a given column update, not of the overall parameter estimate. Only\nused for mode='cd'. Range is (0, inf]." - } + }, + "refined_type": {} }, { "name": "max_iter", @@ -39548,7 +40854,8 @@ "docstring": { "type": "int, default=100", "description": "Maximum number of iterations." - } + }, + "refined_type": {} }, { "name": "mode", @@ -39558,6 +40865,10 @@ "docstring": { "type": "{'cd', 'lars'}, default='cd'", "description": "The Lasso solver to use: coordinate descent or LARS. Use LARS for\nvery sparse underlying graphs, where number of features is greater\nthan number of samples. Elsewhere prefer cd which is more numerically\nstable." + }, + "refined_type": { + "kind": "EnumType", + "values": ["cd", "lars"] } }, { @@ -39568,7 +40879,8 @@ "docstring": { "type": "int, default=None", "description": "Number of jobs to run in parallel.\n``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n``-1`` means using all processors. See :term:`Glossary `\nfor more details.\n\n.. versionchanged:: v0.20\n `n_jobs` default changed from 1 to None" - } + }, + "refined_type": {} }, { "name": "verbose", @@ -39578,7 +40890,8 @@ "docstring": { "type": "bool, default=False", "description": "If verbose is True, the objective function and duality gap are\nprinted at each iteration." - } + }, + "refined_type": {} }, { "name": "assume_centered", @@ -39588,13 +40901,14 @@ "docstring": { "type": "bool, default=False", "description": "If True, data are not centered before computation.\nUseful when working with data whose mean is almost, but not exactly\nzero.\nIf False, data are centered before computation." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, *, alphas=4, n_refinements=4, cv=None, tol=0.0001, enet_tol=0.0001, max_iter=100, mode='cd', n_jobs=None, verbose=False, assume_centered=False):\n super().__init__(mode=mode, tol=tol, verbose=verbose, enet_tol=enet_tol, max_iter=max_iter, assume_centered=assume_centered)\n self.alphas = alphas\n self.n_refinements = n_refinements\n self.cv = cv\n self.n_jobs = n_jobs" }, { @@ -39615,13 +40929,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@deprecated(\"The `cv_alphas_` attribute is deprecated in version 0.24 in favor of `cv_results_['alpha']` and will be removed in version 1.1 (renaming of 0.26).\")\n@property\ndef cv_alphas_(self):\n return self.cv_results_['alphas'].tolist()" }, { @@ -39639,7 +40954,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -39649,7 +40965,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "Data from which to compute the covariance estimate." - } + }, + "refined_type": {} }, { "name": "y", @@ -39659,13 +40976,14 @@ "docstring": { "type": "Ignored", "description": "Not used, present for API consistency by convention." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Fit the GraphicalLasso covariance model to X.", - "docstring": "Fit the GraphicalLasso covariance model to X.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n Data from which to compute the covariance estimate.\n\ny : Ignored\n Not used, present for API consistency by convention.\n\nReturns\n-------\nself : object\n Returns the instance itself.", + "docstring": "Fit the GraphicalLasso covariance model to X.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Data from which to compute the covariance estimate.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n Returns\n -------\n self : object\n Returns the instance itself.\n ", "source_code": "\ndef fit(self, X, y=None):\n \"\"\"Fit the GraphicalLasso covariance model to X.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Data from which to compute the covariance estimate.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n Returns\n -------\n self : object\n Returns the instance itself.\n \"\"\"\n X = self._validate_data(X, ensure_min_features=2, estimator=self)\n if self.assume_centered:\n self.location_ = np.zeros(X.shape[1])\n else:\n self.location_ = X.mean(0)\n emp_cov = empirical_covariance(X, assume_centered=self.assume_centered)\n cv = check_cv(self.cv, y, classifier=False)\n path = list()\n n_alphas = self.alphas\n inner_verbose = max(0, self.verbose - 1)\n if isinstance(n_alphas, Sequence):\n alphas = self.alphas\n n_refinements = 1\n else:\n n_refinements = self.n_refinements\n alpha_1 = alpha_max(emp_cov)\n alpha_0 = 0.01 * alpha_1\n alphas = np.logspace(np.log10(alpha_0), np.log10(alpha_1), n_alphas)[::-1]\n t0 = time.time()\n for i in range(n_refinements):\n with warnings.catch_warnings():\n warnings.simplefilter('ignore', ConvergenceWarning)\n this_path = Parallel(n_jobs=self.n_jobs, verbose=self.verbose)((delayed(graphical_lasso_path)(X[train], alphas=alphas, X_test=X[test], mode=self.mode, tol=self.tol, enet_tol=self.enet_tol, max_iter=int(0.1 * self.max_iter), verbose=inner_verbose) for (train, test) in cv.split(X, y)))\n (covs, _, scores) = zip(*this_path)\n covs = zip(*covs)\n scores = zip(*scores)\n path.extend(zip(alphas, scores, covs))\n path = sorted(path, key=operator.itemgetter(0), reverse=True)\n best_score = -np.inf\n last_finite_idx = 0\n for (index, (alpha, scores, _)) in enumerate(path):\n this_score = np.mean(scores)\n if this_score >= 0.1 / np.finfo(np.float64).eps:\n this_score = np.nan\n if np.isfinite(this_score):\n last_finite_idx = index\n if this_score >= best_score:\n best_score = this_score\n best_index = index\n if best_index == 0:\n alpha_1 = path[0][0]\n alpha_0 = path[1][0]\n elif best_index == last_finite_idx and not best_index == len(path) - 1:\n alpha_1 = path[best_index][0]\n alpha_0 = path[best_index + 1][0]\n elif best_index == len(path) - 1:\n alpha_1 = path[best_index][0]\n alpha_0 = 0.01 * path[best_index][0]\n else:\n alpha_1 = path[best_index - 1][0]\n alpha_0 = path[best_index + 1][0]\n if not isinstance(n_alphas, Sequence):\n alphas = np.logspace(np.log10(alpha_1), np.log10(alpha_0), n_alphas + 2)\n alphas = alphas[1:-1]\n if self.verbose and n_refinements > 1:\n print('[GraphicalLassoCV] Done refinement % 2i out of %i: % 3is' % (i + 1, n_refinements, time.time() - t0))\n path = list(zip(*path))\n grid_scores = list(path[1])\n alphas = list(path[0])\n 
alphas.append(0)\n grid_scores.append(cross_val_score(EmpiricalCovariance(), X, cv=cv, n_jobs=self.n_jobs, verbose=inner_verbose))\n grid_scores = np.array(grid_scores)\n self.cv_results_ = _DictWithDeprecatedKeys(alphas=np.array(alphas))\n for i in range(grid_scores.shape[1]):\n self.cv_results_._set_deprecated(grid_scores[:, i], new_key=f'split{i}_test_score', deprecated_key=f'split{i}_score')\n self.cv_results_._set_deprecated(np.mean(grid_scores, axis=1), new_key='mean_test_score', deprecated_key='mean_score')\n self.cv_results_._set_deprecated(np.std(grid_scores, axis=1), new_key='std_test_score', deprecated_key='std_score')\n best_alpha = alphas[best_index]\n self.alpha_ = best_alpha\n (self.covariance_, self.precision_, self.n_iter_) = graphical_lasso(emp_cov, alpha=best_alpha, mode=self.mode, tol=self.tol, enet_tol=self.enet_tol, max_iter=self.max_iter, verbose=inner_verbose, return_n_iter=True)\n return self" }, { @@ -39686,13 +41004,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@deprecated('The `grid_scores_` attribute is deprecated in version 0.24 in favor of `cv_results_` and will be removed in version 1.1 (renaming of 0.26).')\n@property\ndef grid_scores_(self):\n n_splits = len([key for key in self.cv_results_ if key.startswith('split') and key.endswith('_test_score')])\n return np.asarray([self.cv_results_['split{}_test_score'.format(i)] for i in range(n_splits)]).T" }, { @@ -39710,7 +41029,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "key", @@ -39720,13 +41040,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __getitem__(self, key):\n if key in self._deprecated_key_to_new_key:\n warnings.warn(f\"Key: '{key}', is deprecated in 1.0 and will be removed in 1.2. 
Use '{self._deprecated_key_to_new_key[key]}' instead\", FutureWarning)\n return super().__getitem__(key)" }, { @@ -39744,13 +41065,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, **kwargs):\n super().__init__(**kwargs)\n self._deprecated_key_to_new_key = {}" }, { @@ -39768,7 +41090,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "value", @@ -39778,7 +41101,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "new_key", @@ -39788,7 +41112,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "deprecated_key", @@ -39798,13 +41123,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _set_deprecated(self, value, *, new_key, deprecated_key):\n self._deprecated_key_to_new_key[deprecated_key] = new_key\n self[new_key] = self[deprecated_key] = value" }, { @@ -39822,7 +41148,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "precision_", @@ -39832,7 +41159,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "alpha", @@ -39842,13 +41170,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Expression of the dual gap convergence criterion\n\nThe specific definition is given in Duchi \"Projected Subgradient Methods for Learning Sparse Gaussians\".", - "docstring": "Expression of the dual gap convergence criterion\n\nThe specific definition is given in Duchi \"Projected Subgradient Methods\nfor Learning Sparse Gaussians\".", + "description": "Expression of the dual gap convergence criterion\n\nThe specific definition is given in Duchi \"Projected Subgradient Methods\nfor Learning Sparse Gaussians\".", + "docstring": "Expression of the dual gap convergence criterion\n\n The specific definition is given in Duchi \"Projected Subgradient Methods\n for Learning Sparse Gaussians\".\n ", "source_code": "\ndef _dual_gap(emp_cov, precision_, alpha):\n \"\"\"Expression of the dual gap convergence criterion\n\n The specific definition is given in Duchi \"Projected Subgradient Methods\n for Learning Sparse Gaussians\".\n \"\"\"\n gap = np.sum(emp_cov * precision_)\n gap -= precision_.shape[0]\n gap += alpha * (np.abs(precision_).sum() - np.abs(np.diag(precision_)).sum())\n return gap" }, { @@ -39866,7 +41195,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "precision_", @@ -39876,7 +41206,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "alpha", @@ -39886,13 +41217,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Evaluation of the graphical-lasso objective function\n\nthe objective function is made of a shifted scaled version of the normalized log-likelihood (i.e. its empirical mean over the samples) and a penalisation term to promote sparsity", - "docstring": "Evaluation of the graphical-lasso objective function\n\nthe objective function is made of a shifted scaled version of the\nnormalized log-likelihood (i.e. 
its empirical mean over the samples) and a\npenalisation term to promote sparsity", + "description": "Evaluation of the graphical-lasso objective function\n\nthe objective function is made of a shifted scaled version of the\nnormalized log-likelihood (i.e. its empirical mean over the samples) and a\npenalisation term to promote sparsity", + "docstring": "Evaluation of the graphical-lasso objective function\n\n the objective function is made of a shifted scaled version of the\n normalized log-likelihood (i.e. its empirical mean over the samples) and a\n penalisation term to promote sparsity\n ", "source_code": "\ndef _objective(mle, precision_, alpha):\n \"\"\"Evaluation of the graphical-lasso objective function\n\n the objective function is made of a shifted scaled version of the\n normalized log-likelihood (i.e. its empirical mean over the samples) and a\n penalisation term to promote sparsity\n \"\"\"\n p = precision_.shape[0]\n cost = -2.0 * log_likelihood(mle, precision_) + p * np.log(2 * np.pi)\n cost += alpha * (np.abs(precision_).sum() - np.abs(np.diag(precision_)).sum())\n return cost" }, { @@ -39910,13 +41242,14 @@ "docstring": { "type": "ndarray of shape (n_features, n_features)", "description": "The sample covariance matrix." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Find the maximum alpha for which there are some non-zeros off-diagonal.", - "docstring": "Find the maximum alpha for which there are some non-zeros off-diagonal.\n\nParameters\n----------\nemp_cov : ndarray of shape (n_features, n_features)\n The sample covariance matrix.\n\nNotes\n-----\nThis results from the bound for the all the Lasso that are solved\nin GraphicalLasso: each time, the row of cov corresponds to Xy. As the\nbound for alpha is given by `max(abs(Xy))`, the result follows.", + "docstring": "Find the maximum alpha for which there are some non-zeros off-diagonal.\n\n Parameters\n ----------\n emp_cov : ndarray of shape (n_features, n_features)\n The sample covariance matrix.\n\n Notes\n -----\n This results from the bound for the all the Lasso that are solved\n in GraphicalLasso: each time, the row of cov corresponds to Xy. As the\n bound for alpha is given by `max(abs(Xy))`, the result follows.\n ", "source_code": "\ndef alpha_max(emp_cov):\n \"\"\"Find the maximum alpha for which there are some non-zeros off-diagonal.\n\n Parameters\n ----------\n emp_cov : ndarray of shape (n_features, n_features)\n The sample covariance matrix.\n\n Notes\n -----\n This results from the bound for the all the Lasso that are solved\n in GraphicalLasso: each time, the row of cov corresponds to Xy. As the\n bound for alpha is given by `max(abs(Xy))`, the result follows.\n \"\"\"\n A = np.copy(emp_cov)\n A.flat[::A.shape[0] + 1] = 0\n return np.max(np.abs(A))" }, { @@ -39934,7 +41267,8 @@ "docstring": { "type": "ndarray of shape (n_features, n_features)", "description": "Empirical covariance from which to compute the covariance estimate." - } + }, + "refined_type": {} }, { "name": "alpha", @@ -39944,7 +41278,8 @@ "docstring": { "type": "float", "description": "The regularization parameter: the higher alpha, the more\nregularization, the sparser the inverse covariance.\nRange is (0, inf]." - } + }, + "refined_type": {} }, { "name": "cov_init", @@ -39954,7 +41289,8 @@ "docstring": { "type": "array of shape (n_features, n_features), default=None", "description": "The initial guess for the covariance. If None, then the empirical\ncovariance is used." 
- } + }, + "refined_type": {} }, { "name": "mode", @@ -39964,6 +41300,10 @@ "docstring": { "type": "{'cd', 'lars'}, default='cd'", "description": "The Lasso solver to use: coordinate descent or LARS. Use LARS for\nvery sparse underlying graphs, where p > n. Elsewhere prefer cd\nwhich is more numerically stable." + }, + "refined_type": { + "kind": "EnumType", + "values": ["cd", "lars"] } }, { @@ -39974,7 +41314,8 @@ "docstring": { "type": "float, default=1e-4", "description": "The tolerance to declare convergence: if the dual gap goes below\nthis value, iterations are stopped. Range is (0, inf]." - } + }, + "refined_type": {} }, { "name": "enet_tol", @@ -39984,7 +41325,8 @@ "docstring": { "type": "float, default=1e-4", "description": "The tolerance for the elastic net solver used to calculate the descent\ndirection. This parameter controls the accuracy of the search direction\nfor a given column update, not of the overall parameter estimate. Only\nused for mode='cd'. Range is (0, inf]." - } + }, + "refined_type": {} }, { "name": "max_iter", @@ -39994,7 +41336,8 @@ "docstring": { "type": "int, default=100", "description": "The maximum number of iterations." - } + }, + "refined_type": {} }, { "name": "verbose", @@ -40004,7 +41347,8 @@ "docstring": { "type": "bool, default=False", "description": "If verbose is True, the objective function and dual gap are\nprinted at each iteration." - } + }, + "refined_type": {} }, { "name": "return_costs", @@ -40014,7 +41358,8 @@ "docstring": { "type": "bool, default=Flase", "description": "If return_costs is True, the objective function and dual gap\nat each iteration are returned." - } + }, + "refined_type": {} }, { "name": "eps", @@ -40024,7 +41369,8 @@ "docstring": { "type": "float, default=eps", "description": "The machine-precision regularization in the computation of the\nCholesky diagonal factors. Increase this for very ill-conditioned\nsystems. Default is `np.finfo(np.float64).eps`." - } + }, + "refined_type": {} }, { "name": "return_n_iter", @@ -40034,13 +41380,14 @@ "docstring": { "type": "bool, default=False", "description": "Whether or not to return the number of iterations." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "l1-penalized covariance estimator\n\nRead more in the :ref:`User Guide `. .. versionchanged:: v0.20 graph_lasso has been renamed to graphical_lasso", - "docstring": "l1-penalized covariance estimator\n\nRead more in the :ref:`User Guide `.\n\n.. versionchanged:: v0.20\n graph_lasso has been renamed to graphical_lasso\n\nParameters\n----------\nemp_cov : ndarray of shape (n_features, n_features)\n Empirical covariance from which to compute the covariance estimate.\n\nalpha : float\n The regularization parameter: the higher alpha, the more\n regularization, the sparser the inverse covariance.\n Range is (0, inf].\n\ncov_init : array of shape (n_features, n_features), default=None\n The initial guess for the covariance. If None, then the empirical\n covariance is used.\n\nmode : {'cd', 'lars'}, default='cd'\n The Lasso solver to use: coordinate descent or LARS. Use LARS for\n very sparse underlying graphs, where p > n. Elsewhere prefer cd\n which is more numerically stable.\n\ntol : float, default=1e-4\n The tolerance to declare convergence: if the dual gap goes below\n this value, iterations are stopped. Range is (0, inf].\n\nenet_tol : float, default=1e-4\n The tolerance for the elastic net solver used to calculate the descent\n direction. 
This parameter controls the accuracy of the search direction\n for a given column update, not of the overall parameter estimate. Only\n used for mode='cd'. Range is (0, inf].\n\nmax_iter : int, default=100\n The maximum number of iterations.\n\nverbose : bool, default=False\n If verbose is True, the objective function and dual gap are\n printed at each iteration.\n\nreturn_costs : bool, default=Flase\n If return_costs is True, the objective function and dual gap\n at each iteration are returned.\n\neps : float, default=eps\n The machine-precision regularization in the computation of the\n Cholesky diagonal factors. Increase this for very ill-conditioned\n systems. Default is `np.finfo(np.float64).eps`.\n\nreturn_n_iter : bool, default=False\n Whether or not to return the number of iterations.\n\nReturns\n-------\ncovariance : ndarray of shape (n_features, n_features)\n The estimated covariance matrix.\n\nprecision : ndarray of shape (n_features, n_features)\n The estimated (sparse) precision matrix.\n\ncosts : list of (objective, dual_gap) pairs\n The list of values of the objective function and the dual gap at\n each iteration. Returned only if return_costs is True.\n\nn_iter : int\n Number of iterations. Returned only if `return_n_iter` is set to True.\n\nSee Also\n--------\nGraphicalLasso, GraphicalLassoCV\n\nNotes\n-----\nThe algorithm employed to solve this problem is the GLasso algorithm,\nfrom the Friedman 2008 Biostatistics paper. It is the same algorithm\nas in the R `glasso` package.\n\nOne possible difference with the `glasso` R package is that the\ndiagonal coefficients are not penalized.", + "description": "l1-penalized covariance estimator\n\nRead more in the :ref:`User Guide `.\n\n.. versionchanged:: v0.20\n graph_lasso has been renamed to graphical_lasso", + "docstring": "l1-penalized covariance estimator\n\n Read more in the :ref:`User Guide `.\n\n .. versionchanged:: v0.20\n graph_lasso has been renamed to graphical_lasso\n\n Parameters\n ----------\n emp_cov : ndarray of shape (n_features, n_features)\n Empirical covariance from which to compute the covariance estimate.\n\n alpha : float\n The regularization parameter: the higher alpha, the more\n regularization, the sparser the inverse covariance.\n Range is (0, inf].\n\n cov_init : array of shape (n_features, n_features), default=None\n The initial guess for the covariance. If None, then the empirical\n covariance is used.\n\n mode : {'cd', 'lars'}, default='cd'\n The Lasso solver to use: coordinate descent or LARS. Use LARS for\n very sparse underlying graphs, where p > n. Elsewhere prefer cd\n which is more numerically stable.\n\n tol : float, default=1e-4\n The tolerance to declare convergence: if the dual gap goes below\n this value, iterations are stopped. Range is (0, inf].\n\n enet_tol : float, default=1e-4\n The tolerance for the elastic net solver used to calculate the descent\n direction. This parameter controls the accuracy of the search direction\n for a given column update, not of the overall parameter estimate. Only\n used for mode='cd'. Range is (0, inf].\n\n max_iter : int, default=100\n The maximum number of iterations.\n\n verbose : bool, default=False\n If verbose is True, the objective function and dual gap are\n printed at each iteration.\n\n return_costs : bool, default=Flase\n If return_costs is True, the objective function and dual gap\n at each iteration are returned.\n\n eps : float, default=eps\n The machine-precision regularization in the computation of the\n Cholesky diagonal factors. 
Increase this for very ill-conditioned\n systems. Default is `np.finfo(np.float64).eps`.\n\n return_n_iter : bool, default=False\n Whether or not to return the number of iterations.\n\n Returns\n -------\n covariance : ndarray of shape (n_features, n_features)\n The estimated covariance matrix.\n\n precision : ndarray of shape (n_features, n_features)\n The estimated (sparse) precision matrix.\n\n costs : list of (objective, dual_gap) pairs\n The list of values of the objective function and the dual gap at\n each iteration. Returned only if return_costs is True.\n\n n_iter : int\n Number of iterations. Returned only if `return_n_iter` is set to True.\n\n See Also\n --------\n GraphicalLasso, GraphicalLassoCV\n\n Notes\n -----\n The algorithm employed to solve this problem is the GLasso algorithm,\n from the Friedman 2008 Biostatistics paper. It is the same algorithm\n as in the R `glasso` package.\n\n One possible difference with the `glasso` R package is that the\n diagonal coefficients are not penalized.\n ", "source_code": "\ndef graphical_lasso(emp_cov, alpha, *, cov_init=None, mode='cd', tol=0.0001, enet_tol=0.0001, max_iter=100, verbose=False, return_costs=False, eps=np.finfo(np.float64).eps, return_n_iter=False):\n \"\"\"l1-penalized covariance estimator\n\n Read more in the :ref:`User Guide `.\n\n .. versionchanged:: v0.20\n graph_lasso has been renamed to graphical_lasso\n\n Parameters\n ----------\n emp_cov : ndarray of shape (n_features, n_features)\n Empirical covariance from which to compute the covariance estimate.\n\n alpha : float\n The regularization parameter: the higher alpha, the more\n regularization, the sparser the inverse covariance.\n Range is (0, inf].\n\n cov_init : array of shape (n_features, n_features), default=None\n The initial guess for the covariance. If None, then the empirical\n covariance is used.\n\n mode : {'cd', 'lars'}, default='cd'\n The Lasso solver to use: coordinate descent or LARS. Use LARS for\n very sparse underlying graphs, where p > n. Elsewhere prefer cd\n which is more numerically stable.\n\n tol : float, default=1e-4\n The tolerance to declare convergence: if the dual gap goes below\n this value, iterations are stopped. Range is (0, inf].\n\n enet_tol : float, default=1e-4\n The tolerance for the elastic net solver used to calculate the descent\n direction. This parameter controls the accuracy of the search direction\n for a given column update, not of the overall parameter estimate. Only\n used for mode='cd'. Range is (0, inf].\n\n max_iter : int, default=100\n The maximum number of iterations.\n\n verbose : bool, default=False\n If verbose is True, the objective function and dual gap are\n printed at each iteration.\n\n return_costs : bool, default=Flase\n If return_costs is True, the objective function and dual gap\n at each iteration are returned.\n\n eps : float, default=eps\n The machine-precision regularization in the computation of the\n Cholesky diagonal factors. Increase this for very ill-conditioned\n systems. Default is `np.finfo(np.float64).eps`.\n\n return_n_iter : bool, default=False\n Whether or not to return the number of iterations.\n\n Returns\n -------\n covariance : ndarray of shape (n_features, n_features)\n The estimated covariance matrix.\n\n precision : ndarray of shape (n_features, n_features)\n The estimated (sparse) precision matrix.\n\n costs : list of (objective, dual_gap) pairs\n The list of values of the objective function and the dual gap at\n each iteration. 
Returned only if return_costs is True.\n\n n_iter : int\n Number of iterations. Returned only if `return_n_iter` is set to True.\n\n See Also\n --------\n GraphicalLasso, GraphicalLassoCV\n\n Notes\n -----\n The algorithm employed to solve this problem is the GLasso algorithm,\n from the Friedman 2008 Biostatistics paper. It is the same algorithm\n as in the R `glasso` package.\n\n One possible difference with the `glasso` R package is that the\n diagonal coefficients are not penalized.\n \"\"\"\n (_, n_features) = emp_cov.shape\n if alpha == 0:\n if return_costs:\n precision_ = linalg.inv(emp_cov)\n cost = -2.0 * log_likelihood(emp_cov, precision_)\n cost += n_features * np.log(2 * np.pi)\n d_gap = np.sum(emp_cov * precision_) - n_features\n if return_n_iter:\n return emp_cov, precision_, (cost, d_gap), 0\n else:\n return emp_cov, precision_, (cost, d_gap)\n elif return_n_iter:\n return emp_cov, linalg.inv(emp_cov), 0\n else:\n return emp_cov, linalg.inv(emp_cov)\n if cov_init is None:\n covariance_ = emp_cov.copy()\n else:\n covariance_ = cov_init.copy()\n covariance_ *= 0.95\n diagonal = emp_cov.flat[::n_features + 1]\n covariance_.flat[::n_features + 1] = diagonal\n precision_ = linalg.pinvh(covariance_)\n indices = np.arange(n_features)\n costs = list()\n if mode == 'cd':\n errors = dict(over='raise', invalid='ignore')\n else:\n errors = dict(invalid='raise')\n try:\n d_gap = np.inf\n sub_covariance = np.copy(covariance_[1:, 1:], order='C')\n for i in range(max_iter):\n for idx in range(n_features):\n if idx > 0:\n di = idx - 1\n sub_covariance[di] = covariance_[di][indices != idx]\n sub_covariance[:, di] = covariance_[:, di][indices != idx]\n else:\n sub_covariance[:] = covariance_[1:, 1:]\n row = emp_cov[idx, indices != idx]\n with np.errstate(**errors):\n if mode == 'cd':\n coefs = -(precision_[indices != idx, idx] / (precision_[idx, idx] + 1000 * eps))\n (coefs, _, _, _) = cd_fast.enet_coordinate_descent_gram(coefs, alpha, 0, sub_covariance, row, row, max_iter, enet_tol, check_random_state(None), False)\n else:\n (_, _, coefs) = lars_path_gram(Xy=row, Gram=sub_covariance, n_samples=row.size, alpha_min=alpha / (n_features - 1), copy_Gram=True, eps=eps, method='lars', return_path=False)\n precision_[idx, idx] = 1.0 / (covariance_[idx, idx] - np.dot(covariance_[indices != idx, idx], coefs))\n precision_[indices != idx, idx] = -precision_[idx, idx] * coefs\n precision_[idx, indices != idx] = -precision_[idx, idx] * coefs\n coefs = np.dot(sub_covariance, coefs)\n covariance_[idx, indices != idx] = coefs\n covariance_[indices != idx, idx] = coefs\n if not np.isfinite(precision_.sum()):\n raise FloatingPointError('The system is too ill-conditioned for this solver')\n d_gap = _dual_gap(emp_cov, precision_, alpha)\n cost = _objective(emp_cov, precision_, alpha)\n if verbose:\n print('[graphical_lasso] Iteration % 3i, cost % 3.2e, dual gap %.3e' % (i, cost, d_gap))\n if return_costs:\n costs.append((cost, d_gap))\n if np.abs(d_gap) < tol:\n break\n if not np.isfinite(cost) and i > 0:\n raise FloatingPointError('Non SPD result: the system is too ill-conditioned for this solver')\n else:\n warnings.warn('graphical_lasso: did not converge after %i iteration: dual gap: %.3e' % (max_iter, d_gap), ConvergenceWarning)\n except FloatingPointError as e:\n e.args = (e.args[0] + '. 
The system is too ill-conditioned for this solver', )\n raise e\n if return_costs:\n if return_n_iter:\n return covariance_, precision_, costs, i + 1\n else:\n return covariance_, precision_, costs\n elif return_n_iter:\n return covariance_, precision_, i + 1\n else:\n return covariance_, precision_" }, { @@ -40058,7 +41405,8 @@ "docstring": { "type": "ndarray of shape (n_samples, n_features)", "description": "Data from which to compute the covariance estimate." - } + }, + "refined_type": {} }, { "name": "alphas", @@ -40068,7 +41416,8 @@ "docstring": { "type": "array-like of shape (n_alphas,)", "description": "The list of regularization parameters, decreasing order." - } + }, + "refined_type": {} }, { "name": "cov_init", @@ -40078,7 +41427,8 @@ "docstring": { "type": "array of shape (n_features, n_features), default=None", "description": "The initial guess for the covariance." - } + }, + "refined_type": {} }, { "name": "X_test", @@ -40088,7 +41438,8 @@ "docstring": { "type": "array of shape (n_test_samples, n_features), default=None", "description": "Optional test matrix to measure generalisation error." - } + }, + "refined_type": {} }, { "name": "mode", @@ -40098,6 +41449,10 @@ "docstring": { "type": "{'cd', 'lars'}, default='cd'", "description": "The Lasso solver to use: coordinate descent or LARS. Use LARS for\nvery sparse underlying graphs, where p > n. Elsewhere prefer cd\nwhich is more numerically stable." + }, + "refined_type": { + "kind": "EnumType", + "values": ["cd", "lars"] } }, { @@ -40108,7 +41463,8 @@ "docstring": { "type": "float, default=1e-4", "description": "The tolerance to declare convergence: if the dual gap goes below\nthis value, iterations are stopped. The tolerance must be a positive\nnumber." - } + }, + "refined_type": {} }, { "name": "enet_tol", @@ -40118,7 +41474,8 @@ "docstring": { "type": "float, default=1e-4", "description": "The tolerance for the elastic net solver used to calculate the descent\ndirection. This parameter controls the accuracy of the search direction\nfor a given column update, not of the overall parameter estimate. Only\nused for mode='cd'. The tolerance must be a positive number." - } + }, + "refined_type": {} }, { "name": "max_iter", @@ -40128,7 +41485,8 @@ "docstring": { "type": "int, default=100", "description": "The maximum number of iterations. This parameter should be a strictly\npositive integer." - } + }, + "refined_type": {} }, { "name": "verbose", @@ -40138,13 +41496,14 @@ "docstring": { "type": "int or bool, default=False", "description": "The higher the verbosity flag, the more information is printed\nduring the fitting." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "l1-penalized covariance estimator along a path of decreasing alphas\n\nRead more in the :ref:`User Guide `.", - "docstring": "l1-penalized covariance estimator along a path of decreasing alphas\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\nX : ndarray of shape (n_samples, n_features)\n Data from which to compute the covariance estimate.\n\nalphas : array-like of shape (n_alphas,)\n The list of regularization parameters, decreasing order.\n\ncov_init : array of shape (n_features, n_features), default=None\n The initial guess for the covariance.\n\nX_test : array of shape (n_test_samples, n_features), default=None\n Optional test matrix to measure generalisation error.\n\nmode : {'cd', 'lars'}, default='cd'\n The Lasso solver to use: coordinate descent or LARS. 
Use LARS for\n very sparse underlying graphs, where p > n. Elsewhere prefer cd\n which is more numerically stable.\n\ntol : float, default=1e-4\n The tolerance to declare convergence: if the dual gap goes below\n this value, iterations are stopped. The tolerance must be a positive\n number.\n\nenet_tol : float, default=1e-4\n The tolerance for the elastic net solver used to calculate the descent\n direction. This parameter controls the accuracy of the search direction\n for a given column update, not of the overall parameter estimate. Only\n used for mode='cd'. The tolerance must be a positive number.\n\nmax_iter : int, default=100\n The maximum number of iterations. This parameter should be a strictly\n positive integer.\n\nverbose : int or bool, default=False\n The higher the verbosity flag, the more information is printed\n during the fitting.\n\nReturns\n-------\ncovariances_ : list of shape (n_alphas,) of ndarray of shape (n_features, n_features)\n The estimated covariance matrices.\n\nprecisions_ : list of shape (n_alphas,) of ndarray of shape (n_features, n_features)\n The estimated (sparse) precision matrices.\n\nscores_ : list of shape (n_alphas,), dtype=float\n The generalisation error (log-likelihood) on the test data.\n Returned only if test data is passed.", + "docstring": "l1-penalized covariance estimator along a path of decreasing alphas\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n X : ndarray of shape (n_samples, n_features)\n Data from which to compute the covariance estimate.\n\n alphas : array-like of shape (n_alphas,)\n The list of regularization parameters, decreasing order.\n\n cov_init : array of shape (n_features, n_features), default=None\n The initial guess for the covariance.\n\n X_test : array of shape (n_test_samples, n_features), default=None\n Optional test matrix to measure generalisation error.\n\n mode : {'cd', 'lars'}, default='cd'\n The Lasso solver to use: coordinate descent or LARS. Use LARS for\n very sparse underlying graphs, where p > n. Elsewhere prefer cd\n which is more numerically stable.\n\n tol : float, default=1e-4\n The tolerance to declare convergence: if the dual gap goes below\n this value, iterations are stopped. The tolerance must be a positive\n number.\n\n enet_tol : float, default=1e-4\n The tolerance for the elastic net solver used to calculate the descent\n direction. This parameter controls the accuracy of the search direction\n for a given column update, not of the overall parameter estimate. Only\n used for mode='cd'. The tolerance must be a positive number.\n\n max_iter : int, default=100\n The maximum number of iterations. 
This parameter should be a strictly\n positive integer.\n\n verbose : int or bool, default=False\n The higher the verbosity flag, the more information is printed\n during the fitting.\n\n Returns\n -------\n covariances_ : list of shape (n_alphas,) of ndarray of shape (n_features, n_features)\n The estimated covariance matrices.\n\n precisions_ : list of shape (n_alphas,) of ndarray of shape (n_features, n_features)\n The estimated (sparse) precision matrices.\n\n scores_ : list of shape (n_alphas,), dtype=float\n The generalisation error (log-likelihood) on the test data.\n Returned only if test data is passed.\n ", "source_code": "\ndef graphical_lasso_path(X, alphas, cov_init=None, X_test=None, mode='cd', tol=0.0001, enet_tol=0.0001, max_iter=100, verbose=False):\n \"\"\"l1-penalized covariance estimator along a path of decreasing alphas\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n X : ndarray of shape (n_samples, n_features)\n Data from which to compute the covariance estimate.\n\n alphas : array-like of shape (n_alphas,)\n The list of regularization parameters, decreasing order.\n\n cov_init : array of shape (n_features, n_features), default=None\n The initial guess for the covariance.\n\n X_test : array of shape (n_test_samples, n_features), default=None\n Optional test matrix to measure generalisation error.\n\n mode : {'cd', 'lars'}, default='cd'\n The Lasso solver to use: coordinate descent or LARS. Use LARS for\n very sparse underlying graphs, where p > n. Elsewhere prefer cd\n which is more numerically stable.\n\n tol : float, default=1e-4\n The tolerance to declare convergence: if the dual gap goes below\n this value, iterations are stopped. The tolerance must be a positive\n number.\n\n enet_tol : float, default=1e-4\n The tolerance for the elastic net solver used to calculate the descent\n direction. This parameter controls the accuracy of the search direction\n for a given column update, not of the overall parameter estimate. Only\n used for mode='cd'. The tolerance must be a positive number.\n\n max_iter : int, default=100\n The maximum number of iterations. 
This parameter should be a strictly\n positive integer.\n\n verbose : int or bool, default=False\n The higher the verbosity flag, the more information is printed\n during the fitting.\n\n Returns\n -------\n covariances_ : list of shape (n_alphas,) of ndarray of shape (n_features, n_features)\n The estimated covariance matrices.\n\n precisions_ : list of shape (n_alphas,) of ndarray of shape (n_features, n_features)\n The estimated (sparse) precision matrices.\n\n scores_ : list of shape (n_alphas,), dtype=float\n The generalisation error (log-likelihood) on the test data.\n Returned only if test data is passed.\n \"\"\"\n inner_verbose = max(0, verbose - 1)\n emp_cov = empirical_covariance(X)\n if cov_init is None:\n covariance_ = emp_cov.copy()\n else:\n covariance_ = cov_init\n covariances_ = list()\n precisions_ = list()\n scores_ = list()\n if X_test is not None:\n test_emp_cov = empirical_covariance(X_test)\n for alpha in alphas:\n try:\n (covariance_, precision_) = graphical_lasso(emp_cov, alpha=alpha, cov_init=covariance_, mode=mode, tol=tol, enet_tol=enet_tol, max_iter=max_iter, verbose=inner_verbose)\n covariances_.append(covariance_)\n precisions_.append(precision_)\n if X_test is not None:\n this_score = log_likelihood(test_emp_cov, precision_)\n except FloatingPointError:\n this_score = -np.inf\n covariances_.append(np.nan)\n precisions_.append(np.nan)\n if X_test is not None:\n if not np.isfinite(this_score):\n this_score = -np.inf\n scores_.append(this_score)\n if verbose == 1:\n sys.stderr.write('.')\n elif verbose > 1:\n if X_test is not None:\n print('[graphical_lasso_path] alpha: %.2e, score: %.2e' % (alpha, this_score))\n else:\n print('[graphical_lasso_path] alpha: %.2e' % alpha)\n if X_test is not None:\n return covariances_, precisions_, scores_\n return covariances_, precisions_" }, { @@ -40162,7 +41521,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "store_precision", @@ -40172,7 +41532,8 @@ "docstring": { "type": "bool, default=True", "description": "Specify if the estimated precision is stored." - } + }, + "refined_type": {} }, { "name": "assume_centered", @@ -40182,7 +41543,8 @@ "docstring": { "type": "bool, default=False", "description": "If True, the support of the robust location and the covariance\nestimates is computed, and a covariance estimate is recomputed from\nit, without centering the data.\nUseful to work with data whose mean is significantly equal to\nzero but is not exactly zero.\nIf False, the robust location and covariance are directly computed\nwith the FastMCD algorithm without additional treatment." - } + }, + "refined_type": {} }, { "name": "support_fraction", @@ -40192,7 +41554,8 @@ "docstring": { "type": "float, default=None", "description": "The proportion of points to be included in the support of the raw\nMCD estimate. Default is None, which implies that the minimum\nvalue of support_fraction will be used within the algorithm:\n`(n_sample + n_features + 1) / 2`. The parameter must be in the range\n(0, 1)." - } + }, + "refined_type": {} }, { "name": "random_state", @@ -40202,13 +41565,14 @@ "docstring": { "type": "int, RandomState instance or None, default=None", "description": "Determines the pseudo random number generator for shuffling the data.\nPass an int for reproducible results across multiple function calls.\nSee :term:`Glossary `." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, *, store_precision=True, assume_centered=False, support_fraction=None, random_state=None):\n self.store_precision = store_precision\n self.assume_centered = assume_centered\n self.support_fraction = support_fraction\n self.random_state = random_state" }, { @@ -40226,7 +41590,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "data", @@ -40236,13 +41601,14 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "The data matrix, with p features and n samples.\nThe data set must be the one which was used to compute\nthe raw estimates." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Apply a correction to raw Minimum Covariance Determinant estimates.\n\nCorrection using the empirical correction factor suggested by Rousseeuw and Van Driessen in [RVD]_.", - "docstring": "Apply a correction to raw Minimum Covariance Determinant estimates.\n\nCorrection using the empirical correction factor suggested\nby Rousseeuw and Van Driessen in [RVD]_.\n\nParameters\n----------\ndata : array-like of shape (n_samples, n_features)\n The data matrix, with p features and n samples.\n The data set must be the one which was used to compute\n the raw estimates.\n\nReturns\n-------\ncovariance_corrected : ndarray of shape (n_features, n_features)\n Corrected robust covariance estimate.\n\nReferences\n----------\n\n.. [RVD] A Fast Algorithm for the Minimum Covariance\n Determinant Estimator, 1999, American Statistical Association\n and the American Society for Quality, TECHNOMETRICS", + "description": "Apply a correction to raw Minimum Covariance Determinant estimates.\n\nCorrection using the empirical correction factor suggested\nby Rousseeuw and Van Driessen in [RVD]_.", + "docstring": "Apply a correction to raw Minimum Covariance Determinant estimates.\n\n Correction using the empirical correction factor suggested\n by Rousseeuw and Van Driessen in [RVD]_.\n\n Parameters\n ----------\n data : array-like of shape (n_samples, n_features)\n The data matrix, with p features and n samples.\n The data set must be the one which was used to compute\n the raw estimates.\n\n Returns\n -------\n covariance_corrected : ndarray of shape (n_features, n_features)\n Corrected robust covariance estimate.\n\n References\n ----------\n\n .. [RVD] A Fast Algorithm for the Minimum Covariance\n Determinant Estimator, 1999, American Statistical Association\n and the American Society for Quality, TECHNOMETRICS\n ", "source_code": "\ndef correct_covariance(self, data):\n \"\"\"Apply a correction to raw Minimum Covariance Determinant estimates.\n\n Correction using the empirical correction factor suggested\n by Rousseeuw and Van Driessen in [RVD]_.\n\n Parameters\n ----------\n data : array-like of shape (n_samples, n_features)\n The data matrix, with p features and n samples.\n The data set must be the one which was used to compute\n the raw estimates.\n\n Returns\n -------\n covariance_corrected : ndarray of shape (n_features, n_features)\n Corrected robust covariance estimate.\n\n References\n ----------\n\n .. 
[RVD] A Fast Algorithm for the Minimum Covariance\n Determinant Estimator, 1999, American Statistical Association\n and the American Society for Quality, TECHNOMETRICS\n \"\"\"\n n_samples = len(self.dist_)\n n_support = np.sum(self.support_)\n if n_support < n_samples and np.allclose(self.raw_covariance_, 0):\n raise ValueError('The covariance matrix of the support data is equal to 0, try to increase support_fraction')\n correction = np.median(self.dist_) / chi2(data.shape[1]).isf(0.5)\n covariance_corrected = self.raw_covariance_ * correction\n self.dist_ /= correction\n return covariance_corrected" }, { @@ -40260,7 +41626,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -40270,7 +41637,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "Training data, where `n_samples` is the number of samples\nand `n_features` is the number of features." - } + }, + "refined_type": {} }, { "name": "y", @@ -40280,13 +41648,14 @@ "docstring": { "type": "Ignored", "description": "Not used, present for API consistency by convention." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Fit a Minimum Covariance Determinant with the FastMCD algorithm.", - "docstring": "Fit a Minimum Covariance Determinant with the FastMCD algorithm.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n Training data, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\ny : Ignored\n Not used, present for API consistency by convention.\n\nReturns\n-------\nself : object\n Returns the instance itself.", + "docstring": "Fit a Minimum Covariance Determinant with the FastMCD algorithm.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training data, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n Returns\n -------\n self : object\n Returns the instance itself.\n ", "source_code": "\ndef fit(self, X, y=None):\n \"\"\"Fit a Minimum Covariance Determinant with the FastMCD algorithm.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training data, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n Returns\n -------\n self : object\n Returns the instance itself.\n \"\"\"\n X = self._validate_data(X, ensure_min_samples=2, estimator='MinCovDet')\n random_state = check_random_state(self.random_state)\n (n_samples, n_features) = X.shape\n if (linalg.svdvals(np.dot(X.T, X)) > 1e-08).sum() != n_features:\n warnings.warn('The covariance matrix associated to your dataset is not full rank')\n (raw_location, raw_covariance, raw_support, raw_dist) = fast_mcd(X, support_fraction=self.support_fraction, cov_computation_method=self._nonrobust_covariance, random_state=random_state)\n if self.assume_centered:\n raw_location = np.zeros(n_features)\n raw_covariance = self._nonrobust_covariance(X[raw_support], assume_centered=True)\n precision = linalg.pinvh(raw_covariance)\n raw_dist = np.sum(np.dot(X, precision) * X, 1)\n self.raw_location_ = raw_location\n self.raw_covariance_ = raw_covariance\n self.raw_support_ = raw_support\n self.location_ = raw_location\n self.support_ = raw_support\n self.dist_ = raw_dist\n self.correct_covariance(X)\n self.reweight_covariance(X)\n return self" }, 
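Aside (not part of the package-data JSON above): the entries around this point document `MinCovDet.fit`, `correct_covariance` and `reweight_covariance`. As a hedged illustration of how that public scikit-learn API is typically driven, here is a minimal sketch; the toy data and printed attributes are assumptions for demonstration only, not taken from the catalog.

# Minimal usage sketch for the MinCovDet estimator documented in the entries above.
# Assumes scikit-learn and NumPy are installed; the data is synthetic.
import numpy as np
from sklearn.covariance import MinCovDet

rng = np.random.RandomState(0)
# 100 inliers from a correlated Gaussian, plus a few gross outliers.
X_inliers = rng.multivariate_normal(mean=[0.0, 0.0],
                                    cov=[[1.0, 0.7], [0.7, 1.0]],
                                    size=100)
X_outliers = rng.uniform(low=5.0, high=10.0, size=(5, 2))
X = np.vstack([X_inliers, X_outliers])

# fit() runs FastMCD and then the correction and reweighting steps
# (correct_covariance / reweight_covariance) described in the docstrings above.
mcd = MinCovDet(random_state=0).fit(X)

print(mcd.location_)               # robust location estimate, shape (n_features,)
print(mcd.covariance_)             # reweighted robust covariance, shape (n_features, n_features)
print(mcd.support_.sum())          # number of observations kept in the support
print(mcd.mahalanobis(X_outliers)) # outliers receive large robust distances

The design point this illustrates is that `fit` already applies the consistency correction and reweighting internally, so callers normally read `location_`, `covariance_` and `dist_` directly rather than calling the correction methods themselves.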
{ @@ -40304,7 +41673,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "data", @@ -40314,13 +41684,14 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "The data matrix, with p features and n samples.\nThe data set must be the one which was used to compute\nthe raw estimates." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Re-weight raw Minimum Covariance Determinant estimates.\n\nRe-weight observations using Rousseeuw's method (equivalent to deleting outlying observations from the data set before computing location and covariance estimates) described in [RVDriessen]_.", - "docstring": "Re-weight raw Minimum Covariance Determinant estimates.\n\nRe-weight observations using Rousseeuw's method (equivalent to\ndeleting outlying observations from the data set before\ncomputing location and covariance estimates) described\nin [RVDriessen]_.\n\nParameters\n----------\ndata : array-like of shape (n_samples, n_features)\n The data matrix, with p features and n samples.\n The data set must be the one which was used to compute\n the raw estimates.\n\nReturns\n-------\nlocation_reweighted : ndarray of shape (n_features,)\n Re-weighted robust location estimate.\n\ncovariance_reweighted : ndarray of shape (n_features, n_features)\n Re-weighted robust covariance estimate.\n\nsupport_reweighted : ndarray of shape (n_samples,), dtype=bool\n A mask of the observations that have been used to compute\n the re-weighted robust location and covariance estimates.\n\nReferences\n----------\n\n.. [RVDriessen] A Fast Algorithm for the Minimum Covariance\n Determinant Estimator, 1999, American Statistical Association\n and the American Society for Quality, TECHNOMETRICS", + "description": "Re-weight raw Minimum Covariance Determinant estimates.\n\nRe-weight observations using Rousseeuw's method (equivalent to\ndeleting outlying observations from the data set before\ncomputing location and covariance estimates) described\nin [RVDriessen]_.", + "docstring": "Re-weight raw Minimum Covariance Determinant estimates.\n\n Re-weight observations using Rousseeuw's method (equivalent to\n deleting outlying observations from the data set before\n computing location and covariance estimates) described\n in [RVDriessen]_.\n\n Parameters\n ----------\n data : array-like of shape (n_samples, n_features)\n The data matrix, with p features and n samples.\n The data set must be the one which was used to compute\n the raw estimates.\n\n Returns\n -------\n location_reweighted : ndarray of shape (n_features,)\n Re-weighted robust location estimate.\n\n covariance_reweighted : ndarray of shape (n_features, n_features)\n Re-weighted robust covariance estimate.\n\n support_reweighted : ndarray of shape (n_samples,), dtype=bool\n A mask of the observations that have been used to compute\n the re-weighted robust location and covariance estimates.\n\n References\n ----------\n\n .. 
[RVDriessen] A Fast Algorithm for the Minimum Covariance\n Determinant Estimator, 1999, American Statistical Association\n and the American Society for Quality, TECHNOMETRICS\n ", "source_code": "\ndef reweight_covariance(self, data):\n \"\"\"Re-weight raw Minimum Covariance Determinant estimates.\n\n Re-weight observations using Rousseeuw's method (equivalent to\n deleting outlying observations from the data set before\n computing location and covariance estimates) described\n in [RVDriessen]_.\n\n Parameters\n ----------\n data : array-like of shape (n_samples, n_features)\n The data matrix, with p features and n samples.\n The data set must be the one which was used to compute\n the raw estimates.\n\n Returns\n -------\n location_reweighted : ndarray of shape (n_features,)\n Re-weighted robust location estimate.\n\n covariance_reweighted : ndarray of shape (n_features, n_features)\n Re-weighted robust covariance estimate.\n\n support_reweighted : ndarray of shape (n_samples,), dtype=bool\n A mask of the observations that have been used to compute\n the re-weighted robust location and covariance estimates.\n\n References\n ----------\n\n .. [RVDriessen] A Fast Algorithm for the Minimum Covariance\n Determinant Estimator, 1999, American Statistical Association\n and the American Society for Quality, TECHNOMETRICS\n \"\"\"\n (n_samples, n_features) = data.shape\n mask = self.dist_ < chi2(n_features).isf(0.025)\n if self.assume_centered:\n location_reweighted = np.zeros(n_features)\n else:\n location_reweighted = data[mask].mean(0)\n covariance_reweighted = self._nonrobust_covariance(data[mask], assume_centered=self.assume_centered)\n support_reweighted = np.zeros(n_samples, dtype=bool)\n support_reweighted[mask] = True\n self._set_covariance(covariance_reweighted)\n self.location_ = location_reweighted\n self.support_ = support_reweighted\n X_centered = data - self.location_\n self.dist_ = np.sum(np.dot(X_centered, self.get_precision()) * X_centered, 1)\n return location_reweighted, covariance_reweighted, support_reweighted" }, { @@ -40338,7 +41709,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_support", @@ -40348,7 +41720,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "random_state", @@ -40358,7 +41731,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "remaining_iterations", @@ -40368,7 +41742,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "initial_estimates", @@ -40378,7 +41753,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "verbose", @@ -40388,7 +41764,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "cov_computation_method", @@ -40398,13 +41775,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _c_step(X, n_support, random_state, remaining_iterations=30, initial_estimates=None, verbose=False, cov_computation_method=empirical_covariance):\n (n_samples, n_features) = X.shape\n dist = np.inf\n support = np.zeros(n_samples, dtype=bool)\n if initial_estimates is None:\n support[random_state.permutation(n_samples)[:n_support]] = True\n else:\n location = initial_estimates[0]\n covariance = initial_estimates[1]\n precision = linalg.pinvh(covariance)\n X_centered = X - 
location\n dist = (np.dot(X_centered, precision) * X_centered).sum(1)\n support[np.argsort(dist)[:n_support]] = True\n X_support = X[support]\n location = X_support.mean(0)\n covariance = cov_computation_method(X_support)\n det = fast_logdet(covariance)\n if np.isinf(det):\n precision = linalg.pinvh(covariance)\n previous_det = np.inf\n while det < previous_det and remaining_iterations > 0 and not np.isinf(det):\n previous_location = location\n previous_covariance = covariance\n previous_det = det\n previous_support = support\n precision = linalg.pinvh(covariance)\n X_centered = X - location\n dist = (np.dot(X_centered, precision) * X_centered).sum(axis=1)\n support = np.zeros(n_samples, dtype=bool)\n support[np.argsort(dist)[:n_support]] = True\n X_support = X[support]\n location = X_support.mean(axis=0)\n covariance = cov_computation_method(X_support)\n det = fast_logdet(covariance)\n remaining_iterations -= 1\n previous_dist = dist\n dist = (np.dot(X - location, precision) * (X - location)).sum(axis=1)\n if np.isinf(det):\n results = (location, covariance, det, support, dist)\n if np.allclose(det, previous_det):\n if verbose:\n print('Optimal couple (location, covariance) found before ending iterations (%d left)' % remaining_iterations)\n results = (location, covariance, det, support, dist)\n elif det > previous_det:\n warnings.warn('Determinant has increased; this should not happen: log(det) > log(previous_det) (%.15f > %.15f). You may want to try with a higher value of support_fraction (current value: %.3f).' % (det, previous_det, n_support / n_samples), RuntimeWarning)\n results = (previous_location, previous_covariance, previous_det, previous_support, previous_dist)\n if remaining_iterations == 0:\n if verbose:\n print('Maximum number of iterations reached')\n results = (location, covariance, det, support, dist)\n return results" }, { @@ -40422,7 +41800,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "Data set in which we look for the n_support observations whose\nscatter matrix has minimum determinant." - } + }, + "refined_type": {} }, { "name": "n_support", @@ -40432,7 +41811,8 @@ "docstring": { "type": "int", "description": "Number of observations to compute the robust estimates of location\nand covariance from. This parameter must be greater than\n`n_samples / 2`." - } + }, + "refined_type": {} }, { "name": "remaining_iterations", @@ -40442,7 +41822,8 @@ "docstring": { "type": "int, default=30", "description": "Number of iterations to perform.\nAccording to [Rouseeuw1999]_, two iterations are sufficient to get\nclose to the minimum, and we never need more than 30 to reach\nconvergence." - } + }, + "refined_type": {} }, { "name": "initial_estimates", @@ -40452,7 +41833,8 @@ "docstring": { "type": "tuple of shape (2,), default=None", "description": "Initial estimates of location and shape from which to run the c_step\nprocedure:\n- initial_estimates[0]: an initial location estimate\n- initial_estimates[1]: an initial covariance estimate" - } + }, + "refined_type": {} }, { "name": "verbose", @@ -40462,7 +41844,8 @@ "docstring": { "type": "bool, default=False", "description": "Verbose mode." - } + }, + "refined_type": {} }, { "name": "cov_computation_method", @@ -40472,7 +41855,8 @@ "docstring": { "type": "callable, default=:func:`sklearn.covariance.empirical_covariance`", "description": "The function which will be used to compute the covariance.\nMust return array of shape (n_features, n_features)." 
- } + }, + "refined_type": {} }, { "name": "random_state", @@ -40482,13 +41866,14 @@ "docstring": { "type": "int, RandomState instance or None, default=None", "description": "Determines the pseudo random number generator for shuffling the data.\nPass an int for reproducible results across multiple function calls.\nSee :term:`Glossary `." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "C_step procedure described in [Rouseeuw1984]_ aiming at computing MCD.", - "docstring": "C_step procedure described in [Rouseeuw1984]_ aiming at computing MCD.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n Data set in which we look for the n_support observations whose\n scatter matrix has minimum determinant.\n\nn_support : int\n Number of observations to compute the robust estimates of location\n and covariance from. This parameter must be greater than\n `n_samples / 2`.\n\nremaining_iterations : int, default=30\n Number of iterations to perform.\n According to [Rouseeuw1999]_, two iterations are sufficient to get\n close to the minimum, and we never need more than 30 to reach\n convergence.\n\ninitial_estimates : tuple of shape (2,), default=None\n Initial estimates of location and shape from which to run the c_step\n procedure:\n - initial_estimates[0]: an initial location estimate\n - initial_estimates[1]: an initial covariance estimate\n\nverbose : bool, default=False\n Verbose mode.\n\ncov_computation_method : callable, default=:func:`sklearn.covariance.empirical_covariance`\n The function which will be used to compute the covariance.\n Must return array of shape (n_features, n_features).\n\nrandom_state : int, RandomState instance or None, default=None\n Determines the pseudo random number generator for shuffling the data.\n Pass an int for reproducible results across multiple function calls.\n See :term:`Glossary `.\n\nReturns\n-------\nlocation : ndarray of shape (n_features,)\n Robust location estimates.\n\ncovariance : ndarray of shape (n_features, n_features)\n Robust covariance estimates.\n\nsupport : ndarray of shape (n_samples,)\n A mask for the `n_support` observations whose scatter matrix has\n minimum determinant.\n\nReferences\n----------\n.. [Rouseeuw1999] A Fast Algorithm for the Minimum Covariance Determinant\n Estimator, 1999, American Statistical Association and the American\n Society for Quality, TECHNOMETRICS", + "docstring": "C_step procedure described in [Rouseeuw1984]_ aiming at computing MCD.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Data set in which we look for the n_support observations whose\n scatter matrix has minimum determinant.\n\n n_support : int\n Number of observations to compute the robust estimates of location\n and covariance from. 
This parameter must be greater than\n `n_samples / 2`.\n\n remaining_iterations : int, default=30\n Number of iterations to perform.\n According to [Rouseeuw1999]_, two iterations are sufficient to get\n close to the minimum, and we never need more than 30 to reach\n convergence.\n\n initial_estimates : tuple of shape (2,), default=None\n Initial estimates of location and shape from which to run the c_step\n procedure:\n - initial_estimates[0]: an initial location estimate\n - initial_estimates[1]: an initial covariance estimate\n\n verbose : bool, default=False\n Verbose mode.\n\n cov_computation_method : callable, default=:func:`sklearn.covariance.empirical_covariance`\n The function which will be used to compute the covariance.\n Must return array of shape (n_features, n_features).\n\n random_state : int, RandomState instance or None, default=None\n Determines the pseudo random number generator for shuffling the data.\n Pass an int for reproducible results across multiple function calls.\n See :term:`Glossary `.\n\n Returns\n -------\n location : ndarray of shape (n_features,)\n Robust location estimates.\n\n covariance : ndarray of shape (n_features, n_features)\n Robust covariance estimates.\n\n support : ndarray of shape (n_samples,)\n A mask for the `n_support` observations whose scatter matrix has\n minimum determinant.\n\n References\n ----------\n .. [Rouseeuw1999] A Fast Algorithm for the Minimum Covariance Determinant\n Estimator, 1999, American Statistical Association and the American\n Society for Quality, TECHNOMETRICS\n ", "source_code": "\ndef c_step(X, n_support, remaining_iterations=30, initial_estimates=None, verbose=False, cov_computation_method=empirical_covariance, random_state=None):\n \"\"\"C_step procedure described in [Rouseeuw1984]_ aiming at computing MCD.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Data set in which we look for the n_support observations whose\n scatter matrix has minimum determinant.\n\n n_support : int\n Number of observations to compute the robust estimates of location\n and covariance from. This parameter must be greater than\n `n_samples / 2`.\n\n remaining_iterations : int, default=30\n Number of iterations to perform.\n According to [Rouseeuw1999]_, two iterations are sufficient to get\n close to the minimum, and we never need more than 30 to reach\n convergence.\n\n initial_estimates : tuple of shape (2,), default=None\n Initial estimates of location and shape from which to run the c_step\n procedure:\n - initial_estimates[0]: an initial location estimate\n - initial_estimates[1]: an initial covariance estimate\n\n verbose : bool, default=False\n Verbose mode.\n\n cov_computation_method : callable, default=:func:`sklearn.covariance.empirical_covariance`\n The function which will be used to compute the covariance.\n Must return array of shape (n_features, n_features).\n\n random_state : int, RandomState instance or None, default=None\n Determines the pseudo random number generator for shuffling the data.\n Pass an int for reproducible results across multiple function calls.\n See :term:`Glossary `.\n\n Returns\n -------\n location : ndarray of shape (n_features,)\n Robust location estimates.\n\n covariance : ndarray of shape (n_features, n_features)\n Robust covariance estimates.\n\n support : ndarray of shape (n_samples,)\n A mask for the `n_support` observations whose scatter matrix has\n minimum determinant.\n\n References\n ----------\n .. 
[Rouseeuw1999] A Fast Algorithm for the Minimum Covariance Determinant\n Estimator, 1999, American Statistical Association and the American\n Society for Quality, TECHNOMETRICS\n \"\"\"\n X = np.asarray(X)\n random_state = check_random_state(random_state)\n return _c_step(X, n_support, remaining_iterations=remaining_iterations, initial_estimates=initial_estimates, verbose=verbose, cov_computation_method=cov_computation_method, random_state=random_state)" }, { @@ -40506,7 +41891,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "The data matrix, with p features and n samples." - } + }, + "refined_type": {} }, { "name": "support_fraction", @@ -40516,7 +41902,8 @@ "docstring": { "type": "float, default=None", "description": "The proportion of points to be included in the support of the raw\nMCD estimate. Default is `None`, which implies that the minimum\nvalue of `support_fraction` will be used within the algorithm:\n`(n_sample + n_features + 1) / 2`. This parameter must be in the\nrange (0, 1)." - } + }, + "refined_type": {} }, { "name": "cov_computation_method", @@ -40526,7 +41913,8 @@ "docstring": { "type": "callable, default=:func:`sklearn.covariance.empirical_covariance`", "description": "The function which will be used to compute the covariance.\nMust return an array of shape (n_features, n_features)." - } + }, + "refined_type": {} }, { "name": "random_state", @@ -40536,13 +41924,14 @@ "docstring": { "type": "int, RandomState instance or None, default=None", "description": "Determines the pseudo random number generator for shuffling the data.\nPass an int for reproducible results across multiple function calls.\nSee :term:`Glossary `." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Estimates the Minimum Covariance Determinant matrix.\n\nRead more in the :ref:`User Guide `.", - "docstring": "Estimates the Minimum Covariance Determinant matrix.\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n The data matrix, with p features and n samples.\n\nsupport_fraction : float, default=None\n The proportion of points to be included in the support of the raw\n MCD estimate. Default is `None`, which implies that the minimum\n value of `support_fraction` will be used within the algorithm:\n `(n_sample + n_features + 1) / 2`. 
This parameter must be in the\n range (0, 1).\n\ncov_computation_method : callable, default=:func:`sklearn.covariance.empirical_covariance`\n The function which will be used to compute the covariance.\n Must return an array of shape (n_features, n_features).\n\nrandom_state : int, RandomState instance or None, default=None\n Determines the pseudo random number generator for shuffling the data.\n Pass an int for reproducible results across multiple function calls.\n See :term:`Glossary `.\n\nReturns\n-------\nlocation : ndarray of shape (n_features,)\n Robust location of the data.\n\ncovariance : ndarray of shape (n_features, n_features)\n Robust covariance of the features.\n\nsupport : ndarray of shape (n_samples,), dtype=bool\n A mask of the observations that have been used to compute\n the robust location and covariance estimates of the data set.\n\nNotes\n-----\nThe FastMCD algorithm has been introduced by Rousseuw and Van Driessen\nin \"A Fast Algorithm for the Minimum Covariance Determinant Estimator,\n1999, American Statistical Association and the American Society\nfor Quality, TECHNOMETRICS\".\nThe principle is to compute robust estimates and random subsets before\npooling them into a larger subsets, and finally into the full data set.\nDepending on the size of the initial sample, we have one, two or three\nsuch computation levels.\n\nNote that only raw estimates are returned. If one is interested in\nthe correction and reweighting steps described in [RouseeuwVan]_,\nsee the MinCovDet object.\n\nReferences\n----------\n\n.. [RouseeuwVan] A Fast Algorithm for the Minimum Covariance\n Determinant Estimator, 1999, American Statistical Association\n and the American Society for Quality, TECHNOMETRICS\n\n.. [Butler1993] R. W. Butler, P. L. Davies and M. Jhun,\n Asymptotics For The Minimum Covariance Determinant Estimator,\n The Annals of Statistics, 1993, Vol. 21, No. 3, 1385-1400", + "docstring": "Estimates the Minimum Covariance Determinant matrix.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The data matrix, with p features and n samples.\n\n support_fraction : float, default=None\n The proportion of points to be included in the support of the raw\n MCD estimate. Default is `None`, which implies that the minimum\n value of `support_fraction` will be used within the algorithm:\n `(n_sample + n_features + 1) / 2`. 
This parameter must be in the\n range (0, 1).\n\n cov_computation_method : callable, default=:func:`sklearn.covariance.empirical_covariance`\n The function which will be used to compute the covariance.\n Must return an array of shape (n_features, n_features).\n\n random_state : int, RandomState instance or None, default=None\n Determines the pseudo random number generator for shuffling the data.\n Pass an int for reproducible results across multiple function calls.\n See :term:`Glossary `.\n\n Returns\n -------\n location : ndarray of shape (n_features,)\n Robust location of the data.\n\n covariance : ndarray of shape (n_features, n_features)\n Robust covariance of the features.\n\n support : ndarray of shape (n_samples,), dtype=bool\n A mask of the observations that have been used to compute\n the robust location and covariance estimates of the data set.\n\n Notes\n -----\n The FastMCD algorithm has been introduced by Rousseuw and Van Driessen\n in \"A Fast Algorithm for the Minimum Covariance Determinant Estimator,\n 1999, American Statistical Association and the American Society\n for Quality, TECHNOMETRICS\".\n The principle is to compute robust estimates and random subsets before\n pooling them into a larger subsets, and finally into the full data set.\n Depending on the size of the initial sample, we have one, two or three\n such computation levels.\n\n Note that only raw estimates are returned. If one is interested in\n the correction and reweighting steps described in [RouseeuwVan]_,\n see the MinCovDet object.\n\n References\n ----------\n\n .. [RouseeuwVan] A Fast Algorithm for the Minimum Covariance\n Determinant Estimator, 1999, American Statistical Association\n and the American Society for Quality, TECHNOMETRICS\n\n .. [Butler1993] R. W. Butler, P. L. Davies and M. Jhun,\n Asymptotics For The Minimum Covariance Determinant Estimator,\n The Annals of Statistics, 1993, Vol. 21, No. 3, 1385-1400\n ", "source_code": "\ndef fast_mcd(X, support_fraction=None, cov_computation_method=empirical_covariance, random_state=None):\n \"\"\"Estimates the Minimum Covariance Determinant matrix.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The data matrix, with p features and n samples.\n\n support_fraction : float, default=None\n The proportion of points to be included in the support of the raw\n MCD estimate. Default is `None`, which implies that the minimum\n value of `support_fraction` will be used within the algorithm:\n `(n_sample + n_features + 1) / 2`. 
This parameter must be in the\n range (0, 1).\n\n cov_computation_method : callable, default=:func:`sklearn.covariance.empirical_covariance`\n The function which will be used to compute the covariance.\n Must return an array of shape (n_features, n_features).\n\n random_state : int, RandomState instance or None, default=None\n Determines the pseudo random number generator for shuffling the data.\n Pass an int for reproducible results across multiple function calls.\n See :term:`Glossary `.\n\n Returns\n -------\n location : ndarray of shape (n_features,)\n Robust location of the data.\n\n covariance : ndarray of shape (n_features, n_features)\n Robust covariance of the features.\n\n support : ndarray of shape (n_samples,), dtype=bool\n A mask of the observations that have been used to compute\n the robust location and covariance estimates of the data set.\n\n Notes\n -----\n The FastMCD algorithm has been introduced by Rousseuw and Van Driessen\n in \"A Fast Algorithm for the Minimum Covariance Determinant Estimator,\n 1999, American Statistical Association and the American Society\n for Quality, TECHNOMETRICS\".\n The principle is to compute robust estimates and random subsets before\n pooling them into a larger subsets, and finally into the full data set.\n Depending on the size of the initial sample, we have one, two or three\n such computation levels.\n\n Note that only raw estimates are returned. If one is interested in\n the correction and reweighting steps described in [RouseeuwVan]_,\n see the MinCovDet object.\n\n References\n ----------\n\n .. [RouseeuwVan] A Fast Algorithm for the Minimum Covariance\n Determinant Estimator, 1999, American Statistical Association\n and the American Society for Quality, TECHNOMETRICS\n\n .. [Butler1993] R. W. Butler, P. L. Davies and M. Jhun,\n Asymptotics For The Minimum Covariance Determinant Estimator,\n The Annals of Statistics, 1993, Vol. 21, No. 
3, 1385-1400\n \"\"\"\n random_state = check_random_state(random_state)\n X = check_array(X, ensure_min_samples=2, estimator='fast_mcd')\n (n_samples, n_features) = X.shape\n if support_fraction is None:\n n_support = int(np.ceil(0.5 * (n_samples + n_features + 1)))\n else:\n n_support = int(support_fraction * n_samples)\n if n_features == 1:\n if n_support < n_samples:\n X_sorted = np.sort(np.ravel(X))\n diff = X_sorted[n_support:] - X_sorted[:n_samples - n_support]\n halves_start = np.where(diff == np.min(diff))[0]\n location = 0.5 * (X_sorted[n_support + halves_start] + X_sorted[halves_start]).mean()\n support = np.zeros(n_samples, dtype=bool)\n X_centered = X - location\n support[np.argsort(np.abs(X_centered), 0)[:n_support]] = True\n covariance = np.asarray([[np.var(X[support])]])\n location = np.array([location])\n precision = linalg.pinvh(covariance)\n dist = (np.dot(X_centered, precision) * X_centered).sum(axis=1)\n else:\n support = np.ones(n_samples, dtype=bool)\n covariance = np.asarray([[np.var(X)]])\n location = np.asarray([np.mean(X)])\n X_centered = X - location\n precision = linalg.pinvh(covariance)\n dist = (np.dot(X_centered, precision) * X_centered).sum(axis=1)\n if n_samples > 500 and n_features > 1:\n n_subsets = n_samples // 300\n n_samples_subsets = n_samples // n_subsets\n samples_shuffle = random_state.permutation(n_samples)\n h_subset = int(np.ceil(n_samples_subsets * (n_support / float(n_samples))))\n n_trials_tot = 500\n n_best_sub = 10\n n_trials = max(10, n_trials_tot // n_subsets)\n n_best_tot = n_subsets * n_best_sub\n all_best_locations = np.zeros((n_best_tot, n_features))\n try:\n all_best_covariances = np.zeros((n_best_tot, n_features, n_features))\n except MemoryError:\n n_best_tot = 10\n all_best_covariances = np.zeros((n_best_tot, n_features, n_features))\n n_best_sub = 2\n for i in range(n_subsets):\n low_bound = i * n_samples_subsets\n high_bound = low_bound + n_samples_subsets\n current_subset = X[samples_shuffle[low_bound:high_bound]]\n (best_locations_sub, best_covariances_sub, _, _) = select_candidates(current_subset, h_subset, n_trials, select=n_best_sub, n_iter=2, cov_computation_method=cov_computation_method, random_state=random_state)\n subset_slice = np.arange(i * n_best_sub, (i + 1) * n_best_sub)\n all_best_locations[subset_slice] = best_locations_sub\n all_best_covariances[subset_slice] = best_covariances_sub\n n_samples_merged = min(1500, n_samples)\n h_merged = int(np.ceil(n_samples_merged * (n_support / float(n_samples))))\n if n_samples > 1500:\n n_best_merged = 10\n else:\n n_best_merged = 1\n selection = random_state.permutation(n_samples)[:n_samples_merged]\n (locations_merged, covariances_merged, supports_merged, d) = select_candidates(X[selection], h_merged, n_trials=(all_best_locations, all_best_covariances), select=n_best_merged, cov_computation_method=cov_computation_method, random_state=random_state)\n if n_samples < 1500:\n location = locations_merged[0]\n covariance = covariances_merged[0]\n support = np.zeros(n_samples, dtype=bool)\n dist = np.zeros(n_samples)\n support[selection] = supports_merged[0]\n dist[selection] = d[0]\n else:\n (locations_full, covariances_full, supports_full, d) = select_candidates(X, n_support, n_trials=(locations_merged, covariances_merged), select=1, cov_computation_method=cov_computation_method, random_state=random_state)\n location = locations_full[0]\n covariance = covariances_full[0]\n support = supports_full[0]\n dist = d[0]\n elif n_features > 1:\n n_trials = 30\n n_best = 10\n 
(locations_best, covariances_best, _, _) = select_candidates(X, n_support, n_trials=n_trials, select=n_best, n_iter=2, cov_computation_method=cov_computation_method, random_state=random_state)\n (locations_full, covariances_full, supports_full, d) = select_candidates(X, n_support, n_trials=(locations_best, covariances_best), select=1, cov_computation_method=cov_computation_method, random_state=random_state)\n location = locations_full[0]\n covariance = covariances_full[0]\n support = supports_full[0]\n dist = d[0]\n return location, covariance, support, dist" }, { @@ -40560,7 +41949,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "Data (sub)set in which we look for the n_support purest observations." - } + }, + "refined_type": {} }, { "name": "n_support", @@ -40570,7 +41960,8 @@ "docstring": { "type": "int", "description": "The number of samples the pure data set must contain.\nThis parameter must be in the range `[(n + p + 1)/2] < n_support < n`." - } + }, + "refined_type": {} }, { "name": "n_trials", @@ -40580,7 +41971,8 @@ "docstring": { "type": "int or tuple of shape (2,)", "description": "Number of different initial sets of observations from which to\nrun the algorithm. This parameter should be a strictly positive\ninteger.\nInstead of giving a number of trials to perform, one can provide a\nlist of initial estimates that will be used to iteratively run\nc_step procedures. In this case:\n- n_trials[0]: array-like, shape (n_trials, n_features)\n is the list of `n_trials` initial location estimates\n- n_trials[1]: array-like, shape (n_trials, n_features, n_features)\n is the list of `n_trials` initial covariances estimates" - } + }, + "refined_type": {} }, { "name": "select", @@ -40590,7 +41982,8 @@ "docstring": { "type": "int, default=1", "description": "Number of best candidates results to return. This parameter must be\na strictly positive integer." - } + }, + "refined_type": {} }, { "name": "n_iter", @@ -40600,7 +41993,8 @@ "docstring": { "type": "int, default=30", "description": "Maximum number of iterations for the c_step procedure.\n(2 is enough to be close to the final solution. \"Never\" exceeds 20).\nThis parameter must be a strictly positive integer." - } + }, + "refined_type": {} }, { "name": "verbose", @@ -40610,7 +42004,8 @@ "docstring": { "type": "bool, default=False", "description": "Control the output verbosity." - } + }, + "refined_type": {} }, { "name": "cov_computation_method", @@ -40620,7 +42015,8 @@ "docstring": { "type": "callable, default=:func:`sklearn.covariance.empirical_covariance`", "description": "The function which will be used to compute the covariance.\nMust return an array of shape (n_features, n_features)." - } + }, + "refined_type": {} }, { "name": "random_state", @@ -40630,13 +42026,14 @@ "docstring": { "type": "int, RandomState instance or None, default=None", "description": "Determines the pseudo random number generator for shuffling the data.\nPass an int for reproducible results across multiple function calls.\nSee :term:`Glossary `." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Finds the best pure subset of observations to compute MCD from it.\n\nThe purpose of this function is to find the best sets of n_support observations with respect to a minimization of their covariance matrix determinant. Equivalently, it removes n_samples-n_support observations to construct what we call a pure data set (i.e. not containing outliers). 
The list of the observations of the pure data set is referred to as the `support`. Starting from a random support, the pure data set is found by the c_step procedure introduced by Rousseeuw and Van Driessen in [RV]_.", - "docstring": "Finds the best pure subset of observations to compute MCD from it.\n\nThe purpose of this function is to find the best sets of n_support\nobservations with respect to a minimization of their covariance\nmatrix determinant. Equivalently, it removes n_samples-n_support\nobservations to construct what we call a pure data set (i.e. not\ncontaining outliers). The list of the observations of the pure\ndata set is referred to as the `support`.\n\nStarting from a random support, the pure data set is found by the\nc_step procedure introduced by Rousseeuw and Van Driessen in\n[RV]_.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n Data (sub)set in which we look for the n_support purest observations.\n\nn_support : int\n The number of samples the pure data set must contain.\n This parameter must be in the range `[(n + p + 1)/2] < n_support < n`.\n\nn_trials : int or tuple of shape (2,)\n Number of different initial sets of observations from which to\n run the algorithm. This parameter should be a strictly positive\n integer.\n Instead of giving a number of trials to perform, one can provide a\n list of initial estimates that will be used to iteratively run\n c_step procedures. In this case:\n - n_trials[0]: array-like, shape (n_trials, n_features)\n is the list of `n_trials` initial location estimates\n - n_trials[1]: array-like, shape (n_trials, n_features, n_features)\n is the list of `n_trials` initial covariances estimates\n\nselect : int, default=1\n Number of best candidates results to return. This parameter must be\n a strictly positive integer.\n\nn_iter : int, default=30\n Maximum number of iterations for the c_step procedure.\n (2 is enough to be close to the final solution. \"Never\" exceeds 20).\n This parameter must be a strictly positive integer.\n\nverbose : bool, default=False\n Control the output verbosity.\n\ncov_computation_method : callable, default=:func:`sklearn.covariance.empirical_covariance`\n The function which will be used to compute the covariance.\n Must return an array of shape (n_features, n_features).\n\nrandom_state : int, RandomState instance or None, default=None\n Determines the pseudo random number generator for shuffling the data.\n Pass an int for reproducible results across multiple function calls.\n See :term:`Glossary `.\n\nSee Also\n---------\nc_step\n\nReturns\n-------\nbest_locations : ndarray of shape (select, n_features)\n The `select` location estimates computed from the `select` best\n supports found in the data set (`X`).\n\nbest_covariances : ndarray of shape (select, n_features, n_features)\n The `select` covariance estimates computed from the `select`\n best supports found in the data set (`X`).\n\nbest_supports : ndarray of shape (select, n_samples)\n The `select` best supports found in the data set (`X`).\n\nReferences\n----------\n.. [RV] A Fast Algorithm for the Minimum Covariance Determinant\n Estimator, 1999, American Statistical Association and the American\n Society for Quality, TECHNOMETRICS", + "description": "Finds the best pure subset of observations to compute MCD from it.\n\nThe purpose of this function is to find the best sets of n_support\nobservations with respect to a minimization of their covariance\nmatrix determinant. 
Equivalently, it removes n_samples-n_support\nobservations to construct what we call a pure data set (i.e. not\ncontaining outliers). The list of the observations of the pure\ndata set is referred to as the `support`.\n\nStarting from a random support, the pure data set is found by the\nc_step procedure introduced by Rousseeuw and Van Driessen in\n[RV]_.", + "docstring": "Finds the best pure subset of observations to compute MCD from it.\n\n The purpose of this function is to find the best sets of n_support\n observations with respect to a minimization of their covariance\n matrix determinant. Equivalently, it removes n_samples-n_support\n observations to construct what we call a pure data set (i.e. not\n containing outliers). The list of the observations of the pure\n data set is referred to as the `support`.\n\n Starting from a random support, the pure data set is found by the\n c_step procedure introduced by Rousseeuw and Van Driessen in\n [RV]_.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Data (sub)set in which we look for the n_support purest observations.\n\n n_support : int\n The number of samples the pure data set must contain.\n This parameter must be in the range `[(n + p + 1)/2] < n_support < n`.\n\n n_trials : int or tuple of shape (2,)\n Number of different initial sets of observations from which to\n run the algorithm. This parameter should be a strictly positive\n integer.\n Instead of giving a number of trials to perform, one can provide a\n list of initial estimates that will be used to iteratively run\n c_step procedures. In this case:\n - n_trials[0]: array-like, shape (n_trials, n_features)\n is the list of `n_trials` initial location estimates\n - n_trials[1]: array-like, shape (n_trials, n_features, n_features)\n is the list of `n_trials` initial covariances estimates\n\n select : int, default=1\n Number of best candidates results to return. This parameter must be\n a strictly positive integer.\n\n n_iter : int, default=30\n Maximum number of iterations for the c_step procedure.\n (2 is enough to be close to the final solution. \"Never\" exceeds 20).\n This parameter must be a strictly positive integer.\n\n verbose : bool, default=False\n Control the output verbosity.\n\n cov_computation_method : callable, default=:func:`sklearn.covariance.empirical_covariance`\n The function which will be used to compute the covariance.\n Must return an array of shape (n_features, n_features).\n\n random_state : int, RandomState instance or None, default=None\n Determines the pseudo random number generator for shuffling the data.\n Pass an int for reproducible results across multiple function calls.\n See :term:`Glossary `.\n\n See Also\n ---------\n c_step\n\n Returns\n -------\n best_locations : ndarray of shape (select, n_features)\n The `select` location estimates computed from the `select` best\n supports found in the data set (`X`).\n\n best_covariances : ndarray of shape (select, n_features, n_features)\n The `select` covariance estimates computed from the `select`\n best supports found in the data set (`X`).\n\n best_supports : ndarray of shape (select, n_samples)\n The `select` best supports found in the data set (`X`).\n\n References\n ----------\n .. 
[RV] A Fast Algorithm for the Minimum Covariance Determinant\n Estimator, 1999, American Statistical Association and the American\n Society for Quality, TECHNOMETRICS\n ", "source_code": "\ndef select_candidates(X, n_support, n_trials, select=1, n_iter=30, verbose=False, cov_computation_method=empirical_covariance, random_state=None):\n \"\"\"Finds the best pure subset of observations to compute MCD from it.\n\n The purpose of this function is to find the best sets of n_support\n observations with respect to a minimization of their covariance\n matrix determinant. Equivalently, it removes n_samples-n_support\n observations to construct what we call a pure data set (i.e. not\n containing outliers). The list of the observations of the pure\n data set is referred to as the `support`.\n\n Starting from a random support, the pure data set is found by the\n c_step procedure introduced by Rousseeuw and Van Driessen in\n [RV]_.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Data (sub)set in which we look for the n_support purest observations.\n\n n_support : int\n The number of samples the pure data set must contain.\n This parameter must be in the range `[(n + p + 1)/2] < n_support < n`.\n\n n_trials : int or tuple of shape (2,)\n Number of different initial sets of observations from which to\n run the algorithm. This parameter should be a strictly positive\n integer.\n Instead of giving a number of trials to perform, one can provide a\n list of initial estimates that will be used to iteratively run\n c_step procedures. In this case:\n - n_trials[0]: array-like, shape (n_trials, n_features)\n is the list of `n_trials` initial location estimates\n - n_trials[1]: array-like, shape (n_trials, n_features, n_features)\n is the list of `n_trials` initial covariances estimates\n\n select : int, default=1\n Number of best candidates results to return. This parameter must be\n a strictly positive integer.\n\n n_iter : int, default=30\n Maximum number of iterations for the c_step procedure.\n (2 is enough to be close to the final solution. \"Never\" exceeds 20).\n This parameter must be a strictly positive integer.\n\n verbose : bool, default=False\n Control the output verbosity.\n\n cov_computation_method : callable, default=:func:`sklearn.covariance.empirical_covariance`\n The function which will be used to compute the covariance.\n Must return an array of shape (n_features, n_features).\n\n random_state : int, RandomState instance or None, default=None\n Determines the pseudo random number generator for shuffling the data.\n Pass an int for reproducible results across multiple function calls.\n See :term:`Glossary `.\n\n See Also\n ---------\n c_step\n\n Returns\n -------\n best_locations : ndarray of shape (select, n_features)\n The `select` location estimates computed from the `select` best\n supports found in the data set (`X`).\n\n best_covariances : ndarray of shape (select, n_features, n_features)\n The `select` covariance estimates computed from the `select`\n best supports found in the data set (`X`).\n\n best_supports : ndarray of shape (select, n_samples)\n The `select` best supports found in the data set (`X`).\n\n References\n ----------\n .. 
[RV] A Fast Algorithm for the Minimum Covariance Determinant\n Estimator, 1999, American Statistical Association and the American\n Society for Quality, TECHNOMETRICS\n \"\"\"\n random_state = check_random_state(random_state)\n if isinstance(n_trials, numbers.Integral):\n run_from_estimates = False\n elif isinstance(n_trials, tuple):\n run_from_estimates = True\n estimates_list = n_trials\n n_trials = estimates_list[0].shape[0]\n else:\n raise TypeError(\"Invalid 'n_trials' parameter, expected tuple or integer, got %s (%s)\" % (n_trials, type(n_trials)))\n all_estimates = []\n if not run_from_estimates:\n for j in range(n_trials):\n all_estimates.append(_c_step(X, n_support, remaining_iterations=n_iter, verbose=verbose, cov_computation_method=cov_computation_method, random_state=random_state))\n else:\n for j in range(n_trials):\n initial_estimates = (estimates_list[0][j], estimates_list[1][j])\n all_estimates.append(_c_step(X, n_support, remaining_iterations=n_iter, initial_estimates=initial_estimates, verbose=verbose, cov_computation_method=cov_computation_method, random_state=random_state))\n (all_locs_sub, all_covs_sub, all_dets_sub, all_supports_sub, all_ds_sub) = zip(*all_estimates)\n index_best = np.argsort(all_dets_sub)[:select]\n best_locations = np.asarray(all_locs_sub)[index_best]\n best_covariances = np.asarray(all_covs_sub)[index_best]\n best_supports = np.asarray(all_supports_sub)[index_best]\n best_ds = np.asarray(all_ds_sub)[index_best]\n return best_locations, best_covariances, best_supports, best_ds" }, { @@ -40654,7 +42051,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "store_precision", @@ -40664,7 +42062,8 @@ "docstring": { "type": "bool, default=True", "description": "Specify if the estimated precision is stored." - } + }, + "refined_type": {} }, { "name": "assume_centered", @@ -40674,7 +42073,8 @@ "docstring": { "type": "bool, default=False", "description": "If True, data will not be centered before computation.\nUseful when working with data whose mean is almost, but not exactly\nzero.\nIf False (default), data will be centered before computation." - } + }, + "refined_type": {} }, { "name": "block_size", @@ -40684,13 +42084,14 @@ "docstring": { "type": "int, default=1000", "description": "Size of blocks into which the covariance matrix will be split\nduring its Ledoit-Wolf estimation. This is purely a memory\noptimization and does not affect results." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, *, store_precision=True, assume_centered=False, block_size=1000):\n super().__init__(store_precision=store_precision, assume_centered=assume_centered)\n self.block_size = block_size" }, { @@ -40708,7 +42109,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -40718,7 +42120,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "Training data, where `n_samples` is the number of samples\nand `n_features` is the number of features." - } + }, + "refined_type": {} }, { "name": "y", @@ -40728,13 +42131,14 @@ "docstring": { "type": "Ignored", "description": "Not used, present for API consistency by convention." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Fit the Ledoit-Wolf shrunk covariance model to X.", - "docstring": "Fit the Ledoit-Wolf shrunk covariance model to X.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n Training data, where `n_samples` is the number of samples\n and `n_features` is the number of features.\ny : Ignored\n Not used, present for API consistency by convention.\n\nReturns\n-------\nself : object\n Returns the instance itself.", + "docstring": "Fit the Ledoit-Wolf shrunk covariance model to X.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training data, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n y : Ignored\n Not used, present for API consistency by convention.\n\n Returns\n -------\n self : object\n Returns the instance itself.\n ", "source_code": "\ndef fit(self, X, y=None):\n \"\"\"Fit the Ledoit-Wolf shrunk covariance model to X.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training data, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n y : Ignored\n Not used, present for API consistency by convention.\n\n Returns\n -------\n self : object\n Returns the instance itself.\n \"\"\"\n X = self._validate_data(X)\n if self.assume_centered:\n self.location_ = np.zeros(X.shape[1])\n else:\n self.location_ = X.mean(0)\n with config_context(assume_finite=True):\n (covariance, shrinkage) = ledoit_wolf(X - self.location_, assume_centered=True, block_size=self.block_size)\n self.shrinkage_ = shrinkage\n self._set_covariance(covariance)\n return self" }, { @@ -40752,7 +42156,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -40762,7 +42167,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "Training data, where `n_samples` is the number of samples\nand `n_features` is the number of features." - } + }, + "refined_type": {} }, { "name": "y", @@ -40772,13 +42178,14 @@ "docstring": { "type": "Ignored", "description": "Not used, present for API consistency by convention." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Fit the Oracle Approximating Shrinkage covariance model to X.", - "docstring": "Fit the Oracle Approximating Shrinkage covariance model to X.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n Training data, where `n_samples` is the number of samples\n and `n_features` is the number of features.\ny : Ignored\n Not used, present for API consistency by convention.\n\nReturns\n-------\nself : object\n Returns the instance itself.", + "docstring": "Fit the Oracle Approximating Shrinkage covariance model to X.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training data, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n y : Ignored\n Not used, present for API consistency by convention.\n\n Returns\n -------\n self : object\n Returns the instance itself.\n ", "source_code": "\ndef fit(self, X, y=None):\n \"\"\"Fit the Oracle Approximating Shrinkage covariance model to X.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training data, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n y : Ignored\n Not used, present for API consistency by convention.\n\n Returns\n -------\n self : object\n Returns the instance itself.\n \"\"\"\n X = self._validate_data(X)\n if self.assume_centered:\n self.location_ = np.zeros(X.shape[1])\n else:\n self.location_ = X.mean(0)\n (covariance, shrinkage) = oas(X - self.location_, assume_centered=True)\n self.shrinkage_ = shrinkage\n self._set_covariance(covariance)\n return self" }, { @@ -40796,7 +42203,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "store_precision", @@ -40806,7 +42214,8 @@ "docstring": { "type": "bool, default=True", "description": "Specify if the estimated precision is stored." - } + }, + "refined_type": {} }, { "name": "assume_centered", @@ -40816,7 +42225,8 @@ "docstring": { "type": "bool, default=False", "description": "If True, data will not be centered before computation.\nUseful when working with data whose mean is almost, but not exactly\nzero.\nIf False, data will be centered before computation." - } + }, + "refined_type": {} }, { "name": "shrinkage", @@ -40826,13 +42236,14 @@ "docstring": { "type": "float, default=0.1", "description": "Coefficient in the convex combination used for the computation\nof the shrunk estimate. Range is [0, 1]." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, *, store_precision=True, assume_centered=False, shrinkage=0.1):\n super().__init__(store_precision=store_precision, assume_centered=assume_centered)\n self.shrinkage = shrinkage" }, { @@ -40850,7 +42261,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -40860,7 +42272,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "Training data, where `n_samples` is the number of samples\nand `n_features` is the number of features." - } + }, + "refined_type": {} }, { "name": "y", @@ -40870,13 +42283,14 @@ "docstring": { "type": "Ignored", "description": "Not used, present for API consistency by convention." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Fit the shrunk covariance model to X.", - "docstring": "Fit the shrunk covariance model to X.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n Training data, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\ny : Ignored\n Not used, present for API consistency by convention.\n\nReturns\n-------\nself : object\n Returns the instance itself.", + "docstring": "Fit the shrunk covariance model to X.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training data, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n Returns\n -------\n self : object\n Returns the instance itself.\n ", "source_code": "\ndef fit(self, X, y=None):\n \"\"\"Fit the shrunk covariance model to X.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training data, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n Returns\n -------\n self : object\n Returns the instance itself.\n \"\"\"\n X = self._validate_data(X)\n if self.assume_centered:\n self.location_ = np.zeros(X.shape[1])\n else:\n self.location_ = X.mean(0)\n covariance = empirical_covariance(X, assume_centered=self.assume_centered)\n covariance = shrunk_covariance(covariance, self.shrinkage)\n self._set_covariance(covariance)\n return self" }, { @@ -40894,7 +42308,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "Data from which to compute the covariance estimate" - } + }, + "refined_type": {} }, { "name": "assume_centered", @@ -40904,7 +42319,8 @@ "docstring": { "type": "bool, default=False", "description": "If True, data will not be centered before computation.\nUseful to work with data whose mean is significantly equal to\nzero but is not exactly zero.\nIf False, data will be centered before computation." - } + }, + "refined_type": {} }, { "name": "block_size", @@ -40914,13 +42330,14 @@ "docstring": { "type": "int, default=1000", "description": "Size of blocks into which the covariance matrix will be split.\nThis is purely a memory optimization and does not affect results." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Estimates the shrunk Ledoit-Wolf covariance matrix.\n\nRead more in the :ref:`User Guide `.", - "docstring": "Estimates the shrunk Ledoit-Wolf covariance matrix.\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n Data from which to compute the covariance estimate\n\nassume_centered : bool, default=False\n If True, data will not be centered before computation.\n Useful to work with data whose mean is significantly equal to\n zero but is not exactly zero.\n If False, data will be centered before computation.\n\nblock_size : int, default=1000\n Size of blocks into which the covariance matrix will be split.\n This is purely a memory optimization and does not affect results.\n\nReturns\n-------\nshrunk_cov : ndarray of shape (n_features, n_features)\n Shrunk covariance.\n\nshrinkage : float\n Coefficient in the convex combination used for the computation\n of the shrunk estimate.\n\nNotes\n-----\nThe regularized (shrunk) covariance is:\n\n(1 - shrinkage) * cov + shrinkage * mu * np.identity(n_features)\n\nwhere mu = trace(cov) / n_features", + "docstring": "Estimates the shrunk Ledoit-Wolf covariance matrix.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Data from which to compute the covariance estimate\n\n assume_centered : bool, default=False\n If True, data will not be centered before computation.\n Useful to work with data whose mean is significantly equal to\n zero but is not exactly zero.\n If False, data will be centered before computation.\n\n block_size : int, default=1000\n Size of blocks into which the covariance matrix will be split.\n This is purely a memory optimization and does not affect results.\n\n Returns\n -------\n shrunk_cov : ndarray of shape (n_features, n_features)\n Shrunk covariance.\n\n shrinkage : float\n Coefficient in the convex combination used for the computation\n of the shrunk estimate.\n\n Notes\n -----\n The regularized (shrunk) covariance is:\n\n (1 - shrinkage) * cov + shrinkage * mu * np.identity(n_features)\n\n where mu = trace(cov) / n_features\n ", "source_code": "\ndef ledoit_wolf(X, *, assume_centered=False, block_size=1000):\n \"\"\"Estimates the shrunk Ledoit-Wolf covariance matrix.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Data from which to compute the covariance estimate\n\n assume_centered : bool, default=False\n If True, data will not be centered before computation.\n Useful to work with data whose mean is significantly equal to\n zero but is not exactly zero.\n If False, data will be centered before computation.\n\n block_size : int, default=1000\n Size of blocks into which the covariance matrix will be split.\n This is purely a memory optimization and does not affect results.\n\n Returns\n -------\n shrunk_cov : ndarray of shape (n_features, n_features)\n Shrunk covariance.\n\n shrinkage : float\n Coefficient in the convex combination used for the computation\n of the shrunk estimate.\n\n Notes\n -----\n The regularized (shrunk) covariance is:\n\n (1 - shrinkage) * cov + shrinkage * mu * np.identity(n_features)\n\n where mu = trace(cov) / n_features\n \"\"\"\n X = check_array(X)\n if len(X.shape) == 2 and X.shape[1] == 1:\n if not assume_centered:\n X = X - X.mean()\n return np.atleast_2d((X**2).mean()), 0.0\n if X.ndim == 1:\n X = 
np.reshape(X, (1, -1))\n warnings.warn('Only one sample available. You may want to reshape your data array')\n n_features = X.size\n else:\n (_, n_features) = X.shape\n shrinkage = ledoit_wolf_shrinkage(X, assume_centered=assume_centered, block_size=block_size)\n emp_cov = empirical_covariance(X, assume_centered=assume_centered)\n mu = np.sum(np.trace(emp_cov)) / n_features\n shrunk_cov = (1.0 - shrinkage) * emp_cov\n shrunk_cov.flat[::n_features + 1] += shrinkage * mu\n return shrunk_cov, shrinkage" }, { @@ -40938,7 +42355,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "Data from which to compute the Ledoit-Wolf shrunk covariance shrinkage." - } + }, + "refined_type": {} }, { "name": "assume_centered", @@ -40948,7 +42366,8 @@ "docstring": { "type": "bool, default=False", "description": "If True, data will not be centered before computation.\nUseful to work with data whose mean is significantly equal to\nzero but is not exactly zero.\nIf False, data will be centered before computation." - } + }, + "refined_type": {} }, { "name": "block_size", @@ -40958,13 +42377,14 @@ "docstring": { "type": "int, default=1000", "description": "Size of blocks into which the covariance matrix will be split." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Estimates the shrunk Ledoit-Wolf covariance matrix.\n\nRead more in the :ref:`User Guide `.", - "docstring": "Estimates the shrunk Ledoit-Wolf covariance matrix.\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n Data from which to compute the Ledoit-Wolf shrunk covariance shrinkage.\n\nassume_centered : bool, default=False\n If True, data will not be centered before computation.\n Useful to work with data whose mean is significantly equal to\n zero but is not exactly zero.\n If False, data will be centered before computation.\n\nblock_size : int, default=1000\n Size of blocks into which the covariance matrix will be split.\n\nReturns\n-------\nshrinkage : float\n Coefficient in the convex combination used for the computation\n of the shrunk estimate.\n\nNotes\n-----\nThe regularized (shrunk) covariance is:\n\n(1 - shrinkage) * cov + shrinkage * mu * np.identity(n_features)\n\nwhere mu = trace(cov) / n_features", + "docstring": "Estimates the shrunk Ledoit-Wolf covariance matrix.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Data from which to compute the Ledoit-Wolf shrunk covariance shrinkage.\n\n assume_centered : bool, default=False\n If True, data will not be centered before computation.\n Useful to work with data whose mean is significantly equal to\n zero but is not exactly zero.\n If False, data will be centered before computation.\n\n block_size : int, default=1000\n Size of blocks into which the covariance matrix will be split.\n\n Returns\n -------\n shrinkage : float\n Coefficient in the convex combination used for the computation\n of the shrunk estimate.\n\n Notes\n -----\n The regularized (shrunk) covariance is:\n\n (1 - shrinkage) * cov + shrinkage * mu * np.identity(n_features)\n\n where mu = trace(cov) / n_features\n ", "source_code": "\ndef ledoit_wolf_shrinkage(X, assume_centered=False, block_size=1000):\n \"\"\"Estimates the shrunk Ledoit-Wolf covariance matrix.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Data from which to compute the 
Ledoit-Wolf shrunk covariance shrinkage.\n\n assume_centered : bool, default=False\n If True, data will not be centered before computation.\n Useful to work with data whose mean is significantly equal to\n zero but is not exactly zero.\n If False, data will be centered before computation.\n\n block_size : int, default=1000\n Size of blocks into which the covariance matrix will be split.\n\n Returns\n -------\n shrinkage : float\n Coefficient in the convex combination used for the computation\n of the shrunk estimate.\n\n Notes\n -----\n The regularized (shrunk) covariance is:\n\n (1 - shrinkage) * cov + shrinkage * mu * np.identity(n_features)\n\n where mu = trace(cov) / n_features\n \"\"\"\n X = check_array(X)\n if len(X.shape) == 2 and X.shape[1] == 1:\n return 0.0\n if X.ndim == 1:\n X = np.reshape(X, (1, -1))\n if X.shape[0] == 1:\n warnings.warn('Only one sample available. You may want to reshape your data array')\n (n_samples, n_features) = X.shape\n if not assume_centered:\n X = X - X.mean(0)\n n_splits = int(n_features / block_size)\n X2 = X**2\n emp_cov_trace = np.sum(X2, axis=0) / n_samples\n mu = np.sum(emp_cov_trace) / n_features\n beta_ = 0.0\n delta_ = 0.0\n for i in range(n_splits):\n for j in range(n_splits):\n rows = slice(block_size * i, block_size * (i + 1))\n cols = slice(block_size * j, block_size * (j + 1))\n beta_ += np.sum(np.dot(X2.T[rows], X2[:, cols]))\n delta_ += np.sum(np.dot(X.T[rows], X[:, cols])**2)\n rows = slice(block_size * i, block_size * (i + 1))\n beta_ += np.sum(np.dot(X2.T[rows], X2[:, block_size * n_splits:]))\n delta_ += np.sum(np.dot(X.T[rows], X[:, block_size * n_splits:])**2)\n for j in range(n_splits):\n cols = slice(block_size * j, block_size * (j + 1))\n beta_ += np.sum(np.dot(X2.T[block_size * n_splits:], X2[:, cols]))\n delta_ += np.sum(np.dot(X.T[block_size * n_splits:], X[:, cols])**2)\n delta_ += np.sum(np.dot(X.T[block_size * n_splits:], X[:, block_size * n_splits:])**2)\n delta_ /= n_samples**2\n beta_ += np.sum(np.dot(X2.T[block_size * n_splits:], X2[:, block_size * n_splits:]))\n beta = 1.0 / (n_features * n_samples) * (beta_ / n_samples - delta_)\n delta = delta_ - 2.0 * mu * emp_cov_trace.sum() + n_features * mu**2\n delta /= n_features\n beta = min(beta, delta)\n shrinkage = 0 if beta == 0 else beta / delta\n return shrinkage" }, { @@ -40982,7 +42402,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "Data from which to compute the covariance estimate." - } + }, + "refined_type": {} }, { "name": "assume_centered", @@ -40992,13 +42413,14 @@ "docstring": { "type": "bool, default=False", "description": "If True, data will not be centered before computation.\nUseful to work with data whose mean is significantly equal to\nzero but is not exactly zero.\nIf False, data will be centered before computation." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Estimate covariance with the Oracle Approximating Shrinkage algorithm.", - "docstring": "Estimate covariance with the Oracle Approximating Shrinkage algorithm.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n Data from which to compute the covariance estimate.\n\nassume_centered : bool, default=False\n If True, data will not be centered before computation.\n Useful to work with data whose mean is significantly equal to\n zero but is not exactly zero.\n If False, data will be centered before computation.\n\nReturns\n-------\nshrunk_cov : array-like of shape (n_features, n_features)\n Shrunk covariance.\n\nshrinkage : float\n Coefficient in the convex combination used for the computation\n of the shrunk estimate.\n\nNotes\n-----\nThe regularised (shrunk) covariance is:\n\n(1 - shrinkage) * cov + shrinkage * mu * np.identity(n_features)\n\nwhere mu = trace(cov) / n_features\n\nThe formula we used to implement the OAS is slightly modified compared\nto the one given in the article. See :class:`OAS` for more details.", + "docstring": "Estimate covariance with the Oracle Approximating Shrinkage algorithm.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Data from which to compute the covariance estimate.\n\n assume_centered : bool, default=False\n If True, data will not be centered before computation.\n Useful to work with data whose mean is significantly equal to\n zero but is not exactly zero.\n If False, data will be centered before computation.\n\n Returns\n -------\n shrunk_cov : array-like of shape (n_features, n_features)\n Shrunk covariance.\n\n shrinkage : float\n Coefficient in the convex combination used for the computation\n of the shrunk estimate.\n\n Notes\n -----\n The regularised (shrunk) covariance is:\n\n (1 - shrinkage) * cov + shrinkage * mu * np.identity(n_features)\n\n where mu = trace(cov) / n_features\n\n The formula we used to implement the OAS is slightly modified compared\n to the one given in the article. See :class:`OAS` for more details.\n ", "source_code": "\ndef oas(X, *, assume_centered=False):\n \"\"\"Estimate covariance with the Oracle Approximating Shrinkage algorithm.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Data from which to compute the covariance estimate.\n\n assume_centered : bool, default=False\n If True, data will not be centered before computation.\n Useful to work with data whose mean is significantly equal to\n zero but is not exactly zero.\n If False, data will be centered before computation.\n\n Returns\n -------\n shrunk_cov : array-like of shape (n_features, n_features)\n Shrunk covariance.\n\n shrinkage : float\n Coefficient in the convex combination used for the computation\n of the shrunk estimate.\n\n Notes\n -----\n The regularised (shrunk) covariance is:\n\n (1 - shrinkage) * cov + shrinkage * mu * np.identity(n_features)\n\n where mu = trace(cov) / n_features\n\n The formula we used to implement the OAS is slightly modified compared\n to the one given in the article. See :class:`OAS` for more details.\n \"\"\"\n X = np.asarray(X)\n if len(X.shape) == 2 and X.shape[1] == 1:\n if not assume_centered:\n X = X - X.mean()\n return np.atleast_2d((X**2).mean()), 0.0\n if X.ndim == 1:\n X = np.reshape(X, (1, -1))\n warnings.warn('Only one sample available. 
You may want to reshape your data array')\n n_samples = 1\n n_features = X.size\n else:\n (n_samples, n_features) = X.shape\n emp_cov = empirical_covariance(X, assume_centered=assume_centered)\n mu = np.trace(emp_cov) / n_features\n alpha = np.mean(emp_cov**2)\n num = alpha + mu**2\n den = (n_samples + 1.0) * (alpha - mu**2 / n_features)\n shrinkage = 1.0 if den == 0 else min(num / den, 1.0)\n shrunk_cov = (1.0 - shrinkage) * emp_cov\n shrunk_cov.flat[::n_features + 1] += shrinkage * mu\n return shrunk_cov, shrinkage" }, { @@ -41016,7 +42438,8 @@ "docstring": { "type": "array-like of shape (n_features, n_features)", "description": "Covariance matrix to be shrunk" - } + }, + "refined_type": {} }, { "name": "shrinkage", @@ -41026,13 +42449,14 @@ "docstring": { "type": "float, default=0.1", "description": "Coefficient in the convex combination used for the computation\nof the shrunk estimate. Range is [0, 1]." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Calculates a covariance matrix shrunk on the diagonal\n\nRead more in the :ref:`User Guide `.", - "docstring": "Calculates a covariance matrix shrunk on the diagonal\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\nemp_cov : array-like of shape (n_features, n_features)\n Covariance matrix to be shrunk\n\nshrinkage : float, default=0.1\n Coefficient in the convex combination used for the computation\n of the shrunk estimate. Range is [0, 1].\n\nReturns\n-------\nshrunk_cov : ndarray of shape (n_features, n_features)\n Shrunk covariance.\n\nNotes\n-----\nThe regularized (shrunk) covariance is given by:\n\n(1 - shrinkage) * cov + shrinkage * mu * np.identity(n_features)\n\nwhere mu = trace(cov) / n_features", + "docstring": "Calculates a covariance matrix shrunk on the diagonal\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n emp_cov : array-like of shape (n_features, n_features)\n Covariance matrix to be shrunk\n\n shrinkage : float, default=0.1\n Coefficient in the convex combination used for the computation\n of the shrunk estimate. Range is [0, 1].\n\n Returns\n -------\n shrunk_cov : ndarray of shape (n_features, n_features)\n Shrunk covariance.\n\n Notes\n -----\n The regularized (shrunk) covariance is given by:\n\n (1 - shrinkage) * cov + shrinkage * mu * np.identity(n_features)\n\n where mu = trace(cov) / n_features\n ", "source_code": "\ndef shrunk_covariance(emp_cov, shrinkage=0.1):\n \"\"\"Calculates a covariance matrix shrunk on the diagonal\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n emp_cov : array-like of shape (n_features, n_features)\n Covariance matrix to be shrunk\n\n shrinkage : float, default=0.1\n Coefficient in the convex combination used for the computation\n of the shrunk estimate. 
Range is [0, 1].\n\n Returns\n -------\n shrunk_cov : ndarray of shape (n_features, n_features)\n Shrunk covariance.\n\n Notes\n -----\n The regularized (shrunk) covariance is given by:\n\n (1 - shrinkage) * cov + shrinkage * mu * np.identity(n_features)\n\n where mu = trace(cov) / n_features\n \"\"\"\n emp_cov = check_array(emp_cov)\n n_features = emp_cov.shape[0]\n mu = np.trace(emp_cov) / n_features\n shrunk_cov = (1.0 - shrinkage) * emp_cov\n shrunk_cov.flat[::n_features + 1] += shrinkage * mu\n return shrunk_cov" }, { @@ -41050,7 +42474,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_components", @@ -41060,7 +42485,8 @@ "docstring": { "type": "int, default=2", "description": "Number of components to keep. Should be in `[1, min(n_samples,\nn_features, n_targets)]`." - } + }, + "refined_type": {} }, { "name": "scale", @@ -41070,7 +42496,8 @@ "docstring": { "type": "bool, default=True", "description": "Whether to scale `X` and `Y`." - } + }, + "refined_type": {} }, { "name": "max_iter", @@ -41080,7 +42507,8 @@ "docstring": { "type": "int, default=500", "description": "The maximum number of iterations of the power method." - } + }, + "refined_type": {} }, { "name": "tol", @@ -41090,6 +42518,10 @@ "docstring": { "type": "float, default=1e-06", "description": "The tolerance used as convergence criteria in the power method: the\nalgorithm stops whenever the squared norm of `u_i - u_{i-1}` is less\nthan `tol`, where `u` corresponds to the left singular vector." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -41100,13 +42532,14 @@ "docstring": { "type": "bool, default=True", "description": "Whether to copy `X` and `Y` in fit before applying centering, and\npotentially scaling. If False, these operations will be done inplace,\nmodifying both arrays." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, n_components=2, *, scale=True, max_iter=500, tol=1e-06, copy=True):\n super().__init__(n_components=n_components, scale=scale, deflation_mode='canonical', mode='B', algorithm='nipals', max_iter=max_iter, tol=tol, copy=copy)" }, { @@ -41124,7 +42557,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_components", @@ -41134,7 +42568,8 @@ "docstring": { "type": "int, default=2", "description": "Number of components to keep. Should be in `[1, min(n_samples,\nn_features, n_targets)]`." - } + }, + "refined_type": {} }, { "name": "scale", @@ -41144,7 +42579,8 @@ "docstring": { "type": "bool, default=True", "description": "Whether to scale `X` and `Y`." - } + }, + "refined_type": {} }, { "name": "algorithm", @@ -41154,6 +42590,10 @@ "docstring": { "type": "{'nipals', 'svd'}, default='nipals'", "description": "The algorithm used to estimate the first singular vectors of the\ncross-covariance matrix. 'nipals' uses the power method while 'svd'\nwill compute the whole SVD." + }, + "refined_type": { + "kind": "EnumType", + "values": ["svd", "nipals"] } }, { @@ -41164,7 +42604,8 @@ "docstring": { "type": "int, default=500", "description": "The maximum number of iterations of the power method when\n`algorithm='nipals'`. Ignored otherwise." 
- } + }, + "refined_type": {} }, { "name": "tol", @@ -41174,6 +42615,10 @@ "docstring": { "type": "float, default=1e-06", "description": "The tolerance used as convergence criteria in the power method: the\nalgorithm stops whenever the squared norm of `u_i - u_{i-1}` is less\nthan `tol`, where `u` corresponds to the left singular vector." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -41184,13 +42629,14 @@ "docstring": { "type": "bool, default=True", "description": "Whether to copy `X` and `Y` in fit before applying centering, and\npotentially scaling. If False, these operations will be done inplace,\nmodifying both arrays." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, n_components=2, *, scale=True, algorithm='nipals', max_iter=500, tol=1e-06, copy=True):\n super().__init__(n_components=n_components, scale=scale, deflation_mode='canonical', mode='A', algorithm=algorithm, max_iter=max_iter, tol=tol, copy=copy)" }, { @@ -41208,7 +42654,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_components", @@ -41218,7 +42665,8 @@ "docstring": { "type": "int, default=2", "description": "Number of components to keep. Should be in `[1, min(n_samples,\nn_features, n_targets)]`." - } + }, + "refined_type": {} }, { "name": "scale", @@ -41228,7 +42676,8 @@ "docstring": { "type": "bool, default=True", "description": "Whether to scale `X` and `Y`." - } + }, + "refined_type": {} }, { "name": "max_iter", @@ -41238,7 +42687,8 @@ "docstring": { "type": "int, default=500", "description": "The maximum number of iterations of the power method when\n`algorithm='nipals'`. Ignored otherwise." - } + }, + "refined_type": {} }, { "name": "tol", @@ -41248,6 +42698,10 @@ "docstring": { "type": "float, default=1e-06", "description": "The tolerance used as convergence criteria in the power method: the\nalgorithm stops whenever the squared norm of `u_i - u_{i-1}` is less\nthan `tol`, where `u` corresponds to the left singular vector." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -41258,13 +42712,14 @@ "docstring": { "type": "bool, default=True", "description": "Whether to copy `X` and `Y` in :term:`fit` before applying centering,\nand potentially scaling. If `False`, these operations will be done\ninplace, modifying both arrays." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, n_components=2, *, scale=True, max_iter=500, tol=1e-06, copy=True):\n super().__init__(n_components=n_components, scale=scale, deflation_mode='regression', mode='A', algorithm='nipals', max_iter=max_iter, tol=tol, copy=copy)" }, { @@ -41282,7 +42737,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_components", @@ -41292,7 +42748,8 @@ "docstring": { "type": "int, default=2", "description": "The number of components to keep. Should be in `[1,\nmin(n_samples, n_features, n_targets)]`." - } + }, + "refined_type": {} }, { "name": "scale", @@ -41302,7 +42759,8 @@ "docstring": { "type": "bool, default=True", "description": "Whether to scale `X` and `Y`." - } + }, + "refined_type": {} }, { "name": "copy", @@ -41312,13 +42770,14 @@ "docstring": { "type": "bool, default=True", "description": "Whether to copy `X` and `Y` in fit before applying centering, and\npotentially scaling. 
If `False`, these operations will be done inplace,\nmodifying both arrays." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, n_components=2, *, scale=True, copy=True):\n self.n_components = n_components\n self.scale = scale\n self.copy = copy" }, { @@ -41336,7 +42795,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -41346,7 +42806,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "Training samples." - } + }, + "refined_type": {} }, { "name": "Y", @@ -41356,13 +42817,14 @@ "docstring": { "type": "array-like of shape (n_samples,) or (n_samples, n_targets)", "description": "Targets." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Fit model to data.", - "docstring": "Fit model to data.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n Training samples.\n\nY : array-like of shape (n_samples,) or (n_samples, n_targets)\n Targets.\n\nReturns\n-------\nself : object\n Fitted estimator.", + "docstring": "Fit model to data.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training samples.\n\n Y : array-like of shape (n_samples,) or (n_samples, n_targets)\n Targets.\n\n Returns\n -------\n self : object\n Fitted estimator.\n ", "source_code": "\ndef fit(self, X, Y):\n \"\"\"Fit model to data.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training samples.\n\n Y : array-like of shape (n_samples,) or (n_samples, n_targets)\n Targets.\n\n Returns\n -------\n self : object\n Fitted estimator.\n \"\"\"\n check_consistent_length(X, Y)\n X = self._validate_data(X, dtype=np.float64, copy=self.copy, ensure_min_samples=2)\n Y = check_array(Y, dtype=np.float64, copy=self.copy, ensure_2d=False)\n if Y.ndim == 1:\n Y = Y.reshape(-1, 1)\n n_components = self.n_components\n rank_upper_bound = min(X.shape[0], X.shape[1], Y.shape[1])\n if not 1 <= n_components <= rank_upper_bound:\n warnings.warn(f'As of version 0.24, n_components({n_components}) should be in [1, min(n_features, n_samples, n_targets)] = [1, {rank_upper_bound}]. n_components={rank_upper_bound} will be used instead. In version 1.1 (renaming of 0.26), an error will be raised.', FutureWarning)\n n_components = rank_upper_bound\n (X, Y, self._x_mean, self._y_mean, self._x_std, self._y_std) = _center_scale_xy(X, Y, self.scale)\n C = np.dot(X.T, Y)\n (U, s, Vt) = svd(C, full_matrices=False)\n U = U[:, :n_components]\n Vt = Vt[:n_components]\n (U, Vt) = svd_flip(U, Vt)\n V = Vt.T\n self._x_scores = np.dot(X, U)\n self._y_scores = np.dot(Y, V)\n self.x_weights_ = U\n self.y_weights_ = V\n return self" }, { @@ -41380,7 +42842,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -41390,7 +42853,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "Training samples." - } + }, + "refined_type": {} }, { "name": "y", @@ -41400,13 +42864,14 @@ "docstring": { "type": "array-like of shape (n_samples,) or (n_samples, n_targets), default=None", "description": "Targets." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Learn and apply the dimensionality reduction.", - "docstring": "Learn and apply the dimensionality reduction.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n Training samples.\n\ny : array-like of shape (n_samples,) or (n_samples, n_targets), default=None\n Targets.\n\nReturns\n-------\nout : array-like or tuple of array-like\n The transformed data `X_tranformed` if `Y is not None`,\n `(X_transformed, Y_transformed)` otherwise.", + "docstring": "Learn and apply the dimensionality reduction.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training samples.\n\n y : array-like of shape (n_samples,) or (n_samples, n_targets), default=None\n Targets.\n\n Returns\n -------\n out : array-like or tuple of array-like\n The transformed data `X_tranformed` if `Y is not None`,\n `(X_transformed, Y_transformed)` otherwise.\n ", "source_code": "\ndef fit_transform(self, X, y=None):\n \"\"\"Learn and apply the dimensionality reduction.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training samples.\n\n y : array-like of shape (n_samples,) or (n_samples, n_targets), default=None\n Targets.\n\n Returns\n -------\n out : array-like or tuple of array-like\n The transformed data `X_tranformed` if `Y is not None`,\n `(X_transformed, Y_transformed)` otherwise.\n \"\"\"\n return self.fit(X, y).transform(X, y)" }, { @@ -41424,7 +42889,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -41434,7 +42900,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "Samples to be transformed." - } + }, + "refined_type": {} }, { "name": "Y", @@ -41444,13 +42911,14 @@ "docstring": { "type": "array-like of shape (n_samples,) or (n_samples, n_targets), default=None", "description": "Targets." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Apply the dimensionality reduction.", - "docstring": "Apply the dimensionality reduction.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n Samples to be transformed.\n\nY : array-like of shape (n_samples,) or (n_samples, n_targets), default=None\n Targets.\n\nReturns\n-------\nx_scores : array-like or tuple of array-like\n The transformed data `X_tranformed` if `Y is not None`,\n `(X_transformed, Y_transformed)` otherwise.", + "docstring": "\n Apply the dimensionality reduction.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Samples to be transformed.\n\n Y : array-like of shape (n_samples,) or (n_samples, n_targets), default=None\n Targets.\n\n Returns\n -------\n x_scores : array-like or tuple of array-like\n The transformed data `X_tranformed` if `Y is not None`,\n `(X_transformed, Y_transformed)` otherwise.\n ", "source_code": "\ndef transform(self, X, Y=None):\n \"\"\"\n Apply the dimensionality reduction.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Samples to be transformed.\n\n Y : array-like of shape (n_samples,) or (n_samples, n_targets), default=None\n Targets.\n\n Returns\n -------\n x_scores : array-like or tuple of array-like\n The transformed data `X_tranformed` if `Y is not None`,\n `(X_transformed, Y_transformed)` otherwise.\n \"\"\"\n check_is_fitted(self)\n X = self._validate_data(X, dtype=np.float64, reset=False)\n Xr = (X - self._x_mean) / self._x_std\n x_scores = np.dot(Xr, self.x_weights_)\n if Y is not None:\n Y = check_array(Y, ensure_2d=False, dtype=np.float64)\n if Y.ndim == 1:\n Y = Y.reshape(-1, 1)\n Yr = (Y - self._y_mean) / self._y_std\n y_scores = np.dot(Yr, self.y_weights_)\n return x_scores, y_scores\n return x_scores" }, { @@ -41471,13 +42939,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@deprecated('Attribute `x_mean_` was deprecated in version 0.24 and will be removed in 1.1 (renaming of 0.26).')\n@property\ndef x_mean_(self):\n return self._x_mean" }, { @@ -41498,13 +42967,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@deprecated('Attribute `x_scores_` was deprecated in version 0.24 and will be removed in 1.1 (renaming of 0.26). 
Use est.transform(X) on the training data instead.')\n@property\ndef x_scores_(self):\n return self._x_scores" }, { @@ -41525,13 +42995,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@deprecated('Attribute `x_std_` was deprecated in version 0.24 and will be removed in 1.1 (renaming of 0.26).')\n@property\ndef x_std_(self):\n return self._x_std" }, { @@ -41552,13 +43023,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@deprecated('Attribute `y_mean_` was deprecated in version 0.24 and will be removed in 1.1 (renaming of 0.26).')\n@property\ndef y_mean_(self):\n return self._y_mean" }, { @@ -41579,13 +43051,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@deprecated('Attribute `y_scores_` was deprecated in version 0.24 and will be removed in 1.1 (renaming of 0.26). Use est.transform(X, Y) on the training data instead.')\n@property\ndef y_scores_(self):\n return self._y_scores" }, { @@ -41606,13 +43079,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@deprecated('Attribute `y_std_` was deprecated in version 0.24 and will be removed in 1.1 (renaming of 0.26).')\n@property\ndef y_std_(self):\n return self._y_std" }, { @@ -41630,7 +43104,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_components", @@ -41640,7 +43115,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "scale", @@ -41650,7 +43126,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "deflation_mode", @@ -41660,7 +43137,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "mode", @@ -41670,7 +43148,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "algorithm", @@ -41680,7 +43159,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "max_iter", @@ -41690,7 +43170,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "tol", @@ -41700,7 +43181,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "copy", @@ -41710,13 +43192,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@abstractmethod\ndef __init__(self, n_components=2, *, scale=True, deflation_mode='regression', mode='A', algorithm='nipals', max_iter=500, tol=1e-06, copy=True):\n self.n_components = n_components\n self.deflation_mode = deflation_mode\n self.mode = mode\n self.scale = scale\n self.algorithm = algorithm\n self.max_iter = max_iter\n self.tol = tol\n self.copy = copy" }, { @@ -41734,13 +43217,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _more_tags(self):\n return 
{'poor_score': True, 'requires_y': False}" }, { @@ -41758,7 +43242,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -41768,7 +43253,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "Training vectors, where `n_samples` is the number of samples and\n`n_features` is the number of predictors." - } + }, + "refined_type": {} }, { "name": "Y", @@ -41778,13 +43264,14 @@ "docstring": { "type": "array-like of shape (n_samples,) or (n_samples, n_targets)", "description": "Target vectors, where `n_samples` is the number of samples and\n`n_targets` is the number of response variables." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Fit model to data.", - "docstring": "Fit model to data.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n Training vectors, where `n_samples` is the number of samples and\n `n_features` is the number of predictors.\n\nY : array-like of shape (n_samples,) or (n_samples, n_targets)\n Target vectors, where `n_samples` is the number of samples and\n `n_targets` is the number of response variables.\n\nReturns\n-------\nself : object\n Fitted model.", + "docstring": "Fit model to data.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training vectors, where `n_samples` is the number of samples and\n `n_features` is the number of predictors.\n\n Y : array-like of shape (n_samples,) or (n_samples, n_targets)\n Target vectors, where `n_samples` is the number of samples and\n `n_targets` is the number of response variables.\n\n Returns\n -------\n self : object\n Fitted model.\n ", "source_code": "\ndef fit(self, X, Y):\n \"\"\"Fit model to data.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training vectors, where `n_samples` is the number of samples and\n `n_features` is the number of predictors.\n\n Y : array-like of shape (n_samples,) or (n_samples, n_targets)\n Target vectors, where `n_samples` is the number of samples and\n `n_targets` is the number of response variables.\n\n Returns\n -------\n self : object\n Fitted model.\n \"\"\"\n check_consistent_length(X, Y)\n X = self._validate_data(X, dtype=np.float64, copy=self.copy, ensure_min_samples=2)\n Y = check_array(Y, dtype=np.float64, copy=self.copy, ensure_2d=False)\n if Y.ndim == 1:\n Y = Y.reshape(-1, 1)\n n = X.shape[0]\n p = X.shape[1]\n q = Y.shape[1]\n n_components = self.n_components\n if self.deflation_mode == 'regression':\n rank_upper_bound = p\n if not 1 <= n_components <= rank_upper_bound:\n warnings.warn(f'As of version 0.24, n_components({n_components}) should be in [1, n_features].n_components={rank_upper_bound} will be used instead. In version 1.1 (renaming of 0.26), an error will be raised.', FutureWarning)\n n_components = rank_upper_bound\n else:\n rank_upper_bound = min(n, p, q)\n if not 1 <= self.n_components <= rank_upper_bound:\n warnings.warn(f'As of version 0.24, n_components({n_components}) should be in [1, min(n_features, n_samples, n_targets)] = [1, {rank_upper_bound}]. n_components={rank_upper_bound} will be used instead. 
In version 1.1 (renaming of 0.26), an error will be raised.', FutureWarning)\n n_components = rank_upper_bound\n if self.algorithm not in ('svd', 'nipals'):\n raise ValueError(f\"algorithm should be 'svd' or 'nipals', got {self.algorithm}.\")\n self._norm_y_weights = self.deflation_mode == 'canonical'\n norm_y_weights = self._norm_y_weights\n (Xk, Yk, self._x_mean, self._y_mean, self._x_std, self._y_std) = _center_scale_xy(X, Y, self.scale)\n self.x_weights_ = np.zeros((p, n_components))\n self.y_weights_ = np.zeros((q, n_components))\n self._x_scores = np.zeros((n, n_components))\n self._y_scores = np.zeros((n, n_components))\n self.x_loadings_ = np.zeros((p, n_components))\n self.y_loadings_ = np.zeros((q, n_components))\n self.n_iter_ = []\n Y_eps = np.finfo(Yk.dtype).eps\n for k in range(n_components):\n if self.algorithm == 'nipals':\n Yk_mask = np.all(np.abs(Yk) < 10 * Y_eps, axis=0)\n Yk[:, Yk_mask] = 0.0\n try:\n (x_weights, y_weights, n_iter_) = _get_first_singular_vectors_power_method(Xk, Yk, mode=self.mode, max_iter=self.max_iter, tol=self.tol, norm_y_weights=norm_y_weights)\n except StopIteration as e:\n if str(e) != 'Y residual is constant':\n raise\n warnings.warn(f'Y residual is constant at iteration {k}')\n break\n self.n_iter_.append(n_iter_)\n elif self.algorithm == 'svd':\n (x_weights, y_weights) = _get_first_singular_vectors_svd(Xk, Yk)\n _svd_flip_1d(x_weights, y_weights)\n x_scores = np.dot(Xk, x_weights)\n if norm_y_weights:\n y_ss = 1\n else:\n y_ss = np.dot(y_weights, y_weights)\n y_scores = np.dot(Yk, y_weights) / y_ss\n x_loadings = np.dot(x_scores, Xk) / np.dot(x_scores, x_scores)\n Xk -= np.outer(x_scores, x_loadings)\n if self.deflation_mode == 'canonical':\n y_loadings = np.dot(y_scores, Yk) / np.dot(y_scores, y_scores)\n Yk -= np.outer(y_scores, y_loadings)\n if self.deflation_mode == 'regression':\n y_loadings = np.dot(x_scores, Yk) / np.dot(x_scores, x_scores)\n Yk -= np.outer(x_scores, y_loadings)\n self.x_weights_[:, k] = x_weights\n self.y_weights_[:, k] = y_weights\n self._x_scores[:, k] = x_scores\n self._y_scores[:, k] = y_scores\n self.x_loadings_[:, k] = x_loadings\n self.y_loadings_[:, k] = y_loadings\n self.x_rotations_ = np.dot(self.x_weights_, pinv2(np.dot(self.x_loadings_.T, self.x_weights_), check_finite=False))\n self.y_rotations_ = np.dot(self.y_weights_, pinv2(np.dot(self.y_loadings_.T, self.y_weights_), check_finite=False))\n self.coef_ = np.dot(self.x_rotations_, self.y_loadings_.T)\n self.coef_ = self.coef_ * self._y_std\n return self" }, { @@ -41802,7 +43289,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -41812,7 +43300,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "Training vectors, where `n_samples` is the number of samples and\n`n_features` is the number of predictors." - } + }, + "refined_type": {} }, { "name": "y", @@ -41822,13 +43311,14 @@ "docstring": { "type": "array-like of shape (n_samples, n_targets), default=None", "description": "Target vectors, where `n_samples` is the number of samples and\n`n_targets` is the number of response variables." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Learn and apply the dimension reduction on the train data.", - "docstring": "Learn and apply the dimension reduction on the train data.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n Training vectors, where `n_samples` is the number of samples and\n `n_features` is the number of predictors.\n\ny : array-like of shape (n_samples, n_targets), default=None\n Target vectors, where `n_samples` is the number of samples and\n `n_targets` is the number of response variables.\n\nReturns\n-------\nself : ndarray of shape (n_samples, n_components)\n Return `x_scores` if `Y` is not given, `(x_scores, y_scores)` otherwise.", + "docstring": "Learn and apply the dimension reduction on the train data.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training vectors, where `n_samples` is the number of samples and\n `n_features` is the number of predictors.\n\n y : array-like of shape (n_samples, n_targets), default=None\n Target vectors, where `n_samples` is the number of samples and\n `n_targets` is the number of response variables.\n\n Returns\n -------\n self : ndarray of shape (n_samples, n_components)\n Return `x_scores` if `Y` is not given, `(x_scores, y_scores)` otherwise.\n ", "source_code": "\ndef fit_transform(self, X, y=None):\n \"\"\"Learn and apply the dimension reduction on the train data.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training vectors, where `n_samples` is the number of samples and\n `n_features` is the number of predictors.\n\n y : array-like of shape (n_samples, n_targets), default=None\n Target vectors, where `n_samples` is the number of samples and\n `n_targets` is the number of response variables.\n\n Returns\n -------\n self : ndarray of shape (n_samples, n_components)\n Return `x_scores` if `Y` is not given, `(x_scores, y_scores)` otherwise.\n \"\"\"\n return self.fit(X, y).transform(X, y)" }, { @@ -41846,7 +43336,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -41856,13 +43347,14 @@ "docstring": { "type": "array-like of shape (n_samples, n_components)", "description": "New data, where `n_samples` is the number of samples\nand `n_components` is the number of pls components." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Transform data back to its original space.", - "docstring": "Transform data back to its original space.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_components)\n New data, where `n_samples` is the number of samples\n and `n_components` is the number of pls components.\n\nReturns\n-------\nself : ndarray of shape (n_samples, n_features)\n Return the reconstructed array.\n\nNotes\n-----\nThis transformation will only be exact if `n_components=n_features`.", + "docstring": "Transform data back to its original space.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_components)\n New data, where `n_samples` is the number of samples\n and `n_components` is the number of pls components.\n\n Returns\n -------\n self : ndarray of shape (n_samples, n_features)\n Return the reconstructed array.\n\n Notes\n -----\n This transformation will only be exact if `n_components=n_features`.\n ", "source_code": "\ndef inverse_transform(self, X):\n \"\"\"Transform data back to its original space.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_components)\n New data, where `n_samples` is the number of samples\n and `n_components` is the number of pls components.\n\n Returns\n -------\n self : ndarray of shape (n_samples, n_features)\n Return the reconstructed array.\n\n Notes\n -----\n This transformation will only be exact if `n_components=n_features`.\n \"\"\"\n check_is_fitted(self)\n X = check_array(X, dtype=FLOAT_DTYPES)\n X_reconstructed = np.matmul(X, self.x_loadings_.T)\n X_reconstructed *= self._x_std\n X_reconstructed += self._x_mean\n return X_reconstructed" }, { @@ -41883,13 +43375,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@deprecated('Attribute `norm_y_weights` was deprecated in version 0.24 and will be removed in 1.1 (renaming of 0.26).')\n@property\ndef norm_y_weights(self):\n return self._norm_y_weights" }, { @@ -41907,7 +43400,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -41917,7 +43411,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "Samples." - } + }, + "refined_type": {} }, { "name": "copy", @@ -41927,13 +43422,14 @@ "docstring": { "type": "bool, default=True", "description": "Whether to copy `X` and `Y`, or perform in-place normalization." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Predict targets of given samples.", - "docstring": "Predict targets of given samples.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n Samples.\n\ncopy : bool, default=True\n Whether to copy `X` and `Y`, or perform in-place normalization.\n\nReturns\n-------\ny_pred : ndarray of shape (n_samples,) or (n_samples, n_targets)\n Returns predicted values.\n\nNotes\n-----\nThis call requires the estimation of a matrix of shape\n`(n_features, n_targets)`, which may be an issue in high dimensional\nspace.", + "docstring": "Predict targets of given samples.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Samples.\n\n copy : bool, default=True\n Whether to copy `X` and `Y`, or perform in-place normalization.\n\n Returns\n -------\n y_pred : ndarray of shape (n_samples,) or (n_samples, n_targets)\n Returns predicted values.\n\n Notes\n -----\n This call requires the estimation of a matrix of shape\n `(n_features, n_targets)`, which may be an issue in high dimensional\n space.\n ", "source_code": "\ndef predict(self, X, copy=True):\n \"\"\"Predict targets of given samples.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Samples.\n\n copy : bool, default=True\n Whether to copy `X` and `Y`, or perform in-place normalization.\n\n Returns\n -------\n y_pred : ndarray of shape (n_samples,) or (n_samples, n_targets)\n Returns predicted values.\n\n Notes\n -----\n This call requires the estimation of a matrix of shape\n `(n_features, n_targets)`, which may be an issue in high dimensional\n space.\n \"\"\"\n check_is_fitted(self)\n X = self._validate_data(X, copy=copy, dtype=FLOAT_DTYPES, reset=False)\n X -= self._x_mean\n X /= self._x_std\n Ypred = np.dot(X, self.coef_)\n return Ypred + self._y_mean" }, { @@ -41951,7 +43447,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -41961,7 +43458,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "Samples to transform." - } + }, + "refined_type": {} }, { "name": "Y", @@ -41971,7 +43469,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_targets), default=None", "description": "Target vectors." - } + }, + "refined_type": {} }, { "name": "copy", @@ -41981,13 +43480,14 @@ "docstring": { "type": "bool, default=True", "description": "Whether to copy `X` and `Y`, or perform in-place normalization." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Apply the dimension reduction.", - "docstring": "Apply the dimension reduction.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n Samples to transform.\n\nY : array-like of shape (n_samples, n_targets), default=None\n Target vectors.\n\ncopy : bool, default=True\n Whether to copy `X` and `Y`, or perform in-place normalization.\n\nReturns\n-------\nx_scores, y_scores : array-like or tuple of array-like\n Return `x_scores` if `Y` is not given, `(x_scores, y_scores)` otherwise.", + "docstring": "Apply the dimension reduction.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Samples to transform.\n\n Y : array-like of shape (n_samples, n_targets), default=None\n Target vectors.\n\n copy : bool, default=True\n Whether to copy `X` and `Y`, or perform in-place normalization.\n\n Returns\n -------\n x_scores, y_scores : array-like or tuple of array-like\n Return `x_scores` if `Y` is not given, `(x_scores, y_scores)` otherwise.\n ", "source_code": "\ndef transform(self, X, Y=None, copy=True):\n \"\"\"Apply the dimension reduction.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Samples to transform.\n\n Y : array-like of shape (n_samples, n_targets), default=None\n Target vectors.\n\n copy : bool, default=True\n Whether to copy `X` and `Y`, or perform in-place normalization.\n\n Returns\n -------\n x_scores, y_scores : array-like or tuple of array-like\n Return `x_scores` if `Y` is not given, `(x_scores, y_scores)` otherwise.\n \"\"\"\n check_is_fitted(self)\n X = self._validate_data(X, copy=copy, dtype=FLOAT_DTYPES, reset=False)\n X -= self._x_mean\n X /= self._x_std\n x_scores = np.dot(X, self.x_rotations_)\n if Y is not None:\n Y = check_array(Y, ensure_2d=False, copy=copy, dtype=FLOAT_DTYPES)\n if Y.ndim == 1:\n Y = Y.reshape(-1, 1)\n Y -= self._y_mean\n Y /= self._y_std\n y_scores = np.dot(Y, self.y_rotations_)\n return x_scores, y_scores\n return x_scores" }, { @@ -42008,13 +43508,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@deprecated('Attribute `x_mean_` was deprecated in version 0.24 and will be removed in 1.1 (renaming of 0.26).')\n@property\ndef x_mean_(self):\n return self._x_mean" }, { @@ -42032,7 +43533,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -42059,13 +43561,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@deprecated('Attribute `x_std_` was deprecated in version 0.24 and will be removed in 1.1 (renaming of 0.26).')\n@property\ndef x_std_(self):\n return self._x_std" }, { @@ -42086,13 +43589,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@deprecated('Attribute `y_mean_` was deprecated in version 0.24 and will be removed in 1.1 (renaming of 0.26).')\n@property\ndef y_mean_(self):\n return self._y_mean" }, { @@ -42110,7 +43614,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -42137,13 +43642,14 @@ "docstring": { "type": "", "description": "" - } + }, + 
"refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@deprecated('Attribute `y_std_` was deprecated in version 0.24 and will be removed in 1.1 (renaming of 0.26).')\n@property\ndef y_std_(self):\n return self._y_std" }, { @@ -42161,7 +43667,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "Y", @@ -42171,7 +43678,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "scale", @@ -42181,13 +43689,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Center X, Y and scale if the scale parameter==True", - "docstring": "Center X, Y and scale if the scale parameter==True\n\nReturns\n-------\n X, Y, x_mean, y_mean, x_std, y_std", + "docstring": "Center X, Y and scale if the scale parameter==True\n\n Returns\n -------\n X, Y, x_mean, y_mean, x_std, y_std\n ", "source_code": "\ndef _center_scale_xy(X, Y, scale=True):\n \"\"\"Center X, Y and scale if the scale parameter==True\n\n Returns\n -------\n X, Y, x_mean, y_mean, x_std, y_std\n \"\"\"\n x_mean = X.mean(axis=0)\n X -= x_mean\n y_mean = Y.mean(axis=0)\n Y -= y_mean\n if scale:\n x_std = X.std(axis=0, ddof=1)\n x_std[x_std == 0.0] = 1.0\n X /= x_std\n y_std = Y.std(axis=0, ddof=1)\n y_std[y_std == 0.0] = 1.0\n Y /= y_std\n else:\n x_std = np.ones(X.shape[1])\n y_std = np.ones(Y.shape[1])\n return X, Y, x_mean, y_mean, x_std, y_std" }, { @@ -42205,7 +43714,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "Y", @@ -42215,7 +43725,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "mode", @@ -42225,7 +43736,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "max_iter", @@ -42235,7 +43747,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "tol", @@ -42245,7 +43758,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "norm_y_weights", @@ -42255,13 +43769,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Return the first left and right singular vectors of X'Y.\n\nProvides an alternative to the svd(X'Y) and uses the power method instead. 
With norm_y_weights to True and in mode A, this corresponds to the algorithm section 11.3 of the Wegelin's review, except this starts at the \"update saliences\" part.", - "docstring": "Return the first left and right singular vectors of X'Y.\n\nProvides an alternative to the svd(X'Y) and uses the power method instead.\nWith norm_y_weights to True and in mode A, this corresponds to the\nalgorithm section 11.3 of the Wegelin's review, except this starts at the\n\"update saliences\" part.", + "description": "Return the first left and right singular vectors of X'Y.\n\nProvides an alternative to the svd(X'Y) and uses the power method instead.\nWith norm_y_weights to True and in mode A, this corresponds to the\nalgorithm section 11.3 of the Wegelin's review, except this starts at the\n\"update saliences\" part.", + "docstring": "Return the first left and right singular vectors of X'Y.\n\n Provides an alternative to the svd(X'Y) and uses the power method instead.\n With norm_y_weights to True and in mode A, this corresponds to the\n algorithm section 11.3 of the Wegelin's review, except this starts at the\n \"update saliences\" part.\n ", "source_code": "\ndef _get_first_singular_vectors_power_method(X, Y, mode='A', max_iter=500, tol=1e-06, norm_y_weights=False):\n \"\"\"Return the first left and right singular vectors of X'Y.\n\n Provides an alternative to the svd(X'Y) and uses the power method instead.\n With norm_y_weights to True and in mode A, this corresponds to the\n algorithm section 11.3 of the Wegelin's review, except this starts at the\n \"update saliences\" part.\n \"\"\"\n eps = np.finfo(X.dtype).eps\n try:\n y_score = next((col for col in Y.T if np.any(np.abs(col) > eps)))\n except StopIteration as e:\n raise StopIteration('Y residual is constant') from e\n x_weights_old = 100\n if mode == 'B':\n (X_pinv, Y_pinv) = (_pinv2_old(X), _pinv2_old(Y))\n for i in range(max_iter):\n if mode == 'B':\n x_weights = np.dot(X_pinv, y_score)\n else:\n x_weights = np.dot(X.T, y_score) / np.dot(y_score, y_score)\n x_weights /= np.sqrt(np.dot(x_weights, x_weights)) + eps\n x_score = np.dot(X, x_weights)\n if mode == 'B':\n y_weights = np.dot(Y_pinv, x_score)\n else:\n y_weights = np.dot(Y.T, x_score) / np.dot(x_score.T, x_score)\n if norm_y_weights:\n y_weights /= np.sqrt(np.dot(y_weights, y_weights)) + eps\n y_score = np.dot(Y, y_weights) / (np.dot(y_weights, y_weights) + eps)\n x_weights_diff = x_weights - x_weights_old\n if np.dot(x_weights_diff, x_weights_diff) < tol or Y.shape[1] == 1:\n break\n x_weights_old = x_weights\n n_iter = i + 1\n if n_iter == max_iter:\n warnings.warn('Maximum number of iterations reached', ConvergenceWarning)\n return x_weights, y_weights, n_iter" }, { @@ -42279,7 +43794,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "Y", @@ -42289,13 +43805,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Return the first left and right singular vectors of X'Y.\n\nHere the whole SVD is computed.", - "docstring": "Return the first left and right singular vectors of X'Y.\n\nHere the whole SVD is computed.", + "docstring": "Return the first left and right singular vectors of X'Y.\n\n Here the whole SVD is computed.\n ", "source_code": "\ndef _get_first_singular_vectors_svd(X, Y):\n \"\"\"Return the first left and right singular vectors of X'Y.\n\n Here the whole SVD is computed.\n \"\"\"\n C = np.dot(X.T, Y)\n (U, _, Vt) = svd(C, full_matrices=False)\n 
return U[:, 0], Vt[0, :]" }, { @@ -42313,13 +43830,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _pinv2_old(a):\n (u, s, vh) = svd(a, full_matrices=False, check_finite=False)\n t = u.dtype.char.lower()\n factor = {'f': 1000.0, 'd': 1000000.0}\n cond = np.max(s) * factor[t] * np.finfo(t).eps\n rank = np.sum(s > cond)\n u = u[:, :rank]\n u /= s[:rank]\n return np.transpose(np.conjugate(np.dot(u, vh[:rank])))" }, { @@ -42337,7 +43855,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "v", @@ -42347,7 +43866,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -42371,7 +43891,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "data", @@ -42381,7 +43902,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "target", @@ -42391,7 +43913,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "feature_names", @@ -42401,7 +43924,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "target_names", @@ -42411,7 +43935,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "sparse_data", @@ -42421,13 +43946,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _convert_data_dataframe(caller_name, data, target, feature_names, target_names, sparse_data=False):\n pd = check_pandas_support('{} with as_frame=True'.format(caller_name))\n if not sparse_data:\n data_df = pd.DataFrame(data, columns=feature_names)\n else:\n data_df = pd.DataFrame.sparse.from_spmatrix(data, columns=feature_names)\n target_df = pd.DataFrame(target, columns=target_names)\n combined_df = pd.concat([data_df, target_df], axis=1)\n X = combined_df[feature_names]\n y = combined_df[target_names]\n if y.shape[1] == 1:\n y = y.iloc[:, 0]\n return combined_df, X, y" }, { @@ -42445,7 +43971,8 @@ "docstring": { "type": "RemoteFileMetadata", "description": "Named tuple containing remote dataset meta information: url, filename\nand checksum" - } + }, + "refined_type": {} }, { "name": "dirname", @@ -42455,13 +43982,14 @@ "docstring": { "type": "str", "description": "Directory to save the file to." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Helper function to download a remote dataset into path\n\nFetch a dataset pointed by remote's url, save into path using remote's filename and ensure its integrity based on the SHA256 Checksum of the downloaded file.", - "docstring": "Helper function to download a remote dataset into path\n\nFetch a dataset pointed by remote's url, save into path using remote's\nfilename and ensure its integrity based on the SHA256 Checksum of the\ndownloaded file.\n\nParameters\n----------\nremote : RemoteFileMetadata\n Named tuple containing remote dataset meta information: url, filename\n and checksum\n\ndirname : str\n Directory to save the file to.\n\nReturns\n-------\nfile_path: str\n Full path of the created file.", + "description": "Helper function to download a remote dataset into path\n\nFetch a dataset pointed by remote's url, save into path using remote's\nfilename and ensure its integrity based on the SHA256 Checksum of the\ndownloaded file.", + "docstring": "Helper function to download a remote dataset into path\n\n Fetch a dataset pointed by remote's url, save into path using remote's\n filename and ensure its integrity based on the SHA256 Checksum of the\n downloaded file.\n\n Parameters\n ----------\n remote : RemoteFileMetadata\n Named tuple containing remote dataset meta information: url, filename\n and checksum\n\n dirname : str\n Directory to save the file to.\n\n Returns\n -------\n file_path: str\n Full path of the created file.\n ", "source_code": "\ndef _fetch_remote(remote, dirname=None):\n \"\"\"Helper function to download a remote dataset into path\n\n Fetch a dataset pointed by remote's url, save into path using remote's\n filename and ensure its integrity based on the SHA256 Checksum of the\n downloaded file.\n\n Parameters\n ----------\n remote : RemoteFileMetadata\n Named tuple containing remote dataset meta information: url, filename\n and checksum\n\n dirname : str\n Directory to save the file to.\n\n Returns\n -------\n file_path: str\n Full path of the created file.\n \"\"\"\n file_path = remote.filename if dirname is None else join(dirname, remote.filename)\n urlretrieve(remote.url, file_path)\n checksum = _sha256(file_path)\n if remote.checksum != checksum:\n raise IOError('{} has an SHA256 checksum ({}) differing from expected ({}), file may be corrupted.'.format(file_path, checksum, remote.checksum))\n return file_path" }, { @@ -42473,8 +44001,8 @@ "parameters": [], "results": [], "is_public": false, - "description": "Return filename for Python 3 pickles\n\nargs[-1] is expected to be the \".pkl\" filename. For compatibility with older scikit-learn versions, a suffix is inserted before the extension. _pkl_filepath('/path/to/folder', 'filename.pkl') returns '/path/to/folder/filename_py3.pkl'", - "docstring": "Return filename for Python 3 pickles\n\nargs[-1] is expected to be the \".pkl\" filename. For compatibility with\nolder scikit-learn versions, a suffix is inserted before the extension.\n\n_pkl_filepath('/path/to/folder', 'filename.pkl') returns\n'/path/to/folder/filename_py3.pkl'", + "description": "Return filename for Python 3 pickles\n\nargs[-1] is expected to be the \".pkl\" filename. 
For compatibility with\nolder scikit-learn versions, a suffix is inserted before the extension.\n\n_pkl_filepath('/path/to/folder', 'filename.pkl') returns\n'/path/to/folder/filename_py3.pkl'", + "docstring": "Return filename for Python 3 pickles\n\n args[-1] is expected to be the \".pkl\" filename. For compatibility with\n older scikit-learn versions, a suffix is inserted before the extension.\n\n _pkl_filepath('/path/to/folder', 'filename.pkl') returns\n '/path/to/folder/filename_py3.pkl'\n\n ", "source_code": "\ndef _pkl_filepath(*args, **kwargs):\n \"\"\"Return filename for Python 3 pickles\n\n args[-1] is expected to be the \".pkl\" filename. For compatibility with\n older scikit-learn versions, a suffix is inserted before the extension.\n\n _pkl_filepath('/path/to/folder', 'filename.pkl') returns\n '/path/to/folder/filename_py3.pkl'\n\n \"\"\"\n py3_suffix = kwargs.get('py3_suffix', '_py3')\n (basename, ext) = splitext(args[-1])\n basename += py3_suffix\n new_args = args[:-1] + (basename + ext, )\n return join(*new_args)" }, { @@ -42492,7 +44020,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -42516,13 +44045,14 @@ "docstring": { "type": "str, default=None", "description": "The path to scikit-learn data directory. If `None`, the default path\nis `~/sklearn_learn_data`." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Delete all the content of the data home cache.", - "docstring": "Delete all the content of the data home cache.\n\nParameters\n----------\ndata_home : str, default=None\n The path to scikit-learn data directory. If `None`, the default path\n is `~/sklearn_learn_data`.", + "docstring": "Delete all the content of the data home cache.\n\n Parameters\n ----------\n data_home : str, default=None\n The path to scikit-learn data directory. If `None`, the default path\n is `~/sklearn_learn_data`.\n ", "source_code": "\ndef clear_data_home(data_home=None):\n \"\"\"Delete all the content of the data home cache.\n\n Parameters\n ----------\n data_home : str, default=None\n The path to scikit-learn data directory. If `None`, the default path\n is `~/sklearn_learn_data`.\n \"\"\"\n data_home = get_data_home(data_home)\n shutil.rmtree(data_home)" }, { @@ -42540,13 +44070,14 @@ "docstring": { "type": "str, default=None", "description": "The path to scikit-learn data directory. If `None`, the default path\nis `~/sklearn_learn_data`." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Return the path of the scikit-learn data dir.\n\nThis folder is used by some large dataset loaders to avoid downloading the data several times. By default the data dir is set to a folder named 'scikit_learn_data' in the user home folder. Alternatively, it can be set by the 'SCIKIT_LEARN_DATA' environment variable or programmatically by giving an explicit folder path. The '~' symbol is expanded to the user home folder. If the folder does not already exist, it is automatically created.", - "docstring": "Return the path of the scikit-learn data dir.\n\nThis folder is used by some large dataset loaders to avoid downloading the\ndata several times.\n\nBy default the data dir is set to a folder named 'scikit_learn_data' in the\nuser home folder.\n\nAlternatively, it can be set by the 'SCIKIT_LEARN_DATA' environment\nvariable or programmatically by giving an explicit folder path. 
The '~'\nsymbol is expanded to the user home folder.\n\nIf the folder does not already exist, it is automatically created.\n\nParameters\n----------\ndata_home : str, default=None\n The path to scikit-learn data directory. If `None`, the default path\n is `~/sklearn_learn_data`.", + "description": "Return the path of the scikit-learn data dir.\n\nThis folder is used by some large dataset loaders to avoid downloading the\ndata several times.\n\nBy default the data dir is set to a folder named 'scikit_learn_data' in the\nuser home folder.\n\nAlternatively, it can be set by the 'SCIKIT_LEARN_DATA' environment\nvariable or programmatically by giving an explicit folder path. The '~'\nsymbol is expanded to the user home folder.\n\nIf the folder does not already exist, it is automatically created.", + "docstring": "Return the path of the scikit-learn data dir.\n\n This folder is used by some large dataset loaders to avoid downloading the\n data several times.\n\n By default the data dir is set to a folder named 'scikit_learn_data' in the\n user home folder.\n\n Alternatively, it can be set by the 'SCIKIT_LEARN_DATA' environment\n variable or programmatically by giving an explicit folder path. The '~'\n symbol is expanded to the user home folder.\n\n If the folder does not already exist, it is automatically created.\n\n Parameters\n ----------\n data_home : str, default=None\n The path to scikit-learn data directory. If `None`, the default path\n is `~/sklearn_learn_data`.\n ", "source_code": "\ndef get_data_home(data_home=None) -> str:\n \"\"\"Return the path of the scikit-learn data dir.\n\n This folder is used by some large dataset loaders to avoid downloading the\n data several times.\n\n By default the data dir is set to a folder named 'scikit_learn_data' in the\n user home folder.\n\n Alternatively, it can be set by the 'SCIKIT_LEARN_DATA' environment\n variable or programmatically by giving an explicit folder path. The '~'\n symbol is expanded to the user home folder.\n\n If the folder does not already exist, it is automatically created.\n\n Parameters\n ----------\n data_home : str, default=None\n The path to scikit-learn data directory. If `None`, the default path\n is `~/sklearn_learn_data`.\n \"\"\"\n if data_home is None:\n data_home = environ.get('SCIKIT_LEARN_DATA', join('~', 'scikit_learn_data'))\n data_home = expanduser(data_home)\n makedirs(data_home, exist_ok=True)\n return data_home" }, { @@ -42566,13 +44097,14 @@ "docstring": { "type": "bool, default=False", "description": "If True, returns ``(data, target)`` instead of a Bunch object.\nSee below for more information about the `data` and `target` object.\n\n.. versionadded:: 0.18" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Load and return the boston house-prices dataset (regression).\n\n============== ============== Samples total 506 Dimensionality 13 Features real, positive Targets real 5. - 50. ============== ============== Read more in the :ref:`User Guide `. .. deprecated:: 1.0 This function is deprecated in 1.0 and will be removed in 1.2. See the warning message below for further details regarding the alternative datasets. .. warning:: The Boston housing prices dataset has an ethical problem: as investigated in [1]_, the authors of this dataset engineered a non-invertible variable \"B\" assuming that racial self-segregation had a positive impact on house prices [2]_. 
Furthermore the goal of the research that led to the creation of this dataset was to study the impact of air quality but it did not give adequate demonstration of the validity of this assumption. The scikit-learn maintainers therefore strongly discourage the use of this dataset unless the purpose of the code is to study and educate about ethical issues in data science and machine learning. In this special case, you can fetch the dataset from the original source:: import pandas as pd # doctest: +SKIP import numpy as np data_url = \"http://lib.stat.cmu.edu/datasets/boston\" raw_df = pd.read_csv(data_url, sep=\"s+\", skiprows=22, header=None) data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]]) target = raw_df.values[1::2, 2] Alternative datasets include the California housing dataset [3]_ (i.e. :func:`~sklearn.datasets.fetch_california_housing`) and Ames housing dataset [4]_. You can load the datasets as follows:: from sklearn.datasets import fetch_california_housing housing = fetch_california_housing() for the California housing dataset and:: from sklearn.datasets import fetch_openml housing = fetch_openml(name=\"house_prices\", as_frame=True) # noqa for the Ames housing dataset.", - "docstring": "Load and return the boston house-prices dataset (regression).\n\n============== ==============\nSamples total 506\nDimensionality 13\nFeatures real, positive\nTargets real 5. - 50.\n============== ==============\n\nRead more in the :ref:`User Guide `.\n\n.. deprecated:: 1.0\n This function is deprecated in 1.0 and will be removed in 1.2. See the\n warning message below for further details regarding the alternative\n datasets.\n\n.. warning::\n The Boston housing prices dataset has an ethical problem: as\n investigated in [1]_, the authors of this dataset engineered a\n non-invertible variable \"B\" assuming that racial self-segregation had a\n positive impact on house prices [2]_. Furthermore the goal of the\n research that led to the creation of this dataset was to study the\n impact of air quality but it did not give adequate demonstration of the\n validity of this assumption.\n\n The scikit-learn maintainers therefore strongly discourage the use of\n this dataset unless the purpose of the code is to study and educate\n about ethical issues in data science and machine learning.\n\n In this special case, you can fetch the dataset from the original\n source::\n\n import pandas as pd # doctest: +SKIP\n import numpy as np\n\n\n data_url = \"http://lib.stat.cmu.edu/datasets/boston\"\n raw_df = pd.read_csv(data_url, sep=\"s+\", skiprows=22, header=None)\n data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])\n target = raw_df.values[1::2, 2]\n\n Alternative datasets include the California housing dataset [3]_\n (i.e. :func:`~sklearn.datasets.fetch_california_housing`) and Ames\n housing dataset [4]_. You can load the datasets as follows::\n\n from sklearn.datasets import fetch_california_housing\n housing = fetch_california_housing()\n\n for the California housing dataset and::\n\n from sklearn.datasets import fetch_openml\n housing = fetch_openml(name=\"house_prices\", as_frame=True) # noqa\n\n for the Ames housing dataset.\n\nParameters\n----------\nreturn_X_y : bool, default=False\n If True, returns ``(data, target)`` instead of a Bunch object.\n See below for more information about the `data` and `target` object.\n\n .. 
versionadded:: 0.18\n\nReturns\n-------\ndata : :class:`~sklearn.utils.Bunch`\n Dictionary-like object, with the following attributes.\n\n data : ndarray of shape (506, 13)\n The data matrix.\n target : ndarray of shape (506,)\n The regression target.\n filename : str\n The physical location of boston csv dataset.\n\n .. versionadded:: 0.20\n\n DESCR : str\n The full description of the dataset.\n feature_names : ndarray\n The names of features\n\n(data, target) : tuple if ``return_X_y`` is True\n\n .. versionadded:: 0.18\n\nNotes\n-----\n .. versionchanged:: 0.20\n Fixed a wrong data point at [445, 0].\n\nReferences\n----------\n.. [1] `Racist data destruction? M Carlisle,\n `_\n.. [2] `Harrison Jr, David, and Daniel L. Rubinfeld.\n \"Hedonic housing prices and the demand for clean air.\"\n Journal of environmental economics and management 5.1 (1978): 81-102.\n `_\n.. [3] `California housing dataset\n `_\n.. [4] `Ames housing dataset\n `_\n\nExamples\n--------\n>>> import warnings\n>>> from sklearn.datasets import load_boston\n>>> with warnings.catch_warnings():\n... # You should probably not use this dataset.\n... warnings.filterwarnings(\"ignore\")\n... X, y = load_boston(return_X_y=True)\n>>> print(X.shape)\n(506, 13)", + "description": "Load and return the boston house-prices dataset (regression).\n\n============== ==============\nSamples total 506\nDimensionality 13\nFeatures real, positive\nTargets real 5. - 50.\n============== ==============\n\nRead more in the :ref:`User Guide `.\n\n.. deprecated:: 1.0\n This function is deprecated in 1.0 and will be removed in 1.2. See the\n warning message below for further details regarding the alternative\n datasets.\n\n.. warning::\n The Boston housing prices dataset has an ethical problem: as\n investigated in [1]_, the authors of this dataset engineered a\n non-invertible variable \"B\" assuming that racial self-segregation had a\n positive impact on house prices [2]_. Furthermore the goal of the\n research that led to the creation of this dataset was to study the\n impact of air quality but it did not give adequate demonstration of the\n validity of this assumption.\n\n The scikit-learn maintainers therefore strongly discourage the use of\n this dataset unless the purpose of the code is to study and educate\n about ethical issues in data science and machine learning.\n\n In this special case, you can fetch the dataset from the original\n source::\n\n import pandas as pd # doctest: +SKIP\n import numpy as np\n\n data_url = \"http://lib.stat.cmu.edu/datasets/boston\"\n raw_df = pd.read_csv(data_url, sep=\"s+\", skiprows=22, header=None)\n data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])\n target = raw_df.values[1::2, 2]\n\n Alternative datasets include the California housing dataset [3]_\n (i.e. :func:`~sklearn.datasets.fetch_california_housing`) and Ames\n housing dataset [4]_. You can load the datasets as follows::\n\n from sklearn.datasets import fetch_california_housing\n housing = fetch_california_housing()\n\n for the California housing dataset and::\n\n from sklearn.datasets import fetch_openml\n housing = fetch_openml(name=\"house_prices\", as_frame=True) # noqa\n\n for the Ames housing dataset.", + "docstring": "Load and return the boston house-prices dataset (regression).\n\n ============== ==============\n Samples total 506\n Dimensionality 13\n Features real, positive\n Targets real 5. - 50.\n ============== ==============\n\n Read more in the :ref:`User Guide `.\n\n .. 
deprecated:: 1.0\n This function is deprecated in 1.0 and will be removed in 1.2. See the\n warning message below for further details regarding the alternative\n datasets.\n\n .. warning::\n The Boston housing prices dataset has an ethical problem: as\n investigated in [1]_, the authors of this dataset engineered a\n non-invertible variable \"B\" assuming that racial self-segregation had a\n positive impact on house prices [2]_. Furthermore the goal of the\n research that led to the creation of this dataset was to study the\n impact of air quality but it did not give adequate demonstration of the\n validity of this assumption.\n\n The scikit-learn maintainers therefore strongly discourage the use of\n this dataset unless the purpose of the code is to study and educate\n about ethical issues in data science and machine learning.\n\n In this special case, you can fetch the dataset from the original\n source::\n\n import pandas as pd # doctest: +SKIP\n import numpy as np\n\n\n data_url = \"http://lib.stat.cmu.edu/datasets/boston\"\n raw_df = pd.read_csv(data_url, sep=\"s+\", skiprows=22, header=None)\n data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])\n target = raw_df.values[1::2, 2]\n\n Alternative datasets include the California housing dataset [3]_\n (i.e. :func:`~sklearn.datasets.fetch_california_housing`) and Ames\n housing dataset [4]_. You can load the datasets as follows::\n\n from sklearn.datasets import fetch_california_housing\n housing = fetch_california_housing()\n\n for the California housing dataset and::\n\n from sklearn.datasets import fetch_openml\n housing = fetch_openml(name=\"house_prices\", as_frame=True) # noqa\n\n for the Ames housing dataset.\n\n Parameters\n ----------\n return_X_y : bool, default=False\n If True, returns ``(data, target)`` instead of a Bunch object.\n See below for more information about the `data` and `target` object.\n\n .. versionadded:: 0.18\n\n Returns\n -------\n data : :class:`~sklearn.utils.Bunch`\n Dictionary-like object, with the following attributes.\n\n data : ndarray of shape (506, 13)\n The data matrix.\n target : ndarray of shape (506,)\n The regression target.\n filename : str\n The physical location of boston csv dataset.\n\n .. versionadded:: 0.20\n\n DESCR : str\n The full description of the dataset.\n feature_names : ndarray\n The names of features\n\n (data, target) : tuple if ``return_X_y`` is True\n\n .. versionadded:: 0.18\n\n Notes\n -----\n .. versionchanged:: 0.20\n Fixed a wrong data point at [445, 0].\n\n References\n ----------\n .. [1] `Racist data destruction? M Carlisle,\n `_\n .. [2] `Harrison Jr, David, and Daniel L. Rubinfeld.\n \"Hedonic housing prices and the demand for clean air.\"\n Journal of environmental economics and management 5.1 (1978): 81-102.\n `_\n .. [3] `California housing dataset\n `_\n .. [4] `Ames housing dataset\n `_\n\n Examples\n --------\n >>> import warnings\n >>> from sklearn.datasets import load_boston\n >>> with warnings.catch_warnings():\n ... # You should probably not use this dataset.\n ... warnings.filterwarnings(\"ignore\")\n ... X, y = load_boston(return_X_y=True)\n >>> print(X.shape)\n (506, 13)\n ", "source_code": "\n@deprecated('`load_boston` is deprecated in 1.0 and will be removed in 1.2.\\n\\n The Boston housing prices dataset has an ethical problem. 
You can refer to\\n the documentation of this function for further details.\\n\\n The scikit-learn maintainers therefore strongly discourage the use of this\\n dataset unless the purpose of the code is to study and educate about\\n ethical issues in data science and machine learning.\\n\\n In this special case, you can fetch the dataset from the original\\n source::\\n\\n import pandas as pd\\n import numpy as np\\n\\n\\n data_url = \"http://lib.stat.cmu.edu/datasets/boston\"\\n raw_df = pd.read_csv(data_url, sep=\"\\\\s+\", skiprows=22, header=None)\\n data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])\\n target = raw_df.values[1::2, 2]\\n\\n Alternative datasets include the California housing dataset (i.e.\\n :func:`~sklearn.datasets.fetch_california_housing`) and the Ames housing\\n dataset. You can load the datasets as follows::\\n\\n from sklearn.datasets import fetch_california_housing\\n housing = fetch_california_housing()\\n\\n for the California housing dataset and::\\n\\n from sklearn.datasets import fetch_openml\\n housing = fetch_openml(name=\"house_prices\", as_frame=True)\\n\\n for the Ames housing dataset.\\n ')\ndef load_boston(*, return_X_y=False):\n \"\"\"Load and return the boston house-prices dataset (regression).\n\n ============== ==============\n Samples total 506\n Dimensionality 13\n Features real, positive\n Targets real 5. - 50.\n ============== ==============\n\n Read more in the :ref:`User Guide `.\n\n .. deprecated:: 1.0\n This function is deprecated in 1.0 and will be removed in 1.2. See the\n warning message below for further details regarding the alternative\n datasets.\n\n .. warning::\n The Boston housing prices dataset has an ethical problem: as\n investigated in [1]_, the authors of this dataset engineered a\n non-invertible variable \"B\" assuming that racial self-segregation had a\n positive impact on house prices [2]_. Furthermore the goal of the\n research that led to the creation of this dataset was to study the\n impact of air quality but it did not give adequate demonstration of the\n validity of this assumption.\n\n The scikit-learn maintainers therefore strongly discourage the use of\n this dataset unless the purpose of the code is to study and educate\n about ethical issues in data science and machine learning.\n\n In this special case, you can fetch the dataset from the original\n source::\n\n import pandas as pd # doctest: +SKIP\n import numpy as np\n\n\n data_url = \"http://lib.stat.cmu.edu/datasets/boston\"\n raw_df = pd.read_csv(data_url, sep=\"s+\", skiprows=22, header=None)\n data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])\n target = raw_df.values[1::2, 2]\n\n Alternative datasets include the California housing dataset [3]_\n (i.e. :func:`~sklearn.datasets.fetch_california_housing`) and Ames\n housing dataset [4]_. You can load the datasets as follows::\n\n from sklearn.datasets import fetch_california_housing\n housing = fetch_california_housing()\n\n for the California housing dataset and::\n\n from sklearn.datasets import fetch_openml\n housing = fetch_openml(name=\"house_prices\", as_frame=True) # noqa\n\n for the Ames housing dataset.\n\n Parameters\n ----------\n return_X_y : bool, default=False\n If True, returns ``(data, target)`` instead of a Bunch object.\n See below for more information about the `data` and `target` object.\n\n .. 
versionadded:: 0.18\n\n Returns\n -------\n data : :class:`~sklearn.utils.Bunch`\n Dictionary-like object, with the following attributes.\n\n data : ndarray of shape (506, 13)\n The data matrix.\n target : ndarray of shape (506,)\n The regression target.\n filename : str\n The physical location of boston csv dataset.\n\n .. versionadded:: 0.20\n\n DESCR : str\n The full description of the dataset.\n feature_names : ndarray\n The names of features\n\n (data, target) : tuple if ``return_X_y`` is True\n\n .. versionadded:: 0.18\n\n Notes\n -----\n .. versionchanged:: 0.20\n Fixed a wrong data point at [445, 0].\n\n References\n ----------\n .. [1] `Racist data destruction? M Carlisle,\n `_\n .. [2] `Harrison Jr, David, and Daniel L. Rubinfeld.\n \"Hedonic housing prices and the demand for clean air.\"\n Journal of environmental economics and management 5.1 (1978): 81-102.\n `_\n .. [3] `California housing dataset\n `_\n .. [4] `Ames housing dataset\n `_\n\n Examples\n --------\n >>> import warnings\n >>> from sklearn.datasets import load_boston\n >>> with warnings.catch_warnings():\n ... # You should probably not use this dataset.\n ... warnings.filterwarnings(\"ignore\")\n ... X, y = load_boston(return_X_y=True)\n >>> print(X.shape)\n (506, 13)\n \"\"\"\n descr_text = load_descr('boston_house_prices.rst')\n data_file_name = 'boston_house_prices.csv'\n with resources.open_text(DATA_MODULE, data_file_name) as f:\n data_file = csv.reader(f)\n temp = next(data_file)\n n_samples = int(temp[0])\n n_features = int(temp[1])\n data = np.empty((n_samples, n_features))\n target = np.empty((n_samples, ))\n temp = next(data_file)\n feature_names = np.array(temp)\n for (i, d) in enumerate(data_file):\n data[i] = np.asarray(d[:-1], dtype=np.float64)\n target[i] = np.asarray(d[-1], dtype=np.float64)\n if return_X_y:\n return data, target\n return Bunch(data=data, target=target, feature_names=feature_names[:-1], DESCR=descr_text, filename=data_file_name, data_module=DATA_MODULE)" }, { @@ -42590,7 +44122,8 @@ "docstring": { "type": "bool, default=False", "description": "If True, returns ``(data, target)`` instead of a Bunch object.\nSee below for more information about the `data` and `target` object.\n\n.. versionadded:: 0.18" - } + }, + "refined_type": {} }, { "name": "as_frame", @@ -42600,13 +44133,14 @@ "docstring": { "type": "bool, default=False", "description": "If True, the data is a pandas DataFrame including columns with\nappropriate dtypes (numeric). The target is\na pandas DataFrame or Series depending on the number of target columns.\nIf `return_X_y` is True, then (`data`, `target`) will be pandas\nDataFrames or Series as described below.\n\n.. versionadded:: 0.23" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Load and return the breast cancer wisconsin dataset (classification).\n\nThe breast cancer dataset is a classic and very easy binary classification dataset. 
================= ============== Classes 2 Samples per class 212(M),357(B) Samples total 569 Dimensionality 30 Features real, positive ================= ============== Read more in the :ref:`User Guide `.", - "docstring": "Load and return the breast cancer wisconsin dataset (classification).\n\nThe breast cancer dataset is a classic and very easy binary classification\ndataset.\n\n================= ==============\nClasses 2\nSamples per class 212(M),357(B)\nSamples total 569\nDimensionality 30\nFeatures real, positive\n================= ==============\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\nreturn_X_y : bool, default=False\n If True, returns ``(data, target)`` instead of a Bunch object.\n See below for more information about the `data` and `target` object.\n\n .. versionadded:: 0.18\n\nas_frame : bool, default=False\n If True, the data is a pandas DataFrame including columns with\n appropriate dtypes (numeric). The target is\n a pandas DataFrame or Series depending on the number of target columns.\n If `return_X_y` is True, then (`data`, `target`) will be pandas\n DataFrames or Series as described below.\n\n .. versionadded:: 0.23\n\nReturns\n-------\ndata : :class:`~sklearn.utils.Bunch`\n Dictionary-like object, with the following attributes.\n\n data : {ndarray, dataframe} of shape (569, 30)\n The data matrix. If `as_frame=True`, `data` will be a pandas\n DataFrame.\n target: {ndarray, Series} of shape (569,)\n The classification target. If `as_frame=True`, `target` will be\n a pandas Series.\n feature_names: list\n The names of the dataset columns.\n target_names: list\n The names of target classes.\n frame: DataFrame of shape (569, 31)\n Only present when `as_frame=True`. DataFrame with `data` and\n `target`.\n\n .. versionadded:: 0.23\n DESCR: str\n The full description of the dataset.\n filename: str\n The path to the location of the data.\n\n .. versionadded:: 0.20\n\n(data, target) : tuple if ``return_X_y`` is True\n\n .. versionadded:: 0.18\n\nThe copy of UCI ML Breast Cancer Wisconsin (Diagnostic) dataset is\ndownloaded from:\nhttps://goo.gl/U2Uwz2\n\nExamples\n--------\nLet's say you are interested in the samples 10, 50, and 85, and want to\nknow their class name.\n\n>>> from sklearn.datasets import load_breast_cancer\n>>> data = load_breast_cancer()\n>>> data.target[[10, 50, 85]]\narray([0, 1, 0])\n>>> list(data.target_names)\n['malignant', 'benign']", + "description": "Load and return the breast cancer wisconsin dataset (classification).\n\nThe breast cancer dataset is a classic and very easy binary classification\ndataset.\n\n================= ==============\nClasses 2\nSamples per class 212(M),357(B)\nSamples total 569\nDimensionality 30\nFeatures real, positive\n================= ==============\n\nRead more in the :ref:`User Guide `.", + "docstring": "Load and return the breast cancer wisconsin dataset (classification).\n\n The breast cancer dataset is a classic and very easy binary classification\n dataset.\n\n ================= ==============\n Classes 2\n Samples per class 212(M),357(B)\n Samples total 569\n Dimensionality 30\n Features real, positive\n ================= ==============\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n return_X_y : bool, default=False\n If True, returns ``(data, target)`` instead of a Bunch object.\n See below for more information about the `data` and `target` object.\n\n .. 
versionadded:: 0.18\n\n as_frame : bool, default=False\n If True, the data is a pandas DataFrame including columns with\n appropriate dtypes (numeric). The target is\n a pandas DataFrame or Series depending on the number of target columns.\n If `return_X_y` is True, then (`data`, `target`) will be pandas\n DataFrames or Series as described below.\n\n .. versionadded:: 0.23\n\n Returns\n -------\n data : :class:`~sklearn.utils.Bunch`\n Dictionary-like object, with the following attributes.\n\n data : {ndarray, dataframe} of shape (569, 30)\n The data matrix. If `as_frame=True`, `data` will be a pandas\n DataFrame.\n target: {ndarray, Series} of shape (569,)\n The classification target. If `as_frame=True`, `target` will be\n a pandas Series.\n feature_names: list\n The names of the dataset columns.\n target_names: list\n The names of target classes.\n frame: DataFrame of shape (569, 31)\n Only present when `as_frame=True`. DataFrame with `data` and\n `target`.\n\n .. versionadded:: 0.23\n DESCR: str\n The full description of the dataset.\n filename: str\n The path to the location of the data.\n\n .. versionadded:: 0.20\n\n (data, target) : tuple if ``return_X_y`` is True\n\n .. versionadded:: 0.18\n\n The copy of UCI ML Breast Cancer Wisconsin (Diagnostic) dataset is\n downloaded from:\n https://goo.gl/U2Uwz2\n\n Examples\n --------\n Let's say you are interested in the samples 10, 50, and 85, and want to\n know their class name.\n\n >>> from sklearn.datasets import load_breast_cancer\n >>> data = load_breast_cancer()\n >>> data.target[[10, 50, 85]]\n array([0, 1, 0])\n >>> list(data.target_names)\n ['malignant', 'benign']\n ", "source_code": "\ndef load_breast_cancer(*, return_X_y=False, as_frame=False):\n \"\"\"Load and return the breast cancer wisconsin dataset (classification).\n\n The breast cancer dataset is a classic and very easy binary classification\n dataset.\n\n ================= ==============\n Classes 2\n Samples per class 212(M),357(B)\n Samples total 569\n Dimensionality 30\n Features real, positive\n ================= ==============\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n return_X_y : bool, default=False\n If True, returns ``(data, target)`` instead of a Bunch object.\n See below for more information about the `data` and `target` object.\n\n .. versionadded:: 0.18\n\n as_frame : bool, default=False\n If True, the data is a pandas DataFrame including columns with\n appropriate dtypes (numeric). The target is\n a pandas DataFrame or Series depending on the number of target columns.\n If `return_X_y` is True, then (`data`, `target`) will be pandas\n DataFrames or Series as described below.\n\n .. versionadded:: 0.23\n\n Returns\n -------\n data : :class:`~sklearn.utils.Bunch`\n Dictionary-like object, with the following attributes.\n\n data : {ndarray, dataframe} of shape (569, 30)\n The data matrix. If `as_frame=True`, `data` will be a pandas\n DataFrame.\n target: {ndarray, Series} of shape (569,)\n The classification target. If `as_frame=True`, `target` will be\n a pandas Series.\n feature_names: list\n The names of the dataset columns.\n target_names: list\n The names of target classes.\n frame: DataFrame of shape (569, 31)\n Only present when `as_frame=True`. DataFrame with `data` and\n `target`.\n\n .. versionadded:: 0.23\n DESCR: str\n The full description of the dataset.\n filename: str\n The path to the location of the data.\n\n .. versionadded:: 0.20\n\n (data, target) : tuple if ``return_X_y`` is True\n\n .. 
versionadded:: 0.18\n\n The copy of UCI ML Breast Cancer Wisconsin (Diagnostic) dataset is\n downloaded from:\n https://goo.gl/U2Uwz2\n\n Examples\n --------\n Let's say you are interested in the samples 10, 50, and 85, and want to\n know their class name.\n\n >>> from sklearn.datasets import load_breast_cancer\n >>> data = load_breast_cancer()\n >>> data.target[[10, 50, 85]]\n array([0, 1, 0])\n >>> list(data.target_names)\n ['malignant', 'benign']\n \"\"\"\n data_file_name = 'breast_cancer.csv'\n (data, target, target_names, fdescr) = load_csv_data(data_file_name=data_file_name, descr_file_name='breast_cancer.rst')\n feature_names = np.array(['mean radius', 'mean texture', 'mean perimeter', 'mean area', 'mean smoothness', 'mean compactness', 'mean concavity', 'mean concave points', 'mean symmetry', 'mean fractal dimension', 'radius error', 'texture error', 'perimeter error', 'area error', 'smoothness error', 'compactness error', 'concavity error', 'concave points error', 'symmetry error', 'fractal dimension error', 'worst radius', 'worst texture', 'worst perimeter', 'worst area', 'worst smoothness', 'worst compactness', 'worst concavity', 'worst concave points', 'worst symmetry', 'worst fractal dimension'])\n frame = None\n target_columns = ['target']\n if as_frame:\n (frame, data, target) = _convert_data_dataframe('load_breast_cancer', data, target, feature_names, target_columns)\n if return_X_y:\n return data, target\n return Bunch(data=data, target=target, frame=frame, target_names=target_names, DESCR=fdescr, feature_names=feature_names, filename=data_file_name, data_module=DATA_MODULE)" }, { @@ -42624,7 +44158,8 @@ "docstring": { "type": "str", "description": "Name of csv file to be loaded from `data_module/data_file_name`.\nFor example `'wine_data.csv'`." - } + }, + "refined_type": {} }, { "name": "data_module", @@ -42634,7 +44169,8 @@ "docstring": { "type": "str or module, default='sklearn.datasets.data'", "description": "Module where data lives. The default is `'sklearn.datasets.data'`." - } + }, + "refined_type": {} }, { "name": "descr_file_name", @@ -42644,7 +44180,8 @@ "docstring": { "type": "str, default=None", "description": "Name of rst file to be loaded from `descr_module/descr_file_name`.\nFor example `'wine_data.rst'`. See also :func:`load_descr`.\nIf not None, also returns the corresponding description of\nthe dataset." - } + }, + "refined_type": {} }, { "name": "descr_module", @@ -42654,13 +44191,14 @@ "docstring": { "type": "str or module, default='sklearn.datasets.descr'", "description": "Module where `descr_file_name` lives. See also :func:`load_descr`.\nThe default is `'sklearn.datasets.descr'`." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Loads `data_file_name` from `data_module with `importlib.resources`.", - "docstring": "Loads `data_file_name` from `data_module with `importlib.resources`.\n\nParameters\n----------\ndata_file_name : str\n Name of csv file to be loaded from `data_module/data_file_name`.\n For example `'wine_data.csv'`.\n\ndata_module : str or module, default='sklearn.datasets.data'\n Module where data lives. The default is `'sklearn.datasets.data'`.\n\ndescr_file_name : str, default=None\n Name of rst file to be loaded from `descr_module/descr_file_name`.\n For example `'wine_data.rst'`. See also :func:`load_descr`.\n If not None, also returns the corresponding description of\n the dataset.\n\ndescr_module : str or module, default='sklearn.datasets.descr'\n Module where `descr_file_name` lives. 
See also :func:`load_descr`.\n The default is `'sklearn.datasets.descr'`.\n\nReturns\n-------\ndata : ndarray of shape (n_samples, n_features)\n A 2D array with each row representing one sample and each column\n representing the features of a given sample.\n\ntarget : ndarry of shape (n_samples,)\n A 1D array holding target variables for all the samples in `data`.\n For example target[0] is the target variable for data[0].\n\ntarget_names : ndarry of shape (n_samples,)\n A 1D array containing the names of the classifications. For example\n target_names[0] is the name of the target[0] class.\n\ndescr : str, optional\n Description of the dataset (the content of `descr_file_name`).\n Only returned if `descr_file_name` is not None.", + "docstring": "Loads `data_file_name` from `data_module with `importlib.resources`.\n\n Parameters\n ----------\n data_file_name : str\n Name of csv file to be loaded from `data_module/data_file_name`.\n For example `'wine_data.csv'`.\n\n data_module : str or module, default='sklearn.datasets.data'\n Module where data lives. The default is `'sklearn.datasets.data'`.\n\n descr_file_name : str, default=None\n Name of rst file to be loaded from `descr_module/descr_file_name`.\n For example `'wine_data.rst'`. See also :func:`load_descr`.\n If not None, also returns the corresponding description of\n the dataset.\n\n descr_module : str or module, default='sklearn.datasets.descr'\n Module where `descr_file_name` lives. See also :func:`load_descr`.\n The default is `'sklearn.datasets.descr'`.\n\n Returns\n -------\n data : ndarray of shape (n_samples, n_features)\n A 2D array with each row representing one sample and each column\n representing the features of a given sample.\n\n target : ndarry of shape (n_samples,)\n A 1D array holding target variables for all the samples in `data`.\n For example target[0] is the target variable for data[0].\n\n target_names : ndarry of shape (n_samples,)\n A 1D array containing the names of the classifications. For example\n target_names[0] is the name of the target[0] class.\n\n descr : str, optional\n Description of the dataset (the content of `descr_file_name`).\n Only returned if `descr_file_name` is not None.\n ", "source_code": "\ndef load_csv_data(data_file_name, *, data_module=DATA_MODULE, descr_file_name=None, descr_module=DESCR_MODULE):\n \"\"\"Loads `data_file_name` from `data_module with `importlib.resources`.\n\n Parameters\n ----------\n data_file_name : str\n Name of csv file to be loaded from `data_module/data_file_name`.\n For example `'wine_data.csv'`.\n\n data_module : str or module, default='sklearn.datasets.data'\n Module where data lives. The default is `'sklearn.datasets.data'`.\n\n descr_file_name : str, default=None\n Name of rst file to be loaded from `descr_module/descr_file_name`.\n For example `'wine_data.rst'`. See also :func:`load_descr`.\n If not None, also returns the corresponding description of\n the dataset.\n\n descr_module : str or module, default='sklearn.datasets.descr'\n Module where `descr_file_name` lives. 
See also :func:`load_descr`.\n The default is `'sklearn.datasets.descr'`.\n\n Returns\n -------\n data : ndarray of shape (n_samples, n_features)\n A 2D array with each row representing one sample and each column\n representing the features of a given sample.\n\n target : ndarry of shape (n_samples,)\n A 1D array holding target variables for all the samples in `data`.\n For example target[0] is the target variable for data[0].\n\n target_names : ndarry of shape (n_samples,)\n A 1D array containing the names of the classifications. For example\n target_names[0] is the name of the target[0] class.\n\n descr : str, optional\n Description of the dataset (the content of `descr_file_name`).\n Only returned if `descr_file_name` is not None.\n \"\"\"\n with resources.open_text(data_module, data_file_name) as csv_file:\n data_file = csv.reader(csv_file)\n temp = next(data_file)\n n_samples = int(temp[0])\n n_features = int(temp[1])\n target_names = np.array(temp[2:])\n data = np.empty((n_samples, n_features))\n target = np.empty((n_samples, ), dtype=int)\n for (i, ir) in enumerate(data_file):\n data[i] = np.asarray(ir[:-1], dtype=np.float64)\n target[i] = np.asarray(ir[-1], dtype=int)\n if descr_file_name is None:\n return data, target, target_names\n else:\n assert descr_module is not None\n descr = load_descr(descr_module=descr_module, descr_file_name=descr_file_name)\n return data, target, target_names, descr" }, { @@ -42678,7 +44216,8 @@ "docstring": { "type": "str, default=None", "description": "Name of rst file to be loaded from `descr_module/descr_file_name`.\nFor example `'wine_data.rst'`. See also :func:`load_descr`.\nIf not None, also returns the corresponding description of\nthe dataset." - } + }, + "refined_type": {} }, { "name": "descr_module", @@ -42688,13 +44227,14 @@ "docstring": { "type": "str or module, default='sklearn.datasets.descr'", "description": "Module where `descr_file_name` lives. See also :func:`load_descr`.\nThe default is `'sklearn.datasets.descr'`." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Load `descr_file_name` from `descr_module` with `importlib.resources`.", - "docstring": "Load `descr_file_name` from `descr_module` with `importlib.resources`.\n\nParameters\n----------\ndescr_file_name : str, default=None\n Name of rst file to be loaded from `descr_module/descr_file_name`.\n For example `'wine_data.rst'`. See also :func:`load_descr`.\n If not None, also returns the corresponding description of\n the dataset.\n\ndescr_module : str or module, default='sklearn.datasets.descr'\n Module where `descr_file_name` lives. See also :func:`load_descr`.\n The default is `'sklearn.datasets.descr'`.\n\nReturns\n-------\nfdescr : str\n Content of `descr_file_name`.", + "docstring": "Load `descr_file_name` from `descr_module` with `importlib.resources`.\n\n Parameters\n ----------\n descr_file_name : str, default=None\n Name of rst file to be loaded from `descr_module/descr_file_name`.\n For example `'wine_data.rst'`. See also :func:`load_descr`.\n If not None, also returns the corresponding description of\n the dataset.\n\n descr_module : str or module, default='sklearn.datasets.descr'\n Module where `descr_file_name` lives. 
See also :func:`load_descr`.\n The default is `'sklearn.datasets.descr'`.\n\n Returns\n -------\n fdescr : str\n Content of `descr_file_name`.\n ", "source_code": "\ndef load_descr(descr_file_name, *, descr_module=DESCR_MODULE):\n \"\"\"Load `descr_file_name` from `descr_module` with `importlib.resources`.\n\n Parameters\n ----------\n descr_file_name : str, default=None\n Name of rst file to be loaded from `descr_module/descr_file_name`.\n For example `'wine_data.rst'`. See also :func:`load_descr`.\n If not None, also returns the corresponding description of\n the dataset.\n\n descr_module : str or module, default='sklearn.datasets.descr'\n Module where `descr_file_name` lives. See also :func:`load_descr`.\n The default is `'sklearn.datasets.descr'`.\n\n Returns\n -------\n fdescr : str\n Content of `descr_file_name`.\n \"\"\"\n fdescr = resources.read_text(descr_module, descr_file_name)\n return fdescr" }, { @@ -42710,9 +44250,10 @@ "is_public": true, "assigned_by": "NAME_ONLY", "docstring": { - "type": "bool, default=False.", + "type": "bool, default=False", "description": "If True, returns ``(data, target)`` instead of a Bunch object.\nSee below for more information about the `data` and `target` object.\n\n.. versionadded:: 0.18" - } + }, + "refined_type": {} }, { "name": "as_frame", @@ -42722,14 +44263,15 @@ "docstring": { "type": "bool, default=False", "description": "If True, the data is a pandas DataFrame including columns with\nappropriate dtypes (numeric). The target is\na pandas DataFrame or Series depending on the number of target columns.\nIf `return_X_y` is True, then (`data`, `target`) will be pandas\nDataFrames or Series as described below.\n\n.. versionadded:: 0.23" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Load and return the diabetes dataset (regression).\n\n============== ================== Samples total 442 Dimensionality 10 Features real, -.2 < x < .2 Targets integer 25 - 346 ============== ================== .. note:: The meaning of each feature (i.e. `feature_names`) might be unclear (especially for `ltg`) as the documentation of the original dataset is not explicit. We provide information that seems correct in regard with the scientific literature in this field of research. Read more in the :ref:`User Guide `.", - "docstring": "Load and return the diabetes dataset (regression).\n\n============== ==================\nSamples total 442\nDimensionality 10\nFeatures real, -.2 < x < .2\nTargets integer 25 - 346\n============== ==================\n\n.. note::\n The meaning of each feature (i.e. `feature_names`) might be unclear\n (especially for `ltg`) as the documentation of the original dataset is\n not explicit. We provide information that seems correct in regard with\n the scientific literature in this field of research.\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\nreturn_X_y : bool, default=False.\n If True, returns ``(data, target)`` instead of a Bunch object.\n See below for more information about the `data` and `target` object.\n\n .. versionadded:: 0.18\n\nas_frame : bool, default=False\n If True, the data is a pandas DataFrame including columns with\n appropriate dtypes (numeric). The target is\n a pandas DataFrame or Series depending on the number of target columns.\n If `return_X_y` is True, then (`data`, `target`) will be pandas\n DataFrames or Series as described below.\n\n .. 
versionadded:: 0.23\n\nReturns\n-------\ndata : :class:`~sklearn.utils.Bunch`\n Dictionary-like object, with the following attributes.\n\n data : {ndarray, dataframe} of shape (442, 10)\n The data matrix. If `as_frame=True`, `data` will be a pandas\n DataFrame.\n target: {ndarray, Series} of shape (442,)\n The regression target. If `as_frame=True`, `target` will be\n a pandas Series.\n feature_names: list\n The names of the dataset columns.\n frame: DataFrame of shape (442, 11)\n Only present when `as_frame=True`. DataFrame with `data` and\n `target`.\n\n .. versionadded:: 0.23\n DESCR: str\n The full description of the dataset.\n data_filename: str\n The path to the location of the data.\n target_filename: str\n The path to the location of the target.\n\n(data, target) : tuple if ``return_X_y`` is True\n\n .. versionadded:: 0.18", - "source_code": "\ndef load_diabetes(*, return_X_y=False, as_frame=False):\n \"\"\"Load and return the diabetes dataset (regression).\n\n ============== ==================\n Samples total 442\n Dimensionality 10\n Features real, -.2 < x < .2\n Targets integer 25 - 346\n ============== ==================\n\n .. note::\n The meaning of each feature (i.e. `feature_names`) might be unclear\n (especially for `ltg`) as the documentation of the original dataset is\n not explicit. We provide information that seems correct in regard with\n the scientific literature in this field of research.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n return_X_y : bool, default=False.\n If True, returns ``(data, target)`` instead of a Bunch object.\n See below for more information about the `data` and `target` object.\n\n .. versionadded:: 0.18\n\n as_frame : bool, default=False\n If True, the data is a pandas DataFrame including columns with\n appropriate dtypes (numeric). The target is\n a pandas DataFrame or Series depending on the number of target columns.\n If `return_X_y` is True, then (`data`, `target`) will be pandas\n DataFrames or Series as described below.\n\n .. versionadded:: 0.23\n\n Returns\n -------\n data : :class:`~sklearn.utils.Bunch`\n Dictionary-like object, with the following attributes.\n\n data : {ndarray, dataframe} of shape (442, 10)\n The data matrix. If `as_frame=True`, `data` will be a pandas\n DataFrame.\n target: {ndarray, Series} of shape (442,)\n The regression target. If `as_frame=True`, `target` will be\n a pandas Series.\n feature_names: list\n The names of the dataset columns.\n frame: DataFrame of shape (442, 11)\n Only present when `as_frame=True`. DataFrame with `data` and\n `target`.\n\n .. versionadded:: 0.23\n DESCR: str\n The full description of the dataset.\n data_filename: str\n The path to the location of the data.\n target_filename: str\n The path to the location of the target.\n\n (data, target) : tuple if ``return_X_y`` is True\n\n .. 
versionadded:: 0.18\n \"\"\"\n data_filename = 'diabetes_data.csv.gz'\n target_filename = 'diabetes_target.csv.gz'\n data = load_gzip_compressed_csv_data(data_filename)\n target = load_gzip_compressed_csv_data(target_filename)\n fdescr = load_descr('diabetes.rst')\n feature_names = ['age', 'sex', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6']\n frame = None\n target_columns = ['target']\n if as_frame:\n (frame, data, target) = _convert_data_dataframe('load_diabetes', data, target, feature_names, target_columns)\n if return_X_y:\n return data, target\n return Bunch(data=data, target=target, frame=frame, DESCR=fdescr, feature_names=feature_names, data_filename=data_filename, target_filename=target_filename, data_module=DATA_MODULE)" + "description": "Load and return the diabetes dataset (regression).\n\n============== ==================\nSamples total 442\nDimensionality 10\nFeatures real, -.2 < x < .2\nTargets integer 25 - 346\n============== ==================\n\n.. note::\n The meaning of each feature (i.e. `feature_names`) might be unclear\n (especially for `ltg`) as the documentation of the original dataset is\n not explicit. We provide information that seems correct in regard with\n the scientific literature in this field of research.\n\nRead more in the :ref:`User Guide `.", + "docstring": "Load and return the diabetes dataset (regression).\n\n ============== ==================\n Samples total 442\n Dimensionality 10\n Features real, -.2 < x < .2\n Targets integer 25 - 346\n ============== ==================\n\n .. note::\n The meaning of each feature (i.e. `feature_names`) might be unclear\n (especially for `ltg`) as the documentation of the original dataset is\n not explicit. We provide information that seems correct in regard with\n the scientific literature in this field of research.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n return_X_y : bool, default=False\n If True, returns ``(data, target)`` instead of a Bunch object.\n See below for more information about the `data` and `target` object.\n\n .. versionadded:: 0.18\n\n as_frame : bool, default=False\n If True, the data is a pandas DataFrame including columns with\n appropriate dtypes (numeric). The target is\n a pandas DataFrame or Series depending on the number of target columns.\n If `return_X_y` is True, then (`data`, `target`) will be pandas\n DataFrames or Series as described below.\n\n .. versionadded:: 0.23\n\n Returns\n -------\n data : :class:`~sklearn.utils.Bunch`\n Dictionary-like object, with the following attributes.\n\n data : {ndarray, dataframe} of shape (442, 10)\n The data matrix. If `as_frame=True`, `data` will be a pandas\n DataFrame.\n target: {ndarray, Series} of shape (442,)\n The regression target. If `as_frame=True`, `target` will be\n a pandas Series.\n feature_names: list\n The names of the dataset columns.\n frame: DataFrame of shape (442, 11)\n Only present when `as_frame=True`. DataFrame with `data` and\n `target`.\n\n .. versionadded:: 0.23\n DESCR: str\n The full description of the dataset.\n data_filename: str\n The path to the location of the data.\n target_filename: str\n The path to the location of the target.\n\n (data, target) : tuple if ``return_X_y`` is True\n Returns a tuple of two ndarray of shape (n_samples, n_features)\n A 2D array with each row representing one sample and each column\n representing the features and/or target of a given sample.\n .. 
versionadded:: 0.18\n ", + "source_code": "\ndef load_diabetes(*, return_X_y=False, as_frame=False):\n \"\"\"Load and return the diabetes dataset (regression).\n\n ============== ==================\n Samples total 442\n Dimensionality 10\n Features real, -.2 < x < .2\n Targets integer 25 - 346\n ============== ==================\n\n .. note::\n The meaning of each feature (i.e. `feature_names`) might be unclear\n (especially for `ltg`) as the documentation of the original dataset is\n not explicit. We provide information that seems correct in regard with\n the scientific literature in this field of research.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n return_X_y : bool, default=False\n If True, returns ``(data, target)`` instead of a Bunch object.\n See below for more information about the `data` and `target` object.\n\n .. versionadded:: 0.18\n\n as_frame : bool, default=False\n If True, the data is a pandas DataFrame including columns with\n appropriate dtypes (numeric). The target is\n a pandas DataFrame or Series depending on the number of target columns.\n If `return_X_y` is True, then (`data`, `target`) will be pandas\n DataFrames or Series as described below.\n\n .. versionadded:: 0.23\n\n Returns\n -------\n data : :class:`~sklearn.utils.Bunch`\n Dictionary-like object, with the following attributes.\n\n data : {ndarray, dataframe} of shape (442, 10)\n The data matrix. If `as_frame=True`, `data` will be a pandas\n DataFrame.\n target: {ndarray, Series} of shape (442,)\n The regression target. If `as_frame=True`, `target` will be\n a pandas Series.\n feature_names: list\n The names of the dataset columns.\n frame: DataFrame of shape (442, 11)\n Only present when `as_frame=True`. DataFrame with `data` and\n `target`.\n\n .. versionadded:: 0.23\n DESCR: str\n The full description of the dataset.\n data_filename: str\n The path to the location of the data.\n target_filename: str\n The path to the location of the target.\n\n (data, target) : tuple if ``return_X_y`` is True\n Returns a tuple of two ndarray of shape (n_samples, n_features)\n A 2D array with each row representing one sample and each column\n representing the features and/or target of a given sample.\n .. versionadded:: 0.18\n \"\"\"\n data_filename = 'diabetes_data.csv.gz'\n target_filename = 'diabetes_target.csv.gz'\n data = load_gzip_compressed_csv_data(data_filename)\n target = load_gzip_compressed_csv_data(target_filename)\n fdescr = load_descr('diabetes.rst')\n feature_names = ['age', 'sex', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6']\n frame = None\n target_columns = ['target']\n if as_frame:\n (frame, data, target) = _convert_data_dataframe('load_diabetes', data, target, feature_names, target_columns)\n if return_X_y:\n return data, target\n return Bunch(data=data, target=target, frame=frame, DESCR=fdescr, feature_names=feature_names, data_filename=data_filename, target_filename=target_filename, data_module=DATA_MODULE)" }, { "name": "load_digits", @@ -42746,7 +44288,8 @@ "docstring": { "type": "int, default=10", "description": "The number of classes to return. Between 0 and 10." - } + }, + "refined_type": {} }, { "name": "return_X_y", @@ -42756,7 +44299,8 @@ "docstring": { "type": "bool, default=False", "description": "If True, returns ``(data, target)`` instead of a Bunch object.\nSee below for more information about the `data` and `target` object.\n\n.. 
versionadded:: 0.18" - } + }, + "refined_type": {} }, { "name": "as_frame", @@ -42766,13 +44310,14 @@ "docstring": { "type": "bool, default=False", "description": "If True, the data is a pandas DataFrame including columns with\nappropriate dtypes (numeric). The target is\na pandas DataFrame or Series depending on the number of target columns.\nIf `return_X_y` is True, then (`data`, `target`) will be pandas\nDataFrames or Series as described below.\n\n.. versionadded:: 0.23" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Load and return the digits dataset (classification).\n\nEach datapoint is a 8x8 image of a digit. ================= ============== Classes 10 Samples per class ~180 Samples total 1797 Dimensionality 64 Features integers 0-16 ================= ============== Read more in the :ref:`User Guide `.", - "docstring": "Load and return the digits dataset (classification).\n\nEach datapoint is a 8x8 image of a digit.\n\n================= ==============\nClasses 10\nSamples per class ~180\nSamples total 1797\nDimensionality 64\nFeatures integers 0-16\n================= ==============\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\nn_class : int, default=10\n The number of classes to return. Between 0 and 10.\n\nreturn_X_y : bool, default=False\n If True, returns ``(data, target)`` instead of a Bunch object.\n See below for more information about the `data` and `target` object.\n\n .. versionadded:: 0.18\n\nas_frame : bool, default=False\n If True, the data is a pandas DataFrame including columns with\n appropriate dtypes (numeric). The target is\n a pandas DataFrame or Series depending on the number of target columns.\n If `return_X_y` is True, then (`data`, `target`) will be pandas\n DataFrames or Series as described below.\n\n .. versionadded:: 0.23\n\nReturns\n-------\ndata : :class:`~sklearn.utils.Bunch`\n Dictionary-like object, with the following attributes.\n\n data : {ndarray, dataframe} of shape (1797, 64)\n The flattened data matrix. If `as_frame=True`, `data` will be\n a pandas DataFrame.\n target: {ndarray, Series} of shape (1797,)\n The classification target. If `as_frame=True`, `target` will be\n a pandas Series.\n feature_names: list\n The names of the dataset columns.\n target_names: list\n The names of target classes.\n\n .. versionadded:: 0.20\n\n frame: DataFrame of shape (1797, 65)\n Only present when `as_frame=True`. DataFrame with `data` and\n `target`.\n\n .. versionadded:: 0.23\n images: {ndarray} of shape (1797, 8, 8)\n The raw image data.\n DESCR: str\n The full description of the dataset.\n\n(data, target) : tuple if ``return_X_y`` is True\n\n .. 
versionadded:: 0.18\n\nThis is a copy of the test set of the UCI ML hand-written digits datasets\nhttps://archive.ics.uci.edu/ml/datasets/Optical+Recognition+of+Handwritten+Digits\n\nExamples\n--------\nTo load the data and visualize the images::\n\n >>> from sklearn.datasets import load_digits\n >>> digits = load_digits()\n >>> print(digits.data.shape)\n (1797, 64)\n >>> import matplotlib.pyplot as plt\n >>> plt.gray()\n >>> plt.matshow(digits.images[0])\n <...>\n >>> plt.show()", + "description": "Load and return the digits dataset (classification).\n\nEach datapoint is a 8x8 image of a digit.\n\n================= ==============\nClasses 10\nSamples per class ~180\nSamples total 1797\nDimensionality 64\nFeatures integers 0-16\n================= ==============\n\nRead more in the :ref:`User Guide `.", + "docstring": "Load and return the digits dataset (classification).\n\n Each datapoint is a 8x8 image of a digit.\n\n ================= ==============\n Classes 10\n Samples per class ~180\n Samples total 1797\n Dimensionality 64\n Features integers 0-16\n ================= ==============\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n n_class : int, default=10\n The number of classes to return. Between 0 and 10.\n\n return_X_y : bool, default=False\n If True, returns ``(data, target)`` instead of a Bunch object.\n See below for more information about the `data` and `target` object.\n\n .. versionadded:: 0.18\n\n as_frame : bool, default=False\n If True, the data is a pandas DataFrame including columns with\n appropriate dtypes (numeric). The target is\n a pandas DataFrame or Series depending on the number of target columns.\n If `return_X_y` is True, then (`data`, `target`) will be pandas\n DataFrames or Series as described below.\n\n .. versionadded:: 0.23\n\n Returns\n -------\n data : :class:`~sklearn.utils.Bunch`\n Dictionary-like object, with the following attributes.\n\n data : {ndarray, dataframe} of shape (1797, 64)\n The flattened data matrix. If `as_frame=True`, `data` will be\n a pandas DataFrame.\n target: {ndarray, Series} of shape (1797,)\n The classification target. If `as_frame=True`, `target` will be\n a pandas Series.\n feature_names: list\n The names of the dataset columns.\n target_names: list\n The names of target classes.\n\n .. versionadded:: 0.20\n\n frame: DataFrame of shape (1797, 65)\n Only present when `as_frame=True`. DataFrame with `data` and\n `target`.\n\n .. versionadded:: 0.23\n images: {ndarray} of shape (1797, 8, 8)\n The raw image data.\n DESCR: str\n The full description of the dataset.\n\n (data, target) : tuple if ``return_X_y`` is True\n\n .. 
versionadded:: 0.18\n\n This is a copy of the test set of the UCI ML hand-written digits datasets\n https://archive.ics.uci.edu/ml/datasets/Optical+Recognition+of+Handwritten+Digits\n\n Examples\n --------\n To load the data and visualize the images::\n\n >>> from sklearn.datasets import load_digits\n >>> digits = load_digits()\n >>> print(digits.data.shape)\n (1797, 64)\n >>> import matplotlib.pyplot as plt\n >>> plt.gray()\n >>> plt.matshow(digits.images[0])\n <...>\n >>> plt.show()\n ", "source_code": "\ndef load_digits(*, n_class=10, return_X_y=False, as_frame=False):\n \"\"\"Load and return the digits dataset (classification).\n\n Each datapoint is a 8x8 image of a digit.\n\n ================= ==============\n Classes 10\n Samples per class ~180\n Samples total 1797\n Dimensionality 64\n Features integers 0-16\n ================= ==============\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n n_class : int, default=10\n The number of classes to return. Between 0 and 10.\n\n return_X_y : bool, default=False\n If True, returns ``(data, target)`` instead of a Bunch object.\n See below for more information about the `data` and `target` object.\n\n .. versionadded:: 0.18\n\n as_frame : bool, default=False\n If True, the data is a pandas DataFrame including columns with\n appropriate dtypes (numeric). The target is\n a pandas DataFrame or Series depending on the number of target columns.\n If `return_X_y` is True, then (`data`, `target`) will be pandas\n DataFrames or Series as described below.\n\n .. versionadded:: 0.23\n\n Returns\n -------\n data : :class:`~sklearn.utils.Bunch`\n Dictionary-like object, with the following attributes.\n\n data : {ndarray, dataframe} of shape (1797, 64)\n The flattened data matrix. If `as_frame=True`, `data` will be\n a pandas DataFrame.\n target: {ndarray, Series} of shape (1797,)\n The classification target. If `as_frame=True`, `target` will be\n a pandas Series.\n feature_names: list\n The names of the dataset columns.\n target_names: list\n The names of target classes.\n\n .. versionadded:: 0.20\n\n frame: DataFrame of shape (1797, 65)\n Only present when `as_frame=True`. DataFrame with `data` and\n `target`.\n\n .. versionadded:: 0.23\n images: {ndarray} of shape (1797, 8, 8)\n The raw image data.\n DESCR: str\n The full description of the dataset.\n\n (data, target) : tuple if ``return_X_y`` is True\n\n .. 
versionadded:: 0.18\n\n This is a copy of the test set of the UCI ML hand-written digits datasets\n https://archive.ics.uci.edu/ml/datasets/Optical+Recognition+of+Handwritten+Digits\n\n Examples\n --------\n To load the data and visualize the images::\n\n >>> from sklearn.datasets import load_digits\n >>> digits = load_digits()\n >>> print(digits.data.shape)\n (1797, 64)\n >>> import matplotlib.pyplot as plt\n >>> plt.gray()\n >>> plt.matshow(digits.images[0])\n <...>\n >>> plt.show()\n \"\"\"\n (data, fdescr) = load_gzip_compressed_csv_data(data_file_name='digits.csv.gz', descr_file_name='digits.rst', delimiter=',')\n target = data[:, -1].astype(int, copy=False)\n flat_data = data[:, :-1]\n images = flat_data.view()\n images.shape = (-1, 8, 8)\n if n_class < 10:\n idx = target < n_class\n (flat_data, target) = (flat_data[idx], target[idx])\n images = images[idx]\n feature_names = ['pixel_{}_{}'.format(row_idx, col_idx) for row_idx in range(8) for col_idx in range(8)]\n frame = None\n target_columns = ['target']\n if as_frame:\n (frame, flat_data, target) = _convert_data_dataframe('load_digits', flat_data, target, feature_names, target_columns)\n if return_X_y:\n return flat_data, target\n return Bunch(data=flat_data, target=target, frame=frame, feature_names=feature_names, target_names=np.arange(10), images=images, DESCR=fdescr)" }, { @@ -42789,8 +44334,9 @@ "assigned_by": "POSITION_OR_NAME", "docstring": { "type": "str", - "description": "Path to the main folder holding one subfolder per category" - } + "description": "Path to the main folder holding one subfolder per category." + }, + "refined_type": {} }, { "name": "description", @@ -42800,7 +44346,8 @@ "docstring": { "type": "str, default=None", "description": "A paragraph describing the characteristic of the dataset: its source,\nreference, etc." - } + }, + "refined_type": {} }, { "name": "categories", @@ -42810,7 +44357,8 @@ "docstring": { "type": "list of str, default=None", "description": "If None (default), load all the categories. If not None, list of\ncategory names to load (other categories ignored)." - } + }, + "refined_type": {} }, { "name": "load_content", @@ -42820,7 +44368,8 @@ "docstring": { "type": "bool, default=True", "description": "Whether to load or not the content of the different files. If true a\n'data' attribute containing the text information is present in the data\nstructure returned. If not, a filenames attribute gives the path to the\nfiles." - } + }, + "refined_type": {} }, { "name": "shuffle", @@ -42830,7 +44379,8 @@ "docstring": { "type": "bool, default=True", "description": "Whether or not to shuffle the data: might be important for models that\nmake the assumption that the samples are independent and identically\ndistributed (i.i.d.), such as stochastic gradient descent." - } + }, + "refined_type": {} }, { "name": "encoding", @@ -42840,7 +44390,8 @@ "docstring": { "type": "str, default=None", "description": "If None, do not try to decode the content of the files (e.g. for images\nor other non-text content). If not None, encoding to use to decode text\nfiles to Unicode if load_content is True." - } + }, + "refined_type": {} }, { "name": "decode_error", @@ -42850,6 +44401,10 @@ "docstring": { "type": "{'strict', 'ignore', 'replace'}, default='strict'", "description": "Instruction on what to do if a byte sequence is given to analyze that\ncontains characters not of the given `encoding`. Passed as keyword\nargument 'errors' to bytes.decode." 
+ }, + "refined_type": { + "kind": "EnumType", + "values": ["strict", "replace", "ignore"] } }, { @@ -42860,14 +44415,15 @@ "docstring": { "type": "int, RandomState instance or None, default=0", "description": "Determines random number generation for dataset shuffling. Pass an int\nfor reproducible output across multiple function calls.\nSee :term:`Glossary `." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Load text files with categories as subfolder names.\n\nIndividual samples are assumed to be files stored a two levels folder structure such as the following: container_folder/ category_1_folder/ file_1.txt file_2.txt ... file_42.txt category_2_folder/ file_43.txt file_44.txt ... The folder names are used as supervised signal label names. The individual file names are not important. This function does not try to extract features into a numpy array or scipy sparse matrix. In addition, if load_content is false it does not try to load the files in memory. To use text files in a scikit-learn classification or clustering algorithm, you will need to use the :mod`~sklearn.feature_extraction.text` module to build a feature extraction transformer that suits your problem. If you set load_content=True, you should also specify the encoding of the text using the 'encoding' parameter. For many modern text files, 'utf-8' will be the correct encoding. If you leave encoding equal to None, then the content will be made of bytes instead of Unicode, and you will not be able to use most functions in :mod:`~sklearn.feature_extraction.text`. Similar feature extractors should be built for other kind of unstructured data input such as images, audio, video, ... Read more in the :ref:`User Guide `.", - "docstring": "Load text files with categories as subfolder names.\n\nIndividual samples are assumed to be files stored a two levels folder\nstructure such as the following:\n\n container_folder/\n category_1_folder/\n file_1.txt\n file_2.txt\n ...\n file_42.txt\n category_2_folder/\n file_43.txt\n file_44.txt\n ...\n\nThe folder names are used as supervised signal label names. The individual\nfile names are not important.\n\nThis function does not try to extract features into a numpy array or scipy\nsparse matrix. In addition, if load_content is false it does not try to\nload the files in memory.\n\nTo use text files in a scikit-learn classification or clustering algorithm,\nyou will need to use the :mod`~sklearn.feature_extraction.text` module to\nbuild a feature extraction transformer that suits your problem.\n\nIf you set load_content=True, you should also specify the encoding of the\ntext using the 'encoding' parameter. For many modern text files, 'utf-8'\nwill be the correct encoding. If you leave encoding equal to None, then the\ncontent will be made of bytes instead of Unicode, and you will not be able\nto use most functions in :mod:`~sklearn.feature_extraction.text`.\n\nSimilar feature extractors should be built for other kind of unstructured\ndata input such as images, audio, video, ...\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\ncontainer_path : str\n Path to the main folder holding one subfolder per category\n\ndescription : str, default=None\n A paragraph describing the characteristic of the dataset: its source,\n reference, etc.\n\ncategories : list of str, default=None\n If None (default), load all the categories. 
If not None, list of\n category names to load (other categories ignored).\n\nload_content : bool, default=True\n Whether to load or not the content of the different files. If true a\n 'data' attribute containing the text information is present in the data\n structure returned. If not, a filenames attribute gives the path to the\n files.\n\nshuffle : bool, default=True\n Whether or not to shuffle the data: might be important for models that\n make the assumption that the samples are independent and identically\n distributed (i.i.d.), such as stochastic gradient descent.\n\nencoding : str, default=None\n If None, do not try to decode the content of the files (e.g. for images\n or other non-text content). If not None, encoding to use to decode text\n files to Unicode if load_content is True.\n\ndecode_error : {'strict', 'ignore', 'replace'}, default='strict'\n Instruction on what to do if a byte sequence is given to analyze that\n contains characters not of the given `encoding`. Passed as keyword\n argument 'errors' to bytes.decode.\n\nrandom_state : int, RandomState instance or None, default=0\n Determines random number generation for dataset shuffling. Pass an int\n for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\nReturns\n-------\ndata : :class:`~sklearn.utils.Bunch`\n Dictionary-like object, with the following attributes.\n\n data : list of str\n Only present when `load_content=True`.\n The raw text data to learn.\n target : ndarray\n The target labels (integer index).\n target_names : list\n The names of target classes.\n DESCR : str\n The full description of the dataset.\n filenames: ndarray\n The filenames holding the dataset.", - "source_code": "\ndef load_files(container_path, *, description=None, categories=None, load_content=True, shuffle=True, encoding=None, decode_error='strict', random_state=0):\n \"\"\"Load text files with categories as subfolder names.\n\n Individual samples are assumed to be files stored a two levels folder\n structure such as the following:\n\n container_folder/\n category_1_folder/\n file_1.txt\n file_2.txt\n ...\n file_42.txt\n category_2_folder/\n file_43.txt\n file_44.txt\n ...\n\n The folder names are used as supervised signal label names. The individual\n file names are not important.\n\n This function does not try to extract features into a numpy array or scipy\n sparse matrix. In addition, if load_content is false it does not try to\n load the files in memory.\n\n To use text files in a scikit-learn classification or clustering algorithm,\n you will need to use the :mod`~sklearn.feature_extraction.text` module to\n build a feature extraction transformer that suits your problem.\n\n If you set load_content=True, you should also specify the encoding of the\n text using the 'encoding' parameter. For many modern text files, 'utf-8'\n will be the correct encoding. 
If you leave encoding equal to None, then the\n content will be made of bytes instead of Unicode, and you will not be able\n to use most functions in :mod:`~sklearn.feature_extraction.text`.\n\n Similar feature extractors should be built for other kind of unstructured\n data input such as images, audio, video, ...\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n container_path : str\n Path to the main folder holding one subfolder per category\n\n description : str, default=None\n A paragraph describing the characteristic of the dataset: its source,\n reference, etc.\n\n categories : list of str, default=None\n If None (default), load all the categories. If not None, list of\n category names to load (other categories ignored).\n\n load_content : bool, default=True\n Whether to load or not the content of the different files. If true a\n 'data' attribute containing the text information is present in the data\n structure returned. If not, a filenames attribute gives the path to the\n files.\n\n shuffle : bool, default=True\n Whether or not to shuffle the data: might be important for models that\n make the assumption that the samples are independent and identically\n distributed (i.i.d.), such as stochastic gradient descent.\n\n encoding : str, default=None\n If None, do not try to decode the content of the files (e.g. for images\n or other non-text content). If not None, encoding to use to decode text\n files to Unicode if load_content is True.\n\n decode_error : {'strict', 'ignore', 'replace'}, default='strict'\n Instruction on what to do if a byte sequence is given to analyze that\n contains characters not of the given `encoding`. Passed as keyword\n argument 'errors' to bytes.decode.\n\n random_state : int, RandomState instance or None, default=0\n Determines random number generation for dataset shuffling. 
Pass an int\n for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n Returns\n -------\n data : :class:`~sklearn.utils.Bunch`\n Dictionary-like object, with the following attributes.\n\n data : list of str\n Only present when `load_content=True`.\n The raw text data to learn.\n target : ndarray\n The target labels (integer index).\n target_names : list\n The names of target classes.\n DESCR : str\n The full description of the dataset.\n filenames: ndarray\n The filenames holding the dataset.\n \"\"\"\n target = []\n target_names = []\n filenames = []\n folders = [f for f in sorted(listdir(container_path)) if isdir(join(container_path, f))]\n if categories is not None:\n folders = [f for f in folders if f in categories]\n for (label, folder) in enumerate(folders):\n target_names.append(folder)\n folder_path = join(container_path, folder)\n documents = [join(folder_path, d) for d in sorted(listdir(folder_path))]\n target.extend(len(documents) * [label])\n filenames.extend(documents)\n filenames = np.array(filenames)\n target = np.array(target)\n if shuffle:\n random_state = check_random_state(random_state)\n indices = np.arange(filenames.shape[0])\n random_state.shuffle(indices)\n filenames = filenames[indices]\n target = target[indices]\n if load_content:\n data = []\n for filename in filenames:\n with open(filename, 'rb') as f:\n data.append(f.read())\n if encoding is not None:\n data = [d.decode(encoding, decode_error) for d in data]\n return Bunch(data=data, filenames=filenames, target_names=target_names, target=target, DESCR=description)\n return Bunch(filenames=filenames, target_names=target_names, target=target, DESCR=description)" + "description": "Load text files with categories as subfolder names.\n\nIndividual samples are assumed to be files stored a two levels folder\nstructure such as the following:\n\n container_folder/\n category_1_folder/\n file_1.txt\n file_2.txt\n ...\n file_42.txt\n category_2_folder/\n file_43.txt\n file_44.txt\n ...\n\nThe folder names are used as supervised signal label names. The individual\nfile names are not important.\n\nThis function does not try to extract features into a numpy array or scipy\nsparse matrix. In addition, if load_content is false it does not try to\nload the files in memory.\n\nTo use text files in a scikit-learn classification or clustering algorithm,\nyou will need to use the :mod`~sklearn.feature_extraction.text` module to\nbuild a feature extraction transformer that suits your problem.\n\nIf you set load_content=True, you should also specify the encoding of the\ntext using the 'encoding' parameter. For many modern text files, 'utf-8'\nwill be the correct encoding. If you leave encoding equal to None, then the\ncontent will be made of bytes instead of Unicode, and you will not be able\nto use most functions in :mod:`~sklearn.feature_extraction.text`.\n\nSimilar feature extractors should be built for other kind of unstructured\ndata input such as images, audio, video, ...\n\nRead more in the :ref:`User Guide `.", + "docstring": "Load text files with categories as subfolder names.\n\n Individual samples are assumed to be files stored a two levels folder\n structure such as the following:\n\n container_folder/\n category_1_folder/\n file_1.txt\n file_2.txt\n ...\n file_42.txt\n category_2_folder/\n file_43.txt\n file_44.txt\n ...\n\n The folder names are used as supervised signal label names. 
The individual\n file names are not important.\n\n This function does not try to extract features into a numpy array or scipy\n sparse matrix. In addition, if load_content is false it does not try to\n load the files in memory.\n\n To use text files in a scikit-learn classification or clustering algorithm,\n you will need to use the :mod`~sklearn.feature_extraction.text` module to\n build a feature extraction transformer that suits your problem.\n\n If you set load_content=True, you should also specify the encoding of the\n text using the 'encoding' parameter. For many modern text files, 'utf-8'\n will be the correct encoding. If you leave encoding equal to None, then the\n content will be made of bytes instead of Unicode, and you will not be able\n to use most functions in :mod:`~sklearn.feature_extraction.text`.\n\n Similar feature extractors should be built for other kind of unstructured\n data input such as images, audio, video, ...\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n container_path : str\n Path to the main folder holding one subfolder per category.\n\n description : str, default=None\n A paragraph describing the characteristic of the dataset: its source,\n reference, etc.\n\n categories : list of str, default=None\n If None (default), load all the categories. If not None, list of\n category names to load (other categories ignored).\n\n load_content : bool, default=True\n Whether to load or not the content of the different files. If true a\n 'data' attribute containing the text information is present in the data\n structure returned. If not, a filenames attribute gives the path to the\n files.\n\n shuffle : bool, default=True\n Whether or not to shuffle the data: might be important for models that\n make the assumption that the samples are independent and identically\n distributed (i.i.d.), such as stochastic gradient descent.\n\n encoding : str, default=None\n If None, do not try to decode the content of the files (e.g. for images\n or other non-text content). If not None, encoding to use to decode text\n files to Unicode if load_content is True.\n\n decode_error : {'strict', 'ignore', 'replace'}, default='strict'\n Instruction on what to do if a byte sequence is given to analyze that\n contains characters not of the given `encoding`. Passed as keyword\n argument 'errors' to bytes.decode.\n\n random_state : int, RandomState instance or None, default=0\n Determines random number generation for dataset shuffling. 
Pass an int\n for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n Returns\n -------\n data : :class:`~sklearn.utils.Bunch`\n Dictionary-like object, with the following attributes.\n\n data : list of str\n Only present when `load_content=True`.\n The raw text data to learn.\n target : ndarray\n The target labels (integer index).\n target_names : list\n The names of target classes.\n DESCR : str\n The full description of the dataset.\n filenames: ndarray\n The filenames holding the dataset.\n ", + "source_code": "\ndef load_files(container_path, *, description=None, categories=None, load_content=True, shuffle=True, encoding=None, decode_error='strict', random_state=0):\n \"\"\"Load text files with categories as subfolder names.\n\n Individual samples are assumed to be files stored a two levels folder\n structure such as the following:\n\n container_folder/\n category_1_folder/\n file_1.txt\n file_2.txt\n ...\n file_42.txt\n category_2_folder/\n file_43.txt\n file_44.txt\n ...\n\n The folder names are used as supervised signal label names. The individual\n file names are not important.\n\n This function does not try to extract features into a numpy array or scipy\n sparse matrix. In addition, if load_content is false it does not try to\n load the files in memory.\n\n To use text files in a scikit-learn classification or clustering algorithm,\n you will need to use the :mod`~sklearn.feature_extraction.text` module to\n build a feature extraction transformer that suits your problem.\n\n If you set load_content=True, you should also specify the encoding of the\n text using the 'encoding' parameter. For many modern text files, 'utf-8'\n will be the correct encoding. If you leave encoding equal to None, then the\n content will be made of bytes instead of Unicode, and you will not be able\n to use most functions in :mod:`~sklearn.feature_extraction.text`.\n\n Similar feature extractors should be built for other kind of unstructured\n data input such as images, audio, video, ...\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n container_path : str\n Path to the main folder holding one subfolder per category.\n\n description : str, default=None\n A paragraph describing the characteristic of the dataset: its source,\n reference, etc.\n\n categories : list of str, default=None\n If None (default), load all the categories. If not None, list of\n category names to load (other categories ignored).\n\n load_content : bool, default=True\n Whether to load or not the content of the different files. If true a\n 'data' attribute containing the text information is present in the data\n structure returned. If not, a filenames attribute gives the path to the\n files.\n\n shuffle : bool, default=True\n Whether or not to shuffle the data: might be important for models that\n make the assumption that the samples are independent and identically\n distributed (i.i.d.), such as stochastic gradient descent.\n\n encoding : str, default=None\n If None, do not try to decode the content of the files (e.g. for images\n or other non-text content). If not None, encoding to use to decode text\n files to Unicode if load_content is True.\n\n decode_error : {'strict', 'ignore', 'replace'}, default='strict'\n Instruction on what to do if a byte sequence is given to analyze that\n contains characters not of the given `encoding`. 
Passed as keyword\n argument 'errors' to bytes.decode.\n\n random_state : int, RandomState instance or None, default=0\n Determines random number generation for dataset shuffling. Pass an int\n for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n Returns\n -------\n data : :class:`~sklearn.utils.Bunch`\n Dictionary-like object, with the following attributes.\n\n data : list of str\n Only present when `load_content=True`.\n The raw text data to learn.\n target : ndarray\n The target labels (integer index).\n target_names : list\n The names of target classes.\n DESCR : str\n The full description of the dataset.\n filenames: ndarray\n The filenames holding the dataset.\n \"\"\"\n target = []\n target_names = []\n filenames = []\n folders = [f for f in sorted(listdir(container_path)) if isdir(join(container_path, f))]\n if categories is not None:\n folders = [f for f in folders if f in categories]\n for (label, folder) in enumerate(folders):\n target_names.append(folder)\n folder_path = join(container_path, folder)\n documents = [join(folder_path, d) for d in sorted(listdir(folder_path))]\n target.extend(len(documents) * [label])\n filenames.extend(documents)\n filenames = np.array(filenames)\n target = np.array(target)\n if shuffle:\n random_state = check_random_state(random_state)\n indices = np.arange(filenames.shape[0])\n random_state.shuffle(indices)\n filenames = filenames[indices]\n target = target[indices]\n if load_content:\n data = []\n for filename in filenames:\n with open(filename, 'rb') as f:\n data.append(f.read())\n if encoding is not None:\n data = [d.decode(encoding, decode_error) for d in data]\n return Bunch(data=data, filenames=filenames, target_names=target_names, target=target, DESCR=description)\n return Bunch(filenames=filenames, target_names=target_names, target=target, DESCR=description)" }, { "name": "load_gzip_compressed_csv_data", @@ -42884,7 +44440,8 @@ "docstring": { "type": "str", "description": "Name of gzip-compressed csv file (`'*.csv.gz'`) to be loaded from\n`data_module/data_file_name`. For example `'diabetes_data.csv.gz'`." - } + }, + "refined_type": {} }, { "name": "data_module", @@ -42894,7 +44451,8 @@ "docstring": { "type": "str or module, default='sklearn.datasets.data'", "description": "Module where data lives. The default is `'sklearn.datasets.data'`." - } + }, + "refined_type": {} }, { "name": "descr_file_name", @@ -42904,7 +44462,8 @@ "docstring": { "type": "str, default=None", "description": "Name of rst file to be loaded from `descr_module/descr_file_name`.\nFor example `'wine_data.rst'`. See also :func:`load_descr`.\nIf not None, also returns the corresponding description of\nthe dataset." - } + }, + "refined_type": {} }, { "name": "descr_module", @@ -42914,7 +44473,8 @@ "docstring": { "type": "str or module, default='sklearn.datasets.descr'", "description": "Module where `descr_file_name` lives. See also :func:`load_descr`.\nThe default is `'sklearn.datasets.descr'`." - } + }, + "refined_type": {} }, { "name": "encoding", @@ -42924,13 +44484,14 @@ "docstring": { "type": "str, default=\"utf-8\"", "description": "Name of the encoding that the gzip-decompressed file will be\ndecoded with. The default is 'utf-8'." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Loads gzip-compressed `data_file_name` from `data_module` with `importlib.resources`.\n\n1) Open resource file with `importlib.resources.open_binary` 2) Decompress file obj with `gzip.open` 3) Load decompressed data with `np.loadtxt`", - "docstring": "Loads gzip-compressed `data_file_name` from `data_module` with `importlib.resources`.\n\n1) Open resource file with `importlib.resources.open_binary`\n2) Decompress file obj with `gzip.open`\n3) Load decompressed data with `np.loadtxt`\n\nParameters\n----------\ndata_file_name : str\n Name of gzip-compressed csv file (`'*.csv.gz'`) to be loaded from\n `data_module/data_file_name`. For example `'diabetes_data.csv.gz'`.\n\ndata_module : str or module, default='sklearn.datasets.data'\n Module where data lives. The default is `'sklearn.datasets.data'`.\n\ndescr_file_name : str, default=None\n Name of rst file to be loaded from `descr_module/descr_file_name`.\n For example `'wine_data.rst'`. See also :func:`load_descr`.\n If not None, also returns the corresponding description of\n the dataset.\n\ndescr_module : str or module, default='sklearn.datasets.descr'\n Module where `descr_file_name` lives. See also :func:`load_descr`.\n The default is `'sklearn.datasets.descr'`.\n\nencoding : str, default=\"utf-8\"\n Name of the encoding that the gzip-decompressed file will be\n decoded with. The default is 'utf-8'.\n\n**kwargs : dict, optional\n Keyword arguments to be passed to `np.loadtxt`;\n e.g. delimiter=','.\n\nReturns\n-------\ndata : ndarray of shape (n_samples, n_features)\n A 2D array with each row representing one sample and each column\n representing the features and/or target of a given sample.\n\ndescr : str, optional\n Description of the dataset (the content of `descr_file_name`).\n Only returned if `descr_file_name` is not None.", + "description": "Loads gzip-compressed `data_file_name` from `data_module` with `importlib.resources`.\n\n1) Open resource file with `importlib.resources.open_binary`\n2) Decompress file obj with `gzip.open`\n3) Load decompressed data with `np.loadtxt`", + "docstring": "Loads gzip-compressed `data_file_name` from `data_module` with `importlib.resources`.\n\n 1) Open resource file with `importlib.resources.open_binary`\n 2) Decompress file obj with `gzip.open`\n 3) Load decompressed data with `np.loadtxt`\n\n Parameters\n ----------\n data_file_name : str\n Name of gzip-compressed csv file (`'*.csv.gz'`) to be loaded from\n `data_module/data_file_name`. For example `'diabetes_data.csv.gz'`.\n\n data_module : str or module, default='sklearn.datasets.data'\n Module where data lives. The default is `'sklearn.datasets.data'`.\n\n descr_file_name : str, default=None\n Name of rst file to be loaded from `descr_module/descr_file_name`.\n For example `'wine_data.rst'`. See also :func:`load_descr`.\n If not None, also returns the corresponding description of\n the dataset.\n\n descr_module : str or module, default='sklearn.datasets.descr'\n Module where `descr_file_name` lives. See also :func:`load_descr`.\n The default is `'sklearn.datasets.descr'`.\n\n encoding : str, default=\"utf-8\"\n Name of the encoding that the gzip-decompressed file will be\n decoded with. The default is 'utf-8'.\n\n **kwargs : dict, optional\n Keyword arguments to be passed to `np.loadtxt`;\n e.g. 
delimiter=','.\n\n Returns\n -------\n data : ndarray of shape (n_samples, n_features)\n A 2D array with each row representing one sample and each column\n representing the features and/or target of a given sample.\n\n descr : str, optional\n Description of the dataset (the content of `descr_file_name`).\n Only returned if `descr_file_name` is not None.\n ", "source_code": "\ndef load_gzip_compressed_csv_data(data_file_name, *, data_module=DATA_MODULE, descr_file_name=None, descr_module=DESCR_MODULE, encoding='utf-8', **kwargs):\n \"\"\"Loads gzip-compressed `data_file_name` from `data_module` with `importlib.resources`.\n\n 1) Open resource file with `importlib.resources.open_binary`\n 2) Decompress file obj with `gzip.open`\n 3) Load decompressed data with `np.loadtxt`\n\n Parameters\n ----------\n data_file_name : str\n Name of gzip-compressed csv file (`'*.csv.gz'`) to be loaded from\n `data_module/data_file_name`. For example `'diabetes_data.csv.gz'`.\n\n data_module : str or module, default='sklearn.datasets.data'\n Module where data lives. The default is `'sklearn.datasets.data'`.\n\n descr_file_name : str, default=None\n Name of rst file to be loaded from `descr_module/descr_file_name`.\n For example `'wine_data.rst'`. See also :func:`load_descr`.\n If not None, also returns the corresponding description of\n the dataset.\n\n descr_module : str or module, default='sklearn.datasets.descr'\n Module where `descr_file_name` lives. See also :func:`load_descr`.\n The default is `'sklearn.datasets.descr'`.\n\n encoding : str, default=\"utf-8\"\n Name of the encoding that the gzip-decompressed file will be\n decoded with. The default is 'utf-8'.\n\n **kwargs : dict, optional\n Keyword arguments to be passed to `np.loadtxt`;\n e.g. delimiter=','.\n\n Returns\n -------\n data : ndarray of shape (n_samples, n_features)\n A 2D array with each row representing one sample and each column\n representing the features and/or target of a given sample.\n\n descr : str, optional\n Description of the dataset (the content of `descr_file_name`).\n Only returned if `descr_file_name` is not None.\n \"\"\"\n with resources.open_binary(data_module, data_file_name) as compressed_file:\n compressed_file = gzip.open(compressed_file, mode='rt', encoding=encoding)\n data = np.loadtxt(compressed_file, **kwargs)\n if descr_file_name is None:\n return data\n else:\n assert descr_module is not None\n descr = load_descr(descr_module=descr_module, descr_file_name=descr_file_name)\n return data, descr" }, { @@ -42948,7 +44509,8 @@ "docstring": { "type": "bool, default=False", "description": "If True, returns ``(data, target)`` instead of a Bunch object. See\nbelow for more information about the `data` and `target` object.\n\n.. versionadded:: 0.18" - } + }, + "refined_type": {} }, { "name": "as_frame", @@ -42958,14 +44520,15 @@ "docstring": { "type": "bool, default=False", "description": "If True, the data is a pandas DataFrame including columns with\nappropriate dtypes (numeric). The target is\na pandas DataFrame or Series depending on the number of target columns.\nIf `return_X_y` is True, then (`data`, `target`) will be pandas\nDataFrames or Series as described below.\n\n.. versionadded:: 0.23" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Load and return the iris dataset (classification).\n\nThe iris dataset is a classic and very easy multi-class classification dataset. 
================= ============== Classes 3 Samples per class 50 Samples total 150 Dimensionality 4 Features real, positive ================= ============== Read more in the :ref:`User Guide `.", - "docstring": "Load and return the iris dataset (classification).\n\nThe iris dataset is a classic and very easy multi-class classification\ndataset.\n\n================= ==============\nClasses 3\nSamples per class 50\nSamples total 150\nDimensionality 4\nFeatures real, positive\n================= ==============\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\nreturn_X_y : bool, default=False\n If True, returns ``(data, target)`` instead of a Bunch object. See\n below for more information about the `data` and `target` object.\n\n .. versionadded:: 0.18\n\nas_frame : bool, default=False\n If True, the data is a pandas DataFrame including columns with\n appropriate dtypes (numeric). The target is\n a pandas DataFrame or Series depending on the number of target columns.\n If `return_X_y` is True, then (`data`, `target`) will be pandas\n DataFrames or Series as described below.\n\n .. versionadded:: 0.23\n\nReturns\n-------\ndata : :class:`~sklearn.utils.Bunch`\n Dictionary-like object, with the following attributes.\n\n data : {ndarray, dataframe} of shape (150, 4)\n The data matrix. If `as_frame=True`, `data` will be a pandas\n DataFrame.\n target: {ndarray, Series} of shape (150,)\n The classification target. If `as_frame=True`, `target` will be\n a pandas Series.\n feature_names: list\n The names of the dataset columns.\n target_names: list\n The names of target classes.\n frame: DataFrame of shape (150, 5)\n Only present when `as_frame=True`. DataFrame with `data` and\n `target`.\n\n .. versionadded:: 0.23\n DESCR: str\n The full description of the dataset.\n filename: str\n The path to the location of the data.\n\n .. versionadded:: 0.20\n\n(data, target) : tuple if ``return_X_y`` is True\n\n .. versionadded:: 0.18\n\nNotes\n-----\n .. versionchanged:: 0.20\n Fixed two wrong data points according to Fisher's paper.\n The new version is the same as in R, but not as in the UCI\n Machine Learning Repository.\n\nExamples\n--------\nLet's say you are interested in the samples 10, 25, and 50, and want to\nknow their class name.\n\n>>> from sklearn.datasets import load_iris\n>>> data = load_iris()\n>>> data.target[[10, 25, 50]]\narray([0, 0, 1])\n>>> list(data.target_names)\n['setosa', 'versicolor', 'virginica']", - "source_code": "\ndef load_iris(*, return_X_y=False, as_frame=False):\n \"\"\"Load and return the iris dataset (classification).\n\n The iris dataset is a classic and very easy multi-class classification\n dataset.\n\n ================= ==============\n Classes 3\n Samples per class 50\n Samples total 150\n Dimensionality 4\n Features real, positive\n ================= ==============\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n return_X_y : bool, default=False\n If True, returns ``(data, target)`` instead of a Bunch object. See\n below for more information about the `data` and `target` object.\n\n .. versionadded:: 0.18\n\n as_frame : bool, default=False\n If True, the data is a pandas DataFrame including columns with\n appropriate dtypes (numeric). The target is\n a pandas DataFrame or Series depending on the number of target columns.\n If `return_X_y` is True, then (`data`, `target`) will be pandas\n DataFrames or Series as described below.\n\n .. 
versionadded:: 0.23\n\n Returns\n -------\n data : :class:`~sklearn.utils.Bunch`\n Dictionary-like object, with the following attributes.\n\n data : {ndarray, dataframe} of shape (150, 4)\n The data matrix. If `as_frame=True`, `data` will be a pandas\n DataFrame.\n target: {ndarray, Series} of shape (150,)\n The classification target. If `as_frame=True`, `target` will be\n a pandas Series.\n feature_names: list\n The names of the dataset columns.\n target_names: list\n The names of target classes.\n frame: DataFrame of shape (150, 5)\n Only present when `as_frame=True`. DataFrame with `data` and\n `target`.\n\n .. versionadded:: 0.23\n DESCR: str\n The full description of the dataset.\n filename: str\n The path to the location of the data.\n\n .. versionadded:: 0.20\n\n (data, target) : tuple if ``return_X_y`` is True\n\n .. versionadded:: 0.18\n\n Notes\n -----\n .. versionchanged:: 0.20\n Fixed two wrong data points according to Fisher's paper.\n The new version is the same as in R, but not as in the UCI\n Machine Learning Repository.\n\n Examples\n --------\n Let's say you are interested in the samples 10, 25, and 50, and want to\n know their class name.\n\n >>> from sklearn.datasets import load_iris\n >>> data = load_iris()\n >>> data.target[[10, 25, 50]]\n array([0, 0, 1])\n >>> list(data.target_names)\n ['setosa', 'versicolor', 'virginica']\n \"\"\"\n data_file_name = 'iris.csv'\n (data, target, target_names, fdescr) = load_csv_data(data_file_name=data_file_name, descr_file_name='iris.rst')\n feature_names = ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']\n frame = None\n target_columns = ['target']\n if as_frame:\n (frame, data, target) = _convert_data_dataframe('load_iris', data, target, feature_names, target_columns)\n if return_X_y:\n return data, target\n return Bunch(data=data, target=target, frame=frame, target_names=target_names, DESCR=fdescr, feature_names=feature_names, filename=data_file_name, data_module=DATA_MODULE)" + "description": "Load and return the iris dataset (classification).\n\nThe iris dataset is a classic and very easy multi-class classification\ndataset.\n\n================= ==============\nClasses 3\nSamples per class 50\nSamples total 150\nDimensionality 4\nFeatures real, positive\n================= ==============\n\nRead more in the :ref:`User Guide `.", + "docstring": "Load and return the iris dataset (classification).\n\n The iris dataset is a classic and very easy multi-class classification\n dataset.\n\n ================= ==============\n Classes 3\n Samples per class 50\n Samples total 150\n Dimensionality 4\n Features real, positive\n ================= ==============\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n return_X_y : bool, default=False\n If True, returns ``(data, target)`` instead of a Bunch object. See\n below for more information about the `data` and `target` object.\n\n .. versionadded:: 0.18\n\n as_frame : bool, default=False\n If True, the data is a pandas DataFrame including columns with\n appropriate dtypes (numeric). The target is\n a pandas DataFrame or Series depending on the number of target columns.\n If `return_X_y` is True, then (`data`, `target`) will be pandas\n DataFrames or Series as described below.\n\n .. versionadded:: 0.23\n\n Returns\n -------\n data : :class:`~sklearn.utils.Bunch`\n Dictionary-like object, with the following attributes.\n\n data : {ndarray, dataframe} of shape (150, 4)\n The data matrix. 
If `as_frame=True`, `data` will be a pandas\n DataFrame.\n target: {ndarray, Series} of shape (150,)\n The classification target. If `as_frame=True`, `target` will be\n a pandas Series.\n feature_names: list\n The names of the dataset columns.\n target_names: list\n The names of target classes.\n frame: DataFrame of shape (150, 5)\n Only present when `as_frame=True`. DataFrame with `data` and\n `target`.\n\n .. versionadded:: 0.23\n DESCR: str\n The full description of the dataset.\n filename: str\n The path to the location of the data.\n\n .. versionadded:: 0.20\n\n (data, target) : tuple if ``return_X_y`` is True\n A tuple of two ndarray. The first containing a 2D array of shape\n (n_samples, n_features) with each row representing one sample and\n each column representing the features. The second ndarray of shape\n (n_samples,) containing the target samples.\n\n .. versionadded:: 0.18\n\n Notes\n -----\n .. versionchanged:: 0.20\n Fixed two wrong data points according to Fisher's paper.\n The new version is the same as in R, but not as in the UCI\n Machine Learning Repository.\n\n Examples\n --------\n Let's say you are interested in the samples 10, 25, and 50, and want to\n know their class name.\n\n >>> from sklearn.datasets import load_iris\n >>> data = load_iris()\n >>> data.target[[10, 25, 50]]\n array([0, 0, 1])\n >>> list(data.target_names)\n ['setosa', 'versicolor', 'virginica']\n ", + "source_code": "\ndef load_iris(*, return_X_y=False, as_frame=False):\n \"\"\"Load and return the iris dataset (classification).\n\n The iris dataset is a classic and very easy multi-class classification\n dataset.\n\n ================= ==============\n Classes 3\n Samples per class 50\n Samples total 150\n Dimensionality 4\n Features real, positive\n ================= ==============\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n return_X_y : bool, default=False\n If True, returns ``(data, target)`` instead of a Bunch object. See\n below for more information about the `data` and `target` object.\n\n .. versionadded:: 0.18\n\n as_frame : bool, default=False\n If True, the data is a pandas DataFrame including columns with\n appropriate dtypes (numeric). The target is\n a pandas DataFrame or Series depending on the number of target columns.\n If `return_X_y` is True, then (`data`, `target`) will be pandas\n DataFrames or Series as described below.\n\n .. versionadded:: 0.23\n\n Returns\n -------\n data : :class:`~sklearn.utils.Bunch`\n Dictionary-like object, with the following attributes.\n\n data : {ndarray, dataframe} of shape (150, 4)\n The data matrix. If `as_frame=True`, `data` will be a pandas\n DataFrame.\n target: {ndarray, Series} of shape (150,)\n The classification target. If `as_frame=True`, `target` will be\n a pandas Series.\n feature_names: list\n The names of the dataset columns.\n target_names: list\n The names of target classes.\n frame: DataFrame of shape (150, 5)\n Only present when `as_frame=True`. DataFrame with `data` and\n `target`.\n\n .. versionadded:: 0.23\n DESCR: str\n The full description of the dataset.\n filename: str\n The path to the location of the data.\n\n .. versionadded:: 0.20\n\n (data, target) : tuple if ``return_X_y`` is True\n A tuple of two ndarray. The first containing a 2D array of shape\n (n_samples, n_features) with each row representing one sample and\n each column representing the features. The second ndarray of shape\n (n_samples,) containing the target samples.\n\n .. versionadded:: 0.18\n\n Notes\n -----\n .. 
versionchanged:: 0.20\n Fixed two wrong data points according to Fisher's paper.\n The new version is the same as in R, but not as in the UCI\n Machine Learning Repository.\n\n Examples\n --------\n Let's say you are interested in the samples 10, 25, and 50, and want to\n know their class name.\n\n >>> from sklearn.datasets import load_iris\n >>> data = load_iris()\n >>> data.target[[10, 25, 50]]\n array([0, 0, 1])\n >>> list(data.target_names)\n ['setosa', 'versicolor', 'virginica']\n \"\"\"\n data_file_name = 'iris.csv'\n (data, target, target_names, fdescr) = load_csv_data(data_file_name=data_file_name, descr_file_name='iris.rst')\n feature_names = ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']\n frame = None\n target_columns = ['target']\n if as_frame:\n (frame, data, target) = _convert_data_dataframe('load_iris', data, target, feature_names, target_columns)\n if return_X_y:\n return data, target\n return Bunch(data=data, target=target, frame=frame, target_names=target_names, DESCR=fdescr, feature_names=feature_names, filename=data_file_name, data_module=DATA_MODULE)" }, { "name": "load_linnerud", @@ -42982,7 +44545,8 @@ "docstring": { "type": "bool, default=False", "description": "If True, returns ``(data, target)`` instead of a Bunch object.\nSee below for more information about the `data` and `target` object.\n\n.. versionadded:: 0.18" - } + }, + "refined_type": {} }, { "name": "as_frame", @@ -42992,13 +44556,14 @@ "docstring": { "type": "bool, default=False", "description": "If True, the data is a pandas DataFrame including columns with\nappropriate dtypes (numeric, string or categorical). The target is\na pandas DataFrame or Series depending on the number of target columns.\nIf `return_X_y` is True, then (`data`, `target`) will be pandas\nDataFrames or Series as described below.\n\n.. versionadded:: 0.23" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Load and return the physical exercise Linnerud dataset.\n\nThis dataset is suitable for multi-ouput regression tasks. ============== ============================ Samples total 20 Dimensionality 3 (for both data and target) Features integer Targets integer ============== ============================ Read more in the :ref:`User Guide `.", - "docstring": "Load and return the physical exercise Linnerud dataset.\n\nThis dataset is suitable for multi-ouput regression tasks.\n\n============== ============================\nSamples total 20\nDimensionality 3 (for both data and target)\nFeatures integer\nTargets integer\n============== ============================\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\nreturn_X_y : bool, default=False\n If True, returns ``(data, target)`` instead of a Bunch object.\n See below for more information about the `data` and `target` object.\n\n .. versionadded:: 0.18\n\nas_frame : bool, default=False\n If True, the data is a pandas DataFrame including columns with\n appropriate dtypes (numeric, string or categorical). The target is\n a pandas DataFrame or Series depending on the number of target columns.\n If `return_X_y` is True, then (`data`, `target`) will be pandas\n DataFrames or Series as described below.\n\n .. versionadded:: 0.23\n\nReturns\n-------\ndata : :class:`~sklearn.utils.Bunch`\n Dictionary-like object, with the following attributes.\n\n data : {ndarray, dataframe} of shape (20, 3)\n The data matrix. 
If `as_frame=True`, `data` will be a pandas\n DataFrame.\n target: {ndarray, dataframe} of shape (20, 3)\n The regression targets. If `as_frame=True`, `target` will be\n a pandas DataFrame.\n feature_names: list\n The names of the dataset columns.\n target_names: list\n The names of the target columns.\n frame: DataFrame of shape (20, 6)\n Only present when `as_frame=True`. DataFrame with `data` and\n `target`.\n\n .. versionadded:: 0.23\n DESCR: str\n The full description of the dataset.\n data_filename: str\n The path to the location of the data.\n target_filename: str\n The path to the location of the target.\n\n .. versionadded:: 0.20\n\n(data, target) : tuple if ``return_X_y`` is True\n\n .. versionadded:: 0.18", + "description": "Load and return the physical exercise Linnerud dataset.\n\nThis dataset is suitable for multi-ouput regression tasks.\n\n============== ============================\nSamples total 20\nDimensionality 3 (for both data and target)\nFeatures integer\nTargets integer\n============== ============================\n\nRead more in the :ref:`User Guide `.", + "docstring": "Load and return the physical exercise Linnerud dataset.\n\n This dataset is suitable for multi-ouput regression tasks.\n\n ============== ============================\n Samples total 20\n Dimensionality 3 (for both data and target)\n Features integer\n Targets integer\n ============== ============================\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n return_X_y : bool, default=False\n If True, returns ``(data, target)`` instead of a Bunch object.\n See below for more information about the `data` and `target` object.\n\n .. versionadded:: 0.18\n\n as_frame : bool, default=False\n If True, the data is a pandas DataFrame including columns with\n appropriate dtypes (numeric, string or categorical). The target is\n a pandas DataFrame or Series depending on the number of target columns.\n If `return_X_y` is True, then (`data`, `target`) will be pandas\n DataFrames or Series as described below.\n\n .. versionadded:: 0.23\n\n Returns\n -------\n data : :class:`~sklearn.utils.Bunch`\n Dictionary-like object, with the following attributes.\n\n data : {ndarray, dataframe} of shape (20, 3)\n The data matrix. If `as_frame=True`, `data` will be a pandas\n DataFrame.\n target: {ndarray, dataframe} of shape (20, 3)\n The regression targets. If `as_frame=True`, `target` will be\n a pandas DataFrame.\n feature_names: list\n The names of the dataset columns.\n target_names: list\n The names of the target columns.\n frame: DataFrame of shape (20, 6)\n Only present when `as_frame=True`. DataFrame with `data` and\n `target`.\n\n .. versionadded:: 0.23\n DESCR: str\n The full description of the dataset.\n data_filename: str\n The path to the location of the data.\n target_filename: str\n The path to the location of the target.\n\n .. versionadded:: 0.20\n\n (data, target) : tuple if ``return_X_y`` is True\n\n .. 
versionadded:: 0.18\n ", "source_code": "\ndef load_linnerud(*, return_X_y=False, as_frame=False):\n \"\"\"Load and return the physical exercise Linnerud dataset.\n\n This dataset is suitable for multi-ouput regression tasks.\n\n ============== ============================\n Samples total 20\n Dimensionality 3 (for both data and target)\n Features integer\n Targets integer\n ============== ============================\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n return_X_y : bool, default=False\n If True, returns ``(data, target)`` instead of a Bunch object.\n See below for more information about the `data` and `target` object.\n\n .. versionadded:: 0.18\n\n as_frame : bool, default=False\n If True, the data is a pandas DataFrame including columns with\n appropriate dtypes (numeric, string or categorical). The target is\n a pandas DataFrame or Series depending on the number of target columns.\n If `return_X_y` is True, then (`data`, `target`) will be pandas\n DataFrames or Series as described below.\n\n .. versionadded:: 0.23\n\n Returns\n -------\n data : :class:`~sklearn.utils.Bunch`\n Dictionary-like object, with the following attributes.\n\n data : {ndarray, dataframe} of shape (20, 3)\n The data matrix. If `as_frame=True`, `data` will be a pandas\n DataFrame.\n target: {ndarray, dataframe} of shape (20, 3)\n The regression targets. If `as_frame=True`, `target` will be\n a pandas DataFrame.\n feature_names: list\n The names of the dataset columns.\n target_names: list\n The names of the target columns.\n frame: DataFrame of shape (20, 6)\n Only present when `as_frame=True`. DataFrame with `data` and\n `target`.\n\n .. versionadded:: 0.23\n DESCR: str\n The full description of the dataset.\n data_filename: str\n The path to the location of the data.\n target_filename: str\n The path to the location of the target.\n\n .. versionadded:: 0.20\n\n (data, target) : tuple if ``return_X_y`` is True\n\n .. 
versionadded:: 0.18\n \"\"\"\n data_filename = 'linnerud_exercise.csv'\n target_filename = 'linnerud_physiological.csv'\n with resources.open_text(DATA_MODULE, data_filename) as f:\n header_exercise = f.readline().split()\n f.seek(0)\n data_exercise = np.loadtxt(f, skiprows=1)\n with resources.open_text(DATA_MODULE, target_filename) as f:\n header_physiological = f.readline().split()\n f.seek(0)\n data_physiological = np.loadtxt(f, skiprows=1)\n fdescr = load_descr('linnerud.rst')\n frame = None\n if as_frame:\n (frame, data_exercise, data_physiological) = _convert_data_dataframe('load_linnerud', data_exercise, data_physiological, header_exercise, header_physiological)\n if return_X_y:\n return data_exercise, data_physiological\n return Bunch(data=data_exercise, feature_names=header_exercise, target=data_physiological, target_names=header_physiological, frame=frame, DESCR=fdescr, data_filename=data_filename, target_filename=target_filename, data_module=DATA_MODULE)" }, { @@ -43016,13 +44581,17 @@ "docstring": { "type": "{`china.jpg`, `flower.jpg`}", "description": "The name of the sample image loaded" + }, + "refined_type": { + "kind": "EnumType", + "values": [] } } ], "results": [], "is_public": true, "description": "Load the numpy array of a single sample image\n\nRead more in the :ref:`User Guide `.", - "docstring": "Load the numpy array of a single sample image\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\nimage_name : {`china.jpg`, `flower.jpg`}\n The name of the sample image loaded\n\nReturns\n-------\nimg : 3D array\n The image as a numpy array: height x width x color\n\nExamples\n--------\n\n>>> from sklearn.datasets import load_sample_image\n>>> china = load_sample_image('china.jpg') # doctest: +SKIP\n>>> china.dtype # doctest: +SKIP\ndtype('uint8')\n>>> china.shape # doctest: +SKIP\n(427, 640, 3)\n>>> flower = load_sample_image('flower.jpg') # doctest: +SKIP\n>>> flower.dtype # doctest: +SKIP\ndtype('uint8')\n>>> flower.shape # doctest: +SKIP\n(427, 640, 3)", + "docstring": "Load the numpy array of a single sample image\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n image_name : {`china.jpg`, `flower.jpg`}\n The name of the sample image loaded\n\n Returns\n -------\n img : 3D array\n The image as a numpy array: height x width x color\n\n Examples\n --------\n\n >>> from sklearn.datasets import load_sample_image\n >>> china = load_sample_image('china.jpg') # doctest: +SKIP\n >>> china.dtype # doctest: +SKIP\n dtype('uint8')\n >>> china.shape # doctest: +SKIP\n (427, 640, 3)\n >>> flower = load_sample_image('flower.jpg') # doctest: +SKIP\n >>> flower.dtype # doctest: +SKIP\n dtype('uint8')\n >>> flower.shape # doctest: +SKIP\n (427, 640, 3)\n ", "source_code": "\ndef load_sample_image(image_name):\n \"\"\"Load the numpy array of a single sample image\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n image_name : {`china.jpg`, `flower.jpg`}\n The name of the sample image loaded\n\n Returns\n -------\n img : 3D array\n The image as a numpy array: height x width x color\n\n Examples\n --------\n\n >>> from sklearn.datasets import load_sample_image\n >>> china = load_sample_image('china.jpg') # doctest: +SKIP\n >>> china.dtype # doctest: +SKIP\n dtype('uint8')\n >>> china.shape # doctest: +SKIP\n (427, 640, 3)\n >>> flower = load_sample_image('flower.jpg') # doctest: +SKIP\n >>> flower.dtype # doctest: +SKIP\n dtype('uint8')\n >>> flower.shape # doctest: +SKIP\n (427, 640, 3)\n \"\"\"\n images = load_sample_images()\n 
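A minimal usage sketch for the `load_linnerud` entry above, based only on the parameters and return shapes stated in its docstring (the (20, 3) shapes are quoted from that docstring, not re-verified here; `as_frame=True` additionally assumes pandas is installed):

from sklearn.datasets import load_linnerud

# Bunch access: data and target are both (20, 3) per the docstring above.
linnerud = load_linnerud()
print(linnerud.data.shape, linnerud.target.shape)
print(linnerud.feature_names, linnerud.target_names)

# return_X_y=True skips the Bunch and returns the (data, target) tuple.
X, y = load_linnerud(return_X_y=True)

# as_frame=True switches data/target to pandas objects; `frame` combines them
# into a (20, 6) DataFrame per the docstring (requires pandas).
frame_bunch = load_linnerud(as_frame=True)
print(type(frame_bunch.frame))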
index = None\n for (i, filename) in enumerate(images.filenames):\n if filename.endswith(image_name):\n index = i\n break\n if index is None:\n raise AttributeError('Cannot find sample image: %s' % image_name)\n return images.images[index]" }, { @@ -43034,8 +44603,8 @@ "parameters": [], "results": [], "is_public": true, - "description": "Load sample images for image manipulation.\n\nLoads both, ``china`` and ``flower``. Read more in the :ref:`User Guide `.", - "docstring": "Load sample images for image manipulation.\n\nLoads both, ``china`` and ``flower``.\n\nRead more in the :ref:`User Guide `.\n\nReturns\n-------\ndata : :class:`~sklearn.utils.Bunch`\n Dictionary-like object, with the following attributes.\n\n images : list of ndarray of shape (427, 640, 3)\n The two sample image.\n filenames : list\n The filenames for the images.\n DESCR : str\n The full description of the dataset.\n\nExamples\n--------\nTo load the data and visualize the images:\n\n>>> from sklearn.datasets import load_sample_images\n>>> dataset = load_sample_images() #doctest: +SKIP\n>>> len(dataset.images) #doctest: +SKIP\n2\n>>> first_img_data = dataset.images[0] #doctest: +SKIP\n>>> first_img_data.shape #doctest: +SKIP\n(427, 640, 3)\n>>> first_img_data.dtype #doctest: +SKIP\ndtype('uint8')", + "description": "Load sample images for image manipulation.\n\nLoads both, ``china`` and ``flower``.\n\nRead more in the :ref:`User Guide `.", + "docstring": "Load sample images for image manipulation.\n\n Loads both, ``china`` and ``flower``.\n\n Read more in the :ref:`User Guide `.\n\n Returns\n -------\n data : :class:`~sklearn.utils.Bunch`\n Dictionary-like object, with the following attributes.\n\n images : list of ndarray of shape (427, 640, 3)\n The two sample image.\n filenames : list\n The filenames for the images.\n DESCR : str\n The full description of the dataset.\n\n Examples\n --------\n To load the data and visualize the images:\n\n >>> from sklearn.datasets import load_sample_images\n >>> dataset = load_sample_images() #doctest: +SKIP\n >>> len(dataset.images) #doctest: +SKIP\n 2\n >>> first_img_data = dataset.images[0] #doctest: +SKIP\n >>> first_img_data.shape #doctest: +SKIP\n (427, 640, 3)\n >>> first_img_data.dtype #doctest: +SKIP\n dtype('uint8')\n ", "source_code": "\ndef load_sample_images():\n \"\"\"Load sample images for image manipulation.\n\n Loads both, ``china`` and ``flower``.\n\n Read more in the :ref:`User Guide `.\n\n Returns\n -------\n data : :class:`~sklearn.utils.Bunch`\n Dictionary-like object, with the following attributes.\n\n images : list of ndarray of shape (427, 640, 3)\n The two sample image.\n filenames : list\n The filenames for the images.\n DESCR : str\n The full description of the dataset.\n\n Examples\n --------\n To load the data and visualize the images:\n\n >>> from sklearn.datasets import load_sample_images\n >>> dataset = load_sample_images() #doctest: +SKIP\n >>> len(dataset.images) #doctest: +SKIP\n 2\n >>> first_img_data = dataset.images[0] #doctest: +SKIP\n >>> first_img_data.shape #doctest: +SKIP\n (427, 640, 3)\n >>> first_img_data.dtype #doctest: +SKIP\n dtype('uint8')\n \"\"\"\n from ..externals._pilutil import imread\n descr = load_descr('README.txt', descr_module=IMAGES_MODULE)\n (filenames, images) = ([], [])\n for filename in sorted(resources.contents(IMAGES_MODULE)):\n if filename.endswith('.jpg'):\n filenames.append(filename)\n with resources.open_binary(IMAGES_MODULE, filename) as image_file:\n image = imread(image_file)\n images.append(image)\n return 
Bunch(images=images, filenames=filenames, DESCR=descr)" }, { @@ -43053,7 +44622,8 @@ "docstring": { "type": "bool, default=False", "description": "If True, returns ``(data, target)`` instead of a Bunch object.\nSee below for more information about the `data` and `target` object." - } + }, + "refined_type": {} }, { "name": "as_frame", @@ -43063,13 +44633,14 @@ "docstring": { "type": "bool, default=False", "description": "If True, the data is a pandas DataFrame including columns with\nappropriate dtypes (numeric). The target is\na pandas DataFrame or Series depending on the number of target columns.\nIf `return_X_y` is True, then (`data`, `target`) will be pandas\nDataFrames or Series as described below.\n\n.. versionadded:: 0.23" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Load and return the wine dataset (classification).\n\n.. versionadded:: 0.18 The wine dataset is a classic and very easy multi-class classification dataset. ================= ============== Classes 3 Samples per class [59,71,48] Samples total 178 Dimensionality 13 Features real, positive ================= ============== Read more in the :ref:`User Guide `.", - "docstring": "Load and return the wine dataset (classification).\n\n.. versionadded:: 0.18\n\nThe wine dataset is a classic and very easy multi-class classification\ndataset.\n\n================= ==============\nClasses 3\nSamples per class [59,71,48]\nSamples total 178\nDimensionality 13\nFeatures real, positive\n================= ==============\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\nreturn_X_y : bool, default=False\n If True, returns ``(data, target)`` instead of a Bunch object.\n See below for more information about the `data` and `target` object.\n\nas_frame : bool, default=False\n If True, the data is a pandas DataFrame including columns with\n appropriate dtypes (numeric). The target is\n a pandas DataFrame or Series depending on the number of target columns.\n If `return_X_y` is True, then (`data`, `target`) will be pandas\n DataFrames or Series as described below.\n\n .. versionadded:: 0.23\n\nReturns\n-------\ndata : :class:`~sklearn.utils.Bunch`\n Dictionary-like object, with the following attributes.\n\n data : {ndarray, dataframe} of shape (178, 13)\n The data matrix. If `as_frame=True`, `data` will be a pandas\n DataFrame.\n target: {ndarray, Series} of shape (178,)\n The classification target. If `as_frame=True`, `target` will be\n a pandas Series.\n feature_names: list\n The names of the dataset columns.\n target_names: list\n The names of target classes.\n frame: DataFrame of shape (178, 14)\n Only present when `as_frame=True`. DataFrame with `data` and\n `target`.\n\n .. versionadded:: 0.23\n DESCR: str\n The full description of the dataset.\n\n(data, target) : tuple if ``return_X_y`` is True\n\nThe copy of UCI ML Wine Data Set dataset is downloaded and modified to fit\nstandard format from:\nhttps://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data\n\nExamples\n--------\nLet's say you are interested in the samples 10, 80, and 140, and want to\nknow their class name.\n\n>>> from sklearn.datasets import load_wine\n>>> data = load_wine()\n>>> data.target[[10, 80, 140]]\narray([0, 1, 2])\n>>> list(data.target_names)\n['class_0', 'class_1', 'class_2']", + "description": "Load and return the wine dataset (classification).\n\n.. 
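A short sketch tying together the `load_sample_image` and `load_sample_images` entries above; it assumes Pillow is available, since the source loads the JPEGs through `imread` from `sklearn.externals._pilutil`, and the dtype/shape comments repeat the doctest output already shown in those docstrings:

from sklearn.datasets import load_sample_image, load_sample_images

# Single image by name; only 'china.jpg' and 'flower.jpg' ship with sklearn.
china = load_sample_image('china.jpg')
print(china.dtype, china.shape)  # uint8, (427, 640, 3) per the docstring above.

# Both images at once, as a Bunch exposing .images, .filenames and DESCR.
dataset = load_sample_images()
print(len(dataset.images), dataset.filenames)

Per the source above, asking for any other name raises AttributeError ('Cannot find sample image: ...').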
versionadded:: 0.18\n\nThe wine dataset is a classic and very easy multi-class classification\ndataset.\n\n================= ==============\nClasses 3\nSamples per class [59,71,48]\nSamples total 178\nDimensionality 13\nFeatures real, positive\n================= ==============\n\nRead more in the :ref:`User Guide `.", + "docstring": "Load and return the wine dataset (classification).\n\n .. versionadded:: 0.18\n\n The wine dataset is a classic and very easy multi-class classification\n dataset.\n\n ================= ==============\n Classes 3\n Samples per class [59,71,48]\n Samples total 178\n Dimensionality 13\n Features real, positive\n ================= ==============\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n return_X_y : bool, default=False\n If True, returns ``(data, target)`` instead of a Bunch object.\n See below for more information about the `data` and `target` object.\n\n as_frame : bool, default=False\n If True, the data is a pandas DataFrame including columns with\n appropriate dtypes (numeric). The target is\n a pandas DataFrame or Series depending on the number of target columns.\n If `return_X_y` is True, then (`data`, `target`) will be pandas\n DataFrames or Series as described below.\n\n .. versionadded:: 0.23\n\n Returns\n -------\n data : :class:`~sklearn.utils.Bunch`\n Dictionary-like object, with the following attributes.\n\n data : {ndarray, dataframe} of shape (178, 13)\n The data matrix. If `as_frame=True`, `data` will be a pandas\n DataFrame.\n target: {ndarray, Series} of shape (178,)\n The classification target. If `as_frame=True`, `target` will be\n a pandas Series.\n feature_names: list\n The names of the dataset columns.\n target_names: list\n The names of target classes.\n frame: DataFrame of shape (178, 14)\n Only present when `as_frame=True`. DataFrame with `data` and\n `target`.\n\n .. versionadded:: 0.23\n DESCR: str\n The full description of the dataset.\n\n (data, target) : tuple if ``return_X_y`` is True\n\n The copy of UCI ML Wine Data Set dataset is downloaded and modified to fit\n standard format from:\n https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data\n\n Examples\n --------\n Let's say you are interested in the samples 10, 80, and 140, and want to\n know their class name.\n\n >>> from sklearn.datasets import load_wine\n >>> data = load_wine()\n >>> data.target[[10, 80, 140]]\n array([0, 1, 2])\n >>> list(data.target_names)\n ['class_0', 'class_1', 'class_2']\n ", "source_code": "\ndef load_wine(*, return_X_y=False, as_frame=False):\n \"\"\"Load and return the wine dataset (classification).\n\n .. versionadded:: 0.18\n\n The wine dataset is a classic and very easy multi-class classification\n dataset.\n\n ================= ==============\n Classes 3\n Samples per class [59,71,48]\n Samples total 178\n Dimensionality 13\n Features real, positive\n ================= ==============\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n return_X_y : bool, default=False\n If True, returns ``(data, target)`` instead of a Bunch object.\n See below for more information about the `data` and `target` object.\n\n as_frame : bool, default=False\n If True, the data is a pandas DataFrame including columns with\n appropriate dtypes (numeric). The target is\n a pandas DataFrame or Series depending on the number of target columns.\n If `return_X_y` is True, then (`data`, `target`) will be pandas\n DataFrames or Series as described below.\n\n .. 
versionadded:: 0.23\n\n Returns\n -------\n data : :class:`~sklearn.utils.Bunch`\n Dictionary-like object, with the following attributes.\n\n data : {ndarray, dataframe} of shape (178, 13)\n The data matrix. If `as_frame=True`, `data` will be a pandas\n DataFrame.\n target: {ndarray, Series} of shape (178,)\n The classification target. If `as_frame=True`, `target` will be\n a pandas Series.\n feature_names: list\n The names of the dataset columns.\n target_names: list\n The names of target classes.\n frame: DataFrame of shape (178, 14)\n Only present when `as_frame=True`. DataFrame with `data` and\n `target`.\n\n .. versionadded:: 0.23\n DESCR: str\n The full description of the dataset.\n\n (data, target) : tuple if ``return_X_y`` is True\n\n The copy of UCI ML Wine Data Set dataset is downloaded and modified to fit\n standard format from:\n https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data\n\n Examples\n --------\n Let's say you are interested in the samples 10, 80, and 140, and want to\n know their class name.\n\n >>> from sklearn.datasets import load_wine\n >>> data = load_wine()\n >>> data.target[[10, 80, 140]]\n array([0, 1, 2])\n >>> list(data.target_names)\n ['class_0', 'class_1', 'class_2']\n \"\"\"\n (data, target, target_names, fdescr) = load_csv_data(data_file_name='wine_data.csv', descr_file_name='wine_data.rst')\n feature_names = ['alcohol', 'malic_acid', 'ash', 'alcalinity_of_ash', 'magnesium', 'total_phenols', 'flavanoids', 'nonflavanoid_phenols', 'proanthocyanins', 'color_intensity', 'hue', 'od280/od315_of_diluted_wines', 'proline']\n frame = None\n target_columns = ['target']\n if as_frame:\n (frame, data, target) = _convert_data_dataframe('load_wine', data, target, feature_names, target_columns)\n if return_X_y:\n return data, target\n return Bunch(data=data, target=target, frame=frame, target_names=target_names, DESCR=fdescr, feature_names=feature_names)" }, { @@ -43087,7 +44658,8 @@ "docstring": { "type": "str, default=None", "description": "Specify another download and cache folder for the datasets. By default\nall scikit-learn data is stored in '~/scikit_learn_data' subfolders." - } + }, + "refined_type": {} }, { "name": "download_if_missing", @@ -43097,7 +44669,8 @@ "docstring": { "type": "bool, default=True", "description": "If False, raise a IOError if the data is not locally available\ninstead of trying to download the data from the source site." - } + }, + "refined_type": {} }, { "name": "return_X_y", @@ -43107,7 +44680,8 @@ "docstring": { "type": "bool, default=False.", "description": "If True, returns ``(data.data, data.target)`` instead of a Bunch\nobject.\n\n.. versionadded:: 0.20" - } + }, + "refined_type": {} }, { "name": "as_frame", @@ -43117,13 +44691,14 @@ "docstring": { "type": "bool, default=False", "description": "If True, the data is a pandas DataFrame including columns with\nappropriate dtypes (numeric, string or categorical). The target is\na pandas DataFrame or Series depending on the number of target_columns.\n\n.. versionadded:: 0.23" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Load the California housing dataset (regression).\n\n============== ============== Samples total 20640 Dimensionality 8 Features real Target real 0.15 - 5. 
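A minimal sketch for the `load_wine` entry above, using only the options documented there; the (178, 13) shape and the `class_0`/`class_1`/`class_2` names come from the docstring, and `as_frame=True` assumes pandas is installed:

from sklearn.datasets import load_wine

# Classification data: X is (178, 13), y holds the classes 0, 1 and 2.
X, y = load_wine(return_X_y=True)
print(X.shape, set(y))

# The Bunch form also exposes the feature and class names listed above.
wine = load_wine()
print(list(wine.target_names))    # ['class_0', 'class_1', 'class_2']
print(wine.feature_names[:3])     # 'alcohol', 'malic_acid', 'ash', ...

# as_frame=True returns pandas objects plus a combined (178, 14) frame.
wine_frame = load_wine(as_frame=True).frame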
============== ============== Read more in the :ref:`User Guide `.", - "docstring": "Load the California housing dataset (regression).\n\n============== ==============\nSamples total 20640\nDimensionality 8\nFeatures real\nTarget real 0.15 - 5.\n============== ==============\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\ndata_home : str, default=None\n Specify another download and cache folder for the datasets. By default\n all scikit-learn data is stored in '~/scikit_learn_data' subfolders.\n\ndownload_if_missing : bool, default=True\n If False, raise a IOError if the data is not locally available\n instead of trying to download the data from the source site.\n\n\nreturn_X_y : bool, default=False.\n If True, returns ``(data.data, data.target)`` instead of a Bunch\n object.\n\n .. versionadded:: 0.20\n\nas_frame : bool, default=False\n If True, the data is a pandas DataFrame including columns with\n appropriate dtypes (numeric, string or categorical). The target is\n a pandas DataFrame or Series depending on the number of target_columns.\n\n .. versionadded:: 0.23\n\nReturns\n-------\ndataset : :class:`~sklearn.utils.Bunch`\n Dictionary-like object, with the following attributes.\n\n data : ndarray, shape (20640, 8)\n Each row corresponding to the 8 feature values in order.\n If ``as_frame`` is True, ``data`` is a pandas object.\n target : numpy array of shape (20640,)\n Each value corresponds to the average\n house value in units of 100,000.\n If ``as_frame`` is True, ``target`` is a pandas object.\n feature_names : list of length 8\n Array of ordered feature names used in the dataset.\n DESCR : str\n Description of the California housing dataset.\n frame : pandas DataFrame\n Only present when `as_frame=True`. DataFrame with ``data`` and\n ``target``.\n\n .. versionadded:: 0.23\n\n(data, target) : tuple if ``return_X_y`` is True\n\n .. versionadded:: 0.20\n\nNotes\n-----\n\nThis dataset consists of 20,640 samples and 9 features.", + "description": "Load the California housing dataset (regression).\n\n============== ==============\nSamples total 20640\nDimensionality 8\nFeatures real\nTarget real 0.15 - 5.\n============== ==============\n\nRead more in the :ref:`User Guide `.", + "docstring": "Load the California housing dataset (regression).\n\n ============== ==============\n Samples total 20640\n Dimensionality 8\n Features real\n Target real 0.15 - 5.\n ============== ==============\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n data_home : str, default=None\n Specify another download and cache folder for the datasets. By default\n all scikit-learn data is stored in '~/scikit_learn_data' subfolders.\n\n download_if_missing : bool, default=True\n If False, raise a IOError if the data is not locally available\n instead of trying to download the data from the source site.\n\n\n return_X_y : bool, default=False.\n If True, returns ``(data.data, data.target)`` instead of a Bunch\n object.\n\n .. versionadded:: 0.20\n\n as_frame : bool, default=False\n If True, the data is a pandas DataFrame including columns with\n appropriate dtypes (numeric, string or categorical). The target is\n a pandas DataFrame or Series depending on the number of target_columns.\n\n .. 
versionadded:: 0.23\n\n Returns\n -------\n dataset : :class:`~sklearn.utils.Bunch`\n Dictionary-like object, with the following attributes.\n\n data : ndarray, shape (20640, 8)\n Each row corresponding to the 8 feature values in order.\n If ``as_frame`` is True, ``data`` is a pandas object.\n target : numpy array of shape (20640,)\n Each value corresponds to the average\n house value in units of 100,000.\n If ``as_frame`` is True, ``target`` is a pandas object.\n feature_names : list of length 8\n Array of ordered feature names used in the dataset.\n DESCR : str\n Description of the California housing dataset.\n frame : pandas DataFrame\n Only present when `as_frame=True`. DataFrame with ``data`` and\n ``target``.\n\n .. versionadded:: 0.23\n\n (data, target) : tuple if ``return_X_y`` is True\n\n .. versionadded:: 0.20\n\n Notes\n -----\n\n This dataset consists of 20,640 samples and 9 features.\n ", "source_code": "\ndef fetch_california_housing(*, data_home=None, download_if_missing=True, return_X_y=False, as_frame=False):\n \"\"\"Load the California housing dataset (regression).\n\n ============== ==============\n Samples total 20640\n Dimensionality 8\n Features real\n Target real 0.15 - 5.\n ============== ==============\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n data_home : str, default=None\n Specify another download and cache folder for the datasets. By default\n all scikit-learn data is stored in '~/scikit_learn_data' subfolders.\n\n download_if_missing : bool, default=True\n If False, raise a IOError if the data is not locally available\n instead of trying to download the data from the source site.\n\n\n return_X_y : bool, default=False.\n If True, returns ``(data.data, data.target)`` instead of a Bunch\n object.\n\n .. versionadded:: 0.20\n\n as_frame : bool, default=False\n If True, the data is a pandas DataFrame including columns with\n appropriate dtypes (numeric, string or categorical). The target is\n a pandas DataFrame or Series depending on the number of target_columns.\n\n .. versionadded:: 0.23\n\n Returns\n -------\n dataset : :class:`~sklearn.utils.Bunch`\n Dictionary-like object, with the following attributes.\n\n data : ndarray, shape (20640, 8)\n Each row corresponding to the 8 feature values in order.\n If ``as_frame`` is True, ``data`` is a pandas object.\n target : numpy array of shape (20640,)\n Each value corresponds to the average\n house value in units of 100,000.\n If ``as_frame`` is True, ``target`` is a pandas object.\n feature_names : list of length 8\n Array of ordered feature names used in the dataset.\n DESCR : str\n Description of the California housing dataset.\n frame : pandas DataFrame\n Only present when `as_frame=True`. DataFrame with ``data`` and\n ``target``.\n\n .. versionadded:: 0.23\n\n (data, target) : tuple if ``return_X_y`` is True\n\n .. versionadded:: 0.20\n\n Notes\n -----\n\n This dataset consists of 20,640 samples and 9 features.\n \"\"\"\n data_home = get_data_home(data_home=data_home)\n if not exists(data_home):\n makedirs(data_home)\n filepath = _pkl_filepath(data_home, 'cal_housing.pkz')\n if not exists(filepath):\n if not download_if_missing:\n raise IOError('Data not found and `download_if_missing` is False')\n logger.info('Downloading Cal. 
housing from {} to {}'.format(ARCHIVE.url, data_home))\n archive_path = _fetch_remote(ARCHIVE, dirname=data_home)\n with tarfile.open(mode='r:gz', name=archive_path) as f:\n cal_housing = np.loadtxt(f.extractfile('CaliforniaHousing/cal_housing.data'), delimiter=',')\n columns_index = [8, 7, 2, 3, 4, 5, 6, 1, 0]\n cal_housing = cal_housing[:, columns_index]\n joblib.dump(cal_housing, filepath, compress=6)\n remove(archive_path)\n else:\n cal_housing = joblib.load(filepath)\n feature_names = ['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude']\n (target, data) = (cal_housing[:, 0], cal_housing[:, 1:])\n data[:, 2] /= data[:, 5]\n data[:, 3] /= data[:, 5]\n data[:, 5] = data[:, 4] / data[:, 5]\n target = target / 100000.0\n descr = load_descr('california_housing.rst')\n X = data\n y = target\n frame = None\n target_names = ['MedHouseVal']\n if as_frame:\n (frame, X, y) = _convert_data_dataframe('fetch_california_housing', data, target, feature_names, target_names)\n if return_X_y:\n return X, y\n return Bunch(data=X, target=y, frame=frame, target_names=target_names, feature_names=feature_names, DESCR=descr)" }, { @@ -43141,7 +44716,8 @@ "docstring": { "type": "str, default=None", "description": "Specify another download and cache folder for the datasets. By default\nall scikit-learn data is stored in '~/scikit_learn_data' subfolders." - } + }, + "refined_type": {} }, { "name": "download_if_missing", @@ -43151,7 +44727,8 @@ "docstring": { "type": "bool, default=True", "description": "If False, raise a IOError if the data is not locally available\ninstead of trying to download the data from the source site." - } + }, + "refined_type": {} }, { "name": "random_state", @@ -43161,7 +44738,8 @@ "docstring": { "type": "int, RandomState instance or None, default=None", "description": "Determines random number generation for dataset shuffling. Pass an int\nfor reproducible output across multiple function calls.\nSee :term:`Glossary `." - } + }, + "refined_type": {} }, { "name": "shuffle", @@ -43171,7 +44749,8 @@ "docstring": { "type": "bool, default=False", "description": "Whether to shuffle dataset." - } + }, + "refined_type": {} }, { "name": "return_X_y", @@ -43181,7 +44760,8 @@ "docstring": { "type": "bool, default=False", "description": "If True, returns ``(data.data, data.target)`` instead of a Bunch\nobject.\n\n.. versionadded:: 0.20" - } + }, + "refined_type": {} }, { "name": "as_frame", @@ -43191,13 +44771,14 @@ "docstring": { "type": "bool, default=False", "description": "If True, the data is a pandas DataFrame including columns with\nappropriate dtypes (numeric). The target is a pandas DataFrame or\nSeries depending on the number of target columns. If `return_X_y` is\nTrue, then (`data`, `target`) will be pandas DataFrames or Series as\ndescribed below.\n\n.. versionadded:: 0.24" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Load the covertype dataset (classification).\n\nDownload it if necessary. 
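A usage sketch for the `fetch_california_housing` entry above. Unlike the `load_*` helpers, this fetcher downloads the archive on first use (or raises IOError when `download_if_missing=False` and nothing is cached), so the example assumes network access or a populated `data_home`; the shapes and names repeat the docstring:

from sklearn.datasets import fetch_california_housing

# First call downloads and caches the data under data_home
# (default '~/scikit_learn_data'); later calls reuse the cached pickle.
housing = fetch_california_housing()
print(housing.data.shape)       # (20640, 8) per the docstring above.
print(housing.feature_names)    # MedInc, HouseAge, AveRooms, ...
print(housing.target_names)     # ['MedHouseVal'], values in units of 100,000.

# Fail fast instead of downloading when the cache is absent:
# fetch_california_housing(download_if_missing=False)

X, y = fetch_california_housing(return_X_y=True)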
================= ============ Classes 7 Samples total 581012 Dimensionality 54 Features int ================= ============ Read more in the :ref:`User Guide `.", - "docstring": "Load the covertype dataset (classification).\n\nDownload it if necessary.\n\n================= ============\nClasses 7\nSamples total 581012\nDimensionality 54\nFeatures int\n================= ============\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\ndata_home : str, default=None\n Specify another download and cache folder for the datasets. By default\n all scikit-learn data is stored in '~/scikit_learn_data' subfolders.\n\ndownload_if_missing : bool, default=True\n If False, raise a IOError if the data is not locally available\n instead of trying to download the data from the source site.\n\nrandom_state : int, RandomState instance or None, default=None\n Determines random number generation for dataset shuffling. Pass an int\n for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\nshuffle : bool, default=False\n Whether to shuffle dataset.\n\nreturn_X_y : bool, default=False\n If True, returns ``(data.data, data.target)`` instead of a Bunch\n object.\n\n .. versionadded:: 0.20\n\nas_frame : bool, default=False\n If True, the data is a pandas DataFrame including columns with\n appropriate dtypes (numeric). The target is a pandas DataFrame or\n Series depending on the number of target columns. If `return_X_y` is\n True, then (`data`, `target`) will be pandas DataFrames or Series as\n described below.\n\n .. versionadded:: 0.24\n\nReturns\n-------\ndataset : :class:`~sklearn.utils.Bunch`\n Dictionary-like object, with the following attributes.\n\n data : ndarray of shape (581012, 54)\n Each row corresponds to the 54 features in the dataset.\n target : ndarray of shape (581012,)\n Each value corresponds to one of\n the 7 forest covertypes with values\n ranging between 1 to 7.\n frame : dataframe of shape (581012, 55)\n Only present when `as_frame=True`. Contains `data` and `target`.\n DESCR : str\n Description of the forest covertype dataset.\n feature_names : list\n The names of the dataset columns.\n target_names: list\n The names of the target columns.\n\n(data, target) : tuple if ``return_X_y`` is True\n\n .. versionadded:: 0.20", + "description": "Load the covertype dataset (classification).\n\nDownload it if necessary.\n\n================= ============\nClasses 7\nSamples total 581012\nDimensionality 54\nFeatures int\n================= ============\n\nRead more in the :ref:`User Guide `.", + "docstring": "Load the covertype dataset (classification).\n\n Download it if necessary.\n\n ================= ============\n Classes 7\n Samples total 581012\n Dimensionality 54\n Features int\n ================= ============\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n data_home : str, default=None\n Specify another download and cache folder for the datasets. By default\n all scikit-learn data is stored in '~/scikit_learn_data' subfolders.\n\n download_if_missing : bool, default=True\n If False, raise a IOError if the data is not locally available\n instead of trying to download the data from the source site.\n\n random_state : int, RandomState instance or None, default=None\n Determines random number generation for dataset shuffling. 
Pass an int\n for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n shuffle : bool, default=False\n Whether to shuffle dataset.\n\n return_X_y : bool, default=False\n If True, returns ``(data.data, data.target)`` instead of a Bunch\n object.\n\n .. versionadded:: 0.20\n\n as_frame : bool, default=False\n If True, the data is a pandas DataFrame including columns with\n appropriate dtypes (numeric). The target is a pandas DataFrame or\n Series depending on the number of target columns. If `return_X_y` is\n True, then (`data`, `target`) will be pandas DataFrames or Series as\n described below.\n\n .. versionadded:: 0.24\n\n Returns\n -------\n dataset : :class:`~sklearn.utils.Bunch`\n Dictionary-like object, with the following attributes.\n\n data : ndarray of shape (581012, 54)\n Each row corresponds to the 54 features in the dataset.\n target : ndarray of shape (581012,)\n Each value corresponds to one of\n the 7 forest covertypes with values\n ranging between 1 to 7.\n frame : dataframe of shape (581012, 55)\n Only present when `as_frame=True`. Contains `data` and `target`.\n DESCR : str\n Description of the forest covertype dataset.\n feature_names : list\n The names of the dataset columns.\n target_names: list\n The names of the target columns.\n\n (data, target) : tuple if ``return_X_y`` is True\n\n .. versionadded:: 0.20\n\n ", "source_code": "\ndef fetch_covtype(*, data_home=None, download_if_missing=True, random_state=None, shuffle=False, return_X_y=False, as_frame=False):\n \"\"\"Load the covertype dataset (classification).\n\n Download it if necessary.\n\n ================= ============\n Classes 7\n Samples total 581012\n Dimensionality 54\n Features int\n ================= ============\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n data_home : str, default=None\n Specify another download and cache folder for the datasets. By default\n all scikit-learn data is stored in '~/scikit_learn_data' subfolders.\n\n download_if_missing : bool, default=True\n If False, raise a IOError if the data is not locally available\n instead of trying to download the data from the source site.\n\n random_state : int, RandomState instance or None, default=None\n Determines random number generation for dataset shuffling. Pass an int\n for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n shuffle : bool, default=False\n Whether to shuffle dataset.\n\n return_X_y : bool, default=False\n If True, returns ``(data.data, data.target)`` instead of a Bunch\n object.\n\n .. versionadded:: 0.20\n\n as_frame : bool, default=False\n If True, the data is a pandas DataFrame including columns with\n appropriate dtypes (numeric). The target is a pandas DataFrame or\n Series depending on the number of target columns. If `return_X_y` is\n True, then (`data`, `target`) will be pandas DataFrames or Series as\n described below.\n\n .. versionadded:: 0.24\n\n Returns\n -------\n dataset : :class:`~sklearn.utils.Bunch`\n Dictionary-like object, with the following attributes.\n\n data : ndarray of shape (581012, 54)\n Each row corresponds to the 54 features in the dataset.\n target : ndarray of shape (581012,)\n Each value corresponds to one of\n the 7 forest covertypes with values\n ranging between 1 to 7.\n frame : dataframe of shape (581012, 55)\n Only present when `as_frame=True`. 
Contains `data` and `target`.\n DESCR : str\n Description of the forest covertype dataset.\n feature_names : list\n The names of the dataset columns.\n target_names: list\n The names of the target columns.\n\n (data, target) : tuple if ``return_X_y`` is True\n\n .. versionadded:: 0.20\n\n \"\"\"\n data_home = get_data_home(data_home=data_home)\n covtype_dir = join(data_home, 'covertype')\n samples_path = _pkl_filepath(covtype_dir, 'samples')\n targets_path = _pkl_filepath(covtype_dir, 'targets')\n available = exists(samples_path)\n if download_if_missing and not available:\n if not exists(covtype_dir):\n makedirs(covtype_dir)\n logger.info('Downloading %s' % ARCHIVE.url)\n archive_path = _fetch_remote(ARCHIVE, dirname=covtype_dir)\n Xy = np.genfromtxt(GzipFile(filename=archive_path), delimiter=',')\n remove(archive_path)\n X = Xy[:, :-1]\n y = Xy[:, -1].astype(np.int32, copy=False)\n joblib.dump(X, samples_path, compress=9)\n joblib.dump(y, targets_path, compress=9)\n elif not available and not download_if_missing:\n raise IOError('Data not found and `download_if_missing` is False')\n try:\n (X, y)\n except NameError:\n X = joblib.load(samples_path)\n y = joblib.load(targets_path)\n if shuffle:\n ind = np.arange(X.shape[0])\n rng = check_random_state(random_state)\n rng.shuffle(ind)\n X = X[ind]\n y = y[ind]\n fdescr = load_descr('covtype.rst')\n frame = None\n if as_frame:\n (frame, X, y) = _convert_data_dataframe(caller_name='fetch_covtype', data=X, target=y, feature_names=FEATURE_NAMES, target_names=TARGET_NAMES)\n if return_X_y:\n return X, y\n return Bunch(data=X, target=y, frame=frame, target_names=TARGET_NAMES, feature_names=FEATURE_NAMES, DESCR=fdescr)" }, { @@ -43215,7 +44796,8 @@ "docstring": { "type": "str, default=None", "description": "Specify another download and cache folder for the datasets. By default\nall scikit-learn data is stored in '~/scikit_learn_data' subfolders." - } + }, + "refined_type": {} }, { "name": "download_if_missing", @@ -43225,7 +44807,8 @@ "docstring": { "type": "bool, default=True", "description": "If False, raise a IOError if the data is not locally available\ninstead of trying to download the data from the source site." - } + }, + "refined_type": {} }, { "name": "percent10", @@ -43235,13 +44818,14 @@ "docstring": { "type": "bool, default=True", "description": "Whether to load only 10 percent of the data." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Load the kddcup99 dataset, downloading it if necessary.", - "docstring": "Load the kddcup99 dataset, downloading it if necessary.\n\nParameters\n----------\ndata_home : str, default=None\n Specify another download and cache folder for the datasets. 
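A usage sketch for the `fetch_covtype` entry above, again assuming the dataset can be downloaded or is already cached; the 581012 x 54 shape and the 1..7 label range are taken from the docstring, and `shuffle`/`random_state` follow the behaviour shown in the source:

from sklearn.datasets import fetch_covtype

# First call downloads the archive and caches the parsed arrays.
covtype = fetch_covtype()
print(covtype.data.shape)                           # (581012, 54)
print(covtype.target.min(), covtype.target.max())   # cover types 1..7

# Reproducible shuffling of the cached arrays, returned as a plain tuple.
X, y = fetch_covtype(shuffle=True, random_state=0, return_X_y=True)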
By default\n all scikit-learn data is stored in '~/scikit_learn_data' subfolders.\n\ndownload_if_missing : bool, default=True\n If False, raise a IOError if the data is not locally available\n instead of trying to download the data from the source site.\n\npercent10 : bool, default=True\n Whether to load only 10 percent of the data.\n\nReturns\n-------\ndataset : :class:`~sklearn.utils.Bunch`\n Dictionary-like object, with the following attributes.\n\n data : ndarray of shape (494021, 41)\n Each row corresponds to the 41 features in the dataset.\n target : ndarray of shape (494021,)\n Each value corresponds to one of the 21 attack types or to the\n label 'normal.'.\n feature_names : list\n The names of the dataset columns\n target_names: list\n The names of the target columns\n DESCR : str\n Description of the kddcup99 dataset.", + "docstring": "Load the kddcup99 dataset, downloading it if necessary.\n\n Parameters\n ----------\n data_home : str, default=None\n Specify another download and cache folder for the datasets. By default\n all scikit-learn data is stored in '~/scikit_learn_data' subfolders.\n\n download_if_missing : bool, default=True\n If False, raise a IOError if the data is not locally available\n instead of trying to download the data from the source site.\n\n percent10 : bool, default=True\n Whether to load only 10 percent of the data.\n\n Returns\n -------\n dataset : :class:`~sklearn.utils.Bunch`\n Dictionary-like object, with the following attributes.\n\n data : ndarray of shape (494021, 41)\n Each row corresponds to the 41 features in the dataset.\n target : ndarray of shape (494021,)\n Each value corresponds to one of the 21 attack types or to the\n label 'normal.'.\n feature_names : list\n The names of the dataset columns\n target_names: list\n The names of the target columns\n DESCR : str\n Description of the kddcup99 dataset.\n\n ", "source_code": "\ndef _fetch_brute_kddcup99(data_home=None, download_if_missing=True, percent10=True):\n \"\"\"Load the kddcup99 dataset, downloading it if necessary.\n\n Parameters\n ----------\n data_home : str, default=None\n Specify another download and cache folder for the datasets. 
By default\n all scikit-learn data is stored in '~/scikit_learn_data' subfolders.\n\n download_if_missing : bool, default=True\n If False, raise a IOError if the data is not locally available\n instead of trying to download the data from the source site.\n\n percent10 : bool, default=True\n Whether to load only 10 percent of the data.\n\n Returns\n -------\n dataset : :class:`~sklearn.utils.Bunch`\n Dictionary-like object, with the following attributes.\n\n data : ndarray of shape (494021, 41)\n Each row corresponds to the 41 features in the dataset.\n target : ndarray of shape (494021,)\n Each value corresponds to one of the 21 attack types or to the\n label 'normal.'.\n feature_names : list\n The names of the dataset columns\n target_names: list\n The names of the target columns\n DESCR : str\n Description of the kddcup99 dataset.\n\n \"\"\"\n data_home = get_data_home(data_home=data_home)\n dir_suffix = '-py3'\n if percent10:\n kddcup_dir = join(data_home, 'kddcup99_10' + dir_suffix)\n archive = ARCHIVE_10_PERCENT\n else:\n kddcup_dir = join(data_home, 'kddcup99' + dir_suffix)\n archive = ARCHIVE\n samples_path = join(kddcup_dir, 'samples')\n targets_path = join(kddcup_dir, 'targets')\n available = exists(samples_path)\n dt = [('duration', int), ('protocol_type', 'S4'), ('service', 'S11'), ('flag', 'S6'), ('src_bytes', int), ('dst_bytes', int), ('land', int), ('wrong_fragment', int), ('urgent', int), ('hot', int), ('num_failed_logins', int), ('logged_in', int), ('num_compromised', int), ('root_shell', int), ('su_attempted', int), ('num_root', int), ('num_file_creations', int), ('num_shells', int), ('num_access_files', int), ('num_outbound_cmds', int), ('is_host_login', int), ('is_guest_login', int), ('count', int), ('srv_count', int), ('serror_rate', float), ('srv_serror_rate', float), ('rerror_rate', float), ('srv_rerror_rate', float), ('same_srv_rate', float), ('diff_srv_rate', float), ('srv_diff_host_rate', float), ('dst_host_count', int), ('dst_host_srv_count', int), ('dst_host_same_srv_rate', float), ('dst_host_diff_srv_rate', float), ('dst_host_same_src_port_rate', float), ('dst_host_srv_diff_host_rate', float), ('dst_host_serror_rate', float), ('dst_host_srv_serror_rate', float), ('dst_host_rerror_rate', float), ('dst_host_srv_rerror_rate', float), ('labels', 'S16')]\n column_names = [c[0] for c in dt]\n target_names = column_names[-1]\n feature_names = column_names[:-1]\n if available:\n try:\n X = joblib.load(samples_path)\n y = joblib.load(targets_path)\n except Exception as e:\n raise IOError(f'The cache for fetch_kddcup99 is invalid, please delete {str(kddcup_dir)} and run the fetch_kddcup99 again') from e\n elif download_if_missing:\n _mkdirp(kddcup_dir)\n logger.info('Downloading %s' % archive.url)\n _fetch_remote(archive, dirname=kddcup_dir)\n DT = np.dtype(dt)\n logger.debug('extracting archive')\n archive_path = join(kddcup_dir, archive.filename)\n file_ = GzipFile(filename=archive_path, mode='r')\n Xy = []\n for line in file_.readlines():\n line = line.decode()\n Xy.append(line.replace('\\n', '').split(','))\n file_.close()\n logger.debug('extraction done')\n os.remove(archive_path)\n Xy = np.asarray(Xy, dtype=object)\n for j in range(42):\n Xy[:, j] = Xy[:, j].astype(DT[j])\n X = Xy[:, :-1]\n y = Xy[:, -1]\n joblib.dump(X, samples_path, compress=0)\n joblib.dump(y, targets_path, compress=0)\n else:\n raise IOError('Data not found and `download_if_missing` is False')\n return Bunch(data=X, target=y, feature_names=feature_names, target_names=[target_names])" }, { @@ 
-43259,13 +44843,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Ensure directory d exists (like mkdir -p on Unix) No guarantee that the directory is writable.", - "docstring": "Ensure directory d exists (like mkdir -p on Unix)\nNo guarantee that the directory is writable.", + "description": "Ensure directory d exists (like mkdir -p on Unix)\nNo guarantee that the directory is writable.", + "docstring": "Ensure directory d exists (like mkdir -p on Unix)\n No guarantee that the directory is writable.\n ", "source_code": "\ndef _mkdirp(d):\n \"\"\"Ensure directory d exists (like mkdir -p on Unix)\n No guarantee that the directory is writable.\n \"\"\"\n try:\n os.makedirs(d)\n except OSError as e:\n if e.errno != errno.EEXIST:\n raise" }, { @@ -43283,6 +44868,10 @@ "docstring": { "type": "{'SA', 'SF', 'http', 'smtp'}, default=None", "description": "To return the corresponding classical subsets of kddcup 99.\nIf None, return the entire kddcup 99 dataset." + }, + "refined_type": { + "kind": "EnumType", + "values": ["SF", "SA", "http", "smtp"] } }, { @@ -43293,7 +44882,8 @@ "docstring": { "type": "str, default=None", "description": "Specify another download and cache folder for the datasets. By default\nall scikit-learn data is stored in '~/scikit_learn_data' subfolders.\n.. versionadded:: 0.19" - } + }, + "refined_type": {} }, { "name": "shuffle", @@ -43303,7 +44893,8 @@ "docstring": { "type": "bool, default=False", "description": "Whether to shuffle dataset." - } + }, + "refined_type": {} }, { "name": "random_state", @@ -43313,7 +44904,8 @@ "docstring": { "type": "int, RandomState instance or None, default=None", "description": "Determines random number generation for dataset shuffling and for\nselection of abnormal samples if `subset='SA'`. Pass an int for\nreproducible output across multiple function calls.\nSee :term:`Glossary `." - } + }, + "refined_type": {} }, { "name": "percent10", @@ -43323,7 +44915,8 @@ "docstring": { "type": "bool, default=True", "description": "Whether to load only 10 percent of the data." - } + }, + "refined_type": {} }, { "name": "download_if_missing", @@ -43333,7 +44926,8 @@ "docstring": { "type": "bool, default=True", "description": "If False, raise a IOError if the data is not locally available\ninstead of trying to download the data from the source site." - } + }, + "refined_type": {} }, { "name": "return_X_y", @@ -43343,7 +44937,8 @@ "docstring": { "type": "bool, default=False", "description": "If True, returns ``(data, target)`` instead of a Bunch object. See\nbelow for more information about the `data` and `target` object.\n\n.. versionadded:: 0.20" - } + }, + "refined_type": {} }, { "name": "as_frame", @@ -43353,13 +44948,14 @@ "docstring": { "type": "bool, default=False", "description": "If `True`, returns a pandas Dataframe for the ``data`` and ``target``\nobjects in the `Bunch` returned object; `Bunch` return object will also\nhave a ``frame`` member.\n\n.. versionadded:: 0.24" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Load the kddcup99 dataset (classification).\n\nDownload it if necessary. ================= ==================================== Classes 23 Samples total 4898431 Dimensionality 41 Features discrete (int) or continuous (float) ================= ==================================== Read more in the :ref:`User Guide `. .. 
versionadded:: 0.18", - "docstring": "Load the kddcup99 dataset (classification).\n\nDownload it if necessary.\n\n================= ====================================\nClasses 23\nSamples total 4898431\nDimensionality 41\nFeatures discrete (int) or continuous (float)\n================= ====================================\n\nRead more in the :ref:`User Guide `.\n\n.. versionadded:: 0.18\n\nParameters\n----------\nsubset : {'SA', 'SF', 'http', 'smtp'}, default=None\n To return the corresponding classical subsets of kddcup 99.\n If None, return the entire kddcup 99 dataset.\n\ndata_home : str, default=None\n Specify another download and cache folder for the datasets. By default\n all scikit-learn data is stored in '~/scikit_learn_data' subfolders.\n .. versionadded:: 0.19\n\nshuffle : bool, default=False\n Whether to shuffle dataset.\n\nrandom_state : int, RandomState instance or None, default=None\n Determines random number generation for dataset shuffling and for\n selection of abnormal samples if `subset='SA'`. Pass an int for\n reproducible output across multiple function calls.\n See :term:`Glossary `.\n\npercent10 : bool, default=True\n Whether to load only 10 percent of the data.\n\ndownload_if_missing : bool, default=True\n If False, raise a IOError if the data is not locally available\n instead of trying to download the data from the source site.\n\nreturn_X_y : bool, default=False\n If True, returns ``(data, target)`` instead of a Bunch object. See\n below for more information about the `data` and `target` object.\n\n .. versionadded:: 0.20\n\nas_frame : bool, default=False\n If `True`, returns a pandas Dataframe for the ``data`` and ``target``\n objects in the `Bunch` returned object; `Bunch` return object will also\n have a ``frame`` member.\n\n .. versionadded:: 0.24\n\nReturns\n-------\ndata : :class:`~sklearn.utils.Bunch`\n Dictionary-like object, with the following attributes.\n\n data : {ndarray, dataframe} of shape (494021, 41)\n The data matrix to learn. If `as_frame=True`, `data` will be a\n pandas DataFrame.\n target : {ndarray, series} of shape (494021,)\n The regression target for each sample. If `as_frame=True`, `target`\n will be a pandas Series.\n frame : dataframe of shape (494021, 42)\n Only present when `as_frame=True`. Contains `data` and `target`.\n DESCR : str\n The full description of the dataset.\n feature_names : list\n The names of the dataset columns\n target_names: list\n The names of the target columns\n\n(data, target) : tuple if ``return_X_y`` is True\n\n .. versionadded:: 0.20", + "description": "Load the kddcup99 dataset (classification).\n\nDownload it if necessary.\n\n================= ====================================\nClasses 23\nSamples total 4898431\nDimensionality 41\nFeatures discrete (int) or continuous (float)\n================= ====================================\n\nRead more in the :ref:`User Guide `.\n\n.. versionadded:: 0.18", + "docstring": "Load the kddcup99 dataset (classification).\n\n Download it if necessary.\n\n ================= ====================================\n Classes 23\n Samples total 4898431\n Dimensionality 41\n Features discrete (int) or continuous (float)\n ================= ====================================\n\n Read more in the :ref:`User Guide `.\n\n .. 
versionadded:: 0.18\n\n Parameters\n ----------\n subset : {'SA', 'SF', 'http', 'smtp'}, default=None\n To return the corresponding classical subsets of kddcup 99.\n If None, return the entire kddcup 99 dataset.\n\n data_home : str, default=None\n Specify another download and cache folder for the datasets. By default\n all scikit-learn data is stored in '~/scikit_learn_data' subfolders.\n .. versionadded:: 0.19\n\n shuffle : bool, default=False\n Whether to shuffle dataset.\n\n random_state : int, RandomState instance or None, default=None\n Determines random number generation for dataset shuffling and for\n selection of abnormal samples if `subset='SA'`. Pass an int for\n reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n percent10 : bool, default=True\n Whether to load only 10 percent of the data.\n\n download_if_missing : bool, default=True\n If False, raise a IOError if the data is not locally available\n instead of trying to download the data from the source site.\n\n return_X_y : bool, default=False\n If True, returns ``(data, target)`` instead of a Bunch object. See\n below for more information about the `data` and `target` object.\n\n .. versionadded:: 0.20\n\n as_frame : bool, default=False\n If `True`, returns a pandas Dataframe for the ``data`` and ``target``\n objects in the `Bunch` returned object; `Bunch` return object will also\n have a ``frame`` member.\n\n .. versionadded:: 0.24\n\n Returns\n -------\n data : :class:`~sklearn.utils.Bunch`\n Dictionary-like object, with the following attributes.\n\n data : {ndarray, dataframe} of shape (494021, 41)\n The data matrix to learn. If `as_frame=True`, `data` will be a\n pandas DataFrame.\n target : {ndarray, series} of shape (494021,)\n The regression target for each sample. If `as_frame=True`, `target`\n will be a pandas Series.\n frame : dataframe of shape (494021, 42)\n Only present when `as_frame=True`. Contains `data` and `target`.\n DESCR : str\n The full description of the dataset.\n feature_names : list\n The names of the dataset columns\n target_names: list\n The names of the target columns\n\n (data, target) : tuple if ``return_X_y`` is True\n\n .. versionadded:: 0.20\n ", "source_code": "\ndef fetch_kddcup99(*, subset=None, data_home=None, shuffle=False, random_state=None, percent10=True, download_if_missing=True, return_X_y=False, as_frame=False):\n \"\"\"Load the kddcup99 dataset (classification).\n\n Download it if necessary.\n\n ================= ====================================\n Classes 23\n Samples total 4898431\n Dimensionality 41\n Features discrete (int) or continuous (float)\n ================= ====================================\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.18\n\n Parameters\n ----------\n subset : {'SA', 'SF', 'http', 'smtp'}, default=None\n To return the corresponding classical subsets of kddcup 99.\n If None, return the entire kddcup 99 dataset.\n\n data_home : str, default=None\n Specify another download and cache folder for the datasets. By default\n all scikit-learn data is stored in '~/scikit_learn_data' subfolders.\n .. versionadded:: 0.19\n\n shuffle : bool, default=False\n Whether to shuffle dataset.\n\n random_state : int, RandomState instance or None, default=None\n Determines random number generation for dataset shuffling and for\n selection of abnormal samples if `subset='SA'`. 
Pass an int for\n reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n percent10 : bool, default=True\n Whether to load only 10 percent of the data.\n\n download_if_missing : bool, default=True\n If False, raise a IOError if the data is not locally available\n instead of trying to download the data from the source site.\n\n return_X_y : bool, default=False\n If True, returns ``(data, target)`` instead of a Bunch object. See\n below for more information about the `data` and `target` object.\n\n .. versionadded:: 0.20\n\n as_frame : bool, default=False\n If `True`, returns a pandas Dataframe for the ``data`` and ``target``\n objects in the `Bunch` returned object; `Bunch` return object will also\n have a ``frame`` member.\n\n .. versionadded:: 0.24\n\n Returns\n -------\n data : :class:`~sklearn.utils.Bunch`\n Dictionary-like object, with the following attributes.\n\n data : {ndarray, dataframe} of shape (494021, 41)\n The data matrix to learn. If `as_frame=True`, `data` will be a\n pandas DataFrame.\n target : {ndarray, series} of shape (494021,)\n The regression target for each sample. If `as_frame=True`, `target`\n will be a pandas Series.\n frame : dataframe of shape (494021, 42)\n Only present when `as_frame=True`. Contains `data` and `target`.\n DESCR : str\n The full description of the dataset.\n feature_names : list\n The names of the dataset columns\n target_names: list\n The names of the target columns\n\n (data, target) : tuple if ``return_X_y`` is True\n\n .. versionadded:: 0.20\n \"\"\"\n data_home = get_data_home(data_home=data_home)\n kddcup99 = _fetch_brute_kddcup99(data_home=data_home, percent10=percent10, download_if_missing=download_if_missing)\n data = kddcup99.data\n target = kddcup99.target\n feature_names = kddcup99.feature_names\n target_names = kddcup99.target_names\n if subset == 'SA':\n s = target == b'normal.'\n t = np.logical_not(s)\n normal_samples = data[s, :]\n normal_targets = target[s]\n abnormal_samples = data[t, :]\n abnormal_targets = target[t]\n n_samples_abnormal = abnormal_samples.shape[0]\n random_state = check_random_state(random_state)\n r = random_state.randint(0, n_samples_abnormal, 3377)\n abnormal_samples = abnormal_samples[r]\n abnormal_targets = abnormal_targets[r]\n data = np.r_[normal_samples, abnormal_samples]\n target = np.r_[normal_targets, abnormal_targets]\n if subset == 'SF' or subset == 'http' or subset == 'smtp':\n s = data[:, 11] == 1\n data = np.c_[data[s, :11], data[s, 12:]]\n feature_names = feature_names[:11] + feature_names[12:]\n target = target[s]\n data[:, 0] = np.log((data[:, 0] + 0.1).astype(float, copy=False))\n data[:, 4] = np.log((data[:, 4] + 0.1).astype(float, copy=False))\n data[:, 5] = np.log((data[:, 5] + 0.1).astype(float, copy=False))\n if subset == 'http':\n s = data[:, 2] == b'http'\n data = data[s]\n target = target[s]\n data = np.c_[data[:, 0], data[:, 4], data[:, 5]]\n feature_names = [feature_names[0], feature_names[4], feature_names[5]]\n if subset == 'smtp':\n s = data[:, 2] == b'smtp'\n data = data[s]\n target = target[s]\n data = np.c_[data[:, 0], data[:, 4], data[:, 5]]\n feature_names = [feature_names[0], feature_names[4], feature_names[5]]\n if subset == 'SF':\n data = np.c_[data[:, 0], data[:, 2], data[:, 4], data[:, 5]]\n feature_names = [feature_names[0], feature_names[2], feature_names[4], feature_names[5]]\n if shuffle:\n (data, target) = shuffle_method(data, target, random_state=random_state)\n fdescr = load_descr('kddcup99.rst')\n frame = None\n if as_frame:\n 
(frame, data, target) = _convert_data_dataframe('fetch_kddcup99', data, target, feature_names, target_names)\n if return_X_y:\n return data, target\n return Bunch(data=data, target=target, frame=frame, target_names=target_names, feature_names=feature_names, DESCR=fdescr)" }, { @@ -43377,7 +44973,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "funneled", @@ -43387,7 +44984,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "download_if_missing", @@ -43397,7 +44995,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -43421,7 +45020,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "data_folder_path", @@ -43431,7 +45031,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "slice_", @@ -43441,7 +45042,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "color", @@ -43451,7 +45053,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "resize", @@ -43461,13 +45064,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Perform the actual data loading for the LFW pairs dataset\n\nThis operation is meant to be cached by a joblib wrapper.", - "docstring": "Perform the actual data loading for the LFW pairs dataset\n\nThis operation is meant to be cached by a joblib wrapper.", + "docstring": "Perform the actual data loading for the LFW pairs dataset\n\n This operation is meant to be cached by a joblib wrapper.\n ", "source_code": "\ndef _fetch_lfw_pairs(index_file_path, data_folder_path, slice_=None, color=False, resize=None):\n \"\"\"Perform the actual data loading for the LFW pairs dataset\n\n This operation is meant to be cached by a joblib wrapper.\n \"\"\"\n with open(index_file_path, 'rb') as index_file:\n split_lines = [ln.decode().strip().split('\\t') for ln in index_file]\n pair_specs = [sl for sl in split_lines if len(sl) > 2]\n n_pairs = len(pair_specs)\n target = np.zeros(n_pairs, dtype=int)\n file_paths = list()\n for (i, components) in enumerate(pair_specs):\n if len(components) == 3:\n target[i] = 1\n pair = ((components[0], int(components[1]) - 1), (components[0], int(components[2]) - 1))\n elif len(components) == 4:\n target[i] = 0\n pair = ((components[0], int(components[1]) - 1), (components[2], int(components[3]) - 1))\n else:\n raise ValueError('invalid line %d: %r' % (i + 1, components))\n for (j, (name, idx)) in enumerate(pair):\n try:\n person_folder = join(data_folder_path, name)\n except TypeError:\n person_folder = join(data_folder_path, str(name, 'UTF-8'))\n filenames = list(sorted(listdir(person_folder)))\n file_path = join(person_folder, filenames[idx])\n file_paths.append(file_path)\n pairs = _load_imgs(file_paths, slice_, color, resize)\n shape = list(pairs.shape)\n n_faces = shape.pop(0)\n shape.insert(0, 2)\n shape.insert(0, n_faces // 2)\n pairs.shape = shape\n return pairs, target, np.array(['Different persons', 'Same person'])" }, { @@ -43485,7 +45089,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "slice_", @@ -43495,7 +45100,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "color", @@ -43505,7 +45111,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": 
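A sketch of the `fetch_kddcup99` entry above, showing the subset handling that its source implements (the private `_fetch_brute_kddcup99` does the download and caching). It assumes network access or a cached copy; the ~494021 x 41 shape for the default 10% load is quoted from the docstring:

from sklearn.datasets import fetch_kddcup99

# Default: percent10=True, i.e. the 10% subset with 41 features.
kdd = fetch_kddcup99()
print(kdd.data.shape, kdd.target.shape)

# The classical 'SA' subset keeps all normal traffic plus a small random
# draw of abnormal samples; random_state controls that draw per the source.
X, y = fetch_kddcup99(subset='SA', random_state=0, return_X_y=True)

# 'http', 'smtp' and 'SF' further restrict rows and columns as in the source
# above; the 'http' subset keeps three log-scaled features.
http = fetch_kddcup99(subset='http')
print(http.data.shape)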
"resize", @@ -43515,7 +45122,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "min_faces_per_person", @@ -43525,13 +45133,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Perform the actual data loading for the lfw people dataset\n\nThis operation is meant to be cached by a joblib wrapper.", - "docstring": "Perform the actual data loading for the lfw people dataset\n\nThis operation is meant to be cached by a joblib wrapper.", + "docstring": "Perform the actual data loading for the lfw people dataset\n\n This operation is meant to be cached by a joblib wrapper.\n ", "source_code": "\ndef _fetch_lfw_people(data_folder_path, slice_=None, color=False, resize=None, min_faces_per_person=0):\n \"\"\"Perform the actual data loading for the lfw people dataset\n\n This operation is meant to be cached by a joblib wrapper.\n \"\"\"\n (person_names, file_paths) = ([], [])\n for person_name in sorted(listdir(data_folder_path)):\n folder_path = join(data_folder_path, person_name)\n if not isdir(folder_path):\n continue\n paths = [join(folder_path, f) for f in sorted(listdir(folder_path))]\n n_pictures = len(paths)\n if n_pictures >= min_faces_per_person:\n person_name = person_name.replace('_', ' ')\n person_names.extend([person_name] * n_pictures)\n file_paths.extend(paths)\n n_faces = len(file_paths)\n if n_faces == 0:\n raise ValueError('min_faces_per_person=%d is too restrictive' % min_faces_per_person)\n target_names = np.unique(person_names)\n target = np.searchsorted(target_names, person_names)\n faces = _load_imgs(file_paths, slice_, color, resize)\n indices = np.arange(n_faces)\n np.random.RandomState(42).shuffle(indices)\n (faces, target) = (faces[indices], target[indices])\n return faces, target, target_names" }, { @@ -43549,7 +45158,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "slice_", @@ -43559,7 +45169,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "color", @@ -43569,7 +45180,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "resize", @@ -43579,7 +45191,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -43603,6 +45216,10 @@ "docstring": { "type": "{'train', 'test', '10_folds'}, default='train'", "description": "Select the dataset to load: 'train' for the development training\nset, 'test' for the development test set, and '10_folds' for the\nofficial evaluation set that is meant to be used with a 10-folds\ncross validation." + }, + "refined_type": { + "kind": "EnumType", + "values": ["train", "10_folds", "test"] } }, { @@ -43613,7 +45230,8 @@ "docstring": { "type": "str, default=None", "description": "Specify another download and cache folder for the datasets. By\ndefault all scikit-learn data is stored in '~/scikit_learn_data'\nsubfolders." - } + }, + "refined_type": {} }, { "name": "funneled", @@ -43623,7 +45241,8 @@ "docstring": { "type": "bool, default=True", "description": "Download and use the funneled variant of the dataset." - } + }, + "refined_type": {} }, { "name": "resize", @@ -43633,7 +45252,8 @@ "docstring": { "type": "float, default=0.5", "description": "Ratio used to resize the each face picture." 
- } + }, + "refined_type": {} }, { "name": "color", @@ -43643,7 +45263,8 @@ "docstring": { "type": "bool, default=False", "description": "Keep the 3 RGB channels instead of averaging them to a single\ngray level channel. If color is True the shape of the data has\none more dimension than the shape with color = False." - } + }, + "refined_type": {} }, { "name": "slice_", @@ -43653,7 +45274,8 @@ "docstring": { "type": "tuple of slice, default=(slice(70, 195), slice(78, 172))", "description": "Provide a custom 2D slice (height, width) to extract the\n'interesting' part of the jpeg files and avoid use statistical\ncorrelation from the background" - } + }, + "refined_type": {} }, { "name": "download_if_missing", @@ -43663,13 +45285,14 @@ "docstring": { "type": "bool, default=True", "description": "If False, raise a IOError if the data is not locally available\ninstead of trying to download the data from the source site." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Load the Labeled Faces in the Wild (LFW) pairs dataset (classification).\n\nDownload it if necessary. ================= ======================= Classes 2 Samples total 13233 Dimensionality 5828 Features real, between 0 and 255 ================= ======================= In the official `README.txt`_ this task is described as the \"Restricted\" task. As I am not sure as to implement the \"Unrestricted\" variant correctly, I left it as unsupported for now. .. _`README.txt`: http://vis-www.cs.umass.edu/lfw/README.txt The original images are 250 x 250 pixels, but the default slice and resize arguments reduce them to 62 x 47. Read more in the :ref:`User Guide `.", - "docstring": "Load the Labeled Faces in the Wild (LFW) pairs dataset (classification).\n\nDownload it if necessary.\n\n================= =======================\nClasses 2\nSamples total 13233\nDimensionality 5828\nFeatures real, between 0 and 255\n================= =======================\n\nIn the official `README.txt`_ this task is described as the\n\"Restricted\" task. As I am not sure as to implement the\n\"Unrestricted\" variant correctly, I left it as unsupported for now.\n\n .. _`README.txt`: http://vis-www.cs.umass.edu/lfw/README.txt\n\nThe original images are 250 x 250 pixels, but the default slice and resize\narguments reduce them to 62 x 47.\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\nsubset : {'train', 'test', '10_folds'}, default='train'\n Select the dataset to load: 'train' for the development training\n set, 'test' for the development test set, and '10_folds' for the\n official evaluation set that is meant to be used with a 10-folds\n cross validation.\n\ndata_home : str, default=None\n Specify another download and cache folder for the datasets. By\n default all scikit-learn data is stored in '~/scikit_learn_data'\n subfolders.\n\nfunneled : bool, default=True\n Download and use the funneled variant of the dataset.\n\nresize : float, default=0.5\n Ratio used to resize the each face picture.\n\ncolor : bool, default=False\n Keep the 3 RGB channels instead of averaging them to a single\n gray level channel. 
If color is True the shape of the data has\n one more dimension than the shape with color = False.\n\nslice_ : tuple of slice, default=(slice(70, 195), slice(78, 172))\n Provide a custom 2D slice (height, width) to extract the\n 'interesting' part of the jpeg files and avoid use statistical\n correlation from the background\n\ndownload_if_missing : bool, default=True\n If False, raise a IOError if the data is not locally available\n instead of trying to download the data from the source site.\n\nReturns\n-------\ndata : :class:`~sklearn.utils.Bunch`\n Dictionary-like object, with the following attributes.\n\n data : ndarray of shape (2200, 5828). Shape depends on ``subset``.\n Each row corresponds to 2 ravel'd face images\n of original size 62 x 47 pixels.\n Changing the ``slice_``, ``resize`` or ``subset`` parameters\n will change the shape of the output.\n pairs : ndarray of shape (2200, 2, 62, 47). Shape depends on ``subset``\n Each row has 2 face images corresponding\n to same or different person from the dataset\n containing 5749 people. Changing the ``slice_``,\n ``resize`` or ``subset`` parameters will change the shape of the\n output.\n target : numpy array of shape (2200,). Shape depends on ``subset``.\n Labels associated to each pair of images.\n The two label values being different persons or the same person.\n DESCR : str\n Description of the Labeled Faces in the Wild (LFW) dataset.", + "description": "Load the Labeled Faces in the Wild (LFW) pairs dataset (classification).\n\nDownload it if necessary.\n\n================= =======================\nClasses 2\nSamples total 13233\nDimensionality 5828\nFeatures real, between 0 and 255\n================= =======================\n\nIn the official `README.txt`_ this task is described as the\n\"Restricted\" task. As I am not sure as to implement the\n\"Unrestricted\" variant correctly, I left it as unsupported for now.\n\n .. _`README.txt`: http://vis-www.cs.umass.edu/lfw/README.txt\n\nThe original images are 250 x 250 pixels, but the default slice and resize\narguments reduce them to 62 x 47.\n\nRead more in the :ref:`User Guide `.", + "docstring": "Load the Labeled Faces in the Wild (LFW) pairs dataset (classification).\n\n Download it if necessary.\n\n ================= =======================\n Classes 2\n Samples total 13233\n Dimensionality 5828\n Features real, between 0 and 255\n ================= =======================\n\n In the official `README.txt`_ this task is described as the\n \"Restricted\" task. As I am not sure as to implement the\n \"Unrestricted\" variant correctly, I left it as unsupported for now.\n\n .. _`README.txt`: http://vis-www.cs.umass.edu/lfw/README.txt\n\n The original images are 250 x 250 pixels, but the default slice and resize\n arguments reduce them to 62 x 47.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n subset : {'train', 'test', '10_folds'}, default='train'\n Select the dataset to load: 'train' for the development training\n set, 'test' for the development test set, and '10_folds' for the\n official evaluation set that is meant to be used with a 10-folds\n cross validation.\n\n data_home : str, default=None\n Specify another download and cache folder for the datasets. 
By\n default all scikit-learn data is stored in '~/scikit_learn_data'\n subfolders.\n\n funneled : bool, default=True\n Download and use the funneled variant of the dataset.\n\n resize : float, default=0.5\n Ratio used to resize the each face picture.\n\n color : bool, default=False\n Keep the 3 RGB channels instead of averaging them to a single\n gray level channel. If color is True the shape of the data has\n one more dimension than the shape with color = False.\n\n slice_ : tuple of slice, default=(slice(70, 195), slice(78, 172))\n Provide a custom 2D slice (height, width) to extract the\n 'interesting' part of the jpeg files and avoid use statistical\n correlation from the background\n\n download_if_missing : bool, default=True\n If False, raise a IOError if the data is not locally available\n instead of trying to download the data from the source site.\n\n Returns\n -------\n data : :class:`~sklearn.utils.Bunch`\n Dictionary-like object, with the following attributes.\n\n data : ndarray of shape (2200, 5828). Shape depends on ``subset``.\n Each row corresponds to 2 ravel'd face images\n of original size 62 x 47 pixels.\n Changing the ``slice_``, ``resize`` or ``subset`` parameters\n will change the shape of the output.\n pairs : ndarray of shape (2200, 2, 62, 47). Shape depends on ``subset``\n Each row has 2 face images corresponding\n to same or different person from the dataset\n containing 5749 people. Changing the ``slice_``,\n ``resize`` or ``subset`` parameters will change the shape of the\n output.\n target : numpy array of shape (2200,). Shape depends on ``subset``.\n Labels associated to each pair of images.\n The two label values being different persons or the same person.\n DESCR : str\n Description of the Labeled Faces in the Wild (LFW) dataset.\n\n ", "source_code": "\ndef fetch_lfw_pairs(*, subset='train', data_home=None, funneled=True, resize=0.5, color=False, slice_=(slice(70, 195), slice(78, 172)), download_if_missing=True):\n \"\"\"Load the Labeled Faces in the Wild (LFW) pairs dataset (classification).\n\n Download it if necessary.\n\n ================= =======================\n Classes 2\n Samples total 13233\n Dimensionality 5828\n Features real, between 0 and 255\n ================= =======================\n\n In the official `README.txt`_ this task is described as the\n \"Restricted\" task. As I am not sure as to implement the\n \"Unrestricted\" variant correctly, I left it as unsupported for now.\n\n .. _`README.txt`: http://vis-www.cs.umass.edu/lfw/README.txt\n\n The original images are 250 x 250 pixels, but the default slice and resize\n arguments reduce them to 62 x 47.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n subset : {'train', 'test', '10_folds'}, default='train'\n Select the dataset to load: 'train' for the development training\n set, 'test' for the development test set, and '10_folds' for the\n official evaluation set that is meant to be used with a 10-folds\n cross validation.\n\n data_home : str, default=None\n Specify another download and cache folder for the datasets. By\n default all scikit-learn data is stored in '~/scikit_learn_data'\n subfolders.\n\n funneled : bool, default=True\n Download and use the funneled variant of the dataset.\n\n resize : float, default=0.5\n Ratio used to resize the each face picture.\n\n color : bool, default=False\n Keep the 3 RGB channels instead of averaging them to a single\n gray level channel. 
If color is True the shape of the data has\n one more dimension than the shape with color = False.\n\n slice_ : tuple of slice, default=(slice(70, 195), slice(78, 172))\n Provide a custom 2D slice (height, width) to extract the\n 'interesting' part of the jpeg files and avoid use statistical\n correlation from the background\n\n download_if_missing : bool, default=True\n If False, raise a IOError if the data is not locally available\n instead of trying to download the data from the source site.\n\n Returns\n -------\n data : :class:`~sklearn.utils.Bunch`\n Dictionary-like object, with the following attributes.\n\n data : ndarray of shape (2200, 5828). Shape depends on ``subset``.\n Each row corresponds to 2 ravel'd face images\n of original size 62 x 47 pixels.\n Changing the ``slice_``, ``resize`` or ``subset`` parameters\n will change the shape of the output.\n pairs : ndarray of shape (2200, 2, 62, 47). Shape depends on ``subset``\n Each row has 2 face images corresponding\n to same or different person from the dataset\n containing 5749 people. Changing the ``slice_``,\n ``resize`` or ``subset`` parameters will change the shape of the\n output.\n target : numpy array of shape (2200,). Shape depends on ``subset``.\n Labels associated to each pair of images.\n The two label values being different persons or the same person.\n DESCR : str\n Description of the Labeled Faces in the Wild (LFW) dataset.\n\n \"\"\"\n (lfw_home, data_folder_path) = _check_fetch_lfw(data_home=data_home, funneled=funneled, download_if_missing=download_if_missing)\n logger.debug('Loading %s LFW pairs from %s', subset, lfw_home)\n if parse_version(joblib.__version__) < parse_version('0.12'):\n m = Memory(cachedir=lfw_home, compress=6, verbose=0)\n else:\n m = Memory(location=lfw_home, compress=6, verbose=0)\n load_func = m.cache(_fetch_lfw_pairs)\n label_filenames = {'train': 'pairsDevTrain.txt', 'test': 'pairsDevTest.txt', '10_folds': 'pairs.txt'}\n if subset not in label_filenames:\n raise ValueError(\"subset='%s' is invalid: should be one of %r\" % (subset, list(sorted(label_filenames.keys()))))\n index_file_path = join(lfw_home, label_filenames[subset])\n (pairs, target, target_names) = load_func(index_file_path, data_folder_path, resize=resize, color=color, slice_=slice_)\n fdescr = load_descr('lfw.rst')\n return Bunch(data=pairs.reshape(len(pairs), -1), pairs=pairs, target=target, target_names=target_names, DESCR=fdescr)" }, { @@ -43687,7 +45310,8 @@ "docstring": { "type": "str, default=None", "description": "Specify another download and cache folder for the datasets. By default\nall scikit-learn data is stored in '~/scikit_learn_data' subfolders." - } + }, + "refined_type": {} }, { "name": "funneled", @@ -43697,7 +45321,8 @@ "docstring": { "type": "bool, default=True", "description": "Download and use the funneled variant of the dataset." - } + }, + "refined_type": {} }, { "name": "resize", @@ -43707,7 +45332,8 @@ "docstring": { "type": "float, default=0.5", "description": "Ratio used to resize the each face picture." - } + }, + "refined_type": {} }, { "name": "min_faces_per_person", @@ -43717,7 +45343,8 @@ "docstring": { "type": "int, default=None", "description": "The extracted dataset will only retain pictures of people that have at\nleast `min_faces_per_person` different pictures." - } + }, + "refined_type": {} }, { "name": "color", @@ -43727,7 +45354,8 @@ "docstring": { "type": "bool, default=False", "description": "Keep the 3 RGB channels instead of averaging them to a single\ngray level channel. 
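A minimal usage sketch of the `fetch_lfw_pairs` loader documented above (assuming the LFW archive can be downloaded or is already cached; `subset='train'`, `resize=0.5` and `color=False` are the defaults shown in the entry):

from sklearn.datasets import fetch_lfw_pairs

# pairs has shape (n_pairs, 2, h, w); data is the flattened view of the same
# images; target is 0/1 for different/same person.
lfw_pairs = fetch_lfw_pairs(subset='train', resize=0.5, color=False)
print(lfw_pairs.pairs.shape, lfw_pairs.data.shape, lfw_pairs.target.shape)
print(lfw_pairs.target_names)  # ['Different persons', 'Same person']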
If color is True the shape of the data has\none more dimension than the shape with color = False." - } + }, + "refined_type": {} }, { "name": "slice_", @@ -43737,7 +45365,8 @@ "docstring": { "type": "tuple of slice, default=(slice(70, 195), slice(78, 172))", "description": "Provide a custom 2D slice (height, width) to extract the\n'interesting' part of the jpeg files and avoid use statistical\ncorrelation from the background" - } + }, + "refined_type": {} }, { "name": "download_if_missing", @@ -43747,7 +45376,8 @@ "docstring": { "type": "bool, default=True", "description": "If False, raise a IOError if the data is not locally available\ninstead of trying to download the data from the source site." - } + }, + "refined_type": {} }, { "name": "return_X_y", @@ -43757,13 +45387,14 @@ "docstring": { "type": "bool, default=False", "description": "If True, returns ``(dataset.data, dataset.target)`` instead of a Bunch\nobject. See below for more information about the `dataset.data` and\n`dataset.target` object.\n\n.. versionadded:: 0.20" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Load the Labeled Faces in the Wild (LFW) people dataset (classification).\n\nDownload it if necessary. ================= ======================= Classes 5749 Samples total 13233 Dimensionality 5828 Features real, between 0 and 255 ================= ======================= Read more in the :ref:`User Guide `.", - "docstring": "Load the Labeled Faces in the Wild (LFW) people dataset (classification).\n\nDownload it if necessary.\n\n================= =======================\nClasses 5749\nSamples total 13233\nDimensionality 5828\nFeatures real, between 0 and 255\n================= =======================\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\ndata_home : str, default=None\n Specify another download and cache folder for the datasets. By default\n all scikit-learn data is stored in '~/scikit_learn_data' subfolders.\n\nfunneled : bool, default=True\n Download and use the funneled variant of the dataset.\n\nresize : float, default=0.5\n Ratio used to resize the each face picture.\n\nmin_faces_per_person : int, default=None\n The extracted dataset will only retain pictures of people that have at\n least `min_faces_per_person` different pictures.\n\ncolor : bool, default=False\n Keep the 3 RGB channels instead of averaging them to a single\n gray level channel. If color is True the shape of the data has\n one more dimension than the shape with color = False.\n\nslice_ : tuple of slice, default=(slice(70, 195), slice(78, 172))\n Provide a custom 2D slice (height, width) to extract the\n 'interesting' part of the jpeg files and avoid use statistical\n correlation from the background\n\ndownload_if_missing : bool, default=True\n If False, raise a IOError if the data is not locally available\n instead of trying to download the data from the source site.\n\nreturn_X_y : bool, default=False\n If True, returns ``(dataset.data, dataset.target)`` instead of a Bunch\n object. See below for more information about the `dataset.data` and\n `dataset.target` object.\n\n .. 
versionadded:: 0.20\n\nReturns\n-------\ndataset : :class:`~sklearn.utils.Bunch`\n Dictionary-like object, with the following attributes.\n\n data : numpy array of shape (13233, 2914)\n Each row corresponds to a ravelled face image\n of original size 62 x 47 pixels.\n Changing the ``slice_`` or resize parameters will change the\n shape of the output.\n images : numpy array of shape (13233, 62, 47)\n Each row is a face image corresponding to one of the 5749 people in\n the dataset. Changing the ``slice_``\n or resize parameters will change the shape of the output.\n target : numpy array of shape (13233,)\n Labels associated to each face image.\n Those labels range from 0-5748 and correspond to the person IDs.\n DESCR : str\n Description of the Labeled Faces in the Wild (LFW) dataset.\n\n(data, target) : tuple if ``return_X_y`` is True\n\n .. versionadded:: 0.20", + "description": "Load the Labeled Faces in the Wild (LFW) people dataset (classification).\n\nDownload it if necessary.\n\n================= =======================\nClasses 5749\nSamples total 13233\nDimensionality 5828\nFeatures real, between 0 and 255\n================= =======================\n\nRead more in the :ref:`User Guide `.", + "docstring": "Load the Labeled Faces in the Wild (LFW) people dataset (classification).\n\n Download it if necessary.\n\n ================= =======================\n Classes 5749\n Samples total 13233\n Dimensionality 5828\n Features real, between 0 and 255\n ================= =======================\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n data_home : str, default=None\n Specify another download and cache folder for the datasets. By default\n all scikit-learn data is stored in '~/scikit_learn_data' subfolders.\n\n funneled : bool, default=True\n Download and use the funneled variant of the dataset.\n\n resize : float, default=0.5\n Ratio used to resize the each face picture.\n\n min_faces_per_person : int, default=None\n The extracted dataset will only retain pictures of people that have at\n least `min_faces_per_person` different pictures.\n\n color : bool, default=False\n Keep the 3 RGB channels instead of averaging them to a single\n gray level channel. If color is True the shape of the data has\n one more dimension than the shape with color = False.\n\n slice_ : tuple of slice, default=(slice(70, 195), slice(78, 172))\n Provide a custom 2D slice (height, width) to extract the\n 'interesting' part of the jpeg files and avoid use statistical\n correlation from the background\n\n download_if_missing : bool, default=True\n If False, raise a IOError if the data is not locally available\n instead of trying to download the data from the source site.\n\n return_X_y : bool, default=False\n If True, returns ``(dataset.data, dataset.target)`` instead of a Bunch\n object. See below for more information about the `dataset.data` and\n `dataset.target` object.\n\n .. versionadded:: 0.20\n\n Returns\n -------\n dataset : :class:`~sklearn.utils.Bunch`\n Dictionary-like object, with the following attributes.\n\n data : numpy array of shape (13233, 2914)\n Each row corresponds to a ravelled face image\n of original size 62 x 47 pixels.\n Changing the ``slice_`` or resize parameters will change the\n shape of the output.\n images : numpy array of shape (13233, 62, 47)\n Each row is a face image corresponding to one of the 5749 people in\n the dataset. 
Changing the ``slice_``\n or resize parameters will change the shape of the output.\n target : numpy array of shape (13233,)\n Labels associated to each face image.\n Those labels range from 0-5748 and correspond to the person IDs.\n DESCR : str\n Description of the Labeled Faces in the Wild (LFW) dataset.\n\n (data, target) : tuple if ``return_X_y`` is True\n\n .. versionadded:: 0.20\n\n ", "source_code": "\ndef fetch_lfw_people(*, data_home=None, funneled=True, resize=0.5, min_faces_per_person=0, color=False, slice_=(slice(70, 195), slice(78, 172)), download_if_missing=True, return_X_y=False):\n \"\"\"Load the Labeled Faces in the Wild (LFW) people dataset (classification).\n\n Download it if necessary.\n\n ================= =======================\n Classes 5749\n Samples total 13233\n Dimensionality 5828\n Features real, between 0 and 255\n ================= =======================\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n data_home : str, default=None\n Specify another download and cache folder for the datasets. By default\n all scikit-learn data is stored in '~/scikit_learn_data' subfolders.\n\n funneled : bool, default=True\n Download and use the funneled variant of the dataset.\n\n resize : float, default=0.5\n Ratio used to resize the each face picture.\n\n min_faces_per_person : int, default=None\n The extracted dataset will only retain pictures of people that have at\n least `min_faces_per_person` different pictures.\n\n color : bool, default=False\n Keep the 3 RGB channels instead of averaging them to a single\n gray level channel. If color is True the shape of the data has\n one more dimension than the shape with color = False.\n\n slice_ : tuple of slice, default=(slice(70, 195), slice(78, 172))\n Provide a custom 2D slice (height, width) to extract the\n 'interesting' part of the jpeg files and avoid use statistical\n correlation from the background\n\n download_if_missing : bool, default=True\n If False, raise a IOError if the data is not locally available\n instead of trying to download the data from the source site.\n\n return_X_y : bool, default=False\n If True, returns ``(dataset.data, dataset.target)`` instead of a Bunch\n object. See below for more information about the `dataset.data` and\n `dataset.target` object.\n\n .. versionadded:: 0.20\n\n Returns\n -------\n dataset : :class:`~sklearn.utils.Bunch`\n Dictionary-like object, with the following attributes.\n\n data : numpy array of shape (13233, 2914)\n Each row corresponds to a ravelled face image\n of original size 62 x 47 pixels.\n Changing the ``slice_`` or resize parameters will change the\n shape of the output.\n images : numpy array of shape (13233, 62, 47)\n Each row is a face image corresponding to one of the 5749 people in\n the dataset. Changing the ``slice_``\n or resize parameters will change the shape of the output.\n target : numpy array of shape (13233,)\n Labels associated to each face image.\n Those labels range from 0-5748 and correspond to the person IDs.\n DESCR : str\n Description of the Labeled Faces in the Wild (LFW) dataset.\n\n (data, target) : tuple if ``return_X_y`` is True\n\n .. 
versionadded:: 0.20\n\n \"\"\"\n (lfw_home, data_folder_path) = _check_fetch_lfw(data_home=data_home, funneled=funneled, download_if_missing=download_if_missing)\n logger.debug('Loading LFW people faces from %s', lfw_home)\n if parse_version(joblib.__version__) < parse_version('0.12'):\n m = Memory(cachedir=lfw_home, compress=6, verbose=0)\n else:\n m = Memory(location=lfw_home, compress=6, verbose=0)\n load_func = m.cache(_fetch_lfw_people)\n (faces, target, target_names) = load_func(data_folder_path, resize=resize, min_faces_per_person=min_faces_per_person, color=color, slice_=slice_)\n X = faces.reshape(len(faces), -1)\n fdescr = load_descr('lfw.rst')\n if return_X_y:\n return X, target\n return Bunch(data=X, images=faces, target=target, target_names=target_names, DESCR=fdescr)" }, { @@ -43781,7 +45412,8 @@ "docstring": { "type": "str, default=None", "description": "Specify another download and cache folder for the datasets. By default\nall scikit-learn data is stored in '~/scikit_learn_data' subfolders." - } + }, + "refined_type": {} }, { "name": "shuffle", @@ -43791,7 +45423,8 @@ "docstring": { "type": "bool, default=False", "description": "If True the order of the dataset is shuffled to avoid having\nimages of the same person grouped." - } + }, + "refined_type": {} }, { "name": "random_state", @@ -43801,7 +45434,8 @@ "docstring": { "type": "int, RandomState instance or None, default=0", "description": "Determines random number generation for dataset shuffling. Pass an int\nfor reproducible output across multiple function calls.\nSee :term:`Glossary `." - } + }, + "refined_type": {} }, { "name": "download_if_missing", @@ -43811,7 +45445,8 @@ "docstring": { "type": "bool, default=True", "description": "If False, raise a IOError if the data is not locally available\ninstead of trying to download the data from the source site." - } + }, + "refined_type": {} }, { "name": "return_X_y", @@ -43821,13 +45456,14 @@ "docstring": { "type": "bool, default=False", "description": "If True, returns `(data, target)` instead of a `Bunch` object. See\nbelow for more information about the `data` and `target` object.\n\n.. versionadded:: 0.22" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Load the Olivetti faces data-set from AT&T (classification).\n\nDownload it if necessary. ================= ===================== Classes 40 Samples total 400 Dimensionality 4096 Features real, between 0 and 1 ================= ===================== Read more in the :ref:`User Guide `.", - "docstring": "Load the Olivetti faces data-set from AT&T (classification).\n\nDownload it if necessary.\n\n================= =====================\nClasses 40\nSamples total 400\nDimensionality 4096\nFeatures real, between 0 and 1\n================= =====================\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\ndata_home : str, default=None\n Specify another download and cache folder for the datasets. By default\n all scikit-learn data is stored in '~/scikit_learn_data' subfolders.\n\nshuffle : bool, default=False\n If True the order of the dataset is shuffled to avoid having\n images of the same person grouped.\n\nrandom_state : int, RandomState instance or None, default=0\n Determines random number generation for dataset shuffling. 
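A minimal usage sketch of the `fetch_lfw_people` loader documented above (the `min_faces_per_person` and `resize` values below are illustrative, not defaults):

from sklearn.datasets import fetch_lfw_people

# Keep only people with at least 70 pictures and shrink images to 40% size.
lfw_people = fetch_lfw_people(min_faces_per_person=70, resize=0.4)
n_samples, h, w = lfw_people.images.shape
X, y = lfw_people.data, lfw_people.target
print(n_samples, h, w, len(lfw_people.target_names))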
Pass an int\n for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\ndownload_if_missing : bool, default=True\n If False, raise a IOError if the data is not locally available\n instead of trying to download the data from the source site.\n\nreturn_X_y : bool, default=False\n If True, returns `(data, target)` instead of a `Bunch` object. See\n below for more information about the `data` and `target` object.\n\n .. versionadded:: 0.22\n\nReturns\n-------\ndata : :class:`~sklearn.utils.Bunch`\n Dictionary-like object, with the following attributes.\n\n data: ndarray, shape (400, 4096)\n Each row corresponds to a ravelled\n face image of original size 64 x 64 pixels.\n images : ndarray, shape (400, 64, 64)\n Each row is a face image\n corresponding to one of the 40 subjects of the dataset.\n target : ndarray, shape (400,)\n Labels associated to each face image.\n Those labels are ranging from 0-39 and correspond to the\n Subject IDs.\n DESCR : str\n Description of the modified Olivetti Faces Dataset.\n\n(data, target) : tuple if `return_X_y=True`\n .. versionadded:: 0.22", + "description": "Load the Olivetti faces data-set from AT&T (classification).\n\nDownload it if necessary.\n\n================= =====================\nClasses 40\nSamples total 400\nDimensionality 4096\nFeatures real, between 0 and 1\n================= =====================\n\nRead more in the :ref:`User Guide `.", + "docstring": "Load the Olivetti faces data-set from AT&T (classification).\n\n Download it if necessary.\n\n ================= =====================\n Classes 40\n Samples total 400\n Dimensionality 4096\n Features real, between 0 and 1\n ================= =====================\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n data_home : str, default=None\n Specify another download and cache folder for the datasets. By default\n all scikit-learn data is stored in '~/scikit_learn_data' subfolders.\n\n shuffle : bool, default=False\n If True the order of the dataset is shuffled to avoid having\n images of the same person grouped.\n\n random_state : int, RandomState instance or None, default=0\n Determines random number generation for dataset shuffling. Pass an int\n for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n download_if_missing : bool, default=True\n If False, raise a IOError if the data is not locally available\n instead of trying to download the data from the source site.\n\n return_X_y : bool, default=False\n If True, returns `(data, target)` instead of a `Bunch` object. See\n below for more information about the `data` and `target` object.\n\n .. versionadded:: 0.22\n\n Returns\n -------\n data : :class:`~sklearn.utils.Bunch`\n Dictionary-like object, with the following attributes.\n\n data: ndarray, shape (400, 4096)\n Each row corresponds to a ravelled\n face image of original size 64 x 64 pixels.\n images : ndarray, shape (400, 64, 64)\n Each row is a face image\n corresponding to one of the 40 subjects of the dataset.\n target : ndarray, shape (400,)\n Labels associated to each face image.\n Those labels are ranging from 0-39 and correspond to the\n Subject IDs.\n DESCR : str\n Description of the modified Olivetti Faces Dataset.\n\n (data, target) : tuple if `return_X_y=True`\n .. 
versionadded:: 0.22\n ", "source_code": "\ndef fetch_olivetti_faces(*, data_home=None, shuffle=False, random_state=0, download_if_missing=True, return_X_y=False):\n \"\"\"Load the Olivetti faces data-set from AT&T (classification).\n\n Download it if necessary.\n\n ================= =====================\n Classes 40\n Samples total 400\n Dimensionality 4096\n Features real, between 0 and 1\n ================= =====================\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n data_home : str, default=None\n Specify another download and cache folder for the datasets. By default\n all scikit-learn data is stored in '~/scikit_learn_data' subfolders.\n\n shuffle : bool, default=False\n If True the order of the dataset is shuffled to avoid having\n images of the same person grouped.\n\n random_state : int, RandomState instance or None, default=0\n Determines random number generation for dataset shuffling. Pass an int\n for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n download_if_missing : bool, default=True\n If False, raise a IOError if the data is not locally available\n instead of trying to download the data from the source site.\n\n return_X_y : bool, default=False\n If True, returns `(data, target)` instead of a `Bunch` object. See\n below for more information about the `data` and `target` object.\n\n .. versionadded:: 0.22\n\n Returns\n -------\n data : :class:`~sklearn.utils.Bunch`\n Dictionary-like object, with the following attributes.\n\n data: ndarray, shape (400, 4096)\n Each row corresponds to a ravelled\n face image of original size 64 x 64 pixels.\n images : ndarray, shape (400, 64, 64)\n Each row is a face image\n corresponding to one of the 40 subjects of the dataset.\n target : ndarray, shape (400,)\n Labels associated to each face image.\n Those labels are ranging from 0-39 and correspond to the\n Subject IDs.\n DESCR : str\n Description of the modified Olivetti Faces Dataset.\n\n (data, target) : tuple if `return_X_y=True`\n .. versionadded:: 0.22\n \"\"\"\n data_home = get_data_home(data_home=data_home)\n if not exists(data_home):\n makedirs(data_home)\n filepath = _pkl_filepath(data_home, 'olivetti.pkz')\n if not exists(filepath):\n if not download_if_missing:\n raise IOError('Data not found and `download_if_missing` is False')\n print('downloading Olivetti faces from %s to %s' % (FACES.url, data_home))\n mat_path = _fetch_remote(FACES, dirname=data_home)\n mfile = loadmat(file_name=mat_path)\n remove(mat_path)\n faces = mfile['faces'].T.copy()\n joblib.dump(faces, filepath, compress=6)\n del mfile\n else:\n faces = joblib.load(filepath)\n faces = np.float32(faces)\n faces = faces - faces.min()\n faces /= faces.max()\n faces = faces.reshape((400, 64, 64)).transpose(0, 2, 1)\n target = np.array([i // 10 for i in range(400)])\n if shuffle:\n random_state = check_random_state(random_state)\n order = random_state.permutation(len(faces))\n faces = faces[order]\n target = target[order]\n faces_vectorized = faces.reshape(len(faces), -1)\n fdescr = load_descr('olivetti_faces.rst')\n if return_X_y:\n return faces_vectorized, target\n return Bunch(data=faces_vectorized, images=faces, target=target, DESCR=fdescr)" }, { @@ -43845,7 +45481,8 @@ "docstring": { "type": "dict", "description": "As obtained from liac-arff object." 
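A minimal usage sketch of the `fetch_olivetti_faces` loader documented in the entry above (shuffling and the seed are illustrative choices):

from sklearn.datasets import fetch_olivetti_faces

# 400 images of 40 subjects; shuffle=True avoids consecutive images of the same person.
faces = fetch_olivetti_faces(shuffle=True, random_state=0)
print(faces.data.shape)    # (400, 4096) flattened pixels in [0, 1]
print(faces.images.shape)  # (400, 64, 64)
print(faces.target[:10])   # subject ids in the range 0..39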
- } + }, + "refined_type": {} }, { "name": "col_slice_x", @@ -43855,7 +45492,8 @@ "docstring": { "type": "list", "description": "The column indices that are sliced from the original array to return\nas X data" - } + }, + "refined_type": {} }, { "name": "col_slice_y", @@ -43865,7 +45503,8 @@ "docstring": { "type": "list", "description": "The column indices that are sliced from the original array to return\nas y data" - } + }, + "refined_type": {} }, { "name": "shape", @@ -43875,13 +45514,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "converts the arff object into the appropriate matrix type (np.array or scipy.sparse.csr_matrix) based on the 'data part' (i.e., in the liac-arff dict, the object from the 'data' key)", - "docstring": "converts the arff object into the appropriate matrix type (np.array or\nscipy.sparse.csr_matrix) based on the 'data part' (i.e., in the\nliac-arff dict, the object from the 'data' key)\n\nParameters\n----------\narff : dict\n As obtained from liac-arff object.\n\ncol_slice_x : list\n The column indices that are sliced from the original array to return\n as X data\n\ncol_slice_y : list\n The column indices that are sliced from the original array to return\n as y data\n\nReturns\n-------\nX : np.array or scipy.sparse.csr_matrix\ny : np.array", + "description": "converts the arff object into the appropriate matrix type (np.array or\nscipy.sparse.csr_matrix) based on the 'data part' (i.e., in the\nliac-arff dict, the object from the 'data' key)", + "docstring": "\n converts the arff object into the appropriate matrix type (np.array or\n scipy.sparse.csr_matrix) based on the 'data part' (i.e., in the\n liac-arff dict, the object from the 'data' key)\n\n Parameters\n ----------\n arff : dict\n As obtained from liac-arff object.\n\n col_slice_x : list\n The column indices that are sliced from the original array to return\n as X data\n\n col_slice_y : list\n The column indices that are sliced from the original array to return\n as y data\n\n Returns\n -------\n X : np.array or scipy.sparse.csr_matrix\n y : np.array\n ", "source_code": "\ndef _convert_arff_data(arff: ArffContainerType, col_slice_x: List[int], col_slice_y: List[int], shape: Optional[Tuple] = None) -> Tuple:\n \"\"\"\n converts the arff object into the appropriate matrix type (np.array or\n scipy.sparse.csr_matrix) based on the 'data part' (i.e., in the\n liac-arff dict, the object from the 'data' key)\n\n Parameters\n ----------\n arff : dict\n As obtained from liac-arff object.\n\n col_slice_x : list\n The column indices that are sliced from the original array to return\n as X data\n\n col_slice_y : list\n The column indices that are sliced from the original array to return\n as y data\n\n Returns\n -------\n X : np.array or scipy.sparse.csr_matrix\n y : np.array\n \"\"\"\n arff_data = arff['data']\n if isinstance(arff_data, Generator):\n if shape is None:\n raise ValueError(\"shape must be provided when arr['data'] is a Generator\")\n if shape[0] == -1:\n count = -1\n else:\n count = shape[0] * shape[1]\n data = np.fromiter(itertools.chain.from_iterable(arff_data), dtype='float64', count=count)\n data = data.reshape(*shape)\n X = data[:, col_slice_x]\n y = data[:, col_slice_y]\n return X, y\n elif isinstance(arff_data, tuple):\n arff_data_X = _split_sparse_columns(arff_data, col_slice_x)\n num_obs = max(arff_data[1]) + 1\n X_shape = (num_obs, len(col_slice_x))\n X = scipy.sparse.coo_matrix((arff_data_X[0], 
(arff_data_X[1], arff_data_X[2])), shape=X_shape, dtype=np.float64)\n X = X.tocsr()\n y = _sparse_data_to_array(arff_data, col_slice_y)\n return X, y\n else:\n raise ValueError('Unexpected Data Type obtained from arff.')" }, { @@ -43899,7 +45539,8 @@ "docstring": { "type": "dict", "description": "As obtained from liac-arff object." - } + }, + "refined_type": {} }, { "name": "columns", @@ -43909,7 +45550,8 @@ "docstring": { "type": "list", "description": "Columns from dataframe to return." - } + }, + "refined_type": {} }, { "name": "features_dict", @@ -43919,13 +45561,14 @@ "docstring": { "type": "dict", "description": "Maps feature name to feature info from openml." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Convert the ARFF object into a pandas DataFrame.", - "docstring": "Convert the ARFF object into a pandas DataFrame.\n\nParameters\n----------\narff : dict\n As obtained from liac-arff object.\n\ncolumns : list\n Columns from dataframe to return.\n\nfeatures_dict : dict\n Maps feature name to feature info from openml.\n\nReturns\n-------\nresult : tuple\n tuple with the resulting dataframe", + "docstring": "Convert the ARFF object into a pandas DataFrame.\n\n Parameters\n ----------\n arff : dict\n As obtained from liac-arff object.\n\n columns : list\n Columns from dataframe to return.\n\n features_dict : dict\n Maps feature name to feature info from openml.\n\n Returns\n -------\n result : tuple\n tuple with the resulting dataframe\n ", "source_code": "\ndef _convert_arff_data_dataframe(arff: ArffContainerType, columns: List, features_dict: Dict[str, Any]) -> Tuple:\n \"\"\"Convert the ARFF object into a pandas DataFrame.\n\n Parameters\n ----------\n arff : dict\n As obtained from liac-arff object.\n\n columns : list\n Columns from dataframe to return.\n\n features_dict : dict\n Maps feature name to feature info from openml.\n\n Returns\n -------\n result : tuple\n tuple with the resulting dataframe\n \"\"\"\n pd = check_pandas_support('fetch_openml with as_frame=True')\n attributes = OrderedDict(arff['attributes'])\n arff_columns = list(attributes)\n if not isinstance(arff['data'], Generator):\n raise ValueError(\"arff['data'] must be a generator when converting to pd.DataFrame.\")\n first_row = next(arff['data'])\n first_df = pd.DataFrame([first_row], columns=arff_columns)\n row_bytes = first_df.memory_usage(deep=True).sum()\n chunksize = get_chunk_n_rows(row_bytes)\n columns_to_keep = [col for col in arff_columns if col in columns]\n dfs = []\n dfs.append(first_df[columns_to_keep])\n for data in _chunk_generator(arff['data'], chunksize):\n dfs.append(pd.DataFrame(data, columns=arff_columns)[columns_to_keep])\n df = pd.concat(dfs, ignore_index=True)\n for column in columns_to_keep:\n dtype = _feature_to_dtype(features_dict[column])\n if dtype == 'category':\n cats_without_missing = [cat for cat in attributes[column] if cat is not None and not is_scalar_nan(cat)]\n dtype = pd.api.types.CategoricalDtype(cats_without_missing)\n df[column] = df[column].astype(dtype, copy=False)\n return (df, )" }, { @@ -43943,7 +45586,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "sparse", @@ -43953,7 +45597,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "data_home", @@ -43963,7 +45608,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "as_frame", @@ -43973,7 +45619,8 @@ "docstring": { "type": "", "description": "" - } + }, + 
"refined_type": {} }, { "name": "features_list", @@ -43983,7 +45630,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "data_columns", @@ -43993,7 +45641,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "target_columns", @@ -44003,7 +45652,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "shape", @@ -44013,7 +45663,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "md5_checksum", @@ -44023,7 +45674,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -44047,7 +45699,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -44071,7 +45724,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "data_home", @@ -44081,13 +45735,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _get_data_description_by_id(data_id: int, data_home: Optional[str]) -> Dict[str, Any]:\n url = _DATA_INFO.format(data_id)\n error_message = 'Dataset with data_id {} not found.'.format(data_id)\n json_data = _get_json_content_from_openml_api(url, error_message, data_home=data_home)\n return json_data['data_set_description']" }, { @@ -44105,7 +45760,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "data_home", @@ -44115,13 +45771,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _get_data_features(data_id: int, data_home: Optional[str]) -> OpenmlFeaturesType:\n url = _DATA_FEATURES.format(data_id)\n error_message = 'Dataset with data_id {} not found.'.format(data_id)\n json_data = _get_json_content_from_openml_api(url, error_message, data_home=data_home)\n return json_data['data_features']['feature']" }, { @@ -44139,7 +45796,8 @@ "docstring": { "type": "str", "description": "name of the dataset" - } + }, + "refined_type": {} }, { "name": "version", @@ -44149,7 +45807,8 @@ "docstring": { "type": "int or str", "description": "If version is an integer, the exact name/version will be obtained from\nOpenML. If version is a string (value: \"active\") it will take the first\nversion from OpenML that is annotated as active. Any other string\nvalues except \"active\" are treated as integer." - } + }, + "refined_type": {} }, { "name": "data_home", @@ -44159,13 +45818,14 @@ "docstring": { "type": "str or None", "description": "Location to cache the response. None if no cache is required." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Utilizes the openml dataset listing api to find a dataset by name/version OpenML api function: https://www.openml.org/api_docs#!/data/get_data_list_data_name_data_name", - "docstring": "Utilizes the openml dataset listing api to find a dataset by\nname/version\nOpenML api function:\nhttps://www.openml.org/api_docs#!/data/get_data_list_data_name_data_name\n\nParameters\n----------\nname : str\n name of the dataset\n\nversion : int or str\n If version is an integer, the exact name/version will be obtained from\n OpenML. 
If version is a string (value: \"active\") it will take the first\n version from OpenML that is annotated as active. Any other string\n values except \"active\" are treated as integer.\n\ndata_home : str or None\n Location to cache the response. None if no cache is required.\n\nReturns\n-------\nfirst_dataset : json\n json representation of the first dataset object that adhired to the\n search criteria", + "description": "Utilizes the openml dataset listing api to find a dataset by\nname/version\nOpenML api function:\nhttps://www.openml.org/api_docs#!/data/get_data_list_data_name_data_name", + "docstring": "\n Utilizes the openml dataset listing api to find a dataset by\n name/version\n OpenML api function:\n https://www.openml.org/api_docs#!/data/get_data_list_data_name_data_name\n\n Parameters\n ----------\n name : str\n name of the dataset\n\n version : int or str\n If version is an integer, the exact name/version will be obtained from\n OpenML. If version is a string (value: \"active\") it will take the first\n version from OpenML that is annotated as active. Any other string\n values except \"active\" are treated as integer.\n\n data_home : str or None\n Location to cache the response. None if no cache is required.\n\n Returns\n -------\n first_dataset : json\n json representation of the first dataset object that adhired to the\n search criteria\n\n ", "source_code": "\ndef _get_data_info_by_name(name: str, version: Union[int, str], data_home: Optional[str]):\n \"\"\"\n Utilizes the openml dataset listing api to find a dataset by\n name/version\n OpenML api function:\n https://www.openml.org/api_docs#!/data/get_data_list_data_name_data_name\n\n Parameters\n ----------\n name : str\n name of the dataset\n\n version : int or str\n If version is an integer, the exact name/version will be obtained from\n OpenML. If version is a string (value: \"active\") it will take the first\n version from OpenML that is annotated as active. Any other string\n values except \"active\" are treated as integer.\n\n data_home : str or None\n Location to cache the response. None if no cache is required.\n\n Returns\n -------\n first_dataset : json\n json representation of the first dataset object that adhired to the\n search criteria\n\n \"\"\"\n if version == 'active':\n url = _SEARCH_NAME.format(name) + '/status/active/'\n error_msg = 'No active dataset {} found.'.format(name)\n json_data = _get_json_content_from_openml_api(url, error_msg, data_home=data_home)\n res = json_data['data']['dataset']\n if len(res) > 1:\n warn('Multiple active versions of the dataset matching the name {name} exist. 
Versions may be fundamentally different, returning version {version}.'.format(name=name, version=res[0]['version']))\n return res[0]\n url = (_SEARCH_NAME + '/data_version/{}').format(name, version)\n try:\n json_data = _get_json_content_from_openml_api(url, error_message=None, data_home=data_home)\n except OpenMLError:\n url += '/status/deactivated'\n error_msg = 'Dataset {} with version {} not found.'.format(name, version)\n json_data = _get_json_content_from_openml_api(url, error_msg, data_home=data_home)\n return json_data['data']['dataset'][0]" }, { @@ -44183,7 +45843,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "data_home", @@ -44193,13 +45854,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _get_data_qualities(data_id: int, data_home: Optional[str]) -> OpenmlQualitiesType:\n url = _DATA_QUALITIES.format(data_id)\n error_message = 'Dataset with data_id {} not found.'.format(data_id)\n json_data = _get_json_content_from_openml_api(url, error_message, data_home=data_home)\n return json_data.get('data_qualities', {}).get('quality', [])" }, { @@ -44217,7 +45879,8 @@ "docstring": { "type": "str", "description": "The URL to load from. Should be an official OpenML endpoint" - } + }, + "refined_type": {} }, { "name": "error_message", @@ -44227,7 +45890,8 @@ "docstring": { "type": "str or None", "description": "The error message to raise if an acceptable OpenML error is thrown\n(acceptable error is, e.g., data id not found. Other errors, like 404's\nwill throw the native error message)" - } + }, + "refined_type": {} }, { "name": "data_home", @@ -44237,13 +45901,14 @@ "docstring": { "type": "str or None", "description": "Location to cache the response. None if no cache is required." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Loads json data from the openml api", - "docstring": "Loads json data from the openml api\n\nParameters\n----------\nurl : str\n The URL to load from. Should be an official OpenML endpoint\n\nerror_message : str or None\n The error message to raise if an acceptable OpenML error is thrown\n (acceptable error is, e.g., data id not found. Other errors, like 404's\n will throw the native error message)\n\ndata_home : str or None\n Location to cache the response. None if no cache is required.\n\nReturns\n-------\njson_data : json\n the json result from the OpenML server if the call was successful.\n An exception otherwise.", + "docstring": "\n Loads json data from the openml api\n\n Parameters\n ----------\n url : str\n The URL to load from. Should be an official OpenML endpoint\n\n error_message : str or None\n The error message to raise if an acceptable OpenML error is thrown\n (acceptable error is, e.g., data id not found. Other errors, like 404's\n will throw the native error message)\n\n data_home : str or None\n Location to cache the response. None if no cache is required.\n\n Returns\n -------\n json_data : json\n the json result from the OpenML server if the call was successful.\n An exception otherwise.\n ", "source_code": "\ndef _get_json_content_from_openml_api(url: str, error_message: Optional[str], data_home: Optional[str]) -> Dict:\n \"\"\"\n Loads json data from the openml api\n\n Parameters\n ----------\n url : str\n The URL to load from. 
Should be an official OpenML endpoint\n\n error_message : str or None\n The error message to raise if an acceptable OpenML error is thrown\n (acceptable error is, e.g., data id not found. Other errors, like 404's\n will throw the native error message)\n\n data_home : str or None\n Location to cache the response. None if no cache is required.\n\n Returns\n -------\n json_data : json\n the json result from the OpenML server if the call was successful.\n An exception otherwise.\n \"\"\"\n \n @_retry_with_clean_cache(url, data_home)\n def _load_json():\n with closing(_open_openml_url(url, data_home)) as response:\n return json.loads(response.read().decode('utf-8'))\n try:\n return _load_json()\n except HTTPError as error:\n if error.code != 412:\n raise error\n raise OpenMLError(error_message)" }, { @@ -44261,7 +45926,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "data_home", @@ -44271,13 +45937,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _get_local_path(openml_path: str, data_home: str) -> str:\n return os.path.join(data_home, 'openml.org', openml_path + '.gz')" }, { @@ -44295,13 +45962,14 @@ "docstring": { "type": "list of dict", "description": "Used to retrieve the number of instances (samples) in the dataset." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Get the number of samples from data qualities.", - "docstring": "Get the number of samples from data qualities.\n\nParameters\n----------\ndata_qualities : list of dict\n Used to retrieve the number of instances (samples) in the dataset.\n\nReturns\n-------\nn_samples : int\n The number of samples in the dataset or -1 if data qualities are\n unavailable.", + "docstring": "Get the number of samples from data qualities.\n\n Parameters\n ----------\n data_qualities : list of dict\n Used to retrieve the number of instances (samples) in the dataset.\n\n Returns\n -------\n n_samples : int\n The number of samples in the dataset or -1 if data qualities are\n unavailable.\n ", "source_code": "\ndef _get_num_samples(data_qualities: OpenmlQualitiesType) -> int:\n \"\"\"Get the number of samples from data qualities.\n\n Parameters\n ----------\n data_qualities : list of dict\n Used to retrieve the number of instances (samples) in the dataset.\n\n Returns\n -------\n n_samples : int\n The number of samples in the dataset or -1 if data qualities are\n unavailable.\n \"\"\"\n default_n_samples = -1\n qualities = {d['name']: d['value'] for d in data_qualities}\n return int(float(qualities.get('NumberOfInstances', default_n_samples)))" }, { @@ -44319,7 +45987,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "data_home", @@ -44329,7 +45998,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "return_type", @@ -44339,7 +46009,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "encode_nominal", @@ -44349,7 +46020,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "parse_arff", @@ -44359,7 +46031,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "md5_checksum", @@ -44369,7 +46042,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -44393,7 +46067,8 @@ 
"docstring": { "type": "str", "description": "OpenML URL that will be accessed. This will be prefixes with\n_OPENML_PREFIX" - } + }, + "refined_type": {} }, { "name": "data_home", @@ -44403,14 +46078,15 @@ "docstring": { "type": "str", "description": "Directory to which the files will be cached. If None, no caching will\nbe applied." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Returns a resource from OpenML.org. Caches it to data_home if required.", - "docstring": "Returns a resource from OpenML.org. Caches it to data_home if required.\n\nParameters\n----------\nopenml_path : str\n OpenML URL that will be accessed. This will be prefixes with\n _OPENML_PREFIX\n\ndata_home : str\n Directory to which the files will be cached. If None, no caching will\n be applied.\n\nReturns\n-------\nresult : stream\n A stream to the OpenML resource", - "source_code": "\ndef _open_openml_url(openml_path: str, data_home: Optional[str]):\n \"\"\"\n Returns a resource from OpenML.org. Caches it to data_home if required.\n\n Parameters\n ----------\n openml_path : str\n OpenML URL that will be accessed. This will be prefixes with\n _OPENML_PREFIX\n\n data_home : str\n Directory to which the files will be cached. If None, no caching will\n be applied.\n\n Returns\n -------\n result : stream\n A stream to the OpenML resource\n \"\"\"\n \n def is_gzip_encoded(_fsrc):\n return _fsrc.info().get('Content-Encoding', '') == 'gzip'\n req = Request(_OPENML_PREFIX + openml_path)\n req.add_header('Accept-encoding', 'gzip')\n if data_home is None:\n fsrc = urlopen(req)\n if is_gzip_encoded(fsrc):\n return gzip.GzipFile(fileobj=fsrc, mode='rb')\n return fsrc\n local_path = _get_local_path(openml_path, data_home)\n if not os.path.exists(local_path):\n try:\n os.makedirs(os.path.dirname(local_path))\n except OSError:\n pass\n try:\n with closing(urlopen(req)) as fsrc:\n opener: Callable\n if is_gzip_encoded(fsrc):\n opener = open\n else:\n opener = gzip.GzipFile\n with opener(local_path, 'wb') as fdst:\n shutil.copyfileobj(fsrc, fdst)\n except Exception:\n if os.path.exists(local_path):\n os.unlink(local_path)\n raise\n return gzip.GzipFile(local_path, 'rb')" + "docstring": "\n Returns a resource from OpenML.org. Caches it to data_home if required.\n\n Parameters\n ----------\n openml_path : str\n OpenML URL that will be accessed. This will be prefixes with\n _OPENML_PREFIX\n\n data_home : str\n Directory to which the files will be cached. If None, no caching will\n be applied.\n\n Returns\n -------\n result : stream\n A stream to the OpenML resource\n ", + "source_code": "\ndef _open_openml_url(openml_path: str, data_home: Optional[str]):\n \"\"\"\n Returns a resource from OpenML.org. Caches it to data_home if required.\n\n Parameters\n ----------\n openml_path : str\n OpenML URL that will be accessed. This will be prefixes with\n _OPENML_PREFIX\n\n data_home : str\n Directory to which the files will be cached. 
If None, no caching will\n be applied.\n\n Returns\n -------\n result : stream\n A stream to the OpenML resource\n \"\"\"\n \n def is_gzip_encoded(_fsrc):\n return _fsrc.info().get('Content-Encoding', '') == 'gzip'\n req = Request(_OPENML_PREFIX + openml_path)\n req.add_header('Accept-encoding', 'gzip')\n if data_home is None:\n fsrc = urlopen(req)\n if is_gzip_encoded(fsrc):\n return gzip.GzipFile(fileobj=fsrc, mode='rb')\n return fsrc\n local_path = _get_local_path(openml_path, data_home)\n (dir_name, file_name) = os.path.split(local_path)\n if not os.path.exists(local_path):\n os.makedirs(dir_name, exist_ok=True)\n try:\n with TemporaryDirectory(dir=dir_name) as tmpdir:\n with closing(urlopen(req)) as fsrc:\n opener: Callable\n if is_gzip_encoded(fsrc):\n opener = open\n else:\n opener = gzip.GzipFile\n with opener(os.path.join(tmpdir, file_name), 'wb') as fdst:\n shutil.copyfileobj(fsrc, fdst)\n shutil.move(fdst.name, local_path)\n except Exception:\n if os.path.exists(local_path):\n os.unlink(local_path)\n raise\n return gzip.GzipFile(local_path, 'rb')" }, { "name": "_retry_with_clean_cache", @@ -44427,7 +46103,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "data_home", @@ -44437,13 +46114,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "If the first call to the decorated function fails, the local cached file is removed, and the function is called again. If ``data_home`` is ``None``, then the function is called once.", - "docstring": "If the first call to the decorated function fails, the local cached\nfile is removed, and the function is called again. If ``data_home`` is\n``None``, then the function is called once.", + "description": "If the first call to the decorated function fails, the local cached\nfile is removed, and the function is called again. If ``data_home`` is\n``None``, then the function is called once.", + "docstring": "If the first call to the decorated function fails, the local cached\n file is removed, and the function is called again. If ``data_home`` is\n ``None``, then the function is called once.\n ", "source_code": "\ndef _retry_with_clean_cache(openml_path: str, data_home: Optional[str]) -> Callable:\n \"\"\"If the first call to the decorated function fails, the local cached\n file is removed, and the function is called again. 
If ``data_home`` is\n ``None``, then the function is called once.\n \"\"\"\n \n def decorator(f):\n \n @wraps(f)\n def wrapper(*args, **kw):\n if data_home is None:\n return f(*args, **kw)\n try:\n return f(*args, **kw)\n except HTTPError:\n raise\n except Exception:\n warn('Invalid cache, redownloading file', RuntimeWarning)\n local_path = _get_local_path(openml_path, data_home)\n if os.path.exists(local_path):\n os.unlink(local_path)\n return f(*args, **kw)\n return wrapper\n return decorator" }, { @@ -44461,7 +46139,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "include_columns", @@ -44471,13 +46150,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _sparse_data_to_array(arff_data: ArffSparseDataType, include_columns: List) -> np.ndarray:\n num_obs = max(arff_data[1]) + 1\n y_shape = (num_obs, len(include_columns))\n reindexed_columns = {column_idx: array_idx for (array_idx, column_idx) in enumerate(include_columns)}\n y = np.empty(y_shape, dtype=np.float64)\n for (val, row_idx, col_idx) in zip(arff_data[0], arff_data[1], arff_data[2]):\n if col_idx in include_columns:\n y[row_idx, reindexed_columns[col_idx]] = val\n return y" }, { @@ -44495,7 +46175,8 @@ "docstring": { "type": "tuple", "description": "A tuple of three lists of equal size; first list indicating the value,\nsecond the x coordinate and the third the y coordinate." - } + }, + "refined_type": {} }, { "name": "include_columns", @@ -44505,13 +46186,14 @@ "docstring": { "type": "list", "description": "A list of columns to include." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "obtains several columns from sparse arff representation. Additionally, the column indices are re-labelled, given the columns that are not included. (e.g., when including [1, 2, 3], the columns will be relabelled to [0, 1, 2])", - "docstring": "obtains several columns from sparse arff representation. Additionally, the\ncolumn indices are re-labelled, given the columns that are not included.\n(e.g., when including [1, 2, 3], the columns will be relabelled to\n[0, 1, 2])\n\nParameters\n----------\narff_data : tuple\n A tuple of three lists of equal size; first list indicating the value,\n second the x coordinate and the third the y coordinate.\n\ninclude_columns : list\n A list of columns to include.\n\nReturns\n-------\narff_data_new : tuple\n Subset of arff data with only the include columns indicated by the\n include_columns argument.", + "description": "obtains several columns from sparse arff representation. Additionally, the\ncolumn indices are re-labelled, given the columns that are not included.\n(e.g., when including [1, 2, 3], the columns will be relabelled to\n[0, 1, 2])", + "docstring": "\n obtains several columns from sparse arff representation. 
Additionally, the\n column indices are re-labelled, given the columns that are not included.\n (e.g., when including [1, 2, 3], the columns will be relabelled to\n [0, 1, 2])\n\n Parameters\n ----------\n arff_data : tuple\n A tuple of three lists of equal size; first list indicating the value,\n second the x coordinate and the third the y coordinate.\n\n include_columns : list\n A list of columns to include.\n\n Returns\n -------\n arff_data_new : tuple\n Subset of arff data with only the include columns indicated by the\n include_columns argument.\n ", "source_code": "\ndef _split_sparse_columns(arff_data: ArffSparseDataType, include_columns: List) -> ArffSparseDataType:\n \"\"\"\n obtains several columns from sparse arff representation. Additionally, the\n column indices are re-labelled, given the columns that are not included.\n (e.g., when including [1, 2, 3], the columns will be relabelled to\n [0, 1, 2])\n\n Parameters\n ----------\n arff_data : tuple\n A tuple of three lists of equal size; first list indicating the value,\n second the x coordinate and the third the y coordinate.\n\n include_columns : list\n A list of columns to include.\n\n Returns\n -------\n arff_data_new : tuple\n Subset of arff data with only the include columns indicated by the\n include_columns argument.\n \"\"\"\n arff_data_new: ArffSparseDataType = (list(), list(), list())\n reindexed_columns = {column_idx: array_idx for (array_idx, column_idx) in enumerate(include_columns)}\n for (val, row_idx, col_idx) in zip(arff_data[0], arff_data[1], arff_data[2]):\n if col_idx in include_columns:\n arff_data_new[0].append(val)\n arff_data_new[1].append(row_idx)\n arff_data_new[2].append(reindexed_columns[col_idx])\n return arff_data_new" }, { @@ -44529,7 +46211,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "target_columns", @@ -44539,13 +46222,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _valid_data_column_names(features_list, target_columns):\n valid_data_column_names = []\n for feature in features_list:\n if feature['name'] not in target_columns and feature['is_ignore'] != 'true' and feature['is_row_identifier'] != 'true':\n valid_data_column_names.append(feature['name'])\n return valid_data_column_names" }, { @@ -44563,7 +46247,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "target_columns", @@ -44573,13 +46258,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _verify_target_data_type(features_dict, target_columns):\n if not isinstance(target_columns, list):\n raise ValueError('target_column should be list, got: %s' % type(target_columns))\n found_types = set()\n for target_column in target_columns:\n if target_column not in features_dict:\n raise KeyError('Could not find target_column={}')\n if features_dict[target_column]['data_type'] == 'numeric':\n found_types.add(np.float64)\n else:\n found_types.add(object)\n if features_dict[target_column]['is_ignore'] == 'true':\n warn('target_column={} has flag is_ignore.'.format(target_column))\n if features_dict[target_column]['is_row_identifier'] == 'true':\n warn('target_column={} has flag is_row_identifier.'.format(target_column))\n if len(found_types) > 1:\n raise 
ValueError('Can only handle homogeneous multi-target datasets, i.e., all targets are either numeric or categorical.')" }, { @@ -44597,7 +46283,8 @@ "docstring": { "type": "str, default=None", "description": "String identifier of the dataset. Note that OpenML can have multiple\ndatasets with the same name." - } + }, + "refined_type": {} }, { "name": "version", @@ -44607,7 +46294,8 @@ "docstring": { "type": "int or 'active', default='active'", "description": "Version of the dataset. Can only be provided if also ``name`` is given.\nIf 'active' the oldest version that's still active is used. Since\nthere may be more than one active version of a dataset, and those\nversions may fundamentally be different from one another, setting an\nexact version is highly recommended." - } + }, + "refined_type": {} }, { "name": "data_id", @@ -44617,7 +46305,8 @@ "docstring": { "type": "int, default=None", "description": "OpenML ID of the dataset. The most specific way of retrieving a\ndataset. If data_id is not given, name (and potential version) are\nused to obtain a dataset." - } + }, + "refined_type": {} }, { "name": "data_home", @@ -44627,7 +46316,8 @@ "docstring": { "type": "str, default=None", "description": "Specify another download and cache folder for the data sets. By default\nall scikit-learn data is stored in '~/scikit_learn_data' subfolders." - } + }, + "refined_type": {} }, { "name": "target_column", @@ -44637,7 +46327,8 @@ "docstring": { "type": "str, list or None, default='default-target'", "description": "Specify the column name in the data to use as target. If\n'default-target', the standard target column a stored on the server\nis used. If ``None``, all columns are returned as data and the\ntarget is ``None``. If list (of strings), all columns with these names\nare returned as multi-target (Note: not all scikit-learn classifiers\ncan handle all types of multi-output combinations)" - } + }, + "refined_type": {} }, { "name": "cache", @@ -44647,7 +46338,8 @@ "docstring": { "type": "bool, default=True", "description": "Whether to cache downloaded datasets using joblib." - } + }, + "refined_type": {} }, { "name": "return_X_y", @@ -44657,7 +46349,8 @@ "docstring": { "type": "bool, default=False", "description": "If True, returns ``(data, target)`` instead of a Bunch object. See\nbelow for more information about the `data` and `target` objects." - } + }, + "refined_type": {} }, { "name": "as_frame", @@ -44667,13 +46360,14 @@ "docstring": { "type": "bool or 'auto', default='auto'", "description": "If True, the data is a pandas DataFrame including columns with\nappropriate dtypes (numeric, string or categorical). The target is\na pandas DataFrame or Series depending on the number of target_columns.\nThe Bunch will contain a ``frame`` attribute with the target and the\ndata. If ``return_X_y`` is True, then ``(data, target)`` will be pandas\nDataFrames or Series as describe above.\n\nIf as_frame is 'auto', the data and target will be converted to\nDataFrame or Series as if as_frame is set to True, unless the dataset\nis stored in sparse format.\n\n.. versionchanged:: 0.24\n The default value of `as_frame` changed from `False` to `'auto'`\n in 0.24." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Fetch dataset from openml by name or dataset id.\n\nDatasets are uniquely identified by either an integer ID or by a combination of name and version (i.e. there might be multiple versions of the 'iris' dataset). Please give either name or data_id (not both). 
In case a name is given, a version can also be provided. Read more in the :ref:`User Guide `. .. versionadded:: 0.20 .. note:: EXPERIMENTAL The API is experimental (particularly the return value structure), and might have small backward-incompatible changes without notice or warning in future releases.", - "docstring": "Fetch dataset from openml by name or dataset id.\n\nDatasets are uniquely identified by either an integer ID or by a\ncombination of name and version (i.e. there might be multiple\nversions of the 'iris' dataset). Please give either name or data_id\n(not both). In case a name is given, a version can also be\nprovided.\n\nRead more in the :ref:`User Guide `.\n\n.. versionadded:: 0.20\n\n.. note:: EXPERIMENTAL\n\n The API is experimental (particularly the return value structure),\n and might have small backward-incompatible changes without notice\n or warning in future releases.\n\nParameters\n----------\nname : str, default=None\n String identifier of the dataset. Note that OpenML can have multiple\n datasets with the same name.\n\nversion : int or 'active', default='active'\n Version of the dataset. Can only be provided if also ``name`` is given.\n If 'active' the oldest version that's still active is used. Since\n there may be more than one active version of a dataset, and those\n versions may fundamentally be different from one another, setting an\n exact version is highly recommended.\n\ndata_id : int, default=None\n OpenML ID of the dataset. The most specific way of retrieving a\n dataset. If data_id is not given, name (and potential version) are\n used to obtain a dataset.\n\ndata_home : str, default=None\n Specify another download and cache folder for the data sets. By default\n all scikit-learn data is stored in '~/scikit_learn_data' subfolders.\n\ntarget_column : str, list or None, default='default-target'\n Specify the column name in the data to use as target. If\n 'default-target', the standard target column a stored on the server\n is used. If ``None``, all columns are returned as data and the\n target is ``None``. If list (of strings), all columns with these names\n are returned as multi-target (Note: not all scikit-learn classifiers\n can handle all types of multi-output combinations)\n\ncache : bool, default=True\n Whether to cache downloaded datasets using joblib.\n\nreturn_X_y : bool, default=False\n If True, returns ``(data, target)`` instead of a Bunch object. See\n below for more information about the `data` and `target` objects.\n\nas_frame : bool or 'auto', default='auto'\n If True, the data is a pandas DataFrame including columns with\n appropriate dtypes (numeric, string or categorical). The target is\n a pandas DataFrame or Series depending on the number of target_columns.\n The Bunch will contain a ``frame`` attribute with the target and the\n data. If ``return_X_y`` is True, then ``(data, target)`` will be pandas\n DataFrames or Series as describe above.\n\n If as_frame is 'auto', the data and target will be converted to\n DataFrame or Series as if as_frame is set to True, unless the dataset\n is stored in sparse format.\n\n .. versionchanged:: 0.24\n The default value of `as_frame` changed from `False` to `'auto'`\n in 0.24.\n\nReturns\n-------\n\ndata : :class:`~sklearn.utils.Bunch`\n Dictionary-like object, with the following attributes.\n\n data : np.array, scipy.sparse.csr_matrix of floats, or pandas DataFrame\n The feature matrix. 
Categorical features are encoded as ordinals.\n target : np.array, pandas Series or DataFrame\n The regression target or classification labels, if applicable.\n Dtype is float if numeric, and object if categorical. If\n ``as_frame`` is True, ``target`` is a pandas object.\n DESCR : str\n The full description of the dataset\n feature_names : list\n The names of the dataset columns\n target_names: list\n The names of the target columns\n\n .. versionadded:: 0.22\n\n categories : dict or None\n Maps each categorical feature name to a list of values, such\n that the value encoded as i is ith in the list. If ``as_frame``\n is True, this is None.\n details : dict\n More metadata from OpenML\n frame : pandas DataFrame\n Only present when `as_frame=True`. DataFrame with ``data`` and\n ``target``.\n\n(data, target) : tuple if ``return_X_y`` is True\n\n .. note:: EXPERIMENTAL\n\n This interface is **experimental** and subsequent releases may\n change attributes without notice (although there should only be\n minor changes to ``data`` and ``target``).\n\n Missing values in the 'data' are represented as NaN's. Missing values\n in 'target' are represented as NaN's (numerical target) or None\n (categorical target)", + "description": "Fetch dataset from openml by name or dataset id.\n\nDatasets are uniquely identified by either an integer ID or by a\ncombination of name and version (i.e. there might be multiple\nversions of the 'iris' dataset). Please give either name or data_id\n(not both). In case a name is given, a version can also be\nprovided.\n\nRead more in the :ref:`User Guide `.\n\n.. versionadded:: 0.20\n\n.. note:: EXPERIMENTAL\n\n The API is experimental (particularly the return value structure),\n and might have small backward-incompatible changes without notice\n or warning in future releases.", + "docstring": "Fetch dataset from openml by name or dataset id.\n\n Datasets are uniquely identified by either an integer ID or by a\n combination of name and version (i.e. there might be multiple\n versions of the 'iris' dataset). Please give either name or data_id\n (not both). In case a name is given, a version can also be\n provided.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.20\n\n .. note:: EXPERIMENTAL\n\n The API is experimental (particularly the return value structure),\n and might have small backward-incompatible changes without notice\n or warning in future releases.\n\n Parameters\n ----------\n name : str, default=None\n String identifier of the dataset. Note that OpenML can have multiple\n datasets with the same name.\n\n version : int or 'active', default='active'\n Version of the dataset. Can only be provided if also ``name`` is given.\n If 'active' the oldest version that's still active is used. Since\n there may be more than one active version of a dataset, and those\n versions may fundamentally be different from one another, setting an\n exact version is highly recommended.\n\n data_id : int, default=None\n OpenML ID of the dataset. The most specific way of retrieving a\n dataset. If data_id is not given, name (and potential version) are\n used to obtain a dataset.\n\n data_home : str, default=None\n Specify another download and cache folder for the data sets. By default\n all scikit-learn data is stored in '~/scikit_learn_data' subfolders.\n\n target_column : str, list or None, default='default-target'\n Specify the column name in the data to use as target. If\n 'default-target', the standard target column a stored on the server\n is used. 
If ``None``, all columns are returned as data and the\n target is ``None``. If list (of strings), all columns with these names\n are returned as multi-target (Note: not all scikit-learn classifiers\n can handle all types of multi-output combinations)\n\n cache : bool, default=True\n Whether to cache downloaded datasets using joblib.\n\n return_X_y : bool, default=False\n If True, returns ``(data, target)`` instead of a Bunch object. See\n below for more information about the `data` and `target` objects.\n\n as_frame : bool or 'auto', default='auto'\n If True, the data is a pandas DataFrame including columns with\n appropriate dtypes (numeric, string or categorical). The target is\n a pandas DataFrame or Series depending on the number of target_columns.\n The Bunch will contain a ``frame`` attribute with the target and the\n data. If ``return_X_y`` is True, then ``(data, target)`` will be pandas\n DataFrames or Series as describe above.\n\n If as_frame is 'auto', the data and target will be converted to\n DataFrame or Series as if as_frame is set to True, unless the dataset\n is stored in sparse format.\n\n .. versionchanged:: 0.24\n The default value of `as_frame` changed from `False` to `'auto'`\n in 0.24.\n\n Returns\n -------\n\n data : :class:`~sklearn.utils.Bunch`\n Dictionary-like object, with the following attributes.\n\n data : np.array, scipy.sparse.csr_matrix of floats, or pandas DataFrame\n The feature matrix. Categorical features are encoded as ordinals.\n target : np.array, pandas Series or DataFrame\n The regression target or classification labels, if applicable.\n Dtype is float if numeric, and object if categorical. If\n ``as_frame`` is True, ``target`` is a pandas object.\n DESCR : str\n The full description of the dataset\n feature_names : list\n The names of the dataset columns\n target_names: list\n The names of the target columns\n\n .. versionadded:: 0.22\n\n categories : dict or None\n Maps each categorical feature name to a list of values, such\n that the value encoded as i is ith in the list. If ``as_frame``\n is True, this is None.\n details : dict\n More metadata from OpenML\n frame : pandas DataFrame\n Only present when `as_frame=True`. DataFrame with ``data`` and\n ``target``.\n\n (data, target) : tuple if ``return_X_y`` is True\n\n .. note:: EXPERIMENTAL\n\n This interface is **experimental** and subsequent releases may\n change attributes without notice (although there should only be\n minor changes to ``data`` and ``target``).\n\n Missing values in the 'data' are represented as NaN's. Missing values\n in 'target' are represented as NaN's (numerical target) or None\n (categorical target)\n ", "source_code": "\ndef fetch_openml(name: Optional[str] = None, *, version: Union[str, int] = 'active', data_id: Optional[int] = None, data_home: Optional[str] = None, target_column: Optional[Union[str, List]] = 'default-target', cache: bool = True, return_X_y: bool = False, as_frame: Union[str, bool] = 'auto'):\n \"\"\"Fetch dataset from openml by name or dataset id.\n\n Datasets are uniquely identified by either an integer ID or by a\n combination of name and version (i.e. there might be multiple\n versions of the 'iris' dataset). Please give either name or data_id\n (not both). In case a name is given, a version can also be\n provided.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.20\n\n .. 
note:: EXPERIMENTAL\n\n The API is experimental (particularly the return value structure),\n and might have small backward-incompatible changes without notice\n or warning in future releases.\n\n Parameters\n ----------\n name : str, default=None\n String identifier of the dataset. Note that OpenML can have multiple\n datasets with the same name.\n\n version : int or 'active', default='active'\n Version of the dataset. Can only be provided if also ``name`` is given.\n If 'active' the oldest version that's still active is used. Since\n there may be more than one active version of a dataset, and those\n versions may fundamentally be different from one another, setting an\n exact version is highly recommended.\n\n data_id : int, default=None\n OpenML ID of the dataset. The most specific way of retrieving a\n dataset. If data_id is not given, name (and potential version) are\n used to obtain a dataset.\n\n data_home : str, default=None\n Specify another download and cache folder for the data sets. By default\n all scikit-learn data is stored in '~/scikit_learn_data' subfolders.\n\n target_column : str, list or None, default='default-target'\n Specify the column name in the data to use as target. If\n 'default-target', the standard target column a stored on the server\n is used. If ``None``, all columns are returned as data and the\n target is ``None``. If list (of strings), all columns with these names\n are returned as multi-target (Note: not all scikit-learn classifiers\n can handle all types of multi-output combinations)\n\n cache : bool, default=True\n Whether to cache downloaded datasets using joblib.\n\n return_X_y : bool, default=False\n If True, returns ``(data, target)`` instead of a Bunch object. See\n below for more information about the `data` and `target` objects.\n\n as_frame : bool or 'auto', default='auto'\n If True, the data is a pandas DataFrame including columns with\n appropriate dtypes (numeric, string or categorical). The target is\n a pandas DataFrame or Series depending on the number of target_columns.\n The Bunch will contain a ``frame`` attribute with the target and the\n data. If ``return_X_y`` is True, then ``(data, target)`` will be pandas\n DataFrames or Series as describe above.\n\n If as_frame is 'auto', the data and target will be converted to\n DataFrame or Series as if as_frame is set to True, unless the dataset\n is stored in sparse format.\n\n .. versionchanged:: 0.24\n The default value of `as_frame` changed from `False` to `'auto'`\n in 0.24.\n\n Returns\n -------\n\n data : :class:`~sklearn.utils.Bunch`\n Dictionary-like object, with the following attributes.\n\n data : np.array, scipy.sparse.csr_matrix of floats, or pandas DataFrame\n The feature matrix. Categorical features are encoded as ordinals.\n target : np.array, pandas Series or DataFrame\n The regression target or classification labels, if applicable.\n Dtype is float if numeric, and object if categorical. If\n ``as_frame`` is True, ``target`` is a pandas object.\n DESCR : str\n The full description of the dataset\n feature_names : list\n The names of the dataset columns\n target_names: list\n The names of the target columns\n\n .. versionadded:: 0.22\n\n categories : dict or None\n Maps each categorical feature name to a list of values, such\n that the value encoded as i is ith in the list. If ``as_frame``\n is True, this is None.\n details : dict\n More metadata from OpenML\n frame : pandas DataFrame\n Only present when `as_frame=True`. 
DataFrame with ``data`` and\n ``target``.\n\n (data, target) : tuple if ``return_X_y`` is True\n\n .. note:: EXPERIMENTAL\n\n This interface is **experimental** and subsequent releases may\n change attributes without notice (although there should only be\n minor changes to ``data`` and ``target``).\n\n Missing values in the 'data' are represented as NaN's. Missing values\n in 'target' are represented as NaN's (numerical target) or None\n (categorical target)\n \"\"\"\n if cache is False:\n data_home = None\n else:\n data_home = get_data_home(data_home=data_home)\n data_home = join(data_home, 'openml')\n if name is not None:\n name = name.lower()\n if data_id is not None:\n raise ValueError('Dataset data_id={} and name={} passed, but you can only specify a numeric data_id or a name, not both.'.format(data_id, name))\n data_info = _get_data_info_by_name(name, version, data_home)\n data_id = data_info['did']\n elif data_id is not None:\n if version != 'active':\n raise ValueError('Dataset data_id={} and version={} passed, but you can only specify a numeric data_id or a version, not both.'.format(data_id, version))\n else:\n raise ValueError('Neither name nor data_id are provided. Please provide name or data_id.')\n data_description = _get_data_description_by_id(data_id, data_home)\n if data_description['status'] != 'active':\n warn('Version {} of dataset {} is inactive, meaning that issues have been found in the dataset. Try using a newer version from this URL: {}'.format(data_description['version'], data_description['name'], data_description['url']))\n if 'error' in data_description:\n warn('OpenML registered a problem with the dataset. It might be unusable. Error: {}'.format(data_description['error']))\n if 'warning' in data_description:\n warn('OpenML raised a warning on the dataset. It might be unusable. Warning: {}'.format(data_description['warning']))\n return_sparse = False\n if data_description['format'].lower() == 'sparse_arff':\n return_sparse = True\n if as_frame == 'auto':\n as_frame = not return_sparse\n if as_frame and return_sparse:\n raise ValueError('Cannot return dataframe with sparse data')\n features_list = _get_data_features(data_id, data_home)\n if not as_frame:\n for feature in features_list:\n if 'true' in (feature['is_ignore'], feature['is_row_identifier']):\n continue\n if feature['data_type'] == 'string':\n raise ValueError('STRING attributes are not supported for array representation. Try as_frame=True')\n if target_column == 'default-target':\n target_columns = [feature['name'] for feature in features_list if feature['is_target'] == 'true']\n elif isinstance(target_column, str):\n target_columns = [target_column]\n elif target_column is None:\n target_columns = []\n elif isinstance(target_column, list):\n target_columns = target_column\n else:\n raise TypeError('Did not recognize type of target_columnShould be str, list or None. 
Got: {}'.format(type(target_column)))\n data_columns = _valid_data_column_names(features_list, target_columns)\n shape: Optional[Tuple[int, int]]\n if not return_sparse:\n data_qualities = _get_data_qualities(data_id, data_home)\n shape = (_get_num_samples(data_qualities), len(features_list))\n else:\n shape = None\n url = _DATA_FILE.format(data_description['file_id'])\n bunch = _download_data_to_bunch(url, return_sparse, data_home, as_frame=bool(as_frame), features_list=features_list, shape=shape, target_columns=target_columns, data_columns=data_columns, md5_checksum=data_description['md5_checksum'])\n if return_X_y:\n return bunch.data, bunch.target\n description = '{}\\n\\nDownloaded from openml.org.'.format(data_description.pop('description'))\n bunch.update(DESCR=description, details=data_description, url='https://www.openml.org/d/{}'.format(data_id))\n return bunch" }, { @@ -44691,7 +46385,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "b", @@ -44701,7 +46396,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -44725,7 +46421,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -44749,7 +46446,8 @@ "docstring": { "type": "str, default=None", "description": "Specify another download and cache folder for the datasets. By default\nall scikit-learn data is stored in '~/scikit_learn_data' subfolders." - } + }, + "refined_type": {} }, { "name": "subset", @@ -44759,6 +46457,10 @@ "docstring": { "type": "{'train', 'test', 'all'}, default='all'", "description": "Select the dataset to load: 'train' for the training set\n(23149 samples), 'test' for the test set (781265 samples),\n'all' for both, with the training samples first if shuffle is False.\nThis follows the official LYRL2004 chronological split." + }, + "refined_type": { + "kind": "EnumType", + "values": ["train", "all", "test"] } }, { @@ -44769,7 +46471,8 @@ "docstring": { "type": "bool, default=True", "description": "If False, raise a IOError if the data is not locally available\ninstead of trying to download the data from the source site." - } + }, + "refined_type": {} }, { "name": "random_state", @@ -44779,7 +46482,8 @@ "docstring": { "type": "int, RandomState instance or None, default=None", "description": "Determines random number generation for dataset shuffling. Pass an int\nfor reproducible output across multiple function calls.\nSee :term:`Glossary `." - } + }, + "refined_type": {} }, { "name": "shuffle", @@ -44789,7 +46493,8 @@ "docstring": { "type": "bool, default=False", "description": "Whether to shuffle dataset." - } + }, + "refined_type": {} }, { "name": "return_X_y", @@ -44799,13 +46504,14 @@ "docstring": { "type": "bool, default=False", "description": "If True, returns ``(dataset.data, dataset.target)`` instead of a Bunch\nobject. See below for more information about the `dataset.data` and\n`dataset.target` object.\n\n.. versionadded:: 0.20" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Load the RCV1 multilabel dataset (classification).\n\nDownload it if necessary. Version: RCV1-v2, vectors, full sets, topics multilabels. ================= ===================== Classes 103 Samples total 804414 Dimensionality 47236 Features real, between 0 and 1 ================= ===================== Read more in the :ref:`User Guide `. .. 
versionadded:: 0.17", - "docstring": "Load the RCV1 multilabel dataset (classification).\n\nDownload it if necessary.\n\nVersion: RCV1-v2, vectors, full sets, topics multilabels.\n\n================= =====================\nClasses 103\nSamples total 804414\nDimensionality 47236\nFeatures real, between 0 and 1\n================= =====================\n\nRead more in the :ref:`User Guide `.\n\n.. versionadded:: 0.17\n\nParameters\n----------\ndata_home : str, default=None\n Specify another download and cache folder for the datasets. By default\n all scikit-learn data is stored in '~/scikit_learn_data' subfolders.\n\nsubset : {'train', 'test', 'all'}, default='all'\n Select the dataset to load: 'train' for the training set\n (23149 samples), 'test' for the test set (781265 samples),\n 'all' for both, with the training samples first if shuffle is False.\n This follows the official LYRL2004 chronological split.\n\ndownload_if_missing : bool, default=True\n If False, raise a IOError if the data is not locally available\n instead of trying to download the data from the source site.\n\nrandom_state : int, RandomState instance or None, default=None\n Determines random number generation for dataset shuffling. Pass an int\n for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\nshuffle : bool, default=False\n Whether to shuffle dataset.\n\nreturn_X_y : bool, default=False\n If True, returns ``(dataset.data, dataset.target)`` instead of a Bunch\n object. See below for more information about the `dataset.data` and\n `dataset.target` object.\n\n .. versionadded:: 0.20\n\nReturns\n-------\ndataset : :class:`~sklearn.utils.Bunch`\n Dictionary-like object, with the following attributes.\n\n data : sparse matrix of shape (804414, 47236), dtype=np.float64\n The array has 0.16% of non zero values. Will be of CSR format.\n target : sparse matrix of shape (804414, 103), dtype=np.uint8\n Each sample has a value of 1 in its categories, and 0 in others.\n The array has 3.15% of non zero values. Will be of CSR format.\n sample_id : ndarray of shape (804414,), dtype=np.uint32,\n Identification number of each sample, as ordered in dataset.data.\n target_names : ndarray of shape (103,), dtype=object\n Names of each target (RCV1 topics), as ordered in dataset.target.\n DESCR : str\n Description of the RCV1 dataset.\n\n(data, target) : tuple if ``return_X_y`` is True\n\n .. versionadded:: 0.20", + "description": "Load the RCV1 multilabel dataset (classification).\n\nDownload it if necessary.\n\nVersion: RCV1-v2, vectors, full sets, topics multilabels.\n\n================= =====================\nClasses 103\nSamples total 804414\nDimensionality 47236\nFeatures real, between 0 and 1\n================= =====================\n\nRead more in the :ref:`User Guide `.\n\n.. versionadded:: 0.17", + "docstring": "Load the RCV1 multilabel dataset (classification).\n\n Download it if necessary.\n\n Version: RCV1-v2, vectors, full sets, topics multilabels.\n\n ================= =====================\n Classes 103\n Samples total 804414\n Dimensionality 47236\n Features real, between 0 and 1\n ================= =====================\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.17\n\n Parameters\n ----------\n data_home : str, default=None\n Specify another download and cache folder for the datasets. 
By default\n all scikit-learn data is stored in '~/scikit_learn_data' subfolders.\n\n subset : {'train', 'test', 'all'}, default='all'\n Select the dataset to load: 'train' for the training set\n (23149 samples), 'test' for the test set (781265 samples),\n 'all' for both, with the training samples first if shuffle is False.\n This follows the official LYRL2004 chronological split.\n\n download_if_missing : bool, default=True\n If False, raise a IOError if the data is not locally available\n instead of trying to download the data from the source site.\n\n random_state : int, RandomState instance or None, default=None\n Determines random number generation for dataset shuffling. Pass an int\n for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n shuffle : bool, default=False\n Whether to shuffle dataset.\n\n return_X_y : bool, default=False\n If True, returns ``(dataset.data, dataset.target)`` instead of a Bunch\n object. See below for more information about the `dataset.data` and\n `dataset.target` object.\n\n .. versionadded:: 0.20\n\n Returns\n -------\n dataset : :class:`~sklearn.utils.Bunch`\n Dictionary-like object, with the following attributes.\n\n data : sparse matrix of shape (804414, 47236), dtype=np.float64\n The array has 0.16% of non zero values. Will be of CSR format.\n target : sparse matrix of shape (804414, 103), dtype=np.uint8\n Each sample has a value of 1 in its categories, and 0 in others.\n The array has 3.15% of non zero values. Will be of CSR format.\n sample_id : ndarray of shape (804414,), dtype=np.uint32,\n Identification number of each sample, as ordered in dataset.data.\n target_names : ndarray of shape (103,), dtype=object\n Names of each target (RCV1 topics), as ordered in dataset.target.\n DESCR : str\n Description of the RCV1 dataset.\n\n (data, target) : tuple if ``return_X_y`` is True\n\n .. versionadded:: 0.20\n ", "source_code": "\ndef fetch_rcv1(*, data_home=None, subset='all', download_if_missing=True, random_state=None, shuffle=False, return_X_y=False):\n \"\"\"Load the RCV1 multilabel dataset (classification).\n\n Download it if necessary.\n\n Version: RCV1-v2, vectors, full sets, topics multilabels.\n\n ================= =====================\n Classes 103\n Samples total 804414\n Dimensionality 47236\n Features real, between 0 and 1\n ================= =====================\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.17\n\n Parameters\n ----------\n data_home : str, default=None\n Specify another download and cache folder for the datasets. By default\n all scikit-learn data is stored in '~/scikit_learn_data' subfolders.\n\n subset : {'train', 'test', 'all'}, default='all'\n Select the dataset to load: 'train' for the training set\n (23149 samples), 'test' for the test set (781265 samples),\n 'all' for both, with the training samples first if shuffle is False.\n This follows the official LYRL2004 chronological split.\n\n download_if_missing : bool, default=True\n If False, raise a IOError if the data is not locally available\n instead of trying to download the data from the source site.\n\n random_state : int, RandomState instance or None, default=None\n Determines random number generation for dataset shuffling. Pass an int\n for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n shuffle : bool, default=False\n Whether to shuffle dataset.\n\n return_X_y : bool, default=False\n If True, returns ``(dataset.data, dataset.target)`` instead of a Bunch\n object. 
See below for more information about the `dataset.data` and\n `dataset.target` object.\n\n .. versionadded:: 0.20\n\n Returns\n -------\n dataset : :class:`~sklearn.utils.Bunch`\n Dictionary-like object, with the following attributes.\n\n data : sparse matrix of shape (804414, 47236), dtype=np.float64\n The array has 0.16% of non zero values. Will be of CSR format.\n target : sparse matrix of shape (804414, 103), dtype=np.uint8\n Each sample has a value of 1 in its categories, and 0 in others.\n The array has 3.15% of non zero values. Will be of CSR format.\n sample_id : ndarray of shape (804414,), dtype=np.uint32,\n Identification number of each sample, as ordered in dataset.data.\n target_names : ndarray of shape (103,), dtype=object\n Names of each target (RCV1 topics), as ordered in dataset.target.\n DESCR : str\n Description of the RCV1 dataset.\n\n (data, target) : tuple if ``return_X_y`` is True\n\n .. versionadded:: 0.20\n \"\"\"\n N_SAMPLES = 804414\n N_FEATURES = 47236\n N_CATEGORIES = 103\n N_TRAIN = 23149\n data_home = get_data_home(data_home=data_home)\n rcv1_dir = join(data_home, 'RCV1')\n if download_if_missing:\n if not exists(rcv1_dir):\n makedirs(rcv1_dir)\n samples_path = _pkl_filepath(rcv1_dir, 'samples.pkl')\n sample_id_path = _pkl_filepath(rcv1_dir, 'sample_id.pkl')\n sample_topics_path = _pkl_filepath(rcv1_dir, 'sample_topics.pkl')\n topics_path = _pkl_filepath(rcv1_dir, 'topics_names.pkl')\n if download_if_missing and (not exists(samples_path) or not exists(sample_id_path)):\n files = []\n for each in XY_METADATA:\n logger.info('Downloading %s' % each.url)\n file_path = _fetch_remote(each, dirname=rcv1_dir)\n files.append(GzipFile(filename=file_path))\n Xy = load_svmlight_files(files, n_features=N_FEATURES)\n X = sp.vstack([Xy[8], Xy[0], Xy[2], Xy[4], Xy[6]]).tocsr()\n sample_id = np.hstack((Xy[9], Xy[1], Xy[3], Xy[5], Xy[7]))\n sample_id = sample_id.astype(np.uint32, copy=False)\n joblib.dump(X, samples_path, compress=9)\n joblib.dump(sample_id, sample_id_path, compress=9)\n for f in files:\n f.close()\n remove(f.name)\n else:\n X = joblib.load(samples_path)\n sample_id = joblib.load(sample_id_path)\n if download_if_missing and (not exists(sample_topics_path) or not exists(topics_path)):\n logger.info('Downloading %s' % TOPICS_METADATA.url)\n topics_archive_path = _fetch_remote(TOPICS_METADATA, dirname=rcv1_dir)\n n_cat = -1\n n_doc = -1\n doc_previous = -1\n y = np.zeros((N_SAMPLES, N_CATEGORIES), dtype=np.uint8)\n sample_id_bis = np.zeros(N_SAMPLES, dtype=np.int32)\n category_names = {}\n with GzipFile(filename=topics_archive_path, mode='rb') as f:\n for line in f:\n line_components = line.decode('ascii').split(' ')\n if len(line_components) == 3:\n (cat, doc, _) = line_components\n if cat not in category_names:\n n_cat += 1\n category_names[cat] = n_cat\n doc = int(doc)\n if doc != doc_previous:\n doc_previous = doc\n n_doc += 1\n sample_id_bis[n_doc] = doc\n y[n_doc, category_names[cat]] = 1\n remove(topics_archive_path)\n permutation = _find_permutation(sample_id_bis, sample_id)\n y = y[permutation, :]\n categories = np.empty(N_CATEGORIES, dtype=object)\n for k in category_names.keys():\n categories[category_names[k]] = k\n order = np.argsort(categories)\n categories = categories[order]\n y = sp.csr_matrix(y[:, order])\n joblib.dump(y, sample_topics_path, compress=9)\n joblib.dump(categories, topics_path, compress=9)\n else:\n y = joblib.load(sample_topics_path)\n categories = joblib.load(topics_path)\n if subset == 'all':\n pass\n elif subset == 'train':\n X 
= X[:N_TRAIN, :]\n y = y[:N_TRAIN, :]\n sample_id = sample_id[:N_TRAIN]\n elif subset == 'test':\n X = X[N_TRAIN:, :]\n y = y[N_TRAIN:, :]\n sample_id = sample_id[N_TRAIN:]\n else:\n raise ValueError(\"Unknown subset parameter. Got '%s' instead of one of ('all', 'train', test')\" % subset)\n if shuffle:\n (X, y, sample_id) = shuffle_(X, y, sample_id, random_state=random_state)\n fdescr = load_descr('rcv1.rst')\n if return_X_y:\n return X, y\n return Bunch(data=X, target=y, sample_id=sample_id, target_names=categories, DESCR=fdescr)" }, { @@ -44823,7 +46529,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "dimensions", @@ -44833,7 +46540,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "rng", @@ -44843,7 +46551,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -44867,7 +46576,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "random_state", @@ -44877,13 +46587,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _shuffle(data, random_state=None):\n generator = check_random_state(random_state)\n (n_rows, n_cols) = data.shape\n row_idx = generator.permutation(n_rows)\n col_idx = generator.permutation(n_cols)\n result = data[row_idx][:, col_idx]\n return result, row_idx, col_idx" }, { @@ -44901,7 +46612,8 @@ "docstring": { "type": "iterable of shape (n_rows, n_cols)", "description": "The shape of the result." - } + }, + "refined_type": {} }, { "name": "n_clusters", @@ -44911,7 +46623,8 @@ "docstring": { "type": "int", "description": "The number of biclusters." - } + }, + "refined_type": {} }, { "name": "noise", @@ -44921,7 +46634,8 @@ "docstring": { "type": "float, default=0.0", "description": "The standard deviation of the gaussian noise." - } + }, + "refined_type": {} }, { "name": "minval", @@ -44931,7 +46645,8 @@ "docstring": { "type": "int, default=10", "description": "Minimum value of a bicluster." - } + }, + "refined_type": {} }, { "name": "maxval", @@ -44941,7 +46656,8 @@ "docstring": { "type": "int, default=100", "description": "Maximum value of a bicluster." - } + }, + "refined_type": {} }, { "name": "shuffle", @@ -44951,7 +46667,8 @@ "docstring": { "type": "bool, default=True", "description": "Shuffle the samples." - } + }, + "refined_type": {} }, { "name": "random_state", @@ -44961,13 +46678,14 @@ "docstring": { "type": "int, RandomState instance or None, default=None", "description": "Determines random number generation for dataset creation. Pass an int\nfor reproducible output across multiple function calls.\nSee :term:`Glossary `." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Generate an array with constant block diagonal structure for biclustering.\n\nRead more in the :ref:`User Guide `.", - "docstring": "Generate an array with constant block diagonal structure for\nbiclustering.\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\nshape : iterable of shape (n_rows, n_cols)\n The shape of the result.\n\nn_clusters : int\n The number of biclusters.\n\nnoise : float, default=0.0\n The standard deviation of the gaussian noise.\n\nminval : int, default=10\n Minimum value of a bicluster.\n\nmaxval : int, default=100\n Maximum value of a bicluster.\n\nshuffle : bool, default=True\n Shuffle the samples.\n\nrandom_state : int, RandomState instance or None, default=None\n Determines random number generation for dataset creation. Pass an int\n for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\nReturns\n-------\nX : ndarray of shape `shape`\n The generated array.\n\nrows : ndarray of shape (n_clusters, X.shape[0])\n The indicators for cluster membership of each row.\n\ncols : ndarray of shape (n_clusters, X.shape[1])\n The indicators for cluster membership of each column.\n\nReferences\n----------\n\n.. [1] Dhillon, I. S. (2001, August). Co-clustering documents and\n words using bipartite spectral graph partitioning. In Proceedings\n of the seventh ACM SIGKDD international conference on Knowledge\n discovery and data mining (pp. 269-274). ACM.\n\nSee Also\n--------\nmake_checkerboard", + "description": "Generate an array with constant block diagonal structure for\nbiclustering.\n\nRead more in the :ref:`User Guide `.", + "docstring": "Generate an array with constant block diagonal structure for\n biclustering.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n shape : iterable of shape (n_rows, n_cols)\n The shape of the result.\n\n n_clusters : int\n The number of biclusters.\n\n noise : float, default=0.0\n The standard deviation of the gaussian noise.\n\n minval : int, default=10\n Minimum value of a bicluster.\n\n maxval : int, default=100\n Maximum value of a bicluster.\n\n shuffle : bool, default=True\n Shuffle the samples.\n\n random_state : int, RandomState instance or None, default=None\n Determines random number generation for dataset creation. Pass an int\n for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n Returns\n -------\n X : ndarray of shape `shape`\n The generated array.\n\n rows : ndarray of shape (n_clusters, X.shape[0])\n The indicators for cluster membership of each row.\n\n cols : ndarray of shape (n_clusters, X.shape[1])\n The indicators for cluster membership of each column.\n\n References\n ----------\n\n .. [1] Dhillon, I. S. (2001, August). Co-clustering documents and\n words using bipartite spectral graph partitioning. In Proceedings\n of the seventh ACM SIGKDD international conference on Knowledge\n discovery and data mining (pp. 269-274). 
ACM.\n\n See Also\n --------\n make_checkerboard\n ", "source_code": "\ndef make_biclusters(shape, n_clusters, *, noise=0.0, minval=10, maxval=100, shuffle=True, random_state=None):\n \"\"\"Generate an array with constant block diagonal structure for\n biclustering.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n shape : iterable of shape (n_rows, n_cols)\n The shape of the result.\n\n n_clusters : int\n The number of biclusters.\n\n noise : float, default=0.0\n The standard deviation of the gaussian noise.\n\n minval : int, default=10\n Minimum value of a bicluster.\n\n maxval : int, default=100\n Maximum value of a bicluster.\n\n shuffle : bool, default=True\n Shuffle the samples.\n\n random_state : int, RandomState instance or None, default=None\n Determines random number generation for dataset creation. Pass an int\n for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n Returns\n -------\n X : ndarray of shape `shape`\n The generated array.\n\n rows : ndarray of shape (n_clusters, X.shape[0])\n The indicators for cluster membership of each row.\n\n cols : ndarray of shape (n_clusters, X.shape[1])\n The indicators for cluster membership of each column.\n\n References\n ----------\n\n .. [1] Dhillon, I. S. (2001, August). Co-clustering documents and\n words using bipartite spectral graph partitioning. In Proceedings\n of the seventh ACM SIGKDD international conference on Knowledge\n discovery and data mining (pp. 269-274). ACM.\n\n See Also\n --------\n make_checkerboard\n \"\"\"\n generator = check_random_state(random_state)\n (n_rows, n_cols) = shape\n consts = generator.uniform(minval, maxval, n_clusters)\n row_sizes = generator.multinomial(n_rows, np.repeat(1.0 / n_clusters, n_clusters))\n col_sizes = generator.multinomial(n_cols, np.repeat(1.0 / n_clusters, n_clusters))\n row_labels = np.hstack(list((np.repeat(val, rep) for (val, rep) in zip(range(n_clusters), row_sizes))))\n col_labels = np.hstack(list((np.repeat(val, rep) for (val, rep) in zip(range(n_clusters), col_sizes))))\n result = np.zeros(shape, dtype=np.float64)\n for i in range(n_clusters):\n selector = np.outer(row_labels == i, col_labels == i)\n result[selector] += consts[i]\n if noise > 0:\n result += generator.normal(scale=noise, size=result.shape)\n if shuffle:\n (result, row_idx, col_idx) = _shuffle(result, random_state)\n row_labels = row_labels[row_idx]\n col_labels = col_labels[col_idx]\n rows = np.vstack([row_labels == c for c in range(n_clusters)])\n cols = np.vstack([col_labels == c for c in range(n_clusters)])\n return result, rows, cols" }, { @@ -44985,7 +46703,8 @@ "docstring": { "type": "int or array-like, default=100", "description": "If int, it is the total number of points equally divided among\nclusters.\nIf array-like, each element of the sequence indicates\nthe number of samples per cluster.\n\n.. versionchanged:: v0.20\n one can now pass an array-like to the ``n_samples`` parameter" - } + }, + "refined_type": {} }, { "name": "n_features", @@ -44995,7 +46714,8 @@ "docstring": { "type": "int, default=2", "description": "The number of features for each sample." 
- } + }, + "refined_type": {} }, { "name": "centers", @@ -45005,7 +46725,8 @@ "docstring": { "type": "int or ndarray of shape (n_centers, n_features), default=None", "description": "The number of centers to generate, or the fixed center locations.\nIf n_samples is an int and centers is None, 3 centers are generated.\nIf n_samples is array-like, centers must be\neither None or an array of length equal to the length of n_samples." - } + }, + "refined_type": {} }, { "name": "cluster_std", @@ -45015,7 +46736,8 @@ "docstring": { "type": "float or array-like of float, default=1.0", "description": "The standard deviation of the clusters." - } + }, + "refined_type": {} }, { "name": "center_box", @@ -45025,7 +46747,8 @@ "docstring": { "type": "tuple of float (min, max), default=(-10.0, 10.0)", "description": "The bounding box for each cluster center when centers are\ngenerated at random." - } + }, + "refined_type": {} }, { "name": "shuffle", @@ -45035,7 +46758,8 @@ "docstring": { "type": "bool, default=True", "description": "Shuffle the samples." - } + }, + "refined_type": {} }, { "name": "random_state", @@ -45045,7 +46769,8 @@ "docstring": { "type": "int, RandomState instance or None, default=None", "description": "Determines random number generation for dataset creation. Pass an int\nfor reproducible output across multiple function calls.\nSee :term:`Glossary `." - } + }, + "refined_type": {} }, { "name": "return_centers", @@ -45055,13 +46780,14 @@ "docstring": { "type": "bool, default=False", "description": "If True, then return the centers of each cluster\n\n.. versionadded:: 0.23" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Generate isotropic Gaussian blobs for clustering.\n\nRead more in the :ref:`User Guide `.", - "docstring": "Generate isotropic Gaussian blobs for clustering.\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\nn_samples : int or array-like, default=100\n If int, it is the total number of points equally divided among\n clusters.\n If array-like, each element of the sequence indicates\n the number of samples per cluster.\n\n .. versionchanged:: v0.20\n one can now pass an array-like to the ``n_samples`` parameter\n\nn_features : int, default=2\n The number of features for each sample.\n\ncenters : int or ndarray of shape (n_centers, n_features), default=None\n The number of centers to generate, or the fixed center locations.\n If n_samples is an int and centers is None, 3 centers are generated.\n If n_samples is array-like, centers must be\n either None or an array of length equal to the length of n_samples.\n\ncluster_std : float or array-like of float, default=1.0\n The standard deviation of the clusters.\n\ncenter_box : tuple of float (min, max), default=(-10.0, 10.0)\n The bounding box for each cluster center when centers are\n generated at random.\n\nshuffle : bool, default=True\n Shuffle the samples.\n\nrandom_state : int, RandomState instance or None, default=None\n Determines random number generation for dataset creation. Pass an int\n for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\nreturn_centers : bool, default=False\n If True, then return the centers of each cluster\n\n .. versionadded:: 0.23\n\nReturns\n-------\nX : ndarray of shape (n_samples, n_features)\n The generated samples.\n\ny : ndarray of shape (n_samples,)\n The integer labels for cluster membership of each sample.\n\ncenters : ndarray of shape (n_centers, n_features)\n The centers of each cluster. 
Only returned if\n ``return_centers=True``.\n\nExamples\n--------\n>>> from sklearn.datasets import make_blobs\n>>> X, y = make_blobs(n_samples=10, centers=3, n_features=2,\n... random_state=0)\n>>> print(X.shape)\n(10, 2)\n>>> y\narray([0, 0, 1, 0, 2, 2, 2, 1, 1, 0])\n>>> X, y = make_blobs(n_samples=[3, 3, 4], centers=None, n_features=2,\n... random_state=0)\n>>> print(X.shape)\n(10, 2)\n>>> y\narray([0, 1, 2, 0, 2, 2, 2, 1, 1, 0])\n\nSee Also\n--------\nmake_classification : A more intricate variant.", + "docstring": "Generate isotropic Gaussian blobs for clustering.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n n_samples : int or array-like, default=100\n If int, it is the total number of points equally divided among\n clusters.\n If array-like, each element of the sequence indicates\n the number of samples per cluster.\n\n .. versionchanged:: v0.20\n one can now pass an array-like to the ``n_samples`` parameter\n\n n_features : int, default=2\n The number of features for each sample.\n\n centers : int or ndarray of shape (n_centers, n_features), default=None\n The number of centers to generate, or the fixed center locations.\n If n_samples is an int and centers is None, 3 centers are generated.\n If n_samples is array-like, centers must be\n either None or an array of length equal to the length of n_samples.\n\n cluster_std : float or array-like of float, default=1.0\n The standard deviation of the clusters.\n\n center_box : tuple of float (min, max), default=(-10.0, 10.0)\n The bounding box for each cluster center when centers are\n generated at random.\n\n shuffle : bool, default=True\n Shuffle the samples.\n\n random_state : int, RandomState instance or None, default=None\n Determines random number generation for dataset creation. Pass an int\n for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n return_centers : bool, default=False\n If True, then return the centers of each cluster\n\n .. versionadded:: 0.23\n\n Returns\n -------\n X : ndarray of shape (n_samples, n_features)\n The generated samples.\n\n y : ndarray of shape (n_samples,)\n The integer labels for cluster membership of each sample.\n\n centers : ndarray of shape (n_centers, n_features)\n The centers of each cluster. Only returned if\n ``return_centers=True``.\n\n Examples\n --------\n >>> from sklearn.datasets import make_blobs\n >>> X, y = make_blobs(n_samples=10, centers=3, n_features=2,\n ... random_state=0)\n >>> print(X.shape)\n (10, 2)\n >>> y\n array([0, 0, 1, 0, 2, 2, 2, 1, 1, 0])\n >>> X, y = make_blobs(n_samples=[3, 3, 4], centers=None, n_features=2,\n ... random_state=0)\n >>> print(X.shape)\n (10, 2)\n >>> y\n array([0, 1, 2, 0, 2, 2, 2, 1, 1, 0])\n\n See Also\n --------\n make_classification : A more intricate variant.\n ", "source_code": "\ndef make_blobs(n_samples=100, n_features=2, *, centers=None, cluster_std=1.0, center_box=(-10.0, 10.0), shuffle=True, random_state=None, return_centers=False):\n \"\"\"Generate isotropic Gaussian blobs for clustering.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n n_samples : int or array-like, default=100\n If int, it is the total number of points equally divided among\n clusters.\n If array-like, each element of the sequence indicates\n the number of samples per cluster.\n\n .. 
versionchanged:: v0.20\n one can now pass an array-like to the ``n_samples`` parameter\n\n n_features : int, default=2\n The number of features for each sample.\n\n centers : int or ndarray of shape (n_centers, n_features), default=None\n The number of centers to generate, or the fixed center locations.\n If n_samples is an int and centers is None, 3 centers are generated.\n If n_samples is array-like, centers must be\n either None or an array of length equal to the length of n_samples.\n\n cluster_std : float or array-like of float, default=1.0\n The standard deviation of the clusters.\n\n center_box : tuple of float (min, max), default=(-10.0, 10.0)\n The bounding box for each cluster center when centers are\n generated at random.\n\n shuffle : bool, default=True\n Shuffle the samples.\n\n random_state : int, RandomState instance or None, default=None\n Determines random number generation for dataset creation. Pass an int\n for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n return_centers : bool, default=False\n If True, then return the centers of each cluster\n\n .. versionadded:: 0.23\n\n Returns\n -------\n X : ndarray of shape (n_samples, n_features)\n The generated samples.\n\n y : ndarray of shape (n_samples,)\n The integer labels for cluster membership of each sample.\n\n centers : ndarray of shape (n_centers, n_features)\n The centers of each cluster. Only returned if\n ``return_centers=True``.\n\n Examples\n --------\n >>> from sklearn.datasets import make_blobs\n >>> X, y = make_blobs(n_samples=10, centers=3, n_features=2,\n ... random_state=0)\n >>> print(X.shape)\n (10, 2)\n >>> y\n array([0, 0, 1, 0, 2, 2, 2, 1, 1, 0])\n >>> X, y = make_blobs(n_samples=[3, 3, 4], centers=None, n_features=2,\n ... random_state=0)\n >>> print(X.shape)\n (10, 2)\n >>> y\n array([0, 1, 2, 0, 2, 2, 2, 1, 1, 0])\n\n See Also\n --------\n make_classification : A more intricate variant.\n \"\"\"\n generator = check_random_state(random_state)\n if isinstance(n_samples, numbers.Integral):\n if centers is None:\n centers = 3\n if isinstance(centers, numbers.Integral):\n n_centers = centers\n centers = generator.uniform(center_box[0], center_box[1], size=(n_centers, n_features))\n else:\n centers = check_array(centers)\n n_features = centers.shape[1]\n n_centers = centers.shape[0]\n else:\n n_centers = len(n_samples)\n if centers is None:\n centers = generator.uniform(center_box[0], center_box[1], size=(n_centers, n_features))\n try:\n assert len(centers) == n_centers\n except TypeError as e:\n raise ValueError('Parameter `centers` must be array-like. Got {!r} instead'.format(centers)) from e\n except AssertionError as e:\n raise ValueError(f'Length of `n_samples` not consistent with number of centers. Got n_samples = {n_samples} and centers = {centers}') from e\n else:\n centers = check_array(centers)\n n_features = centers.shape[1]\n if hasattr(cluster_std, '__len__') and len(cluster_std) != n_centers:\n raise ValueError('Length of `clusters_std` not consistent with number of centers. 
Got centers = {} and cluster_std = {}'.format(centers, cluster_std))\n if isinstance(cluster_std, numbers.Real):\n cluster_std = np.full(len(centers), cluster_std)\n X = []\n y = []\n if isinstance(n_samples, Iterable):\n n_samples_per_center = n_samples\n else:\n n_samples_per_center = [int(n_samples // n_centers)] * n_centers\n for i in range(n_samples % n_centers):\n n_samples_per_center[i] += 1\n for (i, (n, std)) in enumerate(zip(n_samples_per_center, cluster_std)):\n X.append(generator.normal(loc=centers[i], scale=std, size=(n, n_features)))\n y += [i] * n\n X = np.concatenate(X)\n y = np.array(y)\n if shuffle:\n total_n_samples = np.sum(n_samples)\n indices = np.arange(total_n_samples)\n generator.shuffle(indices)\n X = X[indices]\n y = y[indices]\n if return_centers:\n return X, y, centers\n else:\n return X, y" }, { @@ -45079,7 +46805,8 @@ "docstring": { "type": "tuple of shape (n_rows, n_cols)", "description": "The shape of the result." - } + }, + "refined_type": {} }, { "name": "n_clusters", @@ -45089,7 +46816,8 @@ "docstring": { "type": "int or array-like or shape (n_row_clusters, n_column_clusters)", "description": "The number of row and column clusters." - } + }, + "refined_type": {} }, { "name": "noise", @@ -45099,7 +46827,8 @@ "docstring": { "type": "float, default=0.0", "description": "The standard deviation of the gaussian noise." - } + }, + "refined_type": {} }, { "name": "minval", @@ -45109,7 +46838,8 @@ "docstring": { "type": "int, default=10", "description": "Minimum value of a bicluster." - } + }, + "refined_type": {} }, { "name": "maxval", @@ -45119,7 +46849,8 @@ "docstring": { "type": "int, default=100", "description": "Maximum value of a bicluster." - } + }, + "refined_type": {} }, { "name": "shuffle", @@ -45129,7 +46860,8 @@ "docstring": { "type": "bool, default=True", "description": "Shuffle the samples." - } + }, + "refined_type": {} }, { "name": "random_state", @@ -45139,13 +46871,14 @@ "docstring": { "type": "int, RandomState instance or None, default=None", "description": "Determines random number generation for dataset creation. Pass an int\nfor reproducible output across multiple function calls.\nSee :term:`Glossary `." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Generate an array with block checkerboard structure for biclustering.\n\nRead more in the :ref:`User Guide `.", - "docstring": "Generate an array with block checkerboard structure for\nbiclustering.\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\nshape : tuple of shape (n_rows, n_cols)\n The shape of the result.\n\nn_clusters : int or array-like or shape (n_row_clusters, n_column_clusters)\n The number of row and column clusters.\n\nnoise : float, default=0.0\n The standard deviation of the gaussian noise.\n\nminval : int, default=10\n Minimum value of a bicluster.\n\nmaxval : int, default=100\n Maximum value of a bicluster.\n\nshuffle : bool, default=True\n Shuffle the samples.\n\nrandom_state : int, RandomState instance or None, default=None\n Determines random number generation for dataset creation. Pass an int\n for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\nReturns\n-------\nX : ndarray of shape `shape`\n The generated array.\n\nrows : ndarray of shape (n_clusters, X.shape[0])\n The indicators for cluster membership of each row.\n\ncols : ndarray of shape (n_clusters, X.shape[1])\n The indicators for cluster membership of each column.\n\n\nReferences\n----------\n\n.. 
[1] Kluger, Y., Basri, R., Chang, J. T., & Gerstein, M. (2003).\n Spectral biclustering of microarray data: coclustering genes\n and conditions. Genome research, 13(4), 703-716.\n\nSee Also\n--------\nmake_biclusters", + "description": "Generate an array with block checkerboard structure for\nbiclustering.\n\nRead more in the :ref:`User Guide `.", + "docstring": "Generate an array with block checkerboard structure for\n biclustering.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n shape : tuple of shape (n_rows, n_cols)\n The shape of the result.\n\n n_clusters : int or array-like or shape (n_row_clusters, n_column_clusters)\n The number of row and column clusters.\n\n noise : float, default=0.0\n The standard deviation of the gaussian noise.\n\n minval : int, default=10\n Minimum value of a bicluster.\n\n maxval : int, default=100\n Maximum value of a bicluster.\n\n shuffle : bool, default=True\n Shuffle the samples.\n\n random_state : int, RandomState instance or None, default=None\n Determines random number generation for dataset creation. Pass an int\n for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n Returns\n -------\n X : ndarray of shape `shape`\n The generated array.\n\n rows : ndarray of shape (n_clusters, X.shape[0])\n The indicators for cluster membership of each row.\n\n cols : ndarray of shape (n_clusters, X.shape[1])\n The indicators for cluster membership of each column.\n\n\n References\n ----------\n\n .. [1] Kluger, Y., Basri, R., Chang, J. T., & Gerstein, M. (2003).\n Spectral biclustering of microarray data: coclustering genes\n and conditions. Genome research, 13(4), 703-716.\n\n See Also\n --------\n make_biclusters\n ", "source_code": "\ndef make_checkerboard(shape, n_clusters, *, noise=0.0, minval=10, maxval=100, shuffle=True, random_state=None):\n \"\"\"Generate an array with block checkerboard structure for\n biclustering.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n shape : tuple of shape (n_rows, n_cols)\n The shape of the result.\n\n n_clusters : int or array-like or shape (n_row_clusters, n_column_clusters)\n The number of row and column clusters.\n\n noise : float, default=0.0\n The standard deviation of the gaussian noise.\n\n minval : int, default=10\n Minimum value of a bicluster.\n\n maxval : int, default=100\n Maximum value of a bicluster.\n\n shuffle : bool, default=True\n Shuffle the samples.\n\n random_state : int, RandomState instance or None, default=None\n Determines random number generation for dataset creation. Pass an int\n for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n Returns\n -------\n X : ndarray of shape `shape`\n The generated array.\n\n rows : ndarray of shape (n_clusters, X.shape[0])\n The indicators for cluster membership of each row.\n\n cols : ndarray of shape (n_clusters, X.shape[1])\n The indicators for cluster membership of each column.\n\n\n References\n ----------\n\n .. [1] Kluger, Y., Basri, R., Chang, J. T., & Gerstein, M. (2003).\n Spectral biclustering of microarray data: coclustering genes\n and conditions. 
Genome research, 13(4), 703-716.\n\n See Also\n --------\n make_biclusters\n \"\"\"\n generator = check_random_state(random_state)\n if hasattr(n_clusters, '__len__'):\n (n_row_clusters, n_col_clusters) = n_clusters\n else:\n n_row_clusters = n_col_clusters = n_clusters\n (n_rows, n_cols) = shape\n row_sizes = generator.multinomial(n_rows, np.repeat(1.0 / n_row_clusters, n_row_clusters))\n col_sizes = generator.multinomial(n_cols, np.repeat(1.0 / n_col_clusters, n_col_clusters))\n row_labels = np.hstack(list((np.repeat(val, rep) for (val, rep) in zip(range(n_row_clusters), row_sizes))))\n col_labels = np.hstack(list((np.repeat(val, rep) for (val, rep) in zip(range(n_col_clusters), col_sizes))))\n result = np.zeros(shape, dtype=np.float64)\n for i in range(n_row_clusters):\n for j in range(n_col_clusters):\n selector = np.outer(row_labels == i, col_labels == j)\n result[selector] += generator.uniform(minval, maxval)\n if noise > 0:\n result += generator.normal(scale=noise, size=result.shape)\n if shuffle:\n (result, row_idx, col_idx) = _shuffle(result, random_state)\n row_labels = row_labels[row_idx]\n col_labels = col_labels[col_idx]\n rows = np.vstack([row_labels == label for label in range(n_row_clusters) for _ in range(n_col_clusters)])\n cols = np.vstack([col_labels == label for _ in range(n_row_clusters) for label in range(n_col_clusters)])\n return result, rows, cols" }, { @@ -45163,7 +46896,8 @@ "docstring": { "type": "int or tuple of shape (2,), dtype=int, default=100", "description": "If int, it is the total number of points generated.\nFor odd numbers, the inner circle will have one point more than the\nouter circle.\nIf two-element tuple, number of points in outer circle and inner\ncircle.\n\n.. versionchanged:: 0.23\n Added two-element tuple." - } + }, + "refined_type": {} }, { "name": "shuffle", @@ -45173,7 +46907,8 @@ "docstring": { "type": "bool, default=True", "description": "Whether to shuffle the samples." - } + }, + "refined_type": {} }, { "name": "noise", @@ -45183,7 +46918,8 @@ "docstring": { "type": "float, default=None", "description": "Standard deviation of Gaussian noise added to the data." - } + }, + "refined_type": {} }, { "name": "random_state", @@ -45193,7 +46929,8 @@ "docstring": { "type": "int, RandomState instance or None, default=None", "description": "Determines random number generation for dataset shuffling and noise.\nPass an int for reproducible output across multiple function calls.\nSee :term:`Glossary `." - } + }, + "refined_type": {} }, { "name": "factor", @@ -45203,13 +46940,21 @@ "docstring": { "type": "float, default=.8", "description": "Scale factor between inner and outer circle in the range `(0, 1)`." + }, + "refined_type": { + "kind": "BoundaryType", + "base_type": "float", + "min": 0.0, + "max": 1.0, + "min_inclusive": false, + "max_inclusive": false } } ], "results": [], "is_public": true, - "description": "Make a large circle containing a smaller circle in 2d.\n\nA simple toy dataset to visualize clustering and classification algorithms. 
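A minimal usage sketch of make_checkerboard, based on the signature and docstring recorded in the entry above; the shape, cluster counts and noise level are illustrative choices, not values taken from this data file:

import numpy as np
from sklearn.datasets import make_checkerboard

# 300x300 array with a 4x3 grid of biclusters and mild Gaussian noise.
data, rows, cols = make_checkerboard(
    shape=(300, 300), n_clusters=(4, 3), noise=10.0, shuffle=False, random_state=42
)
print(data.shape)  # (300, 300)
print(rows.shape)  # (12, 300): one boolean indicator row per (row cluster, column cluster) pair
print(cols.shape)  # (12, 300): same layout for column membership

The indicator shapes follow from the recorded source_code, which stacks one row/column membership mask per bicluster.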
Read more in the :ref:`User Guide `.", - "docstring": "Make a large circle containing a smaller circle in 2d.\n\nA simple toy dataset to visualize clustering and classification\nalgorithms.\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\nn_samples : int or tuple of shape (2,), dtype=int, default=100\n If int, it is the total number of points generated.\n For odd numbers, the inner circle will have one point more than the\n outer circle.\n If two-element tuple, number of points in outer circle and inner\n circle.\n\n .. versionchanged:: 0.23\n Added two-element tuple.\n\nshuffle : bool, default=True\n Whether to shuffle the samples.\n\nnoise : float, default=None\n Standard deviation of Gaussian noise added to the data.\n\nrandom_state : int, RandomState instance or None, default=None\n Determines random number generation for dataset shuffling and noise.\n Pass an int for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\nfactor : float, default=.8\n Scale factor between inner and outer circle in the range `(0, 1)`.\n\nReturns\n-------\nX : ndarray of shape (n_samples, 2)\n The generated samples.\n\ny : ndarray of shape (n_samples,)\n The integer labels (0 or 1) for class membership of each sample.", + "description": "Make a large circle containing a smaller circle in 2d.\n\nA simple toy dataset to visualize clustering and classification\nalgorithms.\n\nRead more in the :ref:`User Guide `.", + "docstring": "Make a large circle containing a smaller circle in 2d.\n\n A simple toy dataset to visualize clustering and classification\n algorithms.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n n_samples : int or tuple of shape (2,), dtype=int, default=100\n If int, it is the total number of points generated.\n For odd numbers, the inner circle will have one point more than the\n outer circle.\n If two-element tuple, number of points in outer circle and inner\n circle.\n\n .. versionchanged:: 0.23\n Added two-element tuple.\n\n shuffle : bool, default=True\n Whether to shuffle the samples.\n\n noise : float, default=None\n Standard deviation of Gaussian noise added to the data.\n\n random_state : int, RandomState instance or None, default=None\n Determines random number generation for dataset shuffling and noise.\n Pass an int for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n factor : float, default=.8\n Scale factor between inner and outer circle in the range `(0, 1)`.\n\n Returns\n -------\n X : ndarray of shape (n_samples, 2)\n The generated samples.\n\n y : ndarray of shape (n_samples,)\n The integer labels (0 or 1) for class membership of each sample.\n ", "source_code": "\ndef make_circles(n_samples=100, *, shuffle=True, noise=None, random_state=None, factor=0.8):\n \"\"\"Make a large circle containing a smaller circle in 2d.\n\n A simple toy dataset to visualize clustering and classification\n algorithms.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n n_samples : int or tuple of shape (2,), dtype=int, default=100\n If int, it is the total number of points generated.\n For odd numbers, the inner circle will have one point more than the\n outer circle.\n If two-element tuple, number of points in outer circle and inner\n circle.\n\n .. 
versionchanged:: 0.23\n Added two-element tuple.\n\n shuffle : bool, default=True\n Whether to shuffle the samples.\n\n noise : float, default=None\n Standard deviation of Gaussian noise added to the data.\n\n random_state : int, RandomState instance or None, default=None\n Determines random number generation for dataset shuffling and noise.\n Pass an int for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n factor : float, default=.8\n Scale factor between inner and outer circle in the range `(0, 1)`.\n\n Returns\n -------\n X : ndarray of shape (n_samples, 2)\n The generated samples.\n\n y : ndarray of shape (n_samples,)\n The integer labels (0 or 1) for class membership of each sample.\n \"\"\"\n if factor >= 1 or factor < 0:\n raise ValueError(\"'factor' has to be between 0 and 1.\")\n if isinstance(n_samples, numbers.Integral):\n n_samples_out = n_samples // 2\n n_samples_in = n_samples - n_samples_out\n else:\n try:\n (n_samples_out, n_samples_in) = n_samples\n except ValueError as e:\n raise ValueError('`n_samples` can be either an int or a two-element tuple.') from e\n generator = check_random_state(random_state)\n linspace_out = np.linspace(0, 2 * np.pi, n_samples_out, endpoint=False)\n linspace_in = np.linspace(0, 2 * np.pi, n_samples_in, endpoint=False)\n outer_circ_x = np.cos(linspace_out)\n outer_circ_y = np.sin(linspace_out)\n inner_circ_x = np.cos(linspace_in) * factor\n inner_circ_y = np.sin(linspace_in) * factor\n X = np.vstack([np.append(outer_circ_x, inner_circ_x), np.append(outer_circ_y, inner_circ_y)]).T\n y = np.hstack([np.zeros(n_samples_out, dtype=np.intp), np.ones(n_samples_in, dtype=np.intp)])\n if shuffle:\n (X, y) = util_shuffle(X, y, random_state=generator)\n if noise is not None:\n X += generator.normal(scale=noise, size=X.shape)\n return X, y" }, { @@ -45227,7 +46972,8 @@ "docstring": { "type": "int, default=100", "description": "The number of samples." - } + }, + "refined_type": {} }, { "name": "n_features", @@ -45237,7 +46983,8 @@ "docstring": { "type": "int, default=20", "description": "The total number of features. These comprise ``n_informative``\ninformative features, ``n_redundant`` redundant features,\n``n_repeated`` duplicated features and\n``n_features-n_informative-n_redundant-n_repeated`` useless features\ndrawn at random." - } + }, + "refined_type": {} }, { "name": "n_informative", @@ -45247,7 +46994,8 @@ "docstring": { "type": "int, default=2", "description": "The number of informative features. Each class is composed of a number\nof gaussian clusters each located around the vertices of a hypercube\nin a subspace of dimension ``n_informative``. For each cluster,\ninformative features are drawn independently from N(0, 1) and then\nrandomly linearly combined within each cluster in order to add\ncovariance. The clusters are then placed on the vertices of the\nhypercube." - } + }, + "refined_type": {} }, { "name": "n_redundant", @@ -45257,7 +47005,8 @@ "docstring": { "type": "int, default=2", "description": "The number of redundant features. These features are generated as\nrandom linear combinations of the informative features." - } + }, + "refined_type": {} }, { "name": "n_repeated", @@ -45267,7 +47016,8 @@ "docstring": { "type": "int, default=0", "description": "The number of duplicated features, drawn randomly from the informative\nand the redundant features." 
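A minimal usage sketch of make_circles, based on the signature recorded in the entry above; the sample count, noise level and factor are illustrative choices, not values from this data file:

from sklearn.datasets import make_circles

# 100 points split between the outer circle (label 0) and the inner circle (label 1),
# with the inner radius scaled by factor=0.5 and a little Gaussian noise on both coordinates.
X, y = make_circles(n_samples=100, noise=0.05, factor=0.5, random_state=0)
print(X.shape)   # (100, 2)
print(set(y))    # {0, 1}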
- } + }, + "refined_type": {} }, { "name": "n_classes", @@ -45277,7 +47027,8 @@ "docstring": { "type": "int, default=2", "description": "The number of classes (or labels) of the classification problem." - } + }, + "refined_type": {} }, { "name": "n_clusters_per_class", @@ -45287,7 +47038,8 @@ "docstring": { "type": "int, default=2", "description": "The number of clusters per class." - } + }, + "refined_type": {} }, { "name": "weights", @@ -45297,7 +47049,8 @@ "docstring": { "type": "array-like of shape (n_classes,) or (n_classes - 1,), default=None", "description": "The proportions of samples assigned to each class. If None, then\nclasses are balanced. Note that if ``len(weights) == n_classes - 1``,\nthen the last class weight is automatically inferred.\nMore than ``n_samples`` samples may be returned if the sum of\n``weights`` exceeds 1. Note that the actual class proportions will\nnot exactly match ``weights`` when ``flip_y`` isn't 0." - } + }, + "refined_type": {} }, { "name": "flip_y", @@ -45307,7 +47060,8 @@ "docstring": { "type": "float, default=0.01", "description": "The fraction of samples whose class is assigned randomly. Larger\nvalues introduce noise in the labels and make the classification\ntask harder. Note that the default setting flip_y > 0 might lead\nto less than ``n_classes`` in y in some cases." - } + }, + "refined_type": {} }, { "name": "class_sep", @@ -45317,7 +47071,8 @@ "docstring": { "type": "float, default=1.0", "description": "The factor multiplying the hypercube size. Larger values spread\nout the clusters/classes and make the classification task easier." - } + }, + "refined_type": {} }, { "name": "hypercube", @@ -45327,7 +47082,8 @@ "docstring": { "type": "bool, default=True", "description": "If True, the clusters are put on the vertices of a hypercube. If\nFalse, the clusters are put on the vertices of a random polytope." - } + }, + "refined_type": {} }, { "name": "shift", @@ -45337,7 +47093,8 @@ "docstring": { "type": "float, ndarray of shape (n_features,) or None, default=0.0", "description": "Shift features by the specified value. If None, then features\nare shifted by a random value drawn in [-class_sep, class_sep]." - } + }, + "refined_type": {} }, { "name": "scale", @@ -45347,7 +47104,8 @@ "docstring": { "type": "float, ndarray of shape (n_features,) or None, default=1.0", "description": "Multiply features by the specified value. If None, then features\nare scaled by a random value drawn in [1, 100]. Note that scaling\nhappens after shifting." - } + }, + "refined_type": {} }, { "name": "shuffle", @@ -45357,7 +47115,8 @@ "docstring": { "type": "bool, default=True", "description": "Shuffle the samples and the features." - } + }, + "refined_type": {} }, { "name": "random_state", @@ -45367,13 +47126,14 @@ "docstring": { "type": "int, RandomState instance or None, default=None", "description": "Determines random number generation for dataset creation. Pass an int\nfor reproducible output across multiple function calls.\nSee :term:`Glossary `." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Generate a random n-class classification problem.\n\nThis initially creates clusters of points normally distributed (std=1) about vertices of an ``n_informative``-dimensional hypercube with sides of length ``2*class_sep`` and assigns an equal number of clusters to each class. It introduces interdependence between these features and adds various types of further noise to the data. 
Without shuffling, ``X`` horizontally stacks features in the following order: the primary ``n_informative`` features, followed by ``n_redundant`` linear combinations of the informative features, followed by ``n_repeated`` duplicates, drawn randomly with replacement from the informative and redundant features. The remaining features are filled with random noise. Thus, without shuffling, all useful features are contained in the columns ``X[:, :n_informative + n_redundant + n_repeated]``. Read more in the :ref:`User Guide `.", - "docstring": "Generate a random n-class classification problem.\n\nThis initially creates clusters of points normally distributed (std=1)\nabout vertices of an ``n_informative``-dimensional hypercube with sides of\nlength ``2*class_sep`` and assigns an equal number of clusters to each\nclass. It introduces interdependence between these features and adds\nvarious types of further noise to the data.\n\nWithout shuffling, ``X`` horizontally stacks features in the following\norder: the primary ``n_informative`` features, followed by ``n_redundant``\nlinear combinations of the informative features, followed by ``n_repeated``\nduplicates, drawn randomly with replacement from the informative and\nredundant features. The remaining features are filled with random noise.\nThus, without shuffling, all useful features are contained in the columns\n``X[:, :n_informative + n_redundant + n_repeated]``.\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\nn_samples : int, default=100\n The number of samples.\n\nn_features : int, default=20\n The total number of features. These comprise ``n_informative``\n informative features, ``n_redundant`` redundant features,\n ``n_repeated`` duplicated features and\n ``n_features-n_informative-n_redundant-n_repeated`` useless features\n drawn at random.\n\nn_informative : int, default=2\n The number of informative features. Each class is composed of a number\n of gaussian clusters each located around the vertices of a hypercube\n in a subspace of dimension ``n_informative``. For each cluster,\n informative features are drawn independently from N(0, 1) and then\n randomly linearly combined within each cluster in order to add\n covariance. The clusters are then placed on the vertices of the\n hypercube.\n\nn_redundant : int, default=2\n The number of redundant features. These features are generated as\n random linear combinations of the informative features.\n\nn_repeated : int, default=0\n The number of duplicated features, drawn randomly from the informative\n and the redundant features.\n\nn_classes : int, default=2\n The number of classes (or labels) of the classification problem.\n\nn_clusters_per_class : int, default=2\n The number of clusters per class.\n\nweights : array-like of shape (n_classes,) or (n_classes - 1,), default=None\n The proportions of samples assigned to each class. If None, then\n classes are balanced. Note that if ``len(weights) == n_classes - 1``,\n then the last class weight is automatically inferred.\n More than ``n_samples`` samples may be returned if the sum of\n ``weights`` exceeds 1. Note that the actual class proportions will\n not exactly match ``weights`` when ``flip_y`` isn't 0.\n\nflip_y : float, default=0.01\n The fraction of samples whose class is assigned randomly. Larger\n values introduce noise in the labels and make the classification\n task harder. 
Note that the default setting flip_y > 0 might lead\n to less than ``n_classes`` in y in some cases.\n\nclass_sep : float, default=1.0\n The factor multiplying the hypercube size. Larger values spread\n out the clusters/classes and make the classification task easier.\n\nhypercube : bool, default=True\n If True, the clusters are put on the vertices of a hypercube. If\n False, the clusters are put on the vertices of a random polytope.\n\nshift : float, ndarray of shape (n_features,) or None, default=0.0\n Shift features by the specified value. If None, then features\n are shifted by a random value drawn in [-class_sep, class_sep].\n\nscale : float, ndarray of shape (n_features,) or None, default=1.0\n Multiply features by the specified value. If None, then features\n are scaled by a random value drawn in [1, 100]. Note that scaling\n happens after shifting.\n\nshuffle : bool, default=True\n Shuffle the samples and the features.\n\nrandom_state : int, RandomState instance or None, default=None\n Determines random number generation for dataset creation. Pass an int\n for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\nReturns\n-------\nX : ndarray of shape (n_samples, n_features)\n The generated samples.\n\ny : ndarray of shape (n_samples,)\n The integer labels for class membership of each sample.\n\nNotes\n-----\nThe algorithm is adapted from Guyon [1] and was designed to generate\nthe \"Madelon\" dataset.\n\nReferences\n----------\n.. [1] I. Guyon, \"Design of experiments for the NIPS 2003 variable\n selection benchmark\", 2003.\n\nSee Also\n--------\nmake_blobs : Simplified variant.\nmake_multilabel_classification : Unrelated generator for multilabel tasks.", + "description": "Generate a random n-class classification problem.\n\nThis initially creates clusters of points normally distributed (std=1)\nabout vertices of an ``n_informative``-dimensional hypercube with sides of\nlength ``2*class_sep`` and assigns an equal number of clusters to each\nclass. It introduces interdependence between these features and adds\nvarious types of further noise to the data.\n\nWithout shuffling, ``X`` horizontally stacks features in the following\norder: the primary ``n_informative`` features, followed by ``n_redundant``\nlinear combinations of the informative features, followed by ``n_repeated``\nduplicates, drawn randomly with replacement from the informative and\nredundant features. The remaining features are filled with random noise.\nThus, without shuffling, all useful features are contained in the columns\n``X[:, :n_informative + n_redundant + n_repeated]``.\n\nRead more in the :ref:`User Guide `.", + "docstring": "Generate a random n-class classification problem.\n\n This initially creates clusters of points normally distributed (std=1)\n about vertices of an ``n_informative``-dimensional hypercube with sides of\n length ``2*class_sep`` and assigns an equal number of clusters to each\n class. It introduces interdependence between these features and adds\n various types of further noise to the data.\n\n Without shuffling, ``X`` horizontally stacks features in the following\n order: the primary ``n_informative`` features, followed by ``n_redundant``\n linear combinations of the informative features, followed by ``n_repeated``\n duplicates, drawn randomly with replacement from the informative and\n redundant features. 
The remaining features are filled with random noise.\n Thus, without shuffling, all useful features are contained in the columns\n ``X[:, :n_informative + n_redundant + n_repeated]``.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n n_samples : int, default=100\n The number of samples.\n\n n_features : int, default=20\n The total number of features. These comprise ``n_informative``\n informative features, ``n_redundant`` redundant features,\n ``n_repeated`` duplicated features and\n ``n_features-n_informative-n_redundant-n_repeated`` useless features\n drawn at random.\n\n n_informative : int, default=2\n The number of informative features. Each class is composed of a number\n of gaussian clusters each located around the vertices of a hypercube\n in a subspace of dimension ``n_informative``. For each cluster,\n informative features are drawn independently from N(0, 1) and then\n randomly linearly combined within each cluster in order to add\n covariance. The clusters are then placed on the vertices of the\n hypercube.\n\n n_redundant : int, default=2\n The number of redundant features. These features are generated as\n random linear combinations of the informative features.\n\n n_repeated : int, default=0\n The number of duplicated features, drawn randomly from the informative\n and the redundant features.\n\n n_classes : int, default=2\n The number of classes (or labels) of the classification problem.\n\n n_clusters_per_class : int, default=2\n The number of clusters per class.\n\n weights : array-like of shape (n_classes,) or (n_classes - 1,), default=None\n The proportions of samples assigned to each class. If None, then\n classes are balanced. Note that if ``len(weights) == n_classes - 1``,\n then the last class weight is automatically inferred.\n More than ``n_samples`` samples may be returned if the sum of\n ``weights`` exceeds 1. Note that the actual class proportions will\n not exactly match ``weights`` when ``flip_y`` isn't 0.\n\n flip_y : float, default=0.01\n The fraction of samples whose class is assigned randomly. Larger\n values introduce noise in the labels and make the classification\n task harder. Note that the default setting flip_y > 0 might lead\n to less than ``n_classes`` in y in some cases.\n\n class_sep : float, default=1.0\n The factor multiplying the hypercube size. Larger values spread\n out the clusters/classes and make the classification task easier.\n\n hypercube : bool, default=True\n If True, the clusters are put on the vertices of a hypercube. If\n False, the clusters are put on the vertices of a random polytope.\n\n shift : float, ndarray of shape (n_features,) or None, default=0.0\n Shift features by the specified value. If None, then features\n are shifted by a random value drawn in [-class_sep, class_sep].\n\n scale : float, ndarray of shape (n_features,) or None, default=1.0\n Multiply features by the specified value. If None, then features\n are scaled by a random value drawn in [1, 100]. Note that scaling\n happens after shifting.\n\n shuffle : bool, default=True\n Shuffle the samples and the features.\n\n random_state : int, RandomState instance or None, default=None\n Determines random number generation for dataset creation. 
Pass an int\n for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n Returns\n -------\n X : ndarray of shape (n_samples, n_features)\n The generated samples.\n\n y : ndarray of shape (n_samples,)\n The integer labels for class membership of each sample.\n\n Notes\n -----\n The algorithm is adapted from Guyon [1] and was designed to generate\n the \"Madelon\" dataset.\n\n References\n ----------\n .. [1] I. Guyon, \"Design of experiments for the NIPS 2003 variable\n selection benchmark\", 2003.\n\n See Also\n --------\n make_blobs : Simplified variant.\n make_multilabel_classification : Unrelated generator for multilabel tasks.\n ", "source_code": "\ndef make_classification(n_samples=100, n_features=20, *, n_informative=2, n_redundant=2, n_repeated=0, n_classes=2, n_clusters_per_class=2, weights=None, flip_y=0.01, class_sep=1.0, hypercube=True, shift=0.0, scale=1.0, shuffle=True, random_state=None):\n \"\"\"Generate a random n-class classification problem.\n\n This initially creates clusters of points normally distributed (std=1)\n about vertices of an ``n_informative``-dimensional hypercube with sides of\n length ``2*class_sep`` and assigns an equal number of clusters to each\n class. It introduces interdependence between these features and adds\n various types of further noise to the data.\n\n Without shuffling, ``X`` horizontally stacks features in the following\n order: the primary ``n_informative`` features, followed by ``n_redundant``\n linear combinations of the informative features, followed by ``n_repeated``\n duplicates, drawn randomly with replacement from the informative and\n redundant features. The remaining features are filled with random noise.\n Thus, without shuffling, all useful features are contained in the columns\n ``X[:, :n_informative + n_redundant + n_repeated]``.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n n_samples : int, default=100\n The number of samples.\n\n n_features : int, default=20\n The total number of features. These comprise ``n_informative``\n informative features, ``n_redundant`` redundant features,\n ``n_repeated`` duplicated features and\n ``n_features-n_informative-n_redundant-n_repeated`` useless features\n drawn at random.\n\n n_informative : int, default=2\n The number of informative features. Each class is composed of a number\n of gaussian clusters each located around the vertices of a hypercube\n in a subspace of dimension ``n_informative``. For each cluster,\n informative features are drawn independently from N(0, 1) and then\n randomly linearly combined within each cluster in order to add\n covariance. The clusters are then placed on the vertices of the\n hypercube.\n\n n_redundant : int, default=2\n The number of redundant features. These features are generated as\n random linear combinations of the informative features.\n\n n_repeated : int, default=0\n The number of duplicated features, drawn randomly from the informative\n and the redundant features.\n\n n_classes : int, default=2\n The number of classes (or labels) of the classification problem.\n\n n_clusters_per_class : int, default=2\n The number of clusters per class.\n\n weights : array-like of shape (n_classes,) or (n_classes - 1,), default=None\n The proportions of samples assigned to each class. If None, then\n classes are balanced. 
Note that if ``len(weights) == n_classes - 1``,\n then the last class weight is automatically inferred.\n More than ``n_samples`` samples may be returned if the sum of\n ``weights`` exceeds 1. Note that the actual class proportions will\n not exactly match ``weights`` when ``flip_y`` isn't 0.\n\n flip_y : float, default=0.01\n The fraction of samples whose class is assigned randomly. Larger\n values introduce noise in the labels and make the classification\n task harder. Note that the default setting flip_y > 0 might lead\n to less than ``n_classes`` in y in some cases.\n\n class_sep : float, default=1.0\n The factor multiplying the hypercube size. Larger values spread\n out the clusters/classes and make the classification task easier.\n\n hypercube : bool, default=True\n If True, the clusters are put on the vertices of a hypercube. If\n False, the clusters are put on the vertices of a random polytope.\n\n shift : float, ndarray of shape (n_features,) or None, default=0.0\n Shift features by the specified value. If None, then features\n are shifted by a random value drawn in [-class_sep, class_sep].\n\n scale : float, ndarray of shape (n_features,) or None, default=1.0\n Multiply features by the specified value. If None, then features\n are scaled by a random value drawn in [1, 100]. Note that scaling\n happens after shifting.\n\n shuffle : bool, default=True\n Shuffle the samples and the features.\n\n random_state : int, RandomState instance or None, default=None\n Determines random number generation for dataset creation. Pass an int\n for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n Returns\n -------\n X : ndarray of shape (n_samples, n_features)\n The generated samples.\n\n y : ndarray of shape (n_samples,)\n The integer labels for class membership of each sample.\n\n Notes\n -----\n The algorithm is adapted from Guyon [1] and was designed to generate\n the \"Madelon\" dataset.\n\n References\n ----------\n .. [1] I. 
Guyon, \"Design of experiments for the NIPS 2003 variable\n selection benchmark\", 2003.\n\n See Also\n --------\n make_blobs : Simplified variant.\n make_multilabel_classification : Unrelated generator for multilabel tasks.\n \"\"\"\n generator = check_random_state(random_state)\n if n_informative + n_redundant + n_repeated > n_features:\n raise ValueError('Number of informative, redundant and repeated features must sum to less than the number of total features')\n if n_informative < np.log2(n_classes * n_clusters_per_class):\n msg = 'n_classes({}) * n_clusters_per_class({}) must be'\n msg += ' smaller or equal 2**n_informative({})={}'\n raise ValueError(msg.format(n_classes, n_clusters_per_class, n_informative, 2**n_informative))\n if weights is not None:\n if len(weights) not in [n_classes, n_classes - 1]:\n raise ValueError('Weights specified but incompatible with number of classes.')\n if len(weights) == n_classes - 1:\n if isinstance(weights, list):\n weights = weights + [1.0 - sum(weights)]\n else:\n weights = np.resize(weights, n_classes)\n weights[-1] = 1.0 - sum(weights[:-1])\n else:\n weights = [1.0 / n_classes] * n_classes\n n_useless = n_features - n_informative - n_redundant - n_repeated\n n_clusters = n_classes * n_clusters_per_class\n n_samples_per_cluster = [int(n_samples * weights[k % n_classes] / n_clusters_per_class) for k in range(n_clusters)]\n for i in range(n_samples - sum(n_samples_per_cluster)):\n n_samples_per_cluster[i % n_clusters] += 1\n X = np.zeros((n_samples, n_features))\n y = np.zeros(n_samples, dtype=int)\n centroids = _generate_hypercube(n_clusters, n_informative, generator).astype(float, copy=False)\n centroids *= 2 * class_sep\n centroids -= class_sep\n if not hypercube:\n centroids *= generator.rand(n_clusters, 1)\n centroids *= generator.rand(1, n_informative)\n X[:, :n_informative] = generator.randn(n_samples, n_informative)\n stop = 0\n for (k, centroid) in enumerate(centroids):\n (start, stop) = (stop, stop + n_samples_per_cluster[k])\n y[start:stop] = k % n_classes\n X_k = X[start:stop, :n_informative]\n A = 2 * generator.rand(n_informative, n_informative) - 1\n X_k[...] = np.dot(X_k, A)\n X_k += centroid\n if n_redundant > 0:\n B = 2 * generator.rand(n_informative, n_redundant) - 1\n X[:, n_informative:n_informative + n_redundant] = np.dot(X[:, :n_informative], B)\n if n_repeated > 0:\n n = n_informative + n_redundant\n indices = ((n - 1) * generator.rand(n_repeated) + 0.5).astype(np.intp)\n X[:, n:n + n_repeated] = X[:, indices]\n if n_useless > 0:\n X[:, -n_useless:] = generator.randn(n_samples, n_useless)\n if flip_y >= 0.0:\n flip_mask = generator.rand(n_samples) < flip_y\n y[flip_mask] = generator.randint(n_classes, size=flip_mask.sum())\n if shift is None:\n shift = (2 * generator.rand(n_features) - 1) * class_sep\n X += shift\n if scale is None:\n scale = 1 + 100 * generator.rand(n_features)\n X *= scale\n if shuffle:\n (X, y) = util_shuffle(X, y, random_state=generator)\n indices = np.arange(n_features)\n generator.shuffle(indices)\n X[:, :] = X[:, indices]\n return X, y" }, { @@ -45391,7 +47151,8 @@ "docstring": { "type": "int, default=100", "description": "The number of samples." - } + }, + "refined_type": {} }, { "name": "n_features", @@ -45401,7 +47162,8 @@ "docstring": { "type": "int, default=10", "description": "The number of features. Should be at least 5." 
- } + }, + "refined_type": {} }, { "name": "noise", @@ -45411,7 +47173,8 @@ "docstring": { "type": "float, default=0.0", "description": "The standard deviation of the gaussian noise applied to the output." - } + }, + "refined_type": {} }, { "name": "random_state", @@ -45421,13 +47184,14 @@ "docstring": { "type": "int, RandomState instance or None, default=None", "description": "Determines random number generation for dataset noise. Pass an int\nfor reproducible output across multiple function calls.\nSee :term:`Glossary `." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Generate the \"Friedman #1\" regression problem.\n\nThis dataset is described in Friedman [1] and Breiman [2]. Inputs `X` are independent features uniformly distributed on the interval [0, 1]. The output `y` is created according to the formula:: y(X) = 10 * sin(pi * X[:, 0] * X[:, 1]) + 20 * (X[:, 2] - 0.5) ** 2 + 10 * X[:, 3] + 5 * X[:, 4] + noise * N(0, 1). Out of the `n_features` features, only 5 are actually used to compute `y`. The remaining features are independent of `y`. The number of features has to be >= 5. Read more in the :ref:`User Guide `.", - "docstring": "Generate the \"Friedman #1\" regression problem.\n\nThis dataset is described in Friedman [1] and Breiman [2].\n\nInputs `X` are independent features uniformly distributed on the interval\n[0, 1]. The output `y` is created according to the formula::\n\n y(X) = 10 * sin(pi * X[:, 0] * X[:, 1]) + 20 * (X[:, 2] - 0.5) ** 2 + 10 * X[:, 3] + 5 * X[:, 4] + noise * N(0, 1).\n\nOut of the `n_features` features, only 5 are actually used to compute\n`y`. The remaining features are independent of `y`.\n\nThe number of features has to be >= 5.\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\nn_samples : int, default=100\n The number of samples.\n\nn_features : int, default=10\n The number of features. Should be at least 5.\n\nnoise : float, default=0.0\n The standard deviation of the gaussian noise applied to the output.\n\nrandom_state : int, RandomState instance or None, default=None\n Determines random number generation for dataset noise. Pass an int\n for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\nReturns\n-------\nX : ndarray of shape (n_samples, n_features)\n The input samples.\n\ny : ndarray of shape (n_samples,)\n The output values.\n\nReferences\n----------\n.. [1] J. Friedman, \"Multivariate adaptive regression splines\", The Annals\n of Statistics 19 (1), pages 1-67, 1991.\n\n.. [2] L. Breiman, \"Bagging predictors\", Machine Learning 24,\n pages 123-140, 1996.", + "description": "Generate the \"Friedman #1\" regression problem.\n\nThis dataset is described in Friedman [1] and Breiman [2].\n\nInputs `X` are independent features uniformly distributed on the interval\n[0, 1]. The output `y` is created according to the formula::\n\n y(X) = 10 * sin(pi * X[:, 0] * X[:, 1]) + 20 * (X[:, 2] - 0.5) ** 2 + 10 * X[:, 3] + 5 * X[:, 4] + noise * N(0, 1).\n\nOut of the `n_features` features, only 5 are actually used to compute\n`y`. The remaining features are independent of `y`.\n\nThe number of features has to be >= 5.\n\nRead more in the :ref:`User Guide `.", + "docstring": "Generate the \"Friedman #1\" regression problem.\n\n This dataset is described in Friedman [1] and Breiman [2].\n\n Inputs `X` are independent features uniformly distributed on the interval\n [0, 1]. 
The output `y` is created according to the formula::\n\n y(X) = 10 * sin(pi * X[:, 0] * X[:, 1]) + 20 * (X[:, 2] - 0.5) ** 2 + 10 * X[:, 3] + 5 * X[:, 4] + noise * N(0, 1).\n\n Out of the `n_features` features, only 5 are actually used to compute\n `y`. The remaining features are independent of `y`.\n\n The number of features has to be >= 5.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n n_samples : int, default=100\n The number of samples.\n\n n_features : int, default=10\n The number of features. Should be at least 5.\n\n noise : float, default=0.0\n The standard deviation of the gaussian noise applied to the output.\n\n random_state : int, RandomState instance or None, default=None\n Determines random number generation for dataset noise. Pass an int\n for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n Returns\n -------\n X : ndarray of shape (n_samples, n_features)\n The input samples.\n\n y : ndarray of shape (n_samples,)\n The output values.\n\n References\n ----------\n .. [1] J. Friedman, \"Multivariate adaptive regression splines\", The Annals\n of Statistics 19 (1), pages 1-67, 1991.\n\n .. [2] L. Breiman, \"Bagging predictors\", Machine Learning 24,\n pages 123-140, 1996.\n ", "source_code": "\ndef make_friedman1(n_samples=100, n_features=10, *, noise=0.0, random_state=None):\n \"\"\"Generate the \"Friedman #1\" regression problem.\n\n This dataset is described in Friedman [1] and Breiman [2].\n\n Inputs `X` are independent features uniformly distributed on the interval\n [0, 1]. The output `y` is created according to the formula::\n\n y(X) = 10 * sin(pi * X[:, 0] * X[:, 1]) + 20 * (X[:, 2] - 0.5) ** 2 + 10 * X[:, 3] + 5 * X[:, 4] + noise * N(0, 1).\n\n Out of the `n_features` features, only 5 are actually used to compute\n `y`. The remaining features are independent of `y`.\n\n The number of features has to be >= 5.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n n_samples : int, default=100\n The number of samples.\n\n n_features : int, default=10\n The number of features. Should be at least 5.\n\n noise : float, default=0.0\n The standard deviation of the gaussian noise applied to the output.\n\n random_state : int, RandomState instance or None, default=None\n Determines random number generation for dataset noise. Pass an int\n for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n Returns\n -------\n X : ndarray of shape (n_samples, n_features)\n The input samples.\n\n y : ndarray of shape (n_samples,)\n The output values.\n\n References\n ----------\n .. [1] J. Friedman, \"Multivariate adaptive regression splines\", The Annals\n of Statistics 19 (1), pages 1-67, 1991.\n\n .. [2] L. Breiman, \"Bagging predictors\", Machine Learning 24,\n pages 123-140, 1996.\n \"\"\"\n if n_features < 5:\n raise ValueError('n_features must be at least five.')\n generator = check_random_state(random_state)\n X = generator.rand(n_samples, n_features)\n y = 10 * np.sin(np.pi * X[:, 0] * X[:, 1]) + 20 * (X[:, 2] - 0.5)**2 + 10 * X[:, 3] + 5 * X[:, 4] + noise * generator.randn(n_samples)\n return X, y" }, { @@ -45445,7 +47209,8 @@ "docstring": { "type": "int, default=100", "description": "The number of samples." - } + }, + "refined_type": {} }, { "name": "noise", @@ -45455,7 +47220,8 @@ "docstring": { "type": "float, default=0.0", "description": "The standard deviation of the gaussian noise applied to the output." 
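A minimal usage sketch of make_friedman1 that checks the formula documented in the entry above; with noise=0.0 the target should reproduce it exactly (the sample count is an illustrative choice):

import numpy as np
from sklearn.datasets import make_friedman1

X, y = make_friedman1(n_samples=200, n_features=10, noise=0.0, random_state=0)
# Only the first five columns enter the target; the remaining five are independent noise features.
expected = (10 * np.sin(np.pi * X[:, 0] * X[:, 1]) + 20 * (X[:, 2] - 0.5) ** 2
            + 10 * X[:, 3] + 5 * X[:, 4])
print(np.allclose(y, expected))   # True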
- } + }, + "refined_type": {} }, { "name": "random_state", @@ -45465,13 +47231,14 @@ "docstring": { "type": "int, RandomState instance or None, default=None", "description": "Determines random number generation for dataset noise. Pass an int\nfor reproducible output across multiple function calls.\nSee :term:`Glossary `." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Generate the \"Friedman #2\" regression problem.\n\nThis dataset is described in Friedman [1] and Breiman [2]. Inputs `X` are 4 independent features uniformly distributed on the intervals:: 0 <= X[:, 0] <= 100, 40 * pi <= X[:, 1] <= 560 * pi, 0 <= X[:, 2] <= 1, 1 <= X[:, 3] <= 11. The output `y` is created according to the formula:: y(X) = (X[:, 0] ** 2 + (X[:, 1] * X[:, 2] - 1 / (X[:, 1] * X[:, 3])) ** 2) ** 0.5 + noise * N(0, 1). Read more in the :ref:`User Guide `.", - "docstring": "Generate the \"Friedman #2\" regression problem.\n\nThis dataset is described in Friedman [1] and Breiman [2].\n\nInputs `X` are 4 independent features uniformly distributed on the\nintervals::\n\n 0 <= X[:, 0] <= 100,\n 40 * pi <= X[:, 1] <= 560 * pi,\n 0 <= X[:, 2] <= 1,\n 1 <= X[:, 3] <= 11.\n\nThe output `y` is created according to the formula::\n\n y(X) = (X[:, 0] ** 2 + (X[:, 1] * X[:, 2] - 1 / (X[:, 1] * X[:, 3])) ** 2) ** 0.5 + noise * N(0, 1).\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\nn_samples : int, default=100\n The number of samples.\n\nnoise : float, default=0.0\n The standard deviation of the gaussian noise applied to the output.\n\nrandom_state : int, RandomState instance or None, default=None\n Determines random number generation for dataset noise. Pass an int\n for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\nReturns\n-------\nX : ndarray of shape (n_samples, 4)\n The input samples.\n\ny : ndarray of shape (n_samples,)\n The output values.\n\nReferences\n----------\n.. [1] J. Friedman, \"Multivariate adaptive regression splines\", The Annals\n of Statistics 19 (1), pages 1-67, 1991.\n\n.. [2] L. Breiman, \"Bagging predictors\", Machine Learning 24,\n pages 123-140, 1996.", + "description": "Generate the \"Friedman #2\" regression problem.\n\nThis dataset is described in Friedman [1] and Breiman [2].\n\nInputs `X` are 4 independent features uniformly distributed on the\nintervals::\n\n 0 <= X[:, 0] <= 100,\n 40 * pi <= X[:, 1] <= 560 * pi,\n 0 <= X[:, 2] <= 1,\n 1 <= X[:, 3] <= 11.\n\nThe output `y` is created according to the formula::\n\n y(X) = (X[:, 0] ** 2 + (X[:, 1] * X[:, 2] - 1 / (X[:, 1] * X[:, 3])) ** 2) ** 0.5 + noise * N(0, 1).\n\nRead more in the :ref:`User Guide `.", + "docstring": "Generate the \"Friedman #2\" regression problem.\n\n This dataset is described in Friedman [1] and Breiman [2].\n\n Inputs `X` are 4 independent features uniformly distributed on the\n intervals::\n\n 0 <= X[:, 0] <= 100,\n 40 * pi <= X[:, 1] <= 560 * pi,\n 0 <= X[:, 2] <= 1,\n 1 <= X[:, 3] <= 11.\n\n The output `y` is created according to the formula::\n\n y(X) = (X[:, 0] ** 2 + (X[:, 1] * X[:, 2] - 1 / (X[:, 1] * X[:, 3])) ** 2) ** 0.5 + noise * N(0, 1).\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n n_samples : int, default=100\n The number of samples.\n\n noise : float, default=0.0\n The standard deviation of the gaussian noise applied to the output.\n\n random_state : int, RandomState instance or None, default=None\n Determines random number generation for dataset noise. 
Pass an int\n for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n Returns\n -------\n X : ndarray of shape (n_samples, 4)\n The input samples.\n\n y : ndarray of shape (n_samples,)\n The output values.\n\n References\n ----------\n .. [1] J. Friedman, \"Multivariate adaptive regression splines\", The Annals\n of Statistics 19 (1), pages 1-67, 1991.\n\n .. [2] L. Breiman, \"Bagging predictors\", Machine Learning 24,\n pages 123-140, 1996.\n ", "source_code": "\ndef make_friedman2(n_samples=100, *, noise=0.0, random_state=None):\n \"\"\"Generate the \"Friedman #2\" regression problem.\n\n This dataset is described in Friedman [1] and Breiman [2].\n\n Inputs `X` are 4 independent features uniformly distributed on the\n intervals::\n\n 0 <= X[:, 0] <= 100,\n 40 * pi <= X[:, 1] <= 560 * pi,\n 0 <= X[:, 2] <= 1,\n 1 <= X[:, 3] <= 11.\n\n The output `y` is created according to the formula::\n\n y(X) = (X[:, 0] ** 2 + (X[:, 1] * X[:, 2] - 1 / (X[:, 1] * X[:, 3])) ** 2) ** 0.5 + noise * N(0, 1).\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n n_samples : int, default=100\n The number of samples.\n\n noise : float, default=0.0\n The standard deviation of the gaussian noise applied to the output.\n\n random_state : int, RandomState instance or None, default=None\n Determines random number generation for dataset noise. Pass an int\n for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n Returns\n -------\n X : ndarray of shape (n_samples, 4)\n The input samples.\n\n y : ndarray of shape (n_samples,)\n The output values.\n\n References\n ----------\n .. [1] J. Friedman, \"Multivariate adaptive regression splines\", The Annals\n of Statistics 19 (1), pages 1-67, 1991.\n\n .. [2] L. Breiman, \"Bagging predictors\", Machine Learning 24,\n pages 123-140, 1996.\n \"\"\"\n generator = check_random_state(random_state)\n X = generator.rand(n_samples, 4)\n X[:, 0] *= 100\n X[:, 1] *= 520 * np.pi\n X[:, 1] += 40 * np.pi\n X[:, 3] *= 10\n X[:, 3] += 1\n y = (X[:, 0]**2 + (X[:, 1] * X[:, 2] - 1 / (X[:, 1] * X[:, 3]))**2)**0.5 + noise * generator.randn(n_samples)\n return X, y" }, { @@ -45489,7 +47256,8 @@ "docstring": { "type": "int, default=100", "description": "The number of samples." - } + }, + "refined_type": {} }, { "name": "noise", @@ -45499,7 +47267,8 @@ "docstring": { "type": "float, default=0.0", "description": "The standard deviation of the gaussian noise applied to the output." - } + }, + "refined_type": {} }, { "name": "random_state", @@ -45509,13 +47278,14 @@ "docstring": { "type": "int, RandomState instance or None, default=None", "description": "Determines random number generation for dataset noise. Pass an int\nfor reproducible output across multiple function calls.\nSee :term:`Glossary `." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Generate the \"Friedman #3\" regression problem.\n\nThis dataset is described in Friedman [1] and Breiman [2]. Inputs `X` are 4 independent features uniformly distributed on the intervals:: 0 <= X[:, 0] <= 100, 40 * pi <= X[:, 1] <= 560 * pi, 0 <= X[:, 2] <= 1, 1 <= X[:, 3] <= 11. The output `y` is created according to the formula:: y(X) = arctan((X[:, 1] * X[:, 2] - 1 / (X[:, 1] * X[:, 3])) / X[:, 0]) + noise * N(0, 1). 
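The same kind of check for make_friedman2, again with noise=0.0 so the target matches the formula documented in the entry above (sample count illustrative):

import numpy as np
from sklearn.datasets import make_friedman2

X, y = make_friedman2(n_samples=200, noise=0.0, random_state=0)
expected = np.sqrt(X[:, 0] ** 2 + (X[:, 1] * X[:, 2] - 1 / (X[:, 1] * X[:, 3])) ** 2)
print(X.shape, np.allclose(y, expected))   # (200, 4) True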
Read more in the :ref:`User Guide `.", - "docstring": "Generate the \"Friedman #3\" regression problem.\n\nThis dataset is described in Friedman [1] and Breiman [2].\n\nInputs `X` are 4 independent features uniformly distributed on the\nintervals::\n\n 0 <= X[:, 0] <= 100,\n 40 * pi <= X[:, 1] <= 560 * pi,\n 0 <= X[:, 2] <= 1,\n 1 <= X[:, 3] <= 11.\n\nThe output `y` is created according to the formula::\n\n y(X) = arctan((X[:, 1] * X[:, 2] - 1 / (X[:, 1] * X[:, 3])) / X[:, 0]) + noise * N(0, 1).\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\nn_samples : int, default=100\n The number of samples.\n\nnoise : float, default=0.0\n The standard deviation of the gaussian noise applied to the output.\n\nrandom_state : int, RandomState instance or None, default=None\n Determines random number generation for dataset noise. Pass an int\n for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\nReturns\n-------\nX : ndarray of shape (n_samples, 4)\n The input samples.\n\ny : ndarray of shape (n_samples,)\n The output values.\n\nReferences\n----------\n.. [1] J. Friedman, \"Multivariate adaptive regression splines\", The Annals\n of Statistics 19 (1), pages 1-67, 1991.\n\n.. [2] L. Breiman, \"Bagging predictors\", Machine Learning 24,\n pages 123-140, 1996.", + "description": "Generate the \"Friedman #3\" regression problem.\n\nThis dataset is described in Friedman [1] and Breiman [2].\n\nInputs `X` are 4 independent features uniformly distributed on the\nintervals::\n\n 0 <= X[:, 0] <= 100,\n 40 * pi <= X[:, 1] <= 560 * pi,\n 0 <= X[:, 2] <= 1,\n 1 <= X[:, 3] <= 11.\n\nThe output `y` is created according to the formula::\n\n y(X) = arctan((X[:, 1] * X[:, 2] - 1 / (X[:, 1] * X[:, 3])) / X[:, 0]) + noise * N(0, 1).\n\nRead more in the :ref:`User Guide `.", + "docstring": "Generate the \"Friedman #3\" regression problem.\n\n This dataset is described in Friedman [1] and Breiman [2].\n\n Inputs `X` are 4 independent features uniformly distributed on the\n intervals::\n\n 0 <= X[:, 0] <= 100,\n 40 * pi <= X[:, 1] <= 560 * pi,\n 0 <= X[:, 2] <= 1,\n 1 <= X[:, 3] <= 11.\n\n The output `y` is created according to the formula::\n\n y(X) = arctan((X[:, 1] * X[:, 2] - 1 / (X[:, 1] * X[:, 3])) / X[:, 0]) + noise * N(0, 1).\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n n_samples : int, default=100\n The number of samples.\n\n noise : float, default=0.0\n The standard deviation of the gaussian noise applied to the output.\n\n random_state : int, RandomState instance or None, default=None\n Determines random number generation for dataset noise. Pass an int\n for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n Returns\n -------\n X : ndarray of shape (n_samples, 4)\n The input samples.\n\n y : ndarray of shape (n_samples,)\n The output values.\n\n References\n ----------\n .. [1] J. Friedman, \"Multivariate adaptive regression splines\", The Annals\n of Statistics 19 (1), pages 1-67, 1991.\n\n .. [2] L. 
Breiman, \"Bagging predictors\", Machine Learning 24,\n pages 123-140, 1996.\n ", "source_code": "\ndef make_friedman3(n_samples=100, *, noise=0.0, random_state=None):\n \"\"\"Generate the \"Friedman #3\" regression problem.\n\n This dataset is described in Friedman [1] and Breiman [2].\n\n Inputs `X` are 4 independent features uniformly distributed on the\n intervals::\n\n 0 <= X[:, 0] <= 100,\n 40 * pi <= X[:, 1] <= 560 * pi,\n 0 <= X[:, 2] <= 1,\n 1 <= X[:, 3] <= 11.\n\n The output `y` is created according to the formula::\n\n y(X) = arctan((X[:, 1] * X[:, 2] - 1 / (X[:, 1] * X[:, 3])) / X[:, 0]) + noise * N(0, 1).\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n n_samples : int, default=100\n The number of samples.\n\n noise : float, default=0.0\n The standard deviation of the gaussian noise applied to the output.\n\n random_state : int, RandomState instance or None, default=None\n Determines random number generation for dataset noise. Pass an int\n for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n Returns\n -------\n X : ndarray of shape (n_samples, 4)\n The input samples.\n\n y : ndarray of shape (n_samples,)\n The output values.\n\n References\n ----------\n .. [1] J. Friedman, \"Multivariate adaptive regression splines\", The Annals\n of Statistics 19 (1), pages 1-67, 1991.\n\n .. [2] L. Breiman, \"Bagging predictors\", Machine Learning 24,\n pages 123-140, 1996.\n \"\"\"\n generator = check_random_state(random_state)\n X = generator.rand(n_samples, 4)\n X[:, 0] *= 100\n X[:, 1] *= 520 * np.pi\n X[:, 1] += 40 * np.pi\n X[:, 3] *= 10\n X[:, 3] += 1\n y = np.arctan((X[:, 1] * X[:, 2] - 1 / (X[:, 1] * X[:, 3])) / X[:, 0]) + noise * generator.randn(n_samples)\n return X, y" }, { @@ -45533,7 +47303,8 @@ "docstring": { "type": "ndarray of shape (n_features,), default=None", "description": "The mean of the multi-dimensional normal distribution.\nIf None then use the origin (0, 0, ...)." - } + }, + "refined_type": {} }, { "name": "cov", @@ -45543,7 +47314,8 @@ "docstring": { "type": "float, default=1.0", "description": "The covariance matrix will be this value times the unit matrix. This\ndataset only produces symmetric normal distributions." - } + }, + "refined_type": {} }, { "name": "n_samples", @@ -45553,7 +47325,8 @@ "docstring": { "type": "int, default=100", "description": "The total number of points equally divided among classes." - } + }, + "refined_type": {} }, { "name": "n_features", @@ -45563,7 +47336,8 @@ "docstring": { "type": "int, default=2", "description": "The number of features for each sample." - } + }, + "refined_type": {} }, { "name": "n_classes", @@ -45573,7 +47347,8 @@ "docstring": { "type": "int, default=3", "description": "The number of classes" - } + }, + "refined_type": {} }, { "name": "shuffle", @@ -45583,7 +47358,8 @@ "docstring": { "type": "bool, default=True", "description": "Shuffle the samples." - } + }, + "refined_type": {} }, { "name": "random_state", @@ -45593,13 +47369,14 @@ "docstring": { "type": "int, RandomState instance or None, default=None", "description": "Determines random number generation for dataset creation. Pass an int\nfor reproducible output across multiple function calls.\nSee :term:`Glossary `." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Generate isotropic Gaussian and label samples by quantile.\n\nThis classification dataset is constructed by taking a multi-dimensional standard normal distribution and defining classes separated by nested concentric multi-dimensional spheres such that roughly equal numbers of samples are in each class (quantiles of the :math:`\\chi^2` distribution). Read more in the :ref:`User Guide `.", - "docstring": "Generate isotropic Gaussian and label samples by quantile.\n\nThis classification dataset is constructed by taking a multi-dimensional\nstandard normal distribution and defining classes separated by nested\nconcentric multi-dimensional spheres such that roughly equal numbers of\nsamples are in each class (quantiles of the :math:`\\chi^2` distribution).\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\nmean : ndarray of shape (n_features,), default=None\n The mean of the multi-dimensional normal distribution.\n If None then use the origin (0, 0, ...).\n\ncov : float, default=1.0\n The covariance matrix will be this value times the unit matrix. This\n dataset only produces symmetric normal distributions.\n\nn_samples : int, default=100\n The total number of points equally divided among classes.\n\nn_features : int, default=2\n The number of features for each sample.\n\nn_classes : int, default=3\n The number of classes\n\nshuffle : bool, default=True\n Shuffle the samples.\n\nrandom_state : int, RandomState instance or None, default=None\n Determines random number generation for dataset creation. Pass an int\n for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\nReturns\n-------\nX : ndarray of shape (n_samples, n_features)\n The generated samples.\n\ny : ndarray of shape (n_samples,)\n The integer labels for quantile membership of each sample.\n\nNotes\n-----\nThe dataset is from Zhu et al [1].\n\nReferences\n----------\n.. [1] J. Zhu, H. Zou, S. Rosset, T. Hastie, \"Multi-class AdaBoost\", 2009.", + "description": "Generate isotropic Gaussian and label samples by quantile.\n\nThis classification dataset is constructed by taking a multi-dimensional\nstandard normal distribution and defining classes separated by nested\nconcentric multi-dimensional spheres such that roughly equal numbers of\nsamples are in each class (quantiles of the :math:`\\chi^2` distribution).\n\nRead more in the :ref:`User Guide `.", + "docstring": "Generate isotropic Gaussian and label samples by quantile.\n\n This classification dataset is constructed by taking a multi-dimensional\n standard normal distribution and defining classes separated by nested\n concentric multi-dimensional spheres such that roughly equal numbers of\n samples are in each class (quantiles of the :math:`\\chi^2` distribution).\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n mean : ndarray of shape (n_features,), default=None\n The mean of the multi-dimensional normal distribution.\n If None then use the origin (0, 0, ...).\n\n cov : float, default=1.0\n The covariance matrix will be this value times the unit matrix. 
This\n dataset only produces symmetric normal distributions.\n\n n_samples : int, default=100\n The total number of points equally divided among classes.\n\n n_features : int, default=2\n The number of features for each sample.\n\n n_classes : int, default=3\n The number of classes\n\n shuffle : bool, default=True\n Shuffle the samples.\n\n random_state : int, RandomState instance or None, default=None\n Determines random number generation for dataset creation. Pass an int\n for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n Returns\n -------\n X : ndarray of shape (n_samples, n_features)\n The generated samples.\n\n y : ndarray of shape (n_samples,)\n The integer labels for quantile membership of each sample.\n\n Notes\n -----\n The dataset is from Zhu et al [1].\n\n References\n ----------\n .. [1] J. Zhu, H. Zou, S. Rosset, T. Hastie, \"Multi-class AdaBoost\", 2009.\n\n ", "source_code": "\ndef make_gaussian_quantiles(*, mean=None, cov=1.0, n_samples=100, n_features=2, n_classes=3, shuffle=True, random_state=None):\n \"\"\"Generate isotropic Gaussian and label samples by quantile.\n\n This classification dataset is constructed by taking a multi-dimensional\n standard normal distribution and defining classes separated by nested\n concentric multi-dimensional spheres such that roughly equal numbers of\n samples are in each class (quantiles of the :math:`\\chi^2` distribution).\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n mean : ndarray of shape (n_features,), default=None\n The mean of the multi-dimensional normal distribution.\n If None then use the origin (0, 0, ...).\n\n cov : float, default=1.0\n The covariance matrix will be this value times the unit matrix. This\n dataset only produces symmetric normal distributions.\n\n n_samples : int, default=100\n The total number of points equally divided among classes.\n\n n_features : int, default=2\n The number of features for each sample.\n\n n_classes : int, default=3\n The number of classes\n\n shuffle : bool, default=True\n Shuffle the samples.\n\n random_state : int, RandomState instance or None, default=None\n Determines random number generation for dataset creation. Pass an int\n for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n Returns\n -------\n X : ndarray of shape (n_samples, n_features)\n The generated samples.\n\n y : ndarray of shape (n_samples,)\n The integer labels for quantile membership of each sample.\n\n Notes\n -----\n The dataset is from Zhu et al [1].\n\n References\n ----------\n .. [1] J. Zhu, H. Zou, S. Rosset, T. Hastie, \"Multi-class AdaBoost\", 2009.\n\n \"\"\"\n if n_samples < n_classes:\n raise ValueError('n_samples must be at least n_classes')\n generator = check_random_state(random_state)\n if mean is None:\n mean = np.zeros(n_features)\n else:\n mean = np.array(mean)\n X = generator.multivariate_normal(mean, cov * np.identity(n_features), (n_samples, ))\n idx = np.argsort(np.sum((X - mean[np.newaxis, :])**2, axis=1))\n X = X[idx, :]\n step = n_samples // n_classes\n y = np.hstack([np.repeat(np.arange(n_classes), step), np.repeat(n_classes - 1, n_samples - step * n_classes)])\n if shuffle:\n (X, y) = util_shuffle(X, y, random_state=generator)\n return X, y" }, { @@ -45617,7 +47394,8 @@ "docstring": { "type": "int, default=12000", "description": "The number of samples." 
- } + }, + "refined_type": {} }, { "name": "random_state", @@ -45627,13 +47405,14 @@ "docstring": { "type": "int, RandomState instance or None, default=None", "description": "Determines random number generation for dataset creation. Pass an int\nfor reproducible output across multiple function calls.\nSee :term:`Glossary `." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Generates data for binary classification used in Hastie et al. 2009, Example 10.2.\n\nThe ten features are standard independent Gaussian and the target ``y`` is defined by:: y[i] = 1 if np.sum(X[i] ** 2) > 9.34 else -1 Read more in the :ref:`User Guide `.", - "docstring": "Generates data for binary classification used in\nHastie et al. 2009, Example 10.2.\n\nThe ten features are standard independent Gaussian and\nthe target ``y`` is defined by::\n\n y[i] = 1 if np.sum(X[i] ** 2) > 9.34 else -1\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\nn_samples : int, default=12000\n The number of samples.\n\nrandom_state : int, RandomState instance or None, default=None\n Determines random number generation for dataset creation. Pass an int\n for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\nReturns\n-------\nX : ndarray of shape (n_samples, 10)\n The input samples.\n\ny : ndarray of shape (n_samples,)\n The output values.\n\nReferences\n----------\n.. [1] T. Hastie, R. Tibshirani and J. Friedman, \"Elements of Statistical\n Learning Ed. 2\", Springer, 2009.\n\nSee Also\n--------\nmake_gaussian_quantiles : A generalization of this dataset approach.", + "description": "Generates data for binary classification used in\nHastie et al. 2009, Example 10.2.\n\nThe ten features are standard independent Gaussian and\nthe target ``y`` is defined by::\n\n y[i] = 1 if np.sum(X[i] ** 2) > 9.34 else -1\n\nRead more in the :ref:`User Guide `.", + "docstring": "Generates data for binary classification used in\n Hastie et al. 2009, Example 10.2.\n\n The ten features are standard independent Gaussian and\n the target ``y`` is defined by::\n\n y[i] = 1 if np.sum(X[i] ** 2) > 9.34 else -1\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n n_samples : int, default=12000\n The number of samples.\n\n random_state : int, RandomState instance or None, default=None\n Determines random number generation for dataset creation. Pass an int\n for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n Returns\n -------\n X : ndarray of shape (n_samples, 10)\n The input samples.\n\n y : ndarray of shape (n_samples,)\n The output values.\n\n References\n ----------\n .. [1] T. Hastie, R. Tibshirani and J. Friedman, \"Elements of Statistical\n Learning Ed. 2\", Springer, 2009.\n\n See Also\n --------\n make_gaussian_quantiles : A generalization of this dataset approach.\n ", "source_code": "\ndef make_hastie_10_2(n_samples=12000, *, random_state=None):\n \"\"\"Generates data for binary classification used in\n Hastie et al. 2009, Example 10.2.\n\n The ten features are standard independent Gaussian and\n the target ``y`` is defined by::\n\n y[i] = 1 if np.sum(X[i] ** 2) > 9.34 else -1\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n n_samples : int, default=12000\n The number of samples.\n\n random_state : int, RandomState instance or None, default=None\n Determines random number generation for dataset creation. 
Pass an int\n for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n Returns\n -------\n X : ndarray of shape (n_samples, 10)\n The input samples.\n\n y : ndarray of shape (n_samples,)\n The output values.\n\n References\n ----------\n .. [1] T. Hastie, R. Tibshirani and J. Friedman, \"Elements of Statistical\n Learning Ed. 2\", Springer, 2009.\n\n See Also\n --------\n make_gaussian_quantiles : A generalization of this dataset approach.\n \"\"\"\n rs = check_random_state(random_state)\n shape = (n_samples, 10)\n X = rs.normal(size=shape).reshape(shape)\n y = ((X**2.0).sum(axis=1) > 9.34).astype(np.float64, copy=False)\n y[y == 0.0] = -1.0\n return X, y" }, { @@ -45651,7 +47430,8 @@ "docstring": { "type": "int, default=100", "description": "The number of samples." - } + }, + "refined_type": {} }, { "name": "n_features", @@ -45661,7 +47441,8 @@ "docstring": { "type": "int, default=100", "description": "The number of features." - } + }, + "refined_type": {} }, { "name": "effective_rank", @@ -45671,7 +47452,8 @@ "docstring": { "type": "int, default=10", "description": "The approximate number of singular vectors required to explain most of\nthe data by linear combinations." - } + }, + "refined_type": {} }, { "name": "tail_strength", @@ -45681,7 +47463,8 @@ "docstring": { "type": "float, default=0.5", "description": "The relative importance of the fat noisy tail of the singular values\nprofile. The value should be between 0 and 1." - } + }, + "refined_type": {} }, { "name": "random_state", @@ -45691,13 +47474,14 @@ "docstring": { "type": "int, RandomState instance or None, default=None", "description": "Determines random number generation for dataset creation. Pass an int\nfor reproducible output across multiple function calls.\nSee :term:`Glossary `." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Generate a mostly low rank matrix with bell-shaped singular values.\n\nMost of the variance can be explained by a bell-shaped curve of width effective_rank: the low rank part of the singular values profile is:: (1 - tail_strength) * exp(-1.0 * (i / effective_rank) ** 2) The remaining singular values' tail is fat, decreasing as:: tail_strength * exp(-0.1 * i / effective_rank). The low rank part of the profile can be considered the structured signal part of the data while the tail can be considered the noisy part of the data that cannot be summarized by a low number of linear components (singular vectors). 
This kind of singular profiles is often seen in practice, for instance: - gray level pictures of faces - TF-IDF vectors of text documents crawled from the web Read more in the :ref:`User Guide `.", - "docstring": "Generate a mostly low rank matrix with bell-shaped singular values.\n\nMost of the variance can be explained by a bell-shaped curve of width\neffective_rank: the low rank part of the singular values profile is::\n\n (1 - tail_strength) * exp(-1.0 * (i / effective_rank) ** 2)\n\nThe remaining singular values' tail is fat, decreasing as::\n\n tail_strength * exp(-0.1 * i / effective_rank).\n\nThe low rank part of the profile can be considered the structured\nsignal part of the data while the tail can be considered the noisy\npart of the data that cannot be summarized by a low number of linear\ncomponents (singular vectors).\n\nThis kind of singular profiles is often seen in practice, for instance:\n - gray level pictures of faces\n - TF-IDF vectors of text documents crawled from the web\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\nn_samples : int, default=100\n The number of samples.\n\nn_features : int, default=100\n The number of features.\n\neffective_rank : int, default=10\n The approximate number of singular vectors required to explain most of\n the data by linear combinations.\n\ntail_strength : float, default=0.5\n The relative importance of the fat noisy tail of the singular values\n profile. The value should be between 0 and 1.\n\nrandom_state : int, RandomState instance or None, default=None\n Determines random number generation for dataset creation. Pass an int\n for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\nReturns\n-------\nX : ndarray of shape (n_samples, n_features)\n The matrix.", + "description": "Generate a mostly low rank matrix with bell-shaped singular values.\n\nMost of the variance can be explained by a bell-shaped curve of width\neffective_rank: the low rank part of the singular values profile is::\n\n (1 - tail_strength) * exp(-1.0 * (i / effective_rank) ** 2)\n\nThe remaining singular values' tail is fat, decreasing as::\n\n tail_strength * exp(-0.1 * i / effective_rank).\n\nThe low rank part of the profile can be considered the structured\nsignal part of the data while the tail can be considered the noisy\npart of the data that cannot be summarized by a low number of linear\ncomponents (singular vectors).\n\nThis kind of singular profiles is often seen in practice, for instance:\n - gray level pictures of faces\n - TF-IDF vectors of text documents crawled from the web\n\nRead more in the :ref:`User Guide `.", + "docstring": "Generate a mostly low rank matrix with bell-shaped singular values.\n\n Most of the variance can be explained by a bell-shaped curve of width\n effective_rank: the low rank part of the singular values profile is::\n\n (1 - tail_strength) * exp(-1.0 * (i / effective_rank) ** 2)\n\n The remaining singular values' tail is fat, decreasing as::\n\n tail_strength * exp(-0.1 * i / effective_rank).\n\n The low rank part of the profile can be considered the structured\n signal part of the data while the tail can be considered the noisy\n part of the data that cannot be summarized by a low number of linear\n components (singular vectors).\n\n This kind of singular profiles is often seen in practice, for instance:\n - gray level pictures of faces\n - TF-IDF vectors of text documents crawled from the web\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n n_samples : 
int, default=100\n The number of samples.\n\n n_features : int, default=100\n The number of features.\n\n effective_rank : int, default=10\n The approximate number of singular vectors required to explain most of\n the data by linear combinations.\n\n tail_strength : float, default=0.5\n The relative importance of the fat noisy tail of the singular values\n profile. The value should be between 0 and 1.\n\n random_state : int, RandomState instance or None, default=None\n Determines random number generation for dataset creation. Pass an int\n for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n Returns\n -------\n X : ndarray of shape (n_samples, n_features)\n The matrix.\n ", "source_code": "\ndef make_low_rank_matrix(n_samples=100, n_features=100, *, effective_rank=10, tail_strength=0.5, random_state=None):\n \"\"\"Generate a mostly low rank matrix with bell-shaped singular values.\n\n Most of the variance can be explained by a bell-shaped curve of width\n effective_rank: the low rank part of the singular values profile is::\n\n (1 - tail_strength) * exp(-1.0 * (i / effective_rank) ** 2)\n\n The remaining singular values' tail is fat, decreasing as::\n\n tail_strength * exp(-0.1 * i / effective_rank).\n\n The low rank part of the profile can be considered the structured\n signal part of the data while the tail can be considered the noisy\n part of the data that cannot be summarized by a low number of linear\n components (singular vectors).\n\n This kind of singular profiles is often seen in practice, for instance:\n - gray level pictures of faces\n - TF-IDF vectors of text documents crawled from the web\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n n_samples : int, default=100\n The number of samples.\n\n n_features : int, default=100\n The number of features.\n\n effective_rank : int, default=10\n The approximate number of singular vectors required to explain most of\n the data by linear combinations.\n\n tail_strength : float, default=0.5\n The relative importance of the fat noisy tail of the singular values\n profile. The value should be between 0 and 1.\n\n random_state : int, RandomState instance or None, default=None\n Determines random number generation for dataset creation. Pass an int\n for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n Returns\n -------\n X : ndarray of shape (n_samples, n_features)\n The matrix.\n \"\"\"\n generator = check_random_state(random_state)\n n = min(n_samples, n_features)\n (u, _) = linalg.qr(generator.randn(n_samples, n), mode='economic', check_finite=False)\n (v, _) = linalg.qr(generator.randn(n_features, n), mode='economic', check_finite=False)\n singular_ind = np.arange(n, dtype=np.float64)\n low_rank = (1 - tail_strength) * np.exp(-1.0 * (singular_ind / effective_rank)**2)\n tail = tail_strength * np.exp(-0.1 * singular_ind / effective_rank)\n s = np.identity(n) * (low_rank + tail)\n return np.dot(np.dot(u, s), v.T)" }, { @@ -45715,7 +47499,8 @@ "docstring": { "type": "int or tuple of shape (2,), dtype=int, default=100", "description": "If int, the total number of points generated.\nIf two-element tuple, number of points in each of two moons.\n\n.. versionchanged:: 0.23\n Added two-element tuple." - } + }, + "refined_type": {} }, { "name": "shuffle", @@ -45725,7 +47510,8 @@ "docstring": { "type": "bool, default=True", "description": "Whether to shuffle the samples." 
- } + }, + "refined_type": {} }, { "name": "noise", @@ -45735,7 +47521,8 @@ "docstring": { "type": "float, default=None", "description": "Standard deviation of Gaussian noise added to the data." - } + }, + "refined_type": {} }, { "name": "random_state", @@ -45745,13 +47532,14 @@ "docstring": { "type": "int, RandomState instance or None, default=None", "description": "Determines random number generation for dataset shuffling and noise.\nPass an int for reproducible output across multiple function calls.\nSee :term:`Glossary `." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Make two interleaving half circles.\n\nA simple toy dataset to visualize clustering and classification algorithms. Read more in the :ref:`User Guide `.", - "docstring": "Make two interleaving half circles.\n\nA simple toy dataset to visualize clustering and classification\nalgorithms. Read more in the :ref:`User Guide `.\n\nParameters\n----------\nn_samples : int or tuple of shape (2,), dtype=int, default=100\n If int, the total number of points generated.\n If two-element tuple, number of points in each of two moons.\n\n .. versionchanged:: 0.23\n Added two-element tuple.\n\nshuffle : bool, default=True\n Whether to shuffle the samples.\n\nnoise : float, default=None\n Standard deviation of Gaussian noise added to the data.\n\nrandom_state : int, RandomState instance or None, default=None\n Determines random number generation for dataset shuffling and noise.\n Pass an int for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\nReturns\n-------\nX : ndarray of shape (n_samples, 2)\n The generated samples.\n\ny : ndarray of shape (n_samples,)\n The integer labels (0 or 1) for class membership of each sample.", + "description": "Make two interleaving half circles.\n\nA simple toy dataset to visualize clustering and classification\nalgorithms. Read more in the :ref:`User Guide `.", + "docstring": "Make two interleaving half circles.\n\n A simple toy dataset to visualize clustering and classification\n algorithms. Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n n_samples : int or tuple of shape (2,), dtype=int, default=100\n If int, the total number of points generated.\n If two-element tuple, number of points in each of two moons.\n\n .. versionchanged:: 0.23\n Added two-element tuple.\n\n shuffle : bool, default=True\n Whether to shuffle the samples.\n\n noise : float, default=None\n Standard deviation of Gaussian noise added to the data.\n\n random_state : int, RandomState instance or None, default=None\n Determines random number generation for dataset shuffling and noise.\n Pass an int for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n Returns\n -------\n X : ndarray of shape (n_samples, 2)\n The generated samples.\n\n y : ndarray of shape (n_samples,)\n The integer labels (0 or 1) for class membership of each sample.\n ", "source_code": "\ndef make_moons(n_samples=100, *, shuffle=True, noise=None, random_state=None):\n \"\"\"Make two interleaving half circles.\n\n A simple toy dataset to visualize clustering and classification\n algorithms. Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n n_samples : int or tuple of shape (2,), dtype=int, default=100\n If int, the total number of points generated.\n If two-element tuple, number of points in each of two moons.\n\n .. 
versionchanged:: 0.23\n Added two-element tuple.\n\n shuffle : bool, default=True\n Whether to shuffle the samples.\n\n noise : float, default=None\n Standard deviation of Gaussian noise added to the data.\n\n random_state : int, RandomState instance or None, default=None\n Determines random number generation for dataset shuffling and noise.\n Pass an int for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n Returns\n -------\n X : ndarray of shape (n_samples, 2)\n The generated samples.\n\n y : ndarray of shape (n_samples,)\n The integer labels (0 or 1) for class membership of each sample.\n \"\"\"\n if isinstance(n_samples, numbers.Integral):\n n_samples_out = n_samples // 2\n n_samples_in = n_samples - n_samples_out\n else:\n try:\n (n_samples_out, n_samples_in) = n_samples\n except ValueError as e:\n raise ValueError('`n_samples` can be either an int or a two-element tuple.') from e\n generator = check_random_state(random_state)\n outer_circ_x = np.cos(np.linspace(0, np.pi, n_samples_out))\n outer_circ_y = np.sin(np.linspace(0, np.pi, n_samples_out))\n inner_circ_x = 1 - np.cos(np.linspace(0, np.pi, n_samples_in))\n inner_circ_y = 1 - np.sin(np.linspace(0, np.pi, n_samples_in)) - 0.5\n X = np.vstack([np.append(outer_circ_x, inner_circ_x), np.append(outer_circ_y, inner_circ_y)]).T\n y = np.hstack([np.zeros(n_samples_out, dtype=np.intp), np.ones(n_samples_in, dtype=np.intp)])\n if shuffle:\n (X, y) = util_shuffle(X, y, random_state=generator)\n if noise is not None:\n X += generator.normal(scale=noise, size=X.shape)\n return X, y" }, { @@ -45769,7 +47557,8 @@ "docstring": { "type": "int, default=100", "description": "The number of samples." - } + }, + "refined_type": {} }, { "name": "n_features", @@ -45779,7 +47568,8 @@ "docstring": { "type": "int, default=20", "description": "The total number of features." - } + }, + "refined_type": {} }, { "name": "n_classes", @@ -45789,7 +47579,8 @@ "docstring": { "type": "int, default=5", "description": "The number of classes of the classification problem." - } + }, + "refined_type": {} }, { "name": "n_labels", @@ -45799,7 +47590,8 @@ "docstring": { "type": "int, default=2", "description": "The average number of labels per instance. More precisely, the number\nof labels per sample is drawn from a Poisson distribution with\n``n_labels`` as its expected value, but samples are bounded (using\nrejection sampling) by ``n_classes``, and must be nonzero if\n``allow_unlabeled`` is False." - } + }, + "refined_type": {} }, { "name": "length", @@ -45809,7 +47601,8 @@ "docstring": { "type": "int, default=50", "description": "The sum of the features (number of words if documents) is drawn from\na Poisson distribution with this expected value." - } + }, + "refined_type": {} }, { "name": "allow_unlabeled", @@ -45819,7 +47612,8 @@ "docstring": { "type": "bool, default=True", "description": "If ``True``, some instances might not belong to any class." - } + }, + "refined_type": {} }, { "name": "sparse", @@ -45829,7 +47623,8 @@ "docstring": { "type": "bool, default=False", "description": "If ``True``, return a sparse feature matrix\n\n.. versionadded:: 0.17\n parameter to allow *sparse* output." - } + }, + "refined_type": {} }, { "name": "return_indicator", @@ -45839,6 +47634,10 @@ "docstring": { "type": "{'dense', 'sparse'} or False, default='dense'", "description": "If ``'dense'`` return ``Y`` in the dense binary indicator format. 
If\n``'sparse'`` return ``Y`` in the sparse binary indicator format.\n``False`` returns a list of lists of labels." + }, + "refined_type": { + "kind": "EnumType", + "values": ["dense", "sparse"] } }, { @@ -45849,7 +47648,8 @@ "docstring": { "type": "bool, default=False", "description": "If ``True``, return the prior class probability and conditional\nprobabilities of features given classes, from which the data was\ndrawn." - } + }, + "refined_type": {} }, { "name": "random_state", @@ -45859,13 +47659,14 @@ "docstring": { "type": "int, RandomState instance or None, default=None", "description": "Determines random number generation for dataset creation. Pass an int\nfor reproducible output across multiple function calls.\nSee :term:`Glossary `." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Generate a random multilabel classification problem.\n\nFor each sample, the generative process is: - pick the number of labels: n ~ Poisson(n_labels) - n times, choose a class c: c ~ Multinomial(theta) - pick the document length: k ~ Poisson(length) - k times, choose a word: w ~ Multinomial(theta_c) In the above process, rejection sampling is used to make sure that n is never zero or more than `n_classes`, and that the document length is never zero. Likewise, we reject classes which have already been chosen. Read more in the :ref:`User Guide `.", - "docstring": "Generate a random multilabel classification problem.\n\nFor each sample, the generative process is:\n - pick the number of labels: n ~ Poisson(n_labels)\n - n times, choose a class c: c ~ Multinomial(theta)\n - pick the document length: k ~ Poisson(length)\n - k times, choose a word: w ~ Multinomial(theta_c)\n\nIn the above process, rejection sampling is used to make sure that\nn is never zero or more than `n_classes`, and that the document length\nis never zero. Likewise, we reject classes which have already been chosen.\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\nn_samples : int, default=100\n The number of samples.\n\nn_features : int, default=20\n The total number of features.\n\nn_classes : int, default=5\n The number of classes of the classification problem.\n\nn_labels : int, default=2\n The average number of labels per instance. More precisely, the number\n of labels per sample is drawn from a Poisson distribution with\n ``n_labels`` as its expected value, but samples are bounded (using\n rejection sampling) by ``n_classes``, and must be nonzero if\n ``allow_unlabeled`` is False.\n\nlength : int, default=50\n The sum of the features (number of words if documents) is drawn from\n a Poisson distribution with this expected value.\n\nallow_unlabeled : bool, default=True\n If ``True``, some instances might not belong to any class.\n\nsparse : bool, default=False\n If ``True``, return a sparse feature matrix\n\n .. versionadded:: 0.17\n parameter to allow *sparse* output.\n\nreturn_indicator : {'dense', 'sparse'} or False, default='dense'\n If ``'dense'`` return ``Y`` in the dense binary indicator format. If\n ``'sparse'`` return ``Y`` in the sparse binary indicator format.\n ``False`` returns a list of lists of labels.\n\nreturn_distributions : bool, default=False\n If ``True``, return the prior class probability and conditional\n probabilities of features given classes, from which the data was\n drawn.\n\nrandom_state : int, RandomState instance or None, default=None\n Determines random number generation for dataset creation. 
Pass an int\n for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\nReturns\n-------\nX : ndarray of shape (n_samples, n_features)\n The generated samples.\n\nY : {ndarray, sparse matrix} of shape (n_samples, n_classes)\n The label sets. Sparse matrix should be of CSR format.\n\np_c : ndarray of shape (n_classes,)\n The probability of each class being drawn. Only returned if\n ``return_distributions=True``.\n\np_w_c : ndarray of shape (n_features, n_classes)\n The probability of each feature being drawn given each class.\n Only returned if ``return_distributions=True``.", + "description": "Generate a random multilabel classification problem.\n\nFor each sample, the generative process is:\n - pick the number of labels: n ~ Poisson(n_labels)\n - n times, choose a class c: c ~ Multinomial(theta)\n - pick the document length: k ~ Poisson(length)\n - k times, choose a word: w ~ Multinomial(theta_c)\n\nIn the above process, rejection sampling is used to make sure that\nn is never zero or more than `n_classes`, and that the document length\nis never zero. Likewise, we reject classes which have already been chosen.\n\nRead more in the :ref:`User Guide `.", + "docstring": "Generate a random multilabel classification problem.\n\n For each sample, the generative process is:\n - pick the number of labels: n ~ Poisson(n_labels)\n - n times, choose a class c: c ~ Multinomial(theta)\n - pick the document length: k ~ Poisson(length)\n - k times, choose a word: w ~ Multinomial(theta_c)\n\n In the above process, rejection sampling is used to make sure that\n n is never zero or more than `n_classes`, and that the document length\n is never zero. Likewise, we reject classes which have already been chosen.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n n_samples : int, default=100\n The number of samples.\n\n n_features : int, default=20\n The total number of features.\n\n n_classes : int, default=5\n The number of classes of the classification problem.\n\n n_labels : int, default=2\n The average number of labels per instance. More precisely, the number\n of labels per sample is drawn from a Poisson distribution with\n ``n_labels`` as its expected value, but samples are bounded (using\n rejection sampling) by ``n_classes``, and must be nonzero if\n ``allow_unlabeled`` is False.\n\n length : int, default=50\n The sum of the features (number of words if documents) is drawn from\n a Poisson distribution with this expected value.\n\n allow_unlabeled : bool, default=True\n If ``True``, some instances might not belong to any class.\n\n sparse : bool, default=False\n If ``True``, return a sparse feature matrix\n\n .. versionadded:: 0.17\n parameter to allow *sparse* output.\n\n return_indicator : {'dense', 'sparse'} or False, default='dense'\n If ``'dense'`` return ``Y`` in the dense binary indicator format. If\n ``'sparse'`` return ``Y`` in the sparse binary indicator format.\n ``False`` returns a list of lists of labels.\n\n return_distributions : bool, default=False\n If ``True``, return the prior class probability and conditional\n probabilities of features given classes, from which the data was\n drawn.\n\n random_state : int, RandomState instance or None, default=None\n Determines random number generation for dataset creation. 
Pass an int\n for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n Returns\n -------\n X : ndarray of shape (n_samples, n_features)\n The generated samples.\n\n Y : {ndarray, sparse matrix} of shape (n_samples, n_classes)\n The label sets. Sparse matrix should be of CSR format.\n\n p_c : ndarray of shape (n_classes,)\n The probability of each class being drawn. Only returned if\n ``return_distributions=True``.\n\n p_w_c : ndarray of shape (n_features, n_classes)\n The probability of each feature being drawn given each class.\n Only returned if ``return_distributions=True``.\n\n ", "source_code": "\ndef make_multilabel_classification(n_samples=100, n_features=20, *, n_classes=5, n_labels=2, length=50, allow_unlabeled=True, sparse=False, return_indicator='dense', return_distributions=False, random_state=None):\n \"\"\"Generate a random multilabel classification problem.\n\n For each sample, the generative process is:\n - pick the number of labels: n ~ Poisson(n_labels)\n - n times, choose a class c: c ~ Multinomial(theta)\n - pick the document length: k ~ Poisson(length)\n - k times, choose a word: w ~ Multinomial(theta_c)\n\n In the above process, rejection sampling is used to make sure that\n n is never zero or more than `n_classes`, and that the document length\n is never zero. Likewise, we reject classes which have already been chosen.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n n_samples : int, default=100\n The number of samples.\n\n n_features : int, default=20\n The total number of features.\n\n n_classes : int, default=5\n The number of classes of the classification problem.\n\n n_labels : int, default=2\n The average number of labels per instance. More precisely, the number\n of labels per sample is drawn from a Poisson distribution with\n ``n_labels`` as its expected value, but samples are bounded (using\n rejection sampling) by ``n_classes``, and must be nonzero if\n ``allow_unlabeled`` is False.\n\n length : int, default=50\n The sum of the features (number of words if documents) is drawn from\n a Poisson distribution with this expected value.\n\n allow_unlabeled : bool, default=True\n If ``True``, some instances might not belong to any class.\n\n sparse : bool, default=False\n If ``True``, return a sparse feature matrix\n\n .. versionadded:: 0.17\n parameter to allow *sparse* output.\n\n return_indicator : {'dense', 'sparse'} or False, default='dense'\n If ``'dense'`` return ``Y`` in the dense binary indicator format. If\n ``'sparse'`` return ``Y`` in the sparse binary indicator format.\n ``False`` returns a list of lists of labels.\n\n return_distributions : bool, default=False\n If ``True``, return the prior class probability and conditional\n probabilities of features given classes, from which the data was\n drawn.\n\n random_state : int, RandomState instance or None, default=None\n Determines random number generation for dataset creation. Pass an int\n for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n Returns\n -------\n X : ndarray of shape (n_samples, n_features)\n The generated samples.\n\n Y : {ndarray, sparse matrix} of shape (n_samples, n_classes)\n The label sets. Sparse matrix should be of CSR format.\n\n p_c : ndarray of shape (n_classes,)\n The probability of each class being drawn. 
Only returned if\n ``return_distributions=True``.\n\n p_w_c : ndarray of shape (n_features, n_classes)\n The probability of each feature being drawn given each class.\n Only returned if ``return_distributions=True``.\n\n \"\"\"\n if n_classes < 1:\n raise ValueError(\"'n_classes' should be an integer greater than 0. Got {} instead.\".format(n_classes))\n if length < 1:\n raise ValueError(\"'length' should be an integer greater than 0. Got {} instead.\".format(length))\n generator = check_random_state(random_state)\n p_c = generator.rand(n_classes)\n p_c /= p_c.sum()\n cumulative_p_c = np.cumsum(p_c)\n p_w_c = generator.rand(n_features, n_classes)\n p_w_c /= np.sum(p_w_c, axis=0)\n \n def sample_example():\n (_, n_classes) = p_w_c.shape\n y_size = n_classes + 1\n while not allow_unlabeled and y_size == 0 or y_size > n_classes:\n y_size = generator.poisson(n_labels)\n y = set()\n while len(y) != y_size:\n c = np.searchsorted(cumulative_p_c, generator.rand(y_size - len(y)))\n y.update(c)\n y = list(y)\n n_words = 0\n while n_words == 0:\n n_words = generator.poisson(length)\n if len(y) == 0:\n words = generator.randint(n_features, size=n_words)\n return words, y\n cumulative_p_w_sample = p_w_c.take(y, axis=1).sum(axis=1).cumsum()\n cumulative_p_w_sample /= cumulative_p_w_sample[-1]\n words = np.searchsorted(cumulative_p_w_sample, generator.rand(n_words))\n return words, y\n X_indices = array.array('i')\n X_indptr = array.array('i', [0])\n Y = []\n for i in range(n_samples):\n (words, y) = sample_example()\n X_indices.extend(words)\n X_indptr.append(len(X_indices))\n Y.append(y)\n X_data = np.ones(len(X_indices), dtype=np.float64)\n X = sp.csr_matrix((X_data, X_indices, X_indptr), shape=(n_samples, n_features))\n X.sum_duplicates()\n if not sparse:\n X = X.toarray()\n if return_indicator in (True, 'sparse', 'dense'):\n lb = MultiLabelBinarizer(sparse_output=return_indicator == 'sparse')\n Y = lb.fit([range(n_classes)]).transform(Y)\n elif return_indicator is not False:\n raise ValueError(\"return_indicator must be either 'sparse', 'dense' or False.\")\n if return_distributions:\n return X, Y, p_c, p_w_c\n return X, Y" }, { @@ -45883,7 +47684,8 @@ "docstring": { "type": "int, default=100", "description": "The number of samples." - } + }, + "refined_type": {} }, { "name": "n_features", @@ -45893,7 +47695,8 @@ "docstring": { "type": "int, default=100", "description": "The number of features." - } + }, + "refined_type": {} }, { "name": "n_informative", @@ -45903,7 +47706,8 @@ "docstring": { "type": "int, default=10", "description": "The number of informative features, i.e., the number of features used\nto build the linear model used to generate the output." - } + }, + "refined_type": {} }, { "name": "n_targets", @@ -45913,7 +47717,8 @@ "docstring": { "type": "int, default=1", "description": "The number of regression targets, i.e., the dimension of the y output\nvector associated with a sample. By default, the output is a scalar." - } + }, + "refined_type": {} }, { "name": "bias", @@ -45923,7 +47728,8 @@ "docstring": { "type": "float, default=0.0", "description": "The bias term in the underlying linear model." - } + }, + "refined_type": {} }, { "name": "effective_rank", @@ -45933,7 +47739,8 @@ "docstring": { "type": "int, default=None", "description": "if not None:\n The approximate number of singular vectors required to explain most\n of the input data by linear combinations. 
Using this kind of\n singular spectrum in the input allows the generator to reproduce\n the correlations often observed in practice.\nif None:\n The input set is well conditioned, centered and gaussian with\n unit variance." - } + }, + "refined_type": {} }, { "name": "tail_strength", @@ -45943,7 +47750,8 @@ "docstring": { "type": "float, default=0.5", "description": "The relative importance of the fat noisy tail of the singular values\nprofile if `effective_rank` is not None. When a float, it should be\nbetween 0 and 1." - } + }, + "refined_type": {} }, { "name": "noise", @@ -45953,7 +47761,8 @@ "docstring": { "type": "float, default=0.0", "description": "The standard deviation of the gaussian noise applied to the output." - } + }, + "refined_type": {} }, { "name": "shuffle", @@ -45963,7 +47772,8 @@ "docstring": { "type": "bool, default=True", "description": "Shuffle the samples and the features." - } + }, + "refined_type": {} }, { "name": "coef", @@ -45973,7 +47783,8 @@ "docstring": { "type": "bool, default=False", "description": "If True, the coefficients of the underlying linear model are returned." - } + }, + "refined_type": {} }, { "name": "random_state", @@ -45983,13 +47794,14 @@ "docstring": { "type": "int, RandomState instance or None, default=None", "description": "Determines random number generation for dataset creation. Pass an int\nfor reproducible output across multiple function calls.\nSee :term:`Glossary `." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Generate a random regression problem.\n\nThe input set can either be well conditioned (by default) or have a low rank-fat tail singular profile. See :func:`make_low_rank_matrix` for more details. The output is generated by applying a (potentially biased) random linear regression model with `n_informative` nonzero regressors to the previously generated input and some gaussian centered noise with some adjustable scale. Read more in the :ref:`User Guide `.", - "docstring": "Generate a random regression problem.\n\nThe input set can either be well conditioned (by default) or have a low\nrank-fat tail singular profile. See :func:`make_low_rank_matrix` for\nmore details.\n\nThe output is generated by applying a (potentially biased) random linear\nregression model with `n_informative` nonzero regressors to the previously\ngenerated input and some gaussian centered noise with some adjustable\nscale.\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\nn_samples : int, default=100\n The number of samples.\n\nn_features : int, default=100\n The number of features.\n\nn_informative : int, default=10\n The number of informative features, i.e., the number of features used\n to build the linear model used to generate the output.\n\nn_targets : int, default=1\n The number of regression targets, i.e., the dimension of the y output\n vector associated with a sample. By default, the output is a scalar.\n\nbias : float, default=0.0\n The bias term in the underlying linear model.\n\neffective_rank : int, default=None\n if not None:\n The approximate number of singular vectors required to explain most\n of the input data by linear combinations. 
Using this kind of\n singular spectrum in the input allows the generator to reproduce\n the correlations often observed in practice.\n if None:\n The input set is well conditioned, centered and gaussian with\n unit variance.\n\ntail_strength : float, default=0.5\n The relative importance of the fat noisy tail of the singular values\n profile if `effective_rank` is not None. When a float, it should be\n between 0 and 1.\n\nnoise : float, default=0.0\n The standard deviation of the gaussian noise applied to the output.\n\nshuffle : bool, default=True\n Shuffle the samples and the features.\n\ncoef : bool, default=False\n If True, the coefficients of the underlying linear model are returned.\n\nrandom_state : int, RandomState instance or None, default=None\n Determines random number generation for dataset creation. Pass an int\n for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\nReturns\n-------\nX : ndarray of shape (n_samples, n_features)\n The input samples.\n\ny : ndarray of shape (n_samples,) or (n_samples, n_targets)\n The output values.\n\ncoef : ndarray of shape (n_features,) or (n_features, n_targets)\n The coefficient of the underlying linear model. It is returned only if\n coef is True.", + "description": "Generate a random regression problem.\n\nThe input set can either be well conditioned (by default) or have a low\nrank-fat tail singular profile. See :func:`make_low_rank_matrix` for\nmore details.\n\nThe output is generated by applying a (potentially biased) random linear\nregression model with `n_informative` nonzero regressors to the previously\ngenerated input and some gaussian centered noise with some adjustable\nscale.\n\nRead more in the :ref:`User Guide `.", + "docstring": "Generate a random regression problem.\n\n The input set can either be well conditioned (by default) or have a low\n rank-fat tail singular profile. See :func:`make_low_rank_matrix` for\n more details.\n\n The output is generated by applying a (potentially biased) random linear\n regression model with `n_informative` nonzero regressors to the previously\n generated input and some gaussian centered noise with some adjustable\n scale.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n n_samples : int, default=100\n The number of samples.\n\n n_features : int, default=100\n The number of features.\n\n n_informative : int, default=10\n The number of informative features, i.e., the number of features used\n to build the linear model used to generate the output.\n\n n_targets : int, default=1\n The number of regression targets, i.e., the dimension of the y output\n vector associated with a sample. By default, the output is a scalar.\n\n bias : float, default=0.0\n The bias term in the underlying linear model.\n\n effective_rank : int, default=None\n if not None:\n The approximate number of singular vectors required to explain most\n of the input data by linear combinations. Using this kind of\n singular spectrum in the input allows the generator to reproduce\n the correlations often observed in practice.\n if None:\n The input set is well conditioned, centered and gaussian with\n unit variance.\n\n tail_strength : float, default=0.5\n The relative importance of the fat noisy tail of the singular values\n profile if `effective_rank` is not None. 
When a float, it should be\n between 0 and 1.\n\n noise : float, default=0.0\n The standard deviation of the gaussian noise applied to the output.\n\n shuffle : bool, default=True\n Shuffle the samples and the features.\n\n coef : bool, default=False\n If True, the coefficients of the underlying linear model are returned.\n\n random_state : int, RandomState instance or None, default=None\n Determines random number generation for dataset creation. Pass an int\n for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n Returns\n -------\n X : ndarray of shape (n_samples, n_features)\n The input samples.\n\n y : ndarray of shape (n_samples,) or (n_samples, n_targets)\n The output values.\n\n coef : ndarray of shape (n_features,) or (n_features, n_targets)\n The coefficient of the underlying linear model. It is returned only if\n coef is True.\n ", "source_code": "\ndef make_regression(n_samples=100, n_features=100, *, n_informative=10, n_targets=1, bias=0.0, effective_rank=None, tail_strength=0.5, noise=0.0, shuffle=True, coef=False, random_state=None):\n \"\"\"Generate a random regression problem.\n\n The input set can either be well conditioned (by default) or have a low\n rank-fat tail singular profile. See :func:`make_low_rank_matrix` for\n more details.\n\n The output is generated by applying a (potentially biased) random linear\n regression model with `n_informative` nonzero regressors to the previously\n generated input and some gaussian centered noise with some adjustable\n scale.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n n_samples : int, default=100\n The number of samples.\n\n n_features : int, default=100\n The number of features.\n\n n_informative : int, default=10\n The number of informative features, i.e., the number of features used\n to build the linear model used to generate the output.\n\n n_targets : int, default=1\n The number of regression targets, i.e., the dimension of the y output\n vector associated with a sample. By default, the output is a scalar.\n\n bias : float, default=0.0\n The bias term in the underlying linear model.\n\n effective_rank : int, default=None\n if not None:\n The approximate number of singular vectors required to explain most\n of the input data by linear combinations. Using this kind of\n singular spectrum in the input allows the generator to reproduce\n the correlations often observed in practice.\n if None:\n The input set is well conditioned, centered and gaussian with\n unit variance.\n\n tail_strength : float, default=0.5\n The relative importance of the fat noisy tail of the singular values\n profile if `effective_rank` is not None. When a float, it should be\n between 0 and 1.\n\n noise : float, default=0.0\n The standard deviation of the gaussian noise applied to the output.\n\n shuffle : bool, default=True\n Shuffle the samples and the features.\n\n coef : bool, default=False\n If True, the coefficients of the underlying linear model are returned.\n\n random_state : int, RandomState instance or None, default=None\n Determines random number generation for dataset creation. Pass an int\n for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n Returns\n -------\n X : ndarray of shape (n_samples, n_features)\n The input samples.\n\n y : ndarray of shape (n_samples,) or (n_samples, n_targets)\n The output values.\n\n coef : ndarray of shape (n_features,) or (n_features, n_targets)\n The coefficient of the underlying linear model. 
It is returned only if\n coef is True.\n \"\"\"\n n_informative = min(n_features, n_informative)\n generator = check_random_state(random_state)\n if effective_rank is None:\n X = generator.randn(n_samples, n_features)\n else:\n X = make_low_rank_matrix(n_samples=n_samples, n_features=n_features, effective_rank=effective_rank, tail_strength=tail_strength, random_state=generator)\n ground_truth = np.zeros((n_features, n_targets))\n ground_truth[:n_informative, :] = 100 * generator.rand(n_informative, n_targets)\n y = np.dot(X, ground_truth) + bias\n if noise > 0.0:\n y += generator.normal(scale=noise, size=y.shape)\n if shuffle:\n (X, y) = util_shuffle(X, y, random_state=generator)\n indices = np.arange(n_features)\n generator.shuffle(indices)\n X[:, :] = X[:, indices]\n ground_truth = ground_truth[indices]\n y = np.squeeze(y)\n if coef:\n return X, y, np.squeeze(ground_truth)\n else:\n return X, y" }, { @@ -46007,7 +47819,8 @@ "docstring": { "type": "int, default=100", "description": "The number of sample points on the S curve." - } + }, + "refined_type": {} }, { "name": "noise", @@ -46017,7 +47830,8 @@ "docstring": { "type": "float, default=0.0", "description": "The standard deviation of the gaussian noise." - } + }, + "refined_type": {} }, { "name": "random_state", @@ -46027,13 +47841,14 @@ "docstring": { "type": "int, RandomState instance or None, default=None", "description": "Determines random number generation for dataset creation. Pass an int\nfor reproducible output across multiple function calls.\nSee :term:`Glossary `." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Generate an S curve dataset.\n\nRead more in the :ref:`User Guide `.", - "docstring": "Generate an S curve dataset.\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\nn_samples : int, default=100\n The number of sample points on the S curve.\n\nnoise : float, default=0.0\n The standard deviation of the gaussian noise.\n\nrandom_state : int, RandomState instance or None, default=None\n Determines random number generation for dataset creation. Pass an int\n for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\nReturns\n-------\nX : ndarray of shape (n_samples, 3)\n The points.\n\nt : ndarray of shape (n_samples,)\n The univariate position of the sample according to the main dimension\n of the points in the manifold.", + "docstring": "Generate an S curve dataset.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n n_samples : int, default=100\n The number of sample points on the S curve.\n\n noise : float, default=0.0\n The standard deviation of the gaussian noise.\n\n random_state : int, RandomState instance or None, default=None\n Determines random number generation for dataset creation. 
Pass an int\n for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n Returns\n -------\n X : ndarray of shape (n_samples, 3)\n The points.\n\n t : ndarray of shape (n_samples,)\n The univariate position of the sample according to the main dimension\n of the points in the manifold.\n ", "source_code": "\ndef make_s_curve(n_samples=100, *, noise=0.0, random_state=None):\n \"\"\"Generate an S curve dataset.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n n_samples : int, default=100\n The number of sample points on the S curve.\n\n noise : float, default=0.0\n The standard deviation of the gaussian noise.\n\n random_state : int, RandomState instance or None, default=None\n Determines random number generation for dataset creation. Pass an int\n for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n Returns\n -------\n X : ndarray of shape (n_samples, 3)\n The points.\n\n t : ndarray of shape (n_samples,)\n The univariate position of the sample according to the main dimension\n of the points in the manifold.\n \"\"\"\n generator = check_random_state(random_state)\n t = 3 * np.pi * (generator.rand(1, n_samples) - 0.5)\n x = np.sin(t)\n y = 2.0 * generator.rand(1, n_samples)\n z = np.sign(t) * (np.cos(t) - 1)\n X = np.concatenate((x, y, z))\n X += noise * generator.randn(3, n_samples)\n X = X.T\n t = np.squeeze(t)\n return X, t" }, { @@ -46051,7 +47866,8 @@ "docstring": { "type": "int", "description": "Number of samples to generate" - } + }, + "refined_type": {} }, { "name": "n_components", @@ -46061,7 +47877,8 @@ "docstring": { "type": "int", "description": "Number of components in the dictionary" - } + }, + "refined_type": {} }, { "name": "n_features", @@ -46071,7 +47888,8 @@ "docstring": { "type": "int", "description": "Number of features of the dataset to generate" - } + }, + "refined_type": {} }, { "name": "n_nonzero_coefs", @@ -46081,7 +47899,8 @@ "docstring": { "type": "int", "description": "Number of active (non-zero) coefficients in each sample" - } + }, + "refined_type": {} }, { "name": "random_state", @@ -46091,13 +47910,14 @@ "docstring": { "type": "int, RandomState instance or None, default=None", "description": "Determines random number generation for dataset creation. Pass an int\nfor reproducible output across multiple function calls.\nSee :term:`Glossary `." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Generate a signal as a sparse combination of dictionary elements.\n\nReturns a matrix Y = DX, such as D is (n_features, n_components), X is (n_components, n_samples) and each column of X has exactly n_nonzero_coefs non-zero elements. Read more in the :ref:`User Guide `.", - "docstring": "Generate a signal as a sparse combination of dictionary elements.\n\nReturns a matrix Y = DX, such as D is (n_features, n_components),\nX is (n_components, n_samples) and each column of X has exactly\nn_nonzero_coefs non-zero elements.\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\nn_samples : int\n Number of samples to generate\n\nn_components : int\n Number of components in the dictionary\n\nn_features : int\n Number of features of the dataset to generate\n\nn_nonzero_coefs : int\n Number of active (non-zero) coefficients in each sample\n\nrandom_state : int, RandomState instance or None, default=None\n Determines random number generation for dataset creation. 
Pass an int\n for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\nReturns\n-------\ndata : ndarray of shape (n_features, n_samples)\n The encoded signal (Y).\n\ndictionary : ndarray of shape (n_features, n_components)\n The dictionary with normalized components (D).\n\ncode : ndarray of shape (n_components, n_samples)\n The sparse code such that each column of this matrix has exactly\n n_nonzero_coefs non-zero items (X).", + "description": "Generate a signal as a sparse combination of dictionary elements.\n\nReturns a matrix Y = DX, such as D is (n_features, n_components),\nX is (n_components, n_samples) and each column of X has exactly\nn_nonzero_coefs non-zero elements.\n\nRead more in the :ref:`User Guide `.", + "docstring": "Generate a signal as a sparse combination of dictionary elements.\n\n Returns a matrix Y = DX, such as D is (n_features, n_components),\n X is (n_components, n_samples) and each column of X has exactly\n n_nonzero_coefs non-zero elements.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n n_samples : int\n Number of samples to generate\n\n n_components : int\n Number of components in the dictionary\n\n n_features : int\n Number of features of the dataset to generate\n\n n_nonzero_coefs : int\n Number of active (non-zero) coefficients in each sample\n\n random_state : int, RandomState instance or None, default=None\n Determines random number generation for dataset creation. Pass an int\n for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n Returns\n -------\n data : ndarray of shape (n_features, n_samples)\n The encoded signal (Y).\n\n dictionary : ndarray of shape (n_features, n_components)\n The dictionary with normalized components (D).\n\n code : ndarray of shape (n_components, n_samples)\n The sparse code such that each column of this matrix has exactly\n n_nonzero_coefs non-zero items (X).\n\n ", "source_code": "\ndef make_sparse_coded_signal(n_samples, *, n_components, n_features, n_nonzero_coefs, random_state=None):\n \"\"\"Generate a signal as a sparse combination of dictionary elements.\n\n Returns a matrix Y = DX, such as D is (n_features, n_components),\n X is (n_components, n_samples) and each column of X has exactly\n n_nonzero_coefs non-zero elements.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n n_samples : int\n Number of samples to generate\n\n n_components : int\n Number of components in the dictionary\n\n n_features : int\n Number of features of the dataset to generate\n\n n_nonzero_coefs : int\n Number of active (non-zero) coefficients in each sample\n\n random_state : int, RandomState instance or None, default=None\n Determines random number generation for dataset creation. 
Pass an int\n for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n Returns\n -------\n data : ndarray of shape (n_features, n_samples)\n The encoded signal (Y).\n\n dictionary : ndarray of shape (n_features, n_components)\n The dictionary with normalized components (D).\n\n code : ndarray of shape (n_components, n_samples)\n The sparse code such that each column of this matrix has exactly\n n_nonzero_coefs non-zero items (X).\n\n \"\"\"\n generator = check_random_state(random_state)\n D = generator.randn(n_features, n_components)\n D /= np.sqrt(np.sum(D**2, axis=0))\n X = np.zeros((n_components, n_samples))\n for i in range(n_samples):\n idx = np.arange(n_components)\n generator.shuffle(idx)\n idx = idx[:n_nonzero_coefs]\n X[idx, i] = generator.randn(n_nonzero_coefs)\n Y = np.dot(D, X)\n return map(np.squeeze, (Y, D, X))" }, { @@ -46115,7 +47935,8 @@ "docstring": { "type": "int, default=1", "description": "The size of the random matrix to generate." - } + }, + "refined_type": {} }, { "name": "alpha", @@ -46125,7 +47946,8 @@ "docstring": { "type": "float, default=0.95", "description": "The probability that a coefficient is zero (see notes). Larger values\nenforce more sparsity. The value should be in the range 0 and 1." - } + }, + "refined_type": {} }, { "name": "norm_diag", @@ -46135,7 +47957,8 @@ "docstring": { "type": "bool, default=False", "description": "Whether to normalize the output matrix to make the leading diagonal\nelements all 1" - } + }, + "refined_type": {} }, { "name": "smallest_coef", @@ -46145,7 +47968,8 @@ "docstring": { "type": "float, default=0.1", "description": "The value of the smallest coefficient between 0 and 1." - } + }, + "refined_type": {} }, { "name": "largest_coef", @@ -46155,7 +47979,8 @@ "docstring": { "type": "float, default=0.9", "description": "The value of the largest coefficient between 0 and 1." - } + }, + "refined_type": {} }, { "name": "random_state", @@ -46165,13 +47990,14 @@ "docstring": { "type": "int, RandomState instance or None, default=None", "description": "Determines random number generation for dataset creation. Pass an int\nfor reproducible output across multiple function calls.\nSee :term:`Glossary `." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Generate a sparse symmetric definite positive matrix.\n\nRead more in the :ref:`User Guide `.", - "docstring": "Generate a sparse symmetric definite positive matrix.\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\ndim : int, default=1\n The size of the random matrix to generate.\n\nalpha : float, default=0.95\n The probability that a coefficient is zero (see notes). Larger values\n enforce more sparsity. The value should be in the range 0 and 1.\n\nnorm_diag : bool, default=False\n Whether to normalize the output matrix to make the leading diagonal\n elements all 1\n\nsmallest_coef : float, default=0.1\n The value of the smallest coefficient between 0 and 1.\n\nlargest_coef : float, default=0.9\n The value of the largest coefficient between 0 and 1.\n\nrandom_state : int, RandomState instance or None, default=None\n Determines random number generation for dataset creation. 
Pass an int\n for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\nReturns\n-------\nprec : sparse matrix of shape (dim, dim)\n The generated matrix.\n\nNotes\n-----\nThe sparsity is actually imposed on the cholesky factor of the matrix.\nThus alpha does not translate directly into the filling fraction of\nthe matrix itself.\n\nSee Also\n--------\nmake_spd_matrix", + "docstring": "Generate a sparse symmetric definite positive matrix.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n dim : int, default=1\n The size of the random matrix to generate.\n\n alpha : float, default=0.95\n The probability that a coefficient is zero (see notes). Larger values\n enforce more sparsity. The value should be in the range 0 and 1.\n\n norm_diag : bool, default=False\n Whether to normalize the output matrix to make the leading diagonal\n elements all 1\n\n smallest_coef : float, default=0.1\n The value of the smallest coefficient between 0 and 1.\n\n largest_coef : float, default=0.9\n The value of the largest coefficient between 0 and 1.\n\n random_state : int, RandomState instance or None, default=None\n Determines random number generation for dataset creation. Pass an int\n for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n Returns\n -------\n prec : sparse matrix of shape (dim, dim)\n The generated matrix.\n\n Notes\n -----\n The sparsity is actually imposed on the cholesky factor of the matrix.\n Thus alpha does not translate directly into the filling fraction of\n the matrix itself.\n\n See Also\n --------\n make_spd_matrix\n ", "source_code": "\ndef make_sparse_spd_matrix(dim=1, *, alpha=0.95, norm_diag=False, smallest_coef=0.1, largest_coef=0.9, random_state=None):\n \"\"\"Generate a sparse symmetric definite positive matrix.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n dim : int, default=1\n The size of the random matrix to generate.\n\n alpha : float, default=0.95\n The probability that a coefficient is zero (see notes). Larger values\n enforce more sparsity. The value should be in the range 0 and 1.\n\n norm_diag : bool, default=False\n Whether to normalize the output matrix to make the leading diagonal\n elements all 1\n\n smallest_coef : float, default=0.1\n The value of the smallest coefficient between 0 and 1.\n\n largest_coef : float, default=0.9\n The value of the largest coefficient between 0 and 1.\n\n random_state : int, RandomState instance or None, default=None\n Determines random number generation for dataset creation. 
Pass an int\n for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n Returns\n -------\n prec : sparse matrix of shape (dim, dim)\n The generated matrix.\n\n Notes\n -----\n The sparsity is actually imposed on the cholesky factor of the matrix.\n Thus alpha does not translate directly into the filling fraction of\n the matrix itself.\n\n See Also\n --------\n make_spd_matrix\n \"\"\"\n random_state = check_random_state(random_state)\n chol = -np.eye(dim)\n aux = random_state.rand(dim, dim)\n aux[aux < alpha] = 0\n aux[aux > alpha] = smallest_coef + (largest_coef - smallest_coef) * random_state.rand(np.sum(aux > alpha))\n aux = np.tril(aux, k=-1)\n permutation = random_state.permutation(dim)\n aux = aux[permutation].T[permutation]\n chol += aux\n prec = np.dot(chol.T, chol)\n if norm_diag:\n d = np.diag(prec).reshape(1, prec.shape[0])\n d = 1.0 / np.sqrt(d)\n prec *= d\n prec *= d.T\n return prec" }, { @@ -46189,7 +48015,8 @@ "docstring": { "type": "int, default=100", "description": "The number of samples." - } + }, + "refined_type": {} }, { "name": "n_features", @@ -46199,7 +48026,8 @@ "docstring": { "type": "int, default=10", "description": "The number of features." - } + }, + "refined_type": {} }, { "name": "random_state", @@ -46209,13 +48037,14 @@ "docstring": { "type": "int, RandomState instance or None, default=None", "description": "Determines random number generation for dataset creation. Pass an int\nfor reproducible output across multiple function calls.\nSee :term:`Glossary `." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Generate a random regression problem with sparse uncorrelated design.\n\nThis dataset is described in Celeux et al [1]. as:: X ~ N(0, 1) y(X) = X[:, 0] + 2 * X[:, 1] - 2 * X[:, 2] - 1.5 * X[:, 3] Only the first 4 features are informative. The remaining features are useless. Read more in the :ref:`User Guide `.", - "docstring": "Generate a random regression problem with sparse uncorrelated design.\n\nThis dataset is described in Celeux et al [1]. as::\n\n X ~ N(0, 1)\n y(X) = X[:, 0] + 2 * X[:, 1] - 2 * X[:, 2] - 1.5 * X[:, 3]\n\nOnly the first 4 features are informative. The remaining features are\nuseless.\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\nn_samples : int, default=100\n The number of samples.\n\nn_features : int, default=10\n The number of features.\n\nrandom_state : int, RandomState instance or None, default=None\n Determines random number generation for dataset creation. Pass an int\n for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\nReturns\n-------\nX : ndarray of shape (n_samples, n_features)\n The input samples.\n\ny : ndarray of shape (n_samples,)\n The output values.\n\nReferences\n----------\n.. [1] G. Celeux, M. El Anbari, J.-M. Marin, C. P. Robert,\n \"Regularization in regression: comparing Bayesian and frequentist\n methods in a poorly informative situation\", 2009.", + "description": "Generate a random regression problem with sparse uncorrelated design.\n\nThis dataset is described in Celeux et al [1]. as::\n\n X ~ N(0, 1)\n y(X) = X[:, 0] + 2 * X[:, 1] - 2 * X[:, 2] - 1.5 * X[:, 3]\n\nOnly the first 4 features are informative. The remaining features are\nuseless.\n\nRead more in the :ref:`User Guide `.", + "docstring": "Generate a random regression problem with sparse uncorrelated design.\n\n This dataset is described in Celeux et al [1]. 
as::\n\n X ~ N(0, 1)\n y(X) = X[:, 0] + 2 * X[:, 1] - 2 * X[:, 2] - 1.5 * X[:, 3]\n\n Only the first 4 features are informative. The remaining features are\n useless.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n n_samples : int, default=100\n The number of samples.\n\n n_features : int, default=10\n The number of features.\n\n random_state : int, RandomState instance or None, default=None\n Determines random number generation for dataset creation. Pass an int\n for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n Returns\n -------\n X : ndarray of shape (n_samples, n_features)\n The input samples.\n\n y : ndarray of shape (n_samples,)\n The output values.\n\n References\n ----------\n .. [1] G. Celeux, M. El Anbari, J.-M. Marin, C. P. Robert,\n \"Regularization in regression: comparing Bayesian and frequentist\n methods in a poorly informative situation\", 2009.\n ", "source_code": "\ndef make_sparse_uncorrelated(n_samples=100, n_features=10, *, random_state=None):\n \"\"\"Generate a random regression problem with sparse uncorrelated design.\n\n This dataset is described in Celeux et al [1]. as::\n\n X ~ N(0, 1)\n y(X) = X[:, 0] + 2 * X[:, 1] - 2 * X[:, 2] - 1.5 * X[:, 3]\n\n Only the first 4 features are informative. The remaining features are\n useless.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n n_samples : int, default=100\n The number of samples.\n\n n_features : int, default=10\n The number of features.\n\n random_state : int, RandomState instance or None, default=None\n Determines random number generation for dataset creation. Pass an int\n for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n Returns\n -------\n X : ndarray of shape (n_samples, n_features)\n The input samples.\n\n y : ndarray of shape (n_samples,)\n The output values.\n\n References\n ----------\n .. [1] G. Celeux, M. El Anbari, J.-M. Marin, C. P. Robert,\n \"Regularization in regression: comparing Bayesian and frequentist\n methods in a poorly informative situation\", 2009.\n \"\"\"\n generator = check_random_state(random_state)\n X = generator.normal(loc=0, scale=1, size=(n_samples, n_features))\n y = generator.normal(loc=X[:, 0] + 2 * X[:, 1] - 2 * X[:, 2] - 1.5 * X[:, 3], scale=np.ones(n_samples))\n return X, y" }, { @@ -46233,7 +48062,8 @@ "docstring": { "type": "int", "description": "The matrix dimension." - } + }, + "refined_type": {} }, { "name": "random_state", @@ -46243,13 +48073,14 @@ "docstring": { "type": "int, RandomState instance or None, default=None", "description": "Determines random number generation for dataset creation. Pass an int\nfor reproducible output across multiple function calls.\nSee :term:`Glossary `." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Generate a random symmetric, positive-definite matrix.\n\nRead more in the :ref:`User Guide `.", - "docstring": "Generate a random symmetric, positive-definite matrix.\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\nn_dim : int\n The matrix dimension.\n\nrandom_state : int, RandomState instance or None, default=None\n Determines random number generation for dataset creation. 
Pass an int\n for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\nReturns\n-------\nX : ndarray of shape (n_dim, n_dim)\n The random symmetric, positive-definite matrix.\n\nSee Also\n--------\nmake_sparse_spd_matrix", + "docstring": "Generate a random symmetric, positive-definite matrix.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n n_dim : int\n The matrix dimension.\n\n random_state : int, RandomState instance or None, default=None\n Determines random number generation for dataset creation. Pass an int\n for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n Returns\n -------\n X : ndarray of shape (n_dim, n_dim)\n The random symmetric, positive-definite matrix.\n\n See Also\n --------\n make_sparse_spd_matrix\n ", "source_code": "\ndef make_spd_matrix(n_dim, *, random_state=None):\n \"\"\"Generate a random symmetric, positive-definite matrix.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n n_dim : int\n The matrix dimension.\n\n random_state : int, RandomState instance or None, default=None\n Determines random number generation for dataset creation. Pass an int\n for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n Returns\n -------\n X : ndarray of shape (n_dim, n_dim)\n The random symmetric, positive-definite matrix.\n\n See Also\n --------\n make_sparse_spd_matrix\n \"\"\"\n generator = check_random_state(random_state)\n A = generator.rand(n_dim, n_dim)\n (U, _, Vt) = linalg.svd(np.dot(A.T, A), check_finite=False)\n X = np.dot(np.dot(U, 1.0 + np.diag(generator.rand(n_dim))), Vt)\n return X" }, { @@ -46267,7 +48098,8 @@ "docstring": { "type": "int, default=100", "description": "The number of sample points on the S curve." - } + }, + "refined_type": {} }, { "name": "noise", @@ -46277,7 +48109,8 @@ "docstring": { "type": "float, default=0.0", "description": "The standard deviation of the gaussian noise." - } + }, + "refined_type": {} }, { "name": "random_state", @@ -46287,13 +48120,14 @@ "docstring": { "type": "int, RandomState instance or None, default=None", "description": "Determines random number generation for dataset creation. Pass an int\nfor reproducible output across multiple function calls.\nSee :term:`Glossary `." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Generate a swiss roll dataset.\n\nRead more in the :ref:`User Guide `.", - "docstring": "Generate a swiss roll dataset.\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\nn_samples : int, default=100\n The number of sample points on the S curve.\n\nnoise : float, default=0.0\n The standard deviation of the gaussian noise.\n\nrandom_state : int, RandomState instance or None, default=None\n Determines random number generation for dataset creation. Pass an int\n for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\nReturns\n-------\nX : ndarray of shape (n_samples, 3)\n The points.\n\nt : ndarray of shape (n_samples,)\n The univariate position of the sample according to the main dimension\n of the points in the manifold.\n\nNotes\n-----\nThe algorithm is from Marsland [1].\n\nReferences\n----------\n.. [1] S. 
Marsland, \"Machine Learning: An Algorithmic Perspective\",\n Chapter 10, 2009.\n http://seat.massey.ac.nz/personal/s.r.marsland/Code/10/lle.py", + "docstring": "Generate a swiss roll dataset.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n n_samples : int, default=100\n The number of sample points on the S curve.\n\n noise : float, default=0.0\n The standard deviation of the gaussian noise.\n\n random_state : int, RandomState instance or None, default=None\n Determines random number generation for dataset creation. Pass an int\n for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n Returns\n -------\n X : ndarray of shape (n_samples, 3)\n The points.\n\n t : ndarray of shape (n_samples,)\n The univariate position of the sample according to the main dimension\n of the points in the manifold.\n\n Notes\n -----\n The algorithm is from Marsland [1].\n\n References\n ----------\n .. [1] S. Marsland, \"Machine Learning: An Algorithmic Perspective\",\n Chapter 10, 2009.\n http://seat.massey.ac.nz/personal/s.r.marsland/Code/10/lle.py\n ", "source_code": "\ndef make_swiss_roll(n_samples=100, *, noise=0.0, random_state=None):\n \"\"\"Generate a swiss roll dataset.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n n_samples : int, default=100\n The number of sample points on the S curve.\n\n noise : float, default=0.0\n The standard deviation of the gaussian noise.\n\n random_state : int, RandomState instance or None, default=None\n Determines random number generation for dataset creation. Pass an int\n for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n Returns\n -------\n X : ndarray of shape (n_samples, 3)\n The points.\n\n t : ndarray of shape (n_samples,)\n The univariate position of the sample according to the main dimension\n of the points in the manifold.\n\n Notes\n -----\n The algorithm is from Marsland [1].\n\n References\n ----------\n .. [1] S. 
Marsland, \"Machine Learning: An Algorithmic Perspective\",\n Chapter 10, 2009.\n http://seat.massey.ac.nz/personal/s.r.marsland/Code/10/lle.py\n \"\"\"\n generator = check_random_state(random_state)\n t = 1.5 * np.pi * (1 + 2 * generator.rand(1, n_samples))\n x = t * np.cos(t)\n y = 21 * generator.rand(1, n_samples)\n z = t * np.sin(t)\n X = np.concatenate((x, y, z))\n X += noise * generator.randn(3, n_samples)\n X = X.T\n t = np.squeeze(t)\n return X, t" }, { @@ -46311,7 +48145,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "header_length", @@ -46321,7 +48156,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "dtype", @@ -46331,13 +48167,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Load a coverage file from an open file object.\n\nThis will return a numpy array of the given dtype", - "docstring": "Load a coverage file from an open file object.\n\nThis will return a numpy array of the given dtype", + "docstring": "Load a coverage file from an open file object.\n\n This will return a numpy array of the given dtype\n ", "source_code": "\ndef _load_coverage(F, header_length=6, dtype=np.int16):\n \"\"\"Load a coverage file from an open file object.\n\n This will return a numpy array of the given dtype\n \"\"\"\n header = [F.readline() for _ in range(header_length)]\n make_tuple = lambda t: (t.split()[0], float(t.split()[1]))\n header = dict([make_tuple(line) for line in header])\n M = np.loadtxt(F, dtype=dtype)\n nodata = int(header[b'NODATA_value'])\n if nodata != -9999:\n M[nodata] = -9999\n return M" }, { @@ -46355,13 +48192,14 @@ "docstring": { "type": "file object", "description": "CSV file open in byte mode." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Load csv file.", - "docstring": "Load csv file.\n\nParameters\n----------\nF : file object\n CSV file open in byte mode.\n\nReturns\n-------\nrec : np.ndarray\n record array representing the data", + "docstring": "Load csv file.\n\n Parameters\n ----------\n F : file object\n CSV file open in byte mode.\n\n Returns\n -------\n rec : np.ndarray\n record array representing the data\n ", "source_code": "\ndef _load_csv(F):\n \"\"\"Load csv file.\n\n Parameters\n ----------\n F : file object\n CSV file open in byte mode.\n\n Returns\n -------\n rec : np.ndarray\n record array representing the data\n \"\"\"\n names = F.readline().decode('ascii').strip().split(',')\n rec = np.loadtxt(F, skiprows=0, delimiter=',', dtype='a22,f4,f4')\n rec.dtype.names = names\n return rec" }, { @@ -46379,13 +48217,14 @@ "docstring": { "type": "Batch object", "description": "The object returned by :func:`fetch_species_distributions`" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Construct the map grid from the batch object", - "docstring": "Construct the map grid from the batch object\n\nParameters\n----------\nbatch : Batch object\n The object returned by :func:`fetch_species_distributions`\n\nReturns\n-------\n(xgrid, ygrid) : 1-D arrays\n The grid corresponding to the values in batch.coverages", + "docstring": "Construct the map grid from the batch object\n\n Parameters\n ----------\n batch : Batch object\n The object returned by :func:`fetch_species_distributions`\n\n Returns\n -------\n (xgrid, ygrid) : 1-D arrays\n The grid corresponding to the values in batch.coverages\n ", "source_code": "\ndef construct_grids(batch):\n \"\"\"Construct the map grid from the batch object\n\n Parameters\n ----------\n batch : Batch object\n The object returned by :func:`fetch_species_distributions`\n\n Returns\n -------\n (xgrid, ygrid) : 1-D arrays\n The grid corresponding to the values in batch.coverages\n \"\"\"\n xmin = batch.x_left_lower_corner + batch.grid_size\n xmax = xmin + batch.Nx * batch.grid_size\n ymin = batch.y_left_lower_corner + batch.grid_size\n ymax = ymin + batch.Ny * batch.grid_size\n xgrid = np.arange(xmin, xmax, batch.grid_size)\n ygrid = np.arange(ymin, ymax, batch.grid_size)\n return xgrid, ygrid" }, { @@ -46403,7 +48242,8 @@ "docstring": { "type": "str, default=None", "description": "Specify another download and cache folder for the datasets. By default\nall scikit-learn data is stored in '~/scikit_learn_data' subfolders." - } + }, + "refined_type": {} }, { "name": "download_if_missing", @@ -46413,13 +48253,14 @@ "docstring": { "type": "bool, default=True", "description": "If False, raise a IOError if the data is not locally available\ninstead of trying to download the data from the source site." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Loader for species distribution dataset from Phillips et. al. (2006)\n\nRead more in the :ref:`User Guide `.", - "docstring": "Loader for species distribution dataset from Phillips et. al. (2006)\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\ndata_home : str, default=None\n Specify another download and cache folder for the datasets. 
By default\n all scikit-learn data is stored in '~/scikit_learn_data' subfolders.\n\ndownload_if_missing : bool, default=True\n If False, raise a IOError if the data is not locally available\n instead of trying to download the data from the source site.\n\nReturns\n-------\ndata : :class:`~sklearn.utils.Bunch`\n Dictionary-like object, with the following attributes.\n\n coverages : array, shape = [14, 1592, 1212]\n These represent the 14 features measured\n at each point of the map grid.\n The latitude/longitude values for the grid are discussed below.\n Missing data is represented by the value -9999.\n train : record array, shape = (1624,)\n The training points for the data. Each point has three fields:\n\n - train['species'] is the species name\n - train['dd long'] is the longitude, in degrees\n - train['dd lat'] is the latitude, in degrees\n test : record array, shape = (620,)\n The test points for the data. Same format as the training data.\n Nx, Ny : integers\n The number of longitudes (x) and latitudes (y) in the grid\n x_left_lower_corner, y_left_lower_corner : floats\n The (x,y) position of the lower-left corner, in degrees\n grid_size : float\n The spacing between points of the grid, in degrees\n\nReferences\n----------\n\n* `\"Maximum entropy modeling of species geographic distributions\"\n `_\n S. J. Phillips, R. P. Anderson, R. E. Schapire - Ecological Modelling,\n 190:231-259, 2006.\n\nNotes\n-----\n\nThis dataset represents the geographic distribution of species.\nThe dataset is provided by Phillips et. al. (2006).\n\nThe two species are:\n\n- `\"Bradypus variegatus\"\n `_ ,\n the Brown-throated Sloth.\n\n- `\"Microryzomys minutus\"\n `_ ,\n also known as the Forest Small Rice Rat, a rodent that lives in Peru,\n Colombia, Ecuador, Peru, and Venezuela.\n\n- For an example of using this dataset with scikit-learn, see\n :ref:`examples/applications/plot_species_distribution_modeling.py\n `.", + "docstring": "Loader for species distribution dataset from Phillips et. al. (2006)\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n data_home : str, default=None\n Specify another download and cache folder for the datasets. By default\n all scikit-learn data is stored in '~/scikit_learn_data' subfolders.\n\n download_if_missing : bool, default=True\n If False, raise a IOError if the data is not locally available\n instead of trying to download the data from the source site.\n\n Returns\n -------\n data : :class:`~sklearn.utils.Bunch`\n Dictionary-like object, with the following attributes.\n\n coverages : array, shape = [14, 1592, 1212]\n These represent the 14 features measured\n at each point of the map grid.\n The latitude/longitude values for the grid are discussed below.\n Missing data is represented by the value -9999.\n train : record array, shape = (1624,)\n The training points for the data. Each point has three fields:\n\n - train['species'] is the species name\n - train['dd long'] is the longitude, in degrees\n - train['dd lat'] is the latitude, in degrees\n test : record array, shape = (620,)\n The test points for the data. Same format as the training data.\n Nx, Ny : integers\n The number of longitudes (x) and latitudes (y) in the grid\n x_left_lower_corner, y_left_lower_corner : floats\n The (x,y) position of the lower-left corner, in degrees\n grid_size : float\n The spacing between points of the grid, in degrees\n\n References\n ----------\n\n * `\"Maximum entropy modeling of species geographic distributions\"\n `_\n S. J. Phillips, R. P. Anderson, R. 
E. Schapire - Ecological Modelling,\n 190:231-259, 2006.\n\n Notes\n -----\n\n This dataset represents the geographic distribution of species.\n The dataset is provided by Phillips et. al. (2006).\n\n The two species are:\n\n - `\"Bradypus variegatus\"\n `_ ,\n the Brown-throated Sloth.\n\n - `\"Microryzomys minutus\"\n `_ ,\n also known as the Forest Small Rice Rat, a rodent that lives in Peru,\n Colombia, Ecuador, Peru, and Venezuela.\n\n - For an example of using this dataset with scikit-learn, see\n :ref:`examples/applications/plot_species_distribution_modeling.py\n `.\n ", "source_code": "\ndef fetch_species_distributions(*, data_home=None, download_if_missing=True):\n \"\"\"Loader for species distribution dataset from Phillips et. al. (2006)\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n data_home : str, default=None\n Specify another download and cache folder for the datasets. By default\n all scikit-learn data is stored in '~/scikit_learn_data' subfolders.\n\n download_if_missing : bool, default=True\n If False, raise a IOError if the data is not locally available\n instead of trying to download the data from the source site.\n\n Returns\n -------\n data : :class:`~sklearn.utils.Bunch`\n Dictionary-like object, with the following attributes.\n\n coverages : array, shape = [14, 1592, 1212]\n These represent the 14 features measured\n at each point of the map grid.\n The latitude/longitude values for the grid are discussed below.\n Missing data is represented by the value -9999.\n train : record array, shape = (1624,)\n The training points for the data. Each point has three fields:\n\n - train['species'] is the species name\n - train['dd long'] is the longitude, in degrees\n - train['dd lat'] is the latitude, in degrees\n test : record array, shape = (620,)\n The test points for the data. Same format as the training data.\n Nx, Ny : integers\n The number of longitudes (x) and latitudes (y) in the grid\n x_left_lower_corner, y_left_lower_corner : floats\n The (x,y) position of the lower-left corner, in degrees\n grid_size : float\n The spacing between points of the grid, in degrees\n\n References\n ----------\n\n * `\"Maximum entropy modeling of species geographic distributions\"\n `_\n S. J. Phillips, R. P. Anderson, R. E. Schapire - Ecological Modelling,\n 190:231-259, 2006.\n\n Notes\n -----\n\n This dataset represents the geographic distribution of species.\n The dataset is provided by Phillips et. al. 
(2006).\n\n The two species are:\n\n - `\"Bradypus variegatus\"\n `_ ,\n the Brown-throated Sloth.\n\n - `\"Microryzomys minutus\"\n `_ ,\n also known as the Forest Small Rice Rat, a rodent that lives in Peru,\n Colombia, Ecuador, Peru, and Venezuela.\n\n - For an example of using this dataset with scikit-learn, see\n :ref:`examples/applications/plot_species_distribution_modeling.py\n `.\n \"\"\"\n data_home = get_data_home(data_home)\n if not exists(data_home):\n makedirs(data_home)\n extra_params = dict(x_left_lower_corner=-94.8, Nx=1212, y_left_lower_corner=-56.05, Ny=1592, grid_size=0.05)\n dtype = np.int16\n archive_path = _pkl_filepath(data_home, DATA_ARCHIVE_NAME)\n if not exists(archive_path):\n if not download_if_missing:\n raise IOError('Data not found and `download_if_missing` is False')\n logger.info('Downloading species data from %s to %s' % (SAMPLES.url, data_home))\n samples_path = _fetch_remote(SAMPLES, dirname=data_home)\n with np.load(samples_path) as X:\n for f in X.files:\n fhandle = BytesIO(X[f])\n if 'train' in f:\n train = _load_csv(fhandle)\n if 'test' in f:\n test = _load_csv(fhandle)\n remove(samples_path)\n logger.info('Downloading coverage data from %s to %s' % (COVERAGES.url, data_home))\n coverages_path = _fetch_remote(COVERAGES, dirname=data_home)\n with np.load(coverages_path) as X:\n coverages = []\n for f in X.files:\n fhandle = BytesIO(X[f])\n logger.debug(' - converting {}'.format(f))\n coverages.append(_load_coverage(fhandle))\n coverages = np.asarray(coverages, dtype=dtype)\n remove(coverages_path)\n bunch = Bunch(coverages=coverages, test=test, train=train, **extra_params)\n joblib.dump(bunch, archive_path, compress=9)\n else:\n bunch = joblib.load(archive_path)\n return bunch" }, { @@ -46437,7 +48278,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -46447,7 +48289,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "f", @@ -46457,7 +48300,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "multilabel", @@ -46467,7 +48311,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "one_based", @@ -46477,7 +48322,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "comment", @@ -46487,7 +48333,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "query_id", @@ -46497,13 +48344,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _dump_svmlight(X, y, f, multilabel, one_based, comment, query_id):\n X_is_sp = int(hasattr(X, 'tocsr'))\n y_is_sp = int(hasattr(y, 'tocsr'))\n if X.dtype.kind == 'i':\n value_pattern = '%d:%d'\n else:\n value_pattern = '%d:%.16g'\n if y.dtype.kind == 'i':\n label_pattern = '%d'\n else:\n label_pattern = '%.16g'\n line_pattern = '%s'\n if query_id is not None:\n line_pattern += ' qid:%d'\n line_pattern += ' %s\\n'\n if comment:\n f.write(('# Generated by dump_svmlight_file from scikit-learn %s\\n' % __version__).encode())\n f.write(('# Column indices are %s-based\\n' % ['zero', 'one'][one_based]).encode())\n f.write(b'#\\n')\n f.writelines((b'# %s\\n' % line for line in comment.splitlines()))\n for i in range(X.shape[0]):\n if X_is_sp:\n span = slice(X.indptr[i], X.indptr[i + 1])\n row = zip(X.indices[span], X.data[span])\n else:\n nz = 
X[i] != 0\n row = zip(np.where(nz)[0], X[i, nz])\n s = ' '.join((value_pattern % (j + one_based, x) for (j, x) in row))\n if multilabel:\n if y_is_sp:\n nz_labels = y[i].nonzero()[1]\n else:\n nz_labels = np.where(y[i] != 0)[0]\n labels_str = ','.join((label_pattern % j for j in nz_labels))\n elif y_is_sp:\n labels_str = label_pattern % y.data[i]\n else:\n labels_str = label_pattern % y[i]\n if query_id is not None:\n feat = (labels_str, query_id[i], s)\n else:\n feat = (labels_str, s)\n f.write((line_pattern % feat).encode('ascii'))" }, { @@ -46521,13 +48369,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _gen_open(f):\n if isinstance(f, int):\n return io.open(f, 'rb', closefd=False)\n elif not isinstance(f, str):\n raise TypeError('expected {str, int, file-like}, got %s' % type(f))\n (_, ext) = os.path.splitext(f)\n if ext == '.gz':\n import gzip\n return gzip.open(f, 'rb')\n elif ext == '.bz2':\n from bz2 import BZ2File\n return BZ2File(f, 'rb')\n else:\n return open(f, 'rb')" }, { @@ -46540,7 +48389,7 @@ "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _load_svmlight_file(*args, **kwargs):\n raise NotImplementedError('load_svmlight_file is currently not compatible with PyPy (see https://github.com/scikit-learn/scikit-learn/issues/11543 for the status updates).')" }, { @@ -46558,7 +48407,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "dtype", @@ -46568,7 +48418,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "multilabel", @@ -46578,7 +48429,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "zero_based", @@ -46588,7 +48440,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "query_id", @@ -46598,7 +48451,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "offset", @@ -46608,7 +48462,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "length", @@ -46618,13 +48473,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _open_and_load(f, dtype, multilabel, zero_based, query_id, offset=0, length=-1):\n if hasattr(f, 'read'):\n (actual_dtype, data, ind, indptr, labels, query) = _load_svmlight_file(f, dtype, multilabel, zero_based, query_id, offset, length)\n else:\n with closing(_gen_open(f)) as f:\n (actual_dtype, data, ind, indptr, labels, query) = _load_svmlight_file(f, dtype, multilabel, zero_based, query_id, offset, length)\n if not multilabel:\n labels = np.frombuffer(labels, np.float64)\n data = np.frombuffer(data, actual_dtype)\n indices = np.frombuffer(ind, np.longlong)\n indptr = np.frombuffer(indptr, dtype=np.longlong)\n query = np.frombuffer(query, np.int64)\n data = np.asarray(data, dtype=dtype)\n return data, indices, indptr, labels, query" }, { @@ -46642,6 +48498,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "Training vectors, where `n_samples` is the number of samples and\n`n_features` is the number of features." 
+ }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -46652,6 +48512,10 @@ "docstring": { "type": "{array-like, sparse matrix}, shape = [n_samples (, n_labels)]", "description": "Target values. Class labels must be an\ninteger or float, or array-like objects of integer or float for\nmultilabel classifications." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -46662,7 +48526,8 @@ "docstring": { "type": "str or file-like in binary mode", "description": "If string, specifies the path that will contain the data.\nIf file-like, data will be written to f. f should be opened in binary\nmode." - } + }, + "refined_type": {} }, { "name": "zero_based", @@ -46672,7 +48537,8 @@ "docstring": { "type": "boolean, default=True", "description": "Whether column indices should be written zero-based (True) or one-based\n(False)." - } + }, + "refined_type": {} }, { "name": "comment", @@ -46682,7 +48548,8 @@ "docstring": { "type": "str, default=None", "description": "Comment to insert at the top of the file. This should be either a\nUnicode string, which will be encoded as UTF-8, or an ASCII byte\nstring.\nIf a comment is given, then it will be preceded by one that identifies\nthe file as having been dumped by scikit-learn. Note that not all\ntools grok comments in SVMlight files." - } + }, + "refined_type": {} }, { "name": "query_id", @@ -46692,7 +48559,8 @@ "docstring": { "type": "array-like of shape (n_samples,), default=None", "description": "Array containing pairwise preference constraints (qid in svmlight\nformat)." - } + }, + "refined_type": {} }, { "name": "multilabel", @@ -46702,13 +48570,14 @@ "docstring": { "type": "boolean, default=False", "description": "Samples may have several labels each (see\nhttps://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multilabel.html)\n\n.. versionadded:: 0.17\n parameter *multilabel* to support multilabel datasets." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Dump the dataset in svmlight / libsvm file format.\n\nThis format is a text-based format, with one sample per line. It does not store zero valued features hence is suitable for sparse dataset. The first element of each line can be used to store a target variable to predict.", - "docstring": "Dump the dataset in svmlight / libsvm file format.\n\nThis format is a text-based format, with one sample per line. It does\nnot store zero valued features hence is suitable for sparse dataset.\n\nThe first element of each line can be used to store a target variable\nto predict.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training vectors, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\ny : {array-like, sparse matrix}, shape = [n_samples (, n_labels)]\n Target values. Class labels must be an\n integer or float, or array-like objects of integer or float for\n multilabel classifications.\n\nf : str or file-like in binary mode\n If string, specifies the path that will contain the data.\n If file-like, data will be written to f. f should be opened in binary\n mode.\n\nzero_based : boolean, default=True\n Whether column indices should be written zero-based (True) or one-based\n (False).\n\ncomment : str, default=None\n Comment to insert at the top of the file. 
This should be either a\n Unicode string, which will be encoded as UTF-8, or an ASCII byte\n string.\n If a comment is given, then it will be preceded by one that identifies\n the file as having been dumped by scikit-learn. Note that not all\n tools grok comments in SVMlight files.\n\nquery_id : array-like of shape (n_samples,), default=None\n Array containing pairwise preference constraints (qid in svmlight\n format).\n\nmultilabel : boolean, default=False\n Samples may have several labels each (see\n https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multilabel.html)\n\n .. versionadded:: 0.17\n parameter *multilabel* to support multilabel datasets.", + "description": "Dump the dataset in svmlight / libsvm file format.\n\nThis format is a text-based format, with one sample per line. It does\nnot store zero valued features hence is suitable for sparse dataset.\n\nThe first element of each line can be used to store a target variable\nto predict.", + "docstring": "Dump the dataset in svmlight / libsvm file format.\n\n This format is a text-based format, with one sample per line. It does\n not store zero valued features hence is suitable for sparse dataset.\n\n The first element of each line can be used to store a target variable\n to predict.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training vectors, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\n y : {array-like, sparse matrix}, shape = [n_samples (, n_labels)]\n Target values. Class labels must be an\n integer or float, or array-like objects of integer or float for\n multilabel classifications.\n\n f : str or file-like in binary mode\n If string, specifies the path that will contain the data.\n If file-like, data will be written to f. f should be opened in binary\n mode.\n\n zero_based : boolean, default=True\n Whether column indices should be written zero-based (True) or one-based\n (False).\n\n comment : str, default=None\n Comment to insert at the top of the file. This should be either a\n Unicode string, which will be encoded as UTF-8, or an ASCII byte\n string.\n If a comment is given, then it will be preceded by one that identifies\n the file as having been dumped by scikit-learn. Note that not all\n tools grok comments in SVMlight files.\n\n query_id : array-like of shape (n_samples,), default=None\n Array containing pairwise preference constraints (qid in svmlight\n format).\n\n multilabel : boolean, default=False\n Samples may have several labels each (see\n https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multilabel.html)\n\n .. versionadded:: 0.17\n parameter *multilabel* to support multilabel datasets.\n ", "source_code": "\ndef dump_svmlight_file(X, y, f, *, zero_based=True, comment=None, query_id=None, multilabel=False):\n \"\"\"Dump the dataset in svmlight / libsvm file format.\n\n This format is a text-based format, with one sample per line. It does\n not store zero valued features hence is suitable for sparse dataset.\n\n The first element of each line can be used to store a target variable\n to predict.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training vectors, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\n y : {array-like, sparse matrix}, shape = [n_samples (, n_labels)]\n Target values. 
Class labels must be an\n integer or float, or array-like objects of integer or float for\n multilabel classifications.\n\n f : str or file-like in binary mode\n If string, specifies the path that will contain the data.\n If file-like, data will be written to f. f should be opened in binary\n mode.\n\n zero_based : boolean, default=True\n Whether column indices should be written zero-based (True) or one-based\n (False).\n\n comment : str, default=None\n Comment to insert at the top of the file. This should be either a\n Unicode string, which will be encoded as UTF-8, or an ASCII byte\n string.\n If a comment is given, then it will be preceded by one that identifies\n the file as having been dumped by scikit-learn. Note that not all\n tools grok comments in SVMlight files.\n\n query_id : array-like of shape (n_samples,), default=None\n Array containing pairwise preference constraints (qid in svmlight\n format).\n\n multilabel : boolean, default=False\n Samples may have several labels each (see\n https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multilabel.html)\n\n .. versionadded:: 0.17\n parameter *multilabel* to support multilabel datasets.\n \"\"\"\n if comment is not None:\n if isinstance(comment, bytes):\n comment.decode('ascii')\n else:\n comment = comment.encode('utf-8')\n if b'\\x00' in comment:\n raise ValueError('comment string contains NUL byte')\n yval = check_array(y, accept_sparse='csr', ensure_2d=False)\n if sp.issparse(yval):\n if yval.shape[1] != 1 and not multilabel:\n raise ValueError('expected y of shape (n_samples, 1), got %r' % (yval.shape, ))\n elif yval.ndim != 1 and not multilabel:\n raise ValueError('expected y of shape (n_samples,), got %r' % (yval.shape, ))\n Xval = check_array(X, accept_sparse='csr')\n if Xval.shape[0] != yval.shape[0]:\n raise ValueError('X.shape[0] and y.shape[0] should be the same, got %r and %r instead.' % (Xval.shape[0], yval.shape[0]))\n if yval is y and hasattr(yval, 'sorted_indices'):\n y = yval.sorted_indices()\n else:\n y = yval\n if hasattr(y, 'sort_indices'):\n y.sort_indices()\n if Xval is X and hasattr(Xval, 'sorted_indices'):\n X = Xval.sorted_indices()\n else:\n X = Xval\n if hasattr(X, 'sort_indices'):\n X.sort_indices()\n if query_id is not None:\n query_id = np.asarray(query_id)\n if query_id.shape[0] != y.shape[0]:\n raise ValueError('expected query_id of shape (n_samples,), got %r' % (query_id.shape, ))\n one_based = not zero_based\n if hasattr(f, 'write'):\n _dump_svmlight(X, y, f, multilabel, one_based, comment, query_id)\n else:\n with open(f, 'wb') as f:\n _dump_svmlight(X, y, f, multilabel, one_based, comment, query_id)" }, { @@ -46726,7 +48595,8 @@ "docstring": { "type": "str, file-like or int", "description": "(Path to) a file to load. If a path ends in \".gz\" or \".bz2\", it will\nbe uncompressed on the fly. If an integer is passed, it is assumed to\nbe a file descriptor. A file-like or file descriptor will not be closed\nby this function. A file-like object must be opened in binary mode." - } + }, + "refined_type": {} }, { "name": "n_features", @@ -46736,7 +48606,8 @@ "docstring": { "type": "int, default=None", "description": "The number of features to use. If None, it will be inferred. This\nargument is useful to load several files that are subsets of a\nbigger sliced dataset: each subset might not have examples of\nevery feature, hence the inferred shape might vary from one\nslice to another.\nn_features is only required if ``offset`` or ``length`` are passed a\nnon-default value." 
- } + }, + "refined_type": {} }, { "name": "dtype", @@ -46746,7 +48617,8 @@ "docstring": { "type": "numpy data type, default=np.float64", "description": "Data type of dataset to be loaded. This will be the data type of the\noutput numpy arrays ``X`` and ``y``." - } + }, + "refined_type": {} }, { "name": "multilabel", @@ -46756,7 +48628,8 @@ "docstring": { "type": "bool, default=False", "description": "Samples may have several labels each (see\nhttps://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multilabel.html)" - } + }, + "refined_type": {} }, { "name": "zero_based", @@ -46766,7 +48639,8 @@ "docstring": { "type": "bool or \"auto\", default=\"auto\"", "description": "Whether column indices in f are zero-based (True) or one-based\n(False). If column indices are one-based, they are transformed to\nzero-based to match Python/NumPy conventions.\nIf set to \"auto\", a heuristic check is applied to determine this from\nthe file contents. Both kinds of files occur \"in the wild\", but they\nare unfortunately not self-identifying. Using \"auto\" or True should\nalways be safe when no ``offset`` or ``length`` is passed.\nIf ``offset`` or ``length`` are passed, the \"auto\" mode falls back\nto ``zero_based=True`` to avoid having the heuristic check yield\ninconsistent results on different segments of the file." - } + }, + "refined_type": {} }, { "name": "query_id", @@ -46776,7 +48650,8 @@ "docstring": { "type": "bool, default=False", "description": "If True, will return the query_id array for each file." - } + }, + "refined_type": {} }, { "name": "offset", @@ -46786,7 +48661,8 @@ "docstring": { "type": "int, default=0", "description": "Ignore the offset first bytes by seeking forward, then\ndiscarding the following bytes up until the next new line\ncharacter." - } + }, + "refined_type": {} }, { "name": "length", @@ -46796,13 +48672,14 @@ "docstring": { "type": "int, default=-1", "description": "If strictly positive, stop reading any new line of data once the\nposition in the file has reached the (offset + length) bytes threshold." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Load datasets in the svmlight / libsvm format into sparse CSR matrix\n\nThis format is a text-based format, with one sample per line. It does not store zero valued features hence is suitable for sparse dataset. The first element of each line can be used to store a target variable to predict. This format is used as the default format for both svmlight and the libsvm command line programs. Parsing a text based source can be expensive. When repeatedly working on the same dataset, it is recommended to wrap this loader with joblib.Memory.cache to store a memmapped backup of the CSR results of the first call and benefit from the near instantaneous loading of memmapped structures for the subsequent calls. In case the file contains a pairwise preference constraint (known as \"qid\" in the svmlight format) these are ignored unless the query_id parameter is set to True. These pairwise preference constraints can be used to constraint the combination of samples when using pairwise loss functions (as is the case in some learning to rank problems) so that only pairs with the same query_id value are considered. This implementation is written in Cython and is reasonably fast. 
However, a faster API-compatible loader is also available at: https://github.com/mblondel/svmlight-loader", - "docstring": "Load datasets in the svmlight / libsvm format into sparse CSR matrix\n\nThis format is a text-based format, with one sample per line. It does\nnot store zero valued features hence is suitable for sparse dataset.\n\nThe first element of each line can be used to store a target variable\nto predict.\n\nThis format is used as the default format for both svmlight and the\nlibsvm command line programs.\n\nParsing a text based source can be expensive. When repeatedly\nworking on the same dataset, it is recommended to wrap this\nloader with joblib.Memory.cache to store a memmapped backup of the\nCSR results of the first call and benefit from the near instantaneous\nloading of memmapped structures for the subsequent calls.\n\nIn case the file contains a pairwise preference constraint (known\nas \"qid\" in the svmlight format) these are ignored unless the\nquery_id parameter is set to True. These pairwise preference\nconstraints can be used to constraint the combination of samples\nwhen using pairwise loss functions (as is the case in some\nlearning to rank problems) so that only pairs with the same\nquery_id value are considered.\n\nThis implementation is written in Cython and is reasonably fast.\nHowever, a faster API-compatible loader is also available at:\n\n https://github.com/mblondel/svmlight-loader\n\nParameters\n----------\nf : str, file-like or int\n (Path to) a file to load. If a path ends in \".gz\" or \".bz2\", it will\n be uncompressed on the fly. If an integer is passed, it is assumed to\n be a file descriptor. A file-like or file descriptor will not be closed\n by this function. A file-like object must be opened in binary mode.\n\nn_features : int, default=None\n The number of features to use. If None, it will be inferred. This\n argument is useful to load several files that are subsets of a\n bigger sliced dataset: each subset might not have examples of\n every feature, hence the inferred shape might vary from one\n slice to another.\n n_features is only required if ``offset`` or ``length`` are passed a\n non-default value.\n\ndtype : numpy data type, default=np.float64\n Data type of dataset to be loaded. This will be the data type of the\n output numpy arrays ``X`` and ``y``.\n\nmultilabel : bool, default=False\n Samples may have several labels each (see\n https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multilabel.html)\n\nzero_based : bool or \"auto\", default=\"auto\"\n Whether column indices in f are zero-based (True) or one-based\n (False). If column indices are one-based, they are transformed to\n zero-based to match Python/NumPy conventions.\n If set to \"auto\", a heuristic check is applied to determine this from\n the file contents. Both kinds of files occur \"in the wild\", but they\n are unfortunately not self-identifying. 
Using \"auto\" or True should\n always be safe when no ``offset`` or ``length`` is passed.\n If ``offset`` or ``length`` are passed, the \"auto\" mode falls back\n to ``zero_based=True`` to avoid having the heuristic check yield\n inconsistent results on different segments of the file.\n\nquery_id : bool, default=False\n If True, will return the query_id array for each file.\n\noffset : int, default=0\n Ignore the offset first bytes by seeking forward, then\n discarding the following bytes up until the next new line\n character.\n\nlength : int, default=-1\n If strictly positive, stop reading any new line of data once the\n position in the file has reached the (offset + length) bytes threshold.\n\nReturns\n-------\nX : scipy.sparse matrix of shape (n_samples, n_features)\n\ny : ndarray of shape (n_samples,), or, in the multilabel a list of\n tuples of length n_samples.\n\nquery_id : array of shape (n_samples,)\n query_id for each sample. Only returned when query_id is set to\n True.\n\nSee Also\n--------\nload_svmlight_files : Similar function for loading multiple files in this\n format, enforcing the same number of features/columns on all of them.\n\nExamples\n--------\nTo use joblib.Memory to cache the svmlight file::\n\n from joblib import Memory\n from .datasets import load_svmlight_file\n mem = Memory(\"./mycache\")\n\n @mem.cache\n def get_data():\n data = load_svmlight_file(\"mysvmlightfile\")\n return data[0], data[1]\n\n X, y = get_data()", + "description": "Load datasets in the svmlight / libsvm format into sparse CSR matrix\n\nThis format is a text-based format, with one sample per line. It does\nnot store zero valued features hence is suitable for sparse dataset.\n\nThe first element of each line can be used to store a target variable\nto predict.\n\nThis format is used as the default format for both svmlight and the\nlibsvm command line programs.\n\nParsing a text based source can be expensive. When repeatedly\nworking on the same dataset, it is recommended to wrap this\nloader with joblib.Memory.cache to store a memmapped backup of the\nCSR results of the first call and benefit from the near instantaneous\nloading of memmapped structures for the subsequent calls.\n\nIn case the file contains a pairwise preference constraint (known\nas \"qid\" in the svmlight format) these are ignored unless the\nquery_id parameter is set to True. These pairwise preference\nconstraints can be used to constraint the combination of samples\nwhen using pairwise loss functions (as is the case in some\nlearning to rank problems) so that only pairs with the same\nquery_id value are considered.\n\nThis implementation is written in Cython and is reasonably fast.\nHowever, a faster API-compatible loader is also available at:\n\n https://github.com/mblondel/svmlight-loader", + "docstring": "Load datasets in the svmlight / libsvm format into sparse CSR matrix\n\n This format is a text-based format, with one sample per line. It does\n not store zero valued features hence is suitable for sparse dataset.\n\n The first element of each line can be used to store a target variable\n to predict.\n\n This format is used as the default format for both svmlight and the\n libsvm command line programs.\n\n Parsing a text based source can be expensive. 
When repeatedly\n working on the same dataset, it is recommended to wrap this\n loader with joblib.Memory.cache to store a memmapped backup of the\n CSR results of the first call and benefit from the near instantaneous\n loading of memmapped structures for the subsequent calls.\n\n In case the file contains a pairwise preference constraint (known\n as \"qid\" in the svmlight format) these are ignored unless the\n query_id parameter is set to True. These pairwise preference\n constraints can be used to constraint the combination of samples\n when using pairwise loss functions (as is the case in some\n learning to rank problems) so that only pairs with the same\n query_id value are considered.\n\n This implementation is written in Cython and is reasonably fast.\n However, a faster API-compatible loader is also available at:\n\n https://github.com/mblondel/svmlight-loader\n\n Parameters\n ----------\n f : str, file-like or int\n (Path to) a file to load. If a path ends in \".gz\" or \".bz2\", it will\n be uncompressed on the fly. If an integer is passed, it is assumed to\n be a file descriptor. A file-like or file descriptor will not be closed\n by this function. A file-like object must be opened in binary mode.\n\n n_features : int, default=None\n The number of features to use. If None, it will be inferred. This\n argument is useful to load several files that are subsets of a\n bigger sliced dataset: each subset might not have examples of\n every feature, hence the inferred shape might vary from one\n slice to another.\n n_features is only required if ``offset`` or ``length`` are passed a\n non-default value.\n\n dtype : numpy data type, default=np.float64\n Data type of dataset to be loaded. This will be the data type of the\n output numpy arrays ``X`` and ``y``.\n\n multilabel : bool, default=False\n Samples may have several labels each (see\n https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multilabel.html)\n\n zero_based : bool or \"auto\", default=\"auto\"\n Whether column indices in f are zero-based (True) or one-based\n (False). If column indices are one-based, they are transformed to\n zero-based to match Python/NumPy conventions.\n If set to \"auto\", a heuristic check is applied to determine this from\n the file contents. Both kinds of files occur \"in the wild\", but they\n are unfortunately not self-identifying. Using \"auto\" or True should\n always be safe when no ``offset`` or ``length`` is passed.\n If ``offset`` or ``length`` are passed, the \"auto\" mode falls back\n to ``zero_based=True`` to avoid having the heuristic check yield\n inconsistent results on different segments of the file.\n\n query_id : bool, default=False\n If True, will return the query_id array for each file.\n\n offset : int, default=0\n Ignore the offset first bytes by seeking forward, then\n discarding the following bytes up until the next new line\n character.\n\n length : int, default=-1\n If strictly positive, stop reading any new line of data once the\n position in the file has reached the (offset + length) bytes threshold.\n\n Returns\n -------\n X : scipy.sparse matrix of shape (n_samples, n_features)\n\n y : ndarray of shape (n_samples,), or, in the multilabel a list of\n tuples of length n_samples.\n\n query_id : array of shape (n_samples,)\n query_id for each sample. 
Only returned when query_id is set to\n True.\n\n See Also\n --------\n load_svmlight_files : Similar function for loading multiple files in this\n format, enforcing the same number of features/columns on all of them.\n\n Examples\n --------\n To use joblib.Memory to cache the svmlight file::\n\n from joblib import Memory\n from .datasets import load_svmlight_file\n mem = Memory(\"./mycache\")\n\n @mem.cache\n def get_data():\n data = load_svmlight_file(\"mysvmlightfile\")\n return data[0], data[1]\n\n X, y = get_data()\n ", "source_code": "\ndef load_svmlight_file(f, *, n_features=None, dtype=np.float64, multilabel=False, zero_based='auto', query_id=False, offset=0, length=-1):\n \"\"\"Load datasets in the svmlight / libsvm format into sparse CSR matrix\n\n This format is a text-based format, with one sample per line. It does\n not store zero valued features hence is suitable for sparse dataset.\n\n The first element of each line can be used to store a target variable\n to predict.\n\n This format is used as the default format for both svmlight and the\n libsvm command line programs.\n\n Parsing a text based source can be expensive. When repeatedly\n working on the same dataset, it is recommended to wrap this\n loader with joblib.Memory.cache to store a memmapped backup of the\n CSR results of the first call and benefit from the near instantaneous\n loading of memmapped structures for the subsequent calls.\n\n In case the file contains a pairwise preference constraint (known\n as \"qid\" in the svmlight format) these are ignored unless the\n query_id parameter is set to True. These pairwise preference\n constraints can be used to constraint the combination of samples\n when using pairwise loss functions (as is the case in some\n learning to rank problems) so that only pairs with the same\n query_id value are considered.\n\n This implementation is written in Cython and is reasonably fast.\n However, a faster API-compatible loader is also available at:\n\n https://github.com/mblondel/svmlight-loader\n\n Parameters\n ----------\n f : str, file-like or int\n (Path to) a file to load. If a path ends in \".gz\" or \".bz2\", it will\n be uncompressed on the fly. If an integer is passed, it is assumed to\n be a file descriptor. A file-like or file descriptor will not be closed\n by this function. A file-like object must be opened in binary mode.\n\n n_features : int, default=None\n The number of features to use. If None, it will be inferred. This\n argument is useful to load several files that are subsets of a\n bigger sliced dataset: each subset might not have examples of\n every feature, hence the inferred shape might vary from one\n slice to another.\n n_features is only required if ``offset`` or ``length`` are passed a\n non-default value.\n\n dtype : numpy data type, default=np.float64\n Data type of dataset to be loaded. This will be the data type of the\n output numpy arrays ``X`` and ``y``.\n\n multilabel : bool, default=False\n Samples may have several labels each (see\n https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multilabel.html)\n\n zero_based : bool or \"auto\", default=\"auto\"\n Whether column indices in f are zero-based (True) or one-based\n (False). If column indices are one-based, they are transformed to\n zero-based to match Python/NumPy conventions.\n If set to \"auto\", a heuristic check is applied to determine this from\n the file contents. Both kinds of files occur \"in the wild\", but they\n are unfortunately not self-identifying. 
Using \"auto\" or True should\n always be safe when no ``offset`` or ``length`` is passed.\n If ``offset`` or ``length`` are passed, the \"auto\" mode falls back\n to ``zero_based=True`` to avoid having the heuristic check yield\n inconsistent results on different segments of the file.\n\n query_id : bool, default=False\n If True, will return the query_id array for each file.\n\n offset : int, default=0\n Ignore the offset first bytes by seeking forward, then\n discarding the following bytes up until the next new line\n character.\n\n length : int, default=-1\n If strictly positive, stop reading any new line of data once the\n position in the file has reached the (offset + length) bytes threshold.\n\n Returns\n -------\n X : scipy.sparse matrix of shape (n_samples, n_features)\n\n y : ndarray of shape (n_samples,), or, in the multilabel a list of\n tuples of length n_samples.\n\n query_id : array of shape (n_samples,)\n query_id for each sample. Only returned when query_id is set to\n True.\n\n See Also\n --------\n load_svmlight_files : Similar function for loading multiple files in this\n format, enforcing the same number of features/columns on all of them.\n\n Examples\n --------\n To use joblib.Memory to cache the svmlight file::\n\n from joblib import Memory\n from .datasets import load_svmlight_file\n mem = Memory(\"./mycache\")\n\n @mem.cache\n def get_data():\n data = load_svmlight_file(\"mysvmlightfile\")\n return data[0], data[1]\n\n X, y = get_data()\n \"\"\"\n return tuple(load_svmlight_files([f], n_features=n_features, dtype=dtype, multilabel=multilabel, zero_based=zero_based, query_id=query_id, offset=offset, length=length))" }, { @@ -46820,7 +48697,8 @@ "docstring": { "type": "array-like, dtype=str, file-like or int", "description": "(Paths of) files to load. If a path ends in \".gz\" or \".bz2\", it will\nbe uncompressed on the fly. If an integer is passed, it is assumed to\nbe a file descriptor. File-likes and file descriptors will not be\nclosed by this function. File-like objects must be opened in binary\nmode." - } + }, + "refined_type": {} }, { "name": "n_features", @@ -46830,7 +48708,8 @@ "docstring": { "type": "int, default=None", "description": "The number of features to use. If None, it will be inferred from the\nmaximum column index occurring in any of the files.\n\nThis can be set to a higher value than the actual number of features\nin any of the input files, but setting it to a lower value will cause\nan exception to be raised." - } + }, + "refined_type": {} }, { "name": "dtype", @@ -46840,7 +48719,8 @@ "docstring": { "type": "numpy data type, default=np.float64", "description": "Data type of dataset to be loaded. This will be the data type of the\noutput numpy arrays ``X`` and ``y``." - } + }, + "refined_type": {} }, { "name": "multilabel", @@ -46850,7 +48730,8 @@ "docstring": { "type": "bool, default=False", "description": "Samples may have several labels each (see\nhttps://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multilabel.html)" - } + }, + "refined_type": {} }, { "name": "zero_based", @@ -46860,7 +48741,8 @@ "docstring": { "type": "bool or \"auto\", default=\"auto\"", "description": "Whether column indices in f are zero-based (True) or one-based\n(False). If column indices are one-based, they are transformed to\nzero-based to match Python/NumPy conventions.\nIf set to \"auto\", a heuristic check is applied to determine this from\nthe file contents. Both kinds of files occur \"in the wild\", but they\nare unfortunately not self-identifying. 
Using \"auto\" or True should\nalways be safe when no offset or length is passed.\nIf offset or length are passed, the \"auto\" mode falls back\nto zero_based=True to avoid having the heuristic check yield\ninconsistent results on different segments of the file." - } + }, + "refined_type": {} }, { "name": "query_id", @@ -46870,7 +48752,8 @@ "docstring": { "type": "bool, default=False", "description": "If True, will return the query_id array for each file." - } + }, + "refined_type": {} }, { "name": "offset", @@ -46880,7 +48763,8 @@ "docstring": { "type": "int, default=0", "description": "Ignore the offset first bytes by seeking forward, then\ndiscarding the following bytes up until the next new line\ncharacter." - } + }, + "refined_type": {} }, { "name": "length", @@ -46890,13 +48774,14 @@ "docstring": { "type": "int, default=-1", "description": "If strictly positive, stop reading any new line of data once the\nposition in the file has reached the (offset + length) bytes threshold." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Load dataset from multiple files in SVMlight format\n\nThis function is equivalent to mapping load_svmlight_file over a list of files, except that the results are concatenated into a single, flat list and the samples vectors are constrained to all have the same number of features. In case the file contains a pairwise preference constraint (known as \"qid\" in the svmlight format) these are ignored unless the query_id parameter is set to True. These pairwise preference constraints can be used to constraint the combination of samples when using pairwise loss functions (as is the case in some learning to rank problems) so that only pairs with the same query_id value are considered.", - "docstring": "Load dataset from multiple files in SVMlight format\n\nThis function is equivalent to mapping load_svmlight_file over a list of\nfiles, except that the results are concatenated into a single, flat list\nand the samples vectors are constrained to all have the same number of\nfeatures.\n\nIn case the file contains a pairwise preference constraint (known\nas \"qid\" in the svmlight format) these are ignored unless the\nquery_id parameter is set to True. These pairwise preference\nconstraints can be used to constraint the combination of samples\nwhen using pairwise loss functions (as is the case in some\nlearning to rank problems) so that only pairs with the same\nquery_id value are considered.\n\nParameters\n----------\nfiles : array-like, dtype=str, file-like or int\n (Paths of) files to load. If a path ends in \".gz\" or \".bz2\", it will\n be uncompressed on the fly. If an integer is passed, it is assumed to\n be a file descriptor. File-likes and file descriptors will not be\n closed by this function. File-like objects must be opened in binary\n mode.\n\nn_features : int, default=None\n The number of features to use. If None, it will be inferred from the\n maximum column index occurring in any of the files.\n\n This can be set to a higher value than the actual number of features\n in any of the input files, but setting it to a lower value will cause\n an exception to be raised.\n\ndtype : numpy data type, default=np.float64\n Data type of dataset to be loaded. 
This will be the data type of the\n output numpy arrays ``X`` and ``y``.\n\nmultilabel : bool, default=False\n Samples may have several labels each (see\n https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multilabel.html)\n\nzero_based : bool or \"auto\", default=\"auto\"\n Whether column indices in f are zero-based (True) or one-based\n (False). If column indices are one-based, they are transformed to\n zero-based to match Python/NumPy conventions.\n If set to \"auto\", a heuristic check is applied to determine this from\n the file contents. Both kinds of files occur \"in the wild\", but they\n are unfortunately not self-identifying. Using \"auto\" or True should\n always be safe when no offset or length is passed.\n If offset or length are passed, the \"auto\" mode falls back\n to zero_based=True to avoid having the heuristic check yield\n inconsistent results on different segments of the file.\n\nquery_id : bool, default=False\n If True, will return the query_id array for each file.\n\noffset : int, default=0\n Ignore the offset first bytes by seeking forward, then\n discarding the following bytes up until the next new line\n character.\n\nlength : int, default=-1\n If strictly positive, stop reading any new line of data once the\n position in the file has reached the (offset + length) bytes threshold.\n\nReturns\n-------\n[X1, y1, ..., Xn, yn]\nwhere each (Xi, yi) pair is the result from load_svmlight_file(files[i]).\n\nIf query_id is set to True, this will return instead [X1, y1, q1,\n..., Xn, yn, qn] where (Xi, yi, qi) is the result from\nload_svmlight_file(files[i])\n\nNotes\n-----\nWhen fitting a model to a matrix X_train and evaluating it against a\nmatrix X_test, it is essential that X_train and X_test have the same\nnumber of features (X_train.shape[1] == X_test.shape[1]). This may not\nbe the case if you load the files individually with load_svmlight_file.\n\nSee Also\n--------\nload_svmlight_file", + "description": "Load dataset from multiple files in SVMlight format\n\nThis function is equivalent to mapping load_svmlight_file over a list of\nfiles, except that the results are concatenated into a single, flat list\nand the samples vectors are constrained to all have the same number of\nfeatures.\n\nIn case the file contains a pairwise preference constraint (known\nas \"qid\" in the svmlight format) these are ignored unless the\nquery_id parameter is set to True. These pairwise preference\nconstraints can be used to constraint the combination of samples\nwhen using pairwise loss functions (as is the case in some\nlearning to rank problems) so that only pairs with the same\nquery_id value are considered.", + "docstring": "Load dataset from multiple files in SVMlight format\n\n This function is equivalent to mapping load_svmlight_file over a list of\n files, except that the results are concatenated into a single, flat list\n and the samples vectors are constrained to all have the same number of\n features.\n\n In case the file contains a pairwise preference constraint (known\n as \"qid\" in the svmlight format) these are ignored unless the\n query_id parameter is set to True. These pairwise preference\n constraints can be used to constraint the combination of samples\n when using pairwise loss functions (as is the case in some\n learning to rank problems) so that only pairs with the same\n query_id value are considered.\n\n Parameters\n ----------\n files : array-like, dtype=str, file-like or int\n (Paths of) files to load. 
If a path ends in \".gz\" or \".bz2\", it will\n be uncompressed on the fly. If an integer is passed, it is assumed to\n be a file descriptor. File-likes and file descriptors will not be\n closed by this function. File-like objects must be opened in binary\n mode.\n\n n_features : int, default=None\n The number of features to use. If None, it will be inferred from the\n maximum column index occurring in any of the files.\n\n This can be set to a higher value than the actual number of features\n in any of the input files, but setting it to a lower value will cause\n an exception to be raised.\n\n dtype : numpy data type, default=np.float64\n Data type of dataset to be loaded. This will be the data type of the\n output numpy arrays ``X`` and ``y``.\n\n multilabel : bool, default=False\n Samples may have several labels each (see\n https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multilabel.html)\n\n zero_based : bool or \"auto\", default=\"auto\"\n Whether column indices in f are zero-based (True) or one-based\n (False). If column indices are one-based, they are transformed to\n zero-based to match Python/NumPy conventions.\n If set to \"auto\", a heuristic check is applied to determine this from\n the file contents. Both kinds of files occur \"in the wild\", but they\n are unfortunately not self-identifying. Using \"auto\" or True should\n always be safe when no offset or length is passed.\n If offset or length are passed, the \"auto\" mode falls back\n to zero_based=True to avoid having the heuristic check yield\n inconsistent results on different segments of the file.\n\n query_id : bool, default=False\n If True, will return the query_id array for each file.\n\n offset : int, default=0\n Ignore the offset first bytes by seeking forward, then\n discarding the following bytes up until the next new line\n character.\n\n length : int, default=-1\n If strictly positive, stop reading any new line of data once the\n position in the file has reached the (offset + length) bytes threshold.\n\n Returns\n -------\n [X1, y1, ..., Xn, yn]\n where each (Xi, yi) pair is the result from load_svmlight_file(files[i]).\n\n If query_id is set to True, this will return instead [X1, y1, q1,\n ..., Xn, yn, qn] where (Xi, yi, qi) is the result from\n load_svmlight_file(files[i])\n\n Notes\n -----\n When fitting a model to a matrix X_train and evaluating it against a\n matrix X_test, it is essential that X_train and X_test have the same\n number of features (X_train.shape[1] == X_test.shape[1]). This may not\n be the case if you load the files individually with load_svmlight_file.\n\n See Also\n --------\n load_svmlight_file\n ", "source_code": "\ndef load_svmlight_files(files, *, n_features=None, dtype=np.float64, multilabel=False, zero_based='auto', query_id=False, offset=0, length=-1):\n \"\"\"Load dataset from multiple files in SVMlight format\n\n This function is equivalent to mapping load_svmlight_file over a list of\n files, except that the results are concatenated into a single, flat list\n and the samples vectors are constrained to all have the same number of\n features.\n\n In case the file contains a pairwise preference constraint (known\n as \"qid\" in the svmlight format) these are ignored unless the\n query_id parameter is set to True. 
These pairwise preference\n constraints can be used to constraint the combination of samples\n when using pairwise loss functions (as is the case in some\n learning to rank problems) so that only pairs with the same\n query_id value are considered.\n\n Parameters\n ----------\n files : array-like, dtype=str, file-like or int\n (Paths of) files to load. If a path ends in \".gz\" or \".bz2\", it will\n be uncompressed on the fly. If an integer is passed, it is assumed to\n be a file descriptor. File-likes and file descriptors will not be\n closed by this function. File-like objects must be opened in binary\n mode.\n\n n_features : int, default=None\n The number of features to use. If None, it will be inferred from the\n maximum column index occurring in any of the files.\n\n This can be set to a higher value than the actual number of features\n in any of the input files, but setting it to a lower value will cause\n an exception to be raised.\n\n dtype : numpy data type, default=np.float64\n Data type of dataset to be loaded. This will be the data type of the\n output numpy arrays ``X`` and ``y``.\n\n multilabel : bool, default=False\n Samples may have several labels each (see\n https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multilabel.html)\n\n zero_based : bool or \"auto\", default=\"auto\"\n Whether column indices in f are zero-based (True) or one-based\n (False). If column indices are one-based, they are transformed to\n zero-based to match Python/NumPy conventions.\n If set to \"auto\", a heuristic check is applied to determine this from\n the file contents. Both kinds of files occur \"in the wild\", but they\n are unfortunately not self-identifying. Using \"auto\" or True should\n always be safe when no offset or length is passed.\n If offset or length are passed, the \"auto\" mode falls back\n to zero_based=True to avoid having the heuristic check yield\n inconsistent results on different segments of the file.\n\n query_id : bool, default=False\n If True, will return the query_id array for each file.\n\n offset : int, default=0\n Ignore the offset first bytes by seeking forward, then\n discarding the following bytes up until the next new line\n character.\n\n length : int, default=-1\n If strictly positive, stop reading any new line of data once the\n position in the file has reached the (offset + length) bytes threshold.\n\n Returns\n -------\n [X1, y1, ..., Xn, yn]\n where each (Xi, yi) pair is the result from load_svmlight_file(files[i]).\n\n If query_id is set to True, this will return instead [X1, y1, q1,\n ..., Xn, yn, qn] where (Xi, yi, qi) is the result from\n load_svmlight_file(files[i])\n\n Notes\n -----\n When fitting a model to a matrix X_train and evaluating it against a\n matrix X_test, it is essential that X_train and X_test have the same\n number of features (X_train.shape[1] == X_test.shape[1]). 
This may not\n be the case if you load the files individually with load_svmlight_file.\n\n See Also\n --------\n load_svmlight_file\n \"\"\"\n if (offset != 0 or length > 0) and zero_based == 'auto':\n zero_based = True\n if (offset != 0 or length > 0) and n_features is None:\n raise ValueError('n_features is required when offset or length is specified.')\n r = [_open_and_load(f, dtype, multilabel, bool(zero_based), bool(query_id), offset=offset, length=length) for f in files]\n if zero_based is False or zero_based == 'auto' and all((len(tmp[1]) and np.min(tmp[1]) > 0 for tmp in r)):\n for (_, indices, _, _, _) in r:\n indices -= 1\n n_f = max((ind[1].max() if len(ind[1]) else 0 for ind in r)) + 1\n if n_features is None:\n n_features = n_f\n elif n_features < n_f:\n raise ValueError('n_features was set to {}, but input file contains {} features'.format(n_features, n_f))\n result = []\n for (data, indices, indptr, y, query_values) in r:\n shape = (indptr.shape[0] - 1, n_features)\n X = sp.csr_matrix((data, indices, indptr), shape)\n X.sort_indices()\n result += (X, y)\n if query_id:\n result.append(query_values)\n return result" }, { @@ -46914,7 +48799,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "cache_path", @@ -46924,7 +48810,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -46948,7 +48835,8 @@ "docstring": { "type": "str, default=None", "description": "Specify a download and cache folder for the datasets. If None,\nall scikit-learn data is stored in '~/scikit_learn_data' subfolders." - } + }, + "refined_type": {} }, { "name": "subset", @@ -46958,6 +48846,10 @@ "docstring": { "type": "{'train', 'test', 'all'}, default='train'", "description": "Select the dataset to load: 'train' for the training set, 'test'\nfor the test set, 'all' for both, with shuffled ordering." + }, + "refined_type": { + "kind": "EnumType", + "values": ["train", "all", "test"] } }, { @@ -46968,7 +48860,8 @@ "docstring": { "type": "array-like, dtype=str, default=None", "description": "If None (default), load all the categories.\nIf not None, list of category names to load (other categories\nignored)." - } + }, + "refined_type": {} }, { "name": "shuffle", @@ -46978,7 +48871,8 @@ "docstring": { "type": "bool, default=True", "description": "Whether or not to shuffle the data: might be important for models that\nmake the assumption that the samples are independent and identically\ndistributed (i.i.d.), such as stochastic gradient descent." - } + }, + "refined_type": {} }, { "name": "random_state", @@ -46988,7 +48882,8 @@ "docstring": { "type": "int, RandomState instance or None, default=None", "description": "Determines random number generation for dataset shuffling. Pass an int\nfor reproducible output across multiple function calls.\nSee :term:`Glossary `." - } + }, + "refined_type": {} }, { "name": "remove", @@ -46998,7 +48893,8 @@ "docstring": { "type": "tuple, default=()", "description": "May contain any subset of ('headers', 'footers', 'quotes'). Each of\nthese are kinds of text that will be detected and removed from the\nnewsgroup posts, preventing classifiers from overfitting on\nmetadata.\n\n'headers' removes newsgroup headers, 'footers' removes blocks at the\nends of posts that look like signatures, and 'quotes' removes lines\nthat appear to be quoting another post.\n\n'headers' follows an exact standard; the other filters are not always\ncorrect." 
- } + }, + "refined_type": {} }, { "name": "download_if_missing", @@ -47008,7 +48904,8 @@ "docstring": { "type": "bool, default=True", "description": "If False, raise an IOError if the data is not locally available\ninstead of trying to download the data from the source site." - } + }, + "refined_type": {} }, { "name": "return_X_y", @@ -47018,13 +48915,14 @@ "docstring": { "type": "bool, default=False", "description": "If True, returns `(data.data, data.target)` instead of a Bunch\nobject.\n\n.. versionadded:: 0.22" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Load the filenames and data from the 20 newsgroups dataset (classification).\n\nDownload it if necessary. ================= ========== Classes 20 Samples total 18846 Dimensionality 1 Features text ================= ========== Read more in the :ref:`User Guide <20newsgroups_dataset>`.", - "docstring": "Load the filenames and data from the 20 newsgroups dataset (classification).\n\nDownload it if necessary.\n\n================= ==========\nClasses 20\nSamples total 18846\nDimensionality 1\nFeatures text\n================= ==========\n\nRead more in the :ref:`User Guide <20newsgroups_dataset>`.\n\nParameters\n----------\ndata_home : str, default=None\n Specify a download and cache folder for the datasets. If None,\n all scikit-learn data is stored in '~/scikit_learn_data' subfolders.\n\nsubset : {'train', 'test', 'all'}, default='train'\n Select the dataset to load: 'train' for the training set, 'test'\n for the test set, 'all' for both, with shuffled ordering.\n\ncategories : array-like, dtype=str, default=None\n If None (default), load all the categories.\n If not None, list of category names to load (other categories\n ignored).\n\nshuffle : bool, default=True\n Whether or not to shuffle the data: might be important for models that\n make the assumption that the samples are independent and identically\n distributed (i.i.d.), such as stochastic gradient descent.\n\nrandom_state : int, RandomState instance or None, default=None\n Determines random number generation for dataset shuffling. Pass an int\n for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\nremove : tuple, default=()\n May contain any subset of ('headers', 'footers', 'quotes'). Each of\n these are kinds of text that will be detected and removed from the\n newsgroup posts, preventing classifiers from overfitting on\n metadata.\n\n 'headers' removes newsgroup headers, 'footers' removes blocks at the\n ends of posts that look like signatures, and 'quotes' removes lines\n that appear to be quoting another post.\n\n 'headers' follows an exact standard; the other filters are not always\n correct.\n\ndownload_if_missing : bool, default=True\n If False, raise an IOError if the data is not locally available\n instead of trying to download the data from the source site.\n\nreturn_X_y : bool, default=False\n If True, returns `(data.data, data.target)` instead of a Bunch\n object.\n\n .. versionadded:: 0.22\n\nReturns\n-------\nbunch : :class:`~sklearn.utils.Bunch`\n Dictionary-like object, with the following attributes.\n\n data : list of shape (n_samples,)\n The data list to learn.\n target: ndarray of shape (n_samples,)\n The target labels.\n filenames: list of shape (n_samples,)\n The path to the location of the data.\n DESCR: str\n The full description of the dataset.\n target_names: list of shape (n_classes,)\n The names of target classes.\n\n(data, target) : tuple if `return_X_y=True`\n .. 
versionadded:: 0.22", + "description": "Load the filenames and data from the 20 newsgroups dataset (classification).\n\nDownload it if necessary.\n\n================= ==========\nClasses 20\nSamples total 18846\nDimensionality 1\nFeatures text\n================= ==========\n\nRead more in the :ref:`User Guide <20newsgroups_dataset>`.", + "docstring": "Load the filenames and data from the 20 newsgroups dataset (classification).\n\n Download it if necessary.\n\n ================= ==========\n Classes 20\n Samples total 18846\n Dimensionality 1\n Features text\n ================= ==========\n\n Read more in the :ref:`User Guide <20newsgroups_dataset>`.\n\n Parameters\n ----------\n data_home : str, default=None\n Specify a download and cache folder for the datasets. If None,\n all scikit-learn data is stored in '~/scikit_learn_data' subfolders.\n\n subset : {'train', 'test', 'all'}, default='train'\n Select the dataset to load: 'train' for the training set, 'test'\n for the test set, 'all' for both, with shuffled ordering.\n\n categories : array-like, dtype=str, default=None\n If None (default), load all the categories.\n If not None, list of category names to load (other categories\n ignored).\n\n shuffle : bool, default=True\n Whether or not to shuffle the data: might be important for models that\n make the assumption that the samples are independent and identically\n distributed (i.i.d.), such as stochastic gradient descent.\n\n random_state : int, RandomState instance or None, default=None\n Determines random number generation for dataset shuffling. Pass an int\n for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n remove : tuple, default=()\n May contain any subset of ('headers', 'footers', 'quotes'). Each of\n these are kinds of text that will be detected and removed from the\n newsgroup posts, preventing classifiers from overfitting on\n metadata.\n\n 'headers' removes newsgroup headers, 'footers' removes blocks at the\n ends of posts that look like signatures, and 'quotes' removes lines\n that appear to be quoting another post.\n\n 'headers' follows an exact standard; the other filters are not always\n correct.\n\n download_if_missing : bool, default=True\n If False, raise an IOError if the data is not locally available\n instead of trying to download the data from the source site.\n\n return_X_y : bool, default=False\n If True, returns `(data.data, data.target)` instead of a Bunch\n object.\n\n .. versionadded:: 0.22\n\n Returns\n -------\n bunch : :class:`~sklearn.utils.Bunch`\n Dictionary-like object, with the following attributes.\n\n data : list of shape (n_samples,)\n The data list to learn.\n target: ndarray of shape (n_samples,)\n The target labels.\n filenames: list of shape (n_samples,)\n The path to the location of the data.\n DESCR: str\n The full description of the dataset.\n target_names: list of shape (n_classes,)\n The names of target classes.\n\n (data, target) : tuple if `return_X_y=True`\n .. 
versionadded:: 0.22\n ", "source_code": "\ndef fetch_20newsgroups(*, data_home=None, subset='train', categories=None, shuffle=True, random_state=42, remove=(), download_if_missing=True, return_X_y=False):\n \"\"\"Load the filenames and data from the 20 newsgroups dataset (classification).\n\n Download it if necessary.\n\n ================= ==========\n Classes 20\n Samples total 18846\n Dimensionality 1\n Features text\n ================= ==========\n\n Read more in the :ref:`User Guide <20newsgroups_dataset>`.\n\n Parameters\n ----------\n data_home : str, default=None\n Specify a download and cache folder for the datasets. If None,\n all scikit-learn data is stored in '~/scikit_learn_data' subfolders.\n\n subset : {'train', 'test', 'all'}, default='train'\n Select the dataset to load: 'train' for the training set, 'test'\n for the test set, 'all' for both, with shuffled ordering.\n\n categories : array-like, dtype=str, default=None\n If None (default), load all the categories.\n If not None, list of category names to load (other categories\n ignored).\n\n shuffle : bool, default=True\n Whether or not to shuffle the data: might be important for models that\n make the assumption that the samples are independent and identically\n distributed (i.i.d.), such as stochastic gradient descent.\n\n random_state : int, RandomState instance or None, default=None\n Determines random number generation for dataset shuffling. Pass an int\n for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n remove : tuple, default=()\n May contain any subset of ('headers', 'footers', 'quotes'). Each of\n these are kinds of text that will be detected and removed from the\n newsgroup posts, preventing classifiers from overfitting on\n metadata.\n\n 'headers' removes newsgroup headers, 'footers' removes blocks at the\n ends of posts that look like signatures, and 'quotes' removes lines\n that appear to be quoting another post.\n\n 'headers' follows an exact standard; the other filters are not always\n correct.\n\n download_if_missing : bool, default=True\n If False, raise an IOError if the data is not locally available\n instead of trying to download the data from the source site.\n\n return_X_y : bool, default=False\n If True, returns `(data.data, data.target)` instead of a Bunch\n object.\n\n .. versionadded:: 0.22\n\n Returns\n -------\n bunch : :class:`~sklearn.utils.Bunch`\n Dictionary-like object, with the following attributes.\n\n data : list of shape (n_samples,)\n The data list to learn.\n target: ndarray of shape (n_samples,)\n The target labels.\n filenames: list of shape (n_samples,)\n The path to the location of the data.\n DESCR: str\n The full description of the dataset.\n target_names: list of shape (n_classes,)\n The names of target classes.\n\n (data, target) : tuple if `return_X_y=True`\n .. versionadded:: 0.22\n \"\"\"\n data_home = get_data_home(data_home=data_home)\n cache_path = _pkl_filepath(data_home, CACHE_NAME)\n twenty_home = os.path.join(data_home, '20news_home')\n cache = None\n if os.path.exists(cache_path):\n try:\n with open(cache_path, 'rb') as f:\n compressed_content = f.read()\n uncompressed_content = codecs.decode(compressed_content, 'zlib_codec')\n cache = pickle.loads(uncompressed_content)\n except Exception as e:\n print(80 * '_')\n print('Cache loading failed')\n print(80 * '_')\n print(e)\n if cache is None:\n if download_if_missing:\n logger.info('Downloading 20news dataset. 
This may take a few minutes.')\n cache = _download_20newsgroups(target_dir=twenty_home, cache_path=cache_path)\n else:\n raise IOError('20Newsgroups dataset not found')\n if subset in ('train', 'test'):\n data = cache[subset]\n elif subset == 'all':\n data_lst = list()\n target = list()\n filenames = list()\n for subset in ('train', 'test'):\n data = cache[subset]\n data_lst.extend(data.data)\n target.extend(data.target)\n filenames.extend(data.filenames)\n data.data = data_lst\n data.target = np.array(target)\n data.filenames = np.array(filenames)\n else:\n raise ValueError(\"subset can only be 'train', 'test' or 'all', got '%s'\" % subset)\n fdescr = load_descr('twenty_newsgroups.rst')\n data.DESCR = fdescr\n if 'headers' in remove:\n data.data = [strip_newsgroup_header(text) for text in data.data]\n if 'footers' in remove:\n data.data = [strip_newsgroup_footer(text) for text in data.data]\n if 'quotes' in remove:\n data.data = [strip_newsgroup_quoting(text) for text in data.data]\n if categories is not None:\n labels = [(data.target_names.index(cat), cat) for cat in categories]\n labels.sort()\n (labels, categories) = zip(*labels)\n mask = np.in1d(data.target, labels)\n data.filenames = data.filenames[mask]\n data.target = data.target[mask]\n data.target = np.searchsorted(labels, data.target)\n data.target_names = list(categories)\n data_lst = np.array(data.data, dtype=object)\n data_lst = data_lst[mask]\n data.data = data_lst.tolist()\n if shuffle:\n random_state = check_random_state(random_state)\n indices = np.arange(data.target.shape[0])\n random_state.shuffle(indices)\n data.filenames = data.filenames[indices]\n data.target = data.target[indices]\n data_lst = np.array(data.data, dtype=object)\n data_lst = data_lst[indices]\n data.data = data_lst.tolist()\n if return_X_y:\n return data.data, data.target\n return data" }, { @@ -47042,6 +48940,10 @@ "docstring": { "type": "{'train', 'test', 'all'}, default='train'", "description": "Select the dataset to load: 'train' for the training set, 'test'\nfor the test set, 'all' for both, with shuffled ordering." + }, + "refined_type": { + "kind": "EnumType", + "values": ["train", "all", "test"] } }, { @@ -47052,7 +48954,8 @@ "docstring": { "type": "tuple, default=()", "description": "May contain any subset of ('headers', 'footers', 'quotes'). Each of\nthese are kinds of text that will be detected and removed from the\nnewsgroup posts, preventing classifiers from overfitting on\nmetadata.\n\n'headers' removes newsgroup headers, 'footers' removes blocks at the\nends of posts that look like signatures, and 'quotes' removes lines\nthat appear to be quoting another post." - } + }, + "refined_type": {} }, { "name": "data_home", @@ -47062,7 +48965,8 @@ "docstring": { "type": "str, default=None", "description": "Specify an download and cache folder for the datasets. If None,\nall scikit-learn data is stored in '~/scikit_learn_data' subfolders." - } + }, + "refined_type": {} }, { "name": "download_if_missing", @@ -47072,7 +48976,8 @@ "docstring": { "type": "bool, default=True", "description": "If False, raise an IOError if the data is not locally available\ninstead of trying to download the data from the source site." - } + }, + "refined_type": {} }, { "name": "return_X_y", @@ -47082,7 +48987,8 @@ "docstring": { "type": "bool, default=False", "description": "If True, returns ``(data.data, data.target)`` instead of a Bunch\nobject.\n\n.. 
versionadded:: 0.20" - } + }, + "refined_type": {} }, { "name": "normalize", @@ -47092,7 +48998,8 @@ "docstring": { "type": "bool, default=True", "description": "If True, normalizes each document's feature vector to unit norm using\n:func:`sklearn.preprocessing.normalize`.\n\n.. versionadded:: 0.22" - } + }, + "refined_type": {} }, { "name": "as_frame", @@ -47102,13 +49009,14 @@ "docstring": { "type": "bool, default=False", "description": "If True, the data is a pandas DataFrame including columns with\nappropriate dtypes (numeric, string, or categorical). The target is\na pandas DataFrame or Series depending on the number of\n`target_columns`.\n\n.. versionadded:: 0.24" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Load and vectorize the 20 newsgroups dataset (classification).\n\nDownload it if necessary. This is a convenience function; the transformation is done using the default settings for :class:`~sklearn.feature_extraction.text.CountVectorizer`. For more advanced usage (stopword filtering, n-gram extraction, etc.), combine fetch_20newsgroups with a custom :class:`~sklearn.feature_extraction.text.CountVectorizer`, :class:`~sklearn.feature_extraction.text.HashingVectorizer`, :class:`~sklearn.feature_extraction.text.TfidfTransformer` or :class:`~sklearn.feature_extraction.text.TfidfVectorizer`. The resulting counts are normalized using :func:`sklearn.preprocessing.normalize` unless normalize is set to False. ================= ========== Classes 20 Samples total 18846 Dimensionality 130107 Features real ================= ========== Read more in the :ref:`User Guide <20newsgroups_dataset>`.", - "docstring": "Load and vectorize the 20 newsgroups dataset (classification).\n\nDownload it if necessary.\n\nThis is a convenience function; the transformation is done using the\ndefault settings for\n:class:`~sklearn.feature_extraction.text.CountVectorizer`. For more\nadvanced usage (stopword filtering, n-gram extraction, etc.), combine\nfetch_20newsgroups with a custom\n:class:`~sklearn.feature_extraction.text.CountVectorizer`,\n:class:`~sklearn.feature_extraction.text.HashingVectorizer`,\n:class:`~sklearn.feature_extraction.text.TfidfTransformer` or\n:class:`~sklearn.feature_extraction.text.TfidfVectorizer`.\n\nThe resulting counts are normalized using\n:func:`sklearn.preprocessing.normalize` unless normalize is set to False.\n\n================= ==========\nClasses 20\nSamples total 18846\nDimensionality 130107\nFeatures real\n================= ==========\n\nRead more in the :ref:`User Guide <20newsgroups_dataset>`.\n\nParameters\n----------\nsubset : {'train', 'test', 'all'}, default='train'\n Select the dataset to load: 'train' for the training set, 'test'\n for the test set, 'all' for both, with shuffled ordering.\n\nremove : tuple, default=()\n May contain any subset of ('headers', 'footers', 'quotes'). Each of\n these are kinds of text that will be detected and removed from the\n newsgroup posts, preventing classifiers from overfitting on\n metadata.\n\n 'headers' removes newsgroup headers, 'footers' removes blocks at the\n ends of posts that look like signatures, and 'quotes' removes lines\n that appear to be quoting another post.\n\ndata_home : str, default=None\n Specify an download and cache folder for the datasets. 
If None,\n all scikit-learn data is stored in '~/scikit_learn_data' subfolders.\n\ndownload_if_missing : bool, default=True\n If False, raise an IOError if the data is not locally available\n instead of trying to download the data from the source site.\n\nreturn_X_y : bool, default=False\n If True, returns ``(data.data, data.target)`` instead of a Bunch\n object.\n\n .. versionadded:: 0.20\n\nnormalize : bool, default=True\n If True, normalizes each document's feature vector to unit norm using\n :func:`sklearn.preprocessing.normalize`.\n\n .. versionadded:: 0.22\n\nas_frame : bool, default=False\n If True, the data is a pandas DataFrame including columns with\n appropriate dtypes (numeric, string, or categorical). The target is\n a pandas DataFrame or Series depending on the number of\n `target_columns`.\n\n .. versionadded:: 0.24\n\nReturns\n-------\nbunch : :class:`~sklearn.utils.Bunch`\n Dictionary-like object, with the following attributes.\n\n data: {sparse matrix, dataframe} of shape (n_samples, n_features)\n The input data matrix. If ``as_frame`` is `True`, ``data`` is\n a pandas DataFrame with sparse columns.\n target: {ndarray, series} of shape (n_samples,)\n The target labels. If ``as_frame`` is `True`, ``target`` is a\n pandas Series.\n target_names: list of shape (n_classes,)\n The names of target classes.\n DESCR: str\n The full description of the dataset.\n frame: dataframe of shape (n_samples, n_features + 1)\n Only present when `as_frame=True`. Pandas DataFrame with ``data``\n and ``target``.\n\n .. versionadded:: 0.24\n\n(data, target) : tuple if ``return_X_y`` is True\n `data` and `target` would be of the format defined in the `Bunch`\n description above.\n\n .. versionadded:: 0.20", + "description": "Load and vectorize the 20 newsgroups dataset (classification).\n\nDownload it if necessary.\n\nThis is a convenience function; the transformation is done using the\ndefault settings for\n:class:`~sklearn.feature_extraction.text.CountVectorizer`. For more\nadvanced usage (stopword filtering, n-gram extraction, etc.), combine\nfetch_20newsgroups with a custom\n:class:`~sklearn.feature_extraction.text.CountVectorizer`,\n:class:`~sklearn.feature_extraction.text.HashingVectorizer`,\n:class:`~sklearn.feature_extraction.text.TfidfTransformer` or\n:class:`~sklearn.feature_extraction.text.TfidfVectorizer`.\n\nThe resulting counts are normalized using\n:func:`sklearn.preprocessing.normalize` unless normalize is set to False.\n\n================= ==========\nClasses 20\nSamples total 18846\nDimensionality 130107\nFeatures real\n================= ==========\n\nRead more in the :ref:`User Guide <20newsgroups_dataset>`.", + "docstring": "Load and vectorize the 20 newsgroups dataset (classification).\n\n Download it if necessary.\n\n This is a convenience function; the transformation is done using the\n default settings for\n :class:`~sklearn.feature_extraction.text.CountVectorizer`. 
For more\n advanced usage (stopword filtering, n-gram extraction, etc.), combine\n fetch_20newsgroups with a custom\n :class:`~sklearn.feature_extraction.text.CountVectorizer`,\n :class:`~sklearn.feature_extraction.text.HashingVectorizer`,\n :class:`~sklearn.feature_extraction.text.TfidfTransformer` or\n :class:`~sklearn.feature_extraction.text.TfidfVectorizer`.\n\n The resulting counts are normalized using\n :func:`sklearn.preprocessing.normalize` unless normalize is set to False.\n\n ================= ==========\n Classes 20\n Samples total 18846\n Dimensionality 130107\n Features real\n ================= ==========\n\n Read more in the :ref:`User Guide <20newsgroups_dataset>`.\n\n Parameters\n ----------\n subset : {'train', 'test', 'all'}, default='train'\n Select the dataset to load: 'train' for the training set, 'test'\n for the test set, 'all' for both, with shuffled ordering.\n\n remove : tuple, default=()\n May contain any subset of ('headers', 'footers', 'quotes'). Each of\n these are kinds of text that will be detected and removed from the\n newsgroup posts, preventing classifiers from overfitting on\n metadata.\n\n 'headers' removes newsgroup headers, 'footers' removes blocks at the\n ends of posts that look like signatures, and 'quotes' removes lines\n that appear to be quoting another post.\n\n data_home : str, default=None\n Specify an download and cache folder for the datasets. If None,\n all scikit-learn data is stored in '~/scikit_learn_data' subfolders.\n\n download_if_missing : bool, default=True\n If False, raise an IOError if the data is not locally available\n instead of trying to download the data from the source site.\n\n return_X_y : bool, default=False\n If True, returns ``(data.data, data.target)`` instead of a Bunch\n object.\n\n .. versionadded:: 0.20\n\n normalize : bool, default=True\n If True, normalizes each document's feature vector to unit norm using\n :func:`sklearn.preprocessing.normalize`.\n\n .. versionadded:: 0.22\n\n as_frame : bool, default=False\n If True, the data is a pandas DataFrame including columns with\n appropriate dtypes (numeric, string, or categorical). The target is\n a pandas DataFrame or Series depending on the number of\n `target_columns`.\n\n .. versionadded:: 0.24\n\n Returns\n -------\n bunch : :class:`~sklearn.utils.Bunch`\n Dictionary-like object, with the following attributes.\n\n data: {sparse matrix, dataframe} of shape (n_samples, n_features)\n The input data matrix. If ``as_frame`` is `True`, ``data`` is\n a pandas DataFrame with sparse columns.\n target: {ndarray, series} of shape (n_samples,)\n The target labels. If ``as_frame`` is `True`, ``target`` is a\n pandas Series.\n target_names: list of shape (n_classes,)\n The names of target classes.\n DESCR: str\n The full description of the dataset.\n frame: dataframe of shape (n_samples, n_features + 1)\n Only present when `as_frame=True`. Pandas DataFrame with ``data``\n and ``target``.\n\n .. versionadded:: 0.24\n\n (data, target) : tuple if ``return_X_y`` is True\n `data` and `target` would be of the format defined in the `Bunch`\n description above.\n\n .. 
versionadded:: 0.20\n ", "source_code": "\ndef fetch_20newsgroups_vectorized(*, subset='train', remove=(), data_home=None, download_if_missing=True, return_X_y=False, normalize=True, as_frame=False):\n \"\"\"Load and vectorize the 20 newsgroups dataset (classification).\n\n Download it if necessary.\n\n This is a convenience function; the transformation is done using the\n default settings for\n :class:`~sklearn.feature_extraction.text.CountVectorizer`. For more\n advanced usage (stopword filtering, n-gram extraction, etc.), combine\n fetch_20newsgroups with a custom\n :class:`~sklearn.feature_extraction.text.CountVectorizer`,\n :class:`~sklearn.feature_extraction.text.HashingVectorizer`,\n :class:`~sklearn.feature_extraction.text.TfidfTransformer` or\n :class:`~sklearn.feature_extraction.text.TfidfVectorizer`.\n\n The resulting counts are normalized using\n :func:`sklearn.preprocessing.normalize` unless normalize is set to False.\n\n ================= ==========\n Classes 20\n Samples total 18846\n Dimensionality 130107\n Features real\n ================= ==========\n\n Read more in the :ref:`User Guide <20newsgroups_dataset>`.\n\n Parameters\n ----------\n subset : {'train', 'test', 'all'}, default='train'\n Select the dataset to load: 'train' for the training set, 'test'\n for the test set, 'all' for both, with shuffled ordering.\n\n remove : tuple, default=()\n May contain any subset of ('headers', 'footers', 'quotes'). Each of\n these are kinds of text that will be detected and removed from the\n newsgroup posts, preventing classifiers from overfitting on\n metadata.\n\n 'headers' removes newsgroup headers, 'footers' removes blocks at the\n ends of posts that look like signatures, and 'quotes' removes lines\n that appear to be quoting another post.\n\n data_home : str, default=None\n Specify an download and cache folder for the datasets. If None,\n all scikit-learn data is stored in '~/scikit_learn_data' subfolders.\n\n download_if_missing : bool, default=True\n If False, raise an IOError if the data is not locally available\n instead of trying to download the data from the source site.\n\n return_X_y : bool, default=False\n If True, returns ``(data.data, data.target)`` instead of a Bunch\n object.\n\n .. versionadded:: 0.20\n\n normalize : bool, default=True\n If True, normalizes each document's feature vector to unit norm using\n :func:`sklearn.preprocessing.normalize`.\n\n .. versionadded:: 0.22\n\n as_frame : bool, default=False\n If True, the data is a pandas DataFrame including columns with\n appropriate dtypes (numeric, string, or categorical). The target is\n a pandas DataFrame or Series depending on the number of\n `target_columns`.\n\n .. versionadded:: 0.24\n\n Returns\n -------\n bunch : :class:`~sklearn.utils.Bunch`\n Dictionary-like object, with the following attributes.\n\n data: {sparse matrix, dataframe} of shape (n_samples, n_features)\n The input data matrix. If ``as_frame`` is `True`, ``data`` is\n a pandas DataFrame with sparse columns.\n target: {ndarray, series} of shape (n_samples,)\n The target labels. If ``as_frame`` is `True`, ``target`` is a\n pandas Series.\n target_names: list of shape (n_classes,)\n The names of target classes.\n DESCR: str\n The full description of the dataset.\n frame: dataframe of shape (n_samples, n_features + 1)\n Only present when `as_frame=True`. Pandas DataFrame with ``data``\n and ``target``.\n\n .. 
versionadded:: 0.24\n\n (data, target) : tuple if ``return_X_y`` is True\n `data` and `target` would be of the format defined in the `Bunch`\n description above.\n\n .. versionadded:: 0.20\n \"\"\"\n data_home = get_data_home(data_home=data_home)\n filebase = '20newsgroup_vectorized'\n if remove:\n filebase += 'remove-' + '-'.join(remove)\n target_file = _pkl_filepath(data_home, filebase + '.pkl')\n data_train = fetch_20newsgroups(data_home=data_home, subset='train', categories=None, shuffle=True, random_state=12, remove=remove, download_if_missing=download_if_missing)\n data_test = fetch_20newsgroups(data_home=data_home, subset='test', categories=None, shuffle=True, random_state=12, remove=remove, download_if_missing=download_if_missing)\n if os.path.exists(target_file):\n try:\n (X_train, X_test, feature_names) = joblib.load(target_file)\n except ValueError as e:\n raise ValueError(f'The cached dataset located in {target_file} was fetched with an older scikit-learn version and it is not compatible with the scikit-learn version imported. You need to manually delete the file: {target_file}.') from e\n else:\n vectorizer = CountVectorizer(dtype=np.int16)\n X_train = vectorizer.fit_transform(data_train.data).tocsr()\n X_test = vectorizer.transform(data_test.data).tocsr()\n feature_names = vectorizer.get_feature_names_out()\n joblib.dump((X_train, X_test, feature_names), target_file, compress=9)\n if normalize:\n X_train = X_train.astype(np.float64)\n X_test = X_test.astype(np.float64)\n preprocessing.normalize(X_train, copy=False)\n preprocessing.normalize(X_test, copy=False)\n target_names = data_train.target_names\n if subset == 'train':\n data = X_train\n target = data_train.target\n elif subset == 'test':\n data = X_test\n target = data_test.target\n elif subset == 'all':\n data = sp.vstack((X_train, X_test)).tocsr()\n target = np.concatenate((data_train.target, data_test.target))\n else:\n raise ValueError(\"%r is not a valid subset: should be one of ['train', 'test', 'all']\" % subset)\n fdescr = load_descr('twenty_newsgroups.rst')\n frame = None\n target_name = ['category_class']\n if as_frame:\n (frame, data, target) = _convert_data_dataframe('fetch_20newsgroups_vectorized', data, target, feature_names, target_names=target_name, sparse_data=True)\n if return_X_y:\n return data, target\n return Bunch(data=data, target=target, frame=frame, target_names=target_names, feature_names=feature_names, DESCR=fdescr)" }, { @@ -47126,13 +49034,14 @@ "docstring": { "type": "str", "description": "The text from which to remove the signature block." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Given text in \"news\" format, attempt to remove a signature block.\n\nAs a rough heuristic, we assume that signatures are set apart by either a blank line or a line made of hyphens, and that it is the last such line in the file (disregarding blank lines at the end).", - "docstring": "Given text in \"news\" format, attempt to remove a signature block.\n\nAs a rough heuristic, we assume that signatures are set apart by either\na blank line or a line made of hyphens, and that it is the last such line\nin the file (disregarding blank lines at the end).\n\nParameters\n----------\ntext : str\n The text from which to remove the signature block.", + "description": "Given text in \"news\" format, attempt to remove a signature block.\n\nAs a rough heuristic, we assume that signatures are set apart by either\na blank line or a line made of hyphens, and that it is the last such line\nin the file (disregarding blank lines at the end).", + "docstring": "\n Given text in \"news\" format, attempt to remove a signature block.\n\n As a rough heuristic, we assume that signatures are set apart by either\n a blank line or a line made of hyphens, and that it is the last such line\n in the file (disregarding blank lines at the end).\n\n Parameters\n ----------\n text : str\n The text from which to remove the signature block.\n ", "source_code": "\ndef strip_newsgroup_footer(text):\n \"\"\"\n Given text in \"news\" format, attempt to remove a signature block.\n\n As a rough heuristic, we assume that signatures are set apart by either\n a blank line or a line made of hyphens, and that it is the last such line\n in the file (disregarding blank lines at the end).\n\n Parameters\n ----------\n text : str\n The text from which to remove the signature block.\n \"\"\"\n lines = text.strip().split('\\n')\n for line_num in range(len(lines) - 1, -1, -1):\n line = lines[line_num]\n if line.strip().strip('-') == '':\n break\n if line_num > 0:\n return '\\n'.join(lines[:line_num])\n else:\n return text" }, { @@ -47150,13 +49059,14 @@ "docstring": { "type": "str", "description": "The text from which to remove the signature block." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Given text in \"news\" format, strip the headers, by removing everything before the first blank line.", - "docstring": "Given text in \"news\" format, strip the headers, by removing everything\nbefore the first blank line.\n\nParameters\n----------\ntext : str\n The text from which to remove the signature block.", + "description": "Given text in \"news\" format, strip the headers, by removing everything\nbefore the first blank line.", + "docstring": "\n Given text in \"news\" format, strip the headers, by removing everything\n before the first blank line.\n\n Parameters\n ----------\n text : str\n The text from which to remove the signature block.\n ", "source_code": "\ndef strip_newsgroup_header(text):\n \"\"\"\n Given text in \"news\" format, strip the headers, by removing everything\n before the first blank line.\n\n Parameters\n ----------\n text : str\n The text from which to remove the signature block.\n \"\"\"\n (_before, _blankline, after) = text.partition('\\n\\n')\n return after" }, { @@ -47174,13 +49084,14 @@ "docstring": { "type": "str", "description": "The text from which to remove the signature block." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Given text in \"news\" format, strip lines beginning with the quote characters > or |, plus lines that often introduce a quoted section (for example, because they contain the string 'writes:'.)", - "docstring": "Given text in \"news\" format, strip lines beginning with the quote\ncharacters > or |, plus lines that often introduce a quoted section\n(for example, because they contain the string 'writes:'.)\n\nParameters\n----------\ntext : str\n The text from which to remove the signature block.", + "description": "Given text in \"news\" format, strip lines beginning with the quote\ncharacters > or |, plus lines that often introduce a quoted section\n(for example, because they contain the string 'writes:'.)", + "docstring": "\n Given text in \"news\" format, strip lines beginning with the quote\n characters > or |, plus lines that often introduce a quoted section\n (for example, because they contain the string 'writes:'.)\n\n Parameters\n ----------\n text : str\n The text from which to remove the signature block.\n ", "source_code": "\ndef strip_newsgroup_quoting(text):\n \"\"\"\n Given text in \"news\" format, strip lines beginning with the quote\n characters > or |, plus lines that often introduce a quoted section\n (for example, because they contain the string 'writes:'.)\n\n Parameters\n ----------\n text : str\n The text from which to remove the signature block.\n \"\"\"\n good_lines = [line for line in text.split('\\n') if not _QUOTE_RE.search(line)]\n return '\\n'.join(good_lines)" }, { @@ -47198,7 +49109,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "top_path", @@ -47208,13 +49120,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef configuration(parent_package='', top_path=None):\n from numpy.distutils.misc_util import Configuration\n config = Configuration('datasets', parent_package, top_path)\n config.add_data_dir('data')\n config.add_data_dir('descr')\n config.add_data_dir('images')\n config.add_data_dir(os.path.join('tests', 'data'))\n if platform.python_implementation() != 'PyPy':\n config.add_extension('_svmlight_format_fast', sources=['_svmlight_format_fast.pyx'], include_dirs=[numpy.get_include()])\n config.add_subpackage('tests')\n return config" }, { @@ -47232,7 +49145,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -47242,7 +49156,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "Training data, where `n_samples` is the number of samples and\n`n_features` is the number of features." - } + }, + "refined_type": {} }, { "name": "y", @@ -47252,13 +49167,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Placeholder for fit. Subclasses should implement this method!\n\nFit the model with X.", - "docstring": "Placeholder for fit. Subclasses should implement this method!\n\nFit the model with X.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n Training data, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\nReturns\n-------\nself : object\n Returns the instance itself.", + "docstring": "Placeholder for fit. 
Subclasses should implement this method!\n\n Fit the model with X.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training data, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\n Returns\n -------\n self : object\n Returns the instance itself.\n ", "source_code": "\n@abstractmethod\ndef fit(self, X, y=None):\n \"\"\"Placeholder for fit. Subclasses should implement this method!\n\n Fit the model with X.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training data, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\n Returns\n -------\n self : object\n Returns the instance itself.\n \"\"\"\n " }, { @@ -47276,13 +49192,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Compute data covariance with the generative model.\n\n``cov = components_.T * S**2 * components_ + sigma2 * eye(n_features)`` where S**2 contains the explained variances, and sigma2 contains the noise variances.", - "docstring": "Compute data covariance with the generative model.\n\n``cov = components_.T * S**2 * components_ + sigma2 * eye(n_features)``\nwhere S**2 contains the explained variances, and sigma2 contains the\nnoise variances.\n\nReturns\n-------\ncov : array of shape=(n_features, n_features)\n Estimated covariance of data.", + "description": "Compute data covariance with the generative model.\n\n``cov = components_.T * S**2 * components_ + sigma2 * eye(n_features)``\nwhere S**2 contains the explained variances, and sigma2 contains the\nnoise variances.", + "docstring": "Compute data covariance with the generative model.\n\n ``cov = components_.T * S**2 * components_ + sigma2 * eye(n_features)``\n where S**2 contains the explained variances, and sigma2 contains the\n noise variances.\n\n Returns\n -------\n cov : array of shape=(n_features, n_features)\n Estimated covariance of data.\n ", "source_code": "\ndef get_covariance(self):\n \"\"\"Compute data covariance with the generative model.\n\n ``cov = components_.T * S**2 * components_ + sigma2 * eye(n_features)``\n where S**2 contains the explained variances, and sigma2 contains the\n noise variances.\n\n Returns\n -------\n cov : array of shape=(n_features, n_features)\n Estimated covariance of data.\n \"\"\"\n components_ = self.components_\n exp_var = self.explained_variance_\n if self.whiten:\n components_ = components_ * np.sqrt(exp_var[:, np.newaxis])\n exp_var_diff = np.maximum(exp_var - self.noise_variance_, 0.0)\n cov = np.dot(components_.T * exp_var_diff, components_)\n cov.flat[::len(cov) + 1] += self.noise_variance_\n return cov" }, { @@ -47300,13 +49217,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Compute data precision matrix with the generative model.\n\nEquals the inverse of the covariance but computed with the matrix inversion lemma for efficiency.", - "docstring": "Compute data precision matrix with the generative model.\n\nEquals the inverse of the covariance but computed with\nthe matrix inversion lemma for efficiency.\n\nReturns\n-------\nprecision : array, shape=(n_features, n_features)\n Estimated precision of data.", + "description": "Compute data precision matrix with the generative model.\n\nEquals the inverse of the covariance but computed with\nthe matrix inversion lemma for efficiency.", + "docstring": "Compute 
data precision matrix with the generative model.\n\n Equals the inverse of the covariance but computed with\n the matrix inversion lemma for efficiency.\n\n Returns\n -------\n precision : array, shape=(n_features, n_features)\n Estimated precision of data.\n ", "source_code": "\ndef get_precision(self):\n \"\"\"Compute data precision matrix with the generative model.\n\n Equals the inverse of the covariance but computed with\n the matrix inversion lemma for efficiency.\n\n Returns\n -------\n precision : array, shape=(n_features, n_features)\n Estimated precision of data.\n \"\"\"\n n_features = self.components_.shape[1]\n if self.n_components_ == 0:\n return np.eye(n_features) / self.noise_variance_\n if self.n_components_ == n_features:\n return linalg.inv(self.get_covariance())\n components_ = self.components_\n exp_var = self.explained_variance_\n if self.whiten:\n components_ = components_ * np.sqrt(exp_var[:, np.newaxis])\n exp_var_diff = np.maximum(exp_var - self.noise_variance_, 0.0)\n precision = np.dot(components_, components_.T) / self.noise_variance_\n precision.flat[::len(precision) + 1] += 1.0 / exp_var_diff\n precision = np.dot(components_.T, np.dot(linalg.inv(precision), components_))\n precision /= -self.noise_variance_**2\n precision.flat[::len(precision) + 1] += 1.0 / self.noise_variance_\n return precision" }, { @@ -47324,7 +49242,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -47334,13 +49253,14 @@ "docstring": { "type": "array-like of shape (n_samples, n_components)", "description": "New data, where `n_samples` is the number of samples\nand `n_components` is the number of components." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Transform data back to its original space.\n\nIn other words, return an input `X_original` whose transform would be X.", - "docstring": "Transform data back to its original space.\n\nIn other words, return an input `X_original` whose transform would be X.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_components)\n New data, where `n_samples` is the number of samples\n and `n_components` is the number of components.\n\nReturns\n-------\nX_original array-like of shape (n_samples, n_features)\n Original data, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\nNotes\n-----\nIf whitening is enabled, inverse_transform will compute the\nexact inverse operation, which includes reversing whitening.", + "docstring": "Transform data back to its original space.\n\n In other words, return an input `X_original` whose transform would be X.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_components)\n New data, where `n_samples` is the number of samples\n and `n_components` is the number of components.\n\n Returns\n -------\n X_original array-like of shape (n_samples, n_features)\n Original data, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n Notes\n -----\n If whitening is enabled, inverse_transform will compute the\n exact inverse operation, which includes reversing whitening.\n ", "source_code": "\ndef inverse_transform(self, X):\n \"\"\"Transform data back to its original space.\n\n In other words, return an input `X_original` whose transform would be X.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_components)\n New data, where `n_samples` is the number of samples\n and `n_components` is the number of 
components.\n\n Returns\n -------\n X_original array-like of shape (n_samples, n_features)\n Original data, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n Notes\n -----\n If whitening is enabled, inverse_transform will compute the\n exact inverse operation, which includes reversing whitening.\n \"\"\"\n if self.whiten:\n return np.dot(X, np.sqrt(self.explained_variance_[:, np.newaxis]) * self.components_) + self.mean_\n else:\n return np.dot(X, self.components_) + self.mean_" }, { @@ -47358,7 +49278,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -47368,13 +49289,14 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "New data, where `n_samples` is the number of samples\nand `n_features` is the number of features." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Apply dimensionality reduction to X.\n\nX is projected on the first principal components previously extracted from a training set.", - "docstring": "Apply dimensionality reduction to X.\n\nX is projected on the first principal components previously extracted\nfrom a training set.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n New data, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\nReturns\n-------\nX_new : array-like of shape (n_samples, n_components)\n Projection of X in the first principal components, where `n_samples`\n is the number of samples and `n_components` is the number of the components.", + "description": "Apply dimensionality reduction to X.\n\nX is projected on the first principal components previously extracted\nfrom a training set.", + "docstring": "Apply dimensionality reduction to X.\n\n X is projected on the first principal components previously extracted\n from a training set.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n New data, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n Returns\n -------\n X_new : array-like of shape (n_samples, n_components)\n Projection of X in the first principal components, where `n_samples`\n is the number of samples and `n_components` is the number of the components.\n ", "source_code": "\ndef transform(self, X):\n \"\"\"Apply dimensionality reduction to X.\n\n X is projected on the first principal components previously extracted\n from a training set.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n New data, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n Returns\n -------\n X_new : array-like of shape (n_samples, n_components)\n Projection of X in the first principal components, where `n_samples`\n is the number of samples and `n_components` is the number of the components.\n \"\"\"\n check_is_fitted(self)\n X = self._validate_data(X, dtype=[np.float64, np.float32], reset=False)\n if self.mean_ is not None:\n X = X - self.mean_\n X_transformed = np.dot(X, self.components_.T)\n if self.whiten:\n X_transformed /= np.sqrt(self.explained_variance_)\n return X_transformed" }, { @@ -47392,7 +49314,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_components", @@ -47400,9 +49323,10 @@ "is_public": true, "assigned_by": "POSITION_OR_NAME", "docstring": { - "type": "int, default=n_features", - "description": "Number of dictionary elements to 
extract." - } + "type": "int, default=None", + "description": "Number of dictionary elements to extract. If None, then ``n_components``\nis set to ``n_features``." + }, + "refined_type": {} }, { "name": "alpha", @@ -47412,7 +49336,8 @@ "docstring": { "type": "float, default=1.0", "description": "Sparsity controlling parameter." - } + }, + "refined_type": {} }, { "name": "max_iter", @@ -47422,7 +49347,8 @@ "docstring": { "type": "int, default=1000", "description": "Maximum number of iterations to perform." - } + }, + "refined_type": {} }, { "name": "tol", @@ -47432,7 +49358,8 @@ "docstring": { "type": "float, default=1e-8", "description": "Tolerance for numerical error." - } + }, + "refined_type": {} }, { "name": "fit_algorithm", @@ -47442,6 +49369,10 @@ "docstring": { "type": "{'lars', 'cd'}, default='lars'", "description": "* `'lars'`: uses the least angle regression method to solve the lasso\n problem (:func:`~sklearn.linear_model.lars_path`);\n* `'cd'`: uses the coordinate descent method to compute the\n Lasso solution (:class:`~sklearn.linear_model.Lasso`). Lars will be\n faster if the estimated components are sparse.\n\n.. versionadded:: 0.17\n *cd* coordinate descent method to improve speed." + }, + "refined_type": { + "kind": "EnumType", + "values": ["cd", "lars"] } }, { @@ -47452,6 +49383,16 @@ "docstring": { "type": "{'lasso_lars', 'lasso_cd', 'lars', 'omp', 'threshold'}, default='omp'", "description": "Algorithm used to transform the data:\n\n- `'lars'`: uses the least angle regression method\n (:func:`~sklearn.linear_model.lars_path`);\n- `'lasso_lars'`: uses Lars to compute the Lasso solution.\n- `'lasso_cd'`: uses the coordinate descent method to compute the\n Lasso solution (:class:`~sklearn.linear_model.Lasso`). `'lasso_lars'`\n will be faster if the estimated components are sparse.\n- `'omp'`: uses orthogonal matching pursuit to estimate the sparse\n solution.\n- `'threshold'`: squashes to zero all coefficients less than alpha from\n the projection ``dictionary * X'``.\n\n.. versionadded:: 0.17\n *lasso_cd* coordinate descent method to improve speed." + }, + "refined_type": { + "kind": "EnumType", + "values": [ + "omp", + "lasso_cd", + "lasso_lars", + "lars", + "threshold" + ] } }, { @@ -47462,7 +49403,8 @@ "docstring": { "type": "int, default=None", "description": "Number of nonzero coefficients to target in each column of the\nsolution. This is only used by `algorithm='lars'` and\n`algorithm='omp'`. If `None`, then\n`transform_n_nonzero_coefs=int(n_features / 10)`." - } + }, + "refined_type": {} }, { "name": "transform_alpha", @@ -47472,7 +49414,8 @@ "docstring": { "type": "float, default=None", "description": "If `algorithm='lasso_lars'` or `algorithm='lasso_cd'`, `alpha` is the\npenalty applied to the L1 norm.\nIf `algorithm='threshold'`, `alpha` is the absolute value of the\nthreshold below which coefficients will be squashed to zero.\nIf `None`, defaults to `alpha`." - } + }, + "refined_type": {} }, { "name": "n_jobs", @@ -47482,7 +49425,8 @@ "docstring": { "type": "int or None, default=None", "description": "Number of parallel jobs to run.\n``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n``-1`` means using all processors. See :term:`Glossary `\nfor more details." - } + }, + "refined_type": {} }, { "name": "code_init", @@ -47492,7 +49436,8 @@ "docstring": { "type": "ndarray of shape (n_samples, n_components), default=None", "description": "Initial value for the code, for warm restart. Only used if `code_init`\nand `dict_init` are not None." 
- } + }, + "refined_type": {} }, { "name": "dict_init", @@ -47502,7 +49447,8 @@ "docstring": { "type": "ndarray of shape (n_components, n_features), default=None", "description": "Initial values for the dictionary, for warm restart. Only used if\n`code_init` and `dict_init` are not None." - } + }, + "refined_type": {} }, { "name": "verbose", @@ -47512,7 +49458,8 @@ "docstring": { "type": "bool, default=False", "description": "To control the verbosity of the procedure." - } + }, + "refined_type": {} }, { "name": "split_sign", @@ -47522,7 +49469,8 @@ "docstring": { "type": "bool, default=False", "description": "Whether to split the sparse feature vector into the concatenation of\nits negative part and its positive part. This can improve the\nperformance of downstream classifiers." - } + }, + "refined_type": {} }, { "name": "random_state", @@ -47532,7 +49480,8 @@ "docstring": { "type": "int, RandomState instance or None, default=None", "description": "Used for initializing the dictionary when ``dict_init`` is not\nspecified, randomly shuffling the data when ``shuffle`` is set to\n``True``, and updating the dictionary. Pass an int for reproducible\nresults across multiple function calls.\nSee :term:`Glossary `." - } + }, + "refined_type": {} }, { "name": "positive_code", @@ -47542,7 +49491,8 @@ "docstring": { "type": "bool, default=False", "description": "Whether to enforce positivity when finding the code.\n\n.. versionadded:: 0.20" - } + }, + "refined_type": {} }, { "name": "positive_dict", @@ -47552,7 +49502,8 @@ "docstring": { "type": "bool, default=False", "description": "Whether to enforce positivity when finding the dictionary.\n\n.. versionadded:: 0.20" - } + }, + "refined_type": {} }, { "name": "transform_max_iter", @@ -47562,13 +49513,14 @@ "docstring": { "type": "int, default=1000", "description": "Maximum number of iterations to perform if `algorithm='lasso_cd'` or\n`'lasso_lars'`.\n\n.. versionadded:: 0.22" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, n_components=None, *, alpha=1, max_iter=1000, tol=1e-08, fit_algorithm='lars', transform_algorithm='omp', transform_n_nonzero_coefs=None, transform_alpha=None, n_jobs=None, code_init=None, dict_init=None, verbose=False, split_sign=False, random_state=None, positive_code=False, positive_dict=False, transform_max_iter=1000):\n super().__init__(transform_algorithm, transform_n_nonzero_coefs, transform_alpha, split_sign, n_jobs, positive_code, transform_max_iter)\n self.n_components = n_components\n self.alpha = alpha\n self.max_iter = max_iter\n self.tol = tol\n self.fit_algorithm = fit_algorithm\n self.code_init = code_init\n self.dict_init = dict_init\n self.verbose = verbose\n self.random_state = random_state\n self.positive_dict = positive_dict" }, { @@ -47586,7 +49538,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -47596,7 +49549,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "Training vector, where `n_samples` is the number of samples\nand `n_features` is the number of features." - } + }, + "refined_type": {} }, { "name": "y", @@ -47606,13 +49560,14 @@ "docstring": { "type": "Ignored", "description": "Not used, present for API consistency by convention." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Fit the model from data in X.", - "docstring": "Fit the model from data in X.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n Training vector, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\ny : Ignored\n Not used, present for API consistency by convention.\n\nReturns\n-------\nself : object\n Returns the instance itself.", + "docstring": "Fit the model from data in X.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training vector, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n Returns\n -------\n self : object\n Returns the instance itself.\n ", "source_code": "\ndef fit(self, X, y=None):\n \"\"\"Fit the model from data in X.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training vector, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n Returns\n -------\n self : object\n Returns the instance itself.\n \"\"\"\n random_state = check_random_state(self.random_state)\n X = self._validate_data(X)\n if self.n_components is None:\n n_components = X.shape[1]\n else:\n n_components = self.n_components\n (V, U, E, self.n_iter_) = dict_learning(X, n_components, alpha=self.alpha, tol=self.tol, max_iter=self.max_iter, method=self.fit_algorithm, method_max_iter=self.transform_max_iter, n_jobs=self.n_jobs, code_init=self.code_init, dict_init=self.dict_init, verbose=self.verbose, random_state=random_state, return_n_iter=True, positive_dict=self.positive_dict, positive_code=self.positive_code)\n self.components_ = U\n self.error_ = E\n return self" }, { @@ -47630,7 +49585,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_components", @@ -47640,7 +49596,8 @@ "docstring": { "type": "int, default=None", "description": "Number of dictionary elements to extract." - } + }, + "refined_type": {} }, { "name": "alpha", @@ -47650,7 +49607,8 @@ "docstring": { "type": "float, default=1", "description": "Sparsity controlling parameter." - } + }, + "refined_type": {} }, { "name": "n_iter", @@ -47660,7 +49618,8 @@ "docstring": { "type": "int, default=1000", "description": "Total number of iterations to perform." - } + }, + "refined_type": {} }, { "name": "fit_algorithm", @@ -47670,6 +49629,10 @@ "docstring": { "type": "{'lars', 'cd'}, default='lars'", "description": "The algorithm used:\n\n- `'lars'`: uses the least angle regression method to solve the lasso\n problem (`linear_model.lars_path`)\n- `'cd'`: uses the coordinate descent method to compute the\n Lasso solution (`linear_model.Lasso`). Lars will be faster if\n the estimated components are sparse." + }, + "refined_type": { + "kind": "EnumType", + "values": ["cd", "lars"] } }, { @@ -47680,7 +49643,8 @@ "docstring": { "type": "int, default=None", "description": "Number of parallel jobs to run.\n``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n``-1`` means using all processors. See :term:`Glossary `\nfor more details." - } + }, + "refined_type": {} }, { "name": "batch_size", @@ -47690,7 +49654,8 @@ "docstring": { "type": "int, default=3", "description": "Number of samples in each mini-batch." 
- } + }, + "refined_type": {} }, { "name": "shuffle", @@ -47700,7 +49665,8 @@ "docstring": { "type": "bool, default=True", "description": "Whether to shuffle the samples before forming batches." - } + }, + "refined_type": {} }, { "name": "dict_init", @@ -47710,7 +49676,8 @@ "docstring": { "type": "ndarray of shape (n_components, n_features), default=None", "description": "Initial value of the dictionary for warm restart scenarios." - } + }, + "refined_type": {} }, { "name": "transform_algorithm", @@ -47720,6 +49687,16 @@ "docstring": { "type": "{'lasso_lars', 'lasso_cd', 'lars', 'omp', 'threshold'}, default='omp'", "description": "Algorithm used to transform the data:\n\n- `'lars'`: uses the least angle regression method\n (`linear_model.lars_path`);\n- `'lasso_lars'`: uses Lars to compute the Lasso solution.\n- `'lasso_cd'`: uses the coordinate descent method to compute the\n Lasso solution (`linear_model.Lasso`). `'lasso_lars'` will be faster\n if the estimated components are sparse.\n- `'omp'`: uses orthogonal matching pursuit to estimate the sparse\n solution.\n- `'threshold'`: squashes to zero all coefficients less than alpha from\n the projection ``dictionary * X'``." + }, + "refined_type": { + "kind": "EnumType", + "values": [ + "omp", + "lasso_cd", + "lasso_lars", + "lars", + "threshold" + ] } }, { @@ -47730,7 +49707,8 @@ "docstring": { "type": "int, default=None", "description": "Number of nonzero coefficients to target in each column of the\nsolution. This is only used by `algorithm='lars'` and\n`algorithm='omp'`. If `None`, then\n`transform_n_nonzero_coefs=int(n_features / 10)`." - } + }, + "refined_type": {} }, { "name": "transform_alpha", @@ -47740,7 +49718,8 @@ "docstring": { "type": "float, default=None", "description": "If `algorithm='lasso_lars'` or `algorithm='lasso_cd'`, `alpha` is the\npenalty applied to the L1 norm.\nIf `algorithm='threshold'`, `alpha` is the absolute value of the\nthreshold below which coefficients will be squashed to zero.\nIf `None`, defaults to `alpha`." - } + }, + "refined_type": {} }, { "name": "verbose", @@ -47750,7 +49729,8 @@ "docstring": { "type": "bool, default=False", "description": "To control the verbosity of the procedure." - } + }, + "refined_type": {} }, { "name": "split_sign", @@ -47760,7 +49740,8 @@ "docstring": { "type": "bool, default=False", "description": "Whether to split the sparse feature vector into the concatenation of\nits negative part and its positive part. This can improve the\nperformance of downstream classifiers." - } + }, + "refined_type": {} }, { "name": "random_state", @@ -47770,7 +49751,8 @@ "docstring": { "type": "int, RandomState instance or None, default=None", "description": "Used for initializing the dictionary when ``dict_init`` is not\nspecified, randomly shuffling the data when ``shuffle`` is set to\n``True``, and updating the dictionary. Pass an int for reproducible\nresults across multiple function calls.\nSee :term:`Glossary `." - } + }, + "refined_type": {} }, { "name": "positive_code", @@ -47780,7 +49762,8 @@ "docstring": { "type": "bool, default=False", "description": "Whether to enforce positivity when finding the code.\n\n.. versionadded:: 0.20" - } + }, + "refined_type": {} }, { "name": "positive_dict", @@ -47790,7 +49773,8 @@ "docstring": { "type": "bool, default=False", "description": "Whether to enforce positivity when finding the dictionary.\n\n.. 
versionadded:: 0.20" - } + }, + "refined_type": {} }, { "name": "transform_max_iter", @@ -47800,13 +49784,14 @@ "docstring": { "type": "int, default=1000", "description": "Maximum number of iterations to perform if `algorithm='lasso_cd'` or\n`'lasso_lars'`.\n\n.. versionadded:: 0.22" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, n_components=None, *, alpha=1, n_iter=1000, fit_algorithm='lars', n_jobs=None, batch_size=3, shuffle=True, dict_init=None, transform_algorithm='omp', transform_n_nonzero_coefs=None, transform_alpha=None, verbose=False, split_sign=False, random_state=None, positive_code=False, positive_dict=False, transform_max_iter=1000):\n super().__init__(transform_algorithm, transform_n_nonzero_coefs, transform_alpha, split_sign, n_jobs, positive_code, transform_max_iter)\n self.n_components = n_components\n self.alpha = alpha\n self.n_iter = n_iter\n self.fit_algorithm = fit_algorithm\n self.dict_init = dict_init\n self.verbose = verbose\n self.shuffle = shuffle\n self.batch_size = batch_size\n self.split_sign = split_sign\n self.random_state = random_state\n self.positive_dict = positive_dict" }, { @@ -47824,7 +49809,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -47834,7 +49820,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "Training vector, where `n_samples` is the number of samples\nand `n_features` is the number of features." - } + }, + "refined_type": {} }, { "name": "y", @@ -47844,13 +49831,14 @@ "docstring": { "type": "Ignored", "description": "Not used, present for API consistency by convention." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Fit the model from data in X.", - "docstring": "Fit the model from data in X.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n Training vector, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\ny : Ignored\n Not used, present for API consistency by convention.\n\nReturns\n-------\nself : object\n Returns the instance itself.", + "docstring": "Fit the model from data in X.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training vector, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n Returns\n -------\n self : object\n Returns the instance itself.\n ", "source_code": "\ndef fit(self, X, y=None):\n \"\"\"Fit the model from data in X.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training vector, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n Returns\n -------\n self : object\n Returns the instance itself.\n \"\"\"\n random_state = check_random_state(self.random_state)\n X = self._validate_data(X)\n (U, (A, B), self.n_iter_) = dict_learning_online(X, self.n_components, alpha=self.alpha, n_iter=self.n_iter, return_code=False, method=self.fit_algorithm, method_max_iter=self.transform_max_iter, n_jobs=self.n_jobs, dict_init=self.dict_init, batch_size=self.batch_size, shuffle=self.shuffle, verbose=self.verbose, random_state=random_state, return_inner_stats=True, return_n_iter=True, positive_dict=self.positive_dict, 
positive_code=self.positive_code)\n self.components_ = U\n self.inner_stats_ = (A, B)\n self.iter_offset_ = self.n_iter\n self.random_state_ = random_state\n return self" }, { @@ -47868,7 +49856,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -47878,7 +49867,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "Training vector, where `n_samples` is the number of samples\nand `n_features` is the number of features." - } + }, + "refined_type": {} }, { "name": "y", @@ -47888,7 +49878,8 @@ "docstring": { "type": "Ignored", "description": "Not used, present for API consistency by convention." - } + }, + "refined_type": {} }, { "name": "iter_offset", @@ -47898,13 +49889,14 @@ "docstring": { "type": "int, default=None", "description": "The number of iteration on data batches that has been\nperformed before this call to `partial_fit`. This is optional:\nif no number is passed, the memory of the object is\nused." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Update the model using the data in X as a mini-batch.", - "docstring": "Update the model using the data in X as a mini-batch.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n Training vector, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\ny : Ignored\n Not used, present for API consistency by convention.\n\niter_offset : int, default=None\n The number of iteration on data batches that has been\n performed before this call to `partial_fit`. This is optional:\n if no number is passed, the memory of the object is\n used.\n\nReturns\n-------\nself : object\n Returns the instance itself.", + "docstring": "Update the model using the data in X as a mini-batch.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training vector, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n iter_offset : int, default=None\n The number of iteration on data batches that has been\n performed before this call to `partial_fit`. This is optional:\n if no number is passed, the memory of the object is\n used.\n\n Returns\n -------\n self : object\n Returns the instance itself.\n ", "source_code": "\ndef partial_fit(self, X, y=None, iter_offset=None):\n \"\"\"Update the model using the data in X as a mini-batch.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training vector, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n iter_offset : int, default=None\n The number of iteration on data batches that has been\n performed before this call to `partial_fit`. 
This is optional:\n if no number is passed, the memory of the object is\n used.\n\n Returns\n -------\n self : object\n Returns the instance itself.\n \"\"\"\n if not hasattr(self, 'random_state_'):\n self.random_state_ = check_random_state(self.random_state)\n if hasattr(self, 'components_'):\n dict_init = self.components_\n else:\n dict_init = self.dict_init\n inner_stats = getattr(self, 'inner_stats_', None)\n if iter_offset is None:\n iter_offset = getattr(self, 'iter_offset_', 0)\n X = self._validate_data(X, reset=iter_offset == 0)\n (U, (A, B)) = dict_learning_online(X, self.n_components, alpha=self.alpha, n_iter=1, method=self.fit_algorithm, method_max_iter=self.transform_max_iter, n_jobs=self.n_jobs, dict_init=dict_init, batch_size=len(X), shuffle=False, verbose=self.verbose, return_code=False, iter_offset=iter_offset, random_state=self.random_state_, return_inner_stats=True, inner_stats=inner_stats, positive_dict=self.positive_dict, positive_code=self.positive_code)\n self.components_ = U\n self.inner_stats_ = (A, B)\n self.iter_offset_ = iter_offset + 1\n return self" }, { @@ -47922,7 +49914,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "dictionary", @@ -47932,7 +49925,8 @@ "docstring": { "type": "ndarray of shape (n_components, n_features)", "description": "The dictionary atoms used for sparse coding. Lines are assumed to be\nnormalized to unit norm." - } + }, + "refined_type": {} }, { "name": "transform_algorithm", @@ -47942,6 +49936,16 @@ "docstring": { "type": "{'lasso_lars', 'lasso_cd', 'lars', 'omp', 'threshold'}, default='omp'", "description": "Algorithm used to transform the data:\n\n- `'lars'`: uses the least angle regression method\n (`linear_model.lars_path`);\n- `'lasso_lars'`: uses Lars to compute the Lasso solution;\n- `'lasso_cd'`: uses the coordinate descent method to compute the\n Lasso solution (linear_model.Lasso). `'lasso_lars'` will be faster if\n the estimated components are sparse;\n- `'omp'`: uses orthogonal matching pursuit to estimate the sparse\n solution;\n- `'threshold'`: squashes to zero all coefficients less than alpha from\n the projection ``dictionary * X'``." + }, + "refined_type": { + "kind": "EnumType", + "values": [ + "omp", + "lasso_cd", + "lasso_lars", + "lars", + "threshold" + ] } }, { @@ -47952,7 +49956,8 @@ "docstring": { "type": "int, default=None", "description": "Number of nonzero coefficients to target in each column of the\nsolution. This is only used by `algorithm='lars'` and `algorithm='omp'`\nand is overridden by `alpha` in the `omp` case. If `None`, then\n`transform_n_nonzero_coefs=int(n_features / 10)`." - } + }, + "refined_type": {} }, { "name": "transform_alpha", @@ -47962,7 +49967,8 @@ "docstring": { "type": "float, default=None", "description": "If `algorithm='lasso_lars'` or `algorithm='lasso_cd'`, `alpha` is the\npenalty applied to the L1 norm.\nIf `algorithm='threshold'`, `alpha` is the absolute value of the\nthreshold below which coefficients will be squashed to zero.\nIf `algorithm='omp'`, `alpha` is the tolerance parameter: the value of\nthe reconstruction error targeted. In this case, it overrides\n`n_nonzero_coefs`.\nIf `None`, default to 1." - } + }, + "refined_type": {} }, { "name": "split_sign", @@ -47972,7 +49978,8 @@ "docstring": { "type": "bool, default=False", "description": "Whether to split the sparse feature vector into the concatenation of\nits negative part and its positive part. This can improve the\nperformance of downstream classifiers." 
- } + }, + "refined_type": {} }, { "name": "n_jobs", @@ -47982,7 +49989,8 @@ "docstring": { "type": "int, default=None", "description": "Number of parallel jobs to run.\n``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n``-1`` means using all processors. See :term:`Glossary `\nfor more details." - } + }, + "refined_type": {} }, { "name": "positive_code", @@ -47992,7 +50000,8 @@ "docstring": { "type": "bool, default=False", "description": "Whether to enforce positivity when finding the code.\n\n.. versionadded:: 0.20" - } + }, + "refined_type": {} }, { "name": "transform_max_iter", @@ -48002,13 +50011,14 @@ "docstring": { "type": "int, default=1000", "description": "Maximum number of iterations to perform if `algorithm='lasso_cd'` or\n`lasso_lars`.\n\n.. versionadded:: 0.22" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, dictionary, *, transform_algorithm='omp', transform_n_nonzero_coefs=None, transform_alpha=None, split_sign=False, n_jobs=None, positive_code=False, transform_max_iter=1000):\n super().__init__(transform_algorithm, transform_n_nonzero_coefs, transform_alpha, split_sign, n_jobs, positive_code, transform_max_iter)\n self.dictionary = dictionary" }, { @@ -48026,13 +50036,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _more_tags(self):\n return {'requires_fit': False}" }, { @@ -48053,13 +50064,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@deprecated('The attribute `components_` is deprecated in 0.24 and will be removed in 1.1 (renaming of 0.26). Use the `dictionary` instead.')\n@property\ndef components_(self):\n return self.dictionary" }, { @@ -48077,7 +50089,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -48087,7 +50100,8 @@ "docstring": { "type": "Ignored", "description": "Not used, present for API consistency by convention." - } + }, + "refined_type": {} }, { "name": "y", @@ -48097,13 +50111,14 @@ "docstring": { "type": "Ignored", "description": "Not used, present for API consistency by convention." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Do nothing and return the estimator unchanged.\n\nThis method is just there to implement the usual API and hence work in pipelines.", - "docstring": "Do nothing and return the estimator unchanged.\n\nThis method is just there to implement the usual API and hence\nwork in pipelines.\n\nParameters\n----------\nX : Ignored\n Not used, present for API consistency by convention.\n\ny : Ignored\n Not used, present for API consistency by convention.\n\nReturns\n-------\nself : object\n Returns the instance itself.", + "description": "Do nothing and return the estimator unchanged.\n\nThis method is just there to implement the usual API and hence\nwork in pipelines.", + "docstring": "Do nothing and return the estimator unchanged.\n\n This method is just there to implement the usual API and hence\n work in pipelines.\n\n Parameters\n ----------\n X : Ignored\n Not used, present for API consistency by convention.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n Returns\n -------\n self : object\n Returns the instance itself.\n ", "source_code": "\ndef fit(self, X, y=None):\n \"\"\"Do nothing and return the estimator unchanged.\n\n This method is just there to implement the usual API and hence\n work in pipelines.\n\n Parameters\n ----------\n X : Ignored\n Not used, present for API consistency by convention.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n Returns\n -------\n self : object\n Returns the instance itself.\n \"\"\"\n return self" }, { @@ -48121,7 +50136,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -48145,7 +50161,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -48169,7 +50186,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -48179,7 +50197,8 @@ "docstring": { "type": "ndarray of shape (n_samples, n_features)", "description": "Training vector, where `n_samples` is the number of samples\nand `n_features` is the number of features." - } + }, + "refined_type": {} }, { "name": "y", @@ -48189,13 +50208,14 @@ "docstring": { "type": "Ignored", "description": "Not used, present for API consistency by convention." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Encode the data as a sparse combination of the dictionary atoms.\n\nCoding method is determined by the object parameter `transform_algorithm`.", - "docstring": "Encode the data as a sparse combination of the dictionary atoms.\n\nCoding method is determined by the object parameter\n`transform_algorithm`.\n\nParameters\n----------\nX : ndarray of shape (n_samples, n_features)\n Training vector, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\ny : Ignored\n Not used, present for API consistency by convention.\n\nReturns\n-------\nX_new : ndarray of shape (n_samples, n_components)\n Transformed data.", + "description": "Encode the data as a sparse combination of the dictionary atoms.\n\nCoding method is determined by the object parameter\n`transform_algorithm`.", + "docstring": "Encode the data as a sparse combination of the dictionary atoms.\n\n Coding method is determined by the object parameter\n `transform_algorithm`.\n\n Parameters\n ----------\n X : ndarray of shape (n_samples, n_features)\n Training vector, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n Returns\n -------\n X_new : ndarray of shape (n_samples, n_components)\n Transformed data.\n ", "source_code": "\ndef transform(self, X, y=None):\n \"\"\"Encode the data as a sparse combination of the dictionary atoms.\n\n Coding method is determined by the object parameter\n `transform_algorithm`.\n\n Parameters\n ----------\n X : ndarray of shape (n_samples, n_features)\n Training vector, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n Returns\n -------\n X_new : ndarray of shape (n_samples, n_components)\n Transformed data.\n \"\"\"\n return super()._transform(X, self.dictionary)" }, { @@ -48213,7 +50233,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "transform_algorithm", @@ -48223,7 +50244,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "transform_n_nonzero_coefs", @@ -48233,7 +50255,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "transform_alpha", @@ -48243,7 +50266,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "split_sign", @@ -48253,7 +50277,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_jobs", @@ -48263,7 +50288,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "positive_code", @@ -48273,7 +50299,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "transform_max_iter", @@ -48283,13 +50310,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, transform_algorithm, transform_n_nonzero_coefs, transform_alpha, split_sign, n_jobs, positive_code, transform_max_iter):\n self.transform_algorithm = transform_algorithm\n self.transform_n_nonzero_coefs = transform_n_nonzero_coefs\n self.transform_alpha = transform_alpha\n self.transform_max_iter = transform_max_iter\n self.split_sign = split_sign\n self.n_jobs = 
n_jobs\n self.positive_code = positive_code" }, { @@ -48307,7 +50335,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -48317,7 +50346,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "dictionary", @@ -48327,13 +50357,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Private method allowing to accommodate both DictionaryLearning and SparseCoder.", - "docstring": "Private method allowing to accommodate both DictionaryLearning and\nSparseCoder.", + "description": "Private method allowing to accommodate both DictionaryLearning and\nSparseCoder.", + "docstring": "Private method allowing to accommodate both DictionaryLearning and\n SparseCoder.", "source_code": "\ndef _transform(self, X, dictionary):\n \"\"\"Private method allowing to accommodate both DictionaryLearning and\n SparseCoder.\"\"\"\n X = self._validate_data(X, reset=False)\n if hasattr(self, 'alpha') and self.alpha != 1.0 and self.transform_alpha is None:\n warnings.warn('By default transform_alpha will be equal toalpha instead of 1.0 starting from version 1.2', FutureWarning)\n transform_alpha = 1.0\n else:\n transform_alpha = self.transform_alpha\n code = sparse_encode(X, dictionary, algorithm=self.transform_algorithm, n_nonzero_coefs=self.transform_n_nonzero_coefs, alpha=transform_alpha, max_iter=self.transform_max_iter, n_jobs=self.n_jobs, positive=self.positive_code)\n if self.split_sign:\n (n_samples, n_features) = code.shape\n split_code = np.empty((n_samples, 2 * n_features))\n split_code[:, :n_features] = np.maximum(code, 0)\n split_code[:, n_features:] = -np.minimum(code, 0)\n code = split_code\n return code" }, { @@ -48351,7 +50382,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -48361,13 +50393,14 @@ "docstring": { "type": "ndarray of shape (n_samples, n_features)", "description": "Test data to be transformed, must have the same number of\nfeatures as the data used to train the model." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Encode the data as a sparse combination of the dictionary atoms.\n\nCoding method is determined by the object parameter `transform_algorithm`.", - "docstring": "Encode the data as a sparse combination of the dictionary atoms.\n\nCoding method is determined by the object parameter\n`transform_algorithm`.\n\nParameters\n----------\nX : ndarray of shape (n_samples, n_features)\n Test data to be transformed, must have the same number of\n features as the data used to train the model.\n\nReturns\n-------\nX_new : ndarray of shape (n_samples, n_components)\n Transformed data.", + "description": "Encode the data as a sparse combination of the dictionary atoms.\n\nCoding method is determined by the object parameter\n`transform_algorithm`.", + "docstring": "Encode the data as a sparse combination of the dictionary atoms.\n\n Coding method is determined by the object parameter\n `transform_algorithm`.\n\n Parameters\n ----------\n X : ndarray of shape (n_samples, n_features)\n Test data to be transformed, must have the same number of\n features as the data used to train the model.\n\n Returns\n -------\n X_new : ndarray of shape (n_samples, n_components)\n Transformed data.\n ", "source_code": "\ndef transform(self, X):\n \"\"\"Encode the data as a sparse combination of the dictionary atoms.\n\n Coding method is determined by the object parameter\n `transform_algorithm`.\n\n Parameters\n ----------\n X : ndarray of shape (n_samples, n_features)\n Test data to be transformed, must have the same number of\n features as the data used to train the model.\n\n Returns\n -------\n X_new : ndarray of shape (n_samples, n_components)\n Transformed data.\n \"\"\"\n check_is_fitted(self)\n return self._transform(X, self.components_)" }, { @@ -48385,7 +50418,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "positive", @@ -48395,13 +50429,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _check_positive_coding(method, positive):\n if positive and method in ['omp', 'lars']:\n raise ValueError(\"Positive constraint not supported for '{}' coding method.\".format(method))" }, { @@ -48419,7 +50454,8 @@ "docstring": { "type": "ndarray of shape (n_samples, n_features)", "description": "Data matrix." - } + }, + "refined_type": {} }, { "name": "dictionary", @@ -48429,7 +50465,8 @@ "docstring": { "type": "ndarray of shape (n_components, n_features)", "description": "The dictionary matrix against which to solve the sparse coding of\nthe data. Some of the algorithms assume normalized rows." - } + }, + "refined_type": {} }, { "name": "gram", @@ -48439,7 +50476,8 @@ "docstring": { "type": "ndarray of shape (n_components, n_components) or None", "description": "Precomputed Gram matrix, `dictionary * dictionary'`\ngram can be `None` if method is 'threshold'." - } + }, + "refined_type": {} }, { "name": "cov", @@ -48449,7 +50487,8 @@ "docstring": { "type": "ndarray of shape (n_components, n_samples), default=None", "description": "Precomputed covariance, `dictionary * X'`." 
- } + }, + "refined_type": {} }, { "name": "algorithm", @@ -48459,6 +50498,16 @@ "docstring": { "type": "{'lasso_lars', 'lasso_cd', 'lars', 'omp', 'threshold'}, default='lasso_lars'", "description": "The algorithm used:\n\n* `'lars'`: uses the least angle regression method\n (`linear_model.lars_path`);\n* `'lasso_lars'`: uses Lars to compute the Lasso solution;\n* `'lasso_cd'`: uses the coordinate descent method to compute the\n Lasso solution (`linear_model.Lasso`). lasso_lars will be faster if\n the estimated components are sparse;\n* `'omp'`: uses orthogonal matching pursuit to estimate the sparse\n solution;\n* `'threshold'`: squashes to zero all coefficients less than\n regularization from the projection `dictionary * data'`." + }, + "refined_type": { + "kind": "EnumType", + "values": [ + "omp", + "lasso_cd", + "lasso_lars", + "lars", + "threshold" + ] } }, { @@ -48469,7 +50518,8 @@ "docstring": { "type": "int or float, default=None", "description": "The regularization parameter. It corresponds to alpha when\nalgorithm is `'lasso_lars'`, `'lasso_cd'` or `'threshold'`.\nOtherwise it corresponds to `n_nonzero_coefs`." - } + }, + "refined_type": {} }, { "name": "copy_cov", @@ -48479,7 +50529,8 @@ "docstring": { "type": "bool, default=True", "description": "Whether to copy the precomputed covariance matrix; if `False`, it may\nbe overwritten." - } + }, + "refined_type": {} }, { "name": "init", @@ -48489,7 +50540,8 @@ "docstring": { "type": "ndarray of shape (n_samples, n_components), default=None", "description": "Initialization value of the sparse code. Only used if\n`algorithm='lasso_cd'`." - } + }, + "refined_type": {} }, { "name": "max_iter", @@ -48499,7 +50551,8 @@ "docstring": { "type": "int, default=1000", "description": "Maximum number of iterations to perform if `algorithm='lasso_cd'` or\n`'lasso_lars'`." - } + }, + "refined_type": {} }, { "name": "check_input", @@ -48509,7 +50562,8 @@ "docstring": { "type": "bool, default=True", "description": "If `False`, the input arrays `X` and dictionary will not be checked." - } + }, + "refined_type": {} }, { "name": "verbose", @@ -48519,7 +50573,8 @@ "docstring": { "type": "int, default=0", "description": "Controls the verbosity; the higher, the more messages." - } + }, + "refined_type": {} }, { "name": "positive", @@ -48529,13 +50584,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Generic sparse coding.\n\nEach column of the result is the solution to a Lasso problem.", - "docstring": "Generic sparse coding.\n\nEach column of the result is the solution to a Lasso problem.\n\nParameters\n----------\nX : ndarray of shape (n_samples, n_features)\n Data matrix.\n\ndictionary : ndarray of shape (n_components, n_features)\n The dictionary matrix against which to solve the sparse coding of\n the data. 
Some of the algorithms assume normalized rows.\n\ngram : ndarray of shape (n_components, n_components) or None\n Precomputed Gram matrix, `dictionary * dictionary'`\n gram can be `None` if method is 'threshold'.\n\ncov : ndarray of shape (n_components, n_samples), default=None\n Precomputed covariance, `dictionary * X'`.\n\nalgorithm : {'lasso_lars', 'lasso_cd', 'lars', 'omp', 'threshold'}, default='lasso_lars'\n The algorithm used:\n\n * `'lars'`: uses the least angle regression method\n (`linear_model.lars_path`);\n * `'lasso_lars'`: uses Lars to compute the Lasso solution;\n * `'lasso_cd'`: uses the coordinate descent method to compute the\n Lasso solution (`linear_model.Lasso`). lasso_lars will be faster if\n the estimated components are sparse;\n * `'omp'`: uses orthogonal matching pursuit to estimate the sparse\n solution;\n * `'threshold'`: squashes to zero all coefficients less than\n regularization from the projection `dictionary * data'`.\n\nregularization : int or float, default=None\n The regularization parameter. It corresponds to alpha when\n algorithm is `'lasso_lars'`, `'lasso_cd'` or `'threshold'`.\n Otherwise it corresponds to `n_nonzero_coefs`.\n\ninit : ndarray of shape (n_samples, n_components), default=None\n Initialization value of the sparse code. Only used if\n `algorithm='lasso_cd'`.\n\nmax_iter : int, default=1000\n Maximum number of iterations to perform if `algorithm='lasso_cd'` or\n `'lasso_lars'`.\n\ncopy_cov : bool, default=True\n Whether to copy the precomputed covariance matrix; if `False`, it may\n be overwritten.\n\ncheck_input : bool, default=True\n If `False`, the input arrays `X` and dictionary will not be checked.\n\nverbose : int, default=0\n Controls the verbosity; the higher, the more messages.\n\npositive: bool, default=False\n Whether to enforce a positivity constraint on the sparse code.\n\n .. versionadded:: 0.20\n\nReturns\n-------\ncode : ndarray of shape (n_components, n_features)\n The sparse codes.\n\nSee Also\n--------\nsklearn.linear_model.lars_path\nsklearn.linear_model.orthogonal_mp\nsklearn.linear_model.Lasso\nSparseCoder", + "docstring": "Generic sparse coding.\n\n Each column of the result is the solution to a Lasso problem.\n\n Parameters\n ----------\n X : ndarray of shape (n_samples, n_features)\n Data matrix.\n\n dictionary : ndarray of shape (n_components, n_features)\n The dictionary matrix against which to solve the sparse coding of\n the data. Some of the algorithms assume normalized rows.\n\n gram : ndarray of shape (n_components, n_components) or None\n Precomputed Gram matrix, `dictionary * dictionary'`\n gram can be `None` if method is 'threshold'.\n\n cov : ndarray of shape (n_components, n_samples), default=None\n Precomputed covariance, `dictionary * X'`.\n\n algorithm : {'lasso_lars', 'lasso_cd', 'lars', 'omp', 'threshold'}, default='lasso_lars'\n The algorithm used:\n\n * `'lars'`: uses the least angle regression method\n (`linear_model.lars_path`);\n * `'lasso_lars'`: uses Lars to compute the Lasso solution;\n * `'lasso_cd'`: uses the coordinate descent method to compute the\n Lasso solution (`linear_model.Lasso`). lasso_lars will be faster if\n the estimated components are sparse;\n * `'omp'`: uses orthogonal matching pursuit to estimate the sparse\n solution;\n * `'threshold'`: squashes to zero all coefficients less than\n regularization from the projection `dictionary * data'`.\n\n regularization : int or float, default=None\n The regularization parameter. 
It corresponds to alpha when\n algorithm is `'lasso_lars'`, `'lasso_cd'` or `'threshold'`.\n Otherwise it corresponds to `n_nonzero_coefs`.\n\n init : ndarray of shape (n_samples, n_components), default=None\n Initialization value of the sparse code. Only used if\n `algorithm='lasso_cd'`.\n\n max_iter : int, default=1000\n Maximum number of iterations to perform if `algorithm='lasso_cd'` or\n `'lasso_lars'`.\n\n copy_cov : bool, default=True\n Whether to copy the precomputed covariance matrix; if `False`, it may\n be overwritten.\n\n check_input : bool, default=True\n If `False`, the input arrays `X` and dictionary will not be checked.\n\n verbose : int, default=0\n Controls the verbosity; the higher, the more messages.\n\n positive: bool, default=False\n Whether to enforce a positivity constraint on the sparse code.\n\n .. versionadded:: 0.20\n\n Returns\n -------\n code : ndarray of shape (n_components, n_features)\n The sparse codes.\n\n See Also\n --------\n sklearn.linear_model.lars_path\n sklearn.linear_model.orthogonal_mp\n sklearn.linear_model.Lasso\n SparseCoder\n ", "source_code": "\ndef _sparse_encode(X, dictionary, gram, cov=None, algorithm='lasso_lars', regularization=None, copy_cov=True, init=None, max_iter=1000, check_input=True, verbose=0, positive=False):\n \"\"\"Generic sparse coding.\n\n Each column of the result is the solution to a Lasso problem.\n\n Parameters\n ----------\n X : ndarray of shape (n_samples, n_features)\n Data matrix.\n\n dictionary : ndarray of shape (n_components, n_features)\n The dictionary matrix against which to solve the sparse coding of\n the data. Some of the algorithms assume normalized rows.\n\n gram : ndarray of shape (n_components, n_components) or None\n Precomputed Gram matrix, `dictionary * dictionary'`\n gram can be `None` if method is 'threshold'.\n\n cov : ndarray of shape (n_components, n_samples), default=None\n Precomputed covariance, `dictionary * X'`.\n\n algorithm : {'lasso_lars', 'lasso_cd', 'lars', 'omp', 'threshold'}, default='lasso_lars'\n The algorithm used:\n\n * `'lars'`: uses the least angle regression method\n (`linear_model.lars_path`);\n * `'lasso_lars'`: uses Lars to compute the Lasso solution;\n * `'lasso_cd'`: uses the coordinate descent method to compute the\n Lasso solution (`linear_model.Lasso`). lasso_lars will be faster if\n the estimated components are sparse;\n * `'omp'`: uses orthogonal matching pursuit to estimate the sparse\n solution;\n * `'threshold'`: squashes to zero all coefficients less than\n regularization from the projection `dictionary * data'`.\n\n regularization : int or float, default=None\n The regularization parameter. It corresponds to alpha when\n algorithm is `'lasso_lars'`, `'lasso_cd'` or `'threshold'`.\n Otherwise it corresponds to `n_nonzero_coefs`.\n\n init : ndarray of shape (n_samples, n_components), default=None\n Initialization value of the sparse code. Only used if\n `algorithm='lasso_cd'`.\n\n max_iter : int, default=1000\n Maximum number of iterations to perform if `algorithm='lasso_cd'` or\n `'lasso_lars'`.\n\n copy_cov : bool, default=True\n Whether to copy the precomputed covariance matrix; if `False`, it may\n be overwritten.\n\n check_input : bool, default=True\n If `False`, the input arrays `X` and dictionary will not be checked.\n\n verbose : int, default=0\n Controls the verbosity; the higher, the more messages.\n\n positive: bool, default=False\n Whether to enforce a positivity constraint on the sparse code.\n\n .. 
versionadded:: 0.20\n\n Returns\n -------\n code : ndarray of shape (n_components, n_features)\n The sparse codes.\n\n See Also\n --------\n sklearn.linear_model.lars_path\n sklearn.linear_model.orthogonal_mp\n sklearn.linear_model.Lasso\n SparseCoder\n \"\"\"\n if X.ndim == 1:\n X = X[:, np.newaxis]\n (n_samples, n_features) = X.shape\n n_components = dictionary.shape[0]\n if dictionary.shape[1] != X.shape[1]:\n raise ValueError('Dictionary and X have different numbers of features:dictionary.shape: {} X.shape{}'.format(dictionary.shape, X.shape))\n if cov is None and algorithm != 'lasso_cd':\n copy_cov = False\n cov = np.dot(dictionary, X.T)\n _check_positive_coding(algorithm, positive)\n if algorithm == 'lasso_lars':\n alpha = float(regularization) / n_features\n try:\n err_mgt = np.seterr(all='ignore')\n lasso_lars = LassoLars(alpha=alpha, fit_intercept=False, verbose=verbose, normalize=False, precompute=gram, fit_path=False, positive=positive, max_iter=max_iter)\n lasso_lars.fit(dictionary.T, X.T, Xy=cov)\n new_code = lasso_lars.coef_\n finally:\n np.seterr(**err_mgt)\n elif algorithm == 'lasso_cd':\n alpha = float(regularization) / n_features\n clf = Lasso(alpha=alpha, fit_intercept=False, normalize='deprecated', precompute=gram, max_iter=max_iter, warm_start=True, positive=positive)\n if init is not None:\n clf.coef_ = init\n clf.fit(dictionary.T, X.T, check_input=check_input)\n new_code = clf.coef_\n elif algorithm == 'lars':\n try:\n err_mgt = np.seterr(all='ignore')\n lars = Lars(fit_intercept=False, verbose=verbose, normalize=False, precompute=gram, n_nonzero_coefs=int(regularization), fit_path=False)\n lars.fit(dictionary.T, X.T, Xy=cov)\n new_code = lars.coef_\n finally:\n np.seterr(**err_mgt)\n elif algorithm == 'threshold':\n new_code = (np.sign(cov) * np.maximum(np.abs(cov) - regularization, 0)).T\n if positive:\n np.clip(new_code, 0, None, out=new_code)\n elif algorithm == 'omp':\n new_code = orthogonal_mp_gram(Gram=gram, Xy=cov, n_nonzero_coefs=int(regularization), tol=None, norms_squared=row_norms(X, squared=True), copy_Xy=copy_cov).T\n else:\n raise ValueError('Sparse coding method must be \"lasso_lars\" \"lasso_cd\", \"lasso\", \"threshold\" or \"omp\", got %s.' % algorithm)\n if new_code.ndim != 2:\n return new_code.reshape(n_samples, n_components)\n return new_code" }, { @@ -48553,7 +50609,8 @@ "docstring": { "type": "ndarray of shape (n_components, n_features)", "description": "Value of the dictionary at the previous iteration." - } + }, + "refined_type": {} }, { "name": "Y", @@ -48563,7 +50620,8 @@ "docstring": { "type": "ndarray of shape (n_samples, n_features)", "description": "Data matrix." - } + }, + "refined_type": {} }, { "name": "code", @@ -48573,7 +50631,8 @@ "docstring": { "type": "ndarray of shape (n_samples, n_components)", "description": "Sparse coding of the data against which to optimize the dictionary." - } + }, + "refined_type": {} }, { "name": "A", @@ -48583,7 +50642,8 @@ "docstring": { "type": "ndarray of shape (n_components, n_components), default=None", "description": "Together with `B`, sufficient stats of the online model to update the\ndictionary." - } + }, + "refined_type": {} }, { "name": "B", @@ -48593,7 +50653,8 @@ "docstring": { "type": "ndarray of shape (n_features, n_components), default=None", "description": "Together with `A`, sufficient stats of the online model to update the\ndictionary." 
- } + }, + "refined_type": {} }, { "name": "verbose", @@ -48603,7 +50664,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "random_state", @@ -48613,7 +50675,8 @@ "docstring": { "type": "int, RandomState instance or None, default=None", "description": "Used for randomly initializing the dictionary. Pass an int for\nreproducible results across multiple function calls.\nSee :term:`Glossary `." - } + }, + "refined_type": {} }, { "name": "positive", @@ -48623,14 +50686,15 @@ "docstring": { "type": "bool, default=False", "description": "Whether to enforce positivity when finding the dictionary.\n\n.. versionadded:: 0.20" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Update the dense dictionary factor in place.", - "docstring": "Update the dense dictionary factor in place.\n\nParameters\n----------\ndictionary : ndarray of shape (n_components, n_features)\n Value of the dictionary at the previous iteration.\n\nY : ndarray of shape (n_samples, n_features)\n Data matrix.\n\ncode : ndarray of shape (n_samples, n_components)\n Sparse coding of the data against which to optimize the dictionary.\n\nA : ndarray of shape (n_components, n_components), default=None\n Together with `B`, sufficient stats of the online model to update the\n dictionary.\n\nB : ndarray of shape (n_features, n_components), default=None\n Together with `A`, sufficient stats of the online model to update the\n dictionary.\n\nverbose: bool, default=False\n Degree of output the procedure will print.\n\nrandom_state : int, RandomState instance or None, default=None\n Used for randomly initializing the dictionary. Pass an int for\n reproducible results across multiple function calls.\n See :term:`Glossary `.\n\npositive : bool, default=False\n Whether to enforce positivity when finding the dictionary.\n\n .. versionadded:: 0.20", - "source_code": "\ndef _update_dict(dictionary, Y, code, A=None, B=None, verbose=False, random_state=None, positive=False):\n \"\"\"Update the dense dictionary factor in place.\n\n Parameters\n ----------\n dictionary : ndarray of shape (n_components, n_features)\n Value of the dictionary at the previous iteration.\n\n Y : ndarray of shape (n_samples, n_features)\n Data matrix.\n\n code : ndarray of shape (n_samples, n_components)\n Sparse coding of the data against which to optimize the dictionary.\n\n A : ndarray of shape (n_components, n_components), default=None\n Together with `B`, sufficient stats of the online model to update the\n dictionary.\n\n B : ndarray of shape (n_features, n_components), default=None\n Together with `A`, sufficient stats of the online model to update the\n dictionary.\n\n verbose: bool, default=False\n Degree of output the procedure will print.\n\n random_state : int, RandomState instance or None, default=None\n Used for randomly initializing the dictionary. Pass an int for\n reproducible results across multiple function calls.\n See :term:`Glossary `.\n\n positive : bool, default=False\n Whether to enforce positivity when finding the dictionary.\n\n .. 
versionadded:: 0.20\n \"\"\"\n (n_samples, n_components) = code.shape\n random_state = check_random_state(random_state)\n if A is None:\n A = code.T @ code\n if B is None:\n B = Y.T @ code\n n_unused = 0\n for k in range(n_components):\n if A[k, k] > 1e-06:\n dictionary[k] += (B[:, k] - A[k] @ dictionary) / A[k, k]\n else:\n newd = Y[random_state.choice(n_samples)]\n noise_level = 0.01 * (newd.std() or 1)\n noise = random_state.normal(0, noise_level, size=len(newd))\n dictionary[k] = newd + noise\n code[:, k] = 0\n n_unused += 1\n if positive:\n np.clip(dictionary[k], 0, None, out=dictionary[k])\n dictionary[k] /= linalg.norm(dictionary[k])\n if verbose and n_unused > 0:\n print(f'{n_unused} unused atoms resampled.')" + "docstring": "Update the dense dictionary factor in place.\n\n Parameters\n ----------\n dictionary : ndarray of shape (n_components, n_features)\n Value of the dictionary at the previous iteration.\n\n Y : ndarray of shape (n_samples, n_features)\n Data matrix.\n\n code : ndarray of shape (n_samples, n_components)\n Sparse coding of the data against which to optimize the dictionary.\n\n A : ndarray of shape (n_components, n_components), default=None\n Together with `B`, sufficient stats of the online model to update the\n dictionary.\n\n B : ndarray of shape (n_features, n_components), default=None\n Together with `A`, sufficient stats of the online model to update the\n dictionary.\n\n verbose: bool, default=False\n Degree of output the procedure will print.\n\n random_state : int, RandomState instance or None, default=None\n Used for randomly initializing the dictionary. Pass an int for\n reproducible results across multiple function calls.\n See :term:`Glossary `.\n\n positive : bool, default=False\n Whether to enforce positivity when finding the dictionary.\n\n .. versionadded:: 0.20\n ", + "source_code": "\ndef _update_dict(dictionary, Y, code, A=None, B=None, verbose=False, random_state=None, positive=False):\n \"\"\"Update the dense dictionary factor in place.\n\n Parameters\n ----------\n dictionary : ndarray of shape (n_components, n_features)\n Value of the dictionary at the previous iteration.\n\n Y : ndarray of shape (n_samples, n_features)\n Data matrix.\n\n code : ndarray of shape (n_samples, n_components)\n Sparse coding of the data against which to optimize the dictionary.\n\n A : ndarray of shape (n_components, n_components), default=None\n Together with `B`, sufficient stats of the online model to update the\n dictionary.\n\n B : ndarray of shape (n_features, n_components), default=None\n Together with `A`, sufficient stats of the online model to update the\n dictionary.\n\n verbose: bool, default=False\n Degree of output the procedure will print.\n\n random_state : int, RandomState instance or None, default=None\n Used for randomly initializing the dictionary. Pass an int for\n reproducible results across multiple function calls.\n See :term:`Glossary `.\n\n positive : bool, default=False\n Whether to enforce positivity when finding the dictionary.\n\n .. 
versionadded:: 0.20\n \"\"\"\n (n_samples, n_components) = code.shape\n random_state = check_random_state(random_state)\n if A is None:\n A = code.T @ code\n if B is None:\n B = Y.T @ code\n n_unused = 0\n for k in range(n_components):\n if A[k, k] > 1e-06:\n dictionary[k] += (B[:, k] - A[k] @ dictionary) / A[k, k]\n else:\n newd = Y[random_state.choice(n_samples)]\n noise_level = 0.01 * (newd.std() or 1)\n noise = random_state.normal(0, noise_level, size=len(newd))\n dictionary[k] = newd + noise\n code[:, k] = 0\n n_unused += 1\n if positive:\n np.clip(dictionary[k], 0, None, out=dictionary[k])\n dictionary[k] /= max(linalg.norm(dictionary[k]), 1)\n if verbose and n_unused > 0:\n print(f'{n_unused} unused atoms resampled.')" }, { "name": "dict_learning", @@ -48647,7 +50711,8 @@ "docstring": { "type": "ndarray of shape (n_samples, n_features)", "description": "Data matrix." - } + }, + "refined_type": {} }, { "name": "n_components", @@ -48657,7 +50722,8 @@ "docstring": { "type": "int", "description": "Number of dictionary atoms to extract." - } + }, + "refined_type": {} }, { "name": "alpha", @@ -48667,7 +50733,8 @@ "docstring": { "type": "int", "description": "Sparsity controlling parameter." - } + }, + "refined_type": {} }, { "name": "max_iter", @@ -48677,7 +50744,8 @@ "docstring": { "type": "int, default=100", "description": "Maximum number of iterations to perform." - } + }, + "refined_type": {} }, { "name": "tol", @@ -48687,7 +50755,8 @@ "docstring": { "type": "float, default=1e-8", "description": "Tolerance for the stopping condition." - } + }, + "refined_type": {} }, { "name": "method", @@ -48697,6 +50766,10 @@ "docstring": { "type": "{'lars', 'cd'}, default='lars'", "description": "The method used:\n\n* `'lars'`: uses the least angle regression method to solve the lasso\n problem (`linear_model.lars_path`);\n* `'cd'`: uses the coordinate descent method to compute the\n Lasso solution (`linear_model.Lasso`). Lars will be faster if\n the estimated components are sparse." + }, + "refined_type": { + "kind": "EnumType", + "values": ["cd", "lars"] } }, { @@ -48707,7 +50780,8 @@ "docstring": { "type": "int, default=None", "description": "Number of parallel jobs to run.\n``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n``-1`` means using all processors. See :term:`Glossary `\nfor more details." - } + }, + "refined_type": {} }, { "name": "dict_init", @@ -48717,7 +50791,8 @@ "docstring": { "type": "ndarray of shape (n_components, n_features), default=None", "description": "Initial value for the dictionary for warm restart scenarios. Only used\nif `code_init` and `dict_init` are not None." - } + }, + "refined_type": {} }, { "name": "code_init", @@ -48727,7 +50802,8 @@ "docstring": { "type": "ndarray of shape (n_samples, n_components), default=None", "description": "Initial value for the sparse code for warm restart scenarios. Only used\nif `code_init` and `dict_init` are not None." - } + }, + "refined_type": {} }, { "name": "callback", @@ -48737,7 +50813,8 @@ "docstring": { "type": "callable, default=None", "description": "Callable that gets invoked every five iterations" - } + }, + "refined_type": {} }, { "name": "verbose", @@ -48747,7 +50824,8 @@ "docstring": { "type": "bool, default=False", "description": "To control the verbosity of the procedure." - } + }, + "refined_type": {} }, { "name": "random_state", @@ -48757,7 +50835,8 @@ "docstring": { "type": "int, RandomState instance or None, default=None", "description": "Used for randomly initializing the dictionary. 
Pass an int for\nreproducible results across multiple function calls.\nSee :term:`Glossary `." - } + }, + "refined_type": {} }, { "name": "return_n_iter", @@ -48767,7 +50846,8 @@ "docstring": { "type": "bool, default=False", "description": "Whether or not to return the number of iterations." - } + }, + "refined_type": {} }, { "name": "positive_dict", @@ -48777,7 +50857,8 @@ "docstring": { "type": "bool, default=False", "description": "Whether to enforce positivity when finding the dictionary.\n\n.. versionadded:: 0.20" - } + }, + "refined_type": {} }, { "name": "positive_code", @@ -48787,7 +50868,8 @@ "docstring": { "type": "bool, default=False", "description": "Whether to enforce positivity when finding the code.\n\n.. versionadded:: 0.20" - } + }, + "refined_type": {} }, { "name": "method_max_iter", @@ -48797,13 +50879,14 @@ "docstring": { "type": "int, default=1000", "description": "Maximum number of iterations to perform.\n\n.. versionadded:: 0.22" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Solves a dictionary learning matrix factorization problem.\n\nFinds the best dictionary and the corresponding sparse code for approximating the data matrix X by solving:: (U^*, V^*) = argmin 0.5 || X - U V ||_Fro^2 + alpha * || U ||_1,1 (U,V) with || V_k ||_2 = 1 for all 0 <= k < n_components where V is the dictionary and U is the sparse code. ||.||_Fro stands for the Frobenius norm and ||.||_1,1 stands for the entry-wise matrix norm which is the sum of the absolute values of all the entries in the matrix. Read more in the :ref:`User Guide `.", - "docstring": "Solves a dictionary learning matrix factorization problem.\n\nFinds the best dictionary and the corresponding sparse code for\napproximating the data matrix X by solving::\n\n (U^*, V^*) = argmin 0.5 || X - U V ||_Fro^2 + alpha * || U ||_1,1\n (U,V)\n with || V_k ||_2 = 1 for all 0 <= k < n_components\n\nwhere V is the dictionary and U is the sparse code. ||.||_Fro stands for\nthe Frobenius norm and ||.||_1,1 stands for the entry-wise matrix norm\nwhich is the sum of the absolute values of all the entries in the matrix.\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\nX : ndarray of shape (n_samples, n_features)\n Data matrix.\n\nn_components : int\n Number of dictionary atoms to extract.\n\nalpha : int\n Sparsity controlling parameter.\n\nmax_iter : int, default=100\n Maximum number of iterations to perform.\n\ntol : float, default=1e-8\n Tolerance for the stopping condition.\n\nmethod : {'lars', 'cd'}, default='lars'\n The method used:\n\n * `'lars'`: uses the least angle regression method to solve the lasso\n problem (`linear_model.lars_path`);\n * `'cd'`: uses the coordinate descent method to compute the\n Lasso solution (`linear_model.Lasso`). Lars will be faster if\n the estimated components are sparse.\n\nn_jobs : int, default=None\n Number of parallel jobs to run.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\ndict_init : ndarray of shape (n_components, n_features), default=None\n Initial value for the dictionary for warm restart scenarios. Only used\n if `code_init` and `dict_init` are not None.\n\ncode_init : ndarray of shape (n_samples, n_components), default=None\n Initial value for the sparse code for warm restart scenarios. 
Only used\n if `code_init` and `dict_init` are not None.\n\ncallback : callable, default=None\n Callable that gets invoked every five iterations\n\nverbose : bool, default=False\n To control the verbosity of the procedure.\n\nrandom_state : int, RandomState instance or None, default=None\n Used for randomly initializing the dictionary. Pass an int for\n reproducible results across multiple function calls.\n See :term:`Glossary `.\n\nreturn_n_iter : bool, default=False\n Whether or not to return the number of iterations.\n\npositive_dict : bool, default=False\n Whether to enforce positivity when finding the dictionary.\n\n .. versionadded:: 0.20\n\npositive_code : bool, default=False\n Whether to enforce positivity when finding the code.\n\n .. versionadded:: 0.20\n\nmethod_max_iter : int, default=1000\n Maximum number of iterations to perform.\n\n .. versionadded:: 0.22\n\nReturns\n-------\ncode : ndarray of shape (n_samples, n_components)\n The sparse code factor in the matrix factorization.\n\ndictionary : ndarray of shape (n_components, n_features),\n The dictionary factor in the matrix factorization.\n\nerrors : array\n Vector of errors at each iteration.\n\nn_iter : int\n Number of iterations run. Returned only if `return_n_iter` is\n set to True.\n\nSee Also\n--------\ndict_learning_online\nDictionaryLearning\nMiniBatchDictionaryLearning\nSparsePCA\nMiniBatchSparsePCA", + "description": "Solves a dictionary learning matrix factorization problem.\n\nFinds the best dictionary and the corresponding sparse code for\napproximating the data matrix X by solving::\n\n (U^*, V^*) = argmin 0.5 || X - U V ||_Fro^2 + alpha * || U ||_1,1\n (U,V)\n with || V_k ||_2 = 1 for all 0 <= k < n_components\n\nwhere V is the dictionary and U is the sparse code. ||.||_Fro stands for\nthe Frobenius norm and ||.||_1,1 stands for the entry-wise matrix norm\nwhich is the sum of the absolute values of all the entries in the matrix.\n\nRead more in the :ref:`User Guide `.", + "docstring": "Solves a dictionary learning matrix factorization problem.\n\n Finds the best dictionary and the corresponding sparse code for\n approximating the data matrix X by solving::\n\n (U^*, V^*) = argmin 0.5 || X - U V ||_Fro^2 + alpha * || U ||_1,1\n (U,V)\n with || V_k ||_2 = 1 for all 0 <= k < n_components\n\n where V is the dictionary and U is the sparse code. ||.||_Fro stands for\n the Frobenius norm and ||.||_1,1 stands for the entry-wise matrix norm\n which is the sum of the absolute values of all the entries in the matrix.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n X : ndarray of shape (n_samples, n_features)\n Data matrix.\n\n n_components : int\n Number of dictionary atoms to extract.\n\n alpha : int\n Sparsity controlling parameter.\n\n max_iter : int, default=100\n Maximum number of iterations to perform.\n\n tol : float, default=1e-8\n Tolerance for the stopping condition.\n\n method : {'lars', 'cd'}, default='lars'\n The method used:\n\n * `'lars'`: uses the least angle regression method to solve the lasso\n problem (`linear_model.lars_path`);\n * `'cd'`: uses the coordinate descent method to compute the\n Lasso solution (`linear_model.Lasso`). Lars will be faster if\n the estimated components are sparse.\n\n n_jobs : int, default=None\n Number of parallel jobs to run.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. 
See :term:`Glossary `\n for more details.\n\n dict_init : ndarray of shape (n_components, n_features), default=None\n Initial value for the dictionary for warm restart scenarios. Only used\n if `code_init` and `dict_init` are not None.\n\n code_init : ndarray of shape (n_samples, n_components), default=None\n Initial value for the sparse code for warm restart scenarios. Only used\n if `code_init` and `dict_init` are not None.\n\n callback : callable, default=None\n Callable that gets invoked every five iterations\n\n verbose : bool, default=False\n To control the verbosity of the procedure.\n\n random_state : int, RandomState instance or None, default=None\n Used for randomly initializing the dictionary. Pass an int for\n reproducible results across multiple function calls.\n See :term:`Glossary `.\n\n return_n_iter : bool, default=False\n Whether or not to return the number of iterations.\n\n positive_dict : bool, default=False\n Whether to enforce positivity when finding the dictionary.\n\n .. versionadded:: 0.20\n\n positive_code : bool, default=False\n Whether to enforce positivity when finding the code.\n\n .. versionadded:: 0.20\n\n method_max_iter : int, default=1000\n Maximum number of iterations to perform.\n\n .. versionadded:: 0.22\n\n Returns\n -------\n code : ndarray of shape (n_samples, n_components)\n The sparse code factor in the matrix factorization.\n\n dictionary : ndarray of shape (n_components, n_features),\n The dictionary factor in the matrix factorization.\n\n errors : array\n Vector of errors at each iteration.\n\n n_iter : int\n Number of iterations run. Returned only if `return_n_iter` is\n set to True.\n\n See Also\n --------\n dict_learning_online\n DictionaryLearning\n MiniBatchDictionaryLearning\n SparsePCA\n MiniBatchSparsePCA\n ", "source_code": "\ndef dict_learning(X, n_components, *, alpha, max_iter=100, tol=1e-08, method='lars', n_jobs=None, dict_init=None, code_init=None, callback=None, verbose=False, random_state=None, return_n_iter=False, positive_dict=False, positive_code=False, method_max_iter=1000):\n \"\"\"Solves a dictionary learning matrix factorization problem.\n\n Finds the best dictionary and the corresponding sparse code for\n approximating the data matrix X by solving::\n\n (U^*, V^*) = argmin 0.5 || X - U V ||_Fro^2 + alpha * || U ||_1,1\n (U,V)\n with || V_k ||_2 = 1 for all 0 <= k < n_components\n\n where V is the dictionary and U is the sparse code. ||.||_Fro stands for\n the Frobenius norm and ||.||_1,1 stands for the entry-wise matrix norm\n which is the sum of the absolute values of all the entries in the matrix.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n X : ndarray of shape (n_samples, n_features)\n Data matrix.\n\n n_components : int\n Number of dictionary atoms to extract.\n\n alpha : int\n Sparsity controlling parameter.\n\n max_iter : int, default=100\n Maximum number of iterations to perform.\n\n tol : float, default=1e-8\n Tolerance for the stopping condition.\n\n method : {'lars', 'cd'}, default='lars'\n The method used:\n\n * `'lars'`: uses the least angle regression method to solve the lasso\n problem (`linear_model.lars_path`);\n * `'cd'`: uses the coordinate descent method to compute the\n Lasso solution (`linear_model.Lasso`). Lars will be faster if\n the estimated components are sparse.\n\n n_jobs : int, default=None\n Number of parallel jobs to run.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. 
See :term:`Glossary `\n for more details.\n\n dict_init : ndarray of shape (n_components, n_features), default=None\n Initial value for the dictionary for warm restart scenarios. Only used\n if `code_init` and `dict_init` are not None.\n\n code_init : ndarray of shape (n_samples, n_components), default=None\n Initial value for the sparse code for warm restart scenarios. Only used\n if `code_init` and `dict_init` are not None.\n\n callback : callable, default=None\n Callable that gets invoked every five iterations\n\n verbose : bool, default=False\n To control the verbosity of the procedure.\n\n random_state : int, RandomState instance or None, default=None\n Used for randomly initializing the dictionary. Pass an int for\n reproducible results across multiple function calls.\n See :term:`Glossary `.\n\n return_n_iter : bool, default=False\n Whether or not to return the number of iterations.\n\n positive_dict : bool, default=False\n Whether to enforce positivity when finding the dictionary.\n\n .. versionadded:: 0.20\n\n positive_code : bool, default=False\n Whether to enforce positivity when finding the code.\n\n .. versionadded:: 0.20\n\n method_max_iter : int, default=1000\n Maximum number of iterations to perform.\n\n .. versionadded:: 0.22\n\n Returns\n -------\n code : ndarray of shape (n_samples, n_components)\n The sparse code factor in the matrix factorization.\n\n dictionary : ndarray of shape (n_components, n_features),\n The dictionary factor in the matrix factorization.\n\n errors : array\n Vector of errors at each iteration.\n\n n_iter : int\n Number of iterations run. Returned only if `return_n_iter` is\n set to True.\n\n See Also\n --------\n dict_learning_online\n DictionaryLearning\n MiniBatchDictionaryLearning\n SparsePCA\n MiniBatchSparsePCA\n \"\"\"\n if method not in ('lars', 'cd'):\n raise ValueError('Coding method %r not supported as a fit algorithm.' 
% method)\n _check_positive_coding(method, positive_code)\n method = 'lasso_' + method\n t0 = time.time()\n alpha = float(alpha)\n random_state = check_random_state(random_state)\n if code_init is not None and dict_init is not None:\n code = np.array(code_init, order='F')\n dictionary = dict_init\n else:\n (code, S, dictionary) = linalg.svd(X, full_matrices=False)\n (code, dictionary) = svd_flip(code, dictionary)\n dictionary = S[:, np.newaxis] * dictionary\n r = len(dictionary)\n if n_components <= r:\n code = code[:, :n_components]\n dictionary = dictionary[:n_components, :]\n else:\n code = np.c_[code, np.zeros((len(code), n_components - r))]\n dictionary = np.r_[dictionary, np.zeros((n_components - r, dictionary.shape[1]))]\n dictionary = np.asfortranarray(dictionary)\n errors = []\n current_cost = np.nan\n if verbose == 1:\n print('[dict_learning]', end=' ')\n ii = -1\n for ii in range(max_iter):\n dt = time.time() - t0\n if verbose == 1:\n sys.stdout.write('.')\n sys.stdout.flush()\n elif verbose:\n print('Iteration % 3i (elapsed time: % 3is, % 4.1fmn, current cost % 7.3f)' % (ii, dt, dt / 60, current_cost))\n code = sparse_encode(X, dictionary, algorithm=method, alpha=alpha, init=code, n_jobs=n_jobs, positive=positive_code, max_iter=method_max_iter, verbose=verbose)\n _update_dict(dictionary, X, code, verbose=verbose, random_state=random_state, positive=positive_dict)\n current_cost = 0.5 * np.sum((X - code @ dictionary)**2) + alpha * np.sum(np.abs(code))\n errors.append(current_cost)\n if ii > 0:\n dE = errors[-2] - errors[-1]\n if dE < tol * errors[-1]:\n if verbose == 1:\n print('')\n elif verbose:\n print('--- Convergence reached after %d iterations' % ii)\n break\n if ii % 5 == 0 and callback is not None:\n callback(locals())\n if return_n_iter:\n return code, dictionary, errors, ii + 1\n else:\n return code, dictionary, errors" }, { @@ -48821,7 +50904,8 @@ "docstring": { "type": "ndarray of shape (n_samples, n_features)", "description": "Data matrix." - } + }, + "refined_type": {} }, { "name": "n_components", @@ -48829,9 +50913,10 @@ "is_public": true, "assigned_by": "POSITION_OR_NAME", "docstring": { - "type": "int, default=2", - "description": "Number of dictionary atoms to extract." - } + "type": "int or None, default=2", + "description": "Number of dictionary atoms to extract. If None, then ``n_components``\nis set to ``n_features``." + }, + "refined_type": {} }, { "name": "alpha", @@ -48841,7 +50926,8 @@ "docstring": { "type": "float, default=1", "description": "Sparsity controlling parameter." - } + }, + "refined_type": {} }, { "name": "n_iter", @@ -48851,7 +50937,8 @@ "docstring": { "type": "int, default=100", "description": "Number of mini-batch iterations to perform." - } + }, + "refined_type": {} }, { "name": "return_code", @@ -48861,7 +50948,8 @@ "docstring": { "type": "bool, default=True", "description": "Whether to also return the code U or just the dictionary `V`." - } + }, + "refined_type": {} }, { "name": "dict_init", @@ -48871,7 +50959,8 @@ "docstring": { "type": "ndarray of shape (n_components, n_features), default=None", "description": "Initial value for the dictionary for warm restart scenarios." - } + }, + "refined_type": {} }, { "name": "callback", @@ -48881,7 +50970,8 @@ "docstring": { "type": "callable, default=None", "description": "callable that gets invoked every five iterations." 
- } + }, + "refined_type": {} }, { "name": "batch_size", @@ -48891,7 +50981,8 @@ "docstring": { "type": "int, default=3", "description": "The number of samples to take in each batch." - } + }, + "refined_type": {} }, { "name": "verbose", @@ -48901,7 +50992,8 @@ "docstring": { "type": "bool, default=False", "description": "To control the verbosity of the procedure." - } + }, + "refined_type": {} }, { "name": "shuffle", @@ -48911,7 +51003,8 @@ "docstring": { "type": "bool, default=True", "description": "Whether to shuffle the data before splitting it in batches." - } + }, + "refined_type": {} }, { "name": "n_jobs", @@ -48921,7 +51014,8 @@ "docstring": { "type": "int, default=None", "description": "Number of parallel jobs to run.\n``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n``-1`` means using all processors. See :term:`Glossary `\nfor more details." - } + }, + "refined_type": {} }, { "name": "method", @@ -48931,6 +51025,10 @@ "docstring": { "type": "{'lars', 'cd'}, default='lars'", "description": "* `'lars'`: uses the least angle regression method to solve the lasso\n problem (`linear_model.lars_path`);\n* `'cd'`: uses the coordinate descent method to compute the\n Lasso solution (`linear_model.Lasso`). Lars will be faster if\n the estimated components are sparse." + }, + "refined_type": { + "kind": "EnumType", + "values": ["cd", "lars"] } }, { @@ -48941,7 +51039,8 @@ "docstring": { "type": "int, default=0", "description": "Number of previous iterations completed on the dictionary used for\ninitialization." - } + }, + "refined_type": {} }, { "name": "random_state", @@ -48951,7 +51050,8 @@ "docstring": { "type": "int, RandomState instance or None, default=None", "description": "Used for initializing the dictionary when ``dict_init`` is not\nspecified, randomly shuffling the data when ``shuffle`` is set to\n``True``, and updating the dictionary. Pass an int for reproducible\nresults across multiple function calls.\nSee :term:`Glossary `." - } + }, + "refined_type": {} }, { "name": "return_inner_stats", @@ -48961,7 +51061,8 @@ "docstring": { "type": "bool, default=False", "description": "Return the inner statistics A (dictionary covariance) and B\n(data approximation). Useful to restart the algorithm in an\nonline setting. If `return_inner_stats` is `True`, `return_code` is\nignored." - } + }, + "refined_type": {} }, { "name": "inner_stats", @@ -48971,7 +51072,8 @@ "docstring": { "type": "tuple of (A, B) ndarrays, default=None", "description": "Inner sufficient statistics that are kept by the algorithm.\nPassing them at initialization is useful in online settings, to\navoid losing the history of the evolution.\n`A` `(n_components, n_components)` is the dictionary covariance matrix.\n`B` `(n_features, n_components)` is the data approximation matrix." - } + }, + "refined_type": {} }, { "name": "return_n_iter", @@ -48981,7 +51083,8 @@ "docstring": { "type": "bool, default=False", "description": "Whether or not to return the number of iterations." - } + }, + "refined_type": {} }, { "name": "positive_dict", @@ -48991,7 +51094,8 @@ "docstring": { "type": "bool, default=False", "description": "Whether to enforce positivity when finding the dictionary.\n\n.. versionadded:: 0.20" - } + }, + "refined_type": {} }, { "name": "positive_code", @@ -49001,7 +51105,8 @@ "docstring": { "type": "bool, default=False", "description": "Whether to enforce positivity when finding the code.\n\n.. 
versionadded:: 0.20" - } + }, + "refined_type": {} }, { "name": "method_max_iter", @@ -49011,14 +51116,15 @@ "docstring": { "type": "int, default=1000", "description": "Maximum number of iterations to perform when solving the lasso problem.\n\n.. versionadded:: 0.22" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Solves a dictionary learning matrix factorization problem online.\n\nFinds the best dictionary and the corresponding sparse code for approximating the data matrix X by solving:: (U^*, V^*) = argmin 0.5 || X - U V ||_Fro^2 + alpha * || U ||_1,1 (U,V) with || V_k ||_2 = 1 for all 0 <= k < n_components where V is the dictionary and U is the sparse code. ||.||_Fro stands for the Frobenius norm and ||.||_1,1 stands for the entry-wise matrix norm which is the sum of the absolute values of all the entries in the matrix. This is accomplished by repeatedly iterating over mini-batches by slicing the input data. Read more in the :ref:`User Guide `.", - "docstring": "Solves a dictionary learning matrix factorization problem online.\n\nFinds the best dictionary and the corresponding sparse code for\napproximating the data matrix X by solving::\n\n (U^*, V^*) = argmin 0.5 || X - U V ||_Fro^2 + alpha * || U ||_1,1\n (U,V)\n with || V_k ||_2 = 1 for all 0 <= k < n_components\n\nwhere V is the dictionary and U is the sparse code. ||.||_Fro stands for\nthe Frobenius norm and ||.||_1,1 stands for the entry-wise matrix norm\nwhich is the sum of the absolute values of all the entries in the matrix.\nThis is accomplished by repeatedly iterating over mini-batches by slicing\nthe input data.\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\nX : ndarray of shape (n_samples, n_features)\n Data matrix.\n\nn_components : int, default=2\n Number of dictionary atoms to extract.\n\nalpha : float, default=1\n Sparsity controlling parameter.\n\nn_iter : int, default=100\n Number of mini-batch iterations to perform.\n\nreturn_code : bool, default=True\n Whether to also return the code U or just the dictionary `V`.\n\ndict_init : ndarray of shape (n_components, n_features), default=None\n Initial value for the dictionary for warm restart scenarios.\n\ncallback : callable, default=None\n callable that gets invoked every five iterations.\n\nbatch_size : int, default=3\n The number of samples to take in each batch.\n\nverbose : bool, default=False\n To control the verbosity of the procedure.\n\nshuffle : bool, default=True\n Whether to shuffle the data before splitting it in batches.\n\nn_jobs : int, default=None\n Number of parallel jobs to run.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\nmethod : {'lars', 'cd'}, default='lars'\n * `'lars'`: uses the least angle regression method to solve the lasso\n problem (`linear_model.lars_path`);\n * `'cd'`: uses the coordinate descent method to compute the\n Lasso solution (`linear_model.Lasso`). Lars will be faster if\n the estimated components are sparse.\n\niter_offset : int, default=0\n Number of previous iterations completed on the dictionary used for\n initialization.\n\nrandom_state : int, RandomState instance or None, default=None\n Used for initializing the dictionary when ``dict_init`` is not\n specified, randomly shuffling the data when ``shuffle`` is set to\n ``True``, and updating the dictionary. 
Pass an int for reproducible\n results across multiple function calls.\n See :term:`Glossary `.\n\nreturn_inner_stats : bool, default=False\n Return the inner statistics A (dictionary covariance) and B\n (data approximation). Useful to restart the algorithm in an\n online setting. If `return_inner_stats` is `True`, `return_code` is\n ignored.\n\ninner_stats : tuple of (A, B) ndarrays, default=None\n Inner sufficient statistics that are kept by the algorithm.\n Passing them at initialization is useful in online settings, to\n avoid losing the history of the evolution.\n `A` `(n_components, n_components)` is the dictionary covariance matrix.\n `B` `(n_features, n_components)` is the data approximation matrix.\n\nreturn_n_iter : bool, default=False\n Whether or not to return the number of iterations.\n\npositive_dict : bool, default=False\n Whether to enforce positivity when finding the dictionary.\n\n .. versionadded:: 0.20\n\npositive_code : bool, default=False\n Whether to enforce positivity when finding the code.\n\n .. versionadded:: 0.20\n\nmethod_max_iter : int, default=1000\n Maximum number of iterations to perform when solving the lasso problem.\n\n .. versionadded:: 0.22\n\nReturns\n-------\ncode : ndarray of shape (n_samples, n_components),\n The sparse code (only returned if `return_code=True`).\n\ndictionary : ndarray of shape (n_components, n_features),\n The solutions to the dictionary learning problem.\n\nn_iter : int\n Number of iterations run. Returned only if `return_n_iter` is\n set to `True`.\n\nSee Also\n--------\ndict_learning\nDictionaryLearning\nMiniBatchDictionaryLearning\nSparsePCA\nMiniBatchSparsePCA", - "source_code": "\ndef dict_learning_online(X, n_components=2, *, alpha=1, n_iter=100, return_code=True, dict_init=None, callback=None, batch_size=3, verbose=False, shuffle=True, n_jobs=None, method='lars', iter_offset=0, random_state=None, return_inner_stats=False, inner_stats=None, return_n_iter=False, positive_dict=False, positive_code=False, method_max_iter=1000):\n \"\"\"Solves a dictionary learning matrix factorization problem online.\n\n Finds the best dictionary and the corresponding sparse code for\n approximating the data matrix X by solving::\n\n (U^*, V^*) = argmin 0.5 || X - U V ||_Fro^2 + alpha * || U ||_1,1\n (U,V)\n with || V_k ||_2 = 1 for all 0 <= k < n_components\n\n where V is the dictionary and U is the sparse code. 
||.||_Fro stands for\n the Frobenius norm and ||.||_1,1 stands for the entry-wise matrix norm\n which is the sum of the absolute values of all the entries in the matrix.\n This is accomplished by repeatedly iterating over mini-batches by slicing\n the input data.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n X : ndarray of shape (n_samples, n_features)\n Data matrix.\n\n n_components : int, default=2\n Number of dictionary atoms to extract.\n\n alpha : float, default=1\n Sparsity controlling parameter.\n\n n_iter : int, default=100\n Number of mini-batch iterations to perform.\n\n return_code : bool, default=True\n Whether to also return the code U or just the dictionary `V`.\n\n dict_init : ndarray of shape (n_components, n_features), default=None\n Initial value for the dictionary for warm restart scenarios.\n\n callback : callable, default=None\n callable that gets invoked every five iterations.\n\n batch_size : int, default=3\n The number of samples to take in each batch.\n\n verbose : bool, default=False\n To control the verbosity of the procedure.\n\n shuffle : bool, default=True\n Whether to shuffle the data before splitting it in batches.\n\n n_jobs : int, default=None\n Number of parallel jobs to run.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n method : {'lars', 'cd'}, default='lars'\n * `'lars'`: uses the least angle regression method to solve the lasso\n problem (`linear_model.lars_path`);\n * `'cd'`: uses the coordinate descent method to compute the\n Lasso solution (`linear_model.Lasso`). Lars will be faster if\n the estimated components are sparse.\n\n iter_offset : int, default=0\n Number of previous iterations completed on the dictionary used for\n initialization.\n\n random_state : int, RandomState instance or None, default=None\n Used for initializing the dictionary when ``dict_init`` is not\n specified, randomly shuffling the data when ``shuffle`` is set to\n ``True``, and updating the dictionary. Pass an int for reproducible\n results across multiple function calls.\n See :term:`Glossary `.\n\n return_inner_stats : bool, default=False\n Return the inner statistics A (dictionary covariance) and B\n (data approximation). Useful to restart the algorithm in an\n online setting. If `return_inner_stats` is `True`, `return_code` is\n ignored.\n\n inner_stats : tuple of (A, B) ndarrays, default=None\n Inner sufficient statistics that are kept by the algorithm.\n Passing them at initialization is useful in online settings, to\n avoid losing the history of the evolution.\n `A` `(n_components, n_components)` is the dictionary covariance matrix.\n `B` `(n_features, n_components)` is the data approximation matrix.\n\n return_n_iter : bool, default=False\n Whether or not to return the number of iterations.\n\n positive_dict : bool, default=False\n Whether to enforce positivity when finding the dictionary.\n\n .. versionadded:: 0.20\n\n positive_code : bool, default=False\n Whether to enforce positivity when finding the code.\n\n .. versionadded:: 0.20\n\n method_max_iter : int, default=1000\n Maximum number of iterations to perform when solving the lasso problem.\n\n .. 
versionadded:: 0.22\n\n Returns\n -------\n code : ndarray of shape (n_samples, n_components),\n The sparse code (only returned if `return_code=True`).\n\n dictionary : ndarray of shape (n_components, n_features),\n The solutions to the dictionary learning problem.\n\n n_iter : int\n Number of iterations run. Returned only if `return_n_iter` is\n set to `True`.\n\n See Also\n --------\n dict_learning\n DictionaryLearning\n MiniBatchDictionaryLearning\n SparsePCA\n MiniBatchSparsePCA\n \"\"\"\n if n_components is None:\n n_components = X.shape[1]\n if method not in ('lars', 'cd'):\n raise ValueError('Coding method not supported as a fit algorithm.')\n _check_positive_coding(method, positive_code)\n method = 'lasso_' + method\n t0 = time.time()\n (n_samples, n_features) = X.shape\n alpha = float(alpha)\n random_state = check_random_state(random_state)\n if dict_init is not None:\n dictionary = dict_init\n else:\n (_, S, dictionary) = randomized_svd(X, n_components, random_state=random_state)\n dictionary = S[:, np.newaxis] * dictionary\n r = len(dictionary)\n if n_components <= r:\n dictionary = dictionary[:n_components, :]\n else:\n dictionary = np.r_[dictionary, np.zeros((n_components - r, dictionary.shape[1]))]\n if verbose == 1:\n print('[dict_learning]', end=' ')\n if shuffle:\n X_train = X.copy()\n random_state.shuffle(X_train)\n else:\n X_train = X\n dictionary = check_array(dictionary, order='F', dtype=np.float64, copy=False)\n dictionary = np.require(dictionary, requirements='W')\n X_train = check_array(X_train, order='C', dtype=np.float64, copy=False)\n batches = gen_batches(n_samples, batch_size)\n batches = itertools.cycle(batches)\n if inner_stats is None:\n A = np.zeros((n_components, n_components))\n B = np.zeros((n_features, n_components))\n else:\n A = inner_stats[0].copy()\n B = inner_stats[1].copy()\n ii = iter_offset - 1\n for (ii, batch) in zip(range(iter_offset, iter_offset + n_iter), batches):\n this_X = X_train[batch]\n dt = time.time() - t0\n if verbose == 1:\n sys.stdout.write('.')\n sys.stdout.flush()\n elif verbose:\n if verbose > 10 or ii % ceil(100.0 / verbose) == 0:\n print('Iteration % 3i (elapsed time: % 3is, % 4.1fmn)' % (ii, dt, dt / 60))\n this_code = sparse_encode(this_X, dictionary, algorithm=method, alpha=alpha, n_jobs=n_jobs, check_input=False, positive=positive_code, max_iter=method_max_iter, verbose=verbose)\n if ii < batch_size - 1:\n theta = float((ii + 1) * batch_size)\n else:\n theta = float(batch_size**2 + ii + 1 - batch_size)\n beta = (theta + 1 - batch_size) / (theta + 1)\n A *= beta\n A += np.dot(this_code.T, this_code)\n B *= beta\n B += np.dot(this_X.T, this_code)\n _update_dict(dictionary, this_X, this_code, A, B, verbose=verbose, random_state=random_state, positive=positive_dict)\n if callback is not None:\n callback(locals())\n if return_inner_stats:\n if return_n_iter:\n return dictionary, (A, B), ii - iter_offset + 1\n else:\n return dictionary, (A, B)\n if return_code:\n if verbose > 1:\n print('Learning code...', end=' ')\n elif verbose == 1:\n print('|', end=' ')\n code = sparse_encode(X, dictionary, algorithm=method, alpha=alpha, n_jobs=n_jobs, check_input=False, positive=positive_code, max_iter=method_max_iter, verbose=verbose)\n if verbose > 1:\n dt = time.time() - t0\n print('done (total time: % 3is, % 4.1fmn)' % (dt, dt / 60))\n if return_n_iter:\n return code, dictionary, ii - iter_offset + 1\n else:\n return code, dictionary\n if return_n_iter:\n return dictionary, ii - iter_offset + 1\n else:\n return dictionary" + 
"description": "Solves a dictionary learning matrix factorization problem online.\n\nFinds the best dictionary and the corresponding sparse code for\napproximating the data matrix X by solving::\n\n (U^*, V^*) = argmin 0.5 || X - U V ||_Fro^2 + alpha * || U ||_1,1\n (U,V)\n with || V_k ||_2 = 1 for all 0 <= k < n_components\n\nwhere V is the dictionary and U is the sparse code. ||.||_Fro stands for\nthe Frobenius norm and ||.||_1,1 stands for the entry-wise matrix norm\nwhich is the sum of the absolute values of all the entries in the matrix.\nThis is accomplished by repeatedly iterating over mini-batches by slicing\nthe input data.\n\nRead more in the :ref:`User Guide `.", + "docstring": "Solves a dictionary learning matrix factorization problem online.\n\n Finds the best dictionary and the corresponding sparse code for\n approximating the data matrix X by solving::\n\n (U^*, V^*) = argmin 0.5 || X - U V ||_Fro^2 + alpha * || U ||_1,1\n (U,V)\n with || V_k ||_2 = 1 for all 0 <= k < n_components\n\n where V is the dictionary and U is the sparse code. ||.||_Fro stands for\n the Frobenius norm and ||.||_1,1 stands for the entry-wise matrix norm\n which is the sum of the absolute values of all the entries in the matrix.\n This is accomplished by repeatedly iterating over mini-batches by slicing\n the input data.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n X : ndarray of shape (n_samples, n_features)\n Data matrix.\n\n n_components : int or None, default=2\n Number of dictionary atoms to extract. If None, then ``n_components``\n is set to ``n_features``.\n\n alpha : float, default=1\n Sparsity controlling parameter.\n\n n_iter : int, default=100\n Number of mini-batch iterations to perform.\n\n return_code : bool, default=True\n Whether to also return the code U or just the dictionary `V`.\n\n dict_init : ndarray of shape (n_components, n_features), default=None\n Initial value for the dictionary for warm restart scenarios.\n\n callback : callable, default=None\n callable that gets invoked every five iterations.\n\n batch_size : int, default=3\n The number of samples to take in each batch.\n\n verbose : bool, default=False\n To control the verbosity of the procedure.\n\n shuffle : bool, default=True\n Whether to shuffle the data before splitting it in batches.\n\n n_jobs : int, default=None\n Number of parallel jobs to run.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n method : {'lars', 'cd'}, default='lars'\n * `'lars'`: uses the least angle regression method to solve the lasso\n problem (`linear_model.lars_path`);\n * `'cd'`: uses the coordinate descent method to compute the\n Lasso solution (`linear_model.Lasso`). Lars will be faster if\n the estimated components are sparse.\n\n iter_offset : int, default=0\n Number of previous iterations completed on the dictionary used for\n initialization.\n\n random_state : int, RandomState instance or None, default=None\n Used for initializing the dictionary when ``dict_init`` is not\n specified, randomly shuffling the data when ``shuffle`` is set to\n ``True``, and updating the dictionary. Pass an int for reproducible\n results across multiple function calls.\n See :term:`Glossary `.\n\n return_inner_stats : bool, default=False\n Return the inner statistics A (dictionary covariance) and B\n (data approximation). Useful to restart the algorithm in an\n online setting. 
If `return_inner_stats` is `True`, `return_code` is\n ignored.\n\n inner_stats : tuple of (A, B) ndarrays, default=None\n Inner sufficient statistics that are kept by the algorithm.\n Passing them at initialization is useful in online settings, to\n avoid losing the history of the evolution.\n `A` `(n_components, n_components)` is the dictionary covariance matrix.\n `B` `(n_features, n_components)` is the data approximation matrix.\n\n return_n_iter : bool, default=False\n Whether or not to return the number of iterations.\n\n positive_dict : bool, default=False\n Whether to enforce positivity when finding the dictionary.\n\n .. versionadded:: 0.20\n\n positive_code : bool, default=False\n Whether to enforce positivity when finding the code.\n\n .. versionadded:: 0.20\n\n method_max_iter : int, default=1000\n Maximum number of iterations to perform when solving the lasso problem.\n\n .. versionadded:: 0.22\n\n Returns\n -------\n code : ndarray of shape (n_samples, n_components),\n The sparse code (only returned if `return_code=True`).\n\n dictionary : ndarray of shape (n_components, n_features),\n The solutions to the dictionary learning problem.\n\n n_iter : int\n Number of iterations run. Returned only if `return_n_iter` is\n set to `True`.\n\n See Also\n --------\n dict_learning\n DictionaryLearning\n MiniBatchDictionaryLearning\n SparsePCA\n MiniBatchSparsePCA\n ", + "source_code": "\ndef dict_learning_online(X, n_components=2, *, alpha=1, n_iter=100, return_code=True, dict_init=None, callback=None, batch_size=3, verbose=False, shuffle=True, n_jobs=None, method='lars', iter_offset=0, random_state=None, return_inner_stats=False, inner_stats=None, return_n_iter=False, positive_dict=False, positive_code=False, method_max_iter=1000):\n \"\"\"Solves a dictionary learning matrix factorization problem online.\n\n Finds the best dictionary and the corresponding sparse code for\n approximating the data matrix X by solving::\n\n (U^*, V^*) = argmin 0.5 || X - U V ||_Fro^2 + alpha * || U ||_1,1\n (U,V)\n with || V_k ||_2 = 1 for all 0 <= k < n_components\n\n where V is the dictionary and U is the sparse code. ||.||_Fro stands for\n the Frobenius norm and ||.||_1,1 stands for the entry-wise matrix norm\n which is the sum of the absolute values of all the entries in the matrix.\n This is accomplished by repeatedly iterating over mini-batches by slicing\n the input data.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n X : ndarray of shape (n_samples, n_features)\n Data matrix.\n\n n_components : int or None, default=2\n Number of dictionary atoms to extract. 
If None, then ``n_components``\n is set to ``n_features``.\n\n alpha : float, default=1\n Sparsity controlling parameter.\n\n n_iter : int, default=100\n Number of mini-batch iterations to perform.\n\n return_code : bool, default=True\n Whether to also return the code U or just the dictionary `V`.\n\n dict_init : ndarray of shape (n_components, n_features), default=None\n Initial value for the dictionary for warm restart scenarios.\n\n callback : callable, default=None\n callable that gets invoked every five iterations.\n\n batch_size : int, default=3\n The number of samples to take in each batch.\n\n verbose : bool, default=False\n To control the verbosity of the procedure.\n\n shuffle : bool, default=True\n Whether to shuffle the data before splitting it in batches.\n\n n_jobs : int, default=None\n Number of parallel jobs to run.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n method : {'lars', 'cd'}, default='lars'\n * `'lars'`: uses the least angle regression method to solve the lasso\n problem (`linear_model.lars_path`);\n * `'cd'`: uses the coordinate descent method to compute the\n Lasso solution (`linear_model.Lasso`). Lars will be faster if\n the estimated components are sparse.\n\n iter_offset : int, default=0\n Number of previous iterations completed on the dictionary used for\n initialization.\n\n random_state : int, RandomState instance or None, default=None\n Used for initializing the dictionary when ``dict_init`` is not\n specified, randomly shuffling the data when ``shuffle`` is set to\n ``True``, and updating the dictionary. Pass an int for reproducible\n results across multiple function calls.\n See :term:`Glossary `.\n\n return_inner_stats : bool, default=False\n Return the inner statistics A (dictionary covariance) and B\n (data approximation). Useful to restart the algorithm in an\n online setting. If `return_inner_stats` is `True`, `return_code` is\n ignored.\n\n inner_stats : tuple of (A, B) ndarrays, default=None\n Inner sufficient statistics that are kept by the algorithm.\n Passing them at initialization is useful in online settings, to\n avoid losing the history of the evolution.\n `A` `(n_components, n_components)` is the dictionary covariance matrix.\n `B` `(n_features, n_components)` is the data approximation matrix.\n\n return_n_iter : bool, default=False\n Whether or not to return the number of iterations.\n\n positive_dict : bool, default=False\n Whether to enforce positivity when finding the dictionary.\n\n .. versionadded:: 0.20\n\n positive_code : bool, default=False\n Whether to enforce positivity when finding the code.\n\n .. versionadded:: 0.20\n\n method_max_iter : int, default=1000\n Maximum number of iterations to perform when solving the lasso problem.\n\n .. versionadded:: 0.22\n\n Returns\n -------\n code : ndarray of shape (n_samples, n_components),\n The sparse code (only returned if `return_code=True`).\n\n dictionary : ndarray of shape (n_components, n_features),\n The solutions to the dictionary learning problem.\n\n n_iter : int\n Number of iterations run. 
Returned only if `return_n_iter` is\n set to `True`.\n\n See Also\n --------\n dict_learning\n DictionaryLearning\n MiniBatchDictionaryLearning\n SparsePCA\n MiniBatchSparsePCA\n \"\"\"\n if n_components is None:\n n_components = X.shape[1]\n if method not in ('lars', 'cd'):\n raise ValueError('Coding method not supported as a fit algorithm.')\n _check_positive_coding(method, positive_code)\n method = 'lasso_' + method\n t0 = time.time()\n (n_samples, n_features) = X.shape\n alpha = float(alpha)\n random_state = check_random_state(random_state)\n if dict_init is not None:\n dictionary = dict_init\n else:\n (_, S, dictionary) = randomized_svd(X, n_components, random_state=random_state)\n dictionary = S[:, np.newaxis] * dictionary\n r = len(dictionary)\n if n_components <= r:\n dictionary = dictionary[:n_components, :]\n else:\n dictionary = np.r_[dictionary, np.zeros((n_components - r, dictionary.shape[1]))]\n if verbose == 1:\n print('[dict_learning]', end=' ')\n if shuffle:\n X_train = X.copy()\n random_state.shuffle(X_train)\n else:\n X_train = X\n dictionary = check_array(dictionary, order='F', dtype=np.float64, copy=False)\n dictionary = np.require(dictionary, requirements='W')\n X_train = check_array(X_train, order='C', dtype=np.float64, copy=False)\n batches = gen_batches(n_samples, batch_size)\n batches = itertools.cycle(batches)\n if inner_stats is None:\n A = np.zeros((n_components, n_components))\n B = np.zeros((n_features, n_components))\n else:\n A = inner_stats[0].copy()\n B = inner_stats[1].copy()\n ii = iter_offset - 1\n for (ii, batch) in zip(range(iter_offset, iter_offset + n_iter), batches):\n this_X = X_train[batch]\n dt = time.time() - t0\n if verbose == 1:\n sys.stdout.write('.')\n sys.stdout.flush()\n elif verbose:\n if verbose > 10 or ii % ceil(100.0 / verbose) == 0:\n print('Iteration % 3i (elapsed time: % 3is, % 4.1fmn)' % (ii, dt, dt / 60))\n this_code = sparse_encode(this_X, dictionary, algorithm=method, alpha=alpha, n_jobs=n_jobs, check_input=False, positive=positive_code, max_iter=method_max_iter, verbose=verbose)\n if ii < batch_size - 1:\n theta = float((ii + 1) * batch_size)\n else:\n theta = float(batch_size**2 + ii + 1 - batch_size)\n beta = (theta + 1 - batch_size) / (theta + 1)\n A *= beta\n A += np.dot(this_code.T, this_code)\n B *= beta\n B += np.dot(this_X.T, this_code)\n _update_dict(dictionary, this_X, this_code, A, B, verbose=verbose, random_state=random_state, positive=positive_dict)\n if callback is not None:\n callback(locals())\n if return_inner_stats:\n if return_n_iter:\n return dictionary, (A, B), ii - iter_offset + 1\n else:\n return dictionary, (A, B)\n if return_code:\n if verbose > 1:\n print('Learning code...', end=' ')\n elif verbose == 1:\n print('|', end=' ')\n code = sparse_encode(X, dictionary, algorithm=method, alpha=alpha, n_jobs=n_jobs, check_input=False, positive=positive_code, max_iter=method_max_iter, verbose=verbose)\n if verbose > 1:\n dt = time.time() - t0\n print('done (total time: % 3is, % 4.1fmn)' % (dt, dt / 60))\n if return_n_iter:\n return code, dictionary, ii - iter_offset + 1\n else:\n return code, dictionary\n if return_n_iter:\n return dictionary, ii - iter_offset + 1\n else:\n return dictionary" }, { "name": "sparse_encode", @@ -49035,7 +51141,8 @@ "docstring": { "type": "ndarray of shape (n_samples, n_features)", "description": "Data matrix." 
- } + }, + "refined_type": {} }, { "name": "dictionary", @@ -49045,7 +51152,8 @@ "docstring": { "type": "ndarray of shape (n_components, n_features)", "description": "The dictionary matrix against which to solve the sparse coding of\nthe data. Some of the algorithms assume normalized rows for meaningful\noutput." - } + }, + "refined_type": {} }, { "name": "gram", @@ -49055,7 +51163,8 @@ "docstring": { "type": "ndarray of shape (n_components, n_components), default=None", "description": "Precomputed Gram matrix, `dictionary * dictionary'`." - } + }, + "refined_type": {} }, { "name": "cov", @@ -49065,7 +51174,8 @@ "docstring": { "type": "ndarray of shape (n_components, n_samples), default=None", "description": "Precomputed covariance, `dictionary' * X`." - } + }, + "refined_type": {} }, { "name": "algorithm", @@ -49075,6 +51185,16 @@ "docstring": { "type": "{'lasso_lars', 'lasso_cd', 'lars', 'omp', 'threshold'}, default='lasso_lars'", "description": "The algorithm used:\n\n* `'lars'`: uses the least angle regression method\n (`linear_model.lars_path`);\n* `'lasso_lars'`: uses Lars to compute the Lasso solution;\n* `'lasso_cd'`: uses the coordinate descent method to compute the\n Lasso solution (`linear_model.Lasso`). lasso_lars will be faster if\n the estimated components are sparse;\n* `'omp'`: uses orthogonal matching pursuit to estimate the sparse\n solution;\n* `'threshold'`: squashes to zero all coefficients less than\n regularization from the projection `dictionary * data'`." + }, + "refined_type": { + "kind": "EnumType", + "values": [ + "omp", + "lasso_cd", + "lasso_lars", + "lars", + "threshold" + ] } }, { @@ -49085,7 +51205,8 @@ "docstring": { "type": "int, default=None", "description": "Number of nonzero coefficients to target in each column of the\nsolution. This is only used by `algorithm='lars'` and `algorithm='omp'`\nand is overridden by `alpha` in the `omp` case. If `None`, then\n`n_nonzero_coefs=int(n_features / 10)`." - } + }, + "refined_type": {} }, { "name": "alpha", @@ -49095,7 +51216,8 @@ "docstring": { "type": "float, default=None", "description": "If `algorithm='lasso_lars'` or `algorithm='lasso_cd'`, `alpha` is the\npenalty applied to the L1 norm.\nIf `algorithm='threshold'`, `alpha` is the absolute value of the\nthreshold below which coefficients will be squashed to zero.\nIf `algorithm='omp'`, `alpha` is the tolerance parameter: the value of\nthe reconstruction error targeted. In this case, it overrides\n`n_nonzero_coefs`.\nIf `None`, default to 1." - } + }, + "refined_type": {} }, { "name": "copy_cov", @@ -49105,7 +51227,8 @@ "docstring": { "type": "bool, default=True", "description": "Whether to copy the precomputed covariance matrix; if `False`, it may\nbe overwritten." - } + }, + "refined_type": {} }, { "name": "init", @@ -49115,7 +51238,8 @@ "docstring": { "type": "ndarray of shape (n_samples, n_components), default=None", "description": "Initialization value of the sparse codes. Only used if\n`algorithm='lasso_cd'`." - } + }, + "refined_type": {} }, { "name": "max_iter", @@ -49125,7 +51249,8 @@ "docstring": { "type": "int, default=1000", "description": "Maximum number of iterations to perform if `algorithm='lasso_cd'` or\n`'lasso_lars'`." - } + }, + "refined_type": {} }, { "name": "n_jobs", @@ -49135,7 +51260,8 @@ "docstring": { "type": "int, default=None", "description": "Number of parallel jobs to run.\n``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n``-1`` means using all processors. See :term:`Glossary `\nfor more details." 
- } + }, + "refined_type": {} }, { "name": "check_input", @@ -49145,7 +51271,8 @@ "docstring": { "type": "bool, default=True", "description": "If `False`, the input arrays X and dictionary will not be checked." - } + }, + "refined_type": {} }, { "name": "verbose", @@ -49155,7 +51282,8 @@ "docstring": { "type": "int, default=0", "description": "Controls the verbosity; the higher, the more messages." - } + }, + "refined_type": {} }, { "name": "positive", @@ -49165,13 +51293,14 @@ "docstring": { "type": "bool, default=False", "description": "Whether to enforce positivity when finding the encoding.\n\n.. versionadded:: 0.20" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Sparse coding\n\nEach row of the result is the solution to a sparse coding problem. The goal is to find a sparse array `code` such that:: X ~= code * dictionary Read more in the :ref:`User Guide `.", - "docstring": "Sparse coding\n\nEach row of the result is the solution to a sparse coding problem.\nThe goal is to find a sparse array `code` such that::\n\n X ~= code * dictionary\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\nX : ndarray of shape (n_samples, n_features)\n Data matrix.\n\ndictionary : ndarray of shape (n_components, n_features)\n The dictionary matrix against which to solve the sparse coding of\n the data. Some of the algorithms assume normalized rows for meaningful\n output.\n\ngram : ndarray of shape (n_components, n_components), default=None\n Precomputed Gram matrix, `dictionary * dictionary'`.\n\ncov : ndarray of shape (n_components, n_samples), default=None\n Precomputed covariance, `dictionary' * X`.\n\nalgorithm : {'lasso_lars', 'lasso_cd', 'lars', 'omp', 'threshold'}, default='lasso_lars'\n The algorithm used:\n\n * `'lars'`: uses the least angle regression method\n (`linear_model.lars_path`);\n * `'lasso_lars'`: uses Lars to compute the Lasso solution;\n * `'lasso_cd'`: uses the coordinate descent method to compute the\n Lasso solution (`linear_model.Lasso`). lasso_lars will be faster if\n the estimated components are sparse;\n * `'omp'`: uses orthogonal matching pursuit to estimate the sparse\n solution;\n * `'threshold'`: squashes to zero all coefficients less than\n regularization from the projection `dictionary * data'`.\n\nn_nonzero_coefs : int, default=None\n Number of nonzero coefficients to target in each column of the\n solution. This is only used by `algorithm='lars'` and `algorithm='omp'`\n and is overridden by `alpha` in the `omp` case. If `None`, then\n `n_nonzero_coefs=int(n_features / 10)`.\n\nalpha : float, default=None\n If `algorithm='lasso_lars'` or `algorithm='lasso_cd'`, `alpha` is the\n penalty applied to the L1 norm.\n If `algorithm='threshold'`, `alpha` is the absolute value of the\n threshold below which coefficients will be squashed to zero.\n If `algorithm='omp'`, `alpha` is the tolerance parameter: the value of\n the reconstruction error targeted. In this case, it overrides\n `n_nonzero_coefs`.\n If `None`, default to 1.\n\ncopy_cov : bool, default=True\n Whether to copy the precomputed covariance matrix; if `False`, it may\n be overwritten.\n\ninit : ndarray of shape (n_samples, n_components), default=None\n Initialization value of the sparse codes. 
Only used if\n `algorithm='lasso_cd'`.\n\nmax_iter : int, default=1000\n Maximum number of iterations to perform if `algorithm='lasso_cd'` or\n `'lasso_lars'`.\n\nn_jobs : int, default=None\n Number of parallel jobs to run.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\ncheck_input : bool, default=True\n If `False`, the input arrays X and dictionary will not be checked.\n\nverbose : int, default=0\n Controls the verbosity; the higher, the more messages.\n\npositive : bool, default=False\n Whether to enforce positivity when finding the encoding.\n\n .. versionadded:: 0.20\n\nReturns\n-------\ncode : ndarray of shape (n_samples, n_components)\n The sparse codes\n\nSee Also\n--------\nsklearn.linear_model.lars_path\nsklearn.linear_model.orthogonal_mp\nsklearn.linear_model.Lasso\nSparseCoder", + "description": "Sparse coding\n\nEach row of the result is the solution to a sparse coding problem.\nThe goal is to find a sparse array `code` such that::\n\n X ~= code * dictionary\n\nRead more in the :ref:`User Guide `.", + "docstring": "Sparse coding\n\n Each row of the result is the solution to a sparse coding problem.\n The goal is to find a sparse array `code` such that::\n\n X ~= code * dictionary\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n X : ndarray of shape (n_samples, n_features)\n Data matrix.\n\n dictionary : ndarray of shape (n_components, n_features)\n The dictionary matrix against which to solve the sparse coding of\n the data. Some of the algorithms assume normalized rows for meaningful\n output.\n\n gram : ndarray of shape (n_components, n_components), default=None\n Precomputed Gram matrix, `dictionary * dictionary'`.\n\n cov : ndarray of shape (n_components, n_samples), default=None\n Precomputed covariance, `dictionary' * X`.\n\n algorithm : {'lasso_lars', 'lasso_cd', 'lars', 'omp', 'threshold'}, default='lasso_lars'\n The algorithm used:\n\n * `'lars'`: uses the least angle regression method\n (`linear_model.lars_path`);\n * `'lasso_lars'`: uses Lars to compute the Lasso solution;\n * `'lasso_cd'`: uses the coordinate descent method to compute the\n Lasso solution (`linear_model.Lasso`). lasso_lars will be faster if\n the estimated components are sparse;\n * `'omp'`: uses orthogonal matching pursuit to estimate the sparse\n solution;\n * `'threshold'`: squashes to zero all coefficients less than\n regularization from the projection `dictionary * data'`.\n\n n_nonzero_coefs : int, default=None\n Number of nonzero coefficients to target in each column of the\n solution. This is only used by `algorithm='lars'` and `algorithm='omp'`\n and is overridden by `alpha` in the `omp` case. If `None`, then\n `n_nonzero_coefs=int(n_features / 10)`.\n\n alpha : float, default=None\n If `algorithm='lasso_lars'` or `algorithm='lasso_cd'`, `alpha` is the\n penalty applied to the L1 norm.\n If `algorithm='threshold'`, `alpha` is the absolute value of the\n threshold below which coefficients will be squashed to zero.\n If `algorithm='omp'`, `alpha` is the tolerance parameter: the value of\n the reconstruction error targeted. In this case, it overrides\n `n_nonzero_coefs`.\n If `None`, default to 1.\n\n copy_cov : bool, default=True\n Whether to copy the precomputed covariance matrix; if `False`, it may\n be overwritten.\n\n init : ndarray of shape (n_samples, n_components), default=None\n Initialization value of the sparse codes. 
Only used if\n `algorithm='lasso_cd'`.\n\n max_iter : int, default=1000\n Maximum number of iterations to perform if `algorithm='lasso_cd'` or\n `'lasso_lars'`.\n\n n_jobs : int, default=None\n Number of parallel jobs to run.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n check_input : bool, default=True\n If `False`, the input arrays X and dictionary will not be checked.\n\n verbose : int, default=0\n Controls the verbosity; the higher, the more messages.\n\n positive : bool, default=False\n Whether to enforce positivity when finding the encoding.\n\n .. versionadded:: 0.20\n\n Returns\n -------\n code : ndarray of shape (n_samples, n_components)\n The sparse codes\n\n See Also\n --------\n sklearn.linear_model.lars_path\n sklearn.linear_model.orthogonal_mp\n sklearn.linear_model.Lasso\n SparseCoder\n ", "source_code": "\ndef sparse_encode(X, dictionary, *, gram=None, cov=None, algorithm='lasso_lars', n_nonzero_coefs=None, alpha=None, copy_cov=True, init=None, max_iter=1000, n_jobs=None, check_input=True, verbose=0, positive=False):\n \"\"\"Sparse coding\n\n Each row of the result is the solution to a sparse coding problem.\n The goal is to find a sparse array `code` such that::\n\n X ~= code * dictionary\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n X : ndarray of shape (n_samples, n_features)\n Data matrix.\n\n dictionary : ndarray of shape (n_components, n_features)\n The dictionary matrix against which to solve the sparse coding of\n the data. Some of the algorithms assume normalized rows for meaningful\n output.\n\n gram : ndarray of shape (n_components, n_components), default=None\n Precomputed Gram matrix, `dictionary * dictionary'`.\n\n cov : ndarray of shape (n_components, n_samples), default=None\n Precomputed covariance, `dictionary' * X`.\n\n algorithm : {'lasso_lars', 'lasso_cd', 'lars', 'omp', 'threshold'}, default='lasso_lars'\n The algorithm used:\n\n * `'lars'`: uses the least angle regression method\n (`linear_model.lars_path`);\n * `'lasso_lars'`: uses Lars to compute the Lasso solution;\n * `'lasso_cd'`: uses the coordinate descent method to compute the\n Lasso solution (`linear_model.Lasso`). lasso_lars will be faster if\n the estimated components are sparse;\n * `'omp'`: uses orthogonal matching pursuit to estimate the sparse\n solution;\n * `'threshold'`: squashes to zero all coefficients less than\n regularization from the projection `dictionary * data'`.\n\n n_nonzero_coefs : int, default=None\n Number of nonzero coefficients to target in each column of the\n solution. This is only used by `algorithm='lars'` and `algorithm='omp'`\n and is overridden by `alpha` in the `omp` case. If `None`, then\n `n_nonzero_coefs=int(n_features / 10)`.\n\n alpha : float, default=None\n If `algorithm='lasso_lars'` or `algorithm='lasso_cd'`, `alpha` is the\n penalty applied to the L1 norm.\n If `algorithm='threshold'`, `alpha` is the absolute value of the\n threshold below which coefficients will be squashed to zero.\n If `algorithm='omp'`, `alpha` is the tolerance parameter: the value of\n the reconstruction error targeted. In this case, it overrides\n `n_nonzero_coefs`.\n If `None`, default to 1.\n\n copy_cov : bool, default=True\n Whether to copy the precomputed covariance matrix; if `False`, it may\n be overwritten.\n\n init : ndarray of shape (n_samples, n_components), default=None\n Initialization value of the sparse codes. 
Only used if\n `algorithm='lasso_cd'`.\n\n max_iter : int, default=1000\n Maximum number of iterations to perform if `algorithm='lasso_cd'` or\n `'lasso_lars'`.\n\n n_jobs : int, default=None\n Number of parallel jobs to run.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n check_input : bool, default=True\n If `False`, the input arrays X and dictionary will not be checked.\n\n verbose : int, default=0\n Controls the verbosity; the higher, the more messages.\n\n positive : bool, default=False\n Whether to enforce positivity when finding the encoding.\n\n .. versionadded:: 0.20\n\n Returns\n -------\n code : ndarray of shape (n_samples, n_components)\n The sparse codes\n\n See Also\n --------\n sklearn.linear_model.lars_path\n sklearn.linear_model.orthogonal_mp\n sklearn.linear_model.Lasso\n SparseCoder\n \"\"\"\n if check_input:\n if algorithm == 'lasso_cd':\n dictionary = check_array(dictionary, order='C', dtype='float64')\n X = check_array(X, order='C', dtype='float64')\n else:\n dictionary = check_array(dictionary)\n X = check_array(X)\n (n_samples, n_features) = X.shape\n n_components = dictionary.shape[0]\n if gram is None and algorithm != 'threshold':\n gram = np.dot(dictionary, dictionary.T)\n if cov is None and algorithm != 'lasso_cd':\n copy_cov = False\n cov = np.dot(dictionary, X.T)\n if algorithm in ('lars', 'omp'):\n regularization = n_nonzero_coefs\n if regularization is None:\n regularization = min(max(n_features / 10, 1), n_components)\n else:\n regularization = alpha\n if regularization is None:\n regularization = 1.0\n if effective_n_jobs(n_jobs) == 1 or algorithm == 'threshold':\n code = _sparse_encode(X, dictionary, gram, cov=cov, algorithm=algorithm, regularization=regularization, copy_cov=copy_cov, init=init, max_iter=max_iter, check_input=False, verbose=verbose, positive=positive)\n return code\n code = np.empty((n_samples, n_components))\n slices = list(gen_even_slices(n_samples, effective_n_jobs(n_jobs)))\n code_views = Parallel(n_jobs=n_jobs, verbose=verbose)((delayed(_sparse_encode)(X[this_slice], dictionary, gram, cov[:, this_slice] if cov is not None else None, algorithm, regularization=regularization, copy_cov=copy_cov, init=init[this_slice] if init is not None else None, max_iter=max_iter, check_input=False, verbose=verbose, positive=positive) for this_slice in slices))\n for (this_slice, this_view) in zip(slices, code_views):\n code[this_slice] = this_view\n return code" }, { @@ -49189,7 +51318,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_components", @@ -49199,7 +51329,8 @@ "docstring": { "type": "int, default=None", "description": "Dimensionality of latent space, the number of components\nof ``X`` that are obtained after ``transform``.\nIf None, n_components is set to the number of features." - } + }, + "refined_type": {} }, { "name": "tol", @@ -49209,7 +51340,8 @@ "docstring": { "type": "float, default=1e-2", "description": "Stopping tolerance for log-likelihood increase." - } + }, + "refined_type": {} }, { "name": "copy", @@ -49219,7 +51351,8 @@ "docstring": { "type": "bool, default=True", "description": "Whether to make a copy of X. If ``False``, the input X gets overwritten\nduring fitting." - } + }, + "refined_type": {} }, { "name": "max_iter", @@ -49229,7 +51362,8 @@ "docstring": { "type": "int, default=1000", "description": "Maximum number of iterations." 
- } + }, + "refined_type": {} }, { "name": "noise_variance_init", @@ -49239,7 +51373,8 @@ "docstring": { "type": "ndarray of shape (n_features,), default=None", "description": "The initial guess of the noise variance for each feature.\nIf None, it defaults to np.ones(n_features)." - } + }, + "refined_type": {} }, { "name": "svd_method", @@ -49249,6 +51384,10 @@ "docstring": { "type": "{'lapack', 'randomized'}, default='randomized'", "description": "Which SVD method to use. If 'lapack' use standard SVD from\nscipy.linalg, if 'randomized' use fast ``randomized_svd`` function.\nDefaults to 'randomized'. For most applications 'randomized' will\nbe sufficiently precise while providing significant speed gains.\nAccuracy can also be improved by setting higher values for\n`iterated_power`. If this is not sufficient, for maximum precision\nyou should choose 'lapack'." + }, + "refined_type": { + "kind": "EnumType", + "values": ["lapack", "randomized"] } }, { @@ -49259,7 +51398,8 @@ "docstring": { "type": "int, default=3", "description": "Number of iterations for the power method. 3 by default. Only used\nif ``svd_method`` equals 'randomized'." - } + }, + "refined_type": {} }, { "name": "rotation", @@ -49269,6 +51409,10 @@ "docstring": { "type": "{'varimax', 'quartimax'}, default=None", "description": "If not None, apply the indicated rotation. Currently, varimax and\nquartimax are implemented. See\n`\"The varimax criterion for analytic rotation in factor analysis\"\n`_\nH. F. Kaiser, 1958.\n\n.. versionadded:: 0.24" + }, + "refined_type": { + "kind": "EnumType", + "values": ["varimax", "quartimax"] } }, { @@ -49279,13 +51423,14 @@ "docstring": { "type": "int or RandomState instance, default=0", "description": "Only used when ``svd_method`` equals 'randomized'. Pass an int for\nreproducible results across multiple function calls.\nSee :term:`Glossary `." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, n_components=None, *, tol=0.01, copy=True, max_iter=1000, noise_variance_init=None, svd_method='randomized', iterated_power=3, rotation=None, random_state=0):\n self.n_components = n_components\n self.copy = copy\n self.tol = tol\n self.max_iter = max_iter\n if svd_method not in ['lapack', 'randomized']:\n raise ValueError('SVD method %s is not supported. Please consider the documentation' % svd_method)\n self.svd_method = svd_method\n self.noise_variance_init = noise_variance_init\n self.iterated_power = iterated_power\n self.random_state = random_state\n self.rotation = rotation" }, { @@ -49303,7 +51448,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "components", @@ -49313,7 +51459,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_components", @@ -49323,7 +51470,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "tol", @@ -49333,7 +51481,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -49357,7 +51506,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -49367,7 +51517,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "Training data." - } + }, + "refined_type": {} }, { "name": "y", @@ -49377,13 +51528,14 @@ "docstring": { "type": "Ignored", "description": "Ignored parameter." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Fit the FactorAnalysis model to X using SVD based approach.", - "docstring": "Fit the FactorAnalysis model to X using SVD based approach.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n Training data.\n\ny : Ignored\n Ignored parameter.\n\nReturns\n-------\nself : object\n FactorAnalysis class instance.", + "docstring": "Fit the FactorAnalysis model to X using SVD based approach.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training data.\n\n y : Ignored\n Ignored parameter.\n\n Returns\n -------\n self : object\n FactorAnalysis class instance.\n ", "source_code": "\ndef fit(self, X, y=None):\n \"\"\"Fit the FactorAnalysis model to X using SVD based approach.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training data.\n\n y : Ignored\n Ignored parameter.\n\n Returns\n -------\n self : object\n FactorAnalysis class instance.\n \"\"\"\n X = self._validate_data(X, copy=self.copy, dtype=np.float64)\n (n_samples, n_features) = X.shape\n n_components = self.n_components\n if n_components is None:\n n_components = n_features\n self.mean_ = np.mean(X, axis=0)\n X -= self.mean_\n nsqrt = sqrt(n_samples)\n llconst = n_features * log(2.0 * np.pi) + n_components\n var = np.var(X, axis=0)\n if self.noise_variance_init is None:\n psi = np.ones(n_features, dtype=X.dtype)\n else:\n if len(self.noise_variance_init) != n_features:\n raise ValueError('noise_variance_init dimension does not with number of features : %d != %d' % (len(self.noise_variance_init), n_features))\n psi = np.array(self.noise_variance_init)\n loglike = []\n old_ll = -np.inf\n SMALL = 1e-12\n if self.svd_method == 'lapack':\n \n def my_svd(X):\n (_, s, Vt) = linalg.svd(X, full_matrices=False, check_finite=False)\n return s[:n_components], Vt[:n_components], squared_norm(s[n_components:])\n elif self.svd_method == 'randomized':\n random_state = check_random_state(self.random_state)\n \n def my_svd(X):\n (_, s, Vt) = randomized_svd(X, n_components, random_state=random_state, n_iter=self.iterated_power)\n return s, Vt, squared_norm(X) - squared_norm(s)\n else:\n raise ValueError('SVD method %s is not supported. Please consider the documentation' % self.svd_method)\n for i in range(self.max_iter):\n sqrt_psi = np.sqrt(psi) + SMALL\n (s, Vt, unexp_var) = my_svd(X / (sqrt_psi * nsqrt))\n s **= 2\n W = np.sqrt(np.maximum(s - 1.0, 0.0))[:, np.newaxis] * Vt\n del Vt\n W *= sqrt_psi\n ll = llconst + np.sum(np.log(s))\n ll += unexp_var + np.sum(np.log(psi))\n ll *= -n_samples / 2.0\n loglike.append(ll)\n if ll - old_ll < self.tol:\n break\n old_ll = ll\n psi = np.maximum(var - np.sum(W**2, axis=0), SMALL)\n else:\n warnings.warn('FactorAnalysis did not converge.' 
+ ' You might want' + ' to increase the number of iterations.', ConvergenceWarning)\n self.components_ = W\n if self.rotation is not None:\n self.components_ = self._rotate(W)\n self.noise_variance_ = psi\n self.loglike_ = loglike\n self.n_iter_ = i + 1\n return self" }, { @@ -49401,13 +51553,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Compute data covariance with the FactorAnalysis model.\n\n``cov = components_.T * components_ + diag(noise_variance)``", - "docstring": "Compute data covariance with the FactorAnalysis model.\n\n``cov = components_.T * components_ + diag(noise_variance)``\n\nReturns\n-------\ncov : ndarray of shape (n_features, n_features)\n Estimated covariance of data.", + "docstring": "Compute data covariance with the FactorAnalysis model.\n\n ``cov = components_.T * components_ + diag(noise_variance)``\n\n Returns\n -------\n cov : ndarray of shape (n_features, n_features)\n Estimated covariance of data.\n ", "source_code": "\ndef get_covariance(self):\n \"\"\"Compute data covariance with the FactorAnalysis model.\n\n ``cov = components_.T * components_ + diag(noise_variance)``\n\n Returns\n -------\n cov : ndarray of shape (n_features, n_features)\n Estimated covariance of data.\n \"\"\"\n check_is_fitted(self)\n cov = np.dot(self.components_.T, self.components_)\n cov.flat[::len(cov) + 1] += self.noise_variance_\n return cov" }, { @@ -49425,13 +51578,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Compute data precision matrix with the FactorAnalysis model.", - "docstring": "Compute data precision matrix with the FactorAnalysis model.\n\nReturns\n-------\nprecision : ndarray of shape (n_features, n_features)\n Estimated precision of data.", + "docstring": "Compute data precision matrix with the FactorAnalysis model.\n\n Returns\n -------\n precision : ndarray of shape (n_features, n_features)\n Estimated precision of data.\n ", "source_code": "\ndef get_precision(self):\n \"\"\"Compute data precision matrix with the FactorAnalysis model.\n\n Returns\n -------\n precision : ndarray of shape (n_features, n_features)\n Estimated precision of data.\n \"\"\"\n check_is_fitted(self)\n n_features = self.components_.shape[1]\n if self.n_components == 0:\n return np.diag(1.0 / self.noise_variance_)\n if self.n_components == n_features:\n return linalg.inv(self.get_covariance())\n components_ = self.components_\n precision = np.dot(components_ / self.noise_variance_, components_.T)\n precision.flat[::len(precision) + 1] += 1.0\n precision = np.dot(components_.T, np.dot(linalg.inv(precision), components_))\n precision /= self.noise_variance_[:, np.newaxis]\n precision /= -self.noise_variance_[np.newaxis, :]\n precision.flat[::len(precision) + 1] += 1.0 / self.noise_variance_\n return precision" }, { @@ -49449,7 +51603,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -49459,7 +51614,8 @@ "docstring": { "type": "ndarray of shape (n_samples, n_features)", "description": "The data." - } + }, + "refined_type": {} }, { "name": "y", @@ -49469,13 +51625,14 @@ "docstring": { "type": "Ignored", "description": "Ignored parameter." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Compute the average log-likelihood of the samples.", - "docstring": "Compute the average log-likelihood of the samples.\n\nParameters\n----------\nX : ndarray of shape (n_samples, n_features)\n The data.\n\ny : Ignored\n Ignored parameter.\n\nReturns\n-------\nll : float\n Average log-likelihood of the samples under the current model.", + "docstring": "Compute the average log-likelihood of the samples.\n\n Parameters\n ----------\n X : ndarray of shape (n_samples, n_features)\n The data.\n\n y : Ignored\n Ignored parameter.\n\n Returns\n -------\n ll : float\n Average log-likelihood of the samples under the current model.\n ", "source_code": "\ndef score(self, X, y=None):\n \"\"\"Compute the average log-likelihood of the samples.\n\n Parameters\n ----------\n X : ndarray of shape (n_samples, n_features)\n The data.\n\n y : Ignored\n Ignored parameter.\n\n Returns\n -------\n ll : float\n Average log-likelihood of the samples under the current model.\n \"\"\"\n return np.mean(self.score_samples(X))" }, { @@ -49493,7 +51650,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -49503,13 +51661,14 @@ "docstring": { "type": "ndarray of shape (n_samples, n_features)", "description": "The data." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Compute the log-likelihood of each sample.", - "docstring": "Compute the log-likelihood of each sample.\n\nParameters\n----------\nX : ndarray of shape (n_samples, n_features)\n The data.\n\nReturns\n-------\nll : ndarray of shape (n_samples,)\n Log-likelihood of each sample under the current model.", + "docstring": "Compute the log-likelihood of each sample.\n\n Parameters\n ----------\n X : ndarray of shape (n_samples, n_features)\n The data.\n\n Returns\n -------\n ll : ndarray of shape (n_samples,)\n Log-likelihood of each sample under the current model.\n ", "source_code": "\ndef score_samples(self, X):\n \"\"\"Compute the log-likelihood of each sample.\n\n Parameters\n ----------\n X : ndarray of shape (n_samples, n_features)\n The data.\n\n Returns\n -------\n ll : ndarray of shape (n_samples,)\n Log-likelihood of each sample under the current model.\n \"\"\"\n check_is_fitted(self)\n X = self._validate_data(X, reset=False)\n Xr = X - self.mean_\n precision = self.get_precision()\n n_features = X.shape[1]\n log_like = -0.5 * (Xr * np.dot(Xr, precision)).sum(axis=1)\n log_like -= 0.5 * (n_features * log(2.0 * np.pi) - fast_logdet(precision))\n return log_like" }, { @@ -49527,7 +51686,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -49537,13 +51697,14 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "Training data." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Apply dimensionality reduction to X using the model.\n\nCompute the expected mean of the latent variables. 
See Barber, 21.2.33 (or Bishop, 12.66).", - "docstring": "Apply dimensionality reduction to X using the model.\n\nCompute the expected mean of the latent variables.\nSee Barber, 21.2.33 (or Bishop, 12.66).\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n Training data.\n\nReturns\n-------\nX_new : ndarray of shape (n_samples, n_components)\n The latent variables of X.", + "description": "Apply dimensionality reduction to X using the model.\n\nCompute the expected mean of the latent variables.\nSee Barber, 21.2.33 (or Bishop, 12.66).", + "docstring": "Apply dimensionality reduction to X using the model.\n\n Compute the expected mean of the latent variables.\n See Barber, 21.2.33 (or Bishop, 12.66).\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training data.\n\n Returns\n -------\n X_new : ndarray of shape (n_samples, n_components)\n The latent variables of X.\n ", "source_code": "\ndef transform(self, X):\n \"\"\"Apply dimensionality reduction to X using the model.\n\n Compute the expected mean of the latent variables.\n See Barber, 21.2.33 (or Bishop, 12.66).\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training data.\n\n Returns\n -------\n X_new : ndarray of shape (n_samples, n_components)\n The latent variables of X.\n \"\"\"\n check_is_fitted(self)\n X = self._validate_data(X, reset=False)\n Ih = np.eye(len(self.components_))\n X_transformed = X - self.mean_\n Wpsi = self.components_ / self.noise_variance_\n cov_z = linalg.inv(Ih + np.dot(Wpsi, self.components_.T))\n tmp = np.dot(X_transformed, Wpsi.T)\n X_transformed = np.dot(tmp, cov_z)\n return X_transformed" }, { @@ -49561,7 +51722,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "method", @@ -49571,7 +51733,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "tol", @@ -49581,7 +51744,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "max_iter", @@ -49591,7 +51755,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -49615,7 +51780,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_components", @@ -49625,7 +51791,8 @@ "docstring": { "type": "int, default=None", "description": "Number of components to use. If None is passed, all are used." - } + }, + "refined_type": {} }, { "name": "algorithm", @@ -49635,6 +51802,10 @@ "docstring": { "type": "{'parallel', 'deflation'}, default='parallel'", "description": "Apply parallel or deflational algorithm for FastICA." + }, + "refined_type": { + "kind": "EnumType", + "values": ["deflation", "parallel"] } }, { @@ -49645,7 +51816,8 @@ "docstring": { "type": "bool, default=True", "description": "If whiten is false, the data is already considered to be\nwhitened, and no whitening is performed." - } + }, + "refined_type": {} }, { "name": "fun", @@ -49655,6 +51827,10 @@ "docstring": { "type": "{'logcosh', 'exp', 'cube'} or callable, default='logcosh'", "description": "The functional form of the G function used in the\napproximation to neg-entropy. Could be either 'logcosh', 'exp',\nor 'cube'.\nYou can also provide your own function. It should return a tuple\ncontaining the value of the function, and of its derivative, in the\npoint. 
Example::\n\n def my_g(x):\n return x ** 3, (3 * x ** 2).mean(axis=-1)" + }, + "refined_type": { + "kind": "EnumType", + "values": ["exp", "cube", "logcosh"] } }, { @@ -49665,6 +51841,10 @@ "docstring": { "type": "dict, default=None", "description": "Arguments to send to the functional form.\nIf empty and if fun='logcosh', fun_args will take value\n{'alpha' : 1.0}." + }, + "refined_type": { + "kind": "EnumType", + "values": ["alpha"] } }, { @@ -49675,7 +51855,8 @@ "docstring": { "type": "int, default=200", "description": "Maximum number of iterations during fit." - } + }, + "refined_type": {} }, { "name": "tol", @@ -49685,7 +51866,8 @@ "docstring": { "type": "float, default=1e-4", "description": "Tolerance on update at each iteration." - } + }, + "refined_type": {} }, { "name": "w_init", @@ -49695,7 +51877,8 @@ "docstring": { "type": "ndarray of shape (n_components, n_components), default=None", "description": "The mixing matrix to be used to initialize the algorithm." - } + }, + "refined_type": {} }, { "name": "random_state", @@ -49705,13 +51888,14 @@ "docstring": { "type": "int, RandomState instance or None, default=None", "description": "Used to initialize ``w_init`` when not specified, with a\nnormal distribution. Pass an int, for reproducible results\nacross multiple function calls.\nSee :term:`Glossary `." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, n_components=None, *, algorithm='parallel', whiten=True, fun='logcosh', fun_args=None, max_iter=200, tol=0.0001, w_init=None, random_state=None):\n super().__init__()\n if max_iter < 1:\n raise ValueError('max_iter should be greater than 1, got (max_iter={})'.format(max_iter))\n self.n_components = n_components\n self.algorithm = algorithm\n self.whiten = whiten\n self.fun = fun\n self.fun_args = fun_args\n self.max_iter = max_iter\n self.tol = tol\n self.w_init = w_init\n self.random_state = random_state" }, { @@ -49729,7 +51913,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -49739,7 +51924,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "Training data, where `n_samples` is the number of samples\nand `n_features` is the number of features." - } + }, + "refined_type": {} }, { "name": "compute_sources", @@ -49749,13 +51935,14 @@ "docstring": { "type": "bool, default=False", "description": "If False, sources are not computes but only the rotation matrix.\nThis can save memory when working with big data. Defaults to False." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Fit the model", - "docstring": "Fit the model\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n Training data, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\ncompute_sources : bool, default=False\n If False, sources are not computes but only the rotation matrix.\n This can save memory when working with big data. Defaults to False.\n\nReturns\n-------\nS : ndarray of shape (n_samples, n_components) or None\n Sources matrix. 
`None` if `compute_sources` is `False`.", + "docstring": "Fit the model\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training data, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n compute_sources : bool, default=False\n If False, sources are not computes but only the rotation matrix.\n This can save memory when working with big data. Defaults to False.\n\n Returns\n -------\n S : ndarray of shape (n_samples, n_components) or None\n Sources matrix. `None` if `compute_sources` is `False`.\n ", "source_code": "\ndef _fit(self, X, compute_sources=False):\n \"\"\"Fit the model\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training data, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n compute_sources : bool, default=False\n If False, sources are not computes but only the rotation matrix.\n This can save memory when working with big data. Defaults to False.\n\n Returns\n -------\n S : ndarray of shape (n_samples, n_components) or None\n Sources matrix. `None` if `compute_sources` is `False`.\n \"\"\"\n XT = self._validate_data(X, copy=self.whiten, dtype=FLOAT_DTYPES, ensure_min_samples=2).T\n fun_args = {} if self.fun_args is None else self.fun_args\n random_state = check_random_state(self.random_state)\n alpha = fun_args.get('alpha', 1.0)\n if not 1 <= alpha <= 2:\n raise ValueError('alpha must be in [1,2]')\n if self.fun == 'logcosh':\n g = _logcosh\n elif self.fun == 'exp':\n g = _exp\n elif self.fun == 'cube':\n g = _cube\n elif callable(self.fun):\n \n def g(x, fun_args):\n return self.fun(x, **fun_args)\n else:\n exc = ValueError if isinstance(self.fun, str) else TypeError\n raise exc(\"Unknown function %r; should be one of 'logcosh', 'exp', 'cube' or callable\" % self.fun)\n (n_features, n_samples) = XT.shape\n n_components = self.n_components\n if not self.whiten and n_components is not None:\n n_components = None\n warnings.warn('Ignoring n_components with whiten=False.')\n if n_components is None:\n n_components = min(n_samples, n_features)\n if n_components > min(n_samples, n_features):\n n_components = min(n_samples, n_features)\n warnings.warn('n_components is too large: it will be set to %s' % n_components)\n if self.whiten:\n X_mean = XT.mean(axis=-1)\n XT -= X_mean[:, np.newaxis]\n (u, d, _) = linalg.svd(XT, full_matrices=False, check_finite=False)\n del _\n K = (u / d).T[:n_components]\n del u, d\n X1 = np.dot(K, XT)\n X1 *= np.sqrt(n_samples)\n else:\n X1 = as_float_array(XT, copy=False)\n w_init = self.w_init\n if w_init is None:\n w_init = np.asarray(random_state.normal(size=(n_components, n_components)), dtype=X1.dtype)\n else:\n w_init = np.asarray(w_init)\n if w_init.shape != (n_components, n_components):\n raise ValueError('w_init has invalid shape -- should be %(shape)s' % {'shape': (n_components, n_components)})\n kwargs = {'tol': self.tol, 'g': g, 'fun_args': fun_args, 'max_iter': self.max_iter, 'w_init': w_init}\n if self.algorithm == 'parallel':\n (W, n_iter) = _ica_par(X1, **kwargs)\n elif self.algorithm == 'deflation':\n (W, n_iter) = _ica_def(X1, **kwargs)\n else:\n raise ValueError('Invalid algorithm: must be either `parallel` or `deflation`.')\n del X1\n if compute_sources:\n if self.whiten:\n S = np.linalg.multi_dot([W, K, XT]).T\n else:\n S = np.dot(W, XT).T\n else:\n S = None\n self.n_iter_ = n_iter\n if self.whiten:\n self.components_ = np.dot(W, K)\n self.mean_ = X_mean\n self.whitening_ = K\n else:\n 
self.components_ = W\n self.mixing_ = linalg.pinv(self.components_, check_finite=False)\n self._unmixing = W\n return S" }, { @@ -49773,7 +51960,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -49783,7 +51971,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "Training data, where `n_samples` is the number of samples\nand `n_features` is the number of features." - } + }, + "refined_type": {} }, { "name": "y", @@ -49793,13 +51982,14 @@ "docstring": { "type": "Ignored", "description": "Not used, present for API consistency by convention." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Fit the model to X.", - "docstring": "Fit the model to X.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n Training data, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\ny : Ignored\n Not used, present for API consistency by convention.\n\nReturns\n-------\nself : object\n Returns the instance itself.", + "docstring": "Fit the model to X.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training data, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n Returns\n -------\n self : object\n Returns the instance itself.\n ", "source_code": "\ndef fit(self, X, y=None):\n \"\"\"Fit the model to X.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training data, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n Returns\n -------\n self : object\n Returns the instance itself.\n \"\"\"\n self._fit(X, compute_sources=False)\n return self" }, { @@ -49817,7 +52007,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -49827,7 +52018,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "Training data, where `n_samples` is the number of samples\nand `n_features` is the number of features." - } + }, + "refined_type": {} }, { "name": "y", @@ -49837,13 +52029,14 @@ "docstring": { "type": "Ignored", "description": "Not used, present for API consistency by convention." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Fit the model and recover the sources from X.", - "docstring": "Fit the model and recover the sources from X.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n Training data, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\ny : Ignored\n Not used, present for API consistency by convention.\n\nReturns\n-------\nX_new : ndarray of shape (n_samples, n_components)\n Estimated sources obtained by transforming the data with the\n estimated unmixing matrix.", + "docstring": "Fit the model and recover the sources from X.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training data, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n Returns\n -------\n X_new : ndarray of shape (n_samples, n_components)\n Estimated sources obtained by transforming the data with the\n estimated unmixing matrix.\n ", "source_code": "\ndef fit_transform(self, X, y=None):\n \"\"\"Fit the model and recover the sources from X.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training data, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n Returns\n -------\n X_new : ndarray of shape (n_samples, n_components)\n Estimated sources obtained by transforming the data with the\n estimated unmixing matrix.\n \"\"\"\n return self._fit(X, compute_sources=True)" }, { @@ -49861,7 +52054,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -49871,7 +52065,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_components)", "description": "Sources, where `n_samples` is the number of samples\nand `n_components` is the number of components." - } + }, + "refined_type": {} }, { "name": "copy", @@ -49881,13 +52076,14 @@ "docstring": { "type": "bool, default=True", "description": "If False, data passed to fit are overwritten. Defaults to True." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Transform the sources back to the mixed data (apply mixing matrix).", - "docstring": "Transform the sources back to the mixed data (apply mixing matrix).\n\nParameters\n----------\nX : array-like of shape (n_samples, n_components)\n Sources, where `n_samples` is the number of samples\n and `n_components` is the number of components.\ncopy : bool, default=True\n If False, data passed to fit are overwritten. Defaults to True.\n\nReturns\n-------\nX_new : ndarray of shape (n_samples, n_features)\n Reconstructed data obtained with the mixing matrix.", + "docstring": "Transform the sources back to the mixed data (apply mixing matrix).\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_components)\n Sources, where `n_samples` is the number of samples\n and `n_components` is the number of components.\n copy : bool, default=True\n If False, data passed to fit are overwritten. 
Defaults to True.\n\n Returns\n -------\n X_new : ndarray of shape (n_samples, n_features)\n Reconstructed data obtained with the mixing matrix.\n ", "source_code": "\ndef inverse_transform(self, X, copy=True):\n \"\"\"Transform the sources back to the mixed data (apply mixing matrix).\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_components)\n Sources, where `n_samples` is the number of samples\n and `n_components` is the number of components.\n copy : bool, default=True\n If False, data passed to fit are overwritten. Defaults to True.\n\n Returns\n -------\n X_new : ndarray of shape (n_samples, n_features)\n Reconstructed data obtained with the mixing matrix.\n \"\"\"\n check_is_fitted(self)\n X = check_array(X, copy=copy and self.whiten, dtype=FLOAT_DTYPES)\n X = np.dot(X, self.mixing_.T)\n if self.whiten:\n X += self.mean_\n return X" }, { @@ -49905,7 +52101,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -49915,7 +52112,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "Data to transform, where `n_samples` is the number of samples\nand `n_features` is the number of features." - } + }, + "refined_type": {} }, { "name": "copy", @@ -49925,13 +52123,14 @@ "docstring": { "type": "bool, default=True", "description": "If False, data passed to fit can be overwritten. Defaults to True." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Recover the sources from X (apply the unmixing matrix).", - "docstring": "Recover the sources from X (apply the unmixing matrix).\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n Data to transform, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\ncopy : bool, default=True\n If False, data passed to fit can be overwritten. Defaults to True.\n\nReturns\n-------\nX_new : ndarray of shape (n_samples, n_components)\n Estimated sources obtained by transforming the data with the\n estimated unmixing matrix.", + "docstring": "Recover the sources from X (apply the unmixing matrix).\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Data to transform, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n copy : bool, default=True\n If False, data passed to fit can be overwritten. Defaults to True.\n\n Returns\n -------\n X_new : ndarray of shape (n_samples, n_components)\n Estimated sources obtained by transforming the data with the\n estimated unmixing matrix.\n ", "source_code": "\ndef transform(self, X, copy=True):\n \"\"\"Recover the sources from X (apply the unmixing matrix).\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Data to transform, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n copy : bool, default=True\n If False, data passed to fit can be overwritten. 
Defaults to True.\n\n Returns\n -------\n X_new : ndarray of shape (n_samples, n_components)\n Estimated sources obtained by transforming the data with the\n estimated unmixing matrix.\n \"\"\"\n check_is_fitted(self)\n X = self._validate_data(X, copy=copy and self.whiten, dtype=FLOAT_DTYPES, reset=False)\n if self.whiten:\n X -= self.mean_\n return np.dot(X, self.components_.T)" }, { @@ -49949,7 +52148,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "fun_args", @@ -49959,13 +52159,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _cube(x, fun_args):\n return x**3, (3 * x**2).mean(axis=-1)" }, { @@ -49983,7 +52184,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "fun_args", @@ -49993,13 +52195,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _exp(x, fun_args):\n exp = np.exp(-x**2 / 2)\n gx = x * exp\n g_x = (1 - x**2) * exp\n return gx, g_x.mean(axis=-1)" }, { @@ -50017,7 +52220,8 @@ "docstring": { "type": "ndarray of shape (n,)", "description": "Array to be orthogonalized" - } + }, + "refined_type": {} }, { "name": "W", @@ -50027,7 +52231,8 @@ "docstring": { "type": "ndarray of shape (p, n)", "description": "Null space definition" - } + }, + "refined_type": {} }, { "name": "j", @@ -50037,13 +52242,14 @@ "docstring": { "type": "int < p", "description": "The no of (from the first) rows of Null space W wrt which w is\northogonalized." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Orthonormalize w wrt the first j rows of W.", - "docstring": "Orthonormalize w wrt the first j rows of W.\n\nParameters\n----------\nw : ndarray of shape (n,)\n Array to be orthogonalized\n\nW : ndarray of shape (p, n)\n Null space definition\n\nj : int < p\n The no of (from the first) rows of Null space W wrt which w is\n orthogonalized.\n\nNotes\n-----\nAssumes that W is orthogonal\nw changed in place", + "docstring": "\n Orthonormalize w wrt the first j rows of W.\n\n Parameters\n ----------\n w : ndarray of shape (n,)\n Array to be orthogonalized\n\n W : ndarray of shape (p, n)\n Null space definition\n\n j : int < p\n The no of (from the first) rows of Null space W wrt which w is\n orthogonalized.\n\n Notes\n -----\n Assumes that W is orthogonal\n w changed in place\n ", "source_code": "\ndef _gs_decorrelation(w, W, j):\n \"\"\"\n Orthonormalize w wrt the first j rows of W.\n\n Parameters\n ----------\n w : ndarray of shape (n,)\n Array to be orthogonalized\n\n W : ndarray of shape (p, n)\n Null space definition\n\n j : int < p\n The no of (from the first) rows of Null space W wrt which w is\n orthogonalized.\n\n Notes\n -----\n Assumes that W is orthogonal\n w changed in place\n \"\"\"\n w -= np.linalg.multi_dot([w, W[:j].T, W[:j]])\n return w" }, { @@ -50061,7 +52267,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "tol", @@ -50071,7 +52278,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "g", @@ -50081,7 +52289,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "fun_args", @@ -50091,7 +52300,8 @@ "docstring": { "type": "", "description": "" - } + }, + 
"refined_type": {} }, { "name": "max_iter", @@ -50101,7 +52311,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "w_init", @@ -50111,13 +52322,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Deflationary FastICA using fun approx to neg-entropy function\n\nUsed internally by FastICA.", - "docstring": "Deflationary FastICA using fun approx to neg-entropy function\n\nUsed internally by FastICA.", + "docstring": "Deflationary FastICA using fun approx to neg-entropy function\n\n Used internally by FastICA.\n ", "source_code": "\ndef _ica_def(X, tol, g, fun_args, max_iter, w_init):\n \"\"\"Deflationary FastICA using fun approx to neg-entropy function\n\n Used internally by FastICA.\n \"\"\"\n n_components = w_init.shape[0]\n W = np.zeros((n_components, n_components), dtype=X.dtype)\n n_iter = []\n for j in range(n_components):\n w = w_init[j, :].copy()\n w /= np.sqrt((w**2).sum())\n for i in range(max_iter):\n (gwtx, g_wtx) = g(np.dot(w.T, X), fun_args)\n w1 = (X * gwtx).mean(axis=1) - g_wtx.mean() * w\n _gs_decorrelation(w1, W, j)\n w1 /= np.sqrt((w1**2).sum())\n lim = np.abs(np.abs((w1 * w).sum()) - 1)\n w = w1\n if lim < tol:\n break\n n_iter.append(i + 1)\n W[j, :] = w\n return W, max(n_iter)" }, { @@ -50135,7 +52347,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "tol", @@ -50145,7 +52358,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "g", @@ -50155,7 +52369,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "fun_args", @@ -50165,7 +52380,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "max_iter", @@ -50175,7 +52391,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "w_init", @@ -50185,13 +52402,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Parallel FastICA.\n\nUsed internally by FastICA --main loop", - "docstring": "Parallel FastICA.\n\nUsed internally by FastICA --main loop", + "docstring": "Parallel FastICA.\n\n Used internally by FastICA --main loop\n\n ", "source_code": "\ndef _ica_par(X, tol, g, fun_args, max_iter, w_init):\n \"\"\"Parallel FastICA.\n\n Used internally by FastICA --main loop\n\n \"\"\"\n W = _sym_decorrelation(w_init)\n del w_init\n p_ = float(X.shape[1])\n for ii in range(max_iter):\n (gwtx, g_wtx) = g(np.dot(W, X), fun_args)\n W1 = _sym_decorrelation(np.dot(gwtx, X.T) / p_ - g_wtx[:, np.newaxis] * W)\n del gwtx, g_wtx\n lim = max(abs(abs(np.diag(np.dot(W1, W.T))) - 1))\n W = W1\n if lim < tol:\n break\n else:\n warnings.warn('FastICA did not converge. 
Consider increasing tolerance or the maximum number of iterations.', ConvergenceWarning)\n return W, ii + 1" }, { @@ -50209,7 +52427,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "fun_args", @@ -50219,13 +52438,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _logcosh(x, fun_args=None):\n alpha = fun_args.get('alpha', 1.0)\n x *= alpha\n gx = np.tanh(x, x)\n g_x = np.empty(x.shape[0])\n for (i, gx_i) in enumerate(gx):\n g_x[i] = (alpha * (1 - gx_i**2)).mean()\n return gx, g_x" }, { @@ -50243,13 +52463,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Symmetric decorrelation i.e. W <- (W * W.T) ^{-1/2} * W", - "docstring": "Symmetric decorrelation\ni.e. W <- (W * W.T) ^{-1/2} * W", + "description": "Symmetric decorrelation\ni.e. W <- (W * W.T) ^{-1/2} * W", + "docstring": "Symmetric decorrelation\n i.e. W <- (W * W.T) ^{-1/2} * W\n ", "source_code": "\ndef _sym_decorrelation(W):\n \"\"\"Symmetric decorrelation\n i.e. W <- (W * W.T) ^{-1/2} * W\n \"\"\"\n (s, u) = linalg.eigh(np.dot(W, W.T))\n return np.linalg.multi_dot([u * (1.0 / np.sqrt(s)), u.T, W])" }, { @@ -50267,7 +52488,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "Training vector, where `n_samples` is the number of samples and\n`n_features` is the number of features." - } + }, + "refined_type": {} }, { "name": "n_components", @@ -50277,7 +52499,8 @@ "docstring": { "type": "int, default=None", "description": "Number of components to extract. If None no dimension reduction\nis performed." - } + }, + "refined_type": {} }, { "name": "algorithm", @@ -50287,6 +52510,10 @@ "docstring": { "type": "{'parallel', 'deflation'}, default='parallel'", "description": "Apply a parallel or deflational FASTICA algorithm." + }, + "refined_type": { + "kind": "EnumType", + "values": ["deflation", "parallel"] } }, { @@ -50297,7 +52524,8 @@ "docstring": { "type": "bool, default=True", "description": "If True perform an initial whitening of the data.\nIf False, the data is assumed to have already been\npreprocessed: it should be centered, normed and white.\nOtherwise you will get incorrect results.\nIn this case the parameter n_components will be ignored." - } + }, + "refined_type": {} }, { "name": "fun", @@ -50307,6 +52535,10 @@ "docstring": { "type": "{'logcosh', 'exp', 'cube'} or callable, default='logcosh'", "description": "The functional form of the G function used in the\napproximation to neg-entropy. Could be either 'logcosh', 'exp',\nor 'cube'.\nYou can also provide your own function. It should return a tuple\ncontaining the value of the function, and of its derivative, in the\npoint. The derivative should be averaged along its last dimension.\nExample:\n\ndef my_g(x):\n return x ** 3, np.mean(3 * x ** 2, axis=-1)" + }, + "refined_type": { + "kind": "EnumType", + "values": ["exp", "cube", "logcosh"] } }, { @@ -50317,6 +52549,10 @@ "docstring": { "type": "dict, default=None", "description": "Arguments to send to the functional form.\nIf empty or None and if fun='logcosh', fun_args will take value\n{'alpha' : 1.0}" + }, + "refined_type": { + "kind": "EnumType", + "values": ["alpha"] } }, { @@ -50327,7 +52563,8 @@ "docstring": { "type": "int, default=200", "description": "Maximum number of iterations to perform." 
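The `_sym_decorrelation` helper documented just above records the update `W <- (W * W.T)^{-1/2} * W`. The formula can be checked directly with NumPy/SciPy; the sketch below is illustrative only (the standalone function name and the random test matrix are not part of the library), it simply mirrors the decorrelation step described in that entry.

import numpy as np
from scipy import linalg

def sym_decorrelation(W):
    # Symmetric decorrelation: W <- (W @ W.T)^(-1/2) @ W,
    # mirroring the formula recorded for the private helper above.
    s, u = linalg.eigh(W @ W.T)
    return np.linalg.multi_dot([u * (1.0 / np.sqrt(s)), u.T, W])

rng = np.random.RandomState(0)
W = rng.normal(size=(3, 3))
W_dec = sym_decorrelation(W)
# After decorrelation the rows are orthonormal, so W_dec @ W_dec.T ~ identity.
print(np.allclose(W_dec @ W_dec.T, np.eye(3)))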
- } + }, + "refined_type": {} }, { "name": "tol", @@ -50337,7 +52574,8 @@ "docstring": { "type": "float, default=1e-04", "description": "A positive scalar giving the tolerance at which the\nun-mixing matrix is considered to have converged." - } + }, + "refined_type": {} }, { "name": "w_init", @@ -50347,7 +52585,8 @@ "docstring": { "type": "ndarray of shape (n_components, n_components), default=None", "description": "Initial un-mixing array of dimension (n.comp,n.comp).\nIf None (default) then an array of normal r.v.'s is used." - } + }, + "refined_type": {} }, { "name": "random_state", @@ -50357,7 +52596,8 @@ "docstring": { "type": "int, RandomState instance or None, default=None", "description": "Used to initialize ``w_init`` when not specified, with a\nnormal distribution. Pass an int, for reproducible results\nacross multiple function calls.\nSee :term:`Glossary `." - } + }, + "refined_type": {} }, { "name": "return_X_mean", @@ -50367,7 +52607,8 @@ "docstring": { "type": "bool, default=False", "description": "If True, X_mean is returned too." - } + }, + "refined_type": {} }, { "name": "compute_sources", @@ -50377,7 +52618,8 @@ "docstring": { "type": "bool, default=True", "description": "If False, sources are not computed, but only the rotation matrix.\nThis can save memory when working with big data. Defaults to True." - } + }, + "refined_type": {} }, { "name": "return_n_iter", @@ -50387,13 +52629,14 @@ "docstring": { "type": "bool, default=False", "description": "Whether or not to return the number of iterations." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Perform Fast Independent Component Analysis.\n\nThe implementation is based on [1]_. Read more in the :ref:`User Guide `.", - "docstring": "Perform Fast Independent Component Analysis.\n\nThe implementation is based on [1]_.\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n Training vector, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\nn_components : int, default=None\n Number of components to extract. If None no dimension reduction\n is performed.\n\nalgorithm : {'parallel', 'deflation'}, default='parallel'\n Apply a parallel or deflational FASTICA algorithm.\n\nwhiten : bool, default=True\n If True perform an initial whitening of the data.\n If False, the data is assumed to have already been\n preprocessed: it should be centered, normed and white.\n Otherwise you will get incorrect results.\n In this case the parameter n_components will be ignored.\n\nfun : {'logcosh', 'exp', 'cube'} or callable, default='logcosh'\n The functional form of the G function used in the\n approximation to neg-entropy. Could be either 'logcosh', 'exp',\n or 'cube'.\n You can also provide your own function. It should return a tuple\n containing the value of the function, and of its derivative, in the\n point. 
The derivative should be averaged along its last dimension.\n Example:\n\n def my_g(x):\n return x ** 3, np.mean(3 * x ** 2, axis=-1)\n\nfun_args : dict, default=None\n Arguments to send to the functional form.\n If empty or None and if fun='logcosh', fun_args will take value\n {'alpha' : 1.0}\n\nmax_iter : int, default=200\n Maximum number of iterations to perform.\n\ntol : float, default=1e-04\n A positive scalar giving the tolerance at which the\n un-mixing matrix is considered to have converged.\n\nw_init : ndarray of shape (n_components, n_components), default=None\n Initial un-mixing array of dimension (n.comp,n.comp).\n If None (default) then an array of normal r.v.'s is used.\n\nrandom_state : int, RandomState instance or None, default=None\n Used to initialize ``w_init`` when not specified, with a\n normal distribution. Pass an int, for reproducible results\n across multiple function calls.\n See :term:`Glossary `.\n\nreturn_X_mean : bool, default=False\n If True, X_mean is returned too.\n\ncompute_sources : bool, default=True\n If False, sources are not computed, but only the rotation matrix.\n This can save memory when working with big data. Defaults to True.\n\nreturn_n_iter : bool, default=False\n Whether or not to return the number of iterations.\n\nReturns\n-------\nK : ndarray of shape (n_components, n_features) or None\n If whiten is 'True', K is the pre-whitening matrix that projects data\n onto the first n_components principal components. If whiten is 'False',\n K is 'None'.\n\nW : ndarray of shape (n_components, n_components)\n The square matrix that unmixes the data after whitening.\n The mixing matrix is the pseudo-inverse of matrix ``W K``\n if K is not None, else it is the inverse of W.\n\nS : ndarray of shape (n_samples, n_components) or None\n Estimated source matrix\n\nX_mean : ndarray of shape (n_features,)\n The mean over features. Returned only if return_X_mean is True.\n\nn_iter : int\n If the algorithm is \"deflation\", n_iter is the\n maximum number of iterations run across all components. Else\n they are just the number of iterations taken to converge. This is\n returned only when return_n_iter is set to `True`.\n\nNotes\n-----\nThe data matrix X is considered to be a linear combination of\nnon-Gaussian (independent) components i.e. X = AS where columns of S\ncontain the independent components and A is a linear mixing\nmatrix. In short ICA attempts to `un-mix' the data by estimating an\nun-mixing matrix W where ``S = W K X.``\nWhile FastICA was proposed to estimate as many sources\nas features, it is possible to estimate less by setting\nn_components < n_features. It this case K is not a square matrix\nand the estimated A is the pseudo-inverse of ``W K``.\n\nThis implementation was originally made for data of shape\n[n_features, n_samples]. Now the input is transposed\nbefore the algorithm is applied. This makes it slightly\nfaster for Fortran-ordered input.\n\nReferences\n----------\n.. [1] A. Hyvarinen and E. Oja, \"Fast Independent Component Analysis\",\n Algorithms and Applications, Neural Networks, 13(4-5), 2000,\n pp. 
411-430.", + "description": "Perform Fast Independent Component Analysis.\n\nThe implementation is based on [1]_.\n\nRead more in the :ref:`User Guide `.", + "docstring": "Perform Fast Independent Component Analysis.\n\n The implementation is based on [1]_.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training vector, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\n n_components : int, default=None\n Number of components to extract. If None no dimension reduction\n is performed.\n\n algorithm : {'parallel', 'deflation'}, default='parallel'\n Apply a parallel or deflational FASTICA algorithm.\n\n whiten : bool, default=True\n If True perform an initial whitening of the data.\n If False, the data is assumed to have already been\n preprocessed: it should be centered, normed and white.\n Otherwise you will get incorrect results.\n In this case the parameter n_components will be ignored.\n\n fun : {'logcosh', 'exp', 'cube'} or callable, default='logcosh'\n The functional form of the G function used in the\n approximation to neg-entropy. Could be either 'logcosh', 'exp',\n or 'cube'.\n You can also provide your own function. It should return a tuple\n containing the value of the function, and of its derivative, in the\n point. The derivative should be averaged along its last dimension.\n Example:\n\n def my_g(x):\n return x ** 3, np.mean(3 * x ** 2, axis=-1)\n\n fun_args : dict, default=None\n Arguments to send to the functional form.\n If empty or None and if fun='logcosh', fun_args will take value\n {'alpha' : 1.0}\n\n max_iter : int, default=200\n Maximum number of iterations to perform.\n\n tol : float, default=1e-04\n A positive scalar giving the tolerance at which the\n un-mixing matrix is considered to have converged.\n\n w_init : ndarray of shape (n_components, n_components), default=None\n Initial un-mixing array of dimension (n.comp,n.comp).\n If None (default) then an array of normal r.v.'s is used.\n\n random_state : int, RandomState instance or None, default=None\n Used to initialize ``w_init`` when not specified, with a\n normal distribution. Pass an int, for reproducible results\n across multiple function calls.\n See :term:`Glossary `.\n\n return_X_mean : bool, default=False\n If True, X_mean is returned too.\n\n compute_sources : bool, default=True\n If False, sources are not computed, but only the rotation matrix.\n This can save memory when working with big data. Defaults to True.\n\n return_n_iter : bool, default=False\n Whether or not to return the number of iterations.\n\n Returns\n -------\n K : ndarray of shape (n_components, n_features) or None\n If whiten is 'True', K is the pre-whitening matrix that projects data\n onto the first n_components principal components. If whiten is 'False',\n K is 'None'.\n\n W : ndarray of shape (n_components, n_components)\n The square matrix that unmixes the data after whitening.\n The mixing matrix is the pseudo-inverse of matrix ``W K``\n if K is not None, else it is the inverse of W.\n\n S : ndarray of shape (n_samples, n_components) or None\n Estimated source matrix\n\n X_mean : ndarray of shape (n_features,)\n The mean over features. Returned only if return_X_mean is True.\n\n n_iter : int\n If the algorithm is \"deflation\", n_iter is the\n maximum number of iterations run across all components. Else\n they are just the number of iterations taken to converge. 
This is\n returned only when return_n_iter is set to `True`.\n\n Notes\n -----\n The data matrix X is considered to be a linear combination of\n non-Gaussian (independent) components i.e. X = AS where columns of S\n contain the independent components and A is a linear mixing\n matrix. In short ICA attempts to `un-mix' the data by estimating an\n un-mixing matrix W where ``S = W K X.``\n While FastICA was proposed to estimate as many sources\n as features, it is possible to estimate less by setting\n n_components < n_features. It this case K is not a square matrix\n and the estimated A is the pseudo-inverse of ``W K``.\n\n This implementation was originally made for data of shape\n [n_features, n_samples]. Now the input is transposed\n before the algorithm is applied. This makes it slightly\n faster for Fortran-ordered input.\n\n References\n ----------\n .. [1] A. Hyvarinen and E. Oja, \"Fast Independent Component Analysis\",\n Algorithms and Applications, Neural Networks, 13(4-5), 2000,\n pp. 411-430.\n ", "source_code": "\ndef fastica(X, n_components=None, *, algorithm='parallel', whiten=True, fun='logcosh', fun_args=None, max_iter=200, tol=0.0001, w_init=None, random_state=None, return_X_mean=False, compute_sources=True, return_n_iter=False):\n \"\"\"Perform Fast Independent Component Analysis.\n\n The implementation is based on [1]_.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training vector, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\n n_components : int, default=None\n Number of components to extract. If None no dimension reduction\n is performed.\n\n algorithm : {'parallel', 'deflation'}, default='parallel'\n Apply a parallel or deflational FASTICA algorithm.\n\n whiten : bool, default=True\n If True perform an initial whitening of the data.\n If False, the data is assumed to have already been\n preprocessed: it should be centered, normed and white.\n Otherwise you will get incorrect results.\n In this case the parameter n_components will be ignored.\n\n fun : {'logcosh', 'exp', 'cube'} or callable, default='logcosh'\n The functional form of the G function used in the\n approximation to neg-entropy. Could be either 'logcosh', 'exp',\n or 'cube'.\n You can also provide your own function. It should return a tuple\n containing the value of the function, and of its derivative, in the\n point. The derivative should be averaged along its last dimension.\n Example:\n\n def my_g(x):\n return x ** 3, np.mean(3 * x ** 2, axis=-1)\n\n fun_args : dict, default=None\n Arguments to send to the functional form.\n If empty or None and if fun='logcosh', fun_args will take value\n {'alpha' : 1.0}\n\n max_iter : int, default=200\n Maximum number of iterations to perform.\n\n tol : float, default=1e-04\n A positive scalar giving the tolerance at which the\n un-mixing matrix is considered to have converged.\n\n w_init : ndarray of shape (n_components, n_components), default=None\n Initial un-mixing array of dimension (n.comp,n.comp).\n If None (default) then an array of normal r.v.'s is used.\n\n random_state : int, RandomState instance or None, default=None\n Used to initialize ``w_init`` when not specified, with a\n normal distribution. 
Pass an int, for reproducible results\n across multiple function calls.\n See :term:`Glossary `.\n\n return_X_mean : bool, default=False\n If True, X_mean is returned too.\n\n compute_sources : bool, default=True\n If False, sources are not computed, but only the rotation matrix.\n This can save memory when working with big data. Defaults to True.\n\n return_n_iter : bool, default=False\n Whether or not to return the number of iterations.\n\n Returns\n -------\n K : ndarray of shape (n_components, n_features) or None\n If whiten is 'True', K is the pre-whitening matrix that projects data\n onto the first n_components principal components. If whiten is 'False',\n K is 'None'.\n\n W : ndarray of shape (n_components, n_components)\n The square matrix that unmixes the data after whitening.\n The mixing matrix is the pseudo-inverse of matrix ``W K``\n if K is not None, else it is the inverse of W.\n\n S : ndarray of shape (n_samples, n_components) or None\n Estimated source matrix\n\n X_mean : ndarray of shape (n_features,)\n The mean over features. Returned only if return_X_mean is True.\n\n n_iter : int\n If the algorithm is \"deflation\", n_iter is the\n maximum number of iterations run across all components. Else\n they are just the number of iterations taken to converge. This is\n returned only when return_n_iter is set to `True`.\n\n Notes\n -----\n The data matrix X is considered to be a linear combination of\n non-Gaussian (independent) components i.e. X = AS where columns of S\n contain the independent components and A is a linear mixing\n matrix. In short ICA attempts to `un-mix' the data by estimating an\n un-mixing matrix W where ``S = W K X.``\n While FastICA was proposed to estimate as many sources\n as features, it is possible to estimate less by setting\n n_components < n_features. It this case K is not a square matrix\n and the estimated A is the pseudo-inverse of ``W K``.\n\n This implementation was originally made for data of shape\n [n_features, n_samples]. Now the input is transposed\n before the algorithm is applied. This makes it slightly\n faster for Fortran-ordered input.\n\n References\n ----------\n .. [1] A. Hyvarinen and E. Oja, \"Fast Independent Component Analysis\",\n Algorithms and Applications, Neural Networks, 13(4-5), 2000,\n pp. 411-430.\n \"\"\"\n est = FastICA(n_components=n_components, algorithm=algorithm, whiten=whiten, fun=fun, fun_args=fun_args, max_iter=max_iter, tol=tol, w_init=w_init, random_state=random_state)\n sources = est._fit(X, compute_sources=compute_sources)\n if whiten:\n if return_X_mean:\n if return_n_iter:\n return est.whitening_, est._unmixing, sources, est.mean_, est.n_iter_\n else:\n return est.whitening_, est._unmixing, sources, est.mean_\n elif return_n_iter:\n return est.whitening_, est._unmixing, sources, est.n_iter_\n else:\n return est.whitening_, est._unmixing, sources\n elif return_X_mean:\n if return_n_iter:\n return None, est._unmixing, sources, None, est.n_iter_\n else:\n return None, est._unmixing, sources, None\n elif return_n_iter:\n return None, est._unmixing, sources, est.n_iter_\n else:\n return None, est._unmixing, sources" }, { @@ -50411,7 +52654,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_components", @@ -50421,7 +52665,8 @@ "docstring": { "type": "int, default=None", "description": "Number of components to keep. If ``n_components`` is ``None``,\nthen ``n_components`` is set to ``min(n_samples, n_features)``." 
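The `fastica` function whose docstring and source appear above returns the pre-whitening matrix `K`, the unmixing matrix `W` and the estimated sources `S` when `whiten=True`. A minimal usage sketch on synthetic mixed signals (the mixing matrix, signal shapes and seed are made up for illustration):

import numpy as np
from sklearn.decomposition import fastica

rng = np.random.RandomState(42)
n_samples = 2000
time = np.linspace(0, 8, n_samples)
# Two independent, non-Gaussian sources mixed by a 2x2 matrix.
S_true = np.c_[np.sin(2 * time), np.sign(np.sin(3 * time))]
A = np.array([[1.0, 0.5], [0.5, 2.0]])
X = S_true @ A.T                      # observations, shape (n_samples, 2)

K, W, S = fastica(X, n_components=2, random_state=0)
# S holds the estimated sources; W K approximately inverts the mixing,
# up to permutation and scaling, which ICA cannot resolve.
print(K.shape, W.shape, S.shape)      # (2, 2) (2, 2) (2000, 2)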
- } + }, + "refined_type": {} }, { "name": "whiten", @@ -50431,7 +52676,8 @@ "docstring": { "type": "bool, default=False", "description": "When True (False by default) the ``components_`` vectors are divided\nby ``n_samples`` times ``components_`` to ensure uncorrelated outputs\nwith unit component-wise variances.\n\nWhitening will remove some information from the transformed signal\n(the relative variance scales of the components) but can sometimes\nimprove the predictive accuracy of the downstream estimators by\nmaking data respect some hard-wired assumptions." - } + }, + "refined_type": {} }, { "name": "copy", @@ -50441,7 +52687,8 @@ "docstring": { "type": "bool, default=True", "description": "If False, X will be overwritten. ``copy=False`` can be used to\nsave memory but is unsafe for general use." - } + }, + "refined_type": {} }, { "name": "batch_size", @@ -50451,13 +52698,14 @@ "docstring": { "type": "int, default=None", "description": "The number of samples to use for each batch. Only used when calling\n``fit``. If ``batch_size`` is ``None``, then ``batch_size``\nis inferred from the data and set to ``5 * n_features``, to provide a\nbalance between approximation accuracy and memory consumption." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, n_components=None, *, whiten=False, copy=True, batch_size=None):\n self.n_components = n_components\n self.whiten = whiten\n self.copy = copy\n self.batch_size = batch_size" }, { @@ -50475,7 +52723,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -50485,6 +52734,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "Training data, where `n_samples` is the number of samples and\n`n_features` is the number of features." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -50495,13 +52748,14 @@ "docstring": { "type": "Ignored", "description": "Not used, present for API consistency by convention." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Fit the model with X, using minibatches of size batch_size.", - "docstring": "Fit the model with X, using minibatches of size batch_size.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training data, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\ny : Ignored\n Not used, present for API consistency by convention.\n\nReturns\n-------\nself : object\n Returns the instance itself.", + "docstring": "Fit the model with X, using minibatches of size batch_size.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training data, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n Returns\n -------\n self : object\n Returns the instance itself.\n ", "source_code": "\ndef fit(self, X, y=None):\n \"\"\"Fit the model with X, using minibatches of size batch_size.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training data, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n Returns\n -------\n self : object\n Returns the instance itself.\n \"\"\"\n self.components_ = None\n self.n_samples_seen_ = 0\n self.mean_ = 0.0\n self.var_ = 0.0\n self.singular_values_ = None\n self.explained_variance_ = None\n self.explained_variance_ratio_ = None\n self.noise_variance_ = None\n X = self._validate_data(X, accept_sparse=['csr', 'csc', 'lil'], copy=self.copy, dtype=[np.float64, np.float32])\n (n_samples, n_features) = X.shape\n if self.batch_size is None:\n self.batch_size_ = 5 * n_features\n else:\n self.batch_size_ = self.batch_size\n for batch in gen_batches(n_samples, self.batch_size_, min_batch_size=self.n_components or 0):\n X_batch = X[batch]\n if sparse.issparse(X_batch):\n X_batch = X_batch.toarray()\n self.partial_fit(X_batch, check_input=False)\n return self" }, { @@ -50519,7 +52773,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -50529,7 +52784,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "Training data, where `n_samples` is the number of samples and\n`n_features` is the number of features." - } + }, + "refined_type": {} }, { "name": "y", @@ -50539,7 +52795,8 @@ "docstring": { "type": "Ignored", "description": "Not used, present for API consistency by convention." - } + }, + "refined_type": {} }, { "name": "check_input", @@ -50549,13 +52806,14 @@ "docstring": { "type": "bool, default=True", "description": "Run check_array on X." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Incremental fit with X. All of X is processed as a single batch.", - "docstring": "Incremental fit with X. All of X is processed as a single batch.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n Training data, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\ny : Ignored\n Not used, present for API consistency by convention.\n\ncheck_input : bool, default=True\n Run check_array on X.\n\nReturns\n-------\nself : object\n Returns the instance itself.", + "docstring": "Incremental fit with X. 
All of X is processed as a single batch.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training data, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n check_input : bool, default=True\n Run check_array on X.\n\n Returns\n -------\n self : object\n Returns the instance itself.\n ", "source_code": "\ndef partial_fit(self, X, y=None, check_input=True):\n \"\"\"Incremental fit with X. All of X is processed as a single batch.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training data, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n check_input : bool, default=True\n Run check_array on X.\n\n Returns\n -------\n self : object\n Returns the instance itself.\n \"\"\"\n first_pass = not hasattr(self, 'components_')\n if check_input:\n if sparse.issparse(X):\n raise TypeError('IncrementalPCA.partial_fit does not support sparse input. Either convert data to dense or use IncrementalPCA.fit to do so in batches.')\n X = self._validate_data(X, copy=self.copy, dtype=[np.float64, np.float32], reset=first_pass)\n (n_samples, n_features) = X.shape\n if first_pass:\n self.components_ = None\n if self.n_components is None:\n if self.components_ is None:\n self.n_components_ = min(n_samples, n_features)\n else:\n self.n_components_ = self.components_.shape[0]\n elif not 1 <= self.n_components <= n_features:\n raise ValueError('n_components=%r invalid for n_features=%d, need more rows than columns for IncrementalPCA processing' % (self.n_components, n_features))\n elif not self.n_components <= n_samples:\n raise ValueError('n_components=%r must be less or equal to the batch number of samples %d.' % (self.n_components, n_samples))\n else:\n self.n_components_ = self.n_components\n if self.components_ is not None and self.components_.shape[0] != self.n_components_:\n raise ValueError('Number of input features has changed from %i to %i between calls to partial_fit! Try setting n_components to a fixed value.' 
% (self.components_.shape[0], self.n_components_))\n if not hasattr(self, 'n_samples_seen_'):\n self.n_samples_seen_ = 0\n self.mean_ = 0.0\n self.var_ = 0.0\n (col_mean, col_var, n_total_samples) = _incremental_mean_and_var(X, last_mean=self.mean_, last_variance=self.var_, last_sample_count=np.repeat(self.n_samples_seen_, X.shape[1]))\n n_total_samples = n_total_samples[0]\n if self.n_samples_seen_ == 0:\n X -= col_mean\n else:\n col_batch_mean = np.mean(X, axis=0)\n X -= col_batch_mean\n mean_correction = np.sqrt(self.n_samples_seen_ / n_total_samples * n_samples) * (self.mean_ - col_batch_mean)\n X = np.vstack((self.singular_values_.reshape((-1, 1)) * self.components_, X, mean_correction))\n (U, S, Vt) = linalg.svd(X, full_matrices=False, check_finite=False)\n (U, Vt) = svd_flip(U, Vt, u_based_decision=False)\n explained_variance = S**2 / (n_total_samples - 1)\n explained_variance_ratio = S**2 / np.sum(col_var * n_total_samples)\n self.n_samples_seen_ = n_total_samples\n self.components_ = Vt[:self.n_components_]\n self.singular_values_ = S[:self.n_components_]\n self.mean_ = col_mean\n self.var_ = col_var\n self.explained_variance_ = explained_variance[:self.n_components_]\n self.explained_variance_ratio_ = explained_variance_ratio[:self.n_components_]\n if self.n_components_ < n_features:\n self.noise_variance_ = explained_variance[self.n_components_:].mean()\n else:\n self.noise_variance_ = 0.0\n return self" }, { @@ -50573,7 +52831,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -50583,13 +52842,17 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "New data, where `n_samples` is the number of samples\nand `n_features` is the number of features." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } } ], "results": [], "is_public": true, - "description": "Apply dimensionality reduction to X.\n\nX is projected on the first principal components previously extracted from a training set, using minibatches of size batch_size if X is sparse.", - "docstring": "Apply dimensionality reduction to X.\n\nX is projected on the first principal components previously extracted\nfrom a training set, using minibatches of size batch_size if X is\nsparse.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n New data, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\nReturns\n-------\nX_new : ndarray of shape (n_samples, n_components)\n Projection of X in the first principal components.\n\nExamples\n--------\n\n>>> import numpy as np\n>>> from sklearn.decomposition import IncrementalPCA\n>>> X = np.array([[-1, -1], [-2, -1], [-3, -2],\n... 
[1, 1], [2, 1], [3, 2]])\n>>> ipca = IncrementalPCA(n_components=2, batch_size=3)\n>>> ipca.fit(X)\nIncrementalPCA(batch_size=3, n_components=2)\n>>> ipca.transform(X) # doctest: +SKIP", + "description": "Apply dimensionality reduction to X.\n\nX is projected on the first principal components previously extracted\nfrom a training set, using minibatches of size batch_size if X is\nsparse.", + "docstring": "Apply dimensionality reduction to X.\n\n X is projected on the first principal components previously extracted\n from a training set, using minibatches of size batch_size if X is\n sparse.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n New data, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n Returns\n -------\n X_new : ndarray of shape (n_samples, n_components)\n Projection of X in the first principal components.\n\n Examples\n --------\n\n >>> import numpy as np\n >>> from sklearn.decomposition import IncrementalPCA\n >>> X = np.array([[-1, -1], [-2, -1], [-3, -2],\n ... [1, 1], [2, 1], [3, 2]])\n >>> ipca = IncrementalPCA(n_components=2, batch_size=3)\n >>> ipca.fit(X)\n IncrementalPCA(batch_size=3, n_components=2)\n >>> ipca.transform(X) # doctest: +SKIP\n ", "source_code": "\ndef transform(self, X):\n \"\"\"Apply dimensionality reduction to X.\n\n X is projected on the first principal components previously extracted\n from a training set, using minibatches of size batch_size if X is\n sparse.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n New data, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n Returns\n -------\n X_new : ndarray of shape (n_samples, n_components)\n Projection of X in the first principal components.\n\n Examples\n --------\n\n >>> import numpy as np\n >>> from sklearn.decomposition import IncrementalPCA\n >>> X = np.array([[-1, -1], [-2, -1], [-3, -2],\n ... [1, 1], [2, 1], [3, 2]])\n >>> ipca = IncrementalPCA(n_components=2, batch_size=3)\n >>> ipca.fit(X)\n IncrementalPCA(batch_size=3, n_components=2)\n >>> ipca.transform(X) # doctest: +SKIP\n \"\"\"\n if sparse.issparse(X):\n n_samples = X.shape[0]\n output = []\n for batch in gen_batches(n_samples, self.batch_size_, min_batch_size=self.n_components or 0):\n output.append(super().transform(X[batch].toarray()))\n return np.vstack(output)\n else:\n return super().transform(X)" }, { @@ -50607,7 +52870,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_components", @@ -50617,7 +52881,8 @@ "docstring": { "type": "int, default=None", "description": "Number of components. If None, all non-zero components are kept." - } + }, + "refined_type": {} }, { "name": "kernel", @@ -50627,6 +52892,17 @@ "docstring": { "type": "{'linear', 'poly', 'rbf', 'sigmoid', 'cosine', 'precomputed'}, default='linear'", "description": "Kernel used for PCA." + }, + "refined_type": { + "kind": "EnumType", + "values": [ + "poly", + "cosine", + "rbf", + "sigmoid", + "linear", + "precomputed" + ] } }, { @@ -50637,7 +52913,8 @@ "docstring": { "type": "float, default=None", "description": "Kernel coefficient for rbf, poly and sigmoid kernels. Ignored by other\nkernels. If ``gamma`` is ``None``, then it is set to ``1/n_features``." - } + }, + "refined_type": {} }, { "name": "degree", @@ -50647,7 +52924,8 @@ "docstring": { "type": "int, default=3", "description": "Degree for poly kernels. Ignored by other kernels." 
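The `IncrementalPCA` entries above (`fit`, `partial_fit`, `transform`) describe out-of-core PCA. A small sketch of the two equivalent ways of feeding data, under arbitrary array sizes and batch sizes chosen only for illustration:

import numpy as np
from sklearn.decomposition import IncrementalPCA

rng = np.random.RandomState(0)
X = rng.normal(size=(600, 10))

# Variant 1: let fit() split X into minibatches of `batch_size` rows itself.
ipca = IncrementalPCA(n_components=3, batch_size=100)
X_red = ipca.fit_transform(X)
print(X_red.shape)                      # (600, 3)

# Variant 2: stream the batches manually with partial_fit, e.g. when the
# full data set does not fit into memory at once.
ipca2 = IncrementalPCA(n_components=3)
for batch in np.array_split(X, 6):
    ipca2.partial_fit(batch)
print(ipca2.components_.shape)          # (3, 10)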
- } + }, + "refined_type": {} }, { "name": "coef0", @@ -50657,7 +52935,8 @@ "docstring": { "type": "float, default=1", "description": "Independent term in poly and sigmoid kernels.\nIgnored by other kernels." - } + }, + "refined_type": {} }, { "name": "kernel_params", @@ -50667,7 +52946,8 @@ "docstring": { "type": "dict, default=None", "description": "Parameters (keyword arguments) and\nvalues for kernel passed as callable object.\nIgnored by other kernels." - } + }, + "refined_type": {} }, { "name": "alpha", @@ -50677,7 +52957,8 @@ "docstring": { "type": "float, default=1.0", "description": "Hyperparameter of the ridge regression that learns the\ninverse transform (when fit_inverse_transform=True)." - } + }, + "refined_type": {} }, { "name": "fit_inverse_transform", @@ -50686,8 +52967,9 @@ "assigned_by": "NAME_ONLY", "docstring": { "type": "bool, default=False", - "description": "Learn the inverse transform for non-precomputed kernels\n(i.e. learn to find the pre-image of a point)." - } + "description": "Learn the inverse transform for non-precomputed kernels\n(i.e. learn to find the pre-image of a point). This method is based\non [2]_." + }, + "refined_type": {} }, { "name": "eigen_solver", @@ -50695,8 +52977,12 @@ "is_public": true, "assigned_by": "NAME_ONLY", "docstring": { - "type": "{'auto', 'dense', 'arpack', 'randomized'}, default='auto'", - "description": "Select eigensolver to use. If `n_components` is much\nless than the number of training samples, randomized (or arpack to a\nsmaller extend) may be more efficient than the dense eigensolver.\nRandomized SVD is performed according to the method of Halko et al.\n\nauto :\n the solver is selected by a default policy based on n_samples\n (the number of training samples) and `n_components`:\n if the number of components to extract is less than 10 (strict) and\n the number of samples is more than 200 (strict), the 'arpack'\n method is enabled. Otherwise the exact full eigenvalue\n decomposition is computed and optionally truncated afterwards\n ('dense' method).\ndense :\n run exact full eigenvalue decomposition calling the standard\n LAPACK solver via `scipy.linalg.eigh`, and select the components\n by postprocessing\narpack :\n run SVD truncated to n_components calling ARPACK solver using\n `scipy.sparse.linalg.eigsh`. It requires strictly\n 0 < n_components < n_samples\nrandomized :\n run randomized SVD by the method of Halko et al. The current\n implementation selects eigenvalues based on their module; therefore\n using this method can lead to unexpected results if the kernel is\n not positive semi-definite.\n\n.. versionchanged:: 1.0\n `'randomized'` was added." + "type": "{'auto', 'dense', 'arpack', 'randomized'}, default='auto'", + "description": "Select eigensolver to use. If `n_components` is much\nless than the number of training samples, randomized (or arpack to a\nsmaller extend) may be more efficient than the dense eigensolver.\nRandomized SVD is performed according to the method of Halko et al\n[3]_.\n\nauto :\n the solver is selected by a default policy based on n_samples\n (the number of training samples) and `n_components`:\n if the number of components to extract is less than 10 (strict) and\n the number of samples is more than 200 (strict), the 'arpack'\n method is enabled. 
Otherwise the exact full eigenvalue\n decomposition is computed and optionally truncated afterwards\n ('dense' method).\ndense :\n run exact full eigenvalue decomposition calling the standard\n LAPACK solver via `scipy.linalg.eigh`, and select the components\n by postprocessing\narpack :\n run SVD truncated to n_components calling ARPACK solver using\n `scipy.sparse.linalg.eigsh`. It requires strictly\n 0 < n_components < n_samples\nrandomized :\n run randomized SVD by the method of Halko et al. [3]_. The current\n implementation selects eigenvalues based on their module; therefore\n using this method can lead to unexpected results if the kernel is\n not positive semi-definite. See also [4]_.\n\n.. versionchanged:: 1.0\n `'randomized'` was added." + }, + "refined_type": { + "kind": "EnumType", + "values": ["auto", "dense", "randomized", "arpack"] } }, { @@ -50707,7 +52993,8 @@ "docstring": { "type": "float, default=0", "description": "Convergence tolerance for arpack.\nIf 0, optimal value will be chosen by arpack." - } + }, + "refined_type": {} }, { "name": "max_iter", @@ -50717,7 +53004,8 @@ "docstring": { "type": "int, default=None", "description": "Maximum number of iterations for arpack.\nIf None, optimal value will be chosen by arpack." - } + }, + "refined_type": {} }, { "name": "iterated_power", @@ -50727,7 +53015,8 @@ "docstring": { "type": "int >= 0, or 'auto', default='auto'", "description": "Number of iterations for the power method computed by\nsvd_solver == 'randomized'. When 'auto', it is set to 7 when\n`n_components < 0.1 * min(X.shape)`, other it is set to 4.\n\n.. versionadded:: 1.0" - } + }, + "refined_type": {} }, { "name": "remove_zero_eig", @@ -50737,7 +53026,8 @@ "docstring": { "type": "bool, default=False", "description": "If True, then all components with zero eigenvalues are removed, so\nthat the number of components in the output may be < n_components\n(and sometimes even zero due to numerical instability).\nWhen n_components is None, this parameter is ignored and components\nwith zero eigenvalues are removed regardless." - } + }, + "refined_type": {} }, { "name": "random_state", @@ -50747,7 +53037,8 @@ "docstring": { "type": "int, RandomState instance or None, default=None", "description": "Used when ``eigen_solver`` == 'arpack' or 'randomized'. Pass an int\nfor reproducible results across multiple function calls.\nSee :term:`Glossary `.\n\n.. versionadded:: 0.18" - } + }, + "refined_type": {} }, { "name": "copy_X", @@ -50757,7 +53048,8 @@ "docstring": { "type": "bool, default=True", "description": "If True, input X is copied and stored by the model in the `X_fit_`\nattribute. If no further changes will be done to X, setting\n`copy_X=False` saves memory by storing a reference.\n\n.. versionadded:: 0.18" - } + }, + "refined_type": {} }, { "name": "n_jobs", @@ -50767,13 +53059,14 @@ "docstring": { "type": "int, default=None", "description": "The number of parallel jobs to run.\n``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n``-1`` means using all processors. See :term:`Glossary `\nfor more details.\n\n.. 
versionadded:: 0.18" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, n_components=None, *, kernel='linear', gamma=None, degree=3, coef0=1, kernel_params=None, alpha=1.0, fit_inverse_transform=False, eigen_solver='auto', tol=0, max_iter=None, iterated_power='auto', remove_zero_eig=False, random_state=None, copy_X=True, n_jobs=None):\n if fit_inverse_transform and kernel == 'precomputed':\n raise ValueError('Cannot fit_inverse_transform with a precomputed kernel.')\n self.n_components = n_components\n self.kernel = kernel\n self.kernel_params = kernel_params\n self.gamma = gamma\n self.degree = degree\n self.coef0 = coef0\n self.alpha = alpha\n self.fit_inverse_transform = fit_inverse_transform\n self.eigen_solver = eigen_solver\n self.tol = tol\n self.max_iter = max_iter\n self.iterated_power = iterated_power\n self.remove_zero_eig = remove_zero_eig\n self.random_state = random_state\n self.n_jobs = n_jobs\n self.copy_X = copy_X" }, { @@ -50791,7 +53084,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X_transformed", @@ -50801,7 +53095,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -50811,13 +53106,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _fit_inverse_transform(self, X_transformed, X):\n if hasattr(X, 'tocsr'):\n raise NotImplementedError('Inverse transform not implemented for sparse matrices!')\n n_samples = X_transformed.shape[0]\n K = self._get_kernel(X_transformed)\n K.flat[::n_samples + 1] += self.alpha\n self.dual_coef_ = linalg.solve(K, X, sym_pos=True, overwrite_a=True)\n self.X_transformed_fit_ = X_transformed" }, { @@ -50835,7 +53131,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "K", @@ -50845,7 +53142,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -50869,7 +53167,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -50879,7 +53178,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "Y", @@ -50889,13 +53189,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _get_kernel(self, X, Y=None):\n if callable(self.kernel):\n params = self.kernel_params or {}\n else:\n params = {'gamma': self.gamma, 'degree': self.degree, 'coef0': self.coef0}\n return pairwise_kernels(X, Y, metric=self.kernel, filter_params=True, n_jobs=self.n_jobs, **params)" }, { @@ -50913,13 +53214,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _more_tags(self):\n return {'preserves_dtype': [np.float64, np.float32], 'pairwise': self.kernel == 'precomputed'}" }, { @@ -50940,13 +53242,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@deprecated('Attribute `_pairwise` was deprecated in version 0.24 and will be removed 
in 1.1 (renaming of 0.26).')\n@property\ndef _pairwise(self):\n return self.kernel == 'precomputed'" }, { @@ -50967,13 +53270,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@deprecated('Attribute `alphas_` was deprecated in version 1.0 and will be removed in 1.2. Use `eigenvectors_` instead.')\n@property\ndef alphas_(self):\n return self.eigenvectors_" }, { @@ -50991,7 +53295,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -51001,6 +53306,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "Training vector, where `n_samples` is the number of samples\nand `n_features` is the number of features." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -51011,13 +53320,14 @@ "docstring": { "type": "Ignored", "description": "Not used, present for API consistency by convention." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Fit the model from data in X.", - "docstring": "Fit the model from data in X.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training vector, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\ny : Ignored\n Not used, present for API consistency by convention.\n\nReturns\n-------\nself : object\n Returns the instance itself.", + "docstring": "Fit the model from data in X.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training vector, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n Returns\n -------\n self : object\n Returns the instance itself.\n ", "source_code": "\ndef fit(self, X, y=None):\n \"\"\"Fit the model from data in X.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training vector, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n Returns\n -------\n self : object\n Returns the instance itself.\n \"\"\"\n X = self._validate_data(X, accept_sparse='csr', copy=self.copy_X)\n self._centerer = KernelCenterer()\n K = self._get_kernel(X)\n self._fit_transform(K)\n if self.fit_inverse_transform:\n X_transformed = self.eigenvectors_ * np.sqrt(self.eigenvalues_)\n self._fit_inverse_transform(X_transformed, X)\n self.X_fit_ = X\n return self" }, { @@ -51035,7 +53345,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -51045,6 +53356,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "Training vector, where `n_samples` is the number of samples\nand `n_features` is the number of features." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -51055,13 +53370,14 @@ "docstring": { "type": "Ignored", "description": "Not used, present for API consistency by convention." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Fit the model from data in X and transform X.", - "docstring": "Fit the model from data in X and transform X.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training vector, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\ny : Ignored\n Not used, present for API consistency by convention.\n\n**params : kwargs\n Parameters (keyword arguments) and values passed to\n the fit_transform instance.\n\nReturns\n-------\nX_new : ndarray of shape (n_samples, n_components)\n Returns the instance itself.", + "docstring": "Fit the model from data in X and transform X.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training vector, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n **params : kwargs\n Parameters (keyword arguments) and values passed to\n the fit_transform instance.\n\n Returns\n -------\n X_new : ndarray of shape (n_samples, n_components)\n Returns the instance itself.\n ", "source_code": "\ndef fit_transform(self, X, y=None, **params):\n \"\"\"Fit the model from data in X and transform X.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training vector, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n **params : kwargs\n Parameters (keyword arguments) and values passed to\n the fit_transform instance.\n\n Returns\n -------\n X_new : ndarray of shape (n_samples, n_components)\n Returns the instance itself.\n \"\"\"\n self.fit(X, **params)\n X_transformed = self.eigenvectors_ * np.sqrt(self.eigenvalues_)\n if self.fit_inverse_transform:\n self._fit_inverse_transform(X_transformed, X)\n return X_transformed" }, { @@ -51079,7 +53395,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -51089,14 +53406,18 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_components)", "description": "Training vector, where `n_samples` is the number of samples\nand `n_features` is the number of features." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } } ], "results": [], "is_public": true, - "description": "Transform X back to original space.\n\n``inverse_transform`` approximates the inverse transformation using a learned pre-image. The pre-image is learned by kernel ridge regression of the original data on their low-dimensional representation vectors. .. note: :meth:`~sklearn.decomposition.fit` internally uses a centered kernel. As the centered kernel no longer contains the information of the mean of kernel features, such information is not taken into account in reconstruction. .. note:: When users want to compute inverse transformation for 'linear' kernel, it is recommended that they use :class:`~sklearn.decomposition.PCA` instead. Unlike :class:`~sklearn.decomposition.PCA`, :class:`~sklearn.decomposition.KernelPCA`'s ``inverse_transform`` does not reconstruct the mean of data when 'linear' kernel is used due to the use of centered kernel.", - "docstring": "Transform X back to original space.\n\n``inverse_transform`` approximates the inverse transformation using\na learned pre-image. 
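The `KernelPCA.fit` / `fit_transform` entries above accept the kernel options listed in the constructor. A short sketch with an RBF kernel on a toy dataset (the dataset and the `gamma` value are illustrative, not prescribed by the source):

import numpy as np
from sklearn.datasets import make_circles
from sklearn.decomposition import KernelPCA

X, _ = make_circles(n_samples=400, factor=0.3, noise=0.05, random_state=0)

# Non-linear embedding: the concentric circles become separable along the
# leading kernel principal component.
kpca = KernelPCA(n_components=2, kernel='rbf', gamma=10, random_state=0)
X_kpca = kpca.fit_transform(X)
print(X_kpca.shape)                     # (400, 2)
print(kpca.eigenvalues_[:2])            # leading kernel eigenvalues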
The pre-image is learned by kernel ridge\nregression of the original data on their low-dimensional representation\nvectors.\n\n.. note:\n :meth:`~sklearn.decomposition.fit` internally uses a centered\n kernel. As the centered kernel no longer contains the information\n of the mean of kernel features, such information is not taken into\n account in reconstruction.\n\n.. note::\n When users want to compute inverse transformation for 'linear'\n kernel, it is recommended that they use\n :class:`~sklearn.decomposition.PCA` instead. Unlike\n :class:`~sklearn.decomposition.PCA`,\n :class:`~sklearn.decomposition.KernelPCA`'s ``inverse_transform``\n does not reconstruct the mean of data when 'linear' kernel is used\n due to the use of centered kernel.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_components)\n Training vector, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\nReturns\n-------\nX_new : ndarray of shape (n_samples, n_features)\n Returns the instance itself.\n\nReferences\n----------\n\"Learning to Find Pre-Images\", G BakIr et al, 2004.", - "source_code": "\ndef inverse_transform(self, X):\n \"\"\"Transform X back to original space.\n\n ``inverse_transform`` approximates the inverse transformation using\n a learned pre-image. The pre-image is learned by kernel ridge\n regression of the original data on their low-dimensional representation\n vectors.\n\n .. note:\n :meth:`~sklearn.decomposition.fit` internally uses a centered\n kernel. As the centered kernel no longer contains the information\n of the mean of kernel features, such information is not taken into\n account in reconstruction.\n\n .. note::\n When users want to compute inverse transformation for 'linear'\n kernel, it is recommended that they use\n :class:`~sklearn.decomposition.PCA` instead. Unlike\n :class:`~sklearn.decomposition.PCA`,\n :class:`~sklearn.decomposition.KernelPCA`'s ``inverse_transform``\n does not reconstruct the mean of data when 'linear' kernel is used\n due to the use of centered kernel.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_components)\n Training vector, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n Returns\n -------\n X_new : ndarray of shape (n_samples, n_features)\n Returns the instance itself.\n\n References\n ----------\n \"Learning to Find Pre-Images\", G BakIr et al, 2004.\n \"\"\"\n if not self.fit_inverse_transform:\n raise NotFittedError('The fit_inverse_transform parameter was not set to True when instantiating and hence the inverse transform is not available.')\n K = self._get_kernel(X, self.X_transformed_fit_)\n return np.dot(K, self.dual_coef_)" + "description": "Transform X back to original space.\n\n``inverse_transform`` approximates the inverse transformation using\na learned pre-image. The pre-image is learned by kernel ridge\nregression of the original data on their low-dimensional representation\nvectors.\n\n.. note:\n :meth:`~sklearn.decomposition.fit` internally uses a centered\n kernel. As the centered kernel no longer contains the information\n of the mean of kernel features, such information is not taken into\n account in reconstruction.\n\n.. note::\n When users want to compute inverse transformation for 'linear'\n kernel, it is recommended that they use\n :class:`~sklearn.decomposition.PCA` instead. 
Unlike\n :class:`~sklearn.decomposition.PCA`,\n :class:`~sklearn.decomposition.KernelPCA`'s ``inverse_transform``\n does not reconstruct the mean of data when 'linear' kernel is used\n due to the use of centered kernel.", + "docstring": "Transform X back to original space.\n\n ``inverse_transform`` approximates the inverse transformation using\n a learned pre-image. The pre-image is learned by kernel ridge\n regression of the original data on their low-dimensional representation\n vectors.\n\n .. note:\n :meth:`~sklearn.decomposition.fit` internally uses a centered\n kernel. As the centered kernel no longer contains the information\n of the mean of kernel features, such information is not taken into\n account in reconstruction.\n\n .. note::\n When users want to compute inverse transformation for 'linear'\n kernel, it is recommended that they use\n :class:`~sklearn.decomposition.PCA` instead. Unlike\n :class:`~sklearn.decomposition.PCA`,\n :class:`~sklearn.decomposition.KernelPCA`'s ``inverse_transform``\n does not reconstruct the mean of data when 'linear' kernel is used\n due to the use of centered kernel.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_components)\n Training vector, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n Returns\n -------\n X_new : ndarray of shape (n_samples, n_features)\n Returns the instance itself.\n\n References\n ----------\n `Bak\u0131r, G\u00f6khan H., Jason Weston, and Bernhard Sch\u00f6lkopf.\n \"Learning to find pre-images.\"\n Advances in neural information processing systems 16 (2004): 449-456.\n `_\n ", + "source_code": "\ndef inverse_transform(self, X):\n \"\"\"Transform X back to original space.\n\n ``inverse_transform`` approximates the inverse transformation using\n a learned pre-image. The pre-image is learned by kernel ridge\n regression of the original data on their low-dimensional representation\n vectors.\n\n .. note:\n :meth:`~sklearn.decomposition.fit` internally uses a centered\n kernel. As the centered kernel no longer contains the information\n of the mean of kernel features, such information is not taken into\n account in reconstruction.\n\n .. note::\n When users want to compute inverse transformation for 'linear'\n kernel, it is recommended that they use\n :class:`~sklearn.decomposition.PCA` instead. 
Unlike\n :class:`~sklearn.decomposition.PCA`,\n :class:`~sklearn.decomposition.KernelPCA`'s ``inverse_transform``\n does not reconstruct the mean of data when 'linear' kernel is used\n due to the use of centered kernel.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_components)\n Training vector, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n Returns\n -------\n X_new : ndarray of shape (n_samples, n_features)\n Returns the instance itself.\n\n References\n ----------\n `Bak\u0131r, G\u00f6khan H., Jason Weston, and Bernhard Sch\u00f6lkopf.\n \"Learning to find pre-images.\"\n Advances in neural information processing systems 16 (2004): 449-456.\n `_\n \"\"\"\n if not self.fit_inverse_transform:\n raise NotFittedError('The fit_inverse_transform parameter was not set to True when instantiating and hence the inverse transform is not available.')\n K = self._get_kernel(X, self.X_transformed_fit_)\n return np.dot(K, self.dual_coef_)" }, { "name": "lambdas_", @@ -51116,13 +53437,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@deprecated('Attribute `lambdas_` was deprecated in version 1.0 and will be removed in 1.2. Use `eigenvalues_` instead.')\n@property\ndef lambdas_(self):\n return self.eigenvalues_" }, { @@ -51140,7 +53462,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -51150,13 +53473,17 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "Training vector, where `n_samples` is the number of samples\nand `n_features` is the number of features." 
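As the `inverse_transform` entry above notes, the pre-image is only available when the estimator was built with `fit_inverse_transform=True`, and it is an approximation learned by kernel ridge regression. A brief sketch (the data and the reconstruction-error printout are illustrative; the numeric value itself is not meaningful):

import numpy as np
from sklearn.datasets import make_circles
from sklearn.decomposition import KernelPCA

X, _ = make_circles(n_samples=400, factor=0.3, noise=0.05, random_state=0)

kpca = KernelPCA(n_components=2, kernel='rbf', gamma=10,
                 fit_inverse_transform=True, alpha=0.1, random_state=0)
X_kpca = kpca.fit_transform(X)
X_back = kpca.inverse_transform(X_kpca)   # approximate pre-images in input space
print(X_back.shape)                       # (400, 2)
print(np.mean((X - X_back) ** 2))         # reconstruction error of the pre-image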
+ }, + "refined_type": { + "kind": "EnumType", + "values": [] } } ], "results": [], "is_public": true, "description": "Transform X.", - "docstring": "Transform X.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training vector, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\nReturns\n-------\nX_new : ndarray of shape (n_samples, n_components)\n Returns the instance itself.", + "docstring": "Transform X.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training vector, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n Returns\n -------\n X_new : ndarray of shape (n_samples, n_components)\n Returns the instance itself.\n ", "source_code": "\ndef transform(self, X):\n \"\"\"Transform X.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training vector, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n Returns\n -------\n X_new : ndarray of shape (n_samples, n_components)\n Returns the instance itself.\n \"\"\"\n check_is_fitted(self)\n X = self._validate_data(X, accept_sparse='csr', reset=False)\n K = self._centerer.transform(self._get_kernel(X, self.X_fit_))\n non_zeros = np.flatnonzero(self.eigenvalues_)\n scaled_alphas = np.zeros_like(self.eigenvectors_)\n scaled_alphas[:, non_zeros] = self.eigenvectors_[:, non_zeros] / np.sqrt(self.eigenvalues_[non_zeros])\n return np.dot(K, scaled_alphas)" }, { @@ -51174,7 +53501,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_components", @@ -51184,7 +53512,8 @@ "docstring": { "type": "int, default=10", "description": "Number of topics.\n\n.. versionchanged:: 0.19\n ``n_topics`` was renamed to ``n_components``" - } + }, + "refined_type": {} }, { "name": "doc_topic_prior", @@ -51194,7 +53523,8 @@ "docstring": { "type": "float, default=None", "description": "Prior of document topic distribution `theta`. If the value is None,\ndefaults to `1 / n_components`.\nIn [1]_, this is called `alpha`." - } + }, + "refined_type": {} }, { "name": "topic_word_prior", @@ -51204,7 +53534,8 @@ "docstring": { "type": "float, default=None", "description": "Prior of topic word distribution `beta`. If the value is None, defaults\nto `1 / n_components`.\nIn [1]_, this is called `eta`." - } + }, + "refined_type": {} }, { "name": "learning_method", @@ -51214,6 +53545,10 @@ "docstring": { "type": "{'batch', 'online'}, default='batch'", "description": "Method used to update `_component`. Only used in :meth:`fit` method.\nIn general, if the data size is large, the online update will be much\nfaster than the batch update.\n\nValid options::\n\n 'batch': Batch variational Bayes method. Use all training data in\n each EM update.\n Old `components_` will be overwritten in each iteration.\n 'online': Online variational Bayes method. In each EM update, use\n mini-batch of training data to update the ``components_``\n variable incrementally. The learning rate is controlled by the\n ``learning_decay`` and the ``learning_offset`` parameters.\n\n.. versionchanged:: 0.20\n The default learning method is now ``\"batch\"``." + }, + "refined_type": { + "kind": "EnumType", + "values": ["online", "batch"] } }, { @@ -51224,7 +53559,8 @@ "docstring": { "type": "float, default=0.7", "description": "It is a parameter that control learning rate in the online learning\nmethod. 
The value should be set between (0.5, 1.0] to guarantee\nasymptotic convergence. When the value is 0.0 and batch_size is\n``n_samples``, the update method is same as batch learning. In the\nliterature, this is called kappa." - } + }, + "refined_type": {} }, { "name": "learning_offset", @@ -51234,7 +53570,8 @@ "docstring": { "type": "float, default=10.0", "description": "A (positive) parameter that downweights early iterations in online\nlearning. It should be greater than 1.0. In the literature, this is\ncalled tau_0." - } + }, + "refined_type": {} }, { "name": "max_iter", @@ -51244,7 +53581,8 @@ "docstring": { "type": "int, default=10", "description": "The maximum number of passes over the training data (aka epochs).\nIt only impacts the behavior in the :meth:`fit` method, and not the\n:meth:`partial_fit` method." - } + }, + "refined_type": {} }, { "name": "batch_size", @@ -51254,7 +53592,8 @@ "docstring": { "type": "int, default=128", "description": "Number of documents to use in each EM iteration. Only used in online\nlearning." - } + }, + "refined_type": {} }, { "name": "evaluate_every", @@ -51264,7 +53603,8 @@ "docstring": { "type": "int, default=-1", "description": "How often to evaluate perplexity. Only used in `fit` method.\nset it to 0 or negative number to not evaluate perplexity in\ntraining at all. Evaluating perplexity can help you check convergence\nin training process, but it will also increase total training time.\nEvaluating perplexity in every iteration might increase training time\nup to two-fold." - } + }, + "refined_type": {} }, { "name": "total_samples", @@ -51274,7 +53614,8 @@ "docstring": { "type": "int, default=1e6", "description": "Total number of documents. Only used in the :meth:`partial_fit` method." - } + }, + "refined_type": {} }, { "name": "perp_tol", @@ -51284,7 +53625,8 @@ "docstring": { "type": "float, default=1e-1", "description": "Perplexity tolerance in batch learning. Only used when\n``evaluate_every`` is greater than 0." - } + }, + "refined_type": {} }, { "name": "mean_change_tol", @@ -51294,7 +53636,8 @@ "docstring": { "type": "float, default=1e-3", "description": "Stopping tolerance for updating document topic distribution in E-step." - } + }, + "refined_type": {} }, { "name": "max_doc_update_iter", @@ -51304,7 +53647,8 @@ "docstring": { "type": "int, default=100", "description": "Max number of iterations for updating document topic distribution in\nthe E-step." - } + }, + "refined_type": {} }, { "name": "n_jobs", @@ -51314,7 +53658,8 @@ "docstring": { "type": "int, default=None", "description": "The number of jobs to use in the E-step.\n``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n``-1`` means using all processors. See :term:`Glossary `\nfor more details." - } + }, + "refined_type": {} }, { "name": "verbose", @@ -51324,7 +53669,8 @@ "docstring": { "type": "int, default=0", "description": "Verbosity level." - } + }, + "refined_type": {} }, { "name": "random_state", @@ -51334,13 +53680,14 @@ "docstring": { "type": "int, RandomState instance or None, default=None", "description": "Pass an int for reproducible results across multiple function calls.\nSee :term:`Glossary `." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, n_components=10, *, doc_topic_prior=None, topic_word_prior=None, learning_method='batch', learning_decay=0.7, learning_offset=10.0, max_iter=10, batch_size=128, evaluate_every=-1, total_samples=1000000.0, perp_tol=0.1, mean_change_tol=0.001, max_doc_update_iter=100, n_jobs=None, verbose=0, random_state=None):\n self.n_components = n_components\n self.doc_topic_prior = doc_topic_prior\n self.topic_word_prior = topic_word_prior\n self.learning_method = learning_method\n self.learning_decay = learning_decay\n self.learning_offset = learning_offset\n self.max_iter = max_iter\n self.batch_size = batch_size\n self.evaluate_every = evaluate_every\n self.total_samples = total_samples\n self.perp_tol = perp_tol\n self.mean_change_tol = mean_change_tol\n self.max_doc_update_iter = max_doc_update_iter\n self.n_jobs = n_jobs\n self.verbose = verbose\n self.random_state = random_state" }, { @@ -51358,7 +53705,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -51368,6 +53716,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "Document word matrix." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -51378,7 +53730,8 @@ "docstring": { "type": "ndarray of shape (n_samples, n_components)", "description": "Document topic distribution. In the literature, this is called\ngamma." - } + }, + "refined_type": {} }, { "name": "sub_sampling", @@ -51388,13 +53741,14 @@ "docstring": { "type": "bool, default=False", "description": "Compensate for subsampling of documents.\nIt is used in calculate bound in online learning." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Estimate the variational bound.\n\nEstimate the variational bound over \"all documents\" using only the documents passed in as X. Since log-likelihood of each word cannot be computed directly, we use this bound to estimate it.", - "docstring": "Estimate the variational bound.\n\nEstimate the variational bound over \"all documents\" using only the\ndocuments passed in as X. Since log-likelihood of each word cannot\nbe computed directly, we use this bound to estimate it.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n Document word matrix.\n\ndoc_topic_distr : ndarray of shape (n_samples, n_components)\n Document topic distribution. In the literature, this is called\n gamma.\n\nsub_sampling : bool, default=False\n Compensate for subsampling of documents.\n It is used in calculate bound in online learning.\n\nReturns\n-------\nscore : float", + "description": "Estimate the variational bound.\n\nEstimate the variational bound over \"all documents\" using only the\ndocuments passed in as X. Since log-likelihood of each word cannot\nbe computed directly, we use this bound to estimate it.", + "docstring": "Estimate the variational bound.\n\n Estimate the variational bound over \"all documents\" using only the\n documents passed in as X. Since log-likelihood of each word cannot\n be computed directly, we use this bound to estimate it.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Document word matrix.\n\n doc_topic_distr : ndarray of shape (n_samples, n_components)\n Document topic distribution. 
In the literature, this is called\n gamma.\n\n sub_sampling : bool, default=False\n Compensate for subsampling of documents.\n It is used in calculate bound in online learning.\n\n Returns\n -------\n score : float\n\n ", "source_code": "\ndef _approx_bound(self, X, doc_topic_distr, sub_sampling):\n \"\"\"Estimate the variational bound.\n\n Estimate the variational bound over \"all documents\" using only the\n documents passed in as X. Since log-likelihood of each word cannot\n be computed directly, we use this bound to estimate it.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Document word matrix.\n\n doc_topic_distr : ndarray of shape (n_samples, n_components)\n Document topic distribution. In the literature, this is called\n gamma.\n\n sub_sampling : bool, default=False\n Compensate for subsampling of documents.\n It is used in calculate bound in online learning.\n\n Returns\n -------\n score : float\n\n \"\"\"\n \n def _loglikelihood(prior, distr, dirichlet_distr, size):\n score = np.sum((prior - distr) * dirichlet_distr)\n score += np.sum(gammaln(distr) - gammaln(prior))\n score += np.sum(gammaln(prior * size) - gammaln(np.sum(distr, 1)))\n return score\n is_sparse_x = sp.issparse(X)\n (n_samples, n_components) = doc_topic_distr.shape\n n_features = self.components_.shape[1]\n score = 0\n dirichlet_doc_topic = _dirichlet_expectation_2d(doc_topic_distr)\n dirichlet_component_ = _dirichlet_expectation_2d(self.components_)\n doc_topic_prior = self.doc_topic_prior_\n topic_word_prior = self.topic_word_prior_\n if is_sparse_x:\n X_data = X.data\n X_indices = X.indices\n X_indptr = X.indptr\n for idx_d in range(0, n_samples):\n if is_sparse_x:\n ids = X_indices[X_indptr[idx_d]:X_indptr[idx_d + 1]]\n cnts = X_data[X_indptr[idx_d]:X_indptr[idx_d + 1]]\n else:\n ids = np.nonzero(X[idx_d, :])[0]\n cnts = X[idx_d, ids]\n temp = dirichlet_doc_topic[idx_d, :, np.newaxis] + dirichlet_component_[:, ids]\n norm_phi = logsumexp(temp, axis=0)\n score += np.dot(cnts, norm_phi)\n score += _loglikelihood(doc_topic_prior, doc_topic_distr, dirichlet_doc_topic, self.n_components)\n if sub_sampling:\n doc_ratio = float(self.total_samples) / n_samples\n score *= doc_ratio\n score += _loglikelihood(topic_word_prior, self.components_, dirichlet_component_, n_features)\n return score" }, { @@ -51412,7 +53766,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -51420,9 +53775,10 @@ "is_public": false, "assigned_by": "POSITION_OR_NAME", "docstring": { - "type": " array-like or sparse matrix", + "type": "array-like or sparse matrix", "description": "" - } + }, + "refined_type": {} }, { "name": "reset_n_features", @@ -51432,7 +53788,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "whom", @@ -51442,13 +53799,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "check X format\n\ncheck X format and make sure no negative value in X.", - "docstring": "check X format\n\ncheck X format and make sure no negative value in X.\n\nParameters\n----------\nX : array-like or sparse matrix", + "docstring": "check X format\n\n check X format and make sure no negative value in X.\n\n Parameters\n ----------\n X : array-like or sparse matrix\n\n ", "source_code": "\ndef _check_non_neg_array(self, X, reset_n_features, whom):\n \"\"\"check X format\n\n check X format and make sure no negative value in X.\n\n 
Parameters\n ----------\n X : array-like or sparse matrix\n\n \"\"\"\n X = self._validate_data(X, reset=reset_n_features, accept_sparse='csr')\n check_non_negative(X, whom)\n return X" }, { @@ -51466,7 +53824,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -51490,7 +53849,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -51500,6 +53860,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "Document word matrix." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -51510,7 +53874,8 @@ "docstring": { "type": "bool", "description": "Parameter that indicate whether to calculate sufficient statistics\nor not. Set ``cal_sstats`` to True when we need to run M-step." - } + }, + "refined_type": {} }, { "name": "random_init", @@ -51520,7 +53885,8 @@ "docstring": { "type": "bool", "description": "Parameter that indicate whether to initialize document topic\ndistribution randomly in the E-step. Set it to True in training\nsteps." - } + }, + "refined_type": {} }, { "name": "parallel", @@ -51530,13 +53896,14 @@ "docstring": { "type": "joblib.Parallel, default=None", "description": "Pre-initialized instance of joblib.Parallel." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "E-step in EM update.", - "docstring": "E-step in EM update.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n Document word matrix.\n\ncal_sstats : bool\n Parameter that indicate whether to calculate sufficient statistics\n or not. Set ``cal_sstats`` to True when we need to run M-step.\n\nrandom_init : bool\n Parameter that indicate whether to initialize document topic\n distribution randomly in the E-step. Set it to True in training\n steps.\n\nparallel : joblib.Parallel, default=None\n Pre-initialized instance of joblib.Parallel.\n\nReturns\n-------\n(doc_topic_distr, suff_stats) :\n `doc_topic_distr` is unnormalized topic distribution for each\n document. In the literature, this is called `gamma`.\n `suff_stats` is expected sufficient statistics for the M-step.\n When `cal_sstats == False`, it will be None.", + "docstring": "E-step in EM update.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Document word matrix.\n\n cal_sstats : bool\n Parameter that indicate whether to calculate sufficient statistics\n or not. Set ``cal_sstats`` to True when we need to run M-step.\n\n random_init : bool\n Parameter that indicate whether to initialize document topic\n distribution randomly in the E-step. Set it to True in training\n steps.\n\n parallel : joblib.Parallel, default=None\n Pre-initialized instance of joblib.Parallel.\n\n Returns\n -------\n (doc_topic_distr, suff_stats) :\n `doc_topic_distr` is unnormalized topic distribution for each\n document. In the literature, this is called `gamma`.\n `suff_stats` is expected sufficient statistics for the M-step.\n When `cal_sstats == False`, it will be None.\n\n ", "source_code": "\ndef _e_step(self, X, cal_sstats, random_init, parallel=None):\n \"\"\"E-step in EM update.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Document word matrix.\n\n cal_sstats : bool\n Parameter that indicate whether to calculate sufficient statistics\n or not. 
Set ``cal_sstats`` to True when we need to run M-step.\n\n random_init : bool\n Parameter that indicate whether to initialize document topic\n distribution randomly in the E-step. Set it to True in training\n steps.\n\n parallel : joblib.Parallel, default=None\n Pre-initialized instance of joblib.Parallel.\n\n Returns\n -------\n (doc_topic_distr, suff_stats) :\n `doc_topic_distr` is unnormalized topic distribution for each\n document. In the literature, this is called `gamma`.\n `suff_stats` is expected sufficient statistics for the M-step.\n When `cal_sstats == False`, it will be None.\n\n \"\"\"\n random_state = self.random_state_ if random_init else None\n n_jobs = effective_n_jobs(self.n_jobs)\n if parallel is None:\n parallel = Parallel(n_jobs=n_jobs, verbose=max(0, self.verbose - 1))\n results = parallel((delayed(_update_doc_distribution)(X[idx_slice, :], self.exp_dirichlet_component_, self.doc_topic_prior_, self.max_doc_update_iter, self.mean_change_tol, cal_sstats, random_state) for idx_slice in gen_even_slices(X.shape[0], n_jobs)))\n (doc_topics, sstats_list) = zip(*results)\n doc_topic_distr = np.vstack(doc_topics)\n if cal_sstats:\n suff_stats = np.zeros(self.components_.shape)\n for sstats in sstats_list:\n suff_stats += sstats\n suff_stats *= self.exp_dirichlet_component_\n else:\n suff_stats = None\n return doc_topic_distr, suff_stats" }, { @@ -51554,7 +53921,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -51564,6 +53932,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "Document word matrix." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -51574,7 +53946,8 @@ "docstring": { "type": "int", "description": "Total number of documents. It is only used when\nbatch_update is `False`." - } + }, + "refined_type": {} }, { "name": "batch_update", @@ -51584,7 +53957,8 @@ "docstring": { "type": "bool", "description": "Parameter that controls updating method.\n`True` for batch learning, `False` for online learning." - } + }, + "refined_type": {} }, { "name": "parallel", @@ -51594,13 +53968,14 @@ "docstring": { "type": "joblib.Parallel, default=None", "description": "Pre-initialized instance of joblib.Parallel" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "EM update for 1 iteration.\n\nupdate `_component` by batch VB or online VB.", - "docstring": "EM update for 1 iteration.\n\nupdate `_component` by batch VB or online VB.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n Document word matrix.\n\ntotal_samples : int\n Total number of documents. It is only used when\n batch_update is `False`.\n\nbatch_update : bool\n Parameter that controls updating method.\n `True` for batch learning, `False` for online learning.\n\nparallel : joblib.Parallel, default=None\n Pre-initialized instance of joblib.Parallel\n\nReturns\n-------\ndoc_topic_distr : ndarray of shape (n_samples, n_components)\n Unnormalized document topic distribution.", + "docstring": "EM update for 1 iteration.\n\n update `_component` by batch VB or online VB.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Document word matrix.\n\n total_samples : int\n Total number of documents. 
It is only used when\n batch_update is `False`.\n\n batch_update : bool\n Parameter that controls updating method.\n `True` for batch learning, `False` for online learning.\n\n parallel : joblib.Parallel, default=None\n Pre-initialized instance of joblib.Parallel\n\n Returns\n -------\n doc_topic_distr : ndarray of shape (n_samples, n_components)\n Unnormalized document topic distribution.\n ", "source_code": "\ndef _em_step(self, X, total_samples, batch_update, parallel=None):\n \"\"\"EM update for 1 iteration.\n\n update `_component` by batch VB or online VB.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Document word matrix.\n\n total_samples : int\n Total number of documents. It is only used when\n batch_update is `False`.\n\n batch_update : bool\n Parameter that controls updating method.\n `True` for batch learning, `False` for online learning.\n\n parallel : joblib.Parallel, default=None\n Pre-initialized instance of joblib.Parallel\n\n Returns\n -------\n doc_topic_distr : ndarray of shape (n_samples, n_components)\n Unnormalized document topic distribution.\n \"\"\"\n (_, suff_stats) = self._e_step(X, cal_sstats=True, random_init=True, parallel=parallel)\n if batch_update:\n self.components_ = self.topic_word_prior_ + suff_stats\n else:\n weight = np.power(self.learning_offset + self.n_batch_iter_, -self.learning_decay)\n doc_ratio = float(total_samples) / X.shape[0]\n self.components_ *= 1 - weight\n self.components_ += weight * (self.topic_word_prior_ + doc_ratio * suff_stats)\n self.exp_dirichlet_component_ = np.exp(_dirichlet_expectation_2d(self.components_))\n self.n_batch_iter_ += 1\n return" }, { @@ -51618,7 +53993,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_features", @@ -51628,7 +54004,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -51652,13 +54029,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _more_tags(self):\n return {'requires_positive_X': True}" }, { @@ -51676,7 +54054,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -51686,6 +54065,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "Document word matrix." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -51696,7 +54079,8 @@ "docstring": { "type": "ndarray of shape (n_samples, n_components), default=None", "description": "Document topic distribution.\nIf it is None, it will be generated by applying transform on X." - } + }, + "refined_type": {} }, { "name": "sub_sampling", @@ -51706,13 +54090,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Calculate approximate perplexity for data X with ability to accept precomputed doc_topic_distr\n\nPerplexity is defined as exp(-1. * log-likelihood per word)", - "docstring": "Calculate approximate perplexity for data X with ability to accept\nprecomputed doc_topic_distr\n\nPerplexity is defined as exp(-1. 
* log-likelihood per word)\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n Document word matrix.\n\ndoc_topic_distr : ndarray of shape (n_samples, n_components), default=None\n Document topic distribution.\n If it is None, it will be generated by applying transform on X.\n\nReturns\n-------\nscore : float\n Perplexity score.", + "description": "Calculate approximate perplexity for data X with ability to accept\nprecomputed doc_topic_distr\n\nPerplexity is defined as exp(-1. * log-likelihood per word)", + "docstring": "Calculate approximate perplexity for data X with ability to accept\n precomputed doc_topic_distr\n\n Perplexity is defined as exp(-1. * log-likelihood per word)\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Document word matrix.\n\n doc_topic_distr : ndarray of shape (n_samples, n_components), default=None\n Document topic distribution.\n If it is None, it will be generated by applying transform on X.\n\n Returns\n -------\n score : float\n Perplexity score.\n ", "source_code": "\ndef _perplexity_precomp_distr(self, X, doc_topic_distr=None, sub_sampling=False):\n \"\"\"Calculate approximate perplexity for data X with ability to accept\n precomputed doc_topic_distr\n\n Perplexity is defined as exp(-1. * log-likelihood per word)\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Document word matrix.\n\n doc_topic_distr : ndarray of shape (n_samples, n_components), default=None\n Document topic distribution.\n If it is None, it will be generated by applying transform on X.\n\n Returns\n -------\n score : float\n Perplexity score.\n \"\"\"\n if doc_topic_distr is None:\n doc_topic_distr = self._unnormalized_transform(X)\n else:\n (n_samples, n_components) = doc_topic_distr.shape\n if n_samples != X.shape[0]:\n raise ValueError('Number of samples in X and doc_topic_distr do not match.')\n if n_components != self.n_components:\n raise ValueError('Number of topics does not match.')\n current_samples = X.shape[0]\n bound = self._approx_bound(X, doc_topic_distr, sub_sampling)\n if sub_sampling:\n word_cnt = X.sum() * (float(self.total_samples) / current_samples)\n else:\n word_cnt = X.sum()\n perword_bound = bound / word_cnt\n return np.exp(-1.0 * perword_bound)" }, { @@ -51730,7 +54115,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -51740,13 +54126,17 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "Document word matrix." 
+ }, + "refined_type": { + "kind": "EnumType", + "values": [] } } ], "results": [], "is_public": false, "description": "Transform data X according to fitted model.", - "docstring": "Transform data X according to fitted model.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n Document word matrix.\n\nReturns\n-------\ndoc_topic_distr : ndarray of shape (n_samples, n_components)\n Document topic distribution for X.", + "docstring": "Transform data X according to fitted model.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Document word matrix.\n\n Returns\n -------\n doc_topic_distr : ndarray of shape (n_samples, n_components)\n Document topic distribution for X.\n ", "source_code": "\ndef _unnormalized_transform(self, X):\n \"\"\"Transform data X according to fitted model.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Document word matrix.\n\n Returns\n -------\n doc_topic_distr : ndarray of shape (n_samples, n_components)\n Document topic distribution for X.\n \"\"\"\n (doc_topic_distr, _) = self._e_step(X, cal_sstats=False, random_init=False)\n return doc_topic_distr" }, { @@ -51764,7 +54154,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -51774,6 +54165,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "Document word matrix." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -51784,13 +54179,14 @@ "docstring": { "type": "Ignored", "description": "Not used, present here for API consistency by convention." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Learn model for the data X with variational Bayes method.\n\nWhen `learning_method` is 'online', use mini-batch update. 
Otherwise, use batch update.", - "docstring": "Learn model for the data X with variational Bayes method.\n\nWhen `learning_method` is 'online', use mini-batch update.\nOtherwise, use batch update.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n Document word matrix.\n\ny : Ignored\n Not used, present here for API consistency by convention.\n\nReturns\n-------\nself\n Fitted estimator.", + "description": "Learn model for the data X with variational Bayes method.\n\nWhen `learning_method` is 'online', use mini-batch update.\nOtherwise, use batch update.", + "docstring": "Learn model for the data X with variational Bayes method.\n\n When `learning_method` is 'online', use mini-batch update.\n Otherwise, use batch update.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Document word matrix.\n\n y : Ignored\n Not used, present here for API consistency by convention.\n\n Returns\n -------\n self\n Fitted estimator.\n ", "source_code": "\ndef fit(self, X, y=None):\n \"\"\"Learn model for the data X with variational Bayes method.\n\n When `learning_method` is 'online', use mini-batch update.\n Otherwise, use batch update.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Document word matrix.\n\n y : Ignored\n Not used, present here for API consistency by convention.\n\n Returns\n -------\n self\n Fitted estimator.\n \"\"\"\n self._check_params()\n X = self._check_non_neg_array(X, reset_n_features=True, whom='LatentDirichletAllocation.fit')\n (n_samples, n_features) = X.shape\n max_iter = self.max_iter\n evaluate_every = self.evaluate_every\n learning_method = self.learning_method\n batch_size = self.batch_size\n self._init_latent_vars(n_features)\n last_bound = None\n n_jobs = effective_n_jobs(self.n_jobs)\n with Parallel(n_jobs=n_jobs, verbose=max(0, self.verbose - 1)) as parallel:\n for i in range(max_iter):\n if learning_method == 'online':\n for idx_slice in gen_batches(n_samples, batch_size):\n self._em_step(X[idx_slice, :], total_samples=n_samples, batch_update=False, parallel=parallel)\n else:\n self._em_step(X, total_samples=n_samples, batch_update=True, parallel=parallel)\n if evaluate_every > 0 and (i + 1) % evaluate_every == 0:\n (doc_topics_distr, _) = self._e_step(X, cal_sstats=False, random_init=False, parallel=parallel)\n bound = self._perplexity_precomp_distr(X, doc_topics_distr, sub_sampling=False)\n if self.verbose:\n print('iteration: %d of max_iter: %d, perplexity: %.4f' % (i + 1, max_iter, bound))\n if last_bound and abs(last_bound - bound) < self.perp_tol:\n break\n last_bound = bound\n elif self.verbose:\n print('iteration: %d of max_iter: %d' % (i + 1, max_iter))\n self.n_iter_ += 1\n (doc_topics_distr, _) = self._e_step(X, cal_sstats=False, random_init=False, parallel=parallel)\n self.bound_ = self._perplexity_precomp_distr(X, doc_topics_distr, sub_sampling=False)\n return self" }, { @@ -51808,7 +54204,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -51818,6 +54215,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "Document word matrix." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -51828,13 +54229,14 @@ "docstring": { "type": "Ignored", "description": "Not used, present here for API consistency by convention." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Online VB with Mini-Batch update.", - "docstring": "Online VB with Mini-Batch update.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n Document word matrix.\n\ny : Ignored\n Not used, present here for API consistency by convention.\n\nReturns\n-------\nself\n Partially fitted estimator.", + "docstring": "Online VB with Mini-Batch update.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Document word matrix.\n\n y : Ignored\n Not used, present here for API consistency by convention.\n\n Returns\n -------\n self\n Partially fitted estimator.\n ", "source_code": "\ndef partial_fit(self, X, y=None):\n \"\"\"Online VB with Mini-Batch update.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Document word matrix.\n\n y : Ignored\n Not used, present here for API consistency by convention.\n\n Returns\n -------\n self\n Partially fitted estimator.\n \"\"\"\n self._check_params()\n first_time = not hasattr(self, 'components_')\n X = self._check_non_neg_array(X, reset_n_features=first_time, whom='LatentDirichletAllocation.partial_fit')\n (n_samples, n_features) = X.shape\n batch_size = self.batch_size\n if first_time:\n self._init_latent_vars(n_features)\n if n_features != self.components_.shape[1]:\n raise ValueError('The provided data has %d dimensions while the model was trained with feature size %d.' % (n_features, self.components_.shape[1]))\n n_jobs = effective_n_jobs(self.n_jobs)\n with Parallel(n_jobs=n_jobs, verbose=max(0, self.verbose - 1)) as parallel:\n for idx_slice in gen_batches(n_samples, batch_size):\n self._em_step(X[idx_slice, :], total_samples=self.total_samples, batch_update=False, parallel=parallel)\n return self" }, { @@ -51852,7 +54254,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -51862,6 +54265,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "Document word matrix." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -51872,13 +54279,14 @@ "docstring": { "type": "bool", "description": "Do sub-sampling or not." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Calculate approximate perplexity for data X.\n\nPerplexity is defined as exp(-1. * log-likelihood per word) .. versionchanged:: 0.19 *doc_topic_distr* argument has been deprecated and is ignored because user no longer has access to unnormalized distribution", - "docstring": "Calculate approximate perplexity for data X.\n\nPerplexity is defined as exp(-1. * log-likelihood per word)\n\n.. versionchanged:: 0.19\n *doc_topic_distr* argument has been deprecated and is ignored\n because user no longer has access to unnormalized distribution\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n Document word matrix.\n\nsub_sampling : bool\n Do sub-sampling or not.\n\nReturns\n-------\nscore : float\n Perplexity score.", + "description": "Calculate approximate perplexity for data X.\n\nPerplexity is defined as exp(-1. * log-likelihood per word)\n\n.. versionchanged:: 0.19\n *doc_topic_distr* argument has been deprecated and is ignored\n because user no longer has access to unnormalized distribution", + "docstring": "Calculate approximate perplexity for data X.\n\n Perplexity is defined as exp(-1. 
* log-likelihood per word)\n\n .. versionchanged:: 0.19\n *doc_topic_distr* argument has been deprecated and is ignored\n because user no longer has access to unnormalized distribution\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Document word matrix.\n\n sub_sampling : bool\n Do sub-sampling or not.\n\n Returns\n -------\n score : float\n Perplexity score.\n ", "source_code": "\ndef perplexity(self, X, sub_sampling=False):\n \"\"\"Calculate approximate perplexity for data X.\n\n Perplexity is defined as exp(-1. * log-likelihood per word)\n\n .. versionchanged:: 0.19\n *doc_topic_distr* argument has been deprecated and is ignored\n because user no longer has access to unnormalized distribution\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Document word matrix.\n\n sub_sampling : bool\n Do sub-sampling or not.\n\n Returns\n -------\n score : float\n Perplexity score.\n \"\"\"\n check_is_fitted(self)\n X = self._check_non_neg_array(X, reset_n_features=True, whom='LatentDirichletAllocation.perplexity')\n return self._perplexity_precomp_distr(X, sub_sampling=sub_sampling)" }, { @@ -51896,7 +54304,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -51906,6 +54315,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "Document word matrix." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -51916,13 +54329,14 @@ "docstring": { "type": "Ignored", "description": "Not used, present here for API consistency by convention." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Calculate approximate log-likelihood as score.", - "docstring": "Calculate approximate log-likelihood as score.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n Document word matrix.\n\ny : Ignored\n Not used, present here for API consistency by convention.\n\nReturns\n-------\nscore : float\n Use approximate bound as score.", + "docstring": "Calculate approximate log-likelihood as score.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Document word matrix.\n\n y : Ignored\n Not used, present here for API consistency by convention.\n\n Returns\n -------\n score : float\n Use approximate bound as score.\n ", "source_code": "\ndef score(self, X, y=None):\n \"\"\"Calculate approximate log-likelihood as score.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Document word matrix.\n\n y : Ignored\n Not used, present here for API consistency by convention.\n\n Returns\n -------\n score : float\n Use approximate bound as score.\n \"\"\"\n check_is_fitted(self)\n X = self._check_non_neg_array(X, reset_n_features=False, whom='LatentDirichletAllocation.score')\n doc_topic_distr = self._unnormalized_transform(X)\n score = self._approx_bound(X, doc_topic_distr, sub_sampling=False)\n return score" }, { @@ -51940,7 +54354,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -51950,13 +54365,17 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "Document word matrix." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } } ], "results": [], "is_public": true, - "description": "Transform data X according to the fitted model.\n\n .. 
versionchanged:: 0.18 *doc_topic_distr* is now normalized", - "docstring": "Transform data X according to the fitted model.\n\n .. versionchanged:: 0.18\n *doc_topic_distr* is now normalized\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n Document word matrix.\n\nReturns\n-------\ndoc_topic_distr : ndarray of shape (n_samples, n_components)\n Document topic distribution for X.", + "description": "Transform data X according to the fitted model.\n\n .. versionchanged:: 0.18\n *doc_topic_distr* is now normalized", + "docstring": "Transform data X according to the fitted model.\n\n .. versionchanged:: 0.18\n *doc_topic_distr* is now normalized\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Document word matrix.\n\n Returns\n -------\n doc_topic_distr : ndarray of shape (n_samples, n_components)\n Document topic distribution for X.\n ", "source_code": "\ndef transform(self, X):\n \"\"\"Transform data X according to the fitted model.\n\n .. versionchanged:: 0.18\n *doc_topic_distr* is now normalized\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Document word matrix.\n\n Returns\n -------\n doc_topic_distr : ndarray of shape (n_samples, n_components)\n Document topic distribution for X.\n \"\"\"\n check_is_fitted(self)\n X = self._check_non_neg_array(X, reset_n_features=False, whom='LatentDirichletAllocation.transform')\n doc_topic_distr = self._unnormalized_transform(X)\n doc_topic_distr /= doc_topic_distr.sum(axis=1)[:, np.newaxis]\n return doc_topic_distr" }, { @@ -51974,6 +54393,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "Document word matrix." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -51984,7 +54407,8 @@ "docstring": { "type": "ndarray of shape (n_topics, n_features)", "description": "Exponential value of expectation of log topic word distribution.\nIn the literature, this is `exp(E[log(beta)])`." - } + }, + "refined_type": {} }, { "name": "doc_topic_prior", @@ -51994,7 +54418,8 @@ "docstring": { "type": "float", "description": "Prior of document topic distribution `theta`." - } + }, + "refined_type": {} }, { "name": "max_doc_update_iter", @@ -52004,7 +54429,8 @@ "docstring": { "type": "int", "description": "Max number of iterations for updating document topic distribution in\nthe E-step." - } + }, + "refined_type": {} }, { "name": "mean_change_tol", @@ -52014,7 +54440,8 @@ "docstring": { "type": "float", "description": "Stopping tolerance for updating document topic distribution in E-step." - } + }, + "refined_type": {} }, { "name": "cal_sstats", @@ -52024,7 +54451,8 @@ "docstring": { "type": "bool", "description": "Parameter that indicate to calculate sufficient statistics or not.\nSet `cal_sstats` to `True` when we need to run M-step." - } + }, + "refined_type": {} }, { "name": "random_state", @@ -52034,13 +54462,14 @@ "docstring": { "type": "RandomState instance or None", "description": "Parameter that indicate how to initialize document topic distribution.\nSet `random_state` to None will initialize document topic distribution\nto a constant number." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "E-step: update document-topic distribution.", - "docstring": "E-step: update document-topic distribution.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n Document word matrix.\n\nexp_topic_word_distr : ndarray of shape (n_topics, n_features)\n Exponential value of expectation of log topic word distribution.\n In the literature, this is `exp(E[log(beta)])`.\n\ndoc_topic_prior : float\n Prior of document topic distribution `theta`.\n\nmax_doc_update_iter : int\n Max number of iterations for updating document topic distribution in\n the E-step.\n\nmean_change_tol : float\n Stopping tolerance for updating document topic distribution in E-step.\n\ncal_sstats : bool\n Parameter that indicate to calculate sufficient statistics or not.\n Set `cal_sstats` to `True` when we need to run M-step.\n\nrandom_state : RandomState instance or None\n Parameter that indicate how to initialize document topic distribution.\n Set `random_state` to None will initialize document topic distribution\n to a constant number.\n\nReturns\n-------\n(doc_topic_distr, suff_stats) :\n `doc_topic_distr` is unnormalized topic distribution for each document.\n In the literature, this is `gamma`. we can calculate `E[log(theta)]`\n from it.\n `suff_stats` is expected sufficient statistics for the M-step.\n When `cal_sstats == False`, this will be None.", + "docstring": "E-step: update document-topic distribution.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Document word matrix.\n\n exp_topic_word_distr : ndarray of shape (n_topics, n_features)\n Exponential value of expectation of log topic word distribution.\n In the literature, this is `exp(E[log(beta)])`.\n\n doc_topic_prior : float\n Prior of document topic distribution `theta`.\n\n max_doc_update_iter : int\n Max number of iterations for updating document topic distribution in\n the E-step.\n\n mean_change_tol : float\n Stopping tolerance for updating document topic distribution in E-step.\n\n cal_sstats : bool\n Parameter that indicate to calculate sufficient statistics or not.\n Set `cal_sstats` to `True` when we need to run M-step.\n\n random_state : RandomState instance or None\n Parameter that indicate how to initialize document topic distribution.\n Set `random_state` to None will initialize document topic distribution\n to a constant number.\n\n Returns\n -------\n (doc_topic_distr, suff_stats) :\n `doc_topic_distr` is unnormalized topic distribution for each document.\n In the literature, this is `gamma`. 
we can calculate `E[log(theta)]`\n from it.\n `suff_stats` is expected sufficient statistics for the M-step.\n When `cal_sstats == False`, this will be None.\n\n ", "source_code": "\ndef _update_doc_distribution(X, exp_topic_word_distr, doc_topic_prior, max_doc_update_iter, mean_change_tol, cal_sstats, random_state):\n \"\"\"E-step: update document-topic distribution.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Document word matrix.\n\n exp_topic_word_distr : ndarray of shape (n_topics, n_features)\n Exponential value of expectation of log topic word distribution.\n In the literature, this is `exp(E[log(beta)])`.\n\n doc_topic_prior : float\n Prior of document topic distribution `theta`.\n\n max_doc_update_iter : int\n Max number of iterations for updating document topic distribution in\n the E-step.\n\n mean_change_tol : float\n Stopping tolerance for updating document topic distribution in E-step.\n\n cal_sstats : bool\n Parameter that indicate to calculate sufficient statistics or not.\n Set `cal_sstats` to `True` when we need to run M-step.\n\n random_state : RandomState instance or None\n Parameter that indicate how to initialize document topic distribution.\n Set `random_state` to None will initialize document topic distribution\n to a constant number.\n\n Returns\n -------\n (doc_topic_distr, suff_stats) :\n `doc_topic_distr` is unnormalized topic distribution for each document.\n In the literature, this is `gamma`. we can calculate `E[log(theta)]`\n from it.\n `suff_stats` is expected sufficient statistics for the M-step.\n When `cal_sstats == False`, this will be None.\n\n \"\"\"\n is_sparse_x = sp.issparse(X)\n (n_samples, n_features) = X.shape\n n_topics = exp_topic_word_distr.shape[0]\n if random_state:\n doc_topic_distr = random_state.gamma(100.0, 0.01, (n_samples, n_topics))\n else:\n doc_topic_distr = np.ones((n_samples, n_topics))\n exp_doc_topic = np.exp(_dirichlet_expectation_2d(doc_topic_distr))\n suff_stats = np.zeros(exp_topic_word_distr.shape) if cal_sstats else None\n if is_sparse_x:\n X_data = X.data\n X_indices = X.indices\n X_indptr = X.indptr\n for idx_d in range(n_samples):\n if is_sparse_x:\n ids = X_indices[X_indptr[idx_d]:X_indptr[idx_d + 1]]\n cnts = X_data[X_indptr[idx_d]:X_indptr[idx_d + 1]]\n else:\n ids = np.nonzero(X[idx_d, :])[0]\n cnts = X[idx_d, ids]\n doc_topic_d = doc_topic_distr[idx_d, :]\n exp_doc_topic_d = exp_doc_topic[idx_d, :].copy()\n exp_topic_word_d = exp_topic_word_distr[:, ids]\n for _ in range(0, max_doc_update_iter):\n last_d = doc_topic_d\n norm_phi = np.dot(exp_doc_topic_d, exp_topic_word_d) + EPS\n doc_topic_d = exp_doc_topic_d * np.dot(cnts / norm_phi, exp_topic_word_d.T)\n _dirichlet_expectation_1d(doc_topic_d, doc_topic_prior, exp_doc_topic_d)\n if mean_change(last_d, doc_topic_d) < mean_change_tol:\n break\n doc_topic_distr[idx_d, :] = doc_topic_d\n if cal_sstats:\n norm_phi = np.dot(exp_doc_topic_d, exp_topic_word_d) + EPS\n suff_stats[:, ids] += np.outer(exp_doc_topic_d, cnts / norm_phi)\n return doc_topic_distr, suff_stats" }, { @@ -52058,7 +54487,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_components", @@ -52068,7 +54498,8 @@ "docstring": { "type": "int, default=None", "description": "Number of components, if n_components is not set all features\nare kept." 
- } + }, + "refined_type": {} }, { "name": "init", @@ -52078,6 +54509,16 @@ "docstring": { "type": "{'random', 'nndsvd', 'nndsvda', 'nndsvdar', 'custom'}, default=None", "description": "Method used to initialize the procedure.\nDefault: None.\nValid options:\n\n- `None`: 'nndsvd' if n_components <= min(n_samples, n_features),\n otherwise random.\n\n- `'random'`: non-negative random matrices, scaled with:\n sqrt(X.mean() / n_components)\n\n- `'nndsvd'`: Nonnegative Double Singular Value Decomposition (NNDSVD)\n initialization (better for sparseness)\n\n- `'nndsvda'`: NNDSVD with zeros filled with the average of X\n (better when sparsity is not desired)\n\n- `'nndsvdar'` NNDSVD with zeros filled with small random values\n (generally faster, less accurate alternative to NNDSVDa\n for when sparsity is not desired)\n\n- `'custom'`: use custom matrices W and H" + }, + "refined_type": { + "kind": "EnumType", + "values": [ + "random", + "custom", + "nndsvda", + "nndsvd", + "nndsvdar" + ] } }, { @@ -52088,6 +54529,10 @@ "docstring": { "type": "{'cd', 'mu'}, default='cd'", "description": "Numerical solver to use:\n'cd' is a Coordinate Descent solver.\n'mu' is a Multiplicative Update solver.\n\n.. versionadded:: 0.17\n Coordinate Descent solver.\n\n.. versionadded:: 0.19\n Multiplicative Update solver." + }, + "refined_type": { + "kind": "EnumType", + "values": ["cd", "mu"] } }, { @@ -52098,6 +54543,14 @@ "docstring": { "type": "float or {'frobenius', 'kullback-leibler', 'itakura-saito'}, default='frobenius'", "description": "Beta divergence to be minimized, measuring the distance between X\nand the dot product WH. Note that values different from 'frobenius'\n(or 2) and 'kullback-leibler' (or 1) lead to significantly slower\nfits. Note that for beta_loss <= 0 (or 'itakura-saito'), the input\nmatrix X cannot contain zeros. Used only in 'mu' solver.\n\n.. versionadded:: 0.19" + }, + "refined_type": { + "kind": "EnumType", + "values": [ + "itakura-saito", + "frobenius", + "kullback-leibler" + ] } }, { @@ -52108,7 +54561,8 @@ "docstring": { "type": "float, default=1e-4", "description": "Tolerance of the stopping condition." - } + }, + "refined_type": {} }, { "name": "max_iter", @@ -52118,7 +54572,8 @@ "docstring": { "type": "int, default=200", "description": "Maximum number of iterations before timing out." - } + }, + "refined_type": {} }, { "name": "random_state", @@ -52128,7 +54583,8 @@ "docstring": { "type": "int, RandomState instance or None, default=None", "description": "Used for initialisation (when ``init`` == 'nndsvdar' or\n'random'), and in Coordinate Descent. Pass an int for reproducible\nresults across multiple function calls.\nSee :term:`Glossary `." - } + }, + "refined_type": {} }, { "name": "alpha", @@ -52138,7 +54594,8 @@ "docstring": { "type": "float, default=0.0", "description": "Constant that multiplies the regularization terms. Set it to zero to\nhave no regularization. When using `alpha` instead of `alpha_W` and `alpha_H`,\nthe regularization terms are not scaled by the `n_features` (resp. `n_samples`)\nfactors for `W` (resp. `H`).\n\n.. versionadded:: 0.17\n *alpha* used in the Coordinate Descent solver.\n\n.. deprecated:: 1.0\n The `alpha` parameter is deprecated in 1.0 and will be removed in 1.2.\n Use `alpha_W` and `alpha_H` instead." - } + }, + "refined_type": {} }, { "name": "alpha_W", @@ -52148,7 +54605,8 @@ "docstring": { "type": "float, default=0.0", "description": "Constant that multiplies the regularization terms of `W`. 
Set it to zero\n(default) to have no regularization on `W`.\n\n.. versionadded:: 1.0" - } + }, + "refined_type": {} }, { "name": "alpha_H", @@ -52158,7 +54616,8 @@ "docstring": { "type": "float or \"same\", default=\"same\"", "description": "Constant that multiplies the regularization terms of `H`. Set it to zero to\nhave no regularization on `H`. If \"same\" (default), it takes the same value as\n`alpha_W`.\n\n.. versionadded:: 1.0" - } + }, + "refined_type": {} }, { "name": "l1_ratio", @@ -52168,7 +54627,8 @@ "docstring": { "type": "float, default=0.0", "description": "The regularization mixing parameter, with 0 <= l1_ratio <= 1.\nFor l1_ratio = 0 the penalty is an elementwise L2 penalty\n(aka Frobenius Norm).\nFor l1_ratio = 1 it is an elementwise L1 penalty.\nFor 0 < l1_ratio < 1, the penalty is a combination of L1 and L2.\n\n.. versionadded:: 0.17\n Regularization parameter *l1_ratio* used in the Coordinate Descent\n solver." - } + }, + "refined_type": {} }, { "name": "verbose", @@ -52178,7 +54638,8 @@ "docstring": { "type": "int, default=0", "description": "Whether to be verbose." - } + }, + "refined_type": {} }, { "name": "shuffle", @@ -52188,7 +54649,8 @@ "docstring": { "type": "bool, default=False", "description": "If true, randomize the order of coordinates in the CD solver.\n\n.. versionadded:: 0.17\n *shuffle* parameter used in the Coordinate Descent solver." - } + }, + "refined_type": {} }, { "name": "regularization", @@ -52198,13 +54660,17 @@ "docstring": { "type": "{'both', 'components', 'transformation', None}, default='both'", "description": "Select whether the regularization affects the components (H), the\ntransformation (W), both or none of them.\n\n.. versionadded:: 0.24\n\n.. deprecated:: 1.0\n The `regularization` parameter is deprecated in 1.0 and will be removed in\n 1.2. Use `alpha_W` and `alpha_H` instead." 
+ }, + "refined_type": { + "kind": "EnumType", + "values": ["components", "transformation", "both"] } } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, n_components=None, *, init='warn', solver='cd', beta_loss='frobenius', tol=0.0001, max_iter=200, random_state=None, alpha='deprecated', alpha_W=0.0, alpha_H='same', l1_ratio=0.0, verbose=0, shuffle=False, regularization='deprecated'):\n self.n_components = n_components\n self.init = init\n self.solver = solver\n self.beta_loss = beta_loss\n self.tol = tol\n self.max_iter = max_iter\n self.random_state = random_state\n self.alpha = alpha\n self.alpha_W = alpha_W\n self.alpha_H = alpha_H\n self.l1_ratio = l1_ratio\n self.verbose = verbose\n self.shuffle = shuffle\n self.regularization = regularization" }, { @@ -52222,7 +54688,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -52232,13 +54699,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _check_params(self, X):\n self._n_components = self.n_components\n if self._n_components is None:\n self._n_components = X.shape[1]\n if not isinstance(self._n_components, numbers.Integral) or self._n_components <= 0:\n raise ValueError(f'Number of components must be a positive integer; got (n_components={self._n_components!r})')\n if not isinstance(self.max_iter, numbers.Integral) or self.max_iter < 0:\n raise ValueError(f'Maximum number of iterations must be a positive integer; got (max_iter={self.max_iter!r})')\n if not isinstance(self.tol, numbers.Number) or self.tol < 0:\n raise ValueError(f'Tolerance for stopping criteria must be positive; got (tol={self.tol!r})')\n self._beta_loss = _beta_loss_to_float(self.beta_loss)\n allowed_solver = ('cd', 'mu')\n if self.solver not in allowed_solver:\n raise ValueError(f'Invalid solver parameter: got {self.solver!r} instead of one of {allowed_solver}')\n if self.solver != 'mu' and self.beta_loss not in (2, 'frobenius'):\n raise ValueError(f'Invalid beta_loss parameter: solver {self.solver!r} does not handle beta_loss = {self.beta_loss!r}')\n if self.solver == 'mu' and self.init == 'nndsvd':\n warnings.warn(\"The multiplicative update ('mu') solver cannot update zeros present in the initialization, and so leads to poorer results when used jointly with init='nndsvd'. You may try init='nndsvda' or init='nndsvdar' instead.\", UserWarning)\n if self.alpha != 'deprecated':\n warnings.warn('`alpha` was deprecated in version 1.0 and will be removed in 1.2. Use `alpha_W` and `alpha_H` instead', FutureWarning)\n alpha = self.alpha\n else:\n alpha = 0.0\n if self.regularization != 'deprecated':\n warnings.warn('`regularization` was deprecated in version 1.0 and will be removed in 1.2. 
Use `alpha_W` and `alpha_H` instead', FutureWarning)\n allowed_regularization = ('both', 'components', 'transformation', None)\n if self.regularization not in allowed_regularization:\n raise ValueError(f'Invalid regularization parameter: got {self.regularization!r} instead of one of {allowed_regularization}')\n regularization = self.regularization\n else:\n regularization = 'both'\n (self._l1_reg_W, self._l1_reg_H, self._l2_reg_W, self._l2_reg_H) = _compute_regularization(alpha, self.alpha_W, self.alpha_H, self.l1_ratio, regularization)\n return self" }, { @@ -52256,7 +54724,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -52266,7 +54735,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "W", @@ -52276,7 +54746,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "H", @@ -52286,7 +54757,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "update_H", @@ -52296,13 +54768,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _check_w_h(self, X, W, H, update_H):\n (n_samples, n_features) = X.shape\n if self.init == 'custom' and update_H:\n _check_init(H, (self._n_components, n_features), 'NMF (input H)')\n _check_init(W, (n_samples, self._n_components), 'NMF (input W)')\n if H.dtype != X.dtype or W.dtype != X.dtype:\n raise TypeError('H and W should have the same dtype as X. Got H.dtype = {} and W.dtype = {}.'.format(H.dtype, W.dtype))\n elif not update_H:\n _check_init(H, (self._n_components, n_features), 'NMF (input H)')\n if H.dtype != X.dtype:\n raise TypeError('H should have the same dtype as X. Got H.dtype = {}.'.format(H.dtype))\n if self.solver == 'mu':\n avg = np.sqrt(X.mean() / self._n_components)\n W = np.full((n_samples, self._n_components), avg, dtype=X.dtype)\n else:\n W = np.zeros((n_samples, self._n_components), dtype=X.dtype)\n else:\n (W, H) = _initialize_nmf(X, self._n_components, init=self.init, random_state=self.random_state)\n return W, H" }, { @@ -52320,7 +54793,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -52330,6 +54804,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "Data matrix to be decomposed" + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -52340,7 +54818,8 @@ "docstring": { "type": "Ignored", "description": "" - } + }, + "refined_type": {} }, { "name": "W", @@ -52350,7 +54829,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_components)", "description": "If init='custom', it is used as initial guess for the solution." - } + }, + "refined_type": {} }, { "name": "H", @@ -52360,7 +54840,8 @@ "docstring": { "type": "array-like of shape (n_components, n_features)", "description": "If init='custom', it is used as initial guess for the solution.\nIf update_H=False, it is used as a constant, to solve for W only." - } + }, + "refined_type": {} }, { "name": "update_H", @@ -52370,13 +54851,14 @@ "docstring": { "type": "bool, default=True", "description": "If True, both W and H will be estimated from initial guesses,\nthis corresponds to a call to the 'fit_transform' method.\nIf False, only W will be estimated, this corresponds to a call\nto the 'transform' method." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Learn a NMF model for the data X and returns the transformed data.", - "docstring": "Learn a NMF model for the data X and returns the transformed data.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n Data matrix to be decomposed\n\ny : Ignored\n\nW : array-like of shape (n_samples, n_components)\n If init='custom', it is used as initial guess for the solution.\n\nH : array-like of shape (n_components, n_features)\n If init='custom', it is used as initial guess for the solution.\n If update_H=False, it is used as a constant, to solve for W only.\n\nupdate_H : bool, default=True\n If True, both W and H will be estimated from initial guesses,\n this corresponds to a call to the 'fit_transform' method.\n If False, only W will be estimated, this corresponds to a call\n to the 'transform' method.\n\nReturns\n-------\nW : ndarray of shape (n_samples, n_components)\n Transformed data.\n\nH : ndarray of shape (n_components, n_features)\n Factorization matrix, sometimes called 'dictionary'.\n\nn_iter_ : int\n Actual number of iterations.", + "docstring": "Learn a NMF model for the data X and returns the transformed data.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Data matrix to be decomposed\n\n y : Ignored\n\n W : array-like of shape (n_samples, n_components)\n If init='custom', it is used as initial guess for the solution.\n\n H : array-like of shape (n_components, n_features)\n If init='custom', it is used as initial guess for the solution.\n If update_H=False, it is used as a constant, to solve for W only.\n\n update_H : bool, default=True\n If True, both W and H will be estimated from initial guesses,\n this corresponds to a call to the 'fit_transform' method.\n If False, only W will be estimated, this corresponds to a call\n to the 'transform' method.\n\n Returns\n -------\n W : ndarray of shape (n_samples, n_components)\n Transformed data.\n\n H : ndarray of shape (n_components, n_features)\n Factorization matrix, sometimes called 'dictionary'.\n\n n_iter_ : int\n Actual number of iterations.\n ", "source_code": "\ndef _fit_transform(self, X, y=None, W=None, H=None, update_H=True):\n \"\"\"Learn a NMF model for the data X and returns the transformed data.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Data matrix to be decomposed\n\n y : Ignored\n\n W : array-like of shape (n_samples, n_components)\n If init='custom', it is used as initial guess for the solution.\n\n H : array-like of shape (n_components, n_features)\n If init='custom', it is used as initial guess for the solution.\n If update_H=False, it is used as a constant, to solve for W only.\n\n update_H : bool, default=True\n If True, both W and H will be estimated from initial guesses,\n this corresponds to a call to the 'fit_transform' method.\n If False, only W will be estimated, this corresponds to a call\n to the 'transform' method.\n\n Returns\n -------\n W : ndarray of shape (n_samples, n_components)\n Transformed data.\n\n H : ndarray of shape (n_components, n_features)\n Factorization matrix, sometimes called 'dictionary'.\n\n n_iter_ : int\n Actual number of iterations.\n \"\"\"\n check_non_negative(X, 'NMF (input X)')\n self._check_params(X)\n if X.min() == 0 and self._beta_loss <= 0:\n raise ValueError('When beta_loss <= 0 and X contains zeros, the solver may diverge. 
Please add small values to X, or use a positive beta_loss.')\n (W, H) = self._check_w_h(X, W, H, update_H)\n (l1_reg_W, l1_reg_H, l2_reg_W, l2_reg_H) = self._scale_regularization(X)\n if self.solver == 'cd':\n (W, H, n_iter) = _fit_coordinate_descent(X, W, H, self.tol, self.max_iter, l1_reg_W, l1_reg_H, l2_reg_W, l2_reg_H, update_H=update_H, verbose=self.verbose, shuffle=self.shuffle, random_state=self.random_state)\n elif self.solver == 'mu':\n (W, H, n_iter) = _fit_multiplicative_update(X, W, H, self._beta_loss, self.max_iter, self.tol, l1_reg_W, l1_reg_H, l2_reg_W, l2_reg_H, update_H=update_H, verbose=self.verbose)\n else:\n raise ValueError(\"Invalid solver parameter '%s'.\" % self.solver)\n if n_iter == self.max_iter and self.tol > 0:\n warnings.warn('Maximum number of iterations %d reached. Increase it to improve convergence.' % self.max_iter, ConvergenceWarning)\n return W, H, n_iter" }, { @@ -52394,13 +54876,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _more_tags(self):\n return {'requires_positive_X': True}" }, { @@ -52418,7 +54901,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -52428,13 +54912,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _scale_regularization(self, X):\n (n_samples, n_features) = X.shape\n if self.alpha_W != 0 or self.alpha_H != 'same':\n l1_reg_W = n_features * self._l1_reg_W\n l1_reg_H = n_samples * self._l1_reg_H\n l2_reg_W = n_features * self._l2_reg_W\n l2_reg_H = n_samples * self._l2_reg_H\n else:\n l1_reg_W = self._l1_reg_W\n l1_reg_H = self._l1_reg_H\n l2_reg_W = self._l2_reg_W\n l2_reg_H = self._l2_reg_H\n return l1_reg_W, l1_reg_H, l2_reg_W, l2_reg_H" }, { @@ -52452,7 +54937,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -52462,6 +54948,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "Training vector, where `n_samples` is the number of samples\nand `n_features` is the number of features." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -52472,13 +54962,14 @@ "docstring": { "type": "Ignored", "description": "Not used, present for API consistency by convention." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Learn a NMF model for the data X.", - "docstring": "Learn a NMF model for the data X.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training vector, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\ny : Ignored\n Not used, present for API consistency by convention.\n\n**params : kwargs\n Parameters (keyword arguments) and values passed to\n the fit_transform instance.\n\nReturns\n-------\nself : object\n Returns the instance itself.", + "docstring": "Learn a NMF model for the data X.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training vector, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n **params : kwargs\n Parameters (keyword arguments) and values passed to\n the fit_transform instance.\n\n Returns\n -------\n self : object\n Returns the instance itself.\n ", "source_code": "\ndef fit(self, X, y=None, **params):\n \"\"\"Learn a NMF model for the data X.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training vector, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n **params : kwargs\n Parameters (keyword arguments) and values passed to\n the fit_transform instance.\n\n Returns\n -------\n self : object\n Returns the instance itself.\n \"\"\"\n self.fit_transform(X, **params)\n return self" }, { @@ -52496,7 +54987,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -52506,6 +54998,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "Training vector, where `n_samples` is the number of samples\nand `n_features` is the number of features." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -52516,7 +55012,8 @@ "docstring": { "type": "Ignored", "description": "Not used, present for API consistency by convention." - } + }, + "refined_type": {} }, { "name": "W", @@ -52526,7 +55023,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_components)", "description": "If init='custom', it is used as initial guess for the solution." - } + }, + "refined_type": {} }, { "name": "H", @@ -52536,13 +55034,14 @@ "docstring": { "type": "array-like of shape (n_components, n_features)", "description": "If init='custom', it is used as initial guess for the solution." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Learn a NMF model for the data X and returns the transformed data.\n\nThis is more efficient than calling fit followed by transform.", - "docstring": "Learn a NMF model for the data X and returns the transformed data.\n\nThis is more efficient than calling fit followed by transform.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training vector, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\ny : Ignored\n Not used, present for API consistency by convention.\n\nW : array-like of shape (n_samples, n_components)\n If init='custom', it is used as initial guess for the solution.\n\nH : array-like of shape (n_components, n_features)\n If init='custom', it is used as initial guess for the solution.\n\nReturns\n-------\nW : ndarray of shape (n_samples, n_components)\n Transformed data.", + "docstring": "Learn a NMF model for the data X and returns the transformed data.\n\n This is more efficient than calling fit followed by transform.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training vector, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n W : array-like of shape (n_samples, n_components)\n If init='custom', it is used as initial guess for the solution.\n\n H : array-like of shape (n_components, n_features)\n If init='custom', it is used as initial guess for the solution.\n\n Returns\n -------\n W : ndarray of shape (n_samples, n_components)\n Transformed data.\n ", "source_code": "\ndef fit_transform(self, X, y=None, W=None, H=None):\n \"\"\"Learn a NMF model for the data X and returns the transformed data.\n\n This is more efficient than calling fit followed by transform.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training vector, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n W : array-like of shape (n_samples, n_components)\n If init='custom', it is used as initial guess for the solution.\n\n H : array-like of shape (n_components, n_features)\n If init='custom', it is used as initial guess for the solution.\n\n Returns\n -------\n W : ndarray of shape (n_samples, n_components)\n Transformed data.\n \"\"\"\n X = self._validate_data(X, accept_sparse=('csr', 'csc'), dtype=[np.float64, np.float32])\n with config_context(assume_finite=True):\n (W, H, n_iter) = self._fit_transform(X, W=W, H=H)\n self.reconstruction_err_ = _beta_divergence(X, W, H, self._beta_loss, square_root=True)\n self.n_components_ = H.shape[0]\n self.components_ = H\n self.n_iter_ = n_iter\n return W" }, { @@ -52560,7 +55059,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "W", @@ -52570,13 +55070,17 @@ "docstring": { "type": "{ndarray, sparse matrix} of shape (n_samples, n_components)", "description": "Transformed data matrix." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } } ], "results": [], "is_public": true, "description": "Transform data back to its original space.\n\n.. versionadded:: 0.18", - "docstring": "Transform data back to its original space.\n\n.. 
versionadded:: 0.18\n\nParameters\n----------\nW : {ndarray, sparse matrix} of shape (n_samples, n_components)\n Transformed data matrix.\n\nReturns\n-------\nX : {ndarray, sparse matrix} of shape (n_samples, n_features)\n Returns a data matrix of the original shape.", + "docstring": "Transform data back to its original space.\n\n .. versionadded:: 0.18\n\n Parameters\n ----------\n W : {ndarray, sparse matrix} of shape (n_samples, n_components)\n Transformed data matrix.\n\n Returns\n -------\n X : {ndarray, sparse matrix} of shape (n_samples, n_features)\n Returns a data matrix of the original shape.\n ", "source_code": "\ndef inverse_transform(self, W):\n \"\"\"Transform data back to its original space.\n\n .. versionadded:: 0.18\n\n Parameters\n ----------\n W : {ndarray, sparse matrix} of shape (n_samples, n_components)\n Transformed data matrix.\n\n Returns\n -------\n X : {ndarray, sparse matrix} of shape (n_samples, n_features)\n Returns a data matrix of the original shape.\n \"\"\"\n check_is_fitted(self)\n return np.dot(W, self.components_)" }, { @@ -52594,7 +55098,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -52604,13 +55109,17 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "Training vector, where `n_samples` is the number of samples\nand `n_features` is the number of features." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } } ], "results": [], "is_public": true, "description": "Transform the data X according to the fitted NMF model.", - "docstring": "Transform the data X according to the fitted NMF model.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training vector, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\nReturns\n-------\nW : ndarray of shape (n_samples, n_components)\n Transformed data.", + "docstring": "Transform the data X according to the fitted NMF model.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training vector, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n Returns\n -------\n W : ndarray of shape (n_samples, n_components)\n Transformed data.\n ", "source_code": "\ndef transform(self, X):\n \"\"\"Transform the data X according to the fitted NMF model.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training vector, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n Returns\n -------\n W : ndarray of shape (n_samples, n_components)\n Transformed data.\n \"\"\"\n check_is_fitted(self)\n X = self._validate_data(X, accept_sparse=('csr', 'csc'), dtype=[np.float64, np.float32], reset=False)\n with config_context(assume_finite=True):\n (W, *_) = self._fit_transform(X, H=self.components_, update_H=False)\n return W" }, { @@ -52628,7 +55137,8 @@ "docstring": { "type": "float or array-like of shape (n_samples, n_features)", "description": "" - } + }, + "refined_type": {} }, { "name": "W", @@ -52638,7 +55148,8 @@ "docstring": { "type": "float or array-like of shape (n_samples, n_components)", "description": "" - } + }, + "refined_type": {} }, { "name": "H", @@ -52648,7 +55159,8 @@ "docstring": { "type": "float or array-like of shape (n_components, n_features)", "description": "" - } + }, + "refined_type": {} }, { "name": "beta", @@ -52658,6 +55170,14 @@ 
"docstring": { "type": "float or {'frobenius', 'kullback-leibler', 'itakura-saito'}", "description": "Parameter of the beta-divergence.\nIf beta == 2, this is half the Frobenius *squared* norm.\nIf beta == 1, this is the generalized Kullback-Leibler divergence.\nIf beta == 0, this is the Itakura-Saito divergence.\nElse, this is the general beta-divergence." + }, + "refined_type": { + "kind": "EnumType", + "values": [ + "itakura-saito", + "frobenius", + "kullback-leibler" + ] } }, { @@ -52668,13 +55188,14 @@ "docstring": { "type": "bool, default=False", "description": "If True, return np.sqrt(2 * res)\nFor beta == 2, it corresponds to the Frobenius norm." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Compute the beta-divergence of X and dot(W, H).", - "docstring": "Compute the beta-divergence of X and dot(W, H).\n\nParameters\n----------\nX : float or array-like of shape (n_samples, n_features)\n\nW : float or array-like of shape (n_samples, n_components)\n\nH : float or array-like of shape (n_components, n_features)\n\nbeta : float or {'frobenius', 'kullback-leibler', 'itakura-saito'}\n Parameter of the beta-divergence.\n If beta == 2, this is half the Frobenius *squared* norm.\n If beta == 1, this is the generalized Kullback-Leibler divergence.\n If beta == 0, this is the Itakura-Saito divergence.\n Else, this is the general beta-divergence.\n\nsquare_root : bool, default=False\n If True, return np.sqrt(2 * res)\n For beta == 2, it corresponds to the Frobenius norm.\n\nReturns\n-------\n res : float\n Beta divergence of X and np.dot(X, H).", + "docstring": "Compute the beta-divergence of X and dot(W, H).\n\n Parameters\n ----------\n X : float or array-like of shape (n_samples, n_features)\n\n W : float or array-like of shape (n_samples, n_components)\n\n H : float or array-like of shape (n_components, n_features)\n\n beta : float or {'frobenius', 'kullback-leibler', 'itakura-saito'}\n Parameter of the beta-divergence.\n If beta == 2, this is half the Frobenius *squared* norm.\n If beta == 1, this is the generalized Kullback-Leibler divergence.\n If beta == 0, this is the Itakura-Saito divergence.\n Else, this is the general beta-divergence.\n\n square_root : bool, default=False\n If True, return np.sqrt(2 * res)\n For beta == 2, it corresponds to the Frobenius norm.\n\n Returns\n -------\n res : float\n Beta divergence of X and np.dot(X, H).\n ", "source_code": "\ndef _beta_divergence(X, W, H, beta, square_root=False):\n \"\"\"Compute the beta-divergence of X and dot(W, H).\n\n Parameters\n ----------\n X : float or array-like of shape (n_samples, n_features)\n\n W : float or array-like of shape (n_samples, n_components)\n\n H : float or array-like of shape (n_components, n_features)\n\n beta : float or {'frobenius', 'kullback-leibler', 'itakura-saito'}\n Parameter of the beta-divergence.\n If beta == 2, this is half the Frobenius *squared* norm.\n If beta == 1, this is the generalized Kullback-Leibler divergence.\n If beta == 0, this is the Itakura-Saito divergence.\n Else, this is the general beta-divergence.\n\n square_root : bool, default=False\n If True, return np.sqrt(2 * res)\n For beta == 2, it corresponds to the Frobenius norm.\n\n Returns\n -------\n res : float\n Beta divergence of X and np.dot(X, H).\n \"\"\"\n beta = _beta_loss_to_float(beta)\n if not sp.issparse(X):\n X = np.atleast_2d(X)\n W = np.atleast_2d(W)\n H = np.atleast_2d(H)\n if beta == 2:\n if sp.issparse(X):\n norm_X = np.dot(X.data, X.data)\n norm_WH = 
trace_dot(np.linalg.multi_dot([W.T, W, H]), H)\n cross_prod = trace_dot(X * H.T, W)\n res = (norm_X + norm_WH - 2.0 * cross_prod) / 2.0\n else:\n res = squared_norm(X - np.dot(W, H)) / 2.0\n if square_root:\n return np.sqrt(res * 2)\n else:\n return res\n if sp.issparse(X):\n WH_data = _special_sparse_dot(W, H, X).data\n X_data = X.data\n else:\n WH = np.dot(W, H)\n WH_data = WH.ravel()\n X_data = X.ravel()\n indices = X_data > EPSILON\n WH_data = WH_data[indices]\n X_data = X_data[indices]\n WH_data[WH_data == 0] = EPSILON\n if beta == 1:\n sum_WH = np.dot(np.sum(W, axis=0), np.sum(H, axis=1))\n div = X_data / WH_data\n res = np.dot(X_data, np.log(div))\n res += sum_WH - X_data.sum()\n elif beta == 0:\n div = X_data / WH_data\n res = np.sum(div) - np.product(X.shape) - np.sum(np.log(div))\n else:\n if sp.issparse(X):\n sum_WH_beta = 0\n for i in range(X.shape[1]):\n sum_WH_beta += np.sum(np.dot(W, H[:, i])**beta)\n else:\n sum_WH_beta = np.sum(WH**beta)\n sum_X_WH = np.dot(X_data, WH_data**(beta - 1))\n res = (X_data**beta).sum() - beta * sum_X_WH\n res += sum_WH_beta * (beta - 1)\n res /= beta * (beta - 1)\n if square_root:\n return np.sqrt(2 * res)\n else:\n return res" }, { @@ -52692,7 +55213,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -52716,7 +55238,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "shape", @@ -52726,7 +55249,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "whom", @@ -52736,13 +55260,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _check_init(A, shape, whom):\n A = check_array(A)\n if np.shape(A) != shape:\n raise ValueError('Array with wrong shape passed to %s. Expected %s, but got %s ' % (whom, shape, np.shape(A)))\n check_non_negative(A, whom)\n if np.max(A) == 0:\n raise ValueError('Array passed to %s is full of zeros.' % whom)" }, { @@ -52760,7 +55285,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "alpha_W", @@ -52770,7 +55296,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "alpha_H", @@ -52780,7 +55307,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "l1_ratio", @@ -52790,7 +55318,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "regularization", @@ -52800,7 +55329,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -52824,7 +55354,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "Constant matrix." - } + }, + "refined_type": {} }, { "name": "W", @@ -52834,7 +55365,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_components)", "description": "Initial guess for the solution." - } + }, + "refined_type": {} }, { "name": "H", @@ -52844,7 +55376,8 @@ "docstring": { "type": "array-like of shape (n_components, n_features)", "description": "Initial guess for the solution." - } + }, + "refined_type": {} }, { "name": "tol", @@ -52854,7 +55387,8 @@ "docstring": { "type": "float, default=1e-4", "description": "Tolerance of the stopping condition." 
- } + }, + "refined_type": {} }, { "name": "max_iter", @@ -52864,7 +55398,8 @@ "docstring": { "type": "int, default=200", "description": "Maximum number of iterations before timing out." - } + }, + "refined_type": {} }, { "name": "l1_reg_W", @@ -52874,7 +55409,8 @@ "docstring": { "type": "float, default=0.", "description": "L1 regularization parameter for W." - } + }, + "refined_type": {} }, { "name": "l1_reg_H", @@ -52884,7 +55420,8 @@ "docstring": { "type": "float, default=0.", "description": "L1 regularization parameter for H." - } + }, + "refined_type": {} }, { "name": "l2_reg_W", @@ -52894,7 +55431,8 @@ "docstring": { "type": "float, default=0.", "description": "L2 regularization parameter for W." - } + }, + "refined_type": {} }, { "name": "l2_reg_H", @@ -52904,7 +55442,8 @@ "docstring": { "type": "float, default=0.", "description": "L2 regularization parameter for H." - } + }, + "refined_type": {} }, { "name": "update_H", @@ -52914,7 +55453,8 @@ "docstring": { "type": "bool, default=True", "description": "Set to True, both W and H will be estimated from initial guesses.\nSet to False, only W will be estimated." - } + }, + "refined_type": {} }, { "name": "verbose", @@ -52924,7 +55464,8 @@ "docstring": { "type": "int, default=0", "description": "The verbosity level." - } + }, + "refined_type": {} }, { "name": "shuffle", @@ -52934,7 +55475,8 @@ "docstring": { "type": "bool, default=False", "description": "If true, randomize the order of coordinates in the CD solver." - } + }, + "refined_type": {} }, { "name": "random_state", @@ -52944,13 +55486,14 @@ "docstring": { "type": "int, RandomState instance or None, default=None", "description": "Used to randomize the coordinates in the CD solver, when\n``shuffle`` is set to ``True``. Pass an int for reproducible\nresults across multiple function calls.\nSee :term:`Glossary `." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Compute Non-negative Matrix Factorization (NMF) with Coordinate Descent\n\nThe objective function is minimized with an alternating minimization of W and H. Each minimization is done with a cyclic (up to a permutation of the features) Coordinate Descent.", - "docstring": "Compute Non-negative Matrix Factorization (NMF) with Coordinate Descent\n\nThe objective function is minimized with an alternating minimization of W\nand H. 
Each minimization is done with a cyclic (up to a permutation of the\nfeatures) Coordinate Descent.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n Constant matrix.\n\nW : array-like of shape (n_samples, n_components)\n Initial guess for the solution.\n\nH : array-like of shape (n_components, n_features)\n Initial guess for the solution.\n\ntol : float, default=1e-4\n Tolerance of the stopping condition.\n\nmax_iter : int, default=200\n Maximum number of iterations before timing out.\n\nl1_reg_W : float, default=0.\n L1 regularization parameter for W.\n\nl1_reg_H : float, default=0.\n L1 regularization parameter for H.\n\nl2_reg_W : float, default=0.\n L2 regularization parameter for W.\n\nl2_reg_H : float, default=0.\n L2 regularization parameter for H.\n\nupdate_H : bool, default=True\n Set to True, both W and H will be estimated from initial guesses.\n Set to False, only W will be estimated.\n\nverbose : int, default=0\n The verbosity level.\n\nshuffle : bool, default=False\n If true, randomize the order of coordinates in the CD solver.\n\nrandom_state : int, RandomState instance or None, default=None\n Used to randomize the coordinates in the CD solver, when\n ``shuffle`` is set to ``True``. Pass an int for reproducible\n results across multiple function calls.\n See :term:`Glossary `.\n\nReturns\n-------\nW : ndarray of shape (n_samples, n_components)\n Solution to the non-negative least squares problem.\n\nH : ndarray of shape (n_components, n_features)\n Solution to the non-negative least squares problem.\n\nn_iter : int\n The number of iterations done by the algorithm.\n\nReferences\n----------\nCichocki, Andrzej, and Phan, Anh-Huy. \"Fast local algorithms for\nlarge scale nonnegative matrix and tensor factorizations.\"\nIEICE transactions on fundamentals of electronics, communications and\ncomputer sciences 92.3: 708-721, 2009.", + "description": "Compute Non-negative Matrix Factorization (NMF) with Coordinate Descent\n\nThe objective function is minimized with an alternating minimization of W\nand H. Each minimization is done with a cyclic (up to a permutation of the\nfeatures) Coordinate Descent.", + "docstring": "Compute Non-negative Matrix Factorization (NMF) with Coordinate Descent\n\n The objective function is minimized with an alternating minimization of W\n and H. 
Each minimization is done with a cyclic (up to a permutation of the\n features) Coordinate Descent.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Constant matrix.\n\n W : array-like of shape (n_samples, n_components)\n Initial guess for the solution.\n\n H : array-like of shape (n_components, n_features)\n Initial guess for the solution.\n\n tol : float, default=1e-4\n Tolerance of the stopping condition.\n\n max_iter : int, default=200\n Maximum number of iterations before timing out.\n\n l1_reg_W : float, default=0.\n L1 regularization parameter for W.\n\n l1_reg_H : float, default=0.\n L1 regularization parameter for H.\n\n l2_reg_W : float, default=0.\n L2 regularization parameter for W.\n\n l2_reg_H : float, default=0.\n L2 regularization parameter for H.\n\n update_H : bool, default=True\n Set to True, both W and H will be estimated from initial guesses.\n Set to False, only W will be estimated.\n\n verbose : int, default=0\n The verbosity level.\n\n shuffle : bool, default=False\n If true, randomize the order of coordinates in the CD solver.\n\n random_state : int, RandomState instance or None, default=None\n Used to randomize the coordinates in the CD solver, when\n ``shuffle`` is set to ``True``. Pass an int for reproducible\n results across multiple function calls.\n See :term:`Glossary `.\n\n Returns\n -------\n W : ndarray of shape (n_samples, n_components)\n Solution to the non-negative least squares problem.\n\n H : ndarray of shape (n_components, n_features)\n Solution to the non-negative least squares problem.\n\n n_iter : int\n The number of iterations done by the algorithm.\n\n References\n ----------\n Cichocki, Andrzej, and Phan, Anh-Huy. \"Fast local algorithms for\n large scale nonnegative matrix and tensor factorizations.\"\n IEICE transactions on fundamentals of electronics, communications and\n computer sciences 92.3: 708-721, 2009.\n ", "source_code": "\ndef _fit_coordinate_descent(X, W, H, tol=0.0001, max_iter=200, l1_reg_W=0, l1_reg_H=0, l2_reg_W=0, l2_reg_H=0, update_H=True, verbose=0, shuffle=False, random_state=None):\n \"\"\"Compute Non-negative Matrix Factorization (NMF) with Coordinate Descent\n\n The objective function is minimized with an alternating minimization of W\n and H. Each minimization is done with a cyclic (up to a permutation of the\n features) Coordinate Descent.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Constant matrix.\n\n W : array-like of shape (n_samples, n_components)\n Initial guess for the solution.\n\n H : array-like of shape (n_components, n_features)\n Initial guess for the solution.\n\n tol : float, default=1e-4\n Tolerance of the stopping condition.\n\n max_iter : int, default=200\n Maximum number of iterations before timing out.\n\n l1_reg_W : float, default=0.\n L1 regularization parameter for W.\n\n l1_reg_H : float, default=0.\n L1 regularization parameter for H.\n\n l2_reg_W : float, default=0.\n L2 regularization parameter for W.\n\n l2_reg_H : float, default=0.\n L2 regularization parameter for H.\n\n update_H : bool, default=True\n Set to True, both W and H will be estimated from initial guesses.\n Set to False, only W will be estimated.\n\n verbose : int, default=0\n The verbosity level.\n\n shuffle : bool, default=False\n If true, randomize the order of coordinates in the CD solver.\n\n random_state : int, RandomState instance or None, default=None\n Used to randomize the coordinates in the CD solver, when\n ``shuffle`` is set to ``True``. 
Pass an int for reproducible\n results across multiple function calls.\n See :term:`Glossary `.\n\n Returns\n -------\n W : ndarray of shape (n_samples, n_components)\n Solution to the non-negative least squares problem.\n\n H : ndarray of shape (n_components, n_features)\n Solution to the non-negative least squares problem.\n\n n_iter : int\n The number of iterations done by the algorithm.\n\n References\n ----------\n Cichocki, Andrzej, and Phan, Anh-Huy. \"Fast local algorithms for\n large scale nonnegative matrix and tensor factorizations.\"\n IEICE transactions on fundamentals of electronics, communications and\n computer sciences 92.3: 708-721, 2009.\n \"\"\"\n Ht = check_array(H.T, order='C')\n X = check_array(X, accept_sparse='csr')\n rng = check_random_state(random_state)\n for n_iter in range(1, max_iter + 1):\n violation = 0.0\n violation += _update_coordinate_descent(X, W, Ht, l1_reg_W, l2_reg_W, shuffle, rng)\n if update_H:\n violation += _update_coordinate_descent(X.T, Ht, W, l1_reg_H, l2_reg_H, shuffle, rng)\n if n_iter == 1:\n violation_init = violation\n if violation_init == 0:\n break\n if verbose:\n print('violation:', violation / violation_init)\n if violation / violation_init <= tol:\n if verbose:\n print('Converged at iteration', n_iter + 1)\n break\n return W, Ht.T, n_iter" }, { @@ -52968,7 +55511,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "Constant input matrix." - } + }, + "refined_type": {} }, { "name": "W", @@ -52978,7 +55522,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_components)", "description": "Initial guess for the solution." - } + }, + "refined_type": {} }, { "name": "H", @@ -52988,7 +55533,8 @@ "docstring": { "type": "array-like of shape (n_components, n_features)", "description": "Initial guess for the solution." - } + }, + "refined_type": {} }, { "name": "beta_loss", @@ -52998,6 +55544,14 @@ "docstring": { "type": "float or {'frobenius', 'kullback-leibler', 'itakura-saito'}, default='frobenius'", "description": "String must be in {'frobenius', 'kullback-leibler', 'itakura-saito'}.\nBeta divergence to be minimized, measuring the distance between X\nand the dot product WH. Note that values different from 'frobenius'\n(or 2) and 'kullback-leibler' (or 1) lead to significantly slower\nfits. Note that for beta_loss <= 0 (or 'itakura-saito'), the input\nmatrix X cannot contain zeros." + }, + "refined_type": { + "kind": "EnumType", + "values": [ + "itakura-saito", + "frobenius", + "kullback-leibler" + ] } }, { @@ -53008,7 +55562,8 @@ "docstring": { "type": "int, default=200", "description": "Number of iterations." - } + }, + "refined_type": {} }, { "name": "tol", @@ -53018,7 +55573,8 @@ "docstring": { "type": "float, default=1e-4", "description": "Tolerance of the stopping condition." - } + }, + "refined_type": {} }, { "name": "l1_reg_W", @@ -53028,7 +55584,8 @@ "docstring": { "type": "float, default=0.", "description": "L1 regularization parameter for W." - } + }, + "refined_type": {} }, { "name": "l1_reg_H", @@ -53038,7 +55595,8 @@ "docstring": { "type": "float, default=0.", "description": "L1 regularization parameter for H." - } + }, + "refined_type": {} }, { "name": "l2_reg_W", @@ -53048,7 +55606,8 @@ "docstring": { "type": "float, default=0.", "description": "L2 regularization parameter for W." - } + }, + "refined_type": {} }, { "name": "l2_reg_H", @@ -53058,7 +55617,8 @@ "docstring": { "type": "float, default=0.", "description": "L2 regularization parameter for H." 
- } + }, + "refined_type": {} }, { "name": "update_H", @@ -53068,7 +55628,8 @@ "docstring": { "type": "bool, default=True", "description": "Set to True, both W and H will be estimated from initial guesses.\nSet to False, only W will be estimated." - } + }, + "refined_type": {} }, { "name": "verbose", @@ -53078,13 +55639,14 @@ "docstring": { "type": "int, default=0", "description": "The verbosity level." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Compute Non-negative Matrix Factorization with Multiplicative Update.\n\nThe objective function is _beta_divergence(X, WH) and is minimized with an alternating minimization of W and H. Each minimization is done with a Multiplicative Update.", - "docstring": "Compute Non-negative Matrix Factorization with Multiplicative Update.\n\nThe objective function is _beta_divergence(X, WH) and is minimized with an\nalternating minimization of W and H. Each minimization is done with a\nMultiplicative Update.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n Constant input matrix.\n\nW : array-like of shape (n_samples, n_components)\n Initial guess for the solution.\n\nH : array-like of shape (n_components, n_features)\n Initial guess for the solution.\n\nbeta_loss : float or {'frobenius', 'kullback-leibler', 'itakura-saito'}, default='frobenius'\n String must be in {'frobenius', 'kullback-leibler', 'itakura-saito'}.\n Beta divergence to be minimized, measuring the distance between X\n and the dot product WH. Note that values different from 'frobenius'\n (or 2) and 'kullback-leibler' (or 1) lead to significantly slower\n fits. Note that for beta_loss <= 0 (or 'itakura-saito'), the input\n matrix X cannot contain zeros.\n\nmax_iter : int, default=200\n Number of iterations.\n\ntol : float, default=1e-4\n Tolerance of the stopping condition.\n\nl1_reg_W : float, default=0.\n L1 regularization parameter for W.\n\nl1_reg_H : float, default=0.\n L1 regularization parameter for H.\n\nl2_reg_W : float, default=0.\n L2 regularization parameter for W.\n\nl2_reg_H : float, default=0.\n L2 regularization parameter for H.\n\nupdate_H : bool, default=True\n Set to True, both W and H will be estimated from initial guesses.\n Set to False, only W will be estimated.\n\nverbose : int, default=0\n The verbosity level.\n\nReturns\n-------\nW : ndarray of shape (n_samples, n_components)\n Solution to the non-negative least squares problem.\n\nH : ndarray of shape (n_components, n_features)\n Solution to the non-negative least squares problem.\n\nn_iter : int\n The number of iterations done by the algorithm.\n\nReferences\n----------\nFevotte, C., & Idier, J. (2011). Algorithms for nonnegative matrix\nfactorization with the beta-divergence. Neural Computation, 23(9).", + "description": "Compute Non-negative Matrix Factorization with Multiplicative Update.\n\nThe objective function is _beta_divergence(X, WH) and is minimized with an\nalternating minimization of W and H. Each minimization is done with a\nMultiplicative Update.", + "docstring": "Compute Non-negative Matrix Factorization with Multiplicative Update.\n\n The objective function is _beta_divergence(X, WH) and is minimized with an\n alternating minimization of W and H. 
Each minimization is done with a\n Multiplicative Update.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Constant input matrix.\n\n W : array-like of shape (n_samples, n_components)\n Initial guess for the solution.\n\n H : array-like of shape (n_components, n_features)\n Initial guess for the solution.\n\n beta_loss : float or {'frobenius', 'kullback-leibler', 'itakura-saito'}, default='frobenius'\n String must be in {'frobenius', 'kullback-leibler', 'itakura-saito'}.\n Beta divergence to be minimized, measuring the distance between X\n and the dot product WH. Note that values different from 'frobenius'\n (or 2) and 'kullback-leibler' (or 1) lead to significantly slower\n fits. Note that for beta_loss <= 0 (or 'itakura-saito'), the input\n matrix X cannot contain zeros.\n\n max_iter : int, default=200\n Number of iterations.\n\n tol : float, default=1e-4\n Tolerance of the stopping condition.\n\n l1_reg_W : float, default=0.\n L1 regularization parameter for W.\n\n l1_reg_H : float, default=0.\n L1 regularization parameter for H.\n\n l2_reg_W : float, default=0.\n L2 regularization parameter for W.\n\n l2_reg_H : float, default=0.\n L2 regularization parameter for H.\n\n update_H : bool, default=True\n Set to True, both W and H will be estimated from initial guesses.\n Set to False, only W will be estimated.\n\n verbose : int, default=0\n The verbosity level.\n\n Returns\n -------\n W : ndarray of shape (n_samples, n_components)\n Solution to the non-negative least squares problem.\n\n H : ndarray of shape (n_components, n_features)\n Solution to the non-negative least squares problem.\n\n n_iter : int\n The number of iterations done by the algorithm.\n\n References\n ----------\n Fevotte, C., & Idier, J. (2011). Algorithms for nonnegative matrix\n factorization with the beta-divergence. Neural Computation, 23(9).\n ", "source_code": "\ndef _fit_multiplicative_update(X, W, H, beta_loss='frobenius', max_iter=200, tol=0.0001, l1_reg_W=0, l1_reg_H=0, l2_reg_W=0, l2_reg_H=0, update_H=True, verbose=0):\n \"\"\"Compute Non-negative Matrix Factorization with Multiplicative Update.\n\n The objective function is _beta_divergence(X, WH) and is minimized with an\n alternating minimization of W and H. Each minimization is done with a\n Multiplicative Update.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Constant input matrix.\n\n W : array-like of shape (n_samples, n_components)\n Initial guess for the solution.\n\n H : array-like of shape (n_components, n_features)\n Initial guess for the solution.\n\n beta_loss : float or {'frobenius', 'kullback-leibler', 'itakura-saito'}, default='frobenius'\n String must be in {'frobenius', 'kullback-leibler', 'itakura-saito'}.\n Beta divergence to be minimized, measuring the distance between X\n and the dot product WH. Note that values different from 'frobenius'\n (or 2) and 'kullback-leibler' (or 1) lead to significantly slower\n fits. 
Note that for beta_loss <= 0 (or 'itakura-saito'), the input\n matrix X cannot contain zeros.\n\n max_iter : int, default=200\n Number of iterations.\n\n tol : float, default=1e-4\n Tolerance of the stopping condition.\n\n l1_reg_W : float, default=0.\n L1 regularization parameter for W.\n\n l1_reg_H : float, default=0.\n L1 regularization parameter for H.\n\n l2_reg_W : float, default=0.\n L2 regularization parameter for W.\n\n l2_reg_H : float, default=0.\n L2 regularization parameter for H.\n\n update_H : bool, default=True\n Set to True, both W and H will be estimated from initial guesses.\n Set to False, only W will be estimated.\n\n verbose : int, default=0\n The verbosity level.\n\n Returns\n -------\n W : ndarray of shape (n_samples, n_components)\n Solution to the non-negative least squares problem.\n\n H : ndarray of shape (n_components, n_features)\n Solution to the non-negative least squares problem.\n\n n_iter : int\n The number of iterations done by the algorithm.\n\n References\n ----------\n Fevotte, C., & Idier, J. (2011). Algorithms for nonnegative matrix\n factorization with the beta-divergence. Neural Computation, 23(9).\n \"\"\"\n start_time = time.time()\n beta_loss = _beta_loss_to_float(beta_loss)\n if beta_loss < 1:\n gamma = 1.0 / (2.0 - beta_loss)\n elif beta_loss > 2:\n gamma = 1.0 / (beta_loss - 1.0)\n else:\n gamma = 1.0\n error_at_init = _beta_divergence(X, W, H, beta_loss, square_root=True)\n previous_error = error_at_init\n (H_sum, HHt, XHt) = (None, None, None)\n for n_iter in range(1, max_iter + 1):\n (delta_W, H_sum, HHt, XHt) = _multiplicative_update_w(X, W, H, beta_loss, l1_reg_W, l2_reg_W, gamma, H_sum, HHt, XHt, update_H)\n W *= delta_W\n if beta_loss < 1:\n W[W < np.finfo(np.float64).eps] = 0.0\n if update_H:\n delta_H = _multiplicative_update_h(X, W, H, beta_loss, l1_reg_H, l2_reg_H, gamma)\n H *= delta_H\n (H_sum, HHt, XHt) = (None, None, None)\n if beta_loss <= 1:\n H[H < np.finfo(np.float64).eps] = 0.0\n if tol > 0 and n_iter % 10 == 0:\n error = _beta_divergence(X, W, H, beta_loss, square_root=True)\n if verbose:\n iter_time = time.time()\n print('Epoch %02d reached after %.3f seconds, error: %f' % (n_iter, iter_time - start_time, error))\n if (previous_error - error) / error_at_init < tol:\n break\n previous_error = error\n if verbose and (tol == 0 or n_iter % 10 != 0):\n end_time = time.time()\n print('Epoch %02d reached after %.3f seconds.' % (n_iter, end_time - start_time))\n return W, H, n_iter" }, { @@ -53102,7 +55664,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "The data matrix to be decomposed." - } + }, + "refined_type": {} }, { "name": "n_components", @@ -53112,7 +55675,8 @@ "docstring": { "type": "int", "description": "The number of components desired in the approximation." 
- } + }, + "refined_type": {} }, { "name": "init", @@ -53120,8 +55684,12 @@ "is_public": false, "assigned_by": "POSITION_OR_NAME", "docstring": { - "type": " {'random', 'nndsvd', 'nndsvda', 'nndsvdar'}, default=None", + "type": "{'random', 'nndsvd', 'nndsvda', 'nndsvdar'}, default=None", "description": "Method used to initialize the procedure.\nDefault: None.\nValid options:\n\n- None: 'nndsvd' if n_components <= min(n_samples, n_features),\n otherwise 'random'.\n\n- 'random': non-negative random matrices, scaled with:\n sqrt(X.mean() / n_components)\n\n- 'nndsvd': Nonnegative Double Singular Value Decomposition (NNDSVD)\n initialization (better for sparseness)\n\n- 'nndsvda': NNDSVD with zeros filled with the average of X\n (better when sparsity is not desired)\n\n- 'nndsvdar': NNDSVD with zeros filled with small random values\n (generally faster, less accurate alternative to NNDSVDa\n for when sparsity is not desired)\n\n- 'custom': use custom matrices W and H" + }, + "refined_type": { + "kind": "EnumType", + "values": ["random", "nndsvd", "nndsvdar", "nndsvda"] } }, { @@ -53132,7 +55700,8 @@ "docstring": { "type": "float, default=1e-6", "description": "Truncate all values less then this in output to zero." - } + }, + "refined_type": {} }, { "name": "random_state", @@ -53142,13 +55711,14 @@ "docstring": { "type": "int, RandomState instance or None, default=None", "description": "Used when ``init`` == 'nndsvdar' or 'random'. Pass an int for\nreproducible results across multiple function calls.\nSee :term:`Glossary `." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Algorithms for NMF initialization.\n\nComputes an initial guess for the non-negative rank k matrix approximation for X: X = WH.", - "docstring": "Algorithms for NMF initialization.\n\nComputes an initial guess for the non-negative\nrank k matrix approximation for X: X = WH.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n The data matrix to be decomposed.\n\nn_components : int\n The number of components desired in the approximation.\n\ninit : {'random', 'nndsvd', 'nndsvda', 'nndsvdar'}, default=None\n Method used to initialize the procedure.\n Default: None.\n Valid options:\n\n - None: 'nndsvd' if n_components <= min(n_samples, n_features),\n otherwise 'random'.\n\n - 'random': non-negative random matrices, scaled with:\n sqrt(X.mean() / n_components)\n\n - 'nndsvd': Nonnegative Double Singular Value Decomposition (NNDSVD)\n initialization (better for sparseness)\n\n - 'nndsvda': NNDSVD with zeros filled with the average of X\n (better when sparsity is not desired)\n\n - 'nndsvdar': NNDSVD with zeros filled with small random values\n (generally faster, less accurate alternative to NNDSVDa\n for when sparsity is not desired)\n\n - 'custom': use custom matrices W and H\n\neps : float, default=1e-6\n Truncate all values less then this in output to zero.\n\nrandom_state : int, RandomState instance or None, default=None\n Used when ``init`` == 'nndsvdar' or 'random'. Pass an int for\n reproducible results across multiple function calls.\n See :term:`Glossary `.\n\nReturns\n-------\nW : array-like of shape (n_samples, n_components)\n Initial guesses for solving X ~= WH.\n\nH : array-like of shape (n_components, n_features)\n Initial guesses for solving X ~= WH.\n\nReferences\n----------\nC. Boutsidis, E. 
Gallopoulos: SVD based initialization: A head start for\nnonnegative matrix factorization - Pattern Recognition, 2008\nhttp://tinyurl.com/nndsvd", + "description": "Algorithms for NMF initialization.\n\nComputes an initial guess for the non-negative\nrank k matrix approximation for X: X = WH.", + "docstring": "Algorithms for NMF initialization.\n\n Computes an initial guess for the non-negative\n rank k matrix approximation for X: X = WH.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The data matrix to be decomposed.\n\n n_components : int\n The number of components desired in the approximation.\n\n init : {'random', 'nndsvd', 'nndsvda', 'nndsvdar'}, default=None\n Method used to initialize the procedure.\n Default: None.\n Valid options:\n\n - None: 'nndsvd' if n_components <= min(n_samples, n_features),\n otherwise 'random'.\n\n - 'random': non-negative random matrices, scaled with:\n sqrt(X.mean() / n_components)\n\n - 'nndsvd': Nonnegative Double Singular Value Decomposition (NNDSVD)\n initialization (better for sparseness)\n\n - 'nndsvda': NNDSVD with zeros filled with the average of X\n (better when sparsity is not desired)\n\n - 'nndsvdar': NNDSVD with zeros filled with small random values\n (generally faster, less accurate alternative to NNDSVDa\n for when sparsity is not desired)\n\n - 'custom': use custom matrices W and H\n\n eps : float, default=1e-6\n Truncate all values less then this in output to zero.\n\n random_state : int, RandomState instance or None, default=None\n Used when ``init`` == 'nndsvdar' or 'random'. Pass an int for\n reproducible results across multiple function calls.\n See :term:`Glossary `.\n\n Returns\n -------\n W : array-like of shape (n_samples, n_components)\n Initial guesses for solving X ~= WH.\n\n H : array-like of shape (n_components, n_features)\n Initial guesses for solving X ~= WH.\n\n References\n ----------\n C. Boutsidis, E. Gallopoulos: SVD based initialization: A head start for\n nonnegative matrix factorization - Pattern Recognition, 2008\n http://tinyurl.com/nndsvd\n ", "source_code": "\ndef _initialize_nmf(X, n_components, init='warn', eps=1e-06, random_state=None):\n \"\"\"Algorithms for NMF initialization.\n\n Computes an initial guess for the non-negative\n rank k matrix approximation for X: X = WH.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The data matrix to be decomposed.\n\n n_components : int\n The number of components desired in the approximation.\n\n init : {'random', 'nndsvd', 'nndsvda', 'nndsvdar'}, default=None\n Method used to initialize the procedure.\n Default: None.\n Valid options:\n\n - None: 'nndsvd' if n_components <= min(n_samples, n_features),\n otherwise 'random'.\n\n - 'random': non-negative random matrices, scaled with:\n sqrt(X.mean() / n_components)\n\n - 'nndsvd': Nonnegative Double Singular Value Decomposition (NNDSVD)\n initialization (better for sparseness)\n\n - 'nndsvda': NNDSVD with zeros filled with the average of X\n (better when sparsity is not desired)\n\n - 'nndsvdar': NNDSVD with zeros filled with small random values\n (generally faster, less accurate alternative to NNDSVDa\n for when sparsity is not desired)\n\n - 'custom': use custom matrices W and H\n\n eps : float, default=1e-6\n Truncate all values less then this in output to zero.\n\n random_state : int, RandomState instance or None, default=None\n Used when ``init`` == 'nndsvdar' or 'random'. 
Pass an int for\n reproducible results across multiple function calls.\n See :term:`Glossary `.\n\n Returns\n -------\n W : array-like of shape (n_samples, n_components)\n Initial guesses for solving X ~= WH.\n\n H : array-like of shape (n_components, n_features)\n Initial guesses for solving X ~= WH.\n\n References\n ----------\n C. Boutsidis, E. Gallopoulos: SVD based initialization: A head start for\n nonnegative matrix factorization - Pattern Recognition, 2008\n http://tinyurl.com/nndsvd\n \"\"\"\n if init == 'warn':\n warnings.warn(\"The 'init' value, when 'init=None' and n_components is less than n_samples and n_features, will be changed from 'nndsvd' to 'nndsvda' in 1.1 (renaming of 0.26).\", FutureWarning)\n init = None\n check_non_negative(X, 'NMF initialization')\n (n_samples, n_features) = X.shape\n if init is not None and init != 'random' and n_components > min(n_samples, n_features):\n raise ValueError(\"init = '{}' can only be used when n_components <= min(n_samples, n_features)\".format(init))\n if init is None:\n if n_components <= min(n_samples, n_features):\n init = 'nndsvd'\n else:\n init = 'random'\n if init == 'random':\n avg = np.sqrt(X.mean() / n_components)\n rng = check_random_state(random_state)\n H = avg * rng.randn(n_components, n_features).astype(X.dtype, copy=False)\n W = avg * rng.randn(n_samples, n_components).astype(X.dtype, copy=False)\n np.abs(H, out=H)\n np.abs(W, out=W)\n return W, H\n (U, S, V) = randomized_svd(X, n_components, random_state=random_state)\n W = np.zeros_like(U)\n H = np.zeros_like(V)\n W[:, 0] = np.sqrt(S[0]) * np.abs(U[:, 0])\n H[0, :] = np.sqrt(S[0]) * np.abs(V[0, :])\n for j in range(1, n_components):\n (x, y) = (U[:, j], V[j, :])\n (x_p, y_p) = (np.maximum(x, 0), np.maximum(y, 0))\n (x_n, y_n) = (np.abs(np.minimum(x, 0)), np.abs(np.minimum(y, 0)))\n (x_p_nrm, y_p_nrm) = (norm(x_p), norm(y_p))\n (x_n_nrm, y_n_nrm) = (norm(x_n), norm(y_n))\n (m_p, m_n) = (x_p_nrm * y_p_nrm, x_n_nrm * y_n_nrm)\n if m_p > m_n:\n u = x_p / x_p_nrm\n v = y_p / y_p_nrm\n sigma = m_p\n else:\n u = x_n / x_n_nrm\n v = y_n / y_n_nrm\n sigma = m_n\n lbd = np.sqrt(S[j] * sigma)\n W[:, j] = lbd * u\n H[j, :] = lbd * v\n W[W < eps] = 0\n H[H < eps] = 0\n if init == 'nndsvd':\n pass\n elif init == 'nndsvda':\n avg = X.mean()\n W[W == 0] = avg\n H[H == 0] = avg\n elif init == 'nndsvdar':\n rng = check_random_state(random_state)\n avg = X.mean()\n W[W == 0] = abs(avg * rng.randn(len(W[W == 0])) / 100)\n H[H == 0] = abs(avg * rng.randn(len(H[H == 0])) / 100)\n else:\n raise ValueError('Invalid init parameter: got %r instead of one of %r' % (init, (None, 'random', 'nndsvd', 'nndsvda', 'nndsvdar')))\n return W, H" }, { @@ -53166,7 +55736,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "W", @@ -53176,7 +55747,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "H", @@ -53186,7 +55758,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "beta_loss", @@ -53196,7 +55769,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "l1_reg_H", @@ -53206,7 +55780,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "l2_reg_H", @@ -53216,7 +55791,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "gamma", @@ -53226,7 +55802,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -53250,7 
+55827,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "W", @@ -53260,7 +55838,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "H", @@ -53270,7 +55849,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "beta_loss", @@ -53280,7 +55860,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "l1_reg_W", @@ -53290,7 +55871,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "l2_reg_W", @@ -53300,7 +55882,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "gamma", @@ -53310,7 +55893,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "H_sum", @@ -53320,7 +55904,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "HHt", @@ -53330,7 +55915,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "XHt", @@ -53340,7 +55926,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "update_H", @@ -53350,7 +55937,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -53374,7 +55962,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "H", @@ -53384,7 +55973,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -53394,7 +55984,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -53418,7 +56009,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "W", @@ -53428,7 +56020,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "Ht", @@ -53438,7 +56031,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "l1_reg", @@ -53448,7 +56042,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "l2_reg", @@ -53458,7 +56053,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "shuffle", @@ -53468,7 +56064,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "random_state", @@ -53478,13 +56075,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Helper function for _fit_coordinate_descent.\n\nUpdate W to minimize the objective function, iterating once over all coordinates. By symmetry, to update H, one can call _update_coordinate_descent(X.T, Ht, W, ...).", - "docstring": "Helper function for _fit_coordinate_descent.\n\nUpdate W to minimize the objective function, iterating once over all\ncoordinates. By symmetry, to update H, one can call\n_update_coordinate_descent(X.T, Ht, W, ...).", + "description": "Helper function for _fit_coordinate_descent.\n\nUpdate W to minimize the objective function, iterating once over all\ncoordinates. By symmetry, to update H, one can call\n_update_coordinate_descent(X.T, Ht, W, ...).", + "docstring": "Helper function for _fit_coordinate_descent.\n\n Update W to minimize the objective function, iterating once over all\n coordinates. 
By symmetry, to update H, one can call\n _update_coordinate_descent(X.T, Ht, W, ...).\n\n ", "source_code": "\ndef _update_coordinate_descent(X, W, Ht, l1_reg, l2_reg, shuffle, random_state):\n \"\"\"Helper function for _fit_coordinate_descent.\n\n Update W to minimize the objective function, iterating once over all\n coordinates. By symmetry, to update H, one can call\n _update_coordinate_descent(X.T, Ht, W, ...).\n\n \"\"\"\n n_components = Ht.shape[1]\n HHt = np.dot(Ht.T, Ht)\n XHt = safe_sparse_dot(X, Ht)\n if l2_reg != 0.0:\n HHt.flat[::n_components + 1] += l2_reg\n if l1_reg != 0.0:\n XHt -= l1_reg\n if shuffle:\n permutation = random_state.permutation(n_components)\n else:\n permutation = np.arange(n_components)\n permutation = np.asarray(permutation, dtype=np.intp)\n return _update_cdnmf_fast(W, HHt, XHt, permutation)" }, { @@ -53502,7 +56100,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "Constant matrix." - } + }, + "refined_type": {} }, { "name": "W", @@ -53512,7 +56111,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_components), default=None", "description": "If init='custom', it is used as initial guess for the solution." - } + }, + "refined_type": {} }, { "name": "H", @@ -53522,7 +56122,8 @@ "docstring": { "type": "array-like of shape (n_components, n_features), default=None", "description": "If init='custom', it is used as initial guess for the solution.\nIf update_H=False, it is used as a constant, to solve for W only." - } + }, + "refined_type": {} }, { "name": "n_components", @@ -53532,7 +56133,8 @@ "docstring": { "type": "int, default=None", "description": "Number of components, if n_components is not set all features\nare kept." - } + }, + "refined_type": {} }, { "name": "init", @@ -53542,6 +56144,16 @@ "docstring": { "type": "{'random', 'nndsvd', 'nndsvda', 'nndsvdar', 'custom'}, default=None", "description": "Method used to initialize the procedure.\n\nValid options:\n\n- None: 'nndsvd' if n_components < n_features, otherwise 'random'.\n\n- 'random': non-negative random matrices, scaled with:\n sqrt(X.mean() / n_components)\n\n- 'nndsvd': Nonnegative Double Singular Value Decomposition (NNDSVD)\n initialization (better for sparseness)\n\n- 'nndsvda': NNDSVD with zeros filled with the average of X\n (better when sparsity is not desired)\n\n- 'nndsvdar': NNDSVD with zeros filled with small random values\n (generally faster, less accurate alternative to NNDSVDa\n for when sparsity is not desired)\n\n- 'custom': use custom matrices W and H if `update_H=True`. If\n `update_H=False`, then only custom matrix H is used.\n\n.. versionchanged:: 0.23\n The default value of `init` changed from 'random' to None in 0.23." + }, + "refined_type": { + "kind": "EnumType", + "values": [ + "random", + "custom", + "nndsvda", + "nndsvd", + "nndsvdar" + ] } }, { @@ -53552,7 +56164,8 @@ "docstring": { "type": "bool, default=True", "description": "Set to True, both W and H will be estimated from initial guesses.\nSet to False, only W will be estimated." - } + }, + "refined_type": {} }, { "name": "solver", @@ -53562,6 +56175,10 @@ "docstring": { "type": "{'cd', 'mu'}, default='cd'", "description": "Numerical solver to use:\n\n- 'cd' is a Coordinate Descent solver that uses Fast Hierarchical\n Alternating Least Squares (Fast HALS).\n\n- 'mu' is a Multiplicative Update solver.\n\n.. versionadded:: 0.17\n Coordinate Descent solver.\n\n.. versionadded:: 0.19\n Multiplicative Update solver." 
+ }, + "refined_type": { + "kind": "EnumType", + "values": ["cd", "mu"] } }, { @@ -53572,6 +56189,14 @@ "docstring": { "type": "float or {'frobenius', 'kullback-leibler', 'itakura-saito'}, default='frobenius'", "description": "Beta divergence to be minimized, measuring the distance between X\nand the dot product WH. Note that values different from 'frobenius'\n(or 2) and 'kullback-leibler' (or 1) lead to significantly slower\nfits. Note that for beta_loss <= 0 (or 'itakura-saito'), the input\nmatrix X cannot contain zeros. Used only in 'mu' solver.\n\n.. versionadded:: 0.19" + }, + "refined_type": { + "kind": "EnumType", + "values": [ + "itakura-saito", + "frobenius", + "kullback-leibler" + ] } }, { @@ -53582,7 +56207,8 @@ "docstring": { "type": "float, default=1e-4", "description": "Tolerance of the stopping condition." - } + }, + "refined_type": {} }, { "name": "max_iter", @@ -53592,7 +56218,8 @@ "docstring": { "type": "int, default=200", "description": "Maximum number of iterations before timing out." - } + }, + "refined_type": {} }, { "name": "alpha", @@ -53602,7 +56229,8 @@ "docstring": { "type": "float, default=0.0", "description": "Constant that multiplies the regularization terms. Set it to zero to have no\nregularization. When using `alpha` instead of `alpha_W` and `alpha_H`, the\nregularization terms are not scaled by the `n_features` (resp. `n_samples`)\nfactors for `W` (resp. `H`).\n\n.. deprecated:: 1.0\n The `alpha` parameter is deprecated in 1.0 and will be removed in 1.2.\n Use `alpha_W` and `alpha_H` instead." - } + }, + "refined_type": {} }, { "name": "alpha_W", @@ -53612,7 +56240,8 @@ "docstring": { "type": "float, default=0.0", "description": "Constant that multiplies the regularization terms of `W`. Set it to zero\n(default) to have no regularization on `W`.\n\n.. versionadded:: 1.0" - } + }, + "refined_type": {} }, { "name": "alpha_H", @@ -53622,7 +56251,8 @@ "docstring": { "type": "float or \"same\", default=\"same\"", "description": "Constant that multiplies the regularization terms of `H`. Set it to zero to\nhave no regularization on `H`. If \"same\" (default), it takes the same value as\n`alpha_W`.\n\n.. versionadded:: 1.0" - } + }, + "refined_type": {} }, { "name": "l1_ratio", @@ -53632,7 +56262,8 @@ "docstring": { "type": "float, default=0.0", "description": "The regularization mixing parameter, with 0 <= l1_ratio <= 1.\nFor l1_ratio = 0 the penalty is an elementwise L2 penalty\n(aka Frobenius Norm).\nFor l1_ratio = 1 it is an elementwise L1 penalty.\nFor 0 < l1_ratio < 1, the penalty is a combination of L1 and L2." - } + }, + "refined_type": {} }, { "name": "regularization", @@ -53642,6 +56273,10 @@ "docstring": { "type": "{'both', 'components', 'transformation'}, default=None", "description": "Select whether the regularization affects the components (H), the\ntransformation (W), both or none of them.\n\n.. deprecated:: 1.0\n The `regularization` parameter is deprecated in 1.0 and will be removed in\n 1.2. Use `alpha_W` and `alpha_H` instead." + }, + "refined_type": { + "kind": "EnumType", + "values": ["components", "transformation", "both"] } }, { @@ -53652,7 +56287,8 @@ "docstring": { "type": "int, RandomState instance or None, default=None", "description": "Used for NMF initialisation (when ``init`` == 'nndsvdar' or\n'random'), and in Coordinate Descent. Pass an int for reproducible\nresults across multiple function calls.\nSee :term:`Glossary `." 
- } + }, + "refined_type": {} }, { "name": "verbose", @@ -53662,7 +56298,8 @@ "docstring": { "type": "int, default=0", "description": "The verbosity level." - } + }, + "refined_type": {} }, { "name": "shuffle", @@ -53672,13 +56309,14 @@ "docstring": { "type": "bool, default=False", "description": "If true, randomize the order of coordinates in the CD solver." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Compute Non-negative Matrix Factorization (NMF).\n\nFind two non-negative matrices (W, H) whose product approximates the non- negative matrix X. This factorization can be used for example for dimensionality reduction, source separation or topic extraction. The objective function is: .. math:: 0.5 * ||X - WH||_{loss}^2 + alpha\\_W * l1_{ratio} * n\\_features * ||vec(W)||_1 + alpha\\_H * l1_{ratio} * n\\_samples * ||vec(H)||_1 + 0.5 * alpha\\_W * (1 - l1_{ratio}) * n\\_features * ||W||_{Fro}^2 + 0.5 * alpha\\_H * (1 - l1_{ratio}) * n\\_samples * ||H||_{Fro}^2 Where: :math:`||A||_{Fro}^2 = \\sum_{i,j} A_{ij}^2` (Frobenius norm) :math:`||vec(A)||_1 = \\sum_{i,j} abs(A_{ij})` (Elementwise L1 norm) The generic norm :math:`||X - WH||_{loss}^2` may represent the Frobenius norm or another supported beta-divergence loss. The choice between options is controlled by the `beta_loss` parameter. The regularization terms are scaled by `n_features` for `W` and by `n_samples` for `H` to keep their impact balanced with respect to one another and to the data fit term as independent as possible of the size `n_samples` of the training set. The objective function is minimized with an alternating minimization of W and H. If H is given and update_H=False, it solves for W only.", - "docstring": "Compute Non-negative Matrix Factorization (NMF).\n\nFind two non-negative matrices (W, H) whose product approximates the non-\nnegative matrix X. This factorization can be used for example for\ndimensionality reduction, source separation or topic extraction.\n\nThe objective function is:\n\n .. math::\n\n 0.5 * ||X - WH||_{loss}^2\n\n + alpha\\_W * l1_{ratio} * n\\_features * ||vec(W)||_1\n\n + alpha\\_H * l1_{ratio} * n\\_samples * ||vec(H)||_1\n\n + 0.5 * alpha\\_W * (1 - l1_{ratio}) * n\\_features * ||W||_{Fro}^2\n\n + 0.5 * alpha\\_H * (1 - l1_{ratio}) * n\\_samples * ||H||_{Fro}^2\n\nWhere:\n\n:math:`||A||_{Fro}^2 = \\sum_{i,j} A_{ij}^2` (Frobenius norm)\n\n:math:`||vec(A)||_1 = \\sum_{i,j} abs(A_{ij})` (Elementwise L1 norm)\n\nThe generic norm :math:`||X - WH||_{loss}^2` may represent\nthe Frobenius norm or another supported beta-divergence loss.\nThe choice between options is controlled by the `beta_loss` parameter.\n\nThe regularization terms are scaled by `n_features` for `W` and by `n_samples` for\n`H` to keep their impact balanced with respect to one another and to the data fit\nterm as independent as possible of the size `n_samples` of the training set.\n\nThe objective function is minimized with an alternating minimization of W\nand H. 
If H is given and update_H=False, it solves for W only.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n Constant matrix.\n\nW : array-like of shape (n_samples, n_components), default=None\n If init='custom', it is used as initial guess for the solution.\n\nH : array-like of shape (n_components, n_features), default=None\n If init='custom', it is used as initial guess for the solution.\n If update_H=False, it is used as a constant, to solve for W only.\n\nn_components : int, default=None\n Number of components, if n_components is not set all features\n are kept.\n\ninit : {'random', 'nndsvd', 'nndsvda', 'nndsvdar', 'custom'}, default=None\n Method used to initialize the procedure.\n\n Valid options:\n\n - None: 'nndsvd' if n_components < n_features, otherwise 'random'.\n\n - 'random': non-negative random matrices, scaled with:\n sqrt(X.mean() / n_components)\n\n - 'nndsvd': Nonnegative Double Singular Value Decomposition (NNDSVD)\n initialization (better for sparseness)\n\n - 'nndsvda': NNDSVD with zeros filled with the average of X\n (better when sparsity is not desired)\n\n - 'nndsvdar': NNDSVD with zeros filled with small random values\n (generally faster, less accurate alternative to NNDSVDa\n for when sparsity is not desired)\n\n - 'custom': use custom matrices W and H if `update_H=True`. If\n `update_H=False`, then only custom matrix H is used.\n\n .. versionchanged:: 0.23\n The default value of `init` changed from 'random' to None in 0.23.\n\nupdate_H : bool, default=True\n Set to True, both W and H will be estimated from initial guesses.\n Set to False, only W will be estimated.\n\nsolver : {'cd', 'mu'}, default='cd'\n Numerical solver to use:\n\n - 'cd' is a Coordinate Descent solver that uses Fast Hierarchical\n Alternating Least Squares (Fast HALS).\n\n - 'mu' is a Multiplicative Update solver.\n\n .. versionadded:: 0.17\n Coordinate Descent solver.\n\n .. versionadded:: 0.19\n Multiplicative Update solver.\n\nbeta_loss : float or {'frobenius', 'kullback-leibler', 'itakura-saito'}, default='frobenius'\n Beta divergence to be minimized, measuring the distance between X\n and the dot product WH. Note that values different from 'frobenius'\n (or 2) and 'kullback-leibler' (or 1) lead to significantly slower\n fits. Note that for beta_loss <= 0 (or 'itakura-saito'), the input\n matrix X cannot contain zeros. Used only in 'mu' solver.\n\n .. versionadded:: 0.19\n\ntol : float, default=1e-4\n Tolerance of the stopping condition.\n\nmax_iter : int, default=200\n Maximum number of iterations before timing out.\n\nalpha : float, default=0.0\n Constant that multiplies the regularization terms. Set it to zero to have no\n regularization. When using `alpha` instead of `alpha_W` and `alpha_H`, the\n regularization terms are not scaled by the `n_features` (resp. `n_samples`)\n factors for `W` (resp. `H`).\n\n .. deprecated:: 1.0\n The `alpha` parameter is deprecated in 1.0 and will be removed in 1.2.\n Use `alpha_W` and `alpha_H` instead.\n\nalpha_W : float, default=0.0\n Constant that multiplies the regularization terms of `W`. Set it to zero\n (default) to have no regularization on `W`.\n\n .. versionadded:: 1.0\n\nalpha_H : float or \"same\", default=\"same\"\n Constant that multiplies the regularization terms of `H`. Set it to zero to\n have no regularization on `H`. If \"same\" (default), it takes the same value as\n `alpha_W`.\n\n .. 
versionadded:: 1.0\n\nl1_ratio : float, default=0.0\n The regularization mixing parameter, with 0 <= l1_ratio <= 1.\n For l1_ratio = 0 the penalty is an elementwise L2 penalty\n (aka Frobenius Norm).\n For l1_ratio = 1 it is an elementwise L1 penalty.\n For 0 < l1_ratio < 1, the penalty is a combination of L1 and L2.\n\nregularization : {'both', 'components', 'transformation'}, default=None\n Select whether the regularization affects the components (H), the\n transformation (W), both or none of them.\n\n .. deprecated:: 1.0\n The `regularization` parameter is deprecated in 1.0 and will be removed in\n 1.2. Use `alpha_W` and `alpha_H` instead.\n\nrandom_state : int, RandomState instance or None, default=None\n Used for NMF initialisation (when ``init`` == 'nndsvdar' or\n 'random'), and in Coordinate Descent. Pass an int for reproducible\n results across multiple function calls.\n See :term:`Glossary `.\n\nverbose : int, default=0\n The verbosity level.\n\nshuffle : bool, default=False\n If true, randomize the order of coordinates in the CD solver.\n\nReturns\n-------\nW : ndarray of shape (n_samples, n_components)\n Solution to the non-negative least squares problem.\n\nH : ndarray of shape (n_components, n_features)\n Solution to the non-negative least squares problem.\n\nn_iter : int\n Actual number of iterations.\n\nExamples\n--------\n>>> import numpy as np\n>>> X = np.array([[1,1], [2, 1], [3, 1.2], [4, 1], [5, 0.8], [6, 1]])\n>>> from sklearn.decomposition import non_negative_factorization\n>>> W, H, n_iter = non_negative_factorization(X, n_components=2,\n... init='random', random_state=0)\n\nReferences\n----------\nCichocki, Andrzej, and P. H. A. N. Anh-Huy. \"Fast local algorithms for\nlarge scale nonnegative matrix and tensor factorizations.\"\nIEICE transactions on fundamentals of electronics, communications and\ncomputer sciences 92.3: 708-721, 2009.\n\nFevotte, C., & Idier, J. (2011). Algorithms for nonnegative matrix\nfactorization with the beta-divergence. Neural Computation, 23(9).", + "description": "Compute Non-negative Matrix Factorization (NMF).\n\nFind two non-negative matrices (W, H) whose product approximates the non-\nnegative matrix X. This factorization can be used for example for\ndimensionality reduction, source separation or topic extraction.\n\nThe objective function is:\n\n .. math::\n\n 0.5 * ||X - WH||_{loss}^2\n\n + alpha\\_W * l1_{ratio} * n\\_features * ||vec(W)||_1\n\n + alpha\\_H * l1_{ratio} * n\\_samples * ||vec(H)||_1\n\n + 0.5 * alpha\\_W * (1 - l1_{ratio}) * n\\_features * ||W||_{Fro}^2\n\n + 0.5 * alpha\\_H * (1 - l1_{ratio}) * n\\_samples * ||H||_{Fro}^2\n\nWhere:\n\n:math:`||A||_{Fro}^2 = \\sum_{i,j} A_{ij}^2` (Frobenius norm)\n\n:math:`||vec(A)||_1 = \\sum_{i,j} abs(A_{ij})` (Elementwise L1 norm)\n\nThe generic norm :math:`||X - WH||_{loss}^2` may represent\nthe Frobenius norm or another supported beta-divergence loss.\nThe choice between options is controlled by the `beta_loss` parameter.\n\nThe regularization terms are scaled by `n_features` for `W` and by `n_samples` for\n`H` to keep their impact balanced with respect to one another and to the data fit\nterm as independent as possible of the size `n_samples` of the training set.\n\nThe objective function is minimized with an alternating minimization of W\nand H. If H is given and update_H=False, it solves for W only.", + "docstring": "Compute Non-negative Matrix Factorization (NMF).\n\n Find two non-negative matrices (W, H) whose product approximates the non-\n negative matrix X. 
This factorization can be used for example for\n dimensionality reduction, source separation or topic extraction.\n\n The objective function is:\n\n .. math::\n\n 0.5 * ||X - WH||_{loss}^2\n\n + alpha\\_W * l1_{ratio} * n\\_features * ||vec(W)||_1\n\n + alpha\\_H * l1_{ratio} * n\\_samples * ||vec(H)||_1\n\n + 0.5 * alpha\\_W * (1 - l1_{ratio}) * n\\_features * ||W||_{Fro}^2\n\n + 0.5 * alpha\\_H * (1 - l1_{ratio}) * n\\_samples * ||H||_{Fro}^2\n\n Where:\n\n :math:`||A||_{Fro}^2 = \\sum_{i,j} A_{ij}^2` (Frobenius norm)\n\n :math:`||vec(A)||_1 = \\sum_{i,j} abs(A_{ij})` (Elementwise L1 norm)\n\n The generic norm :math:`||X - WH||_{loss}^2` may represent\n the Frobenius norm or another supported beta-divergence loss.\n The choice between options is controlled by the `beta_loss` parameter.\n\n The regularization terms are scaled by `n_features` for `W` and by `n_samples` for\n `H` to keep their impact balanced with respect to one another and to the data fit\n term as independent as possible of the size `n_samples` of the training set.\n\n The objective function is minimized with an alternating minimization of W\n and H. If H is given and update_H=False, it solves for W only.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Constant matrix.\n\n W : array-like of shape (n_samples, n_components), default=None\n If init='custom', it is used as initial guess for the solution.\n\n H : array-like of shape (n_components, n_features), default=None\n If init='custom', it is used as initial guess for the solution.\n If update_H=False, it is used as a constant, to solve for W only.\n\n n_components : int, default=None\n Number of components, if n_components is not set all features\n are kept.\n\n init : {'random', 'nndsvd', 'nndsvda', 'nndsvdar', 'custom'}, default=None\n Method used to initialize the procedure.\n\n Valid options:\n\n - None: 'nndsvd' if n_components < n_features, otherwise 'random'.\n\n - 'random': non-negative random matrices, scaled with:\n sqrt(X.mean() / n_components)\n\n - 'nndsvd': Nonnegative Double Singular Value Decomposition (NNDSVD)\n initialization (better for sparseness)\n\n - 'nndsvda': NNDSVD with zeros filled with the average of X\n (better when sparsity is not desired)\n\n - 'nndsvdar': NNDSVD with zeros filled with small random values\n (generally faster, less accurate alternative to NNDSVDa\n for when sparsity is not desired)\n\n - 'custom': use custom matrices W and H if `update_H=True`. If\n `update_H=False`, then only custom matrix H is used.\n\n .. versionchanged:: 0.23\n The default value of `init` changed from 'random' to None in 0.23.\n\n update_H : bool, default=True\n Set to True, both W and H will be estimated from initial guesses.\n Set to False, only W will be estimated.\n\n solver : {'cd', 'mu'}, default='cd'\n Numerical solver to use:\n\n - 'cd' is a Coordinate Descent solver that uses Fast Hierarchical\n Alternating Least Squares (Fast HALS).\n\n - 'mu' is a Multiplicative Update solver.\n\n .. versionadded:: 0.17\n Coordinate Descent solver.\n\n .. versionadded:: 0.19\n Multiplicative Update solver.\n\n beta_loss : float or {'frobenius', 'kullback-leibler', 'itakura-saito'}, default='frobenius'\n Beta divergence to be minimized, measuring the distance between X\n and the dot product WH. Note that values different from 'frobenius'\n (or 2) and 'kullback-leibler' (or 1) lead to significantly slower\n fits. Note that for beta_loss <= 0 (or 'itakura-saito'), the input\n matrix X cannot contain zeros. 
Used only in 'mu' solver.\n\n .. versionadded:: 0.19\n\n tol : float, default=1e-4\n Tolerance of the stopping condition.\n\n max_iter : int, default=200\n Maximum number of iterations before timing out.\n\n alpha : float, default=0.0\n Constant that multiplies the regularization terms. Set it to zero to have no\n regularization. When using `alpha` instead of `alpha_W` and `alpha_H`, the\n regularization terms are not scaled by the `n_features` (resp. `n_samples`)\n factors for `W` (resp. `H`).\n\n .. deprecated:: 1.0\n The `alpha` parameter is deprecated in 1.0 and will be removed in 1.2.\n Use `alpha_W` and `alpha_H` instead.\n\n alpha_W : float, default=0.0\n Constant that multiplies the regularization terms of `W`. Set it to zero\n (default) to have no regularization on `W`.\n\n .. versionadded:: 1.0\n\n alpha_H : float or \"same\", default=\"same\"\n Constant that multiplies the regularization terms of `H`. Set it to zero to\n have no regularization on `H`. If \"same\" (default), it takes the same value as\n `alpha_W`.\n\n .. versionadded:: 1.0\n\n l1_ratio : float, default=0.0\n The regularization mixing parameter, with 0 <= l1_ratio <= 1.\n For l1_ratio = 0 the penalty is an elementwise L2 penalty\n (aka Frobenius Norm).\n For l1_ratio = 1 it is an elementwise L1 penalty.\n For 0 < l1_ratio < 1, the penalty is a combination of L1 and L2.\n\n regularization : {'both', 'components', 'transformation'}, default=None\n Select whether the regularization affects the components (H), the\n transformation (W), both or none of them.\n\n .. deprecated:: 1.0\n The `regularization` parameter is deprecated in 1.0 and will be removed in\n 1.2. Use `alpha_W` and `alpha_H` instead.\n\n random_state : int, RandomState instance or None, default=None\n Used for NMF initialisation (when ``init`` == 'nndsvdar' or\n 'random'), and in Coordinate Descent. Pass an int for reproducible\n results across multiple function calls.\n See :term:`Glossary `.\n\n verbose : int, default=0\n The verbosity level.\n\n shuffle : bool, default=False\n If true, randomize the order of coordinates in the CD solver.\n\n Returns\n -------\n W : ndarray of shape (n_samples, n_components)\n Solution to the non-negative least squares problem.\n\n H : ndarray of shape (n_components, n_features)\n Solution to the non-negative least squares problem.\n\n n_iter : int\n Actual number of iterations.\n\n Examples\n --------\n >>> import numpy as np\n >>> X = np.array([[1,1], [2, 1], [3, 1.2], [4, 1], [5, 0.8], [6, 1]])\n >>> from sklearn.decomposition import non_negative_factorization\n >>> W, H, n_iter = non_negative_factorization(X, n_components=2,\n ... init='random', random_state=0)\n\n References\n ----------\n Cichocki, Andrzej, and P. H. A. N. Anh-Huy. \"Fast local algorithms for\n large scale nonnegative matrix and tensor factorizations.\"\n IEICE transactions on fundamentals of electronics, communications and\n computer sciences 92.3: 708-721, 2009.\n\n Fevotte, C., & Idier, J. (2011). Algorithms for nonnegative matrix\n factorization with the beta-divergence. 
Neural Computation, 23(9).\n ", "source_code": "\ndef non_negative_factorization(X, W=None, H=None, n_components=None, *, init='warn', update_H=True, solver='cd', beta_loss='frobenius', tol=0.0001, max_iter=200, alpha='deprecated', alpha_W=0.0, alpha_H='same', l1_ratio=0.0, regularization='deprecated', random_state=None, verbose=0, shuffle=False):\n \"\"\"Compute Non-negative Matrix Factorization (NMF).\n\n Find two non-negative matrices (W, H) whose product approximates the non-\n negative matrix X. This factorization can be used for example for\n dimensionality reduction, source separation or topic extraction.\n\n The objective function is:\n\n .. math::\n\n 0.5 * ||X - WH||_{loss}^2\n\n + alpha\\_W * l1_{ratio} * n\\_features * ||vec(W)||_1\n\n + alpha\\_H * l1_{ratio} * n\\_samples * ||vec(H)||_1\n\n + 0.5 * alpha\\_W * (1 - l1_{ratio}) * n\\_features * ||W||_{Fro}^2\n\n + 0.5 * alpha\\_H * (1 - l1_{ratio}) * n\\_samples * ||H||_{Fro}^2\n\n Where:\n\n :math:`||A||_{Fro}^2 = \\sum_{i,j} A_{ij}^2` (Frobenius norm)\n\n :math:`||vec(A)||_1 = \\sum_{i,j} abs(A_{ij})` (Elementwise L1 norm)\n\n The generic norm :math:`||X - WH||_{loss}^2` may represent\n the Frobenius norm or another supported beta-divergence loss.\n The choice between options is controlled by the `beta_loss` parameter.\n\n The regularization terms are scaled by `n_features` for `W` and by `n_samples` for\n `H` to keep their impact balanced with respect to one another and to the data fit\n term as independent as possible of the size `n_samples` of the training set.\n\n The objective function is minimized with an alternating minimization of W\n and H. If H is given and update_H=False, it solves for W only.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Constant matrix.\n\n W : array-like of shape (n_samples, n_components), default=None\n If init='custom', it is used as initial guess for the solution.\n\n H : array-like of shape (n_components, n_features), default=None\n If init='custom', it is used as initial guess for the solution.\n If update_H=False, it is used as a constant, to solve for W only.\n\n n_components : int, default=None\n Number of components, if n_components is not set all features\n are kept.\n\n init : {'random', 'nndsvd', 'nndsvda', 'nndsvdar', 'custom'}, default=None\n Method used to initialize the procedure.\n\n Valid options:\n\n - None: 'nndsvd' if n_components < n_features, otherwise 'random'.\n\n - 'random': non-negative random matrices, scaled with:\n sqrt(X.mean() / n_components)\n\n - 'nndsvd': Nonnegative Double Singular Value Decomposition (NNDSVD)\n initialization (better for sparseness)\n\n - 'nndsvda': NNDSVD with zeros filled with the average of X\n (better when sparsity is not desired)\n\n - 'nndsvdar': NNDSVD with zeros filled with small random values\n (generally faster, less accurate alternative to NNDSVDa\n for when sparsity is not desired)\n\n - 'custom': use custom matrices W and H if `update_H=True`. If\n `update_H=False`, then only custom matrix H is used.\n\n .. versionchanged:: 0.23\n The default value of `init` changed from 'random' to None in 0.23.\n\n update_H : bool, default=True\n Set to True, both W and H will be estimated from initial guesses.\n Set to False, only W will be estimated.\n\n solver : {'cd', 'mu'}, default='cd'\n Numerical solver to use:\n\n - 'cd' is a Coordinate Descent solver that uses Fast Hierarchical\n Alternating Least Squares (Fast HALS).\n\n - 'mu' is a Multiplicative Update solver.\n\n .. 
versionadded:: 0.17\n Coordinate Descent solver.\n\n .. versionadded:: 0.19\n Multiplicative Update solver.\n\n beta_loss : float or {'frobenius', 'kullback-leibler', 'itakura-saito'}, default='frobenius'\n Beta divergence to be minimized, measuring the distance between X\n and the dot product WH. Note that values different from 'frobenius'\n (or 2) and 'kullback-leibler' (or 1) lead to significantly slower\n fits. Note that for beta_loss <= 0 (or 'itakura-saito'), the input\n matrix X cannot contain zeros. Used only in 'mu' solver.\n\n .. versionadded:: 0.19\n\n tol : float, default=1e-4\n Tolerance of the stopping condition.\n\n max_iter : int, default=200\n Maximum number of iterations before timing out.\n\n alpha : float, default=0.0\n Constant that multiplies the regularization terms. Set it to zero to have no\n regularization. When using `alpha` instead of `alpha_W` and `alpha_H`, the\n regularization terms are not scaled by the `n_features` (resp. `n_samples`)\n factors for `W` (resp. `H`).\n\n .. deprecated:: 1.0\n The `alpha` parameter is deprecated in 1.0 and will be removed in 1.2.\n Use `alpha_W` and `alpha_H` instead.\n\n alpha_W : float, default=0.0\n Constant that multiplies the regularization terms of `W`. Set it to zero\n (default) to have no regularization on `W`.\n\n .. versionadded:: 1.0\n\n alpha_H : float or \"same\", default=\"same\"\n Constant that multiplies the regularization terms of `H`. Set it to zero to\n have no regularization on `H`. If \"same\" (default), it takes the same value as\n `alpha_W`.\n\n .. versionadded:: 1.0\n\n l1_ratio : float, default=0.0\n The regularization mixing parameter, with 0 <= l1_ratio <= 1.\n For l1_ratio = 0 the penalty is an elementwise L2 penalty\n (aka Frobenius Norm).\n For l1_ratio = 1 it is an elementwise L1 penalty.\n For 0 < l1_ratio < 1, the penalty is a combination of L1 and L2.\n\n regularization : {'both', 'components', 'transformation'}, default=None\n Select whether the regularization affects the components (H), the\n transformation (W), both or none of them.\n\n .. deprecated:: 1.0\n The `regularization` parameter is deprecated in 1.0 and will be removed in\n 1.2. Use `alpha_W` and `alpha_H` instead.\n\n random_state : int, RandomState instance or None, default=None\n Used for NMF initialisation (when ``init`` == 'nndsvdar' or\n 'random'), and in Coordinate Descent. Pass an int for reproducible\n results across multiple function calls.\n See :term:`Glossary `.\n\n verbose : int, default=0\n The verbosity level.\n\n shuffle : bool, default=False\n If true, randomize the order of coordinates in the CD solver.\n\n Returns\n -------\n W : ndarray of shape (n_samples, n_components)\n Solution to the non-negative least squares problem.\n\n H : ndarray of shape (n_components, n_features)\n Solution to the non-negative least squares problem.\n\n n_iter : int\n Actual number of iterations.\n\n Examples\n --------\n >>> import numpy as np\n >>> X = np.array([[1,1], [2, 1], [3, 1.2], [4, 1], [5, 0.8], [6, 1]])\n >>> from sklearn.decomposition import non_negative_factorization\n >>> W, H, n_iter = non_negative_factorization(X, n_components=2,\n ... init='random', random_state=0)\n\n References\n ----------\n Cichocki, Andrzej, and P. H. A. N. Anh-Huy. \"Fast local algorithms for\n large scale nonnegative matrix and tensor factorizations.\"\n IEICE transactions on fundamentals of electronics, communications and\n computer sciences 92.3: 708-721, 2009.\n\n Fevotte, C., & Idier, J. (2011). 
Algorithms for nonnegative matrix\n factorization with the beta-divergence. Neural Computation, 23(9).\n \"\"\"\n X = check_array(X, accept_sparse=('csr', 'csc'), dtype=[np.float64, np.float32])\n est = NMF(n_components=n_components, init=init, solver=solver, beta_loss=beta_loss, tol=tol, max_iter=max_iter, random_state=random_state, alpha=alpha, alpha_W=alpha_W, alpha_H=alpha_H, l1_ratio=l1_ratio, verbose=verbose, shuffle=shuffle, regularization=regularization)\n with config_context(assume_finite=True):\n (W, H, n_iter) = est._fit_transform(X, W=W, H=H, update_H=update_H)\n return W, H, n_iter" }, { @@ -53696,13 +56334,14 @@ "docstring": { "type": "array-like", "description": "Vector for which to compute the norm." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Dot product-based Euclidean norm implementation.\n\nSee: http://fseoane.net/blog/2011/computing-the-vector-norm/", - "docstring": "Dot product-based Euclidean norm implementation.\n\nSee: http://fseoane.net/blog/2011/computing-the-vector-norm/\n\nParameters\n----------\nx : array-like\n Vector for which to compute the norm.", + "docstring": "Dot product-based Euclidean norm implementation.\n\n See: http://fseoane.net/blog/2011/computing-the-vector-norm/\n\n Parameters\n ----------\n x : array-like\n Vector for which to compute the norm.\n ", "source_code": "\ndef norm(x):\n \"\"\"Dot product-based Euclidean norm implementation.\n\n See: http://fseoane.net/blog/2011/computing-the-vector-norm/\n\n Parameters\n ----------\n x : array-like\n Vector for which to compute the norm.\n \"\"\"\n return sqrt(squared_norm(x))" }, { @@ -53720,7 +56359,8 @@ "docstring": { "type": "array-like", "description": "First matrix." - } + }, + "refined_type": {} }, { "name": "Y", @@ -53730,13 +56370,14 @@ "docstring": { "type": "array-like", "description": "Second matrix." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Trace of np.dot(X, Y.T).", - "docstring": "Trace of np.dot(X, Y.T).\n\nParameters\n----------\nX : array-like\n First matrix.\nY : array-like\n Second matrix.", + "docstring": "Trace of np.dot(X, Y.T).\n\n Parameters\n ----------\n X : array-like\n First matrix.\n Y : array-like\n Second matrix.\n ", "source_code": "\ndef trace_dot(X, Y):\n \"\"\"Trace of np.dot(X, Y.T).\n\n Parameters\n ----------\n X : array-like\n First matrix.\n Y : array-like\n Second matrix.\n \"\"\"\n return np.dot(X.ravel(), Y.ravel())" }, { @@ -53754,7 +56395,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_components", @@ -53764,7 +56406,8 @@ "docstring": { "type": "int, float or 'mle', default=None", "description": "Number of components to keep.\nif n_components is not set all components are kept::\n\n n_components == min(n_samples, n_features)\n\nIf ``n_components == 'mle'`` and ``svd_solver == 'full'``, Minka's\nMLE is used to guess the dimension. 
Use of ``n_components == 'mle'``\nwill interpret ``svd_solver == 'auto'`` as ``svd_solver == 'full'``.\n\nIf ``0 < n_components < 1`` and ``svd_solver == 'full'``, select the\nnumber of components such that the amount of variance that needs to be\nexplained is greater than the percentage specified by n_components.\n\nIf ``svd_solver == 'arpack'``, the number of components must be\nstrictly less than the minimum of n_features and n_samples.\n\nHence, the None case results in::\n\n n_components == min(n_samples, n_features) - 1" - } + }, + "refined_type": {} }, { "name": "copy", @@ -53774,7 +56417,8 @@ "docstring": { "type": "bool, default=True", "description": "If False, data passed to fit are overwritten and running\nfit(X).transform(X) will not yield the expected results,\nuse fit_transform(X) instead." - } + }, + "refined_type": {} }, { "name": "whiten", @@ -53784,7 +56428,8 @@ "docstring": { "type": "bool, default=False", "description": "When True (False by default) the `components_` vectors are multiplied\nby the square root of n_samples and then divided by the singular values\nto ensure uncorrelated outputs with unit component-wise variances.\n\nWhitening will remove some information from the transformed signal\n(the relative variance scales of the components) but can sometime\nimprove the predictive accuracy of the downstream estimators by\nmaking their data respect some hard-wired assumptions." - } + }, + "refined_type": {} }, { "name": "svd_solver", @@ -53794,6 +56439,10 @@ "docstring": { "type": "{'auto', 'full', 'arpack', 'randomized'}, default='auto'", "description": "If auto :\n The solver is selected by a default policy based on `X.shape` and\n `n_components`: if the input data is larger than 500x500 and the\n number of components to extract is lower than 80% of the smallest\n dimension of the data, then the more efficient 'randomized'\n method is enabled. Otherwise the exact full SVD is computed and\n optionally truncated afterwards.\nIf full :\n run exact full SVD calling the standard LAPACK solver via\n `scipy.linalg.svd` and select the components by postprocessing\nIf arpack :\n run SVD truncated to n_components calling ARPACK solver via\n `scipy.sparse.linalg.svds`. It requires strictly\n 0 < n_components < min(X.shape)\nIf randomized :\n run randomized SVD by the method of Halko et al.\n\n.. versionadded:: 0.18.0" + }, + "refined_type": { + "kind": "EnumType", + "values": ["auto", "full", "randomized", "arpack"] } }, { @@ -53804,6 +56453,14 @@ "docstring": { "type": "float, default=0.0", "description": "Tolerance for singular values computed by svd_solver == 'arpack'.\nMust be of range [0.0, infinity).\n\n.. versionadded:: 0.18.0" + }, + "refined_type": { + "kind": "BoundaryType", + "base_type": "float", + "min": 0.0, + "max": "Infinity", + "min_inclusive": true, + "max_inclusive": false } }, { @@ -53814,6 +56471,14 @@ "docstring": { "type": "int or 'auto', default='auto'", "description": "Number of iterations for the power method computed by\nsvd_solver == 'randomized'.\nMust be of range [0, infinity).\n\n.. versionadded:: 0.18.0" + }, + "refined_type": { + "kind": "BoundaryType", + "base_type": "float", + "min": 0.0, + "max": "Infinity", + "min_inclusive": true, + "max_inclusive": false } }, { @@ -53824,13 +56489,14 @@ "docstring": { "type": "int, RandomState instance or None, default=None", "description": "Used when the 'arpack' or 'randomized' solvers are used. Pass an int\nfor reproducible results across multiple function calls.\nSee :term:`Glossary `.\n\n.. 
versionadded:: 0.18.0" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, n_components=None, *, copy=True, whiten=False, svd_solver='auto', tol=0.0, iterated_power='auto', random_state=None):\n self.n_components = n_components\n self.copy = copy\n self.whiten = whiten\n self.svd_solver = svd_solver\n self.tol = tol\n self.iterated_power = iterated_power\n self.random_state = random_state" }, { @@ -53848,7 +56514,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -53858,7 +56525,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -53882,7 +56550,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -53892,7 +56561,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_components", @@ -53902,7 +56572,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -53926,7 +56597,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -53936,7 +56608,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_components", @@ -53946,7 +56619,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "svd_solver", @@ -53956,13 +56630,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Fit the model by computing truncated SVD (by ARPACK or randomized) on X.", - "docstring": "Fit the model by computing truncated SVD (by ARPACK or randomized)\non X.", + "description": "Fit the model by computing truncated SVD (by ARPACK or randomized)\non X.", + "docstring": "Fit the model by computing truncated SVD (by ARPACK or randomized)\n on X.\n ", "source_code": "\ndef _fit_truncated(self, X, n_components, svd_solver):\n \"\"\"Fit the model by computing truncated SVD (by ARPACK or randomized)\n on X.\n \"\"\"\n (n_samples, n_features) = X.shape\n if isinstance(n_components, str):\n raise ValueError(\"n_components=%r cannot be a string with svd_solver='%s'\" % (n_components, svd_solver))\n elif not 1 <= n_components <= min(n_samples, n_features):\n raise ValueError(\"n_components=%r must be between 1 and min(n_samples, n_features)=%r with svd_solver='%s'\" % (n_components, min(n_samples, n_features), svd_solver))\n elif not isinstance(n_components, numbers.Integral):\n raise ValueError('n_components=%r must be of type int when greater than or equal to 1, was of type=%r' % (n_components, type(n_components)))\n elif svd_solver == 'arpack' and n_components == min(n_samples, n_features):\n raise ValueError(\"n_components=%r must be strictly less than min(n_samples, n_features)=%r with svd_solver='%s'\" % (n_components, min(n_samples, n_features), svd_solver))\n random_state = check_random_state(self.random_state)\n self.mean_ = np.mean(X, axis=0)\n X -= self.mean_\n if svd_solver == 'arpack':\n v0 = _init_arpack_v0(min(X.shape), random_state)\n (U, S, Vt) = svds(X, k=n_components, tol=self.tol, v0=v0)\n S = S[::-1]\n (U, Vt) = svd_flip(U[:, ::-1], Vt[::-1])\n elif svd_solver == 'randomized':\n (U, S, Vt) = randomized_svd(X, n_components=n_components, n_iter=self.iterated_power, flip_sign=True, random_state=random_state)\n (self.n_samples_, self.n_features_) = (n_samples, 
n_features)\n self.components_ = Vt\n self.n_components_ = n_components\n self.explained_variance_ = S**2 / (n_samples - 1)\n total_var = np.var(X, ddof=1, axis=0)\n self.explained_variance_ratio_ = self.explained_variance_ / total_var.sum()\n self.singular_values_ = S.copy()\n if self.n_components_ < min(n_features, n_samples):\n self.noise_variance_ = total_var.sum() - self.explained_variance_.sum()\n self.noise_variance_ /= min(n_features, n_samples) - n_components\n else:\n self.noise_variance_ = 0.0\n return U, S, Vt" }, { @@ -53980,13 +56655,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _more_tags(self):\n return {'preserves_dtype': [np.float64, np.float32]}" }, { @@ -54004,7 +56680,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -54014,7 +56691,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "Training data, where `n_samples` is the number of samples\nand `n_features` is the number of features." - } + }, + "refined_type": {} }, { "name": "y", @@ -54024,13 +56702,14 @@ "docstring": { "type": "Ignored", "description": "Ignored." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Fit the model with X.", - "docstring": "Fit the model with X.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n Training data, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\ny : Ignored\n Ignored.\n\nReturns\n-------\nself : object\n Returns the instance itself.", + "docstring": "Fit the model with X.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training data, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n y : Ignored\n Ignored.\n\n Returns\n -------\n self : object\n Returns the instance itself.\n ", "source_code": "\ndef fit(self, X, y=None):\n \"\"\"Fit the model with X.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training data, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n y : Ignored\n Ignored.\n\n Returns\n -------\n self : object\n Returns the instance itself.\n \"\"\"\n self._fit(X)\n return self" }, { @@ -54048,7 +56727,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -54058,7 +56738,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "Training data, where `n_samples` is the number of samples\nand `n_features` is the number of features." - } + }, + "refined_type": {} }, { "name": "y", @@ -54068,13 +56749,14 @@ "docstring": { "type": "Ignored", "description": "Ignored." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Fit the model with X and apply the dimensionality reduction on X.", - "docstring": "Fit the model with X and apply the dimensionality reduction on X.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n Training data, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\ny : Ignored\n Ignored.\n\nReturns\n-------\nX_new : ndarray of shape (n_samples, n_components)\n Transformed values.\n\nNotes\n-----\nThis method returns a Fortran-ordered array. 
To convert it to a\nC-ordered array, use 'np.ascontiguousarray'.", + "docstring": "Fit the model with X and apply the dimensionality reduction on X.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training data, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n y : Ignored\n Ignored.\n\n Returns\n -------\n X_new : ndarray of shape (n_samples, n_components)\n Transformed values.\n\n Notes\n -----\n This method returns a Fortran-ordered array. To convert it to a\n C-ordered array, use 'np.ascontiguousarray'.\n ", "source_code": "\ndef fit_transform(self, X, y=None):\n \"\"\"Fit the model with X and apply the dimensionality reduction on X.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training data, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n y : Ignored\n Ignored.\n\n Returns\n -------\n X_new : ndarray of shape (n_samples, n_components)\n Transformed values.\n\n Notes\n -----\n This method returns a Fortran-ordered array. To convert it to a\n C-ordered array, use 'np.ascontiguousarray'.\n \"\"\"\n (U, S, Vt) = self._fit(X)\n U = U[:, :self.n_components_]\n if self.whiten:\n U *= sqrt(X.shape[0] - 1)\n else:\n U *= S[:self.n_components_]\n return U" }, { @@ -54092,7 +56774,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -54102,7 +56785,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "The data." - } + }, + "refined_type": {} }, { "name": "y", @@ -54112,13 +56796,14 @@ "docstring": { "type": "Ignored", "description": "Ignored." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Return the average log-likelihood of all samples.\n\nSee. \"Pattern Recognition and Machine Learning\" by C. Bishop, 12.2.1 p. 574 or http://www.miketipping.com/papers/met-mppca.pdf", - "docstring": "Return the average log-likelihood of all samples.\n\nSee. \"Pattern Recognition and Machine Learning\"\nby C. Bishop, 12.2.1 p. 574\nor http://www.miketipping.com/papers/met-mppca.pdf\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n The data.\n\ny : Ignored\n Ignored.\n\nReturns\n-------\nll : float\n Average log-likelihood of the samples under the current model.", + "description": "Return the average log-likelihood of all samples.\n\nSee. \"Pattern Recognition and Machine Learning\"\nby C. Bishop, 12.2.1 p. 574\nor http://www.miketipping.com/papers/met-mppca.pdf", + "docstring": "Return the average log-likelihood of all samples.\n\n See. \"Pattern Recognition and Machine Learning\"\n by C. Bishop, 12.2.1 p. 574\n or http://www.miketipping.com/papers/met-mppca.pdf\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The data.\n\n y : Ignored\n Ignored.\n\n Returns\n -------\n ll : float\n Average log-likelihood of the samples under the current model.\n ", "source_code": "\ndef score(self, X, y=None):\n \"\"\"Return the average log-likelihood of all samples.\n\n See. \"Pattern Recognition and Machine Learning\"\n by C. Bishop, 12.2.1 p. 
574\n or http://www.miketipping.com/papers/met-mppca.pdf\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The data.\n\n y : Ignored\n Ignored.\n\n Returns\n -------\n ll : float\n Average log-likelihood of the samples under the current model.\n \"\"\"\n return np.mean(self.score_samples(X))" }, { @@ -54136,7 +56821,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -54146,13 +56832,14 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "The data." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Return the log-likelihood of each sample.\n\nSee. \"Pattern Recognition and Machine Learning\" by C. Bishop, 12.2.1 p. 574 or http://www.miketipping.com/papers/met-mppca.pdf", - "docstring": "Return the log-likelihood of each sample.\n\nSee. \"Pattern Recognition and Machine Learning\"\nby C. Bishop, 12.2.1 p. 574\nor http://www.miketipping.com/papers/met-mppca.pdf\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n The data.\n\nReturns\n-------\nll : ndarray of shape (n_samples,)\n Log-likelihood of each sample under the current model.", + "description": "Return the log-likelihood of each sample.\n\nSee. \"Pattern Recognition and Machine Learning\"\nby C. Bishop, 12.2.1 p. 574\nor http://www.miketipping.com/papers/met-mppca.pdf", + "docstring": "Return the log-likelihood of each sample.\n\n See. \"Pattern Recognition and Machine Learning\"\n by C. Bishop, 12.2.1 p. 574\n or http://www.miketipping.com/papers/met-mppca.pdf\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The data.\n\n Returns\n -------\n ll : ndarray of shape (n_samples,)\n Log-likelihood of each sample under the current model.\n ", "source_code": "\ndef score_samples(self, X):\n \"\"\"Return the log-likelihood of each sample.\n\n See. \"Pattern Recognition and Machine Learning\"\n by C. Bishop, 12.2.1 p. 574\n or http://www.miketipping.com/papers/met-mppca.pdf\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The data.\n\n Returns\n -------\n ll : ndarray of shape (n_samples,)\n Log-likelihood of each sample under the current model.\n \"\"\"\n check_is_fitted(self)\n X = self._validate_data(X, dtype=[np.float64, np.float32], reset=False)\n Xr = X - self.mean_\n n_features = X.shape[1]\n precision = self.get_precision()\n log_like = -0.5 * (Xr * np.dot(Xr, precision)).sum(axis=1)\n log_like -= 0.5 * (n_features * log(2.0 * np.pi) - fast_logdet(precision))\n return log_like" }, { @@ -54170,7 +56857,8 @@ "docstring": { "type": "ndarray of shape (n_features,)", "description": "Data spectrum." - } + }, + "refined_type": {} }, { "name": "rank", @@ -54180,7 +56868,8 @@ "docstring": { "type": "int", "description": "Tested rank value. It should be strictly lower than n_features,\notherwise the method isn't specified (division by zero in equation\n(31) from the paper)." - } + }, + "refined_type": {} }, { "name": "n_samples", @@ -54190,13 +56879,14 @@ "docstring": { "type": "int", "description": "Number of samples." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Compute the log-likelihood of a rank ``rank`` dataset.\n\nThe dataset is assumed to be embedded in gaussian noise of shape(n, dimf) having spectrum ``spectrum``. This implements the method of T. P. 
Minka.", - "docstring": "Compute the log-likelihood of a rank ``rank`` dataset.\n\nThe dataset is assumed to be embedded in gaussian noise of shape(n,\ndimf) having spectrum ``spectrum``. This implements the method of\nT. P. Minka.\n\nParameters\n----------\nspectrum : ndarray of shape (n_features,)\n Data spectrum.\nrank : int\n Tested rank value. It should be strictly lower than n_features,\n otherwise the method isn't specified (division by zero in equation\n (31) from the paper).\nn_samples : int\n Number of samples.\n\nReturns\n-------\nll : float\n The log-likelihood.\n\nReferences\n----------\nThis implements the method of `Thomas P. Minka:\nAutomatic Choice of Dimensionality for PCA. NIPS 2000: 598-604\n`_", + "description": "Compute the log-likelihood of a rank ``rank`` dataset.\n\nThe dataset is assumed to be embedded in gaussian noise of shape(n,\ndimf) having spectrum ``spectrum``. This implements the method of\nT. P. Minka.", + "docstring": "Compute the log-likelihood of a rank ``rank`` dataset.\n\n The dataset is assumed to be embedded in gaussian noise of shape(n,\n dimf) having spectrum ``spectrum``. This implements the method of\n T. P. Minka.\n\n Parameters\n ----------\n spectrum : ndarray of shape (n_features,)\n Data spectrum.\n rank : int\n Tested rank value. It should be strictly lower than n_features,\n otherwise the method isn't specified (division by zero in equation\n (31) from the paper).\n n_samples : int\n Number of samples.\n\n Returns\n -------\n ll : float\n The log-likelihood.\n\n References\n ----------\n This implements the method of `Thomas P. Minka:\n Automatic Choice of Dimensionality for PCA. NIPS 2000: 598-604\n `_\n ", "source_code": "\ndef _assess_dimension(spectrum, rank, n_samples):\n \"\"\"Compute the log-likelihood of a rank ``rank`` dataset.\n\n The dataset is assumed to be embedded in gaussian noise of shape(n,\n dimf) having spectrum ``spectrum``. This implements the method of\n T. P. Minka.\n\n Parameters\n ----------\n spectrum : ndarray of shape (n_features,)\n Data spectrum.\n rank : int\n Tested rank value. It should be strictly lower than n_features,\n otherwise the method isn't specified (division by zero in equation\n (31) from the paper).\n n_samples : int\n Number of samples.\n\n Returns\n -------\n ll : float\n The log-likelihood.\n\n References\n ----------\n This implements the method of `Thomas P. Minka:\n Automatic Choice of Dimensionality for PCA. 
NIPS 2000: 598-604\n `_\n \"\"\"\n n_features = spectrum.shape[0]\n if not 1 <= rank < n_features:\n raise ValueError('the tested rank should be in [1, n_features - 1]')\n eps = 1e-15\n if spectrum[rank - 1] < eps:\n return -np.inf\n pu = -rank * log(2.0)\n for i in range(1, rank + 1):\n pu += gammaln((n_features - i + 1) / 2.0) - log(np.pi) * (n_features - i + 1) / 2.0\n pl = np.sum(np.log(spectrum[:rank]))\n pl = -pl * n_samples / 2.0\n v = max(eps, np.sum(spectrum[rank:]) / (n_features - rank))\n pv = -np.log(v) * n_samples * (n_features - rank) / 2.0\n m = n_features * rank - rank * (rank + 1.0) / 2.0\n pp = log(2.0 * np.pi) * (m + rank) / 2.0\n pa = 0.0\n spectrum_ = spectrum.copy()\n spectrum_[rank:n_features] = v\n for i in range(rank):\n for j in range(i + 1, len(spectrum)):\n pa += log((spectrum[i] - spectrum[j]) * (1.0 / spectrum_[j] - 1.0 / spectrum_[i])) + log(n_samples)\n ll = pu + pl + pv + pp - pa / 2.0 - rank * log(n_samples) / 2.0\n return ll" }, { @@ -54214,7 +56904,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_samples", @@ -54224,13 +56915,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Infers the dimension of a dataset with a given spectrum.\n\nThe returned value will be in [1, n_features - 1].", - "docstring": "Infers the dimension of a dataset with a given spectrum.\n\nThe returned value will be in [1, n_features - 1].", + "docstring": "Infers the dimension of a dataset with a given spectrum.\n\n The returned value will be in [1, n_features - 1].\n ", "source_code": "\ndef _infer_dimension(spectrum, n_samples):\n \"\"\"Infers the dimension of a dataset with a given spectrum.\n\n The returned value will be in [1, n_features - 1].\n \"\"\"\n ll = np.empty_like(spectrum)\n ll[0] = -np.inf\n for rank in range(1, spectrum.shape[0]):\n ll[rank] = _assess_dimension(spectrum, rank, n_samples)\n return ll.argmax()" }, { @@ -54248,7 +56940,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_components", @@ -54257,8 +56950,9 @@ "assigned_by": "POSITION_OR_NAME", "docstring": { "type": "int, default=None", - "description": "Number of sparse atoms to extract." - } + "description": "Number of sparse atoms to extract. If None, then ``n_components``\nis set to ``n_features``." + }, + "refined_type": {} }, { "name": "alpha", @@ -54268,7 +56962,8 @@ "docstring": { "type": "int, default=1", "description": "Sparsity controlling parameter. Higher values lead to sparser\ncomponents." - } + }, + "refined_type": {} }, { "name": "ridge_alpha", @@ -54278,7 +56973,8 @@ "docstring": { "type": "float, default=0.01", "description": "Amount of ridge shrinkage to apply in order to improve\nconditioning when calling the transform method." - } + }, + "refined_type": {} }, { "name": "n_iter", @@ -54288,7 +56984,8 @@ "docstring": { "type": "int, default=100", "description": "Number of iterations to perform for each mini batch." - } + }, + "refined_type": {} }, { "name": "callback", @@ -54298,7 +56995,8 @@ "docstring": { "type": "callable, default=None", "description": "Callable that gets invoked every five iterations." - } + }, + "refined_type": {} }, { "name": "batch_size", @@ -54308,7 +57006,8 @@ "docstring": { "type": "int, default=3", "description": "The number of features to take in each mini batch." 
- } + }, + "refined_type": {} }, { "name": "verbose", @@ -54318,7 +57017,8 @@ "docstring": { "type": "int or bool, default=False", "description": "Controls the verbosity; the higher, the more messages. Defaults to 0." - } + }, + "refined_type": {} }, { "name": "shuffle", @@ -54328,7 +57028,8 @@ "docstring": { "type": "bool, default=True", "description": "Whether to shuffle the data before splitting it in batches." - } + }, + "refined_type": {} }, { "name": "n_jobs", @@ -54338,7 +57039,8 @@ "docstring": { "type": "int, default=None", "description": "Number of parallel jobs to run.\n``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n``-1`` means using all processors. See :term:`Glossary `\nfor more details." - } + }, + "refined_type": {} }, { "name": "method", @@ -54348,6 +57050,10 @@ "docstring": { "type": "{'lars', 'cd'}, default='lars'", "description": "Method to be used for optimization.\nlars: uses the least angle regression method to solve the lasso problem\n(linear_model.lars_path)\ncd: uses the coordinate descent method to compute the\nLasso solution (linear_model.Lasso). Lars will be faster if\nthe estimated components are sparse." + }, + "refined_type": { + "kind": "EnumType", + "values": ["cd", "lars"] } }, { @@ -54358,13 +57064,14 @@ "docstring": { "type": "int, RandomState instance or None, default=None", "description": "Used for random shuffling when ``shuffle`` is set to ``True``,\nduring online dictionary learning. Pass an int for reproducible results\nacross multiple function calls.\nSee :term:`Glossary `." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, n_components=None, *, alpha=1, ridge_alpha=0.01, n_iter=100, callback=None, batch_size=3, verbose=False, shuffle=True, n_jobs=None, method='lars', random_state=None):\n super().__init__(n_components=n_components, alpha=alpha, verbose=verbose, ridge_alpha=ridge_alpha, n_jobs=n_jobs, method=method, random_state=random_state)\n self.n_iter = n_iter\n self.callback = callback\n self.batch_size = batch_size\n self.shuffle = shuffle" }, { @@ -54382,7 +57089,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -54392,7 +57100,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "Training vector, where `n_samples` is the number of samples\nand `n_features` is the number of features." - } + }, + "refined_type": {} }, { "name": "y", @@ -54402,13 +57111,14 @@ "docstring": { "type": "Ignored", "description": "Not used, present for API consistency by convention." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Fit the model from data in X.", - "docstring": "Fit the model from data in X.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n Training vector, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\ny : Ignored\n Not used, present for API consistency by convention.\n\nReturns\n-------\nself : object\n Returns the instance itself.", + "docstring": "Fit the model from data in X.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training vector, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n Returns\n -------\n self : object\n Returns the instance itself.\n ", "source_code": "\ndef fit(self, X, y=None):\n \"\"\"Fit the model from data in X.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training vector, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n Returns\n -------\n self : object\n Returns the instance itself.\n \"\"\"\n random_state = check_random_state(self.random_state)\n X = self._validate_data(X)\n self.mean_ = X.mean(axis=0)\n X = X - self.mean_\n if self.n_components is None:\n n_components = X.shape[1]\n else:\n n_components = self.n_components\n (Vt, _, self.n_iter_) = dict_learning_online(X.T, n_components, alpha=self.alpha, n_iter=self.n_iter, return_code=True, dict_init=None, verbose=self.verbose, callback=self.callback, batch_size=self.batch_size, shuffle=self.shuffle, n_jobs=self.n_jobs, method=self.method, random_state=random_state, return_n_iter=True)\n self.components_ = Vt.T\n components_norm = np.linalg.norm(self.components_, axis=1)[:, np.newaxis]\n components_norm[components_norm == 0] = 1\n self.components_ /= components_norm\n self.n_components_ = len(self.components_)\n return self" }, { @@ -54426,7 +57136,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_components", @@ -54435,8 +57146,9 @@ "assigned_by": "POSITION_OR_NAME", "docstring": { "type": "int, default=None", - "description": "Number of sparse atoms to extract." - } + "description": "Number of sparse atoms to extract. If None, then ``n_components``\nis set to ``n_features``." + }, + "refined_type": {} }, { "name": "alpha", @@ -54446,7 +57158,8 @@ "docstring": { "type": "float, default=1", "description": "Sparsity controlling parameter. Higher values lead to sparser\ncomponents." - } + }, + "refined_type": {} }, { "name": "ridge_alpha", @@ -54456,7 +57169,8 @@ "docstring": { "type": "float, default=0.01", "description": "Amount of ridge shrinkage to apply in order to improve\nconditioning when calling the transform method." - } + }, + "refined_type": {} }, { "name": "max_iter", @@ -54466,7 +57180,8 @@ "docstring": { "type": "int, default=1000", "description": "Maximum number of iterations to perform." - } + }, + "refined_type": {} }, { "name": "tol", @@ -54476,7 +57191,8 @@ "docstring": { "type": "float, default=1e-8", "description": "Tolerance for the stopping condition." 
- } + }, + "refined_type": {} }, { "name": "method", @@ -54486,6 +57202,10 @@ "docstring": { "type": "{'lars', 'cd'}, default='lars'", "description": "Method to be used for optimization.\nlars: uses the least angle regression method to solve the lasso problem\n(linear_model.lars_path)\ncd: uses the coordinate descent method to compute the\nLasso solution (linear_model.Lasso). Lars will be faster if\nthe estimated components are sparse." + }, + "refined_type": { + "kind": "EnumType", + "values": ["cd", "lars"] } }, { @@ -54496,7 +57216,8 @@ "docstring": { "type": "int, default=None", "description": "Number of parallel jobs to run.\n``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n``-1`` means using all processors. See :term:`Glossary `\nfor more details." - } + }, + "refined_type": {} }, { "name": "U_init", @@ -54506,7 +57227,8 @@ "docstring": { "type": "ndarray of shape (n_samples, n_components), default=None", "description": "Initial values for the loadings for warm restart scenarios. Only used\nif `U_init` and `V_init` are not None." - } + }, + "refined_type": {} }, { "name": "V_init", @@ -54516,7 +57238,8 @@ "docstring": { "type": "ndarray of shape (n_components, n_features), default=None", "description": "Initial values for the components for warm restart scenarios. Only used\nif `U_init` and `V_init` are not None." - } + }, + "refined_type": {} }, { "name": "verbose", @@ -54526,7 +57249,8 @@ "docstring": { "type": "int or bool, default=False", "description": "Controls the verbosity; the higher, the more messages. Defaults to 0." - } + }, + "refined_type": {} }, { "name": "random_state", @@ -54536,13 +57260,14 @@ "docstring": { "type": "int, RandomState instance or None, default=None", "description": "Used during dictionary learning. Pass an int for reproducible results\nacross multiple function calls.\nSee :term:`Glossary `." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, n_components=None, *, alpha=1, ridge_alpha=0.01, max_iter=1000, tol=1e-08, method='lars', n_jobs=None, U_init=None, V_init=None, verbose=False, random_state=None):\n self.n_components = n_components\n self.alpha = alpha\n self.ridge_alpha = ridge_alpha\n self.max_iter = max_iter\n self.tol = tol\n self.method = method\n self.n_jobs = n_jobs\n self.U_init = U_init\n self.V_init = V_init\n self.verbose = verbose\n self.random_state = random_state" }, { @@ -54560,7 +57285,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -54570,7 +57296,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "Training vector, where `n_samples` is the number of samples\nand `n_features` is the number of features." - } + }, + "refined_type": {} }, { "name": "y", @@ -54580,13 +57307,14 @@ "docstring": { "type": "Ignored", "description": "Not used, present here for API consistency by convention." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Fit the model from data in X.", - "docstring": "Fit the model from data in X.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n Training vector, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\ny : Ignored\n Not used, present here for API consistency by convention.\n\nReturns\n-------\nself : object\n Returns the instance itself.", + "docstring": "Fit the model from data in X.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training vector, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n y : Ignored\n Not used, present here for API consistency by convention.\n\n Returns\n -------\n self : object\n Returns the instance itself.\n ", "source_code": "\ndef fit(self, X, y=None):\n \"\"\"Fit the model from data in X.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training vector, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n y : Ignored\n Not used, present here for API consistency by convention.\n\n Returns\n -------\n self : object\n Returns the instance itself.\n \"\"\"\n random_state = check_random_state(self.random_state)\n X = self._validate_data(X)\n self.mean_ = X.mean(axis=0)\n X = X - self.mean_\n if self.n_components is None:\n n_components = X.shape[1]\n else:\n n_components = self.n_components\n code_init = self.V_init.T if self.V_init is not None else None\n dict_init = self.U_init.T if self.U_init is not None else None\n (Vt, _, E, self.n_iter_) = dict_learning(X.T, n_components, alpha=self.alpha, tol=self.tol, max_iter=self.max_iter, method=self.method, n_jobs=self.n_jobs, verbose=self.verbose, random_state=random_state, code_init=code_init, dict_init=dict_init, return_n_iter=True)\n self.components_ = Vt.T\n components_norm = np.linalg.norm(self.components_, axis=1)[:, np.newaxis]\n components_norm[components_norm == 0] = 1\n self.components_ /= components_norm\n self.n_components_ = len(self.components_)\n self.error_ = E\n return self" }, { @@ -54604,7 +57332,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -54614,13 +57343,14 @@ "docstring": { "type": "ndarray of shape (n_samples, n_features)", "description": "Test data to be transformed, must have the same number of\nfeatures as the data used to train the model." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Least Squares projection of the data onto the sparse components.\n\nTo avoid instability issues in case the system is under-determined, regularization can be applied (Ridge regression) via the `ridge_alpha` parameter. 
Note that Sparse PCA components orthogonality is not enforced as in PCA hence one cannot use a simple linear projection.", - "docstring": "Least Squares projection of the data onto the sparse components.\n\nTo avoid instability issues in case the system is under-determined,\nregularization can be applied (Ridge regression) via the\n`ridge_alpha` parameter.\n\nNote that Sparse PCA components orthogonality is not enforced as in PCA\nhence one cannot use a simple linear projection.\n\nParameters\n----------\nX : ndarray of shape (n_samples, n_features)\n Test data to be transformed, must have the same number of\n features as the data used to train the model.\n\nReturns\n-------\nX_new : ndarray of shape (n_samples, n_components)\n Transformed data.", + "description": "Least Squares projection of the data onto the sparse components.\n\nTo avoid instability issues in case the system is under-determined,\nregularization can be applied (Ridge regression) via the\n`ridge_alpha` parameter.\n\nNote that Sparse PCA components orthogonality is not enforced as in PCA\nhence one cannot use a simple linear projection.", + "docstring": "Least Squares projection of the data onto the sparse components.\n\n To avoid instability issues in case the system is under-determined,\n regularization can be applied (Ridge regression) via the\n `ridge_alpha` parameter.\n\n Note that Sparse PCA components orthogonality is not enforced as in PCA\n hence one cannot use a simple linear projection.\n\n Parameters\n ----------\n X : ndarray of shape (n_samples, n_features)\n Test data to be transformed, must have the same number of\n features as the data used to train the model.\n\n Returns\n -------\n X_new : ndarray of shape (n_samples, n_components)\n Transformed data.\n ", "source_code": "\ndef transform(self, X):\n \"\"\"Least Squares projection of the data onto the sparse components.\n\n To avoid instability issues in case the system is under-determined,\n regularization can be applied (Ridge regression) via the\n `ridge_alpha` parameter.\n\n Note that Sparse PCA components orthogonality is not enforced as in PCA\n hence one cannot use a simple linear projection.\n\n Parameters\n ----------\n X : ndarray of shape (n_samples, n_features)\n Test data to be transformed, must have the same number of\n features as the data used to train the model.\n\n Returns\n -------\n X_new : ndarray of shape (n_samples, n_components)\n Transformed data.\n \"\"\"\n check_is_fitted(self)\n X = self._validate_data(X, reset=False)\n X = X - self.mean_\n U = ridge_regression(self.components_.T, X.T, self.ridge_alpha, solver='cholesky')\n return U" }, { @@ -54638,7 +57368,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_components", @@ -54648,7 +57379,8 @@ "docstring": { "type": "int, default=2", "description": "Desired dimensionality of output data.\nMust be strictly less than the number of features.\nThe default value is useful for visualisation. For LSA, a value of\n100 is recommended." - } + }, + "refined_type": {} }, { "name": "algorithm", @@ -54658,6 +57390,10 @@ "docstring": { "type": "{'arpack', 'randomized'}, default='randomized'", "description": "SVD solver to use. Either \"arpack\" for the ARPACK wrapper in SciPy\n(scipy.sparse.linalg.svds), or \"randomized\" for the randomized\nalgorithm due to Halko (2009)." 
+ }, + "refined_type": { + "kind": "EnumType", + "values": ["randomized", "arpack"] } }, { @@ -54668,7 +57404,8 @@ "docstring": { "type": "int, default=5", "description": "Number of iterations for randomized SVD solver. Not used by ARPACK. The\ndefault is larger than the default in\n:func:`~sklearn.utils.extmath.randomized_svd` to handle sparse\nmatrices that may have large slowly decaying spectrum." - } + }, + "refined_type": {} }, { "name": "random_state", @@ -54678,7 +57415,8 @@ "docstring": { "type": "int, RandomState instance or None, default=None", "description": "Used during randomized svd. Pass an int for reproducible results across\nmultiple function calls.\nSee :term:`Glossary `." - } + }, + "refined_type": {} }, { "name": "tol", @@ -54688,13 +57426,14 @@ "docstring": { "type": "float, default=0.0", "description": "Tolerance for ARPACK. 0 means machine precision. Ignored by randomized\nSVD solver." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, n_components=2, *, algorithm='randomized', n_iter=5, random_state=None, tol=0.0):\n self.algorithm = algorithm\n self.n_components = n_components\n self.n_iter = n_iter\n self.random_state = random_state\n self.tol = tol" }, { @@ -54712,13 +57451,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _more_tags(self):\n return {'preserves_dtype': [np.float64, np.float32]}" }, { @@ -54736,7 +57476,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -54746,6 +57487,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "Training data." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -54756,13 +57501,14 @@ "docstring": { "type": "Ignored", "description": "Not used, present here for API consistency by convention." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Fit model on training data X.", - "docstring": "Fit model on training data X.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training data.\n\ny : Ignored\n Not used, present here for API consistency by convention.\n\nReturns\n-------\nself : object\n Returns the transformer object.", + "docstring": "Fit model on training data X.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training data.\n\n y : Ignored\n Not used, present here for API consistency by convention.\n\n Returns\n -------\n self : object\n Returns the transformer object.\n ", "source_code": "\ndef fit(self, X, y=None):\n \"\"\"Fit model on training data X.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training data.\n\n y : Ignored\n Not used, present here for API consistency by convention.\n\n Returns\n -------\n self : object\n Returns the transformer object.\n \"\"\"\n self.fit_transform(X)\n return self" }, { @@ -54780,7 +57526,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -54790,6 +57537,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "Training data." 
+ }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -54800,13 +57551,14 @@ "docstring": { "type": "Ignored", "description": "Not used, present here for API consistency by convention." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Fit model to X and perform dimensionality reduction on X.", - "docstring": "Fit model to X and perform dimensionality reduction on X.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training data.\n\ny : Ignored\n Not used, present here for API consistency by convention.\n\nReturns\n-------\nX_new : ndarray of shape (n_samples, n_components)\n Reduced version of X. This will always be a dense array.", + "docstring": "Fit model to X and perform dimensionality reduction on X.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training data.\n\n y : Ignored\n Not used, present here for API consistency by convention.\n\n Returns\n -------\n X_new : ndarray of shape (n_samples, n_components)\n Reduced version of X. This will always be a dense array.\n ", "source_code": "\ndef fit_transform(self, X, y=None):\n \"\"\"Fit model to X and perform dimensionality reduction on X.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training data.\n\n y : Ignored\n Not used, present here for API consistency by convention.\n\n Returns\n -------\n X_new : ndarray of shape (n_samples, n_components)\n Reduced version of X. This will always be a dense array.\n \"\"\"\n X = self._validate_data(X, accept_sparse=['csr', 'csc'], ensure_min_features=2)\n random_state = check_random_state(self.random_state)\n if self.algorithm == 'arpack':\n v0 = _init_arpack_v0(min(X.shape), random_state)\n (U, Sigma, VT) = svds(X, k=self.n_components, tol=self.tol, v0=v0)\n Sigma = Sigma[::-1]\n (U, VT) = svd_flip(U[:, ::-1], VT[::-1])\n elif self.algorithm == 'randomized':\n k = self.n_components\n n_features = X.shape[1]\n if k >= n_features:\n raise ValueError('n_components must be < n_features; got %d >= %d' % (k, n_features))\n (U, Sigma, VT) = randomized_svd(X, self.n_components, n_iter=self.n_iter, random_state=random_state)\n else:\n raise ValueError('unknown algorithm %r' % self.algorithm)\n self.components_ = VT\n if self.algorithm == 'randomized' or self.algorithm == 'arpack' and self.tol > 0:\n X_transformed = safe_sparse_dot(X, self.components_.T)\n else:\n X_transformed = U * Sigma\n self.explained_variance_ = exp_var = np.var(X_transformed, axis=0)\n if sp.issparse(X):\n (_, full_var) = mean_variance_axis(X, axis=0)\n full_var = full_var.sum()\n else:\n full_var = np.var(X, axis=0).sum()\n self.explained_variance_ratio_ = exp_var / full_var\n self.singular_values_ = Sigma\n return X_transformed" }, { @@ -54824,7 +57576,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -54834,13 +57587,14 @@ "docstring": { "type": "array-like of shape (n_samples, n_components)", "description": "New data." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Transform X back to its original space.\n\nReturns an array X_original whose transform would be X.", - "docstring": "Transform X back to its original space.\n\nReturns an array X_original whose transform would be X.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_components)\n New data.\n\nReturns\n-------\nX_original : ndarray of shape (n_samples, n_features)\n Note that this is always a dense array.", + "docstring": "Transform X back to its original space.\n\n Returns an array X_original whose transform would be X.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_components)\n New data.\n\n Returns\n -------\n X_original : ndarray of shape (n_samples, n_features)\n Note that this is always a dense array.\n ", "source_code": "\ndef inverse_transform(self, X):\n \"\"\"Transform X back to its original space.\n\n Returns an array X_original whose transform would be X.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_components)\n New data.\n\n Returns\n -------\n X_original : ndarray of shape (n_samples, n_features)\n Note that this is always a dense array.\n \"\"\"\n X = check_array(X)\n return np.dot(X, self.components_)" }, { @@ -54858,7 +57612,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -54868,13 +57623,17 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "New data." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } } ], "results": [], "is_public": true, "description": "Perform dimensionality reduction on X.", - "docstring": "Perform dimensionality reduction on X.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n New data.\n\nReturns\n-------\nX_new : ndarray of shape (n_samples, n_components)\n Reduced version of X. This will always be a dense array.", + "docstring": "Perform dimensionality reduction on X.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n New data.\n\n Returns\n -------\n X_new : ndarray of shape (n_samples, n_components)\n Reduced version of X. This will always be a dense array.\n ", "source_code": "\ndef transform(self, X):\n \"\"\"Perform dimensionality reduction on X.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n New data.\n\n Returns\n -------\n X_new : ndarray of shape (n_samples, n_components)\n Reduced version of X. 
This will always be a dense array.\n \"\"\"\n check_is_fitted(self)\n X = self._validate_data(X, accept_sparse=['csr', 'csc'], reset=False)\n return safe_sparse_dot(X, self.components_.T)" }, { @@ -54892,7 +57651,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "top_path", @@ -54902,13 +57662,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef configuration(parent_package='', top_path=None):\n config = Configuration('decomposition', parent_package, top_path)\n libraries = []\n if os.name == 'posix':\n libraries.append('m')\n config.add_extension('_online_lda_fast', sources=['_online_lda_fast.pyx'], include_dirs=[numpy.get_include()], libraries=libraries)\n config.add_extension('_cdnmf_fast', sources=['_cdnmf_fast.pyx'], include_dirs=[numpy.get_include()], libraries=libraries)\n config.add_subpackage('tests')\n return config" }, { @@ -54926,7 +57687,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "solver", @@ -54936,6 +57698,10 @@ "docstring": { "type": "{'svd', 'lsqr', 'eigen'}, default='svd'", "description": "Solver to use, possible values:\n - 'svd': Singular value decomposition (default).\n Does not compute the covariance matrix, therefore this solver is\n recommended for data with a large number of features.\n - 'lsqr': Least squares solution.\n Can be combined with shrinkage or custom covariance estimator.\n - 'eigen': Eigenvalue decomposition.\n Can be combined with shrinkage or custom covariance estimator." + }, + "refined_type": { + "kind": "EnumType", + "values": ["svd", "lsqr", "eigen"] } }, { @@ -54946,7 +57712,8 @@ "docstring": { "type": "'auto' or float, default=None", "description": "Shrinkage parameter, possible values:\n - None: no shrinkage (default).\n - 'auto': automatic shrinkage using the Ledoit-Wolf lemma.\n - float between 0 and 1: fixed shrinkage parameter.\n\nThis should be left to None if `covariance_estimator` is used.\nNote that shrinkage works only with 'lsqr' and 'eigen' solvers." - } + }, + "refined_type": {} }, { "name": "priors", @@ -54956,7 +57723,8 @@ "docstring": { "type": "array-like of shape (n_classes,), default=None", "description": "The class prior probabilities. By default, the class proportions are\ninferred from the training data." - } + }, + "refined_type": {} }, { "name": "n_components", @@ -54966,7 +57734,8 @@ "docstring": { "type": "int, default=None", "description": "Number of components (<= min(n_classes - 1, n_features)) for\ndimensionality reduction. If None, will be set to\nmin(n_classes - 1, n_features). This parameter only affects the\n`transform` method." - } + }, + "refined_type": {} }, { "name": "store_covariance", @@ -54976,7 +57745,8 @@ "docstring": { "type": "bool, default=False", "description": "If True, explicitly compute the weighted within-class covariance\nmatrix when solver is 'svd'. The matrix is always computed\nand stored for the other solvers.\n\n.. versionadded:: 0.17" - } + }, + "refined_type": {} }, { "name": "tol", @@ -54986,7 +57756,8 @@ "docstring": { "type": "float, default=1.0e-4", "description": "Absolute threshold for a singular value of X to be considered\nsignificant, used to estimate the rank of X. Dimensions whose\nsingular values are non-significant are discarded. Only used if\nsolver is 'svd'.\n\n.. 
versionadded:: 0.17" - } + }, + "refined_type": {} }, { "name": "covariance_estimator", @@ -54996,13 +57767,14 @@ "docstring": { "type": "covariance estimator, default=None", "description": "If not None, `covariance_estimator` is used to estimate\nthe covariance matrices instead of relying on the empirical\ncovariance estimator (with potential shrinkage).\nThe object should have a fit method and a ``covariance_`` attribute\nlike the estimators in :mod:`sklearn.covariance`.\nif None the shrinkage parameter drives the estimate.\n\nThis should be left to None if `shrinkage` is used.\nNote that `covariance_estimator` works only with 'lsqr' and 'eigen'\nsolvers.\n\n.. versionadded:: 0.24" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, solver='svd', shrinkage=None, priors=None, n_components=None, store_covariance=False, tol=0.0001, covariance_estimator=None):\n self.solver = solver\n self.shrinkage = shrinkage\n self.priors = priors\n self.n_components = n_components\n self.store_covariance = store_covariance\n self.tol = tol\n self.covariance_estimator = covariance_estimator" }, { @@ -55020,7 +57792,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -55030,7 +57803,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "Training data." - } + }, + "refined_type": {} }, { "name": "y", @@ -55040,7 +57814,8 @@ "docstring": { "type": "array-like of shape (n_samples,) or (n_samples, n_targets)", "description": "Target values." - } + }, + "refined_type": {} }, { "name": "shrinkage", @@ -55050,7 +57825,8 @@ "docstring": { "type": "'auto', float or None", "description": "Shrinkage parameter, possible values:\n - None: no shrinkage.\n - 'auto': automatic shrinkage using the Ledoit-Wolf lemma.\n - float between 0 and 1: fixed shrinkage constant.\n\nShrinkage parameter is ignored if `covariance_estimator` i\nnot None" - } + }, + "refined_type": {} }, { "name": "covariance_estimator", @@ -55060,13 +57836,14 @@ "docstring": { "type": "estimator, default=None", "description": "If not None, `covariance_estimator` is used to estimate\nthe covariance matrices instead of relying the empirical\ncovariance estimator (with potential shrinkage).\nThe object should have a fit method and a ``covariance_`` attribute\nlike the estimators in sklearn.covariance.\nif None the shrinkage parameter drives the estimate.\n\n.. versionadded:: 0.24" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Eigenvalue solver.\n\nThe eigenvalue solver computes the optimal solution of the Rayleigh coefficient (basically the ratio of between class scatter to within class scatter). This solver supports both classification and dimensionality reduction (with any covariance estimator).", - "docstring": "Eigenvalue solver.\n\nThe eigenvalue solver computes the optimal solution of the Rayleigh\ncoefficient (basically the ratio of between class scatter to within\nclass scatter). 
This solver supports both classification and\ndimensionality reduction (with any covariance estimator).\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n Training data.\n\ny : array-like of shape (n_samples,) or (n_samples, n_targets)\n Target values.\n\nshrinkage : 'auto', float or None\n Shrinkage parameter, possible values:\n - None: no shrinkage.\n - 'auto': automatic shrinkage using the Ledoit-Wolf lemma.\n - float between 0 and 1: fixed shrinkage constant.\n\n Shrinkage parameter is ignored if `covariance_estimator` i\n not None\n\ncovariance_estimator : estimator, default=None\n If not None, `covariance_estimator` is used to estimate\n the covariance matrices instead of relying the empirical\n covariance estimator (with potential shrinkage).\n The object should have a fit method and a ``covariance_`` attribute\n like the estimators in sklearn.covariance.\n if None the shrinkage parameter drives the estimate.\n\n .. versionadded:: 0.24\n\nNotes\n-----\nThis solver is based on [1]_, section 3.8.3, pp. 121-124.\n\nReferences\n----------\n.. [1] R. O. Duda, P. E. Hart, D. G. Stork. Pattern Classification\n (Second Edition). John Wiley & Sons, Inc., New York, 2001. ISBN\n 0-471-05669-3.", + "description": "Eigenvalue solver.\n\nThe eigenvalue solver computes the optimal solution of the Rayleigh\ncoefficient (basically the ratio of between class scatter to within\nclass scatter). This solver supports both classification and\ndimensionality reduction (with any covariance estimator).", + "docstring": "Eigenvalue solver.\n\n The eigenvalue solver computes the optimal solution of the Rayleigh\n coefficient (basically the ratio of between class scatter to within\n class scatter). This solver supports both classification and\n dimensionality reduction (with any covariance estimator).\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training data.\n\n y : array-like of shape (n_samples,) or (n_samples, n_targets)\n Target values.\n\n shrinkage : 'auto', float or None\n Shrinkage parameter, possible values:\n - None: no shrinkage.\n - 'auto': automatic shrinkage using the Ledoit-Wolf lemma.\n - float between 0 and 1: fixed shrinkage constant.\n\n Shrinkage parameter is ignored if `covariance_estimator` i\n not None\n\n covariance_estimator : estimator, default=None\n If not None, `covariance_estimator` is used to estimate\n the covariance matrices instead of relying the empirical\n covariance estimator (with potential shrinkage).\n The object should have a fit method and a ``covariance_`` attribute\n like the estimators in sklearn.covariance.\n if None the shrinkage parameter drives the estimate.\n\n .. versionadded:: 0.24\n\n Notes\n -----\n This solver is based on [1]_, section 3.8.3, pp. 121-124.\n\n References\n ----------\n .. [1] R. O. Duda, P. E. Hart, D. G. Stork. Pattern Classification\n (Second Edition). John Wiley & Sons, Inc., New York, 2001. ISBN\n 0-471-05669-3.\n ", "source_code": "\ndef _solve_eigen(self, X, y, shrinkage, covariance_estimator):\n \"\"\"Eigenvalue solver.\n\n The eigenvalue solver computes the optimal solution of the Rayleigh\n coefficient (basically the ratio of between class scatter to within\n class scatter). 
This solver supports both classification and\n dimensionality reduction (with any covariance estimator).\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training data.\n\n y : array-like of shape (n_samples,) or (n_samples, n_targets)\n Target values.\n\n shrinkage : 'auto', float or None\n Shrinkage parameter, possible values:\n - None: no shrinkage.\n - 'auto': automatic shrinkage using the Ledoit-Wolf lemma.\n - float between 0 and 1: fixed shrinkage constant.\n\n Shrinkage parameter is ignored if `covariance_estimator` i\n not None\n\n covariance_estimator : estimator, default=None\n If not None, `covariance_estimator` is used to estimate\n the covariance matrices instead of relying the empirical\n covariance estimator (with potential shrinkage).\n The object should have a fit method and a ``covariance_`` attribute\n like the estimators in sklearn.covariance.\n if None the shrinkage parameter drives the estimate.\n\n .. versionadded:: 0.24\n\n Notes\n -----\n This solver is based on [1]_, section 3.8.3, pp. 121-124.\n\n References\n ----------\n .. [1] R. O. Duda, P. E. Hart, D. G. Stork. Pattern Classification\n (Second Edition). John Wiley & Sons, Inc., New York, 2001. ISBN\n 0-471-05669-3.\n \"\"\"\n self.means_ = _class_means(X, y)\n self.covariance_ = _class_cov(X, y, self.priors_, shrinkage, covariance_estimator)\n Sw = self.covariance_\n St = _cov(X, shrinkage, covariance_estimator)\n Sb = St - Sw\n (evals, evecs) = linalg.eigh(Sb, Sw)\n self.explained_variance_ratio_ = np.sort(evals / np.sum(evals))[::-1][:self._max_components]\n evecs = evecs[:, np.argsort(evals)[::-1]]\n self.scalings_ = evecs\n self.coef_ = np.dot(self.means_, evecs).dot(evecs.T)\n self.intercept_ = -0.5 * np.diag(np.dot(self.means_, self.coef_.T)) + np.log(self.priors_)" }, { @@ -55084,7 +57861,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -55094,7 +57872,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "Training data." - } + }, + "refined_type": {} }, { "name": "y", @@ -55104,7 +57883,8 @@ "docstring": { "type": "array-like of shape (n_samples,) or (n_samples, n_classes)", "description": "Target values." - } + }, + "refined_type": {} }, { "name": "shrinkage", @@ -55114,7 +57894,8 @@ "docstring": { "type": "'auto', float or None", "description": "Shrinkage parameter, possible values:\n - None: no shrinkage.\n - 'auto': automatic shrinkage using the Ledoit-Wolf lemma.\n - float between 0 and 1: fixed shrinkage parameter.\n\nShrinkage parameter is ignored if `covariance_estimator` i\nnot None" - } + }, + "refined_type": {} }, { "name": "covariance_estimator", @@ -55124,13 +57905,14 @@ "docstring": { "type": "estimator, default=None", "description": "If not None, `covariance_estimator` is used to estimate\nthe covariance matrices instead of relying the empirical\ncovariance estimator (with potential shrinkage).\nThe object should have a fit method and a ``covariance_`` attribute\nlike the estimators in sklearn.covariance.\nif None the shrinkage parameter drives the estimate.\n\n.. versionadded:: 0.24" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Least squares solver.\n\nThe least squares solver computes a straightforward solution of the optimal decision rule based directly on the discriminant functions. It can only be used for classification (with any covariance estimator), because estimation of eigenvectors is not performed. 
Therefore, dimensionality reduction with the transform is not supported.", - "docstring": "Least squares solver.\n\nThe least squares solver computes a straightforward solution of the\noptimal decision rule based directly on the discriminant functions. It\ncan only be used for classification (with any covariance estimator),\nbecause\nestimation of eigenvectors is not performed. Therefore, dimensionality\nreduction with the transform is not supported.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n Training data.\n\ny : array-like of shape (n_samples,) or (n_samples, n_classes)\n Target values.\n\nshrinkage : 'auto', float or None\n Shrinkage parameter, possible values:\n - None: no shrinkage.\n - 'auto': automatic shrinkage using the Ledoit-Wolf lemma.\n - float between 0 and 1: fixed shrinkage parameter.\n\n Shrinkage parameter is ignored if `covariance_estimator` i\n not None\n\ncovariance_estimator : estimator, default=None\n If not None, `covariance_estimator` is used to estimate\n the covariance matrices instead of relying the empirical\n covariance estimator (with potential shrinkage).\n The object should have a fit method and a ``covariance_`` attribute\n like the estimators in sklearn.covariance.\n if None the shrinkage parameter drives the estimate.\n\n .. versionadded:: 0.24\n\nNotes\n-----\nThis solver is based on [1]_, section 2.6.2, pp. 39-41.\n\nReferences\n----------\n.. [1] R. O. Duda, P. E. Hart, D. G. Stork. Pattern Classification\n (Second Edition). John Wiley & Sons, Inc., New York, 2001. ISBN\n 0-471-05669-3.", + "description": "Least squares solver.\n\nThe least squares solver computes a straightforward solution of the\noptimal decision rule based directly on the discriminant functions. It\ncan only be used for classification (with any covariance estimator),\nbecause\nestimation of eigenvectors is not performed. Therefore, dimensionality\nreduction with the transform is not supported.", + "docstring": "Least squares solver.\n\n The least squares solver computes a straightforward solution of the\n optimal decision rule based directly on the discriminant functions. It\n can only be used for classification (with any covariance estimator),\n because\n estimation of eigenvectors is not performed. Therefore, dimensionality\n reduction with the transform is not supported.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training data.\n\n y : array-like of shape (n_samples,) or (n_samples, n_classes)\n Target values.\n\n shrinkage : 'auto', float or None\n Shrinkage parameter, possible values:\n - None: no shrinkage.\n - 'auto': automatic shrinkage using the Ledoit-Wolf lemma.\n - float between 0 and 1: fixed shrinkage parameter.\n\n Shrinkage parameter is ignored if `covariance_estimator` i\n not None\n\n covariance_estimator : estimator, default=None\n If not None, `covariance_estimator` is used to estimate\n the covariance matrices instead of relying the empirical\n covariance estimator (with potential shrinkage).\n The object should have a fit method and a ``covariance_`` attribute\n like the estimators in sklearn.covariance.\n if None the shrinkage parameter drives the estimate.\n\n .. versionadded:: 0.24\n\n Notes\n -----\n This solver is based on [1]_, section 2.6.2, pp. 39-41.\n\n References\n ----------\n .. [1] R. O. Duda, P. E. Hart, D. G. Stork. Pattern Classification\n (Second Edition). John Wiley & Sons, Inc., New York, 2001. 
ISBN\n 0-471-05669-3.\n ", "source_code": "\ndef _solve_lsqr(self, X, y, shrinkage, covariance_estimator):\n \"\"\"Least squares solver.\n\n The least squares solver computes a straightforward solution of the\n optimal decision rule based directly on the discriminant functions. It\n can only be used for classification (with any covariance estimator),\n because\n estimation of eigenvectors is not performed. Therefore, dimensionality\n reduction with the transform is not supported.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training data.\n\n y : array-like of shape (n_samples,) or (n_samples, n_classes)\n Target values.\n\n shrinkage : 'auto', float or None\n Shrinkage parameter, possible values:\n - None: no shrinkage.\n - 'auto': automatic shrinkage using the Ledoit-Wolf lemma.\n - float between 0 and 1: fixed shrinkage parameter.\n\n Shrinkage parameter is ignored if `covariance_estimator` i\n not None\n\n covariance_estimator : estimator, default=None\n If not None, `covariance_estimator` is used to estimate\n the covariance matrices instead of relying the empirical\n covariance estimator (with potential shrinkage).\n The object should have a fit method and a ``covariance_`` attribute\n like the estimators in sklearn.covariance.\n if None the shrinkage parameter drives the estimate.\n\n .. versionadded:: 0.24\n\n Notes\n -----\n This solver is based on [1]_, section 2.6.2, pp. 39-41.\n\n References\n ----------\n .. [1] R. O. Duda, P. E. Hart, D. G. Stork. Pattern Classification\n (Second Edition). John Wiley & Sons, Inc., New York, 2001. ISBN\n 0-471-05669-3.\n \"\"\"\n self.means_ = _class_means(X, y)\n self.covariance_ = _class_cov(X, y, self.priors_, shrinkage, covariance_estimator)\n self.coef_ = linalg.lstsq(self.covariance_, self.means_.T)[0].T\n self.intercept_ = -0.5 * np.diag(np.dot(self.means_, self.coef_.T)) + np.log(self.priors_)" }, { @@ -55148,7 +57930,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -55158,7 +57941,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "Training data." - } + }, + "refined_type": {} }, { "name": "y", @@ -55168,13 +57952,14 @@ "docstring": { "type": "array-like of shape (n_samples,) or (n_samples, n_targets)", "description": "Target values." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "SVD solver.", - "docstring": "SVD solver.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n Training data.\n\ny : array-like of shape (n_samples,) or (n_samples, n_targets)\n Target values.", + "docstring": "SVD solver.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training data.\n\n y : array-like of shape (n_samples,) or (n_samples, n_targets)\n Target values.\n ", "source_code": "\ndef _solve_svd(self, X, y):\n \"\"\"SVD solver.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training data.\n\n y : array-like of shape (n_samples,) or (n_samples, n_targets)\n Target values.\n \"\"\"\n (n_samples, n_features) = X.shape\n n_classes = len(self.classes_)\n self.means_ = _class_means(X, y)\n if self.store_covariance:\n self.covariance_ = _class_cov(X, y, self.priors_)\n Xc = []\n for (idx, group) in enumerate(self.classes_):\n Xg = X[y == group, :]\n Xc.append(Xg - self.means_[idx])\n self.xbar_ = np.dot(self.priors_, self.means_)\n Xc = np.concatenate(Xc, axis=0)\n std = Xc.std(axis=0)\n std[std == 0] = 1.0\n fac = 1.0 / (n_samples - n_classes)\n X = np.sqrt(fac) * (Xc / std)\n (U, S, Vt) = linalg.svd(X, full_matrices=False)\n rank = np.sum(S > self.tol)\n scalings = (Vt[:rank] / std).T / S[:rank]\n X = np.dot((np.sqrt(n_samples * self.priors_ * fac) * (self.means_ - self.xbar_).T).T, scalings)\n (_, S, Vt) = linalg.svd(X, full_matrices=0)\n if self._max_components == 0:\n self.explained_variance_ratio_ = np.empty((0, ), dtype=S.dtype)\n else:\n self.explained_variance_ratio_ = (S**2 / np.sum(S**2))[:self._max_components]\n rank = np.sum(S > self.tol * S[0])\n self.scalings_ = np.dot(scalings, Vt.T[:, :rank])\n coef = np.dot(self.means_ - self.xbar_, self.scalings_)\n self.intercept_ = -0.5 * np.sum(coef**2, axis=1) + np.log(self.priors_)\n self.coef_ = np.dot(coef, self.scalings_.T)\n self.intercept_ -= np.dot(self.xbar_, self.coef_.T)" }, { @@ -55192,7 +57977,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -55202,13 +57988,14 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "Array of samples (test vectors)." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Apply decision function to an array of samples.\n\nThe decision function is equal (up to a constant factor) to the log-posterior of the model, i.e. `log p(y = k | x)`. In a binary classification setting this instead corresponds to the difference `log p(y = 1 | x) - log p(y = 0 | x)`. See :ref:`lda_qda_math`.", - "docstring": "Apply decision function to an array of samples.\n\nThe decision function is equal (up to a constant factor) to the\nlog-posterior of the model, i.e. `log p(y = k | x)`. In a binary\nclassification setting this instead corresponds to the difference\n`log p(y = 1 | x) - log p(y = 0 | x)`. 
See :ref:`lda_qda_math`.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n Array of samples (test vectors).\n\nReturns\n-------\nC : ndarray of shape (n_samples,) or (n_samples, n_classes)\n Decision function values related to each class, per sample.\n In the two-class case, the shape is (n_samples,), giving the\n log likelihood ratio of the positive class.", + "description": "Apply decision function to an array of samples.\n\nThe decision function is equal (up to a constant factor) to the\nlog-posterior of the model, i.e. `log p(y = k | x)`. In a binary\nclassification setting this instead corresponds to the difference\n`log p(y = 1 | x) - log p(y = 0 | x)`. See :ref:`lda_qda_math`.", + "docstring": "Apply decision function to an array of samples.\n\n The decision function is equal (up to a constant factor) to the\n log-posterior of the model, i.e. `log p(y = k | x)`. In a binary\n classification setting this instead corresponds to the difference\n `log p(y = 1 | x) - log p(y = 0 | x)`. See :ref:`lda_qda_math`.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Array of samples (test vectors).\n\n Returns\n -------\n C : ndarray of shape (n_samples,) or (n_samples, n_classes)\n Decision function values related to each class, per sample.\n In the two-class case, the shape is (n_samples,), giving the\n log likelihood ratio of the positive class.\n ", "source_code": "\ndef decision_function(self, X):\n \"\"\"Apply decision function to an array of samples.\n\n The decision function is equal (up to a constant factor) to the\n log-posterior of the model, i.e. `log p(y = k | x)`. In a binary\n classification setting this instead corresponds to the difference\n `log p(y = 1 | x) - log p(y = 0 | x)`. See :ref:`lda_qda_math`.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Array of samples (test vectors).\n\n Returns\n -------\n C : ndarray of shape (n_samples,) or (n_samples, n_classes)\n Decision function values related to each class, per sample.\n In the two-class case, the shape is (n_samples,), giving the\n log likelihood ratio of the positive class.\n \"\"\"\n return super().decision_function(X)" }, { @@ -55226,7 +58013,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -55236,7 +58024,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "Training data." - } + }, + "refined_type": {} }, { "name": "y", @@ -55246,13 +58035,14 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "Target values." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Fit the Linear Discriminant Analysis model.\n\n .. versionchanged:: 0.19 *store_covariance* has been moved to main constructor. .. versionchanged:: 0.19 *tol* has been moved to main constructor.", - "docstring": "Fit the Linear Discriminant Analysis model.\n\n .. versionchanged:: 0.19\n *store_covariance* has been moved to main constructor.\n\n .. versionchanged:: 0.19\n *tol* has been moved to main constructor.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n Training data.\n\ny : array-like of shape (n_samples,)\n Target values.\n\nReturns\n-------\nself : object\n Fitted estimator.", + "description": "Fit the Linear Discriminant Analysis model.\n\n .. versionchanged:: 0.19\n *store_covariance* has been moved to main constructor.\n\n .. 
versionchanged:: 0.19\n *tol* has been moved to main constructor.", + "docstring": "Fit the Linear Discriminant Analysis model.\n\n .. versionchanged:: 0.19\n *store_covariance* has been moved to main constructor.\n\n .. versionchanged:: 0.19\n *tol* has been moved to main constructor.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training data.\n\n y : array-like of shape (n_samples,)\n Target values.\n\n Returns\n -------\n self : object\n Fitted estimator.\n ", "source_code": "\ndef fit(self, X, y):\n \"\"\"Fit the Linear Discriminant Analysis model.\n\n .. versionchanged:: 0.19\n *store_covariance* has been moved to main constructor.\n\n .. versionchanged:: 0.19\n *tol* has been moved to main constructor.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training data.\n\n y : array-like of shape (n_samples,)\n Target values.\n\n Returns\n -------\n self : object\n Fitted estimator.\n \"\"\"\n (X, y) = self._validate_data(X, y, ensure_min_samples=2, estimator=self, dtype=[np.float64, np.float32])\n self.classes_ = unique_labels(y)\n (n_samples, _) = X.shape\n n_classes = len(self.classes_)\n if n_samples == n_classes:\n raise ValueError('The number of samples must be more than the number of classes.')\n if self.priors is None:\n (_, y_t) = np.unique(y, return_inverse=True)\n self.priors_ = np.bincount(y_t) / float(len(y))\n else:\n self.priors_ = np.asarray(self.priors)\n if (self.priors_ < 0).any():\n raise ValueError('priors must be non-negative')\n if not np.isclose(self.priors_.sum(), 1.0):\n warnings.warn('The priors do not sum to 1. Renormalizing', UserWarning)\n self.priors_ = self.priors_ / self.priors_.sum()\n max_components = min(len(self.classes_) - 1, X.shape[1])\n if self.n_components is None:\n self._max_components = max_components\n else:\n if self.n_components > max_components:\n raise ValueError('n_components cannot be larger than min(n_features, n_classes - 1).')\n self._max_components = self.n_components\n if self.solver == 'svd':\n if self.shrinkage is not None:\n raise NotImplementedError('shrinkage not supported')\n if self.covariance_estimator is not None:\n raise ValueError('covariance estimator is not supported with svd solver. Try another solver')\n self._solve_svd(X, y)\n elif self.solver == 'lsqr':\n self._solve_lsqr(X, y, shrinkage=self.shrinkage, covariance_estimator=self.covariance_estimator)\n elif self.solver == 'eigen':\n self._solve_eigen(X, y, shrinkage=self.shrinkage, covariance_estimator=self.covariance_estimator)\n else:\n raise ValueError(\"unknown solver {} (valid solvers are 'svd', 'lsqr', and 'eigen').\".format(self.solver))\n if self.classes_.size == 2:\n self.coef_ = np.array(self.coef_[1, :] - self.coef_[0, :], ndmin=2, dtype=X.dtype)\n self.intercept_ = np.array(self.intercept_[1] - self.intercept_[0], ndmin=1, dtype=X.dtype)\n return self" }, { @@ -55270,7 +58060,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -55280,13 +58071,14 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "Input data." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Estimate log probability.", - "docstring": "Estimate log probability.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n Input data.\n\nReturns\n-------\nC : ndarray of shape (n_samples, n_classes)\n Estimated log probabilities.", + "docstring": "Estimate log probability.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Input data.\n\n Returns\n -------\n C : ndarray of shape (n_samples, n_classes)\n Estimated log probabilities.\n ", "source_code": "\ndef predict_log_proba(self, X):\n \"\"\"Estimate log probability.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Input data.\n\n Returns\n -------\n C : ndarray of shape (n_samples, n_classes)\n Estimated log probabilities.\n \"\"\"\n prediction = self.predict_proba(X)\n prediction[prediction == 0.0] += np.finfo(prediction.dtype).tiny\n return np.log(prediction)" }, { @@ -55304,7 +58096,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -55314,13 +58107,14 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "Input data." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Estimate probability.", - "docstring": "Estimate probability.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n Input data.\n\nReturns\n-------\nC : ndarray of shape (n_samples, n_classes)\n Estimated probabilities.", + "docstring": "Estimate probability.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Input data.\n\n Returns\n -------\n C : ndarray of shape (n_samples, n_classes)\n Estimated probabilities.\n ", "source_code": "\ndef predict_proba(self, X):\n \"\"\"Estimate probability.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Input data.\n\n Returns\n -------\n C : ndarray of shape (n_samples, n_classes)\n Estimated probabilities.\n \"\"\"\n check_is_fitted(self)\n decision = self.decision_function(X)\n if self.classes_.size == 2:\n proba = expit(decision)\n return np.vstack([1 - proba, proba]).T\n else:\n return softmax(decision)" }, { @@ -55338,7 +58132,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -55348,13 +58143,14 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "Input data." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Project data to maximize class separation.", - "docstring": "Project data to maximize class separation.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n Input data.\n\nReturns\n-------\nX_new : ndarray of shape (n_samples, n_components)\n Transformed data.", + "docstring": "Project data to maximize class separation.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Input data.\n\n Returns\n -------\n X_new : ndarray of shape (n_samples, n_components)\n Transformed data.\n ", "source_code": "\ndef transform(self, X):\n \"\"\"Project data to maximize class separation.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Input data.\n\n Returns\n -------\n X_new : ndarray of shape (n_samples, n_components)\n Transformed data.\n \"\"\"\n if self.solver == 'lsqr':\n raise NotImplementedError(\"transform not implemented for 'lsqr' solver (use 'svd' or 'eigen').\")\n check_is_fitted(self)\n X = self._validate_data(X, reset=False)\n if self.solver == 'svd':\n X_new = np.dot(X - self.xbar_, self.scalings_)\n elif self.solver == 'eigen':\n X_new = np.dot(X, self.scalings_)\n return X_new[:, :self._max_components]" }, { @@ -55372,7 +58168,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "priors", @@ -55382,7 +58179,8 @@ "docstring": { "type": "ndarray of shape (n_classes,), default=None", "description": "Class priors. By default, the class proportions are inferred from the\ntraining data." - } + }, + "refined_type": {} }, { "name": "reg_param", @@ -55392,7 +58190,8 @@ "docstring": { "type": "float, default=0.0", "description": "Regularizes the per-class covariance estimates by transforming S2 as\n``S2 = (1 - reg_param) * S2 + reg_param * np.eye(n_features)``,\nwhere S2 corresponds to the `scaling_` attribute of a given class." - } + }, + "refined_type": {} }, { "name": "store_covariance", @@ -55402,7 +58201,8 @@ "docstring": { "type": "bool, default=False", "description": "If True, the class covariance matrices are explicitly computed and\nstored in the `self.covariance_` attribute.\n\n.. versionadded:: 0.17" - } + }, + "refined_type": {} }, { "name": "tol", @@ -55412,13 +58212,14 @@ "docstring": { "type": "float, default=1.0e-4", "description": "Absolute threshold for a singular value to be considered significant,\nused to estimate the rank of `Xk` where `Xk` is the centered matrix\nof samples in class k. This parameter does not affect the\npredictions. It only controls a warning that is raised when features\nare considered to be colinear.\n\n.. 
versionadded:: 0.17" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, *, priors=None, reg_param=0.0, store_covariance=False, tol=0.0001):\n self.priors = np.asarray(priors) if priors is not None else None\n self.reg_param = reg_param\n self.store_covariance = store_covariance\n self.tol = tol" }, { @@ -55436,7 +58237,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -55446,13 +58248,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _decision_function(self, X):\n check_is_fitted(self)\n X = self._validate_data(X, reset=False)\n norm2 = []\n for i in range(len(self.classes_)):\n R = self.rotations_[i]\n S = self.scalings_[i]\n Xm = X - self.means_[i]\n X2 = np.dot(Xm, R * S**(-0.5))\n norm2.append(np.sum(X2**2, axis=1))\n norm2 = np.array(norm2).T\n u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])\n return -0.5 * (norm2 + u) + np.log(self.priors_)" }, { @@ -55470,7 +58273,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -55480,13 +58284,14 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "Array of samples (test vectors)." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Apply decision function to an array of samples.\n\nThe decision function is equal (up to a constant factor) to the log-posterior of the model, i.e. `log p(y = k | x)`. In a binary classification setting this instead corresponds to the difference `log p(y = 1 | x) - log p(y = 0 | x)`. See :ref:`lda_qda_math`.", - "docstring": "Apply decision function to an array of samples.\n\nThe decision function is equal (up to a constant factor) to the\nlog-posterior of the model, i.e. `log p(y = k | x)`. In a binary\nclassification setting this instead corresponds to the difference\n`log p(y = 1 | x) - log p(y = 0 | x)`. See :ref:`lda_qda_math`.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n Array of samples (test vectors).\n\nReturns\n-------\nC : ndarray of shape (n_samples,) or (n_samples, n_classes)\n Decision function values related to each class, per sample.\n In the two-class case, the shape is (n_samples,), giving the\n log likelihood ratio of the positive class.", + "description": "Apply decision function to an array of samples.\n\nThe decision function is equal (up to a constant factor) to the\nlog-posterior of the model, i.e. `log p(y = k | x)`. In a binary\nclassification setting this instead corresponds to the difference\n`log p(y = 1 | x) - log p(y = 0 | x)`. See :ref:`lda_qda_math`.", + "docstring": "Apply decision function to an array of samples.\n\n The decision function is equal (up to a constant factor) to the\n log-posterior of the model, i.e. `log p(y = k | x)`. In a binary\n classification setting this instead corresponds to the difference\n `log p(y = 1 | x) - log p(y = 0 | x)`. 
See :ref:`lda_qda_math`.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Array of samples (test vectors).\n\n Returns\n -------\n C : ndarray of shape (n_samples,) or (n_samples, n_classes)\n Decision function values related to each class, per sample.\n In the two-class case, the shape is (n_samples,), giving the\n log likelihood ratio of the positive class.\n ", "source_code": "\ndef decision_function(self, X):\n \"\"\"Apply decision function to an array of samples.\n\n The decision function is equal (up to a constant factor) to the\n log-posterior of the model, i.e. `log p(y = k | x)`. In a binary\n classification setting this instead corresponds to the difference\n `log p(y = 1 | x) - log p(y = 0 | x)`. See :ref:`lda_qda_math`.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Array of samples (test vectors).\n\n Returns\n -------\n C : ndarray of shape (n_samples,) or (n_samples, n_classes)\n Decision function values related to each class, per sample.\n In the two-class case, the shape is (n_samples,), giving the\n log likelihood ratio of the positive class.\n \"\"\"\n dec_func = self._decision_function(X)\n if len(self.classes_) == 2:\n return dec_func[:, 1] - dec_func[:, 0]\n return dec_func" }, { @@ -55504,7 +58309,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -55514,7 +58320,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "Training vector, where `n_samples` is the number of samples and\n`n_features` is the number of features." - } + }, + "refined_type": {} }, { "name": "y", @@ -55524,13 +58331,14 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "Target values (integers)." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Fit the model according to the given training data and parameters.\n\n .. versionchanged:: 0.19 ``store_covariances`` has been moved to main constructor as ``store_covariance`` .. versionchanged:: 0.19 ``tol`` has been moved to main constructor.", - "docstring": "Fit the model according to the given training data and parameters.\n\n .. versionchanged:: 0.19\n ``store_covariances`` has been moved to main constructor as\n ``store_covariance``\n\n .. versionchanged:: 0.19\n ``tol`` has been moved to main constructor.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n Training vector, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\ny : array-like of shape (n_samples,)\n Target values (integers).\n\nReturns\n-------\nself : object\n Fitted estimator.", + "description": "Fit the model according to the given training data and parameters.\n\n .. versionchanged:: 0.19\n ``store_covariances`` has been moved to main constructor as\n ``store_covariance``\n\n .. versionchanged:: 0.19\n ``tol`` has been moved to main constructor.", + "docstring": "Fit the model according to the given training data and parameters.\n\n .. versionchanged:: 0.19\n ``store_covariances`` has been moved to main constructor as\n ``store_covariance``\n\n .. 
versionchanged:: 0.19\n ``tol`` has been moved to main constructor.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training vector, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\n y : array-like of shape (n_samples,)\n Target values (integers).\n\n Returns\n -------\n self : object\n Fitted estimator.\n ", "source_code": "\ndef fit(self, X, y):\n \"\"\"Fit the model according to the given training data and parameters.\n\n .. versionchanged:: 0.19\n ``store_covariances`` has been moved to main constructor as\n ``store_covariance``\n\n .. versionchanged:: 0.19\n ``tol`` has been moved to main constructor.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training vector, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\n y : array-like of shape (n_samples,)\n Target values (integers).\n\n Returns\n -------\n self : object\n Fitted estimator.\n \"\"\"\n (X, y) = self._validate_data(X, y)\n check_classification_targets(y)\n (self.classes_, y) = np.unique(y, return_inverse=True)\n (n_samples, n_features) = X.shape\n n_classes = len(self.classes_)\n if n_classes < 2:\n raise ValueError('The number of classes has to be greater than one; got %d class' % n_classes)\n if self.priors is None:\n self.priors_ = np.bincount(y) / float(n_samples)\n else:\n self.priors_ = self.priors\n cov = None\n store_covariance = self.store_covariance\n if store_covariance:\n cov = []\n means = []\n scalings = []\n rotations = []\n for ind in range(n_classes):\n Xg = X[y == ind, :]\n meang = Xg.mean(0)\n means.append(meang)\n if len(Xg) == 1:\n raise ValueError('y has only 1 sample in class %s, covariance is ill defined.' % str(self.classes_[ind]))\n Xgc = Xg - meang\n (_, S, Vt) = np.linalg.svd(Xgc, full_matrices=False)\n rank = np.sum(S > self.tol)\n if rank < n_features:\n warnings.warn('Variables are collinear')\n S2 = S**2 / (len(Xg) - 1)\n S2 = (1 - self.reg_param) * S2 + self.reg_param\n if self.store_covariance or store_covariance:\n cov.append(np.dot(S2 * Vt.T, Vt))\n scalings.append(S2)\n rotations.append(Vt.T)\n if self.store_covariance or store_covariance:\n self.covariance_ = cov\n self.means_ = np.asarray(means)\n self.scalings_ = scalings\n self.rotations_ = rotations\n return self" }, { @@ -55548,7 +58356,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -55558,13 +58367,14 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "Vector to be scored, where `n_samples` is the number of samples and\n`n_features` is the number of features." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Perform classification on an array of test vectors X.\n\nThe predicted class C for each sample in X is returned.", - "docstring": "Perform classification on an array of test vectors X.\n\nThe predicted class C for each sample in X is returned.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n Vector to be scored, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\nReturns\n-------\nC : ndarray of shape (n_samples,)\n Estimated probabilities.", + "docstring": "Perform classification on an array of test vectors X.\n\n The predicted class C for each sample in X is returned.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Vector to be scored, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\n Returns\n -------\n C : ndarray of shape (n_samples,)\n Estimated probabilities.\n ", "source_code": "\ndef predict(self, X):\n \"\"\"Perform classification on an array of test vectors X.\n\n The predicted class C for each sample in X is returned.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Vector to be scored, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\n Returns\n -------\n C : ndarray of shape (n_samples,)\n Estimated probabilities.\n \"\"\"\n d = self._decision_function(X)\n y_pred = self.classes_.take(d.argmax(1))\n return y_pred" }, { @@ -55582,7 +58392,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -55592,13 +58403,14 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "Array of samples/test vectors." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Return log of posterior probabilities of classification.", - "docstring": "Return log of posterior probabilities of classification.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n Array of samples/test vectors.\n\nReturns\n-------\nC : ndarray of shape (n_samples, n_classes)\n Posterior log-probabilities of classification per class.", + "docstring": "Return log of posterior probabilities of classification.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Array of samples/test vectors.\n\n Returns\n -------\n C : ndarray of shape (n_samples, n_classes)\n Posterior log-probabilities of classification per class.\n ", "source_code": "\ndef predict_log_proba(self, X):\n \"\"\"Return log of posterior probabilities of classification.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Array of samples/test vectors.\n\n Returns\n -------\n C : ndarray of shape (n_samples, n_classes)\n Posterior log-probabilities of classification per class.\n \"\"\"\n probas_ = self.predict_proba(X)\n return np.log(probas_)" }, { @@ -55616,7 +58428,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -55626,13 +58439,14 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "Array of samples/test vectors." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Return posterior probabilities of classification.", - "docstring": "Return posterior probabilities of classification.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n Array of samples/test vectors.\n\nReturns\n-------\nC : ndarray of shape (n_samples, n_classes)\n Posterior probabilities of classification per class.", + "docstring": "Return posterior probabilities of classification.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Array of samples/test vectors.\n\n Returns\n -------\n C : ndarray of shape (n_samples, n_classes)\n Posterior probabilities of classification per class.\n ", "source_code": "\ndef predict_proba(self, X):\n \"\"\"Return posterior probabilities of classification.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Array of samples/test vectors.\n\n Returns\n -------\n C : ndarray of shape (n_samples, n_classes)\n Posterior probabilities of classification per class.\n \"\"\"\n values = self._decision_function(X)\n likelihood = np.exp(values - values.max(axis=1)[:, np.newaxis])\n return likelihood / likelihood.sum(axis=1)[:, np.newaxis]" }, { @@ -55650,7 +58464,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "Input data." - } + }, + "refined_type": {} }, { "name": "y", @@ -55660,7 +58475,8 @@ "docstring": { "type": "array-like of shape (n_samples,) or (n_samples, n_targets)", "description": "Target values." - } + }, + "refined_type": {} }, { "name": "priors", @@ -55670,7 +58486,8 @@ "docstring": { "type": "array-like of shape (n_classes,)", "description": "Class priors." - } + }, + "refined_type": {} }, { "name": "shrinkage", @@ -55680,7 +58497,8 @@ "docstring": { "type": "'auto' or float, default=None", "description": "Shrinkage parameter, possible values:\n - None: no shrinkage (default).\n - 'auto': automatic shrinkage using the Ledoit-Wolf lemma.\n - float between 0 and 1: fixed shrinkage parameter.\n\nShrinkage parameter is ignored if `covariance_estimator` is not None." - } + }, + "refined_type": {} }, { "name": "covariance_estimator", @@ -55690,13 +58508,14 @@ "docstring": { "type": "estimator, default=None", "description": "If not None, `covariance_estimator` is used to estimate\nthe covariance matrices instead of relying the empirical\ncovariance estimator (with potential shrinkage).\nThe object should have a fit method and a ``covariance_`` attribute\nlike the estimators in sklearn.covariance.\nIf None, the shrinkage parameter drives the estimate.\n\n.. 
versionadded:: 0.24" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Compute weighted within-class covariance matrix.\n\nThe per-class covariance are weighted by the class priors.", - "docstring": "Compute weighted within-class covariance matrix.\n\nThe per-class covariance are weighted by the class priors.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n Input data.\n\ny : array-like of shape (n_samples,) or (n_samples, n_targets)\n Target values.\n\npriors : array-like of shape (n_classes,)\n Class priors.\n\nshrinkage : 'auto' or float, default=None\n Shrinkage parameter, possible values:\n - None: no shrinkage (default).\n - 'auto': automatic shrinkage using the Ledoit-Wolf lemma.\n - float between 0 and 1: fixed shrinkage parameter.\n\n Shrinkage parameter is ignored if `covariance_estimator` is not None.\n\ncovariance_estimator : estimator, default=None\n If not None, `covariance_estimator` is used to estimate\n the covariance matrices instead of relying the empirical\n covariance estimator (with potential shrinkage).\n The object should have a fit method and a ``covariance_`` attribute\n like the estimators in sklearn.covariance.\n If None, the shrinkage parameter drives the estimate.\n\n .. versionadded:: 0.24\n\nReturns\n-------\ncov : array-like of shape (n_features, n_features)\n Weighted within-class covariance matrix", + "docstring": "Compute weighted within-class covariance matrix.\n\n The per-class covariance are weighted by the class priors.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Input data.\n\n y : array-like of shape (n_samples,) or (n_samples, n_targets)\n Target values.\n\n priors : array-like of shape (n_classes,)\n Class priors.\n\n shrinkage : 'auto' or float, default=None\n Shrinkage parameter, possible values:\n - None: no shrinkage (default).\n - 'auto': automatic shrinkage using the Ledoit-Wolf lemma.\n - float between 0 and 1: fixed shrinkage parameter.\n\n Shrinkage parameter is ignored if `covariance_estimator` is not None.\n\n covariance_estimator : estimator, default=None\n If not None, `covariance_estimator` is used to estimate\n the covariance matrices instead of relying the empirical\n covariance estimator (with potential shrinkage).\n The object should have a fit method and a ``covariance_`` attribute\n like the estimators in sklearn.covariance.\n If None, the shrinkage parameter drives the estimate.\n\n .. 
versionadded:: 0.24\n\n Returns\n -------\n cov : array-like of shape (n_features, n_features)\n Weighted within-class covariance matrix\n ", "source_code": "\ndef _class_cov(X, y, priors, shrinkage=None, covariance_estimator=None):\n \"\"\"Compute weighted within-class covariance matrix.\n\n The per-class covariance are weighted by the class priors.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Input data.\n\n y : array-like of shape (n_samples,) or (n_samples, n_targets)\n Target values.\n\n priors : array-like of shape (n_classes,)\n Class priors.\n\n shrinkage : 'auto' or float, default=None\n Shrinkage parameter, possible values:\n - None: no shrinkage (default).\n - 'auto': automatic shrinkage using the Ledoit-Wolf lemma.\n - float between 0 and 1: fixed shrinkage parameter.\n\n Shrinkage parameter is ignored if `covariance_estimator` is not None.\n\n covariance_estimator : estimator, default=None\n If not None, `covariance_estimator` is used to estimate\n the covariance matrices instead of relying the empirical\n covariance estimator (with potential shrinkage).\n The object should have a fit method and a ``covariance_`` attribute\n like the estimators in sklearn.covariance.\n If None, the shrinkage parameter drives the estimate.\n\n .. versionadded:: 0.24\n\n Returns\n -------\n cov : array-like of shape (n_features, n_features)\n Weighted within-class covariance matrix\n \"\"\"\n classes = np.unique(y)\n cov = np.zeros(shape=(X.shape[1], X.shape[1]))\n for (idx, group) in enumerate(classes):\n Xg = X[y == group, :]\n cov += priors[idx] * np.atleast_2d(_cov(Xg, shrinkage, covariance_estimator))\n return cov" }, { @@ -55714,7 +58533,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "Input data." - } + }, + "refined_type": {} }, { "name": "y", @@ -55724,13 +58544,14 @@ "docstring": { "type": "array-like of shape (n_samples,) or (n_samples, n_targets)", "description": "Target values." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Compute class means.", - "docstring": "Compute class means.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n Input data.\n\ny : array-like of shape (n_samples,) or (n_samples, n_targets)\n Target values.\n\nReturns\n-------\nmeans : array-like of shape (n_classes, n_features)\n Class means.", + "docstring": "Compute class means.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Input data.\n\n y : array-like of shape (n_samples,) or (n_samples, n_targets)\n Target values.\n\n Returns\n -------\n means : array-like of shape (n_classes, n_features)\n Class means.\n ", "source_code": "\ndef _class_means(X, y):\n \"\"\"Compute class means.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Input data.\n\n y : array-like of shape (n_samples,) or (n_samples, n_targets)\n Target values.\n\n Returns\n -------\n means : array-like of shape (n_classes, n_features)\n Class means.\n \"\"\"\n (classes, y) = np.unique(y, return_inverse=True)\n cnt = np.bincount(y)\n means = np.zeros(shape=(len(classes), X.shape[1]))\n np.add.at(means, y, X)\n means /= cnt[:, None]\n return means" }, { @@ -55748,7 +58569,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "shrinkage", @@ -55758,7 +58580,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "covariance_estimator", @@ -55768,13 +58591,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Estimate covariance matrix (using optional covariance_estimator). Parameters ---------- X : array-like of shape (n_samples, n_features) Input data.\n\nshrinkage : {'empirical', 'auto'} or float, default=None Shrinkage parameter, possible values: - None or 'empirical': no shrinkage (default). - 'auto': automatic shrinkage using the Ledoit-Wolf lemma. - float between 0 and 1: fixed shrinkage parameter. Shrinkage parameter is ignored if `covariance_estimator` is not None. covariance_estimator : estimator, default=None If not None, `covariance_estimator` is used to estimate the covariance matrices instead of relying on the empirical covariance estimator (with potential shrinkage). The object should have a fit method and a ``covariance_`` attribute like the estimators in :mod:`sklearn.covariance``. if None the shrinkage parameter drives the estimate. .. versionadded:: 0.24", - "docstring": "Estimate covariance matrix (using optional covariance_estimator).\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n Input data.\n\nshrinkage : {'empirical', 'auto'} or float, default=None\n Shrinkage parameter, possible values:\n - None or 'empirical': no shrinkage (default).\n - 'auto': automatic shrinkage using the Ledoit-Wolf lemma.\n - float between 0 and 1: fixed shrinkage parameter.\n\n Shrinkage parameter is ignored if `covariance_estimator`\n is not None.\n\ncovariance_estimator : estimator, default=None\n If not None, `covariance_estimator` is used to estimate\n the covariance matrices instead of relying on the empirical\n covariance estimator (with potential shrinkage).\n The object should have a fit method and a ``covariance_`` attribute\n like the estimators in :mod:`sklearn.covariance``.\n if None the shrinkage parameter drives the estimate.\n\n .. 
versionadded:: 0.24\n\nReturns\n-------\ns : ndarray of shape (n_features, n_features)\n Estimated covariance matrix.", + "description": "Estimate covariance matrix (using optional covariance_estimator).\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n Input data.\n\nshrinkage : {'empirical', 'auto'} or float, default=None\n Shrinkage parameter, possible values:\n - None or 'empirical': no shrinkage (default).\n - 'auto': automatic shrinkage using the Ledoit-Wolf lemma.\n - float between 0 and 1: fixed shrinkage parameter.\n\n Shrinkage parameter is ignored if `covariance_estimator`\n is not None.\n\ncovariance_estimator : estimator, default=None\n If not None, `covariance_estimator` is used to estimate\n the covariance matrices instead of relying on the empirical\n covariance estimator (with potential shrinkage).\n The object should have a fit method and a ``covariance_`` attribute\n like the estimators in :mod:`sklearn.covariance``.\n if None the shrinkage parameter drives the estimate.\n\n .. versionadded:: 0.24", + "docstring": "Estimate covariance matrix (using optional covariance_estimator).\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Input data.\n\n shrinkage : {'empirical', 'auto'} or float, default=None\n Shrinkage parameter, possible values:\n - None or 'empirical': no shrinkage (default).\n - 'auto': automatic shrinkage using the Ledoit-Wolf lemma.\n - float between 0 and 1: fixed shrinkage parameter.\n\n Shrinkage parameter is ignored if `covariance_estimator`\n is not None.\n\n covariance_estimator : estimator, default=None\n If not None, `covariance_estimator` is used to estimate\n the covariance matrices instead of relying on the empirical\n covariance estimator (with potential shrinkage).\n The object should have a fit method and a ``covariance_`` attribute\n like the estimators in :mod:`sklearn.covariance``.\n if None the shrinkage parameter drives the estimate.\n\n .. versionadded:: 0.24\n\n Returns\n -------\n s : ndarray of shape (n_features, n_features)\n Estimated covariance matrix.\n ", "source_code": "\ndef _cov(X, shrinkage=None, covariance_estimator=None):\n \"\"\"Estimate covariance matrix (using optional covariance_estimator).\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Input data.\n\n shrinkage : {'empirical', 'auto'} or float, default=None\n Shrinkage parameter, possible values:\n - None or 'empirical': no shrinkage (default).\n - 'auto': automatic shrinkage using the Ledoit-Wolf lemma.\n - float between 0 and 1: fixed shrinkage parameter.\n\n Shrinkage parameter is ignored if `covariance_estimator`\n is not None.\n\n covariance_estimator : estimator, default=None\n If not None, `covariance_estimator` is used to estimate\n the covariance matrices instead of relying on the empirical\n covariance estimator (with potential shrinkage).\n The object should have a fit method and a ``covariance_`` attribute\n like the estimators in :mod:`sklearn.covariance``.\n if None the shrinkage parameter drives the estimate.\n\n .. 
versionadded:: 0.24\n\n Returns\n -------\n s : ndarray of shape (n_features, n_features)\n Estimated covariance matrix.\n \"\"\"\n if covariance_estimator is None:\n shrinkage = 'empirical' if shrinkage is None else shrinkage\n if isinstance(shrinkage, str):\n if shrinkage == 'auto':\n sc = StandardScaler()\n X = sc.fit_transform(X)\n s = ledoit_wolf(X)[0]\n s = sc.scale_[:, np.newaxis] * s * sc.scale_[np.newaxis, :]\n elif shrinkage == 'empirical':\n s = empirical_covariance(X)\n else:\n raise ValueError('unknown shrinkage parameter')\n elif isinstance(shrinkage, float) or isinstance(shrinkage, int):\n if shrinkage < 0 or shrinkage > 1:\n raise ValueError('shrinkage parameter must be between 0 and 1')\n s = shrunk_covariance(empirical_covariance(X), shrinkage)\n else:\n raise TypeError('shrinkage must be a float or a string')\n else:\n if shrinkage is not None and shrinkage != 0:\n raise ValueError('covariance_estimator and shrinkage parameters are not None. Only one of the two can be set.')\n covariance_estimator.fit(X)\n if not hasattr(covariance_estimator, 'covariance_'):\n raise ValueError('%s does not have a covariance_ attribute' % covariance_estimator.__class__.__name__)\n s = covariance_estimator.covariance_\n return s" }, { @@ -55792,7 +58616,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "strategy", @@ -55800,8 +58625,18 @@ "is_public": true, "assigned_by": "NAME_ONLY", "docstring": { - "type": "{\"stratified\", \"most_frequent\", \"prior\", \"uniform\", \"constant\"}, default=\"prior\"", - "description": "Strategy to use to generate predictions.\n\n* \"stratified\": generates predictions by respecting the training\n set's class distribution.\n* \"most_frequent\": always predicts the most frequent label in the\n training set.\n* \"prior\": always predicts the class that maximizes the class prior\n (like \"most_frequent\") and ``predict_proba`` returns the class prior.\n* \"uniform\": generates predictions uniformly at random.\n* \"constant\": always predicts a constant label that is provided by\n the user. This is useful for metrics that evaluate a non-majority\n class\n\n .. versionchanged:: 0.24\n The default value of `strategy` has changed to \"prior\" in version\n 0.24." + "type": "{\"most_frequent\", \"prior\", \"stratified\", \"uniform\", \"constant\"}, default=\"prior\"", + "description": "Strategy to use to generate predictions.\n\n* \"most_frequent\": the `predict` method always returns the most\n frequent class label in the observed `y` argument passed to `fit`.\n The `predict_proba` method returns the matching one-hot encoded\n vector.\n* \"prior\": the `predict` method always returns the most frequent\n class label in the observed `y` argument passed to `fit` (like\n \"most_frequent\"). ``predict_proba`` always returns the empirical\n class distribution of `y` also known as the empirical class prior\n distribution.\n* \"stratified\": the `predict_proba` method randomly samples one-hot\n vectors from a multinomial distribution parametrized by the empirical\n class prior probabilities.\n The `predict` method returns the class label which got probability\n one in the one-hot vector of `predict_proba`.\n Each sampled row of both methods is therefore independent and\n identically distributed.\n* \"uniform\": generates predictions uniformly at random from the list\n of unique classes observed in `y`, i.e. each class has equal\n probability.\n* \"constant\": always predicts a constant label that is provided by\n the user. 
This is useful for metrics that evaluate a non-majority\n class.\n\n .. versionchanged:: 0.24\n The default value of `strategy` has changed to \"prior\" in version\n 0.24." + }, + "refined_type": { + "kind": "EnumType", + "values": [ + "most_frequent", + "uniform", + "prior", + "constant", + "stratified" + ] } }, { @@ -55812,7 +58647,8 @@ "docstring": { "type": "int, RandomState instance or None, default=None", "description": "Controls the randomness to generate the predictions when\n``strategy='stratified'`` or ``strategy='uniform'``.\nPass an int for reproducible output across multiple function calls.\nSee :term:`Glossary `." - } + }, + "refined_type": {} }, { "name": "constant", @@ -55820,15 +58656,16 @@ "is_public": true, "assigned_by": "NAME_ONLY", "docstring": { - "type": "int or str or array-like of shape (n_outputs,)", + "type": "int or str or array-like of shape (n_outputs,), default=None", "description": "The explicit constant as predicted by the \"constant\" strategy. This\nparameter is useful only for the \"constant\" strategy." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, *, strategy='prior', random_state=None, constant=None):\n self.strategy = strategy\n self.random_state = random_state\n self.constant = constant" }, { @@ -55846,13 +58683,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _more_tags(self):\n return {'poor_score': True, 'no_validation': True, '_xfail_checks': {'check_methods_subset_invariance': 'fails for the predict method', 'check_methods_sample_order_invariance': 'fails for the predict method'}}" }, { @@ -55870,7 +58708,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -55880,7 +58719,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "Training data." - } + }, + "refined_type": {} }, { "name": "y", @@ -55890,7 +58730,8 @@ "docstring": { "type": "array-like of shape (n_samples,) or (n_samples, n_outputs)", "description": "Target values." - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -55900,14 +58741,15 @@ "docstring": { "type": "array-like of shape (n_samples,), default=None", "description": "Sample weights." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Fit the random classifier.", - "docstring": "Fit the random classifier.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n Training data.\n\ny : array-like of shape (n_samples,) or (n_samples, n_outputs)\n Target values.\n\nsample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\nReturns\n-------\nself : object\n Returns the instance itself.", - "source_code": "\ndef fit(self, X, y, sample_weight=None):\n \"\"\"Fit the random classifier.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training data.\n\n y : array-like of shape (n_samples,) or (n_samples, n_outputs)\n Target values.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n Returns\n -------\n self : object\n Returns the instance itself.\n \"\"\"\n allowed_strategies = ('most_frequent', 'stratified', 'uniform', 'constant', 'prior')\n if self.strategy not in allowed_strategies:\n raise ValueError('Unknown strategy type: %s, expected one of %s.' % (self.strategy, allowed_strategies))\n self._strategy = self.strategy\n if self._strategy == 'uniform' and sp.issparse(y):\n y = y.toarray()\n warnings.warn('A local copy of the target data has been converted to a numpy array. Predicting on sparse target data with the uniform strategy would not save memory and would be slower.', UserWarning)\n self.sparse_output_ = sp.issparse(y)\n if not self.sparse_output_:\n y = np.asarray(y)\n y = np.atleast_1d(y)\n if y.ndim == 1:\n y = np.reshape(y, (-1, 1))\n self.n_outputs_ = y.shape[1]\n check_consistent_length(X, y)\n if sample_weight is not None:\n sample_weight = _check_sample_weight(sample_weight, X)\n if self._strategy == 'constant':\n if self.constant is None:\n raise ValueError('Constant target value has to be specified when the constant strategy is used.')\n else:\n constant = np.reshape(np.atleast_1d(self.constant), (-1, 1))\n if constant.shape[0] != self.n_outputs_:\n raise ValueError('Constant target value should have shape (%d, 1).' % self.n_outputs_)\n (self.classes_, self.n_classes_, self.class_prior_) = class_distribution(y, sample_weight)\n if self._strategy == 'constant':\n for k in range(self.n_outputs_):\n if not any((constant[k][0] == c for c in self.classes_[k])):\n err_msg = 'The constant target value must be present in the training data. You provided constant={}. 
Possible values are: {}.'.format(self.constant, list(self.classes_[k]))\n raise ValueError(err_msg)\n if self.n_outputs_ == 1:\n self.n_classes_ = self.n_classes_[0]\n self.classes_ = self.classes_[0]\n self.class_prior_ = self.class_prior_[0]\n return self" + "description": "Fit the baseline classifier.", + "docstring": "Fit the baseline classifier.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training data.\n\n y : array-like of shape (n_samples,) or (n_samples, n_outputs)\n Target values.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n Returns\n -------\n self : object\n Returns the instance itself.\n ", + "source_code": "\ndef fit(self, X, y, sample_weight=None):\n \"\"\"Fit the baseline classifier.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training data.\n\n y : array-like of shape (n_samples,) or (n_samples, n_outputs)\n Target values.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n Returns\n -------\n self : object\n Returns the instance itself.\n \"\"\"\n allowed_strategies = ('most_frequent', 'stratified', 'uniform', 'constant', 'prior')\n if self.strategy not in allowed_strategies:\n raise ValueError('Unknown strategy type: %s, expected one of %s.' % (self.strategy, allowed_strategies))\n self._strategy = self.strategy\n if self._strategy == 'uniform' and sp.issparse(y):\n y = y.toarray()\n warnings.warn('A local copy of the target data has been converted to a numpy array. Predicting on sparse target data with the uniform strategy would not save memory and would be slower.', UserWarning)\n self.sparse_output_ = sp.issparse(y)\n if not self.sparse_output_:\n y = np.asarray(y)\n y = np.atleast_1d(y)\n if y.ndim == 1:\n y = np.reshape(y, (-1, 1))\n self.n_outputs_ = y.shape[1]\n check_consistent_length(X, y)\n if sample_weight is not None:\n sample_weight = _check_sample_weight(sample_weight, X)\n if self._strategy == 'constant':\n if self.constant is None:\n raise ValueError('Constant target value has to be specified when the constant strategy is used.')\n else:\n constant = np.reshape(np.atleast_1d(self.constant), (-1, 1))\n if constant.shape[0] != self.n_outputs_:\n raise ValueError('Constant target value should have shape (%d, 1).' % self.n_outputs_)\n (self.classes_, self.n_classes_, self.class_prior_) = class_distribution(y, sample_weight)\n if self._strategy == 'constant':\n for k in range(self.n_outputs_):\n if not any((constant[k][0] == c for c in self.classes_[k])):\n err_msg = 'The constant target value must be present in the training data. You provided constant={}. 
Possible values are: {}.'.format(self.constant, list(self.classes_[k]))\n raise ValueError(err_msg)\n if self.n_outputs_ == 1:\n self.n_classes_ = self.n_classes_[0]\n self.classes_ = self.classes_[0]\n self.class_prior_ = self.class_prior_[0]\n return self" }, { "name": "n_features_in_", @@ -55927,13 +58769,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@deprecated('`n_features_in_` is deprecated in 1.0 and will be removed in 1.2.')\n@property\ndef n_features_in_(self):\n check_is_fitted(self)\n return None" }, { @@ -55951,7 +58794,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -55961,13 +58805,14 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "Test data." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Perform classification on test vectors X.", - "docstring": "Perform classification on test vectors X.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n Test data.\n\nReturns\n-------\ny : array-like of shape (n_samples,) or (n_samples, n_outputs)\n Predicted target values for X.", + "docstring": "Perform classification on test vectors X.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Test data.\n\n Returns\n -------\n y : array-like of shape (n_samples,) or (n_samples, n_outputs)\n Predicted target values for X.\n ", "source_code": "\ndef predict(self, X):\n \"\"\"Perform classification on test vectors X.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Test data.\n\n Returns\n -------\n y : array-like of shape (n_samples,) or (n_samples, n_outputs)\n Predicted target values for X.\n \"\"\"\n check_is_fitted(self)\n n_samples = _num_samples(X)\n rs = check_random_state(self.random_state)\n n_classes_ = self.n_classes_\n classes_ = self.classes_\n class_prior_ = self.class_prior_\n constant = self.constant\n if self.n_outputs_ == 1:\n n_classes_ = [n_classes_]\n classes_ = [classes_]\n class_prior_ = [class_prior_]\n constant = [constant]\n if self._strategy == 'stratified':\n proba = self.predict_proba(X)\n if self.n_outputs_ == 1:\n proba = [proba]\n if self.sparse_output_:\n class_prob = None\n if self._strategy in ('most_frequent', 'prior'):\n classes_ = [np.array([cp.argmax()]) for cp in class_prior_]\n elif self._strategy == 'stratified':\n class_prob = class_prior_\n elif self._strategy == 'uniform':\n raise ValueError('Sparse target prediction is not supported with the uniform strategy')\n elif self._strategy == 'constant':\n classes_ = [np.array([c]) for c in constant]\n y = _random_choice_csc(n_samples, classes_, class_prob, self.random_state)\n else:\n if self._strategy in ('most_frequent', 'prior'):\n y = np.tile([classes_[k][class_prior_[k].argmax()] for k in range(self.n_outputs_)], [n_samples, 1])\n elif self._strategy == 'stratified':\n y = np.vstack([classes_[k][proba[k].argmax(axis=1)] for k in range(self.n_outputs_)]).T\n elif self._strategy == 'uniform':\n ret = [classes_[k][rs.randint(n_classes_[k], size=n_samples)] for k in range(self.n_outputs_)]\n y = np.vstack(ret).T\n elif self._strategy == 'constant':\n y = np.tile(self.constant, (n_samples, 1))\n if self.n_outputs_ == 1:\n y = np.ravel(y)\n return y" }, { @@ -55985,7 +58830,8 @@ "docstring": { "type": "", "description": "" - } + }, + 
"refined_type": {} }, { "name": "X", @@ -55995,13 +58841,17 @@ "docstring": { "type": "{array-like, object with finite length or shape}", "description": "Training data." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } } ], "results": [], "is_public": true, "description": "Return log probability estimates for the test vectors X.", - "docstring": "Return log probability estimates for the test vectors X.\n\nParameters\n----------\nX : {array-like, object with finite length or shape}\n Training data.\n\nReturns\n-------\nP : ndarray of shape (n_samples, n_classes) or list of such arrays\n Returns the log probability of the sample for each class in\n the model, where classes are ordered arithmetically for each\n output.", + "docstring": "\n Return log probability estimates for the test vectors X.\n\n Parameters\n ----------\n X : {array-like, object with finite length or shape}\n Training data.\n\n Returns\n -------\n P : ndarray of shape (n_samples, n_classes) or list of such arrays\n Returns the log probability of the sample for each class in\n the model, where classes are ordered arithmetically for each\n output.\n ", "source_code": "\ndef predict_log_proba(self, X):\n \"\"\"\n Return log probability estimates for the test vectors X.\n\n Parameters\n ----------\n X : {array-like, object with finite length or shape}\n Training data.\n\n Returns\n -------\n P : ndarray of shape (n_samples, n_classes) or list of such arrays\n Returns the log probability of the sample for each class in\n the model, where classes are ordered arithmetically for each\n output.\n \"\"\"\n proba = self.predict_proba(X)\n if self.n_outputs_ == 1:\n return np.log(proba)\n else:\n return [np.log(p) for p in proba]" }, { @@ -56019,7 +58869,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -56029,13 +58880,14 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "Test data." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Return probability estimates for the test vectors X.", - "docstring": "Return probability estimates for the test vectors X.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n Test data.\n\nReturns\n-------\nP : ndarray of shape (n_samples, n_classes) or list of such arrays\n Returns the probability of the sample for each class in\n the model, where classes are ordered arithmetically, for each\n output.", + "docstring": "\n Return probability estimates for the test vectors X.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Test data.\n\n Returns\n -------\n P : ndarray of shape (n_samples, n_classes) or list of such arrays\n Returns the probability of the sample for each class in\n the model, where classes are ordered arithmetically, for each\n output.\n ", "source_code": "\ndef predict_proba(self, X):\n \"\"\"\n Return probability estimates for the test vectors X.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Test data.\n\n Returns\n -------\n P : ndarray of shape (n_samples, n_classes) or list of such arrays\n Returns the probability of the sample for each class in\n the model, where classes are ordered arithmetically, for each\n output.\n \"\"\"\n check_is_fitted(self)\n n_samples = _num_samples(X)\n rs = check_random_state(self.random_state)\n n_classes_ = self.n_classes_\n classes_ = self.classes_\n class_prior_ = self.class_prior_\n constant = self.constant\n if self.n_outputs_ == 1:\n n_classes_ = [n_classes_]\n classes_ = [classes_]\n class_prior_ = [class_prior_]\n constant = [constant]\n P = []\n for k in range(self.n_outputs_):\n if self._strategy == 'most_frequent':\n ind = class_prior_[k].argmax()\n out = np.zeros((n_samples, n_classes_[k]), dtype=np.float64)\n out[:, ind] = 1.0\n elif self._strategy == 'prior':\n out = np.ones((n_samples, 1)) * class_prior_[k]\n elif self._strategy == 'stratified':\n out = rs.multinomial(1, class_prior_[k], size=n_samples)\n out = out.astype(np.float64)\n elif self._strategy == 'uniform':\n out = np.ones((n_samples, n_classes_[k]), dtype=np.float64)\n out /= n_classes_[k]\n elif self._strategy == 'constant':\n ind = np.where(classes_[k] == constant[k])\n out = np.zeros((n_samples, n_classes_[k]), dtype=np.float64)\n out[:, ind] = 1.0\n P.append(out)\n if self.n_outputs_ == 1:\n P = P[0]\n return P" }, { @@ -56053,7 +58905,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -56063,7 +58916,8 @@ "docstring": { "type": "None or array-like of shape (n_samples, n_features)", "description": "Test samples. Passing None as test samples gives the same result\nas passing real test samples, since DummyClassifier\noperates independently of the sampled observations." - } + }, + "refined_type": {} }, { "name": "y", @@ -56073,7 +58927,8 @@ "docstring": { "type": "array-like of shape (n_samples,) or (n_samples, n_outputs)", "description": "True labels for X." - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -56083,13 +58938,14 @@ "docstring": { "type": "array-like of shape (n_samples,), default=None", "description": "Sample weights." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Return the mean accuracy on the given test data and labels.\n\nIn multi-label classification, this is the subset accuracy which is a harsh metric since you require for each sample that each label set be correctly predicted.", - "docstring": "Return the mean accuracy on the given test data and labels.\n\nIn multi-label classification, this is the subset accuracy\nwhich is a harsh metric since you require for each sample that\neach label set be correctly predicted.\n\nParameters\n----------\nX : None or array-like of shape (n_samples, n_features)\n Test samples. Passing None as test samples gives the same result\n as passing real test samples, since DummyClassifier\n operates independently of the sampled observations.\n\ny : array-like of shape (n_samples,) or (n_samples, n_outputs)\n True labels for X.\n\nsample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\nReturns\n-------\nscore : float\n Mean accuracy of self.predict(X) wrt. y.", + "description": "Return the mean accuracy on the given test data and labels.\n\nIn multi-label classification, this is the subset accuracy\nwhich is a harsh metric since you require for each sample that\neach label set be correctly predicted.", + "docstring": "Return the mean accuracy on the given test data and labels.\n\n In multi-label classification, this is the subset accuracy\n which is a harsh metric since you require for each sample that\n each label set be correctly predicted.\n\n Parameters\n ----------\n X : None or array-like of shape (n_samples, n_features)\n Test samples. Passing None as test samples gives the same result\n as passing real test samples, since DummyClassifier\n operates independently of the sampled observations.\n\n y : array-like of shape (n_samples,) or (n_samples, n_outputs)\n True labels for X.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n Returns\n -------\n score : float\n Mean accuracy of self.predict(X) wrt. y.\n ", "source_code": "\ndef score(self, X, y, sample_weight=None):\n \"\"\"Return the mean accuracy on the given test data and labels.\n\n In multi-label classification, this is the subset accuracy\n which is a harsh metric since you require for each sample that\n each label set be correctly predicted.\n\n Parameters\n ----------\n X : None or array-like of shape (n_samples, n_features)\n Test samples. Passing None as test samples gives the same result\n as passing real test samples, since DummyClassifier\n operates independently of the sampled observations.\n\n y : array-like of shape (n_samples,) or (n_samples, n_outputs)\n True labels for X.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n Returns\n -------\n score : float\n Mean accuracy of self.predict(X) wrt. 
y.\n \"\"\"\n if X is None:\n X = np.zeros(shape=(len(y), 1))\n return super().score(X, y, sample_weight)" }, { @@ -56107,7 +58963,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "strategy", @@ -56117,6 +58974,10 @@ "docstring": { "type": "{\"mean\", \"median\", \"quantile\", \"constant\"}, default=\"mean\"", "description": "Strategy to use to generate predictions.\n\n* \"mean\": always predicts the mean of the training set\n* \"median\": always predicts the median of the training set\n* \"quantile\": always predicts a specified quantile of the training set,\n provided with the quantile parameter.\n* \"constant\": always predicts a constant value that is provided by\n the user." + }, + "refined_type": { + "kind": "EnumType", + "values": ["median", "quantile", "constant", "mean"] } }, { @@ -56127,7 +58988,8 @@ "docstring": { "type": "int or float or array-like of shape (n_outputs,), default=None", "description": "The explicit constant as predicted by the \"constant\" strategy. This\nparameter is useful only for the \"constant\" strategy." - } + }, + "refined_type": {} }, { "name": "quantile", @@ -56137,13 +58999,14 @@ "docstring": { "type": "float in [0.0, 1.0], default=None", "description": "The quantile to predict using the \"quantile\" strategy. A quantile of\n0.5 corresponds to the median, while 0.0 to the minimum and 1.0 to the\nmaximum." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, *, strategy='mean', constant=None, quantile=None):\n self.strategy = strategy\n self.constant = constant\n self.quantile = quantile" }, { @@ -56161,13 +59024,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _more_tags(self):\n return {'poor_score': True, 'no_validation': True}" }, { @@ -56185,7 +59049,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -56195,7 +59060,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "Training data." - } + }, + "refined_type": {} }, { "name": "y", @@ -56205,7 +59071,8 @@ "docstring": { "type": "array-like of shape (n_samples,) or (n_samples, n_outputs)", "description": "Target values." - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -56215,13 +59082,14 @@ "docstring": { "type": "array-like of shape (n_samples,), default=None", "description": "Sample weights." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Fit the random regressor.", - "docstring": "Fit the random regressor.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n Training data.\n\ny : array-like of shape (n_samples,) or (n_samples, n_outputs)\n Target values.\n\nsample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\nReturns\n-------\nself : object\n Fitted estimator.", + "docstring": "Fit the random regressor.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training data.\n\n y : array-like of shape (n_samples,) or (n_samples, n_outputs)\n Target values.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n Returns\n -------\n self : object\n Fitted estimator.\n ", "source_code": "\ndef fit(self, X, y, sample_weight=None):\n \"\"\"Fit the random regressor.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training data.\n\n y : array-like of shape (n_samples,) or (n_samples, n_outputs)\n Target values.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n Returns\n -------\n self : object\n Fitted estimator.\n \"\"\"\n allowed_strategies = ('mean', 'median', 'quantile', 'constant')\n if self.strategy not in allowed_strategies:\n raise ValueError('Unknown strategy type: %s, expected one of %s.' % (self.strategy, allowed_strategies))\n y = check_array(y, ensure_2d=False)\n if len(y) == 0:\n raise ValueError('y must not be empty.')\n if y.ndim == 1:\n y = np.reshape(y, (-1, 1))\n self.n_outputs_ = y.shape[1]\n check_consistent_length(X, y, sample_weight)\n if sample_weight is not None:\n sample_weight = _check_sample_weight(sample_weight, X)\n if self.strategy == 'mean':\n self.constant_ = np.average(y, axis=0, weights=sample_weight)\n elif self.strategy == 'median':\n if sample_weight is None:\n self.constant_ = np.median(y, axis=0)\n else:\n self.constant_ = [_weighted_percentile(y[:, k], sample_weight, percentile=50.0) for k in range(self.n_outputs_)]\n elif self.strategy == 'quantile':\n if self.quantile is None or not np.isscalar(self.quantile):\n raise ValueError('Quantile must be a scalar in the range [0.0, 1.0], but got %s.' % self.quantile)\n percentile = self.quantile * 100.0\n if sample_weight is None:\n self.constant_ = np.percentile(y, axis=0, q=percentile)\n else:\n self.constant_ = [_weighted_percentile(y[:, k], sample_weight, percentile=percentile) for k in range(self.n_outputs_)]\n elif self.strategy == 'constant':\n if self.constant is None:\n raise TypeError('Constant target value has to be specified when the constant strategy is used.')\n self.constant = check_array(self.constant, accept_sparse=['csr', 'csc', 'coo'], ensure_2d=False, ensure_min_samples=0)\n if self.n_outputs_ != 1 and self.constant.shape[0] != y.shape[1]:\n raise ValueError('Constant target value should have shape (%d, 1).' 
% y.shape[1])\n self.constant_ = self.constant\n self.constant_ = np.reshape(self.constant_, (1, -1))\n return self" }, { @@ -56242,13 +59110,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@deprecated('`n_features_in_` is deprecated in 1.0 and will be removed in 1.2.')\n@property\ndef n_features_in_(self):\n check_is_fitted(self)\n return None" }, { @@ -56266,7 +59135,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -56276,7 +59146,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "Test data." - } + }, + "refined_type": {} }, { "name": "return_std", @@ -56286,13 +59157,14 @@ "docstring": { "type": "bool, default=False", "description": "Whether to return the standard deviation of posterior prediction.\nAll zeros in this case.\n\n.. versionadded:: 0.20" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Perform classification on test vectors X.", - "docstring": "Perform classification on test vectors X.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n Test data.\n\nreturn_std : bool, default=False\n Whether to return the standard deviation of posterior prediction.\n All zeros in this case.\n\n .. versionadded:: 0.20\n\nReturns\n-------\ny : array-like of shape (n_samples,) or (n_samples, n_outputs)\n Predicted target values for X.\n\ny_std : array-like of shape (n_samples,) or (n_samples, n_outputs)\n Standard deviation of predictive distribution of query points.", + "docstring": "Perform classification on test vectors X.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Test data.\n\n return_std : bool, default=False\n Whether to return the standard deviation of posterior prediction.\n All zeros in this case.\n\n .. versionadded:: 0.20\n\n Returns\n -------\n y : array-like of shape (n_samples,) or (n_samples, n_outputs)\n Predicted target values for X.\n\n y_std : array-like of shape (n_samples,) or (n_samples, n_outputs)\n Standard deviation of predictive distribution of query points.\n ", "source_code": "\ndef predict(self, X, return_std=False):\n \"\"\"Perform classification on test vectors X.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Test data.\n\n return_std : bool, default=False\n Whether to return the standard deviation of posterior prediction.\n All zeros in this case.\n\n .. versionadded:: 0.20\n\n Returns\n -------\n y : array-like of shape (n_samples,) or (n_samples, n_outputs)\n Predicted target values for X.\n\n y_std : array-like of shape (n_samples,) or (n_samples, n_outputs)\n Standard deviation of predictive distribution of query points.\n \"\"\"\n check_is_fitted(self)\n n_samples = _num_samples(X)\n y = np.full((n_samples, self.n_outputs_), self.constant_, dtype=np.array(self.constant_).dtype)\n y_std = np.zeros((n_samples, self.n_outputs_))\n if self.n_outputs_ == 1:\n y = np.ravel(y)\n y_std = np.ravel(y_std)\n return (y, y_std) if return_std else y" }, { @@ -56310,7 +59182,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -56320,7 +59193,8 @@ "docstring": { "type": "None or array-like of shape (n_samples, n_features)", "description": "Test samples. 
Passing None as test samples gives the same result\nas passing real test samples, since `DummyRegressor`\noperates independently of the sampled observations." - } + }, + "refined_type": {} }, { "name": "y", @@ -56330,7 +59204,8 @@ "docstring": { "type": "array-like of shape (n_samples,) or (n_samples, n_outputs)", "description": "True values for X." - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -56340,13 +59215,14 @@ "docstring": { "type": "array-like of shape (n_samples,), default=None", "description": "Sample weights." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Return the coefficient of determination R^2 of the prediction.\n\nThe coefficient R^2 is defined as `(1 - u/v)`, where `u` is the residual sum of squares `((y_true - y_pred) ** 2).sum()` and `v` is the total sum of squares `((y_true - y_true.mean()) ** 2).sum()`. The best possible score is 1.0 and it can be negative (because the model can be arbitrarily worse). A constant model that always predicts the expected value of y, disregarding the input features, would get a R^2 score of 0.0.", - "docstring": "Return the coefficient of determination R^2 of the prediction.\n\nThe coefficient R^2 is defined as `(1 - u/v)`, where `u` is the\nresidual sum of squares `((y_true - y_pred) ** 2).sum()` and `v` is the\ntotal sum of squares `((y_true - y_true.mean()) ** 2).sum()`. The best\npossible score is 1.0 and it can be negative (because the model can be\narbitrarily worse). A constant model that always predicts the expected\nvalue of y, disregarding the input features, would get a R^2 score of\n0.0.\n\nParameters\n----------\nX : None or array-like of shape (n_samples, n_features)\n Test samples. Passing None as test samples gives the same result\n as passing real test samples, since `DummyRegressor`\n operates independently of the sampled observations.\n\ny : array-like of shape (n_samples,) or (n_samples, n_outputs)\n True values for X.\n\nsample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\nReturns\n-------\nscore : float\n R^2 of `self.predict(X)` wrt. y.", + "description": "Return the coefficient of determination R^2 of the prediction.\n\nThe coefficient R^2 is defined as `(1 - u/v)`, where `u` is the\nresidual sum of squares `((y_true - y_pred) ** 2).sum()` and `v` is the\ntotal sum of squares `((y_true - y_true.mean()) ** 2).sum()`. The best\npossible score is 1.0 and it can be negative (because the model can be\narbitrarily worse). A constant model that always predicts the expected\nvalue of y, disregarding the input features, would get a R^2 score of\n0.0.", + "docstring": "Return the coefficient of determination R^2 of the prediction.\n\n The coefficient R^2 is defined as `(1 - u/v)`, where `u` is the\n residual sum of squares `((y_true - y_pred) ** 2).sum()` and `v` is the\n total sum of squares `((y_true - y_true.mean()) ** 2).sum()`. The best\n possible score is 1.0 and it can be negative (because the model can be\n arbitrarily worse). A constant model that always predicts the expected\n value of y, disregarding the input features, would get a R^2 score of\n 0.0.\n\n Parameters\n ----------\n X : None or array-like of shape (n_samples, n_features)\n Test samples. 
Passing None as test samples gives the same result\n as passing real test samples, since `DummyRegressor`\n operates independently of the sampled observations.\n\n y : array-like of shape (n_samples,) or (n_samples, n_outputs)\n True values for X.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n Returns\n -------\n score : float\n R^2 of `self.predict(X)` wrt. y.\n ", "source_code": "\ndef score(self, X, y, sample_weight=None):\n \"\"\"Return the coefficient of determination R^2 of the prediction.\n\n The coefficient R^2 is defined as `(1 - u/v)`, where `u` is the\n residual sum of squares `((y_true - y_pred) ** 2).sum()` and `v` is the\n total sum of squares `((y_true - y_true.mean()) ** 2).sum()`. The best\n possible score is 1.0 and it can be negative (because the model can be\n arbitrarily worse). A constant model that always predicts the expected\n value of y, disregarding the input features, would get a R^2 score of\n 0.0.\n\n Parameters\n ----------\n X : None or array-like of shape (n_samples, n_features)\n Test samples. Passing None as test samples gives the same result\n as passing real test samples, since `DummyRegressor`\n operates independently of the sampled observations.\n\n y : array-like of shape (n_samples,) or (n_samples, n_outputs)\n True values for X.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n Returns\n -------\n score : float\n R^2 of `self.predict(X)` wrt. y.\n \"\"\"\n if X is None:\n X = np.zeros(shape=(len(y), 1))\n return super().score(X, y, sample_weight)" }, { @@ -56364,7 +59240,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "base_estimator", @@ -56374,7 +59251,8 @@ "docstring": { "type": "object, default=None", "description": "The base estimator to fit on random subsets of the dataset.\nIf None, then the base estimator is a\n:class:`~sklearn.tree.DecisionTreeClassifier`." - } + }, + "refined_type": {} }, { "name": "n_estimators", @@ -56384,7 +59262,8 @@ "docstring": { "type": "int, default=10", "description": "The number of base estimators in the ensemble." - } + }, + "refined_type": {} }, { "name": "max_samples", @@ -56394,7 +59273,8 @@ "docstring": { "type": "int or float, default=1.0", "description": "The number of samples to draw from X to train each base estimator (with\nreplacement by default, see `bootstrap` for more details).\n\n- If int, then draw `max_samples` samples.\n- If float, then draw `max_samples * X.shape[0]` samples." - } + }, + "refined_type": {} }, { "name": "max_features", @@ -56404,7 +59284,8 @@ "docstring": { "type": "int or float, default=1.0", "description": "The number of features to draw from X to train each base estimator (\nwithout replacement by default, see `bootstrap_features` for more\ndetails).\n\n- If int, then draw `max_features` features.\n- If float, then draw `max_features * X.shape[1]` features." - } + }, + "refined_type": {} }, { "name": "bootstrap", @@ -56414,7 +59295,8 @@ "docstring": { "type": "bool, default=True", "description": "Whether samples are drawn with replacement. If False, sampling\nwithout replacement is performed." - } + }, + "refined_type": {} }, { "name": "bootstrap_features", @@ -56424,7 +59306,8 @@ "docstring": { "type": "bool, default=False", "description": "Whether features are drawn with replacement." 
- } + }, + "refined_type": {} }, { "name": "oob_score", @@ -56434,7 +59317,8 @@ "docstring": { "type": "bool, default=False", "description": "Whether to use out-of-bag samples to estimate\nthe generalization error. Only available if bootstrap=True." - } + }, + "refined_type": {} }, { "name": "warm_start", @@ -56444,7 +59328,8 @@ "docstring": { "type": "bool, default=False", "description": "When set to True, reuse the solution of the previous call to fit\nand add more estimators to the ensemble, otherwise, just fit\na whole new ensemble. See :term:`the Glossary `.\n\n.. versionadded:: 0.17\n *warm_start* constructor parameter." - } + }, + "refined_type": {} }, { "name": "n_jobs", @@ -56454,7 +59339,8 @@ "docstring": { "type": "int, default=None", "description": "The number of jobs to run in parallel for both :meth:`fit` and\n:meth:`predict`. ``None`` means 1 unless in a\n:obj:`joblib.parallel_backend` context. ``-1`` means using all\nprocessors. See :term:`Glossary ` for more details." - } + }, + "refined_type": {} }, { "name": "random_state", @@ -56464,7 +59350,8 @@ "docstring": { "type": "int, RandomState instance or None, default=None", "description": "Controls the random resampling of the original dataset\n(sample wise and feature wise).\nIf the base estimator accepts a `random_state` attribute, a different\nseed is generated for each instance in the ensemble.\nPass an int for reproducible output across multiple function calls.\nSee :term:`Glossary `." - } + }, + "refined_type": {} }, { "name": "verbose", @@ -56474,13 +59361,14 @@ "docstring": { "type": "int, default=0", "description": "Controls the verbosity when fitting and predicting." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, base_estimator=None, n_estimators=10, *, max_samples=1.0, max_features=1.0, bootstrap=True, bootstrap_features=False, oob_score=False, warm_start=False, n_jobs=None, random_state=None, verbose=0):\n super().__init__(base_estimator, n_estimators=n_estimators, max_samples=max_samples, max_features=max_features, bootstrap=bootstrap, bootstrap_features=bootstrap_features, oob_score=oob_score, warm_start=warm_start, n_jobs=n_jobs, random_state=random_state, verbose=verbose)" }, { @@ -56498,7 +59386,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -56508,7 +59397,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -56518,13 +59408,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _set_oob_score(self, X, y):\n n_samples = y.shape[0]\n n_classes_ = self.n_classes_\n predictions = np.zeros((n_samples, n_classes_))\n for (estimator, samples, features) in zip(self.estimators_, self.estimators_samples_, self.estimators_features_):\n mask = ~indices_to_mask(samples, n_samples)\n if hasattr(estimator, 'predict_proba'):\n predictions[mask, :] += estimator.predict_proba(X[mask, :][:, features])\n else:\n p = estimator.predict(X[mask, :][:, features])\n j = 0\n for i in range(n_samples):\n if mask[i]:\n predictions[i, p[j]] += 1\n j += 1\n if (predictions.sum(axis=1) == 0).any():\n warn('Some inputs do not have OOB scores. 
This probably means too few estimators were used to compute any reliable oob estimates.')\n oob_decision_function = predictions / predictions.sum(axis=1)[:, np.newaxis]\n oob_score = accuracy_score(y, np.argmax(predictions, axis=1))\n self.oob_decision_function_ = oob_decision_function\n self.oob_score_ = oob_score" }, { @@ -56542,7 +59433,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -56566,7 +59458,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -56576,13 +59469,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _validate_y(self, y):\n y = column_or_1d(y, warn=True)\n check_classification_targets(y)\n (self.classes_, y) = np.unique(y, return_inverse=True)\n self.n_classes_ = len(self.classes_)\n return y" }, { @@ -56600,7 +59494,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -56610,13 +59505,17 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "The training input samples. Sparse matrices are accepted only if\nthey are supported by the base estimator." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } } ], "results": [], "is_public": true, "description": "Average of the decision functions of the base classifiers.", - "docstring": "Average of the decision functions of the base classifiers.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n The training input samples. Sparse matrices are accepted only if\n they are supported by the base estimator.\n\nReturns\n-------\nscore : ndarray of shape (n_samples, k)\n The decision function of the input samples. The columns correspond\n to the classes in sorted order, as they appear in the attribute\n ``classes_``. Regression and binary classification are special\n cases with ``k == 1``, otherwise ``k==n_classes``.", + "docstring": "Average of the decision functions of the base classifiers.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The training input samples. Sparse matrices are accepted only if\n they are supported by the base estimator.\n\n Returns\n -------\n score : ndarray of shape (n_samples, k)\n The decision function of the input samples. The columns correspond\n to the classes in sorted order, as they appear in the attribute\n ``classes_``. Regression and binary classification are special\n cases with ``k == 1``, otherwise ``k==n_classes``.\n ", "source_code": "\n@if_delegate_has_method(delegate='base_estimator')\ndef decision_function(self, X):\n \"\"\"Average of the decision functions of the base classifiers.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The training input samples. Sparse matrices are accepted only if\n they are supported by the base estimator.\n\n Returns\n -------\n score : ndarray of shape (n_samples, k)\n The decision function of the input samples. The columns correspond\n to the classes in sorted order, as they appear in the attribute\n ``classes_``. 
Regression and binary classification are special\n cases with ``k == 1``, otherwise ``k==n_classes``.\n \"\"\"\n check_is_fitted(self)\n X = self._validate_data(X, accept_sparse=['csr', 'csc'], dtype=None, force_all_finite=False, reset=False)\n (n_jobs, n_estimators, starts) = _partition_estimators(self.n_estimators, self.n_jobs)\n all_decisions = Parallel(n_jobs=n_jobs, verbose=self.verbose)((delayed(_parallel_decision_function)(self.estimators_[starts[i]:starts[i + 1]], self.estimators_features_[starts[i]:starts[i + 1]], X) for i in range(n_jobs)))\n decisions = sum(all_decisions) / self.n_estimators\n return decisions" }, { @@ -56634,7 +59533,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -56644,13 +59544,17 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "The training input samples. Sparse matrices are accepted only if\nthey are supported by the base estimator." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } } ], "results": [], "is_public": true, - "description": "Predict class for X.\n\nThe predicted class of an input sample is computed as the class with the highest mean predicted probability. If base estimators do not implement a ``predict_proba`` method, then it resorts to voting.", - "docstring": "Predict class for X.\n\nThe predicted class of an input sample is computed as the class with\nthe highest mean predicted probability. If base estimators do not\nimplement a ``predict_proba`` method, then it resorts to voting.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n The training input samples. Sparse matrices are accepted only if\n they are supported by the base estimator.\n\nReturns\n-------\ny : ndarray of shape (n_samples,)\n The predicted classes.", + "description": "Predict class for X.\n\nThe predicted class of an input sample is computed as the class with\nthe highest mean predicted probability. If base estimators do not\nimplement a ``predict_proba`` method, then it resorts to voting.", + "docstring": "Predict class for X.\n\n The predicted class of an input sample is computed as the class with\n the highest mean predicted probability. If base estimators do not\n implement a ``predict_proba`` method, then it resorts to voting.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The training input samples. Sparse matrices are accepted only if\n they are supported by the base estimator.\n\n Returns\n -------\n y : ndarray of shape (n_samples,)\n The predicted classes.\n ", "source_code": "\ndef predict(self, X):\n \"\"\"Predict class for X.\n\n The predicted class of an input sample is computed as the class with\n the highest mean predicted probability. If base estimators do not\n implement a ``predict_proba`` method, then it resorts to voting.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The training input samples. 
Sparse matrices are accepted only if\n they are supported by the base estimator.\n\n Returns\n -------\n y : ndarray of shape (n_samples,)\n The predicted classes.\n \"\"\"\n predicted_probabilitiy = self.predict_proba(X)\n return self.classes_.take(np.argmax(predicted_probabilitiy, axis=1), axis=0)" }, { @@ -56668,7 +59572,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -56678,13 +59583,17 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "The training input samples. Sparse matrices are accepted only if\nthey are supported by the base estimator." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } } ], "results": [], "is_public": true, - "description": "Predict class log-probabilities for X.\n\nThe predicted class log-probabilities of an input sample is computed as the log of the mean predicted class probabilities of the base estimators in the ensemble.", - "docstring": "Predict class log-probabilities for X.\n\nThe predicted class log-probabilities of an input sample is computed as\nthe log of the mean predicted class probabilities of the base\nestimators in the ensemble.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n The training input samples. Sparse matrices are accepted only if\n they are supported by the base estimator.\n\nReturns\n-------\np : ndarray of shape (n_samples, n_classes)\n The class log-probabilities of the input samples. The order of the\n classes corresponds to that in the attribute :term:`classes_`.", + "description": "Predict class log-probabilities for X.\n\nThe predicted class log-probabilities of an input sample is computed as\nthe log of the mean predicted class probabilities of the base\nestimators in the ensemble.", + "docstring": "Predict class log-probabilities for X.\n\n The predicted class log-probabilities of an input sample is computed as\n the log of the mean predicted class probabilities of the base\n estimators in the ensemble.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The training input samples. Sparse matrices are accepted only if\n they are supported by the base estimator.\n\n Returns\n -------\n p : ndarray of shape (n_samples, n_classes)\n The class log-probabilities of the input samples. The order of the\n classes corresponds to that in the attribute :term:`classes_`.\n ", "source_code": "\ndef predict_log_proba(self, X):\n \"\"\"Predict class log-probabilities for X.\n\n The predicted class log-probabilities of an input sample is computed as\n the log of the mean predicted class probabilities of the base\n estimators in the ensemble.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The training input samples. Sparse matrices are accepted only if\n they are supported by the base estimator.\n\n Returns\n -------\n p : ndarray of shape (n_samples, n_classes)\n The class log-probabilities of the input samples. 
The order of the\n classes corresponds to that in the attribute :term:`classes_`.\n \"\"\"\n check_is_fitted(self)\n if hasattr(self.base_estimator_, 'predict_log_proba'):\n X = self._validate_data(X, accept_sparse=['csr', 'csc'], dtype=None, force_all_finite=False, reset=False)\n (n_jobs, n_estimators, starts) = _partition_estimators(self.n_estimators, self.n_jobs)\n all_log_proba = Parallel(n_jobs=n_jobs, verbose=self.verbose)((delayed(_parallel_predict_log_proba)(self.estimators_[starts[i]:starts[i + 1]], self.estimators_features_[starts[i]:starts[i + 1]], X, self.n_classes_) for i in range(n_jobs)))\n log_proba = all_log_proba[0]\n for j in range(1, len(all_log_proba)):\n log_proba = np.logaddexp(log_proba, all_log_proba[j])\n log_proba -= np.log(self.n_estimators)\n return log_proba\n else:\n return np.log(self.predict_proba(X))" }, { @@ -56702,7 +59611,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -56712,13 +59622,17 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "The training input samples. Sparse matrices are accepted only if\nthey are supported by the base estimator." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } } ], "results": [], "is_public": true, - "description": "Predict class probabilities for X.\n\nThe predicted class probabilities of an input sample is computed as the mean predicted class probabilities of the base estimators in the ensemble. If base estimators do not implement a ``predict_proba`` method, then it resorts to voting and the predicted class probabilities of an input sample represents the proportion of estimators predicting each class.", - "docstring": "Predict class probabilities for X.\n\nThe predicted class probabilities of an input sample is computed as\nthe mean predicted class probabilities of the base estimators in the\nensemble. If base estimators do not implement a ``predict_proba``\nmethod, then it resorts to voting and the predicted class probabilities\nof an input sample represents the proportion of estimators predicting\neach class.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n The training input samples. Sparse matrices are accepted only if\n they are supported by the base estimator.\n\nReturns\n-------\np : ndarray of shape (n_samples, n_classes)\n The class probabilities of the input samples. The order of the\n classes corresponds to that in the attribute :term:`classes_`.", + "description": "Predict class probabilities for X.\n\nThe predicted class probabilities of an input sample is computed as\nthe mean predicted class probabilities of the base estimators in the\nensemble. If base estimators do not implement a ``predict_proba``\nmethod, then it resorts to voting and the predicted class probabilities\nof an input sample represents the proportion of estimators predicting\neach class.", + "docstring": "Predict class probabilities for X.\n\n The predicted class probabilities of an input sample is computed as\n the mean predicted class probabilities of the base estimators in the\n ensemble. If base estimators do not implement a ``predict_proba``\n method, then it resorts to voting and the predicted class probabilities\n of an input sample represents the proportion of estimators predicting\n each class.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The training input samples. 
Sparse matrices are accepted only if\n they are supported by the base estimator.\n\n Returns\n -------\n p : ndarray of shape (n_samples, n_classes)\n The class probabilities of the input samples. The order of the\n classes corresponds to that in the attribute :term:`classes_`.\n ", "source_code": "\ndef predict_proba(self, X):\n \"\"\"Predict class probabilities for X.\n\n The predicted class probabilities of an input sample is computed as\n the mean predicted class probabilities of the base estimators in the\n ensemble. If base estimators do not implement a ``predict_proba``\n method, then it resorts to voting and the predicted class probabilities\n of an input sample represents the proportion of estimators predicting\n each class.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The training input samples. Sparse matrices are accepted only if\n they are supported by the base estimator.\n\n Returns\n -------\n p : ndarray of shape (n_samples, n_classes)\n The class probabilities of the input samples. The order of the\n classes corresponds to that in the attribute :term:`classes_`.\n \"\"\"\n check_is_fitted(self)\n X = self._validate_data(X, accept_sparse=['csr', 'csc'], dtype=None, force_all_finite=False, reset=False)\n (n_jobs, n_estimators, starts) = _partition_estimators(self.n_estimators, self.n_jobs)\n all_proba = Parallel(n_jobs=n_jobs, verbose=self.verbose, **self._parallel_args())((delayed(_parallel_predict_proba)(self.estimators_[starts[i]:starts[i + 1]], self.estimators_features_[starts[i]:starts[i + 1]], X, self.n_classes_) for i in range(n_jobs)))\n proba = sum(all_proba) / self.n_estimators\n return proba" }, { @@ -56736,7 +59650,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "base_estimator", @@ -56746,7 +59661,8 @@ "docstring": { "type": "object, default=None", "description": "The base estimator to fit on random subsets of the dataset.\nIf None, then the base estimator is a\n:class:`~sklearn.tree.DecisionTreeRegressor`." - } + }, + "refined_type": {} }, { "name": "n_estimators", @@ -56756,7 +59672,8 @@ "docstring": { "type": "int, default=10", "description": "The number of base estimators in the ensemble." - } + }, + "refined_type": {} }, { "name": "max_samples", @@ -56766,7 +59683,8 @@ "docstring": { "type": "int or float, default=1.0", "description": "The number of samples to draw from X to train each base estimator (with\nreplacement by default, see `bootstrap` for more details).\n\n- If int, then draw `max_samples` samples.\n- If float, then draw `max_samples * X.shape[0]` samples." - } + }, + "refined_type": {} }, { "name": "max_features", @@ -56776,7 +59694,8 @@ "docstring": { "type": "int or float, default=1.0", "description": "The number of features to draw from X to train each base estimator (\nwithout replacement by default, see `bootstrap_features` for more\ndetails).\n\n- If int, then draw `max_features` features.\n- If float, then draw `max_features * X.shape[1]` features." - } + }, + "refined_type": {} }, { "name": "bootstrap", @@ -56786,7 +59705,8 @@ "docstring": { "type": "bool, default=True", "description": "Whether samples are drawn with replacement. If False, sampling\nwithout replacement is performed." - } + }, + "refined_type": {} }, { "name": "bootstrap_features", @@ -56796,7 +59716,8 @@ "docstring": { "type": "bool, default=False", "description": "Whether features are drawn with replacement." 
- } + }, + "refined_type": {} }, { "name": "oob_score", @@ -56806,7 +59727,8 @@ "docstring": { "type": "bool, default=False", "description": "Whether to use out-of-bag samples to estimate\nthe generalization error. Only available if bootstrap=True." - } + }, + "refined_type": {} }, { "name": "warm_start", @@ -56816,7 +59738,8 @@ "docstring": { "type": "bool, default=False", "description": "When set to True, reuse the solution of the previous call to fit\nand add more estimators to the ensemble, otherwise, just fit\na whole new ensemble. See :term:`the Glossary `." - } + }, + "refined_type": {} }, { "name": "n_jobs", @@ -56826,7 +59749,8 @@ "docstring": { "type": "int, default=None", "description": "The number of jobs to run in parallel for both :meth:`fit` and\n:meth:`predict`. ``None`` means 1 unless in a\n:obj:`joblib.parallel_backend` context. ``-1`` means using all\nprocessors. See :term:`Glossary ` for more details." - } + }, + "refined_type": {} }, { "name": "random_state", @@ -56836,7 +59760,8 @@ "docstring": { "type": "int, RandomState instance or None, default=None", "description": "Controls the random resampling of the original dataset\n(sample wise and feature wise).\nIf the base estimator accepts a `random_state` attribute, a different\nseed is generated for each instance in the ensemble.\nPass an int for reproducible output across multiple function calls.\nSee :term:`Glossary `." - } + }, + "refined_type": {} }, { "name": "verbose", @@ -56846,13 +59771,14 @@ "docstring": { "type": "int, default=0", "description": "Controls the verbosity when fitting and predicting." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, base_estimator=None, n_estimators=10, *, max_samples=1.0, max_features=1.0, bootstrap=True, bootstrap_features=False, oob_score=False, warm_start=False, n_jobs=None, random_state=None, verbose=0):\n super().__init__(base_estimator, n_estimators=n_estimators, max_samples=max_samples, max_features=max_features, bootstrap=bootstrap, bootstrap_features=bootstrap_features, oob_score=oob_score, warm_start=warm_start, n_jobs=n_jobs, random_state=random_state, verbose=verbose)" }, { @@ -56870,7 +59796,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -56880,7 +59807,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -56890,13 +59818,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _set_oob_score(self, X, y):\n n_samples = y.shape[0]\n predictions = np.zeros((n_samples, ))\n n_predictions = np.zeros((n_samples, ))\n for (estimator, samples, features) in zip(self.estimators_, self.estimators_samples_, self.estimators_features_):\n mask = ~indices_to_mask(samples, n_samples)\n predictions[mask] += estimator.predict(X[mask, :][:, features])\n n_predictions[mask] += 1\n if (n_predictions == 0).any():\n warn('Some inputs do not have OOB scores. 
This probably means too few estimators were used to compute any reliable oob estimates.')\n n_predictions[n_predictions == 0] = 1\n predictions /= n_predictions\n self.oob_prediction_ = predictions\n self.oob_score_ = r2_score(y, predictions)" }, { @@ -56914,7 +59843,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -56938,7 +59868,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -56948,13 +59879,17 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "The training input samples. Sparse matrices are accepted only if\nthey are supported by the base estimator." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } } ], "results": [], "is_public": true, - "description": "Predict regression target for X.\n\nThe predicted regression target of an input sample is computed as the mean predicted regression targets of the estimators in the ensemble.", - "docstring": "Predict regression target for X.\n\nThe predicted regression target of an input sample is computed as the\nmean predicted regression targets of the estimators in the ensemble.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n The training input samples. Sparse matrices are accepted only if\n they are supported by the base estimator.\n\nReturns\n-------\ny : ndarray of shape (n_samples,)\n The predicted values.", + "description": "Predict regression target for X.\n\nThe predicted regression target of an input sample is computed as the\nmean predicted regression targets of the estimators in the ensemble.", + "docstring": "Predict regression target for X.\n\n The predicted regression target of an input sample is computed as the\n mean predicted regression targets of the estimators in the ensemble.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The training input samples. Sparse matrices are accepted only if\n they are supported by the base estimator.\n\n Returns\n -------\n y : ndarray of shape (n_samples,)\n The predicted values.\n ", "source_code": "\ndef predict(self, X):\n \"\"\"Predict regression target for X.\n\n The predicted regression target of an input sample is computed as the\n mean predicted regression targets of the estimators in the ensemble.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The training input samples. 
Sparse matrices are accepted only if\n they are supported by the base estimator.\n\n Returns\n -------\n y : ndarray of shape (n_samples,)\n The predicted values.\n \"\"\"\n check_is_fitted(self)\n X = self._validate_data(X, accept_sparse=['csr', 'csc'], dtype=None, force_all_finite=False, reset=False)\n (n_jobs, n_estimators, starts) = _partition_estimators(self.n_estimators, self.n_jobs)\n all_y_hat = Parallel(n_jobs=n_jobs, verbose=self.verbose)((delayed(_parallel_predict_regression)(self.estimators_[starts[i]:starts[i + 1]], self.estimators_features_[starts[i]:starts[i + 1]], X) for i in range(n_jobs)))\n y_hat = sum(all_y_hat) / self.n_estimators\n return y_hat" }, { @@ -56972,7 +59907,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "base_estimator", @@ -56982,7 +59918,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_estimators", @@ -56992,7 +59929,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "max_samples", @@ -57002,7 +59940,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "max_features", @@ -57012,7 +59951,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "bootstrap", @@ -57022,7 +59962,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "bootstrap_features", @@ -57032,7 +59973,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "oob_score", @@ -57042,7 +59984,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "warm_start", @@ -57052,7 +59995,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_jobs", @@ -57062,7 +60006,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "random_state", @@ -57072,7 +60017,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "verbose", @@ -57082,13 +60028,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@abstractmethod\ndef __init__(self, base_estimator=None, n_estimators=10, *, max_samples=1.0, max_features=1.0, bootstrap=True, bootstrap_features=False, oob_score=False, warm_start=False, n_jobs=None, random_state=None, verbose=0):\n super().__init__(base_estimator=base_estimator, n_estimators=n_estimators)\n self.max_samples = max_samples\n self.max_features = max_features\n self.bootstrap = bootstrap\n self.bootstrap_features = bootstrap_features\n self.oob_score = oob_score\n self.warm_start = warm_start\n self.n_jobs = n_jobs\n self.random_state = random_state\n self.verbose = verbose" }, { @@ -57106,7 +60053,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -57116,6 +60064,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "The training input samples. Sparse matrices are accepted only if\nthey are supported by the base estimator." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -57126,7 +60078,8 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "The target values (class labels in classification, real numbers in\nregression)." 
- } + }, + "refined_type": {} }, { "name": "max_samples", @@ -57136,7 +60089,8 @@ "docstring": { "type": "int or float, default=None", "description": "Argument to use instead of self.max_samples." - } + }, + "refined_type": {} }, { "name": "max_depth", @@ -57146,7 +60100,8 @@ "docstring": { "type": "int, default=None", "description": "Override value used when constructing base estimator. Only\nsupported if the base estimator has a max_depth parameter." - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -57156,13 +60111,14 @@ "docstring": { "type": "array-like of shape (n_samples,), default=None", "description": "Sample weights. If None, then samples are equally weighted.\nNote that this is supported only if the base estimator supports\nsample weighting." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Build a Bagging ensemble of estimators from the training set (X, y).", - "docstring": "Build a Bagging ensemble of estimators from the training\n set (X, y).\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n The training input samples. Sparse matrices are accepted only if\n they are supported by the base estimator.\n\ny : array-like of shape (n_samples,)\n The target values (class labels in classification, real numbers in\n regression).\n\nmax_samples : int or float, default=None\n Argument to use instead of self.max_samples.\n\nmax_depth : int, default=None\n Override value used when constructing base estimator. Only\n supported if the base estimator has a max_depth parameter.\n\nsample_weight : array-like of shape (n_samples,), default=None\n Sample weights. If None, then samples are equally weighted.\n Note that this is supported only if the base estimator supports\n sample weighting.\n\nReturns\n-------\nself : object\n Fitted estimator.", + "description": "Build a Bagging ensemble of estimators from the training\n set (X, y).", + "docstring": "Build a Bagging ensemble of estimators from the training\n set (X, y).\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The training input samples. Sparse matrices are accepted only if\n they are supported by the base estimator.\n\n y : array-like of shape (n_samples,)\n The target values (class labels in classification, real numbers in\n regression).\n\n max_samples : int or float, default=None\n Argument to use instead of self.max_samples.\n\n max_depth : int, default=None\n Override value used when constructing base estimator. Only\n supported if the base estimator has a max_depth parameter.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights. If None, then samples are equally weighted.\n Note that this is supported only if the base estimator supports\n sample weighting.\n\n Returns\n -------\n self : object\n Fitted estimator.\n ", "source_code": "\ndef _fit(self, X, y, max_samples=None, max_depth=None, sample_weight=None):\n \"\"\"Build a Bagging ensemble of estimators from the training\n set (X, y).\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The training input samples. 
Sparse matrices are accepted only if\n they are supported by the base estimator.\n\n y : array-like of shape (n_samples,)\n The target values (class labels in classification, real numbers in\n regression).\n\n max_samples : int or float, default=None\n Argument to use instead of self.max_samples.\n\n max_depth : int, default=None\n Override value used when constructing base estimator. Only\n supported if the base estimator has a max_depth parameter.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights. If None, then samples are equally weighted.\n Note that this is supported only if the base estimator supports\n sample weighting.\n\n Returns\n -------\n self : object\n Fitted estimator.\n \"\"\"\n random_state = check_random_state(self.random_state)\n if sample_weight is not None:\n sample_weight = _check_sample_weight(sample_weight, X, dtype=None)\n n_samples = X.shape[0]\n self._n_samples = n_samples\n y = self._validate_y(y)\n self._validate_estimator()\n if max_depth is not None:\n self.base_estimator_.max_depth = max_depth\n if max_samples is None:\n max_samples = self.max_samples\n elif not isinstance(max_samples, numbers.Integral):\n max_samples = int(max_samples * X.shape[0])\n if not 0 < max_samples <= X.shape[0]:\n raise ValueError('max_samples must be in (0, n_samples]')\n self._max_samples = max_samples\n if isinstance(self.max_features, numbers.Integral):\n max_features = self.max_features\n elif isinstance(self.max_features, float):\n max_features = self.max_features * self.n_features_in_\n else:\n raise ValueError('max_features must be int or float')\n if not 0 < max_features <= self.n_features_in_:\n raise ValueError('max_features must be in (0, n_features]')\n max_features = max(1, int(max_features))\n self._max_features = max_features\n if not self.bootstrap and self.oob_score:\n raise ValueError('Out of bag estimation only available if bootstrap=True')\n if self.warm_start and self.oob_score:\n raise ValueError('Out of bag estimate only available if warm_start=False')\n if hasattr(self, 'oob_score_') and self.warm_start:\n del self.oob_score_\n if not self.warm_start or not hasattr(self, 'estimators_'):\n self.estimators_ = []\n self.estimators_features_ = []\n n_more_estimators = self.n_estimators - len(self.estimators_)\n if n_more_estimators < 0:\n raise ValueError('n_estimators=%d must be larger or equal to len(estimators_)=%d when warm_start==True' % (self.n_estimators, len(self.estimators_)))\n elif n_more_estimators == 0:\n warn('Warm-start fitting without increasing n_estimators does not fit new trees.')\n return self\n (n_jobs, n_estimators, starts) = _partition_estimators(n_more_estimators, self.n_jobs)\n total_n_estimators = sum(n_estimators)\n if self.warm_start and len(self.estimators_) > 0:\n random_state.randint(MAX_INT, size=len(self.estimators_))\n seeds = random_state.randint(MAX_INT, size=n_more_estimators)\n self._seeds = seeds\n all_results = Parallel(n_jobs=n_jobs, verbose=self.verbose, **self._parallel_args())((delayed(_parallel_build_estimators)(n_estimators[i], self, X, y, sample_weight, seeds[starts[i]:starts[i + 1]], total_n_estimators, verbose=self.verbose) for i in range(n_jobs)))\n self.estimators_ += list(itertools.chain.from_iterable((t[0] for t in all_results)))\n self.estimators_features_ += list(itertools.chain.from_iterable((t[1] for t in all_results)))\n if self.oob_score:\n self._set_oob_score(X, y)\n return self" }, { @@ -57180,13 +60136,14 @@ "docstring": { "type": "", "description": "" - } + }, + 
"refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _get_estimators_indices(self):\n for seed in self._seeds:\n (feature_indices, sample_indices) = _generate_bagging_indices(seed, self.bootstrap_features, self.bootstrap, self.n_features_in_, self._n_samples, self._max_features, self._max_samples)\n yield (feature_indices, sample_indices)" }, { @@ -57204,13 +60161,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _parallel_args(self):\n return {}" }, { @@ -57228,7 +60186,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -57238,7 +60197,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -57248,7 +60208,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -57272,7 +60233,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -57282,13 +60244,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _validate_y(self, y):\n if len(y.shape) == 1 or y.shape[1] == 1:\n return column_or_1d(y, warn=True)\n else:\n return y" }, { @@ -57306,13 +60269,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "The subset of drawn samples for each base estimator.\n\nReturns a dynamically generated list of indices identifying the samples used for fitting each member of the ensemble, i.e., the in-bag samples. Note: the list is re-created at each call to the property in order to reduce the object memory footprint by not storing the sampling data. Thus fetching the property may be slower than expected.", - "docstring": "The subset of drawn samples for each base estimator.\n\nReturns a dynamically generated list of indices identifying\nthe samples used for fitting each member of the ensemble, i.e.,\nthe in-bag samples.\n\nNote: the list is re-created at each call to the property in order\nto reduce the object memory footprint by not storing the sampling\ndata. Thus fetching the property may be slower than expected.", + "description": "The subset of drawn samples for each base estimator.\n\nReturns a dynamically generated list of indices identifying\nthe samples used for fitting each member of the ensemble, i.e.,\nthe in-bag samples.\n\nNote: the list is re-created at each call to the property in order\nto reduce the object memory footprint by not storing the sampling\ndata. Thus fetching the property may be slower than expected.", + "docstring": "\n The subset of drawn samples for each base estimator.\n\n Returns a dynamically generated list of indices identifying\n the samples used for fitting each member of the ensemble, i.e.,\n the in-bag samples.\n\n Note: the list is re-created at each call to the property in order\n to reduce the object memory footprint by not storing the sampling\n data. 
Thus fetching the property may be slower than expected.\n ", "source_code": "\n@property\ndef estimators_samples_(self):\n \"\"\"\n The subset of drawn samples for each base estimator.\n\n Returns a dynamically generated list of indices identifying\n the samples used for fitting each member of the ensemble, i.e.,\n the in-bag samples.\n\n Note: the list is re-created at each call to the property in order\n to reduce the object memory footprint by not storing the sampling\n data. Thus fetching the property may be slower than expected.\n \"\"\"\n return [sample_indices for (_, sample_indices) in self._get_estimators_indices()]" }, { @@ -57330,7 +60294,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -57340,6 +60305,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "The training input samples. Sparse matrices are accepted only if\nthey are supported by the base estimator." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -57350,7 +60319,8 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "The target values (class labels in classification, real numbers in\nregression)." - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -57360,13 +60330,14 @@ "docstring": { "type": "array-like of shape (n_samples,), default=None", "description": "Sample weights. If None, then samples are equally weighted.\nNote that this is supported only if the base estimator supports\nsample weighting." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Build a Bagging ensemble of estimators from the training set (X, y).", - "docstring": "Build a Bagging ensemble of estimators from the training set (X, y).\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n The training input samples. Sparse matrices are accepted only if\n they are supported by the base estimator.\n\ny : array-like of shape (n_samples,)\n The target values (class labels in classification, real numbers in\n regression).\n\nsample_weight : array-like of shape (n_samples,), default=None\n Sample weights. If None, then samples are equally weighted.\n Note that this is supported only if the base estimator supports\n sample weighting.\n\nReturns\n-------\nself : object\n Fitted estimator.", + "docstring": "Build a Bagging ensemble of estimators from the training set (X, y).\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The training input samples. Sparse matrices are accepted only if\n they are supported by the base estimator.\n\n y : array-like of shape (n_samples,)\n The target values (class labels in classification, real numbers in\n regression).\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights. If None, then samples are equally weighted.\n Note that this is supported only if the base estimator supports\n sample weighting.\n\n Returns\n -------\n self : object\n Fitted estimator.\n ", "source_code": "\ndef fit(self, X, y, sample_weight=None):\n \"\"\"Build a Bagging ensemble of estimators from the training set (X, y).\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The training input samples. 
Sparse matrices are accepted only if\n they are supported by the base estimator.\n\n y : array-like of shape (n_samples,)\n The target values (class labels in classification, real numbers in\n regression).\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights. If None, then samples are equally weighted.\n Note that this is supported only if the base estimator supports\n sample weighting.\n\n Returns\n -------\n self : object\n Fitted estimator.\n \"\"\"\n (X, y) = self._validate_data(X, y, accept_sparse=['csr', 'csc'], dtype=None, force_all_finite=False, multi_output=True)\n return self._fit(X, y, self.max_samples, sample_weight=sample_weight)" }, { @@ -57387,13 +60358,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@deprecated('Attribute `n_features_` was deprecated in version 1.0 and will be removed in 1.2. Use `n_features_in_` instead.')\n@property\ndef n_features_(self):\n return self.n_features_in_" }, { @@ -57411,7 +60383,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "bootstrap_features", @@ -57421,7 +60394,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "bootstrap_samples", @@ -57431,7 +60405,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_features", @@ -57441,7 +60416,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_samples", @@ -57451,7 +60427,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "max_features", @@ -57461,7 +60438,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "max_samples", @@ -57471,7 +60449,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -57495,7 +60474,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "bootstrap", @@ -57505,7 +60485,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_population", @@ -57515,7 +60496,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_samples", @@ -57525,7 +60507,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -57549,7 +60532,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "ensemble", @@ -57559,7 +60543,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -57569,7 +60554,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -57579,7 +60565,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -57589,7 +60576,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "seeds", @@ -57599,7 +60587,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "total_n_estimators", @@ -57609,7 +60598,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "verbose", @@ -57619,7 +60609,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -57643,7 +60634,8 @@ "docstring": { "type": "", "description": "" - } + }, 
+ "refined_type": {} }, { "name": "estimators_features", @@ -57653,7 +60645,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -57663,7 +60656,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -57687,7 +60681,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "estimators_features", @@ -57697,7 +60692,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -57707,7 +60703,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_classes", @@ -57717,7 +60714,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -57741,7 +60739,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "estimators_features", @@ -57751,7 +60750,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -57761,7 +60761,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_classes", @@ -57771,7 +60772,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -57795,7 +60797,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "estimators_features", @@ -57805,7 +60808,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -57815,7 +60819,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -57839,7 +60844,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "index", @@ -57849,7 +60855,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -57873,7 +60880,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "base_estimator", @@ -57883,7 +60891,8 @@ "docstring": { "type": "object", "description": "The base estimator from which the ensemble is built." - } + }, + "refined_type": {} }, { "name": "n_estimators", @@ -57893,7 +60902,8 @@ "docstring": { "type": "int, default=10", "description": "The number of estimators in the ensemble." - } + }, + "refined_type": {} }, { "name": "estimator_params", @@ -57903,13 +60913,14 @@ "docstring": { "type": "list of str, default=tuple()", "description": "The list of attributes to use as parameters when instantiating a\nnew base estimator. If none are given, default parameters are used." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@abstractmethod\ndef __init__(self, base_estimator, *, n_estimators=10, estimator_params=tuple()):\n self.base_estimator = base_estimator\n self.n_estimators = n_estimators\n self.estimator_params = estimator_params" }, { @@ -57927,7 +60938,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -57951,7 +60963,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -57975,7 +60988,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "append", @@ -57985,7 +60999,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "random_state", @@ -57995,13 +61010,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Make and configure a copy of the `base_estimator_` attribute.\n\nWarning: This method should be used to properly instantiate new sub-estimators.", - "docstring": "Make and configure a copy of the `base_estimator_` attribute.\n\nWarning: This method should be used to properly instantiate new\nsub-estimators.", + "description": "Make and configure a copy of the `base_estimator_` attribute.\n\nWarning: This method should be used to properly instantiate new\nsub-estimators.", + "docstring": "Make and configure a copy of the `base_estimator_` attribute.\n\n Warning: This method should be used to properly instantiate new\n sub-estimators.\n ", "source_code": "\ndef _make_estimator(self, append=True, random_state=None):\n \"\"\"Make and configure a copy of the `base_estimator_` attribute.\n\n Warning: This method should be used to properly instantiate new\n sub-estimators.\n \"\"\"\n estimator = clone(self.base_estimator_)\n estimator.set_params(**{p: getattr(self, p) for p in self.estimator_params})\n if isinstance(estimator, (DecisionTreeRegressor, ExtraTreeRegressor)):\n if getattr(estimator, 'criterion', None) == 'mse':\n estimator.set_params(criterion='squared_error')\n elif getattr(estimator, 'criterion', None) == 'mae':\n estimator.set_params(criterion='absolute_error')\n if random_state is not None:\n _set_random_states(estimator, random_state)\n if append:\n self.estimators_.append(estimator)\n return estimator" }, { @@ -58019,7 +61035,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "default", @@ -58029,13 +61046,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Check the estimator and the n_estimator attribute.\n\nSets the base_estimator_` attributes.", - "docstring": "Check the estimator and the n_estimator attribute.\n\nSets the base_estimator_` attributes.", + "docstring": "Check the estimator and the n_estimator attribute.\n\n Sets the base_estimator_` attributes.\n ", "source_code": "\ndef _validate_estimator(self, default=None):\n \"\"\"Check the estimator and the n_estimator attribute.\n\n Sets the base_estimator_` attributes.\n \"\"\"\n if not isinstance(self.n_estimators, numbers.Integral):\n raise ValueError('n_estimators must be an integer, got {0}.'.format(type(self.n_estimators)))\n if self.n_estimators <= 0:\n raise ValueError('n_estimators must be greater than zero, got {0}.'.format(self.n_estimators))\n if self.base_estimator is not None:\n 
self.base_estimator_ = self.base_estimator\n else:\n self.base_estimator_ = default\n if self.base_estimator_ is None:\n raise ValueError('base_estimator cannot be None')" }, { @@ -58053,7 +61071,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "estimators", @@ -58063,13 +61082,14 @@ "docstring": { "type": "list of (str, estimator) tuples", "description": "The ensemble of estimators to use in the ensemble. Each element of the\nlist is defined as a tuple of string (i.e. name of the estimator) and\nan estimator instance. An estimator can be set to `'drop'` using\n`set_params`." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@abstractmethod\ndef __init__(self, estimators):\n self.estimators = estimators" }, { @@ -58087,13 +61107,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _validate_estimators(self):\n if self.estimators is None or len(self.estimators) == 0:\n raise ValueError(\"Invalid 'estimators' attribute, 'estimators' should be a list of (string, estimator) tuples.\")\n (names, estimators) = zip(*self.estimators)\n self._validate_names(names)\n has_estimator = any((est != 'drop' for est in estimators))\n if not has_estimator:\n raise ValueError('All estimators are dropped. At least one is required to be an estimator.')\n is_estimator_type = is_classifier if is_classifier(self) else is_regressor\n for est in estimators:\n if est != 'drop' and not is_estimator_type(est):\n raise ValueError('The estimator {} should be a {}.'.format(est.__class__.__name__, is_estimator_type.__name__[3:]))\n return names, estimators" }, { @@ -58111,7 +61132,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "deep", @@ -58121,13 +61143,14 @@ "docstring": { "type": "bool, default=True", "description": "Setting it to True gets the various estimators and the parameters\nof the estimators as well." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Get the parameters of an estimator from the ensemble.\n\nReturns the parameters given in the constructor as well as the estimators contained within the `estimators` parameter.", - "docstring": "Get the parameters of an estimator from the ensemble.\n\nReturns the parameters given in the constructor as well as the\nestimators contained within the `estimators` parameter.\n\nParameters\n----------\ndeep : bool, default=True\n Setting it to True gets the various estimators and the parameters\n of the estimators as well.\n\nReturns\n-------\nparams : dict\n Parameter and estimator names mapped to their values or parameter\n names mapped to their values.", + "description": "Get the parameters of an estimator from the ensemble.\n\nReturns the parameters given in the constructor as well as the\nestimators contained within the `estimators` parameter.", + "docstring": "\n Get the parameters of an estimator from the ensemble.\n\n Returns the parameters given in the constructor as well as the\n estimators contained within the `estimators` parameter.\n\n Parameters\n ----------\n deep : bool, default=True\n Setting it to True gets the various estimators and the parameters\n of the estimators as well.\n\n Returns\n -------\n params : dict\n Parameter and estimator names mapped to their values or parameter\n names mapped to their values.\n ", "source_code": "\ndef get_params(self, deep=True):\n \"\"\"\n Get the parameters of an estimator from the ensemble.\n\n Returns the parameters given in the constructor as well as the\n estimators contained within the `estimators` parameter.\n\n Parameters\n ----------\n deep : bool, default=True\n Setting it to True gets the various estimators and the parameters\n of the estimators as well.\n\n Returns\n -------\n params : dict\n Parameter and estimator names mapped to their values or parameter\n names mapped to their values.\n \"\"\"\n return super()._get_params('estimators', deep=deep)" }, { @@ -58145,13 +61168,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Dictionary to access any fitted sub-estimators by name.", - "docstring": "Dictionary to access any fitted sub-estimators by name.\n\nReturns\n-------\n:class:`~sklearn.utils.Bunch`", + "docstring": "Dictionary to access any fitted sub-estimators by name.\n\n Returns\n -------\n :class:`~sklearn.utils.Bunch`\n ", "source_code": "\n@property\ndef named_estimators(self):\n \"\"\"Dictionary to access any fitted sub-estimators by name.\n\n Returns\n -------\n :class:`~sklearn.utils.Bunch`\n \"\"\"\n return Bunch(**dict(self.estimators))" }, { @@ -58169,13 +61193,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Set the parameters of an estimator from the ensemble.\n\nValid parameter keys can be listed with `get_params()`. Note that you can directly set the parameters of the estimators contained in `estimators`.", - "docstring": "Set the parameters of an estimator from the ensemble.\n\nValid parameter keys can be listed with `get_params()`. Note that you\ncan directly set the parameters of the estimators contained in\n`estimators`.\n\nParameters\n----------\n**params : keyword arguments\n Specific parameters using e.g.\n `set_params(parameter_name=new_value)`. 
In addition, to setting the\n parameters of the estimator, the individual estimator of the\n estimators can also be set, or can be removed by setting them to\n 'drop'.\n\nReturns\n-------\nself : object\n Estimator instance.", + "description": "Set the parameters of an estimator from the ensemble.\n\nValid parameter keys can be listed with `get_params()`. Note that you\ncan directly set the parameters of the estimators contained in\n`estimators`.", + "docstring": "\n Set the parameters of an estimator from the ensemble.\n\n Valid parameter keys can be listed with `get_params()`. Note that you\n can directly set the parameters of the estimators contained in\n `estimators`.\n\n Parameters\n ----------\n **params : keyword arguments\n Specific parameters using e.g.\n `set_params(parameter_name=new_value)`. In addition, to setting the\n parameters of the estimator, the individual estimator of the\n estimators can also be set, or can be removed by setting them to\n 'drop'.\n\n Returns\n -------\n self : object\n Estimator instance.\n ", "source_code": "\ndef set_params(self, **params):\n \"\"\"\n Set the parameters of an estimator from the ensemble.\n\n Valid parameter keys can be listed with `get_params()`. Note that you\n can directly set the parameters of the estimators contained in\n `estimators`.\n\n Parameters\n ----------\n **params : keyword arguments\n Specific parameters using e.g.\n `set_params(parameter_name=new_value)`. In addition, to setting the\n parameters of the estimator, the individual estimator of the\n estimators can also be set, or can be removed by setting them to\n 'drop'.\n\n Returns\n -------\n self : object\n Estimator instance.\n \"\"\"\n super()._set_params('estimators', **params)\n return self" }, { @@ -58193,7 +61218,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -58203,7 +61229,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -58213,7 +61240,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -58223,7 +61251,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "message_clsname", @@ -58233,7 +61262,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "message", @@ -58243,7 +61273,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -58267,7 +61298,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_jobs", @@ -58277,7 +61309,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -58301,7 +61334,8 @@ "docstring": { "type": "estimator supporting get/set_params", "description": "Estimator with potential randomness managed by random_state\nparameters." - } + }, + "refined_type": {} }, { "name": "random_state", @@ -58311,13 +61345,14 @@ "docstring": { "type": "int, RandomState instance or None, default=None", "description": "Pseudo-random number generator to control the generation of the random\nintegers. Pass an int for reproducible output across multiple function\ncalls.\nSee :term:`Glossary `." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Set fixed random_state parameters for an estimator.\n\nFinds all parameters ending ``random_state`` and sets them to integers derived from ``random_state``.", - "docstring": "Set fixed random_state parameters for an estimator.\n\nFinds all parameters ending ``random_state`` and sets them to integers\nderived from ``random_state``.\n\nParameters\n----------\nestimator : estimator supporting get/set_params\n Estimator with potential randomness managed by random_state\n parameters.\n\nrandom_state : int, RandomState instance or None, default=None\n Pseudo-random number generator to control the generation of the random\n integers. Pass an int for reproducible output across multiple function\n calls.\n See :term:`Glossary `.\n\nNotes\n-----\nThis does not necessarily set *all* ``random_state`` attributes that\ncontrol an estimator's randomness, only those accessible through\n``estimator.get_params()``. ``random_state``s not controlled include\nthose belonging to:\n\n * cross-validation splitters\n * ``scipy.stats`` rvs", + "description": "Set fixed random_state parameters for an estimator.\n\nFinds all parameters ending ``random_state`` and sets them to integers\nderived from ``random_state``.", + "docstring": "Set fixed random_state parameters for an estimator.\n\n Finds all parameters ending ``random_state`` and sets them to integers\n derived from ``random_state``.\n\n Parameters\n ----------\n estimator : estimator supporting get/set_params\n Estimator with potential randomness managed by random_state\n parameters.\n\n random_state : int, RandomState instance or None, default=None\n Pseudo-random number generator to control the generation of the random\n integers. Pass an int for reproducible output across multiple function\n calls.\n See :term:`Glossary `.\n\n Notes\n -----\n This does not necessarily set *all* ``random_state`` attributes that\n control an estimator's randomness, only those accessible through\n ``estimator.get_params()``. ``random_state``s not controlled include\n those belonging to:\n\n * cross-validation splitters\n * ``scipy.stats`` rvs\n ", "source_code": "\ndef _set_random_states(estimator, random_state=None):\n \"\"\"Set fixed random_state parameters for an estimator.\n\n Finds all parameters ending ``random_state`` and sets them to integers\n derived from ``random_state``.\n\n Parameters\n ----------\n estimator : estimator supporting get/set_params\n Estimator with potential randomness managed by random_state\n parameters.\n\n random_state : int, RandomState instance or None, default=None\n Pseudo-random number generator to control the generation of the random\n integers. Pass an int for reproducible output across multiple function\n calls.\n See :term:`Glossary `.\n\n Notes\n -----\n This does not necessarily set *all* ``random_state`` attributes that\n control an estimator's randomness, only those accessible through\n ``estimator.get_params()``. 
``random_state``s not controlled include\n those belonging to:\n\n * cross-validation splitters\n * ``scipy.stats`` rvs\n \"\"\"\n random_state = check_random_state(random_state)\n to_set = {}\n for key in sorted(estimator.get_params(deep=True)):\n if key == 'random_state' or key.endswith('__random_state'):\n to_set[key] = random_state.randint(np.iinfo(np.int32).max)\n if to_set:\n estimator.set_params(**to_set)" }, { @@ -58335,7 +61370,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "base_estimator", @@ -58345,7 +61381,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_estimators", @@ -58355,7 +61392,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "estimator_params", @@ -58365,7 +61403,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "bootstrap", @@ -58375,7 +61414,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "oob_score", @@ -58385,7 +61425,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_jobs", @@ -58395,7 +61436,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "random_state", @@ -58405,7 +61447,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "verbose", @@ -58415,7 +61458,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "warm_start", @@ -58425,7 +61469,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "class_weight", @@ -58435,7 +61480,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "max_samples", @@ -58445,13 +61491,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@abstractmethod\ndef __init__(self, base_estimator, n_estimators=100, *, estimator_params=tuple(), bootstrap=False, oob_score=False, n_jobs=None, random_state=None, verbose=0, warm_start=False, class_weight=None, max_samples=None):\n super().__init__(base_estimator=base_estimator, n_estimators=n_estimators, estimator_params=estimator_params)\n self.bootstrap = bootstrap\n self.oob_score = oob_score\n self.n_jobs = n_jobs\n self.random_state = random_state\n self.verbose = verbose\n self.warm_start = warm_start\n self.class_weight = class_weight\n self.max_samples = max_samples" }, { @@ -58469,7 +61516,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -58479,7 +61527,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "The data matrix." - } + }, + "refined_type": {} }, { "name": "y", @@ -58489,14 +61538,15 @@ "docstring": { "type": "ndarray of shape (n_samples, n_outputs)", "description": "The target matrix." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Compute and set the OOB score.", - "docstring": "Compute and set the OOB score.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n The data matrix.\ny : ndarray of shape (n_samples, n_outputs)\n The target matrix.\n\nReturns\n-------\noob_pred : ndarray of shape (n_samples, n_classes, n_outputs) or (n_samples, 1, n_outputs)\n The OOB predictions.", - "source_code": "\ndef _compute_oob_predictions(self, X, y):\n \"\"\"Compute and set the OOB score.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The data matrix.\n y : ndarray of shape (n_samples, n_outputs)\n The target matrix.\n\n Returns\n -------\n oob_pred : ndarray of shape (n_samples, n_classes, n_outputs) or (n_samples, 1, n_outputs)\n The OOB predictions.\n \"\"\"\n X = self._validate_data(X, dtype=DTYPE, accept_sparse='csr', reset=False)\n n_samples = y.shape[0]\n n_outputs = self.n_outputs_\n if is_classifier(self) and hasattr(self, 'n_classes_'):\n oob_pred_shape = (n_samples, self.n_classes_[0], n_outputs)\n else:\n oob_pred_shape = (n_samples, 1, n_outputs)\n oob_pred = np.zeros(shape=oob_pred_shape, dtype=np.float64)\n n_oob_pred = np.zeros((n_samples, n_outputs), dtype=np.int64)\n n_samples_bootstrap = _get_n_samples_bootstrap(n_samples, self.max_samples)\n for estimator in self.estimators_:\n unsampled_indices = _generate_unsampled_indices(estimator.random_state, n_samples, n_samples_bootstrap)\n y_pred = self._get_oob_predictions(estimator, X[unsampled_indices, :])\n oob_pred[unsampled_indices, ...] += y_pred\n n_oob_pred[unsampled_indices, :] += 1\n for k in range(n_outputs):\n if (n_oob_pred == 0).any():\n warn('Some inputs do not have OOB scores. This probably means too few trees were used to compute any reliable OOB estimates.', UserWarning)\n n_oob_pred[n_oob_pred == 0] = 1\n oob_pred[..., k] /= n_oob_pred[..., [k]]\n return oob_pred" + "docstring": "Compute and set the OOB score.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The data matrix.\n y : ndarray of shape (n_samples, n_outputs)\n The target matrix.\n\n Returns\n -------\n oob_pred : ndarray of shape (n_samples, n_classes, n_outputs) or (n_samples, 1, n_outputs)\n The OOB predictions.\n ", + "source_code": "\ndef _compute_oob_predictions(self, X, y):\n \"\"\"Compute and set the OOB score.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The data matrix.\n y : ndarray of shape (n_samples, n_outputs)\n The target matrix.\n\n Returns\n -------\n oob_pred : ndarray of shape (n_samples, n_classes, n_outputs) or (n_samples, 1, n_outputs)\n The OOB predictions.\n \"\"\"\n if issparse(X):\n X = X.tocsr()\n n_samples = y.shape[0]\n n_outputs = self.n_outputs_\n if is_classifier(self) and hasattr(self, 'n_classes_'):\n oob_pred_shape = (n_samples, self.n_classes_[0], n_outputs)\n else:\n oob_pred_shape = (n_samples, 1, n_outputs)\n oob_pred = np.zeros(shape=oob_pred_shape, dtype=np.float64)\n n_oob_pred = np.zeros((n_samples, n_outputs), dtype=np.int64)\n n_samples_bootstrap = _get_n_samples_bootstrap(n_samples, self.max_samples)\n for estimator in self.estimators_:\n unsampled_indices = _generate_unsampled_indices(estimator.random_state, n_samples, n_samples_bootstrap)\n y_pred = self._get_oob_predictions(estimator, X[unsampled_indices, :])\n oob_pred[unsampled_indices, ...] 
+= y_pred\n n_oob_pred[unsampled_indices, :] += 1\n for k in range(n_outputs):\n if (n_oob_pred == 0).any():\n warn('Some inputs do not have OOB scores. This probably means too few trees were used to compute any reliable OOB estimates.', UserWarning)\n n_oob_pred[n_oob_pred == 0] = 1\n oob_pred[..., k] /= n_oob_pred[..., [k]]\n return oob_pred" }, { "name": "_set_oob_score_and_attributes", @@ -58513,7 +61563,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -58523,7 +61574,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "The data matrix." - } + }, + "refined_type": {} }, { "name": "y", @@ -58533,13 +61585,14 @@ "docstring": { "type": "ndarray of shape (n_samples, n_outputs)", "description": "The target matrix." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Compute and set the OOB score and attributes.", - "docstring": "Compute and set the OOB score and attributes.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n The data matrix.\ny : ndarray of shape (n_samples, n_outputs)\n The target matrix.", + "docstring": "Compute and set the OOB score and attributes.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The data matrix.\n y : ndarray of shape (n_samples, n_outputs)\n The target matrix.\n ", "source_code": "\n@abstractmethod\ndef _set_oob_score_and_attributes(self, X, y):\n \"\"\"Compute and set the OOB score and attributes.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The data matrix.\n y : ndarray of shape (n_samples, n_outputs)\n The target matrix.\n \"\"\"\n " }, { @@ -58557,7 +61610,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -58567,13 +61621,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Validate X whenever one tries to predict, apply, predict_proba.", - "docstring": "Validate X whenever one tries to predict, apply, predict_proba.", + "docstring": "\n Validate X whenever one tries to predict, apply, predict_proba.", "source_code": "\ndef _validate_X_predict(self, X):\n \"\"\"\n Validate X whenever one tries to predict, apply, predict_proba.\"\"\"\n check_is_fitted(self)\n X = self._validate_data(X, dtype=DTYPE, accept_sparse='csr', reset=False)\n if issparse(X) and (X.indices.dtype != np.intc or X.indptr.dtype != np.intc):\n raise ValueError('No support for np.int64 index based sparse matrices')\n return X" }, { @@ -58591,7 +61646,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -58601,13 +61657,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _validate_y_class_weight(self, y):\n return y, None" }, { @@ -58625,7 +61682,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -58635,13 +61693,17 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "The input samples. Internally, its dtype will be converted to\n``dtype=np.float32``. If a sparse matrix is provided, it will be\nconverted into a sparse ``csr_matrix``." 
+ }, + "refined_type": { + "kind": "EnumType", + "values": [] } } ], "results": [], "is_public": false, "description": "Apply trees in the forest to X, return leaf indices.", - "docstring": "Apply trees in the forest to X, return leaf indices.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input samples. Internally, its dtype will be converted to\n ``dtype=np.float32``. If a sparse matrix is provided, it will be\n converted into a sparse ``csr_matrix``.\n\nReturns\n-------\nX_leaves : ndarray of shape (n_samples, n_estimators)\n For each datapoint x in X and for each tree in the forest,\n return the index of the leaf x ends up in.", + "docstring": "\n Apply trees in the forest to X, return leaf indices.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input samples. Internally, its dtype will be converted to\n ``dtype=np.float32``. If a sparse matrix is provided, it will be\n converted into a sparse ``csr_matrix``.\n\n Returns\n -------\n X_leaves : ndarray of shape (n_samples, n_estimators)\n For each datapoint x in X and for each tree in the forest,\n return the index of the leaf x ends up in.\n ", "source_code": "\ndef apply(self, X):\n \"\"\"\n Apply trees in the forest to X, return leaf indices.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input samples. Internally, its dtype will be converted to\n ``dtype=np.float32``. If a sparse matrix is provided, it will be\n converted into a sparse ``csr_matrix``.\n\n Returns\n -------\n X_leaves : ndarray of shape (n_samples, n_estimators)\n For each datapoint x in X and for each tree in the forest,\n return the index of the leaf x ends up in.\n \"\"\"\n X = self._validate_X_predict(X)\n results = Parallel(n_jobs=self.n_jobs, verbose=self.verbose, **_joblib_parallel_args(prefer='threads'))((delayed(tree.apply)(X, check_input=False) for tree in self.estimators_))\n return np.array(results).T" }, { @@ -58659,7 +61721,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -58669,13 +61732,17 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "The input samples. Internally, its dtype will be converted to\n``dtype=np.float32``. If a sparse matrix is provided, it will be\nconverted into a sparse ``csr_matrix``." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } } ], "results": [], "is_public": false, "description": "Return the decision path in the forest.\n\n.. versionadded:: 0.18", - "docstring": "Return the decision path in the forest.\n\n.. versionadded:: 0.18\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input samples. Internally, its dtype will be converted to\n ``dtype=np.float32``. If a sparse matrix is provided, it will be\n converted into a sparse ``csr_matrix``.\n\nReturns\n-------\nindicator : sparse matrix of shape (n_samples, n_nodes)\n Return a node indicator matrix where non zero elements indicates\n that the samples goes through the nodes. The matrix is of CSR\n format.\n\nn_nodes_ptr : ndarray of shape (n_estimators + 1,)\n The columns from indicator[n_nodes_ptr[i]:n_nodes_ptr[i+1]]\n gives the indicator value for the i-th estimator.", + "docstring": "\n Return the decision path in the forest.\n\n .. 
versionadded:: 0.18\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input samples. Internally, its dtype will be converted to\n ``dtype=np.float32``. If a sparse matrix is provided, it will be\n converted into a sparse ``csr_matrix``.\n\n Returns\n -------\n indicator : sparse matrix of shape (n_samples, n_nodes)\n Return a node indicator matrix where non zero elements indicates\n that the samples goes through the nodes. The matrix is of CSR\n format.\n\n n_nodes_ptr : ndarray of shape (n_estimators + 1,)\n The columns from indicator[n_nodes_ptr[i]:n_nodes_ptr[i+1]]\n gives the indicator value for the i-th estimator.\n ", "source_code": "\ndef decision_path(self, X):\n \"\"\"\n Return the decision path in the forest.\n\n .. versionadded:: 0.18\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input samples. Internally, its dtype will be converted to\n ``dtype=np.float32``. If a sparse matrix is provided, it will be\n converted into a sparse ``csr_matrix``.\n\n Returns\n -------\n indicator : sparse matrix of shape (n_samples, n_nodes)\n Return a node indicator matrix where non zero elements indicates\n that the samples goes through the nodes. The matrix is of CSR\n format.\n\n n_nodes_ptr : ndarray of shape (n_estimators + 1,)\n The columns from indicator[n_nodes_ptr[i]:n_nodes_ptr[i+1]]\n gives the indicator value for the i-th estimator.\n \"\"\"\n X = self._validate_X_predict(X)\n indicators = Parallel(n_jobs=self.n_jobs, verbose=self.verbose, **_joblib_parallel_args(prefer='threads'))((delayed(tree.decision_path)(X, check_input=False) for tree in self.estimators_))\n n_nodes = [0]\n n_nodes.extend([i.shape[1] for i in indicators])\n n_nodes_ptr = np.array(n_nodes).cumsum()\n return sparse_hstack(indicators).tocsr(), n_nodes_ptr" }, { @@ -58693,13 +61760,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "The impurity-based feature importances.\n\nThe higher, the more important the feature. The importance of a feature is computed as the (normalized) total reduction of the criterion brought by that feature. It is also known as the Gini importance. Warning: impurity-based feature importances can be misleading for high cardinality features (many unique values). See :func:`sklearn.inspection.permutation_importance` as an alternative.", - "docstring": "The impurity-based feature importances.\n\nThe higher, the more important the feature.\nThe importance of a feature is computed as the (normalized)\ntotal reduction of the criterion brought by that feature. It is also\nknown as the Gini importance.\n\nWarning: impurity-based feature importances can be misleading for\nhigh cardinality features (many unique values). See\n:func:`sklearn.inspection.permutation_importance` as an alternative.\n\nReturns\n-------\nfeature_importances_ : ndarray of shape (n_features,)\n The values of this array sum to 1, unless all trees are single node\n trees consisting of only the root node, in which case it will be an\n array of zeros.", + "description": "The impurity-based feature importances.\n\nThe higher, the more important the feature.\nThe importance of a feature is computed as the (normalized)\ntotal reduction of the criterion brought by that feature. It is also\nknown as the Gini importance.\n\nWarning: impurity-based feature importances can be misleading for\nhigh cardinality features (many unique values). 
See\n:func:`sklearn.inspection.permutation_importance` as an alternative.", + "docstring": "\n The impurity-based feature importances.\n\n The higher, the more important the feature.\n The importance of a feature is computed as the (normalized)\n total reduction of the criterion brought by that feature. It is also\n known as the Gini importance.\n\n Warning: impurity-based feature importances can be misleading for\n high cardinality features (many unique values). See\n :func:`sklearn.inspection.permutation_importance` as an alternative.\n\n Returns\n -------\n feature_importances_ : ndarray of shape (n_features,)\n The values of this array sum to 1, unless all trees are single node\n trees consisting of only the root node, in which case it will be an\n array of zeros.\n ", "source_code": "\n@property\ndef feature_importances_(self):\n \"\"\"\n The impurity-based feature importances.\n\n The higher, the more important the feature.\n The importance of a feature is computed as the (normalized)\n total reduction of the criterion brought by that feature. It is also\n known as the Gini importance.\n\n Warning: impurity-based feature importances can be misleading for\n high cardinality features (many unique values). See\n :func:`sklearn.inspection.permutation_importance` as an alternative.\n\n Returns\n -------\n feature_importances_ : ndarray of shape (n_features,)\n The values of this array sum to 1, unless all trees are single node\n trees consisting of only the root node, in which case it will be an\n array of zeros.\n \"\"\"\n check_is_fitted(self)\n all_importances = Parallel(n_jobs=self.n_jobs, **_joblib_parallel_args(prefer='threads'))((delayed(getattr)(tree, 'feature_importances_') for tree in self.estimators_ if tree.tree_.node_count > 1))\n if not all_importances:\n return np.zeros(self.n_features_in_, dtype=np.float64)\n all_importances = np.mean(all_importances, axis=0, dtype=np.float64)\n return all_importances / np.sum(all_importances)" }, { @@ -58717,7 +61785,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -58727,6 +61796,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "The training input samples. Internally, its dtype will be converted\nto ``dtype=np.float32``. If a sparse matrix is provided, it will be\nconverted into a sparse ``csc_matrix``." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -58737,7 +61810,8 @@ "docstring": { "type": "array-like of shape (n_samples,) or (n_samples, n_outputs)", "description": "The target values (class labels in classification, real numbers in\nregression)." - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -58747,14 +61821,15 @@ "docstring": { "type": "array-like of shape (n_samples,), default=None", "description": "Sample weights. If None, then samples are equally weighted. Splits\nthat would create child nodes with net zero or negative weight are\nignored while searching for a split in each node. In the case of\nclassification, splits are also ignored if they would result in any\nsingle class carrying a negative weight in either child node." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Build a forest of trees from the training set (X, y).", - "docstring": "Build a forest of trees from the training set (X, y).\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n The training input samples. 
Internally, its dtype will be converted\n to ``dtype=np.float32``. If a sparse matrix is provided, it will be\n converted into a sparse ``csc_matrix``.\n\ny : array-like of shape (n_samples,) or (n_samples, n_outputs)\n The target values (class labels in classification, real numbers in\n regression).\n\nsample_weight : array-like of shape (n_samples,), default=None\n Sample weights. If None, then samples are equally weighted. Splits\n that would create child nodes with net zero or negative weight are\n ignored while searching for a split in each node. In the case of\n classification, splits are also ignored if they would result in any\n single class carrying a negative weight in either child node.\n\nReturns\n-------\nself : object\n Fitted estimator.", - "source_code": "\ndef fit(self, X, y, sample_weight=None):\n \"\"\"\n Build a forest of trees from the training set (X, y).\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The training input samples. Internally, its dtype will be converted\n to ``dtype=np.float32``. If a sparse matrix is provided, it will be\n converted into a sparse ``csc_matrix``.\n\n y : array-like of shape (n_samples,) or (n_samples, n_outputs)\n The target values (class labels in classification, real numbers in\n regression).\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights. If None, then samples are equally weighted. Splits\n that would create child nodes with net zero or negative weight are\n ignored while searching for a split in each node. In the case of\n classification, splits are also ignored if they would result in any\n single class carrying a negative weight in either child node.\n\n Returns\n -------\n self : object\n Fitted estimator.\n \"\"\"\n if issparse(y):\n raise ValueError('sparse multilabel-indicator for y is not supported.')\n (X, y) = self._validate_data(X, y, multi_output=True, accept_sparse='csc', dtype=DTYPE)\n if sample_weight is not None:\n sample_weight = _check_sample_weight(sample_weight, X)\n if issparse(X):\n X.sort_indices()\n y = np.atleast_1d(y)\n if y.ndim == 2 and y.shape[1] == 1:\n warn('A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().', DataConversionWarning, stacklevel=2)\n if y.ndim == 1:\n y = np.reshape(y, (-1, 1))\n if self.criterion == 'poisson':\n if np.any(y < 0):\n raise ValueError('Some value(s) of y are negative which is not allowed for Poisson regression.')\n if np.sum(y) <= 0:\n raise ValueError('Sum of y is not strictly positive which is necessary for Poisson regression.')\n self.n_outputs_ = y.shape[1]\n (y, expanded_class_weight) = self._validate_y_class_weight(y)\n if getattr(y, 'dtype', None) != DOUBLE or not y.flags.contiguous:\n y = np.ascontiguousarray(y, dtype=DOUBLE)\n if expanded_class_weight is not None:\n if sample_weight is not None:\n sample_weight = sample_weight * expanded_class_weight\n else:\n sample_weight = expanded_class_weight\n n_samples_bootstrap = _get_n_samples_bootstrap(n_samples=X.shape[0], max_samples=self.max_samples)\n self._validate_estimator()\n if isinstance(self, (RandomForestRegressor, ExtraTreesRegressor)):\n if self.criterion == 'mse':\n warn(\"Criterion 'mse' was deprecated in v1.0 and will be removed in version 1.2. Use `criterion='squared_error'` which is equivalent.\", FutureWarning)\n elif self.criterion == 'mae':\n warn(\"Criterion 'mae' was deprecated in v1.0 and will be removed in version 1.2. 
Use `criterion='absolute_error'` which is equivalent.\", FutureWarning)\n if not self.bootstrap and self.oob_score:\n raise ValueError('Out of bag estimation only available if bootstrap=True')\n random_state = check_random_state(self.random_state)\n if not self.warm_start or not hasattr(self, 'estimators_'):\n self.estimators_ = []\n n_more_estimators = self.n_estimators - len(self.estimators_)\n if n_more_estimators < 0:\n raise ValueError('n_estimators=%d must be larger or equal to len(estimators_)=%d when warm_start==True' % (self.n_estimators, len(self.estimators_)))\n elif n_more_estimators == 0:\n warn('Warm-start fitting without increasing n_estimators does not fit new trees.')\n else:\n if self.warm_start and len(self.estimators_) > 0:\n random_state.randint(MAX_INT, size=len(self.estimators_))\n trees = [self._make_estimator(append=False, random_state=random_state) for i in range(n_more_estimators)]\n trees = Parallel(n_jobs=self.n_jobs, verbose=self.verbose, **_joblib_parallel_args(prefer='threads'))((delayed(_parallel_build_trees)(t, self, X, y, sample_weight, i, len(trees), verbose=self.verbose, class_weight=self.class_weight, n_samples_bootstrap=n_samples_bootstrap) for (i, t) in enumerate(trees)))\n self.estimators_.extend(trees)\n if self.oob_score:\n y_type = type_of_target(y)\n if y_type in ('multiclass-multioutput', 'unknown'):\n raise ValueError(f'The type of target cannot be used to compute OOB estimates. Got {y_type} while only the following are supported: continuous, continuous-multioutput, binary, multiclass, multilabel-indicator.')\n self._set_oob_score_and_attributes(X, y)\n if hasattr(self, 'classes_') and self.n_outputs_ == 1:\n self.n_classes_ = self.n_classes_[0]\n self.classes_ = self.classes_[0]\n return self" + "docstring": "\n Build a forest of trees from the training set (X, y).\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The training input samples. Internally, its dtype will be converted\n to ``dtype=np.float32``. If a sparse matrix is provided, it will be\n converted into a sparse ``csc_matrix``.\n\n y : array-like of shape (n_samples,) or (n_samples, n_outputs)\n The target values (class labels in classification, real numbers in\n regression).\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights. If None, then samples are equally weighted. Splits\n that would create child nodes with net zero or negative weight are\n ignored while searching for a split in each node. In the case of\n classification, splits are also ignored if they would result in any\n single class carrying a negative weight in either child node.\n\n Returns\n -------\n self : object\n Fitted estimator.\n ", + "source_code": "\ndef fit(self, X, y, sample_weight=None):\n \"\"\"\n Build a forest of trees from the training set (X, y).\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The training input samples. Internally, its dtype will be converted\n to ``dtype=np.float32``. If a sparse matrix is provided, it will be\n converted into a sparse ``csc_matrix``.\n\n y : array-like of shape (n_samples,) or (n_samples, n_outputs)\n The target values (class labels in classification, real numbers in\n regression).\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights. If None, then samples are equally weighted. Splits\n that would create child nodes with net zero or negative weight are\n ignored while searching for a split in each node. 
In the case of\n classification, splits are also ignored if they would result in any\n single class carrying a negative weight in either child node.\n\n Returns\n -------\n self : object\n Fitted estimator.\n \"\"\"\n if issparse(y):\n raise ValueError('sparse multilabel-indicator for y is not supported.')\n (X, y) = self._validate_data(X, y, multi_output=True, accept_sparse='csc', dtype=DTYPE)\n if sample_weight is not None:\n sample_weight = _check_sample_weight(sample_weight, X)\n if issparse(X):\n X.sort_indices()\n y = np.atleast_1d(y)\n if y.ndim == 2 and y.shape[1] == 1:\n warn('A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().', DataConversionWarning, stacklevel=2)\n if y.ndim == 1:\n y = np.reshape(y, (-1, 1))\n if self.criterion == 'poisson':\n if np.any(y < 0):\n raise ValueError('Some value(s) of y are negative which is not allowed for Poisson regression.')\n if np.sum(y) <= 0:\n raise ValueError('Sum of y is not strictly positive which is necessary for Poisson regression.')\n self.n_outputs_ = y.shape[1]\n (y, expanded_class_weight) = self._validate_y_class_weight(y)\n if getattr(y, 'dtype', None) != DOUBLE or not y.flags.contiguous:\n y = np.ascontiguousarray(y, dtype=DOUBLE)\n if expanded_class_weight is not None:\n if sample_weight is not None:\n sample_weight = sample_weight * expanded_class_weight\n else:\n sample_weight = expanded_class_weight\n if not self.bootstrap and self.max_samples is not None:\n raise ValueError('`max_sample` cannot be set if `bootstrap=False`. Either switch to `bootstrap=True` or set `max_sample=None`.')\n elif self.bootstrap:\n n_samples_bootstrap = _get_n_samples_bootstrap(n_samples=X.shape[0], max_samples=self.max_samples)\n else:\n n_samples_bootstrap = None\n self._validate_estimator()\n if isinstance(self, (RandomForestRegressor, ExtraTreesRegressor)):\n if self.criterion == 'mse':\n warn(\"Criterion 'mse' was deprecated in v1.0 and will be removed in version 1.2. Use `criterion='squared_error'` which is equivalent.\", FutureWarning)\n elif self.criterion == 'mae':\n warn(\"Criterion 'mae' was deprecated in v1.0 and will be removed in version 1.2. 
Use `criterion='absolute_error'` which is equivalent.\", FutureWarning)\n if not self.bootstrap and self.oob_score:\n raise ValueError('Out of bag estimation only available if bootstrap=True')\n random_state = check_random_state(self.random_state)\n if not self.warm_start or not hasattr(self, 'estimators_'):\n self.estimators_ = []\n n_more_estimators = self.n_estimators - len(self.estimators_)\n if n_more_estimators < 0:\n raise ValueError('n_estimators=%d must be larger or equal to len(estimators_)=%d when warm_start==True' % (self.n_estimators, len(self.estimators_)))\n elif n_more_estimators == 0:\n warn('Warm-start fitting without increasing n_estimators does not fit new trees.')\n else:\n if self.warm_start and len(self.estimators_) > 0:\n random_state.randint(MAX_INT, size=len(self.estimators_))\n trees = [self._make_estimator(append=False, random_state=random_state) for i in range(n_more_estimators)]\n trees = Parallel(n_jobs=self.n_jobs, verbose=self.verbose, **_joblib_parallel_args(prefer='threads'))((delayed(_parallel_build_trees)(t, self, X, y, sample_weight, i, len(trees), verbose=self.verbose, class_weight=self.class_weight, n_samples_bootstrap=n_samples_bootstrap) for (i, t) in enumerate(trees)))\n self.estimators_.extend(trees)\n if self.oob_score:\n y_type = type_of_target(y)\n if y_type in ('multiclass-multioutput', 'unknown'):\n raise ValueError(f'The type of target cannot be used to compute OOB estimates. Got {y_type} while only the following are supported: continuous, continuous-multioutput, binary, multiclass, multilabel-indicator.')\n self._set_oob_score_and_attributes(X, y)\n if hasattr(self, 'classes_') and self.n_outputs_ == 1:\n self.n_classes_ = self.n_classes_[0]\n self.classes_ = self.classes_[0]\n return self" }, { "name": "n_features_", @@ -58774,7 +61849,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -58798,7 +61874,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_estimators", @@ -58808,7 +61885,8 @@ "docstring": { "type": "int, default=100", "description": "The number of trees in the forest.\n\n.. versionchanged:: 0.22\n The default value of ``n_estimators`` changed from 10 to 100\n in 0.22." - } + }, + "refined_type": {} }, { "name": "criterion", @@ -58818,6 +61896,10 @@ "docstring": { "type": "{\"gini\", \"entropy\"}, default=\"gini\"", "description": "The function to measure the quality of a split. Supported criteria are\n\"gini\" for the Gini impurity and \"entropy\" for the information gain." + }, + "refined_type": { + "kind": "EnumType", + "values": ["gini", "entropy"] } }, { @@ -58828,7 +61910,8 @@ "docstring": { "type": "int, default=None", "description": "The maximum depth of the tree. If None, then nodes are expanded until\nall leaves are pure or until all leaves contain less than\nmin_samples_split samples." - } + }, + "refined_type": {} }, { "name": "min_samples_split", @@ -58838,7 +61921,8 @@ "docstring": { "type": "int or float, default=2", "description": "The minimum number of samples required to split an internal node:\n\n- If int, then consider `min_samples_split` as the minimum number.\n- If float, then `min_samples_split` is a fraction and\n `ceil(min_samples_split * n_samples)` are the minimum\n number of samples for each split.\n\n.. versionchanged:: 0.18\n Added float values for fractions." 
- } + }, + "refined_type": {} }, { "name": "min_samples_leaf", @@ -58848,7 +61932,8 @@ "docstring": { "type": "int or float, default=1", "description": "The minimum number of samples required to be at a leaf node.\nA split point at any depth will only be considered if it leaves at\nleast ``min_samples_leaf`` training samples in each of the left and\nright branches. This may have the effect of smoothing the model,\nespecially in regression.\n\n- If int, then consider `min_samples_leaf` as the minimum number.\n- If float, then `min_samples_leaf` is a fraction and\n `ceil(min_samples_leaf * n_samples)` are the minimum\n number of samples for each node.\n\n.. versionchanged:: 0.18\n Added float values for fractions." - } + }, + "refined_type": {} }, { "name": "min_weight_fraction_leaf", @@ -58858,7 +61943,8 @@ "docstring": { "type": "float, default=0.0", "description": "The minimum weighted fraction of the sum total of weights (of all\nthe input samples) required to be at a leaf node. Samples have\nequal weight when sample_weight is not provided." - } + }, + "refined_type": {} }, { "name": "max_features", @@ -58868,6 +61954,10 @@ "docstring": { "type": "{\"auto\", \"sqrt\", \"log2\"}, int or float, default=\"auto\"", "description": "The number of features to consider when looking for the best split:\n\n- If int, then consider `max_features` features at each split.\n- If float, then `max_features` is a fraction and\n `round(max_features * n_features)` features are considered at each\n split.\n- If \"auto\", then `max_features=sqrt(n_features)`.\n- If \"sqrt\", then `max_features=sqrt(n_features)`.\n- If \"log2\", then `max_features=log2(n_features)`.\n- If None, then `max_features=n_features`.\n\nNote: the search for a split does not stop until at least one\nvalid partition of the node samples is found, even if it requires to\neffectively inspect more than ``max_features`` features." + }, + "refined_type": { + "kind": "EnumType", + "values": ["auto", "sqrt", "log2"] } }, { @@ -58878,7 +61968,8 @@ "docstring": { "type": "int, default=None", "description": "Grow trees with ``max_leaf_nodes`` in best-first fashion.\nBest nodes are defined as relative reduction in impurity.\nIf None then unlimited number of leaf nodes." - } + }, + "refined_type": {} }, { "name": "min_impurity_decrease", @@ -58888,7 +61979,8 @@ "docstring": { "type": "float, default=0.0", "description": "A node will be split if this split induces a decrease of the impurity\ngreater than or equal to this value.\n\nThe weighted impurity decrease equation is the following::\n\n N_t / N * (impurity - N_t_R / N_t * right_impurity\n - N_t_L / N_t * left_impurity)\n\nwhere ``N`` is the total number of samples, ``N_t`` is the number of\nsamples at the current node, ``N_t_L`` is the number of samples in the\nleft child, and ``N_t_R`` is the number of samples in the right child.\n\n``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum,\nif ``sample_weight`` is passed.\n\n.. versionadded:: 0.19" - } + }, + "refined_type": {} }, { "name": "bootstrap", @@ -58898,7 +61990,8 @@ "docstring": { "type": "bool, default=False", "description": "Whether bootstrap samples are used when building trees. If False, the\nwhole dataset is used to build each tree." - } + }, + "refined_type": {} }, { "name": "oob_score", @@ -58908,7 +62001,8 @@ "docstring": { "type": "bool, default=False", "description": "Whether to use out-of-bag samples to estimate the generalization score.\nOnly available if bootstrap=True." 
- } + }, + "refined_type": {} }, { "name": "n_jobs", @@ -58918,7 +62012,8 @@ "docstring": { "type": "int, default=None", "description": "The number of jobs to run in parallel. :meth:`fit`, :meth:`predict`,\n:meth:`decision_path` and :meth:`apply` are all parallelized over the\ntrees. ``None`` means 1 unless in a :obj:`joblib.parallel_backend`\ncontext. ``-1`` means using all processors. See :term:`Glossary\n` for more details." - } + }, + "refined_type": {} }, { "name": "random_state", @@ -58928,7 +62023,8 @@ "docstring": { "type": "int, RandomState instance or None, default=None", "description": "Controls 3 sources of randomness:\n\n- the bootstrapping of the samples used when building trees\n (if ``bootstrap=True``)\n- the sampling of the features to consider when looking for the best\n split at each node (if ``max_features < n_features``)\n- the draw of the splits for each of the `max_features`\n\nSee :term:`Glossary ` for details." - } + }, + "refined_type": {} }, { "name": "verbose", @@ -58938,7 +62034,8 @@ "docstring": { "type": "int, default=0", "description": "Controls the verbosity when fitting and predicting." - } + }, + "refined_type": {} }, { "name": "warm_start", @@ -58948,7 +62045,8 @@ "docstring": { "type": "bool, default=False", "description": "When set to ``True``, reuse the solution of the previous call to fit\nand add more estimators to the ensemble, otherwise, just fit a whole\nnew forest. See :term:`the Glossary `." - } + }, + "refined_type": {} }, { "name": "class_weight", @@ -58958,6 +62056,10 @@ "docstring": { "type": "{\"balanced\", \"balanced_subsample\"}, dict or list of dicts, default=None", "description": "Weights associated with classes in the form ``{class_label: weight}``.\nIf not given, all classes are supposed to have weight one. For\nmulti-output problems, a list of dicts can be provided in the same\norder as the columns of y.\n\nNote that for multioutput (including multilabel) weights should be\ndefined for each class of every column in its own dict. For example,\nfor four-class multilabel classification weights should be\n[{0: 1, 1: 1}, {0: 1, 1: 5}, {0: 1, 1: 1}, {0: 1, 1: 1}] instead of\n[{1:1}, {2:5}, {3:1}, {4:1}].\n\nThe \"balanced\" mode uses the values of y to automatically adjust\nweights inversely proportional to class frequencies in the input data\nas ``n_samples / (n_classes * np.bincount(y))``\n\nThe \"balanced_subsample\" mode is the same as \"balanced\" except that\nweights are computed based on the bootstrap sample for every tree\ngrown.\n\nFor multi-output, the weights of each column of y will be multiplied.\n\nNote that these weights will be multiplied with sample_weight (passed\nthrough the fit method) if sample_weight is specified." + }, + "refined_type": { + "kind": "EnumType", + "values": ["balanced", "balanced_subsample"] } }, { @@ -58968,7 +62070,8 @@ "docstring": { "type": "non-negative float, default=0.0", "description": "Complexity parameter used for Minimal Cost-Complexity Pruning. The\nsubtree with the largest cost complexity that is smaller than\n``ccp_alpha`` will be chosen. By default, no pruning is performed. See\n:ref:`minimal_cost_complexity_pruning` for details.\n\n.. 
versionadded:: 0.22" - } + }, + "refined_type": {} }, { "name": "max_samples", @@ -58978,13 +62081,21 @@ "docstring": { "type": "int or float, default=None", "description": "If bootstrap is True, the number of samples to draw from X\nto train each base estimator.\n\n- If None (default), then draw `X.shape[0]` samples.\n- If int, then draw `max_samples` samples.\n- If float, then draw `max_samples * X.shape[0]` samples. Thus,\n `max_samples` should be in the interval `(0.0, 1.0]`.\n\n.. versionadded:: 0.22" + }, + "refined_type": { + "kind": "BoundaryType", + "base_type": "float", + "min": 0.0, + "max": 1.0, + "min_inclusive": false, + "max_inclusive": true } } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, n_estimators=100, *, criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0, bootstrap=False, oob_score=False, n_jobs=None, random_state=None, verbose=0, warm_start=False, class_weight=None, ccp_alpha=0.0, max_samples=None):\n super().__init__(base_estimator=ExtraTreeClassifier(), n_estimators=n_estimators, estimator_params=('criterion', 'max_depth', 'min_samples_split', 'min_samples_leaf', 'min_weight_fraction_leaf', 'max_features', 'max_leaf_nodes', 'min_impurity_decrease', 'random_state', 'ccp_alpha'), bootstrap=bootstrap, oob_score=oob_score, n_jobs=n_jobs, random_state=random_state, verbose=verbose, warm_start=warm_start, class_weight=class_weight, max_samples=max_samples)\n self.criterion = criterion\n self.max_depth = max_depth\n self.min_samples_split = min_samples_split\n self.min_samples_leaf = min_samples_leaf\n self.min_weight_fraction_leaf = min_weight_fraction_leaf\n self.max_features = max_features\n self.max_leaf_nodes = max_leaf_nodes\n self.min_impurity_decrease = min_impurity_decrease\n self.ccp_alpha = ccp_alpha" }, { @@ -59002,7 +62113,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_estimators", @@ -59012,7 +62124,8 @@ "docstring": { "type": "int, default=100", "description": "The number of trees in the forest.\n\n.. versionchanged:: 0.22\n The default value of ``n_estimators`` changed from 10 to 100\n in 0.22." - } + }, + "refined_type": {} }, { "name": "criterion", @@ -59022,6 +62135,10 @@ "docstring": { "type": "{\"squared_error\", \"absolute_error\"}, default=\"squared_error\"", "description": "The function to measure the quality of a split. Supported criteria\nare \"squared_error\" for the mean squared error, which is equal to\nvariance reduction as feature selection criterion, and \"absolute_error\"\nfor the mean absolute error.\n\n.. versionadded:: 0.18\n Mean Absolute Error (MAE) criterion.\n\n.. deprecated:: 1.0\n Criterion \"mse\" was deprecated in v1.0 and will be removed in\n version 1.2. Use `criterion=\"squared_error\"` which is equivalent.\n\n.. deprecated:: 1.0\n Criterion \"mae\" was deprecated in v1.0 and will be removed in\n version 1.2. Use `criterion=\"absolute_error\"` which is equivalent." + }, + "refined_type": { + "kind": "EnumType", + "values": ["squared_error", "absolute_error"] } }, { @@ -59032,7 +62149,8 @@ "docstring": { "type": "int, default=None", "description": "The maximum depth of the tree. If None, then nodes are expanded until\nall leaves are pure or until all leaves contain less than\nmin_samples_split samples." 
- } + }, + "refined_type": {} }, { "name": "min_samples_split", @@ -59042,7 +62160,8 @@ "docstring": { "type": "int or float, default=2", "description": "The minimum number of samples required to split an internal node:\n\n- If int, then consider `min_samples_split` as the minimum number.\n- If float, then `min_samples_split` is a fraction and\n `ceil(min_samples_split * n_samples)` are the minimum\n number of samples for each split.\n\n.. versionchanged:: 0.18\n Added float values for fractions." - } + }, + "refined_type": {} }, { "name": "min_samples_leaf", @@ -59052,7 +62171,8 @@ "docstring": { "type": "int or float, default=1", "description": "The minimum number of samples required to be at a leaf node.\nA split point at any depth will only be considered if it leaves at\nleast ``min_samples_leaf`` training samples in each of the left and\nright branches. This may have the effect of smoothing the model,\nespecially in regression.\n\n- If int, then consider `min_samples_leaf` as the minimum number.\n- If float, then `min_samples_leaf` is a fraction and\n `ceil(min_samples_leaf * n_samples)` are the minimum\n number of samples for each node.\n\n.. versionchanged:: 0.18\n Added float values for fractions." - } + }, + "refined_type": {} }, { "name": "min_weight_fraction_leaf", @@ -59062,7 +62182,8 @@ "docstring": { "type": "float, default=0.0", "description": "The minimum weighted fraction of the sum total of weights (of all\nthe input samples) required to be at a leaf node. Samples have\nequal weight when sample_weight is not provided." - } + }, + "refined_type": {} }, { "name": "max_features", @@ -59072,6 +62193,10 @@ "docstring": { "type": "{\"auto\", \"sqrt\", \"log2\"}, int or float, default=\"auto\"", "description": "The number of features to consider when looking for the best split:\n\n- If int, then consider `max_features` features at each split.\n- If float, then `max_features` is a fraction and\n `round(max_features * n_features)` features are considered at each\n split.\n- If \"auto\", then `max_features=n_features`.\n- If \"sqrt\", then `max_features=sqrt(n_features)`.\n- If \"log2\", then `max_features=log2(n_features)`.\n- If None, then `max_features=n_features`.\n\nNote: the search for a split does not stop until at least one\nvalid partition of the node samples is found, even if it requires to\neffectively inspect more than ``max_features`` features." + }, + "refined_type": { + "kind": "EnumType", + "values": ["auto", "sqrt", "log2"] } }, { @@ -59082,7 +62207,8 @@ "docstring": { "type": "int, default=None", "description": "Grow trees with ``max_leaf_nodes`` in best-first fashion.\nBest nodes are defined as relative reduction in impurity.\nIf None then unlimited number of leaf nodes." - } + }, + "refined_type": {} }, { "name": "min_impurity_decrease", @@ -59092,7 +62218,8 @@ "docstring": { "type": "float, default=0.0", "description": "A node will be split if this split induces a decrease of the impurity\ngreater than or equal to this value.\n\nThe weighted impurity decrease equation is the following::\n\n N_t / N * (impurity - N_t_R / N_t * right_impurity\n - N_t_L / N_t * left_impurity)\n\nwhere ``N`` is the total number of samples, ``N_t`` is the number of\nsamples at the current node, ``N_t_L`` is the number of samples in the\nleft child, and ``N_t_R`` is the number of samples in the right child.\n\n``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum,\nif ``sample_weight`` is passed.\n\n.. 
versionadded:: 0.19" - } + }, + "refined_type": {} }, { "name": "bootstrap", @@ -59102,7 +62229,8 @@ "docstring": { "type": "bool, default=False", "description": "Whether bootstrap samples are used when building trees. If False, the\nwhole dataset is used to build each tree." - } + }, + "refined_type": {} }, { "name": "oob_score", @@ -59112,7 +62240,8 @@ "docstring": { "type": "bool, default=False", "description": "Whether to use out-of-bag samples to estimate the generalization score.\nOnly available if bootstrap=True." - } + }, + "refined_type": {} }, { "name": "n_jobs", @@ -59122,7 +62251,8 @@ "docstring": { "type": "int, default=None", "description": "The number of jobs to run in parallel. :meth:`fit`, :meth:`predict`,\n:meth:`decision_path` and :meth:`apply` are all parallelized over the\ntrees. ``None`` means 1 unless in a :obj:`joblib.parallel_backend`\ncontext. ``-1`` means using all processors. See :term:`Glossary\n` for more details." - } + }, + "refined_type": {} }, { "name": "random_state", @@ -59132,7 +62262,8 @@ "docstring": { "type": "int, RandomState instance or None, default=None", "description": "Controls 3 sources of randomness:\n\n- the bootstrapping of the samples used when building trees\n (if ``bootstrap=True``)\n- the sampling of the features to consider when looking for the best\n split at each node (if ``max_features < n_features``)\n- the draw of the splits for each of the `max_features`\n\nSee :term:`Glossary ` for details." - } + }, + "refined_type": {} }, { "name": "verbose", @@ -59142,7 +62273,8 @@ "docstring": { "type": "int, default=0", "description": "Controls the verbosity when fitting and predicting." - } + }, + "refined_type": {} }, { "name": "warm_start", @@ -59152,7 +62284,8 @@ "docstring": { "type": "bool, default=False", "description": "When set to ``True``, reuse the solution of the previous call to fit\nand add more estimators to the ensemble, otherwise, just fit a whole\nnew forest. See :term:`the Glossary `." - } + }, + "refined_type": {} }, { "name": "ccp_alpha", @@ -59162,7 +62295,8 @@ "docstring": { "type": "non-negative float, default=0.0", "description": "Complexity parameter used for Minimal Cost-Complexity Pruning. The\nsubtree with the largest cost complexity that is smaller than\n``ccp_alpha`` will be chosen. By default, no pruning is performed. See\n:ref:`minimal_cost_complexity_pruning` for details.\n\n.. versionadded:: 0.22" - } + }, + "refined_type": {} }, { "name": "max_samples", @@ -59172,13 +62306,21 @@ "docstring": { "type": "int or float, default=None", "description": "If bootstrap is True, the number of samples to draw from X\nto train each base estimator.\n\n- If None (default), then draw `X.shape[0]` samples.\n- If int, then draw `max_samples` samples.\n- If float, then draw `max_samples * X.shape[0]` samples. Thus,\n `max_samples` should be in the interval `(0.0, 1.0]`.\n\n.. 
versionadded:: 0.22" + }, + "refined_type": { + "kind": "BoundaryType", + "base_type": "float", + "min": 0.0, + "max": 1.0, + "min_inclusive": false, + "max_inclusive": true } } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, n_estimators=100, *, criterion='squared_error', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0, bootstrap=False, oob_score=False, n_jobs=None, random_state=None, verbose=0, warm_start=False, ccp_alpha=0.0, max_samples=None):\n super().__init__(base_estimator=ExtraTreeRegressor(), n_estimators=n_estimators, estimator_params=('criterion', 'max_depth', 'min_samples_split', 'min_samples_leaf', 'min_weight_fraction_leaf', 'max_features', 'max_leaf_nodes', 'min_impurity_decrease', 'random_state', 'ccp_alpha'), bootstrap=bootstrap, oob_score=oob_score, n_jobs=n_jobs, random_state=random_state, verbose=verbose, warm_start=warm_start, max_samples=max_samples)\n self.criterion = criterion\n self.max_depth = max_depth\n self.min_samples_split = min_samples_split\n self.min_samples_leaf = min_samples_leaf\n self.min_weight_fraction_leaf = min_weight_fraction_leaf\n self.max_features = max_features\n self.max_leaf_nodes = max_leaf_nodes\n self.min_impurity_decrease = min_impurity_decrease\n self.ccp_alpha = ccp_alpha" }, { @@ -59196,7 +62338,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "base_estimator", @@ -59206,7 +62349,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_estimators", @@ -59216,7 +62360,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "estimator_params", @@ -59226,7 +62371,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "bootstrap", @@ -59236,7 +62382,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "oob_score", @@ -59246,7 +62393,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_jobs", @@ -59256,7 +62404,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "random_state", @@ -59266,7 +62415,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "verbose", @@ -59276,7 +62426,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "warm_start", @@ -59286,7 +62437,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "class_weight", @@ -59296,7 +62448,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "max_samples", @@ -59306,13 +62459,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@abstractmethod\ndef __init__(self, base_estimator, n_estimators=100, *, estimator_params=tuple(), bootstrap=False, oob_score=False, n_jobs=None, random_state=None, verbose=0, warm_start=False, class_weight=None, max_samples=None):\n super().__init__(base_estimator, n_estimators=n_estimators, estimator_params=estimator_params, bootstrap=bootstrap, oob_score=oob_score, n_jobs=n_jobs, random_state=random_state, verbose=verbose, warm_start=warm_start, class_weight=class_weight, 
max_samples=max_samples)" }, { @@ -59330,7 +62484,8 @@ "docstring": { "type": "DecisionTreeClassifier object", "description": "A single decision tree classifier." - } + }, + "refined_type": {} }, { "name": "X", @@ -59340,13 +62495,14 @@ "docstring": { "type": "ndarray of shape (n_samples, n_features)", "description": "The OOB samples." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Compute the OOB predictions for an individual tree.", - "docstring": "Compute the OOB predictions for an individual tree.\n\nParameters\n----------\ntree : DecisionTreeClassifier object\n A single decision tree classifier.\nX : ndarray of shape (n_samples, n_features)\n The OOB samples.\n\nReturns\n-------\ny_pred : ndarray of shape (n_samples, n_classes, n_outputs)\n The OOB associated predictions.", + "docstring": "Compute the OOB predictions for an individual tree.\n\n Parameters\n ----------\n tree : DecisionTreeClassifier object\n A single decision tree classifier.\n X : ndarray of shape (n_samples, n_features)\n The OOB samples.\n\n Returns\n -------\n y_pred : ndarray of shape (n_samples, n_classes, n_outputs)\n The OOB associated predictions.\n ", "source_code": "\n@staticmethod\ndef _get_oob_predictions(tree, X):\n \"\"\"Compute the OOB predictions for an individual tree.\n\n Parameters\n ----------\n tree : DecisionTreeClassifier object\n A single decision tree classifier.\n X : ndarray of shape (n_samples, n_features)\n The OOB samples.\n\n Returns\n -------\n y_pred : ndarray of shape (n_samples, n_classes, n_outputs)\n The OOB associated predictions.\n \"\"\"\n y_pred = tree.predict_proba(X, check_input=False)\n y_pred = np.array(y_pred, copy=False)\n if y_pred.ndim == 2:\n y_pred = y_pred[..., np.newaxis]\n else:\n y_pred = np.rollaxis(y_pred, axis=0, start=3)\n return y_pred" }, { @@ -59364,13 +62520,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _more_tags(self):\n return {'multilabel': True}" }, { @@ -59388,7 +62545,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -59398,7 +62556,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "The data matrix." - } + }, + "refined_type": {} }, { "name": "y", @@ -59408,13 +62567,14 @@ "docstring": { "type": "ndarray of shape (n_samples, n_outputs)", "description": "The target matrix." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Compute and set the OOB score and attributes.", - "docstring": "Compute and set the OOB score and attributes.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n The data matrix.\ny : ndarray of shape (n_samples, n_outputs)\n The target matrix.", + "docstring": "Compute and set the OOB score and attributes.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The data matrix.\n y : ndarray of shape (n_samples, n_outputs)\n The target matrix.\n ", "source_code": "\ndef _set_oob_score_and_attributes(self, X, y):\n \"\"\"Compute and set the OOB score and attributes.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The data matrix.\n y : ndarray of shape (n_samples, n_outputs)\n The target matrix.\n \"\"\"\n self.oob_decision_function_ = super()._compute_oob_predictions(X, y)\n if self.oob_decision_function_.shape[-1] == 1:\n self.oob_decision_function_ = self.oob_decision_function_.squeeze(axis=-1)\n self.oob_score_ = accuracy_score(y, np.argmax(self.oob_decision_function_, axis=1))" }, { @@ -59432,7 +62592,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -59442,13 +62603,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _validate_y_class_weight(self, y):\n check_classification_targets(y)\n y = np.copy(y)\n expanded_class_weight = None\n if self.class_weight is not None:\n y_original = np.copy(y)\n self.classes_ = []\n self.n_classes_ = []\n y_store_unique_indices = np.zeros(y.shape, dtype=int)\n for k in range(self.n_outputs_):\n (classes_k, y_store_unique_indices[:, k]) = np.unique(y[:, k], return_inverse=True)\n self.classes_.append(classes_k)\n self.n_classes_.append(classes_k.shape[0])\n y = y_store_unique_indices\n if self.class_weight is not None:\n valid_presets = ('balanced', 'balanced_subsample')\n if isinstance(self.class_weight, str):\n if self.class_weight not in valid_presets:\n raise ValueError('Valid presets for class_weight include \"balanced\" and \"balanced_subsample\".Given \"%s\".' % self.class_weight)\n if self.warm_start:\n warn('class_weight presets \"balanced\" or \"balanced_subsample\" are not recommended for warm_start if the fitted data differs from the full dataset. In order to use \"balanced\" weights, use compute_class_weight (\"balanced\", classes, y). In place of y you can use a large enough sample of the full training set target to properly estimate the class frequency distributions. Pass the resulting weights as the class_weight parameter.')\n if self.class_weight != 'balanced_subsample' or not self.bootstrap:\n if self.class_weight == 'balanced_subsample':\n class_weight = 'balanced'\n else:\n class_weight = self.class_weight\n expanded_class_weight = compute_sample_weight(class_weight, y_original)\n return y, expanded_class_weight" }, { @@ -59466,7 +62628,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -59476,13 +62639,17 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "The input samples. Internally, its dtype will be converted to\n``dtype=np.float32``. If a sparse matrix is provided, it will be\nconverted into a sparse ``csr_matrix``." 
+ }, + "refined_type": { + "kind": "EnumType", + "values": [] } } ], "results": [], "is_public": false, - "description": "Predict class for X.\n\nThe predicted class of an input sample is a vote by the trees in the forest, weighted by their probability estimates. That is, the predicted class is the one with highest mean probability estimate across the trees.", - "docstring": "Predict class for X.\n\nThe predicted class of an input sample is a vote by the trees in\nthe forest, weighted by their probability estimates. That is,\nthe predicted class is the one with highest mean probability\nestimate across the trees.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input samples. Internally, its dtype will be converted to\n ``dtype=np.float32``. If a sparse matrix is provided, it will be\n converted into a sparse ``csr_matrix``.\n\nReturns\n-------\ny : ndarray of shape (n_samples,) or (n_samples, n_outputs)\n The predicted classes.", + "description": "Predict class for X.\n\nThe predicted class of an input sample is a vote by the trees in\nthe forest, weighted by their probability estimates. That is,\nthe predicted class is the one with highest mean probability\nestimate across the trees.", + "docstring": "\n Predict class for X.\n\n The predicted class of an input sample is a vote by the trees in\n the forest, weighted by their probability estimates. That is,\n the predicted class is the one with highest mean probability\n estimate across the trees.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input samples. Internally, its dtype will be converted to\n ``dtype=np.float32``. If a sparse matrix is provided, it will be\n converted into a sparse ``csr_matrix``.\n\n Returns\n -------\n y : ndarray of shape (n_samples,) or (n_samples, n_outputs)\n The predicted classes.\n ", "source_code": "\ndef predict(self, X):\n \"\"\"\n Predict class for X.\n\n The predicted class of an input sample is a vote by the trees in\n the forest, weighted by their probability estimates. That is,\n the predicted class is the one with highest mean probability\n estimate across the trees.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input samples. Internally, its dtype will be converted to\n ``dtype=np.float32``. If a sparse matrix is provided, it will be\n converted into a sparse ``csr_matrix``.\n\n Returns\n -------\n y : ndarray of shape (n_samples,) or (n_samples, n_outputs)\n The predicted classes.\n \"\"\"\n proba = self.predict_proba(X)\n if self.n_outputs_ == 1:\n return self.classes_.take(np.argmax(proba, axis=1), axis=0)\n else:\n n_samples = proba[0].shape[0]\n class_type = self.classes_[0].dtype\n predictions = np.empty((n_samples, self.n_outputs_), dtype=class_type)\n for k in range(self.n_outputs_):\n predictions[:, k] = self.classes_[k].take(np.argmax(proba[k], axis=1), axis=0)\n return predictions" }, { @@ -59500,7 +62667,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -59510,13 +62678,17 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "The input samples. Internally, its dtype will be converted to\n``dtype=np.float32``. If a sparse matrix is provided, it will be\nconverted into a sparse ``csr_matrix``." 
+ }, + "refined_type": { + "kind": "EnumType", + "values": [] } } ], "results": [], "is_public": false, - "description": "Predict class log-probabilities for X.\n\nThe predicted class log-probabilities of an input sample is computed as the log of the mean predicted class probabilities of the trees in the forest.", - "docstring": "Predict class log-probabilities for X.\n\nThe predicted class log-probabilities of an input sample is computed as\nthe log of the mean predicted class probabilities of the trees in the\nforest.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input samples. Internally, its dtype will be converted to\n ``dtype=np.float32``. If a sparse matrix is provided, it will be\n converted into a sparse ``csr_matrix``.\n\nReturns\n-------\np : ndarray of shape (n_samples, n_classes), or a list of such arrays\n The class probabilities of the input samples. The order of the\n classes corresponds to that in the attribute :term:`classes_`.", + "description": "Predict class log-probabilities for X.\n\nThe predicted class log-probabilities of an input sample is computed as\nthe log of the mean predicted class probabilities of the trees in the\nforest.", + "docstring": "\n Predict class log-probabilities for X.\n\n The predicted class log-probabilities of an input sample is computed as\n the log of the mean predicted class probabilities of the trees in the\n forest.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input samples. Internally, its dtype will be converted to\n ``dtype=np.float32``. If a sparse matrix is provided, it will be\n converted into a sparse ``csr_matrix``.\n\n Returns\n -------\n p : ndarray of shape (n_samples, n_classes), or a list of such arrays\n The class probabilities of the input samples. The order of the\n classes corresponds to that in the attribute :term:`classes_`.\n ", "source_code": "\ndef predict_log_proba(self, X):\n \"\"\"\n Predict class log-probabilities for X.\n\n The predicted class log-probabilities of an input sample is computed as\n the log of the mean predicted class probabilities of the trees in the\n forest.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input samples. Internally, its dtype will be converted to\n ``dtype=np.float32``. If a sparse matrix is provided, it will be\n converted into a sparse ``csr_matrix``.\n\n Returns\n -------\n p : ndarray of shape (n_samples, n_classes), or a list of such arrays\n The class probabilities of the input samples. The order of the\n classes corresponds to that in the attribute :term:`classes_`.\n \"\"\"\n proba = self.predict_proba(X)\n if self.n_outputs_ == 1:\n return np.log(proba)\n else:\n for k in range(self.n_outputs_):\n proba[k] = np.log(proba[k])\n return proba" }, { @@ -59534,7 +62706,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -59544,13 +62717,17 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "The input samples. Internally, its dtype will be converted to\n``dtype=np.float32``. If a sparse matrix is provided, it will be\nconverted into a sparse ``csr_matrix``." 
+ }, + "refined_type": { + "kind": "EnumType", + "values": [] } } ], "results": [], "is_public": false, - "description": "Predict class probabilities for X.\n\nThe predicted class probabilities of an input sample are computed as the mean predicted class probabilities of the trees in the forest. The class probability of a single tree is the fraction of samples of the same class in a leaf.", - "docstring": "Predict class probabilities for X.\n\nThe predicted class probabilities of an input sample are computed as\nthe mean predicted class probabilities of the trees in the forest.\nThe class probability of a single tree is the fraction of samples of\nthe same class in a leaf.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input samples. Internally, its dtype will be converted to\n ``dtype=np.float32``. If a sparse matrix is provided, it will be\n converted into a sparse ``csr_matrix``.\n\nReturns\n-------\np : ndarray of shape (n_samples, n_classes), or a list of such arrays\n The class probabilities of the input samples. The order of the\n classes corresponds to that in the attribute :term:`classes_`.", + "description": "Predict class probabilities for X.\n\nThe predicted class probabilities of an input sample are computed as\nthe mean predicted class probabilities of the trees in the forest.\nThe class probability of a single tree is the fraction of samples of\nthe same class in a leaf.", + "docstring": "\n Predict class probabilities for X.\n\n The predicted class probabilities of an input sample are computed as\n the mean predicted class probabilities of the trees in the forest.\n The class probability of a single tree is the fraction of samples of\n the same class in a leaf.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input samples. Internally, its dtype will be converted to\n ``dtype=np.float32``. If a sparse matrix is provided, it will be\n converted into a sparse ``csr_matrix``.\n\n Returns\n -------\n p : ndarray of shape (n_samples, n_classes), or a list of such arrays\n The class probabilities of the input samples. The order of the\n classes corresponds to that in the attribute :term:`classes_`.\n ", "source_code": "\ndef predict_proba(self, X):\n \"\"\"\n Predict class probabilities for X.\n\n The predicted class probabilities of an input sample are computed as\n the mean predicted class probabilities of the trees in the forest.\n The class probability of a single tree is the fraction of samples of\n the same class in a leaf.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input samples. Internally, its dtype will be converted to\n ``dtype=np.float32``. If a sparse matrix is provided, it will be\n converted into a sparse ``csr_matrix``.\n\n Returns\n -------\n p : ndarray of shape (n_samples, n_classes), or a list of such arrays\n The class probabilities of the input samples. 
The order of the\n classes corresponds to that in the attribute :term:`classes_`.\n \"\"\"\n check_is_fitted(self)\n X = self._validate_X_predict(X)\n (n_jobs, _, _) = _partition_estimators(self.n_estimators, self.n_jobs)\n all_proba = [np.zeros((X.shape[0], j), dtype=np.float64) for j in np.atleast_1d(self.n_classes_)]\n lock = threading.Lock()\n Parallel(n_jobs=n_jobs, verbose=self.verbose, **_joblib_parallel_args(require='sharedmem'))((delayed(_accumulate_prediction)(e.predict_proba, X, all_proba, lock) for e in self.estimators_))\n for proba in all_proba:\n proba /= len(self.estimators_)\n if len(all_proba) == 1:\n return all_proba[0]\n else:\n return all_proba" }, { @@ -59568,7 +62745,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "base_estimator", @@ -59578,7 +62756,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_estimators", @@ -59588,7 +62767,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "estimator_params", @@ -59598,7 +62778,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "bootstrap", @@ -59608,7 +62789,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "oob_score", @@ -59618,7 +62800,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_jobs", @@ -59628,7 +62811,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "random_state", @@ -59638,7 +62822,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "verbose", @@ -59648,7 +62833,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "warm_start", @@ -59658,7 +62844,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "max_samples", @@ -59668,13 +62855,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@abstractmethod\ndef __init__(self, base_estimator, n_estimators=100, *, estimator_params=tuple(), bootstrap=False, oob_score=False, n_jobs=None, random_state=None, verbose=0, warm_start=False, max_samples=None):\n super().__init__(base_estimator, n_estimators=n_estimators, estimator_params=estimator_params, bootstrap=bootstrap, oob_score=oob_score, n_jobs=n_jobs, random_state=random_state, verbose=verbose, warm_start=warm_start, max_samples=max_samples)" }, { @@ -59692,7 +62880,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "grid", @@ -59702,7 +62891,8 @@ "docstring": { "type": "ndarray of shape (n_samples, n_target_features)", "description": "The grid points on which the partial dependence should be\nevaluated." - } + }, + "refined_type": {} }, { "name": "target_features", @@ -59712,13 +62902,14 @@ "docstring": { "type": "ndarray of shape (n_target_features)", "description": "The set of target features for which the partial dependence\nshould be evaluated." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Fast partial dependence computation.", - "docstring": "Fast partial dependence computation.\n\nParameters\n----------\ngrid : ndarray of shape (n_samples, n_target_features)\n The grid points on which the partial dependence should be\n evaluated.\ntarget_features : ndarray of shape (n_target_features)\n The set of target features for which the partial dependence\n should be evaluated.\n\nReturns\n-------\naveraged_predictions : ndarray of shape (n_samples,)\n The value of the partial dependence function on each grid point.", + "docstring": "Fast partial dependence computation.\n\n Parameters\n ----------\n grid : ndarray of shape (n_samples, n_target_features)\n The grid points on which the partial dependence should be\n evaluated.\n target_features : ndarray of shape (n_target_features)\n The set of target features for which the partial dependence\n should be evaluated.\n\n Returns\n -------\n averaged_predictions : ndarray of shape (n_samples,)\n The value of the partial dependence function on each grid point.\n ", "source_code": "\ndef _compute_partial_dependence_recursion(self, grid, target_features):\n \"\"\"Fast partial dependence computation.\n\n Parameters\n ----------\n grid : ndarray of shape (n_samples, n_target_features)\n The grid points on which the partial dependence should be\n evaluated.\n target_features : ndarray of shape (n_target_features)\n The set of target features for which the partial dependence\n should be evaluated.\n\n Returns\n -------\n averaged_predictions : ndarray of shape (n_samples,)\n The value of the partial dependence function on each grid point.\n \"\"\"\n grid = np.asarray(grid, dtype=DTYPE, order='C')\n averaged_predictions = np.zeros(shape=grid.shape[0], dtype=np.float64, order='C')\n for tree in self.estimators_:\n tree.tree_.compute_partial_dependence(grid, target_features, averaged_predictions)\n averaged_predictions /= len(self.estimators_)\n return averaged_predictions" }, { @@ -59736,7 +62927,8 @@ "docstring": { "type": "DecisionTreeRegressor object", "description": "A single decision tree regressor." - } + }, + "refined_type": {} }, { "name": "X", @@ -59746,13 +62938,14 @@ "docstring": { "type": "ndarray of shape (n_samples, n_features)", "description": "The OOB samples." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Compute the OOB predictions for an individual tree.", - "docstring": "Compute the OOB predictions for an individual tree.\n\nParameters\n----------\ntree : DecisionTreeRegressor object\n A single decision tree regressor.\nX : ndarray of shape (n_samples, n_features)\n The OOB samples.\n\nReturns\n-------\ny_pred : ndarray of shape (n_samples, 1, n_outputs)\n The OOB associated predictions.", + "docstring": "Compute the OOB predictions for an individual tree.\n\n Parameters\n ----------\n tree : DecisionTreeRegressor object\n A single decision tree regressor.\n X : ndarray of shape (n_samples, n_features)\n The OOB samples.\n\n Returns\n -------\n y_pred : ndarray of shape (n_samples, 1, n_outputs)\n The OOB associated predictions.\n ", "source_code": "\n@staticmethod\ndef _get_oob_predictions(tree, X):\n \"\"\"Compute the OOB predictions for an individual tree.\n\n Parameters\n ----------\n tree : DecisionTreeRegressor object\n A single decision tree regressor.\n X : ndarray of shape (n_samples, n_features)\n The OOB samples.\n\n Returns\n -------\n y_pred : ndarray of shape (n_samples, 1, n_outputs)\n The OOB associated predictions.\n \"\"\"\n y_pred = tree.predict(X, check_input=False)\n if y_pred.ndim == 1:\n y_pred = y_pred[:, np.newaxis, np.newaxis]\n else:\n y_pred = y_pred[:, np.newaxis, :]\n return y_pred" }, { @@ -59770,13 +62963,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _more_tags(self):\n return {'multilabel': True}" }, { @@ -59794,7 +62988,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -59804,7 +62999,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "The data matrix." - } + }, + "refined_type": {} }, { "name": "y", @@ -59814,13 +63010,14 @@ "docstring": { "type": "ndarray of shape (n_samples, n_outputs)", "description": "The target matrix." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Compute and set the OOB score and attributes.", - "docstring": "Compute and set the OOB score and attributes.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n The data matrix.\ny : ndarray of shape (n_samples, n_outputs)\n The target matrix.", + "docstring": "Compute and set the OOB score and attributes.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The data matrix.\n y : ndarray of shape (n_samples, n_outputs)\n The target matrix.\n ", "source_code": "\ndef _set_oob_score_and_attributes(self, X, y):\n \"\"\"Compute and set the OOB score and attributes.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The data matrix.\n y : ndarray of shape (n_samples, n_outputs)\n The target matrix.\n \"\"\"\n self.oob_prediction_ = super()._compute_oob_predictions(X, y).squeeze(axis=1)\n if self.oob_prediction_.shape[-1] == 1:\n self.oob_prediction_ = self.oob_prediction_.squeeze(axis=-1)\n self.oob_score_ = r2_score(y, self.oob_prediction_)" }, { @@ -59838,7 +63035,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -59848,13 +63046,17 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "The input samples. 
Internally, its dtype will be converted to\n``dtype=np.float32``. If a sparse matrix is provided, it will be\nconverted into a sparse ``csr_matrix``." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } } ], "results": [], "is_public": false, - "description": "Predict regression target for X.\n\nThe predicted regression target of an input sample is computed as the mean predicted regression targets of the trees in the forest.", - "docstring": "Predict regression target for X.\n\nThe predicted regression target of an input sample is computed as the\nmean predicted regression targets of the trees in the forest.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input samples. Internally, its dtype will be converted to\n ``dtype=np.float32``. If a sparse matrix is provided, it will be\n converted into a sparse ``csr_matrix``.\n\nReturns\n-------\ny : ndarray of shape (n_samples,) or (n_samples, n_outputs)\n The predicted values.", + "description": "Predict regression target for X.\n\nThe predicted regression target of an input sample is computed as the\nmean predicted regression targets of the trees in the forest.", + "docstring": "\n Predict regression target for X.\n\n The predicted regression target of an input sample is computed as the\n mean predicted regression targets of the trees in the forest.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input samples. Internally, its dtype will be converted to\n ``dtype=np.float32``. If a sparse matrix is provided, it will be\n converted into a sparse ``csr_matrix``.\n\n Returns\n -------\n y : ndarray of shape (n_samples,) or (n_samples, n_outputs)\n The predicted values.\n ", "source_code": "\ndef predict(self, X):\n \"\"\"\n Predict regression target for X.\n\n The predicted regression target of an input sample is computed as the\n mean predicted regression targets of the trees in the forest.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input samples. Internally, its dtype will be converted to\n ``dtype=np.float32``. If a sparse matrix is provided, it will be\n converted into a sparse ``csr_matrix``.\n\n Returns\n -------\n y : ndarray of shape (n_samples,) or (n_samples, n_outputs)\n The predicted values.\n \"\"\"\n check_is_fitted(self)\n X = self._validate_X_predict(X)\n (n_jobs, _, _) = _partition_estimators(self.n_estimators, self.n_jobs)\n if self.n_outputs_ > 1:\n y_hat = np.zeros((X.shape[0], self.n_outputs_), dtype=np.float64)\n else:\n y_hat = np.zeros(X.shape[0], dtype=np.float64)\n lock = threading.Lock()\n Parallel(n_jobs=n_jobs, verbose=self.verbose, **_joblib_parallel_args(require='sharedmem'))((delayed(_accumulate_prediction)(e.predict, X, [y_hat], lock) for e in self.estimators_))\n y_hat /= len(self.estimators_)\n return y_hat" }, { @@ -59872,7 +63074,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_estimators", @@ -59882,7 +63085,8 @@ "docstring": { "type": "int, default=100", "description": "The number of trees in the forest.\n\n.. versionchanged:: 0.22\n The default value of ``n_estimators`` changed from 10 to 100\n in 0.22." - } + }, + "refined_type": {} }, { "name": "criterion", @@ -59892,6 +63096,10 @@ "docstring": { "type": "{\"gini\", \"entropy\"}, default=\"gini\"", "description": "The function to measure the quality of a split. 
Supported criteria are\n\"gini\" for the Gini impurity and \"entropy\" for the information gain.\nNote: this parameter is tree-specific." + }, + "refined_type": { + "kind": "EnumType", + "values": ["gini", "entropy"] } }, { @@ -59902,7 +63110,8 @@ "docstring": { "type": "int, default=None", "description": "The maximum depth of the tree. If None, then nodes are expanded until\nall leaves are pure or until all leaves contain less than\nmin_samples_split samples." - } + }, + "refined_type": {} }, { "name": "min_samples_split", @@ -59912,7 +63121,8 @@ "docstring": { "type": "int or float, default=2", "description": "The minimum number of samples required to split an internal node:\n\n- If int, then consider `min_samples_split` as the minimum number.\n- If float, then `min_samples_split` is a fraction and\n `ceil(min_samples_split * n_samples)` are the minimum\n number of samples for each split.\n\n.. versionchanged:: 0.18\n Added float values for fractions." - } + }, + "refined_type": {} }, { "name": "min_samples_leaf", @@ -59922,7 +63132,8 @@ "docstring": { "type": "int or float, default=1", "description": "The minimum number of samples required to be at a leaf node.\nA split point at any depth will only be considered if it leaves at\nleast ``min_samples_leaf`` training samples in each of the left and\nright branches. This may have the effect of smoothing the model,\nespecially in regression.\n\n- If int, then consider `min_samples_leaf` as the minimum number.\n- If float, then `min_samples_leaf` is a fraction and\n `ceil(min_samples_leaf * n_samples)` are the minimum\n number of samples for each node.\n\n.. versionchanged:: 0.18\n Added float values for fractions." - } + }, + "refined_type": {} }, { "name": "min_weight_fraction_leaf", @@ -59932,7 +63143,8 @@ "docstring": { "type": "float, default=0.0", "description": "The minimum weighted fraction of the sum total of weights (of all\nthe input samples) required to be at a leaf node. Samples have\nequal weight when sample_weight is not provided." - } + }, + "refined_type": {} }, { "name": "max_features", @@ -59942,6 +63154,10 @@ "docstring": { "type": "{\"auto\", \"sqrt\", \"log2\"}, int or float, default=\"auto\"", "description": "The number of features to consider when looking for the best split:\n\n- If int, then consider `max_features` features at each split.\n- If float, then `max_features` is a fraction and\n `round(max_features * n_features)` features are considered at each\n split.\n- If \"auto\", then `max_features=sqrt(n_features)`.\n- If \"sqrt\", then `max_features=sqrt(n_features)` (same as \"auto\").\n- If \"log2\", then `max_features=log2(n_features)`.\n- If None, then `max_features=n_features`.\n\nNote: the search for a split does not stop until at least one\nvalid partition of the node samples is found, even if it requires to\neffectively inspect more than ``max_features`` features." + }, + "refined_type": { + "kind": "EnumType", + "values": ["auto", "sqrt", "log2"] } }, { @@ -59952,7 +63168,8 @@ "docstring": { "type": "int, default=None", "description": "Grow trees with ``max_leaf_nodes`` in best-first fashion.\nBest nodes are defined as relative reduction in impurity.\nIf None then unlimited number of leaf nodes." 
- } + }, + "refined_type": {} }, { "name": "min_impurity_decrease", @@ -59962,7 +63179,8 @@ "docstring": { "type": "float, default=0.0", "description": "A node will be split if this split induces a decrease of the impurity\ngreater than or equal to this value.\n\nThe weighted impurity decrease equation is the following::\n\n N_t / N * (impurity - N_t_R / N_t * right_impurity\n - N_t_L / N_t * left_impurity)\n\nwhere ``N`` is the total number of samples, ``N_t`` is the number of\nsamples at the current node, ``N_t_L`` is the number of samples in the\nleft child, and ``N_t_R`` is the number of samples in the right child.\n\n``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum,\nif ``sample_weight`` is passed.\n\n.. versionadded:: 0.19" - } + }, + "refined_type": {} }, { "name": "bootstrap", @@ -59972,7 +63190,8 @@ "docstring": { "type": "bool, default=True", "description": "Whether bootstrap samples are used when building trees. If False, the\nwhole dataset is used to build each tree." - } + }, + "refined_type": {} }, { "name": "oob_score", @@ -59982,7 +63201,8 @@ "docstring": { "type": "bool, default=False", "description": "Whether to use out-of-bag samples to estimate the generalization score.\nOnly available if bootstrap=True." - } + }, + "refined_type": {} }, { "name": "n_jobs", @@ -59992,7 +63212,8 @@ "docstring": { "type": "int, default=None", "description": "The number of jobs to run in parallel. :meth:`fit`, :meth:`predict`,\n:meth:`decision_path` and :meth:`apply` are all parallelized over the\ntrees. ``None`` means 1 unless in a :obj:`joblib.parallel_backend`\ncontext. ``-1`` means using all processors. See :term:`Glossary\n` for more details." - } + }, + "refined_type": {} }, { "name": "random_state", @@ -60002,7 +63223,8 @@ "docstring": { "type": "int, RandomState instance or None, default=None", "description": "Controls both the randomness of the bootstrapping of the samples used\nwhen building trees (if ``bootstrap=True``) and the sampling of the\nfeatures to consider when looking for the best split at each node\n(if ``max_features < n_features``).\nSee :term:`Glossary ` for details." - } + }, + "refined_type": {} }, { "name": "verbose", @@ -60012,7 +63234,8 @@ "docstring": { "type": "int, default=0", "description": "Controls the verbosity when fitting and predicting." - } + }, + "refined_type": {} }, { "name": "warm_start", @@ -60022,7 +63245,8 @@ "docstring": { "type": "bool, default=False", "description": "When set to ``True``, reuse the solution of the previous call to fit\nand add more estimators to the ensemble, otherwise, just fit a whole\nnew forest. See :term:`the Glossary `." - } + }, + "refined_type": {} }, { "name": "class_weight", @@ -60032,6 +63256,10 @@ "docstring": { "type": "{\"balanced\", \"balanced_subsample\"}, dict or list of dicts, default=None", "description": "Weights associated with classes in the form ``{class_label: weight}``.\nIf not given, all classes are supposed to have weight one. For\nmulti-output problems, a list of dicts can be provided in the same\norder as the columns of y.\n\nNote that for multioutput (including multilabel) weights should be\ndefined for each class of every column in its own dict. 
For example,\nfor four-class multilabel classification weights should be\n[{0: 1, 1: 1}, {0: 1, 1: 5}, {0: 1, 1: 1}, {0: 1, 1: 1}] instead of\n[{1:1}, {2:5}, {3:1}, {4:1}].\n\nThe \"balanced\" mode uses the values of y to automatically adjust\nweights inversely proportional to class frequencies in the input data\nas ``n_samples / (n_classes * np.bincount(y))``\n\nThe \"balanced_subsample\" mode is the same as \"balanced\" except that\nweights are computed based on the bootstrap sample for every tree\ngrown.\n\nFor multi-output, the weights of each column of y will be multiplied.\n\nNote that these weights will be multiplied with sample_weight (passed\nthrough the fit method) if sample_weight is specified." + }, + "refined_type": { + "kind": "EnumType", + "values": ["balanced", "balanced_subsample"] } }, { @@ -60042,7 +63270,8 @@ "docstring": { "type": "non-negative float, default=0.0", "description": "Complexity parameter used for Minimal Cost-Complexity Pruning. The\nsubtree with the largest cost complexity that is smaller than\n``ccp_alpha`` will be chosen. By default, no pruning is performed. See\n:ref:`minimal_cost_complexity_pruning` for details.\n\n.. versionadded:: 0.22" - } + }, + "refined_type": {} }, { "name": "max_samples", @@ -60052,13 +63281,21 @@ "docstring": { "type": "int or float, default=None", "description": "If bootstrap is True, the number of samples to draw from X\nto train each base estimator.\n\n- If None (default), then draw `X.shape[0]` samples.\n- If int, then draw `max_samples` samples.\n- If float, then draw `max_samples * X.shape[0]` samples. Thus,\n `max_samples` should be in the interval `(0.0, 1.0]`.\n\n.. versionadded:: 0.22" + }, + "refined_type": { + "kind": "BoundaryType", + "base_type": "float", + "min": 0.0, + "max": 1.0, + "min_inclusive": false, + "max_inclusive": true } } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, n_estimators=100, *, criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0, bootstrap=True, oob_score=False, n_jobs=None, random_state=None, verbose=0, warm_start=False, class_weight=None, ccp_alpha=0.0, max_samples=None):\n super().__init__(base_estimator=DecisionTreeClassifier(), n_estimators=n_estimators, estimator_params=('criterion', 'max_depth', 'min_samples_split', 'min_samples_leaf', 'min_weight_fraction_leaf', 'max_features', 'max_leaf_nodes', 'min_impurity_decrease', 'random_state', 'ccp_alpha'), bootstrap=bootstrap, oob_score=oob_score, n_jobs=n_jobs, random_state=random_state, verbose=verbose, warm_start=warm_start, class_weight=class_weight, max_samples=max_samples)\n self.criterion = criterion\n self.max_depth = max_depth\n self.min_samples_split = min_samples_split\n self.min_samples_leaf = min_samples_leaf\n self.min_weight_fraction_leaf = min_weight_fraction_leaf\n self.max_features = max_features\n self.max_leaf_nodes = max_leaf_nodes\n self.min_impurity_decrease = min_impurity_decrease\n self.ccp_alpha = ccp_alpha" }, { @@ -60076,7 +63313,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_estimators", @@ -60086,7 +63324,8 @@ "docstring": { "type": "int, default=100", "description": "The number of trees in the forest.\n\n.. versionchanged:: 0.22\n The default value of ``n_estimators`` changed from 10 to 100\n in 0.22." 
- } + }, + "refined_type": {} }, { "name": "criterion", @@ -60096,6 +63335,10 @@ "docstring": { "type": "{\"squared_error\", \"absolute_error\", \"poisson\"}, default=\"squared_error\"", "description": "The function to measure the quality of a split. Supported criteria\nare \"squared_error\" for the mean squared error, which is equal to\nvariance reduction as feature selection criterion, \"absolute_error\"\nfor the mean absolute error, and \"poisson\" which uses reduction in\nPoisson deviance to find splits.\nTraining using \"absolute_error\" is significantly slower\nthan when using \"squared_error\".\n\n.. versionadded:: 0.18\n Mean Absolute Error (MAE) criterion.\n\n.. versionadded:: 1.0\n Poisson criterion.\n\n.. deprecated:: 1.0\n Criterion \"mse\" was deprecated in v1.0 and will be removed in\n version 1.2. Use `criterion=\"squared_error\"` which is equivalent.\n\n.. deprecated:: 1.0\n Criterion \"mae\" was deprecated in v1.0 and will be removed in\n version 1.2. Use `criterion=\"absolute_error\"` which is equivalent." + }, + "refined_type": { + "kind": "EnumType", + "values": ["squared_error", "poisson", "absolute_error"] } }, { @@ -60106,7 +63349,8 @@ "docstring": { "type": "int, default=None", "description": "The maximum depth of the tree. If None, then nodes are expanded until\nall leaves are pure or until all leaves contain less than\nmin_samples_split samples." - } + }, + "refined_type": {} }, { "name": "min_samples_split", @@ -60116,7 +63360,8 @@ "docstring": { "type": "int or float, default=2", "description": "The minimum number of samples required to split an internal node:\n\n- If int, then consider `min_samples_split` as the minimum number.\n- If float, then `min_samples_split` is a fraction and\n `ceil(min_samples_split * n_samples)` are the minimum\n number of samples for each split.\n\n.. versionchanged:: 0.18\n Added float values for fractions." - } + }, + "refined_type": {} }, { "name": "min_samples_leaf", @@ -60126,7 +63371,8 @@ "docstring": { "type": "int or float, default=1", "description": "The minimum number of samples required to be at a leaf node.\nA split point at any depth will only be considered if it leaves at\nleast ``min_samples_leaf`` training samples in each of the left and\nright branches. This may have the effect of smoothing the model,\nespecially in regression.\n\n- If int, then consider `min_samples_leaf` as the minimum number.\n- If float, then `min_samples_leaf` is a fraction and\n `ceil(min_samples_leaf * n_samples)` are the minimum\n number of samples for each node.\n\n.. versionchanged:: 0.18\n Added float values for fractions." - } + }, + "refined_type": {} }, { "name": "min_weight_fraction_leaf", @@ -60136,7 +63382,8 @@ "docstring": { "type": "float, default=0.0", "description": "The minimum weighted fraction of the sum total of weights (of all\nthe input samples) required to be at a leaf node. Samples have\nequal weight when sample_weight is not provided." 
- } + }, + "refined_type": {} }, { "name": "max_features", @@ -60146,6 +63393,10 @@ "docstring": { "type": "{\"auto\", \"sqrt\", \"log2\"}, int or float, default=\"auto\"", "description": "The number of features to consider when looking for the best split:\n\n- If int, then consider `max_features` features at each split.\n- If float, then `max_features` is a fraction and\n `round(max_features * n_features)` features are considered at each\n split.\n- If \"auto\", then `max_features=n_features`.\n- If \"sqrt\", then `max_features=sqrt(n_features)`.\n- If \"log2\", then `max_features=log2(n_features)`.\n- If None, then `max_features=n_features`.\n\nNote: the search for a split does not stop until at least one\nvalid partition of the node samples is found, even if it requires to\neffectively inspect more than ``max_features`` features." + }, + "refined_type": { + "kind": "EnumType", + "values": ["auto", "sqrt", "log2"] } }, { @@ -60156,7 +63407,8 @@ "docstring": { "type": "int, default=None", "description": "Grow trees with ``max_leaf_nodes`` in best-first fashion.\nBest nodes are defined as relative reduction in impurity.\nIf None then unlimited number of leaf nodes." - } + }, + "refined_type": {} }, { "name": "min_impurity_decrease", @@ -60166,7 +63418,8 @@ "docstring": { "type": "float, default=0.0", "description": "A node will be split if this split induces a decrease of the impurity\ngreater than or equal to this value.\n\nThe weighted impurity decrease equation is the following::\n\n N_t / N * (impurity - N_t_R / N_t * right_impurity\n - N_t_L / N_t * left_impurity)\n\nwhere ``N`` is the total number of samples, ``N_t`` is the number of\nsamples at the current node, ``N_t_L`` is the number of samples in the\nleft child, and ``N_t_R`` is the number of samples in the right child.\n\n``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum,\nif ``sample_weight`` is passed.\n\n.. versionadded:: 0.19" - } + }, + "refined_type": {} }, { "name": "bootstrap", @@ -60176,7 +63429,8 @@ "docstring": { "type": "bool, default=True", "description": "Whether bootstrap samples are used when building trees. If False, the\nwhole dataset is used to build each tree." - } + }, + "refined_type": {} }, { "name": "oob_score", @@ -60186,7 +63440,8 @@ "docstring": { "type": "bool, default=False", "description": "Whether to use out-of-bag samples to estimate the generalization score.\nOnly available if bootstrap=True." - } + }, + "refined_type": {} }, { "name": "n_jobs", @@ -60196,7 +63451,8 @@ "docstring": { "type": "int, default=None", "description": "The number of jobs to run in parallel. :meth:`fit`, :meth:`predict`,\n:meth:`decision_path` and :meth:`apply` are all parallelized over the\ntrees. ``None`` means 1 unless in a :obj:`joblib.parallel_backend`\ncontext. ``-1`` means using all processors. See :term:`Glossary\n` for more details." - } + }, + "refined_type": {} }, { "name": "random_state", @@ -60206,7 +63462,8 @@ "docstring": { "type": "int, RandomState instance or None, default=None", "description": "Controls both the randomness of the bootstrapping of the samples used\nwhen building trees (if ``bootstrap=True``) and the sampling of the\nfeatures to consider when looking for the best split at each node\n(if ``max_features < n_features``).\nSee :term:`Glossary ` for details." - } + }, + "refined_type": {} }, { "name": "verbose", @@ -60216,7 +63473,8 @@ "docstring": { "type": "int, default=0", "description": "Controls the verbosity when fitting and predicting." 
- } + }, + "refined_type": {} }, { "name": "warm_start", @@ -60226,7 +63484,8 @@ "docstring": { "type": "bool, default=False", "description": "When set to ``True``, reuse the solution of the previous call to fit\nand add more estimators to the ensemble, otherwise, just fit a whole\nnew forest. See :term:`the Glossary `." - } + }, + "refined_type": {} }, { "name": "ccp_alpha", @@ -60236,7 +63495,8 @@ "docstring": { "type": "non-negative float, default=0.0", "description": "Complexity parameter used for Minimal Cost-Complexity Pruning. The\nsubtree with the largest cost complexity that is smaller than\n``ccp_alpha`` will be chosen. By default, no pruning is performed. See\n:ref:`minimal_cost_complexity_pruning` for details.\n\n.. versionadded:: 0.22" - } + }, + "refined_type": {} }, { "name": "max_samples", @@ -60246,13 +63506,21 @@ "docstring": { "type": "int or float, default=None", "description": "If bootstrap is True, the number of samples to draw from X\nto train each base estimator.\n\n- If None (default), then draw `X.shape[0]` samples.\n- If int, then draw `max_samples` samples.\n- If float, then draw `max_samples * X.shape[0]` samples. Thus,\n `max_samples` should be in the interval `(0.0, 1.0]`.\n\n.. versionadded:: 0.22" + }, + "refined_type": { + "kind": "BoundaryType", + "base_type": "float", + "min": 0.0, + "max": 1.0, + "min_inclusive": false, + "max_inclusive": true } } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, n_estimators=100, *, criterion='squared_error', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0, bootstrap=True, oob_score=False, n_jobs=None, random_state=None, verbose=0, warm_start=False, ccp_alpha=0.0, max_samples=None):\n super().__init__(base_estimator=DecisionTreeRegressor(), n_estimators=n_estimators, estimator_params=('criterion', 'max_depth', 'min_samples_split', 'min_samples_leaf', 'min_weight_fraction_leaf', 'max_features', 'max_leaf_nodes', 'min_impurity_decrease', 'random_state', 'ccp_alpha'), bootstrap=bootstrap, oob_score=oob_score, n_jobs=n_jobs, random_state=random_state, verbose=verbose, warm_start=warm_start, max_samples=max_samples)\n self.criterion = criterion\n self.max_depth = max_depth\n self.min_samples_split = min_samples_split\n self.min_samples_leaf = min_samples_leaf\n self.min_weight_fraction_leaf = min_weight_fraction_leaf\n self.max_features = max_features\n self.max_leaf_nodes = max_leaf_nodes\n self.min_impurity_decrease = min_impurity_decrease\n self.ccp_alpha = ccp_alpha" }, { @@ -60270,7 +63538,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_estimators", @@ -60280,7 +63549,8 @@ "docstring": { "type": "int, default=100", "description": "Number of trees in the forest.\n\n.. versionchanged:: 0.22\n The default value of ``n_estimators`` changed from 10 to 100\n in 0.22." - } + }, + "refined_type": {} }, { "name": "max_depth", @@ -60290,7 +63560,8 @@ "docstring": { "type": "int, default=5", "description": "The maximum depth of each tree. If None, then nodes are expanded until\nall leaves are pure or until all leaves contain less than\nmin_samples_split samples." 
- } + }, + "refined_type": {} }, { "name": "min_samples_split", @@ -60300,7 +63571,8 @@ "docstring": { "type": "int or float, default=2", "description": "The minimum number of samples required to split an internal node:\n\n- If int, then consider `min_samples_split` as the minimum number.\n- If float, then `min_samples_split` is a fraction and\n `ceil(min_samples_split * n_samples)` is the minimum\n number of samples for each split.\n\n.. versionchanged:: 0.18\n Added float values for fractions." - } + }, + "refined_type": {} }, { "name": "min_samples_leaf", @@ -60310,7 +63582,8 @@ "docstring": { "type": "int or float, default=1", "description": "The minimum number of samples required to be at a leaf node.\nA split point at any depth will only be considered if it leaves at\nleast ``min_samples_leaf`` training samples in each of the left and\nright branches. This may have the effect of smoothing the model,\nespecially in regression.\n\n- If int, then consider `min_samples_leaf` as the minimum number.\n- If float, then `min_samples_leaf` is a fraction and\n `ceil(min_samples_leaf * n_samples)` is the minimum\n number of samples for each node.\n\n.. versionchanged:: 0.18\n Added float values for fractions." - } + }, + "refined_type": {} }, { "name": "min_weight_fraction_leaf", @@ -60320,7 +63593,8 @@ "docstring": { "type": "float, default=0.0", "description": "The minimum weighted fraction of the sum total of weights (of all\nthe input samples) required to be at a leaf node. Samples have\nequal weight when sample_weight is not provided." - } + }, + "refined_type": {} }, { "name": "max_leaf_nodes", @@ -60330,7 +63604,8 @@ "docstring": { "type": "int, default=None", "description": "Grow trees with ``max_leaf_nodes`` in best-first fashion.\nBest nodes are defined as relative reduction in impurity.\nIf None then unlimited number of leaf nodes." - } + }, + "refined_type": {} }, { "name": "min_impurity_decrease", @@ -60340,7 +63615,8 @@ "docstring": { "type": "float, default=0.0", "description": "A node will be split if this split induces a decrease of the impurity\ngreater than or equal to this value.\n\nThe weighted impurity decrease equation is the following::\n\n N_t / N * (impurity - N_t_R / N_t * right_impurity\n - N_t_L / N_t * left_impurity)\n\nwhere ``N`` is the total number of samples, ``N_t`` is the number of\nsamples at the current node, ``N_t_L`` is the number of samples in the\nleft child, and ``N_t_R`` is the number of samples in the right child.\n\n``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum,\nif ``sample_weight`` is passed.\n\n.. versionadded:: 0.19" - } + }, + "refined_type": {} }, { "name": "sparse_output", @@ -60350,7 +63626,8 @@ "docstring": { "type": "bool, default=True", "description": "Whether or not to return a sparse CSR matrix, as default behavior,\nor to return a dense array compatible with dense pipeline operators." - } + }, + "refined_type": {} }, { "name": "n_jobs", @@ -60360,7 +63637,8 @@ "docstring": { "type": "int, default=None", "description": "The number of jobs to run in parallel. :meth:`fit`, :meth:`transform`,\n:meth:`decision_path` and :meth:`apply` are all parallelized over the\ntrees. ``None`` means 1 unless in a :obj:`joblib.parallel_backend`\ncontext. ``-1`` means using all processors. See :term:`Glossary\n` for more details." 
- } + }, + "refined_type": {} }, { "name": "random_state", @@ -60370,7 +63648,8 @@ "docstring": { "type": "int, RandomState instance or None, default=None", "description": "Controls the generation of the random `y` used to fit the trees\nand the draw of the splits for each feature at the trees' nodes.\nSee :term:`Glossary ` for details." - } + }, + "refined_type": {} }, { "name": "verbose", @@ -60380,7 +63659,8 @@ "docstring": { "type": "int, default=0", "description": "Controls the verbosity when fitting and predicting." - } + }, + "refined_type": {} }, { "name": "warm_start", @@ -60390,13 +63670,14 @@ "docstring": { "type": "bool, default=False", "description": "When set to ``True``, reuse the solution of the previous call to fit\nand add more estimators to the ensemble, otherwise, just fit a whole\nnew forest. See :term:`the Glossary `." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, n_estimators=100, *, max_depth=5, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_leaf_nodes=None, min_impurity_decrease=0.0, sparse_output=True, n_jobs=None, random_state=None, verbose=0, warm_start=False):\n super().__init__(base_estimator=ExtraTreeRegressor(), n_estimators=n_estimators, estimator_params=('criterion', 'max_depth', 'min_samples_split', 'min_samples_leaf', 'min_weight_fraction_leaf', 'max_features', 'max_leaf_nodes', 'min_impurity_decrease', 'random_state'), bootstrap=False, oob_score=False, n_jobs=n_jobs, random_state=random_state, verbose=verbose, warm_start=warm_start, max_samples=None)\n self.max_depth = max_depth\n self.min_samples_split = min_samples_split\n self.min_samples_leaf = min_samples_leaf\n self.min_weight_fraction_leaf = min_weight_fraction_leaf\n self.max_leaf_nodes = max_leaf_nodes\n self.min_impurity_decrease = min_impurity_decrease\n self.sparse_output = sparse_output" }, { @@ -60414,7 +63695,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -60424,7 +63706,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -60434,13 +63717,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _set_oob_score_and_attributes(self, X, y):\n raise NotImplementedError('OOB score not supported by tree embedding')" }, { @@ -60458,7 +63742,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -60468,6 +63753,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "The input samples. Use ``dtype=np.float32`` for maximum\nefficiency. Sparse matrices are also supported, use sparse\n``csc_matrix`` for maximum efficiency." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -60478,7 +63767,8 @@ "docstring": { "type": "Ignored", "description": "Not used, present for API consistency by convention." - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -60488,13 +63778,14 @@ "docstring": { "type": "array-like of shape (n_samples,), default=None", "description": "Sample weights. If None, then samples are equally weighted. Splits\nthat would create child nodes with net zero or negative weight are\nignored while searching for a split in each node. 
In the case of\nclassification, splits are also ignored if they would result in any\nsingle class carrying a negative weight in either child node." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Fit estimator.", - "docstring": "Fit estimator.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input samples. Use ``dtype=np.float32`` for maximum\n efficiency. Sparse matrices are also supported, use sparse\n ``csc_matrix`` for maximum efficiency.\n\ny : Ignored\n Not used, present for API consistency by convention.\n\nsample_weight : array-like of shape (n_samples,), default=None\n Sample weights. If None, then samples are equally weighted. Splits\n that would create child nodes with net zero or negative weight are\n ignored while searching for a split in each node. In the case of\n classification, splits are also ignored if they would result in any\n single class carrying a negative weight in either child node.\n\nReturns\n-------\nself : object\n Returns the instance itself.", + "docstring": "\n Fit estimator.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input samples. Use ``dtype=np.float32`` for maximum\n efficiency. Sparse matrices are also supported, use sparse\n ``csc_matrix`` for maximum efficiency.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights. If None, then samples are equally weighted. Splits\n that would create child nodes with net zero or negative weight are\n ignored while searching for a split in each node. In the case of\n classification, splits are also ignored if they would result in any\n single class carrying a negative weight in either child node.\n\n Returns\n -------\n self : object\n Returns the instance itself.\n ", "source_code": "\ndef fit(self, X, y=None, sample_weight=None):\n \"\"\"\n Fit estimator.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input samples. Use ``dtype=np.float32`` for maximum\n efficiency. Sparse matrices are also supported, use sparse\n ``csc_matrix`` for maximum efficiency.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights. If None, then samples are equally weighted. Splits\n that would create child nodes with net zero or negative weight are\n ignored while searching for a split in each node. In the case of\n classification, splits are also ignored if they would result in any\n single class carrying a negative weight in either child node.\n\n Returns\n -------\n self : object\n Returns the instance itself.\n \"\"\"\n self.fit_transform(X, y, sample_weight=sample_weight)\n return self" }, { @@ -60512,7 +63803,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -60522,6 +63814,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "Input data used to build forests. Use ``dtype=np.float32`` for\nmaximum efficiency." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -60532,7 +63828,8 @@ "docstring": { "type": "Ignored", "description": "Not used, present for API consistency by convention." 
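Editor's note: the `fit` method above (and the `fit_transform` it delegates to) trains completely random trees on synthetic targets and one-hot encodes the leaf each sample lands in; the records appear to belong to scikit-learn's RandomTreesEmbedding. A minimal usage sketch of that behaviour:

import numpy as np
from sklearn.ensemble import RandomTreesEmbedding

X = np.random.RandomState(0).rand(100, 4)

embedder = RandomTreesEmbedding(n_estimators=10, max_depth=3, random_state=0)
X_leaves = embedder.fit_transform(X)  # sparse CSR by default (sparse_output=True)

# one active leaf per tree, so every row has exactly n_estimators non-zeros
print(X_leaves.shape, X_leaves[0].nnz)
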
- } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -60542,13 +63839,14 @@ "docstring": { "type": "array-like of shape (n_samples,), default=None", "description": "Sample weights. If None, then samples are equally weighted. Splits\nthat would create child nodes with net zero or negative weight are\nignored while searching for a split in each node. In the case of\nclassification, splits are also ignored if they would result in any\nsingle class carrying a negative weight in either child node." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Fit estimator and transform dataset.", - "docstring": "Fit estimator and transform dataset.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n Input data used to build forests. Use ``dtype=np.float32`` for\n maximum efficiency.\n\ny : Ignored\n Not used, present for API consistency by convention.\n\nsample_weight : array-like of shape (n_samples,), default=None\n Sample weights. If None, then samples are equally weighted. Splits\n that would create child nodes with net zero or negative weight are\n ignored while searching for a split in each node. In the case of\n classification, splits are also ignored if they would result in any\n single class carrying a negative weight in either child node.\n\nReturns\n-------\nX_transformed : sparse matrix of shape (n_samples, n_out)\n Transformed dataset.", + "docstring": "\n Fit estimator and transform dataset.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Input data used to build forests. Use ``dtype=np.float32`` for\n maximum efficiency.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights. If None, then samples are equally weighted. Splits\n that would create child nodes with net zero or negative weight are\n ignored while searching for a split in each node. In the case of\n classification, splits are also ignored if they would result in any\n single class carrying a negative weight in either child node.\n\n Returns\n -------\n X_transformed : sparse matrix of shape (n_samples, n_out)\n Transformed dataset.\n ", "source_code": "\ndef fit_transform(self, X, y=None, sample_weight=None):\n \"\"\"\n Fit estimator and transform dataset.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Input data used to build forests. Use ``dtype=np.float32`` for\n maximum efficiency.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights. If None, then samples are equally weighted. Splits\n that would create child nodes with net zero or negative weight are\n ignored while searching for a split in each node. 
In the case of\n classification, splits are also ignored if they would result in any\n single class carrying a negative weight in either child node.\n\n Returns\n -------\n X_transformed : sparse matrix of shape (n_samples, n_out)\n Transformed dataset.\n \"\"\"\n rnd = check_random_state(self.random_state)\n y = rnd.uniform(size=_num_samples(X))\n super().fit(X, y, sample_weight=sample_weight)\n self.one_hot_encoder_ = OneHotEncoder(sparse=self.sparse_output)\n return self.one_hot_encoder_.fit_transform(self.apply(X))" }, { @@ -60566,7 +63864,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -60576,13 +63875,17 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "Input data to be transformed. Use ``dtype=np.float32`` for maximum\nefficiency. Sparse matrices are also supported, use sparse\n``csr_matrix`` for maximum efficiency." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } } ], "results": [], "is_public": true, "description": "Transform dataset.", - "docstring": "Transform dataset.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n Input data to be transformed. Use ``dtype=np.float32`` for maximum\n efficiency. Sparse matrices are also supported, use sparse\n ``csr_matrix`` for maximum efficiency.\n\nReturns\n-------\nX_transformed : sparse matrix of shape (n_samples, n_out)\n Transformed dataset.", + "docstring": "\n Transform dataset.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Input data to be transformed. Use ``dtype=np.float32`` for maximum\n efficiency. Sparse matrices are also supported, use sparse\n ``csr_matrix`` for maximum efficiency.\n\n Returns\n -------\n X_transformed : sparse matrix of shape (n_samples, n_out)\n Transformed dataset.\n ", "source_code": "\ndef transform(self, X):\n \"\"\"\n Transform dataset.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Input data to be transformed. Use ``dtype=np.float32`` for maximum\n efficiency. 
Sparse matrices are also supported, use sparse\n ``csr_matrix`` for maximum efficiency.\n\n Returns\n -------\n X_transformed : sparse matrix of shape (n_samples, n_out)\n Transformed dataset.\n \"\"\"\n check_is_fitted(self)\n return self.one_hot_encoder_.transform(self.apply(X))" }, { @@ -60600,7 +63903,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -60610,7 +63914,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "out", @@ -60620,7 +63925,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "lock", @@ -60630,13 +63936,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "This is a utility function for joblib's Parallel.\n\nIt can't go locally in ForestClassifier or ForestRegressor, because joblib complains that it cannot pickle it when placed there.", - "docstring": "This is a utility function for joblib's Parallel.\n\nIt can't go locally in ForestClassifier or ForestRegressor, because joblib\ncomplains that it cannot pickle it when placed there.", + "description": "This is a utility function for joblib's Parallel.\n\nIt can't go locally in ForestClassifier or ForestRegressor, because joblib\ncomplains that it cannot pickle it when placed there.", + "docstring": "\n This is a utility function for joblib's Parallel.\n\n It can't go locally in ForestClassifier or ForestRegressor, because joblib\n complains that it cannot pickle it when placed there.\n ", "source_code": "\ndef _accumulate_prediction(predict, X, out, lock):\n \"\"\"\n This is a utility function for joblib's Parallel.\n\n It can't go locally in ForestClassifier or ForestRegressor, because joblib\n complains that it cannot pickle it when placed there.\n \"\"\"\n prediction = predict(X, check_input=False)\n with lock:\n if len(out) == 1:\n out[0] += prediction\n else:\n for i in range(len(out)):\n out[i] += prediction[i]" }, { @@ -60654,7 +63961,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_samples", @@ -60664,7 +63972,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_samples_bootstrap", @@ -60674,13 +63983,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Private function used to _parallel_build_trees function.", - "docstring": "Private function used to _parallel_build_trees function.", + "docstring": "\n Private function used to _parallel_build_trees function.", "source_code": "\ndef _generate_sample_indices(random_state, n_samples, n_samples_bootstrap):\n \"\"\"\n Private function used to _parallel_build_trees function.\"\"\"\n random_instance = check_random_state(random_state)\n sample_indices = random_instance.randint(0, n_samples, n_samples_bootstrap)\n return sample_indices" }, { @@ -60698,7 +64008,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_samples", @@ -60708,7 +64019,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_samples_bootstrap", @@ -60718,13 +64030,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Private function used to forest._set_oob_score function.", - "docstring": "Private function used to forest._set_oob_score function.", + 
"docstring": "\n Private function used to forest._set_oob_score function.", "source_code": "\ndef _generate_unsampled_indices(random_state, n_samples, n_samples_bootstrap):\n \"\"\"\n Private function used to forest._set_oob_score function.\"\"\"\n sample_indices = _generate_sample_indices(random_state, n_samples, n_samples_bootstrap)\n sample_counts = np.bincount(sample_indices, minlength=n_samples)\n unsampled_mask = sample_counts == 0\n indices_range = np.arange(n_samples)\n unsampled_indices = indices_range[unsampled_mask]\n return unsampled_indices" }, { @@ -60742,7 +64055,8 @@ "docstring": { "type": "int", "description": "Number of samples in the dataset." - } + }, + "refined_type": {} }, { "name": "max_samples", @@ -60752,13 +64066,14 @@ "docstring": { "type": "int or float", "description": "The maximum number of samples to draw from the total available:\n - if float, this indicates a fraction of the total and should be\n the interval `(0.0, 1.0]`;\n - if int, this indicates the exact number of samples;\n - if None, this indicates the total number of samples." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Get the number of samples in a bootstrap sample.", - "docstring": "Get the number of samples in a bootstrap sample.\n\nParameters\n----------\nn_samples : int\n Number of samples in the dataset.\nmax_samples : int or float\n The maximum number of samples to draw from the total available:\n - if float, this indicates a fraction of the total and should be\n the interval `(0.0, 1.0]`;\n - if int, this indicates the exact number of samples;\n - if None, this indicates the total number of samples.\n\nReturns\n-------\nn_samples_bootstrap : int\n The total number of samples to draw for the bootstrap sample.", + "docstring": "\n Get the number of samples in a bootstrap sample.\n\n Parameters\n ----------\n n_samples : int\n Number of samples in the dataset.\n max_samples : int or float\n The maximum number of samples to draw from the total available:\n - if float, this indicates a fraction of the total and should be\n the interval `(0.0, 1.0]`;\n - if int, this indicates the exact number of samples;\n - if None, this indicates the total number of samples.\n\n Returns\n -------\n n_samples_bootstrap : int\n The total number of samples to draw for the bootstrap sample.\n ", "source_code": "\ndef _get_n_samples_bootstrap(n_samples, max_samples):\n \"\"\"\n Get the number of samples in a bootstrap sample.\n\n Parameters\n ----------\n n_samples : int\n Number of samples in the dataset.\n max_samples : int or float\n The maximum number of samples to draw from the total available:\n - if float, this indicates a fraction of the total and should be\n the interval `(0.0, 1.0]`;\n - if int, this indicates the exact number of samples;\n - if None, this indicates the total number of samples.\n\n Returns\n -------\n n_samples_bootstrap : int\n The total number of samples to draw for the bootstrap sample.\n \"\"\"\n if max_samples is None:\n return n_samples\n if isinstance(max_samples, numbers.Integral):\n if not 1 <= max_samples <= n_samples:\n msg = '`max_samples` must be in range 1 to {} but got value {}'\n raise ValueError(msg.format(n_samples, max_samples))\n return max_samples\n if isinstance(max_samples, numbers.Real):\n if not 0 < max_samples <= 1:\n msg = '`max_samples` must be in range (0.0, 1.0] but got value {}'\n raise ValueError(msg.format(max_samples))\n return round(n_samples * max_samples)\n msg = \"`max_samples` should be int or float, but got 
type '{}'\"\n raise TypeError(msg.format(type(max_samples)))" }, { @@ -60776,7 +64091,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "forest", @@ -60786,7 +64102,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -60796,7 +64113,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -60806,7 +64124,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -60816,7 +64135,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "tree_idx", @@ -60826,7 +64146,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_trees", @@ -60836,7 +64157,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "verbose", @@ -60846,7 +64168,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "class_weight", @@ -60856,7 +64179,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_samples_bootstrap", @@ -60866,13 +64190,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Private function used to fit a single tree in parallel.", - "docstring": "Private function used to fit a single tree in parallel.", + "docstring": "\n Private function used to fit a single tree in parallel.", "source_code": "\ndef _parallel_build_trees(tree, forest, X, y, sample_weight, tree_idx, n_trees, verbose=0, class_weight=None, n_samples_bootstrap=None):\n \"\"\"\n Private function used to fit a single tree in parallel.\"\"\"\n if verbose > 1:\n print('building tree %d of %d' % (tree_idx + 1, n_trees))\n if forest.bootstrap:\n n_samples = X.shape[0]\n if sample_weight is None:\n curr_sample_weight = np.ones((n_samples, ), dtype=np.float64)\n else:\n curr_sample_weight = sample_weight.copy()\n indices = _generate_sample_indices(tree.random_state, n_samples, n_samples_bootstrap)\n sample_counts = np.bincount(indices, minlength=n_samples)\n curr_sample_weight *= sample_counts\n if class_weight == 'subsample':\n with catch_warnings():\n simplefilter('ignore', DeprecationWarning)\n curr_sample_weight *= compute_sample_weight('auto', y, indices=indices)\n elif class_weight == 'balanced_subsample':\n curr_sample_weight *= compute_sample_weight('balanced', y, indices=indices)\n tree.fit(X, y, sample_weight=curr_sample_weight, check_input=False)\n else:\n tree.fit(X, y, sample_weight=sample_weight, check_input=False)\n return tree" }, { @@ -60890,7 +64215,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "loss", @@ -60900,7 +64226,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "learning_rate", @@ -60910,7 +64237,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_estimators", @@ -60920,7 +64248,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "criterion", @@ -60930,7 +64259,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "min_samples_split", @@ -60940,7 +64270,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "min_samples_leaf", @@ -60950,7 +64281,8 @@ "docstring": { "type": "", "description": "" - } + }, + 
"refined_type": {} }, { "name": "min_weight_fraction_leaf", @@ -60960,7 +64292,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "max_depth", @@ -60970,7 +64303,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "min_impurity_decrease", @@ -60980,7 +64314,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "init", @@ -60990,7 +64325,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "subsample", @@ -61000,7 +64336,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "max_features", @@ -61010,7 +64347,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "ccp_alpha", @@ -61020,7 +64358,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "random_state", @@ -61030,7 +64369,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "alpha", @@ -61040,7 +64380,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "verbose", @@ -61050,7 +64391,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "max_leaf_nodes", @@ -61060,7 +64402,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "warm_start", @@ -61070,7 +64413,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "validation_fraction", @@ -61080,7 +64424,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_iter_no_change", @@ -61090,7 +64435,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "tol", @@ -61100,13 +64446,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@abstractmethod\ndef __init__(self, *, loss, learning_rate, n_estimators, criterion, min_samples_split, min_samples_leaf, min_weight_fraction_leaf, max_depth, min_impurity_decrease, init, subsample, max_features, ccp_alpha, random_state, alpha=0.9, verbose=0, max_leaf_nodes=None, warm_start=False, validation_fraction=0.1, n_iter_no_change=None, tol=0.0001):\n self.n_estimators = n_estimators\n self.learning_rate = learning_rate\n self.loss = loss\n self.criterion = criterion\n self.min_samples_split = min_samples_split\n self.min_samples_leaf = min_samples_leaf\n self.min_weight_fraction_leaf = min_weight_fraction_leaf\n self.subsample = subsample\n self.max_features = max_features\n self.max_depth = max_depth\n self.min_impurity_decrease = min_impurity_decrease\n self.ccp_alpha = ccp_alpha\n self.init = init\n self.random_state = random_state\n self.alpha = alpha\n self.verbose = verbose\n self.max_leaf_nodes = max_leaf_nodes\n self.warm_start = warm_start\n self.validation_fraction = validation_fraction\n self.n_iter_no_change = n_iter_no_change\n self.tol = tol" }, { @@ -61124,7 +64471,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -61148,7 +64496,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -61172,7 +64521,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -61196,7 +64546,8 @@ "docstring": { "type": "", 
"description": "" - } + }, + "refined_type": {} }, { "name": "grid", @@ -61206,7 +64557,8 @@ "docstring": { "type": "ndarray of shape (n_samples, n_target_features)", "description": "The grid points on which the partial dependence should be\nevaluated." - } + }, + "refined_type": {} }, { "name": "target_features", @@ -61216,13 +64568,14 @@ "docstring": { "type": "ndarray of shape (n_target_features,)", "description": "The set of target features for which the partial dependence\nshould be evaluated." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Fast partial dependence computation.", - "docstring": "Fast partial dependence computation.\n\nParameters\n----------\ngrid : ndarray of shape (n_samples, n_target_features)\n The grid points on which the partial dependence should be\n evaluated.\ntarget_features : ndarray of shape (n_target_features,)\n The set of target features for which the partial dependence\n should be evaluated.\n\nReturns\n-------\naveraged_predictions : ndarray of shape (n_trees_per_iteration, n_samples)\n The value of the partial dependence function on each grid point.", + "docstring": "Fast partial dependence computation.\n\n Parameters\n ----------\n grid : ndarray of shape (n_samples, n_target_features)\n The grid points on which the partial dependence should be\n evaluated.\n target_features : ndarray of shape (n_target_features,)\n The set of target features for which the partial dependence\n should be evaluated.\n\n Returns\n -------\n averaged_predictions : ndarray of shape (n_trees_per_iteration, n_samples)\n The value of the partial dependence function on each grid point.\n ", "source_code": "\ndef _compute_partial_dependence_recursion(self, grid, target_features):\n \"\"\"Fast partial dependence computation.\n\n Parameters\n ----------\n grid : ndarray of shape (n_samples, n_target_features)\n The grid points on which the partial dependence should be\n evaluated.\n target_features : ndarray of shape (n_target_features,)\n The set of target features for which the partial dependence\n should be evaluated.\n\n Returns\n -------\n averaged_predictions : ndarray of shape (n_trees_per_iteration, n_samples)\n The value of the partial dependence function on each grid point.\n \"\"\"\n if self.init is not None:\n warnings.warn('Using recursion method with a non-constant init predictor will lead to incorrect partial dependence values. Got init=%s.' 
% self.init, UserWarning)\n grid = np.asarray(grid, dtype=DTYPE, order='C')\n (n_estimators, n_trees_per_stage) = self.estimators_.shape\n averaged_predictions = np.zeros((n_trees_per_stage, grid.shape[0]), dtype=np.float64, order='C')\n for stage in range(n_estimators):\n for k in range(n_trees_per_stage):\n tree = self.estimators_[stage, k].tree_\n tree.compute_partial_dependence(grid, target_features, averaged_predictions[k])\n averaged_predictions *= self.learning_rate\n return averaged_predictions" }, { @@ -61240,7 +64593,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "i", @@ -61250,7 +64604,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -61260,7 +64615,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -61270,7 +64626,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "raw_predictions", @@ -61280,7 +64637,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -61290,7 +64648,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "sample_mask", @@ -61300,7 +64659,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "random_state", @@ -61310,7 +64670,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X_csc", @@ -61320,7 +64681,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X_csr", @@ -61330,7 +64692,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -61354,7 +64717,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -61364,7 +64728,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -61374,7 +64739,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "raw_predictions", @@ -61384,7 +64750,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -61394,7 +64761,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "random_state", @@ -61404,7 +64772,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X_val", @@ -61414,7 +64783,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y_val", @@ -61424,7 +64794,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "sample_weight_val", @@ -61434,7 +64805,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "begin_at_stage", @@ -61444,7 +64816,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "monitor", @@ -61454,14 +64827,15 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Iteratively fits the stages.\n\nFor each stage it computes the progress (OOB, train score) and delegates to ``_fit_stage``. 
Returns the number of stages fit; might differ from ``n_estimators`` due to early stopping.", - "docstring": "Iteratively fits the stages.\n\nFor each stage it computes the progress (OOB, train score)\nand delegates to ``_fit_stage``.\nReturns the number of stages fit; might differ from ``n_estimators``\ndue to early stopping.", - "source_code": "\ndef _fit_stages(self, X, y, raw_predictions, sample_weight, random_state, X_val, y_val, sample_weight_val, begin_at_stage=0, monitor=None):\n \"\"\"Iteratively fits the stages.\n\n For each stage it computes the progress (OOB, train score)\n and delegates to ``_fit_stage``.\n Returns the number of stages fit; might differ from ``n_estimators``\n due to early stopping.\n \"\"\"\n n_samples = X.shape[0]\n do_oob = self.subsample < 1.0\n sample_mask = np.ones((n_samples, ), dtype=bool)\n n_inbag = max(1, int(self.subsample * n_samples))\n loss_ = self.loss_\n if self.verbose:\n verbose_reporter = VerboseReporter(verbose=self.verbose)\n verbose_reporter.init(self, begin_at_stage)\n X_csc = csc_matrix(X) if issparse(X) else None\n X_csr = csr_matrix(X) if issparse(X) else None\n if self.n_iter_no_change is not None:\n loss_history = np.full(self.n_iter_no_change, np.inf)\n y_val_pred_iter = self._staged_raw_predict(X_val)\n i = begin_at_stage\n for i in range(begin_at_stage, self.n_estimators):\n if do_oob:\n sample_mask = _random_sample_mask(n_samples, n_inbag, random_state)\n old_oob_score = loss_(y[~sample_mask], raw_predictions[~sample_mask], sample_weight[~sample_mask])\n raw_predictions = self._fit_stage(i, X, y, raw_predictions, sample_weight, sample_mask, random_state, X_csc, X_csr)\n if do_oob:\n self.train_score_[i] = loss_(y[sample_mask], raw_predictions[sample_mask], sample_weight[sample_mask])\n self.oob_improvement_[i] = old_oob_score - loss_(y[~sample_mask], raw_predictions[~sample_mask], sample_weight[~sample_mask])\n else:\n self.train_score_[i] = loss_(y, raw_predictions, sample_weight)\n if self.verbose > 0:\n verbose_reporter.update(i, self)\n if monitor is not None:\n early_stopping = monitor(i, self, locals())\n if early_stopping:\n break\n if self.n_iter_no_change is not None:\n validation_loss = loss_(y_val, next(y_val_pred_iter), sample_weight_val)\n if np.any(validation_loss + self.tol < loss_history):\n loss_history[i % len(loss_history)] = validation_loss\n else:\n break\n return i + 1" + "description": "Iteratively fits the stages.\n\nFor each stage it computes the progress (OOB, train score)\nand delegates to ``_fit_stage``.\nReturns the number of stages fit; might differ from ``n_estimators``\ndue to early stopping.", + "docstring": "Iteratively fits the stages.\n\n For each stage it computes the progress (OOB, train score)\n and delegates to ``_fit_stage``.\n Returns the number of stages fit; might differ from ``n_estimators``\n due to early stopping.\n ", + "source_code": "\ndef _fit_stages(self, X, y, raw_predictions, sample_weight, random_state, X_val, y_val, sample_weight_val, begin_at_stage=0, monitor=None):\n \"\"\"Iteratively fits the stages.\n\n For each stage it computes the progress (OOB, train score)\n and delegates to ``_fit_stage``.\n Returns the number of stages fit; might differ from ``n_estimators``\n due to early stopping.\n \"\"\"\n n_samples = X.shape[0]\n do_oob = self.subsample < 1.0\n sample_mask = np.ones((n_samples, ), dtype=bool)\n n_inbag = max(1, int(self.subsample * n_samples))\n loss_ = self.loss_\n if self.verbose:\n verbose_reporter = VerboseReporter(verbose=self.verbose)\n 
verbose_reporter.init(self, begin_at_stage)\n X_csc = csc_matrix(X) if issparse(X) else None\n X_csr = csr_matrix(X) if issparse(X) else None\n if self.n_iter_no_change is not None:\n loss_history = np.full(self.n_iter_no_change, np.inf)\n y_val_pred_iter = self._staged_raw_predict(X_val, check_input=False)\n i = begin_at_stage\n for i in range(begin_at_stage, self.n_estimators):\n if do_oob:\n sample_mask = _random_sample_mask(n_samples, n_inbag, random_state)\n old_oob_score = loss_(y[~sample_mask], raw_predictions[~sample_mask], sample_weight[~sample_mask])\n raw_predictions = self._fit_stage(i, X, y, raw_predictions, sample_weight, sample_mask, random_state, X_csc, X_csr)\n if do_oob:\n self.train_score_[i] = loss_(y[sample_mask], raw_predictions[sample_mask], sample_weight[sample_mask])\n self.oob_improvement_[i] = old_oob_score - loss_(y[~sample_mask], raw_predictions[~sample_mask], sample_weight[~sample_mask])\n else:\n self.train_score_[i] = loss_(y, raw_predictions, sample_weight)\n if self.verbose > 0:\n verbose_reporter.update(i, self)\n if monitor is not None:\n early_stopping = monitor(i, self, locals())\n if early_stopping:\n break\n if self.n_iter_no_change is not None:\n validation_loss = loss_(y_val, next(y_val_pred_iter), sample_weight_val)\n if np.any(validation_loss + self.tol < loss_history):\n loss_history[i % len(loss_history)] = validation_loss\n else:\n break\n return i + 1" }, { "name": "_init_state", @@ -61478,7 +64852,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -61502,13 +64877,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _is_initialized(self):\n return len(getattr(self, 'estimators_', [])) > 0" }, { @@ -61526,7 +64902,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "append", @@ -61536,13 +64913,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _make_estimator(self, append=True):\n raise NotImplementedError()" }, { @@ -61560,7 +64938,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -61570,7 +64949,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -61594,7 +64974,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -61604,7 +64985,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -61628,7 +65010,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -61652,7 +65035,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -61662,14 +65046,29 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "The input samples. Internally, it will be converted to\n``dtype=np.float32`` and if a sparse matrix is provided\nto a sparse ``csr_matrix``." 
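Editor's note: `_fit_stages` above implements early stopping with a fixed-size ring buffer of the last `n_iter_no_change` validation losses; a new stage is kept only if its validation loss beats at least one entry of that buffer by `tol`. A standalone sketch of just that rule (the helper name and the example losses are made up for illustration):

import numpy as np

def stages_until_stop(val_losses, n_iter_no_change=3, tol=1e-4):
    history = np.full(n_iter_no_change, np.inf)
    for i, loss in enumerate(val_losses):
        if np.any(loss + tol < history):
            history[i % n_iter_no_change] = loss  # keep training, record the loss
        else:
            return i                              # no improvement over the window
    return len(val_losses)

# stops after 5 stages: the last 0.79 no longer improves on any of the
# 3 most recently recorded validation losses by more than tol
print(stages_until_stop([1.0, 0.8, 0.79, 0.79, 0.79, 0.79]))
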
+ }, + "refined_type": { + "kind": "EnumType", + "values": [] } + }, + { + "name": "check_input", + "default_value": "True", + "is_public": false, + "assigned_by": "POSITION_OR_NAME", + "docstring": { + "type": "bool, default=True", + "description": "If False, the input arrays X will not be checked." + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Compute raw predictions of ``X`` for each iteration.\n\nThis method allows monitoring (i.e. determine error on testing set) after each stage.", - "docstring": "Compute raw predictions of ``X`` for each iteration.\n\nThis method allows monitoring (i.e. determine error on testing set)\nafter each stage.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input samples. Internally, it will be converted to\n ``dtype=np.float32`` and if a sparse matrix is provided\n to a sparse ``csr_matrix``.\n\nReturns\n-------\nraw_predictions : generator of ndarray of shape (n_samples, k)\n The raw predictions of the input samples. The order of the\n classes corresponds to that in the attribute :term:`classes_`.\n Regression and binary classification are special cases with\n ``k == 1``, otherwise ``k==n_classes``.", - "source_code": "\ndef _staged_raw_predict(self, X):\n \"\"\"Compute raw predictions of ``X`` for each iteration.\n\n This method allows monitoring (i.e. determine error on testing set)\n after each stage.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input samples. Internally, it will be converted to\n ``dtype=np.float32`` and if a sparse matrix is provided\n to a sparse ``csr_matrix``.\n\n Returns\n -------\n raw_predictions : generator of ndarray of shape (n_samples, k)\n The raw predictions of the input samples. The order of the\n classes corresponds to that in the attribute :term:`classes_`.\n Regression and binary classification are special cases with\n ``k == 1``, otherwise ``k==n_classes``.\n \"\"\"\n X = self._validate_data(X, dtype=DTYPE, order='C', accept_sparse='csr', reset=False)\n raw_predictions = self._raw_predict_init(X)\n for i in range(self.estimators_.shape[0]):\n predict_stage(self.estimators_, i, X, self.learning_rate, raw_predictions)\n yield raw_predictions.copy()" + "description": "Compute raw predictions of ``X`` for each iteration.\n\nThis method allows monitoring (i.e. determine error on testing set)\nafter each stage.", + "docstring": "Compute raw predictions of ``X`` for each iteration.\n\n This method allows monitoring (i.e. determine error on testing set)\n after each stage.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input samples. Internally, it will be converted to\n ``dtype=np.float32`` and if a sparse matrix is provided\n to a sparse ``csr_matrix``.\n\n check_input : bool, default=True\n If False, the input arrays X will not be checked.\n\n Returns\n -------\n raw_predictions : generator of ndarray of shape (n_samples, k)\n The raw predictions of the input samples. The order of the\n classes corresponds to that in the attribute :term:`classes_`.\n Regression and binary classification are special cases with\n ``k == 1``, otherwise ``k==n_classes``.\n ", + "source_code": "\ndef _staged_raw_predict(self, X, check_input=True):\n \"\"\"Compute raw predictions of ``X`` for each iteration.\n\n This method allows monitoring (i.e. 
determine error on testing set)\n after each stage.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input samples. Internally, it will be converted to\n ``dtype=np.float32`` and if a sparse matrix is provided\n to a sparse ``csr_matrix``.\n\n check_input : bool, default=True\n If False, the input arrays X will not be checked.\n\n Returns\n -------\n raw_predictions : generator of ndarray of shape (n_samples, k)\n The raw predictions of the input samples. The order of the\n classes corresponds to that in the attribute :term:`classes_`.\n Regression and binary classification are special cases with\n ``k == 1``, otherwise ``k==n_classes``.\n \"\"\"\n if check_input:\n X = self._validate_data(X, dtype=DTYPE, order='C', accept_sparse='csr', reset=False)\n raw_predictions = self._raw_predict_init(X)\n for i in range(self.estimators_.shape[0]):\n predict_stage(self.estimators_, i, X, self.learning_rate, raw_predictions)\n yield raw_predictions.copy()" }, { "name": "_validate_y", @@ -61686,7 +65085,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -61696,7 +65096,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -61706,7 +65107,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -61730,13 +65132,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@abstractmethod\ndef _warn_mae_for_criterion(self):\n pass" }, { @@ -61754,7 +65157,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -61764,13 +65168,17 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "The input samples. Internally, its dtype will be converted to\n``dtype=np.float32``. If a sparse matrix is provided, it will\nbe converted to a sparse ``csr_matrix``." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } } ], "results": [], "is_public": false, "description": "Apply trees in the ensemble to X, return leaf indices.\n\n.. versionadded:: 0.17", - "docstring": "Apply trees in the ensemble to X, return leaf indices.\n\n.. versionadded:: 0.17\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input samples. Internally, its dtype will be converted to\n ``dtype=np.float32``. If a sparse matrix is provided, it will\n be converted to a sparse ``csr_matrix``.\n\nReturns\n-------\nX_leaves : array-like of shape (n_samples, n_estimators, n_classes)\n For each datapoint x in X and for each tree in the ensemble,\n return the index of the leaf x ends up in each estimator.\n In the case of binary classification n_classes is 1.", + "docstring": "Apply trees in the ensemble to X, return leaf indices.\n\n .. versionadded:: 0.17\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input samples. Internally, its dtype will be converted to\n ``dtype=np.float32``. 
If a sparse matrix is provided, it will\n be converted to a sparse ``csr_matrix``.\n\n Returns\n -------\n X_leaves : array-like of shape (n_samples, n_estimators, n_classes)\n For each datapoint x in X and for each tree in the ensemble,\n return the index of the leaf x ends up in each estimator.\n In the case of binary classification n_classes is 1.\n ", "source_code": "\ndef apply(self, X):\n \"\"\"Apply trees in the ensemble to X, return leaf indices.\n\n .. versionadded:: 0.17\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input samples. Internally, its dtype will be converted to\n ``dtype=np.float32``. If a sparse matrix is provided, it will\n be converted to a sparse ``csr_matrix``.\n\n Returns\n -------\n X_leaves : array-like of shape (n_samples, n_estimators, n_classes)\n For each datapoint x in X and for each tree in the ensemble,\n return the index of the leaf x ends up in each estimator.\n In the case of binary classification n_classes is 1.\n \"\"\"\n self._check_initialized()\n X = self.estimators_[0, 0]._validate_X_predict(X, check_input=True)\n (n_estimators, n_classes) = self.estimators_.shape\n leaves = np.zeros((X.shape[0], n_estimators, n_classes))\n for i in range(n_estimators):\n for j in range(n_classes):\n estimator = self.estimators_[i, j]\n leaves[:, i, j] = estimator.apply(X, check_input=False)\n return leaves" }, { @@ -61788,13 +65196,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "The impurity-based feature importances.\n\nThe higher, the more important the feature. The importance of a feature is computed as the (normalized) total reduction of the criterion brought by that feature. It is also known as the Gini importance. Warning: impurity-based feature importances can be misleading for high cardinality features (many unique values). See :func:`sklearn.inspection.permutation_importance` as an alternative.", - "docstring": "The impurity-based feature importances.\n\nThe higher, the more important the feature.\nThe importance of a feature is computed as the (normalized)\ntotal reduction of the criterion brought by that feature. It is also\nknown as the Gini importance.\n\nWarning: impurity-based feature importances can be misleading for\nhigh cardinality features (many unique values). See\n:func:`sklearn.inspection.permutation_importance` as an alternative.\n\nReturns\n-------\nfeature_importances_ : ndarray of shape (n_features,)\n The values of this array sum to 1, unless all trees are single node\n trees consisting of only the root node, in which case it will be an\n array of zeros.", + "description": "The impurity-based feature importances.\n\nThe higher, the more important the feature.\nThe importance of a feature is computed as the (normalized)\ntotal reduction of the criterion brought by that feature. It is also\nknown as the Gini importance.\n\nWarning: impurity-based feature importances can be misleading for\nhigh cardinality features (many unique values). See\n:func:`sklearn.inspection.permutation_importance` as an alternative.", + "docstring": "The impurity-based feature importances.\n\n The higher, the more important the feature.\n The importance of a feature is computed as the (normalized)\n total reduction of the criterion brought by that feature. It is also\n known as the Gini importance.\n\n Warning: impurity-based feature importances can be misleading for\n high cardinality features (many unique values). 
See\n :func:`sklearn.inspection.permutation_importance` as an alternative.\n\n Returns\n -------\n feature_importances_ : ndarray of shape (n_features,)\n The values of this array sum to 1, unless all trees are single node\n trees consisting of only the root node, in which case it will be an\n array of zeros.\n ", "source_code": "\n@property\ndef feature_importances_(self):\n \"\"\"The impurity-based feature importances.\n\n The higher, the more important the feature.\n The importance of a feature is computed as the (normalized)\n total reduction of the criterion brought by that feature. It is also\n known as the Gini importance.\n\n Warning: impurity-based feature importances can be misleading for\n high cardinality features (many unique values). See\n :func:`sklearn.inspection.permutation_importance` as an alternative.\n\n Returns\n -------\n feature_importances_ : ndarray of shape (n_features,)\n The values of this array sum to 1, unless all trees are single node\n trees consisting of only the root node, in which case it will be an\n array of zeros.\n \"\"\"\n self._check_initialized()\n relevant_trees = [tree for stage in self.estimators_ for tree in stage if tree.tree_.node_count > 1]\n if not relevant_trees:\n return np.zeros(shape=self.n_features_in_, dtype=np.float64)\n relevant_feature_importances = [tree.tree_.compute_feature_importances(normalize=False) for tree in relevant_trees]\n avg_feature_importances = np.mean(relevant_feature_importances, axis=0, dtype=np.float64)\n return avg_feature_importances / np.sum(avg_feature_importances)" }, { @@ -61812,7 +65221,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -61822,6 +65232,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "The input samples. Internally, it will be converted to\n``dtype=np.float32`` and if a sparse matrix is provided\nto a sparse ``csr_matrix``." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -61832,7 +65246,8 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "Target values (strings or integers in classification, real numbers\nin regression)\nFor classification, labels must correspond to classes." - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -61842,7 +65257,8 @@ "docstring": { "type": "array-like of shape (n_samples,), default=None", "description": "Sample weights. If None, then samples are equally weighted. Splits\nthat would create child nodes with net zero or negative weight are\nignored while searching for a split in each node. In the case of\nclassification, splits are also ignored if they would result in any\nsingle class carrying a negative weight in either child node." - } + }, + "refined_type": {} }, { "name": "monitor", @@ -61852,13 +65268,14 @@ "docstring": { "type": "callable, default=None", "description": "The monitor is called after each iteration with the current\niteration, a reference to the estimator and the local variables of\n``_fit_stages`` as keyword arguments ``callable(i, self,\nlocals())``. If the callable returns ``True`` the fitting procedure\nis stopped. The monitor can be used for various things such as\ncomputing held-out estimates, early stopping, model introspect, and\nsnapshoting." 
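Editor's note: the `feature_importances_` property above averages the unnormalized per-tree importances over all trees with more than one node and then renormalizes. A short sketch reproducing that aggregation against a fitted booster, assuming a scikit-learn build matching the source shown in this diff:

import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier

X, y = make_classification(n_samples=200, n_features=6, random_state=0)
clf = GradientBoostingClassifier(n_estimators=20, random_state=0).fit(X, y)

# same filtering and averaging as the property above
trees = [t for stage in clf.estimators_ for t in stage if t.tree_.node_count > 1]
raw = np.mean([t.tree_.compute_feature_importances(normalize=False) for t in trees],
              axis=0)
print(np.allclose(raw / raw.sum(), clf.feature_importances_))  # expected: True
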
- } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Fit the gradient boosting model.", - "docstring": "Fit the gradient boosting model.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input samples. Internally, it will be converted to\n ``dtype=np.float32`` and if a sparse matrix is provided\n to a sparse ``csr_matrix``.\n\ny : array-like of shape (n_samples,)\n Target values (strings or integers in classification, real numbers\n in regression)\n For classification, labels must correspond to classes.\n\nsample_weight : array-like of shape (n_samples,), default=None\n Sample weights. If None, then samples are equally weighted. Splits\n that would create child nodes with net zero or negative weight are\n ignored while searching for a split in each node. In the case of\n classification, splits are also ignored if they would result in any\n single class carrying a negative weight in either child node.\n\nmonitor : callable, default=None\n The monitor is called after each iteration with the current\n iteration, a reference to the estimator and the local variables of\n ``_fit_stages`` as keyword arguments ``callable(i, self,\n locals())``. If the callable returns ``True`` the fitting procedure\n is stopped. The monitor can be used for various things such as\n computing held-out estimates, early stopping, model introspect, and\n snapshoting.\n\nReturns\n-------\nself : object\n Fitted estimator.", + "docstring": "Fit the gradient boosting model.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input samples. Internally, it will be converted to\n ``dtype=np.float32`` and if a sparse matrix is provided\n to a sparse ``csr_matrix``.\n\n y : array-like of shape (n_samples,)\n Target values (strings or integers in classification, real numbers\n in regression)\n For classification, labels must correspond to classes.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights. If None, then samples are equally weighted. Splits\n that would create child nodes with net zero or negative weight are\n ignored while searching for a split in each node. In the case of\n classification, splits are also ignored if they would result in any\n single class carrying a negative weight in either child node.\n\n monitor : callable, default=None\n The monitor is called after each iteration with the current\n iteration, a reference to the estimator and the local variables of\n ``_fit_stages`` as keyword arguments ``callable(i, self,\n locals())``. If the callable returns ``True`` the fitting procedure\n is stopped. The monitor can be used for various things such as\n computing held-out estimates, early stopping, model introspect, and\n snapshoting.\n\n Returns\n -------\n self : object\n Fitted estimator.\n ", "source_code": "\ndef fit(self, X, y, sample_weight=None, monitor=None):\n \"\"\"Fit the gradient boosting model.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input samples. Internally, it will be converted to\n ``dtype=np.float32`` and if a sparse matrix is provided\n to a sparse ``csr_matrix``.\n\n y : array-like of shape (n_samples,)\n Target values (strings or integers in classification, real numbers\n in regression)\n For classification, labels must correspond to classes.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights. If None, then samples are equally weighted. 
Splits\n that would create child nodes with net zero or negative weight are\n ignored while searching for a split in each node. In the case of\n classification, splits are also ignored if they would result in any\n single class carrying a negative weight in either child node.\n\n monitor : callable, default=None\n The monitor is called after each iteration with the current\n iteration, a reference to the estimator and the local variables of\n ``_fit_stages`` as keyword arguments ``callable(i, self,\n locals())``. If the callable returns ``True`` the fitting procedure\n is stopped. The monitor can be used for various things such as\n computing held-out estimates, early stopping, model introspect, and\n snapshoting.\n\n Returns\n -------\n self : object\n Fitted estimator.\n \"\"\"\n if self.criterion in ('absolute_error', 'mae'):\n self._warn_mae_for_criterion()\n if self.criterion == 'mse':\n warnings.warn(\"Criterion 'mse' was deprecated in v1.0 and will be removed in version 1.2. Use `criterion='squared_error'` which is equivalent.\", FutureWarning)\n if not self.warm_start:\n self._clear_state()\n (X, y) = self._validate_data(X, y, accept_sparse=['csr', 'csc', 'coo'], dtype=DTYPE, multi_output=True)\n sample_weight_is_none = sample_weight is None\n sample_weight = _check_sample_weight(sample_weight, X)\n y = column_or_1d(y, warn=True)\n if is_classifier(self):\n y = self._validate_y(y, sample_weight)\n else:\n y = self._validate_y(y)\n if self.n_iter_no_change is not None:\n stratify = y if is_classifier(self) else None\n (X, X_val, y, y_val, sample_weight, sample_weight_val) = train_test_split(X, y, sample_weight, random_state=self.random_state, test_size=self.validation_fraction, stratify=stratify)\n if is_classifier(self):\n if self._n_classes != np.unique(y).shape[0]:\n raise ValueError('The training data after the early stopping split is missing some classes. 
Try using another random seed.')\n else:\n X_val = y_val = sample_weight_val = None\n self._check_params()\n if not self._is_initialized():\n self._init_state()\n if self.init_ == 'zero':\n raw_predictions = np.zeros(shape=(X.shape[0], self.loss_.K), dtype=np.float64)\n else:\n if sample_weight_is_none:\n self.init_.fit(X, y)\n else:\n msg = 'The initial estimator {} does not support sample weights.'.format(self.init_.__class__.__name__)\n try:\n self.init_.fit(X, y, sample_weight=sample_weight)\n except TypeError as e:\n raise ValueError(msg) from e\n except ValueError as e:\n if 'pass parameters to specific steps of your pipeline using the stepname__parameter' in str(e):\n raise ValueError(msg) from e\n else:\n raise\n raw_predictions = self.loss_.get_init_raw_predictions(X, self.init_)\n begin_at_stage = 0\n self._rng = check_random_state(self.random_state)\n else:\n if self.n_estimators < self.estimators_.shape[0]:\n raise ValueError('n_estimators=%d must be larger or equal to estimators_.shape[0]=%d when warm_start==True' % (self.n_estimators, self.estimators_.shape[0]))\n begin_at_stage = self.estimators_.shape[0]\n X = check_array(X, dtype=DTYPE, order='C', accept_sparse='csr')\n raw_predictions = self._raw_predict(X)\n self._resize_state()\n n_stages = self._fit_stages(X, y, raw_predictions, sample_weight, self._rng, X_val, y_val, sample_weight_val, begin_at_stage, monitor)\n if n_stages != self.estimators_.shape[0]:\n self.estimators_ = self.estimators_[:n_stages]\n self.train_score_ = self.train_score_[:n_stages]\n if hasattr(self, 'oob_improvement_'):\n self.oob_improvement_ = self.oob_improvement_[:n_stages]\n self.n_estimators_ = n_stages\n return self" }, { @@ -61879,13 +65296,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@deprecated('Attribute `n_features_` was deprecated in version 1.0 and will be removed in 1.2. Use `n_features_in_` instead.')\n@property\ndef n_features_(self):\n return self.n_features_in_" }, { @@ -61903,7 +65321,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "loss", @@ -61913,6 +65332,10 @@ "docstring": { "type": "{'deviance', 'exponential'}, default='deviance'", "description": "The loss function to be optimized. 'deviance' refers to\ndeviance (= logistic regression) for classification\nwith probabilistic outputs. For loss 'exponential' gradient\nboosting recovers the AdaBoost algorithm." + }, + "refined_type": { + "kind": "EnumType", + "values": ["deviance", "exponential"] } }, { @@ -61923,7 +65346,8 @@ "docstring": { "type": "float, default=0.1", "description": "Learning rate shrinks the contribution of each tree by `learning_rate`.\nThere is a trade-off between learning_rate and n_estimators." - } + }, + "refined_type": {} }, { "name": "n_estimators", @@ -61933,7 +65357,8 @@ "docstring": { "type": "int, default=100", "description": "The number of boosting stages to perform. Gradient boosting\nis fairly robust to over-fitting so a large number usually\nresults in better performance." - } + }, + "refined_type": {} }, { "name": "subsample", @@ -61943,7 +65368,8 @@ "docstring": { "type": "float, default=1.0", "description": "The fraction of samples to be used for fitting the individual base\nlearners. If smaller than 1.0 this results in Stochastic Gradient\nBoosting. 
`subsample` interacts with the parameter `n_estimators`.\nChoosing `subsample < 1.0` leads to a reduction of variance\nand an increase in bias." - } + }, + "refined_type": {} }, { "name": "criterion", @@ -61953,6 +65379,15 @@ "docstring": { "type": "{'friedman_mse', 'squared_error', 'mse', 'mae'}, default='friedman_mse'", "description": "The function to measure the quality of a split. Supported criteria\nare 'friedman_mse' for the mean squared error with improvement\nscore by Friedman, 'squared_error' for mean squared error, and 'mae'\nfor the mean absolute error. The default value of 'friedman_mse' is\ngenerally the best as it can provide a better approximation in some\ncases.\n\n.. versionadded:: 0.18\n\n.. deprecated:: 0.24\n `criterion='mae'` is deprecated and will be removed in version\n 1.1 (renaming of 0.26). Use `criterion='friedman_mse'` or\n `'squared_error'` instead, as trees should use a squared error\n criterion in Gradient Boosting.\n\n.. deprecated:: 1.0\n Criterion 'mse' was deprecated in v1.0 and will be removed in\n version 1.2. Use `criterion='squared_error'` which is equivalent." + }, + "refined_type": { + "kind": "EnumType", + "values": [ + "friedman_mse", + "squared_error", + "mse", + "mae" + ] } }, { @@ -61963,7 +65398,8 @@ "docstring": { "type": "int or float, default=2", "description": "The minimum number of samples required to split an internal node:\n\n- If int, then consider `min_samples_split` as the minimum number.\n- If float, then `min_samples_split` is a fraction and\n `ceil(min_samples_split * n_samples)` are the minimum\n number of samples for each split.\n\n.. versionchanged:: 0.18\n Added float values for fractions." - } + }, + "refined_type": {} }, { "name": "min_samples_leaf", @@ -61973,7 +65409,8 @@ "docstring": { "type": "int or float, default=1", "description": "The minimum number of samples required to be at a leaf node.\nA split point at any depth will only be considered if it leaves at\nleast ``min_samples_leaf`` training samples in each of the left and\nright branches. This may have the effect of smoothing the model,\nespecially in regression.\n\n- If int, then consider `min_samples_leaf` as the minimum number.\n- If float, then `min_samples_leaf` is a fraction and\n `ceil(min_samples_leaf * n_samples)` are the minimum\n number of samples for each node.\n\n.. versionchanged:: 0.18\n Added float values for fractions." - } + }, + "refined_type": {} }, { "name": "min_weight_fraction_leaf", @@ -61983,7 +65420,8 @@ "docstring": { "type": "float, default=0.0", "description": "The minimum weighted fraction of the sum total of weights (of all\nthe input samples) required to be at a leaf node. Samples have\nequal weight when sample_weight is not provided." - } + }, + "refined_type": {} }, { "name": "max_depth", @@ -61993,7 +65431,8 @@ "docstring": { "type": "int, default=3", "description": "The maximum depth of the individual regression estimators. The maximum\ndepth limits the number of nodes in the tree. Tune this parameter\nfor best performance; the best value depends on the interaction\nof the input variables." 
- } + }, + "refined_type": {} }, { "name": "min_impurity_decrease", @@ -62003,7 +65442,8 @@ "docstring": { "type": "float, default=0.0", "description": "A node will be split if this split induces a decrease of the impurity\ngreater than or equal to this value.\n\nThe weighted impurity decrease equation is the following::\n\n N_t / N * (impurity - N_t_R / N_t * right_impurity\n - N_t_L / N_t * left_impurity)\n\nwhere ``N`` is the total number of samples, ``N_t`` is the number of\nsamples at the current node, ``N_t_L`` is the number of samples in the\nleft child, and ``N_t_R`` is the number of samples in the right child.\n\n``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum,\nif ``sample_weight`` is passed.\n\n.. versionadded:: 0.19" - } + }, + "refined_type": {} }, { "name": "init", @@ -62013,7 +65453,8 @@ "docstring": { "type": "estimator or 'zero', default=None", "description": "An estimator object that is used to compute the initial predictions.\n``init`` has to provide :meth:`fit` and :meth:`predict_proba`. If\n'zero', the initial raw predictions are set to zero. By default, a\n``DummyEstimator`` predicting the classes priors is used." - } + }, + "refined_type": {} }, { "name": "random_state", @@ -62023,7 +65464,8 @@ "docstring": { "type": "int, RandomState instance or None, default=None", "description": "Controls the random seed given to each Tree estimator at each\nboosting iteration.\nIn addition, it controls the random permutation of the features at\neach split (see Notes for more details).\nIt also controls the random splitting of the training data to obtain a\nvalidation set if `n_iter_no_change` is not None.\nPass an int for reproducible output across multiple function calls.\nSee :term:`Glossary `." - } + }, + "refined_type": {} }, { "name": "max_features", @@ -62033,6 +65475,10 @@ "docstring": { "type": "{'auto', 'sqrt', 'log2'}, int or float, default=None", "description": "The number of features to consider when looking for the best split:\n\n- If int, then consider `max_features` features at each split.\n- If float, then `max_features` is a fraction and\n `int(max_features * n_features)` features are considered at each\n split.\n- If 'auto', then `max_features=sqrt(n_features)`.\n- If 'sqrt', then `max_features=sqrt(n_features)`.\n- If 'log2', then `max_features=log2(n_features)`.\n- If None, then `max_features=n_features`.\n\nChoosing `max_features < n_features` leads to a reduction of variance\nand an increase in bias.\n\nNote: the search for a split does not stop until at least one\nvalid partition of the node samples is found, even if it requires to\neffectively inspect more than ``max_features`` features." + }, + "refined_type": { + "kind": "EnumType", + "values": ["auto", "sqrt", "log2"] } }, { @@ -62043,7 +65489,8 @@ "docstring": { "type": "int, default=0", "description": "Enable verbose output. If 1 then it prints progress and performance\nonce in a while (the more trees the lower the frequency). If greater\nthan 1 then it prints progress and performance for every tree." - } + }, + "refined_type": {} }, { "name": "max_leaf_nodes", @@ -62053,7 +65500,8 @@ "docstring": { "type": "int, default=None", "description": "Grow trees with ``max_leaf_nodes`` in best-first fashion.\nBest nodes are defined as relative reduction in impurity.\nIf None then unlimited number of leaf nodes." 
- } + }, + "refined_type": {} }, { "name": "warm_start", @@ -62063,7 +65511,8 @@ "docstring": { "type": "bool, default=False", "description": "When set to ``True``, reuse the solution of the previous call to fit\nand add more estimators to the ensemble, otherwise, just erase the\nprevious solution. See :term:`the Glossary `." - } + }, + "refined_type": {} }, { "name": "validation_fraction", @@ -62073,7 +65522,8 @@ "docstring": { "type": "float, default=0.1", "description": "The proportion of training data to set aside as validation set for\nearly stopping. Must be between 0 and 1.\nOnly used if ``n_iter_no_change`` is set to an integer.\n\n.. versionadded:: 0.20" - } + }, + "refined_type": {} }, { "name": "n_iter_no_change", @@ -62083,7 +65533,8 @@ "docstring": { "type": "int, default=None", "description": "``n_iter_no_change`` is used to decide if early stopping will be used\nto terminate training when validation score is not improving. By\ndefault it is set to None to disable early stopping. If set to a\nnumber, it will set aside ``validation_fraction`` size of the training\ndata as validation and terminate training when validation score is not\nimproving in all of the previous ``n_iter_no_change`` numbers of\niterations. The split is stratified.\n\n.. versionadded:: 0.20" - } + }, + "refined_type": {} }, { "name": "tol", @@ -62093,7 +65544,8 @@ "docstring": { "type": "float, default=1e-4", "description": "Tolerance for the early stopping. When the loss is not improving\nby at least tol for ``n_iter_no_change`` iterations (if set to a\nnumber), the training stops.\n\n.. versionadded:: 0.20" - } + }, + "refined_type": {} }, { "name": "ccp_alpha", @@ -62103,13 +65555,14 @@ "docstring": { "type": "non-negative float, default=0.0", "description": "Complexity parameter used for Minimal Cost-Complexity Pruning. The\nsubtree with the largest cost complexity that is smaller than\n``ccp_alpha`` will be chosen. By default, no pruning is performed. See\n:ref:`minimal_cost_complexity_pruning` for details.\n\n.. 
versionadded:: 0.22" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, *, loss='deviance', learning_rate=0.1, n_estimators=100, subsample=1.0, criterion='friedman_mse', min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_depth=3, min_impurity_decrease=0.0, init=None, random_state=None, max_features=None, verbose=0, max_leaf_nodes=None, warm_start=False, validation_fraction=0.1, n_iter_no_change=None, tol=0.0001, ccp_alpha=0.0):\n super().__init__(loss=loss, learning_rate=learning_rate, n_estimators=n_estimators, criterion=criterion, min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf, min_weight_fraction_leaf=min_weight_fraction_leaf, max_depth=max_depth, init=init, subsample=subsample, max_features=max_features, random_state=random_state, verbose=verbose, max_leaf_nodes=max_leaf_nodes, min_impurity_decrease=min_impurity_decrease, warm_start=warm_start, validation_fraction=validation_fraction, n_iter_no_change=n_iter_no_change, tol=tol, ccp_alpha=ccp_alpha)" }, { @@ -62127,7 +65580,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -62137,7 +65591,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -62147,13 +65602,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _validate_y(self, y, sample_weight):\n check_classification_targets(y)\n (self.classes_, y) = np.unique(y, return_inverse=True)\n n_trim_classes = np.count_nonzero(np.bincount(y, sample_weight))\n if n_trim_classes < 2:\n raise ValueError('y contains %d class after sample_weight trimmed classes with zero weights, while a minimum of 2 classes are required.' % n_trim_classes)\n self._n_classes = len(self.classes_)\n self.n_classes_ = self._n_classes\n return y" }, { @@ -62171,13 +65627,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _warn_mae_for_criterion(self):\n warnings.warn(\"criterion='mae' was deprecated in version 0.24 and will be removed in version 1.1 (renaming of 0.26). Use criterion='friedman_mse' or 'squared_error' instead, as trees should use a squared error criterion in Gradient Boosting.\", FutureWarning)" }, { @@ -62195,7 +65652,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -62205,13 +65663,17 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "The input samples. Internally, it will be converted to\n``dtype=np.float32`` and if a sparse matrix is provided\nto a sparse ``csr_matrix``." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } } ], "results": [], "is_public": true, "description": "Compute the decision function of ``X``.", - "docstring": "Compute the decision function of ``X``.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input samples. 
Internally, it will be converted to\n ``dtype=np.float32`` and if a sparse matrix is provided\n to a sparse ``csr_matrix``.\n\nReturns\n-------\nscore : ndarray of shape (n_samples, n_classes) or (n_samples,)\n The decision function of the input samples, which corresponds to\n the raw values predicted from the trees of the ensemble . The\n order of the classes corresponds to that in the attribute\n :term:`classes_`. Regression and binary classification produce an\n array of shape (n_samples,).", + "docstring": "Compute the decision function of ``X``.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input samples. Internally, it will be converted to\n ``dtype=np.float32`` and if a sparse matrix is provided\n to a sparse ``csr_matrix``.\n\n Returns\n -------\n score : ndarray of shape (n_samples, n_classes) or (n_samples,)\n The decision function of the input samples, which corresponds to\n the raw values predicted from the trees of the ensemble . The\n order of the classes corresponds to that in the attribute\n :term:`classes_`. Regression and binary classification produce an\n array of shape (n_samples,).\n ", "source_code": "\ndef decision_function(self, X):\n \"\"\"Compute the decision function of ``X``.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input samples. Internally, it will be converted to\n ``dtype=np.float32`` and if a sparse matrix is provided\n to a sparse ``csr_matrix``.\n\n Returns\n -------\n score : ndarray of shape (n_samples, n_classes) or (n_samples,)\n The decision function of the input samples, which corresponds to\n the raw values predicted from the trees of the ensemble . The\n order of the classes corresponds to that in the attribute\n :term:`classes_`. Regression and binary classification produce an\n array of shape (n_samples,).\n \"\"\"\n X = self._validate_data(X, dtype=DTYPE, order='C', accept_sparse='csr', reset=False)\n raw_predictions = self._raw_predict(X)\n if raw_predictions.shape[1] == 1:\n return raw_predictions.ravel()\n return raw_predictions" }, { @@ -62229,7 +65691,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -62239,13 +65702,17 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "The input samples. Internally, it will be converted to\n``dtype=np.float32`` and if a sparse matrix is provided\nto a sparse ``csr_matrix``." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } } ], "results": [], "is_public": true, "description": "Predict class for X.", - "docstring": "Predict class for X.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input samples. Internally, it will be converted to\n ``dtype=np.float32`` and if a sparse matrix is provided\n to a sparse ``csr_matrix``.\n\nReturns\n-------\ny : ndarray of shape (n_samples,)\n The predicted values.", + "docstring": "Predict class for X.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input samples. 
Internally, it will be converted to\n ``dtype=np.float32`` and if a sparse matrix is provided\n to a sparse ``csr_matrix``.\n\n Returns\n -------\n y : ndarray of shape (n_samples,)\n The predicted values.\n ", "source_code": "\ndef predict(self, X):\n \"\"\"Predict class for X.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input samples. Internally, it will be converted to\n ``dtype=np.float32`` and if a sparse matrix is provided\n to a sparse ``csr_matrix``.\n\n Returns\n -------\n y : ndarray of shape (n_samples,)\n The predicted values.\n \"\"\"\n raw_predictions = self.decision_function(X)\n encoded_labels = self.loss_._raw_prediction_to_decision(raw_predictions)\n return self.classes_.take(encoded_labels, axis=0)" }, { @@ -62263,7 +65730,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -62273,13 +65741,17 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "The input samples. Internally, it will be converted to\n``dtype=np.float32`` and if a sparse matrix is provided\nto a sparse ``csr_matrix``." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } } ], "results": [], "is_public": true, "description": "Predict class log-probabilities for X.", - "docstring": "Predict class log-probabilities for X.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input samples. Internally, it will be converted to\n ``dtype=np.float32`` and if a sparse matrix is provided\n to a sparse ``csr_matrix``.\n\nReturns\n-------\np : ndarray of shape (n_samples, n_classes)\n The class log-probabilities of the input samples. The order of the\n classes corresponds to that in the attribute :term:`classes_`.\n\nRaises\n------\nAttributeError\n If the ``loss`` does not support probabilities.", + "docstring": "Predict class log-probabilities for X.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input samples. Internally, it will be converted to\n ``dtype=np.float32`` and if a sparse matrix is provided\n to a sparse ``csr_matrix``.\n\n Returns\n -------\n p : ndarray of shape (n_samples, n_classes)\n The class log-probabilities of the input samples. The order of the\n classes corresponds to that in the attribute :term:`classes_`.\n\n Raises\n ------\n AttributeError\n If the ``loss`` does not support probabilities.\n ", "source_code": "\ndef predict_log_proba(self, X):\n \"\"\"Predict class log-probabilities for X.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input samples. Internally, it will be converted to\n ``dtype=np.float32`` and if a sparse matrix is provided\n to a sparse ``csr_matrix``.\n\n Returns\n -------\n p : ndarray of shape (n_samples, n_classes)\n The class log-probabilities of the input samples. The order of the\n classes corresponds to that in the attribute :term:`classes_`.\n\n Raises\n ------\n AttributeError\n If the ``loss`` does not support probabilities.\n \"\"\"\n proba = self.predict_proba(X)\n return np.log(proba)" }, { @@ -62297,7 +65769,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -62307,13 +65780,17 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "The input samples. 
Internally, it will be converted to\n``dtype=np.float32`` and if a sparse matrix is provided\nto a sparse ``csr_matrix``." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } } ], "results": [], "is_public": true, "description": "Predict class probabilities for X.", - "docstring": "Predict class probabilities for X.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input samples. Internally, it will be converted to\n ``dtype=np.float32`` and if a sparse matrix is provided\n to a sparse ``csr_matrix``.\n\nReturns\n-------\np : ndarray of shape (n_samples, n_classes)\n The class probabilities of the input samples. The order of the\n classes corresponds to that in the attribute :term:`classes_`.\n\nRaises\n------\nAttributeError\n If the ``loss`` does not support probabilities.", + "docstring": "Predict class probabilities for X.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input samples. Internally, it will be converted to\n ``dtype=np.float32`` and if a sparse matrix is provided\n to a sparse ``csr_matrix``.\n\n Returns\n -------\n p : ndarray of shape (n_samples, n_classes)\n The class probabilities of the input samples. The order of the\n classes corresponds to that in the attribute :term:`classes_`.\n\n Raises\n ------\n AttributeError\n If the ``loss`` does not support probabilities.\n ", "source_code": "\ndef predict_proba(self, X):\n \"\"\"Predict class probabilities for X.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input samples. Internally, it will be converted to\n ``dtype=np.float32`` and if a sparse matrix is provided\n to a sparse ``csr_matrix``.\n\n Returns\n -------\n p : ndarray of shape (n_samples, n_classes)\n The class probabilities of the input samples. The order of the\n classes corresponds to that in the attribute :term:`classes_`.\n\n Raises\n ------\n AttributeError\n If the ``loss`` does not support probabilities.\n \"\"\"\n raw_predictions = self.decision_function(X)\n try:\n return self.loss_._raw_prediction_to_proba(raw_predictions)\n except NotFittedError:\n raise\n except AttributeError as e:\n raise AttributeError('loss=%r does not support predict_proba' % self.loss) from e" }, { @@ -62331,7 +65808,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -62341,13 +65819,17 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "The input samples. Internally, it will be converted to\n``dtype=np.float32`` and if a sparse matrix is provided\nto a sparse ``csr_matrix``." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } } ], "results": [], "is_public": true, - "description": "Compute decision function of ``X`` for each iteration.\n\nThis method allows monitoring (i.e. determine error on testing set) after each stage.", - "docstring": "Compute decision function of ``X`` for each iteration.\n\nThis method allows monitoring (i.e. determine error on testing set)\nafter each stage.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input samples. 
Internally, it will be converted to\n ``dtype=np.float32`` and if a sparse matrix is provided\n to a sparse ``csr_matrix``.\n\nYields\n------\nscore : generator of ndarray of shape (n_samples, k)\n The decision function of the input samples, which corresponds to\n the raw values predicted from the trees of the ensemble . The\n classes corresponds to that in the attribute :term:`classes_`.\n Regression and binary classification are special cases with\n ``k == 1``, otherwise ``k==n_classes``.", + "description": "Compute decision function of ``X`` for each iteration.\n\nThis method allows monitoring (i.e. determine error on testing set)\nafter each stage.", + "docstring": "Compute decision function of ``X`` for each iteration.\n\n This method allows monitoring (i.e. determine error on testing set)\n after each stage.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input samples. Internally, it will be converted to\n ``dtype=np.float32`` and if a sparse matrix is provided\n to a sparse ``csr_matrix``.\n\n Yields\n ------\n score : generator of ndarray of shape (n_samples, k)\n The decision function of the input samples, which corresponds to\n the raw values predicted from the trees of the ensemble . The\n classes corresponds to that in the attribute :term:`classes_`.\n Regression and binary classification are special cases with\n ``k == 1``, otherwise ``k==n_classes``.\n ", "source_code": "\ndef staged_decision_function(self, X):\n \"\"\"Compute decision function of ``X`` for each iteration.\n\n This method allows monitoring (i.e. determine error on testing set)\n after each stage.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input samples. Internally, it will be converted to\n ``dtype=np.float32`` and if a sparse matrix is provided\n to a sparse ``csr_matrix``.\n\n Yields\n ------\n score : generator of ndarray of shape (n_samples, k)\n The decision function of the input samples, which corresponds to\n the raw values predicted from the trees of the ensemble . The\n classes corresponds to that in the attribute :term:`classes_`.\n Regression and binary classification are special cases with\n ``k == 1``, otherwise ``k==n_classes``.\n \"\"\"\n yield from self._staged_raw_predict(X)" }, { @@ -62365,7 +65847,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -62375,13 +65858,17 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "The input samples. Internally, it will be converted to\n``dtype=np.float32`` and if a sparse matrix is provided\nto a sparse ``csr_matrix``." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } } ], "results": [], "is_public": true, - "description": "Predict class at each stage for X.\n\nThis method allows monitoring (i.e. determine error on testing set) after each stage.", - "docstring": "Predict class at each stage for X.\n\nThis method allows monitoring (i.e. determine error on testing set)\nafter each stage.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input samples. Internally, it will be converted to\n ``dtype=np.float32`` and if a sparse matrix is provided\n to a sparse ``csr_matrix``.\n\nYields\n-------\ny : generator of ndarray of shape (n_samples,)\n The predicted value of the input samples.", + "description": "Predict class at each stage for X.\n\nThis method allows monitoring (i.e. 
determine error on testing set)\nafter each stage.", + "docstring": "Predict class at each stage for X.\n\n This method allows monitoring (i.e. determine error on testing set)\n after each stage.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input samples. Internally, it will be converted to\n ``dtype=np.float32`` and if a sparse matrix is provided\n to a sparse ``csr_matrix``.\n\n Yields\n -------\n y : generator of ndarray of shape (n_samples,)\n The predicted value of the input samples.\n ", "source_code": "\ndef staged_predict(self, X):\n \"\"\"Predict class at each stage for X.\n\n This method allows monitoring (i.e. determine error on testing set)\n after each stage.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input samples. Internally, it will be converted to\n ``dtype=np.float32`` and if a sparse matrix is provided\n to a sparse ``csr_matrix``.\n\n Yields\n -------\n y : generator of ndarray of shape (n_samples,)\n The predicted value of the input samples.\n \"\"\"\n for raw_predictions in self._staged_raw_predict(X):\n encoded_labels = self.loss_._raw_prediction_to_decision(raw_predictions)\n yield self.classes_.take(encoded_labels, axis=0)" }, { @@ -62399,7 +65886,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -62409,13 +65897,17 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "The input samples. Internally, it will be converted to\n``dtype=np.float32`` and if a sparse matrix is provided\nto a sparse ``csr_matrix``." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } } ], "results": [], "is_public": true, - "description": "Predict class probabilities at each stage for X.\n\nThis method allows monitoring (i.e. determine error on testing set) after each stage.", - "docstring": "Predict class probabilities at each stage for X.\n\nThis method allows monitoring (i.e. determine error on testing set)\nafter each stage.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input samples. Internally, it will be converted to\n ``dtype=np.float32`` and if a sparse matrix is provided\n to a sparse ``csr_matrix``.\n\nYields\n------\ny : generator of ndarray of shape (n_samples,)\n The predicted value of the input samples.", + "description": "Predict class probabilities at each stage for X.\n\nThis method allows monitoring (i.e. determine error on testing set)\nafter each stage.", + "docstring": "Predict class probabilities at each stage for X.\n\n This method allows monitoring (i.e. determine error on testing set)\n after each stage.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input samples. Internally, it will be converted to\n ``dtype=np.float32`` and if a sparse matrix is provided\n to a sparse ``csr_matrix``.\n\n Yields\n ------\n y : generator of ndarray of shape (n_samples,)\n The predicted value of the input samples.\n ", "source_code": "\ndef staged_predict_proba(self, X):\n \"\"\"Predict class probabilities at each stage for X.\n\n This method allows monitoring (i.e. determine error on testing set)\n after each stage.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input samples. 
Internally, it will be converted to\n ``dtype=np.float32`` and if a sparse matrix is provided\n to a sparse ``csr_matrix``.\n\n Yields\n ------\n y : generator of ndarray of shape (n_samples,)\n The predicted value of the input samples.\n \"\"\"\n try:\n for raw_predictions in self._staged_raw_predict(X):\n yield self.loss_._raw_prediction_to_proba(raw_predictions)\n except NotFittedError:\n raise\n except AttributeError as e:\n raise AttributeError('loss=%r does not support predict_proba' % self.loss) from e" }, { @@ -62433,7 +65925,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "loss", @@ -62443,6 +65936,15 @@ "docstring": { "type": "{'squared_error', 'absolute_error', 'huber', 'quantile'}, default='squared_error'", "description": "Loss function to be optimized. 'squared_error' refers to the squared\nerror for regression. 'absolute_error' refers to the absolute error of\nregression and is a robust loss function. 'huber' is a\ncombination of the two. 'quantile' allows quantile regression (use\n`alpha` to specify the quantile).\n\n.. deprecated:: 1.0\n The loss 'ls' was deprecated in v1.0 and will be removed in\n version 1.2. Use `loss='squared_error'` which is equivalent.\n\n.. deprecated:: 1.0\n The loss 'lad' was deprecated in v1.0 and will be removed in\n version 1.2. Use `loss='absolute_error'` which is equivalent." + }, + "refined_type": { + "kind": "EnumType", + "values": [ + "squared_error", + "quantile", + "huber", + "absolute_error" + ] } }, { @@ -62453,7 +65955,8 @@ "docstring": { "type": "float, default=0.1", "description": "Learning rate shrinks the contribution of each tree by `learning_rate`.\nThere is a trade-off between learning_rate and n_estimators." - } + }, + "refined_type": {} }, { "name": "n_estimators", @@ -62463,7 +65966,8 @@ "docstring": { "type": "int, default=100", "description": "The number of boosting stages to perform. Gradient boosting\nis fairly robust to over-fitting so a large number usually\nresults in better performance." - } + }, + "refined_type": {} }, { "name": "subsample", @@ -62473,7 +65977,8 @@ "docstring": { "type": "float, default=1.0", "description": "The fraction of samples to be used for fitting the individual base\nlearners. If smaller than 1.0 this results in Stochastic Gradient\nBoosting. `subsample` interacts with the parameter `n_estimators`.\nChoosing `subsample < 1.0` leads to a reduction of variance\nand an increase in bias." - } + }, + "refined_type": {} }, { "name": "criterion", @@ -62483,6 +65988,15 @@ "docstring": { "type": "{'friedman_mse', 'squared_error', 'mse', 'mae'}, default='friedman_mse'", "description": "The function to measure the quality of a split. Supported criteria\nare \"friedman_mse\" for the mean squared error with improvement\nscore by Friedman, \"squared_error\" for mean squared error, and \"mae\"\nfor the mean absolute error. The default value of \"friedman_mse\" is\ngenerally the best as it can provide a better approximation in some\ncases.\n\n.. versionadded:: 0.18\n\n.. deprecated:: 0.24\n `criterion='mae'` is deprecated and will be removed in version\n 1.1 (renaming of 0.26). The correct way of minimizing the absolute\n error is to use `loss='absolute_error'` instead.\n\n.. deprecated:: 1.0\n Criterion 'mse' was deprecated in v1.0 and will be removed in\n version 1.2. Use `criterion='squared_error'` which is equivalent." 
+ }, + "refined_type": { + "kind": "EnumType", + "values": [ + "friedman_mse", + "squared_error", + "mse", + "mae" + ] } }, { @@ -62493,7 +66007,8 @@ "docstring": { "type": "int or float, default=2", "description": "The minimum number of samples required to split an internal node:\n\n- If int, then consider `min_samples_split` as the minimum number.\n- If float, then `min_samples_split` is a fraction and\n `ceil(min_samples_split * n_samples)` are the minimum\n number of samples for each split.\n\n.. versionchanged:: 0.18\n Added float values for fractions." - } + }, + "refined_type": {} }, { "name": "min_samples_leaf", @@ -62503,7 +66018,8 @@ "docstring": { "type": "int or float, default=1", "description": "The minimum number of samples required to be at a leaf node.\nA split point at any depth will only be considered if it leaves at\nleast ``min_samples_leaf`` training samples in each of the left and\nright branches. This may have the effect of smoothing the model,\nespecially in regression.\n\n- If int, then consider `min_samples_leaf` as the minimum number.\n- If float, then `min_samples_leaf` is a fraction and\n `ceil(min_samples_leaf * n_samples)` are the minimum\n number of samples for each node.\n\n.. versionchanged:: 0.18\n Added float values for fractions." - } + }, + "refined_type": {} }, { "name": "min_weight_fraction_leaf", @@ -62513,7 +66029,8 @@ "docstring": { "type": "float, default=0.0", "description": "The minimum weighted fraction of the sum total of weights (of all\nthe input samples) required to be at a leaf node. Samples have\nequal weight when sample_weight is not provided." - } + }, + "refined_type": {} }, { "name": "max_depth", @@ -62523,7 +66040,8 @@ "docstring": { "type": "int, default=3", "description": "Maximum depth of the individual regression estimators. The maximum\ndepth limits the number of nodes in the tree. Tune this parameter\nfor best performance; the best value depends on the interaction\nof the input variables." - } + }, + "refined_type": {} }, { "name": "min_impurity_decrease", @@ -62533,7 +66051,8 @@ "docstring": { "type": "float, default=0.0", "description": "A node will be split if this split induces a decrease of the impurity\ngreater than or equal to this value.\n\nThe weighted impurity decrease equation is the following::\n\n N_t / N * (impurity - N_t_R / N_t * right_impurity\n - N_t_L / N_t * left_impurity)\n\nwhere ``N`` is the total number of samples, ``N_t`` is the number of\nsamples at the current node, ``N_t_L`` is the number of samples in the\nleft child, and ``N_t_R`` is the number of samples in the right child.\n\n``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum,\nif ``sample_weight`` is passed.\n\n.. versionadded:: 0.19" - } + }, + "refined_type": {} }, { "name": "init", @@ -62543,7 +66062,8 @@ "docstring": { "type": "estimator or 'zero', default=None", "description": "An estimator object that is used to compute the initial predictions.\n``init`` has to provide :term:`fit` and :term:`predict`. If 'zero', the\ninitial raw predictions are set to zero. By default a\n``DummyEstimator`` is used, predicting either the average target value\n(for loss='squared_error'), or a quantile for the other losses." 
- } + }, + "refined_type": {} }, { "name": "random_state", @@ -62553,7 +66073,8 @@ "docstring": { "type": "int, RandomState instance or None, default=None", "description": "Controls the random seed given to each Tree estimator at each\nboosting iteration.\nIn addition, it controls the random permutation of the features at\neach split (see Notes for more details).\nIt also controls the random splitting of the training data to obtain a\nvalidation set if `n_iter_no_change` is not None.\nPass an int for reproducible output across multiple function calls.\nSee :term:`Glossary `." - } + }, + "refined_type": {} }, { "name": "max_features", @@ -62563,6 +66084,10 @@ "docstring": { "type": "{'auto', 'sqrt', 'log2'}, int or float, default=None", "description": "The number of features to consider when looking for the best split:\n\n- If int, then consider `max_features` features at each split.\n- If float, then `max_features` is a fraction and\n `int(max_features * n_features)` features are considered at each\n split.\n- If \"auto\", then `max_features=n_features`.\n- If \"sqrt\", then `max_features=sqrt(n_features)`.\n- If \"log2\", then `max_features=log2(n_features)`.\n- If None, then `max_features=n_features`.\n\nChoosing `max_features < n_features` leads to a reduction of variance\nand an increase in bias.\n\nNote: the search for a split does not stop until at least one\nvalid partition of the node samples is found, even if it requires to\neffectively inspect more than ``max_features`` features." + }, + "refined_type": { + "kind": "EnumType", + "values": ["auto", "sqrt", "log2"] } }, { @@ -62573,7 +66098,8 @@ "docstring": { "type": "float, default=0.9", "description": "The alpha-quantile of the huber loss function and the quantile\nloss function. Only if ``loss='huber'`` or ``loss='quantile'``." - } + }, + "refined_type": {} }, { "name": "verbose", @@ -62583,7 +66109,8 @@ "docstring": { "type": "int, default=0", "description": "Enable verbose output. If 1 then it prints progress and performance\nonce in a while (the more trees the lower the frequency). If greater\nthan 1 then it prints progress and performance for every tree." - } + }, + "refined_type": {} }, { "name": "max_leaf_nodes", @@ -62593,7 +66120,8 @@ "docstring": { "type": "int, default=None", "description": "Grow trees with ``max_leaf_nodes`` in best-first fashion.\nBest nodes are defined as relative reduction in impurity.\nIf None then unlimited number of leaf nodes." - } + }, + "refined_type": {} }, { "name": "warm_start", @@ -62603,7 +66131,8 @@ "docstring": { "type": "bool, default=False", "description": "When set to ``True``, reuse the solution of the previous call to fit\nand add more estimators to the ensemble, otherwise, just erase the\nprevious solution. See :term:`the Glossary `." - } + }, + "refined_type": {} }, { "name": "validation_fraction", @@ -62613,7 +66142,8 @@ "docstring": { "type": "float, default=0.1", "description": "The proportion of training data to set aside as validation set for\nearly stopping. Must be between 0 and 1.\nOnly used if ``n_iter_no_change`` is set to an integer.\n\n.. versionadded:: 0.20" - } + }, + "refined_type": {} }, { "name": "n_iter_no_change", @@ -62623,7 +66153,8 @@ "docstring": { "type": "int, default=None", "description": "``n_iter_no_change`` is used to decide if early stopping will be used\nto terminate training when validation score is not improving. By\ndefault it is set to None to disable early stopping. 
If set to a\nnumber, it will set aside ``validation_fraction`` size of the training\ndata as validation and terminate training when validation score is not\nimproving in all of the previous ``n_iter_no_change`` numbers of\niterations.\n\n.. versionadded:: 0.20" - } + }, + "refined_type": {} }, { "name": "tol", @@ -62633,7 +66164,8 @@ "docstring": { "type": "float, default=1e-4", "description": "Tolerance for the early stopping. When the loss is not improving\nby at least tol for ``n_iter_no_change`` iterations (if set to a\nnumber), the training stops.\n\n.. versionadded:: 0.20" - } + }, + "refined_type": {} }, { "name": "ccp_alpha", @@ -62643,13 +66175,14 @@ "docstring": { "type": "non-negative float, default=0.0", "description": "Complexity parameter used for Minimal Cost-Complexity Pruning. The\nsubtree with the largest cost complexity that is smaller than\n``ccp_alpha`` will be chosen. By default, no pruning is performed. See\n:ref:`minimal_cost_complexity_pruning` for details.\n\n.. versionadded:: 0.22" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, *, loss='squared_error', learning_rate=0.1, n_estimators=100, subsample=1.0, criterion='friedman_mse', min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_depth=3, min_impurity_decrease=0.0, init=None, random_state=None, max_features=None, alpha=0.9, verbose=0, max_leaf_nodes=None, warm_start=False, validation_fraction=0.1, n_iter_no_change=None, tol=0.0001, ccp_alpha=0.0):\n super().__init__(loss=loss, learning_rate=learning_rate, n_estimators=n_estimators, criterion=criterion, min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf, min_weight_fraction_leaf=min_weight_fraction_leaf, max_depth=max_depth, init=init, subsample=subsample, max_features=max_features, min_impurity_decrease=min_impurity_decrease, random_state=random_state, alpha=alpha, verbose=verbose, max_leaf_nodes=max_leaf_nodes, warm_start=warm_start, validation_fraction=validation_fraction, n_iter_no_change=n_iter_no_change, tol=tol, ccp_alpha=ccp_alpha)" }, { @@ -62667,7 +66200,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -62677,7 +66211,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -62687,13 +66222,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _validate_y(self, y, sample_weight=None):\n if y.dtype.kind == 'O':\n y = y.astype(DOUBLE)\n return y" }, { @@ -62711,13 +66247,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _warn_mae_for_criterion(self):\n warnings.warn(\"criterion='mae' was deprecated in version 0.24 and will be removed in version 1.1 (renaming of 0.26). The correct way of minimizing the absolute error is to use loss='absolute_error' instead.\", FutureWarning)" }, { @@ -62735,7 +66272,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -62745,13 +66283,17 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "The input samples. 
Internally, its dtype will be converted to\n``dtype=np.float32``. If a sparse matrix is provided, it will\nbe converted to a sparse ``csr_matrix``." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } } ], "results": [], "is_public": true, "description": "Apply trees in the ensemble to X, return leaf indices.\n\n.. versionadded:: 0.17", - "docstring": "Apply trees in the ensemble to X, return leaf indices.\n\n.. versionadded:: 0.17\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input samples. Internally, its dtype will be converted to\n ``dtype=np.float32``. If a sparse matrix is provided, it will\n be converted to a sparse ``csr_matrix``.\n\nReturns\n-------\nX_leaves : array-like of shape (n_samples, n_estimators)\n For each datapoint x in X and for each tree in the ensemble,\n return the index of the leaf x ends up in each estimator.", + "docstring": "Apply trees in the ensemble to X, return leaf indices.\n\n .. versionadded:: 0.17\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input samples. Internally, its dtype will be converted to\n ``dtype=np.float32``. If a sparse matrix is provided, it will\n be converted to a sparse ``csr_matrix``.\n\n Returns\n -------\n X_leaves : array-like of shape (n_samples, n_estimators)\n For each datapoint x in X and for each tree in the ensemble,\n return the index of the leaf x ends up in each estimator.\n ", "source_code": "\ndef apply(self, X):\n \"\"\"Apply trees in the ensemble to X, return leaf indices.\n\n .. versionadded:: 0.17\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input samples. Internally, its dtype will be converted to\n ``dtype=np.float32``. If a sparse matrix is provided, it will\n be converted to a sparse ``csr_matrix``.\n\n Returns\n -------\n X_leaves : array-like of shape (n_samples, n_estimators)\n For each datapoint x in X and for each tree in the ensemble,\n return the index of the leaf x ends up in each estimator.\n \"\"\"\n leaves = super().apply(X)\n leaves = leaves.reshape(X.shape[0], self.estimators_.shape[0])\n return leaves" }, { @@ -62772,13 +66314,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@deprecated('Attribute `n_classes_` was deprecated in version 0.24 and will be removed in 1.1 (renaming of 0.26).')\n@property\ndef n_classes_(self):\n try:\n check_is_fitted(self)\n except NotFittedError as nfe:\n raise AttributeError('{} object has no n_classes_ attribute.'.format(self.__class__.__name__)) from nfe\n return 1" }, { @@ -62796,7 +66339,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -62806,13 +66350,17 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "The input samples. Internally, it will be converted to\n``dtype=np.float32`` and if a sparse matrix is provided\nto a sparse ``csr_matrix``." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } } ], "results": [], "is_public": true, "description": "Predict regression target for X.", - "docstring": "Predict regression target for X.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input samples. 
Internally, it will be converted to\n ``dtype=np.float32`` and if a sparse matrix is provided\n to a sparse ``csr_matrix``.\n\nReturns\n-------\ny : ndarray of shape (n_samples,)\n The predicted values.", + "docstring": "Predict regression target for X.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input samples. Internally, it will be converted to\n ``dtype=np.float32`` and if a sparse matrix is provided\n to a sparse ``csr_matrix``.\n\n Returns\n -------\n y : ndarray of shape (n_samples,)\n The predicted values.\n ", "source_code": "\ndef predict(self, X):\n \"\"\"Predict regression target for X.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input samples. Internally, it will be converted to\n ``dtype=np.float32`` and if a sparse matrix is provided\n to a sparse ``csr_matrix``.\n\n Returns\n -------\n y : ndarray of shape (n_samples,)\n The predicted values.\n \"\"\"\n X = self._validate_data(X, dtype=DTYPE, order='C', accept_sparse='csr', reset=False)\n return self._raw_predict(X).ravel()" }, { @@ -62830,7 +66378,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -62840,13 +66389,17 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "The input samples. Internally, it will be converted to\n``dtype=np.float32`` and if a sparse matrix is provided\nto a sparse ``csr_matrix``." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } } ], "results": [], "is_public": true, - "description": "Predict regression target at each stage for X.\n\nThis method allows monitoring (i.e. determine error on testing set) after each stage.", - "docstring": "Predict regression target at each stage for X.\n\nThis method allows monitoring (i.e. determine error on testing set)\nafter each stage.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input samples. Internally, it will be converted to\n ``dtype=np.float32`` and if a sparse matrix is provided\n to a sparse ``csr_matrix``.\n\nYields\n------\ny : generator of ndarray of shape (n_samples,)\n The predicted value of the input samples.", + "description": "Predict regression target at each stage for X.\n\nThis method allows monitoring (i.e. determine error on testing set)\nafter each stage.", + "docstring": "Predict regression target at each stage for X.\n\n This method allows monitoring (i.e. determine error on testing set)\n after each stage.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input samples. Internally, it will be converted to\n ``dtype=np.float32`` and if a sparse matrix is provided\n to a sparse ``csr_matrix``.\n\n Yields\n ------\n y : generator of ndarray of shape (n_samples,)\n The predicted value of the input samples.\n ", "source_code": "\ndef staged_predict(self, X):\n \"\"\"Predict regression target at each stage for X.\n\n This method allows monitoring (i.e. determine error on testing set)\n after each stage.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input samples. 
Internally, it will be converted to\n ``dtype=np.float32`` and if a sparse matrix is provided\n to a sparse ``csr_matrix``.\n\n Yields\n ------\n y : generator of ndarray of shape (n_samples,)\n The predicted value of the input samples.\n \"\"\"\n for raw_predictions in self._staged_raw_predict(X):\n yield raw_predictions.ravel()" }, { @@ -62864,7 +66417,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "verbose", @@ -62874,13 +66428,14 @@ "docstring": { "type": "int", "description": "Verbosity level. If ``verbose==1`` output is printed once in a while\n(when iteration mod verbose_mod is zero).; if larger than 1 then output\nis printed for each update." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, verbose):\n self.verbose = verbose" }, { @@ -62898,7 +66453,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "est", @@ -62908,7 +66464,8 @@ "docstring": { "type": "Estimator", "description": "The estimator" - } + }, + "refined_type": {} }, { "name": "begin_at_stage", @@ -62918,13 +66475,14 @@ "docstring": { "type": "int, default=0", "description": "stage at which to begin reporting" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Initialize reporter", - "docstring": "Initialize reporter\n\nParameters\n----------\nest : Estimator\n The estimator\n\nbegin_at_stage : int, default=0\n stage at which to begin reporting", + "docstring": "Initialize reporter\n\n Parameters\n ----------\n est : Estimator\n The estimator\n\n begin_at_stage : int, default=0\n stage at which to begin reporting\n ", "source_code": "\ndef init(self, est, begin_at_stage=0):\n \"\"\"Initialize reporter\n\n Parameters\n ----------\n est : Estimator\n The estimator\n\n begin_at_stage : int, default=0\n stage at which to begin reporting\n \"\"\"\n header_fields = ['Iter', 'Train Loss']\n verbose_fmt = ['{iter:>10d}', '{train_score:>16.4f}']\n if est.subsample < 1:\n header_fields.append('OOB Improve')\n verbose_fmt.append('{oob_impr:>16.4f}')\n header_fields.append('Remaining Time')\n verbose_fmt.append('{remaining_time:>16s}')\n print(('%10s ' + '%16s ' * (len(header_fields) - 1)) % tuple(header_fields))\n self.verbose_fmt = ' '.join(verbose_fmt)\n self.verbose_mod = 1\n self.start_time = time()\n self.begin_at_stage = begin_at_stage" }, { @@ -62942,7 +66500,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "j", @@ -62952,7 +66511,8 @@ "docstring": { "type": "int", "description": "The new iteration." - } + }, + "refined_type": {} }, { "name": "est", @@ -62962,13 +66522,14 @@ "docstring": { "type": "Estimator", "description": "The estimator." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Update reporter with new iteration.", - "docstring": "Update reporter with new iteration.\n\nParameters\n----------\nj : int\n The new iteration.\nest : Estimator\n The estimator.", + "docstring": "Update reporter with new iteration.\n\n Parameters\n ----------\n j : int\n The new iteration.\n est : Estimator\n The estimator.\n ", "source_code": "\ndef update(self, j, est):\n \"\"\"Update reporter with new iteration.\n\n Parameters\n ----------\n j : int\n The new iteration.\n est : Estimator\n The estimator.\n \"\"\"\n do_oob = est.subsample < 1\n i = j - self.begin_at_stage\n if (i + 1) % self.verbose_mod == 0:\n oob_impr = est.oob_improvement_[j] if do_oob else 0\n remaining_time = (est.n_estimators - (j + 1)) * (time() - self.start_time) / float(i + 1)\n if remaining_time > 60:\n remaining_time = '{0:.2f}m'.format(remaining_time / 60.0)\n else:\n remaining_time = '{0:.2f}s'.format(remaining_time)\n print(self.verbose_fmt.format(iter=j + 1, train_score=est.train_score_[j], oob_impr=oob_impr, remaining_time=remaining_time))\n if self.verbose == 1 and (i + 1) // (self.verbose_mod * 10) > 0:\n self.verbose_mod *= 10" }, { @@ -62986,7 +66547,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -62996,7 +66558,8 @@ "docstring": { "type": "ndarray of shape (n_samples,)", "description": "True labels." - } + }, + "refined_type": {} }, { "name": "raw_predictions", @@ -63006,7 +66569,8 @@ "docstring": { "type": "ndarray of shape (n_samples, K)", "description": "The raw predictions (i.e. values from the tree leaves) of the\ntree ensemble." - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -63016,13 +66580,14 @@ "docstring": { "type": "ndarray of shape (n_samples,), default=None", "description": "Sample weights." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Compute the deviance (= 2 * negative log-likelihood).", - "docstring": "Compute the deviance (= 2 * negative log-likelihood).\n\nParameters\n----------\ny : ndarray of shape (n_samples,)\n True labels.\n\nraw_predictions : ndarray of shape (n_samples, K)\n The raw predictions (i.e. values from the tree leaves) of the\n tree ensemble.\n\nsample_weight : ndarray of shape (n_samples,), default=None\n Sample weights.", + "docstring": "Compute the deviance (= 2 * negative log-likelihood).\n\n Parameters\n ----------\n y : ndarray of shape (n_samples,)\n True labels.\n\n raw_predictions : ndarray of shape (n_samples, K)\n The raw predictions (i.e. values from the tree leaves) of the\n tree ensemble.\n\n sample_weight : ndarray of shape (n_samples,), default=None\n Sample weights.\n ", "source_code": "\ndef __call__(self, y, raw_predictions, sample_weight=None):\n \"\"\"Compute the deviance (= 2 * negative log-likelihood).\n\n Parameters\n ----------\n y : ndarray of shape (n_samples,)\n True labels.\n\n raw_predictions : ndarray of shape (n_samples, K)\n The raw predictions (i.e. 
values from the tree leaves) of the\n tree ensemble.\n\n sample_weight : ndarray of shape (n_samples,), default=None\n Sample weights.\n \"\"\"\n raw_predictions = raw_predictions.ravel()\n if sample_weight is None:\n return -2 * np.mean(y * raw_predictions - np.logaddexp(0, raw_predictions))\n else:\n return -2 / sample_weight.sum() * np.sum(sample_weight * (y * raw_predictions - np.logaddexp(0, raw_predictions)))" }, { @@ -63040,7 +66605,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_classes", @@ -63050,13 +66616,14 @@ "docstring": { "type": "int", "description": "Number of classes." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, n_classes):\n if n_classes != 2:\n raise ValueError('{0:s} requires 2 classes; got {1:d} class(es)'.format(self.__class__.__name__, n_classes))\n super().__init__(n_classes=1)" }, { @@ -63074,7 +66641,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "raw_predictions", @@ -63084,13 +66652,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _raw_prediction_to_decision(self, raw_predictions):\n proba = self._raw_prediction_to_proba(raw_predictions)\n return np.argmax(proba, axis=1)" }, { @@ -63108,7 +66677,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "raw_predictions", @@ -63118,13 +66688,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _raw_prediction_to_proba(self, raw_predictions):\n proba = np.ones((raw_predictions.shape[0], 2), dtype=np.float64)\n proba[:, 1] = expit(raw_predictions.ravel())\n proba[:, 0] -= proba[:, 1]\n return proba" }, { @@ -63142,7 +66713,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "tree", @@ -63152,7 +66724,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "terminal_regions", @@ -63162,7 +66735,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "leaf", @@ -63172,7 +66746,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -63182,7 +66757,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -63192,7 +66768,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "residual", @@ -63202,7 +66779,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "raw_predictions", @@ -63212,7 +66790,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -63222,13 +66801,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Make a single Newton-Raphson step.\n\nour node estimate is given by: sum(w * (y - prob)) / sum(w * prob * (1 - prob)) we take advantage that: y - prob = residual", - "docstring": "Make a single Newton-Raphson step.\n\nour node estimate is given by:\n\n sum(w * (y - prob)) / sum(w * prob * (1 - prob))\n\nwe take advantage that: y - prob = 
residual", + "description": "Make a single Newton-Raphson step.\n\nour node estimate is given by:\n\n sum(w * (y - prob)) / sum(w * prob * (1 - prob))\n\nwe take advantage that: y - prob = residual", + "docstring": "Make a single Newton-Raphson step.\n\n our node estimate is given by:\n\n sum(w * (y - prob)) / sum(w * prob * (1 - prob))\n\n we take advantage that: y - prob = residual\n ", "source_code": "\ndef _update_terminal_region(self, tree, terminal_regions, leaf, X, y, residual, raw_predictions, sample_weight):\n \"\"\"Make a single Newton-Raphson step.\n\n our node estimate is given by:\n\n sum(w * (y - prob)) / sum(w * prob * (1 - prob))\n\n we take advantage that: y - prob = residual\n \"\"\"\n terminal_region = np.where(terminal_regions == leaf)[0]\n residual = residual.take(terminal_region, axis=0)\n y = y.take(terminal_region, axis=0)\n sample_weight = sample_weight.take(terminal_region, axis=0)\n numerator = np.sum(sample_weight * residual)\n denominator = np.sum(sample_weight * (y - residual) * (1 - y + residual))\n if abs(denominator) < 1e-150:\n tree.value[leaf, 0, 0] = 0.0\n else:\n tree.value[leaf, 0, 0] = numerator / denominator" }, { @@ -63246,7 +66826,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -63256,7 +66837,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "estimator", @@ -63266,13 +66848,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef get_init_raw_predictions(self, X, estimator):\n probas = estimator.predict_proba(X)\n proba_pos_class = probas[:, 1]\n eps = np.finfo(np.float32).eps\n proba_pos_class = np.clip(proba_pos_class, eps, 1 - eps)\n raw_predictions = np.log(proba_pos_class / (1 - proba_pos_class))\n return raw_predictions.reshape(-1, 1).astype(np.float64)" }, { @@ -63290,13 +66873,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef init_estimator(self):\n return DummyClassifier(strategy='prior')" }, { @@ -63314,7 +66898,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -63324,7 +66909,8 @@ "docstring": { "type": "ndarray of shape (n_samples,)", "description": "True labels." - } + }, + "refined_type": {} }, { "name": "raw_predictions", @@ -63334,13 +66920,14 @@ "docstring": { "type": "ndarray of shape (n_samples, K)", "description": "The raw predictions (i.e. values from the tree leaves) of the\ntree ensemble at iteration ``i - 1``." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Compute half of the negative gradient.", - "docstring": "Compute half of the negative gradient.\n\nParameters\n----------\ny : ndarray of shape (n_samples,)\n True labels.\n\nraw_predictions : ndarray of shape (n_samples, K)\n The raw predictions (i.e. values from the tree leaves) of the\n tree ensemble at iteration ``i - 1``.", + "docstring": "Compute half of the negative gradient.\n\n Parameters\n ----------\n y : ndarray of shape (n_samples,)\n True labels.\n\n raw_predictions : ndarray of shape (n_samples, K)\n The raw predictions (i.e. 
values from the tree leaves) of the\n tree ensemble at iteration ``i - 1``.\n ", "source_code": "\ndef negative_gradient(self, y, raw_predictions, **kargs):\n \"\"\"Compute half of the negative gradient.\n\n Parameters\n ----------\n y : ndarray of shape (n_samples,)\n True labels.\n\n raw_predictions : ndarray of shape (n_samples, K)\n The raw predictions (i.e. values from the tree leaves) of the\n tree ensemble at iteration ``i - 1``.\n \"\"\"\n return y - expit(raw_predictions.ravel())" }, { @@ -63358,7 +66945,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "raw_predictions", @@ -63368,13 +66956,14 @@ "docstring": { "type": "ndarray of shape (n_samples, K)", "description": "The raw predictions (i.e. values from the tree leaves) of the\ntree ensemble." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Template method to convert raw predictions to decisions.", - "docstring": "Template method to convert raw predictions to decisions.\n\nParameters\n----------\nraw_predictions : ndarray of shape (n_samples, K)\n The raw predictions (i.e. values from the tree leaves) of the\n tree ensemble.\n\nReturns\n-------\nencoded_predictions : ndarray of shape (n_samples, K)\n The predicted encoded labels.", + "docstring": "Template method to convert raw predictions to decisions.\n\n Parameters\n ----------\n raw_predictions : ndarray of shape (n_samples, K)\n The raw predictions (i.e. values from the tree leaves) of the\n tree ensemble.\n\n Returns\n -------\n encoded_predictions : ndarray of shape (n_samples, K)\n The predicted encoded labels.\n ", "source_code": "\n@abstractmethod\ndef _raw_prediction_to_decision(self, raw_predictions):\n \"\"\"Template method to convert raw predictions to decisions.\n\n Parameters\n ----------\n raw_predictions : ndarray of shape (n_samples, K)\n The raw predictions (i.e. values from the tree leaves) of the\n tree ensemble.\n\n Returns\n -------\n encoded_predictions : ndarray of shape (n_samples, K)\n The predicted encoded labels.\n \"\"\"\n " }, { @@ -63392,7 +66981,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "raw_predictions", @@ -63402,13 +66992,14 @@ "docstring": { "type": "ndarray of shape (n_samples, K)", "description": "The raw predictions (i.e. values from the tree leaves) of the\ntree ensemble." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Template method to convert raw predictions into probabilities.", - "docstring": "Template method to convert raw predictions into probabilities.\n\nParameters\n----------\nraw_predictions : ndarray of shape (n_samples, K)\n The raw predictions (i.e. values from the tree leaves) of the\n tree ensemble.\n\nReturns\n-------\nprobas : ndarray of shape (n_samples, K)\n The predicted probabilities.", + "docstring": "Template method to convert raw predictions into probabilities.\n\n Parameters\n ----------\n raw_predictions : ndarray of shape (n_samples, K)\n The raw predictions (i.e. values from the tree leaves) of the\n tree ensemble.\n\n Returns\n -------\n probas : ndarray of shape (n_samples, K)\n The predicted probabilities.\n ", "source_code": "\ndef _raw_prediction_to_proba(self, raw_predictions):\n \"\"\"Template method to convert raw predictions into probabilities.\n\n Parameters\n ----------\n raw_predictions : ndarray of shape (n_samples, K)\n The raw predictions (i.e. 
values from the tree leaves) of the\n tree ensemble.\n\n Returns\n -------\n probas : ndarray of shape (n_samples, K)\n The predicted probabilities.\n \"\"\"\n " }, { @@ -63426,7 +67017,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "estimator", @@ -63436,13 +67028,14 @@ "docstring": { "type": "object", "description": "The init estimator to check." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Make sure estimator has fit and predict_proba methods.", - "docstring": "Make sure estimator has fit and predict_proba methods.\n\nParameters\n----------\nestimator : object\n The init estimator to check.", + "docstring": "Make sure estimator has fit and predict_proba methods.\n\n Parameters\n ----------\n estimator : object\n The init estimator to check.\n ", "source_code": "\ndef check_init_estimator(self, estimator):\n \"\"\"Make sure estimator has fit and predict_proba methods.\n\n Parameters\n ----------\n estimator : object\n The init estimator to check.\n \"\"\"\n if not (hasattr(estimator, 'fit') and hasattr(estimator, 'predict_proba')):\n raise ValueError('The init parameter must be a valid estimator and support both fit and predict_proba.')" }, { @@ -63460,7 +67053,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -63470,7 +67064,8 @@ "docstring": { "type": "ndarray of shape (n_samples,)", "description": "True labels." - } + }, + "refined_type": {} }, { "name": "raw_predictions", @@ -63480,7 +67075,8 @@ "docstring": { "type": "ndarray of shape (n_samples, K)", "description": "The raw predictions (i.e. values from the tree leaves) of the\ntree ensemble." - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -63490,13 +67086,14 @@ "docstring": { "type": "ndarray of shape (n_samples,), default=None", "description": "Sample weights." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Compute the exponential loss", - "docstring": "Compute the exponential loss\n\nParameters\n----------\ny : ndarray of shape (n_samples,)\n True labels.\n\nraw_predictions : ndarray of shape (n_samples, K)\n The raw predictions (i.e. values from the tree leaves) of the\n tree ensemble.\n\nsample_weight : ndarray of shape (n_samples,), default=None\n Sample weights.", + "docstring": "Compute the exponential loss\n\n Parameters\n ----------\n y : ndarray of shape (n_samples,)\n True labels.\n\n raw_predictions : ndarray of shape (n_samples, K)\n The raw predictions (i.e. values from the tree leaves) of the\n tree ensemble.\n\n sample_weight : ndarray of shape (n_samples,), default=None\n Sample weights.\n ", "source_code": "\ndef __call__(self, y, raw_predictions, sample_weight=None):\n \"\"\"Compute the exponential loss\n\n Parameters\n ----------\n y : ndarray of shape (n_samples,)\n True labels.\n\n raw_predictions : ndarray of shape (n_samples, K)\n The raw predictions (i.e. 
values from the tree leaves) of the\n tree ensemble.\n\n sample_weight : ndarray of shape (n_samples,), default=None\n Sample weights.\n \"\"\"\n raw_predictions = raw_predictions.ravel()\n if sample_weight is None:\n return np.mean(np.exp(-(2.0 * y - 1.0) * raw_predictions))\n else:\n return 1.0 / sample_weight.sum() * np.sum(sample_weight * np.exp(-(2 * y - 1) * raw_predictions))" }, { @@ -63514,7 +67111,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_classes", @@ -63524,13 +67122,14 @@ "docstring": { "type": "int", "description": "Number of classes." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, n_classes):\n if n_classes != 2:\n raise ValueError('{0:s} requires 2 classes; got {1:d} class(es)'.format(self.__class__.__name__, n_classes))\n super().__init__(n_classes=1)" }, { @@ -63548,7 +67147,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "raw_predictions", @@ -63558,13 +67158,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _raw_prediction_to_decision(self, raw_predictions):\n return (raw_predictions.ravel() >= 0).astype(int)" }, { @@ -63582,7 +67183,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "raw_predictions", @@ -63592,13 +67194,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _raw_prediction_to_proba(self, raw_predictions):\n proba = np.ones((raw_predictions.shape[0], 2), dtype=np.float64)\n proba[:, 1] = expit(2.0 * raw_predictions.ravel())\n proba[:, 0] -= proba[:, 1]\n return proba" }, { @@ -63616,7 +67219,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "tree", @@ -63626,7 +67230,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "terminal_regions", @@ -63636,7 +67241,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "leaf", @@ -63646,7 +67252,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -63656,7 +67263,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -63666,7 +67274,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "residual", @@ -63676,7 +67285,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "raw_predictions", @@ -63686,7 +67296,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -63696,13 +67307,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _update_terminal_region(self, tree, terminal_regions, leaf, X, y, residual, raw_predictions, sample_weight):\n terminal_region = np.where(terminal_regions == leaf)[0]\n raw_predictions = raw_predictions.take(terminal_region, axis=0)\n y = y.take(terminal_region, axis=0)\n sample_weight = sample_weight.take(terminal_region, axis=0)\n y_ = 2.0 * y - 
1.0\n numerator = np.sum(y_ * sample_weight * np.exp(-y_ * raw_predictions))\n denominator = np.sum(sample_weight * np.exp(-y_ * raw_predictions))\n if abs(denominator) < 1e-150:\n tree.value[leaf, 0, 0] = 0.0\n else:\n tree.value[leaf, 0, 0] = numerator / denominator" }, { @@ -63720,7 +67332,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -63730,7 +67343,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "estimator", @@ -63740,13 +67354,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef get_init_raw_predictions(self, X, estimator):\n probas = estimator.predict_proba(X)\n proba_pos_class = probas[:, 1]\n eps = np.finfo(np.float32).eps\n proba_pos_class = np.clip(proba_pos_class, eps, 1 - eps)\n raw_predictions = 0.5 * np.log(proba_pos_class / (1 - proba_pos_class))\n return raw_predictions.reshape(-1, 1).astype(np.float64)" }, { @@ -63764,13 +67379,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef init_estimator(self):\n return DummyClassifier(strategy='prior')" }, { @@ -63788,7 +67404,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -63798,7 +67415,8 @@ "docstring": { "type": "ndarray of shape (n_samples,)", "description": "True labels." - } + }, + "refined_type": {} }, { "name": "raw_predictions", @@ -63808,14 +67426,15 @@ "docstring": { "type": "ndarray of shape (n_samples, K)", "description": "The raw predictions (i.e. values from the tree leaves) of the\ntree ensemble at iteration ``i - 1``." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Compute the residual (= negative gradient).", - "docstring": "Compute the residual (= negative gradient).\n\nParameters\n----------\ny : ndarray of shape (n_samples,)\n True labels.\n\nraw_predictions : ndarray of shape (n_samples, K)\n The raw predictions (i.e. values from the tree leaves) of the\n tree ensemble at iteration ``i - 1``.", - "source_code": "\ndef negative_gradient(self, y, raw_predictions, **kargs):\n \"\"\"Compute the residual (= negative gradient).\n\n Parameters\n ----------\n y : ndarray of shape (n_samples,)\n True labels.\n\n raw_predictions : ndarray of shape (n_samples, K)\n The raw predictions (i.e. values from the tree leaves) of the\n tree ensemble at iteration ``i - 1``.\n \"\"\"\n y_ = -(2.0 * y - 1.0)\n return y_ * np.exp(y_ * raw_predictions.ravel())" + "docstring": "Compute the residual (= negative gradient).\n\n Parameters\n ----------\n y : ndarray of shape (n_samples,)\n True labels.\n\n raw_predictions : ndarray of shape (n_samples, K)\n The raw predictions (i.e. values from the tree leaves) of the\n tree ensemble at iteration ``i - 1``.\n ", + "source_code": "\ndef negative_gradient(self, y, raw_predictions, **kargs):\n \"\"\"Compute the residual (= negative gradient).\n\n Parameters\n ----------\n y : ndarray of shape (n_samples,)\n True labels.\n\n raw_predictions : ndarray of shape (n_samples, K)\n The raw predictions (i.e. 
values from the tree leaves) of the\n tree ensemble at iteration ``i - 1``.\n \"\"\"\n y_ = 2.0 * y - 1.0\n return y_ * np.exp(-y_ * raw_predictions.ravel())" }, { "name": "__call__", @@ -63832,7 +67451,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -63842,7 +67462,8 @@ "docstring": { "type": "ndarray of shape (n_samples,)", "description": "True labels." - } + }, + "refined_type": {} }, { "name": "raw_predictions", @@ -63852,7 +67473,8 @@ "docstring": { "type": "ndarray of shape (n_samples, K)", "description": "The raw predictions (i.e. values from the tree leaves) of the\ntree ensemble." - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -63862,13 +67484,14 @@ "docstring": { "type": "ndarray of shape (n_samples,), default=None", "description": "Sample weights." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Compute the Huber loss.", - "docstring": "Compute the Huber loss.\n\nParameters\n----------\ny : ndarray of shape (n_samples,)\n True labels.\n\nraw_predictions : ndarray of shape (n_samples, K)\n The raw predictions (i.e. values from the tree leaves) of the\n tree ensemble.\n\nsample_weight : ndarray of shape (n_samples,), default=None\n Sample weights.", + "docstring": "Compute the Huber loss.\n\n Parameters\n ----------\n y : ndarray of shape (n_samples,)\n True labels.\n\n raw_predictions : ndarray of shape (n_samples, K)\n The raw predictions (i.e. values from the tree leaves) of the\n tree ensemble.\n\n sample_weight : ndarray of shape (n_samples,), default=None\n Sample weights.\n ", "source_code": "\ndef __call__(self, y, raw_predictions, sample_weight=None):\n \"\"\"Compute the Huber loss.\n\n Parameters\n ----------\n y : ndarray of shape (n_samples,)\n True labels.\n\n raw_predictions : ndarray of shape (n_samples, K)\n The raw predictions (i.e. values from the tree leaves) of the\n tree ensemble.\n\n sample_weight : ndarray of shape (n_samples,), default=None\n Sample weights.\n \"\"\"\n raw_predictions = raw_predictions.ravel()\n diff = y - raw_predictions\n gamma = self.gamma\n if gamma is None:\n if sample_weight is None:\n gamma = np.percentile(np.abs(diff), self.alpha * 100)\n else:\n gamma = _weighted_percentile(np.abs(diff), sample_weight, self.alpha * 100)\n gamma_mask = np.abs(diff) <= gamma\n if sample_weight is None:\n sq_loss = np.sum(0.5 * diff[gamma_mask]**2)\n lin_loss = np.sum(gamma * (np.abs(diff[~gamma_mask]) - gamma / 2))\n loss = (sq_loss + lin_loss) / y.shape[0]\n else:\n sq_loss = np.sum(0.5 * sample_weight[gamma_mask] * diff[gamma_mask]**2)\n lin_loss = np.sum(gamma * sample_weight[~gamma_mask] * (np.abs(diff[~gamma_mask]) - gamma / 2))\n loss = (sq_loss + lin_loss) / sample_weight.sum()\n return loss" }, { @@ -63886,7 +67509,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "alpha", @@ -63896,13 +67520,14 @@ "docstring": { "type": "float, default=0.9", "description": "Percentile at which to extract score." 
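The Huber `__call__` above splits the residuals at a `gamma` threshold (the `alpha`-percentile of their absolute values): squared error inside the band, linear error outside. A minimal standalone sketch of the unweighted case, using only NumPy; the names `y`, `pred`, and `alpha` here are illustrative and not part of the diffed data:

import numpy as np

def huber_loss(y, pred, alpha=0.9):
    # gamma is the alpha-percentile of the absolute residuals
    diff = y - pred
    gamma = np.percentile(np.abs(diff), alpha * 100)
    inside = np.abs(diff) <= gamma
    # quadratic inside the band, linear outside, averaged over all samples
    sq_loss = np.sum(0.5 * diff[inside] ** 2)
    lin_loss = np.sum(gamma * (np.abs(diff[~inside]) - gamma / 2))
    return (sq_loss + lin_loss) / y.shape[0]

y = np.array([0.0, 1.0, 2.0, 10.0])
pred = np.array([0.1, 0.9, 2.5, 3.0])
print(huber_loss(y, pred))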
- } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, alpha=0.9):\n super().__init__()\n self.alpha = alpha\n self.gamma = None" }, { @@ -63920,7 +67545,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "tree", @@ -63930,7 +67556,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "terminal_regions", @@ -63940,7 +67567,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "leaf", @@ -63950,7 +67578,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -63960,7 +67589,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -63970,7 +67600,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "residual", @@ -63980,7 +67611,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "raw_predictions", @@ -63990,7 +67622,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -64000,13 +67633,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _update_terminal_region(self, tree, terminal_regions, leaf, X, y, residual, raw_predictions, sample_weight):\n terminal_region = np.where(terminal_regions == leaf)[0]\n sample_weight = sample_weight.take(terminal_region, axis=0)\n gamma = self.gamma\n diff = y.take(terminal_region, axis=0) - raw_predictions.take(terminal_region, axis=0)\n median = _weighted_percentile(diff, sample_weight, percentile=50)\n diff_minus_median = diff - median\n tree.value[leaf, 0] = median + np.mean(np.sign(diff_minus_median) * np.minimum(np.abs(diff_minus_median), gamma))" }, { @@ -64024,13 +67658,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef init_estimator(self):\n return DummyRegressor(strategy='quantile', quantile=0.5)" }, { @@ -64048,7 +67683,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -64058,7 +67694,8 @@ "docstring": { "type": "ndarray of shape (n_samples,)", "description": "The target labels." - } + }, + "refined_type": {} }, { "name": "raw_predictions", @@ -64068,7 +67705,8 @@ "docstring": { "type": "ndarray of shape (n_samples, K)", "description": "The raw predictions (i.e. values from the tree leaves) of the\ntree ensemble at iteration ``i - 1``." - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -64078,13 +67716,14 @@ "docstring": { "type": "ndarray of shape (n_samples,), default=None", "description": "Sample weights." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Compute the negative gradient.", - "docstring": "Compute the negative gradient.\n\nParameters\n----------\ny : ndarray of shape (n_samples,)\n The target labels.\n\nraw_predictions : ndarray of shape (n_samples, K)\n The raw predictions (i.e. 
values from the tree leaves) of the\n tree ensemble at iteration ``i - 1``.\n\nsample_weight : ndarray of shape (n_samples,), default=None\n Sample weights.", + "docstring": "Compute the negative gradient.\n\n Parameters\n ----------\n y : ndarray of shape (n_samples,)\n The target labels.\n\n raw_predictions : ndarray of shape (n_samples, K)\n The raw predictions (i.e. values from the tree leaves) of the\n tree ensemble at iteration ``i - 1``.\n\n sample_weight : ndarray of shape (n_samples,), default=None\n Sample weights.\n ", "source_code": "\ndef negative_gradient(self, y, raw_predictions, sample_weight=None, **kargs):\n \"\"\"Compute the negative gradient.\n\n Parameters\n ----------\n y : ndarray of shape (n_samples,)\n The target labels.\n\n raw_predictions : ndarray of shape (n_samples, K)\n The raw predictions (i.e. values from the tree leaves) of the\n tree ensemble at iteration ``i - 1``.\n\n sample_weight : ndarray of shape (n_samples,), default=None\n Sample weights.\n \"\"\"\n raw_predictions = raw_predictions.ravel()\n diff = y - raw_predictions\n if sample_weight is None:\n gamma = np.percentile(np.abs(diff), self.alpha * 100)\n else:\n gamma = _weighted_percentile(np.abs(diff), sample_weight, self.alpha * 100)\n gamma_mask = np.abs(diff) <= gamma\n residual = np.zeros((y.shape[0], ), dtype=np.float64)\n residual[gamma_mask] = diff[gamma_mask]\n residual[~gamma_mask] = gamma * np.sign(diff[~gamma_mask])\n self.gamma = gamma\n return residual" }, { @@ -64102,7 +67741,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -64112,7 +67752,8 @@ "docstring": { "type": "ndarray of shape (n_samples,)", "description": "True labels." - } + }, + "refined_type": {} }, { "name": "raw_predictions", @@ -64122,7 +67763,8 @@ "docstring": { "type": "ndarray of shape (n_samples, K)", "description": "The raw predictions (i.e. values from the tree leaves)." - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -64132,13 +67774,14 @@ "docstring": { "type": "ndarray of shape (n_samples,), default=None", "description": "Sample weights." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Compute the least absolute error.", - "docstring": "Compute the least absolute error.\n\nParameters\n----------\ny : ndarray of shape (n_samples,)\n True labels.\n\nraw_predictions : ndarray of shape (n_samples, K)\n The raw predictions (i.e. values from the tree leaves).\n\nsample_weight : ndarray of shape (n_samples,), default=None\n Sample weights.", + "docstring": "Compute the least absolute error.\n\n Parameters\n ----------\n y : ndarray of shape (n_samples,)\n True labels.\n\n raw_predictions : ndarray of shape (n_samples, K)\n The raw predictions (i.e. values from the tree leaves).\n\n sample_weight : ndarray of shape (n_samples,), default=None\n Sample weights.\n ", "source_code": "\ndef __call__(self, y, raw_predictions, sample_weight=None):\n \"\"\"Compute the least absolute error.\n\n Parameters\n ----------\n y : ndarray of shape (n_samples,)\n True labels.\n\n raw_predictions : ndarray of shape (n_samples, K)\n The raw predictions (i.e. 
values from the tree leaves).\n\n sample_weight : ndarray of shape (n_samples,), default=None\n Sample weights.\n \"\"\"\n if sample_weight is None:\n return np.abs(y - raw_predictions.ravel()).mean()\n else:\n return 1 / sample_weight.sum() * np.sum(sample_weight * np.abs(y - raw_predictions.ravel()))" }, { @@ -64156,7 +67799,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "tree", @@ -64166,7 +67810,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "terminal_regions", @@ -64176,7 +67821,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "leaf", @@ -64186,7 +67832,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -64196,7 +67843,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -64206,7 +67854,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "residual", @@ -64216,7 +67865,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "raw_predictions", @@ -64226,7 +67876,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -64236,7 +67887,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -64260,13 +67912,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef init_estimator(self):\n return DummyRegressor(strategy='quantile', quantile=0.5)" }, { @@ -64284,7 +67937,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -64294,7 +67948,8 @@ "docstring": { "type": "ndarray of shape (n_samples,)", "description": "The target labels." - } + }, + "refined_type": {} }, { "name": "raw_predictions", @@ -64304,13 +67959,14 @@ "docstring": { "type": "ndarray of shape (n_samples, K)", "description": "The raw predictions (i.e. values from the tree leaves) of the\ntree ensemble at iteration ``i - 1``." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Compute the negative gradient.\n\n1.0 if y - raw_predictions > 0.0 else -1.0", - "docstring": "Compute the negative gradient.\n\n1.0 if y - raw_predictions > 0.0 else -1.0\n\nParameters\n----------\ny : ndarray of shape (n_samples,)\n The target labels.\n\nraw_predictions : ndarray of shape (n_samples, K)\n The raw predictions (i.e. values from the tree leaves) of the\n tree ensemble at iteration ``i - 1``.", + "docstring": "Compute the negative gradient.\n\n 1.0 if y - raw_predictions > 0.0 else -1.0\n\n Parameters\n ----------\n y : ndarray of shape (n_samples,)\n The target labels.\n\n raw_predictions : ndarray of shape (n_samples, K)\n The raw predictions (i.e. values from the tree leaves) of the\n tree ensemble at iteration ``i - 1``.\n ", "source_code": "\ndef negative_gradient(self, y, raw_predictions, **kargs):\n \"\"\"Compute the negative gradient.\n\n 1.0 if y - raw_predictions > 0.0 else -1.0\n\n Parameters\n ----------\n y : ndarray of shape (n_samples,)\n The target labels.\n\n raw_predictions : ndarray of shape (n_samples, K)\n The raw predictions (i.e. 
values from the tree leaves) of the\n tree ensemble at iteration ``i - 1``.\n \"\"\"\n raw_predictions = raw_predictions.ravel()\n return 2 * (y - raw_predictions > 0) - 1" }, { @@ -64328,7 +67984,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -64338,7 +67995,8 @@ "docstring": { "type": "ndarray of shape (n_samples,)", "description": "True labels." - } + }, + "refined_type": {} }, { "name": "raw_predictions", @@ -64348,7 +68006,8 @@ "docstring": { "type": "ndarray of shape (n_samples, K)", "description": "The raw predictions (i.e. values from the tree leaves)." - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -64358,13 +68017,14 @@ "docstring": { "type": "ndarray of shape (n_samples,), default=None", "description": "Sample weights." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Compute the least squares loss.", - "docstring": "Compute the least squares loss.\n\nParameters\n----------\ny : ndarray of shape (n_samples,)\n True labels.\n\nraw_predictions : ndarray of shape (n_samples, K)\n The raw predictions (i.e. values from the tree leaves).\n\nsample_weight : ndarray of shape (n_samples,), default=None\n Sample weights.", + "docstring": "Compute the least squares loss.\n\n Parameters\n ----------\n y : ndarray of shape (n_samples,)\n True labels.\n\n raw_predictions : ndarray of shape (n_samples, K)\n The raw predictions (i.e. values from the tree leaves).\n\n sample_weight : ndarray of shape (n_samples,), default=None\n Sample weights.\n ", "source_code": "\ndef __call__(self, y, raw_predictions, sample_weight=None):\n \"\"\"Compute the least squares loss.\n\n Parameters\n ----------\n y : ndarray of shape (n_samples,)\n True labels.\n\n raw_predictions : ndarray of shape (n_samples, K)\n The raw predictions (i.e. 
values from the tree leaves).\n\n sample_weight : ndarray of shape (n_samples,), default=None\n Sample weights.\n \"\"\"\n if sample_weight is None:\n return np.mean((y - raw_predictions.ravel())**2)\n else:\n return 1 / sample_weight.sum() * np.sum(sample_weight * (y - raw_predictions.ravel())**2)" }, { @@ -64382,7 +68042,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "tree", @@ -64392,7 +68053,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "terminal_regions", @@ -64402,7 +68064,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "leaf", @@ -64412,7 +68075,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -64422,7 +68086,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -64432,7 +68097,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "residual", @@ -64442,7 +68108,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "raw_predictions", @@ -64452,7 +68119,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -64462,13 +68130,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _update_terminal_region(self, tree, terminal_regions, leaf, X, y, residual, raw_predictions, sample_weight):\n pass" }, { @@ -64486,13 +68155,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef init_estimator(self):\n return DummyRegressor(strategy='mean')" }, { @@ -64510,7 +68180,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -64520,7 +68191,8 @@ "docstring": { "type": "ndarray of shape (n_samples,)", "description": "The target labels." - } + }, + "refined_type": {} }, { "name": "raw_predictions", @@ -64530,13 +68202,14 @@ "docstring": { "type": "ndarray of shape (n_samples,)", "description": "The raw predictions (i.e. values from the tree leaves) of the\ntree ensemble at iteration ``i - 1``." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Compute half of the negative gradient.", - "docstring": "Compute half of the negative gradient.\n\nParameters\n----------\ny : ndarray of shape (n_samples,)\n The target labels.\n\nraw_predictions : ndarray of shape (n_samples,)\n The raw predictions (i.e. values from the tree leaves) of the\n tree ensemble at iteration ``i - 1``.", + "docstring": "Compute half of the negative gradient.\n\n Parameters\n ----------\n y : ndarray of shape (n_samples,)\n The target labels.\n\n raw_predictions : ndarray of shape (n_samples,)\n The raw predictions (i.e. values from the tree leaves) of the\n tree ensemble at iteration ``i - 1``.\n ", "source_code": "\ndef negative_gradient(self, y, raw_predictions, **kargs):\n \"\"\"Compute half of the negative gradient.\n\n Parameters\n ----------\n y : ndarray of shape (n_samples,)\n The target labels.\n\n raw_predictions : ndarray of shape (n_samples,)\n The raw predictions (i.e. 
values from the tree leaves) of the\n tree ensemble at iteration ``i - 1``.\n \"\"\"\n return y - raw_predictions.ravel()" }, { @@ -64554,7 +68227,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "tree", @@ -64564,7 +68238,8 @@ "docstring": { "type": "tree.Tree", "description": "The tree object." - } + }, + "refined_type": {} }, { "name": "X", @@ -64574,7 +68249,8 @@ "docstring": { "type": "ndarray of shape (n_samples, n_features)", "description": "The data array." - } + }, + "refined_type": {} }, { "name": "y", @@ -64584,7 +68260,8 @@ "docstring": { "type": "ndarray of shape (n_samples,)", "description": "The target labels." - } + }, + "refined_type": {} }, { "name": "residual", @@ -64594,7 +68271,8 @@ "docstring": { "type": "ndarray of shape (n_samples,)", "description": "The residuals (usually the negative gradient)." - } + }, + "refined_type": {} }, { "name": "raw_predictions", @@ -64604,7 +68282,8 @@ "docstring": { "type": "ndarray of shape (n_samples, K)", "description": "The raw predictions (i.e. values from the tree leaves) of the\ntree ensemble at iteration ``i - 1``." - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -64614,7 +68293,8 @@ "docstring": { "type": "ndarray of shape (n,)", "description": "The weight of each sample." - } + }, + "refined_type": {} }, { "name": "sample_mask", @@ -64624,7 +68304,8 @@ "docstring": { "type": "ndarray of shape (n,)", "description": "The sample mask to be used." - } + }, + "refined_type": {} }, { "name": "learning_rate", @@ -64634,7 +68315,8 @@ "docstring": { "type": "float, default=0.1", "description": "Learning rate shrinks the contribution of each tree by\n ``learning_rate``." - } + }, + "refined_type": {} }, { "name": "k", @@ -64644,13 +68326,14 @@ "docstring": { "type": "int, default=0", "description": "The index of the estimator being updated." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Least squares does not need to update terminal regions.\n\nBut it has to update the predictions.", - "docstring": "Least squares does not need to update terminal regions.\n\nBut it has to update the predictions.\n\nParameters\n----------\ntree : tree.Tree\n The tree object.\nX : ndarray of shape (n_samples, n_features)\n The data array.\ny : ndarray of shape (n_samples,)\n The target labels.\nresidual : ndarray of shape (n_samples,)\n The residuals (usually the negative gradient).\nraw_predictions : ndarray of shape (n_samples, K)\n The raw predictions (i.e. values from the tree leaves) of the\n tree ensemble at iteration ``i - 1``.\nsample_weight : ndarray of shape (n,)\n The weight of each sample.\nsample_mask : ndarray of shape (n,)\n The sample mask to be used.\nlearning_rate : float, default=0.1\n Learning rate shrinks the contribution of each tree by\n ``learning_rate``.\nk : int, default=0\n The index of the estimator being updated.", + "docstring": "Least squares does not need to update terminal regions.\n\n But it has to update the predictions.\n\n Parameters\n ----------\n tree : tree.Tree\n The tree object.\n X : ndarray of shape (n_samples, n_features)\n The data array.\n y : ndarray of shape (n_samples,)\n The target labels.\n residual : ndarray of shape (n_samples,)\n The residuals (usually the negative gradient).\n raw_predictions : ndarray of shape (n_samples, K)\n The raw predictions (i.e. 
values from the tree leaves) of the\n tree ensemble at iteration ``i - 1``.\n sample_weight : ndarray of shape (n,)\n The weight of each sample.\n sample_mask : ndarray of shape (n,)\n The sample mask to be used.\n learning_rate : float, default=0.1\n Learning rate shrinks the contribution of each tree by\n ``learning_rate``.\n k : int, default=0\n The index of the estimator being updated.\n ", "source_code": "\ndef update_terminal_regions(self, tree, X, y, residual, raw_predictions, sample_weight, sample_mask, learning_rate=0.1, k=0):\n \"\"\"Least squares does not need to update terminal regions.\n\n But it has to update the predictions.\n\n Parameters\n ----------\n tree : tree.Tree\n The tree object.\n X : ndarray of shape (n_samples, n_features)\n The data array.\n y : ndarray of shape (n_samples,)\n The target labels.\n residual : ndarray of shape (n_samples,)\n The residuals (usually the negative gradient).\n raw_predictions : ndarray of shape (n_samples, K)\n The raw predictions (i.e. values from the tree leaves) of the\n tree ensemble at iteration ``i - 1``.\n sample_weight : ndarray of shape (n,)\n The weight of each sample.\n sample_mask : ndarray of shape (n,)\n The sample mask to be used.\n learning_rate : float, default=0.1\n Learning rate shrinks the contribution of each tree by\n ``learning_rate``.\n k : int, default=0\n The index of the estimator being updated.\n \"\"\"\n raw_predictions[:, k] += learning_rate * tree.predict(X).ravel()" }, { @@ -64668,7 +68351,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -64678,7 +68362,8 @@ "docstring": { "type": "ndarray of shape (n_samples,)", "description": "True labels." - } + }, + "refined_type": {} }, { "name": "raw_predictions", @@ -64688,7 +68373,8 @@ "docstring": { "type": "ndarray of shape (n_samples, K)", "description": "The raw predictions (i.e. values from the tree leaves)." - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -64698,13 +68384,14 @@ "docstring": { "type": "ndarray of shape (n_samples,), default=None", "description": "Sample weights." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Compute the loss.", - "docstring": "Compute the loss.\n\nParameters\n----------\ny : ndarray of shape (n_samples,)\n True labels.\n\nraw_predictions : ndarray of shape (n_samples, K)\n The raw predictions (i.e. values from the tree leaves).\n\nsample_weight : ndarray of shape (n_samples,), default=None\n Sample weights.", + "docstring": "Compute the loss.\n\n Parameters\n ----------\n y : ndarray of shape (n_samples,)\n True labels.\n\n raw_predictions : ndarray of shape (n_samples, K)\n The raw predictions (i.e. values from the tree leaves).\n\n sample_weight : ndarray of shape (n_samples,), default=None\n Sample weights.\n ", "source_code": "\n@abstractmethod\ndef __call__(self, y, raw_predictions, sample_weight=None):\n \"\"\"Compute the loss.\n\n Parameters\n ----------\n y : ndarray of shape (n_samples,)\n True labels.\n\n raw_predictions : ndarray of shape (n_samples, K)\n The raw predictions (i.e. values from the tree leaves).\n\n sample_weight : ndarray of shape (n_samples,), default=None\n Sample weights.\n \"\"\"\n " }, { @@ -64722,7 +68409,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_classes", @@ -64732,13 +68420,14 @@ "docstring": { "type": "int", "description": "Number of classes." 
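As the least-squares entries above note, this loss skips the per-leaf correction: each boosting iteration fits a tree to the residual (the negative gradient) and adds its shrunken prediction to the running raw predictions. A rough sketch of that loop under those assumptions, on an illustrative synthetic dataset:

import numpy as np
from sklearn.tree import DecisionTreeRegressor

rng = np.random.RandomState(0)
X = rng.uniform(size=(200, 3))
y = X[:, 0] - 2 * X[:, 1] + rng.normal(scale=0.1, size=200)

learning_rate = 0.1
raw_predictions = np.full_like(y, y.mean())  # mean init, as a 'mean'-strategy dummy regressor would give

for _ in range(50):
    residual = y - raw_predictions          # negative gradient of squared error
    tree = DecisionTreeRegressor(max_depth=3, random_state=0)
    tree.fit(X, residual)
    # no terminal-region update needed; just add the shrunken tree prediction
    raw_predictions += learning_rate * tree.predict(X)

print(np.mean((y - raw_predictions) ** 2))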
- } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, n_classes):\n self.K = n_classes" }, { @@ -64756,7 +68445,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "tree", @@ -64766,7 +68456,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "terminal_regions", @@ -64776,7 +68467,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "leaf", @@ -64786,7 +68478,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -64796,7 +68489,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -64806,7 +68500,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "residual", @@ -64816,7 +68511,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "raw_predictions", @@ -64826,7 +68522,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -64836,7 +68533,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -64860,7 +68558,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -64870,7 +68569,8 @@ "docstring": { "type": "ndarray of shape (n_samples, n_features)", "description": "The data array." - } + }, + "refined_type": {} }, { "name": "estimator", @@ -64880,13 +68580,14 @@ "docstring": { "type": "object", "description": "The estimator to use to compute the predictions." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Return the initial raw predictions.", - "docstring": "Return the initial raw predictions.\n\nParameters\n----------\nX : ndarray of shape (n_samples, n_features)\n The data array.\nestimator : object\n The estimator to use to compute the predictions.\n\nReturns\n-------\nraw_predictions : ndarray of shape (n_samples, K)\n The initial raw predictions. K is equal to 1 for binary\n classification and regression, and equal to the number of classes\n for multiclass classification. ``raw_predictions`` is casted\n into float64.", + "docstring": "Return the initial raw predictions.\n\n Parameters\n ----------\n X : ndarray of shape (n_samples, n_features)\n The data array.\n estimator : object\n The estimator to use to compute the predictions.\n\n Returns\n -------\n raw_predictions : ndarray of shape (n_samples, K)\n The initial raw predictions. K is equal to 1 for binary\n classification and regression, and equal to the number of classes\n for multiclass classification. ``raw_predictions`` is casted\n into float64.\n ", "source_code": "\n@abstractmethod\ndef get_init_raw_predictions(self, X, estimator):\n \"\"\"Return the initial raw predictions.\n\n Parameters\n ----------\n X : ndarray of shape (n_samples, n_features)\n The data array.\n estimator : object\n The estimator to use to compute the predictions.\n\n Returns\n -------\n raw_predictions : ndarray of shape (n_samples, K)\n The initial raw predictions. K is equal to 1 for binary\n classification and regression, and equal to the number of classes\n for multiclass classification. 
``raw_predictions`` is casted\n into float64.\n \"\"\"\n pass" }, { @@ -64904,7 +68605,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -64928,7 +68630,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -64938,7 +68641,8 @@ "docstring": { "type": "ndarray of shape (n_samples,)", "description": "The target labels." - } + }, + "refined_type": {} }, { "name": "raw_predictions", @@ -64948,13 +68652,14 @@ "docstring": { "type": "ndarray of shape (n_samples, K)", "description": "The raw predictions (i.e. values from the tree leaves) of the\ntree ensemble at iteration ``i - 1``." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Compute the negative gradient.", - "docstring": "Compute the negative gradient.\n\nParameters\n----------\ny : ndarray of shape (n_samples,)\n The target labels.\n\nraw_predictions : ndarray of shape (n_samples, K)\n The raw predictions (i.e. values from the tree leaves) of the\n tree ensemble at iteration ``i - 1``.", + "docstring": "Compute the negative gradient.\n\n Parameters\n ----------\n y : ndarray of shape (n_samples,)\n The target labels.\n\n raw_predictions : ndarray of shape (n_samples, K)\n The raw predictions (i.e. values from the tree leaves) of the\n tree ensemble at iteration ``i - 1``.\n ", "source_code": "\n@abstractmethod\ndef negative_gradient(self, y, raw_predictions, **kargs):\n \"\"\"Compute the negative gradient.\n\n Parameters\n ----------\n y : ndarray of shape (n_samples,)\n The target labels.\n\n raw_predictions : ndarray of shape (n_samples, K)\n The raw predictions (i.e. values from the tree leaves) of the\n tree ensemble at iteration ``i - 1``.\n \"\"\"\n " }, { @@ -64972,7 +68677,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "tree", @@ -64982,7 +68688,8 @@ "docstring": { "type": "tree.Tree", "description": "The tree object." - } + }, + "refined_type": {} }, { "name": "X", @@ -64992,7 +68699,8 @@ "docstring": { "type": "ndarray of shape (n_samples, n_features)", "description": "The data array." - } + }, + "refined_type": {} }, { "name": "y", @@ -65002,7 +68710,8 @@ "docstring": { "type": "ndarray of shape (n_samples,)", "description": "The target labels." - } + }, + "refined_type": {} }, { "name": "residual", @@ -65012,7 +68721,8 @@ "docstring": { "type": "ndarray of shape (n_samples,)", "description": "The residuals (usually the negative gradient)." - } + }, + "refined_type": {} }, { "name": "raw_predictions", @@ -65022,7 +68732,8 @@ "docstring": { "type": "ndarray of shape (n_samples, K)", "description": "The raw predictions (i.e. values from the tree leaves) of the\ntree ensemble at iteration ``i - 1``." - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -65032,7 +68743,8 @@ "docstring": { "type": "ndarray of shape (n_samples,)", "description": "The weight of each sample." - } + }, + "refined_type": {} }, { "name": "sample_mask", @@ -65042,7 +68754,8 @@ "docstring": { "type": "ndarray of shape (n_samples,)", "description": "The sample mask to be used." - } + }, + "refined_type": {} }, { "name": "learning_rate", @@ -65052,7 +68765,8 @@ "docstring": { "type": "float, default=0.1", "description": "Learning rate shrinks the contribution of each tree by\n ``learning_rate``." - } + }, + "refined_type": {} }, { "name": "k", @@ -65062,13 +68776,14 @@ "docstring": { "type": "int, default=0", "description": "The index of the estimator being updated." 
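For the binary classification losses earlier in this hunk, `get_init_raw_predictions` converts the init estimator's positive-class probabilities into clipped log-odds. A small sketch of just that conversion, assuming the probabilities are already available; the function name and sample values are made up for illustration:

import numpy as np

def init_raw_predictions(proba_pos_class):
    # clip away exact 0/1 so the log-odds stay finite
    eps = np.finfo(np.float32).eps
    p = np.clip(proba_pos_class, eps, 1 - eps)
    # log-odds as a single float64 column (K = 1 for the binary case)
    return np.log(p / (1 - p)).reshape(-1, 1).astype(np.float64)

print(init_raw_predictions(np.array([0.0, 0.25, 0.5, 0.99])))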
- } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Update the terminal regions (=leaves) of the given tree and updates the current predictions of the model. Traverses tree and invokes template method `_update_terminal_region`.", - "docstring": "Update the terminal regions (=leaves) of the given tree and\nupdates the current predictions of the model. Traverses tree\nand invokes template method `_update_terminal_region`.\n\nParameters\n----------\ntree : tree.Tree\n The tree object.\nX : ndarray of shape (n_samples, n_features)\n The data array.\ny : ndarray of shape (n_samples,)\n The target labels.\nresidual : ndarray of shape (n_samples,)\n The residuals (usually the negative gradient).\nraw_predictions : ndarray of shape (n_samples, K)\n The raw predictions (i.e. values from the tree leaves) of the\n tree ensemble at iteration ``i - 1``.\nsample_weight : ndarray of shape (n_samples,)\n The weight of each sample.\nsample_mask : ndarray of shape (n_samples,)\n The sample mask to be used.\nlearning_rate : float, default=0.1\n Learning rate shrinks the contribution of each tree by\n ``learning_rate``.\nk : int, default=0\n The index of the estimator being updated.", + "description": "Update the terminal regions (=leaves) of the given tree and\nupdates the current predictions of the model. Traverses tree\nand invokes template method `_update_terminal_region`.", + "docstring": "Update the terminal regions (=leaves) of the given tree and\n updates the current predictions of the model. Traverses tree\n and invokes template method `_update_terminal_region`.\n\n Parameters\n ----------\n tree : tree.Tree\n The tree object.\n X : ndarray of shape (n_samples, n_features)\n The data array.\n y : ndarray of shape (n_samples,)\n The target labels.\n residual : ndarray of shape (n_samples,)\n The residuals (usually the negative gradient).\n raw_predictions : ndarray of shape (n_samples, K)\n The raw predictions (i.e. values from the tree leaves) of the\n tree ensemble at iteration ``i - 1``.\n sample_weight : ndarray of shape (n_samples,)\n The weight of each sample.\n sample_mask : ndarray of shape (n_samples,)\n The sample mask to be used.\n learning_rate : float, default=0.1\n Learning rate shrinks the contribution of each tree by\n ``learning_rate``.\n k : int, default=0\n The index of the estimator being updated.\n\n ", "source_code": "\ndef update_terminal_regions(self, tree, X, y, residual, raw_predictions, sample_weight, sample_mask, learning_rate=0.1, k=0):\n \"\"\"Update the terminal regions (=leaves) of the given tree and\n updates the current predictions of the model. Traverses tree\n and invokes template method `_update_terminal_region`.\n\n Parameters\n ----------\n tree : tree.Tree\n The tree object.\n X : ndarray of shape (n_samples, n_features)\n The data array.\n y : ndarray of shape (n_samples,)\n The target labels.\n residual : ndarray of shape (n_samples,)\n The residuals (usually the negative gradient).\n raw_predictions : ndarray of shape (n_samples, K)\n The raw predictions (i.e. 
values from the tree leaves) of the\n tree ensemble at iteration ``i - 1``.\n sample_weight : ndarray of shape (n_samples,)\n The weight of each sample.\n sample_mask : ndarray of shape (n_samples,)\n The sample mask to be used.\n learning_rate : float, default=0.1\n Learning rate shrinks the contribution of each tree by\n ``learning_rate``.\n k : int, default=0\n The index of the estimator being updated.\n\n \"\"\"\n terminal_regions = tree.apply(X)\n masked_terminal_regions = terminal_regions.copy()\n masked_terminal_regions[~sample_mask] = -1\n for leaf in np.where(tree.children_left == TREE_LEAF)[0]:\n self._update_terminal_region(tree, masked_terminal_regions, leaf, X, y, residual, raw_predictions[:, k], sample_weight)\n raw_predictions[:, k] += learning_rate * tree.value[:, 0, 0].take(terminal_regions, axis=0)" }, { @@ -65086,7 +68801,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -65096,7 +68812,8 @@ "docstring": { "type": "ndarray of shape (n_samples,)", "description": "True labels." - } + }, + "refined_type": {} }, { "name": "raw_predictions", @@ -65106,7 +68823,8 @@ "docstring": { "type": "ndarray of shape (n_samples, K)", "description": "The raw predictions (i.e. values from the tree leaves) of the\ntree ensemble." - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -65116,13 +68834,14 @@ "docstring": { "type": "ndarray of shape (n_samples,), default=None", "description": "Sample weights." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Compute the Multinomial deviance.", - "docstring": "Compute the Multinomial deviance.\n\nParameters\n----------\ny : ndarray of shape (n_samples,)\n True labels.\n\nraw_predictions : ndarray of shape (n_samples, K)\n The raw predictions (i.e. values from the tree leaves) of the\n tree ensemble.\n\nsample_weight : ndarray of shape (n_samples,), default=None\n Sample weights.", + "docstring": "Compute the Multinomial deviance.\n\n Parameters\n ----------\n y : ndarray of shape (n_samples,)\n True labels.\n\n raw_predictions : ndarray of shape (n_samples, K)\n The raw predictions (i.e. values from the tree leaves) of the\n tree ensemble.\n\n sample_weight : ndarray of shape (n_samples,), default=None\n Sample weights.\n ", "source_code": "\ndef __call__(self, y, raw_predictions, sample_weight=None):\n \"\"\"Compute the Multinomial deviance.\n\n Parameters\n ----------\n y : ndarray of shape (n_samples,)\n True labels.\n\n raw_predictions : ndarray of shape (n_samples, K)\n The raw predictions (i.e. values from the tree leaves) of the\n tree ensemble.\n\n sample_weight : ndarray of shape (n_samples,), default=None\n Sample weights.\n \"\"\"\n Y = np.zeros((y.shape[0], self.K), dtype=np.float64)\n for k in range(self.K):\n Y[:, k] = y == k\n return np.average(-1 * (Y * raw_predictions).sum(axis=1) + logsumexp(raw_predictions, axis=1), weights=sample_weight)" }, { @@ -65140,7 +68859,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_classes", @@ -65150,13 +68870,14 @@ "docstring": { "type": "int", "description": "Number of classes." 
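The multinomial deviance `__call__` just above is the average negative log softmax probability of the true class, computed via `logsumexp` for numerical stability. A hedged sketch of the unweighted quantity; the label and score arrays are made up for illustration:

import numpy as np
from scipy.special import logsumexp

def multinomial_deviance(y, raw_predictions):
    # y holds integer class labels; raw_predictions has shape (n_samples, K)
    n_samples, K = raw_predictions.shape
    Y = np.zeros((n_samples, K))
    Y[np.arange(n_samples), y] = 1.0
    # -log softmax probability of the true class, averaged over samples
    return np.mean(-(Y * raw_predictions).sum(axis=1) + logsumexp(raw_predictions, axis=1))

y = np.array([0, 2, 1, 2])
raw = np.array([[2.0, 0.1, -1.0],
                [0.0, 0.5, 1.5],
                [1.0, 2.0, 0.0],
                [-0.5, 0.0, 3.0]])
print(multinomial_deviance(y, raw))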
- } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, n_classes):\n if n_classes < 3:\n raise ValueError('{0:s} requires more than 2 classes.'.format(self.__class__.__name__))\n super().__init__(n_classes)" }, { @@ -65174,7 +68895,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "raw_predictions", @@ -65184,13 +68906,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _raw_prediction_to_decision(self, raw_predictions):\n proba = self._raw_prediction_to_proba(raw_predictions)\n return np.argmax(proba, axis=1)" }, { @@ -65208,7 +68931,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "raw_predictions", @@ -65218,13 +68942,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _raw_prediction_to_proba(self, raw_predictions):\n return np.nan_to_num(np.exp(raw_predictions - logsumexp(raw_predictions, axis=1)[:, np.newaxis]))" }, { @@ -65242,7 +68967,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "tree", @@ -65252,7 +68978,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "terminal_regions", @@ -65262,7 +68989,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "leaf", @@ -65272,7 +69000,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -65282,7 +69011,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -65292,7 +69022,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "residual", @@ -65302,7 +69033,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "raw_predictions", @@ -65312,7 +69044,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -65322,7 +69055,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -65346,7 +69080,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -65356,7 +69091,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "estimator", @@ -65366,13 +69102,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef get_init_raw_predictions(self, X, estimator):\n probas = estimator.predict_proba(X)\n eps = np.finfo(np.float32).eps\n probas = np.clip(probas, eps, 1 - eps)\n raw_predictions = np.log(probas).astype(np.float64)\n return raw_predictions" }, { @@ -65390,13 +69127,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef init_estimator(self):\n return DummyClassifier(strategy='prior')" }, { @@ -65414,7 +69152,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": 
"y", @@ -65424,7 +69163,8 @@ "docstring": { "type": "ndarray of shape (n_samples,)", "description": "The target labels." - } + }, + "refined_type": {} }, { "name": "raw_predictions", @@ -65434,7 +69174,8 @@ "docstring": { "type": "ndarray of shape (n_samples, K)", "description": "The raw predictions (i.e. values from the tree leaves) of the\ntree ensemble at iteration ``i - 1``." - } + }, + "refined_type": {} }, { "name": "k", @@ -65444,13 +69185,14 @@ "docstring": { "type": "int, default=0", "description": "The index of the class." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Compute negative gradient for the ``k``-th class.", - "docstring": "Compute negative gradient for the ``k``-th class.\n\nParameters\n----------\ny : ndarray of shape (n_samples,)\n The target labels.\n\nraw_predictions : ndarray of shape (n_samples, K)\n The raw predictions (i.e. values from the tree leaves) of the\n tree ensemble at iteration ``i - 1``.\n\nk : int, default=0\n The index of the class.", + "docstring": "Compute negative gradient for the ``k``-th class.\n\n Parameters\n ----------\n y : ndarray of shape (n_samples,)\n The target labels.\n\n raw_predictions : ndarray of shape (n_samples, K)\n The raw predictions (i.e. values from the tree leaves) of the\n tree ensemble at iteration ``i - 1``.\n\n k : int, default=0\n The index of the class.\n ", "source_code": "\ndef negative_gradient(self, y, raw_predictions, k=0, **kwargs):\n \"\"\"Compute negative gradient for the ``k``-th class.\n\n Parameters\n ----------\n y : ndarray of shape (n_samples,)\n The target labels.\n\n raw_predictions : ndarray of shape (n_samples, K)\n The raw predictions (i.e. values from the tree leaves) of the\n tree ensemble at iteration ``i - 1``.\n\n k : int, default=0\n The index of the class.\n \"\"\"\n return y - np.nan_to_num(np.exp(raw_predictions[:, k] - logsumexp(raw_predictions, axis=1)))" }, { @@ -65468,7 +69210,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -65478,7 +69221,8 @@ "docstring": { "type": "ndarray of shape (n_samples,)", "description": "True labels." - } + }, + "refined_type": {} }, { "name": "raw_predictions", @@ -65488,7 +69232,8 @@ "docstring": { "type": "ndarray of shape (n_samples, K)", "description": "The raw predictions (i.e. values from the tree leaves) of the\ntree ensemble." - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -65498,13 +69243,14 @@ "docstring": { "type": "ndarray of shape (n_samples,), default=None", "description": "Sample weights." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Compute the Quantile loss.", - "docstring": "Compute the Quantile loss.\n\nParameters\n----------\ny : ndarray of shape (n_samples,)\n True labels.\n\nraw_predictions : ndarray of shape (n_samples, K)\n The raw predictions (i.e. values from the tree leaves) of the\n tree ensemble.\n\nsample_weight : ndarray of shape (n_samples,), default=None\n Sample weights.", + "docstring": "Compute the Quantile loss.\n\n Parameters\n ----------\n y : ndarray of shape (n_samples,)\n True labels.\n\n raw_predictions : ndarray of shape (n_samples, K)\n The raw predictions (i.e. 
values from the tree leaves) of the\n tree ensemble.\n\n sample_weight : ndarray of shape (n_samples,), default=None\n Sample weights.\n ", "source_code": "\ndef __call__(self, y, raw_predictions, sample_weight=None):\n \"\"\"Compute the Quantile loss.\n\n Parameters\n ----------\n y : ndarray of shape (n_samples,)\n True labels.\n\n raw_predictions : ndarray of shape (n_samples, K)\n The raw predictions (i.e. values from the tree leaves) of the\n tree ensemble.\n\n sample_weight : ndarray of shape (n_samples,), default=None\n Sample weights.\n \"\"\"\n raw_predictions = raw_predictions.ravel()\n diff = y - raw_predictions\n alpha = self.alpha\n mask = y > raw_predictions\n if sample_weight is None:\n loss = (alpha * diff[mask].sum() - (1 - alpha) * diff[~mask].sum()) / y.shape[0]\n else:\n loss = (alpha * np.sum(sample_weight[mask] * diff[mask]) - (1 - alpha) * np.sum(sample_weight[~mask] * diff[~mask])) / sample_weight.sum()\n return loss" }, { @@ -65522,7 +69268,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "alpha", @@ -65532,13 +69279,14 @@ "docstring": { "type": "float, default=0.9", "description": "The percentile." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, alpha=0.9):\n super().__init__()\n self.alpha = alpha\n self.percentile = alpha * 100" }, { @@ -65556,7 +69304,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "tree", @@ -65566,7 +69315,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "terminal_regions", @@ -65576,7 +69326,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "leaf", @@ -65586,7 +69337,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -65596,7 +69348,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -65606,7 +69359,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "residual", @@ -65616,7 +69370,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "raw_predictions", @@ -65626,7 +69381,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -65636,13 +69392,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _update_terminal_region(self, tree, terminal_regions, leaf, X, y, residual, raw_predictions, sample_weight):\n terminal_region = np.where(terminal_regions == leaf)[0]\n diff = y.take(terminal_region, axis=0) - raw_predictions.take(terminal_region, axis=0)\n sample_weight = sample_weight.take(terminal_region, axis=0)\n val = _weighted_percentile(diff, sample_weight, self.percentile)\n tree.value[leaf, 0] = val" }, { @@ -65660,13 +69417,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef init_estimator(self):\n return DummyRegressor(strategy='quantile', quantile=self.alpha)" }, { @@ -65684,7 +69442,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -65694,7 +69453,8 @@ "docstring": { "type": 
"ndarray of shape (n_samples,)", "description": "The target labels." - } + }, + "refined_type": {} }, { "name": "raw_predictions", @@ -65704,13 +69464,14 @@ "docstring": { "type": "ndarray of shape (n_samples, K)", "description": "The raw predictions (i.e. values from the tree leaves) of the\ntree ensemble at iteration ``i - 1``." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Compute the negative gradient.", - "docstring": "Compute the negative gradient.\n\nParameters\n----------\ny : ndarray of shape (n_samples,)\n The target labels.\n\nraw_predictions : ndarray of shape (n_samples, K)\n The raw predictions (i.e. values from the tree leaves) of the\n tree ensemble at iteration ``i - 1``.", + "docstring": "Compute the negative gradient.\n\n Parameters\n ----------\n y : ndarray of shape (n_samples,)\n The target labels.\n\n raw_predictions : ndarray of shape (n_samples, K)\n The raw predictions (i.e. values from the tree leaves) of the\n tree ensemble at iteration ``i - 1``.\n ", "source_code": "\ndef negative_gradient(self, y, raw_predictions, **kargs):\n \"\"\"Compute the negative gradient.\n\n Parameters\n ----------\n y : ndarray of shape (n_samples,)\n The target labels.\n\n raw_predictions : ndarray of shape (n_samples, K)\n The raw predictions (i.e. values from the tree leaves) of the\n tree ensemble at iteration ``i - 1``.\n \"\"\"\n alpha = self.alpha\n raw_predictions = raw_predictions.ravel()\n mask = y > raw_predictions\n return alpha * mask - (1 - alpha) * ~mask" }, { @@ -65728,13 +69489,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self):\n super().__init__(n_classes=1)" }, { @@ -65752,7 +69514,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "estimator", @@ -65762,13 +69525,14 @@ "docstring": { "type": "object", "description": "The init estimator to check." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Make sure estimator has the required fit and predict methods.", - "docstring": "Make sure estimator has the required fit and predict methods.\n\nParameters\n----------\nestimator : object\n The init estimator to check.", + "docstring": "Make sure estimator has the required fit and predict methods.\n\n Parameters\n ----------\n estimator : object\n The init estimator to check.\n ", "source_code": "\ndef check_init_estimator(self, estimator):\n \"\"\"Make sure estimator has the required fit and predict methods.\n\n Parameters\n ----------\n estimator : object\n The init estimator to check.\n \"\"\"\n if not (hasattr(estimator, 'fit') and hasattr(estimator, 'predict')):\n raise ValueError('The init parameter must be a valid estimator and support both fit and predict.')" }, { @@ -65786,7 +69550,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -65796,7 +69561,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "estimator", @@ -65806,13 +69572,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef get_init_raw_predictions(self, X, estimator):\n predictions = estimator.predict(X)\n return predictions.reshape(-1, 1).astype(np.float64)" }, { @@ -65830,7 +69597,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_bins", @@ -65840,7 +69608,8 @@ "docstring": { "type": "int, default=256", "description": "The maximum number of bins to use (including the bin for missing\nvalues). Should be in [3, 256]. Non-missing values are binned on\n``max_bins = n_bins - 1`` bins. The last bin is always reserved for\nmissing values. If for a given feature the number of unique values is\nless than ``max_bins``, then those unique values will be used to\ncompute the bin thresholds, instead of the quantiles. For categorical\nfeatures indicated by ``is_categorical``, the docstring for\n``is_categorical`` details on this procedure." - } + }, + "refined_type": {} }, { "name": "subsample", @@ -65850,7 +69619,8 @@ "docstring": { "type": "int or None, default=2e5", "description": "If ``n_samples > subsample``, then ``sub_samples`` samples will be\nrandomly chosen to compute the quantiles. If ``None``, the whole data\nis used." - } + }, + "refined_type": {} }, { "name": "is_categorical", @@ -65860,7 +69630,8 @@ "docstring": { "type": "ndarray of bool of shape (n_features,), default=None", "description": "Indicates categorical features. By default, all features are\nconsidered continuous." - } + }, + "refined_type": {} }, { "name": "known_categories", @@ -65870,6 +69641,10 @@ "docstring": { "type": "list of {ndarray, None} of shape (n_features,), default=none", "description": "For each categorical feature, the array indicates the set of unique\ncategorical values. These should be the possible values over all the\ndata, not just the training data. For continuous features, the\ncorresponding entry should be None." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -65880,7 +69655,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_threads", @@ -65890,13 +69666,14 @@ "docstring": { "type": "int, default=None", "description": "Number of OpenMP threads to use. 
`_openmp_effective_n_threads` is called\nto determine the effective number of threads use, which takes cgroups CPU\nquotes into account. See the docstring of `_openmp_effective_n_threads`\nfor details." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, n_bins=256, subsample=int(200000.0), is_categorical=None, known_categories=None, random_state=None, n_threads=None):\n self.n_bins = n_bins\n self.subsample = subsample\n self.is_categorical = is_categorical\n self.known_categories = known_categories\n self.random_state = random_state\n self.n_threads = n_threads" }, { @@ -65914,7 +69691,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -65924,7 +69702,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "The data to bin." - } + }, + "refined_type": {} }, { "name": "y", @@ -65934,13 +69713,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Fit data X by computing the binning thresholds.\n\nThe last bin is reserved for missing values, whether missing values are present in the data or not.", - "docstring": "Fit data X by computing the binning thresholds.\n\nThe last bin is reserved for missing values, whether missing values\nare present in the data or not.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n The data to bin.\ny: None\n Ignored.\n\nReturns\n-------\nself : object", + "description": "Fit data X by computing the binning thresholds.\n\nThe last bin is reserved for missing values, whether missing values\nare present in the data or not.", + "docstring": "Fit data X by computing the binning thresholds.\n\n The last bin is reserved for missing values, whether missing values\n are present in the data or not.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The data to bin.\n y: None\n Ignored.\n\n Returns\n -------\n self : object\n ", "source_code": "\ndef fit(self, X, y=None):\n \"\"\"Fit data X by computing the binning thresholds.\n\n The last bin is reserved for missing values, whether missing values\n are present in the data or not.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The data to bin.\n y: None\n Ignored.\n\n Returns\n -------\n self : object\n \"\"\"\n if not 3 <= self.n_bins <= 256:\n raise ValueError('n_bins={} should be no smaller than 3 and no larger than 256.'.format(self.n_bins))\n X = check_array(X, dtype=[X_DTYPE], force_all_finite=False)\n max_bins = self.n_bins - 1\n rng = check_random_state(self.random_state)\n if self.subsample is not None and X.shape[0] > self.subsample:\n subset = rng.choice(X.shape[0], self.subsample, replace=False)\n X = X.take(subset, axis=0)\n if self.is_categorical is None:\n self.is_categorical_ = np.zeros(X.shape[1], dtype=np.uint8)\n else:\n self.is_categorical_ = np.asarray(self.is_categorical, dtype=np.uint8)\n n_features = X.shape[1]\n known_categories = self.known_categories\n if known_categories is None:\n known_categories = [None] * n_features\n for f_idx in range(n_features):\n is_categorical = self.is_categorical_[f_idx]\n known_cats = known_categories[f_idx]\n if is_categorical and known_cats is None:\n raise ValueError(f'Known categories for feature {f_idx} must be provided.')\n if not is_categorical and known_cats is not None:\n raise 
ValueError(f\"Feature {f_idx} isn't marked as a categorical feature, but categories were passed.\")\n self.missing_values_bin_idx_ = self.n_bins - 1\n self.bin_thresholds_ = []\n n_bins_non_missing = []\n for f_idx in range(n_features):\n if not self.is_categorical_[f_idx]:\n thresholds = _find_binning_thresholds(X[:, f_idx], max_bins)\n n_bins_non_missing.append(thresholds.shape[0] + 1)\n else:\n thresholds = known_categories[f_idx]\n n_bins_non_missing.append(thresholds.shape[0])\n self.bin_thresholds_.append(thresholds)\n self.n_bins_non_missing_ = np.array(n_bins_non_missing, dtype=np.uint32)\n return self" }, { @@ -65958,13 +69738,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Create bitsets of known categories.", - "docstring": "Create bitsets of known categories.\n\nReturns\n-------\n- known_cat_bitsets : ndarray of shape (n_categorical_features, 8)\n Array of bitsets of known categories, for each categorical feature.\n- f_idx_map : ndarray of shape (n_features,)\n Map from original feature index to the corresponding index in the\n known_cat_bitsets array.", + "docstring": "Create bitsets of known categories.\n\n Returns\n -------\n - known_cat_bitsets : ndarray of shape (n_categorical_features, 8)\n Array of bitsets of known categories, for each categorical feature.\n - f_idx_map : ndarray of shape (n_features,)\n Map from original feature index to the corresponding index in the\n known_cat_bitsets array.\n ", "source_code": "\ndef make_known_categories_bitsets(self):\n \"\"\"Create bitsets of known categories.\n\n Returns\n -------\n - known_cat_bitsets : ndarray of shape (n_categorical_features, 8)\n Array of bitsets of known categories, for each categorical feature.\n - f_idx_map : ndarray of shape (n_features,)\n Map from original feature index to the corresponding index in the\n known_cat_bitsets array.\n \"\"\"\n categorical_features_indices = np.flatnonzero(self.is_categorical_)\n n_features = self.is_categorical_.size\n n_categorical_features = categorical_features_indices.size\n f_idx_map = np.zeros(n_features, dtype=np.uint32)\n f_idx_map[categorical_features_indices] = np.arange(n_categorical_features, dtype=np.uint32)\n known_categories = self.bin_thresholds_\n known_cat_bitsets = np.zeros((n_categorical_features, 8), dtype=X_BITSET_INNER_DTYPE)\n for (mapped_f_idx, f_idx) in enumerate(categorical_features_indices):\n for raw_cat_val in known_categories[f_idx]:\n set_bitset_memoryview(known_cat_bitsets[mapped_f_idx], raw_cat_val)\n return known_cat_bitsets, f_idx_map" }, { @@ -65982,7 +69763,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -65992,13 +69774,14 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "The data to bin." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Bin data X.\n\nMissing values will be mapped to the last bin. For categorical features, the mapping will be incorrect for unknown categories. Since the BinMapper is given known_categories of the entire training data (i.e. before the call to train_test_split() in case of early-stopping), this never happens.", - "docstring": "Bin data X.\n\nMissing values will be mapped to the last bin.\n\nFor categorical features, the mapping will be incorrect for unknown\ncategories. Since the BinMapper is given known_categories of the\nentire training data (i.e. 
before the call to train_test_split() in\ncase of early-stopping), this never happens.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n The data to bin.\n\nReturns\n-------\nX_binned : array-like of shape (n_samples, n_features)\n The binned data (fortran-aligned).", + "description": "Bin data X.\n\nMissing values will be mapped to the last bin.\n\nFor categorical features, the mapping will be incorrect for unknown\ncategories. Since the BinMapper is given known_categories of the\nentire training data (i.e. before the call to train_test_split() in\ncase of early-stopping), this never happens.", + "docstring": "Bin data X.\n\n Missing values will be mapped to the last bin.\n\n For categorical features, the mapping will be incorrect for unknown\n categories. Since the BinMapper is given known_categories of the\n entire training data (i.e. before the call to train_test_split() in\n case of early-stopping), this never happens.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The data to bin.\n\n Returns\n -------\n X_binned : array-like of shape (n_samples, n_features)\n The binned data (fortran-aligned).\n ", "source_code": "\ndef transform(self, X):\n \"\"\"Bin data X.\n\n Missing values will be mapped to the last bin.\n\n For categorical features, the mapping will be incorrect for unknown\n categories. Since the BinMapper is given known_categories of the\n entire training data (i.e. before the call to train_test_split() in\n case of early-stopping), this never happens.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The data to bin.\n\n Returns\n -------\n X_binned : array-like of shape (n_samples, n_features)\n The binned data (fortran-aligned).\n \"\"\"\n X = check_array(X, dtype=[X_DTYPE], force_all_finite=False)\n check_is_fitted(self)\n if X.shape[1] != self.n_bins_non_missing_.shape[0]:\n raise ValueError('This estimator was fitted with {} features but {} got passed to transform()'.format(self.n_bins_non_missing_.shape[0], X.shape[1]))\n n_threads = _openmp_effective_n_threads(self.n_threads)\n binned = np.zeros_like(X, dtype=X_BINNED_DTYPE, order='F')\n _map_to_bins(X, self.bin_thresholds_, self.missing_values_bin_idx_, n_threads, binned)\n return binned" }, { @@ -66016,7 +69799,8 @@ "docstring": { "type": "array-like, shape (n_samples,)", "description": "The continuous feature to bin." - } + }, + "refined_type": {} }, { "name": "max_bins", @@ -66026,14 +69810,15 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Extract quantiles from a continuous feature.\n\nMissing values are ignored for finding the thresholds.", - "docstring": "Extract quantiles from a continuous feature.\n\nMissing values are ignored for finding the thresholds.\n\nParameters\n----------\ncol_data : array-like, shape (n_samples,)\n The continuous feature to bin.\nmax_bins: int\n The maximum number of bins to use for non-missing values. 
If for a\n given feature the number of unique values is less than ``max_bins``,\n then those unique values will be used to compute the bin thresholds,\n instead of the quantiles\n\nReturn\n------\nbinning_thresholds : ndarray of shape(min(max_bins, n_unique_values) - 1,)\n The increasing numeric values that can be used to separate the bins.\n A given value x will be mapped into bin value i iff\n bining_thresholds[i - 1] < x <= binning_thresholds[i]", - "source_code": "\ndef _find_binning_thresholds(col_data, max_bins):\n \"\"\"Extract quantiles from a continuous feature.\n\n Missing values are ignored for finding the thresholds.\n\n Parameters\n ----------\n col_data : array-like, shape (n_samples,)\n The continuous feature to bin.\n max_bins: int\n The maximum number of bins to use for non-missing values. If for a\n given feature the number of unique values is less than ``max_bins``,\n then those unique values will be used to compute the bin thresholds,\n instead of the quantiles\n\n Return\n ------\n binning_thresholds : ndarray of shape(min(max_bins, n_unique_values) - 1,)\n The increasing numeric values that can be used to separate the bins.\n A given value x will be mapped into bin value i iff\n bining_thresholds[i - 1] < x <= binning_thresholds[i]\n \"\"\"\n missing_mask = np.isnan(col_data)\n if missing_mask.any():\n col_data = col_data[~missing_mask]\n col_data = np.ascontiguousarray(col_data, dtype=X_DTYPE)\n distinct_values = np.unique(col_data)\n if len(distinct_values) <= max_bins:\n midpoints = distinct_values[:-1] + distinct_values[1:]\n midpoints *= 0.5\n else:\n percentiles = np.linspace(0, 100, num=max_bins + 1)\n percentiles = percentiles[1:-1]\n midpoints = np.percentile(col_data, percentiles, interpolation='midpoint').astype(X_DTYPE)\n assert midpoints.shape[0] == max_bins - 1\n np.clip(midpoints, a_min=None, a_max=ALMOST_INF, out=midpoints)\n return midpoints" + "docstring": "Extract quantiles from a continuous feature.\n\n Missing values are ignored for finding the thresholds.\n\n Parameters\n ----------\n col_data : array-like, shape (n_samples,)\n The continuous feature to bin.\n max_bins: int\n The maximum number of bins to use for non-missing values. If for a\n given feature the number of unique values is less than ``max_bins``,\n then those unique values will be used to compute the bin thresholds,\n instead of the quantiles\n\n Return\n ------\n binning_thresholds : ndarray of shape(min(max_bins, n_unique_values) - 1,)\n The increasing numeric values that can be used to separate the bins.\n A given value x will be mapped into bin value i iff\n bining_thresholds[i - 1] < x <= binning_thresholds[i]\n ", + "source_code": "\ndef _find_binning_thresholds(col_data, max_bins):\n \"\"\"Extract quantiles from a continuous feature.\n\n Missing values are ignored for finding the thresholds.\n\n Parameters\n ----------\n col_data : array-like, shape (n_samples,)\n The continuous feature to bin.\n max_bins: int\n The maximum number of bins to use for non-missing values. 
If for a\n given feature the number of unique values is less than ``max_bins``,\n then those unique values will be used to compute the bin thresholds,\n instead of the quantiles\n\n Return\n ------\n binning_thresholds : ndarray of shape(min(max_bins, n_unique_values) - 1,)\n The increasing numeric values that can be used to separate the bins.\n A given value x will be mapped into bin value i iff\n bining_thresholds[i - 1] < x <= binning_thresholds[i]\n \"\"\"\n missing_mask = np.isnan(col_data)\n if missing_mask.any():\n col_data = col_data[~missing_mask]\n col_data = np.ascontiguousarray(col_data, dtype=X_DTYPE)\n distinct_values = np.unique(col_data)\n if len(distinct_values) <= max_bins:\n midpoints = distinct_values[:-1] + distinct_values[1:]\n midpoints *= 0.5\n else:\n percentiles = np.linspace(0, 100, num=max_bins + 1)\n percentiles = percentiles[1:-1]\n midpoints = percentile(col_data, percentiles, method='midpoint').astype(X_DTYPE)\n assert midpoints.shape[0] == max_bins - 1\n np.clip(midpoints, a_min=None, a_max=ALMOST_INF, out=midpoints)\n return midpoints" }, { "name": "__init__", @@ -66050,7 +69835,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "loss", @@ -66060,7 +69846,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "learning_rate", @@ -66070,7 +69857,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "max_iter", @@ -66080,7 +69868,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "max_leaf_nodes", @@ -66090,7 +69879,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "max_depth", @@ -66100,7 +69890,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "min_samples_leaf", @@ -66110,7 +69901,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "l2_regularization", @@ -66120,7 +69912,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "max_bins", @@ -66130,7 +69923,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "categorical_features", @@ -66140,7 +69934,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "monotonic_cst", @@ -66150,7 +69945,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "warm_start", @@ -66160,7 +69956,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "early_stopping", @@ -66170,7 +69967,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "scoring", @@ -66180,7 +69978,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "validation_fraction", @@ -66190,7 +69989,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_iter_no_change", @@ -66200,7 +70000,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "tol", @@ -66210,7 +70011,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "verbose", @@ -66220,7 +70022,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "random_state", @@ -66230,13 +70033,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, 
"description": "", - "docstring": "", + "docstring": null, "source_code": "\n@abstractmethod\ndef __init__(self, loss, *, learning_rate, max_iter, max_leaf_nodes, max_depth, min_samples_leaf, l2_regularization, max_bins, categorical_features, monotonic_cst, warm_start, early_stopping, scoring, validation_fraction, n_iter_no_change, tol, verbose, random_state):\n self.loss = loss\n self.learning_rate = learning_rate\n self.max_iter = max_iter\n self.max_leaf_nodes = max_leaf_nodes\n self.max_depth = max_depth\n self.min_samples_leaf = min_samples_leaf\n self.l2_regularization = l2_regularization\n self.max_bins = max_bins\n self.monotonic_cst = monotonic_cst\n self.categorical_features = categorical_features\n self.warm_start = warm_start\n self.early_stopping = early_stopping\n self.scoring = scoring\n self.validation_fraction = validation_fraction\n self.n_iter_no_change = n_iter_no_change\n self.tol = tol\n self.verbose = verbose\n self.random_state = random_state" }, { @@ -66254,7 +70058,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -66264,7 +70069,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "is_training_data", @@ -66274,13 +70080,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Bin data X.\n\nIf is_training_data, then fit the _bin_mapper attribute. Else, the binned data is converted to a C-contiguous array.", - "docstring": "Bin data X.\n\nIf is_training_data, then fit the _bin_mapper attribute.\nElse, the binned data is converted to a C-contiguous array.", + "description": "Bin data X.\n\nIf is_training_data, then fit the _bin_mapper attribute.\nElse, the binned data is converted to a C-contiguous array.", + "docstring": "Bin data X.\n\n If is_training_data, then fit the _bin_mapper attribute.\n Else, the binned data is converted to a C-contiguous array.\n ", "source_code": "\ndef _bin_data(self, X, is_training_data):\n \"\"\"Bin data X.\n\n If is_training_data, then fit the _bin_mapper attribute.\n Else, the binned data is converted to a C-contiguous array.\n \"\"\"\n description = 'training' if is_training_data else 'validation'\n if self.verbose:\n print('Binning {:.3f} GB of {} data: '.format(X.nbytes / 1000000000.0, description), end='', flush=True)\n tic = time()\n if is_training_data:\n X_binned = self._bin_mapper.fit_transform(X)\n else:\n X_binned = self._bin_mapper.transform(X)\n X_binned = np.ascontiguousarray(X_binned)\n toc = time()\n if self.verbose:\n duration = toc - tic\n print('{:.3f} s'.format(duration))\n return X_binned" }, { @@ -66298,7 +70105,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -66308,13 +70116,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Check and validate categorical features in X", - "docstring": "Check and validate categorical features in X\n\nReturn\n------\nis_categorical : ndarray of shape (n_features,) or None, dtype=bool\n Indicates whether a feature is categorical. 
If no feature is\n categorical, this is None.\nknown_categories : list of size n_features or None\n The list contains, for each feature:\n - an array of shape (n_categories,) with the unique cat values\n - None if the feature is not categorical\n None if no feature is categorical.", + "docstring": "Check and validate categorical features in X\n\n Return\n ------\n is_categorical : ndarray of shape (n_features,) or None, dtype=bool\n Indicates whether a feature is categorical. If no feature is\n categorical, this is None.\n known_categories : list of size n_features or None\n The list contains, for each feature:\n - an array of shape (n_categories,) with the unique cat values\n - None if the feature is not categorical\n None if no feature is categorical.\n ", "source_code": "\ndef _check_categories(self, X):\n \"\"\"Check and validate categorical features in X\n\n Return\n ------\n is_categorical : ndarray of shape (n_features,) or None, dtype=bool\n Indicates whether a feature is categorical. If no feature is\n categorical, this is None.\n known_categories : list of size n_features or None\n The list contains, for each feature:\n - an array of shape (n_categories,) with the unique cat values\n - None if the feature is not categorical\n None if no feature is categorical.\n \"\"\"\n if self.categorical_features is None:\n return None, None\n categorical_features = np.asarray(self.categorical_features)\n if categorical_features.size == 0:\n return None, None\n if categorical_features.dtype.kind not in ('i', 'b'):\n raise ValueError('categorical_features must be an array-like of bools or array-like of ints.')\n n_features = X.shape[1]\n if categorical_features.dtype.kind == 'i':\n if np.max(categorical_features) >= n_features or np.min(categorical_features) < 0:\n raise ValueError('categorical_features set as integer indices must be in [0, n_features - 1]')\n is_categorical = np.zeros(n_features, dtype=bool)\n is_categorical[categorical_features] = True\n else:\n if categorical_features.shape[0] != n_features:\n raise ValueError(f'categorical_features set as a boolean mask must have shape (n_features,), got: {categorical_features.shape}')\n is_categorical = categorical_features\n if not np.any(is_categorical):\n return None, None\n known_categories = []\n for f_idx in range(n_features):\n if is_categorical[f_idx]:\n categories = np.unique(X[:, f_idx])\n missing = np.isnan(categories)\n if missing.any():\n categories = categories[~missing]\n if categories.size > self.max_bins:\n raise ValueError(f'Categorical feature at index {f_idx} is expected to have a cardinality <= {self.max_bins}')\n if (categories >= self.max_bins).any():\n raise ValueError(f'Categorical feature at index {f_idx} is expected to be encoded with values < {self.max_bins}')\n else:\n categories = None\n known_categories.append(categories)\n return is_categorical, known_categories" }, { @@ -66332,7 +70141,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "raw_predictions", @@ -66342,7 +70152,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y_train", @@ -66352,7 +70163,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "sample_weight_train", @@ -66362,7 +70174,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "raw_predictions_val", @@ -66372,7 +70185,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y_val", @@ 
-66382,7 +70196,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "sample_weight_val", @@ -66392,13 +70207,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Check if fitting should be early-stopped based on loss.\n\nScores are computed on validation data or on training data.", - "docstring": "Check if fitting should be early-stopped based on loss.\n\nScores are computed on validation data or on training data.", + "docstring": "Check if fitting should be early-stopped based on loss.\n\n Scores are computed on validation data or on training data.\n ", "source_code": "\ndef _check_early_stopping_loss(self, raw_predictions, y_train, sample_weight_train, raw_predictions_val, y_val, sample_weight_val):\n \"\"\"Check if fitting should be early-stopped based on loss.\n\n Scores are computed on validation data or on training data.\n \"\"\"\n self.train_score_.append(-self._loss(y_train, raw_predictions, sample_weight_train))\n if self._use_validation_data:\n self.validation_score_.append(-self._loss(y_val, raw_predictions_val, sample_weight_val))\n return self._should_stop(self.validation_score_)\n else:\n return self._should_stop(self.train_score_)" }, { @@ -66416,7 +70232,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X_binned_small_train", @@ -66426,7 +70243,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y_small_train", @@ -66436,7 +70254,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "sample_weight_small_train", @@ -66446,7 +70265,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X_binned_val", @@ -66456,7 +70276,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y_val", @@ -66466,7 +70287,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "sample_weight_val", @@ -66476,13 +70298,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Check if fitting should be early-stopped based on scorer.\n\nScores are computed on validation data or on training data.", - "docstring": "Check if fitting should be early-stopped based on scorer.\n\nScores are computed on validation data or on training data.", + "docstring": "Check if fitting should be early-stopped based on scorer.\n\n Scores are computed on validation data or on training data.\n ", "source_code": "\ndef _check_early_stopping_scorer(self, X_binned_small_train, y_small_train, sample_weight_small_train, X_binned_val, y_val, sample_weight_val):\n \"\"\"Check if fitting should be early-stopped based on scorer.\n\n Scores are computed on validation data or on training data.\n \"\"\"\n if is_classifier(self):\n y_small_train = self.classes_[y_small_train.astype(int)]\n if sample_weight_small_train is None:\n self.train_score_.append(self._scorer(self, X_binned_small_train, y_small_train))\n else:\n self.train_score_.append(self._scorer(self, X_binned_small_train, y_small_train, sample_weight=sample_weight_small_train))\n if self._use_validation_data:\n if is_classifier(self):\n y_val = self.classes_[y_val.astype(int)]\n if sample_weight_val is None:\n self.validation_score_.append(self._scorer(self, X_binned_val, y_val))\n else:\n self.validation_score_.append(self._scorer(self, 
X_binned_val, y_val, sample_weight=sample_weight_val))\n return self._should_stop(self.validation_score_)\n else:\n return self._should_stop(self.train_score_)" }, { @@ -66500,7 +70323,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -66524,7 +70348,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "grid", @@ -66534,7 +70359,8 @@ "docstring": { "type": "ndarray, shape (n_samples, n_target_features)", "description": "The grid points on which the partial dependence should be\nevaluated." - } + }, + "refined_type": {} }, { "name": "target_features", @@ -66544,13 +70370,14 @@ "docstring": { "type": "ndarray, shape (n_target_features)", "description": "The set of target features for which the partial dependence\nshould be evaluated." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Fast partial dependence computation.", - "docstring": "Fast partial dependence computation.\n\nParameters\n----------\ngrid : ndarray, shape (n_samples, n_target_features)\n The grid points on which the partial dependence should be\n evaluated.\ntarget_features : ndarray, shape (n_target_features)\n The set of target features for which the partial dependence\n should be evaluated.\n\nReturns\n-------\naveraged_predictions : ndarray, shape (n_trees_per_iteration, n_samples)\n The value of the partial dependence function on each grid point.", + "docstring": "Fast partial dependence computation.\n\n Parameters\n ----------\n grid : ndarray, shape (n_samples, n_target_features)\n The grid points on which the partial dependence should be\n evaluated.\n target_features : ndarray, shape (n_target_features)\n The set of target features for which the partial dependence\n should be evaluated.\n\n Returns\n -------\n averaged_predictions : ndarray, shape (n_trees_per_iteration, n_samples)\n The value of the partial dependence function on each grid point.\n ", "source_code": "\ndef _compute_partial_dependence_recursion(self, grid, target_features):\n \"\"\"Fast partial dependence computation.\n\n Parameters\n ----------\n grid : ndarray, shape (n_samples, n_target_features)\n The grid points on which the partial dependence should be\n evaluated.\n target_features : ndarray, shape (n_target_features)\n The set of target features for which the partial dependence\n should be evaluated.\n\n Returns\n -------\n averaged_predictions : ndarray, shape (n_trees_per_iteration, n_samples)\n The value of the partial dependence function on each grid point.\n \"\"\"\n if getattr(self, '_fitted_with_sw', False):\n raise NotImplementedError(\"{} does not support partial dependence plots with the 'recursion' method when sample weights were given during fit time.\".format(self.__class__.__name__))\n grid = np.asarray(grid, dtype=X_DTYPE, order='C')\n averaged_predictions = np.zeros((self.n_trees_per_iteration_, grid.shape[0]), dtype=Y_DTYPE)\n for predictors_of_ith_iteration in self._predictors:\n for (k, predictor) in enumerate(predictors_of_ith_iteration):\n predictor.compute_partial_dependence(grid, target_features, averaged_predictions[k])\n return averaged_predictions" }, { @@ -66568,7 +70395,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -66578,13 +70406,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": 
"\n@abstractmethod\ndef _encode_y(self, y=None):\n pass" }, { @@ -66602,7 +70431,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -66612,7 +70442,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_threads", @@ -66622,13 +70453,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@abstractmethod\ndef _get_loss(self, sample_weight, n_threads):\n pass" }, { @@ -66646,7 +70478,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X_binned_train", @@ -66656,7 +70489,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y_train", @@ -66666,7 +70500,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "sample_weight_train", @@ -66676,7 +70511,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "seed", @@ -66686,13 +70522,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Compute the indices of the subsample set and return this set.\n\nFor efficiency, we need to subsample the training set to compute scores with scorers.", - "docstring": "Compute the indices of the subsample set and return this set.\n\nFor efficiency, we need to subsample the training set to compute scores\nwith scorers.", + "description": "Compute the indices of the subsample set and return this set.\n\nFor efficiency, we need to subsample the training set to compute scores\nwith scorers.", + "docstring": "Compute the indices of the subsample set and return this set.\n\n For efficiency, we need to subsample the training set to compute scores\n with scorers.\n ", "source_code": "\ndef _get_small_trainset(self, X_binned_train, y_train, sample_weight_train, seed):\n \"\"\"Compute the indices of the subsample set and return this set.\n\n For efficiency, we need to subsample the training set to compute scores\n with scorers.\n \"\"\"\n subsample_size = 10000\n if X_binned_train.shape[0] > subsample_size:\n indices = np.arange(X_binned_train.shape[0])\n stratify = y_train if is_classifier(self) else None\n indices = resample(indices, n_samples=subsample_size, replace=False, random_state=seed, stratify=stratify)\n X_binned_small_train = X_binned_train[indices]\n y_small_train = y_train[indices]\n if sample_weight_train is not None:\n sample_weight_small_train = sample_weight_train[indices]\n else:\n sample_weight_small_train = None\n X_binned_small_train = np.ascontiguousarray(X_binned_small_train)\n return X_binned_small_train, y_small_train, sample_weight_small_train\n else:\n return X_binned_train, y_train, sample_weight_train" }, { @@ -66710,13 +70547,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _is_fitted(self):\n return len(getattr(self, '_predictors', [])) > 0" }, { @@ -66734,13 +70572,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _more_tags(self):\n return {'allow_nan': True}" }, { @@ -66758,7 +70597,8 @@ "docstring": { "type": 
"", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -66768,7 +70608,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "predictors", @@ -66778,7 +70619,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "raw_predictions", @@ -66788,7 +70630,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "is_binned", @@ -66798,7 +70641,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_threads", @@ -66808,7 +70652,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -66832,7 +70677,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "iteration_start_time", @@ -66842,7 +70688,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -66866,7 +70713,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -66876,7 +70724,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "The input samples." - } + }, + "refined_type": {} }, { "name": "n_threads", @@ -66886,13 +70735,14 @@ "docstring": { "type": "int, default=None", "description": "Number of OpenMP threads to use. `_openmp_effective_n_threads` is called\nto determine the effective number of threads use, which takes cgroups CPU\nquotes into account. See the docstring of `_openmp_effective_n_threads`\nfor details." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Return the sum of the leaves values over all predictors.", - "docstring": "Return the sum of the leaves values over all predictors.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n The input samples.\nn_threads : int, default=None\n Number of OpenMP threads to use. `_openmp_effective_n_threads` is called\n to determine the effective number of threads use, which takes cgroups CPU\n quotes into account. See the docstring of `_openmp_effective_n_threads`\n for details.\n\nReturns\n-------\nraw_predictions : array, shape (n_trees_per_iteration, n_samples)\n The raw predicted values.", + "docstring": "Return the sum of the leaves values over all predictors.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The input samples.\n n_threads : int, default=None\n Number of OpenMP threads to use. `_openmp_effective_n_threads` is called\n to determine the effective number of threads use, which takes cgroups CPU\n quotes into account. See the docstring of `_openmp_effective_n_threads`\n for details.\n\n Returns\n -------\n raw_predictions : array, shape (n_trees_per_iteration, n_samples)\n The raw predicted values.\n ", "source_code": "\ndef _raw_predict(self, X, n_threads=None):\n \"\"\"Return the sum of the leaves values over all predictors.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The input samples.\n n_threads : int, default=None\n Number of OpenMP threads to use. `_openmp_effective_n_threads` is called\n to determine the effective number of threads use, which takes cgroups CPU\n quotes into account. 
See the docstring of `_openmp_effective_n_threads`\n for details.\n\n Returns\n -------\n raw_predictions : array, shape (n_trees_per_iteration, n_samples)\n The raw predicted values.\n \"\"\"\n is_binned = getattr(self, '_in_fit', False)\n dtype = X_BINNED_DTYPE if is_binned else X_DTYPE\n X = self._validate_data(X, dtype=dtype, force_all_finite=False, reset=False)\n check_is_fitted(self)\n if X.shape[1] != self._n_features:\n raise ValueError('X has {} features but this estimator was trained with {} features.'.format(X.shape[1], self._n_features))\n n_samples = X.shape[0]\n raw_predictions = np.zeros(shape=(self.n_trees_per_iteration_, n_samples), dtype=self._baseline_prediction.dtype)\n raw_predictions += self._baseline_prediction\n n_threads = _openmp_effective_n_threads(n_threads)\n self._predict_iterations(X, self._predictors, raw_predictions, is_binned, n_threads)\n return raw_predictions" }, { @@ -66910,7 +70760,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "scores", @@ -66920,13 +70771,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Return True (do early stopping) if the last n scores aren't better than the (n-1)th-to-last score, up to some tolerance.", - "docstring": "Return True (do early stopping) if the last n scores aren't better\nthan the (n-1)th-to-last score, up to some tolerance.", + "description": "Return True (do early stopping) if the last n scores aren't better\nthan the (n-1)th-to-last score, up to some tolerance.", + "docstring": "\n Return True (do early stopping) if the last n scores aren't better\n than the (n-1)th-to-last score, up to some tolerance.\n ", "source_code": "\ndef _should_stop(self, scores):\n \"\"\"\n Return True (do early stopping) if the last n scores aren't better\n than the (n-1)th-to-last score, up to some tolerance.\n \"\"\"\n reference_position = self.n_iter_no_change + 1\n if len(scores) < reference_position:\n return False\n reference_score = scores[-reference_position] + self.tol\n recent_scores = scores[-reference_position + 1:]\n recent_improvements = [score > reference_score for score in recent_scores]\n return not any(recent_improvements)" }, { @@ -66944,7 +70796,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -66954,13 +70807,14 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "The input samples." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Compute raw predictions of ``X`` for each iteration.\n\nThis method allows monitoring (i.e. determine error on testing set) after each stage.", - "docstring": "Compute raw predictions of ``X`` for each iteration.\n\nThis method allows monitoring (i.e. determine error on testing set)\nafter each stage.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n The input samples.\n\nYields\n-------\nraw_predictions : generator of ndarray of shape (n_trees_per_iteration, n_samples)\n The raw predictions of the input samples. The order of the\n classes corresponds to that in the attribute :term:`classes_`.", + "description": "Compute raw predictions of ``X`` for each iteration.\n\nThis method allows monitoring (i.e. determine error on testing set)\nafter each stage.", + "docstring": "Compute raw predictions of ``X`` for each iteration.\n\n This method allows monitoring (i.e. 
determine error on testing set)\n after each stage.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The input samples.\n\n Yields\n -------\n raw_predictions : generator of ndarray of shape (n_trees_per_iteration, n_samples)\n The raw predictions of the input samples. The order of the\n classes corresponds to that in the attribute :term:`classes_`.\n ", "source_code": "\ndef _staged_raw_predict(self, X):\n \"\"\"Compute raw predictions of ``X`` for each iteration.\n\n This method allows monitoring (i.e. determine error on testing set)\n after each stage.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The input samples.\n\n Yields\n -------\n raw_predictions : generator of ndarray of shape (n_trees_per_iteration, n_samples)\n The raw predictions of the input samples. The order of the\n classes corresponds to that in the attribute :term:`classes_`.\n \"\"\"\n X = self._validate_data(X, dtype=X_DTYPE, force_all_finite=False, reset=False)\n check_is_fitted(self)\n if X.shape[1] != self._n_features:\n raise ValueError('X has {} features but this estimator was trained with {} features.'.format(X.shape[1], self._n_features))\n n_samples = X.shape[0]\n raw_predictions = np.zeros(shape=(self.n_trees_per_iteration_, n_samples), dtype=self._baseline_prediction.dtype)\n raw_predictions += self._baseline_prediction\n n_threads = _openmp_effective_n_threads()\n for iteration in range(len(self._predictors)):\n self._predict_iterations(X, self._predictors[iteration:iteration + 1], raw_predictions, is_binned=False, n_threads=n_threads)\n yield raw_predictions.copy()" }, { @@ -66978,13 +70832,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Validate parameters passed to __init__.\n\nThe parameters that are directly passed to the grower are checked in TreeGrower.", - "docstring": "Validate parameters passed to __init__.\n\nThe parameters that are directly passed to the grower are checked in\nTreeGrower.", + "description": "Validate parameters passed to __init__.\n\nThe parameters that are directly passed to the grower are checked in\nTreeGrower.", + "docstring": "Validate parameters passed to __init__.\n\n The parameters that are directly passed to the grower are checked in\n TreeGrower.", "source_code": "\ndef _validate_parameters(self):\n \"\"\"Validate parameters passed to __init__.\n\n The parameters that are directly passed to the grower are checked in\n TreeGrower.\"\"\"\n if self.loss not in self._VALID_LOSSES and not isinstance(self.loss, BaseLoss):\n raise ValueError('Loss {} is not supported for {}. 
Accepted losses: {}.'.format(self.loss, self.__class__.__name__, ', '.join(self._VALID_LOSSES)))\n if self.learning_rate <= 0:\n raise ValueError('learning_rate={} must be strictly positive'.format(self.learning_rate))\n if self.max_iter < 1:\n raise ValueError('max_iter={} must not be smaller than 1.'.format(self.max_iter))\n if self.n_iter_no_change < 0:\n raise ValueError('n_iter_no_change={} must be positive.'.format(self.n_iter_no_change))\n if self.validation_fraction is not None and self.validation_fraction <= 0:\n raise ValueError('validation_fraction={} must be strictly positive, or None.'.format(self.validation_fraction))\n if self.tol < 0:\n raise ValueError('tol={} must not be smaller than 0.'.format(self.tol))\n if not 2 <= self.max_bins <= 255:\n raise ValueError('max_bins={} should be no smaller than 2 and no larger than 255.'.format(self.max_bins))\n if self.monotonic_cst is not None and self.n_trees_per_iteration_ != 1:\n raise ValueError('monotonic constraints are not supported for multiclass classification.')" }, { @@ -67002,7 +70857,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -67012,7 +70868,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "The input samples." - } + }, + "refined_type": {} }, { "name": "y", @@ -67022,7 +70879,8 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "Target values." - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -67032,13 +70890,14 @@ "docstring": { "type": "array-like of shape (n_samples,) default=None", "description": "Weights of training data.\n\n.. versionadded:: 0.23" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Fit the gradient boosting model.", - "docstring": "Fit the gradient boosting model.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n The input samples.\n\ny : array-like of shape (n_samples,)\n Target values.\n\nsample_weight : array-like of shape (n_samples,) default=None\n Weights of training data.\n\n .. versionadded:: 0.23\n\nReturns\n-------\nself : object\n Fitted estimator.", + "docstring": "Fit the gradient boosting model.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The input samples.\n\n y : array-like of shape (n_samples,)\n Target values.\n\n sample_weight : array-like of shape (n_samples,) default=None\n Weights of training data.\n\n .. versionadded:: 0.23\n\n Returns\n -------\n self : object\n Fitted estimator.\n ", "source_code": "\ndef fit(self, X, y, sample_weight=None):\n \"\"\"Fit the gradient boosting model.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The input samples.\n\n y : array-like of shape (n_samples,)\n Target values.\n\n sample_weight : array-like of shape (n_samples,) default=None\n Weights of training data.\n\n .. 
versionadded:: 0.23\n\n Returns\n -------\n self : object\n Fitted estimator.\n \"\"\"\n fit_start_time = time()\n acc_find_split_time = 0.0\n acc_apply_split_time = 0.0\n acc_compute_hist_time = 0.0\n acc_prediction_time = 0.0\n (X, y) = self._validate_data(X, y, dtype=[X_DTYPE], force_all_finite=False)\n y = self._encode_y(y)\n check_consistent_length(X, y)\n if sample_weight is not None:\n sample_weight = _check_sample_weight(sample_weight, X, dtype=np.float64)\n self._fitted_with_sw = True\n rng = check_random_state(self.random_state)\n if not (self.warm_start and self._is_fitted()):\n self._random_seed = rng.randint(np.iinfo(np.uint32).max, dtype='u8')\n self._validate_parameters()\n (n_samples, self._n_features) = X.shape\n (self.is_categorical_, known_categories) = self._check_categories(X)\n self._in_fit = True\n n_threads = _openmp_effective_n_threads()\n if isinstance(self.loss, str):\n self._loss = self._get_loss(sample_weight=sample_weight, n_threads=n_threads)\n elif isinstance(self.loss, BaseLoss):\n self._loss = self.loss\n if self.early_stopping == 'auto':\n self.do_early_stopping_ = n_samples > 10000\n else:\n self.do_early_stopping_ = self.early_stopping\n self._use_validation_data = self.validation_fraction is not None\n if self.do_early_stopping_ and self._use_validation_data:\n stratify = y if hasattr(self._loss, 'predict_proba') else None\n if sample_weight is None:\n (X_train, X_val, y_train, y_val) = train_test_split(X, y, test_size=self.validation_fraction, stratify=stratify, random_state=self._random_seed)\n sample_weight_train = sample_weight_val = None\n else:\n (X_train, X_val, y_train, y_val, sample_weight_train, sample_weight_val) = train_test_split(X, y, sample_weight, test_size=self.validation_fraction, stratify=stratify, random_state=self._random_seed)\n else:\n (X_train, y_train, sample_weight_train) = (X, y, sample_weight)\n X_val = y_val = sample_weight_val = None\n n_bins = self.max_bins + 1\n self._bin_mapper = _BinMapper(n_bins=n_bins, is_categorical=self.is_categorical_, known_categories=known_categories, random_state=self._random_seed, n_threads=n_threads)\n X_binned_train = self._bin_data(X_train, is_training_data=True)\n if X_val is not None:\n X_binned_val = self._bin_data(X_val, is_training_data=False)\n else:\n X_binned_val = None\n has_missing_values = (X_binned_train == self._bin_mapper.missing_values_bin_idx_).any(axis=0).astype(np.uint8)\n if self.verbose:\n print('Fitting gradient boosted rounds:')\n n_samples = X_binned_train.shape[0]\n if not (self._is_fitted() and self.warm_start):\n self._clear_state()\n self._baseline_prediction = self._loss.get_baseline_prediction(y_train, sample_weight_train, self.n_trees_per_iteration_)\n raw_predictions = np.zeros(shape=(self.n_trees_per_iteration_, n_samples), dtype=self._baseline_prediction.dtype)\n raw_predictions += self._baseline_prediction\n self._predictors = predictors = []\n self._scorer = None\n raw_predictions_val = None\n self.train_score_ = []\n self.validation_score_ = []\n if self.do_early_stopping_:\n if self.scoring == 'loss':\n if self._use_validation_data:\n raw_predictions_val = np.zeros(shape=(self.n_trees_per_iteration_, X_binned_val.shape[0]), dtype=self._baseline_prediction.dtype)\n raw_predictions_val += self._baseline_prediction\n self._check_early_stopping_loss(raw_predictions, y_train, sample_weight_train, raw_predictions_val, y_val, sample_weight_val)\n else:\n self._scorer = check_scoring(self, self.scoring)\n (X_binned_small_train, y_small_train, 
sample_weight_small_train) = self._get_small_trainset(X_binned_train, y_train, sample_weight_train, self._random_seed)\n self._check_early_stopping_scorer(X_binned_small_train, y_small_train, sample_weight_small_train, X_binned_val, y_val, sample_weight_val)\n begin_at_stage = 0\n else:\n if self.max_iter < self.n_iter_:\n raise ValueError('max_iter=%d must be larger than or equal to n_iter_=%d when warm_start==True' % (self.max_iter, self.n_iter_))\n self.train_score_ = self.train_score_.tolist()\n self.validation_score_ = self.validation_score_.tolist()\n raw_predictions = self._raw_predict(X_binned_train, n_threads=n_threads)\n if self.do_early_stopping_ and self._use_validation_data:\n raw_predictions_val = self._raw_predict(X_binned_val, n_threads=n_threads)\n else:\n raw_predictions_val = None\n if self.do_early_stopping_ and self.scoring != 'loss':\n (X_binned_small_train, y_small_train, sample_weight_small_train) = self._get_small_trainset(X_binned_train, y_train, sample_weight_train, self._random_seed)\n predictors = self._predictors\n begin_at_stage = self.n_iter_\n (gradients, hessians) = self._loss.init_gradients_and_hessians(n_samples=n_samples, prediction_dim=self.n_trees_per_iteration_, sample_weight=sample_weight_train)\n for iteration in range(begin_at_stage, self.max_iter):\n if self.verbose:\n iteration_start_time = time()\n print('[{}/{}] '.format(iteration + 1, self.max_iter), end='', flush=True)\n self._loss.update_gradients_and_hessians(gradients, hessians, y_train, raw_predictions, sample_weight_train)\n predictors.append([])\n for k in range(self.n_trees_per_iteration_):\n grower = TreeGrower(X_binned_train, gradients[k, :], hessians[k, :], n_bins=n_bins, n_bins_non_missing=self._bin_mapper.n_bins_non_missing_, has_missing_values=has_missing_values, is_categorical=self.is_categorical_, monotonic_cst=self.monotonic_cst, max_leaf_nodes=self.max_leaf_nodes, max_depth=self.max_depth, min_samples_leaf=self.min_samples_leaf, l2_regularization=self.l2_regularization, shrinkage=self.learning_rate, n_threads=n_threads)\n grower.grow()\n acc_apply_split_time += grower.total_apply_split_time\n acc_find_split_time += grower.total_find_split_time\n acc_compute_hist_time += grower.total_compute_hist_time\n if self._loss.need_update_leaves_values:\n self._loss.update_leaves_values(grower, y_train, raw_predictions[k, :], sample_weight_train)\n predictor = grower.make_predictor(binning_thresholds=self._bin_mapper.bin_thresholds_)\n predictors[-1].append(predictor)\n tic_pred = time()\n _update_raw_predictions(raw_predictions[k, :], grower, n_threads)\n toc_pred = time()\n acc_prediction_time += toc_pred - tic_pred\n should_early_stop = False\n if self.do_early_stopping_:\n if self.scoring == 'loss':\n if self._use_validation_data:\n for (k, pred) in enumerate(self._predictors[-1]):\n raw_predictions_val[k, :] += pred.predict_binned(X_binned_val, self._bin_mapper.missing_values_bin_idx_, n_threads)\n should_early_stop = self._check_early_stopping_loss(raw_predictions, y_train, sample_weight_train, raw_predictions_val, y_val, sample_weight_val)\n else:\n should_early_stop = self._check_early_stopping_scorer(X_binned_small_train, y_small_train, sample_weight_small_train, X_binned_val, y_val, sample_weight_val)\n if self.verbose:\n self._print_iteration_stats(iteration_start_time)\n if should_early_stop:\n break\n if self.verbose:\n duration = time() - fit_start_time\n n_total_leaves = sum((predictor.get_n_leaf_nodes() for predictors_at_ith_iteration in self._predictors for predictor 
in predictors_at_ith_iteration))\n n_predictors = sum((len(predictors_at_ith_iteration) for predictors_at_ith_iteration in self._predictors))\n print('Fit {} trees in {:.3f} s, ({} total leaves)'.format(n_predictors, duration, n_total_leaves))\n print('{:<32} {:.3f}s'.format('Time spent computing histograms:', acc_compute_hist_time))\n print('{:<32} {:.3f}s'.format('Time spent finding best splits:', acc_find_split_time))\n print('{:<32} {:.3f}s'.format('Time spent applying splits:', acc_apply_split_time))\n print('{:<32} {:.3f}s'.format('Time spent predicting:', acc_prediction_time))\n self.train_score_ = np.asarray(self.train_score_)\n self.validation_score_ = np.asarray(self.validation_score_)\n del self._in_fit\n return self" }, { @@ -67056,7 +70915,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -67080,7 +70940,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "loss", @@ -67090,6 +70951,14 @@ "docstring": { "type": "{'auto', 'binary_crossentropy', 'categorical_crossentropy'}, default='auto'", "description": "The loss function to use in the boosting process. 'binary_crossentropy'\n(also known as logistic loss) is used for binary classification and\ngeneralizes to 'categorical_crossentropy' for multiclass\nclassification. 'auto' will automatically choose either loss depending\non the nature of the problem." + }, + "refined_type": { + "kind": "EnumType", + "values": [ + "auto", + "categorical_crossentropy", + "binary_crossentropy" + ] } }, { @@ -67100,7 +70969,8 @@ "docstring": { "type": "float, default=0.1", "description": "The learning rate, also known as *shrinkage*. This is used as a\nmultiplicative factor for the leaves values. Use ``1`` for no\nshrinkage." - } + }, + "refined_type": {} }, { "name": "max_iter", @@ -67110,7 +70980,8 @@ "docstring": { "type": "int, default=100", "description": "The maximum number of iterations of the boosting process, i.e. the\nmaximum number of trees for binary classification. For multiclass\nclassification, `n_classes` trees per iteration are built." - } + }, + "refined_type": {} }, { "name": "max_leaf_nodes", @@ -67120,7 +70991,8 @@ "docstring": { "type": "int or None, default=31", "description": "The maximum number of leaves for each tree. Must be strictly greater\nthan 1. If None, there is no maximum limit." - } + }, + "refined_type": {} }, { "name": "max_depth", @@ -67130,7 +71002,8 @@ "docstring": { "type": "int or None, default=None", "description": "The maximum depth of each tree. The depth of a tree is the number of\nedges to go from the root to the deepest leaf.\nDepth isn't constrained by default." - } + }, + "refined_type": {} }, { "name": "min_samples_leaf", @@ -67140,7 +71013,8 @@ "docstring": { "type": "int, default=20", "description": "The minimum number of samples per leaf. For small datasets with less\nthan a few hundred samples, it is recommended to lower this value\nsince only very shallow trees would be built." - } + }, + "refined_type": {} }, { "name": "l2_regularization", @@ -67150,7 +71024,8 @@ "docstring": { "type": "float, default=0", "description": "The L2 regularization parameter. Use 0 for no regularization." - } + }, + "refined_type": {} }, { "name": "max_bins", @@ -67160,7 +71035,8 @@ "docstring": { "type": "int, default=255", "description": "The maximum number of bins to use for non-missing values. 
Before\ntraining, each feature of the input array `X` is binned into\ninteger-valued bins, which allows for a much faster training stage.\nFeatures with a small number of unique values may use less than\n``max_bins`` bins. In addition to the ``max_bins`` bins, one more bin\nis always reserved for missing values. Must be no larger than 255." - } + }, + "refined_type": {} }, { "name": "categorical_features", @@ -67170,6 +71046,10 @@ "docstring": { "type": "array-like of {bool, int} of shape (n_features) or shape (n_categorical_features,), default=None", "description": "Indicates the categorical features.\n\n- None : no feature will be considered categorical.\n- boolean array-like : boolean mask indicating categorical features.\n- integer array-like : integer indices indicating categorical\n features.\n\nFor each categorical feature, there must be at most `max_bins` unique\ncategories, and each categorical value must be in [0, max_bins -1].\n\nRead more in the :ref:`User Guide `.\n\n.. versionadded:: 0.24" + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -67180,7 +71060,8 @@ "docstring": { "type": "array-like of int of shape (n_features), default=None", "description": "Indicates the monotonic constraint to enforce on each feature. -1, 1\nand 0 respectively correspond to a negative constraint, positive\nconstraint and no constraint. Read more in the :ref:`User Guide\n`.\n\n.. versionadded:: 0.23" - } + }, + "refined_type": {} }, { "name": "warm_start", @@ -67190,7 +71071,8 @@ "docstring": { "type": "bool, default=False", "description": "When set to ``True``, reuse the solution of the previous call to fit\nand add more estimators to the ensemble. For results to be valid, the\nestimator should be re-trained on the same data only.\nSee :term:`the Glossary `." - } + }, + "refined_type": {} }, { "name": "early_stopping", @@ -67200,7 +71082,8 @@ "docstring": { "type": "'auto' or bool, default='auto'", "description": "If 'auto', early stopping is enabled if the sample size is larger than\n10000. If True, early stopping is enabled, otherwise early stopping is\ndisabled.\n\n.. versionadded:: 0.23" - } + }, + "refined_type": {} }, { "name": "scoring", @@ -67210,7 +71093,8 @@ "docstring": { "type": "str or callable or None, default='loss'", "description": "Scoring parameter to use for early stopping. It can be a single\nstring (see :ref:`scoring_parameter`) or a callable (see\n:ref:`scoring`). If None, the estimator's default scorer\nis used. If ``scoring='loss'``, early stopping is checked\nw.r.t the loss value. Only used if early stopping is performed." - } + }, + "refined_type": {} }, { "name": "validation_fraction", @@ -67220,7 +71104,8 @@ "docstring": { "type": "int or float or None, default=0.1", "description": "Proportion (or absolute size) of training data to set aside as\nvalidation data for early stopping. If None, early stopping is done on\nthe training data. Only used if early stopping is performed." - } + }, + "refined_type": {} }, { "name": "n_iter_no_change", @@ -67230,7 +71115,8 @@ "docstring": { "type": "int, default=10", "description": "Used to determine when to \"early stop\". The fitting process is\nstopped when none of the last ``n_iter_no_change`` scores are better\nthan the ``n_iter_no_change - 1`` -th-to-last one, up to some\ntolerance. Only used if early stopping is performed." 
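The classifier parameters documented above combine into an ordinary fit call. A minimal, illustrative sketch (assumes scikit-learn >= 1.0, where `HistGradientBoostingClassifier` is stable and needs no experimental import; the synthetic dataset is hypothetical):

```python
# Illustrative sketch: early stopping on a held-out validation fraction.
from sklearn.datasets import make_classification
from sklearn.ensemble import HistGradientBoostingClassifier

X, y = make_classification(n_samples=20_000, n_features=20, random_state=0)

clf = HistGradientBoostingClassifier(
    learning_rate=0.1,         # shrinkage applied to the leaf values
    max_iter=200,              # maximum number of boosting iterations
    max_leaf_nodes=31,
    early_stopping=True,       # 'auto' would also enable it here (n_samples > 10_000)
    validation_fraction=0.1,   # share of training data held out for scoring
    n_iter_no_change=10,       # stop when the last 10 scores show no improvement
    tol=1e-7,
    random_state=0,
)
clf.fit(X, y)
print(clf.n_iter_, clf.train_score_[-1], clf.validation_score_[-1])
```

With the default `scoring='loss'`, `train_score_` and `validation_score_` hold the negative loss per iteration, so higher values are better.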
- } + }, + "refined_type": {} }, { "name": "tol", @@ -67240,7 +71126,8 @@ "docstring": { "type": "float, default=1e-7", "description": "The absolute tolerance to use when comparing scores. The higher the\ntolerance, the more likely we are to early stop: higher tolerance\nmeans that it will be harder for subsequent iterations to be\nconsidered an improvement upon the reference score." - } + }, + "refined_type": {} }, { "name": "verbose", @@ -67250,7 +71137,8 @@ "docstring": { "type": "int, default=0", "description": "The verbosity level. If not zero, print some information about the\nfitting process." - } + }, + "refined_type": {} }, { "name": "random_state", @@ -67260,13 +71148,14 @@ "docstring": { "type": "int, RandomState instance or None, default=None", "description": "Pseudo-random number generator to control the subsampling in the\nbinning process, and the train/validation data split if early stopping\nis enabled.\nPass an int for reproducible output across multiple function calls.\nSee :term:`Glossary `." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, loss='auto', *, learning_rate=0.1, max_iter=100, max_leaf_nodes=31, max_depth=None, min_samples_leaf=20, l2_regularization=0.0, max_bins=255, categorical_features=None, monotonic_cst=None, warm_start=False, early_stopping='auto', scoring='loss', validation_fraction=0.1, n_iter_no_change=10, tol=1e-07, verbose=0, random_state=None):\n super(HistGradientBoostingClassifier, self).__init__(loss=loss, learning_rate=learning_rate, max_iter=max_iter, max_leaf_nodes=max_leaf_nodes, max_depth=max_depth, min_samples_leaf=min_samples_leaf, l2_regularization=l2_regularization, max_bins=max_bins, categorical_features=categorical_features, monotonic_cst=monotonic_cst, warm_start=warm_start, early_stopping=early_stopping, scoring=scoring, validation_fraction=validation_fraction, n_iter_no_change=n_iter_no_change, tol=tol, verbose=verbose, random_state=random_state)" }, { @@ -67284,7 +71173,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -67294,13 +71184,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _encode_y(self, y):\n check_classification_targets(y)\n label_encoder = LabelEncoder()\n encoded_y = label_encoder.fit_transform(y)\n self.classes_ = label_encoder.classes_\n n_classes = self.classes_.shape[0]\n self.n_trees_per_iteration_ = 1 if n_classes <= 2 else n_classes\n encoded_y = encoded_y.astype(Y_DTYPE, copy=False)\n return encoded_y" }, { @@ -67318,7 +71209,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -67328,7 +71220,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_threads", @@ -67338,13 +71231,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _get_loss(self, sample_weight, n_threads):\n if self.loss == 'categorical_crossentropy' and self.n_trees_per_iteration_ == 1:\n raise ValueError(\"'categorical_crossentropy' is not suitable for a binary classification problem. 
Please use 'auto' or 'binary_crossentropy' instead.\")\n if self.loss == 'auto':\n if self.n_trees_per_iteration_ == 1:\n return _LOSSES['binary_crossentropy'](sample_weight=sample_weight, n_threads=n_threads)\n else:\n return _LOSSES['categorical_crossentropy'](sample_weight=sample_weight, n_threads=n_threads)\n return _LOSSES[self.loss](sample_weight=sample_weight, n_threads=n_threads)" }, { @@ -67362,7 +71256,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -67372,13 +71267,14 @@ "docstring": { "type": "array-like, shape (n_samples, n_features)", "description": "The input samples." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Compute the decision function of ``X``.", - "docstring": "Compute the decision function of ``X``.\n\nParameters\n----------\nX : array-like, shape (n_samples, n_features)\n The input samples.\n\nReturns\n-------\ndecision : ndarray, shape (n_samples,) or (n_samples, n_trees_per_iteration)\n The raw predicted values (i.e. the sum of the trees leaves) for\n each sample. n_trees_per_iteration is equal to the number of\n classes in multiclass classification.", + "docstring": "Compute the decision function of ``X``.\n\n Parameters\n ----------\n X : array-like, shape (n_samples, n_features)\n The input samples.\n\n Returns\n -------\n decision : ndarray, shape (n_samples,) or (n_samples, n_trees_per_iteration)\n The raw predicted values (i.e. the sum of the trees leaves) for\n each sample. n_trees_per_iteration is equal to the number of\n classes in multiclass classification.\n ", "source_code": "\ndef decision_function(self, X):\n \"\"\"Compute the decision function of ``X``.\n\n Parameters\n ----------\n X : array-like, shape (n_samples, n_features)\n The input samples.\n\n Returns\n -------\n decision : ndarray, shape (n_samples,) or (n_samples, n_trees_per_iteration)\n The raw predicted values (i.e. the sum of the trees leaves) for\n each sample. n_trees_per_iteration is equal to the number of\n classes in multiclass classification.\n \"\"\"\n decision = self._raw_predict(X)\n if decision.shape[0] == 1:\n decision = decision.ravel()\n return decision.T" }, { @@ -67396,7 +71292,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -67406,13 +71303,14 @@ "docstring": { "type": "array-like, shape (n_samples, n_features)", "description": "The input samples." 
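Per the docstring above, `decision_function` returns the raw sum of the tree leaves, ravelled to shape `(n_samples,)` for binary problems. A small illustrative check (with the default binary cross-entropy loss, class-1 probabilities are the logistic sigmoid of these raw scores; the toy data is hypothetical):

```python
# Illustrative: relation between raw scores and probabilities (binary case).
import numpy as np
from scipy.special import expit  # logistic sigmoid
from sklearn.datasets import make_classification
from sklearn.ensemble import HistGradientBoostingClassifier

X, y = make_classification(n_samples=2_000, random_state=0)
clf = HistGradientBoostingClassifier(max_iter=50, random_state=0).fit(X, y)

raw = clf.decision_function(X[:5])   # shape (5,): binary output is ravelled
proba = clf.predict_proba(X[:5])     # shape (5, 2)
# With the binary cross-entropy loss, P(class 1) is the sigmoid of the raw score.
print(np.allclose(proba[:, 1], expit(raw)))
```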
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Predict classes for X.", - "docstring": "Predict classes for X.\n\nParameters\n----------\nX : array-like, shape (n_samples, n_features)\n The input samples.\n\nReturns\n-------\ny : ndarray, shape (n_samples,)\n The predicted classes.", + "docstring": "Predict classes for X.\n\n Parameters\n ----------\n X : array-like, shape (n_samples, n_features)\n The input samples.\n\n Returns\n -------\n y : ndarray, shape (n_samples,)\n The predicted classes.\n ", "source_code": "\ndef predict(self, X):\n \"\"\"Predict classes for X.\n\n Parameters\n ----------\n X : array-like, shape (n_samples, n_features)\n The input samples.\n\n Returns\n -------\n y : ndarray, shape (n_samples,)\n The predicted classes.\n \"\"\"\n encoded_classes = np.argmax(self.predict_proba(X), axis=1)\n return self.classes_[encoded_classes]" }, { @@ -67430,7 +71328,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -67440,13 +71339,14 @@ "docstring": { "type": "array-like, shape (n_samples, n_features)", "description": "The input samples." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Predict class probabilities for X.", - "docstring": "Predict class probabilities for X.\n\nParameters\n----------\nX : array-like, shape (n_samples, n_features)\n The input samples.\n\nReturns\n-------\np : ndarray, shape (n_samples, n_classes)\n The class probabilities of the input samples.", + "docstring": "Predict class probabilities for X.\n\n Parameters\n ----------\n X : array-like, shape (n_samples, n_features)\n The input samples.\n\n Returns\n -------\n p : ndarray, shape (n_samples, n_classes)\n The class probabilities of the input samples.\n ", "source_code": "\ndef predict_proba(self, X):\n \"\"\"Predict class probabilities for X.\n\n Parameters\n ----------\n X : array-like, shape (n_samples, n_features)\n The input samples.\n\n Returns\n -------\n p : ndarray, shape (n_samples, n_classes)\n The class probabilities of the input samples.\n \"\"\"\n raw_predictions = self._raw_predict(X)\n return self._loss.predict_proba(raw_predictions)" }, { @@ -67464,7 +71364,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -67474,13 +71375,14 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "The input samples." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Compute decision function of ``X`` for each iteration.\n\nThis method allows monitoring (i.e. determine error on testing set) after each stage.", - "docstring": "Compute decision function of ``X`` for each iteration.\n\nThis method allows monitoring (i.e. determine error on testing set)\nafter each stage.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n The input samples.\n\nYields\n-------\ndecision : generator of ndarray of shape (n_samples,) or (n_samples, n_trees_per_iteration)\n The decision function of the input samples, which corresponds to\n the raw values predicted from the trees of the ensemble . The\n classes corresponds to that in the attribute :term:`classes_`.", + "description": "Compute decision function of ``X`` for each iteration.\n\nThis method allows monitoring (i.e. determine error on testing set)\nafter each stage.", + "docstring": "Compute decision function of ``X`` for each iteration.\n\n This method allows monitoring (i.e. 
determine error on testing set)\n after each stage.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The input samples.\n\n Yields\n -------\n decision : generator of ndarray of shape (n_samples,) or (n_samples, n_trees_per_iteration)\n The decision function of the input samples, which corresponds to\n the raw values predicted from the trees of the ensemble . The\n classes corresponds to that in the attribute :term:`classes_`.\n ", "source_code": "\ndef staged_decision_function(self, X):\n \"\"\"Compute decision function of ``X`` for each iteration.\n\n This method allows monitoring (i.e. determine error on testing set)\n after each stage.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The input samples.\n\n Yields\n -------\n decision : generator of ndarray of shape (n_samples,) or (n_samples, n_trees_per_iteration)\n The decision function of the input samples, which corresponds to\n the raw values predicted from the trees of the ensemble . The\n classes corresponds to that in the attribute :term:`classes_`.\n \"\"\"\n for staged_decision in self._staged_raw_predict(X):\n if staged_decision.shape[0] == 1:\n staged_decision = staged_decision.ravel()\n yield staged_decision.T" }, { @@ -67498,7 +71400,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -67508,13 +71411,14 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "The input samples." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Predict classes at each iteration.\n\nThis method allows monitoring (i.e. determine error on testing set) after each stage. .. versionadded:: 0.24", - "docstring": "Predict classes at each iteration.\n\nThis method allows monitoring (i.e. determine error on testing set)\nafter each stage.\n\n.. versionadded:: 0.24\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n The input samples.\n\nYields\n-------\ny : generator of ndarray of shape (n_samples,)\n The predicted classes of the input samples, for each iteration.", + "description": "Predict classes at each iteration.\n\nThis method allows monitoring (i.e. determine error on testing set)\nafter each stage.\n\n.. versionadded:: 0.24", + "docstring": "Predict classes at each iteration.\n\n This method allows monitoring (i.e. determine error on testing set)\n after each stage.\n\n .. versionadded:: 0.24\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The input samples.\n\n Yields\n -------\n y : generator of ndarray of shape (n_samples,)\n The predicted classes of the input samples, for each iteration.\n ", "source_code": "\ndef staged_predict(self, X):\n \"\"\"Predict classes at each iteration.\n\n This method allows monitoring (i.e. determine error on testing set)\n after each stage.\n\n .. 
versionadded:: 0.24\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The input samples.\n\n Yields\n -------\n y : generator of ndarray of shape (n_samples,)\n The predicted classes of the input samples, for each iteration.\n \"\"\"\n for proba in self.staged_predict_proba(X):\n encoded_classes = np.argmax(proba, axis=1)\n yield self.classes_.take(encoded_classes, axis=0)" }, { @@ -67532,7 +71436,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -67542,13 +71447,14 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "The input samples." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Predict class probabilities at each iteration.\n\nThis method allows monitoring (i.e. determine error on testing set) after each stage.", - "docstring": "Predict class probabilities at each iteration.\n\nThis method allows monitoring (i.e. determine error on testing set)\nafter each stage.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n The input samples.\n\nYields\n-------\ny : generator of ndarray of shape (n_samples,)\n The predicted class probabilities of the input samples,\n for each iteration.", + "description": "Predict class probabilities at each iteration.\n\nThis method allows monitoring (i.e. determine error on testing set)\nafter each stage.", + "docstring": "Predict class probabilities at each iteration.\n\n This method allows monitoring (i.e. determine error on testing set)\n after each stage.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The input samples.\n\n Yields\n -------\n y : generator of ndarray of shape (n_samples,)\n The predicted class probabilities of the input samples,\n for each iteration.\n ", "source_code": "\ndef staged_predict_proba(self, X):\n \"\"\"Predict class probabilities at each iteration.\n\n This method allows monitoring (i.e. determine error on testing set)\n after each stage.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The input samples.\n\n Yields\n -------\n y : generator of ndarray of shape (n_samples,)\n The predicted class probabilities of the input samples,\n for each iteration.\n \"\"\"\n for raw_predictions in self._staged_raw_predict(X):\n yield self._loss.predict_proba(raw_predictions)" }, { @@ -67566,7 +71472,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "loss", @@ -67576,6 +71483,10 @@ "docstring": { "type": "{'squared_error', 'absolute_error', 'poisson'}, default='squared_error'", "description": "The loss function to use in the boosting process. Note that the\n\"squared error\" and \"poisson\" losses actually implement\n\"half least squares loss\" and \"half poisson deviance\" to simplify the\ncomputation of the gradient. Furthermore, \"poisson\" loss internally\nuses a log-link and requires ``y >= 0``.\n\n.. versionchanged:: 0.23\n Added option 'poisson'.\n\n.. deprecated:: 1.0\n The loss 'least_squares' was deprecated in v1.0 and will be removed\n in version 1.2. Use `loss='squared_error'` which is equivalent.\n\n.. deprecated:: 1.0\n The loss 'least_absolute_deviation' was deprecated in v1.0 and will\n be removed in version 1.2. Use `loss='absolute_error'` which is\n equivalent." 
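The staged methods above yield one prediction array per boosting iteration, which makes per-stage monitoring on a held-out set straightforward. An illustrative sketch (the toy data and the train/test split are hypothetical):

```python
# Illustrative: per-iteration monitoring on a test set via staged_predict_proba.
from sklearn.datasets import make_classification
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=5_000, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

clf = HistGradientBoostingClassifier(max_iter=100, random_state=0).fit(X_train, y_train)

# One probability array is yielded per fitted boosting iteration.
stage_losses = [log_loss(y_test, proba) for proba in clf.staged_predict_proba(X_test)]
print(len(stage_losses), round(stage_losses[-1], 4))
```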
+ }, + "refined_type": { + "kind": "EnumType", + "values": ["squared_error", "poisson", "absolute_error"] } }, { @@ -67586,7 +71497,8 @@ "docstring": { "type": "float, default=0.1", "description": "The learning rate, also known as *shrinkage*. This is used as a\nmultiplicative factor for the leaves values. Use ``1`` for no\nshrinkage." - } + }, + "refined_type": {} }, { "name": "max_iter", @@ -67596,7 +71508,8 @@ "docstring": { "type": "int, default=100", "description": "The maximum number of iterations of the boosting process, i.e. the\nmaximum number of trees." - } + }, + "refined_type": {} }, { "name": "max_leaf_nodes", @@ -67606,7 +71519,8 @@ "docstring": { "type": "int or None, default=31", "description": "The maximum number of leaves for each tree. Must be strictly greater\nthan 1. If None, there is no maximum limit." - } + }, + "refined_type": {} }, { "name": "max_depth", @@ -67616,7 +71530,8 @@ "docstring": { "type": "int or None, default=None", "description": "The maximum depth of each tree. The depth of a tree is the number of\nedges to go from the root to the deepest leaf.\nDepth isn't constrained by default." - } + }, + "refined_type": {} }, { "name": "min_samples_leaf", @@ -67626,7 +71541,8 @@ "docstring": { "type": "int, default=20", "description": "The minimum number of samples per leaf. For small datasets with less\nthan a few hundred samples, it is recommended to lower this value\nsince only very shallow trees would be built." - } + }, + "refined_type": {} }, { "name": "l2_regularization", @@ -67636,7 +71552,8 @@ "docstring": { "type": "float, default=0", "description": "The L2 regularization parameter. Use ``0`` for no regularization\n(default)." - } + }, + "refined_type": {} }, { "name": "max_bins", @@ -67646,7 +71563,8 @@ "docstring": { "type": "int, default=255", "description": "The maximum number of bins to use for non-missing values. Before\ntraining, each feature of the input array `X` is binned into\ninteger-valued bins, which allows for a much faster training stage.\nFeatures with a small number of unique values may use less than\n``max_bins`` bins. In addition to the ``max_bins`` bins, one more bin\nis always reserved for missing values. Must be no larger than 255." - } + }, + "refined_type": {} }, { "name": "categorical_features", @@ -67656,6 +71574,10 @@ "docstring": { "type": "array-like of {bool, int} of shape (n_features) or shape (n_categorical_features,), default=None", "description": "Indicates the categorical features.\n\n- None : no feature will be considered categorical.\n- boolean array-like : boolean mask indicating categorical features.\n- integer array-like : integer indices indicating categorical\n features.\n\nFor each categorical feature, there must be at most `max_bins` unique\ncategories, and each categorical value must be in [0, max_bins -1].\n\nRead more in the :ref:`User Guide `.\n\n.. versionadded:: 0.24" + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -67666,7 +71588,8 @@ "docstring": { "type": "array-like of int of shape (n_features), default=None", "description": "Indicates the monotonic constraint to enforce on each feature. -1, 1\nand 0 respectively correspond to a negative constraint, positive\nconstraint and no constraint. Read more in the :ref:`User Guide\n`.\n\n.. 
versionadded:: 0.23" - } + }, + "refined_type": {} }, { "name": "warm_start", @@ -67676,7 +71599,8 @@ "docstring": { "type": "bool, default=False", "description": "When set to ``True``, reuse the solution of the previous call to fit\nand add more estimators to the ensemble. For results to be valid, the\nestimator should be re-trained on the same data only.\nSee :term:`the Glossary `." - } + }, + "refined_type": {} }, { "name": "early_stopping", @@ -67686,7 +71610,8 @@ "docstring": { "type": "'auto' or bool, default='auto'", "description": "If 'auto', early stopping is enabled if the sample size is larger than\n10000. If True, early stopping is enabled, otherwise early stopping is\ndisabled.\n\n.. versionadded:: 0.23" - } + }, + "refined_type": {} }, { "name": "scoring", @@ -67696,7 +71621,8 @@ "docstring": { "type": "str or callable or None, default='loss'", "description": "Scoring parameter to use for early stopping. It can be a single\nstring (see :ref:`scoring_parameter`) or a callable (see\n:ref:`scoring`). If None, the estimator's default scorer is used. If\n``scoring='loss'``, early stopping is checked w.r.t the loss value.\nOnly used if early stopping is performed." - } + }, + "refined_type": {} }, { "name": "validation_fraction", @@ -67706,7 +71632,8 @@ "docstring": { "type": "int or float or None, default=0.1", "description": "Proportion (or absolute size) of training data to set aside as\nvalidation data for early stopping. If None, early stopping is done on\nthe training data. Only used if early stopping is performed." - } + }, + "refined_type": {} }, { "name": "n_iter_no_change", @@ -67716,7 +71643,8 @@ "docstring": { "type": "int, default=10", "description": "Used to determine when to \"early stop\". The fitting process is\nstopped when none of the last ``n_iter_no_change`` scores are better\nthan the ``n_iter_no_change - 1`` -th-to-last one, up to some\ntolerance. Only used if early stopping is performed." - } + }, + "refined_type": {} }, { "name": "tol", @@ -67726,7 +71654,8 @@ "docstring": { "type": "float, default=1e-7", "description": "The absolute tolerance to use when comparing scores during early\nstopping. The higher the tolerance, the more likely we are to early\nstop: higher tolerance means that it will be harder for subsequent\niterations to be considered an improvement upon the reference score." - } + }, + "refined_type": {} }, { "name": "verbose", @@ -67736,7 +71665,8 @@ "docstring": { "type": "int, default=0", "description": "The verbosity level. If not zero, print some information about the\nfitting process." - } + }, + "refined_type": {} }, { "name": "random_state", @@ -67746,13 +71676,14 @@ "docstring": { "type": "int, RandomState instance or None, default=None", "description": "Pseudo-random number generator to control the subsampling in the\nbinning process, and the train/validation data split if early stopping\nis enabled.\nPass an int for reproducible output across multiple function calls.\nSee :term:`Glossary `." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, loss='squared_error', *, learning_rate=0.1, max_iter=100, max_leaf_nodes=31, max_depth=None, min_samples_leaf=20, l2_regularization=0.0, max_bins=255, categorical_features=None, monotonic_cst=None, warm_start=False, early_stopping='auto', scoring='loss', validation_fraction=0.1, n_iter_no_change=10, tol=1e-07, verbose=0, random_state=None):\n super(HistGradientBoostingRegressor, self).__init__(loss=loss, learning_rate=learning_rate, max_iter=max_iter, max_leaf_nodes=max_leaf_nodes, max_depth=max_depth, min_samples_leaf=min_samples_leaf, l2_regularization=l2_regularization, max_bins=max_bins, monotonic_cst=monotonic_cst, categorical_features=categorical_features, early_stopping=early_stopping, warm_start=warm_start, scoring=scoring, validation_fraction=validation_fraction, n_iter_no_change=n_iter_no_change, tol=tol, verbose=verbose, random_state=random_state)" }, { @@ -67770,7 +71701,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -67780,13 +71712,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _encode_y(self, y):\n self.n_trees_per_iteration_ = 1\n y = y.astype(Y_DTYPE, copy=False)\n if self.loss == 'poisson':\n if not (np.all(y >= 0) and np.sum(y) > 0):\n raise ValueError(\"loss='poisson' requires non-negative y and sum(y) > 0.\")\n return y" }, { @@ -67804,7 +71737,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -67814,7 +71748,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_threads", @@ -67824,13 +71759,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _get_loss(self, sample_weight, n_threads):\n if self.loss == 'least_squares':\n warnings.warn(\"The loss 'least_squares' was deprecated in v1.0 and will be removed in version 1.2. Use 'squared_error' which is equivalent.\", FutureWarning)\n return _LOSSES['squared_error'](sample_weight=sample_weight, n_threads=n_threads)\n elif self.loss == 'least_absolute_deviation':\n warnings.warn(\"The loss 'least_absolute_deviation' was deprecated in v1.0 and will be removed in version 1.2. Use 'absolute_error' which is equivalent.\", FutureWarning)\n return _LOSSES['absolute_error'](sample_weight=sample_weight, n_threads=n_threads)\n return _LOSSES[self.loss](sample_weight=sample_weight, n_threads=n_threads)" }, { @@ -67848,7 +71784,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -67858,13 +71795,14 @@ "docstring": { "type": "array-like, shape (n_samples, n_features)", "description": "The input samples." 
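The regressor parameters documented above compose the same way. A brief illustrative sketch using the 'poisson' loss and a monotonic constraint (the data-generating process is hypothetical; per `_encode_y` above, the Poisson loss requires `y >= 0` and `sum(y) > 0`):

```python
# Illustrative: Poisson regression with an increasing constraint on feature 0.
import numpy as np
from sklearn.ensemble import HistGradientBoostingRegressor

rng = np.random.default_rng(0)
X = rng.uniform(size=(5_000, 2))
# Hypothetical counts whose mean increases with the first feature.
y = rng.poisson(lam=np.exp(1.5 * X[:, 0]))

reg = HistGradientBoostingRegressor(
    loss="poisson",          # requires non-negative y with a positive sum
    monotonic_cst=[1, 0],    # +1: increasing constraint, 0: unconstrained
    max_iter=150,
    random_state=0,
)
reg.fit(X, y)
print(reg.predict(X[:3]))    # predictions on the original (non-negative) scale
```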
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Predict values for X.", - "docstring": "Predict values for X.\n\nParameters\n----------\nX : array-like, shape (n_samples, n_features)\n The input samples.\n\nReturns\n-------\ny : ndarray, shape (n_samples,)\n The predicted values.", + "docstring": "Predict values for X.\n\n Parameters\n ----------\n X : array-like, shape (n_samples, n_features)\n The input samples.\n\n Returns\n -------\n y : ndarray, shape (n_samples,)\n The predicted values.\n ", "source_code": "\ndef predict(self, X):\n \"\"\"Predict values for X.\n\n Parameters\n ----------\n X : array-like, shape (n_samples, n_features)\n The input samples.\n\n Returns\n -------\n y : ndarray, shape (n_samples,)\n The predicted values.\n \"\"\"\n check_is_fitted(self)\n return self._loss.inverse_link_function(self._raw_predict(X).ravel())" }, { @@ -67882,7 +71820,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -67892,13 +71831,14 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "The input samples." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Predict regression target for each iteration.\n\nThis method allows monitoring (i.e. determine error on testing set) after each stage. .. versionadded:: 0.24", - "docstring": "Predict regression target for each iteration.\n\nThis method allows monitoring (i.e. determine error on testing set)\nafter each stage.\n\n.. versionadded:: 0.24\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n The input samples.\n\nYields\n-------\ny : generator of ndarray of shape (n_samples,)\n The predicted values of the input samples, for each iteration.", + "description": "Predict regression target for each iteration.\n\nThis method allows monitoring (i.e. determine error on testing set)\nafter each stage.\n\n.. versionadded:: 0.24", + "docstring": "Predict regression target for each iteration.\n\n This method allows monitoring (i.e. determine error on testing set)\n after each stage.\n\n .. versionadded:: 0.24\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The input samples.\n\n Yields\n -------\n y : generator of ndarray of shape (n_samples,)\n The predicted values of the input samples, for each iteration.\n ", "source_code": "\ndef staged_predict(self, X):\n \"\"\"Predict regression target for each iteration.\n\n This method allows monitoring (i.e. determine error on testing set)\n after each stage.\n\n .. versionadded:: 0.24\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The input samples.\n\n Yields\n -------\n y : generator of ndarray of shape (n_samples,)\n The predicted values of the input samples, for each iteration.\n \"\"\"\n for raw_predictions in self._staged_raw_predict(X):\n yield self._loss.inverse_link_function(raw_predictions.ravel())" }, { @@ -67916,7 +71856,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X_binned", @@ -67926,7 +71867,8 @@ "docstring": { "type": "ndarray of shape (n_samples, n_features), dtype=np.uint8", "description": "The binned input samples. Must be Fortran-aligned." - } + }, + "refined_type": {} }, { "name": "gradients", @@ -67936,7 +71878,8 @@ "docstring": { "type": "ndarray of shape (n_samples,)", "description": "The gradients of each training sample. 
Those are the gradients of the\nloss w.r.t the predictions, evaluated at iteration ``i - 1``." - } + }, + "refined_type": {} }, { "name": "hessians", @@ -67946,7 +71889,8 @@ "docstring": { "type": "ndarray of shape (n_samples,)", "description": "The hessians of each training sample. Those are the hessians of the\nloss w.r.t the predictions, evaluated at iteration ``i - 1``." - } + }, + "refined_type": {} }, { "name": "max_leaf_nodes", @@ -67956,7 +71900,8 @@ "docstring": { "type": "int, default=None", "description": "The maximum number of leaves for each tree. If None, there is no\nmaximum limit." - } + }, + "refined_type": {} }, { "name": "max_depth", @@ -67966,7 +71911,8 @@ "docstring": { "type": "int, default=None", "description": "The maximum depth of each tree. The depth of a tree is the number of\nedges to go from the root to the deepest leaf.\nDepth isn't constrained by default." - } + }, + "refined_type": {} }, { "name": "min_samples_leaf", @@ -67976,7 +71922,8 @@ "docstring": { "type": "int, default=20", "description": "The minimum number of samples per leaf." - } + }, + "refined_type": {} }, { "name": "min_gain_to_split", @@ -67986,7 +71933,8 @@ "docstring": { "type": "float, default=0.", "description": "The minimum gain needed to split a node. Splits with lower gain will\nbe ignored." - } + }, + "refined_type": {} }, { "name": "n_bins", @@ -67996,7 +71944,8 @@ "docstring": { "type": "int, default=256", "description": "The total number of bins, including the bin for missing values. Used\nto define the shape of the histograms." - } + }, + "refined_type": {} }, { "name": "n_bins_non_missing", @@ -68006,7 +71955,8 @@ "docstring": { "type": "ndarray, dtype=np.uint32, default=None", "description": "For each feature, gives the number of bins actually used for\nnon-missing values. For features with a lot of unique values, this\nis equal to ``n_bins - 1``. If it's an int, all features are\nconsidered to have the same number of bins. If None, all features\nare considered to have ``n_bins - 1`` bins." - } + }, + "refined_type": {} }, { "name": "has_missing_values", @@ -68016,7 +71966,8 @@ "docstring": { "type": "bool or ndarray, dtype=bool, default=False", "description": "Whether each feature contains missing values (in the training data).\nIf it's a bool, the same value is used for all features." - } + }, + "refined_type": {} }, { "name": "is_categorical", @@ -68026,7 +71977,8 @@ "docstring": { "type": "ndarray of bool of shape (n_features,), default=None", "description": "Indicates categorical features." - } + }, + "refined_type": {} }, { "name": "monotonic_cst", @@ -68036,7 +71988,8 @@ "docstring": { "type": "array-like of shape (n_features,), dtype=int, default=None", "description": "Indicates the monotonic constraint to enforce on each feature. -1, 1\nand 0 respectively correspond to a positive constraint, negative\nconstraint and no constraint. Read more in the :ref:`User Guide\n`." - } + }, + "refined_type": {} }, { "name": "l2_regularization", @@ -68046,7 +71999,8 @@ "docstring": { "type": "float, default=0.", "description": "The L2 regularization parameter." - } + }, + "refined_type": {} }, { "name": "min_hessian_to_split", @@ -68056,7 +72010,8 @@ "docstring": { "type": "float, default=1e-3", "description": "The minimum sum of hessians needed in each node. Splits that result in\nat least one child having a sum of hessians less than\n``min_hessian_to_split`` are discarded." 
- } + }, + "refined_type": {} }, { "name": "shrinkage", @@ -68066,7 +72021,8 @@ "docstring": { "type": "float, default=1.", "description": "The shrinkage parameter to apply to the leaves values, also known as\nlearning rate." - } + }, + "refined_type": {} }, { "name": "n_threads", @@ -68076,13 +72032,14 @@ "docstring": { "type": "int, default=None", "description": "Number of OpenMP threads to use. `_openmp_effective_n_threads` is called\nto determine the effective number of threads use, which takes cgroups CPU\nquotes into account. See the docstring of `_openmp_effective_n_threads`\nfor details." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, X_binned, gradients, hessians, max_leaf_nodes=None, max_depth=None, min_samples_leaf=20, min_gain_to_split=0.0, n_bins=256, n_bins_non_missing=None, has_missing_values=False, is_categorical=None, monotonic_cst=None, l2_regularization=0.0, min_hessian_to_split=0.001, shrinkage=1.0, n_threads=None):\n self._validate_parameters(X_binned, max_leaf_nodes, max_depth, min_samples_leaf, min_gain_to_split, l2_regularization, min_hessian_to_split)\n n_threads = _openmp_effective_n_threads(n_threads)\n if n_bins_non_missing is None:\n n_bins_non_missing = n_bins - 1\n if isinstance(n_bins_non_missing, numbers.Integral):\n n_bins_non_missing = np.array([n_bins_non_missing] * X_binned.shape[1], dtype=np.uint32)\n else:\n n_bins_non_missing = np.asarray(n_bins_non_missing, dtype=np.uint32)\n if isinstance(has_missing_values, bool):\n has_missing_values = [has_missing_values] * X_binned.shape[1]\n has_missing_values = np.asarray(has_missing_values, dtype=np.uint8)\n if monotonic_cst is None:\n self.with_monotonic_cst = False\n monotonic_cst = np.full(shape=X_binned.shape[1], fill_value=MonotonicConstraint.NO_CST, dtype=np.int8)\n else:\n self.with_monotonic_cst = True\n monotonic_cst = np.asarray(monotonic_cst, dtype=np.int8)\n if monotonic_cst.shape[0] != X_binned.shape[1]:\n raise ValueError('monotonic_cst has shape {} but the input data X has {} features.'.format(monotonic_cst.shape[0], X_binned.shape[1]))\n if np.any(monotonic_cst < -1) or np.any(monotonic_cst > 1):\n raise ValueError('monotonic_cst must be None or an array-like of -1, 0 or 1.')\n if is_categorical is None:\n is_categorical = np.zeros(shape=X_binned.shape[1], dtype=np.uint8)\n else:\n is_categorical = np.asarray(is_categorical, dtype=np.uint8)\n if np.any(np.logical_and(is_categorical == 1, monotonic_cst != MonotonicConstraint.NO_CST)):\n raise ValueError('Categorical features cannot have monotonic constraints.')\n hessians_are_constant = hessians.shape[0] == 1\n self.histogram_builder = HistogramBuilder(X_binned, n_bins, gradients, hessians, hessians_are_constant, n_threads)\n missing_values_bin_idx = n_bins - 1\n self.splitter = Splitter(X_binned, n_bins_non_missing, missing_values_bin_idx, has_missing_values, is_categorical, monotonic_cst, l2_regularization, min_hessian_to_split, min_samples_leaf, min_gain_to_split, hessians_are_constant, n_threads)\n self.n_bins_non_missing = n_bins_non_missing\n self.missing_values_bin_idx = missing_values_bin_idx\n self.max_leaf_nodes = max_leaf_nodes\n self.has_missing_values = has_missing_values\n self.monotonic_cst = monotonic_cst\n self.is_categorical = is_categorical\n self.l2_regularization = l2_regularization\n self.n_features = X_binned.shape[1]\n self.max_depth = max_depth\n self.min_samples_leaf = min_samples_leaf\n self.X_binned 
= X_binned\n self.min_gain_to_split = min_gain_to_split\n self.shrinkage = shrinkage\n self.n_threads = n_threads\n self.splittable_nodes = []\n self.finalized_leaves = []\n self.total_find_split_time = 0.0\n self.total_compute_hist_time = 0.0\n self.total_apply_split_time = 0.0\n self.n_categorical_splits = 0\n self._intilialize_root(gradients, hessians, hessians_are_constant)\n self.n_nodes = 1" }, { @@ -68100,13 +72057,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Multiply leaves values by shrinkage parameter.\n\nThis must be done at the very end of the growing process. If this were done during the growing process e.g. in finalize_leaf(), then a leaf would be shrunk but its sibling would potentially not be (if it's a non-leaf), which would lead to a wrong computation of the 'middle' value needed to enforce the monotonic constraints.", - "docstring": "Multiply leaves values by shrinkage parameter.\n\nThis must be done at the very end of the growing process. If this were\ndone during the growing process e.g. in finalize_leaf(), then a leaf\nwould be shrunk but its sibling would potentially not be (if it's a\nnon-leaf), which would lead to a wrong computation of the 'middle'\nvalue needed to enforce the monotonic constraints.", + "description": "Multiply leaves values by shrinkage parameter.\n\nThis must be done at the very end of the growing process. If this were\ndone during the growing process e.g. in finalize_leaf(), then a leaf\nwould be shrunk but its sibling would potentially not be (if it's a\nnon-leaf), which would lead to a wrong computation of the 'middle'\nvalue needed to enforce the monotonic constraints.", + "docstring": "Multiply leaves values by shrinkage parameter.\n\n This must be done at the very end of the growing process. If this were\n done during the growing process e.g. in finalize_leaf(), then a leaf\n would be shrunk but its sibling would potentially not be (if it's a\n non-leaf), which would lead to a wrong computation of the 'middle'\n value needed to enforce the monotonic constraints.\n ", "source_code": "\ndef _apply_shrinkage(self):\n \"\"\"Multiply leaves values by shrinkage parameter.\n\n This must be done at the very end of the growing process. If this were\n done during the growing process e.g. in finalize_leaf(), then a leaf\n would be shrunk but its sibling would potentially not be (if it's a\n non-leaf), which would lead to a wrong computation of the 'middle'\n value needed to enforce the monotonic constraints.\n \"\"\"\n for leaf in self.finalized_leaves:\n leaf.value *= self.shrinkage" }, { @@ -68124,7 +72082,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "node", @@ -68134,13 +72093,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Compute the best possible split (SplitInfo) of a given node.\n\nAlso push it in the heap of splittable nodes if gain isn't zero. 
The gain of a node is 0 if either all the leaves are pure (best gain = 0), or if no split would satisfy the constraints, (min_hessians_to_split, min_gain_to_split, min_samples_leaf)", - "docstring": "Compute the best possible split (SplitInfo) of a given node.\n\nAlso push it in the heap of splittable nodes if gain isn't zero.\nThe gain of a node is 0 if either all the leaves are pure\n(best gain = 0), or if no split would satisfy the constraints,\n(min_hessians_to_split, min_gain_to_split, min_samples_leaf)", + "description": "Compute the best possible split (SplitInfo) of a given node.\n\nAlso push it in the heap of splittable nodes if gain isn't zero.\nThe gain of a node is 0 if either all the leaves are pure\n(best gain = 0), or if no split would satisfy the constraints,\n(min_hessians_to_split, min_gain_to_split, min_samples_leaf)", + "docstring": "Compute the best possible split (SplitInfo) of a given node.\n\n Also push it in the heap of splittable nodes if gain isn't zero.\n The gain of a node is 0 if either all the leaves are pure\n (best gain = 0), or if no split would satisfy the constraints,\n (min_hessians_to_split, min_gain_to_split, min_samples_leaf)\n ", "source_code": "\ndef _compute_best_split_and_push(self, node):\n \"\"\"Compute the best possible split (SplitInfo) of a given node.\n\n Also push it in the heap of splittable nodes if gain isn't zero.\n The gain of a node is 0 if either all the leaves are pure\n (best gain = 0), or if no split would satisfy the constraints,\n (min_hessians_to_split, min_gain_to_split, min_samples_leaf)\n \"\"\"\n node.split_info = self.splitter.find_node_split(node.n_samples, node.histograms, node.sum_gradients, node.sum_hessians, node.value, node.children_lower_bound, node.children_upper_bound)\n if node.split_info.gain <= 0:\n self._finalize_leaf(node)\n else:\n heappush(self.splittable_nodes, node)" }, { @@ -68158,7 +72118,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "node", @@ -68168,7 +72129,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -68192,13 +72154,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Transform all splittable nodes into leaves.\n\nUsed when some constraint is met e.g. maximum number of leaves or maximum depth.", - "docstring": "Transform all splittable nodes into leaves.\n\nUsed when some constraint is met e.g. maximum number of leaves or\nmaximum depth.", + "description": "Transform all splittable nodes into leaves.\n\nUsed when some constraint is met e.g. maximum number of leaves or\nmaximum depth.", + "docstring": "Transform all splittable nodes into leaves.\n\n Used when some constraint is met e.g. maximum number of leaves or\n maximum depth.", "source_code": "\ndef _finalize_splittable_nodes(self):\n \"\"\"Transform all splittable nodes into leaves.\n\n Used when some constraint is met e.g. 
maximum number of leaves or\n maximum depth.\"\"\"\n while len(self.splittable_nodes) > 0:\n node = self.splittable_nodes.pop()\n self._finalize_leaf(node)" }, { @@ -68216,7 +72179,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "gradients", @@ -68226,7 +72190,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "hessians", @@ -68236,7 +72201,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "hessians_are_constant", @@ -68246,7 +72212,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -68270,7 +72237,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X_binned", @@ -68280,7 +72248,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "max_leaf_nodes", @@ -68290,7 +72259,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "max_depth", @@ -68300,7 +72270,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "min_samples_leaf", @@ -68310,7 +72281,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "min_gain_to_split", @@ -68320,7 +72292,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "l2_regularization", @@ -68330,7 +72303,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "min_hessian_to_split", @@ -68340,13 +72314,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Validate parameters passed to __init__.\n\nAlso validate parameters passed to splitter.", - "docstring": "Validate parameters passed to __init__.\n\nAlso validate parameters passed to splitter.", + "docstring": "Validate parameters passed to __init__.\n\n Also validate parameters passed to splitter.\n ", "source_code": "\ndef _validate_parameters(self, X_binned, max_leaf_nodes, max_depth, min_samples_leaf, min_gain_to_split, l2_regularization, min_hessian_to_split):\n \"\"\"Validate parameters passed to __init__.\n\n Also validate parameters passed to splitter.\n \"\"\"\n if X_binned.dtype != np.uint8:\n raise NotImplementedError('X_binned must be of type uint8.')\n if not X_binned.flags.f_contiguous:\n raise ValueError('X_binned should be passed as Fortran contiguous array for maximum efficiency.')\n if max_leaf_nodes is not None and max_leaf_nodes <= 1:\n raise ValueError('max_leaf_nodes={} should not be smaller than 2'.format(max_leaf_nodes))\n if max_depth is not None and max_depth < 1:\n raise ValueError('max_depth={} should not be smaller than 1'.format(max_depth))\n if min_samples_leaf < 1:\n raise ValueError('min_samples_leaf={} should not be smaller than 1'.format(min_samples_leaf))\n if min_gain_to_split < 0:\n raise ValueError('min_gain_to_split={} must be positive.'.format(min_gain_to_split))\n if l2_regularization < 0:\n raise ValueError('l2_regularization={} must be positive.'.format(l2_regularization))\n if min_hessian_to_split < 0:\n raise ValueError('min_hessian_to_split={} must be positive.'.format(min_hessian_to_split))" }, { @@ -68364,7 +72339,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -68388,7 +72364,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": 
"binning_thresholds", @@ -68398,13 +72375,14 @@ "docstring": { "type": "array-like of floats", "description": "Corresponds to the bin_thresholds_ attribute of the BinMapper.\nFor each feature, this stores:\n\n- the bin frontiers for continuous features\n- the unique raw category values for categorical features" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Make a TreePredictor object out of the current tree.", - "docstring": "Make a TreePredictor object out of the current tree.\n\nParameters\n----------\nbinning_thresholds : array-like of floats\n Corresponds to the bin_thresholds_ attribute of the BinMapper.\n For each feature, this stores:\n\n - the bin frontiers for continuous features\n - the unique raw category values for categorical features\n\nReturns\n-------\nA TreePredictor object.", + "docstring": "Make a TreePredictor object out of the current tree.\n\n Parameters\n ----------\n binning_thresholds : array-like of floats\n Corresponds to the bin_thresholds_ attribute of the BinMapper.\n For each feature, this stores:\n\n - the bin frontiers for continuous features\n - the unique raw category values for categorical features\n\n Returns\n -------\n A TreePredictor object.\n ", "source_code": "\ndef make_predictor(self, binning_thresholds):\n \"\"\"Make a TreePredictor object out of the current tree.\n\n Parameters\n ----------\n binning_thresholds : array-like of floats\n Corresponds to the bin_thresholds_ attribute of the BinMapper.\n For each feature, this stores:\n\n - the bin frontiers for continuous features\n - the unique raw category values for categorical features\n\n Returns\n -------\n A TreePredictor object.\n \"\"\"\n predictor_nodes = np.zeros(self.n_nodes, dtype=PREDICTOR_RECORD_DTYPE)\n binned_left_cat_bitsets = np.zeros((self.n_categorical_splits, 8), dtype=X_BITSET_INNER_DTYPE)\n raw_left_cat_bitsets = np.zeros((self.n_categorical_splits, 8), dtype=X_BITSET_INNER_DTYPE)\n _fill_predictor_arrays(predictor_nodes, binned_left_cat_bitsets, raw_left_cat_bitsets, self.root, binning_thresholds, self.n_bins_non_missing)\n return TreePredictor(predictor_nodes, binned_left_cat_bitsets, raw_left_cat_bitsets)" }, { @@ -68422,13 +72400,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Split the node with highest potential gain.", - "docstring": "Split the node with highest potential gain.\n\nReturns\n-------\nleft : TreeNode\n The resulting left child.\nright : TreeNode\n The resulting right child.", + "docstring": "Split the node with highest potential gain.\n\n Returns\n -------\n left : TreeNode\n The resulting left child.\n right : TreeNode\n The resulting right child.\n ", "source_code": "\ndef split_next(self):\n \"\"\"Split the node with highest potential gain.\n\n Returns\n -------\n left : TreeNode\n The resulting left child.\n right : TreeNode\n The resulting right child.\n \"\"\"\n node = heappop(self.splittable_nodes)\n tic = time()\n (sample_indices_left, sample_indices_right, right_child_pos) = self.splitter.split_indices(node.split_info, node.sample_indices)\n self.total_apply_split_time += time() - tic\n depth = node.depth + 1\n n_leaf_nodes = len(self.finalized_leaves) + len(self.splittable_nodes)\n n_leaf_nodes += 2\n left_child_node = TreeNode(depth, sample_indices_left, node.split_info.sum_gradient_left, node.split_info.sum_hessian_left, value=node.split_info.value_left)\n right_child_node = TreeNode(depth, sample_indices_right, 
node.split_info.sum_gradient_right, node.split_info.sum_hessian_right, value=node.split_info.value_right)\n node.right_child = right_child_node\n node.left_child = left_child_node\n left_child_node.partition_start = node.partition_start\n left_child_node.partition_stop = node.partition_start + right_child_pos\n right_child_node.partition_start = left_child_node.partition_stop\n right_child_node.partition_stop = node.partition_stop\n if not self.has_missing_values[node.split_info.feature_idx]:\n node.split_info.missing_go_to_left = left_child_node.n_samples > right_child_node.n_samples\n self.n_nodes += 2\n self.n_categorical_splits += node.split_info.is_categorical\n if self.max_leaf_nodes is not None and n_leaf_nodes == self.max_leaf_nodes:\n self._finalize_leaf(left_child_node)\n self._finalize_leaf(right_child_node)\n self._finalize_splittable_nodes()\n return left_child_node, right_child_node\n if self.max_depth is not None and depth == self.max_depth:\n self._finalize_leaf(left_child_node)\n self._finalize_leaf(right_child_node)\n return left_child_node, right_child_node\n if left_child_node.n_samples < self.min_samples_leaf * 2:\n self._finalize_leaf(left_child_node)\n if right_child_node.n_samples < self.min_samples_leaf * 2:\n self._finalize_leaf(right_child_node)\n if self.with_monotonic_cst:\n if self.monotonic_cst[node.split_info.feature_idx] == MonotonicConstraint.NO_CST:\n lower_left = lower_right = node.children_lower_bound\n upper_left = upper_right = node.children_upper_bound\n else:\n mid = (left_child_node.value + right_child_node.value) / 2\n if self.monotonic_cst[node.split_info.feature_idx] == MonotonicConstraint.POS:\n (lower_left, upper_left) = (node.children_lower_bound, mid)\n (lower_right, upper_right) = (mid, node.children_upper_bound)\n else:\n (lower_left, upper_left) = (mid, node.children_upper_bound)\n (lower_right, upper_right) = (node.children_lower_bound, mid)\n left_child_node.set_children_bounds(lower_left, upper_left)\n right_child_node.set_children_bounds(lower_right, upper_right)\n should_split_left = not left_child_node.is_leaf\n should_split_right = not right_child_node.is_leaf\n if should_split_left or should_split_right:\n n_samples_left = left_child_node.sample_indices.shape[0]\n n_samples_right = right_child_node.sample_indices.shape[0]\n if n_samples_left < n_samples_right:\n smallest_child = left_child_node\n largest_child = right_child_node\n else:\n smallest_child = right_child_node\n largest_child = left_child_node\n tic = time()\n smallest_child.histograms = self.histogram_builder.compute_histograms_brute(smallest_child.sample_indices)\n largest_child.histograms = self.histogram_builder.compute_histograms_subtraction(node.histograms, smallest_child.histograms)\n self.total_compute_hist_time += time() - tic\n tic = time()\n if should_split_left:\n self._compute_best_split_and_push(left_child_node)\n if should_split_right:\n self._compute_best_split_and_push(right_child_node)\n self.total_find_split_time += time() - tic\n for child in (left_child_node, right_child_node):\n if child.is_leaf:\n del child.histograms\n del node.histograms\n return left_child_node, right_child_node" }, { @@ -68446,7 +72425,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "depth", @@ -68456,7 +72436,8 @@ "docstring": { "type": "int", "description": "The depth of the node, i.e. its distance from the root." 
- } + }, + "refined_type": {} }, { "name": "sample_indices", @@ -68466,7 +72447,8 @@ "docstring": { "type": "ndarray of shape (n_samples_at_node,), dtype=np.uint", "description": "The indices of the samples at the node." - } + }, + "refined_type": {} }, { "name": "sum_gradients", @@ -68476,7 +72458,8 @@ "docstring": { "type": "float", "description": "The sum of the gradients of the samples at the node." - } + }, + "refined_type": {} }, { "name": "sum_hessians", @@ -68486,7 +72469,8 @@ "docstring": { "type": "float", "description": "The sum of the hessians of the samples at the node." - } + }, + "refined_type": {} }, { "name": "value", @@ -68496,13 +72480,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, depth, sample_indices, sum_gradients, sum_hessians, value=None):\n self.depth = depth\n self.sample_indices = sample_indices\n self.n_samples = sample_indices.shape[0]\n self.sum_gradients = sum_gradients\n self.sum_hessians = sum_hessians\n self.value = value\n self.is_leaf = False\n self.set_children_bounds(float('-inf'), float('+inf'))" }, { @@ -68520,7 +72505,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "other_node", @@ -68530,13 +72516,14 @@ "docstring": { "type": "TreeNode", "description": "The node to compare with." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Comparison for priority queue.\n\nNodes with high gain are higher priority than nodes with low gain. heapq.heappush only need the '<' operator. heapq.heappop take the smallest item first (smaller is higher priority).", - "docstring": "Comparison for priority queue.\n\nNodes with high gain are higher priority than nodes with low gain.\n\nheapq.heappush only need the '<' operator.\nheapq.heappop take the smallest item first (smaller is higher\npriority).\n\nParameters\n----------\nother_node : TreeNode\n The node to compare with.", + "description": "Comparison for priority queue.\n\nNodes with high gain are higher priority than nodes with low gain.\n\nheapq.heappush only need the '<' operator.\nheapq.heappop take the smallest item first (smaller is higher\npriority).", + "docstring": "Comparison for priority queue.\n\n Nodes with high gain are higher priority than nodes with low gain.\n\n heapq.heappush only need the '<' operator.\n heapq.heappop take the smallest item first (smaller is higher\n priority).\n\n Parameters\n ----------\n other_node : TreeNode\n The node to compare with.\n ", "source_code": "\ndef __lt__(self, other_node):\n \"\"\"Comparison for priority queue.\n\n Nodes with high gain are higher priority than nodes with low gain.\n\n heapq.heappush only need the '<' operator.\n heapq.heappop take the smallest item first (smaller is higher\n priority).\n\n Parameters\n ----------\n other_node : TreeNode\n The node to compare with.\n \"\"\"\n return self.split_info.gain > other_node.split_info.gain" }, { @@ -68554,7 +72541,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "lower", @@ -68564,7 +72552,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "upper", @@ -68574,7 +72563,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -68598,7 +72588,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { 
"name": "binned_left_cat_bitsets", @@ -68608,7 +72599,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "raw_left_cat_bitsets", @@ -68618,7 +72610,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "grower_node", @@ -68628,7 +72621,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "binning_thresholds", @@ -68638,7 +72632,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_bins_non_missing", @@ -68648,7 +72643,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "next_free_node_idx", @@ -68658,7 +72654,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "next_free_bitset_idx", @@ -68668,7 +72665,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -68692,7 +72690,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y_true", @@ -68702,7 +72701,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "raw_predictions", @@ -68712,7 +72712,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -68722,7 +72723,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -68746,7 +72748,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "hessians_are_constant", @@ -68756,7 +72759,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_threads", @@ -68766,13 +72770,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, hessians_are_constant, n_threads=None):\n self.hessians_are_constant = hessians_are_constant\n self.n_threads = _openmp_effective_n_threads(n_threads)" }, { @@ -68790,7 +72795,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y_train", @@ -68800,7 +72806,8 @@ "docstring": { "type": "ndarray, shape (n_samples,)", "description": "The target training values." - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -68810,7 +72817,8 @@ "docstring": { "type": "array-like of shape(n_samples,) default=None", "description": "Weights of training data." - } + }, + "refined_type": {} }, { "name": "prediction_dim", @@ -68820,13 +72828,14 @@ "docstring": { "type": "int", "description": "The dimension of one prediction: 1 for binary classification and\nregression, n_classes for multiclass classification." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Return initial predictions (before the first iteration).", - "docstring": "Return initial predictions (before the first iteration).\n\nParameters\n----------\ny_train : ndarray, shape (n_samples,)\n The target training values.\n\nsample_weight : array-like of shape(n_samples,) default=None\n Weights of training data.\n\nprediction_dim : int\n The dimension of one prediction: 1 for binary classification and\n regression, n_classes for multiclass classification.\n\nReturns\n-------\nbaseline_prediction : float or ndarray, shape (1, prediction_dim)\n The baseline prediction.", + "docstring": "Return initial predictions (before the first iteration).\n\n Parameters\n ----------\n y_train : ndarray, shape (n_samples,)\n The target training values.\n\n sample_weight : array-like of shape(n_samples,) default=None\n Weights of training data.\n\n prediction_dim : int\n The dimension of one prediction: 1 for binary classification and\n regression, n_classes for multiclass classification.\n\n Returns\n -------\n baseline_prediction : float or ndarray, shape (1, prediction_dim)\n The baseline prediction.\n ", "source_code": "\n@abstractmethod\ndef get_baseline_prediction(self, y_train, sample_weight, prediction_dim):\n \"\"\"Return initial predictions (before the first iteration).\n\n Parameters\n ----------\n y_train : ndarray, shape (n_samples,)\n The target training values.\n\n sample_weight : array-like of shape(n_samples,) default=None\n Weights of training data.\n\n prediction_dim : int\n The dimension of one prediction: 1 for binary classification and\n regression, n_classes for multiclass classification.\n\n Returns\n -------\n baseline_prediction : float or ndarray, shape (1, prediction_dim)\n The baseline prediction.\n \"\"\"\n " }, { @@ -68844,7 +72853,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_samples", @@ -68854,7 +72864,8 @@ "docstring": { "type": "int", "description": "The number of samples passed to `fit()`." - } + }, + "refined_type": {} }, { "name": "prediction_dim", @@ -68864,7 +72875,8 @@ "docstring": { "type": "int", "description": "The dimension of a raw prediction, i.e. the number of trees\nbuilt at each iteration. Equals 1 for regression and binary\nclassification, or K where K is the number of classes for\nmulticlass classification." - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -68874,13 +72886,14 @@ "docstring": { "type": "array-like of shape(n_samples,) default=None", "description": "Weights of training data." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Return initial gradients and hessians.\n\nUnless hessians are constant, arrays are initialized with undefined values.", - "docstring": "Return initial gradients and hessians.\n\nUnless hessians are constant, arrays are initialized with undefined\nvalues.\n\nParameters\n----------\nn_samples : int\n The number of samples passed to `fit()`.\n\nprediction_dim : int\n The dimension of a raw prediction, i.e. the number of trees\n built at each iteration. Equals 1 for regression and binary\n classification, or K where K is the number of classes for\n multiclass classification.\n\nsample_weight : array-like of shape(n_samples,) default=None\n Weights of training data.\n\nReturns\n-------\ngradients : ndarray, shape (prediction_dim, n_samples)\n The initial gradients. 
The array is not initialized.\nhessians : ndarray, shape (prediction_dim, n_samples)\n If hessians are constant (e.g. for `LeastSquares` loss, the\n array is initialized to ``1``. Otherwise, the array is allocated\n without being initialized.", + "description": "Return initial gradients and hessians.\n\nUnless hessians are constant, arrays are initialized with undefined\nvalues.", + "docstring": "Return initial gradients and hessians.\n\n Unless hessians are constant, arrays are initialized with undefined\n values.\n\n Parameters\n ----------\n n_samples : int\n The number of samples passed to `fit()`.\n\n prediction_dim : int\n The dimension of a raw prediction, i.e. the number of trees\n built at each iteration. Equals 1 for regression and binary\n classification, or K where K is the number of classes for\n multiclass classification.\n\n sample_weight : array-like of shape(n_samples,) default=None\n Weights of training data.\n\n Returns\n -------\n gradients : ndarray, shape (prediction_dim, n_samples)\n The initial gradients. The array is not initialized.\n hessians : ndarray, shape (prediction_dim, n_samples)\n If hessians are constant (e.g. for `LeastSquares` loss, the\n array is initialized to ``1``. Otherwise, the array is allocated\n without being initialized.\n ", "source_code": "\ndef init_gradients_and_hessians(self, n_samples, prediction_dim, sample_weight):\n \"\"\"Return initial gradients and hessians.\n\n Unless hessians are constant, arrays are initialized with undefined\n values.\n\n Parameters\n ----------\n n_samples : int\n The number of samples passed to `fit()`.\n\n prediction_dim : int\n The dimension of a raw prediction, i.e. the number of trees\n built at each iteration. Equals 1 for regression and binary\n classification, or K where K is the number of classes for\n multiclass classification.\n\n sample_weight : array-like of shape(n_samples,) default=None\n Weights of training data.\n\n Returns\n -------\n gradients : ndarray, shape (prediction_dim, n_samples)\n The initial gradients. The array is not initialized.\n hessians : ndarray, shape (prediction_dim, n_samples)\n If hessians are constant (e.g. for `LeastSquares` loss, the\n array is initialized to ``1``. Otherwise, the array is allocated\n without being initialized.\n \"\"\"\n shape = (prediction_dim, n_samples)\n gradients = np.empty(shape=shape, dtype=G_H_DTYPE)\n if self.hessians_are_constant:\n hessians = np.ones(shape=(1, 1), dtype=G_H_DTYPE)\n else:\n hessians = np.empty(shape=shape, dtype=G_H_DTYPE)\n return gradients, hessians" }, { @@ -68898,7 +72911,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y_true", @@ -68908,7 +72922,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "raw_predictions", @@ -68918,7 +72933,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -68942,7 +72958,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "gradients", @@ -68952,7 +72969,8 @@ "docstring": { "type": "ndarray, shape (prediction_dim, n_samples)", "description": "The gradients (treated as OUT array)." - } + }, + "refined_type": {} }, { "name": "hessians", @@ -68962,7 +72980,8 @@ "docstring": { "type": "ndarray, shape (prediction_dim, n_samples) or (1,)", "description": "The hessians (treated as OUT array)." 
- } + }, + "refined_type": {} }, { "name": "y_true", @@ -68972,7 +72991,8 @@ "docstring": { "type": "ndarray, shape (n_samples,)", "description": "The true target values or each training sample." - } + }, + "refined_type": {} }, { "name": "raw_predictions", @@ -68982,7 +73002,8 @@ "docstring": { "type": "ndarray, shape (prediction_dim, n_samples)", "description": "The raw_predictions (i.e. values from the trees) of the tree\nensemble at iteration ``i - 1``." - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -68992,13 +73013,14 @@ "docstring": { "type": "array-like of shape(n_samples,) default=None", "description": "Weights of training data." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Update gradients and hessians arrays, inplace.\n\nThe gradients (resp. hessians) are the first (resp. second) order derivatives of the loss for each sample with respect to the predictions of model, evaluated at iteration ``i - 1``.", - "docstring": "Update gradients and hessians arrays, inplace.\n\nThe gradients (resp. hessians) are the first (resp. second) order\nderivatives of the loss for each sample with respect to the\npredictions of model, evaluated at iteration ``i - 1``.\n\nParameters\n----------\ngradients : ndarray, shape (prediction_dim, n_samples)\n The gradients (treated as OUT array).\n\nhessians : ndarray, shape (prediction_dim, n_samples) or (1,)\n The hessians (treated as OUT array).\n\ny_true : ndarray, shape (n_samples,)\n The true target values or each training sample.\n\nraw_predictions : ndarray, shape (prediction_dim, n_samples)\n The raw_predictions (i.e. values from the trees) of the tree\n ensemble at iteration ``i - 1``.\n\nsample_weight : array-like of shape(n_samples,) default=None\n Weights of training data.", + "description": "Update gradients and hessians arrays, inplace.\n\nThe gradients (resp. hessians) are the first (resp. second) order\nderivatives of the loss for each sample with respect to the\npredictions of model, evaluated at iteration ``i - 1``.", + "docstring": "Update gradients and hessians arrays, inplace.\n\n The gradients (resp. hessians) are the first (resp. second) order\n derivatives of the loss for each sample with respect to the\n predictions of model, evaluated at iteration ``i - 1``.\n\n Parameters\n ----------\n gradients : ndarray, shape (prediction_dim, n_samples)\n The gradients (treated as OUT array).\n\n hessians : ndarray, shape (prediction_dim, n_samples) or (1,)\n The hessians (treated as OUT array).\n\n y_true : ndarray, shape (n_samples,)\n The true target values or each training sample.\n\n raw_predictions : ndarray, shape (prediction_dim, n_samples)\n The raw_predictions (i.e. values from the trees) of the tree\n ensemble at iteration ``i - 1``.\n\n sample_weight : array-like of shape(n_samples,) default=None\n Weights of training data.\n ", "source_code": "\n@abstractmethod\ndef update_gradients_and_hessians(self, gradients, hessians, y_true, raw_predictions, sample_weight):\n \"\"\"Update gradients and hessians arrays, inplace.\n\n The gradients (resp. hessians) are the first (resp. 
second) order\n derivatives of the loss for each sample with respect to the\n predictions of model, evaluated at iteration ``i - 1``.\n\n Parameters\n ----------\n gradients : ndarray, shape (prediction_dim, n_samples)\n The gradients (treated as OUT array).\n\n hessians : ndarray, shape (prediction_dim, n_samples) or (1,)\n The hessians (treated as OUT array).\n\n y_true : ndarray, shape (n_samples,)\n The true target values or each training sample.\n\n raw_predictions : ndarray, shape (prediction_dim, n_samples)\n The raw_predictions (i.e. values from the trees) of the tree\n ensemble at iteration ``i - 1``.\n\n sample_weight : array-like of shape(n_samples,) default=None\n Weights of training data.\n \"\"\"\n " }, { @@ -69016,7 +73038,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -69026,7 +73049,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_threads", @@ -69036,13 +73060,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, sample_weight, n_threads=None):\n super().__init__(hessians_are_constant=False, n_threads=n_threads)" }, { @@ -69060,7 +73085,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y_train", @@ -69070,7 +73096,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -69080,7 +73107,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "prediction_dim", @@ -69090,13 +73118,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef get_baseline_prediction(self, y_train, sample_weight, prediction_dim):\n if prediction_dim > 2:\n raise ValueError(\"loss='binary_crossentropy' is not defined for multiclass classification with n_classes=%d, use loss='categorical_crossentropy' instead\" % prediction_dim)\n proba_positive_class = np.average(y_train, weights=sample_weight)\n eps = np.finfo(y_train.dtype).eps\n proba_positive_class = np.clip(proba_positive_class, eps, 1 - eps)\n return np.log(proba_positive_class / (1 - proba_positive_class))" }, { @@ -69114,7 +73143,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y_true", @@ -69124,7 +73154,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "raw_predictions", @@ -69134,13 +73165,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef pointwise_loss(self, y_true, raw_predictions):\n raw_predictions = raw_predictions.reshape(-1)\n loss = np.logaddexp(0, raw_predictions) - y_true * raw_predictions\n return loss" }, { @@ -69158,7 +73190,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "raw_predictions", @@ -69168,13 +73201,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef predict_proba(self, raw_predictions):\n raw_predictions = raw_predictions.reshape(-1)\n proba 
= np.empty((raw_predictions.shape[0], 2), dtype=Y_DTYPE)\n proba[:, 1] = expit(raw_predictions)\n proba[:, 0] = 1 - proba[:, 1]\n return proba" }, { @@ -69192,7 +73226,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "gradients", @@ -69202,7 +73237,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "hessians", @@ -69212,7 +73248,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y_true", @@ -69222,7 +73259,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "raw_predictions", @@ -69232,7 +73270,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -69242,13 +73281,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef update_gradients_and_hessians(self, gradients, hessians, y_true, raw_predictions, sample_weight):\n raw_predictions = raw_predictions.reshape(-1)\n gradients = gradients.reshape(-1)\n hessians = hessians.reshape(-1)\n _update_gradients_hessians_binary_crossentropy(gradients, hessians, y_true, raw_predictions, sample_weight, self.n_threads)" }, { @@ -69266,7 +73306,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -69276,7 +73317,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_threads", @@ -69286,13 +73328,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, sample_weight, n_threads=None):\n super().__init__(hessians_are_constant=False, n_threads=n_threads)" }, { @@ -69310,7 +73353,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y_train", @@ -69320,7 +73364,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -69330,7 +73375,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "prediction_dim", @@ -69340,13 +73386,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef get_baseline_prediction(self, y_train, sample_weight, prediction_dim):\n init_value = np.zeros(shape=(prediction_dim, 1), dtype=Y_DTYPE)\n eps = np.finfo(y_train.dtype).eps\n for k in range(prediction_dim):\n proba_kth_class = np.average(y_train == k, weights=sample_weight)\n proba_kth_class = np.clip(proba_kth_class, eps, 1 - eps)\n init_value[k, :] += np.log(proba_kth_class)\n return init_value" }, { @@ -69364,7 +73411,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y_true", @@ -69374,7 +73422,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "raw_predictions", @@ -69384,13 +73433,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef pointwise_loss(self, y_true, raw_predictions):\n one_hot_true = np.zeros_like(raw_predictions)\n prediction_dim = 
raw_predictions.shape[0]\n for k in range(prediction_dim):\n one_hot_true[k, :] = y_true == k\n loss = logsumexp(raw_predictions, axis=0) - (one_hot_true * raw_predictions).sum(axis=0)\n return loss" }, { @@ -69408,7 +73458,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "raw_predictions", @@ -69418,13 +73469,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef predict_proba(self, raw_predictions):\n proba = np.exp(raw_predictions - logsumexp(raw_predictions, axis=0)[np.newaxis, :])\n return proba.T" }, { @@ -69442,7 +73494,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "gradients", @@ -69452,7 +73505,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "hessians", @@ -69462,7 +73516,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y_true", @@ -69472,7 +73527,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "raw_predictions", @@ -69482,7 +73538,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -69492,13 +73549,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef update_gradients_and_hessians(self, gradients, hessians, y_true, raw_predictions, sample_weight):\n _update_gradients_hessians_categorical_crossentropy(gradients, hessians, y_true, raw_predictions, sample_weight, self.n_threads)" }, { @@ -69516,7 +73574,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -69526,7 +73585,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_threads", @@ -69536,13 +73596,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, sample_weight, n_threads=None):\n super().__init__(hessians_are_constant=sample_weight is None, n_threads=n_threads)" }, { @@ -69560,7 +73621,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y_train", @@ -69570,7 +73632,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -69580,7 +73643,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "prediction_dim", @@ -69590,13 +73654,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef get_baseline_prediction(self, y_train, sample_weight, prediction_dim):\n if sample_weight is None:\n return np.median(y_train)\n else:\n return _weighted_percentile(y_train, sample_weight, 50)" }, { @@ -69614,13 +73679,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@staticmethod\ndef inverse_link_function(raw_predictions):\n return raw_predictions" }, { @@ -69638,7 +73704,8 @@ "docstring": 
{ "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y_true", @@ -69648,7 +73715,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "raw_predictions", @@ -69658,13 +73726,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef pointwise_loss(self, y_true, raw_predictions):\n raw_predictions = raw_predictions.reshape(-1)\n loss = np.abs(y_true - raw_predictions)\n return loss" }, { @@ -69682,7 +73751,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "gradients", @@ -69692,7 +73762,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "hessians", @@ -69702,7 +73773,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y_true", @@ -69712,7 +73784,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "raw_predictions", @@ -69722,7 +73795,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -69732,13 +73806,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef update_gradients_and_hessians(self, gradients, hessians, y_true, raw_predictions, sample_weight):\n raw_predictions = raw_predictions.reshape(-1)\n gradients = gradients.reshape(-1)\n if sample_weight is None:\n _update_gradients_least_absolute_deviation(gradients, y_true, raw_predictions, self.n_threads)\n else:\n hessians = hessians.reshape(-1)\n _update_gradients_hessians_least_absolute_deviation(gradients, hessians, y_true, raw_predictions, sample_weight, self.n_threads)" }, { @@ -69756,7 +73831,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "grower", @@ -69766,7 +73842,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y_true", @@ -69776,7 +73853,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "raw_predictions", @@ -69786,7 +73864,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -69796,13 +73875,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef update_leaves_values(self, grower, y_true, raw_predictions, sample_weight):\n for leaf in grower.finalized_leaves:\n indices = leaf.sample_indices\n if sample_weight is None:\n median_res = np.median(y_true[indices] - raw_predictions[indices])\n else:\n median_res = _weighted_percentile(y_true[indices] - raw_predictions[indices], sample_weight=sample_weight[indices], percentile=50)\n leaf.value = grower.shrinkage * median_res" }, { @@ -69820,7 +73900,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -69830,7 +73911,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_threads", @@ -69840,13 +73922,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, 
"source_code": "\ndef __init__(self, sample_weight, n_threads=None):\n super().__init__(hessians_are_constant=sample_weight is None, n_threads=n_threads)" }, { @@ -69864,7 +73947,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y_train", @@ -69874,7 +73958,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -69884,7 +73969,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "prediction_dim", @@ -69894,13 +73980,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef get_baseline_prediction(self, y_train, sample_weight, prediction_dim):\n return np.average(y_train, weights=sample_weight)" }, { @@ -69918,13 +74005,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@staticmethod\ndef inverse_link_function(raw_predictions):\n return raw_predictions" }, { @@ -69942,7 +74030,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y_true", @@ -69952,7 +74041,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "raw_predictions", @@ -69962,13 +74052,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef pointwise_loss(self, y_true, raw_predictions):\n raw_predictions = raw_predictions.reshape(-1)\n loss = 0.5 * np.power(y_true - raw_predictions, 2)\n return loss" }, { @@ -69986,7 +74077,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "gradients", @@ -69996,7 +74088,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "hessians", @@ -70006,7 +74099,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y_true", @@ -70016,7 +74110,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "raw_predictions", @@ -70026,7 +74121,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -70036,13 +74132,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef update_gradients_and_hessians(self, gradients, hessians, y_true, raw_predictions, sample_weight):\n raw_predictions = raw_predictions.reshape(-1)\n gradients = gradients.reshape(-1)\n if sample_weight is None:\n _update_gradients_least_squares(gradients, y_true, raw_predictions, self.n_threads)\n else:\n hessians = hessians.reshape(-1)\n _update_gradients_hessians_least_squares(gradients, hessians, y_true, raw_predictions, sample_weight, self.n_threads)" }, { @@ -70060,7 +74157,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -70070,7 +74168,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_threads", @@ -70080,13 +74179,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": 
false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, sample_weight, n_threads=None):\n super().__init__(hessians_are_constant=False, n_threads=n_threads)" }, { @@ -70104,7 +74204,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y_train", @@ -70114,7 +74215,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -70124,7 +74226,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "prediction_dim", @@ -70134,13 +74237,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef get_baseline_prediction(self, y_train, sample_weight, prediction_dim):\n y_pred = np.average(y_train, weights=sample_weight)\n eps = np.finfo(y_train.dtype).eps\n y_pred = np.clip(y_pred, eps, None)\n return np.log(y_pred)" }, { @@ -70158,7 +74262,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y_true", @@ -70168,7 +74273,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "raw_predictions", @@ -70178,13 +74284,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef pointwise_loss(self, y_true, raw_predictions):\n raw_predictions = raw_predictions.reshape(-1)\n loss = xlogy(y_true, y_true) - y_true * (raw_predictions + 1) + np.exp(raw_predictions)\n return loss" }, { @@ -70202,7 +74309,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "gradients", @@ -70212,7 +74320,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "hessians", @@ -70222,7 +74331,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y_true", @@ -70232,7 +74342,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "raw_predictions", @@ -70242,7 +74353,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -70252,13 +74364,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef update_gradients_and_hessians(self, gradients, hessians, y_true, raw_predictions, sample_weight):\n raw_predictions = raw_predictions.reshape(-1)\n gradients = gradients.reshape(-1)\n hessians = hessians.reshape(-1)\n _update_gradients_hessians_poisson(gradients, hessians, y_true, raw_predictions, sample_weight, self.n_threads)" }, { @@ -70276,7 +74389,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "nodes", @@ -70286,7 +74400,8 @@ "docstring": { "type": "ndarray of PREDICTOR_RECORD_DTYPE", "description": "The nodes of the tree." - } + }, + "refined_type": {} }, { "name": "binned_left_cat_bitsets", @@ -70296,7 +74411,8 @@ "docstring": { "type": "ndarray of shape (n_categorical_splits, 8), dtype=uint32", "description": "Array of bitsets for binned categories used in predict_binned when a\nsplit is categorical." 
- } + }, + "refined_type": {} }, { "name": "raw_left_cat_bitsets", @@ -70306,13 +74422,14 @@ "docstring": { "type": "ndarray of shape (n_categorical_splits, 8), dtype=uint32", "description": "Array of bitsets for raw categories used in predict when a split is\ncategorical." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, nodes, binned_left_cat_bitsets, raw_left_cat_bitsets):\n self.nodes = nodes\n self.binned_left_cat_bitsets = binned_left_cat_bitsets\n self.raw_left_cat_bitsets = raw_left_cat_bitsets" }, { @@ -70330,7 +74447,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "grid", @@ -70340,7 +74458,8 @@ "docstring": { "type": "ndarray, shape (n_samples, n_target_features)", "description": "The grid points on which the partial dependence should be\nevaluated." - } + }, + "refined_type": {} }, { "name": "target_features", @@ -70350,7 +74469,8 @@ "docstring": { "type": "ndarray, shape (n_target_features)", "description": "The set of target features for which the partial dependence\nshould be evaluated." - } + }, + "refined_type": {} }, { "name": "out", @@ -70360,13 +74480,14 @@ "docstring": { "type": "ndarray, shape (n_samples)", "description": "The value of the partial dependence function on each grid\npoint." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Fast partial dependence computation.", - "docstring": "Fast partial dependence computation.\n\nParameters\n----------\ngrid : ndarray, shape (n_samples, n_target_features)\n The grid points on which the partial dependence should be\n evaluated.\ntarget_features : ndarray, shape (n_target_features)\n The set of target features for which the partial dependence\n should be evaluated.\nout : ndarray, shape (n_samples)\n The value of the partial dependence function on each grid\n point.", + "docstring": "Fast partial dependence computation.\n\n Parameters\n ----------\n grid : ndarray, shape (n_samples, n_target_features)\n The grid points on which the partial dependence should be\n evaluated.\n target_features : ndarray, shape (n_target_features)\n The set of target features for which the partial dependence\n should be evaluated.\n out : ndarray, shape (n_samples)\n The value of the partial dependence function on each grid\n point.\n ", "source_code": "\ndef compute_partial_dependence(self, grid, target_features, out):\n \"\"\"Fast partial dependence computation.\n\n Parameters\n ----------\n grid : ndarray, shape (n_samples, n_target_features)\n The grid points on which the partial dependence should be\n evaluated.\n target_features : ndarray, shape (n_target_features)\n The set of target features for which the partial dependence\n should be evaluated.\n out : ndarray, shape (n_samples)\n The value of the partial dependence function on each grid\n point.\n \"\"\"\n _compute_partial_dependence(self.nodes, grid, target_features, out)" }, { @@ -70384,7 +74505,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -70408,7 +74530,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -70432,7 +74555,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -70442,7 +74566,8 @@ "docstring": { "type": "ndarray, shape (n_samples, n_features)", "description": "The input samples." 
- } + }, + "refined_type": {} }, { "name": "known_cat_bitsets", @@ -70452,7 +74577,8 @@ "docstring": { "type": "ndarray of shape (n_categorical_features, 8)", "description": "Array of bitsets of known categories, for each categorical feature." - } + }, + "refined_type": {} }, { "name": "f_idx_map", @@ -70462,7 +74588,8 @@ "docstring": { "type": "ndarray of shape (n_features,)", "description": "Map from original feature index to the corresponding index in the\nknown_cat_bitsets array." - } + }, + "refined_type": {} }, { "name": "n_threads", @@ -70472,13 +74599,14 @@ "docstring": { "type": "int", "description": "Number of OpenMP threads to use." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Predict raw values for non-binned data.", - "docstring": "Predict raw values for non-binned data.\n\nParameters\n----------\nX : ndarray, shape (n_samples, n_features)\n The input samples.\n\nknown_cat_bitsets : ndarray of shape (n_categorical_features, 8)\n Array of bitsets of known categories, for each categorical feature.\n\nf_idx_map : ndarray of shape (n_features,)\n Map from original feature index to the corresponding index in the\n known_cat_bitsets array.\n\nn_threads : int\n Number of OpenMP threads to use.\n\nReturns\n-------\ny : ndarray, shape (n_samples,)\n The raw predicted values.", + "docstring": "Predict raw values for non-binned data.\n\n Parameters\n ----------\n X : ndarray, shape (n_samples, n_features)\n The input samples.\n\n known_cat_bitsets : ndarray of shape (n_categorical_features, 8)\n Array of bitsets of known categories, for each categorical feature.\n\n f_idx_map : ndarray of shape (n_features,)\n Map from original feature index to the corresponding index in the\n known_cat_bitsets array.\n\n n_threads : int\n Number of OpenMP threads to use.\n\n Returns\n -------\n y : ndarray, shape (n_samples,)\n The raw predicted values.\n ", "source_code": "\ndef predict(self, X, known_cat_bitsets, f_idx_map, n_threads):\n \"\"\"Predict raw values for non-binned data.\n\n Parameters\n ----------\n X : ndarray, shape (n_samples, n_features)\n The input samples.\n\n known_cat_bitsets : ndarray of shape (n_categorical_features, 8)\n Array of bitsets of known categories, for each categorical feature.\n\n f_idx_map : ndarray of shape (n_features,)\n Map from original feature index to the corresponding index in the\n known_cat_bitsets array.\n\n n_threads : int\n Number of OpenMP threads to use.\n\n Returns\n -------\n y : ndarray, shape (n_samples,)\n The raw predicted values.\n \"\"\"\n out = np.empty(X.shape[0], dtype=Y_DTYPE)\n _predict_from_raw_data(self.nodes, X, self.raw_left_cat_bitsets, known_cat_bitsets, f_idx_map, n_threads, out)\n return out" }, { @@ -70496,7 +74624,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -70506,7 +74635,8 @@ "docstring": { "type": "ndarray, shape (n_samples, n_features)", "description": "The input samples." - } + }, + "refined_type": {} }, { "name": "missing_values_bin_idx", @@ -70516,7 +74646,8 @@ "docstring": { "type": "uint8", "description": "Index of the bin that is used for missing values. This is the\nindex of the last bin and is always equal to max_bins (as passed\nto the GBDT classes), or equivalently to n_bins - 1." - } + }, + "refined_type": {} }, { "name": "n_threads", @@ -70526,13 +74657,14 @@ "docstring": { "type": "int", "description": "Number of OpenMP threads to use." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Predict raw values for binned data.", - "docstring": "Predict raw values for binned data.\n\nParameters\n----------\nX : ndarray, shape (n_samples, n_features)\n The input samples.\nmissing_values_bin_idx : uint8\n Index of the bin that is used for missing values. This is the\n index of the last bin and is always equal to max_bins (as passed\n to the GBDT classes), or equivalently to n_bins - 1.\nn_threads : int\n Number of OpenMP threads to use.\n\nReturns\n-------\ny : ndarray, shape (n_samples,)\n The raw predicted values.", + "docstring": "Predict raw values for binned data.\n\n Parameters\n ----------\n X : ndarray, shape (n_samples, n_features)\n The input samples.\n missing_values_bin_idx : uint8\n Index of the bin that is used for missing values. This is the\n index of the last bin and is always equal to max_bins (as passed\n to the GBDT classes), or equivalently to n_bins - 1.\n n_threads : int\n Number of OpenMP threads to use.\n\n Returns\n -------\n y : ndarray, shape (n_samples,)\n The raw predicted values.\n ", "source_code": "\ndef predict_binned(self, X, missing_values_bin_idx, n_threads):\n \"\"\"Predict raw values for binned data.\n\n Parameters\n ----------\n X : ndarray, shape (n_samples, n_features)\n The input samples.\n missing_values_bin_idx : uint8\n Index of the bin that is used for missing values. This is the\n index of the last bin and is always equal to max_bins (as passed\n to the GBDT classes), or equivalently to n_bins - 1.\n n_threads : int\n Number of OpenMP threads to use.\n\n Returns\n -------\n y : ndarray, shape (n_samples,)\n The raw predicted values.\n \"\"\"\n out = np.empty(X.shape[0], dtype=Y_DTYPE)\n _predict_from_binned_data(self.nodes, X, self.binned_left_cat_bitsets, missing_values_bin_idx, n_threads, out)\n return out" }, { @@ -70550,7 +74682,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_estimators", @@ -70560,7 +74693,8 @@ "docstring": { "type": "int, default=100", "description": "The number of base estimators in the ensemble." - } + }, + "refined_type": {} }, { "name": "max_samples", @@ -70570,7 +74704,8 @@ "docstring": { "type": "\"auto\", int or float, default=\"auto\"", "description": "The number of samples to draw from X to train each base estimator.\n - If int, then draw `max_samples` samples.\n - If float, then draw `max_samples * X.shape[0]` samples.\n - If \"auto\", then `max_samples=min(256, n_samples)`.\n\nIf max_samples is larger than the number of samples provided,\nall samples will be used for all trees (no sampling)." - } + }, + "refined_type": {} }, { "name": "contamination", @@ -70580,6 +74715,14 @@ "docstring": { "type": "'auto' or float, default='auto'", "description": "The amount of contamination of the data set, i.e. the proportion\nof outliers in the data set. Used when fitting to define the threshold\non the scores of the samples.\n\n - If 'auto', the threshold is determined as in the\n original paper.\n - If float, the contamination should be in the range (0, 0.5].\n\n.. versionchanged:: 0.22\n The default value of ``contamination`` changed from 0.1\n to ``'auto'``." 
+ }, + "refined_type": { + "kind": "BoundaryType", + "base_type": "float", + "min": 0.0, + "max": 0.5, + "min_inclusive": false, + "max_inclusive": true } }, { @@ -70590,7 +74733,8 @@ "docstring": { "type": "int or float, default=1.0", "description": "The number of features to draw from X to train each base estimator.\n\n - If int, then draw `max_features` features.\n - If float, then draw `max_features * X.shape[1]` features." - } + }, + "refined_type": {} }, { "name": "bootstrap", @@ -70600,7 +74744,8 @@ "docstring": { "type": "bool, default=False", "description": "If True, individual trees are fit on random subsets of the training\ndata sampled with replacement. If False, sampling without replacement\nis performed." - } + }, + "refined_type": {} }, { "name": "n_jobs", @@ -70610,7 +74755,8 @@ "docstring": { "type": "int, default=None", "description": "The number of jobs to run in parallel for both :meth:`fit` and\n:meth:`predict`. ``None`` means 1 unless in a\n:obj:`joblib.parallel_backend` context. ``-1`` means using all\nprocessors. See :term:`Glossary ` for more details." - } + }, + "refined_type": {} }, { "name": "random_state", @@ -70620,7 +74766,8 @@ "docstring": { "type": "int, RandomState instance or None, default=None", "description": "Controls the pseudo-randomness of the selection of the feature\nand split values for each branching step and each tree in the forest.\n\nPass an int for reproducible results across multiple function calls.\nSee :term:`Glossary `." - } + }, + "refined_type": {} }, { "name": "verbose", @@ -70630,7 +74777,8 @@ "docstring": { "type": "int, default=0", "description": "Controls the verbosity of the tree building process." - } + }, + "refined_type": {} }, { "name": "warm_start", @@ -70640,13 +74788,14 @@ "docstring": { "type": "bool, default=False", "description": "When set to ``True``, reuse the solution of the previous call to fit\nand add more estimators to the ensemble, otherwise, just fit a whole\nnew forest. See :term:`the Glossary `.\n\n.. 
versionadded:: 0.21" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, *, n_estimators=100, max_samples='auto', contamination='auto', max_features=1.0, bootstrap=False, n_jobs=None, random_state=None, verbose=0, warm_start=False):\n super().__init__(base_estimator=ExtraTreeRegressor(max_features=1, splitter='random', random_state=random_state), bootstrap=bootstrap, bootstrap_features=False, n_estimators=n_estimators, max_samples=max_samples, max_features=max_features, warm_start=warm_start, n_jobs=n_jobs, random_state=random_state, verbose=verbose)\n self.contamination = contamination" }, { @@ -70664,7 +74813,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -70674,13 +74824,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _compute_chunked_score_samples(self, X):\n n_samples = _num_samples(X)\n if self._max_features == X.shape[1]:\n subsample_features = False\n else:\n subsample_features = True\n chunk_n_rows = get_chunk_n_rows(row_bytes=16 * self._max_features, max_n_rows=n_samples)\n slices = gen_batches(n_samples, chunk_n_rows)\n scores = np.zeros(n_samples, order='f')\n for sl in slices:\n scores[sl] = self._compute_score_samples(X[sl], subsample_features)\n return scores" }, { @@ -70698,7 +74849,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -70708,7 +74860,8 @@ "docstring": { "type": "array-like or sparse matrix", "description": "Data matrix." - } + }, + "refined_type": {} }, { "name": "subsample_features", @@ -70718,13 +74871,14 @@ "docstring": { "type": "bool", "description": "Whether features should be subsampled." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Compute the score of each samples in X going through the extra trees.", - "docstring": "Compute the score of each samples in X going through the extra trees.\n\nParameters\n----------\nX : array-like or sparse matrix\n Data matrix.\n\nsubsample_features : bool\n Whether features should be subsampled.", + "docstring": "\n Compute the score of each samples in X going through the extra trees.\n\n Parameters\n ----------\n X : array-like or sparse matrix\n Data matrix.\n\n subsample_features : bool\n Whether features should be subsampled.\n ", "source_code": "\ndef _compute_score_samples(self, X, subsample_features):\n \"\"\"\n Compute the score of each samples in X going through the extra trees.\n\n Parameters\n ----------\n X : array-like or sparse matrix\n Data matrix.\n\n subsample_features : bool\n Whether features should be subsampled.\n \"\"\"\n n_samples = X.shape[0]\n depths = np.zeros(n_samples, order='f')\n for (tree, features) in zip(self.estimators_, self.estimators_features_):\n X_subset = X[:, features] if subsample_features else X\n leaves_index = tree.apply(X_subset)\n node_indicator = tree.decision_path(X_subset)\n n_samples_leaf = tree.tree_.n_node_samples[leaves_index]\n depths += np.ravel(node_indicator.sum(axis=1)) + _average_path_length(n_samples_leaf) - 1.0\n denominator = len(self.estimators_) * _average_path_length([self.max_samples_])\n scores = 2**(-np.divide(depths, denominator, out=np.ones_like(depths), where=denominator != 0))\n return scores" }, { @@ -70742,13 +74896,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _more_tags(self):\n return {'_xfail_checks': {'check_sample_weights_invariance': 'zero sample_weight is not equivalent to removing samples'}}" }, { @@ -70766,13 +74921,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _parallel_args(self):\n return _joblib_parallel_args(prefer='threads')" }, { @@ -70790,7 +74946,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -70800,7 +74957,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -70810,13 +74968,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _set_oob_score(self, X, y):\n raise NotImplementedError('OOB score not supported by iforest')" }, { @@ -70834,7 +74993,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -70844,13 +75004,17 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "The input samples. Internally, it will be converted to\n``dtype=np.float32`` and if a sparse matrix is provided\nto a sparse ``csr_matrix``." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } } ], "results": [], "is_public": true, - "description": "Average anomaly score of X of the base classifiers.\n\nThe anomaly score of an input sample is computed as the mean anomaly score of the trees in the forest. 
The measure of normality of an observation given a tree is the depth of the leaf containing this observation, which is equivalent to the number of splittings required to isolate this point. In case of several observations n_left in the leaf, the average path length of a n_left samples isolation tree is added.", - "docstring": "Average anomaly score of X of the base classifiers.\n\nThe anomaly score of an input sample is computed as\nthe mean anomaly score of the trees in the forest.\n\nThe measure of normality of an observation given a tree is the depth\nof the leaf containing this observation, which is equivalent to\nthe number of splittings required to isolate this point. In case of\nseveral observations n_left in the leaf, the average path length of\na n_left samples isolation tree is added.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input samples. Internally, it will be converted to\n ``dtype=np.float32`` and if a sparse matrix is provided\n to a sparse ``csr_matrix``.\n\nReturns\n-------\nscores : ndarray of shape (n_samples,)\n The anomaly score of the input samples.\n The lower, the more abnormal. Negative scores represent outliers,\n positive scores represent inliers.", + "description": "Average anomaly score of X of the base classifiers.\n\nThe anomaly score of an input sample is computed as\nthe mean anomaly score of the trees in the forest.\n\nThe measure of normality of an observation given a tree is the depth\nof the leaf containing this observation, which is equivalent to\nthe number of splittings required to isolate this point. In case of\nseveral observations n_left in the leaf, the average path length of\na n_left samples isolation tree is added.", + "docstring": "\n Average anomaly score of X of the base classifiers.\n\n The anomaly score of an input sample is computed as\n the mean anomaly score of the trees in the forest.\n\n The measure of normality of an observation given a tree is the depth\n of the leaf containing this observation, which is equivalent to\n the number of splittings required to isolate this point. In case of\n several observations n_left in the leaf, the average path length of\n a n_left samples isolation tree is added.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input samples. Internally, it will be converted to\n ``dtype=np.float32`` and if a sparse matrix is provided\n to a sparse ``csr_matrix``.\n\n Returns\n -------\n scores : ndarray of shape (n_samples,)\n The anomaly score of the input samples.\n The lower, the more abnormal. Negative scores represent outliers,\n positive scores represent inliers.\n ", "source_code": "\ndef decision_function(self, X):\n \"\"\"\n Average anomaly score of X of the base classifiers.\n\n The anomaly score of an input sample is computed as\n the mean anomaly score of the trees in the forest.\n\n The measure of normality of an observation given a tree is the depth\n of the leaf containing this observation, which is equivalent to\n the number of splittings required to isolate this point. In case of\n several observations n_left in the leaf, the average path length of\n a n_left samples isolation tree is added.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input samples. 
Internally, it will be converted to\n ``dtype=np.float32`` and if a sparse matrix is provided\n to a sparse ``csr_matrix``.\n\n Returns\n -------\n scores : ndarray of shape (n_samples,)\n The anomaly score of the input samples.\n The lower, the more abnormal. Negative scores represent outliers,\n positive scores represent inliers.\n \"\"\"\n return self.score_samples(X) - self.offset_" }, { @@ -70868,7 +75032,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -70878,6 +75043,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "The input samples. Use ``dtype=np.float32`` for maximum\nefficiency. Sparse matrices are also supported, use sparse\n``csc_matrix`` for maximum efficiency." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -70888,7 +75057,8 @@ "docstring": { "type": "Ignored", "description": "Not used, present for API consistency by convention." - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -70898,13 +75068,14 @@ "docstring": { "type": "array-like of shape (n_samples,), default=None", "description": "Sample weights. If None, then samples are equally weighted." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Fit estimator.", - "docstring": "Fit estimator.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input samples. Use ``dtype=np.float32`` for maximum\n efficiency. Sparse matrices are also supported, use sparse\n ``csc_matrix`` for maximum efficiency.\n\ny : Ignored\n Not used, present for API consistency by convention.\n\nsample_weight : array-like of shape (n_samples,), default=None\n Sample weights. If None, then samples are equally weighted.\n\nReturns\n-------\nself : object\n Fitted estimator.", + "docstring": "\n Fit estimator.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input samples. Use ``dtype=np.float32`` for maximum\n efficiency. Sparse matrices are also supported, use sparse\n ``csc_matrix`` for maximum efficiency.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights. If None, then samples are equally weighted.\n\n Returns\n -------\n self : object\n Fitted estimator.\n ", "source_code": "\ndef fit(self, X, y=None, sample_weight=None):\n \"\"\"\n Fit estimator.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input samples. Use ``dtype=np.float32`` for maximum\n efficiency. Sparse matrices are also supported, use sparse\n ``csc_matrix`` for maximum efficiency.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights. 
If None, then samples are equally weighted.\n\n Returns\n -------\n self : object\n Fitted estimator.\n \"\"\"\n X = self._validate_data(X, accept_sparse=['csc'])\n if issparse(X):\n X.sort_indices()\n rnd = check_random_state(self.random_state)\n y = rnd.uniform(size=X.shape[0])\n n_samples = X.shape[0]\n if self.contamination != 'auto':\n if not 0.0 < self.contamination <= 0.5:\n raise ValueError('contamination must be in (0, 0.5], got: %f' % self.contamination)\n if isinstance(self.max_samples, str):\n if self.max_samples == 'auto':\n max_samples = min(256, n_samples)\n else:\n raise ValueError('max_samples (%s) is not supported.Valid choices are: \"auto\", int orfloat' % self.max_samples)\n elif isinstance(self.max_samples, numbers.Integral):\n if self.max_samples > n_samples:\n warn('max_samples (%s) is greater than the total number of samples (%s). max_samples will be set to n_samples for estimation.' % (self.max_samples, n_samples))\n max_samples = n_samples\n else:\n max_samples = self.max_samples\n else:\n if not 0.0 < self.max_samples <= 1.0:\n raise ValueError('max_samples must be in (0, 1], got %r' % self.max_samples)\n max_samples = int(self.max_samples * X.shape[0])\n self.max_samples_ = max_samples\n max_depth = int(np.ceil(np.log2(max(max_samples, 2))))\n super()._fit(X, y, max_samples, max_depth=max_depth, sample_weight=sample_weight)\n if self.contamination == 'auto':\n self.offset_ = -0.5\n return self\n self.offset_ = np.percentile(self.score_samples(X), 100.0 * self.contamination)\n return self" }, { @@ -70922,7 +75093,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -70932,13 +75104,17 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "The input samples. Internally, it will be converted to\n``dtype=np.float32`` and if a sparse matrix is provided\nto a sparse ``csr_matrix``." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } } ], "results": [], "is_public": true, "description": "Predict if a particular sample is an outlier or not.", - "docstring": "Predict if a particular sample is an outlier or not.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input samples. Internally, it will be converted to\n ``dtype=np.float32`` and if a sparse matrix is provided\n to a sparse ``csr_matrix``.\n\nReturns\n-------\nis_inlier : ndarray of shape (n_samples,)\n For each observation, tells whether or not (+1 or -1) it should\n be considered as an inlier according to the fitted model.", + "docstring": "\n Predict if a particular sample is an outlier or not.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input samples. Internally, it will be converted to\n ``dtype=np.float32`` and if a sparse matrix is provided\n to a sparse ``csr_matrix``.\n\n Returns\n -------\n is_inlier : ndarray of shape (n_samples,)\n For each observation, tells whether or not (+1 or -1) it should\n be considered as an inlier according to the fitted model.\n ", "source_code": "\ndef predict(self, X):\n \"\"\"\n Predict if a particular sample is an outlier or not.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input samples. 
Internally, it will be converted to\n ``dtype=np.float32`` and if a sparse matrix is provided\n to a sparse ``csr_matrix``.\n\n Returns\n -------\n is_inlier : ndarray of shape (n_samples,)\n For each observation, tells whether or not (+1 or -1) it should\n be considered as an inlier according to the fitted model.\n \"\"\"\n check_is_fitted(self)\n decision_func = self.decision_function(X)\n is_inlier = np.ones_like(decision_func, dtype=int)\n is_inlier[decision_func < 0] = -1\n return is_inlier" }, { @@ -70956,7 +75132,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -70966,13 +75143,17 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "The input samples." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } } ], "results": [], "is_public": true, - "description": "Opposite of the anomaly score defined in the original paper.\n\nThe anomaly score of an input sample is computed as the mean anomaly score of the trees in the forest. The measure of normality of an observation given a tree is the depth of the leaf containing this observation, which is equivalent to the number of splittings required to isolate this point. In case of several observations n_left in the leaf, the average path length of a n_left samples isolation tree is added.", - "docstring": "Opposite of the anomaly score defined in the original paper.\n\nThe anomaly score of an input sample is computed as\nthe mean anomaly score of the trees in the forest.\n\nThe measure of normality of an observation given a tree is the depth\nof the leaf containing this observation, which is equivalent to\nthe number of splittings required to isolate this point. In case of\nseveral observations n_left in the leaf, the average path length of\na n_left samples isolation tree is added.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input samples.\n\nReturns\n-------\nscores : ndarray of shape (n_samples,)\n The anomaly score of the input samples.\n The lower, the more abnormal.", + "description": "Opposite of the anomaly score defined in the original paper.\n\nThe anomaly score of an input sample is computed as\nthe mean anomaly score of the trees in the forest.\n\nThe measure of normality of an observation given a tree is the depth\nof the leaf containing this observation, which is equivalent to\nthe number of splittings required to isolate this point. In case of\nseveral observations n_left in the leaf, the average path length of\na n_left samples isolation tree is added.", + "docstring": "\n Opposite of the anomaly score defined in the original paper.\n\n The anomaly score of an input sample is computed as\n the mean anomaly score of the trees in the forest.\n\n The measure of normality of an observation given a tree is the depth\n of the leaf containing this observation, which is equivalent to\n the number of splittings required to isolate this point. 
In case of\n several observations n_left in the leaf, the average path length of\n a n_left samples isolation tree is added.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input samples.\n\n Returns\n -------\n scores : ndarray of shape (n_samples,)\n The anomaly score of the input samples.\n The lower, the more abnormal.\n ", "source_code": "\ndef score_samples(self, X):\n \"\"\"\n Opposite of the anomaly score defined in the original paper.\n\n The anomaly score of an input sample is computed as\n the mean anomaly score of the trees in the forest.\n\n The measure of normality of an observation given a tree is the depth\n of the leaf containing this observation, which is equivalent to\n the number of splittings required to isolate this point. In case of\n several observations n_left in the leaf, the average path length of\n a n_left samples isolation tree is added.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input samples.\n\n Returns\n -------\n scores : ndarray of shape (n_samples,)\n The anomaly score of the input samples.\n The lower, the more abnormal.\n \"\"\"\n check_is_fitted(self)\n X = self._validate_data(X, accept_sparse='csr', reset=False)\n return -self._compute_chunked_score_samples(X)" }, { @@ -70990,13 +75171,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "The average path length in a n_samples iTree, which is equal to the average path length of an unsuccessful BST search since the latter has the same structure as an isolation tree. Parameters ---------- n_samples_leaf : array-like of shape (n_samples,) The number of training samples in each test sample leaf, for each estimators.", - "docstring": "The average path length in a n_samples iTree, which is equal to\nthe average path length of an unsuccessful BST search since the\nlatter has the same structure as an isolation tree.\nParameters\n----------\nn_samples_leaf : array-like of shape (n_samples,)\n The number of training samples in each test sample leaf, for\n each estimators.\n\nReturns\n-------\naverage_path_length : ndarray of shape (n_samples,)", + "description": "The average path length in a n_samples iTree, which is equal to\nthe average path length of an unsuccessful BST search since the\nlatter has the same structure as an isolation tree.\nParameters\n----------\nn_samples_leaf : array-like of shape (n_samples,)\n The number of training samples in each test sample leaf, for\n each estimators.", + "docstring": "\n The average path length in a n_samples iTree, which is equal to\n the average path length of an unsuccessful BST search since the\n latter has the same structure as an isolation tree.\n Parameters\n ----------\n n_samples_leaf : array-like of shape (n_samples,)\n The number of training samples in each test sample leaf, for\n each estimators.\n\n Returns\n -------\n average_path_length : ndarray of shape (n_samples,)\n ", "source_code": "\ndef _average_path_length(n_samples_leaf):\n \"\"\"\n The average path length in a n_samples iTree, which is equal to\n the average path length of an unsuccessful BST search since the\n latter has the same structure as an isolation tree.\n Parameters\n ----------\n n_samples_leaf : array-like of shape (n_samples,)\n The number of training samples in each test sample leaf, for\n each estimators.\n\n Returns\n -------\n average_path_length : ndarray of shape (n_samples,)\n 
\"\"\"\n n_samples_leaf = check_array(n_samples_leaf, ensure_2d=False)\n n_samples_leaf_shape = n_samples_leaf.shape\n n_samples_leaf = n_samples_leaf.reshape((1, -1))\n average_path_length = np.zeros(n_samples_leaf.shape)\n mask_1 = n_samples_leaf <= 1\n mask_2 = n_samples_leaf == 2\n not_mask = ~np.logical_or(mask_1, mask_2)\n average_path_length[mask_1] = 0.0\n average_path_length[mask_2] = 1.0\n average_path_length[not_mask] = 2.0 * (np.log(n_samples_leaf[not_mask] - 1.0) + np.euler_gamma) - 2.0 * (n_samples_leaf[not_mask] - 1.0) / n_samples_leaf[not_mask]\n return average_path_length.reshape(n_samples_leaf_shape)" }, { @@ -71014,7 +75196,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "estimators", @@ -71024,7 +75207,8 @@ "docstring": { "type": "list of (str, estimator)", "description": "Base estimators which will be stacked together. Each element of the\nlist is defined as a tuple of string (i.e. name) and an estimator\ninstance. An estimator can be set to 'drop' using `set_params`." - } + }, + "refined_type": {} }, { "name": "final_estimator", @@ -71034,7 +75218,8 @@ "docstring": { "type": "estimator, default=None", "description": "A classifier which will be used to combine the base estimators.\nThe default classifier is a\n:class:`~sklearn.linear_model.LogisticRegression`." - } + }, + "refined_type": {} }, { "name": "cv", @@ -71044,7 +75229,8 @@ "docstring": { "type": "int, cross-validation generator or an iterable, default=None", "description": "Determines the cross-validation splitting strategy used in\n`cross_val_predict` to train `final_estimator`. Possible inputs for\ncv are:\n\n* None, to use the default 5-fold cross validation,\n* integer, to specify the number of folds in a (Stratified) KFold,\n* An object to be used as a cross-validation generator,\n* An iterable yielding train, test splits.\n\nFor integer/None inputs, if the estimator is a classifier and y is\neither binary or multiclass,\n:class:`~sklearn.model_selection.StratifiedKFold` is used.\nIn all other cases, :class:`~sklearn.model_selection.KFold` is used.\nThese splitters are instantiated with `shuffle=False` so the splits\nwill be the same across calls.\n\nRefer :ref:`User Guide ` for the various\ncross-validation strategies that can be used here.\n\n.. note::\n A larger number of split will provide no benefits if the number\n of training samples is large enough. Indeed, the training time\n will increase. ``cv`` is not used for model evaluation but for\n prediction." - } + }, + "refined_type": {} }, { "name": "stack_method", @@ -71054,6 +75240,15 @@ "docstring": { "type": "{'auto', 'predict_proba', 'decision_function', 'predict'}, default='auto'", "description": "Methods called for each base estimator. It can be:\n\n* if 'auto', it will try to invoke, for each estimator,\n `'predict_proba'`, `'decision_function'` or `'predict'` in that\n order.\n* otherwise, one of `'predict_proba'`, `'decision_function'` or\n `'predict'`. If the method is not implemented by the estimator, it\n will raise an error." + }, + "refined_type": { + "kind": "EnumType", + "values": [ + "auto", + "predict", + "decision_function", + "predict_proba" + ] } }, { @@ -71064,7 +75259,8 @@ "docstring": { "type": "int, default=None", "description": "The number of jobs to run in parallel all `estimators` `fit`.\n`None` means 1 unless in a `joblib.parallel_backend` context. -1 means\nusing all processors. See Glossary for more details." 
- } + }, + "refined_type": {} }, { "name": "passthrough", @@ -71074,7 +75270,8 @@ "docstring": { "type": "bool, default=False", "description": "When False, only the predictions of estimators will be used as\ntraining data for `final_estimator`. When True, the\n`final_estimator` is trained on the predictions as well as the\noriginal training data." - } + }, + "refined_type": {} }, { "name": "verbose", @@ -71084,13 +75281,14 @@ "docstring": { "type": "int, default=0", "description": "Verbosity level." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, estimators, final_estimator=None, *, cv=None, stack_method='auto', n_jobs=None, passthrough=False, verbose=0):\n super().__init__(estimators=estimators, final_estimator=final_estimator, cv=cv, stack_method=stack_method, n_jobs=n_jobs, passthrough=passthrough, verbose=verbose)" }, { @@ -71108,13 +75306,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _sk_visual_block_(self):\n if self.final_estimator is None:\n final_estimator = LogisticRegression()\n else:\n final_estimator = self.final_estimator\n return super()._sk_visual_block_(final_estimator)" }, { @@ -71132,13 +75331,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _validate_final_estimator(self):\n self._clone_final_estimator(default=LogisticRegression())\n if not is_classifier(self.final_estimator_):\n raise ValueError(\"'final_estimator' parameter should be a classifier. Got {}\".format(self.final_estimator_))" }, { @@ -71158,7 +75358,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -71168,13 +75369,17 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "Training vectors, where `n_samples` is the number of samples and\n`n_features` is the number of features." 
+ }, + "refined_type": { + "kind": "EnumType", + "values": [] } } ], "results": [], "is_public": true, "description": "Decision function for samples in `X` using the final estimator.", - "docstring": "Decision function for samples in `X` using the final estimator.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training vectors, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\nReturns\n-------\ndecisions : ndarray of shape (n_samples,), (n_samples, n_classes), or (n_samples, n_classes * (n_classes-1) / 2)\n The decision function computed the final estimator.", + "docstring": "Decision function for samples in `X` using the final estimator.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training vectors, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\n Returns\n -------\n decisions : ndarray of shape (n_samples,), (n_samples, n_classes), or (n_samples, n_classes * (n_classes-1) / 2)\n The decision function computed the final estimator.\n ", "source_code": "\n@if_delegate_has_method(delegate='final_estimator_')\ndef decision_function(self, X):\n \"\"\"Decision function for samples in `X` using the final estimator.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training vectors, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\n Returns\n -------\n decisions : ndarray of shape (n_samples,), (n_samples, n_classes), or (n_samples, n_classes * (n_classes-1) / 2)\n The decision function computed the final estimator.\n \"\"\"\n check_is_fitted(self)\n return self.final_estimator_.decision_function(self.transform(X))" }, { @@ -71192,7 +75397,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -71202,6 +75408,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "Training vectors, where `n_samples` is the number of samples and\n`n_features` is the number of features." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -71212,7 +75422,8 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "Target values." - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -71222,13 +75433,14 @@ "docstring": { "type": "array-like of shape (n_samples,), default=None", "description": "Sample weights. If None, then samples are equally weighted.\nNote that this is supported only if all underlying estimators\nsupport sample weights." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Fit the estimators.", - "docstring": "Fit the estimators.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training vectors, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\ny : array-like of shape (n_samples,)\n Target values.\n\nsample_weight : array-like of shape (n_samples,), default=None\n Sample weights. 
If None, then samples are equally weighted.\n Note that this is supported only if all underlying estimators\n support sample weights.\n\nReturns\n-------\nself : object\n Returns a fitted instance of estimator.", + "docstring": "Fit the estimators.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training vectors, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\n y : array-like of shape (n_samples,)\n Target values.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights. If None, then samples are equally weighted.\n Note that this is supported only if all underlying estimators\n support sample weights.\n\n Returns\n -------\n self : object\n Returns a fitted instance of estimator.\n ", "source_code": "\ndef fit(self, X, y, sample_weight=None):\n \"\"\"Fit the estimators.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training vectors, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\n y : array-like of shape (n_samples,)\n Target values.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights. If None, then samples are equally weighted.\n Note that this is supported only if all underlying estimators\n support sample weights.\n\n Returns\n -------\n self : object\n Returns a fitted instance of estimator.\n \"\"\"\n check_classification_targets(y)\n self._le = LabelEncoder().fit(y)\n self.classes_ = self._le.classes_\n return super().fit(X, self._le.transform(y), sample_weight)" }, { @@ -71248,7 +75460,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -71258,13 +75471,17 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "Training vectors, where `n_samples` is the number of samples and\n`n_features` is the number of features." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } } ], "results": [], "is_public": true, "description": "Predict target for X.", - "docstring": "Predict target for X.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training vectors, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\n**predict_params : dict of str -> obj\n Parameters to the `predict` called by the `final_estimator`. Note\n that this may be used to return uncertainties from some estimators\n with `return_std` or `return_cov`. Be aware that it will only\n accounts for uncertainty in the final estimator.\n\nReturns\n-------\ny_pred : ndarray of shape (n_samples,) or (n_samples, n_output)\n Predicted targets.", + "docstring": "Predict target for X.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training vectors, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\n **predict_params : dict of str -> obj\n Parameters to the `predict` called by the `final_estimator`. Note\n that this may be used to return uncertainties from some estimators\n with `return_std` or `return_cov`. 
Be aware that it will only\n accounts for uncertainty in the final estimator.\n\n Returns\n -------\n y_pred : ndarray of shape (n_samples,) or (n_samples, n_output)\n Predicted targets.\n ", "source_code": "\n@if_delegate_has_method(delegate='final_estimator_')\ndef predict(self, X, **predict_params):\n \"\"\"Predict target for X.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training vectors, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\n **predict_params : dict of str -> obj\n Parameters to the `predict` called by the `final_estimator`. Note\n that this may be used to return uncertainties from some estimators\n with `return_std` or `return_cov`. Be aware that it will only\n accounts for uncertainty in the final estimator.\n\n Returns\n -------\n y_pred : ndarray of shape (n_samples,) or (n_samples, n_output)\n Predicted targets.\n \"\"\"\n y_pred = super().predict(X, **predict_params)\n return self._le.inverse_transform(y_pred)" }, { @@ -71284,7 +75501,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -71294,13 +75512,17 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "Training vectors, where `n_samples` is the number of samples and\n`n_features` is the number of features." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } } ], "results": [], "is_public": true, "description": "Predict class probabilities for `X` using the final estimator.", - "docstring": "Predict class probabilities for `X` using the final estimator.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training vectors, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\nReturns\n-------\nprobabilities : ndarray of shape (n_samples, n_classes) or list of ndarray of shape (n_output,)\n The class probabilities of the input samples.", + "docstring": "Predict class probabilities for `X` using the final estimator.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training vectors, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\n Returns\n -------\n probabilities : ndarray of shape (n_samples, n_classes) or list of ndarray of shape (n_output,)\n The class probabilities of the input samples.\n ", "source_code": "\n@if_delegate_has_method(delegate='final_estimator_')\ndef predict_proba(self, X):\n \"\"\"Predict class probabilities for `X` using the final estimator.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training vectors, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\n Returns\n -------\n probabilities : ndarray of shape (n_samples, n_classes) or list of ndarray of shape (n_output,)\n The class probabilities of the input samples.\n \"\"\"\n check_is_fitted(self)\n return self.final_estimator_.predict_proba(self.transform(X))" }, { @@ -71318,7 +75540,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -71328,13 +75551,17 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "Training vectors, where `n_samples` is the number of samples and\n`n_features` is the number of features." 
+ }, + "refined_type": { + "kind": "EnumType", + "values": [] } } ], "results": [], "is_public": true, "description": "Return class labels or probabilities for X for each estimator.", - "docstring": "Return class labels or probabilities for X for each estimator.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training vectors, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\nReturns\n-------\ny_preds : ndarray of shape (n_samples, n_estimators) or (n_samples, n_classes * n_estimators)\n Prediction outputs for each estimator.", + "docstring": "Return class labels or probabilities for X for each estimator.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training vectors, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\n Returns\n -------\n y_preds : ndarray of shape (n_samples, n_estimators) or (n_samples, n_classes * n_estimators)\n Prediction outputs for each estimator.\n ", "source_code": "\ndef transform(self, X):\n \"\"\"Return class labels or probabilities for X for each estimator.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training vectors, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\n Returns\n -------\n y_preds : ndarray of shape (n_samples, n_estimators) or (n_samples, n_classes * n_estimators)\n Prediction outputs for each estimator.\n \"\"\"\n return self._transform(X)" }, { @@ -71352,7 +75579,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "estimators", @@ -71362,7 +75590,8 @@ "docstring": { "type": "list of (str, estimator)", "description": "Base estimators which will be stacked together. Each element of the\nlist is defined as a tuple of string (i.e. name) and an estimator\ninstance. An estimator can be set to 'drop' using `set_params`." - } + }, + "refined_type": {} }, { "name": "final_estimator", @@ -71372,7 +75601,8 @@ "docstring": { "type": "estimator, default=None", "description": "A regressor which will be used to combine the base estimators.\nThe default regressor is a :class:`~sklearn.linear_model.RidgeCV`." - } + }, + "refined_type": {} }, { "name": "cv", @@ -71382,7 +75612,8 @@ "docstring": { "type": "int, cross-validation generator or an iterable, default=None", "description": "Determines the cross-validation splitting strategy used in\n`cross_val_predict` to train `final_estimator`. Possible inputs for\ncv are:\n\n* None, to use the default 5-fold cross validation,\n* integer, to specify the number of folds in a (Stratified) KFold,\n* An object to be used as a cross-validation generator,\n* An iterable yielding train, test splits.\n\nFor integer/None inputs, if the estimator is a classifier and y is\neither binary or multiclass,\n:class:`~sklearn.model_selection.StratifiedKFold` is used.\nIn all other cases, :class:`~sklearn.model_selection.KFold` is used.\nThese splitters are instantiated with `shuffle=False` so the splits\nwill be the same across calls.\n\nRefer :ref:`User Guide ` for the various\ncross-validation strategies that can be used here.\n\n.. note::\n A larger number of split will provide no benefits if the number\n of training samples is large enough. Indeed, the training time\n will increase. ``cv`` is not used for model evaluation but for\n prediction." 
- } + }, + "refined_type": {} }, { "name": "n_jobs", @@ -71392,7 +75623,8 @@ "docstring": { "type": "int, default=None", "description": "The number of jobs to run in parallel for `fit` of all `estimators`.\n`None` means 1 unless in a `joblib.parallel_backend` context. -1 means\nusing all processors. See Glossary for more details." - } + }, + "refined_type": {} }, { "name": "passthrough", @@ -71402,7 +75634,8 @@ "docstring": { "type": "bool, default=False", "description": "When False, only the predictions of estimators will be used as\ntraining data for `final_estimator`. When True, the\n`final_estimator` is trained on the predictions as well as the\noriginal training data." - } + }, + "refined_type": {} }, { "name": "verbose", @@ -71412,13 +75645,14 @@ "docstring": { "type": "int, default=0", "description": "Verbosity level." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, estimators, final_estimator=None, *, cv=None, n_jobs=None, passthrough=False, verbose=0):\n super().__init__(estimators=estimators, final_estimator=final_estimator, cv=cv, stack_method='predict', n_jobs=n_jobs, passthrough=passthrough, verbose=verbose)" }, { @@ -71436,13 +75670,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _sk_visual_block_(self):\n if self.final_estimator is None:\n final_estimator = RidgeCV()\n else:\n final_estimator = self.final_estimator\n return super()._sk_visual_block_(final_estimator)" }, { @@ -71460,13 +75695,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _validate_final_estimator(self):\n self._clone_final_estimator(default=RidgeCV())\n if not is_regressor(self.final_estimator_):\n raise ValueError(\"'final_estimator' parameter should be a regressor. Got {}\".format(self.final_estimator_))" }, { @@ -71484,7 +75720,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -71494,6 +75731,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "Training vectors, where `n_samples` is the number of samples and\n`n_features` is the number of features." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -71504,7 +75745,8 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "Target values." - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -71514,14 +75756,15 @@ "docstring": { "type": "array-like of shape (n_samples,), default=None", "description": "Sample weights. If None, then samples are equally weighted.\nNote that this is supported only if all underlying estimators\nsupport sample weights." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Fit the estimators.", - "docstring": "Fit the estimators.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training vectors, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\ny : array-like of shape (n_samples,)\n Target values.\n\nsample_weight : array-like of shape (n_samples,), default=None\n Sample weights. 
If None, then samples are equally weighted.\n Note that this is supported only if all underlying estimators\n support sample weights.\n\nReturns\n-------\nself : object", - "source_code": "\ndef fit(self, X, y, sample_weight=None):\n \"\"\"Fit the estimators.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training vectors, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\n y : array-like of shape (n_samples,)\n Target values.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights. If None, then samples are equally weighted.\n Note that this is supported only if all underlying estimators\n support sample weights.\n\n Returns\n -------\n self : object\n \"\"\"\n y = column_or_1d(y, warn=True)\n return super().fit(X, y, sample_weight)" + "docstring": "Fit the estimators.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training vectors, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\n y : array-like of shape (n_samples,)\n Target values.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights. If None, then samples are equally weighted.\n Note that this is supported only if all underlying estimators\n support sample weights.\n\n Returns\n -------\n self : object\n Returns a fitted instance.\n ", + "source_code": "\ndef fit(self, X, y, sample_weight=None):\n \"\"\"Fit the estimators.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training vectors, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\n y : array-like of shape (n_samples,)\n Target values.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights. If None, then samples are equally weighted.\n Note that this is supported only if all underlying estimators\n support sample weights.\n\n Returns\n -------\n self : object\n Returns a fitted instance.\n \"\"\"\n y = column_or_1d(y, warn=True)\n return super().fit(X, y, sample_weight)" }, { "name": "transform", @@ -71538,7 +75781,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -71548,13 +75792,17 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "Training vectors, where `n_samples` is the number of samples and\n`n_features` is the number of features." 
+ }, + "refined_type": { + "kind": "EnumType", + "values": [] } } ], "results": [], "is_public": true, "description": "Return the predictions for X for each estimator.", - "docstring": "Return the predictions for X for each estimator.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training vectors, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\nReturns\n-------\ny_preds : ndarray of shape (n_samples, n_estimators)\n Prediction outputs for each estimator.", + "docstring": "Return the predictions for X for each estimator.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training vectors, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\n Returns\n -------\n y_preds : ndarray of shape (n_samples, n_estimators)\n Prediction outputs for each estimator.\n ", "source_code": "\ndef transform(self, X):\n \"\"\"Return the predictions for X for each estimator.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training vectors, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\n Returns\n -------\n y_preds : ndarray of shape (n_samples, n_estimators)\n Prediction outputs for each estimator.\n \"\"\"\n return self._transform(X)" }, { @@ -71572,7 +75820,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "estimators", @@ -71582,7 +75831,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "final_estimator", @@ -71592,7 +75842,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "cv", @@ -71602,7 +75853,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "stack_method", @@ -71612,7 +75864,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_jobs", @@ -71622,7 +75875,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "verbose", @@ -71632,7 +75886,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "passthrough", @@ -71642,13 +75897,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@abstractmethod\ndef __init__(self, estimators, final_estimator=None, *, cv=None, stack_method='auto', n_jobs=None, verbose=0, passthrough=False):\n super().__init__(estimators=estimators)\n self.final_estimator = final_estimator\n self.cv = cv\n self.stack_method = stack_method\n self.n_jobs = n_jobs\n self.verbose = verbose\n self.passthrough = passthrough" }, { @@ -71666,7 +75922,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "default", @@ -71676,13 +75933,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _clone_final_estimator(self, default):\n if self.final_estimator is not None:\n self.final_estimator_ = clone(self.final_estimator)\n else:\n self.final_estimator_ = clone(default)" }, { @@ -71700,7 +75958,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -71710,7 +75969,8 @@ "docstring": 
{ "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "predictions", @@ -71720,13 +75980,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Concatenate the predictions of each first layer learner and possibly the input dataset `X`.\n\nIf `X` is sparse and `self.passthrough` is False, the output of `transform` will be dense (the predictions). If `X` is sparse and `self.passthrough` is True, the output of `transform` will be sparse. This helper is in charge of ensuring the predictions are 2D arrays and it will drop one of the probability column when using probabilities in the binary case. Indeed, the p(y|c=0) = 1 - p(y|c=1)", - "docstring": "Concatenate the predictions of each first layer learner and\npossibly the input dataset `X`.\n\nIf `X` is sparse and `self.passthrough` is False, the output of\n`transform` will be dense (the predictions). If `X` is sparse\nand `self.passthrough` is True, the output of `transform` will\nbe sparse.\n\nThis helper is in charge of ensuring the predictions are 2D arrays and\nit will drop one of the probability column when using probabilities\nin the binary case. Indeed, the p(y|c=0) = 1 - p(y|c=1)", + "description": "Concatenate the predictions of each first layer learner and\npossibly the input dataset `X`.\n\nIf `X` is sparse and `self.passthrough` is False, the output of\n`transform` will be dense (the predictions). If `X` is sparse\nand `self.passthrough` is True, the output of `transform` will\nbe sparse.\n\nThis helper is in charge of ensuring the predictions are 2D arrays and\nit will drop one of the probability column when using probabilities\nin the binary case. Indeed, the p(y|c=0) = 1 - p(y|c=1)", + "docstring": "Concatenate the predictions of each first layer learner and\n possibly the input dataset `X`.\n\n If `X` is sparse and `self.passthrough` is False, the output of\n `transform` will be dense (the predictions). If `X` is sparse\n and `self.passthrough` is True, the output of `transform` will\n be sparse.\n\n This helper is in charge of ensuring the predictions are 2D arrays and\n it will drop one of the probability column when using probabilities\n in the binary case. Indeed, the p(y|c=0) = 1 - p(y|c=1)\n ", "source_code": "\ndef _concatenate_predictions(self, X, predictions):\n \"\"\"Concatenate the predictions of each first layer learner and\n possibly the input dataset `X`.\n\n If `X` is sparse and `self.passthrough` is False, the output of\n `transform` will be dense (the predictions). If `X` is sparse\n and `self.passthrough` is True, the output of `transform` will\n be sparse.\n\n This helper is in charge of ensuring the predictions are 2D arrays and\n it will drop one of the probability column when using probabilities\n in the binary case. 
Indeed, the p(y|c=0) = 1 - p(y|c=1)\n \"\"\"\n X_meta = []\n for (est_idx, preds) in enumerate(predictions):\n if preds.ndim == 1:\n X_meta.append(preds.reshape(-1, 1))\n elif self.stack_method_[est_idx] == 'predict_proba' and len(self.classes_) == 2:\n X_meta.append(preds[:, 1:])\n else:\n X_meta.append(preds)\n if self.passthrough:\n X_meta.append(X)\n if sparse.issparse(X):\n return sparse.hstack(X_meta, format=X.format)\n return np.hstack(X_meta)" }, { @@ -71744,7 +76005,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "estimator", @@ -71754,7 +76016,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "method", @@ -71764,13 +76027,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@staticmethod\ndef _method_name(name, estimator, method):\n if estimator == 'drop':\n return None\n if method == 'auto':\n if getattr(estimator, 'predict_proba', None):\n return 'predict_proba'\n elif getattr(estimator, 'decision_function', None):\n return 'decision_function'\n else:\n return 'predict'\n else:\n if not hasattr(estimator, method):\n raise ValueError('Underlying estimator {} does not implement the method {}.'.format(name, method))\n return method" }, { @@ -71788,7 +76052,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "final_estimator", @@ -71798,13 +76063,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _sk_visual_block_(self, final_estimator):\n (names, estimators) = zip(*self.estimators)\n parallel = _VisualBlock('parallel', estimators, names=names, dash_wrapped=False)\n final_block = _VisualBlock('parallel', [final_estimator], names=['final_estimator'], dash_wrapped=False)\n return _VisualBlock('serial', (parallel, final_block), dash_wrapped=False)" }, { @@ -71822,7 +76088,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -71832,7 +76099,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -71856,7 +76124,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -71866,6 +76135,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "Training vectors, where `n_samples` is the number of samples and\n`n_features` is the number of features." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -71876,7 +76149,8 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "Target values." - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -71886,13 +76160,14 @@ "docstring": { "type": "array-like of shape (n_samples,) or default=None", "description": "Sample weights. If None, then samples are equally weighted.\nNote that this is supported only if all underlying estimators\nsupport sample weights.\n\n.. 
versionchanged:: 0.23\n when not None, `sample_weight` is passed to all underlying\n estimators" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Fit the estimators.", - "docstring": "Fit the estimators.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training vectors, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\ny : array-like of shape (n_samples,)\n Target values.\n\nsample_weight : array-like of shape (n_samples,) or default=None\n Sample weights. If None, then samples are equally weighted.\n Note that this is supported only if all underlying estimators\n support sample weights.\n\n .. versionchanged:: 0.23\n when not None, `sample_weight` is passed to all underlying\n estimators\n\nReturns\n-------\nself : object", + "docstring": "Fit the estimators.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training vectors, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\n y : array-like of shape (n_samples,)\n Target values.\n\n sample_weight : array-like of shape (n_samples,) or default=None\n Sample weights. If None, then samples are equally weighted.\n Note that this is supported only if all underlying estimators\n support sample weights.\n\n .. versionchanged:: 0.23\n when not None, `sample_weight` is passed to all underlying\n estimators\n\n Returns\n -------\n self : object\n ", "source_code": "\ndef fit(self, X, y, sample_weight=None):\n \"\"\"Fit the estimators.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training vectors, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\n y : array-like of shape (n_samples,)\n Target values.\n\n sample_weight : array-like of shape (n_samples,) or default=None\n Sample weights. If None, then samples are equally weighted.\n Note that this is supported only if all underlying estimators\n support sample weights.\n\n .. 
versionchanged:: 0.23\n when not None, `sample_weight` is passed to all underlying\n estimators\n\n Returns\n -------\n self : object\n \"\"\"\n (names, all_estimators) = self._validate_estimators()\n self._validate_final_estimator()\n stack_method = [self.stack_method] * len(all_estimators)\n self.estimators_ = Parallel(n_jobs=self.n_jobs)((delayed(_fit_single_estimator)(clone(est), X, y, sample_weight) for est in all_estimators if est != 'drop'))\n self.named_estimators_ = Bunch()\n est_fitted_idx = 0\n for (name_est, org_est) in zip(names, all_estimators):\n if org_est != 'drop':\n current_estimator = self.estimators_[est_fitted_idx]\n self.named_estimators_[name_est] = current_estimator\n est_fitted_idx += 1\n if hasattr(current_estimator, 'feature_names_in_'):\n self.feature_names_in_ = current_estimator.feature_names_in_\n else:\n self.named_estimators_[name_est] = 'drop'\n cv = check_cv(self.cv, y=y, classifier=is_classifier(self))\n if hasattr(cv, 'random_state') and cv.random_state is None:\n cv.random_state = np.random.RandomState()\n self.stack_method_ = [self._method_name(name, est, meth) for (name, est, meth) in zip(names, all_estimators, stack_method)]\n fit_params = {'sample_weight': sample_weight} if sample_weight is not None else None\n predictions = Parallel(n_jobs=self.n_jobs)((delayed(cross_val_predict)(clone(est), X, y, cv=deepcopy(cv), method=meth, n_jobs=self.n_jobs, fit_params=fit_params, verbose=self.verbose) for (est, meth) in zip(all_estimators, self.stack_method_) if est != 'drop'))\n self.stack_method_ = [meth for (meth, est) in zip(self.stack_method_, all_estimators) if est != 'drop']\n X_meta = self._concatenate_predictions(X, predictions)\n _fit_single_estimator(self.final_estimator_, X_meta, y, sample_weight=sample_weight)\n return self" }, { @@ -71910,7 +76185,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -71936,7 +76212,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -71946,13 +76223,17 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "Training vectors, where `n_samples` is the number of samples and\n`n_features` is the number of features." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } } ], "results": [], "is_public": false, "description": "Predict target for X.", - "docstring": "Predict target for X.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training vectors, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\n**predict_params : dict of str -> obj\n Parameters to the `predict` called by the `final_estimator`. Note\n that this may be used to return uncertainties from some estimators\n with `return_std` or `return_cov`. Be aware that it will only\n accounts for uncertainty in the final estimator.\n\nReturns\n-------\ny_pred : ndarray of shape (n_samples,) or (n_samples, n_output)\n Predicted targets.", + "docstring": "Predict target for X.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training vectors, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\n **predict_params : dict of str -> obj\n Parameters to the `predict` called by the `final_estimator`. Note\n that this may be used to return uncertainties from some estimators\n with `return_std` or `return_cov`. 
Be aware that it will only\n accounts for uncertainty in the final estimator.\n\n Returns\n -------\n y_pred : ndarray of shape (n_samples,) or (n_samples, n_output)\n Predicted targets.\n ", "source_code": "\n@if_delegate_has_method(delegate='final_estimator_')\ndef predict(self, X, **predict_params):\n \"\"\"Predict target for X.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training vectors, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\n **predict_params : dict of str -> obj\n Parameters to the `predict` called by the `final_estimator`. Note\n that this may be used to return uncertainties from some estimators\n with `return_std` or `return_cov`. Be aware that it will only\n accounts for uncertainty in the final estimator.\n\n Returns\n -------\n y_pred : ndarray of shape (n_samples,) or (n_samples, n_output)\n Predicted targets.\n \"\"\"\n check_is_fitted(self)\n return self.final_estimator_.predict(self.transform(X), **predict_params)" }, { @@ -71970,7 +76251,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "estimators", @@ -71980,7 +76262,8 @@ "docstring": { "type": "list of (str, estimator) tuples", "description": "Invoking the ``fit`` method on the ``VotingClassifier`` will fit clones\nof those original estimators that will be stored in the class attribute\n``self.estimators_``. An estimator can be set to ``'drop'``\nusing ``set_params``.\n\n.. versionchanged:: 0.21\n ``'drop'`` is accepted. Using None was deprecated in 0.22 and\n support was removed in 0.24." - } + }, + "refined_type": {} }, { "name": "voting", @@ -71990,6 +76273,10 @@ "docstring": { "type": "{'hard', 'soft'}, default='hard'", "description": "If 'hard', uses predicted class labels for majority rule voting.\nElse if 'soft', predicts the class label based on the argmax of\nthe sums of the predicted probabilities, which is recommended for\nan ensemble of well-calibrated classifiers." + }, + "refined_type": { + "kind": "EnumType", + "values": ["soft", "hard"] } }, { @@ -72000,7 +76287,8 @@ "docstring": { "type": "array-like of shape (n_classifiers,), default=None", "description": "Sequence of weights (`float` or `int`) to weight the occurrences of\npredicted class labels (`hard` voting) or class probabilities\nbefore averaging (`soft` voting). Uses uniform weights if `None`." - } + }, + "refined_type": {} }, { "name": "n_jobs", @@ -72010,7 +76298,8 @@ "docstring": { "type": "int, default=None", "description": "The number of jobs to run in parallel for ``fit``.\n``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n``-1`` means using all processors. See :term:`Glossary `\nfor more details.\n\n.. versionadded:: 0.18" - } + }, + "refined_type": {} }, { "name": "flatten_transform", @@ -72020,7 +76309,8 @@ "docstring": { "type": "bool, default=True", "description": "Affects shape of transform output only when voting='soft'\nIf voting='soft' and flatten_transform=True, transform method returns\nmatrix with shape (n_samples, n_classifiers * n_classes). If\nflatten_transform=False, it returns\n(n_classifiers, n_samples, n_classes)." - } + }, + "refined_type": {} }, { "name": "verbose", @@ -72030,13 +76320,14 @@ "docstring": { "type": "bool, default=False", "description": "If True, the time elapsed while fitting will be printed as it\nis completed.\n\n.. 
versionadded:: 0.23" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, estimators, *, voting='hard', weights=None, n_jobs=None, flatten_transform=True, verbose=False):\n super().__init__(estimators=estimators)\n self.voting = voting\n self.weights = weights\n self.n_jobs = n_jobs\n self.flatten_transform = flatten_transform\n self.verbose = verbose" }, { @@ -72054,13 +76345,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _check_voting(self):\n if self.voting == 'hard':\n raise AttributeError(f'predict_proba is not available when voting={repr(self.voting)}')\n return True" }, { @@ -72078,7 +76370,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -72088,7 +76381,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -72112,7 +76406,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -72122,6 +76417,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "Training vectors, where `n_samples` is the number of samples and\n`n_features` is the number of features." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -72132,7 +76431,8 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "Target values." - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -72142,13 +76442,14 @@ "docstring": { "type": "array-like of shape (n_samples,), default=None", "description": "Sample weights. If None, then samples are equally weighted.\nNote that this is supported only if all underlying estimators\nsupport sample weights.\n\n.. versionadded:: 0.18" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Fit the estimators.", - "docstring": "Fit the estimators.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training vectors, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\ny : array-like of shape (n_samples,)\n Target values.\n\nsample_weight : array-like of shape (n_samples,), default=None\n Sample weights. If None, then samples are equally weighted.\n Note that this is supported only if all underlying estimators\n support sample weights.\n\n .. versionadded:: 0.18\n\nReturns\n-------\nself : object\n Returns the instance itself.", + "docstring": "Fit the estimators.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training vectors, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\n y : array-like of shape (n_samples,)\n Target values.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights. If None, then samples are equally weighted.\n Note that this is supported only if all underlying estimators\n support sample weights.\n\n .. 
versionadded:: 0.18\n\n Returns\n -------\n self : object\n Returns the instance itself.\n ", "source_code": "\ndef fit(self, X, y, sample_weight=None):\n \"\"\"Fit the estimators.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training vectors, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\n y : array-like of shape (n_samples,)\n Target values.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights. If None, then samples are equally weighted.\n Note that this is supported only if all underlying estimators\n support sample weights.\n\n .. versionadded:: 0.18\n\n Returns\n -------\n self : object\n Returns the instance itself.\n \"\"\"\n check_classification_targets(y)\n if isinstance(y, np.ndarray) and len(y.shape) > 1 and y.shape[1] > 1:\n raise NotImplementedError('Multilabel and multi-output classification is not supported.')\n if self.voting not in ('soft', 'hard'):\n raise ValueError(\"Voting must be 'soft' or 'hard'; got (voting=%r)\" % self.voting)\n self.le_ = LabelEncoder().fit(y)\n self.classes_ = self.le_.classes_\n transformed_y = self.le_.transform(y)\n return super().fit(X, transformed_y, sample_weight)" }, { @@ -72166,7 +76467,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -72176,13 +76478,17 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "The input samples." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } } ], "results": [], "is_public": true, "description": "Predict class labels for X.", - "docstring": "Predict class labels for X.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input samples.\n\nReturns\n-------\nmaj : array-like of shape (n_samples,)\n Predicted class labels.", + "docstring": "Predict class labels for X.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input samples.\n\n Returns\n -------\n maj : array-like of shape (n_samples,)\n Predicted class labels.\n ", "source_code": "\ndef predict(self, X):\n \"\"\"Predict class labels for X.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input samples.\n\n Returns\n -------\n maj : array-like of shape (n_samples,)\n Predicted class labels.\n \"\"\"\n check_is_fitted(self)\n if self.voting == 'soft':\n maj = np.argmax(self.predict_proba(X), axis=1)\n else:\n predictions = self._predict(X)\n maj = np.apply_along_axis(lambda x: np.argmax(np.bincount(x, weights=self._weights_not_none)), axis=1, arr=predictions)\n maj = self.le_.inverse_transform(maj)\n return maj" }, { @@ -72200,7 +76506,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -72210,13 +76517,17 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "The input samples." 
+ }, + "refined_type": { + "kind": "EnumType", + "values": [] } } ], "results": [], "is_public": true, "description": "Compute probabilities of possible outcomes for samples in X.", - "docstring": "Compute probabilities of possible outcomes for samples in X.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input samples.\n\nReturns\n-------\navg : array-like of shape (n_samples, n_classes)\n Weighted average probability for each class per sample.", + "docstring": "Compute probabilities of possible outcomes for samples in X.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input samples.\n\n Returns\n -------\n avg : array-like of shape (n_samples, n_classes)\n Weighted average probability for each class per sample.\n ", "source_code": "\n@available_if(_check_voting)\ndef predict_proba(self, X):\n \"\"\"Compute probabilities of possible outcomes for samples in X.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input samples.\n\n Returns\n -------\n avg : array-like of shape (n_samples, n_classes)\n Weighted average probability for each class per sample.\n \"\"\"\n check_is_fitted(self)\n avg = np.average(self._collect_probas(X), axis=0, weights=self._weights_not_none)\n return avg" }, { @@ -72234,7 +76545,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -72244,13 +76556,17 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "Training vectors, where `n_samples` is the number of samples and\n`n_features` is the number of features." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } } ], "results": [], "is_public": true, "description": "Return class labels or probabilities for X for each estimator.", - "docstring": "Return class labels or probabilities for X for each estimator.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training vectors, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\nReturns\n-------\nprobabilities_or_labels\n If `voting='soft'` and `flatten_transform=True`:\n returns ndarray of shape (n_classifiers, n_samples *\n n_classes), being class probabilities calculated by each\n classifier.\n If `voting='soft' and `flatten_transform=False`:\n ndarray of shape (n_classifiers, n_samples, n_classes)\n If `voting='hard'`:\n ndarray of shape (n_samples, n_classifiers), being\n class labels predicted by each classifier.", + "docstring": "Return class labels or probabilities for X for each estimator.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training vectors, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\n Returns\n -------\n probabilities_or_labels\n If `voting='soft'` and `flatten_transform=True`:\n returns ndarray of shape (n_classifiers, n_samples *\n n_classes), being class probabilities calculated by each\n classifier.\n If `voting='soft' and `flatten_transform=False`:\n ndarray of shape (n_classifiers, n_samples, n_classes)\n If `voting='hard'`:\n ndarray of shape (n_samples, n_classifiers), being\n class labels predicted by each classifier.\n ", "source_code": "\ndef transform(self, X):\n \"\"\"Return class labels or probabilities for X for each estimator.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of 
shape (n_samples, n_features)\n Training vectors, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\n Returns\n -------\n probabilities_or_labels\n If `voting='soft'` and `flatten_transform=True`:\n returns ndarray of shape (n_classifiers, n_samples *\n n_classes), being class probabilities calculated by each\n classifier.\n If `voting='soft' and `flatten_transform=False`:\n ndarray of shape (n_classifiers, n_samples, n_classes)\n If `voting='hard'`:\n ndarray of shape (n_samples, n_classifiers), being\n class labels predicted by each classifier.\n \"\"\"\n check_is_fitted(self)\n if self.voting == 'soft':\n probas = self._collect_probas(X)\n if not self.flatten_transform:\n return probas\n return np.hstack(probas)\n else:\n return self._predict(X)" }, { @@ -72268,7 +76584,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "estimators", @@ -72278,7 +76595,8 @@ "docstring": { "type": "list of (str, estimator) tuples", "description": "Invoking the ``fit`` method on the ``VotingRegressor`` will fit clones\nof those original estimators that will be stored in the class attribute\n``self.estimators_``. An estimator can be set to ``'drop'`` using\n``set_params``.\n\n.. versionchanged:: 0.21\n ``'drop'`` is accepted. Using None was deprecated in 0.22 and\n support was removed in 0.24." - } + }, + "refined_type": {} }, { "name": "weights", @@ -72288,7 +76606,8 @@ "docstring": { "type": "array-like of shape (n_regressors,), default=None", "description": "Sequence of weights (`float` or `int`) to weight the occurrences of\npredicted values before averaging. Uses uniform weights if `None`." - } + }, + "refined_type": {} }, { "name": "n_jobs", @@ -72298,7 +76617,8 @@ "docstring": { "type": "int, default=None", "description": "The number of jobs to run in parallel for ``fit``.\n``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n``-1`` means using all processors. See :term:`Glossary `\nfor more details." - } + }, + "refined_type": {} }, { "name": "verbose", @@ -72308,13 +76628,14 @@ "docstring": { "type": "bool, default=False", "description": "If True, the time elapsed while fitting will be printed as it\nis completed.\n\n.. versionadded:: 0.23" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, estimators, *, weights=None, n_jobs=None, verbose=False):\n super().__init__(estimators=estimators)\n self.weights = weights\n self.n_jobs = n_jobs\n self.verbose = verbose" }, { @@ -72332,7 +76653,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -72342,6 +76664,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "Training vectors, where `n_samples` is the number of samples and\n`n_features` is the number of features." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -72352,7 +76678,8 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "Target values." - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -72362,13 +76689,14 @@ "docstring": { "type": "array-like of shape (n_samples,), default=None", "description": "Sample weights. If None, then samples are equally weighted.\nNote that this is supported only if all underlying estimators\nsupport sample weights." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Fit the estimators.", - "docstring": "Fit the estimators.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training vectors, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\ny : array-like of shape (n_samples,)\n Target values.\n\nsample_weight : array-like of shape (n_samples,), default=None\n Sample weights. If None, then samples are equally weighted.\n Note that this is supported only if all underlying estimators\n support sample weights.\n\nReturns\n-------\nself : object\n Fitted estimator.", + "docstring": "Fit the estimators.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training vectors, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\n y : array-like of shape (n_samples,)\n Target values.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights. If None, then samples are equally weighted.\n Note that this is supported only if all underlying estimators\n support sample weights.\n\n Returns\n -------\n self : object\n Fitted estimator.\n ", "source_code": "\ndef fit(self, X, y, sample_weight=None):\n \"\"\"Fit the estimators.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training vectors, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\n y : array-like of shape (n_samples,)\n Target values.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights. If None, then samples are equally weighted.\n Note that this is supported only if all underlying estimators\n support sample weights.\n\n Returns\n -------\n self : object\n Fitted estimator.\n \"\"\"\n y = column_or_1d(y, warn=True)\n return super().fit(X, y, sample_weight)" }, { @@ -72386,7 +76714,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -72396,13 +76725,17 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "The input samples." 
+ }, + "refined_type": { + "kind": "EnumType", + "values": [] } } ], "results": [], "is_public": true, - "description": "Predict regression target for X.\n\nThe predicted regression target of an input sample is computed as the mean predicted regression targets of the estimators in the ensemble.", - "docstring": "Predict regression target for X.\n\nThe predicted regression target of an input sample is computed as the\nmean predicted regression targets of the estimators in the ensemble.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input samples.\n\nReturns\n-------\ny : ndarray of shape (n_samples,)\n The predicted values.", + "description": "Predict regression target for X.\n\nThe predicted regression target of an input sample is computed as the\nmean predicted regression targets of the estimators in the ensemble.", + "docstring": "Predict regression target for X.\n\n The predicted regression target of an input sample is computed as the\n mean predicted regression targets of the estimators in the ensemble.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input samples.\n\n Returns\n -------\n y : ndarray of shape (n_samples,)\n The predicted values.\n ", "source_code": "\ndef predict(self, X):\n \"\"\"Predict regression target for X.\n\n The predicted regression target of an input sample is computed as the\n mean predicted regression targets of the estimators in the ensemble.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input samples.\n\n Returns\n -------\n y : ndarray of shape (n_samples,)\n The predicted values.\n \"\"\"\n check_is_fitted(self)\n return np.average(self._predict(X), axis=1, weights=self._weights_not_none)" }, { @@ -72420,7 +76753,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -72430,13 +76764,17 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "The input samples." 
+ }, + "refined_type": { + "kind": "EnumType", + "values": [] } } ], "results": [], "is_public": true, "description": "Return predictions for X for each estimator.", - "docstring": "Return predictions for X for each estimator.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input samples.\n\nReturns\n-------\npredictions : ndarray of shape (n_samples, n_classifiers)\n Values predicted by each regressor.", + "docstring": "Return predictions for X for each estimator.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input samples.\n\n Returns\n -------\n predictions : ndarray of shape (n_samples, n_classifiers)\n Values predicted by each regressor.\n ", "source_code": "\ndef transform(self, X):\n \"\"\"Return predictions for X for each estimator.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input samples.\n\n Returns\n -------\n predictions : ndarray of shape (n_samples, n_classifiers)\n Values predicted by each regressor.\n \"\"\"\n check_is_fitted(self)\n return self._predict(X)" }, { @@ -72454,7 +76792,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "name", @@ -72464,7 +76803,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "idx", @@ -72474,7 +76814,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "total", @@ -72484,13 +76825,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _log_message(self, name, idx, total):\n if not self.verbose:\n return None\n return '(%d of %d) Processing %s' % (idx, total, name)" }, { @@ -72508,13 +76850,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _more_tags(self):\n return {'preserves_dtype': []}" }, { @@ -72532,7 +76875,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -72542,7 +76886,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -72566,13 +76911,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _sk_visual_block_(self):\n (names, estimators) = zip(*self.estimators)\n return _VisualBlock('parallel', estimators, names=names)" }, { @@ -72590,7 +76936,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -72614,7 +76961,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -72624,7 +76972,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -72634,7 +76983,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -72644,7 +76994,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -72668,7 +77019,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -72678,6 +77030,10 @@ "docstring": { "type": "{array-like, sparse matrix, 
dataframe} of shape (n_samples, n_features)", "description": "Input samples." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -72688,13 +77044,14 @@ "docstring": { "type": "ndarray of shape (n_samples,), default=None", "description": "Target values (None for unsupervised transformations)." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Return class labels or probabilities for each estimator.\n\nReturn predictions for X for each estimator.", - "docstring": "Return class labels or probabilities for each estimator.\n\nReturn predictions for X for each estimator.\n\nParameters\n----------\nX : {array-like, sparse matrix, dataframe} of shape (n_samples, n_features)\n Input samples.\n\ny : ndarray of shape (n_samples,), default=None\n Target values (None for unsupervised transformations).\n\n**fit_params : dict\n Additional fit parameters.\n\nReturns\n-------\nX_new : ndarray array of shape (n_samples, n_features_new)\n Transformed array.", + "docstring": "Return class labels or probabilities for each estimator.\n\n Return predictions for X for each estimator.\n\n Parameters\n ----------\n X : {array-like, sparse matrix, dataframe} of shape (n_samples, n_features)\n Input samples.\n\n y : ndarray of shape (n_samples,), default=None\n Target values (None for unsupervised transformations).\n\n **fit_params : dict\n Additional fit parameters.\n\n Returns\n -------\n X_new : ndarray array of shape (n_samples, n_features_new)\n Transformed array.\n ", "source_code": "\ndef fit_transform(self, X, y=None, **fit_params):\n \"\"\"Return class labels or probabilities for each estimator.\n\n Return predictions for X for each estimator.\n\n Parameters\n ----------\n X : {array-like, sparse matrix, dataframe} of shape (n_samples, n_features)\n Input samples.\n\n y : ndarray of shape (n_samples,), default=None\n Target values (None for unsupervised transformations).\n\n **fit_params : dict\n Additional fit parameters.\n\n Returns\n -------\n X_new : ndarray array of shape (n_samples, n_features_new)\n Transformed array.\n \"\"\"\n return super().fit_transform(X, y, **fit_params)" }, { @@ -72712,7 +77069,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -72736,7 +77094,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "base_estimator", @@ -72746,7 +77105,8 @@ "docstring": { "type": "object, default=None", "description": "The base estimator from which the boosted ensemble is built.\nSupport for sample weighting is required, as well as proper\n``classes_`` and ``n_classes_`` attributes. If ``None``, then\nthe base estimator is :class:`~sklearn.tree.DecisionTreeClassifier`\ninitialized with `max_depth=1`." - } + }, + "refined_type": {} }, { "name": "n_estimators", @@ -72756,7 +77116,8 @@ "docstring": { "type": "int, default=50", "description": "The maximum number of estimators at which boosting is terminated.\nIn case of perfect fit, the learning procedure is stopped early." - } + }, + "refined_type": {} }, { "name": "learning_rate", @@ -72766,7 +77127,8 @@ "docstring": { "type": "float, default=1.0", "description": "Weight applied to each classifier at each boosting iteration. A higher\nlearning rate increases the contribution of each classifier. There is\na trade-off between the `learning_rate` and `n_estimators` parameters." 
- } + }, + "refined_type": {} }, { "name": "algorithm", @@ -72776,6 +77138,10 @@ "docstring": { "type": "{'SAMME', 'SAMME.R'}, default='SAMME.R'", "description": "If 'SAMME.R' then use the SAMME.R real boosting algorithm.\n``base_estimator`` must support calculation of class probabilities.\nIf 'SAMME' then use the SAMME discrete boosting algorithm.\nThe SAMME.R algorithm typically converges faster than SAMME,\nachieving a lower test error with fewer boosting iterations." + }, + "refined_type": { + "kind": "EnumType", + "values": ["SAMME", "SAMME.R"] } }, { @@ -72786,13 +77152,14 @@ "docstring": { "type": "int, RandomState instance or None, default=None", "description": "Controls the random seed given at each `base_estimator` at each\nboosting iteration.\nThus, it is only used when `base_estimator` exposes a `random_state`.\nPass an int for reproducible output across multiple function calls.\nSee :term:`Glossary `." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, base_estimator=None, *, n_estimators=50, learning_rate=1.0, algorithm='SAMME.R', random_state=None):\n super().__init__(base_estimator=base_estimator, n_estimators=n_estimators, learning_rate=learning_rate, random_state=random_state)\n self.algorithm = algorithm" }, { @@ -72810,7 +77177,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "iboost", @@ -72820,7 +77188,8 @@ "docstring": { "type": "int", "description": "The index of the current boost iteration." - } + }, + "refined_type": {} }, { "name": "X", @@ -72830,6 +77199,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "The training input samples." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -72840,7 +77213,8 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "The target values (class labels)." - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -72850,7 +77224,8 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "The current sample weights." - } + }, + "refined_type": {} }, { "name": "random_state", @@ -72860,13 +77235,14 @@ "docstring": { "type": "RandomState instance", "description": "The RandomState instance used if the base estimator accepts a\n`random_state` attribute." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Implement a single boost.\n\nPerform a single boost according to the real multi-class SAMME.R algorithm or to the discrete SAMME algorithm and return the updated sample weights.", - "docstring": "Implement a single boost.\n\nPerform a single boost according to the real multi-class SAMME.R\nalgorithm or to the discrete SAMME algorithm and return the updated\nsample weights.\n\nParameters\n----------\niboost : int\n The index of the current boost iteration.\n\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n The training input samples.\n\ny : array-like of shape (n_samples,)\n The target values (class labels).\n\nsample_weight : array-like of shape (n_samples,)\n The current sample weights.\n\nrandom_state : RandomState instance\n The RandomState instance used if the base estimator accepts a\n `random_state` attribute.\n\nReturns\n-------\nsample_weight : array-like of shape (n_samples,) or None\n The reweighted sample weights.\n If None then boosting has terminated early.\n\nestimator_weight : float\n The weight for the current boost.\n If None then boosting has terminated early.\n\nestimator_error : float\n The classification error for the current boost.\n If None then boosting has terminated early.", + "description": "Implement a single boost.\n\nPerform a single boost according to the real multi-class SAMME.R\nalgorithm or to the discrete SAMME algorithm and return the updated\nsample weights.", + "docstring": "Implement a single boost.\n\n Perform a single boost according to the real multi-class SAMME.R\n algorithm or to the discrete SAMME algorithm and return the updated\n sample weights.\n\n Parameters\n ----------\n iboost : int\n The index of the current boost iteration.\n\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The training input samples.\n\n y : array-like of shape (n_samples,)\n The target values (class labels).\n\n sample_weight : array-like of shape (n_samples,)\n The current sample weights.\n\n random_state : RandomState instance\n The RandomState instance used if the base estimator accepts a\n `random_state` attribute.\n\n Returns\n -------\n sample_weight : array-like of shape (n_samples,) or None\n The reweighted sample weights.\n If None then boosting has terminated early.\n\n estimator_weight : float\n The weight for the current boost.\n If None then boosting has terminated early.\n\n estimator_error : float\n The classification error for the current boost.\n If None then boosting has terminated early.\n ", "source_code": "\ndef _boost(self, iboost, X, y, sample_weight, random_state):\n \"\"\"Implement a single boost.\n\n Perform a single boost according to the real multi-class SAMME.R\n algorithm or to the discrete SAMME algorithm and return the updated\n sample weights.\n\n Parameters\n ----------\n iboost : int\n The index of the current boost iteration.\n\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The training input samples.\n\n y : array-like of shape (n_samples,)\n The target values (class labels).\n\n sample_weight : array-like of shape (n_samples,)\n The current sample weights.\n\n random_state : RandomState instance\n The RandomState instance used if the base estimator accepts a\n `random_state` attribute.\n\n Returns\n -------\n sample_weight : array-like of shape (n_samples,) or None\n The reweighted sample weights.\n If None then boosting has terminated early.\n\n estimator_weight : float\n The weight 
for the current boost.\n If None then boosting has terminated early.\n\n estimator_error : float\n The classification error for the current boost.\n If None then boosting has terminated early.\n \"\"\"\n if self.algorithm == 'SAMME.R':\n return self._boost_real(iboost, X, y, sample_weight, random_state)\n else:\n return self._boost_discrete(iboost, X, y, sample_weight, random_state)" }, { @@ -72884,7 +77260,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "iboost", @@ -72894,7 +77271,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -72904,7 +77282,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -72914,7 +77293,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -72924,7 +77304,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "random_state", @@ -72934,7 +77315,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -72958,7 +77340,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "iboost", @@ -72968,7 +77351,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -72978,7 +77362,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -72988,7 +77373,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -72998,7 +77384,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "random_state", @@ -73008,7 +77395,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -73032,7 +77420,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_classes", @@ -73042,13 +77431,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Compute probabilities from the decision function.\n\nThis is based eq. (4) of [1] where: p(y=c|X) = exp((1 / K-1) f_c(X)) / sum_k(exp((1 / K-1) f_k(X))) = softmax((1 / K-1) * f(X))", - "docstring": "Compute probabilities from the decision function.\n\nThis is based eq. (4) of [1] where:\n p(y=c|X) = exp((1 / K-1) f_c(X)) / sum_k(exp((1 / K-1) f_k(X)))\n = softmax((1 / K-1) * f(X))\n\nReferences\n----------\n.. [1] J. Zhu, H. Zou, S. Rosset, T. Hastie, \"Multi-class AdaBoost\",\n 2009.", + "description": "Compute probabilities from the decision function.\n\nThis is based eq. (4) of [1] where:\n p(y=c|X) = exp((1 / K-1) f_c(X)) / sum_k(exp((1 / K-1) f_k(X)))\n = softmax((1 / K-1) * f(X))", + "docstring": "Compute probabilities from the decision function.\n\n This is based eq. (4) of [1] where:\n p(y=c|X) = exp((1 / K-1) f_c(X)) / sum_k(exp((1 / K-1) f_k(X)))\n = softmax((1 / K-1) * f(X))\n\n References\n ----------\n .. [1] J. Zhu, H. Zou, S. Rosset, T. Hastie, \"Multi-class AdaBoost\",\n 2009.\n ", "source_code": "\n@staticmethod\ndef _compute_proba_from_decision(decision, n_classes):\n \"\"\"Compute probabilities from the decision function.\n\n This is based eq. (4) of [1] where:\n p(y=c|X) = exp((1 / K-1) f_c(X)) / sum_k(exp((1 / K-1) f_k(X)))\n = softmax((1 / K-1) * f(X))\n\n References\n ----------\n .. [1] J. Zhu, H. Zou, S. Rosset, T. 
Hastie, \"Multi-class AdaBoost\",\n 2009.\n \"\"\"\n if n_classes == 2:\n decision = np.vstack([-decision, decision]).T / 2\n else:\n decision /= n_classes - 1\n return softmax(decision, copy=False)" }, { @@ -73066,7 +77456,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -73090,7 +77481,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -73100,13 +77492,17 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "The training input samples. Sparse matrix can be CSC, CSR, COO,\nDOK, or LIL. COO, DOK, and LIL are converted to CSR." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } } ], "results": [], "is_public": true, "description": "Compute the decision function of ``X``.", - "docstring": "Compute the decision function of ``X``.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n The training input samples. Sparse matrix can be CSC, CSR, COO,\n DOK, or LIL. COO, DOK, and LIL are converted to CSR.\n\nReturns\n-------\nscore : ndarray of shape of (n_samples, k)\n The decision function of the input samples. The order of\n outputs is the same of that of the :term:`classes_` attribute.\n Binary classification is a special cases with ``k == 1``,\n otherwise ``k==n_classes``. For binary classification,\n values closer to -1 or 1 mean more like the first or second\n class in ``classes_``, respectively.", + "docstring": "Compute the decision function of ``X``.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The training input samples. Sparse matrix can be CSC, CSR, COO,\n DOK, or LIL. COO, DOK, and LIL are converted to CSR.\n\n Returns\n -------\n score : ndarray of shape of (n_samples, k)\n The decision function of the input samples. The order of\n outputs is the same of that of the :term:`classes_` attribute.\n Binary classification is a special cases with ``k == 1``,\n otherwise ``k==n_classes``. For binary classification,\n values closer to -1 or 1 mean more like the first or second\n class in ``classes_``, respectively.\n ", "source_code": "\ndef decision_function(self, X):\n \"\"\"Compute the decision function of ``X``.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The training input samples. Sparse matrix can be CSC, CSR, COO,\n DOK, or LIL. COO, DOK, and LIL are converted to CSR.\n\n Returns\n -------\n score : ndarray of shape of (n_samples, k)\n The decision function of the input samples. The order of\n outputs is the same of that of the :term:`classes_` attribute.\n Binary classification is a special cases with ``k == 1``,\n otherwise ``k==n_classes``. 
For binary classification,\n values closer to -1 or 1 mean more like the first or second\n class in ``classes_``, respectively.\n \"\"\"\n check_is_fitted(self)\n X = self._check_X(X)\n n_classes = self.n_classes_\n classes = self.classes_[:, np.newaxis]\n if self.algorithm == 'SAMME.R':\n pred = sum((_samme_proba(estimator, n_classes, X) for estimator in self.estimators_))\n else:\n pred = sum(((estimator.predict(X) == classes).T * w for (estimator, w) in zip(self.estimators_, self.estimator_weights_)))\n pred /= self.estimator_weights_.sum()\n if n_classes == 2:\n pred[:, 0] *= -1\n return pred.sum(axis=1)\n return pred" }, { @@ -73124,7 +77520,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -73134,6 +77531,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "The training input samples. Sparse matrix can be CSC, CSR, COO,\nDOK, or LIL. COO, DOK, and LIL are converted to CSR." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -73144,7 +77545,8 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "The target values (class labels)." - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -73154,13 +77556,14 @@ "docstring": { "type": "array-like of shape (n_samples,), default=None", "description": "Sample weights. If None, the sample weights are initialized to\n``1 / n_samples``." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Build a boosted classifier from the training set (X, y).", - "docstring": "Build a boosted classifier from the training set (X, y).\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n The training input samples. Sparse matrix can be CSC, CSR, COO,\n DOK, or LIL. COO, DOK, and LIL are converted to CSR.\n\ny : array-like of shape (n_samples,)\n The target values (class labels).\n\nsample_weight : array-like of shape (n_samples,), default=None\n Sample weights. If None, the sample weights are initialized to\n ``1 / n_samples``.\n\nReturns\n-------\nself : object\n Fitted estimator.", + "docstring": "Build a boosted classifier from the training set (X, y).\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The training input samples. Sparse matrix can be CSC, CSR, COO,\n DOK, or LIL. COO, DOK, and LIL are converted to CSR.\n\n y : array-like of shape (n_samples,)\n The target values (class labels).\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights. If None, the sample weights are initialized to\n ``1 / n_samples``.\n\n Returns\n -------\n self : object\n Fitted estimator.\n ", "source_code": "\ndef fit(self, X, y, sample_weight=None):\n \"\"\"Build a boosted classifier from the training set (X, y).\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The training input samples. Sparse matrix can be CSC, CSR, COO,\n DOK, or LIL. COO, DOK, and LIL are converted to CSR.\n\n y : array-like of shape (n_samples,)\n The target values (class labels).\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights. 
If None, the sample weights are initialized to\n ``1 / n_samples``.\n\n Returns\n -------\n self : object\n Fitted estimator.\n \"\"\"\n if self.algorithm not in ('SAMME', 'SAMME.R'):\n raise ValueError('algorithm %s is not supported' % self.algorithm)\n return super().fit(X, y, sample_weight)" }, { @@ -73178,7 +77581,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -73188,13 +77592,17 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "The training input samples. Sparse matrix can be CSC, CSR, COO,\nDOK, or LIL. COO, DOK, and LIL are converted to CSR." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } } ], "results": [], "is_public": true, - "description": "Predict classes for X.\n\nThe predicted class of an input sample is computed as the weighted mean prediction of the classifiers in the ensemble.", - "docstring": "Predict classes for X.\n\nThe predicted class of an input sample is computed as the weighted mean\nprediction of the classifiers in the ensemble.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n The training input samples. Sparse matrix can be CSC, CSR, COO,\n DOK, or LIL. COO, DOK, and LIL are converted to CSR.\n\nReturns\n-------\ny : ndarray of shape (n_samples,)\n The predicted classes.", + "description": "Predict classes for X.\n\nThe predicted class of an input sample is computed as the weighted mean\nprediction of the classifiers in the ensemble.", + "docstring": "Predict classes for X.\n\n The predicted class of an input sample is computed as the weighted mean\n prediction of the classifiers in the ensemble.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The training input samples. Sparse matrix can be CSC, CSR, COO,\n DOK, or LIL. COO, DOK, and LIL are converted to CSR.\n\n Returns\n -------\n y : ndarray of shape (n_samples,)\n The predicted classes.\n ", "source_code": "\ndef predict(self, X):\n \"\"\"Predict classes for X.\n\n The predicted class of an input sample is computed as the weighted mean\n prediction of the classifiers in the ensemble.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The training input samples. Sparse matrix can be CSC, CSR, COO,\n DOK, or LIL. COO, DOK, and LIL are converted to CSR.\n\n Returns\n -------\n y : ndarray of shape (n_samples,)\n The predicted classes.\n \"\"\"\n pred = self.decision_function(X)\n if self.n_classes_ == 2:\n return self.classes_.take(pred > 0, axis=0)\n return self.classes_.take(np.argmax(pred, axis=1), axis=0)" }, { @@ -73212,7 +77620,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -73222,13 +77631,17 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "The training input samples. Sparse matrix can be CSC, CSR, COO,\nDOK, or LIL. COO, DOK, and LIL are converted to CSR." 
+ }, + "refined_type": { + "kind": "EnumType", + "values": [] } } ], "results": [], "is_public": true, - "description": "Predict class log-probabilities for X.\n\nThe predicted class log-probabilities of an input sample is computed as the weighted mean predicted class log-probabilities of the classifiers in the ensemble.", - "docstring": "Predict class log-probabilities for X.\n\nThe predicted class log-probabilities of an input sample is computed as\nthe weighted mean predicted class log-probabilities of the classifiers\nin the ensemble.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n The training input samples. Sparse matrix can be CSC, CSR, COO,\n DOK, or LIL. COO, DOK, and LIL are converted to CSR.\n\nReturns\n-------\np : ndarray of shape (n_samples, n_classes)\n The class probabilities of the input samples. The order of\n outputs is the same of that of the :term:`classes_` attribute.", + "description": "Predict class log-probabilities for X.\n\nThe predicted class log-probabilities of an input sample is computed as\nthe weighted mean predicted class log-probabilities of the classifiers\nin the ensemble.", + "docstring": "Predict class log-probabilities for X.\n\n The predicted class log-probabilities of an input sample is computed as\n the weighted mean predicted class log-probabilities of the classifiers\n in the ensemble.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The training input samples. Sparse matrix can be CSC, CSR, COO,\n DOK, or LIL. COO, DOK, and LIL are converted to CSR.\n\n Returns\n -------\n p : ndarray of shape (n_samples, n_classes)\n The class probabilities of the input samples. The order of\n outputs is the same of that of the :term:`classes_` attribute.\n ", "source_code": "\ndef predict_log_proba(self, X):\n \"\"\"Predict class log-probabilities for X.\n\n The predicted class log-probabilities of an input sample is computed as\n the weighted mean predicted class log-probabilities of the classifiers\n in the ensemble.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The training input samples. Sparse matrix can be CSC, CSR, COO,\n DOK, or LIL. COO, DOK, and LIL are converted to CSR.\n\n Returns\n -------\n p : ndarray of shape (n_samples, n_classes)\n The class probabilities of the input samples. The order of\n outputs is the same of that of the :term:`classes_` attribute.\n \"\"\"\n return np.log(self.predict_proba(X))" }, { @@ -73246,7 +77659,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -73256,13 +77670,17 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "The training input samples. Sparse matrix can be CSC, CSR, COO,\nDOK, or LIL. COO, DOK, and LIL are converted to CSR." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } } ], "results": [], "is_public": true, - "description": "Predict class probabilities for X.\n\nThe predicted class probabilities of an input sample is computed as the weighted mean predicted class probabilities of the classifiers in the ensemble.", - "docstring": "Predict class probabilities for X.\n\nThe predicted class probabilities of an input sample is computed as\nthe weighted mean predicted class probabilities of the classifiers\nin the ensemble.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n The training input samples. 
Sparse matrix can be CSC, CSR, COO,\n DOK, or LIL. COO, DOK, and LIL are converted to CSR.\n\nReturns\n-------\np : ndarray of shape (n_samples, n_classes)\n The class probabilities of the input samples. The order of\n outputs is the same of that of the :term:`classes_` attribute.", + "description": "Predict class probabilities for X.\n\nThe predicted class probabilities of an input sample is computed as\nthe weighted mean predicted class probabilities of the classifiers\nin the ensemble.", + "docstring": "Predict class probabilities for X.\n\n The predicted class probabilities of an input sample is computed as\n the weighted mean predicted class probabilities of the classifiers\n in the ensemble.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The training input samples. Sparse matrix can be CSC, CSR, COO,\n DOK, or LIL. COO, DOK, and LIL are converted to CSR.\n\n Returns\n -------\n p : ndarray of shape (n_samples, n_classes)\n The class probabilities of the input samples. The order of\n outputs is the same of that of the :term:`classes_` attribute.\n ", "source_code": "\ndef predict_proba(self, X):\n \"\"\"Predict class probabilities for X.\n\n The predicted class probabilities of an input sample is computed as\n the weighted mean predicted class probabilities of the classifiers\n in the ensemble.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The training input samples. Sparse matrix can be CSC, CSR, COO,\n DOK, or LIL. COO, DOK, and LIL are converted to CSR.\n\n Returns\n -------\n p : ndarray of shape (n_samples, n_classes)\n The class probabilities of the input samples. The order of\n outputs is the same of that of the :term:`classes_` attribute.\n \"\"\"\n check_is_fitted(self)\n n_classes = self.n_classes_\n if n_classes == 1:\n return np.ones((_num_samples(X), 1))\n decision = self.decision_function(X)\n return self._compute_proba_from_decision(decision, n_classes)" }, { @@ -73280,7 +77698,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -73290,13 +77709,17 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "The training input samples. Sparse matrix can be CSC, CSR, COO,\nDOK, or LIL. COO, DOK, and LIL are converted to CSR." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } } ], "results": [], "is_public": true, - "description": "Compute decision function of ``X`` for each boosting iteration.\n\nThis method allows monitoring (i.e. determine error on testing set) after each boosting iteration.", - "docstring": "Compute decision function of ``X`` for each boosting iteration.\n\nThis method allows monitoring (i.e. determine error on testing set)\nafter each boosting iteration.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n The training input samples. Sparse matrix can be CSC, CSR, COO,\n DOK, or LIL. COO, DOK, and LIL are converted to CSR.\n\nYields\n------\nscore : generator of ndarray of shape (n_samples, k)\n The decision function of the input samples. The order of\n outputs is the same of that of the :term:`classes_` attribute.\n Binary classification is a special cases with ``k == 1``,\n otherwise ``k==n_classes``. 
For binary classification,\n values closer to -1 or 1 mean more like the first or second\n class in ``classes_``, respectively.", + "description": "Compute decision function of ``X`` for each boosting iteration.\n\nThis method allows monitoring (i.e. determine error on testing set)\nafter each boosting iteration.", + "docstring": "Compute decision function of ``X`` for each boosting iteration.\n\n This method allows monitoring (i.e. determine error on testing set)\n after each boosting iteration.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The training input samples. Sparse matrix can be CSC, CSR, COO,\n DOK, or LIL. COO, DOK, and LIL are converted to CSR.\n\n Yields\n ------\n score : generator of ndarray of shape (n_samples, k)\n The decision function of the input samples. The order of\n outputs is the same of that of the :term:`classes_` attribute.\n Binary classification is a special cases with ``k == 1``,\n otherwise ``k==n_classes``. For binary classification,\n values closer to -1 or 1 mean more like the first or second\n class in ``classes_``, respectively.\n ", "source_code": "\ndef staged_decision_function(self, X):\n \"\"\"Compute decision function of ``X`` for each boosting iteration.\n\n This method allows monitoring (i.e. determine error on testing set)\n after each boosting iteration.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The training input samples. Sparse matrix can be CSC, CSR, COO,\n DOK, or LIL. COO, DOK, and LIL are converted to CSR.\n\n Yields\n ------\n score : generator of ndarray of shape (n_samples, k)\n The decision function of the input samples. The order of\n outputs is the same of that of the :term:`classes_` attribute.\n Binary classification is a special cases with ``k == 1``,\n otherwise ``k==n_classes``. For binary classification,\n values closer to -1 or 1 mean more like the first or second\n class in ``classes_``, respectively.\n \"\"\"\n check_is_fitted(self)\n X = self._check_X(X)\n n_classes = self.n_classes_\n classes = self.classes_[:, np.newaxis]\n pred = None\n norm = 0.0\n for (weight, estimator) in zip(self.estimator_weights_, self.estimators_):\n norm += weight\n if self.algorithm == 'SAMME.R':\n current_pred = _samme_proba(estimator, n_classes, X)\n else:\n current_pred = estimator.predict(X)\n current_pred = (current_pred == classes).T * weight\n if pred is None:\n pred = current_pred\n else:\n pred += current_pred\n if n_classes == 2:\n tmp_pred = np.copy(pred)\n tmp_pred[:, 0] *= -1\n yield (tmp_pred / norm).sum(axis=1)\n else:\n yield pred / norm" }, { @@ -73314,7 +77737,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -73324,13 +77748,14 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "The input samples. Sparse matrix can be CSC, CSR, COO,\nDOK, or LIL. COO, DOK, and LIL are converted to CSR." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Return staged predictions for X.\n\nThe predicted class of an input sample is computed as the weighted mean prediction of the classifiers in the ensemble. 
This generator method yields the ensemble prediction after each iteration of boosting and therefore allows monitoring, such as to determine the prediction on a test set after each boost.", - "docstring": "Return staged predictions for X.\n\nThe predicted class of an input sample is computed as the weighted mean\nprediction of the classifiers in the ensemble.\n\nThis generator method yields the ensemble prediction after each\niteration of boosting and therefore allows monitoring, such as to\ndetermine the prediction on a test set after each boost.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n The input samples. Sparse matrix can be CSC, CSR, COO,\n DOK, or LIL. COO, DOK, and LIL are converted to CSR.\n\nYields\n------\ny : generator of ndarray of shape (n_samples,)\n The predicted classes.", + "description": "Return staged predictions for X.\n\nThe predicted class of an input sample is computed as the weighted mean\nprediction of the classifiers in the ensemble.\n\nThis generator method yields the ensemble prediction after each\niteration of boosting and therefore allows monitoring, such as to\ndetermine the prediction on a test set after each boost.", + "docstring": "Return staged predictions for X.\n\n The predicted class of an input sample is computed as the weighted mean\n prediction of the classifiers in the ensemble.\n\n This generator method yields the ensemble prediction after each\n iteration of boosting and therefore allows monitoring, such as to\n determine the prediction on a test set after each boost.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The input samples. Sparse matrix can be CSC, CSR, COO,\n DOK, or LIL. COO, DOK, and LIL are converted to CSR.\n\n Yields\n ------\n y : generator of ndarray of shape (n_samples,)\n The predicted classes.\n ", "source_code": "\ndef staged_predict(self, X):\n \"\"\"Return staged predictions for X.\n\n The predicted class of an input sample is computed as the weighted mean\n prediction of the classifiers in the ensemble.\n\n This generator method yields the ensemble prediction after each\n iteration of boosting and therefore allows monitoring, such as to\n determine the prediction on a test set after each boost.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The input samples. Sparse matrix can be CSC, CSR, COO,\n DOK, or LIL. COO, DOK, and LIL are converted to CSR.\n\n Yields\n ------\n y : generator of ndarray of shape (n_samples,)\n The predicted classes.\n \"\"\"\n X = self._check_X(X)\n n_classes = self.n_classes_\n classes = self.classes_\n if n_classes == 2:\n for pred in self.staged_decision_function(X):\n yield np.array(classes.take(pred > 0, axis=0))\n else:\n for pred in self.staged_decision_function(X):\n yield np.array(classes.take(np.argmax(pred, axis=1), axis=0))" }, { @@ -73348,7 +77773,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -73358,13 +77784,17 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "The training input samples. Sparse matrix can be CSC, CSR, COO,\nDOK, or LIL. COO, DOK, and LIL are converted to CSR." 
+ }, + "refined_type": { + "kind": "EnumType", + "values": [] } } ], "results": [], "is_public": true, - "description": "Predict class probabilities for X.\n\nThe predicted class probabilities of an input sample is computed as the weighted mean predicted class probabilities of the classifiers in the ensemble. This generator method yields the ensemble predicted class probabilities after each iteration of boosting and therefore allows monitoring, such as to determine the predicted class probabilities on a test set after each boost.", - "docstring": "Predict class probabilities for X.\n\nThe predicted class probabilities of an input sample is computed as\nthe weighted mean predicted class probabilities of the classifiers\nin the ensemble.\n\nThis generator method yields the ensemble predicted class probabilities\nafter each iteration of boosting and therefore allows monitoring, such\nas to determine the predicted class probabilities on a test set after\neach boost.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n The training input samples. Sparse matrix can be CSC, CSR, COO,\n DOK, or LIL. COO, DOK, and LIL are converted to CSR.\n\nYields\n------\np : generator of ndarray of shape (n_samples,)\n The class probabilities of the input samples. The order of\n outputs is the same of that of the :term:`classes_` attribute.", + "description": "Predict class probabilities for X.\n\nThe predicted class probabilities of an input sample is computed as\nthe weighted mean predicted class probabilities of the classifiers\nin the ensemble.\n\nThis generator method yields the ensemble predicted class probabilities\nafter each iteration of boosting and therefore allows monitoring, such\nas to determine the predicted class probabilities on a test set after\neach boost.", + "docstring": "Predict class probabilities for X.\n\n The predicted class probabilities of an input sample is computed as\n the weighted mean predicted class probabilities of the classifiers\n in the ensemble.\n\n This generator method yields the ensemble predicted class probabilities\n after each iteration of boosting and therefore allows monitoring, such\n as to determine the predicted class probabilities on a test set after\n each boost.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The training input samples. Sparse matrix can be CSC, CSR, COO,\n DOK, or LIL. COO, DOK, and LIL are converted to CSR.\n\n Yields\n ------\n p : generator of ndarray of shape (n_samples,)\n The class probabilities of the input samples. The order of\n outputs is the same of that of the :term:`classes_` attribute.\n ", "source_code": "\ndef staged_predict_proba(self, X):\n \"\"\"Predict class probabilities for X.\n\n The predicted class probabilities of an input sample is computed as\n the weighted mean predicted class probabilities of the classifiers\n in the ensemble.\n\n This generator method yields the ensemble predicted class probabilities\n after each iteration of boosting and therefore allows monitoring, such\n as to determine the predicted class probabilities on a test set after\n each boost.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The training input samples. Sparse matrix can be CSC, CSR, COO,\n DOK, or LIL. COO, DOK, and LIL are converted to CSR.\n\n Yields\n ------\n p : generator of ndarray of shape (n_samples,)\n The class probabilities of the input samples. 
The order of\n outputs is the same of that of the :term:`classes_` attribute.\n \"\"\"\n n_classes = self.n_classes_\n for decision in self.staged_decision_function(X):\n yield self._compute_proba_from_decision(decision, n_classes)" }, { @@ -73382,7 +77812,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "base_estimator", @@ -73392,7 +77823,8 @@ "docstring": { "type": "object, default=None", "description": "The base estimator from which the boosted ensemble is built.\nIf ``None``, then the base estimator is\n:class:`~sklearn.tree.DecisionTreeRegressor` initialized with\n`max_depth=3`." - } + }, + "refined_type": {} }, { "name": "n_estimators", @@ -73402,7 +77834,8 @@ "docstring": { "type": "int, default=50", "description": "The maximum number of estimators at which boosting is terminated.\nIn case of perfect fit, the learning procedure is stopped early." - } + }, + "refined_type": {} }, { "name": "learning_rate", @@ -73412,7 +77845,8 @@ "docstring": { "type": "float, default=1.0", "description": "Weight applied to each regressor at each boosting iteration. A higher\nlearning rate increases the contribution of each regressor. There is\na trade-off between the `learning_rate` and `n_estimators` parameters." - } + }, + "refined_type": {} }, { "name": "loss", @@ -73422,6 +77856,10 @@ "docstring": { "type": "{'linear', 'square', 'exponential'}, default='linear'", "description": "The loss function to use when updating the weights after each\nboosting iteration." + }, + "refined_type": { + "kind": "EnumType", + "values": ["exponential", "linear", "square"] } }, { @@ -73432,13 +77870,14 @@ "docstring": { "type": "int, RandomState instance or None, default=None", "description": "Controls the random seed given at each `base_estimator` at each\nboosting iteration.\nThus, it is only used when `base_estimator` exposes a `random_state`.\nIn addition, it controls the bootstrap of the weights used to train the\n`base_estimator` at each boosting iteration.\nPass an int for reproducible output across multiple function calls.\nSee :term:`Glossary `." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, base_estimator=None, *, n_estimators=50, learning_rate=1.0, loss='linear', random_state=None):\n super().__init__(base_estimator=base_estimator, n_estimators=n_estimators, learning_rate=learning_rate, random_state=random_state)\n self.loss = loss\n self.random_state = random_state" }, { @@ -73456,7 +77895,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "iboost", @@ -73466,7 +77906,8 @@ "docstring": { "type": "int", "description": "The index of the current boost iteration." - } + }, + "refined_type": {} }, { "name": "X", @@ -73476,6 +77917,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "The training input samples." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -73486,7 +77931,8 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "The target values (class labels in classification, real numbers in\nregression)." - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -73496,7 +77942,8 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "The current sample weights." 
- } + }, + "refined_type": {} }, { "name": "random_state", @@ -73506,13 +77953,14 @@ "docstring": { "type": "RandomState", "description": "The RandomState instance used if the base estimator accepts a\n`random_state` attribute.\nControls also the bootstrap of the weights used to train the weak\nlearner.\nreplacement." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Implement a single boost for regression\n\nPerform a single boost according to the AdaBoost.R2 algorithm and return the updated sample weights.", - "docstring": "Implement a single boost for regression\n\nPerform a single boost according to the AdaBoost.R2 algorithm and\nreturn the updated sample weights.\n\nParameters\n----------\niboost : int\n The index of the current boost iteration.\n\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n The training input samples.\n\ny : array-like of shape (n_samples,)\n The target values (class labels in classification, real numbers in\n regression).\n\nsample_weight : array-like of shape (n_samples,)\n The current sample weights.\n\nrandom_state : RandomState\n The RandomState instance used if the base estimator accepts a\n `random_state` attribute.\n Controls also the bootstrap of the weights used to train the weak\n learner.\n replacement.\n\nReturns\n-------\nsample_weight : array-like of shape (n_samples,) or None\n The reweighted sample weights.\n If None then boosting has terminated early.\n\nestimator_weight : float\n The weight for the current boost.\n If None then boosting has terminated early.\n\nestimator_error : float\n The regression error for the current boost.\n If None then boosting has terminated early.", + "description": "Implement a single boost for regression\n\nPerform a single boost according to the AdaBoost.R2 algorithm and\nreturn the updated sample weights.", + "docstring": "Implement a single boost for regression\n\n Perform a single boost according to the AdaBoost.R2 algorithm and\n return the updated sample weights.\n\n Parameters\n ----------\n iboost : int\n The index of the current boost iteration.\n\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The training input samples.\n\n y : array-like of shape (n_samples,)\n The target values (class labels in classification, real numbers in\n regression).\n\n sample_weight : array-like of shape (n_samples,)\n The current sample weights.\n\n random_state : RandomState\n The RandomState instance used if the base estimator accepts a\n `random_state` attribute.\n Controls also the bootstrap of the weights used to train the weak\n learner.\n replacement.\n\n Returns\n -------\n sample_weight : array-like of shape (n_samples,) or None\n The reweighted sample weights.\n If None then boosting has terminated early.\n\n estimator_weight : float\n The weight for the current boost.\n If None then boosting has terminated early.\n\n estimator_error : float\n The regression error for the current boost.\n If None then boosting has terminated early.\n ", "source_code": "\ndef _boost(self, iboost, X, y, sample_weight, random_state):\n \"\"\"Implement a single boost for regression\n\n Perform a single boost according to the AdaBoost.R2 algorithm and\n return the updated sample weights.\n\n Parameters\n ----------\n iboost : int\n The index of the current boost iteration.\n\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The training input samples.\n\n y : array-like of shape (n_samples,)\n The target values (class labels in 
classification, real numbers in\n regression).\n\n sample_weight : array-like of shape (n_samples,)\n The current sample weights.\n\n random_state : RandomState\n The RandomState instance used if the base estimator accepts a\n `random_state` attribute.\n Controls also the bootstrap of the weights used to train the weak\n learner.\n replacement.\n\n Returns\n -------\n sample_weight : array-like of shape (n_samples,) or None\n The reweighted sample weights.\n If None then boosting has terminated early.\n\n estimator_weight : float\n The weight for the current boost.\n If None then boosting has terminated early.\n\n estimator_error : float\n The regression error for the current boost.\n If None then boosting has terminated early.\n \"\"\"\n estimator = self._make_estimator(random_state=random_state)\n bootstrap_idx = random_state.choice(np.arange(_num_samples(X)), size=_num_samples(X), replace=True, p=sample_weight)\n X_ = _safe_indexing(X, bootstrap_idx)\n y_ = _safe_indexing(y, bootstrap_idx)\n estimator.fit(X_, y_)\n y_predict = estimator.predict(X)\n error_vect = np.abs(y_predict - y)\n sample_mask = sample_weight > 0\n masked_sample_weight = sample_weight[sample_mask]\n masked_error_vector = error_vect[sample_mask]\n error_max = masked_error_vector.max()\n if error_max != 0:\n masked_error_vector /= error_max\n if self.loss == 'square':\n masked_error_vector **= 2\n elif self.loss == 'exponential':\n masked_error_vector = 1.0 - np.exp(-masked_error_vector)\n estimator_error = (masked_sample_weight * masked_error_vector).sum()\n if estimator_error <= 0:\n return sample_weight, 1.0, 0.0\n elif estimator_error >= 0.5:\n if len(self.estimators_) > 1:\n self.estimators_.pop(-1)\n return None, None, None\n beta = estimator_error / (1.0 - estimator_error)\n estimator_weight = self.learning_rate * np.log(1.0 / beta)\n if not iboost == self.n_estimators - 1:\n sample_weight[sample_mask] *= np.power(beta, (1.0 - masked_error_vector) * self.learning_rate)\n return sample_weight, estimator_weight, estimator_error" }, { @@ -73530,7 +77978,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -73540,7 +77989,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "limit", @@ -73550,13 +78000,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _get_median_predict(self, X, limit):\n predictions = np.array([est.predict(X) for est in self.estimators_[:limit]]).T\n sorted_idx = np.argsort(predictions, axis=1)\n weight_cdf = stable_cumsum(self.estimator_weights_[sorted_idx], axis=1)\n median_or_above = weight_cdf >= 0.5 * weight_cdf[:, -1][:, np.newaxis]\n median_idx = median_or_above.argmax(axis=1)\n median_estimators = sorted_idx[np.arange(_num_samples(X)), median_idx]\n return predictions[np.arange(_num_samples(X)), median_estimators]" }, { @@ -73574,7 +78025,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -73598,7 +78050,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -73608,6 +78061,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "The training input samples. Sparse matrix can be CSC, CSR, COO,\nDOK, or LIL. COO, DOK, and LIL are converted to CSR." 
+ }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -73618,7 +78075,8 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "The target values (real numbers)." - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -73628,13 +78086,14 @@ "docstring": { "type": "array-like of shape (n_samples,), default=None", "description": "Sample weights. If None, the sample weights are initialized to\n1 / n_samples." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Build a boosted regressor from the training set (X, y).", - "docstring": "Build a boosted regressor from the training set (X, y).\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n The training input samples. Sparse matrix can be CSC, CSR, COO,\n DOK, or LIL. COO, DOK, and LIL are converted to CSR.\n\ny : array-like of shape (n_samples,)\n The target values (real numbers).\n\nsample_weight : array-like of shape (n_samples,), default=None\n Sample weights. If None, the sample weights are initialized to\n 1 / n_samples.\n\nReturns\n-------\nself : object\n Fitted AdaBoostRegressor estimator.", + "docstring": "Build a boosted regressor from the training set (X, y).\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The training input samples. Sparse matrix can be CSC, CSR, COO,\n DOK, or LIL. COO, DOK, and LIL are converted to CSR.\n\n y : array-like of shape (n_samples,)\n The target values (real numbers).\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights. If None, the sample weights are initialized to\n 1 / n_samples.\n\n Returns\n -------\n self : object\n Fitted AdaBoostRegressor estimator.\n ", "source_code": "\ndef fit(self, X, y, sample_weight=None):\n \"\"\"Build a boosted regressor from the training set (X, y).\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The training input samples. Sparse matrix can be CSC, CSR, COO,\n DOK, or LIL. COO, DOK, and LIL are converted to CSR.\n\n y : array-like of shape (n_samples,)\n The target values (real numbers).\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights. If None, the sample weights are initialized to\n 1 / n_samples.\n\n Returns\n -------\n self : object\n Fitted AdaBoostRegressor estimator.\n \"\"\"\n if self.loss not in ('linear', 'square', 'exponential'):\n raise ValueError(\"loss must be 'linear', 'square', or 'exponential'\")\n return super().fit(X, y, sample_weight)" }, { @@ -73652,7 +78111,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -73662,13 +78122,17 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "The training input samples. Sparse matrix can be CSC, CSR, COO,\nDOK, or LIL. COO, DOK, and LIL are converted to CSR." 
+ }, + "refined_type": { + "kind": "EnumType", + "values": [] } } ], "results": [], "is_public": true, - "description": "Predict regression value for X.\n\nThe predicted regression value of an input sample is computed as the weighted median prediction of the regressors in the ensemble.", - "docstring": "Predict regression value for X.\n\nThe predicted regression value of an input sample is computed\nas the weighted median prediction of the regressors in the ensemble.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n The training input samples. Sparse matrix can be CSC, CSR, COO,\n DOK, or LIL. COO, DOK, and LIL are converted to CSR.\n\nReturns\n-------\ny : ndarray of shape (n_samples,)\n The predicted regression values.", + "description": "Predict regression value for X.\n\nThe predicted regression value of an input sample is computed\nas the weighted median prediction of the regressors in the ensemble.", + "docstring": "Predict regression value for X.\n\n The predicted regression value of an input sample is computed\n as the weighted median prediction of the regressors in the ensemble.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The training input samples. Sparse matrix can be CSC, CSR, COO,\n DOK, or LIL. COO, DOK, and LIL are converted to CSR.\n\n Returns\n -------\n y : ndarray of shape (n_samples,)\n The predicted regression values.\n ", "source_code": "\ndef predict(self, X):\n \"\"\"Predict regression value for X.\n\n The predicted regression value of an input sample is computed\n as the weighted median prediction of the regressors in the ensemble.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The training input samples. Sparse matrix can be CSC, CSR, COO,\n DOK, or LIL. COO, DOK, and LIL are converted to CSR.\n\n Returns\n -------\n y : ndarray of shape (n_samples,)\n The predicted regression values.\n \"\"\"\n check_is_fitted(self)\n X = self._check_X(X)\n return self._get_median_predict(X, len(self.estimators_))" }, { @@ -73686,7 +78150,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -73696,13 +78161,17 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "The training input samples." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } } ], "results": [], "is_public": true, - "description": "Return staged predictions for X.\n\nThe predicted regression value of an input sample is computed as the weighted median prediction of the regressors in the ensemble. 
This generator method yields the ensemble prediction after each iteration of boosting and therefore allows monitoring, such as to determine the prediction on a test set after each boost.", - "docstring": "Return staged predictions for X.\n\nThe predicted regression value of an input sample is computed\nas the weighted median prediction of the regressors in the ensemble.\n\nThis generator method yields the ensemble prediction after each\niteration of boosting and therefore allows monitoring, such as to\ndetermine the prediction on a test set after each boost.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n The training input samples.\n\nYields\n-------\ny : generator of ndarray of shape (n_samples,)\n The predicted regression values.", + "description": "Return staged predictions for X.\n\nThe predicted regression value of an input sample is computed\nas the weighted median prediction of the regressors in the ensemble.\n\nThis generator method yields the ensemble prediction after each\niteration of boosting and therefore allows monitoring, such as to\ndetermine the prediction on a test set after each boost.", + "docstring": "Return staged predictions for X.\n\n The predicted regression value of an input sample is computed\n as the weighted median prediction of the regressors in the ensemble.\n\n This generator method yields the ensemble prediction after each\n iteration of boosting and therefore allows monitoring, such as to\n determine the prediction on a test set after each boost.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The training input samples.\n\n Yields\n -------\n y : generator of ndarray of shape (n_samples,)\n The predicted regression values.\n ", "source_code": "\ndef staged_predict(self, X):\n \"\"\"Return staged predictions for X.\n\n The predicted regression value of an input sample is computed\n as the weighted median prediction of the regressors in the ensemble.\n\n This generator method yields the ensemble prediction after each\n iteration of boosting and therefore allows monitoring, such as to\n determine the prediction on a test set after each boost.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The training input samples.\n\n Yields\n -------\n y : generator of ndarray of shape (n_samples,)\n The predicted regression values.\n \"\"\"\n check_is_fitted(self)\n X = self._check_X(X)\n for (i, _) in enumerate(self.estimators_, 1):\n yield self._get_median_predict(X, limit=i)" }, { @@ -73720,7 +78189,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "base_estimator", @@ -73730,7 +78200,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_estimators", @@ -73740,7 +78211,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "estimator_params", @@ -73750,7 +78222,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "learning_rate", @@ -73760,7 +78233,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "random_state", @@ -73770,13 +78244,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@abstractmethod\ndef __init__(self, base_estimator=None, *, n_estimators=50, 
estimator_params=tuple(), learning_rate=1.0, random_state=None):\n super().__init__(base_estimator=base_estimator, n_estimators=n_estimators, estimator_params=estimator_params)\n self.learning_rate = learning_rate\n self.random_state = random_state" }, { @@ -73794,7 +78269,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "iboost", @@ -73804,7 +78280,8 @@ "docstring": { "type": "int", "description": "The index of the current boost iteration." - } + }, + "refined_type": {} }, { "name": "X", @@ -73814,6 +78291,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "The training input samples. Sparse matrix can be CSC, CSR, COO,\nDOK, or LIL. COO, DOK, and LIL are converted to CSR." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -73824,7 +78305,8 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "The target values (class labels)." - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -73834,7 +78316,8 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "The current sample weights." - } + }, + "refined_type": {} }, { "name": "random_state", @@ -73844,13 +78327,14 @@ "docstring": { "type": "RandomState", "description": "The current random number generator" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Implement a single boost.\n\nWarning: This method needs to be overridden by subclasses.", - "docstring": "Implement a single boost.\n\nWarning: This method needs to be overridden by subclasses.\n\nParameters\n----------\niboost : int\n The index of the current boost iteration.\n\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n The training input samples. Sparse matrix can be CSC, CSR, COO,\n DOK, or LIL. COO, DOK, and LIL are converted to CSR.\n\ny : array-like of shape (n_samples,)\n The target values (class labels).\n\nsample_weight : array-like of shape (n_samples,)\n The current sample weights.\n\nrandom_state : RandomState\n The current random number generator\n\nReturns\n-------\nsample_weight : array-like of shape (n_samples,) or None\n The reweighted sample weights.\n If None then boosting has terminated early.\n\nestimator_weight : float\n The weight for the current boost.\n If None then boosting has terminated early.\n\nerror : float\n The classification error for the current boost.\n If None then boosting has terminated early.", + "docstring": "Implement a single boost.\n\n Warning: This method needs to be overridden by subclasses.\n\n Parameters\n ----------\n iboost : int\n The index of the current boost iteration.\n\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The training input samples. Sparse matrix can be CSC, CSR, COO,\n DOK, or LIL. 
COO, DOK, and LIL are converted to CSR.\n\n y : array-like of shape (n_samples,)\n The target values (class labels).\n\n sample_weight : array-like of shape (n_samples,)\n The current sample weights.\n\n random_state : RandomState\n The current random number generator\n\n Returns\n -------\n sample_weight : array-like of shape (n_samples,) or None\n The reweighted sample weights.\n If None then boosting has terminated early.\n\n estimator_weight : float\n The weight for the current boost.\n If None then boosting has terminated early.\n\n error : float\n The classification error for the current boost.\n If None then boosting has terminated early.\n ", "source_code": "\n@abstractmethod\ndef _boost(self, iboost, X, y, sample_weight, random_state):\n \"\"\"Implement a single boost.\n\n Warning: This method needs to be overridden by subclasses.\n\n Parameters\n ----------\n iboost : int\n The index of the current boost iteration.\n\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The training input samples. Sparse matrix can be CSC, CSR, COO,\n DOK, or LIL. COO, DOK, and LIL are converted to CSR.\n\n y : array-like of shape (n_samples,)\n The target values (class labels).\n\n sample_weight : array-like of shape (n_samples,)\n The current sample weights.\n\n random_state : RandomState\n The current random number generator\n\n Returns\n -------\n sample_weight : array-like of shape (n_samples,) or None\n The reweighted sample weights.\n If None then boosting has terminated early.\n\n estimator_weight : float\n The weight for the current boost.\n If None then boosting has terminated early.\n\n error : float\n The classification error for the current boost.\n If None then boosting has terminated early.\n \"\"\"\n pass" }, { @@ -73868,7 +78352,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -73878,13 +78363,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _check_X(self, X):\n return self._validate_data(X, accept_sparse=['csr', 'csc'], ensure_2d=True, allow_nd=True, dtype=None, reset=False)" }, { @@ -73902,13 +78388,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "The impurity-based feature importances.\n\nThe higher, the more important the feature. The importance of a feature is computed as the (normalized) total reduction of the criterion brought by that feature. It is also known as the Gini importance. Warning: impurity-based feature importances can be misleading for high cardinality features (many unique values). See :func:`sklearn.inspection.permutation_importance` as an alternative.", - "docstring": "The impurity-based feature importances.\n\nThe higher, the more important the feature.\nThe importance of a feature is computed as the (normalized)\ntotal reduction of the criterion brought by that feature. It is also\nknown as the Gini importance.\n\nWarning: impurity-based feature importances can be misleading for\nhigh cardinality features (many unique values). 
See\n:func:`sklearn.inspection.permutation_importance` as an alternative.\n\nReturns\n-------\nfeature_importances_ : ndarray of shape (n_features,)\n The feature importances.", + "description": "The impurity-based feature importances.\n\nThe higher, the more important the feature.\nThe importance of a feature is computed as the (normalized)\ntotal reduction of the criterion brought by that feature. It is also\nknown as the Gini importance.\n\nWarning: impurity-based feature importances can be misleading for\nhigh cardinality features (many unique values). See\n:func:`sklearn.inspection.permutation_importance` as an alternative.", + "docstring": "The impurity-based feature importances.\n\n The higher, the more important the feature.\n The importance of a feature is computed as the (normalized)\n total reduction of the criterion brought by that feature. It is also\n known as the Gini importance.\n\n Warning: impurity-based feature importances can be misleading for\n high cardinality features (many unique values). See\n :func:`sklearn.inspection.permutation_importance` as an alternative.\n\n Returns\n -------\n feature_importances_ : ndarray of shape (n_features,)\n The feature importances.\n ", "source_code": "\n@property\ndef feature_importances_(self):\n \"\"\"The impurity-based feature importances.\n\n The higher, the more important the feature.\n The importance of a feature is computed as the (normalized)\n total reduction of the criterion brought by that feature. It is also\n known as the Gini importance.\n\n Warning: impurity-based feature importances can be misleading for\n high cardinality features (many unique values). See\n :func:`sklearn.inspection.permutation_importance` as an alternative.\n\n Returns\n -------\n feature_importances_ : ndarray of shape (n_features,)\n The feature importances.\n \"\"\"\n if self.estimators_ is None or len(self.estimators_) == 0:\n raise ValueError('Estimator not fitted, call `fit` before `feature_importances_`.')\n try:\n norm = self.estimator_weights_.sum()\n return sum((weight * clf.feature_importances_ for (weight, clf) in zip(self.estimator_weights_, self.estimators_))) / norm\n except AttributeError as e:\n raise AttributeError('Unable to compute feature importances since base_estimator does not have a feature_importances_ attribute') from e" }, { @@ -73926,7 +78413,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -73936,6 +78424,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "The training input samples. Sparse matrix can be CSC, CSR, COO,\nDOK, or LIL. COO, DOK, and LIL are converted to CSR." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -73946,7 +78438,8 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "The target values (class labels in classification, real numbers in\nregression)." - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -73956,13 +78449,14 @@ "docstring": { "type": "array-like of shape (n_samples,), default=None", "description": "Sample weights. If None, the sample weights are initialized to\n1 / n_samples." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Build a boosted classifier/regressor from the training set (X, y).", - "docstring": "Build a boosted classifier/regressor from the training set (X, y).\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n The training input samples. Sparse matrix can be CSC, CSR, COO,\n DOK, or LIL. COO, DOK, and LIL are converted to CSR.\n\ny : array-like of shape (n_samples,)\n The target values (class labels in classification, real numbers in\n regression).\n\nsample_weight : array-like of shape (n_samples,), default=None\n Sample weights. If None, the sample weights are initialized to\n 1 / n_samples.\n\nReturns\n-------\nself : object", + "docstring": "Build a boosted classifier/regressor from the training set (X, y).\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The training input samples. Sparse matrix can be CSC, CSR, COO,\n DOK, or LIL. COO, DOK, and LIL are converted to CSR.\n\n y : array-like of shape (n_samples,)\n The target values (class labels in classification, real numbers in\n regression).\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights. If None, the sample weights are initialized to\n 1 / n_samples.\n\n Returns\n -------\n self : object\n ", "source_code": "\ndef fit(self, X, y, sample_weight=None):\n \"\"\"Build a boosted classifier/regressor from the training set (X, y).\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The training input samples. Sparse matrix can be CSC, CSR, COO,\n DOK, or LIL. COO, DOK, and LIL are converted to CSR.\n\n y : array-like of shape (n_samples,)\n The target values (class labels in classification, real numbers in\n regression).\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights. If None, the sample weights are initialized to\n 1 / n_samples.\n\n Returns\n -------\n self : object\n \"\"\"\n if self.learning_rate <= 0:\n raise ValueError('learning_rate must be greater than zero')\n (X, y) = self._validate_data(X, y, accept_sparse=['csr', 'csc'], ensure_2d=True, allow_nd=True, dtype=None, y_numeric=is_regressor(self))\n sample_weight = _check_sample_weight(sample_weight, X, np.float64, copy=True)\n sample_weight /= sample_weight.sum()\n if np.any(sample_weight < 0):\n raise ValueError('sample_weight cannot contain negative weights')\n self._validate_estimator()\n self.estimators_ = []\n self.estimator_weights_ = np.zeros(self.n_estimators, dtype=np.float64)\n self.estimator_errors_ = np.ones(self.n_estimators, dtype=np.float64)\n random_state = check_random_state(self.random_state)\n for iboost in range(self.n_estimators):\n (sample_weight, estimator_weight, estimator_error) = self._boost(iboost, X, y, sample_weight, random_state)\n if sample_weight is None:\n break\n self.estimator_weights_[iboost] = estimator_weight\n self.estimator_errors_[iboost] = estimator_error\n if estimator_error == 0:\n break\n sample_weight_sum = np.sum(sample_weight)\n if not np.isfinite(sample_weight_sum):\n warnings.warn(f'Sample weights have reached infinite values, at iteration {iboost}, causing overflow. Iterations stopped. 
Try lowering the learning rate.', stacklevel=2)\n break\n if sample_weight_sum <= 0:\n break\n if iboost < self.n_estimators - 1:\n sample_weight /= sample_weight_sum\n return self" }, { @@ -73980,7 +78474,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -73990,6 +78485,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "The training input samples. Sparse matrix can be CSC, CSR, COO,\nDOK, or LIL. COO, DOK, and LIL are converted to CSR." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -74000,7 +78499,8 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "Labels for X." - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -74010,13 +78510,14 @@ "docstring": { "type": "array-like of shape (n_samples,), default=None", "description": "Sample weights." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Return staged scores for X, y.\n\nThis generator method yields the ensemble score after each iteration of boosting and therefore allows monitoring, such as to determine the score on a test set after each boost.", - "docstring": "Return staged scores for X, y.\n\nThis generator method yields the ensemble score after each iteration of\nboosting and therefore allows monitoring, such as to determine the\nscore on a test set after each boost.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n The training input samples. Sparse matrix can be CSC, CSR, COO,\n DOK, or LIL. COO, DOK, and LIL are converted to CSR.\n\ny : array-like of shape (n_samples,)\n Labels for X.\n\nsample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\nYields\n------\nz : float", + "description": "Return staged scores for X, y.\n\nThis generator method yields the ensemble score after each iteration of\nboosting and therefore allows monitoring, such as to determine the\nscore on a test set after each boost.", + "docstring": "Return staged scores for X, y.\n\n This generator method yields the ensemble score after each iteration of\n boosting and therefore allows monitoring, such as to determine the\n score on a test set after each boost.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The training input samples. Sparse matrix can be CSC, CSR, COO,\n DOK, or LIL. COO, DOK, and LIL are converted to CSR.\n\n y : array-like of shape (n_samples,)\n Labels for X.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n Yields\n ------\n z : float\n ", "source_code": "\ndef staged_score(self, X, y, sample_weight=None):\n \"\"\"Return staged scores for X, y.\n\n This generator method yields the ensemble score after each iteration of\n boosting and therefore allows monitoring, such as to determine the\n score on a test set after each boost.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The training input samples. Sparse matrix can be CSC, CSR, COO,\n DOK, or LIL. 
COO, DOK, and LIL are converted to CSR.\n\n y : array-like of shape (n_samples,)\n Labels for X.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n Yields\n ------\n z : float\n \"\"\"\n X = self._check_X(X)\n for y_pred in self.staged_predict(X):\n if is_classifier(self):\n yield accuracy_score(y, y_pred, sample_weight=sample_weight)\n else:\n yield r2_score(y, y_pred, sample_weight=sample_weight)" }, { @@ -74034,7 +78535,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_classes", @@ -74044,7 +78546,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -74054,13 +78557,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Calculate algorithm 4, step 2, equation c) of Zhu et al [1].", - "docstring": "Calculate algorithm 4, step 2, equation c) of Zhu et al [1].\n\nReferences\n----------\n.. [1] J. Zhu, H. Zou, S. Rosset, T. Hastie, \"Multi-class AdaBoost\", 2009.", + "docstring": "Calculate algorithm 4, step 2, equation c) of Zhu et al [1].\n\n References\n ----------\n .. [1] J. Zhu, H. Zou, S. Rosset, T. Hastie, \"Multi-class AdaBoost\", 2009.\n\n ", "source_code": "\ndef _samme_proba(estimator, n_classes, X):\n \"\"\"Calculate algorithm 4, step 2, equation c) of Zhu et al [1].\n\n References\n ----------\n .. [1] J. Zhu, H. Zou, S. Rosset, T. Hastie, \"Multi-class AdaBoost\", 2009.\n\n \"\"\"\n proba = estimator.predict_proba(X)\n np.clip(proba, np.finfo(proba.dtype).eps, None, out=proba)\n log_proba = np.log(proba)\n return (n_classes - 1) * (log_proba - 1.0 / n_classes * log_proba.sum(axis=1)[:, np.newaxis])" }, { @@ -74078,7 +78582,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "top_path", @@ -74088,13 +78593,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef configuration(parent_package='', top_path=None):\n config = Configuration('ensemble', parent_package, top_path)\n config.add_extension('_gradient_boosting', sources=['_gradient_boosting.pyx'], include_dirs=[numpy.get_include()])\n config.add_subpackage('tests')\n config.add_extension('_hist_gradient_boosting._gradient_boosting', sources=['_hist_gradient_boosting/_gradient_boosting.pyx'], include_dirs=[numpy.get_include()])\n config.add_extension('_hist_gradient_boosting.histogram', sources=['_hist_gradient_boosting/histogram.pyx'], include_dirs=[numpy.get_include()])\n config.add_extension('_hist_gradient_boosting.splitting', sources=['_hist_gradient_boosting/splitting.pyx'], include_dirs=[numpy.get_include()])\n config.add_extension('_hist_gradient_boosting._binning', sources=['_hist_gradient_boosting/_binning.pyx'], include_dirs=[numpy.get_include()])\n config.add_extension('_hist_gradient_boosting._predictor', sources=['_hist_gradient_boosting/_predictor.pyx'], include_dirs=[numpy.get_include()])\n config.add_extension('_hist_gradient_boosting._loss', sources=['_hist_gradient_boosting/_loss.pyx'], include_dirs=[numpy.get_include()])\n config.add_extension('_hist_gradient_boosting._bitset', sources=['_hist_gradient_boosting/_bitset.pyx'], include_dirs=[numpy.get_include()])\n config.add_extension('_hist_gradient_boosting.common', sources=['_hist_gradient_boosting/common.pyx'], include_dirs=[numpy.get_include()])\n 
config.add_extension('_hist_gradient_boosting.utils', sources=['_hist_gradient_boosting/utils.pyx'], include_dirs=[numpy.get_include()])\n config.add_subpackage('_hist_gradient_boosting.tests')\n return config" }, { @@ -74112,7 +78618,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -74136,7 +78643,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "s", @@ -74146,7 +78654,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "encode_nominal", @@ -74156,7 +78665,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "matrix_type", @@ -74166,7 +78676,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -74190,7 +78701,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "s", @@ -74200,12 +78712,13 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "(INTERNAL) Decodes an attribute line.\n\n The attribute is the most complex declaration in an arff file. All attributes must follow the template:: @attribute where ``attribute-name`` is a string, quoted if the name contains any whitespace, and ``datatype`` can be: - Numerical attributes as ``NUMERIC``, ``INTEGER`` or ``REAL``. - Strings as ``STRING``. - Dates (NOT IMPLEMENTED). - Nominal attributes with format: {, , , ...} The nominal names follow the rules for the attribute names, i.e., they must be quoted if the name contains whitespaces. This method must receive a normalized string, i.e., a string without padding, including the \"\r \" characters. :param s: a normalized string. :return: a tuple (ATTRIBUTE_NAME, TYPE_OR_VALUES).", + "description": "(INTERNAL) Decodes an attribute line.\n\n The attribute is the most complex declaration in an arff file. All\n attributes must follow the template::\n\n @attribute \n\n where ``attribute-name`` is a string, quoted if the name contains any\n whitespace, and ``datatype`` can be:\n\n - Numerical attributes as ``NUMERIC``, ``INTEGER`` or ``REAL``.\n - Strings as ``STRING``.\n - Dates (NOT IMPLEMENTED).\n - Nominal attributes with format:\n\n {, , , ...}\n\n The nominal names follow the rules for the attribute names, i.e., they\n must be quoted if the name contains whitespaces.\n\n This method must receive a normalized string, i.e., a string without\n padding, including the \"\r\n\" characters.\n\n :param s: a normalized string.\n :return: a tuple (ATTRIBUTE_NAME, TYPE_OR_VALUES).", "docstring": "(INTERNAL) Decodes an attribute line.\n\n The attribute is the most complex declaration in an arff file. 
All\n attributes must follow the template::\n\n @attribute \n\n where ``attribute-name`` is a string, quoted if the name contains any\n whitespace, and ``datatype`` can be:\n\n - Numerical attributes as ``NUMERIC``, ``INTEGER`` or ``REAL``.\n - Strings as ``STRING``.\n - Dates (NOT IMPLEMENTED).\n - Nominal attributes with format:\n\n {, , , ...}\n\n The nominal names follow the rules for the attribute names, i.e., they\n must be quoted if the name contains whitespaces.\n\n This method must receive a normalized string, i.e., a string without\n padding, including the \"\r\n\" characters.\n\n :param s: a normalized string.\n :return: a tuple (ATTRIBUTE_NAME, TYPE_OR_VALUES).\n ", "source_code": "\ndef _decode_attribute(self, s):\n \"\"\"(INTERNAL) Decodes an attribute line.\n\n The attribute is the most complex declaration in an arff file. All\n attributes must follow the template::\n\n @attribute \n\n where ``attribute-name`` is a string, quoted if the name contains any\n whitespace, and ``datatype`` can be:\n\n - Numerical attributes as ``NUMERIC``, ``INTEGER`` or ``REAL``.\n - Strings as ``STRING``.\n - Dates (NOT IMPLEMENTED).\n - Nominal attributes with format:\n\n {, , , ...}\n\n The nominal names follow the rules for the attribute names, i.e., they\n must be quoted if the name contains whitespaces.\n\n This method must receive a normalized string, i.e., a string without\n padding, including the \"\r\n\" characters.\n\n :param s: a normalized string.\n :return: a tuple (ATTRIBUTE_NAME, TYPE_OR_VALUES).\n \"\"\"\n (_, v) = s.split(' ', 1)\n v = v.strip()\n m = _RE_ATTRIBUTE.match(v)\n if not m:\n raise BadAttributeFormat()\n (name, type_) = m.groups()\n name = str(name.strip('\"\\''))\n if type_[:1] == '{' and type_[-1:] == '}':\n try:\n type_ = _parse_values(type_.strip('{} '))\n except Exception:\n raise BadAttributeType()\n if isinstance(type_, dict):\n raise BadAttributeType()\n else:\n type_ = str(type_).upper()\n if type_ not in ['NUMERIC', 'REAL', 'INTEGER', 'STRING']:\n raise BadAttributeType()\n return name, type_" }, @@ -74224,7 +78737,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "s", @@ -74234,12 +78748,13 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "(INTERNAL) Decodes a comment line.\n\n Comments are single line strings starting, obligatorily, with the ``%`` character, and can have any symbol, including whitespaces or special characters. This method must receive a normalized string, i.e., a string without padding, including the \"\r \" characters. :param s: a normalized string. 
:return: a string with the decoded comment.", + "description": "(INTERNAL) Decodes a comment line.\n\n Comments are single line strings starting, obligatorily, with the ``%``\n character, and can have any symbol, including whitespaces or special\n characters.\n\n This method must receive a normalized string, i.e., a string without\n padding, including the \"\r\n\" characters.\n\n :param s: a normalized string.\n :return: a string with the decoded comment.", "docstring": "(INTERNAL) Decodes a comment line.\n\n Comments are single line strings starting, obligatorily, with the ``%``\n character, and can have any symbol, including whitespaces or special\n characters.\n\n This method must receive a normalized string, i.e., a string without\n padding, including the \"\r\n\" characters.\n\n :param s: a normalized string.\n :return: a string with the decoded comment.\n ", "source_code": "\ndef _decode_comment(self, s):\n \"\"\"(INTERNAL) Decodes a comment line.\n\n Comments are single line strings starting, obligatorily, with the ``%``\n character, and can have any symbol, including whitespaces or special\n characters.\n\n This method must receive a normalized string, i.e., a string without\n padding, including the \"\r\n\" characters.\n\n :param s: a normalized string.\n :return: a string with the decoded comment.\n \"\"\"\n res = re.sub('^\\\\%( )?', '', s)\n return res" }, @@ -74258,7 +78773,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "s", @@ -74268,12 +78784,13 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "(INTERNAL) Decodes a relation line.\n\n The relation declaration is a line with the format ``@RELATION ``, where ``relation-name`` is a string. The string must start with alphabetic character and must be quoted if the name includes spaces, otherwise this method will raise a `BadRelationFormat` exception. This method must receive a normalized string, i.e., a string without padding, including the \"\r \" characters. :param s: a normalized string. :return: a string with the decoded relation name.", + "description": "(INTERNAL) Decodes a relation line.\n\n The relation declaration is a line with the format ``@RELATION\n ``, where ``relation-name`` is a string. The string must\n start with alphabetic character and must be quoted if the name includes\n spaces, otherwise this method will raise a `BadRelationFormat` exception.\n\n This method must receive a normalized string, i.e., a string without\n padding, including the \"\r\n\" characters.\n\n :param s: a normalized string.\n :return: a string with the decoded relation name.", "docstring": "(INTERNAL) Decodes a relation line.\n\n The relation declaration is a line with the format ``@RELATION\n ``, where ``relation-name`` is a string. The string must\n start with alphabetic character and must be quoted if the name includes\n spaces, otherwise this method will raise a `BadRelationFormat` exception.\n\n This method must receive a normalized string, i.e., a string without\n padding, including the \"\r\n\" characters.\n\n :param s: a normalized string.\n :return: a string with the decoded relation name.\n ", "source_code": "\ndef _decode_relation(self, s):\n \"\"\"(INTERNAL) Decodes a relation line.\n\n The relation declaration is a line with the format ``@RELATION\n ``, where ``relation-name`` is a string. 
The string must\n start with alphabetic character and must be quoted if the name includes\n spaces, otherwise this method will raise a `BadRelationFormat` exception.\n\n This method must receive a normalized string, i.e., a string without\n padding, including the \"\r\n\" characters.\n\n :param s: a normalized string.\n :return: a string with the decoded relation name.\n \"\"\"\n (_, v) = s.split(' ', 1)\n v = v.strip()\n if not _RE_RELATION.match(v):\n raise BadRelationFormat()\n res = str(v.strip('\"\\''))\n return res" }, @@ -74292,7 +78809,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "s", @@ -74302,7 +78820,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "encode_nominal", @@ -74312,7 +78831,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "return_type", @@ -74322,13 +78842,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Returns the Python representation of a given ARFF file.\n\nWhen a file object is passed as an argument, this method reads lines iteratively, avoiding to load unnecessary information to the memory. :param s: a string or file object with the ARFF file. :param encode_nominal: boolean, if True perform a label encoding while reading the .arff file. :param return_type: determines the data structure used to store the dataset. Can be one of `arff.DENSE`, `arff.COO`, `arff.LOD`, `arff.DENSE_GEN` or `arff.LOD_GEN`. Consult the sections on `working with sparse data`_ and `loading progressively`_.", - "docstring": "Returns the Python representation of a given ARFF file.\n\nWhen a file object is passed as an argument, this method reads lines\niteratively, avoiding to load unnecessary information to the memory.\n\n:param s: a string or file object with the ARFF file.\n:param encode_nominal: boolean, if True perform a label encoding\n while reading the .arff file.\n:param return_type: determines the data structure used to store the\n dataset. Can be one of `arff.DENSE`, `arff.COO`, `arff.LOD`,\n `arff.DENSE_GEN` or `arff.LOD_GEN`.\n Consult the sections on `working with sparse data`_ and `loading\n progressively`_.", + "description": "Returns the Python representation of a given ARFF file.\n\nWhen a file object is passed as an argument, this method reads lines\niteratively, avoiding to load unnecessary information to the memory.\n\n:param s: a string or file object with the ARFF file.\n:param encode_nominal: boolean, if True perform a label encoding\n while reading the .arff file.\n:param return_type: determines the data structure used to store the\n dataset. Can be one of `arff.DENSE`, `arff.COO`, `arff.LOD`,\n `arff.DENSE_GEN` or `arff.LOD_GEN`.\n Consult the sections on `working with sparse data`_ and `loading\n progressively`_.", + "docstring": "Returns the Python representation of a given ARFF file.\n\n When a file object is passed as an argument, this method reads lines\n iteratively, avoiding to load unnecessary information to the memory.\n\n :param s: a string or file object with the ARFF file.\n :param encode_nominal: boolean, if True perform a label encoding\n while reading the .arff file.\n :param return_type: determines the data structure used to store the\n dataset. 
Can be one of `arff.DENSE`, `arff.COO`, `arff.LOD`,\n `arff.DENSE_GEN` or `arff.LOD_GEN`.\n Consult the sections on `working with sparse data`_ and `loading\n progressively`_.\n ", "source_code": "\ndef decode(self, s, encode_nominal=False, return_type=DENSE):\n \"\"\"Returns the Python representation of a given ARFF file.\n\n When a file object is passed as an argument, this method reads lines\n iteratively, avoiding to load unnecessary information to the memory.\n\n :param s: a string or file object with the ARFF file.\n :param encode_nominal: boolean, if True perform a label encoding\n while reading the .arff file.\n :param return_type: determines the data structure used to store the\n dataset. Can be one of `arff.DENSE`, `arff.COO`, `arff.LOD`,\n `arff.DENSE_GEN` or `arff.LOD_GEN`.\n Consult the sections on `working with sparse data`_ and `loading\n progressively`_.\n \"\"\"\n try:\n return self._decode(s, encode_nominal=encode_nominal, matrix_type=return_type)\n except ArffException as e:\n e.line = self._current_line\n raise e" }, { @@ -74346,7 +78867,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "name", @@ -74356,7 +78878,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "type_", @@ -74366,13 +78889,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "(INTERNAL) Encodes an attribute line.\n\nThe attribute follow the template:: @attribute where ``attribute-name`` is a string, and ``datatype`` can be: - Numerical attributes as ``NUMERIC``, ``INTEGER`` or ``REAL``. - Strings as ``STRING``. - Dates (NOT IMPLEMENTED). - Nominal attributes with format: {, , , ...} This method must receive a the name of the attribute and its type, if the attribute type is nominal, ``type`` must be a list of values. :param name: a string. :param type_: a string or a list of string. 
:return: a string with the encoded attribute declaration.", - "docstring": "(INTERNAL) Encodes an attribute line.\n\nThe attribute follow the template::\n\n @attribute \n\nwhere ``attribute-name`` is a string, and ``datatype`` can be:\n\n- Numerical attributes as ``NUMERIC``, ``INTEGER`` or ``REAL``.\n- Strings as ``STRING``.\n- Dates (NOT IMPLEMENTED).\n- Nominal attributes with format:\n\n {, , , ...}\n\nThis method must receive a the name of the attribute and its type, if\nthe attribute type is nominal, ``type`` must be a list of values.\n\n:param name: a string.\n:param type_: a string or a list of string.\n:return: a string with the encoded attribute declaration.", + "description": "(INTERNAL) Encodes an attribute line.\n\nThe attribute follow the template::\n\n @attribute \n\nwhere ``attribute-name`` is a string, and ``datatype`` can be:\n\n- Numerical attributes as ``NUMERIC``, ``INTEGER`` or ``REAL``.\n- Strings as ``STRING``.\n- Dates (NOT IMPLEMENTED).\n- Nominal attributes with format:\n\n {, , , ...}\n\nThis method must receive a the name of the attribute and its type, if\nthe attribute type is nominal, ``type`` must be a list of values.\n\n:param name: a string.\n:param type_: a string or a list of string.\n:return: a string with the encoded attribute declaration.", + "docstring": "(INTERNAL) Encodes an attribute line.\n\n The attribute follow the template::\n\n @attribute \n\n where ``attribute-name`` is a string, and ``datatype`` can be:\n\n - Numerical attributes as ``NUMERIC``, ``INTEGER`` or ``REAL``.\n - Strings as ``STRING``.\n - Dates (NOT IMPLEMENTED).\n - Nominal attributes with format:\n\n {, , , ...}\n\n This method must receive a the name of the attribute and its type, if\n the attribute type is nominal, ``type`` must be a list of values.\n\n :param name: a string.\n :param type_: a string or a list of string.\n :return: a string with the encoded attribute declaration.\n ", "source_code": "\ndef _encode_attribute(self, name, type_):\n \"\"\"(INTERNAL) Encodes an attribute line.\n\n The attribute follow the template::\n\n @attribute \n\n where ``attribute-name`` is a string, and ``datatype`` can be:\n\n - Numerical attributes as ``NUMERIC``, ``INTEGER`` or ``REAL``.\n - Strings as ``STRING``.\n - Dates (NOT IMPLEMENTED).\n - Nominal attributes with format:\n\n {, , , ...}\n\n This method must receive a the name of the attribute and its type, if\n the attribute type is nominal, ``type`` must be a list of values.\n\n :param name: a string.\n :param type_: a string or a list of string.\n :return: a string with the encoded attribute declaration.\n \"\"\"\n for char in ' %{},':\n if char in name:\n name = '\"%s\"' % name\n break\n if isinstance(type_, (tuple, list)):\n type_tmp = ['%s' % encode_string(type_k) for type_k in type_]\n type_ = '{%s}' % ', '.join(type_tmp)\n return '%s %s %s' % (_TK_ATTRIBUTE, name, type_)" }, { @@ -74390,7 +78914,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "s", @@ -74400,13 +78925,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "(INTERNAL) Encodes a comment line.\n\nComments are single line strings starting, obligatorily, with the ``%`` character, and can have any symbol, including whitespaces or special characters. If ``s`` is None, this method will simply return an empty comment. :param s: (OPTIONAL) string. 
:return: a string with the encoded comment line.", - "docstring": "(INTERNAL) Encodes a comment line.\n\nComments are single line strings starting, obligatorily, with the ``%``\ncharacter, and can have any symbol, including whitespaces or special\ncharacters.\n\nIf ``s`` is None, this method will simply return an empty comment.\n\n:param s: (OPTIONAL) string.\n:return: a string with the encoded comment line.", + "description": "(INTERNAL) Encodes a comment line.\n\nComments are single line strings starting, obligatorily, with the ``%``\ncharacter, and can have any symbol, including whitespaces or special\ncharacters.\n\nIf ``s`` is None, this method will simply return an empty comment.\n\n:param s: (OPTIONAL) string.\n:return: a string with the encoded comment line.", + "docstring": "(INTERNAL) Encodes a comment line.\n\n Comments are single line strings starting, obligatorily, with the ``%``\n character, and can have any symbol, including whitespaces or special\n characters.\n\n If ``s`` is None, this method will simply return an empty comment.\n\n :param s: (OPTIONAL) string.\n :return: a string with the encoded comment line.\n ", "source_code": "\ndef _encode_comment(self, s=''):\n \"\"\"(INTERNAL) Encodes a comment line.\n\n Comments are single line strings starting, obligatorily, with the ``%``\n character, and can have any symbol, including whitespaces or special\n characters.\n\n If ``s`` is None, this method will simply return an empty comment.\n\n :param s: (OPTIONAL) string.\n :return: a string with the encoded comment line.\n \"\"\"\n if s:\n return '%s %s' % (_TK_COMMENT, s)\n else:\n return '%s' % _TK_COMMENT" }, { @@ -74424,7 +78950,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "name", @@ -74434,13 +78961,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "(INTERNAL) Decodes a relation line.\n\nThe relation declaration is a line with the format ``@RELATION ``, where ``relation-name`` is a string. :param name: a string. 
:return: a string with the encoded relation declaration.", - "docstring": "(INTERNAL) Decodes a relation line.\n\nThe relation declaration is a line with the format ``@RELATION\n``, where ``relation-name`` is a string.\n\n:param name: a string.\n:return: a string with the encoded relation declaration.", + "description": "(INTERNAL) Decodes a relation line.\n\nThe relation declaration is a line with the format ``@RELATION\n``, where ``relation-name`` is a string.\n\n:param name: a string.\n:return: a string with the encoded relation declaration.", + "docstring": "(INTERNAL) Decodes a relation line.\n\n The relation declaration is a line with the format ``@RELATION\n ``, where ``relation-name`` is a string.\n\n :param name: a string.\n :return: a string with the encoded relation declaration.\n ", "source_code": "\ndef _encode_relation(self, name):\n \"\"\"(INTERNAL) Decodes a relation line.\n\n The relation declaration is a line with the format ``@RELATION\n ``, where ``relation-name`` is a string.\n\n :param name: a string.\n :return: a string with the encoded relation declaration.\n \"\"\"\n for char in ' %{},':\n if char in name:\n name = '\"%s\"' % name\n break\n return '%s %s' % (_TK_RELATION, name)" }, { @@ -74458,7 +78986,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "obj", @@ -74468,13 +78997,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Encodes a given object to an ARFF file.\n\n:param obj: the object containing the ARFF information. :return: the ARFF file as an string.", - "docstring": "Encodes a given object to an ARFF file.\n\n:param obj: the object containing the ARFF information.\n:return: the ARFF file as an string.", + "description": "Encodes a given object to an ARFF file.\n\n:param obj: the object containing the ARFF information.\n:return: the ARFF file as an string.", + "docstring": "Encodes a given object to an ARFF file.\n\n :param obj: the object containing the ARFF information.\n :return: the ARFF file as an string.\n ", "source_code": "\ndef encode(self, obj):\n \"\"\"Encodes a given object to an ARFF file.\n\n :param obj: the object containing the ARFF information.\n :return: the ARFF file as an string.\n \"\"\"\n data = [row for row in self.iter_encode(obj)]\n return '\\n'.join(data)" }, { @@ -74492,7 +79022,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "obj", @@ -74502,13 +79033,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "The iterative version of `arff.ArffEncoder.encode`.\n\nThis encodes iteratively a given object and return, one-by-one, the lines of the ARFF file. :param obj: the object containing the ARFF information. 
:return: (yields) the ARFF file as strings.", - "docstring": "The iterative version of `arff.ArffEncoder.encode`.\n\nThis encodes iteratively a given object and return, one-by-one, the\nlines of the ARFF file.\n\n:param obj: the object containing the ARFF information.\n:return: (yields) the ARFF file as strings.", + "description": "The iterative version of `arff.ArffEncoder.encode`.\n\nThis encodes iteratively a given object and return, one-by-one, the\nlines of the ARFF file.\n\n:param obj: the object containing the ARFF information.\n:return: (yields) the ARFF file as strings.", + "docstring": "The iterative version of `arff.ArffEncoder.encode`.\n\n This encodes iteratively a given object and return, one-by-one, the\n lines of the ARFF file.\n\n :param obj: the object containing the ARFF information.\n :return: (yields) the ARFF file as strings.\n ", "source_code": "\ndef iter_encode(self, obj):\n \"\"\"The iterative version of `arff.ArffEncoder.encode`.\n\n This encodes iteratively a given object and return, one-by-one, the\n lines of the ARFF file.\n\n :param obj: the object containing the ARFF information.\n :return: (yields) the ARFF file as strings.\n \"\"\"\n if obj.get('description', None):\n for row in obj['description'].split('\\n'):\n yield self._encode_comment(row)\n if not obj.get('relation'):\n raise BadObject('Relation name not found or with invalid value.')\n yield self._encode_relation(obj['relation'])\n yield ''\n if not obj.get('attributes'):\n raise BadObject('Attributes not found.')\n attribute_names = set()\n for attr in obj['attributes']:\n if not isinstance(attr, (tuple, list)) or len(attr) != 2 or not isinstance(attr[0], str):\n raise BadObject('Invalid attribute declaration \"%s\"' % str(attr))\n if isinstance(attr[1], str):\n if attr[1] not in _SIMPLE_TYPES:\n raise BadObject('Invalid attribute type \"%s\"' % str(attr))\n elif not isinstance(attr[1], (tuple, list)):\n raise BadObject('Invalid attribute type \"%s\"' % str(attr))\n if attr[0] in attribute_names:\n raise BadObject('Trying to use attribute name \"%s\" for the second time.' 
% str(attr[0]))\n else:\n attribute_names.add(attr[0])\n yield self._encode_attribute(attr[0], attr[1])\n yield ''\n attributes = obj['attributes']\n yield _TK_DATA\n if 'data' in obj:\n data = _get_data_object_for_encoding(obj.get('data'))\n yield from data.encode_data(obj.get('data'), attributes)\n yield ''" }, { @@ -74526,13 +79058,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self):\n self.line = -1" }, { @@ -74550,13 +79083,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __str__(self):\n return self.message % self.line" }, { @@ -74574,7 +79108,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "value", @@ -74584,7 +79119,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "value2", @@ -74594,13 +79130,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, value, value2):\n super().__init__()\n self.message = 'Bad @ATTRIBUTE name %s at line' % value + ' %d, this name is already in use in line' + ' %d.' % value2" }, { @@ -74618,7 +79155,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "value", @@ -74628,13 +79166,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, value):\n super().__init__()\n self.message = 'Bad @DATA instance format in line %d: ' + '%s' % value" }, { @@ -74652,7 +79191,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "msg", @@ -74662,13 +79202,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, msg=''):\n super().__init__()\n if msg:\n self.message = BadLayout.message + ' ' + msg.replace('%', '%%')" }, { @@ -74686,7 +79227,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "value", @@ -74696,13 +79238,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, value):\n super().__init__()\n self.message = 'Nominal data value \"%s\" not properly quoted in line ' % value + '%d.'" }, { @@ -74720,7 +79263,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "value", @@ -74730,13 +79274,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, value):\n super().__init__()\n self.message = 'Data value %s not found in nominal declaration, ' % value + 'at line %d.'" }, { @@ -74754,7 +79299,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "msg", @@ -74764,13 +79310,14 @@ 
"docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, msg='Invalid object.'):\n self.msg = msg" }, { @@ -74788,13 +79335,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __str__(self):\n return '%s' % self.msg" }, { @@ -74812,7 +79360,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "stream", @@ -74822,7 +79371,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "conversors", @@ -74832,13 +79382,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef decode_rows(self, stream, conversors):\n (data, rows, cols) = ([], [], [])\n for (i, row) in enumerate(stream):\n values = _parse_values(row)\n if not isinstance(values, dict):\n raise BadLayout()\n if not values:\n continue\n (row_cols, values) = zip(*sorted(values.items()))\n try:\n values = [value if value is None else conversors[key](value) for (key, value) in zip(row_cols, values)]\n except ValueError as exc:\n if 'float: ' in str(exc):\n raise BadNumericalValue()\n raise\n except IndexError:\n raise BadDataFormat(row)\n data.extend(values)\n rows.extend([i] * len(values))\n cols.extend(row_cols)\n return data, rows, cols" }, { @@ -74856,7 +79407,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "data", @@ -74866,7 +79418,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "attributes", @@ -74876,13 +79429,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef encode_data(self, data, attributes):\n num_attributes = len(attributes)\n new_data = []\n current_row = 0\n row = data.row\n col = data.col\n data = data.data\n if not all((row[i] <= row[i + 1] for i in range(len(row) - 1))):\n raise ValueError('liac-arff can only output COO matrices with sorted rows.')\n for (v, col, row) in zip(data, col, row):\n if row > current_row:\n while current_row < row:\n yield ' '.join(['{', ','.join(new_data), '}'])\n new_data = []\n current_row += 1\n if col >= num_attributes:\n raise BadObject('Instance %d has at least %d attributes, expected %d' % (current_row, col + 1, num_attributes))\n if v is None or v == '' or v != v:\n s = '?'\n else:\n s = encode_string(str(v))\n new_data.append('%d %s' % (col, s))\n yield ' '.join(['{', ','.join(new_data), '}'])" }, { @@ -74900,7 +79454,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "conversors", @@ -74910,13 +79465,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@staticmethod\ndef _decode_values(values, conversors):\n try:\n values = [None if value is None else conversor(value) for (conversor, value) in zip(conversors, values)]\n except ValueError as exc:\n if 'float: ' in str(exc):\n raise BadNumericalValue()\n return values" }, { @@ -74934,7 +79490,8 @@ 
"docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "stream", @@ -74944,7 +79501,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "conversors", @@ -74954,13 +79512,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef decode_rows(self, stream, conversors):\n for row in stream:\n values = _parse_values(row)\n if isinstance(values, dict):\n if values and max(values) >= len(conversors):\n raise BadDataFormat(row)\n values = [values[i] if i in values else 0 for i in range(len(conversors))]\n elif len(values) != len(conversors):\n raise BadDataFormat(row)\n yield self._decode_values(values, conversors)" }, { @@ -74978,7 +79537,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "data", @@ -74988,7 +79548,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "attributes", @@ -74998,13 +79559,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "(INTERNAL) Encodes a line of data.\n\nData instances follow the csv format, i.e, attribute values are delimited by commas. After converted from csv. :param data: a list of values. :param attributes: a list of attributes. Used to check if data is valid. :return: a string with the encoded data line.", - "docstring": "(INTERNAL) Encodes a line of data.\n\nData instances follow the csv format, i.e, attribute values are\ndelimited by commas. After converted from csv.\n\n:param data: a list of values.\n:param attributes: a list of attributes. Used to check if data is valid.\n:return: a string with the encoded data line.", + "description": "(INTERNAL) Encodes a line of data.\n\nData instances follow the csv format, i.e, attribute values are\ndelimited by commas. After converted from csv.\n\n:param data: a list of values.\n:param attributes: a list of attributes. Used to check if data is valid.\n:return: a string with the encoded data line.", + "docstring": "(INTERNAL) Encodes a line of data.\n\n Data instances follow the csv format, i.e, attribute values are\n delimited by commas. After converted from csv.\n\n :param data: a list of values.\n :param attributes: a list of attributes. Used to check if data is valid.\n :return: a string with the encoded data line.\n ", "source_code": "\ndef encode_data(self, data, attributes):\n \"\"\"(INTERNAL) Encodes a line of data.\n\n Data instances follow the csv format, i.e, attribute values are\n delimited by commas. After converted from csv.\n\n :param data: a list of values.\n :param attributes: a list of attributes. 
Used to check if data is valid.\n :return: a string with the encoded data line.\n \"\"\"\n current_row = 0\n for inst in data:\n if len(inst) != len(attributes):\n raise BadObject('Instance %d has %d attributes, expected %d' % (current_row, len(inst), len(attributes)))\n new_data = []\n for value in inst:\n if value is None or value == '' or value != value:\n s = '?'\n else:\n s = encode_string(str(value))\n new_data.append(s)\n current_row += 1\n yield ','.join(new_data)" }, { @@ -75022,7 +79584,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "value", @@ -75032,13 +79595,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __call__(self, value):\n try:\n return self.values[value]\n except KeyError:\n raise BadNominalValue(value)" }, { @@ -75056,7 +79620,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "values", @@ -75066,13 +79631,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, values):\n self.values = {v: i for (i, v) in enumerate(values)}\n self.values[0] = 0" }, { @@ -75090,7 +79656,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "stream", @@ -75100,7 +79667,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "conversors", @@ -75110,13 +79678,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef decode_rows(self, stream, conversors):\n for row in stream:\n values = _parse_values(row)\n if not isinstance(values, dict):\n raise BadLayout()\n try:\n yield {key: None if value is None else conversors[key](value) for (key, value) in values.items()}\n except ValueError as exc:\n if 'float: ' in str(exc):\n raise BadNumericalValue()\n raise\n except IndexError:\n raise BadDataFormat(row)" }, { @@ -75134,7 +79703,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "data", @@ -75144,7 +79714,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "attributes", @@ -75154,13 +79725,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef encode_data(self, data, attributes):\n current_row = 0\n num_attributes = len(attributes)\n for row in data:\n new_data = []\n if len(row) > 0 and max(row) >= num_attributes:\n raise BadObject('Instance %d has %d attributes, expected %d' % (current_row, max(row) + 1, num_attributes))\n for col in sorted(row):\n v = row[col]\n if v is None or v == '' or v != v:\n s = '?'\n else:\n s = encode_string(str(v))\n new_data.append('%d %s' % (col, s))\n current_row += 1\n yield ' '.join(['{', ','.join(new_data), '}'])" }, { @@ -75178,7 +79750,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "value", @@ -75188,13 +79761,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + 
"docstring": null, "source_code": "\ndef __call__(self, value):\n if value not in self.values:\n if value == 0:\n return self.zero_value\n raise BadNominalValue(value)\n return str(value)" }, { @@ -75212,7 +79786,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "values", @@ -75222,13 +79797,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, values):\n self.values = set(values)\n self.zero_value = values[0]" }, { @@ -75246,7 +79822,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "stream", @@ -75256,7 +79833,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "conversors", @@ -75266,13 +79844,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef decode_rows(self, stream, conversors):\n return list(super().decode_rows(stream, conversors))" }, { @@ -75285,7 +79864,7 @@ "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _build_re_values():\n quoted_re = '\\n \" # open quote followed by zero or more of:\\n (?:\\n (? tol:\n print('matrix %s of the type %s is not sufficiently Hermitian:' % (name, M.dtype))\n print('condition: %.e < %e' % (nmd, tol))" }, { @@ -75861,6 +80472,10 @@ "docstring": { "type": "{sparse matrix, dense matrix, LinearOperator}", "description": "The symmetric linear operator of the problem, usually a\nsparse matrix. Often called the \"stiffness matrix\"." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -75871,7 +80486,8 @@ "docstring": { "type": "ndarray, float32 or float64", "description": "Initial approximation to the ``k`` eigenvectors (non-sparse). If `A`\nhas ``shape=(n,n)`` then `X` should have shape ``shape=(n,k)``." - } + }, + "refined_type": {} }, { "name": "B", @@ -75881,6 +80497,10 @@ "docstring": { "type": "{dense matrix, sparse matrix, LinearOperator}, optional", "description": "The right hand side operator in a generalized eigenproblem.\nBy default, ``B = Identity``. Often called the \"mass matrix\"." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -75891,6 +80511,10 @@ "docstring": { "type": "{dense matrix, sparse matrix, LinearOperator}, optional", "description": "Preconditioner to `A`; by default ``M = Identity``.\n`M` should approximate the inverse of `A`." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -75901,7 +80525,8 @@ "docstring": { "type": "ndarray, float32 or float64, optional", "description": "n-by-sizeY matrix of constraints (non-sparse), sizeY < n\nThe iterations will be performed in the B-orthogonal complement\nof the column-space of Y. Y must be full rank." - } + }, + "refined_type": {} }, { "name": "tol", @@ -75911,7 +80536,8 @@ "docstring": { "type": "scalar, optional", "description": "Solver tolerance (stopping criterion).\nThe default is ``tol=n*sqrt(eps)``." - } + }, + "refined_type": {} }, { "name": "maxiter", @@ -75921,7 +80547,8 @@ "docstring": { "type": "int, optional", "description": "Maximum number of iterations. The default is ``maxiter = 20``." 
- } + }, + "refined_type": {} }, { "name": "largest", @@ -75931,7 +80558,8 @@ "docstring": { "type": "bool, optional", "description": "When True, solve for the largest eigenvalues, otherwise the smallest." - } + }, + "refined_type": {} }, { "name": "verbosityLevel", @@ -75941,7 +80569,8 @@ "docstring": { "type": "int, optional", "description": "Controls solver output. The default is ``verbosityLevel=0``." - } + }, + "refined_type": {} }, { "name": "retLambdaHistory", @@ -75951,7 +80580,8 @@ "docstring": { "type": "bool, optional", "description": "Whether to return eigenvalue history. Default is False." - } + }, + "refined_type": {} }, { "name": "retResidualNormsHistory", @@ -75961,13 +80591,14 @@ "docstring": { "type": "bool, optional", "description": "Whether to return history of residual norms. Default is False." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Locally Optimal Block Preconditioned Conjugate Gradient Method (LOBPCG)\n\nLOBPCG is a preconditioned eigensolver for large symmetric positive definite (SPD) generalized eigenproblems.", - "docstring": "Locally Optimal Block Preconditioned Conjugate Gradient Method (LOBPCG)\n\nLOBPCG is a preconditioned eigensolver for large symmetric positive\ndefinite (SPD) generalized eigenproblems.\n\nParameters\n----------\nA : {sparse matrix, dense matrix, LinearOperator}\n The symmetric linear operator of the problem, usually a\n sparse matrix. Often called the \"stiffness matrix\".\nX : ndarray, float32 or float64\n Initial approximation to the ``k`` eigenvectors (non-sparse). If `A`\n has ``shape=(n,n)`` then `X` should have shape ``shape=(n,k)``.\nB : {dense matrix, sparse matrix, LinearOperator}, optional\n The right hand side operator in a generalized eigenproblem.\n By default, ``B = Identity``. Often called the \"mass matrix\".\nM : {dense matrix, sparse matrix, LinearOperator}, optional\n Preconditioner to `A`; by default ``M = Identity``.\n `M` should approximate the inverse of `A`.\nY : ndarray, float32 or float64, optional\n n-by-sizeY matrix of constraints (non-sparse), sizeY < n\n The iterations will be performed in the B-orthogonal complement\n of the column-space of Y. Y must be full rank.\ntol : scalar, optional\n Solver tolerance (stopping criterion).\n The default is ``tol=n*sqrt(eps)``.\nmaxiter : int, optional\n Maximum number of iterations. The default is ``maxiter = 20``.\nlargest : bool, optional\n When True, solve for the largest eigenvalues, otherwise the smallest.\nverbosityLevel : int, optional\n Controls solver output. The default is ``verbosityLevel=0``.\nretLambdaHistory : bool, optional\n Whether to return eigenvalue history. Default is False.\nretResidualNormsHistory : bool, optional\n Whether to return history of residual norms. Default is False.\n\nReturns\n-------\nw : ndarray\n Array of ``k`` eigenvalues\nv : ndarray\n An array of ``k`` eigenvectors. 
`v` has the same shape as `X`.\nlambdas : list of ndarray, optional\n The eigenvalue history, if `retLambdaHistory` is True.\nrnorms : list of ndarray, optional\n The history of residual norms, if `retResidualNormsHistory` is True.\n\nNotes\n-----\nIf both ``retLambdaHistory`` and ``retResidualNormsHistory`` are True,\nthe return tuple has the following format\n``(lambda, V, lambda history, residual norms history)``.\n\nIn the following ``n`` denotes the matrix size and ``m`` the number\nof required eigenvalues (smallest or largest).\n\nThe LOBPCG code internally solves eigenproblems of the size ``3m`` on every\niteration by calling the \"standard\" dense eigensolver, so if ``m`` is not\nsmall enough compared to ``n``, it does not make sense to call the LOBPCG\ncode, but rather one should use the \"standard\" eigensolver, e.g. numpy or\nscipy function in this case.\nIf one calls the LOBPCG algorithm for ``5m > n``, it will most likely break\ninternally, so the code tries to call the standard function instead.\n\nIt is not that ``n`` should be large for the LOBPCG to work, but rather the\nratio ``n / m`` should be large. It you call LOBPCG with ``m=1``\nand ``n=10``, it works though ``n`` is small. The method is intended\nfor extremely large ``n / m`` [4]_.\n\nThe convergence speed depends basically on two factors:\n\n1. How well relatively separated the seeking eigenvalues are from the rest\n of the eigenvalues. One can try to vary ``m`` to make this better.\n\n2. How well conditioned the problem is. This can be changed by using proper\n preconditioning. For example, a rod vibration test problem (under tests\n directory) is ill-conditioned for large ``n``, so convergence will be\n slow, unless efficient preconditioning is used. For this specific\n problem, a good simple preconditioner function would be a linear solve\n for `A`, which is easy to code since A is tridiagonal.\n\nReferences\n----------\n.. [1] A. V. Knyazev (2001),\n Toward the Optimal Preconditioned Eigensolver: Locally Optimal\n Block Preconditioned Conjugate Gradient Method.\n SIAM Journal on Scientific Computing 23, no. 2,\n pp. 517-541. :doi:`10.1137/S1064827500366124`\n\n.. [2] A. V. Knyazev, I. Lashuk, M. E. Argentati, and E. Ovchinnikov\n (2007), Block Locally Optimal Preconditioned Eigenvalue Xolvers\n (BLOPEX) in hypre and PETSc. :arxiv:`0705.2626`\n\n.. [3] A. V. Knyazev's C and MATLAB implementations:\n https://bitbucket.org/joseroman/blopex\n\n.. [4] S. Yamada, T. Imamura, T. Kano, and M. Machida (2006),\n High-performance computing for exact numerical approaches to\n quantum many-body problems on the earth simulator. In Proceedings\n of the 2006 ACM/IEEE Conference on Supercomputing.\n :doi:`10.1145/1188455.1188504`\n\nExamples\n--------\n\nSolve ``A x = lambda x`` with constraints and preconditioning.\n\n>>> import numpy as np\n>>> from scipy.sparse import spdiags, issparse\n>>> from scipy.sparse.linalg import lobpcg, LinearOperator\n>>> n = 100\n>>> vals = np.arange(1, n + 1)\n>>> A = spdiags(vals, 0, n, n)\n>>> A.toarray()\narray([[ 1., 0., 0., ..., 0., 0., 0.],\n [ 0., 2., 0., ..., 0., 0., 0.],\n [ 0., 0., 3., ..., 0., 0., 0.],\n ...,\n [ 0., 0., 0., ..., 98., 0., 0.],\n [ 0., 0., 0., ..., 0., 99., 0.],\n [ 0., 0., 0., ..., 0., 0., 100.]])\n\nConstraints:\n\n>>> Y = np.eye(n, 3)\n\nInitial guess for eigenvectors, should have linearly independent\ncolumns. 
Column dimension = number of requested eigenvalues.\n\n>>> rng = np.random.default_rng()\n>>> X = rng.random((n, 3))\n\nPreconditioner in the inverse of A in this example:\n\n>>> invA = spdiags([1./vals], 0, n, n)\n\nThe preconditiner must be defined by a function:\n\n>>> def precond( x ):\n... return invA @ x\n\nThe argument x of the preconditioner function is a matrix inside `lobpcg`,\nthus the use of matrix-matrix product ``@``.\n\nThe preconditioner function is passed to lobpcg as a `LinearOperator`:\n\n>>> M = LinearOperator(matvec=precond, matmat=precond,\n... shape=(n, n), dtype=float)\n\nLet us now solve the eigenvalue problem for the matrix A:\n\n>>> eigenvalues, _ = lobpcg(A, X, Y=Y, M=M, largest=False)\n>>> eigenvalues\narray([4., 5., 6.])\n\nNote that the vectors passed in Y are the eigenvectors of the 3 smallest\neigenvalues. The results returned are orthogonal to those.", + "description": "Locally Optimal Block Preconditioned Conjugate Gradient Method (LOBPCG)\n\nLOBPCG is a preconditioned eigensolver for large symmetric positive\ndefinite (SPD) generalized eigenproblems.", + "docstring": "Locally Optimal Block Preconditioned Conjugate Gradient Method (LOBPCG)\n\n LOBPCG is a preconditioned eigensolver for large symmetric positive\n definite (SPD) generalized eigenproblems.\n\n Parameters\n ----------\n A : {sparse matrix, dense matrix, LinearOperator}\n The symmetric linear operator of the problem, usually a\n sparse matrix. Often called the \"stiffness matrix\".\n X : ndarray, float32 or float64\n Initial approximation to the ``k`` eigenvectors (non-sparse). If `A`\n has ``shape=(n,n)`` then `X` should have shape ``shape=(n,k)``.\n B : {dense matrix, sparse matrix, LinearOperator}, optional\n The right hand side operator in a generalized eigenproblem.\n By default, ``B = Identity``. Often called the \"mass matrix\".\n M : {dense matrix, sparse matrix, LinearOperator}, optional\n Preconditioner to `A`; by default ``M = Identity``.\n `M` should approximate the inverse of `A`.\n Y : ndarray, float32 or float64, optional\n n-by-sizeY matrix of constraints (non-sparse), sizeY < n\n The iterations will be performed in the B-orthogonal complement\n of the column-space of Y. Y must be full rank.\n tol : scalar, optional\n Solver tolerance (stopping criterion).\n The default is ``tol=n*sqrt(eps)``.\n maxiter : int, optional\n Maximum number of iterations. The default is ``maxiter = 20``.\n largest : bool, optional\n When True, solve for the largest eigenvalues, otherwise the smallest.\n verbosityLevel : int, optional\n Controls solver output. The default is ``verbosityLevel=0``.\n retLambdaHistory : bool, optional\n Whether to return eigenvalue history. Default is False.\n retResidualNormsHistory : bool, optional\n Whether to return history of residual norms. Default is False.\n\n Returns\n -------\n w : ndarray\n Array of ``k`` eigenvalues\n v : ndarray\n An array of ``k`` eigenvectors. 
`v` has the same shape as `X`.\n lambdas : list of ndarray, optional\n The eigenvalue history, if `retLambdaHistory` is True.\n rnorms : list of ndarray, optional\n The history of residual norms, if `retResidualNormsHistory` is True.\n\n Notes\n -----\n If both ``retLambdaHistory`` and ``retResidualNormsHistory`` are True,\n the return tuple has the following format\n ``(lambda, V, lambda history, residual norms history)``.\n\n In the following ``n`` denotes the matrix size and ``m`` the number\n of required eigenvalues (smallest or largest).\n\n The LOBPCG code internally solves eigenproblems of the size ``3m`` on every\n iteration by calling the \"standard\" dense eigensolver, so if ``m`` is not\n small enough compared to ``n``, it does not make sense to call the LOBPCG\n code, but rather one should use the \"standard\" eigensolver, e.g. numpy or\n scipy function in this case.\n If one calls the LOBPCG algorithm for ``5m > n``, it will most likely break\n internally, so the code tries to call the standard function instead.\n\n It is not that ``n`` should be large for the LOBPCG to work, but rather the\n ratio ``n / m`` should be large. It you call LOBPCG with ``m=1``\n and ``n=10``, it works though ``n`` is small. The method is intended\n for extremely large ``n / m`` [4]_.\n\n The convergence speed depends basically on two factors:\n\n 1. How well relatively separated the seeking eigenvalues are from the rest\n of the eigenvalues. One can try to vary ``m`` to make this better.\n\n 2. How well conditioned the problem is. This can be changed by using proper\n preconditioning. For example, a rod vibration test problem (under tests\n directory) is ill-conditioned for large ``n``, so convergence will be\n slow, unless efficient preconditioning is used. For this specific\n problem, a good simple preconditioner function would be a linear solve\n for `A`, which is easy to code since A is tridiagonal.\n\n References\n ----------\n .. [1] A. V. Knyazev (2001),\n Toward the Optimal Preconditioned Eigensolver: Locally Optimal\n Block Preconditioned Conjugate Gradient Method.\n SIAM Journal on Scientific Computing 23, no. 2,\n pp. 517-541. :doi:`10.1137/S1064827500366124`\n\n .. [2] A. V. Knyazev, I. Lashuk, M. E. Argentati, and E. Ovchinnikov\n (2007), Block Locally Optimal Preconditioned Eigenvalue Xolvers\n (BLOPEX) in hypre and PETSc. :arxiv:`0705.2626`\n\n .. [3] A. V. Knyazev's C and MATLAB implementations:\n https://bitbucket.org/joseroman/blopex\n\n .. [4] S. Yamada, T. Imamura, T. Kano, and M. Machida (2006),\n High-performance computing for exact numerical approaches to\n quantum many-body problems on the earth simulator. In Proceedings\n of the 2006 ACM/IEEE Conference on Supercomputing.\n :doi:`10.1145/1188455.1188504`\n\n Examples\n --------\n\n Solve ``A x = lambda x`` with constraints and preconditioning.\n\n >>> import numpy as np\n >>> from scipy.sparse import spdiags, issparse\n >>> from scipy.sparse.linalg import lobpcg, LinearOperator\n >>> n = 100\n >>> vals = np.arange(1, n + 1)\n >>> A = spdiags(vals, 0, n, n)\n >>> A.toarray()\n array([[ 1., 0., 0., ..., 0., 0., 0.],\n [ 0., 2., 0., ..., 0., 0., 0.],\n [ 0., 0., 3., ..., 0., 0., 0.],\n ...,\n [ 0., 0., 0., ..., 98., 0., 0.],\n [ 0., 0., 0., ..., 0., 99., 0.],\n [ 0., 0., 0., ..., 0., 0., 100.]])\n\n Constraints:\n\n >>> Y = np.eye(n, 3)\n\n Initial guess for eigenvectors, should have linearly independent\n columns. 
Column dimension = number of requested eigenvalues.\n\n >>> rng = np.random.default_rng()\n >>> X = rng.random((n, 3))\n\n Preconditioner in the inverse of A in this example:\n\n >>> invA = spdiags([1./vals], 0, n, n)\n\n The preconditiner must be defined by a function:\n\n >>> def precond( x ):\n ... return invA @ x\n\n The argument x of the preconditioner function is a matrix inside `lobpcg`,\n thus the use of matrix-matrix product ``@``.\n\n The preconditioner function is passed to lobpcg as a `LinearOperator`:\n\n >>> M = LinearOperator(matvec=precond, matmat=precond,\n ... shape=(n, n), dtype=float)\n\n Let us now solve the eigenvalue problem for the matrix A:\n\n >>> eigenvalues, _ = lobpcg(A, X, Y=Y, M=M, largest=False)\n >>> eigenvalues\n array([4., 5., 6.])\n\n Note that the vectors passed in Y are the eigenvectors of the 3 smallest\n eigenvalues. The results returned are orthogonal to those.\n\n ", "source_code": "\ndef lobpcg(A, X, B=None, M=None, Y=None, tol=None, maxiter=None, largest=True, verbosityLevel=0, retLambdaHistory=False, retResidualNormsHistory=False):\n \"\"\"Locally Optimal Block Preconditioned Conjugate Gradient Method (LOBPCG)\n\n LOBPCG is a preconditioned eigensolver for large symmetric positive\n definite (SPD) generalized eigenproblems.\n\n Parameters\n ----------\n A : {sparse matrix, dense matrix, LinearOperator}\n The symmetric linear operator of the problem, usually a\n sparse matrix. Often called the \"stiffness matrix\".\n X : ndarray, float32 or float64\n Initial approximation to the ``k`` eigenvectors (non-sparse). If `A`\n has ``shape=(n,n)`` then `X` should have shape ``shape=(n,k)``.\n B : {dense matrix, sparse matrix, LinearOperator}, optional\n The right hand side operator in a generalized eigenproblem.\n By default, ``B = Identity``. Often called the \"mass matrix\".\n M : {dense matrix, sparse matrix, LinearOperator}, optional\n Preconditioner to `A`; by default ``M = Identity``.\n `M` should approximate the inverse of `A`.\n Y : ndarray, float32 or float64, optional\n n-by-sizeY matrix of constraints (non-sparse), sizeY < n\n The iterations will be performed in the B-orthogonal complement\n of the column-space of Y. Y must be full rank.\n tol : scalar, optional\n Solver tolerance (stopping criterion).\n The default is ``tol=n*sqrt(eps)``.\n maxiter : int, optional\n Maximum number of iterations. The default is ``maxiter = 20``.\n largest : bool, optional\n When True, solve for the largest eigenvalues, otherwise the smallest.\n verbosityLevel : int, optional\n Controls solver output. The default is ``verbosityLevel=0``.\n retLambdaHistory : bool, optional\n Whether to return eigenvalue history. Default is False.\n retResidualNormsHistory : bool, optional\n Whether to return history of residual norms. Default is False.\n\n Returns\n -------\n w : ndarray\n Array of ``k`` eigenvalues\n v : ndarray\n An array of ``k`` eigenvectors. 
`v` has the same shape as `X`.\n lambdas : list of ndarray, optional\n The eigenvalue history, if `retLambdaHistory` is True.\n rnorms : list of ndarray, optional\n The history of residual norms, if `retResidualNormsHistory` is True.\n\n Notes\n -----\n If both ``retLambdaHistory`` and ``retResidualNormsHistory`` are True,\n the return tuple has the following format\n ``(lambda, V, lambda history, residual norms history)``.\n\n In the following ``n`` denotes the matrix size and ``m`` the number\n of required eigenvalues (smallest or largest).\n\n The LOBPCG code internally solves eigenproblems of the size ``3m`` on every\n iteration by calling the \"standard\" dense eigensolver, so if ``m`` is not\n small enough compared to ``n``, it does not make sense to call the LOBPCG\n code, but rather one should use the \"standard\" eigensolver, e.g. numpy or\n scipy function in this case.\n If one calls the LOBPCG algorithm for ``5m > n``, it will most likely break\n internally, so the code tries to call the standard function instead.\n\n It is not that ``n`` should be large for the LOBPCG to work, but rather the\n ratio ``n / m`` should be large. It you call LOBPCG with ``m=1``\n and ``n=10``, it works though ``n`` is small. The method is intended\n for extremely large ``n / m`` [4]_.\n\n The convergence speed depends basically on two factors:\n\n 1. How well relatively separated the seeking eigenvalues are from the rest\n of the eigenvalues. One can try to vary ``m`` to make this better.\n\n 2. How well conditioned the problem is. This can be changed by using proper\n preconditioning. For example, a rod vibration test problem (under tests\n directory) is ill-conditioned for large ``n``, so convergence will be\n slow, unless efficient preconditioning is used. For this specific\n problem, a good simple preconditioner function would be a linear solve\n for `A`, which is easy to code since A is tridiagonal.\n\n References\n ----------\n .. [1] A. V. Knyazev (2001),\n Toward the Optimal Preconditioned Eigensolver: Locally Optimal\n Block Preconditioned Conjugate Gradient Method.\n SIAM Journal on Scientific Computing 23, no. 2,\n pp. 517-541. :doi:`10.1137/S1064827500366124`\n\n .. [2] A. V. Knyazev, I. Lashuk, M. E. Argentati, and E. Ovchinnikov\n (2007), Block Locally Optimal Preconditioned Eigenvalue Xolvers\n (BLOPEX) in hypre and PETSc. :arxiv:`0705.2626`\n\n .. [3] A. V. Knyazev's C and MATLAB implementations:\n https://bitbucket.org/joseroman/blopex\n\n .. [4] S. Yamada, T. Imamura, T. Kano, and M. Machida (2006),\n High-performance computing for exact numerical approaches to\n quantum many-body problems on the earth simulator. In Proceedings\n of the 2006 ACM/IEEE Conference on Supercomputing.\n :doi:`10.1145/1188455.1188504`\n\n Examples\n --------\n\n Solve ``A x = lambda x`` with constraints and preconditioning.\n\n >>> import numpy as np\n >>> from scipy.sparse import spdiags, issparse\n >>> from scipy.sparse.linalg import lobpcg, LinearOperator\n >>> n = 100\n >>> vals = np.arange(1, n + 1)\n >>> A = spdiags(vals, 0, n, n)\n >>> A.toarray()\n array([[ 1., 0., 0., ..., 0., 0., 0.],\n [ 0., 2., 0., ..., 0., 0., 0.],\n [ 0., 0., 3., ..., 0., 0., 0.],\n ...,\n [ 0., 0., 0., ..., 98., 0., 0.],\n [ 0., 0., 0., ..., 0., 99., 0.],\n [ 0., 0., 0., ..., 0., 0., 100.]])\n\n Constraints:\n\n >>> Y = np.eye(n, 3)\n\n Initial guess for eigenvectors, should have linearly independent\n columns. 
Column dimension = number of requested eigenvalues.\n\n >>> rng = np.random.default_rng()\n >>> X = rng.random((n, 3))\n\n Preconditioner in the inverse of A in this example:\n\n >>> invA = spdiags([1./vals], 0, n, n)\n\n The preconditiner must be defined by a function:\n\n >>> def precond( x ):\n ... return invA @ x\n\n The argument x of the preconditioner function is a matrix inside `lobpcg`,\n thus the use of matrix-matrix product ``@``.\n\n The preconditioner function is passed to lobpcg as a `LinearOperator`:\n\n >>> M = LinearOperator(matvec=precond, matmat=precond,\n ... shape=(n, n), dtype=float)\n\n Let us now solve the eigenvalue problem for the matrix A:\n\n >>> eigenvalues, _ = lobpcg(A, X, Y=Y, M=M, largest=False)\n >>> eigenvalues\n array([4., 5., 6.])\n\n Note that the vectors passed in Y are the eigenvectors of the 3 smallest\n eigenvalues. The results returned are orthogonal to those.\n\n \"\"\"\n blockVectorX = X\n blockVectorY = Y\n residualTolerance = tol\n if maxiter is None:\n maxiter = 20\n if blockVectorY is not None:\n sizeY = blockVectorY.shape[1]\n else:\n sizeY = 0\n if len(blockVectorX.shape) != 2:\n raise ValueError('expected rank-2 array for argument X')\n (n, sizeX) = blockVectorX.shape\n if verbosityLevel:\n aux = 'Solving '\n if B is None:\n aux += 'standard'\n else:\n aux += 'generalized'\n aux += ' eigenvalue problem with'\n if M is None:\n aux += 'out'\n aux += ' preconditioning\\n\\n'\n aux += 'matrix size %d\\n' % n\n aux += 'block size %d\\n\\n' % sizeX\n if blockVectorY is None:\n aux += 'No constraints\\n\\n'\n elif sizeY > 1:\n aux += '%d constraints\\n\\n' % sizeY\n else:\n aux += '%d constraint\\n\\n' % sizeY\n print(aux)\n A = _makeOperator(A, (n, n))\n B = _makeOperator(B, (n, n))\n M = _makeOperator(M, (n, n))\n if n - sizeY < 5 * sizeX:\n sizeX = min(sizeX, n)\n if blockVectorY is not None:\n raise NotImplementedError('The dense eigensolver does not support constraints.')\n if largest:\n eigvals = (n - sizeX, n - 1)\n else:\n eigvals = (0, sizeX - 1)\n A_dense = A(np.eye(n, dtype=A.dtype))\n B_dense = None if B is None else B(np.eye(n, dtype=B.dtype))\n (vals, vecs) = eigh(A_dense, B_dense, eigvals=eigvals, check_finite=False)\n if largest:\n vals = vals[::-1]\n vecs = vecs[:, ::-1]\n return vals, vecs\n if residualTolerance is None or residualTolerance <= 0.0:\n residualTolerance = np.sqrt(1e-15) * n\n if blockVectorY is not None:\n if B is not None:\n blockVectorBY = B(blockVectorY)\n else:\n blockVectorBY = blockVectorY\n gramYBY = np.dot(blockVectorY.T.conj(), blockVectorBY)\n try:\n gramYBY = cho_factor(gramYBY)\n except LinAlgError as e:\n raise ValueError('cannot handle linearly dependent constraints') from e\n _applyConstraints(blockVectorX, gramYBY, blockVectorBY, blockVectorY)\n (blockVectorX, blockVectorBX) = _b_orthonormalize(B, blockVectorX)\n blockVectorAX = A(blockVectorX)\n gramXAX = np.dot(blockVectorX.T.conj(), blockVectorAX)\n (_lambda, eigBlockVector) = eigh(gramXAX, check_finite=False)\n ii = _get_indx(_lambda, sizeX, largest)\n _lambda = _lambda[ii]\n eigBlockVector = np.asarray(eigBlockVector[:, ii])\n blockVectorX = np.dot(blockVectorX, eigBlockVector)\n blockVectorAX = np.dot(blockVectorAX, eigBlockVector)\n if B is not None:\n blockVectorBX = np.dot(blockVectorBX, eigBlockVector)\n activeMask = np.ones((sizeX, ), dtype=bool)\n lambdaHistory = [_lambda]\n residualNormsHistory = []\n previousBlockSize = sizeX\n ident = np.eye(sizeX, dtype=A.dtype)\n ident0 = np.eye(sizeX, dtype=A.dtype)\n blockVectorP = None\n 
blockVectorAP = None\n blockVectorBP = None\n iterationNumber = -1\n restart = True\n explicitGramFlag = False\n while iterationNumber < maxiter:\n iterationNumber += 1\n if verbosityLevel > 0:\n print('iteration %d' % iterationNumber)\n if B is not None:\n aux = blockVectorBX * _lambda[np.newaxis, :]\n else:\n aux = blockVectorX * _lambda[np.newaxis, :]\n blockVectorR = blockVectorAX - aux\n aux = np.sum(blockVectorR.conj() * blockVectorR, 0)\n residualNorms = np.sqrt(aux)\n residualNormsHistory.append(residualNorms)\n ii = np.where(residualNorms > residualTolerance, True, False)\n activeMask = activeMask & ii\n if verbosityLevel > 2:\n print(activeMask)\n currentBlockSize = activeMask.sum()\n if currentBlockSize != previousBlockSize:\n previousBlockSize = currentBlockSize\n ident = np.eye(currentBlockSize, dtype=A.dtype)\n if currentBlockSize == 0:\n break\n if verbosityLevel > 0:\n print('current block size:', currentBlockSize)\n print('eigenvalue:', _lambda)\n print('residual norms:', residualNorms)\n if verbosityLevel > 10:\n print(eigBlockVector)\n activeBlockVectorR = _as2d(blockVectorR[:, activeMask])\n if iterationNumber > 0:\n activeBlockVectorP = _as2d(blockVectorP[:, activeMask])\n activeBlockVectorAP = _as2d(blockVectorAP[:, activeMask])\n if B is not None:\n activeBlockVectorBP = _as2d(blockVectorBP[:, activeMask])\n if M is not None:\n activeBlockVectorR = M(activeBlockVectorR)\n if blockVectorY is not None:\n _applyConstraints(activeBlockVectorR, gramYBY, blockVectorBY, blockVectorY)\n if B is not None:\n activeBlockVectorR = activeBlockVectorR - np.matmul(blockVectorX, np.matmul(blockVectorBX.T.conj(), activeBlockVectorR))\n else:\n activeBlockVectorR = activeBlockVectorR - np.matmul(blockVectorX, np.matmul(blockVectorX.T.conj(), activeBlockVectorR))\n aux = _b_orthonormalize(B, activeBlockVectorR)\n (activeBlockVectorR, activeBlockVectorBR) = aux\n activeBlockVectorAR = A(activeBlockVectorR)\n if iterationNumber > 0:\n if B is not None:\n aux = _b_orthonormalize(B, activeBlockVectorP, activeBlockVectorBP, retInvR=True)\n (activeBlockVectorP, activeBlockVectorBP, invR, normal) = aux\n else:\n aux = _b_orthonormalize(B, activeBlockVectorP, retInvR=True)\n (activeBlockVectorP, _, invR, normal) = aux\n if activeBlockVectorP is not None:\n activeBlockVectorAP = activeBlockVectorAP / normal\n activeBlockVectorAP = np.dot(activeBlockVectorAP, invR)\n restart = False\n else:\n restart = True\n if activeBlockVectorAR.dtype == 'float32':\n myeps = 1\n elif activeBlockVectorR.dtype == 'float32':\n myeps = 0.0001\n else:\n myeps = 1e-08\n if residualNorms.max() > myeps and not explicitGramFlag:\n explicitGramFlag = False\n else:\n explicitGramFlag = True\n if B is None:\n blockVectorBX = blockVectorX\n activeBlockVectorBR = activeBlockVectorR\n if not restart:\n activeBlockVectorBP = activeBlockVectorP\n gramXAR = np.dot(blockVectorX.T.conj(), activeBlockVectorAR)\n gramRAR = np.dot(activeBlockVectorR.T.conj(), activeBlockVectorAR)\n if explicitGramFlag:\n gramRAR = (gramRAR + gramRAR.T.conj()) / 2\n gramXAX = np.dot(blockVectorX.T.conj(), blockVectorAX)\n gramXAX = (gramXAX + gramXAX.T.conj()) / 2\n gramXBX = np.dot(blockVectorX.T.conj(), blockVectorBX)\n gramRBR = np.dot(activeBlockVectorR.T.conj(), activeBlockVectorBR)\n gramXBR = np.dot(blockVectorX.T.conj(), activeBlockVectorBR)\n else:\n gramXAX = np.diag(_lambda)\n gramXBX = ident0\n gramRBR = ident\n gramXBR = np.zeros((sizeX, currentBlockSize), dtype=A.dtype)\n \n def _handle_gramA_gramB_verbosity(gramA, gramB):\n if 
verbosityLevel > 0:\n _report_nonhermitian(gramA, 'gramA')\n _report_nonhermitian(gramB, 'gramB')\n if verbosityLevel > 10:\n np.savetxt('gramA.txt', gramA)\n np.savetxt('gramB.txt', gramB)\n if not restart:\n gramXAP = np.dot(blockVectorX.T.conj(), activeBlockVectorAP)\n gramRAP = np.dot(activeBlockVectorR.T.conj(), activeBlockVectorAP)\n gramPAP = np.dot(activeBlockVectorP.T.conj(), activeBlockVectorAP)\n gramXBP = np.dot(blockVectorX.T.conj(), activeBlockVectorBP)\n gramRBP = np.dot(activeBlockVectorR.T.conj(), activeBlockVectorBP)\n if explicitGramFlag:\n gramPAP = (gramPAP + gramPAP.T.conj()) / 2\n gramPBP = np.dot(activeBlockVectorP.T.conj(), activeBlockVectorBP)\n else:\n gramPBP = ident\n gramA = bmat([[gramXAX, gramXAR, gramXAP], [gramXAR.T.conj(), gramRAR, gramRAP], [gramXAP.T.conj(), gramRAP.T.conj(), gramPAP]])\n gramB = bmat([[gramXBX, gramXBR, gramXBP], [gramXBR.T.conj(), gramRBR, gramRBP], [gramXBP.T.conj(), gramRBP.T.conj(), gramPBP]])\n _handle_gramA_gramB_verbosity(gramA, gramB)\n try:\n (_lambda, eigBlockVector) = eigh(gramA, gramB, check_finite=False)\n except LinAlgError:\n restart = True\n if restart:\n gramA = bmat([[gramXAX, gramXAR], [gramXAR.T.conj(), gramRAR]])\n gramB = bmat([[gramXBX, gramXBR], [gramXBR.T.conj(), gramRBR]])\n _handle_gramA_gramB_verbosity(gramA, gramB)\n try:\n (_lambda, eigBlockVector) = eigh(gramA, gramB, check_finite=False)\n except LinAlgError as e:\n raise ValueError('eigh has failed in lobpcg iterations') from e\n ii = _get_indx(_lambda, sizeX, largest)\n if verbosityLevel > 10:\n print(ii)\n print(_lambda)\n _lambda = _lambda[ii]\n eigBlockVector = eigBlockVector[:, ii]\n lambdaHistory.append(_lambda)\n if verbosityLevel > 10:\n print('lambda:', _lambda)\n if verbosityLevel > 10:\n print(eigBlockVector)\n if B is not None:\n if not restart:\n eigBlockVectorX = eigBlockVector[:sizeX]\n eigBlockVectorR = eigBlockVector[sizeX:sizeX + currentBlockSize]\n eigBlockVectorP = eigBlockVector[sizeX + currentBlockSize:]\n pp = np.dot(activeBlockVectorR, eigBlockVectorR)\n pp += np.dot(activeBlockVectorP, eigBlockVectorP)\n app = np.dot(activeBlockVectorAR, eigBlockVectorR)\n app += np.dot(activeBlockVectorAP, eigBlockVectorP)\n bpp = np.dot(activeBlockVectorBR, eigBlockVectorR)\n bpp += np.dot(activeBlockVectorBP, eigBlockVectorP)\n else:\n eigBlockVectorX = eigBlockVector[:sizeX]\n eigBlockVectorR = eigBlockVector[sizeX:]\n pp = np.dot(activeBlockVectorR, eigBlockVectorR)\n app = np.dot(activeBlockVectorAR, eigBlockVectorR)\n bpp = np.dot(activeBlockVectorBR, eigBlockVectorR)\n if verbosityLevel > 10:\n print(pp)\n print(app)\n print(bpp)\n blockVectorX = np.dot(blockVectorX, eigBlockVectorX) + pp\n blockVectorAX = np.dot(blockVectorAX, eigBlockVectorX) + app\n blockVectorBX = np.dot(blockVectorBX, eigBlockVectorX) + bpp\n (blockVectorP, blockVectorAP, blockVectorBP) = (pp, app, bpp)\n else:\n if not restart:\n eigBlockVectorX = eigBlockVector[:sizeX]\n eigBlockVectorR = eigBlockVector[sizeX:sizeX + currentBlockSize]\n eigBlockVectorP = eigBlockVector[sizeX + currentBlockSize:]\n pp = np.dot(activeBlockVectorR, eigBlockVectorR)\n pp += np.dot(activeBlockVectorP, eigBlockVectorP)\n app = np.dot(activeBlockVectorAR, eigBlockVectorR)\n app += np.dot(activeBlockVectorAP, eigBlockVectorP)\n else:\n eigBlockVectorX = eigBlockVector[:sizeX]\n eigBlockVectorR = eigBlockVector[sizeX:]\n pp = np.dot(activeBlockVectorR, eigBlockVectorR)\n app = np.dot(activeBlockVectorAR, eigBlockVectorR)\n if verbosityLevel > 10:\n print(pp)\n print(app)\n blockVectorX = 
np.dot(blockVectorX, eigBlockVectorX) + pp\n blockVectorAX = np.dot(blockVectorAX, eigBlockVectorX) + app\n (blockVectorP, blockVectorAP) = (pp, app)\n if B is not None:\n aux = blockVectorBX * _lambda[np.newaxis, :]\n else:\n aux = blockVectorX * _lambda[np.newaxis, :]\n blockVectorR = blockVectorAX - aux\n aux = np.sum(blockVectorR.conj() * blockVectorR, 0)\n residualNorms = np.sqrt(aux)\n if verbosityLevel > 0:\n print('final eigenvalue:', _lambda)\n print('final residual norms:', residualNorms)\n if retLambdaHistory:\n if retResidualNormsHistory:\n return _lambda, blockVectorX, lambdaHistory, residualNormsHistory\n else:\n return _lambda, blockVectorX, lambdaHistory\n elif retResidualNormsHistory:\n return _lambda, blockVectorX, residualNormsHistory\n else:\n return _lambda, blockVectorX" }, { @@ -75985,7 +80616,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "other", @@ -75995,13 +80627,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __eq__(self, other: object) -> bool:\n return isinstance(other, self.__class__)" }, { @@ -76019,7 +80652,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "other", @@ -76029,13 +80663,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __ge__(self, other: object) -> bool:\n return True" }, { @@ -76053,7 +80688,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "other", @@ -76063,13 +80699,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __gt__(self, other: object) -> bool:\n return True" }, { @@ -76087,13 +80724,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __hash__(self) -> int:\n return hash(repr(self))" }, { @@ -76111,7 +80749,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "other", @@ -76121,13 +80760,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __le__(self, other: object) -> bool:\n return False" }, { @@ -76145,7 +80785,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "other", @@ -76155,13 +80796,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __lt__(self, other: object) -> bool:\n return False" }, { @@ -76179,7 +80821,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "other", @@ -76189,13 +80832,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __ne__(self, other: object) -> bool:\n return not isinstance(other, self.__class__)" }, { @@ 
-76213,13 +80857,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __neg__(self: object) -> 'NegativeInfinityType':\n return NegativeInfinity" }, { @@ -76237,13 +80882,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __repr__(self) -> str:\n return 'Infinity'" }, { @@ -76261,7 +80907,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "other", @@ -76271,13 +80918,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __eq__(self, other: object) -> bool:\n return isinstance(other, self.__class__)" }, { @@ -76295,7 +80943,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "other", @@ -76305,13 +80954,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __ge__(self, other: object) -> bool:\n return False" }, { @@ -76329,7 +80979,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "other", @@ -76339,13 +80990,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __gt__(self, other: object) -> bool:\n return False" }, { @@ -76363,13 +81015,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __hash__(self) -> int:\n return hash(repr(self))" }, { @@ -76387,7 +81040,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "other", @@ -76397,13 +81051,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __le__(self, other: object) -> bool:\n return True" }, { @@ -76421,7 +81076,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "other", @@ -76431,13 +81087,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __lt__(self, other: object) -> bool:\n return True" }, { @@ -76455,7 +81112,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "other", @@ -76465,13 +81123,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __ne__(self, other: object) -> bool:\n return not isinstance(other, self.__class__)" }, { @@ -76489,13 +81148,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __neg__(self: 
object) -> InfinityType:\n return Infinity" }, { @@ -76513,13 +81173,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __repr__(self) -> str:\n return '-Infinity'" }, { @@ -76537,7 +81198,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "version", @@ -76547,13 +81209,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, version: str) -> None:\n self._version = str(version)\n self._key = _legacy_cmpkey(self._version)\n warnings.warn('Creating a LegacyVersion has been deprecated and will be removed in the next major release', DeprecationWarning)" }, { @@ -76571,13 +81234,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __repr__(self) -> str:\n return f\"\"" }, { @@ -76595,13 +81259,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __str__(self) -> str:\n return self._version" }, { @@ -76619,13 +81284,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@property\ndef base_version(self) -> str:\n return self._version" }, { @@ -76643,13 +81309,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@property\ndef dev(self) -> None:\n return None" }, { @@ -76667,13 +81334,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@property\ndef epoch(self) -> int:\n return -1" }, { @@ -76691,13 +81359,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@property\ndef is_devrelease(self) -> bool:\n return False" }, { @@ -76715,13 +81384,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@property\ndef is_postrelease(self) -> bool:\n return False" }, { @@ -76739,13 +81409,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@property\ndef is_prerelease(self) -> bool:\n return False" }, { @@ -76763,13 +81434,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@property\ndef local(self) -> None:\n return None" }, { @@ -76787,13 +81459,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, 
"description": "", - "docstring": "", + "docstring": null, "source_code": "\n@property\ndef post(self) -> None:\n return None" }, { @@ -76811,13 +81484,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@property\ndef pre(self) -> None:\n return None" }, { @@ -76835,13 +81509,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@property\ndef public(self) -> str:\n return self._version" }, { @@ -76859,13 +81534,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@property\ndef release(self) -> None:\n return None" }, { @@ -76883,7 +81559,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "version", @@ -76893,13 +81570,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, version: str) -> None:\n match = self._regex.search(version)\n if not match:\n raise InvalidVersion(f\"Invalid version: '{version}'\")\n self._version = _Version(epoch=int(match.group('epoch')) if match.group('epoch') else 0, release=tuple((int(i) for i in match.group('release').split('.'))), pre=_parse_letter_version(match.group('pre_l'), match.group('pre_n')), post=_parse_letter_version(match.group('post_l'), match.group('post_n1') or match.group('post_n2')), dev=_parse_letter_version(match.group('dev_l'), match.group('dev_n')), local=_parse_local_version(match.group('local')))\n self._key = _cmpkey(self._version.epoch, self._version.release, self._version.pre, self._version.post, self._version.dev, self._version.local)" }, { @@ -76917,13 +81595,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __repr__(self) -> str:\n return f\"\"" }, { @@ -76941,13 +81620,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __str__(self) -> str:\n parts = []\n if self.epoch != 0:\n parts.append(f'{self.epoch}!')\n parts.append('.'.join((str(x) for x in self.release)))\n if self.pre is not None:\n parts.append(''.join((str(x) for x in self.pre)))\n if self.post is not None:\n parts.append(f'.post{self.post}')\n if self.dev is not None:\n parts.append(f'.dev{self.dev}')\n if self.local is not None:\n parts.append(f'+{self.local}')\n return ''.join(parts)" }, { @@ -76965,13 +81645,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@property\ndef base_version(self) -> str:\n parts = []\n if self.epoch != 0:\n parts.append(f'{self.epoch}!')\n parts.append('.'.join((str(x) for x in self.release)))\n return ''.join(parts)" }, { @@ -76989,13 +81670,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, 
"description": "", - "docstring": "", + "docstring": null, "source_code": "\n@property\ndef dev(self) -> Optional[int]:\n return self._version.dev[1] if self._version.dev else None" }, { @@ -77013,13 +81695,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@property\ndef epoch(self) -> int:\n _epoch: int = self._version.epoch\n return _epoch" }, { @@ -77037,13 +81720,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@property\ndef is_devrelease(self) -> bool:\n return self.dev is not None" }, { @@ -77061,13 +81745,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@property\ndef is_postrelease(self) -> bool:\n return self.post is not None" }, { @@ -77085,13 +81770,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@property\ndef is_prerelease(self) -> bool:\n return self.dev is not None or self.pre is not None" }, { @@ -77109,13 +81795,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@property\ndef local(self) -> Optional[str]:\n if self._version.local:\n return '.'.join((str(x) for x in self._version.local))\n else:\n return None" }, { @@ -77133,13 +81820,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@property\ndef major(self) -> int:\n return self.release[0] if len(self.release) >= 1 else 0" }, { @@ -77157,13 +81845,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@property\ndef micro(self) -> int:\n return self.release[2] if len(self.release) >= 3 else 0" }, { @@ -77181,13 +81870,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@property\ndef minor(self) -> int:\n return self.release[1] if len(self.release) >= 2 else 0" }, { @@ -77205,13 +81895,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@property\ndef post(self) -> Optional[int]:\n return self._version.post[1] if self._version.post else None" }, { @@ -77229,13 +81920,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@property\ndef pre(self) -> Optional[Tuple[str, int]]:\n _pre: Optional[Tuple[str, int]] = self._version.pre\n return _pre" }, { @@ -77253,13 +81945,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": 
false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@property\ndef public(self) -> str:\n return str(self).split('+', 1)[0]" }, { @@ -77277,13 +81970,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@property\ndef release(self) -> Tuple[int, ...]:\n _release: Tuple[int, ...] = self._version.release\n return _release" }, { @@ -77301,7 +81995,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "other", @@ -77311,13 +82006,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __eq__(self, other: object) -> bool:\n if not isinstance(other, _BaseVersion):\n return NotImplemented\n return self._key == other._key" }, { @@ -77335,7 +82031,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "other", @@ -77345,13 +82042,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __ge__(self, other: '_BaseVersion') -> bool:\n if not isinstance(other, _BaseVersion):\n return NotImplemented\n return self._key >= other._key" }, { @@ -77369,7 +82067,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "other", @@ -77379,13 +82078,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __gt__(self, other: '_BaseVersion') -> bool:\n if not isinstance(other, _BaseVersion):\n return NotImplemented\n return self._key > other._key" }, { @@ -77403,13 +82103,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __hash__(self) -> int:\n return hash(self._key)" }, { @@ -77427,7 +82128,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "other", @@ -77437,13 +82139,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __le__(self, other: '_BaseVersion') -> bool:\n if not isinstance(other, _BaseVersion):\n return NotImplemented\n return self._key <= other._key" }, { @@ -77461,7 +82164,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "other", @@ -77471,13 +82175,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __lt__(self, other: '_BaseVersion') -> bool:\n if not isinstance(other, _BaseVersion):\n return NotImplemented\n return self._key < other._key" }, { @@ -77495,7 +82200,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "other", @@ -77505,13 +82211,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": 
null, "source_code": "\ndef __ne__(self, other: object) -> bool:\n if not isinstance(other, _BaseVersion):\n return NotImplemented\n return self._key != other._key" }, { @@ -77529,7 +82236,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "release", @@ -77539,7 +82247,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "pre", @@ -77549,7 +82258,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "post", @@ -77559,7 +82269,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "dev", @@ -77569,7 +82280,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "local", @@ -77579,13 +82291,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _cmpkey(epoch: int, release: Tuple[int, ...], pre: Optional[Tuple[str, int]], post: Optional[Tuple[str, int]], dev: Optional[Tuple[str, int]], local: Optional[Tuple[SubLocalType]]) -> CmpKey:\n _release = tuple(reversed(list(itertools.dropwhile(lambda x: x == 0, reversed(release)))))\n if pre is None and post is None and dev is not None:\n _pre: PrePostDevType = NegativeInfinity\n elif pre is None:\n _pre = Infinity\n else:\n _pre = pre\n if post is None:\n _post: PrePostDevType = NegativeInfinity\n else:\n _post = post\n if dev is None:\n _dev: PrePostDevType = Infinity\n else:\n _dev = dev\n if local is None:\n _local: LocalType = NegativeInfinity\n else:\n _local = tuple(((i, '') if isinstance(i, int) else (NegativeInfinity, i) for i in local))\n return epoch, _release, _pre, _post, _dev, _local" }, { @@ -77603,13 +82316,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _legacy_cmpkey(version: str) -> LegacyCmpKey:\n epoch = -1\n parts: List[str] = []\n for part in _parse_version_parts(version.lower()):\n if part.startswith('*'):\n if part < '*final':\n while parts and parts[-1] == '*final-':\n parts.pop()\n while parts and parts[-1] == '00000000':\n parts.pop()\n parts.append(part)\n return epoch, tuple(parts)" }, { @@ -77627,7 +82341,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "number", @@ -77637,13 +82352,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _parse_letter_version(letter: str, number: Union[str, bytes, SupportsInt]) -> Optional[Tuple[str, int]]:\n if letter:\n if number is None:\n number = 0\n letter = letter.lower()\n if letter == 'alpha':\n letter = 'a'\n elif letter == 'beta':\n letter = 'b'\n elif letter in ['c', 'pre', 'preview']:\n letter = 'rc'\n elif letter in ['rev', 'r']:\n letter = 'post'\n return letter, int(number)\n if not letter and number:\n letter = 'post'\n return letter, int(number)\n return None" }, { @@ -77661,13 +82377,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Takes a string like abc.1.twelve and turns it into (\"abc\", 1, \"twelve\").", - "docstring": "Takes a string like abc.1.twelve and turns it into (\"abc\", 1, \"twelve\").", + 
"docstring": "\n Takes a string like abc.1.twelve and turns it into (\"abc\", 1, \"twelve\").\n ", "source_code": "\ndef _parse_local_version(local: str) -> Optional[LocalType]:\n \"\"\"\n Takes a string like abc.1.twelve and turns it into (\"abc\", 1, \"twelve\").\n \"\"\"\n if local is not None:\n return tuple((part.lower() if not part.isdigit() else int(part) for part in _local_version_separators.split(local)))\n return None" }, { @@ -77685,13 +82402,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _parse_version_parts(s: str) -> Iterator[str]:\n for part in _legacy_version_component_re.split(s):\n part = _legacy_version_replacement_map.get(part, part)\n if not part or part == '.':\n continue\n if part[:1] in '0123456789':\n yield part.zfill(8)\n else:\n yield '*' + part\n yield '*final'" }, { @@ -77709,13 +82427,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Parse the given version string and return either a :class:`Version` object or a :class:`LegacyVersion` object depending on if the given version is a valid PEP 440 version or a legacy version.", - "docstring": "Parse the given version string and return either a :class:`Version` object\nor a :class:`LegacyVersion` object depending on if the given version is\na valid PEP 440 version or a legacy version.", + "description": "Parse the given version string and return either a :class:`Version` object\nor a :class:`LegacyVersion` object depending on if the given version is\na valid PEP 440 version or a legacy version.", + "docstring": "\n Parse the given version string and return either a :class:`Version` object\n or a :class:`LegacyVersion` object depending on if the given version is\n a valid PEP 440 version or a legacy version.\n ", "source_code": "\ndef parse(version: str) -> Union['LegacyVersion', 'Version']:\n \"\"\"\n Parse the given version string and return either a :class:`Version` object\n or a :class:`LegacyVersion` object depending on if the given version is\n a valid PEP 440 version or a legacy version.\n \"\"\"\n try:\n return Version(version)\n except InvalidVersion:\n return LegacyVersion(version)" }, { @@ -77733,7 +82452,8 @@ "docstring": { "type": "ndarray", "description": "PIL image data array." - } + }, + "refined_type": {} }, { "name": "cmin", @@ -77743,7 +82463,8 @@ "docstring": { "type": "scalar, default=None", "description": "Bias scaling of small values. Default is ``data.min()``." - } + }, + "refined_type": {} }, { "name": "cmax", @@ -77753,7 +82474,8 @@ "docstring": { "type": "scalar, default=None", "description": "Bias scaling of large values. Default is ``data.max()``." - } + }, + "refined_type": {} }, { "name": "high", @@ -77763,7 +82485,8 @@ "docstring": { "type": "scalar, default=None", "description": "Scale max value to `high`. Default is 255." - } + }, + "refined_type": {} }, { "name": "low", @@ -77773,13 +82496,14 @@ "docstring": { "type": "scalar, default=None", "description": "Scale min value to `low`. Default is 0." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Byte scales an array (image).\n\nByte scaling means converting the input image to uint8 dtype and scaling the range to ``(low, high)`` (default 0-255). If the input image already has dtype uint8, no scaling is done. 
This function is only available if Python Imaging Library (PIL) is installed.", - "docstring": "Byte scales an array (image).\n\nByte scaling means converting the input image to uint8 dtype and scaling\nthe range to ``(low, high)`` (default 0-255).\nIf the input image already has dtype uint8, no scaling is done.\n\nThis function is only available if Python Imaging Library (PIL) is installed.\n\nParameters\n----------\ndata : ndarray\n PIL image data array.\ncmin : scalar, default=None\n Bias scaling of small values. Default is ``data.min()``.\ncmax : scalar, default=None\n Bias scaling of large values. Default is ``data.max()``.\nhigh : scalar, default=None\n Scale max value to `high`. Default is 255.\nlow : scalar, default=None\n Scale min value to `low`. Default is 0.\n\nReturns\n-------\nimg_array : uint8 ndarray\n The byte-scaled array.\n\nExamples\n--------\n>>> import numpy as np\n>>> from scipy.misc import bytescale\n>>> img = np.array([[ 91.06794177, 3.39058326, 84.4221549 ],\n... [ 73.88003259, 80.91433048, 4.88878881],\n... [ 51.53875334, 34.45808177, 27.5873488 ]])\n>>> bytescale(img)\narray([[255, 0, 236],\n [205, 225, 4],\n [140, 90, 70]], dtype=uint8)\n>>> bytescale(img, high=200, low=100)\narray([[200, 100, 192],\n [180, 188, 102],\n [155, 135, 128]], dtype=uint8)\n>>> bytescale(img, cmin=0, cmax=255)\narray([[91, 3, 84],\n [74, 81, 5],\n [52, 34, 28]], dtype=uint8)", + "description": "Byte scales an array (image).\n\nByte scaling means converting the input image to uint8 dtype and scaling\nthe range to ``(low, high)`` (default 0-255).\nIf the input image already has dtype uint8, no scaling is done.\n\nThis function is only available if Python Imaging Library (PIL) is installed.", + "docstring": "\n Byte scales an array (image).\n\n Byte scaling means converting the input image to uint8 dtype and scaling\n the range to ``(low, high)`` (default 0-255).\n If the input image already has dtype uint8, no scaling is done.\n\n This function is only available if Python Imaging Library (PIL) is installed.\n\n Parameters\n ----------\n data : ndarray\n PIL image data array.\n cmin : scalar, default=None\n Bias scaling of small values. Default is ``data.min()``.\n cmax : scalar, default=None\n Bias scaling of large values. Default is ``data.max()``.\n high : scalar, default=None\n Scale max value to `high`. Default is 255.\n low : scalar, default=None\n Scale min value to `low`. Default is 0.\n\n Returns\n -------\n img_array : uint8 ndarray\n The byte-scaled array.\n\n Examples\n --------\n >>> import numpy as np\n >>> from scipy.misc import bytescale\n >>> img = np.array([[ 91.06794177, 3.39058326, 84.4221549 ],\n ... [ 73.88003259, 80.91433048, 4.88878881],\n ... 
[ 51.53875334, 34.45808177, 27.5873488 ]])\n >>> bytescale(img)\n array([[255, 0, 236],\n [205, 225, 4],\n [140, 90, 70]], dtype=uint8)\n >>> bytescale(img, high=200, low=100)\n array([[200, 100, 192],\n [180, 188, 102],\n [155, 135, 128]], dtype=uint8)\n >>> bytescale(img, cmin=0, cmax=255)\n array([[91, 3, 84],\n [74, 81, 5],\n [52, 34, 28]], dtype=uint8)\n\n ", "source_code": "\ndef bytescale(data, cmin=None, cmax=None, high=255, low=0):\n \"\"\"\n Byte scales an array (image).\n\n Byte scaling means converting the input image to uint8 dtype and scaling\n the range to ``(low, high)`` (default 0-255).\n If the input image already has dtype uint8, no scaling is done.\n\n This function is only available if Python Imaging Library (PIL) is installed.\n\n Parameters\n ----------\n data : ndarray\n PIL image data array.\n cmin : scalar, default=None\n Bias scaling of small values. Default is ``data.min()``.\n cmax : scalar, default=None\n Bias scaling of large values. Default is ``data.max()``.\n high : scalar, default=None\n Scale max value to `high`. Default is 255.\n low : scalar, default=None\n Scale min value to `low`. Default is 0.\n\n Returns\n -------\n img_array : uint8 ndarray\n The byte-scaled array.\n\n Examples\n --------\n >>> import numpy as np\n >>> from scipy.misc import bytescale\n >>> img = np.array([[ 91.06794177, 3.39058326, 84.4221549 ],\n ... [ 73.88003259, 80.91433048, 4.88878881],\n ... [ 51.53875334, 34.45808177, 27.5873488 ]])\n >>> bytescale(img)\n array([[255, 0, 236],\n [205, 225, 4],\n [140, 90, 70]], dtype=uint8)\n >>> bytescale(img, high=200, low=100)\n array([[200, 100, 192],\n [180, 188, 102],\n [155, 135, 128]], dtype=uint8)\n >>> bytescale(img, cmin=0, cmax=255)\n array([[91, 3, 84],\n [74, 81, 5],\n [52, 34, 28]], dtype=uint8)\n\n \"\"\"\n if data.dtype == uint8:\n return data\n if high > 255:\n raise ValueError('`high` should be less than or equal to 255.')\n if low < 0:\n raise ValueError('`low` should be greater than or equal to 0.')\n if high < low:\n raise ValueError('`high` should be greater than or equal to `low`.')\n if cmin is None:\n cmin = data.min()\n if cmax is None:\n cmax = data.max()\n cscale = cmax - cmin\n if cscale < 0:\n raise ValueError('`cmax` should be larger than `cmin`.')\n elif cscale == 0:\n cscale = 1\n scale = float(high - low) / cscale\n bytedata = (data - cmin) * scale + low\n return (bytedata.clip(low, high) + 0.5).astype(uint8)" }, { @@ -77797,7 +82521,8 @@ "docstring": { "type": "PIL image", "description": "Input image." - } + }, + "refined_type": {} }, { "name": "flatten", @@ -77807,7 +82532,8 @@ "docstring": { "type": "bool, default=False", "description": "If true, convert the output to grey-scale." - } + }, + "refined_type": {} }, { "name": "mode", @@ -77817,13 +82543,14 @@ "docstring": { "type": "str, default=None", "description": "Mode to convert image to, e.g. ``'RGB'``. See the Notes of the\n`imread` docstring for more details." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Return a copy of a PIL image as a numpy array.\n\nThis function is only available if Python Imaging Library (PIL) is installed.", - "docstring": "Return a copy of a PIL image as a numpy array.\n\nThis function is only available if Python Imaging Library (PIL) is installed.\n\nParameters\n----------\nim : PIL image\n Input image.\nflatten : bool, default=False\n If true, convert the output to grey-scale.\nmode : str, default=None\n Mode to convert image to, e.g. ``'RGB'``. 
See the Notes of the\n `imread` docstring for more details.\n\nReturns\n-------\nfromimage : ndarray\n The different colour bands/channels are stored in the\n third dimension, such that a grey-image is MxN, an\n RGB-image MxNx3 and an RGBA-image MxNx4.", + "docstring": "\n Return a copy of a PIL image as a numpy array.\n\n This function is only available if Python Imaging Library (PIL) is installed.\n\n Parameters\n ----------\n im : PIL image\n Input image.\n flatten : bool, default=False\n If true, convert the output to grey-scale.\n mode : str, default=None\n Mode to convert image to, e.g. ``'RGB'``. See the Notes of the\n `imread` docstring for more details.\n\n Returns\n -------\n fromimage : ndarray\n The different colour bands/channels are stored in the\n third dimension, such that a grey-image is MxN, an\n RGB-image MxNx3 and an RGBA-image MxNx4.\n\n ", "source_code": "\ndef fromimage(im, flatten=False, mode=None):\n \"\"\"\n Return a copy of a PIL image as a numpy array.\n\n This function is only available if Python Imaging Library (PIL) is installed.\n\n Parameters\n ----------\n im : PIL image\n Input image.\n flatten : bool, default=False\n If true, convert the output to grey-scale.\n mode : str, default=None\n Mode to convert image to, e.g. ``'RGB'``. See the Notes of the\n `imread` docstring for more details.\n\n Returns\n -------\n fromimage : ndarray\n The different colour bands/channels are stored in the\n third dimension, such that a grey-image is MxN, an\n RGB-image MxNx3 and an RGBA-image MxNx4.\n\n \"\"\"\n if not pillow_installed:\n raise ImportError(PILLOW_ERROR_MESSAGE)\n if not Image.isImageType(im):\n raise TypeError('Input is not a PIL image.')\n if mode is not None:\n if mode != im.mode:\n im = im.convert(mode)\n elif im.mode == 'P':\n if 'transparency' in im.info:\n im = im.convert('RGBA')\n else:\n im = im.convert('RGB')\n if flatten:\n im = im.convert('F')\n elif im.mode == '1':\n im = im.convert('L')\n a = array(im)\n return a" }, { @@ -77841,7 +82568,8 @@ "docstring": { "type": "str or file object", "description": "The file name or file object to be read." - } + }, + "refined_type": {} }, { "name": "flatten", @@ -77851,7 +82579,8 @@ "docstring": { "type": "bool, default=False", "description": "If True, flattens the color layers into a single gray-scale layer." - } + }, + "refined_type": {} }, { "name": "mode", @@ -77861,13 +82590,14 @@ "docstring": { "type": "str, default=None", "description": "Mode to convert image to, e.g. ``'RGB'``. See the Notes for more\ndetails." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Read an image from a file as an array.\n\nThis function is only available if Python Imaging Library (PIL) is installed.", - "docstring": "Read an image from a file as an array.\n\nThis function is only available if Python Imaging Library (PIL) is installed.\n\nParameters\n----------\nname : str or file object\n The file name or file object to be read.\nflatten : bool, default=False\n If True, flattens the color layers into a single gray-scale layer.\nmode : str, default=None\n Mode to convert image to, e.g. ``'RGB'``. 
See the Notes for more\n details.\n\nReturns\n-------\nimread : ndarray\n The array obtained by reading the image.\n\nNotes\n-----\n`imread` uses the Python Imaging Library (PIL) to read an image.\nThe following notes are from the PIL documentation.\n\n`mode` can be one of the following strings:\n\n* 'L' (8-bit pixels, black and white)\n* 'P' (8-bit pixels, mapped to any other mode using a color palette)\n* 'RGB' (3x8-bit pixels, true color)\n* 'RGBA' (4x8-bit pixels, true color with transparency mask)\n* 'CMYK' (4x8-bit pixels, color separation)\n* 'YCbCr' (3x8-bit pixels, color video format)\n* 'I' (32-bit signed integer pixels)\n* 'F' (32-bit floating point pixels)\n\nPIL also provides limited support for a few special modes, including\n'LA' ('L' with alpha), 'RGBX' (true color with padding) and 'RGBa'\n(true color with premultiplied alpha).\n\nWhen translating a color image to black and white (mode 'L', 'I' or\n'F'), the library uses the ITU-R 601-2 luma transform::\n\n L = R * 299/1000 + G * 587/1000 + B * 114/1000\n\nWhen `flatten` is True, the image is converted using mode 'F'.\nWhen `mode` is not None and `flatten` is True, the image is first\nconverted according to `mode`, and the result is then flattened using\nmode 'F'.", + "docstring": "\n Read an image from a file as an array.\n\n This function is only available if Python Imaging Library (PIL) is installed.\n\n Parameters\n ----------\n name : str or file object\n The file name or file object to be read.\n flatten : bool, default=False\n If True, flattens the color layers into a single gray-scale layer.\n mode : str, default=None\n Mode to convert image to, e.g. ``'RGB'``. See the Notes for more\n details.\n\n Returns\n -------\n imread : ndarray\n The array obtained by reading the image.\n\n Notes\n -----\n `imread` uses the Python Imaging Library (PIL) to read an image.\n The following notes are from the PIL documentation.\n\n `mode` can be one of the following strings:\n\n * 'L' (8-bit pixels, black and white)\n * 'P' (8-bit pixels, mapped to any other mode using a color palette)\n * 'RGB' (3x8-bit pixels, true color)\n * 'RGBA' (4x8-bit pixels, true color with transparency mask)\n * 'CMYK' (4x8-bit pixels, color separation)\n * 'YCbCr' (3x8-bit pixels, color video format)\n * 'I' (32-bit signed integer pixels)\n * 'F' (32-bit floating point pixels)\n\n PIL also provides limited support for a few special modes, including\n 'LA' ('L' with alpha), 'RGBX' (true color with padding) and 'RGBa'\n (true color with premultiplied alpha).\n\n When translating a color image to black and white (mode 'L', 'I' or\n 'F'), the library uses the ITU-R 601-2 luma transform::\n\n L = R * 299/1000 + G * 587/1000 + B * 114/1000\n\n When `flatten` is True, the image is converted using mode 'F'.\n When `mode` is not None and `flatten` is True, the image is first\n converted according to `mode`, and the result is then flattened using\n mode 'F'.\n\n ", "source_code": "\ndef imread(name, flatten=False, mode=None):\n \"\"\"\n Read an image from a file as an array.\n\n This function is only available if Python Imaging Library (PIL) is installed.\n\n Parameters\n ----------\n name : str or file object\n The file name or file object to be read.\n flatten : bool, default=False\n If True, flattens the color layers into a single gray-scale layer.\n mode : str, default=None\n Mode to convert image to, e.g. ``'RGB'``. 
See the Notes for more\n details.\n\n Returns\n -------\n imread : ndarray\n The array obtained by reading the image.\n\n Notes\n -----\n `imread` uses the Python Imaging Library (PIL) to read an image.\n The following notes are from the PIL documentation.\n\n `mode` can be one of the following strings:\n\n * 'L' (8-bit pixels, black and white)\n * 'P' (8-bit pixels, mapped to any other mode using a color palette)\n * 'RGB' (3x8-bit pixels, true color)\n * 'RGBA' (4x8-bit pixels, true color with transparency mask)\n * 'CMYK' (4x8-bit pixels, color separation)\n * 'YCbCr' (3x8-bit pixels, color video format)\n * 'I' (32-bit signed integer pixels)\n * 'F' (32-bit floating point pixels)\n\n PIL also provides limited support for a few special modes, including\n 'LA' ('L' with alpha), 'RGBX' (true color with padding) and 'RGBa'\n (true color with premultiplied alpha).\n\n When translating a color image to black and white (mode 'L', 'I' or\n 'F'), the library uses the ITU-R 601-2 luma transform::\n\n L = R * 299/1000 + G * 587/1000 + B * 114/1000\n\n When `flatten` is True, the image is converted using mode 'F'.\n When `mode` is not None and `flatten` is True, the image is first\n converted according to `mode`, and the result is then flattened using\n mode 'F'.\n\n \"\"\"\n if not pillow_installed:\n raise ImportError(PILLOW_ERROR_MESSAGE)\n im = Image.open(name)\n return fromimage(im, flatten=flatten, mode=mode)" }, { @@ -77885,7 +82615,8 @@ "docstring": { "type": "ndarray", "description": "The array of image to be resized." - } + }, + "refined_type": {} }, { "name": "size", @@ -77895,7 +82626,8 @@ "docstring": { "type": "int, float or tuple", "description": "* int - Percentage of current size.\n* float - Fraction of current size.\n* tuple - Size of the output image (height, width)." - } + }, + "refined_type": {} }, { "name": "interp", @@ -77905,7 +82637,8 @@ "docstring": { "type": "str, default='bilinear'", "description": "Interpolation to use for re-sizing ('nearest', 'lanczos', 'bilinear',\n'bicubic' or 'cubic')." - } + }, + "refined_type": {} }, { "name": "mode", @@ -77915,13 +82648,14 @@ "docstring": { "type": "str, default=None", "description": "The PIL image mode ('P', 'L', etc.) to convert `arr` before resizing.\nIf ``mode=None`` (the default), 2-D images will be treated like\n``mode='L'``, i.e. casting to long integer. For 3-D and 4-D arrays,\n`mode` will be set to ``'RGB'`` and ``'RGBA'`` respectively." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Resize an image.\n\nThis function is only available if Python Imaging Library (PIL) is installed. .. warning:: This function uses `bytescale` under the hood to rescale images to use the full (0, 255) range if ``mode`` is one of ``None, 'L', 'P', 'l'``. It will also cast data for 2-D images to ``uint32`` for ``mode=None`` (which is the default).", - "docstring": "Resize an image.\n\nThis function is only available if Python Imaging Library (PIL) is installed.\n\n.. 
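The `imread` notes recorded above quote the ITU-R 601-2 luma transform that PIL applies when flattening to grey-scale. As a hedged illustration (not part of the diff), the same weighting applied with plain NumPy; the pixel values are made up.

```python
# Minimal sketch of the ITU-R 601-2 weighting quoted above; pixel values are
# illustrative only.
import numpy as np

rgb = np.array([[[255, 0, 0], [0, 255, 0], [0, 0, 255]]], dtype=np.float64)
luma = rgb @ np.array([299.0, 587.0, 114.0]) / 1000.0  # L = R*299/1000 + G*587/1000 + B*114/1000
print(luma)   # [[ 76.245 149.685  29.07 ]]
```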
warning::\n\n This function uses `bytescale` under the hood to rescale images to use\n the full (0, 255) range if ``mode`` is one of ``None, 'L', 'P', 'l'``.\n It will also cast data for 2-D images to ``uint32`` for ``mode=None``\n (which is the default).\n\nParameters\n----------\narr : ndarray\n The array of image to be resized.\nsize : int, float or tuple\n * int - Percentage of current size.\n * float - Fraction of current size.\n * tuple - Size of the output image (height, width).\n\ninterp : str, default='bilinear'\n Interpolation to use for re-sizing ('nearest', 'lanczos', 'bilinear',\n 'bicubic' or 'cubic').\nmode : str, default=None\n The PIL image mode ('P', 'L', etc.) to convert `arr` before resizing.\n If ``mode=None`` (the default), 2-D images will be treated like\n ``mode='L'``, i.e. casting to long integer. For 3-D and 4-D arrays,\n `mode` will be set to ``'RGB'`` and ``'RGBA'`` respectively.\n\nReturns\n-------\nimresize : ndarray\n The resized array of image.\n\nSee Also\n--------\ntoimage : Implicitly used to convert `arr` according to `mode`.\nscipy.ndimage.zoom : More generic implementation that does not use PIL.", + "description": "Resize an image.\n\nThis function is only available if Python Imaging Library (PIL) is installed.\n\n.. warning::\n\n This function uses `bytescale` under the hood to rescale images to use\n the full (0, 255) range if ``mode`` is one of ``None, 'L', 'P', 'l'``.\n It will also cast data for 2-D images to ``uint32`` for ``mode=None``\n (which is the default).", + "docstring": "\n Resize an image.\n\n This function is only available if Python Imaging Library (PIL) is installed.\n\n .. warning::\n\n This function uses `bytescale` under the hood to rescale images to use\n the full (0, 255) range if ``mode`` is one of ``None, 'L', 'P', 'l'``.\n It will also cast data for 2-D images to ``uint32`` for ``mode=None``\n (which is the default).\n\n Parameters\n ----------\n arr : ndarray\n The array of image to be resized.\n size : int, float or tuple\n * int - Percentage of current size.\n * float - Fraction of current size.\n * tuple - Size of the output image (height, width).\n\n interp : str, default='bilinear'\n Interpolation to use for re-sizing ('nearest', 'lanczos', 'bilinear',\n 'bicubic' or 'cubic').\n mode : str, default=None\n The PIL image mode ('P', 'L', etc.) to convert `arr` before resizing.\n If ``mode=None`` (the default), 2-D images will be treated like\n ``mode='L'``, i.e. casting to long integer. For 3-D and 4-D arrays,\n `mode` will be set to ``'RGB'`` and ``'RGBA'`` respectively.\n\n Returns\n -------\n imresize : ndarray\n The resized array of image.\n\n See Also\n --------\n toimage : Implicitly used to convert `arr` according to `mode`.\n scipy.ndimage.zoom : More generic implementation that does not use PIL.\n\n ", "source_code": "\ndef imresize(arr, size, interp='bilinear', mode=None):\n \"\"\"\n Resize an image.\n\n This function is only available if Python Imaging Library (PIL) is installed.\n\n .. 
warning::\n\n This function uses `bytescale` under the hood to rescale images to use\n the full (0, 255) range if ``mode`` is one of ``None, 'L', 'P', 'l'``.\n It will also cast data for 2-D images to ``uint32`` for ``mode=None``\n (which is the default).\n\n Parameters\n ----------\n arr : ndarray\n The array of image to be resized.\n size : int, float or tuple\n * int - Percentage of current size.\n * float - Fraction of current size.\n * tuple - Size of the output image (height, width).\n\n interp : str, default='bilinear'\n Interpolation to use for re-sizing ('nearest', 'lanczos', 'bilinear',\n 'bicubic' or 'cubic').\n mode : str, default=None\n The PIL image mode ('P', 'L', etc.) to convert `arr` before resizing.\n If ``mode=None`` (the default), 2-D images will be treated like\n ``mode='L'``, i.e. casting to long integer. For 3-D and 4-D arrays,\n `mode` will be set to ``'RGB'`` and ``'RGBA'`` respectively.\n\n Returns\n -------\n imresize : ndarray\n The resized array of image.\n\n See Also\n --------\n toimage : Implicitly used to convert `arr` according to `mode`.\n scipy.ndimage.zoom : More generic implementation that does not use PIL.\n\n \"\"\"\n im = toimage(arr, mode=mode)\n ts = type(size)\n if issubdtype(ts, numpy.signedinteger):\n percent = size / 100.0\n size = tuple((array(im.size) * percent).astype(int))\n elif issubdtype(type(size), numpy.floating):\n size = tuple((array(im.size) * size).astype(int))\n else:\n size = (size[1], size[0])\n func = {'nearest': 0, 'lanczos': 1, 'bilinear': 2, 'bicubic': 3, 'cubic': 3}\n imnew = im.resize(size, resample=func[interp])\n return fromimage(imnew)" }, { @@ -77939,7 +82673,8 @@ "docstring": { "type": "str or file object", "description": "Output file name or file object." - } + }, + "refined_type": {} }, { "name": "arr", @@ -77949,7 +82684,8 @@ "docstring": { "type": "ndarray, MxN or MxNx3 or MxNx4", "description": "Array containing image values. If the shape is ``MxN``, the array\nrepresents a grey-level image. Shape ``MxNx3`` stores the red, green\nand blue bands along the last dimension. An alpha layer may be\nincluded, specified as the last colour band of an ``MxNx4`` array." - } + }, + "refined_type": {} }, { "name": "format", @@ -77959,13 +82695,14 @@ "docstring": { "type": "str, default=None", "description": "Image format. If omitted, the format to use is determined from the\nfile name extension. If a file object was used instead of a file name,\nthis parameter should always be used." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Save an array as an image.\n\nThis function is only available if Python Imaging Library (PIL) is installed. .. warning:: This function uses `bytescale` under the hood to rescale images to use the full (0, 255) range if ``mode`` is one of ``None, 'L', 'P', 'l'``. It will also cast data for 2-D images to ``uint32`` for ``mode=None`` (which is the default).", - "docstring": "Save an array as an image.\n\nThis function is only available if Python Imaging Library (PIL) is installed.\n\n.. warning::\n\n This function uses `bytescale` under the hood to rescale images to use\n the full (0, 255) range if ``mode`` is one of ``None, 'L', 'P', 'l'``.\n It will also cast data for 2-D images to ``uint32`` for ``mode=None``\n (which is the default).\n\nParameters\n----------\nname : str or file object\n Output file name or file object.\narr : ndarray, MxN or MxNx3 or MxNx4\n Array containing image values. If the shape is ``MxN``, the array\n represents a grey-level image. 
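The `imresize` entry above accepts an int (percentage of the current size), a float (fraction), or a `(height, width)` tuple. As a hedged sketch (not part of the diff), the same three spellings reproduced with Pillow directly; the array shape and target sizes are illustrative, and Pillow must be installed.

```python
# Minimal sketch, assuming Pillow is installed; sizes are illustrative only.
# Note imresize's tuple is (height, width) while PIL.Image.size is (width, height).
import numpy as np
from PIL import Image

im = Image.fromarray(np.zeros((40, 60), dtype=np.uint8))   # PIL size == (60, 40)

by_percent  = im.resize(tuple(int(s * 50 / 100.0) for s in im.size))  # int size  -> percent
by_fraction = im.resize(tuple(int(s * 0.5) for s in im.size))         # float size -> fraction
explicit    = im.resize((30, 20))                                     # (width, height) for PIL

assert by_percent.size == by_fraction.size == explicit.size == (30, 20)
```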
Shape ``MxNx3`` stores the red, green\n and blue bands along the last dimension. An alpha layer may be\n included, specified as the last colour band of an ``MxNx4`` array.\nformat : str, default=None\n Image format. If omitted, the format to use is determined from the\n file name extension. If a file object was used instead of a file name,\n this parameter should always be used.\n\nExamples\n--------\nConstruct an array of gradient intensity values and save to file:\n\n>>> import numpy as np\n>>> from scipy.misc import imsave\n>>> x = np.zeros((255, 255))\n>>> x = np.zeros((255, 255), dtype=np.uint8)\n>>> x[:] = np.arange(255)\n>>> imsave('gradient.png', x)\n\nConstruct an array with three colour bands (R, G, B) and store to file:\n\n>>> rgb = np.zeros((255, 255, 3), dtype=np.uint8)\n>>> rgb[..., 0] = np.arange(255)\n>>> rgb[..., 1] = 55\n>>> rgb[..., 2] = 1 - np.arange(255)\n>>> imsave('rgb_gradient.png', rgb)", + "description": "Save an array as an image.\n\nThis function is only available if Python Imaging Library (PIL) is installed.\n\n.. warning::\n\n This function uses `bytescale` under the hood to rescale images to use\n the full (0, 255) range if ``mode`` is one of ``None, 'L', 'P', 'l'``.\n It will also cast data for 2-D images to ``uint32`` for ``mode=None``\n (which is the default).", + "docstring": "\n Save an array as an image.\n\n This function is only available if Python Imaging Library (PIL) is installed.\n\n .. warning::\n\n This function uses `bytescale` under the hood to rescale images to use\n the full (0, 255) range if ``mode`` is one of ``None, 'L', 'P', 'l'``.\n It will also cast data for 2-D images to ``uint32`` for ``mode=None``\n (which is the default).\n\n Parameters\n ----------\n name : str or file object\n Output file name or file object.\n arr : ndarray, MxN or MxNx3 or MxNx4\n Array containing image values. If the shape is ``MxN``, the array\n represents a grey-level image. Shape ``MxNx3`` stores the red, green\n and blue bands along the last dimension. An alpha layer may be\n included, specified as the last colour band of an ``MxNx4`` array.\n format : str, default=None\n Image format. If omitted, the format to use is determined from the\n file name extension. If a file object was used instead of a file name,\n this parameter should always be used.\n\n Examples\n --------\n Construct an array of gradient intensity values and save to file:\n\n >>> import numpy as np\n >>> from scipy.misc import imsave\n >>> x = np.zeros((255, 255))\n >>> x = np.zeros((255, 255), dtype=np.uint8)\n >>> x[:] = np.arange(255)\n >>> imsave('gradient.png', x)\n\n Construct an array with three colour bands (R, G, B) and store to file:\n\n >>> rgb = np.zeros((255, 255, 3), dtype=np.uint8)\n >>> rgb[..., 0] = np.arange(255)\n >>> rgb[..., 1] = 55\n >>> rgb[..., 2] = 1 - np.arange(255)\n >>> imsave('rgb_gradient.png', rgb)\n\n ", "source_code": "\ndef imsave(name, arr, format=None):\n \"\"\"\n Save an array as an image.\n\n This function is only available if Python Imaging Library (PIL) is installed.\n\n .. warning::\n\n This function uses `bytescale` under the hood to rescale images to use\n the full (0, 255) range if ``mode`` is one of ``None, 'L', 'P', 'l'``.\n It will also cast data for 2-D images to ``uint32`` for ``mode=None``\n (which is the default).\n\n Parameters\n ----------\n name : str or file object\n Output file name or file object.\n arr : ndarray, MxN or MxNx3 or MxNx4\n Array containing image values. If the shape is ``MxN``, the array\n represents a grey-level image. 
Shape ``MxNx3`` stores the red, green\n and blue bands along the last dimension. An alpha layer may be\n included, specified as the last colour band of an ``MxNx4`` array.\n format : str, default=None\n Image format. If omitted, the format to use is determined from the\n file name extension. If a file object was used instead of a file name,\n this parameter should always be used.\n\n Examples\n --------\n Construct an array of gradient intensity values and save to file:\n\n >>> import numpy as np\n >>> from scipy.misc import imsave\n >>> x = np.zeros((255, 255))\n >>> x = np.zeros((255, 255), dtype=np.uint8)\n >>> x[:] = np.arange(255)\n >>> imsave('gradient.png', x)\n\n Construct an array with three colour bands (R, G, B) and store to file:\n\n >>> rgb = np.zeros((255, 255, 3), dtype=np.uint8)\n >>> rgb[..., 0] = np.arange(255)\n >>> rgb[..., 1] = 55\n >>> rgb[..., 2] = 1 - np.arange(255)\n >>> imsave('rgb_gradient.png', rgb)\n\n \"\"\"\n im = toimage(arr, channel_axis=2)\n if format is None:\n im.save(name)\n else:\n im.save(name, format)\n return" }, { @@ -77983,7 +82720,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "high", @@ -77993,7 +82731,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "low", @@ -78003,7 +82742,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "cmin", @@ -78013,7 +82753,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "cmax", @@ -78023,7 +82764,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "pal", @@ -78033,7 +82775,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "mode", @@ -78043,7 +82786,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "channel_axis", @@ -78053,13 +82797,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Takes a numpy array and returns a PIL image.\n\nThis function is only available if Python Imaging Library (PIL) is installed. The mode of the PIL image depends on the array shape and the `pal` and `mode` keywords. For 2-D arrays, if `pal` is a valid (N,3) byte-array giving the RGB values (from 0 to 255) then ``mode='P'``, otherwise ``mode='L'``, unless mode is given as 'F' or 'I' in which case a float and/or integer array is made. .. warning:: This function uses `bytescale` under the hood to rescale images to use the full (0, 255) range if ``mode`` is one of ``None, 'L', 'P', 'l'``. It will also cast data for 2-D images to ``uint32`` for ``mode=None`` (which is the default).", - "docstring": "Takes a numpy array and returns a PIL image.\n\nThis function is only available if Python Imaging Library (PIL) is installed.\n\nThe mode of the PIL image depends on the array shape and the `pal` and\n`mode` keywords.\n\nFor 2-D arrays, if `pal` is a valid (N,3) byte-array giving the RGB values\n(from 0 to 255) then ``mode='P'``, otherwise ``mode='L'``, unless mode\nis given as 'F' or 'I' in which case a float and/or integer array is made.\n\n.. 
warning::\n\n This function uses `bytescale` under the hood to rescale images to use\n the full (0, 255) range if ``mode`` is one of ``None, 'L', 'P', 'l'``.\n It will also cast data for 2-D images to ``uint32`` for ``mode=None``\n (which is the default).\n\nNotes\n-----\nFor 3-D arrays, the `channel_axis` argument tells which dimension of the\narray holds the channel data.\n\nFor 3-D arrays if one of the dimensions is 3, the mode is 'RGB'\nby default or 'YCbCr' if selected.\n\nThe numpy array must be either 2 dimensional or 3 dimensional.", + "description": "Takes a numpy array and returns a PIL image.\n\nThis function is only available if Python Imaging Library (PIL) is installed.\n\nThe mode of the PIL image depends on the array shape and the `pal` and\n`mode` keywords.\n\nFor 2-D arrays, if `pal` is a valid (N,3) byte-array giving the RGB values\n(from 0 to 255) then ``mode='P'``, otherwise ``mode='L'``, unless mode\nis given as 'F' or 'I' in which case a float and/or integer array is made.\n\n.. warning::\n\n This function uses `bytescale` under the hood to rescale images to use\n the full (0, 255) range if ``mode`` is one of ``None, 'L', 'P', 'l'``.\n It will also cast data for 2-D images to ``uint32`` for ``mode=None``\n (which is the default).", + "docstring": "Takes a numpy array and returns a PIL image.\n\n This function is only available if Python Imaging Library (PIL) is installed.\n\n The mode of the PIL image depends on the array shape and the `pal` and\n `mode` keywords.\n\n For 2-D arrays, if `pal` is a valid (N,3) byte-array giving the RGB values\n (from 0 to 255) then ``mode='P'``, otherwise ``mode='L'``, unless mode\n is given as 'F' or 'I' in which case a float and/or integer array is made.\n\n .. warning::\n\n This function uses `bytescale` under the hood to rescale images to use\n the full (0, 255) range if ``mode`` is one of ``None, 'L', 'P', 'l'``.\n It will also cast data for 2-D images to ``uint32`` for ``mode=None``\n (which is the default).\n\n Notes\n -----\n For 3-D arrays, the `channel_axis` argument tells which dimension of the\n array holds the channel data.\n\n For 3-D arrays if one of the dimensions is 3, the mode is 'RGB'\n by default or 'YCbCr' if selected.\n\n The numpy array must be either 2 dimensional or 3 dimensional.\n\n ", "source_code": "\ndef toimage(arr, high=255, low=0, cmin=None, cmax=None, pal=None, mode=None, channel_axis=None):\n \"\"\"Takes a numpy array and returns a PIL image.\n\n This function is only available if Python Imaging Library (PIL) is installed.\n\n The mode of the PIL image depends on the array shape and the `pal` and\n `mode` keywords.\n\n For 2-D arrays, if `pal` is a valid (N,3) byte-array giving the RGB values\n (from 0 to 255) then ``mode='P'``, otherwise ``mode='L'``, unless mode\n is given as 'F' or 'I' in which case a float and/or integer array is made.\n\n .. 
warning::\n\n This function uses `bytescale` under the hood to rescale images to use\n the full (0, 255) range if ``mode`` is one of ``None, 'L', 'P', 'l'``.\n It will also cast data for 2-D images to ``uint32`` for ``mode=None``\n (which is the default).\n\n Notes\n -----\n For 3-D arrays, the `channel_axis` argument tells which dimension of the\n array holds the channel data.\n\n For 3-D arrays if one of the dimensions is 3, the mode is 'RGB'\n by default or 'YCbCr' if selected.\n\n The numpy array must be either 2 dimensional or 3 dimensional.\n\n \"\"\"\n if not pillow_installed:\n raise ImportError(PILLOW_ERROR_MESSAGE)\n data = asarray(arr)\n if iscomplexobj(data):\n raise ValueError('Cannot convert a complex-valued array.')\n shape = list(data.shape)\n valid = len(shape) == 2 or len(shape) == 3 and (3 in shape or 4 in shape)\n if not valid:\n raise ValueError(\"'arr' does not have a suitable array shape for any mode.\")\n if len(shape) == 2:\n shape = (shape[1], shape[0])\n if mode == 'F':\n data32 = data.astype(numpy.float32)\n image = Image.frombytes(mode, shape, data32.tobytes())\n return image\n if mode in [None, 'L', 'P']:\n bytedata = bytescale(data, high=high, low=low, cmin=cmin, cmax=cmax)\n image = Image.frombytes('L', shape, bytedata.tobytes())\n if pal is not None:\n image.putpalette(asarray(pal, dtype=uint8).tobytes())\n elif mode == 'P':\n pal = arange(0, 256, 1, dtype=uint8)[:, newaxis] * ones((3, ), dtype=uint8)[newaxis, :]\n image.putpalette(asarray(pal, dtype=uint8).tobytes())\n return image\n if mode == '1':\n bytedata = data > high\n image = Image.frombytes('1', shape, bytedata.tobytes())\n return image\n if cmin is None:\n cmin = amin(ravel(data))\n if cmax is None:\n cmax = amax(ravel(data))\n data = (data * 1.0 - cmin) * (high - low) / (cmax - cmin) + low\n if mode == 'I':\n data32 = data.astype(numpy.uint32)\n image = Image.frombytes(mode, shape, data32.tobytes())\n else:\n raise ValueError(_errstr)\n return image\n if channel_axis is None:\n if 3 in shape:\n ca = numpy.flatnonzero(asarray(shape) == 3)[0]\n else:\n ca = numpy.flatnonzero(asarray(shape) == 4)\n if len(ca):\n ca = ca[0]\n else:\n raise ValueError('Could not find channel dimension.')\n else:\n ca = channel_axis\n numch = shape[ca]\n if numch not in [3, 4]:\n raise ValueError('Channel axis dimension is not valid.')\n bytedata = bytescale(data, high=high, low=low, cmin=cmin, cmax=cmax)\n if ca == 2:\n strdata = bytedata.tobytes()\n shape = (shape[1], shape[0])\n elif ca == 1:\n strdata = transpose(bytedata, (0, 2, 1)).tobytes()\n shape = (shape[2], shape[0])\n elif ca == 0:\n strdata = transpose(bytedata, (1, 2, 0)).tobytes()\n shape = (shape[2], shape[1])\n if mode is None:\n if numch == 3:\n mode = 'RGB'\n else:\n mode = 'RGBA'\n if mode not in ['RGB', 'RGBA', 'YCbCr', 'CMYK']:\n raise ValueError(_errstr)\n if mode in ['RGB', 'YCbCr']:\n if numch != 3:\n raise ValueError('Invalid array shape for mode.')\n if mode in ['RGBA', 'CMYK']:\n if numch != 4:\n raise ValueError('Invalid array shape for mode.')\n image = Image.frombytes(mode, shape, strdata)\n return image" }, { @@ -78077,7 +82822,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "config", @@ -78087,13 +82833,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef pytest_ignore_collect(path, config):\n return True" }, { @@ -78111,7 +82858,8 @@ 
"docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "dtype", @@ -78121,7 +82869,8 @@ "docstring": { "type": "dtype, default=np.float64", "description": "The type of feature values. Passed to Numpy array/scipy.sparse matrix\nconstructors as the dtype argument." - } + }, + "refined_type": {} }, { "name": "separator", @@ -78131,7 +82880,8 @@ "docstring": { "type": "str, default=\"=\"", "description": "Separator string used when constructing new features for one-hot\ncoding." - } + }, + "refined_type": {} }, { "name": "sparse", @@ -78141,7 +82891,8 @@ "docstring": { "type": "bool, default=True", "description": "Whether transform should produce scipy.sparse matrices." - } + }, + "refined_type": {} }, { "name": "sort", @@ -78151,13 +82902,14 @@ "docstring": { "type": "bool, default=True", "description": "Whether ``feature_names_`` and ``vocabulary_`` should be\nsorted when fitting." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, *, dtype=np.float64, separator='=', sparse=True, sort=True):\n self.dtype = dtype\n self.separator = separator\n self.sparse = sparse\n self.sort = sort" }, { @@ -78175,7 +82927,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "f", @@ -78185,7 +82938,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "v", @@ -78195,7 +82949,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "feature_names", @@ -78205,7 +82960,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "vocab", @@ -78215,7 +82971,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "fitting", @@ -78225,7 +82982,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "transforming", @@ -78235,7 +82993,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "indices", @@ -78245,7 +83004,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "values", @@ -78255,7 +83015,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -78279,13 +83040,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _more_tags(self):\n return {'X_types': ['dict']}" }, { @@ -78303,7 +83065,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -78313,7 +83076,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "fitting", @@ -78323,13 +83087,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _transform(self, X, fitting):\n assert array('i').itemsize == 4, 'sizeof(int) != 4 on your platform; please report this at https://github.com/scikit-learn/scikit-learn/issues and include the output from platform.platform() in your bug report'\n dtype = self.dtype\n if fitting:\n feature_names = []\n vocab = {}\n else:\n feature_names = self.feature_names_\n vocab = self.vocabulary_\n transforming = True\n X = [X] if isinstance(X, Mapping) else 
X\n indices = array('i')\n indptr = [0]\n values = []\n for x in X:\n for (f, v) in x.items():\n if isinstance(v, str):\n feature_name = '%s%s%s' % (f, self.separator, v)\n v = 1\n elif isinstance(v, Number) or v is None:\n feature_name = f\n elif not isinstance(v, Mapping) and isinstance(v, Iterable):\n feature_name = None\n self._add_iterable_element(f, v, feature_names, vocab, fitting=fitting, transforming=transforming, indices=indices, values=values)\n else:\n raise TypeError(f'Unsupported value Type {type(v)} for {f}: {v}.\\n{type(v)} objects are not supported.')\n if feature_name is not None:\n if fitting and feature_name not in vocab:\n vocab[feature_name] = len(feature_names)\n feature_names.append(feature_name)\n if feature_name in vocab:\n indices.append(vocab[feature_name])\n values.append(self.dtype(v))\n indptr.append(len(indices))\n if len(indptr) == 1:\n raise ValueError('Sample sequence X is empty.')\n indices = np.frombuffer(indices, dtype=np.intc)\n shape = (len(indptr) - 1, len(vocab))\n result_matrix = sp.csr_matrix((values, indices, indptr), shape=shape, dtype=dtype)\n if fitting and self.sort:\n feature_names.sort()\n map_index = np.empty(len(feature_names), dtype=np.int32)\n for (new_val, f) in enumerate(feature_names):\n map_index[new_val] = vocab[f]\n vocab[f] = new_val\n result_matrix = result_matrix[:, map_index]\n if self.sparse:\n result_matrix.sort_indices()\n else:\n result_matrix = result_matrix.toarray()\n if fitting:\n self.feature_names_ = feature_names\n self.vocabulary_ = vocab\n return result_matrix" }, { @@ -78347,7 +83112,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -78357,7 +83123,8 @@ "docstring": { "type": "Mapping or iterable over Mappings", "description": "Dict(s) or Mapping(s) from feature names (arbitrary Python\nobjects) to feature values (strings or convertible to dtype).\n\n.. versionchanged:: 0.24\n Accepts multiple string values for one categorical feature." - } + }, + "refined_type": {} }, { "name": "y", @@ -78367,13 +83134,14 @@ "docstring": { "type": "(ignored)", "description": "Ignored parameter." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Learn a list of feature name -> indices mappings.", - "docstring": "Learn a list of feature name -> indices mappings.\n\nParameters\n----------\nX : Mapping or iterable over Mappings\n Dict(s) or Mapping(s) from feature names (arbitrary Python\n objects) to feature values (strings or convertible to dtype).\n\n .. versionchanged:: 0.24\n Accepts multiple string values for one categorical feature.\n\ny : (ignored)\n Ignored parameter.\n\nReturns\n-------\nself : object\n DictVectorizer class instance.", + "docstring": "Learn a list of feature name -> indices mappings.\n\n Parameters\n ----------\n X : Mapping or iterable over Mappings\n Dict(s) or Mapping(s) from feature names (arbitrary Python\n objects) to feature values (strings or convertible to dtype).\n\n .. versionchanged:: 0.24\n Accepts multiple string values for one categorical feature.\n\n y : (ignored)\n Ignored parameter.\n\n Returns\n -------\n self : object\n DictVectorizer class instance.\n ", "source_code": "\ndef fit(self, X, y=None):\n \"\"\"Learn a list of feature name -> indices mappings.\n\n Parameters\n ----------\n X : Mapping or iterable over Mappings\n Dict(s) or Mapping(s) from feature names (arbitrary Python\n objects) to feature values (strings or convertible to dtype).\n\n .. 
versionchanged:: 0.24\n Accepts multiple string values for one categorical feature.\n\n y : (ignored)\n Ignored parameter.\n\n Returns\n -------\n self : object\n DictVectorizer class instance.\n \"\"\"\n feature_names = []\n vocab = {}\n for x in X:\n for (f, v) in x.items():\n if isinstance(v, str):\n feature_name = '%s%s%s' % (f, self.separator, v)\n v = 1\n elif isinstance(v, Number) or v is None:\n feature_name = f\n elif isinstance(v, Mapping):\n raise TypeError(f'Unsupported value type {type(v)} for {f}: {v}.\\nMapping objects are not supported.')\n elif isinstance(v, Iterable):\n feature_name = None\n self._add_iterable_element(f, v, feature_names, vocab)\n if feature_name is not None:\n if feature_name not in vocab:\n vocab[feature_name] = len(feature_names)\n feature_names.append(feature_name)\n if self.sort:\n feature_names.sort()\n vocab = {f: i for (i, f) in enumerate(feature_names)}\n self.feature_names_ = feature_names\n self.vocabulary_ = vocab\n return self" }, { @@ -78391,7 +83159,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -78401,7 +83170,8 @@ "docstring": { "type": "Mapping or iterable over Mappings", "description": "Dict(s) or Mapping(s) from feature names (arbitrary Python\nobjects) to feature values (strings or convertible to dtype).\n\n.. versionchanged:: 0.24\n Accepts multiple string values for one categorical feature." - } + }, + "refined_type": {} }, { "name": "y", @@ -78411,13 +83181,14 @@ "docstring": { "type": "(ignored)", "description": "Ignored parameter." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Learn a list of feature name -> indices mappings and transform X.\n\nLike fit(X) followed by transform(X), but does not require materializing X in memory.", - "docstring": "Learn a list of feature name -> indices mappings and transform X.\n\nLike fit(X) followed by transform(X), but does not require\nmaterializing X in memory.\n\nParameters\n----------\nX : Mapping or iterable over Mappings\n Dict(s) or Mapping(s) from feature names (arbitrary Python\n objects) to feature values (strings or convertible to dtype).\n\n .. versionchanged:: 0.24\n Accepts multiple string values for one categorical feature.\n\ny : (ignored)\n Ignored parameter.\n\nReturns\n-------\nXa : {array, sparse matrix}\n Feature vectors; always 2-d.", + "description": "Learn a list of feature name -> indices mappings and transform X.\n\nLike fit(X) followed by transform(X), but does not require\nmaterializing X in memory.", + "docstring": "Learn a list of feature name -> indices mappings and transform X.\n\n Like fit(X) followed by transform(X), but does not require\n materializing X in memory.\n\n Parameters\n ----------\n X : Mapping or iterable over Mappings\n Dict(s) or Mapping(s) from feature names (arbitrary Python\n objects) to feature values (strings or convertible to dtype).\n\n .. 
versionchanged:: 0.24\n Accepts multiple string values for one categorical feature.\n\n y : (ignored)\n Ignored parameter.\n\n Returns\n -------\n Xa : {array, sparse matrix}\n Feature vectors; always 2-d.\n ", "source_code": "\ndef fit_transform(self, X, y=None):\n \"\"\"Learn a list of feature name -> indices mappings and transform X.\n\n Like fit(X) followed by transform(X), but does not require\n materializing X in memory.\n\n Parameters\n ----------\n X : Mapping or iterable over Mappings\n Dict(s) or Mapping(s) from feature names (arbitrary Python\n objects) to feature values (strings or convertible to dtype).\n\n .. versionchanged:: 0.24\n Accepts multiple string values for one categorical feature.\n\n y : (ignored)\n Ignored parameter.\n\n Returns\n -------\n Xa : {array, sparse matrix}\n Feature vectors; always 2-d.\n \"\"\"\n return self._transform(X, fitting=True)" }, { @@ -78437,13 +83208,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Return a list of feature names, ordered by their indices.\n\nIf one-of-K coding is applied to categorical features, this will include the constructed feature names but not the original ones.", - "docstring": "Return a list of feature names, ordered by their indices.\n\nIf one-of-K coding is applied to categorical features, this will\ninclude the constructed feature names but not the original ones.\n\nReturns\n-------\nfeature_names_ : list of length (n_features,)\n List containing the feature names (e.g., \"f=ham\" and \"f=spam\").", + "description": "Return a list of feature names, ordered by their indices.\n\nIf one-of-K coding is applied to categorical features, this will\ninclude the constructed feature names but not the original ones.", + "docstring": "Return a list of feature names, ordered by their indices.\n\n If one-of-K coding is applied to categorical features, this will\n include the constructed feature names but not the original ones.\n\n Returns\n -------\n feature_names_ : list of length (n_features,)\n List containing the feature names (e.g., \"f=ham\" and \"f=spam\").\n ", "source_code": "\n@deprecated('get_feature_names is deprecated in 1.0 and will be removed in 1.2. Please use get_feature_names_out instead.')\ndef get_feature_names(self):\n \"\"\"Return a list of feature names, ordered by their indices.\n\n If one-of-K coding is applied to categorical features, this will\n include the constructed feature names but not the original ones.\n\n Returns\n -------\n feature_names_ : list of length (n_features,)\n List containing the feature names (e.g., \"f=ham\" and \"f=spam\").\n \"\"\"\n return self.feature_names_" }, { @@ -78461,7 +83233,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "input_features", @@ -78471,13 +83244,14 @@ "docstring": { "type": "array-like of str or None, default=None", "description": "Not used, present here for API consistency by convention." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Get output feature names for transformation.", - "docstring": "Get output feature names for transformation.\n\nParameters\n----------\ninput_features : array-like of str or None, default=None\n Not used, present here for API consistency by convention.\n\nReturns\n-------\nfeature_names_out : ndarray of str objects\n Transformed feature names.", + "docstring": "Get output feature names for transformation.\n\n Parameters\n ----------\n input_features : array-like of str or None, default=None\n Not used, present here for API consistency by convention.\n\n Returns\n -------\n feature_names_out : ndarray of str objects\n Transformed feature names.\n ", "source_code": "\ndef get_feature_names_out(self, input_features=None):\n \"\"\"Get output feature names for transformation.\n\n Parameters\n ----------\n input_features : array-like of str or None, default=None\n Not used, present here for API consistency by convention.\n\n Returns\n -------\n feature_names_out : ndarray of str objects\n Transformed feature names.\n \"\"\"\n if any((not isinstance(name, str) for name in self.feature_names_)):\n feature_names = [str(name) for name in self.feature_names_]\n else:\n feature_names = self.feature_names_\n return np.asarray(feature_names, dtype=object)" }, { @@ -78495,7 +83269,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -78505,6 +83280,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "Sample matrix." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -78515,13 +83294,14 @@ "docstring": { "type": "type, default=dict", "description": "Constructor for feature mappings. Must conform to the\ncollections.Mapping API." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Transform array or sparse matrix X back to feature mappings.\n\nX must have been produced by this DictVectorizer's transform or fit_transform method; it may only have passed through transformers that preserve the number of features and their order. In the case of one-hot/one-of-K coding, the constructed feature names and values are returned rather than the original ones.", - "docstring": "Transform array or sparse matrix X back to feature mappings.\n\nX must have been produced by this DictVectorizer's transform or\nfit_transform method; it may only have passed through transformers\nthat preserve the number of features and their order.\n\nIn the case of one-hot/one-of-K coding, the constructed feature\nnames and values are returned rather than the original ones.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n Sample matrix.\ndict_type : type, default=dict\n Constructor for feature mappings. 
Must conform to the\n collections.Mapping API.\n\nReturns\n-------\nD : list of dict_type objects of shape (n_samples,)\n Feature mappings for the samples in X.", + "description": "Transform array or sparse matrix X back to feature mappings.\n\nX must have been produced by this DictVectorizer's transform or\nfit_transform method; it may only have passed through transformers\nthat preserve the number of features and their order.\n\nIn the case of one-hot/one-of-K coding, the constructed feature\nnames and values are returned rather than the original ones.", + "docstring": "Transform array or sparse matrix X back to feature mappings.\n\n X must have been produced by this DictVectorizer's transform or\n fit_transform method; it may only have passed through transformers\n that preserve the number of features and their order.\n\n In the case of one-hot/one-of-K coding, the constructed feature\n names and values are returned rather than the original ones.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Sample matrix.\n dict_type : type, default=dict\n Constructor for feature mappings. Must conform to the\n collections.Mapping API.\n\n Returns\n -------\n D : list of dict_type objects of shape (n_samples,)\n Feature mappings for the samples in X.\n ", "source_code": "\ndef inverse_transform(self, X, dict_type=dict):\n \"\"\"Transform array or sparse matrix X back to feature mappings.\n\n X must have been produced by this DictVectorizer's transform or\n fit_transform method; it may only have passed through transformers\n that preserve the number of features and their order.\n\n In the case of one-hot/one-of-K coding, the constructed feature\n names and values are returned rather than the original ones.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Sample matrix.\n dict_type : type, default=dict\n Constructor for feature mappings. Must conform to the\n collections.Mapping API.\n\n Returns\n -------\n D : list of dict_type objects of shape (n_samples,)\n Feature mappings for the samples in X.\n \"\"\"\n X = check_array(X, accept_sparse=['csr', 'csc'])\n n_samples = X.shape[0]\n names = self.feature_names_\n dicts = [dict_type() for _ in range(n_samples)]\n if sp.issparse(X):\n for (i, j) in zip(*X.nonzero()):\n dicts[i][names[j]] = X[i, j]\n else:\n for (i, d) in enumerate(dicts):\n for (j, v) in enumerate(X[i, :]):\n if v != 0:\n d[names[j]] = X[i, j]\n return dicts" }, { @@ -78539,7 +83319,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "support", @@ -78549,7 +83330,8 @@ "docstring": { "type": "array-like", "description": "Boolean mask or list of indices (as returned by the get_support\nmember of feature selectors)." - } + }, + "refined_type": {} }, { "name": "indices", @@ -78559,13 +83341,14 @@ "docstring": { "type": "bool, default=False", "description": "Whether support is a list of indices." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Restrict the features to those in support using feature selection.\n\nThis function modifies the estimator in-place.", - "docstring": "Restrict the features to those in support using feature selection.\n\nThis function modifies the estimator in-place.\n\nParameters\n----------\nsupport : array-like\n Boolean mask or list of indices (as returned by the get_support\n member of feature selectors).\nindices : bool, default=False\n Whether support is a list of indices.\n\nReturns\n-------\nself : object\n DictVectorizer class instance.\n\nExamples\n--------\n>>> from sklearn.feature_extraction import DictVectorizer\n>>> from sklearn.feature_selection import SelectKBest, chi2\n>>> v = DictVectorizer()\n>>> D = [{'foo': 1, 'bar': 2}, {'foo': 3, 'baz': 1}]\n>>> X = v.fit_transform(D)\n>>> support = SelectKBest(chi2, k=2).fit(X, [0, 1])\n>>> v.get_feature_names_out()\narray(['bar', 'baz', 'foo'], ...)\n>>> v.restrict(support.get_support())\nDictVectorizer()\n>>> v.get_feature_names_out()\narray(['bar', 'foo'], ...)", + "docstring": "Restrict the features to those in support using feature selection.\n\n This function modifies the estimator in-place.\n\n Parameters\n ----------\n support : array-like\n Boolean mask or list of indices (as returned by the get_support\n member of feature selectors).\n indices : bool, default=False\n Whether support is a list of indices.\n\n Returns\n -------\n self : object\n DictVectorizer class instance.\n\n Examples\n --------\n >>> from sklearn.feature_extraction import DictVectorizer\n >>> from sklearn.feature_selection import SelectKBest, chi2\n >>> v = DictVectorizer()\n >>> D = [{'foo': 1, 'bar': 2}, {'foo': 3, 'baz': 1}]\n >>> X = v.fit_transform(D)\n >>> support = SelectKBest(chi2, k=2).fit(X, [0, 1])\n >>> v.get_feature_names_out()\n array(['bar', 'baz', 'foo'], ...)\n >>> v.restrict(support.get_support())\n DictVectorizer()\n >>> v.get_feature_names_out()\n array(['bar', 'foo'], ...)\n ", "source_code": "\ndef restrict(self, support, indices=False):\n \"\"\"Restrict the features to those in support using feature selection.\n\n This function modifies the estimator in-place.\n\n Parameters\n ----------\n support : array-like\n Boolean mask or list of indices (as returned by the get_support\n member of feature selectors).\n indices : bool, default=False\n Whether support is a list of indices.\n\n Returns\n -------\n self : object\n DictVectorizer class instance.\n\n Examples\n --------\n >>> from sklearn.feature_extraction import DictVectorizer\n >>> from sklearn.feature_selection import SelectKBest, chi2\n >>> v = DictVectorizer()\n >>> D = [{'foo': 1, 'bar': 2}, {'foo': 3, 'baz': 1}]\n >>> X = v.fit_transform(D)\n >>> support = SelectKBest(chi2, k=2).fit(X, [0, 1])\n >>> v.get_feature_names_out()\n array(['bar', 'baz', 'foo'], ...)\n >>> v.restrict(support.get_support())\n DictVectorizer()\n >>> v.get_feature_names_out()\n array(['bar', 'foo'], ...)\n \"\"\"\n if not indices:\n support = np.where(support)[0]\n names = self.feature_names_\n new_vocab = {}\n for i in support:\n new_vocab[names[i]] = len(new_vocab)\n self.vocabulary_ = new_vocab\n self.feature_names_ = [f for (f, i) in sorted(new_vocab.items(), key=itemgetter(1))]\n return self" }, { @@ -78583,7 +83366,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -78593,13 +83377,14 @@ "docstring": { "type": "Mapping or iterable over Mappings of shape (n_samples,)", 
"description": "Dict(s) or Mapping(s) from feature names (arbitrary Python\nobjects) to feature values (strings or convertible to dtype)." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Transform feature->value dicts to array or sparse matrix.\n\nNamed features not encountered during fit or fit_transform will be silently ignored.", - "docstring": "Transform feature->value dicts to array or sparse matrix.\n\nNamed features not encountered during fit or fit_transform will be\nsilently ignored.\n\nParameters\n----------\nX : Mapping or iterable over Mappings of shape (n_samples,)\n Dict(s) or Mapping(s) from feature names (arbitrary Python\n objects) to feature values (strings or convertible to dtype).\n\nReturns\n-------\nXa : {array, sparse matrix}\n Feature vectors; always 2-d.", + "description": "Transform feature->value dicts to array or sparse matrix.\n\nNamed features not encountered during fit or fit_transform will be\nsilently ignored.", + "docstring": "Transform feature->value dicts to array or sparse matrix.\n\n Named features not encountered during fit or fit_transform will be\n silently ignored.\n\n Parameters\n ----------\n X : Mapping or iterable over Mappings of shape (n_samples,)\n Dict(s) or Mapping(s) from feature names (arbitrary Python\n objects) to feature values (strings or convertible to dtype).\n\n Returns\n -------\n Xa : {array, sparse matrix}\n Feature vectors; always 2-d.\n ", "source_code": "\ndef transform(self, X):\n \"\"\"Transform feature->value dicts to array or sparse matrix.\n\n Named features not encountered during fit or fit_transform will be\n silently ignored.\n\n Parameters\n ----------\n X : Mapping or iterable over Mappings of shape (n_samples,)\n Dict(s) or Mapping(s) from feature names (arbitrary Python\n objects) to feature values (strings or convertible to dtype).\n\n Returns\n -------\n Xa : {array, sparse matrix}\n Feature vectors; always 2-d.\n \"\"\"\n return self._transform(X, fitting=False)" }, { @@ -78617,7 +83402,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -78641,7 +83427,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_features", @@ -78651,7 +83438,8 @@ "docstring": { "type": "int, default=2**20", "description": "The number of features (columns) in the output matrices. Small numbers\nof features are likely to cause hash collisions, but large numbers\nwill cause larger coefficient dimensions in linear learners." - } + }, + "refined_type": {} }, { "name": "input_type", @@ -78661,6 +83449,10 @@ "docstring": { "type": "str, default='dict'", "description": "Choose a string from {'dict', 'pair', 'string'}.\nEither \"dict\" (the default) to accept dictionaries over\n(feature_name, value); \"pair\" to accept pairs of (feature_name, value);\nor \"string\" to accept single strings.\nfeature_name should be a string, while value should be a number.\nIn the case of \"string\", a value of 1 is implied.\nThe feature_name is hashed to find the appropriate column for the\nfeature. The value's sign might be flipped in the output (but see\nnon_negative, below)." + }, + "refined_type": { + "kind": "EnumType", + "values": ["string", "pair", "dict"] } }, { @@ -78671,7 +83463,8 @@ "docstring": { "type": "numpy dtype, default=np.float64", "description": "The type of feature values. Passed to scipy.sparse matrix constructors\nas the dtype argument. Do not set this to bool, np.boolean or any\nunsigned integer type." 
- } + }, + "refined_type": {} }, { "name": "alternate_sign", @@ -78681,13 +83474,14 @@ "docstring": { "type": "bool, default=True", "description": "When True, an alternating sign is added to the features as to\napproximately conserve the inner product in the hashed space even for\nsmall n_features. This approach is similar to sparse random projection.\n\n.. versionchanged:: 0.19\n ``alternate_sign`` replaces the now deprecated ``non_negative``\n parameter." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, n_features=2**20, *, input_type='dict', dtype=np.float64, alternate_sign=True):\n self._validate_params(n_features, input_type)\n self.dtype = dtype\n self.input_type = input_type\n self.n_features = n_features\n self.alternate_sign = alternate_sign" }, { @@ -78705,13 +83499,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _more_tags(self):\n return {'X_types': [self.input_type]}" }, { @@ -78729,7 +83524,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "input_type", @@ -78739,13 +83535,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@staticmethod\ndef _validate_params(n_features, input_type):\n if not isinstance(n_features, numbers.Integral):\n raise TypeError('n_features must be integral, got %r (%s).' % (n_features, type(n_features)))\n elif n_features < 1 or n_features >= np.iinfo(np.int32).max + 1:\n raise ValueError('Invalid number of features (%d).' % n_features)\n if input_type not in ('dict', 'pair', 'string'):\n raise ValueError(\"input_type must be 'dict', 'pair' or 'string', got %r.\" % input_type)" }, { @@ -78763,7 +83560,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -78773,7 +83571,8 @@ "docstring": { "type": "Ignored", "description": "Not used, present here for API consistency by convention." - } + }, + "refined_type": {} }, { "name": "y", @@ -78783,13 +83582,14 @@ "docstring": { "type": "Ignored", "description": "Not used, present here for API consistency by convention." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "No-op.\n\nThis method doesn't do anything. It exists purely for compatibility with the scikit-learn transformer API.", - "docstring": "No-op.\n\nThis method doesn't do anything. It exists purely for compatibility\nwith the scikit-learn transformer API.\n\nParameters\n----------\nX : Ignored\n Not used, present here for API consistency by convention.\n\ny : Ignored\n Not used, present here for API consistency by convention.\n\nReturns\n-------\nself : object\n FeatureHasher class instance.", + "description": "No-op.\n\nThis method doesn't do anything. It exists purely for compatibility\nwith the scikit-learn transformer API.", + "docstring": "No-op.\n\n This method doesn't do anything. 
It exists purely for compatibility\n with the scikit-learn transformer API.\n\n Parameters\n ----------\n X : Ignored\n Not used, present here for API consistency by convention.\n\n y : Ignored\n Not used, present here for API consistency by convention.\n\n Returns\n -------\n self : object\n FeatureHasher class instance.\n ", "source_code": "\ndef fit(self, X=None, y=None):\n \"\"\"No-op.\n\n This method doesn't do anything. It exists purely for compatibility\n with the scikit-learn transformer API.\n\n Parameters\n ----------\n X : Ignored\n Not used, present here for API consistency by convention.\n\n y : Ignored\n Not used, present here for API consistency by convention.\n\n Returns\n -------\n self : object\n FeatureHasher class instance.\n \"\"\"\n self._validate_params(self.n_features, self.input_type)\n return self" }, { @@ -78807,7 +83607,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "raw_X", @@ -78817,13 +83618,14 @@ "docstring": { "type": "iterable over iterable over raw features, length = n_samples", "description": "Samples. Each sample must be iterable an (e.g., a list or tuple)\ncontaining/generating feature names (and optionally values, see\nthe input_type constructor argument) which will be hashed.\nraw_X need not support the len function, so it can be the result\nof a generator; n_samples is determined on the fly." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Transform a sequence of instances to a scipy.sparse matrix.", - "docstring": "Transform a sequence of instances to a scipy.sparse matrix.\n\nParameters\n----------\nraw_X : iterable over iterable over raw features, length = n_samples\n Samples. Each sample must be iterable an (e.g., a list or tuple)\n containing/generating feature names (and optionally values, see\n the input_type constructor argument) which will be hashed.\n raw_X need not support the len function, so it can be the result\n of a generator; n_samples is determined on the fly.\n\nReturns\n-------\nX : sparse matrix of shape (n_samples, n_features)\n Feature matrix, for use with estimators or further transformers.", + "docstring": "Transform a sequence of instances to a scipy.sparse matrix.\n\n Parameters\n ----------\n raw_X : iterable over iterable over raw features, length = n_samples\n Samples. Each sample must be iterable an (e.g., a list or tuple)\n containing/generating feature names (and optionally values, see\n the input_type constructor argument) which will be hashed.\n raw_X need not support the len function, so it can be the result\n of a generator; n_samples is determined on the fly.\n\n Returns\n -------\n X : sparse matrix of shape (n_samples, n_features)\n Feature matrix, for use with estimators or further transformers.\n ", "source_code": "\ndef transform(self, raw_X):\n \"\"\"Transform a sequence of instances to a scipy.sparse matrix.\n\n Parameters\n ----------\n raw_X : iterable over iterable over raw features, length = n_samples\n Samples. 
Each sample must be iterable an (e.g., a list or tuple)\n containing/generating feature names (and optionally values, see\n the input_type constructor argument) which will be hashed.\n raw_X need not support the len function, so it can be the result\n of a generator; n_samples is determined on the fly.\n\n Returns\n -------\n X : sparse matrix of shape (n_samples, n_features)\n Feature matrix, for use with estimators or further transformers.\n \"\"\"\n raw_X = iter(raw_X)\n if self.input_type == 'dict':\n raw_X = (_iteritems(d) for d in raw_X)\n elif self.input_type == 'string':\n raw_X = (((f, 1) for f in x) for x in raw_X)\n (indices, indptr, values) = _hashing_transform(raw_X, self.n_features, self.dtype, self.alternate_sign, seed=0)\n n_samples = indptr.shape[0] - 1\n if n_samples == 0:\n raise ValueError('Cannot vectorize empty sequence.')\n X = sp.csr_matrix((values, indices, indptr), dtype=self.dtype, shape=(n_samples, self.n_features))\n X.sum_duplicates()\n return X" }, { @@ -78836,7 +83638,7 @@ "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _hashing_transform(*args, **kwargs):\n raise NotImplementedError('FeatureHasher is not compatible with PyPy (see https://github.com/scikit-learn/scikit-learn/issues/11540 for the status updates).')" }, { @@ -78854,7 +83656,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -78878,7 +83681,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "patch_size", @@ -78888,7 +83692,8 @@ "docstring": { "type": "tuple of int (patch_height, patch_width), default=None", "description": "The dimensions of one patch." - } + }, + "refined_type": {} }, { "name": "max_patches", @@ -78898,7 +83703,8 @@ "docstring": { "type": "int or float, default=None", "description": "The maximum number of patches per image to extract. If `max_patches` is\na float in (0, 1), it is taken to mean a proportion of the total number\nof patches." - } + }, + "refined_type": {} }, { "name": "random_state", @@ -78908,13 +83714,14 @@ "docstring": { "type": "int, RandomState instance, default=None", "description": "Determines the random number generator used for random sampling when\n`max_patches is not None`. Use an int to make the randomness\ndeterministic.\nSee :term:`Glossary `." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, *, patch_size=None, max_patches=None, random_state=None):\n self.patch_size = patch_size\n self.max_patches = max_patches\n self.random_state = random_state" }, { @@ -78932,13 +83739,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _more_tags(self):\n return {'X_types': ['3darray']}" }, { @@ -78956,7 +83764,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -78966,7 +83775,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "Training data." - } + }, + "refined_type": {} }, { "name": "y", @@ -78976,13 +83786,14 @@ "docstring": { "type": "Ignored", "description": "Not used, present for API consistency by convention." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Do nothing and return the estimator unchanged.\n\nThis method is just there to implement the usual API and hence work in pipelines.", - "docstring": "Do nothing and return the estimator unchanged.\n\nThis method is just there to implement the usual API and hence\nwork in pipelines.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n Training data.\n\ny : Ignored\n Not used, present for API consistency by convention.\n\nReturns\n-------\nself : object\n Returns the instance itself.", + "description": "Do nothing and return the estimator unchanged.\n\nThis method is just there to implement the usual API and hence\nwork in pipelines.", + "docstring": "Do nothing and return the estimator unchanged.\n\n This method is just there to implement the usual API and hence\n work in pipelines.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training data.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n Returns\n -------\n self : object\n Returns the instance itself.\n ", "source_code": "\ndef fit(self, X, y=None):\n \"\"\"Do nothing and return the estimator unchanged.\n\n This method is just there to implement the usual API and hence\n work in pipelines.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training data.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n Returns\n -------\n self : object\n Returns the instance itself.\n \"\"\"\n return self" }, { @@ -79000,7 +83811,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -79010,13 +83822,14 @@ "docstring": { "type": "ndarray of shape (n_samples, image_height, image_width) or (n_samples, image_height, image_width, n_channels)", "description": "Array of images from which to extract patches. For color images,\nthe last dimension specifies the channel: a RGB image would have\n`n_channels=3`." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Transform the image samples in `X` into a matrix of patch data.", - "docstring": "Transform the image samples in `X` into a matrix of patch data.\n\nParameters\n----------\nX : ndarray of shape (n_samples, image_height, image_width) or (n_samples, image_height, image_width, n_channels)\n Array of images from which to extract patches. For color images,\n the last dimension specifies the channel: a RGB image would have\n `n_channels=3`.\n\nReturns\n-------\npatches : array of shape (n_patches, patch_height, patch_width) or (n_patches, patch_height, patch_width, n_channels)\n The collection of patches extracted from the images, where\n `n_patches` is either `n_samples * max_patches` or the total\n number of patches that can be extracted.", + "docstring": "Transform the image samples in `X` into a matrix of patch data.\n\n Parameters\n ----------\n X : ndarray of shape (n_samples, image_height, image_width) or (n_samples, image_height, image_width, n_channels)\n Array of images from which to extract patches. 
For color images,\n the last dimension specifies the channel: a RGB image would have\n `n_channels=3`.\n\n Returns\n -------\n patches : array of shape (n_patches, patch_height, patch_width) or (n_patches, patch_height, patch_width, n_channels)\n The collection of patches extracted from the images, where\n `n_patches` is either `n_samples * max_patches` or the total\n number of patches that can be extracted.\n ", "source_code": "\ndef transform(self, X):\n \"\"\"Transform the image samples in `X` into a matrix of patch data.\n\n Parameters\n ----------\n X : ndarray of shape (n_samples, image_height, image_width) or (n_samples, image_height, image_width, n_channels)\n Array of images from which to extract patches. For color images,\n the last dimension specifies the channel: a RGB image would have\n `n_channels=3`.\n\n Returns\n -------\n patches : array of shape (n_patches, patch_height, patch_width) or (n_patches, patch_height, patch_width, n_channels)\n The collection of patches extracted from the images, where\n `n_patches` is either `n_samples * max_patches` or the total\n number of patches that can be extracted.\n \"\"\"\n self.random_state = check_random_state(self.random_state)\n (n_images, i_h, i_w) = X.shape[:3]\n X = np.reshape(X, (n_images, i_h, i_w, -1))\n n_channels = X.shape[-1]\n if self.patch_size is None:\n patch_size = (i_h // 10, i_w // 10)\n else:\n patch_size = self.patch_size\n (p_h, p_w) = patch_size\n n_patches = _compute_n_patches(i_h, i_w, p_h, p_w, self.max_patches)\n patches_shape = (n_images * n_patches, ) + patch_size\n if n_channels > 1:\n patches_shape += (n_channels, )\n patches = np.empty(patches_shape)\n for (ii, image) in enumerate(X):\n patches[ii * n_patches:(ii + 1) * n_patches] = extract_patches_2d(image, patch_size, max_patches=self.max_patches, random_state=self.random_state)\n return patches" }, { @@ -79034,7 +83847,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "img", @@ -79044,13 +83858,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _compute_gradient_3d(edges, img):\n (_, n_y, n_z) = img.shape\n gradient = np.abs(img[edges[0] // (n_y * n_z), edges[0] % (n_y * n_z) // n_z, edges[0] % (n_y * n_z) % n_z] - img[edges[1] // (n_y * n_z), edges[1] % (n_y * n_z) // n_z, edges[1] % (n_y * n_z) % n_z])\n return gradient" }, { @@ -79068,7 +83883,8 @@ "docstring": { "type": "int", "description": "The image height" - } + }, + "refined_type": {} }, { "name": "i_w", @@ -79078,7 +83894,8 @@ "docstring": { "type": "int", "description": "The image with" - } + }, + "refined_type": {} }, { "name": "p_h", @@ -79088,7 +83905,8 @@ "docstring": { "type": "int", "description": "The height of a patch" - } + }, + "refined_type": {} }, { "name": "p_w", @@ -79098,7 +83916,8 @@ "docstring": { "type": "int", "description": "The width of a patch" - } + }, + "refined_type": {} }, { "name": "max_patches", @@ -79108,13 +83927,14 @@ "docstring": { "type": "int or float, default=None", "description": "The maximum number of patches to extract. If max_patches is a float\nbetween 0 and 1, it is taken to be a proportion of the total number\nof patches." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Compute the number of patches that will be extracted in an image.\n\nRead more in the :ref:`User Guide `.", - "docstring": "Compute the number of patches that will be extracted in an image.\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\ni_h : int\n The image height\ni_w : int\n The image with\np_h : int\n The height of a patch\np_w : int\n The width of a patch\nmax_patches : int or float, default=None\n The maximum number of patches to extract. If max_patches is a float\n between 0 and 1, it is taken to be a proportion of the total number\n of patches.", + "docstring": "Compute the number of patches that will be extracted in an image.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n i_h : int\n The image height\n i_w : int\n The image with\n p_h : int\n The height of a patch\n p_w : int\n The width of a patch\n max_patches : int or float, default=None\n The maximum number of patches to extract. If max_patches is a float\n between 0 and 1, it is taken to be a proportion of the total number\n of patches.\n ", "source_code": "\ndef _compute_n_patches(i_h, i_w, p_h, p_w, max_patches=None):\n \"\"\"Compute the number of patches that will be extracted in an image.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n i_h : int\n The image height\n i_w : int\n The image with\n p_h : int\n The height of a patch\n p_w : int\n The width of a patch\n max_patches : int or float, default=None\n The maximum number of patches to extract. If max_patches is a float\n between 0 and 1, it is taken to be a proportion of the total number\n of patches.\n \"\"\"\n n_h = i_h - p_h + 1\n n_w = i_w - p_w + 1\n all_patches = n_h * n_w\n if max_patches:\n if isinstance(max_patches, numbers.Integral) and max_patches < all_patches:\n return max_patches\n elif isinstance(max_patches, numbers.Integral) and max_patches >= all_patches:\n return all_patches\n elif isinstance(max_patches, numbers.Real) and 0 < max_patches < 1:\n return int(max_patches * all_patches)\n else:\n raise ValueError('Invalid value for max_patches: %r' % max_patches)\n else:\n return all_patches" }, { @@ -79132,7 +83952,8 @@ "docstring": { "type": "ndarray", "description": "n-dimensional array of which patches are to be extracted" - } + }, + "refined_type": {} }, { "name": "patch_shape", @@ -79142,7 +83963,8 @@ "docstring": { "type": "int or tuple of length arr.ndim.default=8", "description": "Indicates the shape of the patches to be extracted. If an\ninteger is given, the shape will be a hypercube of\nsidelength given by its value." - } + }, + "refined_type": {} }, { "name": "extraction_step", @@ -79152,13 +83974,14 @@ "docstring": { "type": "int or tuple of length arr.ndim, default=1", "description": "Indicates step size at which extraction shall be performed.\nIf integer is given, then the step is uniform in all dimensions." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Extracts patches of any n-dimensional array in place using strides.\n\nGiven an n-dimensional array it will return a 2n-dimensional array with the first n dimensions indexing patch position and the last n indexing the patch content. This operation is immediate (O(1)). A reshape performed on the first n dimensions will cause numpy to copy data, leading to a list of extracted patches. 
Read more in the :ref:`User Guide `.", - "docstring": "Extracts patches of any n-dimensional array in place using strides.\n\nGiven an n-dimensional array it will return a 2n-dimensional array with\nthe first n dimensions indexing patch position and the last n indexing\nthe patch content. This operation is immediate (O(1)). A reshape\nperformed on the first n dimensions will cause numpy to copy data, leading\nto a list of extracted patches.\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\narr : ndarray\n n-dimensional array of which patches are to be extracted\n\npatch_shape : int or tuple of length arr.ndim.default=8\n Indicates the shape of the patches to be extracted. If an\n integer is given, the shape will be a hypercube of\n sidelength given by its value.\n\nextraction_step : int or tuple of length arr.ndim, default=1\n Indicates step size at which extraction shall be performed.\n If integer is given, then the step is uniform in all dimensions.\n\n\nReturns\n-------\npatches : strided ndarray\n 2n-dimensional array indexing patches on first n dimensions and\n containing patches on the last n dimensions. These dimensions\n are fake, but this way no data is copied. A simple reshape invokes\n a copying operation to obtain a list of patches:\n result.reshape([-1] + list(patch_shape))", + "description": "Extracts patches of any n-dimensional array in place using strides.\n\nGiven an n-dimensional array it will return a 2n-dimensional array with\nthe first n dimensions indexing patch position and the last n indexing\nthe patch content. This operation is immediate (O(1)). A reshape\nperformed on the first n dimensions will cause numpy to copy data, leading\nto a list of extracted patches.\n\nRead more in the :ref:`User Guide `.", + "docstring": "Extracts patches of any n-dimensional array in place using strides.\n\n Given an n-dimensional array it will return a 2n-dimensional array with\n the first n dimensions indexing patch position and the last n indexing\n the patch content. This operation is immediate (O(1)). A reshape\n performed on the first n dimensions will cause numpy to copy data, leading\n to a list of extracted patches.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n arr : ndarray\n n-dimensional array of which patches are to be extracted\n\n patch_shape : int or tuple of length arr.ndim.default=8\n Indicates the shape of the patches to be extracted. If an\n integer is given, the shape will be a hypercube of\n sidelength given by its value.\n\n extraction_step : int or tuple of length arr.ndim, default=1\n Indicates step size at which extraction shall be performed.\n If integer is given, then the step is uniform in all dimensions.\n\n\n Returns\n -------\n patches : strided ndarray\n 2n-dimensional array indexing patches on first n dimensions and\n containing patches on the last n dimensions. These dimensions\n are fake, but this way no data is copied. A simple reshape invokes\n a copying operation to obtain a list of patches:\n result.reshape([-1] + list(patch_shape))\n ", "source_code": "\ndef _extract_patches(arr, patch_shape=8, extraction_step=1):\n \"\"\"Extracts patches of any n-dimensional array in place using strides.\n\n Given an n-dimensional array it will return a 2n-dimensional array with\n the first n dimensions indexing patch position and the last n indexing\n the patch content. This operation is immediate (O(1)). 
A reshape\n performed on the first n dimensions will cause numpy to copy data, leading\n to a list of extracted patches.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n arr : ndarray\n n-dimensional array of which patches are to be extracted\n\n patch_shape : int or tuple of length arr.ndim.default=8\n Indicates the shape of the patches to be extracted. If an\n integer is given, the shape will be a hypercube of\n sidelength given by its value.\n\n extraction_step : int or tuple of length arr.ndim, default=1\n Indicates step size at which extraction shall be performed.\n If integer is given, then the step is uniform in all dimensions.\n\n\n Returns\n -------\n patches : strided ndarray\n 2n-dimensional array indexing patches on first n dimensions and\n containing patches on the last n dimensions. These dimensions\n are fake, but this way no data is copied. A simple reshape invokes\n a copying operation to obtain a list of patches:\n result.reshape([-1] + list(patch_shape))\n \"\"\"\n arr_ndim = arr.ndim\n if isinstance(patch_shape, numbers.Number):\n patch_shape = tuple([patch_shape] * arr_ndim)\n if isinstance(extraction_step, numbers.Number):\n extraction_step = tuple([extraction_step] * arr_ndim)\n patch_strides = arr.strides\n slices = tuple((slice(None, None, st) for st in extraction_step))\n indexing_strides = arr[slices].strides\n patch_indices_shape = (np.array(arr.shape) - np.array(patch_shape)) // np.array(extraction_step) + 1\n shape = tuple(list(patch_indices_shape) + list(patch_shape))\n strides = tuple(list(indexing_strides) + list(patch_strides))\n patches = as_strided(arr, shape=shape, strides=strides)\n return patches" }, { @@ -79176,7 +83999,8 @@ "docstring": { "type": "int", "description": "The size of the grid in the x direction." - } + }, + "refined_type": {} }, { "name": "n_y", @@ -79186,7 +84010,8 @@ "docstring": { "type": "int", "description": "The size of the grid in the y direction." 
- } + }, + "refined_type": {} }, { "name": "n_z", @@ -79196,13 +84021,14 @@ "docstring": { "type": "integer, default=1", "description": "The size of the grid in the z direction, defaults to 1" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Returns a list of edges for a 3D image.", - "docstring": "Returns a list of edges for a 3D image.\n\nParameters\n----------\nn_x : int\n The size of the grid in the x direction.\nn_y : int\n The size of the grid in the y direction.\nn_z : integer, default=1\n The size of the grid in the z direction, defaults to 1", + "docstring": "Returns a list of edges for a 3D image.\n\n Parameters\n ----------\n n_x : int\n The size of the grid in the x direction.\n n_y : int\n The size of the grid in the y direction.\n n_z : integer, default=1\n The size of the grid in the z direction, defaults to 1\n ", "source_code": "\ndef _make_edges_3d(n_x, n_y, n_z=1):\n \"\"\"Returns a list of edges for a 3D image.\n\n Parameters\n ----------\n n_x : int\n The size of the grid in the x direction.\n n_y : int\n The size of the grid in the y direction.\n n_z : integer, default=1\n The size of the grid in the z direction, defaults to 1\n \"\"\"\n vertices = np.arange(n_x * n_y * n_z).reshape((n_x, n_y, n_z))\n edges_deep = np.vstack((vertices[:, :, :-1].ravel(), vertices[:, :, 1:].ravel()))\n edges_right = np.vstack((vertices[:, :-1].ravel(), vertices[:, 1:].ravel()))\n edges_down = np.vstack((vertices[:-1].ravel(), vertices[1:].ravel()))\n edges = np.hstack((edges_deep, edges_right, edges_down))\n return edges" }, { @@ -79220,7 +84046,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "edges", @@ -79230,7 +84057,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "weights", @@ -79240,7 +84068,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -79264,7 +84093,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_y", @@ -79274,7 +84104,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_z", @@ -79284,7 +84115,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "mask", @@ -79294,7 +84126,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "img", @@ -79304,7 +84137,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "return_as", @@ -79314,7 +84148,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "dtype", @@ -79324,7 +84159,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -79348,7 +84184,8 @@ "docstring": { "type": "ndarray of shape (image_height, image_width) or (image_height, image_width, n_channels)", "description": "The original image data. For color images, the last dimension specifies\nthe channel: a RGB image would have `n_channels=3`." - } + }, + "refined_type": {} }, { "name": "patch_size", @@ -79358,7 +84195,8 @@ "docstring": { "type": "tuple of int (patch_height, patch_width)", "description": "The dimensions of one patch." - } + }, + "refined_type": {} }, { "name": "max_patches", @@ -79368,7 +84206,8 @@ "docstring": { "type": "int or float, default=None", "description": "The maximum number of patches to extract. 
If `max_patches` is a float\nbetween 0 and 1, it is taken to be a proportion of the total number\nof patches." - } + }, + "refined_type": {} }, { "name": "random_state", @@ -79378,13 +84217,14 @@ "docstring": { "type": "int, RandomState instance, default=None", "description": "Determines the random number generator used for random sampling when\n`max_patches` is not None. Use an int to make the randomness\ndeterministic.\nSee :term:`Glossary `." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Reshape a 2D image into a collection of patches\n\nThe resulting patches are allocated in a dedicated array. Read more in the :ref:`User Guide `.", - "docstring": "Reshape a 2D image into a collection of patches\n\nThe resulting patches are allocated in a dedicated array.\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\nimage : ndarray of shape (image_height, image_width) or (image_height, image_width, n_channels)\n The original image data. For color images, the last dimension specifies\n the channel: a RGB image would have `n_channels=3`.\n\npatch_size : tuple of int (patch_height, patch_width)\n The dimensions of one patch.\n\nmax_patches : int or float, default=None\n The maximum number of patches to extract. If `max_patches` is a float\n between 0 and 1, it is taken to be a proportion of the total number\n of patches.\n\nrandom_state : int, RandomState instance, default=None\n Determines the random number generator used for random sampling when\n `max_patches` is not None. Use an int to make the randomness\n deterministic.\n See :term:`Glossary `.\n\nReturns\n-------\npatches : array of shape (n_patches, patch_height, patch_width) or (n_patches, patch_height, patch_width, n_channels)\n The collection of patches extracted from the image, where `n_patches`\n is either `max_patches` or the total number of patches that can be\n extracted.\n\nExamples\n--------\n>>> from sklearn.datasets import load_sample_image\n>>> from sklearn.feature_extraction import image\n>>> # Use the array data from the first image in this dataset:\n>>> one_image = load_sample_image(\"china.jpg\")\n>>> print('Image shape: {}'.format(one_image.shape))\nImage shape: (427, 640, 3)\n>>> patches = image.extract_patches_2d(one_image, (2, 2))\n>>> print('Patches shape: {}'.format(patches.shape))\nPatches shape: (272214, 2, 2, 3)\n>>> # Here are just two of these patches:\n>>> print(patches[1])\n[[[174 201 231]\n [174 201 231]]\n [[173 200 230]\n [173 200 230]]]\n>>> print(patches[800])\n[[[187 214 243]\n [188 215 244]]\n [[187 214 243]\n [188 215 244]]]", + "description": "Reshape a 2D image into a collection of patches\n\nThe resulting patches are allocated in a dedicated array.\n\nRead more in the :ref:`User Guide `.", + "docstring": "Reshape a 2D image into a collection of patches\n\n The resulting patches are allocated in a dedicated array.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n image : ndarray of shape (image_height, image_width) or (image_height, image_width, n_channels)\n The original image data. For color images, the last dimension specifies\n the channel: a RGB image would have `n_channels=3`.\n\n patch_size : tuple of int (patch_height, patch_width)\n The dimensions of one patch.\n\n max_patches : int or float, default=None\n The maximum number of patches to extract. 
If `max_patches` is a float\n between 0 and 1, it is taken to be a proportion of the total number\n of patches.\n\n random_state : int, RandomState instance, default=None\n Determines the random number generator used for random sampling when\n `max_patches` is not None. Use an int to make the randomness\n deterministic.\n See :term:`Glossary `.\n\n Returns\n -------\n patches : array of shape (n_patches, patch_height, patch_width) or (n_patches, patch_height, patch_width, n_channels)\n The collection of patches extracted from the image, where `n_patches`\n is either `max_patches` or the total number of patches that can be\n extracted.\n\n Examples\n --------\n >>> from sklearn.datasets import load_sample_image\n >>> from sklearn.feature_extraction import image\n >>> # Use the array data from the first image in this dataset:\n >>> one_image = load_sample_image(\"china.jpg\")\n >>> print('Image shape: {}'.format(one_image.shape))\n Image shape: (427, 640, 3)\n >>> patches = image.extract_patches_2d(one_image, (2, 2))\n >>> print('Patches shape: {}'.format(patches.shape))\n Patches shape: (272214, 2, 2, 3)\n >>> # Here are just two of these patches:\n >>> print(patches[1])\n [[[174 201 231]\n [174 201 231]]\n [[173 200 230]\n [173 200 230]]]\n >>> print(patches[800])\n [[[187 214 243]\n [188 215 244]]\n [[187 214 243]\n [188 215 244]]]\n ", "source_code": "\ndef extract_patches_2d(image, patch_size, *, max_patches=None, random_state=None):\n \"\"\"Reshape a 2D image into a collection of patches\n\n The resulting patches are allocated in a dedicated array.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n image : ndarray of shape (image_height, image_width) or (image_height, image_width, n_channels)\n The original image data. For color images, the last dimension specifies\n the channel: a RGB image would have `n_channels=3`.\n\n patch_size : tuple of int (patch_height, patch_width)\n The dimensions of one patch.\n\n max_patches : int or float, default=None\n The maximum number of patches to extract. If `max_patches` is a float\n between 0 and 1, it is taken to be a proportion of the total number\n of patches.\n\n random_state : int, RandomState instance, default=None\n Determines the random number generator used for random sampling when\n `max_patches` is not None. 
Use an int to make the randomness\n deterministic.\n See :term:`Glossary `.\n\n Returns\n -------\n patches : array of shape (n_patches, patch_height, patch_width) or (n_patches, patch_height, patch_width, n_channels)\n The collection of patches extracted from the image, where `n_patches`\n is either `max_patches` or the total number of patches that can be\n extracted.\n\n Examples\n --------\n >>> from sklearn.datasets import load_sample_image\n >>> from sklearn.feature_extraction import image\n >>> # Use the array data from the first image in this dataset:\n >>> one_image = load_sample_image(\"china.jpg\")\n >>> print('Image shape: {}'.format(one_image.shape))\n Image shape: (427, 640, 3)\n >>> patches = image.extract_patches_2d(one_image, (2, 2))\n >>> print('Patches shape: {}'.format(patches.shape))\n Patches shape: (272214, 2, 2, 3)\n >>> # Here are just two of these patches:\n >>> print(patches[1])\n [[[174 201 231]\n [174 201 231]]\n [[173 200 230]\n [173 200 230]]]\n >>> print(patches[800])\n [[[187 214 243]\n [188 215 244]]\n [[187 214 243]\n [188 215 244]]]\n \"\"\"\n (i_h, i_w) = image.shape[:2]\n (p_h, p_w) = patch_size\n if p_h > i_h:\n raise ValueError('Height of the patch should be less than the height of the image.')\n if p_w > i_w:\n raise ValueError('Width of the patch should be less than the width of the image.')\n image = check_array(image, allow_nd=True)\n image = image.reshape((i_h, i_w, -1))\n n_colors = image.shape[-1]\n extracted_patches = _extract_patches(image, patch_shape=(p_h, p_w, n_colors), extraction_step=1)\n n_patches = _compute_n_patches(i_h, i_w, p_h, p_w, max_patches)\n if max_patches:\n rng = check_random_state(random_state)\n i_s = rng.randint(i_h - p_h + 1, size=n_patches)\n j_s = rng.randint(i_w - p_w + 1, size=n_patches)\n patches = extracted_patches[i_s, j_s, 0]\n else:\n patches = extracted_patches\n patches = patches.reshape(-1, p_h, p_w, n_colors)\n if patches.shape[-1] == 1:\n return patches.reshape((n_patches, p_h, p_w))\n else:\n return patches" }, { @@ -79402,7 +84242,8 @@ "docstring": { "type": "int", "description": "Dimension in x axis" - } + }, + "refined_type": {} }, { "name": "n_y", @@ -79412,7 +84253,8 @@ "docstring": { "type": "int", "description": "Dimension in y axis" - } + }, + "refined_type": {} }, { "name": "n_z", @@ -79422,7 +84264,8 @@ "docstring": { "type": "int, default=1", "description": "Dimension in z axis" - } + }, + "refined_type": {} }, { "name": "mask", @@ -79432,7 +84275,8 @@ "docstring": { "type": "ndarray of shape (n_x, n_y, n_z), dtype=bool, default=None", "description": "An optional mask of the image, to consider only part of the\npixels." - } + }, + "refined_type": {} }, { "name": "return_as", @@ -79442,7 +84286,8 @@ "docstring": { "type": "np.ndarray or a sparse matrix class, default=sparse.coo_matrix", "description": "The class to use to build the returned adjacency matrix." - } + }, + "refined_type": {} }, { "name": "dtype", @@ -79452,13 +84297,14 @@ "docstring": { "type": "dtype, default=int", "description": "The data of the returned sparse matrix. 
By default it is int" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Graph of the pixel-to-pixel connections\n\nEdges exist if 2 voxels are connected.", - "docstring": "Graph of the pixel-to-pixel connections\n\nEdges exist if 2 voxels are connected.\n\nParameters\n----------\nn_x : int\n Dimension in x axis\nn_y : int\n Dimension in y axis\nn_z : int, default=1\n Dimension in z axis\nmask : ndarray of shape (n_x, n_y, n_z), dtype=bool, default=None\n An optional mask of the image, to consider only part of the\n pixels.\nreturn_as : np.ndarray or a sparse matrix class, default=sparse.coo_matrix\n The class to use to build the returned adjacency matrix.\ndtype : dtype, default=int\n The data of the returned sparse matrix. By default it is int\n\nNotes\n-----\nFor scikit-learn versions 0.14.1 and prior, return_as=np.ndarray was\nhandled by returning a dense np.matrix instance. Going forward, np.ndarray\nreturns an np.ndarray, as expected.\n\nFor compatibility, user code relying on this method should wrap its\ncalls in ``np.asarray`` to avoid type issues.", + "docstring": "Graph of the pixel-to-pixel connections\n\n Edges exist if 2 voxels are connected.\n\n Parameters\n ----------\n n_x : int\n Dimension in x axis\n n_y : int\n Dimension in y axis\n n_z : int, default=1\n Dimension in z axis\n mask : ndarray of shape (n_x, n_y, n_z), dtype=bool, default=None\n An optional mask of the image, to consider only part of the\n pixels.\n return_as : np.ndarray or a sparse matrix class, default=sparse.coo_matrix\n The class to use to build the returned adjacency matrix.\n dtype : dtype, default=int\n The data of the returned sparse matrix. By default it is int\n\n Notes\n -----\n For scikit-learn versions 0.14.1 and prior, return_as=np.ndarray was\n handled by returning a dense np.matrix instance. Going forward, np.ndarray\n returns an np.ndarray, as expected.\n\n For compatibility, user code relying on this method should wrap its\n calls in ``np.asarray`` to avoid type issues.\n ", "source_code": "\ndef grid_to_graph(n_x, n_y, n_z=1, *, mask=None, return_as=sparse.coo_matrix, dtype=int):\n \"\"\"Graph of the pixel-to-pixel connections\n\n Edges exist if 2 voxels are connected.\n\n Parameters\n ----------\n n_x : int\n Dimension in x axis\n n_y : int\n Dimension in y axis\n n_z : int, default=1\n Dimension in z axis\n mask : ndarray of shape (n_x, n_y, n_z), dtype=bool, default=None\n An optional mask of the image, to consider only part of the\n pixels.\n return_as : np.ndarray or a sparse matrix class, default=sparse.coo_matrix\n The class to use to build the returned adjacency matrix.\n dtype : dtype, default=int\n The data of the returned sparse matrix. By default it is int\n\n Notes\n -----\n For scikit-learn versions 0.14.1 and prior, return_as=np.ndarray was\n handled by returning a dense np.matrix instance. Going forward, np.ndarray\n returns an np.ndarray, as expected.\n\n For compatibility, user code relying on this method should wrap its\n calls in ``np.asarray`` to avoid type issues.\n \"\"\"\n return _to_graph(n_x, n_y, n_z, mask=mask, return_as=return_as, dtype=dtype)" }, { @@ -79476,7 +84322,8 @@ "docstring": { "type": "ndarray of shape (height, width) or (height, width, channel)", "description": "2D or 3D image." 
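A minimal usage sketch for `grid_to_graph` as documented above (the grid size is made up): it returns the pixel-to-pixel adjacency of the grid, by default as a sparse COO matrix.

from scipy import sparse
from sklearn.feature_extraction.image import grid_to_graph

# 3 x 4 grid, n_z defaults to 1: one node per pixel, edges between
# neighbouring pixels along each axis.
connectivity = grid_to_graph(n_x=3, n_y=4)
print(connectivity.shape)              # (12, 12)
print(sparse.issparse(connectivity))   # True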
- } + }, + "refined_type": {} }, { "name": "mask", @@ -79486,7 +84333,8 @@ "docstring": { "type": "ndarray of shape (height, width) or (height, width, channel), dtype=bool, default=None", "description": "An optional mask of the image, to consider only part of the\npixels." - } + }, + "refined_type": {} }, { "name": "return_as", @@ -79496,7 +84344,8 @@ "docstring": { "type": "np.ndarray or a sparse matrix class, default=sparse.coo_matrix", "description": "The class to use to build the returned adjacency matrix." - } + }, + "refined_type": {} }, { "name": "dtype", @@ -79506,13 +84355,14 @@ "docstring": { "type": "dtype, default=None", "description": "The data of the returned sparse matrix. By default it is the\ndtype of img" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Graph of the pixel-to-pixel gradient connections\n\nEdges are weighted with the gradient values. Read more in the :ref:`User Guide `.", - "docstring": "Graph of the pixel-to-pixel gradient connections\n\nEdges are weighted with the gradient values.\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\nimg : ndarray of shape (height, width) or (height, width, channel)\n 2D or 3D image.\nmask : ndarray of shape (height, width) or (height, width, channel), dtype=bool, default=None\n An optional mask of the image, to consider only part of the\n pixels.\nreturn_as : np.ndarray or a sparse matrix class, default=sparse.coo_matrix\n The class to use to build the returned adjacency matrix.\ndtype : dtype, default=None\n The data of the returned sparse matrix. By default it is the\n dtype of img\n\nNotes\n-----\nFor scikit-learn versions 0.14.1 and prior, return_as=np.ndarray was\nhandled by returning a dense np.matrix instance. Going forward, np.ndarray\nreturns an np.ndarray, as expected.\n\nFor compatibility, user code relying on this method should wrap its\ncalls in ``np.asarray`` to avoid type issues.", + "description": "Graph of the pixel-to-pixel gradient connections\n\nEdges are weighted with the gradient values.\n\nRead more in the :ref:`User Guide `.", + "docstring": "Graph of the pixel-to-pixel gradient connections\n\n Edges are weighted with the gradient values.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n img : ndarray of shape (height, width) or (height, width, channel)\n 2D or 3D image.\n mask : ndarray of shape (height, width) or (height, width, channel), dtype=bool, default=None\n An optional mask of the image, to consider only part of the\n pixels.\n return_as : np.ndarray or a sparse matrix class, default=sparse.coo_matrix\n The class to use to build the returned adjacency matrix.\n dtype : dtype, default=None\n The data of the returned sparse matrix. By default it is the\n dtype of img\n\n Notes\n -----\n For scikit-learn versions 0.14.1 and prior, return_as=np.ndarray was\n handled by returning a dense np.matrix instance. 
Going forward, np.ndarray\n returns an np.ndarray, as expected.\n\n For compatibility, user code relying on this method should wrap its\n calls in ``np.asarray`` to avoid type issues.\n ", "source_code": "\ndef img_to_graph(img, *, mask=None, return_as=sparse.coo_matrix, dtype=None):\n \"\"\"Graph of the pixel-to-pixel gradient connections\n\n Edges are weighted with the gradient values.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n img : ndarray of shape (height, width) or (height, width, channel)\n 2D or 3D image.\n mask : ndarray of shape (height, width) or (height, width, channel), dtype=bool, default=None\n An optional mask of the image, to consider only part of the\n pixels.\n return_as : np.ndarray or a sparse matrix class, default=sparse.coo_matrix\n The class to use to build the returned adjacency matrix.\n dtype : dtype, default=None\n The data of the returned sparse matrix. By default it is the\n dtype of img\n\n Notes\n -----\n For scikit-learn versions 0.14.1 and prior, return_as=np.ndarray was\n handled by returning a dense np.matrix instance. Going forward, np.ndarray\n returns an np.ndarray, as expected.\n\n For compatibility, user code relying on this method should wrap its\n calls in ``np.asarray`` to avoid type issues.\n \"\"\"\n img = np.atleast_3d(img)\n (n_x, n_y, n_z) = img.shape\n return _to_graph(n_x, n_y, n_z, mask, img, return_as, dtype)" }, { @@ -79530,7 +84380,8 @@ "docstring": { "type": "ndarray of shape (n_patches, patch_height, patch_width) or (n_patches, patch_height, patch_width, n_channels)", "description": "The complete set of patches. If the patches contain colour information,\nchannels are indexed along the last dimension: RGB patches would\nhave `n_channels=3`." - } + }, + "refined_type": {} }, { "name": "image_size", @@ -79540,13 +84391,14 @@ "docstring": { "type": "tuple of int (image_height, image_width) or (image_height, image_width, n_channels)", "description": "The size of the image that will be reconstructed." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Reconstruct the image from all of its patches.\n\nPatches are assumed to overlap and the image is constructed by filling in the patches from left to right, top to bottom, averaging the overlapping regions. Read more in the :ref:`User Guide `.", - "docstring": "Reconstruct the image from all of its patches.\n\nPatches are assumed to overlap and the image is constructed by filling in\nthe patches from left to right, top to bottom, averaging the overlapping\nregions.\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\npatches : ndarray of shape (n_patches, patch_height, patch_width) or (n_patches, patch_height, patch_width, n_channels)\n The complete set of patches. 
If the patches contain colour information,\n channels are indexed along the last dimension: RGB patches would\n have `n_channels=3`.\n\nimage_size : tuple of int (image_height, image_width) or (image_height, image_width, n_channels)\n The size of the image that will be reconstructed.\n\nReturns\n-------\nimage : ndarray of shape image_size\n The reconstructed image.", + "description": "Reconstruct the image from all of its patches.\n\nPatches are assumed to overlap and the image is constructed by filling in\nthe patches from left to right, top to bottom, averaging the overlapping\nregions.\n\nRead more in the :ref:`User Guide `.", + "docstring": "Reconstruct the image from all of its patches.\n\n Patches are assumed to overlap and the image is constructed by filling in\n the patches from left to right, top to bottom, averaging the overlapping\n regions.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n patches : ndarray of shape (n_patches, patch_height, patch_width) or (n_patches, patch_height, patch_width, n_channels)\n The complete set of patches. If the patches contain colour information,\n channels are indexed along the last dimension: RGB patches would\n have `n_channels=3`.\n\n image_size : tuple of int (image_height, image_width) or (image_height, image_width, n_channels)\n The size of the image that will be reconstructed.\n\n Returns\n -------\n image : ndarray of shape image_size\n The reconstructed image.\n ", "source_code": "\ndef reconstruct_from_patches_2d(patches, image_size):\n \"\"\"Reconstruct the image from all of its patches.\n\n Patches are assumed to overlap and the image is constructed by filling in\n the patches from left to right, top to bottom, averaging the overlapping\n regions.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n patches : ndarray of shape (n_patches, patch_height, patch_width) or (n_patches, patch_height, patch_width, n_channels)\n The complete set of patches. 
If the patches contain colour information,\n channels are indexed along the last dimension: RGB patches would\n have `n_channels=3`.\n\n image_size : tuple of int (image_height, image_width) or (image_height, image_width, n_channels)\n The size of the image that will be reconstructed.\n\n Returns\n -------\n image : ndarray of shape image_size\n The reconstructed image.\n \"\"\"\n (i_h, i_w) = image_size[:2]\n (p_h, p_w) = patches.shape[1:3]\n img = np.zeros(image_size)\n n_h = i_h - p_h + 1\n n_w = i_w - p_w + 1\n for (p, (i, j)) in zip(patches, product(range(n_h), range(n_w))):\n img[i:i + p_h, j:j + p_w] += p\n for i in range(i_h):\n for j in range(i_w):\n img[i, j] /= float(min(i + 1, p_h, i_h - i) * min(j + 1, p_w, i_w - j))\n return img" }, { @@ -79564,7 +84416,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "top_path", @@ -79574,13 +84427,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef configuration(parent_package='', top_path=None):\n import numpy\n from numpy.distutils.misc_util import Configuration\n config = Configuration('feature_extraction', parent_package, top_path)\n libraries = []\n if os.name == 'posix':\n libraries.append('m')\n if platform.python_implementation() != 'PyPy':\n config.add_extension('_hashing_fast', sources=['_hashing_fast.pyx'], include_dirs=[numpy.get_include()], libraries=libraries)\n config.add_subpackage('tests')\n return config" }, { @@ -79598,7 +84452,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "input", @@ -79608,6 +84463,10 @@ "docstring": { "type": "{'filename', 'file', 'content'}, default='content'", "description": "- If `'filename'`, the sequence passed as an argument to fit is\n expected to be a list of filenames that need reading to fetch\n the raw content to analyze.\n\n- If `'file'`, the sequence items must have a 'read' method (file-like\n object) that is called to fetch the bytes in memory.\n\n- If `'content'`, the input is expected to be a sequence of items that\n can be of type string or byte." + }, + "refined_type": { + "kind": "EnumType", + "values": ["filename", "content", "file"] } }, { @@ -79618,7 +84477,8 @@ "docstring": { "type": "str, default='utf-8'", "description": "If bytes or files are given to analyze, this encoding is used to\ndecode." - } + }, + "refined_type": {} }, { "name": "decode_error", @@ -79628,6 +84488,10 @@ "docstring": { "type": "{'strict', 'ignore', 'replace'}, default='strict'", "description": "Instruction on what to do if a byte sequence is given to analyze that\ncontains characters not of the given `encoding`. By default, it is\n'strict', meaning that a UnicodeDecodeError will be raised. Other\nvalues are 'ignore' and 'replace'." + }, + "refined_type": { + "kind": "EnumType", + "values": ["strict", "replace", "ignore"] } }, { @@ -79638,6 +84502,10 @@ "docstring": { "type": "{'ascii', 'unicode'}, default=None", "description": "Remove accents and perform other character normalization\nduring the preprocessing step.\n'ascii' is a fast method that only works on characters that have\nan direct ASCII mapping.\n'unicode' is a slightly slower method that works on any characters.\nNone (default) does nothing.\n\nBoth 'ascii' and 'unicode' use NFKD normalization from\n:func:`unicodedata.normalize`." 
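A minimal round-trip sketch for `extract_patches_2d` and `reconstruct_from_patches_2d` as documented above (the 8x8 random image is made up): extracting every overlapping patch and averaging them back reproduces the original image.

import numpy as np
from sklearn.feature_extraction.image import extract_patches_2d, reconstruct_from_patches_2d

rng = np.random.RandomState(0)
image = rng.rand(8, 8)

# Every overlapping 3x3 patch: (8 - 3 + 1) * (8 - 3 + 1) = 36 patches.
patches = extract_patches_2d(image, (3, 3))
print(patches.shape)   # (36, 3, 3)

# A reproducible random subset via max_patches and random_state.
subset = extract_patches_2d(image, (3, 3), max_patches=10, random_state=0)
print(subset.shape)    # (10, 3, 3)

# Averaging the overlapping regions of the full patch set restores the image.
restored = reconstruct_from_patches_2d(patches, (8, 8))
print(np.allclose(restored, image))   # True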
+ }, + "refined_type": { + "kind": "EnumType", + "values": ["ascii", "unicode"] } }, { @@ -79648,7 +84516,8 @@ "docstring": { "type": "bool, default=True", "description": "Convert all characters to lowercase before tokenizing." - } + }, + "refined_type": {} }, { "name": "preprocessor", @@ -79657,8 +84526,9 @@ "assigned_by": "NAME_ONLY", "docstring": { "type": "callable, default=None", - "description": "Override the preprocessing (strip_accents and lowercase) stage while\npreserving the tokenizing and n-grams generation steps.\nOnly applies if ``analyzer is not callable``." - } + "description": "Override the preprocessing (strip_accents and lowercase) stage while\npreserving the tokenizing and n-grams generation steps.\nOnly applies if ``analyzer`` is not callable." + }, + "refined_type": {} }, { "name": "tokenizer", @@ -79668,7 +84538,8 @@ "docstring": { "type": "callable, default=None", "description": "Override the string tokenization step while preserving the\npreprocessing and n-grams generation steps.\nOnly applies if ``analyzer == 'word'``." - } + }, + "refined_type": {} }, { "name": "stop_words", @@ -79678,6 +84549,10 @@ "docstring": { "type": "{'english'}, list, default=None", "description": "If 'english', a built-in stop word list for English is used.\nThere are several known issues with 'english' and you should\nconsider an alternative (see :ref:`stop_words`).\n\nIf a list, that list is assumed to contain stop words, all of which\nwill be removed from the resulting tokens.\nOnly applies if ``analyzer == 'word'``.\n\nIf None, no stop words will be used. max_df can be set to a value\nin the range [0.7, 1.0) to automatically detect and filter stop\nwords based on intra corpus document frequency of terms." + }, + "refined_type": { + "kind": "EnumType", + "values": ["english"] } }, { @@ -79688,7 +84563,8 @@ "docstring": { "type": "str, default=r\"(?u)\\\\b\\\\w\\\\w+\\\\b\"", "description": "Regular expression denoting what constitutes a \"token\", only used\nif ``analyzer == 'word'``. The default regexp select tokens of 2\nor more alphanumeric characters (punctuation is completely ignored\nand always treated as a token separator).\n\nIf there is a capturing group in token_pattern then the\ncaptured group content, not the entire match, becomes the token.\nAt most one capturing group is permitted." - } + }, + "refined_type": {} }, { "name": "ngram_range", @@ -79697,8 +84573,9 @@ "assigned_by": "NAME_ONLY", "docstring": { "type": "tuple (min_n, max_n), default=(1, 1)", - "description": "The lower and upper boundary of the range of n-values for different\nword n-grams or char n-grams to be extracted. All values of n such\nsuch that min_n <= n <= max_n will be used. For example an\n``ngram_range`` of ``(1, 1)`` means only unigrams, ``(1, 2)`` means\nunigrams and bigrams, and ``(2, 2)`` means only bigrams.\nOnly applies if ``analyzer is not callable``." - } + "description": "The lower and upper boundary of the range of n-values for different\nword n-grams or char n-grams to be extracted. All values of n such\nsuch that min_n <= n <= max_n will be used. For example an\n``ngram_range`` of ``(1, 1)`` means only unigrams, ``(1, 2)`` means\nunigrams and bigrams, and ``(2, 2)`` means only bigrams.\nOnly applies if ``analyzer`` is not callable." 
+ }, + "refined_type": {} }, { "name": "analyzer", @@ -79708,6 +84585,10 @@ "docstring": { "type": "{'word', 'char', 'char_wb'} or callable, default='word'", "description": "Whether the feature should be made of word n-gram or character\nn-grams.\nOption 'char_wb' creates character n-grams only from text inside\nword boundaries; n-grams at the edges of words are padded with space.\n\nIf a callable is passed it is used to extract the sequence of features\nout of the raw, unprocessed input.\n\n.. versionchanged:: 0.21\n\nSince v0.21, if ``input`` is ``filename`` or ``file``, the data is\nfirst read from the file and then passed to the given callable\nanalyzer." + }, + "refined_type": { + "kind": "EnumType", + "values": ["word", "char_wb", "char"] } }, { @@ -79718,6 +84599,14 @@ "docstring": { "type": "float in range [0.0, 1.0] or int, default=1.0", "description": "When building the vocabulary ignore terms that have a document\nfrequency strictly higher than the given threshold (corpus-specific\nstop words).\nIf float, the parameter represents a proportion of documents, integer\nabsolute counts.\nThis parameter is ignored if vocabulary is not None." + }, + "refined_type": { + "kind": "BoundaryType", + "base_type": "float", + "min": 0.0, + "max": 1.0, + "min_inclusive": true, + "max_inclusive": true } }, { @@ -79728,6 +84617,14 @@ "docstring": { "type": "float in range [0.0, 1.0] or int, default=1", "description": "When building the vocabulary ignore terms that have a document\nfrequency strictly lower than the given threshold. This value is also\ncalled cut-off in the literature.\nIf float, the parameter represents a proportion of documents, integer\nabsolute counts.\nThis parameter is ignored if vocabulary is not None." + }, + "refined_type": { + "kind": "BoundaryType", + "base_type": "float", + "min": 0.0, + "max": 1.0, + "min_inclusive": true, + "max_inclusive": true } }, { @@ -79738,7 +84635,8 @@ "docstring": { "type": "int, default=None", "description": "If not None, build a vocabulary that only consider the top\nmax_features ordered by term frequency across the corpus.\n\nThis parameter is ignored if vocabulary is not None." - } + }, + "refined_type": {} }, { "name": "vocabulary", @@ -79748,7 +84646,8 @@ "docstring": { "type": "Mapping or iterable, default=None", "description": "Either a Mapping (e.g., a dict) where keys are terms and values are\nindices in the feature matrix, or an iterable over terms. If not\ngiven, a vocabulary is determined from the input documents. Indices\nin the mapping should not be repeated and should not have any gap\nbetween 0 and the largest index." - } + }, + "refined_type": {} }, { "name": "binary", @@ -79758,7 +84657,8 @@ "docstring": { "type": "bool, default=False", "description": "If True, all non zero counts are set to 1. This is useful for discrete\nprobabilistic models that model binary events rather than integer\ncounts." - } + }, + "refined_type": {} }, { "name": "dtype", @@ -79768,13 +84668,14 @@ "docstring": { "type": "type, default=np.int64", "description": "Type of the matrix returned by fit_transform() or transform()." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, *, input='content', encoding='utf-8', decode_error='strict', strip_accents=None, lowercase=True, preprocessor=None, tokenizer=None, stop_words=None, token_pattern='(?u)\\\\b\\\\w\\\\w+\\\\b', ngram_range=(1, 1), analyzer='word', max_df=1.0, min_df=1, max_features=None, vocabulary=None, binary=False, dtype=np.int64):\n self.input = input\n self.encoding = encoding\n self.decode_error = decode_error\n self.strip_accents = strip_accents\n self.preprocessor = preprocessor\n self.tokenizer = tokenizer\n self.analyzer = analyzer\n self.lowercase = lowercase\n self.token_pattern = token_pattern\n self.stop_words = stop_words\n self.max_df = max_df\n self.min_df = min_df\n self.max_features = max_features\n self.ngram_range = ngram_range\n self.vocabulary = vocabulary\n self.binary = binary\n self.dtype = dtype" }, { @@ -79792,7 +84693,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "raw_documents", @@ -79802,7 +84704,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "fixed_vocab", @@ -79812,7 +84715,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -79836,7 +84740,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -79846,7 +84751,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "vocabulary", @@ -79856,7 +84762,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "high", @@ -79866,7 +84773,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "low", @@ -79876,7 +84784,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "limit", @@ -79886,13 +84795,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Remove too rare or too common features.\n\nPrune features that are non zero in more samples than high or less documents than low, modifying the vocabulary, and restricting it to at most the limit most frequent. 
This does not prune samples with zero features.", - "docstring": "Remove too rare or too common features.\n\nPrune features that are non zero in more samples than high or less\ndocuments than low, modifying the vocabulary, and restricting it to\nat most the limit most frequent.\n\nThis does not prune samples with zero features.", + "description": "Remove too rare or too common features.\n\nPrune features that are non zero in more samples than high or less\ndocuments than low, modifying the vocabulary, and restricting it to\nat most the limit most frequent.\n\nThis does not prune samples with zero features.", + "docstring": "Remove too rare or too common features.\n\n Prune features that are non zero in more samples than high or less\n documents than low, modifying the vocabulary, and restricting it to\n at most the limit most frequent.\n\n This does not prune samples with zero features.\n ", "source_code": "\ndef _limit_features(self, X, vocabulary, high=None, low=None, limit=None):\n \"\"\"Remove too rare or too common features.\n\n Prune features that are non zero in more samples than high or less\n documents than low, modifying the vocabulary, and restricting it to\n at most the limit most frequent.\n\n This does not prune samples with zero features.\n \"\"\"\n if high is None and low is None and limit is None:\n return X, set()\n dfs = _document_frequency(X)\n mask = np.ones(len(dfs), dtype=bool)\n if high is not None:\n mask &= dfs <= high\n if low is not None:\n mask &= dfs >= low\n if limit is not None and mask.sum() > limit:\n tfs = np.asarray(X.sum(axis=0)).ravel()\n mask_inds = (-tfs[mask]).argsort()[:limit]\n new_mask = np.zeros(len(dfs), dtype=bool)\n new_mask[np.where(mask)[0][mask_inds]] = True\n mask = new_mask\n new_indices = np.cumsum(mask) - 1\n removed_terms = set()\n for (term, old_index) in list(vocabulary.items()):\n if mask[old_index]:\n vocabulary[term] = new_indices[old_index]\n else:\n del vocabulary[term]\n removed_terms.add(term)\n kept_indices = np.where(mask)[0]\n if len(kept_indices) == 0:\n raise ValueError('After pruning, no terms remain. 
Try a lower min_df or a higher max_df.')\n return X[:, kept_indices], removed_terms" }, { @@ -79910,13 +84820,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _more_tags(self):\n return {'X_types': ['string']}" }, { @@ -79934,7 +84845,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -79944,7 +84856,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "vocabulary", @@ -79954,13 +84867,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Sort features by name\n\nReturns a reordered matrix and modifies the vocabulary in place", - "docstring": "Sort features by name\n\nReturns a reordered matrix and modifies the vocabulary in place", + "docstring": "Sort features by name\n\n Returns a reordered matrix and modifies the vocabulary in place\n ", "source_code": "\ndef _sort_features(self, X, vocabulary):\n \"\"\"Sort features by name\n\n Returns a reordered matrix and modifies the vocabulary in place\n \"\"\"\n sorted_features = sorted(vocabulary.items())\n map_index = np.empty(len(sorted_features), dtype=X.indices.dtype)\n for (new_val, (term, old_val)) in enumerate(sorted_features):\n vocabulary[term] = new_val\n map_index[old_val] = new_val\n X.indices = map_index.take(X.indices, mode='clip')\n return X" }, { @@ -79978,7 +84892,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -80002,7 +84917,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "raw_documents", @@ -80012,7 +84928,8 @@ "docstring": { "type": "iterable", "description": "An iterable which generates either str, unicode or file objects." - } + }, + "refined_type": {} }, { "name": "y", @@ -80022,13 +84939,14 @@ "docstring": { "type": "None", "description": "This parameter is ignored." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Learn a vocabulary dictionary of all tokens in the raw documents.", - "docstring": "Learn a vocabulary dictionary of all tokens in the raw documents.\n\nParameters\n----------\nraw_documents : iterable\n An iterable which generates either str, unicode or file objects.\n\ny : None\n This parameter is ignored.\n\nReturns\n-------\nself : object\n Fitted vectorizer.", + "docstring": "Learn a vocabulary dictionary of all tokens in the raw documents.\n\n Parameters\n ----------\n raw_documents : iterable\n An iterable which generates either str, unicode or file objects.\n\n y : None\n This parameter is ignored.\n\n Returns\n -------\n self : object\n Fitted vectorizer.\n ", "source_code": "\ndef fit(self, raw_documents, y=None):\n \"\"\"Learn a vocabulary dictionary of all tokens in the raw documents.\n\n Parameters\n ----------\n raw_documents : iterable\n An iterable which generates either str, unicode or file objects.\n\n y : None\n This parameter is ignored.\n\n Returns\n -------\n self : object\n Fitted vectorizer.\n \"\"\"\n self._warn_for_unused_params()\n self.fit_transform(raw_documents)\n return self" }, { @@ -80046,7 +84964,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "raw_documents", @@ -80056,7 +84975,8 @@ "docstring": { "type": "iterable", "description": "An iterable which generates either str, unicode or file objects." - } + }, + "refined_type": {} }, { "name": "y", @@ -80066,13 +84986,14 @@ "docstring": { "type": "None", "description": "This parameter is ignored." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Learn the vocabulary dictionary and return document-term matrix.\n\nThis is equivalent to fit followed by transform, but more efficiently implemented.", - "docstring": "Learn the vocabulary dictionary and return document-term matrix.\n\nThis is equivalent to fit followed by transform, but more efficiently\nimplemented.\n\nParameters\n----------\nraw_documents : iterable\n An iterable which generates either str, unicode or file objects.\n\ny : None\n This parameter is ignored.\n\nReturns\n-------\nX : array of shape (n_samples, n_features)\n Document-term matrix.", + "description": "Learn the vocabulary dictionary and return document-term matrix.\n\nThis is equivalent to fit followed by transform, but more efficiently\nimplemented.", + "docstring": "Learn the vocabulary dictionary and return document-term matrix.\n\n This is equivalent to fit followed by transform, but more efficiently\n implemented.\n\n Parameters\n ----------\n raw_documents : iterable\n An iterable which generates either str, unicode or file objects.\n\n y : None\n This parameter is ignored.\n\n Returns\n -------\n X : array of shape (n_samples, n_features)\n Document-term matrix.\n ", "source_code": "\ndef fit_transform(self, raw_documents, y=None):\n \"\"\"Learn the vocabulary dictionary and return document-term matrix.\n\n This is equivalent to fit followed by transform, but more efficiently\n implemented.\n\n Parameters\n ----------\n raw_documents : iterable\n An iterable which generates either str, unicode or file objects.\n\n y : None\n This parameter is ignored.\n\n Returns\n -------\n X : array of shape (n_samples, n_features)\n Document-term matrix.\n \"\"\"\n if isinstance(raw_documents, str):\n raise ValueError('Iterable over raw text documents expected, string object received.')\n self._validate_params()\n 
self._validate_vocabulary()\n max_df = self.max_df\n min_df = self.min_df\n max_features = self.max_features\n if self.fixed_vocabulary_ and self.lowercase:\n for term in self.vocabulary:\n if any(map(str.isupper, term)):\n warnings.warn(\"Upper case characters found in vocabulary while 'lowercase' is True. These entries will not be matched with any documents\")\n break\n (vocabulary, X) = self._count_vocab(raw_documents, self.fixed_vocabulary_)\n if self.binary:\n X.data.fill(1)\n if not self.fixed_vocabulary_:\n n_doc = X.shape[0]\n max_doc_count = max_df if isinstance(max_df, numbers.Integral) else max_df * n_doc\n min_doc_count = min_df if isinstance(min_df, numbers.Integral) else min_df * n_doc\n if max_doc_count < min_doc_count:\n raise ValueError('max_df corresponds to < documents than min_df')\n if max_features is not None:\n X = self._sort_features(X, vocabulary)\n (X, self.stop_words_) = self._limit_features(X, vocabulary, max_doc_count, min_doc_count, max_features)\n if max_features is None:\n X = self._sort_features(X, vocabulary)\n self.vocabulary_ = vocabulary\n return X" }, { @@ -80092,13 +85013,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Array mapping from feature integer indices to feature name.", - "docstring": "Array mapping from feature integer indices to feature name.\n\nReturns\n-------\nfeature_names : list\n A list of feature names.", + "docstring": "Array mapping from feature integer indices to feature name.\n\n Returns\n -------\n feature_names : list\n A list of feature names.\n ", "source_code": "\n@deprecated('get_feature_names is deprecated in 1.0 and will be removed in 1.2. Please use get_feature_names_out instead.')\ndef get_feature_names(self):\n \"\"\"Array mapping from feature integer indices to feature name.\n\n Returns\n -------\n feature_names : list\n A list of feature names.\n \"\"\"\n self._check_vocabulary()\n return [t for (t, i) in sorted(self.vocabulary_.items(), key=itemgetter(1))]" }, { @@ -80116,7 +85038,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "input_features", @@ -80126,13 +85049,14 @@ "docstring": { "type": "array-like of str or None, default=None", "description": "Not used, present here for API consistency by convention." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Get output feature names for transformation.", - "docstring": "Get output feature names for transformation.\n\nParameters\n----------\ninput_features : array-like of str or None, default=None\n Not used, present here for API consistency by convention.\n\nReturns\n-------\nfeature_names_out : ndarray of str objects\n Transformed feature names.", + "docstring": "Get output feature names for transformation.\n\n Parameters\n ----------\n input_features : array-like of str or None, default=None\n Not used, present here for API consistency by convention.\n\n Returns\n -------\n feature_names_out : ndarray of str objects\n Transformed feature names.\n ", "source_code": "\ndef get_feature_names_out(self, input_features=None):\n \"\"\"Get output feature names for transformation.\n\n Parameters\n ----------\n input_features : array-like of str or None, default=None\n Not used, present here for API consistency by convention.\n\n Returns\n -------\n feature_names_out : ndarray of str objects\n Transformed feature names.\n \"\"\"\n self._check_vocabulary()\n return np.asarray([t for (t, i) in sorted(self.vocabulary_.items(), key=itemgetter(1))], dtype=object)" }, { @@ -80150,7 +85074,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -80160,13 +85085,17 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "Document-term matrix." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } } ], "results": [], "is_public": true, "description": "Return terms per document with nonzero entries in X.", - "docstring": "Return terms per document with nonzero entries in X.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n Document-term matrix.\n\nReturns\n-------\nX_inv : list of arrays of shape (n_samples,)\n List of arrays of terms.", + "docstring": "Return terms per document with nonzero entries in X.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Document-term matrix.\n\n Returns\n -------\n X_inv : list of arrays of shape (n_samples,)\n List of arrays of terms.\n ", "source_code": "\ndef inverse_transform(self, X):\n \"\"\"Return terms per document with nonzero entries in X.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Document-term matrix.\n\n Returns\n -------\n X_inv : list of arrays of shape (n_samples,)\n List of arrays of terms.\n \"\"\"\n self._check_vocabulary()\n X = check_array(X, accept_sparse='csr')\n n_samples = X.shape[0]\n terms = np.array(list(self.vocabulary_.keys()))\n indices = np.array(list(self.vocabulary_.values()))\n inverse_vocabulary = terms[np.argsort(indices)]\n if sp.issparse(X):\n return [inverse_vocabulary[X[i, :].nonzero()[1]].ravel() for i in range(n_samples)]\n else:\n return [inverse_vocabulary[np.flatnonzero(X[i, :])].ravel() for i in range(n_samples)]" }, { @@ -80184,7 +85113,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "raw_documents", @@ -80194,13 +85124,14 @@ "docstring": { "type": "iterable", "description": "An iterable which generates either str, unicode or file objects." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Transform documents to document-term matrix.\n\nExtract token counts out of raw text documents using the vocabulary fitted with fit or the one provided to the constructor.", - "docstring": "Transform documents to document-term matrix.\n\nExtract token counts out of raw text documents using the vocabulary\nfitted with fit or the one provided to the constructor.\n\nParameters\n----------\nraw_documents : iterable\n An iterable which generates either str, unicode or file objects.\n\nReturns\n-------\nX : sparse matrix of shape (n_samples, n_features)\n Document-term matrix.", + "description": "Transform documents to document-term matrix.\n\nExtract token counts out of raw text documents using the vocabulary\nfitted with fit or the one provided to the constructor.", + "docstring": "Transform documents to document-term matrix.\n\n Extract token counts out of raw text documents using the vocabulary\n fitted with fit or the one provided to the constructor.\n\n Parameters\n ----------\n raw_documents : iterable\n An iterable which generates either str, unicode or file objects.\n\n Returns\n -------\n X : sparse matrix of shape (n_samples, n_features)\n Document-term matrix.\n ", "source_code": "\ndef transform(self, raw_documents):\n \"\"\"Transform documents to document-term matrix.\n\n Extract token counts out of raw text documents using the vocabulary\n fitted with fit or the one provided to the constructor.\n\n Parameters\n ----------\n raw_documents : iterable\n An iterable which generates either str, unicode or file objects.\n\n Returns\n -------\n X : sparse matrix of shape (n_samples, n_features)\n Document-term matrix.\n \"\"\"\n if isinstance(raw_documents, str):\n raise ValueError('Iterable over raw text documents expected, string object received.')\n self._check_vocabulary()\n (_, X) = self._count_vocab(raw_documents, fixed_vocab=True)\n if self.binary:\n X.data.fill(1)\n return X" }, { @@ -80218,7 +85149,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "input", @@ -80228,6 +85160,10 @@ "docstring": { "type": "{'filename', 'file', 'content'}, default='content'", "description": "- If `'filename'`, the sequence passed as an argument to fit is\n expected to be a list of filenames that need reading to fetch\n the raw content to analyze.\n\n- If `'file'`, the sequence items must have a 'read' method (file-like\n object) that is called to fetch the bytes in memory.\n\n- If `'content'`, the input is expected to be a sequence of items that\n can be of type string or byte." + }, + "refined_type": { + "kind": "EnumType", + "values": ["filename", "content", "file"] } }, { @@ -80238,7 +85174,8 @@ "docstring": { "type": "str, default='utf-8'", "description": "If bytes or files are given to analyze, this encoding is used to\ndecode." - } + }, + "refined_type": {} }, { "name": "decode_error", @@ -80248,6 +85185,10 @@ "docstring": { "type": "{'strict', 'ignore', 'replace'}, default='strict'", "description": "Instruction on what to do if a byte sequence is given to analyze that\ncontains characters not of the given `encoding`. By default, it is\n'strict', meaning that a UnicodeDecodeError will be raised. Other\nvalues are 'ignore' and 'replace'." 
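A minimal usage sketch tying together the `CountVectorizer` methods documented above (the two toy documents are made up); the `min_df` variant shows the document-frequency pruning performed by `_limit_features`.

from sklearn.feature_extraction.text import CountVectorizer

docs = ["the cat sat on the mat", "the dog sat"]

vectorizer = CountVectorizer()
X = vectorizer.fit_transform(docs)           # sparse document-term matrix
print(vectorizer.get_feature_names_out())    # ['cat' 'dog' 'mat' 'on' 'sat' 'the']
print(X.toarray())                           # [[1 0 1 1 1 2]
                                             #  [0 1 0 0 1 1]]
print(vectorizer.inverse_transform(X[1:]))   # terms with nonzero counts in the second document

# min_df=2 keeps only terms that occur in at least two documents.
pruned = CountVectorizer(min_df=2).fit(docs)
print(pruned.get_feature_names_out())        # ['sat' 'the']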
+ }, + "refined_type": { + "kind": "EnumType", + "values": ["strict", "replace", "ignore"] } }, { @@ -80258,6 +85199,10 @@ "docstring": { "type": "{'ascii', 'unicode'}, default=None", "description": "Remove accents and perform other character normalization\nduring the preprocessing step.\n'ascii' is a fast method that only works on characters that have\na direct ASCII mapping.\n'unicode' is a slightly slower method that works on any characters.\nNone (default) does nothing.\n\nBoth 'ascii' and 'unicode' use NFKD normalization from\n:func:`unicodedata.normalize`." + }, + "refined_type": { + "kind": "EnumType", + "values": ["ascii", "unicode"] } }, { @@ -80268,7 +85213,8 @@ "docstring": { "type": "bool, default=True", "description": "Convert all characters to lowercase before tokenizing." - } + }, + "refined_type": {} }, { "name": "preprocessor", @@ -80277,8 +85223,9 @@ "assigned_by": "NAME_ONLY", "docstring": { "type": "callable, default=None", - "description": "Override the preprocessing (string transformation) stage while\npreserving the tokenizing and n-grams generation steps.\nOnly applies if ``analyzer is not callable``." - } + "description": "Override the preprocessing (string transformation) stage while\npreserving the tokenizing and n-grams generation steps.\nOnly applies if ``analyzer`` is not callable." + }, + "refined_type": {} }, { "name": "tokenizer", @@ -80288,7 +85235,8 @@ "docstring": { "type": "callable, default=None", "description": "Override the string tokenization step while preserving the\npreprocessing and n-grams generation steps.\nOnly applies if ``analyzer == 'word'``." - } + }, + "refined_type": {} }, { "name": "stop_words", @@ -80298,6 +85246,10 @@ "docstring": { "type": "{'english'}, list, default=None", "description": "If 'english', a built-in stop word list for English is used.\nThere are several known issues with 'english' and you should\nconsider an alternative (see :ref:`stop_words`).\n\nIf a list, that list is assumed to contain stop words, all of which\nwill be removed from the resulting tokens.\nOnly applies if ``analyzer == 'word'``." + }, + "refined_type": { + "kind": "EnumType", + "values": ["english"] } }, { @@ -80308,7 +85260,8 @@ "docstring": { "type": "str, default=r\"(?u)\\\\b\\\\w\\\\w+\\\\b\"", "description": "Regular expression denoting what constitutes a \"token\", only used\nif ``analyzer == 'word'``. The default regexp selects tokens of 2\nor more alphanumeric characters (punctuation is completely ignored\nand always treated as a token separator).\n\nIf there is a capturing group in token_pattern then the\ncaptured group content, not the entire match, becomes the token.\nAt most one capturing group is permitted." - } + }, + "refined_type": {} }, { "name": "ngram_range", @@ -80317,8 +85270,9 @@ "assigned_by": "NAME_ONLY", "docstring": { "type": "tuple (min_n, max_n), default=(1, 1)", - "description": "The lower and upper boundary of the range of n-values for different\nn-grams to be extracted. All values of n such that min_n <= n <= max_n\nwill be used. For example an ``ngram_range`` of ``(1, 1)`` means only\nunigrams, ``(1, 2)`` means unigrams and bigrams, and ``(2, 2)`` means\nonly bigrams.\nOnly applies if ``analyzer is not callable``." - } + "description": "The lower and upper boundary of the range of n-values for different\nn-grams to be extracted. All values of n such that min_n <= n <= max_n\nwill be used. 
For example an ``ngram_range`` of ``(1, 1)`` means only\nunigrams, ``(1, 2)`` means unigrams and bigrams, and ``(2, 2)`` means\nonly bigrams.\nOnly applies if ``analyzer`` is not callable." + }, + "refined_type": {} }, { "name": "analyzer", @@ -80328,6 +85282,10 @@ "docstring": { "type": "{'word', 'char', 'char_wb'} or callable, default='word'", "description": "Whether the feature should be made of word or character n-grams.\nOption 'char_wb' creates character n-grams only from text inside\nword boundaries; n-grams at the edges of words are padded with space.\n\nIf a callable is passed it is used to extract the sequence of features\nout of the raw, unprocessed input.\n\n.. versionchanged:: 0.21\n Since v0.21, if ``input`` is ``'filename'`` or ``'file'``, the data\n is first read from the file and then passed to the given callable\n analyzer." + }, + "refined_type": { + "kind": "EnumType", + "values": ["word", "char_wb", "char"] } }, { @@ -80338,7 +85296,8 @@ "docstring": { "type": "int, default=(2 ** 20)", "description": "The number of features (columns) in the output matrices. Small numbers\nof features are likely to cause hash collisions, but large numbers\nwill cause larger coefficient dimensions in linear learners." - } + }, + "refined_type": {} }, { "name": "binary", @@ -80348,7 +85307,8 @@ "docstring": { "type": "bool, default=False", "description": "If True, all non zero counts are set to 1. This is useful for discrete\nprobabilistic models that model binary events rather than integer\ncounts." - } + }, + "refined_type": {} }, { "name": "norm", @@ -80358,6 +85318,10 @@ "docstring": { "type": "{'l1', 'l2'}, default='l2'", "description": "Norm used to normalize term vectors. None for no normalization." + }, + "refined_type": { + "kind": "EnumType", + "values": ["l2", "l1"] } }, { @@ -80368,7 +85332,8 @@ "docstring": { "type": "bool, default=True", "description": "When True, an alternating sign is added to the features as to\napproximately conserve the inner product in the hashed space even for\nsmall n_features. This approach is similar to sparse random projection.\n\n.. versionadded:: 0.19" - } + }, + "refined_type": {} }, { "name": "dtype", @@ -80378,13 +85343,14 @@ "docstring": { "type": "type, default=np.float64", "description": "Type of the matrix returned by fit_transform() or transform()." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, *, input='content', encoding='utf-8', decode_error='strict', strip_accents=None, lowercase=True, preprocessor=None, tokenizer=None, stop_words=None, token_pattern='(?u)\\\\b\\\\w\\\\w+\\\\b', ngram_range=(1, 1), analyzer='word', n_features=2**20, binary=False, norm='l2', alternate_sign=True, dtype=np.float64):\n self.input = input\n self.encoding = encoding\n self.decode_error = decode_error\n self.strip_accents = strip_accents\n self.preprocessor = preprocessor\n self.tokenizer = tokenizer\n self.analyzer = analyzer\n self.lowercase = lowercase\n self.token_pattern = token_pattern\n self.stop_words = stop_words\n self.n_features = n_features\n self.ngram_range = ngram_range\n self.binary = binary\n self.norm = norm\n self.alternate_sign = alternate_sign\n self.dtype = dtype" }, { @@ -80402,13 +85368,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _get_hasher(self):\n return FeatureHasher(n_features=self.n_features, input_type='string', dtype=self.dtype, alternate_sign=self.alternate_sign)" }, { @@ -80426,13 +85393,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _more_tags(self):\n return {'X_types': ['string']}" }, { @@ -80450,7 +85418,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -80460,7 +85429,8 @@ "docstring": { "type": "ndarray of shape [n_samples, n_features]", "description": "Training data." - } + }, + "refined_type": {} }, { "name": "y", @@ -80470,13 +85440,14 @@ "docstring": { "type": "Ignored", "description": "Not used, present for API consistency by convention." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "No-op: this transformer is stateless.", - "docstring": "No-op: this transformer is stateless.\n\nParameters\n----------\nX : ndarray of shape [n_samples, n_features]\n Training data.\n\ny : Ignored\n Not used, present for API consistency by convention.\n\nReturns\n-------\nself : object\n HashingVectorizer instance.", + "docstring": "No-op: this transformer is stateless.\n\n Parameters\n ----------\n X : ndarray of shape [n_samples, n_features]\n Training data.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n Returns\n -------\n self : object\n HashingVectorizer instance.\n ", "source_code": "\ndef fit(self, X, y=None):\n \"\"\"No-op: this transformer is stateless.\n\n Parameters\n ----------\n X : ndarray of shape [n_samples, n_features]\n Training data.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n Returns\n -------\n self : object\n HashingVectorizer instance.\n \"\"\"\n if isinstance(X, str):\n raise ValueError('Iterable over raw text documents expected, string object received.')\n self._warn_for_unused_params()\n self._validate_params()\n self._get_hasher().fit(X, y=y)\n return self" }, { @@ -80494,7 +85465,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -80504,7 +85476,8 @@ "docstring": { "type": "iterable over raw text documents, length = n_samples", "description": "Samples. 
Each sample must be a text document (either bytes or\nunicode strings, file name or file object depending on the\nconstructor argument) which will be tokenized and hashed." - } + }, + "refined_type": {} }, { "name": "y", @@ -80514,13 +85487,14 @@ "docstring": { "type": "any", "description": "Ignored. This parameter exists only for compatibility with\nsklearn.pipeline.Pipeline." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Transform a sequence of documents to a document-term matrix.", - "docstring": "Transform a sequence of documents to a document-term matrix.\n\nParameters\n----------\nX : iterable over raw text documents, length = n_samples\n Samples. Each sample must be a text document (either bytes or\n unicode strings, file name or file object depending on the\n constructor argument) which will be tokenized and hashed.\ny : any\n Ignored. This parameter exists only for compatibility with\n sklearn.pipeline.Pipeline.\n\nReturns\n-------\nX : sparse matrix of shape (n_samples, n_features)\n Document-term matrix.", + "docstring": "Transform a sequence of documents to a document-term matrix.\n\n Parameters\n ----------\n X : iterable over raw text documents, length = n_samples\n Samples. Each sample must be a text document (either bytes or\n unicode strings, file name or file object depending on the\n constructor argument) which will be tokenized and hashed.\n y : any\n Ignored. This parameter exists only for compatibility with\n sklearn.pipeline.Pipeline.\n\n Returns\n -------\n X : sparse matrix of shape (n_samples, n_features)\n Document-term matrix.\n ", "source_code": "\ndef fit_transform(self, X, y=None):\n \"\"\"Transform a sequence of documents to a document-term matrix.\n\n Parameters\n ----------\n X : iterable over raw text documents, length = n_samples\n Samples. Each sample must be a text document (either bytes or\n unicode strings, file name or file object depending on the\n constructor argument) which will be tokenized and hashed.\n y : any\n Ignored. This parameter exists only for compatibility with\n sklearn.pipeline.Pipeline.\n\n Returns\n -------\n X : sparse matrix of shape (n_samples, n_features)\n Document-term matrix.\n \"\"\"\n return self.fit(X, y).transform(X)" }, { @@ -80538,7 +85512,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -80548,7 +85523,8 @@ "docstring": { "type": "ndarray of shape [n_samples, n_features]", "description": "Training data." - } + }, + "refined_type": {} }, { "name": "y", @@ -80558,13 +85534,14 @@ "docstring": { "type": "Ignored", "description": "Not used, present for API consistency by convention." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "No-op: this transformer is stateless.\n\nThis method is just there to mark the fact that this transformer can work in a streaming setup.", - "docstring": "No-op: this transformer is stateless.\n\nThis method is just there to mark the fact that this transformer\ncan work in a streaming setup.\n\nParameters\n----------\nX : ndarray of shape [n_samples, n_features]\n Training data.\n\ny : Ignored\n Not used, present for API consistency by convention.\n\nReturns\n-------\nself : object\n HashingVectorizer instance.", + "description": "No-op: this transformer is stateless.\n\nThis method is just there to mark the fact that this transformer\ncan work in a streaming setup.", + "docstring": "No-op: this transformer is stateless.\n\n This method is just there to mark the fact that this transformer\n can work in a streaming setup.\n\n Parameters\n ----------\n X : ndarray of shape [n_samples, n_features]\n Training data.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n Returns\n -------\n self : object\n HashingVectorizer instance.\n ", "source_code": "\ndef partial_fit(self, X, y=None):\n \"\"\"No-op: this transformer is stateless.\n\n This method is just there to mark the fact that this transformer\n can work in a streaming setup.\n\n Parameters\n ----------\n X : ndarray of shape [n_samples, n_features]\n Training data.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n Returns\n -------\n self : object\n HashingVectorizer instance.\n \"\"\"\n return self" }, { @@ -80582,7 +85559,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -80592,13 +85570,14 @@ "docstring": { "type": "iterable over raw text documents, length = n_samples", "description": "Samples. Each sample must be a text document (either bytes or\nunicode strings, file name or file object depending on the\nconstructor argument) which will be tokenized and hashed." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Transform a sequence of documents to a document-term matrix.", - "docstring": "Transform a sequence of documents to a document-term matrix.\n\nParameters\n----------\nX : iterable over raw text documents, length = n_samples\n Samples. Each sample must be a text document (either bytes or\n unicode strings, file name or file object depending on the\n constructor argument) which will be tokenized and hashed.\n\nReturns\n-------\nX : sparse matrix of shape (n_samples, n_features)\n Document-term matrix.", + "docstring": "Transform a sequence of documents to a document-term matrix.\n\n Parameters\n ----------\n X : iterable over raw text documents, length = n_samples\n Samples. Each sample must be a text document (either bytes or\n unicode strings, file name or file object depending on the\n constructor argument) which will be tokenized and hashed.\n\n Returns\n -------\n X : sparse matrix of shape (n_samples, n_features)\n Document-term matrix.\n ", "source_code": "\ndef transform(self, X):\n \"\"\"Transform a sequence of documents to a document-term matrix.\n\n Parameters\n ----------\n X : iterable over raw text documents, length = n_samples\n Samples. 
Each sample must be a text document (either bytes or\n unicode strings, file name or file object depending on the\n constructor argument) which will be tokenized and hashed.\n\n Returns\n -------\n X : sparse matrix of shape (n_samples, n_features)\n Document-term matrix.\n \"\"\"\n if isinstance(X, str):\n raise ValueError('Iterable over raw text documents expected, string object received.')\n self._validate_params()\n analyzer = self.build_analyzer()\n X = self._get_hasher().transform((analyzer(doc) for doc in X))\n if self.binary:\n X.data.fill(1)\n if self.norm is not None:\n X = normalize(X, norm=self.norm, copy=False)\n return X" }, { @@ -80616,7 +85595,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "norm", @@ -80626,6 +85606,10 @@ "docstring": { "type": "{'l1', 'l2'}, default='l2'", "description": "Each output row will have unit norm, either:\n\n- 'l2': Sum of squares of vector elements is 1. The cosine\n similarity between two vectors is their dot product when l2 norm has\n been applied.\n- 'l1': Sum of absolute values of vector elements is 1.\n See :func:`preprocessing.normalize`." + }, + "refined_type": { + "kind": "EnumType", + "values": ["l2", "l1"] } }, { @@ -80636,7 +85620,8 @@ "docstring": { "type": "bool, default=True", "description": "Enable inverse-document-frequency reweighting. If False, idf(t) = 1." - } + }, + "refined_type": {} }, { "name": "smooth_idf", @@ -80646,7 +85631,8 @@ "docstring": { "type": "bool, default=True", "description": "Smooth idf weights by adding one to document frequencies, as if an\nextra document was seen containing every term in the collection\nexactly once. Prevents zero divisions." - } + }, + "refined_type": {} }, { "name": "sublinear_tf", @@ -80656,13 +85642,14 @@ "docstring": { "type": "bool, default=False", "description": "Apply sublinear tf scaling, i.e. replace tf with 1 + log(tf)." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, *, norm='l2', use_idf=True, smooth_idf=True, sublinear_tf=False):\n self.norm = norm\n self.use_idf = use_idf\n self.smooth_idf = smooth_idf\n self.sublinear_tf = sublinear_tf" }, { @@ -80680,13 +85667,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _more_tags(self):\n return {'X_types': ['2darray', 'sparse']}" }, { @@ -80704,7 +85692,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -80714,7 +85703,8 @@ "docstring": { "type": "sparse matrix of shape n_samples, n_features)", "description": "A matrix of term/token counts." - } + }, + "refined_type": {} }, { "name": "y", @@ -80724,13 +85714,14 @@ "docstring": { "type": "None", "description": "This parameter is not needed to compute tf-idf." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Learn the idf vector (global term weights).", - "docstring": "Learn the idf vector (global term weights).\n\nParameters\n----------\nX : sparse matrix of shape n_samples, n_features)\n A matrix of term/token counts.\n\ny : None\n This parameter is not needed to compute tf-idf.\n\nReturns\n-------\nself : object\n Fitted transformer.", + "docstring": "Learn the idf vector (global term weights).\n\n Parameters\n ----------\n X : sparse matrix of shape n_samples, n_features)\n A matrix of term/token counts.\n\n y : None\n This parameter is not needed to compute tf-idf.\n\n Returns\n -------\n self : object\n Fitted transformer.\n ", "source_code": "\ndef fit(self, X, y=None):\n \"\"\"Learn the idf vector (global term weights).\n\n Parameters\n ----------\n X : sparse matrix of shape n_samples, n_features)\n A matrix of term/token counts.\n\n y : None\n This parameter is not needed to compute tf-idf.\n\n Returns\n -------\n self : object\n Fitted transformer.\n \"\"\"\n X = self._validate_data(X, accept_sparse=('csr', 'csc'), accept_large_sparse=not _IS_32BIT)\n if not sp.issparse(X):\n X = sp.csr_matrix(X)\n dtype = X.dtype if X.dtype in FLOAT_DTYPES else np.float64\n if self.use_idf:\n (n_samples, n_features) = X.shape\n df = _document_frequency(X)\n df = df.astype(dtype, **_astype_copy_false(df))\n df += int(self.smooth_idf)\n n_samples += int(self.smooth_idf)\n idf = np.log(n_samples / df) + 1\n self._idf_diag = sp.diags(idf, offsets=0, shape=(n_features, n_features), format='csr', dtype=dtype)\n return self" }, { @@ -80748,13 +85739,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Inverse document frequency vector, only defined if `use_idf=True`.", - "docstring": "Inverse document frequency vector, only defined if `use_idf=True`.\n\nReturns\n-------\nndarray of shape (n_features,)", + "docstring": "Inverse document frequency vector, only defined if `use_idf=True`.\n\n Returns\n -------\n ndarray of shape (n_features,)\n ", "source_code": "\n@property\ndef idf_(self):\n \"\"\"Inverse document frequency vector, only defined if `use_idf=True`.\n\n Returns\n -------\n ndarray of shape (n_features,)\n \"\"\"\n return np.ravel(self._idf_diag.sum(axis=0))" }, { @@ -80772,7 +85764,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "value", @@ -80782,13 +85775,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@idf_.setter\ndef idf_(self, value):\n value = np.asarray(value, dtype=np.float64)\n n_features = value.shape[0]\n self._idf_diag = sp.spdiags(value, diags=0, m=n_features, n=n_features, format='csr')" }, { @@ -80806,7 +85800,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -80816,7 +85811,8 @@ "docstring": { "type": "sparse matrix of (n_samples, n_features)", "description": "A matrix of term/token counts." - } + }, + "refined_type": {} }, { "name": "copy", @@ -80826,13 +85822,14 @@ "docstring": { "type": "bool, default=True", "description": "Whether to copy X and operate on the copy or perform in-place\noperations." 
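[Illustrative aside] The fit source recorded just above computes the idf vector as np.log(n_samples / df) + 1, after incrementing both the document count and the document frequencies by one when smooth_idf=True, i.e. idf(t) = ln((1 + n) / (1 + df(t))) + 1. A small numeric check of that formula; the toy count matrix is invented for the example:

    import numpy as np
    from sklearn.feature_extraction.text import TfidfTransformer

    # 3 documents x 2 terms; each term occurs in exactly 2 of the 3 documents
    counts = np.array([[3, 0],
                       [2, 1],
                       [0, 1]])
    tfidf = TfidfTransformer(use_idf=True, smooth_idf=True).fit(counts)

    n, df = 3, np.array([2, 2])
    manual_idf = np.log((1 + n) / (1 + df)) + 1   # the smoothed formula from the source above
    print(np.allclose(tfidf.idf_, manual_idf))    # True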
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Transform a count matrix to a tf or tf-idf representation.", - "docstring": "Transform a count matrix to a tf or tf-idf representation.\n\nParameters\n----------\nX : sparse matrix of (n_samples, n_features)\n A matrix of term/token counts.\n\ncopy : bool, default=True\n Whether to copy X and operate on the copy or perform in-place\n operations.\n\nReturns\n-------\nvectors : sparse matrix of shape (n_samples, n_features)\n Tf-idf-weighted document-term matrix.", + "docstring": "Transform a count matrix to a tf or tf-idf representation.\n\n Parameters\n ----------\n X : sparse matrix of (n_samples, n_features)\n A matrix of term/token counts.\n\n copy : bool, default=True\n Whether to copy X and operate on the copy or perform in-place\n operations.\n\n Returns\n -------\n vectors : sparse matrix of shape (n_samples, n_features)\n Tf-idf-weighted document-term matrix.\n ", "source_code": "\ndef transform(self, X, copy=True):\n \"\"\"Transform a count matrix to a tf or tf-idf representation.\n\n Parameters\n ----------\n X : sparse matrix of (n_samples, n_features)\n A matrix of term/token counts.\n\n copy : bool, default=True\n Whether to copy X and operate on the copy or perform in-place\n operations.\n\n Returns\n -------\n vectors : sparse matrix of shape (n_samples, n_features)\n Tf-idf-weighted document-term matrix.\n \"\"\"\n X = self._validate_data(X, accept_sparse='csr', dtype=FLOAT_DTYPES, copy=copy, reset=False)\n if not sp.issparse(X):\n X = sp.csr_matrix(X, dtype=np.float64)\n if self.sublinear_tf:\n np.log(X.data, X.data)\n X.data += 1\n if self.use_idf:\n check_is_fitted(self, attributes=['idf_'], msg='idf vector is not fitted')\n X = X * self._idf_diag\n if self.norm:\n X = normalize(X, norm=self.norm, copy=False)\n return X" }, { @@ -80850,7 +85847,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "input", @@ -80860,6 +85858,10 @@ "docstring": { "type": "{'filename', 'file', 'content'}, default='content'", "description": "- If `'filename'`, the sequence passed as an argument to fit is\n expected to be a list of filenames that need reading to fetch\n the raw content to analyze.\n\n- If `'file'`, the sequence items must have a 'read' method (file-like\n object) that is called to fetch the bytes in memory.\n\n- If `'content'`, the input is expected to be a sequence of items that\n can be of type string or byte." + }, + "refined_type": { + "kind": "EnumType", + "values": ["filename", "content", "file"] } }, { @@ -80870,7 +85872,8 @@ "docstring": { "type": "str, default='utf-8'", "description": "If bytes or files are given to analyze, this encoding is used to\ndecode." - } + }, + "refined_type": {} }, { "name": "decode_error", @@ -80880,6 +85883,10 @@ "docstring": { "type": "{'strict', 'ignore', 'replace'}, default='strict'", "description": "Instruction on what to do if a byte sequence is given to analyze that\ncontains characters not of the given `encoding`. By default, it is\n'strict', meaning that a UnicodeDecodeError will be raised. Other\nvalues are 'ignore' and 'replace'." 
+ }, + "refined_type": { + "kind": "EnumType", + "values": ["strict", "replace", "ignore"] } }, { @@ -80890,6 +85897,10 @@ "docstring": { "type": "{'ascii', 'unicode'}, default=None", "description": "Remove accents and perform other character normalization\nduring the preprocessing step.\n'ascii' is a fast method that only works on characters that have\nan direct ASCII mapping.\n'unicode' is a slightly slower method that works on any characters.\nNone (default) does nothing.\n\nBoth 'ascii' and 'unicode' use NFKD normalization from\n:func:`unicodedata.normalize`." + }, + "refined_type": { + "kind": "EnumType", + "values": ["ascii", "unicode"] } }, { @@ -80900,7 +85911,8 @@ "docstring": { "type": "bool, default=True", "description": "Convert all characters to lowercase before tokenizing." - } + }, + "refined_type": {} }, { "name": "preprocessor", @@ -80909,8 +85921,9 @@ "assigned_by": "NAME_ONLY", "docstring": { "type": "callable, default=None", - "description": "Override the preprocessing (string transformation) stage while\npreserving the tokenizing and n-grams generation steps.\nOnly applies if ``analyzer is not callable``." - } + "description": "Override the preprocessing (string transformation) stage while\npreserving the tokenizing and n-grams generation steps.\nOnly applies if ``analyzer`` is not callable." + }, + "refined_type": {} }, { "name": "tokenizer", @@ -80920,7 +85933,8 @@ "docstring": { "type": "callable, default=None", "description": "Override the string tokenization step while preserving the\npreprocessing and n-grams generation steps.\nOnly applies if ``analyzer == 'word'``." - } + }, + "refined_type": {} }, { "name": "analyzer", @@ -80930,6 +85944,10 @@ "docstring": { "type": "{'word', 'char', 'char_wb'} or callable, default='word'", "description": "Whether the feature should be made of word or character n-grams.\nOption 'char_wb' creates character n-grams only from text inside\nword boundaries; n-grams at the edges of words are padded with space.\n\nIf a callable is passed it is used to extract the sequence of features\nout of the raw, unprocessed input.\n\n.. versionchanged:: 0.21\n Since v0.21, if ``input`` is ``'filename'`` or ``'file'``, the data\n is first read from the file and then passed to the given callable\n analyzer." + }, + "refined_type": { + "kind": "EnumType", + "values": ["word", "char_wb", "char"] } }, { @@ -80940,6 +85958,10 @@ "docstring": { "type": "{'english'}, list, default=None", "description": "If a string, it is passed to _check_stop_list and the appropriate stop\nlist is returned. 'english' is currently the only supported string\nvalue.\nThere are several known issues with 'english' and you should\nconsider an alternative (see :ref:`stop_words`).\n\nIf a list, that list is assumed to contain stop words, all of which\nwill be removed from the resulting tokens.\nOnly applies if ``analyzer == 'word'``.\n\nIf None, no stop words will be used. max_df can be set to a value\nin the range [0.7, 1.0) to automatically detect and filter stop\nwords based on intra corpus document frequency of terms." + }, + "refined_type": { + "kind": "EnumType", + "values": ["english"] } }, { @@ -80950,7 +85972,8 @@ "docstring": { "type": "str, default=r\"(?u)\\\\b\\\\w\\\\w+\\\\b\"", "description": "Regular expression denoting what constitutes a \"token\", only used\nif ``analyzer == 'word'``. 
The default regexp selects tokens of 2\nor more alphanumeric characters (punctuation is completely ignored\nand always treated as a token separator).\n\nIf there is a capturing group in token_pattern then the\ncaptured group content, not the entire match, becomes the token.\nAt most one capturing group is permitted." - } + }, + "refined_type": {} }, { "name": "ngram_range", @@ -80959,8 +85982,9 @@ "assigned_by": "NAME_ONLY", "docstring": { "type": "tuple (min_n, max_n), default=(1, 1)", - "description": "The lower and upper boundary of the range of n-values for different\nn-grams to be extracted. All values of n such that min_n <= n <= max_n\nwill be used. For example an ``ngram_range`` of ``(1, 1)`` means only\nunigrams, ``(1, 2)`` means unigrams and bigrams, and ``(2, 2)`` means\nonly bigrams.\nOnly applies if ``analyzer is not callable``." - } + "description": "The lower and upper boundary of the range of n-values for different\nn-grams to be extracted. All values of n such that min_n <= n <= max_n\nwill be used. For example an ``ngram_range`` of ``(1, 1)`` means only\nunigrams, ``(1, 2)`` means unigrams and bigrams, and ``(2, 2)`` means\nonly bigrams.\nOnly applies if ``analyzer`` is not callable." + }, + "refined_type": {} }, { "name": "max_df", @@ -80970,6 +85994,14 @@ "docstring": { "type": "float or int, default=1.0", "description": "When building the vocabulary ignore terms that have a document\nfrequency strictly higher than the given threshold (corpus-specific\nstop words).\nIf float in range [0.0, 1.0], the parameter represents a proportion of\ndocuments, integer absolute counts.\nThis parameter is ignored if vocabulary is not None." + }, + "refined_type": { + "kind": "BoundaryType", + "base_type": "float", + "min": 0.0, + "max": 1.0, + "min_inclusive": true, + "max_inclusive": true } }, { @@ -80980,6 +86012,14 @@ "docstring": { "type": "float or int, default=1", "description": "When building the vocabulary ignore terms that have a document\nfrequency strictly lower than the given threshold. This value is also\ncalled cut-off in the literature.\nIf float in range of [0.0, 1.0], the parameter represents a proportion\nof documents, integer absolute counts.\nThis parameter is ignored if vocabulary is not None." + }, + "refined_type": { + "kind": "BoundaryType", + "base_type": "float", + "min": 0.0, + "max": 1.0, + "min_inclusive": true, + "max_inclusive": true } }, { @@ -80990,7 +86030,8 @@ "docstring": { "type": "int, default=None", "description": "If not None, build a vocabulary that only consider the top\nmax_features ordered by term frequency across the corpus.\n\nThis parameter is ignored if vocabulary is not None." - } + }, + "refined_type": {} }, { "name": "vocabulary", @@ -81000,7 +86041,8 @@ "docstring": { "type": "Mapping or iterable, default=None", "description": "Either a Mapping (e.g., a dict) where keys are terms and values are\nindices in the feature matrix, or an iterable over terms. If not\ngiven, a vocabulary is determined from the input documents." - } + }, + "refined_type": {} }, { "name": "binary", @@ -81010,7 +86052,8 @@ "docstring": { "type": "bool, default=False", "description": "If True, all non-zero term counts are set to 1. This does not mean\noutputs will have only 0/1 values, only that the tf term in tf-idf\nis binary. (Set idf and normalization to False to get 0/1 outputs)." 
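[Illustrative aside] The max_df and min_df parameters described above prune the learned vocabulary by document frequency: terms above the max_df threshold are dropped as corpus-specific stop words, and terms below min_df are dropped as too rare. A hedged sketch with an invented four-document corpus:

    from sklearn.feature_extraction.text import TfidfVectorizer

    corpus = ["apple banana", "apple cherry", "apple banana cherry", "apple durian"]
    # 'apple' appears in every document, 'durian' in only one
    vec = TfidfVectorizer(max_df=0.9, min_df=2).fit(corpus)
    print(sorted(vec.vocabulary_))   # ['banana', 'cherry']: 'apple' is too frequent, 'durian' too rare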
- } + }, + "refined_type": {} }, { "name": "dtype", @@ -81020,7 +86063,8 @@ "docstring": { "type": "dtype, default=float64", "description": "Type of the matrix returned by fit_transform() or transform()." - } + }, + "refined_type": {} }, { "name": "norm", @@ -81030,6 +86074,10 @@ "docstring": { "type": "{'l1', 'l2'}, default='l2'", "description": "Each output row will have unit norm, either:\n\n- 'l2': Sum of squares of vector elements is 1. The cosine\n similarity between two vectors is their dot product when l2 norm has\n been applied.\n- 'l1': Sum of absolute values of vector elements is 1.\n See :func:`preprocessing.normalize`." + }, + "refined_type": { + "kind": "EnumType", + "values": ["l2", "l1"] } }, { @@ -81040,7 +86088,8 @@ "docstring": { "type": "bool, default=True", "description": "Enable inverse-document-frequency reweighting. If False, idf(t) = 1." - } + }, + "refined_type": {} }, { "name": "smooth_idf", @@ -81050,7 +86099,8 @@ "docstring": { "type": "bool, default=True", "description": "Smooth idf weights by adding one to document frequencies, as if an\nextra document was seen containing every term in the collection\nexactly once. Prevents zero divisions." - } + }, + "refined_type": {} }, { "name": "sublinear_tf", @@ -81060,13 +86110,14 @@ "docstring": { "type": "bool, default=False", "description": "Apply sublinear tf scaling, i.e. replace tf with 1 + log(tf)." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, *, input='content', encoding='utf-8', decode_error='strict', strip_accents=None, lowercase=True, preprocessor=None, tokenizer=None, analyzer='word', stop_words=None, token_pattern='(?u)\\\\b\\\\w\\\\w+\\\\b', ngram_range=(1, 1), max_df=1.0, min_df=1, max_features=None, vocabulary=None, binary=False, dtype=np.float64, norm='l2', use_idf=True, smooth_idf=True, sublinear_tf=False):\n super().__init__(input=input, encoding=encoding, decode_error=decode_error, strip_accents=strip_accents, lowercase=lowercase, preprocessor=preprocessor, tokenizer=tokenizer, analyzer=analyzer, stop_words=stop_words, token_pattern=token_pattern, ngram_range=ngram_range, max_df=max_df, min_df=min_df, max_features=max_features, vocabulary=vocabulary, binary=binary, dtype=dtype)\n self._tfidf = TfidfTransformer(norm=norm, use_idf=use_idf, smooth_idf=smooth_idf, sublinear_tf=sublinear_tf)" }, { @@ -81084,13 +86135,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _check_params(self):\n if self.dtype not in FLOAT_DTYPES:\n warnings.warn(\"Only {} 'dtype' should be used. {} 'dtype' will be converted to np.float64.\".format(FLOAT_DTYPES, self.dtype), UserWarning)" }, { @@ -81108,13 +86160,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _more_tags(self):\n return {'X_types': ['string'], '_skip_test': True}" }, { @@ -81132,7 +86185,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "raw_documents", @@ -81142,7 +86196,8 @@ "docstring": { "type": "iterable", "description": "An iterable which generates either str, unicode or file objects." 
- } + }, + "refined_type": {} }, { "name": "y", @@ -81152,13 +86207,14 @@ "docstring": { "type": "None", "description": "This parameter is not needed to compute tfidf." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Learn vocabulary and idf from training set.", - "docstring": "Learn vocabulary and idf from training set.\n\nParameters\n----------\nraw_documents : iterable\n An iterable which generates either str, unicode or file objects.\n\ny : None\n This parameter is not needed to compute tfidf.\n\nReturns\n-------\nself : object\n Fitted vectorizer.", + "docstring": "Learn vocabulary and idf from training set.\n\n Parameters\n ----------\n raw_documents : iterable\n An iterable which generates either str, unicode or file objects.\n\n y : None\n This parameter is not needed to compute tfidf.\n\n Returns\n -------\n self : object\n Fitted vectorizer.\n ", "source_code": "\ndef fit(self, raw_documents, y=None):\n \"\"\"Learn vocabulary and idf from training set.\n\n Parameters\n ----------\n raw_documents : iterable\n An iterable which generates either str, unicode or file objects.\n\n y : None\n This parameter is not needed to compute tfidf.\n\n Returns\n -------\n self : object\n Fitted vectorizer.\n \"\"\"\n self._check_params()\n self._warn_for_unused_params()\n X = super().fit_transform(raw_documents)\n self._tfidf.fit(X)\n return self" }, { @@ -81176,7 +86232,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "raw_documents", @@ -81186,7 +86243,8 @@ "docstring": { "type": "iterable", "description": "An iterable which generates either str, unicode or file objects." - } + }, + "refined_type": {} }, { "name": "y", @@ -81196,13 +86254,14 @@ "docstring": { "type": "None", "description": "This parameter is ignored." 
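[Illustrative aside] The TfidfVectorizer.fit / fit_transform pair documented here runs the superclass count vectorization and then fits the internal TfidfTransformer (visible in the recorded source as super().fit_transform followed by self._tfidf.fit); fit_transform simply avoids tokenising the corpus twice. A brief usage sketch with an arbitrary toy corpus:

    from sklearn.feature_extraction.text import TfidfVectorizer

    corpus = ["the quick brown fox", "the lazy dog", "the quick dog"]
    vec = TfidfVectorizer(ngram_range=(1, 2), norm="l2")
    X = vec.fit_transform(corpus)   # same result as vec.fit(corpus).transform(corpus)
    print(X.shape)                  # (3, n_terms) sparse tf-idf matrix
    print(vec.idf_[:3])             # idf_ is delegated to the internal TfidfTransformer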
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Learn vocabulary and idf, return document-term matrix.\n\nThis is equivalent to fit followed by transform, but more efficiently implemented.", - "docstring": "Learn vocabulary and idf, return document-term matrix.\n\nThis is equivalent to fit followed by transform, but more efficiently\nimplemented.\n\nParameters\n----------\nraw_documents : iterable\n An iterable which generates either str, unicode or file objects.\n\ny : None\n This parameter is ignored.\n\nReturns\n-------\nX : sparse matrix of (n_samples, n_features)\n Tf-idf-weighted document-term matrix.", + "description": "Learn vocabulary and idf, return document-term matrix.\n\nThis is equivalent to fit followed by transform, but more efficiently\nimplemented.", + "docstring": "Learn vocabulary and idf, return document-term matrix.\n\n This is equivalent to fit followed by transform, but more efficiently\n implemented.\n\n Parameters\n ----------\n raw_documents : iterable\n An iterable which generates either str, unicode or file objects.\n\n y : None\n This parameter is ignored.\n\n Returns\n -------\n X : sparse matrix of (n_samples, n_features)\n Tf-idf-weighted document-term matrix.\n ", "source_code": "\ndef fit_transform(self, raw_documents, y=None):\n \"\"\"Learn vocabulary and idf, return document-term matrix.\n\n This is equivalent to fit followed by transform, but more efficiently\n implemented.\n\n Parameters\n ----------\n raw_documents : iterable\n An iterable which generates either str, unicode or file objects.\n\n y : None\n This parameter is ignored.\n\n Returns\n -------\n X : sparse matrix of (n_samples, n_features)\n Tf-idf-weighted document-term matrix.\n \"\"\"\n self._check_params()\n X = super().fit_transform(raw_documents)\n self._tfidf.fit(X)\n return self._tfidf.transform(X, copy=False)" }, { @@ -81220,13 +86279,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Inverse document frequency vector, only defined if `use_idf=True`.", - "docstring": "Inverse document frequency vector, only defined if `use_idf=True`.\n\nReturns\n-------\nndarray of shape (n_features,)", + "docstring": "Inverse document frequency vector, only defined if `use_idf=True`.\n\n Returns\n -------\n ndarray of shape (n_features,)\n ", "source_code": "\n@property\ndef idf_(self):\n \"\"\"Inverse document frequency vector, only defined if `use_idf=True`.\n\n Returns\n -------\n ndarray of shape (n_features,)\n \"\"\"\n return self._tfidf.idf_" }, { @@ -81244,7 +86304,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "value", @@ -81254,13 +86315,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@idf_.setter\ndef idf_(self, value):\n self._validate_vocabulary()\n if hasattr(self, 'vocabulary_'):\n if len(self.vocabulary_) != len(value):\n raise ValueError('idf length = %d must be equal to vocabulary size = %d' % (len(value), len(self.vocabulary)))\n self._tfidf.idf_ = value" }, { @@ -81278,7 +86340,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -81302,7 +86365,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "value", @@ -81312,13 +86376,14 @@ "docstring": { "type": "", "description": "" 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@norm.setter\ndef norm(self, value):\n self._tfidf.norm = value" }, { @@ -81336,7 +86401,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -81360,7 +86426,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "value", @@ -81370,13 +86437,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@smooth_idf.setter\ndef smooth_idf(self, value):\n self._tfidf.smooth_idf = value" }, { @@ -81394,7 +86462,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -81418,7 +86487,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "value", @@ -81428,13 +86498,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@sublinear_tf.setter\ndef sublinear_tf(self, value):\n self._tfidf.sublinear_tf = value" }, { @@ -81452,7 +86523,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "raw_documents", @@ -81462,13 +86534,14 @@ "docstring": { "type": "iterable", "description": "An iterable which generates either str, unicode or file objects." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Transform documents to document-term matrix.\n\nUses the vocabulary and document frequencies (df) learned by fit (or fit_transform).", - "docstring": "Transform documents to document-term matrix.\n\nUses the vocabulary and document frequencies (df) learned by fit (or\nfit_transform).\n\nParameters\n----------\nraw_documents : iterable\n An iterable which generates either str, unicode or file objects.\n\nReturns\n-------\nX : sparse matrix of (n_samples, n_features)\n Tf-idf-weighted document-term matrix.", + "description": "Transform documents to document-term matrix.\n\nUses the vocabulary and document frequencies (df) learned by fit (or\nfit_transform).", + "docstring": "Transform documents to document-term matrix.\n\n Uses the vocabulary and document frequencies (df) learned by fit (or\n fit_transform).\n\n Parameters\n ----------\n raw_documents : iterable\n An iterable which generates either str, unicode or file objects.\n\n Returns\n -------\n X : sparse matrix of (n_samples, n_features)\n Tf-idf-weighted document-term matrix.\n ", "source_code": "\ndef transform(self, raw_documents):\n \"\"\"Transform documents to document-term matrix.\n\n Uses the vocabulary and document frequencies (df) learned by fit (or\n fit_transform).\n\n Parameters\n ----------\n raw_documents : iterable\n An iterable which generates either str, unicode or file objects.\n\n Returns\n -------\n X : sparse matrix of (n_samples, n_features)\n Tf-idf-weighted document-term matrix.\n \"\"\"\n check_is_fitted(self, msg='The TF-IDF vectorizer is not fitted')\n X = super().transform(raw_documents)\n return self._tfidf.transform(X, copy=False)" }, { @@ -81486,7 +86559,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -81510,7 +86584,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": 
"value", @@ -81520,13 +86595,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@use_idf.setter\ndef use_idf(self, value):\n self._tfidf.use_idf = value" }, { @@ -81544,7 +86620,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "text_document", @@ -81554,7 +86631,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -81578,7 +86656,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "text_document", @@ -81588,13 +86667,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Whitespace sensitive char-n-gram tokenization.\n\nTokenize text_document into a sequence of character n-grams operating only inside word boundaries. n-grams at the edges of words are padded with space.", - "docstring": "Whitespace sensitive char-n-gram tokenization.\n\nTokenize text_document into a sequence of character n-grams\noperating only inside word boundaries. n-grams at the edges\nof words are padded with space.", + "description": "Whitespace sensitive char-n-gram tokenization.\n\nTokenize text_document into a sequence of character n-grams\noperating only inside word boundaries. n-grams at the edges\nof words are padded with space.", + "docstring": "Whitespace sensitive char-n-gram tokenization.\n\n Tokenize text_document into a sequence of character n-grams\n operating only inside word boundaries. n-grams at the edges\n of words are padded with space.", "source_code": "\ndef _char_wb_ngrams(self, text_document):\n \"\"\"Whitespace sensitive char-n-gram tokenization.\n\n Tokenize text_document into a sequence of character n-grams\n operating only inside word boundaries. n-grams at the edges\n of words are padded with space.\"\"\"\n text_document = self._white_spaces.sub(' ', text_document)\n (min_n, max_n) = self.ngram_range\n ngrams = []\n ngrams_append = ngrams.append\n for w in text_document.split():\n w = ' ' + w + ' '\n w_len = len(w)\n for n in range(min_n, max_n + 1):\n offset = 0\n ngrams_append(w[offset:offset + n])\n while offset + n < w_len:\n offset += 1\n ngrams_append(w[offset:offset + n])\n if offset == 0:\n break\n return ngrams" }, { @@ -81612,7 +86692,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "stop_words", @@ -81622,7 +86703,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "preprocess", @@ -81632,7 +86714,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "tokenize", @@ -81642,13 +86725,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Check if stop words are consistent", - "docstring": "Check if stop words are consistent\n\nReturns\n-------\nis_consistent : True if stop words are consistent with the preprocessor\n and tokenizer, False if they are not, None if the check\n was previously performed, \"error\" if it could not be\n performed (e.g. 
because of the use of a custom\n preprocessor / tokenizer)", + "docstring": "Check if stop words are consistent\n\n Returns\n -------\n is_consistent : True if stop words are consistent with the preprocessor\n and tokenizer, False if they are not, None if the check\n was previously performed, \"error\" if it could not be\n performed (e.g. because of the use of a custom\n preprocessor / tokenizer)\n ", "source_code": "\ndef _check_stop_words_consistency(self, stop_words, preprocess, tokenize):\n \"\"\"Check if stop words are consistent\n\n Returns\n -------\n is_consistent : True if stop words are consistent with the preprocessor\n and tokenizer, False if they are not, None if the check\n was previously performed, \"error\" if it could not be\n performed (e.g. because of the use of a custom\n preprocessor / tokenizer)\n \"\"\"\n if id(self.stop_words) == getattr(self, '_stop_words_id', None):\n return None\n try:\n inconsistent = set()\n for w in stop_words or ():\n tokens = list(tokenize(preprocess(w)))\n for token in tokens:\n if token not in stop_words:\n inconsistent.add(token)\n self._stop_words_id = id(self.stop_words)\n if inconsistent:\n warnings.warn('Your stop_words may be inconsistent with your preprocessing. Tokenizing the stop words generated tokens %r not in stop_words.' % sorted(inconsistent))\n return not inconsistent\n except Exception:\n self._stop_words_id = id(self.stop_words)\n return 'error'" }, { @@ -81666,7 +86750,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -81690,7 +86775,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -81714,13 +86800,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _validate_vocabulary(self):\n vocabulary = self.vocabulary\n if vocabulary is not None:\n if isinstance(vocabulary, set):\n vocabulary = sorted(vocabulary)\n if not isinstance(vocabulary, Mapping):\n vocab = {}\n for (i, t) in enumerate(vocabulary):\n if vocab.setdefault(t, i) != i:\n msg = 'Duplicate term in vocabulary: %r' % t\n raise ValueError(msg)\n vocabulary = vocab\n else:\n indices = set(vocabulary.values())\n if len(indices) != len(vocabulary):\n raise ValueError('Vocabulary contains repeated indices.')\n for i in range(len(vocabulary)):\n if i not in indices:\n msg = \"Vocabulary of size %d doesn't contain index %d.\" % (len(vocabulary), i)\n raise ValueError(msg)\n if not vocabulary:\n raise ValueError('empty vocabulary passed to fit')\n self.fixed_vocabulary_ = True\n self.vocabulary_ = dict(vocabulary)\n else:\n self.fixed_vocabulary_ = False" }, { @@ -81738,13 +86825,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _warn_for_unused_params(self):\n if self.tokenizer is not None and self.token_pattern is not None:\n warnings.warn(\"The parameter 'token_pattern' will not be used since 'tokenizer' is not None'\")\n if self.preprocessor is not None and callable(self.analyzer):\n warnings.warn(\"The parameter 'preprocessor' will not be used since 'analyzer' is callable'\")\n if self.ngram_range != (1, 1) and self.ngram_range is not None and callable(self.analyzer):\n warnings.warn(\"The parameter 'ngram_range' will not be used since 'analyzer' is callable'\")\n if 
self.analyzer != 'word' or callable(self.analyzer):\n if self.stop_words is not None:\n warnings.warn(\"The parameter 'stop_words' will not be used since 'analyzer' != 'word'\")\n if self.token_pattern is not None and self.token_pattern != '(?u)\\\\b\\\\w\\\\w+\\\\b':\n warnings.warn(\"The parameter 'token_pattern' will not be used since 'analyzer' != 'word'\")\n if self.tokenizer is not None:\n warnings.warn(\"The parameter 'tokenizer' will not be used since 'analyzer' != 'word'\")" }, { @@ -81762,7 +86850,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "tokens", @@ -81772,7 +86861,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "stop_words", @@ -81782,7 +86872,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -81806,13 +86897,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Return a callable to process input data.\n\nThe callable handles that handles preprocessing, tokenization, and n-grams generation.", - "docstring": "Return a callable to process input data.\n\nThe callable handles that handles preprocessing, tokenization, and\nn-grams generation.\n\nReturns\n-------\nanalyzer: callable\n A function to handle preprocessing, tokenization\n and n-grams generation.", + "description": "Return a callable to process input data.\n\nThe callable handles that handles preprocessing, tokenization, and\nn-grams generation.", + "docstring": "Return a callable to process input data.\n\n The callable handles that handles preprocessing, tokenization, and\n n-grams generation.\n\n Returns\n -------\n analyzer: callable\n A function to handle preprocessing, tokenization\n and n-grams generation.\n ", "source_code": "\ndef build_analyzer(self):\n \"\"\"Return a callable to process input data.\n\n The callable handles that handles preprocessing, tokenization, and\n n-grams generation.\n\n Returns\n -------\n analyzer: callable\n A function to handle preprocessing, tokenization\n and n-grams generation.\n \"\"\"\n if callable(self.analyzer):\n return partial(_analyze, analyzer=self.analyzer, decoder=self.decode)\n preprocess = self.build_preprocessor()\n if self.analyzer == 'char':\n return partial(_analyze, ngrams=self._char_ngrams, preprocessor=preprocess, decoder=self.decode)\n elif self.analyzer == 'char_wb':\n return partial(_analyze, ngrams=self._char_wb_ngrams, preprocessor=preprocess, decoder=self.decode)\n elif self.analyzer == 'word':\n stop_words = self.get_stop_words()\n tokenize = self.build_tokenizer()\n self._check_stop_words_consistency(stop_words, preprocess, tokenize)\n return partial(_analyze, ngrams=self._word_ngrams, tokenizer=tokenize, preprocessor=preprocess, decoder=self.decode, stop_words=stop_words)\n else:\n raise ValueError('%s is not a valid tokenization scheme/analyzer' % self.analyzer)" }, { @@ -81830,13 +86922,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Return a function to preprocess the text before tokenization.", - "docstring": "Return a function to preprocess the text before tokenization.\n\nReturns\n-------\npreprocessor: callable\n A function to preprocess the text before tokenization.", + "docstring": "Return a function to preprocess the text before tokenization.\n\n Returns\n -------\n preprocessor: callable\n A function to preprocess the text 
before tokenization.\n ", "source_code": "\ndef build_preprocessor(self):\n \"\"\"Return a function to preprocess the text before tokenization.\n\n Returns\n -------\n preprocessor: callable\n A function to preprocess the text before tokenization.\n \"\"\"\n if self.preprocessor is not None:\n return self.preprocessor\n if not self.strip_accents:\n strip_accents = None\n elif callable(self.strip_accents):\n strip_accents = self.strip_accents\n elif self.strip_accents == 'ascii':\n strip_accents = strip_accents_ascii\n elif self.strip_accents == 'unicode':\n strip_accents = strip_accents_unicode\n else:\n raise ValueError('Invalid value for \"strip_accents\": %s' % self.strip_accents)\n return partial(_preprocess, accent_function=strip_accents, lower=self.lowercase)" }, { @@ -81854,13 +86947,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Return a function that splits a string into a sequence of tokens.", - "docstring": "Return a function that splits a string into a sequence of tokens.\n\nReturns\n-------\ntokenizer: callable\n A function to split a string into a sequence of tokens.", + "docstring": "Return a function that splits a string into a sequence of tokens.\n\n Returns\n -------\n tokenizer: callable\n A function to split a string into a sequence of tokens.\n ", "source_code": "\ndef build_tokenizer(self):\n \"\"\"Return a function that splits a string into a sequence of tokens.\n\n Returns\n -------\n tokenizer: callable\n A function to split a string into a sequence of tokens.\n \"\"\"\n if self.tokenizer is not None:\n return self.tokenizer\n token_pattern = re.compile(self.token_pattern)\n if token_pattern.groups > 1:\n raise ValueError('More than 1 capturing group in token pattern. Only a single group should be captured.')\n return token_pattern.findall" }, { @@ -81878,7 +86972,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "doc", @@ -81888,13 +86983,14 @@ "docstring": { "type": "bytes or str", "description": "The string to decode." 
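[Illustrative aside] The _char_wb_ngrams and build_analyzer entries above describe how 'char_wb' analysis pads each word with a space before extracting character n-grams. The effect is easy to see through the public build_analyzer of any of the vectorizers documented here; the input string is made up:

    from sklearn.feature_extraction.text import TfidfVectorizer

    analyze = TfidfVectorizer(analyzer="char_wb", ngram_range=(3, 3)).build_analyzer()
    print(analyze("word"))
    # [' wo', 'wor', 'ord', 'rd '] -- n-grams at the word edges are padded with a space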
- } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Decode the input into a string of unicode symbols.\n\nThe decoding strategy depends on the vectorizer parameters.", - "docstring": "Decode the input into a string of unicode symbols.\n\nThe decoding strategy depends on the vectorizer parameters.\n\nParameters\n----------\ndoc : bytes or str\n The string to decode.\n\nReturns\n-------\ndoc: str\n A string of unicode symbols.", + "docstring": "Decode the input into a string of unicode symbols.\n\n The decoding strategy depends on the vectorizer parameters.\n\n Parameters\n ----------\n doc : bytes or str\n The string to decode.\n\n Returns\n -------\n doc: str\n A string of unicode symbols.\n ", "source_code": "\ndef decode(self, doc):\n \"\"\"Decode the input into a string of unicode symbols.\n\n The decoding strategy depends on the vectorizer parameters.\n\n Parameters\n ----------\n doc : bytes or str\n The string to decode.\n\n Returns\n -------\n doc: str\n A string of unicode symbols.\n \"\"\"\n if self.input == 'filename':\n with open(doc, 'rb') as fh:\n doc = fh.read()\n elif self.input == 'file':\n doc = doc.read()\n if isinstance(doc, bytes):\n doc = doc.decode(self.encoding, self.decode_error)\n if doc is np.nan:\n raise ValueError('np.nan is an invalid document, expected byte or unicode string.')\n return doc" }, { @@ -81912,13 +87008,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Build or fetch the effective stop words list.", - "docstring": "Build or fetch the effective stop words list.\n\nReturns\n-------\nstop_words: list or None\n A list of stop words.", + "docstring": "Build or fetch the effective stop words list.\n\n Returns\n -------\n stop_words: list or None\n A list of stop words.\n ", "source_code": "\ndef get_stop_words(self):\n \"\"\"Build or fetch the effective stop words list.\n\n Returns\n -------\n stop_words: list or None\n A list of stop words.\n \"\"\"\n return _check_stop_list(self.stop_words)" }, { @@ -81936,7 +87033,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "analyzer", @@ -81946,7 +87044,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "tokenizer", @@ -81956,7 +87055,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "ngrams", @@ -81966,7 +87066,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "preprocessor", @@ -81976,7 +87077,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "decoder", @@ -81986,7 +87088,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "stop_words", @@ -81996,13 +87099,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Chain together an optional series of text processing steps to go from a single document to ngrams, with or without tokenizing or preprocessing.\n\nIf analyzer is used, only the decoder argument is used, as the analyzer is intended to replace the preprocessor, tokenizer, and ngrams steps.", - "docstring": "Chain together an optional series of text processing steps to go from\na single document to ngrams, with or without tokenizing or preprocessing.\n\nIf analyzer is used, only the decoder argument is used, as the analyzer is\nintended to replace the 
preprocessor, tokenizer, and ngrams steps.\n\nParameters\n----------\nanalyzer: callable, default=None\ntokenizer: callable, default=None\nngrams: callable, default=None\npreprocessor: callable, default=None\ndecoder: callable, default=None\nstop_words: list, default=None\n\nReturns\n-------\nngrams: list\n A sequence of tokens, possibly with pairs, triples, etc.", + "description": "Chain together an optional series of text processing steps to go from\na single document to ngrams, with or without tokenizing or preprocessing.\n\nIf analyzer is used, only the decoder argument is used, as the analyzer is\nintended to replace the preprocessor, tokenizer, and ngrams steps.", + "docstring": "Chain together an optional series of text processing steps to go from\n a single document to ngrams, with or without tokenizing or preprocessing.\n\n If analyzer is used, only the decoder argument is used, as the analyzer is\n intended to replace the preprocessor, tokenizer, and ngrams steps.\n\n Parameters\n ----------\n analyzer: callable, default=None\n tokenizer: callable, default=None\n ngrams: callable, default=None\n preprocessor: callable, default=None\n decoder: callable, default=None\n stop_words: list, default=None\n\n Returns\n -------\n ngrams: list\n A sequence of tokens, possibly with pairs, triples, etc.\n ", "source_code": "\ndef _analyze(doc, analyzer=None, tokenizer=None, ngrams=None, preprocessor=None, decoder=None, stop_words=None):\n \"\"\"Chain together an optional series of text processing steps to go from\n a single document to ngrams, with or without tokenizing or preprocessing.\n\n If analyzer is used, only the decoder argument is used, as the analyzer is\n intended to replace the preprocessor, tokenizer, and ngrams steps.\n\n Parameters\n ----------\n analyzer: callable, default=None\n tokenizer: callable, default=None\n ngrams: callable, default=None\n preprocessor: callable, default=None\n decoder: callable, default=None\n stop_words: list, default=None\n\n Returns\n -------\n ngrams: list\n A sequence of tokens, possibly with pairs, triples, etc.\n \"\"\"\n if decoder is not None:\n doc = decoder(doc)\n if analyzer is not None:\n doc = analyzer(doc)\n else:\n if preprocessor is not None:\n doc = preprocessor(doc)\n if tokenizer is not None:\n doc = tokenizer(doc)\n if ngrams is not None:\n if stop_words is not None:\n doc = ngrams(doc, stop_words)\n else:\n doc = ngrams(doc)\n return doc" }, { @@ -82020,13 +87124,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _check_stop_list(stop):\n if stop == 'english':\n return ENGLISH_STOP_WORDS\n elif isinstance(stop, str):\n raise ValueError('not a built-in stop list: %s' % stop)\n elif stop is None:\n return None\n else:\n return frozenset(stop)" }, { @@ -82044,7 +87149,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -82081,7 +87187,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "accent_function", @@ -82091,7 +87198,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "lower", @@ -82101,13 +87209,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Chain together an optional series of text preprocessing steps to apply to a document.", - "docstring": "Chain 
together an optional series of text preprocessing steps to\napply to a document.\n\nParameters\n----------\ndoc: str\n The string to preprocess\naccent_function: callable, default=None\n Function for handling accented characters. Common strategies include\n normalizing and removing.\nlower: bool, default=False\n Whether to use str.lower to lowercase all of the text\n\nReturns\n-------\ndoc: str\n preprocessed string", + "description": "Chain together an optional series of text preprocessing steps to\napply to a document.", + "docstring": "Chain together an optional series of text preprocessing steps to\n apply to a document.\n\n Parameters\n ----------\n doc: str\n The string to preprocess\n accent_function: callable, default=None\n Function for handling accented characters. Common strategies include\n normalizing and removing.\n lower: bool, default=False\n Whether to use str.lower to lowercase all of the text\n\n Returns\n -------\n doc: str\n preprocessed string\n ", "source_code": "\ndef _preprocess(doc, accent_function=None, lower=False):\n \"\"\"Chain together an optional series of text preprocessing steps to\n apply to a document.\n\n Parameters\n ----------\n doc: str\n The string to preprocess\n accent_function: callable, default=None\n Function for handling accented characters. Common strategies include\n normalizing and removing.\n lower: bool, default=False\n Whether to use str.lower to lowercase all of the text\n\n Returns\n -------\n doc: str\n preprocessed string\n \"\"\"\n if lower:\n doc = doc.lower()\n if accent_function is not None:\n doc = accent_function(doc)\n return doc" }, { @@ -82125,13 +87234,14 @@ "docstring": { "type": "str", "description": "The string to strip" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Transform accentuated unicode symbols into ascii or nothing\n\nWarning: this solution is only suited for languages that have a direct transliteration to ASCII symbols.", - "docstring": "Transform accentuated unicode symbols into ascii or nothing\n\nWarning: this solution is only suited for languages that have a direct\ntransliteration to ASCII symbols.\n\nParameters\n----------\ns : str\n The string to strip\n\nSee Also\n--------\nstrip_accents_unicode : Remove accentuated char for any unicode symbol.", + "description": "Transform accentuated unicode symbols into ascii or nothing\n\nWarning: this solution is only suited for languages that have a direct\ntransliteration to ASCII symbols.", + "docstring": "Transform accentuated unicode symbols into ascii or nothing\n\n Warning: this solution is only suited for languages that have a direct\n transliteration to ASCII symbols.\n\n Parameters\n ----------\n s : str\n The string to strip\n\n See Also\n --------\n strip_accents_unicode : Remove accentuated char for any unicode symbol.\n ", "source_code": "\ndef strip_accents_ascii(s):\n \"\"\"Transform accentuated unicode symbols into ascii or nothing\n\n Warning: this solution is only suited for languages that have a direct\n transliteration to ASCII symbols.\n\n Parameters\n ----------\n s : str\n The string to strip\n\n See Also\n --------\n strip_accents_unicode : Remove accentuated char for any unicode symbol.\n \"\"\"\n nkfd_form = unicodedata.normalize('NFKD', s)\n return nkfd_form.encode('ASCII', 'ignore').decode('ASCII')" }, { @@ -82149,13 +87259,14 @@ "docstring": { "type": "string", "description": "The string to strip" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Transform 
accentuated unicode symbols into their simple counterpart\n\nWarning: the python-level loop and join operations make this implementation 20 times slower than the strip_accents_ascii basic normalization.", - "docstring": "Transform accentuated unicode symbols into their simple counterpart\n\nWarning: the python-level loop and join operations make this\nimplementation 20 times slower than the strip_accents_ascii basic\nnormalization.\n\nParameters\n----------\ns : string\n The string to strip\n\nSee Also\n--------\nstrip_accents_ascii : Remove accentuated char for any unicode symbol that\n has a direct ASCII equivalent.", + "description": "Transform accentuated unicode symbols into their simple counterpart\n\nWarning: the python-level loop and join operations make this\nimplementation 20 times slower than the strip_accents_ascii basic\nnormalization.", + "docstring": "Transform accentuated unicode symbols into their simple counterpart\n\n Warning: the python-level loop and join operations make this\n implementation 20 times slower than the strip_accents_ascii basic\n normalization.\n\n Parameters\n ----------\n s : string\n The string to strip\n\n See Also\n --------\n strip_accents_ascii : Remove accentuated char for any unicode symbol that\n has a direct ASCII equivalent.\n ", "source_code": "\ndef strip_accents_unicode(s):\n \"\"\"Transform accentuated unicode symbols into their simple counterpart\n\n Warning: the python-level loop and join operations make this\n implementation 20 times slower than the strip_accents_ascii basic\n normalization.\n\n Parameters\n ----------\n s : string\n The string to strip\n\n See Also\n --------\n strip_accents_ascii : Remove accentuated char for any unicode symbol that\n has a direct ASCII equivalent.\n \"\"\"\n try:\n s.encode('ASCII', errors='strict')\n return s\n except UnicodeEncodeError:\n normalized = unicodedata.normalize('NFKD', s)\n return ''.join([c for c in normalized if not unicodedata.combining(c)])" }, { @@ -82173,13 +87284,14 @@ "docstring": { "type": "str", "description": "The string to strip" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Basic regexp based HTML / XML tag stripper function\n\nFor serious HTML/XML preprocessing you should rather use an external library such as lxml or BeautifulSoup.", - "docstring": "Basic regexp based HTML / XML tag stripper function\n\nFor serious HTML/XML preprocessing you should rather use an external\nlibrary such as lxml or BeautifulSoup.\n\nParameters\n----------\ns : str\n The string to strip", + "description": "Basic regexp based HTML / XML tag stripper function\n\nFor serious HTML/XML preprocessing you should rather use an external\nlibrary such as lxml or BeautifulSoup.", + "docstring": "Basic regexp based HTML / XML tag stripper function\n\n For serious HTML/XML preprocessing you should rather use an external\n library such as lxml or BeautifulSoup.\n\n Parameters\n ----------\n s : str\n The string to strip\n ", "source_code": "\ndef strip_tags(s):\n \"\"\"Basic regexp based HTML / XML tag stripper function\n\n For serious HTML/XML preprocessing you should rather use an external\n library such as lxml or BeautifulSoup.\n\n Parameters\n ----------\n s : str\n The string to strip\n \"\"\"\n return re.compile('<([^>]+)>', flags=re.UNICODE).sub(' ', s)" }, { @@ -82197,15 +87309,52 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Get the boolean mask indicating which 
features are selected", - "docstring": "Get the boolean mask indicating which features are selected\n\nReturns\n-------\nsupport : boolean array of shape [# input features]\n An element is True iff its corresponding feature is selected for\n retention.", + "docstring": "\n Get the boolean mask indicating which features are selected\n\n Returns\n -------\n support : boolean array of shape [# input features]\n An element is True iff its corresponding feature is selected for\n retention.\n ", "source_code": "\n@abstractmethod\ndef _get_support_mask(self):\n \"\"\"\n Get the boolean mask indicating which features are selected\n\n Returns\n -------\n support : boolean array of shape [# input features]\n An element is True iff its corresponding feature is selected for\n retention.\n \"\"\"\n " }, + { + "name": "_transform", + "unique_name": "_transform", + "qname": "sklearn.feature_selection._base.SelectorMixin._transform", + "unique_qname": "sklearn.feature_selection._base.SelectorMixin._transform", + "decorators": [], + "parameters": [ + { + "name": "self", + "default_value": null, + "is_public": false, + "assigned_by": "POSITION_OR_NAME", + "docstring": { + "type": "", + "description": "" + }, + "refined_type": {} + }, + { + "name": "X", + "default_value": null, + "is_public": false, + "assigned_by": "POSITION_OR_NAME", + "docstring": { + "type": "", + "description": "" + }, + "refined_type": {} + } + ], + "results": [], + "is_public": false, + "description": "Reduce X to the selected features.", + "docstring": "Reduce X to the selected features.", + "source_code": "\ndef _transform(self, X):\n \"\"\"Reduce X to the selected features.\"\"\"\n mask = self.get_support()\n if not mask.any():\n warn('No features were selected: either the data is too noisy or the selection test too strict.', UserWarning)\n return np.empty(0).reshape((X.shape[0], 0))\n if len(mask) != X.shape[1]:\n raise ValueError('X has a different shape than during fitting.')\n return X[:, safe_mask(X, mask)]" + }, { "name": "get_feature_names_out", "unique_name": "get_feature_names_out", @@ -82221,7 +87370,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "input_features", @@ -82231,13 +87381,14 @@ "docstring": { "type": "array-like of str or None, default=None", "description": "Input features.\n\n- If `input_features` is `None`, then `feature_names_in_` is\n used as feature names in. If `feature_names_in_` is not defined,\n then names are generated: `[x0, x1, ..., x(n_features_in_)]`.\n- If `input_features` is an array-like, then `input_features` must\n match `feature_names_in_` if `feature_names_in_` is defined." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Mask feature names according to selected features.", - "docstring": "Mask feature names according to selected features.\n\nParameters\n----------\ninput_features : array-like of str or None, default=None\n Input features.\n\n - If `input_features` is `None`, then `feature_names_in_` is\n used as feature names in. 
If `feature_names_in_` is not defined,\n then names are generated: `[x0, x1, ..., x(n_features_in_)]`.\n - If `input_features` is an array-like, then `input_features` must\n match `feature_names_in_` if `feature_names_in_` is defined.\n\nReturns\n-------\nfeature_names_out : ndarray of str objects\n Transformed feature names.", + "docstring": "Mask feature names according to selected features.\n\n Parameters\n ----------\n input_features : array-like of str or None, default=None\n Input features.\n\n - If `input_features` is `None`, then `feature_names_in_` is\n used as feature names in. If `feature_names_in_` is not defined,\n then names are generated: `[x0, x1, ..., x(n_features_in_)]`.\n - If `input_features` is an array-like, then `input_features` must\n match `feature_names_in_` if `feature_names_in_` is defined.\n\n Returns\n -------\n feature_names_out : ndarray of str objects\n Transformed feature names.\n ", "source_code": "\ndef get_feature_names_out(self, input_features=None):\n \"\"\"Mask feature names according to selected features.\n\n Parameters\n ----------\n input_features : array-like of str or None, default=None\n Input features.\n\n - If `input_features` is `None`, then `feature_names_in_` is\n used as feature names in. If `feature_names_in_` is not defined,\n then names are generated: `[x0, x1, ..., x(n_features_in_)]`.\n - If `input_features` is an array-like, then `input_features` must\n match `feature_names_in_` if `feature_names_in_` is defined.\n\n Returns\n -------\n feature_names_out : ndarray of str objects\n Transformed feature names.\n \"\"\"\n input_features = _check_feature_names_in(self, input_features)\n return input_features[self.get_support()]" }, { @@ -82255,7 +87406,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "indices", @@ -82265,13 +87417,14 @@ "docstring": { "type": "bool, default=False", "description": "If True, the return value will be an array of integers, rather\nthan a boolean mask." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Get a mask, or integer index, of the features selected.", - "docstring": "Get a mask, or integer index, of the features selected.\n\nParameters\n----------\nindices : bool, default=False\n If True, the return value will be an array of integers, rather\n than a boolean mask.\n\nReturns\n-------\nsupport : array\n An index that selects the retained features from a feature vector.\n If `indices` is False, this is a boolean array of shape\n [# input features], in which an element is True iff its\n corresponding feature is selected for retention. If `indices` is\n True, this is an integer array of shape [# output features] whose\n values are indices into the input feature vector.", + "docstring": "\n Get a mask, or integer index, of the features selected.\n\n Parameters\n ----------\n indices : bool, default=False\n If True, the return value will be an array of integers, rather\n than a boolean mask.\n\n Returns\n -------\n support : array\n An index that selects the retained features from a feature vector.\n If `indices` is False, this is a boolean array of shape\n [# input features], in which an element is True iff its\n corresponding feature is selected for retention. 
If `indices` is\n True, this is an integer array of shape [# output features] whose\n values are indices into the input feature vector.\n ", "source_code": "\ndef get_support(self, indices=False):\n \"\"\"\n Get a mask, or integer index, of the features selected.\n\n Parameters\n ----------\n indices : bool, default=False\n If True, the return value will be an array of integers, rather\n than a boolean mask.\n\n Returns\n -------\n support : array\n An index that selects the retained features from a feature vector.\n If `indices` is False, this is a boolean array of shape\n [# input features], in which an element is True iff its\n corresponding feature is selected for retention. If `indices` is\n True, this is an integer array of shape [# output features] whose\n values are indices into the input feature vector.\n \"\"\"\n mask = self._get_support_mask()\n return mask if not indices else np.where(mask)[0]" }, { @@ -82289,7 +87442,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -82299,13 +87453,14 @@ "docstring": { "type": "array of shape [n_samples, n_selected_features]", "description": "The input samples." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Reverse the transformation operation.", - "docstring": "Reverse the transformation operation.\n\nParameters\n----------\nX : array of shape [n_samples, n_selected_features]\n The input samples.\n\nReturns\n-------\nX_r : array of shape [n_samples, n_original_features]\n `X` with columns of zeros inserted where features would have\n been removed by :meth:`transform`.", + "docstring": "Reverse the transformation operation.\n\n Parameters\n ----------\n X : array of shape [n_samples, n_selected_features]\n The input samples.\n\n Returns\n -------\n X_r : array of shape [n_samples, n_original_features]\n `X` with columns of zeros inserted where features would have\n been removed by :meth:`transform`.\n ", "source_code": "\ndef inverse_transform(self, X):\n \"\"\"Reverse the transformation operation.\n\n Parameters\n ----------\n X : array of shape [n_samples, n_selected_features]\n The input samples.\n\n Returns\n -------\n X_r : array of shape [n_samples, n_original_features]\n `X` with columns of zeros inserted where features would have\n been removed by :meth:`transform`.\n \"\"\"\n if issparse(X):\n X = X.tocsc()\n it = self.inverse_transform(np.diff(X.indptr).reshape(1, -1))\n col_nonzeros = it.ravel()\n indptr = np.concatenate([[0], np.cumsum(col_nonzeros)])\n Xt = csc_matrix((X.data, X.indices, indptr), shape=(X.shape[0], len(indptr) - 1), dtype=X.dtype)\n return Xt\n support = self.get_support()\n X = check_array(X, dtype=None)\n if support.sum() != X.shape[1]:\n raise ValueError('X has a different shape than during fitting.')\n if X.ndim == 1:\n X = X[None, :]\n Xt = np.zeros((X.shape[0], support.size), dtype=X.dtype)\n Xt[:, support] = X\n return Xt" }, { @@ -82323,7 +87478,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -82333,14 +87489,15 @@ "docstring": { "type": "array of shape [n_samples, n_features]", "description": "The input samples." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Reduce X to the selected features.", - "docstring": "Reduce X to the selected features.\n\nParameters\n----------\nX : array of shape [n_samples, n_features]\n The input samples.\n\nReturns\n-------\nX_r : array of shape [n_samples, n_selected_features]\n The input samples with only the selected features.", - "source_code": "\ndef transform(self, X):\n \"\"\"Reduce X to the selected features.\n\n Parameters\n ----------\n X : array of shape [n_samples, n_features]\n The input samples.\n\n Returns\n -------\n X_r : array of shape [n_samples, n_selected_features]\n The input samples with only the selected features.\n \"\"\"\n X = self._validate_data(X, dtype=None, accept_sparse='csr', force_all_finite=not _safe_tags(self, key='allow_nan'), reset=False)\n mask = self.get_support()\n if not mask.any():\n warn('No features were selected: either the data is too noisy or the selection test too strict.', UserWarning)\n return np.empty(0).reshape((X.shape[0], 0))\n if len(mask) != X.shape[1]:\n raise ValueError('X has a different shape than during fitting.')\n return X[:, safe_mask(X, mask)]" + "docstring": "Reduce X to the selected features.\n\n Parameters\n ----------\n X : array of shape [n_samples, n_features]\n The input samples.\n\n Returns\n -------\n X_r : array of shape [n_samples, n_selected_features]\n The input samples with only the selected features.\n ", + "source_code": "\ndef transform(self, X):\n \"\"\"Reduce X to the selected features.\n\n Parameters\n ----------\n X : array of shape [n_samples, n_features]\n The input samples.\n\n Returns\n -------\n X_r : array of shape [n_samples, n_selected_features]\n The input samples with only the selected features.\n \"\"\"\n X = self._validate_data(X, dtype=None, accept_sparse='csr', force_all_finite=not _safe_tags(self, key='allow_nan'), reset=False)\n return self._transform(X)" }, { "name": "_get_feature_importances", @@ -82357,7 +87514,8 @@ "docstring": { "type": "estimator", "description": "A scikit-learn estimator from which we want to get the feature\nimportances." - } + }, + "refined_type": {} }, { "name": "getter", @@ -82367,7 +87525,8 @@ "docstring": { "type": "\"auto\", str or callable", "description": "An attribute or a callable to get the feature importance. If `\"auto\"`,\n`estimator` is expected to expose `coef_` or `feature_importances`." - } + }, + "refined_type": {} }, { "name": "transform_func", @@ -82377,6 +87536,10 @@ "docstring": { "type": "{\"norm\", \"square\"}, default=None", "description": "The transform to apply to the feature importances. By default (`None`)\nno transformation is applied." + }, + "refined_type": { + "kind": "EnumType", + "values": ["norm", "square"] } }, { @@ -82387,13 +87550,14 @@ "docstring": { "type": "int, default=1", "description": "The norm order to apply when `transform_func=\"norm\"`. Only applied\nwhen `importances.ndim > 1`." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Retrieve and aggregate (ndim > 1) the feature importances from an estimator. Also optionally applies transformation.", - "docstring": "Retrieve and aggregate (ndim > 1) the feature importances\nfrom an estimator. Also optionally applies transformation.\n\nParameters\n----------\nestimator : estimator\n A scikit-learn estimator from which we want to get the feature\n importances.\n\ngetter : \"auto\", str or callable\n An attribute or a callable to get the feature importance. 
If `\"auto\"`,\n `estimator` is expected to expose `coef_` or `feature_importances`.\n\ntransform_func : {\"norm\", \"square\"}, default=None\n The transform to apply to the feature importances. By default (`None`)\n no transformation is applied.\n\nnorm_order : int, default=1\n The norm order to apply when `transform_func=\"norm\"`. Only applied\n when `importances.ndim > 1`.\n\nReturns\n-------\nimportances : ndarray of shape (n_features,)\n The features importances, optionally transformed.", + "description": "Retrieve and aggregate (ndim > 1) the feature importances\nfrom an estimator. Also optionally applies transformation.", + "docstring": "\n Retrieve and aggregate (ndim > 1) the feature importances\n from an estimator. Also optionally applies transformation.\n\n Parameters\n ----------\n estimator : estimator\n A scikit-learn estimator from which we want to get the feature\n importances.\n\n getter : \"auto\", str or callable\n An attribute or a callable to get the feature importance. If `\"auto\"`,\n `estimator` is expected to expose `coef_` or `feature_importances`.\n\n transform_func : {\"norm\", \"square\"}, default=None\n The transform to apply to the feature importances. By default (`None`)\n no transformation is applied.\n\n norm_order : int, default=1\n The norm order to apply when `transform_func=\"norm\"`. Only applied\n when `importances.ndim > 1`.\n\n Returns\n -------\n importances : ndarray of shape (n_features,)\n The features importances, optionally transformed.\n ", "source_code": "\ndef _get_feature_importances(estimator, getter, transform_func=None, norm_order=1):\n \"\"\"\n Retrieve and aggregate (ndim > 1) the feature importances\n from an estimator. Also optionally applies transformation.\n\n Parameters\n ----------\n estimator : estimator\n A scikit-learn estimator from which we want to get the feature\n importances.\n\n getter : \"auto\", str or callable\n An attribute or a callable to get the feature importance. If `\"auto\"`,\n `estimator` is expected to expose `coef_` or `feature_importances`.\n\n transform_func : {\"norm\", \"square\"}, default=None\n The transform to apply to the feature importances. By default (`None`)\n no transformation is applied.\n\n norm_order : int, default=1\n The norm order to apply when `transform_func=\"norm\"`. Only applied\n when `importances.ndim > 1`.\n\n Returns\n -------\n importances : ndarray of shape (n_features,)\n The features importances, optionally transformed.\n \"\"\"\n if isinstance(getter, str):\n if getter == 'auto':\n if hasattr(estimator, 'coef_'):\n getter = attrgetter('coef_')\n elif hasattr(estimator, 'feature_importances_'):\n getter = attrgetter('feature_importances_')\n else:\n raise ValueError(f\"when `importance_getter=='auto'`, the underlying estimator {estimator.__class__.__name__} should have `coef_` or `feature_importances_` attribute. 
Either pass a fitted estimator to feature selector or call fit before calling transform.\")\n else:\n getter = attrgetter(getter)\n elif not callable(getter):\n raise ValueError('`importance_getter` has to be a string or `callable`')\n importances = getter(estimator)\n if transform_func is None:\n return importances\n elif transform_func == 'norm':\n if importances.ndim == 1:\n importances = np.abs(importances)\n else:\n importances = np.linalg.norm(importances, axis=0, ord=norm_order)\n elif transform_func == 'square':\n if importances.ndim == 1:\n importances = safe_sqr(importances)\n else:\n importances = safe_sqr(importances).sum(axis=0)\n else:\n raise ValueError('Valid values for `transform_func` are ' + \"None, 'norm' and 'square'. Those two \" + 'transformation are only supported now')\n return importances" }, { @@ -82411,7 +87575,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "estimator", @@ -82421,7 +87586,8 @@ "docstring": { "type": "object", "description": "The base estimator from which the transformer is built.\nThis can be both a fitted (if ``prefit`` is set to True)\nor a non-fitted estimator. The estimator should have a\n``feature_importances_`` or ``coef_`` attribute after fitting.\nOtherwise, the ``importance_getter`` parameter should be used." - } + }, + "refined_type": {} }, { "name": "threshold", @@ -82431,7 +87597,8 @@ "docstring": { "type": "str or float, default=None", "description": "The threshold value to use for feature selection. Features whose\nimportance is greater or equal are kept while the others are\ndiscarded. If \"median\" (resp. \"mean\"), then the ``threshold`` value is\nthe median (resp. the mean) of the feature importances. A scaling\nfactor (e.g., \"1.25*mean\") may also be used. If None and if the\nestimator has a parameter penalty set to l1, either explicitly\nor implicitly (e.g, Lasso), the threshold used is 1e-5.\nOtherwise, \"mean\" is used by default." - } + }, + "refined_type": {} }, { "name": "prefit", @@ -82441,7 +87608,8 @@ "docstring": { "type": "bool, default=False", "description": "Whether a prefit model is expected to be passed into the constructor\ndirectly or not. If True, ``transform`` must be called directly\nand SelectFromModel cannot be used with ``cross_val_score``,\n``GridSearchCV`` and similar utilities that clone the estimator.\nOtherwise train the model using ``fit`` and then ``transform`` to do\nfeature selection." - } + }, + "refined_type": {} }, { "name": "norm_order", @@ -82451,7 +87619,8 @@ "docstring": { "type": "non-zero int, inf, -inf, default=1", "description": "Order of the norm used to filter the vectors of coefficients below\n``threshold`` in the case where the ``coef_`` attribute of the\nestimator is of dimension 2." - } + }, + "refined_type": {} }, { "name": "max_features", @@ -82461,7 +87630,8 @@ "docstring": { "type": "int, default=None", "description": "The maximum number of features to select.\nTo only select based on ``max_features``, set ``threshold=-np.inf``.\n\n.. 
versionadded:: 0.20" - } + }, + "refined_type": {} }, { "name": "importance_getter", @@ -82471,13 +87641,14 @@ "docstring": { "type": "str or callable, default='auto'", "description": "If 'auto', uses the feature importance either through a ``coef_``\nattribute or ``feature_importances_`` attribute of estimator.\n\nAlso accepts a string that specifies an attribute name/path\nfor extracting feature importance (implemented with `attrgetter`).\nFor example, give `regressor_.coef_` in case of\n:class:`~sklearn.compose.TransformedTargetRegressor` or\n`named_steps.clf.feature_importances_` in case of\n:class:`~sklearn.pipeline.Pipeline` with its last step named `clf`.\n\nIf `callable`, overrides the default feature importance getter.\nThe callable is passed with the fitted estimator and it should\nreturn importance for each feature.\n\n.. versionadded:: 0.24" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, estimator, *, threshold=None, prefit=False, norm_order=1, max_features=None, importance_getter='auto'):\n self.estimator = estimator\n self.threshold = threshold\n self.prefit = prefit\n self.importance_getter = importance_getter\n self.norm_order = norm_order\n self.max_features = max_features" }, { @@ -82495,13 +87666,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _get_support_mask(self):\n if self.prefit:\n estimator = self.estimator\n elif hasattr(self, 'estimator_'):\n estimator = self.estimator_\n else:\n raise ValueError('Either fit the model before transform or set \"prefit=True\" while passing the fitted estimator to the constructor.')\n scores = _get_feature_importances(estimator=estimator, getter=self.importance_getter, transform_func='norm', norm_order=self.norm_order)\n threshold = _calculate_threshold(estimator, scores, self.threshold)\n if self.max_features is not None:\n mask = np.zeros_like(scores, dtype=bool)\n candidate_indices = np.argsort(-scores, kind='mergesort')[:self.max_features]\n mask[candidate_indices] = True\n else:\n mask = np.ones_like(scores, dtype=bool)\n mask[scores < threshold] = False\n return mask" }, { @@ -82519,13 +87691,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _more_tags(self):\n return {'allow_nan': _safe_tags(self.estimator, key='allow_nan')}" }, { @@ -82543,7 +87716,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -82553,7 +87727,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "The training input samples." - } + }, + "refined_type": {} }, { "name": "y", @@ -82563,14 +87738,15 @@ "docstring": { "type": "array-like of shape (n_samples,), default=None", "description": "The target values (integers that correspond to classes in\nclassification, real numbers in regression)." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Fit the SelectFromModel meta-transformer.", - "docstring": "Fit the SelectFromModel meta-transformer.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n The training input samples.\n\ny : array-like of shape (n_samples,), default=None\n The target values (integers that correspond to classes in\n classification, real numbers in regression).\n\n**fit_params : dict\n Other estimator specific parameters.\n\nReturns\n-------\nself : object\n Fitted estimator.", - "source_code": "\ndef fit(self, X, y=None, **fit_params):\n \"\"\"Fit the SelectFromModel meta-transformer.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The training input samples.\n\n y : array-like of shape (n_samples,), default=None\n The target values (integers that correspond to classes in\n classification, real numbers in regression).\n\n **fit_params : dict\n Other estimator specific parameters.\n\n Returns\n -------\n self : object\n Fitted estimator.\n \"\"\"\n if self.max_features is not None:\n if not isinstance(self.max_features, numbers.Integral):\n raise TypeError(\"'max_features' should be an integer between 0 and {} features. Got {!r} instead.\".format(X.shape[1], self.max_features))\n elif self.max_features < 0 or self.max_features > X.shape[1]:\n raise ValueError(\"'max_features' should be 0 and {} features.Got {} instead.\".format(X.shape[1], self.max_features))\n if self.prefit:\n raise NotFittedError(\"Since 'prefit=True', call transform directly\")\n self.estimator_ = clone(self.estimator)\n self.estimator_.fit(X, y, **fit_params)\n if hasattr(self.estimator_, 'feature_names_in_'):\n self.feature_names_in_ = self.estimator_.feature_names_in_\n return self" + "docstring": "Fit the SelectFromModel meta-transformer.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The training input samples.\n\n y : array-like of shape (n_samples,), default=None\n The target values (integers that correspond to classes in\n classification, real numbers in regression).\n\n **fit_params : dict\n Other estimator specific parameters.\n\n Returns\n -------\n self : object\n Fitted estimator.\n ", + "source_code": "\ndef fit(self, X, y=None, **fit_params):\n \"\"\"Fit the SelectFromModel meta-transformer.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The training input samples.\n\n y : array-like of shape (n_samples,), default=None\n The target values (integers that correspond to classes in\n classification, real numbers in regression).\n\n **fit_params : dict\n Other estimator specific parameters.\n\n Returns\n -------\n self : object\n Fitted estimator.\n \"\"\"\n if self.max_features is not None:\n if not isinstance(self.max_features, numbers.Integral):\n raise TypeError(\"'max_features' should be an integer between 0 and {} features. 
Got {!r} instead.\".format(X.shape[1], self.max_features))\n elif self.max_features < 0 or self.max_features > X.shape[1]:\n raise ValueError(\"'max_features' should be 0 and {} features.Got {} instead.\".format(X.shape[1], self.max_features))\n if self.prefit:\n raise NotFittedError(\"Since 'prefit=True', call transform directly\")\n self.estimator_ = clone(self.estimator)\n self.estimator_.fit(X, y, **fit_params)\n if hasattr(self.estimator_, 'feature_names_in_'):\n self.feature_names_in_ = self.estimator_.feature_names_in_\n else:\n self._check_feature_names(X, reset=True)\n return self" }, { "name": "n_features_in_", @@ -82587,7 +87763,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -82611,7 +87788,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -82621,7 +87799,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "The training input samples." - } + }, + "refined_type": {} }, { "name": "y", @@ -82631,13 +87810,14 @@ "docstring": { "type": "array-like of shape (n_samples,), default=None", "description": "The target values (integers that correspond to classes in\nclassification, real numbers in regression)." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Fit the SelectFromModel meta-transformer only once.", - "docstring": "Fit the SelectFromModel meta-transformer only once.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n The training input samples.\n\ny : array-like of shape (n_samples,), default=None\n The target values (integers that correspond to classes in\n classification, real numbers in regression).\n\n**fit_params : dict\n Other estimator specific parameters.\n\nReturns\n-------\nself : object\n Fitted estimator.", + "docstring": "Fit the SelectFromModel meta-transformer only once.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The training input samples.\n\n y : array-like of shape (n_samples,), default=None\n The target values (integers that correspond to classes in\n classification, real numbers in regression).\n\n **fit_params : dict\n Other estimator specific parameters.\n\n Returns\n -------\n self : object\n Fitted estimator.\n ", "source_code": "\n@if_delegate_has_method('estimator')\ndef partial_fit(self, X, y=None, **fit_params):\n \"\"\"Fit the SelectFromModel meta-transformer only once.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The training input samples.\n\n y : array-like of shape (n_samples,), default=None\n The target values (integers that correspond to classes in\n classification, real numbers in regression).\n\n **fit_params : dict\n Other estimator specific parameters.\n\n Returns\n -------\n self : object\n Fitted estimator.\n \"\"\"\n if self.prefit:\n raise NotFittedError(\"Since 'prefit=True', call transform directly\")\n if not hasattr(self, 'estimator_'):\n self.estimator_ = clone(self.estimator)\n self.estimator_.partial_fit(X, y, **fit_params)\n return self" }, { @@ -82655,7 +87835,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -82679,7 +87860,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "importances", @@ -82689,7 +87871,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "threshold", @@ -82699,7 +87882,8 @@ "docstring": { "type": "", 
"description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -82723,7 +87907,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -82733,7 +87918,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "x_discrete", @@ -82743,7 +87929,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y_discrete", @@ -82753,7 +87940,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_neighbors", @@ -82763,13 +87951,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Compute mutual information between two variables.\n\nThis is a simple wrapper which selects a proper function to call based on whether `x` and `y` are discrete or not.", - "docstring": "Compute mutual information between two variables.\n\nThis is a simple wrapper which selects a proper function to call based on\nwhether `x` and `y` are discrete or not.", + "description": "Compute mutual information between two variables.\n\nThis is a simple wrapper which selects a proper function to call based on\nwhether `x` and `y` are discrete or not.", + "docstring": "Compute mutual information between two variables.\n\n This is a simple wrapper which selects a proper function to call based on\n whether `x` and `y` are discrete or not.\n ", "source_code": "\ndef _compute_mi(x, y, x_discrete, y_discrete, n_neighbors=3):\n \"\"\"Compute mutual information between two variables.\n\n This is a simple wrapper which selects a proper function to call based on\n whether `x` and `y` are discrete or not.\n \"\"\"\n if x_discrete and y_discrete:\n return mutual_info_score(x, y)\n elif x_discrete and not y_discrete:\n return _compute_mi_cd(y, x, n_neighbors)\n elif not x_discrete and y_discrete:\n return _compute_mi_cd(x, y, n_neighbors)\n else:\n return _compute_mi_cc(x, y, n_neighbors)" }, { @@ -82787,7 +87976,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -82797,7 +87987,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_neighbors", @@ -82807,13 +87998,14 @@ "docstring": { "type": "int", "description": "Number of nearest neighbors to search for each point, see [1]_." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Compute mutual information between two continuous variables.", - "docstring": "Compute mutual information between two continuous variables.\n\nParameters\n----------\nx, y : ndarray, shape (n_samples,)\n Samples of two continuous random variables, must have an identical\n shape.\n\nn_neighbors : int\n Number of nearest neighbors to search for each point, see [1]_.\n\nReturns\n-------\nmi : float\n Estimated mutual information. If it turned out to be negative it is\n replace by 0.\n\nNotes\n-----\nTrue mutual information can't be negative. If its estimate by a numerical\nmethod is negative, it means (providing the method is adequate) that the\nmutual information is close to 0 and replacing it by 0 is a reasonable\nstrategy.\n\nReferences\n----------\n.. [1] A. Kraskov, H. Stogbauer and P. Grassberger, \"Estimating mutual\n information\". Phys. Rev. 
E 69, 2004.", + "docstring": "Compute mutual information between two continuous variables.\n\n Parameters\n ----------\n x, y : ndarray, shape (n_samples,)\n Samples of two continuous random variables, must have an identical\n shape.\n\n n_neighbors : int\n Number of nearest neighbors to search for each point, see [1]_.\n\n Returns\n -------\n mi : float\n Estimated mutual information. If it turned out to be negative it is\n replace by 0.\n\n Notes\n -----\n True mutual information can't be negative. If its estimate by a numerical\n method is negative, it means (providing the method is adequate) that the\n mutual information is close to 0 and replacing it by 0 is a reasonable\n strategy.\n\n References\n ----------\n .. [1] A. Kraskov, H. Stogbauer and P. Grassberger, \"Estimating mutual\n information\". Phys. Rev. E 69, 2004.\n ", "source_code": "\ndef _compute_mi_cc(x, y, n_neighbors):\n \"\"\"Compute mutual information between two continuous variables.\n\n Parameters\n ----------\n x, y : ndarray, shape (n_samples,)\n Samples of two continuous random variables, must have an identical\n shape.\n\n n_neighbors : int\n Number of nearest neighbors to search for each point, see [1]_.\n\n Returns\n -------\n mi : float\n Estimated mutual information. If it turned out to be negative it is\n replace by 0.\n\n Notes\n -----\n True mutual information can't be negative. If its estimate by a numerical\n method is negative, it means (providing the method is adequate) that the\n mutual information is close to 0 and replacing it by 0 is a reasonable\n strategy.\n\n References\n ----------\n .. [1] A. Kraskov, H. Stogbauer and P. Grassberger, \"Estimating mutual\n information\". Phys. Rev. E 69, 2004.\n \"\"\"\n n_samples = x.size\n x = x.reshape((-1, 1))\n y = y.reshape((-1, 1))\n xy = np.hstack((x, y))\n nn = NearestNeighbors(metric='chebyshev', n_neighbors=n_neighbors)\n nn.fit(xy)\n radius = nn.kneighbors()[0]\n radius = np.nextafter(radius[:, -1], 0)\n kd = KDTree(x, metric='chebyshev')\n nx = kd.query_radius(x, radius, count_only=True, return_distance=False)\n nx = np.array(nx) - 1.0\n kd = KDTree(y, metric='chebyshev')\n ny = kd.query_radius(y, radius, count_only=True, return_distance=False)\n ny = np.array(ny) - 1.0\n mi = digamma(n_samples) + digamma(n_neighbors) - np.mean(digamma(nx + 1)) - np.mean(digamma(ny + 1))\n return max(0, mi)" }, { @@ -82831,7 +88023,8 @@ "docstring": { "type": "ndarray, shape (n_samples,)", "description": "Samples of a continuous random variable." - } + }, + "refined_type": {} }, { "name": "d", @@ -82841,7 +88034,8 @@ "docstring": { "type": "ndarray, shape (n_samples,)", "description": "Samples of a discrete random variable." - } + }, + "refined_type": {} }, { "name": "n_neighbors", @@ -82851,13 +88045,14 @@ "docstring": { "type": "int", "description": "Number of nearest neighbors to search for each point, see [1]_." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Compute mutual information between continuous and discrete variables.", - "docstring": "Compute mutual information between continuous and discrete variables.\n\nParameters\n----------\nc : ndarray, shape (n_samples,)\n Samples of a continuous random variable.\n\nd : ndarray, shape (n_samples,)\n Samples of a discrete random variable.\n\nn_neighbors : int\n Number of nearest neighbors to search for each point, see [1]_.\n\nReturns\n-------\nmi : float\n Estimated mutual information. 
If it turned out to be negative it is\n replace by 0.\n\nNotes\n-----\nTrue mutual information can't be negative. If its estimate by a numerical\nmethod is negative, it means (providing the method is adequate) that the\nmutual information is close to 0 and replacing it by 0 is a reasonable\nstrategy.\n\nReferences\n----------\n.. [1] B. C. Ross \"Mutual Information between Discrete and Continuous\n Data Sets\". PLoS ONE 9(2), 2014.", + "docstring": "Compute mutual information between continuous and discrete variables.\n\n Parameters\n ----------\n c : ndarray, shape (n_samples,)\n Samples of a continuous random variable.\n\n d : ndarray, shape (n_samples,)\n Samples of a discrete random variable.\n\n n_neighbors : int\n Number of nearest neighbors to search for each point, see [1]_.\n\n Returns\n -------\n mi : float\n Estimated mutual information. If it turned out to be negative it is\n replace by 0.\n\n Notes\n -----\n True mutual information can't be negative. If its estimate by a numerical\n method is negative, it means (providing the method is adequate) that the\n mutual information is close to 0 and replacing it by 0 is a reasonable\n strategy.\n\n References\n ----------\n .. [1] B. C. Ross \"Mutual Information between Discrete and Continuous\n Data Sets\". PLoS ONE 9(2), 2014.\n ", "source_code": "\ndef _compute_mi_cd(c, d, n_neighbors):\n \"\"\"Compute mutual information between continuous and discrete variables.\n\n Parameters\n ----------\n c : ndarray, shape (n_samples,)\n Samples of a continuous random variable.\n\n d : ndarray, shape (n_samples,)\n Samples of a discrete random variable.\n\n n_neighbors : int\n Number of nearest neighbors to search for each point, see [1]_.\n\n Returns\n -------\n mi : float\n Estimated mutual information. If it turned out to be negative it is\n replace by 0.\n\n Notes\n -----\n True mutual information can't be negative. If its estimate by a numerical\n method is negative, it means (providing the method is adequate) that the\n mutual information is close to 0 and replacing it by 0 is a reasonable\n strategy.\n\n References\n ----------\n .. [1] B. C. Ross \"Mutual Information between Discrete and Continuous\n Data Sets\". PLoS ONE 9(2), 2014.\n \"\"\"\n n_samples = c.shape[0]\n c = c.reshape((-1, 1))\n radius = np.empty(n_samples)\n label_counts = np.empty(n_samples)\n k_all = np.empty(n_samples)\n nn = NearestNeighbors()\n for label in np.unique(d):\n mask = d == label\n count = np.sum(mask)\n if count > 1:\n k = min(n_neighbors, count - 1)\n nn.set_params(n_neighbors=k)\n nn.fit(c[mask])\n r = nn.kneighbors()[0]\n radius[mask] = np.nextafter(r[:, -1], 0)\n k_all[mask] = k\n label_counts[mask] = count\n mask = label_counts > 1\n n_samples = np.sum(mask)\n label_counts = label_counts[mask]\n k_all = k_all[mask]\n c = c[mask]\n radius = radius[mask]\n kd = KDTree(c)\n m_all = kd.query_radius(c, radius, count_only=True, return_distance=False)\n m_all = np.array(m_all) - 1.0\n mi = digamma(n_samples) + np.mean(digamma(k_all)) - np.mean(digamma(label_counts)) - np.mean(digamma(m_all + 1))\n return max(0, mi)" }, { @@ -82875,7 +88070,8 @@ "docstring": { "type": "array-like or sparse matrix, shape (n_samples, n_features)", "description": "Feature matrix." - } + }, + "refined_type": {} }, { "name": "y", @@ -82885,7 +88081,8 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "Target vector." 
- } + }, + "refined_type": {} }, { "name": "discrete_features", @@ -82895,6 +88092,10 @@ "docstring": { "type": "{'auto', bool, array-like}, default='auto'", "description": "If bool, then determines whether to consider all features discrete\nor continuous. If array, then it should be either a boolean mask\nwith shape (n_features,) or array with indices of discrete features.\nIf 'auto', it is assigned to False for dense `X` and to True for\nsparse `X`." + }, + "refined_type": { + "kind": "EnumType", + "values": ["auto"] } }, { @@ -82905,7 +88106,8 @@ "docstring": { "type": "bool, default=False", "description": "Whether to consider `y` as a discrete variable." - } + }, + "refined_type": {} }, { "name": "n_neighbors", @@ -82915,7 +88117,8 @@ "docstring": { "type": "int, default=3", "description": "Number of neighbors to use for MI estimation for continuous variables,\nsee [1]_ and [2]_. Higher values reduce variance of the estimation, but\ncould introduce a bias." - } + }, + "refined_type": {} }, { "name": "copy", @@ -82925,7 +88128,8 @@ "docstring": { "type": "bool, default=True", "description": "Whether to make a copy of the given data. If set to False, the initial\ndata will be overwritten." - } + }, + "refined_type": {} }, { "name": "random_state", @@ -82935,13 +88139,14 @@ "docstring": { "type": "int, RandomState instance or None, default=None", "description": "Determines random number generation for adding small noise to\ncontinuous variables in order to remove repeated values.\nPass an int for reproducible results across multiple function calls.\nSee :term:`Glossary `." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Estimate mutual information between the features and the target.", - "docstring": "Estimate mutual information between the features and the target.\n\nParameters\n----------\nX : array-like or sparse matrix, shape (n_samples, n_features)\n Feature matrix.\n\ny : array-like of shape (n_samples,)\n Target vector.\n\ndiscrete_features : {'auto', bool, array-like}, default='auto'\n If bool, then determines whether to consider all features discrete\n or continuous. If array, then it should be either a boolean mask\n with shape (n_features,) or array with indices of discrete features.\n If 'auto', it is assigned to False for dense `X` and to True for\n sparse `X`.\n\ndiscrete_target : bool, default=False\n Whether to consider `y` as a discrete variable.\n\nn_neighbors : int, default=3\n Number of neighbors to use for MI estimation for continuous variables,\n see [1]_ and [2]_. Higher values reduce variance of the estimation, but\n could introduce a bias.\n\ncopy : bool, default=True\n Whether to make a copy of the given data. If set to False, the initial\n data will be overwritten.\n\nrandom_state : int, RandomState instance or None, default=None\n Determines random number generation for adding small noise to\n continuous variables in order to remove repeated values.\n Pass an int for reproducible results across multiple function calls.\n See :term:`Glossary `.\n\nReturns\n-------\nmi : ndarray, shape (n_features,)\n Estimated mutual information between each feature and the target.\n A negative value will be replaced by 0.\n\nReferences\n----------\n.. [1] A. Kraskov, H. Stogbauer and P. Grassberger, \"Estimating mutual\n information\". Phys. Rev. E 69, 2004.\n.. [2] B. C. Ross \"Mutual Information between Discrete and Continuous\n Data Sets\". 
PLoS ONE 9(2), 2014.", + "docstring": "Estimate mutual information between the features and the target.\n\n Parameters\n ----------\n X : array-like or sparse matrix, shape (n_samples, n_features)\n Feature matrix.\n\n y : array-like of shape (n_samples,)\n Target vector.\n\n discrete_features : {'auto', bool, array-like}, default='auto'\n If bool, then determines whether to consider all features discrete\n or continuous. If array, then it should be either a boolean mask\n with shape (n_features,) or array with indices of discrete features.\n If 'auto', it is assigned to False for dense `X` and to True for\n sparse `X`.\n\n discrete_target : bool, default=False\n Whether to consider `y` as a discrete variable.\n\n n_neighbors : int, default=3\n Number of neighbors to use for MI estimation for continuous variables,\n see [1]_ and [2]_. Higher values reduce variance of the estimation, but\n could introduce a bias.\n\n copy : bool, default=True\n Whether to make a copy of the given data. If set to False, the initial\n data will be overwritten.\n\n random_state : int, RandomState instance or None, default=None\n Determines random number generation for adding small noise to\n continuous variables in order to remove repeated values.\n Pass an int for reproducible results across multiple function calls.\n See :term:`Glossary `.\n\n Returns\n -------\n mi : ndarray, shape (n_features,)\n Estimated mutual information between each feature and the target.\n A negative value will be replaced by 0.\n\n References\n ----------\n .. [1] A. Kraskov, H. Stogbauer and P. Grassberger, \"Estimating mutual\n information\". Phys. Rev. E 69, 2004.\n .. [2] B. C. Ross \"Mutual Information between Discrete and Continuous\n Data Sets\". PLoS ONE 9(2), 2014.\n ", "source_code": "\ndef _estimate_mi(X, y, discrete_features='auto', discrete_target=False, n_neighbors=3, copy=True, random_state=None):\n \"\"\"Estimate mutual information between the features and the target.\n\n Parameters\n ----------\n X : array-like or sparse matrix, shape (n_samples, n_features)\n Feature matrix.\n\n y : array-like of shape (n_samples,)\n Target vector.\n\n discrete_features : {'auto', bool, array-like}, default='auto'\n If bool, then determines whether to consider all features discrete\n or continuous. If array, then it should be either a boolean mask\n with shape (n_features,) or array with indices of discrete features.\n If 'auto', it is assigned to False for dense `X` and to True for\n sparse `X`.\n\n discrete_target : bool, default=False\n Whether to consider `y` as a discrete variable.\n\n n_neighbors : int, default=3\n Number of neighbors to use for MI estimation for continuous variables,\n see [1]_ and [2]_. Higher values reduce variance of the estimation, but\n could introduce a bias.\n\n copy : bool, default=True\n Whether to make a copy of the given data. If set to False, the initial\n data will be overwritten.\n\n random_state : int, RandomState instance or None, default=None\n Determines random number generation for adding small noise to\n continuous variables in order to remove repeated values.\n Pass an int for reproducible results across multiple function calls.\n See :term:`Glossary `.\n\n Returns\n -------\n mi : ndarray, shape (n_features,)\n Estimated mutual information between each feature and the target.\n A negative value will be replaced by 0.\n\n References\n ----------\n .. [1] A. Kraskov, H. Stogbauer and P. Grassberger, \"Estimating mutual\n information\". Phys. Rev. E 69, 2004.\n .. [2] B. C. 
Ross \"Mutual Information between Discrete and Continuous\n Data Sets\". PLoS ONE 9(2), 2014.\n \"\"\"\n (X, y) = check_X_y(X, y, accept_sparse='csc', y_numeric=not discrete_target)\n (n_samples, n_features) = X.shape\n if isinstance(discrete_features, (str, bool)):\n if isinstance(discrete_features, str):\n if discrete_features == 'auto':\n discrete_features = issparse(X)\n else:\n raise ValueError('Invalid string value for discrete_features.')\n discrete_mask = np.empty(n_features, dtype=bool)\n discrete_mask.fill(discrete_features)\n else:\n discrete_features = check_array(discrete_features, ensure_2d=False)\n if discrete_features.dtype != 'bool':\n discrete_mask = np.zeros(n_features, dtype=bool)\n discrete_mask[discrete_features] = True\n else:\n discrete_mask = discrete_features\n continuous_mask = ~discrete_mask\n if np.any(continuous_mask) and issparse(X):\n raise ValueError(\"Sparse matrix `X` can't have continuous features.\")\n rng = check_random_state(random_state)\n if np.any(continuous_mask):\n if copy:\n X = X.copy()\n if not discrete_target:\n X[:, continuous_mask] = scale(X[:, continuous_mask], with_mean=False, copy=False)\n X = X.astype(float, **_astype_copy_false(X))\n means = np.maximum(1, np.mean(np.abs(X[:, continuous_mask]), axis=0))\n X[:, continuous_mask] += 1e-10 * means * rng.randn(n_samples, np.sum(continuous_mask))\n if not discrete_target:\n y = scale(y, with_mean=False)\n y += 1e-10 * np.maximum(1, np.mean(np.abs(y))) * rng.randn(n_samples)\n mi = [_compute_mi(x, y, discrete_feature, discrete_target, n_neighbors) for (x, discrete_feature) in zip(_iterate_columns(X), discrete_mask)]\n return np.array(mi)" }, { @@ -82959,7 +88164,8 @@ "docstring": { "type": "ndarray or csc_matrix, shape (n_samples, n_features)", "description": "Matrix over which to iterate." - } + }, + "refined_type": {} }, { "name": "columns", @@ -82969,13 +88175,14 @@ "docstring": { "type": "iterable or None, default=None", "description": "Indices of columns to iterate over. If None, iterate over all columns." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Iterate over columns of a matrix.", - "docstring": "Iterate over columns of a matrix.\n\nParameters\n----------\nX : ndarray or csc_matrix, shape (n_samples, n_features)\n Matrix over which to iterate.\n\ncolumns : iterable or None, default=None\n Indices of columns to iterate over. If None, iterate over all columns.\n\nYields\n------\nx : ndarray, shape (n_samples,)\n Columns of `X` in dense format.", + "docstring": "Iterate over columns of a matrix.\n\n Parameters\n ----------\n X : ndarray or csc_matrix, shape (n_samples, n_features)\n Matrix over which to iterate.\n\n columns : iterable or None, default=None\n Indices of columns to iterate over. If None, iterate over all columns.\n\n Yields\n ------\n x : ndarray, shape (n_samples,)\n Columns of `X` in dense format.\n ", "source_code": "\ndef _iterate_columns(X, columns=None):\n \"\"\"Iterate over columns of a matrix.\n\n Parameters\n ----------\n X : ndarray or csc_matrix, shape (n_samples, n_features)\n Matrix over which to iterate.\n\n columns : iterable or None, default=None\n Indices of columns to iterate over. 
If None, iterate over all columns.\n\n Yields\n ------\n x : ndarray, shape (n_samples,)\n Columns of `X` in dense format.\n \"\"\"\n if columns is None:\n columns = range(X.shape[1])\n if issparse(X):\n for i in columns:\n x = np.zeros(X.shape[0])\n (start_ptr, end_ptr) = (X.indptr[i], X.indptr[i + 1])\n x[X.indices[start_ptr:end_ptr]] = X.data[start_ptr:end_ptr]\n yield x\n else:\n for i in columns:\n yield X[:, i]" }, { @@ -82993,7 +88200,8 @@ "docstring": { "type": "array-like or sparse matrix, shape (n_samples, n_features)", "description": "Feature matrix." - } + }, + "refined_type": {} }, { "name": "y", @@ -83003,7 +88211,8 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "Target vector." - } + }, + "refined_type": {} }, { "name": "discrete_features", @@ -83013,6 +88222,10 @@ "docstring": { "type": "{'auto', bool, array-like}, default='auto'", "description": "If bool, then determines whether to consider all features discrete\nor continuous. If array, then it should be either a boolean mask\nwith shape (n_features,) or array with indices of discrete features.\nIf 'auto', it is assigned to False for dense `X` and to True for\nsparse `X`." + }, + "refined_type": { + "kind": "EnumType", + "values": ["auto"] } }, { @@ -83023,7 +88236,8 @@ "docstring": { "type": "int, default=3", "description": "Number of neighbors to use for MI estimation for continuous variables,\nsee [2]_ and [3]_. Higher values reduce variance of the estimation, but\ncould introduce a bias." - } + }, + "refined_type": {} }, { "name": "copy", @@ -83033,7 +88247,8 @@ "docstring": { "type": "bool, default=True", "description": "Whether to make a copy of the given data. If set to False, the initial\ndata will be overwritten." - } + }, + "refined_type": {} }, { "name": "random_state", @@ -83043,13 +88258,14 @@ "docstring": { "type": "int, RandomState instance or None, default=None", "description": "Determines random number generation for adding small noise to\ncontinuous variables in order to remove repeated values.\nPass an int for reproducible results across multiple function calls.\nSee :term:`Glossary `." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Estimate mutual information for a discrete target variable.\n\nMutual information (MI) [1]_ between two random variables is a non-negative value, which measures the dependency between the variables. It is equal to zero if and only if two random variables are independent, and higher values mean higher dependency. The function relies on nonparametric methods based on entropy estimation from k-nearest neighbors distances as described in [2]_ and [3]_. Both methods are based on the idea originally proposed in [4]_. It can be used for univariate features selection, read more in the :ref:`User Guide `.", - "docstring": "Estimate mutual information for a discrete target variable.\n\nMutual information (MI) [1]_ between two random variables is a non-negative\nvalue, which measures the dependency between the variables. It is equal\nto zero if and only if two random variables are independent, and higher\nvalues mean higher dependency.\n\nThe function relies on nonparametric methods based on entropy estimation\nfrom k-nearest neighbors distances as described in [2]_ and [3]_. 
Both\nmethods are based on the idea originally proposed in [4]_.\n\nIt can be used for univariate features selection, read more in the\n:ref:`User Guide `.\n\nParameters\n----------\nX : array-like or sparse matrix, shape (n_samples, n_features)\n Feature matrix.\n\ny : array-like of shape (n_samples,)\n Target vector.\n\ndiscrete_features : {'auto', bool, array-like}, default='auto'\n If bool, then determines whether to consider all features discrete\n or continuous. If array, then it should be either a boolean mask\n with shape (n_features,) or array with indices of discrete features.\n If 'auto', it is assigned to False for dense `X` and to True for\n sparse `X`.\n\nn_neighbors : int, default=3\n Number of neighbors to use for MI estimation for continuous variables,\n see [2]_ and [3]_. Higher values reduce variance of the estimation, but\n could introduce a bias.\n\ncopy : bool, default=True\n Whether to make a copy of the given data. If set to False, the initial\n data will be overwritten.\n\nrandom_state : int, RandomState instance or None, default=None\n Determines random number generation for adding small noise to\n continuous variables in order to remove repeated values.\n Pass an int for reproducible results across multiple function calls.\n See :term:`Glossary `.\n\nReturns\n-------\nmi : ndarray, shape (n_features,)\n Estimated mutual information between each feature and the target.\n\nNotes\n-----\n1. The term \"discrete features\" is used instead of naming them\n \"categorical\", because it describes the essence more accurately.\n For example, pixel intensities of an image are discrete features\n (but hardly categorical) and you will get better results if mark them\n as such. Also note, that treating a continuous variable as discrete and\n vice versa will usually give incorrect results, so be attentive about\n that.\n2. True mutual information can't be negative. If its estimate turns out\n to be negative, it is replaced by zero.\n\nReferences\n----------\n.. [1] `Mutual Information\n `_\n on Wikipedia.\n.. [2] A. Kraskov, H. Stogbauer and P. Grassberger, \"Estimating mutual\n information\". Phys. Rev. E 69, 2004.\n.. [3] B. C. Ross \"Mutual Information between Discrete and Continuous\n Data Sets\". PLoS ONE 9(2), 2014.\n.. [4] L. F. Kozachenko, N. N. Leonenko, \"Sample Estimate of the Entropy\n of a Random Vector:, Probl. Peredachi Inf., 23:2 (1987), 9-16", + "description": "Estimate mutual information for a discrete target variable.\n\nMutual information (MI) [1]_ between two random variables is a non-negative\nvalue, which measures the dependency between the variables. It is equal\nto zero if and only if two random variables are independent, and higher\nvalues mean higher dependency.\n\nThe function relies on nonparametric methods based on entropy estimation\nfrom k-nearest neighbors distances as described in [2]_ and [3]_. Both\nmethods are based on the idea originally proposed in [4]_.\n\nIt can be used for univariate features selection, read more in the\n:ref:`User Guide `.", + "docstring": "Estimate mutual information for a discrete target variable.\n\n Mutual information (MI) [1]_ between two random variables is a non-negative\n value, which measures the dependency between the variables. It is equal\n to zero if and only if two random variables are independent, and higher\n values mean higher dependency.\n\n The function relies on nonparametric methods based on entropy estimation\n from k-nearest neighbors distances as described in [2]_ and [3]_. 
Both\n methods are based on the idea originally proposed in [4]_.\n\n It can be used for univariate features selection, read more in the\n :ref:`User Guide `.\n\n Parameters\n ----------\n X : array-like or sparse matrix, shape (n_samples, n_features)\n Feature matrix.\n\n y : array-like of shape (n_samples,)\n Target vector.\n\n discrete_features : {'auto', bool, array-like}, default='auto'\n If bool, then determines whether to consider all features discrete\n or continuous. If array, then it should be either a boolean mask\n with shape (n_features,) or array with indices of discrete features.\n If 'auto', it is assigned to False for dense `X` and to True for\n sparse `X`.\n\n n_neighbors : int, default=3\n Number of neighbors to use for MI estimation for continuous variables,\n see [2]_ and [3]_. Higher values reduce variance of the estimation, but\n could introduce a bias.\n\n copy : bool, default=True\n Whether to make a copy of the given data. If set to False, the initial\n data will be overwritten.\n\n random_state : int, RandomState instance or None, default=None\n Determines random number generation for adding small noise to\n continuous variables in order to remove repeated values.\n Pass an int for reproducible results across multiple function calls.\n See :term:`Glossary `.\n\n Returns\n -------\n mi : ndarray, shape (n_features,)\n Estimated mutual information between each feature and the target.\n\n Notes\n -----\n 1. The term \"discrete features\" is used instead of naming them\n \"categorical\", because it describes the essence more accurately.\n For example, pixel intensities of an image are discrete features\n (but hardly categorical) and you will get better results if mark them\n as such. Also note, that treating a continuous variable as discrete and\n vice versa will usually give incorrect results, so be attentive about\n that.\n 2. True mutual information can't be negative. If its estimate turns out\n to be negative, it is replaced by zero.\n\n References\n ----------\n .. [1] `Mutual Information\n `_\n on Wikipedia.\n .. [2] A. Kraskov, H. Stogbauer and P. Grassberger, \"Estimating mutual\n information\". Phys. Rev. E 69, 2004.\n .. [3] B. C. Ross \"Mutual Information between Discrete and Continuous\n Data Sets\". PLoS ONE 9(2), 2014.\n .. [4] L. F. Kozachenko, N. N. Leonenko, \"Sample Estimate of the Entropy\n of a Random Vector:, Probl. Peredachi Inf., 23:2 (1987), 9-16\n ", "source_code": "\ndef mutual_info_classif(X, y, *, discrete_features='auto', n_neighbors=3, copy=True, random_state=None):\n \"\"\"Estimate mutual information for a discrete target variable.\n\n Mutual information (MI) [1]_ between two random variables is a non-negative\n value, which measures the dependency between the variables. It is equal\n to zero if and only if two random variables are independent, and higher\n values mean higher dependency.\n\n The function relies on nonparametric methods based on entropy estimation\n from k-nearest neighbors distances as described in [2]_ and [3]_. Both\n methods are based on the idea originally proposed in [4]_.\n\n It can be used for univariate features selection, read more in the\n :ref:`User Guide `.\n\n Parameters\n ----------\n X : array-like or sparse matrix, shape (n_samples, n_features)\n Feature matrix.\n\n y : array-like of shape (n_samples,)\n Target vector.\n\n discrete_features : {'auto', bool, array-like}, default='auto'\n If bool, then determines whether to consider all features discrete\n or continuous. 
If array, then it should be either a boolean mask\n with shape (n_features,) or array with indices of discrete features.\n If 'auto', it is assigned to False for dense `X` and to True for\n sparse `X`.\n\n n_neighbors : int, default=3\n Number of neighbors to use for MI estimation for continuous variables,\n see [2]_ and [3]_. Higher values reduce variance of the estimation, but\n could introduce a bias.\n\n copy : bool, default=True\n Whether to make a copy of the given data. If set to False, the initial\n data will be overwritten.\n\n random_state : int, RandomState instance or None, default=None\n Determines random number generation for adding small noise to\n continuous variables in order to remove repeated values.\n Pass an int for reproducible results across multiple function calls.\n See :term:`Glossary `.\n\n Returns\n -------\n mi : ndarray, shape (n_features,)\n Estimated mutual information between each feature and the target.\n\n Notes\n -----\n 1. The term \"discrete features\" is used instead of naming them\n \"categorical\", because it describes the essence more accurately.\n For example, pixel intensities of an image are discrete features\n (but hardly categorical) and you will get better results if mark them\n as such. Also note, that treating a continuous variable as discrete and\n vice versa will usually give incorrect results, so be attentive about\n that.\n 2. True mutual information can't be negative. If its estimate turns out\n to be negative, it is replaced by zero.\n\n References\n ----------\n .. [1] `Mutual Information\n `_\n on Wikipedia.\n .. [2] A. Kraskov, H. Stogbauer and P. Grassberger, \"Estimating mutual\n information\". Phys. Rev. E 69, 2004.\n .. [3] B. C. Ross \"Mutual Information between Discrete and Continuous\n Data Sets\". PLoS ONE 9(2), 2014.\n .. [4] L. F. Kozachenko, N. N. Leonenko, \"Sample Estimate of the Entropy\n of a Random Vector:, Probl. Peredachi Inf., 23:2 (1987), 9-16\n \"\"\"\n check_classification_targets(y)\n return _estimate_mi(X, y, discrete_features, True, n_neighbors, copy, random_state)" }, { @@ -83067,7 +88283,8 @@ "docstring": { "type": "array-like or sparse matrix, shape (n_samples, n_features)", "description": "Feature matrix." - } + }, + "refined_type": {} }, { "name": "y", @@ -83077,7 +88294,8 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "Target vector." - } + }, + "refined_type": {} }, { "name": "discrete_features", @@ -83087,6 +88305,10 @@ "docstring": { "type": "{'auto', bool, array-like}, default='auto'", "description": "If bool, then determines whether to consider all features discrete\nor continuous. If array, then it should be either a boolean mask\nwith shape (n_features,) or array with indices of discrete features.\nIf 'auto', it is assigned to False for dense `X` and to True for\nsparse `X`." + }, + "refined_type": { + "kind": "EnumType", + "values": ["auto"] } }, { @@ -83097,7 +88319,8 @@ "docstring": { "type": "int, default=3", "description": "Number of neighbors to use for MI estimation for continuous variables,\nsee [2]_ and [3]_. Higher values reduce variance of the estimation, but\ncould introduce a bias." - } + }, + "refined_type": {} }, { "name": "copy", @@ -83107,7 +88330,8 @@ "docstring": { "type": "bool, default=True", "description": "Whether to make a copy of the given data. If set to False, the initial\ndata will be overwritten." 
- } + }, + "refined_type": {} }, { "name": "random_state", @@ -83117,13 +88341,14 @@ "docstring": { "type": "int, RandomState instance or None, default=None", "description": "Determines random number generation for adding small noise to\ncontinuous variables in order to remove repeated values.\nPass an int for reproducible results across multiple function calls.\nSee :term:`Glossary `." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Estimate mutual information for a continuous target variable.\n\nMutual information (MI) [1]_ between two random variables is a non-negative value, which measures the dependency between the variables. It is equal to zero if and only if two random variables are independent, and higher values mean higher dependency. The function relies on nonparametric methods based on entropy estimation from k-nearest neighbors distances as described in [2]_ and [3]_. Both methods are based on the idea originally proposed in [4]_. It can be used for univariate features selection, read more in the :ref:`User Guide `.", - "docstring": "Estimate mutual information for a continuous target variable.\n\nMutual information (MI) [1]_ between two random variables is a non-negative\nvalue, which measures the dependency between the variables. It is equal\nto zero if and only if two random variables are independent, and higher\nvalues mean higher dependency.\n\nThe function relies on nonparametric methods based on entropy estimation\nfrom k-nearest neighbors distances as described in [2]_ and [3]_. Both\nmethods are based on the idea originally proposed in [4]_.\n\nIt can be used for univariate features selection, read more in the\n:ref:`User Guide `.\n\nParameters\n----------\nX : array-like or sparse matrix, shape (n_samples, n_features)\n Feature matrix.\n\ny : array-like of shape (n_samples,)\n Target vector.\n\ndiscrete_features : {'auto', bool, array-like}, default='auto'\n If bool, then determines whether to consider all features discrete\n or continuous. If array, then it should be either a boolean mask\n with shape (n_features,) or array with indices of discrete features.\n If 'auto', it is assigned to False for dense `X` and to True for\n sparse `X`.\n\nn_neighbors : int, default=3\n Number of neighbors to use for MI estimation for continuous variables,\n see [2]_ and [3]_. Higher values reduce variance of the estimation, but\n could introduce a bias.\n\ncopy : bool, default=True\n Whether to make a copy of the given data. If set to False, the initial\n data will be overwritten.\n\nrandom_state : int, RandomState instance or None, default=None\n Determines random number generation for adding small noise to\n continuous variables in order to remove repeated values.\n Pass an int for reproducible results across multiple function calls.\n See :term:`Glossary `.\n\nReturns\n-------\nmi : ndarray, shape (n_features,)\n Estimated mutual information between each feature and the target.\n\nNotes\n-----\n1. The term \"discrete features\" is used instead of naming them\n \"categorical\", because it describes the essence more accurately.\n For example, pixel intensities of an image are discrete features\n (but hardly categorical) and you will get better results if mark them\n as such. Also note, that treating a continuous variable as discrete and\n vice versa will usually give incorrect results, so be attentive about\n that.\n2. True mutual information can't be negative. 
If its estimate turns out\n to be negative, it is replaced by zero.\n\nReferences\n----------\n.. [1] `Mutual Information\n `_\n on Wikipedia.\n.. [2] A. Kraskov, H. Stogbauer and P. Grassberger, \"Estimating mutual\n information\". Phys. Rev. E 69, 2004.\n.. [3] B. C. Ross \"Mutual Information between Discrete and Continuous\n Data Sets\". PLoS ONE 9(2), 2014.\n.. [4] L. F. Kozachenko, N. N. Leonenko, \"Sample Estimate of the Entropy\n of a Random Vector\", Probl. Peredachi Inf., 23:2 (1987), 9-16", + "description": "Estimate mutual information for a continuous target variable.\n\nMutual information (MI) [1]_ between two random variables is a non-negative\nvalue, which measures the dependency between the variables. It is equal\nto zero if and only if two random variables are independent, and higher\nvalues mean higher dependency.\n\nThe function relies on nonparametric methods based on entropy estimation\nfrom k-nearest neighbors distances as described in [2]_ and [3]_. Both\nmethods are based on the idea originally proposed in [4]_.\n\nIt can be used for univariate features selection, read more in the\n:ref:`User Guide `.", + "docstring": "Estimate mutual information for a continuous target variable.\n\n Mutual information (MI) [1]_ between two random variables is a non-negative\n value, which measures the dependency between the variables. It is equal\n to zero if and only if two random variables are independent, and higher\n values mean higher dependency.\n\n The function relies on nonparametric methods based on entropy estimation\n from k-nearest neighbors distances as described in [2]_ and [3]_. Both\n methods are based on the idea originally proposed in [4]_.\n\n It can be used for univariate features selection, read more in the\n :ref:`User Guide `.\n\n Parameters\n ----------\n X : array-like or sparse matrix, shape (n_samples, n_features)\n Feature matrix.\n\n y : array-like of shape (n_samples,)\n Target vector.\n\n discrete_features : {'auto', bool, array-like}, default='auto'\n If bool, then determines whether to consider all features discrete\n or continuous. If array, then it should be either a boolean mask\n with shape (n_features,) or array with indices of discrete features.\n If 'auto', it is assigned to False for dense `X` and to True for\n sparse `X`.\n\n n_neighbors : int, default=3\n Number of neighbors to use for MI estimation for continuous variables,\n see [2]_ and [3]_. Higher values reduce variance of the estimation, but\n could introduce a bias.\n\n copy : bool, default=True\n Whether to make a copy of the given data. If set to False, the initial\n data will be overwritten.\n\n random_state : int, RandomState instance or None, default=None\n Determines random number generation for adding small noise to\n continuous variables in order to remove repeated values.\n Pass an int for reproducible results across multiple function calls.\n See :term:`Glossary `.\n\n Returns\n -------\n mi : ndarray, shape (n_features,)\n Estimated mutual information between each feature and the target.\n\n Notes\n -----\n 1. The term \"discrete features\" is used instead of naming them\n \"categorical\", because it describes the essence more accurately.\n For example, pixel intensities of an image are discrete features\n (but hardly categorical) and you will get better results if mark them\n as such. Also note, that treating a continuous variable as discrete and\n vice versa will usually give incorrect results, so be attentive about\n that.\n 2. True mutual information can't be negative. 
If its estimate turns out\n to be negative, it is replaced by zero.\n\n References\n ----------\n .. [1] `Mutual Information\n `_\n on Wikipedia.\n .. [2] A. Kraskov, H. Stogbauer and P. Grassberger, \"Estimating mutual\n information\". Phys. Rev. E 69, 2004.\n .. [3] B. C. Ross \"Mutual Information between Discrete and Continuous\n Data Sets\". PLoS ONE 9(2), 2014.\n .. [4] L. F. Kozachenko, N. N. Leonenko, \"Sample Estimate of the Entropy\n of a Random Vector\", Probl. Peredachi Inf., 23:2 (1987), 9-16\n ", "source_code": "\ndef mutual_info_regression(X, y, *, discrete_features='auto', n_neighbors=3, copy=True, random_state=None):\n \"\"\"Estimate mutual information for a continuous target variable.\n\n Mutual information (MI) [1]_ between two random variables is a non-negative\n value, which measures the dependency between the variables. It is equal\n to zero if and only if two random variables are independent, and higher\n values mean higher dependency.\n\n The function relies on nonparametric methods based on entropy estimation\n from k-nearest neighbors distances as described in [2]_ and [3]_. Both\n methods are based on the idea originally proposed in [4]_.\n\n It can be used for univariate features selection, read more in the\n :ref:`User Guide `.\n\n Parameters\n ----------\n X : array-like or sparse matrix, shape (n_samples, n_features)\n Feature matrix.\n\n y : array-like of shape (n_samples,)\n Target vector.\n\n discrete_features : {'auto', bool, array-like}, default='auto'\n If bool, then determines whether to consider all features discrete\n or continuous. If array, then it should be either a boolean mask\n with shape (n_features,) or array with indices of discrete features.\n If 'auto', it is assigned to False for dense `X` and to True for\n sparse `X`.\n\n n_neighbors : int, default=3\n Number of neighbors to use for MI estimation for continuous variables,\n see [2]_ and [3]_. Higher values reduce variance of the estimation, but\n could introduce a bias.\n\n copy : bool, default=True\n Whether to make a copy of the given data. If set to False, the initial\n data will be overwritten.\n\n random_state : int, RandomState instance or None, default=None\n Determines random number generation for adding small noise to\n continuous variables in order to remove repeated values.\n Pass an int for reproducible results across multiple function calls.\n See :term:`Glossary `.\n\n Returns\n -------\n mi : ndarray, shape (n_features,)\n Estimated mutual information between each feature and the target.\n\n Notes\n -----\n 1. The term \"discrete features\" is used instead of naming them\n \"categorical\", because it describes the essence more accurately.\n For example, pixel intensities of an image are discrete features\n (but hardly categorical) and you will get better results if mark them\n as such. Also note, that treating a continuous variable as discrete and\n vice versa will usually give incorrect results, so be attentive about\n that.\n 2. True mutual information can't be negative. If its estimate turns out\n to be negative, it is replaced by zero.\n\n References\n ----------\n .. [1] `Mutual Information\n `_\n on Wikipedia.\n .. [2] A. Kraskov, H. Stogbauer and P. Grassberger, \"Estimating mutual\n information\". Phys. Rev. E 69, 2004.\n .. [3] B. C. Ross \"Mutual Information between Discrete and Continuous\n Data Sets\". PLoS ONE 9(2), 2014.\n .. [4] L. F. Kozachenko, N. N. Leonenko, \"Sample Estimate of the Entropy\n of a Random Vector\", Probl. 
Peredachi Inf., 23:2 (1987), 9-16\n \"\"\"\n return _estimate_mi(X, y, discrete_features, False, n_neighbors, copy, random_state)" }, { @@ -83141,7 +88366,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "estimator", @@ -83151,7 +88377,8 @@ "docstring": { "type": "``Estimator`` instance", "description": "A supervised learning estimator with a ``fit`` method that provides\ninformation about feature importance\n(e.g. `coef_`, `feature_importances_`)." - } + }, + "refined_type": {} }, { "name": "n_features_to_select", @@ -83161,7 +88388,8 @@ "docstring": { "type": "int or float, default=None", "description": "The number of features to select. If `None`, half of the features are\nselected. If integer, the parameter is the absolute number of features\nto select. If float between 0 and 1, it is the fraction of features to\nselect.\n\n.. versionchanged:: 0.24\n Added float values for fractions." - } + }, + "refined_type": {} }, { "name": "step", @@ -83171,7 +88399,8 @@ "docstring": { "type": "int or float, default=1", "description": "If greater than or equal to 1, then ``step`` corresponds to the\n(integer) number of features to remove at each iteration.\nIf within (0.0, 1.0), then ``step`` corresponds to the percentage\n(rounded down) of features to remove at each iteration." - } + }, + "refined_type": {} }, { "name": "verbose", @@ -83181,7 +88410,8 @@ "docstring": { "type": "int, default=0", "description": "Controls verbosity of output." - } + }, + "refined_type": {} }, { "name": "importance_getter", @@ -83191,13 +88421,14 @@ "docstring": { "type": "str or callable, default='auto'", "description": "If 'auto', uses the feature importance either through a `coef_`\nor `feature_importances_` attributes of estimator.\n\nAlso accepts a string that specifies an attribute name/path\nfor extracting feature importance (implemented with `attrgetter`).\nFor example, give `regressor_.coef_` in case of\n:class:`~sklearn.compose.TransformedTargetRegressor` or\n`named_steps.clf.feature_importances_` in case of\nclass:`~sklearn.pipeline.Pipeline` with its last step named `clf`.\n\nIf `callable`, overrides the default feature importance getter.\nThe callable is passed with the fitted estimator and it should\nreturn importance for each feature.\n\n.. 
versionadded:: 0.24" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, estimator, *, n_features_to_select=None, step=1, verbose=0, importance_getter='auto'):\n self.estimator = estimator\n self.n_features_to_select = n_features_to_select\n self.step = step\n self.importance_getter = importance_getter\n self.verbose = verbose" }, { @@ -83215,13 +88446,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@property\ndef _estimator_type(self):\n return self.estimator._estimator_type" }, { @@ -83239,7 +88471,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -83249,7 +88482,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -83259,7 +88493,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "step_score", @@ -83269,13 +88504,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _fit(self, X, y, step_score=None, **fit_params):\n tags = self._get_tags()\n (X, y) = self._validate_data(X, y, accept_sparse='csc', ensure_min_features=2, force_all_finite=not tags.get('allow_nan', True), multi_output=True)\n error_msg = f'n_features_to_select must be either None, a positive integer representing the absolute number of features or a float in (0.0, 1.0] representing a percentage of features to select. Got {self.n_features_to_select}'\n n_features = X.shape[1]\n if self.n_features_to_select is None:\n n_features_to_select = n_features // 2\n elif self.n_features_to_select < 0:\n raise ValueError(error_msg)\n elif isinstance(self.n_features_to_select, numbers.Integral):\n n_features_to_select = self.n_features_to_select\n elif self.n_features_to_select > 1.0:\n raise ValueError(error_msg)\n else:\n n_features_to_select = int(n_features * self.n_features_to_select)\n if 0.0 < self.step < 1.0:\n step = int(max(1, self.step * n_features))\n else:\n step = int(self.step)\n if step <= 0:\n raise ValueError('Step must be >0')\n support_ = np.ones(n_features, dtype=bool)\n ranking_ = np.ones(n_features, dtype=int)\n if step_score:\n self.scores_ = []\n while np.sum(support_) > n_features_to_select:\n features = np.arange(n_features)[support_]\n estimator = clone(self.estimator)\n if self.verbose > 0:\n print('Fitting estimator with %d features.' 
% np.sum(support_))\n estimator.fit(X[:, features], y, **fit_params)\n importances = _get_feature_importances(estimator, self.importance_getter, transform_func='square')\n ranks = np.argsort(importances)\n ranks = np.ravel(ranks)\n threshold = min(step, np.sum(support_) - n_features_to_select)\n if step_score:\n self.scores_.append(step_score(estimator, features))\n support_[features[ranks][:threshold]] = False\n ranking_[np.logical_not(support_)] += 1\n features = np.arange(n_features)[support_]\n self.estimator_ = clone(self.estimator)\n self.estimator_.fit(X[:, features], y, **fit_params)\n if step_score:\n self.scores_.append(step_score(self.estimator_, features))\n self.n_features_ = support_.sum()\n self.support_ = support_\n self.ranking_ = ranking_\n return self" }, { @@ -83293,13 +88529,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _get_support_mask(self):\n check_is_fitted(self)\n return self.support_" }, { @@ -83317,13 +88554,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _more_tags(self):\n return {'poor_score': True, 'allow_nan': _safe_tags(self.estimator, key='allow_nan'), 'requires_y': True}" }, { @@ -83341,13 +88579,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Classes labels available when `estimator` is a classifier.", - "docstring": "Classes labels available when `estimator` is a classifier.\n\nReturns\n-------\nndarray of shape (n_classes,)", + "docstring": "Classes labels available when `estimator` is a classifier.\n\n Returns\n -------\n ndarray of shape (n_classes,)\n ", "source_code": "\n@property\ndef classes_(self):\n \"\"\"Classes labels available when `estimator` is a classifier.\n\n Returns\n -------\n ndarray of shape (n_classes,)\n \"\"\"\n return self.estimator_.classes_" }, { @@ -83365,7 +88604,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -83375,13 +88615,17 @@ "docstring": { "type": "{array-like or sparse matrix} of shape (n_samples, n_features)", "description": "The input samples. Internally, it will be converted to\n``dtype=np.float32`` and if a sparse matrix is provided\nto a sparse ``csr_matrix``." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } } ], "results": [], "is_public": true, "description": "Compute the decision function of ``X``.", - "docstring": "Compute the decision function of ``X``.\n\nParameters\n----------\nX : {array-like or sparse matrix} of shape (n_samples, n_features)\n The input samples. Internally, it will be converted to\n ``dtype=np.float32`` and if a sparse matrix is provided\n to a sparse ``csr_matrix``.\n\nReturns\n-------\nscore : array, shape = [n_samples, n_classes] or [n_samples]\n The decision function of the input samples. The order of the\n classes corresponds to that in the attribute :term:`classes_`.\n Regression and binary classification produce an array of shape\n [n_samples].", + "docstring": "Compute the decision function of ``X``.\n\n Parameters\n ----------\n X : {array-like or sparse matrix} of shape (n_samples, n_features)\n The input samples. 
Internally, it will be converted to\n ``dtype=np.float32`` and if a sparse matrix is provided\n to a sparse ``csr_matrix``.\n\n Returns\n -------\n score : array, shape = [n_samples, n_classes] or [n_samples]\n The decision function of the input samples. The order of the\n classes corresponds to that in the attribute :term:`classes_`.\n Regression and binary classification produce an array of shape\n [n_samples].\n ", "source_code": "\n@if_delegate_has_method(delegate='estimator')\ndef decision_function(self, X):\n \"\"\"Compute the decision function of ``X``.\n\n Parameters\n ----------\n X : {array-like or sparse matrix} of shape (n_samples, n_features)\n The input samples. Internally, it will be converted to\n ``dtype=np.float32`` and if a sparse matrix is provided\n to a sparse ``csr_matrix``.\n\n Returns\n -------\n score : array, shape = [n_samples, n_classes] or [n_samples]\n The decision function of the input samples. The order of the\n classes corresponds to that in the attribute :term:`classes_`.\n Regression and binary classification produce an array of shape\n [n_samples].\n \"\"\"\n check_is_fitted(self)\n return self.estimator_.decision_function(self.transform(X))" }, { @@ -83399,7 +88643,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -83409,6 +88654,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "The training input samples." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -83419,13 +88668,14 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "The target values." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Fit the RFE model and then the underlying estimator on the selected features.", - "docstring": "Fit the RFE model and then the underlying estimator on the selected features.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n The training input samples.\n\ny : array-like of shape (n_samples,)\n The target values.\n\n**fit_params : dict\n Additional parameters passed to the `fit` method of the underlying\n estimator.\n\nReturns\n-------\nself : object\n Fitted estimator.", + "docstring": "Fit the RFE model and then the underlying estimator on the selected features.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The training input samples.\n\n y : array-like of shape (n_samples,)\n The target values.\n\n **fit_params : dict\n Additional parameters passed to the `fit` method of the underlying\n estimator.\n\n Returns\n -------\n self : object\n Fitted estimator.\n ", "source_code": "\ndef fit(self, X, y, **fit_params):\n \"\"\"Fit the RFE model and then the underlying estimator on the selected features.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The training input samples.\n\n y : array-like of shape (n_samples,)\n The target values.\n\n **fit_params : dict\n Additional parameters passed to the `fit` method of the underlying\n estimator.\n\n Returns\n -------\n self : object\n Fitted estimator.\n \"\"\"\n return self._fit(X, y, **fit_params)" }, { @@ -83443,7 +88693,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -83453,13 +88704,14 @@ "docstring": { "type": "array of shape [n_samples, n_features]", "description": "The input samples." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Reduce X to the selected features and then predict using the underlying estimator.", - "docstring": "Reduce X to the selected features and then predict using the underlying estimator.\n\nParameters\n----------\nX : array of shape [n_samples, n_features]\n The input samples.\n\nReturns\n-------\ny : array of shape [n_samples]\n The predicted target values.", + "docstring": "Reduce X to the selected features and then predict using the underlying estimator.\n\n Parameters\n ----------\n X : array of shape [n_samples, n_features]\n The input samples.\n\n Returns\n -------\n y : array of shape [n_samples]\n The predicted target values.\n ", "source_code": "\n@if_delegate_has_method(delegate='estimator')\ndef predict(self, X):\n \"\"\"Reduce X to the selected features and then predict using the underlying estimator.\n\n Parameters\n ----------\n X : array of shape [n_samples, n_features]\n The input samples.\n\n Returns\n -------\n y : array of shape [n_samples]\n The predicted target values.\n \"\"\"\n check_is_fitted(self)\n return self.estimator_.predict(self.transform(X))" }, { @@ -83477,7 +88729,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -83487,13 +88740,14 @@ "docstring": { "type": "array of shape [n_samples, n_features]", "description": "The input samples." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Predict class log-probabilities for X.", - "docstring": "Predict class log-probabilities for X.\n\nParameters\n----------\nX : array of shape [n_samples, n_features]\n The input samples.\n\nReturns\n-------\np : array of shape (n_samples, n_classes)\n The class log-probabilities of the input samples. The order of the\n classes corresponds to that in the attribute :term:`classes_`.", + "docstring": "Predict class log-probabilities for X.\n\n Parameters\n ----------\n X : array of shape [n_samples, n_features]\n The input samples.\n\n Returns\n -------\n p : array of shape (n_samples, n_classes)\n The class log-probabilities of the input samples. The order of the\n classes corresponds to that in the attribute :term:`classes_`.\n ", "source_code": "\n@if_delegate_has_method(delegate='estimator')\ndef predict_log_proba(self, X):\n \"\"\"Predict class log-probabilities for X.\n\n Parameters\n ----------\n X : array of shape [n_samples, n_features]\n The input samples.\n\n Returns\n -------\n p : array of shape (n_samples, n_classes)\n The class log-probabilities of the input samples. The order of the\n classes corresponds to that in the attribute :term:`classes_`.\n \"\"\"\n check_is_fitted(self)\n return self.estimator_.predict_log_proba(self.transform(X))" }, { @@ -83511,7 +88765,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -83521,13 +88776,17 @@ "docstring": { "type": "{array-like or sparse matrix} of shape (n_samples, n_features)", "description": "The input samples. Internally, it will be converted to\n``dtype=np.float32`` and if a sparse matrix is provided\nto a sparse ``csr_matrix``." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } } ], "results": [], "is_public": true, "description": "Predict class probabilities for X.", - "docstring": "Predict class probabilities for X.\n\nParameters\n----------\nX : {array-like or sparse matrix} of shape (n_samples, n_features)\n The input samples. 
Internally, it will be converted to\n ``dtype=np.float32`` and if a sparse matrix is provided\n to a sparse ``csr_matrix``.\n\nReturns\n-------\np : array of shape (n_samples, n_classes)\n The class probabilities of the input samples. The order of the\n classes corresponds to that in the attribute :term:`classes_`.", + "docstring": "Predict class probabilities for X.\n\n Parameters\n ----------\n X : {array-like or sparse matrix} of shape (n_samples, n_features)\n The input samples. Internally, it will be converted to\n ``dtype=np.float32`` and if a sparse matrix is provided\n to a sparse ``csr_matrix``.\n\n Returns\n -------\n p : array of shape (n_samples, n_classes)\n The class probabilities of the input samples. The order of the\n classes corresponds to that in the attribute :term:`classes_`.\n ", "source_code": "\n@if_delegate_has_method(delegate='estimator')\ndef predict_proba(self, X):\n \"\"\"Predict class probabilities for X.\n\n Parameters\n ----------\n X : {array-like or sparse matrix} of shape (n_samples, n_features)\n The input samples. Internally, it will be converted to\n ``dtype=np.float32`` and if a sparse matrix is provided\n to a sparse ``csr_matrix``.\n\n Returns\n -------\n p : array of shape (n_samples, n_classes)\n The class probabilities of the input samples. The order of the\n classes corresponds to that in the attribute :term:`classes_`.\n \"\"\"\n check_is_fitted(self)\n return self.estimator_.predict_proba(self.transform(X))" }, { @@ -83545,7 +88804,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -83555,7 +88815,8 @@ "docstring": { "type": "array of shape [n_samples, n_features]", "description": "The input samples." - } + }, + "refined_type": {} }, { "name": "y", @@ -83565,13 +88826,14 @@ "docstring": { "type": "array of shape [n_samples]", "description": "The target values." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Reduce X to the selected features and return the score of the underlying estimator.", - "docstring": "Reduce X to the selected features and return the score of the underlying estimator.\n\nParameters\n----------\nX : array of shape [n_samples, n_features]\n The input samples.\n\ny : array of shape [n_samples]\n The target values.\n\n**fit_params : dict\n Parameters to pass to the `score` method of the underlying\n estimator.\n\n .. versionadded:: 1.0\n\nReturns\n-------\nscore : float\n Score of the underlying base estimator computed with the selected\n features returned by `rfe.transform(X)` and `y`.", + "docstring": "Reduce X to the selected features and return the score of the underlying estimator.\n\n Parameters\n ----------\n X : array of shape [n_samples, n_features]\n The input samples.\n\n y : array of shape [n_samples]\n The target values.\n\n **fit_params : dict\n Parameters to pass to the `score` method of the underlying\n estimator.\n\n .. 
versionadded:: 1.0\n\n Returns\n -------\n score : float\n Score of the underlying base estimator computed with the selected\n features returned by `rfe.transform(X)` and `y`.\n ", "source_code": "\n@if_delegate_has_method(delegate='estimator')\ndef score(self, X, y, **fit_params):\n \"\"\"Reduce X to the selected features and return the score of the underlying estimator.\n\n Parameters\n ----------\n X : array of shape [n_samples, n_features]\n The input samples.\n\n y : array of shape [n_samples]\n The target values.\n\n **fit_params : dict\n Parameters to pass to the `score` method of the underlying\n estimator.\n\n .. versionadded:: 1.0\n\n Returns\n -------\n score : float\n Score of the underlying base estimator computed with the selected\n features returned by `rfe.transform(X)` and `y`.\n \"\"\"\n check_is_fitted(self)\n return self.estimator_.score(self.transform(X), y, **fit_params)" }, { @@ -83589,7 +88851,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "estimator", @@ -83599,7 +88862,8 @@ "docstring": { "type": "``Estimator`` instance", "description": "A supervised learning estimator with a ``fit`` method that provides\ninformation about feature importance either through a ``coef_``\nattribute or through a ``feature_importances_`` attribute." - } + }, + "refined_type": {} }, { "name": "step", @@ -83609,7 +88873,8 @@ "docstring": { "type": "int or float, default=1", "description": "If greater than or equal to 1, then ``step`` corresponds to the\n(integer) number of features to remove at each iteration.\nIf within (0.0, 1.0), then ``step`` corresponds to the percentage\n(rounded down) of features to remove at each iteration.\nNote that the last iteration may remove fewer than ``step`` features in\norder to reach ``min_features_to_select``." - } + }, + "refined_type": {} }, { "name": "min_features_to_select", @@ -83619,7 +88884,8 @@ "docstring": { "type": "int, default=1", "description": "The minimum number of features to be selected. This number of features\nwill always be scored, even if the difference between the original\nfeature count and ``min_features_to_select`` isn't divisible by\n``step``.\n\n.. versionadded:: 0.20" - } + }, + "refined_type": {} }, { "name": "cv", @@ -83629,7 +88895,8 @@ "docstring": { "type": "int, cross-validation generator or an iterable, default=None", "description": "Determines the cross-validation splitting strategy.\nPossible inputs for cv are:\n\n- None, to use the default 5-fold cross-validation,\n- integer, to specify the number of folds.\n- :term:`CV splitter`,\n- An iterable yielding (train, test) splits as arrays of indices.\n\nFor integer/None inputs, if ``y`` is binary or multiclass,\n:class:`~sklearn.model_selection.StratifiedKFold` is used. If the\nestimator is a classifier or if ``y`` is neither binary nor multiclass,\n:class:`~sklearn.model_selection.KFold` is used.\n\nRefer :ref:`User Guide ` for the various\ncross-validation strategies that can be used here.\n\n.. versionchanged:: 0.22\n ``cv`` default value of None changed from 3-fold to 5-fold." - } + }, + "refined_type": {} }, { "name": "scoring", @@ -83639,7 +88906,8 @@ "docstring": { "type": "str, callable or None, default=None", "description": "A string (see model evaluation documentation) or\na scorer callable object / function with signature\n``scorer(estimator, X, y)``." 
- } + }, + "refined_type": {} }, { "name": "verbose", @@ -83649,7 +88917,8 @@ "docstring": { "type": "int, default=0", "description": "Controls verbosity of output." - } + }, + "refined_type": {} }, { "name": "n_jobs", @@ -83659,7 +88928,8 @@ "docstring": { "type": "int or None, default=None", "description": "Number of cores to run in parallel while fitting across folds.\n``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n``-1`` means using all processors. See :term:`Glossary `\nfor more details.\n\n.. versionadded:: 0.18" - } + }, + "refined_type": {} }, { "name": "importance_getter", @@ -83669,13 +88939,14 @@ "docstring": { "type": "str or callable, default='auto'", "description": "If 'auto', uses the feature importance either through a `coef_`\nor `feature_importances_` attributes of estimator.\n\nAlso accepts a string that specifies an attribute name/path\nfor extracting feature importance.\nFor example, give `regressor_.coef_` in case of\n:class:`~sklearn.compose.TransformedTargetRegressor` or\n`named_steps.clf.feature_importances_` in case of\n:class:`~sklearn.pipeline.Pipeline` with its last step named `clf`.\n\nIf `callable`, overrides the default feature importance getter.\nThe callable is passed with the fitted estimator and it should\nreturn importance for each feature.\n\n.. versionadded:: 0.24" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, estimator, *, step=1, min_features_to_select=1, cv=None, scoring=None, verbose=0, n_jobs=None, importance_getter='auto'):\n self.estimator = estimator\n self.step = step\n self.importance_getter = importance_getter\n self.cv = cv\n self.scoring = scoring\n self.verbose = verbose\n self.n_jobs = n_jobs\n self.min_features_to_select = min_features_to_select" }, { @@ -83693,7 +88964,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -83703,6 +88975,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "Training vector, where `n_samples` is the number of samples and\n`n_features` is the total number of features." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -83713,7 +88989,8 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "Target values (integers for classification, real numbers for\nregression)." - } + }, + "refined_type": {} }, { "name": "groups", @@ -83723,14 +89000,15 @@ "docstring": { "type": "array-like of shape (n_samples,) or None, default=None", "description": "Group labels for the samples used while splitting the dataset into\ntrain/test set. Only used in conjunction with a \"Group\" :term:`cv`\ninstance (e.g., :class:`~sklearn.model_selection.GroupKFold`).\n\n.. 
versionadded:: 0.20" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Fit the RFE model and automatically tune the number of selected features.", - "docstring": "Fit the RFE model and automatically tune the number of selected features.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training vector, where `n_samples` is the number of samples and\n `n_features` is the total number of features.\n\ny : array-like of shape (n_samples,)\n Target values (integers for classification, real numbers for\n regression).\n\ngroups : array-like of shape (n_samples,) or None, default=None\n Group labels for the samples used while splitting the dataset into\n train/test set. Only used in conjunction with a \"Group\" :term:`cv`\n instance (e.g., :class:`~sklearn.model_selection.GroupKFold`).\n\n .. versionadded:: 0.20\n\nReturns\n-------\nself : object\n Fitted estimator.", - "source_code": "\ndef fit(self, X, y, groups=None):\n \"\"\"Fit the RFE model and automatically tune the number of selected features.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training vector, where `n_samples` is the number of samples and\n `n_features` is the total number of features.\n\n y : array-like of shape (n_samples,)\n Target values (integers for classification, real numbers for\n regression).\n\n groups : array-like of shape (n_samples,) or None, default=None\n Group labels for the samples used while splitting the dataset into\n train/test set. Only used in conjunction with a \"Group\" :term:`cv`\n instance (e.g., :class:`~sklearn.model_selection.GroupKFold`).\n\n .. versionadded:: 0.20\n\n Returns\n -------\n self : object\n Fitted estimator.\n \"\"\"\n tags = self._get_tags()\n (X, y) = self._validate_data(X, y, accept_sparse='csr', ensure_min_features=2, force_all_finite=not tags.get('allow_nan', True), multi_output=True)\n cv = check_cv(self.cv, y, classifier=is_classifier(self.estimator))\n scorer = check_scoring(self.estimator, scoring=self.scoring)\n n_features = X.shape[1]\n if 0.0 < self.step < 1.0:\n step = int(max(1, self.step * n_features))\n else:\n step = int(self.step)\n if step <= 0:\n raise ValueError('Step must be >0')\n rfe = RFE(estimator=self.estimator, n_features_to_select=self.min_features_to_select, importance_getter=self.importance_getter, step=self.step, verbose=self.verbose)\n if effective_n_jobs(self.n_jobs) == 1:\n (parallel, func) = (list, _rfe_single_fit)\n else:\n parallel = Parallel(n_jobs=self.n_jobs)\n func = delayed(_rfe_single_fit)\n scores = parallel((func(rfe, self.estimator, X, y, train, test, scorer) for (train, test) in cv.split(X, y, groups)))\n scores = np.array(scores)\n scores_sum = np.sum(scores, axis=0)\n scores_sum_rev = scores_sum[::-1]\n argmax_idx = len(scores_sum) - np.argmax(scores_sum_rev) - 1\n n_features_to_select = max(n_features - argmax_idx * step, self.min_features_to_select)\n rfe = RFE(estimator=self.estimator, n_features_to_select=n_features_to_select, step=self.step, importance_getter=self.importance_getter, verbose=self.verbose)\n rfe.fit(X, y)\n self.support_ = rfe.support_\n self.n_features_ = rfe.n_features_\n self.ranking_ = rfe.ranking_\n self.estimator_ = clone(self.estimator)\n self.estimator_.fit(self.transform(X), y)\n scores_rev = scores[:, ::-1]\n self.cv_results_ = {}\n self.cv_results_['mean_test_score'] = np.mean(scores_rev, axis=0)\n self.cv_results_['std_test_score'] = np.std(scores_rev, axis=0)\n for i in 
range(scores.shape[0]):\n self.cv_results_[f'split{i}_test_score'] = scores_rev[i]\n return self" + "docstring": "Fit the RFE model and automatically tune the number of selected features.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training vector, where `n_samples` is the number of samples and\n `n_features` is the total number of features.\n\n y : array-like of shape (n_samples,)\n Target values (integers for classification, real numbers for\n regression).\n\n groups : array-like of shape (n_samples,) or None, default=None\n Group labels for the samples used while splitting the dataset into\n train/test set. Only used in conjunction with a \"Group\" :term:`cv`\n instance (e.g., :class:`~sklearn.model_selection.GroupKFold`).\n\n .. versionadded:: 0.20\n\n Returns\n -------\n self : object\n Fitted estimator.\n ", + "source_code": "\ndef fit(self, X, y, groups=None):\n \"\"\"Fit the RFE model and automatically tune the number of selected features.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training vector, where `n_samples` is the number of samples and\n `n_features` is the total number of features.\n\n y : array-like of shape (n_samples,)\n Target values (integers for classification, real numbers for\n regression).\n\n groups : array-like of shape (n_samples,) or None, default=None\n Group labels for the samples used while splitting the dataset into\n train/test set. Only used in conjunction with a \"Group\" :term:`cv`\n instance (e.g., :class:`~sklearn.model_selection.GroupKFold`).\n\n .. versionadded:: 0.20\n\n Returns\n -------\n self : object\n Fitted estimator.\n \"\"\"\n tags = self._get_tags()\n (X, y) = self._validate_data(X, y, accept_sparse='csr', ensure_min_features=2, force_all_finite=not tags.get('allow_nan', True), multi_output=True)\n cv = check_cv(self.cv, y, classifier=is_classifier(self.estimator))\n scorer = check_scoring(self.estimator, scoring=self.scoring)\n n_features = X.shape[1]\n if 0.0 < self.step < 1.0:\n step = int(max(1, self.step * n_features))\n else:\n step = int(self.step)\n if step <= 0:\n raise ValueError('Step must be >0')\n rfe = RFE(estimator=self.estimator, n_features_to_select=self.min_features_to_select, importance_getter=self.importance_getter, step=self.step, verbose=self.verbose)\n if effective_n_jobs(self.n_jobs) == 1:\n (parallel, func) = (list, _rfe_single_fit)\n else:\n parallel = Parallel(n_jobs=self.n_jobs)\n func = delayed(_rfe_single_fit)\n scores = parallel((func(rfe, self.estimator, X, y, train, test, scorer) for (train, test) in cv.split(X, y, groups)))\n scores = np.array(scores)\n scores_sum = np.sum(scores, axis=0)\n scores_sum_rev = scores_sum[::-1]\n argmax_idx = len(scores_sum) - np.argmax(scores_sum_rev) - 1\n n_features_to_select = max(n_features - argmax_idx * step, self.min_features_to_select)\n rfe = RFE(estimator=self.estimator, n_features_to_select=n_features_to_select, step=self.step, importance_getter=self.importance_getter, verbose=self.verbose)\n rfe.fit(X, y)\n self.support_ = rfe.support_\n self.n_features_ = rfe.n_features_\n self.ranking_ = rfe.ranking_\n self.estimator_ = clone(self.estimator)\n self.estimator_.fit(self._transform(X), y)\n scores_rev = scores[:, ::-1]\n self.cv_results_ = {}\n self.cv_results_['mean_test_score'] = np.mean(scores_rev, axis=0)\n self.cv_results_['std_test_score'] = np.std(scores_rev, axis=0)\n for i in range(scores.shape[0]):\n self.cv_results_[f'split{i}_test_score'] = 
scores_rev[i]\n return self" }, { "name": "grid_scores_", @@ -83750,13 +89028,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@deprecated('The `grid_scores_` attribute is deprecated in version 1.0 in favor of `cv_results_` and will be removed in version 1.2.')\n@property\ndef grid_scores_(self):\n grid_size = len(self.cv_results_) - 2\n return np.asarray([self.cv_results_[f'split{i}_test_score'] for i in range(grid_size)]).T" }, { @@ -83774,7 +89053,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "estimator", @@ -83784,7 +89064,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -83794,7 +89075,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -83804,7 +89086,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "train", @@ -83814,7 +89097,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "test", @@ -83824,7 +89108,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "scorer", @@ -83834,13 +89119,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Return the score for a fit across one fold.", - "docstring": "Return the score for a fit across one fold.", + "docstring": "\n Return the score for a fit across one fold.\n ", "source_code": "\ndef _rfe_single_fit(rfe, estimator, X, y, train, test, scorer):\n \"\"\"\n Return the score for a fit across one fold.\n \"\"\"\n (X_train, y_train) = _safe_split(estimator, X, y, train)\n (X_test, y_test) = _safe_split(estimator, X, y, test, train)\n return rfe._fit(X_train, y_train, lambda estimator, features: _score(estimator, X_test[:, features], y_test, scorer)).scores_" }, { @@ -83858,7 +89144,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "estimator", @@ -83868,7 +89155,8 @@ "docstring": { "type": "estimator instance", "description": "An unfitted estimator." - } + }, + "refined_type": {} }, { "name": "n_features_to_select", @@ -83878,7 +89166,8 @@ "docstring": { "type": "int or float, default=None", "description": "The number of features to select. If `None`, half of the features are\nselected. If integer, the parameter is the absolute number of features\nto select. If float between 0 and 1, it is the fraction of features to\nselect." - } + }, + "refined_type": {} }, { "name": "direction", @@ -83888,6 +89177,10 @@ "docstring": { "type": "{'forward', 'backward'}, default='forward'", "description": "Whether to perform forward selection or backward selection." + }, + "refined_type": { + "kind": "EnumType", + "values": ["backward", "forward"] } }, { @@ -83898,7 +89191,8 @@ "docstring": { "type": "str, callable, list/tuple or dict, default=None", "description": "A single str (see :ref:`scoring_parameter`) or a callable\n(see :ref:`scoring`) to evaluate the predictions on the test set.\n\nNOTE that when using custom scorers, each scorer should return a single\nvalue. Metric functions returning a list/array of values can be wrapped\ninto multiple scorers that return one value each.\n\nIf None, the estimator's score method is used." 
- } + }, + "refined_type": {} }, { "name": "cv", @@ -83908,7 +89202,8 @@ "docstring": { "type": "int, cross-validation generator or an iterable, default=None", "description": "Determines the cross-validation splitting strategy.\nPossible inputs for cv are:\n\n- None, to use the default 5-fold cross validation,\n- integer, to specify the number of folds in a `(Stratified)KFold`,\n- :term:`CV splitter`,\n- An iterable yielding (train, test) splits as arrays of indices.\n\nFor integer/None inputs, if the estimator is a classifier and ``y`` is\neither binary or multiclass, :class:`StratifiedKFold` is used. In all\nother cases, :class:`KFold` is used. These splitters are instantiated\nwith `shuffle=False` so the splits will be the same across calls.\n\nRefer :ref:`User Guide ` for the various\ncross-validation strategies that can be used here." - } + }, + "refined_type": {} }, { "name": "n_jobs", @@ -83918,13 +89213,14 @@ "docstring": { "type": "int, default=None", "description": "Number of jobs to run in parallel. When evaluating a new feature to\nadd or remove, the cross-validation procedure is parallel over the\nfolds.\n``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n``-1`` means using all processors. See :term:`Glossary `\nfor more details." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, estimator, *, n_features_to_select=None, direction='forward', scoring=None, cv=5, n_jobs=None):\n self.estimator = estimator\n self.n_features_to_select = n_features_to_select\n self.direction = direction\n self.scoring = scoring\n self.cv = cv\n self.n_jobs = n_jobs" }, { @@ -83942,7 +89238,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "estimator", @@ -83952,7 +89249,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -83962,7 +89260,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -83972,7 +89271,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "current_mask", @@ -83982,13 +89282,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _get_best_new_feature(self, estimator, X, y, current_mask):\n candidate_feature_indices = np.flatnonzero(~current_mask)\n scores = {}\n for feature_idx in candidate_feature_indices:\n candidate_mask = current_mask.copy()\n candidate_mask[feature_idx] = True\n if self.direction == 'backward':\n candidate_mask = ~candidate_mask\n X_new = X[:, candidate_mask]\n scores[feature_idx] = cross_val_score(estimator, X_new, y, cv=self.cv, scoring=self.scoring, n_jobs=self.n_jobs).mean()\n return max(scores, key=lambda feature_idx: scores[feature_idx])" }, { @@ -84006,13 +89307,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _get_support_mask(self):\n check_is_fitted(self)\n return self.support_" }, { @@ -84030,13 +89332,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _more_tags(self):\n return {'allow_nan': 
_safe_tags(self.estimator, key='allow_nan'), 'requires_y': True}" }, { @@ -84054,7 +89357,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -84064,7 +89368,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "Training vectors, where `n_samples` is the number of samples and\n`n_features` is the number of predictors." - } + }, + "refined_type": {} }, { "name": "y", @@ -84074,13 +89379,14 @@ "docstring": { "type": "array-like of shape (n_samples,), default=None", "description": "Target values. This parameter may be ignored for\nunsupervised learning." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Learn the features to select from X.", - "docstring": "Learn the features to select from X.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n Training vectors, where `n_samples` is the number of samples and\n `n_features` is the number of predictors.\n\ny : array-like of shape (n_samples,), default=None\n Target values. This parameter may be ignored for\n unsupervised learning.\n\nReturns\n-------\nself : object\n Returns the instance itself.", + "docstring": "Learn the features to select from X.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training vectors, where `n_samples` is the number of samples and\n `n_features` is the number of predictors.\n\n y : array-like of shape (n_samples,), default=None\n Target values. This parameter may be ignored for\n unsupervised learning.\n\n Returns\n -------\n self : object\n Returns the instance itself.\n ", "source_code": "\ndef fit(self, X, y=None):\n \"\"\"Learn the features to select from X.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training vectors, where `n_samples` is the number of samples and\n `n_features` is the number of predictors.\n\n y : array-like of shape (n_samples,), default=None\n Target values. This parameter may be ignored for\n unsupervised learning.\n\n Returns\n -------\n self : object\n Returns the instance itself.\n \"\"\"\n tags = self._get_tags()\n X = self._validate_data(X, accept_sparse='csc', ensure_min_features=2, force_all_finite=not tags.get('allow_nan', True))\n n_features = X.shape[1]\n error_msg = f'n_features_to_select must be either None, an integer in [1, n_features - 1] representing the absolute number of features, or a float in (0, 1] representing a percentage of features to select. Got {self.n_features_to_select}'\n if self.n_features_to_select is None:\n self.n_features_to_select_ = n_features // 2\n elif isinstance(self.n_features_to_select, numbers.Integral):\n if not 0 < self.n_features_to_select < n_features:\n raise ValueError(error_msg)\n self.n_features_to_select_ = self.n_features_to_select\n elif isinstance(self.n_features_to_select, numbers.Real):\n if not 0 < self.n_features_to_select <= 1:\n raise ValueError(error_msg)\n self.n_features_to_select_ = int(n_features * self.n_features_to_select)\n else:\n raise ValueError(error_msg)\n if self.direction not in ('forward', 'backward'):\n raise ValueError(f\"direction must be either 'forward' or 'backward'. 
Got {self.direction}.\")\n cloned_estimator = clone(self.estimator)\n current_mask = np.zeros(shape=n_features, dtype=bool)\n n_iterations = self.n_features_to_select_ if self.direction == 'forward' else n_features - self.n_features_to_select_\n for _ in range(n_iterations):\n new_feature_idx = self._get_best_new_feature(cloned_estimator, X, y, current_mask)\n current_mask[new_feature_idx] = True\n if self.direction == 'backward':\n current_mask = ~current_mask\n self.support_ = current_mask\n return self" }, { @@ -84098,7 +89404,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "score_func", @@ -84108,7 +89415,8 @@ "docstring": { "type": "callable, default=f_classif", "description": "Function taking two arrays X and y, and returning a pair of arrays\n(scores, pvalues). For modes 'percentile' or 'kbest' it can return\na single array scores." - } + }, + "refined_type": {} }, { "name": "mode", @@ -84118,6 +89426,10 @@ "docstring": { "type": "{'percentile', 'k_best', 'fpr', 'fdr', 'fwe'}, default='percentile'", "description": "Feature selection mode." + }, + "refined_type": { + "kind": "EnumType", + "values": ["percentile", "fpr", "fdr", "k_best", "fwe"] } }, { @@ -84128,13 +89440,14 @@ "docstring": { "type": "float or int depending on the feature selection mode, default=1e-5", "description": "Parameter of the corresponding mode." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, score_func=f_classif, *, mode='percentile', param=1e-05):\n super().__init__(score_func=score_func)\n self.mode = mode\n self.param = param" }, { @@ -84152,7 +89465,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -84162,7 +89476,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -84172,13 +89487,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _check_params(self, X, y):\n if self.mode not in self._selection_modes:\n raise ValueError('The mode passed should be one of %s, %r, (type %s) was passed.' 
% (self._selection_modes.keys(), self.mode, type(self.mode)))\n self._make_selector()._check_params(X, y)" }, { @@ -84196,13 +89512,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _get_support_mask(self):\n check_is_fitted(self)\n selector = self._make_selector()\n selector.pvalues_ = self.pvalues_\n selector.scores_ = self.scores_\n return selector._get_support_mask()" }, { @@ -84220,13 +89537,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _make_selector(self):\n selector = self._selection_modes[self.mode](score_func=self.score_func)\n possible_params = selector._get_param_names()\n possible_params.remove('score_func')\n selector.set_params(**{possible_params[0]: self.param})\n return selector" }, { @@ -84244,7 +89562,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "score_func", @@ -84254,7 +89573,8 @@ "docstring": { "type": "callable, default=f_classif", "description": "Function taking two arrays X and y, and returning a pair of arrays\n(scores, pvalues).\nDefault is f_classif (see below \"See Also\"). The default function only\nworks with classification tasks." - } + }, + "refined_type": {} }, { "name": "alpha", @@ -84264,13 +89584,14 @@ "docstring": { "type": "float, default=5e-2", "description": "The highest uncorrected p-value for features to keep." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, score_func=f_classif, *, alpha=0.05):\n super().__init__(score_func=score_func)\n self.alpha = alpha" }, { @@ -84288,13 +89609,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _get_support_mask(self):\n check_is_fitted(self)\n n_features = len(self.pvalues_)\n sv = np.sort(self.pvalues_)\n selected = sv[sv <= float(self.alpha) / n_features * np.arange(1, n_features + 1)]\n if selected.size == 0:\n return np.zeros_like(self.pvalues_, dtype=bool)\n return self.pvalues_ <= selected.max()" }, { @@ -84312,7 +89634,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "score_func", @@ -84322,7 +89645,8 @@ "docstring": { "type": "callable, default=f_classif", "description": "Function taking two arrays X and y, and returning a pair of arrays\n(scores, pvalues).\nDefault is f_classif (see below \"See Also\"). The default function only\nworks with classification tasks." - } + }, + "refined_type": {} }, { "name": "alpha", @@ -84331,14 +89655,15 @@ "assigned_by": "NAME_ONLY", "docstring": { "type": "float, default=5e-2", - "description": "The highest p-value for features to be kept." - } + "description": "Features with p-values less than `alpha` are selected." 
+ }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, score_func=f_classif, *, alpha=0.05):\n super().__init__(score_func=score_func)\n self.alpha = alpha" }, { @@ -84356,13 +89681,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _get_support_mask(self):\n check_is_fitted(self)\n return self.pvalues_ < self.alpha" }, { @@ -84380,7 +89706,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "score_func", @@ -84390,7 +89717,8 @@ "docstring": { "type": "callable, default=f_classif", "description": "Function taking two arrays X and y, and returning a pair of arrays\n(scores, pvalues).\nDefault is f_classif (see below \"See Also\"). The default function only\nworks with classification tasks." - } + }, + "refined_type": {} }, { "name": "alpha", @@ -84400,13 +89728,14 @@ "docstring": { "type": "float, default=5e-2", "description": "The highest uncorrected p-value for features to keep." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, score_func=f_classif, *, alpha=0.05):\n super().__init__(score_func=score_func)\n self.alpha = alpha" }, { @@ -84424,13 +89753,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _get_support_mask(self):\n check_is_fitted(self)\n return self.pvalues_ < self.alpha / len(self.pvalues_)" }, { @@ -84448,7 +89778,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "score_func", @@ -84458,7 +89789,8 @@ "docstring": { "type": "callable, default=f_classif", "description": "Function taking two arrays X and y, and returning a pair of arrays\n(scores, pvalues) or a single array with scores.\nDefault is f_classif (see below \"See Also\"). The default function only\nworks with classification tasks.\n\n.. versionadded:: 0.18" - } + }, + "refined_type": {} }, { "name": "k", @@ -84468,13 +89800,14 @@ "docstring": { "type": "int or \"all\", default=10", "description": "Number of top features to select.\nThe \"all\" option bypasses selection, for use in a parameter search." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, score_func=f_classif, *, k=10):\n super().__init__(score_func=score_func)\n self.k = k" }, { @@ -84492,7 +89825,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -84502,7 +89836,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -84512,13 +89847,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _check_params(self, X, y):\n if not (self.k == 'all' or 0 <= self.k <= X.shape[1]):\n raise ValueError(\"k should be >=0, <= n_features = %d; got %r. 
Use k='all' to return all features.\" % (X.shape[1], self.k))" }, { @@ -84536,13 +89872,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _get_support_mask(self):\n check_is_fitted(self)\n if self.k == 'all':\n return np.ones(self.scores_.shape, dtype=bool)\n elif self.k == 0:\n return np.zeros(self.scores_.shape, dtype=bool)\n else:\n scores = _clean_nans(self.scores_)\n mask = np.zeros(scores.shape, dtype=bool)\n mask[np.argsort(scores, kind='mergesort')[-self.k:]] = 1\n return mask" }, { @@ -84560,7 +89897,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "score_func", @@ -84570,7 +89908,8 @@ "docstring": { "type": "callable, default=f_classif", "description": "Function taking two arrays X and y, and returning a pair of arrays\n(scores, pvalues) or a single array with scores.\nDefault is f_classif (see below \"See Also\"). The default function only\nworks with classification tasks.\n\n.. versionadded:: 0.18" - } + }, + "refined_type": {} }, { "name": "percentile", @@ -84580,13 +89919,14 @@ "docstring": { "type": "int, default=10", "description": "Percent of features to keep." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, score_func=f_classif, *, percentile=10):\n super().__init__(score_func=score_func)\n self.percentile = percentile" }, { @@ -84604,7 +89944,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -84614,7 +89955,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -84624,13 +89966,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _check_params(self, X, y):\n if not 0 <= self.percentile <= 100:\n raise ValueError('percentile should be >=0, <=100; got %r' % self.percentile)" }, { @@ -84648,13 +89991,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _get_support_mask(self):\n check_is_fitted(self)\n if self.percentile == 100:\n return np.ones(len(self.scores_), dtype=bool)\n elif self.percentile == 0:\n return np.zeros(len(self.scores_), dtype=bool)\n scores = _clean_nans(self.scores_)\n threshold = np.percentile(scores, 100 - self.percentile)\n mask = scores > threshold\n ties = np.where(scores == threshold)[0]\n if len(ties):\n max_feats = int(len(scores) * self.percentile / 100)\n kept_ties = ties[:max_feats - mask.sum()]\n mask[kept_ties] = True\n return mask" }, { @@ -84672,7 +90016,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "score_func", @@ -84682,13 +90027,14 @@ "docstring": { "type": "callable", "description": "Function taking two arrays X and y, and returning a pair of arrays\n(scores, pvalues) or a single array with scores." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, score_func):\n self.score_func = score_func" }, { @@ -84706,7 +90052,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -84716,7 +90063,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -84726,13 +90074,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _check_params(self, X, y):\n pass" }, { @@ -84750,13 +90099,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _more_tags(self):\n return {'requires_y': True}" }, { @@ -84774,7 +90124,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -84784,7 +90135,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "The training input samples." - } + }, + "refined_type": {} }, { "name": "y", @@ -84794,13 +90146,14 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "The target values (class labels in classification, real numbers in\nregression)." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Run score function on (X, y) and get the appropriate features.", - "docstring": "Run score function on (X, y) and get the appropriate features.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n The training input samples.\n\ny : array-like of shape (n_samples,)\n The target values (class labels in classification, real numbers in\n regression).\n\nReturns\n-------\nself : object\n Returns the instance itself.", + "docstring": "Run score function on (X, y) and get the appropriate features.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The training input samples.\n\n y : array-like of shape (n_samples,)\n The target values (class labels in classification, real numbers in\n regression).\n\n Returns\n -------\n self : object\n Returns the instance itself.\n ", "source_code": "\ndef fit(self, X, y):\n \"\"\"Run score function on (X, y) and get the appropriate features.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The training input samples.\n\n y : array-like of shape (n_samples,)\n The target values (class labels in classification, real numbers in\n regression).\n\n Returns\n -------\n self : object\n Returns the instance itself.\n \"\"\"\n (X, y) = self._validate_data(X, y, accept_sparse=['csr', 'csc'], multi_output=True)\n if not callable(self.score_func):\n raise TypeError('The score function should be a callable, %s (%s) was passed.' 
% (self.score_func, type(self.score_func)))\n self._check_params(X, y)\n score_func_ret = self.score_func(X, y)\n if isinstance(score_func_ret, (list, tuple)):\n (self.scores_, self.pvalues_) = score_func_ret\n self.pvalues_ = np.asarray(self.pvalues_)\n else:\n self.scores_ = score_func_ret\n self.pvalues_ = None\n self.scores_ = np.asarray(self.scores_)\n return self" }, { @@ -84818,7 +90171,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "f_exp", @@ -84828,13 +90182,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Fast replacement for scipy.stats.chisquare.\n\nVersion from https://github.com/scipy/scipy/pull/2525 with additional optimizations.", - "docstring": "Fast replacement for scipy.stats.chisquare.\n\nVersion from https://github.com/scipy/scipy/pull/2525 with additional\noptimizations.", + "description": "Fast replacement for scipy.stats.chisquare.\n\nVersion from https://github.com/scipy/scipy/pull/2525 with additional\noptimizations.", + "docstring": "Fast replacement for scipy.stats.chisquare.\n\n Version from https://github.com/scipy/scipy/pull/2525 with additional\n optimizations.\n ", "source_code": "\ndef _chisquare(f_obs, f_exp):\n \"\"\"Fast replacement for scipy.stats.chisquare.\n\n Version from https://github.com/scipy/scipy/pull/2525 with additional\n optimizations.\n \"\"\"\n f_obs = np.asarray(f_obs, dtype=np.float64)\n k = len(f_obs)\n chisq = f_obs\n chisq -= f_exp\n chisq **= 2\n with np.errstate(invalid='ignore'):\n chisq /= f_exp\n chisq = chisq.sum(axis=0)\n return chisq, special.chdtrc(k - 1, chisq)" }, { @@ -84852,13 +90207,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Fixes Issue #1240: NaNs can't be properly compared, so change them to the smallest value of scores's dtype. -inf seems to be unreliable.", - "docstring": "Fixes Issue #1240: NaNs can't be properly compared, so change them to the\nsmallest value of scores's dtype. -inf seems to be unreliable.", + "description": "Fixes Issue #1240: NaNs can't be properly compared, so change them to the\nsmallest value of scores's dtype. -inf seems to be unreliable.", + "docstring": "\n Fixes Issue #1240: NaNs can't be properly compared, so change them to the\n smallest value of scores's dtype. -inf seems to be unreliable.\n ", "source_code": "\ndef _clean_nans(scores):\n \"\"\"\n Fixes Issue #1240: NaNs can't be properly compared, so change them to the\n smallest value of scores's dtype. -inf seems to be unreliable.\n \"\"\"\n scores = as_float_array(scores, copy=True)\n scores[np.isnan(scores)] = np.finfo(scores.dtype).min\n return scores" }, { @@ -84876,6 +90232,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "Sample vectors." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -84886,13 +90246,14 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "Target vector (class labels)." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Compute chi-squared stats between each non-negative feature and class.\n\nThis score can be used to select the n_features features with the highest values for the test chi-squared statistic from X, which must contain only non-negative features such as booleans or frequencies (e.g., term counts in document classification), relative to the classes. Recall that the chi-square test measures dependence between stochastic variables, so using this function \"weeds out\" the features that are the most likely to be independent of class and therefore irrelevant for classification. Read more in the :ref:`User Guide `.", - "docstring": "Compute chi-squared stats between each non-negative feature and class.\n\nThis score can be used to select the n_features features with the\nhighest values for the test chi-squared statistic from X, which must\ncontain only non-negative features such as booleans or frequencies\n(e.g., term counts in document classification), relative to the classes.\n\nRecall that the chi-square test measures dependence between stochastic\nvariables, so using this function \"weeds out\" the features that are the\nmost likely to be independent of class and therefore irrelevant for\nclassification.\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n Sample vectors.\n\ny : array-like of shape (n_samples,)\n Target vector (class labels).\n\nReturns\n-------\nchi2 : ndarray of shape (n_features,)\n Chi2 statistics for each feature.\n\np_values : ndarray of shape (n_features,)\n P-values for each feature.\n\nNotes\n-----\nComplexity of this algorithm is O(n_classes * n_features).\n\nSee Also\n--------\nf_classif : ANOVA F-value between label/feature for classification tasks.\nf_regression : F-value between label/feature for regression tasks.", + "description": "Compute chi-squared stats between each non-negative feature and class.\n\nThis score can be used to select the n_features features with the\nhighest values for the test chi-squared statistic from X, which must\ncontain only non-negative features such as booleans or frequencies\n(e.g., term counts in document classification), relative to the classes.\n\nRecall that the chi-square test measures dependence between stochastic\nvariables, so using this function \"weeds out\" the features that are the\nmost likely to be independent of class and therefore irrelevant for\nclassification.\n\nRead more in the :ref:`User Guide `.", + "docstring": "Compute chi-squared stats between each non-negative feature and class.\n\n This score can be used to select the n_features features with the\n highest values for the test chi-squared statistic from X, which must\n contain only non-negative features such as booleans or frequencies\n (e.g., term counts in document classification), relative to the classes.\n\n Recall that the chi-square test measures dependence between stochastic\n variables, so using this function \"weeds out\" the features that are the\n most likely to be independent of class and therefore irrelevant for\n classification.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Sample vectors.\n\n y : array-like of shape (n_samples,)\n Target vector (class labels).\n\n Returns\n -------\n chi2 : ndarray of shape (n_features,)\n Chi2 statistics for each feature.\n\n p_values : ndarray of 
shape (n_features,)\n P-values for each feature.\n\n Notes\n -----\n Complexity of this algorithm is O(n_classes * n_features).\n\n See Also\n --------\n f_classif : ANOVA F-value between label/feature for classification tasks.\n f_regression : F-value between label/feature for regression tasks.\n ", "source_code": "\ndef chi2(X, y):\n \"\"\"Compute chi-squared stats between each non-negative feature and class.\n\n This score can be used to select the n_features features with the\n highest values for the test chi-squared statistic from X, which must\n contain only non-negative features such as booleans or frequencies\n (e.g., term counts in document classification), relative to the classes.\n\n Recall that the chi-square test measures dependence between stochastic\n variables, so using this function \"weeds out\" the features that are the\n most likely to be independent of class and therefore irrelevant for\n classification.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Sample vectors.\n\n y : array-like of shape (n_samples,)\n Target vector (class labels).\n\n Returns\n -------\n chi2 : ndarray of shape (n_features,)\n Chi2 statistics for each feature.\n\n p_values : ndarray of shape (n_features,)\n P-values for each feature.\n\n Notes\n -----\n Complexity of this algorithm is O(n_classes * n_features).\n\n See Also\n --------\n f_classif : ANOVA F-value between label/feature for classification tasks.\n f_regression : F-value between label/feature for regression tasks.\n \"\"\"\n X = check_array(X, accept_sparse='csr')\n if np.any((X.data if issparse(X) else X) < 0):\n raise ValueError('Input X must be non-negative.')\n Y = LabelBinarizer().fit_transform(y)\n if Y.shape[1] == 1:\n Y = np.append(1 - Y, Y, axis=1)\n observed = safe_sparse_dot(Y.T, X)\n feature_count = X.sum(axis=0).reshape(1, -1)\n class_prob = Y.mean(axis=0).reshape(1, -1)\n expected = np.dot(class_prob.T, feature_count)\n return _chisquare(observed, expected)" }, { @@ -84910,6 +90271,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "The set of regressors that will be tested sequentially." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -84920,13 +90285,14 @@ "docstring": { "type": "ndarray of shape (n_samples,)", "description": "The target vector." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Compute the ANOVA F-value for the provided sample.\n\nRead more in the :ref:`User Guide `.", - "docstring": "Compute the ANOVA F-value for the provided sample.\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n The set of regressors that will be tested sequentially.\n\ny : ndarray of shape (n_samples,)\n The target vector.\n\nReturns\n-------\nf_statistic : ndarray of shape (n_features,)\n F-statistic for each feature.\n\np_values : ndarray of shape (n_features,)\n P-values associated with the F-statistic.\n\nSee Also\n--------\nchi2 : Chi-squared stats of non-negative features for classification tasks.\nf_regression : F-value between label/feature for regression tasks.", + "docstring": "Compute the ANOVA F-value for the provided sample.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The set of regressors that will be tested sequentially.\n\n y : ndarray of shape (n_samples,)\n The target vector.\n\n Returns\n -------\n f_statistic : ndarray of shape (n_features,)\n F-statistic for each feature.\n\n p_values : ndarray of shape (n_features,)\n P-values associated with the F-statistic.\n\n See Also\n --------\n chi2 : Chi-squared stats of non-negative features for classification tasks.\n f_regression : F-value between label/feature for regression tasks.\n ", "source_code": "\ndef f_classif(X, y):\n \"\"\"Compute the ANOVA F-value for the provided sample.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The set of regressors that will be tested sequentially.\n\n y : ndarray of shape (n_samples,)\n The target vector.\n\n Returns\n -------\n f_statistic : ndarray of shape (n_features,)\n F-statistic for each feature.\n\n p_values : ndarray of shape (n_features,)\n P-values associated with the F-statistic.\n\n See Also\n --------\n chi2 : Chi-squared stats of non-negative features for classification tasks.\n f_regression : F-value between label/feature for regression tasks.\n \"\"\"\n (X, y) = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'])\n args = [X[safe_mask(X, y == k)] for k in np.unique(y)]\n return f_oneway(*args)" }, { @@ -84938,8 +90304,8 @@ "parameters": [], "results": [], "is_public": true, - "description": "Performs a 1-way ANOVA.\n\nThe one-way ANOVA tests the null hypothesis that 2 or more groups have the same population mean. The test is applied to samples from two or more groups, possibly with differing sizes. Read more in the :ref:`User Guide `.", - "docstring": "Performs a 1-way ANOVA.\n\nThe one-way ANOVA tests the null hypothesis that 2 or more groups have\nthe same population mean. The test is applied to samples from two or\nmore groups, possibly with differing sizes.\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\n*args : {array-like, sparse matrix}\n sample1, sample2... The sample measurements should be given as\n arguments.\n\nReturns\n-------\nf_statistic : float\n The computed F-value of the test.\np_value : float\n The associated p-value from the F-distribution.\n\nNotes\n-----\nThe ANOVA test has important assumptions that must be satisfied in order\nfor the associated p-value to be valid.\n\n1. The samples are independent\n2. Each sample is from a normally distributed population\n3. 
The population standard deviations of the groups are all equal. This\n property is known as homoscedasticity.\n\nIf these assumptions are not true for a given set of data, it may still be\npossible to use the Kruskal-Wallis H-test (`scipy.stats.kruskal`_) although\nwith some loss of power.\n\nThe algorithm is from Heiman[2], pp.394-7.\n\nSee ``scipy.stats.f_oneway`` that should give the same results while\nbeing less efficient.\n\nReferences\n----------\n\n.. [1] Lowry, Richard. \"Concepts and Applications of Inferential\n Statistics\". Chapter 14.\n http://faculty.vassar.edu/lowry/ch14pt1.html\n\n.. [2] Heiman, G.W. Research Methods in Statistics. 2002.", + "description": "Performs a 1-way ANOVA.\n\nThe one-way ANOVA tests the null hypothesis that 2 or more groups have\nthe same population mean. The test is applied to samples from two or\nmore groups, possibly with differing sizes.\n\nRead more in the :ref:`User Guide `.", + "docstring": "Performs a 1-way ANOVA.\n\n The one-way ANOVA tests the null hypothesis that 2 or more groups have\n the same population mean. The test is applied to samples from two or\n more groups, possibly with differing sizes.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n *args : {array-like, sparse matrix}\n sample1, sample2... The sample measurements should be given as\n arguments.\n\n Returns\n -------\n f_statistic : float\n The computed F-value of the test.\n p_value : float\n The associated p-value from the F-distribution.\n\n Notes\n -----\n The ANOVA test has important assumptions that must be satisfied in order\n for the associated p-value to be valid.\n\n 1. The samples are independent\n 2. Each sample is from a normally distributed population\n 3. The population standard deviations of the groups are all equal. This\n property is known as homoscedasticity.\n\n If these assumptions are not true for a given set of data, it may still be\n possible to use the Kruskal-Wallis H-test (`scipy.stats.kruskal`_) although\n with some loss of power.\n\n The algorithm is from Heiman[2], pp.394-7.\n\n See ``scipy.stats.f_oneway`` that should give the same results while\n being less efficient.\n\n References\n ----------\n\n .. [1] Lowry, Richard. \"Concepts and Applications of Inferential\n Statistics\". Chapter 14.\n http://faculty.vassar.edu/lowry/ch14pt1.html\n\n .. [2] Heiman, G.W. Research Methods in Statistics. 2002.\n\n ", "source_code": "\ndef f_oneway(*args):\n \"\"\"Performs a 1-way ANOVA.\n\n The one-way ANOVA tests the null hypothesis that 2 or more groups have\n the same population mean. The test is applied to samples from two or\n more groups, possibly with differing sizes.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n *args : {array-like, sparse matrix}\n sample1, sample2... The sample measurements should be given as\n arguments.\n\n Returns\n -------\n f_statistic : float\n The computed F-value of the test.\n p_value : float\n The associated p-value from the F-distribution.\n\n Notes\n -----\n The ANOVA test has important assumptions that must be satisfied in order\n for the associated p-value to be valid.\n\n 1. The samples are independent\n 2. Each sample is from a normally distributed population\n 3. The population standard deviations of the groups are all equal. 
This\n property is known as homoscedasticity.\n\n If these assumptions are not true for a given set of data, it may still be\n possible to use the Kruskal-Wallis H-test (`scipy.stats.kruskal`_) although\n with some loss of power.\n\n The algorithm is from Heiman[2], pp.394-7.\n\n See ``scipy.stats.f_oneway`` that should give the same results while\n being less efficient.\n\n References\n ----------\n\n .. [1] Lowry, Richard. \"Concepts and Applications of Inferential\n Statistics\". Chapter 14.\n http://faculty.vassar.edu/lowry/ch14pt1.html\n\n .. [2] Heiman, G.W. Research Methods in Statistics. 2002.\n\n \"\"\"\n n_classes = len(args)\n args = [as_float_array(a) for a in args]\n n_samples_per_class = np.array([a.shape[0] for a in args])\n n_samples = np.sum(n_samples_per_class)\n ss_alldata = sum((safe_sqr(a).sum(axis=0) for a in args))\n sums_args = [np.asarray(a.sum(axis=0)) for a in args]\n square_of_sums_alldata = sum(sums_args)**2\n square_of_sums_args = [s**2 for s in sums_args]\n sstot = ss_alldata - square_of_sums_alldata / float(n_samples)\n ssbn = 0.0\n for (k, _) in enumerate(args):\n ssbn += square_of_sums_args[k] / n_samples_per_class[k]\n ssbn -= square_of_sums_alldata / float(n_samples)\n sswn = sstot - ssbn\n dfbn = n_classes - 1\n dfwn = n_samples - n_classes\n msb = ssbn / float(dfbn)\n msw = sswn / float(dfwn)\n constant_features_idx = np.where(msw == 0.0)[0]\n if np.nonzero(msb)[0].size != msb.size and constant_features_idx.size:\n warnings.warn('Features %s are constant.' % constant_features_idx, UserWarning)\n f = msb / msw\n f = np.asarray(f).ravel()\n prob = special.fdtrc(dfbn, dfwn, f)\n return f, prob" }, { @@ -84957,6 +90323,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "The data matrix." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -84967,7 +90337,8 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "The target vector." - } + }, + "refined_type": {} }, { "name": "center", @@ -84977,13 +90348,14 @@ "docstring": { "type": "bool, default=True", "description": "Whether or not to center the data matrix `X` and the target vector `y`.\nBy default, `X` and `y` will be centered." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Univariate linear regression tests returning F-statistic and p-values.\n\nQuick linear model for testing the effect of a single regressor, sequentially for many regressors. This is done in 2 steps: 1. The cross correlation between each regressor and the target is computed, that is, ((X[:, i] - mean(X[:, i])) * (y - mean_y)) / (std(X[:, i]) * std(y)) using r_regression function. 2. It is converted to an F score and then to a p-value. :func:`f_regression` is derived from :func:`r_regression` and will rank features in the same order if all the features are positively correlated with the target. Note however that contrary to :func:`f_regression`, :func:`r_regression` values lie in [-1, 1] and can thus be negative. :func:`f_regression` is therefore recommended as a feature selection criterion to identify potentially predictive feature for a downstream classifier, irrespective of the sign of the association with the target variable. Furthermore :func:`f_regression` returns p-values while :func:`r_regression` does not. 
Read more in the :ref:`User Guide `.", - "docstring": "Univariate linear regression tests returning F-statistic and p-values.\n\nQuick linear model for testing the effect of a single regressor,\nsequentially for many regressors.\n\nThis is done in 2 steps:\n\n1. The cross correlation between each regressor and the target is computed,\n that is, ((X[:, i] - mean(X[:, i])) * (y - mean_y)) / (std(X[:, i]) *\n std(y)) using r_regression function.\n2. It is converted to an F score and then to a p-value.\n\n:func:`f_regression` is derived from :func:`r_regression` and will rank\nfeatures in the same order if all the features are positively correlated\nwith the target.\n\nNote however that contrary to :func:`f_regression`, :func:`r_regression`\nvalues lie in [-1, 1] and can thus be negative. :func:`f_regression` is\ntherefore recommended as a feature selection criterion to identify\npotentially predictive feature for a downstream classifier, irrespective of\nthe sign of the association with the target variable.\n\nFurthermore :func:`f_regression` returns p-values while\n:func:`r_regression` does not.\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n The data matrix.\n\ny : array-like of shape (n_samples,)\n The target vector.\n\ncenter : bool, default=True\n Whether or not to center the data matrix `X` and the target vector `y`.\n By default, `X` and `y` will be centered.\n\nReturns\n-------\nf_statistic : ndarray of shape (n_features,)\n F-statistic for each feature.\n\np_values : ndarray of shape (n_features,)\n P-values associated with the F-statistic.\n\nSee Also\n--------\nr_regression: Pearson's R between label/feature for regression tasks.\nf_classif: ANOVA F-value between label/feature for classification tasks.\nchi2: Chi-squared stats of non-negative features for classification tasks.\nSelectKBest: Select features based on the k highest scores.\nSelectFpr: Select features based on a false positive rate test.\nSelectFdr: Select features based on an estimated false discovery rate.\nSelectFwe: Select features based on family-wise error rate.\nSelectPercentile: Select features based on percentile of the highest\n scores.", + "description": "Univariate linear regression tests returning F-statistic and p-values.\n\nQuick linear model for testing the effect of a single regressor,\nsequentially for many regressors.\n\nThis is done in 2 steps:\n\n1. The cross correlation between each regressor and the target is computed,\n that is, ((X[:, i] - mean(X[:, i])) * (y - mean_y)) / (std(X[:, i]) *\n std(y)) using r_regression function.\n2. It is converted to an F score and then to a p-value.\n\n:func:`f_regression` is derived from :func:`r_regression` and will rank\nfeatures in the same order if all the features are positively correlated\nwith the target.\n\nNote however that contrary to :func:`f_regression`, :func:`r_regression`\nvalues lie in [-1, 1] and can thus be negative. 
:func:`f_regression` is\ntherefore recommended as a feature selection criterion to identify\npotentially predictive feature for a downstream classifier, irrespective of\nthe sign of the association with the target variable.\n\nFurthermore :func:`f_regression` returns p-values while\n:func:`r_regression` does not.\n\nRead more in the :ref:`User Guide `.", + "docstring": "Univariate linear regression tests returning F-statistic and p-values.\n\n Quick linear model for testing the effect of a single regressor,\n sequentially for many regressors.\n\n This is done in 2 steps:\n\n 1. The cross correlation between each regressor and the target is computed,\n that is, ((X[:, i] - mean(X[:, i])) * (y - mean_y)) / (std(X[:, i]) *\n std(y)) using r_regression function.\n 2. It is converted to an F score and then to a p-value.\n\n :func:`f_regression` is derived from :func:`r_regression` and will rank\n features in the same order if all the features are positively correlated\n with the target.\n\n Note however that contrary to :func:`f_regression`, :func:`r_regression`\n values lie in [-1, 1] and can thus be negative. :func:`f_regression` is\n therefore recommended as a feature selection criterion to identify\n potentially predictive feature for a downstream classifier, irrespective of\n the sign of the association with the target variable.\n\n Furthermore :func:`f_regression` returns p-values while\n :func:`r_regression` does not.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The data matrix.\n\n y : array-like of shape (n_samples,)\n The target vector.\n\n center : bool, default=True\n Whether or not to center the data matrix `X` and the target vector `y`.\n By default, `X` and `y` will be centered.\n\n Returns\n -------\n f_statistic : ndarray of shape (n_features,)\n F-statistic for each feature.\n\n p_values : ndarray of shape (n_features,)\n P-values associated with the F-statistic.\n\n See Also\n --------\n r_regression: Pearson's R between label/feature for regression tasks.\n f_classif: ANOVA F-value between label/feature for classification tasks.\n chi2: Chi-squared stats of non-negative features for classification tasks.\n SelectKBest: Select features based on the k highest scores.\n SelectFpr: Select features based on a false positive rate test.\n SelectFdr: Select features based on an estimated false discovery rate.\n SelectFwe: Select features based on family-wise error rate.\n SelectPercentile: Select features based on percentile of the highest\n scores.\n ", "source_code": "\ndef f_regression(X, y, *, center=True):\n \"\"\"Univariate linear regression tests returning F-statistic and p-values.\n\n Quick linear model for testing the effect of a single regressor,\n sequentially for many regressors.\n\n This is done in 2 steps:\n\n 1. The cross correlation between each regressor and the target is computed,\n that is, ((X[:, i] - mean(X[:, i])) * (y - mean_y)) / (std(X[:, i]) *\n std(y)) using r_regression function.\n 2. It is converted to an F score and then to a p-value.\n\n :func:`f_regression` is derived from :func:`r_regression` and will rank\n features in the same order if all the features are positively correlated\n with the target.\n\n Note however that contrary to :func:`f_regression`, :func:`r_regression`\n values lie in [-1, 1] and can thus be negative. 
:func:`f_regression` is\n therefore recommended as a feature selection criterion to identify\n potentially predictive feature for a downstream classifier, irrespective of\n the sign of the association with the target variable.\n\n Furthermore :func:`f_regression` returns p-values while\n :func:`r_regression` does not.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The data matrix.\n\n y : array-like of shape (n_samples,)\n The target vector.\n\n center : bool, default=True\n Whether or not to center the data matrix `X` and the target vector `y`.\n By default, `X` and `y` will be centered.\n\n Returns\n -------\n f_statistic : ndarray of shape (n_features,)\n F-statistic for each feature.\n\n p_values : ndarray of shape (n_features,)\n P-values associated with the F-statistic.\n\n See Also\n --------\n r_regression: Pearson's R between label/feature for regression tasks.\n f_classif: ANOVA F-value between label/feature for classification tasks.\n chi2: Chi-squared stats of non-negative features for classification tasks.\n SelectKBest: Select features based on the k highest scores.\n SelectFpr: Select features based on a false positive rate test.\n SelectFdr: Select features based on an estimated false discovery rate.\n SelectFwe: Select features based on family-wise error rate.\n SelectPercentile: Select features based on percentile of the highest\n scores.\n \"\"\"\n correlation_coefficient = r_regression(X, y, center=center)\n deg_of_freedom = y.size - (2 if center else 1)\n corr_coef_squared = correlation_coefficient**2\n f_statistic = corr_coef_squared / (1 - corr_coef_squared) * deg_of_freedom\n p_values = stats.f.sf(f_statistic, 1, deg_of_freedom)\n return f_statistic, p_values" }, { @@ -85001,6 +90373,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "The data matrix." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -85011,7 +90387,8 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "The target vector." - } + }, + "refined_type": {} }, { "name": "center", @@ -85021,13 +90398,14 @@ "docstring": { "type": "bool, default=True", "description": "Whether or not to center the data matrix `X` and the target vector `y`.\nBy default, `X` and `y` will be centered." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Compute Pearson's r for each features and the target.\n\nPearson's r is also known as the Pearson correlation coefficient. .. versionadded:: 1.0 Linear model for testing the individual effect of each of many regressors. This is a scoring function to be used in a feature selection procedure, not a free standing feature selection procedure. The cross correlation between each regressor and the target is computed as ((X[:, i] - mean(X[:, i])) * (y - mean_y)) / (std(X[:, i]) * std(y)). For more on usage see the :ref:`User Guide `.", - "docstring": "Compute Pearson's r for each features and the target.\n\nPearson's r is also known as the Pearson correlation coefficient.\n\n.. 
versionadded:: 1.0\n\nLinear model for testing the individual effect of each of many regressors.\nThis is a scoring function to be used in a feature selection procedure, not\na free standing feature selection procedure.\n\nThe cross correlation between each regressor and the target is computed\nas ((X[:, i] - mean(X[:, i])) * (y - mean_y)) / (std(X[:, i]) * std(y)).\n\nFor more on usage see the :ref:`User Guide `.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n The data matrix.\n\ny : array-like of shape (n_samples,)\n The target vector.\n\ncenter : bool, default=True\n Whether or not to center the data matrix `X` and the target vector `y`.\n By default, `X` and `y` will be centered.\n\nReturns\n-------\ncorrelation_coefficient : ndarray of shape (n_features,)\n Pearson's R correlation coefficients of features.\n\nSee Also\n--------\nf_regression: Univariate linear regression tests returning f-statistic\n and p-values\nmutual_info_regression: Mutual information for a continuous target.\nf_classif: ANOVA F-value between label/feature for classification tasks.\nchi2: Chi-squared stats of non-negative features for classification tasks.", + "description": "Compute Pearson's r for each features and the target.\n\nPearson's r is also known as the Pearson correlation coefficient.\n\n.. versionadded:: 1.0\n\nLinear model for testing the individual effect of each of many regressors.\nThis is a scoring function to be used in a feature selection procedure, not\na free standing feature selection procedure.\n\nThe cross correlation between each regressor and the target is computed\nas ((X[:, i] - mean(X[:, i])) * (y - mean_y)) / (std(X[:, i]) * std(y)).\n\nFor more on usage see the :ref:`User Guide `.", + "docstring": "Compute Pearson's r for each features and the target.\n\n Pearson's r is also known as the Pearson correlation coefficient.\n\n .. versionadded:: 1.0\n\n Linear model for testing the individual effect of each of many regressors.\n This is a scoring function to be used in a feature selection procedure, not\n a free standing feature selection procedure.\n\n The cross correlation between each regressor and the target is computed\n as ((X[:, i] - mean(X[:, i])) * (y - mean_y)) / (std(X[:, i]) * std(y)).\n\n For more on usage see the :ref:`User Guide `.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The data matrix.\n\n y : array-like of shape (n_samples,)\n The target vector.\n\n center : bool, default=True\n Whether or not to center the data matrix `X` and the target vector `y`.\n By default, `X` and `y` will be centered.\n\n Returns\n -------\n correlation_coefficient : ndarray of shape (n_features,)\n Pearson's R correlation coefficients of features.\n\n See Also\n --------\n f_regression: Univariate linear regression tests returning f-statistic\n and p-values\n mutual_info_regression: Mutual information for a continuous target.\n f_classif: ANOVA F-value between label/feature for classification tasks.\n chi2: Chi-squared stats of non-negative features for classification tasks.\n ", "source_code": "\ndef r_regression(X, y, *, center=True):\n \"\"\"Compute Pearson's r for each features and the target.\n\n Pearson's r is also known as the Pearson correlation coefficient.\n\n .. 
versionadded:: 1.0\n\n Linear model for testing the individual effect of each of many regressors.\n This is a scoring function to be used in a feature selection procedure, not\n a free standing feature selection procedure.\n\n The cross correlation between each regressor and the target is computed\n as ((X[:, i] - mean(X[:, i])) * (y - mean_y)) / (std(X[:, i]) * std(y)).\n\n For more on usage see the :ref:`User Guide `.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The data matrix.\n\n y : array-like of shape (n_samples,)\n The target vector.\n\n center : bool, default=True\n Whether or not to center the data matrix `X` and the target vector `y`.\n By default, `X` and `y` will be centered.\n\n Returns\n -------\n correlation_coefficient : ndarray of shape (n_features,)\n Pearson's R correlation coefficients of features.\n\n See Also\n --------\n f_regression: Univariate linear regression tests returning f-statistic\n and p-values\n mutual_info_regression: Mutual information for a continuous target.\n f_classif: ANOVA F-value between label/feature for classification tasks.\n chi2: Chi-squared stats of non-negative features for classification tasks.\n \"\"\"\n (X, y) = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'], dtype=np.float64)\n n_samples = X.shape[0]\n if center:\n y = y - np.mean(y)\n if issparse(X):\n X_means = X.mean(axis=0).getA1()\n else:\n X_means = X.mean(axis=0)\n X_norms = np.sqrt(row_norms(X.T, squared=True) - n_samples * X_means**2)\n else:\n X_norms = row_norms(X.T)\n correlation_coefficient = safe_sparse_dot(y, X)\n correlation_coefficient /= X_norms\n correlation_coefficient /= np.linalg.norm(y)\n return correlation_coefficient" }, { @@ -85045,7 +90423,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "threshold", @@ -85055,13 +90434,14 @@ "docstring": { "type": "float, default=0", "description": "Features with a training-set variance lower than this threshold will\nbe removed. The default is to keep all features with non-zero variance,\ni.e. remove the features that have the same value in all samples." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, threshold=0.0):\n self.threshold = threshold" }, { @@ -85079,13 +90459,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _get_support_mask(self):\n check_is_fitted(self)\n return self.variances_ > self.threshold" }, { @@ -85103,13 +90484,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _more_tags(self):\n return {'allow_nan': True}" }, { @@ -85127,7 +90509,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -85137,6 +90520,10 @@ "docstring": { "type": "{array-like, sparse matrix}, shape (n_samples, n_features)", "description": "Data from which to compute variances, where `n_samples` is\nthe number of samples and `n_features` is the number of features." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -85147,13 +90534,14 @@ "docstring": { "type": "any, default=None", "description": "Ignored. 
This parameter exists only for compatibility with\nsklearn.pipeline.Pipeline." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Learn empirical variances from X.", - "docstring": "Learn empirical variances from X.\n\nParameters\n----------\nX : {array-like, sparse matrix}, shape (n_samples, n_features)\n Data from which to compute variances, where `n_samples` is\n the number of samples and `n_features` is the number of features.\n\ny : any, default=None\n Ignored. This parameter exists only for compatibility with\n sklearn.pipeline.Pipeline.\n\nReturns\n-------\nself : object\n Returns the instance itself.", + "docstring": "Learn empirical variances from X.\n\n Parameters\n ----------\n X : {array-like, sparse matrix}, shape (n_samples, n_features)\n Data from which to compute variances, where `n_samples` is\n the number of samples and `n_features` is the number of features.\n\n y : any, default=None\n Ignored. This parameter exists only for compatibility with\n sklearn.pipeline.Pipeline.\n\n Returns\n -------\n self : object\n Returns the instance itself.\n ", "source_code": "\ndef fit(self, X, y=None):\n \"\"\"Learn empirical variances from X.\n\n Parameters\n ----------\n X : {array-like, sparse matrix}, shape (n_samples, n_features)\n Data from which to compute variances, where `n_samples` is\n the number of samples and `n_features` is the number of features.\n\n y : any, default=None\n Ignored. This parameter exists only for compatibility with\n sklearn.pipeline.Pipeline.\n\n Returns\n -------\n self : object\n Returns the instance itself.\n \"\"\"\n X = self._validate_data(X, accept_sparse=('csr', 'csc'), dtype=np.float64, force_all_finite='allow-nan')\n if hasattr(X, 'toarray'):\n (_, self.variances_) = mean_variance_axis(X, axis=0)\n if self.threshold == 0:\n (mins, maxes) = min_max_axis(X, axis=0)\n peak_to_peaks = maxes - mins\n else:\n self.variances_ = np.nanvar(X, axis=0)\n if self.threshold == 0:\n peak_to_peaks = np.ptp(X, axis=0)\n if self.threshold == 0:\n compare_arr = np.array([self.variances_, peak_to_peaks])\n self.variances_ = np.nanmin(compare_arr, axis=0)\n elif self.threshold < 0.0:\n raise ValueError(f'Threshold must be non-negative. Got: {self.threshold}')\n if np.all(~np.isfinite(self.variances_) | (self.variances_ <= self.threshold)):\n msg = 'No feature in X meets the variance threshold {0:.5f}'\n if X.shape[0] == 1:\n msg += ' (X contains only one sample)'\n raise ValueError(msg.format(self.threshold))\n return self" }, { @@ -85171,7 +90559,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "kernel", @@ -85181,7 +90570,8 @@ "docstring": { "type": "kernel instance, default=None", "description": "The kernel specifying the covariance function of the GP. If None is\npassed, the kernel \"1.0 * RBF(1.0)\" is used as default. Note that\nthe kernel's hyperparameters are optimized during fitting." - } + }, + "refined_type": {} }, { "name": "optimizer", @@ -85191,7 +90581,8 @@ "docstring": { "type": "'fmin_l_bfgs_b' or callable, default='fmin_l_bfgs_b'", "description": "Can either be one of the internally supported optimizers for optimizing\nthe kernel's parameters, specified by a string, or an externally\ndefined optimizer passed as a callable. 
If a callable is passed, it\nmust have the signature::\n\n def optimizer(obj_func, initial_theta, bounds):\n # * 'obj_func' is the objective function to be maximized, which\n # takes the hyperparameters theta as parameter and an\n # optional flag eval_gradient, which determines if the\n # gradient is returned additionally to the function value\n # * 'initial_theta': the initial value for theta, which can be\n # used by local optimizers\n # * 'bounds': the bounds on the values of theta\n ....\n # Returned are the best found hyperparameters theta and\n # the corresponding value of the target function.\n return theta_opt, func_min\n\nPer default, the 'L-BFGS-B' algorithm from scipy.optimize.minimize\nis used. If None is passed, the kernel's parameters are kept fixed.\nAvailable internal optimizers are::\n\n 'fmin_l_bfgs_b'" - } + }, + "refined_type": {} }, { "name": "n_restarts_optimizer", @@ -85201,7 +90592,8 @@ "docstring": { "type": "int, default=0", "description": "The number of restarts of the optimizer for finding the kernel's\nparameters which maximize the log-marginal likelihood. The first run\nof the optimizer is performed from the kernel's initial parameters,\nthe remaining ones (if any) from thetas sampled log-uniform randomly\nfrom the space of allowed theta-values. If greater than 0, all bounds\nmust be finite. Note that n_restarts_optimizer=0 implies that one\nrun is performed." - } + }, + "refined_type": {} }, { "name": "max_iter_predict", @@ -85211,7 +90603,8 @@ "docstring": { "type": "int, default=100", "description": "The maximum number of iterations in Newton's method for approximating\nthe posterior during predict. Smaller values will reduce computation\ntime at the cost of worse results." - } + }, + "refined_type": {} }, { "name": "warm_start", @@ -85221,7 +90614,8 @@ "docstring": { "type": "bool, default=False", "description": "If warm-starts are enabled, the solution of the last Newton iteration\non the Laplace approximation of the posterior mode is used as\ninitialization for the next call of _posterior_mode(). This can speed\nup convergence when _posterior_mode is called several times on similar\nproblems as in hyperparameter optimization. See :term:`the Glossary\n`." - } + }, + "refined_type": {} }, { "name": "copy_X_train", @@ -85231,7 +90625,8 @@ "docstring": { "type": "bool, default=True", "description": "If True, a persistent copy of the training data is stored in the\nobject. Otherwise, just a reference to the training data is stored,\nwhich might cause predictions to change if the data is modified\nexternally." - } + }, + "refined_type": {} }, { "name": "random_state", @@ -85241,7 +90636,8 @@ "docstring": { "type": "int, RandomState instance or None, default=None", "description": "Determines random number generation used to initialize the centers.\nPass an int for reproducible results across multiple function calls.\nSee :term:`Glossary `." - } + }, + "refined_type": {} }, { "name": "multi_class", @@ -85251,6 +90647,10 @@ "docstring": { "type": "{'one_vs_rest', 'one_vs_one'}, default='one_vs_rest'", "description": "Specifies how multi-class classification problems are handled.\nSupported are 'one_vs_rest' and 'one_vs_one'. In 'one_vs_rest',\none binary Gaussian process classifier is fitted for each class, which\nis trained to separate this class from the rest. In 'one_vs_one', one\nbinary Gaussian process classifier is fitted for each pair of classes,\nwhich is trained to separate these two classes. 
The predictions of\nthese binary predictors are combined into multi-class predictions.\nNote that 'one_vs_one' does not support predicting probability\nestimates." + }, + "refined_type": { + "kind": "EnumType", + "values": ["one_vs_rest", "one_vs_one"] } }, { @@ -85261,13 +90661,14 @@ "docstring": { "type": "int, default=None", "description": "The number of jobs to use for the computation: the specified\nmulticlass problems are computed in parallel.\n``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n``-1`` means using all processors. See :term:`Glossary `\nfor more details." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, kernel=None, *, optimizer='fmin_l_bfgs_b', n_restarts_optimizer=0, max_iter_predict=100, warm_start=False, copy_X_train=True, random_state=None, multi_class='one_vs_rest', n_jobs=None):\n self.kernel = kernel\n self.optimizer = optimizer\n self.n_restarts_optimizer = n_restarts_optimizer\n self.max_iter_predict = max_iter_predict\n self.warm_start = warm_start\n self.copy_X_train = copy_X_train\n self.random_state = random_state\n self.multi_class = multi_class\n self.n_jobs = n_jobs" }, { @@ -85285,7 +90686,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -85295,7 +90697,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features) or list of object", "description": "Feature vectors or other representations of training data." - } + }, + "refined_type": {} }, { "name": "y", @@ -85305,13 +90708,14 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "Target values, must be binary." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Fit Gaussian process classification model.", - "docstring": "Fit Gaussian process classification model.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features) or list of object\n Feature vectors or other representations of training data.\n\ny : array-like of shape (n_samples,)\n Target values, must be binary.\n\nReturns\n-------\nself : object\n Returns an instance of self.", + "docstring": "Fit Gaussian process classification model.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features) or list of object\n Feature vectors or other representations of training data.\n\n y : array-like of shape (n_samples,)\n Target values, must be binary.\n\n Returns\n -------\n self : object\n Returns an instance of self.\n ", "source_code": "\ndef fit(self, X, y):\n \"\"\"Fit Gaussian process classification model.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features) or list of object\n Feature vectors or other representations of training data.\n\n y : array-like of shape (n_samples,)\n Target values, must be binary.\n\n Returns\n -------\n self : object\n Returns an instance of self.\n \"\"\"\n if self.kernel is None or self.kernel.requires_vector_input:\n (X, y) = self._validate_data(X, y, multi_output=False, ensure_2d=True, dtype='numeric')\n else:\n (X, y) = self._validate_data(X, y, multi_output=False, ensure_2d=False, dtype=None)\n self.base_estimator_ = _BinaryGaussianProcessClassifierLaplace(kernel=self.kernel, optimizer=self.optimizer, n_restarts_optimizer=self.n_restarts_optimizer, max_iter_predict=self.max_iter_predict, warm_start=self.warm_start, copy_X_train=self.copy_X_train, random_state=self.random_state)\n self.classes_ = 
np.unique(y)\n self.n_classes_ = self.classes_.size\n if self.n_classes_ == 1:\n raise ValueError('GaussianProcessClassifier requires 2 or more distinct classes; got %d class (only class %s is present)' % (self.n_classes_, self.classes_[0]))\n if self.n_classes_ > 2:\n if self.multi_class == 'one_vs_rest':\n self.base_estimator_ = OneVsRestClassifier(self.base_estimator_, n_jobs=self.n_jobs)\n elif self.multi_class == 'one_vs_one':\n self.base_estimator_ = OneVsOneClassifier(self.base_estimator_, n_jobs=self.n_jobs)\n else:\n raise ValueError('Unknown multi-class mode %s' % self.multi_class)\n self.base_estimator_.fit(X, y)\n if self.n_classes_ > 2:\n self.log_marginal_likelihood_value_ = np.mean([estimator.log_marginal_likelihood() for estimator in self.base_estimator_.estimators_])\n else:\n self.log_marginal_likelihood_value_ = self.base_estimator_.log_marginal_likelihood()\n return self" }, { @@ -85329,7 +90733,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -85353,7 +90758,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "theta", @@ -85363,7 +90769,8 @@ "docstring": { "type": "array-like of shape (n_kernel_params,), default=None", "description": "Kernel hyperparameters for which the log-marginal likelihood is\nevaluated. In the case of multi-class classification, theta may\nbe the hyperparameters of the compound kernel or of an individual\nkernel. In the latter case, all individual kernel get assigned the\nsame theta values. If None, the precomputed log_marginal_likelihood\nof ``self.kernel_.theta`` is returned." - } + }, + "refined_type": {} }, { "name": "eval_gradient", @@ -85373,7 +90780,8 @@ "docstring": { "type": "bool, default=False", "description": "If True, the gradient of the log-marginal likelihood with respect\nto the kernel hyperparameters at position theta is returned\nadditionally. Note that gradient computation is not supported\nfor non-binary classification. If True, theta must not be None." - } + }, + "refined_type": {} }, { "name": "clone_kernel", @@ -85383,13 +90791,14 @@ "docstring": { "type": "bool, default=True", "description": "If True, the kernel attribute is copied. If False, the kernel\nattribute is modified, but may result in a performance improvement." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Return log-marginal likelihood of theta for training data.\n\nIn the case of multi-class classification, the mean log-marginal likelihood of the one-versus-rest classifiers are returned.", - "docstring": "Return log-marginal likelihood of theta for training data.\n\nIn the case of multi-class classification, the mean log-marginal\nlikelihood of the one-versus-rest classifiers are returned.\n\nParameters\n----------\ntheta : array-like of shape (n_kernel_params,), default=None\n Kernel hyperparameters for which the log-marginal likelihood is\n evaluated. In the case of multi-class classification, theta may\n be the hyperparameters of the compound kernel or of an individual\n kernel. In the latter case, all individual kernel get assigned the\n same theta values. If None, the precomputed log_marginal_likelihood\n of ``self.kernel_.theta`` is returned.\n\neval_gradient : bool, default=False\n If True, the gradient of the log-marginal likelihood with respect\n to the kernel hyperparameters at position theta is returned\n additionally. Note that gradient computation is not supported\n for non-binary classification. 
If True, theta must not be None.\n\nclone_kernel : bool, default=True\n If True, the kernel attribute is copied. If False, the kernel\n attribute is modified, but may result in a performance improvement.\n\nReturns\n-------\nlog_likelihood : float\n Log-marginal likelihood of theta for training data.\n\nlog_likelihood_gradient : ndarray of shape (n_kernel_params,), optional\n Gradient of the log-marginal likelihood with respect to the kernel\n hyperparameters at position theta.\n Only returned when `eval_gradient` is True.", + "description": "Return log-marginal likelihood of theta for training data.\n\nIn the case of multi-class classification, the mean log-marginal\nlikelihood of the one-versus-rest classifiers are returned.", + "docstring": "Return log-marginal likelihood of theta for training data.\n\n In the case of multi-class classification, the mean log-marginal\n likelihood of the one-versus-rest classifiers are returned.\n\n Parameters\n ----------\n theta : array-like of shape (n_kernel_params,), default=None\n Kernel hyperparameters for which the log-marginal likelihood is\n evaluated. In the case of multi-class classification, theta may\n be the hyperparameters of the compound kernel or of an individual\n kernel. In the latter case, all individual kernel get assigned the\n same theta values. If None, the precomputed log_marginal_likelihood\n of ``self.kernel_.theta`` is returned.\n\n eval_gradient : bool, default=False\n If True, the gradient of the log-marginal likelihood with respect\n to the kernel hyperparameters at position theta is returned\n additionally. Note that gradient computation is not supported\n for non-binary classification. If True, theta must not be None.\n\n clone_kernel : bool, default=True\n If True, the kernel attribute is copied. If False, the kernel\n attribute is modified, but may result in a performance improvement.\n\n Returns\n -------\n log_likelihood : float\n Log-marginal likelihood of theta for training data.\n\n log_likelihood_gradient : ndarray of shape (n_kernel_params,), optional\n Gradient of the log-marginal likelihood with respect to the kernel\n hyperparameters at position theta.\n Only returned when `eval_gradient` is True.\n ", "source_code": "\ndef log_marginal_likelihood(self, theta=None, eval_gradient=False, clone_kernel=True):\n \"\"\"Return log-marginal likelihood of theta for training data.\n\n In the case of multi-class classification, the mean log-marginal\n likelihood of the one-versus-rest classifiers are returned.\n\n Parameters\n ----------\n theta : array-like of shape (n_kernel_params,), default=None\n Kernel hyperparameters for which the log-marginal likelihood is\n evaluated. In the case of multi-class classification, theta may\n be the hyperparameters of the compound kernel or of an individual\n kernel. In the latter case, all individual kernel get assigned the\n same theta values. If None, the precomputed log_marginal_likelihood\n of ``self.kernel_.theta`` is returned.\n\n eval_gradient : bool, default=False\n If True, the gradient of the log-marginal likelihood with respect\n to the kernel hyperparameters at position theta is returned\n additionally. Note that gradient computation is not supported\n for non-binary classification. If True, theta must not be None.\n\n clone_kernel : bool, default=True\n If True, the kernel attribute is copied. 
If False, the kernel\n attribute is modified, but may result in a performance improvement.\n\n Returns\n -------\n log_likelihood : float\n Log-marginal likelihood of theta for training data.\n\n log_likelihood_gradient : ndarray of shape (n_kernel_params,), optional\n Gradient of the log-marginal likelihood with respect to the kernel\n hyperparameters at position theta.\n Only returned when `eval_gradient` is True.\n \"\"\"\n check_is_fitted(self)\n if theta is None:\n if eval_gradient:\n raise ValueError('Gradient can only be evaluated for theta!=None')\n return self.log_marginal_likelihood_value_\n theta = np.asarray(theta)\n if self.n_classes_ == 2:\n return self.base_estimator_.log_marginal_likelihood(theta, eval_gradient, clone_kernel=clone_kernel)\n else:\n if eval_gradient:\n raise NotImplementedError('Gradient of log-marginal-likelihood not implemented for multi-class GPC.')\n estimators = self.base_estimator_.estimators_\n n_dims = estimators[0].kernel_.n_dims\n if theta.shape[0] == n_dims:\n return np.mean([estimator.log_marginal_likelihood(theta, clone_kernel=clone_kernel) for (i, estimator) in enumerate(estimators)])\n elif theta.shape[0] == n_dims * self.classes_.shape[0]:\n return np.mean([estimator.log_marginal_likelihood(theta[n_dims * i:n_dims * (i + 1)], clone_kernel=clone_kernel) for (i, estimator) in enumerate(estimators)])\n else:\n raise ValueError('Shape of theta must be either %d or %d. Obtained theta with shape %d.' % (n_dims, n_dims * self.classes_.shape[0], theta.shape[0]))" }, { @@ -85407,7 +90816,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -85417,13 +90827,14 @@ "docstring": { "type": "array-like of shape (n_samples, n_features) or list of object", "description": "Query points where the GP is evaluated for classification." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Perform classification on an array of test vectors X.", - "docstring": "Perform classification on an array of test vectors X.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features) or list of object\n Query points where the GP is evaluated for classification.\n\nReturns\n-------\nC : ndarray of shape (n_samples,)\n Predicted target values for X, values are from ``classes_``.", + "docstring": "Perform classification on an array of test vectors X.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features) or list of object\n Query points where the GP is evaluated for classification.\n\n Returns\n -------\n C : ndarray of shape (n_samples,)\n Predicted target values for X, values are from ``classes_``.\n ", "source_code": "\ndef predict(self, X):\n \"\"\"Perform classification on an array of test vectors X.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features) or list of object\n Query points where the GP is evaluated for classification.\n\n Returns\n -------\n C : ndarray of shape (n_samples,)\n Predicted target values for X, values are from ``classes_``.\n \"\"\"\n check_is_fitted(self)\n if self.kernel is None or self.kernel.requires_vector_input:\n X = self._validate_data(X, ensure_2d=True, dtype='numeric', reset=False)\n else:\n X = self._validate_data(X, ensure_2d=False, dtype=None, reset=False)\n return self.base_estimator_.predict(X)" }, { @@ -85441,7 +90852,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -85451,13 +90863,14 @@ "docstring": { "type": "array-like of shape (n_samples, n_features) or list of object", "description": "Query points where the GP is evaluated for classification." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Return probability estimates for the test vector X.", - "docstring": "Return probability estimates for the test vector X.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features) or list of object\n Query points where the GP is evaluated for classification.\n\nReturns\n-------\nC : array-like of shape (n_samples, n_classes)\n Returns the probability of the samples for each class in\n the model. The columns correspond to the classes in sorted\n order, as they appear in the attribute :term:`classes_`.", + "docstring": "Return probability estimates for the test vector X.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features) or list of object\n Query points where the GP is evaluated for classification.\n\n Returns\n -------\n C : array-like of shape (n_samples, n_classes)\n Returns the probability of the samples for each class in\n the model. The columns correspond to the classes in sorted\n order, as they appear in the attribute :term:`classes_`.\n ", "source_code": "\ndef predict_proba(self, X):\n \"\"\"Return probability estimates for the test vector X.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features) or list of object\n Query points where the GP is evaluated for classification.\n\n Returns\n -------\n C : array-like of shape (n_samples, n_classes)\n Returns the probability of the samples for each class in\n the model. 
The columns correspond to the classes in sorted\n order, as they appear in the attribute :term:`classes_`.\n \"\"\"\n check_is_fitted(self)\n if self.n_classes_ > 2 and self.multi_class == 'one_vs_one':\n raise ValueError('one_vs_one multi-class mode does not support predicting probability estimates. Use one_vs_rest mode instead.')\n if self.kernel is None or self.kernel.requires_vector_input:\n X = self._validate_data(X, ensure_2d=True, dtype='numeric', reset=False)\n else:\n X = self._validate_data(X, ensure_2d=False, dtype=None, reset=False)\n return self.base_estimator_.predict_proba(X)" }, { @@ -85475,7 +90888,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "kernel", @@ -85485,7 +90899,8 @@ "docstring": { "type": "kernel instance, default=None", "description": "The kernel specifying the covariance function of the GP. If None is\npassed, the kernel \"1.0 * RBF(1.0)\" is used as default. Note that\nthe kernel's hyperparameters are optimized during fitting." - } + }, + "refined_type": {} }, { "name": "optimizer", @@ -85495,7 +90910,8 @@ "docstring": { "type": "'fmin_l_bfgs_b' or callable, default='fmin_l_bfgs_b'", "description": "Can either be one of the internally supported optimizers for optimizing\nthe kernel's parameters, specified by a string, or an externally\ndefined optimizer passed as a callable. If a callable is passed, it\nmust have the signature::\n\n def optimizer(obj_func, initial_theta, bounds):\n # * 'obj_func' is the objective function to be maximized, which\n # takes the hyperparameters theta as parameter and an\n # optional flag eval_gradient, which determines if the\n # gradient is returned additionally to the function value\n # * 'initial_theta': the initial value for theta, which can be\n # used by local optimizers\n # * 'bounds': the bounds on the values of theta\n ....\n # Returned are the best found hyperparameters theta and\n # the corresponding value of the target function.\n return theta_opt, func_min\n\nPer default, the 'L-BFGS-B' algorithm from scipy.optimize.minimize\nis used. If None is passed, the kernel's parameters are kept fixed.\nAvailable internal optimizers are::\n\n 'fmin_l_bfgs_b'" - } + }, + "refined_type": {} }, { "name": "n_restarts_optimizer", @@ -85505,7 +90921,8 @@ "docstring": { "type": "int, default=0", "description": "The number of restarts of the optimizer for finding the kernel's\nparameters which maximize the log-marginal likelihood. The first run\nof the optimizer is performed from the kernel's initial parameters,\nthe remaining ones (if any) from thetas sampled log-uniform randomly\nfrom the space of allowed theta-values. If greater than 0, all bounds\nmust be finite. Note that n_restarts_optimizer=0 implies that one\nrun is performed." - } + }, + "refined_type": {} }, { "name": "max_iter_predict", @@ -85515,7 +90932,8 @@ "docstring": { "type": "int, default=100", "description": "The maximum number of iterations in Newton's method for approximating\nthe posterior during predict. Smaller values will reduce computation\ntime at the cost of worse results." - } + }, + "refined_type": {} }, { "name": "warm_start", @@ -85525,7 +90943,8 @@ "docstring": { "type": "bool, default=False", "description": "If warm-starts are enabled, the solution of the last Newton iteration\non the Laplace approximation of the posterior mode is used as\ninitialization for the next call of _posterior_mode(). 
This can speed\nup convergence when _posterior_mode is called several times on similar\nproblems as in hyperparameter optimization. See :term:`the Glossary\n`." - } + }, + "refined_type": {} }, { "name": "copy_X_train", @@ -85535,7 +90954,8 @@ "docstring": { "type": "bool, default=True", "description": "If True, a persistent copy of the training data is stored in the\nobject. Otherwise, just a reference to the training data is stored,\nwhich might cause predictions to change if the data is modified\nexternally." - } + }, + "refined_type": {} }, { "name": "random_state", @@ -85545,13 +90965,14 @@ "docstring": { "type": "int, RandomState instance or None, default=None", "description": "Determines random number generation used to initialize the centers.\nPass an int for reproducible results across multiple function calls.\nSee :term:`Glossary `." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, kernel=None, *, optimizer='fmin_l_bfgs_b', n_restarts_optimizer=0, max_iter_predict=100, warm_start=False, copy_X_train=True, random_state=None):\n self.kernel = kernel\n self.optimizer = optimizer\n self.n_restarts_optimizer = n_restarts_optimizer\n self.max_iter_predict = max_iter_predict\n self.warm_start = warm_start\n self.copy_X_train = copy_X_train\n self.random_state = random_state" }, { @@ -85569,7 +90990,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "obj_func", @@ -85579,7 +91001,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "initial_theta", @@ -85589,7 +91012,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "bounds", @@ -85599,13 +91023,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _constrained_optimization(self, obj_func, initial_theta, bounds):\n if self.optimizer == 'fmin_l_bfgs_b':\n opt_res = scipy.optimize.minimize(obj_func, initial_theta, method='L-BFGS-B', jac=True, bounds=bounds)\n _check_optimize_result('lbfgs', opt_res)\n (theta_opt, func_min) = (opt_res.x, opt_res.fun)\n elif callable(self.optimizer):\n (theta_opt, func_min) = self.optimizer(obj_func, initial_theta, bounds=bounds)\n else:\n raise ValueError('Unknown optimizer %s.' 
% self.optimizer)\n return theta_opt, func_min" }, { @@ -85623,7 +91048,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "K", @@ -85633,7 +91059,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "return_temporaries", @@ -85643,13 +91070,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Mode-finding for binary Laplace GPC and fixed kernel.\n\nThis approximates the posterior of the latent function values for given inputs and target observations with a Gaussian approximation and uses Newton's iteration to find the mode of this approximation.", - "docstring": "Mode-finding for binary Laplace GPC and fixed kernel.\n\nThis approximates the posterior of the latent function values for given\ninputs and target observations with a Gaussian approximation and uses\nNewton's iteration to find the mode of this approximation.", + "description": "Mode-finding for binary Laplace GPC and fixed kernel.\n\nThis approximates the posterior of the latent function values for given\ninputs and target observations with a Gaussian approximation and uses\nNewton's iteration to find the mode of this approximation.", + "docstring": "Mode-finding for binary Laplace GPC and fixed kernel.\n\n This approximates the posterior of the latent function values for given\n inputs and target observations with a Gaussian approximation and uses\n Newton's iteration to find the mode of this approximation.\n ", "source_code": "\ndef _posterior_mode(self, K, return_temporaries=False):\n \"\"\"Mode-finding for binary Laplace GPC and fixed kernel.\n\n This approximates the posterior of the latent function values for given\n inputs and target observations with a Gaussian approximation and uses\n Newton's iteration to find the mode of this approximation.\n \"\"\"\n if self.warm_start and hasattr(self, 'f_cached') and self.f_cached.shape == self.y_train_.shape:\n f = self.f_cached\n else:\n f = np.zeros_like(self.y_train_, dtype=np.float64)\n log_marginal_likelihood = -np.inf\n for _ in range(self.max_iter_predict):\n pi = expit(f)\n W = pi * (1 - pi)\n W_sr = np.sqrt(W)\n W_sr_K = W_sr[:, np.newaxis] * K\n B = np.eye(W.shape[0]) + W_sr_K * W_sr\n L = cholesky(B, lower=True)\n b = W * f + (self.y_train_ - pi)\n a = b - W_sr * cho_solve((L, True), W_sr_K.dot(b))\n f = K.dot(a)\n lml = -0.5 * a.T.dot(f) - np.log1p(np.exp(-(self.y_train_ * 2 - 1) * f)).sum() - np.log(np.diag(L)).sum()\n if lml - log_marginal_likelihood < 1e-10:\n break\n log_marginal_likelihood = lml\n self.f_cached = f\n if return_temporaries:\n return log_marginal_likelihood, (pi, W_sr, L, b, a)\n else:\n return log_marginal_likelihood" }, { @@ -85667,7 +91095,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -85677,7 +91106,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features) or list of object", "description": "Feature vectors or other representations of training data." - } + }, + "refined_type": {} }, { "name": "y", @@ -85687,13 +91117,14 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "Target values, must be binary." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Fit Gaussian process classification model.", - "docstring": "Fit Gaussian process classification model.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features) or list of object\n Feature vectors or other representations of training data.\n\ny : array-like of shape (n_samples,)\n Target values, must be binary.\n\nReturns\n-------\nself : returns an instance of self.", + "docstring": "Fit Gaussian process classification model.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features) or list of object\n Feature vectors or other representations of training data.\n\n y : array-like of shape (n_samples,)\n Target values, must be binary.\n\n Returns\n -------\n self : returns an instance of self.\n ", "source_code": "\ndef fit(self, X, y):\n \"\"\"Fit Gaussian process classification model.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features) or list of object\n Feature vectors or other representations of training data.\n\n y : array-like of shape (n_samples,)\n Target values, must be binary.\n\n Returns\n -------\n self : returns an instance of self.\n \"\"\"\n if self.kernel is None:\n self.kernel_ = C(1.0, constant_value_bounds='fixed') * RBF(1.0, length_scale_bounds='fixed')\n else:\n self.kernel_ = clone(self.kernel)\n self.rng = check_random_state(self.random_state)\n self.X_train_ = np.copy(X) if self.copy_X_train else X\n label_encoder = LabelEncoder()\n self.y_train_ = label_encoder.fit_transform(y)\n self.classes_ = label_encoder.classes_\n if self.classes_.size > 2:\n raise ValueError('%s supports only binary classification. y contains classes %s' % (self.__class__.__name__, self.classes_))\n elif self.classes_.size == 1:\n raise ValueError('{0:s} requires 2 classes; got {1:d} class'.format(self.__class__.__name__, self.classes_.size))\n if self.optimizer is not None and self.kernel_.n_dims > 0:\n \n def obj_func(theta, eval_gradient=True):\n if eval_gradient:\n (lml, grad) = self.log_marginal_likelihood(theta, eval_gradient=True, clone_kernel=False)\n return -lml, -grad\n else:\n return -self.log_marginal_likelihood(theta, clone_kernel=False)\n optima = [self._constrained_optimization(obj_func, self.kernel_.theta, self.kernel_.bounds)]\n if self.n_restarts_optimizer > 0:\n if not np.isfinite(self.kernel_.bounds).all():\n raise ValueError('Multiple optimizer restarts (n_restarts_optimizer>0) requires that all bounds are finite.')\n bounds = self.kernel_.bounds\n for iteration in range(self.n_restarts_optimizer):\n theta_initial = np.exp(self.rng.uniform(bounds[:, 0], bounds[:, 1]))\n optima.append(self._constrained_optimization(obj_func, theta_initial, bounds))\n lml_values = list(map(itemgetter(1), optima))\n self.kernel_.theta = optima[np.argmin(lml_values)][0]\n self.kernel_._check_bounds_params()\n self.log_marginal_likelihood_value_ = -np.min(lml_values)\n else:\n self.log_marginal_likelihood_value_ = self.log_marginal_likelihood(self.kernel_.theta)\n K = self.kernel_(self.X_train_)\n (_, (self.pi_, self.W_sr_, self.L_, _, _)) = self._posterior_mode(K, return_temporaries=True)\n return self" }, { @@ -85711,7 +91142,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "theta", @@ -85721,7 +91153,8 @@ "docstring": { "type": "array-like of shape (n_kernel_params,), default=None", "description": "Kernel hyperparameters for which the log-marginal likelihood is\nevaluated. 
If None, the precomputed log_marginal_likelihood\nof ``self.kernel_.theta`` is returned." - } + }, + "refined_type": {} }, { "name": "eval_gradient", @@ -85731,7 +91164,8 @@ "docstring": { "type": "bool, default=False", "description": "If True, the gradient of the log-marginal likelihood with respect\nto the kernel hyperparameters at position theta is returned\nadditionally. If True, theta must not be None." - } + }, + "refined_type": {} }, { "name": "clone_kernel", @@ -85741,13 +91175,14 @@ "docstring": { "type": "bool, default=True", "description": "If True, the kernel attribute is copied. If False, the kernel\nattribute is modified, but may result in a performance improvement." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Returns log-marginal likelihood of theta for training data.", - "docstring": "Returns log-marginal likelihood of theta for training data.\n\nParameters\n----------\ntheta : array-like of shape (n_kernel_params,), default=None\n Kernel hyperparameters for which the log-marginal likelihood is\n evaluated. If None, the precomputed log_marginal_likelihood\n of ``self.kernel_.theta`` is returned.\n\neval_gradient : bool, default=False\n If True, the gradient of the log-marginal likelihood with respect\n to the kernel hyperparameters at position theta is returned\n additionally. If True, theta must not be None.\n\nclone_kernel : bool, default=True\n If True, the kernel attribute is copied. If False, the kernel\n attribute is modified, but may result in a performance improvement.\n\nReturns\n-------\nlog_likelihood : float\n Log-marginal likelihood of theta for training data.\n\nlog_likelihood_gradient : ndarray of shape (n_kernel_params,), optional\n Gradient of the log-marginal likelihood with respect to the kernel\n hyperparameters at position theta.\n Only returned when `eval_gradient` is True.", + "docstring": "Returns log-marginal likelihood of theta for training data.\n\n Parameters\n ----------\n theta : array-like of shape (n_kernel_params,), default=None\n Kernel hyperparameters for which the log-marginal likelihood is\n evaluated. If None, the precomputed log_marginal_likelihood\n of ``self.kernel_.theta`` is returned.\n\n eval_gradient : bool, default=False\n If True, the gradient of the log-marginal likelihood with respect\n to the kernel hyperparameters at position theta is returned\n additionally. If True, theta must not be None.\n\n clone_kernel : bool, default=True\n If True, the kernel attribute is copied. If False, the kernel\n attribute is modified, but may result in a performance improvement.\n\n Returns\n -------\n log_likelihood : float\n Log-marginal likelihood of theta for training data.\n\n log_likelihood_gradient : ndarray of shape (n_kernel_params,), optional\n Gradient of the log-marginal likelihood with respect to the kernel\n hyperparameters at position theta.\n Only returned when `eval_gradient` is True.\n ", "source_code": "\ndef log_marginal_likelihood(self, theta=None, eval_gradient=False, clone_kernel=True):\n \"\"\"Returns log-marginal likelihood of theta for training data.\n\n Parameters\n ----------\n theta : array-like of shape (n_kernel_params,), default=None\n Kernel hyperparameters for which the log-marginal likelihood is\n evaluated. 
If None, the precomputed log_marginal_likelihood\n of ``self.kernel_.theta`` is returned.\n\n eval_gradient : bool, default=False\n If True, the gradient of the log-marginal likelihood with respect\n to the kernel hyperparameters at position theta is returned\n additionally. If True, theta must not be None.\n\n clone_kernel : bool, default=True\n If True, the kernel attribute is copied. If False, the kernel\n attribute is modified, but may result in a performance improvement.\n\n Returns\n -------\n log_likelihood : float\n Log-marginal likelihood of theta for training data.\n\n log_likelihood_gradient : ndarray of shape (n_kernel_params,), optional\n Gradient of the log-marginal likelihood with respect to the kernel\n hyperparameters at position theta.\n Only returned when `eval_gradient` is True.\n \"\"\"\n if theta is None:\n if eval_gradient:\n raise ValueError('Gradient can only be evaluated for theta!=None')\n return self.log_marginal_likelihood_value_\n if clone_kernel:\n kernel = self.kernel_.clone_with_theta(theta)\n else:\n kernel = self.kernel_\n kernel.theta = theta\n if eval_gradient:\n (K, K_gradient) = kernel(self.X_train_, eval_gradient=True)\n else:\n K = kernel(self.X_train_)\n (Z, (pi, W_sr, L, b, a)) = self._posterior_mode(K, return_temporaries=True)\n if not eval_gradient:\n return Z\n d_Z = np.empty(theta.shape[0])\n R = W_sr[:, np.newaxis] * cho_solve((L, True), np.diag(W_sr))\n C = solve(L, W_sr[:, np.newaxis] * K)\n s_2 = -0.5 * (np.diag(K) - np.einsum('ij, ij -> j', C, C)) * (pi * (1 - pi) * (1 - 2 * pi))\n for j in range(d_Z.shape[0]):\n C = K_gradient[:, :, j]\n s_1 = 0.5 * a.T.dot(C).dot(a) - 0.5 * R.T.ravel().dot(C.ravel())\n b = C.dot(self.y_train_ - pi)\n s_3 = b - K.dot(R.dot(b))\n d_Z[j] = s_1 + s_2.T.dot(s_3)\n return Z, d_Z" }, { @@ -85765,7 +91200,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -85775,13 +91211,14 @@ "docstring": { "type": "array-like of shape (n_samples, n_features) or list of object", "description": "Query points where the GP is evaluated for classification." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Perform classification on an array of test vectors X.", - "docstring": "Perform classification on an array of test vectors X.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features) or list of object\n Query points where the GP is evaluated for classification.\n\nReturns\n-------\nC : ndarray of shape (n_samples,)\n Predicted target values for X, values are from ``classes_``", + "docstring": "Perform classification on an array of test vectors X.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features) or list of object\n Query points where the GP is evaluated for classification.\n\n Returns\n -------\n C : ndarray of shape (n_samples,)\n Predicted target values for X, values are from ``classes_``\n ", "source_code": "\ndef predict(self, X):\n \"\"\"Perform classification on an array of test vectors X.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features) or list of object\n Query points where the GP is evaluated for classification.\n\n Returns\n -------\n C : ndarray of shape (n_samples,)\n Predicted target values for X, values are from ``classes_``\n \"\"\"\n check_is_fitted(self)\n K_star = self.kernel_(self.X_train_, X)\n f_star = K_star.T.dot(self.y_train_ - self.pi_)\n return np.where(f_star > 0, self.classes_[1], self.classes_[0])" }, { @@ -85799,7 +91236,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -85809,13 +91247,14 @@ "docstring": { "type": "array-like of shape (n_samples, n_features) or list of object", "description": "Query points where the GP is evaluated for classification." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Return probability estimates for the test vector X.", - "docstring": "Return probability estimates for the test vector X.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features) or list of object\n Query points where the GP is evaluated for classification.\n\nReturns\n-------\nC : array-like of shape (n_samples, n_classes)\n Returns the probability of the samples for each class in\n the model. The columns correspond to the classes in sorted\n order, as they appear in the attribute ``classes_``.", + "docstring": "Return probability estimates for the test vector X.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features) or list of object\n Query points where the GP is evaluated for classification.\n\n Returns\n -------\n C : array-like of shape (n_samples, n_classes)\n Returns the probability of the samples for each class in\n the model. The columns correspond to the classes in sorted\n order, as they appear in the attribute ``classes_``.\n ", "source_code": "\ndef predict_proba(self, X):\n \"\"\"Return probability estimates for the test vector X.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features) or list of object\n Query points where the GP is evaluated for classification.\n\n Returns\n -------\n C : array-like of shape (n_samples, n_classes)\n Returns the probability of the samples for each class in\n the model. 
The columns correspond to the classes in sorted\n order, as they appear in the attribute ``classes_``.\n \"\"\"\n check_is_fitted(self)\n K_star = self.kernel_(self.X_train_, X)\n f_star = K_star.T.dot(self.y_train_ - self.pi_)\n v = solve(self.L_, self.W_sr_[:, np.newaxis] * K_star)\n var_f_star = self.kernel_.diag(X) - np.einsum('ij,ij->j', v, v)\n alpha = 1 / (2 * var_f_star)\n gamma = LAMBDAS * f_star\n integrals = np.sqrt(np.pi / alpha) * erf(gamma * np.sqrt(alpha / (alpha + LAMBDAS**2))) / (2 * np.sqrt(var_f_star * 2 * np.pi))\n pi_star = (COEFS * integrals).sum(axis=0) + 0.5 * COEFS.sum()\n return np.vstack((1 - pi_star, pi_star)).T" }, { @@ -85833,7 +91272,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "kernel", @@ -85843,7 +91283,8 @@ "docstring": { "type": "kernel instance, default=None", "description": "The kernel specifying the covariance function of the GP. If None is\npassed, the kernel ``ConstantKernel(1.0, constant_value_bounds=\"fixed\"\n* RBF(1.0, length_scale_bounds=\"fixed\")`` is used as default. Note that\nthe kernel hyperparameters are optimized during fitting unless the\nbounds are marked as \"fixed\"." - } + }, + "refined_type": {} }, { "name": "alpha", @@ -85853,7 +91294,8 @@ "docstring": { "type": "float or ndarray of shape (n_samples,), default=1e-10", "description": "Value added to the diagonal of the kernel matrix during fitting.\nThis can prevent a potential numerical issue during fitting, by\nensuring that the calculated values form a positive definite matrix.\nIt can also be interpreted as the variance of additional Gaussian\nmeasurement noise on the training observations. Note that this is\ndifferent from using a `WhiteKernel`. If an array is passed, it must\nhave the same number of entries as the data used for fitting and is\nused as datapoint-dependent noise level. Allowing to specify the\nnoise level directly as a parameter is mainly for convenience and\nfor consistency with :class:`~sklearn.linear_model.Ridge`." - } + }, + "refined_type": {} }, { "name": "optimizer", @@ -85863,6 +91305,10 @@ "docstring": { "type": "\"fmin_l_bfgs_b\" or callable, default=\"fmin_l_bfgs_b\"", "description": "Can either be one of the internally supported optimizers for optimizing\nthe kernel's parameters, specified by a string, or an externally\ndefined optimizer passed as a callable. If a callable is passed, it\nmust have the signature::\n\n def optimizer(obj_func, initial_theta, bounds):\n # * 'obj_func': the objective function to be minimized, which\n # takes the hyperparameters theta as a parameter and an\n # optional flag eval_gradient, which determines if the\n # gradient is returned additionally to the function value\n # * 'initial_theta': the initial value for theta, which can be\n # used by local optimizers\n # * 'bounds': the bounds on the values of theta\n ....\n # Returned are the best found hyperparameters theta and\n # the corresponding value of the target function.\n return theta_opt, func_min\n\nPer default, the L-BFGS-B algorithm from `scipy.optimize.minimize`\nis used. If None is passed, the kernel's parameters are kept fixed.\nAvailable internal optimizers are: `{'fmin_l_bfgs_b'}`." + }, + "refined_type": { + "kind": "EnumType", + "values": ["fmin_l_bfgs_b"] } }, { @@ -85873,7 +91319,8 @@ "docstring": { "type": "int, default=0", "description": "The number of restarts of the optimizer for finding the kernel's\nparameters which maximize the log-marginal likelihood. 
The first run\nof the optimizer is performed from the kernel's initial parameters,\nthe remaining ones (if any) from thetas sampled log-uniform randomly\nfrom the space of allowed theta-values. If greater than 0, all bounds\nmust be finite. Note that `n_restarts_optimizer == 0` implies that one\nrun is performed." - } + }, + "refined_type": {} }, { "name": "normalize_y", @@ -85882,8 +91329,9 @@ "assigned_by": "NAME_ONLY", "docstring": { "type": "bool, default=False", - "description": "Whether or not to normalized the target values `y` by removing the mean\nand scaling to unit-variance. This is recommended for cases where\nzero-mean, unit-variance priors are used. Note that, in this\nimplementation, the normalisation is reversed before the GP predictions\nare reported.\n\n.. versionchanged:: 0.23" - } + "description": "Whether or not to normalize the target values `y` by removing the mean\nand scaling to unit-variance. This is recommended for cases where\nzero-mean, unit-variance priors are used. Note that, in this\nimplementation, the normalisation is reversed before the GP predictions\nare reported.\n\n.. versionchanged:: 0.23" + }, + "refined_type": {} }, { "name": "copy_X_train", @@ -85893,7 +91341,8 @@ "docstring": { "type": "bool, default=True", "description": "If True, a persistent copy of the training data is stored in the\nobject. Otherwise, just a reference to the training data is stored,\nwhich might cause predictions to change if the data is modified\nexternally." - } + }, + "refined_type": {} }, { "name": "random_state", @@ -85903,13 +91352,14 @@ "docstring": { "type": "int, RandomState instance or None, default=None", "description": "Determines random number generation used to initialize the centers.\nPass an int for reproducible results across multiple function calls.\nSee :term:`Glossary `." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, kernel=None, *, alpha=1e-10, optimizer='fmin_l_bfgs_b', n_restarts_optimizer=0, normalize_y=False, copy_X_train=True, random_state=None):\n self.kernel = kernel\n self.alpha = alpha\n self.optimizer = optimizer\n self.n_restarts_optimizer = n_restarts_optimizer\n self.normalize_y = normalize_y\n self.copy_X_train = copy_X_train\n self.random_state = random_state" }, { @@ -85927,7 +91377,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "obj_func", @@ -85937,7 +91388,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "initial_theta", @@ -85947,7 +91399,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "bounds", @@ -85957,13 +91410,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _constrained_optimization(self, obj_func, initial_theta, bounds):\n if self.optimizer == 'fmin_l_bfgs_b':\n opt_res = scipy.optimize.minimize(obj_func, initial_theta, method='L-BFGS-B', jac=True, bounds=bounds)\n _check_optimize_result('lbfgs', opt_res)\n (theta_opt, func_min) = (opt_res.x, opt_res.fun)\n elif callable(self.optimizer):\n (theta_opt, func_min) = self.optimizer(obj_func, initial_theta, bounds=bounds)\n else:\n raise ValueError(f'Unknown optimizer {self.optimizer}.')\n return theta_opt, func_min" }, { @@ -85981,13 +91435,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _more_tags(self):\n return {'requires_fit': False}" }, { @@ -86005,7 +91460,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -86015,7 +91471,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features) or list of object", "description": "Feature vectors or other representations of training data." - } + }, + "refined_type": {} }, { "name": "y", @@ -86025,13 +91482,14 @@ "docstring": { "type": "array-like of shape (n_samples,) or (n_samples, n_targets)", "description": "Target values." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Fit Gaussian process regression model.", - "docstring": "Fit Gaussian process regression model.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features) or list of object\n Feature vectors or other representations of training data.\n\ny : array-like of shape (n_samples,) or (n_samples, n_targets)\n Target values.\n\nReturns\n-------\nself : object\n GaussianProcessRegressor class instance.", + "docstring": "Fit Gaussian process regression model.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features) or list of object\n Feature vectors or other representations of training data.\n\n y : array-like of shape (n_samples,) or (n_samples, n_targets)\n Target values.\n\n Returns\n -------\n self : object\n GaussianProcessRegressor class instance.\n ", "source_code": "\ndef fit(self, X, y):\n \"\"\"Fit Gaussian process regression model.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features) or list of object\n Feature vectors or other representations of training data.\n\n y : array-like of shape (n_samples,) or (n_samples, n_targets)\n Target values.\n\n Returns\n -------\n self : object\n GaussianProcessRegressor class instance.\n \"\"\"\n if self.kernel is None:\n self.kernel_ = C(1.0, constant_value_bounds='fixed') * RBF(1.0, length_scale_bounds='fixed')\n else:\n self.kernel_ = clone(self.kernel)\n self._rng = check_random_state(self.random_state)\n if self.kernel_.requires_vector_input:\n (dtype, ensure_2d) = ('numeric', True)\n else:\n (dtype, ensure_2d) = (None, False)\n (X, y) = self._validate_data(X, y, multi_output=True, y_numeric=True, ensure_2d=ensure_2d, dtype=dtype)\n if self.normalize_y:\n self._y_train_mean = np.mean(y, axis=0)\n self._y_train_std = _handle_zeros_in_scale(np.std(y, axis=0), copy=False)\n y = (y - self._y_train_mean) / self._y_train_std\n else:\n self._y_train_mean = np.zeros(1)\n self._y_train_std = 1\n if np.iterable(self.alpha) and self.alpha.shape[0] != y.shape[0]:\n if self.alpha.shape[0] == 1:\n self.alpha = self.alpha[0]\n else:\n raise ValueError(f'alpha must be a scalar or an array with same number of entries as y. 
({self.alpha.shape[0]} != {y.shape[0]})')\n self.X_train_ = np.copy(X) if self.copy_X_train else X\n self.y_train_ = np.copy(y) if self.copy_X_train else y\n if self.optimizer is not None and self.kernel_.n_dims > 0:\n \n def obj_func(theta, eval_gradient=True):\n if eval_gradient:\n (lml, grad) = self.log_marginal_likelihood(theta, eval_gradient=True, clone_kernel=False)\n return -lml, -grad\n else:\n return -self.log_marginal_likelihood(theta, clone_kernel=False)\n optima = [self._constrained_optimization(obj_func, self.kernel_.theta, self.kernel_.bounds)]\n if self.n_restarts_optimizer > 0:\n if not np.isfinite(self.kernel_.bounds).all():\n raise ValueError('Multiple optimizer restarts (n_restarts_optimizer>0) requires that all bounds are finite.')\n bounds = self.kernel_.bounds\n for iteration in range(self.n_restarts_optimizer):\n theta_initial = self._rng.uniform(bounds[:, 0], bounds[:, 1])\n optima.append(self._constrained_optimization(obj_func, theta_initial, bounds))\n lml_values = list(map(itemgetter(1), optima))\n self.kernel_.theta = optima[np.argmin(lml_values)][0]\n self.kernel_._check_bounds_params()\n self.log_marginal_likelihood_value_ = -np.min(lml_values)\n else:\n self.log_marginal_likelihood_value_ = self.log_marginal_likelihood(self.kernel_.theta, clone_kernel=False)\n K = self.kernel_(self.X_train_)\n K[np.diag_indices_from(K)] += self.alpha\n try:\n self.L_ = cholesky(K, lower=GPR_CHOLESKY_LOWER, check_finite=False)\n except np.linalg.LinAlgError as exc:\n exc.args = (f\"The kernel, {self.kernel_}, is not returning a positive definite matrix. Try gradually increasing the 'alpha' parameter of your GaussianProcessRegressor estimator.\", ) + exc.args\n raise\n self.alpha_ = cho_solve((self.L_, GPR_CHOLESKY_LOWER), self.y_train_, check_finite=False)\n return self" }, { @@ -86049,7 +91507,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "theta", @@ -86059,7 +91518,8 @@ "docstring": { "type": "array-like of shape (n_kernel_params,) default=None", "description": "Kernel hyperparameters for which the log-marginal likelihood is\nevaluated. If None, the precomputed log_marginal_likelihood\nof ``self.kernel_.theta`` is returned." - } + }, + "refined_type": {} }, { "name": "eval_gradient", @@ -86069,7 +91529,8 @@ "docstring": { "type": "bool, default=False", "description": "If True, the gradient of the log-marginal likelihood with respect\nto the kernel hyperparameters at position theta is returned\nadditionally. If True, theta must not be None." - } + }, + "refined_type": {} }, { "name": "clone_kernel", @@ -86079,13 +91540,14 @@ "docstring": { "type": "bool, default=True", "description": "If True, the kernel attribute is copied. If False, the kernel\nattribute is modified, but may result in a performance improvement." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Return log-marginal likelihood of theta for training data.", - "docstring": "Return log-marginal likelihood of theta for training data.\n\nParameters\n----------\ntheta : array-like of shape (n_kernel_params,) default=None\n Kernel hyperparameters for which the log-marginal likelihood is\n evaluated. If None, the precomputed log_marginal_likelihood\n of ``self.kernel_.theta`` is returned.\n\neval_gradient : bool, default=False\n If True, the gradient of the log-marginal likelihood with respect\n to the kernel hyperparameters at position theta is returned\n additionally. 
If True, theta must not be None.\n\nclone_kernel : bool, default=True\n If True, the kernel attribute is copied. If False, the kernel\n attribute is modified, but may result in a performance improvement.\n\nReturns\n-------\nlog_likelihood : float\n Log-marginal likelihood of theta for training data.\n\nlog_likelihood_gradient : ndarray of shape (n_kernel_params,), optional\n Gradient of the log-marginal likelihood with respect to the kernel\n hyperparameters at position theta.\n Only returned when eval_gradient is True.", + "docstring": "Return log-marginal likelihood of theta for training data.\n\n Parameters\n ----------\n theta : array-like of shape (n_kernel_params,) default=None\n Kernel hyperparameters for which the log-marginal likelihood is\n evaluated. If None, the precomputed log_marginal_likelihood\n of ``self.kernel_.theta`` is returned.\n\n eval_gradient : bool, default=False\n If True, the gradient of the log-marginal likelihood with respect\n to the kernel hyperparameters at position theta is returned\n additionally. If True, theta must not be None.\n\n clone_kernel : bool, default=True\n If True, the kernel attribute is copied. If False, the kernel\n attribute is modified, but may result in a performance improvement.\n\n Returns\n -------\n log_likelihood : float\n Log-marginal likelihood of theta for training data.\n\n log_likelihood_gradient : ndarray of shape (n_kernel_params,), optional\n Gradient of the log-marginal likelihood with respect to the kernel\n hyperparameters at position theta.\n Only returned when eval_gradient is True.\n ", "source_code": "\ndef log_marginal_likelihood(self, theta=None, eval_gradient=False, clone_kernel=True):\n \"\"\"Return log-marginal likelihood of theta for training data.\n\n Parameters\n ----------\n theta : array-like of shape (n_kernel_params,) default=None\n Kernel hyperparameters for which the log-marginal likelihood is\n evaluated. If None, the precomputed log_marginal_likelihood\n of ``self.kernel_.theta`` is returned.\n\n eval_gradient : bool, default=False\n If True, the gradient of the log-marginal likelihood with respect\n to the kernel hyperparameters at position theta is returned\n additionally. If True, theta must not be None.\n\n clone_kernel : bool, default=True\n If True, the kernel attribute is copied. 
If False, the kernel\n attribute is modified, but may result in a performance improvement.\n\n Returns\n -------\n log_likelihood : float\n Log-marginal likelihood of theta for training data.\n\n log_likelihood_gradient : ndarray of shape (n_kernel_params,), optional\n Gradient of the log-marginal likelihood with respect to the kernel\n hyperparameters at position theta.\n Only returned when eval_gradient is True.\n \"\"\"\n if theta is None:\n if eval_gradient:\n raise ValueError('Gradient can only be evaluated for theta!=None')\n return self.log_marginal_likelihood_value_\n if clone_kernel:\n kernel = self.kernel_.clone_with_theta(theta)\n else:\n kernel = self.kernel_\n kernel.theta = theta\n if eval_gradient:\n (K, K_gradient) = kernel(self.X_train_, eval_gradient=True)\n else:\n K = kernel(self.X_train_)\n K[np.diag_indices_from(K)] += self.alpha\n try:\n L = cholesky(K, lower=GPR_CHOLESKY_LOWER, check_finite=False)\n except np.linalg.LinAlgError:\n return (-np.inf, np.zeros_like(theta)) if eval_gradient else -np.inf\n y_train = self.y_train_\n if y_train.ndim == 1:\n y_train = y_train[:, np.newaxis]\n alpha = cho_solve((L, GPR_CHOLESKY_LOWER), y_train, check_finite=False)\n log_likelihood_dims = -0.5 * np.einsum('ik,ik->k', y_train, alpha)\n log_likelihood_dims -= np.log(np.diag(L)).sum()\n log_likelihood_dims -= K.shape[0] / 2 * np.log(2 * np.pi)\n log_likelihood = log_likelihood_dims.sum(axis=-1)\n if eval_gradient:\n inner_term = np.einsum('ik,jk->ijk', alpha, alpha)\n K_inv = cho_solve((L, GPR_CHOLESKY_LOWER), np.eye(K.shape[0]), check_finite=False)\n inner_term -= K_inv[..., np.newaxis]\n log_likelihood_gradient_dims = 0.5 * np.einsum('ijl,jik->kl', inner_term, K_gradient)\n log_likelihood_gradient = log_likelihood_gradient_dims.sum(axis=-1)\n if eval_gradient:\n return log_likelihood, log_likelihood_gradient\n else:\n return log_likelihood" }, { @@ -86103,7 +91565,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -86113,7 +91576,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features) or list of object", "description": "Query points where the GP is evaluated." - } + }, + "refined_type": {} }, { "name": "return_std", @@ -86123,7 +91587,8 @@ "docstring": { "type": "bool, default=False", "description": "If True, the standard-deviation of the predictive distribution at\nthe query points is returned along with the mean." - } + }, + "refined_type": {} }, { "name": "return_cov", @@ -86133,13 +91598,14 @@ "docstring": { "type": "bool, default=False", "description": "If True, the covariance of the joint predictive distribution at\nthe query points is returned along with the mean." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Predict using the Gaussian process regression model.\n\nWe can also predict based on an unfitted model by using the GP prior. In addition to the mean of the predictive distribution, optionally also returns its standard deviation (`return_std=True`) or covariance (`return_cov=True`). Note that at most one of the two can be requested.", - "docstring": "Predict using the Gaussian process regression model.\n\nWe can also predict based on an unfitted model by using the GP prior.\nIn addition to the mean of the predictive distribution, optionally also\nreturns its standard deviation (`return_std=True`) or covariance\n(`return_cov=True`). 
Note that at most one of the two can be requested.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features) or list of object\n Query points where the GP is evaluated.\n\nreturn_std : bool, default=False\n If True, the standard-deviation of the predictive distribution at\n the query points is returned along with the mean.\n\nreturn_cov : bool, default=False\n If True, the covariance of the joint predictive distribution at\n the query points is returned along with the mean.\n\nReturns\n-------\ny_mean : ndarray of shape (n_samples,) or (n_samples, n_targets)\n Mean of predictive distribution a query points.\n\ny_std : ndarray of shape (n_samples,) or (n_samples, n_targets), optional\n Standard deviation of predictive distribution at query points.\n Only returned when `return_std` is True.\n\ny_cov : ndarray of shape (n_samples, n_samples) or (n_samples, n_samples, n_targets), optional\n Covariance of joint predictive distribution a query points.\n Only returned when `return_cov` is True.", + "description": "Predict using the Gaussian process regression model.\n\nWe can also predict based on an unfitted model by using the GP prior.\nIn addition to the mean of the predictive distribution, optionally also\nreturns its standard deviation (`return_std=True`) or covariance\n(`return_cov=True`). Note that at most one of the two can be requested.", + "docstring": "Predict using the Gaussian process regression model.\n\n We can also predict based on an unfitted model by using the GP prior.\n In addition to the mean of the predictive distribution, optionally also\n returns its standard deviation (`return_std=True`) or covariance\n (`return_cov=True`). Note that at most one of the two can be requested.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features) or list of object\n Query points where the GP is evaluated.\n\n return_std : bool, default=False\n If True, the standard-deviation of the predictive distribution at\n the query points is returned along with the mean.\n\n return_cov : bool, default=False\n If True, the covariance of the joint predictive distribution at\n the query points is returned along with the mean.\n\n Returns\n -------\n y_mean : ndarray of shape (n_samples,) or (n_samples, n_targets)\n Mean of predictive distribution a query points.\n\n y_std : ndarray of shape (n_samples,) or (n_samples, n_targets), optional\n Standard deviation of predictive distribution at query points.\n Only returned when `return_std` is True.\n\n y_cov : ndarray of shape (n_samples, n_samples) or (n_samples, n_samples, n_targets), optional\n Covariance of joint predictive distribution a query points.\n Only returned when `return_cov` is True.\n ", "source_code": "\ndef predict(self, X, return_std=False, return_cov=False):\n \"\"\"Predict using the Gaussian process regression model.\n\n We can also predict based on an unfitted model by using the GP prior.\n In addition to the mean of the predictive distribution, optionally also\n returns its standard deviation (`return_std=True`) or covariance\n (`return_cov=True`). 
Note that at most one of the two can be requested.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features) or list of object\n Query points where the GP is evaluated.\n\n return_std : bool, default=False\n If True, the standard-deviation of the predictive distribution at\n the query points is returned along with the mean.\n\n return_cov : bool, default=False\n If True, the covariance of the joint predictive distribution at\n the query points is returned along with the mean.\n\n Returns\n -------\n y_mean : ndarray of shape (n_samples,) or (n_samples, n_targets)\n Mean of predictive distribution a query points.\n\n y_std : ndarray of shape (n_samples,) or (n_samples, n_targets), optional\n Standard deviation of predictive distribution at query points.\n Only returned when `return_std` is True.\n\n y_cov : ndarray of shape (n_samples, n_samples) or (n_samples, n_samples, n_targets), optional\n Covariance of joint predictive distribution a query points.\n Only returned when `return_cov` is True.\n \"\"\"\n if return_std and return_cov:\n raise RuntimeError('At most one of return_std or return_cov can be requested.')\n if self.kernel is None or self.kernel.requires_vector_input:\n (dtype, ensure_2d) = ('numeric', True)\n else:\n (dtype, ensure_2d) = (None, False)\n X = self._validate_data(X, ensure_2d=ensure_2d, dtype=dtype, reset=False)\n if not hasattr(self, 'X_train_'):\n if self.kernel is None:\n kernel = C(1.0, constant_value_bounds='fixed') * RBF(1.0, length_scale_bounds='fixed')\n else:\n kernel = self.kernel\n y_mean = np.zeros(X.shape[0])\n if return_cov:\n y_cov = kernel(X)\n return y_mean, y_cov\n elif return_std:\n y_var = kernel.diag(X)\n return y_mean, np.sqrt(y_var)\n else:\n return y_mean\n else:\n K_trans = self.kernel_(X, self.X_train_)\n y_mean = K_trans @ self.alpha_\n y_mean = self._y_train_std * y_mean + self._y_train_mean\n V = solve_triangular(self.L_, K_trans.T, lower=GPR_CHOLESKY_LOWER, check_finite=False)\n if return_cov:\n y_cov = self.kernel_(X) - V.T @ V\n y_cov = np.outer(y_cov, self._y_train_std**2).reshape(*y_cov.shape, -1)\n if y_cov.shape[2] == 1:\n y_cov = np.squeeze(y_cov, axis=2)\n return y_mean, y_cov\n elif return_std:\n y_var = self.kernel_.diag(X)\n y_var -= np.einsum('ij,ji->i', V.T, V)\n y_var_negative = y_var < 0\n if np.any(y_var_negative):\n warnings.warn('Predicted variances smaller than 0. Setting those variances to 0.')\n y_var[y_var_negative] = 0.0\n y_var = np.outer(y_var, self._y_train_std**2).reshape(*y_var.shape, -1)\n if y_var.shape[1] == 1:\n y_var = np.squeeze(y_var, axis=1)\n return y_mean, np.sqrt(y_var)\n else:\n return y_mean" }, { @@ -86157,7 +91623,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -86167,7 +91634,8 @@ "docstring": { "type": "array-like of shape (n_samples_X, n_features) or list of object", "description": "Query points where the GP is evaluated." - } + }, + "refined_type": {} }, { "name": "n_samples", @@ -86177,7 +91645,8 @@ "docstring": { "type": "int, default=1", "description": "Number of samples drawn from the Gaussian process per query point." - } + }, + "refined_type": {} }, { "name": "random_state", @@ -86187,13 +91656,14 @@ "docstring": { "type": "int, RandomState instance or None, default=0", "description": "Determines random number generation to randomly draw samples.\nPass an int for reproducible results across multiple function\ncalls.\nSee :term:`Glossary `." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Draw samples from Gaussian process and evaluate at X.", - "docstring": "Draw samples from Gaussian process and evaluate at X.\n\nParameters\n----------\nX : array-like of shape (n_samples_X, n_features) or list of object\n Query points where the GP is evaluated.\n\nn_samples : int, default=1\n Number of samples drawn from the Gaussian process per query point.\n\nrandom_state : int, RandomState instance or None, default=0\n Determines random number generation to randomly draw samples.\n Pass an int for reproducible results across multiple function\n calls.\n See :term:`Glossary `.\n\nReturns\n-------\ny_samples : ndarray of shape (n_samples_X, n_samples), or (n_samples_X, n_targets, n_samples)\n Values of n_samples samples drawn from Gaussian process and\n evaluated at query points.", + "docstring": "Draw samples from Gaussian process and evaluate at X.\n\n Parameters\n ----------\n X : array-like of shape (n_samples_X, n_features) or list of object\n Query points where the GP is evaluated.\n\n n_samples : int, default=1\n Number of samples drawn from the Gaussian process per query point.\n\n random_state : int, RandomState instance or None, default=0\n Determines random number generation to randomly draw samples.\n Pass an int for reproducible results across multiple function\n calls.\n See :term:`Glossary `.\n\n Returns\n -------\n y_samples : ndarray of shape (n_samples_X, n_samples), or (n_samples_X, n_targets, n_samples)\n Values of n_samples samples drawn from Gaussian process and\n evaluated at query points.\n ", "source_code": "\ndef sample_y(self, X, n_samples=1, random_state=0):\n \"\"\"Draw samples from Gaussian process and evaluate at X.\n\n Parameters\n ----------\n X : array-like of shape (n_samples_X, n_features) or list of object\n Query points where the GP is evaluated.\n\n n_samples : int, default=1\n Number of samples drawn from the Gaussian process per query point.\n\n random_state : int, RandomState instance or None, default=0\n Determines random number generation to randomly draw samples.\n Pass an int for reproducible results across multiple function\n calls.\n See :term:`Glossary `.\n\n Returns\n -------\n y_samples : ndarray of shape (n_samples_X, n_samples), or (n_samples_X, n_targets, n_samples)\n Values of n_samples samples drawn from Gaussian process and\n evaluated at query points.\n \"\"\"\n rng = check_random_state(random_state)\n (y_mean, y_cov) = self.predict(X, return_cov=True)\n if y_mean.ndim == 1:\n y_samples = rng.multivariate_normal(y_mean, y_cov, n_samples).T\n else:\n y_samples = [rng.multivariate_normal(y_mean[:, i], y_cov, n_samples).T[:, np.newaxis] for i in range(y_mean.shape[1])]\n y_samples = np.hstack(y_samples)\n return y_samples" }, { @@ -86211,7 +91681,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -86221,7 +91692,8 @@ "docstring": { "type": "array-like of shape (n_samples_X, n_features) or list of object, default=None", "description": "Left argument of the returned kernel k(X, Y)" - } + }, + "refined_type": {} }, { "name": "Y", @@ -86231,7 +91703,8 @@ "docstring": { "type": "array-like of shape (n_samples_X, n_features) or list of object, default=None", "description": "Right argument of the returned kernel k(X, Y). If None, k(X, X)\nis evaluated instead." 
- } + }, + "refined_type": {} }, { "name": "eval_gradient", @@ -86241,13 +91714,14 @@ "docstring": { "type": "bool, default=False", "description": "Determines whether the gradient with respect to the log of the\nkernel hyperparameter is computed." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Return the kernel k(X, Y) and optionally its gradient.\n\nNote that this compound kernel returns the results of all simple kernel stacked along an additional axis.", - "docstring": "Return the kernel k(X, Y) and optionally its gradient.\n\nNote that this compound kernel returns the results of all simple kernel\nstacked along an additional axis.\n\nParameters\n----------\nX : array-like of shape (n_samples_X, n_features) or list of object, default=None\n Left argument of the returned kernel k(X, Y)\n\nY : array-like of shape (n_samples_X, n_features) or list of object, default=None\n Right argument of the returned kernel k(X, Y). If None, k(X, X)\n is evaluated instead.\n\neval_gradient : bool, default=False\n Determines whether the gradient with respect to the log of the\n kernel hyperparameter is computed.\n\nReturns\n-------\nK : ndarray of shape (n_samples_X, n_samples_Y, n_kernels)\n Kernel k(X, Y)\n\nK_gradient : ndarray of shape (n_samples_X, n_samples_X, n_dims, n_kernels), optional\n The gradient of the kernel k(X, X) with respect to the log of the\n hyperparameter of the kernel. Only returned when `eval_gradient`\n is True.", + "description": "Return the kernel k(X, Y) and optionally its gradient.\n\nNote that this compound kernel returns the results of all simple kernel\nstacked along an additional axis.", + "docstring": "Return the kernel k(X, Y) and optionally its gradient.\n\n Note that this compound kernel returns the results of all simple kernel\n stacked along an additional axis.\n\n Parameters\n ----------\n X : array-like of shape (n_samples_X, n_features) or list of object, default=None\n Left argument of the returned kernel k(X, Y)\n\n Y : array-like of shape (n_samples_X, n_features) or list of object, default=None\n Right argument of the returned kernel k(X, Y). If None, k(X, X)\n is evaluated instead.\n\n eval_gradient : bool, default=False\n Determines whether the gradient with respect to the log of the\n kernel hyperparameter is computed.\n\n Returns\n -------\n K : ndarray of shape (n_samples_X, n_samples_Y, n_kernels)\n Kernel k(X, Y)\n\n K_gradient : ndarray of shape (n_samples_X, n_samples_X, n_dims, n_kernels), optional\n The gradient of the kernel k(X, X) with respect to the log of the\n hyperparameter of the kernel. Only returned when `eval_gradient`\n is True.\n ", "source_code": "\ndef __call__(self, X, Y=None, eval_gradient=False):\n \"\"\"Return the kernel k(X, Y) and optionally its gradient.\n\n Note that this compound kernel returns the results of all simple kernel\n stacked along an additional axis.\n\n Parameters\n ----------\n X : array-like of shape (n_samples_X, n_features) or list of object, default=None\n Left argument of the returned kernel k(X, Y)\n\n Y : array-like of shape (n_samples_X, n_features) or list of object, default=None\n Right argument of the returned kernel k(X, Y). 
If None, k(X, X)\n is evaluated instead.\n\n eval_gradient : bool, default=False\n Determines whether the gradient with respect to the log of the\n kernel hyperparameter is computed.\n\n Returns\n -------\n K : ndarray of shape (n_samples_X, n_samples_Y, n_kernels)\n Kernel k(X, Y)\n\n K_gradient : ndarray of shape (n_samples_X, n_samples_X, n_dims, n_kernels), optional\n The gradient of the kernel k(X, X) with respect to the log of the\n hyperparameter of the kernel. Only returned when `eval_gradient`\n is True.\n \"\"\"\n if eval_gradient:\n K = []\n K_grad = []\n for kernel in self.kernels:\n (K_single, K_grad_single) = kernel(X, Y, eval_gradient)\n K.append(K_single)\n K_grad.append(K_grad_single[..., np.newaxis])\n return np.dstack(K), np.concatenate(K_grad, 3)\n else:\n return np.dstack([kernel(X, Y, eval_gradient) for kernel in self.kernels])" }, { @@ -86265,7 +91739,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "b", @@ -86275,13 +91750,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __eq__(self, b):\n if type(self) != type(b) or len(self.kernels) != len(b.kernels):\n return False\n return np.all([self.kernels[i] == b.kernels[i] for i in range(len(self.kernels))])" }, { @@ -86299,7 +91775,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "kernels", @@ -86309,13 +91786,14 @@ "docstring": { "type": "list of Kernels", "description": "The other kernels" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, kernels):\n self.kernels = kernels" }, { @@ -86333,13 +91811,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Returns the log-transformed bounds on the theta.", - "docstring": "Returns the log-transformed bounds on the theta.\n\nReturns\n-------\nbounds : array of shape (n_dims, 2)\n The log-transformed bounds on the kernel's hyperparameters theta", + "docstring": "Returns the log-transformed bounds on the theta.\n\n Returns\n -------\n bounds : array of shape (n_dims, 2)\n The log-transformed bounds on the kernel's hyperparameters theta\n ", "source_code": "\n@property\ndef bounds(self):\n \"\"\"Returns the log-transformed bounds on the theta.\n\n Returns\n -------\n bounds : array of shape (n_dims, 2)\n The log-transformed bounds on the kernel's hyperparameters theta\n \"\"\"\n return np.vstack([kernel.bounds for kernel in self.kernels])" }, { @@ -86357,7 +91836,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -86367,13 +91847,14 @@ "docstring": { "type": "array-like of shape (n_samples_X, n_features) or list of object", "description": "Argument to the kernel." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Returns the diagonal of the kernel k(X, X).\n\nThe result of this method is identical to `np.diag(self(X))`; however, it can be evaluated more efficiently since only the diagonal is evaluated.", - "docstring": "Returns the diagonal of the kernel k(X, X).\n\nThe result of this method is identical to `np.diag(self(X))`; however,\nit can be evaluated more efficiently since only the diagonal is\nevaluated.\n\nParameters\n----------\nX : array-like of shape (n_samples_X, n_features) or list of object\n Argument to the kernel.\n\nReturns\n-------\nK_diag : ndarray of shape (n_samples_X, n_kernels)\n Diagonal of kernel k(X, X)", + "description": "Returns the diagonal of the kernel k(X, X).\n\nThe result of this method is identical to `np.diag(self(X))`; however,\nit can be evaluated more efficiently since only the diagonal is\nevaluated.", + "docstring": "Returns the diagonal of the kernel k(X, X).\n\n The result of this method is identical to `np.diag(self(X))`; however,\n it can be evaluated more efficiently since only the diagonal is\n evaluated.\n\n Parameters\n ----------\n X : array-like of shape (n_samples_X, n_features) or list of object\n Argument to the kernel.\n\n Returns\n -------\n K_diag : ndarray of shape (n_samples_X, n_kernels)\n Diagonal of kernel k(X, X)\n ", "source_code": "\ndef diag(self, X):\n \"\"\"Returns the diagonal of the kernel k(X, X).\n\n The result of this method is identical to `np.diag(self(X))`; however,\n it can be evaluated more efficiently since only the diagonal is\n evaluated.\n\n Parameters\n ----------\n X : array-like of shape (n_samples_X, n_features) or list of object\n Argument to the kernel.\n\n Returns\n -------\n K_diag : ndarray of shape (n_samples_X, n_kernels)\n Diagonal of kernel k(X, X)\n \"\"\"\n return np.vstack([kernel.diag(X) for kernel in self.kernels]).T" }, { @@ -86391,7 +91872,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "deep", @@ -86401,13 +91883,14 @@ "docstring": { "type": "bool, default=True", "description": "If True, will return the parameters for this estimator and\ncontained subobjects that are estimators." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Get parameters of this kernel.", - "docstring": "Get parameters of this kernel.\n\nParameters\n----------\ndeep : bool, default=True\n If True, will return the parameters for this estimator and\n contained subobjects that are estimators.\n\nReturns\n-------\nparams : dict\n Parameter names mapped to their values.", + "docstring": "Get parameters of this kernel.\n\n Parameters\n ----------\n deep : bool, default=True\n If True, will return the parameters for this estimator and\n contained subobjects that are estimators.\n\n Returns\n -------\n params : dict\n Parameter names mapped to their values.\n ", "source_code": "\ndef get_params(self, deep=True):\n \"\"\"Get parameters of this kernel.\n\n Parameters\n ----------\n deep : bool, default=True\n If True, will return the parameters for this estimator and\n contained subobjects that are estimators.\n\n Returns\n -------\n params : dict\n Parameter names mapped to their values.\n \"\"\"\n return dict(kernels=self.kernels)" }, { @@ -86425,7 +91908,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -86449,7 +91933,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -86473,13 +91958,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Returns the (flattened, log-transformed) non-fixed hyperparameters.\n\nNote that theta are typically the log-transformed values of the kernel's hyperparameters as this representation of the search space is more amenable for hyperparameter search, as hyperparameters like length-scales naturally live on a log-scale.", - "docstring": "Returns the (flattened, log-transformed) non-fixed hyperparameters.\n\nNote that theta are typically the log-transformed values of the\nkernel's hyperparameters as this representation of the search space\nis more amenable for hyperparameter search, as hyperparameters like\nlength-scales naturally live on a log-scale.\n\nReturns\n-------\ntheta : ndarray of shape (n_dims,)\n The non-fixed, log-transformed hyperparameters of the kernel", + "description": "Returns the (flattened, log-transformed) non-fixed hyperparameters.\n\nNote that theta are typically the log-transformed values of the\nkernel's hyperparameters as this representation of the search space\nis more amenable for hyperparameter search, as hyperparameters like\nlength-scales naturally live on a log-scale.", + "docstring": "Returns the (flattened, log-transformed) non-fixed hyperparameters.\n\n Note that theta are typically the log-transformed values of the\n kernel's hyperparameters as this representation of the search space\n is more amenable for hyperparameter search, as hyperparameters like\n length-scales naturally live on a log-scale.\n\n Returns\n -------\n theta : ndarray of shape (n_dims,)\n The non-fixed, log-transformed hyperparameters of the kernel\n ", "source_code": "\n@property\ndef theta(self):\n \"\"\"Returns the (flattened, log-transformed) non-fixed hyperparameters.\n\n Note that theta are typically the log-transformed values of the\n kernel's hyperparameters as this representation of the search space\n is more amenable for hyperparameter search, as hyperparameters like\n length-scales naturally live on a log-scale.\n\n Returns\n -------\n theta : ndarray of shape (n_dims,)\n The non-fixed, log-transformed hyperparameters of the kernel\n 
\"\"\"\n return np.hstack([kernel.theta for kernel in self.kernels])" }, { @@ -86497,7 +91983,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "theta", @@ -86507,13 +91994,14 @@ "docstring": { "type": "array of shape (n_dims,)", "description": "The non-fixed, log-transformed hyperparameters of the kernel" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Sets the (flattened, log-transformed) non-fixed hyperparameters.", - "docstring": "Sets the (flattened, log-transformed) non-fixed hyperparameters.\n\nParameters\n----------\ntheta : array of shape (n_dims,)\n The non-fixed, log-transformed hyperparameters of the kernel", + "docstring": "Sets the (flattened, log-transformed) non-fixed hyperparameters.\n\n Parameters\n ----------\n theta : array of shape (n_dims,)\n The non-fixed, log-transformed hyperparameters of the kernel\n ", "source_code": "\n@theta.setter\ndef theta(self, theta):\n \"\"\"Sets the (flattened, log-transformed) non-fixed hyperparameters.\n\n Parameters\n ----------\n theta : array of shape (n_dims,)\n The non-fixed, log-transformed hyperparameters of the kernel\n \"\"\"\n k_dims = self.k1.n_dims\n for (i, kernel) in enumerate(self.kernels):\n kernel.theta = theta[i * k_dims:(i + 1) * k_dims]" }, { @@ -86531,7 +92019,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -86541,7 +92030,8 @@ "docstring": { "type": "array-like of shape (n_samples_X, n_features) or list of object", "description": "Left argument of the returned kernel k(X, Y)" - } + }, + "refined_type": {} }, { "name": "Y", @@ -86551,7 +92041,8 @@ "docstring": { "type": "array-like of shape (n_samples_X, n_features) or list of object, default=None", "description": "Right argument of the returned kernel k(X, Y). If None, k(X, X)\nis evaluated instead." - } + }, + "refined_type": {} }, { "name": "eval_gradient", @@ -86561,13 +92052,14 @@ "docstring": { "type": "bool, default=False", "description": "Determines whether the gradient with respect to the log of\nthe kernel hyperparameter is computed.\nOnly supported when Y is None." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Return the kernel k(X, Y) and optionally its gradient.", - "docstring": "Return the kernel k(X, Y) and optionally its gradient.\n\nParameters\n----------\nX : array-like of shape (n_samples_X, n_features) or list of object\n Left argument of the returned kernel k(X, Y)\n\nY : array-like of shape (n_samples_X, n_features) or list of object, default=None\n Right argument of the returned kernel k(X, Y). If None, k(X, X)\n is evaluated instead.\n\neval_gradient : bool, default=False\n Determines whether the gradient with respect to the log of\n the kernel hyperparameter is computed.\n Only supported when Y is None.\n\nReturns\n-------\nK : ndarray of shape (n_samples_X, n_samples_Y)\n Kernel k(X, Y)\n\nK_gradient : ndarray of shape (n_samples_X, n_samples_X, n_dims), optional\n The gradient of the kernel k(X, X) with respect to the log of the\n hyperparameter of the kernel. Only returned when eval_gradient\n is True.", + "docstring": "Return the kernel k(X, Y) and optionally its gradient.\n\n Parameters\n ----------\n X : array-like of shape (n_samples_X, n_features) or list of object\n Left argument of the returned kernel k(X, Y)\n\n Y : array-like of shape (n_samples_X, n_features) or list of object, default=None\n Right argument of the returned kernel k(X, Y). 
If None, k(X, X)\n is evaluated instead.\n\n eval_gradient : bool, default=False\n Determines whether the gradient with respect to the log of\n the kernel hyperparameter is computed.\n Only supported when Y is None.\n\n Returns\n -------\n K : ndarray of shape (n_samples_X, n_samples_Y)\n Kernel k(X, Y)\n\n K_gradient : ndarray of shape (n_samples_X, n_samples_X, n_dims), optional\n The gradient of the kernel k(X, X) with respect to the log of the\n hyperparameter of the kernel. Only returned when eval_gradient\n is True.\n ", "source_code": "\ndef __call__(self, X, Y=None, eval_gradient=False):\n \"\"\"Return the kernel k(X, Y) and optionally its gradient.\n\n Parameters\n ----------\n X : array-like of shape (n_samples_X, n_features) or list of object\n Left argument of the returned kernel k(X, Y)\n\n Y : array-like of shape (n_samples_X, n_features) or list of object, default=None\n Right argument of the returned kernel k(X, Y). If None, k(X, X)\n is evaluated instead.\n\n eval_gradient : bool, default=False\n Determines whether the gradient with respect to the log of\n the kernel hyperparameter is computed.\n Only supported when Y is None.\n\n Returns\n -------\n K : ndarray of shape (n_samples_X, n_samples_Y)\n Kernel k(X, Y)\n\n K_gradient : ndarray of shape (n_samples_X, n_samples_X, n_dims), optional\n The gradient of the kernel k(X, X) with respect to the log of the\n hyperparameter of the kernel. Only returned when eval_gradient\n is True.\n \"\"\"\n if Y is None:\n Y = X\n elif eval_gradient:\n raise ValueError('Gradient can only be evaluated when Y is None.')\n K = np.full((_num_samples(X), _num_samples(Y)), self.constant_value, dtype=np.array(self.constant_value).dtype)\n if eval_gradient:\n if not self.hyperparameter_constant_value.fixed:\n return K, np.full((_num_samples(X), _num_samples(X), 1), self.constant_value, dtype=np.array(self.constant_value).dtype)\n else:\n return K, np.empty((_num_samples(X), _num_samples(X), 0))\n else:\n return K" }, { @@ -86585,7 +92077,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "constant_value", @@ -86595,7 +92088,8 @@ "docstring": { "type": "float, default=1.0", "description": "The constant value which defines the covariance:\nk(x_1, x_2) = constant_value" - } + }, + "refined_type": {} }, { "name": "constant_value_bounds", @@ -86605,13 +92099,14 @@ "docstring": { "type": "pair of floats >= 0 or \"fixed\", default=(1e-5, 1e5)", "description": "The lower and upper bound on `constant_value`.\nIf set to \"fixed\", `constant_value` cannot be changed during\nhyperparameter tuning." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, constant_value=1.0, constant_value_bounds=(1e-05, 100000.0)):\n self.constant_value = constant_value\n self.constant_value_bounds = constant_value_bounds" }, { @@ -86629,13 +92124,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __repr__(self):\n return '{0:.3g}**2'.format(np.sqrt(self.constant_value))" }, { @@ -86653,7 +92149,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -86663,13 +92160,14 @@ "docstring": { "type": "array-like of shape (n_samples_X, n_features) or list of object", "description": "Argument to the kernel." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Returns the diagonal of the kernel k(X, X).\n\nThe result of this method is identical to np.diag(self(X)); however, it can be evaluated more efficiently since only the diagonal is evaluated.", - "docstring": "Returns the diagonal of the kernel k(X, X).\n\nThe result of this method is identical to np.diag(self(X)); however,\nit can be evaluated more efficiently since only the diagonal is\nevaluated.\n\nParameters\n----------\nX : array-like of shape (n_samples_X, n_features) or list of object\n Argument to the kernel.\n\nReturns\n-------\nK_diag : ndarray of shape (n_samples_X,)\n Diagonal of kernel k(X, X)", + "description": "Returns the diagonal of the kernel k(X, X).\n\nThe result of this method is identical to np.diag(self(X)); however,\nit can be evaluated more efficiently since only the diagonal is\nevaluated.", + "docstring": "Returns the diagonal of the kernel k(X, X).\n\n The result of this method is identical to np.diag(self(X)); however,\n it can be evaluated more efficiently since only the diagonal is\n evaluated.\n\n Parameters\n ----------\n X : array-like of shape (n_samples_X, n_features) or list of object\n Argument to the kernel.\n\n Returns\n -------\n K_diag : ndarray of shape (n_samples_X,)\n Diagonal of kernel k(X, X)\n ", "source_code": "\ndef diag(self, X):\n \"\"\"Returns the diagonal of the kernel k(X, X).\n\n The result of this method is identical to np.diag(self(X)); however,\n it can be evaluated more efficiently since only the diagonal is\n evaluated.\n\n Parameters\n ----------\n X : array-like of shape (n_samples_X, n_features) or list of object\n Argument to the kernel.\n\n Returns\n -------\n K_diag : ndarray of shape (n_samples_X,)\n Diagonal of kernel k(X, X)\n \"\"\"\n return np.full(_num_samples(X), self.constant_value, dtype=np.array(self.constant_value).dtype)" }, { @@ -86687,13 +92185,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@property\ndef hyperparameter_constant_value(self):\n return Hyperparameter('constant_value', 'numeric', self.constant_value_bounds)" }, { @@ -86711,7 +92210,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -86721,7 +92221,8 @@ "docstring": { "type": "ndarray of shape (n_samples_X, n_features)", "description": "Left argument of the returned kernel k(X, Y)" - } + }, + "refined_type": {} }, { "name": "Y", @@ -86731,7 +92232,8 @@ "docstring": { "type": "ndarray of shape (n_samples_Y, n_features), default=None", "description": "Right argument of the returned kernel k(X, Y). If None, k(X, X)\nif evaluated instead." - } + }, + "refined_type": {} }, { "name": "eval_gradient", @@ -86741,13 +92243,14 @@ "docstring": { "type": "bool, default=False", "description": "Determines whether the gradient with respect to the log of\nthe kernel hyperparameter is computed.\nOnly supported when Y is None." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Return the kernel k(X, Y) and optionally its gradient.", - "docstring": "Return the kernel k(X, Y) and optionally its gradient.\n\nParameters\n----------\nX : ndarray of shape (n_samples_X, n_features)\n Left argument of the returned kernel k(X, Y)\n\nY : ndarray of shape (n_samples_Y, n_features), default=None\n Right argument of the returned kernel k(X, Y). 
If None, k(X, X)\n if evaluated instead.\n\neval_gradient : bool, default=False\n Determines whether the gradient with respect to the log of\n the kernel hyperparameter is computed.\n Only supported when Y is None.\n\nReturns\n-------\nK : ndarray of shape (n_samples_X, n_samples_Y)\n Kernel k(X, Y)\n\nK_gradient : ndarray of shape (n_samples_X, n_samples_X, n_dims), optional\n The gradient of the kernel k(X, X) with respect to the log of the\n hyperparameter of the kernel. Only returned when `eval_gradient`\n is True.", + "docstring": "Return the kernel k(X, Y) and optionally its gradient.\n\n Parameters\n ----------\n X : ndarray of shape (n_samples_X, n_features)\n Left argument of the returned kernel k(X, Y)\n\n Y : ndarray of shape (n_samples_Y, n_features), default=None\n Right argument of the returned kernel k(X, Y). If None, k(X, X)\n if evaluated instead.\n\n eval_gradient : bool, default=False\n Determines whether the gradient with respect to the log of\n the kernel hyperparameter is computed.\n Only supported when Y is None.\n\n Returns\n -------\n K : ndarray of shape (n_samples_X, n_samples_Y)\n Kernel k(X, Y)\n\n K_gradient : ndarray of shape (n_samples_X, n_samples_X, n_dims), optional\n The gradient of the kernel k(X, X) with respect to the log of the\n hyperparameter of the kernel. Only returned when `eval_gradient`\n is True.\n ", "source_code": "\ndef __call__(self, X, Y=None, eval_gradient=False):\n \"\"\"Return the kernel k(X, Y) and optionally its gradient.\n\n Parameters\n ----------\n X : ndarray of shape (n_samples_X, n_features)\n Left argument of the returned kernel k(X, Y)\n\n Y : ndarray of shape (n_samples_Y, n_features), default=None\n Right argument of the returned kernel k(X, Y). If None, k(X, X)\n if evaluated instead.\n\n eval_gradient : bool, default=False\n Determines whether the gradient with respect to the log of\n the kernel hyperparameter is computed.\n Only supported when Y is None.\n\n Returns\n -------\n K : ndarray of shape (n_samples_X, n_samples_Y)\n Kernel k(X, Y)\n\n K_gradient : ndarray of shape (n_samples_X, n_samples_X, n_dims), optional\n The gradient of the kernel k(X, X) with respect to the log of the\n hyperparameter of the kernel. Only returned when `eval_gradient`\n is True.\n \"\"\"\n X = np.atleast_2d(X)\n if Y is None:\n K = np.inner(X, X) + self.sigma_0**2\n else:\n if eval_gradient:\n raise ValueError('Gradient can only be evaluated when Y is None.')\n K = np.inner(X, Y) + self.sigma_0**2\n if eval_gradient:\n if not self.hyperparameter_sigma_0.fixed:\n K_gradient = np.empty((K.shape[0], K.shape[1], 1))\n K_gradient[..., 0] = 2 * self.sigma_0**2\n return K, K_gradient\n else:\n return K, np.empty((X.shape[0], X.shape[0], 0))\n else:\n return K" }, { @@ -86765,7 +92268,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "sigma_0", @@ -86775,7 +92279,8 @@ "docstring": { "type": "float >= 0, default=1.0", "description": "Parameter controlling the inhomogenity of the kernel. If sigma_0=0,\nthe kernel is homogeneous." - } + }, + "refined_type": {} }, { "name": "sigma_0_bounds", @@ -86785,13 +92290,14 @@ "docstring": { "type": "pair of floats >= 0 or \"fixed\", default=(1e-5, 1e5)", "description": "The lower and upper bound on 'sigma_0'.\nIf set to \"fixed\", 'sigma_0' cannot be changed during\nhyperparameter tuning." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, sigma_0=1.0, sigma_0_bounds=(1e-05, 100000.0)):\n self.sigma_0 = sigma_0\n self.sigma_0_bounds = sigma_0_bounds" }, { @@ -86809,13 +92315,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __repr__(self):\n return '{0}(sigma_0={1:.3g})'.format(self.__class__.__name__, self.sigma_0)" }, { @@ -86833,7 +92340,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -86843,13 +92351,14 @@ "docstring": { "type": "ndarray of shape (n_samples_X, n_features)", "description": "Left argument of the returned kernel k(X, Y)." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Returns the diagonal of the kernel k(X, X).\n\nThe result of this method is identical to np.diag(self(X)); however, it can be evaluated more efficiently since only the diagonal is evaluated.", - "docstring": "Returns the diagonal of the kernel k(X, X).\n\nThe result of this method is identical to np.diag(self(X)); however,\nit can be evaluated more efficiently since only the diagonal is\nevaluated.\n\nParameters\n----------\nX : ndarray of shape (n_samples_X, n_features)\n Left argument of the returned kernel k(X, Y).\n\nReturns\n-------\nK_diag : ndarray of shape (n_samples_X,)\n Diagonal of kernel k(X, X).", + "description": "Returns the diagonal of the kernel k(X, X).\n\nThe result of this method is identical to np.diag(self(X)); however,\nit can be evaluated more efficiently since only the diagonal is\nevaluated.", + "docstring": "Returns the diagonal of the kernel k(X, X).\n\n The result of this method is identical to np.diag(self(X)); however,\n it can be evaluated more efficiently since only the diagonal is\n evaluated.\n\n Parameters\n ----------\n X : ndarray of shape (n_samples_X, n_features)\n Left argument of the returned kernel k(X, Y).\n\n Returns\n -------\n K_diag : ndarray of shape (n_samples_X,)\n Diagonal of kernel k(X, X).\n ", "source_code": "\ndef diag(self, X):\n \"\"\"Returns the diagonal of the kernel k(X, X).\n\n The result of this method is identical to np.diag(self(X)); however,\n it can be evaluated more efficiently since only the diagonal is\n evaluated.\n\n Parameters\n ----------\n X : ndarray of shape (n_samples_X, n_features)\n Left argument of the returned kernel k(X, Y).\n\n Returns\n -------\n K_diag : ndarray of shape (n_samples_X,)\n Diagonal of kernel k(X, X).\n \"\"\"\n return np.einsum('ij,ij->i', X, X) + self.sigma_0**2" }, { @@ -86867,13 +92376,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@property\ndef hyperparameter_sigma_0(self):\n return Hyperparameter('sigma_0', 'numeric', self.sigma_0_bounds)" }, { @@ -86891,7 +92401,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -86915,7 +92426,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -86925,7 +92437,8 @@ "docstring": { "type": "ndarray of shape (n_samples_X, n_features)", "description": "Left argument of the returned kernel k(X, Y)" - } + }, + "refined_type": {} }, { "name": "Y", @@ -86935,7 
+92448,8 @@ "docstring": { "type": "ndarray of shape (n_samples_Y, n_features), default=None", "description": "Right argument of the returned kernel k(X, Y). If None, k(X, X)\nif evaluated instead." - } + }, + "refined_type": {} }, { "name": "eval_gradient", @@ -86945,13 +92459,14 @@ "docstring": { "type": "bool, default=False", "description": "Determines whether the gradient with respect to the log of\nthe kernel hyperparameter is computed.\nOnly supported when Y is None." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Return the kernel k(X, Y) and optionally its gradient.", - "docstring": "Return the kernel k(X, Y) and optionally its gradient.\n\nParameters\n----------\nX : ndarray of shape (n_samples_X, n_features)\n Left argument of the returned kernel k(X, Y)\n\nY : ndarray of shape (n_samples_Y, n_features), default=None\n Right argument of the returned kernel k(X, Y). If None, k(X, X)\n if evaluated instead.\n\neval_gradient : bool, default=False\n Determines whether the gradient with respect to the log of\n the kernel hyperparameter is computed.\n Only supported when Y is None.\n\nReturns\n-------\nK : ndarray of shape (n_samples_X, n_samples_Y)\n Kernel k(X, Y)\n\nK_gradient : ndarray of shape (n_samples_X, n_samples_X, n_dims), optional\n The gradient of the kernel k(X, X) with respect to the log of the\n hyperparameter of the kernel. Only returned when `eval_gradient`\n is True.", + "docstring": "Return the kernel k(X, Y) and optionally its gradient.\n\n Parameters\n ----------\n X : ndarray of shape (n_samples_X, n_features)\n Left argument of the returned kernel k(X, Y)\n\n Y : ndarray of shape (n_samples_Y, n_features), default=None\n Right argument of the returned kernel k(X, Y). If None, k(X, X)\n if evaluated instead.\n\n eval_gradient : bool, default=False\n Determines whether the gradient with respect to the log of\n the kernel hyperparameter is computed.\n Only supported when Y is None.\n\n Returns\n -------\n K : ndarray of shape (n_samples_X, n_samples_Y)\n Kernel k(X, Y)\n\n K_gradient : ndarray of shape (n_samples_X, n_samples_X, n_dims), optional\n The gradient of the kernel k(X, X) with respect to the log of the\n hyperparameter of the kernel. Only returned when `eval_gradient`\n is True.\n ", "source_code": "\ndef __call__(self, X, Y=None, eval_gradient=False):\n \"\"\"Return the kernel k(X, Y) and optionally its gradient.\n\n Parameters\n ----------\n X : ndarray of shape (n_samples_X, n_features)\n Left argument of the returned kernel k(X, Y)\n\n Y : ndarray of shape (n_samples_Y, n_features), default=None\n Right argument of the returned kernel k(X, Y). If None, k(X, X)\n if evaluated instead.\n\n eval_gradient : bool, default=False\n Determines whether the gradient with respect to the log of\n the kernel hyperparameter is computed.\n Only supported when Y is None.\n\n Returns\n -------\n K : ndarray of shape (n_samples_X, n_samples_Y)\n Kernel k(X, Y)\n\n K_gradient : ndarray of shape (n_samples_X, n_samples_X, n_dims), optional\n The gradient of the kernel k(X, X) with respect to the log of the\n hyperparameter of the kernel. 
Only returned when `eval_gradient`\n is True.\n \"\"\"\n X = np.atleast_2d(X)\n if Y is None:\n dists = squareform(pdist(X, metric='euclidean'))\n arg = np.pi * dists / self.periodicity\n sin_of_arg = np.sin(arg)\n K = np.exp(-2 * (sin_of_arg / self.length_scale)**2)\n else:\n if eval_gradient:\n raise ValueError('Gradient can only be evaluated when Y is None.')\n dists = cdist(X, Y, metric='euclidean')\n K = np.exp(-2 * (np.sin(np.pi / self.periodicity * dists) / self.length_scale)**2)\n if eval_gradient:\n cos_of_arg = np.cos(arg)\n if not self.hyperparameter_length_scale.fixed:\n length_scale_gradient = 4 / self.length_scale**2 * sin_of_arg**2 * K\n length_scale_gradient = length_scale_gradient[:, :, np.newaxis]\n else:\n length_scale_gradient = np.empty((K.shape[0], K.shape[1], 0))\n if not self.hyperparameter_periodicity.fixed:\n periodicity_gradient = 4 * arg / self.length_scale**2 * cos_of_arg * sin_of_arg * K\n periodicity_gradient = periodicity_gradient[:, :, np.newaxis]\n else:\n periodicity_gradient = np.empty((K.shape[0], K.shape[1], 0))\n return K, np.dstack((length_scale_gradient, periodicity_gradient))\n else:\n return K" }, { @@ -86969,7 +92484,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "length_scale", @@ -86979,7 +92495,8 @@ "docstring": { "type": "float > 0, default=1.0", "description": "The length scale of the kernel." - } + }, + "refined_type": {} }, { "name": "periodicity", @@ -86989,7 +92506,8 @@ "docstring": { "type": "float > 0, default=1.0", "description": "The periodicity of the kernel." - } + }, + "refined_type": {} }, { "name": "length_scale_bounds", @@ -86999,7 +92517,8 @@ "docstring": { "type": "pair of floats >= 0 or \"fixed\", default=(1e-5, 1e5)", "description": "The lower and upper bound on 'length_scale'.\nIf set to \"fixed\", 'length_scale' cannot be changed during\nhyperparameter tuning." - } + }, + "refined_type": {} }, { "name": "periodicity_bounds", @@ -87009,13 +92528,14 @@ "docstring": { "type": "pair of floats >= 0 or \"fixed\", default=(1e-5, 1e5)", "description": "The lower and upper bound on 'periodicity'.\nIf set to \"fixed\", 'periodicity' cannot be changed during\nhyperparameter tuning." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, length_scale=1.0, periodicity=1.0, length_scale_bounds=(1e-05, 100000.0), periodicity_bounds=(1e-05, 100000.0)):\n self.length_scale = length_scale\n self.periodicity = periodicity\n self.length_scale_bounds = length_scale_bounds\n self.periodicity_bounds = periodicity_bounds" }, { @@ -87033,13 +92553,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __repr__(self):\n return '{0}(length_scale={1:.3g}, periodicity={2:.3g})'.format(self.__class__.__name__, self.length_scale, self.periodicity)" }, { @@ -87057,7 +92578,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -87081,13 +92603,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@property\ndef hyperparameter_periodicity(self):\n return Hyperparameter('periodicity', 'numeric', self.periodicity_bounds)" }, { @@ -87105,7 +92628,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -87115,7 +92639,8 @@ "docstring": { "type": "array-like of shape (n_samples_X, n_features) or list of object", "description": "Left argument of the returned kernel k(X, Y)" - } + }, + "refined_type": {} }, { "name": "Y", @@ -87125,7 +92650,8 @@ "docstring": { "type": "array-like of shape (n_samples_Y, n_features) or list of object, default=None", "description": "Right argument of the returned kernel k(X, Y). If None, k(X, X)\nis evaluated instead." - } + }, + "refined_type": {} }, { "name": "eval_gradient", @@ -87135,13 +92661,14 @@ "docstring": { "type": "bool, default=False", "description": "Determines whether the gradient with respect to the log of\nthe kernel hyperparameter is computed." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Return the kernel k(X, Y) and optionally its gradient.", - "docstring": "Return the kernel k(X, Y) and optionally its gradient.\n\nParameters\n----------\nX : array-like of shape (n_samples_X, n_features) or list of object\n Left argument of the returned kernel k(X, Y)\n\nY : array-like of shape (n_samples_Y, n_features) or list of object, default=None\n Right argument of the returned kernel k(X, Y). If None, k(X, X)\n is evaluated instead.\n\neval_gradient : bool, default=False\n Determines whether the gradient with respect to the log of\n the kernel hyperparameter is computed.\n\nReturns\n-------\nK : ndarray of shape (n_samples_X, n_samples_Y)\n Kernel k(X, Y)\n\nK_gradient : ndarray of shape (n_samples_X, n_samples_X, n_dims), optional\n The gradient of the kernel k(X, X) with respect to the log of the\n hyperparameter of the kernel. Only returned when `eval_gradient`\n is True.", + "docstring": "Return the kernel k(X, Y) and optionally its gradient.\n\n Parameters\n ----------\n X : array-like of shape (n_samples_X, n_features) or list of object\n Left argument of the returned kernel k(X, Y)\n\n Y : array-like of shape (n_samples_Y, n_features) or list of object, default=None\n Right argument of the returned kernel k(X, Y). 
If None, k(X, X)\n is evaluated instead.\n\n eval_gradient : bool, default=False\n Determines whether the gradient with respect to the log of\n the kernel hyperparameter is computed.\n\n Returns\n -------\n K : ndarray of shape (n_samples_X, n_samples_Y)\n Kernel k(X, Y)\n\n K_gradient : ndarray of shape (n_samples_X, n_samples_X, n_dims), optional\n The gradient of the kernel k(X, X) with respect to the log of the\n hyperparameter of the kernel. Only returned when `eval_gradient`\n is True.\n ", "source_code": "\ndef __call__(self, X, Y=None, eval_gradient=False):\n \"\"\"Return the kernel k(X, Y) and optionally its gradient.\n\n Parameters\n ----------\n X : array-like of shape (n_samples_X, n_features) or list of object\n Left argument of the returned kernel k(X, Y)\n\n Y : array-like of shape (n_samples_Y, n_features) or list of object, default=None\n Right argument of the returned kernel k(X, Y). If None, k(X, X)\n is evaluated instead.\n\n eval_gradient : bool, default=False\n Determines whether the gradient with respect to the log of\n the kernel hyperparameter is computed.\n\n Returns\n -------\n K : ndarray of shape (n_samples_X, n_samples_Y)\n Kernel k(X, Y)\n\n K_gradient : ndarray of shape (n_samples_X, n_samples_X, n_dims), optional\n The gradient of the kernel k(X, X) with respect to the log of the\n hyperparameter of the kernel. Only returned when `eval_gradient`\n is True.\n \"\"\"\n if eval_gradient:\n (K, K_gradient) = self.kernel(X, Y, eval_gradient=True)\n K_gradient *= self.exponent * K[:, :, np.newaxis]**(self.exponent - 1)\n return K**self.exponent, K_gradient\n else:\n K = self.kernel(X, Y, eval_gradient=False)\n return K**self.exponent" }, { @@ -87159,7 +92686,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "b", @@ -87169,13 +92697,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __eq__(self, b):\n if type(self) != type(b):\n return False\n return self.kernel == b.kernel and self.exponent == b.exponent" }, { @@ -87193,7 +92722,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "kernel", @@ -87203,7 +92733,8 @@ "docstring": { "type": "Kernel", "description": "The base kernel" - } + }, + "refined_type": {} }, { "name": "exponent", @@ -87213,13 +92744,14 @@ "docstring": { "type": "float", "description": "The exponent for the base kernel" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, kernel, exponent):\n self.kernel = kernel\n self.exponent = exponent" }, { @@ -87237,13 +92769,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __repr__(self):\n return '{0} ** {1}'.format(self.kernel, self.exponent)" }, { @@ -87261,13 +92794,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Returns the log-transformed bounds on the theta.", - "docstring": "Returns the log-transformed bounds on the theta.\n\nReturns\n-------\nbounds : ndarray of shape (n_dims, 2)\n The log-transformed bounds on the kernel's hyperparameters theta", + "docstring": "Returns the log-transformed bounds on the theta.\n\n 
Returns\n -------\n bounds : ndarray of shape (n_dims, 2)\n The log-transformed bounds on the kernel's hyperparameters theta\n ", "source_code": "\n@property\ndef bounds(self):\n \"\"\"Returns the log-transformed bounds on the theta.\n\n Returns\n -------\n bounds : ndarray of shape (n_dims, 2)\n The log-transformed bounds on the kernel's hyperparameters theta\n \"\"\"\n return self.kernel.bounds" }, { @@ -87285,7 +92819,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -87295,13 +92830,14 @@ "docstring": { "type": "array-like of shape (n_samples_X, n_features) or list of object", "description": "Argument to the kernel." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Returns the diagonal of the kernel k(X, X).\n\nThe result of this method is identical to np.diag(self(X)); however, it can be evaluated more efficiently since only the diagonal is evaluated.", - "docstring": "Returns the diagonal of the kernel k(X, X).\n\nThe result of this method is identical to np.diag(self(X)); however,\nit can be evaluated more efficiently since only the diagonal is\nevaluated.\n\nParameters\n----------\nX : array-like of shape (n_samples_X, n_features) or list of object\n Argument to the kernel.\n\nReturns\n-------\nK_diag : ndarray of shape (n_samples_X,)\n Diagonal of kernel k(X, X)", + "description": "Returns the diagonal of the kernel k(X, X).\n\nThe result of this method is identical to np.diag(self(X)); however,\nit can be evaluated more efficiently since only the diagonal is\nevaluated.", + "docstring": "Returns the diagonal of the kernel k(X, X).\n\n The result of this method is identical to np.diag(self(X)); however,\n it can be evaluated more efficiently since only the diagonal is\n evaluated.\n\n Parameters\n ----------\n X : array-like of shape (n_samples_X, n_features) or list of object\n Argument to the kernel.\n\n Returns\n -------\n K_diag : ndarray of shape (n_samples_X,)\n Diagonal of kernel k(X, X)\n ", "source_code": "\ndef diag(self, X):\n \"\"\"Returns the diagonal of the kernel k(X, X).\n\n The result of this method is identical to np.diag(self(X)); however,\n it can be evaluated more efficiently since only the diagonal is\n evaluated.\n\n Parameters\n ----------\n X : array-like of shape (n_samples_X, n_features) or list of object\n Argument to the kernel.\n\n Returns\n -------\n K_diag : ndarray of shape (n_samples_X,)\n Diagonal of kernel k(X, X)\n \"\"\"\n return self.kernel.diag(X)**self.exponent" }, { @@ -87319,7 +92855,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "deep", @@ -87329,13 +92866,14 @@ "docstring": { "type": "bool, default=True", "description": "If True, will return the parameters for this estimator and\ncontained subobjects that are estimators." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Get parameters of this kernel.", - "docstring": "Get parameters of this kernel.\n\nParameters\n----------\ndeep : bool, default=True\n If True, will return the parameters for this estimator and\n contained subobjects that are estimators.\n\nReturns\n-------\nparams : dict\n Parameter names mapped to their values.", + "docstring": "Get parameters of this kernel.\n\n Parameters\n ----------\n deep : bool, default=True\n If True, will return the parameters for this estimator and\n contained subobjects that are estimators.\n\n Returns\n -------\n params : dict\n Parameter names mapped to their values.\n ", "source_code": "\ndef get_params(self, deep=True):\n \"\"\"Get parameters of this kernel.\n\n Parameters\n ----------\n deep : bool, default=True\n If True, will return the parameters for this estimator and\n contained subobjects that are estimators.\n\n Returns\n -------\n params : dict\n Parameter names mapped to their values.\n \"\"\"\n params = dict(kernel=self.kernel, exponent=self.exponent)\n if deep:\n deep_items = self.kernel.get_params().items()\n params.update((('kernel__' + k, val) for (k, val) in deep_items))\n return params" }, { @@ -87353,7 +92891,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -87377,7 +92916,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -87401,7 +92941,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -87425,13 +92966,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Returns the (flattened, log-transformed) non-fixed hyperparameters.\n\nNote that theta are typically the log-transformed values of the kernel's hyperparameters as this representation of the search space is more amenable for hyperparameter search, as hyperparameters like length-scales naturally live on a log-scale.", - "docstring": "Returns the (flattened, log-transformed) non-fixed hyperparameters.\n\nNote that theta are typically the log-transformed values of the\nkernel's hyperparameters as this representation of the search space\nis more amenable for hyperparameter search, as hyperparameters like\nlength-scales naturally live on a log-scale.\n\nReturns\n-------\ntheta : ndarray of shape (n_dims,)\n The non-fixed, log-transformed hyperparameters of the kernel", + "description": "Returns the (flattened, log-transformed) non-fixed hyperparameters.\n\nNote that theta are typically the log-transformed values of the\nkernel's hyperparameters as this representation of the search space\nis more amenable for hyperparameter search, as hyperparameters like\nlength-scales naturally live on a log-scale.", + "docstring": "Returns the (flattened, log-transformed) non-fixed hyperparameters.\n\n Note that theta are typically the log-transformed values of the\n kernel's hyperparameters as this representation of the search space\n is more amenable for hyperparameter search, as hyperparameters like\n length-scales naturally live on a log-scale.\n\n Returns\n -------\n theta : ndarray of shape (n_dims,)\n The non-fixed, log-transformed hyperparameters of the kernel\n ", "source_code": "\n@property\ndef theta(self):\n \"\"\"Returns the (flattened, log-transformed) non-fixed hyperparameters.\n\n Note that theta are typically the log-transformed values of the\n kernel's hyperparameters 
as this representation of the search space\n is more amenable for hyperparameter search, as hyperparameters like\n length-scales naturally live on a log-scale.\n\n Returns\n -------\n theta : ndarray of shape (n_dims,)\n The non-fixed, log-transformed hyperparameters of the kernel\n \"\"\"\n return self.kernel.theta" }, { @@ -87449,7 +92991,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "theta", @@ -87459,13 +93002,14 @@ "docstring": { "type": "ndarray of shape (n_dims,)", "description": "The non-fixed, log-transformed hyperparameters of the kernel" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Sets the (flattened, log-transformed) non-fixed hyperparameters.", - "docstring": "Sets the (flattened, log-transformed) non-fixed hyperparameters.\n\nParameters\n----------\ntheta : ndarray of shape (n_dims,)\n The non-fixed, log-transformed hyperparameters of the kernel", + "docstring": "Sets the (flattened, log-transformed) non-fixed hyperparameters.\n\n Parameters\n ----------\n theta : ndarray of shape (n_dims,)\n The non-fixed, log-transformed hyperparameters of the kernel\n ", "source_code": "\n@theta.setter\ndef theta(self, theta):\n \"\"\"Sets the (flattened, log-transformed) non-fixed hyperparameters.\n\n Parameters\n ----------\n theta : ndarray of shape (n_dims,)\n The non-fixed, log-transformed hyperparameters of the kernel\n \"\"\"\n self.kernel.theta = theta" }, { @@ -87483,7 +93027,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -87507,7 +93052,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "other", @@ -87517,13 +93063,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __eq__(self, other):\n return self.name == other.name and self.value_type == other.value_type and np.all(self.bounds == other.bounds) and self.n_elements == other.n_elements and self.fixed == other.fixed" }, { @@ -87541,7 +93088,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "name", @@ -87551,7 +93099,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "value_type", @@ -87561,7 +93110,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "bounds", @@ -87571,7 +93121,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_elements", @@ -87581,7 +93132,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "fixed", @@ -87591,13 +93143,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __new__(cls, name, value_type, bounds, n_elements=1, fixed=None):\n if not isinstance(bounds, str) or bounds != 'fixed':\n bounds = np.atleast_2d(bounds)\n if n_elements > 1:\n if bounds.shape[0] == 1:\n bounds = np.repeat(bounds, n_elements, 0)\n elif bounds.shape[0] != n_elements:\n raise ValueError('Bounds on %s should have either 1 or %d dimensions. 
Given are %d' % (name, n_elements, bounds.shape[0]))\n if fixed is None:\n fixed = isinstance(bounds, str) and bounds == 'fixed'\n return super(Hyperparameter, cls).__new__(cls, name, value_type, bounds, n_elements, fixed)" }, { @@ -87615,7 +93168,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "b", @@ -87625,13 +93179,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __add__(self, b):\n if not isinstance(b, Kernel):\n return Sum(self, ConstantKernel(b))\n return Sum(self, b)" }, { @@ -87649,7 +93204,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -87659,7 +93215,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "Y", @@ -87669,7 +93226,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "eval_gradient", @@ -87679,7 +93237,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -87703,7 +93262,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "b", @@ -87713,13 +93273,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __eq__(self, b):\n if type(self) != type(b):\n return False\n params_a = self.get_params()\n params_b = b.get_params()\n for key in set(list(params_a.keys()) + list(params_b.keys())):\n if np.any(params_a.get(key, None) != params_b.get(key, None)):\n return False\n return True" }, { @@ -87737,7 +93298,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "b", @@ -87747,13 +93309,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __mul__(self, b):\n if not isinstance(b, Kernel):\n return Product(self, ConstantKernel(b))\n return Product(self, b)" }, { @@ -87771,7 +93334,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "b", @@ -87781,13 +93345,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __pow__(self, b):\n return Exponentiation(self, b)" }, { @@ -87805,7 +93370,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "b", @@ -87815,13 +93381,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __radd__(self, b):\n if not isinstance(b, Kernel):\n return Sum(ConstantKernel(b), self)\n return Sum(b, self)" }, { @@ -87839,13 +93406,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __repr__(self):\n return '{0}({1})'.format(self.__class__.__name__, ', '.join(map('{0:.3g}'.format, self.theta)))" }, { @@ -87863,7 +93431,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": 
"b", @@ -87873,13 +93442,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __rmul__(self, b):\n if not isinstance(b, Kernel):\n return Product(ConstantKernel(b), self)\n return Product(b, self)" }, { @@ -87897,7 +93467,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -87921,13 +93492,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Returns the log-transformed bounds on the theta.", - "docstring": "Returns the log-transformed bounds on the theta.\n\nReturns\n-------\nbounds : ndarray of shape (n_dims, 2)\n The log-transformed bounds on the kernel's hyperparameters theta", + "docstring": "Returns the log-transformed bounds on the theta.\n\n Returns\n -------\n bounds : ndarray of shape (n_dims, 2)\n The log-transformed bounds on the kernel's hyperparameters theta\n ", "source_code": "\n@property\ndef bounds(self):\n \"\"\"Returns the log-transformed bounds on the theta.\n\n Returns\n -------\n bounds : ndarray of shape (n_dims, 2)\n The log-transformed bounds on the kernel's hyperparameters theta\n \"\"\"\n bounds = [hyperparameter.bounds for hyperparameter in self.hyperparameters if not hyperparameter.fixed]\n if len(bounds) > 0:\n return np.log(np.vstack(bounds))\n else:\n return np.array([])" }, { @@ -87945,7 +93517,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "theta", @@ -87955,13 +93528,14 @@ "docstring": { "type": "ndarray of shape (n_dims,)", "description": "The hyperparameters" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Returns a clone of self with given hyperparameters theta.", - "docstring": "Returns a clone of self with given hyperparameters theta.\n\nParameters\n----------\ntheta : ndarray of shape (n_dims,)\n The hyperparameters", + "docstring": "Returns a clone of self with given hyperparameters theta.\n\n Parameters\n ----------\n theta : ndarray of shape (n_dims,)\n The hyperparameters\n ", "source_code": "\ndef clone_with_theta(self, theta):\n \"\"\"Returns a clone of self with given hyperparameters theta.\n\n Parameters\n ----------\n theta : ndarray of shape (n_dims,)\n The hyperparameters\n \"\"\"\n cloned = clone(self)\n cloned.theta = theta\n return cloned" }, { @@ -87979,7 +93553,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -87989,13 +93564,14 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "Left argument of the returned kernel k(X, Y)" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Returns the diagonal of the kernel k(X, X).\n\nThe result of this method is identical to np.diag(self(X)); however, it can be evaluated more efficiently since only the diagonal is evaluated.", - "docstring": "Returns the diagonal of the kernel k(X, X).\n\nThe result of this method is identical to np.diag(self(X)); however,\nit can be evaluated more efficiently since only the diagonal is\nevaluated.\n\nParameters\n----------\nX : array-like of shape (n_samples,)\n Left argument of the returned kernel k(X, Y)\n\nReturns\n-------\nK_diag : ndarray of shape (n_samples_X,)\n Diagonal of kernel k(X, X)", + "description": "Returns the diagonal of the kernel k(X, X).\n\nThe result of this method is 
identical to np.diag(self(X)); however,\nit can be evaluated more efficiently since only the diagonal is\nevaluated.", + "docstring": "Returns the diagonal of the kernel k(X, X).\n\n The result of this method is identical to np.diag(self(X)); however,\n it can be evaluated more efficiently since only the diagonal is\n evaluated.\n\n Parameters\n ----------\n X : array-like of shape (n_samples,)\n Left argument of the returned kernel k(X, Y)\n\n Returns\n -------\n K_diag : ndarray of shape (n_samples_X,)\n Diagonal of kernel k(X, X)\n ", "source_code": "\n@abstractmethod\ndef diag(self, X):\n \"\"\"Returns the diagonal of the kernel k(X, X).\n\n The result of this method is identical to np.diag(self(X)); however,\n it can be evaluated more efficiently since only the diagonal is\n evaluated.\n\n Parameters\n ----------\n X : array-like of shape (n_samples,)\n Left argument of the returned kernel k(X, Y)\n\n Returns\n -------\n K_diag : ndarray of shape (n_samples_X,)\n Diagonal of kernel k(X, X)\n \"\"\"\n " }, { @@ -88013,7 +93589,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "deep", @@ -88023,13 +93600,14 @@ "docstring": { "type": "bool, default=True", "description": "If True, will return the parameters for this estimator and\ncontained subobjects that are estimators." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Get parameters of this kernel.", - "docstring": "Get parameters of this kernel.\n\nParameters\n----------\ndeep : bool, default=True\n If True, will return the parameters for this estimator and\n contained subobjects that are estimators.\n\nReturns\n-------\nparams : dict\n Parameter names mapped to their values.", + "docstring": "Get parameters of this kernel.\n\n Parameters\n ----------\n deep : bool, default=True\n If True, will return the parameters for this estimator and\n contained subobjects that are estimators.\n\n Returns\n -------\n params : dict\n Parameter names mapped to their values.\n ", "source_code": "\ndef get_params(self, deep=True):\n \"\"\"Get parameters of this kernel.\n\n Parameters\n ----------\n deep : bool, default=True\n If True, will return the parameters for this estimator and\n contained subobjects that are estimators.\n\n Returns\n -------\n params : dict\n Parameter names mapped to their values.\n \"\"\"\n params = dict()\n cls = self.__class__\n init = getattr(cls.__init__, 'deprecated_original', cls.__init__)\n init_sign = signature(init)\n (args, varargs) = ([], [])\n for parameter in init_sign.parameters.values():\n if parameter.kind != parameter.VAR_KEYWORD and parameter.name != 'self':\n args.append(parameter.name)\n if parameter.kind == parameter.VAR_POSITIONAL:\n varargs.append(parameter.name)\n if len(varargs) != 0:\n raise RuntimeError(\"scikit-learn kernels should always specify their parameters in the signature of their __init__ (no varargs). 
%s doesn't follow this convention.\" % (cls, ))\n for arg in args:\n params[arg] = getattr(self, arg)\n return params" }, { @@ -88047,7 +93625,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -88071,7 +93650,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -88095,7 +93675,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -88119,13 +93700,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Returns whether the kernel is defined on fixed-length feature vectors or generic objects. Defaults to True for backward compatibility.", - "docstring": "Returns whether the kernel is defined on fixed-length feature\nvectors or generic objects. Defaults to True for backward\ncompatibility.", + "description": "Returns whether the kernel is defined on fixed-length feature\nvectors or generic objects. Defaults to True for backward\ncompatibility.", + "docstring": "Returns whether the kernel is defined on fixed-length feature\n vectors or generic objects. Defaults to True for backward\n compatibility.", "source_code": "\n@property\ndef requires_vector_input(self):\n \"\"\"Returns whether the kernel is defined on fixed-length feature\n vectors or generic objects. Defaults to True for backward\n compatibility.\"\"\"\n return True" }, { @@ -88143,13 +93725,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Set the parameters of this kernel.\n\nThe method works on simple kernels as well as on nested kernels. The latter have parameters of the form ``__`` so that it's possible to update each component of a nested object.", - "docstring": "Set the parameters of this kernel.\n\nThe method works on simple kernels as well as on nested kernels.\nThe latter have parameters of the form ``__``\nso that it's possible to update each component of a nested object.\n\nReturns\n-------\nself", + "description": "Set the parameters of this kernel.\n\nThe method works on simple kernels as well as on nested kernels.\nThe latter have parameters of the form ``__``\nso that it's possible to update each component of a nested object.", + "docstring": "Set the parameters of this kernel.\n\n The method works on simple kernels as well as on nested kernels.\n The latter have parameters of the form ``__``\n so that it's possible to update each component of a nested object.\n\n Returns\n -------\n self\n ", "source_code": "\ndef set_params(self, **params):\n \"\"\"Set the parameters of this kernel.\n\n The method works on simple kernels as well as on nested kernels.\n The latter have parameters of the form ``__``\n so that it's possible to update each component of a nested object.\n\n Returns\n -------\n self\n \"\"\"\n if not params:\n return self\n valid_params = self.get_params(deep=True)\n for (key, value) in params.items():\n split = key.split('__', 1)\n if len(split) > 1:\n (name, sub_name) = split\n if name not in valid_params:\n raise ValueError('Invalid parameter %s for kernel %s. Check the list of available parameters with `kernel.get_params().keys()`.' % (name, self))\n sub_object = valid_params[name]\n sub_object.set_params(**{sub_name: value})\n else:\n if key not in valid_params:\n raise ValueError('Invalid parameter %s for kernel %s. 
Check the list of available parameters with `kernel.get_params().keys()`.' % (key, self.__class__.__name__))\n setattr(self, key, value)\n return self" }, { @@ -88167,13 +93750,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Returns the (flattened, log-transformed) non-fixed hyperparameters.\n\nNote that theta are typically the log-transformed values of the kernel's hyperparameters as this representation of the search space is more amenable for hyperparameter search, as hyperparameters like length-scales naturally live on a log-scale.", - "docstring": "Returns the (flattened, log-transformed) non-fixed hyperparameters.\n\nNote that theta are typically the log-transformed values of the\nkernel's hyperparameters as this representation of the search space\nis more amenable for hyperparameter search, as hyperparameters like\nlength-scales naturally live on a log-scale.\n\nReturns\n-------\ntheta : ndarray of shape (n_dims,)\n The non-fixed, log-transformed hyperparameters of the kernel", + "description": "Returns the (flattened, log-transformed) non-fixed hyperparameters.\n\nNote that theta are typically the log-transformed values of the\nkernel's hyperparameters as this representation of the search space\nis more amenable for hyperparameter search, as hyperparameters like\nlength-scales naturally live on a log-scale.", + "docstring": "Returns the (flattened, log-transformed) non-fixed hyperparameters.\n\n Note that theta are typically the log-transformed values of the\n kernel's hyperparameters as this representation of the search space\n is more amenable for hyperparameter search, as hyperparameters like\n length-scales naturally live on a log-scale.\n\n Returns\n -------\n theta : ndarray of shape (n_dims,)\n The non-fixed, log-transformed hyperparameters of the kernel\n ", "source_code": "\n@property\ndef theta(self):\n \"\"\"Returns the (flattened, log-transformed) non-fixed hyperparameters.\n\n Note that theta are typically the log-transformed values of the\n kernel's hyperparameters as this representation of the search space\n is more amenable for hyperparameter search, as hyperparameters like\n length-scales naturally live on a log-scale.\n\n Returns\n -------\n theta : ndarray of shape (n_dims,)\n The non-fixed, log-transformed hyperparameters of the kernel\n \"\"\"\n theta = []\n params = self.get_params()\n for hyperparameter in self.hyperparameters:\n if not hyperparameter.fixed:\n theta.append(params[hyperparameter.name])\n if len(theta) > 0:\n return np.log(np.hstack(theta))\n else:\n return np.array([])" }, { @@ -88191,7 +93775,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "theta", @@ -88201,13 +93786,14 @@ "docstring": { "type": "ndarray of shape (n_dims,)", "description": "The non-fixed, log-transformed hyperparameters of the kernel" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Sets the (flattened, log-transformed) non-fixed hyperparameters.", - "docstring": "Sets the (flattened, log-transformed) non-fixed hyperparameters.\n\nParameters\n----------\ntheta : ndarray of shape (n_dims,)\n The non-fixed, log-transformed hyperparameters of the kernel", + "docstring": "Sets the (flattened, log-transformed) non-fixed hyperparameters.\n\n Parameters\n ----------\n theta : ndarray of shape (n_dims,)\n The non-fixed, log-transformed hyperparameters of the kernel\n ", "source_code": "\n@theta.setter\ndef 
theta(self, theta):\n \"\"\"Sets the (flattened, log-transformed) non-fixed hyperparameters.\n\n Parameters\n ----------\n theta : ndarray of shape (n_dims,)\n The non-fixed, log-transformed hyperparameters of the kernel\n \"\"\"\n params = self.get_params()\n i = 0\n for hyperparameter in self.hyperparameters:\n if hyperparameter.fixed:\n continue\n if hyperparameter.n_elements > 1:\n params[hyperparameter.name] = np.exp(theta[i:i + hyperparameter.n_elements])\n i += hyperparameter.n_elements\n else:\n params[hyperparameter.name] = np.exp(theta[i])\n i += 1\n if i != len(theta):\n raise ValueError('theta has not the correct number of entries. Should be %d; given are %d' % (i, len(theta)))\n self.set_params(**params)" }, { @@ -88225,7 +93811,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "b", @@ -88235,13 +93822,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __eq__(self, b):\n if type(self) != type(b):\n return False\n return self.k1 == b.k1 and self.k2 == b.k2 or self.k1 == b.k2 and self.k2 == b.k1" }, { @@ -88259,7 +93847,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "k1", @@ -88269,7 +93858,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "k2", @@ -88279,13 +93869,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, k1, k2):\n self.k1 = k1\n self.k2 = k2" }, { @@ -88303,13 +93894,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Returns the log-transformed bounds on the theta.", - "docstring": "Returns the log-transformed bounds on the theta.\n\nReturns\n-------\nbounds : ndarray of shape (n_dims, 2)\n The log-transformed bounds on the kernel's hyperparameters theta", + "docstring": "Returns the log-transformed bounds on the theta.\n\n Returns\n -------\n bounds : ndarray of shape (n_dims, 2)\n The log-transformed bounds on the kernel's hyperparameters theta\n ", "source_code": "\n@property\ndef bounds(self):\n \"\"\"Returns the log-transformed bounds on the theta.\n\n Returns\n -------\n bounds : ndarray of shape (n_dims, 2)\n The log-transformed bounds on the kernel's hyperparameters theta\n \"\"\"\n if self.k1.bounds.size == 0:\n return self.k2.bounds\n if self.k2.bounds.size == 0:\n return self.k1.bounds\n return np.vstack((self.k1.bounds, self.k2.bounds))" }, { @@ -88327,7 +93919,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "deep", @@ -88337,13 +93930,14 @@ "docstring": { "type": "bool, default=True", "description": "If True, will return the parameters for this estimator and\ncontained subobjects that are estimators." 
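For reference, a minimal sketch of how the `theta` getter/setter and `clone_with_theta` documented in the entries above behave on a composite kernel. This is illustrative only and assumes the public `sklearn.gaussian_process.kernels` API; it is not part of the diffed data.

```python
import numpy as np
from sklearn.gaussian_process.kernels import RBF, ConstantKernel

# A Product kernel: ConstantKernel * RBF.
kernel = ConstantKernel(1.0) * RBF(length_scale=2.0)

# theta is the flattened, log-transformed vector of non-fixed hyperparameters.
print(kernel.theta)          # [log(1.0), log(2.0)]
print(np.exp(kernel.theta))  # back on the natural scale: [1.0, 2.0]

# Assigning theta (here via clone_with_theta) routes through set_params.
tuned = kernel.clone_with_theta(np.log([0.5, 3.0]))
print(tuned.get_params()["k2__length_scale"])  # 3.0 (up to floating point)
```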
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Get parameters of this kernel.", - "docstring": "Get parameters of this kernel.\n\nParameters\n----------\ndeep : bool, default=True\n If True, will return the parameters for this estimator and\n contained subobjects that are estimators.\n\nReturns\n-------\nparams : dict\n Parameter names mapped to their values.", + "docstring": "Get parameters of this kernel.\n\n Parameters\n ----------\n deep : bool, default=True\n If True, will return the parameters for this estimator and\n contained subobjects that are estimators.\n\n Returns\n -------\n params : dict\n Parameter names mapped to their values.\n ", "source_code": "\ndef get_params(self, deep=True):\n \"\"\"Get parameters of this kernel.\n\n Parameters\n ----------\n deep : bool, default=True\n If True, will return the parameters for this estimator and\n contained subobjects that are estimators.\n\n Returns\n -------\n params : dict\n Parameter names mapped to their values.\n \"\"\"\n params = dict(k1=self.k1, k2=self.k2)\n if deep:\n deep_items = self.k1.get_params().items()\n params.update((('k1__' + k, val) for (k, val) in deep_items))\n deep_items = self.k2.get_params().items()\n params.update((('k2__' + k, val) for (k, val) in deep_items))\n return params" }, { @@ -88361,7 +93955,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -88385,7 +93980,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -88409,7 +94005,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -88433,13 +94030,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Returns the (flattened, log-transformed) non-fixed hyperparameters.\n\nNote that theta are typically the log-transformed values of the kernel's hyperparameters as this representation of the search space is more amenable for hyperparameter search, as hyperparameters like length-scales naturally live on a log-scale.", - "docstring": "Returns the (flattened, log-transformed) non-fixed hyperparameters.\n\nNote that theta are typically the log-transformed values of the\nkernel's hyperparameters as this representation of the search space\nis more amenable for hyperparameter search, as hyperparameters like\nlength-scales naturally live on a log-scale.\n\nReturns\n-------\ntheta : ndarray of shape (n_dims,)\n The non-fixed, log-transformed hyperparameters of the kernel", + "description": "Returns the (flattened, log-transformed) non-fixed hyperparameters.\n\nNote that theta are typically the log-transformed values of the\nkernel's hyperparameters as this representation of the search space\nis more amenable for hyperparameter search, as hyperparameters like\nlength-scales naturally live on a log-scale.", + "docstring": "Returns the (flattened, log-transformed) non-fixed hyperparameters.\n\n Note that theta are typically the log-transformed values of the\n kernel's hyperparameters as this representation of the search space\n is more amenable for hyperparameter search, as hyperparameters like\n length-scales naturally live on a log-scale.\n\n Returns\n -------\n theta : ndarray of shape (n_dims,)\n The non-fixed, log-transformed hyperparameters of the kernel\n ", "source_code": "\n@property\ndef theta(self):\n \"\"\"Returns the (flattened, log-transformed) non-fixed hyperparameters.\n\n Note that 
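The `KernelOperator` entries above (the `k1`/`k2` attributes, nested `get_params`, and stacked `bounds`/`theta`) are what the kernel arithmetic builds on. A small sketch, again assuming only the public kernels module:

```python
from sklearn.gaussian_process.kernels import RBF

# __rmul__, __add__ and __pow__ return Product, Sum and Exponentiation objects.
k = 2.0 * RBF(1.0) + RBF(0.5) ** 2

print(type(k).__name__)     # Sum
print(type(k.k1).__name__)  # Product        (ConstantKernel(2.0) * RBF(1.0))
print(type(k.k2).__name__)  # Exponentiation (RBF(0.5) ** 2)

# Nested parameters use the k1__ / k2__ prefixes produced by get_params above.
print("k1__k2__length_scale" in k.get_params())  # True
print(k.theta.shape, k.bounds.shape)             # (3,) (3, 2)
```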
theta are typically the log-transformed values of the\n kernel's hyperparameters as this representation of the search space\n is more amenable for hyperparameter search, as hyperparameters like\n length-scales naturally live on a log-scale.\n\n Returns\n -------\n theta : ndarray of shape (n_dims,)\n The non-fixed, log-transformed hyperparameters of the kernel\n \"\"\"\n return np.append(self.k1.theta, self.k2.theta)" }, { @@ -88457,7 +94055,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "theta", @@ -88467,13 +94066,14 @@ "docstring": { "type": "ndarray of shape (n_dims,)", "description": "The non-fixed, log-transformed hyperparameters of the kernel" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Sets the (flattened, log-transformed) non-fixed hyperparameters.", - "docstring": "Sets the (flattened, log-transformed) non-fixed hyperparameters.\n\nParameters\n----------\ntheta : ndarray of shape (n_dims,)\n The non-fixed, log-transformed hyperparameters of the kernel", + "docstring": "Sets the (flattened, log-transformed) non-fixed hyperparameters.\n\n Parameters\n ----------\n theta : ndarray of shape (n_dims,)\n The non-fixed, log-transformed hyperparameters of the kernel\n ", "source_code": "\n@theta.setter\ndef theta(self, theta):\n \"\"\"Sets the (flattened, log-transformed) non-fixed hyperparameters.\n\n Parameters\n ----------\n theta : ndarray of shape (n_dims,)\n The non-fixed, log-transformed hyperparameters of the kernel\n \"\"\"\n k1_dims = self.k1.n_dims\n self.k1.theta = theta[:k1_dims]\n self.k2.theta = theta[k1_dims:]" }, { @@ -88491,7 +94091,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -88501,7 +94102,8 @@ "docstring": { "type": "ndarray of shape (n_samples_X, n_features)", "description": "Left argument of the returned kernel k(X, Y)" - } + }, + "refined_type": {} }, { "name": "Y", @@ -88511,7 +94113,8 @@ "docstring": { "type": "ndarray of shape (n_samples_Y, n_features), default=None", "description": "Right argument of the returned kernel k(X, Y). If None, k(X, X)\nif evaluated instead." - } + }, + "refined_type": {} }, { "name": "eval_gradient", @@ -88521,13 +94124,14 @@ "docstring": { "type": "bool, default=False", "description": "Determines whether the gradient with respect to the log of\nthe kernel hyperparameter is computed.\nOnly supported when Y is None." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Return the kernel k(X, Y) and optionally its gradient.", - "docstring": "Return the kernel k(X, Y) and optionally its gradient.\n\nParameters\n----------\nX : ndarray of shape (n_samples_X, n_features)\n Left argument of the returned kernel k(X, Y)\n\nY : ndarray of shape (n_samples_Y, n_features), default=None\n Right argument of the returned kernel k(X, Y). If None, k(X, X)\n if evaluated instead.\n\neval_gradient : bool, default=False\n Determines whether the gradient with respect to the log of\n the kernel hyperparameter is computed.\n Only supported when Y is None.\n\nReturns\n-------\nK : ndarray of shape (n_samples_X, n_samples_Y)\n Kernel k(X, Y)\n\nK_gradient : ndarray of shape (n_samples_X, n_samples_X, n_dims), optional\n The gradient of the kernel k(X, X) with respect to the log of the\n hyperparameter of the kernel. 
Only returned when `eval_gradient`\n is True.", + "docstring": "Return the kernel k(X, Y) and optionally its gradient.\n\n Parameters\n ----------\n X : ndarray of shape (n_samples_X, n_features)\n Left argument of the returned kernel k(X, Y)\n\n Y : ndarray of shape (n_samples_Y, n_features), default=None\n Right argument of the returned kernel k(X, Y). If None, k(X, X)\n if evaluated instead.\n\n eval_gradient : bool, default=False\n Determines whether the gradient with respect to the log of\n the kernel hyperparameter is computed.\n Only supported when Y is None.\n\n Returns\n -------\n K : ndarray of shape (n_samples_X, n_samples_Y)\n Kernel k(X, Y)\n\n K_gradient : ndarray of shape (n_samples_X, n_samples_X, n_dims), optional\n The gradient of the kernel k(X, X) with respect to the log of the\n hyperparameter of the kernel. Only returned when `eval_gradient`\n is True.\n ", "source_code": "\ndef __call__(self, X, Y=None, eval_gradient=False):\n \"\"\"Return the kernel k(X, Y) and optionally its gradient.\n\n Parameters\n ----------\n X : ndarray of shape (n_samples_X, n_features)\n Left argument of the returned kernel k(X, Y)\n\n Y : ndarray of shape (n_samples_Y, n_features), default=None\n Right argument of the returned kernel k(X, Y). If None, k(X, X)\n if evaluated instead.\n\n eval_gradient : bool, default=False\n Determines whether the gradient with respect to the log of\n the kernel hyperparameter is computed.\n Only supported when Y is None.\n\n Returns\n -------\n K : ndarray of shape (n_samples_X, n_samples_Y)\n Kernel k(X, Y)\n\n K_gradient : ndarray of shape (n_samples_X, n_samples_X, n_dims), optional\n The gradient of the kernel k(X, X) with respect to the log of the\n hyperparameter of the kernel. Only returned when `eval_gradient`\n is True.\n \"\"\"\n X = np.atleast_2d(X)\n length_scale = _check_length_scale(X, self.length_scale)\n if Y is None:\n dists = pdist(X / length_scale, metric='euclidean')\n else:\n if eval_gradient:\n raise ValueError('Gradient can only be evaluated when Y is None.')\n dists = cdist(X / length_scale, Y / length_scale, metric='euclidean')\n if self.nu == 0.5:\n K = np.exp(-dists)\n elif self.nu == 1.5:\n K = dists * math.sqrt(3)\n K = (1.0 + K) * np.exp(-K)\n elif self.nu == 2.5:\n K = dists * math.sqrt(5)\n K = (1.0 + K + K**2 / 3.0) * np.exp(-K)\n elif self.nu == np.inf:\n K = np.exp(-dists**2 / 2.0)\n else:\n K = dists\n K[K == 0.0] += np.finfo(float).eps\n tmp = math.sqrt(2 * self.nu) * K\n K.fill(2**(1.0 - self.nu) / gamma(self.nu))\n K *= tmp**self.nu\n K *= kv(self.nu, tmp)\n if Y is None:\n K = squareform(K)\n np.fill_diagonal(K, 1)\n if eval_gradient:\n if self.hyperparameter_length_scale.fixed:\n K_gradient = np.empty((X.shape[0], X.shape[0], 0))\n return K, K_gradient\n if self.anisotropic:\n D = (X[:, np.newaxis, :] - X[np.newaxis, :, :])**2 / length_scale**2\n else:\n D = squareform(dists**2)[:, :, np.newaxis]\n if self.nu == 0.5:\n denominator = np.sqrt(D.sum(axis=2))[:, :, np.newaxis]\n K_gradient = K[..., np.newaxis] * np.divide(D, denominator, where=denominator != 0)\n elif self.nu == 1.5:\n K_gradient = 3 * D * np.exp(-np.sqrt(3 * D.sum(-1)))[..., np.newaxis]\n elif self.nu == 2.5:\n tmp = np.sqrt(5 * D.sum(-1))[..., np.newaxis]\n K_gradient = 5.0 / 3.0 * D * (tmp + 1) * np.exp(-tmp)\n elif self.nu == np.inf:\n K_gradient = D * K[..., np.newaxis]\n else:\n \n def f(theta):\n return self.clone_with_theta(theta)(X, Y)\n return K, _approx_fprime(self.theta, f, 1e-10)\n if not self.anisotropic:\n return K, K_gradient[:, 
:].sum(-1)[:, :, np.newaxis]\n else:\n return K, K_gradient\n else:\n return K" }, { @@ -88545,7 +94149,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "length_scale", @@ -88555,7 +94160,8 @@ "docstring": { "type": "float or ndarray of shape (n_features,), default=1.0", "description": "The length scale of the kernel. If a float, an isotropic kernel is\nused. If an array, an anisotropic kernel is used where each dimension\nof l defines the length-scale of the respective feature dimension." - } + }, + "refined_type": {} }, { "name": "length_scale_bounds", @@ -88565,7 +94171,8 @@ "docstring": { "type": "pair of floats >= 0 or \"fixed\", default=(1e-5, 1e5)", "description": "The lower and upper bound on 'length_scale'.\nIf set to \"fixed\", 'length_scale' cannot be changed during\nhyperparameter tuning." - } + }, + "refined_type": {} }, { "name": "nu", @@ -88575,13 +94182,14 @@ "docstring": { "type": "float, default=1.5", "description": "The parameter nu controlling the smoothness of the learned function.\nThe smaller nu, the less smooth the approximated function is.\nFor nu=inf, the kernel becomes equivalent to the RBF kernel and for\nnu=0.5 to the absolute exponential kernel. Important intermediate\nvalues are nu=1.5 (once differentiable functions) and nu=2.5\n(twice differentiable functions). Note that values of nu not in\n[0.5, 1.5, 2.5, inf] incur a considerably higher computational cost\n(appr. 10 times higher) since they require to evaluate the modified\nBessel function. Furthermore, in contrast to l, nu is kept fixed to\nits initial value and not optimized." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, length_scale=1.0, length_scale_bounds=(1e-05, 100000.0), nu=1.5):\n super().__init__(length_scale, length_scale_bounds)\n self.nu = nu" }, { @@ -88599,13 +94207,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __repr__(self):\n if self.anisotropic:\n return '{0}(length_scale=[{1}], nu={2:.3g})'.format(self.__class__.__name__, ', '.join(map('{0:.3g}'.format, self.length_scale)), self.nu)\n else:\n return '{0}(length_scale={1:.3g}, nu={2:.3g})'.format(self.__class__.__name__, np.ravel(self.length_scale)[0], self.nu)" }, { @@ -88623,7 +94232,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -88633,13 +94243,14 @@ "docstring": { "type": "ndarray of shape (n_samples_X, n_features)", "description": "Left argument of the returned kernel k(X, Y)" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Returns the diagonal of the kernel k(X, X).\n\nThe result of this method is identical to np.diag(self(X)); however, it can be evaluated more efficiently since only the diagonal is evaluated.", - "docstring": "Returns the diagonal of the kernel k(X, X).\n\nThe result of this method is identical to np.diag(self(X)); however,\nit can be evaluated more efficiently since only the diagonal is\nevaluated.\n\nParameters\n----------\nX : ndarray of shape (n_samples_X, n_features)\n Left argument of the returned kernel k(X, Y)\n\nReturns\n-------\nK_diag : ndarray of shape (n_samples_X,)\n Diagonal of kernel k(X, X)", + "description": "Returns the diagonal of the kernel k(X, X).\n\nThe result of this method is 
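A short sketch of the `Matern` evaluation documented above: the special-cased `nu` values (0.5, 1.5, 2.5, inf) have closed forms, while other values fall back to the modified Bessel function. Assumes the public API; shown for illustration only.

```python
import numpy as np
from sklearn.gaussian_process.kernels import Matern, RBF

X = np.random.RandomState(0).rand(6, 2)

# nu=inf reduces Matern to the RBF kernel, as noted in the nu description.
m_inf = Matern(length_scale=1.0, nu=np.inf)
print(np.allclose(m_inf(X), RBF(length_scale=1.0)(X)))   # True

# nu=1.5 (once-differentiable functions) supports analytic gradients.
K, K_grad = Matern(length_scale=1.0, nu=1.5)(X, eval_gradient=True)
print(K.shape, K_grad.shape)   # (6, 6) (6, 6, 1)
```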
identical to np.diag(self(X)); however,\nit can be evaluated more efficiently since only the diagonal is\nevaluated.", + "docstring": "Returns the diagonal of the kernel k(X, X).\n\n The result of this method is identical to np.diag(self(X)); however,\n it can be evaluated more efficiently since only the diagonal is\n evaluated.\n\n Parameters\n ----------\n X : ndarray of shape (n_samples_X, n_features)\n Left argument of the returned kernel k(X, Y)\n\n Returns\n -------\n K_diag : ndarray of shape (n_samples_X,)\n Diagonal of kernel k(X, X)\n ", "source_code": "\ndef diag(self, X):\n \"\"\"Returns the diagonal of the kernel k(X, X).\n\n The result of this method is identical to np.diag(self(X)); however,\n it can be evaluated more efficiently since only the diagonal is\n evaluated.\n\n Parameters\n ----------\n X : ndarray of shape (n_samples_X, n_features)\n Left argument of the returned kernel k(X, Y)\n\n Returns\n -------\n K_diag : ndarray of shape (n_samples_X,)\n Diagonal of kernel k(X, X)\n \"\"\"\n return np.ones(X.shape[0])" }, { @@ -88657,7 +94268,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -88667,7 +94279,8 @@ "docstring": { "type": "ndarray of shape (n_samples_X, n_features)", "description": "Left argument of the returned kernel k(X, Y)" - } + }, + "refined_type": {} }, { "name": "Y", @@ -88677,7 +94290,8 @@ "docstring": { "type": "ndarray of shape (n_samples_Y, n_features), default=None", "description": "Right argument of the returned kernel k(X, Y). If None, k(X, X)\nif evaluated instead." - } + }, + "refined_type": {} }, { "name": "eval_gradient", @@ -88687,13 +94301,14 @@ "docstring": { "type": "bool, default=False", "description": "Determines whether the gradient with respect to the log of\nthe kernel hyperparameter is computed.\nOnly supported when Y is None." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Return the kernel k(X, Y) and optionally its gradient.", - "docstring": "Return the kernel k(X, Y) and optionally its gradient.\n\nParameters\n----------\nX : ndarray of shape (n_samples_X, n_features)\n Left argument of the returned kernel k(X, Y)\n\nY : ndarray of shape (n_samples_Y, n_features), default=None\n Right argument of the returned kernel k(X, Y). If None, k(X, X)\n if evaluated instead.\n\neval_gradient : bool, default=False\n Determines whether the gradient with respect to the log of\n the kernel hyperparameter is computed.\n Only supported when Y is None.\n\nReturns\n-------\nK : ndarray of shape (n_samples_X, n_samples_Y)\n Kernel k(X, Y)\n\nK_gradient : ndarray of shape (n_samples_X, n_samples_X, n_dims), optional\n The gradient of the kernel k(X, X) with respect to the log of the\n hyperparameter of the kernel. Only returned when `eval_gradient`\n is True.", + "docstring": "Return the kernel k(X, Y) and optionally its gradient.\n\n Parameters\n ----------\n X : ndarray of shape (n_samples_X, n_features)\n Left argument of the returned kernel k(X, Y)\n\n Y : ndarray of shape (n_samples_Y, n_features), default=None\n Right argument of the returned kernel k(X, Y). 
If None, k(X, X)\n if evaluated instead.\n\n eval_gradient : bool, default=False\n Determines whether the gradient with respect to the log of\n the kernel hyperparameter is computed.\n Only supported when Y is None.\n\n Returns\n -------\n K : ndarray of shape (n_samples_X, n_samples_Y)\n Kernel k(X, Y)\n\n K_gradient : ndarray of shape (n_samples_X, n_samples_X, n_dims), optional\n The gradient of the kernel k(X, X) with respect to the log of the\n hyperparameter of the kernel. Only returned when `eval_gradient`\n is True.\n ", "source_code": "\ndef __call__(self, X, Y=None, eval_gradient=False):\n \"\"\"Return the kernel k(X, Y) and optionally its gradient.\n\n Parameters\n ----------\n X : ndarray of shape (n_samples_X, n_features)\n Left argument of the returned kernel k(X, Y)\n\n Y : ndarray of shape (n_samples_Y, n_features), default=None\n Right argument of the returned kernel k(X, Y). If None, k(X, X)\n if evaluated instead.\n\n eval_gradient : bool, default=False\n Determines whether the gradient with respect to the log of\n the kernel hyperparameter is computed.\n Only supported when Y is None.\n\n Returns\n -------\n K : ndarray of shape (n_samples_X, n_samples_Y)\n Kernel k(X, Y)\n\n K_gradient : ndarray of shape (n_samples_X, n_samples_X, n_dims), optional\n The gradient of the kernel k(X, X) with respect to the log of the\n hyperparameter of the kernel. Only returned when `eval_gradient`\n is True.\n \"\"\"\n pairwise_kernels_kwargs = self.pairwise_kernels_kwargs\n if self.pairwise_kernels_kwargs is None:\n pairwise_kernels_kwargs = {}\n X = np.atleast_2d(X)\n K = pairwise_kernels(X, Y, metric=self.metric, gamma=self.gamma, filter_params=True, **pairwise_kernels_kwargs)\n if eval_gradient:\n if self.hyperparameter_gamma.fixed:\n return K, np.empty((X.shape[0], X.shape[0], 0))\n else:\n \n def f(gamma):\n return pairwise_kernels(X, Y, metric=self.metric, gamma=np.exp(gamma), filter_params=True, **pairwise_kernels_kwargs)\n return K, _approx_fprime(self.theta, f, 1e-10)\n else:\n return K" }, { @@ -88711,7 +94326,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "gamma", @@ -88721,7 +94337,8 @@ "docstring": { "type": "float, default=1.0", "description": "Parameter gamma of the pairwise kernel specified by metric. It should\nbe positive." - } + }, + "refined_type": {} }, { "name": "gamma_bounds", @@ -88731,7 +94348,8 @@ "docstring": { "type": "pair of floats >= 0 or \"fixed\", default=(1e-5, 1e5)", "description": "The lower and upper bound on 'gamma'.\nIf set to \"fixed\", 'gamma' cannot be changed during\nhyperparameter tuning." - } + }, + "refined_type": {} }, { "name": "metric", @@ -88741,6 +94359,20 @@ "docstring": { "type": "{\"linear\", \"additive_chi2\", \"chi2\", \"poly\", \"polynomial\", \"rbf\", \"laplacian\", \"sigmoid\", \"cosine\"} or callable, default=\"linear\"", "description": "The metric to use when calculating kernel between instances in a\nfeature array. If metric is a string, it must be one of the metrics\nin pairwise.PAIRWISE_KERNEL_FUNCTIONS.\nIf metric is \"precomputed\", X is assumed to be a kernel matrix.\nAlternatively, if metric is a callable function, it is called on each\npair of instances (rows) and the resulting value recorded. The callable\nshould take two arrays from X as input and return a value indicating\nthe distance between them." 
+ }, + "refined_type": { + "kind": "EnumType", + "values": [ + "additive_chi2", + "poly", + "cosine", + "rbf", + "chi2", + "laplacian", + "polynomial", + "sigmoid", + "linear" + ] } }, { @@ -88751,13 +94383,14 @@ "docstring": { "type": "dict, default=None", "description": "All entries of this dict (if any) are passed as keyword arguments to\nthe pairwise kernel function." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, gamma=1.0, gamma_bounds=(1e-05, 100000.0), metric='linear', pairwise_kernels_kwargs=None):\n self.gamma = gamma\n self.gamma_bounds = gamma_bounds\n self.metric = metric\n self.pairwise_kernels_kwargs = pairwise_kernels_kwargs" }, { @@ -88775,13 +94408,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __repr__(self):\n return '{0}(gamma={1}, metric={2})'.format(self.__class__.__name__, self.gamma, self.metric)" }, { @@ -88799,7 +94433,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -88809,13 +94444,14 @@ "docstring": { "type": "ndarray of shape (n_samples_X, n_features)", "description": "Left argument of the returned kernel k(X, Y)" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Returns the diagonal of the kernel k(X, X).\n\nThe result of this method is identical to np.diag(self(X)); however, it can be evaluated more efficiently since only the diagonal is evaluated.", - "docstring": "Returns the diagonal of the kernel k(X, X).\n\nThe result of this method is identical to np.diag(self(X)); however,\nit can be evaluated more efficiently since only the diagonal is\nevaluated.\n\nParameters\n----------\nX : ndarray of shape (n_samples_X, n_features)\n Left argument of the returned kernel k(X, Y)\n\nReturns\n-------\nK_diag : ndarray of shape (n_samples_X,)\n Diagonal of kernel k(X, X)", + "description": "Returns the diagonal of the kernel k(X, X).\n\nThe result of this method is identical to np.diag(self(X)); however,\nit can be evaluated more efficiently since only the diagonal is\nevaluated.", + "docstring": "Returns the diagonal of the kernel k(X, X).\n\n The result of this method is identical to np.diag(self(X)); however,\n it can be evaluated more efficiently since only the diagonal is\n evaluated.\n\n Parameters\n ----------\n X : ndarray of shape (n_samples_X, n_features)\n Left argument of the returned kernel k(X, Y)\n\n Returns\n -------\n K_diag : ndarray of shape (n_samples_X,)\n Diagonal of kernel k(X, X)\n ", "source_code": "\ndef diag(self, X):\n \"\"\"Returns the diagonal of the kernel k(X, X).\n\n The result of this method is identical to np.diag(self(X)); however,\n it can be evaluated more efficiently since only the diagonal is\n evaluated.\n\n Parameters\n ----------\n X : ndarray of shape (n_samples_X, n_features)\n Left argument of the returned kernel k(X, Y)\n\n Returns\n -------\n K_diag : ndarray of shape (n_samples_X,)\n Diagonal of kernel k(X, X)\n \"\"\"\n return np.apply_along_axis(self, 1, X).ravel()" }, { @@ -88833,13 +94469,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@property\ndef hyperparameter_gamma(self):\n return Hyperparameter('gamma', 
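A brief usage sketch for `PairwiseKernel` as documented above: `gamma` is the single tunable hyperparameter, `metric` selects the underlying pairwise kernel, and the gradient is obtained numerically. Assumes the public API.

```python
import numpy as np
from sklearn.gaussian_process.kernels import PairwiseKernel

X = np.random.RandomState(0).rand(5, 3)

k = PairwiseKernel(gamma=1.0, metric="rbf")   # metric picks the pairwise kernel
K, K_grad = k(X, eval_gradient=True)

print(K.shape, K_grad.shape)               # (5, 5) (5, 5, 1): one slice for log(gamma)
print(np.allclose(k.diag(X), np.diag(K)))  # True; diag() matches np.diag(k(X))
```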
'numeric', self.gamma_bounds)" }, { @@ -88857,7 +94494,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -88881,7 +94519,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -88891,7 +94530,8 @@ "docstring": { "type": "array-like of shape (n_samples_X, n_features) or list of object", "description": "Left argument of the returned kernel k(X, Y)" - } + }, + "refined_type": {} }, { "name": "Y", @@ -88901,7 +94541,8 @@ "docstring": { "type": "array-like of shape (n_samples_Y, n_features) or list of object, default=None", "description": "Right argument of the returned kernel k(X, Y). If None, k(X, X)\nis evaluated instead." - } + }, + "refined_type": {} }, { "name": "eval_gradient", @@ -88911,13 +94552,14 @@ "docstring": { "type": "bool, default=False", "description": "Determines whether the gradient with respect to the log of\nthe kernel hyperparameter is computed." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Return the kernel k(X, Y) and optionally its gradient.", - "docstring": "Return the kernel k(X, Y) and optionally its gradient.\n\nParameters\n----------\nX : array-like of shape (n_samples_X, n_features) or list of object\n Left argument of the returned kernel k(X, Y)\n\nY : array-like of shape (n_samples_Y, n_features) or list of object, default=None\n Right argument of the returned kernel k(X, Y). If None, k(X, X)\n is evaluated instead.\n\neval_gradient : bool, default=False\n Determines whether the gradient with respect to the log of\n the kernel hyperparameter is computed.\n\nReturns\n-------\nK : ndarray of shape (n_samples_X, n_samples_Y)\n Kernel k(X, Y)\n\nK_gradient : ndarray of shape (n_samples_X, n_samples_X, n_dims), optional\n The gradient of the kernel k(X, X) with respect to the log of the\n hyperparameter of the kernel. Only returned when `eval_gradient`\n is True.", + "docstring": "Return the kernel k(X, Y) and optionally its gradient.\n\n Parameters\n ----------\n X : array-like of shape (n_samples_X, n_features) or list of object\n Left argument of the returned kernel k(X, Y)\n\n Y : array-like of shape (n_samples_Y, n_features) or list of object, default=None\n Right argument of the returned kernel k(X, Y). If None, k(X, X)\n is evaluated instead.\n\n eval_gradient : bool, default=False\n Determines whether the gradient with respect to the log of\n the kernel hyperparameter is computed.\n\n Returns\n -------\n K : ndarray of shape (n_samples_X, n_samples_Y)\n Kernel k(X, Y)\n\n K_gradient : ndarray of shape (n_samples_X, n_samples_X, n_dims), optional\n The gradient of the kernel k(X, X) with respect to the log of the\n hyperparameter of the kernel. Only returned when `eval_gradient`\n is True.\n ", "source_code": "\ndef __call__(self, X, Y=None, eval_gradient=False):\n \"\"\"Return the kernel k(X, Y) and optionally its gradient.\n\n Parameters\n ----------\n X : array-like of shape (n_samples_X, n_features) or list of object\n Left argument of the returned kernel k(X, Y)\n\n Y : array-like of shape (n_samples_Y, n_features) or list of object, default=None\n Right argument of the returned kernel k(X, Y). 
If None, k(X, X)\n is evaluated instead.\n\n eval_gradient : bool, default=False\n Determines whether the gradient with respect to the log of\n the kernel hyperparameter is computed.\n\n Returns\n -------\n K : ndarray of shape (n_samples_X, n_samples_Y)\n Kernel k(X, Y)\n\n K_gradient : ndarray of shape (n_samples_X, n_samples_X, n_dims), optional\n The gradient of the kernel k(X, X) with respect to the log of the\n hyperparameter of the kernel. Only returned when `eval_gradient`\n is True.\n \"\"\"\n if eval_gradient:\n (K1, K1_gradient) = self.k1(X, Y, eval_gradient=True)\n (K2, K2_gradient) = self.k2(X, Y, eval_gradient=True)\n return K1 * K2, np.dstack((K1_gradient * K2[:, :, np.newaxis], K2_gradient * K1[:, :, np.newaxis]))\n else:\n return self.k1(X, Y) * self.k2(X, Y)" }, { @@ -88935,13 +94577,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __repr__(self):\n return '{0} * {1}'.format(self.k1, self.k2)" }, { @@ -88959,7 +94602,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -88969,13 +94613,14 @@ "docstring": { "type": "array-like of shape (n_samples_X, n_features) or list of object", "description": "Argument to the kernel." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Returns the diagonal of the kernel k(X, X).\n\nThe result of this method is identical to np.diag(self(X)); however, it can be evaluated more efficiently since only the diagonal is evaluated.", - "docstring": "Returns the diagonal of the kernel k(X, X).\n\nThe result of this method is identical to np.diag(self(X)); however,\nit can be evaluated more efficiently since only the diagonal is\nevaluated.\n\nParameters\n----------\nX : array-like of shape (n_samples_X, n_features) or list of object\n Argument to the kernel.\n\nReturns\n-------\nK_diag : ndarray of shape (n_samples_X,)\n Diagonal of kernel k(X, X)", + "description": "Returns the diagonal of the kernel k(X, X).\n\nThe result of this method is identical to np.diag(self(X)); however,\nit can be evaluated more efficiently since only the diagonal is\nevaluated.", + "docstring": "Returns the diagonal of the kernel k(X, X).\n\n The result of this method is identical to np.diag(self(X)); however,\n it can be evaluated more efficiently since only the diagonal is\n evaluated.\n\n Parameters\n ----------\n X : array-like of shape (n_samples_X, n_features) or list of object\n Argument to the kernel.\n\n Returns\n -------\n K_diag : ndarray of shape (n_samples_X,)\n Diagonal of kernel k(X, X)\n ", "source_code": "\ndef diag(self, X):\n \"\"\"Returns the diagonal of the kernel k(X, X).\n\n The result of this method is identical to np.diag(self(X)); however,\n it can be evaluated more efficiently since only the diagonal is\n evaluated.\n\n Parameters\n ----------\n X : array-like of shape (n_samples_X, n_features) or list of object\n Argument to the kernel.\n\n Returns\n -------\n K_diag : ndarray of shape (n_samples_X,)\n Diagonal of kernel k(X, X)\n \"\"\"\n return self.k1.diag(X) * self.k2.diag(X)" }, { @@ -88993,7 +94638,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -89003,7 +94649,8 @@ "docstring": { "type": "ndarray of shape (n_samples_X, n_features)", "description": "Left argument of the returned kernel k(X, Y)" - } + }, + "refined_type": {} }, { "name": "Y", @@ 
-89013,7 +94660,8 @@ "docstring": { "type": "ndarray of shape (n_samples_Y, n_features), default=None", "description": "Right argument of the returned kernel k(X, Y). If None, k(X, X)\nif evaluated instead." - } + }, + "refined_type": {} }, { "name": "eval_gradient", @@ -89023,13 +94671,14 @@ "docstring": { "type": "bool, default=False", "description": "Determines whether the gradient with respect to the log of\nthe kernel hyperparameter is computed.\nOnly supported when Y is None." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Return the kernel k(X, Y) and optionally its gradient.", - "docstring": "Return the kernel k(X, Y) and optionally its gradient.\n\nParameters\n----------\nX : ndarray of shape (n_samples_X, n_features)\n Left argument of the returned kernel k(X, Y)\n\nY : ndarray of shape (n_samples_Y, n_features), default=None\n Right argument of the returned kernel k(X, Y). If None, k(X, X)\n if evaluated instead.\n\neval_gradient : bool, default=False\n Determines whether the gradient with respect to the log of\n the kernel hyperparameter is computed.\n Only supported when Y is None.\n\nReturns\n-------\nK : ndarray of shape (n_samples_X, n_samples_Y)\n Kernel k(X, Y)\n\nK_gradient : ndarray of shape (n_samples_X, n_samples_X, n_dims), optional\n The gradient of the kernel k(X, X) with respect to the log of the\n hyperparameter of the kernel. Only returned when `eval_gradient`\n is True.", + "docstring": "Return the kernel k(X, Y) and optionally its gradient.\n\n Parameters\n ----------\n X : ndarray of shape (n_samples_X, n_features)\n Left argument of the returned kernel k(X, Y)\n\n Y : ndarray of shape (n_samples_Y, n_features), default=None\n Right argument of the returned kernel k(X, Y). If None, k(X, X)\n if evaluated instead.\n\n eval_gradient : bool, default=False\n Determines whether the gradient with respect to the log of\n the kernel hyperparameter is computed.\n Only supported when Y is None.\n\n Returns\n -------\n K : ndarray of shape (n_samples_X, n_samples_Y)\n Kernel k(X, Y)\n\n K_gradient : ndarray of shape (n_samples_X, n_samples_X, n_dims), optional\n The gradient of the kernel k(X, X) with respect to the log of the\n hyperparameter of the kernel. Only returned when `eval_gradient`\n is True.\n ", "source_code": "\ndef __call__(self, X, Y=None, eval_gradient=False):\n \"\"\"Return the kernel k(X, Y) and optionally its gradient.\n\n Parameters\n ----------\n X : ndarray of shape (n_samples_X, n_features)\n Left argument of the returned kernel k(X, Y)\n\n Y : ndarray of shape (n_samples_Y, n_features), default=None\n Right argument of the returned kernel k(X, Y). If None, k(X, X)\n if evaluated instead.\n\n eval_gradient : bool, default=False\n Determines whether the gradient with respect to the log of\n the kernel hyperparameter is computed.\n Only supported when Y is None.\n\n Returns\n -------\n K : ndarray of shape (n_samples_X, n_samples_Y)\n Kernel k(X, Y)\n\n K_gradient : ndarray of shape (n_samples_X, n_samples_X, n_dims), optional\n The gradient of the kernel k(X, X) with respect to the log of the\n hyperparameter of the kernel. 
Only returned when `eval_gradient`\n is True.\n \"\"\"\n X = np.atleast_2d(X)\n length_scale = _check_length_scale(X, self.length_scale)\n if Y is None:\n dists = pdist(X / length_scale, metric='sqeuclidean')\n K = np.exp(-0.5 * dists)\n K = squareform(K)\n np.fill_diagonal(K, 1)\n else:\n if eval_gradient:\n raise ValueError('Gradient can only be evaluated when Y is None.')\n dists = cdist(X / length_scale, Y / length_scale, metric='sqeuclidean')\n K = np.exp(-0.5 * dists)\n if eval_gradient:\n if self.hyperparameter_length_scale.fixed:\n return K, np.empty((X.shape[0], X.shape[0], 0))\n elif not self.anisotropic or length_scale.shape[0] == 1:\n K_gradient = (K * squareform(dists))[:, :, np.newaxis]\n return K, K_gradient\n elif self.anisotropic:\n K_gradient = (X[:, np.newaxis, :] - X[np.newaxis, :, :])**2 / length_scale**2\n K_gradient *= K[..., np.newaxis]\n return K, K_gradient\n else:\n return K" }, { @@ -89047,7 +94696,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "length_scale", @@ -89057,7 +94707,8 @@ "docstring": { "type": "float or ndarray of shape (n_features,), default=1.0", "description": "The length scale of the kernel. If a float, an isotropic kernel is\nused. If an array, an anisotropic kernel is used where each dimension\nof l defines the length-scale of the respective feature dimension." - } + }, + "refined_type": {} }, { "name": "length_scale_bounds", @@ -89067,13 +94718,14 @@ "docstring": { "type": "pair of floats >= 0 or \"fixed\", default=(1e-5, 1e5)", "description": "The lower and upper bound on 'length_scale'.\nIf set to \"fixed\", 'length_scale' cannot be changed during\nhyperparameter tuning." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, length_scale=1.0, length_scale_bounds=(1e-05, 100000.0)):\n self.length_scale = length_scale\n self.length_scale_bounds = length_scale_bounds" }, { @@ -89091,13 +94743,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __repr__(self):\n if self.anisotropic:\n return '{0}(length_scale=[{1}])'.format(self.__class__.__name__, ', '.join(map('{0:.3g}'.format, self.length_scale)))\n else:\n return '{0}(length_scale={1:.3g})'.format(self.__class__.__name__, np.ravel(self.length_scale)[0])" }, { @@ -89115,13 +94768,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@property\ndef anisotropic(self):\n return np.iterable(self.length_scale) and len(self.length_scale) > 1" }, { @@ -89139,13 +94793,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@property\ndef hyperparameter_length_scale(self):\n if self.anisotropic:\n return Hyperparameter('length_scale', 'numeric', self.length_scale_bounds, len(self.length_scale))\n return Hyperparameter('length_scale', 'numeric', self.length_scale_bounds)" }, { @@ -89163,7 +94818,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -89173,7 +94829,8 @@ "docstring": { "type": "ndarray of shape (n_samples_X, n_features)", "description": "Left argument of the 
returned kernel k(X, Y)" - } + }, + "refined_type": {} }, { "name": "Y", @@ -89183,7 +94840,8 @@ "docstring": { "type": "ndarray of shape (n_samples_Y, n_features), default=None", "description": "Right argument of the returned kernel k(X, Y). If None, k(X, X)\nif evaluated instead." - } + }, + "refined_type": {} }, { "name": "eval_gradient", @@ -89193,13 +94851,14 @@ "docstring": { "type": "bool, default=False", "description": "Determines whether the gradient with respect to the log of\nthe kernel hyperparameter is computed.\nOnly supported when Y is None." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Return the kernel k(X, Y) and optionally its gradient.", - "docstring": "Return the kernel k(X, Y) and optionally its gradient.\n\nParameters\n----------\nX : ndarray of shape (n_samples_X, n_features)\n Left argument of the returned kernel k(X, Y)\n\nY : ndarray of shape (n_samples_Y, n_features), default=None\n Right argument of the returned kernel k(X, Y). If None, k(X, X)\n if evaluated instead.\n\neval_gradient : bool, default=False\n Determines whether the gradient with respect to the log of\n the kernel hyperparameter is computed.\n Only supported when Y is None.\n\nReturns\n-------\nK : ndarray of shape (n_samples_X, n_samples_Y)\n Kernel k(X, Y)\n\nK_gradient : ndarray of shape (n_samples_X, n_samples_X, n_dims)\n The gradient of the kernel k(X, X) with respect to the log of the\n hyperparameter of the kernel. Only returned when eval_gradient\n is True.", + "docstring": "Return the kernel k(X, Y) and optionally its gradient.\n\n Parameters\n ----------\n X : ndarray of shape (n_samples_X, n_features)\n Left argument of the returned kernel k(X, Y)\n\n Y : ndarray of shape (n_samples_Y, n_features), default=None\n Right argument of the returned kernel k(X, Y). If None, k(X, X)\n if evaluated instead.\n\n eval_gradient : bool, default=False\n Determines whether the gradient with respect to the log of\n the kernel hyperparameter is computed.\n Only supported when Y is None.\n\n Returns\n -------\n K : ndarray of shape (n_samples_X, n_samples_Y)\n Kernel k(X, Y)\n\n K_gradient : ndarray of shape (n_samples_X, n_samples_X, n_dims)\n The gradient of the kernel k(X, X) with respect to the log of the\n hyperparameter of the kernel. Only returned when eval_gradient\n is True.\n ", "source_code": "\ndef __call__(self, X, Y=None, eval_gradient=False):\n \"\"\"Return the kernel k(X, Y) and optionally its gradient.\n\n Parameters\n ----------\n X : ndarray of shape (n_samples_X, n_features)\n Left argument of the returned kernel k(X, Y)\n\n Y : ndarray of shape (n_samples_Y, n_features), default=None\n Right argument of the returned kernel k(X, Y). If None, k(X, X)\n if evaluated instead.\n\n eval_gradient : bool, default=False\n Determines whether the gradient with respect to the log of\n the kernel hyperparameter is computed.\n Only supported when Y is None.\n\n Returns\n -------\n K : ndarray of shape (n_samples_X, n_samples_Y)\n Kernel k(X, Y)\n\n K_gradient : ndarray of shape (n_samples_X, n_samples_X, n_dims)\n The gradient of the kernel k(X, X) with respect to the log of the\n hyperparameter of the kernel. 
Only returned when eval_gradient\n is True.\n \"\"\"\n if len(np.atleast_1d(self.length_scale)) > 1:\n raise AttributeError('RationalQuadratic kernel only supports isotropic version, please use a single scalar for length_scale')\n X = np.atleast_2d(X)\n if Y is None:\n dists = squareform(pdist(X, metric='sqeuclidean'))\n tmp = dists / (2 * self.alpha * self.length_scale**2)\n base = 1 + tmp\n K = base**(-self.alpha)\n np.fill_diagonal(K, 1)\n else:\n if eval_gradient:\n raise ValueError('Gradient can only be evaluated when Y is None.')\n dists = cdist(X, Y, metric='sqeuclidean')\n K = (1 + dists / (2 * self.alpha * self.length_scale**2))**(-self.alpha)\n if eval_gradient:\n if not self.hyperparameter_length_scale.fixed:\n length_scale_gradient = dists * K / (self.length_scale**2 * base)\n length_scale_gradient = length_scale_gradient[:, :, np.newaxis]\n else:\n length_scale_gradient = np.empty((K.shape[0], K.shape[1], 0))\n if not self.hyperparameter_alpha.fixed:\n alpha_gradient = K * (-self.alpha * np.log(base) + dists / (2 * self.length_scale**2 * base))\n alpha_gradient = alpha_gradient[:, :, np.newaxis]\n else:\n alpha_gradient = np.empty((K.shape[0], K.shape[1], 0))\n return K, np.dstack((alpha_gradient, length_scale_gradient))\n else:\n return K" }, { @@ -89217,7 +94876,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "length_scale", @@ -89227,7 +94887,8 @@ "docstring": { "type": "float > 0, default=1.0", "description": "The length scale of the kernel." - } + }, + "refined_type": {} }, { "name": "alpha", @@ -89237,7 +94898,8 @@ "docstring": { "type": "float > 0, default=1.0", "description": "Scale mixture parameter" - } + }, + "refined_type": {} }, { "name": "length_scale_bounds", @@ -89247,7 +94909,8 @@ "docstring": { "type": "pair of floats >= 0 or \"fixed\", default=(1e-5, 1e5)", "description": "The lower and upper bound on 'length_scale'.\nIf set to \"fixed\", 'length_scale' cannot be changed during\nhyperparameter tuning." - } + }, + "refined_type": {} }, { "name": "alpha_bounds", @@ -89257,13 +94920,14 @@ "docstring": { "type": "pair of floats >= 0 or \"fixed\", default=(1e-5, 1e5)", "description": "The lower and upper bound on 'alpha'.\nIf set to \"fixed\", 'alpha' cannot be changed during\nhyperparameter tuning." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, length_scale=1.0, alpha=1.0, length_scale_bounds=(1e-05, 100000.0), alpha_bounds=(1e-05, 100000.0)):\n self.length_scale = length_scale\n self.alpha = alpha\n self.length_scale_bounds = length_scale_bounds\n self.alpha_bounds = alpha_bounds" }, { @@ -89281,13 +94945,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __repr__(self):\n return '{0}(alpha={1:.3g}, length_scale={2:.3g})'.format(self.__class__.__name__, self.alpha, self.length_scale)" }, { @@ -89305,13 +94970,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@property\ndef hyperparameter_alpha(self):\n return Hyperparameter('alpha', 'numeric', self.alpha_bounds)" }, { @@ -89329,13 +94995,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@property\ndef hyperparameter_length_scale(self):\n return Hyperparameter('length_scale', 'numeric', self.length_scale_bounds)" }, { @@ -89353,7 +95020,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -89377,7 +95045,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -89387,7 +95056,8 @@ "docstring": { "type": "array-like of shape (n_samples_X, n_features) or list of object", "description": "Left argument of the returned kernel k(X, Y)" - } + }, + "refined_type": {} }, { "name": "Y", @@ -89397,7 +95067,8 @@ "docstring": { "type": "array-like of shape (n_samples_X, n_features) or list of object, default=None", "description": "Right argument of the returned kernel k(X, Y). If None, k(X, X)\nis evaluated instead." - } + }, + "refined_type": {} }, { "name": "eval_gradient", @@ -89407,13 +95078,14 @@ "docstring": { "type": "bool, default=False", "description": "Determines whether the gradient with respect to the log of\nthe kernel hyperparameter is computed." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Return the kernel k(X, Y) and optionally its gradient.", - "docstring": "Return the kernel k(X, Y) and optionally its gradient.\n\nParameters\n----------\nX : array-like of shape (n_samples_X, n_features) or list of object\n Left argument of the returned kernel k(X, Y)\n\nY : array-like of shape (n_samples_X, n_features) or list of object, default=None\n Right argument of the returned kernel k(X, Y). If None, k(X, X)\n is evaluated instead.\n\neval_gradient : bool, default=False\n Determines whether the gradient with respect to the log of\n the kernel hyperparameter is computed.\n\nReturns\n-------\nK : ndarray of shape (n_samples_X, n_samples_Y)\n Kernel k(X, Y)\n\nK_gradient : ndarray of shape (n_samples_X, n_samples_X, n_dims), optional\n The gradient of the kernel k(X, X) with respect to the log of the\n hyperparameter of the kernel. 
Only returned when `eval_gradient`\n is True.", + "docstring": "Return the kernel k(X, Y) and optionally its gradient.\n\n Parameters\n ----------\n X : array-like of shape (n_samples_X, n_features) or list of object\n Left argument of the returned kernel k(X, Y)\n\n Y : array-like of shape (n_samples_X, n_features) or list of object, default=None\n Right argument of the returned kernel k(X, Y). If None, k(X, X)\n is evaluated instead.\n\n eval_gradient : bool, default=False\n Determines whether the gradient with respect to the log of\n the kernel hyperparameter is computed.\n\n Returns\n -------\n K : ndarray of shape (n_samples_X, n_samples_Y)\n Kernel k(X, Y)\n\n K_gradient : ndarray of shape (n_samples_X, n_samples_X, n_dims), optional\n The gradient of the kernel k(X, X) with respect to the log of the\n hyperparameter of the kernel. Only returned when `eval_gradient`\n is True.\n ", "source_code": "\ndef __call__(self, X, Y=None, eval_gradient=False):\n \"\"\"Return the kernel k(X, Y) and optionally its gradient.\n\n Parameters\n ----------\n X : array-like of shape (n_samples_X, n_features) or list of object\n Left argument of the returned kernel k(X, Y)\n\n Y : array-like of shape (n_samples_X, n_features) or list of object, default=None\n Right argument of the returned kernel k(X, Y). If None, k(X, X)\n is evaluated instead.\n\n eval_gradient : bool, default=False\n Determines whether the gradient with respect to the log of\n the kernel hyperparameter is computed.\n\n Returns\n -------\n K : ndarray of shape (n_samples_X, n_samples_Y)\n Kernel k(X, Y)\n\n K_gradient : ndarray of shape (n_samples_X, n_samples_X, n_dims), optional\n The gradient of the kernel k(X, X) with respect to the log of the\n hyperparameter of the kernel. Only returned when `eval_gradient`\n is True.\n \"\"\"\n if eval_gradient:\n (K1, K1_gradient) = self.k1(X, Y, eval_gradient=True)\n (K2, K2_gradient) = self.k2(X, Y, eval_gradient=True)\n return K1 + K2, np.dstack((K1_gradient, K2_gradient))\n else:\n return self.k1(X, Y) + self.k2(X, Y)" }, { @@ -89431,13 +95103,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __repr__(self):\n return '{0} + {1}'.format(self.k1, self.k2)" }, { @@ -89455,7 +95128,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -89465,13 +95139,14 @@ "docstring": { "type": "array-like of shape (n_samples_X, n_features) or list of object", "description": "Argument to the kernel." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Returns the diagonal of the kernel k(X, X).\n\nThe result of this method is identical to `np.diag(self(X))`; however, it can be evaluated more efficiently since only the diagonal is evaluated.", - "docstring": "Returns the diagonal of the kernel k(X, X).\n\nThe result of this method is identical to `np.diag(self(X))`; however,\nit can be evaluated more efficiently since only the diagonal is\nevaluated.\n\nParameters\n----------\nX : array-like of shape (n_samples_X, n_features) or list of object\n Argument to the kernel.\n\nReturns\n-------\nK_diag : ndarray of shape (n_samples_X,)\n Diagonal of kernel k(X, X)", + "description": "Returns the diagonal of the kernel k(X, X).\n\nThe result of this method is identical to `np.diag(self(X))`; however,\nit can be evaluated more efficiently since only the diagonal is\nevaluated.", + "docstring": "Returns the diagonal of the kernel k(X, X).\n\n The result of this method is identical to `np.diag(self(X))`; however,\n it can be evaluated more efficiently since only the diagonal is\n evaluated.\n\n Parameters\n ----------\n X : array-like of shape (n_samples_X, n_features) or list of object\n Argument to the kernel.\n\n Returns\n -------\n K_diag : ndarray of shape (n_samples_X,)\n Diagonal of kernel k(X, X)\n ", "source_code": "\ndef diag(self, X):\n \"\"\"Returns the diagonal of the kernel k(X, X).\n\n The result of this method is identical to `np.diag(self(X))`; however,\n it can be evaluated more efficiently since only the diagonal is\n evaluated.\n\n Parameters\n ----------\n X : array-like of shape (n_samples_X, n_features) or list of object\n Argument to the kernel.\n\n Returns\n -------\n K_diag : ndarray of shape (n_samples_X,)\n Diagonal of kernel k(X, X)\n \"\"\"\n return self.k1.diag(X) + self.k2.diag(X)" }, { @@ -89489,7 +95164,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -89499,7 +95175,8 @@ "docstring": { "type": "array-like of shape (n_samples_X, n_features) or list of object", "description": "Left argument of the returned kernel k(X, Y)" - } + }, + "refined_type": {} }, { "name": "Y", @@ -89509,7 +95186,8 @@ "docstring": { "type": "array-like of shape (n_samples_X, n_features) or list of object, default=None", "description": "Right argument of the returned kernel k(X, Y). If None, k(X, X)\nis evaluated instead." - } + }, + "refined_type": {} }, { "name": "eval_gradient", @@ -89519,13 +95197,14 @@ "docstring": { "type": "bool, default=False", "description": "Determines whether the gradient with respect to the log of\nthe kernel hyperparameter is computed.\nOnly supported when Y is None." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Return the kernel k(X, Y) and optionally its gradient.", - "docstring": "Return the kernel k(X, Y) and optionally its gradient.\n\nParameters\n----------\nX : array-like of shape (n_samples_X, n_features) or list of object\n Left argument of the returned kernel k(X, Y)\n\nY : array-like of shape (n_samples_X, n_features) or list of object, default=None\n Right argument of the returned kernel k(X, Y). 
If None, k(X, X)\n is evaluated instead.\n\neval_gradient : bool, default=False\n Determines whether the gradient with respect to the log of\n the kernel hyperparameter is computed.\n Only supported when Y is None.\n\nReturns\n-------\nK : ndarray of shape (n_samples_X, n_samples_Y)\n Kernel k(X, Y)\n\nK_gradient : ndarray of shape (n_samples_X, n_samples_X, n_dims), optional\n The gradient of the kernel k(X, X) with respect to the log of the\n hyperparameter of the kernel. Only returned when eval_gradient\n is True.", + "docstring": "Return the kernel k(X, Y) and optionally its gradient.\n\n Parameters\n ----------\n X : array-like of shape (n_samples_X, n_features) or list of object\n Left argument of the returned kernel k(X, Y)\n\n Y : array-like of shape (n_samples_X, n_features) or list of object, default=None\n Right argument of the returned kernel k(X, Y). If None, k(X, X)\n is evaluated instead.\n\n eval_gradient : bool, default=False\n Determines whether the gradient with respect to the log of\n the kernel hyperparameter is computed.\n Only supported when Y is None.\n\n Returns\n -------\n K : ndarray of shape (n_samples_X, n_samples_Y)\n Kernel k(X, Y)\n\n K_gradient : ndarray of shape (n_samples_X, n_samples_X, n_dims), optional\n The gradient of the kernel k(X, X) with respect to the log of the\n hyperparameter of the kernel. Only returned when eval_gradient\n is True.\n ", "source_code": "\ndef __call__(self, X, Y=None, eval_gradient=False):\n \"\"\"Return the kernel k(X, Y) and optionally its gradient.\n\n Parameters\n ----------\n X : array-like of shape (n_samples_X, n_features) or list of object\n Left argument of the returned kernel k(X, Y)\n\n Y : array-like of shape (n_samples_X, n_features) or list of object, default=None\n Right argument of the returned kernel k(X, Y). If None, k(X, X)\n is evaluated instead.\n\n eval_gradient : bool, default=False\n Determines whether the gradient with respect to the log of\n the kernel hyperparameter is computed.\n Only supported when Y is None.\n\n Returns\n -------\n K : ndarray of shape (n_samples_X, n_samples_Y)\n Kernel k(X, Y)\n\n K_gradient : ndarray of shape (n_samples_X, n_samples_X, n_dims), optional\n The gradient of the kernel k(X, X) with respect to the log of the\n hyperparameter of the kernel. Only returned when eval_gradient\n is True.\n \"\"\"\n if Y is not None and eval_gradient:\n raise ValueError('Gradient can only be evaluated when Y is None.')\n if Y is None:\n K = self.noise_level * np.eye(_num_samples(X))\n if eval_gradient:\n if not self.hyperparameter_noise_level.fixed:\n return K, self.noise_level * np.eye(_num_samples(X))[:, :, np.newaxis]\n else:\n return K, np.empty((_num_samples(X), _num_samples(X), 0))\n else:\n return K\n else:\n return np.zeros((_num_samples(X), _num_samples(Y)))" }, { @@ -89543,7 +95222,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "noise_level", @@ -89553,7 +95233,8 @@ "docstring": { "type": "float, default=1.0", "description": "Parameter controlling the noise level (variance)" - } + }, + "refined_type": {} }, { "name": "noise_level_bounds", @@ -89563,13 +95244,14 @@ "docstring": { "type": "pair of floats >= 0 or \"fixed\", default=(1e-5, 1e5)", "description": "The lower and upper bound on 'noise_level'.\nIf set to \"fixed\", 'noise_level' cannot be changed during\nhyperparameter tuning." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, noise_level=1.0, noise_level_bounds=(1e-05, 100000.0)):\n self.noise_level = noise_level\n self.noise_level_bounds = noise_level_bounds" }, { @@ -89587,13 +95269,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __repr__(self):\n return '{0}(noise_level={1:.3g})'.format(self.__class__.__name__, self.noise_level)" }, { @@ -89611,7 +95294,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -89621,13 +95305,14 @@ "docstring": { "type": "array-like of shape (n_samples_X, n_features) or list of object", "description": "Argument to the kernel." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Returns the diagonal of the kernel k(X, X).\n\nThe result of this method is identical to np.diag(self(X)); however, it can be evaluated more efficiently since only the diagonal is evaluated.", - "docstring": "Returns the diagonal of the kernel k(X, X).\n\nThe result of this method is identical to np.diag(self(X)); however,\nit can be evaluated more efficiently since only the diagonal is\nevaluated.\n\nParameters\n----------\nX : array-like of shape (n_samples_X, n_features) or list of object\n Argument to the kernel.\n\nReturns\n-------\nK_diag : ndarray of shape (n_samples_X,)\n Diagonal of kernel k(X, X)", + "description": "Returns the diagonal of the kernel k(X, X).\n\nThe result of this method is identical to np.diag(self(X)); however,\nit can be evaluated more efficiently since only the diagonal is\nevaluated.", + "docstring": "Returns the diagonal of the kernel k(X, X).\n\n The result of this method is identical to np.diag(self(X)); however,\n it can be evaluated more efficiently since only the diagonal is\n evaluated.\n\n Parameters\n ----------\n X : array-like of shape (n_samples_X, n_features) or list of object\n Argument to the kernel.\n\n Returns\n -------\n K_diag : ndarray of shape (n_samples_X,)\n Diagonal of kernel k(X, X)\n ", "source_code": "\ndef diag(self, X):\n \"\"\"Returns the diagonal of the kernel k(X, X).\n\n The result of this method is identical to np.diag(self(X)); however,\n it can be evaluated more efficiently since only the diagonal is\n evaluated.\n\n Parameters\n ----------\n X : array-like of shape (n_samples_X, n_features) or list of object\n Argument to the kernel.\n\n Returns\n -------\n K_diag : ndarray of shape (n_samples_X,)\n Diagonal of kernel k(X, X)\n \"\"\"\n return np.full(_num_samples(X), self.noise_level, dtype=np.array(self.noise_level).dtype)" }, { @@ -89645,13 +95330,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@property\ndef hyperparameter_noise_level(self):\n return Hyperparameter('noise_level', 'numeric', self.noise_level_bounds)" }, { @@ -89669,7 +95355,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "f", @@ -89679,7 +95366,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "epsilon", @@ -89689,7 +95377,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "args", @@ -89699,13 
+95388,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _approx_fprime(xk, f, epsilon, args=()):\n f0 = f(*(xk, ) + args)\n grad = np.zeros((f0.shape[0], f0.shape[1], len(xk)), float)\n ei = np.zeros((len(xk), ), float)\n for k in range(len(xk)):\n ei[k] = 1.0\n d = epsilon * ei\n grad[:, :, k] = (f(*(xk + d, ) + args) - f0) / d[k]\n ei[k] = 0.0\n return grad" }, { @@ -89723,7 +95413,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "length_scale", @@ -89733,13 +95424,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _check_length_scale(X, length_scale):\n length_scale = np.squeeze(length_scale).astype(float)\n if np.ndim(length_scale) > 1:\n raise ValueError('length_scale cannot be of dimension greater than 1')\n if np.ndim(length_scale) == 1 and X.shape[1] != length_scale.shape[0]:\n raise ValueError('Anisotropic kernel must have the same number of dimensions as data (%d!=%d)' % (length_scale.shape[0], X.shape[1]))\n return length_scale" }, { @@ -89757,7 +95449,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "missing_values", @@ -89767,7 +95460,8 @@ "docstring": { "type": "int, float, str, np.nan or None, default=np.nan", "description": "The placeholder for the missing values. All occurrences of\n`missing_values` will be imputed. For pandas' dataframes with\nnullable integer dtypes with missing values, `missing_values`\nshould be set to `np.nan`, since `pd.NA` will be converted to `np.nan`." - } + }, + "refined_type": {} }, { "name": "features", @@ -89777,6 +95471,10 @@ "docstring": { "type": "{'missing-only', 'all'}, default='missing-only'", "description": "Whether the imputer mask should represent all or a subset of\nfeatures.\n\n- If `'missing-only'` (default), the imputer mask will only represent\n features containing missing values during fit time.\n- If `'all'`, the imputer mask will represent all features." + }, + "refined_type": { + "kind": "EnumType", + "values": ["missing-only", "all"] } }, { @@ -89787,7 +95485,8 @@ "docstring": { "type": "bool or 'auto', default='auto'", "description": "Whether the imputer mask format should be sparse or dense.\n\n- If `'auto'` (default), the imputer mask will be of same type as\n input.\n- If `True`, the imputer mask will be a sparse matrix.\n- If `False`, the imputer mask will be a numpy array." - } + }, + "refined_type": {} }, { "name": "error_on_new", @@ -89797,13 +95496,14 @@ "docstring": { "type": "bool, default=True", "description": "If `True`, :meth:`transform` will raise an error when there are\nfeatures with missing values that have no missing values in\n:meth:`fit`. This is applicable only when `features='missing-only'`." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, *, missing_values=np.nan, features='missing-only', sparse='auto', error_on_new=True):\n self.missing_values = missing_values\n self.features = features\n self.sparse = sparse\n self.error_on_new = error_on_new" }, { @@ -89821,7 +95521,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -89831,6 +95532,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "Input data, where `n_samples` is the number of samples and\n`n_features` is the number of features.\nIf `precomputed=True`, then `X` is a mask of the input data." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -89841,7 +95546,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "precomputed", @@ -89851,13 +95557,14 @@ "docstring": { "type": "bool", "description": "Whether the input data is a mask." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Fit the transformer on `X`.", - "docstring": "Fit the transformer on `X`.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n Input data, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n If `precomputed=True`, then `X` is a mask of the input data.\n\nprecomputed : bool\n Whether the input data is a mask.\n\nReturns\n-------\nimputer_mask : {ndarray, sparse matrix} of shape (n_samples, n_features)\n The imputer mask of the original data.", + "docstring": "Fit the transformer on `X`.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Input data, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n If `precomputed=True`, then `X` is a mask of the input data.\n\n precomputed : bool\n Whether the input data is a mask.\n\n Returns\n -------\n imputer_mask : {ndarray, sparse matrix} of shape (n_samples, n_features)\n The imputer mask of the original data.\n ", "source_code": "\ndef _fit(self, X, y=None, precomputed=False):\n \"\"\"Fit the transformer on `X`.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Input data, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n If `precomputed=True`, then `X` is a mask of the input data.\n\n precomputed : bool\n Whether the input data is a mask.\n\n Returns\n -------\n imputer_mask : {ndarray, sparse matrix} of shape (n_samples, n_features)\n The imputer mask of the original data.\n \"\"\"\n if precomputed:\n if not (hasattr(X, 'dtype') and X.dtype.kind == 'b'):\n raise ValueError('precomputed is True but the input data is not a mask')\n self._precomputed = True\n else:\n self._precomputed = False\n if not self._precomputed:\n X = self._validate_input(X, in_fit=True)\n self._n_features = X.shape[1]\n if self.features not in ('missing-only', 'all'):\n raise ValueError(\"'features' has to be either 'missing-only' or 'all'. Got {} instead.\".format(self.features))\n if not (isinstance(self.sparse, str) and self.sparse == 'auto' or isinstance(self.sparse, bool)):\n raise ValueError(\"'sparse' has to be a boolean or 'auto'. 
Got {!r} instead.\".format(self.sparse))\n missing_features_info = self._get_missing_features_info(X)\n self.features_ = missing_features_info[1]\n return missing_features_info[0]" }, { @@ -89875,7 +95582,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -89885,13 +95593,17 @@ "docstring": { "type": "{ndarray, sparse matrix} of shape (n_samples, n_features)", "description": "The input data with missing values. Note that `X` has been\nchecked in :meth:`fit` and :meth:`transform` before to call this\nfunction." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } } ], "results": [], "is_public": false, - "description": "Compute the imputer mask and the indices of the features containing missing values.", - "docstring": "Compute the imputer mask and the indices of the features\ncontaining missing values.\n\nParameters\n----------\nX : {ndarray, sparse matrix} of shape (n_samples, n_features)\n The input data with missing values. Note that `X` has been\n checked in :meth:`fit` and :meth:`transform` before to call this\n function.\n\nReturns\n-------\nimputer_mask : {ndarray, sparse matrix} of shape (n_samples, n_features)\n The imputer mask of the original data.\n\nfeatures_with_missing : ndarray of shape (n_features_with_missing)\n The features containing missing values.", + "description": "Compute the imputer mask and the indices of the features\ncontaining missing values.", + "docstring": "Compute the imputer mask and the indices of the features\n containing missing values.\n\n Parameters\n ----------\n X : {ndarray, sparse matrix} of shape (n_samples, n_features)\n The input data with missing values. Note that `X` has been\n checked in :meth:`fit` and :meth:`transform` before to call this\n function.\n\n Returns\n -------\n imputer_mask : {ndarray, sparse matrix} of shape (n_samples, n_features)\n The imputer mask of the original data.\n\n features_with_missing : ndarray of shape (n_features_with_missing)\n The features containing missing values.\n ", "source_code": "\ndef _get_missing_features_info(self, X):\n \"\"\"Compute the imputer mask and the indices of the features\n containing missing values.\n\n Parameters\n ----------\n X : {ndarray, sparse matrix} of shape (n_samples, n_features)\n The input data with missing values. 
Note that `X` has been\n checked in :meth:`fit` and :meth:`transform` before to call this\n function.\n\n Returns\n -------\n imputer_mask : {ndarray, sparse matrix} of shape (n_samples, n_features)\n The imputer mask of the original data.\n\n features_with_missing : ndarray of shape (n_features_with_missing)\n The features containing missing values.\n \"\"\"\n if not self._precomputed:\n imputer_mask = _get_mask(X, self.missing_values)\n else:\n imputer_mask = X\n if sp.issparse(X):\n imputer_mask.eliminate_zeros()\n if self.features == 'missing-only':\n n_missing = imputer_mask.getnnz(axis=0)\n if self.sparse is False:\n imputer_mask = imputer_mask.toarray()\n elif imputer_mask.format == 'csr':\n imputer_mask = imputer_mask.tocsc()\n else:\n if not self._precomputed:\n imputer_mask = _get_mask(X, self.missing_values)\n else:\n imputer_mask = X\n if self.features == 'missing-only':\n n_missing = imputer_mask.sum(axis=0)\n if self.sparse is True:\n imputer_mask = sp.csc_matrix(imputer_mask)\n if self.features == 'all':\n features_indices = np.arange(X.shape[1])\n else:\n features_indices = np.flatnonzero(n_missing)\n return imputer_mask, features_indices" }, { @@ -89909,13 +95621,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _more_tags(self):\n return {'allow_nan': True, 'X_types': ['2darray', 'string'], 'preserves_dtype': []}" }, { @@ -89933,7 +95646,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -89943,7 +95657,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "in_fit", @@ -89953,13 +95668,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _validate_input(self, X, in_fit):\n if not is_scalar_nan(self.missing_values):\n force_all_finite = True\n else:\n force_all_finite = 'allow-nan'\n X = self._validate_data(X, reset=in_fit, accept_sparse=('csc', 'csr'), dtype=None, force_all_finite=force_all_finite)\n _check_inputs_dtype(X, self.missing_values)\n if X.dtype.kind not in ('i', 'u', 'f', 'O'):\n raise ValueError('MissingIndicator does not support data with dtype {0}. Please provide either a numeric array (with a floating point or integer dtype) or categorical data represented either as an array with integer dtype or an array of string values with an object dtype.'.format(X.dtype))\n if sp.issparse(X) and self.missing_values == 0:\n raise ValueError('Sparse input with missing_values=0 is not supported. Provide a dense array instead.')\n return X" }, { @@ -89977,7 +95693,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -89987,6 +95704,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "Input data, where `n_samples` is the number of samples and\n`n_features` is the number of features." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -89997,13 +95718,14 @@ "docstring": { "type": "Ignored", "description": "Not used, present for API consistency by convention." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Fit the transformer on `X`.", - "docstring": "Fit the transformer on `X`.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n Input data, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\ny : Ignored\n Not used, present for API consistency by convention.\n\nReturns\n-------\nself : object\n Fitted estimator.", + "docstring": "Fit the transformer on `X`.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Input data, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n Returns\n -------\n self : object\n Fitted estimator.\n ", "source_code": "\ndef fit(self, X, y=None):\n \"\"\"Fit the transformer on `X`.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Input data, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n Returns\n -------\n self : object\n Fitted estimator.\n \"\"\"\n self._fit(X, y)\n return self" }, { @@ -90021,7 +95743,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -90031,6 +95754,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "The input data to complete." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -90041,13 +95768,14 @@ "docstring": { "type": "Ignored", "description": "Not used, present for API consistency by convention." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Generate missing values indicator for `X`.", - "docstring": "Generate missing values indicator for `X`.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input data to complete.\n\ny : Ignored\n Not used, present for API consistency by convention.\n\nReturns\n-------\nXt : {ndarray, sparse matrix} of shape (n_samples, n_features) or (n_samples, n_features_with_missing)\n The missing indicator for input data. The data type of `Xt`\n will be boolean.", + "docstring": "Generate missing values indicator for `X`.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input data to complete.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n Returns\n -------\n Xt : {ndarray, sparse matrix} of shape (n_samples, n_features) or (n_samples, n_features_with_missing)\n The missing indicator for input data. The data type of `Xt`\n will be boolean.\n ", "source_code": "\ndef fit_transform(self, X, y=None):\n \"\"\"Generate missing values indicator for `X`.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input data to complete.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n Returns\n -------\n Xt : {ndarray, sparse matrix} of shape (n_samples, n_features) or (n_samples, n_features_with_missing)\n The missing indicator for input data. 
The data type of `Xt`\n will be boolean.\n \"\"\"\n imputer_mask = self._fit(X, y)\n if self.features_.size < self._n_features:\n imputer_mask = imputer_mask[:, self.features_]\n return imputer_mask" }, { @@ -90065,7 +95793,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -90075,13 +95804,17 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "The input data to complete." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } } ], "results": [], "is_public": true, "description": "Generate missing values indicator for `X`.", - "docstring": "Generate missing values indicator for `X`.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input data to complete.\n\nReturns\n-------\nXt : {ndarray, sparse matrix} of shape (n_samples, n_features) or (n_samples, n_features_with_missing)\n The missing indicator for input data. The data type of `Xt`\n will be boolean.", + "docstring": "Generate missing values indicator for `X`.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input data to complete.\n\n Returns\n -------\n Xt : {ndarray, sparse matrix} of shape (n_samples, n_features) or (n_samples, n_features_with_missing)\n The missing indicator for input data. The data type of `Xt`\n will be boolean.\n ", "source_code": "\ndef transform(self, X):\n \"\"\"Generate missing values indicator for `X`.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input data to complete.\n\n Returns\n -------\n Xt : {ndarray, sparse matrix} of shape (n_samples, n_features) or (n_samples, n_features_with_missing)\n The missing indicator for input data. The data type of `Xt`\n will be boolean.\n \"\"\"\n check_is_fitted(self)\n if not self._precomputed:\n X = self._validate_input(X, in_fit=False)\n elif not (hasattr(X, 'dtype') and X.dtype.kind == 'b'):\n raise ValueError('precomputed is True but the input data is not a mask')\n (imputer_mask, features) = self._get_missing_features_info(X)\n if self.features == 'missing-only':\n features_diff_fit_trans = np.setdiff1d(features, self.features_)\n if self.error_on_new and features_diff_fit_trans.size > 0:\n raise ValueError('The features {} have missing values in transform but have no missing values in fit.'.format(features_diff_fit_trans))\n if self.features_.size < self._n_features:\n imputer_mask = imputer_mask[:, self.features_]\n return imputer_mask" }, { @@ -90099,7 +95832,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "missing_values", @@ -90109,7 +95843,8 @@ "docstring": { "type": "int, float, str, np.nan or None, default=np.nan", "description": "The placeholder for the missing values. All occurrences of\n`missing_values` will be imputed. For pandas' dataframes with\nnullable integer dtypes with missing values, `missing_values`\nshould be set to `np.nan`, since `pd.NA` will be converted to `np.nan`." - } + }, + "refined_type": {} }, { "name": "strategy", @@ -90119,7 +95854,8 @@ "docstring": { "type": "str, default='mean'", "description": "The imputation strategy.\n\n- If \"mean\", then replace missing values using the mean along\n each column. Can only be used with numeric data.\n- If \"median\", then replace missing values using the median along\n each column. 
Can only be used with numeric data.\n- If \"most_frequent\", then replace missing using the most frequent\n value along each column. Can be used with strings or numeric data.\n If there is more than one such value, only the smallest is returned.\n- If \"constant\", then replace missing values with fill_value. Can be\n used with strings or numeric data.\n\n.. versionadded:: 0.20\n strategy=\"constant\" for fixed value imputation." - } + }, + "refined_type": {} }, { "name": "fill_value", @@ -90129,7 +95865,8 @@ "docstring": { "type": "str or numerical value, default=None", "description": "When strategy == \"constant\", fill_value is used to replace all\noccurrences of missing_values.\nIf left to the default, fill_value will be 0 when imputing numerical\ndata and \"missing_value\" for strings or object data types." - } + }, + "refined_type": {} }, { "name": "verbose", @@ -90139,7 +95876,8 @@ "docstring": { "type": "int, default=0", "description": "Controls the verbosity of the imputer." - } + }, + "refined_type": {} }, { "name": "copy", @@ -90149,7 +95887,8 @@ "docstring": { "type": "bool, default=True", "description": "If True, a copy of `X` will be created. If False, imputation will\nbe done in-place whenever possible. Note that, in the following cases,\na new copy will always be made, even if `copy=False`:\n\n- If `X` is not an array of floating values;\n- If `X` is encoded as a CSR matrix;\n- If `add_indicator=True`." - } + }, + "refined_type": {} }, { "name": "add_indicator", @@ -90159,13 +95898,14 @@ "docstring": { "type": "bool, default=False", "description": "If True, a :class:`MissingIndicator` transform will stack onto output\nof the imputer's transform. This allows a predictive estimator\nto account for missingness despite imputation. If a feature has no\nmissing values at fit/train time, the feature won't appear on\nthe missing indicator even if there are missing values at\ntransform/test time." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, *, missing_values=np.nan, strategy='mean', fill_value=None, verbose=0, copy=True, add_indicator=False):\n super().__init__(missing_values=missing_values, add_indicator=add_indicator)\n self.strategy = strategy\n self.fill_value = fill_value\n self.verbose = verbose\n self.copy = copy" }, { @@ -90183,7 +95923,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -90193,7 +95934,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "strategy", @@ -90203,7 +95945,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "missing_values", @@ -90213,7 +95956,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "fill_value", @@ -90223,7 +95967,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -90247,7 +95992,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -90257,7 +96003,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "strategy", @@ -90267,7 +96014,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "missing_values", @@ -90277,7 +96025,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "fill_value", @@ -90287,7 +96036,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -90311,7 +96061,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -90321,7 +96072,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "in_fit", @@ -90331,13 +96083,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _validate_input(self, X, in_fit):\n allowed_strategies = ['mean', 'median', 'most_frequent', 'constant']\n if self.strategy not in allowed_strategies:\n raise ValueError('Can only use these strategies: {0} got strategy={1}'.format(allowed_strategies, self.strategy))\n if self.strategy in ('most_frequent', 'constant'):\n if isinstance(X, list) and any((isinstance(elem, str) for row in X for elem in row)):\n dtype = object\n else:\n dtype = None\n else:\n dtype = FLOAT_DTYPES\n if not is_scalar_nan(self.missing_values):\n force_all_finite = True\n else:\n force_all_finite = 'allow-nan'\n try:\n X = self._validate_data(X, reset=in_fit, accept_sparse='csc', dtype=dtype, force_all_finite=force_all_finite, copy=self.copy)\n except ValueError as ve:\n if 'could not convert' in str(ve):\n new_ve = ValueError('Cannot use {} strategy with non-numeric data:\\n{}'.format(self.strategy, ve))\n raise new_ve from None\n else:\n raise ve\n _check_inputs_dtype(X, self.missing_values)\n if X.dtype.kind not in ('i', 'u', 'f', 'O'):\n raise ValueError('SimpleImputer does not support data with dtype {0}. 
Please provide either a numeric array (with a floating point or integer dtype) or categorical data represented either as an array with integer dtype or an array of string values with an object dtype.'.format(X.dtype))\n return X" }, { @@ -90355,7 +96108,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -90365,6 +96119,10 @@ "docstring": { "type": "{array-like, sparse matrix}, shape (n_samples, n_features)", "description": "Input data, where `n_samples` is the number of samples and\n`n_features` is the number of features." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -90375,13 +96133,14 @@ "docstring": { "type": "Ignored", "description": "Not used, present here for API consistency by convention." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Fit the imputer on `X`.", - "docstring": "Fit the imputer on `X`.\n\nParameters\n----------\nX : {array-like, sparse matrix}, shape (n_samples, n_features)\n Input data, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\ny : Ignored\n Not used, present here for API consistency by convention.\n\nReturns\n-------\nself : object\n Fitted estimator.", + "docstring": "Fit the imputer on `X`.\n\n Parameters\n ----------\n X : {array-like, sparse matrix}, shape (n_samples, n_features)\n Input data, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\n y : Ignored\n Not used, present here for API consistency by convention.\n\n Returns\n -------\n self : object\n Fitted estimator.\n ", "source_code": "\ndef fit(self, X, y=None):\n \"\"\"Fit the imputer on `X`.\n\n Parameters\n ----------\n X : {array-like, sparse matrix}, shape (n_samples, n_features)\n Input data, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\n y : Ignored\n Not used, present here for API consistency by convention.\n\n Returns\n -------\n self : object\n Fitted estimator.\n \"\"\"\n X = self._validate_input(X, in_fit=True)\n if self.fill_value is None:\n if X.dtype.kind in ('i', 'u', 'f'):\n fill_value = 0\n else:\n fill_value = 'missing_value'\n else:\n fill_value = self.fill_value\n if self.strategy == 'constant' and X.dtype.kind in ('i', 'u', 'f') and not isinstance(fill_value, numbers.Real):\n raise ValueError(\"'fill_value'={0} is invalid. Expected a numerical value when imputing numerical data\".format(fill_value))\n if sp.issparse(X):\n if self.missing_values == 0:\n raise ValueError('Imputation not possible when missing_values == 0 and input is sparse. Provide a dense array instead.')\n else:\n self.statistics_ = self._sparse_fit(X, self.strategy, self.missing_values, fill_value)\n else:\n self.statistics_ = self._dense_fit(X, self.strategy, self.missing_values, fill_value)\n return self" }, { @@ -90399,7 +96158,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -90409,13 +96169,14 @@ "docstring": { "type": "array-like of shape (n_samples, n_features + n_features_missing_indicator)", "description": "The imputed data to be reverted to original data. It has to be\nan augmented array of imputed data and the missing indicator mask." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Convert the data back to the original representation.\n\nInverts the `transform` operation performed on an array. 
This operation can only be performed after :class:`SimpleImputer` is instantiated with `add_indicator=True`. Note that `inverse_transform` can only invert the transform in features that have binary indicators for missing values. If a feature has no missing values at `fit` time, the feature won't have a binary indicator, and the imputation done at `transform` time won't be inverted. .. versionadded:: 0.24", - "docstring": "Convert the data back to the original representation.\n\nInverts the `transform` operation performed on an array.\nThis operation can only be performed after :class:`SimpleImputer` is\ninstantiated with `add_indicator=True`.\n\nNote that `inverse_transform` can only invert the transform in\nfeatures that have binary indicators for missing values. If a feature\nhas no missing values at `fit` time, the feature won't have a binary\nindicator, and the imputation done at `transform` time won't be\ninverted.\n\n.. versionadded:: 0.24\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features + n_features_missing_indicator)\n The imputed data to be reverted to original data. It has to be\n an augmented array of imputed data and the missing indicator mask.\n\nReturns\n-------\nX_original : ndarray of shape (n_samples, n_features)\n The original `X` with missing values as it was prior\n to imputation.", + "description": "Convert the data back to the original representation.\n\nInverts the `transform` operation performed on an array.\nThis operation can only be performed after :class:`SimpleImputer` is\ninstantiated with `add_indicator=True`.\n\nNote that `inverse_transform` can only invert the transform in\nfeatures that have binary indicators for missing values. If a feature\nhas no missing values at `fit` time, the feature won't have a binary\nindicator, and the imputation done at `transform` time won't be\ninverted.\n\n.. versionadded:: 0.24", + "docstring": "Convert the data back to the original representation.\n\n Inverts the `transform` operation performed on an array.\n This operation can only be performed after :class:`SimpleImputer` is\n instantiated with `add_indicator=True`.\n\n Note that `inverse_transform` can only invert the transform in\n features that have binary indicators for missing values. If a feature\n has no missing values at `fit` time, the feature won't have a binary\n indicator, and the imputation done at `transform` time won't be\n inverted.\n\n .. versionadded:: 0.24\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features + n_features_missing_indicator)\n The imputed data to be reverted to original data. It has to be\n an augmented array of imputed data and the missing indicator mask.\n\n Returns\n -------\n X_original : ndarray of shape (n_samples, n_features)\n The original `X` with missing values as it was prior\n to imputation.\n ", "source_code": "\ndef inverse_transform(self, X):\n \"\"\"Convert the data back to the original representation.\n\n Inverts the `transform` operation performed on an array.\n This operation can only be performed after :class:`SimpleImputer` is\n instantiated with `add_indicator=True`.\n\n Note that `inverse_transform` can only invert the transform in\n features that have binary indicators for missing values. If a feature\n has no missing values at `fit` time, the feature won't have a binary\n indicator, and the imputation done at `transform` time won't be\n inverted.\n\n .. 
versionadded:: 0.24\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features + n_features_missing_indicator)\n The imputed data to be reverted to original data. It has to be\n an augmented array of imputed data and the missing indicator mask.\n\n Returns\n -------\n X_original : ndarray of shape (n_samples, n_features)\n The original `X` with missing values as it was prior\n to imputation.\n \"\"\"\n check_is_fitted(self)\n if not self.add_indicator:\n raise ValueError(f\"'inverse_transform' works only when 'SimpleImputer' is instantiated with 'add_indicator=True'. Got 'add_indicator={self.add_indicator}' instead.\")\n n_features_missing = len(self.indicator_.features_)\n non_empty_feature_count = X.shape[1] - n_features_missing\n array_imputed = X[:, :non_empty_feature_count].copy()\n missing_mask = X[:, non_empty_feature_count:].astype(bool)\n n_features_original = len(self.statistics_)\n shape_original = (X.shape[0], n_features_original)\n X_original = np.zeros(shape_original)\n X_original[:, self.indicator_.features_] = missing_mask\n full_mask = X_original.astype(bool)\n (imputed_idx, original_idx) = (0, 0)\n while imputed_idx < len(array_imputed.T):\n if not np.all(X_original[:, original_idx]):\n X_original[:, original_idx] = array_imputed.T[imputed_idx]\n imputed_idx += 1\n original_idx += 1\n else:\n original_idx += 1\n X_original[full_mask] = self.missing_values\n return X_original" }, { @@ -90433,7 +96194,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -90443,13 +96205,17 @@ "docstring": { "type": "{array-like, sparse matrix}, shape (n_samples, n_features)", "description": "The input data to complete." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } } ], "results": [], "is_public": true, "description": "Impute all missing values in `X`.", - "docstring": "Impute all missing values in `X`.\n\nParameters\n----------\nX : {array-like, sparse matrix}, shape (n_samples, n_features)\n The input data to complete.\n\nReturns\n-------\nX_imputed : {ndarray, sparse matrix} of shape (n_samples, n_features_out)\n `X` with imputed values.", + "docstring": "Impute all missing values in `X`.\n\n Parameters\n ----------\n X : {array-like, sparse matrix}, shape (n_samples, n_features)\n The input data to complete.\n\n Returns\n -------\n X_imputed : {ndarray, sparse matrix} of shape (n_samples, n_features_out)\n `X` with imputed values.\n ", "source_code": "\ndef transform(self, X):\n \"\"\"Impute all missing values in `X`.\n\n Parameters\n ----------\n X : {array-like, sparse matrix}, shape (n_samples, n_features)\n The input data to complete.\n\n Returns\n -------\n X_imputed : {ndarray, sparse matrix} of shape (n_samples, n_features_out)\n `X` with imputed values.\n \"\"\"\n check_is_fitted(self)\n X = self._validate_input(X, in_fit=False)\n statistics = self.statistics_\n if X.shape[1] != statistics.shape[0]:\n raise ValueError('X has %d features per sample, expected %d' % (X.shape[1], self.statistics_.shape[0]))\n missing_mask = _get_mask(X, self.missing_values)\n if self.strategy == 'constant':\n valid_statistics = statistics\n valid_statistics_indexes = None\n else:\n invalid_mask = _get_mask(statistics, np.nan)\n valid_mask = np.logical_not(invalid_mask)\n valid_statistics = statistics[valid_mask]\n valid_statistics_indexes = np.flatnonzero(valid_mask)\n if invalid_mask.any():\n missing = np.arange(X.shape[1])[invalid_mask]\n if self.verbose:\n warnings.warn('Deleting features without observed values: 
%s' % missing)\n X = X[:, valid_statistics_indexes]\n if sp.issparse(X):\n if self.missing_values == 0:\n raise ValueError('Imputation not possible when missing_values == 0 and input is sparse. Provide a dense array instead.')\n else:\n if valid_statistics_indexes is None:\n mask = missing_mask.data\n else:\n mask = _get_mask(X.data, self.missing_values)\n indexes = np.repeat(np.arange(len(X.indptr) - 1, dtype=int), np.diff(X.indptr))[mask]\n X.data[mask] = valid_statistics[indexes].astype(X.dtype, copy=False)\n else:\n if valid_statistics_indexes is None:\n mask_valid_features = missing_mask\n else:\n mask_valid_features = missing_mask[:, valid_statistics_indexes]\n n_missing = np.sum(mask_valid_features, axis=0)\n values = np.repeat(valid_statistics, n_missing)\n coordinates = np.where(mask_valid_features.transpose())[::-1]\n X[coordinates] = values\n X_indicator = super()._transform_indicator(missing_mask)\n return super()._concatenate_indicator(X, X_indicator)" }, { @@ -90467,7 +96233,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "missing_values", @@ -90477,7 +96244,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "add_indicator", @@ -90487,13 +96255,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, *, missing_values=np.nan, add_indicator=False):\n self.missing_values = missing_values\n self.add_indicator = add_indicator" }, { @@ -90511,7 +96280,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X_imputed", @@ -90521,7 +96291,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X_indicator", @@ -90531,7 +96302,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -90555,7 +96327,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -90565,7 +96338,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -90589,13 +96363,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _more_tags(self):\n return {'allow_nan': is_scalar_nan(self.missing_values)}" }, { @@ -90613,7 +96388,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -90623,13 +96399,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Compute the indicator mask.'\n\nNote that X must be the original data as passed to the imputer before any imputation, since imputation may be done inplace in some cases.", - "docstring": "Compute the indicator mask.'\n\nNote that X must be the original data as passed to the imputer before\nany imputation, since imputation may be done inplace in some cases.", + "description": "Compute the indicator mask.'\n\nNote that X must be the original data as passed to the imputer before\nany imputation, since imputation may be done inplace in some cases.", + "docstring": "Compute the indicator mask.'\n\n Note that X must be the original data as passed to the imputer before\n any imputation, since imputation may be done inplace in some cases.\n ", 
"source_code": "\ndef _transform_indicator(self, X):\n \"\"\"Compute the indicator mask.'\n\n Note that X must be the original data as passed to the imputer before\n any imputation, since imputation may be done inplace in some cases.\n \"\"\"\n if self.add_indicator:\n if not hasattr(self, 'indicator_'):\n raise ValueError('Make sure to call _fit_indicator before _transform_indicator')\n return self.indicator_.transform(X)" }, { @@ -90647,7 +96424,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "missing_values", @@ -90657,13 +96435,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _check_inputs_dtype(X, missing_values):\n if X.dtype.kind in ('f', 'i', 'u') and not isinstance(missing_values, numbers.Real):\n raise ValueError(\"'X' and 'missing_values' types are expected to be both numerical. Got X.dtype={} and type(missing_values)={}.\".format(X.dtype, type(missing_values)))" }, { @@ -90681,7 +96460,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "extra_value", @@ -90691,7 +96471,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_repeat", @@ -90701,13 +96482,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Compute the most frequent value in a 1d array extended with [extra_value] * n_repeat, where extra_value is assumed to be not part of the array.", - "docstring": "Compute the most frequent value in a 1d array extended with\n[extra_value] * n_repeat, where extra_value is assumed to be not part\nof the array.", + "description": "Compute the most frequent value in a 1d array extended with\n[extra_value] * n_repeat, where extra_value is assumed to be not part\nof the array.", + "docstring": "Compute the most frequent value in a 1d array extended with\n [extra_value] * n_repeat, where extra_value is assumed to be not part\n of the array.", "source_code": "\ndef _most_frequent(array, extra_value, n_repeat):\n \"\"\"Compute the most frequent value in a 1d array extended with\n [extra_value] * n_repeat, where extra_value is assumed to be not part\n of the array.\"\"\"\n if array.size > 0:\n if array.dtype == object:\n counter = Counter(array)\n most_frequent_count = counter.most_common(1)[0][1]\n most_frequent_value = min((value for (value, count) in counter.items() if count == most_frequent_count))\n else:\n mode = stats.mode(array)\n most_frequent_value = mode[0][0]\n most_frequent_count = mode[1][0]\n else:\n most_frequent_value = 0\n most_frequent_count = 0\n if most_frequent_count == 0 and n_repeat == 0:\n return np.nan\n elif most_frequent_count < n_repeat:\n return extra_value\n elif most_frequent_count > n_repeat:\n return most_frequent_value\n elif most_frequent_count == n_repeat:\n return min(most_frequent_value, extra_value)" }, { @@ -90725,7 +96507,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "estimator", @@ -90735,7 +96518,8 @@ "docstring": { "type": "estimator object, default=BayesianRidge()", "description": "The estimator to use at each step of the round-robin imputation.\nIf `sample_posterior=True`, the estimator must support\n`return_std` in its `predict` method." 
- } + }, + "refined_type": {} }, { "name": "missing_values", @@ -90745,7 +96529,8 @@ "docstring": { "type": "int or np.nan, default=np.nan", "description": "The placeholder for the missing values. All occurrences of\n`missing_values` will be imputed. For pandas' dataframes with\nnullable integer dtypes with missing values, `missing_values`\nshould be set to `np.nan`, since `pd.NA` will be converted to `np.nan`." - } + }, + "refined_type": {} }, { "name": "sample_posterior", @@ -90755,7 +96540,8 @@ "docstring": { "type": "bool, default=False", "description": "Whether to sample from the (Gaussian) predictive posterior of the\nfitted estimator for each imputation. Estimator must support\n`return_std` in its `predict` method if set to `True`. Set to\n`True` if using `IterativeImputer` for multiple imputations." - } + }, + "refined_type": {} }, { "name": "max_iter", @@ -90765,6 +96551,10 @@ "docstring": { "type": "int, default=10", "description": "Maximum number of imputation rounds to perform before returning the\nimputations computed during the final round. A round is a single\nimputation of each feature with missing values. The stopping criterion\nis met once `max(abs(X_t - X_{t-1}))/max(abs(X[known_vals])) < tol`,\nwhere `X_t` is `X` at iteration `t`. Note that early stopping is only\napplied if `sample_posterior=False`." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -90775,7 +96565,8 @@ "docstring": { "type": "float, default=1e-3", "description": "Tolerance of the stopping condition." - } + }, + "refined_type": {} }, { "name": "n_nearest_features", @@ -90785,7 +96576,8 @@ "docstring": { "type": "int, default=None", "description": "Number of other features to use to estimate the missing values of\neach feature column. Nearness between features is measured using\nthe absolute correlation coefficient between each feature pair (after\ninitial imputation). To ensure coverage of features throughout the\nimputation process, the neighbor features are not necessarily nearest,\nbut are drawn with probability proportional to correlation for each\nimputed target feature. Can provide significant speed-up when the\nnumber of features is huge. If `None`, all features will be used." - } + }, + "refined_type": {} }, { "name": "initial_strategy", @@ -90795,6 +96587,15 @@ "docstring": { "type": "{'mean', 'median', 'most_frequent', 'constant'}, default='mean'", "description": "Which strategy to use to initialize the missing values. Same as the\n`strategy` parameter in :class:`~sklearn.impute.SimpleImputer`." + }, + "refined_type": { + "kind": "EnumType", + "values": [ + "most_frequent", + "median", + "constant", + "mean" + ] } }, { @@ -90805,6 +96606,16 @@ "docstring": { "type": "{'ascending', 'descending', 'roman', 'arabic', 'random'}, default='ascending'", "description": "The order in which the features will be imputed. Possible values:\n\n- `'ascending'`: From features with fewest missing values to most.\n- `'descending'`: From features with most missing values to fewest.\n- `'roman'`: Left to right.\n- `'arabic'`: Right to left.\n- `'random'`: A random order for each round." + }, + "refined_type": { + "kind": "EnumType", + "values": [ + "random", + "ascending", + "arabic", + "roman", + "descending" + ] } }, { @@ -90815,7 +96626,8 @@ "docstring": { "type": "bool, default=False", "description": "If `True` then features with missing values during :meth:`transform`\nwhich did not have any missing values during :meth:`fit` will be\nimputed with the initial imputation method only. 
Set to `True` if you\nhave many features with no missing values at both :meth:`fit` and\n:meth:`transform` time to save compute." - } + }, + "refined_type": {} }, { "name": "min_value", @@ -90825,7 +96637,8 @@ "docstring": { "type": "float or array-like of shape (n_features,), default=-np.inf", "description": "Minimum possible imputed value. Broadcast to shape `(n_features,)` if\nscalar. If array-like, expects shape `(n_features,)`, one min value for\neach feature. The default is `-np.inf`.\n\n.. versionchanged:: 0.23\n Added support for array-like." - } + }, + "refined_type": {} }, { "name": "max_value", @@ -90835,7 +96648,8 @@ "docstring": { "type": "float or array-like of shape (n_features,), default=np.inf", "description": "Maximum possible imputed value. Broadcast to shape `(n_features,)` if\nscalar. If array-like, expects shape `(n_features,)`, one max value for\neach feature. The default is `np.inf`.\n\n.. versionchanged:: 0.23\n Added support for array-like." - } + }, + "refined_type": {} }, { "name": "verbose", @@ -90845,7 +96659,8 @@ "docstring": { "type": "int, default=0", "description": "Verbosity flag, controls the debug messages that are issued\nas functions are evaluated. The higher, the more verbose. Can be 0, 1,\nor 2." - } + }, + "refined_type": {} }, { "name": "random_state", @@ -90855,7 +96670,8 @@ "docstring": { "type": "int, RandomState instance or None, default=None", "description": "The seed of the pseudo random number generator to use. Randomizes\nselection of estimator features if `n_nearest_features` is not `None`,\nthe `imputation_order` if `random`, and the sampling from posterior if\n`sample_posterior=True`. Use an integer for determinism.\nSee :term:`the Glossary `." - } + }, + "refined_type": {} }, { "name": "add_indicator", @@ -90865,13 +96681,14 @@ "docstring": { "type": "bool, default=False", "description": "If `True`, a :class:`MissingIndicator` transform will stack onto output\nof the imputer's transform. This allows a predictive estimator\nto account for missingness despite imputation. If a feature has no\nmissing values at fit/train time, the feature won't appear on\nthe missing indicator even if there are missing values at\ntransform/test time." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, estimator=None, *, missing_values=np.nan, sample_posterior=False, max_iter=10, tol=0.001, n_nearest_features=None, initial_strategy='mean', imputation_order='ascending', skip_complete=False, min_value=-np.inf, max_value=np.inf, verbose=0, random_state=None, add_indicator=False):\n super().__init__(missing_values=missing_values, add_indicator=add_indicator)\n self.estimator = estimator\n self.sample_posterior = sample_posterior\n self.max_iter = max_iter\n self.tol = tol\n self.n_nearest_features = n_nearest_features\n self.initial_strategy = initial_strategy\n self.imputation_order = imputation_order\n self.skip_complete = skip_complete\n self.min_value = min_value\n self.max_value = max_value\n self.verbose = verbose\n self.random_state = random_state" }, { @@ -90889,7 +96706,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X_filled", @@ -90899,7 +96717,8 @@ "docstring": { "type": "ndarray, shape (n_samples, n_features)", "description": "Input data with the most recent imputations." 
- } + }, + "refined_type": {} }, { "name": "tolerance", @@ -90909,13 +96728,14 @@ "docstring": { "type": "float, default=1e-6", "description": "`abs_corr_mat` can have nans, which will be replaced\nwith `tolerance`." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Get absolute correlation matrix between features.", - "docstring": "Get absolute correlation matrix between features.\n\nParameters\n----------\nX_filled : ndarray, shape (n_samples, n_features)\n Input data with the most recent imputations.\n\ntolerance : float, default=1e-6\n `abs_corr_mat` can have nans, which will be replaced\n with `tolerance`.\n\nReturns\n-------\nabs_corr_mat : ndarray, shape (n_features, n_features)\n Absolute correlation matrix of `X` at the beginning of the\n current round. The diagonal has been zeroed out and each feature's\n absolute correlations with all others have been normalized to sum\n to 1.", + "docstring": "Get absolute correlation matrix between features.\n\n Parameters\n ----------\n X_filled : ndarray, shape (n_samples, n_features)\n Input data with the most recent imputations.\n\n tolerance : float, default=1e-6\n `abs_corr_mat` can have nans, which will be replaced\n with `tolerance`.\n\n Returns\n -------\n abs_corr_mat : ndarray, shape (n_features, n_features)\n Absolute correlation matrix of `X` at the beginning of the\n current round. The diagonal has been zeroed out and each feature's\n absolute correlations with all others have been normalized to sum\n to 1.\n ", "source_code": "\ndef _get_abs_corr_mat(self, X_filled, tolerance=1e-06):\n \"\"\"Get absolute correlation matrix between features.\n\n Parameters\n ----------\n X_filled : ndarray, shape (n_samples, n_features)\n Input data with the most recent imputations.\n\n tolerance : float, default=1e-6\n `abs_corr_mat` can have nans, which will be replaced\n with `tolerance`.\n\n Returns\n -------\n abs_corr_mat : ndarray, shape (n_features, n_features)\n Absolute correlation matrix of `X` at the beginning of the\n current round. The diagonal has been zeroed out and each feature's\n absolute correlations with all others have been normalized to sum\n to 1.\n \"\"\"\n n_features = X_filled.shape[1]\n if self.n_nearest_features is None or self.n_nearest_features >= n_features:\n return None\n with np.errstate(invalid='ignore'):\n abs_corr_mat = np.abs(np.corrcoef(X_filled.T))\n abs_corr_mat[np.isnan(abs_corr_mat)] = tolerance\n np.clip(abs_corr_mat, tolerance, None, out=abs_corr_mat)\n np.fill_diagonal(abs_corr_mat, 0)\n abs_corr_mat = normalize(abs_corr_mat, norm='l1', axis=0, copy=False)\n return abs_corr_mat" }, { @@ -90933,7 +96753,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_features", @@ -90943,7 +96764,8 @@ "docstring": { "type": "int", "description": "Number of features in `X`." - } + }, + "refined_type": {} }, { "name": "feat_idx", @@ -90953,7 +96775,8 @@ "docstring": { "type": "int", "description": "Index of the feature currently being imputed." - } + }, + "refined_type": {} }, { "name": "abs_corr_mat", @@ -90963,13 +96786,14 @@ "docstring": { "type": "ndarray, shape (n_features, n_features)", "description": "Absolute correlation matrix of `X`. The diagonal has been zeroed\nout and each feature has been normalized to sum to 1. Can be None." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Get a list of other features to predict `feat_idx`.\n\nIf `self.n_nearest_features` is less than or equal to the total number of features, then use a probability proportional to the absolute correlation between `feat_idx` and each other feature to randomly choose a subsample of the other features (without replacement).", - "docstring": "Get a list of other features to predict `feat_idx`.\n\nIf `self.n_nearest_features` is less than or equal to the total\nnumber of features, then use a probability proportional to the absolute\ncorrelation between `feat_idx` and each other feature to randomly\nchoose a subsample of the other features (without replacement).\n\nParameters\n----------\nn_features : int\n Number of features in `X`.\n\nfeat_idx : int\n Index of the feature currently being imputed.\n\nabs_corr_mat : ndarray, shape (n_features, n_features)\n Absolute correlation matrix of `X`. The diagonal has been zeroed\n out and each feature has been normalized to sum to 1. Can be None.\n\nReturns\n-------\nneighbor_feat_idx : array-like\n The features to use to impute `feat_idx`.", + "description": "Get a list of other features to predict `feat_idx`.\n\nIf `self.n_nearest_features` is less than or equal to the total\nnumber of features, then use a probability proportional to the absolute\ncorrelation between `feat_idx` and each other feature to randomly\nchoose a subsample of the other features (without replacement).", + "docstring": "Get a list of other features to predict `feat_idx`.\n\n If `self.n_nearest_features` is less than or equal to the total\n number of features, then use a probability proportional to the absolute\n correlation between `feat_idx` and each other feature to randomly\n choose a subsample of the other features (without replacement).\n\n Parameters\n ----------\n n_features : int\n Number of features in `X`.\n\n feat_idx : int\n Index of the feature currently being imputed.\n\n abs_corr_mat : ndarray, shape (n_features, n_features)\n Absolute correlation matrix of `X`. The diagonal has been zeroed\n out and each feature has been normalized to sum to 1. Can be None.\n\n Returns\n -------\n neighbor_feat_idx : array-like\n The features to use to impute `feat_idx`.\n ", "source_code": "\ndef _get_neighbor_feat_idx(self, n_features, feat_idx, abs_corr_mat):\n \"\"\"Get a list of other features to predict `feat_idx`.\n\n If `self.n_nearest_features` is less than or equal to the total\n number of features, then use a probability proportional to the absolute\n correlation between `feat_idx` and each other feature to randomly\n choose a subsample of the other features (without replacement).\n\n Parameters\n ----------\n n_features : int\n Number of features in `X`.\n\n feat_idx : int\n Index of the feature currently being imputed.\n\n abs_corr_mat : ndarray, shape (n_features, n_features)\n Absolute correlation matrix of `X`. The diagonal has been zeroed\n out and each feature has been normalized to sum to 1. 
Can be None.\n\n Returns\n -------\n neighbor_feat_idx : array-like\n The features to use to impute `feat_idx`.\n \"\"\"\n if self.n_nearest_features is not None and self.n_nearest_features < n_features:\n p = abs_corr_mat[:, feat_idx]\n neighbor_feat_idx = self.random_state_.choice(np.arange(n_features), self.n_nearest_features, replace=False, p=p)\n else:\n inds_left = np.arange(feat_idx)\n inds_right = np.arange(feat_idx + 1, n_features)\n neighbor_feat_idx = np.concatenate((inds_left, inds_right))\n return neighbor_feat_idx" }, { @@ -90987,7 +96811,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "mask_missing_values", @@ -90997,13 +96822,14 @@ "docstring": { "type": "array-like, shape (n_samples, n_features)", "description": "Input data's missing indicator matrix, where `n_samples` is the\nnumber of samples and `n_features` is the number of features." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Decide in what order we will update the features.\n\nAs a homage to the MICE R package, we will have 4 main options of how to order the updates, and use a random order if anything else is specified. Also, this function skips features which have no missing values.", - "docstring": "Decide in what order we will update the features.\n\nAs a homage to the MICE R package, we will have 4 main options of\nhow to order the updates, and use a random order if anything else\nis specified.\n\nAlso, this function skips features which have no missing values.\n\nParameters\n----------\nmask_missing_values : array-like, shape (n_samples, n_features)\n Input data's missing indicator matrix, where `n_samples` is the\n number of samples and `n_features` is the number of features.\n\nReturns\n-------\nordered_idx : ndarray, shape (n_features,)\n The order in which to impute the features.", + "description": "Decide in what order we will update the features.\n\nAs a homage to the MICE R package, we will have 4 main options of\nhow to order the updates, and use a random order if anything else\nis specified.\n\nAlso, this function skips features which have no missing values.", + "docstring": "Decide in what order we will update the features.\n\n As a homage to the MICE R package, we will have 4 main options of\n how to order the updates, and use a random order if anything else\n is specified.\n\n Also, this function skips features which have no missing values.\n\n Parameters\n ----------\n mask_missing_values : array-like, shape (n_samples, n_features)\n Input data's missing indicator matrix, where `n_samples` is the\n number of samples and `n_features` is the number of features.\n\n Returns\n -------\n ordered_idx : ndarray, shape (n_features,)\n The order in which to impute the features.\n ", "source_code": "\ndef _get_ordered_idx(self, mask_missing_values):\n \"\"\"Decide in what order we will update the features.\n\n As a homage to the MICE R package, we will have 4 main options of\n how to order the updates, and use a random order if anything else\n is specified.\n\n Also, this function skips features which have no missing values.\n\n Parameters\n ----------\n mask_missing_values : array-like, shape (n_samples, n_features)\n Input data's missing indicator matrix, where `n_samples` is the\n number of samples and `n_features` is the number of features.\n\n Returns\n -------\n ordered_idx : ndarray, shape (n_features,)\n The order in which to impute the features.\n \"\"\"\n frac_of_missing_values = 
mask_missing_values.mean(axis=0)\n if self.skip_complete:\n missing_values_idx = np.flatnonzero(frac_of_missing_values)\n else:\n missing_values_idx = np.arange(np.shape(frac_of_missing_values)[0])\n if self.imputation_order == 'roman':\n ordered_idx = missing_values_idx\n elif self.imputation_order == 'arabic':\n ordered_idx = missing_values_idx[::-1]\n elif self.imputation_order == 'ascending':\n n = len(frac_of_missing_values) - len(missing_values_idx)\n ordered_idx = np.argsort(frac_of_missing_values, kind='mergesort')[n:]\n elif self.imputation_order == 'descending':\n n = len(frac_of_missing_values) - len(missing_values_idx)\n ordered_idx = np.argsort(frac_of_missing_values, kind='mergesort')[n:][::-1]\n elif self.imputation_order == 'random':\n ordered_idx = missing_values_idx\n self.random_state_.shuffle(ordered_idx)\n else:\n raise ValueError(\"Got an invalid imputation order: '{0}'. It must be one of the following: 'roman', 'arabic', 'ascending', 'descending', or 'random'.\".format(self.imputation_order))\n return ordered_idx" }, { @@ -91021,7 +96847,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X_filled", @@ -91031,7 +96858,8 @@ "docstring": { "type": "ndarray", "description": "Input data with the most recent imputations." - } + }, + "refined_type": {} }, { "name": "mask_missing_values", @@ -91041,7 +96869,8 @@ "docstring": { "type": "ndarray", "description": "Input data's missing indicator matrix." - } + }, + "refined_type": {} }, { "name": "feat_idx", @@ -91051,7 +96880,8 @@ "docstring": { "type": "int", "description": "Index of the feature currently being imputed." - } + }, + "refined_type": {} }, { "name": "neighbor_feat_idx", @@ -91061,7 +96891,8 @@ "docstring": { "type": "ndarray", "description": "Indices of the features to be used in imputing `feat_idx`." - } + }, + "refined_type": {} }, { "name": "estimator", @@ -91071,7 +96902,8 @@ "docstring": { "type": "object", "description": "The estimator to use at this step of the round-robin imputation.\nIf `sample_posterior=True`, the estimator must support\n`return_std` in its `predict` method.\nIf None, it will be cloned from self._estimator." - } + }, + "refined_type": {} }, { "name": "fit_mode", @@ -91081,13 +96913,14 @@ "docstring": { "type": "boolean, default=True", "description": "Whether to fit and predict with the estimator or just predict." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Impute a single feature from the others provided.\n\nThis function predicts the missing values of one of the features using the current estimates of all the other features. The `estimator` must support `return_std=True` in its `predict` method for this function to work.", - "docstring": "Impute a single feature from the others provided.\n\nThis function predicts the missing values of one of the features using\nthe current estimates of all the other features. 
The `estimator` must\nsupport `return_std=True` in its `predict` method for this function\nto work.\n\nParameters\n----------\nX_filled : ndarray\n Input data with the most recent imputations.\n\nmask_missing_values : ndarray\n Input data's missing indicator matrix.\n\nfeat_idx : int\n Index of the feature currently being imputed.\n\nneighbor_feat_idx : ndarray\n Indices of the features to be used in imputing `feat_idx`.\n\nestimator : object\n The estimator to use at this step of the round-robin imputation.\n If `sample_posterior=True`, the estimator must support\n `return_std` in its `predict` method.\n If None, it will be cloned from self._estimator.\n\nfit_mode : boolean, default=True\n Whether to fit and predict with the estimator or just predict.\n\nReturns\n-------\nX_filled : ndarray\n Input data with `X_filled[missing_row_mask, feat_idx]` updated.\n\nestimator : estimator with sklearn API\n The fitted estimator used to impute\n `X_filled[missing_row_mask, feat_idx]`.", + "description": "Impute a single feature from the others provided.\n\nThis function predicts the missing values of one of the features using\nthe current estimates of all the other features. The `estimator` must\nsupport `return_std=True` in its `predict` method for this function\nto work.", + "docstring": "Impute a single feature from the others provided.\n\n This function predicts the missing values of one of the features using\n the current estimates of all the other features. The `estimator` must\n support `return_std=True` in its `predict` method for this function\n to work.\n\n Parameters\n ----------\n X_filled : ndarray\n Input data with the most recent imputations.\n\n mask_missing_values : ndarray\n Input data's missing indicator matrix.\n\n feat_idx : int\n Index of the feature currently being imputed.\n\n neighbor_feat_idx : ndarray\n Indices of the features to be used in imputing `feat_idx`.\n\n estimator : object\n The estimator to use at this step of the round-robin imputation.\n If `sample_posterior=True`, the estimator must support\n `return_std` in its `predict` method.\n If None, it will be cloned from self._estimator.\n\n fit_mode : boolean, default=True\n Whether to fit and predict with the estimator or just predict.\n\n Returns\n -------\n X_filled : ndarray\n Input data with `X_filled[missing_row_mask, feat_idx]` updated.\n\n estimator : estimator with sklearn API\n The fitted estimator used to impute\n `X_filled[missing_row_mask, feat_idx]`.\n ", "source_code": "\ndef _impute_one_feature(self, X_filled, mask_missing_values, feat_idx, neighbor_feat_idx, estimator=None, fit_mode=True):\n \"\"\"Impute a single feature from the others provided.\n\n This function predicts the missing values of one of the features using\n the current estimates of all the other features. 
The `estimator` must\n support `return_std=True` in its `predict` method for this function\n to work.\n\n Parameters\n ----------\n X_filled : ndarray\n Input data with the most recent imputations.\n\n mask_missing_values : ndarray\n Input data's missing indicator matrix.\n\n feat_idx : int\n Index of the feature currently being imputed.\n\n neighbor_feat_idx : ndarray\n Indices of the features to be used in imputing `feat_idx`.\n\n estimator : object\n The estimator to use at this step of the round-robin imputation.\n If `sample_posterior=True`, the estimator must support\n `return_std` in its `predict` method.\n If None, it will be cloned from self._estimator.\n\n fit_mode : boolean, default=True\n Whether to fit and predict with the estimator or just predict.\n\n Returns\n -------\n X_filled : ndarray\n Input data with `X_filled[missing_row_mask, feat_idx]` updated.\n\n estimator : estimator with sklearn API\n The fitted estimator used to impute\n `X_filled[missing_row_mask, feat_idx]`.\n \"\"\"\n if estimator is None and fit_mode is False:\n raise ValueError('If fit_mode is False, then an already-fitted estimator should be passed in.')\n if estimator is None:\n estimator = clone(self._estimator)\n missing_row_mask = mask_missing_values[:, feat_idx]\n if fit_mode:\n X_train = _safe_indexing(X_filled[:, neighbor_feat_idx], ~missing_row_mask)\n y_train = _safe_indexing(X_filled[:, feat_idx], ~missing_row_mask)\n estimator.fit(X_train, y_train)\n if np.sum(missing_row_mask) == 0:\n return X_filled, estimator\n X_test = _safe_indexing(X_filled[:, neighbor_feat_idx], missing_row_mask)\n if self.sample_posterior:\n (mus, sigmas) = estimator.predict(X_test, return_std=True)\n imputed_values = np.zeros(mus.shape, dtype=X_filled.dtype)\n positive_sigmas = sigmas > 0\n imputed_values[~positive_sigmas] = mus[~positive_sigmas]\n mus_too_low = mus < self._min_value[feat_idx]\n imputed_values[mus_too_low] = self._min_value[feat_idx]\n mus_too_high = mus > self._max_value[feat_idx]\n imputed_values[mus_too_high] = self._max_value[feat_idx]\n inrange_mask = positive_sigmas & ~mus_too_low & ~mus_too_high\n mus = mus[inrange_mask]\n sigmas = sigmas[inrange_mask]\n a = (self._min_value[feat_idx] - mus) / sigmas\n b = (self._max_value[feat_idx] - mus) / sigmas\n truncated_normal = stats.truncnorm(a=a, b=b, loc=mus, scale=sigmas)\n imputed_values[inrange_mask] = truncated_normal.rvs(random_state=self.random_state_)\n else:\n imputed_values = estimator.predict(X_test)\n imputed_values = np.clip(imputed_values, self._min_value[feat_idx], self._max_value[feat_idx])\n X_filled[missing_row_mask, feat_idx] = imputed_values\n return X_filled, estimator" }, { @@ -91105,7 +96938,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -91115,7 +96949,8 @@ "docstring": { "type": "ndarray, shape (n_samples, n_features)", "description": "Input data, where `n_samples` is the number of samples and\n`n_features` is the number of features." - } + }, + "refined_type": {} }, { "name": "in_fit", @@ -91125,13 +96960,14 @@ "docstring": { "type": "bool, default=False", "description": "Whether function is called in :meth:`fit`." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Perform initial imputation for input `X`.", - "docstring": "Perform initial imputation for input `X`.\n\nParameters\n----------\nX : ndarray, shape (n_samples, n_features)\n Input data, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\nin_fit : bool, default=False\n Whether function is called in :meth:`fit`.\n\nReturns\n-------\nXt : ndarray, shape (n_samples, n_features)\n Input data, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\nX_filled : ndarray, shape (n_samples, n_features)\n Input data with the most recent imputations.\n\nmask_missing_values : ndarray, shape (n_samples, n_features)\n Input data's missing indicator matrix, where `n_samples` is the\n number of samples and `n_features` is the number of features.\n\nX_missing_mask : ndarray, shape (n_samples, n_features)\n Input data's mask matrix indicating missing datapoints, where\n `n_samples` is the number of samples and `n_features` is the\n number of features.", + "docstring": "Perform initial imputation for input `X`.\n\n Parameters\n ----------\n X : ndarray, shape (n_samples, n_features)\n Input data, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\n in_fit : bool, default=False\n Whether function is called in :meth:`fit`.\n\n Returns\n -------\n Xt : ndarray, shape (n_samples, n_features)\n Input data, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\n X_filled : ndarray, shape (n_samples, n_features)\n Input data with the most recent imputations.\n\n mask_missing_values : ndarray, shape (n_samples, n_features)\n Input data's missing indicator matrix, where `n_samples` is the\n number of samples and `n_features` is the number of features.\n\n X_missing_mask : ndarray, shape (n_samples, n_features)\n Input data's mask matrix indicating missing datapoints, where\n `n_samples` is the number of samples and `n_features` is the\n number of features.\n ", "source_code": "\ndef _initial_imputation(self, X, in_fit=False):\n \"\"\"Perform initial imputation for input `X`.\n\n Parameters\n ----------\n X : ndarray, shape (n_samples, n_features)\n Input data, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\n in_fit : bool, default=False\n Whether function is called in :meth:`fit`.\n\n Returns\n -------\n Xt : ndarray, shape (n_samples, n_features)\n Input data, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\n X_filled : ndarray, shape (n_samples, n_features)\n Input data with the most recent imputations.\n\n mask_missing_values : ndarray, shape (n_samples, n_features)\n Input data's missing indicator matrix, where `n_samples` is the\n number of samples and `n_features` is the number of features.\n\n X_missing_mask : ndarray, shape (n_samples, n_features)\n Input data's mask matrix indicating missing datapoints, where\n `n_samples` is the number of samples and `n_features` is the\n number of features.\n \"\"\"\n if is_scalar_nan(self.missing_values):\n force_all_finite = 'allow-nan'\n else:\n force_all_finite = True\n X = self._validate_data(X, dtype=FLOAT_DTYPES, order='F', reset=in_fit, force_all_finite=force_all_finite)\n _check_inputs_dtype(X, self.missing_values)\n X_missing_mask = _get_mask(X, self.missing_values)\n mask_missing_values = X_missing_mask.copy()\n if self.initial_imputer_ is 
None:\n self.initial_imputer_ = SimpleImputer(missing_values=self.missing_values, strategy=self.initial_strategy)\n X_filled = self.initial_imputer_.fit_transform(X)\n else:\n X_filled = self.initial_imputer_.transform(X)\n valid_mask = np.flatnonzero(np.logical_not(np.isnan(self.initial_imputer_.statistics_)))\n Xt = X[:, valid_mask]\n mask_missing_values = mask_missing_values[:, valid_mask]\n return Xt, X_filled, mask_missing_values, X_missing_mask" }, { @@ -91149,7 +96985,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "limit_type", @@ -91159,7 +96996,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_features", @@ -91169,13 +97007,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Validate the limits (min/max) of the feature values.\n\nConverts scalar min/max limits to vectors of shape `(n_features,)`.", - "docstring": "Validate the limits (min/max) of the feature values.\n\nConverts scalar min/max limits to vectors of shape `(n_features,)`.\n\nParameters\n----------\nlimit: scalar or array-like\n The user-specified limit (i.e, min_value or max_value).\nlimit_type: {'max', 'min'}\n Type of limit to validate.\nn_features: int\n Number of features in the dataset.\n\nReturns\n-------\nlimit: ndarray, shape(n_features,)\n Array of limits, one for each feature.", + "docstring": "Validate the limits (min/max) of the feature values.\n\n Converts scalar min/max limits to vectors of shape `(n_features,)`.\n\n Parameters\n ----------\n limit: scalar or array-like\n The user-specified limit (i.e, min_value or max_value).\n limit_type: {'max', 'min'}\n Type of limit to validate.\n n_features: int\n Number of features in the dataset.\n\n Returns\n -------\n limit: ndarray, shape(n_features,)\n Array of limits, one for each feature.\n ", "source_code": "\n@staticmethod\ndef _validate_limit(limit, limit_type, n_features):\n \"\"\"Validate the limits (min/max) of the feature values.\n\n Converts scalar min/max limits to vectors of shape `(n_features,)`.\n\n Parameters\n ----------\n limit: scalar or array-like\n The user-specified limit (i.e, min_value or max_value).\n limit_type: {'max', 'min'}\n Type of limit to validate.\n n_features: int\n Number of features in the dataset.\n\n Returns\n -------\n limit: ndarray, shape(n_features,)\n Array of limits, one for each feature.\n \"\"\"\n limit_bound = np.inf if limit_type == 'max' else -np.inf\n limit = limit_bound if limit is None else limit\n if np.isscalar(limit):\n limit = np.full(n_features, limit)\n limit = check_array(limit, force_all_finite=False, copy=False, ensure_2d=False)\n if not limit.shape[0] == n_features:\n raise ValueError(f\"'{limit_type}_value' should be of shape ({n_features},) when an array-like is provided. Got {limit.shape}, instead.\")\n return limit" }, { @@ -91193,7 +97032,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -91203,7 +97043,8 @@ "docstring": { "type": "array-like, shape (n_samples, n_features)", "description": "Input data, where `n_samples` is the number of samples and\n`n_features` is the number of features." - } + }, + "refined_type": {} }, { "name": "y", @@ -91213,13 +97054,14 @@ "docstring": { "type": "Ignored", "description": "Not used, present for API consistency by convention." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Fit the imputer on `X` and return self.", - "docstring": "Fit the imputer on `X` and return self.\n\nParameters\n----------\nX : array-like, shape (n_samples, n_features)\n Input data, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\ny : Ignored\n Not used, present for API consistency by convention.\n\nReturns\n-------\nself : object\n Fitted estimator.", + "docstring": "Fit the imputer on `X` and return self.\n\n Parameters\n ----------\n X : array-like, shape (n_samples, n_features)\n Input data, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n Returns\n -------\n self : object\n Fitted estimator.\n ", "source_code": "\ndef fit(self, X, y=None):\n \"\"\"Fit the imputer on `X` and return self.\n\n Parameters\n ----------\n X : array-like, shape (n_samples, n_features)\n Input data, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n Returns\n -------\n self : object\n Fitted estimator.\n \"\"\"\n self.fit_transform(X)\n return self" }, { @@ -91237,7 +97079,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -91247,7 +97090,8 @@ "docstring": { "type": "array-like, shape (n_samples, n_features)", "description": "Input data, where `n_samples` is the number of samples and\n`n_features` is the number of features." - } + }, + "refined_type": {} }, { "name": "y", @@ -91257,13 +97101,14 @@ "docstring": { "type": "Ignored", "description": "Not used, present for API consistency by convention." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Fit the imputer on `X` and return the transformed `X`.", - "docstring": "Fit the imputer on `X` and return the transformed `X`.\n\nParameters\n----------\nX : array-like, shape (n_samples, n_features)\n Input data, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\ny : Ignored\n Not used, present for API consistency by convention.\n\nReturns\n-------\nXt : array-like, shape (n_samples, n_features)\n The imputed input data.", + "docstring": "Fit the imputer on `X` and return the transformed `X`.\n\n Parameters\n ----------\n X : array-like, shape (n_samples, n_features)\n Input data, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n Returns\n -------\n Xt : array-like, shape (n_samples, n_features)\n The imputed input data.\n ", "source_code": "\ndef fit_transform(self, X, y=None):\n \"\"\"Fit the imputer on `X` and return the transformed `X`.\n\n Parameters\n ----------\n X : array-like, shape (n_samples, n_features)\n Input data, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n Returns\n -------\n Xt : array-like, shape (n_samples, n_features)\n The imputed input data.\n \"\"\"\n self.random_state_ = getattr(self, 'random_state_', check_random_state(self.random_state))\n if self.max_iter < 0:\n raise ValueError(\"'max_iter' should be a positive integer. 
Got {} instead.\".format(self.max_iter))\n if self.tol < 0:\n raise ValueError(\"'tol' should be a non-negative float. Got {} instead.\".format(self.tol))\n if self.estimator is None:\n from ..linear_model import BayesianRidge\n self._estimator = BayesianRidge()\n else:\n self._estimator = clone(self.estimator)\n self.imputation_sequence_ = []\n self.initial_imputer_ = None\n (X, Xt, mask_missing_values, complete_mask) = self._initial_imputation(X, in_fit=True)\n super()._fit_indicator(complete_mask)\n X_indicator = super()._transform_indicator(complete_mask)\n if self.max_iter == 0 or np.all(mask_missing_values):\n self.n_iter_ = 0\n return super()._concatenate_indicator(Xt, X_indicator)\n if Xt.shape[1] == 1:\n self.n_iter_ = 0\n return super()._concatenate_indicator(Xt, X_indicator)\n self._min_value = self._validate_limit(self.min_value, 'min', X.shape[1])\n self._max_value = self._validate_limit(self.max_value, 'max', X.shape[1])\n if not np.all(np.greater(self._max_value, self._min_value)):\n raise ValueError('One (or more) features have min_value >= max_value.')\n ordered_idx = self._get_ordered_idx(mask_missing_values)\n self.n_features_with_missing_ = len(ordered_idx)\n abs_corr_mat = self._get_abs_corr_mat(Xt)\n (n_samples, n_features) = Xt.shape\n if self.verbose > 0:\n print('[IterativeImputer] Completing matrix with shape %s' % (X.shape, ))\n start_t = time()\n if not self.sample_posterior:\n Xt_previous = Xt.copy()\n normalized_tol = self.tol * np.max(np.abs(X[~mask_missing_values]))\n for self.n_iter_ in range(1, self.max_iter + 1):\n if self.imputation_order == 'random':\n ordered_idx = self._get_ordered_idx(mask_missing_values)\n for feat_idx in ordered_idx:\n neighbor_feat_idx = self._get_neighbor_feat_idx(n_features, feat_idx, abs_corr_mat)\n (Xt, estimator) = self._impute_one_feature(Xt, mask_missing_values, feat_idx, neighbor_feat_idx, estimator=None, fit_mode=True)\n estimator_triplet = _ImputerTriplet(feat_idx, neighbor_feat_idx, estimator)\n self.imputation_sequence_.append(estimator_triplet)\n if self.verbose > 1:\n print('[IterativeImputer] Ending imputation round %d/%d, elapsed time %0.2f' % (self.n_iter_, self.max_iter, time() - start_t))\n if not self.sample_posterior:\n inf_norm = np.linalg.norm(Xt - Xt_previous, ord=np.inf, axis=None)\n if self.verbose > 0:\n print('[IterativeImputer] Change: {}, scaled tolerance: {} '.format(inf_norm, normalized_tol))\n if inf_norm < normalized_tol:\n if self.verbose > 0:\n print('[IterativeImputer] Early stopping criterion reached.')\n break\n Xt_previous = Xt.copy()\n else:\n if not self.sample_posterior:\n warnings.warn('[IterativeImputer] Early stopping criterion not reached.', ConvergenceWarning)\n Xt[~mask_missing_values] = X[~mask_missing_values]\n return super()._concatenate_indicator(Xt, X_indicator)" }, { @@ -91281,7 +97126,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -91291,13 +97137,14 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "The input data to complete." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Impute all missing values in `X`.\n\nNote that this is stochastic, and that if `random_state` is not fixed, repeated calls, or permuted input, results will differ.", - "docstring": "Impute all missing values in `X`.\n\nNote that this is stochastic, and that if `random_state` is not fixed,\nrepeated calls, or permuted input, results will differ.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n The input data to complete.\n\nReturns\n-------\nXt : array-like, shape (n_samples, n_features)\n The imputed input data.", + "description": "Impute all missing values in `X`.\n\nNote that this is stochastic, and that if `random_state` is not fixed,\nrepeated calls, or permuted input, results will differ.", + "docstring": "Impute all missing values in `X`.\n\n Note that this is stochastic, and that if `random_state` is not fixed,\n repeated calls, or permuted input, results will differ.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The input data to complete.\n\n Returns\n -------\n Xt : array-like, shape (n_samples, n_features)\n The imputed input data.\n ", "source_code": "\ndef transform(self, X):\n \"\"\"Impute all missing values in `X`.\n\n Note that this is stochastic, and that if `random_state` is not fixed,\n repeated calls, or permuted input, results will differ.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The input data to complete.\n\n Returns\n -------\n Xt : array-like, shape (n_samples, n_features)\n The imputed input data.\n \"\"\"\n check_is_fitted(self)\n (X, Xt, mask_missing_values, complete_mask) = self._initial_imputation(X)\n X_indicator = super()._transform_indicator(complete_mask)\n if self.n_iter_ == 0 or np.all(mask_missing_values):\n return super()._concatenate_indicator(Xt, X_indicator)\n imputations_per_round = len(self.imputation_sequence_) // self.n_iter_\n i_rnd = 0\n if self.verbose > 0:\n print('[IterativeImputer] Completing matrix with shape %s' % (X.shape, ))\n start_t = time()\n for (it, estimator_triplet) in enumerate(self.imputation_sequence_):\n (Xt, _) = self._impute_one_feature(Xt, mask_missing_values, estimator_triplet.feat_idx, estimator_triplet.neighbor_feat_idx, estimator=estimator_triplet.estimator, fit_mode=False)\n if not (it + 1) % imputations_per_round:\n if self.verbose > 1:\n print('[IterativeImputer] Ending imputation round %d/%d, elapsed time %0.2f' % (i_rnd + 1, self.n_iter_, time() - start_t))\n i_rnd += 1\n Xt[~mask_missing_values] = X[~mask_missing_values]\n return super()._concatenate_indicator(Xt, X_indicator)" }, { @@ -91315,7 +97162,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "missing_values", @@ -91325,7 +97173,8 @@ "docstring": { "type": "int, float, str, np.nan or None, default=np.nan", "description": "The placeholder for the missing values. All occurrences of\n`missing_values` will be imputed. For pandas' dataframes with\nnullable integer dtypes with missing values, `missing_values`\nshould be set to np.nan, since `pd.NA` will be converted to np.nan." - } + }, + "refined_type": {} }, { "name": "n_neighbors", @@ -91335,7 +97184,8 @@ "docstring": { "type": "int, default=5", "description": "Number of neighboring samples to use for imputation." 
- } + }, + "refined_type": {} }, { "name": "weights", @@ -91345,6 +97195,10 @@ "docstring": { "type": "{'uniform', 'distance'} or callable, default='uniform'", "description": "Weight function used in prediction. Possible values:\n\n- 'uniform' : uniform weights. All points in each neighborhood are\n weighted equally.\n- 'distance' : weight points by the inverse of their distance.\n in this case, closer neighbors of a query point will have a\n greater influence than neighbors which are further away.\n- callable : a user-defined function which accepts an\n array of distances, and returns an array of the same shape\n containing the weights." + }, + "refined_type": { + "kind": "EnumType", + "values": ["uniform", "distance"] } }, { @@ -91355,6 +97209,10 @@ "docstring": { "type": "{'nan_euclidean'} or callable, default='nan_euclidean'", "description": "Distance metric for searching neighbors. Possible values:\n\n- 'nan_euclidean'\n- callable : a user-defined function which conforms to the definition\n of ``_pairwise_callable(X, Y, metric, **kwds)``. The function\n accepts two arrays, X and Y, and a `missing_values` keyword in\n `kwds` and returns a scalar distance value." + }, + "refined_type": { + "kind": "EnumType", + "values": ["nan_euclidean"] } }, { @@ -91365,7 +97223,8 @@ "docstring": { "type": "bool, default=True", "description": "If True, a copy of X will be created. If False, imputation will\nbe done in-place whenever possible." - } + }, + "refined_type": {} }, { "name": "add_indicator", @@ -91375,13 +97234,14 @@ "docstring": { "type": "bool, default=False", "description": "If True, a :class:`MissingIndicator` transform will stack onto the\noutput of the imputer's transform. This allows a predictive estimator\nto account for missingness despite imputation. If a feature has no\nmissing values at fit/train time, the feature won't appear on the\nmissing indicator even if there are missing values at transform/test\ntime." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, *, missing_values=np.nan, n_neighbors=5, weights='uniform', metric='nan_euclidean', copy=True, add_indicator=False):\n super().__init__(missing_values=missing_values, add_indicator=add_indicator)\n self.n_neighbors = n_neighbors\n self.weights = weights\n self.metric = metric\n self.copy = copy" }, { @@ -91399,7 +97259,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "dist_pot_donors", @@ -91409,7 +97270,8 @@ "docstring": { "type": "ndarray of shape (n_receivers, n_potential_donors)", "description": "Distance matrix between the receivers and potential donors from\ntraining set. There must be at least one non-nan distance between\na receiver and a potential donor." - } + }, + "refined_type": {} }, { "name": "n_neighbors", @@ -91419,7 +97281,8 @@ "docstring": { "type": "int", "description": "Number of neighbors to consider." - } + }, + "refined_type": {} }, { "name": "fit_X_col", @@ -91429,7 +97292,8 @@ "docstring": { "type": "ndarray of shape (n_potential_donors,)", "description": "Column of potential donors from training set." - } + }, + "refined_type": {} }, { "name": "mask_fit_X_col", @@ -91439,13 +97303,14 @@ "docstring": { "type": "ndarray of shape (n_potential_donors,)", "description": "Missing mask for fit_X_col." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Helper function to impute a single column.", - "docstring": "Helper function to impute a single column.\n\nParameters\n----------\ndist_pot_donors : ndarray of shape (n_receivers, n_potential_donors)\n Distance matrix between the receivers and potential donors from\n training set. There must be at least one non-nan distance between\n a receiver and a potential donor.\n\nn_neighbors : int\n Number of neighbors to consider.\n\nfit_X_col : ndarray of shape (n_potential_donors,)\n Column of potential donors from training set.\n\nmask_fit_X_col : ndarray of shape (n_potential_donors,)\n Missing mask for fit_X_col.\n\nReturns\n-------\nimputed_values: ndarray of shape (n_receivers,)\n Imputed values for receiver.", + "docstring": "Helper function to impute a single column.\n\n Parameters\n ----------\n dist_pot_donors : ndarray of shape (n_receivers, n_potential_donors)\n Distance matrix between the receivers and potential donors from\n training set. There must be at least one non-nan distance between\n a receiver and a potential donor.\n\n n_neighbors : int\n Number of neighbors to consider.\n\n fit_X_col : ndarray of shape (n_potential_donors,)\n Column of potential donors from training set.\n\n mask_fit_X_col : ndarray of shape (n_potential_donors,)\n Missing mask for fit_X_col.\n\n Returns\n -------\n imputed_values: ndarray of shape (n_receivers,)\n Imputed values for receiver.\n ", "source_code": "\ndef _calc_impute(self, dist_pot_donors, n_neighbors, fit_X_col, mask_fit_X_col):\n \"\"\"Helper function to impute a single column.\n\n Parameters\n ----------\n dist_pot_donors : ndarray of shape (n_receivers, n_potential_donors)\n Distance matrix between the receivers and potential donors from\n training set. There must be at least one non-nan distance between\n a receiver and a potential donor.\n\n n_neighbors : int\n Number of neighbors to consider.\n\n fit_X_col : ndarray of shape (n_potential_donors,)\n Column of potential donors from training set.\n\n mask_fit_X_col : ndarray of shape (n_potential_donors,)\n Missing mask for fit_X_col.\n\n Returns\n -------\n imputed_values: ndarray of shape (n_receivers,)\n Imputed values for receiver.\n \"\"\"\n donors_idx = np.argpartition(dist_pot_donors, n_neighbors - 1, axis=1)[:, :n_neighbors]\n donors_dist = dist_pot_donors[np.arange(donors_idx.shape[0])[:, None], donors_idx]\n weight_matrix = _get_weights(donors_dist, self.weights)\n if weight_matrix is not None:\n weight_matrix[np.isnan(weight_matrix)] = 0.0\n donors = fit_X_col.take(donors_idx)\n donors_mask = mask_fit_X_col.take(donors_idx)\n donors = np.ma.array(donors, mask=donors_mask)\n return np.ma.average(donors, axis=1, weights=weight_matrix).data" }, { @@ -91463,7 +97328,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -91473,7 +97339,8 @@ "docstring": { "type": "array-like shape of (n_samples, n_features)", "description": "Input data, where `n_samples` is the number of samples and\n`n_features` is the number of features." - } + }, + "refined_type": {} }, { "name": "y", @@ -91483,13 +97350,14 @@ "docstring": { "type": "Ignored", "description": "Not used, present here for API consistency by convention." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Fit the imputer on X.", - "docstring": "Fit the imputer on X.\n\nParameters\n----------\nX : array-like shape of (n_samples, n_features)\n Input data, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\ny : Ignored\n Not used, present here for API consistency by convention.\n\nReturns\n-------\nself : object\n The fitted `KNNImputer` class instance.", + "docstring": "Fit the imputer on X.\n\n Parameters\n ----------\n X : array-like shape of (n_samples, n_features)\n Input data, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\n y : Ignored\n Not used, present here for API consistency by convention.\n\n Returns\n -------\n self : object\n The fitted `KNNImputer` class instance.\n ", "source_code": "\ndef fit(self, X, y=None):\n \"\"\"Fit the imputer on X.\n\n Parameters\n ----------\n X : array-like shape of (n_samples, n_features)\n Input data, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\n y : Ignored\n Not used, present here for API consistency by convention.\n\n Returns\n -------\n self : object\n The fitted `KNNImputer` class instance.\n \"\"\"\n if not is_scalar_nan(self.missing_values):\n force_all_finite = True\n else:\n force_all_finite = 'allow-nan'\n if self.metric not in _NAN_METRICS and not callable(self.metric):\n raise ValueError('The selected metric does not support NaN values')\n if self.n_neighbors <= 0:\n raise ValueError('Expected n_neighbors > 0. Got {}'.format(self.n_neighbors))\n X = self._validate_data(X, accept_sparse=False, dtype=FLOAT_DTYPES, force_all_finite=force_all_finite, copy=self.copy)\n _check_weights(self.weights)\n self._fit_X = X\n self._mask_fit_X = _get_mask(self._fit_X, self.missing_values)\n super()._fit_indicator(self._mask_fit_X)\n return self" }, { @@ -91507,7 +97375,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -91517,13 +97386,14 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "The input data to complete." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Impute all missing values in X.", - "docstring": "Impute all missing values in X.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n The input data to complete.\n\nReturns\n-------\nX : array-like of shape (n_samples, n_output_features)\n The imputed dataset. `n_output_features` is the number of features\n that is not always missing during `fit`.", + "docstring": "Impute all missing values in X.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The input data to complete.\n\n Returns\n -------\n X : array-like of shape (n_samples, n_output_features)\n The imputed dataset. `n_output_features` is the number of features\n that is not always missing during `fit`.\n ", "source_code": "\ndef transform(self, X):\n \"\"\"Impute all missing values in X.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The input data to complete.\n\n Returns\n -------\n X : array-like of shape (n_samples, n_output_features)\n The imputed dataset. 
`n_output_features` is the number of features\n that is not always missing during `fit`.\n \"\"\"\n check_is_fitted(self)\n if not is_scalar_nan(self.missing_values):\n force_all_finite = True\n else:\n force_all_finite = 'allow-nan'\n X = self._validate_data(X, accept_sparse=False, dtype=FLOAT_DTYPES, force_all_finite=force_all_finite, copy=self.copy, reset=False)\n mask = _get_mask(X, self.missing_values)\n mask_fit_X = self._mask_fit_X\n valid_mask = ~np.all(mask_fit_X, axis=0)\n X_indicator = super()._transform_indicator(mask)\n if not np.any(mask):\n return X[:, valid_mask]\n row_missing_idx = np.flatnonzero(mask.any(axis=1))\n non_missing_fix_X = np.logical_not(mask_fit_X)\n dist_idx_map = np.zeros(X.shape[0], dtype=int)\n dist_idx_map[row_missing_idx] = np.arange(row_missing_idx.shape[0])\n \n def process_chunk(dist_chunk, start):\n row_missing_chunk = row_missing_idx[start:start + len(dist_chunk)]\n for col in range(X.shape[1]):\n if not valid_mask[col]:\n continue\n col_mask = mask[row_missing_chunk, col]\n if not np.any(col_mask):\n continue\n (potential_donors_idx, ) = np.nonzero(non_missing_fix_X[:, col])\n receivers_idx = row_missing_chunk[np.flatnonzero(col_mask)]\n dist_subset = dist_chunk[dist_idx_map[receivers_idx] - start][:, potential_donors_idx]\n all_nan_dist_mask = np.isnan(dist_subset).all(axis=1)\n all_nan_receivers_idx = receivers_idx[all_nan_dist_mask]\n if all_nan_receivers_idx.size:\n col_mean = np.ma.array(self._fit_X[:, col], mask=mask_fit_X[:, col]).mean()\n X[all_nan_receivers_idx, col] = col_mean\n if len(all_nan_receivers_idx) == len(receivers_idx):\n continue\n receivers_idx = receivers_idx[~all_nan_dist_mask]\n dist_subset = dist_chunk[dist_idx_map[receivers_idx] - start][:, potential_donors_idx]\n n_neighbors = min(self.n_neighbors, len(potential_donors_idx))\n value = self._calc_impute(dist_subset, n_neighbors, self._fit_X[potential_donors_idx, col], mask_fit_X[potential_donors_idx, col])\n X[receivers_idx, col] = value\n gen = pairwise_distances_chunked(X[row_missing_idx, :], self._fit_X, metric=self.metric, missing_values=self.missing_values, force_all_finite=force_all_finite, reduce_func=process_chunk)\n for chunk in gen:\n pass\n return super()._concatenate_indicator(X[:, valid_mask], X_indicator)" }, { @@ -91541,7 +97411,8 @@ "docstring": { "type": "ndarray, shape (n_samples, n_target_features)", "description": "The data." - } + }, + "refined_type": {} }, { "name": "percentiles", @@ -91551,7 +97422,8 @@ "docstring": { "type": "tuple of floats", "description": "The percentiles which are used to construct the extreme values of\nthe grid. Must be in [0, 1]." - } + }, + "refined_type": {} }, { "name": "grid_resolution", @@ -91561,13 +97433,14 @@ "docstring": { "type": "int", "description": "The number of equally spaced points to be placed on the grid for each\nfeature." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Generate a grid of points based on the percentiles of X.\n\nThe grid is a cartesian product between the columns of ``values``. The ith column of ``values`` consists in ``grid_resolution`` equally-spaced points between the percentiles of the jth column of X. If ``grid_resolution`` is bigger than the number of unique values in the jth column of X, then those unique values will be used instead.", - "docstring": "Generate a grid of points based on the percentiles of X.\n\nThe grid is a cartesian product between the columns of ``values``. 
The\nith column of ``values`` consists in ``grid_resolution`` equally-spaced\npoints between the percentiles of the jth column of X.\nIf ``grid_resolution`` is bigger than the number of unique values in the\njth column of X, then those unique values will be used instead.\n\nParameters\n----------\nX : ndarray, shape (n_samples, n_target_features)\n The data.\n\npercentiles : tuple of floats\n The percentiles which are used to construct the extreme values of\n the grid. Must be in [0, 1].\n\ngrid_resolution : int\n The number of equally spaced points to be placed on the grid for each\n feature.\n\nReturns\n-------\ngrid : ndarray, shape (n_points, n_target_features)\n A value for each feature at each point in the grid. ``n_points`` is\n always ``<= grid_resolution ** X.shape[1]``.\n\nvalues : list of 1d ndarrays\n The values with which the grid has been created. The size of each\n array ``values[j]`` is either ``grid_resolution``, or the number of\n unique values in ``X[:, j]``, whichever is smaller.", + "description": "Generate a grid of points based on the percentiles of X.\n\nThe grid is a cartesian product between the columns of ``values``. The\nith column of ``values`` consists in ``grid_resolution`` equally-spaced\npoints between the percentiles of the jth column of X.\nIf ``grid_resolution`` is bigger than the number of unique values in the\njth column of X, then those unique values will be used instead.", + "docstring": "Generate a grid of points based on the percentiles of X.\n\n The grid is a cartesian product between the columns of ``values``. The\n ith column of ``values`` consists in ``grid_resolution`` equally-spaced\n points between the percentiles of the jth column of X.\n If ``grid_resolution`` is bigger than the number of unique values in the\n jth column of X, then those unique values will be used instead.\n\n Parameters\n ----------\n X : ndarray, shape (n_samples, n_target_features)\n The data.\n\n percentiles : tuple of floats\n The percentiles which are used to construct the extreme values of\n the grid. Must be in [0, 1].\n\n grid_resolution : int\n The number of equally spaced points to be placed on the grid for each\n feature.\n\n Returns\n -------\n grid : ndarray, shape (n_points, n_target_features)\n A value for each feature at each point in the grid. ``n_points`` is\n always ``<= grid_resolution ** X.shape[1]``.\n\n values : list of 1d ndarrays\n The values with which the grid has been created. The size of each\n array ``values[j]`` is either ``grid_resolution``, or the number of\n unique values in ``X[:, j]``, whichever is smaller.\n ", "source_code": "\ndef _grid_from_X(X, percentiles, grid_resolution):\n \"\"\"Generate a grid of points based on the percentiles of X.\n\n The grid is a cartesian product between the columns of ``values``. The\n ith column of ``values`` consists in ``grid_resolution`` equally-spaced\n points between the percentiles of the jth column of X.\n If ``grid_resolution`` is bigger than the number of unique values in the\n jth column of X, then those unique values will be used instead.\n\n Parameters\n ----------\n X : ndarray, shape (n_samples, n_target_features)\n The data.\n\n percentiles : tuple of floats\n The percentiles which are used to construct the extreme values of\n the grid. 
Must be in [0, 1].\n\n grid_resolution : int\n The number of equally spaced points to be placed on the grid for each\n feature.\n\n Returns\n -------\n grid : ndarray, shape (n_points, n_target_features)\n A value for each feature at each point in the grid. ``n_points`` is\n always ``<= grid_resolution ** X.shape[1]``.\n\n values : list of 1d ndarrays\n The values with which the grid has been created. The size of each\n array ``values[j]`` is either ``grid_resolution``, or the number of\n unique values in ``X[:, j]``, whichever is smaller.\n \"\"\"\n if not isinstance(percentiles, Iterable) or len(percentiles) != 2:\n raise ValueError(\"'percentiles' must be a sequence of 2 elements.\")\n if not all((0 <= x <= 1 for x in percentiles)):\n raise ValueError(\"'percentiles' values must be in [0, 1].\")\n if percentiles[0] >= percentiles[1]:\n raise ValueError('percentiles[0] must be strictly less than percentiles[1].')\n if grid_resolution <= 1:\n raise ValueError(\"'grid_resolution' must be strictly greater than 1.\")\n values = []\n for feature in range(X.shape[1]):\n uniques = np.unique(_safe_indexing(X, feature, axis=1))\n if uniques.shape[0] < grid_resolution:\n axis = uniques\n else:\n emp_percentiles = mquantiles(_safe_indexing(X, feature, axis=1), prob=percentiles, axis=0)\n if np.allclose(emp_percentiles[0], emp_percentiles[1]):\n raise ValueError('percentiles are too close to each other, unable to build the grid. Please choose percentiles that are further apart.')\n axis = np.linspace(emp_percentiles[0], emp_percentiles[1], num=grid_resolution, endpoint=True)\n values.append(axis)\n return cartesian(values), values" }, { @@ -91585,7 +97458,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "grid", @@ -91595,7 +97469,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "features", @@ -91605,7 +97480,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -91615,7 +97491,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "response_method", @@ -91625,13 +97502,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _partial_dependence_brute(est, grid, features, X, response_method):\n predictions = []\n averaged_predictions = []\n if is_regressor(est):\n prediction_method = est.predict\n else:\n predict_proba = getattr(est, 'predict_proba', None)\n decision_function = getattr(est, 'decision_function', None)\n if response_method == 'auto':\n prediction_method = predict_proba or decision_function\n else:\n prediction_method = predict_proba if response_method == 'predict_proba' else decision_function\n if prediction_method is None:\n if response_method == 'auto':\n raise ValueError('The estimator has no predict_proba and no decision_function method.')\n elif response_method == 'predict_proba':\n raise ValueError('The estimator has no predict_proba method.')\n else:\n raise ValueError('The estimator has no decision_function method.')\n for new_values in grid:\n X_eval = X.copy()\n for (i, variable) in enumerate(features):\n if hasattr(X_eval, 'iloc'):\n X_eval.iloc[:, variable] = new_values[i]\n else:\n X_eval[:, variable] = new_values[i]\n try:\n pred = prediction_method(X_eval)\n predictions.append(pred)\n averaged_predictions.append(np.mean(pred, axis=0))\n except NotFittedError 
as e:\n raise ValueError(\"'estimator' parameter must be a fitted estimator\") from e\n n_samples = X.shape[0]\n predictions = np.array(predictions).T\n if is_regressor(est) and predictions.ndim == 2:\n predictions = predictions.reshape(n_samples, -1)\n elif is_classifier(est) and predictions.shape[0] == 2:\n predictions = predictions[1]\n predictions = predictions.reshape(n_samples, -1)\n averaged_predictions = np.array(averaged_predictions).T\n if is_regressor(est) and averaged_predictions.ndim == 1:\n averaged_predictions = averaged_predictions.reshape(1, -1)\n elif is_classifier(est) and averaged_predictions.shape[0] == 2:\n averaged_predictions = averaged_predictions[1]\n averaged_predictions = averaged_predictions.reshape(1, -1)\n return averaged_predictions, predictions" }, { @@ -91649,7 +97527,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "grid", @@ -91659,7 +97538,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "features", @@ -91669,13 +97549,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _partial_dependence_recursion(est, grid, features):\n averaged_predictions = est._compute_partial_dependence_recursion(grid, features)\n if averaged_predictions.ndim == 1:\n averaged_predictions = averaged_predictions.reshape(1, -1)\n return averaged_predictions" }, { @@ -91693,7 +97574,8 @@ "docstring": { "type": "BaseEstimator", "description": "A fitted estimator object implementing :term:`predict`,\n:term:`predict_proba`, or :term:`decision_function`.\nMultioutput-multiclass classifiers are not supported." - } + }, + "refined_type": {} }, { "name": "X", @@ -91703,6 +97585,10 @@ "docstring": { "type": "{array-like or dataframe} of shape (n_samples, n_features)", "description": "``X`` is used to generate a grid of values for the target\n``features`` (where the partial dependence will be evaluated), and\nalso to generate values for the complement features when the\n`method` is 'brute'." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -91713,6 +97599,10 @@ "docstring": { "type": "array-like of {int, str}", "description": "The feature (e.g. `[0]`) or pair of interacting features\n(e.g. `[(0, 1)]`) for which the partial dependency should be computed." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -91723,6 +97613,10 @@ "docstring": { "type": "{'auto', 'predict_proba', 'decision_function'}, default='auto'", "description": "Specifies whether to use :term:`predict_proba` or\n:term:`decision_function` as the target response. For regressors\nthis parameter is ignored and the response is always the output of\n:term:`predict`. By default, :term:`predict_proba` is tried first\nand we revert to :term:`decision_function` if it doesn't exist. If\n``method`` is 'recursion', the response is always the output of\n:term:`decision_function`." + }, + "refined_type": { + "kind": "EnumType", + "values": ["auto", "decision_function", "predict_proba"] } }, { @@ -91733,7 +97627,8 @@ "docstring": { "type": "tuple of float, default=(0.05, 0.95)", "description": "The lower and upper percentile used to create the extreme values\nfor the grid. Must be in [0, 1]." 
- } + }, + "refined_type": {} }, { "name": "grid_resolution", @@ -91743,7 +97638,8 @@ "docstring": { "type": "int, default=100", "description": "The number of equally spaced points on the grid, for each target\nfeature." - } + }, + "refined_type": {} }, { "name": "method", @@ -91753,6 +97649,10 @@ "docstring": { "type": "{'auto', 'recursion', 'brute'}, default='auto'", "description": "The method used to calculate the averaged predictions:\n\n- `'recursion'` is only supported for some tree-based estimators\n (namely\n :class:`~sklearn.ensemble.GradientBoostingClassifier`,\n :class:`~sklearn.ensemble.GradientBoostingRegressor`,\n :class:`~sklearn.ensemble.HistGradientBoostingClassifier`,\n :class:`~sklearn.ensemble.HistGradientBoostingRegressor`,\n :class:`~sklearn.tree.DecisionTreeRegressor`,\n :class:`~sklearn.ensemble.RandomForestRegressor`,\n ) when `kind='average'`.\n This is more efficient in terms of speed.\n With this method, the target response of a\n classifier is always the decision function, not the predicted\n probabilities. Since the `'recursion'` method implicitly computes\n the average of the Individual Conditional Expectation (ICE) by\n design, it is not compatible with ICE and thus `kind` must be\n `'average'`.\n\n- `'brute'` is supported for any estimator, but is more\n computationally intensive.\n\n- `'auto'`: the `'recursion'` is used for estimators that support it,\n and `'brute'` is used otherwise.\n\nPlease see :ref:`this note ` for\ndifferences between the `'brute'` and `'recursion'` method." + }, + "refined_type": { + "kind": "EnumType", + "values": ["auto", "recursion", "brute"] } }, { @@ -91763,14 +97663,18 @@ "docstring": { "type": "{'legacy', 'average', 'individual', 'both'}, default='legacy'", "description": "Whether to return the partial dependence averaged across all the\nsamples in the dataset or one line per sample or both.\nSee Returns below.\n\nNote that the fast `method='recursion'` option is only available for\n`kind='average'`. Plotting individual dependencies requires using the\nslower `method='brute'` option.\n\n.. versionadded:: 0.24\n.. deprecated:: 0.24\n `kind='legacy'` is deprecated and will be removed in version 1.1.\n `kind='average'` will be the new default. It is intended to migrate\n from the ndarray output to :class:`~sklearn.utils.Bunch` output." + }, + "refined_type": { + "kind": "EnumType", + "values": ["legacy", "individual", "average", "both"] } } ], "results": [], "is_public": true, - "description": "Partial dependence of ``features``.\n\nPartial dependence of a feature (or a set of features) corresponds to the average response of an estimator for each possible value of the feature. Read more in the :ref:`User Guide `. .. warning:: For :class:`~sklearn.ensemble.GradientBoostingClassifier` and :class:`~sklearn.ensemble.GradientBoostingRegressor`, the `'recursion'` method (used by default) will not account for the `init` predictor of the boosting process. In practice, this will produce the same values as `'brute'` up to a constant offset in the target response, provided that `init` is a constant estimator (which is the default). However, if `init` is not a constant estimator, the partial dependence values are incorrect for `'recursion'` because the offset will be sample-dependent. It is preferable to use the `'brute'` method. 
Note that this only applies to :class:`~sklearn.ensemble.GradientBoostingClassifier` and :class:`~sklearn.ensemble.GradientBoostingRegressor`, not to :class:`~sklearn.ensemble.HistGradientBoostingClassifier` and :class:`~sklearn.ensemble.HistGradientBoostingRegressor`.", - "docstring": "Partial dependence of ``features``.\n\nPartial dependence of a feature (or a set of features) corresponds to\nthe average response of an estimator for each possible value of the\nfeature.\n\nRead more in the :ref:`User Guide `.\n\n.. warning::\n\n For :class:`~sklearn.ensemble.GradientBoostingClassifier` and\n :class:`~sklearn.ensemble.GradientBoostingRegressor`, the\n `'recursion'` method (used by default) will not account for the `init`\n predictor of the boosting process. In practice, this will produce\n the same values as `'brute'` up to a constant offset in the target\n response, provided that `init` is a constant estimator (which is the\n default). However, if `init` is not a constant estimator, the\n partial dependence values are incorrect for `'recursion'` because the\n offset will be sample-dependent. It is preferable to use the `'brute'`\n method. Note that this only applies to\n :class:`~sklearn.ensemble.GradientBoostingClassifier` and\n :class:`~sklearn.ensemble.GradientBoostingRegressor`, not to\n :class:`~sklearn.ensemble.HistGradientBoostingClassifier` and\n :class:`~sklearn.ensemble.HistGradientBoostingRegressor`.\n\nParameters\n----------\nestimator : BaseEstimator\n A fitted estimator object implementing :term:`predict`,\n :term:`predict_proba`, or :term:`decision_function`.\n Multioutput-multiclass classifiers are not supported.\n\nX : {array-like or dataframe} of shape (n_samples, n_features)\n ``X`` is used to generate a grid of values for the target\n ``features`` (where the partial dependence will be evaluated), and\n also to generate values for the complement features when the\n `method` is 'brute'.\n\nfeatures : array-like of {int, str}\n The feature (e.g. `[0]`) or pair of interacting features\n (e.g. `[(0, 1)]`) for which the partial dependency should be computed.\n\nresponse_method : {'auto', 'predict_proba', 'decision_function'}, default='auto'\n Specifies whether to use :term:`predict_proba` or\n :term:`decision_function` as the target response. For regressors\n this parameter is ignored and the response is always the output of\n :term:`predict`. By default, :term:`predict_proba` is tried first\n and we revert to :term:`decision_function` if it doesn't exist. If\n ``method`` is 'recursion', the response is always the output of\n :term:`decision_function`.\n\npercentiles : tuple of float, default=(0.05, 0.95)\n The lower and upper percentile used to create the extreme values\n for the grid. 
Must be in [0, 1].\n\ngrid_resolution : int, default=100\n The number of equally spaced points on the grid, for each target\n feature.\n\nmethod : {'auto', 'recursion', 'brute'}, default='auto'\n The method used to calculate the averaged predictions:\n\n - `'recursion'` is only supported for some tree-based estimators\n (namely\n :class:`~sklearn.ensemble.GradientBoostingClassifier`,\n :class:`~sklearn.ensemble.GradientBoostingRegressor`,\n :class:`~sklearn.ensemble.HistGradientBoostingClassifier`,\n :class:`~sklearn.ensemble.HistGradientBoostingRegressor`,\n :class:`~sklearn.tree.DecisionTreeRegressor`,\n :class:`~sklearn.ensemble.RandomForestRegressor`,\n ) when `kind='average'`.\n This is more efficient in terms of speed.\n With this method, the target response of a\n classifier is always the decision function, not the predicted\n probabilities. Since the `'recursion'` method implicitly computes\n the average of the Individual Conditional Expectation (ICE) by\n design, it is not compatible with ICE and thus `kind` must be\n `'average'`.\n\n - `'brute'` is supported for any estimator, but is more\n computationally intensive.\n\n - `'auto'`: the `'recursion'` is used for estimators that support it,\n and `'brute'` is used otherwise.\n\n Please see :ref:`this note ` for\n differences between the `'brute'` and `'recursion'` method.\n\nkind : {'legacy', 'average', 'individual', 'both'}, default='legacy'\n Whether to return the partial dependence averaged across all the\n samples in the dataset or one line per sample or both.\n See Returns below.\n\n Note that the fast `method='recursion'` option is only available for\n `kind='average'`. Plotting individual dependencies requires using the\n slower `method='brute'` option.\n\n .. versionadded:: 0.24\n .. deprecated:: 0.24\n `kind='legacy'` is deprecated and will be removed in version 1.1.\n `kind='average'` will be the new default. It is intended to migrate\n from the ndarray output to :class:`~sklearn.utils.Bunch` output.\n\n\nReturns\n-------\npredictions : ndarray or :class:`~sklearn.utils.Bunch`\n\n - if `kind='legacy'`, return value is ndarray of shape (n_outputs, len(values[0]), len(values[1]), ...)\n The predictions for all the points in the grid, averaged\n over all samples in X (or over the training data if ``method``\n is 'recursion').\n\n - if `kind='individual'`, `'average'` or `'both'`, return value is :class:`~sklearn.utils.Bunch`\n Dictionary-like object, with the following attributes.\n\n individual : ndarray of shape (n_outputs, n_instances, len(values[0]), len(values[1]), ...)\n The predictions for all the points in the grid for all\n samples in X. This is also known as Individual\n Conditional Expectation (ICE)\n\n average : ndarray of shape (n_outputs, len(values[0]), len(values[1]), ...)\n The predictions for all the points in the grid, averaged\n over all samples in X (or over the training data if\n ``method`` is 'recursion').\n Only available when kind='both'.\n\n values : seq of 1d ndarrays\n The values with which the grid has been created. The generated\n grid is a cartesian product of the arrays in ``values``.\n ``len(values) == len(features)``. 
The size of each array\n ``values[j]`` is either ``grid_resolution``, or the number of\n unique values in ``X[:, j]``, whichever is smaller.\n\n ``n_outputs`` corresponds to the number of classes in a multi-class\n setting, or to the number of tasks for multi-output regression.\n For classical regression and binary classification ``n_outputs==1``.\n ``n_values_feature_j`` corresponds to the size ``values[j]``.\n\nvalues : seq of 1d ndarrays\n The values with which the grid has been created. The generated grid\n is a cartesian product of the arrays in ``values``. ``len(values) ==\n len(features)``. The size of each array ``values[j]`` is either\n ``grid_resolution``, or the number of unique values in ``X[:, j]``,\n whichever is smaller. Only available when `kind=\"legacy\"`.\n\nSee Also\n--------\nPartialDependenceDisplay.from_estimator : Plot Partial Dependence.\nPartialDependenceDisplay : Partial Dependence visualization.\n\nExamples\n--------\n>>> X = [[0, 0, 2], [1, 0, 0]]\n>>> y = [0, 1]\n>>> from sklearn.ensemble import GradientBoostingClassifier\n>>> gb = GradientBoostingClassifier(random_state=0).fit(X, y)\n>>> partial_dependence(gb, features=[0], X=X, percentiles=(0, 1),\n... grid_resolution=2) # doctest: +SKIP\n(array([[-4.52..., 4.52...]]), [array([ 0., 1.])])", - "source_code": "\ndef partial_dependence(estimator, X, features, *, response_method='auto', percentiles=(0.05, 0.95), grid_resolution=100, method='auto', kind='legacy'):\n \"\"\"Partial dependence of ``features``.\n\n Partial dependence of a feature (or a set of features) corresponds to\n the average response of an estimator for each possible value of the\n feature.\n\n Read more in the :ref:`User Guide `.\n\n .. warning::\n\n For :class:`~sklearn.ensemble.GradientBoostingClassifier` and\n :class:`~sklearn.ensemble.GradientBoostingRegressor`, the\n `'recursion'` method (used by default) will not account for the `init`\n predictor of the boosting process. In practice, this will produce\n the same values as `'brute'` up to a constant offset in the target\n response, provided that `init` is a constant estimator (which is the\n default). However, if `init` is not a constant estimator, the\n partial dependence values are incorrect for `'recursion'` because the\n offset will be sample-dependent. It is preferable to use the `'brute'`\n method. Note that this only applies to\n :class:`~sklearn.ensemble.GradientBoostingClassifier` and\n :class:`~sklearn.ensemble.GradientBoostingRegressor`, not to\n :class:`~sklearn.ensemble.HistGradientBoostingClassifier` and\n :class:`~sklearn.ensemble.HistGradientBoostingRegressor`.\n\n Parameters\n ----------\n estimator : BaseEstimator\n A fitted estimator object implementing :term:`predict`,\n :term:`predict_proba`, or :term:`decision_function`.\n Multioutput-multiclass classifiers are not supported.\n\n X : {array-like or dataframe} of shape (n_samples, n_features)\n ``X`` is used to generate a grid of values for the target\n ``features`` (where the partial dependence will be evaluated), and\n also to generate values for the complement features when the\n `method` is 'brute'.\n\n features : array-like of {int, str}\n The feature (e.g. `[0]`) or pair of interacting features\n (e.g. `[(0, 1)]`) for which the partial dependency should be computed.\n\n response_method : {'auto', 'predict_proba', 'decision_function'}, default='auto'\n Specifies whether to use :term:`predict_proba` or\n :term:`decision_function` as the target response. 
For regressors\n this parameter is ignored and the response is always the output of\n :term:`predict`. By default, :term:`predict_proba` is tried first\n and we revert to :term:`decision_function` if it doesn't exist. If\n ``method`` is 'recursion', the response is always the output of\n :term:`decision_function`.\n\n percentiles : tuple of float, default=(0.05, 0.95)\n The lower and upper percentile used to create the extreme values\n for the grid. Must be in [0, 1].\n\n grid_resolution : int, default=100\n The number of equally spaced points on the grid, for each target\n feature.\n\n method : {'auto', 'recursion', 'brute'}, default='auto'\n The method used to calculate the averaged predictions:\n\n - `'recursion'` is only supported for some tree-based estimators\n (namely\n :class:`~sklearn.ensemble.GradientBoostingClassifier`,\n :class:`~sklearn.ensemble.GradientBoostingRegressor`,\n :class:`~sklearn.ensemble.HistGradientBoostingClassifier`,\n :class:`~sklearn.ensemble.HistGradientBoostingRegressor`,\n :class:`~sklearn.tree.DecisionTreeRegressor`,\n :class:`~sklearn.ensemble.RandomForestRegressor`,\n ) when `kind='average'`.\n This is more efficient in terms of speed.\n With this method, the target response of a\n classifier is always the decision function, not the predicted\n probabilities. Since the `'recursion'` method implicitly computes\n the average of the Individual Conditional Expectation (ICE) by\n design, it is not compatible with ICE and thus `kind` must be\n `'average'`.\n\n - `'brute'` is supported for any estimator, but is more\n computationally intensive.\n\n - `'auto'`: the `'recursion'` is used for estimators that support it,\n and `'brute'` is used otherwise.\n\n Please see :ref:`this note ` for\n differences between the `'brute'` and `'recursion'` method.\n\n kind : {'legacy', 'average', 'individual', 'both'}, default='legacy'\n Whether to return the partial dependence averaged across all the\n samples in the dataset or one line per sample or both.\n See Returns below.\n\n Note that the fast `method='recursion'` option is only available for\n `kind='average'`. Plotting individual dependencies requires using the\n slower `method='brute'` option.\n\n .. versionadded:: 0.24\n .. deprecated:: 0.24\n `kind='legacy'` is deprecated and will be removed in version 1.1.\n `kind='average'` will be the new default. It is intended to migrate\n from the ndarray output to :class:`~sklearn.utils.Bunch` output.\n\n\n Returns\n -------\n predictions : ndarray or :class:`~sklearn.utils.Bunch`\n\n - if `kind='legacy'`, return value is ndarray of shape (n_outputs, len(values[0]), len(values[1]), ...)\n The predictions for all the points in the grid, averaged\n over all samples in X (or over the training data if ``method``\n is 'recursion').\n\n - if `kind='individual'`, `'average'` or `'both'`, return value is :class:`~sklearn.utils.Bunch`\n Dictionary-like object, with the following attributes.\n\n individual : ndarray of shape (n_outputs, n_instances, len(values[0]), len(values[1]), ...)\n The predictions for all the points in the grid for all\n samples in X. This is also known as Individual\n Conditional Expectation (ICE)\n\n average : ndarray of shape (n_outputs, len(values[0]), len(values[1]), ...)\n The predictions for all the points in the grid, averaged\n over all samples in X (or over the training data if\n ``method`` is 'recursion').\n Only available when kind='both'.\n\n values : seq of 1d ndarrays\n The values with which the grid has been created. 
The generated\n grid is a cartesian product of the arrays in ``values``.\n ``len(values) == len(features)``. The size of each array\n ``values[j]`` is either ``grid_resolution``, or the number of\n unique values in ``X[:, j]``, whichever is smaller.\n\n ``n_outputs`` corresponds to the number of classes in a multi-class\n setting, or to the number of tasks for multi-output regression.\n For classical regression and binary classification ``n_outputs==1``.\n ``n_values_feature_j`` corresponds to the size ``values[j]``.\n\n values : seq of 1d ndarrays\n The values with which the grid has been created. The generated grid\n is a cartesian product of the arrays in ``values``. ``len(values) ==\n len(features)``. The size of each array ``values[j]`` is either\n ``grid_resolution``, or the number of unique values in ``X[:, j]``,\n whichever is smaller. Only available when `kind=\"legacy\"`.\n\n See Also\n --------\n PartialDependenceDisplay.from_estimator : Plot Partial Dependence.\n PartialDependenceDisplay : Partial Dependence visualization.\n\n Examples\n --------\n >>> X = [[0, 0, 2], [1, 0, 0]]\n >>> y = [0, 1]\n >>> from sklearn.ensemble import GradientBoostingClassifier\n >>> gb = GradientBoostingClassifier(random_state=0).fit(X, y)\n >>> partial_dependence(gb, features=[0], X=X, percentiles=(0, 1),\n ... grid_resolution=2) # doctest: +SKIP\n (array([[-4.52..., 4.52...]]), [array([ 0., 1.])])\n \"\"\"\n if not (is_classifier(estimator) or is_regressor(estimator)):\n raise ValueError(\"'estimator' must be a fitted regressor or classifier.\")\n if isinstance(estimator, Pipeline):\n for est in estimator:\n if est not in (None, 'drop'):\n check_is_fitted(est)\n else:\n check_is_fitted(estimator)\n if is_classifier(estimator) and isinstance(estimator.classes_[0], np.ndarray):\n raise ValueError('Multiclass-multioutput estimators are not supported')\n if not (hasattr(X, '__array__') or sparse.issparse(X)):\n X = check_array(X, force_all_finite='allow-nan', dtype=object)\n accepted_responses = ('auto', 'predict_proba', 'decision_function')\n if response_method not in accepted_responses:\n raise ValueError('response_method {} is invalid. Accepted response_method names are {}.'.format(response_method, ', '.join(accepted_responses)))\n if is_regressor(estimator) and response_method != 'auto':\n raise ValueError(\"The response_method parameter is ignored for regressors and must be 'auto'.\")\n accepted_methods = ('brute', 'recursion', 'auto')\n if method not in accepted_methods:\n raise ValueError('method {} is invalid. 
Accepted method names are {}.'.format(method, ', '.join(accepted_methods)))\n if kind != 'average' and kind != 'legacy':\n if method == 'recursion':\n raise ValueError(\"The 'recursion' method only applies when 'kind' is set to 'average'\")\n method = 'brute'\n if method == 'auto':\n if isinstance(estimator, BaseGradientBoosting) and estimator.init is None:\n method = 'recursion'\n elif isinstance(estimator, (BaseHistGradientBoosting, DecisionTreeRegressor, RandomForestRegressor)):\n method = 'recursion'\n else:\n method = 'brute'\n if method == 'recursion':\n if not isinstance(estimator, (BaseGradientBoosting, BaseHistGradientBoosting, DecisionTreeRegressor, RandomForestRegressor)):\n supported_classes_recursion = ('GradientBoostingClassifier', 'GradientBoostingRegressor', 'HistGradientBoostingClassifier', 'HistGradientBoostingRegressor', 'HistGradientBoostingRegressor', 'DecisionTreeRegressor', 'RandomForestRegressor')\n raise ValueError(\"Only the following estimators support the 'recursion' method: {}. Try using method='brute'.\".format(', '.join(supported_classes_recursion)))\n if response_method == 'auto':\n response_method = 'decision_function'\n if response_method != 'decision_function':\n raise ValueError(\"With the 'recursion' method, the response_method must be 'decision_function'. Got {}.\".format(response_method))\n if _determine_key_type(features, accept_slice=False) == 'int':\n if np.any(np.less(features, 0)):\n raise ValueError('all features must be in [0, {}]'.format(X.shape[1] - 1))\n features_indices = np.asarray(_get_column_indices(X, features), dtype=np.int32, order='C').ravel()\n (grid, values) = _grid_from_X(_safe_indexing(X, features_indices, axis=1), percentiles, grid_resolution)\n if method == 'brute':\n (averaged_predictions, predictions) = _partial_dependence_brute(estimator, grid, features_indices, X, response_method)\n predictions = predictions.reshape(-1, X.shape[0], *[val.shape[0] for val in values])\n else:\n averaged_predictions = _partial_dependence_recursion(estimator, grid, features_indices)\n averaged_predictions = averaged_predictions.reshape(-1, *[val.shape[0] for val in values])\n if kind == 'legacy':\n warnings.warn(\"A Bunch will be returned in place of 'predictions' from version 1.1 (renaming of 0.26) with partial dependence results accessible via the 'average' key. In the meantime, pass kind='average' to get the future behaviour.\", FutureWarning)\n return averaged_predictions, values\n elif kind == 'average':\n return Bunch(average=averaged_predictions, values=values)\n elif kind == 'individual':\n return Bunch(individual=predictions, values=values)\n else:\n return Bunch(average=averaged_predictions, individual=predictions, values=values)" + "description": "Partial dependence of ``features``.\n\nPartial dependence of a feature (or a set of features) corresponds to\nthe average response of an estimator for each possible value of the\nfeature.\n\nRead more in the :ref:`User Guide `.\n\n.. warning::\n\n For :class:`~sklearn.ensemble.GradientBoostingClassifier` and\n :class:`~sklearn.ensemble.GradientBoostingRegressor`, the\n `'recursion'` method (used by default) will not account for the `init`\n predictor of the boosting process. In practice, this will produce\n the same values as `'brute'` up to a constant offset in the target\n response, provided that `init` is a constant estimator (which is the\n default). 
However, if `init` is not a constant estimator, the\n partial dependence values are incorrect for `'recursion'` because the\n offset will be sample-dependent. It is preferable to use the `'brute'`\n method. Note that this only applies to\n :class:`~sklearn.ensemble.GradientBoostingClassifier` and\n :class:`~sklearn.ensemble.GradientBoostingRegressor`, not to\n :class:`~sklearn.ensemble.HistGradientBoostingClassifier` and\n :class:`~sklearn.ensemble.HistGradientBoostingRegressor`.", + "docstring": "Partial dependence of ``features``.\n\n Partial dependence of a feature (or a set of features) corresponds to\n the average response of an estimator for each possible value of the\n feature.\n\n Read more in the :ref:`User Guide `.\n\n .. warning::\n\n For :class:`~sklearn.ensemble.GradientBoostingClassifier` and\n :class:`~sklearn.ensemble.GradientBoostingRegressor`, the\n `'recursion'` method (used by default) will not account for the `init`\n predictor of the boosting process. In practice, this will produce\n the same values as `'brute'` up to a constant offset in the target\n response, provided that `init` is a constant estimator (which is the\n default). However, if `init` is not a constant estimator, the\n partial dependence values are incorrect for `'recursion'` because the\n offset will be sample-dependent. It is preferable to use the `'brute'`\n method. Note that this only applies to\n :class:`~sklearn.ensemble.GradientBoostingClassifier` and\n :class:`~sklearn.ensemble.GradientBoostingRegressor`, not to\n :class:`~sklearn.ensemble.HistGradientBoostingClassifier` and\n :class:`~sklearn.ensemble.HistGradientBoostingRegressor`.\n\n Parameters\n ----------\n estimator : BaseEstimator\n A fitted estimator object implementing :term:`predict`,\n :term:`predict_proba`, or :term:`decision_function`.\n Multioutput-multiclass classifiers are not supported.\n\n X : {array-like or dataframe} of shape (n_samples, n_features)\n ``X`` is used to generate a grid of values for the target\n ``features`` (where the partial dependence will be evaluated), and\n also to generate values for the complement features when the\n `method` is 'brute'.\n\n features : array-like of {int, str}\n The feature (e.g. `[0]`) or pair of interacting features\n (e.g. `[(0, 1)]`) for which the partial dependency should be computed.\n\n response_method : {'auto', 'predict_proba', 'decision_function'}, default='auto'\n Specifies whether to use :term:`predict_proba` or\n :term:`decision_function` as the target response. For regressors\n this parameter is ignored and the response is always the output of\n :term:`predict`. By default, :term:`predict_proba` is tried first\n and we revert to :term:`decision_function` if it doesn't exist. If\n ``method`` is 'recursion', the response is always the output of\n :term:`decision_function`.\n\n percentiles : tuple of float, default=(0.05, 0.95)\n The lower and upper percentile used to create the extreme values\n for the grid. 
Must be in [0, 1].\n\n grid_resolution : int, default=100\n The number of equally spaced points on the grid, for each target\n feature.\n\n method : {'auto', 'recursion', 'brute'}, default='auto'\n The method used to calculate the averaged predictions:\n\n - `'recursion'` is only supported for some tree-based estimators\n (namely\n :class:`~sklearn.ensemble.GradientBoostingClassifier`,\n :class:`~sklearn.ensemble.GradientBoostingRegressor`,\n :class:`~sklearn.ensemble.HistGradientBoostingClassifier`,\n :class:`~sklearn.ensemble.HistGradientBoostingRegressor`,\n :class:`~sklearn.tree.DecisionTreeRegressor`,\n :class:`~sklearn.ensemble.RandomForestRegressor`,\n ) when `kind='average'`.\n This is more efficient in terms of speed.\n With this method, the target response of a\n classifier is always the decision function, not the predicted\n probabilities. Since the `'recursion'` method implicitly computes\n the average of the Individual Conditional Expectation (ICE) by\n design, it is not compatible with ICE and thus `kind` must be\n `'average'`.\n\n - `'brute'` is supported for any estimator, but is more\n computationally intensive.\n\n - `'auto'`: the `'recursion'` is used for estimators that support it,\n and `'brute'` is used otherwise.\n\n Please see :ref:`this note ` for\n differences between the `'brute'` and `'recursion'` method.\n\n kind : {'legacy', 'average', 'individual', 'both'}, default='legacy'\n Whether to return the partial dependence averaged across all the\n samples in the dataset or one line per sample or both.\n See Returns below.\n\n Note that the fast `method='recursion'` option is only available for\n `kind='average'`. Plotting individual dependencies requires using the\n slower `method='brute'` option.\n\n .. versionadded:: 0.24\n .. deprecated:: 0.24\n `kind='legacy'` is deprecated and will be removed in version 1.1.\n `kind='average'` will be the new default. It is intended to migrate\n from the ndarray output to :class:`~sklearn.utils.Bunch` output.\n\n\n Returns\n -------\n predictions : ndarray or :class:`~sklearn.utils.Bunch`\n\n - if `kind='legacy'`, return value is ndarray of shape (n_outputs, len(values[0]), len(values[1]), ...)\n The predictions for all the points in the grid, averaged\n over all samples in X (or over the training data if ``method``\n is 'recursion').\n\n - if `kind='individual'`, `'average'` or `'both'`, return value is :class:`~sklearn.utils.Bunch`\n Dictionary-like object, with the following attributes.\n\n individual : ndarray of shape (n_outputs, n_instances, len(values[0]), len(values[1]), ...)\n The predictions for all the points in the grid for all\n samples in X. This is also known as Individual\n Conditional Expectation (ICE)\n\n average : ndarray of shape (n_outputs, len(values[0]), len(values[1]), ...)\n The predictions for all the points in the grid, averaged\n over all samples in X (or over the training data if\n ``method`` is 'recursion').\n Only available when kind='both'.\n\n values : seq of 1d ndarrays\n The values with which the grid has been created. The generated\n grid is a cartesian product of the arrays in ``values``.\n ``len(values) == len(features)``. 
The size of each array\n ``values[j]`` is either ``grid_resolution``, or the number of\n unique values in ``X[:, j]``, whichever is smaller.\n\n ``n_outputs`` corresponds to the number of classes in a multi-class\n setting, or to the number of tasks for multi-output regression.\n For classical regression and binary classification ``n_outputs==1``.\n ``n_values_feature_j`` corresponds to the size ``values[j]``.\n\n values : seq of 1d ndarrays\n The values with which the grid has been created. The generated grid\n is a cartesian product of the arrays in ``values``. ``len(values) ==\n len(features)``. The size of each array ``values[j]`` is either\n ``grid_resolution``, or the number of unique values in ``X[:, j]``,\n whichever is smaller. Only available when `kind=\"legacy\"`.\n\n See Also\n --------\n PartialDependenceDisplay.from_estimator : Plot Partial Dependence.\n PartialDependenceDisplay : Partial Dependence visualization.\n\n Examples\n --------\n >>> X = [[0, 0, 2], [1, 0, 0]]\n >>> y = [0, 1]\n >>> from sklearn.ensemble import GradientBoostingClassifier\n >>> gb = GradientBoostingClassifier(random_state=0).fit(X, y)\n >>> partial_dependence(gb, features=[0], X=X, percentiles=(0, 1),\n ... grid_resolution=2) # doctest: +SKIP\n (array([[-4.52..., 4.52...]]), [array([ 0., 1.])])\n ", + "source_code": "\ndef partial_dependence(estimator, X, features, *, response_method='auto', percentiles=(0.05, 0.95), grid_resolution=100, method='auto', kind='legacy'):\n \"\"\"Partial dependence of ``features``.\n\n Partial dependence of a feature (or a set of features) corresponds to\n the average response of an estimator for each possible value of the\n feature.\n\n Read more in the :ref:`User Guide `.\n\n .. warning::\n\n For :class:`~sklearn.ensemble.GradientBoostingClassifier` and\n :class:`~sklearn.ensemble.GradientBoostingRegressor`, the\n `'recursion'` method (used by default) will not account for the `init`\n predictor of the boosting process. In practice, this will produce\n the same values as `'brute'` up to a constant offset in the target\n response, provided that `init` is a constant estimator (which is the\n default). However, if `init` is not a constant estimator, the\n partial dependence values are incorrect for `'recursion'` because the\n offset will be sample-dependent. It is preferable to use the `'brute'`\n method. Note that this only applies to\n :class:`~sklearn.ensemble.GradientBoostingClassifier` and\n :class:`~sklearn.ensemble.GradientBoostingRegressor`, not to\n :class:`~sklearn.ensemble.HistGradientBoostingClassifier` and\n :class:`~sklearn.ensemble.HistGradientBoostingRegressor`.\n\n Parameters\n ----------\n estimator : BaseEstimator\n A fitted estimator object implementing :term:`predict`,\n :term:`predict_proba`, or :term:`decision_function`.\n Multioutput-multiclass classifiers are not supported.\n\n X : {array-like or dataframe} of shape (n_samples, n_features)\n ``X`` is used to generate a grid of values for the target\n ``features`` (where the partial dependence will be evaluated), and\n also to generate values for the complement features when the\n `method` is 'brute'.\n\n features : array-like of {int, str}\n The feature (e.g. `[0]`) or pair of interacting features\n (e.g. `[(0, 1)]`) for which the partial dependency should be computed.\n\n response_method : {'auto', 'predict_proba', 'decision_function'}, default='auto'\n Specifies whether to use :term:`predict_proba` or\n :term:`decision_function` as the target response. 
For regressors\n this parameter is ignored and the response is always the output of\n :term:`predict`. By default, :term:`predict_proba` is tried first\n and we revert to :term:`decision_function` if it doesn't exist. If\n ``method`` is 'recursion', the response is always the output of\n :term:`decision_function`.\n\n percentiles : tuple of float, default=(0.05, 0.95)\n The lower and upper percentile used to create the extreme values\n for the grid. Must be in [0, 1].\n\n grid_resolution : int, default=100\n The number of equally spaced points on the grid, for each target\n feature.\n\n method : {'auto', 'recursion', 'brute'}, default='auto'\n The method used to calculate the averaged predictions:\n\n - `'recursion'` is only supported for some tree-based estimators\n (namely\n :class:`~sklearn.ensemble.GradientBoostingClassifier`,\n :class:`~sklearn.ensemble.GradientBoostingRegressor`,\n :class:`~sklearn.ensemble.HistGradientBoostingClassifier`,\n :class:`~sklearn.ensemble.HistGradientBoostingRegressor`,\n :class:`~sklearn.tree.DecisionTreeRegressor`,\n :class:`~sklearn.ensemble.RandomForestRegressor`,\n ) when `kind='average'`.\n This is more efficient in terms of speed.\n With this method, the target response of a\n classifier is always the decision function, not the predicted\n probabilities. Since the `'recursion'` method implicitly computes\n the average of the Individual Conditional Expectation (ICE) by\n design, it is not compatible with ICE and thus `kind` must be\n `'average'`.\n\n - `'brute'` is supported for any estimator, but is more\n computationally intensive.\n\n - `'auto'`: the `'recursion'` is used for estimators that support it,\n and `'brute'` is used otherwise.\n\n Please see :ref:`this note ` for\n differences between the `'brute'` and `'recursion'` method.\n\n kind : {'legacy', 'average', 'individual', 'both'}, default='legacy'\n Whether to return the partial dependence averaged across all the\n samples in the dataset or one line per sample or both.\n See Returns below.\n\n Note that the fast `method='recursion'` option is only available for\n `kind='average'`. Plotting individual dependencies requires using the\n slower `method='brute'` option.\n\n .. versionadded:: 0.24\n .. deprecated:: 0.24\n `kind='legacy'` is deprecated and will be removed in version 1.1.\n `kind='average'` will be the new default. It is intended to migrate\n from the ndarray output to :class:`~sklearn.utils.Bunch` output.\n\n\n Returns\n -------\n predictions : ndarray or :class:`~sklearn.utils.Bunch`\n\n - if `kind='legacy'`, return value is ndarray of shape (n_outputs, len(values[0]), len(values[1]), ...)\n The predictions for all the points in the grid, averaged\n over all samples in X (or over the training data if ``method``\n is 'recursion').\n\n - if `kind='individual'`, `'average'` or `'both'`, return value is :class:`~sklearn.utils.Bunch`\n Dictionary-like object, with the following attributes.\n\n individual : ndarray of shape (n_outputs, n_instances, len(values[0]), len(values[1]), ...)\n The predictions for all the points in the grid for all\n samples in X. This is also known as Individual\n Conditional Expectation (ICE)\n\n average : ndarray of shape (n_outputs, len(values[0]), len(values[1]), ...)\n The predictions for all the points in the grid, averaged\n over all samples in X (or over the training data if\n ``method`` is 'recursion').\n Only available when kind='both'.\n\n values : seq of 1d ndarrays\n The values with which the grid has been created. 
The generated\n grid is a cartesian product of the arrays in ``values``.\n ``len(values) == len(features)``. The size of each array\n ``values[j]`` is either ``grid_resolution``, or the number of\n unique values in ``X[:, j]``, whichever is smaller.\n\n ``n_outputs`` corresponds to the number of classes in a multi-class\n setting, or to the number of tasks for multi-output regression.\n For classical regression and binary classification ``n_outputs==1``.\n ``n_values_feature_j`` corresponds to the size ``values[j]``.\n\n values : seq of 1d ndarrays\n The values with which the grid has been created. The generated grid\n is a cartesian product of the arrays in ``values``. ``len(values) ==\n len(features)``. The size of each array ``values[j]`` is either\n ``grid_resolution``, or the number of unique values in ``X[:, j]``,\n whichever is smaller. Only available when `kind=\"legacy\"`.\n\n See Also\n --------\n PartialDependenceDisplay.from_estimator : Plot Partial Dependence.\n PartialDependenceDisplay : Partial Dependence visualization.\n\n Examples\n --------\n >>> X = [[0, 0, 2], [1, 0, 0]]\n >>> y = [0, 1]\n >>> from sklearn.ensemble import GradientBoostingClassifier\n >>> gb = GradientBoostingClassifier(random_state=0).fit(X, y)\n >>> partial_dependence(gb, features=[0], X=X, percentiles=(0, 1),\n ... grid_resolution=2) # doctest: +SKIP\n (array([[-4.52..., 4.52...]]), [array([ 0., 1.])])\n \"\"\"\n check_is_fitted(estimator)\n if not (is_classifier(estimator) or is_regressor(estimator)):\n raise ValueError(\"'estimator' must be a fitted regressor or classifier.\")\n if is_classifier(estimator) and isinstance(estimator.classes_[0], np.ndarray):\n raise ValueError('Multiclass-multioutput estimators are not supported')\n if not (hasattr(X, '__array__') or sparse.issparse(X)):\n X = check_array(X, force_all_finite='allow-nan', dtype=object)\n accepted_responses = ('auto', 'predict_proba', 'decision_function')\n if response_method not in accepted_responses:\n raise ValueError('response_method {} is invalid. Accepted response_method names are {}.'.format(response_method, ', '.join(accepted_responses)))\n if is_regressor(estimator) and response_method != 'auto':\n raise ValueError(\"The response_method parameter is ignored for regressors and must be 'auto'.\")\n accepted_methods = ('brute', 'recursion', 'auto')\n if method not in accepted_methods:\n raise ValueError('method {} is invalid. Accepted method names are {}.'.format(method, ', '.join(accepted_methods)))\n if kind != 'average' and kind != 'legacy':\n if method == 'recursion':\n raise ValueError(\"The 'recursion' method only applies when 'kind' is set to 'average'\")\n method = 'brute'\n if method == 'auto':\n if isinstance(estimator, BaseGradientBoosting) and estimator.init is None:\n method = 'recursion'\n elif isinstance(estimator, (BaseHistGradientBoosting, DecisionTreeRegressor, RandomForestRegressor)):\n method = 'recursion'\n else:\n method = 'brute'\n if method == 'recursion':\n if not isinstance(estimator, (BaseGradientBoosting, BaseHistGradientBoosting, DecisionTreeRegressor, RandomForestRegressor)):\n supported_classes_recursion = ('GradientBoostingClassifier', 'GradientBoostingRegressor', 'HistGradientBoostingClassifier', 'HistGradientBoostingRegressor', 'HistGradientBoostingRegressor', 'DecisionTreeRegressor', 'RandomForestRegressor')\n raise ValueError(\"Only the following estimators support the 'recursion' method: {}. 
Try using method='brute'.\".format(', '.join(supported_classes_recursion)))\n if response_method == 'auto':\n response_method = 'decision_function'\n if response_method != 'decision_function':\n raise ValueError(\"With the 'recursion' method, the response_method must be 'decision_function'. Got {}.\".format(response_method))\n if _determine_key_type(features, accept_slice=False) == 'int':\n if np.any(np.less(features, 0)):\n raise ValueError('all features must be in [0, {}]'.format(X.shape[1] - 1))\n features_indices = np.asarray(_get_column_indices(X, features), dtype=np.int32, order='C').ravel()\n (grid, values) = _grid_from_X(_safe_indexing(X, features_indices, axis=1), percentiles, grid_resolution)\n if method == 'brute':\n (averaged_predictions, predictions) = _partial_dependence_brute(estimator, grid, features_indices, X, response_method)\n predictions = predictions.reshape(-1, X.shape[0], *[val.shape[0] for val in values])\n else:\n averaged_predictions = _partial_dependence_recursion(estimator, grid, features_indices)\n averaged_predictions = averaged_predictions.reshape(-1, *[val.shape[0] for val in values])\n if kind == 'legacy':\n warnings.warn(\"A Bunch will be returned in place of 'predictions' from version 1.1 (renaming of 0.26) with partial dependence results accessible via the 'average' key. In the meantime, pass kind='average' to get the future behaviour.\", FutureWarning)\n return averaged_predictions, values\n elif kind == 'average':\n return Bunch(average=averaged_predictions, values=values)\n elif kind == 'individual':\n return Bunch(individual=predictions, values=values)\n else:\n return Bunch(average=averaged_predictions, individual=predictions, values=values)" }, { "name": "_calculate_permutation_scores", @@ -91787,7 +97691,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -91797,7 +97702,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -91807,7 +97713,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -91817,7 +97724,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "col_idx", @@ -91827,7 +97735,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "random_state", @@ -91837,7 +97746,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_repeats", @@ -91847,7 +97757,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "scorer", @@ -91857,7 +97768,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "max_samples", @@ -91867,7 +97779,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -91891,7 +97804,8 @@ "docstring": { "type": "ndarray of shape (n_features,)", "description": "The baseline score without permutation." - } + }, + "refined_type": {} }, { "name": "permuted_score", @@ -91901,13 +97815,14 @@ "docstring": { "type": "ndarray of shape (n_features, n_repeats)", "description": "The permuted scores for the `n` repetitions." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Compute the importances as the decrease in score.", - "docstring": "Compute the importances as the decrease in score.\n\nParameters\n----------\nbaseline_score : ndarray of shape (n_features,)\n The baseline score without permutation.\npermuted_score : ndarray of shape (n_features, n_repeats)\n The permuted scores for the `n` repetitions.\n\nReturns\n-------\nimportances : :class:`~sklearn.utils.Bunch`\n Dictionary-like object, with the following attributes.\n importances_mean : ndarray, shape (n_features, )\n Mean of feature importance over `n_repeats`.\n importances_std : ndarray, shape (n_features, )\n Standard deviation over `n_repeats`.\n importances : ndarray, shape (n_features, n_repeats)\n Raw permutation importance scores.", + "docstring": "Compute the importances as the decrease in score.\n\n Parameters\n ----------\n baseline_score : ndarray of shape (n_features,)\n The baseline score without permutation.\n permuted_score : ndarray of shape (n_features, n_repeats)\n The permuted scores for the `n` repetitions.\n\n Returns\n -------\n importances : :class:`~sklearn.utils.Bunch`\n Dictionary-like object, with the following attributes.\n importances_mean : ndarray, shape (n_features, )\n Mean of feature importance over `n_repeats`.\n importances_std : ndarray, shape (n_features, )\n Standard deviation over `n_repeats`.\n importances : ndarray, shape (n_features, n_repeats)\n Raw permutation importance scores.\n ", "source_code": "\ndef _create_importances_bunch(baseline_score, permuted_score):\n \"\"\"Compute the importances as the decrease in score.\n\n Parameters\n ----------\n baseline_score : ndarray of shape (n_features,)\n The baseline score without permutation.\n permuted_score : ndarray of shape (n_features, n_repeats)\n The permuted scores for the `n` repetitions.\n\n Returns\n -------\n importances : :class:`~sklearn.utils.Bunch`\n Dictionary-like object, with the following attributes.\n importances_mean : ndarray, shape (n_features, )\n Mean of feature importance over `n_repeats`.\n importances_std : ndarray, shape (n_features, )\n Standard deviation over `n_repeats`.\n importances : ndarray, shape (n_features, n_repeats)\n Raw permutation importance scores.\n \"\"\"\n importances = baseline_score - permuted_score\n return Bunch(importances_mean=np.mean(importances, axis=1), importances_std=np.std(importances, axis=1), importances=importances)" }, { @@ -91925,7 +97840,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "estimator", @@ -91935,7 +97851,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -91945,7 +97862,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -91955,7 +97873,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -91965,13 +97884,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _weights_scorer(scorer, estimator, X, y, sample_weight):\n if sample_weight is not None:\n return scorer(estimator, X, y, sample_weight)\n return scorer(estimator, X, y)" }, { @@ -91989,7 +97909,8 @@ "docstring": { "type": "object", "description": "An estimator that has already been :term:`fitted` and is compatible\nwith :term:`scorer`." 
- } + }, + "refined_type": {} }, { "name": "X", @@ -91999,7 +97920,8 @@ "docstring": { "type": "ndarray or DataFrame, shape (n_samples, n_features)", "description": "Data on which permutation importance will be computed." - } + }, + "refined_type": {} }, { "name": "y", @@ -92009,7 +97931,8 @@ "docstring": { "type": "array-like or None, shape (n_samples, ) or (n_samples, n_classes)", "description": "Targets for supervised or `None` for unsupervised." - } + }, + "refined_type": {} }, { "name": "scoring", @@ -92019,7 +97942,8 @@ "docstring": { "type": "str, callable, list, tuple, or dict, default=None", "description": "Scorer to use.\nIf `scoring` represents a single score, one can use:\n\n- a single string (see :ref:`scoring_parameter`);\n- a callable (see :ref:`scoring`) that returns a single value.\n\nIf `scoring` represents multiple scores, one can use:\n\n- a list or tuple of unique strings;\n- a callable returning a dictionary where the keys are the metric\n names and the values are the metric scores;\n- a dictionary with metric names as keys and callables a values.\n\nPassing multiple scores to `scoring` is more efficient than calling\n`permutation_importance` for each of the scores as it reuses\npredictions to avoid redundant computation.\n\nIf None, the estimator's default scorer is used." - } + }, + "refined_type": {} }, { "name": "n_repeats", @@ -92029,7 +97953,8 @@ "docstring": { "type": "int, default=5", "description": "Number of times to permute a feature." - } + }, + "refined_type": {} }, { "name": "n_jobs", @@ -92039,7 +97964,8 @@ "docstring": { "type": "int or None, default=None", "description": "Number of jobs to run in parallel. The computation is done by computing\npermutation score for each columns and parallelized over the columns.\n`None` means 1 unless in a :obj:`joblib.parallel_backend` context.\n`-1` means using all processors. See :term:`Glossary `\nfor more details." - } + }, + "refined_type": {} }, { "name": "random_state", @@ -92049,7 +97975,8 @@ "docstring": { "type": "int, RandomState instance, default=None", "description": "Pseudo-random number generator to control the permutations of each\nfeature.\nPass an int to get reproducible results across function calls.\nSee :term:`Glossary `." - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -92059,7 +97986,8 @@ "docstring": { "type": "array-like of shape (n_samples,), default=None", "description": "Sample weights used in scoring.\n\n.. versionadded:: 0.24" - } + }, + "refined_type": {} }, { "name": "max_samples", @@ -92069,13 +97997,14 @@ "docstring": { "type": "int or float, default=1.0", "description": "The number of samples to draw from X to compute feature importance\nin each repeat (without replacement).\n\n- If int, then draw `max_samples` samples.\n- If float, then draw `max_samples * X.shape[0]` samples.\n- If `max_samples` is equal to `1.0` or `X.shape[0]`, all samples\n will be used.\n\nWhile using this option may provide less accurate importance estimates,\nit keeps the method tractable when evaluating feature importance on\nlarge datasets. In combination with `n_repeats`, this allows to control\nthe computational speed vs statistical accuracy trade-off of this method.\n\n.. versionadded:: 1.0" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Permutation importance for feature evaluation [BRE]_.\n\nThe :term:`estimator` is required to be a fitted estimator. `X` can be the data set used to train the estimator or a hold-out set. 
The permutation importance of a feature is calculated as follows. First, a baseline metric, defined by :term:`scoring`, is evaluated on a (potentially different) dataset defined by the `X`. Next, a feature column from the validation set is permuted and the metric is evaluated again. The permutation importance is defined to be the difference between the baseline metric and metric from permutating the feature column. Read more in the :ref:`User Guide `.", - "docstring": "Permutation importance for feature evaluation [BRE]_.\n\nThe :term:`estimator` is required to be a fitted estimator. `X` can be the\ndata set used to train the estimator or a hold-out set. The permutation\nimportance of a feature is calculated as follows. First, a baseline metric,\ndefined by :term:`scoring`, is evaluated on a (potentially different)\ndataset defined by the `X`. Next, a feature column from the validation set\nis permuted and the metric is evaluated again. The permutation importance\nis defined to be the difference between the baseline metric and metric from\npermutating the feature column.\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\nestimator : object\n An estimator that has already been :term:`fitted` and is compatible\n with :term:`scorer`.\n\nX : ndarray or DataFrame, shape (n_samples, n_features)\n Data on which permutation importance will be computed.\n\ny : array-like or None, shape (n_samples, ) or (n_samples, n_classes)\n Targets for supervised or `None` for unsupervised.\n\nscoring : str, callable, list, tuple, or dict, default=None\n Scorer to use.\n If `scoring` represents a single score, one can use:\n\n - a single string (see :ref:`scoring_parameter`);\n - a callable (see :ref:`scoring`) that returns a single value.\n\n If `scoring` represents multiple scores, one can use:\n\n - a list or tuple of unique strings;\n - a callable returning a dictionary where the keys are the metric\n names and the values are the metric scores;\n - a dictionary with metric names as keys and callables a values.\n\n Passing multiple scores to `scoring` is more efficient than calling\n `permutation_importance` for each of the scores as it reuses\n predictions to avoid redundant computation.\n\n If None, the estimator's default scorer is used.\n\nn_repeats : int, default=5\n Number of times to permute a feature.\n\nn_jobs : int or None, default=None\n Number of jobs to run in parallel. The computation is done by computing\n permutation score for each columns and parallelized over the columns.\n `None` means 1 unless in a :obj:`joblib.parallel_backend` context.\n `-1` means using all processors. See :term:`Glossary `\n for more details.\n\nrandom_state : int, RandomState instance, default=None\n Pseudo-random number generator to control the permutations of each\n feature.\n Pass an int to get reproducible results across function calls.\n See :term:`Glossary `.\n\nsample_weight : array-like of shape (n_samples,), default=None\n Sample weights used in scoring.\n\n .. versionadded:: 0.24\n\nmax_samples : int or float, default=1.0\n The number of samples to draw from X to compute feature importance\n in each repeat (without replacement).\n\n - If int, then draw `max_samples` samples.\n - If float, then draw `max_samples * X.shape[0]` samples.\n - If `max_samples` is equal to `1.0` or `X.shape[0]`, all samples\n will be used.\n\n While using this option may provide less accurate importance estimates,\n it keeps the method tractable when evaluating feature importance on\n large datasets. 
In combination with `n_repeats`, this allows to control\n the computational speed vs statistical accuracy trade-off of this method.\n\n .. versionadded:: 1.0\n\nReturns\n-------\nresult : :class:`~sklearn.utils.Bunch` or dict of such instances\n Dictionary-like object, with the following attributes.\n\n importances_mean : ndarray of shape (n_features, )\n Mean of feature importance over `n_repeats`.\n importances_std : ndarray of shape (n_features, )\n Standard deviation over `n_repeats`.\n importances : ndarray of shape (n_features, n_repeats)\n Raw permutation importance scores.\n\n If there are multiple scoring metrics in the scoring parameter\n `result` is a dict with scorer names as keys (e.g. 'roc_auc') and\n `Bunch` objects like above as values.\n\nReferences\n----------\n.. [BRE] L. Breiman, \"Random Forests\", Machine Learning, 45(1), 5-32,\n 2001. https://doi.org/10.1023/A:1010933404324\n\nExamples\n--------\n>>> from sklearn.linear_model import LogisticRegression\n>>> from sklearn.inspection import permutation_importance\n>>> X = [[1, 9, 9],[1, 9, 9],[1, 9, 9],\n... [0, 9, 9],[0, 9, 9],[0, 9, 9]]\n>>> y = [1, 1, 1, 0, 0, 0]\n>>> clf = LogisticRegression().fit(X, y)\n>>> result = permutation_importance(clf, X, y, n_repeats=10,\n... random_state=0)\n>>> result.importances_mean\narray([0.4666..., 0. , 0. ])\n>>> result.importances_std\narray([0.2211..., 0. , 0. ])", + "description": "Permutation importance for feature evaluation [BRE]_.\n\nThe :term:`estimator` is required to be a fitted estimator. `X` can be the\ndata set used to train the estimator or a hold-out set. The permutation\nimportance of a feature is calculated as follows. First, a baseline metric,\ndefined by :term:`scoring`, is evaluated on a (potentially different)\ndataset defined by the `X`. Next, a feature column from the validation set\nis permuted and the metric is evaluated again. The permutation importance\nis defined to be the difference between the baseline metric and metric from\npermutating the feature column.\n\nRead more in the :ref:`User Guide `.", + "docstring": "Permutation importance for feature evaluation [BRE]_.\n\n The :term:`estimator` is required to be a fitted estimator. `X` can be the\n data set used to train the estimator or a hold-out set. The permutation\n importance of a feature is calculated as follows. First, a baseline metric,\n defined by :term:`scoring`, is evaluated on a (potentially different)\n dataset defined by the `X`. Next, a feature column from the validation set\n is permuted and the metric is evaluated again. 
The permutation importance\n is defined to be the difference between the baseline metric and metric from\n permutating the feature column.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n estimator : object\n An estimator that has already been :term:`fitted` and is compatible\n with :term:`scorer`.\n\n X : ndarray or DataFrame, shape (n_samples, n_features)\n Data on which permutation importance will be computed.\n\n y : array-like or None, shape (n_samples, ) or (n_samples, n_classes)\n Targets for supervised or `None` for unsupervised.\n\n scoring : str, callable, list, tuple, or dict, default=None\n Scorer to use.\n If `scoring` represents a single score, one can use:\n\n - a single string (see :ref:`scoring_parameter`);\n - a callable (see :ref:`scoring`) that returns a single value.\n\n If `scoring` represents multiple scores, one can use:\n\n - a list or tuple of unique strings;\n - a callable returning a dictionary where the keys are the metric\n names and the values are the metric scores;\n - a dictionary with metric names as keys and callables a values.\n\n Passing multiple scores to `scoring` is more efficient than calling\n `permutation_importance` for each of the scores as it reuses\n predictions to avoid redundant computation.\n\n If None, the estimator's default scorer is used.\n\n n_repeats : int, default=5\n Number of times to permute a feature.\n\n n_jobs : int or None, default=None\n Number of jobs to run in parallel. The computation is done by computing\n permutation score for each columns and parallelized over the columns.\n `None` means 1 unless in a :obj:`joblib.parallel_backend` context.\n `-1` means using all processors. See :term:`Glossary `\n for more details.\n\n random_state : int, RandomState instance, default=None\n Pseudo-random number generator to control the permutations of each\n feature.\n Pass an int to get reproducible results across function calls.\n See :term:`Glossary `.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights used in scoring.\n\n .. versionadded:: 0.24\n\n max_samples : int or float, default=1.0\n The number of samples to draw from X to compute feature importance\n in each repeat (without replacement).\n\n - If int, then draw `max_samples` samples.\n - If float, then draw `max_samples * X.shape[0]` samples.\n - If `max_samples` is equal to `1.0` or `X.shape[0]`, all samples\n will be used.\n\n While using this option may provide less accurate importance estimates,\n it keeps the method tractable when evaluating feature importance on\n large datasets. In combination with `n_repeats`, this allows to control\n the computational speed vs statistical accuracy trade-off of this method.\n\n .. versionadded:: 1.0\n\n Returns\n -------\n result : :class:`~sklearn.utils.Bunch` or dict of such instances\n Dictionary-like object, with the following attributes.\n\n importances_mean : ndarray of shape (n_features, )\n Mean of feature importance over `n_repeats`.\n importances_std : ndarray of shape (n_features, )\n Standard deviation over `n_repeats`.\n importances : ndarray of shape (n_features, n_repeats)\n Raw permutation importance scores.\n\n If there are multiple scoring metrics in the scoring parameter\n `result` is a dict with scorer names as keys (e.g. 'roc_auc') and\n `Bunch` objects like above as values.\n\n References\n ----------\n .. [BRE] L. Breiman, \"Random Forests\", Machine Learning, 45(1), 5-32,\n 2001. 
https://doi.org/10.1023/A:1010933404324\n\n Examples\n --------\n >>> from sklearn.linear_model import LogisticRegression\n >>> from sklearn.inspection import permutation_importance\n >>> X = [[1, 9, 9],[1, 9, 9],[1, 9, 9],\n ... [0, 9, 9],[0, 9, 9],[0, 9, 9]]\n >>> y = [1, 1, 1, 0, 0, 0]\n >>> clf = LogisticRegression().fit(X, y)\n >>> result = permutation_importance(clf, X, y, n_repeats=10,\n ... random_state=0)\n >>> result.importances_mean\n array([0.4666..., 0. , 0. ])\n >>> result.importances_std\n array([0.2211..., 0. , 0. ])\n ", "source_code": "\ndef permutation_importance(estimator, X, y, *, scoring=None, n_repeats=5, n_jobs=None, random_state=None, sample_weight=None, max_samples=1.0):\n \"\"\"Permutation importance for feature evaluation [BRE]_.\n\n The :term:`estimator` is required to be a fitted estimator. `X` can be the\n data set used to train the estimator or a hold-out set. The permutation\n importance of a feature is calculated as follows. First, a baseline metric,\n defined by :term:`scoring`, is evaluated on a (potentially different)\n dataset defined by the `X`. Next, a feature column from the validation set\n is permuted and the metric is evaluated again. The permutation importance\n is defined to be the difference between the baseline metric and metric from\n permutating the feature column.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n estimator : object\n An estimator that has already been :term:`fitted` and is compatible\n with :term:`scorer`.\n\n X : ndarray or DataFrame, shape (n_samples, n_features)\n Data on which permutation importance will be computed.\n\n y : array-like or None, shape (n_samples, ) or (n_samples, n_classes)\n Targets for supervised or `None` for unsupervised.\n\n scoring : str, callable, list, tuple, or dict, default=None\n Scorer to use.\n If `scoring` represents a single score, one can use:\n\n - a single string (see :ref:`scoring_parameter`);\n - a callable (see :ref:`scoring`) that returns a single value.\n\n If `scoring` represents multiple scores, one can use:\n\n - a list or tuple of unique strings;\n - a callable returning a dictionary where the keys are the metric\n names and the values are the metric scores;\n - a dictionary with metric names as keys and callables a values.\n\n Passing multiple scores to `scoring` is more efficient than calling\n `permutation_importance` for each of the scores as it reuses\n predictions to avoid redundant computation.\n\n If None, the estimator's default scorer is used.\n\n n_repeats : int, default=5\n Number of times to permute a feature.\n\n n_jobs : int or None, default=None\n Number of jobs to run in parallel. The computation is done by computing\n permutation score for each columns and parallelized over the columns.\n `None` means 1 unless in a :obj:`joblib.parallel_backend` context.\n `-1` means using all processors. See :term:`Glossary `\n for more details.\n\n random_state : int, RandomState instance, default=None\n Pseudo-random number generator to control the permutations of each\n feature.\n Pass an int to get reproducible results across function calls.\n See :term:`Glossary `.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights used in scoring.\n\n .. 
versionadded:: 0.24\n\n max_samples : int or float, default=1.0\n The number of samples to draw from X to compute feature importance\n in each repeat (without replacement).\n\n - If int, then draw `max_samples` samples.\n - If float, then draw `max_samples * X.shape[0]` samples.\n - If `max_samples` is equal to `1.0` or `X.shape[0]`, all samples\n will be used.\n\n While using this option may provide less accurate importance estimates,\n it keeps the method tractable when evaluating feature importance on\n large datasets. In combination with `n_repeats`, this allows to control\n the computational speed vs statistical accuracy trade-off of this method.\n\n .. versionadded:: 1.0\n\n Returns\n -------\n result : :class:`~sklearn.utils.Bunch` or dict of such instances\n Dictionary-like object, with the following attributes.\n\n importances_mean : ndarray of shape (n_features, )\n Mean of feature importance over `n_repeats`.\n importances_std : ndarray of shape (n_features, )\n Standard deviation over `n_repeats`.\n importances : ndarray of shape (n_features, n_repeats)\n Raw permutation importance scores.\n\n If there are multiple scoring metrics in the scoring parameter\n `result` is a dict with scorer names as keys (e.g. 'roc_auc') and\n `Bunch` objects like above as values.\n\n References\n ----------\n .. [BRE] L. Breiman, \"Random Forests\", Machine Learning, 45(1), 5-32,\n 2001. https://doi.org/10.1023/A:1010933404324\n\n Examples\n --------\n >>> from sklearn.linear_model import LogisticRegression\n >>> from sklearn.inspection import permutation_importance\n >>> X = [[1, 9, 9],[1, 9, 9],[1, 9, 9],\n ... [0, 9, 9],[0, 9, 9],[0, 9, 9]]\n >>> y = [1, 1, 1, 0, 0, 0]\n >>> clf = LogisticRegression().fit(X, y)\n >>> result = permutation_importance(clf, X, y, n_repeats=10,\n ... random_state=0)\n >>> result.importances_mean\n array([0.4666..., 0. , 0. ])\n >>> result.importances_std\n array([0.2211..., 0. , 0. ])\n \"\"\"\n if not hasattr(X, 'iloc'):\n X = check_array(X, force_all_finite='allow-nan', dtype=None)\n random_state = check_random_state(random_state)\n random_seed = random_state.randint(np.iinfo(np.int32).max + 1)\n if not isinstance(max_samples, numbers.Integral):\n max_samples = int(max_samples * X.shape[0])\n elif not 0 < max_samples <= X.shape[0]:\n raise ValueError('max_samples must be in (0, n_samples]')\n if callable(scoring):\n scorer = scoring\n elif scoring is None or isinstance(scoring, str):\n scorer = check_scoring(estimator, scoring=scoring)\n else:\n scorers_dict = _check_multimetric_scoring(estimator, scoring)\n scorer = _MultimetricScorer(**scorers_dict)\n baseline_score = _weights_scorer(scorer, estimator, X, y, sample_weight)\n scores = Parallel(n_jobs=n_jobs)((delayed(_calculate_permutation_scores)(estimator, X, y, sample_weight, col_idx, random_seed, n_repeats, scorer, max_samples) for col_idx in range(X.shape[1])))\n if isinstance(baseline_score, dict):\n return {name: _create_importances_bunch(baseline_score[name], np.array([scores[col_idx][name] for col_idx in range(X.shape[1])])) for name in baseline_score}\n else:\n return _create_importances_bunch(baseline_score, np.array(scores))" }, { @@ -92093,7 +98022,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "pd_results", @@ -92103,7 +98033,8 @@ "docstring": { "type": "list of Bunch", "description": "Results of :func:`~sklearn.inspection.partial_dependence` for\n``features``." 
- } + }, + "refined_type": {} }, { "name": "features", @@ -92113,7 +98044,8 @@ "docstring": { "type": "list of (int,) or list of (int, int)", "description": "Indices of features for a given plot. A tuple of one integer will plot\na partial dependence curve of one feature. A tuple of two integers will\nplot a two-way partial dependence curve as a contour plot." - } + }, + "refined_type": {} }, { "name": "feature_names", @@ -92123,7 +98055,8 @@ "docstring": { "type": "list of str", "description": "Feature names corresponding to the indices in ``features``." - } + }, + "refined_type": {} }, { "name": "target_idx", @@ -92133,7 +98066,8 @@ "docstring": { "type": "int", "description": "- In a multiclass setting, specifies the class for which the PDPs\n should be computed. Note that for binary classification, the\n positive class (index 1) is always used.\n- In a multioutput setting, specifies the task for which the PDPs\n should be computed.\n\nIgnored in binary classification or classical regression settings." - } + }, + "refined_type": {} }, { "name": "pdp_lim", @@ -92143,7 +98077,8 @@ "docstring": { "type": "dict", "description": "Global min and max average predictions, such that all plots will have\nthe same scale and y limits. `pdp_lim[1]` is the global min and max for\nsingle partial dependence curves. `pdp_lim[2]` is the global min and\nmax for two-way partial dependence curves." - } + }, + "refined_type": {} }, { "name": "deciles", @@ -92153,7 +98088,8 @@ "docstring": { "type": "dict", "description": "Deciles for feature indices in ``features``." - } + }, + "refined_type": {} }, { "name": "kind", @@ -92163,6 +98099,10 @@ "docstring": { "type": "{'average', 'individual', 'both'}, default='average'", "description": " Whether to plot the partial dependence averaged across all the samples\n in the dataset or one line per sample or both.\n\n - ``kind='average'`` results in the traditional PD plot;\n - ``kind='individual'`` results in the ICE plot.\n\nNote that the fast ``method='recursion'`` option is only available for\n``kind='average'``. Plotting individual dependencies requires using the\nslower ``method='brute'`` option.\n\n .. versionadded:: 0.24" + }, + "refined_type": { + "kind": "EnumType", + "values": ["individual", "average", "both"] } }, { @@ -92173,7 +98113,8 @@ "docstring": { "type": "float, int or None, default=1000", "description": "Sampling for ICE curves when `kind` is 'individual' or 'both'.\nIf float, should be between 0.0 and 1.0 and represent the proportion\nof the dataset to be used to plot ICE curves. If int, represents the\nmaximum absolute number of samples to use.\n\nNote that the full dataset is still used to calculate partial\ndependence when `kind='both'`.\n\n.. versionadded:: 0.24" - } + }, + "refined_type": {} }, { "name": "random_state", @@ -92183,13 +98124,14 @@ "docstring": { "type": "int, RandomState instance or None, default=None", "description": "Controls the randomness of the selected samples when subsamples is not\n`None`. See :term:`Glossary ` for details.\n\n.. 
versionadded:: 0.24" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, pd_results, *, features, feature_names, target_idx, pdp_lim, deciles, kind='average', subsample=1000, random_state=None):\n self.pd_results = pd_results\n self.features = features\n self.feature_names = feature_names\n self.target_idx = target_idx\n self.pdp_lim = pdp_lim\n self.deciles = deciles\n self.kind = kind\n self.subsample = subsample\n self.random_state = random_state" }, { @@ -92207,7 +98149,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_samples", @@ -92217,7 +98160,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -92241,7 +98185,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "avg_preds", @@ -92251,7 +98196,8 @@ "docstring": { "type": "ndarray of shape (n_grid_points,)", "description": "The average predictions for all points of `feature_values` for a\ngiven feature for all samples in `X`." - } + }, + "refined_type": {} }, { "name": "feature_values", @@ -92261,7 +98207,8 @@ "docstring": { "type": "ndarray of shape (n_grid_points,)", "description": "The feature values for which the predictions have been computed." - } + }, + "refined_type": {} }, { "name": "ax", @@ -92271,7 +98218,8 @@ "docstring": { "type": "Matplotlib axes", "description": "The axis on which to plot the ICE lines." - } + }, + "refined_type": {} }, { "name": "pd_line_idx", @@ -92281,7 +98229,8 @@ "docstring": { "type": "int", "description": "The sequential index of the plot. It will be unraveled to find the\nmatching 2D position in the grid layout." - } + }, + "refined_type": {} }, { "name": "line_kw", @@ -92291,13 +98240,14 @@ "docstring": { "type": "dict", "description": "Dict with keywords passed when plotting the PD plot." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Plot the average partial dependence.", - "docstring": "Plot the average partial dependence.\n\nParameters\n----------\navg_preds : ndarray of shape (n_grid_points,)\n The average predictions for all points of `feature_values` for a\n given feature for all samples in `X`.\nfeature_values : ndarray of shape (n_grid_points,)\n The feature values for which the predictions have been computed.\nax : Matplotlib axes\n The axis on which to plot the ICE lines.\npd_line_idx : int\n The sequential index of the plot. It will be unraveled to find the\n matching 2D position in the grid layout.\nline_kw : dict\n Dict with keywords passed when plotting the PD plot.", + "docstring": "Plot the average partial dependence.\n\n Parameters\n ----------\n avg_preds : ndarray of shape (n_grid_points,)\n The average predictions for all points of `feature_values` for a\n given feature for all samples in `X`.\n feature_values : ndarray of shape (n_grid_points,)\n The feature values for which the predictions have been computed.\n ax : Matplotlib axes\n The axis on which to plot the ICE lines.\n pd_line_idx : int\n The sequential index of the plot. 
It will be unraveled to find the\n matching 2D position in the grid layout.\n line_kw : dict\n Dict with keywords passed when plotting the PD plot.\n ", "source_code": "\ndef _plot_average_dependence(self, avg_preds, feature_values, ax, pd_line_idx, line_kw):\n \"\"\"Plot the average partial dependence.\n\n Parameters\n ----------\n avg_preds : ndarray of shape (n_grid_points,)\n The average predictions for all points of `feature_values` for a\n given feature for all samples in `X`.\n feature_values : ndarray of shape (n_grid_points,)\n The feature values for which the predictions have been computed.\n ax : Matplotlib axes\n The axis on which to plot the ICE lines.\n pd_line_idx : int\n The sequential index of the plot. It will be unraveled to find the\n matching 2D position in the grid layout.\n line_kw : dict\n Dict with keywords passed when plotting the PD plot.\n \"\"\"\n line_idx = np.unravel_index(pd_line_idx, self.lines_.shape)\n self.lines_[line_idx] = ax.plot(feature_values, avg_preds, **line_kw)[0]" }, { @@ -92315,7 +98265,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "preds", @@ -92325,7 +98276,8 @@ "docstring": { "type": "ndarray of shape (n_instances, n_grid_points)", "description": "The predictions computed for all points of `feature_values` for a\ngiven feature for all samples in `X`." - } + }, + "refined_type": {} }, { "name": "feature_values", @@ -92335,7 +98287,8 @@ "docstring": { "type": "ndarray of shape (n_grid_points,)", "description": "The feature values for which the predictions have been computed." - } + }, + "refined_type": {} }, { "name": "n_ice_to_plot", @@ -92345,7 +98298,8 @@ "docstring": { "type": "int", "description": "The number of ICE lines to plot." - } + }, + "refined_type": {} }, { "name": "ax", @@ -92355,7 +98309,8 @@ "docstring": { "type": "Matplotlib axes", "description": "The axis on which to plot the ICE lines." - } + }, + "refined_type": {} }, { "name": "pd_plot_idx", @@ -92365,7 +98320,8 @@ "docstring": { "type": "int", "description": "The sequential index of the plot. It will be unraveled to find the\nmatching 2D position in the grid layout." - } + }, + "refined_type": {} }, { "name": "n_total_lines_by_plot", @@ -92375,7 +98331,8 @@ "docstring": { "type": "int", "description": "The total number of lines expected to be plot on the axis." - } + }, + "refined_type": {} }, { "name": "individual_line_kw", @@ -92385,13 +98342,14 @@ "docstring": { "type": "dict", "description": "Dict with keywords passed when plotting the ICE lines." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Plot the ICE lines.", - "docstring": "Plot the ICE lines.\n\nParameters\n----------\npreds : ndarray of shape (n_instances, n_grid_points)\n The predictions computed for all points of `feature_values` for a\n given feature for all samples in `X`.\nfeature_values : ndarray of shape (n_grid_points,)\n The feature values for which the predictions have been computed.\nn_ice_to_plot : int\n The number of ICE lines to plot.\nax : Matplotlib axes\n The axis on which to plot the ICE lines.\npd_plot_idx : int\n The sequential index of the plot. 
It will be unraveled to find the\n matching 2D position in the grid layout.\nn_total_lines_by_plot : int\n The total number of lines expected to be plot on the axis.\nindividual_line_kw : dict\n Dict with keywords passed when plotting the ICE lines.", + "docstring": "Plot the ICE lines.\n\n Parameters\n ----------\n preds : ndarray of shape (n_instances, n_grid_points)\n The predictions computed for all points of `feature_values` for a\n given feature for all samples in `X`.\n feature_values : ndarray of shape (n_grid_points,)\n The feature values for which the predictions have been computed.\n n_ice_to_plot : int\n The number of ICE lines to plot.\n ax : Matplotlib axes\n The axis on which to plot the ICE lines.\n pd_plot_idx : int\n The sequential index of the plot. It will be unraveled to find the\n matching 2D position in the grid layout.\n n_total_lines_by_plot : int\n The total number of lines expected to be plot on the axis.\n individual_line_kw : dict\n Dict with keywords passed when plotting the ICE lines.\n ", "source_code": "\ndef _plot_ice_lines(self, preds, feature_values, n_ice_to_plot, ax, pd_plot_idx, n_total_lines_by_plot, individual_line_kw):\n \"\"\"Plot the ICE lines.\n\n Parameters\n ----------\n preds : ndarray of shape (n_instances, n_grid_points)\n The predictions computed for all points of `feature_values` for a\n given feature for all samples in `X`.\n feature_values : ndarray of shape (n_grid_points,)\n The feature values for which the predictions have been computed.\n n_ice_to_plot : int\n The number of ICE lines to plot.\n ax : Matplotlib axes\n The axis on which to plot the ICE lines.\n pd_plot_idx : int\n The sequential index of the plot. It will be unraveled to find the\n matching 2D position in the grid layout.\n n_total_lines_by_plot : int\n The total number of lines expected to be plot on the axis.\n individual_line_kw : dict\n Dict with keywords passed when plotting the ICE lines.\n \"\"\"\n rng = check_random_state(self.random_state)\n ice_lines_idx = rng.choice(preds.shape[0], n_ice_to_plot, replace=False)\n ice_lines_subsampled = preds[ice_lines_idx, :]\n for (ice_idx, ice) in enumerate(ice_lines_subsampled):\n line_idx = np.unravel_index(pd_plot_idx * n_total_lines_by_plot + ice_idx, self.lines_.shape)\n self.lines_[line_idx] = ax.plot(feature_values, ice.ravel(), **individual_line_kw)[0]" }, { @@ -92409,7 +98367,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "preds", @@ -92419,7 +98378,8 @@ "docstring": { "type": "ndarray of shape (n_instances, n_grid_points) or None", "description": "The predictions computed for all points of `feature_values` for a\ngiven feature for all samples in `X`." - } + }, + "refined_type": {} }, { "name": "avg_preds", @@ -92429,7 +98389,8 @@ "docstring": { "type": "ndarray of shape (n_grid_points,)", "description": "The average predictions for all points of `feature_values` for a\ngiven feature for all samples in `X`." - } + }, + "refined_type": {} }, { "name": "feature_values", @@ -92439,7 +98400,8 @@ "docstring": { "type": "ndarray of shape (n_grid_points,)", "description": "The feature values for which the predictions have been computed." - } + }, + "refined_type": {} }, { "name": "feature_idx", @@ -92449,7 +98411,8 @@ "docstring": { "type": "int", "description": "The index corresponding to the target feature." - } + }, + "refined_type": {} }, { "name": "n_ice_lines", @@ -92459,7 +98422,8 @@ "docstring": { "type": "int", "description": "The number of ICE lines to plot." 
- } + }, + "refined_type": {} }, { "name": "ax", @@ -92469,7 +98433,8 @@ "docstring": { "type": "Matplotlib axes", "description": "The axis on which to plot the ICE and PDP lines." - } + }, + "refined_type": {} }, { "name": "n_cols", @@ -92479,7 +98444,8 @@ "docstring": { "type": "int or None", "description": "The number of column in the axis." - } + }, + "refined_type": {} }, { "name": "pd_plot_idx", @@ -92489,7 +98455,8 @@ "docstring": { "type": "int", "description": "The sequential index of the plot. It will be unraveled to find the\nmatching 2D position in the grid layout." - } + }, + "refined_type": {} }, { "name": "n_lines", @@ -92499,7 +98466,8 @@ "docstring": { "type": "int", "description": "The total number of lines expected to be plot on the axis." - } + }, + "refined_type": {} }, { "name": "ice_lines_kw", @@ -92509,7 +98477,8 @@ "docstring": { "type": "dict", "description": "Dict with keywords passed when plotting the ICE lines." - } + }, + "refined_type": {} }, { "name": "pd_line_kw", @@ -92519,13 +98488,14 @@ "docstring": { "type": "dict", "description": "Dict with keywords passed when plotting the PD plot." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Plot 1-way partial dependence: ICE and PDP.", - "docstring": "Plot 1-way partial dependence: ICE and PDP.\n\nParameters\n----------\npreds : ndarray of shape (n_instances, n_grid_points) or None\n The predictions computed for all points of `feature_values` for a\n given feature for all samples in `X`.\navg_preds : ndarray of shape (n_grid_points,)\n The average predictions for all points of `feature_values` for a\n given feature for all samples in `X`.\nfeature_values : ndarray of shape (n_grid_points,)\n The feature values for which the predictions have been computed.\nfeature_idx : int\n The index corresponding to the target feature.\nn_ice_lines : int\n The number of ICE lines to plot.\nax : Matplotlib axes\n The axis on which to plot the ICE and PDP lines.\nn_cols : int or None\n The number of column in the axis.\npd_plot_idx : int\n The sequential index of the plot. It will be unraveled to find the\n matching 2D position in the grid layout.\nn_lines : int\n The total number of lines expected to be plot on the axis.\nice_lines_kw : dict\n Dict with keywords passed when plotting the ICE lines.\npd_line_kw : dict\n Dict with keywords passed when plotting the PD plot.", + "docstring": "Plot 1-way partial dependence: ICE and PDP.\n\n Parameters\n ----------\n preds : ndarray of shape (n_instances, n_grid_points) or None\n The predictions computed for all points of `feature_values` for a\n given feature for all samples in `X`.\n avg_preds : ndarray of shape (n_grid_points,)\n The average predictions for all points of `feature_values` for a\n given feature for all samples in `X`.\n feature_values : ndarray of shape (n_grid_points,)\n The feature values for which the predictions have been computed.\n feature_idx : int\n The index corresponding to the target feature.\n n_ice_lines : int\n The number of ICE lines to plot.\n ax : Matplotlib axes\n The axis on which to plot the ICE and PDP lines.\n n_cols : int or None\n The number of column in the axis.\n pd_plot_idx : int\n The sequential index of the plot. 
It will be unraveled to find the\n matching 2D position in the grid layout.\n n_lines : int\n The total number of lines expected to be plot on the axis.\n ice_lines_kw : dict\n Dict with keywords passed when plotting the ICE lines.\n pd_line_kw : dict\n Dict with keywords passed when plotting the PD plot.\n ", "source_code": "\ndef _plot_one_way_partial_dependence(self, preds, avg_preds, feature_values, feature_idx, n_ice_lines, ax, n_cols, pd_plot_idx, n_lines, ice_lines_kw, pd_line_kw):\n \"\"\"Plot 1-way partial dependence: ICE and PDP.\n\n Parameters\n ----------\n preds : ndarray of shape (n_instances, n_grid_points) or None\n The predictions computed for all points of `feature_values` for a\n given feature for all samples in `X`.\n avg_preds : ndarray of shape (n_grid_points,)\n The average predictions for all points of `feature_values` for a\n given feature for all samples in `X`.\n feature_values : ndarray of shape (n_grid_points,)\n The feature values for which the predictions have been computed.\n feature_idx : int\n The index corresponding to the target feature.\n n_ice_lines : int\n The number of ICE lines to plot.\n ax : Matplotlib axes\n The axis on which to plot the ICE and PDP lines.\n n_cols : int or None\n The number of column in the axis.\n pd_plot_idx : int\n The sequential index of the plot. It will be unraveled to find the\n matching 2D position in the grid layout.\n n_lines : int\n The total number of lines expected to be plot on the axis.\n ice_lines_kw : dict\n Dict with keywords passed when plotting the ICE lines.\n pd_line_kw : dict\n Dict with keywords passed when plotting the PD plot.\n \"\"\"\n from matplotlib import transforms\n if self.kind in ('individual', 'both'):\n self._plot_ice_lines(preds[self.target_idx], feature_values, n_ice_lines, ax, pd_plot_idx, n_lines, ice_lines_kw)\n if self.kind in ('average', 'both'):\n if self.kind == 'average':\n pd_line_idx = pd_plot_idx\n else:\n pd_line_idx = pd_plot_idx * n_lines + n_ice_lines\n self._plot_average_dependence(avg_preds[self.target_idx].ravel(), feature_values, ax, pd_line_idx, pd_line_kw)\n trans = transforms.blended_transform_factory(ax.transData, ax.transAxes)\n vlines_idx = np.unravel_index(pd_plot_idx, self.deciles_vlines_.shape)\n self.deciles_vlines_[vlines_idx] = ax.vlines(self.deciles[feature_idx[0]], 0, 0.05, transform=trans, color='k')\n ax.set_ylim(self.pdp_lim[1])\n if not ax.get_xlabel():\n ax.set_xlabel(self.feature_names[feature_idx[0]])\n if n_cols is None or pd_plot_idx % n_cols == 0:\n if not ax.get_ylabel():\n ax.set_ylabel('Partial dependence')\n else:\n ax.set_yticklabels([])\n if pd_line_kw.get('label', None) and self.kind != 'individual':\n ax.legend()" }, { @@ -92543,7 +98513,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "avg_preds", @@ -92553,7 +98524,8 @@ "docstring": { "type": "ndarray of shape (n_instances, n_grid_points, n_grid_points)", "description": "The average predictions for all points of `feature_values[0]` and\n`feature_values[1]` for some given features for all samples in `X`." - } + }, + "refined_type": {} }, { "name": "feature_values", @@ -92563,7 +98535,8 @@ "docstring": { "type": "seq of 1d array", "description": "A sequence of array of the feature values for which the predictions\nhave been computed." 
- } + }, + "refined_type": {} }, { "name": "feature_idx", @@ -92573,7 +98546,8 @@ "docstring": { "type": "tuple of int", "description": "The indices of the target features" - } + }, + "refined_type": {} }, { "name": "ax", @@ -92583,7 +98557,8 @@ "docstring": { "type": "Matplotlib axes", "description": "The axis on which to plot the ICE and PDP lines." - } + }, + "refined_type": {} }, { "name": "pd_plot_idx", @@ -92593,7 +98568,8 @@ "docstring": { "type": "int", "description": "The sequential index of the plot. It will be unraveled to find the\nmatching 2D position in the grid layout." - } + }, + "refined_type": {} }, { "name": "Z_level", @@ -92603,7 +98579,8 @@ "docstring": { "type": "ndarray of shape (8, 8)", "description": "The Z-level used to encode the average predictions." - } + }, + "refined_type": {} }, { "name": "contour_kw", @@ -92613,13 +98590,14 @@ "docstring": { "type": "dict", "description": "Dict with keywords passed when plotting the contours." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Plot 2-way partial dependence.", - "docstring": "Plot 2-way partial dependence.\n\nParameters\n----------\navg_preds : ndarray of shape (n_instances, n_grid_points, n_grid_points)\n The average predictions for all points of `feature_values[0]` and\n `feature_values[1]` for some given features for all samples in `X`.\nfeature_values : seq of 1d array\n A sequence of array of the feature values for which the predictions\n have been computed.\nfeature_idx : tuple of int\n The indices of the target features\nax : Matplotlib axes\n The axis on which to plot the ICE and PDP lines.\npd_plot_idx : int\n The sequential index of the plot. It will be unraveled to find the\n matching 2D position in the grid layout.\nZ_level : ndarray of shape (8, 8)\n The Z-level used to encode the average predictions.\ncontour_kw : dict\n Dict with keywords passed when plotting the contours.", + "docstring": "Plot 2-way partial dependence.\n\n Parameters\n ----------\n avg_preds : ndarray of shape (n_instances, n_grid_points, n_grid_points)\n The average predictions for all points of `feature_values[0]` and\n `feature_values[1]` for some given features for all samples in `X`.\n feature_values : seq of 1d array\n A sequence of array of the feature values for which the predictions\n have been computed.\n feature_idx : tuple of int\n The indices of the target features\n ax : Matplotlib axes\n The axis on which to plot the ICE and PDP lines.\n pd_plot_idx : int\n The sequential index of the plot. It will be unraveled to find the\n matching 2D position in the grid layout.\n Z_level : ndarray of shape (8, 8)\n The Z-level used to encode the average predictions.\n contour_kw : dict\n Dict with keywords passed when plotting the contours.\n ", "source_code": "\ndef _plot_two_way_partial_dependence(self, avg_preds, feature_values, feature_idx, ax, pd_plot_idx, Z_level, contour_kw):\n \"\"\"Plot 2-way partial dependence.\n\n Parameters\n ----------\n avg_preds : ndarray of shape (n_instances, n_grid_points, n_grid_points)\n The average predictions for all points of `feature_values[0]` and\n `feature_values[1]` for some given features for all samples in `X`.\n feature_values : seq of 1d array\n A sequence of array of the feature values for which the predictions\n have been computed.\n feature_idx : tuple of int\n The indices of the target features\n ax : Matplotlib axes\n The axis on which to plot the ICE and PDP lines.\n pd_plot_idx : int\n The sequential index of the plot. 
It will be unraveled to find the\n matching 2D position in the grid layout.\n Z_level : ndarray of shape (8, 8)\n The Z-level used to encode the average predictions.\n contour_kw : dict\n Dict with keywords passed when plotting the contours.\n \"\"\"\n from matplotlib import transforms\n (XX, YY) = np.meshgrid(feature_values[0], feature_values[1])\n Z = avg_preds[self.target_idx].T\n CS = ax.contour(XX, YY, Z, levels=Z_level, linewidths=0.5, colors='k')\n contour_idx = np.unravel_index(pd_plot_idx, self.contours_.shape)\n self.contours_[contour_idx] = ax.contourf(XX, YY, Z, levels=Z_level, vmax=Z_level[-1], vmin=Z_level[0], **contour_kw)\n ax.clabel(CS, fmt='%2.2f', colors='k', fontsize=10, inline=True)\n trans = transforms.blended_transform_factory(ax.transData, ax.transAxes)\n (xlim, ylim) = (ax.get_xlim(), ax.get_ylim())\n vlines_idx = np.unravel_index(pd_plot_idx, self.deciles_vlines_.shape)\n self.deciles_vlines_[vlines_idx] = ax.vlines(self.deciles[feature_idx[0]], 0, 0.05, transform=trans, color='k')\n hlines_idx = np.unravel_index(pd_plot_idx, self.deciles_hlines_.shape)\n self.deciles_hlines_[hlines_idx] = ax.hlines(self.deciles[feature_idx[1]], 0, 0.05, transform=trans, color='k')\n ax.set_xlim(xlim)\n ax.set_ylim(ylim)\n if not ax.get_xlabel():\n ax.set_xlabel(self.feature_names[feature_idx[0]])\n ax.set_ylabel(self.feature_names[feature_idx[1]])" }, { @@ -92637,7 +98615,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "estimator", @@ -92647,7 +98626,8 @@ "docstring": { "type": "BaseEstimator", "description": "A fitted estimator object implementing :term:`predict`,\n:term:`predict_proba`, or :term:`decision_function`.\nMultioutput-multiclass classifiers are not supported." - } + }, + "refined_type": {} }, { "name": "X", @@ -92657,6 +98637,10 @@ "docstring": { "type": "{array-like, dataframe} of shape (n_samples, n_features)", "description": "``X`` is used to generate a grid of values for the target\n``features`` (where the partial dependence will be evaluated), and\nalso to generate values for the complement features when the\n`method` is `'brute'`." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -92667,6 +98651,10 @@ "docstring": { "type": "list of {int, str, pair of int, pair of str}", "description": "The target features for which to create the PDPs.\nIf `features[i]` is an integer or a string, a one-way PDP is created;\nif `features[i]` is a tuple, a two-way PDP is created (only supported\nwith `kind='average'`). Each tuple must be of size 2.\nif any entry is a string, then it must be in ``feature_names``." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -92677,7 +98665,8 @@ "docstring": { "type": "array-like of shape (n_features,), dtype=str, default=None", "description": "Name of each feature; `feature_names[i]` holds the name of the feature\nwith index `i`.\nBy default, the name of the feature corresponds to their numerical\nindex for NumPy array and their column name for pandas dataframe." - } + }, + "refined_type": {} }, { "name": "target", @@ -92687,7 +98676,8 @@ "docstring": { "type": "int, default=None", "description": "- In a multiclass setting, specifies the class for which the PDPs\n should be computed. Note that for binary classification, the\n positive class (index 1) is always used.\n- In a multioutput setting, specifies the task for which the PDPs\n should be computed.\n\nIgnored in binary classification or classical regression settings." 
- } + }, + "refined_type": {} }, { "name": "response_method", @@ -92697,6 +98687,10 @@ "docstring": { "type": "{'auto', 'predict_proba', 'decision_function'}, default='auto'", "description": "Specifies whether to use :term:`predict_proba` or\n:term:`decision_function` as the target response. For regressors\nthis parameter is ignored and the response is always the output of\n:term:`predict`. By default, :term:`predict_proba` is tried first\nand we revert to :term:`decision_function` if it doesn't exist. If\n``method`` is `'recursion'`, the response is always the output of\n:term:`decision_function`." + }, + "refined_type": { + "kind": "EnumType", + "values": ["auto", "decision_function", "predict_proba"] } }, { @@ -92707,7 +98701,8 @@ "docstring": { "type": "int, default=3", "description": "The maximum number of columns in the grid plot. Only active when `ax`\nis a single axis or `None`." - } + }, + "refined_type": {} }, { "name": "grid_resolution", @@ -92717,7 +98712,8 @@ "docstring": { "type": "int, default=100", "description": "The number of equally spaced points on the axes of the plots, for each\ntarget feature." - } + }, + "refined_type": {} }, { "name": "percentiles", @@ -92727,7 +98723,8 @@ "docstring": { "type": "tuple of float, default=(0.05, 0.95)", "description": "The lower and upper percentile used to create the extreme values\nfor the PDP axes. Must be in [0, 1]." - } + }, + "refined_type": {} }, { "name": "method", @@ -92737,7 +98734,8 @@ "docstring": { "type": "str, default='auto'", "description": "The method used to calculate the averaged predictions:\n\n- `'recursion'` is only supported for some tree-based estimators\n (namely\n :class:`~sklearn.ensemble.GradientBoostingClassifier`,\n :class:`~sklearn.ensemble.GradientBoostingRegressor`,\n :class:`~sklearn.ensemble.HistGradientBoostingClassifier`,\n :class:`~sklearn.ensemble.HistGradientBoostingRegressor`,\n :class:`~sklearn.tree.DecisionTreeRegressor`,\n :class:`~sklearn.ensemble.RandomForestRegressor`\n but is more efficient in terms of speed.\n With this method, the target response of a\n classifier is always the decision function, not the predicted\n probabilities. Since the `'recursion'` method implicitly computes\n the average of the ICEs by design, it is not compatible with ICE and\n thus `kind` must be `'average'`.\n\n- `'brute'` is supported for any estimator, but is more\n computationally intensive.\n\n- `'auto'`: the `'recursion'` is used for estimators that support it,\n and `'brute'` is used otherwise.\n\nPlease see :ref:`this note ` for\ndifferences between the `'brute'` and `'recursion'` method." - } + }, + "refined_type": {} }, { "name": "n_jobs", @@ -92747,7 +98745,8 @@ "docstring": { "type": "int, default=None", "description": "The number of CPUs to use to compute the partial dependences.\nComputation is parallelized over features specified by the `features`\nparameter.\n\n``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n``-1`` means using all processors. See :term:`Glossary `\nfor more details." - } + }, + "refined_type": {} }, { "name": "verbose", @@ -92757,7 +98756,8 @@ "docstring": { "type": "int, default=0", "description": "Verbose output during PD computations." - } + }, + "refined_type": {} }, { "name": "line_kw", @@ -92767,7 +98767,8 @@ "docstring": { "type": "dict, default=None", "description": "Dict with keywords passed to the ``matplotlib.pyplot.plot`` call.\nFor one-way partial dependence plots. 
It can be used to define common\nproperties for both `ice_lines_kw` and `pdp_line_kw`." - } + }, + "refined_type": {} }, { "name": "ice_lines_kw", @@ -92777,7 +98778,8 @@ "docstring": { "type": "dict, default=None", "description": "Dictionary with keywords passed to the `matplotlib.pyplot.plot` call.\nFor ICE lines in the one-way partial dependence plots.\nThe key value pairs defined in `ice_lines_kw` takes priority over\n`line_kw`." - } + }, + "refined_type": {} }, { "name": "pd_line_kw", @@ -92787,7 +98789,8 @@ "docstring": { "type": "dict, default=None", "description": "Dictionary with keywords passed to the `matplotlib.pyplot.plot` call.\nFor partial dependence in one-way partial dependence plots.\nThe key value pairs defined in `pd_line_kw` takes priority over\n`line_kw`." - } + }, + "refined_type": {} }, { "name": "contour_kw", @@ -92797,7 +98800,8 @@ "docstring": { "type": "dict, default=None", "description": "Dict with keywords passed to the ``matplotlib.pyplot.contourf`` call.\nFor two-way partial dependence plots." - } + }, + "refined_type": {} }, { "name": "ax", @@ -92807,7 +98811,8 @@ "docstring": { "type": "Matplotlib axes or array-like of Matplotlib axes, default=None", "description": "- If a single axis is passed in, it is treated as a bounding axes\n and a grid of partial dependence plots will be drawn within\n these bounds. The `n_cols` parameter controls the number of\n columns in the grid.\n- If an array-like of axes are passed in, the partial dependence\n plots will be drawn directly into these axes.\n- If `None`, a figure and a bounding axes is created and treated\n as the single axes case." - } + }, + "refined_type": {} }, { "name": "kind", @@ -92817,6 +98822,10 @@ "docstring": { "type": "{'average', 'individual', 'both'}, default='average'", "description": " Whether to plot the partial dependence averaged across all the samples\n in the dataset or one line per sample or both.\n\n - ``kind='average'`` results in the traditional PD plot;\n - ``kind='individual'`` results in the ICE plot.\n\nNote that the fast ``method='recursion'`` option is only available for\n``kind='average'``. Plotting individual dependencies requires using the\nslower ``method='brute'`` option." + }, + "refined_type": { + "kind": "EnumType", + "values": ["individual", "average", "both"] } }, { @@ -92827,7 +98836,8 @@ "docstring": { "type": "float, int or None, default=1000", "description": "Sampling for ICE curves when `kind` is 'individual' or 'both'.\nIf `float`, should be between 0.0 and 1.0 and represent the proportion\nof the dataset to be used to plot ICE curves. If `int`, represents the\nabsolute number samples to use.\n\nNote that the full dataset is still used to calculate averaged partial\ndependence when `kind='both'`." - } + }, + "refined_type": {} }, { "name": "random_state", @@ -92837,13 +98847,14 @@ "docstring": { "type": "int, RandomState instance or None, default=None", "description": "Controls the randomness of the selected samples when subsamples is not\n`None` and `kind` is either `'both'` or `'individual'`.\nSee :term:`Glossary ` for details." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Partial dependence (PD) and individual conditional expectation (ICE) plots.\n\nPartial dependence plots, individual conditional expectation plots or an overlay of both of them can be plotted by setting the ``kind`` parameter. The ``len(features)`` plots are arranged in a grid with ``n_cols`` columns. 
Two-way partial dependence plots are plotted as contour plots. The deciles of the feature values will be shown with tick marks on the x-axes for one-way plots, and on both axes for two-way plots. Read more in the :ref:`User Guide `. .. note:: :func:`PartialDependenceDisplay.from_estimator` does not support using the same axes with multiple calls. To plot the the partial dependence for multiple estimators, please pass the axes created by the first call to the second call:: >>> from sklearn.inspection import PartialDependenceDisplay >>> from sklearn.datasets import make_friedman1 >>> from sklearn.linear_model import LinearRegression >>> from sklearn.ensemble import RandomForestRegressor >>> X, y = make_friedman1() >>> est1 = LinearRegression().fit(X, y) >>> est2 = RandomForestRegressor().fit(X, y) >>> disp1 = PartialDependenceDisplay.from_estimator(est1, X, ... [1, 2]) >>> disp2 = PartialDependenceDisplay.from_estimator(est2, X, [1, 2], ... ax=disp1.axes_) .. warning:: For :class:`~sklearn.ensemble.GradientBoostingClassifier` and :class:`~sklearn.ensemble.GradientBoostingRegressor`, the `'recursion'` method (used by default) will not account for the `init` predictor of the boosting process. In practice, this will produce the same values as `'brute'` up to a constant offset in the target response, provided that `init` is a constant estimator (which is the default). However, if `init` is not a constant estimator, the partial dependence values are incorrect for `'recursion'` because the offset will be sample-dependent. It is preferable to use the `'brute'` method. Note that this only applies to :class:`~sklearn.ensemble.GradientBoostingClassifier` and :class:`~sklearn.ensemble.GradientBoostingRegressor`, not to :class:`~sklearn.ensemble.HistGradientBoostingClassifier` and :class:`~sklearn.ensemble.HistGradientBoostingRegressor`. .. versionadded:: 1.0", - "docstring": "Partial dependence (PD) and individual conditional expectation (ICE) plots.\n\nPartial dependence plots, individual conditional expectation plots or an\noverlay of both of them can be plotted by setting the ``kind``\nparameter. The ``len(features)`` plots are arranged in a grid with\n``n_cols`` columns. Two-way partial dependence plots are plotted as\ncontour plots. The deciles of the feature values will be shown with tick\nmarks on the x-axes for one-way plots, and on both axes for two-way\nplots.\n\nRead more in the :ref:`User Guide `.\n\n.. note::\n\n :func:`PartialDependenceDisplay.from_estimator` does not support using the\n same axes with multiple calls. To plot the the partial dependence for\n multiple estimators, please pass the axes created by the first call to the\n second call::\n\n >>> from sklearn.inspection import PartialDependenceDisplay\n >>> from sklearn.datasets import make_friedman1\n >>> from sklearn.linear_model import LinearRegression\n >>> from sklearn.ensemble import RandomForestRegressor\n >>> X, y = make_friedman1()\n >>> est1 = LinearRegression().fit(X, y)\n >>> est2 = RandomForestRegressor().fit(X, y)\n >>> disp1 = PartialDependenceDisplay.from_estimator(est1, X,\n ... [1, 2])\n >>> disp2 = PartialDependenceDisplay.from_estimator(est2, X, [1, 2],\n ... ax=disp1.axes_)\n\n.. warning::\n\n For :class:`~sklearn.ensemble.GradientBoostingClassifier` and\n :class:`~sklearn.ensemble.GradientBoostingRegressor`, the\n `'recursion'` method (used by default) will not account for the `init`\n predictor of the boosting process. 
In practice, this will produce\n the same values as `'brute'` up to a constant offset in the target\n response, provided that `init` is a constant estimator (which is the\n default). However, if `init` is not a constant estimator, the\n partial dependence values are incorrect for `'recursion'` because the\n offset will be sample-dependent. It is preferable to use the `'brute'`\n method. Note that this only applies to\n :class:`~sklearn.ensemble.GradientBoostingClassifier` and\n :class:`~sklearn.ensemble.GradientBoostingRegressor`, not to\n :class:`~sklearn.ensemble.HistGradientBoostingClassifier` and\n :class:`~sklearn.ensemble.HistGradientBoostingRegressor`.\n\n.. versionadded:: 1.0\n\nParameters\n----------\nestimator : BaseEstimator\n A fitted estimator object implementing :term:`predict`,\n :term:`predict_proba`, or :term:`decision_function`.\n Multioutput-multiclass classifiers are not supported.\n\nX : {array-like, dataframe} of shape (n_samples, n_features)\n ``X`` is used to generate a grid of values for the target\n ``features`` (where the partial dependence will be evaluated), and\n also to generate values for the complement features when the\n `method` is `'brute'`.\n\nfeatures : list of {int, str, pair of int, pair of str}\n The target features for which to create the PDPs.\n If `features[i]` is an integer or a string, a one-way PDP is created;\n if `features[i]` is a tuple, a two-way PDP is created (only supported\n with `kind='average'`). Each tuple must be of size 2.\n if any entry is a string, then it must be in ``feature_names``.\n\nfeature_names : array-like of shape (n_features,), dtype=str, default=None\n Name of each feature; `feature_names[i]` holds the name of the feature\n with index `i`.\n By default, the name of the feature corresponds to their numerical\n index for NumPy array and their column name for pandas dataframe.\n\ntarget : int, default=None\n - In a multiclass setting, specifies the class for which the PDPs\n should be computed. Note that for binary classification, the\n positive class (index 1) is always used.\n - In a multioutput setting, specifies the task for which the PDPs\n should be computed.\n\n Ignored in binary classification or classical regression settings.\n\nresponse_method : {'auto', 'predict_proba', 'decision_function'}, default='auto'\n Specifies whether to use :term:`predict_proba` or\n :term:`decision_function` as the target response. For regressors\n this parameter is ignored and the response is always the output of\n :term:`predict`. By default, :term:`predict_proba` is tried first\n and we revert to :term:`decision_function` if it doesn't exist. If\n ``method`` is `'recursion'`, the response is always the output of\n :term:`decision_function`.\n\nn_cols : int, default=3\n The maximum number of columns in the grid plot. Only active when `ax`\n is a single axis or `None`.\n\ngrid_resolution : int, default=100\n The number of equally spaced points on the axes of the plots, for each\n target feature.\n\npercentiles : tuple of float, default=(0.05, 0.95)\n The lower and upper percentile used to create the extreme values\n for the PDP axes. 
Must be in [0, 1].\n\nmethod : str, default='auto'\n The method used to calculate the averaged predictions:\n\n - `'recursion'` is only supported for some tree-based estimators\n (namely\n :class:`~sklearn.ensemble.GradientBoostingClassifier`,\n :class:`~sklearn.ensemble.GradientBoostingRegressor`,\n :class:`~sklearn.ensemble.HistGradientBoostingClassifier`,\n :class:`~sklearn.ensemble.HistGradientBoostingRegressor`,\n :class:`~sklearn.tree.DecisionTreeRegressor`,\n :class:`~sklearn.ensemble.RandomForestRegressor`\n but is more efficient in terms of speed.\n With this method, the target response of a\n classifier is always the decision function, not the predicted\n probabilities. Since the `'recursion'` method implicitly computes\n the average of the ICEs by design, it is not compatible with ICE and\n thus `kind` must be `'average'`.\n\n - `'brute'` is supported for any estimator, but is more\n computationally intensive.\n\n - `'auto'`: the `'recursion'` is used for estimators that support it,\n and `'brute'` is used otherwise.\n\n Please see :ref:`this note ` for\n differences between the `'brute'` and `'recursion'` method.\n\nn_jobs : int, default=None\n The number of CPUs to use to compute the partial dependences.\n Computation is parallelized over features specified by the `features`\n parameter.\n\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\nverbose : int, default=0\n Verbose output during PD computations.\n\nline_kw : dict, default=None\n Dict with keywords passed to the ``matplotlib.pyplot.plot`` call.\n For one-way partial dependence plots. It can be used to define common\n properties for both `ice_lines_kw` and `pdp_line_kw`.\n\nice_lines_kw : dict, default=None\n Dictionary with keywords passed to the `matplotlib.pyplot.plot` call.\n For ICE lines in the one-way partial dependence plots.\n The key value pairs defined in `ice_lines_kw` takes priority over\n `line_kw`.\n\npd_line_kw : dict, default=None\n Dictionary with keywords passed to the `matplotlib.pyplot.plot` call.\n For partial dependence in one-way partial dependence plots.\n The key value pairs defined in `pd_line_kw` takes priority over\n `line_kw`.\n\ncontour_kw : dict, default=None\n Dict with keywords passed to the ``matplotlib.pyplot.contourf`` call.\n For two-way partial dependence plots.\n\nax : Matplotlib axes or array-like of Matplotlib axes, default=None\n - If a single axis is passed in, it is treated as a bounding axes\n and a grid of partial dependence plots will be drawn within\n these bounds. The `n_cols` parameter controls the number of\n columns in the grid.\n - If an array-like of axes are passed in, the partial dependence\n plots will be drawn directly into these axes.\n - If `None`, a figure and a bounding axes is created and treated\n as the single axes case.\n\nkind : {'average', 'individual', 'both'}, default='average'\n Whether to plot the partial dependence averaged across all the samples\n in the dataset or one line per sample or both.\n\n - ``kind='average'`` results in the traditional PD plot;\n - ``kind='individual'`` results in the ICE plot.\n\n Note that the fast ``method='recursion'`` option is only available for\n ``kind='average'``. 
Plotting individual dependencies requires using the\n slower ``method='brute'`` option.\n\nsubsample : float, int or None, default=1000\n Sampling for ICE curves when `kind` is 'individual' or 'both'.\n If `float`, should be between 0.0 and 1.0 and represent the proportion\n of the dataset to be used to plot ICE curves. If `int`, represents the\n absolute number samples to use.\n\n Note that the full dataset is still used to calculate averaged partial\n dependence when `kind='both'`.\n\nrandom_state : int, RandomState instance or None, default=None\n Controls the randomness of the selected samples when subsamples is not\n `None` and `kind` is either `'both'` or `'individual'`.\n See :term:`Glossary ` for details.\n\nReturns\n-------\ndisplay : :class:`~sklearn.inspection.PartialDependenceDisplay`\n\nSee Also\n--------\npartial_dependence : Compute Partial Dependence values.\n\nExamples\n--------\n>>> import matplotlib.pyplot as plt\n>>> from sklearn.datasets import make_friedman1\n>>> from sklearn.ensemble import GradientBoostingRegressor\n>>> from sklearn.inspection import PartialDependenceDisplay\n>>> X, y = make_friedman1()\n>>> clf = GradientBoostingRegressor(n_estimators=10).fit(X, y)\n>>> PartialDependenceDisplay.from_estimator(clf, X, [0, (0, 1)])\n<...>\n>>> plt.show()", + "description": "Partial dependence (PD) and individual conditional expectation (ICE) plots.\n\nPartial dependence plots, individual conditional expectation plots or an\noverlay of both of them can be plotted by setting the ``kind``\nparameter. The ``len(features)`` plots are arranged in a grid with\n``n_cols`` columns. Two-way partial dependence plots are plotted as\ncontour plots. The deciles of the feature values will be shown with tick\nmarks on the x-axes for one-way plots, and on both axes for two-way\nplots.\n\nRead more in the :ref:`User Guide `.\n\n.. note::\n\n :func:`PartialDependenceDisplay.from_estimator` does not support using the\n same axes with multiple calls. To plot the the partial dependence for\n multiple estimators, please pass the axes created by the first call to the\n second call::\n\n >>> from sklearn.inspection import PartialDependenceDisplay\n >>> from sklearn.datasets import make_friedman1\n >>> from sklearn.linear_model import LinearRegression\n >>> from sklearn.ensemble import RandomForestRegressor\n >>> X, y = make_friedman1()\n >>> est1 = LinearRegression().fit(X, y)\n >>> est2 = RandomForestRegressor().fit(X, y)\n >>> disp1 = PartialDependenceDisplay.from_estimator(est1, X,\n ... [1, 2])\n >>> disp2 = PartialDependenceDisplay.from_estimator(est2, X, [1, 2],\n ... ax=disp1.axes_)\n\n.. warning::\n\n For :class:`~sklearn.ensemble.GradientBoostingClassifier` and\n :class:`~sklearn.ensemble.GradientBoostingRegressor`, the\n `'recursion'` method (used by default) will not account for the `init`\n predictor of the boosting process. In practice, this will produce\n the same values as `'brute'` up to a constant offset in the target\n response, provided that `init` is a constant estimator (which is the\n default). However, if `init` is not a constant estimator, the\n partial dependence values are incorrect for `'recursion'` because the\n offset will be sample-dependent. It is preferable to use the `'brute'`\n method. 
Note that this only applies to\n :class:`~sklearn.ensemble.GradientBoostingClassifier` and\n :class:`~sklearn.ensemble.GradientBoostingRegressor`, not to\n :class:`~sklearn.ensemble.HistGradientBoostingClassifier` and\n :class:`~sklearn.ensemble.HistGradientBoostingRegressor`.\n\n.. versionadded:: 1.0", + "docstring": "Partial dependence (PD) and individual conditional expectation (ICE) plots.\n\n Partial dependence plots, individual conditional expectation plots or an\n overlay of both of them can be plotted by setting the ``kind``\n parameter. The ``len(features)`` plots are arranged in a grid with\n ``n_cols`` columns. Two-way partial dependence plots are plotted as\n contour plots. The deciles of the feature values will be shown with tick\n marks on the x-axes for one-way plots, and on both axes for two-way\n plots.\n\n Read more in the :ref:`User Guide `.\n\n .. note::\n\n :func:`PartialDependenceDisplay.from_estimator` does not support using the\n same axes with multiple calls. To plot the the partial dependence for\n multiple estimators, please pass the axes created by the first call to the\n second call::\n\n >>> from sklearn.inspection import PartialDependenceDisplay\n >>> from sklearn.datasets import make_friedman1\n >>> from sklearn.linear_model import LinearRegression\n >>> from sklearn.ensemble import RandomForestRegressor\n >>> X, y = make_friedman1()\n >>> est1 = LinearRegression().fit(X, y)\n >>> est2 = RandomForestRegressor().fit(X, y)\n >>> disp1 = PartialDependenceDisplay.from_estimator(est1, X,\n ... [1, 2])\n >>> disp2 = PartialDependenceDisplay.from_estimator(est2, X, [1, 2],\n ... ax=disp1.axes_)\n\n .. warning::\n\n For :class:`~sklearn.ensemble.GradientBoostingClassifier` and\n :class:`~sklearn.ensemble.GradientBoostingRegressor`, the\n `'recursion'` method (used by default) will not account for the `init`\n predictor of the boosting process. In practice, this will produce\n the same values as `'brute'` up to a constant offset in the target\n response, provided that `init` is a constant estimator (which is the\n default). However, if `init` is not a constant estimator, the\n partial dependence values are incorrect for `'recursion'` because the\n offset will be sample-dependent. It is preferable to use the `'brute'`\n method. Note that this only applies to\n :class:`~sklearn.ensemble.GradientBoostingClassifier` and\n :class:`~sklearn.ensemble.GradientBoostingRegressor`, not to\n :class:`~sklearn.ensemble.HistGradientBoostingClassifier` and\n :class:`~sklearn.ensemble.HistGradientBoostingRegressor`.\n\n .. versionadded:: 1.0\n\n Parameters\n ----------\n estimator : BaseEstimator\n A fitted estimator object implementing :term:`predict`,\n :term:`predict_proba`, or :term:`decision_function`.\n Multioutput-multiclass classifiers are not supported.\n\n X : {array-like, dataframe} of shape (n_samples, n_features)\n ``X`` is used to generate a grid of values for the target\n ``features`` (where the partial dependence will be evaluated), and\n also to generate values for the complement features when the\n `method` is `'brute'`.\n\n features : list of {int, str, pair of int, pair of str}\n The target features for which to create the PDPs.\n If `features[i]` is an integer or a string, a one-way PDP is created;\n if `features[i]` is a tuple, a two-way PDP is created (only supported\n with `kind='average'`). 
Each tuple must be of size 2.\n if any entry is a string, then it must be in ``feature_names``.\n\n feature_names : array-like of shape (n_features,), dtype=str, default=None\n Name of each feature; `feature_names[i]` holds the name of the feature\n with index `i`.\n By default, the name of the feature corresponds to their numerical\n index for NumPy array and their column name for pandas dataframe.\n\n target : int, default=None\n - In a multiclass setting, specifies the class for which the PDPs\n should be computed. Note that for binary classification, the\n positive class (index 1) is always used.\n - In a multioutput setting, specifies the task for which the PDPs\n should be computed.\n\n Ignored in binary classification or classical regression settings.\n\n response_method : {'auto', 'predict_proba', 'decision_function'}, default='auto'\n Specifies whether to use :term:`predict_proba` or\n :term:`decision_function` as the target response. For regressors\n this parameter is ignored and the response is always the output of\n :term:`predict`. By default, :term:`predict_proba` is tried first\n and we revert to :term:`decision_function` if it doesn't exist. If\n ``method`` is `'recursion'`, the response is always the output of\n :term:`decision_function`.\n\n n_cols : int, default=3\n The maximum number of columns in the grid plot. Only active when `ax`\n is a single axis or `None`.\n\n grid_resolution : int, default=100\n The number of equally spaced points on the axes of the plots, for each\n target feature.\n\n percentiles : tuple of float, default=(0.05, 0.95)\n The lower and upper percentile used to create the extreme values\n for the PDP axes. Must be in [0, 1].\n\n method : str, default='auto'\n The method used to calculate the averaged predictions:\n\n - `'recursion'` is only supported for some tree-based estimators\n (namely\n :class:`~sklearn.ensemble.GradientBoostingClassifier`,\n :class:`~sklearn.ensemble.GradientBoostingRegressor`,\n :class:`~sklearn.ensemble.HistGradientBoostingClassifier`,\n :class:`~sklearn.ensemble.HistGradientBoostingRegressor`,\n :class:`~sklearn.tree.DecisionTreeRegressor`,\n :class:`~sklearn.ensemble.RandomForestRegressor`\n but is more efficient in terms of speed.\n With this method, the target response of a\n classifier is always the decision function, not the predicted\n probabilities. Since the `'recursion'` method implicitly computes\n the average of the ICEs by design, it is not compatible with ICE and\n thus `kind` must be `'average'`.\n\n - `'brute'` is supported for any estimator, but is more\n computationally intensive.\n\n - `'auto'`: the `'recursion'` is used for estimators that support it,\n and `'brute'` is used otherwise.\n\n Please see :ref:`this note ` for\n differences between the `'brute'` and `'recursion'` method.\n\n n_jobs : int, default=None\n The number of CPUs to use to compute the partial dependences.\n Computation is parallelized over features specified by the `features`\n parameter.\n\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n verbose : int, default=0\n Verbose output during PD computations.\n\n line_kw : dict, default=None\n Dict with keywords passed to the ``matplotlib.pyplot.plot`` call.\n For one-way partial dependence plots. 
It can be used to define common\n properties for both `ice_lines_kw` and `pdp_line_kw`.\n\n ice_lines_kw : dict, default=None\n Dictionary with keywords passed to the `matplotlib.pyplot.plot` call.\n For ICE lines in the one-way partial dependence plots.\n The key value pairs defined in `ice_lines_kw` takes priority over\n `line_kw`.\n\n pd_line_kw : dict, default=None\n Dictionary with keywords passed to the `matplotlib.pyplot.plot` call.\n For partial dependence in one-way partial dependence plots.\n The key value pairs defined in `pd_line_kw` takes priority over\n `line_kw`.\n\n contour_kw : dict, default=None\n Dict with keywords passed to the ``matplotlib.pyplot.contourf`` call.\n For two-way partial dependence plots.\n\n ax : Matplotlib axes or array-like of Matplotlib axes, default=None\n - If a single axis is passed in, it is treated as a bounding axes\n and a grid of partial dependence plots will be drawn within\n these bounds. The `n_cols` parameter controls the number of\n columns in the grid.\n - If an array-like of axes are passed in, the partial dependence\n plots will be drawn directly into these axes.\n - If `None`, a figure and a bounding axes is created and treated\n as the single axes case.\n\n kind : {'average', 'individual', 'both'}, default='average'\n Whether to plot the partial dependence averaged across all the samples\n in the dataset or one line per sample or both.\n\n - ``kind='average'`` results in the traditional PD plot;\n - ``kind='individual'`` results in the ICE plot.\n\n Note that the fast ``method='recursion'`` option is only available for\n ``kind='average'``. Plotting individual dependencies requires using the\n slower ``method='brute'`` option.\n\n subsample : float, int or None, default=1000\n Sampling for ICE curves when `kind` is 'individual' or 'both'.\n If `float`, should be between 0.0 and 1.0 and represent the proportion\n of the dataset to be used to plot ICE curves. If `int`, represents the\n absolute number samples to use.\n\n Note that the full dataset is still used to calculate averaged partial\n dependence when `kind='both'`.\n\n random_state : int, RandomState instance or None, default=None\n Controls the randomness of the selected samples when subsamples is not\n `None` and `kind` is either `'both'` or `'individual'`.\n See :term:`Glossary ` for details.\n\n Returns\n -------\n display : :class:`~sklearn.inspection.PartialDependenceDisplay`\n\n See Also\n --------\n partial_dependence : Compute Partial Dependence values.\n\n Examples\n --------\n >>> import matplotlib.pyplot as plt\n >>> from sklearn.datasets import make_friedman1\n >>> from sklearn.ensemble import GradientBoostingRegressor\n >>> from sklearn.inspection import PartialDependenceDisplay\n >>> X, y = make_friedman1()\n >>> clf = GradientBoostingRegressor(n_estimators=10).fit(X, y)\n >>> PartialDependenceDisplay.from_estimator(clf, X, [0, (0, 1)])\n <...>\n >>> plt.show()\n ", "source_code": "\n@classmethod\ndef from_estimator(cls, estimator, X, features, *, feature_names=None, target=None, response_method='auto', n_cols=3, grid_resolution=100, percentiles=(0.05, 0.95), method='auto', n_jobs=None, verbose=0, line_kw=None, ice_lines_kw=None, pd_line_kw=None, contour_kw=None, ax=None, kind='average', subsample=1000, random_state=None):\n \"\"\"Partial dependence (PD) and individual conditional expectation (ICE) plots.\n\n Partial dependence plots, individual conditional expectation plots or an\n overlay of both of them can be plotted by setting the ``kind``\n parameter. 
The ``len(features)`` plots are arranged in a grid with\n ``n_cols`` columns. Two-way partial dependence plots are plotted as\n contour plots. The deciles of the feature values will be shown with tick\n marks on the x-axes for one-way plots, and on both axes for two-way\n plots.\n\n Read more in the :ref:`User Guide `.\n\n .. note::\n\n :func:`PartialDependenceDisplay.from_estimator` does not support using the\n same axes with multiple calls. To plot the the partial dependence for\n multiple estimators, please pass the axes created by the first call to the\n second call::\n\n >>> from sklearn.inspection import PartialDependenceDisplay\n >>> from sklearn.datasets import make_friedman1\n >>> from sklearn.linear_model import LinearRegression\n >>> from sklearn.ensemble import RandomForestRegressor\n >>> X, y = make_friedman1()\n >>> est1 = LinearRegression().fit(X, y)\n >>> est2 = RandomForestRegressor().fit(X, y)\n >>> disp1 = PartialDependenceDisplay.from_estimator(est1, X,\n ... [1, 2])\n >>> disp2 = PartialDependenceDisplay.from_estimator(est2, X, [1, 2],\n ... ax=disp1.axes_)\n\n .. warning::\n\n For :class:`~sklearn.ensemble.GradientBoostingClassifier` and\n :class:`~sklearn.ensemble.GradientBoostingRegressor`, the\n `'recursion'` method (used by default) will not account for the `init`\n predictor of the boosting process. In practice, this will produce\n the same values as `'brute'` up to a constant offset in the target\n response, provided that `init` is a constant estimator (which is the\n default). However, if `init` is not a constant estimator, the\n partial dependence values are incorrect for `'recursion'` because the\n offset will be sample-dependent. It is preferable to use the `'brute'`\n method. Note that this only applies to\n :class:`~sklearn.ensemble.GradientBoostingClassifier` and\n :class:`~sklearn.ensemble.GradientBoostingRegressor`, not to\n :class:`~sklearn.ensemble.HistGradientBoostingClassifier` and\n :class:`~sklearn.ensemble.HistGradientBoostingRegressor`.\n\n .. versionadded:: 1.0\n\n Parameters\n ----------\n estimator : BaseEstimator\n A fitted estimator object implementing :term:`predict`,\n :term:`predict_proba`, or :term:`decision_function`.\n Multioutput-multiclass classifiers are not supported.\n\n X : {array-like, dataframe} of shape (n_samples, n_features)\n ``X`` is used to generate a grid of values for the target\n ``features`` (where the partial dependence will be evaluated), and\n also to generate values for the complement features when the\n `method` is `'brute'`.\n\n features : list of {int, str, pair of int, pair of str}\n The target features for which to create the PDPs.\n If `features[i]` is an integer or a string, a one-way PDP is created;\n if `features[i]` is a tuple, a two-way PDP is created (only supported\n with `kind='average'`). Each tuple must be of size 2.\n if any entry is a string, then it must be in ``feature_names``.\n\n feature_names : array-like of shape (n_features,), dtype=str, default=None\n Name of each feature; `feature_names[i]` holds the name of the feature\n with index `i`.\n By default, the name of the feature corresponds to their numerical\n index for NumPy array and their column name for pandas dataframe.\n\n target : int, default=None\n - In a multiclass setting, specifies the class for which the PDPs\n should be computed. 
Note that for binary classification, the\n positive class (index 1) is always used.\n - In a multioutput setting, specifies the task for which the PDPs\n should be computed.\n\n Ignored in binary classification or classical regression settings.\n\n response_method : {'auto', 'predict_proba', 'decision_function'}, default='auto'\n Specifies whether to use :term:`predict_proba` or\n :term:`decision_function` as the target response. For regressors\n this parameter is ignored and the response is always the output of\n :term:`predict`. By default, :term:`predict_proba` is tried first\n and we revert to :term:`decision_function` if it doesn't exist. If\n ``method`` is `'recursion'`, the response is always the output of\n :term:`decision_function`.\n\n n_cols : int, default=3\n The maximum number of columns in the grid plot. Only active when `ax`\n is a single axis or `None`.\n\n grid_resolution : int, default=100\n The number of equally spaced points on the axes of the plots, for each\n target feature.\n\n percentiles : tuple of float, default=(0.05, 0.95)\n The lower and upper percentile used to create the extreme values\n for the PDP axes. Must be in [0, 1].\n\n method : str, default='auto'\n The method used to calculate the averaged predictions:\n\n - `'recursion'` is only supported for some tree-based estimators\n (namely\n :class:`~sklearn.ensemble.GradientBoostingClassifier`,\n :class:`~sklearn.ensemble.GradientBoostingRegressor`,\n :class:`~sklearn.ensemble.HistGradientBoostingClassifier`,\n :class:`~sklearn.ensemble.HistGradientBoostingRegressor`,\n :class:`~sklearn.tree.DecisionTreeRegressor`,\n :class:`~sklearn.ensemble.RandomForestRegressor`\n but is more efficient in terms of speed.\n With this method, the target response of a\n classifier is always the decision function, not the predicted\n probabilities. Since the `'recursion'` method implicitly computes\n the average of the ICEs by design, it is not compatible with ICE and\n thus `kind` must be `'average'`.\n\n - `'brute'` is supported for any estimator, but is more\n computationally intensive.\n\n - `'auto'`: the `'recursion'` is used for estimators that support it,\n and `'brute'` is used otherwise.\n\n Please see :ref:`this note ` for\n differences between the `'brute'` and `'recursion'` method.\n\n n_jobs : int, default=None\n The number of CPUs to use to compute the partial dependences.\n Computation is parallelized over features specified by the `features`\n parameter.\n\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n verbose : int, default=0\n Verbose output during PD computations.\n\n line_kw : dict, default=None\n Dict with keywords passed to the ``matplotlib.pyplot.plot`` call.\n For one-way partial dependence plots. 
It can be used to define common\n properties for both `ice_lines_kw` and `pdp_line_kw`.\n\n ice_lines_kw : dict, default=None\n Dictionary with keywords passed to the `matplotlib.pyplot.plot` call.\n For ICE lines in the one-way partial dependence plots.\n The key value pairs defined in `ice_lines_kw` takes priority over\n `line_kw`.\n\n pd_line_kw : dict, default=None\n Dictionary with keywords passed to the `matplotlib.pyplot.plot` call.\n For partial dependence in one-way partial dependence plots.\n The key value pairs defined in `pd_line_kw` takes priority over\n `line_kw`.\n\n contour_kw : dict, default=None\n Dict with keywords passed to the ``matplotlib.pyplot.contourf`` call.\n For two-way partial dependence plots.\n\n ax : Matplotlib axes or array-like of Matplotlib axes, default=None\n - If a single axis is passed in, it is treated as a bounding axes\n and a grid of partial dependence plots will be drawn within\n these bounds. The `n_cols` parameter controls the number of\n columns in the grid.\n - If an array-like of axes are passed in, the partial dependence\n plots will be drawn directly into these axes.\n - If `None`, a figure and a bounding axes is created and treated\n as the single axes case.\n\n kind : {'average', 'individual', 'both'}, default='average'\n Whether to plot the partial dependence averaged across all the samples\n in the dataset or one line per sample or both.\n\n - ``kind='average'`` results in the traditional PD plot;\n - ``kind='individual'`` results in the ICE plot.\n\n Note that the fast ``method='recursion'`` option is only available for\n ``kind='average'``. Plotting individual dependencies requires using the\n slower ``method='brute'`` option.\n\n subsample : float, int or None, default=1000\n Sampling for ICE curves when `kind` is 'individual' or 'both'.\n If `float`, should be between 0.0 and 1.0 and represent the proportion\n of the dataset to be used to plot ICE curves. 
If `int`, represents the\n absolute number samples to use.\n\n Note that the full dataset is still used to calculate averaged partial\n dependence when `kind='both'`.\n\n random_state : int, RandomState instance or None, default=None\n Controls the randomness of the selected samples when subsamples is not\n `None` and `kind` is either `'both'` or `'individual'`.\n See :term:`Glossary ` for details.\n\n Returns\n -------\n display : :class:`~sklearn.inspection.PartialDependenceDisplay`\n\n See Also\n --------\n partial_dependence : Compute Partial Dependence values.\n\n Examples\n --------\n >>> import matplotlib.pyplot as plt\n >>> from sklearn.datasets import make_friedman1\n >>> from sklearn.ensemble import GradientBoostingRegressor\n >>> from sklearn.inspection import PartialDependenceDisplay\n >>> X, y = make_friedman1()\n >>> clf = GradientBoostingRegressor(n_estimators=10).fit(X, y)\n >>> PartialDependenceDisplay.from_estimator(clf, X, [0, (0, 1)])\n <...>\n >>> plt.show()\n \"\"\"\n check_matplotlib_support(f'{cls.__name__}.from_estimator')\n return _plot_partial_dependence(estimator, X, features, feature_names=feature_names, target=target, response_method=response_method, n_cols=n_cols, grid_resolution=grid_resolution, percentiles=percentiles, method=method, n_jobs=n_jobs, verbose=verbose, line_kw=line_kw, ice_lines_kw=ice_lines_kw, pd_line_kw=pd_line_kw, contour_kw=contour_kw, ax=ax, kind=kind, subsample=subsample, random_state=random_state)" }, { @@ -92861,7 +98872,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "ax", @@ -92871,7 +98883,8 @@ "docstring": { "type": "Matplotlib axes or array-like of Matplotlib axes, default=None", "description": "- If a single axis is passed in, it is treated as a bounding axes\n and a grid of partial dependence plots will be drawn within\n these bounds. The `n_cols` parameter controls the number of\n columns in the grid.\n- If an array-like of axes are passed in, the partial dependence\n plots will be drawn directly into these axes.\n- If `None`, a figure and a bounding axes is created and treated\n as the single axes case." - } + }, + "refined_type": {} }, { "name": "n_cols", @@ -92881,7 +98894,8 @@ "docstring": { "type": "int, default=3", "description": "The maximum number of columns in the grid plot. Only active when\n`ax` is a single axes or `None`." - } + }, + "refined_type": {} }, { "name": "line_kw", @@ -92891,7 +98905,8 @@ "docstring": { "type": "dict, default=None", "description": "Dict with keywords passed to the `matplotlib.pyplot.plot` call.\nFor one-way partial dependence plots." - } + }, + "refined_type": {} }, { "name": "ice_lines_kw", @@ -92901,7 +98916,8 @@ "docstring": { "type": "dict, default=None", "description": "Dictionary with keywords passed to the `matplotlib.pyplot.plot` call.\nFor ICE lines in the one-way partial dependence plots.\nThe key value pairs defined in `ice_lines_kw` takes priority over\n`line_kw`.\n\n.. versionadded:: 1.0" - } + }, + "refined_type": {} }, { "name": "pd_line_kw", @@ -92911,7 +98927,8 @@ "docstring": { "type": "dict, default=None", "description": "Dictionary with keywords passed to the `matplotlib.pyplot.plot` call.\nFor partial dependence in one-way partial dependence plots.\nThe key value pairs defined in `pd_line_kw` takes priority over\n`line_kw`.\n\n.. 
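The `from_estimator` entry above documents the ICE-related parameters (`kind`, `subsample`, `random_state`). As a minimal, hedged usage sketch added alongside the serialized data (estimator choice and parameter values here are illustrative, not taken from the dump):

# Illustrative sketch only -- mirrors the Examples section above but exercises
# the ICE-related parameters (kind, subsample, random_state).
import matplotlib.pyplot as plt
from sklearn.datasets import make_friedman1
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.inspection import PartialDependenceDisplay

X, y = make_friedman1(random_state=0)
est = GradientBoostingRegressor(n_estimators=10).fit(X, y)
# kind='both' overlays the averaged PD curve on per-sample ICE curves; as the
# docstring notes, a non-'average' kind falls back to the slower 'brute'
# computation. subsample=50 caps the number of ICE curves drawn, and
# random_state fixes which samples are selected.
PartialDependenceDisplay.from_estimator(
    est, X, features=[0, 1], kind="both", subsample=50, random_state=0
)
plt.show()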
versionadded:: 1.0" - } + }, + "refined_type": {} }, { "name": "contour_kw", @@ -92921,13 +98938,14 @@ "docstring": { "type": "dict, default=None", "description": "Dict with keywords passed to the `matplotlib.pyplot.contourf`\ncall for two-way partial dependence plots." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Plot partial dependence plots.", - "docstring": "Plot partial dependence plots.\n\nParameters\n----------\nax : Matplotlib axes or array-like of Matplotlib axes, default=None\n - If a single axis is passed in, it is treated as a bounding axes\n and a grid of partial dependence plots will be drawn within\n these bounds. The `n_cols` parameter controls the number of\n columns in the grid.\n - If an array-like of axes are passed in, the partial dependence\n plots will be drawn directly into these axes.\n - If `None`, a figure and a bounding axes is created and treated\n as the single axes case.\n\nn_cols : int, default=3\n The maximum number of columns in the grid plot. Only active when\n `ax` is a single axes or `None`.\n\nline_kw : dict, default=None\n Dict with keywords passed to the `matplotlib.pyplot.plot` call.\n For one-way partial dependence plots.\n\nice_lines_kw : dict, default=None\n Dictionary with keywords passed to the `matplotlib.pyplot.plot` call.\n For ICE lines in the one-way partial dependence plots.\n The key value pairs defined in `ice_lines_kw` takes priority over\n `line_kw`.\n\n .. versionadded:: 1.0\n\npd_line_kw : dict, default=None\n Dictionary with keywords passed to the `matplotlib.pyplot.plot` call.\n For partial dependence in one-way partial dependence plots.\n The key value pairs defined in `pd_line_kw` takes priority over\n `line_kw`.\n\n .. versionadded:: 1.0\n\ncontour_kw : dict, default=None\n Dict with keywords passed to the `matplotlib.pyplot.contourf`\n call for two-way partial dependence plots.\n\nReturns\n-------\ndisplay : :class:`~sklearn.inspection.PartialDependenceDisplay`", + "docstring": "Plot partial dependence plots.\n\n Parameters\n ----------\n ax : Matplotlib axes or array-like of Matplotlib axes, default=None\n - If a single axis is passed in, it is treated as a bounding axes\n and a grid of partial dependence plots will be drawn within\n these bounds. The `n_cols` parameter controls the number of\n columns in the grid.\n - If an array-like of axes are passed in, the partial dependence\n plots will be drawn directly into these axes.\n - If `None`, a figure and a bounding axes is created and treated\n as the single axes case.\n\n n_cols : int, default=3\n The maximum number of columns in the grid plot. Only active when\n `ax` is a single axes or `None`.\n\n line_kw : dict, default=None\n Dict with keywords passed to the `matplotlib.pyplot.plot` call.\n For one-way partial dependence plots.\n\n ice_lines_kw : dict, default=None\n Dictionary with keywords passed to the `matplotlib.pyplot.plot` call.\n For ICE lines in the one-way partial dependence plots.\n The key value pairs defined in `ice_lines_kw` takes priority over\n `line_kw`.\n\n .. versionadded:: 1.0\n\n pd_line_kw : dict, default=None\n Dictionary with keywords passed to the `matplotlib.pyplot.plot` call.\n For partial dependence in one-way partial dependence plots.\n The key value pairs defined in `pd_line_kw` takes priority over\n `line_kw`.\n\n .. 
versionadded:: 1.0\n\n contour_kw : dict, default=None\n Dict with keywords passed to the `matplotlib.pyplot.contourf`\n call for two-way partial dependence plots.\n\n Returns\n -------\n display : :class:`~sklearn.inspection.PartialDependenceDisplay`\n ", "source_code": "\n@_deprecate_positional_args(version='1.1')\ndef plot(self, *, ax=None, n_cols=3, line_kw=None, ice_lines_kw=None, pd_line_kw=None, contour_kw=None):\n \"\"\"Plot partial dependence plots.\n\n Parameters\n ----------\n ax : Matplotlib axes or array-like of Matplotlib axes, default=None\n - If a single axis is passed in, it is treated as a bounding axes\n and a grid of partial dependence plots will be drawn within\n these bounds. The `n_cols` parameter controls the number of\n columns in the grid.\n - If an array-like of axes are passed in, the partial dependence\n plots will be drawn directly into these axes.\n - If `None`, a figure and a bounding axes is created and treated\n as the single axes case.\n\n n_cols : int, default=3\n The maximum number of columns in the grid plot. Only active when\n `ax` is a single axes or `None`.\n\n line_kw : dict, default=None\n Dict with keywords passed to the `matplotlib.pyplot.plot` call.\n For one-way partial dependence plots.\n\n ice_lines_kw : dict, default=None\n Dictionary with keywords passed to the `matplotlib.pyplot.plot` call.\n For ICE lines in the one-way partial dependence plots.\n The key value pairs defined in `ice_lines_kw` takes priority over\n `line_kw`.\n\n .. versionadded:: 1.0\n\n pd_line_kw : dict, default=None\n Dictionary with keywords passed to the `matplotlib.pyplot.plot` call.\n For partial dependence in one-way partial dependence plots.\n The key value pairs defined in `pd_line_kw` takes priority over\n `line_kw`.\n\n .. 
versionadded:: 1.0\n\n contour_kw : dict, default=None\n Dict with keywords passed to the `matplotlib.pyplot.contourf`\n call for two-way partial dependence plots.\n\n Returns\n -------\n display : :class:`~sklearn.inspection.PartialDependenceDisplay`\n \"\"\"\n check_matplotlib_support('plot_partial_dependence')\n import matplotlib.pyplot as plt\n from matplotlib.gridspec import GridSpecFromSubplotSpec\n if line_kw is None:\n line_kw = {}\n if ice_lines_kw is None:\n ice_lines_kw = {}\n if pd_line_kw is None:\n pd_line_kw = {}\n if contour_kw is None:\n contour_kw = {}\n if ax is None:\n (_, ax) = plt.subplots()\n default_contour_kws = {'alpha': 0.75}\n contour_kw = {**default_contour_kws, **contour_kw}\n default_line_kws = {'color': 'C0', 'label': 'average' if self.kind == 'both' else None}\n if self.kind in ('individual', 'both'):\n default_ice_lines_kws = {'alpha': 0.3, 'linewidth': 0.5}\n else:\n default_ice_lines_kws = {}\n ice_lines_kw = {**default_line_kws, **line_kw, **default_ice_lines_kws, **ice_lines_kw}\n del ice_lines_kw['label']\n pd_line_kw = {**default_line_kws, **line_kw, **pd_line_kw}\n n_features = len(self.features)\n if self.kind in ('individual', 'both'):\n n_ice_lines = self._get_sample_count(len(self.pd_results[0].individual[0]))\n if self.kind == 'individual':\n n_lines = n_ice_lines\n else:\n n_lines = n_ice_lines + 1\n else:\n n_ice_lines = 0\n n_lines = 1\n if isinstance(ax, plt.Axes):\n if not ax.axison:\n raise ValueError('The ax was already used in another plot function, please set ax=display.axes_ instead')\n ax.set_axis_off()\n self.bounding_ax_ = ax\n self.figure_ = ax.figure\n n_cols = min(n_cols, n_features)\n n_rows = int(np.ceil(n_features / float(n_cols)))\n self.axes_ = np.empty((n_rows, n_cols), dtype=object)\n if self.kind == 'average':\n self.lines_ = np.empty((n_rows, n_cols), dtype=object)\n else:\n self.lines_ = np.empty((n_rows, n_cols, n_lines), dtype=object)\n self.contours_ = np.empty((n_rows, n_cols), dtype=object)\n axes_ravel = self.axes_.ravel()\n gs = GridSpecFromSubplotSpec(n_rows, n_cols, subplot_spec=ax.get_subplotspec())\n for (i, spec) in zip(range(n_features), gs):\n axes_ravel[i] = self.figure_.add_subplot(spec)\n else:\n ax = np.asarray(ax, dtype=object)\n if ax.size != n_features:\n raise ValueError('Expected ax to have {} axes, got {}'.format(n_features, ax.size))\n if ax.ndim == 2:\n n_cols = ax.shape[1]\n else:\n n_cols = None\n self.bounding_ax_ = None\n self.figure_ = ax.ravel()[0].figure\n self.axes_ = ax\n if self.kind == 'average':\n self.lines_ = np.empty_like(ax, dtype=object)\n else:\n self.lines_ = np.empty(ax.shape + (n_lines, ), dtype=object)\n self.contours_ = np.empty_like(ax, dtype=object)\n if 2 in self.pdp_lim:\n Z_level = np.linspace(*self.pdp_lim[2], num=8)\n self.deciles_vlines_ = np.empty_like(self.axes_, dtype=object)\n self.deciles_hlines_ = np.empty_like(self.axes_, dtype=object)\n for (pd_plot_idx, (axi, feature_idx, pd_result)) in enumerate(zip(self.axes_.ravel(), self.features, self.pd_results)):\n avg_preds = None\n preds = None\n feature_values = pd_result['values']\n if self.kind == 'individual':\n preds = pd_result.individual\n elif self.kind == 'average':\n avg_preds = pd_result.average\n else:\n avg_preds = pd_result.average\n preds = pd_result.individual\n if len(feature_values) == 1:\n self._plot_one_way_partial_dependence(preds, avg_preds, feature_values[0], feature_idx, n_ice_lines, axi, n_cols, pd_plot_idx, n_lines, ice_lines_kw, pd_line_kw)\n else:\n 
self._plot_two_way_partial_dependence(avg_preds, feature_values, feature_idx, axi, pd_plot_idx, Z_level, contour_kw)\n return self" }, { @@ -92945,7 +98963,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -92955,7 +98974,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "features", @@ -92965,7 +98985,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "feature_names", @@ -92975,7 +98996,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "target", @@ -92985,7 +99007,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "response_method", @@ -92995,7 +99018,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_cols", @@ -93005,7 +99029,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "grid_resolution", @@ -93015,7 +99040,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "percentiles", @@ -93025,7 +99051,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "method", @@ -93035,7 +99062,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_jobs", @@ -93045,7 +99073,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "verbose", @@ -93055,7 +99084,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "line_kw", @@ -93065,7 +99095,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "ice_lines_kw", @@ -93075,7 +99106,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "pd_line_kw", @@ -93085,7 +99117,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "contour_kw", @@ -93095,7 +99128,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "ax", @@ -93105,7 +99139,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "kind", @@ -93115,7 +99150,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "subsample", @@ -93125,7 +99161,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "random_state", @@ -93135,7 +99172,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -93161,7 +99199,8 @@ "docstring": { "type": "BaseEstimator", "description": "A fitted estimator object implementing :term:`predict`,\n:term:`predict_proba`, or :term:`decision_function`.\nMultioutput-multiclass classifiers are not supported." - } + }, + "refined_type": {} }, { "name": "X", @@ -93171,6 +99210,10 @@ "docstring": { "type": "{array-like, dataframe} of shape (n_samples, n_features)", "description": "``X`` is used to generate a grid of values for the target\n``features`` (where the partial dependence will be evaluated), and\nalso to generate values for the complement features when the\n`method` is `'brute'`." 
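A hedged sketch of the `plot` method whose source appears above, showing how the documented `line_kw` / `ice_lines_kw` / `pd_line_kw` dictionaries interact (all concrete values are illustrative assumptions, not part of the serialized data):

# Illustrative sketch only: re-rendering a fitted display with the keyword
# dictionaries documented above.
import matplotlib.pyplot as plt
from sklearn.datasets import make_friedman1
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.inspection import PartialDependenceDisplay

X, y = make_friedman1(random_state=0)
est = GradientBoostingRegressor(n_estimators=10).fit(X, y)
disp = PartialDependenceDisplay.from_estimator(
    est, X, [0, 1], kind="both", subsample=30, random_state=0
)
# Per the docstring, key-value pairs in ice_lines_kw / pd_line_kw take priority
# over line_kw: ICE lines stay thin and translucent, the averaged PD line is
# emphasised.
disp.plot(
    line_kw={"color": "tab:blue"},
    ice_lines_kw={"alpha": 0.2, "linewidth": 0.5},
    pd_line_kw={"color": "tab:orange", "linewidth": 2},
)
plt.show()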
+ }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -93181,6 +99224,10 @@ "docstring": { "type": "list of {int, str, pair of int, pair of str}", "description": "The target features for which to create the PDPs.\nIf `features[i]` is an integer or a string, a one-way PDP is created;\nif `features[i]` is a tuple, a two-way PDP is created (only supported\nwith `kind='average'`). Each tuple must be of size 2.\nif any entry is a string, then it must be in ``feature_names``." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -93191,7 +99238,8 @@ "docstring": { "type": "array-like of shape (n_features,), dtype=str, default=None", "description": "Name of each feature; `feature_names[i]` holds the name of the feature\nwith index `i`.\nBy default, the name of the feature corresponds to their numerical\nindex for NumPy array and their column name for pandas dataframe." - } + }, + "refined_type": {} }, { "name": "target", @@ -93201,7 +99249,8 @@ "docstring": { "type": "int, default=None", "description": "- In a multiclass setting, specifies the class for which the PDPs\n should be computed. Note that for binary classification, the\n positive class (index 1) is always used.\n- In a multioutput setting, specifies the task for which the PDPs\n should be computed.\n\nIgnored in binary classification or classical regression settings." - } + }, + "refined_type": {} }, { "name": "response_method", @@ -93211,6 +99260,10 @@ "docstring": { "type": "{'auto', 'predict_proba', 'decision_function'}, default='auto'", "description": "Specifies whether to use :term:`predict_proba` or\n:term:`decision_function` as the target response. For regressors\nthis parameter is ignored and the response is always the output of\n:term:`predict`. By default, :term:`predict_proba` is tried first\nand we revert to :term:`decision_function` if it doesn't exist. If\n``method`` is `'recursion'`, the response is always the output of\n:term:`decision_function`." + }, + "refined_type": { + "kind": "EnumType", + "values": ["auto", "decision_function", "predict_proba"] } }, { @@ -93221,7 +99274,8 @@ "docstring": { "type": "int, default=3", "description": "The maximum number of columns in the grid plot. Only active when `ax`\nis a single axis or `None`." - } + }, + "refined_type": {} }, { "name": "grid_resolution", @@ -93231,7 +99285,8 @@ "docstring": { "type": "int, default=100", "description": "The number of equally spaced points on the axes of the plots, for each\ntarget feature." - } + }, + "refined_type": {} }, { "name": "percentiles", @@ -93241,7 +99296,8 @@ "docstring": { "type": "tuple of float, default=(0.05, 0.95)", "description": "The lower and upper percentile used to create the extreme values\nfor the PDP axes. Must be in [0, 1]." - } + }, + "refined_type": {} }, { "name": "method", @@ -93251,7 +99307,8 @@ "docstring": { "type": "str, default='auto'", "description": "The method used to calculate the averaged predictions:\n\n- `'recursion'` is only supported for some tree-based estimators\n (namely\n :class:`~sklearn.ensemble.GradientBoostingClassifier`,\n :class:`~sklearn.ensemble.GradientBoostingRegressor`,\n :class:`~sklearn.ensemble.HistGradientBoostingClassifier`,\n :class:`~sklearn.ensemble.HistGradientBoostingRegressor`,\n :class:`~sklearn.tree.DecisionTreeRegressor`,\n :class:`~sklearn.ensemble.RandomForestRegressor`\n but is more efficient in terms of speed.\n With this method, the target response of a\n classifier is always the decision function, not the predicted\n probabilities. 
Since the `'recursion'` method implicitly computes\n the average of the ICEs by design, it is not compatible with ICE and\n thus `kind` must be `'average'`.\n\n- `'brute'` is supported for any estimator, but is more\n computationally intensive.\n\n- `'auto'`: the `'recursion'` is used for estimators that support it,\n and `'brute'` is used otherwise.\n\nPlease see :ref:`this note ` for\ndifferences between the `'brute'` and `'recursion'` method." - } + }, + "refined_type": {} }, { "name": "n_jobs", @@ -93261,7 +99318,8 @@ "docstring": { "type": "int, default=None", "description": "The number of CPUs to use to compute the partial dependences.\nComputation is parallelized over features specified by the `features`\nparameter.\n\n``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n``-1`` means using all processors. See :term:`Glossary `\nfor more details." - } + }, + "refined_type": {} }, { "name": "verbose", @@ -93271,7 +99329,8 @@ "docstring": { "type": "int, default=0", "description": "Verbose output during PD computations." - } + }, + "refined_type": {} }, { "name": "line_kw", @@ -93281,7 +99340,8 @@ "docstring": { "type": "dict, default=None", "description": "Dict with keywords passed to the ``matplotlib.pyplot.plot`` call.\nFor one-way partial dependence plots. It can be used to define common\nproperties for both `ice_lines_kw` and `pdp_line_kw`." - } + }, + "refined_type": {} }, { "name": "ice_lines_kw", @@ -93291,7 +99351,8 @@ "docstring": { "type": "dict, default=None", "description": "Dictionary with keywords passed to the `matplotlib.pyplot.plot` call.\nFor ICE lines in the one-way partial dependence plots.\nThe key value pairs defined in `ice_lines_kw` takes priority over\n`line_kw`.\n\n.. versionadded:: 1.0" - } + }, + "refined_type": {} }, { "name": "pd_line_kw", @@ -93301,7 +99362,8 @@ "docstring": { "type": "dict, default=None", "description": "Dictionary with keywords passed to the `matplotlib.pyplot.plot` call.\nFor partial dependence in one-way partial dependence plots.\nThe key value pairs defined in `pd_line_kw` takes priority over\n`line_kw`.\n\n.. versionadded:: 1.0" - } + }, + "refined_type": {} }, { "name": "contour_kw", @@ -93311,7 +99373,8 @@ "docstring": { "type": "dict, default=None", "description": "Dict with keywords passed to the ``matplotlib.pyplot.contourf`` call.\nFor two-way partial dependence plots." - } + }, + "refined_type": {} }, { "name": "ax", @@ -93321,7 +99384,8 @@ "docstring": { "type": "Matplotlib axes or array-like of Matplotlib axes, default=None", "description": "- If a single axis is passed in, it is treated as a bounding axes\n and a grid of partial dependence plots will be drawn within\n these bounds. The `n_cols` parameter controls the number of\n columns in the grid.\n- If an array-like of axes are passed in, the partial dependence\n plots will be drawn directly into these axes.\n- If `None`, a figure and a bounding axes is created and treated\n as the single axes case.\n\n.. versionadded:: 0.22" - } + }, + "refined_type": {} }, { "name": "kind", @@ -93331,6 +99395,10 @@ "docstring": { "type": "{'average', 'individual', 'both'}, default='average'", "description": " Whether to plot the partial dependence averaged across all the samples\n in the dataset or one line per sample or both.\n\n - ``kind='average'`` results in the traditional PD plot;\n - ``kind='individual'`` results in the ICE plot.\n\nNote that the fast ``method='recursion'`` option is only available for\n``kind='average'``. 
Plotting individual dependencies requires using the\nslower ``method='brute'`` option.\n\n .. versionadded:: 0.24" + }, + "refined_type": { + "kind": "EnumType", + "values": ["individual", "average", "both"] } }, { @@ -93341,7 +99409,8 @@ "docstring": { "type": "float, int or None, default=1000", "description": "Sampling for ICE curves when `kind` is 'individual' or 'both'.\nIf `float`, should be between 0.0 and 1.0 and represent the proportion\nof the dataset to be used to plot ICE curves. If `int`, represents the\nabsolute number samples to use.\n\nNote that the full dataset is still used to calculate averaged partial\ndependence when `kind='both'`.\n\n.. versionadded:: 0.24" - } + }, + "refined_type": {} }, { "name": "random_state", @@ -93351,13 +99420,14 @@ "docstring": { "type": "int, RandomState instance or None, default=None", "description": "Controls the randomness of the selected samples when subsamples is not\n`None` and `kind` is either `'both'` or `'individual'`.\nSee :term:`Glossary ` for details.\n\n.. versionadded:: 0.24" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Partial dependence (PD) and individual conditional expectation (ICE) plots.\n\nPartial dependence plots, individual conditional expectation plots or an overlay of both of them can be plotted by setting the ``kind`` parameter. The ``len(features)`` plots are arranged in a grid with ``n_cols`` columns. Two-way partial dependence plots are plotted as contour plots. The deciles of the feature values will be shown with tick marks on the x-axes for one-way plots, and on both axes for two-way plots. Read more in the :ref:`User Guide `. .. note:: :func:`plot_partial_dependence` does not support using the same axes with multiple calls. To plot the the partial dependence for multiple estimators, please pass the axes created by the first call to the second call:: >>> from sklearn.inspection import plot_partial_dependence >>> from sklearn.datasets import make_friedman1 >>> from sklearn.linear_model import LinearRegression >>> from sklearn.ensemble import RandomForestRegressor >>> X, y = make_friedman1() >>> est1 = LinearRegression().fit(X, y) >>> est2 = RandomForestRegressor().fit(X, y) >>> disp1 = plot_partial_dependence(est1, X, ... [1, 2]) # doctest: +SKIP >>> disp2 = plot_partial_dependence(est2, X, [1, 2], ... ax=disp1.axes_) # doctest: +SKIP .. warning:: For :class:`~sklearn.ensemble.GradientBoostingClassifier` and :class:`~sklearn.ensemble.GradientBoostingRegressor`, the `'recursion'` method (used by default) will not account for the `init` predictor of the boosting process. In practice, this will produce the same values as `'brute'` up to a constant offset in the target response, provided that `init` is a constant estimator (which is the default). However, if `init` is not a constant estimator, the partial dependence values are incorrect for `'recursion'` because the offset will be sample-dependent. It is preferable to use the `'brute'` method. Note that this only applies to :class:`~sklearn.ensemble.GradientBoostingClassifier` and :class:`~sklearn.ensemble.GradientBoostingRegressor`, not to :class:`~sklearn.ensemble.HistGradientBoostingClassifier` and :class:`~sklearn.ensemble.HistGradientBoostingRegressor`. .. deprecated:: 1.0 `plot_partial_dependence` is deprecated in 1.0 and will be removed in 1.2. 
Please use the class method: :func:`~sklearn.metrics.PartialDependenceDisplay.from_estimator`.", - "docstring": "Partial dependence (PD) and individual conditional expectation (ICE)\nplots.\n\nPartial dependence plots, individual conditional expectation plots or an\noverlay of both of them can be plotted by setting the ``kind``\nparameter.\nThe ``len(features)`` plots are arranged in a grid with ``n_cols``\ncolumns. Two-way partial dependence plots are plotted as contour plots. The\ndeciles of the feature values will be shown with tick marks on the x-axes\nfor one-way plots, and on both axes for two-way plots.\n\nRead more in the :ref:`User Guide `.\n\n.. note::\n\n :func:`plot_partial_dependence` does not support using the same axes\n with multiple calls. To plot the the partial dependence for multiple\n estimators, please pass the axes created by the first call to the\n second call::\n\n >>> from sklearn.inspection import plot_partial_dependence\n >>> from sklearn.datasets import make_friedman1\n >>> from sklearn.linear_model import LinearRegression\n >>> from sklearn.ensemble import RandomForestRegressor\n >>> X, y = make_friedman1()\n >>> est1 = LinearRegression().fit(X, y)\n >>> est2 = RandomForestRegressor().fit(X, y)\n >>> disp1 = plot_partial_dependence(est1, X,\n ... [1, 2]) # doctest: +SKIP\n >>> disp2 = plot_partial_dependence(est2, X, [1, 2],\n ... ax=disp1.axes_) # doctest: +SKIP\n\n.. warning::\n\n For :class:`~sklearn.ensemble.GradientBoostingClassifier` and\n :class:`~sklearn.ensemble.GradientBoostingRegressor`, the\n `'recursion'` method (used by default) will not account for the `init`\n predictor of the boosting process. In practice, this will produce\n the same values as `'brute'` up to a constant offset in the target\n response, provided that `init` is a constant estimator (which is the\n default). However, if `init` is not a constant estimator, the\n partial dependence values are incorrect for `'recursion'` because the\n offset will be sample-dependent. It is preferable to use the `'brute'`\n method. Note that this only applies to\n :class:`~sklearn.ensemble.GradientBoostingClassifier` and\n :class:`~sklearn.ensemble.GradientBoostingRegressor`, not to\n :class:`~sklearn.ensemble.HistGradientBoostingClassifier` and\n :class:`~sklearn.ensemble.HistGradientBoostingRegressor`.\n\n.. deprecated:: 1.0\n `plot_partial_dependence` is deprecated in 1.0 and will be removed in\n 1.2. Please use the class method:\n :func:`~sklearn.metrics.PartialDependenceDisplay.from_estimator`.\n\nParameters\n----------\nestimator : BaseEstimator\n A fitted estimator object implementing :term:`predict`,\n :term:`predict_proba`, or :term:`decision_function`.\n Multioutput-multiclass classifiers are not supported.\n\nX : {array-like, dataframe} of shape (n_samples, n_features)\n ``X`` is used to generate a grid of values for the target\n ``features`` (where the partial dependence will be evaluated), and\n also to generate values for the complement features when the\n `method` is `'brute'`.\n\nfeatures : list of {int, str, pair of int, pair of str}\n The target features for which to create the PDPs.\n If `features[i]` is an integer or a string, a one-way PDP is created;\n if `features[i]` is a tuple, a two-way PDP is created (only supported\n with `kind='average'`). 
Each tuple must be of size 2.\n if any entry is a string, then it must be in ``feature_names``.\n\nfeature_names : array-like of shape (n_features,), dtype=str, default=None\n Name of each feature; `feature_names[i]` holds the name of the feature\n with index `i`.\n By default, the name of the feature corresponds to their numerical\n index for NumPy array and their column name for pandas dataframe.\n\ntarget : int, default=None\n - In a multiclass setting, specifies the class for which the PDPs\n should be computed. Note that for binary classification, the\n positive class (index 1) is always used.\n - In a multioutput setting, specifies the task for which the PDPs\n should be computed.\n\n Ignored in binary classification or classical regression settings.\n\nresponse_method : {'auto', 'predict_proba', 'decision_function'}, default='auto'\n Specifies whether to use :term:`predict_proba` or\n :term:`decision_function` as the target response. For regressors\n this parameter is ignored and the response is always the output of\n :term:`predict`. By default, :term:`predict_proba` is tried first\n and we revert to :term:`decision_function` if it doesn't exist. If\n ``method`` is `'recursion'`, the response is always the output of\n :term:`decision_function`.\n\nn_cols : int, default=3\n The maximum number of columns in the grid plot. Only active when `ax`\n is a single axis or `None`.\n\ngrid_resolution : int, default=100\n The number of equally spaced points on the axes of the plots, for each\n target feature.\n\npercentiles : tuple of float, default=(0.05, 0.95)\n The lower and upper percentile used to create the extreme values\n for the PDP axes. Must be in [0, 1].\n\nmethod : str, default='auto'\n The method used to calculate the averaged predictions:\n\n - `'recursion'` is only supported for some tree-based estimators\n (namely\n :class:`~sklearn.ensemble.GradientBoostingClassifier`,\n :class:`~sklearn.ensemble.GradientBoostingRegressor`,\n :class:`~sklearn.ensemble.HistGradientBoostingClassifier`,\n :class:`~sklearn.ensemble.HistGradientBoostingRegressor`,\n :class:`~sklearn.tree.DecisionTreeRegressor`,\n :class:`~sklearn.ensemble.RandomForestRegressor`\n but is more efficient in terms of speed.\n With this method, the target response of a\n classifier is always the decision function, not the predicted\n probabilities. Since the `'recursion'` method implicitly computes\n the average of the ICEs by design, it is not compatible with ICE and\n thus `kind` must be `'average'`.\n\n - `'brute'` is supported for any estimator, but is more\n computationally intensive.\n\n - `'auto'`: the `'recursion'` is used for estimators that support it,\n and `'brute'` is used otherwise.\n\n Please see :ref:`this note ` for\n differences between the `'brute'` and `'recursion'` method.\n\nn_jobs : int, default=None\n The number of CPUs to use to compute the partial dependences.\n Computation is parallelized over features specified by the `features`\n parameter.\n\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\nverbose : int, default=0\n Verbose output during PD computations.\n\nline_kw : dict, default=None\n Dict with keywords passed to the ``matplotlib.pyplot.plot`` call.\n For one-way partial dependence plots. 
It can be used to define common\n properties for both `ice_lines_kw` and `pdp_line_kw`.\n\nice_lines_kw : dict, default=None\n Dictionary with keywords passed to the `matplotlib.pyplot.plot` call.\n For ICE lines in the one-way partial dependence plots.\n The key value pairs defined in `ice_lines_kw` takes priority over\n `line_kw`.\n\n .. versionadded:: 1.0\n\npd_line_kw : dict, default=None\n Dictionary with keywords passed to the `matplotlib.pyplot.plot` call.\n For partial dependence in one-way partial dependence plots.\n The key value pairs defined in `pd_line_kw` takes priority over\n `line_kw`.\n\n .. versionadded:: 1.0\n\ncontour_kw : dict, default=None\n Dict with keywords passed to the ``matplotlib.pyplot.contourf`` call.\n For two-way partial dependence plots.\n\nax : Matplotlib axes or array-like of Matplotlib axes, default=None\n - If a single axis is passed in, it is treated as a bounding axes\n and a grid of partial dependence plots will be drawn within\n these bounds. The `n_cols` parameter controls the number of\n columns in the grid.\n - If an array-like of axes are passed in, the partial dependence\n plots will be drawn directly into these axes.\n - If `None`, a figure and a bounding axes is created and treated\n as the single axes case.\n\n .. versionadded:: 0.22\n\nkind : {'average', 'individual', 'both'}, default='average'\n Whether to plot the partial dependence averaged across all the samples\n in the dataset or one line per sample or both.\n\n - ``kind='average'`` results in the traditional PD plot;\n - ``kind='individual'`` results in the ICE plot.\n\n Note that the fast ``method='recursion'`` option is only available for\n ``kind='average'``. Plotting individual dependencies requires using the\n slower ``method='brute'`` option.\n\n .. versionadded:: 0.24\n\nsubsample : float, int or None, default=1000\n Sampling for ICE curves when `kind` is 'individual' or 'both'.\n If `float`, should be between 0.0 and 1.0 and represent the proportion\n of the dataset to be used to plot ICE curves. If `int`, represents the\n absolute number samples to use.\n\n Note that the full dataset is still used to calculate averaged partial\n dependence when `kind='both'`.\n\n .. versionadded:: 0.24\n\nrandom_state : int, RandomState instance or None, default=None\n Controls the randomness of the selected samples when subsamples is not\n `None` and `kind` is either `'both'` or `'individual'`.\n See :term:`Glossary ` for details.\n\n .. versionadded:: 0.24\n\nReturns\n-------\ndisplay : :class:`~sklearn.inspection.PartialDependenceDisplay`\n\nSee Also\n--------\npartial_dependence : Compute Partial Dependence values.\nPartialDependenceDisplay : Partial Dependence visualization.\nPartialDependenceDisplay.from_estimator : Plot Partial Dependence.\n\nExamples\n--------\n>>> import matplotlib.pyplot as plt\n>>> from sklearn.datasets import make_friedman1\n>>> from sklearn.ensemble import GradientBoostingRegressor\n>>> from sklearn.inspection import plot_partial_dependence\n>>> X, y = make_friedman1()\n>>> clf = GradientBoostingRegressor(n_estimators=10).fit(X, y)\n>>> plot_partial_dependence(clf, X, [0, (0, 1)]) # doctest: +SKIP\n<...>\n>>> plt.show() # doctest: +SKIP", + "description": "Partial dependence (PD) and individual conditional expectation (ICE)\nplots.\n\nPartial dependence plots, individual conditional expectation plots or an\noverlay of both of them can be plotted by setting the ``kind``\nparameter.\nThe ``len(features)`` plots are arranged in a grid with ``n_cols``\ncolumns. 
Two-way partial dependence plots are plotted as contour plots. The\ndeciles of the feature values will be shown with tick marks on the x-axes\nfor one-way plots, and on both axes for two-way plots.\n\nRead more in the :ref:`User Guide `.\n\n.. note::\n\n :func:`plot_partial_dependence` does not support using the same axes\n with multiple calls. To plot the the partial dependence for multiple\n estimators, please pass the axes created by the first call to the\n second call::\n\n >>> from sklearn.inspection import plot_partial_dependence\n >>> from sklearn.datasets import make_friedman1\n >>> from sklearn.linear_model import LinearRegression\n >>> from sklearn.ensemble import RandomForestRegressor\n >>> X, y = make_friedman1()\n >>> est1 = LinearRegression().fit(X, y)\n >>> est2 = RandomForestRegressor().fit(X, y)\n >>> disp1 = plot_partial_dependence(est1, X,\n ... [1, 2]) # doctest: +SKIP\n >>> disp2 = plot_partial_dependence(est2, X, [1, 2],\n ... ax=disp1.axes_) # doctest: +SKIP\n\n.. warning::\n\n For :class:`~sklearn.ensemble.GradientBoostingClassifier` and\n :class:`~sklearn.ensemble.GradientBoostingRegressor`, the\n `'recursion'` method (used by default) will not account for the `init`\n predictor of the boosting process. In practice, this will produce\n the same values as `'brute'` up to a constant offset in the target\n response, provided that `init` is a constant estimator (which is the\n default). However, if `init` is not a constant estimator, the\n partial dependence values are incorrect for `'recursion'` because the\n offset will be sample-dependent. It is preferable to use the `'brute'`\n method. Note that this only applies to\n :class:`~sklearn.ensemble.GradientBoostingClassifier` and\n :class:`~sklearn.ensemble.GradientBoostingRegressor`, not to\n :class:`~sklearn.ensemble.HistGradientBoostingClassifier` and\n :class:`~sklearn.ensemble.HistGradientBoostingRegressor`.\n\n.. deprecated:: 1.0\n `plot_partial_dependence` is deprecated in 1.0 and will be removed in\n 1.2. Please use the class method:\n :func:`~sklearn.metrics.PartialDependenceDisplay.from_estimator`.", + "docstring": "Partial dependence (PD) and individual conditional expectation (ICE)\n plots.\n\n Partial dependence plots, individual conditional expectation plots or an\n overlay of both of them can be plotted by setting the ``kind``\n parameter.\n The ``len(features)`` plots are arranged in a grid with ``n_cols``\n columns. Two-way partial dependence plots are plotted as contour plots. The\n deciles of the feature values will be shown with tick marks on the x-axes\n for one-way plots, and on both axes for two-way plots.\n\n Read more in the :ref:`User Guide `.\n\n .. note::\n\n :func:`plot_partial_dependence` does not support using the same axes\n with multiple calls. To plot the the partial dependence for multiple\n estimators, please pass the axes created by the first call to the\n second call::\n\n >>> from sklearn.inspection import plot_partial_dependence\n >>> from sklearn.datasets import make_friedman1\n >>> from sklearn.linear_model import LinearRegression\n >>> from sklearn.ensemble import RandomForestRegressor\n >>> X, y = make_friedman1()\n >>> est1 = LinearRegression().fit(X, y)\n >>> est2 = RandomForestRegressor().fit(X, y)\n >>> disp1 = plot_partial_dependence(est1, X,\n ... [1, 2]) # doctest: +SKIP\n >>> disp2 = plot_partial_dependence(est2, X, [1, 2],\n ... ax=disp1.axes_) # doctest: +SKIP\n\n .. 
warning::\n\n For :class:`~sklearn.ensemble.GradientBoostingClassifier` and\n :class:`~sklearn.ensemble.GradientBoostingRegressor`, the\n `'recursion'` method (used by default) will not account for the `init`\n predictor of the boosting process. In practice, this will produce\n the same values as `'brute'` up to a constant offset in the target\n response, provided that `init` is a constant estimator (which is the\n default). However, if `init` is not a constant estimator, the\n partial dependence values are incorrect for `'recursion'` because the\n offset will be sample-dependent. It is preferable to use the `'brute'`\n method. Note that this only applies to\n :class:`~sklearn.ensemble.GradientBoostingClassifier` and\n :class:`~sklearn.ensemble.GradientBoostingRegressor`, not to\n :class:`~sklearn.ensemble.HistGradientBoostingClassifier` and\n :class:`~sklearn.ensemble.HistGradientBoostingRegressor`.\n\n .. deprecated:: 1.0\n `plot_partial_dependence` is deprecated in 1.0 and will be removed in\n 1.2. Please use the class method:\n :func:`~sklearn.metrics.PartialDependenceDisplay.from_estimator`.\n\n Parameters\n ----------\n estimator : BaseEstimator\n A fitted estimator object implementing :term:`predict`,\n :term:`predict_proba`, or :term:`decision_function`.\n Multioutput-multiclass classifiers are not supported.\n\n X : {array-like, dataframe} of shape (n_samples, n_features)\n ``X`` is used to generate a grid of values for the target\n ``features`` (where the partial dependence will be evaluated), and\n also to generate values for the complement features when the\n `method` is `'brute'`.\n\n features : list of {int, str, pair of int, pair of str}\n The target features for which to create the PDPs.\n If `features[i]` is an integer or a string, a one-way PDP is created;\n if `features[i]` is a tuple, a two-way PDP is created (only supported\n with `kind='average'`). Each tuple must be of size 2.\n if any entry is a string, then it must be in ``feature_names``.\n\n feature_names : array-like of shape (n_features,), dtype=str, default=None\n Name of each feature; `feature_names[i]` holds the name of the feature\n with index `i`.\n By default, the name of the feature corresponds to their numerical\n index for NumPy array and their column name for pandas dataframe.\n\n target : int, default=None\n - In a multiclass setting, specifies the class for which the PDPs\n should be computed. Note that for binary classification, the\n positive class (index 1) is always used.\n - In a multioutput setting, specifies the task for which the PDPs\n should be computed.\n\n Ignored in binary classification or classical regression settings.\n\n response_method : {'auto', 'predict_proba', 'decision_function'}, default='auto'\n Specifies whether to use :term:`predict_proba` or\n :term:`decision_function` as the target response. For regressors\n this parameter is ignored and the response is always the output of\n :term:`predict`. By default, :term:`predict_proba` is tried first\n and we revert to :term:`decision_function` if it doesn't exist. If\n ``method`` is `'recursion'`, the response is always the output of\n :term:`decision_function`.\n\n n_cols : int, default=3\n The maximum number of columns in the grid plot. 
Only active when `ax`\n is a single axis or `None`.\n\n grid_resolution : int, default=100\n The number of equally spaced points on the axes of the plots, for each\n target feature.\n\n percentiles : tuple of float, default=(0.05, 0.95)\n The lower and upper percentile used to create the extreme values\n for the PDP axes. Must be in [0, 1].\n\n method : str, default='auto'\n The method used to calculate the averaged predictions:\n\n - `'recursion'` is only supported for some tree-based estimators\n (namely\n :class:`~sklearn.ensemble.GradientBoostingClassifier`,\n :class:`~sklearn.ensemble.GradientBoostingRegressor`,\n :class:`~sklearn.ensemble.HistGradientBoostingClassifier`,\n :class:`~sklearn.ensemble.HistGradientBoostingRegressor`,\n :class:`~sklearn.tree.DecisionTreeRegressor`,\n :class:`~sklearn.ensemble.RandomForestRegressor`\n but is more efficient in terms of speed.\n With this method, the target response of a\n classifier is always the decision function, not the predicted\n probabilities. Since the `'recursion'` method implicitly computes\n the average of the ICEs by design, it is not compatible with ICE and\n thus `kind` must be `'average'`.\n\n - `'brute'` is supported for any estimator, but is more\n computationally intensive.\n\n - `'auto'`: the `'recursion'` is used for estimators that support it,\n and `'brute'` is used otherwise.\n\n Please see :ref:`this note ` for\n differences between the `'brute'` and `'recursion'` method.\n\n n_jobs : int, default=None\n The number of CPUs to use to compute the partial dependences.\n Computation is parallelized over features specified by the `features`\n parameter.\n\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n verbose : int, default=0\n Verbose output during PD computations.\n\n line_kw : dict, default=None\n Dict with keywords passed to the ``matplotlib.pyplot.plot`` call.\n For one-way partial dependence plots. It can be used to define common\n properties for both `ice_lines_kw` and `pdp_line_kw`.\n\n ice_lines_kw : dict, default=None\n Dictionary with keywords passed to the `matplotlib.pyplot.plot` call.\n For ICE lines in the one-way partial dependence plots.\n The key value pairs defined in `ice_lines_kw` takes priority over\n `line_kw`.\n\n .. versionadded:: 1.0\n\n pd_line_kw : dict, default=None\n Dictionary with keywords passed to the `matplotlib.pyplot.plot` call.\n For partial dependence in one-way partial dependence plots.\n The key value pairs defined in `pd_line_kw` takes priority over\n `line_kw`.\n\n .. versionadded:: 1.0\n\n contour_kw : dict, default=None\n Dict with keywords passed to the ``matplotlib.pyplot.contourf`` call.\n For two-way partial dependence plots.\n\n ax : Matplotlib axes or array-like of Matplotlib axes, default=None\n - If a single axis is passed in, it is treated as a bounding axes\n and a grid of partial dependence plots will be drawn within\n these bounds. The `n_cols` parameter controls the number of\n columns in the grid.\n - If an array-like of axes are passed in, the partial dependence\n plots will be drawn directly into these axes.\n - If `None`, a figure and a bounding axes is created and treated\n as the single axes case.\n\n .. 
versionadded:: 0.22\n\n kind : {'average', 'individual', 'both'}, default='average'\n Whether to plot the partial dependence averaged across all the samples\n in the dataset or one line per sample or both.\n\n - ``kind='average'`` results in the traditional PD plot;\n - ``kind='individual'`` results in the ICE plot.\n\n Note that the fast ``method='recursion'`` option is only available for\n ``kind='average'``. Plotting individual dependencies requires using the\n slower ``method='brute'`` option.\n\n .. versionadded:: 0.24\n\n subsample : float, int or None, default=1000\n Sampling for ICE curves when `kind` is 'individual' or 'both'.\n If `float`, should be between 0.0 and 1.0 and represent the proportion\n of the dataset to be used to plot ICE curves. If `int`, represents the\n absolute number samples to use.\n\n Note that the full dataset is still used to calculate averaged partial\n dependence when `kind='both'`.\n\n .. versionadded:: 0.24\n\n random_state : int, RandomState instance or None, default=None\n Controls the randomness of the selected samples when subsamples is not\n `None` and `kind` is either `'both'` or `'individual'`.\n See :term:`Glossary ` for details.\n\n .. versionadded:: 0.24\n\n Returns\n -------\n display : :class:`~sklearn.inspection.PartialDependenceDisplay`\n\n See Also\n --------\n partial_dependence : Compute Partial Dependence values.\n PartialDependenceDisplay : Partial Dependence visualization.\n PartialDependenceDisplay.from_estimator : Plot Partial Dependence.\n\n Examples\n --------\n >>> import matplotlib.pyplot as plt\n >>> from sklearn.datasets import make_friedman1\n >>> from sklearn.ensemble import GradientBoostingRegressor\n >>> from sklearn.inspection import plot_partial_dependence\n >>> X, y = make_friedman1()\n >>> clf = GradientBoostingRegressor(n_estimators=10).fit(X, y)\n >>> plot_partial_dependence(clf, X, [0, (0, 1)]) # doctest: +SKIP\n <...>\n >>> plt.show() # doctest: +SKIP\n ", "source_code": "\n@deprecated('Function `plot_partial_dependence` is deprecated in 1.0 and will be removed in 1.2. Use PartialDependenceDisplay.from_estimator instead')\ndef plot_partial_dependence(estimator, X, features, *, feature_names=None, target=None, response_method='auto', n_cols=3, grid_resolution=100, percentiles=(0.05, 0.95), method='auto', n_jobs=None, verbose=0, line_kw=None, ice_lines_kw=None, pd_line_kw=None, contour_kw=None, ax=None, kind='average', subsample=1000, random_state=None):\n \"\"\"Partial dependence (PD) and individual conditional expectation (ICE)\n plots.\n\n Partial dependence plots, individual conditional expectation plots or an\n overlay of both of them can be plotted by setting the ``kind``\n parameter.\n The ``len(features)`` plots are arranged in a grid with ``n_cols``\n columns. Two-way partial dependence plots are plotted as contour plots. The\n deciles of the feature values will be shown with tick marks on the x-axes\n for one-way plots, and on both axes for two-way plots.\n\n Read more in the :ref:`User Guide `.\n\n .. note::\n\n :func:`plot_partial_dependence` does not support using the same axes\n with multiple calls. 
To plot the the partial dependence for multiple\n estimators, please pass the axes created by the first call to the\n second call::\n\n >>> from sklearn.inspection import plot_partial_dependence\n >>> from sklearn.datasets import make_friedman1\n >>> from sklearn.linear_model import LinearRegression\n >>> from sklearn.ensemble import RandomForestRegressor\n >>> X, y = make_friedman1()\n >>> est1 = LinearRegression().fit(X, y)\n >>> est2 = RandomForestRegressor().fit(X, y)\n >>> disp1 = plot_partial_dependence(est1, X,\n ... [1, 2]) # doctest: +SKIP\n >>> disp2 = plot_partial_dependence(est2, X, [1, 2],\n ... ax=disp1.axes_) # doctest: +SKIP\n\n .. warning::\n\n For :class:`~sklearn.ensemble.GradientBoostingClassifier` and\n :class:`~sklearn.ensemble.GradientBoostingRegressor`, the\n `'recursion'` method (used by default) will not account for the `init`\n predictor of the boosting process. In practice, this will produce\n the same values as `'brute'` up to a constant offset in the target\n response, provided that `init` is a constant estimator (which is the\n default). However, if `init` is not a constant estimator, the\n partial dependence values are incorrect for `'recursion'` because the\n offset will be sample-dependent. It is preferable to use the `'brute'`\n method. Note that this only applies to\n :class:`~sklearn.ensemble.GradientBoostingClassifier` and\n :class:`~sklearn.ensemble.GradientBoostingRegressor`, not to\n :class:`~sklearn.ensemble.HistGradientBoostingClassifier` and\n :class:`~sklearn.ensemble.HistGradientBoostingRegressor`.\n\n .. deprecated:: 1.0\n `plot_partial_dependence` is deprecated in 1.0 and will be removed in\n 1.2. Please use the class method:\n :func:`~sklearn.metrics.PartialDependenceDisplay.from_estimator`.\n\n Parameters\n ----------\n estimator : BaseEstimator\n A fitted estimator object implementing :term:`predict`,\n :term:`predict_proba`, or :term:`decision_function`.\n Multioutput-multiclass classifiers are not supported.\n\n X : {array-like, dataframe} of shape (n_samples, n_features)\n ``X`` is used to generate a grid of values for the target\n ``features`` (where the partial dependence will be evaluated), and\n also to generate values for the complement features when the\n `method` is `'brute'`.\n\n features : list of {int, str, pair of int, pair of str}\n The target features for which to create the PDPs.\n If `features[i]` is an integer or a string, a one-way PDP is created;\n if `features[i]` is a tuple, a two-way PDP is created (only supported\n with `kind='average'`). Each tuple must be of size 2.\n if any entry is a string, then it must be in ``feature_names``.\n\n feature_names : array-like of shape (n_features,), dtype=str, default=None\n Name of each feature; `feature_names[i]` holds the name of the feature\n with index `i`.\n By default, the name of the feature corresponds to their numerical\n index for NumPy array and their column name for pandas dataframe.\n\n target : int, default=None\n - In a multiclass setting, specifies the class for which the PDPs\n should be computed. Note that for binary classification, the\n positive class (index 1) is always used.\n - In a multioutput setting, specifies the task for which the PDPs\n should be computed.\n\n Ignored in binary classification or classical regression settings.\n\n response_method : {'auto', 'predict_proba', 'decision_function'}, default='auto'\n Specifies whether to use :term:`predict_proba` or\n :term:`decision_function` as the target response. 
For regressors\n this parameter is ignored and the response is always the output of\n :term:`predict`. By default, :term:`predict_proba` is tried first\n and we revert to :term:`decision_function` if it doesn't exist. If\n ``method`` is `'recursion'`, the response is always the output of\n :term:`decision_function`.\n\n n_cols : int, default=3\n The maximum number of columns in the grid plot. Only active when `ax`\n is a single axis or `None`.\n\n grid_resolution : int, default=100\n The number of equally spaced points on the axes of the plots, for each\n target feature.\n\n percentiles : tuple of float, default=(0.05, 0.95)\n The lower and upper percentile used to create the extreme values\n for the PDP axes. Must be in [0, 1].\n\n method : str, default='auto'\n The method used to calculate the averaged predictions:\n\n - `'recursion'` is only supported for some tree-based estimators\n (namely\n :class:`~sklearn.ensemble.GradientBoostingClassifier`,\n :class:`~sklearn.ensemble.GradientBoostingRegressor`,\n :class:`~sklearn.ensemble.HistGradientBoostingClassifier`,\n :class:`~sklearn.ensemble.HistGradientBoostingRegressor`,\n :class:`~sklearn.tree.DecisionTreeRegressor`,\n :class:`~sklearn.ensemble.RandomForestRegressor`\n but is more efficient in terms of speed.\n With this method, the target response of a\n classifier is always the decision function, not the predicted\n probabilities. Since the `'recursion'` method implicitly computes\n the average of the ICEs by design, it is not compatible with ICE and\n thus `kind` must be `'average'`.\n\n - `'brute'` is supported for any estimator, but is more\n computationally intensive.\n\n - `'auto'`: the `'recursion'` is used for estimators that support it,\n and `'brute'` is used otherwise.\n\n Please see :ref:`this note ` for\n differences between the `'brute'` and `'recursion'` method.\n\n n_jobs : int, default=None\n The number of CPUs to use to compute the partial dependences.\n Computation is parallelized over features specified by the `features`\n parameter.\n\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n verbose : int, default=0\n Verbose output during PD computations.\n\n line_kw : dict, default=None\n Dict with keywords passed to the ``matplotlib.pyplot.plot`` call.\n For one-way partial dependence plots. It can be used to define common\n properties for both `ice_lines_kw` and `pdp_line_kw`.\n\n ice_lines_kw : dict, default=None\n Dictionary with keywords passed to the `matplotlib.pyplot.plot` call.\n For ICE lines in the one-way partial dependence plots.\n The key value pairs defined in `ice_lines_kw` takes priority over\n `line_kw`.\n\n .. versionadded:: 1.0\n\n pd_line_kw : dict, default=None\n Dictionary with keywords passed to the `matplotlib.pyplot.plot` call.\n For partial dependence in one-way partial dependence plots.\n The key value pairs defined in `pd_line_kw` takes priority over\n `line_kw`.\n\n .. versionadded:: 1.0\n\n contour_kw : dict, default=None\n Dict with keywords passed to the ``matplotlib.pyplot.contourf`` call.\n For two-way partial dependence plots.\n\n ax : Matplotlib axes or array-like of Matplotlib axes, default=None\n - If a single axis is passed in, it is treated as a bounding axes\n and a grid of partial dependence plots will be drawn within\n these bounds. 
The `n_cols` parameter controls the number of\n columns in the grid.\n - If an array-like of axes are passed in, the partial dependence\n plots will be drawn directly into these axes.\n - If `None`, a figure and a bounding axes is created and treated\n as the single axes case.\n\n .. versionadded:: 0.22\n\n kind : {'average', 'individual', 'both'}, default='average'\n Whether to plot the partial dependence averaged across all the samples\n in the dataset or one line per sample or both.\n\n - ``kind='average'`` results in the traditional PD plot;\n - ``kind='individual'`` results in the ICE plot.\n\n Note that the fast ``method='recursion'`` option is only available for\n ``kind='average'``. Plotting individual dependencies requires using the\n slower ``method='brute'`` option.\n\n .. versionadded:: 0.24\n\n subsample : float, int or None, default=1000\n Sampling for ICE curves when `kind` is 'individual' or 'both'.\n If `float`, should be between 0.0 and 1.0 and represent the proportion\n of the dataset to be used to plot ICE curves. If `int`, represents the\n absolute number samples to use.\n\n Note that the full dataset is still used to calculate averaged partial\n dependence when `kind='both'`.\n\n .. versionadded:: 0.24\n\n random_state : int, RandomState instance or None, default=None\n Controls the randomness of the selected samples when subsamples is not\n `None` and `kind` is either `'both'` or `'individual'`.\n See :term:`Glossary ` for details.\n\n .. versionadded:: 0.24\n\n Returns\n -------\n display : :class:`~sklearn.inspection.PartialDependenceDisplay`\n\n See Also\n --------\n partial_dependence : Compute Partial Dependence values.\n PartialDependenceDisplay : Partial Dependence visualization.\n PartialDependenceDisplay.from_estimator : Plot Partial Dependence.\n\n Examples\n --------\n >>> import matplotlib.pyplot as plt\n >>> from sklearn.datasets import make_friedman1\n >>> from sklearn.ensemble import GradientBoostingRegressor\n >>> from sklearn.inspection import plot_partial_dependence\n >>> X, y = make_friedman1()\n >>> clf = GradientBoostingRegressor(n_estimators=10).fit(X, y)\n >>> plot_partial_dependence(clf, X, [0, (0, 1)]) # doctest: +SKIP\n <...>\n >>> plt.show() # doctest: +SKIP\n \"\"\"\n check_matplotlib_support('plot_partial_dependence')\n return _plot_partial_dependence(estimator, X, features, feature_names=feature_names, target=target, response_method=response_method, n_cols=n_cols, grid_resolution=grid_resolution, percentiles=percentiles, method=method, n_jobs=n_jobs, verbose=verbose, line_kw=line_kw, ice_lines_kw=ice_lines_kw, pd_line_kw=pd_line_kw, contour_kw=contour_kw, ax=ax, kind=kind, subsample=subsample, random_state=random_state)" }, { @@ -93375,7 +99445,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "top_path", @@ -93385,13 +99456,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef configuration(parent_package='', top_path=None):\n config = Configuration('inspection', parent_package, top_path)\n config.add_subpackage('_plot')\n config.add_subpackage('_plot.tests')\n config.add_subpackage('tests')\n return config" }, { @@ -93409,7 +99481,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -93433,7 +99506,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y_min", 
@@ -93443,7 +99517,8 @@ "docstring": { "type": "float, default=None", "description": "Lower bound on the lowest predicted value (the minimum value may\nstill be higher). If not set, defaults to -inf." - } + }, + "refined_type": {} }, { "name": "y_max", @@ -93453,7 +99528,8 @@ "docstring": { "type": "float, default=None", "description": "Upper bound on the highest predicted value (the maximum may still be\nlower). If not set, defaults to +inf." - } + }, + "refined_type": {} }, { "name": "increasing", @@ -93463,7 +99539,8 @@ "docstring": { "type": "bool or 'auto', default=True", "description": "Determines whether the predictions should be constrained to increase\nor decrease with `X`. 'auto' will decide based on the Spearman\ncorrelation estimate's sign." - } + }, + "refined_type": {} }, { "name": "out_of_bounds", @@ -93473,13 +99550,17 @@ "docstring": { "type": "{'nan', 'clip', 'raise'}, default='nan'", "description": "Handles how `X` values outside of the training domain are handled\nduring prediction.\n\n- 'nan', predictions will be NaN.\n- 'clip', predictions will be set to the value corresponding to\n the nearest train interval endpoint.\n- 'raise', a `ValueError` is raised." + }, + "refined_type": { + "kind": "EnumType", + "values": ["raise", "nan", "clip"] } } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, *, y_min=None, y_max=None, increasing=True, out_of_bounds='nan'):\n self.y_min = y_min\n self.y_max = y_max\n self.increasing = increasing\n self.out_of_bounds = out_of_bounds" }, { @@ -93497,7 +99578,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "state", @@ -93507,13 +99589,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Pickle-protocol - set state of the estimator.\n\nWe need to rebuild the interpolation function.", - "docstring": "Pickle-protocol - set state of the estimator.\n\nWe need to rebuild the interpolation function.", + "docstring": "Pickle-protocol - set state of the estimator.\n\n We need to rebuild the interpolation function.\n ", "source_code": "\ndef __setstate__(self, state):\n \"\"\"Pickle-protocol - set state of the estimator.\n\n We need to rebuild the interpolation function.\n \"\"\"\n super().__setstate__(state)\n if hasattr(self, 'X_thresholds_') and hasattr(self, 'y_thresholds_'):\n self._build_f(self.X_thresholds_, self.y_thresholds_)" }, { @@ -93531,7 +99614,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -93541,7 +99625,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -93551,7 +99636,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -93575,7 +99661,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -93585,7 +99672,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -93595,7 +99683,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -93605,7 +99694,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "trim_duplicates", @@ -93615,7 +99705,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -93639,7 +99730,8 @@ "docstring": { "type": "", 
"description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -93649,13 +99741,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _check_input_data_shape(self, X):\n if not (X.ndim == 1 or X.ndim == 2 and X.shape[1] == 1):\n msg = 'Isotonic regression input X should be a 1d array or 2d array with 1 feature'\n raise ValueError(msg)" }, { @@ -93673,13 +99766,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _more_tags(self):\n return {'X_types': ['1darray']}" }, { @@ -93697,7 +99791,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -93707,7 +99802,8 @@ "docstring": { "type": "array-like of shape (n_samples,) or (n_samples, 1)", "description": "Training data.\n\n.. versionchanged:: 0.24\n Also accepts 2d array with 1 feature." - } + }, + "refined_type": {} }, { "name": "y", @@ -93717,7 +99813,8 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "Training target." - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -93727,13 +99824,14 @@ "docstring": { "type": "array-like of shape (n_samples,), default=None", "description": "Weights. If set to None, all weights will be set to 1 (equal\nweights)." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Fit the model using X, y as training data.", - "docstring": "Fit the model using X, y as training data.\n\nParameters\n----------\nX : array-like of shape (n_samples,) or (n_samples, 1)\n Training data.\n\n .. versionchanged:: 0.24\n Also accepts 2d array with 1 feature.\n\ny : array-like of shape (n_samples,)\n Training target.\n\nsample_weight : array-like of shape (n_samples,), default=None\n Weights. If set to None, all weights will be set to 1 (equal\n weights).\n\nReturns\n-------\nself : object\n Returns an instance of self.\n\nNotes\n-----\nX is stored for future use, as :meth:`transform` needs X to interpolate\nnew input data.", + "docstring": "Fit the model using X, y as training data.\n\n Parameters\n ----------\n X : array-like of shape (n_samples,) or (n_samples, 1)\n Training data.\n\n .. versionchanged:: 0.24\n Also accepts 2d array with 1 feature.\n\n y : array-like of shape (n_samples,)\n Training target.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Weights. If set to None, all weights will be set to 1 (equal\n weights).\n\n Returns\n -------\n self : object\n Returns an instance of self.\n\n Notes\n -----\n X is stored for future use, as :meth:`transform` needs X to interpolate\n new input data.\n ", "source_code": "\ndef fit(self, X, y, sample_weight=None):\n \"\"\"Fit the model using X, y as training data.\n\n Parameters\n ----------\n X : array-like of shape (n_samples,) or (n_samples, 1)\n Training data.\n\n .. versionchanged:: 0.24\n Also accepts 2d array with 1 feature.\n\n y : array-like of shape (n_samples,)\n Training target.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Weights. 
If set to None, all weights will be set to 1 (equal\n weights).\n\n Returns\n -------\n self : object\n Returns an instance of self.\n\n Notes\n -----\n X is stored for future use, as :meth:`transform` needs X to interpolate\n new input data.\n \"\"\"\n check_params = dict(accept_sparse=False, ensure_2d=False)\n X = check_array(X, dtype=[np.float64, np.float32], **check_params)\n y = check_array(y, dtype=X.dtype, **check_params)\n check_consistent_length(X, y, sample_weight)\n (X, y) = self._build_y(X, y, sample_weight)\n (self.X_thresholds_, self.y_thresholds_) = (X, y)\n self._build_f(X, y)\n return self" }, { @@ -93751,7 +99849,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "T", @@ -93761,13 +99860,14 @@ "docstring": { "type": "array-like of shape (n_samples,) or (n_samples, 1)", "description": "Data to transform." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Predict new data by linear interpolation.", - "docstring": "Predict new data by linear interpolation.\n\nParameters\n----------\nT : array-like of shape (n_samples,) or (n_samples, 1)\n Data to transform.\n\nReturns\n-------\ny_pred : ndarray of shape (n_samples,)\n Transformed data.", + "docstring": "Predict new data by linear interpolation.\n\n Parameters\n ----------\n T : array-like of shape (n_samples,) or (n_samples, 1)\n Data to transform.\n\n Returns\n -------\n y_pred : ndarray of shape (n_samples,)\n Transformed data.\n ", "source_code": "\ndef predict(self, T):\n \"\"\"Predict new data by linear interpolation.\n\n Parameters\n ----------\n T : array-like of shape (n_samples,) or (n_samples, 1)\n Data to transform.\n\n Returns\n -------\n y_pred : ndarray of shape (n_samples,)\n Transformed data.\n \"\"\"\n return self.transform(T)" }, { @@ -93785,7 +99885,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "T", @@ -93795,13 +99896,14 @@ "docstring": { "type": "array-like of shape (n_samples,) or (n_samples, 1)", "description": "Data to transform.\n\n.. versionchanged:: 0.24\n Also accepts 2d array with 1 feature." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Transform new data by linear interpolation.", - "docstring": "Transform new data by linear interpolation.\n\nParameters\n----------\nT : array-like of shape (n_samples,) or (n_samples, 1)\n Data to transform.\n\n .. versionchanged:: 0.24\n Also accepts 2d array with 1 feature.\n\nReturns\n-------\ny_pred : ndarray of shape (n_samples,)\n The transformed data.", + "docstring": "Transform new data by linear interpolation.\n\n Parameters\n ----------\n T : array-like of shape (n_samples,) or (n_samples, 1)\n Data to transform.\n\n .. versionchanged:: 0.24\n Also accepts 2d array with 1 feature.\n\n Returns\n -------\n y_pred : ndarray of shape (n_samples,)\n The transformed data.\n ", "source_code": "\ndef transform(self, T):\n \"\"\"Transform new data by linear interpolation.\n\n Parameters\n ----------\n T : array-like of shape (n_samples,) or (n_samples, 1)\n Data to transform.\n\n .. 
versionchanged:: 0.24\n Also accepts 2d array with 1 feature.\n\n Returns\n -------\n y_pred : ndarray of shape (n_samples,)\n The transformed data.\n \"\"\"\n if hasattr(self, 'X_thresholds_'):\n dtype = self.X_thresholds_.dtype\n else:\n dtype = np.float64\n T = check_array(T, dtype=dtype, ensure_2d=False)\n self._check_input_data_shape(T)\n T = T.reshape(-1)\n if self.out_of_bounds not in ['raise', 'nan', 'clip']:\n raise ValueError(\"The argument ``out_of_bounds`` must be in 'nan', 'clip', 'raise'; got {0}\".format(self.out_of_bounds))\n if self.out_of_bounds == 'clip':\n T = np.clip(T, self.X_min_, self.X_max_)\n res = self.f_(T)\n res = res.astype(T.dtype)\n return res" }, { @@ -93819,7 +99921,8 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "Training data." - } + }, + "refined_type": {} }, { "name": "y", @@ -93829,13 +99932,14 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "Training target." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Determine whether y is monotonically correlated with x.\n\ny is found increasing or decreasing with respect to x based on a Spearman correlation test.", - "docstring": "Determine whether y is monotonically correlated with x.\n\ny is found increasing or decreasing with respect to x based on a Spearman\ncorrelation test.\n\nParameters\n----------\nx : array-like of shape (n_samples,)\n Training data.\n\ny : array-like of shape (n_samples,)\n Training target.\n\nReturns\n-------\nincreasing_bool : boolean\n Whether the relationship is increasing or decreasing.\n\nNotes\n-----\nThe Spearman correlation coefficient is estimated from the data, and the\nsign of the resulting estimate is used as the result.\n\nIn the event that the 95% confidence interval based on Fisher transform\nspans zero, a warning is raised.\n\nReferences\n----------\nFisher transformation. Wikipedia.\nhttps://en.wikipedia.org/wiki/Fisher_transformation", + "description": "Determine whether y is monotonically correlated with x.\n\ny is found increasing or decreasing with respect to x based on a Spearman\ncorrelation test.", + "docstring": "Determine whether y is monotonically correlated with x.\n\n y is found increasing or decreasing with respect to x based on a Spearman\n correlation test.\n\n Parameters\n ----------\n x : array-like of shape (n_samples,)\n Training data.\n\n y : array-like of shape (n_samples,)\n Training target.\n\n Returns\n -------\n increasing_bool : boolean\n Whether the relationship is increasing or decreasing.\n\n Notes\n -----\n The Spearman correlation coefficient is estimated from the data, and the\n sign of the resulting estimate is used as the result.\n\n In the event that the 95% confidence interval based on Fisher transform\n spans zero, a warning is raised.\n\n References\n ----------\n Fisher transformation. 
Wikipedia.\n https://en.wikipedia.org/wiki/Fisher_transformation\n ", "source_code": "\ndef check_increasing(x, y):\n \"\"\"Determine whether y is monotonically correlated with x.\n\n y is found increasing or decreasing with respect to x based on a Spearman\n correlation test.\n\n Parameters\n ----------\n x : array-like of shape (n_samples,)\n Training data.\n\n y : array-like of shape (n_samples,)\n Training target.\n\n Returns\n -------\n increasing_bool : boolean\n Whether the relationship is increasing or decreasing.\n\n Notes\n -----\n The Spearman correlation coefficient is estimated from the data, and the\n sign of the resulting estimate is used as the result.\n\n In the event that the 95% confidence interval based on Fisher transform\n spans zero, a warning is raised.\n\n References\n ----------\n Fisher transformation. Wikipedia.\n https://en.wikipedia.org/wiki/Fisher_transformation\n \"\"\"\n (rho, _) = spearmanr(x, y)\n increasing_bool = rho >= 0\n if rho not in [-1.0, 1.0] and len(x) > 3:\n F = 0.5 * math.log((1.0 + rho) / (1.0 - rho))\n F_se = 1 / math.sqrt(len(x) - 3)\n rho_0 = math.tanh(F - 1.96 * F_se)\n rho_1 = math.tanh(F + 1.96 * F_se)\n if np.sign(rho_0) != np.sign(rho_1):\n warnings.warn('Confidence interval of the Spearman correlation coefficient spans zero. Determination of ``increasing`` may be suspect.')\n return increasing_bool" }, { @@ -93853,7 +99957,8 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "The data." - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -93863,7 +99968,8 @@ "docstring": { "type": "array-like of shape (n_samples,), default=None", "description": "Weights on each point of the regression.\nIf None, weight is set to 1 (equal weights)." - } + }, + "refined_type": {} }, { "name": "y_min", @@ -93873,7 +99979,8 @@ "docstring": { "type": "float, default=None", "description": "Lower bound on the lowest predicted value (the minimum value may\nstill be higher). If not set, defaults to -inf." - } + }, + "refined_type": {} }, { "name": "y_max", @@ -93883,7 +99990,8 @@ "docstring": { "type": "float, default=None", "description": "Upper bound on the highest predicted value (the maximum may still be\nlower). If not set, defaults to +inf." - } + }, + "refined_type": {} }, { "name": "increasing", @@ -93893,13 +100001,14 @@ "docstring": { "type": "bool, default=True", "description": "Whether to compute ``y_`` is increasing (if set to True) or decreasing\n(if set to False)" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Solve the isotonic regression model.\n\nRead more in the :ref:`User Guide `.", - "docstring": "Solve the isotonic regression model.\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\ny : array-like of shape (n_samples,)\n The data.\n\nsample_weight : array-like of shape (n_samples,), default=None\n Weights on each point of the regression.\n If None, weight is set to 1 (equal weights).\n\ny_min : float, default=None\n Lower bound on the lowest predicted value (the minimum value may\n still be higher). If not set, defaults to -inf.\n\ny_max : float, default=None\n Upper bound on the highest predicted value (the maximum may still be\n lower). 
If not set, defaults to +inf.\n\nincreasing : bool, default=True\n Whether to compute ``y_`` is increasing (if set to True) or decreasing\n (if set to False)\n\nReturns\n-------\ny_ : list of floats\n Isotonic fit of y.\n\nReferences\n----------\n\"Active set algorithms for isotonic regression; A unifying framework\"\nby Michael J. Best and Nilotpal Chakravarti, section 3.", + "docstring": "Solve the isotonic regression model.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n y : array-like of shape (n_samples,)\n The data.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Weights on each point of the regression.\n If None, weight is set to 1 (equal weights).\n\n y_min : float, default=None\n Lower bound on the lowest predicted value (the minimum value may\n still be higher). If not set, defaults to -inf.\n\n y_max : float, default=None\n Upper bound on the highest predicted value (the maximum may still be\n lower). If not set, defaults to +inf.\n\n increasing : bool, default=True\n Whether to compute ``y_`` is increasing (if set to True) or decreasing\n (if set to False)\n\n Returns\n -------\n y_ : list of floats\n Isotonic fit of y.\n\n References\n ----------\n \"Active set algorithms for isotonic regression; A unifying framework\"\n by Michael J. Best and Nilotpal Chakravarti, section 3.\n ", "source_code": "\ndef isotonic_regression(y, *, sample_weight=None, y_min=None, y_max=None, increasing=True):\n \"\"\"Solve the isotonic regression model.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n y : array-like of shape (n_samples,)\n The data.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Weights on each point of the regression.\n If None, weight is set to 1 (equal weights).\n\n y_min : float, default=None\n Lower bound on the lowest predicted value (the minimum value may\n still be higher). If not set, defaults to -inf.\n\n y_max : float, default=None\n Upper bound on the highest predicted value (the maximum may still be\n lower). If not set, defaults to +inf.\n\n increasing : bool, default=True\n Whether to compute ``y_`` is increasing (if set to True) or decreasing\n (if set to False)\n\n Returns\n -------\n y_ : list of floats\n Isotonic fit of y.\n\n References\n ----------\n \"Active set algorithms for isotonic regression; A unifying framework\"\n by Michael J. Best and Nilotpal Chakravarti, section 3.\n \"\"\"\n order = np.s_[:] if increasing else np.s_[::-1]\n y = check_array(y, ensure_2d=False, dtype=[np.float64, np.float32])\n y = np.array(y[order], dtype=y.dtype)\n sample_weight = _check_sample_weight(sample_weight, y, dtype=y.dtype, copy=True)\n sample_weight = np.ascontiguousarray(sample_weight[order])\n _inplace_contiguous_isotonic_regression(y, sample_weight)\n if y_min is not None or y_max is not None:\n if y_min is None:\n y_min = -np.inf\n if y_max is None:\n y_max = np.inf\n np.clip(y, y_min, y_max, y)\n return y[order]" }, { @@ -93917,7 +100026,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "sample_steps", @@ -93927,7 +100037,8 @@ "docstring": { "type": "int, default=2", "description": "Gives the number of (complex) sampling points." - } + }, + "refined_type": {} }, { "name": "sample_interval", @@ -93937,13 +100048,17 @@ "docstring": { "type": "float, default=None", "description": "Sampling interval. Must be specified when sample_steps not in {1,2,3}." 
+ }, + "refined_type": { + "kind": "EnumType", + "values": [] } } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, *, sample_steps=2, sample_interval=None):\n self.sample_steps = sample_steps\n self.sample_interval = sample_interval" }, { @@ -93961,13 +100076,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _more_tags(self):\n return {'stateless': True, 'requires_positive_X': True}" }, { @@ -93985,7 +100101,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -93995,13 +100112,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _transform_dense(self, X):\n non_zero = X != 0.0\n X_nz = X[non_zero]\n X_step = np.zeros_like(X)\n X_step[non_zero] = np.sqrt(X_nz * self.sample_interval_)\n X_new = [X_step]\n log_step_nz = self.sample_interval_ * np.log(X_nz)\n step_nz = 2 * X_nz * self.sample_interval_\n for j in range(1, self.sample_steps):\n factor_nz = np.sqrt(step_nz / np.cosh(np.pi * j * self.sample_interval_))\n X_step = np.zeros_like(X)\n X_step[non_zero] = factor_nz * np.cos(j * log_step_nz)\n X_new.append(X_step)\n X_step = np.zeros_like(X)\n X_step[non_zero] = factor_nz * np.sin(j * log_step_nz)\n X_new.append(X_step)\n return np.hstack(X_new)" }, { @@ -94019,7 +100137,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -94029,13 +100148,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _transform_sparse(self, X):\n indices = X.indices.copy()\n indptr = X.indptr.copy()\n data_step = np.sqrt(X.data * self.sample_interval_)\n X_step = sp.csr_matrix((data_step, indices, indptr), shape=X.shape, dtype=X.dtype, copy=False)\n X_new = [X_step]\n log_step_nz = self.sample_interval_ * np.log(X.data)\n step_nz = 2 * X.data * self.sample_interval_\n for j in range(1, self.sample_steps):\n factor_nz = np.sqrt(step_nz / np.cosh(np.pi * j * self.sample_interval_))\n data_step = factor_nz * np.cos(j * log_step_nz)\n X_step = sp.csr_matrix((data_step, indices, indptr), shape=X.shape, dtype=X.dtype, copy=False)\n X_new.append(X_step)\n data_step = factor_nz * np.sin(j * log_step_nz)\n X_step = sp.csr_matrix((data_step, indices, indptr), shape=X.shape, dtype=X.dtype, copy=False)\n X_new.append(X_step)\n return sp.hstack(X_new)" }, { @@ -94053,7 +100173,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -94063,7 +100184,8 @@ "docstring": { "type": "array-like, shape (n_samples, n_features)", "description": "Training data, where `n_samples` is the number of samples\nand `n_features` is the number of features." - } + }, + "refined_type": {} }, { "name": "y", @@ -94073,13 +100195,14 @@ "docstring": { "type": "array-like, shape (n_samples,) or (n_samples, n_outputs), default=None", "description": "Target values (None for unsupervised transformations)." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Set the parameters.", - "docstring": "Set the parameters.\n\nParameters\n----------\nX : array-like, shape (n_samples, n_features)\n Training data, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\ny : array-like, shape (n_samples,) or (n_samples, n_outputs), default=None\n Target values (None for unsupervised transformations).\n\nReturns\n-------\nself : object\n Returns the transformer.", + "docstring": "Set the parameters.\n\n Parameters\n ----------\n X : array-like, shape (n_samples, n_features)\n Training data, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n y : array-like, shape (n_samples,) or (n_samples, n_outputs), default=None\n Target values (None for unsupervised transformations).\n\n Returns\n -------\n self : object\n Returns the transformer.\n ", "source_code": "\ndef fit(self, X, y=None):\n \"\"\"Set the parameters.\n\n Parameters\n ----------\n X : array-like, shape (n_samples, n_features)\n Training data, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n y : array-like, shape (n_samples,) or (n_samples, n_outputs), default=None\n Target values (None for unsupervised transformations).\n\n Returns\n -------\n self : object\n Returns the transformer.\n \"\"\"\n X = self._validate_data(X, accept_sparse='csr')\n check_non_negative(X, 'X in AdditiveChi2Sampler.fit')\n if self.sample_interval is None:\n if self.sample_steps == 1:\n self.sample_interval_ = 0.8\n elif self.sample_steps == 2:\n self.sample_interval_ = 0.5\n elif self.sample_steps == 3:\n self.sample_interval_ = 0.4\n else:\n raise ValueError('If sample_steps is not in [1, 2, 3], you need to provide sample_interval')\n else:\n self.sample_interval_ = self.sample_interval\n return self" }, { @@ -94097,7 +100220,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -94107,13 +100231,17 @@ "docstring": { "type": "{array-like, sparse matrix}, shape (n_samples, n_features)", "description": "Training data, where `n_samples` is the number of samples\nand `n_features` is the number of features." 
+ }, + "refined_type": { + "kind": "EnumType", + "values": [] } } ], "results": [], "is_public": true, "description": "Apply approximate feature map to X.", - "docstring": "Apply approximate feature map to X.\n\nParameters\n----------\nX : {array-like, sparse matrix}, shape (n_samples, n_features)\n Training data, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\nReturns\n-------\nX_new : {ndarray, sparse matrix}, shape = (n_samples, n_features * (2*sample_steps + 1))\n Whether the return value is an array or sparse matrix depends on\n the type of the input X.", + "docstring": "Apply approximate feature map to X.\n\n Parameters\n ----------\n X : {array-like, sparse matrix}, shape (n_samples, n_features)\n Training data, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n Returns\n -------\n X_new : {ndarray, sparse matrix}, shape = (n_samples, n_features * (2*sample_steps + 1))\n Whether the return value is an array or sparse matrix depends on\n the type of the input X.\n ", "source_code": "\ndef transform(self, X):\n \"\"\"Apply approximate feature map to X.\n\n Parameters\n ----------\n X : {array-like, sparse matrix}, shape (n_samples, n_features)\n Training data, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n Returns\n -------\n X_new : {ndarray, sparse matrix}, shape = (n_samples, n_features * (2*sample_steps + 1))\n Whether the return value is an array or sparse matrix depends on\n the type of the input X.\n \"\"\"\n msg = '%(name)s is not fitted. Call fit to set the parameters before calling transform'\n check_is_fitted(self, msg=msg)\n X = self._validate_data(X, accept_sparse='csr', reset=False)\n check_non_negative(X, 'X in AdditiveChi2Sampler.transform')\n sparse = sp.issparse(X)\n transf = self._transform_sparse if sparse else self._transform_dense\n return transf(X)" }, { @@ -94131,7 +100259,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "kernel", @@ -94141,7 +100270,8 @@ "docstring": { "type": "str or callable, default='rbf'", "description": "Kernel map to be approximated. A callable should accept two arguments\nand the keyword arguments passed to this object as `kernel_params`, and\nshould return a floating point number." - } + }, + "refined_type": {} }, { "name": "gamma", @@ -94151,7 +100281,8 @@ "docstring": { "type": "float, default=None", "description": "Gamma parameter for the RBF, laplacian, polynomial, exponential chi2\nand sigmoid kernels. Interpretation of the default value is left to\nthe kernel; see the documentation for sklearn.metrics.pairwise.\nIgnored by other kernels." - } + }, + "refined_type": {} }, { "name": "coef0", @@ -94161,7 +100292,8 @@ "docstring": { "type": "float, default=None", "description": "Zero coefficient for polynomial and sigmoid kernels.\nIgnored by other kernels." - } + }, + "refined_type": {} }, { "name": "degree", @@ -94171,7 +100303,8 @@ "docstring": { "type": "float, default=None", "description": "Degree of the polynomial kernel. Ignored by other kernels." - } + }, + "refined_type": {} }, { "name": "kernel_params", @@ -94181,7 +100314,8 @@ "docstring": { "type": "dict, default=None", "description": "Additional parameters (keyword arguments) for kernel function passed\nas callable object." 
- } + }, + "refined_type": {} }, { "name": "n_components", @@ -94191,7 +100325,8 @@ "docstring": { "type": "int, default=100", "description": "Number of features to construct.\nHow many data points will be used to construct the mapping." - } + }, + "refined_type": {} }, { "name": "random_state", @@ -94201,7 +100336,8 @@ "docstring": { "type": "int, RandomState instance or None, default=None", "description": "Pseudo-random number generator to control the uniform sampling without\nreplacement of `n_components` of the training data to construct the\nbasis kernel.\nPass an int for reproducible output across multiple function calls.\nSee :term:`Glossary `." - } + }, + "refined_type": {} }, { "name": "n_jobs", @@ -94211,13 +100347,14 @@ "docstring": { "type": "int, default=None", "description": "The number of jobs to use for the computation. This works by breaking\ndown the kernel matrix into `n_jobs` even slices and computing them in\nparallel.\n\n``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n``-1`` means using all processors. See :term:`Glossary `\nfor more details.\n\n.. versionadded:: 0.24" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, kernel='rbf', *, gamma=None, coef0=None, degree=None, kernel_params=None, n_components=100, random_state=None, n_jobs=None):\n self.kernel = kernel\n self.gamma = gamma\n self.coef0 = coef0\n self.degree = degree\n self.kernel_params = kernel_params\n self.n_components = n_components\n self.random_state = random_state\n self.n_jobs = n_jobs" }, { @@ -94235,13 +100372,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _get_kernel_params(self):\n params = self.kernel_params\n if params is None:\n params = {}\n if not callable(self.kernel) and self.kernel != 'precomputed':\n for param in KERNEL_PARAMS[self.kernel]:\n if getattr(self, param) is not None:\n params[param] = getattr(self, param)\n elif self.gamma is not None or self.coef0 is not None or self.degree is not None:\n raise ValueError(\"Don't pass gamma, coef0 or degree to Nystroem if using a callable or precomputed kernel\")\n return params" }, { @@ -94259,13 +100397,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _more_tags(self):\n return {'_xfail_checks': {'check_transformer_preserve_dtypes': 'dtypes are preserved but not at a close enough precision'}, 'preserves_dtype': [np.float64, np.float32]}" }, { @@ -94283,7 +100422,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -94293,7 +100433,8 @@ "docstring": { "type": "array-like, shape (n_samples, n_features)", "description": "Training data, where `n_samples` is the number of samples\nand `n_features` is the number of features." - } + }, + "refined_type": {} }, { "name": "y", @@ -94303,13 +100444,14 @@ "docstring": { "type": "array-like, shape (n_samples,) or (n_samples, n_outputs), default=None", "description": "Target values (None for unsupervised transformations)." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Fit estimator to data.\n\nSamples a subset of training points, computes kernel on these and computes normalization matrix.", - "docstring": "Fit estimator to data.\n\nSamples a subset of training points, computes kernel\non these and computes normalization matrix.\n\nParameters\n----------\nX : array-like, shape (n_samples, n_features)\n Training data, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\ny : array-like, shape (n_samples,) or (n_samples, n_outputs), default=None\n Target values (None for unsupervised transformations).\n\nReturns\n-------\nself : object\n Returns the instance itself.", + "description": "Fit estimator to data.\n\nSamples a subset of training points, computes kernel\non these and computes normalization matrix.", + "docstring": "Fit estimator to data.\n\n Samples a subset of training points, computes kernel\n on these and computes normalization matrix.\n\n Parameters\n ----------\n X : array-like, shape (n_samples, n_features)\n Training data, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n y : array-like, shape (n_samples,) or (n_samples, n_outputs), default=None\n Target values (None for unsupervised transformations).\n\n Returns\n -------\n self : object\n Returns the instance itself.\n ", "source_code": "\ndef fit(self, X, y=None):\n \"\"\"Fit estimator to data.\n\n Samples a subset of training points, computes kernel\n on these and computes normalization matrix.\n\n Parameters\n ----------\n X : array-like, shape (n_samples, n_features)\n Training data, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n y : array-like, shape (n_samples,) or (n_samples, n_outputs), default=None\n Target values (None for unsupervised transformations).\n\n Returns\n -------\n self : object\n Returns the instance itself.\n \"\"\"\n X = self._validate_data(X, accept_sparse='csr')\n rnd = check_random_state(self.random_state)\n n_samples = X.shape[0]\n if self.n_components > n_samples:\n n_components = n_samples\n warnings.warn('n_components > n_samples. This is not possible.\\nn_components was set to n_samples, which results in inefficient evaluation of the full kernel.')\n else:\n n_components = self.n_components\n n_components = min(n_samples, n_components)\n inds = rnd.permutation(n_samples)\n basis_inds = inds[:n_components]\n basis = X[basis_inds]\n basis_kernel = pairwise_kernels(basis, metric=self.kernel, filter_params=True, n_jobs=self.n_jobs, **self._get_kernel_params())\n (U, S, V) = svd(basis_kernel)\n S = np.maximum(S, 1e-12)\n self.normalization_ = np.dot(U / np.sqrt(S), V)\n self.components_ = basis\n self.component_indices_ = basis_inds\n return self" }, { @@ -94327,7 +100469,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -94337,13 +100480,14 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "Data to transform." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Apply feature map to X.\n\nComputes an approximate feature map using the kernel between some training points and X.", - "docstring": "Apply feature map to X.\n\nComputes an approximate feature map using the kernel\nbetween some training points and X.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n Data to transform.\n\nReturns\n-------\nX_transformed : ndarray of shape (n_samples, n_components)\n Transformed data.", + "description": "Apply feature map to X.\n\nComputes an approximate feature map using the kernel\nbetween some training points and X.", + "docstring": "Apply feature map to X.\n\n Computes an approximate feature map using the kernel\n between some training points and X.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Data to transform.\n\n Returns\n -------\n X_transformed : ndarray of shape (n_samples, n_components)\n Transformed data.\n ", "source_code": "\ndef transform(self, X):\n \"\"\"Apply feature map to X.\n\n Computes an approximate feature map using the kernel\n between some training points and X.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Data to transform.\n\n Returns\n -------\n X_transformed : ndarray of shape (n_samples, n_components)\n Transformed data.\n \"\"\"\n check_is_fitted(self)\n X = self._validate_data(X, accept_sparse='csr', reset=False)\n kernel_params = self._get_kernel_params()\n embedded = pairwise_kernels(X, self.components_, metric=self.kernel, filter_params=True, n_jobs=self.n_jobs, **kernel_params)\n return np.dot(embedded, self.normalization_.T)" }, { @@ -94361,7 +100505,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "gamma", @@ -94371,7 +100516,8 @@ "docstring": { "type": "float, default=1.0", "description": "Parameter of the polynomial kernel whose feature map\nwill be approximated." - } + }, + "refined_type": {} }, { "name": "degree", @@ -94381,7 +100527,8 @@ "docstring": { "type": "int, default=2", "description": "Degree of the polynomial kernel whose feature map\nwill be approximated." - } + }, + "refined_type": {} }, { "name": "coef0", @@ -94391,7 +100538,8 @@ "docstring": { "type": "int, default=0", "description": "Constant term of the polynomial kernel whose feature map\nwill be approximated." - } + }, + "refined_type": {} }, { "name": "n_components", @@ -94401,7 +100549,8 @@ "docstring": { "type": "int, default=100", "description": "Dimensionality of the output feature space. Usually, `n_components`\nshould be greater than the number of features in input samples in\norder to achieve good performance. The optimal score / run time\nbalance is typically achieved around `n_components` = 10 * `n_features`,\nbut this depends on the specific dataset being used." - } + }, + "refined_type": {} }, { "name": "random_state", @@ -94411,13 +100560,14 @@ "docstring": { "type": "int, RandomState instance, default=None", "description": "Determines random number generation for indexHash and bitHash\ninitialization. Pass an int for reproducible results across multiple\nfunction calls. See :term:`Glossary `." 
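The Nystroem entries above describe fit (sample a basis of training points, take the SVD of the kernel on that basis to build `normalization_`) and transform (evaluate the kernel between new data and `components_`, then project through that normalization). A minimal usage sketch, assuming a current scikit-learn install; the SGDClassifier pipeline is only illustrative and not part of the recorded API data:

    # Approximate an RBF kernel with Nystroem, then fit a linear model on the map.
    from sklearn.datasets import make_classification
    from sklearn.kernel_approximation import Nystroem
    from sklearn.linear_model import SGDClassifier
    from sklearn.pipeline import make_pipeline

    X, y = make_classification(n_samples=500, n_features=20, random_state=0)

    # n_components controls how many training points are sampled to build the basis.
    feature_map = Nystroem(kernel="rbf", gamma=0.2, n_components=100, random_state=0)
    clf = make_pipeline(feature_map, SGDClassifier(max_iter=1000, random_state=0))
    clf.fit(X, y)
    print(clf.score(X, y))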
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, *, gamma=1.0, degree=2, coef0=0, n_components=100, random_state=None):\n self.gamma = gamma\n self.degree = degree\n self.coef0 = coef0\n self.n_components = n_components\n self.random_state = random_state" }, { @@ -94435,7 +100585,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -94445,6 +100596,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "Training data, where `n_samples` is the number of samples\nand `n_features` is the number of features." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -94455,13 +100610,14 @@ "docstring": { "type": "array-like of shape (n_samples,) or (n_samples, n_outputs), default=None", "description": "Target values (None for unsupervised transformations)." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Fit the model with X.\n\nInitializes the internal variables. The method needs no information about the distribution of data, so we only care about n_features in X.", - "docstring": "Fit the model with X.\n\nInitializes the internal variables. The method needs no information\nabout the distribution of data, so we only care about n_features in X.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training data, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\ny : array-like of shape (n_samples,) or (n_samples, n_outputs), default=None\n Target values (None for unsupervised transformations).\n\nReturns\n-------\nself : object\n Returns the instance itself.", + "description": "Fit the model with X.\n\nInitializes the internal variables. The method needs no information\nabout the distribution of data, so we only care about n_features in X.", + "docstring": "Fit the model with X.\n\n Initializes the internal variables. The method needs no information\n about the distribution of data, so we only care about n_features in X.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training data, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n y : array-like of shape (n_samples,) or (n_samples, n_outputs), default=None\n Target values (None for unsupervised transformations).\n\n Returns\n -------\n self : object\n Returns the instance itself.\n ", "source_code": "\ndef fit(self, X, y=None):\n \"\"\"Fit the model with X.\n\n Initializes the internal variables. 
The method needs no information\n about the distribution of data, so we only care about n_features in X.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training data, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n y : array-like of shape (n_samples,) or (n_samples, n_outputs), default=None\n Target values (None for unsupervised transformations).\n\n Returns\n -------\n self : object\n Returns the instance itself.\n \"\"\"\n if not self.degree >= 1:\n raise ValueError(f'degree={self.degree} should be >=1.')\n X = self._validate_data(X, accept_sparse='csc')\n random_state = check_random_state(self.random_state)\n n_features = X.shape[1]\n if self.coef0 != 0:\n n_features += 1\n self.indexHash_ = random_state.randint(0, high=self.n_components, size=(self.degree, n_features))\n self.bitHash_ = random_state.choice(a=[-1, 1], size=(self.degree, n_features))\n return self" }, { @@ -94479,7 +100635,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -94489,13 +100646,17 @@ "docstring": { "type": "{array-like}, shape (n_samples, n_features)", "description": "New data, where `n_samples` is the number of samples\nand `n_features` is the number of features." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } } ], "results": [], "is_public": true, "description": "Generate the feature map approximation for X.", - "docstring": "Generate the feature map approximation for X.\n\nParameters\n----------\nX : {array-like}, shape (n_samples, n_features)\n New data, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\nReturns\n-------\nX_new : array-like, shape (n_samples, n_components)\n Returns the instance itself.", + "docstring": "Generate the feature map approximation for X.\n\n Parameters\n ----------\n X : {array-like}, shape (n_samples, n_features)\n New data, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n Returns\n -------\n X_new : array-like, shape (n_samples, n_components)\n Returns the instance itself.\n ", "source_code": "\ndef transform(self, X):\n \"\"\"Generate the feature map approximation for X.\n\n Parameters\n ----------\n X : {array-like}, shape (n_samples, n_features)\n New data, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n Returns\n -------\n X_new : array-like, shape (n_samples, n_components)\n Returns the instance itself.\n \"\"\"\n check_is_fitted(self)\n X = self._validate_data(X, accept_sparse='csc', reset=False)\n X_gamma = np.sqrt(self.gamma) * X\n if sp.issparse(X_gamma) and self.coef0 != 0:\n X_gamma = sp.hstack([X_gamma, np.sqrt(self.coef0) * np.ones((X_gamma.shape[0], 1))], format='csc')\n elif not sp.issparse(X_gamma) and self.coef0 != 0:\n X_gamma = np.hstack([X_gamma, np.sqrt(self.coef0) * np.ones((X_gamma.shape[0], 1))])\n if X_gamma.shape[1] != self.indexHash_.shape[1]:\n raise ValueError('Number of features of test samples does not match that of training samples.')\n count_sketches = np.zeros((X_gamma.shape[0], self.degree, self.n_components))\n if sp.issparse(X_gamma):\n for j in range(X_gamma.shape[1]):\n for d in range(self.degree):\n iHashIndex = self.indexHash_[d, j]\n iHashBit = self.bitHash_[d, j]\n count_sketches[:, d, iHashIndex] += (iHashBit * X_gamma[:, j]).toarray().ravel()\n else:\n for j in range(X_gamma.shape[1]):\n for d in range(self.degree):\n iHashIndex = 
self.indexHash_[d, j]\n iHashBit = self.bitHash_[d, j]\n count_sketches[:, d, iHashIndex] += iHashBit * X_gamma[:, j]\n count_sketches_fft = fft(count_sketches, axis=2, overwrite_x=True)\n count_sketches_fft_prod = np.prod(count_sketches_fft, axis=1)\n data_sketch = np.real(ifft(count_sketches_fft_prod, overwrite_x=True))\n return data_sketch" }, { @@ -94513,7 +100674,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "gamma", @@ -94523,7 +100685,8 @@ "docstring": { "type": "float, default=1.0", "description": "Parameter of RBF kernel: exp(-gamma * x^2)." - } + }, + "refined_type": {} }, { "name": "n_components", @@ -94533,7 +100696,8 @@ "docstring": { "type": "int, default=100", "description": "Number of Monte Carlo samples per original feature.\nEquals the dimensionality of the computed feature space." - } + }, + "refined_type": {} }, { "name": "random_state", @@ -94543,13 +100707,14 @@ "docstring": { "type": "int, RandomState instance or None, default=None", "description": "Pseudo-random number generator to control the generation of the random\nweights and random offset when fitting the training data.\nPass an int for reproducible output across multiple function calls.\nSee :term:`Glossary `." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, *, gamma=1.0, n_components=100, random_state=None):\n self.gamma = gamma\n self.n_components = n_components\n self.random_state = random_state" }, { @@ -94567,7 +100732,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -94577,6 +100743,10 @@ "docstring": { "type": "{array-like, sparse matrix}, shape (n_samples, n_features)", "description": "Training data, where `n_samples` is the number of samples\nand `n_features` is the number of features." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -94587,13 +100757,14 @@ "docstring": { "type": "array-like, shape (n_samples,) or (n_samples, n_outputs), default=None", "description": "Target values (None for unsupervised transformations)." 
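The PolynomialCountSketch entries above show that fit only draws `indexHash_`/`bitHash_` from the number of features, while transform builds per-degree count sketches and multiplies them in Fourier space, approximating the polynomial kernel (gamma * x.y + coef0)**degree. A minimal sketch under that reading, assuming scikit-learn >= 0.24 where this class exists; the error comparison is illustrative only:

    # TensorSketch approximation of a degree-2 polynomial kernel.
    import numpy as np
    from sklearn.kernel_approximation import PolynomialCountSketch

    rng = np.random.RandomState(0)
    X = rng.rand(50, 10)

    ps = PolynomialCountSketch(degree=2, gamma=1.0, coef0=0, n_components=300, random_state=0)
    X_features = ps.fit_transform(X)          # shape (50, 300)

    # Inner products in the sketched space approximate (gamma * x.y + coef0) ** degree.
    approx = X_features @ X_features.T
    exact = (1.0 * X @ X.T) ** 2
    print(np.abs(approx - exact).mean())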
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Fit the model with X.\n\nSamples random projection according to n_features.", - "docstring": "Fit the model with X.\n\nSamples random projection according to n_features.\n\nParameters\n----------\nX : {array-like, sparse matrix}, shape (n_samples, n_features)\n Training data, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\ny : array-like, shape (n_samples,) or (n_samples, n_outputs), default=None\n Target values (None for unsupervised transformations).\n\nReturns\n-------\nself : object\n Returns the instance itself.", + "docstring": "Fit the model with X.\n\n Samples random projection according to n_features.\n\n Parameters\n ----------\n X : {array-like, sparse matrix}, shape (n_samples, n_features)\n Training data, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n y : array-like, shape (n_samples,) or (n_samples, n_outputs), default=None\n Target values (None for unsupervised transformations).\n\n Returns\n -------\n self : object\n Returns the instance itself.\n ", "source_code": "\ndef fit(self, X, y=None):\n \"\"\"Fit the model with X.\n\n Samples random projection according to n_features.\n\n Parameters\n ----------\n X : {array-like, sparse matrix}, shape (n_samples, n_features)\n Training data, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n y : array-like, shape (n_samples,) or (n_samples, n_outputs), default=None\n Target values (None for unsupervised transformations).\n\n Returns\n -------\n self : object\n Returns the instance itself.\n \"\"\"\n X = self._validate_data(X, accept_sparse='csr')\n random_state = check_random_state(self.random_state)\n n_features = X.shape[1]\n self.random_weights_ = np.sqrt(2 * self.gamma) * random_state.normal(size=(n_features, self.n_components))\n self.random_offset_ = random_state.uniform(0, 2 * np.pi, size=self.n_components)\n return self" }, { @@ -94611,7 +100782,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -94621,13 +100793,17 @@ "docstring": { "type": "{array-like, sparse matrix}, shape (n_samples, n_features)", "description": "New data, where `n_samples` is the number of samples\nand `n_features` is the number of features." 
+ }, + "refined_type": { + "kind": "EnumType", + "values": [] } } ], "results": [], "is_public": true, "description": "Apply the approximate feature map to X.", - "docstring": "Apply the approximate feature map to X.\n\nParameters\n----------\nX : {array-like, sparse matrix}, shape (n_samples, n_features)\n New data, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\nReturns\n-------\nX_new : array-like, shape (n_samples, n_components)\n Returns the instance itself.", + "docstring": "Apply the approximate feature map to X.\n\n Parameters\n ----------\n X : {array-like, sparse matrix}, shape (n_samples, n_features)\n New data, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n Returns\n -------\n X_new : array-like, shape (n_samples, n_components)\n Returns the instance itself.\n ", "source_code": "\ndef transform(self, X):\n \"\"\"Apply the approximate feature map to X.\n\n Parameters\n ----------\n X : {array-like, sparse matrix}, shape (n_samples, n_features)\n New data, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n Returns\n -------\n X_new : array-like, shape (n_samples, n_components)\n Returns the instance itself.\n \"\"\"\n check_is_fitted(self)\n X = self._validate_data(X, accept_sparse='csr', reset=False)\n projection = safe_sparse_dot(X, self.random_weights_)\n projection += self.random_offset_\n np.cos(projection, projection)\n projection *= np.sqrt(2.0) / np.sqrt(self.n_components)\n return projection" }, { @@ -94645,7 +100821,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "skewedness", @@ -94655,7 +100832,8 @@ "docstring": { "type": "float, default=1.0", "description": "\"skewedness\" parameter of the kernel. Needs to be cross-validated." - } + }, + "refined_type": {} }, { "name": "n_components", @@ -94665,7 +100843,8 @@ "docstring": { "type": "int, default=100", "description": "Number of Monte Carlo samples per original feature.\nEquals the dimensionality of the computed feature space." - } + }, + "refined_type": {} }, { "name": "random_state", @@ -94675,13 +100854,14 @@ "docstring": { "type": "int, RandomState instance or None, default=None", "description": "Pseudo-random number generator to control the generation of the random\nweights and random offset when fitting the training data.\nPass an int for reproducible output across multiple function calls.\nSee :term:`Glossary `." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, *, skewedness=1.0, n_components=100, random_state=None):\n self.skewedness = skewedness\n self.n_components = n_components\n self.random_state = random_state" }, { @@ -94699,7 +100879,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -94709,7 +100890,8 @@ "docstring": { "type": "array-like, shape (n_samples, n_features)", "description": "Training data, where `n_samples` is the number of samples\nand `n_features` is the number of features." - } + }, + "refined_type": {} }, { "name": "y", @@ -94719,13 +100901,14 @@ "docstring": { "type": "array-like, shape (n_samples,) or (n_samples, n_outputs), default=None", "description": "Target values (None for unsupervised transformations)." 
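The RBFSampler entries above implement random Fourier features: fit draws `random_weights_` and `random_offset_`, and transform computes cos(X @ W + b) scaled by sqrt(2 / n_components), so inner products of the mapped data approximate the RBF kernel. A minimal sketch, assuming scikit-learn is available; the comparison against `rbf_kernel` is only a sanity check:

    # Random Fourier features for the RBF kernel (Rahimi & Recht style map).
    import numpy as np
    from sklearn.kernel_approximation import RBFSampler
    from sklearn.metrics.pairwise import rbf_kernel

    rng = np.random.RandomState(0)
    X = rng.rand(60, 5)

    rbf_feature = RBFSampler(gamma=0.5, n_components=2000, random_state=0)
    X_features = rbf_feature.fit_transform(X)

    # Dot products of the mapped data approximate the exact RBF kernel matrix.
    approx = X_features @ X_features.T
    exact = rbf_kernel(X, gamma=0.5)
    print(np.abs(approx - exact).max())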
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Fit the model with X.\n\nSamples random projection according to n_features.", - "docstring": "Fit the model with X.\n\nSamples random projection according to n_features.\n\nParameters\n----------\nX : array-like, shape (n_samples, n_features)\n Training data, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\ny : array-like, shape (n_samples,) or (n_samples, n_outputs), default=None\n Target values (None for unsupervised transformations).\n\nReturns\n-------\nself : object\n Returns the instance itself.", + "docstring": "Fit the model with X.\n\n Samples random projection according to n_features.\n\n Parameters\n ----------\n X : array-like, shape (n_samples, n_features)\n Training data, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n y : array-like, shape (n_samples,) or (n_samples, n_outputs), default=None\n Target values (None for unsupervised transformations).\n\n Returns\n -------\n self : object\n Returns the instance itself.\n ", "source_code": "\ndef fit(self, X, y=None):\n \"\"\"Fit the model with X.\n\n Samples random projection according to n_features.\n\n Parameters\n ----------\n X : array-like, shape (n_samples, n_features)\n Training data, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n y : array-like, shape (n_samples,) or (n_samples, n_outputs), default=None\n Target values (None for unsupervised transformations).\n\n Returns\n -------\n self : object\n Returns the instance itself.\n \"\"\"\n X = self._validate_data(X)\n random_state = check_random_state(self.random_state)\n n_features = X.shape[1]\n uniform = random_state.uniform(size=(n_features, self.n_components))\n self.random_weights_ = 1.0 / np.pi * np.log(np.tan(np.pi / 2.0 * uniform))\n self.random_offset_ = random_state.uniform(0, 2 * np.pi, size=self.n_components)\n return self" }, { @@ -94743,7 +100926,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -94753,13 +100937,14 @@ "docstring": { "type": "array-like, shape (n_samples, n_features)", "description": "New data, where `n_samples` is the number of samples\nand `n_features` is the number of features. All values of X must be\nstrictly greater than \"-skewedness\"." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Apply the approximate feature map to X.", - "docstring": "Apply the approximate feature map to X.\n\nParameters\n----------\nX : array-like, shape (n_samples, n_features)\n New data, where `n_samples` is the number of samples\n and `n_features` is the number of features. All values of X must be\n strictly greater than \"-skewedness\".\n\nReturns\n-------\nX_new : array-like, shape (n_samples, n_components)\n Returns the instance itself.", + "docstring": "Apply the approximate feature map to X.\n\n Parameters\n ----------\n X : array-like, shape (n_samples, n_features)\n New data, where `n_samples` is the number of samples\n and `n_features` is the number of features. 
All values of X must be\n strictly greater than \"-skewedness\".\n\n Returns\n -------\n X_new : array-like, shape (n_samples, n_components)\n Returns the instance itself.\n ", "source_code": "\ndef transform(self, X):\n \"\"\"Apply the approximate feature map to X.\n\n Parameters\n ----------\n X : array-like, shape (n_samples, n_features)\n New data, where `n_samples` is the number of samples\n and `n_features` is the number of features. All values of X must be\n strictly greater than \"-skewedness\".\n\n Returns\n -------\n X_new : array-like, shape (n_samples, n_components)\n Returns the instance itself.\n \"\"\"\n check_is_fitted(self)\n X = self._validate_data(X, copy=True, dtype=[np.float64, np.float32], reset=False)\n if (X <= -self.skewedness).any():\n raise ValueError('X may not contain entries smaller than -skewedness.')\n X += self.skewedness\n np.log(X, X)\n projection = safe_sparse_dot(X, self.random_weights_)\n projection += self.random_offset_\n np.cos(projection, projection)\n projection *= np.sqrt(2.0) / np.sqrt(self.n_components)\n return projection" }, { @@ -94777,7 +100962,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "alpha", @@ -94787,7 +100973,8 @@ "docstring": { "type": "float or array-like of shape (n_targets,), default=1.0", "description": "Regularization strength; must be a positive float. Regularization\nimproves the conditioning of the problem and reduces the variance of\nthe estimates. Larger values specify stronger regularization.\nAlpha corresponds to ``1 / (2C)`` in other linear models such as\n:class:`~sklearn.linear_model.LogisticRegression` or\n:class:`~sklearn.svm.LinearSVC`. If an array is passed, penalties are\nassumed to be specific to the targets. Hence they must correspond in\nnumber. See :ref:`ridge_regression` for formula." - } + }, + "refined_type": {} }, { "name": "kernel", @@ -94796,8 +100983,9 @@ "assigned_by": "NAME_ONLY", "docstring": { "type": "str or callable, default=\"linear\"", - "description": "Kernel mapping used internally. This parameter is directly passed to\n:class:`~sklearn.metrics.pairwise.pairwise_kernel`.\nIf `kernel` is a string, it must be one of the metrics\nin `pairwise.PAIRWISE_KERNEL_FUNCTIONS`.\nIf `kernel` is \"precomputed\", X is assumed to be a kernel matrix.\nAlternatively, if `kernel` is a callable function, it is called on\neach pair of instances (rows) and the resulting value recorded. The\ncallable should take two rows from X as input and return the\ncorresponding kernel value as a single number. This means that\ncallables from :mod:`sklearn.metrics.pairwise` are not allowed, as\nthey operate on matrices, not single samples. Use the string\nidentifying the kernel instead." - } + "description": "Kernel mapping used internally. This parameter is directly passed to\n:class:`~sklearn.metrics.pairwise.pairwise_kernel`.\nIf `kernel` is a string, it must be one of the metrics\nin `pairwise.PAIRWISE_KERNEL_FUNCTIONS` or \"precomputed\".\nIf `kernel` is \"precomputed\", X is assumed to be a kernel matrix.\nAlternatively, if `kernel` is a callable function, it is called on\neach pair of instances (rows) and the resulting value recorded. The\ncallable should take two rows from X as input and return the\ncorresponding kernel value as a single number. This means that\ncallables from :mod:`sklearn.metrics.pairwise` are not allowed, as\nthey operate on matrices, not single samples. Use the string\nidentifying the kernel instead." 
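The SkewedChi2Sampler entries above follow the same Monte Carlo pattern, with the extra constraint that every value of X must be strictly greater than -skewedness (transform shifts by `skewedness` and takes a log before projecting). A minimal sketch, assuming non-negative, histogram-like input features:

    # Monte Carlo feature map for the skewed chi-squared kernel.
    import numpy as np
    from sklearn.kernel_approximation import SkewedChi2Sampler

    rng = np.random.RandomState(0)
    X = rng.rand(40, 8)  # entries must be > -skewedness, satisfied by values in [0, 1)

    chi2_feature = SkewedChi2Sampler(skewedness=1.0, n_components=500, random_state=0)
    X_features = chi2_feature.fit_transform(X)
    print(X_features.shape)  # (40, 500)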
+ }, + "refined_type": {} }, { "name": "gamma", @@ -94807,7 +100995,8 @@ "docstring": { "type": "float, default=None", "description": "Gamma parameter for the RBF, laplacian, polynomial, exponential chi2\nand sigmoid kernels. Interpretation of the default value is left to\nthe kernel; see the documentation for sklearn.metrics.pairwise.\nIgnored by other kernels." - } + }, + "refined_type": {} }, { "name": "degree", @@ -94817,7 +101006,8 @@ "docstring": { "type": "float, default=3", "description": "Degree of the polynomial kernel. Ignored by other kernels." - } + }, + "refined_type": {} }, { "name": "coef0", @@ -94827,7 +101017,8 @@ "docstring": { "type": "float, default=1", "description": "Zero coefficient for polynomial and sigmoid kernels.\nIgnored by other kernels." - } + }, + "refined_type": {} }, { "name": "kernel_params", @@ -94837,13 +101028,14 @@ "docstring": { "type": "mapping of str to any, default=None", "description": "Additional parameters (keyword arguments) for kernel function passed\nas callable object." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, alpha=1, *, kernel='linear', gamma=None, degree=3, coef0=1, kernel_params=None):\n self.alpha = alpha\n self.kernel = kernel\n self.gamma = gamma\n self.degree = degree\n self.coef0 = coef0\n self.kernel_params = kernel_params" }, { @@ -94861,7 +101053,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -94871,7 +101064,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "Y", @@ -94881,13 +101075,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _get_kernel(self, X, Y=None):\n if callable(self.kernel):\n params = self.kernel_params or {}\n else:\n params = {'gamma': self.gamma, 'degree': self.degree, 'coef0': self.coef0}\n return pairwise_kernels(X, Y, metric=self.kernel, filter_params=True, **params)" }, { @@ -94905,13 +101100,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _more_tags(self):\n return {'pairwise': self.kernel == 'precomputed'}" }, { @@ -94932,13 +101128,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@deprecated('Attribute `_pairwise` was deprecated in version 0.24 and will be removed in 1.1 (renaming of 0.26).')\n@property\ndef _pairwise(self):\n return self.kernel == 'precomputed'" }, { @@ -94956,7 +101153,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -94966,6 +101164,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "Training data. If kernel == \"precomputed\" this is instead\na precomputed kernel matrix, of shape (n_samples, n_samples)." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -94976,7 +101178,8 @@ "docstring": { "type": "array-like of shape (n_samples,) or (n_samples, n_targets)", "description": "Target values." 
- } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -94986,13 +101189,14 @@ "docstring": { "type": "float or array-like of shape (n_samples,), default=None", "description": "Individual weights for each sample, ignored if None is passed." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Fit Kernel Ridge regression model.", - "docstring": "Fit Kernel Ridge regression model.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training data. If kernel == \"precomputed\" this is instead\n a precomputed kernel matrix, of shape (n_samples, n_samples).\n\ny : array-like of shape (n_samples,) or (n_samples, n_targets)\n Target values.\n\nsample_weight : float or array-like of shape (n_samples,), default=None\n Individual weights for each sample, ignored if None is passed.\n\nReturns\n-------\nself : object\n Returns the instance itself.", + "docstring": "Fit Kernel Ridge regression model.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training data. If kernel == \"precomputed\" this is instead\n a precomputed kernel matrix, of shape (n_samples, n_samples).\n\n y : array-like of shape (n_samples,) or (n_samples, n_targets)\n Target values.\n\n sample_weight : float or array-like of shape (n_samples,), default=None\n Individual weights for each sample, ignored if None is passed.\n\n Returns\n -------\n self : object\n Returns the instance itself.\n ", "source_code": "\ndef fit(self, X, y, sample_weight=None):\n \"\"\"Fit Kernel Ridge regression model.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training data. If kernel == \"precomputed\" this is instead\n a precomputed kernel matrix, of shape (n_samples, n_samples).\n\n y : array-like of shape (n_samples,) or (n_samples, n_targets)\n Target values.\n\n sample_weight : float or array-like of shape (n_samples,), default=None\n Individual weights for each sample, ignored if None is passed.\n\n Returns\n -------\n self : object\n Returns the instance itself.\n \"\"\"\n (X, y) = self._validate_data(X, y, accept_sparse=('csr', 'csc'), multi_output=True, y_numeric=True)\n if sample_weight is not None and not isinstance(sample_weight, float):\n sample_weight = _check_sample_weight(sample_weight, X)\n K = self._get_kernel(X)\n alpha = np.atleast_1d(self.alpha)\n ravel = False\n if len(y.shape) == 1:\n y = y.reshape(-1, 1)\n ravel = True\n copy = self.kernel == 'precomputed'\n self.dual_coef_ = _solve_cholesky_kernel(K, y, alpha, sample_weight, copy)\n if ravel:\n self.dual_coef_ = self.dual_coef_.ravel()\n self.X_fit_ = X\n return self" }, { @@ -95010,7 +101214,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -95020,13 +101225,17 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "Samples. If kernel == \"precomputed\" this is instead a\nprecomputed kernel matrix, shape = [n_samples,\nn_samples_fitted], where n_samples_fitted is the number of\nsamples used in the fitting for this estimator." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } } ], "results": [], "is_public": true, "description": "Predict using the kernel ridge model.", - "docstring": "Predict using the kernel ridge model.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n Samples. 
If kernel == \"precomputed\" this is instead a\n precomputed kernel matrix, shape = [n_samples,\n n_samples_fitted], where n_samples_fitted is the number of\n samples used in the fitting for this estimator.\n\nReturns\n-------\nC : ndarray of shape (n_samples,) or (n_samples, n_targets)\n Returns predicted values.", + "docstring": "Predict using the kernel ridge model.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Samples. If kernel == \"precomputed\" this is instead a\n precomputed kernel matrix, shape = [n_samples,\n n_samples_fitted], where n_samples_fitted is the number of\n samples used in the fitting for this estimator.\n\n Returns\n -------\n C : ndarray of shape (n_samples,) or (n_samples, n_targets)\n Returns predicted values.\n ", "source_code": "\ndef predict(self, X):\n \"\"\"Predict using the kernel ridge model.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Samples. If kernel == \"precomputed\" this is instead a\n precomputed kernel matrix, shape = [n_samples,\n n_samples_fitted], where n_samples_fitted is the number of\n samples used in the fitting for this estimator.\n\n Returns\n -------\n C : ndarray of shape (n_samples,) or (n_samples, n_targets)\n Returns predicted values.\n \"\"\"\n check_is_fitted(self)\n X = self._validate_data(X, accept_sparse=('csr', 'csc'), reset=False)\n K = self._get_kernel(X, self.X_fit_)\n return np.dot(K, self.dual_coef_)" }, { @@ -95044,7 +101253,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -95054,13 +101264,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Probability estimation for OvR logistic regression.\n\nPositive class probabilities are computed as 1. / (1. + np.exp(-self.decision_function(X))); multiclass is handled by normalizing that over all classes.", - "docstring": "Probability estimation for OvR logistic regression.\n\nPositive class probabilities are computed as\n1. / (1. + np.exp(-self.decision_function(X)));\nmulticlass is handled by normalizing that over all classes.", + "description": "Probability estimation for OvR logistic regression.\n\nPositive class probabilities are computed as\n1. / (1. + np.exp(-self.decision_function(X)));\nmulticlass is handled by normalizing that over all classes.", + "docstring": "Probability estimation for OvR logistic regression.\n\n Positive class probabilities are computed as\n 1. / (1. + np.exp(-self.decision_function(X)));\n multiclass is handled by normalizing that over all classes.\n ", "source_code": "\ndef _predict_proba_lr(self, X):\n \"\"\"Probability estimation for OvR logistic regression.\n\n Positive class probabilities are computed as\n 1. / (1. + np.exp(-self.decision_function(X)));\n multiclass is handled by normalizing that over all classes.\n \"\"\"\n prob = self.decision_function(X)\n expit(prob, out=prob)\n if prob.ndim == 1:\n return np.vstack([1 - prob, prob]).T\n else:\n prob /= prob.sum(axis=1).reshape((prob.shape[0], -1))\n return prob" }, { @@ -95078,7 +101289,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -95086,16 +101298,20 @@ "is_public": false, "assigned_by": "POSITION_OR_NAME", "docstring": { - "type": "array-like or sparse matrix, shape (n_samples, n_features)", - "description": "Samples." 
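The KernelRidge entries above show fit solving a kernelized ridge system for `dual_coef_` (via `_solve_cholesky_kernel`) and predict computing the kernel between new samples and `X_fit_` before a dot product with those dual coefficients. A minimal sketch, assuming scikit-learn; the toy sine data is only for illustration:

    # Kernel ridge regression with an RBF kernel.
    import numpy as np
    from sklearn.kernel_ridge import KernelRidge

    rng = np.random.RandomState(0)
    X = rng.uniform(0, 5, size=(100, 1))
    y = np.sin(X).ravel() + 0.1 * rng.randn(100)

    # alpha is the ridge penalty; gamma is passed through to the RBF kernel.
    model = KernelRidge(alpha=1.0, kernel="rbf", gamma=0.5)
    model.fit(X, y)
    print(model.predict(X[:5]))

    # With kernel="precomputed", fit/predict expect kernel matrices instead of raw features.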
+ "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", + "description": "The data matrix for which we want to get the confidence scores." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } } ], "results": [], "is_public": false, - "description": "Predict confidence scores for samples.\n\nThe confidence score for a sample is proportional to the signed distance of that sample to the hyperplane.", - "docstring": "Predict confidence scores for samples.\n\nThe confidence score for a sample is proportional to the signed\ndistance of that sample to the hyperplane.\n\nParameters\n----------\nX : array-like or sparse matrix, shape (n_samples, n_features)\n Samples.\n\nReturns\n-------\narray, shape=(n_samples,) if n_classes == 2 else (n_samples, n_classes)\n Confidence scores per (sample, class) combination. In the binary\n case, confidence score for self.classes_[1] where >0 means this\n class would be predicted.", - "source_code": "\ndef decision_function(self, X):\n \"\"\"\n Predict confidence scores for samples.\n\n The confidence score for a sample is proportional to the signed\n distance of that sample to the hyperplane.\n\n Parameters\n ----------\n X : array-like or sparse matrix, shape (n_samples, n_features)\n Samples.\n\n Returns\n -------\n array, shape=(n_samples,) if n_classes == 2 else (n_samples, n_classes)\n Confidence scores per (sample, class) combination. In the binary\n case, confidence score for self.classes_[1] where >0 means this\n class would be predicted.\n \"\"\"\n check_is_fitted(self)\n X = self._validate_data(X, accept_sparse='csr', reset=False)\n scores = safe_sparse_dot(X, self.coef_.T, dense_output=True) + self.intercept_\n return scores.ravel() if scores.shape[1] == 1 else scores" + "description": "Predict confidence scores for samples.\n\nThe confidence score for a sample is proportional to the signed\ndistance of that sample to the hyperplane.", + "docstring": "\n Predict confidence scores for samples.\n\n The confidence score for a sample is proportional to the signed\n distance of that sample to the hyperplane.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The data matrix for which we want to get the confidence scores.\n\n Returns\n -------\n scores : ndarray of shape (n_samples,) or (n_samples, n_classes)\n Confidence scores per `(n_samples, n_classes)` combination. In the\n binary case, confidence score for `self.classes_[1]` where >0 means\n this class would be predicted.\n ", + "source_code": "\ndef decision_function(self, X):\n \"\"\"\n Predict confidence scores for samples.\n\n The confidence score for a sample is proportional to the signed\n distance of that sample to the hyperplane.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The data matrix for which we want to get the confidence scores.\n\n Returns\n -------\n scores : ndarray of shape (n_samples,) or (n_samples, n_classes)\n Confidence scores per `(n_samples, n_classes)` combination. 
In the\n binary case, confidence score for `self.classes_[1]` where >0 means\n this class would be predicted.\n \"\"\"\n check_is_fitted(self)\n X = self._validate_data(X, accept_sparse='csr', reset=False)\n scores = safe_sparse_dot(X, self.coef_.T, dense_output=True) + self.intercept_\n return scores.ravel() if scores.shape[1] == 1 else scores" }, { "name": "predict", @@ -95112,7 +101328,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -95120,16 +101337,20 @@ "is_public": false, "assigned_by": "POSITION_OR_NAME", "docstring": { - "type": "array-like or sparse matrix, shape (n_samples, n_features)", - "description": "Samples." + "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", + "description": "The data matrix for which we want to get the predictions." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } } ], "results": [], "is_public": false, "description": "Predict class labels for samples in X.", - "docstring": "Predict class labels for samples in X.\n\nParameters\n----------\nX : array-like or sparse matrix, shape (n_samples, n_features)\n Samples.\n\nReturns\n-------\nC : array, shape [n_samples]\n Predicted class label per sample.", - "source_code": "\ndef predict(self, X):\n \"\"\"\n Predict class labels for samples in X.\n\n Parameters\n ----------\n X : array-like or sparse matrix, shape (n_samples, n_features)\n Samples.\n\n Returns\n -------\n C : array, shape [n_samples]\n Predicted class label per sample.\n \"\"\"\n scores = self.decision_function(X)\n if len(scores.shape) == 1:\n indices = (scores > 0).astype(int)\n else:\n indices = scores.argmax(axis=1)\n return self.classes_[indices]" + "docstring": "\n Predict class labels for samples in X.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The data matrix for which we want to get the predictions.\n\n Returns\n -------\n y_pred : ndarray of shape (n_samples,)\n Vector containing the class labels for each sample.\n ", + "source_code": "\ndef predict(self, X):\n \"\"\"\n Predict class labels for samples in X.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The data matrix for which we want to get the predictions.\n\n Returns\n -------\n y_pred : ndarray of shape (n_samples,)\n Vector containing the class labels for each sample.\n \"\"\"\n scores = self.decision_function(X)\n if len(scores.shape) == 1:\n indices = (scores > 0).astype(int)\n else:\n indices = scores.argmax(axis=1)\n return self.classes_[indices]" }, { "name": "_decision_function", @@ -95146,7 +101367,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -95156,13 +101378,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _decision_function(self, X):\n check_is_fitted(self)\n X = self._validate_data(X, accept_sparse=['csr', 'csc', 'coo'], reset=False)\n return safe_sparse_dot(X, self.coef_.T, dense_output=True) + self.intercept_" }, { @@ -95180,13 +101403,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _more_tags(self):\n return {'requires_y': True}" }, { @@ -95204,7 +101428,8 @@ "docstring": { "type": "", "description": "" - } + }, + 
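The `_predict_proba_lr` and `decision_function` entries above describe the one-vs-rest probability recipe: pass each class's signed distance to its hyperplane through the logistic sigmoid, then renormalize across classes. A sketch reproducing that arithmetic outside the mixin, assuming any fitted linear classifier that exposes `decision_function` (LinearSVC here is only an example):

    # Recreate the OvR probability normalization from raw decision scores.
    import numpy as np
    from scipy.special import expit
    from sklearn.datasets import load_iris
    from sklearn.svm import LinearSVC

    X, y = load_iris(return_X_y=True)
    clf = LinearSVC(dual=False).fit(X, y)     # any linear classifier with decision_function

    scores = clf.decision_function(X[:5])     # shape (5, 3) in the multiclass case
    prob = expit(scores)                      # per-class sigmoid of the signed distances
    prob /= prob.sum(axis=1, keepdims=True)   # normalize over classes
    print(prob)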
"refined_type": {} }, { "name": "X_offset", @@ -95214,7 +101439,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y_offset", @@ -95224,7 +101450,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X_scale", @@ -95234,7 +101461,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -95258,7 +101486,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -95268,7 +101497,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -95278,7 +101508,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -95302,7 +101533,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -95312,13 +101544,14 @@ "docstring": { "type": "array-like or sparse matrix, shape (n_samples, n_features)", "description": "Samples." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Predict using the linear model.", - "docstring": "Predict using the linear model.\n\nParameters\n----------\nX : array-like or sparse matrix, shape (n_samples, n_features)\n Samples.\n\nReturns\n-------\nC : array, shape (n_samples,)\n Returns predicted values.", + "docstring": "\n Predict using the linear model.\n\n Parameters\n ----------\n X : array-like or sparse matrix, shape (n_samples, n_features)\n Samples.\n\n Returns\n -------\n C : array, shape (n_samples,)\n Returns predicted values.\n ", "source_code": "\ndef predict(self, X):\n \"\"\"\n Predict using the linear model.\n\n Parameters\n ----------\n X : array-like or sparse matrix, shape (n_samples, n_features)\n Samples.\n\n Returns\n -------\n C : array, shape (n_samples,)\n Returns predicted values.\n \"\"\"\n return self._decision_function(X)" }, { @@ -95336,7 +101569,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "fit_intercept", @@ -95346,7 +101580,8 @@ "docstring": { "type": "bool, default=True", "description": "Whether to calculate the intercept for this model. If set\nto False, no intercept will be used in calculations\n(i.e. data is expected to be centered)." - } + }, + "refined_type": {} }, { "name": "normalize", @@ -95356,7 +101591,8 @@ "docstring": { "type": "bool, default=False", "description": "This parameter is ignored when ``fit_intercept`` is set to False.\nIf True, the regressors X will be normalized before regression by\nsubtracting the mean and dividing by the l2-norm.\nIf you wish to standardize, please use\n:class:`~sklearn.preprocessing.StandardScaler` before calling ``fit``\non an estimator with ``normalize=False``.\n\n.. deprecated:: 1.0\n `normalize` was deprecated in version 1.0 and will be\n removed in 1.2." - } + }, + "refined_type": {} }, { "name": "copy_X", @@ -95366,7 +101602,8 @@ "docstring": { "type": "bool, default=True", "description": "If True, X will be copied; else, it may be overwritten." - } + }, + "refined_type": {} }, { "name": "n_jobs", @@ -95376,7 +101613,8 @@ "docstring": { "type": "int, default=None", "description": "The number of jobs to use for the computation. This will only provide\nspeedup in case of sufficiently large problems, that is if firstly\n`n_targets > 1` and secondly `X` is sparse or if `positive` is set\nto `True`. ``None`` means 1 unless in a\n:obj:`joblib.parallel_backend` context. ``-1`` means using all\nprocessors. 
See :term:`Glossary ` for more details." - } + }, + "refined_type": {} }, { "name": "positive", @@ -95386,13 +101624,14 @@ "docstring": { "type": "bool, default=False", "description": "When set to ``True``, forces the coefficients to be positive. This\noption is only supported for dense arrays.\n\n.. versionadded:: 0.24" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, *, fit_intercept=True, normalize='deprecated', copy_X=True, n_jobs=None, positive=False):\n self.fit_intercept = fit_intercept\n self.normalize = normalize\n self.copy_X = copy_X\n self.n_jobs = n_jobs\n self.positive = positive" }, { @@ -95410,7 +101649,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -95420,6 +101660,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "Training data." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -95430,7 +101674,8 @@ "docstring": { "type": "array-like of shape (n_samples,) or (n_samples, n_targets)", "description": "Target values. Will be cast to X's dtype if necessary." - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -95440,13 +101685,14 @@ "docstring": { "type": "array-like of shape (n_samples,), default=None", "description": "Individual weights for each sample.\n\n.. versionadded:: 0.17\n parameter *sample_weight* support to LinearRegression." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Fit linear model.", - "docstring": "Fit linear model.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training data.\n\ny : array-like of shape (n_samples,) or (n_samples, n_targets)\n Target values. Will be cast to X's dtype if necessary.\n\nsample_weight : array-like of shape (n_samples,), default=None\n Individual weights for each sample.\n\n .. versionadded:: 0.17\n parameter *sample_weight* support to LinearRegression.\n\nReturns\n-------\nself : object\n Fitted Estimator.", + "docstring": "\n Fit linear model.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training data.\n\n y : array-like of shape (n_samples,) or (n_samples, n_targets)\n Target values. Will be cast to X's dtype if necessary.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Individual weights for each sample.\n\n .. versionadded:: 0.17\n parameter *sample_weight* support to LinearRegression.\n\n Returns\n -------\n self : object\n Fitted Estimator.\n ", "source_code": "\ndef fit(self, X, y, sample_weight=None):\n \"\"\"\n Fit linear model.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training data.\n\n y : array-like of shape (n_samples,) or (n_samples, n_targets)\n Target values. Will be cast to X's dtype if necessary.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Individual weights for each sample.\n\n .. 
versionadded:: 0.17\n parameter *sample_weight* support to LinearRegression.\n\n Returns\n -------\n self : object\n Fitted Estimator.\n \"\"\"\n _normalize = _deprecate_normalize(self.normalize, default=False, estimator_name=self.__class__.__name__)\n n_jobs_ = self.n_jobs\n accept_sparse = False if self.positive else ['csr', 'csc', 'coo']\n (X, y) = self._validate_data(X, y, accept_sparse=accept_sparse, y_numeric=True, multi_output=True)\n if sample_weight is not None:\n sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype)\n (X, y, X_offset, y_offset, X_scale) = self._preprocess_data(X, y, fit_intercept=self.fit_intercept, normalize=_normalize, copy=self.copy_X, sample_weight=sample_weight, return_mean=True)\n if sample_weight is not None:\n (X, y) = _rescale_data(X, y, sample_weight)\n if self.positive:\n if y.ndim < 2:\n (self.coef_, self._residues) = optimize.nnls(X, y)\n else:\n outs = Parallel(n_jobs=n_jobs_)((delayed(optimize.nnls)(X, y[:, j]) for j in range(y.shape[1])))\n (self.coef_, self._residues) = map(np.vstack, zip(*outs))\n elif sp.issparse(X):\n X_offset_scale = X_offset / X_scale\n \n def matvec(b):\n return X.dot(b) - b.dot(X_offset_scale)\n \n def rmatvec(b):\n return X.T.dot(b) - X_offset_scale * np.sum(b)\n X_centered = sparse.linalg.LinearOperator(shape=X.shape, matvec=matvec, rmatvec=rmatvec)\n if y.ndim < 2:\n out = sparse_lsqr(X_centered, y)\n self.coef_ = out[0]\n self._residues = out[3]\n else:\n outs = Parallel(n_jobs=n_jobs_)((delayed(sparse_lsqr)(X_centered, y[:, j].ravel()) for j in range(y.shape[1])))\n self.coef_ = np.vstack([out[0] for out in outs])\n self._residues = np.vstack([out[3] for out in outs])\n else:\n (self.coef_, self._residues, self.rank_, self.singular_) = linalg.lstsq(X, y)\n self.coef_ = self.coef_.T\n if y.ndim == 1:\n self.coef_ = np.ravel(self.coef_)\n self._set_intercept(X_offset, y_offset, X_scale)\n return self" }, { @@ -95464,13 +101710,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Convert coefficient matrix to dense array format.\n\nConverts the ``coef_`` member (back) to a numpy.ndarray. This is the default format of ``coef_`` and is required for fitting, so calling this method is only required on models that have previously been sparsified; otherwise, it is a no-op.", - "docstring": "Convert coefficient matrix to dense array format.\n\nConverts the ``coef_`` member (back) to a numpy.ndarray. This is the\ndefault format of ``coef_`` and is required for fitting, so calling\nthis method is only required on models that have previously been\nsparsified; otherwise, it is a no-op.\n\nReturns\n-------\nself\n Fitted estimator.", + "description": "Convert coefficient matrix to dense array format.\n\nConverts the ``coef_`` member (back) to a numpy.ndarray. This is the\ndefault format of ``coef_`` and is required for fitting, so calling\nthis method is only required on models that have previously been\nsparsified; otherwise, it is a no-op.", + "docstring": "\n Convert coefficient matrix to dense array format.\n\n Converts the ``coef_`` member (back) to a numpy.ndarray. 
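The LinearRegression entries above cover the plain least-squares fit plus the `positive=True` branch (solved per target with scipy's NNLS) and optional `sample_weight` rescaling. A minimal sketch, assuming current scikit-learn; the coefficients used to build the toy target are arbitrary:

    # Ordinary least squares, with the optional non-negativity constraint.
    import numpy as np
    from sklearn.linear_model import LinearRegression

    rng = np.random.RandomState(0)
    X = rng.rand(100, 3)
    y = X @ np.array([1.5, 0.0, 2.0]) + 0.05 * rng.randn(100)

    reg = LinearRegression(positive=True).fit(X, y)   # positive=True routes through NNLS
    print(reg.coef_, reg.intercept_)

    # sample_weight re-weights rows before the least-squares solve.
    w = rng.rand(100)
    reg.fit(X, y, sample_weight=w)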
This is the\n default format of ``coef_`` and is required for fitting, so calling\n this method is only required on models that have previously been\n sparsified; otherwise, it is a no-op.\n\n Returns\n -------\n self\n Fitted estimator.\n ", "source_code": "\ndef densify(self):\n \"\"\"\n Convert coefficient matrix to dense array format.\n\n Converts the ``coef_`` member (back) to a numpy.ndarray. This is the\n default format of ``coef_`` and is required for fitting, so calling\n this method is only required on models that have previously been\n sparsified; otherwise, it is a no-op.\n\n Returns\n -------\n self\n Fitted estimator.\n \"\"\"\n msg = 'Estimator, %(name)s, must be fitted before densifying.'\n check_is_fitted(self, msg=msg)\n if sp.issparse(self.coef_):\n self.coef_ = self.coef_.toarray()\n return self" }, { @@ -95488,13 +101735,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Convert coefficient matrix to sparse format.\n\nConverts the ``coef_`` member to a scipy.sparse matrix, which for L1-regularized models can be much more memory- and storage-efficient than the usual numpy.ndarray representation. The ``intercept_`` member is not converted.", - "docstring": "Convert coefficient matrix to sparse format.\n\nConverts the ``coef_`` member to a scipy.sparse matrix, which for\nL1-regularized models can be much more memory- and storage-efficient\nthan the usual numpy.ndarray representation.\n\nThe ``intercept_`` member is not converted.\n\nReturns\n-------\nself\n Fitted estimator.\n\nNotes\n-----\nFor non-sparse models, i.e. when there are not many zeros in ``coef_``,\nthis may actually *increase* memory usage, so use this method with\ncare. A rule of thumb is that the number of zero elements, which can\nbe computed with ``(coef_ == 0).sum()``, must be more than 50% for this\nto provide significant benefits.\n\nAfter calling this method, further fitting with the partial_fit\nmethod (if any) will not work until you call densify.", + "description": "Convert coefficient matrix to sparse format.\n\nConverts the ``coef_`` member to a scipy.sparse matrix, which for\nL1-regularized models can be much more memory- and storage-efficient\nthan the usual numpy.ndarray representation.\n\nThe ``intercept_`` member is not converted.", + "docstring": "\n Convert coefficient matrix to sparse format.\n\n Converts the ``coef_`` member to a scipy.sparse matrix, which for\n L1-regularized models can be much more memory- and storage-efficient\n than the usual numpy.ndarray representation.\n\n The ``intercept_`` member is not converted.\n\n Returns\n -------\n self\n Fitted estimator.\n\n Notes\n -----\n For non-sparse models, i.e. when there are not many zeros in ``coef_``,\n this may actually *increase* memory usage, so use this method with\n care. 
A rule of thumb is that the number of zero elements, which can\n be computed with ``(coef_ == 0).sum()``, must be more than 50% for this\n to provide significant benefits.\n\n After calling this method, further fitting with the partial_fit\n method (if any) will not work until you call densify.\n ", "source_code": "\ndef sparsify(self):\n \"\"\"\n Convert coefficient matrix to sparse format.\n\n Converts the ``coef_`` member to a scipy.sparse matrix, which for\n L1-regularized models can be much more memory- and storage-efficient\n than the usual numpy.ndarray representation.\n\n The ``intercept_`` member is not converted.\n\n Returns\n -------\n self\n Fitted estimator.\n\n Notes\n -----\n For non-sparse models, i.e. when there are not many zeros in ``coef_``,\n this may actually *increase* memory usage, so use this method with\n care. A rule of thumb is that the number of zero elements, which can\n be computed with ``(coef_ == 0).sum()``, must be more than 50% for this\n to provide significant benefits.\n\n After calling this method, further fitting with the partial_fit\n method (if any) will not work until you call densify.\n \"\"\"\n msg = 'Estimator, %(name)s, must be fitted before sparsifying.'\n check_is_fitted(self, msg=msg)\n self.coef_ = sp.csr_matrix(self.coef_)\n return self" }, { @@ -95512,7 +101760,8 @@ "docstring": { "type": "ndarray of shape (n_samples, n_features)", "description": "Data array." - } + }, + "refined_type": {} }, { "name": "precompute", @@ -95522,7 +101771,8 @@ "docstring": { "type": "array-like of shape (n_features, n_features)", "description": "User-supplied gram matrix." - } + }, + "refined_type": {} }, { "name": "X_offset", @@ -95532,7 +101782,8 @@ "docstring": { "type": "ndarray of shape (n_features,)", "description": "Array of feature means used to center design matrix." - } + }, + "refined_type": {} }, { "name": "X_scale", @@ -95542,7 +101793,8 @@ "docstring": { "type": "ndarray of shape (n_features,)", "description": "Array of feature scale factors used to normalize design matrix." - } + }, + "refined_type": {} }, { "name": "rtol", @@ -95552,7 +101804,8 @@ "docstring": { "type": "float, default=1e-7", "description": "Relative tolerance; see numpy.allclose." - } + }, + "refined_type": {} }, { "name": "atol", @@ -95562,13 +101815,14 @@ "docstring": { "type": "float, default=1e-5", "description": "absolute tolerance; see :func`numpy.allclose`. Note that the default\nhere is more tolerant than the default for\n:func:`numpy.testing.assert_allclose`, where `atol=0`." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Computes a single element of the gram matrix and compares it to the corresponding element of the user supplied gram matrix.\n\nIf the values do not match a ValueError will be thrown.", - "docstring": "Computes a single element of the gram matrix and compares it to\nthe corresponding element of the user supplied gram matrix.\n\nIf the values do not match a ValueError will be thrown.\n\nParameters\n----------\nX : ndarray of shape (n_samples, n_features)\n Data array.\n\nprecompute : array-like of shape (n_features, n_features)\n User-supplied gram matrix.\n\nX_offset : ndarray of shape (n_features,)\n Array of feature means used to center design matrix.\n\nX_scale : ndarray of shape (n_features,)\n Array of feature scale factors used to normalize design matrix.\n\nrtol : float, default=1e-7\n Relative tolerance; see numpy.allclose.\n\natol : float, default=1e-5\n absolute tolerance; see :func`numpy.allclose`. 
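The `densify`/`sparsify` entries above convert `coef_` between ndarray and CSR forms; the docstring's rule of thumb is that more than half of the coefficients should be exactly zero before sparsifying helps. A sketch, assuming an L1-penalized LogisticRegression as a convenient estimator that inherits these methods:

    # Sparsify coefficients after strong L1 regularization, then restore them.
    import scipy.sparse as sp
    from sklearn.datasets import make_classification
    from sklearn.linear_model import LogisticRegression

    X, y = make_classification(n_samples=200, n_features=50, n_informative=5, random_state=0)
    clf = LogisticRegression(penalty="l1", C=0.1, solver="liblinear").fit(X, y)

    print((clf.coef_ == 0).mean())   # fraction of zero weights; >0.5 is the rule of thumb above
    clf.sparsify()
    print(sp.issparse(clf.coef_))    # True: coef_ is now a CSR matrix
    clf.densify()
    print(sp.issparse(clf.coef_))    # False: back to a dense ndarray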
Note that the default\n here is more tolerant than the default for\n :func:`numpy.testing.assert_allclose`, where `atol=0`.\n\nRaises\n------\nValueError\n Raised when the provided Gram matrix is not consistent.", + "description": "Computes a single element of the gram matrix and compares it to\nthe corresponding element of the user supplied gram matrix.\n\nIf the values do not match a ValueError will be thrown.", + "docstring": "Computes a single element of the gram matrix and compares it to\n the corresponding element of the user supplied gram matrix.\n\n If the values do not match a ValueError will be thrown.\n\n Parameters\n ----------\n X : ndarray of shape (n_samples, n_features)\n Data array.\n\n precompute : array-like of shape (n_features, n_features)\n User-supplied gram matrix.\n\n X_offset : ndarray of shape (n_features,)\n Array of feature means used to center design matrix.\n\n X_scale : ndarray of shape (n_features,)\n Array of feature scale factors used to normalize design matrix.\n\n rtol : float, default=1e-7\n Relative tolerance; see numpy.allclose.\n\n atol : float, default=1e-5\n absolute tolerance; see :func`numpy.allclose`. Note that the default\n here is more tolerant than the default for\n :func:`numpy.testing.assert_allclose`, where `atol=0`.\n\n Raises\n ------\n ValueError\n Raised when the provided Gram matrix is not consistent.\n ", "source_code": "\ndef _check_precomputed_gram_matrix(X, precompute, X_offset, X_scale, rtol=1e-07, atol=1e-05):\n \"\"\"Computes a single element of the gram matrix and compares it to\n the corresponding element of the user supplied gram matrix.\n\n If the values do not match a ValueError will be thrown.\n\n Parameters\n ----------\n X : ndarray of shape (n_samples, n_features)\n Data array.\n\n precompute : array-like of shape (n_features, n_features)\n User-supplied gram matrix.\n\n X_offset : ndarray of shape (n_features,)\n Array of feature means used to center design matrix.\n\n X_scale : ndarray of shape (n_features,)\n Array of feature scale factors used to normalize design matrix.\n\n rtol : float, default=1e-7\n Relative tolerance; see numpy.allclose.\n\n atol : float, default=1e-5\n absolute tolerance; see :func`numpy.allclose`. Note that the default\n here is more tolerant than the default for\n :func:`numpy.testing.assert_allclose`, where `atol=0`.\n\n Raises\n ------\n ValueError\n Raised when the provided Gram matrix is not consistent.\n \"\"\"\n n_features = X.shape[1]\n f1 = n_features // 2\n f2 = min(f1 + 1, n_features - 1)\n v1 = (X[:, f1] - X_offset[f1]) * X_scale[f1]\n v2 = (X[:, f2] - X_offset[f2]) * X_scale[f2]\n expected = np.dot(v1, v2)\n actual = precompute[f1, f2]\n if not np.isclose(expected, actual, rtol=rtol, atol=atol):\n raise ValueError(f\"Gram matrix passed in via 'precompute' parameter did not pass validation when a single element was checked - please check that it was computed properly. 
For element ({f1},{f2}) we computed {expected} but the user-supplied value was {actual}.\")" }, { @@ -95586,7 +101840,8 @@ "docstring": { "type": "bool,", "description": "normalize value passed by the user" - } + }, + "refined_type": {} }, { "name": "default", @@ -95596,7 +101851,8 @@ "docstring": { "type": "bool,", "description": "default normalize value used by the estimator" - } + }, + "refined_type": {} }, { "name": "estimator_name", @@ -95606,13 +101862,14 @@ "docstring": { "type": "str", "description": "name of the linear estimator which calls this function.\nThe name will be used for writing the deprecation warnings" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Normalize is to be deprecated from linear models and a use of a pipeline with a StandardScaler is to be recommended instead. Here the appropriate message is selected to be displayed to the user depending on the default normalize value (as it varies between the linear models and normalize value selected by the user).", - "docstring": "Normalize is to be deprecated from linear models and a use of\na pipeline with a StandardScaler is to be recommended instead.\nHere the appropriate message is selected to be displayed to the user\ndepending on the default normalize value (as it varies between the linear\nmodels and normalize value selected by the user).\n\nParameters\n----------\nnormalize : bool,\n normalize value passed by the user\n\ndefault : bool,\n default normalize value used by the estimator\n\nestimator_name : str\n name of the linear estimator which calls this function.\n The name will be used for writing the deprecation warnings\n\nReturns\n-------\nnormalize : bool,\n normalize value which should further be used by the estimator at this\n stage of the depreciation process\n\nNotes\n-----\nThis function should be updated in 1.2 depending on the value of\n`normalize`:\n- True, warning: `normalize` was deprecated in 1.2 and will be removed in\n 1.4. 
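As an illustrative aside to the `_check_precomputed_gram_matrix` entry above (not part of the generated data file): a minimal sketch of how a user-supplied Gram matrix can be kept consistent with this spot check, assuming scikit-learn's public `Lasso` estimator and a small synthetic dataset; all names are local to the sketch.

    import numpy as np
    from sklearn.linear_model import Lasso

    rng = np.random.RandomState(0)
    X = rng.rand(50, 5)
    y = rng.rand(50)

    # Center the data ourselves and turn off the estimator's own centering, so the
    # check compares precompute[f1, f2] against np.dot(X_c[:, f1], X_c[:, f2]).
    X_c = X - X.mean(axis=0)
    y_c = y - y.mean()
    gram = np.dot(X_c.T, X_c)

    model = Lasso(alpha=0.1, fit_intercept=False, precompute=gram).fit(X_c, y_c)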
Suggest to use pipeline instead.\n- False, `normalize` was deprecated in 1.2 and it will be removed in 1.4.\n Leave normalize to its default value.\n- `deprecated` - this should only be possible with default == False as from\n 1.2 `normalize` in all the linear models should be either removed or the\n default should be set to False.\nThis function should be completely removed in 1.4.", + "description": "Normalize is to be deprecated from linear models and a use of\na pipeline with a StandardScaler is to be recommended instead.\nHere the appropriate message is selected to be displayed to the user\ndepending on the default normalize value (as it varies between the linear\nmodels and normalize value selected by the user).", + "docstring": "Normalize is to be deprecated from linear models and a use of\n a pipeline with a StandardScaler is to be recommended instead.\n Here the appropriate message is selected to be displayed to the user\n depending on the default normalize value (as it varies between the linear\n models and normalize value selected by the user).\n\n Parameters\n ----------\n normalize : bool,\n normalize value passed by the user\n\n default : bool,\n default normalize value used by the estimator\n\n estimator_name : str\n name of the linear estimator which calls this function.\n The name will be used for writing the deprecation warnings\n\n Returns\n -------\n normalize : bool,\n normalize value which should further be used by the estimator at this\n stage of the depreciation process\n\n Notes\n -----\n This function should be updated in 1.2 depending on the value of\n `normalize`:\n - True, warning: `normalize` was deprecated in 1.2 and will be removed in\n 1.4. Suggest to use pipeline instead.\n - False, `normalize` was deprecated in 1.2 and it will be removed in 1.4.\n Leave normalize to its default value.\n - `deprecated` - this should only be possible with default == False as from\n 1.2 `normalize` in all the linear models should be either removed or the\n default should be set to False.\n This function should be completely removed in 1.4.\n ", "source_code": "\ndef _deprecate_normalize(normalize, default, estimator_name):\n \"\"\"Normalize is to be deprecated from linear models and a use of\n a pipeline with a StandardScaler is to be recommended instead.\n Here the appropriate message is selected to be displayed to the user\n depending on the default normalize value (as it varies between the linear\n models and normalize value selected by the user).\n\n Parameters\n ----------\n normalize : bool,\n normalize value passed by the user\n\n default : bool,\n default normalize value used by the estimator\n\n estimator_name : str\n name of the linear estimator which calls this function.\n The name will be used for writing the deprecation warnings\n\n Returns\n -------\n normalize : bool,\n normalize value which should further be used by the estimator at this\n stage of the depreciation process\n\n Notes\n -----\n This function should be updated in 1.2 depending on the value of\n `normalize`:\n - True, warning: `normalize` was deprecated in 1.2 and will be removed in\n 1.4. 
Suggest to use pipeline instead.\n - False, `normalize` was deprecated in 1.2 and it will be removed in 1.4.\n Leave normalize to its default value.\n - `deprecated` - this should only be possible with default == False as from\n 1.2 `normalize` in all the linear models should be either removed or the\n default should be set to False.\n This function should be completely removed in 1.4.\n \"\"\"\n if normalize not in [True, False, 'deprecated']:\n raise ValueError(\"Leave 'normalize' to its default value or set it to True or False\")\n if normalize == 'deprecated':\n _normalize = default\n else:\n _normalize = normalize\n pipeline_msg = f\"If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:\\n\\nfrom sklearn.pipeline import make_pipeline\\n\\nmodel = make_pipeline(StandardScaler(with_mean=False), {estimator_name}())\\n\\nIf you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:\\n\\nkwargs = {{s[0] + '__sample_weight': sample_weight for s in model.steps}}\\nmodel.fit(X, y, **kwargs)\\n\\n\"\n if estimator_name == 'Ridge' or estimator_name == 'RidgeClassifier':\n alpha_msg = 'Set parameter alpha to: original_alpha * n_samples. '\n elif 'Lasso' in estimator_name:\n alpha_msg = 'Set parameter alpha to: original_alpha * np.sqrt(n_samples). '\n elif 'ElasticNet' in estimator_name:\n alpha_msg = 'Set parameter alpha to original_alpha * np.sqrt(n_samples) if l1_ratio is 1, and to original_alpha * n_samples if l1_ratio is 0. For other values of l1_ratio, no analytic formula is available.'\n elif estimator_name == 'RidgeCV' or estimator_name == 'RidgeClassifierCV':\n alpha_msg = 'Set parameter alphas to: original_alphas * n_samples. '\n else:\n alpha_msg = ''\n if default and normalize == 'deprecated':\n warnings.warn(\"The default of 'normalize' will be set to False in version 1.2 and deprecated in version 1.4.\\n\" + pipeline_msg + alpha_msg, FutureWarning)\n elif normalize != 'deprecated' and normalize and not default:\n warnings.warn(\"'normalize' was deprecated in version 1.0 and will be removed in 1.2.\\n\" + pipeline_msg + alpha_msg, FutureWarning)\n elif not normalize and not default:\n warnings.warn(\"'normalize' was deprecated in version 1.0 and will be removed in 1.2. Please leave the normalize parameter to its default value to silence this warning. The default behavior of this estimator is to not do any normalization. 
If normalization is needed please use sklearn.preprocessing.StandardScaler instead.\", FutureWarning)\n return _normalize" }, { @@ -95630,7 +101887,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -95640,7 +101898,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "Xy", @@ -95650,7 +101909,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "precompute", @@ -95660,7 +101920,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "normalize", @@ -95670,7 +101931,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "fit_intercept", @@ -95680,7 +101942,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "copy", @@ -95690,7 +101953,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "check_input", @@ -95700,7 +101964,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -95710,13 +101975,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Aux function used at beginning of fit in linear models", - "docstring": "Aux function used at beginning of fit in linear models\n\nParameters\n----------\norder : 'F', 'C' or None, default=None\n Whether X and y will be forced to be fortran or c-style. Only relevant\n if sample_weight is not None.", + "docstring": "Aux function used at beginning of fit in linear models\n\n Parameters\n ----------\n order : 'F', 'C' or None, default=None\n Whether X and y will be forced to be fortran or c-style. Only relevant\n if sample_weight is not None.\n ", "source_code": "\ndef _pre_fit(X, y, Xy, precompute, normalize, fit_intercept, copy, check_input=True, sample_weight=None):\n \"\"\"Aux function used at beginning of fit in linear models\n\n Parameters\n ----------\n order : 'F', 'C' or None, default=None\n Whether X and y will be forced to be fortran or c-style. 
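As an illustrative aside to the `_deprecate_normalize` entry above (not part of the generated data file): a minimal sketch of the pipeline-based replacement that the deprecation message itself recommends, assuming scikit-learn's public `Ridge`, `StandardScaler`, and `make_pipeline`; the synthetic data and names are local to the sketch.

    import numpy as np
    from sklearn.pipeline import make_pipeline
    from sklearn.preprocessing import StandardScaler
    from sklearn.linear_model import Ridge

    rng = np.random.RandomState(0)
    X, y = rng.rand(100, 3), rng.rand(100)
    sample_weight = rng.rand(100)

    # Replacement for Ridge(..., normalize=True), following the warning text above.
    # For Ridge the message also suggests rescaling alpha by n_samples.
    model = make_pipeline(StandardScaler(with_mean=False), Ridge())

    # Route sample_weight to every step, as the warning text shows.
    kwargs = {name + "__sample_weight": sample_weight for name, _ in model.steps}
    model.fit(X, y, **kwargs)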
Only relevant\n if sample_weight is not None.\n \"\"\"\n (n_samples, n_features) = X.shape\n if sparse.isspmatrix(X):\n precompute = False\n (X, y, X_offset, y_offset, X_scale) = _preprocess_data(X, y, fit_intercept=fit_intercept, normalize=normalize, copy=False, return_mean=True, check_input=check_input)\n else:\n (X, y, X_offset, y_offset, X_scale) = _preprocess_data(X, y, fit_intercept=fit_intercept, normalize=normalize, copy=copy, check_input=check_input, sample_weight=sample_weight)\n if sample_weight is not None:\n (X, y) = _rescale_data(X, y, sample_weight=sample_weight)\n if hasattr(precompute, '__array__'):\n if fit_intercept and not np.allclose(X_offset, np.zeros(n_features)) or normalize and not np.allclose(X_scale, np.ones(n_features)):\n warnings.warn('Gram matrix was provided but X was centered to fit intercept, or X was normalized : recomputing Gram matrix.', UserWarning)\n precompute = 'auto'\n Xy = None\n elif check_input:\n _check_precomputed_gram_matrix(X, precompute, X_offset, X_scale)\n if isinstance(precompute, str) and precompute == 'auto':\n precompute = n_samples > n_features\n if precompute is True:\n precompute = np.empty(shape=(n_features, n_features), dtype=X.dtype, order='C')\n np.dot(X.T, X, out=precompute)\n if not hasattr(precompute, '__array__'):\n Xy = None\n if hasattr(precompute, '__array__') and Xy is None:\n common_dtype = np.find_common_type([X.dtype, y.dtype], [])\n if y.ndim == 1:\n Xy = np.empty(shape=n_features, dtype=common_dtype, order='C')\n np.dot(X.T, y, out=Xy)\n else:\n n_targets = y.shape[1]\n Xy = np.empty(shape=(n_features, n_targets), dtype=common_dtype, order='F')\n np.dot(y.T, X, out=Xy.T)\n return X, y, X_offset, y_offset, X_scale, precompute, Xy" }, { @@ -95734,7 +102000,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -95744,7 +102011,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "fit_intercept", @@ -95754,7 +102022,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "normalize", @@ -95764,7 +102033,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "copy", @@ -95774,7 +102044,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -95784,7 +102055,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "return_mean", @@ -95794,7 +102066,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "check_input", @@ -95804,13 +102077,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Center and scale data.\n\nCenters data to have mean zero along axis 0. If fit_intercept=False or if the X is a sparse matrix, no centering is done, but normalization can still be applied. The function returns the statistics necessary to reconstruct the input data, which are X_offset, y_offset, X_scale, such that the output X = (X - X_offset) / X_scale X_scale is the L2 norm of X - X_offset. If sample_weight is not None, then the weighted mean of X and y is zero, and not the mean itself. If return_mean=True, the mean, eventually weighted, is returned, independently of whether X was centered (option used for optimization with sparse data in coordinate_descend). This is here because nearly all linear models will want their data to be centered. 
This function also systematically makes y consistent with X.dtype", - "docstring": "Center and scale data.\n\nCenters data to have mean zero along axis 0. If fit_intercept=False or if\nthe X is a sparse matrix, no centering is done, but normalization can still\nbe applied. The function returns the statistics necessary to reconstruct\nthe input data, which are X_offset, y_offset, X_scale, such that the output\n\n X = (X - X_offset) / X_scale\n\nX_scale is the L2 norm of X - X_offset. If sample_weight is not None,\nthen the weighted mean of X and y is zero, and not the mean itself. If\nreturn_mean=True, the mean, eventually weighted, is returned, independently\nof whether X was centered (option used for optimization with sparse data in\ncoordinate_descend).\n\nThis is here because nearly all linear models will want their data to be\ncentered. This function also systematically makes y consistent with X.dtype", + "description": "Center and scale data.\n\nCenters data to have mean zero along axis 0. If fit_intercept=False or if\nthe X is a sparse matrix, no centering is done, but normalization can still\nbe applied. The function returns the statistics necessary to reconstruct\nthe input data, which are X_offset, y_offset, X_scale, such that the output\n\n X = (X - X_offset) / X_scale\n\nX_scale is the L2 norm of X - X_offset. If sample_weight is not None,\nthen the weighted mean of X and y is zero, and not the mean itself. If\nreturn_mean=True, the mean, eventually weighted, is returned, independently\nof whether X was centered (option used for optimization with sparse data in\ncoordinate_descend).\n\nThis is here because nearly all linear models will want their data to be\ncentered. This function also systematically makes y consistent with X.dtype", + "docstring": "Center and scale data.\n\n Centers data to have mean zero along axis 0. If fit_intercept=False or if\n the X is a sparse matrix, no centering is done, but normalization can still\n be applied. The function returns the statistics necessary to reconstruct\n the input data, which are X_offset, y_offset, X_scale, such that the output\n\n X = (X - X_offset) / X_scale\n\n X_scale is the L2 norm of X - X_offset. If sample_weight is not None,\n then the weighted mean of X and y is zero, and not the mean itself. If\n return_mean=True, the mean, eventually weighted, is returned, independently\n of whether X was centered (option used for optimization with sparse data in\n coordinate_descend).\n\n This is here because nearly all linear models will want their data to be\n centered. This function also systematically makes y consistent with X.dtype\n ", "source_code": "\ndef _preprocess_data(X, y, fit_intercept, normalize=False, copy=True, sample_weight=None, return_mean=False, check_input=True):\n \"\"\"Center and scale data.\n\n Centers data to have mean zero along axis 0. If fit_intercept=False or if\n the X is a sparse matrix, no centering is done, but normalization can still\n be applied. The function returns the statistics necessary to reconstruct\n the input data, which are X_offset, y_offset, X_scale, such that the output\n\n X = (X - X_offset) / X_scale\n\n X_scale is the L2 norm of X - X_offset. If sample_weight is not None,\n then the weighted mean of X and y is zero, and not the mean itself. 
If\n return_mean=True, the mean, eventually weighted, is returned, independently\n of whether X was centered (option used for optimization with sparse data in\n coordinate_descend).\n\n This is here because nearly all linear models will want their data to be\n centered. This function also systematically makes y consistent with X.dtype\n \"\"\"\n if isinstance(sample_weight, numbers.Number):\n sample_weight = None\n if sample_weight is not None:\n sample_weight = np.asarray(sample_weight)\n if check_input:\n X = check_array(X, copy=copy, accept_sparse=['csr', 'csc'], dtype=FLOAT_DTYPES)\n elif copy:\n if sp.issparse(X):\n X = X.copy()\n else:\n X = X.copy(order='K')\n y = np.asarray(y, dtype=X.dtype)\n if fit_intercept:\n if sp.issparse(X):\n (X_offset, X_var) = mean_variance_axis(X, axis=0, weights=sample_weight)\n if not return_mean:\n X_offset[:] = X.dtype.type(0)\n else:\n if normalize:\n (X_offset, X_var, _) = _incremental_mean_and_var(X, last_mean=0.0, last_variance=0.0, last_sample_count=0.0, sample_weight=sample_weight)\n else:\n X_offset = np.average(X, axis=0, weights=sample_weight)\n X_offset = X_offset.astype(X.dtype, copy=False)\n X -= X_offset\n if normalize:\n X_var = X_var.astype(X.dtype, copy=False)\n constant_mask = _is_constant_feature(X_var, X_offset, X.shape[0])\n if sample_weight is None:\n X_var *= X.shape[0]\n else:\n X_var *= sample_weight.sum()\n X_scale = np.sqrt(X_var, out=X_var)\n X_scale[constant_mask] = 1.0\n if sp.issparse(X):\n inplace_column_scale(X, 1.0 / X_scale)\n else:\n X /= X_scale\n else:\n X_scale = np.ones(X.shape[1], dtype=X.dtype)\n y_offset = np.average(y, axis=0, weights=sample_weight)\n y = y - y_offset\n else:\n X_offset = np.zeros(X.shape[1], dtype=X.dtype)\n X_scale = np.ones(X.shape[1], dtype=X.dtype)\n if y.ndim == 1:\n y_offset = X.dtype.type(0)\n else:\n y_offset = np.zeros(y.shape[1], dtype=X.dtype)\n return X, y, X_offset, y_offset, X_scale" }, { @@ -95828,7 +102102,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -95838,7 +102113,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -95848,13 +102124,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Rescale data sample-wise by square root of sample_weight.\n\nFor many linear models, this enables easy support for sample_weight.", - "docstring": "Rescale data sample-wise by square root of sample_weight.\n\nFor many linear models, this enables easy support for sample_weight.\n\nReturns\n-------\nX_rescaled : {array-like, sparse matrix}\n\ny_rescaled : {array-like, sparse matrix}", + "docstring": "Rescale data sample-wise by square root of sample_weight.\n\n For many linear models, this enables easy support for sample_weight.\n\n Returns\n -------\n X_rescaled : {array-like, sparse matrix}\n\n y_rescaled : {array-like, sparse matrix}\n ", "source_code": "\ndef _rescale_data(X, y, sample_weight):\n \"\"\"Rescale data sample-wise by square root of sample_weight.\n\n For many linear models, this enables easy support for sample_weight.\n\n Returns\n -------\n X_rescaled : {array-like, sparse matrix}\n\n y_rescaled : {array-like, sparse matrix}\n \"\"\"\n n_samples = X.shape[0]\n sample_weight = np.asarray(sample_weight)\n if sample_weight.ndim == 0:\n sample_weight = np.full(n_samples, sample_weight, dtype=sample_weight.dtype)\n sample_weight = np.sqrt(sample_weight)\n sw_matrix = 
sparse.dia_matrix((sample_weight, 0), shape=(n_samples, n_samples))\n X = safe_sparse_dot(sw_matrix, X)\n y = safe_sparse_dot(sw_matrix, y)\n return X, y" }, { @@ -95872,7 +102149,8 @@ "docstring": { "type": "array-like, shape (n_samples, n_features)", "description": "Training data" - } + }, + "refined_type": {} }, { "name": "y", @@ -95882,7 +102160,8 @@ "docstring": { "type": "array-like, shape (n_samples, )", "description": "Target values." - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -95892,7 +102171,8 @@ "docstring": { "type": "numpy array of shape (n_samples,)", "description": "The weight of each sample" - } + }, + "refined_type": {} }, { "name": "random_state", @@ -95902,13 +102182,14 @@ "docstring": { "type": "int, RandomState instance or None (default)", "description": "Determines random number generation for dataset shuffling and noise.\nPass an int for reproducible output across multiple function calls.\nSee :term:`Glossary `." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Create ``Dataset`` abstraction for sparse and dense inputs.\n\nThis also returns the ``intercept_decay`` which is different for sparse datasets.", - "docstring": "Create ``Dataset`` abstraction for sparse and dense inputs.\n\nThis also returns the ``intercept_decay`` which is different\nfor sparse datasets.\n\nParameters\n----------\nX : array-like, shape (n_samples, n_features)\n Training data\n\ny : array-like, shape (n_samples, )\n Target values.\n\nsample_weight : numpy array of shape (n_samples,)\n The weight of each sample\n\nrandom_state : int, RandomState instance or None (default)\n Determines random number generation for dataset shuffling and noise.\n Pass an int for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\nReturns\n-------\ndataset\n The ``Dataset`` abstraction\nintercept_decay\n The intercept decay", + "description": "Create ``Dataset`` abstraction for sparse and dense inputs.\n\nThis also returns the ``intercept_decay`` which is different\nfor sparse datasets.", + "docstring": "Create ``Dataset`` abstraction for sparse and dense inputs.\n\n This also returns the ``intercept_decay`` which is different\n for sparse datasets.\n\n Parameters\n ----------\n X : array-like, shape (n_samples, n_features)\n Training data\n\n y : array-like, shape (n_samples, )\n Target values.\n\n sample_weight : numpy array of shape (n_samples,)\n The weight of each sample\n\n random_state : int, RandomState instance or None (default)\n Determines random number generation for dataset shuffling and noise.\n Pass an int for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n Returns\n -------\n dataset\n The ``Dataset`` abstraction\n intercept_decay\n The intercept decay\n ", "source_code": "\ndef make_dataset(X, y, sample_weight, random_state=None):\n \"\"\"Create ``Dataset`` abstraction for sparse and dense inputs.\n\n This also returns the ``intercept_decay`` which is different\n for sparse datasets.\n\n Parameters\n ----------\n X : array-like, shape (n_samples, n_features)\n Training data\n\n y : array-like, shape (n_samples, )\n Target values.\n\n sample_weight : numpy array of shape (n_samples,)\n The weight of each sample\n\n random_state : int, RandomState instance or None (default)\n Determines random number generation for dataset shuffling and noise.\n Pass an int for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n Returns\n -------\n dataset\n The 
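As an illustrative aside to the `_rescale_data` entry above (not part of the generated data file): a numpy-only sketch showing why rescaling rows by the square root of `sample_weight` gives easy sample-weight support, since ordinary least squares on the rescaled data solves the weighted least-squares problem; all names are local to the sketch.

    import numpy as np

    rng = np.random.RandomState(0)
    X, y = rng.rand(40, 2), rng.rand(40)
    w = rng.rand(40) + 0.1                      # strictly positive sample weights

    sw = np.sqrt(w)
    Xw, yw = X * sw[:, None], y * sw            # row-wise rescaling by sqrt(w)

    coef_rescaled, *_ = np.linalg.lstsq(Xw, yw, rcond=None)
    # Weighted normal equations solved directly, for comparison.
    coef_wls = np.linalg.solve(X.T @ (w[:, None] * X), X.T @ (w * y))

    assert np.allclose(coef_rescaled, coef_wls)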
``Dataset`` abstraction\n intercept_decay\n The intercept decay\n \"\"\"\n rng = check_random_state(random_state)\n seed = rng.randint(1, np.iinfo(np.int32).max)\n if X.dtype == np.float32:\n CSRData = CSRDataset32\n ArrayData = ArrayDataset32\n else:\n CSRData = CSRDataset64\n ArrayData = ArrayDataset64\n if sp.issparse(X):\n dataset = CSRData(X.data, X.indptr, X.indices, y, sample_weight, seed=seed)\n intercept_decay = SPARSE_INTERCEPT_DECAY\n else:\n X = np.ascontiguousarray(X)\n dataset = ArrayData(X, y, sample_weight, seed=seed)\n intercept_decay = 1.0\n return dataset, intercept_decay" }, { @@ -95926,7 +102207,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_iter", @@ -95936,7 +102218,8 @@ "docstring": { "type": "int, default=300", "description": "Maximum number of iterations." - } + }, + "refined_type": {} }, { "name": "tol", @@ -95946,7 +102229,8 @@ "docstring": { "type": "float, default=1e-3", "description": "Stop the algorithm if w has converged." - } + }, + "refined_type": {} }, { "name": "alpha_1", @@ -95956,7 +102240,8 @@ "docstring": { "type": "float, default=1e-6", "description": "Hyper-parameter : shape parameter for the Gamma distribution prior\nover the alpha parameter." - } + }, + "refined_type": {} }, { "name": "alpha_2", @@ -95966,7 +102251,8 @@ "docstring": { "type": "float, default=1e-6", "description": "Hyper-parameter : inverse scale parameter (rate parameter) for the\nGamma distribution prior over the alpha parameter." - } + }, + "refined_type": {} }, { "name": "lambda_1", @@ -95976,7 +102262,8 @@ "docstring": { "type": "float, default=1e-6", "description": "Hyper-parameter : shape parameter for the Gamma distribution prior\nover the lambda parameter." - } + }, + "refined_type": {} }, { "name": "lambda_2", @@ -95986,7 +102273,8 @@ "docstring": { "type": "float, default=1e-6", "description": "Hyper-parameter : inverse scale parameter (rate parameter) for the\nGamma distribution prior over the lambda parameter." - } + }, + "refined_type": {} }, { "name": "compute_score", @@ -95996,7 +102284,8 @@ "docstring": { "type": "bool, default=False", "description": "If True, compute the objective function at each step of the model." - } + }, + "refined_type": {} }, { "name": "threshold_lambda", @@ -96006,7 +102295,8 @@ "docstring": { "type": "float, default=10 000", "description": "Threshold for removing (pruning) weights with high precision from\nthe computation." - } + }, + "refined_type": {} }, { "name": "fit_intercept", @@ -96016,7 +102306,8 @@ "docstring": { "type": "bool, default=True", "description": "Whether to calculate the intercept for this model. If set\nto false, no intercept will be used in calculations\n(i.e. data is expected to be centered)." - } + }, + "refined_type": {} }, { "name": "normalize", @@ -96026,7 +102317,8 @@ "docstring": { "type": "bool, default=False", "description": "This parameter is ignored when ``fit_intercept`` is set to False.\nIf True, the regressors X will be normalized before regression by\nsubtracting the mean and dividing by the l2-norm.\nIf you wish to standardize, please use\n:class:`~sklearn.preprocessing.StandardScaler` before calling ``fit``\non an estimator with ``normalize=False``.\n\n.. deprecated:: 1.0\n ``normalize`` was deprecated in version 1.0 and will be removed in\n 1.2." - } + }, + "refined_type": {} }, { "name": "copy_X", @@ -96036,7 +102328,8 @@ "docstring": { "type": "bool, default=True", "description": "If True, X will be copied; else, it may be overwritten." 
- } + }, + "refined_type": {} }, { "name": "verbose", @@ -96046,13 +102339,14 @@ "docstring": { "type": "bool, default=False", "description": "Verbose mode when fitting the model." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, *, n_iter=300, tol=0.001, alpha_1=1e-06, alpha_2=1e-06, lambda_1=1e-06, lambda_2=1e-06, compute_score=False, threshold_lambda=10000.0, fit_intercept=True, normalize='deprecated', copy_X=True, verbose=False):\n self.n_iter = n_iter\n self.tol = tol\n self.fit_intercept = fit_intercept\n self.normalize = normalize\n self.alpha_1 = alpha_1\n self.alpha_2 = alpha_2\n self.lambda_1 = lambda_1\n self.lambda_2 = lambda_2\n self.compute_score = compute_score\n self.threshold_lambda = threshold_lambda\n self.copy_X = copy_X\n self.verbose = verbose" }, { @@ -96070,7 +102364,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -96080,7 +102375,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "alpha_", @@ -96090,7 +102386,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "lambda_", @@ -96100,7 +102397,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "keep_lambda", @@ -96110,13 +102408,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _update_sigma(self, X, alpha_, lambda_, keep_lambda):\n X_keep = X[:, keep_lambda]\n gram = np.dot(X_keep.T, X_keep)\n eye = np.eye(gram.shape[0])\n sigma_inv = lambda_[keep_lambda] * eye + alpha_ * gram\n sigma_ = pinvh(sigma_inv)\n return sigma_" }, { @@ -96134,7 +102433,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -96144,7 +102444,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "alpha_", @@ -96154,7 +102455,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "lambda_", @@ -96164,7 +102466,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "keep_lambda", @@ -96174,13 +102477,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _update_sigma_woodbury(self, X, alpha_, lambda_, keep_lambda):\n n_samples = X.shape[0]\n X_keep = X[:, keep_lambda]\n inv_lambda = 1 / lambda_[keep_lambda].reshape(1, -1)\n sigma_ = pinvh(np.eye(n_samples) / alpha_ + np.dot(X_keep * inv_lambda, X_keep.T))\n sigma_ = np.dot(sigma_, X_keep * inv_lambda)\n sigma_ = -np.dot(inv_lambda.reshape(-1, 1) * X_keep.T, sigma_)\n sigma_[np.diag_indices(sigma_.shape[1])] += 1.0 / lambda_[keep_lambda]\n return sigma_" }, { @@ -96198,7 +102502,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -96208,7 +102513,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "Training vector, where `n_samples` is the number of samples and\n`n_features` is the number of features." 
- } + }, + "refined_type": {} }, { "name": "y", @@ -96218,13 +102524,14 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "Target values (integers). Will be cast to X's dtype if necessary." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Fit the model according to the given training data and parameters.\n\nIterative procedure to maximize the evidence", - "docstring": "Fit the model according to the given training data and parameters.\n\nIterative procedure to maximize the evidence\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n Training vector, where `n_samples` is the number of samples and\n `n_features` is the number of features.\ny : array-like of shape (n_samples,)\n Target values (integers). Will be cast to X's dtype if necessary.\n\nReturns\n-------\nself : object\n Fitted estimator.", + "docstring": "Fit the model according to the given training data and parameters.\n\n Iterative procedure to maximize the evidence\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training vector, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n y : array-like of shape (n_samples,)\n Target values (integers). Will be cast to X's dtype if necessary.\n\n Returns\n -------\n self : object\n Fitted estimator.\n ", "source_code": "\ndef fit(self, X, y):\n \"\"\"Fit the model according to the given training data and parameters.\n\n Iterative procedure to maximize the evidence\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training vector, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n y : array-like of shape (n_samples,)\n Target values (integers). 
Will be cast to X's dtype if necessary.\n\n Returns\n -------\n self : object\n Fitted estimator.\n \"\"\"\n self._normalize = _deprecate_normalize(self.normalize, default=False, estimator_name=self.__class__.__name__)\n (X, y) = self._validate_data(X, y, dtype=np.float64, y_numeric=True, ensure_min_samples=2)\n (n_samples, n_features) = X.shape\n coef_ = np.zeros(n_features)\n (X, y, X_offset_, y_offset_, X_scale_) = self._preprocess_data(X, y, self.fit_intercept, self._normalize, self.copy_X)\n self.X_offset_ = X_offset_\n self.X_scale_ = X_scale_\n keep_lambda = np.ones(n_features, dtype=bool)\n lambda_1 = self.lambda_1\n lambda_2 = self.lambda_2\n alpha_1 = self.alpha_1\n alpha_2 = self.alpha_2\n verbose = self.verbose\n eps = np.finfo(np.float64).eps\n alpha_ = 1.0 / (np.var(y) + eps)\n lambda_ = np.ones(n_features)\n self.scores_ = list()\n coef_old_ = None\n \n def update_coeff(X, y, coef_, alpha_, keep_lambda, sigma_):\n coef_[keep_lambda] = alpha_ * np.linalg.multi_dot([sigma_, X[:, keep_lambda].T, y])\n return coef_\n update_sigma = self._update_sigma if n_samples >= n_features else self._update_sigma_woodbury\n for iter_ in range(self.n_iter):\n sigma_ = update_sigma(X, alpha_, lambda_, keep_lambda)\n coef_ = update_coeff(X, y, coef_, alpha_, keep_lambda, sigma_)\n rmse_ = np.sum((y - np.dot(X, coef_))**2)\n gamma_ = 1.0 - lambda_[keep_lambda] * np.diag(sigma_)\n lambda_[keep_lambda] = (gamma_ + 2.0 * lambda_1) / (coef_[keep_lambda]**2 + 2.0 * lambda_2)\n alpha_ = (n_samples - gamma_.sum() + 2.0 * alpha_1) / (rmse_ + 2.0 * alpha_2)\n keep_lambda = lambda_ < self.threshold_lambda\n coef_[~keep_lambda] = 0\n if self.compute_score:\n s = (lambda_1 * np.log(lambda_) - lambda_2 * lambda_).sum()\n s += alpha_1 * log(alpha_) - alpha_2 * alpha_\n s += 0.5 * (fast_logdet(sigma_) + n_samples * log(alpha_) + np.sum(np.log(lambda_)))\n s -= 0.5 * (alpha_ * rmse_ + (lambda_ * coef_**2).sum())\n self.scores_.append(s)\n if iter_ > 0 and np.sum(np.abs(coef_old_ - coef_)) < self.tol:\n if verbose:\n print('Converged after %s iterations' % iter_)\n break\n coef_old_ = np.copy(coef_)\n if not keep_lambda.any():\n break\n if keep_lambda.any():\n sigma_ = update_sigma(X, alpha_, lambda_, keep_lambda)\n coef_ = update_coeff(X, y, coef_, alpha_, keep_lambda, sigma_)\n else:\n sigma_ = np.array([]).reshape(0, 0)\n self.coef_ = coef_\n self.alpha_ = alpha_\n self.sigma_ = sigma_\n self.lambda_ = lambda_\n self._set_intercept(X_offset_, y_offset_, X_scale_)\n return self" }, { @@ -96242,7 +102549,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -96252,6 +102560,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "Samples." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -96262,13 +102574,14 @@ "docstring": { "type": "bool, default=False", "description": "Whether to return the standard deviation of posterior prediction." 
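As an illustrative aside to the `ARDRegression.fit` entry above (not part of the generated data file): a minimal usage sketch on synthetic data with a single informative feature, so most precisions in `lambda_` exceed `threshold_lambda` and the corresponding coefficients are pruned to zero; data and names are local to the sketch.

    import numpy as np
    from sklearn.linear_model import ARDRegression

    rng = np.random.RandomState(0)
    X = rng.randn(100, 10)
    y = 2.0 * X[:, 0] + 0.1 * rng.randn(100)    # only the first feature matters

    reg = ARDRegression().fit(X, y)
    print(reg.coef_)                             # close to zero except the first entry
    print(reg.lambda_ > reg.threshold_lambda)    # True for the pruned features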
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Predict using the linear model.\n\nIn addition to the mean of the predictive distribution, also its standard deviation can be returned.", - "docstring": "Predict using the linear model.\n\nIn addition to the mean of the predictive distribution, also its\nstandard deviation can be returned.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n Samples.\n\nreturn_std : bool, default=False\n Whether to return the standard deviation of posterior prediction.\n\nReturns\n-------\ny_mean : array-like of shape (n_samples,)\n Mean of predictive distribution of query points.\n\ny_std : array-like of shape (n_samples,)\n Standard deviation of predictive distribution of query points.", + "description": "Predict using the linear model.\n\nIn addition to the mean of the predictive distribution, also its\nstandard deviation can be returned.", + "docstring": "Predict using the linear model.\n\n In addition to the mean of the predictive distribution, also its\n standard deviation can be returned.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Samples.\n\n return_std : bool, default=False\n Whether to return the standard deviation of posterior prediction.\n\n Returns\n -------\n y_mean : array-like of shape (n_samples,)\n Mean of predictive distribution of query points.\n\n y_std : array-like of shape (n_samples,)\n Standard deviation of predictive distribution of query points.\n ", "source_code": "\ndef predict(self, X, return_std=False):\n \"\"\"Predict using the linear model.\n\n In addition to the mean of the predictive distribution, also its\n standard deviation can be returned.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Samples.\n\n return_std : bool, default=False\n Whether to return the standard deviation of posterior prediction.\n\n Returns\n -------\n y_mean : array-like of shape (n_samples,)\n Mean of predictive distribution of query points.\n\n y_std : array-like of shape (n_samples,)\n Standard deviation of predictive distribution of query points.\n \"\"\"\n y_mean = self._decision_function(X)\n if return_std is False:\n return y_mean\n else:\n if self._normalize:\n X = (X - self.X_offset_) / self.X_scale_\n X = X[:, self.lambda_ < self.threshold_lambda]\n sigmas_squared_data = (np.dot(X, self.sigma_) * X).sum(axis=1)\n y_std = np.sqrt(sigmas_squared_data + 1.0 / self.alpha_)\n return y_mean, y_std" }, { @@ -96286,7 +102599,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_iter", @@ -96296,7 +102610,8 @@ "docstring": { "type": "int, default=300", "description": "Maximum number of iterations. Should be greater than or equal to 1." - } + }, + "refined_type": {} }, { "name": "tol", @@ -96306,7 +102621,8 @@ "docstring": { "type": "float, default=1e-3", "description": "Stop the algorithm if w has converged." - } + }, + "refined_type": {} }, { "name": "alpha_1", @@ -96316,7 +102632,8 @@ "docstring": { "type": "float, default=1e-6", "description": "Hyper-parameter : shape parameter for the Gamma distribution prior\nover the alpha parameter." - } + }, + "refined_type": {} }, { "name": "alpha_2", @@ -96326,7 +102643,8 @@ "docstring": { "type": "float, default=1e-6", "description": "Hyper-parameter : inverse scale parameter (rate parameter) for the\nGamma distribution prior over the alpha parameter." 
- } + }, + "refined_type": {} }, { "name": "lambda_1", @@ -96336,7 +102654,8 @@ "docstring": { "type": "float, default=1e-6", "description": "Hyper-parameter : shape parameter for the Gamma distribution prior\nover the lambda parameter." - } + }, + "refined_type": {} }, { "name": "lambda_2", @@ -96346,7 +102665,8 @@ "docstring": { "type": "float, default=1e-6", "description": "Hyper-parameter : inverse scale parameter (rate parameter) for the\nGamma distribution prior over the lambda parameter." - } + }, + "refined_type": {} }, { "name": "alpha_init", @@ -96356,7 +102676,8 @@ "docstring": { "type": "float, default=None", "description": "Initial value for alpha (precision of the noise).\nIf not set, alpha_init is 1/Var(y).\n\n .. versionadded:: 0.22" - } + }, + "refined_type": {} }, { "name": "lambda_init", @@ -96366,7 +102687,8 @@ "docstring": { "type": "float, default=None", "description": "Initial value for lambda (precision of the weights).\nIf not set, lambda_init is 1.\n\n .. versionadded:: 0.22" - } + }, + "refined_type": {} }, { "name": "compute_score", @@ -96376,7 +102698,8 @@ "docstring": { "type": "bool, default=False", "description": "If True, compute the log marginal likelihood at each iteration of the\noptimization." - } + }, + "refined_type": {} }, { "name": "fit_intercept", @@ -96386,7 +102709,8 @@ "docstring": { "type": "bool, default=True", "description": "Whether to calculate the intercept for this model.\nThe intercept is not treated as a probabilistic parameter\nand thus has no associated variance. If set\nto False, no intercept will be used in calculations\n(i.e. data is expected to be centered)." - } + }, + "refined_type": {} }, { "name": "normalize", @@ -96396,7 +102720,8 @@ "docstring": { "type": "bool, default=False", "description": "This parameter is ignored when ``fit_intercept`` is set to False.\nIf True, the regressors X will be normalized before regression by\nsubtracting the mean and dividing by the l2-norm.\nIf you wish to standardize, please use\n:class:`~sklearn.preprocessing.StandardScaler` before calling ``fit``\non an estimator with ``normalize=False``.\n\n.. deprecated:: 1.0\n ``normalize`` was deprecated in version 1.0 and will be removed in\n 1.2." - } + }, + "refined_type": {} }, { "name": "copy_X", @@ -96406,7 +102731,8 @@ "docstring": { "type": "bool, default=True", "description": "If True, X will be copied; else, it may be overwritten." - } + }, + "refined_type": {} }, { "name": "verbose", @@ -96416,13 +102742,14 @@ "docstring": { "type": "bool, default=False", "description": "Verbose mode when fitting the model." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, *, n_iter=300, tol=0.001, alpha_1=1e-06, alpha_2=1e-06, lambda_1=1e-06, lambda_2=1e-06, alpha_init=None, lambda_init=None, compute_score=False, fit_intercept=True, normalize='deprecated', copy_X=True, verbose=False):\n self.n_iter = n_iter\n self.tol = tol\n self.alpha_1 = alpha_1\n self.alpha_2 = alpha_2\n self.lambda_1 = lambda_1\n self.lambda_2 = lambda_2\n self.alpha_init = alpha_init\n self.lambda_init = lambda_init\n self.compute_score = compute_score\n self.fit_intercept = fit_intercept\n self.normalize = normalize\n self.copy_X = copy_X\n self.verbose = verbose" }, { @@ -96440,7 +102767,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_samples", @@ -96450,7 +102778,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_features", @@ -96460,7 +102789,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "eigen_vals", @@ -96470,7 +102800,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "alpha_", @@ -96480,7 +102811,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "lambda_", @@ -96490,7 +102822,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "coef", @@ -96500,7 +102833,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "rmse", @@ -96510,7 +102844,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -96534,7 +102869,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -96544,7 +102880,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -96554,7 +102891,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_samples", @@ -96564,7 +102902,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_features", @@ -96574,7 +102913,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "XT_y", @@ -96584,7 +102924,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "U", @@ -96594,7 +102935,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "Vh", @@ -96604,7 +102946,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "eigen_vals_", @@ -96614,7 +102957,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "alpha_", @@ -96624,7 +102968,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "lambda_", @@ -96634,13 +102979,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Update posterior mean and compute corresponding rmse.\n\nPosterior mean is given by coef_ = scaled_sigma_ * X.T * y where scaled_sigma_ = (lambda_/alpha_ * np.eye(n_features) + np.dot(X.T, X))^-1", - "docstring": "Update posterior mean and compute corresponding rmse.\n\nPosterior mean is given by coef_ = scaled_sigma_ * X.T * y where\nscaled_sigma_ = (lambda_/alpha_ * np.eye(n_features)\n + np.dot(X.T, X))^-1", + "description": 
"Update posterior mean and compute corresponding rmse.\n\nPosterior mean is given by coef_ = scaled_sigma_ * X.T * y where\nscaled_sigma_ = (lambda_/alpha_ * np.eye(n_features)\n + np.dot(X.T, X))^-1", + "docstring": "Update posterior mean and compute corresponding rmse.\n\n Posterior mean is given by coef_ = scaled_sigma_ * X.T * y where\n scaled_sigma_ = (lambda_/alpha_ * np.eye(n_features)\n + np.dot(X.T, X))^-1\n ", "source_code": "\ndef _update_coef_(self, X, y, n_samples, n_features, XT_y, U, Vh, eigen_vals_, alpha_, lambda_):\n \"\"\"Update posterior mean and compute corresponding rmse.\n\n Posterior mean is given by coef_ = scaled_sigma_ * X.T * y where\n scaled_sigma_ = (lambda_/alpha_ * np.eye(n_features)\n + np.dot(X.T, X))^-1\n \"\"\"\n if n_samples > n_features:\n coef_ = np.linalg.multi_dot([Vh.T, Vh / (eigen_vals_ + lambda_ / alpha_)[:, np.newaxis], XT_y])\n else:\n coef_ = np.linalg.multi_dot([X.T, U / (eigen_vals_ + lambda_ / alpha_)[None, :], U.T, y])\n rmse_ = np.sum((y - np.dot(X, coef_))**2)\n return coef_, rmse_" }, { @@ -96658,7 +103004,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -96668,7 +103015,8 @@ "docstring": { "type": "ndarray of shape (n_samples, n_features)", "description": "Training data." - } + }, + "refined_type": {} }, { "name": "y", @@ -96678,7 +103026,8 @@ "docstring": { "type": "ndarray of shape (n_samples,)", "description": "Target values. Will be cast to X's dtype if necessary." - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -96688,13 +103037,14 @@ "docstring": { "type": "ndarray of shape (n_samples,), default=None", "description": "Individual weights for each sample.\n\n.. versionadded:: 0.20\n parameter *sample_weight* support to BayesianRidge." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Fit the model.", - "docstring": "Fit the model.\n\nParameters\n----------\nX : ndarray of shape (n_samples, n_features)\n Training data.\ny : ndarray of shape (n_samples,)\n Target values. Will be cast to X's dtype if necessary.\n\nsample_weight : ndarray of shape (n_samples,), default=None\n Individual weights for each sample.\n\n .. versionadded:: 0.20\n parameter *sample_weight* support to BayesianRidge.\n\nReturns\n-------\nself : object\n Returns the instance itself.", + "docstring": "Fit the model.\n\n Parameters\n ----------\n X : ndarray of shape (n_samples, n_features)\n Training data.\n y : ndarray of shape (n_samples,)\n Target values. Will be cast to X's dtype if necessary.\n\n sample_weight : ndarray of shape (n_samples,), default=None\n Individual weights for each sample.\n\n .. versionadded:: 0.20\n parameter *sample_weight* support to BayesianRidge.\n\n Returns\n -------\n self : object\n Returns the instance itself.\n ", "source_code": "\ndef fit(self, X, y, sample_weight=None):\n \"\"\"Fit the model.\n\n Parameters\n ----------\n X : ndarray of shape (n_samples, n_features)\n Training data.\n y : ndarray of shape (n_samples,)\n Target values. Will be cast to X's dtype if necessary.\n\n sample_weight : ndarray of shape (n_samples,), default=None\n Individual weights for each sample.\n\n .. versionadded:: 0.20\n parameter *sample_weight* support to BayesianRidge.\n\n Returns\n -------\n self : object\n Returns the instance itself.\n \"\"\"\n self._normalize = _deprecate_normalize(self.normalize, default=False, estimator_name=self.__class__.__name__)\n if self.n_iter < 1:\n raise ValueError('n_iter should be greater than or equal to 1. 
Got {!r}.'.format(self.n_iter))\n (X, y) = self._validate_data(X, y, dtype=np.float64, y_numeric=True)\n if sample_weight is not None:\n sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype)\n (X, y, X_offset_, y_offset_, X_scale_) = self._preprocess_data(X, y, self.fit_intercept, self._normalize, self.copy_X, sample_weight=sample_weight)\n if sample_weight is not None:\n (X, y) = _rescale_data(X, y, sample_weight)\n self.X_offset_ = X_offset_\n self.X_scale_ = X_scale_\n (n_samples, n_features) = X.shape\n eps = np.finfo(np.float64).eps\n alpha_ = self.alpha_init\n lambda_ = self.lambda_init\n if alpha_ is None:\n alpha_ = 1.0 / (np.var(y) + eps)\n if lambda_ is None:\n lambda_ = 1.0\n verbose = self.verbose\n lambda_1 = self.lambda_1\n lambda_2 = self.lambda_2\n alpha_1 = self.alpha_1\n alpha_2 = self.alpha_2\n self.scores_ = list()\n coef_old_ = None\n XT_y = np.dot(X.T, y)\n (U, S, Vh) = linalg.svd(X, full_matrices=False)\n eigen_vals_ = S**2\n for iter_ in range(self.n_iter):\n (coef_, rmse_) = self._update_coef_(X, y, n_samples, n_features, XT_y, U, Vh, eigen_vals_, alpha_, lambda_)\n if self.compute_score:\n s = self._log_marginal_likelihood(n_samples, n_features, eigen_vals_, alpha_, lambda_, coef_, rmse_)\n self.scores_.append(s)\n gamma_ = np.sum(alpha_ * eigen_vals_ / (lambda_ + alpha_ * eigen_vals_))\n lambda_ = (gamma_ + 2 * lambda_1) / (np.sum(coef_**2) + 2 * lambda_2)\n alpha_ = (n_samples - gamma_ + 2 * alpha_1) / (rmse_ + 2 * alpha_2)\n if iter_ != 0 and np.sum(np.abs(coef_old_ - coef_)) < self.tol:\n if verbose:\n print('Convergence after ', str(iter_), ' iterations')\n break\n coef_old_ = np.copy(coef_)\n self.n_iter_ = iter_ + 1\n self.alpha_ = alpha_\n self.lambda_ = lambda_\n (self.coef_, rmse_) = self._update_coef_(X, y, n_samples, n_features, XT_y, U, Vh, eigen_vals_, alpha_, lambda_)\n if self.compute_score:\n s = self._log_marginal_likelihood(n_samples, n_features, eigen_vals_, alpha_, lambda_, coef_, rmse_)\n self.scores_.append(s)\n self.scores_ = np.array(self.scores_)\n scaled_sigma_ = np.dot(Vh.T, Vh / (eigen_vals_ + lambda_ / alpha_)[:, np.newaxis])\n self.sigma_ = 1.0 / alpha_ * scaled_sigma_\n self._set_intercept(X_offset_, y_offset_, X_scale_)\n return self" }, { @@ -96712,7 +103062,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -96722,6 +103073,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "Samples." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -96732,13 +103087,14 @@ "docstring": { "type": "bool, default=False", "description": "Whether to return the standard deviation of posterior prediction." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Predict using the linear model.\n\nIn addition to the mean of the predictive distribution, also its standard deviation can be returned.", - "docstring": "Predict using the linear model.\n\nIn addition to the mean of the predictive distribution, also its\nstandard deviation can be returned.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n Samples.\n\nreturn_std : bool, default=False\n Whether to return the standard deviation of posterior prediction.\n\nReturns\n-------\ny_mean : array-like of shape (n_samples,)\n Mean of predictive distribution of query points.\n\ny_std : array-like of shape (n_samples,)\n Standard deviation of predictive distribution of query points.", + "description": "Predict using the linear model.\n\nIn addition to the mean of the predictive distribution, also its\nstandard deviation can be returned.", + "docstring": "Predict using the linear model.\n\n In addition to the mean of the predictive distribution, also its\n standard deviation can be returned.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Samples.\n\n return_std : bool, default=False\n Whether to return the standard deviation of posterior prediction.\n\n Returns\n -------\n y_mean : array-like of shape (n_samples,)\n Mean of predictive distribution of query points.\n\n y_std : array-like of shape (n_samples,)\n Standard deviation of predictive distribution of query points.\n ", "source_code": "\ndef predict(self, X, return_std=False):\n \"\"\"Predict using the linear model.\n\n In addition to the mean of the predictive distribution, also its\n standard deviation can be returned.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Samples.\n\n return_std : bool, default=False\n Whether to return the standard deviation of posterior prediction.\n\n Returns\n -------\n y_mean : array-like of shape (n_samples,)\n Mean of predictive distribution of query points.\n\n y_std : array-like of shape (n_samples,)\n Standard deviation of predictive distribution of query points.\n \"\"\"\n y_mean = self._decision_function(X)\n if return_std is False:\n return y_mean\n else:\n if self._normalize:\n X = (X - self.X_offset_) / self.X_scale_\n sigmas_squared_data = (np.dot(X, self.sigma_) * X).sum(axis=1)\n y_std = np.sqrt(sigmas_squared_data + 1.0 / self.alpha_)\n return y_mean, y_std" }, { @@ -96756,7 +103112,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "alpha", @@ -96766,7 +103123,8 @@ "docstring": { "type": "float, default=1.0", "description": "Constant that multiplies the penalty terms. Defaults to 1.0.\nSee the notes for the exact mathematical meaning of this\nparameter. ``alpha = 0`` is equivalent to an ordinary least square,\nsolved by the :class:`LinearRegression` object. For numerical\nreasons, using ``alpha = 0`` with the ``Lasso`` object is not advised.\nGiven this, you should use the :class:`LinearRegression` object." - } + }, + "refined_type": {} }, { "name": "l1_ratio", @@ -96776,7 +103134,8 @@ "docstring": { "type": "float, default=0.5", "description": "The ElasticNet mixing parameter, with ``0 <= l1_ratio <= 1``. For\n``l1_ratio = 0`` the penalty is an L2 penalty. ``For l1_ratio = 1`` it\nis an L1 penalty. For ``0 < l1_ratio < 1``, the penalty is a\ncombination of L1 and L2." 
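As an illustrative aside to the `BayesianRidge.predict` entry above (not part of the generated data file): a minimal sketch of requesting the posterior standard deviation along with the mean prediction; the synthetic data and names are local to the sketch.

    import numpy as np
    from sklearn.linear_model import BayesianRidge

    rng = np.random.RandomState(0)
    X = rng.randn(200, 5)
    y = X @ np.array([1.0, 0.5, 0.0, -1.0, 2.0]) + 0.3 * rng.randn(200)

    reg = BayesianRidge(compute_score=True).fit(X, y)
    y_mean, y_std = reg.predict(X[:5], return_std=True)
    print(y_mean.shape, y_std.shape)             # (5,) and (5,)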
- } + }, + "refined_type": {} }, { "name": "fit_intercept", @@ -96786,7 +103145,8 @@ "docstring": { "type": "bool, default=True", "description": "Whether the intercept should be estimated or not. If ``False``, the\ndata is assumed to be already centered." - } + }, + "refined_type": {} }, { "name": "normalize", @@ -96796,7 +103156,8 @@ "docstring": { "type": "bool, default=False", "description": "This parameter is ignored when ``fit_intercept`` is set to False.\nIf True, the regressors X will be normalized before regression by\nsubtracting the mean and dividing by the l2-norm.\nIf you wish to standardize, please use\n:class:`~sklearn.preprocessing.StandardScaler` before calling ``fit``\non an estimator with ``normalize=False``.\n\n.. deprecated:: 1.0\n ``normalize`` was deprecated in version 1.0 and will be removed in\n 1.2." - } + }, + "refined_type": {} }, { "name": "precompute", @@ -96806,7 +103167,8 @@ "docstring": { "type": "bool or array-like of shape (n_features, n_features), default=False", "description": "Whether to use a precomputed Gram matrix to speed up\ncalculations. The Gram matrix can also be passed as argument.\nFor sparse input this option is always ``False`` to preserve sparsity." - } + }, + "refined_type": {} }, { "name": "max_iter", @@ -96816,7 +103178,8 @@ "docstring": { "type": "int, default=1000", "description": "The maximum number of iterations." - } + }, + "refined_type": {} }, { "name": "copy_X", @@ -96826,7 +103189,8 @@ "docstring": { "type": "bool, default=True", "description": "If ``True``, X will be copied; else, it may be overwritten." - } + }, + "refined_type": {} }, { "name": "tol", @@ -96836,7 +103200,8 @@ "docstring": { "type": "float, default=1e-4", "description": "The tolerance for the optimization: if the updates are\nsmaller than ``tol``, the optimization code checks the\ndual gap for optimality and continues until it is smaller\nthan ``tol``." - } + }, + "refined_type": {} }, { "name": "warm_start", @@ -96846,7 +103211,8 @@ "docstring": { "type": "bool, default=False", "description": "When set to ``True``, reuse the solution of the previous call to fit as\ninitialization, otherwise, just erase the previous solution.\nSee :term:`the Glossary `." - } + }, + "refined_type": {} }, { "name": "positive", @@ -96856,7 +103222,8 @@ "docstring": { "type": "bool, default=False", "description": "When set to ``True``, forces the coefficients to be positive." - } + }, + "refined_type": {} }, { "name": "random_state", @@ -96866,7 +103233,8 @@ "docstring": { "type": "int, RandomState instance, default=None", "description": "The seed of the pseudo random number generator that selects a random\nfeature to update. Used when ``selection`` == 'random'.\nPass an int for reproducible output across multiple function calls.\nSee :term:`Glossary `." - } + }, + "refined_type": {} }, { "name": "selection", @@ -96876,13 +103244,17 @@ "docstring": { "type": "{'cyclic', 'random'}, default='cyclic'", "description": "If set to 'random', a random coefficient is updated every iteration\nrather than looping over features sequentially by default. This\n(setting to 'random') often leads to significantly faster convergence\nespecially when tol is higher than 1e-4." 
+ }, + "refined_type": { + "kind": "EnumType", + "values": ["random", "cyclic"] } } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, alpha=1.0, *, l1_ratio=0.5, fit_intercept=True, normalize='deprecated', precompute=False, max_iter=1000, copy_X=True, tol=0.0001, warm_start=False, positive=False, random_state=None, selection='cyclic'):\n self.alpha = alpha\n self.l1_ratio = l1_ratio\n self.fit_intercept = fit_intercept\n self.normalize = normalize\n self.precompute = precompute\n self.max_iter = max_iter\n self.copy_X = copy_X\n self.tol = tol\n self.warm_start = warm_start\n self.positive = positive\n self.random_state = random_state\n self.selection = selection" }, { @@ -96900,7 +103272,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -96910,13 +103283,14 @@ "docstring": { "type": "numpy array or scipy.sparse matrix of shape (n_samples, n_features)", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Decision function of the linear model.", - "docstring": "Decision function of the linear model.\n\nParameters\n----------\nX : numpy array or scipy.sparse matrix of shape (n_samples, n_features)\n\nReturns\n-------\nT : ndarray of shape (n_samples,)\n The predicted decision function.", + "docstring": "Decision function of the linear model.\n\n Parameters\n ----------\n X : numpy array or scipy.sparse matrix of shape (n_samples, n_features)\n\n Returns\n -------\n T : ndarray of shape (n_samples,)\n The predicted decision function.\n ", "source_code": "\ndef _decision_function(self, X):\n \"\"\"Decision function of the linear model.\n\n Parameters\n ----------\n X : numpy array or scipy.sparse matrix of shape (n_samples, n_features)\n\n Returns\n -------\n T : ndarray of shape (n_samples,)\n The predicted decision function.\n \"\"\"\n check_is_fitted(self)\n if sparse.isspmatrix(X):\n return safe_sparse_dot(X, self.coef_.T, dense_output=True) + self.intercept_\n else:\n return super()._decision_function(X)" }, { @@ -96934,7 +103308,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -96944,6 +103319,10 @@ "docstring": { "type": "{ndarray, sparse matrix} of (n_samples, n_features)", "description": "Data." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -96954,6 +103333,10 @@ "docstring": { "type": "{ndarray, sparse matrix} of shape (n_samples,) or (n_samples, n_targets)", "description": "Target. Will be cast to X's dtype if necessary." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -96964,7 +103347,8 @@ "docstring": { "type": "float or array-like of shape (n_samples,), default=None", "description": "Sample weights. Internally, the `sample_weight` vector will be\nrescaled to sum to `n_samples`.\n\n.. versionadded:: 0.23" - } + }, + "refined_type": {} }, { "name": "check_input", @@ -96974,13 +103358,14 @@ "docstring": { "type": "bool, default=True", "description": "Allow to bypass several input checking.\nDon't use this parameter unless you know what you do." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Fit model with coordinate descent.", - "docstring": "Fit model with coordinate descent.\n\nParameters\n----------\nX : {ndarray, sparse matrix} of (n_samples, n_features)\n Data.\n\ny : {ndarray, sparse matrix} of shape (n_samples,) or (n_samples, n_targets)\n Target. 
Will be cast to X's dtype if necessary.\n\nsample_weight : float or array-like of shape (n_samples,), default=None\n Sample weights. Internally, the `sample_weight` vector will be\n rescaled to sum to `n_samples`.\n\n .. versionadded:: 0.23\n\ncheck_input : bool, default=True\n Allow to bypass several input checking.\n Don't use this parameter unless you know what you do.\n\nReturns\n-------\nself : object\n Fitted estimator.\n\nNotes\n-----\nCoordinate descent is an algorithm that considers each column of\ndata at a time hence it will automatically convert the X input\nas a Fortran-contiguous numpy array if necessary.\n\nTo avoid memory re-allocation it is advised to allocate the\ninitial data in memory directly using that format.", + "docstring": "Fit model with coordinate descent.\n\n Parameters\n ----------\n X : {ndarray, sparse matrix} of (n_samples, n_features)\n Data.\n\n y : {ndarray, sparse matrix} of shape (n_samples,) or (n_samples, n_targets)\n Target. Will be cast to X's dtype if necessary.\n\n sample_weight : float or array-like of shape (n_samples,), default=None\n Sample weights. Internally, the `sample_weight` vector will be\n rescaled to sum to `n_samples`.\n\n .. versionadded:: 0.23\n\n check_input : bool, default=True\n Allow to bypass several input checking.\n Don't use this parameter unless you know what you do.\n\n Returns\n -------\n self : object\n Fitted estimator.\n\n Notes\n -----\n Coordinate descent is an algorithm that considers each column of\n data at a time hence it will automatically convert the X input\n as a Fortran-contiguous numpy array if necessary.\n\n To avoid memory re-allocation it is advised to allocate the\n initial data in memory directly using that format.\n ", "source_code": "\ndef fit(self, X, y, sample_weight=None, check_input=True):\n \"\"\"Fit model with coordinate descent.\n\n Parameters\n ----------\n X : {ndarray, sparse matrix} of (n_samples, n_features)\n Data.\n\n y : {ndarray, sparse matrix} of shape (n_samples,) or (n_samples, n_targets)\n Target. Will be cast to X's dtype if necessary.\n\n sample_weight : float or array-like of shape (n_samples,), default=None\n Sample weights. Internally, the `sample_weight` vector will be\n rescaled to sum to `n_samples`.\n\n .. versionadded:: 0.23\n\n check_input : bool, default=True\n Allow to bypass several input checking.\n Don't use this parameter unless you know what you do.\n\n Returns\n -------\n self : object\n Fitted estimator.\n\n Notes\n -----\n Coordinate descent is an algorithm that considers each column of\n data at a time hence it will automatically convert the X input\n as a Fortran-contiguous numpy array if necessary.\n\n To avoid memory re-allocation it is advised to allocate the\n initial data in memory directly using that format.\n \"\"\"\n _normalize = _deprecate_normalize(self.normalize, default=False, estimator_name=self.__class__.__name__)\n if self.alpha == 0:\n warnings.warn('With alpha=0, this algorithm does not converge well. You are advised to use the LinearRegression estimator', stacklevel=2)\n if isinstance(self.precompute, str):\n raise ValueError('precompute should be one of True, False or array-like. 
Got %r' % self.precompute)\n if not isinstance(self.l1_ratio, numbers.Number) or self.l1_ratio < 0 or self.l1_ratio > 1:\n raise ValueError(f'l1_ratio must be between 0 and 1; got l1_ratio={self.l1_ratio}')\n X_copied = False\n if check_input:\n X_copied = self.copy_X and self.fit_intercept\n (X, y) = self._validate_data(X, y, accept_sparse='csc', order='F', dtype=[np.float64, np.float32], copy=X_copied, multi_output=True, y_numeric=True)\n y = check_array(y, order='F', copy=False, dtype=X.dtype.type, ensure_2d=False)\n (n_samples, n_features) = X.shape\n alpha = self.alpha\n if isinstance(sample_weight, numbers.Number):\n sample_weight = None\n if sample_weight is not None:\n if check_input:\n if sparse.issparse(X):\n raise ValueError('Sample weights do not (yet) support sparse matrices.')\n sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype)\n sample_weight = sample_weight * (n_samples / np.sum(sample_weight))\n should_copy = self.copy_X and not X_copied\n (X, y, X_offset, y_offset, X_scale, precompute, Xy) = _pre_fit(X, y, None, self.precompute, _normalize, self.fit_intercept, copy=should_copy, check_input=check_input, sample_weight=sample_weight)\n if check_input or sample_weight is not None:\n (X, y) = _set_order(X, y, order='F')\n if y.ndim == 1:\n y = y[:, np.newaxis]\n if Xy is not None and Xy.ndim == 1:\n Xy = Xy[:, np.newaxis]\n n_targets = y.shape[1]\n if self.selection not in ['cyclic', 'random']:\n raise ValueError('selection should be either random or cyclic.')\n if not self.warm_start or not hasattr(self, 'coef_'):\n coef_ = np.zeros((n_targets, n_features), dtype=X.dtype, order='F')\n else:\n coef_ = self.coef_\n if coef_.ndim == 1:\n coef_ = coef_[np.newaxis, :]\n dual_gaps_ = np.zeros(n_targets, dtype=X.dtype)\n self.n_iter_ = []\n for k in range(n_targets):\n if Xy is not None:\n this_Xy = Xy[:, k]\n else:\n this_Xy = None\n (_, this_coef, this_dual_gap, this_iter) = self.path(X, y[:, k], l1_ratio=self.l1_ratio, eps=None, n_alphas=None, alphas=[alpha], precompute=precompute, Xy=this_Xy, copy_X=True, verbose=False, tol=self.tol, positive=self.positive, X_offset=X_offset, X_scale=X_scale, return_n_iter=True, coef_init=coef_[k], max_iter=self.max_iter, random_state=self.random_state, selection=self.selection, check_input=False)\n coef_[k] = this_coef[:, 0]\n dual_gaps_[k] = this_dual_gap[0]\n self.n_iter_.append(this_iter[0])\n if n_targets == 1:\n self.n_iter_ = self.n_iter_[0]\n self.coef_ = coef_[0]\n self.dual_gap_ = dual_gaps_[0]\n else:\n self.coef_ = coef_\n self.dual_gap_ = dual_gaps_\n self._set_intercept(X_offset, y_offset, X_scale)\n self.coef_ = np.asarray(self.coef_, dtype=X.dtype)\n return self" }, { @@ -96998,7 +103383,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -97022,7 +103408,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "l1_ratio", @@ -97032,7 +103419,8 @@ "docstring": { "type": "float or list of float, default=0.5", "description": "Float between 0 and 1 passed to ElasticNet (scaling between\nl1 and l2 penalties). For ``l1_ratio = 0``\nthe penalty is an L2 penalty. For ``l1_ratio = 1`` it is an L1 penalty.\nFor ``0 < l1_ratio < 1``, the penalty is a combination of L1 and L2\nThis parameter can be a list, in which case the different\nvalues are tested by cross-validation and the one giving the best\nprediction score is used. Note that a good choice of list of\nvalues for l1_ratio is often to put more values close to 1\n(i.e. 
Lasso) and less close to 0 (i.e. Ridge), as in ``[.1, .5, .7,\n.9, .95, .99, 1]``." - } + }, + "refined_type": {} }, { "name": "eps", @@ -97042,7 +103430,8 @@ "docstring": { "type": "float, default=1e-3", "description": "Length of the path. ``eps=1e-3`` means that\n``alpha_min / alpha_max = 1e-3``." - } + }, + "refined_type": {} }, { "name": "n_alphas", @@ -97052,7 +103441,8 @@ "docstring": { "type": "int, default=100", "description": "Number of alphas along the regularization path, used for each l1_ratio." - } + }, + "refined_type": {} }, { "name": "alphas", @@ -97062,7 +103452,8 @@ "docstring": { "type": "ndarray, default=None", "description": "List of alphas where to compute the models.\nIf None alphas are set automatically." - } + }, + "refined_type": {} }, { "name": "fit_intercept", @@ -97072,7 +103463,8 @@ "docstring": { "type": "bool, default=True", "description": "Whether to calculate the intercept for this model. If set\nto false, no intercept will be used in calculations\n(i.e. data is expected to be centered)." - } + }, + "refined_type": {} }, { "name": "normalize", @@ -97082,7 +103474,8 @@ "docstring": { "type": "bool, default=False", "description": "This parameter is ignored when ``fit_intercept`` is set to False.\nIf True, the regressors X will be normalized before regression by\nsubtracting the mean and dividing by the l2-norm.\nIf you wish to standardize, please use\n:class:`~sklearn.preprocessing.StandardScaler` before calling ``fit``\non an estimator with ``normalize=False``.\n\n.. deprecated:: 1.0\n ``normalize`` was deprecated in version 1.0 and will be removed in\n 1.2." - } + }, + "refined_type": {} }, { "name": "precompute", @@ -97092,7 +103485,8 @@ "docstring": { "type": "'auto', bool or array-like of shape (n_features, n_features), default='auto'", "description": "Whether to use a precomputed Gram matrix to speed up\ncalculations. If set to ``'auto'`` let us decide. The Gram\nmatrix can also be passed as argument." - } + }, + "refined_type": {} }, { "name": "max_iter", @@ -97102,7 +103496,8 @@ "docstring": { "type": "int, default=1000", "description": "The maximum number of iterations." - } + }, + "refined_type": {} }, { "name": "tol", @@ -97112,7 +103507,8 @@ "docstring": { "type": "float, default=1e-4", "description": "The tolerance for the optimization: if the updates are\nsmaller than ``tol``, the optimization code checks the\ndual gap for optimality and continues until it is smaller\nthan ``tol``." - } + }, + "refined_type": {} }, { "name": "cv", @@ -97122,7 +103518,8 @@ "docstring": { "type": "int, cross-validation generator or iterable, default=None", "description": "Determines the cross-validation splitting strategy.\nPossible inputs for cv are:\n\n- None, to use the default 5-fold cross-validation,\n- int, to specify the number of folds.\n- :term:`CV splitter`,\n- An iterable yielding (train, test) splits as arrays of indices.\n\nFor int/None inputs, :class:`KFold` is used.\n\nRefer :ref:`User Guide ` for the various\ncross-validation strategies that can be used here.\n\n.. versionchanged:: 0.22\n ``cv`` default value if None changed from 3-fold to 5-fold." - } + }, + "refined_type": {} }, { "name": "copy_X", @@ -97132,7 +103529,8 @@ "docstring": { "type": "bool, default=True", "description": "If ``True``, X will be copied; else, it may be overwritten." - } + }, + "refined_type": {} }, { "name": "verbose", @@ -97142,7 +103540,8 @@ "docstring": { "type": "bool or int, default=0", "description": "Amount of verbosity." 
- } + }, + "refined_type": {} }, { "name": "n_jobs", @@ -97152,7 +103551,8 @@ "docstring": { "type": "int, default=None", "description": "Number of CPUs to use during the cross validation.\n``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n``-1`` means using all processors. See :term:`Glossary `\nfor more details." - } + }, + "refined_type": {} }, { "name": "positive", @@ -97162,7 +103562,8 @@ "docstring": { "type": "bool, default=False", "description": "When set to ``True``, forces the coefficients to be positive." - } + }, + "refined_type": {} }, { "name": "random_state", @@ -97172,7 +103573,8 @@ "docstring": { "type": "int, RandomState instance, default=None", "description": "The seed of the pseudo random number generator that selects a random\nfeature to update. Used when ``selection`` == 'random'.\nPass an int for reproducible output across multiple function calls.\nSee :term:`Glossary `." - } + }, + "refined_type": {} }, { "name": "selection", @@ -97182,13 +103584,17 @@ "docstring": { "type": "{'cyclic', 'random'}, default='cyclic'", "description": "If set to 'random', a random coefficient is updated every iteration\nrather than looping over features sequentially by default. This\n(setting to 'random') often leads to significantly faster convergence\nespecially when tol is higher than 1e-4." + }, + "refined_type": { + "kind": "EnumType", + "values": ["random", "cyclic"] } } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, *, l1_ratio=0.5, eps=0.001, n_alphas=100, alphas=None, fit_intercept=True, normalize='deprecated', precompute='auto', max_iter=1000, tol=0.0001, cv=None, copy_X=True, verbose=0, n_jobs=None, positive=False, random_state=None, selection='cyclic'):\n self.l1_ratio = l1_ratio\n self.eps = eps\n self.n_alphas = n_alphas\n self.alphas = alphas\n self.fit_intercept = fit_intercept\n self.normalize = normalize\n self.precompute = precompute\n self.max_iter = max_iter\n self.tol = tol\n self.cv = cv\n self.copy_X = copy_X\n self.verbose = verbose\n self.n_jobs = n_jobs\n self.positive = positive\n self.random_state = random_state\n self.selection = selection" }, { @@ -97206,13 +103612,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _get_estimator(self):\n return ElasticNet()" }, { @@ -97230,13 +103637,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _is_multitask(self):\n return False" }, { @@ -97254,13 +103662,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _more_tags(self):\n return {'multioutput': False}" }, { @@ -97278,7 +103687,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "alpha", @@ -97288,7 +103698,8 @@ "docstring": { "type": "float, default=1.0", "description": "Constant that multiplies the L1 term. Defaults to 1.0.\n``alpha = 0`` is equivalent to an ordinary least square, solved\nby the :class:`LinearRegression` object. 
For numerical\nreasons, using ``alpha = 0`` with the ``Lasso`` object is not advised.\nGiven this, you should use the :class:`LinearRegression` object." - } + }, + "refined_type": {} }, { "name": "fit_intercept", @@ -97298,7 +103709,8 @@ "docstring": { "type": "bool, default=True", "description": "Whether to calculate the intercept for this model. If set\nto False, no intercept will be used in calculations\n(i.e. data is expected to be centered)." - } + }, + "refined_type": {} }, { "name": "normalize", @@ -97308,7 +103720,8 @@ "docstring": { "type": "bool, default=False", "description": "This parameter is ignored when ``fit_intercept`` is set to False.\nIf True, the regressors X will be normalized before regression by\nsubtracting the mean and dividing by the l2-norm.\nIf you wish to standardize, please use\n:class:`~sklearn.preprocessing.StandardScaler` before calling ``fit``\non an estimator with ``normalize=False``.\n\n.. deprecated:: 1.0\n ``normalize`` was deprecated in version 1.0 and will be removed in\n 1.2." - } + }, + "refined_type": {} }, { "name": "precompute", @@ -97316,9 +103729,10 @@ "is_public": true, "assigned_by": "NAME_ONLY", "docstring": { - "type": "'auto', bool or array-like of shape (n_features, n_features), precompute", + "type": "bool or array-like of shape (n_features, n_features), default=False", "description": "Whether to use a precomputed Gram matrix to speed up\ncalculations. The Gram matrix can also be passed as argument.\nFor sparse input this option is always ``False`` to preserve sparsity." - } + }, + "refined_type": {} }, { "name": "copy_X", @@ -97328,7 +103742,8 @@ "docstring": { "type": "bool, default=True", "description": "If ``True``, X will be copied; else, it may be overwritten." - } + }, + "refined_type": {} }, { "name": "max_iter", @@ -97338,7 +103753,8 @@ "docstring": { "type": "int, default=1000", "description": "The maximum number of iterations." - } + }, + "refined_type": {} }, { "name": "tol", @@ -97348,7 +103764,8 @@ "docstring": { "type": "float, default=1e-4", "description": "The tolerance for the optimization: if the updates are\nsmaller than ``tol``, the optimization code checks the\ndual gap for optimality and continues until it is smaller\nthan ``tol``." - } + }, + "refined_type": {} }, { "name": "warm_start", @@ -97358,7 +103775,8 @@ "docstring": { "type": "bool, default=False", "description": "When set to True, reuse the solution of the previous call to fit as\ninitialization, otherwise, just erase the previous solution.\nSee :term:`the Glossary `." - } + }, + "refined_type": {} }, { "name": "positive", @@ -97368,7 +103786,8 @@ "docstring": { "type": "bool, default=False", "description": "When set to ``True``, forces the coefficients to be positive." - } + }, + "refined_type": {} }, { "name": "random_state", @@ -97378,7 +103797,8 @@ "docstring": { "type": "int, RandomState instance, default=None", "description": "The seed of the pseudo random number generator that selects a random\nfeature to update. Used when ``selection`` == 'random'.\nPass an int for reproducible output across multiple function calls.\nSee :term:`Glossary `." - } + }, + "refined_type": {} }, { "name": "selection", @@ -97388,13 +103808,17 @@ "docstring": { "type": "{'cyclic', 'random'}, default='cyclic'", "description": "If set to 'random', a random coefficient is updated every iteration\nrather than looping over features sequentially by default. This\n(setting to 'random') often leads to significantly faster convergence\nespecially when tol is higher than 1e-4." 
+ }, + "refined_type": { + "kind": "EnumType", + "values": ["random", "cyclic"] } } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, alpha=1.0, *, fit_intercept=True, normalize='deprecated', precompute=False, copy_X=True, max_iter=1000, tol=0.0001, warm_start=False, positive=False, random_state=None, selection='cyclic'):\n super().__init__(alpha=alpha, l1_ratio=1.0, fit_intercept=fit_intercept, normalize=normalize, precompute=precompute, copy_X=copy_X, max_iter=max_iter, tol=tol, warm_start=warm_start, positive=positive, random_state=random_state, selection=selection)" }, { @@ -97412,7 +103836,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "eps", @@ -97422,7 +103847,8 @@ "docstring": { "type": "float, default=1e-3", "description": "Length of the path. ``eps=1e-3`` means that\n``alpha_min / alpha_max = 1e-3``." - } + }, + "refined_type": {} }, { "name": "n_alphas", @@ -97432,7 +103858,8 @@ "docstring": { "type": "int, default=100", "description": "Number of alphas along the regularization path." - } + }, + "refined_type": {} }, { "name": "alphas", @@ -97442,7 +103869,8 @@ "docstring": { "type": "ndarray, default=None", "description": "List of alphas where to compute the models.\nIf ``None`` alphas are set automatically." - } + }, + "refined_type": {} }, { "name": "fit_intercept", @@ -97452,7 +103880,8 @@ "docstring": { "type": "bool, default=True", "description": "Whether to calculate the intercept for this model. If set\nto false, no intercept will be used in calculations\n(i.e. data is expected to be centered)." - } + }, + "refined_type": {} }, { "name": "normalize", @@ -97462,7 +103891,8 @@ "docstring": { "type": "bool, default=False", "description": "This parameter is ignored when ``fit_intercept`` is set to False.\nIf True, the regressors X will be normalized before regression by\nsubtracting the mean and dividing by the l2-norm.\nIf you wish to standardize, please use\n:class:`~sklearn.preprocessing.StandardScaler` before calling ``fit``\non an estimator with ``normalize=False``.\n\n.. deprecated:: 1.0\n ``normalize`` was deprecated in version 1.0 and will be removed in\n 1.2." - } + }, + "refined_type": {} }, { "name": "precompute", @@ -97472,7 +103902,8 @@ "docstring": { "type": "'auto', bool or array-like of shape (n_features, n_features), default='auto'", "description": "Whether to use a precomputed Gram matrix to speed up\ncalculations. If set to ``'auto'`` let us decide. The Gram\nmatrix can also be passed as argument." - } + }, + "refined_type": {} }, { "name": "max_iter", @@ -97482,7 +103913,8 @@ "docstring": { "type": "int, default=1000", "description": "The maximum number of iterations." - } + }, + "refined_type": {} }, { "name": "tol", @@ -97492,7 +103924,8 @@ "docstring": { "type": "float, default=1e-4", "description": "The tolerance for the optimization: if the updates are\nsmaller than ``tol``, the optimization code checks the\ndual gap for optimality and continues until it is smaller\nthan ``tol``." - } + }, + "refined_type": {} }, { "name": "copy_X", @@ -97502,7 +103935,8 @@ "docstring": { "type": "bool, default=True", "description": "If ``True``, X will be copied; else, it may be overwritten." 
- } + }, + "refined_type": {} }, { "name": "cv", @@ -97512,7 +103946,8 @@ "docstring": { "type": "int, cross-validation generator or iterable, default=None", "description": "Determines the cross-validation splitting strategy.\nPossible inputs for cv are:\n\n- None, to use the default 5-fold cross-validation,\n- int, to specify the number of folds.\n- :term:`CV splitter`,\n- An iterable yielding (train, test) splits as arrays of indices.\n\nFor int/None inputs, :class:`KFold` is used.\n\nRefer :ref:`User Guide ` for the various\ncross-validation strategies that can be used here.\n\n.. versionchanged:: 0.22\n ``cv`` default value if None changed from 3-fold to 5-fold." - } + }, + "refined_type": {} }, { "name": "verbose", @@ -97522,7 +103957,8 @@ "docstring": { "type": "bool or int, default=False", "description": "Amount of verbosity." - } + }, + "refined_type": {} }, { "name": "n_jobs", @@ -97532,7 +103968,8 @@ "docstring": { "type": "int, default=None", "description": "Number of CPUs to use during the cross validation.\n``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n``-1`` means using all processors. See :term:`Glossary `\nfor more details." - } + }, + "refined_type": {} }, { "name": "positive", @@ -97542,7 +103979,8 @@ "docstring": { "type": "bool, default=False", "description": "If positive, restrict regression coefficients to be positive." - } + }, + "refined_type": {} }, { "name": "random_state", @@ -97552,7 +103990,8 @@ "docstring": { "type": "int, RandomState instance, default=None", "description": "The seed of the pseudo random number generator that selects a random\nfeature to update. Used when ``selection`` == 'random'.\nPass an int for reproducible output across multiple function calls.\nSee :term:`Glossary `." - } + }, + "refined_type": {} }, { "name": "selection", @@ -97562,13 +104001,17 @@ "docstring": { "type": "{'cyclic', 'random'}, default='cyclic'", "description": "If set to 'random', a random coefficient is updated every iteration\nrather than looping over features sequentially by default. This\n(setting to 'random') often leads to significantly faster convergence\nespecially when tol is higher than 1e-4." 
+ }, + "refined_type": { + "kind": "EnumType", + "values": ["random", "cyclic"] } } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, *, eps=0.001, n_alphas=100, alphas=None, fit_intercept=True, normalize='deprecated', precompute='auto', max_iter=1000, tol=0.0001, copy_X=True, cv=None, verbose=False, n_jobs=None, positive=False, random_state=None, selection='cyclic'):\n super().__init__(eps=eps, n_alphas=n_alphas, alphas=alphas, fit_intercept=fit_intercept, normalize=normalize, precompute=precompute, max_iter=max_iter, tol=tol, copy_X=copy_X, cv=cv, verbose=verbose, n_jobs=n_jobs, positive=positive, random_state=random_state, selection=selection)" }, { @@ -97586,13 +104029,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _get_estimator(self):\n return Lasso()" }, { @@ -97610,13 +104054,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _is_multitask(self):\n return False" }, { @@ -97634,13 +104079,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _more_tags(self):\n return {'multioutput': False}" }, { @@ -97658,7 +104104,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "eps", @@ -97668,7 +104115,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_alphas", @@ -97678,7 +104126,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "alphas", @@ -97688,7 +104137,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "fit_intercept", @@ -97698,7 +104148,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "normalize", @@ -97708,7 +104159,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "precompute", @@ -97718,7 +104170,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "max_iter", @@ -97728,7 +104181,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "tol", @@ -97738,7 +104192,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "copy_X", @@ -97748,7 +104203,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "cv", @@ -97758,7 +104214,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "verbose", @@ -97768,7 +104225,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_jobs", @@ -97778,7 +104236,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "positive", @@ -97788,7 +104247,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "random_state", @@ -97798,7 +104258,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "selection", @@ -97808,13 +104269,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], 
"is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@abstractmethod\ndef __init__(self, eps=0.001, n_alphas=100, alphas=None, fit_intercept=True, normalize='deprecated', precompute='auto', max_iter=1000, tol=0.0001, copy_X=True, cv=None, verbose=False, n_jobs=None, positive=False, random_state=None, selection='cyclic'):\n self.eps = eps\n self.n_alphas = n_alphas\n self.alphas = alphas\n self.fit_intercept = fit_intercept\n self.normalize = normalize\n self.precompute = precompute\n self.max_iter = max_iter\n self.tol = tol\n self.copy_X = copy_X\n self.cv = cv\n self.verbose = verbose\n self.n_jobs = n_jobs\n self.positive = positive\n self.random_state = random_state\n self.selection = selection" }, { @@ -97832,7 +104294,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -97856,7 +104319,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -97880,13 +104344,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _more_tags(self):\n return {'_xfail_checks': {'check_sample_weights_invariance': 'zero sample_weight is not equivalent to removing samples'}}" }, { @@ -97904,7 +104369,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -97914,6 +104380,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "Training data. Pass directly as Fortran-contiguous data\nto avoid unnecessary memory duplication. If y is mono-output,\nX can be sparse." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -97924,7 +104394,8 @@ "docstring": { "type": "array-like of shape (n_samples,) or (n_samples, n_targets)", "description": "Target values." - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -97934,13 +104405,14 @@ "docstring": { "type": "float or array-like of shape (n_samples,), default=None", "description": "Sample weights used for fitting and evaluation of the weighted\nmean squared error of each cv-fold. Note that the cross validated\nMSE that is finally used to find the best model is the unweighted\nmean over the (weighted) MSEs of each test fold." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Fit linear model with coordinate descent.\n\nFit is on grid of alphas and best alpha estimated by cross-validation.", - "docstring": "Fit linear model with coordinate descent.\n\nFit is on grid of alphas and best alpha estimated by cross-validation.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training data. Pass directly as Fortran-contiguous data\n to avoid unnecessary memory duplication. If y is mono-output,\n X can be sparse.\n\ny : array-like of shape (n_samples,) or (n_samples, n_targets)\n Target values.\n\nsample_weight : float or array-like of shape (n_samples,), default=None\n Sample weights used for fitting and evaluation of the weighted\n mean squared error of each cv-fold. 
Note that the cross validated\n MSE that is finally used to find the best model is the unweighted\n mean over the (weighted) MSEs of each test fold.\n\nReturns\n-------\nself : object\n Returns an instance of fitted model.", + "docstring": "Fit linear model with coordinate descent.\n\n Fit is on grid of alphas and best alpha estimated by cross-validation.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training data. Pass directly as Fortran-contiguous data\n to avoid unnecessary memory duplication. If y is mono-output,\n X can be sparse.\n\n y : array-like of shape (n_samples,) or (n_samples, n_targets)\n Target values.\n\n sample_weight : float or array-like of shape (n_samples,), default=None\n Sample weights used for fitting and evaluation of the weighted\n mean squared error of each cv-fold. Note that the cross validated\n MSE that is finally used to find the best model is the unweighted\n mean over the (weighted) MSEs of each test fold.\n\n Returns\n -------\n self : object\n Returns an instance of fitted model.\n ", "source_code": "\ndef fit(self, X, y, sample_weight=None):\n \"\"\"Fit linear model with coordinate descent.\n\n Fit is on grid of alphas and best alpha estimated by cross-validation.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training data. Pass directly as Fortran-contiguous data\n to avoid unnecessary memory duplication. If y is mono-output,\n X can be sparse.\n\n y : array-like of shape (n_samples,) or (n_samples, n_targets)\n Target values.\n\n sample_weight : float or array-like of shape (n_samples,), default=None\n Sample weights used for fitting and evaluation of the weighted\n mean squared error of each cv-fold. Note that the cross validated\n MSE that is finally used to find the best model is the unweighted\n mean over the (weighted) MSEs of each test fold.\n\n Returns\n -------\n self : object\n Returns an instance of fitted model.\n \"\"\"\n _normalize = self.normalize\n if _normalize == 'deprecated':\n _normalize = False\n copy_X = self.copy_X and self.fit_intercept\n check_y_params = dict(copy=False, dtype=[np.float64, np.float32], ensure_2d=False)\n if isinstance(X, np.ndarray) or sparse.isspmatrix(X):\n reference_to_old_X = X\n check_X_params = dict(accept_sparse='csc', dtype=[np.float64, np.float32], copy=False)\n (X, y) = self._validate_data(X, y, validate_separately=(check_X_params, check_y_params))\n if sparse.isspmatrix(X):\n if hasattr(reference_to_old_X, 'data') and not np.may_share_memory(reference_to_old_X.data, X.data):\n copy_X = False\n elif not np.may_share_memory(reference_to_old_X, X):\n copy_X = False\n del reference_to_old_X\n else:\n check_X_params = dict(accept_sparse='csc', dtype=[np.float64, np.float32], order='F', copy=copy_X)\n (X, y) = self._validate_data(X, y, validate_separately=(check_X_params, check_y_params))\n copy_X = False\n check_consistent_length(X, y)\n if not self._is_multitask():\n if y.ndim > 1 and y.shape[1] > 1:\n raise ValueError('For multi-task outputs, use MultiTask%s' % self.__class__.__name__)\n y = column_or_1d(y, warn=True)\n elif sparse.isspmatrix(X):\n raise TypeError('X should be dense but a sparse matrix waspassed')\n elif y.ndim == 1:\n raise ValueError('For mono-task outputs, use %sCV' % self.__class__.__name__[9:])\n if isinstance(sample_weight, numbers.Number):\n sample_weight = None\n if sample_weight is not None:\n if sparse.issparse(X):\n raise ValueError('Sample weights do not (yet) support 
sparse matrices.')\n sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype)\n model = self._get_estimator()\n if self.selection not in ['random', 'cyclic']:\n raise ValueError('selection should be either random or cyclic.')\n path_params = self.get_params()\n path_params.pop('normalize', None)\n path_params.pop('fit_intercept', None)\n if 'l1_ratio' in path_params:\n l1_ratios = np.atleast_1d(path_params['l1_ratio'])\n path_params['l1_ratio'] = l1_ratios[0]\n else:\n l1_ratios = [1]\n path_params.pop('cv', None)\n path_params.pop('n_jobs', None)\n alphas = self.alphas\n n_l1_ratio = len(l1_ratios)\n if alphas is None:\n alphas = [_alpha_grid(X, y, l1_ratio=l1_ratio, fit_intercept=self.fit_intercept, eps=self.eps, n_alphas=self.n_alphas, normalize=_normalize, copy_X=self.copy_X) for l1_ratio in l1_ratios]\n else:\n alphas = np.tile(np.sort(alphas)[::-1], (n_l1_ratio, 1))\n n_alphas = len(alphas[0])\n path_params.update({'n_alphas': n_alphas})\n path_params['copy_X'] = copy_X\n if effective_n_jobs(self.n_jobs) > 1:\n path_params['copy_X'] = False\n cv = check_cv(self.cv)\n folds = list(cv.split(X, y))\n best_mse = np.inf\n jobs = (delayed(_path_residuals)(X, y, sample_weight, train, test, _normalize, self.fit_intercept, self.path, path_params, alphas=this_alphas, l1_ratio=this_l1_ratio, X_order='F', dtype=X.dtype.type) for (this_l1_ratio, this_alphas) in zip(l1_ratios, alphas) for (train, test) in folds)\n mse_paths = Parallel(n_jobs=self.n_jobs, verbose=self.verbose, **_joblib_parallel_args(prefer='threads'))(jobs)\n mse_paths = np.reshape(mse_paths, (n_l1_ratio, len(folds), -1))\n mean_mse = np.mean(mse_paths, axis=1)\n self.mse_path_ = np.squeeze(np.moveaxis(mse_paths, 2, 1))\n for (l1_ratio, l1_alphas, mse_alphas) in zip(l1_ratios, alphas, mean_mse):\n i_best_alpha = np.argmin(mse_alphas)\n this_best_mse = mse_alphas[i_best_alpha]\n if this_best_mse < best_mse:\n best_alpha = l1_alphas[i_best_alpha]\n best_l1_ratio = l1_ratio\n best_mse = this_best_mse\n self.l1_ratio_ = best_l1_ratio\n self.alpha_ = best_alpha\n if self.alphas is None:\n self.alphas_ = np.asarray(alphas)\n if n_l1_ratio == 1:\n self.alphas_ = self.alphas_[0]\n else:\n self.alphas_ = np.asarray(alphas[0])\n common_params = {name: value for (name, value) in self.get_params().items() if name in model.get_params()}\n model.set_params(**common_params)\n model.alpha = best_alpha\n model.l1_ratio = best_l1_ratio\n model.copy_X = copy_X\n precompute = getattr(self, 'precompute', None)\n if isinstance(precompute, str) and precompute == 'auto':\n model.precompute = False\n if sample_weight is None:\n model.fit(X, y)\n else:\n model.fit(X, y, sample_weight=sample_weight)\n if not hasattr(self, 'l1_ratio'):\n del self.l1_ratio_\n self.coef_ = model.coef_\n self.intercept_ = model.intercept_\n self.dual_gap_ = model.dual_gap_\n self.n_iter_ = model.n_iter_\n return self" }, { @@ -97958,7 +104430,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -97968,7 +104441,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -97992,7 +104466,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "alpha", @@ -98002,7 +104477,8 @@ "docstring": { "type": "float, default=1.0", "description": "Constant that multiplies the L1/L2 term. Defaults to 1.0." 
- } + }, + "refined_type": {} }, { "name": "l1_ratio", @@ -98012,7 +104488,8 @@ "docstring": { "type": "float, default=0.5", "description": "The ElasticNet mixing parameter, with 0 < l1_ratio <= 1.\nFor l1_ratio = 1 the penalty is an L1/L2 penalty. For l1_ratio = 0 it\nis an L2 penalty.\nFor ``0 < l1_ratio < 1``, the penalty is a combination of L1/L2 and L2." - } + }, + "refined_type": {} }, { "name": "fit_intercept", @@ -98022,7 +104499,8 @@ "docstring": { "type": "bool, default=True", "description": "Whether to calculate the intercept for this model. If set\nto false, no intercept will be used in calculations\n(i.e. data is expected to be centered)." - } + }, + "refined_type": {} }, { "name": "normalize", @@ -98032,7 +104510,8 @@ "docstring": { "type": "bool, default=False", "description": "This parameter is ignored when ``fit_intercept`` is set to False.\nIf True, the regressors X will be normalized before regression by\nsubtracting the mean and dividing by the l2-norm.\nIf you wish to standardize, please use\n:class:`~sklearn.preprocessing.StandardScaler` before calling ``fit``\non an estimator with ``normalize=False``.\n\n.. deprecated:: 1.0\n ``normalize`` was deprecated in version 1.0 and will be removed in\n 1.2." - } + }, + "refined_type": {} }, { "name": "copy_X", @@ -98042,7 +104521,8 @@ "docstring": { "type": "bool, default=True", "description": "If ``True``, X will be copied; else, it may be overwritten." - } + }, + "refined_type": {} }, { "name": "max_iter", @@ -98052,7 +104532,8 @@ "docstring": { "type": "int, default=1000", "description": "The maximum number of iterations." - } + }, + "refined_type": {} }, { "name": "tol", @@ -98062,7 +104543,8 @@ "docstring": { "type": "float, default=1e-4", "description": "The tolerance for the optimization: if the updates are\nsmaller than ``tol``, the optimization code checks the\ndual gap for optimality and continues until it is smaller\nthan ``tol``." - } + }, + "refined_type": {} }, { "name": "warm_start", @@ -98072,7 +104554,8 @@ "docstring": { "type": "bool, default=False", "description": "When set to ``True``, reuse the solution of the previous call to fit as\ninitialization, otherwise, just erase the previous solution.\nSee :term:`the Glossary `." - } + }, + "refined_type": {} }, { "name": "random_state", @@ -98082,7 +104565,8 @@ "docstring": { "type": "int, RandomState instance, default=None", "description": "The seed of the pseudo random number generator that selects a random\nfeature to update. Used when ``selection`` == 'random'.\nPass an int for reproducible output across multiple function calls.\nSee :term:`Glossary `." - } + }, + "refined_type": {} }, { "name": "selection", @@ -98092,13 +104576,17 @@ "docstring": { "type": "{'cyclic', 'random'}, default='cyclic'", "description": "If set to 'random', a random coefficient is updated every iteration\nrather than looping over features sequentially by default. This\n(setting to 'random') often leads to significantly faster convergence\nespecially when tol is higher than 1e-4." 
+ }, + "refined_type": { + "kind": "EnumType", + "values": ["random", "cyclic"] } } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, alpha=1.0, *, l1_ratio=0.5, fit_intercept=True, normalize='deprecated', copy_X=True, max_iter=1000, tol=0.0001, warm_start=False, random_state=None, selection='cyclic'):\n self.l1_ratio = l1_ratio\n self.alpha = alpha\n self.fit_intercept = fit_intercept\n self.normalize = normalize\n self.max_iter = max_iter\n self.copy_X = copy_X\n self.tol = tol\n self.warm_start = warm_start\n self.random_state = random_state\n self.selection = selection" }, { @@ -98116,13 +104604,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _more_tags(self):\n return {'multioutput_only': True}" }, { @@ -98140,7 +104629,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -98150,7 +104640,8 @@ "docstring": { "type": "ndarray of shape (n_samples, n_features)", "description": "Data." - } + }, + "refined_type": {} }, { "name": "y", @@ -98160,13 +104651,14 @@ "docstring": { "type": "ndarray of shape (n_samples, n_targets)", "description": "Target. Will be cast to X's dtype if necessary." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Fit MultiTaskElasticNet model with coordinate descent.", - "docstring": "Fit MultiTaskElasticNet model with coordinate descent.\n\nParameters\n----------\nX : ndarray of shape (n_samples, n_features)\n Data.\ny : ndarray of shape (n_samples, n_targets)\n Target. Will be cast to X's dtype if necessary.\n\nReturns\n-------\nself : object\n Fitted estimator.\n\nNotes\n-----\nCoordinate descent is an algorithm that considers each column of\ndata at a time hence it will automatically convert the X input\nas a Fortran-contiguous numpy array if necessary.\n\nTo avoid memory re-allocation it is advised to allocate the\ninitial data in memory directly using that format.", + "docstring": "Fit MultiTaskElasticNet model with coordinate descent.\n\n Parameters\n ----------\n X : ndarray of shape (n_samples, n_features)\n Data.\n y : ndarray of shape (n_samples, n_targets)\n Target. Will be cast to X's dtype if necessary.\n\n Returns\n -------\n self : object\n Fitted estimator.\n\n Notes\n -----\n Coordinate descent is an algorithm that considers each column of\n data at a time hence it will automatically convert the X input\n as a Fortran-contiguous numpy array if necessary.\n\n To avoid memory re-allocation it is advised to allocate the\n initial data in memory directly using that format.\n ", "source_code": "\ndef fit(self, X, y):\n \"\"\"Fit MultiTaskElasticNet model with coordinate descent.\n\n Parameters\n ----------\n X : ndarray of shape (n_samples, n_features)\n Data.\n y : ndarray of shape (n_samples, n_targets)\n Target. 
Will be cast to X's dtype if necessary.\n\n Returns\n -------\n self : object\n Fitted estimator.\n\n Notes\n -----\n Coordinate descent is an algorithm that considers each column of\n data at a time hence it will automatically convert the X input\n as a Fortran-contiguous numpy array if necessary.\n\n To avoid memory re-allocation it is advised to allocate the\n initial data in memory directly using that format.\n \"\"\"\n _normalize = _deprecate_normalize(self.normalize, default=False, estimator_name=self.__class__.__name__)\n check_X_params = dict(dtype=[np.float64, np.float32], order='F', copy=self.copy_X and self.fit_intercept)\n check_y_params = dict(ensure_2d=False, order='F')\n (X, y) = self._validate_data(X, y, validate_separately=(check_X_params, check_y_params))\n check_consistent_length(X, y)\n y = y.astype(X.dtype)\n if hasattr(self, 'l1_ratio'):\n model_str = 'ElasticNet'\n else:\n model_str = 'Lasso'\n if y.ndim == 1:\n raise ValueError('For mono-task outputs, use %s' % model_str)\n (n_samples, n_features) = X.shape\n n_targets = y.shape[1]\n (X, y, X_offset, y_offset, X_scale) = _preprocess_data(X, y, self.fit_intercept, _normalize, copy=False)\n if not self.warm_start or not hasattr(self, 'coef_'):\n self.coef_ = np.zeros((n_targets, n_features), dtype=X.dtype.type, order='F')\n l1_reg = self.alpha * self.l1_ratio * n_samples\n l2_reg = self.alpha * (1.0 - self.l1_ratio) * n_samples\n self.coef_ = np.asfortranarray(self.coef_)\n if self.selection not in ['random', 'cyclic']:\n raise ValueError('selection should be either random or cyclic.')\n random = self.selection == 'random'\n (self.coef_, self.dual_gap_, self.eps_, self.n_iter_) = cd_fast.enet_coordinate_descent_multi_task(self.coef_, l1_reg, l2_reg, X, y, self.max_iter, self.tol, check_random_state(self.random_state), random)\n self.dual_gap_ /= n_samples\n self._set_intercept(X_offset, y_offset, X_scale)\n return self" }, { @@ -98184,7 +104676,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "l1_ratio", @@ -98194,7 +104687,8 @@ "docstring": { "type": "float or list of float, default=0.5", "description": "The ElasticNet mixing parameter, with 0 < l1_ratio <= 1.\nFor l1_ratio = 1 the penalty is an L1/L2 penalty. For l1_ratio = 0 it\nis an L2 penalty.\nFor ``0 < l1_ratio < 1``, the penalty is a combination of L1/L2 and L2.\nThis parameter can be a list, in which case the different\nvalues are tested by cross-validation and the one giving the best\nprediction score is used. Note that a good choice of list of\nvalues for l1_ratio is often to put more values close to 1\n(i.e. Lasso) and less close to 0 (i.e. Ridge), as in ``[.1, .5, .7,\n.9, .95, .99, 1]``." - } + }, + "refined_type": {} }, { "name": "eps", @@ -98204,7 +104698,8 @@ "docstring": { "type": "float, default=1e-3", "description": "Length of the path. ``eps=1e-3`` means that\n``alpha_min / alpha_max = 1e-3``." - } + }, + "refined_type": {} }, { "name": "n_alphas", @@ -98214,7 +104709,8 @@ "docstring": { "type": "int, default=100", "description": "Number of alphas along the regularization path." - } + }, + "refined_type": {} }, { "name": "alphas", @@ -98224,7 +104720,8 @@ "docstring": { "type": "array-like, default=None", "description": "List of alphas where to compute the models.\nIf not provided, set automatically." - } + }, + "refined_type": {} }, { "name": "fit_intercept", @@ -98234,7 +104731,8 @@ "docstring": { "type": "bool, default=True", "description": "Whether to calculate the intercept for this model. 
If set\nto false, no intercept will be used in calculations\n(i.e. data is expected to be centered)." - } + }, + "refined_type": {} }, { "name": "normalize", @@ -98244,7 +104742,8 @@ "docstring": { "type": "bool, default=False", "description": "This parameter is ignored when ``fit_intercept`` is set to False.\nIf True, the regressors X will be normalized before regression by\nsubtracting the mean and dividing by the l2-norm.\nIf you wish to standardize, please use\n:class:`~sklearn.preprocessing.StandardScaler` before calling ``fit``\non an estimator with ``normalize=False``.\n\n.. deprecated:: 1.0\n ``normalize`` was deprecated in version 1.0 and will be removed in\n 1.2." - } + }, + "refined_type": {} }, { "name": "max_iter", @@ -98254,7 +104753,8 @@ "docstring": { "type": "int, default=1000", "description": "The maximum number of iterations." - } + }, + "refined_type": {} }, { "name": "tol", @@ -98264,7 +104764,8 @@ "docstring": { "type": "float, default=1e-4", "description": "The tolerance for the optimization: if the updates are\nsmaller than ``tol``, the optimization code checks the\ndual gap for optimality and continues until it is smaller\nthan ``tol``." - } + }, + "refined_type": {} }, { "name": "cv", @@ -98274,7 +104775,8 @@ "docstring": { "type": "int, cross-validation generator or iterable, default=None", "description": "Determines the cross-validation splitting strategy.\nPossible inputs for cv are:\n\n- None, to use the default 5-fold cross-validation,\n- int, to specify the number of folds.\n- :term:`CV splitter`,\n- An iterable yielding (train, test) splits as arrays of indices.\n\nFor int/None inputs, :class:`KFold` is used.\n\nRefer :ref:`User Guide ` for the various\ncross-validation strategies that can be used here.\n\n.. versionchanged:: 0.22\n ``cv`` default value if None changed from 3-fold to 5-fold." - } + }, + "refined_type": {} }, { "name": "copy_X", @@ -98284,7 +104786,8 @@ "docstring": { "type": "bool, default=True", "description": "If ``True``, X will be copied; else, it may be overwritten." - } + }, + "refined_type": {} }, { "name": "verbose", @@ -98294,7 +104797,8 @@ "docstring": { "type": "bool or int, default=0", "description": "Amount of verbosity." - } + }, + "refined_type": {} }, { "name": "n_jobs", @@ -98304,7 +104808,8 @@ "docstring": { "type": "int, default=None", "description": "Number of CPUs to use during the cross validation. Note that this is\nused only if multiple values for l1_ratio are given.\n``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n``-1`` means using all processors. See :term:`Glossary `\nfor more details." - } + }, + "refined_type": {} }, { "name": "random_state", @@ -98314,7 +104819,8 @@ "docstring": { "type": "int, RandomState instance, default=None", "description": "The seed of the pseudo random number generator that selects a random\nfeature to update. Used when ``selection`` == 'random'.\nPass an int for reproducible output across multiple function calls.\nSee :term:`Glossary `." - } + }, + "refined_type": {} }, { "name": "selection", @@ -98324,13 +104830,17 @@ "docstring": { "type": "{'cyclic', 'random'}, default='cyclic'", "description": "If set to 'random', a random coefficient is updated every iteration\nrather than looping over features sequentially by default. This\n(setting to 'random') often leads to significantly faster convergence\nespecially when tol is higher than 1e-4." 
+ }, + "refined_type": { + "kind": "EnumType", + "values": ["random", "cyclic"] } } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, *, l1_ratio=0.5, eps=0.001, n_alphas=100, alphas=None, fit_intercept=True, normalize='deprecated', max_iter=1000, tol=0.0001, cv=None, copy_X=True, verbose=0, n_jobs=None, random_state=None, selection='cyclic'):\n self.l1_ratio = l1_ratio\n self.eps = eps\n self.n_alphas = n_alphas\n self.alphas = alphas\n self.fit_intercept = fit_intercept\n self.normalize = normalize\n self.max_iter = max_iter\n self.tol = tol\n self.cv = cv\n self.copy_X = copy_X\n self.verbose = verbose\n self.n_jobs = n_jobs\n self.random_state = random_state\n self.selection = selection" }, { @@ -98348,13 +104858,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _get_estimator(self):\n return MultiTaskElasticNet()" }, { @@ -98372,13 +104883,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _is_multitask(self):\n return True" }, { @@ -98396,13 +104908,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _more_tags(self):\n return {'multioutput_only': True}" }, { @@ -98420,7 +104933,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -98430,7 +104944,8 @@ "docstring": { "type": "ndarray of shape (n_samples, n_features)", "description": "Training data." - } + }, + "refined_type": {} }, { "name": "y", @@ -98440,13 +104955,14 @@ "docstring": { "type": "ndarray of shape (n_samples, n_targets)", "description": "Training target variable. Will be cast to X's dtype if necessary." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Fit MultiTaskElasticNet model with coordinate descent.\n\nFit is on grid of alphas and best alpha estimated by cross-validation.", - "docstring": "Fit MultiTaskElasticNet model with coordinate descent.\n\nFit is on grid of alphas and best alpha estimated by cross-validation.\n\nParameters\n----------\nX : ndarray of shape (n_samples, n_features)\n Training data.\ny : ndarray of shape (n_samples, n_targets)\n Training target variable. Will be cast to X's dtype if necessary.\n\nReturns\n-------\nself : object\n Returns MultiTaskElasticNet instance.", + "docstring": "Fit MultiTaskElasticNet model with coordinate descent.\n\n Fit is on grid of alphas and best alpha estimated by cross-validation.\n\n Parameters\n ----------\n X : ndarray of shape (n_samples, n_features)\n Training data.\n y : ndarray of shape (n_samples, n_targets)\n Training target variable. Will be cast to X's dtype if necessary.\n\n Returns\n -------\n self : object\n Returns MultiTaskElasticNet instance.\n ", "source_code": "\ndef fit(self, X, y):\n \"\"\"Fit MultiTaskElasticNet model with coordinate descent.\n\n Fit is on grid of alphas and best alpha estimated by cross-validation.\n\n Parameters\n ----------\n X : ndarray of shape (n_samples, n_features)\n Training data.\n y : ndarray of shape (n_samples, n_targets)\n Training target variable. 
Will be cast to X's dtype if necessary.\n\n Returns\n -------\n self : object\n Returns MultiTaskElasticNet instance.\n \"\"\"\n return super().fit(X, y)" }, { @@ -98464,7 +104980,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "alpha", @@ -98474,7 +104991,8 @@ "docstring": { "type": "float, default=1.0", "description": "Constant that multiplies the L1/L2 term. Defaults to 1.0." - } + }, + "refined_type": {} }, { "name": "fit_intercept", @@ -98484,7 +105002,8 @@ "docstring": { "type": "bool, default=True", "description": "Whether to calculate the intercept for this model. If set\nto false, no intercept will be used in calculations\n(i.e. data is expected to be centered)." - } + }, + "refined_type": {} }, { "name": "normalize", @@ -98494,7 +105013,8 @@ "docstring": { "type": "bool, default=False", "description": "This parameter is ignored when ``fit_intercept`` is set to False.\nIf True, the regressors X will be normalized before regression by\nsubtracting the mean and dividing by the l2-norm.\nIf you wish to standardize, please use\n:class:`~sklearn.preprocessing.StandardScaler` before calling ``fit``\non an estimator with ``normalize=False``.\n\n.. deprecated:: 1.0\n ``normalize`` was deprecated in version 1.0 and will be removed in\n 1.2." - } + }, + "refined_type": {} }, { "name": "copy_X", @@ -98504,7 +105024,8 @@ "docstring": { "type": "bool, default=True", "description": "If ``True``, X will be copied; else, it may be overwritten." - } + }, + "refined_type": {} }, { "name": "max_iter", @@ -98514,7 +105035,8 @@ "docstring": { "type": "int, default=1000", "description": "The maximum number of iterations." - } + }, + "refined_type": {} }, { "name": "tol", @@ -98524,7 +105046,8 @@ "docstring": { "type": "float, default=1e-4", "description": "The tolerance for the optimization: if the updates are\nsmaller than ``tol``, the optimization code checks the\ndual gap for optimality and continues until it is smaller\nthan ``tol``." - } + }, + "refined_type": {} }, { "name": "warm_start", @@ -98534,7 +105057,8 @@ "docstring": { "type": "bool, default=False", "description": "When set to ``True``, reuse the solution of the previous call to fit as\ninitialization, otherwise, just erase the previous solution.\nSee :term:`the Glossary `." - } + }, + "refined_type": {} }, { "name": "random_state", @@ -98544,7 +105068,8 @@ "docstring": { "type": "int, RandomState instance, default=None", "description": "The seed of the pseudo random number generator that selects a random\nfeature to update. Used when ``selection`` == 'random'.\nPass an int for reproducible output across multiple function calls.\nSee :term:`Glossary `." - } + }, + "refined_type": {} }, { "name": "selection", @@ -98554,13 +105079,17 @@ "docstring": { "type": "{'cyclic', 'random'}, default='cyclic'", "description": "If set to 'random', a random coefficient is updated every iteration\nrather than looping over features sequentially by default. This\n(setting to 'random') often leads to significantly faster convergence\nespecially when tol is higher than 1e-4." 
+ }, + "refined_type": { + "kind": "EnumType", + "values": ["random", "cyclic"] } } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, alpha=1.0, *, fit_intercept=True, normalize='deprecated', copy_X=True, max_iter=1000, tol=0.0001, warm_start=False, random_state=None, selection='cyclic'):\n self.alpha = alpha\n self.fit_intercept = fit_intercept\n self.normalize = normalize\n self.max_iter = max_iter\n self.copy_X = copy_X\n self.tol = tol\n self.warm_start = warm_start\n self.l1_ratio = 1.0\n self.random_state = random_state\n self.selection = selection" }, { @@ -98578,7 +105107,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "eps", @@ -98588,7 +105118,8 @@ "docstring": { "type": "float, default=1e-3", "description": "Length of the path. ``eps=1e-3`` means that\n``alpha_min / alpha_max = 1e-3``." - } + }, + "refined_type": {} }, { "name": "n_alphas", @@ -98598,7 +105129,8 @@ "docstring": { "type": "int, default=100", "description": "Number of alphas along the regularization path." - } + }, + "refined_type": {} }, { "name": "alphas", @@ -98608,7 +105140,8 @@ "docstring": { "type": "array-like, default=None", "description": "List of alphas where to compute the models.\nIf not provided, set automatically." - } + }, + "refined_type": {} }, { "name": "fit_intercept", @@ -98618,7 +105151,8 @@ "docstring": { "type": "bool, default=True", "description": "Whether to calculate the intercept for this model. If set\nto false, no intercept will be used in calculations\n(i.e. data is expected to be centered)." - } + }, + "refined_type": {} }, { "name": "normalize", @@ -98628,7 +105162,8 @@ "docstring": { "type": "bool, default=False", "description": "This parameter is ignored when ``fit_intercept`` is set to False.\nIf True, the regressors X will be normalized before regression by\nsubtracting the mean and dividing by the l2-norm.\nIf you wish to standardize, please use\n:class:`~sklearn.preprocessing.StandardScaler` before calling ``fit``\non an estimator with ``normalize=False``.\n\n.. deprecated:: 1.0\n ``normalize`` was deprecated in version 1.0 and will be removed in\n 1.2." - } + }, + "refined_type": {} }, { "name": "max_iter", @@ -98638,7 +105173,8 @@ "docstring": { "type": "int, default=1000", "description": "The maximum number of iterations." - } + }, + "refined_type": {} }, { "name": "tol", @@ -98648,7 +105184,8 @@ "docstring": { "type": "float, default=1e-4", "description": "The tolerance for the optimization: if the updates are\nsmaller than ``tol``, the optimization code checks the\ndual gap for optimality and continues until it is smaller\nthan ``tol``." - } + }, + "refined_type": {} }, { "name": "copy_X", @@ -98658,7 +105195,8 @@ "docstring": { "type": "bool, default=True", "description": "If ``True``, X will be copied; else, it may be overwritten." - } + }, + "refined_type": {} }, { "name": "cv", @@ -98668,7 +105206,8 @@ "docstring": { "type": "int, cross-validation generator or iterable, default=None", "description": "Determines the cross-validation splitting strategy.\nPossible inputs for cv are:\n\n- None, to use the default 5-fold cross-validation,\n- int, to specify the number of folds.\n- :term:`CV splitter`,\n- An iterable yielding (train, test) splits as arrays of indices.\n\nFor int/None inputs, :class:`KFold` is used.\n\nRefer :ref:`User Guide ` for the various\ncross-validation strategies that can be used here.\n\n.. 
versionchanged:: 0.22\n ``cv`` default value if None changed from 3-fold to 5-fold." - } + }, + "refined_type": {} }, { "name": "verbose", @@ -98678,7 +105217,8 @@ "docstring": { "type": "bool or int, default=False", "description": "Amount of verbosity." - } + }, + "refined_type": {} }, { "name": "n_jobs", @@ -98688,7 +105228,8 @@ "docstring": { "type": "int, default=None", "description": "Number of CPUs to use during the cross validation. Note that this is\nused only if multiple values for l1_ratio are given.\n``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n``-1`` means using all processors. See :term:`Glossary `\nfor more details." - } + }, + "refined_type": {} }, { "name": "random_state", @@ -98698,7 +105239,8 @@ "docstring": { "type": "int, RandomState instance, default=None", "description": "The seed of the pseudo random number generator that selects a random\nfeature to update. Used when ``selection`` == 'random'.\nPass an int for reproducible output across multiple function calls.\nSee :term:`Glossary `." - } + }, + "refined_type": {} }, { "name": "selection", @@ -98708,13 +105250,17 @@ "docstring": { "type": "{'cyclic', 'random'}, default='cyclic'", "description": "If set to 'random', a random coefficient is updated every iteration\nrather than looping over features sequentially by default. This\n(setting to 'random') often leads to significantly faster convergence\nespecially when tol is higher than 1e-4." + }, + "refined_type": { + "kind": "EnumType", + "values": ["random", "cyclic"] } } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, *, eps=0.001, n_alphas=100, alphas=None, fit_intercept=True, normalize='deprecated', max_iter=1000, tol=0.0001, copy_X=True, cv=None, verbose=False, n_jobs=None, random_state=None, selection='cyclic'):\n super().__init__(eps=eps, n_alphas=n_alphas, alphas=alphas, fit_intercept=fit_intercept, normalize=normalize, max_iter=max_iter, tol=tol, copy_X=copy_X, cv=cv, verbose=verbose, n_jobs=n_jobs, random_state=random_state, selection=selection)" }, { @@ -98732,13 +105278,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _get_estimator(self):\n return MultiTaskLasso()" }, { @@ -98756,13 +105303,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _is_multitask(self):\n return True" }, { @@ -98780,13 +105328,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _more_tags(self):\n return {'multioutput_only': True}" }, { @@ -98804,7 +105353,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -98814,7 +105364,8 @@ "docstring": { "type": "ndarray of shape (n_samples, n_features)", "description": "Data." - } + }, + "refined_type": {} }, { "name": "y", @@ -98824,13 +105375,14 @@ "docstring": { "type": "ndarray of shape (n_samples, n_targets)", "description": "Target. Will be cast to X's dtype if necessary." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Fit MultiTaskLasso model with coordinate descent.\n\nFit is on grid of alphas and best alpha estimated by cross-validation.", - "docstring": "Fit MultiTaskLasso model with coordinate descent.\n\nFit is on grid of alphas and best alpha estimated by cross-validation.\n\nParameters\n----------\nX : ndarray of shape (n_samples, n_features)\n Data.\ny : ndarray of shape (n_samples, n_targets)\n Target. Will be cast to X's dtype if necessary.\n\nReturns\n-------\nself : object\n Returns an instance of fitted model.", + "docstring": "Fit MultiTaskLasso model with coordinate descent.\n\n Fit is on grid of alphas and best alpha estimated by cross-validation.\n\n Parameters\n ----------\n X : ndarray of shape (n_samples, n_features)\n Data.\n y : ndarray of shape (n_samples, n_targets)\n Target. Will be cast to X's dtype if necessary.\n\n Returns\n -------\n self : object\n Returns an instance of fitted model.\n ", "source_code": "\ndef fit(self, X, y):\n \"\"\"Fit MultiTaskLasso model with coordinate descent.\n\n Fit is on grid of alphas and best alpha estimated by cross-validation.\n\n Parameters\n ----------\n X : ndarray of shape (n_samples, n_features)\n Data.\n y : ndarray of shape (n_samples, n_targets)\n Target. Will be cast to X's dtype if necessary.\n\n Returns\n -------\n self : object\n Returns an instance of fitted model.\n \"\"\"\n return super().fit(X, y)" }, { @@ -98848,6 +105400,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "Training data. Pass directly as Fortran-contiguous data to avoid\nunnecessary memory duplication" + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -98858,7 +105414,8 @@ "docstring": { "type": "ndarray of shape (n_samples,) or (n_samples, n_outputs)", "description": "Target values" - } + }, + "refined_type": {} }, { "name": "Xy", @@ -98868,7 +105425,8 @@ "docstring": { "type": "array-like of shape (n_features,) or (n_features, n_outputs), default=None", "description": "Xy = np.dot(X.T, y) that can be precomputed." - } + }, + "refined_type": {} }, { "name": "l1_ratio", @@ -98878,7 +105436,8 @@ "docstring": { "type": "float, default=1.0", "description": "The elastic net mixing parameter, with ``0 < l1_ratio <= 1``.\nFor ``l1_ratio = 0`` the penalty is an L2 penalty. (currently not\nsupported) ``For l1_ratio = 1`` it is an L1 penalty. For\n``0 < l1_ratio <1``, the penalty is a combination of L1 and L2." - } + }, + "refined_type": {} }, { "name": "fit_intercept", @@ -98888,7 +105447,8 @@ "docstring": { "type": "bool, default=True", "description": "Whether to fit an intercept or not" - } + }, + "refined_type": {} }, { "name": "eps", @@ -98898,7 +105458,8 @@ "docstring": { "type": "float, default=1e-3", "description": "Length of the path. 
``eps=1e-3`` means that\n``alpha_min / alpha_max = 1e-3``" - } + }, + "refined_type": {} }, { "name": "n_alphas", @@ -98908,7 +105469,8 @@ "docstring": { "type": "int, default=100", "description": "Number of alphas along the regularization path" - } + }, + "refined_type": {} }, { "name": "normalize", @@ -98918,7 +105480,8 @@ "docstring": { "type": "bool, default=False", "description": "This parameter is ignored when ``fit_intercept`` is set to False.\nIf True, the regressors X will be normalized before regression by\nsubtracting the mean and dividing by the l2-norm.\nIf you wish to standardize, please use\n:class:`~sklearn.preprocessing.StandardScaler` before calling ``fit``\non an estimator with ``normalize=False``.\n\n.. deprecated:: 1.0\n ``normalize`` was deprecated in version 1.0 and will be removed in\n 1.2." - } + }, + "refined_type": {} }, { "name": "copy_X", @@ -98928,13 +105491,14 @@ "docstring": { "type": "bool, default=True", "description": "If ``True``, X will be copied; else, it may be overwritten." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Compute the grid of alpha values for elastic net parameter search", - "docstring": "Compute the grid of alpha values for elastic net parameter search\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training data. Pass directly as Fortran-contiguous data to avoid\n unnecessary memory duplication\n\ny : ndarray of shape (n_samples,) or (n_samples, n_outputs)\n Target values\n\nXy : array-like of shape (n_features,) or (n_features, n_outputs), default=None\n Xy = np.dot(X.T, y) that can be precomputed.\n\nl1_ratio : float, default=1.0\n The elastic net mixing parameter, with ``0 < l1_ratio <= 1``.\n For ``l1_ratio = 0`` the penalty is an L2 penalty. (currently not\n supported) ``For l1_ratio = 1`` it is an L1 penalty. For\n ``0 < l1_ratio <1``, the penalty is a combination of L1 and L2.\n\neps : float, default=1e-3\n Length of the path. ``eps=1e-3`` means that\n ``alpha_min / alpha_max = 1e-3``\n\nn_alphas : int, default=100\n Number of alphas along the regularization path\n\nfit_intercept : bool, default=True\n Whether to fit an intercept or not\n\nnormalize : bool, default=False\n This parameter is ignored when ``fit_intercept`` is set to False.\n If True, the regressors X will be normalized before regression by\n subtracting the mean and dividing by the l2-norm.\n If you wish to standardize, please use\n :class:`~sklearn.preprocessing.StandardScaler` before calling ``fit``\n on an estimator with ``normalize=False``.\n\n .. deprecated:: 1.0\n ``normalize`` was deprecated in version 1.0 and will be removed in\n 1.2.\n\ncopy_X : bool, default=True\n If ``True``, X will be copied; else, it may be overwritten.", + "docstring": "Compute the grid of alpha values for elastic net parameter search\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training data. Pass directly as Fortran-contiguous data to avoid\n unnecessary memory duplication\n\n y : ndarray of shape (n_samples,) or (n_samples, n_outputs)\n Target values\n\n Xy : array-like of shape (n_features,) or (n_features, n_outputs), default=None\n Xy = np.dot(X.T, y) that can be precomputed.\n\n l1_ratio : float, default=1.0\n The elastic net mixing parameter, with ``0 < l1_ratio <= 1``.\n For ``l1_ratio = 0`` the penalty is an L2 penalty. (currently not\n supported) ``For l1_ratio = 1`` it is an L1 penalty. 
For\n ``0 < l1_ratio <1``, the penalty is a combination of L1 and L2.\n\n eps : float, default=1e-3\n Length of the path. ``eps=1e-3`` means that\n ``alpha_min / alpha_max = 1e-3``\n\n n_alphas : int, default=100\n Number of alphas along the regularization path\n\n fit_intercept : bool, default=True\n Whether to fit an intercept or not\n\n normalize : bool, default=False\n This parameter is ignored when ``fit_intercept`` is set to False.\n If True, the regressors X will be normalized before regression by\n subtracting the mean and dividing by the l2-norm.\n If you wish to standardize, please use\n :class:`~sklearn.preprocessing.StandardScaler` before calling ``fit``\n on an estimator with ``normalize=False``.\n\n .. deprecated:: 1.0\n ``normalize`` was deprecated in version 1.0 and will be removed in\n 1.2.\n\n copy_X : bool, default=True\n If ``True``, X will be copied; else, it may be overwritten.\n ", "source_code": "\ndef _alpha_grid(X, y, Xy=None, l1_ratio=1.0, fit_intercept=True, eps=0.001, n_alphas=100, normalize=False, copy_X=True):\n \"\"\"Compute the grid of alpha values for elastic net parameter search\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training data. Pass directly as Fortran-contiguous data to avoid\n unnecessary memory duplication\n\n y : ndarray of shape (n_samples,) or (n_samples, n_outputs)\n Target values\n\n Xy : array-like of shape (n_features,) or (n_features, n_outputs), default=None\n Xy = np.dot(X.T, y) that can be precomputed.\n\n l1_ratio : float, default=1.0\n The elastic net mixing parameter, with ``0 < l1_ratio <= 1``.\n For ``l1_ratio = 0`` the penalty is an L2 penalty. (currently not\n supported) ``For l1_ratio = 1`` it is an L1 penalty. For\n ``0 < l1_ratio <1``, the penalty is a combination of L1 and L2.\n\n eps : float, default=1e-3\n Length of the path. ``eps=1e-3`` means that\n ``alpha_min / alpha_max = 1e-3``\n\n n_alphas : int, default=100\n Number of alphas along the regularization path\n\n fit_intercept : bool, default=True\n Whether to fit an intercept or not\n\n normalize : bool, default=False\n This parameter is ignored when ``fit_intercept`` is set to False.\n If True, the regressors X will be normalized before regression by\n subtracting the mean and dividing by the l2-norm.\n If you wish to standardize, please use\n :class:`~sklearn.preprocessing.StandardScaler` before calling ``fit``\n on an estimator with ``normalize=False``.\n\n .. deprecated:: 1.0\n ``normalize`` was deprecated in version 1.0 and will be removed in\n 1.2.\n\n copy_X : bool, default=True\n If ``True``, X will be copied; else, it may be overwritten.\n \"\"\"\n if l1_ratio == 0:\n raise ValueError('Automatic alpha grid generation is not supported for l1_ratio=0. 
Please supply a grid by providing your estimator with the appropriate `alphas=` argument.')\n n_samples = len(y)\n sparse_center = False\n if Xy is None:\n X_sparse = sparse.isspmatrix(X)\n sparse_center = X_sparse and (fit_intercept or normalize)\n X = check_array(X, accept_sparse='csc', copy=copy_X and fit_intercept and not X_sparse)\n if not X_sparse:\n (X, y, _, _, _) = _preprocess_data(X, y, fit_intercept, normalize, copy=False)\n Xy = safe_sparse_dot(X.T, y, dense_output=True)\n if sparse_center:\n (_, _, X_offset, _, X_scale) = _preprocess_data(X, y, fit_intercept, normalize, return_mean=True)\n mean_dot = X_offset * np.sum(y)\n if Xy.ndim == 1:\n Xy = Xy[:, np.newaxis]\n if sparse_center:\n if fit_intercept:\n Xy -= mean_dot[:, np.newaxis]\n if normalize:\n Xy /= X_scale[:, np.newaxis]\n alpha_max = np.sqrt(np.sum(Xy**2, axis=1)).max() / (n_samples * l1_ratio)\n if alpha_max <= np.finfo(float).resolution:\n alphas = np.empty(n_alphas)\n alphas.fill(np.finfo(float).resolution)\n return alphas\n return np.logspace(np.log10(alpha_max * eps), np.log10(alpha_max), num=n_alphas)[::-1]" }, { @@ -98952,6 +105516,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "Training data." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -98962,7 +105530,8 @@ "docstring": { "type": "array-like of shape (n_samples,) or (n_samples, n_targets)", "description": "Target values." - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -98972,7 +105541,8 @@ "docstring": { "type": "None or array-like of shape (n_samples,)", "description": "Sample weights." - } + }, + "refined_type": {} }, { "name": "train", @@ -98982,7 +105552,8 @@ "docstring": { "type": "list of indices", "description": "The indices of the train set." - } + }, + "refined_type": {} }, { "name": "test", @@ -98992,7 +105563,8 @@ "docstring": { "type": "list of indices", "description": "The indices of the test set." - } + }, + "refined_type": {} }, { "name": "normalize", @@ -99002,7 +105574,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "fit_intercept", @@ -99012,7 +105585,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "path", @@ -99022,7 +105596,8 @@ "docstring": { "type": "callable", "description": "Function returning a list of models on the path. See\nenet_path for an example of signature." - } + }, + "refined_type": {} }, { "name": "path_params", @@ -99032,7 +105607,8 @@ "docstring": { "type": "dictionary", "description": "Parameters passed to the path function." - } + }, + "refined_type": {} }, { "name": "alphas", @@ -99042,7 +105618,8 @@ "docstring": { "type": "array-like, default=None", "description": "Array of float that is used for cross-validation. If not\nprovided, computed using 'path'." - } + }, + "refined_type": {} }, { "name": "l1_ratio", @@ -99052,7 +105629,8 @@ "docstring": { "type": "float, default=1", "description": "float between 0 and 1 passed to ElasticNet (scaling between\nl1 and l2 penalties). For ``l1_ratio = 0`` the penalty is an\nL2 penalty. For ``l1_ratio = 1`` it is an L1 penalty. For ``0\n< l1_ratio < 1``, the penalty is a combination of L1 and L2." - } + }, + "refined_type": {} }, { "name": "X_order", @@ -99062,6 +105640,10 @@ "docstring": { "type": "{'F', 'C'}, default=None", "description": "The order of the arrays expected by the path function to\navoid memory copies." 
+ }, + "refined_type": { + "kind": "EnumType", + "values": ["F", "C"] } }, { @@ -99072,13 +105654,14 @@ "docstring": { "type": "a numpy dtype, default=None", "description": "The dtype of the arrays expected by the path function to\navoid memory copies." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Returns the MSE for the models computed by 'path'.", - "docstring": "Returns the MSE for the models computed by 'path'.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training data.\n\ny : array-like of shape (n_samples,) or (n_samples, n_targets)\n Target values.\n\nsample_weight : None or array-like of shape (n_samples,)\n Sample weights.\n\ntrain : list of indices\n The indices of the train set.\n\ntest : list of indices\n The indices of the test set.\n\npath : callable\n Function returning a list of models on the path. See\n enet_path for an example of signature.\n\npath_params : dictionary\n Parameters passed to the path function.\n\nalphas : array-like, default=None\n Array of float that is used for cross-validation. If not\n provided, computed using 'path'.\n\nl1_ratio : float, default=1\n float between 0 and 1 passed to ElasticNet (scaling between\n l1 and l2 penalties). For ``l1_ratio = 0`` the penalty is an\n L2 penalty. For ``l1_ratio = 1`` it is an L1 penalty. For ``0\n < l1_ratio < 1``, the penalty is a combination of L1 and L2.\n\nX_order : {'F', 'C'}, default=None\n The order of the arrays expected by the path function to\n avoid memory copies.\n\ndtype : a numpy dtype, default=None\n The dtype of the arrays expected by the path function to\n avoid memory copies.", + "docstring": "Returns the MSE for the models computed by 'path'.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training data.\n\n y : array-like of shape (n_samples,) or (n_samples, n_targets)\n Target values.\n\n sample_weight : None or array-like of shape (n_samples,)\n Sample weights.\n\n train : list of indices\n The indices of the train set.\n\n test : list of indices\n The indices of the test set.\n\n path : callable\n Function returning a list of models on the path. See\n enet_path for an example of signature.\n\n path_params : dictionary\n Parameters passed to the path function.\n\n alphas : array-like, default=None\n Array of float that is used for cross-validation. If not\n provided, computed using 'path'.\n\n l1_ratio : float, default=1\n float between 0 and 1 passed to ElasticNet (scaling between\n l1 and l2 penalties). For ``l1_ratio = 0`` the penalty is an\n L2 penalty. For ``l1_ratio = 1`` it is an L1 penalty. 
For ``0\n < l1_ratio < 1``, the penalty is a combination of L1 and L2.\n\n X_order : {'F', 'C'}, default=None\n The order of the arrays expected by the path function to\n avoid memory copies.\n\n dtype : a numpy dtype, default=None\n The dtype of the arrays expected by the path function to\n avoid memory copies.\n ", "source_code": "\ndef _path_residuals(X, y, sample_weight, train, test, normalize, fit_intercept, path, path_params, alphas=None, l1_ratio=1, X_order=None, dtype=None):\n \"\"\"Returns the MSE for the models computed by 'path'.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training data.\n\n y : array-like of shape (n_samples,) or (n_samples, n_targets)\n Target values.\n\n sample_weight : None or array-like of shape (n_samples,)\n Sample weights.\n\n train : list of indices\n The indices of the train set.\n\n test : list of indices\n The indices of the test set.\n\n path : callable\n Function returning a list of models on the path. See\n enet_path for an example of signature.\n\n path_params : dictionary\n Parameters passed to the path function.\n\n alphas : array-like, default=None\n Array of float that is used for cross-validation. If not\n provided, computed using 'path'.\n\n l1_ratio : float, default=1\n float between 0 and 1 passed to ElasticNet (scaling between\n l1 and l2 penalties). For ``l1_ratio = 0`` the penalty is an\n L2 penalty. For ``l1_ratio = 1`` it is an L1 penalty. For ``0\n < l1_ratio < 1``, the penalty is a combination of L1 and L2.\n\n X_order : {'F', 'C'}, default=None\n The order of the arrays expected by the path function to\n avoid memory copies.\n\n dtype : a numpy dtype, default=None\n The dtype of the arrays expected by the path function to\n avoid memory copies.\n \"\"\"\n X_train = X[train]\n y_train = y[train]\n X_test = X[test]\n y_test = y[test]\n if sample_weight is None:\n (sw_train, sw_test) = (None, None)\n else:\n sw_train = sample_weight[train]\n sw_test = sample_weight[test]\n n_samples = X_train.shape[0]\n sw_train *= n_samples / np.sum(sw_train)\n if not sparse.issparse(X):\n for (array, array_input) in ((X_train, X), (y_train, y), (X_test, X), (y_test, y)):\n if array.base is not array_input and not array.flags['WRITEABLE']:\n array.setflags(write=True)\n if y.ndim == 1:\n precompute = path_params['precompute']\n else:\n precompute = False\n (X_train, y_train, X_offset, y_offset, X_scale, precompute, Xy) = _pre_fit(X_train, y_train, None, precompute, normalize, fit_intercept, copy=False, sample_weight=sw_train)\n path_params = path_params.copy()\n path_params['Xy'] = Xy\n path_params['X_offset'] = X_offset\n path_params['X_scale'] = X_scale\n path_params['precompute'] = precompute\n path_params['copy_X'] = False\n path_params['alphas'] = alphas\n if 'l1_ratio' in path_params:\n path_params['l1_ratio'] = l1_ratio\n X_train = check_array(X_train, accept_sparse='csc', dtype=dtype, order=X_order)\n (alphas, coefs, _) = path(X_train, y_train, **path_params)\n del X_train, y_train\n if y.ndim == 1:\n coefs = coefs[np.newaxis, :, :]\n y_offset = np.atleast_1d(y_offset)\n y_test = y_test[:, np.newaxis]\n if normalize:\n nonzeros = np.flatnonzero(X_scale)\n coefs[:, nonzeros] /= X_scale[nonzeros][:, np.newaxis]\n intercepts = y_offset[:, np.newaxis] - np.dot(X_offset, coefs)\n X_test_coefs = safe_sparse_dot(X_test, coefs)\n residues = X_test_coefs - y_test[:, :, np.newaxis]\n residues += intercepts\n if sample_weight is None:\n this_mse = (residues**2).mean(axis=0)\n else:\n this_mse = 
np.average(residues**2, weights=sw_test, axis=0)\n return this_mse.mean(axis=0)" }, { @@ -99096,6 +105679,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "Training data." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -99106,7 +105693,8 @@ "docstring": { "type": "ndarray of shape (n_samples,)", "description": "Target values." - } + }, + "refined_type": {} }, { "name": "order", @@ -99116,13 +105704,17 @@ "docstring": { "type": "{None, 'C', 'F'}", "description": "If 'C', dense arrays are returned as C-ordered, sparse matrices in csr\nformat. If 'F', dense arrays are return as F-ordered, sparse matrices\nin csc format." + }, + "refined_type": { + "kind": "EnumType", + "values": ["F", "C"] } } ], "results": [], "is_public": false, "description": "Change the order of X and y if necessary.", - "docstring": "Change the order of X and y if necessary.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training data.\n\ny : ndarray of shape (n_samples,)\n Target values.\n\norder : {None, 'C', 'F'}\n If 'C', dense arrays are returned as C-ordered, sparse matrices in csr\n format. If 'F', dense arrays are return as F-ordered, sparse matrices\n in csc format.\n\nReturns\n-------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training data with guaranteed order.\n\ny : ndarray of shape (n_samples,)\n Target values with guaranteed order.", + "docstring": "Change the order of X and y if necessary.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training data.\n\n y : ndarray of shape (n_samples,)\n Target values.\n\n order : {None, 'C', 'F'}\n If 'C', dense arrays are returned as C-ordered, sparse matrices in csr\n format. If 'F', dense arrays are return as F-ordered, sparse matrices\n in csc format.\n\n Returns\n -------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training data with guaranteed order.\n\n y : ndarray of shape (n_samples,)\n Target values with guaranteed order.\n ", "source_code": "\ndef _set_order(X, y, order='C'):\n \"\"\"Change the order of X and y if necessary.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training data.\n\n y : ndarray of shape (n_samples,)\n Target values.\n\n order : {None, 'C', 'F'}\n If 'C', dense arrays are returned as C-ordered, sparse matrices in csr\n format. If 'F', dense arrays are return as F-ordered, sparse matrices\n in csc format.\n\n Returns\n -------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training data with guaranteed order.\n\n y : ndarray of shape (n_samples,)\n Target values with guaranteed order.\n \"\"\"\n if order not in [None, 'C', 'F']:\n raise ValueError(\"Unknown value for order. Got {} instead of None, 'C' or 'F'.\".format(order))\n sparse_X = sparse.issparse(X)\n sparse_y = sparse.issparse(y)\n if order is not None:\n sparse_format = 'csc' if order == 'F' else 'csr'\n if sparse_X:\n X = X.asformat(sparse_format, **_astype_copy_false(X))\n else:\n X = np.asarray(X, order=order)\n if sparse_y:\n y = y.asformat(sparse_format)\n else:\n y = np.asarray(y, order=order)\n return X, y" }, { @@ -99140,6 +105732,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "Training data. Pass directly as Fortran-contiguous data to avoid\nunnecessary memory duplication. 
If ``y`` is mono-output then ``X``\ncan be sparse." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -99150,6 +105746,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples,) or (n_samples, n_targets)", "description": "Target values." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -99160,7 +105760,8 @@ "docstring": { "type": "float, default=0.5", "description": "Number between 0 and 1 passed to elastic net (scaling between\nl1 and l2 penalties). ``l1_ratio=1`` corresponds to the Lasso." - } + }, + "refined_type": {} }, { "name": "eps", @@ -99170,7 +105771,8 @@ "docstring": { "type": "float, default=1e-3", "description": "Length of the path. ``eps=1e-3`` means that\n``alpha_min / alpha_max = 1e-3``." - } + }, + "refined_type": {} }, { "name": "n_alphas", @@ -99180,7 +105782,8 @@ "docstring": { "type": "int, default=100", "description": "Number of alphas along the regularization path." - } + }, + "refined_type": {} }, { "name": "alphas", @@ -99190,7 +105793,8 @@ "docstring": { "type": "ndarray, default=None", "description": "List of alphas where to compute the models.\nIf None alphas are set automatically." - } + }, + "refined_type": {} }, { "name": "precompute", @@ -99200,7 +105804,8 @@ "docstring": { "type": "'auto', bool or array-like of shape (n_features, n_features), default='auto'", "description": "Whether to use a precomputed Gram matrix to speed up\ncalculations. If set to ``'auto'`` let us decide. The Gram\nmatrix can also be passed as argument." - } + }, + "refined_type": {} }, { "name": "Xy", @@ -99210,7 +105815,8 @@ "docstring": { "type": "array-like of shape (n_features,) or (n_features, n_targets), default=None", "description": "Xy = np.dot(X.T, y) that can be precomputed. It is useful\nonly when the Gram matrix is precomputed." - } + }, + "refined_type": {} }, { "name": "copy_X", @@ -99220,7 +105826,8 @@ "docstring": { "type": "bool, default=True", "description": "If ``True``, X will be copied; else, it may be overwritten." - } + }, + "refined_type": {} }, { "name": "coef_init", @@ -99230,7 +105837,8 @@ "docstring": { "type": "ndarray of shape (n_features, ), default=None", "description": "The initial values of the coefficients." - } + }, + "refined_type": {} }, { "name": "verbose", @@ -99240,7 +105848,8 @@ "docstring": { "type": "bool or int, default=False", "description": "Amount of verbosity." - } + }, + "refined_type": {} }, { "name": "return_n_iter", @@ -99250,7 +105859,8 @@ "docstring": { "type": "bool, default=False", "description": "Whether to return the number of iterations or not." - } + }, + "refined_type": {} }, { "name": "positive", @@ -99260,7 +105870,8 @@ "docstring": { "type": "bool, default=False", "description": "If set to True, forces coefficients to be positive.\n(Only allowed when ``y.ndim == 1``)." - } + }, + "refined_type": {} }, { "name": "check_input", @@ -99270,13 +105881,14 @@ "docstring": { "type": "bool, default=True", "description": "If set to False, the input validation checks are skipped (including the\nGram matrix when provided). It is assumed that they are handled\nby the caller." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Compute elastic net path with coordinate descent.\n\nThe elastic net optimization function varies for mono and multi-outputs. 
For mono-output tasks it is:: 1 / (2 * n_samples) * ||y - Xw||^2_2 + alpha * l1_ratio * ||w||_1 + 0.5 * alpha * (1 - l1_ratio) * ||w||^2_2 For multi-output tasks it is:: (1 / (2 * n_samples)) * ||Y - XW||_Fro^2 + alpha * l1_ratio * ||W||_21 + 0.5 * alpha * (1 - l1_ratio) * ||W||_Fro^2 Where:: ||W||_21 = \\sum_i \\sqrt{\\sum_j w_{ij}^2} i.e. the sum of norm of each row. Read more in the :ref:`User Guide `.", - "docstring": "Compute elastic net path with coordinate descent.\n\nThe elastic net optimization function varies for mono and multi-outputs.\n\nFor mono-output tasks it is::\n\n 1 / (2 * n_samples) * ||y - Xw||^2_2\n + alpha * l1_ratio * ||w||_1\n + 0.5 * alpha * (1 - l1_ratio) * ||w||^2_2\n\nFor multi-output tasks it is::\n\n (1 / (2 * n_samples)) * ||Y - XW||_Fro^2\n + alpha * l1_ratio * ||W||_21\n + 0.5 * alpha * (1 - l1_ratio) * ||W||_Fro^2\n\nWhere::\n\n ||W||_21 = \\sum_i \\sqrt{\\sum_j w_{ij}^2}\n\ni.e. the sum of norm of each row.\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training data. Pass directly as Fortran-contiguous data to avoid\n unnecessary memory duplication. If ``y`` is mono-output then ``X``\n can be sparse.\n\ny : {array-like, sparse matrix} of shape (n_samples,) or (n_samples, n_targets)\n Target values.\n\nl1_ratio : float, default=0.5\n Number between 0 and 1 passed to elastic net (scaling between\n l1 and l2 penalties). ``l1_ratio=1`` corresponds to the Lasso.\n\neps : float, default=1e-3\n Length of the path. ``eps=1e-3`` means that\n ``alpha_min / alpha_max = 1e-3``.\n\nn_alphas : int, default=100\n Number of alphas along the regularization path.\n\nalphas : ndarray, default=None\n List of alphas where to compute the models.\n If None alphas are set automatically.\n\nprecompute : 'auto', bool or array-like of shape (n_features, n_features), default='auto'\n Whether to use a precomputed Gram matrix to speed up\n calculations. If set to ``'auto'`` let us decide. The Gram\n matrix can also be passed as argument.\n\nXy : array-like of shape (n_features,) or (n_features, n_targets), default=None\n Xy = np.dot(X.T, y) that can be precomputed. It is useful\n only when the Gram matrix is precomputed.\n\ncopy_X : bool, default=True\n If ``True``, X will be copied; else, it may be overwritten.\n\ncoef_init : ndarray of shape (n_features, ), default=None\n The initial values of the coefficients.\n\nverbose : bool or int, default=False\n Amount of verbosity.\n\nreturn_n_iter : bool, default=False\n Whether to return the number of iterations or not.\n\npositive : bool, default=False\n If set to True, forces coefficients to be positive.\n (Only allowed when ``y.ndim == 1``).\n\ncheck_input : bool, default=True\n If set to False, the input validation checks are skipped (including the\n Gram matrix when provided). 
It is assumed that they are handled\n by the caller.\n\n**params : kwargs\n Keyword arguments passed to the coordinate descent solver.\n\nReturns\n-------\nalphas : ndarray of shape (n_alphas,)\n The alphas along the path where models are computed.\n\ncoefs : ndarray of shape (n_features, n_alphas) or (n_targets, n_features, n_alphas)\n Coefficients along the path.\n\ndual_gaps : ndarray of shape (n_alphas,)\n The dual gaps at the end of the optimization for each alpha.\n\nn_iters : list of int\n The number of iterations taken by the coordinate descent optimizer to\n reach the specified tolerance for each alpha.\n (Is returned when ``return_n_iter`` is set to True).\n\nSee Also\n--------\nMultiTaskElasticNet : Multi-task ElasticNet model trained with L1/L2 mixed-norm as regularizer.\nMultiTaskElasticNetCV : Multi-task L1/L2 ElasticNet with built-in cross-validation.\nElasticNet : Linear regression with combined L1 and L2 priors as regularizer.\nElasticNetCV : Elastic Net model with iterative fitting along a regularization path.\n\nNotes\n-----\nFor an example, see\n:ref:`examples/linear_model/plot_lasso_coordinate_descent_path.py\n`.", + "description": "Compute elastic net path with coordinate descent.\n\nThe elastic net optimization function varies for mono and multi-outputs.\n\nFor mono-output tasks it is::\n\n 1 / (2 * n_samples) * ||y - Xw||^2_2\n + alpha * l1_ratio * ||w||_1\n + 0.5 * alpha * (1 - l1_ratio) * ||w||^2_2\n\nFor multi-output tasks it is::\n\n (1 / (2 * n_samples)) * ||Y - XW||_Fro^2\n + alpha * l1_ratio * ||W||_21\n + 0.5 * alpha * (1 - l1_ratio) * ||W||_Fro^2\n\nWhere::\n\n ||W||_21 = \\sum_i \\sqrt{\\sum_j w_{ij}^2}\n\ni.e. the sum of norm of each row.\n\nRead more in the :ref:`User Guide `.", + "docstring": "Compute elastic net path with coordinate descent.\n\n The elastic net optimization function varies for mono and multi-outputs.\n\n For mono-output tasks it is::\n\n 1 / (2 * n_samples) * ||y - Xw||^2_2\n + alpha * l1_ratio * ||w||_1\n + 0.5 * alpha * (1 - l1_ratio) * ||w||^2_2\n\n For multi-output tasks it is::\n\n (1 / (2 * n_samples)) * ||Y - XW||_Fro^2\n + alpha * l1_ratio * ||W||_21\n + 0.5 * alpha * (1 - l1_ratio) * ||W||_Fro^2\n\n Where::\n\n ||W||_21 = \\sum_i \\sqrt{\\sum_j w_{ij}^2}\n\n i.e. the sum of norm of each row.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training data. Pass directly as Fortran-contiguous data to avoid\n unnecessary memory duplication. If ``y`` is mono-output then ``X``\n can be sparse.\n\n y : {array-like, sparse matrix} of shape (n_samples,) or (n_samples, n_targets)\n Target values.\n\n l1_ratio : float, default=0.5\n Number between 0 and 1 passed to elastic net (scaling between\n l1 and l2 penalties). ``l1_ratio=1`` corresponds to the Lasso.\n\n eps : float, default=1e-3\n Length of the path. ``eps=1e-3`` means that\n ``alpha_min / alpha_max = 1e-3``.\n\n n_alphas : int, default=100\n Number of alphas along the regularization path.\n\n alphas : ndarray, default=None\n List of alphas where to compute the models.\n If None alphas are set automatically.\n\n precompute : 'auto', bool or array-like of shape (n_features, n_features), default='auto'\n Whether to use a precomputed Gram matrix to speed up\n calculations. If set to ``'auto'`` let us decide. The Gram\n matrix can also be passed as argument.\n\n Xy : array-like of shape (n_features,) or (n_features, n_targets), default=None\n Xy = np.dot(X.T, y) that can be precomputed. 
It is useful\n only when the Gram matrix is precomputed.\n\n copy_X : bool, default=True\n If ``True``, X will be copied; else, it may be overwritten.\n\n coef_init : ndarray of shape (n_features, ), default=None\n The initial values of the coefficients.\n\n verbose : bool or int, default=False\n Amount of verbosity.\n\n return_n_iter : bool, default=False\n Whether to return the number of iterations or not.\n\n positive : bool, default=False\n If set to True, forces coefficients to be positive.\n (Only allowed when ``y.ndim == 1``).\n\n check_input : bool, default=True\n If set to False, the input validation checks are skipped (including the\n Gram matrix when provided). It is assumed that they are handled\n by the caller.\n\n **params : kwargs\n Keyword arguments passed to the coordinate descent solver.\n\n Returns\n -------\n alphas : ndarray of shape (n_alphas,)\n The alphas along the path where models are computed.\n\n coefs : ndarray of shape (n_features, n_alphas) or (n_targets, n_features, n_alphas)\n Coefficients along the path.\n\n dual_gaps : ndarray of shape (n_alphas,)\n The dual gaps at the end of the optimization for each alpha.\n\n n_iters : list of int\n The number of iterations taken by the coordinate descent optimizer to\n reach the specified tolerance for each alpha.\n (Is returned when ``return_n_iter`` is set to True).\n\n See Also\n --------\n MultiTaskElasticNet : Multi-task ElasticNet model trained with L1/L2 mixed-norm as regularizer.\n MultiTaskElasticNetCV : Multi-task L1/L2 ElasticNet with built-in cross-validation.\n ElasticNet : Linear regression with combined L1 and L2 priors as regularizer.\n ElasticNetCV : Elastic Net model with iterative fitting along a regularization path.\n\n Notes\n -----\n For an example, see\n :ref:`examples/linear_model/plot_lasso_coordinate_descent_path.py\n `.\n ", "source_code": "\ndef enet_path(X, y, *, l1_ratio=0.5, eps=0.001, n_alphas=100, alphas=None, precompute='auto', Xy=None, copy_X=True, coef_init=None, verbose=False, return_n_iter=False, positive=False, check_input=True, **params):\n \"\"\"Compute elastic net path with coordinate descent.\n\n The elastic net optimization function varies for mono and multi-outputs.\n\n For mono-output tasks it is::\n\n 1 / (2 * n_samples) * ||y - Xw||^2_2\n + alpha * l1_ratio * ||w||_1\n + 0.5 * alpha * (1 - l1_ratio) * ||w||^2_2\n\n For multi-output tasks it is::\n\n (1 / (2 * n_samples)) * ||Y - XW||_Fro^2\n + alpha * l1_ratio * ||W||_21\n + 0.5 * alpha * (1 - l1_ratio) * ||W||_Fro^2\n\n Where::\n\n ||W||_21 = \\sum_i \\sqrt{\\sum_j w_{ij}^2}\n\n i.e. the sum of norm of each row.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training data. Pass directly as Fortran-contiguous data to avoid\n unnecessary memory duplication. If ``y`` is mono-output then ``X``\n can be sparse.\n\n y : {array-like, sparse matrix} of shape (n_samples,) or (n_samples, n_targets)\n Target values.\n\n l1_ratio : float, default=0.5\n Number between 0 and 1 passed to elastic net (scaling between\n l1 and l2 penalties). ``l1_ratio=1`` corresponds to the Lasso.\n\n eps : float, default=1e-3\n Length of the path. 
``eps=1e-3`` means that\n ``alpha_min / alpha_max = 1e-3``.\n\n n_alphas : int, default=100\n Number of alphas along the regularization path.\n\n alphas : ndarray, default=None\n List of alphas where to compute the models.\n If None alphas are set automatically.\n\n precompute : 'auto', bool or array-like of shape (n_features, n_features), default='auto'\n Whether to use a precomputed Gram matrix to speed up\n calculations. If set to ``'auto'`` let us decide. The Gram\n matrix can also be passed as argument.\n\n Xy : array-like of shape (n_features,) or (n_features, n_targets), default=None\n Xy = np.dot(X.T, y) that can be precomputed. It is useful\n only when the Gram matrix is precomputed.\n\n copy_X : bool, default=True\n If ``True``, X will be copied; else, it may be overwritten.\n\n coef_init : ndarray of shape (n_features, ), default=None\n The initial values of the coefficients.\n\n verbose : bool or int, default=False\n Amount of verbosity.\n\n return_n_iter : bool, default=False\n Whether to return the number of iterations or not.\n\n positive : bool, default=False\n If set to True, forces coefficients to be positive.\n (Only allowed when ``y.ndim == 1``).\n\n check_input : bool, default=True\n If set to False, the input validation checks are skipped (including the\n Gram matrix when provided). It is assumed that they are handled\n by the caller.\n\n **params : kwargs\n Keyword arguments passed to the coordinate descent solver.\n\n Returns\n -------\n alphas : ndarray of shape (n_alphas,)\n The alphas along the path where models are computed.\n\n coefs : ndarray of shape (n_features, n_alphas) or (n_targets, n_features, n_alphas)\n Coefficients along the path.\n\n dual_gaps : ndarray of shape (n_alphas,)\n The dual gaps at the end of the optimization for each alpha.\n\n n_iters : list of int\n The number of iterations taken by the coordinate descent optimizer to\n reach the specified tolerance for each alpha.\n (Is returned when ``return_n_iter`` is set to True).\n\n See Also\n --------\n MultiTaskElasticNet : Multi-task ElasticNet model trained with L1/L2 mixed-norm as regularizer.\n MultiTaskElasticNetCV : Multi-task L1/L2 ElasticNet with built-in cross-validation.\n ElasticNet : Linear regression with combined L1 and L2 priors as regularizer.\n ElasticNetCV : Elastic Net model with iterative fitting along a regularization path.\n\n Notes\n -----\n For an example, see\n :ref:`examples/linear_model/plot_lasso_coordinate_descent_path.py\n `.\n \"\"\"\n X_offset_param = params.pop('X_offset', None)\n X_scale_param = params.pop('X_scale', None)\n tol = params.pop('tol', 0.0001)\n max_iter = params.pop('max_iter', 1000)\n random_state = params.pop('random_state', None)\n selection = params.pop('selection', 'cyclic')\n if len(params) > 0:\n raise ValueError('Unexpected parameters in params', params.keys())\n if check_input:\n X = check_array(X, accept_sparse='csc', dtype=[np.float64, np.float32], order='F', copy=copy_X)\n y = check_array(y, accept_sparse='csc', dtype=X.dtype.type, order='F', copy=False, ensure_2d=False)\n if Xy is not None:\n Xy = check_array(Xy, dtype=X.dtype.type, order='C', copy=False, ensure_2d=False)\n (n_samples, n_features) = X.shape\n multi_output = False\n if y.ndim != 1:\n multi_output = True\n n_targets = y.shape[1]\n if multi_output and positive:\n raise ValueError('positive=True is not allowed for multi-output (y.ndim != 1)')\n if not multi_output and sparse.isspmatrix(X):\n if X_offset_param is not None:\n X_sparse_scaling = X_offset_param / 
X_scale_param\n X_sparse_scaling = np.asarray(X_sparse_scaling, dtype=X.dtype)\n else:\n X_sparse_scaling = np.zeros(n_features, dtype=X.dtype)\n if check_input:\n (X, y, X_offset, y_offset, X_scale, precompute, Xy) = _pre_fit(X, y, Xy, precompute, normalize=False, fit_intercept=False, copy=False, check_input=check_input)\n if alphas is None:\n alphas = _alpha_grid(X, y, Xy=Xy, l1_ratio=l1_ratio, fit_intercept=False, eps=eps, n_alphas=n_alphas, normalize=False, copy_X=False)\n else:\n alphas = np.sort(alphas)[::-1]\n n_alphas = len(alphas)\n dual_gaps = np.empty(n_alphas)\n n_iters = []\n rng = check_random_state(random_state)\n if selection not in ['random', 'cyclic']:\n raise ValueError('selection should be either random or cyclic.')\n random = selection == 'random'\n if not multi_output:\n coefs = np.empty((n_features, n_alphas), dtype=X.dtype)\n else:\n coefs = np.empty((n_targets, n_features, n_alphas), dtype=X.dtype)\n if coef_init is None:\n coef_ = np.zeros(coefs.shape[:-1], dtype=X.dtype, order='F')\n else:\n coef_ = np.asfortranarray(coef_init, dtype=X.dtype)\n for (i, alpha) in enumerate(alphas):\n l1_reg = alpha * l1_ratio * n_samples\n l2_reg = alpha * (1.0 - l1_ratio) * n_samples\n if not multi_output and sparse.isspmatrix(X):\n model = cd_fast.sparse_enet_coordinate_descent(coef_, l1_reg, l2_reg, X.data, X.indices, X.indptr, y, X_sparse_scaling, max_iter, tol, rng, random, positive)\n elif multi_output:\n model = cd_fast.enet_coordinate_descent_multi_task(coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random)\n elif isinstance(precompute, np.ndarray):\n if check_input:\n precompute = check_array(precompute, dtype=X.dtype.type, order='C')\n model = cd_fast.enet_coordinate_descent_gram(coef_, l1_reg, l2_reg, precompute, Xy, y, max_iter, tol, rng, random, positive)\n elif precompute is False:\n model = cd_fast.enet_coordinate_descent(coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive)\n else:\n raise ValueError(\"Precompute should be one of True, False, 'auto' or array-like. Got %r\" % precompute)\n (coef_, dual_gap_, eps_, n_iter_) = model\n coefs[..., i] = coef_\n dual_gaps[i] = dual_gap_ / n_samples\n n_iters.append(n_iter_)\n if verbose:\n if verbose > 2:\n print(model)\n elif verbose > 1:\n print('Path: %03i out of %03i' % (i, n_alphas))\n else:\n sys.stderr.write('.')\n if return_n_iter:\n return alphas, coefs, dual_gaps, n_iters\n return alphas, coefs, dual_gaps" }, { @@ -99294,6 +105906,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "Training data. Pass directly as Fortran-contiguous data to avoid\nunnecessary memory duplication. If ``y`` is mono-output then ``X``\ncan be sparse." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -99304,6 +105920,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples,) or (n_samples, n_targets)", "description": "Target values." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -99314,7 +105934,8 @@ "docstring": { "type": "float, default=1e-3", "description": "Length of the path. ``eps=1e-3`` means that\n``alpha_min / alpha_max = 1e-3``." - } + }, + "refined_type": {} }, { "name": "n_alphas", @@ -99324,7 +105945,8 @@ "docstring": { "type": "int, default=100", "description": "Number of alphas along the regularization path." 
- } + }, + "refined_type": {} }, { "name": "alphas", @@ -99334,7 +105956,8 @@ "docstring": { "type": "ndarray, default=None", "description": "List of alphas where to compute the models.\nIf ``None`` alphas are set automatically." - } + }, + "refined_type": {} }, { "name": "precompute", @@ -99344,7 +105967,8 @@ "docstring": { "type": "'auto', bool or array-like of shape (n_features, n_features), default='auto'", "description": "Whether to use a precomputed Gram matrix to speed up\ncalculations. If set to ``'auto'`` let us decide. The Gram\nmatrix can also be passed as argument." - } + }, + "refined_type": {} }, { "name": "Xy", @@ -99354,7 +105978,8 @@ "docstring": { "type": "array-like of shape (n_features,) or (n_features, n_targets), default=None", "description": "Xy = np.dot(X.T, y) that can be precomputed. It is useful\nonly when the Gram matrix is precomputed." - } + }, + "refined_type": {} }, { "name": "copy_X", @@ -99364,7 +105989,8 @@ "docstring": { "type": "bool, default=True", "description": "If ``True``, X will be copied; else, it may be overwritten." - } + }, + "refined_type": {} }, { "name": "coef_init", @@ -99374,7 +106000,8 @@ "docstring": { "type": "ndarray of shape (n_features, ), default=None", "description": "The initial values of the coefficients." - } + }, + "refined_type": {} }, { "name": "verbose", @@ -99384,7 +106011,8 @@ "docstring": { "type": "bool or int, default=False", "description": "Amount of verbosity." - } + }, + "refined_type": {} }, { "name": "return_n_iter", @@ -99394,7 +106022,8 @@ "docstring": { "type": "bool, default=False", "description": "Whether to return the number of iterations or not." - } + }, + "refined_type": {} }, { "name": "positive", @@ -99404,13 +106033,14 @@ "docstring": { "type": "bool, default=False", "description": "If set to True, forces coefficients to be positive.\n(Only allowed when ``y.ndim == 1``)." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Compute Lasso path with coordinate descent.\n\nThe Lasso optimization function varies for mono and multi-outputs. For mono-output tasks it is:: (1 / (2 * n_samples)) * ||y - Xw||^2_2 + alpha * ||w||_1 For multi-output tasks it is:: (1 / (2 * n_samples)) * ||Y - XW||^2_Fro + alpha * ||W||_21 Where:: ||W||_21 = \\sum_i \\sqrt{\\sum_j w_{ij}^2} i.e. the sum of norm of each row. Read more in the :ref:`User Guide `.", - "docstring": "Compute Lasso path with coordinate descent.\n\nThe Lasso optimization function varies for mono and multi-outputs.\n\nFor mono-output tasks it is::\n\n (1 / (2 * n_samples)) * ||y - Xw||^2_2 + alpha * ||w||_1\n\nFor multi-output tasks it is::\n\n (1 / (2 * n_samples)) * ||Y - XW||^2_Fro + alpha * ||W||_21\n\nWhere::\n\n ||W||_21 = \\sum_i \\sqrt{\\sum_j w_{ij}^2}\n\ni.e. the sum of norm of each row.\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training data. Pass directly as Fortran-contiguous data to avoid\n unnecessary memory duplication. If ``y`` is mono-output then ``X``\n can be sparse.\n\ny : {array-like, sparse matrix} of shape (n_samples,) or (n_samples, n_targets)\n Target values.\n\neps : float, default=1e-3\n Length of the path. 
``eps=1e-3`` means that\n ``alpha_min / alpha_max = 1e-3``.\n\nn_alphas : int, default=100\n Number of alphas along the regularization path.\n\nalphas : ndarray, default=None\n List of alphas where to compute the models.\n If ``None`` alphas are set automatically.\n\nprecompute : 'auto', bool or array-like of shape (n_features, n_features), default='auto'\n Whether to use a precomputed Gram matrix to speed up\n calculations. If set to ``'auto'`` let us decide. The Gram\n matrix can also be passed as argument.\n\nXy : array-like of shape (n_features,) or (n_features, n_targets), default=None\n Xy = np.dot(X.T, y) that can be precomputed. It is useful\n only when the Gram matrix is precomputed.\n\ncopy_X : bool, default=True\n If ``True``, X will be copied; else, it may be overwritten.\n\ncoef_init : ndarray of shape (n_features, ), default=None\n The initial values of the coefficients.\n\nverbose : bool or int, default=False\n Amount of verbosity.\n\nreturn_n_iter : bool, default=False\n Whether to return the number of iterations or not.\n\npositive : bool, default=False\n If set to True, forces coefficients to be positive.\n (Only allowed when ``y.ndim == 1``).\n\n**params : kwargs\n Keyword arguments passed to the coordinate descent solver.\n\nReturns\n-------\nalphas : ndarray of shape (n_alphas,)\n The alphas along the path where models are computed.\n\ncoefs : ndarray of shape (n_features, n_alphas) or (n_targets, n_features, n_alphas)\n Coefficients along the path.\n\ndual_gaps : ndarray of shape (n_alphas,)\n The dual gaps at the end of the optimization for each alpha.\n\nn_iters : list of int\n The number of iterations taken by the coordinate descent optimizer to\n reach the specified tolerance for each alpha.\n\nSee Also\n--------\nlars_path : Compute Least Angle Regression or Lasso path using LARS\n algorithm.\nLasso : The Lasso is a linear model that estimates sparse coefficients.\nLassoLars : Lasso model fit with Least Angle Regression a.k.a. Lars.\nLassoCV : Lasso linear model with iterative fitting along a regularization\n path.\nLassoLarsCV : Cross-validated Lasso using the LARS algorithm.\nsklearn.decomposition.sparse_encode : Estimator that can be used to\n transform signals into sparse linear combination of atoms from a fixed.\n\nNotes\n-----\nFor an example, see\n:ref:`examples/linear_model/plot_lasso_coordinate_descent_path.py\n`.\n\nTo avoid unnecessary memory duplication the X argument of the fit method\nshould be directly passed as a Fortran-contiguous numpy array.\n\nNote that in certain cases, the Lars solver may be significantly\nfaster to implement this functionality. In particular, linear\ninterpolation can be used to retrieve model coefficients between the\nvalues output by lars_path\n\nExamples\n--------\n\nComparing lasso_path and lars_path with interpolation:\n\n>>> import numpy as np\n>>> from sklearn.linear_model import lasso_path\n>>> X = np.array([[1, 2, 3.1], [2.3, 5.4, 4.3]]).T\n>>> y = np.array([1, 2, 3.1])\n>>> # Use lasso_path to compute a coefficient path\n>>> _, coef_path, _ = lasso_path(X, y, alphas=[5., 1., .5])\n>>> print(coef_path)\n[[0. 0. 0.46874778]\n [0.2159048 0.4425765 0.23689075]]\n\n>>> # Now use lars_path and 1D linear interpolation to compute the\n>>> # same path\n>>> from sklearn.linear_model import lars_path\n>>> alphas, active, coef_path_lars = lars_path(X, y, method='lasso')\n>>> from scipy import interpolate\n>>> coef_path_continuous = interpolate.interp1d(alphas[::-1],\n... 
coef_path_lars[:, ::-1])\n>>> print(coef_path_continuous([5., 1., .5]))\n[[0. 0. 0.46915237]\n [0.2159048 0.4425765 0.23668876]]", + "description": "Compute Lasso path with coordinate descent.\n\nThe Lasso optimization function varies for mono and multi-outputs.\n\nFor mono-output tasks it is::\n\n (1 / (2 * n_samples)) * ||y - Xw||^2_2 + alpha * ||w||_1\n\nFor multi-output tasks it is::\n\n (1 / (2 * n_samples)) * ||Y - XW||^2_Fro + alpha * ||W||_21\n\nWhere::\n\n ||W||_21 = \\sum_i \\sqrt{\\sum_j w_{ij}^2}\n\ni.e. the sum of norm of each row.\n\nRead more in the :ref:`User Guide `.", + "docstring": "Compute Lasso path with coordinate descent.\n\n The Lasso optimization function varies for mono and multi-outputs.\n\n For mono-output tasks it is::\n\n (1 / (2 * n_samples)) * ||y - Xw||^2_2 + alpha * ||w||_1\n\n For multi-output tasks it is::\n\n (1 / (2 * n_samples)) * ||Y - XW||^2_Fro + alpha * ||W||_21\n\n Where::\n\n ||W||_21 = \\sum_i \\sqrt{\\sum_j w_{ij}^2}\n\n i.e. the sum of norm of each row.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training data. Pass directly as Fortran-contiguous data to avoid\n unnecessary memory duplication. If ``y`` is mono-output then ``X``\n can be sparse.\n\n y : {array-like, sparse matrix} of shape (n_samples,) or (n_samples, n_targets)\n Target values.\n\n eps : float, default=1e-3\n Length of the path. ``eps=1e-3`` means that\n ``alpha_min / alpha_max = 1e-3``.\n\n n_alphas : int, default=100\n Number of alphas along the regularization path.\n\n alphas : ndarray, default=None\n List of alphas where to compute the models.\n If ``None`` alphas are set automatically.\n\n precompute : 'auto', bool or array-like of shape (n_features, n_features), default='auto'\n Whether to use a precomputed Gram matrix to speed up\n calculations. If set to ``'auto'`` let us decide. The Gram\n matrix can also be passed as argument.\n\n Xy : array-like of shape (n_features,) or (n_features, n_targets), default=None\n Xy = np.dot(X.T, y) that can be precomputed. It is useful\n only when the Gram matrix is precomputed.\n\n copy_X : bool, default=True\n If ``True``, X will be copied; else, it may be overwritten.\n\n coef_init : ndarray of shape (n_features, ), default=None\n The initial values of the coefficients.\n\n verbose : bool or int, default=False\n Amount of verbosity.\n\n return_n_iter : bool, default=False\n Whether to return the number of iterations or not.\n\n positive : bool, default=False\n If set to True, forces coefficients to be positive.\n (Only allowed when ``y.ndim == 1``).\n\n **params : kwargs\n Keyword arguments passed to the coordinate descent solver.\n\n Returns\n -------\n alphas : ndarray of shape (n_alphas,)\n The alphas along the path where models are computed.\n\n coefs : ndarray of shape (n_features, n_alphas) or (n_targets, n_features, n_alphas)\n Coefficients along the path.\n\n dual_gaps : ndarray of shape (n_alphas,)\n The dual gaps at the end of the optimization for each alpha.\n\n n_iters : list of int\n The number of iterations taken by the coordinate descent optimizer to\n reach the specified tolerance for each alpha.\n\n See Also\n --------\n lars_path : Compute Least Angle Regression or Lasso path using LARS\n algorithm.\n Lasso : The Lasso is a linear model that estimates sparse coefficients.\n LassoLars : Lasso model fit with Least Angle Regression a.k.a. 
Lars.\n LassoCV : Lasso linear model with iterative fitting along a regularization\n path.\n LassoLarsCV : Cross-validated Lasso using the LARS algorithm.\n sklearn.decomposition.sparse_encode : Estimator that can be used to\n transform signals into sparse linear combination of atoms from a fixed.\n\n Notes\n -----\n For an example, see\n :ref:`examples/linear_model/plot_lasso_coordinate_descent_path.py\n `.\n\n To avoid unnecessary memory duplication the X argument of the fit method\n should be directly passed as a Fortran-contiguous numpy array.\n\n Note that in certain cases, the Lars solver may be significantly\n faster to implement this functionality. In particular, linear\n interpolation can be used to retrieve model coefficients between the\n values output by lars_path\n\n Examples\n --------\n\n Comparing lasso_path and lars_path with interpolation:\n\n >>> import numpy as np\n >>> from sklearn.linear_model import lasso_path\n >>> X = np.array([[1, 2, 3.1], [2.3, 5.4, 4.3]]).T\n >>> y = np.array([1, 2, 3.1])\n >>> # Use lasso_path to compute a coefficient path\n >>> _, coef_path, _ = lasso_path(X, y, alphas=[5., 1., .5])\n >>> print(coef_path)\n [[0. 0. 0.46874778]\n [0.2159048 0.4425765 0.23689075]]\n\n >>> # Now use lars_path and 1D linear interpolation to compute the\n >>> # same path\n >>> from sklearn.linear_model import lars_path\n >>> alphas, active, coef_path_lars = lars_path(X, y, method='lasso')\n >>> from scipy import interpolate\n >>> coef_path_continuous = interpolate.interp1d(alphas[::-1],\n ... coef_path_lars[:, ::-1])\n >>> print(coef_path_continuous([5., 1., .5]))\n [[0. 0. 0.46915237]\n [0.2159048 0.4425765 0.23668876]]\n ", "source_code": "\ndef lasso_path(X, y, *, eps=0.001, n_alphas=100, alphas=None, precompute='auto', Xy=None, copy_X=True, coef_init=None, verbose=False, return_n_iter=False, positive=False, **params):\n \"\"\"Compute Lasso path with coordinate descent.\n\n The Lasso optimization function varies for mono and multi-outputs.\n\n For mono-output tasks it is::\n\n (1 / (2 * n_samples)) * ||y - Xw||^2_2 + alpha * ||w||_1\n\n For multi-output tasks it is::\n\n (1 / (2 * n_samples)) * ||Y - XW||^2_Fro + alpha * ||W||_21\n\n Where::\n\n ||W||_21 = \\sum_i \\sqrt{\\sum_j w_{ij}^2}\n\n i.e. the sum of norm of each row.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training data. Pass directly as Fortran-contiguous data to avoid\n unnecessary memory duplication. If ``y`` is mono-output then ``X``\n can be sparse.\n\n y : {array-like, sparse matrix} of shape (n_samples,) or (n_samples, n_targets)\n Target values.\n\n eps : float, default=1e-3\n Length of the path. ``eps=1e-3`` means that\n ``alpha_min / alpha_max = 1e-3``.\n\n n_alphas : int, default=100\n Number of alphas along the regularization path.\n\n alphas : ndarray, default=None\n List of alphas where to compute the models.\n If ``None`` alphas are set automatically.\n\n precompute : 'auto', bool or array-like of shape (n_features, n_features), default='auto'\n Whether to use a precomputed Gram matrix to speed up\n calculations. If set to ``'auto'`` let us decide. The Gram\n matrix can also be passed as argument.\n\n Xy : array-like of shape (n_features,) or (n_features, n_targets), default=None\n Xy = np.dot(X.T, y) that can be precomputed. 
It is useful\n only when the Gram matrix is precomputed.\n\n copy_X : bool, default=True\n If ``True``, X will be copied; else, it may be overwritten.\n\n coef_init : ndarray of shape (n_features, ), default=None\n The initial values of the coefficients.\n\n verbose : bool or int, default=False\n Amount of verbosity.\n\n return_n_iter : bool, default=False\n Whether to return the number of iterations or not.\n\n positive : bool, default=False\n If set to True, forces coefficients to be positive.\n (Only allowed when ``y.ndim == 1``).\n\n **params : kwargs\n Keyword arguments passed to the coordinate descent solver.\n\n Returns\n -------\n alphas : ndarray of shape (n_alphas,)\n The alphas along the path where models are computed.\n\n coefs : ndarray of shape (n_features, n_alphas) or (n_targets, n_features, n_alphas)\n Coefficients along the path.\n\n dual_gaps : ndarray of shape (n_alphas,)\n The dual gaps at the end of the optimization for each alpha.\n\n n_iters : list of int\n The number of iterations taken by the coordinate descent optimizer to\n reach the specified tolerance for each alpha.\n\n See Also\n --------\n lars_path : Compute Least Angle Regression or Lasso path using LARS\n algorithm.\n Lasso : The Lasso is a linear model that estimates sparse coefficients.\n LassoLars : Lasso model fit with Least Angle Regression a.k.a. Lars.\n LassoCV : Lasso linear model with iterative fitting along a regularization\n path.\n LassoLarsCV : Cross-validated Lasso using the LARS algorithm.\n sklearn.decomposition.sparse_encode : Estimator that can be used to\n transform signals into sparse linear combination of atoms from a fixed.\n\n Notes\n -----\n For an example, see\n :ref:`examples/linear_model/plot_lasso_coordinate_descent_path.py\n `.\n\n To avoid unnecessary memory duplication the X argument of the fit method\n should be directly passed as a Fortran-contiguous numpy array.\n\n Note that in certain cases, the Lars solver may be significantly\n faster to implement this functionality. In particular, linear\n interpolation can be used to retrieve model coefficients between the\n values output by lars_path\n\n Examples\n --------\n\n Comparing lasso_path and lars_path with interpolation:\n\n >>> import numpy as np\n >>> from sklearn.linear_model import lasso_path\n >>> X = np.array([[1, 2, 3.1], [2.3, 5.4, 4.3]]).T\n >>> y = np.array([1, 2, 3.1])\n >>> # Use lasso_path to compute a coefficient path\n >>> _, coef_path, _ = lasso_path(X, y, alphas=[5., 1., .5])\n >>> print(coef_path)\n [[0. 0. 0.46874778]\n [0.2159048 0.4425765 0.23689075]]\n\n >>> # Now use lars_path and 1D linear interpolation to compute the\n >>> # same path\n >>> from sklearn.linear_model import lars_path\n >>> alphas, active, coef_path_lars = lars_path(X, y, method='lasso')\n >>> from scipy import interpolate\n >>> coef_path_continuous = interpolate.interp1d(alphas[::-1],\n ... coef_path_lars[:, ::-1])\n >>> print(coef_path_continuous([5., 1., .5]))\n [[0. 0. 
0.46915237]\n [0.2159048 0.4425765 0.23668876]]\n \"\"\"\n return enet_path(X, y, l1_ratio=1.0, eps=eps, n_alphas=n_alphas, alphas=alphas, precompute=precompute, Xy=Xy, copy_X=copy_X, coef_init=coef_init, verbose=verbose, positive=positive, return_n_iter=return_n_iter, **params)" }, { @@ -99428,7 +106058,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "alpha", @@ -99438,7 +106069,8 @@ "docstring": { "type": "float, default=1", "description": "Constant that multiplies the penalty term and thus determines the\nregularization strength. ``alpha = 0`` is equivalent to unpenalized\nGLMs. In this case, the design matrix `X` must have full column rank\n(no collinearities)." - } + }, + "refined_type": {} }, { "name": "fit_intercept", @@ -99448,7 +106080,8 @@ "docstring": { "type": "bool, default=True", "description": "Specifies if a constant (a.k.a. bias or intercept) should be\nadded to the linear predictor (X @ coef + intercept)." - } + }, + "refined_type": {} }, { "name": "max_iter", @@ -99458,7 +106091,8 @@ "docstring": { "type": "int, default=100", "description": "The maximal number of iterations for the solver." - } + }, + "refined_type": {} }, { "name": "tol", @@ -99468,6 +106102,10 @@ "docstring": { "type": "float, default=1e-4", "description": "Stopping criterion. For the lbfgs solver,\nthe iteration will stop when ``max{|g_j|, j = 1, ..., d} <= tol``\nwhere ``g_j`` is the j-th component of the gradient (derivative) of\nthe objective function." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -99478,7 +106116,8 @@ "docstring": { "type": "bool, default=False", "description": "If set to ``True``, reuse the solution of the previous call to ``fit``\nas initialization for ``coef_`` and ``intercept_`` ." - } + }, + "refined_type": {} }, { "name": "verbose", @@ -99488,13 +106127,14 @@ "docstring": { "type": "int, default=0", "description": "For the lbfgs solver set verbose to any positive number for verbosity." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, *, alpha=1.0, fit_intercept=True, max_iter=100, tol=0.0001, warm_start=False, verbose=0):\n super().__init__(alpha=alpha, fit_intercept=fit_intercept, family='gamma', link='log', max_iter=max_iter, tol=tol, warm_start=warm_start, verbose=verbose)" }, { @@ -99512,7 +106152,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -99536,7 +106177,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "value", @@ -99546,13 +106188,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@family.setter\ndef family(self, value):\n if value != 'gamma':\n raise ValueError(\"GammaRegressor.family must be 'gamma'!\")" }, { @@ -99570,7 +106213,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "alpha", @@ -99580,7 +106224,8 @@ "docstring": { "type": "float, default=1", "description": "Constant that multiplies the penalty term and thus determines the\nregularization strength. ``alpha = 0`` is equivalent to unpenalized\nGLMs. In this case, the design matrix `X` must have full column rank\n(no collinearities)." 
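As a quick sanity check on the mono-output objective quoted in the `lasso_path` docstring above, the following is a minimal NumPy sketch (illustrative only; the toy data matches the doctest and the alpha value is arbitrary) that evaluates (1 / (2 * n_samples)) * ||y - Xw||^2_2 + alpha * ||w||_1 at a coefficient vector returned by `lasso_path`; note the path functions do not fit an intercept, so the objective is evaluated on X and y as given.

import numpy as np
from sklearn.linear_model import lasso_path

# Tiny toy problem, same shape conventions as the docstring example above.
X = np.array([[1, 2, 3.1], [2.3, 5.4, 4.3]]).T
y = np.array([1, 2, 3.1])

alphas, coefs, _ = lasso_path(X, y, alphas=[0.5])
w = coefs[:, 0]  # coefficients computed for alpha = 0.5

n_samples = X.shape[0]
residual = y - X @ w
objective = residual @ residual / (2 * n_samples) + 0.5 * np.abs(w).sum()
print(objective)  # value of the mono-output Lasso objective at w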
- } + }, + "refined_type": {} }, { "name": "fit_intercept", @@ -99590,7 +106235,8 @@ "docstring": { "type": "bool, default=True", "description": "Specifies if a constant (a.k.a. bias or intercept) should be\nadded to the linear predictor (X @ coef + intercept)." - } + }, + "refined_type": {} }, { "name": "family", @@ -99600,6 +106246,15 @@ "docstring": { "type": "{'normal', 'poisson', 'gamma', 'inverse-gaussian'} or an ExponentialDispersionModel instance, default='normal'", "description": "The distributional assumption of the GLM, i.e. which distribution from\nthe EDM, specifies the loss function to be minimized." + }, + "refined_type": { + "kind": "EnumType", + "values": [ + "inverse-gaussian", + "poisson", + "gamma", + "normal" + ] } }, { @@ -99610,6 +106265,10 @@ "docstring": { "type": "{'auto', 'identity', 'log'} or an instance of class BaseLink, default='auto'", "description": "The link function of the GLM, i.e. mapping from linear predictor\n`X @ coeff + intercept` to prediction `y_pred`. Option 'auto' sets\nthe link depending on the chosen family as follows:\n\n- 'identity' for Normal distribution\n- 'log' for Poisson, Gamma and Inverse Gaussian distributions" + }, + "refined_type": { + "kind": "EnumType", + "values": ["auto", "identity", "log"] } }, { @@ -99620,7 +106279,8 @@ "docstring": { "type": "'lbfgs', default='lbfgs'", "description": "Algorithm to use in the optimization problem:\n\n'lbfgs'\n Calls scipy's L-BFGS-B optimizer." - } + }, + "refined_type": {} }, { "name": "max_iter", @@ -99630,7 +106290,8 @@ "docstring": { "type": "int, default=100", "description": "The maximal number of iterations for the solver." - } + }, + "refined_type": {} }, { "name": "tol", @@ -99640,6 +106301,10 @@ "docstring": { "type": "float, default=1e-4", "description": "Stopping criterion. For the lbfgs solver,\nthe iteration will stop when ``max{|g_j|, j = 1, ..., d} <= tol``\nwhere ``g_j`` is the j-th component of the gradient (derivative) of\nthe objective function." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -99650,7 +106315,8 @@ "docstring": { "type": "bool, default=False", "description": "If set to ``True``, reuse the solution of the previous call to ``fit``\nas initialization for ``coef_`` and ``intercept_``." - } + }, + "refined_type": {} }, { "name": "verbose", @@ -99660,13 +106326,14 @@ "docstring": { "type": "int, default=0", "description": "For the lbfgs solver set verbose to any positive number for verbosity." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, *, alpha=1.0, fit_intercept=True, family='normal', link='auto', solver='lbfgs', max_iter=100, tol=0.0001, warm_start=False, verbose=0):\n self.alpha = alpha\n self.fit_intercept = fit_intercept\n self.family = family\n self.link = link\n self.solver = solver\n self.max_iter = max_iter\n self.tol = tol\n self.warm_start = warm_start\n self.verbose = verbose" }, { @@ -99684,7 +106351,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -99694,13 +106362,17 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "Samples." 
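The `link='auto'` option described above resolves to a concrete link from the chosen family: 'identity' for the Normal distribution, 'log' for Poisson, Gamma and Inverse Gaussian. A standalone sketch of that mapping (plain Python written from the description, not the estimator's internal code):

def resolve_link(family, link="auto"):
    """Mirror the 'auto' rule quoted above: identity for the normal
    family, log for Poisson, Gamma and Inverse Gaussian."""
    if link != "auto":
        return link
    return "identity" if family == "normal" else "log"

# The mapping spelled out for the four named families
assert resolve_link("normal") == "identity"
assert resolve_link("poisson") == "log"
assert resolve_link("gamma") == "log"
assert resolve_link("inverse-gaussian") == "log"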
+ }, + "refined_type": { + "kind": "EnumType", + "values": [] } } ], "results": [], "is_public": false, "description": "Compute the linear_predictor = `X @ coef_ + intercept_`.", - "docstring": "Compute the linear_predictor = `X @ coef_ + intercept_`.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n Samples.\n\nReturns\n-------\ny_pred : array of shape (n_samples,)\n Returns predicted values of linear predictor.", + "docstring": "Compute the linear_predictor = `X @ coef_ + intercept_`.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Samples.\n\n Returns\n -------\n y_pred : array of shape (n_samples,)\n Returns predicted values of linear predictor.\n ", "source_code": "\ndef _linear_predictor(self, X):\n \"\"\"Compute the linear_predictor = `X @ coef_ + intercept_`.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Samples.\n\n Returns\n -------\n y_pred : array of shape (n_samples,)\n Returns predicted values of linear predictor.\n \"\"\"\n check_is_fitted(self)\n X = self._validate_data(X, accept_sparse=['csr', 'csc', 'coo'], dtype=[np.float64, np.float32], ensure_2d=True, allow_nd=False, reset=False)\n return X @ self.coef_ + self.intercept_" }, { @@ -99718,13 +106390,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _more_tags(self):\n if hasattr(self, '_family_instance'):\n _family_instance = self._family_instance\n elif isinstance(self.family, ExponentialDispersionModel):\n _family_instance = self.family\n elif self.family in EDM_DISTRIBUTIONS:\n _family_instance = EDM_DISTRIBUTIONS[self.family]()\n else:\n raise ValueError\n return {'requires_positive_y': not _family_instance.in_y_range(-1.0)}" }, { @@ -99742,7 +106415,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -99752,6 +106426,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "Training data." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -99762,7 +106440,8 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "Target values." - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -99772,13 +106451,14 @@ "docstring": { "type": "array-like of shape (n_samples,), default=None", "description": "Sample weights." 
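`_linear_predictor` above is just the affine map `X @ coef_ + intercept_`; a prediction is then the inverse link applied to that value. A minimal NumPy sketch of the two-step pipeline for a log link (the coefficients and data below are illustrative, not a fitted estimator's state):

import numpy as np

coef = np.array([0.2, -0.1])
intercept = 0.5
X = np.array([[1.0, 2.0], [3.0, 0.5]])

eta = X @ coef + intercept  # linear predictor, as in _linear_predictor
y_pred = np.exp(eta)        # inverse of the log link gives the predicted mean
print(y_pred)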
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Fit a Generalized Linear Model.", - "docstring": "Fit a Generalized Linear Model.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training data.\n\ny : array-like of shape (n_samples,)\n Target values.\n\nsample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\nReturns\n-------\nself : object\n Fitted model.", + "docstring": "Fit a Generalized Linear Model.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training data.\n\n y : array-like of shape (n_samples,)\n Target values.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n Returns\n -------\n self : object\n Fitted model.\n ", "source_code": "\ndef fit(self, X, y, sample_weight=None):\n \"\"\"Fit a Generalized Linear Model.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training data.\n\n y : array-like of shape (n_samples,)\n Target values.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n Returns\n -------\n self : object\n Fitted model.\n \"\"\"\n if isinstance(self.family, ExponentialDispersionModel):\n self._family_instance = self.family\n elif self.family in EDM_DISTRIBUTIONS:\n self._family_instance = EDM_DISTRIBUTIONS[self.family]()\n else:\n raise ValueError(\"The family must be an instance of class ExponentialDispersionModel or an element of ['normal', 'poisson', 'gamma', 'inverse-gaussian']; got (family={0})\".format(self.family))\n if isinstance(self.link, BaseLink):\n self._link_instance = self.link\n elif self.link == 'auto':\n if isinstance(self._family_instance, TweedieDistribution):\n if self._family_instance.power <= 0:\n self._link_instance = IdentityLink()\n if self._family_instance.power >= 1:\n self._link_instance = LogLink()\n else:\n raise ValueError(\"No default link known for the specified distribution family. Please set link manually, i.e. 
not to 'auto'; got (link='auto', family={})\".format(self.family))\n elif self.link == 'identity':\n self._link_instance = IdentityLink()\n elif self.link == 'log':\n self._link_instance = LogLink()\n else:\n raise ValueError(\"The link must be an instance of class Link or an element of ['auto', 'identity', 'log']; got (link={0})\".format(self.link))\n if not isinstance(self.alpha, numbers.Number) or self.alpha < 0:\n raise ValueError('Penalty term must be a non-negative number; got (alpha={0})'.format(self.alpha))\n if not isinstance(self.fit_intercept, bool):\n raise ValueError('The argument fit_intercept must be bool; got {0}'.format(self.fit_intercept))\n if self.solver not in ['lbfgs']:\n raise ValueError(\"GeneralizedLinearRegressor supports only solvers'lbfgs'; got {0}\".format(self.solver))\n solver = self.solver\n if not isinstance(self.max_iter, numbers.Integral) or self.max_iter <= 0:\n raise ValueError('Maximum number of iteration must be a positive integer; got (max_iter={0!r})'.format(self.max_iter))\n if not isinstance(self.tol, numbers.Number) or self.tol <= 0:\n raise ValueError('Tolerance for stopping criteria must be positive; got (tol={0!r})'.format(self.tol))\n if not isinstance(self.warm_start, bool):\n raise ValueError('The argument warm_start must be bool; got {0}'.format(self.warm_start))\n family = self._family_instance\n link = self._link_instance\n (X, y) = self._validate_data(X, y, accept_sparse=['csc', 'csr'], dtype=[np.float64, np.float32], y_numeric=True, multi_output=False)\n weights = _check_sample_weight(sample_weight, X)\n (_, n_features) = X.shape\n if not np.all(family.in_y_range(y)):\n raise ValueError('Some value(s) of y are out of the valid range for family {0}'.format(family.__class__.__name__))\n weights = weights / weights.sum()\n if self.warm_start and hasattr(self, 'coef_'):\n if self.fit_intercept:\n coef = np.concatenate((np.array([self.intercept_]), self.coef_))\n else:\n coef = self.coef_\n elif self.fit_intercept:\n coef = np.zeros(n_features + 1)\n coef[0] = link(np.average(y, weights=weights))\n else:\n coef = np.zeros(n_features)\n if solver == 'lbfgs':\n \n def func(coef, X, y, weights, alpha, family, link):\n (y_pred, devp) = _y_pred_deviance_derivative(coef, X, y, weights, family, link)\n dev = family.deviance(y, y_pred, weights)\n offset = 1 if self.fit_intercept else 0\n coef_scaled = alpha * coef[offset:]\n obj = 0.5 * dev + 0.5 * (coef[offset:] @ coef_scaled)\n objp = 0.5 * devp\n objp[offset:] += coef_scaled\n return obj, objp\n args = (X, y, weights, self.alpha, family, link)\n opt_res = scipy.optimize.minimize(func, coef, method='L-BFGS-B', jac=True, options={'maxiter': self.max_iter, 'iprint': (self.verbose > 0) - 1, 'gtol': self.tol, 'ftol': 1000.0 * np.finfo(float).eps}, args=args)\n self.n_iter_ = _check_optimize_result('lbfgs', opt_res)\n coef = opt_res.x\n if self.fit_intercept:\n self.intercept_ = coef[0]\n self.coef_ = coef[1:]\n else:\n self.intercept_ = 0.0\n self.coef_ = coef\n return self" }, { @@ -99796,7 +106476,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -99806,13 +106487,17 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "Samples." 
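The lbfgs branch above minimizes half the (weight-normalized) deviance plus an L2 penalty on the non-intercept coefficients. To see the resulting public behaviour end to end, here is a toy fit with `PoissonRegressor`, the public Poisson/log-link specialisation of this class (data and hyperparameters are illustrative):

import numpy as np
from sklearn.linear_model import PoissonRegressor

rng = np.random.RandomState(0)
X = rng.uniform(size=(100, 2))
y = rng.poisson(lam=np.exp(1.0 + X @ np.array([0.5, -0.25])))

reg = PoissonRegressor(alpha=1e-3, max_iter=300).fit(X, y)
print(reg.intercept_, reg.coef_)  # should land near the true values 1.0 and [0.5, -0.25]
print(reg.predict(X[:3]))         # predictions on the mean (count) scale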
+ }, + "refined_type": { + "kind": "EnumType", + "values": [] } } ], "results": [], "is_public": true, "description": "Predict using GLM with feature matrix X.", - "docstring": "Predict using GLM with feature matrix X.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n Samples.\n\nReturns\n-------\ny_pred : array of shape (n_samples,)\n Returns predicted values.", + "docstring": "Predict using GLM with feature matrix X.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Samples.\n\n Returns\n -------\n y_pred : array of shape (n_samples,)\n Returns predicted values.\n ", "source_code": "\ndef predict(self, X):\n \"\"\"Predict using GLM with feature matrix X.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Samples.\n\n Returns\n -------\n y_pred : array of shape (n_samples,)\n Returns predicted values.\n \"\"\"\n eta = self._linear_predictor(X)\n y_pred = self._link_instance.inverse(eta)\n return y_pred" }, { @@ -99830,7 +106515,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -99840,6 +106526,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "Test samples." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -99850,7 +106540,8 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "True values of target." - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -99860,13 +106551,14 @@ "docstring": { "type": "array-like of shape (n_samples,), default=None", "description": "Sample weights." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Compute D^2, the percentage of deviance explained.\n\nD^2 is a generalization of the coefficient of determination R^2. R^2 uses squared error and D^2 deviance. Note that those two are equal for ``family='normal'``. D^2 is defined as :math:`D^2 = 1-\\frac{D(y_{true},y_{pred})}{D_{null}}`, :math:`D_{null}` is the null deviance, i.e. the deviance of a model with intercept alone, which corresponds to :math:`y_{pred} = \\bar{y}`. The mean :math:`\\bar{y}` is averaged by sample_weight. Best possible score is 1.0 and it can be negative (because the model can be arbitrarily worse).", - "docstring": "Compute D^2, the percentage of deviance explained.\n\nD^2 is a generalization of the coefficient of determination R^2.\nR^2 uses squared error and D^2 deviance. Note that those two are equal\nfor ``family='normal'``.\n\nD^2 is defined as\n:math:`D^2 = 1-\\frac{D(y_{true},y_{pred})}{D_{null}}`,\n:math:`D_{null}` is the null deviance, i.e. the deviance of a model\nwith intercept alone, which corresponds to :math:`y_{pred} = \\bar{y}`.\nThe mean :math:`\\bar{y}` is averaged by sample_weight.\nBest possible score is 1.0 and it can be negative (because the model\ncan be arbitrarily worse).\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n Test samples.\n\ny : array-like of shape (n_samples,)\n True values of target.\n\nsample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\nReturns\n-------\nscore : float\n D^2 of self.predict(X) w.r.t. y.", + "description": "Compute D^2, the percentage of deviance explained.\n\nD^2 is a generalization of the coefficient of determination R^2.\nR^2 uses squared error and D^2 deviance. 
Note that those two are equal\nfor ``family='normal'``.\n\nD^2 is defined as\n:math:`D^2 = 1-\\frac{D(y_{true},y_{pred})}{D_{null}}`,\n:math:`D_{null}` is the null deviance, i.e. the deviance of a model\nwith intercept alone, which corresponds to :math:`y_{pred} = \\bar{y}`.\nThe mean :math:`\\bar{y}` is averaged by sample_weight.\nBest possible score is 1.0 and it can be negative (because the model\ncan be arbitrarily worse).", + "docstring": "Compute D^2, the percentage of deviance explained.\n\n D^2 is a generalization of the coefficient of determination R^2.\n R^2 uses squared error and D^2 deviance. Note that those two are equal\n for ``family='normal'``.\n\n D^2 is defined as\n :math:`D^2 = 1-\\frac{D(y_{true},y_{pred})}{D_{null}}`,\n :math:`D_{null}` is the null deviance, i.e. the deviance of a model\n with intercept alone, which corresponds to :math:`y_{pred} = \\bar{y}`.\n The mean :math:`\\bar{y}` is averaged by sample_weight.\n Best possible score is 1.0 and it can be negative (because the model\n can be arbitrarily worse).\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Test samples.\n\n y : array-like of shape (n_samples,)\n True values of target.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n Returns\n -------\n score : float\n D^2 of self.predict(X) w.r.t. y.\n ", "source_code": "\ndef score(self, X, y, sample_weight=None):\n \"\"\"Compute D^2, the percentage of deviance explained.\n\n D^2 is a generalization of the coefficient of determination R^2.\n R^2 uses squared error and D^2 deviance. Note that those two are equal\n for ``family='normal'``.\n\n D^2 is defined as\n :math:`D^2 = 1-\\frac{D(y_{true},y_{pred})}{D_{null}}`,\n :math:`D_{null}` is the null deviance, i.e. the deviance of a model\n with intercept alone, which corresponds to :math:`y_{pred} = \\bar{y}`.\n The mean :math:`\\bar{y}` is averaged by sample_weight.\n Best possible score is 1.0 and it can be negative (because the model\n can be arbitrarily worse).\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Test samples.\n\n y : array-like of shape (n_samples,)\n True values of target.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n Returns\n -------\n score : float\n D^2 of self.predict(X) w.r.t. y.\n \"\"\"\n weights = _check_sample_weight(sample_weight, X)\n y_pred = self.predict(X)\n dev = self._family_instance.deviance(y, y_pred, weights=weights)\n y_mean = np.average(y, weights=weights)\n dev_null = self._family_instance.deviance(y, y_mean, weights=weights)\n return 1 - dev / dev_null" }, { @@ -99884,7 +106576,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "alpha", @@ -99894,7 +106587,8 @@ "docstring": { "type": "float, default=1", "description": "Constant that multiplies the penalty term and thus determines the\nregularization strength. ``alpha = 0`` is equivalent to unpenalized\nGLMs. In this case, the design matrix `X` must have full column rank\n(no collinearities)." - } + }, + "refined_type": {} }, { "name": "fit_intercept", @@ -99904,7 +106598,8 @@ "docstring": { "type": "bool, default=True", "description": "Specifies if a constant (a.k.a. bias or intercept) should be\nadded to the linear predictor (X @ coef + intercept)." 
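The D^2 defined above can be reproduced by hand from the mean deviance metric, assuming uniform sample weights: D^2 = 1 - D(y, y_pred) / D(y, y_mean). A hedged sketch using the same toy Poisson data as the previous example; the hand-computed value and `score` should agree up to floating point:

import numpy as np
from sklearn.linear_model import PoissonRegressor
from sklearn.metrics import mean_poisson_deviance

rng = np.random.RandomState(0)
X = rng.uniform(size=(100, 2))
y = rng.poisson(lam=np.exp(1.0 + X @ np.array([0.5, -0.25])))

reg = PoissonRegressor(alpha=1e-3, max_iter=300).fit(X, y)
y_pred = reg.predict(X)

dev = mean_poisson_deviance(y, y_pred)                                        # D(y_true, y_pred)
dev_null = mean_poisson_deviance(y, np.full_like(y, y.mean(), dtype=float))   # intercept-only model
d2 = 1 - dev / dev_null
print(d2, reg.score(X, y))  # the two numbers should coincide for uniform weights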
- } + }, + "refined_type": {} }, { "name": "max_iter", @@ -99914,7 +106609,8 @@ "docstring": { "type": "int, default=100", "description": "The maximal number of iterations for the solver." - } + }, + "refined_type": {} }, { "name": "tol", @@ -99924,6 +106620,10 @@ "docstring": { "type": "float, default=1e-4", "description": "Stopping criterion. For the lbfgs solver,\nthe iteration will stop when ``max{|g_j|, j = 1, ..., d} <= tol``\nwhere ``g_j`` is the j-th component of the gradient (derivative) of\nthe objective function." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -99934,7 +106634,8 @@ "docstring": { "type": "bool, default=False", "description": "If set to ``True``, reuse the solution of the previous call to ``fit``\nas initialization for ``coef_`` and ``intercept_`` ." - } + }, + "refined_type": {} }, { "name": "verbose", @@ -99944,13 +106645,14 @@ "docstring": { "type": "int, default=0", "description": "For the lbfgs solver set verbose to any positive number for verbosity." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, *, alpha=1.0, fit_intercept=True, max_iter=100, tol=0.0001, warm_start=False, verbose=0):\n super().__init__(alpha=alpha, fit_intercept=fit_intercept, family='poisson', link='log', max_iter=max_iter, tol=tol, warm_start=warm_start, verbose=verbose)" }, { @@ -99968,7 +106670,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -99992,7 +106695,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "value", @@ -100002,13 +106706,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@family.setter\ndef family(self, value):\n if value != 'poisson':\n raise ValueError(\"PoissonRegressor.family must be 'poisson'!\")" }, { @@ -100026,7 +106731,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "power", @@ -100036,7 +106742,8 @@ "docstring": { "type": "float, default=0", "description": "The power determines the underlying target distribution according\nto the following table:\n\n+-------+------------------------+\n| Power | Distribution |\n+=======+========================+\n| 0 | Normal |\n+-------+------------------------+\n| 1 | Poisson |\n+-------+------------------------+\n| (1,2) | Compound Poisson Gamma |\n+-------+------------------------+\n| 2 | Gamma |\n+-------+------------------------+\n| 3 | Inverse Gaussian |\n+-------+------------------------+\n\nFor ``0 < power < 1``, no distribution exists." - } + }, + "refined_type": {} }, { "name": "alpha", @@ -100046,7 +106753,8 @@ "docstring": { "type": "float, default=1", "description": "Constant that multiplies the penalty term and thus determines the\nregularization strength. ``alpha = 0`` is equivalent to unpenalized\nGLMs. In this case, the design matrix `X` must have full column rank\n(no collinearities)." - } + }, + "refined_type": {} }, { "name": "fit_intercept", @@ -100056,7 +106764,8 @@ "docstring": { "type": "bool, default=True", "description": "Specifies if a constant (a.k.a. bias or intercept) should be\nadded to the linear predictor (X @ coef + intercept)." 
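The power table above maps directly onto `TweedieRegressor` instances. A short illustrative sketch of the special cases (other values with power <= 0 or power >= 1 are also valid; no distribution exists for 0 < power < 1):

from sklearn.linear_model import TweedieRegressor

normal_glm   = TweedieRegressor(power=0)    # Normal
poisson_glm  = TweedieRegressor(power=1)    # Poisson
cpg_glm      = TweedieRegressor(power=1.5)  # Compound Poisson Gamma (any 1 < power < 2)
gamma_glm    = TweedieRegressor(power=2)    # Gamma
invgauss_glm = TweedieRegressor(power=3)    # Inverse Gaussian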
- } + }, + "refined_type": {} }, { "name": "link", @@ -100066,6 +106775,10 @@ "docstring": { "type": "{'auto', 'identity', 'log'}, default='auto'", "description": "The link function of the GLM, i.e. mapping from linear predictor\n`X @ coeff + intercept` to prediction `y_pred`. Option 'auto' sets\nthe link depending on the chosen family as follows:\n\n- 'identity' for Normal distribution\n- 'log' for Poisson, Gamma and Inverse Gaussian distributions" + }, + "refined_type": { + "kind": "EnumType", + "values": ["auto", "identity", "log"] } }, { @@ -100076,7 +106789,8 @@ "docstring": { "type": "int, default=100", "description": "The maximal number of iterations for the solver." - } + }, + "refined_type": {} }, { "name": "tol", @@ -100086,6 +106800,10 @@ "docstring": { "type": "float, default=1e-4", "description": "Stopping criterion. For the lbfgs solver,\nthe iteration will stop when ``max{|g_j|, j = 1, ..., d} <= tol``\nwhere ``g_j`` is the j-th component of the gradient (derivative) of\nthe objective function." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -100096,7 +106814,8 @@ "docstring": { "type": "bool, default=False", "description": "If set to ``True``, reuse the solution of the previous call to ``fit``\nas initialization for ``coef_`` and ``intercept_`` ." - } + }, + "refined_type": {} }, { "name": "verbose", @@ -100106,13 +106825,14 @@ "docstring": { "type": "int, default=0", "description": "For the lbfgs solver set verbose to any positive number for verbosity." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, *, power=0.0, alpha=1.0, fit_intercept=True, link='auto', max_iter=100, tol=0.0001, warm_start=False, verbose=0):\n super().__init__(alpha=alpha, fit_intercept=fit_intercept, family=TweedieDistribution(power=power), link=link, max_iter=max_iter, tol=tol, warm_start=warm_start, verbose=verbose)" }, { @@ -100130,7 +106850,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -100154,7 +106875,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "value", @@ -100164,13 +106886,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@family.setter\ndef family(self, value):\n if isinstance(value, TweedieDistribution):\n self.power = value.power\n else:\n raise TypeError('TweedieRegressor.family must be of type TweedieDistribution!')" }, { @@ -100188,7 +106911,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "coef", @@ -100198,7 +106922,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -100222,7 +106947,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -100232,7 +106958,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -100242,7 +106969,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "weights", @@ -100252,7 +106980,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "family", @@ -100262,7 +106991,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "link", @@ -100272,7 +107002,8 @@ "docstring": { 
"type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -100296,7 +107027,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y_pred", @@ -100306,13 +107038,14 @@ "docstring": { "type": "array of shape (n_samples,)", "description": "Usually the (predicted) mean." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Compute the link function g(y_pred).\n\nThe link function links the mean y_pred=E[Y] to the so called linear predictor (X*w), i.e. g(y_pred) = linear predictor.", - "docstring": "Compute the link function g(y_pred).\n\nThe link function links the mean y_pred=E[Y] to the so called linear\npredictor (X*w), i.e. g(y_pred) = linear predictor.\n\nParameters\n----------\ny_pred : array of shape (n_samples,)\n Usually the (predicted) mean.", + "description": "Compute the link function g(y_pred).\n\nThe link function links the mean y_pred=E[Y] to the so called linear\npredictor (X*w), i.e. g(y_pred) = linear predictor.", + "docstring": "Compute the link function g(y_pred).\n\n The link function links the mean y_pred=E[Y] to the so called linear\n predictor (X*w), i.e. g(y_pred) = linear predictor.\n\n Parameters\n ----------\n y_pred : array of shape (n_samples,)\n Usually the (predicted) mean.\n ", "source_code": "\n@abstractmethod\ndef __call__(self, y_pred):\n \"\"\"Compute the link function g(y_pred).\n\n The link function links the mean y_pred=E[Y] to the so called linear\n predictor (X*w), i.e. g(y_pred) = linear predictor.\n\n Parameters\n ----------\n y_pred : array of shape (n_samples,)\n Usually the (predicted) mean.\n \"\"\"\n " }, { @@ -100330,7 +107063,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y_pred", @@ -100340,13 +107074,14 @@ "docstring": { "type": "array of shape (n_samples,)", "description": "Usually the (predicted) mean." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Compute the derivative of the link g'(y_pred).", - "docstring": "Compute the derivative of the link g'(y_pred).\n\nParameters\n----------\ny_pred : array of shape (n_samples,)\n Usually the (predicted) mean.", + "docstring": "Compute the derivative of the link g'(y_pred).\n\n Parameters\n ----------\n y_pred : array of shape (n_samples,)\n Usually the (predicted) mean.\n ", "source_code": "\n@abstractmethod\ndef derivative(self, y_pred):\n \"\"\"Compute the derivative of the link g'(y_pred).\n\n Parameters\n ----------\n y_pred : array of shape (n_samples,)\n Usually the (predicted) mean.\n \"\"\"\n " }, { @@ -100364,7 +107099,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "lin_pred", @@ -100374,13 +107110,14 @@ "docstring": { "type": "array of shape (n_samples,)", "description": "Usually the (fitted) linear predictor." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Compute the inverse link function h(lin_pred).\n\nGives the inverse relationship between linear predictor and the mean y_pred=E[Y], i.e. h(linear predictor) = y_pred.", - "docstring": "Compute the inverse link function h(lin_pred).\n\nGives the inverse relationship between linear predictor and the mean\ny_pred=E[Y], i.e. 
h(linear predictor) = y_pred.\n\nParameters\n----------\nlin_pred : array of shape (n_samples,)\n Usually the (fitted) linear predictor.", + "description": "Compute the inverse link function h(lin_pred).\n\nGives the inverse relationship between linear predictor and the mean\ny_pred=E[Y], i.e. h(linear predictor) = y_pred.", + "docstring": "Compute the inverse link function h(lin_pred).\n\n Gives the inverse relationship between linear predictor and the mean\n y_pred=E[Y], i.e. h(linear predictor) = y_pred.\n\n Parameters\n ----------\n lin_pred : array of shape (n_samples,)\n Usually the (fitted) linear predictor.\n ", "source_code": "\n@abstractmethod\ndef inverse(self, lin_pred):\n \"\"\"Compute the inverse link function h(lin_pred).\n\n Gives the inverse relationship between linear predictor and the mean\n y_pred=E[Y], i.e. h(linear predictor) = y_pred.\n\n Parameters\n ----------\n lin_pred : array of shape (n_samples,)\n Usually the (fitted) linear predictor.\n \"\"\"\n " }, { @@ -100398,7 +107135,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "lin_pred", @@ -100408,13 +107146,14 @@ "docstring": { "type": "array of shape (n_samples,)", "description": "Usually the (fitted) linear predictor." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Compute the derivative of the inverse link function h'(lin_pred).", - "docstring": "Compute the derivative of the inverse link function h'(lin_pred).\n\nParameters\n----------\nlin_pred : array of shape (n_samples,)\n Usually the (fitted) linear predictor.", + "docstring": "Compute the derivative of the inverse link function h'(lin_pred).\n\n Parameters\n ----------\n lin_pred : array of shape (n_samples,)\n Usually the (fitted) linear predictor.\n ", "source_code": "\n@abstractmethod\ndef inverse_derivative(self, lin_pred):\n \"\"\"Compute the derivative of the inverse link function h'(lin_pred).\n\n Parameters\n ----------\n lin_pred : array of shape (n_samples,)\n Usually the (fitted) linear predictor.\n \"\"\"\n " }, { @@ -100432,7 +107171,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y_pred", @@ -100442,13 +107182,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __call__(self, y_pred):\n return y_pred" }, { @@ -100466,7 +107207,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y_pred", @@ -100476,13 +107218,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef derivative(self, y_pred):\n return np.ones_like(y_pred)" }, { @@ -100500,7 +107243,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "lin_pred", @@ -100510,13 +107254,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef inverse(self, lin_pred):\n return lin_pred" }, { @@ -100534,7 +107279,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "lin_pred", @@ -100544,13 +107290,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, 
"description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef inverse_derivative(self, lin_pred):\n return np.ones_like(lin_pred)" }, { @@ -100568,7 +107315,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y_pred", @@ -100578,13 +107326,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __call__(self, y_pred):\n return np.log(y_pred)" }, { @@ -100602,7 +107351,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y_pred", @@ -100612,13 +107362,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef derivative(self, y_pred):\n return 1 / y_pred" }, { @@ -100636,7 +107387,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "lin_pred", @@ -100646,13 +107398,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef inverse(self, lin_pred):\n return np.exp(lin_pred)" }, { @@ -100670,7 +107423,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "lin_pred", @@ -100680,13 +107434,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef inverse_derivative(self, lin_pred):\n return np.exp(lin_pred)" }, { @@ -100704,7 +107459,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y_pred", @@ -100714,13 +107470,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __call__(self, y_pred):\n return logit(y_pred)" }, { @@ -100738,7 +107495,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y_pred", @@ -100748,13 +107506,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef derivative(self, y_pred):\n return 1 / (y_pred * (1 - y_pred))" }, { @@ -100772,7 +107531,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "lin_pred", @@ -100782,13 +107542,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef inverse(self, lin_pred):\n return expit(lin_pred)" }, { @@ -100806,7 +107567,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "lin_pred", @@ -100816,13 +107578,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef inverse_derivative(self, lin_pred):\n ep = expit(lin_pred)\n return ep * (1 - ep)" }, { @@ -100840,7 +107603,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { 
"name": "epsilon", @@ -100850,7 +107614,8 @@ "docstring": { "type": "float, greater than 1.0, default=1.35", "description": "The parameter epsilon controls the number of samples that should be\nclassified as outliers. The smaller the epsilon, the more robust it is\nto outliers." - } + }, + "refined_type": {} }, { "name": "max_iter", @@ -100860,7 +107625,8 @@ "docstring": { "type": "int, default=100", "description": "Maximum number of iterations that\n``scipy.optimize.minimize(method=\"L-BFGS-B\")`` should run for." - } + }, + "refined_type": {} }, { "name": "alpha", @@ -100870,7 +107636,8 @@ "docstring": { "type": "float, default=0.0001", "description": "Regularization parameter." - } + }, + "refined_type": {} }, { "name": "warm_start", @@ -100880,7 +107647,8 @@ "docstring": { "type": "bool, default=False", "description": "This is useful if the stored attributes of a previously used model\nhas to be reused. If set to False, then the coefficients will\nbe rewritten for every call to fit.\nSee :term:`the Glossary `." - } + }, + "refined_type": {} }, { "name": "fit_intercept", @@ -100890,7 +107658,8 @@ "docstring": { "type": "bool, default=True", "description": "Whether or not to fit the intercept. This can be set to False\nif the data is already centered around the origin." - } + }, + "refined_type": {} }, { "name": "tol", @@ -100900,13 +107669,17 @@ "docstring": { "type": "float, default=1e-05", "description": "The iteration will stop when\n``max{|proj g_i | i = 1, ..., n}`` <= ``tol``\nwhere pg_i is the i-th component of the projected gradient." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, *, epsilon=1.35, max_iter=100, alpha=0.0001, warm_start=False, fit_intercept=True, tol=1e-05):\n self.epsilon = epsilon\n self.max_iter = max_iter\n self.alpha = alpha\n self.warm_start = warm_start\n self.fit_intercept = fit_intercept\n self.tol = tol" }, { @@ -100924,7 +107697,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -100934,7 +107708,8 @@ "docstring": { "type": "array-like, shape (n_samples, n_features)", "description": "Training vector, where `n_samples` is the number of samples and\n`n_features` is the number of features." - } + }, + "refined_type": {} }, { "name": "y", @@ -100944,7 +107719,8 @@ "docstring": { "type": "array-like, shape (n_samples,)", "description": "Target vector relative to X." - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -100954,13 +107730,14 @@ "docstring": { "type": "array-like, shape (n_samples,)", "description": "Weight given to each sample." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Fit the model according to the given training data.", - "docstring": "Fit the model according to the given training data.\n\nParameters\n----------\nX : array-like, shape (n_samples, n_features)\n Training vector, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\ny : array-like, shape (n_samples,)\n Target vector relative to X.\n\nsample_weight : array-like, shape (n_samples,)\n Weight given to each sample.\n\nReturns\n-------\nself : object\n Fitted `HuberRegressor` estimator.", + "docstring": "Fit the model according to the given training data.\n\n Parameters\n ----------\n X : array-like, shape (n_samples, n_features)\n Training vector, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\n y : array-like, shape (n_samples,)\n Target vector relative to X.\n\n sample_weight : array-like, shape (n_samples,)\n Weight given to each sample.\n\n Returns\n -------\n self : object\n Fitted `HuberRegressor` estimator.\n ", "source_code": "\ndef fit(self, X, y, sample_weight=None):\n \"\"\"Fit the model according to the given training data.\n\n Parameters\n ----------\n X : array-like, shape (n_samples, n_features)\n Training vector, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\n y : array-like, shape (n_samples,)\n Target vector relative to X.\n\n sample_weight : array-like, shape (n_samples,)\n Weight given to each sample.\n\n Returns\n -------\n self : object\n Fitted `HuberRegressor` estimator.\n \"\"\"\n (X, y) = self._validate_data(X, y, copy=False, accept_sparse=['csr'], y_numeric=True, dtype=[np.float64, np.float32])\n sample_weight = _check_sample_weight(sample_weight, X)\n if self.epsilon < 1.0:\n raise ValueError('epsilon should be greater than or equal to 1.0, got %f' % self.epsilon)\n if self.warm_start and hasattr(self, 'coef_'):\n parameters = np.concatenate((self.coef_, [self.intercept_, self.scale_]))\n else:\n if self.fit_intercept:\n parameters = np.zeros(X.shape[1] + 2)\n else:\n parameters = np.zeros(X.shape[1] + 1)\n parameters[-1] = 1\n bounds = np.tile([-np.inf, np.inf], (parameters.shape[0], 1))\n bounds[-1][0] = np.finfo(np.float64).eps * 10\n opt_res = optimize.minimize(_huber_loss_and_gradient, parameters, method='L-BFGS-B', jac=True, args=(X, y, self.epsilon, self.alpha, sample_weight), options={'maxiter': self.max_iter, 'gtol': self.tol, 'iprint': -1}, bounds=bounds)\n parameters = opt_res.x\n if opt_res.status == 2:\n raise ValueError('HuberRegressor convergence failed: l-BFGS-b solver terminated with %s' % opt_res.message)\n self.n_iter_ = _check_optimize_result('lbfgs', opt_res, self.max_iter)\n self.scale_ = parameters[-1]\n if self.fit_intercept:\n self.intercept_ = parameters[-2]\n else:\n self.intercept_ = 0.0\n self.coef_ = parameters[:X.shape[1]]\n residual = np.abs(y - safe_sparse_dot(X, self.coef_) - self.intercept_)\n self.outliers_ = residual > self.scale_ * self.epsilon\n return self" }, { @@ -100978,7 +107755,8 @@ "docstring": { "type": "ndarray, shape (n_features + 1,) or (n_features + 2,)", "description": "Feature vector.\nw[:n_features] gives the coefficients\nw[-1] gives the scale factor and if the intercept is fit w[-2]\ngives the intercept factor." - } + }, + "refined_type": {} }, { "name": "X", @@ -100988,7 +107766,8 @@ "docstring": { "type": "ndarray of shape (n_samples, n_features)", "description": "Input data." 
- } + }, + "refined_type": {} }, { "name": "y", @@ -100998,7 +107777,8 @@ "docstring": { "type": "ndarray of shape (n_samples,)", "description": "Target vector." - } + }, + "refined_type": {} }, { "name": "epsilon", @@ -101008,7 +107788,8 @@ "docstring": { "type": "float", "description": "Robustness of the Huber estimator." - } + }, + "refined_type": {} }, { "name": "alpha", @@ -101018,7 +107799,8 @@ "docstring": { "type": "float", "description": "Regularization parameter." - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -101028,13 +107810,14 @@ "docstring": { "type": "ndarray of shape (n_samples,), default=None", "description": "Weight assigned to each sample." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Returns the Huber loss and the gradient.", - "docstring": "Returns the Huber loss and the gradient.\n\nParameters\n----------\nw : ndarray, shape (n_features + 1,) or (n_features + 2,)\n Feature vector.\n w[:n_features] gives the coefficients\n w[-1] gives the scale factor and if the intercept is fit w[-2]\n gives the intercept factor.\n\nX : ndarray of shape (n_samples, n_features)\n Input data.\n\ny : ndarray of shape (n_samples,)\n Target vector.\n\nepsilon : float\n Robustness of the Huber estimator.\n\nalpha : float\n Regularization parameter.\n\nsample_weight : ndarray of shape (n_samples,), default=None\n Weight assigned to each sample.\n\nReturns\n-------\nloss : float\n Huber loss.\n\ngradient : ndarray, shape (len(w))\n Returns the derivative of the Huber loss with respect to each\n coefficient, intercept and the scale as a vector.", + "docstring": "Returns the Huber loss and the gradient.\n\n Parameters\n ----------\n w : ndarray, shape (n_features + 1,) or (n_features + 2,)\n Feature vector.\n w[:n_features] gives the coefficients\n w[-1] gives the scale factor and if the intercept is fit w[-2]\n gives the intercept factor.\n\n X : ndarray of shape (n_samples, n_features)\n Input data.\n\n y : ndarray of shape (n_samples,)\n Target vector.\n\n epsilon : float\n Robustness of the Huber estimator.\n\n alpha : float\n Regularization parameter.\n\n sample_weight : ndarray of shape (n_samples,), default=None\n Weight assigned to each sample.\n\n Returns\n -------\n loss : float\n Huber loss.\n\n gradient : ndarray, shape (len(w))\n Returns the derivative of the Huber loss with respect to each\n coefficient, intercept and the scale as a vector.\n ", "source_code": "\ndef _huber_loss_and_gradient(w, X, y, epsilon, alpha, sample_weight=None):\n \"\"\"Returns the Huber loss and the gradient.\n\n Parameters\n ----------\n w : ndarray, shape (n_features + 1,) or (n_features + 2,)\n Feature vector.\n w[:n_features] gives the coefficients\n w[-1] gives the scale factor and if the intercept is fit w[-2]\n gives the intercept factor.\n\n X : ndarray of shape (n_samples, n_features)\n Input data.\n\n y : ndarray of shape (n_samples,)\n Target vector.\n\n epsilon : float\n Robustness of the Huber estimator.\n\n alpha : float\n Regularization parameter.\n\n sample_weight : ndarray of shape (n_samples,), default=None\n Weight assigned to each sample.\n\n Returns\n -------\n loss : float\n Huber loss.\n\n gradient : ndarray, shape (len(w))\n Returns the derivative of the Huber loss with respect to each\n coefficient, intercept and the scale as a vector.\n \"\"\"\n (_, n_features) = X.shape\n fit_intercept = n_features + 2 == w.shape[0]\n if fit_intercept:\n intercept = w[-2]\n sigma = w[-1]\n w = w[:n_features]\n n_samples = 
np.sum(sample_weight)\n linear_loss = y - safe_sparse_dot(X, w)\n if fit_intercept:\n linear_loss -= intercept\n abs_linear_loss = np.abs(linear_loss)\n outliers_mask = abs_linear_loss > epsilon * sigma\n outliers = abs_linear_loss[outliers_mask]\n num_outliers = np.count_nonzero(outliers_mask)\n n_non_outliers = X.shape[0] - num_outliers\n outliers_sw = sample_weight[outliers_mask]\n n_sw_outliers = np.sum(outliers_sw)\n outlier_loss = 2.0 * epsilon * np.sum(outliers_sw * outliers) - sigma * n_sw_outliers * epsilon**2\n non_outliers = linear_loss[~outliers_mask]\n weighted_non_outliers = sample_weight[~outliers_mask] * non_outliers\n weighted_loss = np.dot(weighted_non_outliers.T, non_outliers)\n squared_loss = weighted_loss / sigma\n if fit_intercept:\n grad = np.zeros(n_features + 2)\n else:\n grad = np.zeros(n_features + 1)\n X_non_outliers = -axis0_safe_slice(X, ~outliers_mask, n_non_outliers)\n grad[:n_features] = 2.0 / sigma * safe_sparse_dot(weighted_non_outliers, X_non_outliers)\n signed_outliers = np.ones_like(outliers)\n signed_outliers_mask = linear_loss[outliers_mask] < 0\n signed_outliers[signed_outliers_mask] = -1.0\n X_outliers = axis0_safe_slice(X, outliers_mask, num_outliers)\n sw_outliers = sample_weight[outliers_mask] * signed_outliers\n grad[:n_features] -= 2.0 * epsilon * safe_sparse_dot(sw_outliers, X_outliers)\n grad[:n_features] += alpha * 2.0 * w\n grad[-1] = n_samples\n grad[-1] -= n_sw_outliers * epsilon**2\n grad[-1] -= squared_loss / sigma\n if fit_intercept:\n grad[-2] = -2.0 * np.sum(weighted_non_outliers) / sigma\n grad[-2] -= 2.0 * epsilon * np.sum(sw_outliers)\n loss = n_samples * sigma + squared_loss + outlier_loss\n loss += alpha * np.dot(w, w)\n return loss, grad" }, { @@ -101052,7 +107835,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "fit_intercept", @@ -101062,7 +107846,8 @@ "docstring": { "type": "bool, default=True", "description": "Whether to calculate the intercept for this model. If set\nto false, no intercept will be used in calculations\n(i.e. data is expected to be centered)." - } + }, + "refined_type": {} }, { "name": "verbose", @@ -101072,7 +107857,8 @@ "docstring": { "type": "bool or int, default=False", "description": "Sets the verbosity amount." - } + }, + "refined_type": {} }, { "name": "normalize", @@ -101082,7 +107868,8 @@ "docstring": { "type": "bool, default=True", "description": "This parameter is ignored when ``fit_intercept`` is set to False.\nIf True, the regressors X will be normalized before regression by\nsubtracting the mean and dividing by the l2-norm.\nIf you wish to standardize, please use\n:class:`~sklearn.preprocessing.StandardScaler` before calling ``fit``\non an estimator with ``normalize=False``.\n\n.. deprecated:: 1.0\n ``normalize`` was deprecated in version 1.0. It will default\n to False in 1.2 and be removed in 1.4." - } + }, + "refined_type": {} }, { "name": "precompute", @@ -101092,7 +107879,8 @@ "docstring": { "type": "bool, 'auto' or array-like , default='auto'", "description": "Whether to use a precomputed Gram matrix to speed up\ncalculations. If set to ``'auto'`` let us decide. The Gram\nmatrix can also be passed as argument." - } + }, + "refined_type": {} }, { "name": "n_nonzero_coefs", @@ -101102,7 +107890,8 @@ "docstring": { "type": "int, default=500", "description": "Target number of non-zero coefficients. Use ``np.inf`` for no limit." 
- } + }, + "refined_type": {} }, { "name": "eps", @@ -101112,7 +107901,8 @@ "docstring": { "type": "float, default=np.finfo(float).eps", "description": "The machine-precision regularization in the computation of the\nCholesky diagonal factors. Increase this for very ill-conditioned\nsystems. Unlike the ``tol`` parameter in some iterative\noptimization-based algorithms, this parameter does not control\nthe tolerance of the optimization." - } + }, + "refined_type": {} }, { "name": "copy_X", @@ -101122,7 +107912,8 @@ "docstring": { "type": "bool, default=True", "description": "If ``True``, X will be copied; else, it may be overwritten." - } + }, + "refined_type": {} }, { "name": "fit_path", @@ -101132,7 +107923,8 @@ "docstring": { "type": "bool, default=True", "description": "If True the full path is stored in the ``coef_path_`` attribute.\nIf you compute the solution for a large problem or many targets,\nsetting ``fit_path`` to ``False`` will lead to a speedup, especially\nwith a small alpha." - } + }, + "refined_type": {} }, { "name": "jitter", @@ -101142,7 +107934,8 @@ "docstring": { "type": "float, default=None", "description": "Upper bound on a uniform noise parameter to be added to the\n`y` values, to satisfy the model's assumption of\none-at-a-time computations. Might help with stability.\n\n.. versionadded:: 0.23" - } + }, + "refined_type": {} }, { "name": "random_state", @@ -101152,13 +107945,14 @@ "docstring": { "type": "int, RandomState instance or None, default=None", "description": "Determines random number generation for jittering. Pass an int\nfor reproducible output across multiple function calls.\nSee :term:`Glossary `. Ignored if `jitter` is None.\n\n.. versionadded:: 0.23" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, *, fit_intercept=True, verbose=False, normalize='deprecated', precompute='auto', n_nonzero_coefs=500, eps=np.finfo(float).eps, copy_X=True, fit_path=True, jitter=None, random_state=None):\n self.fit_intercept = fit_intercept\n self.verbose = verbose\n self.normalize = normalize\n self.precompute = precompute\n self.n_nonzero_coefs = n_nonzero_coefs\n self.eps = eps\n self.copy_X = copy_X\n self.fit_path = fit_path\n self.jitter = jitter\n self.random_state = random_state" }, { @@ -101176,7 +107970,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -101186,7 +107981,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -101196,7 +107992,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "max_iter", @@ -101206,7 +108003,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "alpha", @@ -101216,7 +108014,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "fit_path", @@ -101226,7 +108025,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "normalize", @@ -101236,7 +108036,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "Xy", @@ -101246,7 +108047,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -101270,7 +108072,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -101280,7 +108083,8 @@ "docstring": { "type": "", "description": "" - } + }, + 
"refined_type": {} }, { "name": "y", @@ -101290,13 +108094,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@staticmethod\ndef _get_gram(precompute, X, y):\n if not hasattr(precompute, '__array__') and (precompute is True or precompute == 'auto' and X.shape[0] > X.shape[1] or precompute == 'auto' and y.shape[1] > 1):\n precompute = np.dot(X.T, X)\n return precompute" }, { @@ -101314,7 +108119,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -101324,7 +108130,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "Training data." - } + }, + "refined_type": {} }, { "name": "y", @@ -101334,7 +108141,8 @@ "docstring": { "type": "array-like of shape (n_samples,) or (n_samples, n_targets)", "description": "Target values." - } + }, + "refined_type": {} }, { "name": "Xy", @@ -101344,13 +108152,14 @@ "docstring": { "type": "array-like of shape (n_samples,) or (n_samples, n_targets), default=None", "description": "Xy = np.dot(X.T, y) that can be precomputed. It is useful\nonly when the Gram matrix is precomputed." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Fit the model using X, y as training data.", - "docstring": "Fit the model using X, y as training data.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n Training data.\n\ny : array-like of shape (n_samples,) or (n_samples, n_targets)\n Target values.\n\nXy : array-like of shape (n_samples,) or (n_samples, n_targets), default=None\n Xy = np.dot(X.T, y) that can be precomputed. It is useful\n only when the Gram matrix is precomputed.\n\nReturns\n-------\nself : object\n Returns an instance of self.", + "docstring": "Fit the model using X, y as training data.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training data.\n\n y : array-like of shape (n_samples,) or (n_samples, n_targets)\n Target values.\n\n Xy : array-like of shape (n_samples,) or (n_samples, n_targets), default=None\n Xy = np.dot(X.T, y) that can be precomputed. It is useful\n only when the Gram matrix is precomputed.\n\n Returns\n -------\n self : object\n Returns an instance of self.\n ", "source_code": "\ndef fit(self, X, y, Xy=None):\n \"\"\"Fit the model using X, y as training data.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training data.\n\n y : array-like of shape (n_samples,) or (n_samples, n_targets)\n Target values.\n\n Xy : array-like of shape (n_samples,) or (n_samples, n_targets), default=None\n Xy = np.dot(X.T, y) that can be precomputed. 
It is useful\n only when the Gram matrix is precomputed.\n\n Returns\n -------\n self : object\n Returns an instance of self.\n \"\"\"\n (X, y) = self._validate_data(X, y, y_numeric=True, multi_output=True)\n _normalize = _deprecate_normalize(self.normalize, default=True, estimator_name=self.__class__.__name__)\n alpha = getattr(self, 'alpha', 0.0)\n if hasattr(self, 'n_nonzero_coefs'):\n alpha = 0.0\n max_iter = self.n_nonzero_coefs\n else:\n max_iter = self.max_iter\n if self.jitter is not None:\n rng = check_random_state(self.random_state)\n noise = rng.uniform(high=self.jitter, size=len(y))\n y = y + noise\n self._fit(X, y, max_iter=max_iter, alpha=alpha, fit_path=self.fit_path, normalize=_normalize, Xy=Xy)\n return self" }, { @@ -101368,7 +108177,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "fit_intercept", @@ -101378,7 +108188,8 @@ "docstring": { "type": "bool, default=True", "description": "Whether to calculate the intercept for this model. If set\nto false, no intercept will be used in calculations\n(i.e. data is expected to be centered)." - } + }, + "refined_type": {} }, { "name": "verbose", @@ -101388,7 +108199,8 @@ "docstring": { "type": "bool or int, default=False", "description": "Sets the verbosity amount." - } + }, + "refined_type": {} }, { "name": "max_iter", @@ -101398,7 +108210,8 @@ "docstring": { "type": "int, default=500", "description": "Maximum number of iterations to perform." - } + }, + "refined_type": {} }, { "name": "normalize", @@ -101408,7 +108221,8 @@ "docstring": { "type": "bool, default=True", "description": "This parameter is ignored when ``fit_intercept`` is set to False.\nIf True, the regressors X will be normalized before regression by\nsubtracting the mean and dividing by the l2-norm.\nIf you wish to standardize, please use\n:class:`~sklearn.preprocessing.StandardScaler` before calling ``fit``\non an estimator with ``normalize=False``.\n\n.. deprecated:: 1.0\n ``normalize`` was deprecated in version 1.0. It will default\n to False in 1.2 and be removed in 1.4." - } + }, + "refined_type": {} }, { "name": "precompute", @@ -101418,7 +108232,8 @@ "docstring": { "type": "bool, 'auto' or array-like , default='auto'", "description": "Whether to use a precomputed Gram matrix to speed up\ncalculations. If set to ``'auto'`` let us decide. The Gram matrix\ncannot be passed as argument since we will use only subsets of X." - } + }, + "refined_type": {} }, { "name": "cv", @@ -101428,7 +108243,8 @@ "docstring": { "type": "int, cross-validation generator or an iterable, default=None", "description": "Determines the cross-validation splitting strategy.\nPossible inputs for cv are:\n\n- None, to use the default 5-fold cross-validation,\n- integer, to specify the number of folds.\n- :term:`CV splitter`,\n- An iterable yielding (train, test) splits as arrays of indices.\n\nFor integer/None inputs, :class:`KFold` is used.\n\nRefer :ref:`User Guide ` for the various\ncross-validation strategies that can be used here.\n\n.. versionchanged:: 0.22\n ``cv`` default value if None changed from 3-fold to 5-fold." - } + }, + "refined_type": {} }, { "name": "max_n_alphas", @@ -101438,7 +108254,8 @@ "docstring": { "type": "int, default=1000", "description": "The maximum number of points on the path used to compute the\nresiduals in the cross-validation." 
- } + }, + "refined_type": {} }, { "name": "n_jobs", @@ -101448,7 +108265,8 @@ "docstring": { "type": "int or None, default=None", "description": "Number of CPUs to use during the cross validation.\n``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n``-1`` means using all processors. See :term:`Glossary `\nfor more details." - } + }, + "refined_type": {} }, { "name": "eps", @@ -101458,7 +108276,8 @@ "docstring": { "type": "float, default=np.finfo(float).eps", "description": "The machine-precision regularization in the computation of the\nCholesky diagonal factors. Increase this for very ill-conditioned\nsystems. Unlike the ``tol`` parameter in some iterative\noptimization-based algorithms, this parameter does not control\nthe tolerance of the optimization." - } + }, + "refined_type": {} }, { "name": "copy_X", @@ -101468,13 +108287,14 @@ "docstring": { "type": "bool, default=True", "description": "If ``True``, X will be copied; else, it may be overwritten." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, *, fit_intercept=True, verbose=False, max_iter=500, normalize='deprecated', precompute='auto', cv=None, max_n_alphas=1000, n_jobs=None, eps=np.finfo(float).eps, copy_X=True):\n self.max_iter = max_iter\n self.cv = cv\n self.max_n_alphas = max_n_alphas\n self.n_jobs = n_jobs\n super().__init__(fit_intercept=fit_intercept, verbose=verbose, normalize=normalize, precompute=precompute, n_nonzero_coefs=500, eps=eps, copy_X=copy_X, fit_path=True)" }, { @@ -101492,13 +108312,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _more_tags(self):\n return {'multioutput': False}" }, { @@ -101516,7 +108337,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -101526,7 +108348,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "Training data." - } + }, + "refined_type": {} }, { "name": "y", @@ -101536,13 +108359,14 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "Target values." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Fit the model using X, y as training data.", - "docstring": "Fit the model using X, y as training data.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n Training data.\n\ny : array-like of shape (n_samples,)\n Target values.\n\nReturns\n-------\nself : object\n Returns an instance of self.", + "docstring": "Fit the model using X, y as training data.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training data.\n\n y : array-like of shape (n_samples,)\n Target values.\n\n Returns\n -------\n self : object\n Returns an instance of self.\n ", "source_code": "\ndef fit(self, X, y):\n \"\"\"Fit the model using X, y as training data.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training data.\n\n y : array-like of shape (n_samples,)\n Target values.\n\n Returns\n -------\n self : object\n Returns an instance of self.\n \"\"\"\n _normalize = _deprecate_normalize(self.normalize, default=True, estimator_name=self.__class__.__name__)\n (X, y) = self._validate_data(X, y, y_numeric=True)\n X = as_float_array(X, copy=self.copy_X)\n y = as_float_array(y, copy=self.copy_X)\n cv = check_cv(self.cv, classifier=False)\n Gram = self.precompute\n if hasattr(Gram, '__array__'):\n warnings.warn('Parameter \"precompute\" cannot be an array in %s. Automatically switch to \"auto\" instead.' % self.__class__.__name__)\n Gram = 'auto'\n cv_paths = Parallel(n_jobs=self.n_jobs, verbose=self.verbose)((delayed(_lars_path_residues)(X[train], y[train], X[test], y[test], Gram=Gram, copy=False, method=self.method, verbose=max(0, self.verbose - 1), normalize=_normalize, fit_intercept=self.fit_intercept, max_iter=self.max_iter, eps=self.eps, positive=self.positive) for (train, test) in cv.split(X, y)))\n all_alphas = np.concatenate(list(zip(*cv_paths))[0])\n all_alphas = np.unique(all_alphas)\n stride = int(max(1, int(len(all_alphas) / float(self.max_n_alphas))))\n all_alphas = all_alphas[::stride]\n mse_path = np.empty((len(all_alphas), len(cv_paths)))\n for (index, (alphas, _, _, residues)) in enumerate(cv_paths):\n alphas = alphas[::-1]\n residues = residues[::-1]\n if alphas[0] != 0:\n alphas = np.r_[0, alphas]\n residues = np.r_[residues[0, np.newaxis], residues]\n if alphas[-1] != all_alphas[-1]:\n alphas = np.r_[alphas, all_alphas[-1]]\n residues = np.r_[residues, residues[-1, np.newaxis]]\n this_residues = interpolate.interp1d(alphas, residues, axis=0)(all_alphas)\n this_residues **= 2\n mse_path[:, index] = np.mean(this_residues, axis=-1)\n mask = np.all(np.isfinite(mse_path), axis=-1)\n all_alphas = all_alphas[mask]\n mse_path = mse_path[mask]\n i_best_alpha = np.argmin(mse_path.mean(axis=-1))\n best_alpha = all_alphas[i_best_alpha]\n self.alpha_ = best_alpha\n self.cv_alphas_ = all_alphas\n self.mse_path_ = mse_path\n self._fit(X, y, max_iter=self.max_iter, alpha=best_alpha, Xy=None, fit_path=True, normalize=_normalize)\n return self" }, { @@ -101560,7 +108384,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "alpha", @@ -101570,7 +108395,8 @@ "docstring": { "type": "float, default=1.0", "description": "Constant that multiplies the penalty term. Defaults to 1.0.\n``alpha = 0`` is equivalent to an ordinary least square, solved\nby :class:`LinearRegression`. For numerical reasons, using\n``alpha = 0`` with the LassoLars object is not advised and you\nshould prefer the LinearRegression object." 
- } + }, + "refined_type": {} }, { "name": "fit_intercept", @@ -101580,7 +108406,8 @@ "docstring": { "type": "bool, default=True", "description": "Whether to calculate the intercept for this model. If set\nto false, no intercept will be used in calculations\n(i.e. data is expected to be centered)." - } + }, + "refined_type": {} }, { "name": "verbose", @@ -101590,7 +108417,8 @@ "docstring": { "type": "bool or int, default=False", "description": "Sets the verbosity amount." - } + }, + "refined_type": {} }, { "name": "normalize", @@ -101600,7 +108428,8 @@ "docstring": { "type": "bool, default=True", "description": "This parameter is ignored when ``fit_intercept`` is set to False.\nIf True, the regressors X will be normalized before regression by\nsubtracting the mean and dividing by the l2-norm.\nIf you wish to standardize, please use\n:class:`~sklearn.preprocessing.StandardScaler` before calling ``fit``\non an estimator with ``normalize=False``.\n\n.. deprecated:: 1.0\n ``normalize`` was deprecated in version 1.0. It will default\n to False in 1.2 and be removed in 1.4." - } + }, + "refined_type": {} }, { "name": "precompute", @@ -101610,7 +108439,8 @@ "docstring": { "type": "bool, 'auto' or array-like, default='auto'", "description": "Whether to use a precomputed Gram matrix to speed up\ncalculations. If set to ``'auto'`` let us decide. The Gram\nmatrix can also be passed as argument." - } + }, + "refined_type": {} }, { "name": "max_iter", @@ -101620,7 +108450,8 @@ "docstring": { "type": "int, default=500", "description": "Maximum number of iterations to perform." - } + }, + "refined_type": {} }, { "name": "eps", @@ -101630,7 +108461,8 @@ "docstring": { "type": "float, default=np.finfo(float).eps", "description": "The machine-precision regularization in the computation of the\nCholesky diagonal factors. Increase this for very ill-conditioned\nsystems. Unlike the ``tol`` parameter in some iterative\noptimization-based algorithms, this parameter does not control\nthe tolerance of the optimization." - } + }, + "refined_type": {} }, { "name": "copy_X", @@ -101640,7 +108472,8 @@ "docstring": { "type": "bool, default=True", "description": "If True, X will be copied; else, it may be overwritten." - } + }, + "refined_type": {} }, { "name": "fit_path", @@ -101650,7 +108483,8 @@ "docstring": { "type": "bool, default=True", "description": "If ``True`` the full path is stored in the ``coef_path_`` attribute.\nIf you compute the solution for a large problem or many targets,\nsetting ``fit_path`` to ``False`` will lead to a speedup, especially\nwith a small alpha." - } + }, + "refined_type": {} }, { "name": "positive", @@ -101660,7 +108494,8 @@ "docstring": { "type": "bool, default=False", "description": "Restrict coefficients to be >= 0. Be aware that you might want to\nremove fit_intercept which is set True by default.\nUnder the positive restriction the model coefficients will not converge\nto the ordinary-least-squares solution for small values of alpha.\nOnly coefficients up to the smallest alpha value (``alphas_[alphas_ >\n0.].min()`` when fit_path=True) reached by the stepwise Lars-Lasso\nalgorithm are typically in congruence with the solution of the\ncoordinate descent Lasso estimator." - } + }, + "refined_type": {} }, { "name": "jitter", @@ -101670,7 +108505,8 @@ "docstring": { "type": "float, default=None", "description": "Upper bound on a uniform noise parameter to be added to the\n`y` values, to satisfy the model's assumption of\none-at-a-time computations. Might help with stability.\n\n.. 
versionadded:: 0.23" - } + }, + "refined_type": {} }, { "name": "random_state", @@ -101680,13 +108516,14 @@ "docstring": { "type": "int, RandomState instance or None, default=None", "description": "Determines random number generation for jittering. Pass an int\nfor reproducible output across multiple function calls.\nSee :term:`Glossary `. Ignored if `jitter` is None.\n\n.. versionadded:: 0.23" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, alpha=1.0, *, fit_intercept=True, verbose=False, normalize='deprecated', precompute='auto', max_iter=500, eps=np.finfo(float).eps, copy_X=True, fit_path=True, positive=False, jitter=None, random_state=None):\n self.alpha = alpha\n self.fit_intercept = fit_intercept\n self.max_iter = max_iter\n self.verbose = verbose\n self.normalize = normalize\n self.positive = positive\n self.precompute = precompute\n self.copy_X = copy_X\n self.eps = eps\n self.fit_path = fit_path\n self.jitter = jitter\n self.random_state = random_state" }, { @@ -101704,7 +108541,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "fit_intercept", @@ -101714,7 +108552,8 @@ "docstring": { "type": "bool, default=True", "description": "Whether to calculate the intercept for this model. If set\nto false, no intercept will be used in calculations\n(i.e. data is expected to be centered)." - } + }, + "refined_type": {} }, { "name": "verbose", @@ -101724,7 +108563,8 @@ "docstring": { "type": "bool or int, default=False", "description": "Sets the verbosity amount." - } + }, + "refined_type": {} }, { "name": "max_iter", @@ -101734,7 +108574,8 @@ "docstring": { "type": "int, default=500", "description": "Maximum number of iterations to perform." - } + }, + "refined_type": {} }, { "name": "normalize", @@ -101744,7 +108585,8 @@ "docstring": { "type": "bool, default=True", "description": "This parameter is ignored when ``fit_intercept`` is set to False.\nIf True, the regressors X will be normalized before regression by\nsubtracting the mean and dividing by the l2-norm.\nIf you wish to standardize, please use\n:class:`~sklearn.preprocessing.StandardScaler` before calling ``fit``\non an estimator with ``normalize=False``.\n\n.. deprecated:: 1.0\n ``normalize`` was deprecated in version 1.0. It will default\n to False in 1.2 and be removed in 1.4." - } + }, + "refined_type": {} }, { "name": "precompute", @@ -101754,7 +108596,8 @@ "docstring": { "type": "bool or 'auto' , default='auto'", "description": "Whether to use a precomputed Gram matrix to speed up\ncalculations. If set to ``'auto'`` let us decide. The Gram matrix\ncannot be passed as argument since we will use only subsets of X." - } + }, + "refined_type": {} }, { "name": "cv", @@ -101764,7 +108607,8 @@ "docstring": { "type": "int, cross-validation generator or an iterable, default=None", "description": "Determines the cross-validation splitting strategy.\nPossible inputs for cv are:\n\n- None, to use the default 5-fold cross-validation,\n- integer, to specify the number of folds.\n- :term:`CV splitter`,\n- An iterable yielding (train, test) splits as arrays of indices.\n\nFor integer/None inputs, :class:`KFold` is used.\n\nRefer :ref:`User Guide ` for the various\ncross-validation strategies that can be used here.\n\n.. versionchanged:: 0.22\n ``cv`` default value if None changed from 3-fold to 5-fold." 
- } + }, + "refined_type": {} }, { "name": "max_n_alphas", @@ -101774,7 +108618,8 @@ "docstring": { "type": "int, default=1000", "description": "The maximum number of points on the path used to compute the\nresiduals in the cross-validation." - } + }, + "refined_type": {} }, { "name": "n_jobs", @@ -101784,7 +108629,8 @@ "docstring": { "type": "int or None, default=None", "description": "Number of CPUs to use during the cross validation.\n``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n``-1`` means using all processors. See :term:`Glossary `\nfor more details." - } + }, + "refined_type": {} }, { "name": "eps", @@ -101794,7 +108640,8 @@ "docstring": { "type": "float, default=np.finfo(float).eps", "description": "The machine-precision regularization in the computation of the\nCholesky diagonal factors. Increase this for very ill-conditioned\nsystems. Unlike the ``tol`` parameter in some iterative\noptimization-based algorithms, this parameter does not control\nthe tolerance of the optimization." - } + }, + "refined_type": {} }, { "name": "copy_X", @@ -101804,7 +108651,8 @@ "docstring": { "type": "bool, default=True", "description": "If True, X will be copied; else, it may be overwritten." - } + }, + "refined_type": {} }, { "name": "positive", @@ -101814,13 +108662,14 @@ "docstring": { "type": "bool, default=False", "description": "Restrict coefficients to be >= 0. Be aware that you might want to\nremove fit_intercept which is set True by default.\nUnder the positive restriction the model coefficients do not converge\nto the ordinary-least-squares solution for small values of alpha.\nOnly coefficients up to the smallest alpha value (``alphas_[alphas_ >\n0.].min()`` when fit_path=True) reached by the stepwise Lars-Lasso\nalgorithm are typically in congruence with the solution of the\ncoordinate descent Lasso estimator.\nAs a consequence using LassoLarsCV only makes sense for problems where\na sparse solution is expected and/or reached." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, *, fit_intercept=True, verbose=False, max_iter=500, normalize='deprecated', precompute='auto', cv=None, max_n_alphas=1000, n_jobs=None, eps=np.finfo(float).eps, copy_X=True, positive=False):\n self.fit_intercept = fit_intercept\n self.verbose = verbose\n self.max_iter = max_iter\n self.normalize = normalize\n self.precompute = precompute\n self.cv = cv\n self.max_n_alphas = max_n_alphas\n self.n_jobs = n_jobs\n self.eps = eps\n self.copy_X = copy_X\n self.positive = positive" }, { @@ -101838,7 +108687,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "criterion", @@ -101846,8 +108696,12 @@ "is_public": true, "assigned_by": "POSITION_OR_NAME", "docstring": { - "type": "{'bic' , 'aic'}, default='aic'", + "type": "{'aic', 'bic'}, default='aic'", "description": "The type of criterion to use." + }, + "refined_type": { + "kind": "EnumType", + "values": ["bic", "aic"] } }, { @@ -101858,7 +108712,8 @@ "docstring": { "type": "bool, default=True", "description": "Whether to calculate the intercept for this model. If set\nto false, no intercept will be used in calculations\n(i.e. data is expected to be centered)." - } + }, + "refined_type": {} }, { "name": "verbose", @@ -101868,7 +108723,8 @@ "docstring": { "type": "bool or int, default=False", "description": "Sets the verbosity amount." 
- } + }, + "refined_type": {} }, { "name": "normalize", @@ -101878,7 +108734,8 @@ "docstring": { "type": "bool, default=True", "description": "This parameter is ignored when ``fit_intercept`` is set to False.\nIf True, the regressors X will be normalized before regression by\nsubtracting the mean and dividing by the l2-norm.\nIf you wish to standardize, please use\n:class:`~sklearn.preprocessing.StandardScaler` before calling ``fit``\non an estimator with ``normalize=False``.\n\n.. deprecated:: 1.0\n ``normalize`` was deprecated in version 1.0. It will default\n to False in 1.2 and be removed in 1.4." - } + }, + "refined_type": {} }, { "name": "precompute", @@ -101888,7 +108745,8 @@ "docstring": { "type": "bool, 'auto' or array-like, default='auto'", "description": "Whether to use a precomputed Gram matrix to speed up\ncalculations. If set to ``'auto'`` let us decide. The Gram\nmatrix can also be passed as argument." - } + }, + "refined_type": {} }, { "name": "max_iter", @@ -101898,7 +108756,8 @@ "docstring": { "type": "int, default=500", "description": "Maximum number of iterations to perform. Can be used for\nearly stopping." - } + }, + "refined_type": {} }, { "name": "eps", @@ -101908,7 +108767,8 @@ "docstring": { "type": "float, default=np.finfo(float).eps", "description": "The machine-precision regularization in the computation of the\nCholesky diagonal factors. Increase this for very ill-conditioned\nsystems. Unlike the ``tol`` parameter in some iterative\noptimization-based algorithms, this parameter does not control\nthe tolerance of the optimization." - } + }, + "refined_type": {} }, { "name": "copy_X", @@ -101918,7 +108778,8 @@ "docstring": { "type": "bool, default=True", "description": "If True, X will be copied; else, it may be overwritten." - } + }, + "refined_type": {} }, { "name": "positive", @@ -101928,14 +108789,84 @@ "docstring": { "type": "bool, default=False", "description": "Restrict coefficients to be >= 0. Be aware that you might want to\nremove fit_intercept which is set True by default.\nUnder the positive restriction the model coefficients do not converge\nto the ordinary-least-squares solution for small values of alpha.\nOnly coefficients up to the smallest alpha value (``alphas_[alphas_ >\n0.].min()`` when fit_path=True) reached by the stepwise Lars-Lasso\nalgorithm are typically in congruence with the solution of the\ncoordinate descent Lasso estimator.\nAs a consequence using LassoLarsIC only makes sense for problems where\na sparse solution is expected and/or reached." - } + }, + "refined_type": {} + }, + { + "name": "noise_variance", + "default_value": "None", + "is_public": true, + "assigned_by": "NAME_ONLY", + "docstring": { + "type": "float, default=None", + "description": "The estimated noise variance of the data. If `None`, an unbiased\nestimate is computed by an OLS model. However, it is only possible\nin the case where `n_samples > n_features + fit_intercept`.\n\n.. 
versionadded:: 1.1" + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", - "source_code": "\ndef __init__(self, criterion='aic', *, fit_intercept=True, verbose=False, normalize='deprecated', precompute='auto', max_iter=500, eps=np.finfo(float).eps, copy_X=True, positive=False):\n self.criterion = criterion\n self.fit_intercept = fit_intercept\n self.positive = positive\n self.max_iter = max_iter\n self.verbose = verbose\n self.normalize = normalize\n self.copy_X = copy_X\n self.precompute = precompute\n self.eps = eps\n self.fit_path = True" + "docstring": null, + "source_code": "\ndef __init__(self, criterion='aic', *, fit_intercept=True, verbose=False, normalize='deprecated', precompute='auto', max_iter=500, eps=np.finfo(float).eps, copy_X=True, positive=False, noise_variance=None):\n self.criterion = criterion\n self.fit_intercept = fit_intercept\n self.positive = positive\n self.max_iter = max_iter\n self.verbose = verbose\n self.normalize = normalize\n self.copy_X = copy_X\n self.precompute = precompute\n self.eps = eps\n self.fit_path = True\n self.noise_variance = noise_variance" + }, + { + "name": "_estimate_noise_variance", + "unique_name": "_estimate_noise_variance", + "qname": "sklearn.linear_model._least_angle.LassoLarsIC._estimate_noise_variance", + "unique_qname": "sklearn.linear_model._least_angle.LassoLarsIC._estimate_noise_variance", + "decorators": [], + "parameters": [ + { + "name": "self", + "default_value": null, + "is_public": false, + "assigned_by": "POSITION_OR_NAME", + "docstring": { + "type": "", + "description": "" + }, + "refined_type": {} + }, + { + "name": "X", + "default_value": null, + "is_public": false, + "assigned_by": "POSITION_OR_NAME", + "docstring": { + "type": "ndarray of shape (n_samples, n_features)", + "description": "Data to be fitted by the OLS model. We expect the data to be\ncentered." + }, + "refined_type": {} + }, + { + "name": "y", + "default_value": null, + "is_public": false, + "assigned_by": "POSITION_OR_NAME", + "docstring": { + "type": "ndarray of shape (n_samples,)", + "description": "Associated target." + }, + "refined_type": {} + }, + { + "name": "positive", + "default_value": null, + "is_public": false, + "assigned_by": "POSITION_OR_NAME", + "docstring": { + "type": "bool, default=False", + "description": "Restrict coefficients to be >= 0. This should be inline with\nthe `positive` parameter from `LassoLarsIC`." + }, + "refined_type": {} + } + ], + "results": [], + "is_public": false, + "description": "Compute an estimate of the variance with an OLS model.", + "docstring": "Compute an estimate of the variance with an OLS model.\n\n Parameters\n ----------\n X : ndarray of shape (n_samples, n_features)\n Data to be fitted by the OLS model. We expect the data to be\n centered.\n\n y : ndarray of shape (n_samples,)\n Associated target.\n\n positive : bool, default=False\n Restrict coefficients to be >= 0. This should be inline with\n the `positive` parameter from `LassoLarsIC`.\n\n Returns\n -------\n noise_variance : float\n An estimator of the noise variance of an OLS model.\n ", + "source_code": "\ndef _estimate_noise_variance(self, X, y, positive):\n \"\"\"Compute an estimate of the variance with an OLS model.\n\n Parameters\n ----------\n X : ndarray of shape (n_samples, n_features)\n Data to be fitted by the OLS model. 
We expect the data to be\n centered.\n\n y : ndarray of shape (n_samples,)\n Associated target.\n\n positive : bool, default=False\n Restrict coefficients to be >= 0. This should be inline with\n the `positive` parameter from `LassoLarsIC`.\n\n Returns\n -------\n noise_variance : float\n An estimator of the noise variance of an OLS model.\n \"\"\"\n if X.shape[0] <= X.shape[1] + self.fit_intercept:\n raise ValueError(f'You are using {self.__class__.__name__} in the case where the number of samples is smaller than the number of features. In this setting, getting a good estimate for the variance of the noise is not possible. Provide an estimate of the noise variance in the constructor.')\n ols_model = LinearRegression(positive=positive, fit_intercept=False)\n y_pred = ols_model.fit(X, y).predict(X)\n return np.sum((y - y_pred)**2) / (X.shape[0] - X.shape[1] - self.fit_intercept)" }, { "name": "_more_tags", @@ -101952,13 +108883,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _more_tags(self):\n return {'multioutput': False}" }, { @@ -101976,7 +108908,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -101986,7 +108919,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "Training data." - } + }, + "refined_type": {} }, { "name": "y", @@ -101996,7 +108930,8 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "Target values. Will be cast to X's dtype if necessary." - } + }, + "refined_type": {} }, { "name": "copy_X", @@ -102006,14 +108941,15 @@ "docstring": { "type": "bool, default=None", "description": "If provided, this parameter will override the choice\nof copy_X made at instance creation.\nIf ``True``, X will be copied; else, it may be overwritten." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Fit the model using X, y as training data.", - "docstring": "Fit the model using X, y as training data.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n Training data.\n\ny : array-like of shape (n_samples,)\n Target values. Will be cast to X's dtype if necessary.\n\ncopy_X : bool, default=None\n If provided, this parameter will override the choice\n of copy_X made at instance creation.\n If ``True``, X will be copied; else, it may be overwritten.\n\nReturns\n-------\nself : object\n Returns an instance of self.", - "source_code": "\ndef fit(self, X, y, copy_X=None):\n \"\"\"Fit the model using X, y as training data.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training data.\n\n y : array-like of shape (n_samples,)\n Target values. 
Will be cast to X's dtype if necessary.\n\n copy_X : bool, default=None\n If provided, this parameter will override the choice\n of copy_X made at instance creation.\n If ``True``, X will be copied; else, it may be overwritten.\n\n Returns\n -------\n self : object\n Returns an instance of self.\n \"\"\"\n _normalize = _deprecate_normalize(self.normalize, default=True, estimator_name=self.__class__.__name__)\n if copy_X is None:\n copy_X = self.copy_X\n (X, y) = self._validate_data(X, y, y_numeric=True)\n (X, y, Xmean, ymean, Xstd) = LinearModel._preprocess_data(X, y, self.fit_intercept, _normalize, copy_X)\n Gram = self.precompute\n (alphas_, _, coef_path_, self.n_iter_) = lars_path(X, y, Gram=Gram, copy_X=copy_X, copy_Gram=True, alpha_min=0.0, method='lasso', verbose=self.verbose, max_iter=self.max_iter, eps=self.eps, return_n_iter=True, positive=self.positive)\n n_samples = X.shape[0]\n if self.criterion == 'aic':\n K = 2\n elif self.criterion == 'bic':\n K = log(n_samples)\n else:\n raise ValueError('criterion should be either bic or aic')\n R = y[:, np.newaxis] - np.dot(X, coef_path_)\n mean_squared_error = np.mean(R**2, axis=0)\n sigma2 = np.var(y)\n df = np.zeros(coef_path_.shape[1], dtype=int)\n for (k, coef) in enumerate(coef_path_.T):\n mask = np.abs(coef) > np.finfo(coef.dtype).eps\n if not np.any(mask):\n continue\n df[k] = np.sum(mask)\n self.alphas_ = alphas_\n eps64 = np.finfo('float64').eps\n self.criterion_ = n_samples * mean_squared_error / (sigma2 + eps64) + K * df\n n_best = np.argmin(self.criterion_)\n self.alpha_ = alphas_[n_best]\n self.coef_ = coef_path_[:, n_best]\n self._set_intercept(Xmean, ymean, Xstd)\n return self" + "docstring": "Fit the model using X, y as training data.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training data.\n\n y : array-like of shape (n_samples,)\n Target values. Will be cast to X's dtype if necessary.\n\n copy_X : bool, default=None\n If provided, this parameter will override the choice\n of copy_X made at instance creation.\n If ``True``, X will be copied; else, it may be overwritten.\n\n Returns\n -------\n self : object\n Returns an instance of self.\n ", + "source_code": "\ndef fit(self, X, y, copy_X=None):\n \"\"\"Fit the model using X, y as training data.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training data.\n\n y : array-like of shape (n_samples,)\n Target values. 
Will be cast to X's dtype if necessary.\n\n copy_X : bool, default=None\n If provided, this parameter will override the choice\n of copy_X made at instance creation.\n If ``True``, X will be copied; else, it may be overwritten.\n\n Returns\n -------\n self : object\n Returns an instance of self.\n \"\"\"\n _normalize = _deprecate_normalize(self.normalize, default=True, estimator_name=self.__class__.__name__)\n if copy_X is None:\n copy_X = self.copy_X\n (X, y) = self._validate_data(X, y, y_numeric=True)\n (X, y, Xmean, ymean, Xstd) = LinearModel._preprocess_data(X, y, self.fit_intercept, _normalize, copy_X)\n Gram = self.precompute\n (alphas_, _, coef_path_, self.n_iter_) = lars_path(X, y, Gram=Gram, copy_X=copy_X, copy_Gram=True, alpha_min=0.0, method='lasso', verbose=self.verbose, max_iter=self.max_iter, eps=self.eps, return_n_iter=True, positive=self.positive)\n n_samples = X.shape[0]\n if self.criterion == 'aic':\n criterion_factor = 2\n elif self.criterion == 'bic':\n criterion_factor = log(n_samples)\n else:\n raise ValueError(f'criterion should be either bic or aic, got {self.criterion!r}')\n residuals = y[:, np.newaxis] - np.dot(X, coef_path_)\n residuals_sum_squares = np.sum(residuals**2, axis=0)\n degrees_of_freedom = np.zeros(coef_path_.shape[1], dtype=int)\n for (k, coef) in enumerate(coef_path_.T):\n mask = np.abs(coef) > np.finfo(coef.dtype).eps\n if not np.any(mask):\n continue\n degrees_of_freedom[k] = np.sum(mask)\n self.alphas_ = alphas_\n if self.noise_variance is None:\n self.noise_variance_ = self._estimate_noise_variance(X, y, positive=self.positive)\n else:\n self.noise_variance_ = self.noise_variance\n self.criterion_ = n_samples * np.log(2 * np.pi * self.noise_variance_) + residuals_sum_squares / self.noise_variance_ + criterion_factor * degrees_of_freedom\n n_best = np.argmin(self.criterion_)\n self.alpha_ = alphas_[n_best]\n self.coef_ = coef_path_[:, n_best]\n self._set_intercept(Xmean, ymean, Xstd)\n return self" }, { "name": "_check_copy_and_writeable", @@ -102030,7 +108966,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "copy", @@ -102040,13 +108977,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _check_copy_and_writeable(array, copy=False):\n if copy or not array.flags.writeable:\n return array.copy()\n return array" }, { @@ -102064,7 +109002,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "The data to fit the LARS on" - } + }, + "refined_type": {} }, { "name": "y_train", @@ -102074,7 +109013,8 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "The target variable to fit LARS on" - } + }, + "refined_type": {} }, { "name": "X_test", @@ -102084,7 +109024,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "The data to compute the residues on" - } + }, + "refined_type": {} }, { "name": "y_test", @@ -102094,7 +109035,8 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "The target variable to compute the residues on" - } + }, + "refined_type": {} }, { "name": "Gram", @@ -102104,7 +109046,8 @@ "docstring": { "type": "None, 'auto' or array-like of shape (n_features, n_features), default=None", "description": "Precomputed Gram matrix (X' * X), if ``'auto'``, the Gram\nmatrix is precomputed from the given X, if there are more samples\nthan 
features" - } + }, + "refined_type": {} }, { "name": "copy", @@ -102114,7 +109057,8 @@ "docstring": { "type": "bool, default=True", "description": "Whether X_train, X_test, y_train and y_test should be copied;\nif False, they may be overwritten." - } + }, + "refined_type": {} }, { "name": "method", @@ -102124,6 +109068,10 @@ "docstring": { "type": "{'lar' , 'lasso'}, default='lar'", "description": "Specifies the returned model. Select ``'lar'`` for Least Angle\nRegression, ``'lasso'`` for the Lasso." + }, + "refined_type": { + "kind": "EnumType", + "values": ["lasso", "lar"] } }, { @@ -102134,7 +109082,8 @@ "docstring": { "type": "bool or int, default=False", "description": "Sets the amount of verbosity" - } + }, + "refined_type": {} }, { "name": "fit_intercept", @@ -102144,7 +109093,8 @@ "docstring": { "type": "bool, default=True", "description": "whether to calculate the intercept for this model. If set\nto false, no intercept will be used in calculations\n(i.e. data is expected to be centered)." - } + }, + "refined_type": {} }, { "name": "normalize", @@ -102154,7 +109104,8 @@ "docstring": { "type": "bool, default=True", "description": "This parameter is ignored when ``fit_intercept`` is set to False.\nIf True, the regressors X will be normalized before regression by\nsubtracting the mean and dividing by the l2-norm.\nIf you wish to standardize, please use\n:class:`~sklearn.preprocessing.StandardScaler` before calling ``fit``\non an estimator with ``normalize=False``.\n\n.. deprecated:: 1.0\n ``normalize`` was deprecated in version 1.0. It will default\n to False in 1.2 and be removed in 1.4." - } + }, + "refined_type": {} }, { "name": "max_iter", @@ -102164,7 +109115,8 @@ "docstring": { "type": "int, default=500", "description": "Maximum number of iterations to perform." - } + }, + "refined_type": {} }, { "name": "eps", @@ -102174,7 +109126,8 @@ "docstring": { "type": "float, default=np.finfo(float).eps", "description": "The machine-precision regularization in the computation of the\nCholesky diagonal factors. Increase this for very ill-conditioned\nsystems. Unlike the ``tol`` parameter in some iterative\noptimization-based algorithms, this parameter does not control\nthe tolerance of the optimization." - } + }, + "refined_type": {} }, { "name": "positive", @@ -102184,13 +109137,14 @@ "docstring": { "type": "bool, default=False", "description": "Restrict coefficients to be >= 0. Be aware that you might want to\nremove fit_intercept which is set True by default.\nSee reservations for using this option in combination with method\n'lasso' for expected small values of alpha in the doc of LassoLarsCV\nand LassoLarsIC." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Compute the residues on left-out data for a full LARS path", - "docstring": "Compute the residues on left-out data for a full LARS path\n\nParameters\n-----------\nX_train : array-like of shape (n_samples, n_features)\n The data to fit the LARS on\n\ny_train : array-like of shape (n_samples,)\n The target variable to fit LARS on\n\nX_test : array-like of shape (n_samples, n_features)\n The data to compute the residues on\n\ny_test : array-like of shape (n_samples,)\n The target variable to compute the residues on\n\nGram : None, 'auto' or array-like of shape (n_features, n_features), default=None\n Precomputed Gram matrix (X' * X), if ``'auto'``, the Gram\n matrix is precomputed from the given X, if there are more samples\n than features\n\ncopy : bool, default=True\n Whether X_train, X_test, y_train and y_test should be copied;\n if False, they may be overwritten.\n\nmethod : {'lar' , 'lasso'}, default='lar'\n Specifies the returned model. Select ``'lar'`` for Least Angle\n Regression, ``'lasso'`` for the Lasso.\n\nverbose : bool or int, default=False\n Sets the amount of verbosity\n\nfit_intercept : bool, default=True\n whether to calculate the intercept for this model. If set\n to false, no intercept will be used in calculations\n (i.e. data is expected to be centered).\n\npositive : bool, default=False\n Restrict coefficients to be >= 0. Be aware that you might want to\n remove fit_intercept which is set True by default.\n See reservations for using this option in combination with method\n 'lasso' for expected small values of alpha in the doc of LassoLarsCV\n and LassoLarsIC.\n\nnormalize : bool, default=True\n This parameter is ignored when ``fit_intercept`` is set to False.\n If True, the regressors X will be normalized before regression by\n subtracting the mean and dividing by the l2-norm.\n If you wish to standardize, please use\n :class:`~sklearn.preprocessing.StandardScaler` before calling ``fit``\n on an estimator with ``normalize=False``.\n\n .. deprecated:: 1.0\n ``normalize`` was deprecated in version 1.0. It will default\n to False in 1.2 and be removed in 1.4.\n\nmax_iter : int, default=500\n Maximum number of iterations to perform.\n\neps : float, default=np.finfo(float).eps\n The machine-precision regularization in the computation of the\n Cholesky diagonal factors. Increase this for very ill-conditioned\n systems. 
Unlike the ``tol`` parameter in some iterative\n optimization-based algorithms, this parameter does not control\n the tolerance of the optimization.\n\nReturns\n--------\nalphas : array-like of shape (n_alphas,)\n Maximum of covariances (in absolute value) at each iteration.\n ``n_alphas`` is either ``max_iter`` or ``n_features``, whichever\n is smaller.\n\nactive : list\n Indices of active variables at the end of the path.\n\ncoefs : array-like of shape (n_features, n_alphas)\n Coefficients along the path\n\nresidues : array-like of shape (n_alphas, n_samples)\n Residues of the prediction on the test data", + "docstring": "Compute the residues on left-out data for a full LARS path\n\n Parameters\n -----------\n X_train : array-like of shape (n_samples, n_features)\n The data to fit the LARS on\n\n y_train : array-like of shape (n_samples,)\n The target variable to fit LARS on\n\n X_test : array-like of shape (n_samples, n_features)\n The data to compute the residues on\n\n y_test : array-like of shape (n_samples,)\n The target variable to compute the residues on\n\n Gram : None, 'auto' or array-like of shape (n_features, n_features), default=None\n Precomputed Gram matrix (X' * X), if ``'auto'``, the Gram\n matrix is precomputed from the given X, if there are more samples\n than features\n\n copy : bool, default=True\n Whether X_train, X_test, y_train and y_test should be copied;\n if False, they may be overwritten.\n\n method : {'lar' , 'lasso'}, default='lar'\n Specifies the returned model. Select ``'lar'`` for Least Angle\n Regression, ``'lasso'`` for the Lasso.\n\n verbose : bool or int, default=False\n Sets the amount of verbosity\n\n fit_intercept : bool, default=True\n whether to calculate the intercept for this model. If set\n to false, no intercept will be used in calculations\n (i.e. data is expected to be centered).\n\n positive : bool, default=False\n Restrict coefficients to be >= 0. Be aware that you might want to\n remove fit_intercept which is set True by default.\n See reservations for using this option in combination with method\n 'lasso' for expected small values of alpha in the doc of LassoLarsCV\n and LassoLarsIC.\n\n normalize : bool, default=True\n This parameter is ignored when ``fit_intercept`` is set to False.\n If True, the regressors X will be normalized before regression by\n subtracting the mean and dividing by the l2-norm.\n If you wish to standardize, please use\n :class:`~sklearn.preprocessing.StandardScaler` before calling ``fit``\n on an estimator with ``normalize=False``.\n\n .. deprecated:: 1.0\n ``normalize`` was deprecated in version 1.0. It will default\n to False in 1.2 and be removed in 1.4.\n\n max_iter : int, default=500\n Maximum number of iterations to perform.\n\n eps : float, default=np.finfo(float).eps\n The machine-precision regularization in the computation of the\n Cholesky diagonal factors. Increase this for very ill-conditioned\n systems. 
Unlike the ``tol`` parameter in some iterative\n optimization-based algorithms, this parameter does not control\n the tolerance of the optimization.\n\n Returns\n --------\n alphas : array-like of shape (n_alphas,)\n Maximum of covariances (in absolute value) at each iteration.\n ``n_alphas`` is either ``max_iter`` or ``n_features``, whichever\n is smaller.\n\n active : list\n Indices of active variables at the end of the path.\n\n coefs : array-like of shape (n_features, n_alphas)\n Coefficients along the path\n\n residues : array-like of shape (n_alphas, n_samples)\n Residues of the prediction on the test data\n ", "source_code": "\ndef _lars_path_residues(X_train, y_train, X_test, y_test, Gram=None, copy=True, method='lars', verbose=False, fit_intercept=True, normalize=True, max_iter=500, eps=np.finfo(float).eps, positive=False):\n \"\"\"Compute the residues on left-out data for a full LARS path\n\n Parameters\n -----------\n X_train : array-like of shape (n_samples, n_features)\n The data to fit the LARS on\n\n y_train : array-like of shape (n_samples,)\n The target variable to fit LARS on\n\n X_test : array-like of shape (n_samples, n_features)\n The data to compute the residues on\n\n y_test : array-like of shape (n_samples,)\n The target variable to compute the residues on\n\n Gram : None, 'auto' or array-like of shape (n_features, n_features), default=None\n Precomputed Gram matrix (X' * X), if ``'auto'``, the Gram\n matrix is precomputed from the given X, if there are more samples\n than features\n\n copy : bool, default=True\n Whether X_train, X_test, y_train and y_test should be copied;\n if False, they may be overwritten.\n\n method : {'lar' , 'lasso'}, default='lar'\n Specifies the returned model. Select ``'lar'`` for Least Angle\n Regression, ``'lasso'`` for the Lasso.\n\n verbose : bool or int, default=False\n Sets the amount of verbosity\n\n fit_intercept : bool, default=True\n whether to calculate the intercept for this model. If set\n to false, no intercept will be used in calculations\n (i.e. data is expected to be centered).\n\n positive : bool, default=False\n Restrict coefficients to be >= 0. Be aware that you might want to\n remove fit_intercept which is set True by default.\n See reservations for using this option in combination with method\n 'lasso' for expected small values of alpha in the doc of LassoLarsCV\n and LassoLarsIC.\n\n normalize : bool, default=True\n This parameter is ignored when ``fit_intercept`` is set to False.\n If True, the regressors X will be normalized before regression by\n subtracting the mean and dividing by the l2-norm.\n If you wish to standardize, please use\n :class:`~sklearn.preprocessing.StandardScaler` before calling ``fit``\n on an estimator with ``normalize=False``.\n\n .. deprecated:: 1.0\n ``normalize`` was deprecated in version 1.0. It will default\n to False in 1.2 and be removed in 1.4.\n\n max_iter : int, default=500\n Maximum number of iterations to perform.\n\n eps : float, default=np.finfo(float).eps\n The machine-precision regularization in the computation of the\n Cholesky diagonal factors. Increase this for very ill-conditioned\n systems. 
Unlike the ``tol`` parameter in some iterative\n optimization-based algorithms, this parameter does not control\n the tolerance of the optimization.\n\n Returns\n --------\n alphas : array-like of shape (n_alphas,)\n Maximum of covariances (in absolute value) at each iteration.\n ``n_alphas`` is either ``max_iter`` or ``n_features``, whichever\n is smaller.\n\n active : list\n Indices of active variables at the end of the path.\n\n coefs : array-like of shape (n_features, n_alphas)\n Coefficients along the path\n\n residues : array-like of shape (n_alphas, n_samples)\n Residues of the prediction on the test data\n \"\"\"\n X_train = _check_copy_and_writeable(X_train, copy)\n y_train = _check_copy_and_writeable(y_train, copy)\n X_test = _check_copy_and_writeable(X_test, copy)\n y_test = _check_copy_and_writeable(y_test, copy)\n if fit_intercept:\n X_mean = X_train.mean(axis=0)\n X_train -= X_mean\n X_test -= X_mean\n y_mean = y_train.mean(axis=0)\n y_train = as_float_array(y_train, copy=False)\n y_train -= y_mean\n y_test = as_float_array(y_test, copy=False)\n y_test -= y_mean\n if normalize:\n norms = np.sqrt(np.sum(X_train**2, axis=0))\n nonzeros = np.flatnonzero(norms)\n X_train[:, nonzeros] /= norms[nonzeros]\n (alphas, active, coefs) = lars_path(X_train, y_train, Gram=Gram, copy_X=False, copy_Gram=False, method=method, verbose=max(0, verbose - 1), max_iter=max_iter, eps=eps, positive=positive)\n if normalize:\n coefs[nonzeros] /= norms[nonzeros][:, np.newaxis]\n residues = np.dot(X_test, coefs) - y_test[:, np.newaxis]\n return alphas, active, coefs, residues.T" }, { @@ -102208,7 +109162,8 @@ "docstring": { "type": "None or ndarray of shape (n_samples, n_features)", "description": "Input data. Note that if X is None then Gram must be specified,\ni.e., cannot be None or False." - } + }, + "refined_type": {} }, { "name": "y", @@ -102218,7 +109173,8 @@ "docstring": { "type": "None or ndarray of shape (n_samples,)", "description": "Input targets." - } + }, + "refined_type": {} }, { "name": "Xy", @@ -102228,7 +109184,8 @@ "docstring": { "type": "array-like of shape (n_samples,) or (n_samples, n_targets), default=None", "description": "`Xy = np.dot(X.T, y)` that can be precomputed. It is useful\nonly when the Gram matrix is precomputed." - } + }, + "refined_type": {} }, { "name": "Gram", @@ -102238,7 +109195,8 @@ "docstring": { "type": "None, 'auto' or array-like of shape (n_features, n_features), default=None", "description": "Precomputed Gram matrix `(X' * X)`, if ``'auto'``, the Gram\nmatrix is precomputed from the given X, if there are more samples\nthan features." - } + }, + "refined_type": {} }, { "name": "n_samples", @@ -102248,7 +109206,8 @@ "docstring": { "type": "int or float, default=None", "description": "Equivalent size of sample. If `None`, it will be `n_samples`." - } + }, + "refined_type": {} }, { "name": "max_iter", @@ -102258,7 +109217,8 @@ "docstring": { "type": "int, default=500", "description": "Maximum number of iterations to perform, set to infinity for no limit." - } + }, + "refined_type": {} }, { "name": "alpha_min", @@ -102268,7 +109228,8 @@ "docstring": { "type": "float, default=0", "description": "Minimum correlation along the path. It corresponds to the\nregularization parameter alpha parameter in the Lasso." - } + }, + "refined_type": {} }, { "name": "method", @@ -102278,6 +109239,10 @@ "docstring": { "type": "{'lar', 'lasso'}, default='lar'", "description": "Specifies the returned model. Select ``'lar'`` for Least Angle\nRegression, ``'lasso'`` for the Lasso." 
+ }, + "refined_type": { + "kind": "EnumType", + "values": ["lasso", "lar"] } }, { @@ -102288,7 +109253,8 @@ "docstring": { "type": "bool, default=True", "description": "If ``False``, ``X`` is overwritten." - } + }, + "refined_type": {} }, { "name": "eps", @@ -102298,7 +109264,8 @@ "docstring": { "type": "float, default=np.finfo(float).eps", "description": "The machine-precision regularization in the computation of the\nCholesky diagonal factors. Increase this for very ill-conditioned\nsystems. Unlike the ``tol`` parameter in some iterative\noptimization-based algorithms, this parameter does not control\nthe tolerance of the optimization." - } + }, + "refined_type": {} }, { "name": "copy_Gram", @@ -102308,7 +109275,8 @@ "docstring": { "type": "bool, default=True", "description": "If ``False``, ``Gram`` is overwritten." - } + }, + "refined_type": {} }, { "name": "verbose", @@ -102318,7 +109286,8 @@ "docstring": { "type": "int, default=0", "description": "Controls output verbosity." - } + }, + "refined_type": {} }, { "name": "return_path", @@ -102328,7 +109297,8 @@ "docstring": { "type": "bool, default=True", "description": "If ``return_path==True`` returns the entire path, else returns only the\nlast point of the path." - } + }, + "refined_type": {} }, { "name": "return_n_iter", @@ -102338,7 +109308,8 @@ "docstring": { "type": "bool, default=False", "description": "Whether to return the number of iterations." - } + }, + "refined_type": {} }, { "name": "positive", @@ -102348,13 +109319,14 @@ "docstring": { "type": "bool, default=False", "description": "Restrict coefficients to be >= 0.\nThis option is only allowed with method 'lasso'. Note that the model\ncoefficients will not converge to the ordinary-least-squares solution\nfor small values of alpha. Only coefficients up to the smallest alpha\nvalue (``alphas_[alphas_ > 0.].min()`` when fit_path=True) reached by\nthe stepwise Lars-Lasso algorithm are typically in congruence with the\nsolution of the coordinate descent lasso_path function." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Compute Least Angle Regression or Lasso path using LARS algorithm [1]\n\nThe optimization objective for the case method='lasso' is:: (1 / (2 * n_samples)) * ||y - Xw||^2_2 + alpha * ||w||_1 in the case of method='lars', the objective function is only known in the form of an implicit equation (see discussion in [1]) Read more in the :ref:`User Guide `.", - "docstring": "Compute Least Angle Regression or Lasso path using LARS algorithm [1]\n\nThe optimization objective for the case method='lasso' is::\n\n(1 / (2 * n_samples)) * ||y - Xw||^2_2 + alpha * ||w||_1\n\nin the case of method='lars', the objective function is only known in\nthe form of an implicit equation (see discussion in [1])\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\nX : None or ndarray of shape (n_samples, n_features)\n Input data. Note that if X is None then Gram must be specified,\n i.e., cannot be None or False.\n\ny : None or ndarray of shape (n_samples,)\n Input targets.\n\nXy : array-like of shape (n_samples,) or (n_samples, n_targets), default=None\n `Xy = np.dot(X.T, y)` that can be precomputed. 
It is useful\n only when the Gram matrix is precomputed.\n\nGram : None, 'auto' or array-like of shape (n_features, n_features), default=None\n Precomputed Gram matrix `(X' * X)`, if ``'auto'``, the Gram\n matrix is precomputed from the given X, if there are more samples\n than features.\n\nn_samples : int or float, default=None\n Equivalent size of sample. If `None`, it will be `n_samples`.\n\nmax_iter : int, default=500\n Maximum number of iterations to perform, set to infinity for no limit.\n\nalpha_min : float, default=0\n Minimum correlation along the path. It corresponds to the\n regularization parameter alpha parameter in the Lasso.\n\nmethod : {'lar', 'lasso'}, default='lar'\n Specifies the returned model. Select ``'lar'`` for Least Angle\n Regression, ``'lasso'`` for the Lasso.\n\ncopy_X : bool, default=True\n If ``False``, ``X`` is overwritten.\n\neps : float, default=np.finfo(float).eps\n The machine-precision regularization in the computation of the\n Cholesky diagonal factors. Increase this for very ill-conditioned\n systems. Unlike the ``tol`` parameter in some iterative\n optimization-based algorithms, this parameter does not control\n the tolerance of the optimization.\n\ncopy_Gram : bool, default=True\n If ``False``, ``Gram`` is overwritten.\n\nverbose : int, default=0\n Controls output verbosity.\n\nreturn_path : bool, default=True\n If ``return_path==True`` returns the entire path, else returns only the\n last point of the path.\n\nreturn_n_iter : bool, default=False\n Whether to return the number of iterations.\n\npositive : bool, default=False\n Restrict coefficients to be >= 0.\n This option is only allowed with method 'lasso'. Note that the model\n coefficients will not converge to the ordinary-least-squares solution\n for small values of alpha. Only coefficients up to the smallest alpha\n value (``alphas_[alphas_ > 0.].min()`` when fit_path=True) reached by\n the stepwise Lars-Lasso algorithm are typically in congruence with the\n solution of the coordinate descent lasso_path function.\n\nReturns\n-------\nalphas : array-like of shape (n_alphas + 1,)\n Maximum of covariances (in absolute value) at each iteration.\n ``n_alphas`` is either ``max_iter``, ``n_features`` or the\n number of nodes in the path with ``alpha >= alpha_min``, whichever\n is smaller.\n\nactive : array-like of shape (n_alphas,)\n Indices of active variables at the end of the path.\n\ncoefs : array-like of shape (n_features, n_alphas + 1)\n Coefficients along the path\n\nn_iter : int\n Number of iterations run. Returned only if return_n_iter is set\n to True.\n\nSee Also\n--------\nlasso_path\nLassoLars\nLars\nLassoLarsCV\nLarsCV\nsklearn.decomposition.sparse_encode\n\nReferences\n----------\n.. [1] \"Least Angle Regression\", Efron et al.\n http://statweb.stanford.edu/~tibs/ftp/lars.pdf\n\n.. [2] `Wikipedia entry on the Least-angle regression\n `_\n\n.. 
[3] `Wikipedia entry on the Lasso\n `_", + "description": "Compute Least Angle Regression or Lasso path using LARS algorithm [1]\n\nThe optimization objective for the case method='lasso' is::\n\n(1 / (2 * n_samples)) * ||y - Xw||^2_2 + alpha * ||w||_1\n\nin the case of method='lars', the objective function is only known in\nthe form of an implicit equation (see discussion in [1])\n\nRead more in the :ref:`User Guide `.", + "docstring": "Compute Least Angle Regression or Lasso path using LARS algorithm [1]\n\n The optimization objective for the case method='lasso' is::\n\n (1 / (2 * n_samples)) * ||y - Xw||^2_2 + alpha * ||w||_1\n\n in the case of method='lars', the objective function is only known in\n the form of an implicit equation (see discussion in [1])\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n X : None or ndarray of shape (n_samples, n_features)\n Input data. Note that if X is None then Gram must be specified,\n i.e., cannot be None or False.\n\n y : None or ndarray of shape (n_samples,)\n Input targets.\n\n Xy : array-like of shape (n_samples,) or (n_samples, n_targets), default=None\n `Xy = np.dot(X.T, y)` that can be precomputed. It is useful\n only when the Gram matrix is precomputed.\n\n Gram : None, 'auto' or array-like of shape (n_features, n_features), default=None\n Precomputed Gram matrix `(X' * X)`, if ``'auto'``, the Gram\n matrix is precomputed from the given X, if there are more samples\n than features.\n\n n_samples : int or float, default=None\n Equivalent size of sample. If `None`, it will be `n_samples`.\n\n max_iter : int, default=500\n Maximum number of iterations to perform, set to infinity for no limit.\n\n alpha_min : float, default=0\n Minimum correlation along the path. It corresponds to the\n regularization parameter alpha parameter in the Lasso.\n\n method : {'lar', 'lasso'}, default='lar'\n Specifies the returned model. Select ``'lar'`` for Least Angle\n Regression, ``'lasso'`` for the Lasso.\n\n copy_X : bool, default=True\n If ``False``, ``X`` is overwritten.\n\n eps : float, default=np.finfo(float).eps\n The machine-precision regularization in the computation of the\n Cholesky diagonal factors. Increase this for very ill-conditioned\n systems. Unlike the ``tol`` parameter in some iterative\n optimization-based algorithms, this parameter does not control\n the tolerance of the optimization.\n\n copy_Gram : bool, default=True\n If ``False``, ``Gram`` is overwritten.\n\n verbose : int, default=0\n Controls output verbosity.\n\n return_path : bool, default=True\n If ``return_path==True`` returns the entire path, else returns only the\n last point of the path.\n\n return_n_iter : bool, default=False\n Whether to return the number of iterations.\n\n positive : bool, default=False\n Restrict coefficients to be >= 0.\n This option is only allowed with method 'lasso'. Note that the model\n coefficients will not converge to the ordinary-least-squares solution\n for small values of alpha. 
Only coefficients up to the smallest alpha\n value (``alphas_[alphas_ > 0.].min()`` when fit_path=True) reached by\n the stepwise Lars-Lasso algorithm are typically in congruence with the\n solution of the coordinate descent lasso_path function.\n\n Returns\n -------\n alphas : array-like of shape (n_alphas + 1,)\n Maximum of covariances (in absolute value) at each iteration.\n ``n_alphas`` is either ``max_iter``, ``n_features`` or the\n number of nodes in the path with ``alpha >= alpha_min``, whichever\n is smaller.\n\n active : array-like of shape (n_alphas,)\n Indices of active variables at the end of the path.\n\n coefs : array-like of shape (n_features, n_alphas + 1)\n Coefficients along the path\n\n n_iter : int\n Number of iterations run. Returned only if return_n_iter is set\n to True.\n\n See Also\n --------\n lasso_path\n LassoLars\n Lars\n LassoLarsCV\n LarsCV\n sklearn.decomposition.sparse_encode\n\n References\n ----------\n .. [1] \"Least Angle Regression\", Efron et al.\n http://statweb.stanford.edu/~tibs/ftp/lars.pdf\n\n .. [2] `Wikipedia entry on the Least-angle regression\n `_\n\n .. [3] `Wikipedia entry on the Lasso\n `_\n\n ", "source_code": "\ndef _lars_path_solver(X, y, Xy=None, Gram=None, n_samples=None, max_iter=500, alpha_min=0, method='lar', copy_X=True, eps=np.finfo(float).eps, copy_Gram=True, verbose=0, return_path=True, return_n_iter=False, positive=False):\n \"\"\"Compute Least Angle Regression or Lasso path using LARS algorithm [1]\n\n The optimization objective for the case method='lasso' is::\n\n (1 / (2 * n_samples)) * ||y - Xw||^2_2 + alpha * ||w||_1\n\n in the case of method='lars', the objective function is only known in\n the form of an implicit equation (see discussion in [1])\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n X : None or ndarray of shape (n_samples, n_features)\n Input data. Note that if X is None then Gram must be specified,\n i.e., cannot be None or False.\n\n y : None or ndarray of shape (n_samples,)\n Input targets.\n\n Xy : array-like of shape (n_samples,) or (n_samples, n_targets), default=None\n `Xy = np.dot(X.T, y)` that can be precomputed. It is useful\n only when the Gram matrix is precomputed.\n\n Gram : None, 'auto' or array-like of shape (n_features, n_features), default=None\n Precomputed Gram matrix `(X' * X)`, if ``'auto'``, the Gram\n matrix is precomputed from the given X, if there are more samples\n than features.\n\n n_samples : int or float, default=None\n Equivalent size of sample. If `None`, it will be `n_samples`.\n\n max_iter : int, default=500\n Maximum number of iterations to perform, set to infinity for no limit.\n\n alpha_min : float, default=0\n Minimum correlation along the path. It corresponds to the\n regularization parameter alpha parameter in the Lasso.\n\n method : {'lar', 'lasso'}, default='lar'\n Specifies the returned model. Select ``'lar'`` for Least Angle\n Regression, ``'lasso'`` for the Lasso.\n\n copy_X : bool, default=True\n If ``False``, ``X`` is overwritten.\n\n eps : float, default=np.finfo(float).eps\n The machine-precision regularization in the computation of the\n Cholesky diagonal factors. Increase this for very ill-conditioned\n systems. 
Unlike the ``tol`` parameter in some iterative\n optimization-based algorithms, this parameter does not control\n the tolerance of the optimization.\n\n copy_Gram : bool, default=True\n If ``False``, ``Gram`` is overwritten.\n\n verbose : int, default=0\n Controls output verbosity.\n\n return_path : bool, default=True\n If ``return_path==True`` returns the entire path, else returns only the\n last point of the path.\n\n return_n_iter : bool, default=False\n Whether to return the number of iterations.\n\n positive : bool, default=False\n Restrict coefficients to be >= 0.\n This option is only allowed with method 'lasso'. Note that the model\n coefficients will not converge to the ordinary-least-squares solution\n for small values of alpha. Only coefficients up to the smallest alpha\n value (``alphas_[alphas_ > 0.].min()`` when fit_path=True) reached by\n the stepwise Lars-Lasso algorithm are typically in congruence with the\n solution of the coordinate descent lasso_path function.\n\n Returns\n -------\n alphas : array-like of shape (n_alphas + 1,)\n Maximum of covariances (in absolute value) at each iteration.\n ``n_alphas`` is either ``max_iter``, ``n_features`` or the\n number of nodes in the path with ``alpha >= alpha_min``, whichever\n is smaller.\n\n active : array-like of shape (n_alphas,)\n Indices of active variables at the end of the path.\n\n coefs : array-like of shape (n_features, n_alphas + 1)\n Coefficients along the path\n\n n_iter : int\n Number of iterations run. Returned only if return_n_iter is set\n to True.\n\n See Also\n --------\n lasso_path\n LassoLars\n Lars\n LassoLarsCV\n LarsCV\n sklearn.decomposition.sparse_encode\n\n References\n ----------\n .. [1] \"Least Angle Regression\", Efron et al.\n http://statweb.stanford.edu/~tibs/ftp/lars.pdf\n\n .. [2] `Wikipedia entry on the Least-angle regression\n `_\n\n .. 
[3] `Wikipedia entry on the Lasso\n `_\n\n \"\"\"\n if method == 'lar' and positive:\n raise ValueError(\"Positive constraint not supported for 'lar' coding method.\")\n n_samples = n_samples if n_samples is not None else y.size\n if Xy is None:\n Cov = np.dot(X.T, y)\n else:\n Cov = Xy.copy()\n if Gram is None or Gram is False:\n Gram = None\n if X is None:\n raise ValueError('X and Gram cannot both be unspecified.')\n elif isinstance(Gram, str) and Gram == 'auto' or Gram is True:\n if Gram is True or X.shape[0] > X.shape[1]:\n Gram = np.dot(X.T, X)\n else:\n Gram = None\n elif copy_Gram:\n Gram = Gram.copy()\n if Gram is None:\n n_features = X.shape[1]\n else:\n n_features = Cov.shape[0]\n if Gram.shape != (n_features, n_features):\n raise ValueError('The shapes of the inputs Gram and Xy do not match.')\n if copy_X and X is not None and Gram is None:\n X = X.copy('F')\n max_features = min(max_iter, n_features)\n dtypes = set((a.dtype for a in (X, y, Xy, Gram) if a is not None))\n if len(dtypes) == 1:\n return_dtype = next(iter(dtypes))\n else:\n return_dtype = np.float64\n if return_path:\n coefs = np.zeros((max_features + 1, n_features), dtype=return_dtype)\n alphas = np.zeros(max_features + 1, dtype=return_dtype)\n else:\n (coef, prev_coef) = (np.zeros(n_features, dtype=return_dtype), np.zeros(n_features, dtype=return_dtype))\n (alpha, prev_alpha) = (np.array([0.0], dtype=return_dtype), np.array([0.0], dtype=return_dtype))\n (n_iter, n_active) = (0, 0)\n (active, indices) = (list(), np.arange(n_features))\n sign_active = np.empty(max_features, dtype=np.int8)\n drop = False\n if Gram is None:\n L = np.empty((max_features, max_features), dtype=X.dtype)\n (swap, nrm2) = linalg.get_blas_funcs(('swap', 'nrm2'), (X, ))\n else:\n L = np.empty((max_features, max_features), dtype=Gram.dtype)\n (swap, nrm2) = linalg.get_blas_funcs(('swap', 'nrm2'), (Cov, ))\n (solve_cholesky, ) = get_lapack_funcs(('potrs', ), (L, ))\n if verbose:\n if verbose > 1:\n print('Step\\t\\tAdded\\t\\tDropped\\t\\tActive set size\\t\\tC')\n else:\n sys.stdout.write('.')\n sys.stdout.flush()\n tiny32 = np.finfo(np.float32).tiny\n cov_precision = np.finfo(Cov.dtype).precision\n equality_tolerance = np.finfo(np.float32).eps\n if Gram is not None:\n Gram_copy = Gram.copy()\n Cov_copy = Cov.copy()\n while True:\n if Cov.size:\n if positive:\n C_idx = np.argmax(Cov)\n else:\n C_idx = np.argmax(np.abs(Cov))\n C_ = Cov[C_idx]\n if positive:\n C = C_\n else:\n C = np.fabs(C_)\n else:\n C = 0.0\n if return_path:\n alpha = alphas[n_iter, np.newaxis]\n coef = coefs[n_iter]\n prev_alpha = alphas[n_iter - 1, np.newaxis]\n prev_coef = coefs[n_iter - 1]\n alpha[0] = C / n_samples\n if alpha[0] <= alpha_min + equality_tolerance:\n if abs(alpha[0] - alpha_min) > equality_tolerance:\n if n_iter > 0:\n ss = (prev_alpha[0] - alpha_min) / (prev_alpha[0] - alpha[0])\n coef[:] = prev_coef + ss * (coef - prev_coef)\n alpha[0] = alpha_min\n if return_path:\n coefs[n_iter] = coef\n break\n if n_iter >= max_iter or n_active >= n_features:\n break\n if not drop:\n if positive:\n sign_active[n_active] = np.ones_like(C_)\n else:\n sign_active[n_active] = np.sign(C_)\n (m, n) = (n_active, C_idx + n_active)\n (Cov[C_idx], Cov[0]) = swap(Cov[C_idx], Cov[0])\n (indices[n], indices[m]) = (indices[m], indices[n])\n Cov_not_shortened = Cov\n Cov = Cov[1:]\n if Gram is None:\n (X.T[n], X.T[m]) = swap(X.T[n], X.T[m])\n c = nrm2(X.T[n_active])**2\n L[n_active, :n_active] = np.dot(X.T[n_active], X.T[:n_active].T)\n else:\n (Gram[m], Gram[n]) = swap(Gram[m], 
Gram[n])\n (Gram[:, m], Gram[:, n]) = swap(Gram[:, m], Gram[:, n])\n c = Gram[n_active, n_active]\n L[n_active, :n_active] = Gram[n_active, :n_active]\n if n_active:\n linalg.solve_triangular(L[:n_active, :n_active], L[n_active, :n_active], trans=0, lower=1, overwrite_b=True, **SOLVE_TRIANGULAR_ARGS)\n v = np.dot(L[n_active, :n_active], L[n_active, :n_active])\n diag = max(np.sqrt(np.abs(c - v)), eps)\n L[n_active, n_active] = diag\n if diag < 1e-07:\n warnings.warn('Regressors in active set degenerate. Dropping a regressor, after %i iterations, i.e. alpha=%.3e, with an active set of %i regressors, and the smallest cholesky pivot element being %.3e. Reduce max_iter or increase eps parameters.' % (n_iter, alpha, n_active, diag), ConvergenceWarning)\n Cov = Cov_not_shortened\n Cov[0] = 0\n (Cov[C_idx], Cov[0]) = swap(Cov[C_idx], Cov[0])\n continue\n active.append(indices[n_active])\n n_active += 1\n if verbose > 1:\n print('%s\\t\\t%s\\t\\t%s\\t\\t%s\\t\\t%s' % (n_iter, active[-1], '', n_active, C))\n if method == 'lasso' and n_iter > 0 and prev_alpha[0] < alpha[0]:\n warnings.warn('Early stopping the lars path, as the residues are small and the current value of alpha is no longer well controlled. %i iterations, alpha=%.3e, previous alpha=%.3e, with an active set of %i regressors.' % (n_iter, alpha, prev_alpha, n_active), ConvergenceWarning)\n break\n (least_squares, _) = solve_cholesky(L[:n_active, :n_active], sign_active[:n_active], lower=True)\n if least_squares.size == 1 and least_squares == 0:\n least_squares[...] = 1\n AA = 1.0\n else:\n AA = 1.0 / np.sqrt(np.sum(least_squares * sign_active[:n_active]))\n if not np.isfinite(AA):\n i = 0\n L_ = L[:n_active, :n_active].copy()\n while not np.isfinite(AA):\n L_.flat[::n_active + 1] += 2**i * eps\n (least_squares, _) = solve_cholesky(L_, sign_active[:n_active], lower=True)\n tmp = max(np.sum(least_squares * sign_active[:n_active]), eps)\n AA = 1.0 / np.sqrt(tmp)\n i += 1\n least_squares *= AA\n if Gram is None:\n eq_dir = np.dot(X.T[:n_active].T, least_squares)\n corr_eq_dir = np.dot(X.T[n_active:], eq_dir)\n else:\n corr_eq_dir = np.dot(Gram[:n_active, n_active:].T, least_squares)\n np.around(corr_eq_dir, decimals=cov_precision, out=corr_eq_dir)\n g1 = arrayfuncs.min_pos((C - Cov) / (AA - corr_eq_dir + tiny32))\n if positive:\n gamma_ = min(g1, C / AA)\n else:\n g2 = arrayfuncs.min_pos((C + Cov) / (AA + corr_eq_dir + tiny32))\n gamma_ = min(g1, g2, C / AA)\n drop = False\n z = -coef[active] / (least_squares + tiny32)\n z_pos = arrayfuncs.min_pos(z)\n if z_pos < gamma_:\n idx = np.where(z == z_pos)[0][::-1]\n sign_active[idx] = -sign_active[idx]\n if method == 'lasso':\n gamma_ = z_pos\n drop = True\n n_iter += 1\n if return_path:\n if n_iter >= coefs.shape[0]:\n del coef, alpha, prev_alpha, prev_coef\n add_features = 2 * max(1, max_features - n_active)\n coefs = np.resize(coefs, (n_iter + add_features, n_features))\n coefs[-add_features:] = 0\n alphas = np.resize(alphas, n_iter + add_features)\n alphas[-add_features:] = 0\n coef = coefs[n_iter]\n prev_coef = coefs[n_iter - 1]\n else:\n prev_coef = coef\n prev_alpha[0] = alpha[0]\n coef = np.zeros_like(coef)\n coef[active] = prev_coef[active] + gamma_ * least_squares\n Cov -= gamma_ * corr_eq_dir\n if drop and method == 'lasso':\n for ii in idx:\n arrayfuncs.cholesky_delete(L[:n_active, :n_active], ii)\n n_active -= 1\n drop_idx = [active.pop(ii) for ii in idx]\n if Gram is None:\n for ii in idx:\n for i in range(ii, n_active):\n (X.T[i], X.T[i + 1]) = swap(X.T[i], X.T[i + 1])\n 
(indices[i], indices[i + 1]) = (indices[i + 1], indices[i])\n residual = y - np.dot(X[:, :n_active], coef[active])\n temp = np.dot(X.T[n_active], residual)\n Cov = np.r_[temp, Cov]\n else:\n for ii in idx:\n for i in range(ii, n_active):\n (indices[i], indices[i + 1]) = (indices[i + 1], indices[i])\n (Gram[i], Gram[i + 1]) = swap(Gram[i], Gram[i + 1])\n (Gram[:, i], Gram[:, i + 1]) = swap(Gram[:, i], Gram[:, i + 1])\n temp = Cov_copy[drop_idx] - np.dot(Gram_copy[drop_idx], coef)\n Cov = np.r_[temp, Cov]\n sign_active = np.delete(sign_active, idx)\n sign_active = np.append(sign_active, 0.0)\n if verbose > 1:\n print('%s\\t\\t%s\\t\\t%s\\t\\t%s\\t\\t%s' % (n_iter, '', drop_idx, n_active, abs(temp)))\n if return_path:\n alphas = alphas[:n_iter + 1]\n coefs = coefs[:n_iter + 1]\n if return_n_iter:\n return alphas, active, coefs.T, n_iter\n else:\n return alphas, active, coefs.T\n elif return_n_iter:\n return alpha, active, coef, n_iter\n else:\n return alpha, active, coef" }, { @@ -102372,7 +109344,8 @@ "docstring": { "type": "None or array-like of shape (n_samples, n_features)", "description": "Input data. Note that if X is None then the Gram matrix must be\nspecified, i.e., cannot be None or False." - } + }, + "refined_type": {} }, { "name": "y", @@ -102382,7 +109355,8 @@ "docstring": { "type": "None or array-like of shape (n_samples,)", "description": "Input targets." - } + }, + "refined_type": {} }, { "name": "Xy", @@ -102392,7 +109366,8 @@ "docstring": { "type": "array-like of shape (n_samples,) or (n_samples, n_targets), default=None", "description": "Xy = np.dot(X.T, y) that can be precomputed. It is useful\nonly when the Gram matrix is precomputed." - } + }, + "refined_type": {} }, { "name": "Gram", @@ -102402,7 +109377,8 @@ "docstring": { "type": "None, 'auto', array-like of shape (n_features, n_features), default=None", "description": "Precomputed Gram matrix (X' * X), if ``'auto'``, the Gram\nmatrix is precomputed from the given X, if there are more samples\nthan features." - } + }, + "refined_type": {} }, { "name": "max_iter", @@ -102412,7 +109388,8 @@ "docstring": { "type": "int, default=500", "description": "Maximum number of iterations to perform, set to infinity for no limit." - } + }, + "refined_type": {} }, { "name": "alpha_min", @@ -102422,7 +109399,8 @@ "docstring": { "type": "float, default=0", "description": "Minimum correlation along the path. It corresponds to the\nregularization parameter alpha parameter in the Lasso." - } + }, + "refined_type": {} }, { "name": "method", @@ -102432,6 +109410,10 @@ "docstring": { "type": "{'lar', 'lasso'}, default='lar'", "description": "Specifies the returned model. Select ``'lar'`` for Least Angle\nRegression, ``'lasso'`` for the Lasso." + }, + "refined_type": { + "kind": "EnumType", + "values": ["lasso", "lar"] } }, { @@ -102442,7 +109424,8 @@ "docstring": { "type": "bool, default=True", "description": "If ``False``, ``X`` is overwritten." - } + }, + "refined_type": {} }, { "name": "eps", @@ -102452,7 +109435,8 @@ "docstring": { "type": "float, default=np.finfo(float).eps", "description": "The machine-precision regularization in the computation of the\nCholesky diagonal factors. Increase this for very ill-conditioned\nsystems. Unlike the ``tol`` parameter in some iterative\noptimization-based algorithms, this parameter does not control\nthe tolerance of the optimization." 
- } + }, + "refined_type": {} }, { "name": "copy_Gram", @@ -102462,7 +109446,8 @@ "docstring": { "type": "bool, default=True", "description": "If ``False``, ``Gram`` is overwritten." - } + }, + "refined_type": {} }, { "name": "verbose", @@ -102472,7 +109457,8 @@ "docstring": { "type": "int, default=0", "description": "Controls output verbosity." - } + }, + "refined_type": {} }, { "name": "return_path", @@ -102482,7 +109468,8 @@ "docstring": { "type": "bool, default=True", "description": "If ``return_path==True`` returns the entire path, else returns only the\nlast point of the path." - } + }, + "refined_type": {} }, { "name": "return_n_iter", @@ -102492,7 +109479,8 @@ "docstring": { "type": "bool, default=False", "description": "Whether to return the number of iterations." - } + }, + "refined_type": {} }, { "name": "positive", @@ -102502,13 +109490,14 @@ "docstring": { "type": "bool, default=False", "description": "Restrict coefficients to be >= 0.\nThis option is only allowed with method 'lasso'. Note that the model\ncoefficients will not converge to the ordinary-least-squares solution\nfor small values of alpha. Only coefficients up to the smallest alpha\nvalue (``alphas_[alphas_ > 0.].min()`` when fit_path=True) reached by\nthe stepwise Lars-Lasso algorithm are typically in congruence with the\nsolution of the coordinate descent lasso_path function." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Compute Least Angle Regression or Lasso path using LARS algorithm [1]\n\nThe optimization objective for the case method='lasso' is:: (1 / (2 * n_samples)) * ||y - Xw||^2_2 + alpha * ||w||_1 in the case of method='lars', the objective function is only known in the form of an implicit equation (see discussion in [1]) Read more in the :ref:`User Guide `.", - "docstring": "Compute Least Angle Regression or Lasso path using LARS algorithm [1]\n\nThe optimization objective for the case method='lasso' is::\n\n(1 / (2 * n_samples)) * ||y - Xw||^2_2 + alpha * ||w||_1\n\nin the case of method='lars', the objective function is only known in\nthe form of an implicit equation (see discussion in [1])\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\nX : None or array-like of shape (n_samples, n_features)\n Input data. Note that if X is None then the Gram matrix must be\n specified, i.e., cannot be None or False.\n\ny : None or array-like of shape (n_samples,)\n Input targets.\n\nXy : array-like of shape (n_samples,) or (n_samples, n_targets), default=None\n Xy = np.dot(X.T, y) that can be precomputed. It is useful\n only when the Gram matrix is precomputed.\n\nGram : None, 'auto', array-like of shape (n_features, n_features), default=None\n Precomputed Gram matrix (X' * X), if ``'auto'``, the Gram\n matrix is precomputed from the given X, if there are more samples\n than features.\n\nmax_iter : int, default=500\n Maximum number of iterations to perform, set to infinity for no limit.\n\nalpha_min : float, default=0\n Minimum correlation along the path. It corresponds to the\n regularization parameter alpha parameter in the Lasso.\n\nmethod : {'lar', 'lasso'}, default='lar'\n Specifies the returned model. Select ``'lar'`` for Least Angle\n Regression, ``'lasso'`` for the Lasso.\n\ncopy_X : bool, default=True\n If ``False``, ``X`` is overwritten.\n\neps : float, default=np.finfo(float).eps\n The machine-precision regularization in the computation of the\n Cholesky diagonal factors. Increase this for very ill-conditioned\n systems. 
Unlike the ``tol`` parameter in some iterative\n optimization-based algorithms, this parameter does not control\n the tolerance of the optimization.\n\ncopy_Gram : bool, default=True\n If ``False``, ``Gram`` is overwritten.\n\nverbose : int, default=0\n Controls output verbosity.\n\nreturn_path : bool, default=True\n If ``return_path==True`` returns the entire path, else returns only the\n last point of the path.\n\nreturn_n_iter : bool, default=False\n Whether to return the number of iterations.\n\npositive : bool, default=False\n Restrict coefficients to be >= 0.\n This option is only allowed with method 'lasso'. Note that the model\n coefficients will not converge to the ordinary-least-squares solution\n for small values of alpha. Only coefficients up to the smallest alpha\n value (``alphas_[alphas_ > 0.].min()`` when fit_path=True) reached by\n the stepwise Lars-Lasso algorithm are typically in congruence with the\n solution of the coordinate descent lasso_path function.\n\nReturns\n-------\nalphas : array-like of shape (n_alphas + 1,)\n Maximum of covariances (in absolute value) at each iteration.\n ``n_alphas`` is either ``max_iter``, ``n_features`` or the\n number of nodes in the path with ``alpha >= alpha_min``, whichever\n is smaller.\n\nactive : array-like of shape (n_alphas,)\n Indices of active variables at the end of the path.\n\ncoefs : array-like of shape (n_features, n_alphas + 1)\n Coefficients along the path\n\nn_iter : int\n Number of iterations run. Returned only if return_n_iter is set\n to True.\n\nSee Also\n--------\nlars_path_gram\nlasso_path\nlasso_path_gram\nLassoLars\nLars\nLassoLarsCV\nLarsCV\nsklearn.decomposition.sparse_encode\n\nReferences\n----------\n.. [1] \"Least Angle Regression\", Efron et al.\n http://statweb.stanford.edu/~tibs/ftp/lars.pdf\n\n.. [2] `Wikipedia entry on the Least-angle regression\n `_\n\n.. [3] `Wikipedia entry on the Lasso\n `_", + "description": "Compute Least Angle Regression or Lasso path using LARS algorithm [1]\n\nThe optimization objective for the case method='lasso' is::\n\n(1 / (2 * n_samples)) * ||y - Xw||^2_2 + alpha * ||w||_1\n\nin the case of method='lars', the objective function is only known in\nthe form of an implicit equation (see discussion in [1])\n\nRead more in the :ref:`User Guide `.", + "docstring": "Compute Least Angle Regression or Lasso path using LARS algorithm [1]\n\n The optimization objective for the case method='lasso' is::\n\n (1 / (2 * n_samples)) * ||y - Xw||^2_2 + alpha * ||w||_1\n\n in the case of method='lars', the objective function is only known in\n the form of an implicit equation (see discussion in [1])\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n X : None or array-like of shape (n_samples, n_features)\n Input data. Note that if X is None then the Gram matrix must be\n specified, i.e., cannot be None or False.\n\n y : None or array-like of shape (n_samples,)\n Input targets.\n\n Xy : array-like of shape (n_samples,) or (n_samples, n_targets), default=None\n Xy = np.dot(X.T, y) that can be precomputed. It is useful\n only when the Gram matrix is precomputed.\n\n Gram : None, 'auto', array-like of shape (n_features, n_features), default=None\n Precomputed Gram matrix (X' * X), if ``'auto'``, the Gram\n matrix is precomputed from the given X, if there are more samples\n than features.\n\n max_iter : int, default=500\n Maximum number of iterations to perform, set to infinity for no limit.\n\n alpha_min : float, default=0\n Minimum correlation along the path. 
It corresponds to the\n regularization parameter alpha parameter in the Lasso.\n\n method : {'lar', 'lasso'}, default='lar'\n Specifies the returned model. Select ``'lar'`` for Least Angle\n Regression, ``'lasso'`` for the Lasso.\n\n copy_X : bool, default=True\n If ``False``, ``X`` is overwritten.\n\n eps : float, default=np.finfo(float).eps\n The machine-precision regularization in the computation of the\n Cholesky diagonal factors. Increase this for very ill-conditioned\n systems. Unlike the ``tol`` parameter in some iterative\n optimization-based algorithms, this parameter does not control\n the tolerance of the optimization.\n\n copy_Gram : bool, default=True\n If ``False``, ``Gram`` is overwritten.\n\n verbose : int, default=0\n Controls output verbosity.\n\n return_path : bool, default=True\n If ``return_path==True`` returns the entire path, else returns only the\n last point of the path.\n\n return_n_iter : bool, default=False\n Whether to return the number of iterations.\n\n positive : bool, default=False\n Restrict coefficients to be >= 0.\n This option is only allowed with method 'lasso'. Note that the model\n coefficients will not converge to the ordinary-least-squares solution\n for small values of alpha. Only coefficients up to the smallest alpha\n value (``alphas_[alphas_ > 0.].min()`` when fit_path=True) reached by\n the stepwise Lars-Lasso algorithm are typically in congruence with the\n solution of the coordinate descent lasso_path function.\n\n Returns\n -------\n alphas : array-like of shape (n_alphas + 1,)\n Maximum of covariances (in absolute value) at each iteration.\n ``n_alphas`` is either ``max_iter``, ``n_features`` or the\n number of nodes in the path with ``alpha >= alpha_min``, whichever\n is smaller.\n\n active : array-like of shape (n_alphas,)\n Indices of active variables at the end of the path.\n\n coefs : array-like of shape (n_features, n_alphas + 1)\n Coefficients along the path\n\n n_iter : int\n Number of iterations run. Returned only if return_n_iter is set\n to True.\n\n See Also\n --------\n lars_path_gram\n lasso_path\n lasso_path_gram\n LassoLars\n Lars\n LassoLarsCV\n LarsCV\n sklearn.decomposition.sparse_encode\n\n References\n ----------\n .. [1] \"Least Angle Regression\", Efron et al.\n http://statweb.stanford.edu/~tibs/ftp/lars.pdf\n\n .. [2] `Wikipedia entry on the Least-angle regression\n `_\n\n .. [3] `Wikipedia entry on the Lasso\n `_\n\n ", "source_code": "\ndef lars_path(X, y, Xy=None, *, Gram=None, max_iter=500, alpha_min=0, method='lar', copy_X=True, eps=np.finfo(float).eps, copy_Gram=True, verbose=0, return_path=True, return_n_iter=False, positive=False):\n \"\"\"Compute Least Angle Regression or Lasso path using LARS algorithm [1]\n\n The optimization objective for the case method='lasso' is::\n\n (1 / (2 * n_samples)) * ||y - Xw||^2_2 + alpha * ||w||_1\n\n in the case of method='lars', the objective function is only known in\n the form of an implicit equation (see discussion in [1])\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n X : None or array-like of shape (n_samples, n_features)\n Input data. Note that if X is None then the Gram matrix must be\n specified, i.e., cannot be None or False.\n\n y : None or array-like of shape (n_samples,)\n Input targets.\n\n Xy : array-like of shape (n_samples,) or (n_samples, n_targets), default=None\n Xy = np.dot(X.T, y) that can be precomputed. 
It is useful\n only when the Gram matrix is precomputed.\n\n Gram : None, 'auto', array-like of shape (n_features, n_features), default=None\n Precomputed Gram matrix (X' * X), if ``'auto'``, the Gram\n matrix is precomputed from the given X, if there are more samples\n than features.\n\n max_iter : int, default=500\n Maximum number of iterations to perform, set to infinity for no limit.\n\n alpha_min : float, default=0\n Minimum correlation along the path. It corresponds to the\n regularization parameter alpha parameter in the Lasso.\n\n method : {'lar', 'lasso'}, default='lar'\n Specifies the returned model. Select ``'lar'`` for Least Angle\n Regression, ``'lasso'`` for the Lasso.\n\n copy_X : bool, default=True\n If ``False``, ``X`` is overwritten.\n\n eps : float, default=np.finfo(float).eps\n The machine-precision regularization in the computation of the\n Cholesky diagonal factors. Increase this for very ill-conditioned\n systems. Unlike the ``tol`` parameter in some iterative\n optimization-based algorithms, this parameter does not control\n the tolerance of the optimization.\n\n copy_Gram : bool, default=True\n If ``False``, ``Gram`` is overwritten.\n\n verbose : int, default=0\n Controls output verbosity.\n\n return_path : bool, default=True\n If ``return_path==True`` returns the entire path, else returns only the\n last point of the path.\n\n return_n_iter : bool, default=False\n Whether to return the number of iterations.\n\n positive : bool, default=False\n Restrict coefficients to be >= 0.\n This option is only allowed with method 'lasso'. Note that the model\n coefficients will not converge to the ordinary-least-squares solution\n for small values of alpha. Only coefficients up to the smallest alpha\n value (``alphas_[alphas_ > 0.].min()`` when fit_path=True) reached by\n the stepwise Lars-Lasso algorithm are typically in congruence with the\n solution of the coordinate descent lasso_path function.\n\n Returns\n -------\n alphas : array-like of shape (n_alphas + 1,)\n Maximum of covariances (in absolute value) at each iteration.\n ``n_alphas`` is either ``max_iter``, ``n_features`` or the\n number of nodes in the path with ``alpha >= alpha_min``, whichever\n is smaller.\n\n active : array-like of shape (n_alphas,)\n Indices of active variables at the end of the path.\n\n coefs : array-like of shape (n_features, n_alphas + 1)\n Coefficients along the path\n\n n_iter : int\n Number of iterations run. Returned only if return_n_iter is set\n to True.\n\n See Also\n --------\n lars_path_gram\n lasso_path\n lasso_path_gram\n LassoLars\n Lars\n LassoLarsCV\n LarsCV\n sklearn.decomposition.sparse_encode\n\n References\n ----------\n .. [1] \"Least Angle Regression\", Efron et al.\n http://statweb.stanford.edu/~tibs/ftp/lars.pdf\n\n .. [2] `Wikipedia entry on the Least-angle regression\n `_\n\n .. [3] `Wikipedia entry on the Lasso\n `_\n\n \"\"\"\n if X is None and Gram is not None:\n raise ValueError('X cannot be None if Gram is not NoneUse lars_path_gram to avoid passing X and y.')\n return _lars_path_solver(X=X, y=y, Xy=Xy, Gram=Gram, n_samples=None, max_iter=max_iter, alpha_min=alpha_min, method=method, copy_X=copy_X, eps=eps, copy_Gram=copy_Gram, verbose=verbose, return_path=return_path, return_n_iter=return_n_iter, positive=positive)" }, { @@ -102526,7 +109515,8 @@ "docstring": { "type": "array-like of shape (n_samples,) or (n_samples, n_targets)", "description": "Xy = np.dot(X.T, y)." 
- } + }, + "refined_type": {} }, { "name": "Gram", @@ -102536,7 +109526,8 @@ "docstring": { "type": "array-like of shape (n_features, n_features)", "description": "Gram = np.dot(X.T * X)." - } + }, + "refined_type": {} }, { "name": "n_samples", @@ -102546,7 +109537,8 @@ "docstring": { "type": "int or float", "description": "Equivalent size of sample." - } + }, + "refined_type": {} }, { "name": "max_iter", @@ -102556,7 +109548,8 @@ "docstring": { "type": "int, default=500", "description": "Maximum number of iterations to perform, set to infinity for no limit." - } + }, + "refined_type": {} }, { "name": "alpha_min", @@ -102566,7 +109559,8 @@ "docstring": { "type": "float, default=0", "description": "Minimum correlation along the path. It corresponds to the\nregularization parameter alpha parameter in the Lasso." - } + }, + "refined_type": {} }, { "name": "method", @@ -102576,6 +109570,10 @@ "docstring": { "type": "{'lar', 'lasso'}, default='lar'", "description": "Specifies the returned model. Select ``'lar'`` for Least Angle\nRegression, ``'lasso'`` for the Lasso." + }, + "refined_type": { + "kind": "EnumType", + "values": ["lasso", "lar"] } }, { @@ -102586,7 +109584,8 @@ "docstring": { "type": "bool, default=True", "description": "If ``False``, ``X`` is overwritten." - } + }, + "refined_type": {} }, { "name": "eps", @@ -102596,7 +109595,8 @@ "docstring": { "type": "float, default=np.finfo(float).eps", "description": "The machine-precision regularization in the computation of the\nCholesky diagonal factors. Increase this for very ill-conditioned\nsystems. Unlike the ``tol`` parameter in some iterative\noptimization-based algorithms, this parameter does not control\nthe tolerance of the optimization." - } + }, + "refined_type": {} }, { "name": "copy_Gram", @@ -102606,7 +109606,8 @@ "docstring": { "type": "bool, default=True", "description": "If ``False``, ``Gram`` is overwritten." - } + }, + "refined_type": {} }, { "name": "verbose", @@ -102616,7 +109617,8 @@ "docstring": { "type": "int, default=0", "description": "Controls output verbosity." - } + }, + "refined_type": {} }, { "name": "return_path", @@ -102626,7 +109628,8 @@ "docstring": { "type": "bool, default=True", "description": "If ``return_path==True`` returns the entire path, else returns only the\nlast point of the path." - } + }, + "refined_type": {} }, { "name": "return_n_iter", @@ -102636,7 +109639,8 @@ "docstring": { "type": "bool, default=False", "description": "Whether to return the number of iterations." - } + }, + "refined_type": {} }, { "name": "positive", @@ -102646,13 +109650,14 @@ "docstring": { "type": "bool, default=False", "description": "Restrict coefficients to be >= 0.\nThis option is only allowed with method 'lasso'. Note that the model\ncoefficients will not converge to the ordinary-least-squares solution\nfor small values of alpha. Only coefficients up to the smallest alpha\nvalue (``alphas_[alphas_ > 0.].min()`` when fit_path=True) reached by\nthe stepwise Lars-Lasso algorithm are typically in congruence with the\nsolution of the coordinate descent lasso_path function." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "lars_path in the sufficient stats mode [1]\n\nThe optimization objective for the case method='lasso' is:: (1 / (2 * n_samples)) * ||y - Xw||^2_2 + alpha * ||w||_1 in the case of method='lars', the objective function is only known in the form of an implicit equation (see discussion in [1]) Read more in the :ref:`User Guide `.", - "docstring": "lars_path in the sufficient stats mode [1]\n\nThe optimization objective for the case method='lasso' is::\n\n(1 / (2 * n_samples)) * ||y - Xw||^2_2 + alpha * ||w||_1\n\nin the case of method='lars', the objective function is only known in\nthe form of an implicit equation (see discussion in [1])\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\nXy : array-like of shape (n_samples,) or (n_samples, n_targets)\n Xy = np.dot(X.T, y).\n\nGram : array-like of shape (n_features, n_features)\n Gram = np.dot(X.T * X).\n\nn_samples : int or float\n Equivalent size of sample.\n\nmax_iter : int, default=500\n Maximum number of iterations to perform, set to infinity for no limit.\n\nalpha_min : float, default=0\n Minimum correlation along the path. It corresponds to the\n regularization parameter alpha parameter in the Lasso.\n\nmethod : {'lar', 'lasso'}, default='lar'\n Specifies the returned model. Select ``'lar'`` for Least Angle\n Regression, ``'lasso'`` for the Lasso.\n\ncopy_X : bool, default=True\n If ``False``, ``X`` is overwritten.\n\neps : float, default=np.finfo(float).eps\n The machine-precision regularization in the computation of the\n Cholesky diagonal factors. Increase this for very ill-conditioned\n systems. Unlike the ``tol`` parameter in some iterative\n optimization-based algorithms, this parameter does not control\n the tolerance of the optimization.\n\ncopy_Gram : bool, default=True\n If ``False``, ``Gram`` is overwritten.\n\nverbose : int, default=0\n Controls output verbosity.\n\nreturn_path : bool, default=True\n If ``return_path==True`` returns the entire path, else returns only the\n last point of the path.\n\nreturn_n_iter : bool, default=False\n Whether to return the number of iterations.\n\npositive : bool, default=False\n Restrict coefficients to be >= 0.\n This option is only allowed with method 'lasso'. Note that the model\n coefficients will not converge to the ordinary-least-squares solution\n for small values of alpha. Only coefficients up to the smallest alpha\n value (``alphas_[alphas_ > 0.].min()`` when fit_path=True) reached by\n the stepwise Lars-Lasso algorithm are typically in congruence with the\n solution of the coordinate descent lasso_path function.\n\nReturns\n-------\nalphas : array-like of shape (n_alphas + 1,)\n Maximum of covariances (in absolute value) at each iteration.\n ``n_alphas`` is either ``max_iter``, ``n_features`` or the\n number of nodes in the path with ``alpha >= alpha_min``, whichever\n is smaller.\n\nactive : array-like of shape (n_alphas,)\n Indices of active variables at the end of the path.\n\ncoefs : array-like of shape (n_features, n_alphas + 1)\n Coefficients along the path\n\nn_iter : int\n Number of iterations run. Returned only if return_n_iter is set\n to True.\n\nSee Also\n--------\nlars_path\nlasso_path\nlasso_path_gram\nLassoLars\nLars\nLassoLarsCV\nLarsCV\nsklearn.decomposition.sparse_encode\n\nReferences\n----------\n.. [1] \"Least Angle Regression\", Efron et al.\n http://statweb.stanford.edu/~tibs/ftp/lars.pdf\n\n.. [2] `Wikipedia entry on the Least-angle regression\n `_\n\n.. 
[3] `Wikipedia entry on the Lasso\n `_", + "description": "lars_path in the sufficient stats mode [1]\n\nThe optimization objective for the case method='lasso' is::\n\n(1 / (2 * n_samples)) * ||y - Xw||^2_2 + alpha * ||w||_1\n\nin the case of method='lars', the objective function is only known in\nthe form of an implicit equation (see discussion in [1])\n\nRead more in the :ref:`User Guide `.", + "docstring": "lars_path in the sufficient stats mode [1]\n\n The optimization objective for the case method='lasso' is::\n\n (1 / (2 * n_samples)) * ||y - Xw||^2_2 + alpha * ||w||_1\n\n in the case of method='lars', the objective function is only known in\n the form of an implicit equation (see discussion in [1])\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n Xy : array-like of shape (n_samples,) or (n_samples, n_targets)\n Xy = np.dot(X.T, y).\n\n Gram : array-like of shape (n_features, n_features)\n Gram = np.dot(X.T * X).\n\n n_samples : int or float\n Equivalent size of sample.\n\n max_iter : int, default=500\n Maximum number of iterations to perform, set to infinity for no limit.\n\n alpha_min : float, default=0\n Minimum correlation along the path. It corresponds to the\n regularization parameter alpha parameter in the Lasso.\n\n method : {'lar', 'lasso'}, default='lar'\n Specifies the returned model. Select ``'lar'`` for Least Angle\n Regression, ``'lasso'`` for the Lasso.\n\n copy_X : bool, default=True\n If ``False``, ``X`` is overwritten.\n\n eps : float, default=np.finfo(float).eps\n The machine-precision regularization in the computation of the\n Cholesky diagonal factors. Increase this for very ill-conditioned\n systems. Unlike the ``tol`` parameter in some iterative\n optimization-based algorithms, this parameter does not control\n the tolerance of the optimization.\n\n copy_Gram : bool, default=True\n If ``False``, ``Gram`` is overwritten.\n\n verbose : int, default=0\n Controls output verbosity.\n\n return_path : bool, default=True\n If ``return_path==True`` returns the entire path, else returns only the\n last point of the path.\n\n return_n_iter : bool, default=False\n Whether to return the number of iterations.\n\n positive : bool, default=False\n Restrict coefficients to be >= 0.\n This option is only allowed with method 'lasso'. Note that the model\n coefficients will not converge to the ordinary-least-squares solution\n for small values of alpha. Only coefficients up to the smallest alpha\n value (``alphas_[alphas_ > 0.].min()`` when fit_path=True) reached by\n the stepwise Lars-Lasso algorithm are typically in congruence with the\n solution of the coordinate descent lasso_path function.\n\n Returns\n -------\n alphas : array-like of shape (n_alphas + 1,)\n Maximum of covariances (in absolute value) at each iteration.\n ``n_alphas`` is either ``max_iter``, ``n_features`` or the\n number of nodes in the path with ``alpha >= alpha_min``, whichever\n is smaller.\n\n active : array-like of shape (n_alphas,)\n Indices of active variables at the end of the path.\n\n coefs : array-like of shape (n_features, n_alphas + 1)\n Coefficients along the path\n\n n_iter : int\n Number of iterations run. Returned only if return_n_iter is set\n to True.\n\n See Also\n --------\n lars_path\n lasso_path\n lasso_path_gram\n LassoLars\n Lars\n LassoLarsCV\n LarsCV\n sklearn.decomposition.sparse_encode\n\n References\n ----------\n .. [1] \"Least Angle Regression\", Efron et al.\n http://statweb.stanford.edu/~tibs/ftp/lars.pdf\n\n .. 
[2] `Wikipedia entry on the Least-angle regression\n `_\n\n .. [3] `Wikipedia entry on the Lasso\n `_\n\n ", "source_code": "\ndef lars_path_gram(Xy, Gram, *, n_samples, max_iter=500, alpha_min=0, method='lar', copy_X=True, eps=np.finfo(float).eps, copy_Gram=True, verbose=0, return_path=True, return_n_iter=False, positive=False):\n \"\"\"lars_path in the sufficient stats mode [1]\n\n The optimization objective for the case method='lasso' is::\n\n (1 / (2 * n_samples)) * ||y - Xw||^2_2 + alpha * ||w||_1\n\n in the case of method='lars', the objective function is only known in\n the form of an implicit equation (see discussion in [1])\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n Xy : array-like of shape (n_samples,) or (n_samples, n_targets)\n Xy = np.dot(X.T, y).\n\n Gram : array-like of shape (n_features, n_features)\n Gram = np.dot(X.T * X).\n\n n_samples : int or float\n Equivalent size of sample.\n\n max_iter : int, default=500\n Maximum number of iterations to perform, set to infinity for no limit.\n\n alpha_min : float, default=0\n Minimum correlation along the path. It corresponds to the\n regularization parameter alpha parameter in the Lasso.\n\n method : {'lar', 'lasso'}, default='lar'\n Specifies the returned model. Select ``'lar'`` for Least Angle\n Regression, ``'lasso'`` for the Lasso.\n\n copy_X : bool, default=True\n If ``False``, ``X`` is overwritten.\n\n eps : float, default=np.finfo(float).eps\n The machine-precision regularization in the computation of the\n Cholesky diagonal factors. Increase this for very ill-conditioned\n systems. Unlike the ``tol`` parameter in some iterative\n optimization-based algorithms, this parameter does not control\n the tolerance of the optimization.\n\n copy_Gram : bool, default=True\n If ``False``, ``Gram`` is overwritten.\n\n verbose : int, default=0\n Controls output verbosity.\n\n return_path : bool, default=True\n If ``return_path==True`` returns the entire path, else returns only the\n last point of the path.\n\n return_n_iter : bool, default=False\n Whether to return the number of iterations.\n\n positive : bool, default=False\n Restrict coefficients to be >= 0.\n This option is only allowed with method 'lasso'. Note that the model\n coefficients will not converge to the ordinary-least-squares solution\n for small values of alpha. Only coefficients up to the smallest alpha\n value (``alphas_[alphas_ > 0.].min()`` when fit_path=True) reached by\n the stepwise Lars-Lasso algorithm are typically in congruence with the\n solution of the coordinate descent lasso_path function.\n\n Returns\n -------\n alphas : array-like of shape (n_alphas + 1,)\n Maximum of covariances (in absolute value) at each iteration.\n ``n_alphas`` is either ``max_iter``, ``n_features`` or the\n number of nodes in the path with ``alpha >= alpha_min``, whichever\n is smaller.\n\n active : array-like of shape (n_alphas,)\n Indices of active variables at the end of the path.\n\n coefs : array-like of shape (n_features, n_alphas + 1)\n Coefficients along the path\n\n n_iter : int\n Number of iterations run. Returned only if return_n_iter is set\n to True.\n\n See Also\n --------\n lars_path\n lasso_path\n lasso_path_gram\n LassoLars\n Lars\n LassoLarsCV\n LarsCV\n sklearn.decomposition.sparse_encode\n\n References\n ----------\n .. [1] \"Least Angle Regression\", Efron et al.\n http://statweb.stanford.edu/~tibs/ftp/lars.pdf\n\n .. [2] `Wikipedia entry on the Least-angle regression\n `_\n\n .. 
[3] `Wikipedia entry on the Lasso\n `_\n\n \"\"\"\n return _lars_path_solver(X=None, y=None, Xy=Xy, Gram=Gram, n_samples=n_samples, max_iter=max_iter, alpha_min=alpha_min, method=method, copy_X=copy_X, eps=eps, copy_Gram=copy_Gram, verbose=verbose, return_path=return_path, return_n_iter=return_n_iter, positive=positive)" }, { @@ -102670,7 +109675,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "penalty", @@ -102680,6 +109686,10 @@ "docstring": { "type": "{'l1', 'l2', 'elasticnet', 'none'}, default='l2'", "description": "Specify the norm of the penalty:\n\n- `'none'`: no penalty is added;\n- `'l2'`: add a L2 penalty term and it is the default choice;\n- `'l1'`: add a L1 penalty term;\n- `'elasticnet'`: both L1 and L2 penalty terms are added.\n\n.. warning::\n Some penalties may not work with some solvers. See the parameter\n `solver` below, to know the compatibility between the penalty and\n solver.\n\n.. versionadded:: 0.19\n l1 penalty with SAGA solver (allowing 'multinomial' + L1)" + }, + "refined_type": { + "kind": "EnumType", + "values": ["none", "l2", "l1", "elasticnet"] } }, { @@ -102690,7 +109700,8 @@ "docstring": { "type": "bool, default=False", "description": "Dual or primal formulation. Dual formulation is only implemented for\nl2 penalty with liblinear solver. Prefer dual=False when\nn_samples > n_features." - } + }, + "refined_type": {} }, { "name": "tol", @@ -102700,7 +109711,8 @@ "docstring": { "type": "float, default=1e-4", "description": "Tolerance for stopping criteria." - } + }, + "refined_type": {} }, { "name": "C", @@ -102710,7 +109722,8 @@ "docstring": { "type": "float, default=1.0", "description": "Inverse of regularization strength; must be a positive float.\nLike in support vector machines, smaller values specify stronger\nregularization." - } + }, + "refined_type": {} }, { "name": "fit_intercept", @@ -102720,7 +109733,8 @@ "docstring": { "type": "bool, default=True", "description": "Specifies if a constant (a.k.a. bias or intercept) should be\nadded to the decision function." - } + }, + "refined_type": {} }, { "name": "intercept_scaling", @@ -102730,7 +109744,8 @@ "docstring": { "type": "float, default=1", "description": "Useful only when the solver 'liblinear' is used\nand self.fit_intercept is set to True. In this case, x becomes\n[x, self.intercept_scaling],\ni.e. a \"synthetic\" feature with constant value equal to\nintercept_scaling is appended to the instance vector.\nThe intercept becomes ``intercept_scaling * synthetic_feature_weight``.\n\nNote! the synthetic feature weight is subject to l1/l2 regularization\nas all other features.\nTo lessen the effect of regularization on synthetic feature weight\n(and therefore on the intercept) intercept_scaling has to be increased." - } + }, + "refined_type": {} }, { "name": "class_weight", @@ -102740,6 +109755,10 @@ "docstring": { "type": "dict or 'balanced', default=None", "description": "Weights associated with classes in the form ``{class_label: weight}``.\nIf not given, all classes are supposed to have weight one.\n\nThe \"balanced\" mode uses the values of y to automatically adjust\nweights inversely proportional to class frequencies in the input data\nas ``n_samples / (n_classes * np.bincount(y))``.\n\nNote that these weights will be multiplied with sample_weight (passed\nthrough the fit method) if sample_weight is specified.\n\n.. 
versionadded:: 0.17\n *class_weight='balanced'*" + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -102750,7 +109769,8 @@ "docstring": { "type": "int, RandomState instance, default=None", "description": "Used when ``solver`` == 'sag', 'saga' or 'liblinear' to shuffle the\ndata. See :term:`Glossary ` for details." - } + }, + "refined_type": {} }, { "name": "solver", @@ -102760,6 +109780,16 @@ "docstring": { "type": "{'newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'}, default='lbfgs'", "description": "Algorithm to use in the optimization problem. Default is 'lbfgs'.\nTo choose a solver, you might want to consider the following aspects:\n\n - For small datasets, 'liblinear' is a good choice, whereas 'sag'\n and 'saga' are faster for large ones;\n - For multiclass problems, only 'newton-cg', 'sag', 'saga' and\n 'lbfgs' handle multinomial loss;\n - 'liblinear' is limited to one-versus-rest schemes.\n\n.. warning::\n The choice of the algorithm depends on the penalty chosen:\n Supported penalties by solver:\n\n - 'newton-cg' - ['l2', 'none']\n - 'lbfgs' - ['l2', 'none']\n - 'liblinear' - ['l1', 'l2']\n - 'sag' - ['l2', 'none']\n - 'saga' - ['elasticnet', 'l1', 'l2', 'none']\n\n.. note::\n 'sag' and 'saga' fast convergence is only guaranteed on\n features with approximately the same scale. You can\n preprocess the data with a scaler from :mod:`sklearn.preprocessing`.\n\n.. seealso::\n Refer to the User Guide for more information regarding\n :class:`LogisticRegression` and more specifically the\n `Table `_\n summarazing solver/penalty supports.\n \n\n.. versionadded:: 0.17\n Stochastic Average Gradient descent solver.\n.. versionadded:: 0.19\n SAGA solver.\n.. versionchanged:: 0.22\n The default solver changed from 'liblinear' to 'lbfgs' in 0.22." + }, + "refined_type": { + "kind": "EnumType", + "values": [ + "newton-cg", + "saga", + "lbfgs", + "sag", + "liblinear" + ] } }, { @@ -102770,7 +109800,8 @@ "docstring": { "type": "int, default=100", "description": "Maximum number of iterations taken for the solvers to converge." - } + }, + "refined_type": {} }, { "name": "multi_class", @@ -102780,6 +109811,10 @@ "docstring": { "type": "{'auto', 'ovr', 'multinomial'}, default='auto'", "description": "If the option chosen is 'ovr', then a binary problem is fit for each\nlabel. For 'multinomial' the loss minimised is the multinomial loss fit\nacross the entire probability distribution, *even when the data is\nbinary*. 'multinomial' is unavailable when solver='liblinear'.\n'auto' selects 'ovr' if the data is binary, or if solver='liblinear',\nand otherwise selects 'multinomial'.\n\n.. versionadded:: 0.18\n Stochastic Average Gradient descent solver for 'multinomial' case.\n.. versionchanged:: 0.22\n Default changed from 'ovr' to 'auto' in 0.22." + }, + "refined_type": { + "kind": "EnumType", + "values": ["auto", "ovr", "multinomial"] } }, { @@ -102790,7 +109825,8 @@ "docstring": { "type": "int, default=0", "description": "For the liblinear and lbfgs solvers set verbose to any positive\nnumber for verbosity." - } + }, + "refined_type": {} }, { "name": "warm_start", @@ -102800,7 +109836,8 @@ "docstring": { "type": "bool, default=False", "description": "When set to True, reuse the solution of the previous call to fit as\ninitialization, otherwise, just erase the previous solution.\nUseless for liblinear solver. See :term:`the Glossary `.\n\n.. versionadded:: 0.17\n *warm_start* to support *lbfgs*, *newton-cg*, *sag*, *saga* solvers." 
- } + }, + "refined_type": {} }, { "name": "n_jobs", @@ -102810,7 +109847,8 @@ "docstring": { "type": "int, default=None", "description": "Number of CPU cores used when parallelizing over classes if\nmulti_class='ovr'\". This parameter is ignored when the ``solver`` is\nset to 'liblinear' regardless of whether 'multi_class' is specified or\nnot. ``None`` means 1 unless in a :obj:`joblib.parallel_backend`\ncontext. ``-1`` means using all processors.\nSee :term:`Glossary ` for more details." - } + }, + "refined_type": {} }, { "name": "l1_ratio", @@ -102820,13 +109858,14 @@ "docstring": { "type": "float, default=None", "description": "The Elastic-Net mixing parameter, with ``0 <= l1_ratio <= 1``. Only\nused if ``penalty='elasticnet'``. Setting ``l1_ratio=0`` is equivalent\nto using ``penalty='l2'``, while setting ``l1_ratio=1`` is equivalent\nto using ``penalty='l1'``. For ``0 < l1_ratio <1``, the penalty is a\ncombination of L1 and L2." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, penalty='l2', *, dual=False, tol=0.0001, C=1.0, fit_intercept=True, intercept_scaling=1, class_weight=None, random_state=None, solver='lbfgs', max_iter=100, multi_class='auto', verbose=0, warm_start=False, n_jobs=None, l1_ratio=None):\n self.penalty = penalty\n self.dual = dual\n self.tol = tol\n self.C = C\n self.fit_intercept = fit_intercept\n self.intercept_scaling = intercept_scaling\n self.class_weight = class_weight\n self.random_state = random_state\n self.solver = solver\n self.max_iter = max_iter\n self.multi_class = multi_class\n self.verbose = verbose\n self.warm_start = warm_start\n self.n_jobs = n_jobs\n self.l1_ratio = l1_ratio" }, { @@ -102844,7 +109883,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -102854,6 +109894,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "Training vector, where `n_samples` is the number of samples and\n`n_features` is the number of features." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -102864,7 +109908,8 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "Target vector relative to X." - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -102874,13 +109919,14 @@ "docstring": { "type": "array-like of shape (n_samples,) default=None", "description": "Array of weights that are assigned to individual samples.\nIf not provided, then each sample is given unit weight.\n\n.. versionadded:: 0.17\n *sample_weight* support to LogisticRegression." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Fit the model according to the given training data.", - "docstring": "Fit the model according to the given training data.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training vector, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\ny : array-like of shape (n_samples,)\n Target vector relative to X.\n\nsample_weight : array-like of shape (n_samples,) default=None\n Array of weights that are assigned to individual samples.\n If not provided, then each sample is given unit weight.\n\n .. 
versionadded:: 0.17\n *sample_weight* support to LogisticRegression.\n\nReturns\n-------\nself\n Fitted estimator.\n\nNotes\n-----\nThe SAGA solver supports both float64 and float32 bit arrays.", + "docstring": "\n Fit the model according to the given training data.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training vector, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\n y : array-like of shape (n_samples,)\n Target vector relative to X.\n\n sample_weight : array-like of shape (n_samples,) default=None\n Array of weights that are assigned to individual samples.\n If not provided, then each sample is given unit weight.\n\n .. versionadded:: 0.17\n *sample_weight* support to LogisticRegression.\n\n Returns\n -------\n self\n Fitted estimator.\n\n Notes\n -----\n The SAGA solver supports both float64 and float32 bit arrays.\n ", "source_code": "\ndef fit(self, X, y, sample_weight=None):\n \"\"\"\n Fit the model according to the given training data.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training vector, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\n y : array-like of shape (n_samples,)\n Target vector relative to X.\n\n sample_weight : array-like of shape (n_samples,) default=None\n Array of weights that are assigned to individual samples.\n If not provided, then each sample is given unit weight.\n\n .. versionadded:: 0.17\n *sample_weight* support to LogisticRegression.\n\n Returns\n -------\n self\n Fitted estimator.\n\n Notes\n -----\n The SAGA solver supports both float64 and float32 bit arrays.\n \"\"\"\n solver = _check_solver(self.solver, self.penalty, self.dual)\n if not isinstance(self.C, numbers.Number) or self.C < 0:\n raise ValueError('Penalty term must be positive; got (C=%r)' % self.C)\n if self.penalty == 'elasticnet':\n if not isinstance(self.l1_ratio, numbers.Number) or self.l1_ratio < 0 or self.l1_ratio > 1:\n raise ValueError('l1_ratio must be between 0 and 1; got (l1_ratio=%r)' % self.l1_ratio)\n elif self.l1_ratio is not None:\n warnings.warn(\"l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty={})\".format(self.penalty))\n if self.penalty == 'none':\n if self.C != 1.0:\n warnings.warn(\"Setting penalty='none' will ignore the C and l1_ratio parameters\")\n C_ = np.inf\n penalty = 'l2'\n else:\n C_ = self.C\n penalty = self.penalty\n if not isinstance(self.max_iter, numbers.Number) or self.max_iter < 0:\n raise ValueError('Maximum number of iteration must be positive; got (max_iter=%r)' % self.max_iter)\n if not isinstance(self.tol, numbers.Number) or self.tol < 0:\n raise ValueError('Tolerance for stopping criteria must be positive; got (tol=%r)' % self.tol)\n if solver == 'lbfgs':\n _dtype = np.float64\n else:\n _dtype = [np.float64, np.float32]\n (X, y) = self._validate_data(X, y, accept_sparse='csr', dtype=_dtype, order='C', accept_large_sparse=solver not in ['liblinear', 'sag', 'saga'])\n check_classification_targets(y)\n self.classes_ = np.unique(y)\n multi_class = _check_multi_class(self.multi_class, solver, len(self.classes_))\n if solver == 'liblinear':\n if effective_n_jobs(self.n_jobs) != 1:\n warnings.warn(\"'n_jobs' > 1 does not have any effect when 'solver' is set to 'liblinear'. 
Got 'n_jobs' = {}.\".format(effective_n_jobs(self.n_jobs)))\n (self.coef_, self.intercept_, n_iter_) = _fit_liblinear(X, y, self.C, self.fit_intercept, self.intercept_scaling, self.class_weight, self.penalty, self.dual, self.verbose, self.max_iter, self.tol, self.random_state, sample_weight=sample_weight)\n self.n_iter_ = np.array([n_iter_])\n return self\n if solver in ['sag', 'saga']:\n max_squared_sum = row_norms(X, squared=True).max()\n else:\n max_squared_sum = None\n n_classes = len(self.classes_)\n classes_ = self.classes_\n if n_classes < 2:\n raise ValueError('This solver needs samples of at least 2 classes in the data, but the data contains only one class: %r' % classes_[0])\n if len(self.classes_) == 2:\n n_classes = 1\n classes_ = classes_[1:]\n if self.warm_start:\n warm_start_coef = getattr(self, 'coef_', None)\n else:\n warm_start_coef = None\n if warm_start_coef is not None and self.fit_intercept:\n warm_start_coef = np.append(warm_start_coef, self.intercept_[:, np.newaxis], axis=1)\n if multi_class == 'multinomial':\n classes_ = [None]\n warm_start_coef = [warm_start_coef]\n if warm_start_coef is None:\n warm_start_coef = [None] * n_classes\n path_func = delayed(_logistic_regression_path)\n if solver in ['sag', 'saga']:\n prefer = 'threads'\n else:\n prefer = 'processes'\n fold_coefs_ = Parallel(n_jobs=self.n_jobs, verbose=self.verbose, **_joblib_parallel_args(prefer=prefer))((path_func(X, y, pos_class=class_, Cs=[C_], l1_ratio=self.l1_ratio, fit_intercept=self.fit_intercept, tol=self.tol, verbose=self.verbose, solver=solver, multi_class=multi_class, max_iter=self.max_iter, class_weight=self.class_weight, check_input=False, random_state=self.random_state, coef=warm_start_coef_, penalty=penalty, max_squared_sum=max_squared_sum, sample_weight=sample_weight) for (class_, warm_start_coef_) in zip(classes_, warm_start_coef)))\n (fold_coefs_, _, n_iter_) = zip(*fold_coefs_)\n self.n_iter_ = np.asarray(n_iter_, dtype=np.int32)[:, 0]\n n_features = X.shape[1]\n if multi_class == 'multinomial':\n self.coef_ = fold_coefs_[0][0]\n else:\n self.coef_ = np.asarray(fold_coefs_)\n self.coef_ = self.coef_.reshape(n_classes, n_features + int(self.fit_intercept))\n if self.fit_intercept:\n self.intercept_ = self.coef_[:, -1]\n self.coef_ = self.coef_[:, :-1]\n else:\n self.intercept_ = np.zeros(n_classes)\n return self" }, { @@ -102898,7 +109944,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -102908,13 +109955,14 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "Vector to be scored, where `n_samples` is the number of samples and\n`n_features` is the number of features." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Predict logarithm of probability estimates.\n\nThe returned estimates for all classes are ordered by the label of classes.", - "docstring": "Predict logarithm of probability estimates.\n\nThe returned estimates for all classes are ordered by the\nlabel of classes.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n Vector to be scored, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\nReturns\n-------\nT : array-like of shape (n_samples, n_classes)\n Returns the log-probability of the sample for each class in the\n model, where classes are ordered as they are in ``self.classes_``.", + "description": "Predict logarithm of probability estimates.\n\nThe returned estimates for all classes are ordered by the\nlabel of classes.", + "docstring": "\n Predict logarithm of probability estimates.\n\n The returned estimates for all classes are ordered by the\n label of classes.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Vector to be scored, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\n Returns\n -------\n T : array-like of shape (n_samples, n_classes)\n Returns the log-probability of the sample for each class in the\n model, where classes are ordered as they are in ``self.classes_``.\n ", "source_code": "\ndef predict_log_proba(self, X):\n \"\"\"\n Predict logarithm of probability estimates.\n\n The returned estimates for all classes are ordered by the\n label of classes.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Vector to be scored, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\n Returns\n -------\n T : array-like of shape (n_samples, n_classes)\n Returns the log-probability of the sample for each class in the\n model, where classes are ordered as they are in ``self.classes_``.\n \"\"\"\n return np.log(self.predict_proba(X))" }, { @@ -102932,7 +109980,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -102942,13 +109991,14 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "Vector to be scored, where `n_samples` is the number of samples and\n`n_features` is the number of features." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Probability estimates.\n\nThe returned estimates for all classes are ordered by the label of classes. For a multi_class problem, if multi_class is set to be \"multinomial\" the softmax function is used to find the predicted probability of each class. Else use a one-vs-rest approach, i.e calculate the probability of each class assuming it to be positive using the logistic function. 
and normalize these values across all the classes.", - "docstring": "Probability estimates.\n\nThe returned estimates for all classes are ordered by the\nlabel of classes.\n\nFor a multi_class problem, if multi_class is set to be \"multinomial\"\nthe softmax function is used to find the predicted probability of\neach class.\nElse use a one-vs-rest approach, i.e calculate the probability\nof each class assuming it to be positive using the logistic function.\nand normalize these values across all the classes.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n Vector to be scored, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\nReturns\n-------\nT : array-like of shape (n_samples, n_classes)\n Returns the probability of the sample for each class in the model,\n where classes are ordered as they are in ``self.classes_``.", + "description": "Probability estimates.\n\nThe returned estimates for all classes are ordered by the\nlabel of classes.\n\nFor a multi_class problem, if multi_class is set to be \"multinomial\"\nthe softmax function is used to find the predicted probability of\neach class.\nElse use a one-vs-rest approach, i.e calculate the probability\nof each class assuming it to be positive using the logistic function.\nand normalize these values across all the classes.", + "docstring": "\n Probability estimates.\n\n The returned estimates for all classes are ordered by the\n label of classes.\n\n For a multi_class problem, if multi_class is set to be \"multinomial\"\n the softmax function is used to find the predicted probability of\n each class.\n Else use a one-vs-rest approach, i.e calculate the probability\n of each class assuming it to be positive using the logistic function.\n and normalize these values across all the classes.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Vector to be scored, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\n Returns\n -------\n T : array-like of shape (n_samples, n_classes)\n Returns the probability of the sample for each class in the model,\n where classes are ordered as they are in ``self.classes_``.\n ", "source_code": "\ndef predict_proba(self, X):\n \"\"\"\n Probability estimates.\n\n The returned estimates for all classes are ordered by the\n label of classes.\n\n For a multi_class problem, if multi_class is set to be \"multinomial\"\n the softmax function is used to find the predicted probability of\n each class.\n Else use a one-vs-rest approach, i.e calculate the probability\n of each class assuming it to be positive using the logistic function.\n and normalize these values across all the classes.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Vector to be scored, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\n Returns\n -------\n T : array-like of shape (n_samples, n_classes)\n Returns the probability of the sample for each class in the model,\n where classes are ordered as they are in ``self.classes_``.\n \"\"\"\n check_is_fitted(self)\n ovr = self.multi_class in ['ovr', 'warn'] or self.multi_class == 'auto' and (self.classes_.size <= 2 or self.solver == 'liblinear')\n if ovr:\n return super()._predict_proba_lr(X)\n else:\n decision = self.decision_function(X)\n if decision.ndim == 1:\n decision_2d = np.c_[-decision, decision]\n else:\n decision_2d = decision\n return softmax(decision_2d, copy=False)" }, { @@ 
-102966,7 +110016,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "Cs", @@ -102976,7 +110027,8 @@ "docstring": { "type": "int or list of floats, default=10", "description": "Each of the values in Cs describes the inverse of regularization\nstrength. If Cs is as an int, then a grid of Cs values are chosen\nin a logarithmic scale between 1e-4 and 1e4.\nLike in support vector machines, smaller values specify stronger\nregularization." - } + }, + "refined_type": {} }, { "name": "fit_intercept", @@ -102986,7 +110038,8 @@ "docstring": { "type": "bool, default=True", "description": "Specifies if a constant (a.k.a. bias or intercept) should be\nadded to the decision function." - } + }, + "refined_type": {} }, { "name": "cv", @@ -102996,7 +110049,8 @@ "docstring": { "type": "int or cross-validation generator, default=None", "description": "The default cross-validation generator used is Stratified K-Folds.\nIf an integer is provided, then it is the number of folds used.\nSee the module :mod:`sklearn.model_selection` module for the\nlist of possible cross-validation objects.\n\n.. versionchanged:: 0.22\n ``cv`` default value if None changed from 3-fold to 5-fold." - } + }, + "refined_type": {} }, { "name": "dual", @@ -103006,7 +110060,8 @@ "docstring": { "type": "bool, default=False", "description": "Dual or primal formulation. Dual formulation is only implemented for\nl2 penalty with liblinear solver. Prefer dual=False when\nn_samples > n_features." - } + }, + "refined_type": {} }, { "name": "penalty", @@ -103016,6 +110071,10 @@ "docstring": { "type": "{'l1', 'l2', 'elasticnet'}, default='l2'", "description": "Specify the norm of the penalty:\n\n- `'l2'`: add a L2 penalty term (used by default);\n- `'l1'`: add a L1 penalty term;\n- `'elasticnet'`: both L1 and L2 penalty terms are added.\n\n.. warning::\n Some penalties may not work with some solvers. See the parameter\n `solver` below, to know the compatibility between the penalty and\n solver." + }, + "refined_type": { + "kind": "EnumType", + "values": ["l2", "l1", "elasticnet"] } }, { @@ -103026,7 +110085,8 @@ "docstring": { "type": "str or callable, default=None", "description": "A string (see model evaluation documentation) or\na scorer callable object / function with signature\n``scorer(estimator, X, y)``. For a list of scoring functions\nthat can be used, look at :mod:`sklearn.metrics`. The\ndefault scoring option used is 'accuracy'." - } + }, + "refined_type": {} }, { "name": "solver", @@ -103036,6 +110096,16 @@ "docstring": { "type": "{'newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'}, default='lbfgs'", "description": "Algorithm to use in the optimization problem. Default is 'lbfgs'.\nTo choose a solver, you might want to consider the following aspects:\n\n - For small datasets, 'liblinear' is a good choice, whereas 'sag'\n and 'saga' are faster for large ones;\n - For multiclass problems, only 'newton-cg', 'sag', 'saga' and\n 'lbfgs' handle multinomial loss;\n - 'liblinear' might be slower in :class:`LogisticRegressionCV`\n because it does not handle warm-starting. 'liblinear' is\n limited to one-versus-rest schemes.\n\n.. warning::\n The choice of the algorithm depends on the penalty chosen:\n\n - 'newton-cg' - ['l2']\n - 'lbfgs' - ['l2']\n - 'liblinear' - ['l1', 'l2']\n - 'sag' - ['l2']\n - 'saga' - ['elasticnet', 'l1', 'l2']\n\n.. note::\n 'sag' and 'saga' fast convergence is only guaranteed on features\n with approximately the same scale. 
You can preprocess the data with\n a scaler from :mod:`sklearn.preprocessing`.\n\n.. versionadded:: 0.17\n Stochastic Average Gradient descent solver.\n.. versionadded:: 0.19\n SAGA solver." + }, + "refined_type": { + "kind": "EnumType", + "values": [ + "newton-cg", + "saga", + "lbfgs", + "sag", + "liblinear" + ] } }, { @@ -103046,7 +110116,8 @@ "docstring": { "type": "float, default=1e-4", "description": "Tolerance for stopping criteria." - } + }, + "refined_type": {} }, { "name": "max_iter", @@ -103056,7 +110127,8 @@ "docstring": { "type": "int, default=100", "description": "Maximum number of iterations of the optimization algorithm." - } + }, + "refined_type": {} }, { "name": "class_weight", @@ -103066,6 +110138,10 @@ "docstring": { "type": "dict or 'balanced', default=None", "description": "Weights associated with classes in the form ``{class_label: weight}``.\nIf not given, all classes are supposed to have weight one.\n\nThe \"balanced\" mode uses the values of y to automatically adjust\nweights inversely proportional to class frequencies in the input data\nas ``n_samples / (n_classes * np.bincount(y))``.\n\nNote that these weights will be multiplied with sample_weight (passed\nthrough the fit method) if sample_weight is specified.\n\n.. versionadded:: 0.17\n class_weight == 'balanced'" + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -103076,7 +110152,8 @@ "docstring": { "type": "int, default=None", "description": "Number of CPU cores used during the cross-validation loop.\n``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n``-1`` means using all processors. See :term:`Glossary `\nfor more details." - } + }, + "refined_type": {} }, { "name": "verbose", @@ -103086,7 +110163,8 @@ "docstring": { "type": "int, default=0", "description": "For the 'liblinear', 'sag' and 'lbfgs' solvers set verbose to any\npositive number for verbosity." - } + }, + "refined_type": {} }, { "name": "refit", @@ -103096,7 +110174,8 @@ "docstring": { "type": "bool, default=True", "description": "If set to True, the scores are averaged across all folds, and the\ncoefs and the C that corresponds to the best score is taken, and a\nfinal refit is done using these parameters.\nOtherwise the coefs, intercepts and C that correspond to the\nbest scores across folds are averaged." - } + }, + "refined_type": {} }, { "name": "intercept_scaling", @@ -103106,7 +110185,8 @@ "docstring": { "type": "float, default=1", "description": "Useful only when the solver 'liblinear' is used\nand self.fit_intercept is set to True. In this case, x becomes\n[x, self.intercept_scaling],\ni.e. a \"synthetic\" feature with constant value equal to\nintercept_scaling is appended to the instance vector.\nThe intercept becomes ``intercept_scaling * synthetic_feature_weight``.\n\nNote! the synthetic feature weight is subject to l1/l2 regularization\nas all other features.\nTo lessen the effect of regularization on synthetic feature weight\n(and therefore on the intercept) intercept_scaling has to be increased." - } + }, + "refined_type": {} }, { "name": "multi_class", @@ -103116,6 +110196,10 @@ "docstring": { "type": "{'auto, 'ovr', 'multinomial'}, default='auto'", "description": "If the option chosen is 'ovr', then a binary problem is fit for each\nlabel. For 'multinomial' the loss minimised is the multinomial loss fit\nacross the entire probability distribution, *even when the data is\nbinary*. 
'multinomial' is unavailable when solver='liblinear'.\n'auto' selects 'ovr' if the data is binary, or if solver='liblinear',\nand otherwise selects 'multinomial'.\n\n.. versionadded:: 0.18\n Stochastic Average Gradient descent solver for 'multinomial' case.\n.. versionchanged:: 0.22\n Default changed from 'ovr' to 'auto' in 0.22." + }, + "refined_type": { + "kind": "EnumType", + "values": [", ", "auto, "] } }, { @@ -103126,7 +110210,8 @@ "docstring": { "type": "int, RandomState instance, default=None", "description": "Used when `solver='sag'`, 'saga' or 'liblinear' to shuffle the data.\nNote that this only applies to the solver and not the cross-validation\ngenerator. See :term:`Glossary ` for details." - } + }, + "refined_type": {} }, { "name": "l1_ratios", @@ -103136,13 +110221,14 @@ "docstring": { "type": "list of float, default=None", "description": "The list of Elastic-Net mixing parameter, with ``0 <= l1_ratio <= 1``.\nOnly used if ``penalty='elasticnet'``. A value of 0 is equivalent to\nusing ``penalty='l2'``, while 1 is equivalent to using\n``penalty='l1'``. For ``0 < l1_ratio <1``, the penalty is a combination\nof L1 and L2." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, *, Cs=10, fit_intercept=True, cv=None, dual=False, penalty='l2', scoring=None, solver='lbfgs', tol=0.0001, max_iter=100, class_weight=None, n_jobs=None, verbose=0, refit=True, intercept_scaling=1.0, multi_class='auto', random_state=None, l1_ratios=None):\n self.Cs = Cs\n self.fit_intercept = fit_intercept\n self.cv = cv\n self.dual = dual\n self.penalty = penalty\n self.scoring = scoring\n self.tol = tol\n self.max_iter = max_iter\n self.class_weight = class_weight\n self.n_jobs = n_jobs\n self.verbose = verbose\n self.solver = solver\n self.refit = refit\n self.intercept_scaling = intercept_scaling\n self.multi_class = multi_class\n self.random_state = random_state\n self.l1_ratios = l1_ratios" }, { @@ -103160,13 +110246,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _more_tags(self):\n return {'_xfail_checks': {'check_sample_weights_invariance': 'zero sample_weight is not equivalent to removing samples'}}" }, { @@ -103184,7 +110271,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -103194,6 +110282,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "Training vector, where `n_samples` is the number of samples and\n`n_features` is the number of features." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -103204,7 +110296,8 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "Target vector relative to X." - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -103214,13 +110307,14 @@ "docstring": { "type": "array-like of shape (n_samples,) default=None", "description": "Array of weights that are assigned to individual samples.\nIf not provided, then each sample is given unit weight." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Fit the model according to the given training data.", - "docstring": "Fit the model according to the given training data.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training vector, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\ny : array-like of shape (n_samples,)\n Target vector relative to X.\n\nsample_weight : array-like of shape (n_samples,) default=None\n Array of weights that are assigned to individual samples.\n If not provided, then each sample is given unit weight.\n\nReturns\n-------\nself : object\n Fitted LogisticRegressionCV estimator.", + "docstring": "Fit the model according to the given training data.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training vector, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\n y : array-like of shape (n_samples,)\n Target vector relative to X.\n\n sample_weight : array-like of shape (n_samples,) default=None\n Array of weights that are assigned to individual samples.\n If not provided, then each sample is given unit weight.\n\n Returns\n -------\n self : object\n Fitted LogisticRegressionCV estimator.\n ", "source_code": "\ndef fit(self, X, y, sample_weight=None):\n \"\"\"Fit the model according to the given training data.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training vector, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\n y : array-like of shape (n_samples,)\n Target vector relative to X.\n\n sample_weight : array-like of shape (n_samples,) default=None\n Array of weights that are assigned to individual samples.\n If not provided, then each sample is given unit weight.\n\n Returns\n -------\n self : object\n Fitted LogisticRegressionCV estimator.\n \"\"\"\n solver = _check_solver(self.solver, self.penalty, self.dual)\n if not isinstance(self.max_iter, numbers.Number) or self.max_iter < 0:\n raise ValueError('Maximum number of iteration must be positive; got (max_iter=%r)' % self.max_iter)\n if not isinstance(self.tol, numbers.Number) or self.tol < 0:\n raise ValueError('Tolerance for stopping criteria must be positive; got (tol=%r)' % self.tol)\n if self.penalty == 'elasticnet':\n if self.l1_ratios is None or len(self.l1_ratios) == 0 or any((not isinstance(l1_ratio, numbers.Number) or l1_ratio < 0 or l1_ratio > 1 for l1_ratio in self.l1_ratios)):\n raise ValueError('l1_ratios must be a list of numbers between 0 and 1; got (l1_ratios=%r)' % self.l1_ratios)\n l1_ratios_ = self.l1_ratios\n else:\n if self.l1_ratios is not None:\n warnings.warn(\"l1_ratios parameter is only used when penalty is 'elasticnet'. 
Got (penalty={})\".format(self.penalty))\n l1_ratios_ = [None]\n if self.penalty == 'none':\n raise ValueError(\"penalty='none' is not useful and not supported by LogisticRegressionCV.\")\n (X, y) = self._validate_data(X, y, accept_sparse='csr', dtype=np.float64, order='C', accept_large_sparse=solver not in ['liblinear', 'sag', 'saga'])\n check_classification_targets(y)\n class_weight = self.class_weight\n label_encoder = LabelEncoder().fit(y)\n y = label_encoder.transform(y)\n if isinstance(class_weight, dict):\n class_weight = {label_encoder.transform([cls])[0]: v for (cls, v) in class_weight.items()}\n classes = self.classes_ = label_encoder.classes_\n encoded_labels = label_encoder.transform(label_encoder.classes_)\n multi_class = _check_multi_class(self.multi_class, solver, len(classes))\n if solver in ['sag', 'saga']:\n max_squared_sum = row_norms(X, squared=True).max()\n else:\n max_squared_sum = None\n cv = check_cv(self.cv, y, classifier=True)\n folds = list(cv.split(X, y))\n n_classes = len(encoded_labels)\n if n_classes < 2:\n raise ValueError('This solver needs samples of at least 2 classes in the data, but the data contains only one class: %r' % classes[0])\n if n_classes == 2:\n n_classes = 1\n encoded_labels = encoded_labels[1:]\n classes = classes[1:]\n if multi_class == 'multinomial':\n iter_encoded_labels = iter_classes = [None]\n else:\n iter_encoded_labels = encoded_labels\n iter_classes = classes\n if class_weight == 'balanced':\n class_weight = compute_class_weight(class_weight, classes=np.arange(len(self.classes_)), y=y)\n class_weight = dict(enumerate(class_weight))\n path_func = delayed(_log_reg_scoring_path)\n if self.solver in ['sag', 'saga']:\n prefer = 'threads'\n else:\n prefer = 'processes'\n fold_coefs_ = Parallel(n_jobs=self.n_jobs, verbose=self.verbose, **_joblib_parallel_args(prefer=prefer))((path_func(X, y, train, test, pos_class=label, Cs=self.Cs, fit_intercept=self.fit_intercept, penalty=self.penalty, dual=self.dual, solver=solver, tol=self.tol, max_iter=self.max_iter, verbose=self.verbose, class_weight=class_weight, scoring=self.scoring, multi_class=multi_class, intercept_scaling=self.intercept_scaling, random_state=self.random_state, max_squared_sum=max_squared_sum, sample_weight=sample_weight, l1_ratio=l1_ratio) for label in iter_encoded_labels for (train, test) in folds for l1_ratio in l1_ratios_))\n (coefs_paths, Cs, scores, n_iter_) = zip(*fold_coefs_)\n self.Cs_ = Cs[0]\n if multi_class == 'multinomial':\n coefs_paths = np.reshape(coefs_paths, (len(folds), len(l1_ratios_) * len(self.Cs_), n_classes, -1))\n coefs_paths = np.swapaxes(coefs_paths, 0, 1)\n coefs_paths = np.swapaxes(coefs_paths, 0, 2)\n self.n_iter_ = np.reshape(n_iter_, (1, len(folds), len(self.Cs_) * len(l1_ratios_)))\n scores = np.tile(scores, (n_classes, 1, 1))\n else:\n coefs_paths = np.reshape(coefs_paths, (n_classes, len(folds), len(self.Cs_) * len(l1_ratios_), -1))\n self.n_iter_ = np.reshape(n_iter_, (n_classes, len(folds), len(self.Cs_) * len(l1_ratios_)))\n scores = np.reshape(scores, (n_classes, len(folds), -1))\n self.scores_ = dict(zip(classes, scores))\n self.coefs_paths_ = dict(zip(classes, coefs_paths))\n self.C_ = list()\n self.l1_ratio_ = list()\n self.coef_ = np.empty((n_classes, X.shape[1]))\n self.intercept_ = np.zeros(n_classes)\n for (index, (cls, encoded_label)) in enumerate(zip(iter_classes, iter_encoded_labels)):\n if multi_class == 'ovr':\n scores = self.scores_[cls]\n coefs_paths = self.coefs_paths_[cls]\n else:\n scores = scores[0]\n if self.refit:\n 
best_index = scores.sum(axis=0).argmax()\n best_index_C = best_index % len(self.Cs_)\n C_ = self.Cs_[best_index_C]\n self.C_.append(C_)\n best_index_l1 = best_index // len(self.Cs_)\n l1_ratio_ = l1_ratios_[best_index_l1]\n self.l1_ratio_.append(l1_ratio_)\n if multi_class == 'multinomial':\n coef_init = np.mean(coefs_paths[:, :, best_index, :], axis=1)\n else:\n coef_init = np.mean(coefs_paths[:, best_index, :], axis=0)\n (w, _, _) = _logistic_regression_path(X, y, pos_class=encoded_label, Cs=[C_], solver=solver, fit_intercept=self.fit_intercept, coef=coef_init, max_iter=self.max_iter, tol=self.tol, penalty=self.penalty, class_weight=class_weight, multi_class=multi_class, verbose=max(0, self.verbose - 1), random_state=self.random_state, check_input=False, max_squared_sum=max_squared_sum, sample_weight=sample_weight, l1_ratio=l1_ratio_)\n w = w[0]\n else:\n best_indices = np.argmax(scores, axis=1)\n if multi_class == 'ovr':\n w = np.mean([coefs_paths[i, best_indices[i], :] for i in range(len(folds))], axis=0)\n else:\n w = np.mean([coefs_paths[:, i, best_indices[i], :] for i in range(len(folds))], axis=0)\n best_indices_C = best_indices % len(self.Cs_)\n self.C_.append(np.mean(self.Cs_[best_indices_C]))\n if self.penalty == 'elasticnet':\n best_indices_l1 = best_indices // len(self.Cs_)\n self.l1_ratio_.append(np.mean(l1_ratios_[best_indices_l1]))\n else:\n self.l1_ratio_.append(None)\n if multi_class == 'multinomial':\n self.C_ = np.tile(self.C_, n_classes)\n self.l1_ratio_ = np.tile(self.l1_ratio_, n_classes)\n self.coef_ = w[:, :X.shape[1]]\n if self.fit_intercept:\n self.intercept_ = w[:, -1]\n else:\n self.coef_[index] = w[:X.shape[1]]\n if self.fit_intercept:\n self.intercept_[index] = w[-1]\n self.C_ = np.asarray(self.C_)\n self.l1_ratio_ = np.asarray(self.l1_ratio_)\n self.l1_ratios_ = np.asarray(l1_ratios_)\n if self.l1_ratios is not None:\n for (cls, coefs_path) in self.coefs_paths_.items():\n self.coefs_paths_[cls] = coefs_path.reshape((len(folds), self.l1_ratios_.size, self.Cs_.size, -1))\n self.coefs_paths_[cls] = np.transpose(self.coefs_paths_[cls], (0, 2, 1, 3))\n for (cls, score) in self.scores_.items():\n self.scores_[cls] = score.reshape((len(folds), self.l1_ratios_.size, self.Cs_.size))\n self.scores_[cls] = np.transpose(self.scores_[cls], (0, 2, 1))\n self.n_iter_ = self.n_iter_.reshape((-1, len(folds), self.l1_ratios_.size, self.Cs_.size))\n self.n_iter_ = np.transpose(self.n_iter_, (0, 1, 3, 2))\n return self" }, { @@ -103238,7 +110332,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -103248,7 +110343,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "Test samples." - } + }, + "refined_type": {} }, { "name": "y", @@ -103258,7 +110354,8 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "True labels for X." - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -103268,13 +110365,14 @@ "docstring": { "type": "array-like of shape (n_samples,), default=None", "description": "Sample weights." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Score using the `scoring` option on the given test data and labels.", - "docstring": "Score using the `scoring` option on the given test data and labels.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n Test samples.\n\ny : array-like of shape (n_samples,)\n True labels for X.\n\nsample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\nReturns\n-------\nscore : float\n Score of self.predict(X) wrt. y.", + "docstring": "Score using the `scoring` option on the given test data and labels.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Test samples.\n\n y : array-like of shape (n_samples,)\n True labels for X.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n Returns\n -------\n score : float\n Score of self.predict(X) wrt. y.\n ", "source_code": "\ndef score(self, X, y, sample_weight=None):\n \"\"\"Score using the `scoring` option on the given test data and labels.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Test samples.\n\n y : array-like of shape (n_samples,)\n True labels for X.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n Returns\n -------\n score : float\n Score of self.predict(X) wrt. y.\n \"\"\"\n scoring = self.scoring or 'accuracy'\n scoring = get_scorer(scoring)\n return scoring(self, X, y, sample_weight=sample_weight)" }, { @@ -103292,7 +110390,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "solver", @@ -103302,7 +110401,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_classes", @@ -103312,13 +110412,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _check_multi_class(multi_class, solver, n_classes):\n if multi_class == 'auto':\n if solver == 'liblinear':\n multi_class = 'ovr'\n elif n_classes > 2:\n multi_class = 'multinomial'\n else:\n multi_class = 'ovr'\n if multi_class not in ('multinomial', 'ovr'):\n raise ValueError(\"multi_class should be 'multinomial', 'ovr' or 'auto'. Got %s.\" % multi_class)\n if multi_class == 'multinomial' and solver == 'liblinear':\n raise ValueError('Solver %s does not support a multinomial backend.' % solver)\n return multi_class" }, { @@ -103336,7 +110437,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "penalty", @@ -103346,7 +110448,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "dual", @@ -103356,13 +110459,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _check_solver(solver, penalty, dual):\n all_solvers = ['liblinear', 'newton-cg', 'lbfgs', 'sag', 'saga']\n if solver not in all_solvers:\n raise ValueError('Logistic Regression supports only solvers in %s, got %s.' % (all_solvers, solver))\n all_penalties = ['l1', 'l2', 'elasticnet', 'none']\n if penalty not in all_penalties:\n raise ValueError('Logistic Regression supports only penalties in %s, got %s.' 
% (all_penalties, penalty))\n if solver not in ['liblinear', 'saga'] and penalty not in ('l2', 'none'):\n raise ValueError(\"Solver %s supports only 'l2' or 'none' penalties, got %s penalty.\" % (solver, penalty))\n if solver != 'liblinear' and dual:\n raise ValueError('Solver %s supports only dual=False, got dual=%s' % (solver, dual))\n if penalty == 'elasticnet' and solver != 'saga':\n raise ValueError(\"Only 'saga' solver supports elasticnet penalty, got solver={}.\".format(solver))\n if solver == 'liblinear' and penalty == 'none':\n raise ValueError(\"penalty='none' is not supported for the liblinear solver\")\n return solver" }, { @@ -103380,7 +110484,8 @@ "docstring": { "type": "ndarray of shape (n_features,) or (n_features + 1,)", "description": "Coefficient vector." - } + }, + "refined_type": {} }, { "name": "X", @@ -103390,6 +110495,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "Training data." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -103400,13 +110509,14 @@ "docstring": { "type": "ndarray of shape (n_samples,)", "description": "Array of labels." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Computes y * np.dot(X, w).\n\nIt takes into consideration if the intercept should be fit or not.", - "docstring": "Computes y * np.dot(X, w).\n\nIt takes into consideration if the intercept should be fit or not.\n\nParameters\n----------\nw : ndarray of shape (n_features,) or (n_features + 1,)\n Coefficient vector.\n\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training data.\n\ny : ndarray of shape (n_samples,)\n Array of labels.\n\nReturns\n-------\nw : ndarray of shape (n_features,)\n Coefficient vector without the intercept weight (w[-1]) if the\n intercept should be fit. Unchanged otherwise.\n\nc : float\n The intercept.\n\nyz : float\n y * np.dot(X, w).", + "docstring": "Computes y * np.dot(X, w).\n\n It takes into consideration if the intercept should be fit or not.\n\n Parameters\n ----------\n w : ndarray of shape (n_features,) or (n_features + 1,)\n Coefficient vector.\n\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training data.\n\n y : ndarray of shape (n_samples,)\n Array of labels.\n\n Returns\n -------\n w : ndarray of shape (n_features,)\n Coefficient vector without the intercept weight (w[-1]) if the\n intercept should be fit. Unchanged otherwise.\n\n c : float\n The intercept.\n\n yz : float\n y * np.dot(X, w).\n ", "source_code": "\ndef _intercept_dot(w, X, y):\n \"\"\"Computes y * np.dot(X, w).\n\n It takes into consideration if the intercept should be fit or not.\n\n Parameters\n ----------\n w : ndarray of shape (n_features,) or (n_features + 1,)\n Coefficient vector.\n\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training data.\n\n y : ndarray of shape (n_samples,)\n Array of labels.\n\n Returns\n -------\n w : ndarray of shape (n_features,)\n Coefficient vector without the intercept weight (w[-1]) if the\n intercept should be fit. Unchanged otherwise.\n\n c : float\n The intercept.\n\n yz : float\n y * np.dot(X, w).\n \"\"\"\n c = 0.0\n if w.size == X.shape[1] + 1:\n c = w[-1]\n w = w[:-1]\n z = safe_sparse_dot(X, w) + c\n yz = y * z\n return w, c, yz" }, { @@ -103424,6 +110534,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "Training data." 
+ }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -103434,7 +110548,8 @@ "docstring": { "type": "array-like of shape (n_samples,) or (n_samples, n_targets)", "description": "Target labels." - } + }, + "refined_type": {} }, { "name": "train", @@ -103444,7 +110559,8 @@ "docstring": { "type": "list of indices", "description": "The indices of the train set." - } + }, + "refined_type": {} }, { "name": "test", @@ -103454,7 +110570,8 @@ "docstring": { "type": "list of indices", "description": "The indices of the test set." - } + }, + "refined_type": {} }, { "name": "pos_class", @@ -103464,7 +110581,8 @@ "docstring": { "type": "int, default=None", "description": "The class with respect to which we perform a one-vs-all fit.\nIf None, then it is assumed that the given problem is binary." - } + }, + "refined_type": {} }, { "name": "Cs", @@ -103474,7 +110592,8 @@ "docstring": { "type": "int or list of floats, default=10", "description": "Each of the values in Cs describes the inverse of\nregularization strength. If Cs is as an int, then a grid of Cs\nvalues are chosen in a logarithmic scale between 1e-4 and 1e4.\nIf not provided, then a fixed set of values for Cs are used." - } + }, + "refined_type": {} }, { "name": "scoring", @@ -103484,7 +110603,8 @@ "docstring": { "type": "callable, default=None", "description": "A string (see model evaluation documentation) or\na scorer callable object / function with signature\n``scorer(estimator, X, y)``. For a list of scoring functions\nthat can be used, look at :mod:`sklearn.metrics`. The\ndefault scoring option used is accuracy_score." - } + }, + "refined_type": {} }, { "name": "fit_intercept", @@ -103494,7 +110614,8 @@ "docstring": { "type": "bool, default=False", "description": "If False, then the bias term is set to zero. Else the last\nterm of each coef_ gives us the intercept." - } + }, + "refined_type": {} }, { "name": "max_iter", @@ -103504,7 +110625,8 @@ "docstring": { "type": "int, default=100", "description": "Maximum number of iterations for the solver." - } + }, + "refined_type": {} }, { "name": "tol", @@ -103514,7 +110636,8 @@ "docstring": { "type": "float, default=1e-4", "description": "Tolerance for stopping criteria." - } + }, + "refined_type": {} }, { "name": "class_weight", @@ -103524,6 +110647,10 @@ "docstring": { "type": "dict or 'balanced', default=None", "description": "Weights associated with classes in the form ``{class_label: weight}``.\nIf not given, all classes are supposed to have weight one.\n\nThe \"balanced\" mode uses the values of y to automatically adjust\nweights inversely proportional to class frequencies in the input data\nas ``n_samples / (n_classes * np.bincount(y))``\n\nNote that these weights will be multiplied with sample_weight (passed\nthrough the fit method) if sample_weight is specified." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -103534,7 +110661,8 @@ "docstring": { "type": "int, default=0", "description": "For the liblinear and lbfgs solvers set verbose to any positive\nnumber for verbosity." - } + }, + "refined_type": {} }, { "name": "solver", @@ -103544,6 +110672,16 @@ "docstring": { "type": "{'lbfgs', 'newton-cg', 'liblinear', 'sag', 'saga'}, default='lbfgs'", "description": "Decides which solver to use." 
+ }, + "refined_type": { + "kind": "EnumType", + "values": [ + "newton-cg", + "saga", + "lbfgs", + "sag", + "liblinear" + ] } }, { @@ -103554,6 +110692,10 @@ "docstring": { "type": "{'l1', 'l2', 'elasticnet'}, default='l2'", "description": "Used to specify the norm used in the penalization. The 'newton-cg',\n'sag' and 'lbfgs' solvers support only l2 penalties. 'elasticnet' is\nonly supported by the 'saga' solver." + }, + "refined_type": { + "kind": "EnumType", + "values": ["l2", "l1", "elasticnet"] } }, { @@ -103564,7 +110706,8 @@ "docstring": { "type": "bool, default=False", "description": "Dual or primal formulation. Dual formulation is only implemented for\nl2 penalty with liblinear solver. Prefer dual=False when\nn_samples > n_features." - } + }, + "refined_type": {} }, { "name": "intercept_scaling", @@ -103574,7 +110717,8 @@ "docstring": { "type": "float, default=1.", "description": "Useful only when the solver 'liblinear' is used\nand self.fit_intercept is set to True. In this case, x becomes\n[x, self.intercept_scaling],\ni.e. a \"synthetic\" feature with constant value equals to\nintercept_scaling is appended to the instance vector.\nThe intercept becomes intercept_scaling * synthetic feature weight\nNote! the synthetic feature weight is subject to l1/l2 regularization\nas all other features.\nTo lessen the effect of regularization on synthetic feature weight\n(and therefore on the intercept) intercept_scaling has to be increased." - } + }, + "refined_type": {} }, { "name": "multi_class", @@ -103584,6 +110728,10 @@ "docstring": { "type": "{'auto', 'ovr', 'multinomial'}, default='auto'", "description": "If the option chosen is 'ovr', then a binary problem is fit for each\nlabel. For 'multinomial' the loss minimised is the multinomial loss fit\nacross the entire probability distribution, *even when the data is\nbinary*. 'multinomial' is unavailable when solver='liblinear'." + }, + "refined_type": { + "kind": "EnumType", + "values": ["auto", "ovr", "multinomial"] } }, { @@ -103594,7 +110742,8 @@ "docstring": { "type": "int, RandomState instance, default=None", "description": "Used when ``solver`` == 'sag', 'saga' or 'liblinear' to shuffle the\ndata. See :term:`Glossary ` for details." - } + }, + "refined_type": {} }, { "name": "max_squared_sum", @@ -103604,7 +110753,8 @@ "docstring": { "type": "float, default=None", "description": "Maximum squared sum of X over samples. Used only in SAG solver.\nIf None, it will be computed, going through all the samples.\nThe value should be precomputed to speed up cross validation." - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -103614,7 +110764,8 @@ "docstring": { "type": "array-like of shape(n_samples,), default=None", "description": "Array of weights that are assigned to individual samples.\nIf not provided, then each sample is given unit weight." - } + }, + "refined_type": {} }, { "name": "l1_ratio", @@ -103624,13 +110775,14 @@ "docstring": { "type": "float, default=None", "description": "The Elastic-Net mixing parameter, with ``0 <= l1_ratio <= 1``. Only\nused if ``penalty='elasticnet'``. Setting ``l1_ratio=0`` is equivalent\nto using ``penalty='l2'``, while setting ``l1_ratio=1`` is equivalent\nto using ``penalty='l1'``. For ``0 < l1_ratio <1``, the penalty is a\ncombination of L1 and L2." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Computes scores across logistic_regression_path", - "docstring": "Computes scores across logistic_regression_path\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training data.\n\ny : array-like of shape (n_samples,) or (n_samples, n_targets)\n Target labels.\n\ntrain : list of indices\n The indices of the train set.\n\ntest : list of indices\n The indices of the test set.\n\npos_class : int, default=None\n The class with respect to which we perform a one-vs-all fit.\n If None, then it is assumed that the given problem is binary.\n\nCs : int or list of floats, default=10\n Each of the values in Cs describes the inverse of\n regularization strength. If Cs is as an int, then a grid of Cs\n values are chosen in a logarithmic scale between 1e-4 and 1e4.\n If not provided, then a fixed set of values for Cs are used.\n\nscoring : callable, default=None\n A string (see model evaluation documentation) or\n a scorer callable object / function with signature\n ``scorer(estimator, X, y)``. For a list of scoring functions\n that can be used, look at :mod:`sklearn.metrics`. The\n default scoring option used is accuracy_score.\n\nfit_intercept : bool, default=False\n If False, then the bias term is set to zero. Else the last\n term of each coef_ gives us the intercept.\n\nmax_iter : int, default=100\n Maximum number of iterations for the solver.\n\ntol : float, default=1e-4\n Tolerance for stopping criteria.\n\nclass_weight : dict or 'balanced', default=None\n Weights associated with classes in the form ``{class_label: weight}``.\n If not given, all classes are supposed to have weight one.\n\n The \"balanced\" mode uses the values of y to automatically adjust\n weights inversely proportional to class frequencies in the input data\n as ``n_samples / (n_classes * np.bincount(y))``\n\n Note that these weights will be multiplied with sample_weight (passed\n through the fit method) if sample_weight is specified.\n\nverbose : int, default=0\n For the liblinear and lbfgs solvers set verbose to any positive\n number for verbosity.\n\nsolver : {'lbfgs', 'newton-cg', 'liblinear', 'sag', 'saga'}, default='lbfgs'\n Decides which solver to use.\n\npenalty : {'l1', 'l2', 'elasticnet'}, default='l2'\n Used to specify the norm used in the penalization. The 'newton-cg',\n 'sag' and 'lbfgs' solvers support only l2 penalties. 'elasticnet' is\n only supported by the 'saga' solver.\n\ndual : bool, default=False\n Dual or primal formulation. Dual formulation is only implemented for\n l2 penalty with liblinear solver. Prefer dual=False when\n n_samples > n_features.\n\nintercept_scaling : float, default=1.\n Useful only when the solver 'liblinear' is used\n and self.fit_intercept is set to True. In this case, x becomes\n [x, self.intercept_scaling],\n i.e. a \"synthetic\" feature with constant value equals to\n intercept_scaling is appended to the instance vector.\n The intercept becomes intercept_scaling * synthetic feature weight\n Note! the synthetic feature weight is subject to l1/l2 regularization\n as all other features.\n To lessen the effect of regularization on synthetic feature weight\n (and therefore on the intercept) intercept_scaling has to be increased.\n\nmulti_class : {'auto', 'ovr', 'multinomial'}, default='auto'\n If the option chosen is 'ovr', then a binary problem is fit for each\n label. 
For 'multinomial' the loss minimised is the multinomial loss fit\n across the entire probability distribution, *even when the data is\n binary*. 'multinomial' is unavailable when solver='liblinear'.\n\nrandom_state : int, RandomState instance, default=None\n Used when ``solver`` == 'sag', 'saga' or 'liblinear' to shuffle the\n data. See :term:`Glossary ` for details.\n\nmax_squared_sum : float, default=None\n Maximum squared sum of X over samples. Used only in SAG solver.\n If None, it will be computed, going through all the samples.\n The value should be precomputed to speed up cross validation.\n\nsample_weight : array-like of shape(n_samples,), default=None\n Array of weights that are assigned to individual samples.\n If not provided, then each sample is given unit weight.\n\nl1_ratio : float, default=None\n The Elastic-Net mixing parameter, with ``0 <= l1_ratio <= 1``. Only\n used if ``penalty='elasticnet'``. Setting ``l1_ratio=0`` is equivalent\n to using ``penalty='l2'``, while setting ``l1_ratio=1`` is equivalent\n to using ``penalty='l1'``. For ``0 < l1_ratio <1``, the penalty is a\n combination of L1 and L2.\n\nReturns\n-------\ncoefs : ndarray of shape (n_cs, n_features) or (n_cs, n_features + 1)\n List of coefficients for the Logistic Regression model. If\n fit_intercept is set to True then the second dimension will be\n n_features + 1, where the last item represents the intercept.\n\nCs : ndarray\n Grid of Cs used for cross-validation.\n\nscores : ndarray of shape (n_cs,)\n Scores obtained for each Cs.\n\nn_iter : ndarray of shape(n_cs,)\n Actual number of iteration for each Cs.", + "docstring": "Computes scores across logistic_regression_path\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training data.\n\n y : array-like of shape (n_samples,) or (n_samples, n_targets)\n Target labels.\n\n train : list of indices\n The indices of the train set.\n\n test : list of indices\n The indices of the test set.\n\n pos_class : int, default=None\n The class with respect to which we perform a one-vs-all fit.\n If None, then it is assumed that the given problem is binary.\n\n Cs : int or list of floats, default=10\n Each of the values in Cs describes the inverse of\n regularization strength. If Cs is as an int, then a grid of Cs\n values are chosen in a logarithmic scale between 1e-4 and 1e4.\n If not provided, then a fixed set of values for Cs are used.\n\n scoring : callable, default=None\n A string (see model evaluation documentation) or\n a scorer callable object / function with signature\n ``scorer(estimator, X, y)``. For a list of scoring functions\n that can be used, look at :mod:`sklearn.metrics`. The\n default scoring option used is accuracy_score.\n\n fit_intercept : bool, default=False\n If False, then the bias term is set to zero. 
Else the last\n term of each coef_ gives us the intercept.\n\n max_iter : int, default=100\n Maximum number of iterations for the solver.\n\n tol : float, default=1e-4\n Tolerance for stopping criteria.\n\n class_weight : dict or 'balanced', default=None\n Weights associated with classes in the form ``{class_label: weight}``.\n If not given, all classes are supposed to have weight one.\n\n The \"balanced\" mode uses the values of y to automatically adjust\n weights inversely proportional to class frequencies in the input data\n as ``n_samples / (n_classes * np.bincount(y))``\n\n Note that these weights will be multiplied with sample_weight (passed\n through the fit method) if sample_weight is specified.\n\n verbose : int, default=0\n For the liblinear and lbfgs solvers set verbose to any positive\n number for verbosity.\n\n solver : {'lbfgs', 'newton-cg', 'liblinear', 'sag', 'saga'}, default='lbfgs'\n Decides which solver to use.\n\n penalty : {'l1', 'l2', 'elasticnet'}, default='l2'\n Used to specify the norm used in the penalization. The 'newton-cg',\n 'sag' and 'lbfgs' solvers support only l2 penalties. 'elasticnet' is\n only supported by the 'saga' solver.\n\n dual : bool, default=False\n Dual or primal formulation. Dual formulation is only implemented for\n l2 penalty with liblinear solver. Prefer dual=False when\n n_samples > n_features.\n\n intercept_scaling : float, default=1.\n Useful only when the solver 'liblinear' is used\n and self.fit_intercept is set to True. In this case, x becomes\n [x, self.intercept_scaling],\n i.e. a \"synthetic\" feature with constant value equals to\n intercept_scaling is appended to the instance vector.\n The intercept becomes intercept_scaling * synthetic feature weight\n Note! the synthetic feature weight is subject to l1/l2 regularization\n as all other features.\n To lessen the effect of regularization on synthetic feature weight\n (and therefore on the intercept) intercept_scaling has to be increased.\n\n multi_class : {'auto', 'ovr', 'multinomial'}, default='auto'\n If the option chosen is 'ovr', then a binary problem is fit for each\n label. For 'multinomial' the loss minimised is the multinomial loss fit\n across the entire probability distribution, *even when the data is\n binary*. 'multinomial' is unavailable when solver='liblinear'.\n\n random_state : int, RandomState instance, default=None\n Used when ``solver`` == 'sag', 'saga' or 'liblinear' to shuffle the\n data. See :term:`Glossary ` for details.\n\n max_squared_sum : float, default=None\n Maximum squared sum of X over samples. Used only in SAG solver.\n If None, it will be computed, going through all the samples.\n The value should be precomputed to speed up cross validation.\n\n sample_weight : array-like of shape(n_samples,), default=None\n Array of weights that are assigned to individual samples.\n If not provided, then each sample is given unit weight.\n\n l1_ratio : float, default=None\n The Elastic-Net mixing parameter, with ``0 <= l1_ratio <= 1``. Only\n used if ``penalty='elasticnet'``. Setting ``l1_ratio=0`` is equivalent\n to using ``penalty='l2'``, while setting ``l1_ratio=1`` is equivalent\n to using ``penalty='l1'``. For ``0 < l1_ratio <1``, the penalty is a\n combination of L1 and L2.\n\n Returns\n -------\n coefs : ndarray of shape (n_cs, n_features) or (n_cs, n_features + 1)\n List of coefficients for the Logistic Regression model. 
If\n fit_intercept is set to True then the second dimension will be\n n_features + 1, where the last item represents the intercept.\n\n Cs : ndarray\n Grid of Cs used for cross-validation.\n\n scores : ndarray of shape (n_cs,)\n Scores obtained for each Cs.\n\n n_iter : ndarray of shape(n_cs,)\n Actual number of iteration for each Cs.\n ", "source_code": "\ndef _log_reg_scoring_path(X, y, train, test, pos_class=None, Cs=10, scoring=None, fit_intercept=False, max_iter=100, tol=0.0001, class_weight=None, verbose=0, solver='lbfgs', penalty='l2', dual=False, intercept_scaling=1.0, multi_class='auto', random_state=None, max_squared_sum=None, sample_weight=None, l1_ratio=None):\n \"\"\"Computes scores across logistic_regression_path\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training data.\n\n y : array-like of shape (n_samples,) or (n_samples, n_targets)\n Target labels.\n\n train : list of indices\n The indices of the train set.\n\n test : list of indices\n The indices of the test set.\n\n pos_class : int, default=None\n The class with respect to which we perform a one-vs-all fit.\n If None, then it is assumed that the given problem is binary.\n\n Cs : int or list of floats, default=10\n Each of the values in Cs describes the inverse of\n regularization strength. If Cs is as an int, then a grid of Cs\n values are chosen in a logarithmic scale between 1e-4 and 1e4.\n If not provided, then a fixed set of values for Cs are used.\n\n scoring : callable, default=None\n A string (see model evaluation documentation) or\n a scorer callable object / function with signature\n ``scorer(estimator, X, y)``. For a list of scoring functions\n that can be used, look at :mod:`sklearn.metrics`. The\n default scoring option used is accuracy_score.\n\n fit_intercept : bool, default=False\n If False, then the bias term is set to zero. Else the last\n term of each coef_ gives us the intercept.\n\n max_iter : int, default=100\n Maximum number of iterations for the solver.\n\n tol : float, default=1e-4\n Tolerance for stopping criteria.\n\n class_weight : dict or 'balanced', default=None\n Weights associated with classes in the form ``{class_label: weight}``.\n If not given, all classes are supposed to have weight one.\n\n The \"balanced\" mode uses the values of y to automatically adjust\n weights inversely proportional to class frequencies in the input data\n as ``n_samples / (n_classes * np.bincount(y))``\n\n Note that these weights will be multiplied with sample_weight (passed\n through the fit method) if sample_weight is specified.\n\n verbose : int, default=0\n For the liblinear and lbfgs solvers set verbose to any positive\n number for verbosity.\n\n solver : {'lbfgs', 'newton-cg', 'liblinear', 'sag', 'saga'}, default='lbfgs'\n Decides which solver to use.\n\n penalty : {'l1', 'l2', 'elasticnet'}, default='l2'\n Used to specify the norm used in the penalization. The 'newton-cg',\n 'sag' and 'lbfgs' solvers support only l2 penalties. 'elasticnet' is\n only supported by the 'saga' solver.\n\n dual : bool, default=False\n Dual or primal formulation. Dual formulation is only implemented for\n l2 penalty with liblinear solver. Prefer dual=False when\n n_samples > n_features.\n\n intercept_scaling : float, default=1.\n Useful only when the solver 'liblinear' is used\n and self.fit_intercept is set to True. In this case, x becomes\n [x, self.intercept_scaling],\n i.e. 
a \"synthetic\" feature with constant value equals to\n intercept_scaling is appended to the instance vector.\n The intercept becomes intercept_scaling * synthetic feature weight\n Note! the synthetic feature weight is subject to l1/l2 regularization\n as all other features.\n To lessen the effect of regularization on synthetic feature weight\n (and therefore on the intercept) intercept_scaling has to be increased.\n\n multi_class : {'auto', 'ovr', 'multinomial'}, default='auto'\n If the option chosen is 'ovr', then a binary problem is fit for each\n label. For 'multinomial' the loss minimised is the multinomial loss fit\n across the entire probability distribution, *even when the data is\n binary*. 'multinomial' is unavailable when solver='liblinear'.\n\n random_state : int, RandomState instance, default=None\n Used when ``solver`` == 'sag', 'saga' or 'liblinear' to shuffle the\n data. See :term:`Glossary ` for details.\n\n max_squared_sum : float, default=None\n Maximum squared sum of X over samples. Used only in SAG solver.\n If None, it will be computed, going through all the samples.\n The value should be precomputed to speed up cross validation.\n\n sample_weight : array-like of shape(n_samples,), default=None\n Array of weights that are assigned to individual samples.\n If not provided, then each sample is given unit weight.\n\n l1_ratio : float, default=None\n The Elastic-Net mixing parameter, with ``0 <= l1_ratio <= 1``. Only\n used if ``penalty='elasticnet'``. Setting ``l1_ratio=0`` is equivalent\n to using ``penalty='l2'``, while setting ``l1_ratio=1`` is equivalent\n to using ``penalty='l1'``. For ``0 < l1_ratio <1``, the penalty is a\n combination of L1 and L2.\n\n Returns\n -------\n coefs : ndarray of shape (n_cs, n_features) or (n_cs, n_features + 1)\n List of coefficients for the Logistic Regression model. 
If\n fit_intercept is set to True then the second dimension will be\n n_features + 1, where the last item represents the intercept.\n\n Cs : ndarray\n Grid of Cs used for cross-validation.\n\n scores : ndarray of shape (n_cs,)\n Scores obtained for each Cs.\n\n n_iter : ndarray of shape(n_cs,)\n Actual number of iteration for each Cs.\n \"\"\"\n X_train = X[train]\n X_test = X[test]\n y_train = y[train]\n y_test = y[test]\n if sample_weight is not None:\n sample_weight = _check_sample_weight(sample_weight, X)\n sample_weight = sample_weight[train]\n (coefs, Cs, n_iter) = _logistic_regression_path(X_train, y_train, Cs=Cs, l1_ratio=l1_ratio, fit_intercept=fit_intercept, solver=solver, max_iter=max_iter, class_weight=class_weight, pos_class=pos_class, multi_class=multi_class, tol=tol, verbose=verbose, dual=dual, penalty=penalty, intercept_scaling=intercept_scaling, random_state=random_state, check_input=False, max_squared_sum=max_squared_sum, sample_weight=sample_weight)\n log_reg = LogisticRegression(solver=solver, multi_class=multi_class)\n if multi_class == 'ovr':\n log_reg.classes_ = np.array([-1, 1])\n elif multi_class == 'multinomial':\n log_reg.classes_ = np.unique(y_train)\n else:\n raise ValueError('multi_class should be either multinomial or ovr, got %d' % multi_class)\n if pos_class is not None:\n mask = y_test == pos_class\n y_test = np.ones(y_test.shape, dtype=np.float64)\n y_test[~mask] = -1.0\n scores = list()\n scoring = get_scorer(scoring)\n for w in coefs:\n if multi_class == 'ovr':\n w = w[np.newaxis, :]\n if fit_intercept:\n log_reg.coef_ = w[:, :-1]\n log_reg.intercept_ = w[:, -1]\n else:\n log_reg.coef_ = w\n log_reg.intercept_ = 0.0\n if scoring is None:\n scores.append(log_reg.score(X_test, y_test))\n else:\n scores.append(scoring(log_reg, X_test, y_test))\n return coefs, Cs, np.array(scores), n_iter" }, { @@ -103648,7 +110800,8 @@ "docstring": { "type": "ndarray of shape (n_features,) or (n_features + 1,)", "description": "Coefficient vector." - } + }, + "refined_type": {} }, { "name": "X", @@ -103658,6 +110811,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "Training data." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -103668,7 +110825,8 @@ "docstring": { "type": "ndarray of shape (n_samples,)", "description": "Array of labels." - } + }, + "refined_type": {} }, { "name": "alpha", @@ -103678,7 +110836,8 @@ "docstring": { "type": "float", "description": "Regularization parameter. alpha is equal to 1 / C." - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -103688,13 +110847,14 @@ "docstring": { "type": "array-like of shape (n_samples,) default=None", "description": "Array of weights that are assigned to individual samples.\nIf not provided, then each sample is given unit weight." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Computes the gradient and the Hessian, in the case of a logistic loss.", - "docstring": "Computes the gradient and the Hessian, in the case of a logistic loss.\n\nParameters\n----------\nw : ndarray of shape (n_features,) or (n_features + 1,)\n Coefficient vector.\n\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training data.\n\ny : ndarray of shape (n_samples,)\n Array of labels.\n\nalpha : float\n Regularization parameter. 
alpha is equal to 1 / C.\n\nsample_weight : array-like of shape (n_samples,) default=None\n Array of weights that are assigned to individual samples.\n If not provided, then each sample is given unit weight.\n\nReturns\n-------\ngrad : ndarray of shape (n_features,) or (n_features + 1,)\n Logistic gradient.\n\nHs : callable\n Function that takes the gradient as a parameter and returns the\n matrix product of the Hessian and gradient.", + "docstring": "Computes the gradient and the Hessian, in the case of a logistic loss.\n\n Parameters\n ----------\n w : ndarray of shape (n_features,) or (n_features + 1,)\n Coefficient vector.\n\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training data.\n\n y : ndarray of shape (n_samples,)\n Array of labels.\n\n alpha : float\n Regularization parameter. alpha is equal to 1 / C.\n\n sample_weight : array-like of shape (n_samples,) default=None\n Array of weights that are assigned to individual samples.\n If not provided, then each sample is given unit weight.\n\n Returns\n -------\n grad : ndarray of shape (n_features,) or (n_features + 1,)\n Logistic gradient.\n\n Hs : callable\n Function that takes the gradient as a parameter and returns the\n matrix product of the Hessian and gradient.\n ", "source_code": "\ndef _logistic_grad_hess(w, X, y, alpha, sample_weight=None):\n \"\"\"Computes the gradient and the Hessian, in the case of a logistic loss.\n\n Parameters\n ----------\n w : ndarray of shape (n_features,) or (n_features + 1,)\n Coefficient vector.\n\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training data.\n\n y : ndarray of shape (n_samples,)\n Array of labels.\n\n alpha : float\n Regularization parameter. alpha is equal to 1 / C.\n\n sample_weight : array-like of shape (n_samples,) default=None\n Array of weights that are assigned to individual samples.\n If not provided, then each sample is given unit weight.\n\n Returns\n -------\n grad : ndarray of shape (n_features,) or (n_features + 1,)\n Logistic gradient.\n\n Hs : callable\n Function that takes the gradient as a parameter and returns the\n matrix product of the Hessian and gradient.\n \"\"\"\n (n_samples, n_features) = X.shape\n grad = np.empty_like(w)\n fit_intercept = grad.shape[0] > n_features\n (w, c, yz) = _intercept_dot(w, X, y)\n if sample_weight is None:\n sample_weight = np.ones(y.shape[0])\n z = expit(yz)\n z0 = sample_weight * (z - 1) * y\n grad[:n_features] = safe_sparse_dot(X.T, z0) + alpha * w\n if fit_intercept:\n grad[-1] = z0.sum()\n d = sample_weight * z * (1 - z)\n if sparse.issparse(X):\n dX = safe_sparse_dot(sparse.dia_matrix((d, 0), shape=(n_samples, n_samples)), X)\n else:\n dX = d[:, np.newaxis] * X\n if fit_intercept:\n dd_intercept = np.squeeze(np.array(dX.sum(axis=0)))\n \n def Hs(s):\n ret = np.empty_like(s)\n if sparse.issparse(X):\n ret[:n_features] = X.T.dot(dX.dot(s[:n_features]))\n else:\n ret[:n_features] = np.linalg.multi_dot([X.T, dX, s[:n_features]])\n ret[:n_features] += alpha * s[:n_features]\n if fit_intercept:\n ret[:n_features] += s[-1] * dd_intercept\n ret[-1] = dd_intercept.dot(s[:n_features])\n ret[-1] += d.sum() * s[-1]\n return ret\n return grad, Hs" }, { @@ -103712,7 +110872,8 @@ "docstring": { "type": "ndarray of shape (n_features,) or (n_features + 1,)", "description": "Coefficient vector." - } + }, + "refined_type": {} }, { "name": "X", @@ -103722,6 +110883,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "Training data." 
+ }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -103732,7 +110897,8 @@ "docstring": { "type": "ndarray of shape (n_samples,)", "description": "Array of labels." - } + }, + "refined_type": {} }, { "name": "alpha", @@ -103742,7 +110908,8 @@ "docstring": { "type": "float", "description": "Regularization parameter. alpha is equal to 1 / C." - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -103752,13 +110919,14 @@ "docstring": { "type": "array-like of shape (n_samples,) default=None", "description": "Array of weights that are assigned to individual samples.\nIf not provided, then each sample is given unit weight." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Computes the logistic loss.", - "docstring": "Computes the logistic loss.\n\nParameters\n----------\nw : ndarray of shape (n_features,) or (n_features + 1,)\n Coefficient vector.\n\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training data.\n\ny : ndarray of shape (n_samples,)\n Array of labels.\n\nalpha : float\n Regularization parameter. alpha is equal to 1 / C.\n\nsample_weight : array-like of shape (n_samples,) default=None\n Array of weights that are assigned to individual samples.\n If not provided, then each sample is given unit weight.\n\nReturns\n-------\nout : float\n Logistic loss.", + "docstring": "Computes the logistic loss.\n\n Parameters\n ----------\n w : ndarray of shape (n_features,) or (n_features + 1,)\n Coefficient vector.\n\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training data.\n\n y : ndarray of shape (n_samples,)\n Array of labels.\n\n alpha : float\n Regularization parameter. alpha is equal to 1 / C.\n\n sample_weight : array-like of shape (n_samples,) default=None\n Array of weights that are assigned to individual samples.\n If not provided, then each sample is given unit weight.\n\n Returns\n -------\n out : float\n Logistic loss.\n ", "source_code": "\ndef _logistic_loss(w, X, y, alpha, sample_weight=None):\n \"\"\"Computes the logistic loss.\n\n Parameters\n ----------\n w : ndarray of shape (n_features,) or (n_features + 1,)\n Coefficient vector.\n\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training data.\n\n y : ndarray of shape (n_samples,)\n Array of labels.\n\n alpha : float\n Regularization parameter. alpha is equal to 1 / C.\n\n sample_weight : array-like of shape (n_samples,) default=None\n Array of weights that are assigned to individual samples.\n If not provided, then each sample is given unit weight.\n\n Returns\n -------\n out : float\n Logistic loss.\n \"\"\"\n (w, c, yz) = _intercept_dot(w, X, y)\n if sample_weight is None:\n sample_weight = np.ones(y.shape[0])\n out = -np.sum(sample_weight * log_logistic(yz)) + 0.5 * alpha * np.dot(w, w)\n return out" }, { @@ -103776,7 +110944,8 @@ "docstring": { "type": "ndarray of shape (n_features,) or (n_features + 1,)", "description": "Coefficient vector." - } + }, + "refined_type": {} }, { "name": "X", @@ -103786,6 +110955,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "Training data." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -103796,7 +110969,8 @@ "docstring": { "type": "ndarray of shape (n_samples,)", "description": "Array of labels." - } + }, + "refined_type": {} }, { "name": "alpha", @@ -103806,7 +110980,8 @@ "docstring": { "type": "float", "description": "Regularization parameter. alpha is equal to 1 / C." 
- } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -103816,13 +110991,14 @@ "docstring": { "type": "array-like of shape (n_samples,), default=None", "description": "Array of weights that are assigned to individual samples.\nIf not provided, then each sample is given unit weight." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Computes the logistic loss and gradient.", - "docstring": "Computes the logistic loss and gradient.\n\nParameters\n----------\nw : ndarray of shape (n_features,) or (n_features + 1,)\n Coefficient vector.\n\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training data.\n\ny : ndarray of shape (n_samples,)\n Array of labels.\n\nalpha : float\n Regularization parameter. alpha is equal to 1 / C.\n\nsample_weight : array-like of shape (n_samples,), default=None\n Array of weights that are assigned to individual samples.\n If not provided, then each sample is given unit weight.\n\nReturns\n-------\nout : float\n Logistic loss.\n\ngrad : ndarray of shape (n_features,) or (n_features + 1,)\n Logistic gradient.", + "docstring": "Computes the logistic loss and gradient.\n\n Parameters\n ----------\n w : ndarray of shape (n_features,) or (n_features + 1,)\n Coefficient vector.\n\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training data.\n\n y : ndarray of shape (n_samples,)\n Array of labels.\n\n alpha : float\n Regularization parameter. alpha is equal to 1 / C.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Array of weights that are assigned to individual samples.\n If not provided, then each sample is given unit weight.\n\n Returns\n -------\n out : float\n Logistic loss.\n\n grad : ndarray of shape (n_features,) or (n_features + 1,)\n Logistic gradient.\n ", "source_code": "\ndef _logistic_loss_and_grad(w, X, y, alpha, sample_weight=None):\n \"\"\"Computes the logistic loss and gradient.\n\n Parameters\n ----------\n w : ndarray of shape (n_features,) or (n_features + 1,)\n Coefficient vector.\n\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training data.\n\n y : ndarray of shape (n_samples,)\n Array of labels.\n\n alpha : float\n Regularization parameter. alpha is equal to 1 / C.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Array of weights that are assigned to individual samples.\n If not provided, then each sample is given unit weight.\n\n Returns\n -------\n out : float\n Logistic loss.\n\n grad : ndarray of shape (n_features,) or (n_features + 1,)\n Logistic gradient.\n \"\"\"\n (n_samples, n_features) = X.shape\n grad = np.empty_like(w)\n (w, c, yz) = _intercept_dot(w, X, y)\n if sample_weight is None:\n sample_weight = np.ones(n_samples)\n out = -np.sum(sample_weight * log_logistic(yz)) + 0.5 * alpha * np.dot(w, w)\n z = expit(yz)\n z0 = sample_weight * (z - 1) * y\n grad[:n_features] = safe_sparse_dot(X.T, z0) + alpha * w\n if grad.shape[0] > n_features:\n grad[-1] = z0.sum()\n return out, grad" }, { @@ -103840,6 +111016,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "Input data." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -103850,7 +111030,8 @@ "docstring": { "type": "array-like of shape (n_samples,) or (n_samples, n_targets)", "description": "Input data, target values." 
- } + }, + "refined_type": {} }, { "name": "pos_class", @@ -103860,7 +111041,8 @@ "docstring": { "type": "int, default=None", "description": "The class with respect to which we perform a one-vs-all fit.\nIf None, then it is assumed that the given problem is binary." - } + }, + "refined_type": {} }, { "name": "Cs", @@ -103870,7 +111052,8 @@ "docstring": { "type": "int or array-like of shape (n_cs,), default=10", "description": "List of values for the regularization parameter or integer specifying\nthe number of regularization parameters that should be used. In this\ncase, the parameters will be chosen in a logarithmic scale between\n1e-4 and 1e4." - } + }, + "refined_type": {} }, { "name": "fit_intercept", @@ -103880,7 +111063,8 @@ "docstring": { "type": "bool, default=True", "description": "Whether to fit an intercept for the model. In this case the shape of\nthe returned array is (n_cs, n_features + 1)." - } + }, + "refined_type": {} }, { "name": "max_iter", @@ -103890,7 +111074,8 @@ "docstring": { "type": "int, default=100", "description": "Maximum number of iterations for the solver." - } + }, + "refined_type": {} }, { "name": "tol", @@ -103900,6 +111085,10 @@ "docstring": { "type": "float, default=1e-4", "description": "Stopping criterion. For the newton-cg and lbfgs solvers, the iteration\nwill stop when ``max{|g_i | i = 1, ..., n} <= tol``\nwhere ``g_i`` is the i-th component of the gradient." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -103910,7 +111099,8 @@ "docstring": { "type": "int, default=0", "description": "For the liblinear and lbfgs solvers set verbose to any positive\nnumber for verbosity." - } + }, + "refined_type": {} }, { "name": "solver", @@ -103920,6 +111110,16 @@ "docstring": { "type": "{'lbfgs', 'newton-cg', 'liblinear', 'sag', 'saga'}, default='lbfgs'", "description": "Numerical solver to use." + }, + "refined_type": { + "kind": "EnumType", + "values": [ + "newton-cg", + "saga", + "lbfgs", + "sag", + "liblinear" + ] } }, { @@ -103930,7 +111130,8 @@ "docstring": { "type": "array-like of shape (n_features,), default=None", "description": "Initialization value for coefficients of logistic regression.\nUseless for liblinear solver." - } + }, + "refined_type": {} }, { "name": "class_weight", @@ -103940,6 +111141,10 @@ "docstring": { "type": "dict or 'balanced', default=None", "description": "Weights associated with classes in the form ``{class_label: weight}``.\nIf not given, all classes are supposed to have weight one.\n\nThe \"balanced\" mode uses the values of y to automatically adjust\nweights inversely proportional to class frequencies in the input data\nas ``n_samples / (n_classes * np.bincount(y))``.\n\nNote that these weights will be multiplied with sample_weight (passed\nthrough the fit method) if sample_weight is specified." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -103950,7 +111155,8 @@ "docstring": { "type": "bool, default=False", "description": "Dual or primal formulation. Dual formulation is only implemented for\nl2 penalty with liblinear solver. Prefer dual=False when\nn_samples > n_features." - } + }, + "refined_type": {} }, { "name": "penalty", @@ -103960,6 +111166,10 @@ "docstring": { "type": "{'l1', 'l2', 'elasticnet'}, default='l2'", "description": "Used to specify the norm used in the penalization. The 'newton-cg',\n'sag' and 'lbfgs' solvers support only l2 penalties. 'elasticnet' is\nonly supported by the 'saga' solver." 
+ }, + "refined_type": { + "kind": "EnumType", + "values": ["l2", "l1", "elasticnet"] } }, { @@ -103970,7 +111180,8 @@ "docstring": { "type": "float, default=1.", "description": "Useful only when the solver 'liblinear' is used\nand self.fit_intercept is set to True. In this case, x becomes\n[x, self.intercept_scaling],\ni.e. a \"synthetic\" feature with constant value equal to\nintercept_scaling is appended to the instance vector.\nThe intercept becomes ``intercept_scaling * synthetic_feature_weight``.\n\nNote! the synthetic feature weight is subject to l1/l2 regularization\nas all other features.\nTo lessen the effect of regularization on synthetic feature weight\n(and therefore on the intercept) intercept_scaling has to be increased." - } + }, + "refined_type": {} }, { "name": "multi_class", @@ -103980,6 +111191,10 @@ "docstring": { "type": "{'ovr', 'multinomial', 'auto'}, default='auto'", "description": "If the option chosen is 'ovr', then a binary problem is fit for each\nlabel. For 'multinomial' the loss minimised is the multinomial loss fit\nacross the entire probability distribution, *even when the data is\nbinary*. 'multinomial' is unavailable when solver='liblinear'.\n'auto' selects 'ovr' if the data is binary, or if solver='liblinear',\nand otherwise selects 'multinomial'.\n\n.. versionadded:: 0.18\n Stochastic Average Gradient descent solver for 'multinomial' case.\n.. versionchanged:: 0.22\n Default changed from 'ovr' to 'auto' in 0.22." + }, + "refined_type": { + "kind": "EnumType", + "values": ["multinomial", "auto", "ovr"] } }, { @@ -103990,7 +111205,8 @@ "docstring": { "type": "int, RandomState instance, default=None", "description": "Used when ``solver`` == 'sag', 'saga' or 'liblinear' to shuffle the\ndata. See :term:`Glossary ` for details." - } + }, + "refined_type": {} }, { "name": "check_input", @@ -104000,7 +111216,8 @@ "docstring": { "type": "bool, default=True", "description": "If False, the input arrays X and y will not be checked." - } + }, + "refined_type": {} }, { "name": "max_squared_sum", @@ -104010,7 +111227,8 @@ "docstring": { "type": "float, default=None", "description": "Maximum squared sum of X over samples. Used only in SAG solver.\nIf None, it will be computed, going through all the samples.\nThe value should be precomputed to speed up cross validation." - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -104020,7 +111238,8 @@ "docstring": { "type": "array-like of shape(n_samples,), default=None", "description": "Array of weights that are assigned to individual samples.\nIf not provided, then each sample is given unit weight." - } + }, + "refined_type": {} }, { "name": "l1_ratio", @@ -104030,13 +111249,14 @@ "docstring": { "type": "float, default=None", "description": "The Elastic-Net mixing parameter, with ``0 <= l1_ratio <= 1``. Only\nused if ``penalty='elasticnet'``. Setting ``l1_ratio=0`` is equivalent\nto using ``penalty='l2'``, while setting ``l1_ratio=1`` is equivalent\nto using ``penalty='l1'``. For ``0 < l1_ratio <1``, the penalty is a\ncombination of L1 and L2." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Compute a Logistic Regression model for a list of regularization parameters.\n\nThis is an implementation that uses the result of the previous model to speed up computations along the set of solutions, making it faster than sequentially calling LogisticRegression for the different parameters. 
Note that there will be no speedup with liblinear solver, since it does not handle warm-starting. Read more in the :ref:`User Guide `.", - "docstring": "Compute a Logistic Regression model for a list of regularization\nparameters.\n\nThis is an implementation that uses the result of the previous model\nto speed up computations along the set of solutions, making it faster\nthan sequentially calling LogisticRegression for the different parameters.\nNote that there will be no speedup with liblinear solver, since it does\nnot handle warm-starting.\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n Input data.\n\ny : array-like of shape (n_samples,) or (n_samples, n_targets)\n Input data, target values.\n\npos_class : int, default=None\n The class with respect to which we perform a one-vs-all fit.\n If None, then it is assumed that the given problem is binary.\n\nCs : int or array-like of shape (n_cs,), default=10\n List of values for the regularization parameter or integer specifying\n the number of regularization parameters that should be used. In this\n case, the parameters will be chosen in a logarithmic scale between\n 1e-4 and 1e4.\n\nfit_intercept : bool, default=True\n Whether to fit an intercept for the model. In this case the shape of\n the returned array is (n_cs, n_features + 1).\n\nmax_iter : int, default=100\n Maximum number of iterations for the solver.\n\ntol : float, default=1e-4\n Stopping criterion. For the newton-cg and lbfgs solvers, the iteration\n will stop when ``max{|g_i | i = 1, ..., n} <= tol``\n where ``g_i`` is the i-th component of the gradient.\n\nverbose : int, default=0\n For the liblinear and lbfgs solvers set verbose to any positive\n number for verbosity.\n\nsolver : {'lbfgs', 'newton-cg', 'liblinear', 'sag', 'saga'}, default='lbfgs'\n Numerical solver to use.\n\ncoef : array-like of shape (n_features,), default=None\n Initialization value for coefficients of logistic regression.\n Useless for liblinear solver.\n\nclass_weight : dict or 'balanced', default=None\n Weights associated with classes in the form ``{class_label: weight}``.\n If not given, all classes are supposed to have weight one.\n\n The \"balanced\" mode uses the values of y to automatically adjust\n weights inversely proportional to class frequencies in the input data\n as ``n_samples / (n_classes * np.bincount(y))``.\n\n Note that these weights will be multiplied with sample_weight (passed\n through the fit method) if sample_weight is specified.\n\ndual : bool, default=False\n Dual or primal formulation. Dual formulation is only implemented for\n l2 penalty with liblinear solver. Prefer dual=False when\n n_samples > n_features.\n\npenalty : {'l1', 'l2', 'elasticnet'}, default='l2'\n Used to specify the norm used in the penalization. The 'newton-cg',\n 'sag' and 'lbfgs' solvers support only l2 penalties. 'elasticnet' is\n only supported by the 'saga' solver.\n\nintercept_scaling : float, default=1.\n Useful only when the solver 'liblinear' is used\n and self.fit_intercept is set to True. In this case, x becomes\n [x, self.intercept_scaling],\n i.e. a \"synthetic\" feature with constant value equal to\n intercept_scaling is appended to the instance vector.\n The intercept becomes ``intercept_scaling * synthetic_feature_weight``.\n\n Note! 
the synthetic feature weight is subject to l1/l2 regularization\n as all other features.\n To lessen the effect of regularization on synthetic feature weight\n (and therefore on the intercept) intercept_scaling has to be increased.\n\nmulti_class : {'ovr', 'multinomial', 'auto'}, default='auto'\n If the option chosen is 'ovr', then a binary problem is fit for each\n label. For 'multinomial' the loss minimised is the multinomial loss fit\n across the entire probability distribution, *even when the data is\n binary*. 'multinomial' is unavailable when solver='liblinear'.\n 'auto' selects 'ovr' if the data is binary, or if solver='liblinear',\n and otherwise selects 'multinomial'.\n\n .. versionadded:: 0.18\n Stochastic Average Gradient descent solver for 'multinomial' case.\n .. versionchanged:: 0.22\n Default changed from 'ovr' to 'auto' in 0.22.\n\nrandom_state : int, RandomState instance, default=None\n Used when ``solver`` == 'sag', 'saga' or 'liblinear' to shuffle the\n data. See :term:`Glossary ` for details.\n\ncheck_input : bool, default=True\n If False, the input arrays X and y will not be checked.\n\nmax_squared_sum : float, default=None\n Maximum squared sum of X over samples. Used only in SAG solver.\n If None, it will be computed, going through all the samples.\n The value should be precomputed to speed up cross validation.\n\nsample_weight : array-like of shape(n_samples,), default=None\n Array of weights that are assigned to individual samples.\n If not provided, then each sample is given unit weight.\n\nl1_ratio : float, default=None\n The Elastic-Net mixing parameter, with ``0 <= l1_ratio <= 1``. Only\n used if ``penalty='elasticnet'``. Setting ``l1_ratio=0`` is equivalent\n to using ``penalty='l2'``, while setting ``l1_ratio=1`` is equivalent\n to using ``penalty='l1'``. For ``0 < l1_ratio <1``, the penalty is a\n combination of L1 and L2.\n\nReturns\n-------\ncoefs : ndarray of shape (n_cs, n_features) or (n_cs, n_features + 1)\n List of coefficients for the Logistic Regression model. If\n fit_intercept is set to True then the second dimension will be\n n_features + 1, where the last item represents the intercept. For\n ``multiclass='multinomial'``, the shape is (n_classes, n_cs,\n n_features) or (n_classes, n_cs, n_features + 1).\n\nCs : ndarray\n Grid of Cs used for cross-validation.\n\nn_iter : array of shape (n_cs,)\n Actual number of iteration for each Cs.\n\nNotes\n-----\nYou might get slightly different results with the solver liblinear than\nwith the others since this uses LIBLINEAR which penalizes the intercept.\n\n.. 
versionchanged:: 0.19\n The \"copy\" parameter was removed.", + "description": "Compute a Logistic Regression model for a list of regularization\nparameters.\n\nThis is an implementation that uses the result of the previous model\nto speed up computations along the set of solutions, making it faster\nthan sequentially calling LogisticRegression for the different parameters.\nNote that there will be no speedup with liblinear solver, since it does\nnot handle warm-starting.\n\nRead more in the :ref:`User Guide `.", + "docstring": "Compute a Logistic Regression model for a list of regularization\n parameters.\n\n This is an implementation that uses the result of the previous model\n to speed up computations along the set of solutions, making it faster\n than sequentially calling LogisticRegression for the different parameters.\n Note that there will be no speedup with liblinear solver, since it does\n not handle warm-starting.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Input data.\n\n y : array-like of shape (n_samples,) or (n_samples, n_targets)\n Input data, target values.\n\n pos_class : int, default=None\n The class with respect to which we perform a one-vs-all fit.\n If None, then it is assumed that the given problem is binary.\n\n Cs : int or array-like of shape (n_cs,), default=10\n List of values for the regularization parameter or integer specifying\n the number of regularization parameters that should be used. In this\n case, the parameters will be chosen in a logarithmic scale between\n 1e-4 and 1e4.\n\n fit_intercept : bool, default=True\n Whether to fit an intercept for the model. In this case the shape of\n the returned array is (n_cs, n_features + 1).\n\n max_iter : int, default=100\n Maximum number of iterations for the solver.\n\n tol : float, default=1e-4\n Stopping criterion. For the newton-cg and lbfgs solvers, the iteration\n will stop when ``max{|g_i | i = 1, ..., n} <= tol``\n where ``g_i`` is the i-th component of the gradient.\n\n verbose : int, default=0\n For the liblinear and lbfgs solvers set verbose to any positive\n number for verbosity.\n\n solver : {'lbfgs', 'newton-cg', 'liblinear', 'sag', 'saga'}, default='lbfgs'\n Numerical solver to use.\n\n coef : array-like of shape (n_features,), default=None\n Initialization value for coefficients of logistic regression.\n Useless for liblinear solver.\n\n class_weight : dict or 'balanced', default=None\n Weights associated with classes in the form ``{class_label: weight}``.\n If not given, all classes are supposed to have weight one.\n\n The \"balanced\" mode uses the values of y to automatically adjust\n weights inversely proportional to class frequencies in the input data\n as ``n_samples / (n_classes * np.bincount(y))``.\n\n Note that these weights will be multiplied with sample_weight (passed\n through the fit method) if sample_weight is specified.\n\n dual : bool, default=False\n Dual or primal formulation. Dual formulation is only implemented for\n l2 penalty with liblinear solver. Prefer dual=False when\n n_samples > n_features.\n\n penalty : {'l1', 'l2', 'elasticnet'}, default='l2'\n Used to specify the norm used in the penalization. The 'newton-cg',\n 'sag' and 'lbfgs' solvers support only l2 penalties. 'elasticnet' is\n only supported by the 'saga' solver.\n\n intercept_scaling : float, default=1.\n Useful only when the solver 'liblinear' is used\n and self.fit_intercept is set to True. 
In this case, x becomes\n [x, self.intercept_scaling],\n i.e. a \"synthetic\" feature with constant value equal to\n intercept_scaling is appended to the instance vector.\n The intercept becomes ``intercept_scaling * synthetic_feature_weight``.\n\n Note! the synthetic feature weight is subject to l1/l2 regularization\n as all other features.\n To lessen the effect of regularization on synthetic feature weight\n (and therefore on the intercept) intercept_scaling has to be increased.\n\n multi_class : {'ovr', 'multinomial', 'auto'}, default='auto'\n If the option chosen is 'ovr', then a binary problem is fit for each\n label. For 'multinomial' the loss minimised is the multinomial loss fit\n across the entire probability distribution, *even when the data is\n binary*. 'multinomial' is unavailable when solver='liblinear'.\n 'auto' selects 'ovr' if the data is binary, or if solver='liblinear',\n and otherwise selects 'multinomial'.\n\n .. versionadded:: 0.18\n Stochastic Average Gradient descent solver for 'multinomial' case.\n .. versionchanged:: 0.22\n Default changed from 'ovr' to 'auto' in 0.22.\n\n random_state : int, RandomState instance, default=None\n Used when ``solver`` == 'sag', 'saga' or 'liblinear' to shuffle the\n data. See :term:`Glossary ` for details.\n\n check_input : bool, default=True\n If False, the input arrays X and y will not be checked.\n\n max_squared_sum : float, default=None\n Maximum squared sum of X over samples. Used only in SAG solver.\n If None, it will be computed, going through all the samples.\n The value should be precomputed to speed up cross validation.\n\n sample_weight : array-like of shape(n_samples,), default=None\n Array of weights that are assigned to individual samples.\n If not provided, then each sample is given unit weight.\n\n l1_ratio : float, default=None\n The Elastic-Net mixing parameter, with ``0 <= l1_ratio <= 1``. Only\n used if ``penalty='elasticnet'``. Setting ``l1_ratio=0`` is equivalent\n to using ``penalty='l2'``, while setting ``l1_ratio=1`` is equivalent\n to using ``penalty='l1'``. For ``0 < l1_ratio <1``, the penalty is a\n combination of L1 and L2.\n\n Returns\n -------\n coefs : ndarray of shape (n_cs, n_features) or (n_cs, n_features + 1)\n List of coefficients for the Logistic Regression model. If\n fit_intercept is set to True then the second dimension will be\n n_features + 1, where the last item represents the intercept. For\n ``multiclass='multinomial'``, the shape is (n_classes, n_cs,\n n_features) or (n_classes, n_cs, n_features + 1).\n\n Cs : ndarray\n Grid of Cs used for cross-validation.\n\n n_iter : array of shape (n_cs,)\n Actual number of iteration for each Cs.\n\n Notes\n -----\n You might get slightly different results with the solver liblinear than\n with the others since this uses LIBLINEAR which penalizes the intercept.\n\n .. 
versionchanged:: 0.19\n The \"copy\" parameter was removed.\n ", "source_code": "\ndef _logistic_regression_path(X, y, pos_class=None, Cs=10, fit_intercept=True, max_iter=100, tol=0.0001, verbose=0, solver='lbfgs', coef=None, class_weight=None, dual=False, penalty='l2', intercept_scaling=1.0, multi_class='auto', random_state=None, check_input=True, max_squared_sum=None, sample_weight=None, l1_ratio=None):\n \"\"\"Compute a Logistic Regression model for a list of regularization\n parameters.\n\n This is an implementation that uses the result of the previous model\n to speed up computations along the set of solutions, making it faster\n than sequentially calling LogisticRegression for the different parameters.\n Note that there will be no speedup with liblinear solver, since it does\n not handle warm-starting.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Input data.\n\n y : array-like of shape (n_samples,) or (n_samples, n_targets)\n Input data, target values.\n\n pos_class : int, default=None\n The class with respect to which we perform a one-vs-all fit.\n If None, then it is assumed that the given problem is binary.\n\n Cs : int or array-like of shape (n_cs,), default=10\n List of values for the regularization parameter or integer specifying\n the number of regularization parameters that should be used. In this\n case, the parameters will be chosen in a logarithmic scale between\n 1e-4 and 1e4.\n\n fit_intercept : bool, default=True\n Whether to fit an intercept for the model. In this case the shape of\n the returned array is (n_cs, n_features + 1).\n\n max_iter : int, default=100\n Maximum number of iterations for the solver.\n\n tol : float, default=1e-4\n Stopping criterion. For the newton-cg and lbfgs solvers, the iteration\n will stop when ``max{|g_i | i = 1, ..., n} <= tol``\n where ``g_i`` is the i-th component of the gradient.\n\n verbose : int, default=0\n For the liblinear and lbfgs solvers set verbose to any positive\n number for verbosity.\n\n solver : {'lbfgs', 'newton-cg', 'liblinear', 'sag', 'saga'}, default='lbfgs'\n Numerical solver to use.\n\n coef : array-like of shape (n_features,), default=None\n Initialization value for coefficients of logistic regression.\n Useless for liblinear solver.\n\n class_weight : dict or 'balanced', default=None\n Weights associated with classes in the form ``{class_label: weight}``.\n If not given, all classes are supposed to have weight one.\n\n The \"balanced\" mode uses the values of y to automatically adjust\n weights inversely proportional to class frequencies in the input data\n as ``n_samples / (n_classes * np.bincount(y))``.\n\n Note that these weights will be multiplied with sample_weight (passed\n through the fit method) if sample_weight is specified.\n\n dual : bool, default=False\n Dual or primal formulation. Dual formulation is only implemented for\n l2 penalty with liblinear solver. Prefer dual=False when\n n_samples > n_features.\n\n penalty : {'l1', 'l2', 'elasticnet'}, default='l2'\n Used to specify the norm used in the penalization. The 'newton-cg',\n 'sag' and 'lbfgs' solvers support only l2 penalties. 'elasticnet' is\n only supported by the 'saga' solver.\n\n intercept_scaling : float, default=1.\n Useful only when the solver 'liblinear' is used\n and self.fit_intercept is set to True. In this case, x becomes\n [x, self.intercept_scaling],\n i.e. 
a \"synthetic\" feature with constant value equal to\n intercept_scaling is appended to the instance vector.\n The intercept becomes ``intercept_scaling * synthetic_feature_weight``.\n\n Note! the synthetic feature weight is subject to l1/l2 regularization\n as all other features.\n To lessen the effect of regularization on synthetic feature weight\n (and therefore on the intercept) intercept_scaling has to be increased.\n\n multi_class : {'ovr', 'multinomial', 'auto'}, default='auto'\n If the option chosen is 'ovr', then a binary problem is fit for each\n label. For 'multinomial' the loss minimised is the multinomial loss fit\n across the entire probability distribution, *even when the data is\n binary*. 'multinomial' is unavailable when solver='liblinear'.\n 'auto' selects 'ovr' if the data is binary, or if solver='liblinear',\n and otherwise selects 'multinomial'.\n\n .. versionadded:: 0.18\n Stochastic Average Gradient descent solver for 'multinomial' case.\n .. versionchanged:: 0.22\n Default changed from 'ovr' to 'auto' in 0.22.\n\n random_state : int, RandomState instance, default=None\n Used when ``solver`` == 'sag', 'saga' or 'liblinear' to shuffle the\n data. See :term:`Glossary ` for details.\n\n check_input : bool, default=True\n If False, the input arrays X and y will not be checked.\n\n max_squared_sum : float, default=None\n Maximum squared sum of X over samples. Used only in SAG solver.\n If None, it will be computed, going through all the samples.\n The value should be precomputed to speed up cross validation.\n\n sample_weight : array-like of shape(n_samples,), default=None\n Array of weights that are assigned to individual samples.\n If not provided, then each sample is given unit weight.\n\n l1_ratio : float, default=None\n The Elastic-Net mixing parameter, with ``0 <= l1_ratio <= 1``. Only\n used if ``penalty='elasticnet'``. Setting ``l1_ratio=0`` is equivalent\n to using ``penalty='l2'``, while setting ``l1_ratio=1`` is equivalent\n to using ``penalty='l1'``. For ``0 < l1_ratio <1``, the penalty is a\n combination of L1 and L2.\n\n Returns\n -------\n coefs : ndarray of shape (n_cs, n_features) or (n_cs, n_features + 1)\n List of coefficients for the Logistic Regression model. If\n fit_intercept is set to True then the second dimension will be\n n_features + 1, where the last item represents the intercept. For\n ``multiclass='multinomial'``, the shape is (n_classes, n_cs,\n n_features) or (n_classes, n_cs, n_features + 1).\n\n Cs : ndarray\n Grid of Cs used for cross-validation.\n\n n_iter : array of shape (n_cs,)\n Actual number of iteration for each Cs.\n\n Notes\n -----\n You might get slightly different results with the solver liblinear than\n with the others since this uses LIBLINEAR which penalizes the intercept.\n\n .. 
versionchanged:: 0.19\n The \"copy\" parameter was removed.\n \"\"\"\n if isinstance(Cs, numbers.Integral):\n Cs = np.logspace(-4, 4, Cs)\n solver = _check_solver(solver, penalty, dual)\n if check_input:\n X = check_array(X, accept_sparse='csr', dtype=np.float64, accept_large_sparse=solver not in ['liblinear', 'sag', 'saga'])\n y = check_array(y, ensure_2d=False, dtype=None)\n check_consistent_length(X, y)\n (_, n_features) = X.shape\n classes = np.unique(y)\n random_state = check_random_state(random_state)\n multi_class = _check_multi_class(multi_class, solver, len(classes))\n if pos_class is None and multi_class != 'multinomial':\n if classes.size > 2:\n raise ValueError('To fit OvR, use the pos_class argument')\n pos_class = classes[1]\n sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype, copy=True)\n le = LabelEncoder()\n if isinstance(class_weight, dict) or multi_class == 'multinomial':\n class_weight_ = compute_class_weight(class_weight, classes=classes, y=y)\n sample_weight *= class_weight_[le.fit_transform(y)]\n if multi_class == 'ovr':\n w0 = np.zeros(n_features + int(fit_intercept), dtype=X.dtype)\n mask_classes = np.array([-1, 1])\n mask = y == pos_class\n y_bin = np.ones(y.shape, dtype=X.dtype)\n y_bin[~mask] = -1.0\n if class_weight == 'balanced':\n class_weight_ = compute_class_weight(class_weight, classes=mask_classes, y=y_bin)\n sample_weight *= class_weight_[le.fit_transform(y_bin)]\n else:\n if solver not in ['sag', 'saga']:\n lbin = LabelBinarizer()\n Y_multi = lbin.fit_transform(y)\n if Y_multi.shape[1] == 1:\n Y_multi = np.hstack([1 - Y_multi, Y_multi])\n else:\n le = LabelEncoder()\n Y_multi = le.fit_transform(y).astype(X.dtype, copy=False)\n w0 = np.zeros((classes.size, n_features + int(fit_intercept)), order='F', dtype=X.dtype)\n if coef is not None:\n if multi_class == 'ovr':\n if coef.size not in (n_features, w0.size):\n raise ValueError('Initialization coef is of shape %d, expected shape %d or %d' % (coef.size, n_features, w0.size))\n w0[:coef.size] = coef\n else:\n n_classes = classes.size\n if n_classes == 2:\n n_classes = 1\n if coef.shape[0] != n_classes or coef.shape[1] not in (n_features, n_features + 1):\n raise ValueError('Initialization coef is of shape (%d, %d), expected shape (%d, %d) or (%d, %d)' % (coef.shape[0], coef.shape[1], classes.size, n_features, classes.size, n_features + 1))\n if n_classes == 1:\n w0[0, :coef.shape[1]] = -coef\n w0[1, :coef.shape[1]] = coef\n else:\n w0[:, :coef.shape[1]] = coef\n if multi_class == 'multinomial':\n if solver in ['lbfgs', 'newton-cg']:\n w0 = w0.ravel()\n target = Y_multi\n if solver == 'lbfgs':\n \n def func(x, *args):\n return _multinomial_loss_grad(x, *args)[0:2]\n elif solver == 'newton-cg':\n \n def func(x, *args):\n return _multinomial_loss(x, *args)[0]\n \n def grad(x, *args):\n return _multinomial_loss_grad(x, *args)[1]\n hess = _multinomial_grad_hess\n warm_start_sag = {'coef': w0.T}\n else:\n target = y_bin\n if solver == 'lbfgs':\n func = _logistic_loss_and_grad\n elif solver == 'newton-cg':\n func = _logistic_loss\n \n def grad(x, *args):\n return _logistic_loss_and_grad(x, *args)[1]\n hess = _logistic_grad_hess\n warm_start_sag = {'coef': np.expand_dims(w0, axis=1)}\n coefs = list()\n n_iter = np.zeros(len(Cs), dtype=np.int32)\n for (i, C) in enumerate(Cs):\n if solver == 'lbfgs':\n iprint = [-1, 50, 1, 100, 101][np.searchsorted(np.array([0, 1, 2, 3]), verbose)]\n opt_res = optimize.minimize(func, w0, method='L-BFGS-B', jac=True, args=(X, target, 1.0 / C, sample_weight), 
options={'iprint': iprint, 'gtol': tol, 'maxiter': max_iter})\n n_iter_i = _check_optimize_result(solver, opt_res, max_iter, extra_warning_msg=_LOGISTIC_SOLVER_CONVERGENCE_MSG)\n (w0, loss) = (opt_res.x, opt_res.fun)\n elif solver == 'newton-cg':\n args = (X, target, 1.0 / C, sample_weight)\n (w0, n_iter_i) = _newton_cg(hess, func, grad, w0, args=args, maxiter=max_iter, tol=tol)\n elif solver == 'liblinear':\n (coef_, intercept_, n_iter_i) = _fit_liblinear(X, target, C, fit_intercept, intercept_scaling, None, penalty, dual, verbose, max_iter, tol, random_state, sample_weight=sample_weight)\n if fit_intercept:\n w0 = np.concatenate([coef_.ravel(), intercept_])\n else:\n w0 = coef_.ravel()\n elif solver in ['sag', 'saga']:\n if multi_class == 'multinomial':\n target = target.astype(X.dtype, copy=False)\n loss = 'multinomial'\n else:\n loss = 'log'\n if penalty == 'l1':\n alpha = 0.0\n beta = 1.0 / C\n elif penalty == 'l2':\n alpha = 1.0 / C\n beta = 0.0\n else:\n alpha = 1.0 / C * (1 - l1_ratio)\n beta = 1.0 / C * l1_ratio\n (w0, n_iter_i, warm_start_sag) = sag_solver(X, target, sample_weight, loss, alpha, beta, max_iter, tol, verbose, random_state, False, max_squared_sum, warm_start_sag, is_saga=solver == 'saga')\n else:\n raise ValueError(\"solver must be one of {'liblinear', 'lbfgs', 'newton-cg', 'sag'}, got '%s' instead\" % solver)\n if multi_class == 'multinomial':\n n_classes = max(2, classes.size)\n multi_w0 = np.reshape(w0, (n_classes, -1))\n if n_classes == 2:\n multi_w0 = multi_w0[1][np.newaxis, :]\n coefs.append(multi_w0.copy())\n else:\n coefs.append(w0.copy())\n n_iter[i] = n_iter_i\n return np.array(coefs), np.array(Cs), n_iter" }, { @@ -104054,7 +111274,8 @@ "docstring": { "type": "ndarray of shape (n_classes * n_features,) or", "description": "(n_classes * (n_features + 1),)\nCoefficient vector." - } + }, + "refined_type": {} }, { "name": "X", @@ -104064,6 +111285,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "Training data." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -104074,7 +111299,8 @@ "docstring": { "type": "ndarray of shape (n_samples, n_classes)", "description": "Transformed labels according to the output of LabelBinarizer." - } + }, + "refined_type": {} }, { "name": "alpha", @@ -104084,7 +111310,8 @@ "docstring": { "type": "float", "description": "Regularization parameter. alpha is equal to 1 / C." - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -104094,13 +111321,14 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "Array of weights that are assigned to individual samples." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Computes the gradient and the Hessian, in the case of a multinomial loss.", - "docstring": "Computes the gradient and the Hessian, in the case of a multinomial loss.\n\nParameters\n----------\nw : ndarray of shape (n_classes * n_features,) or\n (n_classes * (n_features + 1),)\n Coefficient vector.\n\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training data.\n\nY : ndarray of shape (n_samples, n_classes)\n Transformed labels according to the output of LabelBinarizer.\n\nalpha : float\n Regularization parameter. 
alpha is equal to 1 / C.\n\nsample_weight : array-like of shape (n_samples,)\n Array of weights that are assigned to individual samples.\n\nReturns\n-------\ngrad : ndarray of shape (n_classes * n_features,) or (n_classes * (n_features + 1),)\n Ravelled gradient of the multinomial loss.\n\nhessp : callable\n Function that takes in a vector input of shape (n_classes * n_features)\n or (n_classes * (n_features + 1)) and returns matrix-vector product\n with hessian.\n\nReferences\n----------\nBarak A. Pearlmutter (1993). Fast Exact Multiplication by the Hessian.\n http://www.bcl.hamilton.ie/~barak/papers/nc-hessian.pdf", + "docstring": "\n Computes the gradient and the Hessian, in the case of a multinomial loss.\n\n Parameters\n ----------\n w : ndarray of shape (n_classes * n_features,) or\n (n_classes * (n_features + 1),)\n Coefficient vector.\n\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training data.\n\n Y : ndarray of shape (n_samples, n_classes)\n Transformed labels according to the output of LabelBinarizer.\n\n alpha : float\n Regularization parameter. alpha is equal to 1 / C.\n\n sample_weight : array-like of shape (n_samples,)\n Array of weights that are assigned to individual samples.\n\n Returns\n -------\n grad : ndarray of shape (n_classes * n_features,) or (n_classes * (n_features + 1),)\n Ravelled gradient of the multinomial loss.\n\n hessp : callable\n Function that takes in a vector input of shape (n_classes * n_features)\n or (n_classes * (n_features + 1)) and returns matrix-vector product\n with hessian.\n\n References\n ----------\n Barak A. Pearlmutter (1993). Fast Exact Multiplication by the Hessian.\n http://www.bcl.hamilton.ie/~barak/papers/nc-hessian.pdf\n ", "source_code": "\ndef _multinomial_grad_hess(w, X, Y, alpha, sample_weight):\n \"\"\"\n Computes the gradient and the Hessian, in the case of a multinomial loss.\n\n Parameters\n ----------\n w : ndarray of shape (n_classes * n_features,) or\n (n_classes * (n_features + 1),)\n Coefficient vector.\n\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training data.\n\n Y : ndarray of shape (n_samples, n_classes)\n Transformed labels according to the output of LabelBinarizer.\n\n alpha : float\n Regularization parameter. alpha is equal to 1 / C.\n\n sample_weight : array-like of shape (n_samples,)\n Array of weights that are assigned to individual samples.\n\n Returns\n -------\n grad : ndarray of shape (n_classes * n_features,) or (n_classes * (n_features + 1),)\n Ravelled gradient of the multinomial loss.\n\n hessp : callable\n Function that takes in a vector input of shape (n_classes * n_features)\n or (n_classes * (n_features + 1)) and returns matrix-vector product\n with hessian.\n\n References\n ----------\n Barak A. Pearlmutter (1993). 
Fast Exact Multiplication by the Hessian.\n http://www.bcl.hamilton.ie/~barak/papers/nc-hessian.pdf\n \"\"\"\n n_features = X.shape[1]\n n_classes = Y.shape[1]\n fit_intercept = w.size == n_classes * (n_features + 1)\n (loss, grad, p) = _multinomial_loss_grad(w, X, Y, alpha, sample_weight)\n sample_weight = sample_weight[:, np.newaxis]\n \n def hessp(v):\n v = v.reshape(n_classes, -1)\n if fit_intercept:\n inter_terms = v[:, -1]\n v = v[:, :-1]\n else:\n inter_terms = 0\n r_yhat = safe_sparse_dot(X, v.T)\n r_yhat += inter_terms\n r_yhat += (-p * r_yhat).sum(axis=1)[:, np.newaxis]\n r_yhat *= p\n r_yhat *= sample_weight\n hessProd = np.zeros((n_classes, n_features + bool(fit_intercept)))\n hessProd[:, :n_features] = safe_sparse_dot(r_yhat.T, X)\n hessProd[:, :n_features] += v * alpha\n if fit_intercept:\n hessProd[:, -1] = r_yhat.sum(axis=0)\n return hessProd.ravel()\n return grad, hessp" }, { @@ -104118,7 +111346,8 @@ "docstring": { "type": "ndarray of shape (n_classes * n_features,) or", "description": "(n_classes * (n_features + 1),)\nCoefficient vector." - } + }, + "refined_type": {} }, { "name": "X", @@ -104128,6 +111357,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "Training data." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -104138,7 +111371,8 @@ "docstring": { "type": "ndarray of shape (n_samples, n_classes)", "description": "Transformed labels according to the output of LabelBinarizer." - } + }, + "refined_type": {} }, { "name": "alpha", @@ -104148,7 +111382,8 @@ "docstring": { "type": "float", "description": "Regularization parameter. alpha is equal to 1 / C." - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -104158,13 +111393,14 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "Array of weights that are assigned to individual samples." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Computes multinomial loss and class probabilities.", - "docstring": "Computes multinomial loss and class probabilities.\n\nParameters\n----------\nw : ndarray of shape (n_classes * n_features,) or\n (n_classes * (n_features + 1),)\n Coefficient vector.\n\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training data.\n\nY : ndarray of shape (n_samples, n_classes)\n Transformed labels according to the output of LabelBinarizer.\n\nalpha : float\n Regularization parameter. alpha is equal to 1 / C.\n\nsample_weight : array-like of shape (n_samples,)\n Array of weights that are assigned to individual samples.\n\nReturns\n-------\nloss : float\n Multinomial loss.\n\np : ndarray of shape (n_samples, n_classes)\n Estimated class probabilities.\n\nw : ndarray of shape (n_classes, n_features)\n Reshaped param vector excluding intercept terms.\n\nReference\n---------\nBishop, C. M. (2006). Pattern recognition and machine learning.\nSpringer. (Chapter 4.3.4)", + "docstring": "Computes multinomial loss and class probabilities.\n\n Parameters\n ----------\n w : ndarray of shape (n_classes * n_features,) or\n (n_classes * (n_features + 1),)\n Coefficient vector.\n\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training data.\n\n Y : ndarray of shape (n_samples, n_classes)\n Transformed labels according to the output of LabelBinarizer.\n\n alpha : float\n Regularization parameter. 
alpha is equal to 1 / C.\n\n sample_weight : array-like of shape (n_samples,)\n Array of weights that are assigned to individual samples.\n\n Returns\n -------\n loss : float\n Multinomial loss.\n\n p : ndarray of shape (n_samples, n_classes)\n Estimated class probabilities.\n\n w : ndarray of shape (n_classes, n_features)\n Reshaped param vector excluding intercept terms.\n\n Reference\n ---------\n Bishop, C. M. (2006). Pattern recognition and machine learning.\n Springer. (Chapter 4.3.4)\n ", "source_code": "\ndef _multinomial_loss(w, X, Y, alpha, sample_weight):\n \"\"\"Computes multinomial loss and class probabilities.\n\n Parameters\n ----------\n w : ndarray of shape (n_classes * n_features,) or\n (n_classes * (n_features + 1),)\n Coefficient vector.\n\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training data.\n\n Y : ndarray of shape (n_samples, n_classes)\n Transformed labels according to the output of LabelBinarizer.\n\n alpha : float\n Regularization parameter. alpha is equal to 1 / C.\n\n sample_weight : array-like of shape (n_samples,)\n Array of weights that are assigned to individual samples.\n\n Returns\n -------\n loss : float\n Multinomial loss.\n\n p : ndarray of shape (n_samples, n_classes)\n Estimated class probabilities.\n\n w : ndarray of shape (n_classes, n_features)\n Reshaped param vector excluding intercept terms.\n\n Reference\n ---------\n Bishop, C. M. (2006). Pattern recognition and machine learning.\n Springer. (Chapter 4.3.4)\n \"\"\"\n n_classes = Y.shape[1]\n n_features = X.shape[1]\n fit_intercept = w.size == n_classes * (n_features + 1)\n w = w.reshape(n_classes, -1)\n sample_weight = sample_weight[:, np.newaxis]\n if fit_intercept:\n intercept = w[:, -1]\n w = w[:, :-1]\n else:\n intercept = 0\n p = safe_sparse_dot(X, w.T)\n p += intercept\n p -= logsumexp(p, axis=1)[:, np.newaxis]\n loss = -(sample_weight * Y * p).sum()\n loss += 0.5 * alpha * squared_norm(w)\n p = np.exp(p, p)\n return loss, p, w" }, { @@ -104182,7 +111418,8 @@ "docstring": { "type": "ndarray of shape (n_classes * n_features,) or", "description": "(n_classes * (n_features + 1),)\nCoefficient vector." - } + }, + "refined_type": {} }, { "name": "X", @@ -104192,6 +111429,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "Training data." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -104202,7 +111443,8 @@ "docstring": { "type": "ndarray of shape (n_samples, n_classes)", "description": "Transformed labels according to the output of LabelBinarizer." - } + }, + "refined_type": {} }, { "name": "alpha", @@ -104212,7 +111454,8 @@ "docstring": { "type": "float", "description": "Regularization parameter. alpha is equal to 1 / C." - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -104222,13 +111465,14 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "Array of weights that are assigned to individual samples." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Computes the multinomial loss, gradient and class probabilities.", - "docstring": "Computes the multinomial loss, gradient and class probabilities.\n\nParameters\n----------\nw : ndarray of shape (n_classes * n_features,) or\n (n_classes * (n_features + 1),)\n Coefficient vector.\n\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training data.\n\nY : ndarray of shape (n_samples, n_classes)\n Transformed labels according to the output of LabelBinarizer.\n\nalpha : float\n Regularization parameter. alpha is equal to 1 / C.\n\nsample_weight : array-like of shape (n_samples,)\n Array of weights that are assigned to individual samples.\n\nReturns\n-------\nloss : float\n Multinomial loss.\n\ngrad : ndarray of shape (n_classes * n_features,) or (n_classes * (n_features + 1),)\n Ravelled gradient of the multinomial loss.\n\np : ndarray of shape (n_samples, n_classes)\n Estimated class probabilities\n\nReference\n---------\nBishop, C. M. (2006). Pattern recognition and machine learning.\nSpringer. (Chapter 4.3.4)", + "docstring": "Computes the multinomial loss, gradient and class probabilities.\n\n Parameters\n ----------\n w : ndarray of shape (n_classes * n_features,) or\n (n_classes * (n_features + 1),)\n Coefficient vector.\n\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training data.\n\n Y : ndarray of shape (n_samples, n_classes)\n Transformed labels according to the output of LabelBinarizer.\n\n alpha : float\n Regularization parameter. alpha is equal to 1 / C.\n\n sample_weight : array-like of shape (n_samples,)\n Array of weights that are assigned to individual samples.\n\n Returns\n -------\n loss : float\n Multinomial loss.\n\n grad : ndarray of shape (n_classes * n_features,) or (n_classes * (n_features + 1),)\n Ravelled gradient of the multinomial loss.\n\n p : ndarray of shape (n_samples, n_classes)\n Estimated class probabilities\n\n Reference\n ---------\n Bishop, C. M. (2006). Pattern recognition and machine learning.\n Springer. (Chapter 4.3.4)\n ", "source_code": "\ndef _multinomial_loss_grad(w, X, Y, alpha, sample_weight):\n \"\"\"Computes the multinomial loss, gradient and class probabilities.\n\n Parameters\n ----------\n w : ndarray of shape (n_classes * n_features,) or\n (n_classes * (n_features + 1),)\n Coefficient vector.\n\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training data.\n\n Y : ndarray of shape (n_samples, n_classes)\n Transformed labels according to the output of LabelBinarizer.\n\n alpha : float\n Regularization parameter. alpha is equal to 1 / C.\n\n sample_weight : array-like of shape (n_samples,)\n Array of weights that are assigned to individual samples.\n\n Returns\n -------\n loss : float\n Multinomial loss.\n\n grad : ndarray of shape (n_classes * n_features,) or (n_classes * (n_features + 1),)\n Ravelled gradient of the multinomial loss.\n\n p : ndarray of shape (n_samples, n_classes)\n Estimated class probabilities\n\n Reference\n ---------\n Bishop, C. M. (2006). Pattern recognition and machine learning.\n Springer. 
(Chapter 4.3.4)\n \"\"\"\n n_classes = Y.shape[1]\n n_features = X.shape[1]\n fit_intercept = w.size == n_classes * (n_features + 1)\n grad = np.zeros((n_classes, n_features + bool(fit_intercept)), dtype=X.dtype)\n (loss, p, w) = _multinomial_loss(w, X, Y, alpha, sample_weight)\n sample_weight = sample_weight[:, np.newaxis]\n diff = sample_weight * (p - Y)\n grad[:, :n_features] = safe_sparse_dot(diff.T, X)\n grad[:, :n_features] += alpha * w\n if fit_intercept:\n grad[:, -1] = diff.sum(axis=0)\n return loss, grad.ravel(), p" }, { @@ -104246,7 +111490,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_nonzero_coefs", @@ -104256,7 +111501,8 @@ "docstring": { "type": "int, default=None", "description": "Desired number of non-zero entries in the solution. If None (by\ndefault) this value is set to 10% of n_features." - } + }, + "refined_type": {} }, { "name": "tol", @@ -104266,7 +111512,8 @@ "docstring": { "type": "float, default=None", "description": "Maximum norm of the residual. If not None, overrides n_nonzero_coefs." - } + }, + "refined_type": {} }, { "name": "fit_intercept", @@ -104276,7 +111523,8 @@ "docstring": { "type": "bool, default=True", "description": "Whether to calculate the intercept for this model. If set\nto false, no intercept will be used in calculations\n(i.e. data is expected to be centered)." - } + }, + "refined_type": {} }, { "name": "normalize", @@ -104286,7 +111534,8 @@ "docstring": { "type": "bool, default=True", "description": "This parameter is ignored when ``fit_intercept`` is set to False.\nIf True, the regressors X will be normalized before regression by\nsubtracting the mean and dividing by the l2-norm.\nIf you wish to standardize, please use\n:class:`~sklearn.preprocessing.StandardScaler` before calling ``fit``\non an estimator with ``normalize=False``.\n\n.. deprecated:: 1.0\n ``normalize`` was deprecated in version 1.0. It will default\n to False in 1.2 and be removed in 1.4." - } + }, + "refined_type": {} }, { "name": "precompute", @@ -104296,13 +111545,14 @@ "docstring": { "type": "'auto' or bool, default='auto'", "description": "Whether to use a precomputed Gram and Xy matrix to speed up\ncalculations. Improves performance when :term:`n_targets` or\n:term:`n_samples` is very large. Note that if you already have such\nmatrices, you can pass them directly to the fit method." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, *, n_nonzero_coefs=None, tol=None, fit_intercept=True, normalize='deprecated', precompute='auto'):\n self.n_nonzero_coefs = n_nonzero_coefs\n self.tol = tol\n self.fit_intercept = fit_intercept\n self.normalize = normalize\n self.precompute = precompute" }, { @@ -104320,7 +111570,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -104330,7 +111581,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "Training data." - } + }, + "refined_type": {} }, { "name": "y", @@ -104340,13 +111592,14 @@ "docstring": { "type": "array-like of shape (n_samples,) or (n_samples, n_targets)", "description": "Target values. Will be cast to X's dtype if necessary." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Fit the model using X, y as training data.", - "docstring": "Fit the model using X, y as training data.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n Training data.\n\ny : array-like of shape (n_samples,) or (n_samples, n_targets)\n Target values. Will be cast to X's dtype if necessary.\n\nReturns\n-------\nself : object\n Returns an instance of self.", + "docstring": "Fit the model using X, y as training data.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training data.\n\n y : array-like of shape (n_samples,) or (n_samples, n_targets)\n Target values. Will be cast to X's dtype if necessary.\n\n Returns\n -------\n self : object\n Returns an instance of self.\n ", "source_code": "\ndef fit(self, X, y):\n \"\"\"Fit the model using X, y as training data.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training data.\n\n y : array-like of shape (n_samples,) or (n_samples, n_targets)\n Target values. Will be cast to X's dtype if necessary.\n\n Returns\n -------\n self : object\n Returns an instance of self.\n \"\"\"\n _normalize = _deprecate_normalize(self.normalize, default=True, estimator_name=self.__class__.__name__)\n (X, y) = self._validate_data(X, y, multi_output=True, y_numeric=True)\n n_features = X.shape[1]\n (X, y, X_offset, y_offset, X_scale, Gram, Xy) = _pre_fit(X, y, None, self.precompute, _normalize, self.fit_intercept, copy=True)\n if y.ndim == 1:\n y = y[:, np.newaxis]\n if self.n_nonzero_coefs is None and self.tol is None:\n self.n_nonzero_coefs_ = max(int(0.1 * n_features), 1)\n else:\n self.n_nonzero_coefs_ = self.n_nonzero_coefs\n if Gram is False:\n (coef_, self.n_iter_) = orthogonal_mp(X, y, n_nonzero_coefs=self.n_nonzero_coefs_, tol=self.tol, precompute=False, copy_X=True, return_n_iter=True)\n else:\n norms_sq = np.sum(y**2, axis=0) if self.tol is not None else None\n (coef_, self.n_iter_) = orthogonal_mp_gram(Gram, Xy=Xy, n_nonzero_coefs=self.n_nonzero_coefs_, tol=self.tol, norms_squared=norms_sq, copy_Gram=True, copy_Xy=True, return_n_iter=True)\n self.coef_ = coef_.T\n self._set_intercept(X_offset, y_offset, X_scale)\n return self" }, { @@ -104364,7 +111617,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "copy", @@ -104374,7 +111628,8 @@ "docstring": { "type": "bool, default=True", "description": "Whether the design matrix X must be copied by the algorithm. A false\nvalue is only helpful if X is already Fortran-ordered, otherwise a\ncopy is made anyway." - } + }, + "refined_type": {} }, { "name": "fit_intercept", @@ -104384,7 +111639,8 @@ "docstring": { "type": "bool, default=True", "description": "Whether to calculate the intercept for this model. If set\nto false, no intercept will be used in calculations\n(i.e. data is expected to be centered)." - } + }, + "refined_type": {} }, { "name": "normalize", @@ -104394,7 +111650,8 @@ "docstring": { "type": "bool, default=True", "description": "This parameter is ignored when ``fit_intercept`` is set to False.\nIf True, the regressors X will be normalized before regression by\nsubtracting the mean and dividing by the l2-norm.\nIf you wish to standardize, please use\n:class:`~sklearn.preprocessing.StandardScaler` before calling ``fit``\non an estimator with ``normalize=False``.\n\n.. deprecated:: 1.0\n ``normalize`` was deprecated in version 1.0. 
It will default\n to False in 1.2 and be removed in 1.4." - } + }, + "refined_type": {} }, { "name": "max_iter", @@ -104404,7 +111661,8 @@ "docstring": { "type": "int, default=None", "description": "Maximum numbers of iterations to perform, therefore maximum features\nto include. 10% of ``n_features`` but at least 5 if available." - } + }, + "refined_type": {} }, { "name": "cv", @@ -104414,7 +111672,8 @@ "docstring": { "type": "int, cross-validation generator or iterable, default=None", "description": "Determines the cross-validation splitting strategy.\nPossible inputs for cv are:\n\n- None, to use the default 5-fold cross-validation,\n- integer, to specify the number of folds.\n- :term:`CV splitter`,\n- An iterable yielding (train, test) splits as arrays of indices.\n\nFor integer/None inputs, :class:`KFold` is used.\n\nRefer :ref:`User Guide ` for the various\ncross-validation strategies that can be used here.\n\n.. versionchanged:: 0.22\n ``cv`` default value if None changed from 3-fold to 5-fold." - } + }, + "refined_type": {} }, { "name": "n_jobs", @@ -104424,7 +111683,8 @@ "docstring": { "type": "int, default=None", "description": "Number of CPUs to use during the cross validation.\n``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n``-1`` means using all processors. See :term:`Glossary `\nfor more details." - } + }, + "refined_type": {} }, { "name": "verbose", @@ -104434,13 +111694,14 @@ "docstring": { "type": "bool or int, default=False", "description": "Sets the verbosity amount." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, *, copy=True, fit_intercept=True, normalize='deprecated', max_iter=None, cv=None, n_jobs=None, verbose=False):\n self.copy = copy\n self.fit_intercept = fit_intercept\n self.normalize = normalize\n self.max_iter = max_iter\n self.cv = cv\n self.n_jobs = n_jobs\n self.verbose = verbose" }, { @@ -104458,7 +111719,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -104468,7 +111730,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "Training data." - } + }, + "refined_type": {} }, { "name": "y", @@ -104478,13 +111741,14 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "Target values. Will be cast to X's dtype if necessary." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Fit the model using X, y as training data.", - "docstring": "Fit the model using X, y as training data.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n Training data.\n\ny : array-like of shape (n_samples,)\n Target values. Will be cast to X's dtype if necessary.\n\nReturns\n-------\nself : object\n Returns an instance of self.", + "docstring": "Fit the model using X, y as training data.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training data.\n\n y : array-like of shape (n_samples,)\n Target values. Will be cast to X's dtype if necessary.\n\n Returns\n -------\n self : object\n Returns an instance of self.\n ", "source_code": "\ndef fit(self, X, y):\n \"\"\"Fit the model using X, y as training data.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training data.\n\n y : array-like of shape (n_samples,)\n Target values. 
Will be cast to X's dtype if necessary.\n\n Returns\n -------\n self : object\n Returns an instance of self.\n \"\"\"\n _normalize = _deprecate_normalize(self.normalize, default=True, estimator_name=self.__class__.__name__)\n (X, y) = self._validate_data(X, y, y_numeric=True, ensure_min_features=2, estimator=self)\n X = as_float_array(X, copy=False, force_all_finite=False)\n cv = check_cv(self.cv, classifier=False)\n max_iter = min(max(int(0.1 * X.shape[1]), 5), X.shape[1]) if not self.max_iter else self.max_iter\n cv_paths = Parallel(n_jobs=self.n_jobs, verbose=self.verbose)((delayed(_omp_path_residues)(X[train], y[train], X[test], y[test], self.copy, self.fit_intercept, _normalize, max_iter) for (train, test) in cv.split(X)))\n min_early_stop = min((fold.shape[0] for fold in cv_paths))\n mse_folds = np.array([(fold[:min_early_stop]**2).mean(axis=1) for fold in cv_paths])\n best_n_nonzero_coefs = np.argmin(mse_folds.mean(axis=0)) + 1\n self.n_nonzero_coefs_ = best_n_nonzero_coefs\n omp = OrthogonalMatchingPursuit(n_nonzero_coefs=best_n_nonzero_coefs, fit_intercept=self.fit_intercept, normalize=_normalize)\n omp.fit(X, y)\n self.coef_ = omp.coef_\n self.intercept_ = omp.intercept_\n self.n_iter_ = omp.n_iter_\n return self" }, { @@ -104502,7 +111766,8 @@ "docstring": { "type": "ndarray of shape (n_samples, n_features)", "description": "Input dictionary. Columns are assumed to have unit norm." - } + }, + "refined_type": {} }, { "name": "y", @@ -104512,7 +111777,8 @@ "docstring": { "type": "ndarray of shape (n_samples,)", "description": "Input targets." - } + }, + "refined_type": {} }, { "name": "n_nonzero_coefs", @@ -104522,7 +111788,8 @@ "docstring": { "type": "int", "description": "Targeted number of non-zero elements." - } + }, + "refined_type": {} }, { "name": "tol", @@ -104532,7 +111799,8 @@ "docstring": { "type": "float, default=None", "description": "Targeted squared error, if not None overrides n_nonzero_coefs." - } + }, + "refined_type": {} }, { "name": "copy_X", @@ -104542,7 +111810,8 @@ "docstring": { "type": "bool, default=True", "description": "Whether the design matrix X must be copied by the algorithm. A false\nvalue is only helpful if X is already Fortran-ordered, otherwise a\ncopy is made anyway." - } + }, + "refined_type": {} }, { "name": "return_path", @@ -104552,13 +111821,14 @@ "docstring": { "type": "bool, default=False", "description": "Whether to return every value of the nonzero coefficients along the\nforward path. Useful for cross-validation." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Orthogonal Matching Pursuit step using the Cholesky decomposition.", - "docstring": "Orthogonal Matching Pursuit step using the Cholesky decomposition.\n\nParameters\n----------\nX : ndarray of shape (n_samples, n_features)\n Input dictionary. Columns are assumed to have unit norm.\n\ny : ndarray of shape (n_samples,)\n Input targets.\n\nn_nonzero_coefs : int\n Targeted number of non-zero elements.\n\ntol : float, default=None\n Targeted squared error, if not None overrides n_nonzero_coefs.\n\ncopy_X : bool, default=True\n Whether the design matrix X must be copied by the algorithm. A false\n value is only helpful if X is already Fortran-ordered, otherwise a\n copy is made anyway.\n\nreturn_path : bool, default=False\n Whether to return every value of the nonzero coefficients along the\n forward path. 
Useful for cross-validation.\n\nReturns\n-------\ngamma : ndarray of shape (n_nonzero_coefs,)\n Non-zero elements of the solution.\n\nidx : ndarray of shape (n_nonzero_coefs,)\n Indices of the positions of the elements in gamma within the solution\n vector.\n\ncoef : ndarray of shape (n_features, n_nonzero_coefs)\n The first k values of column k correspond to the coefficient value\n for the active features at that step. The lower left triangle contains\n garbage. Only returned if ``return_path=True``.\n\nn_active : int\n Number of active features at convergence.", + "docstring": "Orthogonal Matching Pursuit step using the Cholesky decomposition.\n\n Parameters\n ----------\n X : ndarray of shape (n_samples, n_features)\n Input dictionary. Columns are assumed to have unit norm.\n\n y : ndarray of shape (n_samples,)\n Input targets.\n\n n_nonzero_coefs : int\n Targeted number of non-zero elements.\n\n tol : float, default=None\n Targeted squared error, if not None overrides n_nonzero_coefs.\n\n copy_X : bool, default=True\n Whether the design matrix X must be copied by the algorithm. A false\n value is only helpful if X is already Fortran-ordered, otherwise a\n copy is made anyway.\n\n return_path : bool, default=False\n Whether to return every value of the nonzero coefficients along the\n forward path. Useful for cross-validation.\n\n Returns\n -------\n gamma : ndarray of shape (n_nonzero_coefs,)\n Non-zero elements of the solution.\n\n idx : ndarray of shape (n_nonzero_coefs,)\n Indices of the positions of the elements in gamma within the solution\n vector.\n\n coef : ndarray of shape (n_features, n_nonzero_coefs)\n The first k values of column k correspond to the coefficient value\n for the active features at that step. The lower left triangle contains\n garbage. Only returned if ``return_path=True``.\n\n n_active : int\n Number of active features at convergence.\n ", "source_code": "\ndef _cholesky_omp(X, y, n_nonzero_coefs, tol=None, copy_X=True, return_path=False):\n \"\"\"Orthogonal Matching Pursuit step using the Cholesky decomposition.\n\n Parameters\n ----------\n X : ndarray of shape (n_samples, n_features)\n Input dictionary. Columns are assumed to have unit norm.\n\n y : ndarray of shape (n_samples,)\n Input targets.\n\n n_nonzero_coefs : int\n Targeted number of non-zero elements.\n\n tol : float, default=None\n Targeted squared error, if not None overrides n_nonzero_coefs.\n\n copy_X : bool, default=True\n Whether the design matrix X must be copied by the algorithm. A false\n value is only helpful if X is already Fortran-ordered, otherwise a\n copy is made anyway.\n\n return_path : bool, default=False\n Whether to return every value of the nonzero coefficients along the\n forward path. Useful for cross-validation.\n\n Returns\n -------\n gamma : ndarray of shape (n_nonzero_coefs,)\n Non-zero elements of the solution.\n\n idx : ndarray of shape (n_nonzero_coefs,)\n Indices of the positions of the elements in gamma within the solution\n vector.\n\n coef : ndarray of shape (n_features, n_nonzero_coefs)\n The first k values of column k correspond to the coefficient value\n for the active features at that step. The lower left triangle contains\n garbage. 
Only returned if ``return_path=True``.\n\n n_active : int\n Number of active features at convergence.\n \"\"\"\n if copy_X:\n X = X.copy('F')\n else:\n X = np.asfortranarray(X)\n min_float = np.finfo(X.dtype).eps\n (nrm2, swap) = linalg.get_blas_funcs(('nrm2', 'swap'), (X, ))\n (potrs, ) = get_lapack_funcs(('potrs', ), (X, ))\n alpha = np.dot(X.T, y)\n residual = y\n gamma = np.empty(0)\n n_active = 0\n indices = np.arange(X.shape[1])\n max_features = X.shape[1] if tol is not None else n_nonzero_coefs\n L = np.empty((max_features, max_features), dtype=X.dtype)\n if return_path:\n coefs = np.empty_like(L)\n while True:\n lam = np.argmax(np.abs(np.dot(X.T, residual)))\n if lam < n_active or alpha[lam]**2 < min_float:\n warnings.warn(premature, RuntimeWarning, stacklevel=2)\n break\n if n_active > 0:\n L[n_active, :n_active] = np.dot(X[:, :n_active].T, X[:, lam])\n linalg.solve_triangular(L[:n_active, :n_active], L[n_active, :n_active], trans=0, lower=1, overwrite_b=True, check_finite=False)\n v = nrm2(L[n_active, :n_active])**2\n Lkk = linalg.norm(X[:, lam])**2 - v\n if Lkk <= min_float:\n warnings.warn(premature, RuntimeWarning, stacklevel=2)\n break\n L[n_active, n_active] = sqrt(Lkk)\n else:\n L[0, 0] = linalg.norm(X[:, lam])\n (X.T[n_active], X.T[lam]) = swap(X.T[n_active], X.T[lam])\n (alpha[n_active], alpha[lam]) = (alpha[lam], alpha[n_active])\n (indices[n_active], indices[lam]) = (indices[lam], indices[n_active])\n n_active += 1\n (gamma, _) = potrs(L[:n_active, :n_active], alpha[:n_active], lower=True, overwrite_b=False)\n if return_path:\n coefs[:n_active, n_active - 1] = gamma\n residual = y - np.dot(X[:, :n_active], gamma)\n if tol is not None and nrm2(residual)**2 <= tol:\n break\n elif n_active == max_features:\n break\n if return_path:\n return gamma, indices[:n_active], coefs[:, :n_active], n_active\n else:\n return gamma, indices[:n_active], n_active" }, { @@ -104576,7 +111846,8 @@ "docstring": { "type": "ndarray of shape (n_features, n_features)", "description": "Gram matrix of the input data matrix." - } + }, + "refined_type": {} }, { "name": "Xy", @@ -104586,7 +111857,8 @@ "docstring": { "type": "ndarray of shape (n_features,)", "description": "Input targets." - } + }, + "refined_type": {} }, { "name": "n_nonzero_coefs", @@ -104596,7 +111868,8 @@ "docstring": { "type": "int", "description": "Targeted number of non-zero elements." - } + }, + "refined_type": {} }, { "name": "tol_0", @@ -104606,7 +111879,8 @@ "docstring": { "type": "float, default=None", "description": "Squared norm of y, required if tol is not None." - } + }, + "refined_type": {} }, { "name": "tol", @@ -104616,7 +111890,8 @@ "docstring": { "type": "float, default=None", "description": "Targeted squared error, if not None overrides n_nonzero_coefs." - } + }, + "refined_type": {} }, { "name": "copy_Gram", @@ -104626,7 +111901,8 @@ "docstring": { "type": "bool, default=True", "description": "Whether the gram matrix must be copied by the algorithm. A false\nvalue is only helpful if it is already Fortran-ordered, otherwise a\ncopy is made anyway." - } + }, + "refined_type": {} }, { "name": "copy_Xy", @@ -104636,7 +111912,8 @@ "docstring": { "type": "bool, default=True", "description": "Whether the covariance vector Xy must be copied by the algorithm.\nIf False, it may be overwritten." - } + }, + "refined_type": {} }, { "name": "return_path", @@ -104646,13 +111923,14 @@ "docstring": { "type": "bool, default=False", "description": "Whether to return every value of the nonzero coefficients along the\nforward path. 
Useful for cross-validation." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Orthogonal Matching Pursuit step on a precomputed Gram matrix.\n\nThis function uses the Cholesky decomposition method.", - "docstring": "Orthogonal Matching Pursuit step on a precomputed Gram matrix.\n\nThis function uses the Cholesky decomposition method.\n\nParameters\n----------\nGram : ndarray of shape (n_features, n_features)\n Gram matrix of the input data matrix.\n\nXy : ndarray of shape (n_features,)\n Input targets.\n\nn_nonzero_coefs : int\n Targeted number of non-zero elements.\n\ntol_0 : float, default=None\n Squared norm of y, required if tol is not None.\n\ntol : float, default=None\n Targeted squared error, if not None overrides n_nonzero_coefs.\n\ncopy_Gram : bool, default=True\n Whether the gram matrix must be copied by the algorithm. A false\n value is only helpful if it is already Fortran-ordered, otherwise a\n copy is made anyway.\n\ncopy_Xy : bool, default=True\n Whether the covariance vector Xy must be copied by the algorithm.\n If False, it may be overwritten.\n\nreturn_path : bool, default=False\n Whether to return every value of the nonzero coefficients along the\n forward path. Useful for cross-validation.\n\nReturns\n-------\ngamma : ndarray of shape (n_nonzero_coefs,)\n Non-zero elements of the solution.\n\nidx : ndarray of shape (n_nonzero_coefs,)\n Indices of the positions of the elements in gamma within the solution\n vector.\n\ncoefs : ndarray of shape (n_features, n_nonzero_coefs)\n The first k values of column k correspond to the coefficient value\n for the active features at that step. The lower left triangle contains\n garbage. Only returned if ``return_path=True``.\n\nn_active : int\n Number of active features at convergence.", + "docstring": "Orthogonal Matching Pursuit step on a precomputed Gram matrix.\n\n This function uses the Cholesky decomposition method.\n\n Parameters\n ----------\n Gram : ndarray of shape (n_features, n_features)\n Gram matrix of the input data matrix.\n\n Xy : ndarray of shape (n_features,)\n Input targets.\n\n n_nonzero_coefs : int\n Targeted number of non-zero elements.\n\n tol_0 : float, default=None\n Squared norm of y, required if tol is not None.\n\n tol : float, default=None\n Targeted squared error, if not None overrides n_nonzero_coefs.\n\n copy_Gram : bool, default=True\n Whether the gram matrix must be copied by the algorithm. A false\n value is only helpful if it is already Fortran-ordered, otherwise a\n copy is made anyway.\n\n copy_Xy : bool, default=True\n Whether the covariance vector Xy must be copied by the algorithm.\n If False, it may be overwritten.\n\n return_path : bool, default=False\n Whether to return every value of the nonzero coefficients along the\n forward path. Useful for cross-validation.\n\n Returns\n -------\n gamma : ndarray of shape (n_nonzero_coefs,)\n Non-zero elements of the solution.\n\n idx : ndarray of shape (n_nonzero_coefs,)\n Indices of the positions of the elements in gamma within the solution\n vector.\n\n coefs : ndarray of shape (n_features, n_nonzero_coefs)\n The first k values of column k correspond to the coefficient value\n for the active features at that step. The lower left triangle contains\n garbage. 
Only returned if ``return_path=True``.\n\n n_active : int\n Number of active features at convergence.\n ", "source_code": "\ndef _gram_omp(Gram, Xy, n_nonzero_coefs, tol_0=None, tol=None, copy_Gram=True, copy_Xy=True, return_path=False):\n \"\"\"Orthogonal Matching Pursuit step on a precomputed Gram matrix.\n\n This function uses the Cholesky decomposition method.\n\n Parameters\n ----------\n Gram : ndarray of shape (n_features, n_features)\n Gram matrix of the input data matrix.\n\n Xy : ndarray of shape (n_features,)\n Input targets.\n\n n_nonzero_coefs : int\n Targeted number of non-zero elements.\n\n tol_0 : float, default=None\n Squared norm of y, required if tol is not None.\n\n tol : float, default=None\n Targeted squared error, if not None overrides n_nonzero_coefs.\n\n copy_Gram : bool, default=True\n Whether the gram matrix must be copied by the algorithm. A false\n value is only helpful if it is already Fortran-ordered, otherwise a\n copy is made anyway.\n\n copy_Xy : bool, default=True\n Whether the covariance vector Xy must be copied by the algorithm.\n If False, it may be overwritten.\n\n return_path : bool, default=False\n Whether to return every value of the nonzero coefficients along the\n forward path. Useful for cross-validation.\n\n Returns\n -------\n gamma : ndarray of shape (n_nonzero_coefs,)\n Non-zero elements of the solution.\n\n idx : ndarray of shape (n_nonzero_coefs,)\n Indices of the positions of the elements in gamma within the solution\n vector.\n\n coefs : ndarray of shape (n_features, n_nonzero_coefs)\n The first k values of column k correspond to the coefficient value\n for the active features at that step. The lower left triangle contains\n garbage. Only returned if ``return_path=True``.\n\n n_active : int\n Number of active features at convergence.\n \"\"\"\n Gram = Gram.copy('F') if copy_Gram else np.asfortranarray(Gram)\n if copy_Xy or not Xy.flags.writeable:\n Xy = Xy.copy()\n min_float = np.finfo(Gram.dtype).eps\n (nrm2, swap) = linalg.get_blas_funcs(('nrm2', 'swap'), (Gram, ))\n (potrs, ) = get_lapack_funcs(('potrs', ), (Gram, ))\n indices = np.arange(len(Gram))\n alpha = Xy\n tol_curr = tol_0\n delta = 0\n gamma = np.empty(0)\n n_active = 0\n max_features = len(Gram) if tol is not None else n_nonzero_coefs\n L = np.empty((max_features, max_features), dtype=Gram.dtype)\n L[0, 0] = 1.0\n if return_path:\n coefs = np.empty_like(L)\n while True:\n lam = np.argmax(np.abs(alpha))\n if lam < n_active or alpha[lam]**2 < min_float:\n warnings.warn(premature, RuntimeWarning, stacklevel=3)\n break\n if n_active > 0:\n L[n_active, :n_active] = Gram[lam, :n_active]\n linalg.solve_triangular(L[:n_active, :n_active], L[n_active, :n_active], trans=0, lower=1, overwrite_b=True, check_finite=False)\n v = nrm2(L[n_active, :n_active])**2\n Lkk = Gram[lam, lam] - v\n if Lkk <= min_float:\n warnings.warn(premature, RuntimeWarning, stacklevel=3)\n break\n L[n_active, n_active] = sqrt(Lkk)\n else:\n L[0, 0] = sqrt(Gram[lam, lam])\n (Gram[n_active], Gram[lam]) = swap(Gram[n_active], Gram[lam])\n (Gram.T[n_active], Gram.T[lam]) = swap(Gram.T[n_active], Gram.T[lam])\n (indices[n_active], indices[lam]) = (indices[lam], indices[n_active])\n (Xy[n_active], Xy[lam]) = (Xy[lam], Xy[n_active])\n n_active += 1\n (gamma, _) = potrs(L[:n_active, :n_active], Xy[:n_active], lower=True, overwrite_b=False)\n if return_path:\n coefs[:n_active, n_active - 1] = gamma\n beta = np.dot(Gram[:, :n_active], gamma)\n alpha = Xy - beta\n if tol is not None:\n tol_curr += delta\n delta = 
np.inner(gamma, beta[:n_active])\n tol_curr -= delta\n if abs(tol_curr) <= tol:\n break\n elif n_active == max_features:\n break\n if return_path:\n return gamma, indices[:n_active], coefs[:, :n_active], n_active\n else:\n return gamma, indices[:n_active], n_active" }, { @@ -104670,7 +111948,8 @@ "docstring": { "type": "ndarray of shape (n_samples, n_features)", "description": "The data to fit the LARS on." - } + }, + "refined_type": {} }, { "name": "y_train", @@ -104680,7 +111959,8 @@ "docstring": { "type": "ndarray of shape (n_samples)", "description": "The target variable to fit LARS on." - } + }, + "refined_type": {} }, { "name": "X_test", @@ -104690,7 +111970,8 @@ "docstring": { "type": "ndarray of shape (n_samples, n_features)", "description": "The data to compute the residues on." - } + }, + "refined_type": {} }, { "name": "y_test", @@ -104700,7 +111981,8 @@ "docstring": { "type": "ndarray of shape (n_samples)", "description": "The target variable to compute the residues on." - } + }, + "refined_type": {} }, { "name": "copy", @@ -104710,7 +111992,8 @@ "docstring": { "type": "bool, default=True", "description": "Whether X_train, X_test, y_train and y_test should be copied. If\nFalse, they may be overwritten." - } + }, + "refined_type": {} }, { "name": "fit_intercept", @@ -104720,7 +112003,8 @@ "docstring": { "type": "bool, default=True", "description": "Whether to calculate the intercept for this model. If set\nto false, no intercept will be used in calculations\n(i.e. data is expected to be centered)." - } + }, + "refined_type": {} }, { "name": "normalize", @@ -104730,7 +112014,8 @@ "docstring": { "type": "bool, default=True", "description": "This parameter is ignored when ``fit_intercept`` is set to False.\nIf True, the regressors X will be normalized before regression by\nsubtracting the mean and dividing by the l2-norm.\nIf you wish to standardize, please use\n:class:`~sklearn.preprocessing.StandardScaler` before calling ``fit``\non an estimator with ``normalize=False``.\n\n.. deprecated:: 1.0\n ``normalize`` was deprecated in version 1.0. It will default\n to False in 1.2 and be removed in 1.4." - } + }, + "refined_type": {} }, { "name": "max_iter", @@ -104740,13 +112025,14 @@ "docstring": { "type": "int, default=100", "description": "Maximum numbers of iterations to perform, therefore maximum features\nto include. 100 by default." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Compute the residues on left-out data for a full LARS path.", - "docstring": "Compute the residues on left-out data for a full LARS path.\n\nParameters\n----------\nX_train : ndarray of shape (n_samples, n_features)\n The data to fit the LARS on.\n\ny_train : ndarray of shape (n_samples)\n The target variable to fit LARS on.\n\nX_test : ndarray of shape (n_samples, n_features)\n The data to compute the residues on.\n\ny_test : ndarray of shape (n_samples)\n The target variable to compute the residues on.\n\ncopy : bool, default=True\n Whether X_train, X_test, y_train and y_test should be copied. If\n False, they may be overwritten.\n\nfit_intercept : bool, default=True\n Whether to calculate the intercept for this model. If set\n to false, no intercept will be used in calculations\n (i.e. 
data is expected to be centered).\n\nnormalize : bool, default=True\n This parameter is ignored when ``fit_intercept`` is set to False.\n If True, the regressors X will be normalized before regression by\n subtracting the mean and dividing by the l2-norm.\n If you wish to standardize, please use\n :class:`~sklearn.preprocessing.StandardScaler` before calling ``fit``\n on an estimator with ``normalize=False``.\n\n .. deprecated:: 1.0\n ``normalize`` was deprecated in version 1.0. It will default\n to False in 1.2 and be removed in 1.4.\n\nmax_iter : int, default=100\n Maximum numbers of iterations to perform, therefore maximum features\n to include. 100 by default.\n\nReturns\n-------\nresidues : ndarray of shape (n_samples, max_features)\n Residues of the prediction on the test data.", + "docstring": "Compute the residues on left-out data for a full LARS path.\n\n Parameters\n ----------\n X_train : ndarray of shape (n_samples, n_features)\n The data to fit the LARS on.\n\n y_train : ndarray of shape (n_samples)\n The target variable to fit LARS on.\n\n X_test : ndarray of shape (n_samples, n_features)\n The data to compute the residues on.\n\n y_test : ndarray of shape (n_samples)\n The target variable to compute the residues on.\n\n copy : bool, default=True\n Whether X_train, X_test, y_train and y_test should be copied. If\n False, they may be overwritten.\n\n fit_intercept : bool, default=True\n Whether to calculate the intercept for this model. If set\n to false, no intercept will be used in calculations\n (i.e. data is expected to be centered).\n\n normalize : bool, default=True\n This parameter is ignored when ``fit_intercept`` is set to False.\n If True, the regressors X will be normalized before regression by\n subtracting the mean and dividing by the l2-norm.\n If you wish to standardize, please use\n :class:`~sklearn.preprocessing.StandardScaler` before calling ``fit``\n on an estimator with ``normalize=False``.\n\n .. deprecated:: 1.0\n ``normalize`` was deprecated in version 1.0. It will default\n to False in 1.2 and be removed in 1.4.\n\n max_iter : int, default=100\n Maximum numbers of iterations to perform, therefore maximum features\n to include. 100 by default.\n\n Returns\n -------\n residues : ndarray of shape (n_samples, max_features)\n Residues of the prediction on the test data.\n ", "source_code": "\ndef _omp_path_residues(X_train, y_train, X_test, y_test, copy=True, fit_intercept=True, normalize=True, max_iter=100):\n \"\"\"Compute the residues on left-out data for a full LARS path.\n\n Parameters\n ----------\n X_train : ndarray of shape (n_samples, n_features)\n The data to fit the LARS on.\n\n y_train : ndarray of shape (n_samples)\n The target variable to fit LARS on.\n\n X_test : ndarray of shape (n_samples, n_features)\n The data to compute the residues on.\n\n y_test : ndarray of shape (n_samples)\n The target variable to compute the residues on.\n\n copy : bool, default=True\n Whether X_train, X_test, y_train and y_test should be copied. If\n False, they may be overwritten.\n\n fit_intercept : bool, default=True\n Whether to calculate the intercept for this model. If set\n to false, no intercept will be used in calculations\n (i.e. 
data is expected to be centered).\n\n normalize : bool, default=True\n This parameter is ignored when ``fit_intercept`` is set to False.\n If True, the regressors X will be normalized before regression by\n subtracting the mean and dividing by the l2-norm.\n If you wish to standardize, please use\n :class:`~sklearn.preprocessing.StandardScaler` before calling ``fit``\n on an estimator with ``normalize=False``.\n\n .. deprecated:: 1.0\n ``normalize`` was deprecated in version 1.0. It will default\n to False in 1.2 and be removed in 1.4.\n\n max_iter : int, default=100\n Maximum numbers of iterations to perform, therefore maximum features\n to include. 100 by default.\n\n Returns\n -------\n residues : ndarray of shape (n_samples, max_features)\n Residues of the prediction on the test data.\n \"\"\"\n if copy:\n X_train = X_train.copy()\n y_train = y_train.copy()\n X_test = X_test.copy()\n y_test = y_test.copy()\n if fit_intercept:\n X_mean = X_train.mean(axis=0)\n X_train -= X_mean\n X_test -= X_mean\n y_mean = y_train.mean(axis=0)\n y_train = as_float_array(y_train, copy=False)\n y_train -= y_mean\n y_test = as_float_array(y_test, copy=False)\n y_test -= y_mean\n if normalize:\n norms = np.sqrt(np.sum(X_train**2, axis=0))\n nonzeros = np.flatnonzero(norms)\n X_train[:, nonzeros] /= norms[nonzeros]\n coefs = orthogonal_mp(X_train, y_train, n_nonzero_coefs=max_iter, tol=None, precompute=False, copy_X=False, return_path=True)\n if coefs.ndim == 1:\n coefs = coefs[:, np.newaxis]\n if normalize:\n coefs[nonzeros] /= norms[nonzeros][:, np.newaxis]\n return np.dot(coefs.T, X_test.T) - y_test" }, { @@ -104764,7 +112050,8 @@ "docstring": { "type": "ndarray of shape (n_samples, n_features)", "description": "Input data. Columns are assumed to have unit norm." - } + }, + "refined_type": {} }, { "name": "y", @@ -104774,7 +112061,8 @@ "docstring": { "type": "ndarray of shape (n_samples,) or (n_samples, n_targets)", "description": "Input targets." - } + }, + "refined_type": {} }, { "name": "n_nonzero_coefs", @@ -104784,7 +112072,8 @@ "docstring": { "type": "int, default=None", "description": "Desired number of non-zero entries in the solution. If None (by\ndefault) this value is set to 10% of n_features." - } + }, + "refined_type": {} }, { "name": "tol", @@ -104794,7 +112083,8 @@ "docstring": { "type": "float, default=None", "description": "Maximum norm of the residual. If not None, overrides n_nonzero_coefs." - } + }, + "refined_type": {} }, { "name": "precompute", @@ -104804,7 +112094,8 @@ "docstring": { "type": "'auto' or bool, default=False", "description": "Whether to perform precomputations. Improves performance when n_targets\nor n_samples is very large." - } + }, + "refined_type": {} }, { "name": "copy_X", @@ -104814,7 +112105,8 @@ "docstring": { "type": "bool, default=True", "description": "Whether the design matrix X must be copied by the algorithm. A false\nvalue is only helpful if X is already Fortran-ordered, otherwise a\ncopy is made anyway." - } + }, + "refined_type": {} }, { "name": "return_path", @@ -104824,7 +112116,8 @@ "docstring": { "type": "bool, default=False", "description": "Whether to return every value of the nonzero coefficients along the\nforward path. Useful for cross-validation." - } + }, + "refined_type": {} }, { "name": "return_n_iter", @@ -104834,13 +112127,14 @@ "docstring": { "type": "bool, default=False", "description": "Whether or not to return the number of iterations." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Orthogonal Matching Pursuit (OMP).\n\nSolves n_targets Orthogonal Matching Pursuit problems. An instance of the problem has the form: When parametrized by the number of non-zero coefficients using `n_nonzero_coefs`: argmin ||y - X\\gamma||^2 subject to ||\\gamma||_0 <= n_{nonzero coefs} When parametrized by error using the parameter `tol`: argmin ||\\gamma||_0 subject to ||y - X\\gamma||^2 <= tol Read more in the :ref:`User Guide `.", - "docstring": "Orthogonal Matching Pursuit (OMP).\n\nSolves n_targets Orthogonal Matching Pursuit problems.\nAn instance of the problem has the form:\n\nWhen parametrized by the number of non-zero coefficients using\n`n_nonzero_coefs`:\nargmin ||y - X\\gamma||^2 subject to ||\\gamma||_0 <= n_{nonzero coefs}\n\nWhen parametrized by error using the parameter `tol`:\nargmin ||\\gamma||_0 subject to ||y - X\\gamma||^2 <= tol\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\nX : ndarray of shape (n_samples, n_features)\n Input data. Columns are assumed to have unit norm.\n\ny : ndarray of shape (n_samples,) or (n_samples, n_targets)\n Input targets.\n\nn_nonzero_coefs : int, default=None\n Desired number of non-zero entries in the solution. If None (by\n default) this value is set to 10% of n_features.\n\ntol : float, default=None\n Maximum norm of the residual. If not None, overrides n_nonzero_coefs.\n\nprecompute : 'auto' or bool, default=False\n Whether to perform precomputations. Improves performance when n_targets\n or n_samples is very large.\n\ncopy_X : bool, default=True\n Whether the design matrix X must be copied by the algorithm. A false\n value is only helpful if X is already Fortran-ordered, otherwise a\n copy is made anyway.\n\nreturn_path : bool, default=False\n Whether to return every value of the nonzero coefficients along the\n forward path. Useful for cross-validation.\n\nreturn_n_iter : bool, default=False\n Whether or not to return the number of iterations.\n\nReturns\n-------\ncoef : ndarray of shape (n_features,) or (n_features, n_targets)\n Coefficients of the OMP solution. If `return_path=True`, this contains\n the whole coefficient path. In this case its shape is\n (n_features, n_features) or (n_features, n_targets, n_features) and\n iterating over the last axis yields coefficients in increasing order\n of active features.\n\nn_iters : array-like or int\n Number of active features across every target. Returned only if\n `return_n_iter` is set to True.\n\nSee Also\n--------\nOrthogonalMatchingPursuit\northogonal_mp_gram\nlars_path\nsklearn.decomposition.sparse_encode\n\nNotes\n-----\nOrthogonal matching pursuit was introduced in S. Mallat, Z. Zhang,\nMatching pursuits with time-frequency dictionaries, IEEE Transactions on\nSignal Processing, Vol. 41, No. 12. (December 1993), pp. 3397-3415.\n(http://blanche.polytechnique.fr/~mallat/papiers/MallatPursuit93.pdf)\n\nThis implementation is based on Rubinstein, R., Zibulevsky, M. 
and Elad,\nM., Efficient Implementation of the K-SVD Algorithm using Batch Orthogonal\nMatching Pursuit Technical Report - CS Technion, April 2008.\nhttps://www.cs.technion.ac.il/~ronrubin/Publications/KSVD-OMP-v2.pdf", + "description": "Orthogonal Matching Pursuit (OMP).\n\nSolves n_targets Orthogonal Matching Pursuit problems.\nAn instance of the problem has the form:\n\nWhen parametrized by the number of non-zero coefficients using\n`n_nonzero_coefs`:\nargmin ||y - X\\gamma||^2 subject to ||\\gamma||_0 <= n_{nonzero coefs}\n\nWhen parametrized by error using the parameter `tol`:\nargmin ||\\gamma||_0 subject to ||y - X\\gamma||^2 <= tol\n\nRead more in the :ref:`User Guide `.", + "docstring": "Orthogonal Matching Pursuit (OMP).\n\n Solves n_targets Orthogonal Matching Pursuit problems.\n An instance of the problem has the form:\n\n When parametrized by the number of non-zero coefficients using\n `n_nonzero_coefs`:\n argmin ||y - X\\gamma||^2 subject to ||\\gamma||_0 <= n_{nonzero coefs}\n\n When parametrized by error using the parameter `tol`:\n argmin ||\\gamma||_0 subject to ||y - X\\gamma||^2 <= tol\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n X : ndarray of shape (n_samples, n_features)\n Input data. Columns are assumed to have unit norm.\n\n y : ndarray of shape (n_samples,) or (n_samples, n_targets)\n Input targets.\n\n n_nonzero_coefs : int, default=None\n Desired number of non-zero entries in the solution. If None (by\n default) this value is set to 10% of n_features.\n\n tol : float, default=None\n Maximum norm of the residual. If not None, overrides n_nonzero_coefs.\n\n precompute : 'auto' or bool, default=False\n Whether to perform precomputations. Improves performance when n_targets\n or n_samples is very large.\n\n copy_X : bool, default=True\n Whether the design matrix X must be copied by the algorithm. A false\n value is only helpful if X is already Fortran-ordered, otherwise a\n copy is made anyway.\n\n return_path : bool, default=False\n Whether to return every value of the nonzero coefficients along the\n forward path. Useful for cross-validation.\n\n return_n_iter : bool, default=False\n Whether or not to return the number of iterations.\n\n Returns\n -------\n coef : ndarray of shape (n_features,) or (n_features, n_targets)\n Coefficients of the OMP solution. If `return_path=True`, this contains\n the whole coefficient path. In this case its shape is\n (n_features, n_features) or (n_features, n_targets, n_features) and\n iterating over the last axis yields coefficients in increasing order\n of active features.\n\n n_iters : array-like or int\n Number of active features across every target. Returned only if\n `return_n_iter` is set to True.\n\n See Also\n --------\n OrthogonalMatchingPursuit\n orthogonal_mp_gram\n lars_path\n sklearn.decomposition.sparse_encode\n\n Notes\n -----\n Orthogonal matching pursuit was introduced in S. Mallat, Z. Zhang,\n Matching pursuits with time-frequency dictionaries, IEEE Transactions on\n Signal Processing, Vol. 41, No. 12. (December 1993), pp. 3397-3415.\n (http://blanche.polytechnique.fr/~mallat/papiers/MallatPursuit93.pdf)\n\n This implementation is based on Rubinstein, R., Zibulevsky, M. 
and Elad,\n M., Efficient Implementation of the K-SVD Algorithm using Batch Orthogonal\n Matching Pursuit Technical Report - CS Technion, April 2008.\n https://www.cs.technion.ac.il/~ronrubin/Publications/KSVD-OMP-v2.pdf\n\n ", "source_code": "\ndef orthogonal_mp(X, y, *, n_nonzero_coefs=None, tol=None, precompute=False, copy_X=True, return_path=False, return_n_iter=False):\n \"\"\"Orthogonal Matching Pursuit (OMP).\n\n Solves n_targets Orthogonal Matching Pursuit problems.\n An instance of the problem has the form:\n\n When parametrized by the number of non-zero coefficients using\n `n_nonzero_coefs`:\n argmin ||y - X\\gamma||^2 subject to ||\\gamma||_0 <= n_{nonzero coefs}\n\n When parametrized by error using the parameter `tol`:\n argmin ||\\gamma||_0 subject to ||y - X\\gamma||^2 <= tol\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n X : ndarray of shape (n_samples, n_features)\n Input data. Columns are assumed to have unit norm.\n\n y : ndarray of shape (n_samples,) or (n_samples, n_targets)\n Input targets.\n\n n_nonzero_coefs : int, default=None\n Desired number of non-zero entries in the solution. If None (by\n default) this value is set to 10% of n_features.\n\n tol : float, default=None\n Maximum norm of the residual. If not None, overrides n_nonzero_coefs.\n\n precompute : 'auto' or bool, default=False\n Whether to perform precomputations. Improves performance when n_targets\n or n_samples is very large.\n\n copy_X : bool, default=True\n Whether the design matrix X must be copied by the algorithm. A false\n value is only helpful if X is already Fortran-ordered, otherwise a\n copy is made anyway.\n\n return_path : bool, default=False\n Whether to return every value of the nonzero coefficients along the\n forward path. Useful for cross-validation.\n\n return_n_iter : bool, default=False\n Whether or not to return the number of iterations.\n\n Returns\n -------\n coef : ndarray of shape (n_features,) or (n_features, n_targets)\n Coefficients of the OMP solution. If `return_path=True`, this contains\n the whole coefficient path. In this case its shape is\n (n_features, n_features) or (n_features, n_targets, n_features) and\n iterating over the last axis yields coefficients in increasing order\n of active features.\n\n n_iters : array-like or int\n Number of active features across every target. Returned only if\n `return_n_iter` is set to True.\n\n See Also\n --------\n OrthogonalMatchingPursuit\n orthogonal_mp_gram\n lars_path\n sklearn.decomposition.sparse_encode\n\n Notes\n -----\n Orthogonal matching pursuit was introduced in S. Mallat, Z. Zhang,\n Matching pursuits with time-frequency dictionaries, IEEE Transactions on\n Signal Processing, Vol. 41, No. 12. (December 1993), pp. 3397-3415.\n (http://blanche.polytechnique.fr/~mallat/papiers/MallatPursuit93.pdf)\n\n This implementation is based on Rubinstein, R., Zibulevsky, M. 
and Elad,\n M., Efficient Implementation of the K-SVD Algorithm using Batch Orthogonal\n Matching Pursuit Technical Report - CS Technion, April 2008.\n https://www.cs.technion.ac.il/~ronrubin/Publications/KSVD-OMP-v2.pdf\n\n \"\"\"\n X = check_array(X, order='F', copy=copy_X)\n copy_X = False\n if y.ndim == 1:\n y = y.reshape(-1, 1)\n y = check_array(y)\n if y.shape[1] > 1:\n copy_X = True\n if n_nonzero_coefs is None and tol is None:\n n_nonzero_coefs = max(int(0.1 * X.shape[1]), 1)\n if tol is not None and tol < 0:\n raise ValueError('Epsilon cannot be negative')\n if tol is None and n_nonzero_coefs <= 0:\n raise ValueError('The number of atoms must be positive')\n if tol is None and n_nonzero_coefs > X.shape[1]:\n raise ValueError('The number of atoms cannot be more than the number of features')\n if precompute == 'auto':\n precompute = X.shape[0] > X.shape[1]\n if precompute:\n G = np.dot(X.T, X)\n G = np.asfortranarray(G)\n Xy = np.dot(X.T, y)\n if tol is not None:\n norms_squared = np.sum(y**2, axis=0)\n else:\n norms_squared = None\n return orthogonal_mp_gram(G, Xy, n_nonzero_coefs=n_nonzero_coefs, tol=tol, norms_squared=norms_squared, copy_Gram=copy_X, copy_Xy=False, return_path=return_path)\n if return_path:\n coef = np.zeros((X.shape[1], y.shape[1], X.shape[1]))\n else:\n coef = np.zeros((X.shape[1], y.shape[1]))\n n_iters = []\n for k in range(y.shape[1]):\n out = _cholesky_omp(X, y[:, k], n_nonzero_coefs, tol, copy_X=copy_X, return_path=return_path)\n if return_path:\n (_, idx, coefs, n_iter) = out\n coef = coef[:, :, :len(idx)]\n for (n_active, x) in enumerate(coefs.T):\n coef[idx[:n_active + 1], k, n_active] = x[:n_active + 1]\n else:\n (x, idx, n_iter) = out\n coef[idx, k] = x\n n_iters.append(n_iter)\n if y.shape[1] == 1:\n n_iters = n_iters[0]\n if return_n_iter:\n return np.squeeze(coef), n_iters\n else:\n return np.squeeze(coef)" }, { @@ -104858,7 +112152,8 @@ "docstring": { "type": "ndarray of shape (n_features, n_features)", "description": "Gram matrix of the input data: X.T * X." - } + }, + "refined_type": {} }, { "name": "Xy", @@ -104868,7 +112163,8 @@ "docstring": { "type": "ndarray of shape (n_features,) or (n_features, n_targets)", "description": "Input targets multiplied by X: X.T * y." - } + }, + "refined_type": {} }, { "name": "n_nonzero_coefs", @@ -104878,7 +112174,8 @@ "docstring": { "type": "int, default=None", "description": "Desired number of non-zero entries in the solution. If None (by\ndefault) this value is set to 10% of n_features." - } + }, + "refined_type": {} }, { "name": "tol", @@ -104888,7 +112185,8 @@ "docstring": { "type": "float, default=None", "description": "Maximum norm of the residual. If not None, overrides n_nonzero_coefs." - } + }, + "refined_type": {} }, { "name": "norms_squared", @@ -104898,7 +112196,8 @@ "docstring": { "type": "array-like of shape (n_targets,), default=None", "description": "Squared L2 norms of the lines of y. Required if tol is not None." - } + }, + "refined_type": {} }, { "name": "copy_Gram", @@ -104908,7 +112207,8 @@ "docstring": { "type": "bool, default=True", "description": "Whether the gram matrix must be copied by the algorithm. A false\nvalue is only helpful if it is already Fortran-ordered, otherwise a\ncopy is made anyway." - } + }, + "refined_type": {} }, { "name": "copy_Xy", @@ -104918,7 +112218,8 @@ "docstring": { "type": "bool, default=True", "description": "Whether the covariance vector Xy must be copied by the algorithm.\nIf False, it may be overwritten." 
- } + }, + "refined_type": {} }, { "name": "return_path", @@ -104928,7 +112229,8 @@ "docstring": { "type": "bool, default=False", "description": "Whether to return every value of the nonzero coefficients along the\nforward path. Useful for cross-validation." - } + }, + "refined_type": {} }, { "name": "return_n_iter", @@ -104938,13 +112240,14 @@ "docstring": { "type": "bool, default=False", "description": "Whether or not to return the number of iterations." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Gram Orthogonal Matching Pursuit (OMP).\n\nSolves n_targets Orthogonal Matching Pursuit problems using only the Gram matrix X.T * X and the product X.T * y. Read more in the :ref:`User Guide `.", - "docstring": "Gram Orthogonal Matching Pursuit (OMP).\n\nSolves n_targets Orthogonal Matching Pursuit problems using only\nthe Gram matrix X.T * X and the product X.T * y.\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\nGram : ndarray of shape (n_features, n_features)\n Gram matrix of the input data: X.T * X.\n\nXy : ndarray of shape (n_features,) or (n_features, n_targets)\n Input targets multiplied by X: X.T * y.\n\nn_nonzero_coefs : int, default=None\n Desired number of non-zero entries in the solution. If None (by\n default) this value is set to 10% of n_features.\n\ntol : float, default=None\n Maximum norm of the residual. If not None, overrides n_nonzero_coefs.\n\nnorms_squared : array-like of shape (n_targets,), default=None\n Squared L2 norms of the lines of y. Required if tol is not None.\n\ncopy_Gram : bool, default=True\n Whether the gram matrix must be copied by the algorithm. A false\n value is only helpful if it is already Fortran-ordered, otherwise a\n copy is made anyway.\n\ncopy_Xy : bool, default=True\n Whether the covariance vector Xy must be copied by the algorithm.\n If False, it may be overwritten.\n\nreturn_path : bool, default=False\n Whether to return every value of the nonzero coefficients along the\n forward path. Useful for cross-validation.\n\nreturn_n_iter : bool, default=False\n Whether or not to return the number of iterations.\n\nReturns\n-------\ncoef : ndarray of shape (n_features,) or (n_features, n_targets)\n Coefficients of the OMP solution. If `return_path=True`, this contains\n the whole coefficient path. In this case its shape is\n (n_features, n_features) or (n_features, n_targets, n_features) and\n iterating over the last axis yields coefficients in increasing order\n of active features.\n\nn_iters : array-like or int\n Number of active features across every target. Returned only if\n `return_n_iter` is set to True.\n\nSee Also\n--------\nOrthogonalMatchingPursuit\northogonal_mp\nlars_path\nsklearn.decomposition.sparse_encode\n\nNotes\n-----\nOrthogonal matching pursuit was introduced in G. Mallat, Z. Zhang,\nMatching pursuits with time-frequency dictionaries, IEEE Transactions on\nSignal Processing, Vol. 41, No. 12. (December 1993), pp. 3397-3415.\n(http://blanche.polytechnique.fr/~mallat/papiers/MallatPursuit93.pdf)\n\nThis implementation is based on Rubinstein, R., Zibulevsky, M. 
and Elad,\nM., Efficient Implementation of the K-SVD Algorithm using Batch Orthogonal\nMatching Pursuit Technical Report - CS Technion, April 2008.\nhttps://www.cs.technion.ac.il/~ronrubin/Publications/KSVD-OMP-v2.pdf", + "description": "Gram Orthogonal Matching Pursuit (OMP).\n\nSolves n_targets Orthogonal Matching Pursuit problems using only\nthe Gram matrix X.T * X and the product X.T * y.\n\nRead more in the :ref:`User Guide `.", + "docstring": "Gram Orthogonal Matching Pursuit (OMP).\n\n Solves n_targets Orthogonal Matching Pursuit problems using only\n the Gram matrix X.T * X and the product X.T * y.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n Gram : ndarray of shape (n_features, n_features)\n Gram matrix of the input data: X.T * X.\n\n Xy : ndarray of shape (n_features,) or (n_features, n_targets)\n Input targets multiplied by X: X.T * y.\n\n n_nonzero_coefs : int, default=None\n Desired number of non-zero entries in the solution. If None (by\n default) this value is set to 10% of n_features.\n\n tol : float, default=None\n Maximum norm of the residual. If not None, overrides n_nonzero_coefs.\n\n norms_squared : array-like of shape (n_targets,), default=None\n Squared L2 norms of the lines of y. Required if tol is not None.\n\n copy_Gram : bool, default=True\n Whether the gram matrix must be copied by the algorithm. A false\n value is only helpful if it is already Fortran-ordered, otherwise a\n copy is made anyway.\n\n copy_Xy : bool, default=True\n Whether the covariance vector Xy must be copied by the algorithm.\n If False, it may be overwritten.\n\n return_path : bool, default=False\n Whether to return every value of the nonzero coefficients along the\n forward path. Useful for cross-validation.\n\n return_n_iter : bool, default=False\n Whether or not to return the number of iterations.\n\n Returns\n -------\n coef : ndarray of shape (n_features,) or (n_features, n_targets)\n Coefficients of the OMP solution. If `return_path=True`, this contains\n the whole coefficient path. In this case its shape is\n (n_features, n_features) or (n_features, n_targets, n_features) and\n iterating over the last axis yields coefficients in increasing order\n of active features.\n\n n_iters : array-like or int\n Number of active features across every target. Returned only if\n `return_n_iter` is set to True.\n\n See Also\n --------\n OrthogonalMatchingPursuit\n orthogonal_mp\n lars_path\n sklearn.decomposition.sparse_encode\n\n Notes\n -----\n Orthogonal matching pursuit was introduced in G. Mallat, Z. Zhang,\n Matching pursuits with time-frequency dictionaries, IEEE Transactions on\n Signal Processing, Vol. 41, No. 12. (December 1993), pp. 3397-3415.\n (http://blanche.polytechnique.fr/~mallat/papiers/MallatPursuit93.pdf)\n\n This implementation is based on Rubinstein, R., Zibulevsky, M. 
and Elad,\n M., Efficient Implementation of the K-SVD Algorithm using Batch Orthogonal\n Matching Pursuit Technical Report - CS Technion, April 2008.\n https://www.cs.technion.ac.il/~ronrubin/Publications/KSVD-OMP-v2.pdf\n\n ", "source_code": "\ndef orthogonal_mp_gram(Gram, Xy, *, n_nonzero_coefs=None, tol=None, norms_squared=None, copy_Gram=True, copy_Xy=True, return_path=False, return_n_iter=False):\n \"\"\"Gram Orthogonal Matching Pursuit (OMP).\n\n Solves n_targets Orthogonal Matching Pursuit problems using only\n the Gram matrix X.T * X and the product X.T * y.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n Gram : ndarray of shape (n_features, n_features)\n Gram matrix of the input data: X.T * X.\n\n Xy : ndarray of shape (n_features,) or (n_features, n_targets)\n Input targets multiplied by X: X.T * y.\n\n n_nonzero_coefs : int, default=None\n Desired number of non-zero entries in the solution. If None (by\n default) this value is set to 10% of n_features.\n\n tol : float, default=None\n Maximum norm of the residual. If not None, overrides n_nonzero_coefs.\n\n norms_squared : array-like of shape (n_targets,), default=None\n Squared L2 norms of the lines of y. Required if tol is not None.\n\n copy_Gram : bool, default=True\n Whether the gram matrix must be copied by the algorithm. A false\n value is only helpful if it is already Fortran-ordered, otherwise a\n copy is made anyway.\n\n copy_Xy : bool, default=True\n Whether the covariance vector Xy must be copied by the algorithm.\n If False, it may be overwritten.\n\n return_path : bool, default=False\n Whether to return every value of the nonzero coefficients along the\n forward path. Useful for cross-validation.\n\n return_n_iter : bool, default=False\n Whether or not to return the number of iterations.\n\n Returns\n -------\n coef : ndarray of shape (n_features,) or (n_features, n_targets)\n Coefficients of the OMP solution. If `return_path=True`, this contains\n the whole coefficient path. In this case its shape is\n (n_features, n_features) or (n_features, n_targets, n_features) and\n iterating over the last axis yields coefficients in increasing order\n of active features.\n\n n_iters : array-like or int\n Number of active features across every target. Returned only if\n `return_n_iter` is set to True.\n\n See Also\n --------\n OrthogonalMatchingPursuit\n orthogonal_mp\n lars_path\n sklearn.decomposition.sparse_encode\n\n Notes\n -----\n Orthogonal matching pursuit was introduced in G. Mallat, Z. Zhang,\n Matching pursuits with time-frequency dictionaries, IEEE Transactions on\n Signal Processing, Vol. 41, No. 12. (December 1993), pp. 3397-3415.\n (http://blanche.polytechnique.fr/~mallat/papiers/MallatPursuit93.pdf)\n\n This implementation is based on Rubinstein, R., Zibulevsky, M. 
and Elad,\n M., Efficient Implementation of the K-SVD Algorithm using Batch Orthogonal\n Matching Pursuit Technical Report - CS Technion, April 2008.\n https://www.cs.technion.ac.il/~ronrubin/Publications/KSVD-OMP-v2.pdf\n\n \"\"\"\n Gram = check_array(Gram, order='F', copy=copy_Gram)\n Xy = np.asarray(Xy)\n if Xy.ndim > 1 and Xy.shape[1] > 1:\n copy_Gram = True\n if Xy.ndim == 1:\n Xy = Xy[:, np.newaxis]\n if tol is not None:\n norms_squared = [norms_squared]\n if copy_Xy or not Xy.flags.writeable:\n Xy = Xy.copy()\n if n_nonzero_coefs is None and tol is None:\n n_nonzero_coefs = int(0.1 * len(Gram))\n if tol is not None and norms_squared is None:\n raise ValueError('Gram OMP needs the precomputed norms in order to evaluate the error sum of squares.')\n if tol is not None and tol < 0:\n raise ValueError('Epsilon cannot be negative')\n if tol is None and n_nonzero_coefs <= 0:\n raise ValueError('The number of atoms must be positive')\n if tol is None and n_nonzero_coefs > len(Gram):\n raise ValueError('The number of atoms cannot be more than the number of features')\n if return_path:\n coef = np.zeros((len(Gram), Xy.shape[1], len(Gram)))\n else:\n coef = np.zeros((len(Gram), Xy.shape[1]))\n n_iters = []\n for k in range(Xy.shape[1]):\n out = _gram_omp(Gram, Xy[:, k], n_nonzero_coefs, norms_squared[k] if tol is not None else None, tol, copy_Gram=copy_Gram, copy_Xy=False, return_path=return_path)\n if return_path:\n (_, idx, coefs, n_iter) = out\n coef = coef[:, :, :len(idx)]\n for (n_active, x) in enumerate(coefs.T):\n coef[idx[:n_active + 1], k, n_active] = x[:n_active + 1]\n else:\n (x, idx, n_iter) = out\n coef[idx, k] = x\n n_iters.append(n_iter)\n if Xy.shape[1] == 1:\n n_iters = n_iters[0]\n if return_n_iter:\n return np.squeeze(coef), n_iters\n else:\n return np.squeeze(coef)" }, { @@ -104962,7 +112265,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "C", @@ -104972,7 +112276,8 @@ "docstring": { "type": "float, default=1.0", "description": "Maximum step size (regularization). Defaults to 1.0." - } + }, + "refined_type": {} }, { "name": "fit_intercept", @@ -104982,7 +112287,8 @@ "docstring": { "type": "bool, default=True", "description": "Whether the intercept should be estimated or not. If False, the\ndata is assumed to be already centered." - } + }, + "refined_type": {} }, { "name": "max_iter", @@ -104992,7 +112298,8 @@ "docstring": { "type": "int, default=1000", "description": "The maximum number of passes over the training data (aka epochs).\nIt only impacts the behavior in the ``fit`` method, and not the\n:meth:`partial_fit` method.\n\n.. versionadded:: 0.19" - } + }, + "refined_type": {} }, { "name": "tol", @@ -105002,7 +112309,8 @@ "docstring": { "type": "float or None, default=1e-3", "description": "The stopping criterion. If it is not None, the iterations will stop\nwhen (loss > previous_loss - tol).\n\n.. versionadded:: 0.19" - } + }, + "refined_type": {} }, { "name": "early_stopping", @@ -105012,7 +112320,8 @@ "docstring": { "type": "bool, default=False", "description": "Whether to use early stopping to terminate training when validation.\nscore is not improving. If set to True, it will automatically set aside\na stratified fraction of training data as validation and terminate\ntraining when validation score is not improving by at least tol for\nn_iter_no_change consecutive epochs.\n\n.. 
versionadded:: 0.20" - } + }, + "refined_type": {} }, { "name": "validation_fraction", @@ -105022,7 +112331,8 @@ "docstring": { "type": "float, default=0.1", "description": "The proportion of training data to set aside as validation set for\nearly stopping. Must be between 0 and 1.\nOnly used if early_stopping is True.\n\n.. versionadded:: 0.20" - } + }, + "refined_type": {} }, { "name": "n_iter_no_change", @@ -105032,7 +112342,8 @@ "docstring": { "type": "int, default=5", "description": "Number of iterations with no improvement to wait before early stopping.\n\n.. versionadded:: 0.20" - } + }, + "refined_type": {} }, { "name": "shuffle", @@ -105042,7 +112353,8 @@ "docstring": { "type": "bool, default=True", "description": "Whether or not the training data should be shuffled after each epoch." - } + }, + "refined_type": {} }, { "name": "verbose", @@ -105052,7 +112364,8 @@ "docstring": { "type": "int, default=0", "description": "The verbosity level." - } + }, + "refined_type": {} }, { "name": "loss", @@ -105062,7 +112375,8 @@ "docstring": { "type": "str, default=\"hinge\"", "description": "The loss function to be used:\nhinge: equivalent to PA-I in the reference paper.\nsquared_hinge: equivalent to PA-II in the reference paper." - } + }, + "refined_type": {} }, { "name": "n_jobs", @@ -105072,7 +112386,8 @@ "docstring": { "type": "int or None, default=None", "description": "The number of CPUs to use to do the OVA (One Versus All, for\nmulti-class problems) computation.\n``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n``-1`` means using all processors. See :term:`Glossary `\nfor more details." - } + }, + "refined_type": {} }, { "name": "random_state", @@ -105082,7 +112397,8 @@ "docstring": { "type": "int, RandomState instance, default=None", "description": "Used to shuffle the training data, when ``shuffle`` is set to\n``True``. Pass an int for reproducible output across multiple\nfunction calls.\nSee :term:`Glossary `." - } + }, + "refined_type": {} }, { "name": "warm_start", @@ -105092,7 +112408,8 @@ "docstring": { "type": "bool, default=False", "description": "When set to True, reuse the solution of the previous call to fit as\ninitialization, otherwise, just erase the previous solution.\nSee :term:`the Glossary `.\n\nRepeatedly calling fit or partial_fit when warm_start is True can\nresult in a different solution than when calling fit a single time\nbecause of the way the data is shuffled." - } + }, + "refined_type": {} }, { "name": "class_weight", @@ -105102,6 +112419,10 @@ "docstring": { "type": "dict, {class_label: weight} or \"balanced\" or None, default=None", "description": "Preset for the class_weight fit parameter.\n\nWeights associated with classes. If not given, all classes\nare supposed to have weight one.\n\nThe \"balanced\" mode uses the values of y to automatically adjust\nweights inversely proportional to class frequencies in the input data\nas ``n_samples / (n_classes * np.bincount(y))``.\n\n.. versionadded:: 0.17\n parameter *class_weight* to automatically weight samples." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -105112,13 +112433,14 @@ "docstring": { "type": "bool or int, default=False", "description": "When set to True, computes the averaged SGD weights and stores the\nresult in the ``coef_`` attribute. If set to an int greater than 1,\naveraging will begin once the total number of samples seen reaches\naverage. So average=10 will begin averaging after seeing 10 samples.\n\n.. 
versionadded:: 0.19\n parameter *average* to use weights averaging in SGD." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, *, C=1.0, fit_intercept=True, max_iter=1000, tol=0.001, early_stopping=False, validation_fraction=0.1, n_iter_no_change=5, shuffle=True, verbose=0, loss='hinge', n_jobs=None, random_state=None, warm_start=False, class_weight=None, average=False):\n super().__init__(penalty=None, fit_intercept=fit_intercept, max_iter=max_iter, tol=tol, early_stopping=early_stopping, validation_fraction=validation_fraction, n_iter_no_change=n_iter_no_change, shuffle=shuffle, verbose=verbose, random_state=random_state, eta0=1.0, warm_start=warm_start, class_weight=class_weight, average=average, n_jobs=n_jobs)\n self.C = C\n self.loss = loss" }, { @@ -105136,7 +112458,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -105146,6 +112469,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "Training data." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -105156,7 +112483,8 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "Target values." - } + }, + "refined_type": {} }, { "name": "coef_init", @@ -105166,7 +112494,8 @@ "docstring": { "type": "ndarray of shape (n_classes, n_features)", "description": "The initial coefficients to warm-start the optimization." - } + }, + "refined_type": {} }, { "name": "intercept_init", @@ -105176,13 +112505,14 @@ "docstring": { "type": "ndarray of shape (n_classes,)", "description": "The initial intercept to warm-start the optimization." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Fit linear model with Passive Aggressive algorithm.", - "docstring": "Fit linear model with Passive Aggressive algorithm.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training data.\n\ny : array-like of shape (n_samples,)\n Target values.\n\ncoef_init : ndarray of shape (n_classes, n_features)\n The initial coefficients to warm-start the optimization.\n\nintercept_init : ndarray of shape (n_classes,)\n The initial intercept to warm-start the optimization.\n\nReturns\n-------\nself : object\n Fitted estimator.", + "docstring": "Fit linear model with Passive Aggressive algorithm.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training data.\n\n y : array-like of shape (n_samples,)\n Target values.\n\n coef_init : ndarray of shape (n_classes, n_features)\n The initial coefficients to warm-start the optimization.\n\n intercept_init : ndarray of shape (n_classes,)\n The initial intercept to warm-start the optimization.\n\n Returns\n -------\n self : object\n Fitted estimator.\n ", "source_code": "\ndef fit(self, X, y, coef_init=None, intercept_init=None):\n \"\"\"Fit linear model with Passive Aggressive algorithm.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training data.\n\n y : array-like of shape (n_samples,)\n Target values.\n\n coef_init : ndarray of shape (n_classes, n_features)\n The initial coefficients to warm-start the optimization.\n\n intercept_init : ndarray of shape (n_classes,)\n The initial intercept to warm-start the optimization.\n\n Returns\n -------\n self : object\n Fitted estimator.\n \"\"\"\n 
self._validate_params()\n lr = 'pa1' if self.loss == 'hinge' else 'pa2'\n return self._fit(X, y, alpha=1.0, C=self.C, loss='hinge', learning_rate=lr, coef_init=coef_init, intercept_init=intercept_init)" }, { @@ -105200,7 +112530,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -105210,6 +112541,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "Subset of the training data." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -105220,7 +112555,8 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "Subset of the target values." - } + }, + "refined_type": {} }, { "name": "classes", @@ -105230,13 +112566,14 @@ "docstring": { "type": "ndarray of shape (n_classes,)", "description": "Classes across all calls to partial_fit.\nCan be obtained by via `np.unique(y_all)`, where y_all is the\ntarget vector of the entire dataset.\nThis argument is required for the first call to partial_fit\nand can be omitted in the subsequent calls.\nNote that y doesn't need to contain all labels in `classes`." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Fit linear model with Passive Aggressive algorithm.", - "docstring": "Fit linear model with Passive Aggressive algorithm.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n Subset of the training data.\n\ny : array-like of shape (n_samples,)\n Subset of the target values.\n\nclasses : ndarray of shape (n_classes,)\n Classes across all calls to partial_fit.\n Can be obtained by via `np.unique(y_all)`, where y_all is the\n target vector of the entire dataset.\n This argument is required for the first call to partial_fit\n and can be omitted in the subsequent calls.\n Note that y doesn't need to contain all labels in `classes`.\n\nReturns\n-------\nself : object\n Fitted estimator.", + "docstring": "Fit linear model with Passive Aggressive algorithm.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Subset of the training data.\n\n y : array-like of shape (n_samples,)\n Subset of the target values.\n\n classes : ndarray of shape (n_classes,)\n Classes across all calls to partial_fit.\n Can be obtained by via `np.unique(y_all)`, where y_all is the\n target vector of the entire dataset.\n This argument is required for the first call to partial_fit\n and can be omitted in the subsequent calls.\n Note that y doesn't need to contain all labels in `classes`.\n\n Returns\n -------\n self : object\n Fitted estimator.\n ", "source_code": "\ndef partial_fit(self, X, y, classes=None):\n \"\"\"Fit linear model with Passive Aggressive algorithm.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Subset of the training data.\n\n y : array-like of shape (n_samples,)\n Subset of the target values.\n\n classes : ndarray of shape (n_classes,)\n Classes across all calls to partial_fit.\n Can be obtained by via `np.unique(y_all)`, where y_all is the\n target vector of the entire dataset.\n This argument is required for the first call to partial_fit\n and can be omitted in the subsequent calls.\n Note that y doesn't need to contain all labels in `classes`.\n\n Returns\n -------\n self : object\n Fitted estimator.\n \"\"\"\n self._validate_params(for_partial_fit=True)\n if self.class_weight == 'balanced':\n raise ValueError(\"class_weight 'balanced' is not supported 
for partial_fit. For 'balanced' weights, use `sklearn.utils.compute_class_weight` with `class_weight='balanced'`. In place of y you can use a large enough subset of the full training set target to properly estimate the class frequency distributions. Pass the resulting weights as the class_weight parameter.\")\n lr = 'pa1' if self.loss == 'hinge' else 'pa2'\n return self._partial_fit(X, y, alpha=1.0, C=self.C, loss='hinge', learning_rate=lr, max_iter=1, classes=classes, sample_weight=None, coef_init=None, intercept_init=None)" }, { @@ -105254,7 +112591,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "C", @@ -105264,7 +112602,8 @@ "docstring": { "type": "float, default=1.0", "description": "Maximum step size (regularization). Defaults to 1.0." - } + }, + "refined_type": {} }, { "name": "fit_intercept", @@ -105274,7 +112613,8 @@ "docstring": { "type": "bool, default=True", "description": "Whether the intercept should be estimated or not. If False, the\ndata is assumed to be already centered. Defaults to True." - } + }, + "refined_type": {} }, { "name": "max_iter", @@ -105284,7 +112624,8 @@ "docstring": { "type": "int, default=1000", "description": "The maximum number of passes over the training data (aka epochs).\nIt only impacts the behavior in the ``fit`` method, and not the\n:meth:`partial_fit` method.\n\n.. versionadded:: 0.19" - } + }, + "refined_type": {} }, { "name": "tol", @@ -105294,7 +112635,8 @@ "docstring": { "type": "float or None, default=1e-3", "description": "The stopping criterion. If it is not None, the iterations will stop\nwhen (loss > previous_loss - tol).\n\n.. versionadded:: 0.19" - } + }, + "refined_type": {} }, { "name": "early_stopping", @@ -105304,7 +112646,8 @@ "docstring": { "type": "bool, default=False", "description": "Whether to use early stopping to terminate training when validation.\nscore is not improving. If set to True, it will automatically set aside\na fraction of training data as validation and terminate\ntraining when validation score is not improving by at least tol for\nn_iter_no_change consecutive epochs.\n\n.. versionadded:: 0.20" - } + }, + "refined_type": {} }, { "name": "validation_fraction", @@ -105314,7 +112657,8 @@ "docstring": { "type": "float, default=0.1", "description": "The proportion of training data to set aside as validation set for\nearly stopping. Must be between 0 and 1.\nOnly used if early_stopping is True.\n\n.. versionadded:: 0.20" - } + }, + "refined_type": {} }, { "name": "n_iter_no_change", @@ -105324,7 +112668,8 @@ "docstring": { "type": "int, default=5", "description": "Number of iterations with no improvement to wait before early stopping.\n\n.. versionadded:: 0.20" - } + }, + "refined_type": {} }, { "name": "shuffle", @@ -105334,7 +112679,8 @@ "docstring": { "type": "bool, default=True", "description": "Whether or not the training data should be shuffled after each epoch." - } + }, + "refined_type": {} }, { "name": "verbose", @@ -105344,7 +112690,8 @@ "docstring": { "type": "int, default=0", "description": "The verbosity level." - } + }, + "refined_type": {} }, { "name": "loss", @@ -105354,7 +112701,8 @@ "docstring": { "type": "str, default=\"epsilon_insensitive\"", "description": "The loss function to be used:\nepsilon_insensitive: equivalent to PA-I in the reference paper.\nsquared_epsilon_insensitive: equivalent to PA-II in the reference\npaper." 
- } + }, + "refined_type": {} }, { "name": "epsilon", @@ -105364,7 +112712,8 @@ "docstring": { "type": "float, default=0.1", "description": "If the difference between the current prediction and the correct label\nis below this threshold, the model is not updated." - } + }, + "refined_type": {} }, { "name": "random_state", @@ -105374,7 +112723,8 @@ "docstring": { "type": "int, RandomState instance, default=None", "description": "Used to shuffle the training data, when ``shuffle`` is set to\n``True``. Pass an int for reproducible output across multiple\nfunction calls.\nSee :term:`Glossary `." - } + }, + "refined_type": {} }, { "name": "warm_start", @@ -105384,7 +112734,8 @@ "docstring": { "type": "bool, default=False", "description": "When set to True, reuse the solution of the previous call to fit as\ninitialization, otherwise, just erase the previous solution.\nSee :term:`the Glossary `.\n\nRepeatedly calling fit or partial_fit when warm_start is True can\nresult in a different solution than when calling fit a single time\nbecause of the way the data is shuffled." - } + }, + "refined_type": {} }, { "name": "average", @@ -105394,13 +112745,14 @@ "docstring": { "type": "bool or int, default=False", "description": "When set to True, computes the averaged SGD weights and stores the\nresult in the ``coef_`` attribute. If set to an int greater than 1,\naveraging will begin once the total number of samples seen reaches\naverage. So average=10 will begin averaging after seeing 10 samples.\n\n.. versionadded:: 0.19\n parameter *average* to use weights averaging in SGD." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, *, C=1.0, fit_intercept=True, max_iter=1000, tol=0.001, early_stopping=False, validation_fraction=0.1, n_iter_no_change=5, shuffle=True, verbose=0, loss='epsilon_insensitive', epsilon=DEFAULT_EPSILON, random_state=None, warm_start=False, average=False):\n super().__init__(penalty=None, l1_ratio=0, epsilon=epsilon, eta0=1.0, fit_intercept=fit_intercept, max_iter=max_iter, tol=tol, early_stopping=early_stopping, validation_fraction=validation_fraction, n_iter_no_change=n_iter_no_change, shuffle=shuffle, verbose=verbose, random_state=random_state, warm_start=warm_start, average=average)\n self.C = C\n self.loss = loss" }, { @@ -105418,7 +112770,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -105428,6 +112781,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "Training data." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -105438,7 +112795,8 @@ "docstring": { "type": "numpy array of shape [n_samples]", "description": "Target values." - } + }, + "refined_type": {} }, { "name": "coef_init", @@ -105448,7 +112806,8 @@ "docstring": { "type": "array, shape = [n_features]", "description": "The initial coefficients to warm-start the optimization." - } + }, + "refined_type": {} }, { "name": "intercept_init", @@ -105458,13 +112817,14 @@ "docstring": { "type": "array, shape = [1]", "description": "The initial intercept to warm-start the optimization." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Fit linear model with Passive Aggressive algorithm.", - "docstring": "Fit linear model with Passive Aggressive algorithm.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training data.\n\ny : numpy array of shape [n_samples]\n Target values.\n\ncoef_init : array, shape = [n_features]\n The initial coefficients to warm-start the optimization.\n\nintercept_init : array, shape = [1]\n The initial intercept to warm-start the optimization.\n\nReturns\n-------\nself : object\n Fitted estimator.", + "docstring": "Fit linear model with Passive Aggressive algorithm.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training data.\n\n y : numpy array of shape [n_samples]\n Target values.\n\n coef_init : array, shape = [n_features]\n The initial coefficients to warm-start the optimization.\n\n intercept_init : array, shape = [1]\n The initial intercept to warm-start the optimization.\n\n Returns\n -------\n self : object\n Fitted estimator.\n ", "source_code": "\ndef fit(self, X, y, coef_init=None, intercept_init=None):\n \"\"\"Fit linear model with Passive Aggressive algorithm.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training data.\n\n y : numpy array of shape [n_samples]\n Target values.\n\n coef_init : array, shape = [n_features]\n The initial coefficients to warm-start the optimization.\n\n intercept_init : array, shape = [1]\n The initial intercept to warm-start the optimization.\n\n Returns\n -------\n self : object\n Fitted estimator.\n \"\"\"\n self._validate_params()\n lr = 'pa1' if self.loss == 'epsilon_insensitive' else 'pa2'\n return self._fit(X, y, alpha=1.0, C=self.C, loss='epsilon_insensitive', learning_rate=lr, coef_init=coef_init, intercept_init=intercept_init)" }, { @@ -105482,7 +112842,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -105492,6 +112853,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "Subset of training data." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -105502,13 +112867,14 @@ "docstring": { "type": "numpy array of shape [n_samples]", "description": "Subset of target values." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Fit linear model with Passive Aggressive algorithm.", - "docstring": "Fit linear model with Passive Aggressive algorithm.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n Subset of training data.\n\ny : numpy array of shape [n_samples]\n Subset of target values.\n\nReturns\n-------\nself : object\n Fitted estimator.", + "docstring": "Fit linear model with Passive Aggressive algorithm.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Subset of training data.\n\n y : numpy array of shape [n_samples]\n Subset of target values.\n\n Returns\n -------\n self : object\n Fitted estimator.\n ", "source_code": "\ndef partial_fit(self, X, y):\n \"\"\"Fit linear model with Passive Aggressive algorithm.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Subset of training data.\n\n y : numpy array of shape [n_samples]\n Subset of target values.\n\n Returns\n -------\n self : object\n Fitted estimator.\n \"\"\"\n self._validate_params(for_partial_fit=True)\n lr = 'pa1' if self.loss == 'epsilon_insensitive' else 'pa2'\n return self._partial_fit(X, y, alpha=1.0, C=self.C, loss='epsilon_insensitive', learning_rate=lr, max_iter=1, sample_weight=None, coef_init=None, intercept_init=None)" }, { @@ -105526,7 +112892,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "penalty", @@ -105536,6 +112903,10 @@ "docstring": { "type": "{'l2','l1','elasticnet'}, default=None", "description": "The penalty (aka regularization term) to be used." + }, + "refined_type": { + "kind": "EnumType", + "values": ["l2", "l1", "elasticnet"] } }, { @@ -105546,7 +112917,8 @@ "docstring": { "type": "float, default=0.0001", "description": "Constant that multiplies the regularization term if regularization is\nused." - } + }, + "refined_type": {} }, { "name": "l1_ratio", @@ -105556,7 +112928,8 @@ "docstring": { "type": "float, default=0.15", "description": "The Elastic Net mixing parameter, with `0 <= l1_ratio <= 1`.\n`l1_ratio=0` corresponds to L2 penalty, `l1_ratio=1` to L1.\nOnly used if `penalty='elasticnet'`.\n\n.. versionadded:: 0.24" - } + }, + "refined_type": {} }, { "name": "fit_intercept", @@ -105566,7 +112939,8 @@ "docstring": { "type": "bool, default=True", "description": "Whether the intercept should be estimated or not. If False, the\ndata is assumed to be already centered." - } + }, + "refined_type": {} }, { "name": "max_iter", @@ -105576,7 +112950,8 @@ "docstring": { "type": "int, default=1000", "description": "The maximum number of passes over the training data (aka epochs).\nIt only impacts the behavior in the ``fit`` method, and not the\n:meth:`partial_fit` method.\n\n.. versionadded:: 0.19" - } + }, + "refined_type": {} }, { "name": "tol", @@ -105586,7 +112961,8 @@ "docstring": { "type": "float, default=1e-3", "description": "The stopping criterion. If it is not None, the iterations will stop\nwhen (loss > previous_loss - tol).\n\n.. versionadded:: 0.19" - } + }, + "refined_type": {} }, { "name": "shuffle", @@ -105596,7 +112972,8 @@ "docstring": { "type": "bool, default=True", "description": "Whether or not the training data should be shuffled after each epoch." - } + }, + "refined_type": {} }, { "name": "verbose", @@ -105606,7 +112983,8 @@ "docstring": { "type": "int, default=0", "description": "The verbosity level." 
- } + }, + "refined_type": {} }, { "name": "eta0", @@ -105614,9 +112992,10 @@ "is_public": true, "assigned_by": "NAME_ONLY", "docstring": { - "type": "double, default=1", + "type": "float, default=1", "description": "Constant by which the updates are multiplied." - } + }, + "refined_type": {} }, { "name": "n_jobs", @@ -105626,7 +113005,8 @@ "docstring": { "type": "int, default=None", "description": "The number of CPUs to use to do the OVA (One Versus All, for\nmulti-class problems) computation.\n``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n``-1`` means using all processors. See :term:`Glossary `\nfor more details." - } + }, + "refined_type": {} }, { "name": "random_state", @@ -105636,7 +113016,8 @@ "docstring": { "type": "int, RandomState instance, default=None", "description": "Used to shuffle the training data, when ``shuffle`` is set to\n``True``. Pass an int for reproducible output across multiple\nfunction calls.\nSee :term:`Glossary `." - } + }, + "refined_type": {} }, { "name": "early_stopping", @@ -105646,7 +113027,8 @@ "docstring": { "type": "bool, default=False", "description": "Whether to use early stopping to terminate training when validation.\nscore is not improving. If set to True, it will automatically set aside\na stratified fraction of training data as validation and terminate\ntraining when validation score is not improving by at least tol for\nn_iter_no_change consecutive epochs.\n\n.. versionadded:: 0.20" - } + }, + "refined_type": {} }, { "name": "validation_fraction", @@ -105656,7 +113038,8 @@ "docstring": { "type": "float, default=0.1", "description": "The proportion of training data to set aside as validation set for\nearly stopping. Must be between 0 and 1.\nOnly used if early_stopping is True.\n\n.. versionadded:: 0.20" - } + }, + "refined_type": {} }, { "name": "n_iter_no_change", @@ -105666,7 +113049,8 @@ "docstring": { "type": "int, default=5", "description": "Number of iterations with no improvement to wait before early stopping.\n\n.. versionadded:: 0.20" - } + }, + "refined_type": {} }, { "name": "class_weight", @@ -105676,6 +113060,10 @@ "docstring": { "type": "dict, {class_label: weight} or \"balanced\", default=None", "description": "Preset for the class_weight fit parameter.\n\nWeights associated with classes. If not given, all classes\nare supposed to have weight one.\n\nThe \"balanced\" mode uses the values of y to automatically adjust\nweights inversely proportional to class frequencies in the input data\nas ``n_samples / (n_classes * np.bincount(y))``." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -105686,13 +113074,14 @@ "docstring": { "type": "bool, default=False", "description": "When set to True, reuse the solution of the previous call to fit as\ninitialization, otherwise, just erase the previous solution. See\n:term:`the Glossary `." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, *, penalty=None, alpha=0.0001, l1_ratio=0.15, fit_intercept=True, max_iter=1000, tol=0.001, shuffle=True, verbose=0, eta0=1.0, n_jobs=None, random_state=0, early_stopping=False, validation_fraction=0.1, n_iter_no_change=5, class_weight=None, warm_start=False):\n super().__init__(loss='perceptron', penalty=penalty, alpha=alpha, l1_ratio=l1_ratio, fit_intercept=fit_intercept, max_iter=max_iter, tol=tol, shuffle=shuffle, verbose=verbose, random_state=random_state, learning_rate='constant', eta0=eta0, early_stopping=early_stopping, validation_fraction=validation_fraction, n_iter_no_change=n_iter_no_change, power_t=0.5, warm_start=warm_start, class_weight=class_weight, n_jobs=n_jobs)" }, { @@ -105710,7 +113099,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "quantile", @@ -105720,7 +113110,8 @@ "docstring": { "type": "float, default=0.5", "description": "The quantile that the model tries to predict. It must be strictly\nbetween 0 and 1. If 0.5 (default), the model predicts the 50%\nquantile, i.e. the median." - } + }, + "refined_type": {} }, { "name": "alpha", @@ -105730,7 +113121,8 @@ "docstring": { "type": "float, default=1.0", "description": "Regularization constant that multiplies the L1 penalty term." - } + }, + "refined_type": {} }, { "name": "fit_intercept", @@ -105740,7 +113132,8 @@ "docstring": { "type": "bool, default=True", "description": "Whether or not to fit the intercept." - } + }, + "refined_type": {} }, { "name": "solver", @@ -105750,6 +113143,16 @@ "docstring": { "type": "{'highs-ds', 'highs-ipm', 'highs', 'interior-point', 'revised simplex'}, default='interior-point'", "description": "Method used by :func:`scipy.optimize.linprog` to solve the linear\nprogramming formulation. Note that the highs methods are recommended\nfor usage with `scipy>=1.6.0` because they are the fastest ones." + }, + "refined_type": { + "kind": "EnumType", + "values": [ + "highs-ipm", + "interior-point", + "revised simplex", + "highs-ds", + "highs" + ] } }, { @@ -105760,13 +113163,17 @@ "docstring": { "type": "dict, default=None", "description": "Additional parameters passed to :func:`scipy.optimize.linprog` as\noptions. If `None` and if `solver='interior-point'`, then\n`{\"lstsq\": True}` is passed to :func:`scipy.optimize.linprog` for the\nsake of stability." + }, + "refined_type": { + "kind": "EnumType", + "values": ["lstsq"] } } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, *, quantile=0.5, alpha=1.0, fit_intercept=True, solver='interior-point', solver_options=None):\n self.quantile = quantile\n self.alpha = alpha\n self.fit_intercept = fit_intercept\n self.solver = solver\n self.solver_options = solver_options" }, { @@ -105784,7 +113191,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -105794,7 +113202,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "Training data." - } + }, + "refined_type": {} }, { "name": "y", @@ -105804,7 +113213,8 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "Target values." 
- } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -105814,13 +113224,14 @@ "docstring": { "type": "array-like of shape (n_samples,), default=None", "description": "Sample weights." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Fit the model according to the given training data.", - "docstring": "Fit the model according to the given training data.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n Training data.\n\ny : array-like of shape (n_samples,)\n Target values.\n\nsample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\nReturns\n-------\nself : object\n Returns self.", + "docstring": "Fit the model according to the given training data.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training data.\n\n y : array-like of shape (n_samples,)\n Target values.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n Returns\n -------\n self : object\n Returns self.\n ", "source_code": "\ndef fit(self, X, y, sample_weight=None):\n \"\"\"Fit the model according to the given training data.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training data.\n\n y : array-like of shape (n_samples,)\n Target values.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n Returns\n -------\n self : object\n Returns self.\n \"\"\"\n (X, y) = self._validate_data(X, y, accept_sparse=False, y_numeric=True, multi_output=False)\n sample_weight = _check_sample_weight(sample_weight, X)\n n_features = X.shape[1]\n n_params = n_features\n if self.fit_intercept:\n n_params += 1\n if self.alpha >= 0:\n alpha = np.sum(sample_weight) * self.alpha\n else:\n raise ValueError(f'Penalty alpha must be a non-negative number, got {self.alpha}')\n if self.quantile >= 1.0 or self.quantile <= 0.0:\n raise ValueError(f'Quantile should be strictly between 0.0 and 1.0, got {self.quantile}')\n if not isinstance(self.fit_intercept, bool):\n raise ValueError(f'The argument fit_intercept must be bool, got {self.fit_intercept}')\n if self.solver not in ('highs-ds', 'highs-ipm', 'highs', 'interior-point', 'revised simplex'):\n raise ValueError(f'Invalid value for argument solver, got {self.solver}')\n elif self.solver == 'revised simplex' and sp_version < parse_version('1.3.0'):\n raise ValueError(f\"Solver 'revised simplex' is only available with scipy>=1.3.0, got {sp_version}\")\n elif self.solver in ('highs-ds', 'highs-ipm', 'highs') and sp_version < parse_version('1.6.0'):\n raise ValueError(f'Solver {self.solver} is only available with scipy>=1.6.0, got {sp_version}')\n if self.solver_options is not None and not isinstance(self.solver_options, dict):\n raise ValueError(f'Invalid value for argument solver_options, must be None or a dictionary, got {self.solver_options}')\n if self.solver_options is None and self.solver == 'interior-point':\n solver_options = {'lstsq': True}\n else:\n solver_options = self.solver_options\n mask = sample_weight != 0\n n_mask = int(np.sum(mask))\n c = np.concatenate([np.full(2 * n_params, fill_value=alpha), sample_weight[mask] * self.quantile, sample_weight[mask] * (1 - self.quantile)])\n if self.fit_intercept:\n c[0] = 0\n c[n_params] = 0\n A_eq = np.concatenate([np.ones((n_mask, 1)), X[mask], -np.ones((n_mask, 1)), -X[mask], np.eye(n_mask), -np.eye(n_mask)], axis=1)\n else:\n A_eq = np.concatenate([X[mask], -X[mask], np.eye(n_mask), -np.eye(n_mask)], axis=1)\n b_eq = 
y[mask]\n result = linprog(c=c, A_eq=A_eq, b_eq=b_eq, method=self.solver, options=solver_options)\n solution = result.x\n if not result.success:\n failure = {1: 'Iteration limit reached.', 2: 'Problem appears to be infeasible.', 3: 'Problem appears to be unbounded.', 4: 'Numerical difficulties encountered.'}\n warnings.warn(f'Linear programming for QuantileRegressor did not succeed.\\nStatus is {result.status}: ' + failure.setdefault(result.status, 'unknown reason') + '\\n' + 'Result message of linprog:\\n' + result.message, ConvergenceWarning)\n params = solution[:n_params] - solution[n_params:2 * n_params]\n self.n_iter_ = result.nit\n if self.fit_intercept:\n self.coef_ = params[1:]\n self.intercept_ = params[0]\n else:\n self.coef_ = params\n self.intercept_ = 0.0\n return self" }, { @@ -105838,7 +113249,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "base_estimator", @@ -105848,7 +113260,8 @@ "docstring": { "type": "object, default=None", "description": "Base estimator object which implements the following methods:\n\n * `fit(X, y)`: Fit model to given training data and target values.\n * `score(X, y)`: Returns the mean accuracy on the given test data,\n which is used for the stop criterion defined by `stop_score`.\n Additionally, the score is used to decide which of two equally\n large consensus sets is chosen as the better one.\n * `predict(X)`: Returns predicted values using the linear model,\n which is used to compute residual error using loss function.\n\nIf `base_estimator` is None, then\n:class:`~sklearn.linear_model.LinearRegression` is used for\ntarget values of dtype float.\n\nNote that the current implementation only supports regression\nestimators." - } + }, + "refined_type": {} }, { "name": "min_samples", @@ -105858,7 +113271,8 @@ "docstring": { "type": "int (>= 1) or float ([0, 1]), default=None", "description": "Minimum number of samples chosen randomly from original data. Treated\nas an absolute number of samples for `min_samples >= 1`, treated as a\nrelative number `ceil(min_samples * X.shape[0])` for\n`min_samples < 1`. This is typically chosen as the minimal number of\nsamples necessary to estimate the given `base_estimator`. By default a\n``sklearn.linear_model.LinearRegression()`` estimator is assumed and\n`min_samples` is chosen as ``X.shape[1] + 1``. This parameter is highly\ndependent upon the model, so if a `base_estimator` other than\n:class:`linear_model.LinearRegression` is used, the user is\nencouraged to provide a value.\n\n.. deprecated:: 1.0\n Not setting `min_samples` explicitly will raise an error in version\n 1.2 for models other than\n :class:`~sklearn.linear_model.LinearRegression`. To keep the old\n default behavior, set `min_samples=X.shape[1] + 1` explicitly." - } + }, + "refined_type": {} }, { "name": "residual_threshold", @@ -105868,7 +113282,8 @@ "docstring": { "type": "float, default=None", "description": "Maximum residual for a data sample to be classified as an inlier.\nBy default the threshold is chosen as the MAD (median absolute\ndeviation) of the target values `y`. Points whose residuals are\nstrictly equal to the threshold are considered as inliers." - } + }, + "refined_type": {} }, { "name": "is_data_valid", @@ -105878,7 +113293,8 @@ "docstring": { "type": "callable, default=None", "description": "This function is called with the randomly selected data before the\nmodel is fitted to it: `is_data_valid(X, y)`. 
If its return value is\nFalse the current randomly chosen sub-sample is skipped." - } + }, + "refined_type": {} }, { "name": "is_model_valid", @@ -105888,7 +113304,8 @@ "docstring": { "type": "callable, default=None", "description": "This function is called with the estimated model and the randomly\nselected data: `is_model_valid(model, X, y)`. If its return value is\nFalse the current randomly chosen sub-sample is skipped.\nRejecting samples with this function is computationally costlier than\nwith `is_data_valid`. `is_model_valid` should therefore only be used if\nthe estimated model is needed for making the rejection decision." - } + }, + "refined_type": {} }, { "name": "max_trials", @@ -105898,7 +113315,8 @@ "docstring": { "type": "int, default=100", "description": "Maximum number of iterations for random sample selection." - } + }, + "refined_type": {} }, { "name": "max_skips", @@ -105908,7 +113326,8 @@ "docstring": { "type": "int, default=np.inf", "description": "Maximum number of iterations that can be skipped due to finding zero\ninliers or invalid data defined by ``is_data_valid`` or invalid models\ndefined by ``is_model_valid``.\n\n.. versionadded:: 0.19" - } + }, + "refined_type": {} }, { "name": "stop_n_inliers", @@ -105918,7 +113337,8 @@ "docstring": { "type": "int, default=np.inf", "description": "Stop iteration if at least this number of inliers are found." - } + }, + "refined_type": {} }, { "name": "stop_score", @@ -105928,7 +113348,8 @@ "docstring": { "type": "float, default=np.inf", "description": "Stop iteration if score is greater equal than this threshold." - } + }, + "refined_type": {} }, { "name": "stop_probability", @@ -105938,6 +113359,14 @@ "docstring": { "type": "float in range [0, 1], default=0.99", "description": "RANSAC iteration stops if at least one outlier-free set of the training\ndata is sampled in RANSAC. This requires to generate at least N\nsamples (iterations)::\n\n N >= log(1 - probability) / log(1 - e**m)\n\nwhere the probability (confidence) is typically set to high value such\nas 0.99 (the default) and e is the current fraction of inliers w.r.t.\nthe total number of samples." + }, + "refined_type": { + "kind": "BoundaryType", + "base_type": "float", + "min": 0.0, + "max": 1.0, + "min_inclusive": true, + "max_inclusive": true } }, { @@ -105948,7 +113377,8 @@ "docstring": { "type": "str, callable, default='absolute_error'", "description": "String inputs, 'absolute_error' and 'squared_error' are supported which\nfind the absolute error and squared error per sample respectively.\n\nIf ``loss`` is a callable, then it should be a function that takes\ntwo arrays as inputs, the true and predicted value and returns a 1-D\narray with the i-th value of the array corresponding to the loss\non ``X[i]``.\n\nIf the loss on a sample is greater than the ``residual_threshold``,\nthen this sample is classified as an outlier.\n\n.. versionadded:: 0.18\n\n.. deprecated:: 1.0\n The loss 'squared_loss' was deprecated in v1.0 and will be removed\n in version 1.2. Use `loss='squared_error'` which is equivalent.\n\n.. deprecated:: 1.0\n The loss 'absolute_loss' was deprecated in v1.0 and will be removed\n in version 1.2. Use `loss='absolute_error'` which is equivalent." - } + }, + "refined_type": {} }, { "name": "random_state", @@ -105958,13 +113388,14 @@ "docstring": { "type": "int, RandomState instance, default=None", "description": "The generator used to initialize the centers.\nPass an int for reproducible output across multiple function calls.\nSee :term:`Glossary `." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, base_estimator=None, *, min_samples=None, residual_threshold=None, is_data_valid=None, is_model_valid=None, max_trials=100, max_skips=np.inf, stop_n_inliers=np.inf, stop_score=np.inf, stop_probability=0.99, loss='absolute_error', random_state=None):\n self.base_estimator = base_estimator\n self.min_samples = min_samples\n self.residual_threshold = residual_threshold\n self.is_data_valid = is_data_valid\n self.is_model_valid = is_model_valid\n self.max_trials = max_trials\n self.max_skips = max_skips\n self.stop_n_inliers = stop_n_inliers\n self.stop_score = stop_score\n self.stop_probability = stop_probability\n self.random_state = random_state\n self.loss = loss" }, { @@ -105982,13 +113413,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _more_tags(self):\n return {'_xfail_checks': {'check_sample_weights_invariance': 'zero sample_weight is not equivalent to removing samples'}}" }, { @@ -106006,7 +113438,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -106016,6 +113449,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "Training data." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -106026,7 +113463,8 @@ "docstring": { "type": "array-like of shape (n_samples,) or (n_samples, n_targets)", "description": "Target values." - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -106036,13 +113474,14 @@ "docstring": { "type": "array-like of shape (n_samples,), default=None", "description": "Individual weights for each sample\nraises error if sample_weight is passed and base_estimator\nfit method does not support it.\n\n.. versionadded:: 0.18" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Fit estimator using RANSAC algorithm.", - "docstring": "Fit estimator using RANSAC algorithm.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training data.\n\ny : array-like of shape (n_samples,) or (n_samples, n_targets)\n Target values.\n\nsample_weight : array-like of shape (n_samples,), default=None\n Individual weights for each sample\n raises error if sample_weight is passed and base_estimator\n fit method does not support it.\n\n .. versionadded:: 0.18\n\nReturns\n-------\nself : object\n Fitted `RANSACRegressor` estimator.\n\nRaises\n------\nValueError\n If no valid consensus set could be found. This occurs if\n `is_data_valid` and `is_model_valid` return False for all\n `max_trials` randomly chosen sub-samples.", + "docstring": "Fit estimator using RANSAC algorithm.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training data.\n\n y : array-like of shape (n_samples,) or (n_samples, n_targets)\n Target values.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Individual weights for each sample\n raises error if sample_weight is passed and base_estimator\n fit method does not support it.\n\n .. versionadded:: 0.18\n\n Returns\n -------\n self : object\n Fitted `RANSACRegressor` estimator.\n\n Raises\n ------\n ValueError\n If no valid consensus set could be found. 
This occurs if\n `is_data_valid` and `is_model_valid` return False for all\n `max_trials` randomly chosen sub-samples.\n ", "source_code": "\ndef fit(self, X, y, sample_weight=None):\n \"\"\"Fit estimator using RANSAC algorithm.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training data.\n\n y : array-like of shape (n_samples,) or (n_samples, n_targets)\n Target values.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Individual weights for each sample\n raises error if sample_weight is passed and base_estimator\n fit method does not support it.\n\n .. versionadded:: 0.18\n\n Returns\n -------\n self : object\n Fitted `RANSACRegressor` estimator.\n\n Raises\n ------\n ValueError\n If no valid consensus set could be found. This occurs if\n `is_data_valid` and `is_model_valid` return False for all\n `max_trials` randomly chosen sub-samples.\n \"\"\"\n check_X_params = dict(accept_sparse='csr', force_all_finite=False)\n check_y_params = dict(ensure_2d=False)\n (X, y) = self._validate_data(X, y, validate_separately=(check_X_params, check_y_params))\n check_consistent_length(X, y)\n if self.base_estimator is not None:\n base_estimator = clone(self.base_estimator)\n else:\n base_estimator = LinearRegression()\n if self.min_samples is None:\n if not isinstance(base_estimator, LinearRegression):\n warnings.warn(f'From version 1.2, `min_samples` needs to be explicitly set otherwise an error will be raised. To keep the current behavior, you need to set `min_samples` to `X.shape[1] + 1 that is {X.shape[1] + 1}', FutureWarning)\n min_samples = X.shape[1] + 1\n elif 0 < self.min_samples < 1:\n min_samples = np.ceil(self.min_samples * X.shape[0])\n elif self.min_samples >= 1:\n if self.min_samples % 1 != 0:\n raise ValueError('Absolute number of samples must be an integer value.')\n min_samples = self.min_samples\n else:\n raise ValueError('Value for `min_samples` must be scalar and positive.')\n if min_samples > X.shape[0]:\n raise ValueError('`min_samples` may not be larger than number of samples: n_samples = %d.' % X.shape[0])\n if self.stop_probability < 0 or self.stop_probability > 1:\n raise ValueError('`stop_probability` must be in range [0, 1].')\n if self.residual_threshold is None:\n residual_threshold = np.median(np.abs(y - np.median(y)))\n else:\n residual_threshold = self.residual_threshold\n if self.loss in ('absolute_error', 'absolute_loss'):\n if self.loss == 'absolute_loss':\n warnings.warn(\"The loss 'absolute_loss' was deprecated in v1.0 and will be removed in version 1.2. Use `loss='absolute_error'` which is equivalent.\", FutureWarning)\n if y.ndim == 1:\n loss_function = lambda y_true, y_pred: np.abs(y_true - y_pred)\n else:\n loss_function = lambda y_true, y_pred: np.sum(np.abs(y_true - y_pred), axis=1)\n elif self.loss in ('squared_error', 'squared_loss'):\n if self.loss == 'squared_loss':\n warnings.warn(\"The loss 'squared_loss' was deprecated in v1.0 and will be removed in version 1.2. Use `loss='squared_error'` which is equivalent.\", FutureWarning)\n if y.ndim == 1:\n loss_function = lambda y_true, y_pred: (y_true - y_pred)**2\n else:\n loss_function = lambda y_true, y_pred: np.sum((y_true - y_pred)**2, axis=1)\n elif callable(self.loss):\n loss_function = self.loss\n else:\n raise ValueError(\"loss should be 'absolute_error', 'squared_error' or a callable. Got %s. 
\" % self.loss)\n random_state = check_random_state(self.random_state)\n try:\n base_estimator.set_params(random_state=random_state)\n except ValueError:\n pass\n estimator_fit_has_sample_weight = has_fit_parameter(base_estimator, 'sample_weight')\n estimator_name = type(base_estimator).__name__\n if sample_weight is not None and not estimator_fit_has_sample_weight:\n raise ValueError('%s does not support sample_weight. Samples weights are only used for the calibration itself.' % estimator_name)\n if sample_weight is not None:\n sample_weight = _check_sample_weight(sample_weight, X)\n n_inliers_best = 1\n score_best = -np.inf\n inlier_mask_best = None\n X_inlier_best = None\n y_inlier_best = None\n inlier_best_idxs_subset = None\n self.n_skips_no_inliers_ = 0\n self.n_skips_invalid_data_ = 0\n self.n_skips_invalid_model_ = 0\n n_samples = X.shape[0]\n sample_idxs = np.arange(n_samples)\n self.n_trials_ = 0\n max_trials = self.max_trials\n while self.n_trials_ < max_trials:\n self.n_trials_ += 1\n if self.n_skips_no_inliers_ + self.n_skips_invalid_data_ + self.n_skips_invalid_model_ > self.max_skips:\n break\n subset_idxs = sample_without_replacement(n_samples, min_samples, random_state=random_state)\n X_subset = X[subset_idxs]\n y_subset = y[subset_idxs]\n if self.is_data_valid is not None and not self.is_data_valid(X_subset, y_subset):\n self.n_skips_invalid_data_ += 1\n continue\n if sample_weight is None:\n base_estimator.fit(X_subset, y_subset)\n else:\n base_estimator.fit(X_subset, y_subset, sample_weight=sample_weight[subset_idxs])\n if self.is_model_valid is not None and not self.is_model_valid(base_estimator, X_subset, y_subset):\n self.n_skips_invalid_model_ += 1\n continue\n y_pred = base_estimator.predict(X)\n residuals_subset = loss_function(y, y_pred)\n inlier_mask_subset = residuals_subset <= residual_threshold\n n_inliers_subset = np.sum(inlier_mask_subset)\n if n_inliers_subset < n_inliers_best:\n self.n_skips_no_inliers_ += 1\n continue\n inlier_idxs_subset = sample_idxs[inlier_mask_subset]\n X_inlier_subset = X[inlier_idxs_subset]\n y_inlier_subset = y[inlier_idxs_subset]\n score_subset = base_estimator.score(X_inlier_subset, y_inlier_subset)\n if n_inliers_subset == n_inliers_best and score_subset < score_best:\n continue\n n_inliers_best = n_inliers_subset\n score_best = score_subset\n inlier_mask_best = inlier_mask_subset\n X_inlier_best = X_inlier_subset\n y_inlier_best = y_inlier_subset\n inlier_best_idxs_subset = inlier_idxs_subset\n max_trials = min(max_trials, _dynamic_max_trials(n_inliers_best, n_samples, min_samples, self.stop_probability))\n if n_inliers_best >= self.stop_n_inliers or score_best >= self.stop_score:\n break\n if inlier_mask_best is None:\n if self.n_skips_no_inliers_ + self.n_skips_invalid_data_ + self.n_skips_invalid_model_ > self.max_skips:\n raise ValueError('RANSAC skipped more iterations than `max_skips` without finding a valid consensus set. Iterations were skipped because each randomly chosen sub-sample failed the passing criteria. See estimator attributes for diagnostics (n_skips*).')\n else:\n raise ValueError('RANSAC could not find a valid consensus set. All `max_trials` iterations were skipped because each randomly chosen sub-sample failed the passing criteria. 
See estimator attributes for diagnostics (n_skips*).')\n elif self.n_skips_no_inliers_ + self.n_skips_invalid_data_ + self.n_skips_invalid_model_ > self.max_skips:\n warnings.warn('RANSAC found a valid consensus set but exited early due to skipping more iterations than `max_skips`. See estimator attributes for diagnostics (n_skips*).', ConvergenceWarning)\n if sample_weight is None:\n base_estimator.fit(X_inlier_best, y_inlier_best)\n else:\n base_estimator.fit(X_inlier_best, y_inlier_best, sample_weight=sample_weight[inlier_best_idxs_subset])\n self.estimator_ = base_estimator\n self.inlier_mask_ = inlier_mask_best\n return self" }, { @@ -106060,7 +113499,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -106070,13 +113510,17 @@ "docstring": { "type": "{array-like or sparse matrix} of shape (n_samples, n_features)", "description": "Input data." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } } ], "results": [], "is_public": true, "description": "Predict using the estimated model.\n\nThis is a wrapper for `estimator_.predict(X)`.", - "docstring": "Predict using the estimated model.\n\nThis is a wrapper for `estimator_.predict(X)`.\n\nParameters\n----------\nX : {array-like or sparse matrix} of shape (n_samples, n_features)\n Input data.\n\nReturns\n-------\ny : array, shape = [n_samples] or [n_samples, n_targets]\n Returns predicted values.", + "docstring": "Predict using the estimated model.\n\n This is a wrapper for `estimator_.predict(X)`.\n\n Parameters\n ----------\n X : {array-like or sparse matrix} of shape (n_samples, n_features)\n Input data.\n\n Returns\n -------\n y : array, shape = [n_samples] or [n_samples, n_targets]\n Returns predicted values.\n ", "source_code": "\ndef predict(self, X):\n \"\"\"Predict using the estimated model.\n\n This is a wrapper for `estimator_.predict(X)`.\n\n Parameters\n ----------\n X : {array-like or sparse matrix} of shape (n_samples, n_features)\n Input data.\n\n Returns\n -------\n y : array, shape = [n_samples] or [n_samples, n_targets]\n Returns predicted values.\n \"\"\"\n check_is_fitted(self)\n X = self._validate_data(X, force_all_finite=False, accept_sparse=True, reset=False)\n return self.estimator_.predict(X)" }, { @@ -106094,7 +113538,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -106104,7 +113549,8 @@ "docstring": { "type": "(array-like or sparse matrix} of shape (n_samples, n_features)", "description": "Training data." - } + }, + "refined_type": {} }, { "name": "y", @@ -106114,13 +113560,14 @@ "docstring": { "type": "array-like of shape (n_samples,) or (n_samples, n_targets)", "description": "Target values." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Return the score of the prediction.\n\nThis is a wrapper for `estimator_.score(X, y)`.", - "docstring": "Return the score of the prediction.\n\nThis is a wrapper for `estimator_.score(X, y)`.\n\nParameters\n----------\nX : (array-like or sparse matrix} of shape (n_samples, n_features)\n Training data.\n\ny : array-like of shape (n_samples,) or (n_samples, n_targets)\n Target values.\n\nReturns\n-------\nz : float\n Score of the prediction.", + "docstring": "Return the score of the prediction.\n\n This is a wrapper for `estimator_.score(X, y)`.\n\n Parameters\n ----------\n X : (array-like or sparse matrix} of shape (n_samples, n_features)\n Training data.\n\n y : array-like of shape (n_samples,) or (n_samples, n_targets)\n Target values.\n\n Returns\n -------\n z : float\n Score of the prediction.\n ", "source_code": "\ndef score(self, X, y):\n \"\"\"Return the score of the prediction.\n\n This is a wrapper for `estimator_.score(X, y)`.\n\n Parameters\n ----------\n X : (array-like or sparse matrix} of shape (n_samples, n_features)\n Training data.\n\n y : array-like of shape (n_samples,) or (n_samples, n_targets)\n Target values.\n\n Returns\n -------\n z : float\n Score of the prediction.\n \"\"\"\n check_is_fitted(self)\n X = self._validate_data(X, force_all_finite=False, accept_sparse=True, reset=False)\n return self.estimator_.score(X, y)" }, { @@ -106138,7 +113585,8 @@ "docstring": { "type": "int", "description": "Number of inliers in the data." - } + }, + "refined_type": {} }, { "name": "n_samples", @@ -106148,7 +113596,8 @@ "docstring": { "type": "int", "description": "Total number of samples in the data." - } + }, + "refined_type": {} }, { "name": "min_samples", @@ -106158,7 +113607,8 @@ "docstring": { "type": "int", "description": "Minimum number of samples chosen randomly from original data." - } + }, + "refined_type": {} }, { "name": "probability", @@ -106168,13 +113618,14 @@ "docstring": { "type": "float", "description": "Probability (confidence) that one outlier-free sample is generated." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Determine number trials such that at least one outlier-free subset is sampled for the given inlier/outlier ratio.", - "docstring": "Determine number trials such that at least one outlier-free subset is\nsampled for the given inlier/outlier ratio.\n\nParameters\n----------\nn_inliers : int\n Number of inliers in the data.\n\nn_samples : int\n Total number of samples in the data.\n\nmin_samples : int\n Minimum number of samples chosen randomly from original data.\n\nprobability : float\n Probability (confidence) that one outlier-free sample is generated.\n\nReturns\n-------\ntrials : int\n Number of trials.", + "description": "Determine number trials such that at least one outlier-free subset is\nsampled for the given inlier/outlier ratio.", + "docstring": "Determine number trials such that at least one outlier-free subset is\n sampled for the given inlier/outlier ratio.\n\n Parameters\n ----------\n n_inliers : int\n Number of inliers in the data.\n\n n_samples : int\n Total number of samples in the data.\n\n min_samples : int\n Minimum number of samples chosen randomly from original data.\n\n probability : float\n Probability (confidence) that one outlier-free sample is generated.\n\n Returns\n -------\n trials : int\n Number of trials.\n\n ", "source_code": "\ndef _dynamic_max_trials(n_inliers, n_samples, min_samples, probability):\n \"\"\"Determine number trials such that at least one outlier-free subset is\n sampled for the given inlier/outlier ratio.\n\n Parameters\n ----------\n n_inliers : int\n Number of inliers in the data.\n\n n_samples : int\n Total number of samples in the data.\n\n min_samples : int\n Minimum number of samples chosen randomly from original data.\n\n probability : float\n Probability (confidence) that one outlier-free sample is generated.\n\n Returns\n -------\n trials : int\n Number of trials.\n\n \"\"\"\n inlier_ratio = n_inliers / float(n_samples)\n nom = max(_EPSILON, 1 - probability)\n denom = max(_EPSILON, 1 - inlier_ratio**min_samples)\n if nom == 1:\n return 0\n if denom == 1:\n return float('inf')\n return abs(float(np.ceil(np.log(nom) / np.log(denom))))" }, { @@ -106192,7 +113643,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "alpha", @@ -106202,6 +113654,10 @@ "docstring": { "type": "{float, ndarray of shape (n_targets,)}, default=1.0", "description": "Regularization strength; must be a positive float. Regularization\nimproves the conditioning of the problem and reduces the variance of\nthe estimates. Larger values specify stronger regularization.\nAlpha corresponds to ``1 / (2C)`` in other linear models such as\n:class:`~sklearn.linear_model.LogisticRegression` or\n:class:`~sklearn.svm.LinearSVC`. If an array is passed, penalties are\nassumed to be specific to the targets. Hence they must correspond in\nnumber." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -106212,7 +113668,8 @@ "docstring": { "type": "bool, default=True", "description": "Whether to fit the intercept for this model. If set\nto false, no intercept will be used in calculations\n(i.e. ``X`` and ``y`` are expected to be centered)." 
- } + }, + "refined_type": {} }, { "name": "normalize", @@ -106222,7 +113679,8 @@ "docstring": { "type": "bool, default=False", "description": "This parameter is ignored when ``fit_intercept`` is set to False.\nIf True, the regressors X will be normalized before regression by\nsubtracting the mean and dividing by the l2-norm.\nIf you wish to standardize, please use\n:class:`~sklearn.preprocessing.StandardScaler` before calling ``fit``\non an estimator with ``normalize=False``.\n\n.. deprecated:: 1.0\n ``normalize`` was deprecated in version 1.0 and\n will be removed in 1.2." - } + }, + "refined_type": {} }, { "name": "copy_X", @@ -106232,7 +113690,8 @@ "docstring": { "type": "bool, default=True", "description": "If True, X will be copied; else, it may be overwritten." - } + }, + "refined_type": {} }, { "name": "max_iter", @@ -106242,7 +113701,8 @@ "docstring": { "type": "int, default=None", "description": "Maximum number of iterations for conjugate gradient solver.\nFor 'sparse_cg' and 'lsqr' solvers, the default value is determined\nby scipy.sparse.linalg. For 'sag' solver, the default value is 1000.\nFor 'lbfgs' solver, the default value is 15000." - } + }, + "refined_type": {} }, { "name": "tol", @@ -106252,7 +113712,8 @@ "docstring": { "type": "float, default=1e-3", "description": "Precision of the solution." - } + }, + "refined_type": {} }, { "name": "solver", @@ -106262,6 +113723,19 @@ "docstring": { "type": "{'auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga', 'lbfgs'}, default='auto'", "description": "Solver to use in the computational routines:\n\n- 'auto' chooses the solver automatically based on the type of data.\n\n- 'svd' uses a Singular Value Decomposition of X to compute the Ridge\n coefficients. More stable for singular matrices than 'cholesky'.\n\n- 'cholesky' uses the standard scipy.linalg.solve function to\n obtain a closed-form solution.\n\n- 'sparse_cg' uses the conjugate gradient solver as found in\n scipy.sparse.linalg.cg. As an iterative algorithm, this solver is\n more appropriate than 'cholesky' for large-scale data\n (possibility to set `tol` and `max_iter`).\n\n- 'lsqr' uses the dedicated regularized least-squares routine\n scipy.sparse.linalg.lsqr. It is the fastest and uses an iterative\n procedure.\n\n- 'sag' uses a Stochastic Average Gradient descent, and 'saga' uses\n its improved, unbiased version named SAGA. Both methods also use an\n iterative procedure, and are often faster than other solvers when\n both n_samples and n_features are large. Note that 'sag' and\n 'saga' fast convergence is only guaranteed on features with\n approximately the same scale. You can preprocess the data with a\n scaler from sklearn.preprocessing.\n\n- 'lbfgs' uses L-BFGS-B algorithm implemented in\n `scipy.optimize.minimize`. It can be used only when `positive`\n is True.\n\nAll last six solvers support both dense and sparse data. However, only\n'sag', 'sparse_cg', and 'lbfgs' support sparse input when `fit_intercept`\nis True.\n\n.. versionadded:: 0.17\n Stochastic Average Gradient descent solver.\n.. versionadded:: 0.19\n SAGA solver." + }, + "refined_type": { + "kind": "EnumType", + "values": [ + "lsqr", + "saga", + "lbfgs", + "svd", + "sag", + "sparse_cg", + "auto", + "cholesky" + ] } }, { @@ -106272,7 +113746,8 @@ "docstring": { "type": "bool, default=False", "description": "When set to ``True``, forces the coefficients to be positive.\nOnly 'lbfgs' solver is supported in this case." 
- } + }, + "refined_type": {} }, { "name": "random_state", @@ -106282,13 +113757,14 @@ "docstring": { "type": "int, RandomState instance, default=None", "description": "Used when ``solver`` == 'sag' or 'saga' to shuffle the data.\nSee :term:`Glossary ` for details.\n\n.. versionadded:: 0.17\n `random_state` to support Stochastic Average Gradient." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, alpha=1.0, *, fit_intercept=True, normalize='deprecated', copy_X=True, max_iter=None, tol=0.001, solver='auto', positive=False, random_state=None):\n super().__init__(alpha=alpha, fit_intercept=fit_intercept, normalize=normalize, copy_X=copy_X, max_iter=max_iter, tol=tol, solver=solver, positive=positive, random_state=random_state)" }, { @@ -106306,7 +113782,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -106316,6 +113793,10 @@ "docstring": { "type": "{ndarray, sparse matrix} of shape (n_samples, n_features)", "description": "Training data." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -106326,7 +113807,8 @@ "docstring": { "type": "ndarray of shape (n_samples,) or (n_samples, n_targets)", "description": "Target values." - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -106336,13 +113818,14 @@ "docstring": { "type": "float or ndarray of shape (n_samples,), default=None", "description": "Individual weights for each sample. If given a float, every sample\nwill have the same weight." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Fit Ridge regression model.", - "docstring": "Fit Ridge regression model.\n\nParameters\n----------\nX : {ndarray, sparse matrix} of shape (n_samples, n_features)\n Training data.\n\ny : ndarray of shape (n_samples,) or (n_samples, n_targets)\n Target values.\n\nsample_weight : float or ndarray of shape (n_samples,), default=None\n Individual weights for each sample. If given a float, every sample\n will have the same weight.\n\nReturns\n-------\nself : object\n Fitted estimator.", + "docstring": "Fit Ridge regression model.\n\n Parameters\n ----------\n X : {ndarray, sparse matrix} of shape (n_samples, n_features)\n Training data.\n\n y : ndarray of shape (n_samples,) or (n_samples, n_targets)\n Target values.\n\n sample_weight : float or ndarray of shape (n_samples,), default=None\n Individual weights for each sample. If given a float, every sample\n will have the same weight.\n\n Returns\n -------\n self : object\n Fitted estimator.\n ", "source_code": "\ndef fit(self, X, y, sample_weight=None):\n \"\"\"Fit Ridge regression model.\n\n Parameters\n ----------\n X : {ndarray, sparse matrix} of shape (n_samples, n_features)\n Training data.\n\n y : ndarray of shape (n_samples,) or (n_samples, n_targets)\n Target values.\n\n sample_weight : float or ndarray of shape (n_samples,), default=None\n Individual weights for each sample. 
If given a float, every sample\n will have the same weight.\n\n Returns\n -------\n self : object\n Fitted estimator.\n \"\"\"\n _accept_sparse = _get_valid_accept_sparse(sparse.issparse(X), self.solver)\n (X, y) = self._validate_data(X, y, accept_sparse=_accept_sparse, dtype=[np.float64, np.float32], multi_output=True, y_numeric=True)\n return super().fit(X, y, sample_weight=sample_weight)" }, { @@ -106360,7 +113843,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "alpha", @@ -106370,7 +113854,8 @@ "docstring": { "type": "float, default=1.0", "description": "Regularization strength; must be a positive float. Regularization\nimproves the conditioning of the problem and reduces the variance of\nthe estimates. Larger values specify stronger regularization.\nAlpha corresponds to ``1 / (2C)`` in other linear models such as\n:class:`~sklearn.linear_model.LogisticRegression` or\n:class:`~sklearn.svm.LinearSVC`." - } + }, + "refined_type": {} }, { "name": "fit_intercept", @@ -106380,7 +113865,8 @@ "docstring": { "type": "bool, default=True", "description": "Whether to calculate the intercept for this model. If set to false, no\nintercept will be used in calculations (e.g. data is expected to be\nalready centered)." - } + }, + "refined_type": {} }, { "name": "normalize", @@ -106390,7 +113876,8 @@ "docstring": { "type": "bool, default=False", "description": "This parameter is ignored when ``fit_intercept`` is set to False.\nIf True, the regressors X will be normalized before regression by\nsubtracting the mean and dividing by the l2-norm.\nIf you wish to standardize, please use\n:class:`~sklearn.preprocessing.StandardScaler` before calling ``fit``\non an estimator with ``normalize=False``.\n\n.. deprecated:: 1.0\n ``normalize`` was deprecated in version 1.0 and\n will be removed in 1.2." - } + }, + "refined_type": {} }, { "name": "copy_X", @@ -106400,7 +113887,8 @@ "docstring": { "type": "bool, default=True", "description": "If True, X will be copied; else, it may be overwritten." - } + }, + "refined_type": {} }, { "name": "max_iter", @@ -106410,7 +113898,8 @@ "docstring": { "type": "int, default=None", "description": "Maximum number of iterations for conjugate gradient solver.\nThe default value is determined by scipy.sparse.linalg." - } + }, + "refined_type": {} }, { "name": "tol", @@ -106420,7 +113909,8 @@ "docstring": { "type": "float, default=1e-3", "description": "Precision of the solution." - } + }, + "refined_type": {} }, { "name": "class_weight", @@ -106430,6 +113920,10 @@ "docstring": { "type": "dict or 'balanced', default=None", "description": "Weights associated with classes in the form ``{class_label: weight}``.\nIf not given, all classes are supposed to have weight one.\n\nThe \"balanced\" mode uses the values of y to automatically adjust\nweights inversely proportional to class frequencies in the input data\nas ``n_samples / (n_classes * np.bincount(y))``." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -106439,7 +113933,20 @@ "assigned_by": "NAME_ONLY", "docstring": { "type": "{'auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga', 'lbfgs'}, default='auto'", - "description": "Solver to use in the computational routines:\n\n- 'auto' chooses the solver automatically based on the type of data.\n\n- 'svd' uses a Singular Value Decomposition of X to compute the Ridge\n coefficients. 
More stable for singular matrices than 'cholesky'.\n\n- 'cholesky' uses the standard scipy.linalg.solve function to\n obtain a closed-form solution.\n\n- 'sparse_cg' uses the conjugate gradient solver as found in\n scipy.sparse.linalg.cg. As an iterative algorithm, this solver is\n more appropriate than 'cholesky' for large-scale data\n (possibility to set `tol` and `max_iter`).\n\n- 'lsqr' uses the dedicated regularized least-squares routine\n scipy.sparse.linalg.lsqr. It is the fastest and uses an iterative\n procedure.\n\n- 'sag' uses a Stochastic Average Gradient descent, and 'saga' uses\n its unbiased and more flexible version named SAGA. Both methods\n use an iterative procedure, and are often faster than other solvers\n when both n_samples and n_features are large. Note that 'sag' and\n 'saga' fast convergence is only guaranteed on features with\n approximately the same scale. You can preprocess the data with a\n scaler from sklearn.preprocessing.\n\n .. versionadded:: 0.17\n Stochastic Average Gradient descent solver.\n .. versionadded:: 0.19\n SAGA solver.\n\n- 'lbfgs' uses L-BFGS-B algorithm implemented in\n `scipy.optimize.minimize`. It can be used only when `positive`\n is True." + "description": "Solver to use in the computational routines:\n\n- 'auto' chooses the solver automatically based on the type of data.\n\n- 'svd' uses a Singular Value Decomposition of X to compute the Ridge\n coefficients. More stable for singular matrices than 'cholesky'.\n\n- 'cholesky' uses the standard scipy.linalg.solve function to\n obtain a closed-form solution.\n\n- 'sparse_cg' uses the conjugate gradient solver as found in\n scipy.sparse.linalg.cg. As an iterative algorithm, this solver is\n more appropriate than 'cholesky' for large-scale data\n (possibility to set `tol` and `max_iter`).\n\n- 'lsqr' uses the dedicated regularized least-squares routine\n scipy.sparse.linalg.lsqr. It is the fastest and uses an iterative\n procedure.\n\n- 'sag' uses a Stochastic Average Gradient descent, and 'saga' uses\n its unbiased and more flexible version named SAGA. Both methods\n use an iterative procedure, and are often faster than other solvers\n when both n_samples and n_features are large. Note that 'sag' and\n 'saga' fast convergence is only guaranteed on features with\n approximately the same scale. You can preprocess the data with a\n scaler from sklearn.preprocessing.\n\n .. versionadded:: 0.17\n Stochastic Average Gradient descent solver.\n .. versionadded:: 0.19\n SAGA solver.\n\n- 'lbfgs' uses L-BFGS-B algorithm implemented in\n `scipy.optimize.minimize`. It can be used only when `positive`\n is True." + }, + "refined_type": { + "kind": "EnumType", + "values": [ + "lsqr", + "saga", + "lbfgs", + "svd", + "sag", + "sparse_cg", + "auto", + "cholesky" + ] } }, { @@ -106450,7 +113957,8 @@ "docstring": { "type": "bool, default=False", "description": "When set to ``True``, forces the coefficients to be positive.\nOnly 'lbfgs' solver is supported in this case." - } + }, + "refined_type": {} }, { "name": "random_state", @@ -106460,39 +113968,16 @@ "docstring": { "type": "int, RandomState instance, default=None", "description": "Used when ``solver`` == 'sag' or 'saga' to shuffle the data.\nSee :term:`Glossary ` for details." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, alpha=1.0, *, fit_intercept=True, normalize='deprecated', copy_X=True, max_iter=None, tol=0.001, class_weight=None, solver='auto', positive=False, random_state=None):\n super().__init__(alpha=alpha, fit_intercept=fit_intercept, normalize=normalize, copy_X=copy_X, max_iter=max_iter, tol=tol, solver=solver, positive=positive, random_state=random_state)\n self.class_weight = class_weight" }, - { - "name": "classes_", - "unique_name": "classes_@getter", - "qname": "sklearn.linear_model._ridge.RidgeClassifier.classes_", - "unique_qname": "sklearn.linear_model._ridge.RidgeClassifier.classes_@getter", - "decorators": ["property"], - "parameters": [ - { - "name": "self", - "default_value": null, - "is_public": true, - "assigned_by": "POSITION_OR_NAME", - "docstring": { - "type": "", - "description": "" - } - } - ], - "results": [], - "is_public": true, - "description": "Classes labels.", - "docstring": "Classes labels.", - "source_code": "\n@property\ndef classes_(self):\n \"\"\"Classes labels.\"\"\"\n return self._label_binarizer.classes_" - }, { "name": "fit", "unique_name": "fit", @@ -106508,7 +113993,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -106518,6 +114004,10 @@ "docstring": { "type": "{ndarray, sparse matrix} of shape (n_samples, n_features)", "description": "Training data." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -106528,7 +114018,8 @@ "docstring": { "type": "ndarray of shape (n_samples,)", "description": "Target values." - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -106537,15 +114028,16 @@ "assigned_by": "POSITION_OR_NAME", "docstring": { "type": "float or ndarray of shape (n_samples,), default=None", - "description": "Individual weights for each sample. If given a float, every sample\nwill have the same weight.\n\n.. versionadded:: 0.17\n *sample_weight* support to Classifier." - } + "description": "Individual weights for each sample. If given a float, every sample\nwill have the same weight.\n\n.. versionadded:: 0.17\n *sample_weight* support to RidgeClassifier." + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Fit Ridge classifier model.", - "docstring": "Fit Ridge classifier model.\n\nParameters\n----------\nX : {ndarray, sparse matrix} of shape (n_samples, n_features)\n Training data.\n\ny : ndarray of shape (n_samples,)\n Target values.\n\nsample_weight : float or ndarray of shape (n_samples,), default=None\n Individual weights for each sample. If given a float, every sample\n will have the same weight.\n\n .. versionadded:: 0.17\n *sample_weight* support to Classifier.\n\nReturns\n-------\nself : object\n Instance of the estimator.", - "source_code": "\ndef fit(self, X, y, sample_weight=None):\n \"\"\"Fit Ridge classifier model.\n\n Parameters\n ----------\n X : {ndarray, sparse matrix} of shape (n_samples, n_features)\n Training data.\n\n y : ndarray of shape (n_samples,)\n Target values.\n\n sample_weight : float or ndarray of shape (n_samples,), default=None\n Individual weights for each sample. If given a float, every sample\n will have the same weight.\n\n .. 
versionadded:: 0.17\n *sample_weight* support to Classifier.\n\n Returns\n -------\n self : object\n Instance of the estimator.\n \"\"\"\n _accept_sparse = _get_valid_accept_sparse(sparse.issparse(X), self.solver)\n (X, y) = self._validate_data(X, y, accept_sparse=_accept_sparse, multi_output=True, y_numeric=False)\n sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype)\n self._label_binarizer = LabelBinarizer(pos_label=1, neg_label=-1)\n Y = self._label_binarizer.fit_transform(y)\n if not self._label_binarizer.y_type_.startswith('multilabel'):\n y = column_or_1d(y, warn=True)\n else:\n raise ValueError(\"%s doesn't support multi-label classification\" % self.__class__.__name__)\n if self.class_weight:\n sample_weight = sample_weight * compute_sample_weight(self.class_weight, y)\n super().fit(X, Y, sample_weight=sample_weight)\n return self" + "docstring": "Fit Ridge classifier model.\n\n Parameters\n ----------\n X : {ndarray, sparse matrix} of shape (n_samples, n_features)\n Training data.\n\n y : ndarray of shape (n_samples,)\n Target values.\n\n sample_weight : float or ndarray of shape (n_samples,), default=None\n Individual weights for each sample. If given a float, every sample\n will have the same weight.\n\n .. versionadded:: 0.17\n *sample_weight* support to RidgeClassifier.\n\n Returns\n -------\n self : object\n Instance of the estimator.\n ", + "source_code": "\ndef fit(self, X, y, sample_weight=None):\n \"\"\"Fit Ridge classifier model.\n\n Parameters\n ----------\n X : {ndarray, sparse matrix} of shape (n_samples, n_features)\n Training data.\n\n y : ndarray of shape (n_samples,)\n Target values.\n\n sample_weight : float or ndarray of shape (n_samples,), default=None\n Individual weights for each sample. If given a float, every sample\n will have the same weight.\n\n .. versionadded:: 0.17\n *sample_weight* support to RidgeClassifier.\n\n Returns\n -------\n self : object\n Instance of the estimator.\n \"\"\"\n (X, y, sample_weight, Y) = self._prepare_data(X, y, sample_weight, self.solver)\n super().fit(X, Y, sample_weight=sample_weight)\n return self" }, { "name": "__init__", @@ -106562,7 +114054,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "alphas", @@ -106572,7 +114065,8 @@ "docstring": { "type": "ndarray of shape (n_alphas,), default=(0.1, 1.0, 10.0)", "description": "Array of alpha values to try.\nRegularization strength; must be a positive float. Regularization\nimproves the conditioning of the problem and reduces the variance of\nthe estimates. Larger values specify stronger regularization.\nAlpha corresponds to ``1 / (2C)`` in other linear models such as\n:class:`~sklearn.linear_model.LogisticRegression` or\n:class:`~sklearn.svm.LinearSVC`." - } + }, + "refined_type": {} }, { "name": "fit_intercept", @@ -106582,7 +114076,8 @@ "docstring": { "type": "bool, default=True", "description": "Whether to calculate the intercept for this model. If set\nto false, no intercept will be used in calculations\n(i.e. data is expected to be centered)." 
- } + }, + "refined_type": {} }, { "name": "normalize", @@ -106592,7 +114087,8 @@ "docstring": { "type": "bool, default=False", "description": "This parameter is ignored when ``fit_intercept`` is set to False.\nIf True, the regressors X will be normalized before regression by\nsubtracting the mean and dividing by the l2-norm.\nIf you wish to standardize, please use\n:class:`~sklearn.preprocessing.StandardScaler` before calling ``fit``\non an estimator with ``normalize=False``.\n\n.. deprecated:: 1.0\n ``normalize`` was deprecated in version 1.0 and\n will be removed in 1.2." - } + }, + "refined_type": {} }, { "name": "scoring", @@ -106602,7 +114098,8 @@ "docstring": { "type": "str, callable, default=None", "description": "A string (see model evaluation documentation) or\na scorer callable object / function with signature\n``scorer(estimator, X, y)``." - } + }, + "refined_type": {} }, { "name": "cv", @@ -106612,7 +114109,8 @@ "docstring": { "type": "int, cross-validation generator or an iterable, default=None", "description": "Determines the cross-validation splitting strategy.\nPossible inputs for cv are:\n\n- None, to use the efficient Leave-One-Out cross-validation\n- integer, to specify the number of folds.\n- :term:`CV splitter`,\n- An iterable yielding (train, test) splits as arrays of indices.\n\nRefer :ref:`User Guide ` for the various\ncross-validation strategies that can be used here." - } + }, + "refined_type": {} }, { "name": "class_weight", @@ -106622,6 +114120,10 @@ "docstring": { "type": "dict or 'balanced', default=None", "description": "Weights associated with classes in the form ``{class_label: weight}``.\nIf not given, all classes are supposed to have weight one.\n\nThe \"balanced\" mode uses the values of y to automatically adjust\nweights inversely proportional to class frequencies in the input data\nas ``n_samples / (n_classes * np.bincount(y))``." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -106632,13 +114134,14 @@ "docstring": { "type": "bool, default=False", "description": "Flag indicating if the cross-validation values corresponding to\neach alpha should be stored in the ``cv_values_`` attribute (see\nbelow). This flag is only compatible with ``cv=None`` (i.e. using\nLeave-One-Out Cross-Validation)." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, alphas=(0.1, 1.0, 10.0), *, fit_intercept=True, normalize='deprecated', scoring=None, cv=None, class_weight=None, store_cv_values=False):\n super().__init__(alphas=alphas, fit_intercept=fit_intercept, normalize=normalize, scoring=scoring, cv=cv, store_cv_values=store_cv_values)\n self.class_weight = class_weight" }, { @@ -106656,38 +114159,15 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", - "source_code": "\ndef _more_tags(self):\n return {'multilabel': True, '_xfail_checks': {'check_sample_weights_invariance': 'zero sample_weight is not equivalent to removing samples', 'check_classifiers_multilabel_output_format_predict': 'RidgeClassifierCV.predict outputs an array of shape (25,) instead of (25, 5)'}}" - }, - { - "name": "classes_", - "unique_name": "classes_@getter", - "qname": "sklearn.linear_model._ridge.RidgeClassifierCV.classes_", - "unique_qname": "sklearn.linear_model._ridge.RidgeClassifierCV.classes_@getter", - "decorators": ["property"], - "parameters": [ - { - "name": "self", - "default_value": null, - "is_public": true, - "assigned_by": "POSITION_OR_NAME", - "docstring": { - "type": "", - "description": "" - } - } - ], - "results": [], - "is_public": true, - "description": "Classes labels.", - "docstring": "Classes labels.", - "source_code": "\n@property\ndef classes_(self):\n \"\"\"Classes labels.\"\"\"\n return self._label_binarizer.classes_" + "docstring": null, + "source_code": "\ndef _more_tags(self):\n return {'multilabel': True, '_xfail_checks': {'check_sample_weights_invariance': 'zero sample_weight is not equivalent to removing samples'}}" }, { "name": "fit", @@ -106704,7 +114184,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -106714,7 +114195,8 @@ "docstring": { "type": "ndarray of shape (n_samples, n_features)", "description": "Training vectors, where `n_samples` is the number of samples\nand `n_features` is the number of features. When using GCV,\nwill be cast to float64 if necessary." - } + }, + "refined_type": {} }, { "name": "y", @@ -106724,7 +114206,8 @@ "docstring": { "type": "ndarray of shape (n_samples,)", "description": "Target values. Will be cast to X's dtype if necessary." - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -106734,14 +114217,15 @@ "docstring": { "type": "float or ndarray of shape (n_samples,), default=None", "description": "Individual weights for each sample. If given a float, every sample\nwill have the same weight." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Fit Ridge classifier with cv.", - "docstring": "Fit Ridge classifier with cv.\n\nParameters\n----------\nX : ndarray of shape (n_samples, n_features)\n Training vectors, where `n_samples` is the number of samples\n and `n_features` is the number of features. When using GCV,\n will be cast to float64 if necessary.\n\ny : ndarray of shape (n_samples,)\n Target values. Will be cast to X's dtype if necessary.\n\nsample_weight : float or ndarray of shape (n_samples,), default=None\n Individual weights for each sample. 
If given a float, every sample\n will have the same weight.\n\nReturns\n-------\nself : object\n Fitted estimator.", - "source_code": "\ndef fit(self, X, y, sample_weight=None):\n \"\"\"Fit Ridge classifier with cv.\n\n Parameters\n ----------\n X : ndarray of shape (n_samples, n_features)\n Training vectors, where `n_samples` is the number of samples\n and `n_features` is the number of features. When using GCV,\n will be cast to float64 if necessary.\n\n y : ndarray of shape (n_samples,)\n Target values. Will be cast to X's dtype if necessary.\n\n sample_weight : float or ndarray of shape (n_samples,), default=None\n Individual weights for each sample. If given a float, every sample\n will have the same weight.\n\n Returns\n -------\n self : object\n Fitted estimator.\n \"\"\"\n (X, y) = self._validate_data(X, y, accept_sparse=['csr', 'csc', 'coo'], multi_output=True, y_numeric=False)\n sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype)\n self._label_binarizer = LabelBinarizer(pos_label=1, neg_label=-1)\n Y = self._label_binarizer.fit_transform(y)\n if not self._label_binarizer.y_type_.startswith('multilabel'):\n y = column_or_1d(y, warn=True)\n if self.class_weight:\n sample_weight = sample_weight * compute_sample_weight(self.class_weight, y)\n target = Y if self.cv is None else y\n _BaseRidgeCV.fit(self, X, target, sample_weight=sample_weight)\n return self" + "docstring": "Fit Ridge classifier with cv.\n\n Parameters\n ----------\n X : ndarray of shape (n_samples, n_features)\n Training vectors, where `n_samples` is the number of samples\n and `n_features` is the number of features. When using GCV,\n will be cast to float64 if necessary.\n\n y : ndarray of shape (n_samples,)\n Target values. Will be cast to X's dtype if necessary.\n\n sample_weight : float or ndarray of shape (n_samples,), default=None\n Individual weights for each sample. If given a float, every sample\n will have the same weight.\n\n Returns\n -------\n self : object\n Fitted estimator.\n ", + "source_code": "\ndef fit(self, X, y, sample_weight=None):\n \"\"\"Fit Ridge classifier with cv.\n\n Parameters\n ----------\n X : ndarray of shape (n_samples, n_features)\n Training vectors, where `n_samples` is the number of samples\n and `n_features` is the number of features. When using GCV,\n will be cast to float64 if necessary.\n\n y : ndarray of shape (n_samples,)\n Target values. Will be cast to X's dtype if necessary.\n\n sample_weight : float or ndarray of shape (n_samples,), default=None\n Individual weights for each sample. 
If given a float, every sample\n will have the same weight.\n\n Returns\n -------\n self : object\n Fitted estimator.\n \"\"\"\n (X, y, sample_weight, Y) = self._prepare_data(X, y, sample_weight, solver='eigen')\n target = Y if self.cv is None else y\n super().fit(X, target, sample_weight=sample_weight)\n return self" }, { "name": "__init__", @@ -106758,7 +114242,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "alpha", @@ -106768,7 +114253,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "fit_intercept", @@ -106778,7 +114264,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "normalize", @@ -106788,7 +114275,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "copy_X", @@ -106798,7 +114286,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "max_iter", @@ -106808,7 +114297,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "tol", @@ -106818,7 +114308,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "solver", @@ -106828,7 +114319,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "positive", @@ -106838,7 +114330,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "random_state", @@ -106848,13 +114341,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@abstractmethod\ndef __init__(self, alpha=1.0, *, fit_intercept=True, normalize='deprecated', copy_X=True, max_iter=None, tol=0.001, solver='auto', positive=False, random_state=None):\n self.alpha = alpha\n self.fit_intercept = fit_intercept\n self.normalize = normalize\n self.copy_X = copy_X\n self.max_iter = max_iter\n self.tol = tol\n self.solver = solver\n self.positive = positive\n self.random_state = random_state" }, { @@ -106872,7 +114366,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -106882,7 +114377,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -106892,7 +114388,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -106902,13 +114399,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef fit(self, X, y, sample_weight=None):\n self._normalize = _deprecate_normalize(self.normalize, default=False, estimator_name=self.__class__.__name__)\n if self.solver == 'lbfgs' and not self.positive:\n raise ValueError(\"'lbfgs' solver can be used only when positive=True. Please use another solver.\")\n if self.positive:\n if self.solver not in ['auto', 'lbfgs']:\n raise ValueError(f\"solver='{self.solver}' does not support positive fitting. Please set the solver to 'auto' or 'lbfgs', or set `positive=False`\")\n else:\n solver = self.solver\n elif sparse.issparse(X) and self.fit_intercept:\n if self.solver not in ['auto', 'sparse_cg', 'sag', 'lbfgs']:\n raise ValueError(\"solver='{}' does not support fitting the intercept on sparse data. 
Please set the solver to 'auto' or 'sparse_cg', 'sag', 'lbfgs' or set `fit_intercept=False`\".format(self.solver))\n if self.solver == 'lbfgs':\n solver = 'lbfgs'\n elif self.solver == 'sag' and self.max_iter is None and self.tol > 0.0001:\n warnings.warn('\"sag\" solver requires many iterations to fit an intercept with sparse inputs. Either set the solver to \"auto\" or \"sparse_cg\", or set a low \"tol\" and a high \"max_iter\" (especially if inputs are not standardized).')\n solver = 'sag'\n else:\n solver = 'sparse_cg'\n else:\n solver = self.solver\n if sample_weight is not None:\n sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype)\n (X, y, X_offset, y_offset, X_scale) = self._preprocess_data(X, y, self.fit_intercept, self._normalize, self.copy_X, sample_weight=sample_weight, return_mean=True)\n if solver == 'sag' and sparse.issparse(X) and self.fit_intercept:\n (self.coef_, self.n_iter_, self.intercept_) = _ridge_regression(X, y, alpha=self.alpha, sample_weight=sample_weight, max_iter=self.max_iter, tol=self.tol, solver='sag', positive=self.positive, random_state=self.random_state, return_n_iter=True, return_intercept=True, check_input=False)\n self.intercept_ += y_offset\n else:\n if sparse.issparse(X) and self.fit_intercept:\n params = {'X_offset': X_offset, 'X_scale': X_scale}\n else:\n params = {}\n (self.coef_, self.n_iter_) = _ridge_regression(X, y, alpha=self.alpha, sample_weight=sample_weight, max_iter=self.max_iter, tol=self.tol, solver=solver, positive=self.positive, random_state=self.random_state, return_n_iter=True, return_intercept=False, check_input=False, **params)\n self._set_intercept(X_offset, y_offset, X_scale)\n return self" }, { @@ -106926,7 +114424,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "alphas", @@ -106936,7 +114435,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "fit_intercept", @@ -106946,7 +114446,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "normalize", @@ -106956,7 +114457,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "scoring", @@ -106966,7 +114468,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "cv", @@ -106976,7 +114479,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "gcv_mode", @@ -106986,7 +114490,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "store_cv_values", @@ -106996,7 +114501,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "alpha_per_target", @@ -107006,13 +114512,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, alphas=(0.1, 1.0, 10.0), *, fit_intercept=True, normalize='deprecated', scoring=None, cv=None, gcv_mode=None, store_cv_values=False, alpha_per_target=False):\n self.alphas = np.asarray(alphas)\n self.fit_intercept = fit_intercept\n self.normalize = normalize\n self.scoring = scoring\n self.cv = cv\n self.gcv_mode = gcv_mode\n self.store_cv_values = store_cv_values\n self.alpha_per_target = alpha_per_target" }, { @@ -107030,7 +114537,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -107040,7 +114548,8 @@ "docstring": { "type": "ndarray 
of shape (n_samples, n_features)", "description": "Training data. If using GCV, will be cast to float64\nif necessary." - } + }, + "refined_type": {} }, { "name": "y", @@ -107050,7 +114559,8 @@ "docstring": { "type": "ndarray of shape (n_samples,) or (n_samples, n_targets)", "description": "Target values. Will be cast to X's dtype if necessary." - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -107060,13 +114570,14 @@ "docstring": { "type": "float or ndarray of shape (n_samples,), default=None", "description": "Individual weights for each sample. If given a float, every sample\nwill have the same weight." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Fit Ridge regression model with cv.", - "docstring": "Fit Ridge regression model with cv.\n\nParameters\n----------\nX : ndarray of shape (n_samples, n_features)\n Training data. If using GCV, will be cast to float64\n if necessary.\n\ny : ndarray of shape (n_samples,) or (n_samples, n_targets)\n Target values. Will be cast to X's dtype if necessary.\n\nsample_weight : float or ndarray of shape (n_samples,), default=None\n Individual weights for each sample. If given a float, every sample\n will have the same weight.\n\nReturns\n-------\nself : object\n Fitted estimator.\n\nNotes\n-----\nWhen sample_weight is provided, the selected hyperparameter may depend\non whether we use leave-one-out cross-validation (cv=None or cv='auto')\nor another form of cross-validation, because only leave-one-out\ncross-validation takes the sample weights into account when computing\nthe validation score.", + "docstring": "Fit Ridge regression model with cv.\n\n Parameters\n ----------\n X : ndarray of shape (n_samples, n_features)\n Training data. If using GCV, will be cast to float64\n if necessary.\n\n y : ndarray of shape (n_samples,) or (n_samples, n_targets)\n Target values. Will be cast to X's dtype if necessary.\n\n sample_weight : float or ndarray of shape (n_samples,), default=None\n Individual weights for each sample. If given a float, every sample\n will have the same weight.\n\n Returns\n -------\n self : object\n Fitted estimator.\n\n Notes\n -----\n When sample_weight is provided, the selected hyperparameter may depend\n on whether we use leave-one-out cross-validation (cv=None or cv='auto')\n or another form of cross-validation, because only leave-one-out\n cross-validation takes the sample weights into account when computing\n the validation score.\n ", "source_code": "\ndef fit(self, X, y, sample_weight=None):\n \"\"\"Fit Ridge regression model with cv.\n\n Parameters\n ----------\n X : ndarray of shape (n_samples, n_features)\n Training data. If using GCV, will be cast to float64\n if necessary.\n\n y : ndarray of shape (n_samples,) or (n_samples, n_targets)\n Target values. Will be cast to X's dtype if necessary.\n\n sample_weight : float or ndarray of shape (n_samples,), default=None\n Individual weights for each sample. 
If given a float, every sample\n will have the same weight.\n\n Returns\n -------\n self : object\n Fitted estimator.\n\n Notes\n -----\n When sample_weight is provided, the selected hyperparameter may depend\n on whether we use leave-one-out cross-validation (cv=None or cv='auto')\n or another form of cross-validation, because only leave-one-out\n cross-validation takes the sample weights into account when computing\n the validation score.\n \"\"\"\n cv = self.cv\n if cv is None:\n estimator = _RidgeGCV(self.alphas, fit_intercept=self.fit_intercept, normalize=self.normalize, scoring=self.scoring, gcv_mode=self.gcv_mode, store_cv_values=self.store_cv_values, is_clf=is_classifier(self), alpha_per_target=self.alpha_per_target)\n estimator.fit(X, y, sample_weight=sample_weight)\n self.alpha_ = estimator.alpha_\n self.best_score_ = estimator.best_score_\n if self.store_cv_values:\n self.cv_values_ = estimator.cv_values_\n else:\n if self.store_cv_values:\n raise ValueError('cv!=None and store_cv_values=True are incompatible')\n if self.alpha_per_target:\n raise ValueError('cv!=None and alpha_per_target=True are incompatible')\n parameters = {'alpha': self.alphas}\n solver = 'sparse_cg' if sparse.issparse(X) else 'auto'\n model = RidgeClassifier if is_classifier(self) else Ridge\n gs = GridSearchCV(model(fit_intercept=self.fit_intercept, normalize=self.normalize, solver=solver), parameters, cv=cv, scoring=self.scoring)\n gs.fit(X, y, sample_weight=sample_weight)\n estimator = gs.best_estimator_\n self.alpha_ = gs.best_estimator_.alpha\n self.best_score_ = gs.best_score_\n self.coef_ = estimator.coef_\n self.intercept_ = estimator.intercept_\n self.n_features_in_ = estimator.n_features_in_\n if hasattr(estimator, 'feature_names_in_'):\n self.feature_names_in_ = estimator.feature_names_in_\n return self" }, { @@ -107084,7 +114595,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "classes", @@ -107094,13 +114606,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, classes):\n self.classes_ = classes" }, { @@ -107118,7 +114631,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y_predict", @@ -107128,13 +114642,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef decision_function(self, y_predict):\n return y_predict" }, { @@ -107152,7 +114667,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y_predict", @@ -107162,13 +114678,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef decision_function(self, y_predict):\n return y_predict" }, { @@ -107186,7 +114703,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y_predict", @@ -107196,15 +114714,177 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef predict(self, y_predict):\n return y_predict" }, + { + "name": "_more_tags", + "unique_name": "_more_tags", + "qname": 
"sklearn.linear_model._ridge._RidgeClassifierMixin._more_tags", + "unique_qname": "sklearn.linear_model._ridge._RidgeClassifierMixin._more_tags", + "decorators": [], + "parameters": [ + { + "name": "self", + "default_value": null, + "is_public": false, + "assigned_by": "POSITION_OR_NAME", + "docstring": { + "type": "", + "description": "" + }, + "refined_type": {} + } + ], + "results": [], + "is_public": false, + "description": "", + "docstring": null, + "source_code": "\ndef _more_tags(self):\n return {'multilabel': True}" + }, + { + "name": "_prepare_data", + "unique_name": "_prepare_data", + "qname": "sklearn.linear_model._ridge._RidgeClassifierMixin._prepare_data", + "unique_qname": "sklearn.linear_model._ridge._RidgeClassifierMixin._prepare_data", + "decorators": [], + "parameters": [ + { + "name": "self", + "default_value": null, + "is_public": false, + "assigned_by": "POSITION_OR_NAME", + "docstring": { + "type": "", + "description": "" + }, + "refined_type": {} + }, + { + "name": "X", + "default_value": null, + "is_public": false, + "assigned_by": "POSITION_OR_NAME", + "docstring": { + "type": "{ndarray, sparse matrix} of shape (n_samples, n_features)", + "description": "Training data." + }, + "refined_type": { + "kind": "EnumType", + "values": [] + } + }, + { + "name": "y", + "default_value": null, + "is_public": false, + "assigned_by": "POSITION_OR_NAME", + "docstring": { + "type": "ndarray of shape (n_samples,)", + "description": "Target values." + }, + "refined_type": {} + }, + { + "name": "sample_weight", + "default_value": null, + "is_public": false, + "assigned_by": "POSITION_OR_NAME", + "docstring": { + "type": "float or ndarray of shape (n_samples,), default=None", + "description": "Individual weights for each sample. If given a float, every sample\nwill have the same weight." + }, + "refined_type": {} + }, + { + "name": "solver", + "default_value": null, + "is_public": false, + "assigned_by": "POSITION_OR_NAME", + "docstring": { + "type": "str", + "description": "The solver used in `Ridge` to know which sparse format to support." + }, + "refined_type": {} + } + ], + "results": [], + "is_public": false, + "description": "Validate `X` and `y` and binarize `y`.", + "docstring": "Validate `X` and `y` and binarize `y`.\n\n Parameters\n ----------\n X : {ndarray, sparse matrix} of shape (n_samples, n_features)\n Training data.\n\n y : ndarray of shape (n_samples,)\n Target values.\n\n sample_weight : float or ndarray of shape (n_samples,), default=None\n Individual weights for each sample. If given a float, every sample\n will have the same weight.\n\n solver : str\n The solver used in `Ridge` to know which sparse format to support.\n\n Returns\n -------\n X : {ndarray, sparse matrix} of shape (n_samples, n_features)\n Validated training data.\n\n y : ndarray of shape (n_samples,)\n Validated target values.\n\n sample_weight : ndarray of shape (n_samples,)\n Validated sample weights.\n\n Y : ndarray of shape (n_samples, n_classes)\n The binarized version of `y`.\n ", + "source_code": "\ndef _prepare_data(self, X, y, sample_weight, solver):\n \"\"\"Validate `X` and `y` and binarize `y`.\n\n Parameters\n ----------\n X : {ndarray, sparse matrix} of shape (n_samples, n_features)\n Training data.\n\n y : ndarray of shape (n_samples,)\n Target values.\n\n sample_weight : float or ndarray of shape (n_samples,), default=None\n Individual weights for each sample. 
If given a float, every sample\n will have the same weight.\n\n solver : str\n The solver used in `Ridge` to know which sparse format to support.\n\n Returns\n -------\n X : {ndarray, sparse matrix} of shape (n_samples, n_features)\n Validated training data.\n\n y : ndarray of shape (n_samples,)\n Validated target values.\n\n sample_weight : ndarray of shape (n_samples,)\n Validated sample weights.\n\n Y : ndarray of shape (n_samples, n_classes)\n The binarized version of `y`.\n \"\"\"\n accept_sparse = _get_valid_accept_sparse(sparse.issparse(X), solver)\n (X, y) = self._validate_data(X, y, accept_sparse=accept_sparse, multi_output=True, y_numeric=False)\n self._label_binarizer = LabelBinarizer(pos_label=1, neg_label=-1)\n Y = self._label_binarizer.fit_transform(y)\n if not self._label_binarizer.y_type_.startswith('multilabel'):\n y = column_or_1d(y, warn=True)\n sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype)\n if self.class_weight:\n sample_weight = sample_weight * compute_sample_weight(self.class_weight, y)\n return X, y, sample_weight, Y" + }, + { + "name": "classes_", + "unique_name": "classes_@getter", + "qname": "sklearn.linear_model._ridge._RidgeClassifierMixin.classes_", + "unique_qname": "sklearn.linear_model._ridge._RidgeClassifierMixin.classes_@getter", + "decorators": ["property"], + "parameters": [ + { + "name": "self", + "default_value": null, + "is_public": false, + "assigned_by": "POSITION_OR_NAME", + "docstring": { + "type": "", + "description": "" + }, + "refined_type": {} + } + ], + "results": [], + "is_public": false, + "description": "Classes labels.", + "docstring": "Classes labels.", + "source_code": "\n@property\ndef classes_(self):\n \"\"\"Classes labels.\"\"\"\n return self._label_binarizer.classes_" + }, + { + "name": "predict", + "unique_name": "predict", + "qname": "sklearn.linear_model._ridge._RidgeClassifierMixin.predict", + "unique_qname": "sklearn.linear_model._ridge._RidgeClassifierMixin.predict", + "decorators": [], + "parameters": [ + { + "name": "self", + "default_value": null, + "is_public": false, + "assigned_by": "POSITION_OR_NAME", + "docstring": { + "type": "", + "description": "" + }, + "refined_type": {} + }, + { + "name": "X", + "default_value": null, + "is_public": false, + "assigned_by": "POSITION_OR_NAME", + "docstring": { + "type": "{array-like, spare matrix} of shape (n_samples, n_features)", + "description": "The data matrix for which we want to predict the targets." + }, + "refined_type": { + "kind": "EnumType", + "values": [] + } + } + ], + "results": [], + "is_public": false, + "description": "Predict class labels for samples in `X`.", + "docstring": "Predict class labels for samples in `X`.\n\n Parameters\n ----------\n X : {array-like, spare matrix} of shape (n_samples, n_features)\n The data matrix for which we want to predict the targets.\n\n Returns\n -------\n y_pred : ndarray of shape (n_samples,) or (n_samples, n_outputs)\n Vector or matrix containing the predictions. In binary and\n multiclass problems, this is a vector containing `n_samples`. In\n a multilabel problem, it returns a matrix of shape\n `(n_samples, n_outputs)`.\n ", + "source_code": "\ndef predict(self, X):\n \"\"\"Predict class labels for samples in `X`.\n\n Parameters\n ----------\n X : {array-like, spare matrix} of shape (n_samples, n_features)\n The data matrix for which we want to predict the targets.\n\n Returns\n -------\n y_pred : ndarray of shape (n_samples,) or (n_samples, n_outputs)\n Vector or matrix containing the predictions. 
In binary and\n multiclass problems, this is a vector containing `n_samples`. In\n a multilabel problem, it returns a matrix of shape\n `(n_samples, n_outputs)`.\n \"\"\"\n check_is_fitted(self, attributes=['_label_binarizer'])\n if self._label_binarizer.y_type_.startswith('multilabel'):\n scores = 2 * (self.decision_function(X) > 0) - 1\n return self._label_binarizer.inverse_transform(scores)\n return super().predict(X)" + }, { "name": "__init__", "unique_name": "__init__", @@ -107220,7 +114900,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "alphas", @@ -107230,7 +114911,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "fit_intercept", @@ -107240,7 +114922,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "normalize", @@ -107250,7 +114933,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "scoring", @@ -107260,7 +114944,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "copy_X", @@ -107270,7 +114955,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "gcv_mode", @@ -107280,7 +114966,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "store_cv_values", @@ -107290,7 +114977,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "is_clf", @@ -107300,7 +114988,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "alpha_per_target", @@ -107310,13 +114999,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, alphas=(0.1, 1.0, 10.0), *, fit_intercept=True, normalize='deprecated', scoring=None, copy_X=True, gcv_mode=None, store_cv_values=False, is_clf=False, alpha_per_target=False):\n self.alphas = np.asarray(alphas)\n self.fit_intercept = fit_intercept\n self.normalize = normalize\n self.scoring = scoring\n self.copy_X = copy_X\n self.gcv_mode = gcv_mode\n self.store_cv_values = store_cv_values\n self.is_clf = is_clf\n self.alpha_per_target = alpha_per_target" }, { @@ -107334,7 +115024,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -107344,7 +115035,8 @@ "docstring": { "type": "sparse matrix of shape (n_samples, n_features)", "description": "The preprocessed design matrix." 
- } + }, + "refined_type": {} }, { "name": "sqrt_sw", @@ -107354,13 +115046,14 @@ "docstring": { "type": "ndarray of shape (n_samples,)", "description": "square roots of sample weights" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Computes covariance matrix X^TX with possible centering.", - "docstring": "Computes covariance matrix X^TX with possible centering.\n\nParameters\n----------\nX : sparse matrix of shape (n_samples, n_features)\n The preprocessed design matrix.\n\nsqrt_sw : ndarray of shape (n_samples,)\n square roots of sample weights\n\nReturns\n-------\ncovariance : ndarray of shape (n_features, n_features)\n The covariance matrix.\nX_mean : ndarray of shape (n_feature,)\n The weighted mean of ``X`` for each feature.\n\nNotes\n-----\nSince X is sparse it has not been centered in preprocessing, but it has\nbeen scaled by sqrt(sample weights).\n\nWhen self.fit_intercept is False no centering is done.\n\nThe centered X is never actually computed because centering would break\nthe sparsity of X.", + "docstring": "Computes covariance matrix X^TX with possible centering.\n\n Parameters\n ----------\n X : sparse matrix of shape (n_samples, n_features)\n The preprocessed design matrix.\n\n sqrt_sw : ndarray of shape (n_samples,)\n square roots of sample weights\n\n Returns\n -------\n covariance : ndarray of shape (n_features, n_features)\n The covariance matrix.\n X_mean : ndarray of shape (n_feature,)\n The weighted mean of ``X`` for each feature.\n\n Notes\n -----\n Since X is sparse it has not been centered in preprocessing, but it has\n been scaled by sqrt(sample weights).\n\n When self.fit_intercept is False no centering is done.\n\n The centered X is never actually computed because centering would break\n the sparsity of X.\n ", "source_code": "\ndef _compute_covariance(self, X, sqrt_sw):\n \"\"\"Computes covariance matrix X^TX with possible centering.\n\n Parameters\n ----------\n X : sparse matrix of shape (n_samples, n_features)\n The preprocessed design matrix.\n\n sqrt_sw : ndarray of shape (n_samples,)\n square roots of sample weights\n\n Returns\n -------\n covariance : ndarray of shape (n_features, n_features)\n The covariance matrix.\n X_mean : ndarray of shape (n_feature,)\n The weighted mean of ``X`` for each feature.\n\n Notes\n -----\n Since X is sparse it has not been centered in preprocessing, but it has\n been scaled by sqrt(sample weights).\n\n When self.fit_intercept is False no centering is done.\n\n The centered X is never actually computed because centering would break\n the sparsity of X.\n \"\"\"\n if not self.fit_intercept:\n X_mean = np.zeros(X.shape[1], dtype=X.dtype)\n return safe_sparse_dot(X.T, X, dense_output=True), X_mean\n n_samples = X.shape[0]\n sample_weight_matrix = sparse.dia_matrix((sqrt_sw, 0), shape=(n_samples, n_samples))\n X_weighted = sample_weight_matrix.dot(X)\n (X_mean, _) = mean_variance_axis(X_weighted, axis=0)\n X_mean = X_mean * n_samples / sqrt_sw.dot(sqrt_sw)\n weight_sum = sqrt_sw.dot(sqrt_sw)\n return safe_sparse_dot(X.T, X, dense_output=True) - weight_sum * np.outer(X_mean, X_mean), X_mean" }, { @@ -107378,7 +115071,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -107388,6 +115082,10 @@ "docstring": { "type": "{ndarray, sparse matrix} of shape (n_samples, n_features)", "description": "The preprocessed design matrix." 
+ }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -107398,13 +115096,14 @@ "docstring": { "type": "ndarray of shape (n_samples,)", "description": "square roots of sample weights" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Computes the Gram matrix XX^T with possible centering.", - "docstring": "Computes the Gram matrix XX^T with possible centering.\n\nParameters\n----------\nX : {ndarray, sparse matrix} of shape (n_samples, n_features)\n The preprocessed design matrix.\n\nsqrt_sw : ndarray of shape (n_samples,)\n square roots of sample weights\n\nReturns\n-------\ngram : ndarray of shape (n_samples, n_samples)\n The Gram matrix.\nX_mean : ndarray of shape (n_feature,)\n The weighted mean of ``X`` for each feature.\n\nNotes\n-----\nWhen X is dense the centering has been done in preprocessing\nso the mean is 0 and we just compute XX^T.\n\nWhen X is sparse it has not been centered in preprocessing, but it has\nbeen scaled by sqrt(sample weights).\n\nWhen self.fit_intercept is False no centering is done.\n\nThe centered X is never actually computed because centering would break\nthe sparsity of X.", + "docstring": "Computes the Gram matrix XX^T with possible centering.\n\n Parameters\n ----------\n X : {ndarray, sparse matrix} of shape (n_samples, n_features)\n The preprocessed design matrix.\n\n sqrt_sw : ndarray of shape (n_samples,)\n square roots of sample weights\n\n Returns\n -------\n gram : ndarray of shape (n_samples, n_samples)\n The Gram matrix.\n X_mean : ndarray of shape (n_feature,)\n The weighted mean of ``X`` for each feature.\n\n Notes\n -----\n When X is dense the centering has been done in preprocessing\n so the mean is 0 and we just compute XX^T.\n\n When X is sparse it has not been centered in preprocessing, but it has\n been scaled by sqrt(sample weights).\n\n When self.fit_intercept is False no centering is done.\n\n The centered X is never actually computed because centering would break\n the sparsity of X.\n ", "source_code": "\ndef _compute_gram(self, X, sqrt_sw):\n \"\"\"Computes the Gram matrix XX^T with possible centering.\n\n Parameters\n ----------\n X : {ndarray, sparse matrix} of shape (n_samples, n_features)\n The preprocessed design matrix.\n\n sqrt_sw : ndarray of shape (n_samples,)\n square roots of sample weights\n\n Returns\n -------\n gram : ndarray of shape (n_samples, n_samples)\n The Gram matrix.\n X_mean : ndarray of shape (n_feature,)\n The weighted mean of ``X`` for each feature.\n\n Notes\n -----\n When X is dense the centering has been done in preprocessing\n so the mean is 0 and we just compute XX^T.\n\n When X is sparse it has not been centered in preprocessing, but it has\n been scaled by sqrt(sample weights).\n\n When self.fit_intercept is False no centering is done.\n\n The centered X is never actually computed because centering would break\n the sparsity of X.\n \"\"\"\n center = self.fit_intercept and sparse.issparse(X)\n if not center:\n X_mean = np.zeros(X.shape[1], dtype=X.dtype)\n return safe_sparse_dot(X, X.T, dense_output=True), X_mean\n n_samples = X.shape[0]\n sample_weight_matrix = sparse.dia_matrix((sqrt_sw, 0), shape=(n_samples, n_samples))\n X_weighted = sample_weight_matrix.dot(X)\n (X_mean, _) = mean_variance_axis(X_weighted, axis=0)\n X_mean *= n_samples / sqrt_sw.dot(sqrt_sw)\n X_mX = sqrt_sw[:, None] * safe_sparse_dot(X_mean, X.T, dense_output=True)\n X_mX_m = np.outer(sqrt_sw, sqrt_sw) * np.dot(X_mean, X_mean)\n return safe_sparse_dot(X, X.T, 
dense_output=True) + X_mX_m - X_mX - X_mX.T, X_mean" }, { @@ -107422,7 +115121,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "Q", @@ -107432,13 +115132,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@staticmethod\ndef _decomp_diag(v_prime, Q):\n return (v_prime * Q**2).sum(axis=-1)" }, { @@ -107456,7 +115157,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "B", @@ -107466,13 +115168,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@staticmethod\ndef _diag_dot(D, B):\n if len(B.shape) > 1:\n D = D[(slice(None), ) + (np.newaxis, ) * (len(B.shape) - 1)]\n return D * B" }, { @@ -107490,7 +115193,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -107500,7 +115204,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -107510,7 +115215,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "sqrt_sw", @@ -107520,13 +115226,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Eigendecomposition of X^T.X, used when n_samples > n_features and X is sparse.", - "docstring": "Eigendecomposition of X^T.X, used when n_samples > n_features\nand X is sparse.", + "description": "Eigendecomposition of X^T.X, used when n_samples > n_features\nand X is sparse.", + "docstring": "Eigendecomposition of X^T.X, used when n_samples > n_features\n and X is sparse.\n ", "source_code": "\ndef _eigen_decompose_covariance(self, X, y, sqrt_sw):\n \"\"\"Eigendecomposition of X^T.X, used when n_samples > n_features\n and X is sparse.\n \"\"\"\n (n_samples, n_features) = X.shape\n cov = np.empty((n_features + 1, n_features + 1), dtype=X.dtype)\n (cov[:-1, :-1], X_mean) = self._compute_covariance(X, sqrt_sw)\n if not self.fit_intercept:\n cov = cov[:-1, :-1]\n else:\n cov[-1] = 0\n cov[:, -1] = 0\n cov[-1, -1] = sqrt_sw.dot(sqrt_sw)\n nullspace_dim = max(0, n_features - n_samples)\n (eigvals, V) = linalg.eigh(cov)\n eigvals = eigvals[nullspace_dim:]\n V = V[:, nullspace_dim:]\n return X_mean, eigvals, V, X" }, { @@ -107544,7 +115251,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -107554,7 +115262,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -107564,7 +115273,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "sqrt_sw", @@ -107574,7 +115284,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -107598,7 +115309,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "alpha", @@ -107608,7 +115320,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -107618,7 +115331,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "sqrt_sw", @@ -107628,7 +115342,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X_mean", @@ -107638,7 +115353,8 @@ "docstring": { "type": "", 
"description": "" - } + }, + "refined_type": {} }, { "name": "eigvals", @@ -107648,7 +115364,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "V", @@ -107658,7 +115375,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -107668,13 +115386,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Compute dual coefficients and diagonal of G^-1.\n\nUsed when we have a decomposition of X^T.X (n_samples > n_features and X is sparse).", - "docstring": "Compute dual coefficients and diagonal of G^-1.\n\nUsed when we have a decomposition of X^T.X\n(n_samples > n_features and X is sparse).", + "description": "Compute dual coefficients and diagonal of G^-1.\n\nUsed when we have a decomposition of X^T.X\n(n_samples > n_features and X is sparse).", + "docstring": "Compute dual coefficients and diagonal of G^-1.\n\n Used when we have a decomposition of X^T.X\n (n_samples > n_features and X is sparse).\n ", "source_code": "\ndef _solve_eigen_covariance(self, alpha, y, sqrt_sw, X_mean, eigvals, V, X):\n \"\"\"Compute dual coefficients and diagonal of G^-1.\n\n Used when we have a decomposition of X^T.X\n (n_samples > n_features and X is sparse).\n \"\"\"\n if self.fit_intercept:\n return self._solve_eigen_covariance_intercept(alpha, y, sqrt_sw, X_mean, eigvals, V, X)\n return self._solve_eigen_covariance_no_intercept(alpha, y, sqrt_sw, X_mean, eigvals, V, X)" }, { @@ -107692,7 +115411,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "alpha", @@ -107702,7 +115422,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -107712,7 +115433,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "sqrt_sw", @@ -107722,7 +115444,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X_mean", @@ -107732,7 +115455,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "eigvals", @@ -107742,7 +115466,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "V", @@ -107752,7 +115477,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -107762,13 +115488,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Compute dual coefficients and diagonal of G^-1.\n\nUsed when we have a decomposition of X^T.X (n_samples > n_features and X is sparse), and we are fitting an intercept.", - "docstring": "Compute dual coefficients and diagonal of G^-1.\n\nUsed when we have a decomposition of X^T.X\n(n_samples > n_features and X is sparse),\nand we are fitting an intercept.", + "description": "Compute dual coefficients and diagonal of G^-1.\n\nUsed when we have a decomposition of X^T.X\n(n_samples > n_features and X is sparse),\nand we are fitting an intercept.", + "docstring": "Compute dual coefficients and diagonal of G^-1.\n\n Used when we have a decomposition of X^T.X\n (n_samples > n_features and X is sparse),\n and we are fitting an intercept.\n ", "source_code": "\ndef _solve_eigen_covariance_intercept(self, alpha, y, sqrt_sw, X_mean, eigvals, V, X):\n \"\"\"Compute dual coefficients and diagonal of G^-1.\n\n Used when we have a decomposition of X^T.X\n (n_samples > n_features and X is 
sparse),\n and we are fitting an intercept.\n \"\"\"\n intercept_sv = np.zeros(V.shape[0])\n intercept_sv[-1] = 1\n intercept_dim = _find_smallest_angle(intercept_sv, V)\n w = 1 / (eigvals + alpha)\n w[intercept_dim] = 1 / eigvals[intercept_dim]\n A = (V * w).dot(V.T)\n X_op = _X_CenterStackOp(X, X_mean, sqrt_sw)\n AXy = A.dot(X_op.T.dot(y))\n y_hat = X_op.dot(AXy)\n hat_diag = self._sparse_multidot_diag(X, A, X_mean, sqrt_sw)\n if len(y.shape) != 1:\n hat_diag = hat_diag[:, np.newaxis]\n return (1 - hat_diag) / alpha, (y - y_hat) / alpha" }, { @@ -107786,7 +115513,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "alpha", @@ -107796,7 +115524,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -107806,7 +115535,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "sqrt_sw", @@ -107816,7 +115546,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X_mean", @@ -107826,7 +115557,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "eigvals", @@ -107836,7 +115568,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "V", @@ -107846,7 +115579,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -107856,13 +115590,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Compute dual coefficients and diagonal of G^-1.\n\nUsed when we have a decomposition of X^T.X (n_samples > n_features and X is sparse), and not fitting an intercept.", - "docstring": "Compute dual coefficients and diagonal of G^-1.\n\nUsed when we have a decomposition of X^T.X\n(n_samples > n_features and X is sparse), and not fitting an intercept.", + "description": "Compute dual coefficients and diagonal of G^-1.\n\nUsed when we have a decomposition of X^T.X\n(n_samples > n_features and X is sparse), and not fitting an intercept.", + "docstring": "Compute dual coefficients and diagonal of G^-1.\n\n Used when we have a decomposition of X^T.X\n (n_samples > n_features and X is sparse), and not fitting an intercept.\n ", "source_code": "\ndef _solve_eigen_covariance_no_intercept(self, alpha, y, sqrt_sw, X_mean, eigvals, V, X):\n \"\"\"Compute dual coefficients and diagonal of G^-1.\n\n Used when we have a decomposition of X^T.X\n (n_samples > n_features and X is sparse), and not fitting an intercept.\n \"\"\"\n w = 1 / (eigvals + alpha)\n A = (V * w).dot(V.T)\n AXy = A.dot(safe_sparse_dot(X.T, y, dense_output=True))\n y_hat = safe_sparse_dot(X, AXy, dense_output=True)\n hat_diag = self._sparse_multidot_diag(X, A, X_mean, sqrt_sw)\n if len(y.shape) != 1:\n hat_diag = hat_diag[:, np.newaxis]\n return (1 - hat_diag) / alpha, (y - y_hat) / alpha" }, { @@ -107880,7 +115615,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "alpha", @@ -107890,7 +115626,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -107900,7 +115637,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "sqrt_sw", @@ -107910,7 +115648,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X_mean", @@ -107920,7 +115659,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "eigvals", @@ 
-107930,7 +115670,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "Q", @@ -107940,7 +115681,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "QT_y", @@ -107950,13 +115692,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Compute dual coefficients and diagonal of G^-1.\n\nUsed when we have a decomposition of X.X^T (n_samples <= n_features).", - "docstring": "Compute dual coefficients and diagonal of G^-1.\n\nUsed when we have a decomposition of X.X^T (n_samples <= n_features).", + "docstring": "Compute dual coefficients and diagonal of G^-1.\n\n Used when we have a decomposition of X.X^T (n_samples <= n_features).\n ", "source_code": "\ndef _solve_eigen_gram(self, alpha, y, sqrt_sw, X_mean, eigvals, Q, QT_y):\n \"\"\"Compute dual coefficients and diagonal of G^-1.\n\n Used when we have a decomposition of X.X^T (n_samples <= n_features).\n \"\"\"\n w = 1.0 / (eigvals + alpha)\n if self.fit_intercept:\n normalized_sw = sqrt_sw / np.linalg.norm(sqrt_sw)\n intercept_dim = _find_smallest_angle(normalized_sw, Q)\n w[intercept_dim] = 0\n c = np.dot(Q, self._diag_dot(w, QT_y))\n G_inverse_diag = self._decomp_diag(w, Q)\n if len(y.shape) != 1:\n G_inverse_diag = G_inverse_diag[:, np.newaxis]\n return G_inverse_diag, c" }, { @@ -107974,7 +115717,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "alpha", @@ -107984,7 +115728,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -107994,7 +115739,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "sqrt_sw", @@ -108004,7 +115750,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X_mean", @@ -108014,7 +115761,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "singvals_sq", @@ -108024,7 +115772,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "U", @@ -108034,7 +115783,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "UT_y", @@ -108044,13 +115794,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Compute dual coefficients and diagonal of G^-1.\n\nUsed when we have an SVD decomposition of X (n_samples > n_features and X is dense).", - "docstring": "Compute dual coefficients and diagonal of G^-1.\n\nUsed when we have an SVD decomposition of X\n(n_samples > n_features and X is dense).", + "description": "Compute dual coefficients and diagonal of G^-1.\n\nUsed when we have an SVD decomposition of X\n(n_samples > n_features and X is dense).", + "docstring": "Compute dual coefficients and diagonal of G^-1.\n\n Used when we have an SVD decomposition of X\n (n_samples > n_features and X is dense).\n ", "source_code": "\ndef _solve_svd_design_matrix(self, alpha, y, sqrt_sw, X_mean, singvals_sq, U, UT_y):\n \"\"\"Compute dual coefficients and diagonal of G^-1.\n\n Used when we have an SVD decomposition of X\n (n_samples > n_features and X is dense).\n \"\"\"\n w = (singvals_sq + alpha)**(-1) - alpha**(-1)\n if self.fit_intercept:\n normalized_sw = sqrt_sw / np.linalg.norm(sqrt_sw)\n intercept_dim = _find_smallest_angle(normalized_sw, U)\n w[intercept_dim] = -alpha**(-1)\n c = np.dot(U, self._diag_dot(w, 
UT_y)) + alpha**(-1) * y\n G_inverse_diag = self._decomp_diag(w, U) + alpha**(-1)\n if len(y.shape) != 1:\n G_inverse_diag = G_inverse_diag[:, np.newaxis]\n return G_inverse_diag, c" }, { @@ -108068,7 +115819,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -108078,7 +115830,8 @@ "docstring": { "type": "sparse matrix of shape (n_samples, n_features)", "description": "" - } + }, + "refined_type": {} }, { "name": "A", @@ -108088,7 +115841,8 @@ "docstring": { "type": "ndarray of shape (n_features, n_features)", "description": "" - } + }, + "refined_type": {} }, { "name": "X_mean", @@ -108098,7 +115852,8 @@ "docstring": { "type": "ndarray of shape (n_features,)", "description": "" - } + }, + "refined_type": {} }, { "name": "sqrt_sw", @@ -108108,13 +115863,14 @@ "docstring": { "type": "ndarray of shape (n_features,)", "description": "square roots of sample weights" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Compute the diagonal of (X - X_mean).dot(A).dot((X - X_mean).T) without explicitly centering X nor computing X.dot(A) when X is sparse.", - "docstring": "Compute the diagonal of (X - X_mean).dot(A).dot((X - X_mean).T)\nwithout explicitly centering X nor computing X.dot(A)\nwhen X is sparse.\n\nParameters\n----------\nX : sparse matrix of shape (n_samples, n_features)\n\nA : ndarray of shape (n_features, n_features)\n\nX_mean : ndarray of shape (n_features,)\n\nsqrt_sw : ndarray of shape (n_features,)\n square roots of sample weights\n\nReturns\n-------\ndiag : np.ndarray, shape (n_samples,)\n The computed diagonal.", + "description": "Compute the diagonal of (X - X_mean).dot(A).dot((X - X_mean).T)\nwithout explicitly centering X nor computing X.dot(A)\nwhen X is sparse.", + "docstring": "Compute the diagonal of (X - X_mean).dot(A).dot((X - X_mean).T)\n without explicitly centering X nor computing X.dot(A)\n when X is sparse.\n\n Parameters\n ----------\n X : sparse matrix of shape (n_samples, n_features)\n\n A : ndarray of shape (n_features, n_features)\n\n X_mean : ndarray of shape (n_features,)\n\n sqrt_sw : ndarray of shape (n_features,)\n square roots of sample weights\n\n Returns\n -------\n diag : np.ndarray, shape (n_samples,)\n The computed diagonal.\n ", "source_code": "\ndef _sparse_multidot_diag(self, X, A, X_mean, sqrt_sw):\n \"\"\"Compute the diagonal of (X - X_mean).dot(A).dot((X - X_mean).T)\n without explicitly centering X nor computing X.dot(A)\n when X is sparse.\n\n Parameters\n ----------\n X : sparse matrix of shape (n_samples, n_features)\n\n A : ndarray of shape (n_features, n_features)\n\n X_mean : ndarray of shape (n_features,)\n\n sqrt_sw : ndarray of shape (n_features,)\n square roots of sample weights\n\n Returns\n -------\n diag : np.ndarray, shape (n_samples,)\n The computed diagonal.\n \"\"\"\n intercept_col = scale = sqrt_sw\n batch_size = X.shape[1]\n diag = np.empty(X.shape[0], dtype=X.dtype)\n for start in range(0, X.shape[0], batch_size):\n batch = slice(start, min(X.shape[0], start + batch_size), 1)\n X_batch = np.empty((X[batch].shape[0], X.shape[1] + self.fit_intercept), dtype=X.dtype)\n if self.fit_intercept:\n X_batch[:, :-1] = X[batch].A - X_mean * scale[batch][:, None]\n X_batch[:, -1] = intercept_col[batch]\n else:\n X_batch = X[batch].A\n diag[batch] = (X_batch.dot(A) * X_batch).sum(axis=1)\n return diag" }, { @@ -108132,7 +115888,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -108142,7 +115899,8 @@ 
"docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -108152,7 +115910,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "sqrt_sw", @@ -108162,13 +115921,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _svd_decompose_design_matrix(self, X, y, sqrt_sw):\n X_mean = np.zeros(X.shape[1], dtype=X.dtype)\n if self.fit_intercept:\n intercept_column = sqrt_sw[:, None]\n X = np.hstack((X, intercept_column))\n (U, singvals, _) = linalg.svd(X, full_matrices=0)\n singvals_sq = singvals**2\n UT_y = np.dot(U.T, y)\n return X_mean, singvals_sq, U, UT_y" }, { @@ -108186,7 +115946,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -108196,6 +115957,10 @@ "docstring": { "type": "{ndarray, sparse matrix} of shape (n_samples, n_features)", "description": "Training data. Will be cast to float64 if necessary." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -108206,7 +115971,8 @@ "docstring": { "type": "ndarray of shape (n_samples,) or (n_samples, n_targets)", "description": "Target values. Will be cast to float64 if necessary." - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -108216,13 +115982,14 @@ "docstring": { "type": "float or ndarray of shape (n_samples,), default=None", "description": "Individual weights for each sample. If given a float, every sample\nwill have the same weight." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Fit Ridge regression model with gcv.", - "docstring": "Fit Ridge regression model with gcv.\n\nParameters\n----------\nX : {ndarray, sparse matrix} of shape (n_samples, n_features)\n Training data. Will be cast to float64 if necessary.\n\ny : ndarray of shape (n_samples,) or (n_samples, n_targets)\n Target values. Will be cast to float64 if necessary.\n\nsample_weight : float or ndarray of shape (n_samples,), default=None\n Individual weights for each sample. If given a float, every sample\n will have the same weight.\n\nReturns\n-------\nself : object", + "docstring": "Fit Ridge regression model with gcv.\n\n Parameters\n ----------\n X : {ndarray, sparse matrix} of shape (n_samples, n_features)\n Training data. Will be cast to float64 if necessary.\n\n y : ndarray of shape (n_samples,) or (n_samples, n_targets)\n Target values. Will be cast to float64 if necessary.\n\n sample_weight : float or ndarray of shape (n_samples,), default=None\n Individual weights for each sample. If given a float, every sample\n will have the same weight.\n\n Returns\n -------\n self : object\n ", "source_code": "\ndef fit(self, X, y, sample_weight=None):\n \"\"\"Fit Ridge regression model with gcv.\n\n Parameters\n ----------\n X : {ndarray, sparse matrix} of shape (n_samples, n_features)\n Training data. Will be cast to float64 if necessary.\n\n y : ndarray of shape (n_samples,) or (n_samples, n_targets)\n Target values. Will be cast to float64 if necessary.\n\n sample_weight : float or ndarray of shape (n_samples,), default=None\n Individual weights for each sample. 
If given a float, every sample\n will have the same weight.\n\n Returns\n -------\n self : object\n \"\"\"\n _normalize = _deprecate_normalize(self.normalize, default=False, estimator_name=self.__class__.__name__)\n (X, y) = self._validate_data(X, y, accept_sparse=['csr', 'csc', 'coo'], dtype=[np.float64], multi_output=True, y_numeric=True)\n assert not (self.is_clf and self.alpha_per_target)\n if sample_weight is not None:\n sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype)\n if np.any(self.alphas <= 0):\n raise ValueError('alphas must be strictly positive. Got {} containing some negative or null value instead.'.format(self.alphas))\n (X, y, X_offset, y_offset, X_scale) = LinearModel._preprocess_data(X, y, self.fit_intercept, _normalize, self.copy_X, sample_weight=sample_weight)\n gcv_mode = _check_gcv_mode(X, self.gcv_mode)\n if gcv_mode == 'eigen':\n decompose = self._eigen_decompose_gram\n solve = self._solve_eigen_gram\n elif gcv_mode == 'svd':\n if sparse.issparse(X):\n decompose = self._eigen_decompose_covariance\n solve = self._solve_eigen_covariance\n else:\n decompose = self._svd_decompose_design_matrix\n solve = self._solve_svd_design_matrix\n n_samples = X.shape[0]\n if sample_weight is not None:\n (X, y) = _rescale_data(X, y, sample_weight)\n sqrt_sw = np.sqrt(sample_weight)\n else:\n sqrt_sw = np.ones(n_samples, dtype=X.dtype)\n (X_mean, *decomposition) = decompose(X, y, sqrt_sw)\n scorer = check_scoring(self, scoring=self.scoring, allow_none=True)\n error = scorer is None\n n_y = 1 if len(y.shape) == 1 else y.shape[1]\n n_alphas = 1 if np.ndim(self.alphas) == 0 else len(self.alphas)\n if self.store_cv_values:\n self.cv_values_ = np.empty((n_samples * n_y, n_alphas), dtype=X.dtype)\n (best_coef, best_score, best_alpha) = (None, None, None)\n for (i, alpha) in enumerate(np.atleast_1d(self.alphas)):\n (G_inverse_diag, c) = solve(float(alpha), y, sqrt_sw, X_mean, *decomposition)\n if error:\n squared_errors = (c / G_inverse_diag)**2\n if self.alpha_per_target:\n alpha_score = -squared_errors.mean(axis=0)\n else:\n alpha_score = -squared_errors.mean()\n if self.store_cv_values:\n self.cv_values_[:, i] = squared_errors.ravel()\n else:\n predictions = y - c / G_inverse_diag\n if self.store_cv_values:\n self.cv_values_[:, i] = predictions.ravel()\n if self.is_clf:\n identity_estimator = _IdentityClassifier(classes=np.arange(n_y))\n alpha_score = scorer(identity_estimator, predictions, y.argmax(axis=1))\n else:\n identity_estimator = _IdentityRegressor()\n if self.alpha_per_target:\n alpha_score = np.array([scorer(identity_estimator, predictions[:, j], y[:, j]) for j in range(n_y)])\n else:\n alpha_score = scorer(identity_estimator, predictions.ravel(), y.ravel())\n if best_score is None:\n if self.alpha_per_target and n_y > 1:\n best_coef = c\n best_score = np.atleast_1d(alpha_score)\n best_alpha = np.full(n_y, alpha)\n else:\n best_coef = c\n best_score = alpha_score\n best_alpha = alpha\n elif self.alpha_per_target and n_y > 1:\n to_update = alpha_score > best_score\n best_coef[:, to_update] = c[:, to_update]\n best_score[to_update] = alpha_score[to_update]\n best_alpha[to_update] = alpha\n elif alpha_score > best_score:\n (best_coef, best_score, best_alpha) = (c, alpha_score, alpha)\n self.alpha_ = best_alpha\n self.best_score_ = best_score\n self.dual_coef_ = best_coef\n self.coef_ = safe_sparse_dot(self.dual_coef_.T, X)\n X_offset += X_mean * X_scale\n self._set_intercept(X_offset, y_offset, X_scale)\n if self.store_cv_values:\n if len(y.shape) == 1:\n 
cv_values_shape = (n_samples, n_alphas)\n else:\n cv_values_shape = (n_samples, n_y, n_alphas)\n self.cv_values_ = self.cv_values_.reshape(cv_values_shape)\n return self" }, { @@ -108240,7 +116007,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -108250,7 +116018,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X_mean", @@ -108260,7 +116029,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "sqrt_sw", @@ -108270,13 +116040,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, X, X_mean, sqrt_sw):\n (n_samples, n_features) = X.shape\n super().__init__(X.dtype, (n_features + 1, n_samples))\n self.X = X\n self.X_mean = X_mean\n self.sqrt_sw = sqrt_sw" }, { @@ -108294,7 +116065,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "v", @@ -108304,13 +116076,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _matmat(self, v):\n n_features = self.shape[0]\n res = np.empty((n_features, v.shape[1]), dtype=self.X.dtype)\n res[:-1] = safe_sparse_dot(self.X.T, v, dense_output=True) - self.X_mean[:, None] * self.sqrt_sw.dot(v)\n res[-1] = np.dot(self.sqrt_sw, v)\n return res" }, { @@ -108328,7 +116101,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "v", @@ -108338,13 +116112,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _matvec(self, v):\n v = v.ravel()\n n_features = self.shape[0]\n res = np.empty(n_features, dtype=self.X.dtype)\n res[:-1] = safe_sparse_dot(self.X.T, v, dense_output=True) - self.X_mean * self.sqrt_sw.dot(v)\n res[-1] = np.dot(v, self.sqrt_sw)\n return res" }, { @@ -108362,7 +116137,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -108372,7 +116148,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X_mean", @@ -108382,7 +116159,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "sqrt_sw", @@ -108392,13 +116170,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, X, X_mean, sqrt_sw):\n (n_samples, n_features) = X.shape\n super().__init__(X.dtype, (n_samples, n_features + 1))\n self.X = X\n self.X_mean = X_mean\n self.sqrt_sw = sqrt_sw" }, { @@ -108416,7 +116195,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "v", @@ -108426,13 +116206,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _matmat(self, v):\n return safe_sparse_dot(self.X, v[:-1], dense_output=True) - self.sqrt_sw[:, None] * self.X_mean.dot(v[:-1]) + v[-1] * self.sqrt_sw[:, None]" }, { @@ -108450,7 +116231,8 @@ "docstring": { "type": "", "description": "" - 
} + }, + "refined_type": {} }, { "name": "v", @@ -108460,13 +116242,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _matvec(self, v):\n v = v.ravel()\n return safe_sparse_dot(self.X, v[:-1], dense_output=True) - self.sqrt_sw * self.X_mean.dot(v[:-1]) + v[-1] * self.sqrt_sw" }, { @@ -108484,13 +116267,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _transpose(self):\n return _XT_CenterStackOp(self.X, self.X_mean, self.sqrt_sw)" }, { @@ -108508,7 +116292,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "gcv_mode", @@ -108518,13 +116303,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _check_gcv_mode(X, gcv_mode):\n possible_gcv_modes = [None, 'auto', 'svd', 'eigen']\n if gcv_mode not in possible_gcv_modes:\n raise ValueError(\"Unknown value for 'gcv_mode'. Got {} instead of one of {}\".format(gcv_mode, possible_gcv_modes))\n if gcv_mode in ['eigen', 'svd']:\n return gcv_mode\n if X.shape[0] > X.shape[1]:\n return 'svd'\n return 'eigen'" }, { @@ -108542,7 +116328,8 @@ "docstring": { "type": "ndarray of shape (n_samples,)", "description": "Normalized query vector." - } + }, + "refined_type": {} }, { "name": "vectors", @@ -108552,13 +116339,14 @@ "docstring": { "type": "ndarray of shape (n_samples, n_features)", "description": "Vectors to which we compare query, as columns. Must be normalized." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Find the column of vectors that is most aligned with the query.\n\nBoth query and the columns of vectors must have their l2 norm equal to 1.", - "docstring": "Find the column of vectors that is most aligned with the query.\n\nBoth query and the columns of vectors must have their l2 norm equal to 1.\n\nParameters\n----------\nquery : ndarray of shape (n_samples,)\n Normalized query vector.\n\nvectors : ndarray of shape (n_samples, n_features)\n Vectors to which we compare query, as columns. Must be normalized.", + "docstring": "Find the column of vectors that is most aligned with the query.\n\n Both query and the columns of vectors must have their l2 norm equal to 1.\n\n Parameters\n ----------\n query : ndarray of shape (n_samples,)\n Normalized query vector.\n\n vectors : ndarray of shape (n_samples, n_features)\n Vectors to which we compare query, as columns. Must be normalized.\n ", "source_code": "\ndef _find_smallest_angle(query, vectors):\n \"\"\"Find the column of vectors that is most aligned with the query.\n\n Both query and the columns of vectors must have their l2 norm equal to 1.\n\n Parameters\n ----------\n query : ndarray of shape (n_samples,)\n Normalized query vector.\n\n vectors : ndarray of shape (n_samples, n_features)\n Vectors to which we compare query, as columns. 
Must be normalized.\n \"\"\"\n abs_cosine = np.abs(query.dot(vectors))\n index = np.argmax(abs_cosine)\n return index" }, { @@ -108576,7 +116364,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "solver", @@ -108586,13 +116375,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _get_valid_accept_sparse(is_X_sparse, solver):\n if is_X_sparse and solver in ['auto', 'sag', 'saga']:\n return 'csr'\n else:\n return ['csr', 'csc', 'coo']" }, { @@ -108610,7 +116400,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -108620,7 +116411,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "alpha", @@ -108630,7 +116422,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -108640,7 +116433,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "solver", @@ -108650,7 +116444,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "max_iter", @@ -108660,7 +116455,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "tol", @@ -108670,7 +116466,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "verbose", @@ -108680,7 +116477,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "positive", @@ -108690,7 +116488,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "random_state", @@ -108700,7 +116499,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "return_n_iter", @@ -108710,7 +116510,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "return_intercept", @@ -108720,7 +116521,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X_scale", @@ -108730,7 +116532,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X_offset", @@ -108740,7 +116543,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "check_input", @@ -108750,13 +116554,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _ridge_regression(X, y, alpha, sample_weight=None, solver='auto', max_iter=None, tol=0.001, verbose=0, positive=False, random_state=None, return_n_iter=False, return_intercept=False, X_scale=None, X_offset=None, check_input=True):\n has_sw = sample_weight is not None\n if solver == 'auto':\n if positive:\n solver = 'lbfgs'\n elif return_intercept:\n solver = 'sag'\n elif not sparse.issparse(X):\n solver = 'cholesky'\n else:\n solver = 'sparse_cg'\n if solver not in ('sparse_cg', 'cholesky', 'svd', 'lsqr', 'sag', 'saga', 'lbfgs'):\n raise ValueError(\"Known solvers are 'sparse_cg', 'cholesky', 'svd' 'lsqr', 'sag', 'saga' or 'lbfgs'. Got %s.\" % solver)\n if positive and solver != 'lbfgs':\n raise ValueError(f\"When positive=True, only 'lbfgs' solver can be used. 
Please change solver {solver} to 'lbfgs' or set positive=False.\")\n if solver == 'lbfgs' and not positive:\n raise ValueError(\"'lbfgs' solver can be used only when positive=True. Please use another solver.\")\n if return_intercept and solver != 'sag':\n raise ValueError(\"In Ridge, only 'sag' solver can directly fit the intercept. Please change solver to 'sag' or set return_intercept=False.\")\n if check_input:\n _dtype = [np.float64, np.float32]\n _accept_sparse = _get_valid_accept_sparse(sparse.issparse(X), solver)\n X = check_array(X, accept_sparse=_accept_sparse, dtype=_dtype, order='C')\n y = check_array(y, dtype=X.dtype, ensure_2d=False, order=None)\n check_consistent_length(X, y)\n (n_samples, n_features) = X.shape\n if y.ndim > 2:\n raise ValueError('Target y has the wrong shape %s' % str(y.shape))\n ravel = False\n if y.ndim == 1:\n y = y.reshape(-1, 1)\n ravel = True\n (n_samples_, n_targets) = y.shape\n if n_samples != n_samples_:\n raise ValueError('Number of samples in X and y does not correspond: %d != %d' % (n_samples, n_samples_))\n if has_sw:\n sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype)\n if solver not in ['sag', 'saga']:\n (X, y) = _rescale_data(X, y, sample_weight)\n alpha = np.asarray(alpha, dtype=X.dtype).ravel()\n if alpha.size not in [1, n_targets]:\n raise ValueError('Number of targets and number of penalties do not correspond: %d != %d' % (alpha.size, n_targets))\n if alpha.size == 1 and n_targets > 1:\n alpha = np.repeat(alpha, n_targets)\n n_iter = None\n if solver == 'sparse_cg':\n coef = _solve_sparse_cg(X, y, alpha, max_iter=max_iter, tol=tol, verbose=verbose, X_offset=X_offset, X_scale=X_scale)\n elif solver == 'lsqr':\n (coef, n_iter) = _solve_lsqr(X, y, alpha, max_iter, tol)\n elif solver == 'cholesky':\n if n_features > n_samples:\n K = safe_sparse_dot(X, X.T, dense_output=True)\n try:\n dual_coef = _solve_cholesky_kernel(K, y, alpha)\n coef = safe_sparse_dot(X.T, dual_coef, dense_output=True).T\n except linalg.LinAlgError:\n solver = 'svd'\n else:\n try:\n coef = _solve_cholesky(X, y, alpha)\n except linalg.LinAlgError:\n solver = 'svd'\n elif solver in ['sag', 'saga']:\n max_squared_sum = row_norms(X, squared=True).max()\n coef = np.empty((y.shape[1], n_features), dtype=X.dtype)\n n_iter = np.empty(y.shape[1], dtype=np.int32)\n intercept = np.zeros((y.shape[1], ), dtype=X.dtype)\n for (i, (alpha_i, target)) in enumerate(zip(alpha, y.T)):\n init = {'coef': np.zeros((n_features + int(return_intercept), 1), dtype=X.dtype)}\n (coef_, n_iter_, _) = sag_solver(X, target.ravel(), sample_weight, 'squared', alpha_i, 0, max_iter, tol, verbose, random_state, False, max_squared_sum, init, is_saga=solver == 'saga')\n if return_intercept:\n coef[i] = coef_[:-1]\n intercept[i] = coef_[-1]\n else:\n coef[i] = coef_\n n_iter[i] = n_iter_\n if intercept.shape[0] == 1:\n intercept = intercept[0]\n coef = np.asarray(coef)\n elif solver == 'lbfgs':\n coef = _solve_lbfgs(X, y, alpha, positive=positive, tol=tol, max_iter=max_iter, X_offset=X_offset, X_scale=X_scale)\n if solver == 'svd':\n if sparse.issparse(X):\n raise TypeError('SVD solver does not support sparse inputs currently')\n coef = _solve_svd(X, y, alpha)\n if ravel:\n coef = coef.ravel()\n if return_n_iter and return_intercept:\n return coef, n_iter, intercept\n elif return_intercept:\n return coef, intercept\n elif return_n_iter:\n return coef, n_iter\n else:\n return coef" }, { @@ -108774,7 +116579,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { 
"name": "y", @@ -108784,7 +116590,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "alpha", @@ -108794,13 +116601,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _solve_cholesky(X, y, alpha):\n n_features = X.shape[1]\n n_targets = y.shape[1]\n A = safe_sparse_dot(X.T, X, dense_output=True)\n Xy = safe_sparse_dot(X.T, y, dense_output=True)\n one_alpha = np.array_equal(alpha, len(alpha) * [alpha[0]])\n if one_alpha:\n A.flat[::n_features + 1] += alpha[0]\n return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T\n else:\n coefs = np.empty([n_targets, n_features], dtype=X.dtype)\n for (coef, target, current_alpha) in zip(coefs, Xy.T, alpha):\n A.flat[::n_features + 1] += current_alpha\n coef[:] = linalg.solve(A, target, sym_pos=True, overwrite_a=False).ravel()\n A.flat[::n_features + 1] -= current_alpha\n return coefs" }, { @@ -108818,7 +116626,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -108828,7 +116637,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "alpha", @@ -108838,7 +116648,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -108848,7 +116659,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "copy", @@ -108858,13 +116670,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _solve_cholesky_kernel(K, y, alpha, sample_weight=None, copy=False):\n n_samples = K.shape[0]\n n_targets = y.shape[1]\n if copy:\n K = K.copy()\n alpha = np.atleast_1d(alpha)\n one_alpha = (alpha == alpha[0]).all()\n has_sw = isinstance(sample_weight, np.ndarray) or sample_weight not in [1.0, None]\n if has_sw:\n sw = np.sqrt(np.atleast_1d(sample_weight))\n y = y * sw[:, np.newaxis]\n K *= np.outer(sw, sw)\n if one_alpha:\n K.flat[::n_samples + 1] += alpha[0]\n try:\n dual_coef = linalg.solve(K, y, sym_pos=True, overwrite_a=False)\n except np.linalg.LinAlgError:\n warnings.warn('Singular matrix in solving dual problem. 
Using least-squares solution instead.')\n dual_coef = linalg.lstsq(K, y)[0]\n K.flat[::n_samples + 1] -= alpha[0]\n if has_sw:\n dual_coef *= sw[:, np.newaxis]\n return dual_coef\n else:\n dual_coefs = np.empty([n_targets, n_samples], K.dtype)\n for (dual_coef, target, current_alpha) in zip(dual_coefs, y.T, alpha):\n K.flat[::n_samples + 1] += current_alpha\n dual_coef[:] = linalg.solve(K, target, sym_pos=True, overwrite_a=False).ravel()\n K.flat[::n_samples + 1] -= current_alpha\n if has_sw:\n dual_coefs *= sw[np.newaxis, :]\n return dual_coefs.T" }, { @@ -108882,7 +116695,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -108892,7 +116706,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "alpha", @@ -108902,7 +116717,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "positive", @@ -108912,7 +116728,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "max_iter", @@ -108922,7 +116739,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "tol", @@ -108932,7 +116750,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X_offset", @@ -108942,7 +116761,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X_scale", @@ -108952,13 +116772,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Solve ridge regression with LBFGS.\n\nThe main purpose is fitting with forcing coefficients to be positive. For unconstrained ridge regression, there are faster dedicated solver methods. Note that with positive bounds on the coefficients, LBFGS seems faster than scipy.optimize.lsq_linear.", - "docstring": "Solve ridge regression with LBFGS.\n\nThe main purpose is fitting with forcing coefficients to be positive.\nFor unconstrained ridge regression, there are faster dedicated solver methods.\nNote that with positive bounds on the coefficients, LBFGS seems faster\nthan scipy.optimize.lsq_linear.", + "description": "Solve ridge regression with LBFGS.\n\nThe main purpose is fitting with forcing coefficients to be positive.\nFor unconstrained ridge regression, there are faster dedicated solver methods.\nNote that with positive bounds on the coefficients, LBFGS seems faster\nthan scipy.optimize.lsq_linear.", + "docstring": "Solve ridge regression with LBFGS.\n\n The main purpose is fitting with forcing coefficients to be positive.\n For unconstrained ridge regression, there are faster dedicated solver methods.\n Note that with positive bounds on the coefficients, LBFGS seems faster\n than scipy.optimize.lsq_linear.\n ", "source_code": "\ndef _solve_lbfgs(X, y, alpha, positive=True, max_iter=None, tol=0.001, X_offset=None, X_scale=None):\n \"\"\"Solve ridge regression with LBFGS.\n\n The main purpose is fitting with forcing coefficients to be positive.\n For unconstrained ridge regression, there are faster dedicated solver methods.\n Note that with positive bounds on the coefficients, LBFGS seems faster\n than scipy.optimize.lsq_linear.\n \"\"\"\n (n_samples, n_features) = X.shape\n options = {}\n if max_iter is not None:\n options['maxiter'] = max_iter\n config = {'method': 'L-BFGS-B', 'tol': tol, 'jac': True, 'options': options}\n if positive:\n config['bounds'] = [(0, np.inf)] * n_features\n if X_offset is not None and X_scale is not 
None:\n X_offset_scale = X_offset / X_scale\n else:\n X_offset_scale = None\n coefs = np.empty((y.shape[1], n_features), dtype=X.dtype)\n for i in range(y.shape[1]):\n x0 = np.zeros((n_features, ))\n y_column = y[:, i]\n \n def func(w):\n residual = X.dot(w) - y_column\n if X_offset_scale is not None:\n residual -= w.dot(X_offset_scale)\n f = 0.5 * residual.dot(residual) + 0.5 * alpha[i] * w.dot(w)\n grad = X.T @ residual + alpha[i] * w\n if X_offset_scale is not None:\n grad -= X_offset_scale * np.sum(residual)\n return f, grad\n result = optimize.minimize(func, x0, **config)\n if not result['success']:\n warnings.warn(f'The lbfgs solver did not converge. Try increasing max_iter or tol. Currently: max_iter={max_iter} and tol={tol}', ConvergenceWarning)\n coefs[i] = result['x']\n return coefs" }, { @@ -108976,7 +116797,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -108986,7 +116808,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "alpha", @@ -108996,7 +116819,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "max_iter", @@ -109006,7 +116830,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "tol", @@ -109016,13 +116841,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _solve_lsqr(X, y, alpha, max_iter=None, tol=0.001):\n (n_samples, n_features) = X.shape\n coefs = np.empty((y.shape[1], n_features), dtype=X.dtype)\n n_iter = np.empty(y.shape[1], dtype=np.int32)\n sqrt_alpha = np.sqrt(alpha)\n for i in range(y.shape[1]):\n y_column = y[:, i]\n info = sp_linalg.lsqr(X, y_column, damp=sqrt_alpha[i], atol=tol, btol=tol, iter_lim=max_iter)\n coefs[i] = info[0]\n n_iter[i] = info[2]\n return coefs, n_iter" }, { @@ -109040,7 +116866,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -109050,7 +116877,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "alpha", @@ -109060,7 +116888,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "max_iter", @@ -109070,7 +116899,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "tol", @@ -109080,7 +116910,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "verbose", @@ -109090,7 +116921,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X_offset", @@ -109100,7 +116932,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X_scale", @@ -109110,13 +116943,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _solve_sparse_cg(X, y, alpha, max_iter=None, tol=0.001, verbose=0, X_offset=None, X_scale=None):\n \n def _get_rescaled_operator(X):\n X_offset_scale = X_offset / X_scale\n \n def matvec(b):\n return X.dot(b) - b.dot(X_offset_scale)\n \n def rmatvec(b):\n return X.T.dot(b) - X_offset_scale * np.sum(b)\n X1 = sparse.linalg.LinearOperator(shape=X.shape, matvec=matvec, rmatvec=rmatvec)\n return X1\n (n_samples, n_features) = X.shape\n if X_offset is None or X_scale is None:\n X1 = 
sp_linalg.aslinearoperator(X)\n else:\n X1 = _get_rescaled_operator(X)\n coefs = np.empty((y.shape[1], n_features), dtype=X.dtype)\n if n_features > n_samples:\n \n def create_mv(curr_alpha):\n \n def _mv(x):\n return X1.matvec(X1.rmatvec(x)) + curr_alpha * x\n return _mv\n else:\n \n def create_mv(curr_alpha):\n \n def _mv(x):\n return X1.rmatvec(X1.matvec(x)) + curr_alpha * x\n return _mv\n for i in range(y.shape[1]):\n y_column = y[:, i]\n mv = create_mv(alpha[i])\n if n_features > n_samples:\n C = sp_linalg.LinearOperator((n_samples, n_samples), matvec=mv, dtype=X.dtype)\n try:\n (coef, info) = sp_linalg.cg(C, y_column, tol=tol, atol='legacy')\n except TypeError:\n (coef, info) = sp_linalg.cg(C, y_column, tol=tol)\n coefs[i] = X1.rmatvec(coef)\n else:\n y_column = X1.rmatvec(y_column)\n C = sp_linalg.LinearOperator((n_features, n_features), matvec=mv, dtype=X.dtype)\n try:\n (coefs[i], info) = sp_linalg.cg(C, y_column, maxiter=max_iter, tol=tol, atol='legacy')\n except TypeError:\n (coefs[i], info) = sp_linalg.cg(C, y_column, maxiter=max_iter, tol=tol)\n if info < 0:\n raise ValueError('Failed with error code %d' % info)\n if max_iter is None and info > 0 and verbose:\n warnings.warn('sparse_cg did not converge after %d iterations.' % info, ConvergenceWarning)\n return coefs" }, { @@ -109134,7 +116968,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -109144,7 +116979,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "alpha", @@ -109154,13 +116990,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _solve_svd(X, y, alpha):\n (U, s, Vt) = linalg.svd(X, full_matrices=False)\n idx = s > 1e-15\n s_nnz = s[idx][:, np.newaxis]\n UTy = np.dot(U.T, y)\n d = np.zeros((s.size, alpha.size), dtype=X.dtype)\n d[idx] = s_nnz / (s_nnz**2 + alpha)\n d_UT_y = d * UTy\n return np.dot(Vt.T, d_UT_y).T" }, { @@ -109178,6 +117015,10 @@ "docstring": { "type": "{ndarray, sparse matrix, LinearOperator} of shape (n_samples, n_features)", "description": "Training data" + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -109188,7 +117029,8 @@ "docstring": { "type": "ndarray of shape (n_samples,) or (n_samples, n_targets)", "description": "Target values" - } + }, + "refined_type": {} }, { "name": "alpha", @@ -109198,7 +117040,8 @@ "docstring": { "type": "float or array-like of shape (n_targets,)", "description": "Regularization strength; must be a positive float. Regularization\nimproves the conditioning of the problem and reduces the variance of\nthe estimates. Larger values specify stronger regularization.\nAlpha corresponds to ``1 / (2C)`` in other linear models such as\n:class:`~sklearn.linear_model.LogisticRegression` or\n:class:`~sklearn.svm.LinearSVC`. If an array is passed, penalties are\nassumed to be specific to the targets. Hence they must correspond in\nnumber." - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -109208,7 +117051,8 @@ "docstring": { "type": "float or array-like of shape (n_samples,), default=None", "description": "Individual weights for each sample. If given a float, every sample\nwill have the same weight. If sample_weight is not None and\nsolver='auto', the solver will be set to 'cholesky'.\n\n.. 
versionadded:: 0.17" - } + }, + "refined_type": {} }, { "name": "solver", @@ -109218,6 +117062,19 @@ "docstring": { "type": "{'auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga', 'lbfgs'}, default='auto'", "description": "Solver to use in the computational routines:\n\n- 'auto' chooses the solver automatically based on the type of data.\n\n- 'svd' uses a Singular Value Decomposition of X to compute the Ridge\n coefficients. More stable for singular matrices than 'cholesky'.\n\n- 'cholesky' uses the standard scipy.linalg.solve function to\n obtain a closed-form solution via a Cholesky decomposition of\n dot(X.T, X)\n\n- 'sparse_cg' uses the conjugate gradient solver as found in\n scipy.sparse.linalg.cg. As an iterative algorithm, this solver is\n more appropriate than 'cholesky' for large-scale data\n (possibility to set `tol` and `max_iter`).\n\n- 'lsqr' uses the dedicated regularized least-squares routine\n scipy.sparse.linalg.lsqr. It is the fastest and uses an iterative\n procedure.\n\n- 'sag' uses a Stochastic Average Gradient descent, and 'saga' uses\n its improved, unbiased version named SAGA. Both methods also use an\n iterative procedure, and are often faster than other solvers when\n both n_samples and n_features are large. Note that 'sag' and\n 'saga' fast convergence is only guaranteed on features with\n approximately the same scale. You can preprocess the data with a\n scaler from sklearn.preprocessing.\n\n- 'lbfgs' uses L-BFGS-B algorithm implemented in\n `scipy.optimize.minimize`. It can be used only when `positive`\n is True.\n\nAll last six solvers support both dense and sparse data. However, only\n'sag', 'sparse_cg', and 'lbfgs' support sparse input when `fit_intercept`\nis True.\n\n.. versionadded:: 0.17\n Stochastic Average Gradient descent solver.\n.. versionadded:: 0.19\n SAGA solver." + }, + "refined_type": { + "kind": "EnumType", + "values": [ + "lsqr", + "saga", + "lbfgs", + "svd", + "sag", + "sparse_cg", + "auto", + "cholesky" + ] } }, { @@ -109228,7 +117085,8 @@ "docstring": { "type": "int, default=None", "description": "Maximum number of iterations for conjugate gradient solver.\nFor the 'sparse_cg' and 'lsqr' solvers, the default value is determined\nby scipy.sparse.linalg. For 'sag' and saga solver, the default value is\n1000. For 'lbfgs' solver, the default value is 15000." - } + }, + "refined_type": {} }, { "name": "tol", @@ -109238,7 +117096,8 @@ "docstring": { "type": "float, default=1e-3", "description": "Precision of the solution." - } + }, + "refined_type": {} }, { "name": "verbose", @@ -109248,7 +117107,8 @@ "docstring": { "type": "int, default=0", "description": "Verbosity level. Setting verbose > 0 will display additional\ninformation depending on the solver used." - } + }, + "refined_type": {} }, { "name": "positive", @@ -109258,7 +117118,8 @@ "docstring": { "type": "bool, default=False", "description": "When set to ``True``, forces the coefficients to be positive.\nOnly 'lbfgs' solver is supported in this case." - } + }, + "refined_type": {} }, { "name": "random_state", @@ -109268,7 +117129,8 @@ "docstring": { "type": "int, RandomState instance, default=None", "description": "Used when ``solver`` == 'sag' or 'saga' to shuffle the data.\nSee :term:`Glossary ` for details." - } + }, + "refined_type": {} }, { "name": "return_n_iter", @@ -109278,7 +117140,8 @@ "docstring": { "type": "bool, default=False", "description": "If True, the method also returns `n_iter`, the actual number of\niteration performed by the solver.\n\n.. 
versionadded:: 0.17" - } + }, + "refined_type": {} }, { "name": "return_intercept", @@ -109288,7 +117151,8 @@ "docstring": { "type": "bool, default=False", "description": "If True and if X is sparse, the method also returns the intercept,\nand the solver is automatically changed to 'sag'. This is only a\ntemporary fix for fitting the intercept with sparse data. For dense\ndata, use sklearn.linear_model._preprocess_data before your regression.\n\n.. versionadded:: 0.17" - } + }, + "refined_type": {} }, { "name": "check_input", @@ -109298,13 +117162,14 @@ "docstring": { "type": "bool, default=True", "description": "If False, the input arrays X and y will not be checked.\n\n.. versionadded:: 0.21" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Solve the ridge equation by the method of normal equations.\n\nRead more in the :ref:`User Guide `.", - "docstring": "Solve the ridge equation by the method of normal equations.\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\nX : {ndarray, sparse matrix, LinearOperator} of shape (n_samples, n_features)\n Training data\n\ny : ndarray of shape (n_samples,) or (n_samples, n_targets)\n Target values\n\nalpha : float or array-like of shape (n_targets,)\n Regularization strength; must be a positive float. Regularization\n improves the conditioning of the problem and reduces the variance of\n the estimates. Larger values specify stronger regularization.\n Alpha corresponds to ``1 / (2C)`` in other linear models such as\n :class:`~sklearn.linear_model.LogisticRegression` or\n :class:`~sklearn.svm.LinearSVC`. If an array is passed, penalties are\n assumed to be specific to the targets. Hence they must correspond in\n number.\n\nsample_weight : float or array-like of shape (n_samples,), default=None\n Individual weights for each sample. If given a float, every sample\n will have the same weight. If sample_weight is not None and\n solver='auto', the solver will be set to 'cholesky'.\n\n .. versionadded:: 0.17\n\nsolver : {'auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga', 'lbfgs'}, default='auto'\n Solver to use in the computational routines:\n\n - 'auto' chooses the solver automatically based on the type of data.\n\n - 'svd' uses a Singular Value Decomposition of X to compute the Ridge\n coefficients. More stable for singular matrices than 'cholesky'.\n\n - 'cholesky' uses the standard scipy.linalg.solve function to\n obtain a closed-form solution via a Cholesky decomposition of\n dot(X.T, X)\n\n - 'sparse_cg' uses the conjugate gradient solver as found in\n scipy.sparse.linalg.cg. As an iterative algorithm, this solver is\n more appropriate than 'cholesky' for large-scale data\n (possibility to set `tol` and `max_iter`).\n\n - 'lsqr' uses the dedicated regularized least-squares routine\n scipy.sparse.linalg.lsqr. It is the fastest and uses an iterative\n procedure.\n\n - 'sag' uses a Stochastic Average Gradient descent, and 'saga' uses\n its improved, unbiased version named SAGA. Both methods also use an\n iterative procedure, and are often faster than other solvers when\n both n_samples and n_features are large. Note that 'sag' and\n 'saga' fast convergence is only guaranteed on features with\n approximately the same scale. You can preprocess the data with a\n scaler from sklearn.preprocessing.\n\n - 'lbfgs' uses L-BFGS-B algorithm implemented in\n `scipy.optimize.minimize`. It can be used only when `positive`\n is True.\n\n All last six solvers support both dense and sparse data. 
However, only\n 'sag', 'sparse_cg', and 'lbfgs' support sparse input when `fit_intercept`\n is True.\n\n .. versionadded:: 0.17\n Stochastic Average Gradient descent solver.\n .. versionadded:: 0.19\n SAGA solver.\n\nmax_iter : int, default=None\n Maximum number of iterations for conjugate gradient solver.\n For the 'sparse_cg' and 'lsqr' solvers, the default value is determined\n by scipy.sparse.linalg. For 'sag' and saga solver, the default value is\n 1000. For 'lbfgs' solver, the default value is 15000.\n\ntol : float, default=1e-3\n Precision of the solution.\n\nverbose : int, default=0\n Verbosity level. Setting verbose > 0 will display additional\n information depending on the solver used.\n\npositive : bool, default=False\n When set to ``True``, forces the coefficients to be positive.\n Only 'lbfgs' solver is supported in this case.\n\nrandom_state : int, RandomState instance, default=None\n Used when ``solver`` == 'sag' or 'saga' to shuffle the data.\n See :term:`Glossary ` for details.\n\nreturn_n_iter : bool, default=False\n If True, the method also returns `n_iter`, the actual number of\n iteration performed by the solver.\n\n .. versionadded:: 0.17\n\nreturn_intercept : bool, default=False\n If True and if X is sparse, the method also returns the intercept,\n and the solver is automatically changed to 'sag'. This is only a\n temporary fix for fitting the intercept with sparse data. For dense\n data, use sklearn.linear_model._preprocess_data before your regression.\n\n .. versionadded:: 0.17\n\ncheck_input : bool, default=True\n If False, the input arrays X and y will not be checked.\n\n .. versionadded:: 0.21\n\nReturns\n-------\ncoef : ndarray of shape (n_features,) or (n_targets, n_features)\n Weight vector(s).\n\nn_iter : int, optional\n The actual number of iteration performed by the solver.\n Only returned if `return_n_iter` is True.\n\nintercept : float or ndarray of shape (n_targets,)\n The intercept of the model. Only returned if `return_intercept`\n is True and if X is a scipy sparse array.\n\nNotes\n-----\nThis function won't compute the intercept.", + "docstring": "Solve the ridge equation by the method of normal equations.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n X : {ndarray, sparse matrix, LinearOperator} of shape (n_samples, n_features)\n Training data\n\n y : ndarray of shape (n_samples,) or (n_samples, n_targets)\n Target values\n\n alpha : float or array-like of shape (n_targets,)\n Regularization strength; must be a positive float. Regularization\n improves the conditioning of the problem and reduces the variance of\n the estimates. Larger values specify stronger regularization.\n Alpha corresponds to ``1 / (2C)`` in other linear models such as\n :class:`~sklearn.linear_model.LogisticRegression` or\n :class:`~sklearn.svm.LinearSVC`. If an array is passed, penalties are\n assumed to be specific to the targets. Hence they must correspond in\n number.\n\n sample_weight : float or array-like of shape (n_samples,), default=None\n Individual weights for each sample. If given a float, every sample\n will have the same weight. If sample_weight is not None and\n solver='auto', the solver will be set to 'cholesky'.\n\n .. 
versionadded:: 0.17\n\n solver : {'auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga', 'lbfgs'}, default='auto'\n Solver to use in the computational routines:\n\n - 'auto' chooses the solver automatically based on the type of data.\n\n - 'svd' uses a Singular Value Decomposition of X to compute the Ridge\n coefficients. More stable for singular matrices than 'cholesky'.\n\n - 'cholesky' uses the standard scipy.linalg.solve function to\n obtain a closed-form solution via a Cholesky decomposition of\n dot(X.T, X)\n\n - 'sparse_cg' uses the conjugate gradient solver as found in\n scipy.sparse.linalg.cg. As an iterative algorithm, this solver is\n more appropriate than 'cholesky' for large-scale data\n (possibility to set `tol` and `max_iter`).\n\n - 'lsqr' uses the dedicated regularized least-squares routine\n scipy.sparse.linalg.lsqr. It is the fastest and uses an iterative\n procedure.\n\n - 'sag' uses a Stochastic Average Gradient descent, and 'saga' uses\n its improved, unbiased version named SAGA. Both methods also use an\n iterative procedure, and are often faster than other solvers when\n both n_samples and n_features are large. Note that 'sag' and\n 'saga' fast convergence is only guaranteed on features with\n approximately the same scale. You can preprocess the data with a\n scaler from sklearn.preprocessing.\n\n - 'lbfgs' uses L-BFGS-B algorithm implemented in\n `scipy.optimize.minimize`. It can be used only when `positive`\n is True.\n\n All last six solvers support both dense and sparse data. However, only\n 'sag', 'sparse_cg', and 'lbfgs' support sparse input when `fit_intercept`\n is True.\n\n .. versionadded:: 0.17\n Stochastic Average Gradient descent solver.\n .. versionadded:: 0.19\n SAGA solver.\n\n max_iter : int, default=None\n Maximum number of iterations for conjugate gradient solver.\n For the 'sparse_cg' and 'lsqr' solvers, the default value is determined\n by scipy.sparse.linalg. For 'sag' and saga solver, the default value is\n 1000. For 'lbfgs' solver, the default value is 15000.\n\n tol : float, default=1e-3\n Precision of the solution.\n\n verbose : int, default=0\n Verbosity level. Setting verbose > 0 will display additional\n information depending on the solver used.\n\n positive : bool, default=False\n When set to ``True``, forces the coefficients to be positive.\n Only 'lbfgs' solver is supported in this case.\n\n random_state : int, RandomState instance, default=None\n Used when ``solver`` == 'sag' or 'saga' to shuffle the data.\n See :term:`Glossary ` for details.\n\n return_n_iter : bool, default=False\n If True, the method also returns `n_iter`, the actual number of\n iteration performed by the solver.\n\n .. versionadded:: 0.17\n\n return_intercept : bool, default=False\n If True and if X is sparse, the method also returns the intercept,\n and the solver is automatically changed to 'sag'. This is only a\n temporary fix for fitting the intercept with sparse data. For dense\n data, use sklearn.linear_model._preprocess_data before your regression.\n\n .. versionadded:: 0.17\n\n check_input : bool, default=True\n If False, the input arrays X and y will not be checked.\n\n .. versionadded:: 0.21\n\n Returns\n -------\n coef : ndarray of shape (n_features,) or (n_targets, n_features)\n Weight vector(s).\n\n n_iter : int, optional\n The actual number of iteration performed by the solver.\n Only returned if `return_n_iter` is True.\n\n intercept : float or ndarray of shape (n_targets,)\n The intercept of the model. 
Only returned if `return_intercept`\n is True and if X is a scipy sparse array.\n\n Notes\n -----\n This function won't compute the intercept.\n ", "source_code": "\ndef ridge_regression(X, y, alpha, *, sample_weight=None, solver='auto', max_iter=None, tol=0.001, verbose=0, positive=False, random_state=None, return_n_iter=False, return_intercept=False, check_input=True):\n \"\"\"Solve the ridge equation by the method of normal equations.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n X : {ndarray, sparse matrix, LinearOperator} of shape (n_samples, n_features)\n Training data\n\n y : ndarray of shape (n_samples,) or (n_samples, n_targets)\n Target values\n\n alpha : float or array-like of shape (n_targets,)\n Regularization strength; must be a positive float. Regularization\n improves the conditioning of the problem and reduces the variance of\n the estimates. Larger values specify stronger regularization.\n Alpha corresponds to ``1 / (2C)`` in other linear models such as\n :class:`~sklearn.linear_model.LogisticRegression` or\n :class:`~sklearn.svm.LinearSVC`. If an array is passed, penalties are\n assumed to be specific to the targets. Hence they must correspond in\n number.\n\n sample_weight : float or array-like of shape (n_samples,), default=None\n Individual weights for each sample. If given a float, every sample\n will have the same weight. If sample_weight is not None and\n solver='auto', the solver will be set to 'cholesky'.\n\n .. versionadded:: 0.17\n\n solver : {'auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga', 'lbfgs'}, default='auto'\n Solver to use in the computational routines:\n\n - 'auto' chooses the solver automatically based on the type of data.\n\n - 'svd' uses a Singular Value Decomposition of X to compute the Ridge\n coefficients. More stable for singular matrices than 'cholesky'.\n\n - 'cholesky' uses the standard scipy.linalg.solve function to\n obtain a closed-form solution via a Cholesky decomposition of\n dot(X.T, X)\n\n - 'sparse_cg' uses the conjugate gradient solver as found in\n scipy.sparse.linalg.cg. As an iterative algorithm, this solver is\n more appropriate than 'cholesky' for large-scale data\n (possibility to set `tol` and `max_iter`).\n\n - 'lsqr' uses the dedicated regularized least-squares routine\n scipy.sparse.linalg.lsqr. It is the fastest and uses an iterative\n procedure.\n\n - 'sag' uses a Stochastic Average Gradient descent, and 'saga' uses\n its improved, unbiased version named SAGA. Both methods also use an\n iterative procedure, and are often faster than other solvers when\n both n_samples and n_features are large. Note that 'sag' and\n 'saga' fast convergence is only guaranteed on features with\n approximately the same scale. You can preprocess the data with a\n scaler from sklearn.preprocessing.\n\n - 'lbfgs' uses L-BFGS-B algorithm implemented in\n `scipy.optimize.minimize`. It can be used only when `positive`\n is True.\n\n All last six solvers support both dense and sparse data. However, only\n 'sag', 'sparse_cg', and 'lbfgs' support sparse input when `fit_intercept`\n is True.\n\n .. versionadded:: 0.17\n Stochastic Average Gradient descent solver.\n .. versionadded:: 0.19\n SAGA solver.\n\n max_iter : int, default=None\n Maximum number of iterations for conjugate gradient solver.\n For the 'sparse_cg' and 'lsqr' solvers, the default value is determined\n by scipy.sparse.linalg. For 'sag' and saga solver, the default value is\n 1000. 
For 'lbfgs' solver, the default value is 15000.\n\n tol : float, default=1e-3\n Precision of the solution.\n\n verbose : int, default=0\n Verbosity level. Setting verbose > 0 will display additional\n information depending on the solver used.\n\n positive : bool, default=False\n When set to ``True``, forces the coefficients to be positive.\n Only 'lbfgs' solver is supported in this case.\n\n random_state : int, RandomState instance, default=None\n Used when ``solver`` == 'sag' or 'saga' to shuffle the data.\n See :term:`Glossary ` for details.\n\n return_n_iter : bool, default=False\n If True, the method also returns `n_iter`, the actual number of\n iteration performed by the solver.\n\n .. versionadded:: 0.17\n\n return_intercept : bool, default=False\n If True and if X is sparse, the method also returns the intercept,\n and the solver is automatically changed to 'sag'. This is only a\n temporary fix for fitting the intercept with sparse data. For dense\n data, use sklearn.linear_model._preprocess_data before your regression.\n\n .. versionadded:: 0.17\n\n check_input : bool, default=True\n If False, the input arrays X and y will not be checked.\n\n .. versionadded:: 0.21\n\n Returns\n -------\n coef : ndarray of shape (n_features,) or (n_targets, n_features)\n Weight vector(s).\n\n n_iter : int, optional\n The actual number of iteration performed by the solver.\n Only returned if `return_n_iter` is True.\n\n intercept : float or ndarray of shape (n_targets,)\n The intercept of the model. Only returned if `return_intercept`\n is True and if X is a scipy sparse array.\n\n Notes\n -----\n This function won't compute the intercept.\n \"\"\"\n return _ridge_regression(X, y, alpha, sample_weight=sample_weight, solver=solver, max_iter=max_iter, tol=tol, verbose=verbose, positive=positive, random_state=random_state, return_n_iter=return_n_iter, return_intercept=return_intercept, X_scale=None, X_offset=None, check_input=check_input)" }, { @@ -109322,7 +117187,8 @@ "docstring": { "type": "float", "description": "Maximum squared sum of X over samples." - } + }, + "refined_type": {} }, { "name": "alpha_scaled", @@ -109332,7 +117198,8 @@ "docstring": { "type": "float", "description": "Constant that multiplies the regularization term, scaled by\n1. / n_samples, the number of samples." - } + }, + "refined_type": {} }, { "name": "loss", @@ -109342,6 +117209,10 @@ "docstring": { "type": "{'log', 'squared', 'multinomial'}", "description": "The loss function used in SAG solver." + }, + "refined_type": { + "kind": "EnumType", + "values": ["multinomial", "log", "squared"] } }, { @@ -109352,7 +117223,8 @@ "docstring": { "type": "bool", "description": "Specifies if a constant (a.k.a. bias or intercept) will be\nadded to the decision function." - } + }, + "refined_type": {} }, { "name": "n_samples", @@ -109362,7 +117234,8 @@ "docstring": { "type": "int, default=None", "description": "Number of rows in X. Useful if is_saga=True." - } + }, + "refined_type": {} }, { "name": "is_saga", @@ -109372,13 +117245,14 @@ "docstring": { "type": "bool, default=False", "description": "Whether to return step size for the SAGA algorithm or the SAG\nalgorithm." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Compute automatic step size for SAG solver.\n\nThe step size is set to 1 / (alpha_scaled + L + fit_intercept) where L is the max sum of squares for over all samples.", - "docstring": "Compute automatic step size for SAG solver.\n\nThe step size is set to 1 / (alpha_scaled + L + fit_intercept) where L is\nthe max sum of squares for over all samples.\n\nParameters\n----------\nmax_squared_sum : float\n Maximum squared sum of X over samples.\n\nalpha_scaled : float\n Constant that multiplies the regularization term, scaled by\n 1. / n_samples, the number of samples.\n\nloss : {'log', 'squared', 'multinomial'}\n The loss function used in SAG solver.\n\nfit_intercept : bool\n Specifies if a constant (a.k.a. bias or intercept) will be\n added to the decision function.\n\nn_samples : int, default=None\n Number of rows in X. Useful if is_saga=True.\n\nis_saga : bool, default=False\n Whether to return step size for the SAGA algorithm or the SAG\n algorithm.\n\nReturns\n-------\nstep_size : float\n Step size used in SAG solver.\n\nReferences\n----------\nSchmidt, M., Roux, N. L., & Bach, F. (2013).\nMinimizing finite sums with the stochastic average gradient\nhttps://hal.inria.fr/hal-00860051/document\n\nDefazio, A., Bach F. & Lacoste-Julien S. (2014).\nSAGA: A Fast Incremental Gradient Method With Support\nfor Non-Strongly Convex Composite Objectives\nhttps://arxiv.org/abs/1407.0202", + "description": "Compute automatic step size for SAG solver.\n\nThe step size is set to 1 / (alpha_scaled + L + fit_intercept) where L is\nthe max sum of squares for over all samples.", + "docstring": "Compute automatic step size for SAG solver.\n\n The step size is set to 1 / (alpha_scaled + L + fit_intercept) where L is\n the max sum of squares for over all samples.\n\n Parameters\n ----------\n max_squared_sum : float\n Maximum squared sum of X over samples.\n\n alpha_scaled : float\n Constant that multiplies the regularization term, scaled by\n 1. / n_samples, the number of samples.\n\n loss : {'log', 'squared', 'multinomial'}\n The loss function used in SAG solver.\n\n fit_intercept : bool\n Specifies if a constant (a.k.a. bias or intercept) will be\n added to the decision function.\n\n n_samples : int, default=None\n Number of rows in X. Useful if is_saga=True.\n\n is_saga : bool, default=False\n Whether to return step size for the SAGA algorithm or the SAG\n algorithm.\n\n Returns\n -------\n step_size : float\n Step size used in SAG solver.\n\n References\n ----------\n Schmidt, M., Roux, N. L., & Bach, F. (2013).\n Minimizing finite sums with the stochastic average gradient\n https://hal.inria.fr/hal-00860051/document\n\n Defazio, A., Bach F. & Lacoste-Julien S. (2014).\n SAGA: A Fast Incremental Gradient Method With Support\n for Non-Strongly Convex Composite Objectives\n https://arxiv.org/abs/1407.0202\n ", "source_code": "\ndef get_auto_step_size(max_squared_sum, alpha_scaled, loss, fit_intercept, n_samples=None, is_saga=False):\n \"\"\"Compute automatic step size for SAG solver.\n\n The step size is set to 1 / (alpha_scaled + L + fit_intercept) where L is\n the max sum of squares for over all samples.\n\n Parameters\n ----------\n max_squared_sum : float\n Maximum squared sum of X over samples.\n\n alpha_scaled : float\n Constant that multiplies the regularization term, scaled by\n 1. 
/ n_samples, the number of samples.\n\n loss : {'log', 'squared', 'multinomial'}\n The loss function used in SAG solver.\n\n fit_intercept : bool\n Specifies if a constant (a.k.a. bias or intercept) will be\n added to the decision function.\n\n n_samples : int, default=None\n Number of rows in X. Useful if is_saga=True.\n\n is_saga : bool, default=False\n Whether to return step size for the SAGA algorithm or the SAG\n algorithm.\n\n Returns\n -------\n step_size : float\n Step size used in SAG solver.\n\n References\n ----------\n Schmidt, M., Roux, N. L., & Bach, F. (2013).\n Minimizing finite sums with the stochastic average gradient\n https://hal.inria.fr/hal-00860051/document\n\n Defazio, A., Bach F. & Lacoste-Julien S. (2014).\n SAGA: A Fast Incremental Gradient Method With Support\n for Non-Strongly Convex Composite Objectives\n https://arxiv.org/abs/1407.0202\n \"\"\"\n if loss in ('log', 'multinomial'):\n L = 0.25 * (max_squared_sum + int(fit_intercept)) + alpha_scaled\n elif loss == 'squared':\n L = max_squared_sum + int(fit_intercept) + alpha_scaled\n else:\n raise ValueError(\"Unknown loss function for SAG solver, got %s instead of 'log' or 'squared'\" % loss)\n if is_saga:\n mun = min(2 * n_samples * alpha_scaled, L)\n step = 1.0 / (2 * L + mun)\n else:\n step = 1.0 / L\n return step" }, { @@ -109396,6 +117270,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "Training data." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -109406,7 +117284,8 @@ "docstring": { "type": "ndarray of shape (n_samples,)", "description": "Target values. With loss='multinomial', y must be label encoded\n(see preprocessing.LabelEncoder)." - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -109416,7 +117295,8 @@ "docstring": { "type": "array-like of shape (n_samples,), default=None", "description": "Weights applied to individual samples (1. for unweighted)." - } + }, + "refined_type": {} }, { "name": "loss", @@ -109426,6 +117306,10 @@ "docstring": { "type": "{'log', 'squared', 'multinomial'}, default='log'", "description": "Loss function that will be optimized:\n-'log' is the binary logistic loss, as used in LogisticRegression.\n-'squared' is the squared loss, as used in Ridge.\n-'multinomial' is the multinomial logistic loss, as used in\n LogisticRegression.\n\n.. versionadded:: 0.18\n *loss='multinomial'*" + }, + "refined_type": { + "kind": "EnumType", + "values": ["multinomial", "log", "squared"] } }, { @@ -109436,7 +117320,8 @@ "docstring": { "type": "float, default=1.", "description": "L2 regularization term in the objective function\n``(0.5 * alpha * || W ||_F^2)``." - } + }, + "refined_type": {} }, { "name": "beta", @@ -109446,7 +117331,8 @@ "docstring": { "type": "float, default=0.", "description": "L1 regularization term in the objective function\n``(beta * || W ||_1)``. Only applied if ``is_saga`` is set to True." - } + }, + "refined_type": {} }, { "name": "max_iter", @@ -109456,7 +117342,8 @@ "docstring": { "type": "int, default=1000", "description": "The max number of passes over the training data if the stopping\ncriteria is not reached." - } + }, + "refined_type": {} }, { "name": "tol", @@ -109464,9 +117351,10 @@ "is_public": false, "assigned_by": "POSITION_OR_NAME", "docstring": { - "type": "double, default=0.001", + "type": "float, default=0.001", "description": "The stopping criteria for the weights. The iterations will stop when\nmax(change in weights) / max(weights) < tol." 
- } + }, + "refined_type": {} }, { "name": "verbose", @@ -109476,7 +117364,8 @@ "docstring": { "type": "int, default=0", "description": "The verbosity level." - } + }, + "refined_type": {} }, { "name": "random_state", @@ -109486,7 +117375,8 @@ "docstring": { "type": "int, RandomState instance or None, default=None", "description": "Used when shuffling the data. Pass an int for reproducible output\nacross multiple function calls.\nSee :term:`Glossary `." - } + }, + "refined_type": {} }, { "name": "check_input", @@ -109496,7 +117386,8 @@ "docstring": { "type": "bool, default=True", "description": "If False, the input arrays X and y will not be checked." - } + }, + "refined_type": {} }, { "name": "max_squared_sum", @@ -109506,7 +117397,8 @@ "docstring": { "type": "float, default=None", "description": "Maximum squared sum of X over samples. If None, it will be computed,\ngoing through all the samples. The value should be precomputed\nto speed up cross validation." - } + }, + "refined_type": {} }, { "name": "warm_start_mem", @@ -109516,7 +117408,8 @@ "docstring": { "type": "dict, default=None", "description": "The initialization parameters used for warm starting. Warm starting is\ncurrently used in LogisticRegression but not in Ridge.\nIt contains:\n - 'coef': the weight vector, with the intercept in last line\n if the intercept is fitted.\n - 'gradient_memory': the scalar gradient for all seen samples.\n - 'sum_gradient': the sum of gradient over all seen samples,\n for each feature.\n - 'intercept_sum_gradient': the sum of gradient over all seen\n samples, for the intercept.\n - 'seen': array of boolean describing the seen samples.\n - 'num_seen': the number of seen samples." - } + }, + "refined_type": {} }, { "name": "is_saga", @@ -109526,14 +117419,15 @@ "docstring": { "type": "bool, default=False", "description": "Whether to use the SAGA algorithm or the SAG algorithm. SAGA behaves\nbetter in the first epochs, and allow for l1 regularisation." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "SAG solver for Ridge and LogisticRegression.\n\nSAG stands for Stochastic Average Gradient: the gradient of the loss is estimated each sample at a time and the model is updated along the way with a constant learning rate. IMPORTANT NOTE: 'sag' solver converges faster on columns that are on the same scale. You can normalize the data by using sklearn.preprocessing.StandardScaler on your data before passing it to the fit method. This implementation works with data represented as dense numpy arrays or sparse scipy arrays of floating point values for the features. It will fit the data according to squared loss or log loss. The regularizer is a penalty added to the loss function that shrinks model parameters towards the zero vector using the squared euclidean norm L2. .. versionadded:: 0.17", - "docstring": "SAG solver for Ridge and LogisticRegression.\n\nSAG stands for Stochastic Average Gradient: the gradient of the loss is\nestimated each sample at a time and the model is updated along the way with\na constant learning rate.\n\nIMPORTANT NOTE: 'sag' solver converges faster on columns that are on the\nsame scale. You can normalize the data by using\nsklearn.preprocessing.StandardScaler on your data before passing it to the\nfit method.\n\nThis implementation works with data represented as dense numpy arrays or\nsparse scipy arrays of floating point values for the features. 
It will\nfit the data according to squared loss or log loss.\n\nThe regularizer is a penalty added to the loss function that shrinks model\nparameters towards the zero vector using the squared euclidean norm L2.\n\n.. versionadded:: 0.17\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training data.\n\ny : ndarray of shape (n_samples,)\n Target values. With loss='multinomial', y must be label encoded\n (see preprocessing.LabelEncoder).\n\nsample_weight : array-like of shape (n_samples,), default=None\n Weights applied to individual samples (1. for unweighted).\n\nloss : {'log', 'squared', 'multinomial'}, default='log'\n Loss function that will be optimized:\n -'log' is the binary logistic loss, as used in LogisticRegression.\n -'squared' is the squared loss, as used in Ridge.\n -'multinomial' is the multinomial logistic loss, as used in\n LogisticRegression.\n\n .. versionadded:: 0.18\n *loss='multinomial'*\n\nalpha : float, default=1.\n L2 regularization term in the objective function\n ``(0.5 * alpha * || W ||_F^2)``.\n\nbeta : float, default=0.\n L1 regularization term in the objective function\n ``(beta * || W ||_1)``. Only applied if ``is_saga`` is set to True.\n\nmax_iter : int, default=1000\n The max number of passes over the training data if the stopping\n criteria is not reached.\n\ntol : double, default=0.001\n The stopping criteria for the weights. The iterations will stop when\n max(change in weights) / max(weights) < tol.\n\nverbose : int, default=0\n The verbosity level.\n\nrandom_state : int, RandomState instance or None, default=None\n Used when shuffling the data. Pass an int for reproducible output\n across multiple function calls.\n See :term:`Glossary `.\n\ncheck_input : bool, default=True\n If False, the input arrays X and y will not be checked.\n\nmax_squared_sum : float, default=None\n Maximum squared sum of X over samples. If None, it will be computed,\n going through all the samples. The value should be precomputed\n to speed up cross validation.\n\nwarm_start_mem : dict, default=None\n The initialization parameters used for warm starting. Warm starting is\n currently used in LogisticRegression but not in Ridge.\n It contains:\n - 'coef': the weight vector, with the intercept in last line\n if the intercept is fitted.\n - 'gradient_memory': the scalar gradient for all seen samples.\n - 'sum_gradient': the sum of gradient over all seen samples,\n for each feature.\n - 'intercept_sum_gradient': the sum of gradient over all seen\n samples, for the intercept.\n - 'seen': array of boolean describing the seen samples.\n - 'num_seen': the number of seen samples.\n\nis_saga : bool, default=False\n Whether to use the SAGA algorithm or the SAG algorithm. SAGA behaves\n better in the first epochs, and allow for l1 regularisation.\n\nReturns\n-------\ncoef_ : ndarray of shape (n_features,)\n Weight vector.\n\nn_iter_ : int\n The number of full pass on all samples.\n\nwarm_start_mem : dict\n Contains a 'coef' key with the fitted result, and possibly the\n fitted intercept at the end of the array. 
Contains also other keys\n used for warm starting.\n\nExamples\n--------\n>>> import numpy as np\n>>> from sklearn import linear_model\n>>> n_samples, n_features = 10, 5\n>>> rng = np.random.RandomState(0)\n>>> X = rng.randn(n_samples, n_features)\n>>> y = rng.randn(n_samples)\n>>> clf = linear_model.Ridge(solver='sag')\n>>> clf.fit(X, y)\nRidge(solver='sag')\n\n>>> X = np.array([[-1, -1], [-2, -1], [1, 1], [2, 1]])\n>>> y = np.array([1, 1, 2, 2])\n>>> clf = linear_model.LogisticRegression(\n... solver='sag', multi_class='multinomial')\n>>> clf.fit(X, y)\nLogisticRegression(multi_class='multinomial', solver='sag')\n\nReferences\n----------\nSchmidt, M., Roux, N. L., & Bach, F. (2013).\nMinimizing finite sums with the stochastic average gradient\nhttps://hal.inria.fr/hal-00860051/document\n\nDefazio, A., Bach F. & Lacoste-Julien S. (2014).\nSAGA: A Fast Incremental Gradient Method With Support\nfor Non-Strongly Convex Composite Objectives\nhttps://arxiv.org/abs/1407.0202\n\nSee Also\n--------\nRidge, SGDRegressor, ElasticNet, Lasso, SVR,\nLogisticRegression, SGDClassifier, LinearSVC, Perceptron", - "source_code": "\ndef sag_solver(X, y, sample_weight=None, loss='log', alpha=1.0, beta=0.0, max_iter=1000, tol=0.001, verbose=0, random_state=None, check_input=True, max_squared_sum=None, warm_start_mem=None, is_saga=False):\n \"\"\"SAG solver for Ridge and LogisticRegression.\n\n SAG stands for Stochastic Average Gradient: the gradient of the loss is\n estimated each sample at a time and the model is updated along the way with\n a constant learning rate.\n\n IMPORTANT NOTE: 'sag' solver converges faster on columns that are on the\n same scale. You can normalize the data by using\n sklearn.preprocessing.StandardScaler on your data before passing it to the\n fit method.\n\n This implementation works with data represented as dense numpy arrays or\n sparse scipy arrays of floating point values for the features. It will\n fit the data according to squared loss or log loss.\n\n The regularizer is a penalty added to the loss function that shrinks model\n parameters towards the zero vector using the squared euclidean norm L2.\n\n .. versionadded:: 0.17\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training data.\n\n y : ndarray of shape (n_samples,)\n Target values. With loss='multinomial', y must be label encoded\n (see preprocessing.LabelEncoder).\n\n sample_weight : array-like of shape (n_samples,), default=None\n Weights applied to individual samples (1. for unweighted).\n\n loss : {'log', 'squared', 'multinomial'}, default='log'\n Loss function that will be optimized:\n -'log' is the binary logistic loss, as used in LogisticRegression.\n -'squared' is the squared loss, as used in Ridge.\n -'multinomial' is the multinomial logistic loss, as used in\n LogisticRegression.\n\n .. versionadded:: 0.18\n *loss='multinomial'*\n\n alpha : float, default=1.\n L2 regularization term in the objective function\n ``(0.5 * alpha * || W ||_F^2)``.\n\n beta : float, default=0.\n L1 regularization term in the objective function\n ``(beta * || W ||_1)``. Only applied if ``is_saga`` is set to True.\n\n max_iter : int, default=1000\n The max number of passes over the training data if the stopping\n criteria is not reached.\n\n tol : double, default=0.001\n The stopping criteria for the weights. 
The iterations will stop when\n max(change in weights) / max(weights) < tol.\n\n verbose : int, default=0\n The verbosity level.\n\n random_state : int, RandomState instance or None, default=None\n Used when shuffling the data. Pass an int for reproducible output\n across multiple function calls.\n See :term:`Glossary `.\n\n check_input : bool, default=True\n If False, the input arrays X and y will not be checked.\n\n max_squared_sum : float, default=None\n Maximum squared sum of X over samples. If None, it will be computed,\n going through all the samples. The value should be precomputed\n to speed up cross validation.\n\n warm_start_mem : dict, default=None\n The initialization parameters used for warm starting. Warm starting is\n currently used in LogisticRegression but not in Ridge.\n It contains:\n - 'coef': the weight vector, with the intercept in last line\n if the intercept is fitted.\n - 'gradient_memory': the scalar gradient for all seen samples.\n - 'sum_gradient': the sum of gradient over all seen samples,\n for each feature.\n - 'intercept_sum_gradient': the sum of gradient over all seen\n samples, for the intercept.\n - 'seen': array of boolean describing the seen samples.\n - 'num_seen': the number of seen samples.\n\n is_saga : bool, default=False\n Whether to use the SAGA algorithm or the SAG algorithm. SAGA behaves\n better in the first epochs, and allow for l1 regularisation.\n\n Returns\n -------\n coef_ : ndarray of shape (n_features,)\n Weight vector.\n\n n_iter_ : int\n The number of full pass on all samples.\n\n warm_start_mem : dict\n Contains a 'coef' key with the fitted result, and possibly the\n fitted intercept at the end of the array. Contains also other keys\n used for warm starting.\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn import linear_model\n >>> n_samples, n_features = 10, 5\n >>> rng = np.random.RandomState(0)\n >>> X = rng.randn(n_samples, n_features)\n >>> y = rng.randn(n_samples)\n >>> clf = linear_model.Ridge(solver='sag')\n >>> clf.fit(X, y)\n Ridge(solver='sag')\n\n >>> X = np.array([[-1, -1], [-2, -1], [1, 1], [2, 1]])\n >>> y = np.array([1, 1, 2, 2])\n >>> clf = linear_model.LogisticRegression(\n ... solver='sag', multi_class='multinomial')\n >>> clf.fit(X, y)\n LogisticRegression(multi_class='multinomial', solver='sag')\n\n References\n ----------\n Schmidt, M., Roux, N. L., & Bach, F. (2013).\n Minimizing finite sums with the stochastic average gradient\n https://hal.inria.fr/hal-00860051/document\n\n Defazio, A., Bach F. & Lacoste-Julien S. 
(2014).\n SAGA: A Fast Incremental Gradient Method With Support\n for Non-Strongly Convex Composite Objectives\n https://arxiv.org/abs/1407.0202\n\n See Also\n --------\n Ridge, SGDRegressor, ElasticNet, Lasso, SVR,\n LogisticRegression, SGDClassifier, LinearSVC, Perceptron\n \"\"\"\n if warm_start_mem is None:\n warm_start_mem = {}\n if max_iter is None:\n max_iter = 1000\n if check_input:\n _dtype = [np.float64, np.float32]\n X = check_array(X, dtype=_dtype, accept_sparse='csr', order='C')\n y = check_array(y, dtype=_dtype, ensure_2d=False, order='C')\n (n_samples, n_features) = (X.shape[0], X.shape[1])\n alpha_scaled = float(alpha) / n_samples\n beta_scaled = float(beta) / n_samples\n n_classes = int(y.max()) + 1 if loss == 'multinomial' else 1\n sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype)\n if 'coef' in warm_start_mem.keys():\n coef_init = warm_start_mem['coef']\n else:\n coef_init = np.zeros((n_features, n_classes), dtype=X.dtype, order='C')\n fit_intercept = coef_init.shape[0] == n_features + 1\n if fit_intercept:\n intercept_init = coef_init[-1, :]\n coef_init = coef_init[:-1, :]\n else:\n intercept_init = np.zeros(n_classes, dtype=X.dtype)\n if 'intercept_sum_gradient' in warm_start_mem.keys():\n intercept_sum_gradient = warm_start_mem['intercept_sum_gradient']\n else:\n intercept_sum_gradient = np.zeros(n_classes, dtype=X.dtype)\n if 'gradient_memory' in warm_start_mem.keys():\n gradient_memory_init = warm_start_mem['gradient_memory']\n else:\n gradient_memory_init = np.zeros((n_samples, n_classes), dtype=X.dtype, order='C')\n if 'sum_gradient' in warm_start_mem.keys():\n sum_gradient_init = warm_start_mem['sum_gradient']\n else:\n sum_gradient_init = np.zeros((n_features, n_classes), dtype=X.dtype, order='C')\n if 'seen' in warm_start_mem.keys():\n seen_init = warm_start_mem['seen']\n else:\n seen_init = np.zeros(n_samples, dtype=np.int32, order='C')\n if 'num_seen' in warm_start_mem.keys():\n num_seen_init = warm_start_mem['num_seen']\n else:\n num_seen_init = 0\n (dataset, intercept_decay) = make_dataset(X, y, sample_weight, random_state)\n if max_squared_sum is None:\n max_squared_sum = row_norms(X, squared=True).max()\n step_size = get_auto_step_size(max_squared_sum, alpha_scaled, loss, fit_intercept, n_samples=n_samples, is_saga=is_saga)\n if step_size * alpha_scaled == 1:\n raise ZeroDivisionError('Current sag implementation does not handle the case step_size * alpha_scaled == 1')\n sag = sag64 if X.dtype == np.float64 else sag32\n (num_seen, n_iter_) = sag(dataset, coef_init, intercept_init, n_samples, n_features, n_classes, tol, max_iter, loss, step_size, alpha_scaled, beta_scaled, sum_gradient_init, gradient_memory_init, seen_init, num_seen_init, fit_intercept, intercept_sum_gradient, intercept_decay, is_saga, verbose)\n if n_iter_ == max_iter:\n warnings.warn('The max_iter was reached which means the coef_ did not converge', ConvergenceWarning)\n if fit_intercept:\n coef_init = np.vstack((coef_init, intercept_init))\n warm_start_mem = {'coef': coef_init, 'sum_gradient': sum_gradient_init, 'intercept_sum_gradient': intercept_sum_gradient, 'gradient_memory': gradient_memory_init, 'seen': seen_init, 'num_seen': num_seen}\n if loss == 'multinomial':\n coef_ = coef_init.T\n else:\n coef_ = coef_init[:, 0]\n return coef_, n_iter_, warm_start_mem" + "description": "SAG solver for Ridge and LogisticRegression.\n\nSAG stands for Stochastic Average Gradient: the gradient of the loss is\nestimated each sample at a time and the model is updated along the 
way with\na constant learning rate.\n\nIMPORTANT NOTE: 'sag' solver converges faster on columns that are on the\nsame scale. You can normalize the data by using\nsklearn.preprocessing.StandardScaler on your data before passing it to the\nfit method.\n\nThis implementation works with data represented as dense numpy arrays or\nsparse scipy arrays of floating point values for the features. It will\nfit the data according to squared loss or log loss.\n\nThe regularizer is a penalty added to the loss function that shrinks model\nparameters towards the zero vector using the squared euclidean norm L2.\n\n.. versionadded:: 0.17", + "docstring": "SAG solver for Ridge and LogisticRegression.\n\n SAG stands for Stochastic Average Gradient: the gradient of the loss is\n estimated each sample at a time and the model is updated along the way with\n a constant learning rate.\n\n IMPORTANT NOTE: 'sag' solver converges faster on columns that are on the\n same scale. You can normalize the data by using\n sklearn.preprocessing.StandardScaler on your data before passing it to the\n fit method.\n\n This implementation works with data represented as dense numpy arrays or\n sparse scipy arrays of floating point values for the features. It will\n fit the data according to squared loss or log loss.\n\n The regularizer is a penalty added to the loss function that shrinks model\n parameters towards the zero vector using the squared euclidean norm L2.\n\n .. versionadded:: 0.17\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training data.\n\n y : ndarray of shape (n_samples,)\n Target values. With loss='multinomial', y must be label encoded\n (see preprocessing.LabelEncoder).\n\n sample_weight : array-like of shape (n_samples,), default=None\n Weights applied to individual samples (1. for unweighted).\n\n loss : {'log', 'squared', 'multinomial'}, default='log'\n Loss function that will be optimized:\n -'log' is the binary logistic loss, as used in LogisticRegression.\n -'squared' is the squared loss, as used in Ridge.\n -'multinomial' is the multinomial logistic loss, as used in\n LogisticRegression.\n\n .. versionadded:: 0.18\n *loss='multinomial'*\n\n alpha : float, default=1.\n L2 regularization term in the objective function\n ``(0.5 * alpha * || W ||_F^2)``.\n\n beta : float, default=0.\n L1 regularization term in the objective function\n ``(beta * || W ||_1)``. Only applied if ``is_saga`` is set to True.\n\n max_iter : int, default=1000\n The max number of passes over the training data if the stopping\n criteria is not reached.\n\n tol : float, default=0.001\n The stopping criteria for the weights. The iterations will stop when\n max(change in weights) / max(weights) < tol.\n\n verbose : int, default=0\n The verbosity level.\n\n random_state : int, RandomState instance or None, default=None\n Used when shuffling the data. Pass an int for reproducible output\n across multiple function calls.\n See :term:`Glossary `.\n\n check_input : bool, default=True\n If False, the input arrays X and y will not be checked.\n\n max_squared_sum : float, default=None\n Maximum squared sum of X over samples. If None, it will be computed,\n going through all the samples. The value should be precomputed\n to speed up cross validation.\n\n warm_start_mem : dict, default=None\n The initialization parameters used for warm starting. 
Warm starting is\n currently used in LogisticRegression but not in Ridge.\n It contains:\n - 'coef': the weight vector, with the intercept in last line\n if the intercept is fitted.\n - 'gradient_memory': the scalar gradient for all seen samples.\n - 'sum_gradient': the sum of gradient over all seen samples,\n for each feature.\n - 'intercept_sum_gradient': the sum of gradient over all seen\n samples, for the intercept.\n - 'seen': array of boolean describing the seen samples.\n - 'num_seen': the number of seen samples.\n\n is_saga : bool, default=False\n Whether to use the SAGA algorithm or the SAG algorithm. SAGA behaves\n better in the first epochs, and allow for l1 regularisation.\n\n Returns\n -------\n coef_ : ndarray of shape (n_features,)\n Weight vector.\n\n n_iter_ : int\n The number of full pass on all samples.\n\n warm_start_mem : dict\n Contains a 'coef' key with the fitted result, and possibly the\n fitted intercept at the end of the array. Contains also other keys\n used for warm starting.\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn import linear_model\n >>> n_samples, n_features = 10, 5\n >>> rng = np.random.RandomState(0)\n >>> X = rng.randn(n_samples, n_features)\n >>> y = rng.randn(n_samples)\n >>> clf = linear_model.Ridge(solver='sag')\n >>> clf.fit(X, y)\n Ridge(solver='sag')\n\n >>> X = np.array([[-1, -1], [-2, -1], [1, 1], [2, 1]])\n >>> y = np.array([1, 1, 2, 2])\n >>> clf = linear_model.LogisticRegression(\n ... solver='sag', multi_class='multinomial')\n >>> clf.fit(X, y)\n LogisticRegression(multi_class='multinomial', solver='sag')\n\n References\n ----------\n Schmidt, M., Roux, N. L., & Bach, F. (2013).\n Minimizing finite sums with the stochastic average gradient\n https://hal.inria.fr/hal-00860051/document\n\n Defazio, A., Bach F. & Lacoste-Julien S. (2014).\n SAGA: A Fast Incremental Gradient Method With Support\n for Non-Strongly Convex Composite Objectives\n https://arxiv.org/abs/1407.0202\n\n See Also\n --------\n Ridge, SGDRegressor, ElasticNet, Lasso, SVR,\n LogisticRegression, SGDClassifier, LinearSVC, Perceptron\n ", + "source_code": "\ndef sag_solver(X, y, sample_weight=None, loss='log', alpha=1.0, beta=0.0, max_iter=1000, tol=0.001, verbose=0, random_state=None, check_input=True, max_squared_sum=None, warm_start_mem=None, is_saga=False):\n \"\"\"SAG solver for Ridge and LogisticRegression.\n\n SAG stands for Stochastic Average Gradient: the gradient of the loss is\n estimated each sample at a time and the model is updated along the way with\n a constant learning rate.\n\n IMPORTANT NOTE: 'sag' solver converges faster on columns that are on the\n same scale. You can normalize the data by using\n sklearn.preprocessing.StandardScaler on your data before passing it to the\n fit method.\n\n This implementation works with data represented as dense numpy arrays or\n sparse scipy arrays of floating point values for the features. It will\n fit the data according to squared loss or log loss.\n\n The regularizer is a penalty added to the loss function that shrinks model\n parameters towards the zero vector using the squared euclidean norm L2.\n\n .. versionadded:: 0.17\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training data.\n\n y : ndarray of shape (n_samples,)\n Target values. With loss='multinomial', y must be label encoded\n (see preprocessing.LabelEncoder).\n\n sample_weight : array-like of shape (n_samples,), default=None\n Weights applied to individual samples (1. 
for unweighted).\n\n loss : {'log', 'squared', 'multinomial'}, default='log'\n Loss function that will be optimized:\n -'log' is the binary logistic loss, as used in LogisticRegression.\n -'squared' is the squared loss, as used in Ridge.\n -'multinomial' is the multinomial logistic loss, as used in\n LogisticRegression.\n\n .. versionadded:: 0.18\n *loss='multinomial'*\n\n alpha : float, default=1.\n L2 regularization term in the objective function\n ``(0.5 * alpha * || W ||_F^2)``.\n\n beta : float, default=0.\n L1 regularization term in the objective function\n ``(beta * || W ||_1)``. Only applied if ``is_saga`` is set to True.\n\n max_iter : int, default=1000\n The max number of passes over the training data if the stopping\n criteria is not reached.\n\n tol : float, default=0.001\n The stopping criteria for the weights. The iterations will stop when\n max(change in weights) / max(weights) < tol.\n\n verbose : int, default=0\n The verbosity level.\n\n random_state : int, RandomState instance or None, default=None\n Used when shuffling the data. Pass an int for reproducible output\n across multiple function calls.\n See :term:`Glossary `.\n\n check_input : bool, default=True\n If False, the input arrays X and y will not be checked.\n\n max_squared_sum : float, default=None\n Maximum squared sum of X over samples. If None, it will be computed,\n going through all the samples. The value should be precomputed\n to speed up cross validation.\n\n warm_start_mem : dict, default=None\n The initialization parameters used for warm starting. Warm starting is\n currently used in LogisticRegression but not in Ridge.\n It contains:\n - 'coef': the weight vector, with the intercept in last line\n if the intercept is fitted.\n - 'gradient_memory': the scalar gradient for all seen samples.\n - 'sum_gradient': the sum of gradient over all seen samples,\n for each feature.\n - 'intercept_sum_gradient': the sum of gradient over all seen\n samples, for the intercept.\n - 'seen': array of boolean describing the seen samples.\n - 'num_seen': the number of seen samples.\n\n is_saga : bool, default=False\n Whether to use the SAGA algorithm or the SAG algorithm. SAGA behaves\n better in the first epochs, and allow for l1 regularisation.\n\n Returns\n -------\n coef_ : ndarray of shape (n_features,)\n Weight vector.\n\n n_iter_ : int\n The number of full pass on all samples.\n\n warm_start_mem : dict\n Contains a 'coef' key with the fitted result, and possibly the\n fitted intercept at the end of the array. Contains also other keys\n used for warm starting.\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn import linear_model\n >>> n_samples, n_features = 10, 5\n >>> rng = np.random.RandomState(0)\n >>> X = rng.randn(n_samples, n_features)\n >>> y = rng.randn(n_samples)\n >>> clf = linear_model.Ridge(solver='sag')\n >>> clf.fit(X, y)\n Ridge(solver='sag')\n\n >>> X = np.array([[-1, -1], [-2, -1], [1, 1], [2, 1]])\n >>> y = np.array([1, 1, 2, 2])\n >>> clf = linear_model.LogisticRegression(\n ... solver='sag', multi_class='multinomial')\n >>> clf.fit(X, y)\n LogisticRegression(multi_class='multinomial', solver='sag')\n\n References\n ----------\n Schmidt, M., Roux, N. L., & Bach, F. (2013).\n Minimizing finite sums with the stochastic average gradient\n https://hal.inria.fr/hal-00860051/document\n\n Defazio, A., Bach F. & Lacoste-Julien S. 
(2014).\n SAGA: A Fast Incremental Gradient Method With Support\n for Non-Strongly Convex Composite Objectives\n https://arxiv.org/abs/1407.0202\n\n See Also\n --------\n Ridge, SGDRegressor, ElasticNet, Lasso, SVR,\n LogisticRegression, SGDClassifier, LinearSVC, Perceptron\n \"\"\"\n if warm_start_mem is None:\n warm_start_mem = {}\n if max_iter is None:\n max_iter = 1000\n if check_input:\n _dtype = [np.float64, np.float32]\n X = check_array(X, dtype=_dtype, accept_sparse='csr', order='C')\n y = check_array(y, dtype=_dtype, ensure_2d=False, order='C')\n (n_samples, n_features) = (X.shape[0], X.shape[1])\n alpha_scaled = float(alpha) / n_samples\n beta_scaled = float(beta) / n_samples\n n_classes = int(y.max()) + 1 if loss == 'multinomial' else 1\n sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype)\n if 'coef' in warm_start_mem.keys():\n coef_init = warm_start_mem['coef']\n else:\n coef_init = np.zeros((n_features, n_classes), dtype=X.dtype, order='C')\n fit_intercept = coef_init.shape[0] == n_features + 1\n if fit_intercept:\n intercept_init = coef_init[-1, :]\n coef_init = coef_init[:-1, :]\n else:\n intercept_init = np.zeros(n_classes, dtype=X.dtype)\n if 'intercept_sum_gradient' in warm_start_mem.keys():\n intercept_sum_gradient = warm_start_mem['intercept_sum_gradient']\n else:\n intercept_sum_gradient = np.zeros(n_classes, dtype=X.dtype)\n if 'gradient_memory' in warm_start_mem.keys():\n gradient_memory_init = warm_start_mem['gradient_memory']\n else:\n gradient_memory_init = np.zeros((n_samples, n_classes), dtype=X.dtype, order='C')\n if 'sum_gradient' in warm_start_mem.keys():\n sum_gradient_init = warm_start_mem['sum_gradient']\n else:\n sum_gradient_init = np.zeros((n_features, n_classes), dtype=X.dtype, order='C')\n if 'seen' in warm_start_mem.keys():\n seen_init = warm_start_mem['seen']\n else:\n seen_init = np.zeros(n_samples, dtype=np.int32, order='C')\n if 'num_seen' in warm_start_mem.keys():\n num_seen_init = warm_start_mem['num_seen']\n else:\n num_seen_init = 0\n (dataset, intercept_decay) = make_dataset(X, y, sample_weight, random_state)\n if max_squared_sum is None:\n max_squared_sum = row_norms(X, squared=True).max()\n step_size = get_auto_step_size(max_squared_sum, alpha_scaled, loss, fit_intercept, n_samples=n_samples, is_saga=is_saga)\n if step_size * alpha_scaled == 1:\n raise ZeroDivisionError('Current sag implementation does not handle the case step_size * alpha_scaled == 1')\n sag = sag64 if X.dtype == np.float64 else sag32\n (num_seen, n_iter_) = sag(dataset, coef_init, intercept_init, n_samples, n_features, n_classes, tol, max_iter, loss, step_size, alpha_scaled, beta_scaled, sum_gradient_init, gradient_memory_init, seen_init, num_seen_init, fit_intercept, intercept_sum_gradient, intercept_decay, is_saga, verbose)\n if n_iter_ == max_iter:\n warnings.warn('The max_iter was reached which means the coef_ did not converge', ConvergenceWarning)\n if fit_intercept:\n coef_init = np.vstack((coef_init, intercept_init))\n warm_start_mem = {'coef': coef_init, 'sum_gradient': sum_gradient_init, 'intercept_sum_gradient': intercept_sum_gradient, 'gradient_memory': gradient_memory_init, 'seen': seen_init, 'num_seen': num_seen}\n if loss == 'multinomial':\n coef_ = coef_init.T\n else:\n coef_ = coef_init[:, 0]\n return coef_, n_iter_, warm_start_mem" }, { "name": "__init__", @@ -109550,7 +117444,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "loss", @@ -109560,7 +117455,8 @@ "docstring": { "type": "", 
"description": "" - } + }, + "refined_type": {} }, { "name": "penalty", @@ -109570,7 +117466,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "alpha", @@ -109580,7 +117477,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "C", @@ -109590,7 +117488,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "l1_ratio", @@ -109600,7 +117499,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "fit_intercept", @@ -109610,7 +117510,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "max_iter", @@ -109620,7 +117521,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "tol", @@ -109630,7 +117532,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "shuffle", @@ -109640,7 +117543,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "verbose", @@ -109650,7 +117554,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "epsilon", @@ -109660,7 +117565,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "random_state", @@ -109670,7 +117576,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "learning_rate", @@ -109680,7 +117587,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "eta0", @@ -109690,7 +117598,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "power_t", @@ -109700,7 +117609,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "early_stopping", @@ -109710,7 +117620,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "validation_fraction", @@ -109720,7 +117631,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_iter_no_change", @@ -109730,7 +117642,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "warm_start", @@ -109740,7 +117653,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "average", @@ -109750,13 +117664,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, loss, *, penalty='l2', alpha=0.0001, C=1.0, l1_ratio=0.15, fit_intercept=True, max_iter=1000, tol=0.001, shuffle=True, verbose=0, epsilon=0.1, random_state=None, learning_rate='optimal', eta0=0.0, power_t=0.5, early_stopping=False, validation_fraction=0.1, n_iter_no_change=5, warm_start=False, average=False):\n self.loss = loss\n self.penalty = penalty\n self.learning_rate = learning_rate\n self.epsilon = epsilon\n self.alpha = alpha\n self.C = C\n self.l1_ratio = l1_ratio\n self.fit_intercept = fit_intercept\n self.shuffle = shuffle\n self.random_state = random_state\n self.verbose = verbose\n self.eta0 = eta0\n self.power_t = power_t\n self.early_stopping = early_stopping\n self.validation_fraction = validation_fraction\n self.n_iter_no_change = n_iter_no_change\n self.warm_start = warm_start\n self.average = average\n self.max_iter = max_iter\n self.tol = tol" }, { @@ -109774,7 +117689,8 @@ "docstring": { "type": "", "description": "" - } + 
}, + "refined_type": {} }, { "name": "n_classes", @@ -109784,7 +117700,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_features", @@ -109794,7 +117711,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "coef_init", @@ -109804,7 +117722,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "intercept_init", @@ -109814,7 +117733,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "one_class", @@ -109824,7 +117744,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -109848,7 +117769,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "learning_rate", @@ -109858,13 +117780,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _get_learning_rate_type(self, learning_rate):\n try:\n return LEARNING_RATE_TYPES[learning_rate]\n except KeyError as e:\n raise ValueError('learning rate %s is not supported. ' % learning_rate) from e" }, { @@ -109882,7 +117805,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "loss", @@ -109892,7 +117816,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -109916,7 +117841,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "penalty", @@ -109926,13 +117852,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _get_penalty_type(self, penalty):\n penalty = str(penalty).lower()\n try:\n return PENALTY_TYPES[penalty]\n except KeyError as e:\n raise ValueError('Penalty %s is not supported. ' % penalty) from e" }, { @@ -109950,7 +117877,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "validation_mask", @@ -109960,7 +117888,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -109970,7 +117899,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -109980,7 +117910,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -109990,7 +117921,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "classes", @@ -110000,13 +117932,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _make_validation_score_cb(self, validation_mask, X, y, sample_weight, classes=None):\n if not self.early_stopping:\n return None\n return _ValidationScoreCallback(self, X[validation_mask], y[validation_mask], sample_weight[validation_mask], classes=classes)" }, { @@ -110024,7 +117957,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -110034,13 +117968,14 @@ "docstring": { "type": "ndarray of shape (n_samples, )", "description": "Target values." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Split the dataset between training set and validation set.", - "docstring": "Split the dataset between training set and validation set.\n\nParameters\n----------\ny : ndarray of shape (n_samples, )\n Target values.\n\nReturns\n-------\nvalidation_mask : ndarray of shape (n_samples, )\n Equal to 1 on the validation set, 0 on the training set.", + "docstring": "Split the dataset between training set and validation set.\n\n Parameters\n ----------\n y : ndarray of shape (n_samples, )\n Target values.\n\n Returns\n -------\n validation_mask : ndarray of shape (n_samples, )\n Equal to 1 on the validation set, 0 on the training set.\n ", "source_code": "\ndef _make_validation_split(self, y):\n \"\"\"Split the dataset between training set and validation set.\n\n Parameters\n ----------\n y : ndarray of shape (n_samples, )\n Target values.\n\n Returns\n -------\n validation_mask : ndarray of shape (n_samples, )\n Equal to 1 on the validation set, 0 on the training set.\n \"\"\"\n n_samples = y.shape[0]\n validation_mask = np.zeros(n_samples, dtype=np.uint8)\n if not self.early_stopping:\n return validation_mask\n if is_classifier(self):\n splitter_type = StratifiedShuffleSplit\n else:\n splitter_type = ShuffleSplit\n cv = splitter_type(test_size=self.validation_fraction, random_state=self.random_state)\n (idx_train, idx_val) = next(cv.split(np.zeros(shape=(y.shape[0], 1)), y))\n if idx_train.shape[0] == 0 or idx_val.shape[0] == 0:\n raise ValueError('Splitting %d samples into a train set and a validation set with validation_fraction=%r led to an empty set (%d and %d samples). Please either change validation_fraction, increase number of samples, or disable early_stopping.' % (n_samples, self.validation_fraction, idx_train.shape[0], idx_val.shape[0]))\n validation_mask[idx_val] = 1\n return validation_mask" }, { @@ -110058,7 +117993,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "for_partial_fit", @@ -110068,7 +118004,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -110092,7 +118029,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -110102,7 +118040,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -110112,7 +118051,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -110136,7 +118076,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "loss", @@ -110146,7 +118087,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "penalty", @@ -110156,7 +118098,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "alpha", @@ -110166,7 +118109,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "l1_ratio", @@ -110176,7 +118120,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "fit_intercept", @@ -110186,7 +118131,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "max_iter", @@ -110196,7 +118142,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "tol", @@ -110206,7 +118153,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "shuffle", @@ -110216,7 +118164,8 
@@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "verbose", @@ -110226,7 +118175,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "epsilon", @@ -110236,7 +118186,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_jobs", @@ -110246,7 +118197,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "random_state", @@ -110256,7 +118208,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "learning_rate", @@ -110266,7 +118219,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "eta0", @@ -110276,7 +118230,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "power_t", @@ -110286,7 +118241,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "early_stopping", @@ -110296,7 +118252,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "validation_fraction", @@ -110306,7 +118263,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_iter_no_change", @@ -110316,7 +118274,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "class_weight", @@ -110326,7 +118285,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "warm_start", @@ -110336,7 +118296,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "average", @@ -110346,13 +118307,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@abstractmethod\ndef __init__(self, loss='hinge', *, penalty='l2', alpha=0.0001, l1_ratio=0.15, fit_intercept=True, max_iter=1000, tol=0.001, shuffle=True, verbose=0, epsilon=DEFAULT_EPSILON, n_jobs=None, random_state=None, learning_rate='optimal', eta0=0.0, power_t=0.5, early_stopping=False, validation_fraction=0.1, n_iter_no_change=5, class_weight=None, warm_start=False, average=False):\n super().__init__(loss=loss, penalty=penalty, alpha=alpha, l1_ratio=l1_ratio, fit_intercept=fit_intercept, max_iter=max_iter, tol=tol, shuffle=shuffle, verbose=verbose, epsilon=epsilon, random_state=random_state, learning_rate=learning_rate, eta0=eta0, power_t=power_t, early_stopping=early_stopping, validation_fraction=validation_fraction, n_iter_no_change=n_iter_no_change, warm_start=warm_start, average=average)\n self.class_weight = class_weight\n self.n_jobs = n_jobs" }, { @@ -110370,7 +118332,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -110380,7 +118343,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -110390,7 +118354,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "alpha", @@ -110400,7 +118365,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "C", @@ -110410,7 +118376,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "loss", @@ -110420,7 +118387,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "learning_rate", @@ -110430,7 +118398,8 @@ "docstring": { "type": "", "description": "" - } + }, + 
"refined_type": {} }, { "name": "coef_init", @@ -110440,7 +118409,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "intercept_init", @@ -110450,7 +118420,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -110460,13 +118431,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _fit(self, X, y, alpha, C, loss, learning_rate, coef_init=None, intercept_init=None, sample_weight=None):\n self._validate_params()\n if hasattr(self, 'classes_'):\n delattr(self, 'classes_')\n y = self._validate_data(y=y)\n classes = np.unique(y)\n if self.warm_start and hasattr(self, 'coef_'):\n if coef_init is None:\n coef_init = self.coef_\n if intercept_init is None:\n intercept_init = self.intercept_\n else:\n self.coef_ = None\n self.intercept_ = None\n if self.average > 0:\n self._standard_coef = self.coef_\n self._standard_intercept = self.intercept_\n self._average_coef = None\n self._average_intercept = None\n self.t_ = 1.0\n self._partial_fit(X, y, alpha, C, loss, learning_rate, self.max_iter, classes, sample_weight, coef_init, intercept_init)\n if self.tol is not None and self.tol > -np.inf and self.n_iter_ == self.max_iter:\n warnings.warn('Maximum number of iteration reached before convergence. Consider increasing max_iter to improve the fit.', ConvergenceWarning)\n return self" }, { @@ -110484,7 +118456,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -110494,7 +118467,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -110504,7 +118478,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "alpha", @@ -110514,7 +118489,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "C", @@ -110524,7 +118500,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -110534,7 +118511,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "learning_rate", @@ -110544,7 +118522,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "max_iter", @@ -110554,7 +118533,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -110578,7 +118558,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -110588,7 +118569,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -110598,7 +118580,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "alpha", @@ -110608,7 +118591,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "C", @@ -110618,7 +118602,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "learning_rate", @@ -110628,7 +118613,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -110638,7 +118624,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "max_iter", @@ -110648,13 +118635,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - 
"description": "Fit a multi-class classifier by combining binary classifiers\n\nEach binary classifier predicts one class versus all others. This strategy is called OvA (One versus All) or OvR (One versus Rest).", - "docstring": "Fit a multi-class classifier by combining binary classifiers\n\nEach binary classifier predicts one class versus all others. This\nstrategy is called OvA (One versus All) or OvR (One versus Rest).", + "description": "Fit a multi-class classifier by combining binary classifiers\n\nEach binary classifier predicts one class versus all others. This\nstrategy is called OvA (One versus All) or OvR (One versus Rest).", + "docstring": "Fit a multi-class classifier by combining binary classifiers\n\n Each binary classifier predicts one class versus all others. This\n strategy is called OvA (One versus All) or OvR (One versus Rest).\n ", "source_code": "\ndef _fit_multiclass(self, X, y, alpha, C, learning_rate, sample_weight, max_iter):\n \"\"\"Fit a multi-class classifier by combining binary classifiers\n\n Each binary classifier predicts one class versus all others. This\n strategy is called OvA (One versus All) or OvR (One versus Rest).\n \"\"\"\n validation_mask = self._make_validation_split(y)\n random_state = check_random_state(self.random_state)\n seeds = random_state.randint(MAX_INT, size=len(self.classes_))\n result = Parallel(n_jobs=self.n_jobs, verbose=self.verbose, **_joblib_parallel_args(require='sharedmem'))((delayed(fit_binary)(self, i, X, y, alpha, C, learning_rate, max_iter, self._expanded_class_weight[i], 1.0, sample_weight, validation_mask=validation_mask, random_state=seed) for (i, seed) in enumerate(seeds)))\n n_iter_ = 0.0\n for (i, (_, intercept, n_iter_i)) in enumerate(result):\n self.intercept_[i] = intercept\n n_iter_ = max(n_iter_, n_iter_i)\n self.t_ += n_iter_ * X.shape[0]\n self.n_iter_ = n_iter_\n if self.average > 0:\n if self.average <= self.t_ - 1.0:\n self.coef_ = self._average_coef\n self.intercept_ = self._average_intercept\n else:\n self.coef_ = self._standard_coef\n self._standard_intercept = np.atleast_1d(self.intercept_)\n self.intercept_ = self._standard_intercept" }, { @@ -110672,7 +118660,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -110682,7 +118671,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -110692,7 +118682,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "alpha", @@ -110702,7 +118693,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "C", @@ -110712,7 +118704,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "loss", @@ -110722,7 +118715,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "learning_rate", @@ -110732,7 +118726,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "max_iter", @@ -110742,7 +118737,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "classes", @@ -110752,7 +118748,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -110762,7 +118759,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "coef_init", @@ -110772,7 +118770,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "intercept_init", @@ -110782,13 
+118781,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _partial_fit(self, X, y, alpha, C, loss, learning_rate, max_iter, classes, sample_weight, coef_init, intercept_init):\n first_call = not hasattr(self, 'classes_')\n (X, y) = self._validate_data(X, y, accept_sparse='csr', dtype=np.float64, order='C', accept_large_sparse=False, reset=first_call)\n (n_samples, n_features) = X.shape\n _check_partial_fit_first_call(self, classes)\n n_classes = self.classes_.shape[0]\n self._expanded_class_weight = compute_class_weight(self.class_weight, classes=self.classes_, y=y)\n sample_weight = _check_sample_weight(sample_weight, X)\n if getattr(self, 'coef_', None) is None or coef_init is not None:\n self._allocate_parameter_mem(n_classes, n_features, coef_init, intercept_init)\n elif n_features != self.coef_.shape[-1]:\n raise ValueError('Number of features %d does not match previous data %d.' % (n_features, self.coef_.shape[-1]))\n self.loss_function_ = self._get_loss_function(loss)\n if not hasattr(self, 't_'):\n self.t_ = 1.0\n if n_classes > 2:\n self._fit_multiclass(X, y, alpha=alpha, C=C, learning_rate=learning_rate, sample_weight=sample_weight, max_iter=max_iter)\n elif n_classes == 2:\n self._fit_binary(X, y, alpha=alpha, C=C, learning_rate=learning_rate, sample_weight=sample_weight, max_iter=max_iter)\n else:\n raise ValueError('The number of classes has to be greater than one; got %d class' % n_classes)\n return self" }, { @@ -110806,7 +118806,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -110816,6 +118817,10 @@ "docstring": { "type": "{array-like, sparse matrix}, shape (n_samples, n_features)", "description": "Training data." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -110826,7 +118831,8 @@ "docstring": { "type": "ndarray of shape (n_samples,)", "description": "Target values." - } + }, + "refined_type": {} }, { "name": "coef_init", @@ -110836,7 +118842,8 @@ "docstring": { "type": "ndarray of shape (n_classes, n_features), default=None", "description": "The initial coefficients to warm-start the optimization." - } + }, + "refined_type": {} }, { "name": "intercept_init", @@ -110846,7 +118853,8 @@ "docstring": { "type": "ndarray of shape (n_classes,), default=None", "description": "The initial intercept to warm-start the optimization." - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -110856,13 +118864,14 @@ "docstring": { "type": "array-like, shape (n_samples,), default=None", "description": "Weights applied to individual samples.\nIf not provided, uniform weights are assumed. These weights will\nbe multiplied with class_weight (passed through the\nconstructor) if class_weight is specified." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Fit linear model with Stochastic Gradient Descent.", - "docstring": "Fit linear model with Stochastic Gradient Descent.\n\nParameters\n----------\nX : {array-like, sparse matrix}, shape (n_samples, n_features)\n Training data.\n\ny : ndarray of shape (n_samples,)\n Target values.\n\ncoef_init : ndarray of shape (n_classes, n_features), default=None\n The initial coefficients to warm-start the optimization.\n\nintercept_init : ndarray of shape (n_classes,), default=None\n The initial intercept to warm-start the optimization.\n\nsample_weight : array-like, shape (n_samples,), default=None\n Weights applied to individual samples.\n If not provided, uniform weights are assumed. These weights will\n be multiplied with class_weight (passed through the\n constructor) if class_weight is specified.\n\nReturns\n-------\nself : object\n Returns an instance of self.", + "docstring": "Fit linear model with Stochastic Gradient Descent.\n\n Parameters\n ----------\n X : {array-like, sparse matrix}, shape (n_samples, n_features)\n Training data.\n\n y : ndarray of shape (n_samples,)\n Target values.\n\n coef_init : ndarray of shape (n_classes, n_features), default=None\n The initial coefficients to warm-start the optimization.\n\n intercept_init : ndarray of shape (n_classes,), default=None\n The initial intercept to warm-start the optimization.\n\n sample_weight : array-like, shape (n_samples,), default=None\n Weights applied to individual samples.\n If not provided, uniform weights are assumed. These weights will\n be multiplied with class_weight (passed through the\n constructor) if class_weight is specified.\n\n Returns\n -------\n self : object\n Returns an instance of self.\n ", "source_code": "\ndef fit(self, X, y, coef_init=None, intercept_init=None, sample_weight=None):\n \"\"\"Fit linear model with Stochastic Gradient Descent.\n\n Parameters\n ----------\n X : {array-like, sparse matrix}, shape (n_samples, n_features)\n Training data.\n\n y : ndarray of shape (n_samples,)\n Target values.\n\n coef_init : ndarray of shape (n_classes, n_features), default=None\n The initial coefficients to warm-start the optimization.\n\n intercept_init : ndarray of shape (n_classes,), default=None\n The initial intercept to warm-start the optimization.\n\n sample_weight : array-like, shape (n_samples,), default=None\n Weights applied to individual samples.\n If not provided, uniform weights are assumed. These weights will\n be multiplied with class_weight (passed through the\n constructor) if class_weight is specified.\n\n Returns\n -------\n self : object\n Returns an instance of self.\n \"\"\"\n return self._fit(X, y, alpha=self.alpha, C=1.0, loss=self.loss, learning_rate=self.learning_rate, coef_init=coef_init, intercept_init=intercept_init, sample_weight=sample_weight)" }, { @@ -110880,7 +118889,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -110890,6 +118900,10 @@ "docstring": { "type": "{array-like, sparse matrix}, shape (n_samples, n_features)", "description": "Subset of the training data." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -110900,7 +118914,8 @@ "docstring": { "type": "ndarray of shape (n_samples,)", "description": "Subset of the target values." 
- } + }, + "refined_type": {} }, { "name": "classes", @@ -110910,7 +118925,8 @@ "docstring": { "type": "ndarray of shape (n_classes,), default=None", "description": "Classes across all calls to partial_fit.\nCan be obtained by via `np.unique(y_all)`, where y_all is the\ntarget vector of the entire dataset.\nThis argument is required for the first call to partial_fit\nand can be omitted in the subsequent calls.\nNote that y doesn't need to contain all labels in `classes`." - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -110920,13 +118936,14 @@ "docstring": { "type": "array-like, shape (n_samples,), default=None", "description": "Weights applied to individual samples.\nIf not provided, uniform weights are assumed." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Perform one epoch of stochastic gradient descent on given samples.\n\nInternally, this method uses ``max_iter = 1``. Therefore, it is not guaranteed that a minimum of the cost function is reached after calling it once. Matters such as objective convergence, early stopping, and learning rate adjustments should be handled by the user.", - "docstring": "Perform one epoch of stochastic gradient descent on given samples.\n\nInternally, this method uses ``max_iter = 1``. Therefore, it is not\nguaranteed that a minimum of the cost function is reached after calling\nit once. Matters such as objective convergence, early stopping, and\nlearning rate adjustments should be handled by the user.\n\nParameters\n----------\nX : {array-like, sparse matrix}, shape (n_samples, n_features)\n Subset of the training data.\n\ny : ndarray of shape (n_samples,)\n Subset of the target values.\n\nclasses : ndarray of shape (n_classes,), default=None\n Classes across all calls to partial_fit.\n Can be obtained by via `np.unique(y_all)`, where y_all is the\n target vector of the entire dataset.\n This argument is required for the first call to partial_fit\n and can be omitted in the subsequent calls.\n Note that y doesn't need to contain all labels in `classes`.\n\nsample_weight : array-like, shape (n_samples,), default=None\n Weights applied to individual samples.\n If not provided, uniform weights are assumed.\n\nReturns\n-------\nself : object\n Returns an instance of self.", + "description": "Perform one epoch of stochastic gradient descent on given samples.\n\nInternally, this method uses ``max_iter = 1``. Therefore, it is not\nguaranteed that a minimum of the cost function is reached after calling\nit once. Matters such as objective convergence, early stopping, and\nlearning rate adjustments should be handled by the user.", + "docstring": "Perform one epoch of stochastic gradient descent on given samples.\n\n Internally, this method uses ``max_iter = 1``. Therefore, it is not\n guaranteed that a minimum of the cost function is reached after calling\n it once. 
Matters such as objective convergence, early stopping, and\n learning rate adjustments should be handled by the user.\n\n Parameters\n ----------\n X : {array-like, sparse matrix}, shape (n_samples, n_features)\n Subset of the training data.\n\n y : ndarray of shape (n_samples,)\n Subset of the target values.\n\n classes : ndarray of shape (n_classes,), default=None\n Classes across all calls to partial_fit.\n Can be obtained by via `np.unique(y_all)`, where y_all is the\n target vector of the entire dataset.\n This argument is required for the first call to partial_fit\n and can be omitted in the subsequent calls.\n Note that y doesn't need to contain all labels in `classes`.\n\n sample_weight : array-like, shape (n_samples,), default=None\n Weights applied to individual samples.\n If not provided, uniform weights are assumed.\n\n Returns\n -------\n self : object\n Returns an instance of self.\n ", "source_code": "\ndef partial_fit(self, X, y, classes=None, sample_weight=None):\n \"\"\"Perform one epoch of stochastic gradient descent on given samples.\n\n Internally, this method uses ``max_iter = 1``. Therefore, it is not\n guaranteed that a minimum of the cost function is reached after calling\n it once. Matters such as objective convergence, early stopping, and\n learning rate adjustments should be handled by the user.\n\n Parameters\n ----------\n X : {array-like, sparse matrix}, shape (n_samples, n_features)\n Subset of the training data.\n\n y : ndarray of shape (n_samples,)\n Subset of the target values.\n\n classes : ndarray of shape (n_classes,), default=None\n Classes across all calls to partial_fit.\n Can be obtained by via `np.unique(y_all)`, where y_all is the\n target vector of the entire dataset.\n This argument is required for the first call to partial_fit\n and can be omitted in the subsequent calls.\n Note that y doesn't need to contain all labels in `classes`.\n\n sample_weight : array-like, shape (n_samples,), default=None\n Weights applied to individual samples.\n If not provided, uniform weights are assumed.\n\n Returns\n -------\n self : object\n Returns an instance of self.\n \"\"\"\n self._validate_params(for_partial_fit=True)\n if self.class_weight in ['balanced']:\n raise ValueError(\"class_weight '{0}' is not supported for partial_fit. In order to use 'balanced' weights, use compute_class_weight('{0}', classes=classes, y=y). In place of y you can us a large enough sample of the full training set target to properly estimate the class frequency distributions. 
Pass the resulting weights as the class_weight parameter.\".format(self.class_weight))\n return self._partial_fit(X, y, alpha=self.alpha, C=1.0, loss=self.loss, learning_rate=self.learning_rate, max_iter=1, classes=classes, sample_weight=sample_weight, coef_init=None, intercept_init=None)" }, { @@ -110944,7 +118961,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "loss", @@ -110954,7 +118972,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "penalty", @@ -110964,7 +118983,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "alpha", @@ -110974,7 +118994,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "l1_ratio", @@ -110984,7 +119005,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "fit_intercept", @@ -110994,7 +119016,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "max_iter", @@ -111004,7 +119027,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "tol", @@ -111014,7 +119038,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "shuffle", @@ -111024,7 +119049,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "verbose", @@ -111034,7 +119060,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "epsilon", @@ -111044,7 +119071,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "random_state", @@ -111054,7 +119082,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "learning_rate", @@ -111064,7 +119093,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "eta0", @@ -111074,7 +119104,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "power_t", @@ -111084,7 +119115,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "early_stopping", @@ -111094,7 +119126,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "validation_fraction", @@ -111104,7 +119137,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_iter_no_change", @@ -111114,7 +119148,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "warm_start", @@ -111124,7 +119159,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "average", @@ -111134,13 +119170,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@abstractmethod\ndef __init__(self, loss='squared_error', *, penalty='l2', alpha=0.0001, l1_ratio=0.15, fit_intercept=True, max_iter=1000, tol=0.001, shuffle=True, verbose=0, epsilon=DEFAULT_EPSILON, random_state=None, learning_rate='invscaling', eta0=0.01, power_t=0.25, early_stopping=False, validation_fraction=0.1, n_iter_no_change=5, warm_start=False, average=False):\n super().__init__(loss=loss, penalty=penalty, alpha=alpha, l1_ratio=l1_ratio, fit_intercept=fit_intercept, max_iter=max_iter, tol=tol, shuffle=shuffle, verbose=verbose, epsilon=epsilon, random_state=random_state, learning_rate=learning_rate, 
eta0=eta0, power_t=power_t, early_stopping=early_stopping, validation_fraction=validation_fraction, n_iter_no_change=n_iter_no_change, warm_start=warm_start, average=average)" }, { @@ -111158,7 +119195,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -111168,13 +119206,17 @@ "docstring": { "type": "{array-like, sparse matrix}, shape (n_samples, n_features)", "description": "" + }, + "refined_type": { + "kind": "EnumType", + "values": [] } } ], "results": [], "is_public": false, "description": "Predict using the linear model", - "docstring": "Predict using the linear model\n\nParameters\n----------\nX : {array-like, sparse matrix}, shape (n_samples, n_features)\n\nReturns\n-------\nndarray of shape (n_samples,)\n Predicted target values per element in X.", + "docstring": "Predict using the linear model\n\n Parameters\n ----------\n X : {array-like, sparse matrix}, shape (n_samples, n_features)\n\n Returns\n -------\n ndarray of shape (n_samples,)\n Predicted target values per element in X.\n ", "source_code": "\ndef _decision_function(self, X):\n \"\"\"Predict using the linear model\n\n Parameters\n ----------\n X : {array-like, sparse matrix}, shape (n_samples, n_features)\n\n Returns\n -------\n ndarray of shape (n_samples,)\n Predicted target values per element in X.\n \"\"\"\n check_is_fitted(self)\n X = self._validate_data(X, accept_sparse='csr', reset=False)\n scores = safe_sparse_dot(X, self.coef_.T, dense_output=True) + self.intercept_\n return scores.ravel()" }, { @@ -111192,7 +119234,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -111202,7 +119245,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -111212,7 +119256,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "alpha", @@ -111222,7 +119267,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "C", @@ -111232,7 +119278,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "loss", @@ -111242,7 +119289,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "learning_rate", @@ -111252,7 +119300,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "coef_init", @@ -111262,7 +119311,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "intercept_init", @@ -111272,7 +119322,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -111282,13 +119333,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _fit(self, X, y, alpha, C, loss, learning_rate, coef_init=None, intercept_init=None, sample_weight=None):\n self._validate_params()\n if self.warm_start and getattr(self, 'coef_', None) is not None:\n if coef_init is None:\n coef_init = self.coef_\n if intercept_init is None:\n intercept_init = self.intercept_\n else:\n self.coef_ = None\n self.intercept_ = None\n self.t_ = 1.0\n self._partial_fit(X, y, alpha, C, loss, learning_rate, self.max_iter, sample_weight, coef_init, intercept_init)\n if self.tol is not None and self.tol > -np.inf and self.n_iter_ == self.max_iter:\n warnings.warn('Maximum number of iteration reached before convergence. 
Consider increasing max_iter to improve the fit.', ConvergenceWarning)\n return self" }, { @@ -111306,7 +119358,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -111316,7 +119369,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -111326,7 +119380,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "alpha", @@ -111336,7 +119391,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "C", @@ -111346,7 +119402,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "loss", @@ -111356,7 +119413,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "learning_rate", @@ -111366,7 +119424,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -111376,7 +119435,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "max_iter", @@ -111386,13 +119446,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _fit_regressor(self, X, y, alpha, C, loss, learning_rate, sample_weight, max_iter):\n (dataset, intercept_decay) = make_dataset(X, y, sample_weight)\n loss_function = self._get_loss_function(loss)\n penalty_type = self._get_penalty_type(self.penalty)\n learning_rate_type = self._get_learning_rate_type(learning_rate)\n if not hasattr(self, 't_'):\n self.t_ = 1.0\n validation_mask = self._make_validation_split(y)\n validation_score_cb = self._make_validation_score_cb(validation_mask, X, y, sample_weight)\n random_state = check_random_state(self.random_state)\n seed = random_state.randint(0, np.iinfo(np.int32).max)\n tol = self.tol if self.tol is not None else -np.inf\n if self.average:\n coef = self._standard_coef\n intercept = self._standard_intercept\n average_coef = self._average_coef\n average_intercept = self._average_intercept\n else:\n coef = self.coef_\n intercept = self.intercept_\n average_coef = None\n average_intercept = [0]\n (coef, intercept, average_coef, average_intercept, self.n_iter_) = _plain_sgd(coef, intercept[0], average_coef, average_intercept[0], loss_function, penalty_type, alpha, C, self.l1_ratio, dataset, validation_mask, self.early_stopping, validation_score_cb, int(self.n_iter_no_change), max_iter, tol, int(self.fit_intercept), int(self.verbose), int(self.shuffle), seed, 1.0, 1.0, learning_rate_type, self.eta0, self.power_t, 0, self.t_, intercept_decay, self.average)\n self.t_ += self.n_iter_ * X.shape[0]\n if self.average > 0:\n self._average_intercept = np.atleast_1d(average_intercept)\n self._standard_intercept = np.atleast_1d(intercept)\n if self.average <= self.t_ - 1.0:\n self.coef_ = average_coef\n self.intercept_ = np.atleast_1d(average_intercept)\n else:\n self.coef_ = coef\n self.intercept_ = np.atleast_1d(intercept)\n else:\n self.intercept_ = np.atleast_1d(intercept)" }, { @@ -111410,7 +119471,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -111420,7 +119482,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -111430,7 +119493,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "alpha", @@ -111440,7 +119504,8 @@ "docstring": { 
"type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "C", @@ -111450,7 +119515,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "loss", @@ -111460,7 +119526,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "learning_rate", @@ -111470,7 +119537,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "max_iter", @@ -111480,7 +119548,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -111490,7 +119559,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "coef_init", @@ -111500,7 +119570,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "intercept_init", @@ -111510,13 +119581,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _partial_fit(self, X, y, alpha, C, loss, learning_rate, max_iter, sample_weight, coef_init, intercept_init):\n first_call = getattr(self, 'coef_', None) is None\n (X, y) = self._validate_data(X, y, accept_sparse='csr', copy=False, order='C', dtype=np.float64, accept_large_sparse=False, reset=first_call)\n y = y.astype(np.float64, copy=False)\n (n_samples, n_features) = X.shape\n sample_weight = _check_sample_weight(sample_weight, X)\n if first_call:\n self._allocate_parameter_mem(1, n_features, coef_init, intercept_init)\n if self.average > 0 and getattr(self, '_average_coef', None) is None:\n self._average_coef = np.zeros(n_features, dtype=np.float64, order='C')\n self._average_intercept = np.zeros(1, dtype=np.float64, order='C')\n self._fit_regressor(X, y, alpha, C, loss, learning_rate, sample_weight, max_iter)\n return self" }, { @@ -111534,7 +119606,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -111544,6 +119617,10 @@ "docstring": { "type": "{array-like, sparse matrix}, shape (n_samples, n_features)", "description": "Training data." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -111554,7 +119631,8 @@ "docstring": { "type": "ndarray of shape (n_samples,)", "description": "Target values." - } + }, + "refined_type": {} }, { "name": "coef_init", @@ -111564,7 +119642,8 @@ "docstring": { "type": "ndarray of shape (n_features,), default=None", "description": "The initial coefficients to warm-start the optimization." - } + }, + "refined_type": {} }, { "name": "intercept_init", @@ -111574,7 +119653,8 @@ "docstring": { "type": "ndarray of shape (1,), default=None", "description": "The initial intercept to warm-start the optimization." - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -111584,13 +119664,14 @@ "docstring": { "type": "array-like, shape (n_samples,), default=None", "description": "Weights applied to individual samples (1. for unweighted)." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Fit linear model with Stochastic Gradient Descent.", - "docstring": "Fit linear model with Stochastic Gradient Descent.\n\nParameters\n----------\nX : {array-like, sparse matrix}, shape (n_samples, n_features)\n Training data.\n\ny : ndarray of shape (n_samples,)\n Target values.\n\ncoef_init : ndarray of shape (n_features,), default=None\n The initial coefficients to warm-start the optimization.\n\nintercept_init : ndarray of shape (1,), default=None\n The initial intercept to warm-start the optimization.\n\nsample_weight : array-like, shape (n_samples,), default=None\n Weights applied to individual samples (1. for unweighted).\n\nReturns\n-------\nself : object\n Fitted `SGDRegressor` estimator.", + "docstring": "Fit linear model with Stochastic Gradient Descent.\n\n Parameters\n ----------\n X : {array-like, sparse matrix}, shape (n_samples, n_features)\n Training data.\n\n y : ndarray of shape (n_samples,)\n Target values.\n\n coef_init : ndarray of shape (n_features,), default=None\n The initial coefficients to warm-start the optimization.\n\n intercept_init : ndarray of shape (1,), default=None\n The initial intercept to warm-start the optimization.\n\n sample_weight : array-like, shape (n_samples,), default=None\n Weights applied to individual samples (1. for unweighted).\n\n Returns\n -------\n self : object\n Fitted `SGDRegressor` estimator.\n ", "source_code": "\ndef fit(self, X, y, coef_init=None, intercept_init=None, sample_weight=None):\n \"\"\"Fit linear model with Stochastic Gradient Descent.\n\n Parameters\n ----------\n X : {array-like, sparse matrix}, shape (n_samples, n_features)\n Training data.\n\n y : ndarray of shape (n_samples,)\n Target values.\n\n coef_init : ndarray of shape (n_features,), default=None\n The initial coefficients to warm-start the optimization.\n\n intercept_init : ndarray of shape (1,), default=None\n The initial intercept to warm-start the optimization.\n\n sample_weight : array-like, shape (n_samples,), default=None\n Weights applied to individual samples (1. for unweighted).\n\n Returns\n -------\n self : object\n Fitted `SGDRegressor` estimator.\n \"\"\"\n return self._fit(X, y, alpha=self.alpha, C=1.0, loss=self.loss, learning_rate=self.learning_rate, coef_init=coef_init, intercept_init=intercept_init, sample_weight=sample_weight)" }, { @@ -111608,7 +119689,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -111618,6 +119700,10 @@ "docstring": { "type": "{array-like, sparse matrix}, shape (n_samples, n_features)", "description": "Subset of training data." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -111628,7 +119714,8 @@ "docstring": { "type": "numpy array of shape (n_samples,)", "description": "Subset of target values." - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -111638,13 +119725,14 @@ "docstring": { "type": "array-like, shape (n_samples,), default=None", "description": "Weights applied to individual samples.\nIf not provided, uniform weights are assumed." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Perform one epoch of stochastic gradient descent on given samples.\n\nInternally, this method uses ``max_iter = 1``. Therefore, it is not guaranteed that a minimum of the cost function is reached after calling it once. 
Matters such as objective convergence and early stopping should be handled by the user.", - "docstring": "Perform one epoch of stochastic gradient descent on given samples.\n\nInternally, this method uses ``max_iter = 1``. Therefore, it is not\nguaranteed that a minimum of the cost function is reached after calling\nit once. Matters such as objective convergence and early stopping\nshould be handled by the user.\n\nParameters\n----------\nX : {array-like, sparse matrix}, shape (n_samples, n_features)\n Subset of training data.\n\ny : numpy array of shape (n_samples,)\n Subset of target values.\n\nsample_weight : array-like, shape (n_samples,), default=None\n Weights applied to individual samples.\n If not provided, uniform weights are assumed.\n\nReturns\n-------\nself : object\n Returns an instance of self.", + "description": "Perform one epoch of stochastic gradient descent on given samples.\n\nInternally, this method uses ``max_iter = 1``. Therefore, it is not\nguaranteed that a minimum of the cost function is reached after calling\nit once. Matters such as objective convergence and early stopping\nshould be handled by the user.", + "docstring": "Perform one epoch of stochastic gradient descent on given samples.\n\n Internally, this method uses ``max_iter = 1``. Therefore, it is not\n guaranteed that a minimum of the cost function is reached after calling\n it once. Matters such as objective convergence and early stopping\n should be handled by the user.\n\n Parameters\n ----------\n X : {array-like, sparse matrix}, shape (n_samples, n_features)\n Subset of training data.\n\n y : numpy array of shape (n_samples,)\n Subset of target values.\n\n sample_weight : array-like, shape (n_samples,), default=None\n Weights applied to individual samples.\n If not provided, uniform weights are assumed.\n\n Returns\n -------\n self : object\n Returns an instance of self.\n ", "source_code": "\ndef partial_fit(self, X, y, sample_weight=None):\n \"\"\"Perform one epoch of stochastic gradient descent on given samples.\n\n Internally, this method uses ``max_iter = 1``. Therefore, it is not\n guaranteed that a minimum of the cost function is reached after calling\n it once. Matters such as objective convergence and early stopping\n should be handled by the user.\n\n Parameters\n ----------\n X : {array-like, sparse matrix}, shape (n_samples, n_features)\n Subset of training data.\n\n y : numpy array of shape (n_samples,)\n Subset of target values.\n\n sample_weight : array-like, shape (n_samples,), default=None\n Weights applied to individual samples.\n If not provided, uniform weights are assumed.\n\n Returns\n -------\n self : object\n Returns an instance of self.\n \"\"\"\n self._validate_params(for_partial_fit=True)\n return self._partial_fit(X, y, self.alpha, C=1.0, loss=self.loss, learning_rate=self.learning_rate, max_iter=1, sample_weight=sample_weight, coef_init=None, intercept_init=None)" }, { @@ -111662,7 +119750,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -111672,13 +119761,17 @@ "docstring": { "type": "{array-like, sparse matrix}, shape (n_samples, n_features)", "description": "Input data." 
+ }, + "refined_type": { + "kind": "EnumType", + "values": [] } } ], "results": [], "is_public": false, "description": "Predict using the linear model.", - "docstring": "Predict using the linear model.\n\nParameters\n----------\nX : {array-like, sparse matrix}, shape (n_samples, n_features)\n Input data.\n\nReturns\n-------\nndarray of shape (n_samples,)\n Predicted target values per element in X.", + "docstring": "Predict using the linear model.\n\n Parameters\n ----------\n X : {array-like, sparse matrix}, shape (n_samples, n_features)\n Input data.\n\n Returns\n -------\n ndarray of shape (n_samples,)\n Predicted target values per element in X.\n ", "source_code": "\ndef predict(self, X):\n \"\"\"Predict using the linear model.\n\n Parameters\n ----------\n X : {array-like, sparse matrix}, shape (n_samples, n_features)\n Input data.\n\n Returns\n -------\n ndarray of shape (n_samples,)\n Predicted target values per element in X.\n \"\"\"\n return self._decision_function(X)" }, { @@ -111696,7 +119789,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "loss", @@ -111706,7 +119800,8 @@ "docstring": { "type": "str, default='hinge'", "description": "The loss function to be used. Defaults to 'hinge', which gives a\nlinear SVM.\n\nThe possible options are 'hinge', 'log', 'modified_huber',\n'squared_hinge', 'perceptron', or a regression loss: 'squared_error',\n'huber', 'epsilon_insensitive', or 'squared_epsilon_insensitive'.\n\nThe 'log' loss gives logistic regression, a probabilistic classifier.\n'modified_huber' is another smooth loss that brings tolerance to\noutliers as well as probability estimates.\n'squared_hinge' is like hinge but is quadratically penalized.\n'perceptron' is the linear loss used by the perceptron algorithm.\nThe other losses are designed for regression but can be useful in\nclassification as well; see\n:class:`~sklearn.linear_model.SGDRegressor` for a description.\n\nMore details about the losses formulas can be found in the\n:ref:`User Guide `.\n\n.. deprecated:: 1.0\n The loss 'squared_loss' was deprecated in v1.0 and will be removed\n in version 1.2. Use `loss='squared_error'` which is equivalent." - } + }, + "refined_type": {} }, { "name": "penalty", @@ -111716,6 +119811,10 @@ "docstring": { "type": "{'l2', 'l1', 'elasticnet'}, default='l2'", "description": "The penalty (aka regularization term) to be used. Defaults to 'l2'\nwhich is the standard regularizer for linear SVM models. 'l1' and\n'elasticnet' might bring sparsity to the model (feature selection)\nnot achievable with 'l2'." + }, + "refined_type": { + "kind": "EnumType", + "values": ["l2", "l1", "elasticnet"] } }, { @@ -111726,7 +119825,8 @@ "docstring": { "type": "float, default=0.0001", "description": "Constant that multiplies the regularization term. The higher the\nvalue, the stronger the regularization.\nAlso used to compute the learning rate when set to `learning_rate` is\nset to 'optimal'." - } + }, + "refined_type": {} }, { "name": "l1_ratio", @@ -111736,7 +119836,8 @@ "docstring": { "type": "float, default=0.15", "description": "The Elastic Net mixing parameter, with 0 <= l1_ratio <= 1.\nl1_ratio=0 corresponds to L2 penalty, l1_ratio=1 to L1.\nOnly used if `penalty` is 'elasticnet'." - } + }, + "refined_type": {} }, { "name": "fit_intercept", @@ -111746,7 +119847,8 @@ "docstring": { "type": "bool, default=True", "description": "Whether the intercept should be estimated or not. If False, the\ndata is assumed to be already centered." 
- } + }, + "refined_type": {} }, { "name": "max_iter", @@ -111756,7 +119858,8 @@ "docstring": { "type": "int, default=1000", "description": "The maximum number of passes over the training data (aka epochs).\nIt only impacts the behavior in the ``fit`` method, and not the\n:meth:`partial_fit` method.\n\n.. versionadded:: 0.19" - } + }, + "refined_type": {} }, { "name": "tol", @@ -111766,7 +119869,8 @@ "docstring": { "type": "float, default=1e-3", "description": "The stopping criterion. If it is not None, training will stop\nwhen (loss > best_loss - tol) for ``n_iter_no_change`` consecutive\nepochs.\nConvergence is checked against the training loss or the\nvalidation loss depending on the `early_stopping` parameter.\n\n.. versionadded:: 0.19" - } + }, + "refined_type": {} }, { "name": "shuffle", @@ -111776,7 +119880,8 @@ "docstring": { "type": "bool, default=True", "description": "Whether or not the training data should be shuffled after each epoch." - } + }, + "refined_type": {} }, { "name": "verbose", @@ -111786,7 +119891,8 @@ "docstring": { "type": "int, default=0", "description": "The verbosity level." - } + }, + "refined_type": {} }, { "name": "epsilon", @@ -111796,7 +119902,8 @@ "docstring": { "type": "float, default=0.1", "description": "Epsilon in the epsilon-insensitive loss functions; only if `loss` is\n'huber', 'epsilon_insensitive', or 'squared_epsilon_insensitive'.\nFor 'huber', determines the threshold at which it becomes less\nimportant to get the prediction exactly right.\nFor epsilon-insensitive, any differences between the current prediction\nand the correct label are ignored if they are less than this threshold." - } + }, + "refined_type": {} }, { "name": "n_jobs", @@ -111806,7 +119913,8 @@ "docstring": { "type": "int, default=None", "description": "The number of CPUs to use to do the OVA (One Versus All, for\nmulti-class problems) computation.\n``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n``-1`` means using all processors. See :term:`Glossary `\nfor more details." - } + }, + "refined_type": {} }, { "name": "random_state", @@ -111816,7 +119924,8 @@ "docstring": { "type": "int, RandomState instance, default=None", "description": "Used for shuffling the data, when ``shuffle`` is set to ``True``.\nPass an int for reproducible output across multiple function calls.\nSee :term:`Glossary `." - } + }, + "refined_type": {} }, { "name": "learning_rate", @@ -111826,7 +119935,8 @@ "docstring": { "type": "str, default='optimal'", "description": "The learning rate schedule:\n\n- 'constant': `eta = eta0`\n- 'optimal': `eta = 1.0 / (alpha * (t + t0))`\n where t0 is chosen by a heuristic proposed by Leon Bottou.\n- 'invscaling': `eta = eta0 / pow(t, power_t)`\n- 'adaptive': eta = eta0, as long as the training keeps decreasing.\n Each time n_iter_no_change consecutive epochs fail to decrease the\n training loss by tol or fail to increase validation score by tol if\n early_stopping is True, the current learning rate is divided by 5.\n\n .. versionadded:: 0.20\n Added 'adaptive' option" - } + }, + "refined_type": {} }, { "name": "eta0", @@ -111834,9 +119944,10 @@ "is_public": true, "assigned_by": "NAME_ONLY", "docstring": { - "type": "double, default=0.0", + "type": "float, default=0.0", "description": "The initial learning rate for the 'constant', 'invscaling' or\n'adaptive' schedules. The default value is 0.0 as eta0 is not used by\nthe default schedule 'optimal'." 
- } + }, + "refined_type": {} }, { "name": "power_t", @@ -111844,9 +119955,10 @@ "is_public": true, "assigned_by": "NAME_ONLY", "docstring": { - "type": "double, default=0.5", + "type": "float, default=0.5", "description": "The exponent for inverse scaling learning rate [default 0.5]." - } + }, + "refined_type": {} }, { "name": "early_stopping", @@ -111856,7 +119968,8 @@ "docstring": { "type": "bool, default=False", "description": "Whether to use early stopping to terminate training when validation\nscore is not improving. If set to True, it will automatically set aside\na stratified fraction of training data as validation and terminate\ntraining when validation score returned by the `score` method is not\nimproving by at least tol for n_iter_no_change consecutive epochs.\n\n.. versionadded:: 0.20\n Added 'early_stopping' option" - } + }, + "refined_type": {} }, { "name": "validation_fraction", @@ -111866,7 +119979,8 @@ "docstring": { "type": "float, default=0.1", "description": "The proportion of training data to set aside as validation set for\nearly stopping. Must be between 0 and 1.\nOnly used if `early_stopping` is True.\n\n.. versionadded:: 0.20\n Added 'validation_fraction' option" - } + }, + "refined_type": {} }, { "name": "n_iter_no_change", @@ -111876,7 +119990,8 @@ "docstring": { "type": "int, default=5", "description": "Number of iterations with no improvement to wait before stopping\nfitting.\nConvergence is checked against the training loss or the\nvalidation loss depending on the `early_stopping` parameter.\n\n.. versionadded:: 0.20\n Added 'n_iter_no_change' option" - } + }, + "refined_type": {} }, { "name": "class_weight", @@ -111886,6 +120001,10 @@ "docstring": { "type": "dict, {class_label: weight} or \"balanced\", default=None", "description": "Preset for the class_weight fit parameter.\n\nWeights associated with classes. If not given, all classes\nare supposed to have weight one.\n\nThe \"balanced\" mode uses the values of y to automatically adjust\nweights inversely proportional to class frequencies in the input data\nas ``n_samples / (n_classes * np.bincount(y))``." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -111896,7 +120015,8 @@ "docstring": { "type": "bool, default=False", "description": "When set to True, reuse the solution of the previous call to fit as\ninitialization, otherwise, just erase the previous solution.\nSee :term:`the Glossary `.\n\nRepeatedly calling fit or partial_fit when warm_start is True can\nresult in a different solution than when calling fit a single time\nbecause of the way the data is shuffled.\nIf a dynamic learning rate is used, the learning rate is adapted\ndepending on the number of samples already seen. Calling ``fit`` resets\nthis counter, while ``partial_fit`` will result in increasing the\nexisting counter." - } + }, + "refined_type": {} }, { "name": "average", @@ -111906,13 +120026,14 @@ "docstring": { "type": "bool or int, default=False", "description": "When set to True, computes the averaged SGD weights across all\nupdates and stores the result in the ``coef_`` attribute. If set to\nan int greater than 1, averaging will begin once the total number of\nsamples seen reaches `average`. So ``average=10`` will begin\naveraging after seeing 10 samples." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, loss='hinge', *, penalty='l2', alpha=0.0001, l1_ratio=0.15, fit_intercept=True, max_iter=1000, tol=0.001, shuffle=True, verbose=0, epsilon=DEFAULT_EPSILON, n_jobs=None, random_state=None, learning_rate='optimal', eta0=0.0, power_t=0.5, early_stopping=False, validation_fraction=0.1, n_iter_no_change=5, class_weight=None, warm_start=False, average=False):\n super().__init__(loss=loss, penalty=penalty, alpha=alpha, l1_ratio=l1_ratio, fit_intercept=fit_intercept, max_iter=max_iter, tol=tol, shuffle=shuffle, verbose=verbose, epsilon=epsilon, n_jobs=n_jobs, random_state=random_state, learning_rate=learning_rate, eta0=eta0, power_t=power_t, early_stopping=early_stopping, validation_fraction=validation_fraction, n_iter_no_change=n_iter_no_change, class_weight=class_weight, warm_start=warm_start, average=average)" }, { @@ -111930,13 +120051,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _check_proba(self):\n if self.loss not in ('log', 'modified_huber'):\n raise AttributeError('probability estimates are not available for loss=%r' % self.loss)\n return True" }, { @@ -111954,13 +120076,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _more_tags(self):\n return {'_xfail_checks': {'check_sample_weights_invariance': 'zero sample_weight is not equivalent to removing samples'}}" }, { @@ -111978,7 +120101,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -111988,13 +120112,17 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "Input data for prediction." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } } ], "results": [], "is_public": true, - "description": "Log of probability estimates.\n\nThis method is only available for log loss and modified Huber loss. When loss=\"modified_huber\", probability estimates may be hard zeros and ones, so taking the logarithm is not possible. 
See ``predict_proba`` for details.", - "docstring": "Log of probability estimates.\n\nThis method is only available for log loss and modified Huber loss.\n\nWhen loss=\"modified_huber\", probability estimates may be hard zeros\nand ones, so taking the logarithm is not possible.\n\nSee ``predict_proba`` for details.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n Input data for prediction.\n\nReturns\n-------\nT : array-like, shape (n_samples, n_classes)\n Returns the log-probability of the sample for each class in the\n model, where classes are ordered as they are in\n `self.classes_`.", + "description": "Log of probability estimates.\n\nThis method is only available for log loss and modified Huber loss.\n\nWhen loss=\"modified_huber\", probability estimates may be hard zeros\nand ones, so taking the logarithm is not possible.\n\nSee ``predict_proba`` for details.", + "docstring": "Log of probability estimates.\n\n This method is only available for log loss and modified Huber loss.\n\n When loss=\"modified_huber\", probability estimates may be hard zeros\n and ones, so taking the logarithm is not possible.\n\n See ``predict_proba`` for details.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Input data for prediction.\n\n Returns\n -------\n T : array-like, shape (n_samples, n_classes)\n Returns the log-probability of the sample for each class in the\n model, where classes are ordered as they are in\n `self.classes_`.\n ", "source_code": "\n@available_if(_check_proba)\ndef predict_log_proba(self, X):\n \"\"\"Log of probability estimates.\n\n This method is only available for log loss and modified Huber loss.\n\n When loss=\"modified_huber\", probability estimates may be hard zeros\n and ones, so taking the logarithm is not possible.\n\n See ``predict_proba`` for details.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Input data for prediction.\n\n Returns\n -------\n T : array-like, shape (n_samples, n_classes)\n Returns the log-probability of the sample for each class in the\n model, where classes are ordered as they are in\n `self.classes_`.\n \"\"\"\n return np.log(self.predict_proba(X))" }, { @@ -112012,7 +120140,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -112022,13 +120151,17 @@ "docstring": { "type": "{array-like, sparse matrix}, shape (n_samples, n_features)", "description": "Input data for prediction." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } } ], "results": [], "is_public": true, - "description": "Probability estimates.\n\nThis method is only available for log loss and modified Huber loss. Multiclass probability estimates are derived from binary (one-vs.-rest) estimates by simple normalization, as recommended by Zadrozny and Elkan. Binary probability estimates for loss=\"modified_huber\" are given by (clip(decision_function(X), -1, 1) + 1) / 2. 
For other loss functions it is necessary to perform proper probability calibration by wrapping the classifier with :class:`~sklearn.calibration.CalibratedClassifierCV` instead.", - "docstring": "Probability estimates.\n\nThis method is only available for log loss and modified Huber loss.\n\nMulticlass probability estimates are derived from binary (one-vs.-rest)\nestimates by simple normalization, as recommended by Zadrozny and\nElkan.\n\nBinary probability estimates for loss=\"modified_huber\" are given by\n(clip(decision_function(X), -1, 1) + 1) / 2. For other loss functions\nit is necessary to perform proper probability calibration by wrapping\nthe classifier with\n:class:`~sklearn.calibration.CalibratedClassifierCV` instead.\n\nParameters\n----------\nX : {array-like, sparse matrix}, shape (n_samples, n_features)\n Input data for prediction.\n\nReturns\n-------\nndarray of shape (n_samples, n_classes)\n Returns the probability of the sample for each class in the model,\n where classes are ordered as they are in `self.classes_`.\n\nReferences\n----------\nZadrozny and Elkan, \"Transforming classifier scores into multiclass\nprobability estimates\", SIGKDD'02,\nhttps://dl.acm.org/doi/pdf/10.1145/775047.775151\n\nThe justification for the formula in the loss=\"modified_huber\"\ncase is in the appendix B in:\nhttp://jmlr.csail.mit.edu/papers/volume2/zhang02c/zhang02c.pdf", + "description": "Probability estimates.\n\nThis method is only available for log loss and modified Huber loss.\n\nMulticlass probability estimates are derived from binary (one-vs.-rest)\nestimates by simple normalization, as recommended by Zadrozny and\nElkan.\n\nBinary probability estimates for loss=\"modified_huber\" are given by\n(clip(decision_function(X), -1, 1) + 1) / 2. For other loss functions\nit is necessary to perform proper probability calibration by wrapping\nthe classifier with\n:class:`~sklearn.calibration.CalibratedClassifierCV` instead.", + "docstring": "Probability estimates.\n\n This method is only available for log loss and modified Huber loss.\n\n Multiclass probability estimates are derived from binary (one-vs.-rest)\n estimates by simple normalization, as recommended by Zadrozny and\n Elkan.\n\n Binary probability estimates for loss=\"modified_huber\" are given by\n (clip(decision_function(X), -1, 1) + 1) / 2. 
For other loss functions\n it is necessary to perform proper probability calibration by wrapping\n the classifier with\n :class:`~sklearn.calibration.CalibratedClassifierCV` instead.\n\n Parameters\n ----------\n X : {array-like, sparse matrix}, shape (n_samples, n_features)\n Input data for prediction.\n\n Returns\n -------\n ndarray of shape (n_samples, n_classes)\n Returns the probability of the sample for each class in the model,\n where classes are ordered as they are in `self.classes_`.\n\n References\n ----------\n Zadrozny and Elkan, \"Transforming classifier scores into multiclass\n probability estimates\", SIGKDD'02,\n https://dl.acm.org/doi/pdf/10.1145/775047.775151\n\n The justification for the formula in the loss=\"modified_huber\"\n case is in the appendix B in:\n http://jmlr.csail.mit.edu/papers/volume2/zhang02c/zhang02c.pdf\n ", "source_code": "\n@available_if(_check_proba)\ndef predict_proba(self, X):\n \"\"\"Probability estimates.\n\n This method is only available for log loss and modified Huber loss.\n\n Multiclass probability estimates are derived from binary (one-vs.-rest)\n estimates by simple normalization, as recommended by Zadrozny and\n Elkan.\n\n Binary probability estimates for loss=\"modified_huber\" are given by\n (clip(decision_function(X), -1, 1) + 1) / 2. For other loss functions\n it is necessary to perform proper probability calibration by wrapping\n the classifier with\n :class:`~sklearn.calibration.CalibratedClassifierCV` instead.\n\n Parameters\n ----------\n X : {array-like, sparse matrix}, shape (n_samples, n_features)\n Input data for prediction.\n\n Returns\n -------\n ndarray of shape (n_samples, n_classes)\n Returns the probability of the sample for each class in the model,\n where classes are ordered as they are in `self.classes_`.\n\n References\n ----------\n Zadrozny and Elkan, \"Transforming classifier scores into multiclass\n probability estimates\", SIGKDD'02,\n https://dl.acm.org/doi/pdf/10.1145/775047.775151\n\n The justification for the formula in the loss=\"modified_huber\"\n case is in the appendix B in:\n http://jmlr.csail.mit.edu/papers/volume2/zhang02c/zhang02c.pdf\n \"\"\"\n check_is_fitted(self)\n if self.loss == 'log':\n return self._predict_proba_lr(X)\n elif self.loss == 'modified_huber':\n binary = len(self.classes_) == 2\n scores = self.decision_function(X)\n if binary:\n prob2 = np.ones((scores.shape[0], 2))\n prob = prob2[:, 1]\n else:\n prob = scores\n np.clip(scores, -1, 1, prob)\n prob += 1.0\n prob /= 2.0\n if binary:\n prob2[:, 0] -= prob\n prob = prob2\n else:\n prob_sum = prob.sum(axis=1)\n all_zero = prob_sum == 0\n if np.any(all_zero):\n prob[all_zero, :] = 1\n prob_sum[all_zero] = len(self.classes_)\n prob /= prob_sum.reshape((prob.shape[0], -1))\n return prob\n else:\n raise NotImplementedError(\"predict_(log_)proba only supported when loss='log' or loss='modified_huber' (%r given)\" % self.loss)" }, { @@ -112046,7 +120179,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "nu", @@ -112054,8 +120188,16 @@ "is_public": true, "assigned_by": "POSITION_OR_NAME", "docstring": { - "type": "float, optional", + "type": "float, default=0.5", "description": "The nu parameter of the One Class SVM: an upper bound on the\nfraction of training errors and a lower bound of the fraction of\nsupport vectors. Should be in the interval (0, 1]. By default 0.5\nwill be taken." 
+ }, + "refined_type": { + "kind": "BoundaryType", + "base_type": "float", + "min": 0.0, + "max": 1.0, + "min_inclusive": false, + "max_inclusive": true } }, { @@ -112064,9 +120206,10 @@ "is_public": true, "assigned_by": "POSITION_OR_NAME", "docstring": { - "type": "bool", + "type": "bool, default=True", "description": "Whether the intercept should be estimated or not. Defaults to True." - } + }, + "refined_type": {} }, { "name": "max_iter", @@ -112074,9 +120217,10 @@ "is_public": true, "assigned_by": "POSITION_OR_NAME", "docstring": { - "type": "int, optional", + "type": "int, default=1000", "description": "The maximum number of passes over the training data (aka epochs).\nIt only impacts the behavior in the ``fit`` method, and not the\n`partial_fit`. Defaults to 1000." - } + }, + "refined_type": {} }, { "name": "tol", @@ -112084,9 +120228,10 @@ "is_public": true, "assigned_by": "POSITION_OR_NAME", "docstring": { - "type": "float or None, optional", + "type": "float or None, default=1e-3", "description": "The stopping criterion. If it is not None, the iterations will stop\nwhen (loss > previous_loss - tol). Defaults to 1e-3." - } + }, + "refined_type": {} }, { "name": "shuffle", @@ -112094,9 +120239,10 @@ "is_public": true, "assigned_by": "POSITION_OR_NAME", "docstring": { - "type": "bool, optional", + "type": "bool, default=True", "description": "Whether or not the training data should be shuffled after each epoch.\nDefaults to True." - } + }, + "refined_type": {} }, { "name": "verbose", @@ -112104,9 +120250,10 @@ "is_public": true, "assigned_by": "POSITION_OR_NAME", "docstring": { - "type": "int, optional", + "type": "int, default=0", "description": "The verbosity level." - } + }, + "refined_type": {} }, { "name": "random_state", @@ -112114,9 +120261,10 @@ "is_public": true, "assigned_by": "POSITION_OR_NAME", "docstring": { - "type": "int, RandomState instance or None, optional (default=None)", + "type": "int, RandomState instance or None, default=None", "description": "The seed of the pseudo random number generator to use when shuffling\nthe data. If int, random_state is the seed used by the random number\ngenerator; If RandomState instance, random_state is the random number\ngenerator; If None, the random number generator is the RandomState\ninstance used by `np.random`." - } + }, + "refined_type": {} }, { "name": "learning_rate", @@ -112124,8 +120272,17 @@ "is_public": true, "assigned_by": "POSITION_OR_NAME", "docstring": { - "type": "str, optional", - "description": "The learning rate schedule to use with `fit`. (If using `partial_fit`,\nlearning rate must be controlled directly).\n\n'constant':\n eta = eta0\n'optimal': [default]\n eta = 1.0 / (alpha * (t + t0))\n where t0 is chosen by a heuristic proposed by Leon Bottou.\n'invscaling':\n eta = eta0 / pow(t, power_t)\n'adaptive':\n eta = eta0, as long as the training keeps decreasing.\n Each time n_iter_no_change consecutive epochs fail to decrease the\n training loss by tol or fail to increase validation score by tol if\n early_stopping is True, the current learning rate is divided by 5." + "type": "{'constant', 'optimal', 'invscaling', 'adaptive'}, default='optimal'", + "description": "The learning rate schedule to use with `fit`. 
(If using `partial_fit`,\nlearning rate must be controlled directly).\n\n- 'constant': `eta = eta0`\n- 'optimal': `eta = 1.0 / (alpha * (t + t0))`\n where t0 is chosen by a heuristic proposed by Leon Bottou.\n- 'invscaling': `eta = eta0 / pow(t, power_t)`\n- 'adaptive': eta = eta0, as long as the training keeps decreasing.\n Each time n_iter_no_change consecutive epochs fail to decrease the\n training loss by tol or fail to increase validation score by tol if\n early_stopping is True, the current learning rate is divided by 5." + }, + "refined_type": { + "kind": "EnumType", + "values": [ + "optimal", + "adaptive", + "constant", + "invscaling" + ] } }, { @@ -112134,9 +120291,10 @@ "is_public": true, "assigned_by": "POSITION_OR_NAME", "docstring": { - "type": "double", + "type": "float, default=0.0", "description": "The initial learning rate for the 'constant', 'invscaling' or\n'adaptive' schedules. The default value is 0.0 as eta0 is not used by\nthe default schedule 'optimal'." - } + }, + "refined_type": {} }, { "name": "power_t", @@ -112144,9 +120302,10 @@ "is_public": true, "assigned_by": "POSITION_OR_NAME", "docstring": { - "type": "double", + "type": "float, default=0.5", "description": "The exponent for inverse scaling learning rate [default 0.5]." - } + }, + "refined_type": {} }, { "name": "warm_start", @@ -112154,9 +120313,10 @@ "is_public": true, "assigned_by": "POSITION_OR_NAME", "docstring": { - "type": "bool, optional", + "type": "bool, default=False", "description": "When set to True, reuse the solution of the previous call to fit as\ninitialization, otherwise, just erase the previous solution.\nSee :term:`the Glossary `.\n\nRepeatedly calling fit or partial_fit when warm_start is True can\nresult in a different solution than when calling fit a single time\nbecause of the way the data is shuffled.\nIf a dynamic learning rate is used, the learning rate is adapted\ndepending on the number of samples already seen. Calling ``fit`` resets\nthis counter, while ``partial_fit`` will result in increasing the\nexisting counter." - } + }, + "refined_type": {} }, { "name": "average", @@ -112164,15 +120324,16 @@ "is_public": true, "assigned_by": "POSITION_OR_NAME", "docstring": { - "type": "bool or int, optional", + "type": "bool or int, default=False", "description": "When set to True, computes the averaged SGD weights and stores the\nresult in the ``coef_`` attribute. If set to an int greater than 1,\naveraging will begin once the total number of samples seen reaches\naverage. So ``average=10`` will begin averaging after seeing 10\nsamples." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, nu=0.5, fit_intercept=True, max_iter=1000, tol=0.001, shuffle=True, verbose=0, random_state=None, learning_rate='optimal', eta0=0.0, power_t=0.5, warm_start=False, average=False):\n alpha = nu / 2\n self.nu = nu\n super(SGDOneClassSVM, self).__init__(loss='hinge', penalty='l2', alpha=alpha, C=1.0, l1_ratio=0, fit_intercept=fit_intercept, max_iter=max_iter, tol=tol, shuffle=shuffle, verbose=verbose, epsilon=DEFAULT_EPSILON, random_state=random_state, learning_rate=learning_rate, eta0=eta0, power_t=power_t, early_stopping=False, validation_fraction=0.1, n_iter_no_change=5, warm_start=warm_start, average=average)" }, { @@ -112190,7 +120351,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -112200,7 +120362,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "alpha", @@ -112210,7 +120373,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "C", @@ -112220,7 +120384,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "loss", @@ -112230,7 +120395,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "learning_rate", @@ -112240,7 +120406,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "coef_init", @@ -112250,7 +120417,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "offset_init", @@ -112260,7 +120428,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -112270,13 +120439,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _fit(self, X, alpha, C, loss, learning_rate, coef_init=None, offset_init=None, sample_weight=None):\n self._validate_params()\n if self.warm_start and hasattr(self, 'coef_'):\n if coef_init is None:\n coef_init = self.coef_\n if offset_init is None:\n offset_init = self.offset_\n else:\n self.coef_ = None\n self.offset_ = None\n self.t_ = 1.0\n self._partial_fit(X, alpha, C, loss, learning_rate, self.max_iter, sample_weight, coef_init, offset_init)\n if self.tol is not None and self.tol > -np.inf and self.n_iter_ == self.max_iter:\n warnings.warn('Maximum number of iteration reached before convergence. 
Consider increasing max_iter to improve the fit.', ConvergenceWarning)\n return self" }, { @@ -112294,7 +120464,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -112304,7 +120475,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "alpha", @@ -112314,7 +120486,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "C", @@ -112324,7 +120497,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -112334,7 +120508,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "learning_rate", @@ -112344,7 +120519,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "max_iter", @@ -112354,7 +120530,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -112378,13 +120555,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _more_tags(self):\n return {'_xfail_checks': {'check_sample_weights_invariance': 'zero sample_weight is not equivalent to removing samples'}}" }, { @@ -112402,7 +120580,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -112412,7 +120591,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "alpha", @@ -112422,7 +120602,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "C", @@ -112432,7 +120613,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "loss", @@ -112442,7 +120624,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "learning_rate", @@ -112452,7 +120635,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "max_iter", @@ -112462,7 +120646,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -112472,7 +120657,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "coef_init", @@ -112482,7 +120668,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "offset_init", @@ -112492,13 +120679,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _partial_fit(self, X, alpha, C, loss, learning_rate, max_iter, sample_weight, coef_init, offset_init):\n first_call = getattr(self, 'coef_', None) is None\n X = self._validate_data(X, None, accept_sparse='csr', dtype=np.float64, order='C', accept_large_sparse=False, reset=first_call)\n n_features = X.shape[1]\n sample_weight = _check_sample_weight(sample_weight, X)\n if getattr(self, 'coef_', None) is None or coef_init is not None:\n self._allocate_parameter_mem(1, n_features, coef_init, offset_init, 1)\n elif n_features != self.coef_.shape[-1]:\n raise ValueError('Number of features %d does not match previous data %d.' 
% (n_features, self.coef_.shape[-1]))\n if self.average and getattr(self, '_average_coef', None) is None:\n self._average_coef = np.zeros(n_features, dtype=np.float64, order='C')\n self._average_intercept = np.zeros(1, dtype=np.float64, order='C')\n self.loss_function_ = self._get_loss_function(loss)\n if not hasattr(self, 't_'):\n self.t_ = 1.0\n self._fit_one_class(X, alpha=alpha, C=C, learning_rate=learning_rate, sample_weight=sample_weight, max_iter=max_iter)\n return self" }, { @@ -112516,7 +120704,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "for_partial_fit", @@ -112526,7 +120715,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -112550,7 +120740,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -112560,13 +120751,17 @@ "docstring": { "type": "{array-like, sparse matrix}, shape (n_samples, n_features)", "description": "Testing data." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } } ], "results": [], "is_public": true, - "description": "Signed distance to the separating hyperplane.\n\nSigned distance is positive for an inlier and negative for an outlier.", - "docstring": "Signed distance to the separating hyperplane.\n\nSigned distance is positive for an inlier and negative for an\noutlier.\n\nParameters\n----------\nX : {array-like, sparse matrix}, shape (n_samples, n_features)\n Testing data.\n\nReturns\n-------\ndec : array-like, shape (n_samples,)\n Decision function values of the samples.", + "description": "Signed distance to the separating hyperplane.\n\nSigned distance is positive for an inlier and negative for an\noutlier.", + "docstring": "Signed distance to the separating hyperplane.\n\n Signed distance is positive for an inlier and negative for an\n outlier.\n\n Parameters\n ----------\n X : {array-like, sparse matrix}, shape (n_samples, n_features)\n Testing data.\n\n Returns\n -------\n dec : array-like, shape (n_samples,)\n Decision function values of the samples.\n ", "source_code": "\ndef decision_function(self, X):\n \"\"\"Signed distance to the separating hyperplane.\n\n Signed distance is positive for an inlier and negative for an\n outlier.\n\n Parameters\n ----------\n X : {array-like, sparse matrix}, shape (n_samples, n_features)\n Testing data.\n\n Returns\n -------\n dec : array-like, shape (n_samples,)\n Decision function values of the samples.\n \"\"\"\n check_is_fitted(self, 'coef_')\n X = self._validate_data(X, accept_sparse='csr', reset=False)\n decisions = safe_sparse_dot(X, self.coef_.T, dense_output=True) - self.offset_\n return decisions.ravel()" }, { @@ -112584,7 +120779,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -112594,6 +120790,10 @@ "docstring": { "type": "{array-like, sparse matrix}, shape (n_samples, n_features)", "description": "Training data." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -112604,7 +120804,8 @@ "docstring": { "type": "Ignored", "description": "Not used, present for API consistency by convention." - } + }, + "refined_type": {} }, { "name": "coef_init", @@ -112614,7 +120815,8 @@ "docstring": { "type": "array, shape (n_classes, n_features)", "description": "The initial coefficients to warm-start the optimization." 
- } + }, + "refined_type": {} }, { "name": "offset_init", @@ -112624,7 +120826,8 @@ "docstring": { "type": "array, shape (n_classes,)", "description": "The initial offset to warm-start the optimization." - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -112634,13 +120837,14 @@ "docstring": { "type": "array-like, shape (n_samples,), optional", "description": "Weights applied to individual samples.\nIf not provided, uniform weights are assumed. These weights will\nbe multiplied with class_weight (passed through the\nconstructor) if class_weight is specified." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Fit linear One-Class SVM with Stochastic Gradient Descent.\n\nThis solves an equivalent optimization problem of the One-Class SVM primal optimization problem and returns a weight vector w and an offset rho such that the decision function is given by - rho.", - "docstring": "Fit linear One-Class SVM with Stochastic Gradient Descent.\n\nThis solves an equivalent optimization problem of the\nOne-Class SVM primal optimization problem and returns a weight vector\nw and an offset rho such that the decision function is given by\n - rho.\n\nParameters\n----------\nX : {array-like, sparse matrix}, shape (n_samples, n_features)\n Training data.\ny : Ignored\n Not used, present for API consistency by convention.\n\ncoef_init : array, shape (n_classes, n_features)\n The initial coefficients to warm-start the optimization.\n\noffset_init : array, shape (n_classes,)\n The initial offset to warm-start the optimization.\n\nsample_weight : array-like, shape (n_samples,), optional\n Weights applied to individual samples.\n If not provided, uniform weights are assumed. These weights will\n be multiplied with class_weight (passed through the\n constructor) if class_weight is specified.\n\nReturns\n-------\nself : object\n Returns a fitted instance of self.", + "description": "Fit linear One-Class SVM with Stochastic Gradient Descent.\n\nThis solves an equivalent optimization problem of the\nOne-Class SVM primal optimization problem and returns a weight vector\nw and an offset rho such that the decision function is given by\n - rho.", + "docstring": "Fit linear One-Class SVM with Stochastic Gradient Descent.\n\n This solves an equivalent optimization problem of the\n One-Class SVM primal optimization problem and returns a weight vector\n w and an offset rho such that the decision function is given by\n - rho.\n\n Parameters\n ----------\n X : {array-like, sparse matrix}, shape (n_samples, n_features)\n Training data.\n y : Ignored\n Not used, present for API consistency by convention.\n\n coef_init : array, shape (n_classes, n_features)\n The initial coefficients to warm-start the optimization.\n\n offset_init : array, shape (n_classes,)\n The initial offset to warm-start the optimization.\n\n sample_weight : array-like, shape (n_samples,), optional\n Weights applied to individual samples.\n If not provided, uniform weights are assumed. 
These weights will\n be multiplied with class_weight (passed through the\n constructor) if class_weight is specified.\n\n Returns\n -------\n self : object\n Returns a fitted instance of self.\n ", "source_code": "\ndef fit(self, X, y=None, coef_init=None, offset_init=None, sample_weight=None):\n \"\"\"Fit linear One-Class SVM with Stochastic Gradient Descent.\n\n This solves an equivalent optimization problem of the\n One-Class SVM primal optimization problem and returns a weight vector\n w and an offset rho such that the decision function is given by\n - rho.\n\n Parameters\n ----------\n X : {array-like, sparse matrix}, shape (n_samples, n_features)\n Training data.\n y : Ignored\n Not used, present for API consistency by convention.\n\n coef_init : array, shape (n_classes, n_features)\n The initial coefficients to warm-start the optimization.\n\n offset_init : array, shape (n_classes,)\n The initial offset to warm-start the optimization.\n\n sample_weight : array-like, shape (n_samples,), optional\n Weights applied to individual samples.\n If not provided, uniform weights are assumed. These weights will\n be multiplied with class_weight (passed through the\n constructor) if class_weight is specified.\n\n Returns\n -------\n self : object\n Returns a fitted instance of self.\n \"\"\"\n alpha = self.nu / 2\n self._fit(X, alpha=alpha, C=1.0, loss=self.loss, learning_rate=self.learning_rate, coef_init=coef_init, offset_init=offset_init, sample_weight=sample_weight)\n return self" }, { @@ -112658,7 +120862,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -112668,6 +120873,10 @@ "docstring": { "type": "{array-like, sparse matrix}, shape (n_samples, n_features)", "description": "Subset of the training data." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -112678,7 +120887,8 @@ "docstring": { "type": "Ignored", "description": "Not used, present for API consistency by convention." - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -112688,13 +120898,14 @@ "docstring": { "type": "array-like, shape (n_samples,), optional", "description": "Weights applied to individual samples.\nIf not provided, uniform weights are assumed." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Fit linear One-Class SVM with Stochastic Gradient Descent.", - "docstring": "Fit linear One-Class SVM with Stochastic Gradient Descent.\n\nParameters\n----------\nX : {array-like, sparse matrix}, shape (n_samples, n_features)\n Subset of the training data.\ny : Ignored\n Not used, present for API consistency by convention.\n\nsample_weight : array-like, shape (n_samples,), optional\n Weights applied to individual samples.\n If not provided, uniform weights are assumed.\n\nReturns\n-------\nself : object\n Returns a fitted instance of self.", + "docstring": "Fit linear One-Class SVM with Stochastic Gradient Descent.\n\n Parameters\n ----------\n X : {array-like, sparse matrix}, shape (n_samples, n_features)\n Subset of the training data.\n y : Ignored\n Not used, present for API consistency by convention.\n\n sample_weight : array-like, shape (n_samples,), optional\n Weights applied to individual samples.\n If not provided, uniform weights are assumed.\n\n Returns\n -------\n self : object\n Returns a fitted instance of self.\n ", "source_code": "\ndef partial_fit(self, X, y=None, sample_weight=None):\n \"\"\"Fit linear One-Class SVM with Stochastic Gradient Descent.\n\n Parameters\n ----------\n X : {array-like, sparse matrix}, shape (n_samples, n_features)\n Subset of the training data.\n y : Ignored\n Not used, present for API consistency by convention.\n\n sample_weight : array-like, shape (n_samples,), optional\n Weights applied to individual samples.\n If not provided, uniform weights are assumed.\n\n Returns\n -------\n self : object\n Returns a fitted instance of self.\n \"\"\"\n alpha = self.nu / 2\n self._validate_params(for_partial_fit=True)\n return self._partial_fit(X, alpha, C=1.0, loss=self.loss, learning_rate=self.learning_rate, max_iter=1, sample_weight=sample_weight, coef_init=None, offset_init=None)" }, { @@ -112712,7 +120923,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -112722,13 +120934,17 @@ "docstring": { "type": "{array-like, sparse matrix}, shape (n_samples, n_features)", "description": "Testing data." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } } ], "results": [], "is_public": true, "description": "Return labels (1 inlier, -1 outlier) of the samples.", - "docstring": "Return labels (1 inlier, -1 outlier) of the samples.\n\nParameters\n----------\nX : {array-like, sparse matrix}, shape (n_samples, n_features)\n Testing data.\n\nReturns\n-------\ny : array, shape (n_samples,)\n Labels of the samples.", + "docstring": "Return labels (1 inlier, -1 outlier) of the samples.\n\n Parameters\n ----------\n X : {array-like, sparse matrix}, shape (n_samples, n_features)\n Testing data.\n\n Returns\n -------\n y : array, shape (n_samples,)\n Labels of the samples.\n ", "source_code": "\ndef predict(self, X):\n \"\"\"Return labels (1 inlier, -1 outlier) of the samples.\n\n Parameters\n ----------\n X : {array-like, sparse matrix}, shape (n_samples, n_features)\n Testing data.\n\n Returns\n -------\n y : array, shape (n_samples,)\n Labels of the samples.\n \"\"\"\n y = (self.decision_function(X) >= 0).astype(np.int32)\n y[y == 0] = -1\n return y" }, { @@ -112746,7 +120962,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -112756,13 +120973,17 @@ "docstring": { "type": "{array-like, sparse matrix}, shape (n_samples, n_features)", "description": "Testing data." 
+ }, + "refined_type": { + "kind": "EnumType", + "values": [] } } ], "results": [], "is_public": true, "description": "Raw scoring function of the samples.", - "docstring": "Raw scoring function of the samples.\n\nParameters\n----------\nX : {array-like, sparse matrix}, shape (n_samples, n_features)\n Testing data.\n\nReturns\n-------\nscore_samples : array-like, shape (n_samples,)\n Unshiffted scoring function values of the samples.", + "docstring": "Raw scoring function of the samples.\n\n Parameters\n ----------\n X : {array-like, sparse matrix}, shape (n_samples, n_features)\n Testing data.\n\n Returns\n -------\n score_samples : array-like, shape (n_samples,)\n Unshiffted scoring function values of the samples.\n ", "source_code": "\ndef score_samples(self, X):\n \"\"\"Raw scoring function of the samples.\n\n Parameters\n ----------\n X : {array-like, sparse matrix}, shape (n_samples, n_features)\n Testing data.\n\n Returns\n -------\n score_samples : array-like, shape (n_samples,)\n Unshiffted scoring function values of the samples.\n \"\"\"\n score_samples = self.decision_function(X) + self.offset_\n return score_samples" }, { @@ -112780,7 +121001,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "loss", @@ -112790,7 +121012,8 @@ "docstring": { "type": "str, default='squared_error'", "description": "The loss function to be used. The possible values are 'squared_error',\n'huber', 'epsilon_insensitive', or 'squared_epsilon_insensitive'\n\nThe 'squared_error' refers to the ordinary least squares fit.\n'huber' modifies 'squared_error' to focus less on getting outliers\ncorrect by switching from squared to linear loss past a distance of\nepsilon. 'epsilon_insensitive' ignores errors less than epsilon and is\nlinear past that; this is the loss function used in SVR.\n'squared_epsilon_insensitive' is the same but becomes squared loss past\na tolerance of epsilon.\n\nMore details about the losses formulas can be found in the\n:ref:`User Guide `.\n\n.. deprecated:: 1.0\n The loss 'squared_loss' was deprecated in v1.0 and will be removed\n in version 1.2. Use `loss='squared_error'` which is equivalent." - } + }, + "refined_type": {} }, { "name": "penalty", @@ -112800,6 +121023,10 @@ "docstring": { "type": "{'l2', 'l1', 'elasticnet'}, default='l2'", "description": "The penalty (aka regularization term) to be used. Defaults to 'l2'\nwhich is the standard regularizer for linear SVM models. 'l1' and\n'elasticnet' might bring sparsity to the model (feature selection)\nnot achievable with 'l2'." + }, + "refined_type": { + "kind": "EnumType", + "values": ["l2", "l1", "elasticnet"] } }, { @@ -112810,7 +121037,8 @@ "docstring": { "type": "float, default=0.0001", "description": "Constant that multiplies the regularization term. The higher the\nvalue, the stronger the regularization.\nAlso used to compute the learning rate when set to `learning_rate` is\nset to 'optimal'." - } + }, + "refined_type": {} }, { "name": "l1_ratio", @@ -112820,7 +121048,8 @@ "docstring": { "type": "float, default=0.15", "description": "The Elastic Net mixing parameter, with 0 <= l1_ratio <= 1.\nl1_ratio=0 corresponds to L2 penalty, l1_ratio=1 to L1.\nOnly used if `penalty` is 'elasticnet'." - } + }, + "refined_type": {} }, { "name": "fit_intercept", @@ -112830,7 +121059,8 @@ "docstring": { "type": "bool, default=True", "description": "Whether the intercept should be estimated or not. If False, the\ndata is assumed to be already centered." 
- } + }, + "refined_type": {} }, { "name": "max_iter", @@ -112840,7 +121070,8 @@ "docstring": { "type": "int, default=1000", "description": "The maximum number of passes over the training data (aka epochs).\nIt only impacts the behavior in the ``fit`` method, and not the\n:meth:`partial_fit` method.\n\n.. versionadded:: 0.19" - } + }, + "refined_type": {} }, { "name": "tol", @@ -112850,7 +121081,8 @@ "docstring": { "type": "float, default=1e-3", "description": "The stopping criterion. If it is not None, training will stop\nwhen (loss > best_loss - tol) for ``n_iter_no_change`` consecutive\nepochs.\nConvergence is checked against the training loss or the\nvalidation loss depending on the `early_stopping` parameter.\n\n.. versionadded:: 0.19" - } + }, + "refined_type": {} }, { "name": "shuffle", @@ -112860,7 +121092,8 @@ "docstring": { "type": "bool, default=True", "description": "Whether or not the training data should be shuffled after each epoch." - } + }, + "refined_type": {} }, { "name": "verbose", @@ -112870,7 +121103,8 @@ "docstring": { "type": "int, default=0", "description": "The verbosity level." - } + }, + "refined_type": {} }, { "name": "epsilon", @@ -112880,7 +121114,8 @@ "docstring": { "type": "float, default=0.1", "description": "Epsilon in the epsilon-insensitive loss functions; only if `loss` is\n'huber', 'epsilon_insensitive', or 'squared_epsilon_insensitive'.\nFor 'huber', determines the threshold at which it becomes less\nimportant to get the prediction exactly right.\nFor epsilon-insensitive, any differences between the current prediction\nand the correct label are ignored if they are less than this threshold." - } + }, + "refined_type": {} }, { "name": "random_state", @@ -112890,7 +121125,8 @@ "docstring": { "type": "int, RandomState instance, default=None", "description": "Used for shuffling the data, when ``shuffle`` is set to ``True``.\nPass an int for reproducible output across multiple function calls.\nSee :term:`Glossary `." - } + }, + "refined_type": {} }, { "name": "learning_rate", @@ -112900,7 +121136,8 @@ "docstring": { "type": "str, default='invscaling'", "description": "The learning rate schedule:\n\n- 'constant': `eta = eta0`\n- 'optimal': `eta = 1.0 / (alpha * (t + t0))`\n where t0 is chosen by a heuristic proposed by Leon Bottou.\n- 'invscaling': `eta = eta0 / pow(t, power_t)`\n- 'adaptive': eta = eta0, as long as the training keeps decreasing.\n Each time n_iter_no_change consecutive epochs fail to decrease the\n training loss by tol or fail to increase validation score by tol if\n early_stopping is True, the current learning rate is divided by 5.\n\n .. versionadded:: 0.20\n Added 'adaptive' option" - } + }, + "refined_type": {} }, { "name": "eta0", @@ -112908,9 +121145,10 @@ "is_public": true, "assigned_by": "NAME_ONLY", "docstring": { - "type": "double, default=0.01", + "type": "float, default=0.01", "description": "The initial learning rate for the 'constant', 'invscaling' or\n'adaptive' schedules. The default value is 0.01." - } + }, + "refined_type": {} }, { "name": "power_t", @@ -112918,9 +121156,10 @@ "is_public": true, "assigned_by": "NAME_ONLY", "docstring": { - "type": "double, default=0.25", + "type": "float, default=0.25", "description": "The exponent for inverse scaling learning rate." - } + }, + "refined_type": {} }, { "name": "early_stopping", @@ -112930,7 +121169,8 @@ "docstring": { "type": "bool, default=False", "description": "Whether to use early stopping to terminate training when validation\nscore is not improving. 
If set to True, it will automatically set aside\na fraction of training data as validation and terminate\ntraining when validation score returned by the `score` method is not\nimproving by at least `tol` for `n_iter_no_change` consecutive\nepochs.\n\n.. versionadded:: 0.20\n Added 'early_stopping' option" - } + }, + "refined_type": {} }, { "name": "validation_fraction", @@ -112940,7 +121180,8 @@ "docstring": { "type": "float, default=0.1", "description": "The proportion of training data to set aside as validation set for\nearly stopping. Must be between 0 and 1.\nOnly used if `early_stopping` is True.\n\n.. versionadded:: 0.20\n Added 'validation_fraction' option" - } + }, + "refined_type": {} }, { "name": "n_iter_no_change", @@ -112950,7 +121191,8 @@ "docstring": { "type": "int, default=5", "description": "Number of iterations with no improvement to wait before stopping\nfitting.\nConvergence is checked against the training loss or the\nvalidation loss depending on the `early_stopping` parameter.\n\n.. versionadded:: 0.20\n Added 'n_iter_no_change' option" - } + }, + "refined_type": {} }, { "name": "warm_start", @@ -112960,7 +121202,8 @@ "docstring": { "type": "bool, default=False", "description": "When set to True, reuse the solution of the previous call to fit as\ninitialization, otherwise, just erase the previous solution.\nSee :term:`the Glossary `.\n\nRepeatedly calling fit or partial_fit when warm_start is True can\nresult in a different solution than when calling fit a single time\nbecause of the way the data is shuffled.\nIf a dynamic learning rate is used, the learning rate is adapted\ndepending on the number of samples already seen. Calling ``fit`` resets\nthis counter, while ``partial_fit`` will result in increasing the\nexisting counter." - } + }, + "refined_type": {} }, { "name": "average", @@ -112970,13 +121213,14 @@ "docstring": { "type": "bool or int, default=False", "description": "When set to True, computes the averaged SGD weights across all\nupdates and stores the result in the ``coef_`` attribute. If set to\nan int greater than 1, averaging will begin once the total number of\nsamples seen reaches `average`. So ``average=10`` will begin\naveraging after seeing 10 samples." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, loss='squared_error', *, penalty='l2', alpha=0.0001, l1_ratio=0.15, fit_intercept=True, max_iter=1000, tol=0.001, shuffle=True, verbose=0, epsilon=DEFAULT_EPSILON, random_state=None, learning_rate='invscaling', eta0=0.01, power_t=0.25, early_stopping=False, validation_fraction=0.1, n_iter_no_change=5, warm_start=False, average=False):\n super().__init__(loss=loss, penalty=penalty, alpha=alpha, l1_ratio=l1_ratio, fit_intercept=fit_intercept, max_iter=max_iter, tol=tol, shuffle=shuffle, verbose=verbose, epsilon=epsilon, random_state=random_state, learning_rate=learning_rate, eta0=eta0, power_t=power_t, early_stopping=early_stopping, validation_fraction=validation_fraction, n_iter_no_change=n_iter_no_change, warm_start=warm_start, average=average)" }, { @@ -112994,13 +121238,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _more_tags(self):\n return {'_xfail_checks': {'check_sample_weights_invariance': 'zero sample_weight is not equivalent to removing samples'}}" }, { @@ -113018,7 +121263,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "coef", @@ -113028,7 +121274,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "intercept", @@ -113038,13 +121285,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __call__(self, coef, intercept):\n est = self.estimator\n est.coef_ = coef.reshape(1, -1)\n est.intercept_ = np.atleast_1d(intercept)\n return est.score(self.X_val, self.y_val, self.sample_weight_val)" }, { @@ -113062,7 +121310,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "estimator", @@ -113072,7 +121321,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X_val", @@ -113082,7 +121332,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y_val", @@ -113092,7 +121343,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "sample_weight_val", @@ -113102,7 +121354,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "classes", @@ -113112,13 +121365,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, estimator, X_val, y_val, sample_weight_val, classes=None):\n self.estimator = clone(estimator)\n self.estimator.t_ = 1\n if classes is not None:\n self.estimator.classes_ = classes\n self.X_val = X_val\n self.y_val = y_val\n self.sample_weight_val = sample_weight_val" }, { @@ -113136,7 +121390,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -113146,7 +121401,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "i", @@ -113156,13 +121412,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Initialization for 
fit_binary.\n\nReturns y, coef, intercept, average_coef, average_intercept.", - "docstring": "Initialization for fit_binary.\n\nReturns y, coef, intercept, average_coef, average_intercept.", + "docstring": "Initialization for fit_binary.\n\n Returns y, coef, intercept, average_coef, average_intercept.\n ", "source_code": "\ndef _prepare_fit_binary(est, y, i):\n \"\"\"Initialization for fit_binary.\n\n Returns y, coef, intercept, average_coef, average_intercept.\n \"\"\"\n y_i = np.ones(y.shape, dtype=np.float64, order='C')\n y_i[y != est.classes_[i]] = -1.0\n average_intercept = 0\n average_coef = None\n if len(est.classes_) == 2:\n if not est.average:\n coef = est.coef_.ravel()\n intercept = est.intercept_[0]\n else:\n coef = est._standard_coef.ravel()\n intercept = est._standard_intercept[0]\n average_coef = est._average_coef.ravel()\n average_intercept = est._average_intercept[0]\n elif not est.average:\n coef = est.coef_[i]\n intercept = est.intercept_[i]\n else:\n coef = est._standard_coef[i]\n intercept = est._standard_intercept[i]\n average_coef = est._average_coef[i]\n average_intercept = est._average_intercept[i]\n return y_i, coef, intercept, average_coef, average_intercept" }, { @@ -113180,7 +121437,8 @@ "docstring": { "type": "Estimator object", "description": "The estimator to fit" - } + }, + "refined_type": {} }, { "name": "i", @@ -113190,7 +121448,8 @@ "docstring": { "type": "int", "description": "Index of the positive class" - } + }, + "refined_type": {} }, { "name": "X", @@ -113200,7 +121459,8 @@ "docstring": { "type": "numpy array or sparse matrix of shape [n_samples,n_features]", "description": "Training data" - } + }, + "refined_type": {} }, { "name": "y", @@ -113210,7 +121470,8 @@ "docstring": { "type": "numpy array of shape [n_samples, ]", "description": "Target values" - } + }, + "refined_type": {} }, { "name": "alpha", @@ -113220,7 +121481,8 @@ "docstring": { "type": "float", "description": "The regularization parameter" - } + }, + "refined_type": {} }, { "name": "C", @@ -113230,7 +121492,8 @@ "docstring": { "type": "float", "description": "Maximum step size for passive aggressive" - } + }, + "refined_type": {} }, { "name": "learning_rate", @@ -113240,7 +121503,8 @@ "docstring": { "type": "str", "description": "The learning rate. Accepted values are 'constant', 'optimal',\n'invscaling', 'pa1' and 'pa2'." - } + }, + "refined_type": {} }, { "name": "max_iter", @@ -113250,7 +121514,8 @@ "docstring": { "type": "int", "description": "The maximum number of iterations (epochs)" - } + }, + "refined_type": {} }, { "name": "pos_weight", @@ -113260,7 +121525,8 @@ "docstring": { "type": "float", "description": "The weight of the positive class" - } + }, + "refined_type": {} }, { "name": "neg_weight", @@ -113270,7 +121536,8 @@ "docstring": { "type": "float", "description": "The weight of the negative class" - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -113280,7 +121547,8 @@ "docstring": { "type": "numpy array of shape [n_samples, ]", "description": "The weight of each sample" - } + }, + "refined_type": {} }, { "name": "validation_mask", @@ -113290,7 +121558,8 @@ "docstring": { "type": "numpy array of shape [n_samples, ], default=None", "description": "Precomputed validation mask in case _fit_binary is called in the\ncontext of a one-vs-rest reduction." 
- } + }, + "refined_type": {} }, { "name": "random_state", @@ -113300,13 +121569,14 @@ "docstring": { "type": "int, RandomState instance, default=None", "description": "If int, random_state is the seed used by the random number generator;\nIf RandomState instance, random_state is the random number generator;\nIf None, the random number generator is the RandomState instance used\nby `np.random`." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Fit a single binary classifier.\n\nThe i'th class is considered the \"positive\" class.", - "docstring": "Fit a single binary classifier.\n\nThe i'th class is considered the \"positive\" class.\n\nParameters\n----------\nest : Estimator object\n The estimator to fit\n\ni : int\n Index of the positive class\n\nX : numpy array or sparse matrix of shape [n_samples,n_features]\n Training data\n\ny : numpy array of shape [n_samples, ]\n Target values\n\nalpha : float\n The regularization parameter\n\nC : float\n Maximum step size for passive aggressive\n\nlearning_rate : str\n The learning rate. Accepted values are 'constant', 'optimal',\n 'invscaling', 'pa1' and 'pa2'.\n\nmax_iter : int\n The maximum number of iterations (epochs)\n\npos_weight : float\n The weight of the positive class\n\nneg_weight : float\n The weight of the negative class\n\nsample_weight : numpy array of shape [n_samples, ]\n The weight of each sample\n\nvalidation_mask : numpy array of shape [n_samples, ], default=None\n Precomputed validation mask in case _fit_binary is called in the\n context of a one-vs-rest reduction.\n\nrandom_state : int, RandomState instance, default=None\n If int, random_state is the seed used by the random number generator;\n If RandomState instance, random_state is the random number generator;\n If None, the random number generator is the RandomState instance used\n by `np.random`.", + "docstring": "Fit a single binary classifier.\n\n The i'th class is considered the \"positive\" class.\n\n Parameters\n ----------\n est : Estimator object\n The estimator to fit\n\n i : int\n Index of the positive class\n\n X : numpy array or sparse matrix of shape [n_samples,n_features]\n Training data\n\n y : numpy array of shape [n_samples, ]\n Target values\n\n alpha : float\n The regularization parameter\n\n C : float\n Maximum step size for passive aggressive\n\n learning_rate : str\n The learning rate. 
Accepted values are 'constant', 'optimal',\n 'invscaling', 'pa1' and 'pa2'.\n\n max_iter : int\n The maximum number of iterations (epochs)\n\n pos_weight : float\n The weight of the positive class\n\n neg_weight : float\n The weight of the negative class\n\n sample_weight : numpy array of shape [n_samples, ]\n The weight of each sample\n\n validation_mask : numpy array of shape [n_samples, ], default=None\n Precomputed validation mask in case _fit_binary is called in the\n context of a one-vs-rest reduction.\n\n random_state : int, RandomState instance, default=None\n If int, random_state is the seed used by the random number generator;\n If RandomState instance, random_state is the random number generator;\n If None, the random number generator is the RandomState instance used\n by `np.random`.\n ", "source_code": "\ndef fit_binary(est, i, X, y, alpha, C, learning_rate, max_iter, pos_weight, neg_weight, sample_weight, validation_mask=None, random_state=None):\n \"\"\"Fit a single binary classifier.\n\n The i'th class is considered the \"positive\" class.\n\n Parameters\n ----------\n est : Estimator object\n The estimator to fit\n\n i : int\n Index of the positive class\n\n X : numpy array or sparse matrix of shape [n_samples,n_features]\n Training data\n\n y : numpy array of shape [n_samples, ]\n Target values\n\n alpha : float\n The regularization parameter\n\n C : float\n Maximum step size for passive aggressive\n\n learning_rate : str\n The learning rate. Accepted values are 'constant', 'optimal',\n 'invscaling', 'pa1' and 'pa2'.\n\n max_iter : int\n The maximum number of iterations (epochs)\n\n pos_weight : float\n The weight of the positive class\n\n neg_weight : float\n The weight of the negative class\n\n sample_weight : numpy array of shape [n_samples, ]\n The weight of each sample\n\n validation_mask : numpy array of shape [n_samples, ], default=None\n Precomputed validation mask in case _fit_binary is called in the\n context of a one-vs-rest reduction.\n\n random_state : int, RandomState instance, default=None\n If int, random_state is the seed used by the random number generator;\n If RandomState instance, random_state is the random number generator;\n If None, the random number generator is the RandomState instance used\n by `np.random`.\n \"\"\"\n (y_i, coef, intercept, average_coef, average_intercept) = _prepare_fit_binary(est, y, i)\n assert y_i.shape[0] == y.shape[0] == sample_weight.shape[0]\n random_state = check_random_state(random_state)\n (dataset, intercept_decay) = make_dataset(X, y_i, sample_weight, random_state=random_state)\n penalty_type = est._get_penalty_type(est.penalty)\n learning_rate_type = est._get_learning_rate_type(learning_rate)\n if validation_mask is None:\n validation_mask = est._make_validation_split(y_i)\n classes = np.array([-1, 1], dtype=y_i.dtype)\n validation_score_cb = est._make_validation_score_cb(validation_mask, X, y_i, sample_weight, classes=classes)\n seed = random_state.randint(MAX_INT)\n tol = est.tol if est.tol is not None else -np.inf\n (coef, intercept, average_coef, average_intercept, n_iter_) = _plain_sgd(coef, intercept, average_coef, average_intercept, est.loss_function_, penalty_type, alpha, C, est.l1_ratio, dataset, validation_mask, est.early_stopping, validation_score_cb, int(est.n_iter_no_change), max_iter, tol, int(est.fit_intercept), int(est.verbose), int(est.shuffle), seed, pos_weight, neg_weight, learning_rate_type, est.eta0, est.power_t, 0, est.t_, intercept_decay, est.average)\n if est.average:\n if len(est.classes_) 
== 2:\n est._average_intercept[0] = average_intercept\n else:\n est._average_intercept[i] = average_intercept\n return coef, intercept, n_iter_" }, { @@ -113324,7 +121594,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "fit_intercept", @@ -113334,7 +121605,8 @@ "docstring": { "type": "bool, default=True", "description": "Whether to calculate the intercept for this model. If set\nto false, no intercept will be used in calculations." - } + }, + "refined_type": {} }, { "name": "copy_X", @@ -113344,7 +121616,8 @@ "docstring": { "type": "bool, default=True", "description": "If True, X will be copied; else, it may be overwritten." - } + }, + "refined_type": {} }, { "name": "max_subpopulation", @@ -113354,7 +121627,8 @@ "docstring": { "type": "int, default=1e4", "description": "Instead of computing with a set of cardinality 'n choose k', where n is\nthe number of samples and k is the number of subsamples (at least\nnumber of features), consider only a stochastic subpopulation of a\ngiven maximal size if 'n choose k' is larger than max_subpopulation.\nFor other than small problem sizes this parameter will determine\nmemory usage and runtime if n_subsamples is not changed." - } + }, + "refined_type": {} }, { "name": "n_subsamples", @@ -113364,7 +121638,8 @@ "docstring": { "type": "int, default=None", "description": "Number of samples to calculate the parameters. This is at least the\nnumber of features (plus 1 if fit_intercept=True) and the number of\nsamples as a maximum. A lower number leads to a higher breakdown\npoint and a low efficiency while a high number leads to a low\nbreakdown point and a high efficiency. If None, take the\nminimum number of subsamples leading to maximal robustness.\nIf n_subsamples is set to n_samples, Theil-Sen is identical to least\nsquares." - } + }, + "refined_type": {} }, { "name": "max_iter", @@ -113374,7 +121649,8 @@ "docstring": { "type": "int, default=300", "description": "Maximum number of iterations for the calculation of spatial median." - } + }, + "refined_type": {} }, { "name": "tol", @@ -113382,9 +121658,10 @@ "is_public": true, "assigned_by": "NAME_ONLY", "docstring": { - "type": "float, default=1.e-3", + "type": "float, default=1e-3", "description": "Tolerance when calculating spatial median." - } + }, + "refined_type": {} }, { "name": "random_state", @@ -113394,7 +121671,8 @@ "docstring": { "type": "int, RandomState instance or None, default=None", "description": "A random number generator instance to define the state of the random\npermutations generator. Pass an int for reproducible output across\nmultiple function calls.\nSee :term:`Glossary `." - } + }, + "refined_type": {} }, { "name": "n_jobs", @@ -113404,7 +121682,8 @@ "docstring": { "type": "int, default=None", "description": "Number of CPUs to use during the cross validation.\n``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n``-1`` means using all processors. See :term:`Glossary `\nfor more details." - } + }, + "refined_type": {} }, { "name": "verbose", @@ -113414,13 +121693,14 @@ "docstring": { "type": "bool, default=False", "description": "Verbose mode when fitting the model." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, *, fit_intercept=True, copy_X=True, max_subpopulation=10000.0, n_subsamples=None, max_iter=300, tol=0.001, random_state=None, n_jobs=None, verbose=False):\n self.fit_intercept = fit_intercept\n self.copy_X = copy_X\n self.max_subpopulation = int(max_subpopulation)\n self.n_subsamples = n_subsamples\n self.max_iter = max_iter\n self.tol = tol\n self.random_state = random_state\n self.n_jobs = n_jobs\n self.verbose = verbose" }, { @@ -113438,7 +121718,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_samples", @@ -113448,7 +121729,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_features", @@ -113458,13 +121740,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _check_subparams(self, n_samples, n_features):\n n_subsamples = self.n_subsamples\n if self.fit_intercept:\n n_dim = n_features + 1\n else:\n n_dim = n_features\n if n_subsamples is not None:\n if n_subsamples > n_samples:\n raise ValueError('Invalid parameter since n_subsamples > n_samples ({0} > {1}).'.format(n_subsamples, n_samples))\n if n_samples >= n_features:\n if n_dim > n_subsamples:\n plus_1 = '+1' if self.fit_intercept else ''\n raise ValueError('Invalid parameter since n_features{0} > n_subsamples ({1} > {2}).'.format(plus_1, n_dim, n_samples))\n elif n_subsamples != n_samples:\n raise ValueError('Invalid parameter since n_subsamples != n_samples ({0} != {1}) while n_samples < n_features.'.format(n_subsamples, n_samples))\n else:\n n_subsamples = min(n_dim, n_samples)\n if self.max_subpopulation <= 0:\n raise ValueError('Subpopulation must be strictly positive ({0} <= 0).'.format(self.max_subpopulation))\n all_combinations = max(1, np.rint(binom(n_samples, n_subsamples)))\n n_subpopulation = int(min(self.max_subpopulation, all_combinations))\n return n_subsamples, n_subpopulation" }, { @@ -113482,7 +121765,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -113492,7 +121776,8 @@ "docstring": { "type": "ndarray of shape (n_samples, n_features)", "description": "Training data." - } + }, + "refined_type": {} }, { "name": "y", @@ -113502,13 +121787,14 @@ "docstring": { "type": "ndarray of shape (n_samples,)", "description": "Target values." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Fit linear model.", - "docstring": "Fit linear model.\n\nParameters\n----------\nX : ndarray of shape (n_samples, n_features)\n Training data.\ny : ndarray of shape (n_samples,)\n Target values.\n\nReturns\n-------\nself : returns an instance of self.\n Fitted `TheilSenRegressor` estimator.", + "docstring": "Fit linear model.\n\n Parameters\n ----------\n X : ndarray of shape (n_samples, n_features)\n Training data.\n y : ndarray of shape (n_samples,)\n Target values.\n\n Returns\n -------\n self : returns an instance of self.\n Fitted `TheilSenRegressor` estimator.\n ", "source_code": "\ndef fit(self, X, y):\n \"\"\"Fit linear model.\n\n Parameters\n ----------\n X : ndarray of shape (n_samples, n_features)\n Training data.\n y : ndarray of shape (n_samples,)\n Target values.\n\n Returns\n -------\n self : returns an instance of self.\n Fitted `TheilSenRegressor` estimator.\n \"\"\"\n random_state = check_random_state(self.random_state)\n (X, y) = self._validate_data(X, y, y_numeric=True)\n (n_samples, n_features) = X.shape\n (n_subsamples, self.n_subpopulation_) = self._check_subparams(n_samples, n_features)\n self.breakdown_ = _breakdown_point(n_samples, n_subsamples)\n if self.verbose:\n print('Breakdown point: {0}'.format(self.breakdown_))\n print('Number of samples: {0}'.format(n_samples))\n tol_outliers = int(self.breakdown_ * n_samples)\n print('Tolerable outliers: {0}'.format(tol_outliers))\n print('Number of subpopulations: {0}'.format(self.n_subpopulation_))\n if np.rint(binom(n_samples, n_subsamples)) <= self.max_subpopulation:\n indices = list(combinations(range(n_samples), n_subsamples))\n else:\n indices = [random_state.choice(n_samples, size=n_subsamples, replace=False) for _ in range(self.n_subpopulation_)]\n n_jobs = effective_n_jobs(self.n_jobs)\n index_list = np.array_split(indices, n_jobs)\n weights = Parallel(n_jobs=n_jobs, verbose=self.verbose)((delayed(_lstsq)(X, y, index_list[job], self.fit_intercept) for job in range(n_jobs)))\n weights = np.vstack(weights)\n (self.n_iter_, coefs) = _spatial_median(weights, max_iter=self.max_iter, tol=self.tol)\n if self.fit_intercept:\n self.intercept_ = coefs[0]\n self.coef_ = coefs[1:]\n else:\n self.intercept_ = 0.0\n self.coef_ = coefs\n return self" }, { @@ -113526,7 +121812,8 @@ "docstring": { "type": "int", "description": "Number of samples." - } + }, + "refined_type": {} }, { "name": "n_subsamples", @@ -113536,13 +121823,14 @@ "docstring": { "type": "int", "description": "Number of subsamples to consider." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Approximation of the breakdown point.", - "docstring": "Approximation of the breakdown point.\n\nParameters\n----------\nn_samples : int\n Number of samples.\n\nn_subsamples : int\n Number of subsamples to consider.\n\nReturns\n-------\nbreakdown_point : float\n Approximation of breakdown point.", + "docstring": "Approximation of the breakdown point.\n\n Parameters\n ----------\n n_samples : int\n Number of samples.\n\n n_subsamples : int\n Number of subsamples to consider.\n\n Returns\n -------\n breakdown_point : float\n Approximation of breakdown point.\n ", "source_code": "\ndef _breakdown_point(n_samples, n_subsamples):\n \"\"\"Approximation of the breakdown point.\n\n Parameters\n ----------\n n_samples : int\n Number of samples.\n\n n_subsamples : int\n Number of subsamples to consider.\n\n Returns\n -------\n breakdown_point : float\n Approximation of breakdown point.\n \"\"\"\n return 1 - (0.5**(1 / n_subsamples) * (n_samples - n_subsamples + 1) + n_subsamples - 1) / n_samples" }, { @@ -113560,7 +121848,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "Design matrix, where `n_samples` is the number of samples and\n`n_features` is the number of features." - } + }, + "refined_type": {} }, { "name": "y", @@ -113570,7 +121859,8 @@ "docstring": { "type": "ndarray of shape (n_samples,)", "description": "Target vector, where `n_samples` is the number of samples." - } + }, + "refined_type": {} }, { "name": "indices", @@ -113580,7 +121870,8 @@ "docstring": { "type": "ndarray of shape (n_subpopulation, n_subsamples)", "description": "Indices of all subsamples with respect to the chosen subpopulation." - } + }, + "refined_type": {} }, { "name": "fit_intercept", @@ -113590,13 +121881,14 @@ "docstring": { "type": "bool", "description": "Fit intercept or not." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Least Squares Estimator for TheilSenRegressor class.\n\nThis function calculates the least squares method on a subset of rows of X and y defined by the indices array. Optionally, an intercept column is added if intercept is set to true.", - "docstring": "Least Squares Estimator for TheilSenRegressor class.\n\nThis function calculates the least squares method on a subset of rows of X\nand y defined by the indices array. Optionally, an intercept column is\nadded if intercept is set to true.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n Design matrix, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\ny : ndarray of shape (n_samples,)\n Target vector, where `n_samples` is the number of samples.\n\nindices : ndarray of shape (n_subpopulation, n_subsamples)\n Indices of all subsamples with respect to the chosen subpopulation.\n\nfit_intercept : bool\n Fit intercept or not.\n\nReturns\n-------\nweights : ndarray of shape (n_subpopulation, n_features + intercept)\n Solution matrix of n_subpopulation solved least square problems.", + "description": "Least Squares Estimator for TheilSenRegressor class.\n\nThis function calculates the least squares method on a subset of rows of X\nand y defined by the indices array. 
Optionally, an intercept column is\nadded if intercept is set to true.", + "docstring": "Least Squares Estimator for TheilSenRegressor class.\n\n This function calculates the least squares method on a subset of rows of X\n and y defined by the indices array. Optionally, an intercept column is\n added if intercept is set to true.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Design matrix, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\n y : ndarray of shape (n_samples,)\n Target vector, where `n_samples` is the number of samples.\n\n indices : ndarray of shape (n_subpopulation, n_subsamples)\n Indices of all subsamples with respect to the chosen subpopulation.\n\n fit_intercept : bool\n Fit intercept or not.\n\n Returns\n -------\n weights : ndarray of shape (n_subpopulation, n_features + intercept)\n Solution matrix of n_subpopulation solved least square problems.\n ", "source_code": "\ndef _lstsq(X, y, indices, fit_intercept):\n \"\"\"Least Squares Estimator for TheilSenRegressor class.\n\n This function calculates the least squares method on a subset of rows of X\n and y defined by the indices array. Optionally, an intercept column is\n added if intercept is set to true.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Design matrix, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\n y : ndarray of shape (n_samples,)\n Target vector, where `n_samples` is the number of samples.\n\n indices : ndarray of shape (n_subpopulation, n_subsamples)\n Indices of all subsamples with respect to the chosen subpopulation.\n\n fit_intercept : bool\n Fit intercept or not.\n\n Returns\n -------\n weights : ndarray of shape (n_subpopulation, n_features + intercept)\n Solution matrix of n_subpopulation solved least square problems.\n \"\"\"\n fit_intercept = int(fit_intercept)\n n_features = X.shape[1] + fit_intercept\n n_subsamples = indices.shape[1]\n weights = np.empty((indices.shape[0], n_features))\n X_subpopulation = np.ones((n_subsamples, n_features))\n y_subpopulation = np.zeros(max(n_subsamples, n_features))\n (lstsq, ) = get_lapack_funcs(('gelss', ), (X_subpopulation, y_subpopulation))\n for (index, subset) in enumerate(indices):\n X_subpopulation[:, fit_intercept:] = X[subset, :]\n y_subpopulation[:n_subsamples] = y[subset]\n weights[index] = lstsq(X_subpopulation, y_subpopulation)[1][:n_features]\n return weights" }, { @@ -113614,7 +121906,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "Training vector, where `n_samples` is the number of samples and\n`n_features` is the number of features." - } + }, + "refined_type": {} }, { "name": "x_old", @@ -113624,13 +121917,14 @@ "docstring": { "type": "ndarray of shape = (n_features,)", "description": "Current start vector." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Modified Weiszfeld step.\n\nThis function defines one iteration step in order to approximate the spatial median (L1 median). It is a form of an iteratively re-weighted least squares method.", - "docstring": "Modified Weiszfeld step.\n\nThis function defines one iteration step in order to approximate the\nspatial median (L1 median). 
It is a form of an iteratively re-weighted\nleast squares method.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n Training vector, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\nx_old : ndarray of shape = (n_features,)\n Current start vector.\n\nReturns\n-------\nx_new : ndarray of shape (n_features,)\n New iteration step.\n\nReferences\n----------\n- On Computation of Spatial Median for Robust Data Mining, 2005\n T. K\u00e4rkk\u00e4inen and S. \u00c4yr\u00e4m\u00f6\n http://users.jyu.fi/~samiayr/pdf/ayramo_eurogen05.pdf", + "description": "Modified Weiszfeld step.\n\nThis function defines one iteration step in order to approximate the\nspatial median (L1 median). It is a form of an iteratively re-weighted\nleast squares method.", + "docstring": "Modified Weiszfeld step.\n\n This function defines one iteration step in order to approximate the\n spatial median (L1 median). It is a form of an iteratively re-weighted\n least squares method.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training vector, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\n x_old : ndarray of shape = (n_features,)\n Current start vector.\n\n Returns\n -------\n x_new : ndarray of shape (n_features,)\n New iteration step.\n\n References\n ----------\n - On Computation of Spatial Median for Robust Data Mining, 2005\n T. K\u00e4rkk\u00e4inen and S. \u00c4yr\u00e4m\u00f6\n http://users.jyu.fi/~samiayr/pdf/ayramo_eurogen05.pdf\n ", "source_code": "\ndef _modified_weiszfeld_step(X, x_old):\n \"\"\"Modified Weiszfeld step.\n\n This function defines one iteration step in order to approximate the\n spatial median (L1 median). It is a form of an iteratively re-weighted\n least squares method.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training vector, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\n x_old : ndarray of shape = (n_features,)\n Current start vector.\n\n Returns\n -------\n x_new : ndarray of shape (n_features,)\n New iteration step.\n\n References\n ----------\n - On Computation of Spatial Median for Robust Data Mining, 2005\n T. K\u00e4rkk\u00e4inen and S. \u00c4yr\u00e4m\u00f6\n http://users.jyu.fi/~samiayr/pdf/ayramo_eurogen05.pdf\n \"\"\"\n diff = X - x_old\n diff_norm = np.sqrt(np.sum(diff**2, axis=1))\n mask = diff_norm >= _EPSILON\n is_x_old_in_X = int(mask.sum() < X.shape[0])\n diff = diff[mask]\n diff_norm = diff_norm[mask][:, np.newaxis]\n quotient_norm = linalg.norm(np.sum(diff / diff_norm, axis=0))\n if quotient_norm > _EPSILON:\n new_direction = np.sum(X[mask, :] / diff_norm, axis=0) / np.sum(1 / diff_norm, axis=0)\n else:\n new_direction = 1.0\n quotient_norm = 1.0\n return max(0.0, 1.0 - is_x_old_in_X / quotient_norm) * new_direction + min(1.0, is_x_old_in_X / quotient_norm) * x_old" }, { @@ -113648,7 +121942,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "Training vector, where `n_samples` is the number of samples and\n`n_features` is the number of features." - } + }, + "refined_type": {} }, { "name": "max_iter", @@ -113658,7 +121953,8 @@ "docstring": { "type": "int, default=300", "description": "Maximum number of iterations." - } + }, + "refined_type": {} }, { "name": "tol", @@ -113668,13 +121964,14 @@ "docstring": { "type": "float, default=1.e-3", "description": "Stop the algorithm if spatial_median has converged." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Spatial median (L1 median).\n\nThe spatial median is member of a class of so-called M-estimators which are defined by an optimization problem. Given a number of p points in an n-dimensional space, the point x minimizing the sum of all distances to the p other points is called spatial median.", - "docstring": "Spatial median (L1 median).\n\nThe spatial median is member of a class of so-called M-estimators which\nare defined by an optimization problem. Given a number of p points in an\nn-dimensional space, the point x minimizing the sum of all distances to the\np other points is called spatial median.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n Training vector, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\nmax_iter : int, default=300\n Maximum number of iterations.\n\ntol : float, default=1.e-3\n Stop the algorithm if spatial_median has converged.\n\nReturns\n-------\nspatial_median : ndarray of shape = (n_features,)\n Spatial median.\n\nn_iter : int\n Number of iterations needed.\n\nReferences\n----------\n- On Computation of Spatial Median for Robust Data Mining, 2005\n T. K\u00e4rkk\u00e4inen and S. \u00c4yr\u00e4m\u00f6\n http://users.jyu.fi/~samiayr/pdf/ayramo_eurogen05.pdf", + "description": "Spatial median (L1 median).\n\nThe spatial median is member of a class of so-called M-estimators which\nare defined by an optimization problem. Given a number of p points in an\nn-dimensional space, the point x minimizing the sum of all distances to the\np other points is called spatial median.", + "docstring": "Spatial median (L1 median).\n\n The spatial median is member of a class of so-called M-estimators which\n are defined by an optimization problem. Given a number of p points in an\n n-dimensional space, the point x minimizing the sum of all distances to the\n p other points is called spatial median.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training vector, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\n max_iter : int, default=300\n Maximum number of iterations.\n\n tol : float, default=1.e-3\n Stop the algorithm if spatial_median has converged.\n\n Returns\n -------\n spatial_median : ndarray of shape = (n_features,)\n Spatial median.\n\n n_iter : int\n Number of iterations needed.\n\n References\n ----------\n - On Computation of Spatial Median for Robust Data Mining, 2005\n T. K\u00e4rkk\u00e4inen and S. \u00c4yr\u00e4m\u00f6\n http://users.jyu.fi/~samiayr/pdf/ayramo_eurogen05.pdf\n ", "source_code": "\ndef _spatial_median(X, max_iter=300, tol=0.001):\n \"\"\"Spatial median (L1 median).\n\n The spatial median is member of a class of so-called M-estimators which\n are defined by an optimization problem. 
Given a number of p points in an\n n-dimensional space, the point x minimizing the sum of all distances to the\n p other points is called spatial median.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training vector, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\n max_iter : int, default=300\n Maximum number of iterations.\n\n tol : float, default=1.e-3\n Stop the algorithm if spatial_median has converged.\n\n Returns\n -------\n spatial_median : ndarray of shape = (n_features,)\n Spatial median.\n\n n_iter : int\n Number of iterations needed.\n\n References\n ----------\n - On Computation of Spatial Median for Robust Data Mining, 2005\n T. K\u00e4rkk\u00e4inen and S. \u00c4yr\u00e4m\u00f6\n http://users.jyu.fi/~samiayr/pdf/ayramo_eurogen05.pdf\n \"\"\"\n if X.shape[1] == 1:\n return 1, np.median(X.ravel(), keepdims=True)\n tol **= 2\n spatial_median_old = np.mean(X, axis=0)\n for n_iter in range(max_iter):\n spatial_median = _modified_weiszfeld_step(X, spatial_median_old)\n if np.sum((spatial_median_old - spatial_median)**2) < tol:\n break\n else:\n spatial_median_old = spatial_median\n else:\n warnings.warn('Maximum number of iterations {max_iter} reached in spatial median for TheilSen regressor.'.format(max_iter=max_iter), ConvergenceWarning)\n return n_iter, spatial_median" }, { @@ -113692,7 +121989,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "top_path", @@ -113702,13 +122000,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef configuration(parent_package='', top_path=None):\n from numpy.distutils.misc_util import Configuration\n config = Configuration('linear_model', parent_package, top_path)\n libraries = []\n if os.name == 'posix':\n libraries.append('m')\n config.add_extension('_cd_fast', sources=['_cd_fast.pyx'], include_dirs=numpy.get_include(), libraries=libraries)\n config.add_extension('_sgd_fast', sources=['_sgd_fast.pyx'], include_dirs=numpy.get_include(), libraries=libraries)\n templates = ['sklearn/linear_model/_sag_fast.pyx.tp']\n gen_from_templates(templates)\n config.add_extension('_sag_fast', sources=['_sag_fast.pyx'], include_dirs=numpy.get_include())\n config.add_subpackage('tests')\n config.add_subpackage('_glm')\n config.add_subpackage('_glm/tests')\n return config" }, { @@ -113726,7 +122025,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_neighbors", @@ -113736,7 +122036,8 @@ "docstring": { "type": "int, default=5", "description": "Number of neighbors to consider for each point." - } + }, + "refined_type": {} }, { "name": "n_components", @@ -113746,7 +122047,8 @@ "docstring": { "type": "int, default=2", "description": "Number of coordinates for the manifold." - } + }, + "refined_type": {} }, { "name": "eigen_solver", @@ -113756,6 +122058,10 @@ "docstring": { "type": "{'auto', 'arpack', 'dense'}, default='auto'", "description": "'auto' : Attempt to choose the most efficient solver\nfor the given problem.\n\n'arpack' : Use Arnoldi decomposition to find the eigenvalues\nand eigenvectors.\n\n'dense' : Use a direct solver (i.e. LAPACK)\nfor the eigenvalue decomposition." 
+ }, + "refined_type": { + "kind": "EnumType", + "values": ["auto", "dense", "arpack"] } }, { @@ -113766,7 +122072,8 @@ "docstring": { "type": "float, default=0", "description": "Convergence tolerance passed to arpack or lobpcg.\nnot used if eigen_solver == 'dense'." - } + }, + "refined_type": {} }, { "name": "max_iter", @@ -113776,7 +122083,8 @@ "docstring": { "type": "int, default=None", "description": "Maximum number of iterations for the arpack solver.\nnot used if eigen_solver == 'dense'." - } + }, + "refined_type": {} }, { "name": "path_method", @@ -113786,6 +122094,10 @@ "docstring": { "type": "{'auto', 'FW', 'D'}, default='auto'", "description": "Method to use in finding shortest path.\n\n'auto' : attempt to choose the best algorithm automatically.\n\n'FW' : Floyd-Warshall algorithm.\n\n'D' : Dijkstra's algorithm." + }, + "refined_type": { + "kind": "EnumType", + "values": ["auto", "FW", "D"] } }, { @@ -113796,6 +122108,10 @@ "docstring": { "type": "{'auto', 'brute', 'kd_tree', 'ball_tree'}, default='auto'", "description": "Algorithm to use for nearest neighbors search,\npassed to neighbors.NearestNeighbors instance." + }, + "refined_type": { + "kind": "EnumType", + "values": ["auto", "kd_tree", "brute", "ball_tree"] } }, { @@ -113806,7 +122122,8 @@ "docstring": { "type": "int or None, default=None", "description": "The number of parallel jobs to run.\n``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n``-1`` means using all processors. See :term:`Glossary `\nfor more details." - } + }, + "refined_type": {} }, { "name": "metric", @@ -113816,7 +122133,8 @@ "docstring": { "type": "str, or callable, default=\"minkowski\"", "description": "The metric to use when calculating distance between instances in a\nfeature array. If metric is a string or callable, it must be one of\nthe options allowed by :func:`sklearn.metrics.pairwise_distances` for\nits metric parameter.\nIf metric is \"precomputed\", X is assumed to be a distance matrix and\nmust be square. X may be a :term:`Glossary `.\n\n.. versionadded:: 0.22" - } + }, + "refined_type": {} }, { "name": "p", @@ -113826,7 +122144,8 @@ "docstring": { "type": "int, default=2", "description": "Parameter for the Minkowski metric from\nsklearn.metrics.pairwise.pairwise_distances. When p = 1, this is\nequivalent to using manhattan_distance (l1), and euclidean_distance\n(l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.\n\n.. versionadded:: 0.22" - } + }, + "refined_type": {} }, { "name": "metric_params", @@ -113836,13 +122155,14 @@ "docstring": { "type": "dict, default=None", "description": "Additional keyword arguments for the metric function.\n\n.. 
versionadded:: 0.22" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, *, n_neighbors=5, n_components=2, eigen_solver='auto', tol=0, max_iter=None, path_method='auto', neighbors_algorithm='auto', n_jobs=None, metric='minkowski', p=2, metric_params=None):\n self.n_neighbors = n_neighbors\n self.n_components = n_components\n self.eigen_solver = eigen_solver\n self.tol = tol\n self.max_iter = max_iter\n self.path_method = path_method\n self.neighbors_algorithm = neighbors_algorithm\n self.n_jobs = n_jobs\n self.metric = metric\n self.p = p\n self.metric_params = metric_params" }, { @@ -113860,7 +122180,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -113870,14 +122191,15 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", - "source_code": "\ndef _fit_transform(self, X):\n self.nbrs_ = NearestNeighbors(n_neighbors=self.n_neighbors, algorithm=self.neighbors_algorithm, metric=self.metric, p=self.p, metric_params=self.metric_params, n_jobs=self.n_jobs)\n self.nbrs_.fit(X)\n self.n_features_in_ = self.nbrs_.n_features_in_\n if hasattr(self.nbrs_, 'feature_names_in_'):\n self.feature_names_in_ = self.nbrs_.feature_names_in_\n self.kernel_pca_ = KernelPCA(n_components=self.n_components, kernel='precomputed', eigen_solver=self.eigen_solver, tol=self.tol, max_iter=self.max_iter, n_jobs=self.n_jobs)\n kng = kneighbors_graph(self.nbrs_, self.n_neighbors, metric=self.metric, p=self.p, metric_params=self.metric_params, mode='distance', n_jobs=self.n_jobs)\n (n_connected_components, labels) = connected_components(kng)\n if n_connected_components > 1:\n if self.metric == 'precomputed':\n raise RuntimeError(f\"The number of connected components of the neighbors graph is {n_connected_components} > 1. The graph cannot be completed with metric='precomputed', and Isomap cannot befitted. Increase the number of neighbors to avoid this issue.\")\n warnings.warn(f'The number of connected components of the neighbors graph is {n_connected_components} > 1. Completing the graph to fit Isomap might be slow. 
Increase the number of neighbors to avoid this issue.', stacklevel=2)\n kng = _fix_connected_components(X=self.nbrs_._fit_X, graph=kng, n_connected_components=n_connected_components, component_labels=labels, mode='distance', metric=self.nbrs_.effective_metric_, **self.nbrs_.effective_metric_params_)\n if parse_version(scipy.__version__) < parse_version('1.3.2'):\n kng.data += 1e-15\n self.dist_matrix_ = shortest_path(kng, method=self.path_method, directed=False)\n G = self.dist_matrix_**2\n G *= -0.5\n self.embedding_ = self.kernel_pca_.fit_transform(G)" + "docstring": null, + "source_code": "\ndef _fit_transform(self, X):\n self.nbrs_ = NearestNeighbors(n_neighbors=self.n_neighbors, algorithm=self.neighbors_algorithm, metric=self.metric, p=self.p, metric_params=self.metric_params, n_jobs=self.n_jobs)\n self.nbrs_.fit(X)\n self.n_features_in_ = self.nbrs_.n_features_in_\n if hasattr(self.nbrs_, 'feature_names_in_'):\n self.feature_names_in_ = self.nbrs_.feature_names_in_\n self.kernel_pca_ = KernelPCA(n_components=self.n_components, kernel='precomputed', eigen_solver=self.eigen_solver, tol=self.tol, max_iter=self.max_iter, n_jobs=self.n_jobs)\n kng = kneighbors_graph(self.nbrs_, self.n_neighbors, metric=self.metric, p=self.p, metric_params=self.metric_params, mode='distance', n_jobs=self.n_jobs)\n (n_connected_components, labels) = connected_components(kng)\n if n_connected_components > 1:\n if self.metric == 'precomputed' and issparse(X):\n raise RuntimeError(f\"The number of connected components of the neighbors graph is {n_connected_components} > 1. The graph cannot be completed with metric='precomputed', and Isomap cannot befitted. Increase the number of neighbors to avoid this issue, or precompute the full distance matrix instead of passing a sparse neighbors graph.\")\n warnings.warn(f'The number of connected components of the neighbors graph is {n_connected_components} > 1. Completing the graph to fit Isomap might be slow. Increase the number of neighbors to avoid this issue.', stacklevel=2)\n kng = _fix_connected_components(X=self.nbrs_._fit_X, graph=kng, n_connected_components=n_connected_components, component_labels=labels, mode='distance', metric=self.nbrs_.effective_metric_, **self.nbrs_.effective_metric_params_)\n if parse_version(scipy.__version__) < parse_version('1.3.2'):\n kng.data += 1e-15\n self.dist_matrix_ = shortest_path(kng, method=self.path_method, directed=False)\n G = self.dist_matrix_**2\n G *= -0.5\n self.embedding_ = self.kernel_pca_.fit_transform(G)" }, { "name": "fit", @@ -113894,7 +122216,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -113904,6 +122227,10 @@ "docstring": { "type": "{array-like, sparse graph, BallTree, KDTree, NearestNeighbors}", "description": "Sample data, shape = (n_samples, n_features), in the form of a\nnumpy array, sparse graph, precomputed tree, or NearestNeighbors\nobject." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -113914,13 +122241,14 @@ "docstring": { "type": "Ignored", "description": "Not used, present for API consistency by convention." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Compute the embedding vectors for data X.", - "docstring": "Compute the embedding vectors for data X.\n\nParameters\n----------\nX : {array-like, sparse graph, BallTree, KDTree, NearestNeighbors}\n Sample data, shape = (n_samples, n_features), in the form of a\n numpy array, sparse graph, precomputed tree, or NearestNeighbors\n object.\n\ny : Ignored\n Not used, present for API consistency by convention.\n\nReturns\n-------\nself : object\n Returns a fitted instance of self.", + "docstring": "Compute the embedding vectors for data X.\n\n Parameters\n ----------\n X : {array-like, sparse graph, BallTree, KDTree, NearestNeighbors}\n Sample data, shape = (n_samples, n_features), in the form of a\n numpy array, sparse graph, precomputed tree, or NearestNeighbors\n object.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n Returns\n -------\n self : object\n Returns a fitted instance of self.\n ", "source_code": "\ndef fit(self, X, y=None):\n \"\"\"Compute the embedding vectors for data X.\n\n Parameters\n ----------\n X : {array-like, sparse graph, BallTree, KDTree, NearestNeighbors}\n Sample data, shape = (n_samples, n_features), in the form of a\n numpy array, sparse graph, precomputed tree, or NearestNeighbors\n object.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n Returns\n -------\n self : object\n Returns a fitted instance of self.\n \"\"\"\n self._fit_transform(X)\n return self" }, { @@ -113938,7 +122266,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -113948,6 +122277,10 @@ "docstring": { "type": "{array-like, sparse graph, BallTree, KDTree}", "description": "Training vector, where `n_samples` is the number of samples\nand `n_features` is the number of features." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -113958,13 +122291,14 @@ "docstring": { "type": "Ignored", "description": "Not used, present for API consistency by convention." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Fit the model from data in X and transform X.", - "docstring": "Fit the model from data in X and transform X.\n\nParameters\n----------\nX : {array-like, sparse graph, BallTree, KDTree}\n Training vector, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\ny : Ignored\n Not used, present for API consistency by convention.\n\nReturns\n-------\nX_new : array-like, shape (n_samples, n_components)\n X transformed in the new space.", + "docstring": "Fit the model from data in X and transform X.\n\n Parameters\n ----------\n X : {array-like, sparse graph, BallTree, KDTree}\n Training vector, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n Returns\n -------\n X_new : array-like, shape (n_samples, n_components)\n X transformed in the new space.\n ", "source_code": "\ndef fit_transform(self, X, y=None):\n \"\"\"Fit the model from data in X and transform X.\n\n Parameters\n ----------\n X : {array-like, sparse graph, BallTree, KDTree}\n Training vector, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n Returns\n -------\n X_new : array-like, shape (n_samples, n_components)\n X transformed in the new space.\n \"\"\"\n self._fit_transform(X)\n return self.embedding_" }, { @@ -113982,13 +122316,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Compute the reconstruction error for the embedding.", - "docstring": "Compute the reconstruction error for the embedding.\n\nReturns\n-------\nreconstruction_error : float\n Reconstruction error.\n\nNotes\n-----\nThe cost function of an isomap embedding is\n\n``E = frobenius_norm[K(D) - K(D_fit)] / n_samples``\n\nWhere D is the matrix of distances for the input data X,\nD_fit is the matrix of distances for the output embedding X_fit,\nand K is the isomap kernel:\n\n``K(D) = -0.5 * (I - 1/n_samples) * D^2 * (I - 1/n_samples)``", + "docstring": "Compute the reconstruction error for the embedding.\n\n Returns\n -------\n reconstruction_error : float\n Reconstruction error.\n\n Notes\n -----\n The cost function of an isomap embedding is\n\n ``E = frobenius_norm[K(D) - K(D_fit)] / n_samples``\n\n Where D is the matrix of distances for the input data X,\n D_fit is the matrix of distances for the output embedding X_fit,\n and K is the isomap kernel:\n\n ``K(D) = -0.5 * (I - 1/n_samples) * D^2 * (I - 1/n_samples)``\n ", "source_code": "\ndef reconstruction_error(self):\n \"\"\"Compute the reconstruction error for the embedding.\n\n Returns\n -------\n reconstruction_error : float\n Reconstruction error.\n\n Notes\n -----\n The cost function of an isomap embedding is\n\n ``E = frobenius_norm[K(D) - K(D_fit)] / n_samples``\n\n Where D is the matrix of distances for the input data X,\n D_fit is the matrix of distances for the output embedding X_fit,\n and K is the isomap kernel:\n\n ``K(D) = -0.5 * (I - 1/n_samples) * D^2 * (I - 1/n_samples)``\n \"\"\"\n G = -0.5 * self.dist_matrix_**2\n G_center = KernelCenterer().fit_transform(G)\n evals = self.kernel_pca_.eigenvalues_\n return np.sqrt(np.sum(G_center**2) - np.sum(evals**2)) / G.shape[0]" }, { @@ -114006,7 +122341,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": 
{} }, { "name": "X", @@ -114016,13 +122352,14 @@ "docstring": { "type": "array-like, shape (n_queries, n_features)", "description": "If neighbors_algorithm='precomputed', X is assumed to be a\ndistance matrix or a sparse graph of shape\n(n_queries, n_samples_fit)." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Transform X.\n\nThis is implemented by linking the points X into the graph of geodesic distances of the training data. First the `n_neighbors` nearest neighbors of X are found in the training data, and from these the shortest geodesic distances from each point in X to each point in the training data are computed in order to construct the kernel. The embedding of X is the projection of this kernel onto the embedding vectors of the training set.", - "docstring": "Transform X.\n\nThis is implemented by linking the points X into the graph of geodesic\ndistances of the training data. First the `n_neighbors` nearest\nneighbors of X are found in the training data, and from these the\nshortest geodesic distances from each point in X to each point in\nthe training data are computed in order to construct the kernel.\nThe embedding of X is the projection of this kernel onto the\nembedding vectors of the training set.\n\nParameters\n----------\nX : array-like, shape (n_queries, n_features)\n If neighbors_algorithm='precomputed', X is assumed to be a\n distance matrix or a sparse graph of shape\n (n_queries, n_samples_fit).\n\nReturns\n-------\nX_new : array-like, shape (n_queries, n_components)\n X transformed in the new space.", + "description": "Transform X.\n\nThis is implemented by linking the points X into the graph of geodesic\ndistances of the training data. First the `n_neighbors` nearest\nneighbors of X are found in the training data, and from these the\nshortest geodesic distances from each point in X to each point in\nthe training data are computed in order to construct the kernel.\nThe embedding of X is the projection of this kernel onto the\nembedding vectors of the training set.", + "docstring": "Transform X.\n\n This is implemented by linking the points X into the graph of geodesic\n distances of the training data. First the `n_neighbors` nearest\n neighbors of X are found in the training data, and from these the\n shortest geodesic distances from each point in X to each point in\n the training data are computed in order to construct the kernel.\n The embedding of X is the projection of this kernel onto the\n embedding vectors of the training set.\n\n Parameters\n ----------\n X : array-like, shape (n_queries, n_features)\n If neighbors_algorithm='precomputed', X is assumed to be a\n distance matrix or a sparse graph of shape\n (n_queries, n_samples_fit).\n\n Returns\n -------\n X_new : array-like, shape (n_queries, n_components)\n X transformed in the new space.\n ", "source_code": "\ndef transform(self, X):\n \"\"\"Transform X.\n\n This is implemented by linking the points X into the graph of geodesic\n distances of the training data. 
First the `n_neighbors` nearest\n neighbors of X are found in the training data, and from these the\n shortest geodesic distances from each point in X to each point in\n the training data are computed in order to construct the kernel.\n The embedding of X is the projection of this kernel onto the\n embedding vectors of the training set.\n\n Parameters\n ----------\n X : array-like, shape (n_queries, n_features)\n If neighbors_algorithm='precomputed', X is assumed to be a\n distance matrix or a sparse graph of shape\n (n_queries, n_samples_fit).\n\n Returns\n -------\n X_new : array-like, shape (n_queries, n_components)\n X transformed in the new space.\n \"\"\"\n check_is_fitted(self)\n (distances, indices) = self.nbrs_.kneighbors(X, return_distance=True)\n n_samples_fit = self.nbrs_.n_samples_fit_\n n_queries = distances.shape[0]\n G_X = np.zeros((n_queries, n_samples_fit))\n for i in range(n_queries):\n G_X[i] = np.min(self.dist_matrix_[indices[i]] + distances[i][:, None], 0)\n G_X **= 2\n G_X *= -0.5\n return self.kernel_pca_.transform(G_X)" }, { @@ -114040,7 +122377,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_neighbors", @@ -114050,7 +122388,8 @@ "docstring": { "type": "int, default=5", "description": "Number of neighbors to consider for each point." - } + }, + "refined_type": {} }, { "name": "n_components", @@ -114060,7 +122399,8 @@ "docstring": { "type": "int, default=2", "description": "Number of coordinates for the manifold." - } + }, + "refined_type": {} }, { "name": "reg", @@ -114070,7 +122410,8 @@ "docstring": { "type": "float, default=1e-3", "description": "Regularization constant, multiplies the trace of the local covariance\nmatrix of the distances." - } + }, + "refined_type": {} }, { "name": "eigen_solver", @@ -114080,6 +122421,10 @@ "docstring": { "type": "{'auto', 'arpack', 'dense'}, default='auto'", "description": "The solver used to compute the eigenvectors. The available options are:\n\n- `'auto'` : algorithm will attempt to choose the best method for input\n data.\n- `'arpack'` : use arnoldi iteration in shift-invert mode. For this\n method, M may be a dense matrix, sparse matrix, or general linear\n operator.\n- `'dense'` : use standard dense matrix operations for the eigenvalue\n decomposition. For this method, M must be an array or matrix type.\n This method should be avoided for large problems.\n\n.. warning::\n ARPACK can be unstable for some problems. It is best to try several\n random seeds in order to check results." + }, + "refined_type": { + "kind": "EnumType", + "values": ["auto", "dense", "arpack"] } }, { @@ -114090,7 +122435,8 @@ "docstring": { "type": "float, default=1e-6", "description": "Tolerance for 'arpack' method\nNot used if eigen_solver=='dense'." - } + }, + "refined_type": {} }, { "name": "max_iter", @@ -114100,7 +122446,8 @@ "docstring": { "type": "int, default=100", "description": "Maximum number of iterations for the arpack solver.\nNot used if eigen_solver=='dense'." - } + }, + "refined_type": {} }, { "name": "method", @@ -114110,6 +122457,10 @@ "docstring": { "type": "{'standard', 'hessian', 'modified', 'ltsa'}, default='standard'", "description": "- `standard`: use the standard locally linear embedding algorithm. see\n reference [1]_\n- `hessian`: use the Hessian eigenmap method. This method requires\n ``n_neighbors > n_components * (1 + (n_components + 1) / 2``. 
see\n reference [2]_\n- `modified`: use the modified locally linear embedding algorithm.\n see reference [3]_\n- `ltsa`: use local tangent space alignment algorithm. see\n reference [4]_" + }, + "refined_type": { + "kind": "EnumType", + "values": ["hessian", "ltsa", "standard", "modified"] } }, { @@ -114120,7 +122471,8 @@ "docstring": { "type": "float, default=1e-4", "description": "Tolerance for Hessian eigenmapping method.\nOnly used if ``method == 'hessian'``." - } + }, + "refined_type": {} }, { "name": "modified_tol", @@ -114130,7 +122482,8 @@ "docstring": { "type": "float, default=1e-12", "description": "Tolerance for modified LLE method.\nOnly used if ``method == 'modified'``." - } + }, + "refined_type": {} }, { "name": "neighbors_algorithm", @@ -114140,6 +122493,10 @@ "docstring": { "type": "{'auto', 'brute', 'kd_tree', 'ball_tree'}, default='auto'", "description": "Algorithm to use for nearest neighbors search, passed to\n:class:`~sklearn.neighbors.NearestNeighbors` instance." + }, + "refined_type": { + "kind": "EnumType", + "values": ["auto", "kd_tree", "brute", "ball_tree"] } }, { @@ -114150,7 +122507,8 @@ "docstring": { "type": "int, RandomState instance, default=None", "description": "Determines the random number generator when\n``eigen_solver`` == 'arpack'. Pass an int for reproducible results\nacross multiple function calls. See :term:`Glossary `." - } + }, + "refined_type": {} }, { "name": "n_jobs", @@ -114160,13 +122518,14 @@ "docstring": { "type": "int or None, default=None", "description": "The number of parallel jobs to run.\n``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n``-1`` means using all processors. See :term:`Glossary `\nfor more details." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, *, n_neighbors=5, n_components=2, reg=0.001, eigen_solver='auto', tol=1e-06, max_iter=100, method='standard', hessian_tol=0.0001, modified_tol=1e-12, neighbors_algorithm='auto', random_state=None, n_jobs=None):\n self.n_neighbors = n_neighbors\n self.n_components = n_components\n self.reg = reg\n self.eigen_solver = eigen_solver\n self.tol = tol\n self.max_iter = max_iter\n self.method = method\n self.hessian_tol = hessian_tol\n self.modified_tol = modified_tol\n self.random_state = random_state\n self.neighbors_algorithm = neighbors_algorithm\n self.n_jobs = n_jobs" }, { @@ -114184,7 +122543,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -114194,13 +122554,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _fit_transform(self, X):\n self.nbrs_ = NearestNeighbors(n_neighbors=self.n_neighbors, algorithm=self.neighbors_algorithm, n_jobs=self.n_jobs)\n random_state = check_random_state(self.random_state)\n X = self._validate_data(X, dtype=float)\n self.nbrs_.fit(X)\n (self.embedding_, self.reconstruction_error_) = locally_linear_embedding(X=self.nbrs_, n_neighbors=self.n_neighbors, n_components=self.n_components, eigen_solver=self.eigen_solver, tol=self.tol, max_iter=self.max_iter, method=self.method, hessian_tol=self.hessian_tol, modified_tol=self.modified_tol, random_state=random_state, reg=self.reg, n_jobs=self.n_jobs)" }, { @@ -114218,7 +122579,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", 
@@ -114228,7 +122590,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "Training set." - } + }, + "refined_type": {} }, { "name": "y", @@ -114238,13 +122601,14 @@ "docstring": { "type": "Ignored", "description": "Not used, present here for API consistency by convention." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Compute the embedding vectors for data X.", - "docstring": "Compute the embedding vectors for data X.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n Training set.\n\ny : Ignored\n Not used, present here for API consistency by convention.\n\nReturns\n-------\nself : object\n Fitted `LocallyLinearEmbedding` class instance.", + "docstring": "Compute the embedding vectors for data X.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training set.\n\n y : Ignored\n Not used, present here for API consistency by convention.\n\n Returns\n -------\n self : object\n Fitted `LocallyLinearEmbedding` class instance.\n ", "source_code": "\ndef fit(self, X, y=None):\n \"\"\"Compute the embedding vectors for data X.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training set.\n\n y : Ignored\n Not used, present here for API consistency by convention.\n\n Returns\n -------\n self : object\n Fitted `LocallyLinearEmbedding` class instance.\n \"\"\"\n self._fit_transform(X)\n return self" }, { @@ -114262,7 +122626,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -114272,7 +122637,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "Training set." - } + }, + "refined_type": {} }, { "name": "y", @@ -114282,13 +122648,14 @@ "docstring": { "type": "Ignored", "description": "Not used, present here for API consistency by convention." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Compute the embedding vectors for data X and transform X.", - "docstring": "Compute the embedding vectors for data X and transform X.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n Training set.\n\ny : Ignored\n Not used, present here for API consistency by convention.\n\nReturns\n-------\nX_new : array-like, shape (n_samples, n_components)\n Returns the instance itself.", + "docstring": "Compute the embedding vectors for data X and transform X.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training set.\n\n y : Ignored\n Not used, present here for API consistency by convention.\n\n Returns\n -------\n X_new : array-like, shape (n_samples, n_components)\n Returns the instance itself.\n ", "source_code": "\ndef fit_transform(self, X, y=None):\n \"\"\"Compute the embedding vectors for data X and transform X.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training set.\n\n y : Ignored\n Not used, present here for API consistency by convention.\n\n Returns\n -------\n X_new : array-like, shape (n_samples, n_components)\n Returns the instance itself.\n \"\"\"\n self._fit_transform(X)\n return self.embedding_" }, { @@ -114306,7 +122673,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -114316,13 +122684,14 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "Training set." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Transform new points into embedding space.", - "docstring": "Transform new points into embedding space.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n Training set.\n\nReturns\n-------\nX_new : ndarray of shape (n_samples, n_components)\n Returns the instance itself.\n\nNotes\n-----\nBecause of scaling performed by this method, it is discouraged to use\nit together with methods that are not scale-invariant (like SVMs).", + "docstring": "\n Transform new points into embedding space.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training set.\n\n Returns\n -------\n X_new : ndarray of shape (n_samples, n_components)\n Returns the instance itself.\n\n Notes\n -----\n Because of scaling performed by this method, it is discouraged to use\n it together with methods that are not scale-invariant (like SVMs).\n ", "source_code": "\ndef transform(self, X):\n \"\"\"\n Transform new points into embedding space.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training set.\n\n Returns\n -------\n X_new : ndarray of shape (n_samples, n_components)\n Returns the instance itself.\n\n Notes\n -----\n Because of scaling performed by this method, it is discouraged to use\n it together with methods that are not scale-invariant (like SVMs).\n \"\"\"\n check_is_fitted(self)\n X = self._validate_data(X, reset=False)\n ind = self.nbrs_.kneighbors(X, n_neighbors=self.n_neighbors, return_distance=False)\n weights = barycenter_weights(X, self.nbrs_._fit_X, ind, reg=self.reg)\n X_new = np.empty((X.shape[0], self.n_components))\n for i in range(X.shape[0]):\n X_new[i] = np.dot(self.embedding_[ind[i]].T, weights[i])\n return X_new" }, { @@ -114340,6 +122709,10 @@ "docstring": { "type": "{array-like, NearestNeighbors}", "description": "Sample data, shape = (n_samples, n_features), in the form of a\nnumpy array or a NearestNeighbors object." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -114350,7 +122723,8 @@ "docstring": { "type": "int", "description": "Number of neighbors for each sample." - } + }, + "refined_type": {} }, { "name": "reg", @@ -114360,7 +122734,8 @@ "docstring": { "type": "float, default=1e-3", "description": "Amount of regularization when solving the least-squares\nproblem. Only relevant if mode='barycenter'. If None, use the\ndefault." - } + }, + "refined_type": {} }, { "name": "n_jobs", @@ -114370,13 +122745,14 @@ "docstring": { "type": "int or None, default=None", "description": "The number of parallel jobs to run for neighbors search.\n``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n``-1`` means using all processors. See :term:`Glossary `\nfor more details." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Computes the barycenter weighted graph of k-Neighbors for points in X", - "docstring": "Computes the barycenter weighted graph of k-Neighbors for points in X\n\nParameters\n----------\nX : {array-like, NearestNeighbors}\n Sample data, shape = (n_samples, n_features), in the form of a\n numpy array or a NearestNeighbors object.\n\nn_neighbors : int\n Number of neighbors for each sample.\n\nreg : float, default=1e-3\n Amount of regularization when solving the least-squares\n problem. Only relevant if mode='barycenter'. 
If None, use the\n default.\n\nn_jobs : int or None, default=None\n The number of parallel jobs to run for neighbors search.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\nReturns\n-------\nA : sparse matrix in CSR format, shape = [n_samples, n_samples]\n A[i, j] is assigned the weight of edge that connects i to j.\n\nSee Also\n--------\nsklearn.neighbors.kneighbors_graph\nsklearn.neighbors.radius_neighbors_graph", + "docstring": "Computes the barycenter weighted graph of k-Neighbors for points in X\n\n Parameters\n ----------\n X : {array-like, NearestNeighbors}\n Sample data, shape = (n_samples, n_features), in the form of a\n numpy array or a NearestNeighbors object.\n\n n_neighbors : int\n Number of neighbors for each sample.\n\n reg : float, default=1e-3\n Amount of regularization when solving the least-squares\n problem. Only relevant if mode='barycenter'. If None, use the\n default.\n\n n_jobs : int or None, default=None\n The number of parallel jobs to run for neighbors search.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n Returns\n -------\n A : sparse matrix in CSR format, shape = [n_samples, n_samples]\n A[i, j] is assigned the weight of edge that connects i to j.\n\n See Also\n --------\n sklearn.neighbors.kneighbors_graph\n sklearn.neighbors.radius_neighbors_graph\n ", "source_code": "\ndef barycenter_kneighbors_graph(X, n_neighbors, reg=0.001, n_jobs=None):\n \"\"\"Computes the barycenter weighted graph of k-Neighbors for points in X\n\n Parameters\n ----------\n X : {array-like, NearestNeighbors}\n Sample data, shape = (n_samples, n_features), in the form of a\n numpy array or a NearestNeighbors object.\n\n n_neighbors : int\n Number of neighbors for each sample.\n\n reg : float, default=1e-3\n Amount of regularization when solving the least-squares\n problem. Only relevant if mode='barycenter'. If None, use the\n default.\n\n n_jobs : int or None, default=None\n The number of parallel jobs to run for neighbors search.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. 
See :term:`Glossary `\n for more details.\n\n Returns\n -------\n A : sparse matrix in CSR format, shape = [n_samples, n_samples]\n A[i, j] is assigned the weight of edge that connects i to j.\n\n See Also\n --------\n sklearn.neighbors.kneighbors_graph\n sklearn.neighbors.radius_neighbors_graph\n \"\"\"\n knn = NearestNeighbors(n_neighbors=n_neighbors + 1, n_jobs=n_jobs).fit(X)\n X = knn._fit_X\n n_samples = knn.n_samples_fit_\n ind = knn.kneighbors(X, return_distance=False)[:, 1:]\n data = barycenter_weights(X, X, ind, reg=reg)\n indptr = np.arange(0, n_samples * n_neighbors + 1, n_neighbors)\n return csr_matrix((data.ravel(), ind.ravel(), indptr), shape=(n_samples, n_samples))" }, { @@ -114394,7 +122770,8 @@ "docstring": { "type": "array-like, shape (n_samples, n_dim)", "description": "" - } + }, + "refined_type": {} }, { "name": "Y", @@ -114404,7 +122781,8 @@ "docstring": { "type": "array-like, shape (n_samples, n_dim)", "description": "" - } + }, + "refined_type": {} }, { "name": "indices", @@ -114414,7 +122792,8 @@ "docstring": { "type": "array-like, shape (n_samples, n_dim)", "description": "Indices of the points in Y used to compute the barycenter" - } + }, + "refined_type": {} }, { "name": "reg", @@ -114424,13 +122803,14 @@ "docstring": { "type": "float, default=1e-3", "description": "amount of regularization to add for the problem to be\nwell-posed in the case of n_neighbors > n_dim" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Compute barycenter weights of X from Y along the first axis\n\nWe estimate the weights to assign to each point in Y[indices] to recover the point X[i]. The barycenter weights sum to 1.", - "docstring": "Compute barycenter weights of X from Y along the first axis\n\nWe estimate the weights to assign to each point in Y[indices] to recover\nthe point X[i]. The barycenter weights sum to 1.\n\nParameters\n----------\nX : array-like, shape (n_samples, n_dim)\n\nY : array-like, shape (n_samples, n_dim)\n\nindices : array-like, shape (n_samples, n_dim)\n Indices of the points in Y used to compute the barycenter\n\nreg : float, default=1e-3\n amount of regularization to add for the problem to be\n well-posed in the case of n_neighbors > n_dim\n\nReturns\n-------\nB : array-like, shape (n_samples, n_neighbors)\n\nNotes\n-----\nSee developers note for more information.", + "description": "Compute barycenter weights of X from Y along the first axis\n\nWe estimate the weights to assign to each point in Y[indices] to recover\nthe point X[i]. The barycenter weights sum to 1.", + "docstring": "Compute barycenter weights of X from Y along the first axis\n\n We estimate the weights to assign to each point in Y[indices] to recover\n the point X[i]. The barycenter weights sum to 1.\n\n Parameters\n ----------\n X : array-like, shape (n_samples, n_dim)\n\n Y : array-like, shape (n_samples, n_dim)\n\n indices : array-like, shape (n_samples, n_dim)\n Indices of the points in Y used to compute the barycenter\n\n reg : float, default=1e-3\n amount of regularization to add for the problem to be\n well-posed in the case of n_neighbors > n_dim\n\n Returns\n -------\n B : array-like, shape (n_samples, n_neighbors)\n\n Notes\n -----\n See developers note for more information.\n ", "source_code": "\ndef barycenter_weights(X, Y, indices, reg=0.001):\n \"\"\"Compute barycenter weights of X from Y along the first axis\n\n We estimate the weights to assign to each point in Y[indices] to recover\n the point X[i]. 
The barycenter weights sum to 1.\n\n Parameters\n ----------\n X : array-like, shape (n_samples, n_dim)\n\n Y : array-like, shape (n_samples, n_dim)\n\n indices : array-like, shape (n_samples, n_dim)\n Indices of the points in Y used to compute the barycenter\n\n reg : float, default=1e-3\n amount of regularization to add for the problem to be\n well-posed in the case of n_neighbors > n_dim\n\n Returns\n -------\n B : array-like, shape (n_samples, n_neighbors)\n\n Notes\n -----\n See developers note for more information.\n \"\"\"\n X = check_array(X, dtype=FLOAT_DTYPES)\n Y = check_array(Y, dtype=FLOAT_DTYPES)\n indices = check_array(indices, dtype=int)\n (n_samples, n_neighbors) = indices.shape\n assert X.shape[0] == n_samples\n B = np.empty((n_samples, n_neighbors), dtype=X.dtype)\n v = np.ones(n_neighbors, dtype=X.dtype)\n for (i, ind) in enumerate(indices):\n A = Y[ind]\n C = A - X[i]\n G = np.dot(C, C.T)\n trace = np.trace(G)\n if trace > 0:\n R = reg * trace\n else:\n R = reg\n G.flat[::n_neighbors + 1] += R\n w = solve(G, v, sym_pos=True)\n B[i, :] = w / np.sum(w)\n return B" }, { @@ -114448,6 +122828,10 @@ "docstring": { "type": "{array-like, NearestNeighbors}", "description": "Sample data, shape = (n_samples, n_features), in the form of a\nnumpy array or a NearestNeighbors object." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -114458,7 +122842,8 @@ "docstring": { "type": "int", "description": "number of neighbors to consider for each point." - } + }, + "refined_type": {} }, { "name": "n_components", @@ -114468,7 +122853,8 @@ "docstring": { "type": "int", "description": "number of coordinates for the manifold." - } + }, + "refined_type": {} }, { "name": "reg", @@ -114478,7 +122864,8 @@ "docstring": { "type": "float, default=1e-3", "description": "regularization constant, multiplies the trace of the local covariance\nmatrix of the distances." - } + }, + "refined_type": {} }, { "name": "eigen_solver", @@ -114488,6 +122875,10 @@ "docstring": { "type": "{'auto', 'arpack', 'dense'}, default='auto'", "description": "auto : algorithm will attempt to choose the best method for input data\n\narpack : use arnoldi iteration in shift-invert mode.\n For this method, M may be a dense matrix, sparse matrix,\n or general linear operator.\n Warning: ARPACK can be unstable for some problems. It is\n best to try several random seeds in order to check results.\n\ndense : use standard dense matrix operations for the eigenvalue\n decomposition. For this method, M must be an array\n or matrix type. This method should be avoided for\n large problems." + }, + "refined_type": { + "kind": "EnumType", + "values": ["auto", "dense", "arpack"] } }, { @@ -114498,7 +122889,8 @@ "docstring": { "type": "float, default=1e-6", "description": "Tolerance for 'arpack' method\nNot used if eigen_solver=='dense'." - } + }, + "refined_type": {} }, { "name": "max_iter", @@ -114508,7 +122900,8 @@ "docstring": { "type": "int, default=100", "description": "maximum number of iterations for the arpack solver." - } + }, + "refined_type": {} }, { "name": "method", @@ -114518,6 +122911,10 @@ "docstring": { "type": "{'standard', 'hessian', 'modified', 'ltsa'}, default='standard'", "description": "standard : use the standard locally linear embedding algorithm.\n see reference [1]_\nhessian : use the Hessian eigenmap method. 
This method requires\n n_neighbors > n_components * (1 + (n_components + 1) / 2.\n see reference [2]_\nmodified : use the modified locally linear embedding algorithm.\n see reference [3]_\nltsa : use local tangent space alignment algorithm\n see reference [4]_" + }, + "refined_type": { + "kind": "EnumType", + "values": ["hessian", "ltsa", "standard", "modified"] } }, { @@ -114528,7 +122925,8 @@ "docstring": { "type": "float, default=1e-4", "description": "Tolerance for Hessian eigenmapping method.\nOnly used if method == 'hessian'" - } + }, + "refined_type": {} }, { "name": "modified_tol", @@ -114538,7 +122936,8 @@ "docstring": { "type": "float, default=1e-12", "description": "Tolerance for modified LLE method.\nOnly used if method == 'modified'" - } + }, + "refined_type": {} }, { "name": "random_state", @@ -114548,7 +122947,8 @@ "docstring": { "type": "int, RandomState instance, default=None", "description": "Determines the random number generator when ``solver`` == 'arpack'.\nPass an int for reproducible results across multiple function calls.\nSee :term:`Glossary `." - } + }, + "refined_type": {} }, { "name": "n_jobs", @@ -114558,13 +122958,14 @@ "docstring": { "type": "int or None, default=None", "description": "The number of parallel jobs to run for neighbors search.\n``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n``-1`` means using all processors. See :term:`Glossary `\nfor more details." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Perform a Locally Linear Embedding analysis on the data.\n\nRead more in the :ref:`User Guide `.", - "docstring": "Perform a Locally Linear Embedding analysis on the data.\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\nX : {array-like, NearestNeighbors}\n Sample data, shape = (n_samples, n_features), in the form of a\n numpy array or a NearestNeighbors object.\n\nn_neighbors : int\n number of neighbors to consider for each point.\n\nn_components : int\n number of coordinates for the manifold.\n\nreg : float, default=1e-3\n regularization constant, multiplies the trace of the local covariance\n matrix of the distances.\n\neigen_solver : {'auto', 'arpack', 'dense'}, default='auto'\n auto : algorithm will attempt to choose the best method for input data\n\n arpack : use arnoldi iteration in shift-invert mode.\n For this method, M may be a dense matrix, sparse matrix,\n or general linear operator.\n Warning: ARPACK can be unstable for some problems. It is\n best to try several random seeds in order to check results.\n\n dense : use standard dense matrix operations for the eigenvalue\n decomposition. For this method, M must be an array\n or matrix type. This method should be avoided for\n large problems.\n\ntol : float, default=1e-6\n Tolerance for 'arpack' method\n Not used if eigen_solver=='dense'.\n\nmax_iter : int, default=100\n maximum number of iterations for the arpack solver.\n\nmethod : {'standard', 'hessian', 'modified', 'ltsa'}, default='standard'\n standard : use the standard locally linear embedding algorithm.\n see reference [1]_\n hessian : use the Hessian eigenmap method. 
This method requires\n n_neighbors > n_components * (1 + (n_components + 1) / 2.\n see reference [2]_\n modified : use the modified locally linear embedding algorithm.\n see reference [3]_\n ltsa : use local tangent space alignment algorithm\n see reference [4]_\n\nhessian_tol : float, default=1e-4\n Tolerance for Hessian eigenmapping method.\n Only used if method == 'hessian'\n\nmodified_tol : float, default=1e-12\n Tolerance for modified LLE method.\n Only used if method == 'modified'\n\nrandom_state : int, RandomState instance, default=None\n Determines the random number generator when ``solver`` == 'arpack'.\n Pass an int for reproducible results across multiple function calls.\n See :term:`Glossary `.\n\nn_jobs : int or None, default=None\n The number of parallel jobs to run for neighbors search.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\nReturns\n-------\nY : array-like, shape [n_samples, n_components]\n Embedding vectors.\n\nsquared_error : float\n Reconstruction error for the embedding vectors. Equivalent to\n ``norm(Y - W Y, 'fro')**2``, where W are the reconstruction weights.\n\nReferences\n----------\n\n.. [1] Roweis, S. & Saul, L. Nonlinear dimensionality reduction\n by locally linear embedding. Science 290:2323 (2000).\n.. [2] Donoho, D. & Grimes, C. Hessian eigenmaps: Locally\n linear embedding techniques for high-dimensional data.\n Proc Natl Acad Sci U S A. 100:5591 (2003).\n.. [3] Zhang, Z. & Wang, J. MLLE: Modified Locally Linear\n Embedding Using Multiple Weights.\n http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.70.382\n.. [4] Zhang, Z. & Zha, H. Principal manifolds and nonlinear\n dimensionality reduction via tangent space alignment.\n Journal of Shanghai Univ. 8:406 (2004)", + "docstring": "Perform a Locally Linear Embedding analysis on the data.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n X : {array-like, NearestNeighbors}\n Sample data, shape = (n_samples, n_features), in the form of a\n numpy array or a NearestNeighbors object.\n\n n_neighbors : int\n number of neighbors to consider for each point.\n\n n_components : int\n number of coordinates for the manifold.\n\n reg : float, default=1e-3\n regularization constant, multiplies the trace of the local covariance\n matrix of the distances.\n\n eigen_solver : {'auto', 'arpack', 'dense'}, default='auto'\n auto : algorithm will attempt to choose the best method for input data\n\n arpack : use arnoldi iteration in shift-invert mode.\n For this method, M may be a dense matrix, sparse matrix,\n or general linear operator.\n Warning: ARPACK can be unstable for some problems. It is\n best to try several random seeds in order to check results.\n\n dense : use standard dense matrix operations for the eigenvalue\n decomposition. For this method, M must be an array\n or matrix type. This method should be avoided for\n large problems.\n\n tol : float, default=1e-6\n Tolerance for 'arpack' method\n Not used if eigen_solver=='dense'.\n\n max_iter : int, default=100\n maximum number of iterations for the arpack solver.\n\n method : {'standard', 'hessian', 'modified', 'ltsa'}, default='standard'\n standard : use the standard locally linear embedding algorithm.\n see reference [1]_\n hessian : use the Hessian eigenmap method. 
This method requires\n n_neighbors > n_components * (1 + (n_components + 1) / 2.\n see reference [2]_\n modified : use the modified locally linear embedding algorithm.\n see reference [3]_\n ltsa : use local tangent space alignment algorithm\n see reference [4]_\n\n hessian_tol : float, default=1e-4\n Tolerance for Hessian eigenmapping method.\n Only used if method == 'hessian'\n\n modified_tol : float, default=1e-12\n Tolerance for modified LLE method.\n Only used if method == 'modified'\n\n random_state : int, RandomState instance, default=None\n Determines the random number generator when ``solver`` == 'arpack'.\n Pass an int for reproducible results across multiple function calls.\n See :term:`Glossary `.\n\n n_jobs : int or None, default=None\n The number of parallel jobs to run for neighbors search.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n Returns\n -------\n Y : array-like, shape [n_samples, n_components]\n Embedding vectors.\n\n squared_error : float\n Reconstruction error for the embedding vectors. Equivalent to\n ``norm(Y - W Y, 'fro')**2``, where W are the reconstruction weights.\n\n References\n ----------\n\n .. [1] Roweis, S. & Saul, L. Nonlinear dimensionality reduction\n by locally linear embedding. Science 290:2323 (2000).\n .. [2] Donoho, D. & Grimes, C. Hessian eigenmaps: Locally\n linear embedding techniques for high-dimensional data.\n Proc Natl Acad Sci U S A. 100:5591 (2003).\n .. [3] Zhang, Z. & Wang, J. MLLE: Modified Locally Linear\n Embedding Using Multiple Weights.\n http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.70.382\n .. [4] Zhang, Z. & Zha, H. Principal manifolds and nonlinear\n dimensionality reduction via tangent space alignment.\n Journal of Shanghai Univ. 8:406 (2004)\n ", "source_code": "\ndef locally_linear_embedding(X, *, n_neighbors, n_components, reg=0.001, eigen_solver='auto', tol=1e-06, max_iter=100, method='standard', hessian_tol=0.0001, modified_tol=1e-12, random_state=None, n_jobs=None):\n \"\"\"Perform a Locally Linear Embedding analysis on the data.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n X : {array-like, NearestNeighbors}\n Sample data, shape = (n_samples, n_features), in the form of a\n numpy array or a NearestNeighbors object.\n\n n_neighbors : int\n number of neighbors to consider for each point.\n\n n_components : int\n number of coordinates for the manifold.\n\n reg : float, default=1e-3\n regularization constant, multiplies the trace of the local covariance\n matrix of the distances.\n\n eigen_solver : {'auto', 'arpack', 'dense'}, default='auto'\n auto : algorithm will attempt to choose the best method for input data\n\n arpack : use arnoldi iteration in shift-invert mode.\n For this method, M may be a dense matrix, sparse matrix,\n or general linear operator.\n Warning: ARPACK can be unstable for some problems. It is\n best to try several random seeds in order to check results.\n\n dense : use standard dense matrix operations for the eigenvalue\n decomposition. For this method, M must be an array\n or matrix type. 
This method should be avoided for\n large problems.\n\n tol : float, default=1e-6\n Tolerance for 'arpack' method\n Not used if eigen_solver=='dense'.\n\n max_iter : int, default=100\n maximum number of iterations for the arpack solver.\n\n method : {'standard', 'hessian', 'modified', 'ltsa'}, default='standard'\n standard : use the standard locally linear embedding algorithm.\n see reference [1]_\n hessian : use the Hessian eigenmap method. This method requires\n n_neighbors > n_components * (1 + (n_components + 1) / 2.\n see reference [2]_\n modified : use the modified locally linear embedding algorithm.\n see reference [3]_\n ltsa : use local tangent space alignment algorithm\n see reference [4]_\n\n hessian_tol : float, default=1e-4\n Tolerance for Hessian eigenmapping method.\n Only used if method == 'hessian'\n\n modified_tol : float, default=1e-12\n Tolerance for modified LLE method.\n Only used if method == 'modified'\n\n random_state : int, RandomState instance, default=None\n Determines the random number generator when ``solver`` == 'arpack'.\n Pass an int for reproducible results across multiple function calls.\n See :term:`Glossary `.\n\n n_jobs : int or None, default=None\n The number of parallel jobs to run for neighbors search.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n Returns\n -------\n Y : array-like, shape [n_samples, n_components]\n Embedding vectors.\n\n squared_error : float\n Reconstruction error for the embedding vectors. Equivalent to\n ``norm(Y - W Y, 'fro')**2``, where W are the reconstruction weights.\n\n References\n ----------\n\n .. [1] Roweis, S. & Saul, L. Nonlinear dimensionality reduction\n by locally linear embedding. Science 290:2323 (2000).\n .. [2] Donoho, D. & Grimes, C. Hessian eigenmaps: Locally\n linear embedding techniques for high-dimensional data.\n Proc Natl Acad Sci U S A. 100:5591 (2003).\n .. [3] Zhang, Z. & Wang, J. MLLE: Modified Locally Linear\n Embedding Using Multiple Weights.\n http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.70.382\n .. [4] Zhang, Z. & Zha, H. Principal manifolds and nonlinear\n dimensionality reduction via tangent space alignment.\n Journal of Shanghai Univ. 
8:406 (2004)\n \"\"\"\n if eigen_solver not in ('auto', 'arpack', 'dense'):\n raise ValueError(\"unrecognized eigen_solver '%s'\" % eigen_solver)\n if method not in ('standard', 'hessian', 'modified', 'ltsa'):\n raise ValueError(\"unrecognized method '%s'\" % method)\n nbrs = NearestNeighbors(n_neighbors=n_neighbors + 1, n_jobs=n_jobs)\n nbrs.fit(X)\n X = nbrs._fit_X\n (N, d_in) = X.shape\n if n_components > d_in:\n raise ValueError('output dimension must be less than or equal to input dimension')\n if n_neighbors >= N:\n raise ValueError('Expected n_neighbors <= n_samples, but n_samples = %d, n_neighbors = %d' % (N, n_neighbors))\n if n_neighbors <= 0:\n raise ValueError('n_neighbors must be positive')\n M_sparse = eigen_solver != 'dense'\n if method == 'standard':\n W = barycenter_kneighbors_graph(nbrs, n_neighbors=n_neighbors, reg=reg, n_jobs=n_jobs)\n if M_sparse:\n M = eye(*W.shape, format=W.format) - W\n M = (M.T * M).tocsr()\n else:\n M = (W.T * W - W.T - W).toarray()\n M.flat[::M.shape[0] + 1] += 1\n elif method == 'hessian':\n dp = n_components * (n_components + 1) // 2\n if n_neighbors <= n_components + dp:\n raise ValueError(\"for method='hessian', n_neighbors must be greater than [n_components * (n_components + 3) / 2]\")\n neighbors = nbrs.kneighbors(X, n_neighbors=n_neighbors + 1, return_distance=False)\n neighbors = neighbors[:, 1:]\n Yi = np.empty((n_neighbors, 1 + n_components + dp), dtype=np.float64)\n Yi[:, 0] = 1\n M = np.zeros((N, N), dtype=np.float64)\n use_svd = n_neighbors > d_in\n for i in range(N):\n Gi = X[neighbors[i]]\n Gi -= Gi.mean(0)\n if use_svd:\n U = svd(Gi, full_matrices=0)[0]\n else:\n Ci = np.dot(Gi, Gi.T)\n U = eigh(Ci)[1][:, ::-1]\n Yi[:, 1:1 + n_components] = U[:, :n_components]\n j = 1 + n_components\n for k in range(n_components):\n Yi[:, j:j + n_components - k] = U[:, k:k + 1] * U[:, k:n_components]\n j += n_components - k\n (Q, R) = qr(Yi)\n w = Q[:, n_components + 1:]\n S = w.sum(0)\n S[np.where(abs(S) < hessian_tol)] = 1\n w /= S\n (nbrs_x, nbrs_y) = np.meshgrid(neighbors[i], neighbors[i])\n M[nbrs_x, nbrs_y] += np.dot(w, w.T)\n if M_sparse:\n M = csr_matrix(M)\n elif method == 'modified':\n if n_neighbors < n_components:\n raise ValueError('modified LLE requires n_neighbors >= n_components')\n neighbors = nbrs.kneighbors(X, n_neighbors=n_neighbors + 1, return_distance=False)\n neighbors = neighbors[:, 1:]\n V = np.zeros((N, n_neighbors, n_neighbors))\n nev = min(d_in, n_neighbors)\n evals = np.zeros([N, nev])\n use_svd = n_neighbors > d_in\n if use_svd:\n for i in range(N):\n X_nbrs = X[neighbors[i]] - X[i]\n (V[i], evals[i], _) = svd(X_nbrs, full_matrices=True)\n evals **= 2\n else:\n for i in range(N):\n X_nbrs = X[neighbors[i]] - X[i]\n C_nbrs = np.dot(X_nbrs, X_nbrs.T)\n (evi, vi) = eigh(C_nbrs)\n evals[i] = evi[::-1]\n V[i] = vi[:, ::-1]\n reg = 0.001 * evals.sum(1)\n tmp = np.dot(V.transpose(0, 2, 1), np.ones(n_neighbors))\n tmp[:, :nev] /= evals + reg[:, None]\n tmp[:, nev:] /= reg[:, None]\n w_reg = np.zeros((N, n_neighbors))\n for i in range(N):\n w_reg[i] = np.dot(V[i], tmp[i])\n w_reg /= w_reg.sum(1)[:, None]\n rho = evals[:, n_components:].sum(1) / evals[:, :n_components].sum(1)\n eta = np.median(rho)\n s_range = np.zeros(N, dtype=int)\n evals_cumsum = stable_cumsum(evals, 1)\n eta_range = evals_cumsum[:, -1:] / evals_cumsum[:, :-1] - 1\n for i in range(N):\n s_range[i] = np.searchsorted(eta_range[i, ::-1], eta)\n s_range += n_neighbors - nev\n M = np.zeros((N, N), dtype=np.float64)\n for i in range(N):\n s_i = s_range[i]\n Vi = 
V[i, :, n_neighbors - s_i:]\n alpha_i = np.linalg.norm(Vi.sum(0)) / np.sqrt(s_i)\n h = np.full(s_i, alpha_i) - np.dot(Vi.T, np.ones(n_neighbors))\n norm_h = np.linalg.norm(h)\n if norm_h < modified_tol:\n h *= 0\n else:\n h /= norm_h\n Wi = Vi - 2 * np.outer(np.dot(Vi, h), h) + (1 - alpha_i) * w_reg[i, :, None]\n (nbrs_x, nbrs_y) = np.meshgrid(neighbors[i], neighbors[i])\n M[nbrs_x, nbrs_y] += np.dot(Wi, Wi.T)\n Wi_sum1 = Wi.sum(1)\n M[i, neighbors[i]] -= Wi_sum1\n M[neighbors[i], i] -= Wi_sum1\n M[i, i] += s_i\n if M_sparse:\n M = csr_matrix(M)\n elif method == 'ltsa':\n neighbors = nbrs.kneighbors(X, n_neighbors=n_neighbors + 1, return_distance=False)\n neighbors = neighbors[:, 1:]\n M = np.zeros((N, N))\n use_svd = n_neighbors > d_in\n for i in range(N):\n Xi = X[neighbors[i]]\n Xi -= Xi.mean(0)\n if use_svd:\n v = svd(Xi, full_matrices=True)[0]\n else:\n Ci = np.dot(Xi, Xi.T)\n v = eigh(Ci)[1][:, ::-1]\n Gi = np.zeros((n_neighbors, n_components + 1))\n Gi[:, 1:] = v[:, :n_components]\n Gi[:, 0] = 1.0 / np.sqrt(n_neighbors)\n GiGiT = np.dot(Gi, Gi.T)\n (nbrs_x, nbrs_y) = np.meshgrid(neighbors[i], neighbors[i])\n M[nbrs_x, nbrs_y] -= GiGiT\n M[neighbors[i], neighbors[i]] += 1\n return null_space(M, n_components, k_skip=1, eigen_solver=eigen_solver, tol=tol, max_iter=max_iter, random_state=random_state)" }, { @@ -114582,6 +122983,10 @@ "docstring": { "type": "{array, matrix, sparse matrix, LinearOperator}", "description": "Input covariance matrix: should be symmetric positive semi-definite" + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -114592,7 +122997,8 @@ "docstring": { "type": "int", "description": "Number of eigenvalues/vectors to return" - } + }, + "refined_type": {} }, { "name": "k_skip", @@ -114602,7 +123008,8 @@ "docstring": { "type": "int, default=1", "description": "Number of low eigenvalues to skip." - } + }, + "refined_type": {} }, { "name": "eigen_solver", @@ -114612,6 +123019,10 @@ "docstring": { "type": "{'auto', 'arpack', 'dense'}, default='arpack'", "description": "auto : algorithm will attempt to choose the best method for input data\narpack : use arnoldi iteration in shift-invert mode.\n For this method, M may be a dense matrix, sparse matrix,\n or general linear operator.\n Warning: ARPACK can be unstable for some problems. It is\n best to try several random seeds in order to check results.\ndense : use standard dense matrix operations for the eigenvalue\n decomposition. For this method, M must be an array\n or matrix type. This method should be avoided for\n large problems." + }, + "refined_type": { + "kind": "EnumType", + "values": ["auto", "dense", "arpack"] } }, { @@ -114622,7 +123033,8 @@ "docstring": { "type": "float, default=1e-6", "description": "Tolerance for 'arpack' method.\nNot used if eigen_solver=='dense'." - } + }, + "refined_type": {} }, { "name": "max_iter", @@ -114632,7 +123044,8 @@ "docstring": { "type": "int, default=100", "description": "Maximum number of iterations for 'arpack' method.\nNot used if eigen_solver=='dense'" - } + }, + "refined_type": {} }, { "name": "random_state", @@ -114642,13 +123055,14 @@ "docstring": { "type": "int, RandomState instance, default=None", "description": "Determines the random number generator when ``solver`` == 'arpack'.\nPass an int for reproducible results across multiple function calls.\nSee :term:`Glossary `." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Find the null space of a matrix M.", - "docstring": "Find the null space of a matrix M.\n\nParameters\n----------\nM : {array, matrix, sparse matrix, LinearOperator}\n Input covariance matrix: should be symmetric positive semi-definite\n\nk : int\n Number of eigenvalues/vectors to return\n\nk_skip : int, default=1\n Number of low eigenvalues to skip.\n\neigen_solver : {'auto', 'arpack', 'dense'}, default='arpack'\n auto : algorithm will attempt to choose the best method for input data\n arpack : use arnoldi iteration in shift-invert mode.\n For this method, M may be a dense matrix, sparse matrix,\n or general linear operator.\n Warning: ARPACK can be unstable for some problems. It is\n best to try several random seeds in order to check results.\n dense : use standard dense matrix operations for the eigenvalue\n decomposition. For this method, M must be an array\n or matrix type. This method should be avoided for\n large problems.\n\ntol : float, default=1e-6\n Tolerance for 'arpack' method.\n Not used if eigen_solver=='dense'.\n\nmax_iter : int, default=100\n Maximum number of iterations for 'arpack' method.\n Not used if eigen_solver=='dense'\n\nrandom_state : int, RandomState instance, default=None\n Determines the random number generator when ``solver`` == 'arpack'.\n Pass an int for reproducible results across multiple function calls.\n See :term:`Glossary `.", + "docstring": "\n Find the null space of a matrix M.\n\n Parameters\n ----------\n M : {array, matrix, sparse matrix, LinearOperator}\n Input covariance matrix: should be symmetric positive semi-definite\n\n k : int\n Number of eigenvalues/vectors to return\n\n k_skip : int, default=1\n Number of low eigenvalues to skip.\n\n eigen_solver : {'auto', 'arpack', 'dense'}, default='arpack'\n auto : algorithm will attempt to choose the best method for input data\n arpack : use arnoldi iteration in shift-invert mode.\n For this method, M may be a dense matrix, sparse matrix,\n or general linear operator.\n Warning: ARPACK can be unstable for some problems. It is\n best to try several random seeds in order to check results.\n dense : use standard dense matrix operations for the eigenvalue\n decomposition. For this method, M must be an array\n or matrix type. 
This method should be avoided for\n large problems.\n\n tol : float, default=1e-6\n Tolerance for 'arpack' method.\n Not used if eigen_solver=='dense'.\n\n max_iter : int, default=100\n Maximum number of iterations for 'arpack' method.\n Not used if eigen_solver=='dense'\n\n random_state : int, RandomState instance, default=None\n Determines the random number generator when ``solver`` == 'arpack'.\n Pass an int for reproducible results across multiple function calls.\n See :term:`Glossary `.\n ", "source_code": "\ndef null_space(M, k, k_skip=1, eigen_solver='arpack', tol=1e-06, max_iter=100, random_state=None):\n \"\"\"\n Find the null space of a matrix M.\n\n Parameters\n ----------\n M : {array, matrix, sparse matrix, LinearOperator}\n Input covariance matrix: should be symmetric positive semi-definite\n\n k : int\n Number of eigenvalues/vectors to return\n\n k_skip : int, default=1\n Number of low eigenvalues to skip.\n\n eigen_solver : {'auto', 'arpack', 'dense'}, default='arpack'\n auto : algorithm will attempt to choose the best method for input data\n arpack : use arnoldi iteration in shift-invert mode.\n For this method, M may be a dense matrix, sparse matrix,\n or general linear operator.\n Warning: ARPACK can be unstable for some problems. It is\n best to try several random seeds in order to check results.\n dense : use standard dense matrix operations for the eigenvalue\n decomposition. For this method, M must be an array\n or matrix type. This method should be avoided for\n large problems.\n\n tol : float, default=1e-6\n Tolerance for 'arpack' method.\n Not used if eigen_solver=='dense'.\n\n max_iter : int, default=100\n Maximum number of iterations for 'arpack' method.\n Not used if eigen_solver=='dense'\n\n random_state : int, RandomState instance, default=None\n Determines the random number generator when ``solver`` == 'arpack'.\n Pass an int for reproducible results across multiple function calls.\n See :term:`Glossary `.\n \"\"\"\n if eigen_solver == 'auto':\n if M.shape[0] > 200 and k + k_skip < 10:\n eigen_solver = 'arpack'\n else:\n eigen_solver = 'dense'\n if eigen_solver == 'arpack':\n v0 = _init_arpack_v0(M.shape[0], random_state)\n try:\n (eigen_values, eigen_vectors) = eigsh(M, k + k_skip, sigma=0.0, tol=tol, maxiter=max_iter, v0=v0)\n except RuntimeError as e:\n raise ValueError(\"Error in determining null-space with ARPACK. Error message: '%s'. Note that eigen_solver='arpack' can fail when the weight matrix is singular or otherwise ill-behaved. In that case, eigen_solver='dense' is recommended. See online documentation for more information.\" % e) from e\n return eigen_vectors[:, k_skip:], np.sum(eigen_values[k_skip:])\n elif eigen_solver == 'dense':\n if hasattr(M, 'toarray'):\n M = M.toarray()\n (eigen_values, eigen_vectors) = eigh(M, eigvals=(k_skip, k + k_skip - 1), overwrite_a=True)\n index = np.argsort(np.abs(eigen_values))\n return eigen_vectors[:, index], np.sum(eigen_values)\n else:\n raise ValueError(\"Unrecognized eigen_solver '%s'\" % eigen_solver)" }, { @@ -114666,7 +123080,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_components", @@ -114676,7 +123091,8 @@ "docstring": { "type": "int, default=2", "description": "Number of dimensions in which to immerse the dissimilarities." - } + }, + "refined_type": {} }, { "name": "metric", @@ -114686,7 +123102,8 @@ "docstring": { "type": "bool, default=True", "description": "If ``True``, perform metric MDS; otherwise, perform nonmetric MDS." 
- } + }, + "refined_type": {} }, { "name": "n_init", @@ -114696,7 +123113,8 @@ "docstring": { "type": "int, default=4", "description": "Number of times the SMACOF algorithm will be run with different\ninitializations. The final results will be the best output of the runs,\ndetermined by the run with the smallest final stress." - } + }, + "refined_type": {} }, { "name": "max_iter", @@ -114706,7 +123124,8 @@ "docstring": { "type": "int, default=300", "description": "Maximum number of iterations of the SMACOF algorithm for a single run." - } + }, + "refined_type": {} }, { "name": "verbose", @@ -114716,7 +123135,8 @@ "docstring": { "type": "int, default=0", "description": "Level of verbosity." - } + }, + "refined_type": {} }, { "name": "eps", @@ -114726,7 +123146,8 @@ "docstring": { "type": "float, default=1e-3", "description": "Relative tolerance with respect to stress at which to declare\nconvergence." - } + }, + "refined_type": {} }, { "name": "n_jobs", @@ -114736,7 +123157,8 @@ "docstring": { "type": "int, default=None", "description": "The number of jobs to use for the computation. If multiple\ninitializations are used (``n_init``), each run of the algorithm is\ncomputed in parallel.\n\n``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n``-1`` means using all processors. See :term:`Glossary `\nfor more details." - } + }, + "refined_type": {} }, { "name": "random_state", @@ -114746,7 +123168,8 @@ "docstring": { "type": "int, RandomState instance or None, default=None", "description": "Determines the random number generator used to initialize the centers.\nPass an int for reproducible results across multiple function calls.\nSee :term:`Glossary `." - } + }, + "refined_type": {} }, { "name": "dissimilarity", @@ -114756,13 +123179,17 @@ "docstring": { "type": "{'euclidean', 'precomputed'}, default='euclidean'", "description": "Dissimilarity measure to use:\n\n- 'euclidean':\n Pairwise Euclidean distances between points in the dataset.\n\n- 'precomputed':\n Pre-computed dissimilarities are passed directly to ``fit`` and\n ``fit_transform``." 
+ }, + "refined_type": { + "kind": "EnumType", + "values": ["euclidean", "precomputed"] } } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, n_components=2, *, metric=True, n_init=4, max_iter=300, verbose=0, eps=0.001, n_jobs=None, random_state=None, dissimilarity='euclidean'):\n self.n_components = n_components\n self.dissimilarity = dissimilarity\n self.metric = metric\n self.n_init = n_init\n self.max_iter = max_iter\n self.eps = eps\n self.verbose = verbose\n self.n_jobs = n_jobs\n self.random_state = random_state" }, { @@ -114780,13 +123207,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _more_tags(self):\n return {'pairwise': self.dissimilarity == 'precomputed'}" }, { @@ -114807,13 +123235,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@deprecated('Attribute `_pairwise` was deprecated in version 0.24 and will be removed in 1.1 (renaming of 0.26).')\n@property\ndef _pairwise(self):\n return self.dissimilarity == 'precomputed'" }, { @@ -114831,7 +123260,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -114841,7 +123271,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features) or (n_samples, n_samples)", "description": "Input data. If ``dissimilarity=='precomputed'``, the input should\nbe the dissimilarity matrix." - } + }, + "refined_type": {} }, { "name": "y", @@ -114851,7 +123282,8 @@ "docstring": { "type": "Ignored", "description": "Not used, present for API consistency by convention." - } + }, + "refined_type": {} }, { "name": "init", @@ -114861,13 +123293,14 @@ "docstring": { "type": "ndarray of shape (n_samples,), default=None", "description": "Starting configuration of the embedding to initialize the SMACOF\nalgorithm. By default, the algorithm is initialized with a randomly\nchosen array." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Compute the position of the points in the embedding space.", - "docstring": "Compute the position of the points in the embedding space.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features) or (n_samples, n_samples)\n Input data. If ``dissimilarity=='precomputed'``, the input should\n be the dissimilarity matrix.\n\ny : Ignored\n Not used, present for API consistency by convention.\n\ninit : ndarray of shape (n_samples,), default=None\n Starting configuration of the embedding to initialize the SMACOF\n algorithm. By default, the algorithm is initialized with a randomly\n chosen array.\n\nReturns\n-------\nself : object\n Fitted estimator.", + "docstring": "\n Compute the position of the points in the embedding space.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features) or (n_samples, n_samples)\n Input data. If ``dissimilarity=='precomputed'``, the input should\n be the dissimilarity matrix.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n init : ndarray of shape (n_samples,), default=None\n Starting configuration of the embedding to initialize the SMACOF\n algorithm. 
By default, the algorithm is initialized with a randomly\n chosen array.\n\n Returns\n -------\n self : object\n Fitted estimator.\n ", "source_code": "\ndef fit(self, X, y=None, init=None):\n \"\"\"\n Compute the position of the points in the embedding space.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features) or (n_samples, n_samples)\n Input data. If ``dissimilarity=='precomputed'``, the input should\n be the dissimilarity matrix.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n init : ndarray of shape (n_samples,), default=None\n Starting configuration of the embedding to initialize the SMACOF\n algorithm. By default, the algorithm is initialized with a randomly\n chosen array.\n\n Returns\n -------\n self : object\n Fitted estimator.\n \"\"\"\n self.fit_transform(X, init=init)\n return self" }, { @@ -114885,7 +123318,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -114895,7 +123329,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features) or (n_samples, n_samples)", "description": "Input data. If ``dissimilarity=='precomputed'``, the input should\nbe the dissimilarity matrix." - } + }, + "refined_type": {} }, { "name": "y", @@ -114905,7 +123340,8 @@ "docstring": { "type": "Ignored", "description": "Not used, present for API consistency by convention." - } + }, + "refined_type": {} }, { "name": "init", @@ -114915,13 +123351,14 @@ "docstring": { "type": "ndarray of shape (n_samples,), default=None", "description": "Starting configuration of the embedding to initialize the SMACOF\nalgorithm. By default, the algorithm is initialized with a randomly\nchosen array." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Fit the data from `X`, and returns the embedded coordinates.", - "docstring": "Fit the data from `X`, and returns the embedded coordinates.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features) or (n_samples, n_samples)\n Input data. If ``dissimilarity=='precomputed'``, the input should\n be the dissimilarity matrix.\n\ny : Ignored\n Not used, present for API consistency by convention.\n\ninit : ndarray of shape (n_samples,), default=None\n Starting configuration of the embedding to initialize the SMACOF\n algorithm. By default, the algorithm is initialized with a randomly\n chosen array.\n\nReturns\n-------\nX_new : ndarray of shape (n_samples, n_components)\n X transformed in the new space.", + "docstring": "\n Fit the data from `X`, and returns the embedded coordinates.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features) or (n_samples, n_samples)\n Input data. If ``dissimilarity=='precomputed'``, the input should\n be the dissimilarity matrix.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n init : ndarray of shape (n_samples,), default=None\n Starting configuration of the embedding to initialize the SMACOF\n algorithm. By default, the algorithm is initialized with a randomly\n chosen array.\n\n Returns\n -------\n X_new : ndarray of shape (n_samples, n_components)\n X transformed in the new space.\n ", "source_code": "\ndef fit_transform(self, X, y=None, init=None):\n \"\"\"\n Fit the data from `X`, and returns the embedded coordinates.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features) or (n_samples, n_samples)\n Input data. 
If ``dissimilarity=='precomputed'``, the input should\n be the dissimilarity matrix.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n init : ndarray of shape (n_samples,), default=None\n Starting configuration of the embedding to initialize the SMACOF\n algorithm. By default, the algorithm is initialized with a randomly\n chosen array.\n\n Returns\n -------\n X_new : ndarray of shape (n_samples, n_components)\n X transformed in the new space.\n \"\"\"\n X = self._validate_data(X)\n if X.shape[0] == X.shape[1] and self.dissimilarity != 'precomputed':\n warnings.warn(\"The MDS API has changed. ``fit`` now constructs an dissimilarity matrix from data. To use a custom dissimilarity matrix, set ``dissimilarity='precomputed'``.\")\n if self.dissimilarity == 'precomputed':\n self.dissimilarity_matrix_ = X\n elif self.dissimilarity == 'euclidean':\n self.dissimilarity_matrix_ = euclidean_distances(X)\n else:\n raise ValueError(\"Proximity must be 'precomputed' or 'euclidean'. Got %s instead\" % str(self.dissimilarity))\n (self.embedding_, self.stress_, self.n_iter_) = smacof(self.dissimilarity_matrix_, metric=self.metric, n_components=self.n_components, init=init, n_init=self.n_init, n_jobs=self.n_jobs, max_iter=self.max_iter, verbose=self.verbose, eps=self.eps, random_state=self.random_state, return_n_iter=True)\n return self.embedding_" }, { @@ -114939,7 +123376,8 @@ "docstring": { "type": "ndarray of shape (n_samples, n_samples)", "description": "Pairwise dissimilarities between the points. Must be symmetric." - } + }, + "refined_type": {} }, { "name": "metric", @@ -114949,7 +123387,8 @@ "docstring": { "type": "bool, default=True", "description": "Compute metric or nonmetric SMACOF algorithm." - } + }, + "refined_type": {} }, { "name": "n_components", @@ -114959,7 +123398,8 @@ "docstring": { "type": "int, default=2", "description": "Number of dimensions in which to immerse the dissimilarities. If an\n``init`` array is provided, this option is overridden and the shape of\n``init`` is used to determine the dimensionality of the embedding\nspace." - } + }, + "refined_type": {} }, { "name": "init", @@ -114969,7 +123409,8 @@ "docstring": { "type": "ndarray of shape (n_samples, n_components), default=None", "description": "Starting configuration of the embedding to initialize the algorithm. By\ndefault, the algorithm is initialized with a randomly chosen array." - } + }, + "refined_type": {} }, { "name": "max_iter", @@ -114979,7 +123420,8 @@ "docstring": { "type": "int, default=300", "description": "Maximum number of iterations of the SMACOF algorithm for a single run." - } + }, + "refined_type": {} }, { "name": "verbose", @@ -114989,7 +123431,8 @@ "docstring": { "type": "int, default=0", "description": "Level of verbosity." - } + }, + "refined_type": {} }, { "name": "eps", @@ -114999,7 +123442,8 @@ "docstring": { "type": "float, default=1e-3", "description": "Relative tolerance with respect to stress at which to declare\nconvergence." - } + }, + "refined_type": {} }, { "name": "random_state", @@ -115009,13 +123453,14 @@ "docstring": { "type": "int, RandomState instance or None, default=None", "description": "Determines the random number generator used to initialize the centers.\nPass an int for reproducible results across multiple function calls.\nSee :term:`Glossary `." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Computes multidimensional scaling using SMACOF algorithm.", - "docstring": "Computes multidimensional scaling using SMACOF algorithm.\n\nParameters\n----------\ndissimilarities : ndarray of shape (n_samples, n_samples)\n Pairwise dissimilarities between the points. Must be symmetric.\n\nmetric : bool, default=True\n Compute metric or nonmetric SMACOF algorithm.\n\nn_components : int, default=2\n Number of dimensions in which to immerse the dissimilarities. If an\n ``init`` array is provided, this option is overridden and the shape of\n ``init`` is used to determine the dimensionality of the embedding\n space.\n\ninit : ndarray of shape (n_samples, n_components), default=None\n Starting configuration of the embedding to initialize the algorithm. By\n default, the algorithm is initialized with a randomly chosen array.\n\nmax_iter : int, default=300\n Maximum number of iterations of the SMACOF algorithm for a single run.\n\nverbose : int, default=0\n Level of verbosity.\n\neps : float, default=1e-3\n Relative tolerance with respect to stress at which to declare\n convergence.\n\nrandom_state : int, RandomState instance or None, default=None\n Determines the random number generator used to initialize the centers.\n Pass an int for reproducible results across multiple function calls.\n See :term:`Glossary `.\n\nReturns\n-------\nX : ndarray of shape (n_samples, n_components)\n Coordinates of the points in a ``n_components``-space.\n\nstress : float\n The final value of the stress (sum of squared distance of the\n disparities and the distances for all constrained points).\n\nn_iter : int\n The number of iterations corresponding to the best stress.", + "docstring": "Computes multidimensional scaling using SMACOF algorithm.\n\n Parameters\n ----------\n dissimilarities : ndarray of shape (n_samples, n_samples)\n Pairwise dissimilarities between the points. Must be symmetric.\n\n metric : bool, default=True\n Compute metric or nonmetric SMACOF algorithm.\n\n n_components : int, default=2\n Number of dimensions in which to immerse the dissimilarities. If an\n ``init`` array is provided, this option is overridden and the shape of\n ``init`` is used to determine the dimensionality of the embedding\n space.\n\n init : ndarray of shape (n_samples, n_components), default=None\n Starting configuration of the embedding to initialize the algorithm. 
By\n default, the algorithm is initialized with a randomly chosen array.\n\n max_iter : int, default=300\n Maximum number of iterations of the SMACOF algorithm for a single run.\n\n verbose : int, default=0\n Level of verbosity.\n\n eps : float, default=1e-3\n Relative tolerance with respect to stress at which to declare\n convergence.\n\n random_state : int, RandomState instance or None, default=None\n Determines the random number generator used to initialize the centers.\n Pass an int for reproducible results across multiple function calls.\n See :term:`Glossary `.\n\n Returns\n -------\n X : ndarray of shape (n_samples, n_components)\n Coordinates of the points in a ``n_components``-space.\n\n stress : float\n The final value of the stress (sum of squared distance of the\n disparities and the distances for all constrained points).\n\n n_iter : int\n The number of iterations corresponding to the best stress.\n ", "source_code": "\ndef _smacof_single(dissimilarities, metric=True, n_components=2, init=None, max_iter=300, verbose=0, eps=0.001, random_state=None):\n \"\"\"Computes multidimensional scaling using SMACOF algorithm.\n\n Parameters\n ----------\n dissimilarities : ndarray of shape (n_samples, n_samples)\n Pairwise dissimilarities between the points. Must be symmetric.\n\n metric : bool, default=True\n Compute metric or nonmetric SMACOF algorithm.\n\n n_components : int, default=2\n Number of dimensions in which to immerse the dissimilarities. If an\n ``init`` array is provided, this option is overridden and the shape of\n ``init`` is used to determine the dimensionality of the embedding\n space.\n\n init : ndarray of shape (n_samples, n_components), default=None\n Starting configuration of the embedding to initialize the algorithm. By\n default, the algorithm is initialized with a randomly chosen array.\n\n max_iter : int, default=300\n Maximum number of iterations of the SMACOF algorithm for a single run.\n\n verbose : int, default=0\n Level of verbosity.\n\n eps : float, default=1e-3\n Relative tolerance with respect to stress at which to declare\n convergence.\n\n random_state : int, RandomState instance or None, default=None\n Determines the random number generator used to initialize the centers.\n Pass an int for reproducible results across multiple function calls.\n See :term:`Glossary `.\n\n Returns\n -------\n X : ndarray of shape (n_samples, n_components)\n Coordinates of the points in a ``n_components``-space.\n\n stress : float\n The final value of the stress (sum of squared distance of the\n disparities and the distances for all constrained points).\n\n n_iter : int\n The number of iterations corresponding to the best stress.\n \"\"\"\n dissimilarities = check_symmetric(dissimilarities, raise_exception=True)\n n_samples = dissimilarities.shape[0]\n random_state = check_random_state(random_state)\n sim_flat = ((1 - np.tri(n_samples)) * dissimilarities).ravel()\n sim_flat_w = sim_flat[sim_flat != 0]\n if init is None:\n X = random_state.rand(n_samples * n_components)\n X = X.reshape((n_samples, n_components))\n else:\n n_components = init.shape[1]\n if n_samples != init.shape[0]:\n raise ValueError('init matrix should be of shape (%d, %d)' % (n_samples, n_components))\n X = init\n old_stress = None\n ir = IsotonicRegression()\n for it in range(max_iter):\n dis = euclidean_distances(X)\n if metric:\n disparities = dissimilarities\n else:\n dis_flat = dis.ravel()\n dis_flat_w = dis_flat[sim_flat != 0]\n disparities_flat = ir.fit_transform(sim_flat_w, dis_flat_w)\n 
disparities = dis_flat.copy()\n disparities[sim_flat != 0] = disparities_flat\n disparities = disparities.reshape((n_samples, n_samples))\n disparities *= np.sqrt(n_samples * (n_samples - 1) / 2 / (disparities**2).sum())\n stress = ((dis.ravel() - disparities.ravel())**2).sum() / 2\n dis[dis == 0] = 1e-05\n ratio = disparities / dis\n B = -ratio\n B[np.arange(len(B)), np.arange(len(B))] += ratio.sum(axis=1)\n X = 1.0 / n_samples * np.dot(B, X)\n dis = np.sqrt((X**2).sum(axis=1)).sum()\n if verbose >= 2:\n print('it: %d, stress %s' % (it, stress))\n if old_stress is not None:\n if old_stress - stress / dis < eps:\n if verbose:\n print('breaking at iteration %d with stress %s' % (it, stress))\n break\n old_stress = stress / dis\n return X, stress, it + 1" }, { @@ -115033,7 +123478,8 @@ "docstring": { "type": "ndarray of shape (n_samples, n_samples)", "description": "Pairwise dissimilarities between the points. Must be symmetric." - } + }, + "refined_type": {} }, { "name": "metric", @@ -115043,7 +123489,8 @@ "docstring": { "type": "bool, default=True", "description": "Compute metric or nonmetric SMACOF algorithm." - } + }, + "refined_type": {} }, { "name": "n_components", @@ -115053,7 +123500,8 @@ "docstring": { "type": "int, default=2", "description": "Number of dimensions in which to immerse the dissimilarities. If an\n``init`` array is provided, this option is overridden and the shape of\n``init`` is used to determine the dimensionality of the embedding\nspace." - } + }, + "refined_type": {} }, { "name": "init", @@ -115063,7 +123511,8 @@ "docstring": { "type": "ndarray of shape (n_samples, n_components), default=None", "description": "Starting configuration of the embedding to initialize the algorithm. By\ndefault, the algorithm is initialized with a randomly chosen array." - } + }, + "refined_type": {} }, { "name": "n_init", @@ -115073,7 +123522,8 @@ "docstring": { "type": "int, default=8", "description": "Number of times the SMACOF algorithm will be run with different\ninitializations. The final results will be the best output of the runs,\ndetermined by the run with the smallest final stress. If ``init`` is\nprovided, this option is overridden and a single run is performed." - } + }, + "refined_type": {} }, { "name": "n_jobs", @@ -115083,7 +123533,8 @@ "docstring": { "type": "int, default=None", "description": "The number of jobs to use for the computation. If multiple\ninitializations are used (``n_init``), each run of the algorithm is\ncomputed in parallel.\n\n``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n``-1`` means using all processors. See :term:`Glossary `\nfor more details." - } + }, + "refined_type": {} }, { "name": "max_iter", @@ -115093,7 +123544,8 @@ "docstring": { "type": "int, default=300", "description": "Maximum number of iterations of the SMACOF algorithm for a single run." - } + }, + "refined_type": {} }, { "name": "verbose", @@ -115103,7 +123555,8 @@ "docstring": { "type": "int, default=0", "description": "Level of verbosity." - } + }, + "refined_type": {} }, { "name": "eps", @@ -115113,7 +123566,8 @@ "docstring": { "type": "float, default=1e-3", "description": "Relative tolerance with respect to stress at which to declare\nconvergence." 
- } + }, + "refined_type": {} }, { "name": "random_state", @@ -115123,7 +123577,8 @@ "docstring": { "type": "int, RandomState instance or None, default=None", "description": "Determines the random number generator used to initialize the centers.\nPass an int for reproducible results across multiple function calls.\nSee :term:`Glossary `." - } + }, + "refined_type": {} }, { "name": "return_n_iter", @@ -115133,13 +123588,14 @@ "docstring": { "type": "bool, default=False", "description": "Whether or not to return the number of iterations." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Compute multidimensional scaling using the SMACOF algorithm.\n\nThe SMACOF (Scaling by MAjorizing a COmplicated Function) algorithm is a multidimensional scaling algorithm which minimizes an objective function (the *stress*) using a majorization technique. Stress majorization, also known as the Guttman Transform, guarantees a monotone convergence of stress, and is more powerful than traditional techniques such as gradient descent. The SMACOF algorithm for metric MDS can be summarized by the following steps: 1. Set an initial start configuration, randomly or not. 2. Compute the stress 3. Compute the Guttman Transform 4. Iterate 2 and 3 until convergence. The nonmetric algorithm adds a monotonic regression step before computing the stress.", - "docstring": "Compute multidimensional scaling using the SMACOF algorithm.\n\nThe SMACOF (Scaling by MAjorizing a COmplicated Function) algorithm is a\nmultidimensional scaling algorithm which minimizes an objective function\n(the *stress*) using a majorization technique. Stress majorization, also\nknown as the Guttman Transform, guarantees a monotone convergence of\nstress, and is more powerful than traditional techniques such as gradient\ndescent.\n\nThe SMACOF algorithm for metric MDS can be summarized by the following\nsteps:\n\n1. Set an initial start configuration, randomly or not.\n2. Compute the stress\n3. Compute the Guttman Transform\n4. Iterate 2 and 3 until convergence.\n\nThe nonmetric algorithm adds a monotonic regression step before computing\nthe stress.\n\nParameters\n----------\ndissimilarities : ndarray of shape (n_samples, n_samples)\n Pairwise dissimilarities between the points. Must be symmetric.\n\nmetric : bool, default=True\n Compute metric or nonmetric SMACOF algorithm.\n\nn_components : int, default=2\n Number of dimensions in which to immerse the dissimilarities. If an\n ``init`` array is provided, this option is overridden and the shape of\n ``init`` is used to determine the dimensionality of the embedding\n space.\n\ninit : ndarray of shape (n_samples, n_components), default=None\n Starting configuration of the embedding to initialize the algorithm. By\n default, the algorithm is initialized with a randomly chosen array.\n\nn_init : int, default=8\n Number of times the SMACOF algorithm will be run with different\n initializations. The final results will be the best output of the runs,\n determined by the run with the smallest final stress. If ``init`` is\n provided, this option is overridden and a single run is performed.\n\nn_jobs : int, default=None\n The number of jobs to use for the computation. If multiple\n initializations are used (``n_init``), each run of the algorithm is\n computed in parallel.\n\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. 
See :term:`Glossary `\n for more details.\n\nmax_iter : int, default=300\n Maximum number of iterations of the SMACOF algorithm for a single run.\n\nverbose : int, default=0\n Level of verbosity.\n\neps : float, default=1e-3\n Relative tolerance with respect to stress at which to declare\n convergence.\n\nrandom_state : int, RandomState instance or None, default=None\n Determines the random number generator used to initialize the centers.\n Pass an int for reproducible results across multiple function calls.\n See :term:`Glossary `.\n\nreturn_n_iter : bool, default=False\n Whether or not to return the number of iterations.\n\nReturns\n-------\nX : ndarray of shape (n_samples, n_components)\n Coordinates of the points in a ``n_components``-space.\n\nstress : float\n The final value of the stress (sum of squared distance of the\n disparities and the distances for all constrained points).\n\nn_iter : int\n The number of iterations corresponding to the best stress. Returned\n only if ``return_n_iter`` is set to ``True``.\n\nNotes\n-----\n\"Modern Multidimensional Scaling - Theory and Applications\" Borg, I.;\nGroenen P. Springer Series in Statistics (1997)\n\n\"Nonmetric multidimensional scaling: a numerical method\" Kruskal, J.\nPsychometrika, 29 (1964)\n\n\"Multidimensional scaling by optimizing goodness of fit to a nonmetric\nhypothesis\" Kruskal, J. Psychometrika, 29, (1964)", + "description": "Compute multidimensional scaling using the SMACOF algorithm.\n\nThe SMACOF (Scaling by MAjorizing a COmplicated Function) algorithm is a\nmultidimensional scaling algorithm which minimizes an objective function\n(the *stress*) using a majorization technique. Stress majorization, also\nknown as the Guttman Transform, guarantees a monotone convergence of\nstress, and is more powerful than traditional techniques such as gradient\ndescent.\n\nThe SMACOF algorithm for metric MDS can be summarized by the following\nsteps:\n\n1. Set an initial start configuration, randomly or not.\n2. Compute the stress\n3. Compute the Guttman Transform\n4. Iterate 2 and 3 until convergence.\n\nThe nonmetric algorithm adds a monotonic regression step before computing\nthe stress.", + "docstring": "Compute multidimensional scaling using the SMACOF algorithm.\n\n The SMACOF (Scaling by MAjorizing a COmplicated Function) algorithm is a\n multidimensional scaling algorithm which minimizes an objective function\n (the *stress*) using a majorization technique. Stress majorization, also\n known as the Guttman Transform, guarantees a monotone convergence of\n stress, and is more powerful than traditional techniques such as gradient\n descent.\n\n The SMACOF algorithm for metric MDS can be summarized by the following\n steps:\n\n 1. Set an initial start configuration, randomly or not.\n 2. Compute the stress\n 3. Compute the Guttman Transform\n 4. Iterate 2 and 3 until convergence.\n\n The nonmetric algorithm adds a monotonic regression step before computing\n the stress.\n\n Parameters\n ----------\n dissimilarities : ndarray of shape (n_samples, n_samples)\n Pairwise dissimilarities between the points. Must be symmetric.\n\n metric : bool, default=True\n Compute metric or nonmetric SMACOF algorithm.\n\n n_components : int, default=2\n Number of dimensions in which to immerse the dissimilarities. 
If an\n ``init`` array is provided, this option is overridden and the shape of\n ``init`` is used to determine the dimensionality of the embedding\n space.\n\n init : ndarray of shape (n_samples, n_components), default=None\n Starting configuration of the embedding to initialize the algorithm. By\n default, the algorithm is initialized with a randomly chosen array.\n\n n_init : int, default=8\n Number of times the SMACOF algorithm will be run with different\n initializations. The final results will be the best output of the runs,\n determined by the run with the smallest final stress. If ``init`` is\n provided, this option is overridden and a single run is performed.\n\n n_jobs : int, default=None\n The number of jobs to use for the computation. If multiple\n initializations are used (``n_init``), each run of the algorithm is\n computed in parallel.\n\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n max_iter : int, default=300\n Maximum number of iterations of the SMACOF algorithm for a single run.\n\n verbose : int, default=0\n Level of verbosity.\n\n eps : float, default=1e-3\n Relative tolerance with respect to stress at which to declare\n convergence.\n\n random_state : int, RandomState instance or None, default=None\n Determines the random number generator used to initialize the centers.\n Pass an int for reproducible results across multiple function calls.\n See :term:`Glossary `.\n\n return_n_iter : bool, default=False\n Whether or not to return the number of iterations.\n\n Returns\n -------\n X : ndarray of shape (n_samples, n_components)\n Coordinates of the points in a ``n_components``-space.\n\n stress : float\n The final value of the stress (sum of squared distance of the\n disparities and the distances for all constrained points).\n\n n_iter : int\n The number of iterations corresponding to the best stress. Returned\n only if ``return_n_iter`` is set to ``True``.\n\n Notes\n -----\n \"Modern Multidimensional Scaling - Theory and Applications\" Borg, I.;\n Groenen P. Springer Series in Statistics (1997)\n\n \"Nonmetric multidimensional scaling: a numerical method\" Kruskal, J.\n Psychometrika, 29 (1964)\n\n \"Multidimensional scaling by optimizing goodness of fit to a nonmetric\n hypothesis\" Kruskal, J. Psychometrika, 29, (1964)\n ", "source_code": "\ndef smacof(dissimilarities, *, metric=True, n_components=2, init=None, n_init=8, n_jobs=None, max_iter=300, verbose=0, eps=0.001, random_state=None, return_n_iter=False):\n \"\"\"Compute multidimensional scaling using the SMACOF algorithm.\n\n The SMACOF (Scaling by MAjorizing a COmplicated Function) algorithm is a\n multidimensional scaling algorithm which minimizes an objective function\n (the *stress*) using a majorization technique. Stress majorization, also\n known as the Guttman Transform, guarantees a monotone convergence of\n stress, and is more powerful than traditional techniques such as gradient\n descent.\n\n The SMACOF algorithm for metric MDS can be summarized by the following\n steps:\n\n 1. Set an initial start configuration, randomly or not.\n 2. Compute the stress\n 3. Compute the Guttman Transform\n 4. Iterate 2 and 3 until convergence.\n\n The nonmetric algorithm adds a monotonic regression step before computing\n the stress.\n\n Parameters\n ----------\n dissimilarities : ndarray of shape (n_samples, n_samples)\n Pairwise dissimilarities between the points. 
Must be symmetric.\n\n metric : bool, default=True\n Compute metric or nonmetric SMACOF algorithm.\n\n n_components : int, default=2\n Number of dimensions in which to immerse the dissimilarities. If an\n ``init`` array is provided, this option is overridden and the shape of\n ``init`` is used to determine the dimensionality of the embedding\n space.\n\n init : ndarray of shape (n_samples, n_components), default=None\n Starting configuration of the embedding to initialize the algorithm. By\n default, the algorithm is initialized with a randomly chosen array.\n\n n_init : int, default=8\n Number of times the SMACOF algorithm will be run with different\n initializations. The final results will be the best output of the runs,\n determined by the run with the smallest final stress. If ``init`` is\n provided, this option is overridden and a single run is performed.\n\n n_jobs : int, default=None\n The number of jobs to use for the computation. If multiple\n initializations are used (``n_init``), each run of the algorithm is\n computed in parallel.\n\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n max_iter : int, default=300\n Maximum number of iterations of the SMACOF algorithm for a single run.\n\n verbose : int, default=0\n Level of verbosity.\n\n eps : float, default=1e-3\n Relative tolerance with respect to stress at which to declare\n convergence.\n\n random_state : int, RandomState instance or None, default=None\n Determines the random number generator used to initialize the centers.\n Pass an int for reproducible results across multiple function calls.\n See :term:`Glossary `.\n\n return_n_iter : bool, default=False\n Whether or not to return the number of iterations.\n\n Returns\n -------\n X : ndarray of shape (n_samples, n_components)\n Coordinates of the points in a ``n_components``-space.\n\n stress : float\n The final value of the stress (sum of squared distance of the\n disparities and the distances for all constrained points).\n\n n_iter : int\n The number of iterations corresponding to the best stress. Returned\n only if ``return_n_iter`` is set to ``True``.\n\n Notes\n -----\n \"Modern Multidimensional Scaling - Theory and Applications\" Borg, I.;\n Groenen P. Springer Series in Statistics (1997)\n\n \"Nonmetric multidimensional scaling: a numerical method\" Kruskal, J.\n Psychometrika, 29 (1964)\n\n \"Multidimensional scaling by optimizing goodness of fit to a nonmetric\n hypothesis\" Kruskal, J. 
Psychometrika, 29, (1964)\n \"\"\"\n dissimilarities = check_array(dissimilarities)\n random_state = check_random_state(random_state)\n if hasattr(init, '__array__'):\n init = np.asarray(init).copy()\n if not n_init == 1:\n warnings.warn('Explicit initial positions passed: performing only one init of the MDS instead of %d' % n_init)\n n_init = 1\n (best_pos, best_stress) = (None, None)\n if effective_n_jobs(n_jobs) == 1:\n for it in range(n_init):\n (pos, stress, n_iter_) = _smacof_single(dissimilarities, metric=metric, n_components=n_components, init=init, max_iter=max_iter, verbose=verbose, eps=eps, random_state=random_state)\n if best_stress is None or stress < best_stress:\n best_stress = stress\n best_pos = pos.copy()\n best_iter = n_iter_\n else:\n seeds = random_state.randint(np.iinfo(np.int32).max, size=n_init)\n results = Parallel(n_jobs=n_jobs, verbose=max(verbose - 1, 0))((delayed(_smacof_single)(dissimilarities, metric=metric, n_components=n_components, init=init, max_iter=max_iter, verbose=verbose, eps=eps, random_state=seed) for seed in seeds))\n (positions, stress, n_iters) = zip(*results)\n best = np.argmin(stress)\n best_stress = stress[best]\n best_pos = positions[best]\n best_iter = n_iters[best]\n if return_n_iter:\n return best_pos, best_stress, best_iter\n else:\n return best_pos, best_stress" }, { @@ -115157,7 +123613,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_components", @@ -115167,7 +123624,8 @@ "docstring": { "type": "int, default=2", "description": "The dimension of the projected subspace." - } + }, + "refined_type": {} }, { "name": "affinity", @@ -115177,6 +123635,15 @@ "docstring": { "type": "{'nearest_neighbors', 'rbf', 'precomputed', 'precomputed_nearest_neighbors'} or callable, default='nearest_neighbors'", "description": "How to construct the affinity matrix.\n - 'nearest_neighbors' : construct the affinity matrix by computing a\n graph of nearest neighbors.\n - 'rbf' : construct the affinity matrix by computing a radial basis\n function (RBF) kernel.\n - 'precomputed' : interpret ``X`` as a precomputed affinity matrix.\n - 'precomputed_nearest_neighbors' : interpret ``X`` as a sparse graph\n of precomputed nearest neighbors, and constructs the affinity matrix\n by selecting the ``n_neighbors`` nearest neighbors.\n - callable : use passed in function as affinity\n the function takes in data matrix (n_samples, n_features)\n and return affinity matrix (n_samples, n_samples)." + }, + "refined_type": { + "kind": "EnumType", + "values": [ + "precomputed_nearest_neighbors", + "nearest_neighbors", + "precomputed", + "rbf" + ] } }, { @@ -115187,7 +123654,8 @@ "docstring": { "type": "float, default=None", "description": "Kernel coefficient for rbf kernel. If None, gamma will be set to\n1/n_features." - } + }, + "refined_type": {} }, { "name": "random_state", @@ -115197,7 +123665,8 @@ "docstring": { "type": "int, RandomState instance or None, default=None", "description": "A pseudo random number generator used for the initialization\nof the lobpcg eigen vectors decomposition when `eigen_solver ==\n'amg'`, and for the K-Means initialization. Use an int to make\nthe results deterministic across calls (See\n:term:`Glossary `).\n\n.. note::\n When using `eigen_solver == 'amg'`,\n it is necessary to also fix the global numpy seed with\n `np.random.seed(int)` to get deterministic results. See\n https://github.com/pyamg/pyamg/issues/139 for further\n information." 
- } + }, + "refined_type": {} }, { "name": "eigen_solver", @@ -115207,6 +123676,10 @@ "docstring": { "type": "{'arpack', 'lobpcg', 'amg'}, default=None", "description": "The eigenvalue decomposition strategy to use. AMG requires pyamg\nto be installed. It can be faster on very large, sparse problems.\nIf None, then ``'arpack'`` is used." + }, + "refined_type": { + "kind": "EnumType", + "values": ["lobpcg", "amg", "arpack"] } }, { @@ -115217,7 +123690,8 @@ "docstring": { "type": "int, default=None", "description": "Number of nearest neighbors for nearest_neighbors graph building.\nIf None, n_neighbors will be set to max(n_samples/10, 1)." - } + }, + "refined_type": {} }, { "name": "n_jobs", @@ -115227,13 +123701,14 @@ "docstring": { "type": "int, default=None", "description": "The number of parallel jobs to run.\n``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n``-1`` means using all processors. See :term:`Glossary `\nfor more details." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, n_components=2, *, affinity='nearest_neighbors', gamma=None, random_state=None, eigen_solver=None, n_neighbors=None, n_jobs=None):\n self.n_components = n_components\n self.affinity = affinity\n self.gamma = gamma\n self.random_state = random_state\n self.eigen_solver = eigen_solver\n self.n_neighbors = n_neighbors\n self.n_jobs = n_jobs" }, { @@ -115251,7 +123726,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -115261,7 +123737,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "Y", @@ -115271,13 +123748,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Calculate the affinity matrix from data Parameters ---------- X : array-like of shape (n_samples, n_features) Training vector, where `n_samples` is the number of samples and `n_features` is the number of features.\n\n If affinity is \"precomputed\" X : array-like of shape (n_samples, n_samples), Interpret X as precomputed adjacency graph computed from samples. 
Y: Ignored", - "docstring": "Calculate the affinity matrix from data\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n Training vector, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n If affinity is \"precomputed\"\n X : array-like of shape (n_samples, n_samples),\n Interpret X as precomputed adjacency graph computed from\n samples.\n\nY: Ignored\n\nReturns\n-------\naffinity_matrix of shape (n_samples, n_samples)", + "description": "Calculate the affinity matrix from data\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n Training vector, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n If affinity is \"precomputed\"\n X : array-like of shape (n_samples, n_samples),\n Interpret X as precomputed adjacency graph computed from\n samples.\n\nY: Ignored", + "docstring": "Calculate the affinity matrix from data\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training vector, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n If affinity is \"precomputed\"\n X : array-like of shape (n_samples, n_samples),\n Interpret X as precomputed adjacency graph computed from\n samples.\n\n Y: Ignored\n\n Returns\n -------\n affinity_matrix of shape (n_samples, n_samples)\n ", "source_code": "\ndef _get_affinity_matrix(self, X, Y=None):\n \"\"\"Calculate the affinity matrix from data\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training vector, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n If affinity is \"precomputed\"\n X : array-like of shape (n_samples, n_samples),\n Interpret X as precomputed adjacency graph computed from\n samples.\n\n Y: Ignored\n\n Returns\n -------\n affinity_matrix of shape (n_samples, n_samples)\n \"\"\"\n if self.affinity == 'precomputed':\n self.affinity_matrix_ = X\n return self.affinity_matrix_\n if self.affinity == 'precomputed_nearest_neighbors':\n estimator = NearestNeighbors(n_neighbors=self.n_neighbors, n_jobs=self.n_jobs, metric='precomputed').fit(X)\n connectivity = estimator.kneighbors_graph(X=X, mode='connectivity')\n self.affinity_matrix_ = 0.5 * (connectivity + connectivity.T)\n return self.affinity_matrix_\n if self.affinity == 'nearest_neighbors':\n if sparse.issparse(X):\n warnings.warn('Nearest neighbors affinity currently does not support sparse input, falling back to rbf affinity')\n self.affinity = 'rbf'\n else:\n self.n_neighbors_ = self.n_neighbors if self.n_neighbors is not None else max(int(X.shape[0] / 10), 1)\n self.affinity_matrix_ = kneighbors_graph(X, self.n_neighbors_, include_self=True, n_jobs=self.n_jobs)\n self.affinity_matrix_ = 0.5 * (self.affinity_matrix_ + self.affinity_matrix_.T)\n return self.affinity_matrix_\n if self.affinity == 'rbf':\n self.gamma_ = self.gamma if self.gamma is not None else 1.0 / X.shape[1]\n self.affinity_matrix_ = rbf_kernel(X, gamma=self.gamma_)\n return self.affinity_matrix_\n self.affinity_matrix_ = self.affinity(X)\n return self.affinity_matrix_" }, { @@ -115295,13 +123773,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _more_tags(self):\n return {'pairwise': self.affinity in ['precomputed', 'precomputed_nearest_neighbors']}" }, { @@ -115322,13 +123801,14 @@ "docstring": { 
"type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@deprecated('Attribute `_pairwise` was deprecated in version 0.24 and will be removed in 1.1 (renaming of 0.26).')\n@property\ndef _pairwise(self):\n return self.affinity in ['precomputed', 'precomputed_nearest_neighbors']" }, { @@ -115346,7 +123826,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -115356,6 +123837,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "Training vector, where `n_samples` is the number of samples\nand `n_features` is the number of features.\n\nIf affinity is \"precomputed\"\nX : {array-like, sparse matrix}, shape (n_samples, n_samples),\nInterpret X as precomputed adjacency graph computed from\nsamples." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -115365,15 +123850,16 @@ "assigned_by": "POSITION_OR_NAME", "docstring": { "type": "Ignored", - "description": "" - } + "description": "Not used, present for API consistency by convention." + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Fit the model from data in X.", - "docstring": "Fit the model from data in X.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training vector, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n If affinity is \"precomputed\"\n X : {array-like, sparse matrix}, shape (n_samples, n_samples),\n Interpret X as precomputed adjacency graph computed from\n samples.\n\ny : Ignored\n\nReturns\n-------\nself : object\n Returns the instance itself.", - "source_code": "\ndef fit(self, X, y=None):\n \"\"\"Fit the model from data in X.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training vector, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n If affinity is \"precomputed\"\n X : {array-like, sparse matrix}, shape (n_samples, n_samples),\n Interpret X as precomputed adjacency graph computed from\n samples.\n\n y : Ignored\n\n Returns\n -------\n self : object\n Returns the instance itself.\n \"\"\"\n X = self._validate_data(X, accept_sparse='csr', ensure_min_samples=2, estimator=self)\n random_state = check_random_state(self.random_state)\n if isinstance(self.affinity, str):\n if self.affinity not in {'nearest_neighbors', 'rbf', 'precomputed', 'precomputed_nearest_neighbors'}:\n raise ValueError(\"%s is not a valid affinity. Expected 'precomputed', 'rbf', 'nearest_neighbors' or a callable.\" % self.affinity)\n elif not callable(self.affinity):\n raise ValueError(\"'affinity' is expected to be an affinity name or a callable. 
Got: %s\" % self.affinity)\n affinity_matrix = self._get_affinity_matrix(X)\n self.embedding_ = spectral_embedding(affinity_matrix, n_components=self.n_components, eigen_solver=self.eigen_solver, random_state=random_state)\n return self" + "docstring": "Fit the model from data in X.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training vector, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n If affinity is \"precomputed\"\n X : {array-like, sparse matrix}, shape (n_samples, n_samples),\n Interpret X as precomputed adjacency graph computed from\n samples.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n Returns\n -------\n self : object\n Returns the instance itself.\n ", + "source_code": "\ndef fit(self, X, y=None):\n \"\"\"Fit the model from data in X.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training vector, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n If affinity is \"precomputed\"\n X : {array-like, sparse matrix}, shape (n_samples, n_samples),\n Interpret X as precomputed adjacency graph computed from\n samples.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n Returns\n -------\n self : object\n Returns the instance itself.\n \"\"\"\n X = self._validate_data(X, accept_sparse='csr', ensure_min_samples=2, estimator=self)\n random_state = check_random_state(self.random_state)\n if isinstance(self.affinity, str):\n if self.affinity not in {'nearest_neighbors', 'rbf', 'precomputed', 'precomputed_nearest_neighbors'}:\n raise ValueError(\"%s is not a valid affinity. Expected 'precomputed', 'rbf', 'nearest_neighbors' or a callable.\" % self.affinity)\n elif not callable(self.affinity):\n raise ValueError(\"'affinity' is expected to be an affinity name or a callable. Got: %s\" % self.affinity)\n affinity_matrix = self._get_affinity_matrix(X)\n self.embedding_ = spectral_embedding(affinity_matrix, n_components=self.n_components, eigen_solver=self.eigen_solver, random_state=random_state)\n return self" }, { "name": "fit_transform", @@ -115390,7 +123876,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -115400,6 +123887,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "Training vector, where `n_samples` is the number of samples\nand `n_features` is the number of features.\n\nIf affinity is \"precomputed\"\nX : {array-like, sparse matrix} of shape (n_samples, n_samples),\nInterpret X as precomputed adjacency graph computed from\nsamples." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -115409,15 +123900,16 @@ "assigned_by": "POSITION_OR_NAME", "docstring": { "type": "Ignored", - "description": "" - } + "description": "Not used, present for API consistency by convention." 
+ }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Fit the model from data in X and transform X.", - "docstring": "Fit the model from data in X and transform X.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training vector, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n If affinity is \"precomputed\"\n X : {array-like, sparse matrix} of shape (n_samples, n_samples),\n Interpret X as precomputed adjacency graph computed from\n samples.\n\ny : Ignored\n\nReturns\n-------\nX_new : array-like of shape (n_samples, n_components)", - "source_code": "\ndef fit_transform(self, X, y=None):\n \"\"\"Fit the model from data in X and transform X.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training vector, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n If affinity is \"precomputed\"\n X : {array-like, sparse matrix} of shape (n_samples, n_samples),\n Interpret X as precomputed adjacency graph computed from\n samples.\n\n y : Ignored\n\n Returns\n -------\n X_new : array-like of shape (n_samples, n_components)\n \"\"\"\n self.fit(X)\n return self.embedding_" + "docstring": "Fit the model from data in X and transform X.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training vector, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n If affinity is \"precomputed\"\n X : {array-like, sparse matrix} of shape (n_samples, n_samples),\n Interpret X as precomputed adjacency graph computed from\n samples.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n Returns\n -------\n X_new : array-like of shape (n_samples, n_components)\n Spectral embedding of the training matrix.\n ", + "source_code": "\ndef fit_transform(self, X, y=None):\n \"\"\"Fit the model from data in X and transform X.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training vector, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n If affinity is \"precomputed\"\n X : {array-like, sparse matrix} of shape (n_samples, n_samples),\n Interpret X as precomputed adjacency graph computed from\n samples.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n Returns\n -------\n X_new : array-like of shape (n_samples, n_components)\n Spectral embedding of the training matrix.\n \"\"\"\n self.fit(X)\n return self.embedding_" }, { "name": "_graph_connected_component", @@ -115434,7 +123926,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_samples)", "description": "Adjacency matrix of the graph, non-zero weight means an edge\nbetween the nodes." - } + }, + "refined_type": {} }, { "name": "node_id", @@ -115444,13 +123937,14 @@ "docstring": { "type": "int", "description": "The index of the query node of the graph." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Find the largest graph connected components that contains one given node.", - "docstring": "Find the largest graph connected components that contains one\ngiven node.\n\nParameters\n----------\ngraph : array-like of shape (n_samples, n_samples)\n Adjacency matrix of the graph, non-zero weight means an edge\n between the nodes.\n\nnode_id : int\n The index of the query node of the graph.\n\nReturns\n-------\nconnected_components_matrix : array-like of shape (n_samples,)\n An array of bool value indicating the indexes of the nodes\n belonging to the largest connected components of the given query\n node.", + "description": "Find the largest graph connected components that contains one\ngiven node.", + "docstring": "Find the largest graph connected components that contains one\n given node.\n\n Parameters\n ----------\n graph : array-like of shape (n_samples, n_samples)\n Adjacency matrix of the graph, non-zero weight means an edge\n between the nodes.\n\n node_id : int\n The index of the query node of the graph.\n\n Returns\n -------\n connected_components_matrix : array-like of shape (n_samples,)\n An array of bool value indicating the indexes of the nodes\n belonging to the largest connected components of the given query\n node.\n ", "source_code": "\ndef _graph_connected_component(graph, node_id):\n \"\"\"Find the largest graph connected components that contains one\n given node.\n\n Parameters\n ----------\n graph : array-like of shape (n_samples, n_samples)\n Adjacency matrix of the graph, non-zero weight means an edge\n between the nodes.\n\n node_id : int\n The index of the query node of the graph.\n\n Returns\n -------\n connected_components_matrix : array-like of shape (n_samples,)\n An array of bool value indicating the indexes of the nodes\n belonging to the largest connected components of the given query\n node.\n \"\"\"\n n_node = graph.shape[0]\n if sparse.issparse(graph):\n graph = graph.tocsr()\n connected_nodes = np.zeros(n_node, dtype=bool)\n nodes_to_explore = np.zeros(n_node, dtype=bool)\n nodes_to_explore[node_id] = True\n for _ in range(n_node):\n last_num_component = connected_nodes.sum()\n np.logical_or(connected_nodes, nodes_to_explore, out=connected_nodes)\n if last_num_component >= connected_nodes.sum():\n break\n indices = np.where(nodes_to_explore)[0]\n nodes_to_explore.fill(False)\n for i in indices:\n if sparse.issparse(graph):\n neighbors = graph[i].toarray().ravel()\n else:\n neighbors = graph[i]\n np.logical_or(nodes_to_explore, neighbors, out=nodes_to_explore)\n return connected_nodes" }, { @@ -115468,13 +123962,17 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_samples)", "description": "Adjacency matrix of the graph, non-zero weight means an edge\nbetween the nodes." 
+ }, + "refined_type": { + "kind": "EnumType", + "values": [] } } ], "results": [], "is_public": false, "description": "Return whether the graph is connected (True) or Not (False).", - "docstring": "Return whether the graph is connected (True) or Not (False).\n\nParameters\n----------\ngraph : {array-like, sparse matrix} of shape (n_samples, n_samples)\n Adjacency matrix of the graph, non-zero weight means an edge\n between the nodes.\n\nReturns\n-------\nis_connected : bool\n True means the graph is fully connected and False means not.", + "docstring": "Return whether the graph is connected (True) or Not (False).\n\n Parameters\n ----------\n graph : {array-like, sparse matrix} of shape (n_samples, n_samples)\n Adjacency matrix of the graph, non-zero weight means an edge\n between the nodes.\n\n Returns\n -------\n is_connected : bool\n True means the graph is fully connected and False means not.\n ", "source_code": "\ndef _graph_is_connected(graph):\n \"\"\"Return whether the graph is connected (True) or Not (False).\n\n Parameters\n ----------\n graph : {array-like, sparse matrix} of shape (n_samples, n_samples)\n Adjacency matrix of the graph, non-zero weight means an edge\n between the nodes.\n\n Returns\n -------\n is_connected : bool\n True means the graph is fully connected and False means not.\n \"\"\"\n if sparse.isspmatrix(graph):\n (n_connected_components, _) = connected_components(graph)\n return n_connected_components == 1\n else:\n return _graph_connected_component(graph, 0).sum() == graph.shape[0]" }, { @@ -115492,6 +123990,10 @@ "docstring": { "type": "{ndarray, sparse matrix}", "description": "The graph laplacian." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -115502,7 +124004,8 @@ "docstring": { "type": "float", "description": "The value of the diagonal." - } + }, + "refined_type": {} }, { "name": "norm_laplacian", @@ -115512,13 +124015,14 @@ "docstring": { "type": "bool", "description": "Whether the value of the diagonal should be changed or not." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Set the diagonal of the laplacian matrix and convert it to a sparse format well suited for eigenvalue decomposition.", - "docstring": "Set the diagonal of the laplacian matrix and convert it to a\nsparse format well suited for eigenvalue decomposition.\n\nParameters\n----------\nlaplacian : {ndarray, sparse matrix}\n The graph laplacian.\n\nvalue : float\n The value of the diagonal.\n\nnorm_laplacian : bool\n Whether the value of the diagonal should be changed or not.\n\nReturns\n-------\nlaplacian : {array, sparse matrix}\n An array of matrix in a form that is well suited to fast\n eigenvalue decomposition, depending on the band width of the\n matrix.", + "description": "Set the diagonal of the laplacian matrix and convert it to a\nsparse format well suited for eigenvalue decomposition.", + "docstring": "Set the diagonal of the laplacian matrix and convert it to a\n sparse format well suited for eigenvalue decomposition.\n\n Parameters\n ----------\n laplacian : {ndarray, sparse matrix}\n The graph laplacian.\n\n value : float\n The value of the diagonal.\n\n norm_laplacian : bool\n Whether the value of the diagonal should be changed or not.\n\n Returns\n -------\n laplacian : {array, sparse matrix}\n An array of matrix in a form that is well suited to fast\n eigenvalue decomposition, depending on the band width of the\n matrix.\n ", "source_code": "\ndef _set_diag(laplacian, value, norm_laplacian):\n \"\"\"Set the diagonal of the laplacian matrix and convert it to a\n sparse format well suited for eigenvalue decomposition.\n\n Parameters\n ----------\n laplacian : {ndarray, sparse matrix}\n The graph laplacian.\n\n value : float\n The value of the diagonal.\n\n norm_laplacian : bool\n Whether the value of the diagonal should be changed or not.\n\n Returns\n -------\n laplacian : {array, sparse matrix}\n An array of matrix in a form that is well suited to fast\n eigenvalue decomposition, depending on the band width of the\n matrix.\n \"\"\"\n n_nodes = laplacian.shape[0]\n if not sparse.isspmatrix(laplacian):\n if norm_laplacian:\n laplacian.flat[::n_nodes + 1] = value\n else:\n laplacian = laplacian.tocoo()\n if norm_laplacian:\n diag_idx = laplacian.row == laplacian.col\n laplacian.data[diag_idx] = value\n n_diags = np.unique(laplacian.row - laplacian.col).size\n if n_diags <= 7:\n laplacian = laplacian.todia()\n else:\n laplacian = laplacian.tocsr()\n return laplacian" }, { @@ -115536,6 +124040,10 @@ "docstring": { "type": "{array-like, sparse graph} of shape (n_samples, n_samples)", "description": "The adjacency matrix of the graph to embed." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -115546,7 +124054,8 @@ "docstring": { "type": "int, default=8", "description": "The dimension of the projection subspace." - } + }, + "refined_type": {} }, { "name": "eigen_solver", @@ -115556,6 +124065,10 @@ "docstring": { "type": "{'arpack', 'lobpcg', 'amg'}, default=None", "description": "The eigenvalue decomposition strategy to use. AMG requires pyamg\nto be installed. It can be faster on very large, sparse problems,\nbut may also lead to instabilities. If None, then ``'arpack'`` is\nused." 
+ }, + "refined_type": { + "kind": "EnumType", + "values": ["lobpcg", "amg", "arpack"] } }, { @@ -115566,7 +124079,8 @@ "docstring": { "type": "int, RandomState instance or None, default=None", "description": "A pseudo random number generator used for the initialization\nof the lobpcg eigen vectors decomposition when `eigen_solver ==\n'amg'`, and for the K-Means initialization. Use an int to make\nthe results deterministic across calls (See\n:term:`Glossary `).\n\n.. note::\n When using `eigen_solver == 'amg'`,\n it is necessary to also fix the global numpy seed with\n `np.random.seed(int)` to get deterministic results. See\n https://github.com/pyamg/pyamg/issues/139 for further\n information." - } + }, + "refined_type": {} }, { "name": "eigen_tol", @@ -115576,7 +124090,8 @@ "docstring": { "type": "float, default=0.0", "description": "Stopping criterion for eigendecomposition of the Laplacian matrix\nwhen using arpack eigen_solver." - } + }, + "refined_type": {} }, { "name": "norm_laplacian", @@ -115586,7 +124101,8 @@ "docstring": { "type": "bool, default=True", "description": "If True, then compute symmetric normalized Laplacian." - } + }, + "refined_type": {} }, { "name": "drop_first", @@ -115596,13 +124112,14 @@ "docstring": { "type": "bool, default=True", "description": "Whether to drop the first eigenvector. For spectral embedding, this\nshould be True as the first eigenvector should be constant vector for\nconnected graph, but for spectral clustering, this should be kept as\nFalse to retain the first eigenvector." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Project the sample on the first eigenvectors of the graph Laplacian.\n\nThe adjacency matrix is used to compute a normalized graph Laplacian whose spectrum (especially the eigenvectors associated to the smallest eigenvalues) has an interpretation in terms of minimal number of cuts necessary to split the graph into comparably sized components. This embedding can also 'work' even if the ``adjacency`` variable is not strictly the adjacency matrix of a graph but more generally an affinity or similarity matrix between samples (for instance the heat kernel of a euclidean distance matrix or a k-NN matrix). However care must taken to always make the affinity matrix symmetric so that the eigenvector decomposition works as expected. Note : Laplacian Eigenmaps is the actual algorithm implemented here. 
Read more in the :ref:`User Guide `.", - "docstring": "Project the sample on the first eigenvectors of the graph Laplacian.\n\nThe adjacency matrix is used to compute a normalized graph Laplacian\nwhose spectrum (especially the eigenvectors associated to the\nsmallest eigenvalues) has an interpretation in terms of minimal\nnumber of cuts necessary to split the graph into comparably sized\ncomponents.\n\nThis embedding can also 'work' even if the ``adjacency`` variable is\nnot strictly the adjacency matrix of a graph but more generally\nan affinity or similarity matrix between samples (for instance the\nheat kernel of a euclidean distance matrix or a k-NN matrix).\n\nHowever care must taken to always make the affinity matrix symmetric\nso that the eigenvector decomposition works as expected.\n\nNote : Laplacian Eigenmaps is the actual algorithm implemented here.\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\nadjacency : {array-like, sparse graph} of shape (n_samples, n_samples)\n The adjacency matrix of the graph to embed.\n\nn_components : int, default=8\n The dimension of the projection subspace.\n\neigen_solver : {'arpack', 'lobpcg', 'amg'}, default=None\n The eigenvalue decomposition strategy to use. AMG requires pyamg\n to be installed. It can be faster on very large, sparse problems,\n but may also lead to instabilities. If None, then ``'arpack'`` is\n used.\n\nrandom_state : int, RandomState instance or None, default=None\n A pseudo random number generator used for the initialization\n of the lobpcg eigen vectors decomposition when `eigen_solver ==\n 'amg'`, and for the K-Means initialization. Use an int to make\n the results deterministic across calls (See\n :term:`Glossary `).\n\n .. note::\n When using `eigen_solver == 'amg'`,\n it is necessary to also fix the global numpy seed with\n `np.random.seed(int)` to get deterministic results. See\n https://github.com/pyamg/pyamg/issues/139 for further\n information.\n\neigen_tol : float, default=0.0\n Stopping criterion for eigendecomposition of the Laplacian matrix\n when using arpack eigen_solver.\n\nnorm_laplacian : bool, default=True\n If True, then compute symmetric normalized Laplacian.\n\ndrop_first : bool, default=True\n Whether to drop the first eigenvector. For spectral embedding, this\n should be True as the first eigenvector should be constant vector for\n connected graph, but for spectral clustering, this should be kept as\n False to retain the first eigenvector.\n\nReturns\n-------\nembedding : ndarray of shape (n_samples, n_components)\n The reduced samples.\n\nNotes\n-----\nSpectral Embedding (Laplacian Eigenmaps) is most useful when the graph\nhas one connected component. If there graph has many components, the first\nfew eigenvectors will simply uncover the connected components of the graph.\n\nReferences\n----------\n* https://en.wikipedia.org/wiki/LOBPCG\n\n* Toward the Optimal Preconditioned Eigensolver: Locally Optimal\n Block Preconditioned Conjugate Gradient Method\n Andrew V. 
Knyazev\n https://doi.org/10.1137%2FS1064827500366124", + "description": "Project the sample on the first eigenvectors of the graph Laplacian.\n\nThe adjacency matrix is used to compute a normalized graph Laplacian\nwhose spectrum (especially the eigenvectors associated to the\nsmallest eigenvalues) has an interpretation in terms of minimal\nnumber of cuts necessary to split the graph into comparably sized\ncomponents.\n\nThis embedding can also 'work' even if the ``adjacency`` variable is\nnot strictly the adjacency matrix of a graph but more generally\nan affinity or similarity matrix between samples (for instance the\nheat kernel of a euclidean distance matrix or a k-NN matrix).\n\nHowever care must taken to always make the affinity matrix symmetric\nso that the eigenvector decomposition works as expected.\n\nNote : Laplacian Eigenmaps is the actual algorithm implemented here.\n\nRead more in the :ref:`User Guide `.", + "docstring": "Project the sample on the first eigenvectors of the graph Laplacian.\n\n The adjacency matrix is used to compute a normalized graph Laplacian\n whose spectrum (especially the eigenvectors associated to the\n smallest eigenvalues) has an interpretation in terms of minimal\n number of cuts necessary to split the graph into comparably sized\n components.\n\n This embedding can also 'work' even if the ``adjacency`` variable is\n not strictly the adjacency matrix of a graph but more generally\n an affinity or similarity matrix between samples (for instance the\n heat kernel of a euclidean distance matrix or a k-NN matrix).\n\n However care must taken to always make the affinity matrix symmetric\n so that the eigenvector decomposition works as expected.\n\n Note : Laplacian Eigenmaps is the actual algorithm implemented here.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n adjacency : {array-like, sparse graph} of shape (n_samples, n_samples)\n The adjacency matrix of the graph to embed.\n\n n_components : int, default=8\n The dimension of the projection subspace.\n\n eigen_solver : {'arpack', 'lobpcg', 'amg'}, default=None\n The eigenvalue decomposition strategy to use. AMG requires pyamg\n to be installed. It can be faster on very large, sparse problems,\n but may also lead to instabilities. If None, then ``'arpack'`` is\n used.\n\n random_state : int, RandomState instance or None, default=None\n A pseudo random number generator used for the initialization\n of the lobpcg eigen vectors decomposition when `eigen_solver ==\n 'amg'`, and for the K-Means initialization. Use an int to make\n the results deterministic across calls (See\n :term:`Glossary `).\n\n .. note::\n When using `eigen_solver == 'amg'`,\n it is necessary to also fix the global numpy seed with\n `np.random.seed(int)` to get deterministic results. See\n https://github.com/pyamg/pyamg/issues/139 for further\n information.\n\n eigen_tol : float, default=0.0\n Stopping criterion for eigendecomposition of the Laplacian matrix\n when using arpack eigen_solver.\n\n norm_laplacian : bool, default=True\n If True, then compute symmetric normalized Laplacian.\n\n drop_first : bool, default=True\n Whether to drop the first eigenvector. 
For spectral embedding, this\n should be True as the first eigenvector should be constant vector for\n connected graph, but for spectral clustering, this should be kept as\n False to retain the first eigenvector.\n\n Returns\n -------\n embedding : ndarray of shape (n_samples, n_components)\n The reduced samples.\n\n Notes\n -----\n Spectral Embedding (Laplacian Eigenmaps) is most useful when the graph\n has one connected component. If there graph has many components, the first\n few eigenvectors will simply uncover the connected components of the graph.\n\n References\n ----------\n * https://en.wikipedia.org/wiki/LOBPCG\n\n * Toward the Optimal Preconditioned Eigensolver: Locally Optimal\n Block Preconditioned Conjugate Gradient Method\n Andrew V. Knyazev\n https://doi.org/10.1137%2FS1064827500366124\n ", "source_code": "\ndef spectral_embedding(adjacency, *, n_components=8, eigen_solver=None, random_state=None, eigen_tol=0.0, norm_laplacian=True, drop_first=True):\n \"\"\"Project the sample on the first eigenvectors of the graph Laplacian.\n\n The adjacency matrix is used to compute a normalized graph Laplacian\n whose spectrum (especially the eigenvectors associated to the\n smallest eigenvalues) has an interpretation in terms of minimal\n number of cuts necessary to split the graph into comparably sized\n components.\n\n This embedding can also 'work' even if the ``adjacency`` variable is\n not strictly the adjacency matrix of a graph but more generally\n an affinity or similarity matrix between samples (for instance the\n heat kernel of a euclidean distance matrix or a k-NN matrix).\n\n However care must taken to always make the affinity matrix symmetric\n so that the eigenvector decomposition works as expected.\n\n Note : Laplacian Eigenmaps is the actual algorithm implemented here.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n adjacency : {array-like, sparse graph} of shape (n_samples, n_samples)\n The adjacency matrix of the graph to embed.\n\n n_components : int, default=8\n The dimension of the projection subspace.\n\n eigen_solver : {'arpack', 'lobpcg', 'amg'}, default=None\n The eigenvalue decomposition strategy to use. AMG requires pyamg\n to be installed. It can be faster on very large, sparse problems,\n but may also lead to instabilities. If None, then ``'arpack'`` is\n used.\n\n random_state : int, RandomState instance or None, default=None\n A pseudo random number generator used for the initialization\n of the lobpcg eigen vectors decomposition when `eigen_solver ==\n 'amg'`, and for the K-Means initialization. Use an int to make\n the results deterministic across calls (See\n :term:`Glossary `).\n\n .. note::\n When using `eigen_solver == 'amg'`,\n it is necessary to also fix the global numpy seed with\n `np.random.seed(int)` to get deterministic results. See\n https://github.com/pyamg/pyamg/issues/139 for further\n information.\n\n eigen_tol : float, default=0.0\n Stopping criterion for eigendecomposition of the Laplacian matrix\n when using arpack eigen_solver.\n\n norm_laplacian : bool, default=True\n If True, then compute symmetric normalized Laplacian.\n\n drop_first : bool, default=True\n Whether to drop the first eigenvector. 
For spectral embedding, this\n should be True as the first eigenvector should be constant vector for\n connected graph, but for spectral clustering, this should be kept as\n False to retain the first eigenvector.\n\n Returns\n -------\n embedding : ndarray of shape (n_samples, n_components)\n The reduced samples.\n\n Notes\n -----\n Spectral Embedding (Laplacian Eigenmaps) is most useful when the graph\n has one connected component. If there graph has many components, the first\n few eigenvectors will simply uncover the connected components of the graph.\n\n References\n ----------\n * https://en.wikipedia.org/wiki/LOBPCG\n\n * Toward the Optimal Preconditioned Eigensolver: Locally Optimal\n Block Preconditioned Conjugate Gradient Method\n Andrew V. Knyazev\n https://doi.org/10.1137%2FS1064827500366124\n \"\"\"\n adjacency = check_symmetric(adjacency)\n try:\n from pyamg import smoothed_aggregation_solver\n except ImportError as e:\n if eigen_solver == 'amg':\n raise ValueError(\"The eigen_solver was set to 'amg', but pyamg is not available.\") from e\n if eigen_solver is None:\n eigen_solver = 'arpack'\n elif eigen_solver not in ('arpack', 'lobpcg', 'amg'):\n raise ValueError(\"Unknown value for eigen_solver: '%s'.Should be 'amg', 'arpack', or 'lobpcg'\" % eigen_solver)\n random_state = check_random_state(random_state)\n n_nodes = adjacency.shape[0]\n if drop_first:\n n_components = n_components + 1\n if not _graph_is_connected(adjacency):\n warnings.warn('Graph is not fully connected, spectral embedding may not work as expected.')\n (laplacian, dd) = csgraph_laplacian(adjacency, normed=norm_laplacian, return_diag=True)\n if eigen_solver == 'arpack' or eigen_solver != 'lobpcg' and (not sparse.isspmatrix(laplacian) or n_nodes < 5 * n_components):\n laplacian = _set_diag(laplacian, 1, norm_laplacian)\n try:\n laplacian *= -1\n v0 = _init_arpack_v0(laplacian.shape[0], random_state)\n (_, diffusion_map) = eigsh(laplacian, k=n_components, sigma=1.0, which='LM', tol=eigen_tol, v0=v0)\n embedding = diffusion_map.T[n_components::-1]\n if norm_laplacian:\n embedding = embedding / dd\n except RuntimeError:\n eigen_solver = 'lobpcg'\n laplacian *= -1\n elif eigen_solver == 'amg':\n if not sparse.issparse(laplacian):\n warnings.warn('AMG works better for sparse matrices')\n laplacian = check_array(laplacian, dtype=np.float64, accept_sparse=True)\n laplacian = _set_diag(laplacian, 1, norm_laplacian)\n diag_shift = 1e-05 * sparse.eye(laplacian.shape[0])\n laplacian += diag_shift\n ml = smoothed_aggregation_solver(check_array(laplacian, accept_sparse='csr'))\n laplacian -= diag_shift\n M = ml.aspreconditioner()\n X = random_state.rand(laplacian.shape[0], n_components + 1)\n X[:, 0] = dd.ravel()\n (_, diffusion_map) = lobpcg(laplacian, X, M=M, tol=1e-05, largest=False)\n embedding = diffusion_map.T\n if norm_laplacian:\n embedding = embedding / dd\n if embedding.shape[0] == 1:\n raise ValueError\n if eigen_solver == 'lobpcg':\n laplacian = check_array(laplacian, dtype=np.float64, accept_sparse=True)\n if n_nodes < 5 * n_components + 1:\n if sparse.isspmatrix(laplacian):\n laplacian = laplacian.toarray()\n (_, diffusion_map) = eigh(laplacian, check_finite=False)\n embedding = diffusion_map.T[:n_components]\n if norm_laplacian:\n embedding = embedding / dd\n else:\n laplacian = _set_diag(laplacian, 1, norm_laplacian)\n X = random_state.rand(laplacian.shape[0], n_components + 1)\n X[:, 0] = dd.ravel()\n (_, diffusion_map) = lobpcg(laplacian, X, tol=1e-05, largest=False, maxiter=2000)\n embedding = 
diffusion_map.T[:n_components]\n if norm_laplacian:\n embedding = embedding / dd\n if embedding.shape[0] == 1:\n raise ValueError\n embedding = _deterministic_vector_sign_flip(embedding)\n if drop_first:\n return embedding[1:n_components].T\n else:\n return embedding[:n_components].T" }, { @@ -115620,7 +124137,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_components", @@ -115630,7 +124148,8 @@ "docstring": { "type": "int, default=2", "description": "Dimension of the embedded space." - } + }, + "refined_type": {} }, { "name": "perplexity", @@ -115640,7 +124159,8 @@ "docstring": { "type": "float, default=30.0", "description": "The perplexity is related to the number of nearest neighbors that\nis used in other manifold learning algorithms. Larger datasets\nusually require a larger perplexity. Consider selecting a value\nbetween 5 and 50. Different values can result in significantly\ndifferent results." - } + }, + "refined_type": {} }, { "name": "early_exaggeration", @@ -115650,7 +124170,8 @@ "docstring": { "type": "float, default=12.0", "description": "Controls how tight natural clusters in the original space are in\nthe embedded space and how much space will be between them. For\nlarger values, the space between natural clusters will be larger\nin the embedded space. Again, the choice of this parameter is not\nvery critical. If the cost function increases during initial\noptimization, the early exaggeration factor or the learning rate\nmight be too high." - } + }, + "refined_type": {} }, { "name": "learning_rate", @@ -115660,6 +124181,14 @@ "docstring": { "type": "float or 'auto', default=200.0", "description": "The learning rate for t-SNE is usually in the range [10.0, 1000.0]. If\nthe learning rate is too high, the data may look like a 'ball' with any\npoint approximately equidistant from its nearest neighbours. If the\nlearning rate is too low, most points may look compressed in a dense\ncloud with few outliers. If the cost function gets stuck in a bad local\nminimum increasing the learning rate may help.\nNote that many other t-SNE implementations (bhtsne, FIt-SNE, openTSNE,\netc.) use a definition of learning_rate that is 4 times smaller than\nours. So our learning_rate=200 corresponds to learning_rate=800 in\nthose other implementations. The 'auto' option sets the learning_rate\nto `max(N / early_exaggeration / 4, 50)` where N is the sample size,\nfollowing [4] and [5]. This will become default in 1.2." + }, + "refined_type": { + "kind": "BoundaryType", + "base_type": "float", + "min": 10.0, + "max": 1000.0, + "min_inclusive": true, + "max_inclusive": true } }, { @@ -115670,7 +124199,8 @@ "docstring": { "type": "int, default=1000", "description": "Maximum number of iterations for the optimization. Should be at\nleast 250." - } + }, + "refined_type": {} }, { "name": "n_iter_without_progress", @@ -115680,7 +124210,8 @@ "docstring": { "type": "int, default=300", "description": "Maximum number of iterations without progress before we abort the\noptimization, used after 250 initial iterations with early\nexaggeration. Note that progress is only checked every 50 iterations so\nthis value is rounded to the next multiple of 50.\n\n.. versionadded:: 0.17\n parameter *n_iter_without_progress* to control stopping criteria." 
- } + }, + "refined_type": {} }, { "name": "min_grad_norm", @@ -115690,7 +124221,8 @@ "docstring": { "type": "float, default=1e-7", "description": "If the gradient norm is below this threshold, the optimization will\nbe stopped." - } + }, + "refined_type": {} }, { "name": "metric", @@ -115700,7 +124232,8 @@ "docstring": { "type": "str or callable, default='euclidean'", "description": "The metric to use when calculating distance between instances in a\nfeature array. If metric is a string, it must be one of the options\nallowed by scipy.spatial.distance.pdist for its metric parameter, or\na metric listed in pairwise.PAIRWISE_DISTANCE_FUNCTIONS.\nIf metric is \"precomputed\", X is assumed to be a distance matrix.\nAlternatively, if metric is a callable function, it is called on each\npair of instances (rows) and the resulting value recorded. The callable\nshould take two arrays from X as input and return a value indicating\nthe distance between them. The default is \"euclidean\" which is\ninterpreted as squared euclidean distance." - } + }, + "refined_type": {} }, { "name": "init", @@ -115710,6 +124243,10 @@ "docstring": { "type": "{'random', 'pca'} or ndarray of shape (n_samples, n_components), default='random'", "description": "Initialization of embedding. Possible options are 'random', 'pca',\nand a numpy array of shape (n_samples, n_components).\nPCA initialization cannot be used with precomputed distances and is\nusually more globally stable than random initialization. `init='pca'`\nwill become default in 1.2." + }, + "refined_type": { + "kind": "EnumType", + "values": ["random", "pca"] } }, { @@ -115720,7 +124257,8 @@ "docstring": { "type": "int, default=0", "description": "Verbosity level." - } + }, + "refined_type": {} }, { "name": "random_state", @@ -115730,7 +124268,8 @@ "docstring": { "type": "int, RandomState instance or None, default=None", "description": "Determines the random number generator. Pass an int for reproducible\nresults across multiple function calls. Note that different\ninitializations might result in different local minima of the cost\nfunction. See :term:`Glossary `." - } + }, + "refined_type": {} }, { "name": "method", @@ -115740,7 +124279,8 @@ "docstring": { "type": "str, default='barnes_hut'", "description": "By default the gradient calculation algorithm uses Barnes-Hut\napproximation running in O(NlogN) time. method='exact'\nwill run on the slower, but exact, algorithm in O(N^2) time. The\nexact algorithm should be used when nearest-neighbor errors need\nto be better than 3%. However, the exact method cannot scale to\nmillions of examples.\n\n.. versionadded:: 0.17\n Approximate optimization *method* via the Barnes-Hut." - } + }, + "refined_type": {} }, { "name": "angle", @@ -115750,7 +124290,8 @@ "docstring": { "type": "float, default=0.5", "description": "Only used if method='barnes_hut'\nThis is the trade-off between speed and accuracy for Barnes-Hut T-SNE.\n'angle' is the angular size (referred to as theta in [3]) of a distant\nnode as measured from a point. If this size is below 'angle' then it is\nused as a summary node of all points contained within it.\nThis method is not very sensitive to changes in this parameter\nin the range of 0.2 - 0.8. Angle less than 0.2 has quickly increasing\ncomputation time and angle greater 0.8 has quickly increasing error." - } + }, + "refined_type": {} }, { "name": "n_jobs", @@ -115760,7 +124301,8 @@ "docstring": { "type": "int, default=None", "description": "The number of parallel jobs to run for neighbors search. 
This parameter\nhas no impact when ``metric=\"precomputed\"`` or\n(``metric=\"euclidean\"`` and ``method=\"exact\"``).\n``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n``-1`` means using all processors. See :term:`Glossary `\nfor more details.\n\n.. versionadded:: 0.22" - } + }, + "refined_type": {} }, { "name": "square_distances", @@ -115770,13 +124312,14 @@ "docstring": { "type": "True or 'legacy', default='legacy'", "description": "Whether TSNE should square the distance values. ``'legacy'`` means\nthat distance values are squared only when ``metric=\"euclidean\"``.\n``True`` means that distance values are squared for all metrics.\n\n.. versionadded:: 0.24\n Added to provide backward compatibility during deprecation of\n legacy squaring behavior.\n.. deprecated:: 0.24\n Legacy squaring behavior was deprecated in 0.24. The ``'legacy'``\n value will be removed in 1.1 (renaming of 0.26), at which point the\n default value will change to ``True``." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, n_components=2, *, perplexity=30.0, early_exaggeration=12.0, learning_rate='warn', n_iter=1000, n_iter_without_progress=300, min_grad_norm=1e-07, metric='euclidean', init='warn', verbose=0, random_state=None, method='barnes_hut', angle=0.5, n_jobs=None, square_distances='legacy'):\n self.n_components = n_components\n self.perplexity = perplexity\n self.early_exaggeration = early_exaggeration\n self.learning_rate = learning_rate\n self.n_iter = n_iter\n self.n_iter_without_progress = n_iter_without_progress\n self.min_grad_norm = min_grad_norm\n self.metric = metric\n self.init = init\n self.verbose = verbose\n self.random_state = random_state\n self.method = method\n self.angle = angle\n self.n_jobs = n_jobs\n self.square_distances = square_distances" }, { @@ -115794,7 +124337,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -115804,7 +124348,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "skip_num_points", @@ -115814,7 +124359,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -115838,7 +124384,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "P", @@ -115848,7 +124395,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "degrees_of_freedom", @@ -115858,7 +124406,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_samples", @@ -115868,7 +124417,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X_embedded", @@ -115878,7 +124428,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "neighbors", @@ -115888,7 +124439,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "skip_num_points", @@ -115898,7 +124450,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -115922,7 +124475,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -115932,7 +124486,8 @@ "docstring": { "type": "ndarray of shape (n_samples, n_features) or (n_samples, n_samples)", "description": "If the metric is 'precomputed' X must be a square distance\nmatrix. Otherwise it contains a sample per row. 
If the method\nis 'exact', X may be a sparse matrix of type 'csr', 'csc'\nor 'coo'. If the method is 'barnes_hut' and the metric is\n'precomputed', X may be a precomputed sparse graph." - } + }, + "refined_type": {} }, { "name": "y", @@ -115942,13 +124497,14 @@ "docstring": { "type": "None", "description": "Ignored." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Fit X into an embedded space.", - "docstring": "Fit X into an embedded space.\n\nParameters\n----------\nX : ndarray of shape (n_samples, n_features) or (n_samples, n_samples)\n If the metric is 'precomputed' X must be a square distance\n matrix. Otherwise it contains a sample per row. If the method\n is 'exact', X may be a sparse matrix of type 'csr', 'csc'\n or 'coo'. If the method is 'barnes_hut' and the metric is\n 'precomputed', X may be a precomputed sparse graph.\n\ny : None\n Ignored.\n\nReturns\n-------\nX_new : array of shape (n_samples, n_components)\n Embedding of the training data in low-dimensional space.", + "docstring": "Fit X into an embedded space.\n\n Parameters\n ----------\n X : ndarray of shape (n_samples, n_features) or (n_samples, n_samples)\n If the metric is 'precomputed' X must be a square distance\n matrix. Otherwise it contains a sample per row. If the method\n is 'exact', X may be a sparse matrix of type 'csr', 'csc'\n or 'coo'. If the method is 'barnes_hut' and the metric is\n 'precomputed', X may be a precomputed sparse graph.\n\n y : None\n Ignored.\n\n Returns\n -------\n X_new : array of shape (n_samples, n_components)\n Embedding of the training data in low-dimensional space.\n ", "source_code": "\ndef fit(self, X, y=None):\n \"\"\"Fit X into an embedded space.\n\n Parameters\n ----------\n X : ndarray of shape (n_samples, n_features) or (n_samples, n_samples)\n If the metric is 'precomputed' X must be a square distance\n matrix. Otherwise it contains a sample per row. If the method\n is 'exact', X may be a sparse matrix of type 'csr', 'csc'\n or 'coo'. If the method is 'barnes_hut' and the metric is\n 'precomputed', X may be a precomputed sparse graph.\n\n y : None\n Ignored.\n\n Returns\n -------\n X_new : array of shape (n_samples, n_components)\n Embedding of the training data in low-dimensional space.\n \"\"\"\n self.fit_transform(X)\n return self" }, { @@ -115966,7 +124522,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -115976,7 +124533,8 @@ "docstring": { "type": "ndarray of shape (n_samples, n_features) or (n_samples, n_samples)", "description": "If the metric is 'precomputed' X must be a square distance\nmatrix. Otherwise it contains a sample per row. If the method\nis 'exact', X may be a sparse matrix of type 'csr', 'csc'\nor 'coo'. If the method is 'barnes_hut' and the metric is\n'precomputed', X may be a precomputed sparse graph." - } + }, + "refined_type": {} }, { "name": "y", @@ -115986,13 +124544,14 @@ "docstring": { "type": "None", "description": "Ignored." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Fit X into an embedded space and return that transformed output.", - "docstring": "Fit X into an embedded space and return that transformed output.\n\nParameters\n----------\nX : ndarray of shape (n_samples, n_features) or (n_samples, n_samples)\n If the metric is 'precomputed' X must be a square distance\n matrix. Otherwise it contains a sample per row. If the method\n is 'exact', X may be a sparse matrix of type 'csr', 'csc'\n or 'coo'. 
If the method is 'barnes_hut' and the metric is\n 'precomputed', X may be a precomputed sparse graph.\n\ny : None\n Ignored.\n\nReturns\n-------\nX_new : ndarray of shape (n_samples, n_components)\n Embedding of the training data in low-dimensional space.", + "docstring": "Fit X into an embedded space and return that transformed output.\n\n Parameters\n ----------\n X : ndarray of shape (n_samples, n_features) or (n_samples, n_samples)\n If the metric is 'precomputed' X must be a square distance\n matrix. Otherwise it contains a sample per row. If the method\n is 'exact', X may be a sparse matrix of type 'csr', 'csc'\n or 'coo'. If the method is 'barnes_hut' and the metric is\n 'precomputed', X may be a precomputed sparse graph.\n\n y : None\n Ignored.\n\n Returns\n -------\n X_new : ndarray of shape (n_samples, n_components)\n Embedding of the training data in low-dimensional space.\n ", "source_code": "\ndef fit_transform(self, X, y=None):\n \"\"\"Fit X into an embedded space and return that transformed output.\n\n Parameters\n ----------\n X : ndarray of shape (n_samples, n_features) or (n_samples, n_samples)\n If the metric is 'precomputed' X must be a square distance\n matrix. Otherwise it contains a sample per row. If the method\n is 'exact', X may be a sparse matrix of type 'csr', 'csc'\n or 'coo'. If the method is 'barnes_hut' and the metric is\n 'precomputed', X may be a precomputed sparse graph.\n\n y : None\n Ignored.\n\n Returns\n -------\n X_new : ndarray of shape (n_samples, n_components)\n Embedding of the training data in low-dimensional space.\n \"\"\"\n embedding = self._fit(X)\n self.embedding_ = embedding\n return self.embedding_" }, { @@ -116010,7 +124569,8 @@ "docstring": { "type": "callable", "description": "Should return a tuple of cost and gradient for a given parameter\nvector. When expensive to compute, the cost can optionally\nbe None and can be computed every n_iter_check steps using\nthe objective_error function." - } + }, + "refined_type": {} }, { "name": "p0", @@ -116020,7 +124580,8 @@ "docstring": { "type": "array-like of shape (n_params,)", "description": "Initial parameter vector." - } + }, + "refined_type": {} }, { "name": "it", @@ -116030,7 +124591,8 @@ "docstring": { "type": "int", "description": "Current number of iterations (this function will be called more than\nonce during the optimization)." - } + }, + "refined_type": {} }, { "name": "n_iter", @@ -116040,7 +124602,8 @@ "docstring": { "type": "int", "description": "Maximum number of gradient descent iterations." - } + }, + "refined_type": {} }, { "name": "n_iter_check", @@ -116050,7 +124613,8 @@ "docstring": { "type": "int, default=1", "description": "Number of iterations before evaluating the global error. If the error\nis sufficiently low, we abort the optimization." - } + }, + "refined_type": {} }, { "name": "n_iter_without_progress", @@ -116060,7 +124624,8 @@ "docstring": { "type": "int, default=300", "description": "Maximum number of iterations without progress before we abort the\noptimization." - } + }, + "refined_type": {} }, { "name": "momentum", @@ -116070,7 +124635,8 @@ "docstring": { "type": "float within (0.0, 1.0), default=0.8", "description": "The momentum generates a weight for previous gradients that decays\nexponentially." - } + }, + "refined_type": {} }, { "name": "learning_rate", @@ -116080,6 +124646,14 @@ "docstring": { "type": "float, default=200.0", "description": "The learning rate for t-SNE is usually in the range [10.0, 1000.0]. 
If\nthe learning rate is too high, the data may look like a 'ball' with any\npoint approximately equidistant from its nearest neighbours. If the\nlearning rate is too low, most points may look compressed in a dense\ncloud with few outliers." + }, + "refined_type": { + "kind": "BoundaryType", + "base_type": "float", + "min": 10.0, + "max": 1000.0, + "min_inclusive": true, + "max_inclusive": true } }, { @@ -116090,7 +124664,8 @@ "docstring": { "type": "float, default=0.01", "description": "Minimum individual gain for each parameter." - } + }, + "refined_type": {} }, { "name": "min_grad_norm", @@ -116100,7 +124675,8 @@ "docstring": { "type": "float, default=1e-7", "description": "If the gradient norm is below this threshold, the optimization will\nbe aborted." - } + }, + "refined_type": {} }, { "name": "verbose", @@ -116110,7 +124686,8 @@ "docstring": { "type": "int, default=0", "description": "Verbosity level." - } + }, + "refined_type": {} }, { "name": "args", @@ -116120,7 +124697,8 @@ "docstring": { "type": "sequence, default=None", "description": "Arguments to pass to objective function." - } + }, + "refined_type": {} }, { "name": "kwargs", @@ -116130,13 +124708,14 @@ "docstring": { "type": "dict, default=None", "description": "Keyword arguments to pass to objective function." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Batch gradient descent with momentum and individual gains.", - "docstring": "Batch gradient descent with momentum and individual gains.\n\nParameters\n----------\nobjective : callable\n Should return a tuple of cost and gradient for a given parameter\n vector. When expensive to compute, the cost can optionally\n be None and can be computed every n_iter_check steps using\n the objective_error function.\n\np0 : array-like of shape (n_params,)\n Initial parameter vector.\n\nit : int\n Current number of iterations (this function will be called more than\n once during the optimization).\n\nn_iter : int\n Maximum number of gradient descent iterations.\n\nn_iter_check : int, default=1\n Number of iterations before evaluating the global error. If the error\n is sufficiently low, we abort the optimization.\n\nn_iter_without_progress : int, default=300\n Maximum number of iterations without progress before we abort the\n optimization.\n\nmomentum : float within (0.0, 1.0), default=0.8\n The momentum generates a weight for previous gradients that decays\n exponentially.\n\nlearning_rate : float, default=200.0\n The learning rate for t-SNE is usually in the range [10.0, 1000.0]. If\n the learning rate is too high, the data may look like a 'ball' with any\n point approximately equidistant from its nearest neighbours. 
If the\n learning rate is too low, most points may look compressed in a dense\n cloud with few outliers.\n\nmin_gain : float, default=0.01\n Minimum individual gain for each parameter.\n\nmin_grad_norm : float, default=1e-7\n If the gradient norm is below this threshold, the optimization will\n be aborted.\n\nverbose : int, default=0\n Verbosity level.\n\nargs : sequence, default=None\n Arguments to pass to objective function.\n\nkwargs : dict, default=None\n Keyword arguments to pass to objective function.\n\nReturns\n-------\np : ndarray of shape (n_params,)\n Optimum parameters.\n\nerror : float\n Optimum.\n\ni : int\n Last iteration.", + "docstring": "Batch gradient descent with momentum and individual gains.\n\n Parameters\n ----------\n objective : callable\n Should return a tuple of cost and gradient for a given parameter\n vector. When expensive to compute, the cost can optionally\n be None and can be computed every n_iter_check steps using\n the objective_error function.\n\n p0 : array-like of shape (n_params,)\n Initial parameter vector.\n\n it : int\n Current number of iterations (this function will be called more than\n once during the optimization).\n\n n_iter : int\n Maximum number of gradient descent iterations.\n\n n_iter_check : int, default=1\n Number of iterations before evaluating the global error. If the error\n is sufficiently low, we abort the optimization.\n\n n_iter_without_progress : int, default=300\n Maximum number of iterations without progress before we abort the\n optimization.\n\n momentum : float within (0.0, 1.0), default=0.8\n The momentum generates a weight for previous gradients that decays\n exponentially.\n\n learning_rate : float, default=200.0\n The learning rate for t-SNE is usually in the range [10.0, 1000.0]. If\n the learning rate is too high, the data may look like a 'ball' with any\n point approximately equidistant from its nearest neighbours. If the\n learning rate is too low, most points may look compressed in a dense\n cloud with few outliers.\n\n min_gain : float, default=0.01\n Minimum individual gain for each parameter.\n\n min_grad_norm : float, default=1e-7\n If the gradient norm is below this threshold, the optimization will\n be aborted.\n\n verbose : int, default=0\n Verbosity level.\n\n args : sequence, default=None\n Arguments to pass to objective function.\n\n kwargs : dict, default=None\n Keyword arguments to pass to objective function.\n\n Returns\n -------\n p : ndarray of shape (n_params,)\n Optimum parameters.\n\n error : float\n Optimum.\n\n i : int\n Last iteration.\n ", "source_code": "\ndef _gradient_descent(objective, p0, it, n_iter, n_iter_check=1, n_iter_without_progress=300, momentum=0.8, learning_rate=200.0, min_gain=0.01, min_grad_norm=1e-07, verbose=0, args=None, kwargs=None):\n \"\"\"Batch gradient descent with momentum and individual gains.\n\n Parameters\n ----------\n objective : callable\n Should return a tuple of cost and gradient for a given parameter\n vector. When expensive to compute, the cost can optionally\n be None and can be computed every n_iter_check steps using\n the objective_error function.\n\n p0 : array-like of shape (n_params,)\n Initial parameter vector.\n\n it : int\n Current number of iterations (this function will be called more than\n once during the optimization).\n\n n_iter : int\n Maximum number of gradient descent iterations.\n\n n_iter_check : int, default=1\n Number of iterations before evaluating the global error. 
If the error\n is sufficiently low, we abort the optimization.\n\n n_iter_without_progress : int, default=300\n Maximum number of iterations without progress before we abort the\n optimization.\n\n momentum : float within (0.0, 1.0), default=0.8\n The momentum generates a weight for previous gradients that decays\n exponentially.\n\n learning_rate : float, default=200.0\n The learning rate for t-SNE is usually in the range [10.0, 1000.0]. If\n the learning rate is too high, the data may look like a 'ball' with any\n point approximately equidistant from its nearest neighbours. If the\n learning rate is too low, most points may look compressed in a dense\n cloud with few outliers.\n\n min_gain : float, default=0.01\n Minimum individual gain for each parameter.\n\n min_grad_norm : float, default=1e-7\n If the gradient norm is below this threshold, the optimization will\n be aborted.\n\n verbose : int, default=0\n Verbosity level.\n\n args : sequence, default=None\n Arguments to pass to objective function.\n\n kwargs : dict, default=None\n Keyword arguments to pass to objective function.\n\n Returns\n -------\n p : ndarray of shape (n_params,)\n Optimum parameters.\n\n error : float\n Optimum.\n\n i : int\n Last iteration.\n \"\"\"\n if args is None:\n args = []\n if kwargs is None:\n kwargs = {}\n p = p0.copy().ravel()\n update = np.zeros_like(p)\n gains = np.ones_like(p)\n error = np.finfo(float).max\n best_error = np.finfo(float).max\n best_iter = i = it\n tic = time()\n for i in range(it, n_iter):\n check_convergence = (i + 1) % n_iter_check == 0\n kwargs['compute_error'] = check_convergence or i == n_iter - 1\n (error, grad) = objective(p, *args, **kwargs)\n grad_norm = linalg.norm(grad)\n inc = update * grad < 0.0\n dec = np.invert(inc)\n gains[inc] += 0.2\n gains[dec] *= 0.8\n np.clip(gains, min_gain, np.inf, out=gains)\n grad *= gains\n update = momentum * update - learning_rate * grad\n p += update\n if check_convergence:\n toc = time()\n duration = toc - tic\n tic = toc\n if verbose >= 2:\n print('[t-SNE] Iteration %d: error = %.7f, gradient norm = %.7f (%s iterations in %0.3fs)' % (i + 1, error, grad_norm, n_iter_check, duration))\n if error < best_error:\n best_error = error\n best_iter = i\n elif i - best_iter > n_iter_without_progress:\n if verbose >= 2:\n print('[t-SNE] Iteration %d: did not make any progress during the last %d episodes. Finished.' % (i + 1, n_iter_without_progress))\n break\n if grad_norm <= min_grad_norm:\n if verbose >= 2:\n print('[t-SNE] Iteration %d: gradient norm %f. Finished.' % (i + 1, grad_norm))\n break\n return p, error, i" }, { @@ -116154,7 +124733,8 @@ "docstring": { "type": "ndarray of shape (n_samples * (n_samples-1) / 2,)", "description": "Distances of samples are stored as condensed matrices, i.e.\nwe omit the diagonal and duplicate entries and store everything\nin a one-dimensional array." - } + }, + "refined_type": {} }, { "name": "desired_perplexity", @@ -116164,7 +124744,8 @@ "docstring": { "type": "float", "description": "Desired perplexity of the joint probability distributions." - } + }, + "refined_type": {} }, { "name": "verbose", @@ -116174,13 +124755,14 @@ "docstring": { "type": "int", "description": "Verbosity level." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Compute joint probabilities p_ij from distances.", - "docstring": "Compute joint probabilities p_ij from distances.\n\nParameters\n----------\ndistances : ndarray of shape (n_samples * (n_samples-1) / 2,)\n Distances of samples are stored as condensed matrices, i.e.\n we omit the diagonal and duplicate entries and store everything\n in a one-dimensional array.\n\ndesired_perplexity : float\n Desired perplexity of the joint probability distributions.\n\nverbose : int\n Verbosity level.\n\nReturns\n-------\nP : ndarray of shape (n_samples * (n_samples-1) / 2,)\n Condensed joint probability matrix.", + "docstring": "Compute joint probabilities p_ij from distances.\n\n Parameters\n ----------\n distances : ndarray of shape (n_samples * (n_samples-1) / 2,)\n Distances of samples are stored as condensed matrices, i.e.\n we omit the diagonal and duplicate entries and store everything\n in a one-dimensional array.\n\n desired_perplexity : float\n Desired perplexity of the joint probability distributions.\n\n verbose : int\n Verbosity level.\n\n Returns\n -------\n P : ndarray of shape (n_samples * (n_samples-1) / 2,)\n Condensed joint probability matrix.\n ", "source_code": "\ndef _joint_probabilities(distances, desired_perplexity, verbose):\n \"\"\"Compute joint probabilities p_ij from distances.\n\n Parameters\n ----------\n distances : ndarray of shape (n_samples * (n_samples-1) / 2,)\n Distances of samples are stored as condensed matrices, i.e.\n we omit the diagonal and duplicate entries and store everything\n in a one-dimensional array.\n\n desired_perplexity : float\n Desired perplexity of the joint probability distributions.\n\n verbose : int\n Verbosity level.\n\n Returns\n -------\n P : ndarray of shape (n_samples * (n_samples-1) / 2,)\n Condensed joint probability matrix.\n \"\"\"\n distances = distances.astype(np.float32, copy=False)\n conditional_P = _utils._binary_search_perplexity(distances, desired_perplexity, verbose)\n P = conditional_P + conditional_P.T\n sum_P = np.maximum(np.sum(P), MACHINE_EPSILON)\n P = np.maximum(squareform(P) / sum_P, MACHINE_EPSILON)\n return P" }, { @@ -116198,7 +124780,8 @@ "docstring": { "type": "sparse matrix of shape (n_samples, n_samples)", "description": "Distances of samples to its n_neighbors nearest neighbors. All other\ndistances are left to zero (and are not materialized in memory).\nMatrix should be of CSR format." - } + }, + "refined_type": {} }, { "name": "desired_perplexity", @@ -116208,7 +124791,8 @@ "docstring": { "type": "float", "description": "Desired perplexity of the joint probability distributions." - } + }, + "refined_type": {} }, { "name": "verbose", @@ -116218,13 +124802,14 @@ "docstring": { "type": "int", "description": "Verbosity level." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Compute joint probabilities p_ij from distances using just nearest neighbors.\n\nThis method is approximately equal to _joint_probabilities. The latter is O(N), but limiting the joint probability to nearest neighbors improves this substantially to O(uN).", - "docstring": "Compute joint probabilities p_ij from distances using just nearest\nneighbors.\n\nThis method is approximately equal to _joint_probabilities. 
The latter\nis O(N), but limiting the joint probability to nearest neighbors improves\nthis substantially to O(uN).\n\nParameters\n----------\ndistances : sparse matrix of shape (n_samples, n_samples)\n Distances of samples to its n_neighbors nearest neighbors. All other\n distances are left to zero (and are not materialized in memory).\n Matrix should be of CSR format.\n\ndesired_perplexity : float\n Desired perplexity of the joint probability distributions.\n\nverbose : int\n Verbosity level.\n\nReturns\n-------\nP : sparse matrix of shape (n_samples, n_samples)\n Condensed joint probability matrix with only nearest neighbors. Matrix\n will be of CSR format.", + "description": "Compute joint probabilities p_ij from distances using just nearest\nneighbors.\n\nThis method is approximately equal to _joint_probabilities. The latter\nis O(N), but limiting the joint probability to nearest neighbors improves\nthis substantially to O(uN).", + "docstring": "Compute joint probabilities p_ij from distances using just nearest\n neighbors.\n\n This method is approximately equal to _joint_probabilities. The latter\n is O(N), but limiting the joint probability to nearest neighbors improves\n this substantially to O(uN).\n\n Parameters\n ----------\n distances : sparse matrix of shape (n_samples, n_samples)\n Distances of samples to its n_neighbors nearest neighbors. All other\n distances are left to zero (and are not materialized in memory).\n Matrix should be of CSR format.\n\n desired_perplexity : float\n Desired perplexity of the joint probability distributions.\n\n verbose : int\n Verbosity level.\n\n Returns\n -------\n P : sparse matrix of shape (n_samples, n_samples)\n Condensed joint probability matrix with only nearest neighbors. Matrix\n will be of CSR format.\n ", "source_code": "\ndef _joint_probabilities_nn(distances, desired_perplexity, verbose):\n \"\"\"Compute joint probabilities p_ij from distances using just nearest\n neighbors.\n\n This method is approximately equal to _joint_probabilities. The latter\n is O(N), but limiting the joint probability to nearest neighbors improves\n this substantially to O(uN).\n\n Parameters\n ----------\n distances : sparse matrix of shape (n_samples, n_samples)\n Distances of samples to its n_neighbors nearest neighbors. All other\n distances are left to zero (and are not materialized in memory).\n Matrix should be of CSR format.\n\n desired_perplexity : float\n Desired perplexity of the joint probability distributions.\n\n verbose : int\n Verbosity level.\n\n Returns\n -------\n P : sparse matrix of shape (n_samples, n_samples)\n Condensed joint probability matrix with only nearest neighbors. 
Matrix\n will be of CSR format.\n \"\"\"\n t0 = time()\n distances.sort_indices()\n n_samples = distances.shape[0]\n distances_data = distances.data.reshape(n_samples, -1)\n distances_data = distances_data.astype(np.float32, copy=False)\n conditional_P = _utils._binary_search_perplexity(distances_data, desired_perplexity, verbose)\n assert np.all(np.isfinite(conditional_P)), 'All probabilities should be finite'\n P = csr_matrix((conditional_P.ravel(), distances.indices, distances.indptr), shape=(n_samples, n_samples))\n P = P + P.T\n sum_P = np.maximum(P.sum(), MACHINE_EPSILON)\n P /= sum_P\n assert np.all(np.abs(P.data) <= 1.0)\n if verbose >= 2:\n duration = time() - t0\n print('[t-SNE] Computed conditional probabilities in {:.3f}s'.format(duration))\n return P" }, { @@ -116242,7 +124827,8 @@ "docstring": { "type": "ndarray of shape (n_params,)", "description": "Unraveled embedding." - } + }, + "refined_type": {} }, { "name": "P", @@ -116252,7 +124838,8 @@ "docstring": { "type": "ndarray of shape (n_samples * (n_samples-1) / 2,)", "description": "Condensed joint probability matrix." - } + }, + "refined_type": {} }, { "name": "degrees_of_freedom", @@ -116262,7 +124849,8 @@ "docstring": { "type": "int", "description": "Degrees of freedom of the Student's-t distribution." - } + }, + "refined_type": {} }, { "name": "n_samples", @@ -116272,7 +124860,8 @@ "docstring": { "type": "int", "description": "Number of samples." - } + }, + "refined_type": {} }, { "name": "n_components", @@ -116282,7 +124871,8 @@ "docstring": { "type": "int", "description": "Dimension of the embedded space." - } + }, + "refined_type": {} }, { "name": "skip_num_points", @@ -116292,7 +124882,8 @@ "docstring": { "type": "int, default=0", "description": "This does not compute the gradient for points with indices below\n`skip_num_points`. This is useful when computing transforms of new\ndata where you'd like to keep the old data fixed." - } + }, + "refined_type": {} }, { "name": "compute_error", @@ -116302,13 +124893,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "t-SNE objective function: gradient of the KL divergence of p_ijs and q_ijs and the absolute error.", - "docstring": "t-SNE objective function: gradient of the KL divergence\nof p_ijs and q_ijs and the absolute error.\n\nParameters\n----------\nparams : ndarray of shape (n_params,)\n Unraveled embedding.\n\nP : ndarray of shape (n_samples * (n_samples-1) / 2,)\n Condensed joint probability matrix.\n\ndegrees_of_freedom : int\n Degrees of freedom of the Student's-t distribution.\n\nn_samples : int\n Number of samples.\n\nn_components : int\n Dimension of the embedded space.\n\nskip_num_points : int, default=0\n This does not compute the gradient for points with indices below\n `skip_num_points`. 
This is useful when computing transforms of new\n data where you'd like to keep the old data fixed.\n\ncompute_error: bool, default=True\n If False, the kl_divergence is not computed and returns NaN.\n\nReturns\n-------\nkl_divergence : float\n Kullback-Leibler divergence of p_ij and q_ij.\n\ngrad : ndarray of shape (n_params,)\n Unraveled gradient of the Kullback-Leibler divergence with respect to\n the embedding.", + "description": "t-SNE objective function: gradient of the KL divergence\nof p_ijs and q_ijs and the absolute error.", + "docstring": "t-SNE objective function: gradient of the KL divergence\n of p_ijs and q_ijs and the absolute error.\n\n Parameters\n ----------\n params : ndarray of shape (n_params,)\n Unraveled embedding.\n\n P : ndarray of shape (n_samples * (n_samples-1) / 2,)\n Condensed joint probability matrix.\n\n degrees_of_freedom : int\n Degrees of freedom of the Student's-t distribution.\n\n n_samples : int\n Number of samples.\n\n n_components : int\n Dimension of the embedded space.\n\n skip_num_points : int, default=0\n This does not compute the gradient for points with indices below\n `skip_num_points`. This is useful when computing transforms of new\n data where you'd like to keep the old data fixed.\n\n compute_error: bool, default=True\n If False, the kl_divergence is not computed and returns NaN.\n\n Returns\n -------\n kl_divergence : float\n Kullback-Leibler divergence of p_ij and q_ij.\n\n grad : ndarray of shape (n_params,)\n Unraveled gradient of the Kullback-Leibler divergence with respect to\n the embedding.\n ", "source_code": "\ndef _kl_divergence(params, P, degrees_of_freedom, n_samples, n_components, skip_num_points=0, compute_error=True):\n \"\"\"t-SNE objective function: gradient of the KL divergence\n of p_ijs and q_ijs and the absolute error.\n\n Parameters\n ----------\n params : ndarray of shape (n_params,)\n Unraveled embedding.\n\n P : ndarray of shape (n_samples * (n_samples-1) / 2,)\n Condensed joint probability matrix.\n\n degrees_of_freedom : int\n Degrees of freedom of the Student's-t distribution.\n\n n_samples : int\n Number of samples.\n\n n_components : int\n Dimension of the embedded space.\n\n skip_num_points : int, default=0\n This does not compute the gradient for points with indices below\n `skip_num_points`. 
This is useful when computing transforms of new\n data where you'd like to keep the old data fixed.\n\n compute_error: bool, default=True\n If False, the kl_divergence is not computed and returns NaN.\n\n Returns\n -------\n kl_divergence : float\n Kullback-Leibler divergence of p_ij and q_ij.\n\n grad : ndarray of shape (n_params,)\n Unraveled gradient of the Kullback-Leibler divergence with respect to\n the embedding.\n \"\"\"\n X_embedded = params.reshape(n_samples, n_components)\n dist = pdist(X_embedded, 'sqeuclidean')\n dist /= degrees_of_freedom\n dist += 1.0\n dist **= (degrees_of_freedom + 1.0) / -2.0\n Q = np.maximum(dist / (2.0 * np.sum(dist)), MACHINE_EPSILON)\n if compute_error:\n kl_divergence = 2.0 * np.dot(P, np.log(np.maximum(P, MACHINE_EPSILON) / Q))\n else:\n kl_divergence = np.nan\n grad = np.ndarray((n_samples, n_components), dtype=params.dtype)\n PQd = squareform((P - Q) * dist)\n for i in range(skip_num_points, n_samples):\n grad[i] = np.dot(np.ravel(PQd[i], order='K'), X_embedded[i] - X_embedded)\n grad = grad.ravel()\n c = 2.0 * (degrees_of_freedom + 1.0) / degrees_of_freedom\n grad *= c\n return kl_divergence, grad" }, { @@ -116326,7 +124918,8 @@ "docstring": { "type": "ndarray of shape (n_params,)", "description": "Unraveled embedding." - } + }, + "refined_type": {} }, { "name": "P", @@ -116336,7 +124929,8 @@ "docstring": { "type": "sparse matrix of shape (n_samples, n_sample)", "description": "Sparse approximate joint probability matrix, computed only for the\nk nearest-neighbors and symmetrized. Matrix should be of CSR format." - } + }, + "refined_type": {} }, { "name": "degrees_of_freedom", @@ -116346,7 +124940,8 @@ "docstring": { "type": "int", "description": "Degrees of freedom of the Student's-t distribution." - } + }, + "refined_type": {} }, { "name": "n_samples", @@ -116356,7 +124951,8 @@ "docstring": { "type": "int", "description": "Number of samples." - } + }, + "refined_type": {} }, { "name": "n_components", @@ -116366,7 +124962,8 @@ "docstring": { "type": "int", "description": "Dimension of the embedded space." - } + }, + "refined_type": {} }, { "name": "angle", @@ -116376,7 +124973,8 @@ "docstring": { "type": "float, default=0.5", "description": "This is the trade-off between speed and accuracy for Barnes-Hut T-SNE.\n'angle' is the angular size (referred to as theta in [3]) of a distant\nnode as measured from a point. If this size is below 'angle' then it is\nused as a summary node of all points contained within it.\nThis method is not very sensitive to changes in this parameter\nin the range of 0.2 - 0.8. Angle less than 0.2 has quickly increasing\ncomputation time and angle greater 0.8 has quickly increasing error." - } + }, + "refined_type": {} }, { "name": "skip_num_points", @@ -116386,7 +124984,8 @@ "docstring": { "type": "int, default=0", "description": "This does not compute the gradient for points with indices below\n`skip_num_points`. This is useful when computing transforms of new\ndata where you'd like to keep the old data fixed." - } + }, + "refined_type": {} }, { "name": "verbose", @@ -116396,7 +124995,8 @@ "docstring": { "type": "int, default=False", "description": "Verbosity level." - } + }, + "refined_type": {} }, { "name": "compute_error", @@ -116406,7 +125006,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "num_threads", @@ -116416,13 +125017,14 @@ "docstring": { "type": "int, default=1", "description": "Number of threads used to compute the gradient. 
This is set here to\navoid calling _openmp_effective_n_threads for each gradient step." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "t-SNE objective function: KL divergence of p_ijs and q_ijs.\n\nUses Barnes-Hut tree methods to calculate the gradient that runs in O(NlogN) instead of O(N^2).", - "docstring": "t-SNE objective function: KL divergence of p_ijs and q_ijs.\n\nUses Barnes-Hut tree methods to calculate the gradient that\nruns in O(NlogN) instead of O(N^2).\n\nParameters\n----------\nparams : ndarray of shape (n_params,)\n Unraveled embedding.\n\nP : sparse matrix of shape (n_samples, n_sample)\n Sparse approximate joint probability matrix, computed only for the\n k nearest-neighbors and symmetrized. Matrix should be of CSR format.\n\ndegrees_of_freedom : int\n Degrees of freedom of the Student's-t distribution.\n\nn_samples : int\n Number of samples.\n\nn_components : int\n Dimension of the embedded space.\n\nangle : float, default=0.5\n This is the trade-off between speed and accuracy for Barnes-Hut T-SNE.\n 'angle' is the angular size (referred to as theta in [3]) of a distant\n node as measured from a point. If this size is below 'angle' then it is\n used as a summary node of all points contained within it.\n This method is not very sensitive to changes in this parameter\n in the range of 0.2 - 0.8. Angle less than 0.2 has quickly increasing\n computation time and angle greater 0.8 has quickly increasing error.\n\nskip_num_points : int, default=0\n This does not compute the gradient for points with indices below\n `skip_num_points`. This is useful when computing transforms of new\n data where you'd like to keep the old data fixed.\n\nverbose : int, default=False\n Verbosity level.\n\ncompute_error: bool, default=True\n If False, the kl_divergence is not computed and returns NaN.\n\nnum_threads : int, default=1\n Number of threads used to compute the gradient. This is set here to\n avoid calling _openmp_effective_n_threads for each gradient step.\n\nReturns\n-------\nkl_divergence : float\n Kullback-Leibler divergence of p_ij and q_ij.\n\ngrad : ndarray of shape (n_params,)\n Unraveled gradient of the Kullback-Leibler divergence with respect to\n the embedding.", + "description": "t-SNE objective function: KL divergence of p_ijs and q_ijs.\n\nUses Barnes-Hut tree methods to calculate the gradient that\nruns in O(NlogN) instead of O(N^2).", + "docstring": "t-SNE objective function: KL divergence of p_ijs and q_ijs.\n\n Uses Barnes-Hut tree methods to calculate the gradient that\n runs in O(NlogN) instead of O(N^2).\n\n Parameters\n ----------\n params : ndarray of shape (n_params,)\n Unraveled embedding.\n\n P : sparse matrix of shape (n_samples, n_sample)\n Sparse approximate joint probability matrix, computed only for the\n k nearest-neighbors and symmetrized. Matrix should be of CSR format.\n\n degrees_of_freedom : int\n Degrees of freedom of the Student's-t distribution.\n\n n_samples : int\n Number of samples.\n\n n_components : int\n Dimension of the embedded space.\n\n angle : float, default=0.5\n This is the trade-off between speed and accuracy for Barnes-Hut T-SNE.\n 'angle' is the angular size (referred to as theta in [3]) of a distant\n node as measured from a point. If this size is below 'angle' then it is\n used as a summary node of all points contained within it.\n This method is not very sensitive to changes in this parameter\n in the range of 0.2 - 0.8. 
Angle less than 0.2 has quickly increasing\n computation time and angle greater 0.8 has quickly increasing error.\n\n skip_num_points : int, default=0\n This does not compute the gradient for points with indices below\n `skip_num_points`. This is useful when computing transforms of new\n data where you'd like to keep the old data fixed.\n\n verbose : int, default=False\n Verbosity level.\n\n compute_error: bool, default=True\n If False, the kl_divergence is not computed and returns NaN.\n\n num_threads : int, default=1\n Number of threads used to compute the gradient. This is set here to\n avoid calling _openmp_effective_n_threads for each gradient step.\n\n Returns\n -------\n kl_divergence : float\n Kullback-Leibler divergence of p_ij and q_ij.\n\n grad : ndarray of shape (n_params,)\n Unraveled gradient of the Kullback-Leibler divergence with respect to\n the embedding.\n ", "source_code": "\ndef _kl_divergence_bh(params, P, degrees_of_freedom, n_samples, n_components, angle=0.5, skip_num_points=0, verbose=False, compute_error=True, num_threads=1):\n \"\"\"t-SNE objective function: KL divergence of p_ijs and q_ijs.\n\n Uses Barnes-Hut tree methods to calculate the gradient that\n runs in O(NlogN) instead of O(N^2).\n\n Parameters\n ----------\n params : ndarray of shape (n_params,)\n Unraveled embedding.\n\n P : sparse matrix of shape (n_samples, n_sample)\n Sparse approximate joint probability matrix, computed only for the\n k nearest-neighbors and symmetrized. Matrix should be of CSR format.\n\n degrees_of_freedom : int\n Degrees of freedom of the Student's-t distribution.\n\n n_samples : int\n Number of samples.\n\n n_components : int\n Dimension of the embedded space.\n\n angle : float, default=0.5\n This is the trade-off between speed and accuracy for Barnes-Hut T-SNE.\n 'angle' is the angular size (referred to as theta in [3]) of a distant\n node as measured from a point. If this size is below 'angle' then it is\n used as a summary node of all points contained within it.\n This method is not very sensitive to changes in this parameter\n in the range of 0.2 - 0.8. Angle less than 0.2 has quickly increasing\n computation time and angle greater 0.8 has quickly increasing error.\n\n skip_num_points : int, default=0\n This does not compute the gradient for points with indices below\n `skip_num_points`. This is useful when computing transforms of new\n data where you'd like to keep the old data fixed.\n\n verbose : int, default=False\n Verbosity level.\n\n compute_error: bool, default=True\n If False, the kl_divergence is not computed and returns NaN.\n\n num_threads : int, default=1\n Number of threads used to compute the gradient. 
This is set here to\n avoid calling _openmp_effective_n_threads for each gradient step.\n\n Returns\n -------\n kl_divergence : float\n Kullback-Leibler divergence of p_ij and q_ij.\n\n grad : ndarray of shape (n_params,)\n Unraveled gradient of the Kullback-Leibler divergence with respect to\n the embedding.\n \"\"\"\n params = params.astype(np.float32, copy=False)\n X_embedded = params.reshape(n_samples, n_components)\n val_P = P.data.astype(np.float32, copy=False)\n neighbors = P.indices.astype(np.int64, copy=False)\n indptr = P.indptr.astype(np.int64, copy=False)\n grad = np.zeros(X_embedded.shape, dtype=np.float32)\n error = _barnes_hut_tsne.gradient(val_P, X_embedded, neighbors, indptr, grad, angle, n_components, verbose, dof=degrees_of_freedom, compute_error=compute_error, num_threads=num_threads)\n c = 2.0 * (degrees_of_freedom + 1.0) / degrees_of_freedom\n grad = grad.ravel()\n grad *= c\n return error, grad" }, { @@ -116440,7 +125042,8 @@ "docstring": { "type": "ndarray of shape (n_samples, n_features) or (n_samples, n_samples)", "description": "If the metric is 'precomputed' X must be a square distance\nmatrix. Otherwise it contains a sample per row." - } + }, + "refined_type": {} }, { "name": "X_embedded", @@ -116450,7 +125053,8 @@ "docstring": { "type": "ndarray of shape (n_samples, n_components)", "description": "Embedding of the training data in low-dimensional space." - } + }, + "refined_type": {} }, { "name": "n_neighbors", @@ -116460,7 +125064,8 @@ "docstring": { "type": "int, default=5", "description": "Number of neighbors k that will be considered." - } + }, + "refined_type": {} }, { "name": "metric", @@ -116470,13 +125075,14 @@ "docstring": { "type": "str or callable, default='euclidean'", "description": "Which metric to use for computing pairwise distances between samples\nfrom the original input space. If metric is 'precomputed', X must be a\nmatrix of pairwise distances or squared distances. Otherwise, see the\ndocumentation of argument metric in sklearn.pairwise.pairwise_distances\nfor a list of available metrics.\n\n.. versionadded:: 0.20" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Expresses to what extent the local structure is retained.\n\nThe trustworthiness is within [0, 1]. It is defined as .. math:: T(k) = 1 - \\frac{2}{nk (2n - 3k - 1)} \\sum^n_{i=1} \\sum_{j \\in \\mathcal{N}_{i}^{k}} \\max(0, (r(i, j) - k)) where for each sample i, :math:`\\mathcal{N}_{i}^{k}` are its k nearest neighbors in the output space, and every sample j is its :math:`r(i, j)`-th nearest neighbor in the input space. In other words, any unexpected nearest neighbors in the output space are penalised in proportion to their rank in the input space. * \"Neighborhood Preservation in Nonlinear Projection Methods: An Experimental Study\" J. Venna, S. Kaski * \"Learning a Parametric Embedding by Preserving Local Structure\" L.J.P. van der Maaten", - "docstring": "Expresses to what extent the local structure is retained.\n\nThe trustworthiness is within [0, 1]. It is defined as\n\n.. math::\n\n T(k) = 1 - \\frac{2}{nk (2n - 3k - 1)} \\sum^n_{i=1}\n \\sum_{j \\in \\mathcal{N}_{i}^{k}} \\max(0, (r(i, j) - k))\n\nwhere for each sample i, :math:`\\mathcal{N}_{i}^{k}` are its k nearest\nneighbors in the output space, and every sample j is its :math:`r(i, j)`-th\nnearest neighbor in the input space. 
In other words, any unexpected nearest\nneighbors in the output space are penalised in proportion to their rank in\nthe input space.\n\n* \"Neighborhood Preservation in Nonlinear Projection Methods: An\n Experimental Study\"\n J. Venna, S. Kaski\n* \"Learning a Parametric Embedding by Preserving Local Structure\"\n L.J.P. van der Maaten\n\nParameters\n----------\nX : ndarray of shape (n_samples, n_features) or (n_samples, n_samples)\n If the metric is 'precomputed' X must be a square distance\n matrix. Otherwise it contains a sample per row.\n\nX_embedded : ndarray of shape (n_samples, n_components)\n Embedding of the training data in low-dimensional space.\n\nn_neighbors : int, default=5\n Number of neighbors k that will be considered.\n\nmetric : str or callable, default='euclidean'\n Which metric to use for computing pairwise distances between samples\n from the original input space. If metric is 'precomputed', X must be a\n matrix of pairwise distances or squared distances. Otherwise, see the\n documentation of argument metric in sklearn.pairwise.pairwise_distances\n for a list of available metrics.\n\n .. versionadded:: 0.20\n\nReturns\n-------\ntrustworthiness : float\n Trustworthiness of the low-dimensional embedding.", + "description": "Expresses to what extent the local structure is retained.\n\nThe trustworthiness is within [0, 1]. It is defined as\n\n.. math::\n\n T(k) = 1 - \\frac{2}{nk (2n - 3k - 1)} \\sum^n_{i=1}\n \\sum_{j \\in \\mathcal{N}_{i}^{k}} \\max(0, (r(i, j) - k))\n\nwhere for each sample i, :math:`\\mathcal{N}_{i}^{k}` are its k nearest\nneighbors in the output space, and every sample j is its :math:`r(i, j)`-th\nnearest neighbor in the input space. In other words, any unexpected nearest\nneighbors in the output space are penalised in proportion to their rank in\nthe input space.\n\n* \"Neighborhood Preservation in Nonlinear Projection Methods: An\n Experimental Study\"\n J. Venna, S. Kaski\n* \"Learning a Parametric Embedding by Preserving Local Structure\"\n L.J.P. van der Maaten", + "docstring": "Expresses to what extent the local structure is retained.\n\n The trustworthiness is within [0, 1]. It is defined as\n\n .. math::\n\n T(k) = 1 - \\frac{2}{nk (2n - 3k - 1)} \\sum^n_{i=1}\n \\sum_{j \\in \\mathcal{N}_{i}^{k}} \\max(0, (r(i, j) - k))\n\n where for each sample i, :math:`\\mathcal{N}_{i}^{k}` are its k nearest\n neighbors in the output space, and every sample j is its :math:`r(i, j)`-th\n nearest neighbor in the input space. In other words, any unexpected nearest\n neighbors in the output space are penalised in proportion to their rank in\n the input space.\n\n * \"Neighborhood Preservation in Nonlinear Projection Methods: An\n Experimental Study\"\n J. Venna, S. Kaski\n * \"Learning a Parametric Embedding by Preserving Local Structure\"\n L.J.P. van der Maaten\n\n Parameters\n ----------\n X : ndarray of shape (n_samples, n_features) or (n_samples, n_samples)\n If the metric is 'precomputed' X must be a square distance\n matrix. Otherwise it contains a sample per row.\n\n X_embedded : ndarray of shape (n_samples, n_components)\n Embedding of the training data in low-dimensional space.\n\n n_neighbors : int, default=5\n Number of neighbors k that will be considered.\n\n metric : str or callable, default='euclidean'\n Which metric to use for computing pairwise distances between samples\n from the original input space. If metric is 'precomputed', X must be a\n matrix of pairwise distances or squared distances. 
Otherwise, see the\n documentation of argument metric in sklearn.pairwise.pairwise_distances\n for a list of available metrics.\n\n .. versionadded:: 0.20\n\n Returns\n -------\n trustworthiness : float\n Trustworthiness of the low-dimensional embedding.\n ", "source_code": "\ndef trustworthiness(X, X_embedded, *, n_neighbors=5, metric='euclidean'):\n \"\"\"Expresses to what extent the local structure is retained.\n\n The trustworthiness is within [0, 1]. It is defined as\n\n .. math::\n\n T(k) = 1 - \\frac{2}{nk (2n - 3k - 1)} \\sum^n_{i=1}\n \\sum_{j \\in \\mathcal{N}_{i}^{k}} \\max(0, (r(i, j) - k))\n\n where for each sample i, :math:`\\mathcal{N}_{i}^{k}` are its k nearest\n neighbors in the output space, and every sample j is its :math:`r(i, j)`-th\n nearest neighbor in the input space. In other words, any unexpected nearest\n neighbors in the output space are penalised in proportion to their rank in\n the input space.\n\n * \"Neighborhood Preservation in Nonlinear Projection Methods: An\n Experimental Study\"\n J. Venna, S. Kaski\n * \"Learning a Parametric Embedding by Preserving Local Structure\"\n L.J.P. van der Maaten\n\n Parameters\n ----------\n X : ndarray of shape (n_samples, n_features) or (n_samples, n_samples)\n If the metric is 'precomputed' X must be a square distance\n matrix. Otherwise it contains a sample per row.\n\n X_embedded : ndarray of shape (n_samples, n_components)\n Embedding of the training data in low-dimensional space.\n\n n_neighbors : int, default=5\n Number of neighbors k that will be considered.\n\n metric : str or callable, default='euclidean'\n Which metric to use for computing pairwise distances between samples\n from the original input space. If metric is 'precomputed', X must be a\n matrix of pairwise distances or squared distances. Otherwise, see the\n documentation of argument metric in sklearn.pairwise.pairwise_distances\n for a list of available metrics.\n\n .. 
versionadded:: 0.20\n\n Returns\n -------\n trustworthiness : float\n Trustworthiness of the low-dimensional embedding.\n \"\"\"\n dist_X = pairwise_distances(X, metric=metric)\n if metric == 'precomputed':\n dist_X = dist_X.copy()\n np.fill_diagonal(dist_X, np.inf)\n ind_X = np.argsort(dist_X, axis=1)\n ind_X_embedded = NearestNeighbors(n_neighbors=n_neighbors).fit(X_embedded).kneighbors(return_distance=False)\n n_samples = X.shape[0]\n inverted_index = np.zeros((n_samples, n_samples), dtype=int)\n ordered_indices = np.arange(n_samples + 1)\n inverted_index[ordered_indices[:-1, np.newaxis], ind_X] = ordered_indices[1:]\n ranks = inverted_index[ordered_indices[:-1, np.newaxis], ind_X_embedded] - n_neighbors\n t = np.sum(ranks[ranks > 0])\n t = 1.0 - t * (2.0 / (n_samples * n_neighbors * (2.0 * n_samples - 3.0 * n_neighbors - 1.0)))\n return t" }, { @@ -116494,7 +125100,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "top_path", @@ -116504,13 +125111,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef configuration(parent_package='', top_path=None):\n from numpy.distutils.misc_util import Configuration\n config = Configuration('manifold', parent_package, top_path)\n libraries = []\n if os.name == 'posix':\n libraries.append('m')\n config.add_extension('_utils', sources=['_utils.pyx'], include_dirs=[numpy.get_include()], libraries=libraries, extra_compile_args=['-O3'])\n config.add_extension('_barnes_hut_tsne', sources=['_barnes_hut_tsne.pyx'], include_dirs=[numpy.get_include()], libraries=libraries, extra_compile_args=['-O3'])\n config.add_subpackage('tests')\n return config" }, { @@ -116528,7 +125136,8 @@ "docstring": { "type": "callable, returns shape [n_classes]", "description": "The binary metric function to use." - } + }, + "refined_type": {} }, { "name": "y_true", @@ -116538,7 +125147,8 @@ "docstring": { "type": "array, shape = [n_samples] or [n_samples, n_classes]", "description": "True binary labels in binary label indicators." - } + }, + "refined_type": {} }, { "name": "y_score", @@ -116548,7 +125158,8 @@ "docstring": { "type": "array, shape = [n_samples] or [n_samples, n_classes]", "description": "Target scores, can either be probability estimates of the positive\nclass, confidence values, or binary decisions." - } + }, + "refined_type": {} }, { "name": "average", @@ -116558,6 +125169,10 @@ "docstring": { "type": "{None, 'micro', 'macro', 'samples', 'weighted'}, default='macro'", "description": "If ``None``, the scores for each class are returned. Otherwise,\nthis determines the type of averaging performed on the data:\n\n``'micro'``:\n Calculate metrics globally by considering each element of the label\n indicator matrix as a label.\n``'macro'``:\n Calculate metrics for each label, and find their unweighted\n mean. This does not take label imbalance into account.\n``'weighted'``:\n Calculate metrics for each label, and find their average, weighted\n by support (the number of true instances for each label).\n``'samples'``:\n Calculate metrics for each instance, and find their average.\n\nWill be ignored when ``y_true`` is binary." + }, + "refined_type": { + "kind": "EnumType", + "values": ["samples", "micro", "macro", "weighted"] } }, { @@ -116568,13 +125183,14 @@ "docstring": { "type": "array-like of shape (n_samples,), default=None", "description": "Sample weights." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Average a binary metric for multilabel classification.", - "docstring": "Average a binary metric for multilabel classification.\n\nParameters\n----------\ny_true : array, shape = [n_samples] or [n_samples, n_classes]\n True binary labels in binary label indicators.\n\ny_score : array, shape = [n_samples] or [n_samples, n_classes]\n Target scores, can either be probability estimates of the positive\n class, confidence values, or binary decisions.\n\naverage : {None, 'micro', 'macro', 'samples', 'weighted'}, default='macro'\n If ``None``, the scores for each class are returned. Otherwise,\n this determines the type of averaging performed on the data:\n\n ``'micro'``:\n Calculate metrics globally by considering each element of the label\n indicator matrix as a label.\n ``'macro'``:\n Calculate metrics for each label, and find their unweighted\n mean. This does not take label imbalance into account.\n ``'weighted'``:\n Calculate metrics for each label, and find their average, weighted\n by support (the number of true instances for each label).\n ``'samples'``:\n Calculate metrics for each instance, and find their average.\n\n Will be ignored when ``y_true`` is binary.\n\nsample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\nbinary_metric : callable, returns shape [n_classes]\n The binary metric function to use.\n\nReturns\n-------\nscore : float or array of shape [n_classes]\n If not ``None``, average the score, else return the score for each\n classes.", + "docstring": "Average a binary metric for multilabel classification.\n\n Parameters\n ----------\n y_true : array, shape = [n_samples] or [n_samples, n_classes]\n True binary labels in binary label indicators.\n\n y_score : array, shape = [n_samples] or [n_samples, n_classes]\n Target scores, can either be probability estimates of the positive\n class, confidence values, or binary decisions.\n\n average : {None, 'micro', 'macro', 'samples', 'weighted'}, default='macro'\n If ``None``, the scores for each class are returned. Otherwise,\n this determines the type of averaging performed on the data:\n\n ``'micro'``:\n Calculate metrics globally by considering each element of the label\n indicator matrix as a label.\n ``'macro'``:\n Calculate metrics for each label, and find their unweighted\n mean. 
This does not take label imbalance into account.\n ``'weighted'``:\n Calculate metrics for each label, and find their average, weighted\n by support (the number of true instances for each label).\n ``'samples'``:\n Calculate metrics for each instance, and find their average.\n\n Will be ignored when ``y_true`` is binary.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n binary_metric : callable, returns shape [n_classes]\n The binary metric function to use.\n\n Returns\n -------\n score : float or array of shape [n_classes]\n If not ``None``, average the score, else return the score for each\n classes.\n\n ", "source_code": "\ndef _average_binary_score(binary_metric, y_true, y_score, average, sample_weight=None):\n \"\"\"Average a binary metric for multilabel classification.\n\n Parameters\n ----------\n y_true : array, shape = [n_samples] or [n_samples, n_classes]\n True binary labels in binary label indicators.\n\n y_score : array, shape = [n_samples] or [n_samples, n_classes]\n Target scores, can either be probability estimates of the positive\n class, confidence values, or binary decisions.\n\n average : {None, 'micro', 'macro', 'samples', 'weighted'}, default='macro'\n If ``None``, the scores for each class are returned. Otherwise,\n this determines the type of averaging performed on the data:\n\n ``'micro'``:\n Calculate metrics globally by considering each element of the label\n indicator matrix as a label.\n ``'macro'``:\n Calculate metrics for each label, and find their unweighted\n mean. This does not take label imbalance into account.\n ``'weighted'``:\n Calculate metrics for each label, and find their average, weighted\n by support (the number of true instances for each label).\n ``'samples'``:\n Calculate metrics for each instance, and find their average.\n\n Will be ignored when ``y_true`` is binary.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n binary_metric : callable, returns shape [n_classes]\n The binary metric function to use.\n\n Returns\n -------\n score : float or array of shape [n_classes]\n If not ``None``, average the score, else return the score for each\n classes.\n\n \"\"\"\n average_options = (None, 'micro', 'macro', 'weighted', 'samples')\n if average not in average_options:\n raise ValueError('average has to be one of {0}'.format(average_options))\n y_type = type_of_target(y_true)\n if y_type not in ('binary', 'multilabel-indicator'):\n raise ValueError('{0} format is not supported'.format(y_type))\n if y_type == 'binary':\n return binary_metric(y_true, y_score, sample_weight=sample_weight)\n check_consistent_length(y_true, y_score, sample_weight)\n y_true = check_array(y_true)\n y_score = check_array(y_score)\n not_average_axis = 1\n score_weight = sample_weight\n average_weight = None\n if average == 'micro':\n if score_weight is not None:\n score_weight = np.repeat(score_weight, y_true.shape[1])\n y_true = y_true.ravel()\n y_score = y_score.ravel()\n elif average == 'weighted':\n if score_weight is not None:\n average_weight = np.sum(np.multiply(y_true, np.reshape(score_weight, (-1, 1))), axis=0)\n else:\n average_weight = np.sum(y_true, axis=0)\n if np.isclose(average_weight.sum(), 0.0):\n return 0\n elif average == 'samples':\n average_weight = score_weight\n score_weight = None\n not_average_axis = 0\n if y_true.ndim == 1:\n y_true = y_true.reshape((-1, 1))\n if y_score.ndim == 1:\n y_score = y_score.reshape((-1, 1))\n n_classes = y_score.shape[not_average_axis]\n score = 
np.zeros((n_classes, ))\n for c in range(n_classes):\n y_true_c = y_true.take([c], axis=not_average_axis).ravel()\n y_score_c = y_score.take([c], axis=not_average_axis).ravel()\n score[c] = binary_metric(y_true_c, y_score_c, sample_weight=score_weight)\n if average is not None:\n if average_weight is not None:\n average_weight = np.asarray(average_weight)\n score[average_weight == 0] = 0\n return np.average(score, weights=average_weight)\n else:\n return score" }, { @@ -116592,7 +125208,8 @@ "docstring": { "type": "callable", "description": "The binary metric function to use that accepts the following as input:\n y_true_target : array, shape = [n_samples_target]\n Some sub-array of y_true for a pair of classes designated\n positive and negative in the one-vs-one scheme.\n y_score_target : array, shape = [n_samples_target]\n Scores corresponding to the probability estimates\n of a sample belonging to the designated positive class label" - } + }, + "refined_type": {} }, { "name": "y_true", @@ -116602,7 +125219,8 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "True multiclass labels." - } + }, + "refined_type": {} }, { "name": "y_score", @@ -116612,7 +125230,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_classes)", "description": "Target scores corresponding to probability estimates of a sample\nbelonging to a particular class." - } + }, + "refined_type": {} }, { "name": "average", @@ -116622,13 +125241,17 @@ "docstring": { "type": "{'macro', 'weighted'}, default='macro'", "description": "Determines the type of averaging performed on the pairwise binary\nmetric scores:\n``'macro'``:\n Calculate metrics for each label, and find their unweighted\n mean. This does not take label imbalance into account. Classes\n are assumed to be uniformly distributed.\n``'weighted'``:\n Calculate metrics for each label, taking into account the\n prevalence of the classes." + }, + "refined_type": { + "kind": "EnumType", + "values": ["macro", "weighted"] } } ], "results": [], "is_public": false, - "description": "Average one-versus-one scores for multiclass classification.\n\nUses the binary metric for one-vs-one multiclass classification, where the score is computed according to the Hand & Till (2001) algorithm.", - "docstring": "Average one-versus-one scores for multiclass classification.\n\nUses the binary metric for one-vs-one multiclass classification,\nwhere the score is computed according to the Hand & Till (2001) algorithm.\n\nParameters\n----------\nbinary_metric : callable\n The binary metric function to use that accepts the following as input:\n y_true_target : array, shape = [n_samples_target]\n Some sub-array of y_true for a pair of classes designated\n positive and negative in the one-vs-one scheme.\n y_score_target : array, shape = [n_samples_target]\n Scores corresponding to the probability estimates\n of a sample belonging to the designated positive class label\n\ny_true : array-like of shape (n_samples,)\n True multiclass labels.\n\ny_score : array-like of shape (n_samples, n_classes)\n Target scores corresponding to probability estimates of a sample\n belonging to a particular class.\n\naverage : {'macro', 'weighted'}, default='macro'\n Determines the type of averaging performed on the pairwise binary\n metric scores:\n ``'macro'``:\n Calculate metrics for each label, and find their unweighted\n mean. This does not take label imbalance into account. 
Classes\n are assumed to be uniformly distributed.\n ``'weighted'``:\n Calculate metrics for each label, taking into account the\n prevalence of the classes.\n\nReturns\n-------\nscore : float\n Average of the pairwise binary metric scores.", + "description": "Average one-versus-one scores for multiclass classification.\n\nUses the binary metric for one-vs-one multiclass classification,\nwhere the score is computed according to the Hand & Till (2001) algorithm.", + "docstring": "Average one-versus-one scores for multiclass classification.\n\n Uses the binary metric for one-vs-one multiclass classification,\n where the score is computed according to the Hand & Till (2001) algorithm.\n\n Parameters\n ----------\n binary_metric : callable\n The binary metric function to use that accepts the following as input:\n y_true_target : array, shape = [n_samples_target]\n Some sub-array of y_true for a pair of classes designated\n positive and negative in the one-vs-one scheme.\n y_score_target : array, shape = [n_samples_target]\n Scores corresponding to the probability estimates\n of a sample belonging to the designated positive class label\n\n y_true : array-like of shape (n_samples,)\n True multiclass labels.\n\n y_score : array-like of shape (n_samples, n_classes)\n Target scores corresponding to probability estimates of a sample\n belonging to a particular class.\n\n average : {'macro', 'weighted'}, default='macro'\n Determines the type of averaging performed on the pairwise binary\n metric scores:\n ``'macro'``:\n Calculate metrics for each label, and find their unweighted\n mean. This does not take label imbalance into account. Classes\n are assumed to be uniformly distributed.\n ``'weighted'``:\n Calculate metrics for each label, taking into account the\n prevalence of the classes.\n\n Returns\n -------\n score : float\n Average of the pairwise binary metric scores.\n ", "source_code": "\ndef _average_multiclass_ovo_score(binary_metric, y_true, y_score, average='macro'):\n \"\"\"Average one-versus-one scores for multiclass classification.\n\n Uses the binary metric for one-vs-one multiclass classification,\n where the score is computed according to the Hand & Till (2001) algorithm.\n\n Parameters\n ----------\n binary_metric : callable\n The binary metric function to use that accepts the following as input:\n y_true_target : array, shape = [n_samples_target]\n Some sub-array of y_true for a pair of classes designated\n positive and negative in the one-vs-one scheme.\n y_score_target : array, shape = [n_samples_target]\n Scores corresponding to the probability estimates\n of a sample belonging to the designated positive class label\n\n y_true : array-like of shape (n_samples,)\n True multiclass labels.\n\n y_score : array-like of shape (n_samples, n_classes)\n Target scores corresponding to probability estimates of a sample\n belonging to a particular class.\n\n average : {'macro', 'weighted'}, default='macro'\n Determines the type of averaging performed on the pairwise binary\n metric scores:\n ``'macro'``:\n Calculate metrics for each label, and find their unweighted\n mean. This does not take label imbalance into account. 
Classes\n are assumed to be uniformly distributed.\n ``'weighted'``:\n Calculate metrics for each label, taking into account the\n prevalence of the classes.\n\n Returns\n -------\n score : float\n Average of the pairwise binary metric scores.\n \"\"\"\n check_consistent_length(y_true, y_score)\n y_true_unique = np.unique(y_true)\n n_classes = y_true_unique.shape[0]\n n_pairs = n_classes * (n_classes - 1) // 2\n pair_scores = np.empty(n_pairs)\n is_weighted = average == 'weighted'\n prevalence = np.empty(n_pairs) if is_weighted else None\n for (ix, (a, b)) in enumerate(combinations(y_true_unique, 2)):\n a_mask = y_true == a\n b_mask = y_true == b\n ab_mask = np.logical_or(a_mask, b_mask)\n if is_weighted:\n prevalence[ix] = np.average(ab_mask)\n a_true = a_mask[ab_mask]\n b_true = b_mask[ab_mask]\n a_true_score = binary_metric(a_true, y_score[ab_mask, a])\n b_true_score = binary_metric(b_true, y_score[ab_mask, b])\n pair_scores[ix] = (a_true_score + b_true_score) / 2\n return np.average(pair_scores, weights=prevalence)" }, { @@ -116646,7 +125269,8 @@ "docstring": { "type": "int, str or None", "description": "The positive label." - } + }, + "refined_type": {} }, { "name": "y_true", @@ -116656,13 +125280,14 @@ "docstring": { "type": "ndarray of shape (n_samples,)", "description": "The target vector." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Check if `pos_label` need to be specified or not.\n\nIn binary classification, we fix `pos_label=1` if the labels are in the set {-1, 1} or {0, 1}. Otherwise, we raise an error asking to specify the `pos_label` parameters.", - "docstring": "Check if `pos_label` need to be specified or not.\n\nIn binary classification, we fix `pos_label=1` if the labels are in the set\n{-1, 1} or {0, 1}. Otherwise, we raise an error asking to specify the\n`pos_label` parameters.\n\nParameters\n----------\npos_label : int, str or None\n The positive label.\ny_true : ndarray of shape (n_samples,)\n The target vector.\n\nReturns\n-------\npos_label : int\n If `pos_label` can be inferred, it will be returned.\n\nRaises\n------\nValueError\n In the case that `y_true` does not have label in {-1, 1} or {0, 1},\n it will raise a `ValueError`.", + "description": "Check if `pos_label` need to be specified or not.\n\nIn binary classification, we fix `pos_label=1` if the labels are in the set\n{-1, 1} or {0, 1}. Otherwise, we raise an error asking to specify the\n`pos_label` parameters.", + "docstring": "Check if `pos_label` need to be specified or not.\n\n In binary classification, we fix `pos_label=1` if the labels are in the set\n {-1, 1} or {0, 1}. Otherwise, we raise an error asking to specify the\n `pos_label` parameters.\n\n Parameters\n ----------\n pos_label : int, str or None\n The positive label.\n y_true : ndarray of shape (n_samples,)\n The target vector.\n\n Returns\n -------\n pos_label : int\n If `pos_label` can be inferred, it will be returned.\n\n Raises\n ------\n ValueError\n In the case that `y_true` does not have label in {-1, 1} or {0, 1},\n it will raise a `ValueError`.\n ", "source_code": "\ndef _check_pos_label_consistency(pos_label, y_true):\n \"\"\"Check if `pos_label` need to be specified or not.\n\n In binary classification, we fix `pos_label=1` if the labels are in the set\n {-1, 1} or {0, 1}. 
Otherwise, we raise an error asking to specify the\n `pos_label` parameters.\n\n Parameters\n ----------\n pos_label : int, str or None\n The positive label.\n y_true : ndarray of shape (n_samples,)\n The target vector.\n\n Returns\n -------\n pos_label : int\n If `pos_label` can be inferred, it will be returned.\n\n Raises\n ------\n ValueError\n In the case that `y_true` does not have label in {-1, 1} or {0, 1},\n it will raise a `ValueError`.\n \"\"\"\n classes = np.unique(y_true)\n if pos_label is None and (classes.dtype.kind in 'OUS' or not (np.array_equal(classes, [0, 1]) or np.array_equal(classes, [-1, 1]) or np.array_equal(classes, [0]) or np.array_equal(classes, [-1]) or np.array_equal(classes, [1]))):\n classes_repr = ', '.join((repr(c) for c in classes))\n raise ValueError(f'y_true takes value in {{{classes_repr}}} and pos_label is not specified: either make y_true take value in {{0, 1}} or {{-1, 1}} or pass pos_label explicitly.')\n elif pos_label is None:\n pos_label = 1\n return pos_label" }, { @@ -116680,7 +125305,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y_pred", @@ -116690,7 +125316,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "average", @@ -116700,7 +125327,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "labels", @@ -116710,7 +125338,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "pos_label", @@ -116720,13 +125349,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Validation associated with set-wise metrics.\n\nReturns identified labels.", - "docstring": "Validation associated with set-wise metrics.\n\nReturns identified labels.", + "docstring": "Validation associated with set-wise metrics.\n\n Returns identified labels.\n ", "source_code": "\ndef _check_set_wise_labels(y_true, y_pred, average, labels, pos_label):\n \"\"\"Validation associated with set-wise metrics.\n\n Returns identified labels.\n \"\"\"\n average_options = (None, 'micro', 'macro', 'weighted', 'samples')\n if average not in average_options and average != 'binary':\n raise ValueError('average has to be one of ' + str(average_options))\n (y_type, y_true, y_pred) = _check_targets(y_true, y_pred)\n present_labels = unique_labels(y_true, y_pred).tolist()\n if average == 'binary':\n if y_type == 'binary':\n if pos_label not in present_labels:\n if len(present_labels) >= 2:\n raise ValueError(f'pos_label={pos_label} is not a valid label. It should be one of {present_labels}')\n labels = [pos_label]\n else:\n average_options = list(average_options)\n if y_type == 'multiclass':\n average_options.remove('samples')\n raise ValueError(\"Target is %s but average='binary'. Please choose another average setting, one of %r.\" % (y_type, average_options))\n elif pos_label not in (None, 1):\n warnings.warn(\"Note that pos_label (set to %r) is ignored when average != 'binary' (got %r). 
You may use labels=[pos_label] to specify a single positive class.\" % (pos_label, average), UserWarning)\n return labels" }, { @@ -116744,7 +125374,8 @@ "docstring": { "type": "array-like", "description": "" - } + }, + "refined_type": {} }, { "name": "y_pred", @@ -116754,13 +125385,14 @@ "docstring": { "type": "array-like", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Check that y_true and y_pred belong to the same classification task.\n\nThis converts multiclass or binary types to a common shape, and raises a ValueError for a mix of multilabel and multiclass targets, a mix of multilabel formats, for the presence of continuous-valued or multioutput targets, or for targets of different lengths. Column vectors are squeezed to 1d, while multilabel formats are returned as CSR sparse label indicators.", - "docstring": "Check that y_true and y_pred belong to the same classification task.\n\nThis converts multiclass or binary types to a common shape, and raises a\nValueError for a mix of multilabel and multiclass targets, a mix of\nmultilabel formats, for the presence of continuous-valued or multioutput\ntargets, or for targets of different lengths.\n\nColumn vectors are squeezed to 1d, while multilabel formats are returned\nas CSR sparse label indicators.\n\nParameters\n----------\ny_true : array-like\n\ny_pred : array-like\n\nReturns\n-------\ntype_true : one of {'multilabel-indicator', 'multiclass', 'binary'}\n The type of the true target data, as output by\n ``utils.multiclass.type_of_target``.\n\ny_true : array or indicator matrix\n\ny_pred : array or indicator matrix", + "description": "Check that y_true and y_pred belong to the same classification task.\n\nThis converts multiclass or binary types to a common shape, and raises a\nValueError for a mix of multilabel and multiclass targets, a mix of\nmultilabel formats, for the presence of continuous-valued or multioutput\ntargets, or for targets of different lengths.\n\nColumn vectors are squeezed to 1d, while multilabel formats are returned\nas CSR sparse label indicators.", + "docstring": "Check that y_true and y_pred belong to the same classification task.\n\n This converts multiclass or binary types to a common shape, and raises a\n ValueError for a mix of multilabel and multiclass targets, a mix of\n multilabel formats, for the presence of continuous-valued or multioutput\n targets, or for targets of different lengths.\n\n Column vectors are squeezed to 1d, while multilabel formats are returned\n as CSR sparse label indicators.\n\n Parameters\n ----------\n y_true : array-like\n\n y_pred : array-like\n\n Returns\n -------\n type_true : one of {'multilabel-indicator', 'multiclass', 'binary'}\n The type of the true target data, as output by\n ``utils.multiclass.type_of_target``.\n\n y_true : array or indicator matrix\n\n y_pred : array or indicator matrix\n ", "source_code": "\ndef _check_targets(y_true, y_pred):\n \"\"\"Check that y_true and y_pred belong to the same classification task.\n\n This converts multiclass or binary types to a common shape, and raises a\n ValueError for a mix of multilabel and multiclass targets, a mix of\n multilabel formats, for the presence of continuous-valued or multioutput\n targets, or for targets of different lengths.\n\n Column vectors are squeezed to 1d, while multilabel formats are returned\n as CSR sparse label indicators.\n\n Parameters\n ----------\n y_true : array-like\n\n y_pred : array-like\n\n Returns\n -------\n type_true : one of 
{'multilabel-indicator', 'multiclass', 'binary'}\n The type of the true target data, as output by\n ``utils.multiclass.type_of_target``.\n\n y_true : array or indicator matrix\n\n y_pred : array or indicator matrix\n \"\"\"\n check_consistent_length(y_true, y_pred)\n type_true = type_of_target(y_true)\n type_pred = type_of_target(y_pred)\n y_type = {type_true, type_pred}\n if y_type == {'binary', 'multiclass'}:\n y_type = {'multiclass'}\n if len(y_type) > 1:\n raise ValueError(\"Classification metrics can't handle a mix of {0} and {1} targets\".format(type_true, type_pred))\n y_type = y_type.pop()\n if y_type not in ['binary', 'multiclass', 'multilabel-indicator']:\n raise ValueError('{0} is not supported'.format(y_type))\n if y_type in ['binary', 'multiclass']:\n y_true = column_or_1d(y_true)\n y_pred = column_or_1d(y_pred)\n if y_type == 'binary':\n try:\n unique_values = np.union1d(y_true, y_pred)\n except TypeError as e:\n raise TypeError(f'Labels in y_true and y_pred should be of the same type. Got y_true={np.unique(y_true)} and y_pred={np.unique(y_pred)}. Make sure that the predictions provided by the classifier coincides with the true labels.') from e\n if len(unique_values) > 2:\n y_type = 'multiclass'\n if y_type.startswith('multilabel'):\n y_true = csr_matrix(y_true)\n y_pred = csr_matrix(y_pred)\n y_type = 'multilabel-indicator'\n return y_type, y_true, y_pred" }, { @@ -116778,13 +125410,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _check_zero_division(zero_division):\n if isinstance(zero_division, str) and zero_division == 'warn':\n return\n elif isinstance(zero_division, (int, float)) and zero_division in [0, 1]:\n return\n raise ValueError('Got zero_division={0}. Must be one of [\"warn\", 0, 1]'.format(zero_division))" }, { @@ -116802,7 +125435,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "denominator", @@ -116812,7 +125446,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "metric", @@ -116822,7 +125457,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "modifier", @@ -116832,7 +125468,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "average", @@ -116842,7 +125479,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "warn_for", @@ -116852,7 +125490,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "zero_division", @@ -116862,13 +125501,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Performs division and handles divide-by-zero.\n\nOn zero-division, sets the corresponding result elements equal to 0 or 1 (according to ``zero_division``). Plus, if ``zero_division != \"warn\"`` raises a warning. The metric, modifier and average arguments are used only for determining an appropriate warning.", - "docstring": "Performs division and handles divide-by-zero.\n\nOn zero-division, sets the corresponding result elements equal to\n0 or 1 (according to ``zero_division``). 
Plus, if\n``zero_division != \"warn\"`` raises a warning.\n\nThe metric, modifier and average arguments are used only for determining\nan appropriate warning.", + "description": "Performs division and handles divide-by-zero.\n\nOn zero-division, sets the corresponding result elements equal to\n0 or 1 (according to ``zero_division``). Plus, if\n``zero_division != \"warn\"`` raises a warning.\n\nThe metric, modifier and average arguments are used only for determining\nan appropriate warning.", + "docstring": "Performs division and handles divide-by-zero.\n\n On zero-division, sets the corresponding result elements equal to\n 0 or 1 (according to ``zero_division``). Plus, if\n ``zero_division != \"warn\"`` raises a warning.\n\n The metric, modifier and average arguments are used only for determining\n an appropriate warning.\n ", "source_code": "\ndef _prf_divide(numerator, denominator, metric, modifier, average, warn_for, zero_division='warn'):\n \"\"\"Performs division and handles divide-by-zero.\n\n On zero-division, sets the corresponding result elements equal to\n 0 or 1 (according to ``zero_division``). Plus, if\n ``zero_division != \"warn\"`` raises a warning.\n\n The metric, modifier and average arguments are used only for determining\n an appropriate warning.\n \"\"\"\n mask = denominator == 0.0\n denominator = denominator.copy()\n denominator[mask] = 1\n result = numerator / denominator\n if not np.any(mask):\n return result\n result[mask] = 0.0 if zero_division in ['warn', 0] else 1.0\n if zero_division != 'warn' or metric not in warn_for:\n return result\n if metric in warn_for and 'f-score' in warn_for:\n msg_start = '{0} and F-score are'.format(metric.title())\n elif metric in warn_for:\n msg_start = '{0} is'.format(metric.title())\n elif 'f-score' in warn_for:\n msg_start = 'F-score is'\n else:\n return result\n _warn_prf(average, modifier, msg_start, len(result))\n return result" }, { @@ -116886,7 +125526,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "modifier", @@ -116896,7 +125537,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "msg_start", @@ -116906,7 +125548,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "result_size", @@ -116916,13 +125559,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _warn_prf(average, modifier, msg_start, result_size):\n (axis0, axis1) = ('sample', 'label')\n if average == 'samples':\n (axis0, axis1) = (axis1, axis0)\n msg = '{0} ill-defined and being set to 0.0 {{0}} no {1} {2}s. 
Use `zero_division` parameter to control this behavior.'.format(msg_start, modifier, axis0)\n if result_size == 1:\n msg = msg.format('due to')\n else:\n msg = msg.format('in {0}s with'.format(axis1))\n warnings.warn(msg, UndefinedMetricWarning, stacklevel=2)" }, { @@ -116940,7 +125584,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -116950,7 +125595,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "normalize", @@ -116960,13 +125606,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _weighted_sum(sample_score, sample_weight, normalize=False):\n if normalize:\n return np.average(sample_score, weights=sample_weight)\n elif sample_weight is not None:\n return np.dot(sample_score, sample_weight)\n else:\n return sample_score.sum()" }, { @@ -116984,7 +125631,8 @@ "docstring": { "type": "1d array-like, or label indicator array / sparse matrix", "description": "Ground truth (correct) labels." - } + }, + "refined_type": {} }, { "name": "y_pred", @@ -116994,7 +125642,8 @@ "docstring": { "type": "1d array-like, or label indicator array / sparse matrix", "description": "Predicted labels, as returned by a classifier." - } + }, + "refined_type": {} }, { "name": "normalize", @@ -117004,7 +125653,8 @@ "docstring": { "type": "bool, default=True", "description": "If ``False``, return the number of correctly classified samples.\nOtherwise, return the fraction of correctly classified samples." - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -117014,14 +125664,15 @@ "docstring": { "type": "array-like of shape (n_samples,), default=None", "description": "Sample weights." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Accuracy classification score.\n\nIn multilabel classification, this function computes subset accuracy: the set of labels predicted for a sample must *exactly* match the corresponding set of labels in y_true. 
Read more in the :ref:`User Guide `.", - "docstring": "Accuracy classification score.\n\nIn multilabel classification, this function computes subset accuracy:\nthe set of labels predicted for a sample must *exactly* match the\ncorresponding set of labels in y_true.\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\ny_true : 1d array-like, or label indicator array / sparse matrix\n Ground truth (correct) labels.\n\ny_pred : 1d array-like, or label indicator array / sparse matrix\n Predicted labels, as returned by a classifier.\n\nnormalize : bool, default=True\n If ``False``, return the number of correctly classified samples.\n Otherwise, return the fraction of correctly classified samples.\n\nsample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\nReturns\n-------\nscore : float\n If ``normalize == True``, return the fraction of correctly\n classified samples (float), else returns the number of correctly\n classified samples (int).\n\n The best performance is 1 with ``normalize == True`` and the number\n of samples with ``normalize == False``.\n\nSee Also\n--------\njaccard_score, hamming_loss, zero_one_loss\n\nNotes\n-----\nIn binary classification, this function is equal to the `jaccard_score`\nfunction.\n\nExamples\n--------\n>>> from sklearn.metrics import accuracy_score\n>>> y_pred = [0, 2, 1, 3]\n>>> y_true = [0, 1, 2, 3]\n>>> accuracy_score(y_true, y_pred)\n0.5\n>>> accuracy_score(y_true, y_pred, normalize=False)\n2\n\nIn the multilabel case with binary label indicators:\n\n>>> import numpy as np\n>>> accuracy_score(np.array([[0, 1], [1, 1]]), np.ones((2, 2)))\n0.5", - "source_code": "\ndef accuracy_score(y_true, y_pred, *, normalize=True, sample_weight=None):\n \"\"\"Accuracy classification score.\n\n In multilabel classification, this function computes subset accuracy:\n the set of labels predicted for a sample must *exactly* match the\n corresponding set of labels in y_true.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n y_true : 1d array-like, or label indicator array / sparse matrix\n Ground truth (correct) labels.\n\n y_pred : 1d array-like, or label indicator array / sparse matrix\n Predicted labels, as returned by a classifier.\n\n normalize : bool, default=True\n If ``False``, return the number of correctly classified samples.\n Otherwise, return the fraction of correctly classified samples.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n Returns\n -------\n score : float\n If ``normalize == True``, return the fraction of correctly\n classified samples (float), else returns the number of correctly\n classified samples (int).\n\n The best performance is 1 with ``normalize == True`` and the number\n of samples with ``normalize == False``.\n\n See Also\n --------\n jaccard_score, hamming_loss, zero_one_loss\n\n Notes\n -----\n In binary classification, this function is equal to the `jaccard_score`\n function.\n\n Examples\n --------\n >>> from sklearn.metrics import accuracy_score\n >>> y_pred = [0, 2, 1, 3]\n >>> y_true = [0, 1, 2, 3]\n >>> accuracy_score(y_true, y_pred)\n 0.5\n >>> accuracy_score(y_true, y_pred, normalize=False)\n 2\n\n In the multilabel case with binary label indicators:\n\n >>> import numpy as np\n >>> accuracy_score(np.array([[0, 1], [1, 1]]), np.ones((2, 2)))\n 0.5\n \"\"\"\n (y_type, y_true, y_pred) = _check_targets(y_true, y_pred)\n check_consistent_length(y_true, y_pred, sample_weight)\n if y_type.startswith('multilabel'):\n differing_labels = 
count_nonzero(y_true - y_pred, axis=1)\n score = differing_labels == 0\n else:\n score = y_true == y_pred\n return _weighted_sum(score, sample_weight, normalize)" + "description": "Accuracy classification score.\n\nIn multilabel classification, this function computes subset accuracy:\nthe set of labels predicted for a sample must *exactly* match the\ncorresponding set of labels in y_true.\n\nRead more in the :ref:`User Guide `.", + "docstring": "Accuracy classification score.\n\n In multilabel classification, this function computes subset accuracy:\n the set of labels predicted for a sample must *exactly* match the\n corresponding set of labels in y_true.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n y_true : 1d array-like, or label indicator array / sparse matrix\n Ground truth (correct) labels.\n\n y_pred : 1d array-like, or label indicator array / sparse matrix\n Predicted labels, as returned by a classifier.\n\n normalize : bool, default=True\n If ``False``, return the number of correctly classified samples.\n Otherwise, return the fraction of correctly classified samples.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n Returns\n -------\n score : float\n If ``normalize == True``, return the fraction of correctly\n classified samples (float), else returns the number of correctly\n classified samples (int).\n\n The best performance is 1 with ``normalize == True`` and the number\n of samples with ``normalize == False``.\n\n See Also\n --------\n balanced_accuracy_score : Compute the balanced accuracy to deal with\n imbalanced datasets.\n jaccard_score : Compute the Jaccard similarity coefficient score.\n hamming_loss : Compute the average Hamming loss or Hamming distance between\n two sets of samples.\n zero_one_loss : Compute the Zero-one classification loss. 
By default, the\n function will return the percentage of imperfectly predicted subsets.\n\n Notes\n -----\n In binary classification, this function is equal to the `jaccard_score`\n function.\n\n Examples\n --------\n >>> from sklearn.metrics import accuracy_score\n >>> y_pred = [0, 2, 1, 3]\n >>> y_true = [0, 1, 2, 3]\n >>> accuracy_score(y_true, y_pred)\n 0.5\n >>> accuracy_score(y_true, y_pred, normalize=False)\n 2\n\n In the multilabel case with binary label indicators:\n\n >>> import numpy as np\n >>> accuracy_score(np.array([[0, 1], [1, 1]]), np.ones((2, 2)))\n 0.5\n ", + "source_code": "\ndef accuracy_score(y_true, y_pred, *, normalize=True, sample_weight=None):\n \"\"\"Accuracy classification score.\n\n In multilabel classification, this function computes subset accuracy:\n the set of labels predicted for a sample must *exactly* match the\n corresponding set of labels in y_true.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n y_true : 1d array-like, or label indicator array / sparse matrix\n Ground truth (correct) labels.\n\n y_pred : 1d array-like, or label indicator array / sparse matrix\n Predicted labels, as returned by a classifier.\n\n normalize : bool, default=True\n If ``False``, return the number of correctly classified samples.\n Otherwise, return the fraction of correctly classified samples.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n Returns\n -------\n score : float\n If ``normalize == True``, return the fraction of correctly\n classified samples (float), else returns the number of correctly\n classified samples (int).\n\n The best performance is 1 with ``normalize == True`` and the number\n of samples with ``normalize == False``.\n\n See Also\n --------\n balanced_accuracy_score : Compute the balanced accuracy to deal with\n imbalanced datasets.\n jaccard_score : Compute the Jaccard similarity coefficient score.\n hamming_loss : Compute the average Hamming loss or Hamming distance between\n two sets of samples.\n zero_one_loss : Compute the Zero-one classification loss. By default, the\n function will return the percentage of imperfectly predicted subsets.\n\n Notes\n -----\n In binary classification, this function is equal to the `jaccard_score`\n function.\n\n Examples\n --------\n >>> from sklearn.metrics import accuracy_score\n >>> y_pred = [0, 2, 1, 3]\n >>> y_true = [0, 1, 2, 3]\n >>> accuracy_score(y_true, y_pred)\n 0.5\n >>> accuracy_score(y_true, y_pred, normalize=False)\n 2\n\n In the multilabel case with binary label indicators:\n\n >>> import numpy as np\n >>> accuracy_score(np.array([[0, 1], [1, 1]]), np.ones((2, 2)))\n 0.5\n \"\"\"\n (y_type, y_true, y_pred) = _check_targets(y_true, y_pred)\n check_consistent_length(y_true, y_pred, sample_weight)\n if y_type.startswith('multilabel'):\n differing_labels = count_nonzero(y_true - y_pred, axis=1)\n score = differing_labels == 0\n else:\n score = y_true == y_pred\n return _weighted_sum(score, sample_weight, normalize)" }, { "name": "balanced_accuracy_score", @@ -117038,7 +125689,8 @@ "docstring": { "type": "1d array-like", "description": "Ground truth (correct) target values." - } + }, + "refined_type": {} }, { "name": "y_pred", @@ -117048,7 +125700,8 @@ "docstring": { "type": "1d array-like", "description": "Estimated targets as returned by a classifier." 
- } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -117058,7 +125711,8 @@ "docstring": { "type": "array-like of shape (n_samples,), default=None", "description": "Sample weights." - } + }, + "refined_type": {} }, { "name": "adjusted", @@ -117068,14 +125722,15 @@ "docstring": { "type": "bool, default=False", "description": "When true, the result is adjusted for chance, so that random\nperformance would score 0, while keeping perfect performance at a score\nof 1." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Compute the balanced accuracy.\n\nThe balanced accuracy in binary and multiclass classification problems to deal with imbalanced datasets. It is defined as the average of recall obtained on each class. The best value is 1 and the worst value is 0 when ``adjusted=False``. Read more in the :ref:`User Guide `. .. versionadded:: 0.20", - "docstring": "Compute the balanced accuracy.\n\nThe balanced accuracy in binary and multiclass classification problems to\ndeal with imbalanced datasets. It is defined as the average of recall\nobtained on each class.\n\nThe best value is 1 and the worst value is 0 when ``adjusted=False``.\n\nRead more in the :ref:`User Guide `.\n\n.. versionadded:: 0.20\n\nParameters\n----------\ny_true : 1d array-like\n Ground truth (correct) target values.\n\ny_pred : 1d array-like\n Estimated targets as returned by a classifier.\n\nsample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\nadjusted : bool, default=False\n When true, the result is adjusted for chance, so that random\n performance would score 0, while keeping perfect performance at a score\n of 1.\n\nReturns\n-------\nbalanced_accuracy : float\n\nSee Also\n--------\nrecall_score, roc_auc_score\n\nNotes\n-----\nSome literature promotes alternative definitions of balanced accuracy. Our\ndefinition is equivalent to :func:`accuracy_score` with class-balanced\nsample weights, and shares desirable properties with the binary case.\nSee the :ref:`User Guide `.\n\nReferences\n----------\n.. [1] Brodersen, K.H.; Ong, C.S.; Stephan, K.E.; Buhmann, J.M. (2010).\n The balanced accuracy and its posterior distribution.\n Proceedings of the 20th International Conference on Pattern\n Recognition, 3121-24.\n.. [2] John. D. Kelleher, Brian Mac Namee, Aoife D'Arcy, (2015).\n `Fundamentals of Machine Learning for Predictive Data Analytics:\n Algorithms, Worked Examples, and Case Studies\n `_.\n\nExamples\n--------\n>>> from sklearn.metrics import balanced_accuracy_score\n>>> y_true = [0, 1, 0, 0, 1, 0]\n>>> y_pred = [0, 1, 0, 0, 0, 1]\n>>> balanced_accuracy_score(y_true, y_pred)\n0.625", - "source_code": "\ndef balanced_accuracy_score(y_true, y_pred, *, sample_weight=None, adjusted=False):\n \"\"\"Compute the balanced accuracy.\n\n The balanced accuracy in binary and multiclass classification problems to\n deal with imbalanced datasets. It is defined as the average of recall\n obtained on each class.\n\n The best value is 1 and the worst value is 0 when ``adjusted=False``.\n\n Read more in the :ref:`User Guide `.\n\n .. 
versionadded:: 0.20\n\n Parameters\n ----------\n y_true : 1d array-like\n Ground truth (correct) target values.\n\n y_pred : 1d array-like\n Estimated targets as returned by a classifier.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n adjusted : bool, default=False\n When true, the result is adjusted for chance, so that random\n performance would score 0, while keeping perfect performance at a score\n of 1.\n\n Returns\n -------\n balanced_accuracy : float\n\n See Also\n --------\n recall_score, roc_auc_score\n\n Notes\n -----\n Some literature promotes alternative definitions of balanced accuracy. Our\n definition is equivalent to :func:`accuracy_score` with class-balanced\n sample weights, and shares desirable properties with the binary case.\n See the :ref:`User Guide `.\n\n References\n ----------\n .. [1] Brodersen, K.H.; Ong, C.S.; Stephan, K.E.; Buhmann, J.M. (2010).\n The balanced accuracy and its posterior distribution.\n Proceedings of the 20th International Conference on Pattern\n Recognition, 3121-24.\n .. [2] John. D. Kelleher, Brian Mac Namee, Aoife D'Arcy, (2015).\n `Fundamentals of Machine Learning for Predictive Data Analytics:\n Algorithms, Worked Examples, and Case Studies\n `_.\n\n Examples\n --------\n >>> from sklearn.metrics import balanced_accuracy_score\n >>> y_true = [0, 1, 0, 0, 1, 0]\n >>> y_pred = [0, 1, 0, 0, 0, 1]\n >>> balanced_accuracy_score(y_true, y_pred)\n 0.625\n\n \"\"\"\n C = confusion_matrix(y_true, y_pred, sample_weight=sample_weight)\n with np.errstate(divide='ignore', invalid='ignore'):\n per_class = np.diag(C) / C.sum(axis=1)\n if np.any(np.isnan(per_class)):\n warnings.warn('y_pred contains classes not in y_true')\n per_class = per_class[~np.isnan(per_class)]\n score = np.mean(per_class)\n if adjusted:\n n_classes = len(per_class)\n chance = 1 / n_classes\n score -= chance\n score /= 1 - chance\n return score" + "description": "Compute the balanced accuracy.\n\nThe balanced accuracy in binary and multiclass classification problems to\ndeal with imbalanced datasets. It is defined as the average of recall\nobtained on each class.\n\nThe best value is 1 and the worst value is 0 when ``adjusted=False``.\n\nRead more in the :ref:`User Guide `.\n\n.. versionadded:: 0.20", + "docstring": "Compute the balanced accuracy.\n\n The balanced accuracy in binary and multiclass classification problems to\n deal with imbalanced datasets. It is defined as the average of recall\n obtained on each class.\n\n The best value is 1 and the worst value is 0 when ``adjusted=False``.\n\n Read more in the :ref:`User Guide `.\n\n .. 
versionadded:: 0.20\n\n Parameters\n ----------\n y_true : 1d array-like\n Ground truth (correct) target values.\n\n y_pred : 1d array-like\n Estimated targets as returned by a classifier.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n adjusted : bool, default=False\n When true, the result is adjusted for chance, so that random\n performance would score 0, while keeping perfect performance at a score\n of 1.\n\n Returns\n -------\n balanced_accuracy : float\n Balanced accuracy score.\n\n See Also\n --------\n average_precision_score : Compute average precision (AP) from prediction\n scores.\n precision_score : Compute the precision score.\n recall_score : Compute the recall score.\n roc_auc_score : Compute Area Under the Receiver Operating Characteristic\n Curve (ROC AUC) from prediction scores.\n\n Notes\n -----\n Some literature promotes alternative definitions of balanced accuracy. Our\n definition is equivalent to :func:`accuracy_score` with class-balanced\n sample weights, and shares desirable properties with the binary case.\n See the :ref:`User Guide `.\n\n References\n ----------\n .. [1] Brodersen, K.H.; Ong, C.S.; Stephan, K.E.; Buhmann, J.M. (2010).\n The balanced accuracy and its posterior distribution.\n Proceedings of the 20th International Conference on Pattern\n Recognition, 3121-24.\n .. [2] John. D. Kelleher, Brian Mac Namee, Aoife D'Arcy, (2015).\n `Fundamentals of Machine Learning for Predictive Data Analytics:\n Algorithms, Worked Examples, and Case Studies\n `_.\n\n Examples\n --------\n >>> from sklearn.metrics import balanced_accuracy_score\n >>> y_true = [0, 1, 0, 0, 1, 0]\n >>> y_pred = [0, 1, 0, 0, 0, 1]\n >>> balanced_accuracy_score(y_true, y_pred)\n 0.625\n ", + "source_code": "\ndef balanced_accuracy_score(y_true, y_pred, *, sample_weight=None, adjusted=False):\n \"\"\"Compute the balanced accuracy.\n\n The balanced accuracy in binary and multiclass classification problems to\n deal with imbalanced datasets. It is defined as the average of recall\n obtained on each class.\n\n The best value is 1 and the worst value is 0 when ``adjusted=False``.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.20\n\n Parameters\n ----------\n y_true : 1d array-like\n Ground truth (correct) target values.\n\n y_pred : 1d array-like\n Estimated targets as returned by a classifier.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n adjusted : bool, default=False\n When true, the result is adjusted for chance, so that random\n performance would score 0, while keeping perfect performance at a score\n of 1.\n\n Returns\n -------\n balanced_accuracy : float\n Balanced accuracy score.\n\n See Also\n --------\n average_precision_score : Compute average precision (AP) from prediction\n scores.\n precision_score : Compute the precision score.\n recall_score : Compute the recall score.\n roc_auc_score : Compute Area Under the Receiver Operating Characteristic\n Curve (ROC AUC) from prediction scores.\n\n Notes\n -----\n Some literature promotes alternative definitions of balanced accuracy. Our\n definition is equivalent to :func:`accuracy_score` with class-balanced\n sample weights, and shares desirable properties with the binary case.\n See the :ref:`User Guide `.\n\n References\n ----------\n .. [1] Brodersen, K.H.; Ong, C.S.; Stephan, K.E.; Buhmann, J.M. 
(2010).\n The balanced accuracy and its posterior distribution.\n Proceedings of the 20th International Conference on Pattern\n Recognition, 3121-24.\n .. [2] John. D. Kelleher, Brian Mac Namee, Aoife D'Arcy, (2015).\n `Fundamentals of Machine Learning for Predictive Data Analytics:\n Algorithms, Worked Examples, and Case Studies\n `_.\n\n Examples\n --------\n >>> from sklearn.metrics import balanced_accuracy_score\n >>> y_true = [0, 1, 0, 0, 1, 0]\n >>> y_pred = [0, 1, 0, 0, 0, 1]\n >>> balanced_accuracy_score(y_true, y_pred)\n 0.625\n \"\"\"\n C = confusion_matrix(y_true, y_pred, sample_weight=sample_weight)\n with np.errstate(divide='ignore', invalid='ignore'):\n per_class = np.diag(C) / C.sum(axis=1)\n if np.any(np.isnan(per_class)):\n warnings.warn('y_pred contains classes not in y_true')\n per_class = per_class[~np.isnan(per_class)]\n score = np.mean(per_class)\n if adjusted:\n n_classes = len(per_class)\n chance = 1 / n_classes\n score -= chance\n score /= 1 - chance\n return score" }, { "name": "brier_score_loss", @@ -117092,7 +125747,8 @@ "docstring": { "type": "array of shape (n_samples,)", "description": "True targets." - } + }, + "refined_type": {} }, { "name": "y_prob", @@ -117102,7 +125758,8 @@ "docstring": { "type": "array of shape (n_samples,)", "description": "Probabilities of the positive class." - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -117112,7 +125769,8 @@ "docstring": { "type": "array-like of shape (n_samples,), default=None", "description": "Sample weights." - } + }, + "refined_type": {} }, { "name": "pos_label", @@ -117122,13 +125780,17 @@ "docstring": { "type": "int or str, default=None", "description": "Label of the positive class. `pos_label` will be inferred in the\nfollowing manner:\n\n* if `y_true` in {-1, 1} or {0, 1}, `pos_label` defaults to 1;\n* else if `y_true` contains string, an error will be raised and\n `pos_label` should be explicitly specified;\n* otherwise, `pos_label` defaults to the greater label,\n i.e. `np.unique(y_true)[-1]`." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } } ], "results": [], "is_public": true, - "description": "Compute the Brier score loss.\n\nThe smaller the Brier score loss, the better, hence the naming with \"loss\". The Brier score measures the mean squared difference between the predicted probability and the actual outcome. The Brier score always takes on a value between zero and one, since this is the largest possible difference between a predicted probability (which must be between zero and one) and the actual outcome (which can take on values of only 0 and 1). It can be decomposed is the sum of refinement loss and calibration loss. The Brier score is appropriate for binary and categorical outcomes that can be structured as true or false, but is inappropriate for ordinal variables which can take on three or more values (this is because the Brier score assumes that all possible outcomes are equivalently \"distant\" from one another). Which label is considered to be the positive label is controlled via the parameter `pos_label`, which defaults to the greater label unless `y_true` is all 0 or all -1, in which case `pos_label` defaults to 1. Read more in the :ref:`User Guide `.", - "docstring": "Compute the Brier score loss.\n\nThe smaller the Brier score loss, the better, hence the naming with \"loss\".\nThe Brier score measures the mean squared difference between the predicted\nprobability and the actual outcome. 
The Brier score always\ntakes on a value between zero and one, since this is the largest\npossible difference between a predicted probability (which must be\nbetween zero and one) and the actual outcome (which can take on values\nof only 0 and 1). It can be decomposed is the sum of refinement loss and\ncalibration loss.\n\nThe Brier score is appropriate for binary and categorical outcomes that\ncan be structured as true or false, but is inappropriate for ordinal\nvariables which can take on three or more values (this is because the\nBrier score assumes that all possible outcomes are equivalently\n\"distant\" from one another). Which label is considered to be the positive\nlabel is controlled via the parameter `pos_label`, which defaults to\nthe greater label unless `y_true` is all 0 or all -1, in which case\n`pos_label` defaults to 1.\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\ny_true : array of shape (n_samples,)\n True targets.\n\ny_prob : array of shape (n_samples,)\n Probabilities of the positive class.\n\nsample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\npos_label : int or str, default=None\n Label of the positive class. `pos_label` will be inferred in the\n following manner:\n\n * if `y_true` in {-1, 1} or {0, 1}, `pos_label` defaults to 1;\n * else if `y_true` contains string, an error will be raised and\n `pos_label` should be explicitly specified;\n * otherwise, `pos_label` defaults to the greater label,\n i.e. `np.unique(y_true)[-1]`.\n\nReturns\n-------\nscore : float\n Brier score loss.\n\nExamples\n--------\n>>> import numpy as np\n>>> from sklearn.metrics import brier_score_loss\n>>> y_true = np.array([0, 1, 1, 0])\n>>> y_true_categorical = np.array([\"spam\", \"ham\", \"ham\", \"spam\"])\n>>> y_prob = np.array([0.1, 0.9, 0.8, 0.3])\n>>> brier_score_loss(y_true, y_prob)\n0.037...\n>>> brier_score_loss(y_true, 1-y_prob, pos_label=0)\n0.037...\n>>> brier_score_loss(y_true_categorical, y_prob, pos_label=\"ham\")\n0.037...\n>>> brier_score_loss(y_true, np.array(y_prob) > 0.5)\n0.0\n\nReferences\n----------\n.. [1] `Wikipedia entry for the Brier score\n `_.", + "description": "Compute the Brier score loss.\n\nThe smaller the Brier score loss, the better, hence the naming with \"loss\".\nThe Brier score measures the mean squared difference between the predicted\nprobability and the actual outcome. The Brier score always\ntakes on a value between zero and one, since this is the largest\npossible difference between a predicted probability (which must be\nbetween zero and one) and the actual outcome (which can take on values\nof only 0 and 1). It can be decomposed is the sum of refinement loss and\ncalibration loss.\n\nThe Brier score is appropriate for binary and categorical outcomes that\ncan be structured as true or false, but is inappropriate for ordinal\nvariables which can take on three or more values (this is because the\nBrier score assumes that all possible outcomes are equivalently\n\"distant\" from one another). Which label is considered to be the positive\nlabel is controlled via the parameter `pos_label`, which defaults to\nthe greater label unless `y_true` is all 0 or all -1, in which case\n`pos_label` defaults to 1.\n\nRead more in the :ref:`User Guide `.", + "docstring": "Compute the Brier score loss.\n\n The smaller the Brier score loss, the better, hence the naming with \"loss\".\n The Brier score measures the mean squared difference between the predicted\n probability and the actual outcome. 
The Brier score always\n takes on a value between zero and one, since this is the largest\n possible difference between a predicted probability (which must be\n between zero and one) and the actual outcome (which can take on values\n of only 0 and 1). It can be decomposed is the sum of refinement loss and\n calibration loss.\n\n The Brier score is appropriate for binary and categorical outcomes that\n can be structured as true or false, but is inappropriate for ordinal\n variables which can take on three or more values (this is because the\n Brier score assumes that all possible outcomes are equivalently\n \"distant\" from one another). Which label is considered to be the positive\n label is controlled via the parameter `pos_label`, which defaults to\n the greater label unless `y_true` is all 0 or all -1, in which case\n `pos_label` defaults to 1.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n y_true : array of shape (n_samples,)\n True targets.\n\n y_prob : array of shape (n_samples,)\n Probabilities of the positive class.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n pos_label : int or str, default=None\n Label of the positive class. `pos_label` will be inferred in the\n following manner:\n\n * if `y_true` in {-1, 1} or {0, 1}, `pos_label` defaults to 1;\n * else if `y_true` contains string, an error will be raised and\n `pos_label` should be explicitly specified;\n * otherwise, `pos_label` defaults to the greater label,\n i.e. `np.unique(y_true)[-1]`.\n\n Returns\n -------\n score : float\n Brier score loss.\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.metrics import brier_score_loss\n >>> y_true = np.array([0, 1, 1, 0])\n >>> y_true_categorical = np.array([\"spam\", \"ham\", \"ham\", \"spam\"])\n >>> y_prob = np.array([0.1, 0.9, 0.8, 0.3])\n >>> brier_score_loss(y_true, y_prob)\n 0.037...\n >>> brier_score_loss(y_true, 1-y_prob, pos_label=0)\n 0.037...\n >>> brier_score_loss(y_true_categorical, y_prob, pos_label=\"ham\")\n 0.037...\n >>> brier_score_loss(y_true, np.array(y_prob) > 0.5)\n 0.0\n\n References\n ----------\n .. [1] `Wikipedia entry for the Brier score\n `_.\n ", "source_code": "\ndef brier_score_loss(y_true, y_prob, *, sample_weight=None, pos_label=None):\n \"\"\"Compute the Brier score loss.\n\n The smaller the Brier score loss, the better, hence the naming with \"loss\".\n The Brier score measures the mean squared difference between the predicted\n probability and the actual outcome. The Brier score always\n takes on a value between zero and one, since this is the largest\n possible difference between a predicted probability (which must be\n between zero and one) and the actual outcome (which can take on values\n of only 0 and 1). It can be decomposed is the sum of refinement loss and\n calibration loss.\n\n The Brier score is appropriate for binary and categorical outcomes that\n can be structured as true or false, but is inappropriate for ordinal\n variables which can take on three or more values (this is because the\n Brier score assumes that all possible outcomes are equivalently\n \"distant\" from one another). 
Which label is considered to be the positive\n label is controlled via the parameter `pos_label`, which defaults to\n the greater label unless `y_true` is all 0 or all -1, in which case\n `pos_label` defaults to 1.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n y_true : array of shape (n_samples,)\n True targets.\n\n y_prob : array of shape (n_samples,)\n Probabilities of the positive class.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n pos_label : int or str, default=None\n Label of the positive class. `pos_label` will be inferred in the\n following manner:\n\n * if `y_true` in {-1, 1} or {0, 1}, `pos_label` defaults to 1;\n * else if `y_true` contains string, an error will be raised and\n `pos_label` should be explicitly specified;\n * otherwise, `pos_label` defaults to the greater label,\n i.e. `np.unique(y_true)[-1]`.\n\n Returns\n -------\n score : float\n Brier score loss.\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.metrics import brier_score_loss\n >>> y_true = np.array([0, 1, 1, 0])\n >>> y_true_categorical = np.array([\"spam\", \"ham\", \"ham\", \"spam\"])\n >>> y_prob = np.array([0.1, 0.9, 0.8, 0.3])\n >>> brier_score_loss(y_true, y_prob)\n 0.037...\n >>> brier_score_loss(y_true, 1-y_prob, pos_label=0)\n 0.037...\n >>> brier_score_loss(y_true_categorical, y_prob, pos_label=\"ham\")\n 0.037...\n >>> brier_score_loss(y_true, np.array(y_prob) > 0.5)\n 0.0\n\n References\n ----------\n .. [1] `Wikipedia entry for the Brier score\n `_.\n \"\"\"\n y_true = column_or_1d(y_true)\n y_prob = column_or_1d(y_prob)\n assert_all_finite(y_true)\n assert_all_finite(y_prob)\n check_consistent_length(y_true, y_prob, sample_weight)\n y_type = type_of_target(y_true)\n if y_type != 'binary':\n raise ValueError(f'Only binary classification is supported. The type of the target is {y_type}.')\n if y_prob.max() > 1:\n raise ValueError('y_prob contains values greater than 1.')\n if y_prob.min() < 0:\n raise ValueError('y_prob contains values less than 0.')\n try:\n pos_label = _check_pos_label_consistency(pos_label, y_true)\n except ValueError:\n classes = np.unique(y_true)\n if classes.dtype.kind not in ('O', 'U', 'S'):\n pos_label = classes[-1]\n else:\n raise\n y_true = np.array(y_true == pos_label, int)\n return np.average((y_true - y_prob)**2, weights=sample_weight)" }, { @@ -117146,7 +125808,8 @@ "docstring": { "type": "1d array-like, or label indicator array / sparse matrix", "description": "Ground truth (correct) target values." - } + }, + "refined_type": {} }, { "name": "y_pred", @@ -117156,7 +125819,8 @@ "docstring": { "type": "1d array-like, or label indicator array / sparse matrix", "description": "Estimated targets as returned by a classifier." - } + }, + "refined_type": {} }, { "name": "labels", @@ -117166,7 +125830,8 @@ "docstring": { "type": "array-like of shape (n_labels,), default=None", "description": "Optional list of label indices to include in the report." - } + }, + "refined_type": {} }, { "name": "target_names", @@ -117176,7 +125841,8 @@ "docstring": { "type": "list of str of shape (n_labels,), default=None", "description": "Optional display names matching the labels (same order)." - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -117186,7 +125852,8 @@ "docstring": { "type": "array-like of shape (n_samples,), default=None", "description": "Sample weights." 
- } + }, + "refined_type": {} }, { "name": "digits", @@ -117196,7 +125863,8 @@ "docstring": { "type": "int, default=2", "description": "Number of digits for formatting output floating point values.\nWhen ``output_dict`` is ``True``, this will be ignored and the\nreturned values will not be rounded." - } + }, + "refined_type": {} }, { "name": "output_dict", @@ -117206,7 +125874,8 @@ "docstring": { "type": "bool, default=False", "description": "If True, return output as dict.\n\n.. versionadded:: 0.20" - } + }, + "refined_type": {} }, { "name": "zero_division", @@ -117216,13 +125885,14 @@ "docstring": { "type": "\"warn\", 0 or 1, default=\"warn\"", "description": "Sets the value to return when there is a zero division. If set to\n\"warn\", this acts as 0, but warnings are also raised." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Build a text report showing the main classification metrics.\n\nRead more in the :ref:`User Guide `.", - "docstring": "Build a text report showing the main classification metrics.\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\ny_true : 1d array-like, or label indicator array / sparse matrix\n Ground truth (correct) target values.\n\ny_pred : 1d array-like, or label indicator array / sparse matrix\n Estimated targets as returned by a classifier.\n\nlabels : array-like of shape (n_labels,), default=None\n Optional list of label indices to include in the report.\n\ntarget_names : list of str of shape (n_labels,), default=None\n Optional display names matching the labels (same order).\n\nsample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\ndigits : int, default=2\n Number of digits for formatting output floating point values.\n When ``output_dict`` is ``True``, this will be ignored and the\n returned values will not be rounded.\n\noutput_dict : bool, default=False\n If True, return output as dict.\n\n .. versionadded:: 0.20\n\nzero_division : \"warn\", 0 or 1, default=\"warn\"\n Sets the value to return when there is a zero division. If set to\n \"warn\", this acts as 0, but warnings are also raised.\n\nReturns\n-------\nreport : str or dict\n Text summary of the precision, recall, F1 score for each class.\n Dictionary returned if output_dict is True. Dictionary has the\n following structure::\n\n {'label 1': {'precision':0.5,\n 'recall':1.0,\n 'f1-score':0.67,\n 'support':1},\n 'label 2': { ... 
},\n ...\n }\n\n The reported averages include macro average (averaging the unweighted\n mean per label), weighted average (averaging the support-weighted mean\n per label), and sample average (only for multilabel classification).\n Micro average (averaging the total true positives, false negatives and\n false positives) is only shown for multi-label or multi-class\n with a subset of classes, because it corresponds to accuracy\n otherwise and would be the same for all metrics.\n See also :func:`precision_recall_fscore_support` for more details\n on averages.\n\n Note that in binary classification, recall of the positive class\n is also known as \"sensitivity\"; recall of the negative class is\n \"specificity\".\n\nSee Also\n--------\nprecision_recall_fscore_support, confusion_matrix,\nmultilabel_confusion_matrix\n\nExamples\n--------\n>>> from sklearn.metrics import classification_report\n>>> y_true = [0, 1, 2, 2, 2]\n>>> y_pred = [0, 0, 2, 2, 1]\n>>> target_names = ['class 0', 'class 1', 'class 2']\n>>> print(classification_report(y_true, y_pred, target_names=target_names))\n precision recall f1-score support\n\n class 0 0.50 1.00 0.67 1\n class 1 0.00 0.00 0.00 1\n class 2 1.00 0.67 0.80 3\n\n accuracy 0.60 5\n macro avg 0.50 0.56 0.49 5\nweighted avg 0.70 0.60 0.61 5\n\n>>> y_pred = [1, 1, 0]\n>>> y_true = [1, 1, 1]\n>>> print(classification_report(y_true, y_pred, labels=[1, 2, 3]))\n precision recall f1-score support\n\n 1 1.00 0.67 0.80 3\n 2 0.00 0.00 0.00 0\n 3 0.00 0.00 0.00 0\n\n micro avg 1.00 0.67 0.80 3\n macro avg 0.33 0.22 0.27 3\nweighted avg 1.00 0.67 0.80 3\n", + "docstring": "Build a text report showing the main classification metrics.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n y_true : 1d array-like, or label indicator array / sparse matrix\n Ground truth (correct) target values.\n\n y_pred : 1d array-like, or label indicator array / sparse matrix\n Estimated targets as returned by a classifier.\n\n labels : array-like of shape (n_labels,), default=None\n Optional list of label indices to include in the report.\n\n target_names : list of str of shape (n_labels,), default=None\n Optional display names matching the labels (same order).\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n digits : int, default=2\n Number of digits for formatting output floating point values.\n When ``output_dict`` is ``True``, this will be ignored and the\n returned values will not be rounded.\n\n output_dict : bool, default=False\n If True, return output as dict.\n\n .. versionadded:: 0.20\n\n zero_division : \"warn\", 0 or 1, default=\"warn\"\n Sets the value to return when there is a zero division. If set to\n \"warn\", this acts as 0, but warnings are also raised.\n\n Returns\n -------\n report : str or dict\n Text summary of the precision, recall, F1 score for each class.\n Dictionary returned if output_dict is True. Dictionary has the\n following structure::\n\n {'label 1': {'precision':0.5,\n 'recall':1.0,\n 'f1-score':0.67,\n 'support':1},\n 'label 2': { ... 
},\n ...\n }\n\n The reported averages include macro average (averaging the unweighted\n mean per label), weighted average (averaging the support-weighted mean\n per label), and sample average (only for multilabel classification).\n Micro average (averaging the total true positives, false negatives and\n false positives) is only shown for multi-label or multi-class\n with a subset of classes, because it corresponds to accuracy\n otherwise and would be the same for all metrics.\n See also :func:`precision_recall_fscore_support` for more details\n on averages.\n\n Note that in binary classification, recall of the positive class\n is also known as \"sensitivity\"; recall of the negative class is\n \"specificity\".\n\n See Also\n --------\n precision_recall_fscore_support, confusion_matrix,\n multilabel_confusion_matrix\n\n Examples\n --------\n >>> from sklearn.metrics import classification_report\n >>> y_true = [0, 1, 2, 2, 2]\n >>> y_pred = [0, 0, 2, 2, 1]\n >>> target_names = ['class 0', 'class 1', 'class 2']\n >>> print(classification_report(y_true, y_pred, target_names=target_names))\n precision recall f1-score support\n \n class 0 0.50 1.00 0.67 1\n class 1 0.00 0.00 0.00 1\n class 2 1.00 0.67 0.80 3\n \n accuracy 0.60 5\n macro avg 0.50 0.56 0.49 5\n weighted avg 0.70 0.60 0.61 5\n \n >>> y_pred = [1, 1, 0]\n >>> y_true = [1, 1, 1]\n >>> print(classification_report(y_true, y_pred, labels=[1, 2, 3]))\n precision recall f1-score support\n \n 1 1.00 0.67 0.80 3\n 2 0.00 0.00 0.00 0\n 3 0.00 0.00 0.00 0\n \n micro avg 1.00 0.67 0.80 3\n macro avg 0.33 0.22 0.27 3\n weighted avg 1.00 0.67 0.80 3\n \n ", "source_code": "\ndef classification_report(y_true, y_pred, *, labels=None, target_names=None, sample_weight=None, digits=2, output_dict=False, zero_division='warn'):\n \"\"\"Build a text report showing the main classification metrics.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n y_true : 1d array-like, or label indicator array / sparse matrix\n Ground truth (correct) target values.\n\n y_pred : 1d array-like, or label indicator array / sparse matrix\n Estimated targets as returned by a classifier.\n\n labels : array-like of shape (n_labels,), default=None\n Optional list of label indices to include in the report.\n\n target_names : list of str of shape (n_labels,), default=None\n Optional display names matching the labels (same order).\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n digits : int, default=2\n Number of digits for formatting output floating point values.\n When ``output_dict`` is ``True``, this will be ignored and the\n returned values will not be rounded.\n\n output_dict : bool, default=False\n If True, return output as dict.\n\n .. versionadded:: 0.20\n\n zero_division : \"warn\", 0 or 1, default=\"warn\"\n Sets the value to return when there is a zero division. If set to\n \"warn\", this acts as 0, but warnings are also raised.\n\n Returns\n -------\n report : str or dict\n Text summary of the precision, recall, F1 score for each class.\n Dictionary returned if output_dict is True. Dictionary has the\n following structure::\n\n {'label 1': {'precision':0.5,\n 'recall':1.0,\n 'f1-score':0.67,\n 'support':1},\n 'label 2': { ... 
},\n ...\n }\n\n The reported averages include macro average (averaging the unweighted\n mean per label), weighted average (averaging the support-weighted mean\n per label), and sample average (only for multilabel classification).\n Micro average (averaging the total true positives, false negatives and\n false positives) is only shown for multi-label or multi-class\n with a subset of classes, because it corresponds to accuracy\n otherwise and would be the same for all metrics.\n See also :func:`precision_recall_fscore_support` for more details\n on averages.\n\n Note that in binary classification, recall of the positive class\n is also known as \"sensitivity\"; recall of the negative class is\n \"specificity\".\n\n See Also\n --------\n precision_recall_fscore_support, confusion_matrix,\n multilabel_confusion_matrix\n\n Examples\n --------\n >>> from sklearn.metrics import classification_report\n >>> y_true = [0, 1, 2, 2, 2]\n >>> y_pred = [0, 0, 2, 2, 1]\n >>> target_names = ['class 0', 'class 1', 'class 2']\n >>> print(classification_report(y_true, y_pred, target_names=target_names))\n precision recall f1-score support\n \n class 0 0.50 1.00 0.67 1\n class 1 0.00 0.00 0.00 1\n class 2 1.00 0.67 0.80 3\n \n accuracy 0.60 5\n macro avg 0.50 0.56 0.49 5\n weighted avg 0.70 0.60 0.61 5\n \n >>> y_pred = [1, 1, 0]\n >>> y_true = [1, 1, 1]\n >>> print(classification_report(y_true, y_pred, labels=[1, 2, 3]))\n precision recall f1-score support\n \n 1 1.00 0.67 0.80 3\n 2 0.00 0.00 0.00 0\n 3 0.00 0.00 0.00 0\n \n micro avg 1.00 0.67 0.80 3\n macro avg 0.33 0.22 0.27 3\n weighted avg 1.00 0.67 0.80 3\n \n \"\"\"\n (y_type, y_true, y_pred) = _check_targets(y_true, y_pred)\n if labels is None:\n labels = unique_labels(y_true, y_pred)\n labels_given = False\n else:\n labels = np.asarray(labels)\n labels_given = True\n micro_is_accuracy = (y_type == 'multiclass' or y_type == 'binary') and (not labels_given or set(labels) == set(unique_labels(y_true, y_pred)))\n if target_names is not None and len(labels) != len(target_names):\n if labels_given:\n warnings.warn('labels size, {0}, does not match size of target_names, {1}'.format(len(labels), len(target_names)))\n else:\n raise ValueError('Number of classes, {0}, does not match size of target_names, {1}. 
Try specifying the labels parameter'.format(len(labels), len(target_names)))\n if target_names is None:\n target_names = ['%s' % l for l in labels]\n headers = ['precision', 'recall', 'f1-score', 'support']\n (p, r, f1, s) = precision_recall_fscore_support(y_true, y_pred, labels=labels, average=None, sample_weight=sample_weight, zero_division=zero_division)\n rows = zip(target_names, p, r, f1, s)\n if y_type.startswith('multilabel'):\n average_options = ('micro', 'macro', 'weighted', 'samples')\n else:\n average_options = ('micro', 'macro', 'weighted')\n if output_dict:\n report_dict = {label[0]: label[1:] for label in rows}\n for (label, scores) in report_dict.items():\n report_dict[label] = dict(zip(headers, [i.item() for i in scores]))\n else:\n longest_last_line_heading = 'weighted avg'\n name_width = max((len(cn) for cn in target_names))\n width = max(name_width, len(longest_last_line_heading), digits)\n head_fmt = '{:>{width}s} ' + ' {:>9}' * len(headers)\n report = head_fmt.format('', *headers, width=width)\n report += '\\n\\n'\n row_fmt = '{:>{width}s} ' + ' {:>9.{digits}f}' * 3 + ' {:>9}\\n'\n for row in rows:\n report += row_fmt.format(*row, width=width, digits=digits)\n report += '\\n'\n for average in average_options:\n if average.startswith('micro') and micro_is_accuracy:\n line_heading = 'accuracy'\n else:\n line_heading = average + ' avg'\n (avg_p, avg_r, avg_f1, _) = precision_recall_fscore_support(y_true, y_pred, labels=labels, average=average, sample_weight=sample_weight, zero_division=zero_division)\n avg = [avg_p, avg_r, avg_f1, np.sum(s)]\n if output_dict:\n report_dict[line_heading] = dict(zip(headers, [i.item() for i in avg]))\n elif line_heading == 'accuracy':\n row_fmt_accuracy = '{:>{width}s} ' + ' {:>9.{digits}}' * 2 + ' {:>9.{digits}f}' + ' {:>9}\\n'\n report += row_fmt_accuracy.format(line_heading, '', '', *avg[2:], width=width, digits=digits)\n else:\n report += row_fmt.format(line_heading, *avg, width=width, digits=digits)\n if output_dict:\n if 'accuracy' in report_dict.keys():\n report_dict['accuracy'] = report_dict['accuracy']['precision']\n return report_dict\n else:\n return report" }, { @@ -117240,7 +125910,8 @@ "docstring": { "type": "array of shape (n_samples,)", "description": "Labels assigned by the first annotator." - } + }, + "refined_type": {} }, { "name": "y2", @@ -117250,7 +125921,8 @@ "docstring": { "type": "array of shape (n_samples,)", "description": "Labels assigned by the second annotator. The kappa statistic is\nsymmetric, so swapping ``y1`` and ``y2`` doesn't change the value." - } + }, + "refined_type": {} }, { "name": "labels", @@ -117260,7 +125932,8 @@ "docstring": { "type": "array-like of shape (n_classes,), default=None", "description": "List of labels to index the matrix. This may be used to select a\nsubset of labels. If `None`, all labels that appear at least once in\n``y1`` or ``y2`` are used." - } + }, + "refined_type": {} }, { "name": "weights", @@ -117270,6 +125943,10 @@ "docstring": { "type": "{'linear', 'quadratic'}, default=None", "description": "Weighting type to calculate the score. `None` means no weighted;\n\"linear\" means linear weighted; \"quadratic\" means quadratic weighted." + }, + "refined_type": { + "kind": "EnumType", + "values": ["quadratic", "linear"] } }, { @@ -117280,13 +125957,14 @@ "docstring": { "type": "array-like of shape (n_samples,), default=None", "description": "Sample weights." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Cohen's kappa: a statistic that measures inter-annotator agreement.\n\nThis function computes Cohen's kappa [1]_, a score that expresses the level of agreement between two annotators on a classification problem. It is defined as .. math:: \\kappa = (p_o - p_e) / (1 - p_e) where :math:`p_o` is the empirical probability of agreement on the label assigned to any sample (the observed agreement ratio), and :math:`p_e` is the expected agreement when both annotators assign labels randomly. :math:`p_e` is estimated using a per-annotator empirical prior over the class labels [2]_. Read more in the :ref:`User Guide `.", - "docstring": "Cohen's kappa: a statistic that measures inter-annotator agreement.\n\nThis function computes Cohen's kappa [1]_, a score that expresses the level\nof agreement between two annotators on a classification problem. It is\ndefined as\n\n.. math::\n \\kappa = (p_o - p_e) / (1 - p_e)\n\nwhere :math:`p_o` is the empirical probability of agreement on the label\nassigned to any sample (the observed agreement ratio), and :math:`p_e` is\nthe expected agreement when both annotators assign labels randomly.\n:math:`p_e` is estimated using a per-annotator empirical prior over the\nclass labels [2]_.\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\ny1 : array of shape (n_samples,)\n Labels assigned by the first annotator.\n\ny2 : array of shape (n_samples,)\n Labels assigned by the second annotator. The kappa statistic is\n symmetric, so swapping ``y1`` and ``y2`` doesn't change the value.\n\nlabels : array-like of shape (n_classes,), default=None\n List of labels to index the matrix. This may be used to select a\n subset of labels. If `None`, all labels that appear at least once in\n ``y1`` or ``y2`` are used.\n\nweights : {'linear', 'quadratic'}, default=None\n Weighting type to calculate the score. `None` means no weighted;\n \"linear\" means linear weighted; \"quadratic\" means quadratic weighted.\n\nsample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\nReturns\n-------\nkappa : float\n The kappa statistic, which is a number between -1 and 1. The maximum\n value means complete agreement; zero or lower means chance agreement.\n\nReferences\n----------\n.. [1] J. Cohen (1960). \"A coefficient of agreement for nominal scales\".\n Educational and Psychological Measurement 20(1):37-46.\n doi:10.1177/001316446002000104.\n.. [2] `R. Artstein and M. Poesio (2008). \"Inter-coder agreement for\n computational linguistics\". Computational Linguistics 34(4):555-596\n `_.\n.. [3] `Wikipedia entry for the Cohen's kappa\n `_.", + "description": "Cohen's kappa: a statistic that measures inter-annotator agreement.\n\nThis function computes Cohen's kappa [1]_, a score that expresses the level\nof agreement between two annotators on a classification problem. It is\ndefined as\n\n.. 
math::\n \\kappa = (p_o - p_e) / (1 - p_e)\n\nwhere :math:`p_o` is the empirical probability of agreement on the label\nassigned to any sample (the observed agreement ratio), and :math:`p_e` is\nthe expected agreement when both annotators assign labels randomly.\n:math:`p_e` is estimated using a per-annotator empirical prior over the\nclass labels [2]_.\n\nRead more in the :ref:`User Guide `.", + "docstring": "Cohen's kappa: a statistic that measures inter-annotator agreement.\n\n This function computes Cohen's kappa [1]_, a score that expresses the level\n of agreement between two annotators on a classification problem. It is\n defined as\n\n .. math::\n \\kappa = (p_o - p_e) / (1 - p_e)\n\n where :math:`p_o` is the empirical probability of agreement on the label\n assigned to any sample (the observed agreement ratio), and :math:`p_e` is\n the expected agreement when both annotators assign labels randomly.\n :math:`p_e` is estimated using a per-annotator empirical prior over the\n class labels [2]_.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n y1 : array of shape (n_samples,)\n Labels assigned by the first annotator.\n\n y2 : array of shape (n_samples,)\n Labels assigned by the second annotator. The kappa statistic is\n symmetric, so swapping ``y1`` and ``y2`` doesn't change the value.\n\n labels : array-like of shape (n_classes,), default=None\n List of labels to index the matrix. This may be used to select a\n subset of labels. If `None`, all labels that appear at least once in\n ``y1`` or ``y2`` are used.\n\n weights : {'linear', 'quadratic'}, default=None\n Weighting type to calculate the score. `None` means no weighted;\n \"linear\" means linear weighted; \"quadratic\" means quadratic weighted.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n Returns\n -------\n kappa : float\n The kappa statistic, which is a number between -1 and 1. The maximum\n value means complete agreement; zero or lower means chance agreement.\n\n References\n ----------\n .. [1] J. Cohen (1960). \"A coefficient of agreement for nominal scales\".\n Educational and Psychological Measurement 20(1):37-46.\n doi:10.1177/001316446002000104.\n .. [2] `R. Artstein and M. Poesio (2008). \"Inter-coder agreement for\n computational linguistics\". Computational Linguistics 34(4):555-596\n `_.\n .. [3] `Wikipedia entry for the Cohen's kappa\n `_.\n ", "source_code": "\ndef cohen_kappa_score(y1, y2, *, labels=None, weights=None, sample_weight=None):\n \"\"\"Cohen's kappa: a statistic that measures inter-annotator agreement.\n\n This function computes Cohen's kappa [1]_, a score that expresses the level\n of agreement between two annotators on a classification problem. It is\n defined as\n\n .. math::\n \\kappa = (p_o - p_e) / (1 - p_e)\n\n where :math:`p_o` is the empirical probability of agreement on the label\n assigned to any sample (the observed agreement ratio), and :math:`p_e` is\n the expected agreement when both annotators assign labels randomly.\n :math:`p_e` is estimated using a per-annotator empirical prior over the\n class labels [2]_.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n y1 : array of shape (n_samples,)\n Labels assigned by the first annotator.\n\n y2 : array of shape (n_samples,)\n Labels assigned by the second annotator. The kappa statistic is\n symmetric, so swapping ``y1`` and ``y2`` doesn't change the value.\n\n labels : array-like of shape (n_classes,), default=None\n List of labels to index the matrix. 
This may be used to select a\n subset of labels. If `None`, all labels that appear at least once in\n ``y1`` or ``y2`` are used.\n\n weights : {'linear', 'quadratic'}, default=None\n Weighting type to calculate the score. `None` means no weighted;\n \"linear\" means linear weighted; \"quadratic\" means quadratic weighted.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n Returns\n -------\n kappa : float\n The kappa statistic, which is a number between -1 and 1. The maximum\n value means complete agreement; zero or lower means chance agreement.\n\n References\n ----------\n .. [1] J. Cohen (1960). \"A coefficient of agreement for nominal scales\".\n Educational and Psychological Measurement 20(1):37-46.\n doi:10.1177/001316446002000104.\n .. [2] `R. Artstein and M. Poesio (2008). \"Inter-coder agreement for\n computational linguistics\". Computational Linguistics 34(4):555-596\n `_.\n .. [3] `Wikipedia entry for the Cohen's kappa\n `_.\n \"\"\"\n confusion = confusion_matrix(y1, y2, labels=labels, sample_weight=sample_weight)\n n_classes = confusion.shape[0]\n sum0 = np.sum(confusion, axis=0)\n sum1 = np.sum(confusion, axis=1)\n expected = np.outer(sum0, sum1) / np.sum(sum0)\n if weights is None:\n w_mat = np.ones([n_classes, n_classes], dtype=int)\n w_mat.flat[::n_classes + 1] = 0\n elif weights == 'linear' or weights == 'quadratic':\n w_mat = np.zeros([n_classes, n_classes], dtype=int)\n w_mat += np.arange(n_classes)\n if weights == 'linear':\n w_mat = np.abs(w_mat - w_mat.T)\n else:\n w_mat = (w_mat - w_mat.T)**2\n else:\n raise ValueError('Unknown kappa weighting type.')\n k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)\n return 1 - k" }, { @@ -117304,7 +125982,8 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "Ground truth (correct) target values." - } + }, + "refined_type": {} }, { "name": "y_pred", @@ -117314,7 +125993,8 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "Estimated targets as returned by a classifier." - } + }, + "refined_type": {} }, { "name": "labels", @@ -117324,7 +126004,8 @@ "docstring": { "type": "array-like of shape (n_classes), default=None", "description": "List of labels to index the matrix. This may be used to reorder\nor select a subset of labels.\nIf ``None`` is given, those that appear at least once\nin ``y_true`` or ``y_pred`` are used in sorted order." - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -117334,7 +126015,8 @@ "docstring": { "type": "array-like of shape (n_samples,), default=None", "description": "Sample weights.\n\n.. versionadded:: 0.18" - } + }, + "refined_type": {} }, { "name": "normalize", @@ -117344,14 +126026,18 @@ "docstring": { "type": "{'true', 'pred', 'all'}, default=None", "description": "Normalizes confusion matrix over the true (rows), predicted (columns)\nconditions or all the population. If None, confusion matrix will not be\nnormalized." + }, + "refined_type": { + "kind": "EnumType", + "values": ["pred", "all", "true"] } } ], "results": [], "is_public": true, - "description": "Compute confusion matrix to evaluate the accuracy of a classification.\n\nBy definition a confusion matrix :math:`C` is such that :math:`C_{i, j}` is equal to the number of observations known to be in group :math:`i` and predicted to be in group :math:`j`. 
Thus in binary classification, the count of true negatives is :math:`C_{0,0}`, false negatives is :math:`C_{1,0}`, true positives is :math:`C_{1,1}` and false positives is :math:`C_{0,1}`. Read more in the :ref:`User Guide `.", - "docstring": "Compute confusion matrix to evaluate the accuracy of a classification.\n\nBy definition a confusion matrix :math:`C` is such that :math:`C_{i, j}`\nis equal to the number of observations known to be in group :math:`i` and\npredicted to be in group :math:`j`.\n\nThus in binary classification, the count of true negatives is\n:math:`C_{0,0}`, false negatives is :math:`C_{1,0}`, true positives is\n:math:`C_{1,1}` and false positives is :math:`C_{0,1}`.\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\ny_true : array-like of shape (n_samples,)\n Ground truth (correct) target values.\n\ny_pred : array-like of shape (n_samples,)\n Estimated targets as returned by a classifier.\n\nlabels : array-like of shape (n_classes), default=None\n List of labels to index the matrix. This may be used to reorder\n or select a subset of labels.\n If ``None`` is given, those that appear at least once\n in ``y_true`` or ``y_pred`` are used in sorted order.\n\nsample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n .. versionadded:: 0.18\n\nnormalize : {'true', 'pred', 'all'}, default=None\n Normalizes confusion matrix over the true (rows), predicted (columns)\n conditions or all the population. If None, confusion matrix will not be\n normalized.\n\nReturns\n-------\nC : ndarray of shape (n_classes, n_classes)\n Confusion matrix whose i-th row and j-th\n column entry indicates the number of\n samples with true label being i-th class\n and predicted label being j-th class.\n\nSee Also\n--------\nConfusionMatrixDisplay.from_estimator : Plot the confusion matrix\n given an estimator, the data, and the label.\nConfusionMatrixDisplay.from_predictions : Plot the confusion matrix\n given the true and predicted labels.\nConfusionMatrixDisplay : Confusion Matrix visualization.\n\nReferences\n----------\n.. 
[1] `Wikipedia entry for the Confusion matrix\n `_\n (Wikipedia and other references may use a different\n convention for axes).\n\nExamples\n--------\n>>> from sklearn.metrics import confusion_matrix\n>>> y_true = [2, 0, 2, 2, 0, 1]\n>>> y_pred = [0, 0, 2, 2, 0, 2]\n>>> confusion_matrix(y_true, y_pred)\narray([[2, 0, 0],\n [0, 0, 1],\n [1, 0, 2]])\n\n>>> y_true = [\"cat\", \"ant\", \"cat\", \"cat\", \"ant\", \"bird\"]\n>>> y_pred = [\"ant\", \"ant\", \"cat\", \"cat\", \"ant\", \"cat\"]\n>>> confusion_matrix(y_true, y_pred, labels=[\"ant\", \"bird\", \"cat\"])\narray([[2, 0, 0],\n [0, 0, 1],\n [1, 0, 2]])\n\nIn the binary case, we can extract true positives, etc as follows:\n\n>>> tn, fp, fn, tp = confusion_matrix([0, 1, 0, 1], [1, 1, 1, 0]).ravel()\n>>> (tn, fp, fn, tp)\n(0, 2, 1, 1)", - "source_code": "\ndef confusion_matrix(y_true, y_pred, *, labels=None, sample_weight=None, normalize=None):\n \"\"\"Compute confusion matrix to evaluate the accuracy of a classification.\n\n By definition a confusion matrix :math:`C` is such that :math:`C_{i, j}`\n is equal to the number of observations known to be in group :math:`i` and\n predicted to be in group :math:`j`.\n\n Thus in binary classification, the count of true negatives is\n :math:`C_{0,0}`, false negatives is :math:`C_{1,0}`, true positives is\n :math:`C_{1,1}` and false positives is :math:`C_{0,1}`.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n y_true : array-like of shape (n_samples,)\n Ground truth (correct) target values.\n\n y_pred : array-like of shape (n_samples,)\n Estimated targets as returned by a classifier.\n\n labels : array-like of shape (n_classes), default=None\n List of labels to index the matrix. This may be used to reorder\n or select a subset of labels.\n If ``None`` is given, those that appear at least once\n in ``y_true`` or ``y_pred`` are used in sorted order.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n .. versionadded:: 0.18\n\n normalize : {'true', 'pred', 'all'}, default=None\n Normalizes confusion matrix over the true (rows), predicted (columns)\n conditions or all the population. If None, confusion matrix will not be\n normalized.\n\n Returns\n -------\n C : ndarray of shape (n_classes, n_classes)\n Confusion matrix whose i-th row and j-th\n column entry indicates the number of\n samples with true label being i-th class\n and predicted label being j-th class.\n\n See Also\n --------\n ConfusionMatrixDisplay.from_estimator : Plot the confusion matrix\n given an estimator, the data, and the label.\n ConfusionMatrixDisplay.from_predictions : Plot the confusion matrix\n given the true and predicted labels.\n ConfusionMatrixDisplay : Confusion Matrix visualization.\n\n References\n ----------\n .. 
[1] `Wikipedia entry for the Confusion matrix\n `_\n (Wikipedia and other references may use a different\n convention for axes).\n\n Examples\n --------\n >>> from sklearn.metrics import confusion_matrix\n >>> y_true = [2, 0, 2, 2, 0, 1]\n >>> y_pred = [0, 0, 2, 2, 0, 2]\n >>> confusion_matrix(y_true, y_pred)\n array([[2, 0, 0],\n [0, 0, 1],\n [1, 0, 2]])\n\n >>> y_true = [\"cat\", \"ant\", \"cat\", \"cat\", \"ant\", \"bird\"]\n >>> y_pred = [\"ant\", \"ant\", \"cat\", \"cat\", \"ant\", \"cat\"]\n >>> confusion_matrix(y_true, y_pred, labels=[\"ant\", \"bird\", \"cat\"])\n array([[2, 0, 0],\n [0, 0, 1],\n [1, 0, 2]])\n\n In the binary case, we can extract true positives, etc as follows:\n\n >>> tn, fp, fn, tp = confusion_matrix([0, 1, 0, 1], [1, 1, 1, 0]).ravel()\n >>> (tn, fp, fn, tp)\n (0, 2, 1, 1)\n\n \"\"\"\n (y_type, y_true, y_pred) = _check_targets(y_true, y_pred)\n if y_type not in ('binary', 'multiclass'):\n raise ValueError('%s is not supported' % y_type)\n if labels is None:\n labels = unique_labels(y_true, y_pred)\n else:\n labels = np.asarray(labels)\n n_labels = labels.size\n if n_labels == 0:\n raise ValueError(\"'labels' should contains at least one label.\")\n elif y_true.size == 0:\n return np.zeros((n_labels, n_labels), dtype=int)\n elif len(np.intersect1d(y_true, labels)) == 0:\n raise ValueError('At least one label specified must be in y_true')\n if sample_weight is None:\n sample_weight = np.ones(y_true.shape[0], dtype=np.int64)\n else:\n sample_weight = np.asarray(sample_weight)\n check_consistent_length(y_true, y_pred, sample_weight)\n if normalize not in ['true', 'pred', 'all', None]:\n raise ValueError(\"normalize must be one of {'true', 'pred', 'all', None}\")\n n_labels = labels.size\n need_index_conversion = not (labels.dtype.kind in {'i', 'u', 'b'} and np.all(labels == np.arange(n_labels)) and y_true.min() >= 0 and y_pred.min() >= 0)\n if need_index_conversion:\n label_to_ind = {y: x for (x, y) in enumerate(labels)}\n y_pred = np.array([label_to_ind.get(x, n_labels + 1) for x in y_pred])\n y_true = np.array([label_to_ind.get(x, n_labels + 1) for x in y_true])\n ind = np.logical_and(y_pred < n_labels, y_true < n_labels)\n if not np.all(ind):\n y_pred = y_pred[ind]\n y_true = y_true[ind]\n sample_weight = sample_weight[ind]\n if sample_weight.dtype.kind in {'i', 'u', 'b'}:\n dtype = np.int64\n else:\n dtype = np.float64\n cm = coo_matrix((sample_weight, (y_true, y_pred)), shape=(n_labels, n_labels), dtype=dtype).toarray()\n with np.errstate(all='ignore'):\n if normalize == 'true':\n cm = cm / cm.sum(axis=1, keepdims=True)\n elif normalize == 'pred':\n cm = cm / cm.sum(axis=0, keepdims=True)\n elif normalize == 'all':\n cm = cm / cm.sum()\n cm = np.nan_to_num(cm)\n return cm" + "description": "Compute confusion matrix to evaluate the accuracy of a classification.\n\nBy definition a confusion matrix :math:`C` is such that :math:`C_{i, j}`\nis equal to the number of observations known to be in group :math:`i` and\npredicted to be in group :math:`j`.\n\nThus in binary classification, the count of true negatives is\n:math:`C_{0,0}`, false negatives is :math:`C_{1,0}`, true positives is\n:math:`C_{1,1}` and false positives is :math:`C_{0,1}`.\n\nRead more in the :ref:`User Guide `.", + "docstring": "Compute confusion matrix to evaluate the accuracy of a classification.\n\n By definition a confusion matrix :math:`C` is such that :math:`C_{i, j}`\n is equal to the number of observations known to be in group :math:`i` and\n predicted to be in group :math:`j`.\n\n Thus in 
binary classification, the count of true negatives is\n :math:`C_{0,0}`, false negatives is :math:`C_{1,0}`, true positives is\n :math:`C_{1,1}` and false positives is :math:`C_{0,1}`.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n y_true : array-like of shape (n_samples,)\n Ground truth (correct) target values.\n\n y_pred : array-like of shape (n_samples,)\n Estimated targets as returned by a classifier.\n\n labels : array-like of shape (n_classes), default=None\n List of labels to index the matrix. This may be used to reorder\n or select a subset of labels.\n If ``None`` is given, those that appear at least once\n in ``y_true`` or ``y_pred`` are used in sorted order.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n .. versionadded:: 0.18\n\n normalize : {'true', 'pred', 'all'}, default=None\n Normalizes confusion matrix over the true (rows), predicted (columns)\n conditions or all the population. If None, confusion matrix will not be\n normalized.\n\n Returns\n -------\n C : ndarray of shape (n_classes, n_classes)\n Confusion matrix whose i-th row and j-th\n column entry indicates the number of\n samples with true label being i-th class\n and predicted label being j-th class.\n\n See Also\n --------\n ConfusionMatrixDisplay.from_estimator : Plot the confusion matrix\n given an estimator, the data, and the label.\n ConfusionMatrixDisplay.from_predictions : Plot the confusion matrix\n given the true and predicted labels.\n ConfusionMatrixDisplay : Confusion Matrix visualization.\n\n References\n ----------\n .. [1] `Wikipedia entry for the Confusion matrix\n `_\n (Wikipedia and other references may use a different\n convention for axes).\n\n Examples\n --------\n >>> from sklearn.metrics import confusion_matrix\n >>> y_true = [2, 0, 2, 2, 0, 1]\n >>> y_pred = [0, 0, 2, 2, 0, 2]\n >>> confusion_matrix(y_true, y_pred)\n array([[2, 0, 0],\n [0, 0, 1],\n [1, 0, 2]])\n\n >>> y_true = [\"cat\", \"ant\", \"cat\", \"cat\", \"ant\", \"bird\"]\n >>> y_pred = [\"ant\", \"ant\", \"cat\", \"cat\", \"ant\", \"cat\"]\n >>> confusion_matrix(y_true, y_pred, labels=[\"ant\", \"bird\", \"cat\"])\n array([[2, 0, 0],\n [0, 0, 1],\n [1, 0, 2]])\n\n In the binary case, we can extract true positives, etc as follows:\n\n >>> tn, fp, fn, tp = confusion_matrix([0, 1, 0, 1], [1, 1, 1, 0]).ravel()\n >>> (tn, fp, fn, tp)\n (0, 2, 1, 1)\n ", + "source_code": "\ndef confusion_matrix(y_true, y_pred, *, labels=None, sample_weight=None, normalize=None):\n \"\"\"Compute confusion matrix to evaluate the accuracy of a classification.\n\n By definition a confusion matrix :math:`C` is such that :math:`C_{i, j}`\n is equal to the number of observations known to be in group :math:`i` and\n predicted to be in group :math:`j`.\n\n Thus in binary classification, the count of true negatives is\n :math:`C_{0,0}`, false negatives is :math:`C_{1,0}`, true positives is\n :math:`C_{1,1}` and false positives is :math:`C_{0,1}`.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n y_true : array-like of shape (n_samples,)\n Ground truth (correct) target values.\n\n y_pred : array-like of shape (n_samples,)\n Estimated targets as returned by a classifier.\n\n labels : array-like of shape (n_classes), default=None\n List of labels to index the matrix. 
This may be used to reorder\n or select a subset of labels.\n If ``None`` is given, those that appear at least once\n in ``y_true`` or ``y_pred`` are used in sorted order.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n .. versionadded:: 0.18\n\n normalize : {'true', 'pred', 'all'}, default=None\n Normalizes confusion matrix over the true (rows), predicted (columns)\n conditions or all the population. If None, confusion matrix will not be\n normalized.\n\n Returns\n -------\n C : ndarray of shape (n_classes, n_classes)\n Confusion matrix whose i-th row and j-th\n column entry indicates the number of\n samples with true label being i-th class\n and predicted label being j-th class.\n\n See Also\n --------\n ConfusionMatrixDisplay.from_estimator : Plot the confusion matrix\n given an estimator, the data, and the label.\n ConfusionMatrixDisplay.from_predictions : Plot the confusion matrix\n given the true and predicted labels.\n ConfusionMatrixDisplay : Confusion Matrix visualization.\n\n References\n ----------\n .. [1] `Wikipedia entry for the Confusion matrix\n `_\n (Wikipedia and other references may use a different\n convention for axes).\n\n Examples\n --------\n >>> from sklearn.metrics import confusion_matrix\n >>> y_true = [2, 0, 2, 2, 0, 1]\n >>> y_pred = [0, 0, 2, 2, 0, 2]\n >>> confusion_matrix(y_true, y_pred)\n array([[2, 0, 0],\n [0, 0, 1],\n [1, 0, 2]])\n\n >>> y_true = [\"cat\", \"ant\", \"cat\", \"cat\", \"ant\", \"bird\"]\n >>> y_pred = [\"ant\", \"ant\", \"cat\", \"cat\", \"ant\", \"cat\"]\n >>> confusion_matrix(y_true, y_pred, labels=[\"ant\", \"bird\", \"cat\"])\n array([[2, 0, 0],\n [0, 0, 1],\n [1, 0, 2]])\n\n In the binary case, we can extract true positives, etc as follows:\n\n >>> tn, fp, fn, tp = confusion_matrix([0, 1, 0, 1], [1, 1, 1, 0]).ravel()\n >>> (tn, fp, fn, tp)\n (0, 2, 1, 1)\n \"\"\"\n (y_type, y_true, y_pred) = _check_targets(y_true, y_pred)\n if y_type not in ('binary', 'multiclass'):\n raise ValueError('%s is not supported' % y_type)\n if labels is None:\n labels = unique_labels(y_true, y_pred)\n else:\n labels = np.asarray(labels)\n n_labels = labels.size\n if n_labels == 0:\n raise ValueError(\"'labels' should contains at least one label.\")\n elif y_true.size == 0:\n return np.zeros((n_labels, n_labels), dtype=int)\n elif len(np.intersect1d(y_true, labels)) == 0:\n raise ValueError('At least one label specified must be in y_true')\n if sample_weight is None:\n sample_weight = np.ones(y_true.shape[0], dtype=np.int64)\n else:\n sample_weight = np.asarray(sample_weight)\n check_consistent_length(y_true, y_pred, sample_weight)\n if normalize not in ['true', 'pred', 'all', None]:\n raise ValueError(\"normalize must be one of {'true', 'pred', 'all', None}\")\n n_labels = labels.size\n need_index_conversion = not (labels.dtype.kind in {'i', 'u', 'b'} and np.all(labels == np.arange(n_labels)) and y_true.min() >= 0 and y_pred.min() >= 0)\n if need_index_conversion:\n label_to_ind = {y: x for (x, y) in enumerate(labels)}\n y_pred = np.array([label_to_ind.get(x, n_labels + 1) for x in y_pred])\n y_true = np.array([label_to_ind.get(x, n_labels + 1) for x in y_true])\n ind = np.logical_and(y_pred < n_labels, y_true < n_labels)\n if not np.all(ind):\n y_pred = y_pred[ind]\n y_true = y_true[ind]\n sample_weight = sample_weight[ind]\n if sample_weight.dtype.kind in {'i', 'u', 'b'}:\n dtype = np.int64\n else:\n dtype = np.float64\n cm = coo_matrix((sample_weight, (y_true, y_pred)), shape=(n_labels, n_labels), 
dtype=dtype).toarray()\n with np.errstate(all='ignore'):\n if normalize == 'true':\n cm = cm / cm.sum(axis=1, keepdims=True)\n elif normalize == 'pred':\n cm = cm / cm.sum(axis=0, keepdims=True)\n elif normalize == 'all':\n cm = cm / cm.sum()\n cm = np.nan_to_num(cm)\n return cm" }, { "name": "f1_score", @@ -117368,7 +126054,8 @@ "docstring": { "type": "1d array-like, or label indicator array / sparse matrix", "description": "Ground truth (correct) target values." - } + }, + "refined_type": {} }, { "name": "y_pred", @@ -117378,7 +126065,8 @@ "docstring": { "type": "1d array-like, or label indicator array / sparse matrix", "description": "Estimated targets as returned by a classifier." - } + }, + "refined_type": {} }, { "name": "labels", @@ -117388,7 +126076,8 @@ "docstring": { "type": "array-like, default=None", "description": "The set of labels to include when ``average != 'binary'``, and their\norder if ``average is None``. Labels present in the data can be\nexcluded, for example to calculate a multiclass average ignoring a\nmajority negative class, while labels not present in the data will\nresult in 0 components in a macro average. For multilabel targets,\nlabels are column indices. By default, all labels in ``y_true`` and\n``y_pred`` are used in sorted order.\n\n.. versionchanged:: 0.17\n Parameter `labels` improved for multiclass problem." - } + }, + "refined_type": {} }, { "name": "pos_label", @@ -117398,7 +126087,8 @@ "docstring": { "type": "str or int, default=1", "description": "The class to report if ``average='binary'`` and the data is binary.\nIf the data are multiclass or multilabel, this will be ignored;\nsetting ``labels=[pos_label]`` and ``average != 'binary'`` will report\nscores for that label only." - } + }, + "refined_type": {} }, { "name": "average", @@ -117408,6 +126098,16 @@ "docstring": { "type": "{'micro', 'macro', 'samples','weighted', 'binary'} or None, default='binary'", "description": "This parameter is required for multiclass/multilabel targets.\nIf ``None``, the scores for each class are returned. Otherwise, this\ndetermines the type of averaging performed on the data:\n\n``'binary'``:\n Only report results for the class specified by ``pos_label``.\n This is applicable only if targets (``y_{true,pred}``) are binary.\n``'micro'``:\n Calculate metrics globally by counting the total true positives,\n false negatives and false positives.\n``'macro'``:\n Calculate metrics for each label, and find their unweighted\n mean. This does not take label imbalance into account.\n``'weighted'``:\n Calculate metrics for each label, and find their average weighted\n by support (the number of true instances for each label). This\n alters 'macro' to account for label imbalance; it can result in an\n F-score that is not between precision and recall.\n``'samples'``:\n Calculate metrics for each instance, and find their average (only\n meaningful for multilabel classification where this differs from\n :func:`accuracy_score`)." + }, + "refined_type": { + "kind": "EnumType", + "values": [ + "samples", + "weighted", + "micro", + "macro", + "binary" + ] } }, { @@ -117418,7 +126118,8 @@ "docstring": { "type": "array-like of shape (n_samples,), default=None", "description": "Sample weights." - } + }, + "refined_type": {} }, { "name": "zero_division", @@ -117428,13 +126129,14 @@ "docstring": { "type": "\"warn\", 0 or 1, default=\"warn\"", "description": "Sets the value to return when there is a zero division, i.e. when all\npredictions and labels are negative. 
If set to \"warn\", this acts as 0,\nbut warnings are also raised." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Compute the F1 score, also known as balanced F-score or F-measure.\n\nThe F1 score can be interpreted as a harmonic mean of the precision and recall, where an F1 score reaches its best value at 1 and worst score at 0. The relative contribution of precision and recall to the F1 score are equal. The formula for the F1 score is:: F1 = 2 * (precision * recall) / (precision + recall) In the multi-class and multi-label case, this is the average of the F1 score of each class with weighting depending on the ``average`` parameter. Read more in the :ref:`User Guide `.", - "docstring": "Compute the F1 score, also known as balanced F-score or F-measure.\n\nThe F1 score can be interpreted as a harmonic mean of the precision and\nrecall, where an F1 score reaches its best value at 1 and worst score at 0.\nThe relative contribution of precision and recall to the F1 score are\nequal. The formula for the F1 score is::\n\n F1 = 2 * (precision * recall) / (precision + recall)\n\nIn the multi-class and multi-label case, this is the average of\nthe F1 score of each class with weighting depending on the ``average``\nparameter.\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\ny_true : 1d array-like, or label indicator array / sparse matrix\n Ground truth (correct) target values.\n\ny_pred : 1d array-like, or label indicator array / sparse matrix\n Estimated targets as returned by a classifier.\n\nlabels : array-like, default=None\n The set of labels to include when ``average != 'binary'``, and their\n order if ``average is None``. Labels present in the data can be\n excluded, for example to calculate a multiclass average ignoring a\n majority negative class, while labels not present in the data will\n result in 0 components in a macro average. For multilabel targets,\n labels are column indices. By default, all labels in ``y_true`` and\n ``y_pred`` are used in sorted order.\n\n .. versionchanged:: 0.17\n Parameter `labels` improved for multiclass problem.\n\npos_label : str or int, default=1\n The class to report if ``average='binary'`` and the data is binary.\n If the data are multiclass or multilabel, this will be ignored;\n setting ``labels=[pos_label]`` and ``average != 'binary'`` will report\n scores for that label only.\n\naverage : {'micro', 'macro', 'samples','weighted', 'binary'} or None, default='binary'\n This parameter is required for multiclass/multilabel targets.\n If ``None``, the scores for each class are returned. Otherwise, this\n determines the type of averaging performed on the data:\n\n ``'binary'``:\n Only report results for the class specified by ``pos_label``.\n This is applicable only if targets (``y_{true,pred}``) are binary.\n ``'micro'``:\n Calculate metrics globally by counting the total true positives,\n false negatives and false positives.\n ``'macro'``:\n Calculate metrics for each label, and find their unweighted\n mean. This does not take label imbalance into account.\n ``'weighted'``:\n Calculate metrics for each label, and find their average weighted\n by support (the number of true instances for each label). 
This\n alters 'macro' to account for label imbalance; it can result in an\n F-score that is not between precision and recall.\n ``'samples'``:\n Calculate metrics for each instance, and find their average (only\n meaningful for multilabel classification where this differs from\n :func:`accuracy_score`).\n\nsample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\nzero_division : \"warn\", 0 or 1, default=\"warn\"\n Sets the value to return when there is a zero division, i.e. when all\n predictions and labels are negative. If set to \"warn\", this acts as 0,\n but warnings are also raised.\n\nReturns\n-------\nf1_score : float or array of float, shape = [n_unique_labels]\n F1 score of the positive class in binary classification or weighted\n average of the F1 scores of each class for the multiclass task.\n\nSee Also\n--------\nfbeta_score, precision_recall_fscore_support, jaccard_score,\nmultilabel_confusion_matrix\n\nReferences\n----------\n.. [1] `Wikipedia entry for the F1-score\n `_.\n\nExamples\n--------\n>>> from sklearn.metrics import f1_score\n>>> y_true = [0, 1, 2, 0, 1, 2]\n>>> y_pred = [0, 2, 1, 0, 0, 1]\n>>> f1_score(y_true, y_pred, average='macro')\n0.26...\n>>> f1_score(y_true, y_pred, average='micro')\n0.33...\n>>> f1_score(y_true, y_pred, average='weighted')\n0.26...\n>>> f1_score(y_true, y_pred, average=None)\narray([0.8, 0. , 0. ])\n>>> y_true = [0, 0, 0, 0, 0, 0]\n>>> y_pred = [0, 0, 0, 0, 0, 0]\n>>> f1_score(y_true, y_pred, zero_division=1)\n1.0...\n>>> # multilabel classification\n>>> y_true = [[0, 0, 0], [1, 1, 1], [0, 1, 1]]\n>>> y_pred = [[0, 0, 0], [1, 1, 1], [1, 1, 0]]\n>>> f1_score(y_true, y_pred, average=None)\narray([0.66666667, 1. , 0.66666667])\n\nNotes\n-----\nWhen ``true positive + false positive == 0``, precision is undefined.\nWhen ``true positive + false negative == 0``, recall is undefined.\nIn such cases, by default the metric will be set to 0, as will f-score,\nand ``UndefinedMetricWarning`` will be raised. This behavior can be\nmodified with ``zero_division``.", + "description": "Compute the F1 score, also known as balanced F-score or F-measure.\n\nThe F1 score can be interpreted as a harmonic mean of the precision and\nrecall, where an F1 score reaches its best value at 1 and worst score at 0.\nThe relative contribution of precision and recall to the F1 score are\nequal. The formula for the F1 score is::\n\n F1 = 2 * (precision * recall) / (precision + recall)\n\nIn the multi-class and multi-label case, this is the average of\nthe F1 score of each class with weighting depending on the ``average``\nparameter.\n\nRead more in the :ref:`User Guide `.", + "docstring": "Compute the F1 score, also known as balanced F-score or F-measure.\n\n The F1 score can be interpreted as a harmonic mean of the precision and\n recall, where an F1 score reaches its best value at 1 and worst score at 0.\n The relative contribution of precision and recall to the F1 score are\n equal. 
The formula for the F1 score is::\n\n F1 = 2 * (precision * recall) / (precision + recall)\n\n In the multi-class and multi-label case, this is the average of\n the F1 score of each class with weighting depending on the ``average``\n parameter.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n y_true : 1d array-like, or label indicator array / sparse matrix\n Ground truth (correct) target values.\n\n y_pred : 1d array-like, or label indicator array / sparse matrix\n Estimated targets as returned by a classifier.\n\n labels : array-like, default=None\n The set of labels to include when ``average != 'binary'``, and their\n order if ``average is None``. Labels present in the data can be\n excluded, for example to calculate a multiclass average ignoring a\n majority negative class, while labels not present in the data will\n result in 0 components in a macro average. For multilabel targets,\n labels are column indices. By default, all labels in ``y_true`` and\n ``y_pred`` are used in sorted order.\n\n .. versionchanged:: 0.17\n Parameter `labels` improved for multiclass problem.\n\n pos_label : str or int, default=1\n The class to report if ``average='binary'`` and the data is binary.\n If the data are multiclass or multilabel, this will be ignored;\n setting ``labels=[pos_label]`` and ``average != 'binary'`` will report\n scores for that label only.\n\n average : {'micro', 'macro', 'samples','weighted', 'binary'} or None, default='binary'\n This parameter is required for multiclass/multilabel targets.\n If ``None``, the scores for each class are returned. Otherwise, this\n determines the type of averaging performed on the data:\n\n ``'binary'``:\n Only report results for the class specified by ``pos_label``.\n This is applicable only if targets (``y_{true,pred}``) are binary.\n ``'micro'``:\n Calculate metrics globally by counting the total true positives,\n false negatives and false positives.\n ``'macro'``:\n Calculate metrics for each label, and find their unweighted\n mean. This does not take label imbalance into account.\n ``'weighted'``:\n Calculate metrics for each label, and find their average weighted\n by support (the number of true instances for each label). This\n alters 'macro' to account for label imbalance; it can result in an\n F-score that is not between precision and recall.\n ``'samples'``:\n Calculate metrics for each instance, and find their average (only\n meaningful for multilabel classification where this differs from\n :func:`accuracy_score`).\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n zero_division : \"warn\", 0 or 1, default=\"warn\"\n Sets the value to return when there is a zero division, i.e. when all\n predictions and labels are negative. If set to \"warn\", this acts as 0,\n but warnings are also raised.\n\n Returns\n -------\n f1_score : float or array of float, shape = [n_unique_labels]\n F1 score of the positive class in binary classification or weighted\n average of the F1 scores of each class for the multiclass task.\n\n See Also\n --------\n fbeta_score, precision_recall_fscore_support, jaccard_score,\n multilabel_confusion_matrix\n\n References\n ----------\n .. 
[1] `Wikipedia entry for the F1-score\n `_.\n\n Examples\n --------\n >>> from sklearn.metrics import f1_score\n >>> y_true = [0, 1, 2, 0, 1, 2]\n >>> y_pred = [0, 2, 1, 0, 0, 1]\n >>> f1_score(y_true, y_pred, average='macro')\n 0.26...\n >>> f1_score(y_true, y_pred, average='micro')\n 0.33...\n >>> f1_score(y_true, y_pred, average='weighted')\n 0.26...\n >>> f1_score(y_true, y_pred, average=None)\n array([0.8, 0. , 0. ])\n >>> y_true = [0, 0, 0, 0, 0, 0]\n >>> y_pred = [0, 0, 0, 0, 0, 0]\n >>> f1_score(y_true, y_pred, zero_division=1)\n 1.0...\n >>> # multilabel classification\n >>> y_true = [[0, 0, 0], [1, 1, 1], [0, 1, 1]]\n >>> y_pred = [[0, 0, 0], [1, 1, 1], [1, 1, 0]]\n >>> f1_score(y_true, y_pred, average=None)\n array([0.66666667, 1. , 0.66666667])\n\n Notes\n -----\n When ``true positive + false positive == 0``, precision is undefined.\n When ``true positive + false negative == 0``, recall is undefined.\n In such cases, by default the metric will be set to 0, as will f-score,\n and ``UndefinedMetricWarning`` will be raised. This behavior can be\n modified with ``zero_division``.\n ", "source_code": "\ndef f1_score(y_true, y_pred, *, labels=None, pos_label=1, average='binary', sample_weight=None, zero_division='warn'):\n \"\"\"Compute the F1 score, also known as balanced F-score or F-measure.\n\n The F1 score can be interpreted as a harmonic mean of the precision and\n recall, where an F1 score reaches its best value at 1 and worst score at 0.\n The relative contribution of precision and recall to the F1 score are\n equal. The formula for the F1 score is::\n\n F1 = 2 * (precision * recall) / (precision + recall)\n\n In the multi-class and multi-label case, this is the average of\n the F1 score of each class with weighting depending on the ``average``\n parameter.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n y_true : 1d array-like, or label indicator array / sparse matrix\n Ground truth (correct) target values.\n\n y_pred : 1d array-like, or label indicator array / sparse matrix\n Estimated targets as returned by a classifier.\n\n labels : array-like, default=None\n The set of labels to include when ``average != 'binary'``, and their\n order if ``average is None``. Labels present in the data can be\n excluded, for example to calculate a multiclass average ignoring a\n majority negative class, while labels not present in the data will\n result in 0 components in a macro average. For multilabel targets,\n labels are column indices. By default, all labels in ``y_true`` and\n ``y_pred`` are used in sorted order.\n\n .. versionchanged:: 0.17\n Parameter `labels` improved for multiclass problem.\n\n pos_label : str or int, default=1\n The class to report if ``average='binary'`` and the data is binary.\n If the data are multiclass or multilabel, this will be ignored;\n setting ``labels=[pos_label]`` and ``average != 'binary'`` will report\n scores for that label only.\n\n average : {'micro', 'macro', 'samples','weighted', 'binary'} or None, default='binary'\n This parameter is required for multiclass/multilabel targets.\n If ``None``, the scores for each class are returned. 
Otherwise, this\n determines the type of averaging performed on the data:\n\n ``'binary'``:\n Only report results for the class specified by ``pos_label``.\n This is applicable only if targets (``y_{true,pred}``) are binary.\n ``'micro'``:\n Calculate metrics globally by counting the total true positives,\n false negatives and false positives.\n ``'macro'``:\n Calculate metrics for each label, and find their unweighted\n mean. This does not take label imbalance into account.\n ``'weighted'``:\n Calculate metrics for each label, and find their average weighted\n by support (the number of true instances for each label). This\n alters 'macro' to account for label imbalance; it can result in an\n F-score that is not between precision and recall.\n ``'samples'``:\n Calculate metrics for each instance, and find their average (only\n meaningful for multilabel classification where this differs from\n :func:`accuracy_score`).\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n zero_division : \"warn\", 0 or 1, default=\"warn\"\n Sets the value to return when there is a zero division, i.e. when all\n predictions and labels are negative. If set to \"warn\", this acts as 0,\n but warnings are also raised.\n\n Returns\n -------\n f1_score : float or array of float, shape = [n_unique_labels]\n F1 score of the positive class in binary classification or weighted\n average of the F1 scores of each class for the multiclass task.\n\n See Also\n --------\n fbeta_score, precision_recall_fscore_support, jaccard_score,\n multilabel_confusion_matrix\n\n References\n ----------\n .. [1] `Wikipedia entry for the F1-score\n `_.\n\n Examples\n --------\n >>> from sklearn.metrics import f1_score\n >>> y_true = [0, 1, 2, 0, 1, 2]\n >>> y_pred = [0, 2, 1, 0, 0, 1]\n >>> f1_score(y_true, y_pred, average='macro')\n 0.26...\n >>> f1_score(y_true, y_pred, average='micro')\n 0.33...\n >>> f1_score(y_true, y_pred, average='weighted')\n 0.26...\n >>> f1_score(y_true, y_pred, average=None)\n array([0.8, 0. , 0. ])\n >>> y_true = [0, 0, 0, 0, 0, 0]\n >>> y_pred = [0, 0, 0, 0, 0, 0]\n >>> f1_score(y_true, y_pred, zero_division=1)\n 1.0...\n >>> # multilabel classification\n >>> y_true = [[0, 0, 0], [1, 1, 1], [0, 1, 1]]\n >>> y_pred = [[0, 0, 0], [1, 1, 1], [1, 1, 0]]\n >>> f1_score(y_true, y_pred, average=None)\n array([0.66666667, 1. , 0.66666667])\n\n Notes\n -----\n When ``true positive + false positive == 0``, precision is undefined.\n When ``true positive + false negative == 0``, recall is undefined.\n In such cases, by default the metric will be set to 0, as will f-score,\n and ``UndefinedMetricWarning`` will be raised. This behavior can be\n modified with ``zero_division``.\n \"\"\"\n return fbeta_score(y_true, y_pred, beta=1, labels=labels, pos_label=pos_label, average=average, sample_weight=sample_weight, zero_division=zero_division)" }, { @@ -117452,7 +126154,8 @@ "docstring": { "type": "1d array-like, or label indicator array / sparse matrix", "description": "Ground truth (correct) target values." - } + }, + "refined_type": {} }, { "name": "y_pred", @@ -117462,7 +126165,8 @@ "docstring": { "type": "1d array-like, or label indicator array / sparse matrix", "description": "Estimated targets as returned by a classifier." - } + }, + "refined_type": {} }, { "name": "beta", @@ -117472,7 +126176,8 @@ "docstring": { "type": "float", "description": "Determines the weight of recall in the combined score." 
- } + }, + "refined_type": {} }, { "name": "labels", @@ -117482,7 +126187,8 @@ "docstring": { "type": "array-like, default=None", "description": "The set of labels to include when ``average != 'binary'``, and their\norder if ``average is None``. Labels present in the data can be\nexcluded, for example to calculate a multiclass average ignoring a\nmajority negative class, while labels not present in the data will\nresult in 0 components in a macro average. For multilabel targets,\nlabels are column indices. By default, all labels in ``y_true`` and\n``y_pred`` are used in sorted order.\n\n.. versionchanged:: 0.17\n Parameter `labels` improved for multiclass problem." - } + }, + "refined_type": {} }, { "name": "pos_label", @@ -117492,7 +126198,8 @@ "docstring": { "type": "str or int, default=1", "description": "The class to report if ``average='binary'`` and the data is binary.\nIf the data are multiclass or multilabel, this will be ignored;\nsetting ``labels=[pos_label]`` and ``average != 'binary'`` will report\nscores for that label only." - } + }, + "refined_type": {} }, { "name": "average", @@ -117502,6 +126209,16 @@ "docstring": { "type": "{'micro', 'macro', 'samples', 'weighted', 'binary'} or None, default='binary'", "description": "This parameter is required for multiclass/multilabel targets.\nIf ``None``, the scores for each class are returned. Otherwise, this\ndetermines the type of averaging performed on the data:\n\n``'binary'``:\n Only report results for the class specified by ``pos_label``.\n This is applicable only if targets (``y_{true,pred}``) are binary.\n``'micro'``:\n Calculate metrics globally by counting the total true positives,\n false negatives and false positives.\n``'macro'``:\n Calculate metrics for each label, and find their unweighted\n mean. This does not take label imbalance into account.\n``'weighted'``:\n Calculate metrics for each label, and find their average weighted\n by support (the number of true instances for each label). This\n alters 'macro' to account for label imbalance; it can result in an\n F-score that is not between precision and recall.\n``'samples'``:\n Calculate metrics for each instance, and find their average (only\n meaningful for multilabel classification where this differs from\n :func:`accuracy_score`)." + }, + "refined_type": { + "kind": "EnumType", + "values": [ + "samples", + "weighted", + "micro", + "macro", + "binary" + ] } }, { @@ -117512,7 +126229,8 @@ "docstring": { "type": "array-like of shape (n_samples,), default=None", "description": "Sample weights." - } + }, + "refined_type": {} }, { "name": "zero_division", @@ -117522,13 +126240,14 @@ "docstring": { "type": "\"warn\", 0 or 1, default=\"warn\"", "description": "Sets the value to return when there is a zero division, i.e. when all\npredictions and labels are negative. If set to \"warn\", this acts as 0,\nbut warnings are also raised." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Compute the F-beta score.\n\nThe F-beta score is the weighted harmonic mean of precision and recall, reaching its optimal value at 1 and its worst value at 0. The `beta` parameter determines the weight of recall in the combined score. ``beta < 1`` lends more weight to precision, while ``beta > 1`` favors recall (``beta -> 0`` considers only precision, ``beta -> +inf`` only recall). 
Read more in the :ref:`User Guide `.", - "docstring": "Compute the F-beta score.\n\nThe F-beta score is the weighted harmonic mean of precision and recall,\nreaching its optimal value at 1 and its worst value at 0.\n\nThe `beta` parameter determines the weight of recall in the combined\nscore. ``beta < 1`` lends more weight to precision, while ``beta > 1``\nfavors recall (``beta -> 0`` considers only precision, ``beta -> +inf``\nonly recall).\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\ny_true : 1d array-like, or label indicator array / sparse matrix\n Ground truth (correct) target values.\n\ny_pred : 1d array-like, or label indicator array / sparse matrix\n Estimated targets as returned by a classifier.\n\nbeta : float\n Determines the weight of recall in the combined score.\n\nlabels : array-like, default=None\n The set of labels to include when ``average != 'binary'``, and their\n order if ``average is None``. Labels present in the data can be\n excluded, for example to calculate a multiclass average ignoring a\n majority negative class, while labels not present in the data will\n result in 0 components in a macro average. For multilabel targets,\n labels are column indices. By default, all labels in ``y_true`` and\n ``y_pred`` are used in sorted order.\n\n .. versionchanged:: 0.17\n Parameter `labels` improved for multiclass problem.\n\npos_label : str or int, default=1\n The class to report if ``average='binary'`` and the data is binary.\n If the data are multiclass or multilabel, this will be ignored;\n setting ``labels=[pos_label]`` and ``average != 'binary'`` will report\n scores for that label only.\n\naverage : {'micro', 'macro', 'samples', 'weighted', 'binary'} or None, default='binary'\n This parameter is required for multiclass/multilabel targets.\n If ``None``, the scores for each class are returned. Otherwise, this\n determines the type of averaging performed on the data:\n\n ``'binary'``:\n Only report results for the class specified by ``pos_label``.\n This is applicable only if targets (``y_{true,pred}``) are binary.\n ``'micro'``:\n Calculate metrics globally by counting the total true positives,\n false negatives and false positives.\n ``'macro'``:\n Calculate metrics for each label, and find their unweighted\n mean. This does not take label imbalance into account.\n ``'weighted'``:\n Calculate metrics for each label, and find their average weighted\n by support (the number of true instances for each label). This\n alters 'macro' to account for label imbalance; it can result in an\n F-score that is not between precision and recall.\n ``'samples'``:\n Calculate metrics for each instance, and find their average (only\n meaningful for multilabel classification where this differs from\n :func:`accuracy_score`).\n\nsample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\nzero_division : \"warn\", 0 or 1, default=\"warn\"\n Sets the value to return when there is a zero division, i.e. when all\n predictions and labels are negative. 
If set to \"warn\", this acts as 0,\n but warnings are also raised.\n\nReturns\n-------\nfbeta_score : float (if average is not None) or array of float, shape = [n_unique_labels]\n F-beta score of the positive class in binary classification or weighted\n average of the F-beta score of each class for the multiclass task.\n\nSee Also\n--------\nprecision_recall_fscore_support, multilabel_confusion_matrix\n\nNotes\n-----\nWhen ``true positive + false positive == 0`` or\n``true positive + false negative == 0``, f-score returns 0 and raises\n``UndefinedMetricWarning``. This behavior can be\nmodified with ``zero_division``.\n\nReferences\n----------\n.. [1] R. Baeza-Yates and B. Ribeiro-Neto (2011).\n Modern Information Retrieval. Addison Wesley, pp. 327-328.\n\n.. [2] `Wikipedia entry for the F1-score\n `_.\n\nExamples\n--------\n>>> from sklearn.metrics import fbeta_score\n>>> y_true = [0, 1, 2, 0, 1, 2]\n>>> y_pred = [0, 2, 1, 0, 0, 1]\n>>> fbeta_score(y_true, y_pred, average='macro', beta=0.5)\n0.23...\n>>> fbeta_score(y_true, y_pred, average='micro', beta=0.5)\n0.33...\n>>> fbeta_score(y_true, y_pred, average='weighted', beta=0.5)\n0.23...\n>>> fbeta_score(y_true, y_pred, average=None, beta=0.5)\narray([0.71..., 0. , 0. ])", + "description": "Compute the F-beta score.\n\nThe F-beta score is the weighted harmonic mean of precision and recall,\nreaching its optimal value at 1 and its worst value at 0.\n\nThe `beta` parameter determines the weight of recall in the combined\nscore. ``beta < 1`` lends more weight to precision, while ``beta > 1``\nfavors recall (``beta -> 0`` considers only precision, ``beta -> +inf``\nonly recall).\n\nRead more in the :ref:`User Guide `.", + "docstring": "Compute the F-beta score.\n\n The F-beta score is the weighted harmonic mean of precision and recall,\n reaching its optimal value at 1 and its worst value at 0.\n\n The `beta` parameter determines the weight of recall in the combined\n score. ``beta < 1`` lends more weight to precision, while ``beta > 1``\n favors recall (``beta -> 0`` considers only precision, ``beta -> +inf``\n only recall).\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n y_true : 1d array-like, or label indicator array / sparse matrix\n Ground truth (correct) target values.\n\n y_pred : 1d array-like, or label indicator array / sparse matrix\n Estimated targets as returned by a classifier.\n\n beta : float\n Determines the weight of recall in the combined score.\n\n labels : array-like, default=None\n The set of labels to include when ``average != 'binary'``, and their\n order if ``average is None``. Labels present in the data can be\n excluded, for example to calculate a multiclass average ignoring a\n majority negative class, while labels not present in the data will\n result in 0 components in a macro average. For multilabel targets,\n labels are column indices. By default, all labels in ``y_true`` and\n ``y_pred`` are used in sorted order.\n\n .. versionchanged:: 0.17\n Parameter `labels` improved for multiclass problem.\n\n pos_label : str or int, default=1\n The class to report if ``average='binary'`` and the data is binary.\n If the data are multiclass or multilabel, this will be ignored;\n setting ``labels=[pos_label]`` and ``average != 'binary'`` will report\n scores for that label only.\n\n average : {'micro', 'macro', 'samples', 'weighted', 'binary'} or None, default='binary'\n This parameter is required for multiclass/multilabel targets.\n If ``None``, the scores for each class are returned. 
Otherwise, this\n determines the type of averaging performed on the data:\n\n ``'binary'``:\n Only report results for the class specified by ``pos_label``.\n This is applicable only if targets (``y_{true,pred}``) are binary.\n ``'micro'``:\n Calculate metrics globally by counting the total true positives,\n false negatives and false positives.\n ``'macro'``:\n Calculate metrics for each label, and find their unweighted\n mean. This does not take label imbalance into account.\n ``'weighted'``:\n Calculate metrics for each label, and find their average weighted\n by support (the number of true instances for each label). This\n alters 'macro' to account for label imbalance; it can result in an\n F-score that is not between precision and recall.\n ``'samples'``:\n Calculate metrics for each instance, and find their average (only\n meaningful for multilabel classification where this differs from\n :func:`accuracy_score`).\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n zero_division : \"warn\", 0 or 1, default=\"warn\"\n Sets the value to return when there is a zero division, i.e. when all\n predictions and labels are negative. If set to \"warn\", this acts as 0,\n but warnings are also raised.\n\n Returns\n -------\n fbeta_score : float (if average is not None) or array of float, shape = [n_unique_labels]\n F-beta score of the positive class in binary classification or weighted\n average of the F-beta score of each class for the multiclass task.\n\n See Also\n --------\n precision_recall_fscore_support, multilabel_confusion_matrix\n\n Notes\n -----\n When ``true positive + false positive == 0`` or\n ``true positive + false negative == 0``, f-score returns 0 and raises\n ``UndefinedMetricWarning``. This behavior can be\n modified with ``zero_division``.\n\n References\n ----------\n .. [1] R. Baeza-Yates and B. Ribeiro-Neto (2011).\n Modern Information Retrieval. Addison Wesley, pp. 327-328.\n\n .. [2] `Wikipedia entry for the F1-score\n `_.\n\n Examples\n --------\n >>> from sklearn.metrics import fbeta_score\n >>> y_true = [0, 1, 2, 0, 1, 2]\n >>> y_pred = [0, 2, 1, 0, 0, 1]\n >>> fbeta_score(y_true, y_pred, average='macro', beta=0.5)\n 0.23...\n >>> fbeta_score(y_true, y_pred, average='micro', beta=0.5)\n 0.33...\n >>> fbeta_score(y_true, y_pred, average='weighted', beta=0.5)\n 0.23...\n >>> fbeta_score(y_true, y_pred, average=None, beta=0.5)\n array([0.71..., 0. , 0. ])\n ", "source_code": "\ndef fbeta_score(y_true, y_pred, *, beta, labels=None, pos_label=1, average='binary', sample_weight=None, zero_division='warn'):\n \"\"\"Compute the F-beta score.\n\n The F-beta score is the weighted harmonic mean of precision and recall,\n reaching its optimal value at 1 and its worst value at 0.\n\n The `beta` parameter determines the weight of recall in the combined\n score. ``beta < 1`` lends more weight to precision, while ``beta > 1``\n favors recall (``beta -> 0`` considers only precision, ``beta -> +inf``\n only recall).\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n y_true : 1d array-like, or label indicator array / sparse matrix\n Ground truth (correct) target values.\n\n y_pred : 1d array-like, or label indicator array / sparse matrix\n Estimated targets as returned by a classifier.\n\n beta : float\n Determines the weight of recall in the combined score.\n\n labels : array-like, default=None\n The set of labels to include when ``average != 'binary'``, and their\n order if ``average is None``. 
Labels present in the data can be\n excluded, for example to calculate a multiclass average ignoring a\n majority negative class, while labels not present in the data will\n result in 0 components in a macro average. For multilabel targets,\n labels are column indices. By default, all labels in ``y_true`` and\n ``y_pred`` are used in sorted order.\n\n .. versionchanged:: 0.17\n Parameter `labels` improved for multiclass problem.\n\n pos_label : str or int, default=1\n The class to report if ``average='binary'`` and the data is binary.\n If the data are multiclass or multilabel, this will be ignored;\n setting ``labels=[pos_label]`` and ``average != 'binary'`` will report\n scores for that label only.\n\n average : {'micro', 'macro', 'samples', 'weighted', 'binary'} or None, default='binary'\n This parameter is required for multiclass/multilabel targets.\n If ``None``, the scores for each class are returned. Otherwise, this\n determines the type of averaging performed on the data:\n\n ``'binary'``:\n Only report results for the class specified by ``pos_label``.\n This is applicable only if targets (``y_{true,pred}``) are binary.\n ``'micro'``:\n Calculate metrics globally by counting the total true positives,\n false negatives and false positives.\n ``'macro'``:\n Calculate metrics for each label, and find their unweighted\n mean. This does not take label imbalance into account.\n ``'weighted'``:\n Calculate metrics for each label, and find their average weighted\n by support (the number of true instances for each label). This\n alters 'macro' to account for label imbalance; it can result in an\n F-score that is not between precision and recall.\n ``'samples'``:\n Calculate metrics for each instance, and find their average (only\n meaningful for multilabel classification where this differs from\n :func:`accuracy_score`).\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n zero_division : \"warn\", 0 or 1, default=\"warn\"\n Sets the value to return when there is a zero division, i.e. when all\n predictions and labels are negative. If set to \"warn\", this acts as 0,\n but warnings are also raised.\n\n Returns\n -------\n fbeta_score : float (if average is not None) or array of float, shape = [n_unique_labels]\n F-beta score of the positive class in binary classification or weighted\n average of the F-beta score of each class for the multiclass task.\n\n See Also\n --------\n precision_recall_fscore_support, multilabel_confusion_matrix\n\n Notes\n -----\n When ``true positive + false positive == 0`` or\n ``true positive + false negative == 0``, f-score returns 0 and raises\n ``UndefinedMetricWarning``. This behavior can be\n modified with ``zero_division``.\n\n References\n ----------\n .. [1] R. Baeza-Yates and B. Ribeiro-Neto (2011).\n Modern Information Retrieval. Addison Wesley, pp. 327-328.\n\n .. [2] `Wikipedia entry for the F1-score\n `_.\n\n Examples\n --------\n >>> from sklearn.metrics import fbeta_score\n >>> y_true = [0, 1, 2, 0, 1, 2]\n >>> y_pred = [0, 2, 1, 0, 0, 1]\n >>> fbeta_score(y_true, y_pred, average='macro', beta=0.5)\n 0.23...\n >>> fbeta_score(y_true, y_pred, average='micro', beta=0.5)\n 0.33...\n >>> fbeta_score(y_true, y_pred, average='weighted', beta=0.5)\n 0.23...\n >>> fbeta_score(y_true, y_pred, average=None, beta=0.5)\n array([0.71..., 0. , 0. 
])\n \"\"\"\n (_, _, f, _) = precision_recall_fscore_support(y_true, y_pred, beta=beta, labels=labels, pos_label=pos_label, average=average, warn_for=('f-score', ), sample_weight=sample_weight, zero_division=zero_division)\n return f" }, { @@ -117546,7 +126265,8 @@ "docstring": { "type": "1d array-like, or label indicator array / sparse matrix", "description": "Ground truth (correct) labels." - } + }, + "refined_type": {} }, { "name": "y_pred", @@ -117556,7 +126276,8 @@ "docstring": { "type": "1d array-like, or label indicator array / sparse matrix", "description": "Predicted labels, as returned by a classifier." - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -117566,14 +126287,15 @@ "docstring": { "type": "array-like of shape (n_samples,), default=None", "description": "Sample weights.\n\n.. versionadded:: 0.18" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Compute the average Hamming loss.\n\nThe Hamming loss is the fraction of labels that are incorrectly predicted. Read more in the :ref:`User Guide `.", - "docstring": "Compute the average Hamming loss.\n\nThe Hamming loss is the fraction of labels that are incorrectly predicted.\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\ny_true : 1d array-like, or label indicator array / sparse matrix\n Ground truth (correct) labels.\n\ny_pred : 1d array-like, or label indicator array / sparse matrix\n Predicted labels, as returned by a classifier.\n\nsample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n .. versionadded:: 0.18\n\nReturns\n-------\nloss : float or int\n Return the average Hamming loss between element of ``y_true`` and\n ``y_pred``.\n\nSee Also\n--------\naccuracy_score, jaccard_score, zero_one_loss\n\nNotes\n-----\nIn multiclass classification, the Hamming loss corresponds to the Hamming\ndistance between ``y_true`` and ``y_pred`` which is equivalent to the\nsubset ``zero_one_loss`` function, when `normalize` parameter is set to\nTrue.\n\nIn multilabel classification, the Hamming loss is different from the\nsubset zero-one loss. The zero-one loss considers the entire set of labels\nfor a given sample incorrect if it does not entirely match the true set of\nlabels. Hamming loss is more forgiving in that it penalizes only the\nindividual labels.\n\nThe Hamming loss is upperbounded by the subset zero-one loss, when\n`normalize` parameter is set to True. It is always between 0 and 1,\nlower being better.\n\nReferences\n----------\n.. [1] Grigorios Tsoumakas, Ioannis Katakis. Multi-Label Classification:\n An Overview. International Journal of Data Warehousing & Mining,\n 3(3), 1-13, July-September 2007.\n\n.. 
[2] `Wikipedia entry on the Hamming distance\n `_.\n\nExamples\n--------\n>>> from sklearn.metrics import hamming_loss\n>>> y_pred = [1, 2, 3, 4]\n>>> y_true = [2, 2, 3, 4]\n>>> hamming_loss(y_true, y_pred)\n0.25\n\nIn the multilabel case with binary label indicators:\n\n>>> import numpy as np\n>>> hamming_loss(np.array([[0, 1], [1, 1]]), np.zeros((2, 2)))\n0.75", - "source_code": "\ndef hamming_loss(y_true, y_pred, *, sample_weight=None):\n \"\"\"Compute the average Hamming loss.\n\n The Hamming loss is the fraction of labels that are incorrectly predicted.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n y_true : 1d array-like, or label indicator array / sparse matrix\n Ground truth (correct) labels.\n\n y_pred : 1d array-like, or label indicator array / sparse matrix\n Predicted labels, as returned by a classifier.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n .. versionadded:: 0.18\n\n Returns\n -------\n loss : float or int\n Return the average Hamming loss between element of ``y_true`` and\n ``y_pred``.\n\n See Also\n --------\n accuracy_score, jaccard_score, zero_one_loss\n\n Notes\n -----\n In multiclass classification, the Hamming loss corresponds to the Hamming\n distance between ``y_true`` and ``y_pred`` which is equivalent to the\n subset ``zero_one_loss`` function, when `normalize` parameter is set to\n True.\n\n In multilabel classification, the Hamming loss is different from the\n subset zero-one loss. The zero-one loss considers the entire set of labels\n for a given sample incorrect if it does not entirely match the true set of\n labels. Hamming loss is more forgiving in that it penalizes only the\n individual labels.\n\n The Hamming loss is upperbounded by the subset zero-one loss, when\n `normalize` parameter is set to True. It is always between 0 and 1,\n lower being better.\n\n References\n ----------\n .. [1] Grigorios Tsoumakas, Ioannis Katakis. Multi-Label Classification:\n An Overview. International Journal of Data Warehousing & Mining,\n 3(3), 1-13, July-September 2007.\n\n .. 
[2] `Wikipedia entry on the Hamming distance\n `_.\n\n Examples\n --------\n >>> from sklearn.metrics import hamming_loss\n >>> y_pred = [1, 2, 3, 4]\n >>> y_true = [2, 2, 3, 4]\n >>> hamming_loss(y_true, y_pred)\n 0.25\n\n In the multilabel case with binary label indicators:\n\n >>> import numpy as np\n >>> hamming_loss(np.array([[0, 1], [1, 1]]), np.zeros((2, 2)))\n 0.75\n \"\"\"\n (y_type, y_true, y_pred) = _check_targets(y_true, y_pred)\n check_consistent_length(y_true, y_pred, sample_weight)\n if sample_weight is None:\n weight_average = 1.0\n else:\n weight_average = np.mean(sample_weight)\n if y_type.startswith('multilabel'):\n n_differences = count_nonzero(y_true - y_pred, sample_weight=sample_weight)\n return n_differences / (y_true.shape[0] * y_true.shape[1] * weight_average)\n elif y_type in ['binary', 'multiclass']:\n return _weighted_sum(y_true != y_pred, sample_weight, normalize=True)\n else:\n raise ValueError('{0} is not supported'.format(y_type))" + "description": "Compute the average Hamming loss.\n\nThe Hamming loss is the fraction of labels that are incorrectly predicted.\n\nRead more in the :ref:`User Guide `.", + "docstring": "Compute the average Hamming loss.\n\n The Hamming loss is the fraction of labels that are incorrectly predicted.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n y_true : 1d array-like, or label indicator array / sparse matrix\n Ground truth (correct) labels.\n\n y_pred : 1d array-like, or label indicator array / sparse matrix\n Predicted labels, as returned by a classifier.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n .. versionadded:: 0.18\n\n Returns\n -------\n loss : float or int\n Return the average Hamming loss between element of ``y_true`` and\n ``y_pred``.\n\n See Also\n --------\n accuracy_score : Compute the accuracy score. By default, the function will\n return the fraction of correct predictions divided by the total number\n of predictions.\n jaccard_score : Compute the Jaccard similarity coefficient score.\n zero_one_loss : Compute the Zero-one classification loss. By default, the\n function will return the percentage of imperfectly predicted subsets.\n\n Notes\n -----\n In multiclass classification, the Hamming loss corresponds to the Hamming\n distance between ``y_true`` and ``y_pred`` which is equivalent to the\n subset ``zero_one_loss`` function, when `normalize` parameter is set to\n True.\n\n In multilabel classification, the Hamming loss is different from the\n subset zero-one loss. The zero-one loss considers the entire set of labels\n for a given sample incorrect if it does not entirely match the true set of\n labels. Hamming loss is more forgiving in that it penalizes only the\n individual labels.\n\n The Hamming loss is upperbounded by the subset zero-one loss, when\n `normalize` parameter is set to True. It is always between 0 and 1,\n lower being better.\n\n References\n ----------\n .. [1] Grigorios Tsoumakas, Ioannis Katakis. Multi-Label Classification:\n An Overview. International Journal of Data Warehousing & Mining,\n 3(3), 1-13, July-September 2007.\n\n .. 
[2] `Wikipedia entry on the Hamming distance\n `_.\n\n Examples\n --------\n >>> from sklearn.metrics import hamming_loss\n >>> y_pred = [1, 2, 3, 4]\n >>> y_true = [2, 2, 3, 4]\n >>> hamming_loss(y_true, y_pred)\n 0.25\n\n In the multilabel case with binary label indicators:\n\n >>> import numpy as np\n >>> hamming_loss(np.array([[0, 1], [1, 1]]), np.zeros((2, 2)))\n 0.75\n ", + "source_code": "\ndef hamming_loss(y_true, y_pred, *, sample_weight=None):\n \"\"\"Compute the average Hamming loss.\n\n The Hamming loss is the fraction of labels that are incorrectly predicted.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n y_true : 1d array-like, or label indicator array / sparse matrix\n Ground truth (correct) labels.\n\n y_pred : 1d array-like, or label indicator array / sparse matrix\n Predicted labels, as returned by a classifier.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n .. versionadded:: 0.18\n\n Returns\n -------\n loss : float or int\n Return the average Hamming loss between element of ``y_true`` and\n ``y_pred``.\n\n See Also\n --------\n accuracy_score : Compute the accuracy score. By default, the function will\n return the fraction of correct predictions divided by the total number\n of predictions.\n jaccard_score : Compute the Jaccard similarity coefficient score.\n zero_one_loss : Compute the Zero-one classification loss. By default, the\n function will return the percentage of imperfectly predicted subsets.\n\n Notes\n -----\n In multiclass classification, the Hamming loss corresponds to the Hamming\n distance between ``y_true`` and ``y_pred`` which is equivalent to the\n subset ``zero_one_loss`` function, when `normalize` parameter is set to\n True.\n\n In multilabel classification, the Hamming loss is different from the\n subset zero-one loss. The zero-one loss considers the entire set of labels\n for a given sample incorrect if it does not entirely match the true set of\n labels. Hamming loss is more forgiving in that it penalizes only the\n individual labels.\n\n The Hamming loss is upperbounded by the subset zero-one loss, when\n `normalize` parameter is set to True. It is always between 0 and 1,\n lower being better.\n\n References\n ----------\n .. [1] Grigorios Tsoumakas, Ioannis Katakis. Multi-Label Classification:\n An Overview. International Journal of Data Warehousing & Mining,\n 3(3), 1-13, July-September 2007.\n\n .. 
[2] `Wikipedia entry on the Hamming distance\n `_.\n\n Examples\n --------\n >>> from sklearn.metrics import hamming_loss\n >>> y_pred = [1, 2, 3, 4]\n >>> y_true = [2, 2, 3, 4]\n >>> hamming_loss(y_true, y_pred)\n 0.25\n\n In the multilabel case with binary label indicators:\n\n >>> import numpy as np\n >>> hamming_loss(np.array([[0, 1], [1, 1]]), np.zeros((2, 2)))\n 0.75\n \"\"\"\n (y_type, y_true, y_pred) = _check_targets(y_true, y_pred)\n check_consistent_length(y_true, y_pred, sample_weight)\n if sample_weight is None:\n weight_average = 1.0\n else:\n weight_average = np.mean(sample_weight)\n if y_type.startswith('multilabel'):\n n_differences = count_nonzero(y_true - y_pred, sample_weight=sample_weight)\n return n_differences / (y_true.shape[0] * y_true.shape[1] * weight_average)\n elif y_type in ['binary', 'multiclass']:\n return _weighted_sum(y_true != y_pred, sample_weight, normalize=True)\n else:\n raise ValueError('{0} is not supported'.format(y_type))" }, { "name": "hinge_loss", @@ -117590,7 +126312,8 @@ "docstring": { "type": "array of shape (n_samples,)", "description": "True target, consisting of integers of two values. The positive label\nmust be greater than the negative label." - } + }, + "refined_type": {} }, { "name": "pred_decision", @@ -117600,7 +126323,8 @@ "docstring": { "type": "array of shape (n_samples,) or (n_samples, n_classes)", "description": "Predicted decisions, as output by decision_function (floats)." - } + }, + "refined_type": {} }, { "name": "labels", @@ -117610,7 +126334,8 @@ "docstring": { "type": "array-like, default=None", "description": "Contains all the labels for the problem. Used in multiclass hinge loss." - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -117620,13 +126345,14 @@ "docstring": { "type": "array-like of shape (n_samples,), default=None", "description": "Sample weights." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Average hinge loss (non-regularized).\n\nIn binary class case, assuming labels in y_true are encoded with +1 and -1, when a prediction mistake is made, ``margin = y_true * pred_decision`` is always negative (since the signs disagree), implying ``1 - margin`` is always greater than 1. The cumulated hinge loss is therefore an upper bound of the number of mistakes made by the classifier. In multiclass case, the function expects that either all the labels are included in y_true or an optional labels argument is provided which contains all the labels. The multilabel margin is calculated according to Crammer-Singer's method. As in the binary case, the cumulated hinge loss is an upper bound of the number of mistakes made by the classifier. Read more in the :ref:`User Guide `.", - "docstring": "Average hinge loss (non-regularized).\n\nIn binary class case, assuming labels in y_true are encoded with +1 and -1,\nwhen a prediction mistake is made, ``margin = y_true * pred_decision`` is\nalways negative (since the signs disagree), implying ``1 - margin`` is\nalways greater than 1. The cumulated hinge loss is therefore an upper\nbound of the number of mistakes made by the classifier.\n\nIn multiclass case, the function expects that either all the labels are\nincluded in y_true or an optional labels argument is provided which\ncontains all the labels. The multilabel margin is calculated according\nto Crammer-Singer's method. 
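The hamming_loss entry describes the metric as the fraction of labels that are incorrectly predicted. The short sketch below (editor-added) reproduces the 0.75 value from the multilabel example by counting the disagreeing label entries directly.

import numpy as np
from sklearn.metrics import hamming_loss

y_true = np.array([[0, 1], [1, 1]])
y_pred = np.zeros((2, 2))

# 3 of the 4 label entries disagree, so the Hamming loss is 3/4.
manual = np.mean(y_true != y_pred)
assert manual == hamming_loss(y_true, y_pred) == 0.75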
As in the binary case, the cumulated hinge loss\nis an upper bound of the number of mistakes made by the classifier.\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\ny_true : array of shape (n_samples,)\n True target, consisting of integers of two values. The positive label\n must be greater than the negative label.\n\npred_decision : array of shape (n_samples,) or (n_samples, n_classes)\n Predicted decisions, as output by decision_function (floats).\n\nlabels : array-like, default=None\n Contains all the labels for the problem. Used in multiclass hinge loss.\n\nsample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\nReturns\n-------\nloss : float\n\nReferences\n----------\n.. [1] `Wikipedia entry on the Hinge loss\n `_.\n\n.. [2] Koby Crammer, Yoram Singer. On the Algorithmic\n Implementation of Multiclass Kernel-based Vector\n Machines. Journal of Machine Learning Research 2,\n (2001), 265-292.\n\n.. [3] `L1 AND L2 Regularization for Multiclass Hinge Loss Models\n by Robert C. Moore, John DeNero\n `_.\n\nExamples\n--------\n>>> from sklearn import svm\n>>> from sklearn.metrics import hinge_loss\n>>> X = [[0], [1]]\n>>> y = [-1, 1]\n>>> est = svm.LinearSVC(random_state=0)\n>>> est.fit(X, y)\nLinearSVC(random_state=0)\n>>> pred_decision = est.decision_function([[-2], [3], [0.5]])\n>>> pred_decision\narray([-2.18..., 2.36..., 0.09...])\n>>> hinge_loss([-1, 1, 1], pred_decision)\n0.30...\n\nIn the multiclass case:\n\n>>> import numpy as np\n>>> X = np.array([[0], [1], [2], [3]])\n>>> Y = np.array([0, 1, 2, 3])\n>>> labels = np.array([0, 1, 2, 3])\n>>> est = svm.LinearSVC()\n>>> est.fit(X, Y)\nLinearSVC()\n>>> pred_decision = est.decision_function([[-1], [2], [3]])\n>>> y_true = [0, 2, 3]\n>>> hinge_loss(y_true, pred_decision, labels=labels)\n0.56...", + "description": "Average hinge loss (non-regularized).\n\nIn binary class case, assuming labels in y_true are encoded with +1 and -1,\nwhen a prediction mistake is made, ``margin = y_true * pred_decision`` is\nalways negative (since the signs disagree), implying ``1 - margin`` is\nalways greater than 1. The cumulated hinge loss is therefore an upper\nbound of the number of mistakes made by the classifier.\n\nIn multiclass case, the function expects that either all the labels are\nincluded in y_true or an optional labels argument is provided which\ncontains all the labels. The multilabel margin is calculated according\nto Crammer-Singer's method. As in the binary case, the cumulated hinge loss\nis an upper bound of the number of mistakes made by the classifier.\n\nRead more in the :ref:`User Guide `.", + "docstring": "Average hinge loss (non-regularized).\n\n In binary class case, assuming labels in y_true are encoded with +1 and -1,\n when a prediction mistake is made, ``margin = y_true * pred_decision`` is\n always negative (since the signs disagree), implying ``1 - margin`` is\n always greater than 1. The cumulated hinge loss is therefore an upper\n bound of the number of mistakes made by the classifier.\n\n In multiclass case, the function expects that either all the labels are\n included in y_true or an optional labels argument is provided which\n contains all the labels. The multilabel margin is calculated according\n to Crammer-Singer's method. 
As in the binary case, the cumulated hinge loss\n is an upper bound of the number of mistakes made by the classifier.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n y_true : array of shape (n_samples,)\n True target, consisting of integers of two values. The positive label\n must be greater than the negative label.\n\n pred_decision : array of shape (n_samples,) or (n_samples, n_classes)\n Predicted decisions, as output by decision_function (floats).\n\n labels : array-like, default=None\n Contains all the labels for the problem. Used in multiclass hinge loss.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n Returns\n -------\n loss : float\n\n References\n ----------\n .. [1] `Wikipedia entry on the Hinge loss\n `_.\n\n .. [2] Koby Crammer, Yoram Singer. On the Algorithmic\n Implementation of Multiclass Kernel-based Vector\n Machines. Journal of Machine Learning Research 2,\n (2001), 265-292.\n\n .. [3] `L1 AND L2 Regularization for Multiclass Hinge Loss Models\n by Robert C. Moore, John DeNero\n `_.\n\n Examples\n --------\n >>> from sklearn import svm\n >>> from sklearn.metrics import hinge_loss\n >>> X = [[0], [1]]\n >>> y = [-1, 1]\n >>> est = svm.LinearSVC(random_state=0)\n >>> est.fit(X, y)\n LinearSVC(random_state=0)\n >>> pred_decision = est.decision_function([[-2], [3], [0.5]])\n >>> pred_decision\n array([-2.18..., 2.36..., 0.09...])\n >>> hinge_loss([-1, 1, 1], pred_decision)\n 0.30...\n\n In the multiclass case:\n\n >>> import numpy as np\n >>> X = np.array([[0], [1], [2], [3]])\n >>> Y = np.array([0, 1, 2, 3])\n >>> labels = np.array([0, 1, 2, 3])\n >>> est = svm.LinearSVC()\n >>> est.fit(X, Y)\n LinearSVC()\n >>> pred_decision = est.decision_function([[-1], [2], [3]])\n >>> y_true = [0, 2, 3]\n >>> hinge_loss(y_true, pred_decision, labels=labels)\n 0.56...\n ", "source_code": "\ndef hinge_loss(y_true, pred_decision, *, labels=None, sample_weight=None):\n \"\"\"Average hinge loss (non-regularized).\n\n In binary class case, assuming labels in y_true are encoded with +1 and -1,\n when a prediction mistake is made, ``margin = y_true * pred_decision`` is\n always negative (since the signs disagree), implying ``1 - margin`` is\n always greater than 1. The cumulated hinge loss is therefore an upper\n bound of the number of mistakes made by the classifier.\n\n In multiclass case, the function expects that either all the labels are\n included in y_true or an optional labels argument is provided which\n contains all the labels. The multilabel margin is calculated according\n to Crammer-Singer's method. As in the binary case, the cumulated hinge loss\n is an upper bound of the number of mistakes made by the classifier.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n y_true : array of shape (n_samples,)\n True target, consisting of integers of two values. The positive label\n must be greater than the negative label.\n\n pred_decision : array of shape (n_samples,) or (n_samples, n_classes)\n Predicted decisions, as output by decision_function (floats).\n\n labels : array-like, default=None\n Contains all the labels for the problem. Used in multiclass hinge loss.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n Returns\n -------\n loss : float\n\n References\n ----------\n .. [1] `Wikipedia entry on the Hinge loss\n `_.\n\n .. [2] Koby Crammer, Yoram Singer. On the Algorithmic\n Implementation of Multiclass Kernel-based Vector\n Machines. 
Journal of Machine Learning Research 2,\n (2001), 265-292.\n\n .. [3] `L1 AND L2 Regularization for Multiclass Hinge Loss Models\n by Robert C. Moore, John DeNero\n `_.\n\n Examples\n --------\n >>> from sklearn import svm\n >>> from sklearn.metrics import hinge_loss\n >>> X = [[0], [1]]\n >>> y = [-1, 1]\n >>> est = svm.LinearSVC(random_state=0)\n >>> est.fit(X, y)\n LinearSVC(random_state=0)\n >>> pred_decision = est.decision_function([[-2], [3], [0.5]])\n >>> pred_decision\n array([-2.18..., 2.36..., 0.09...])\n >>> hinge_loss([-1, 1, 1], pred_decision)\n 0.30...\n\n In the multiclass case:\n\n >>> import numpy as np\n >>> X = np.array([[0], [1], [2], [3]])\n >>> Y = np.array([0, 1, 2, 3])\n >>> labels = np.array([0, 1, 2, 3])\n >>> est = svm.LinearSVC()\n >>> est.fit(X, Y)\n LinearSVC()\n >>> pred_decision = est.decision_function([[-1], [2], [3]])\n >>> y_true = [0, 2, 3]\n >>> hinge_loss(y_true, pred_decision, labels=labels)\n 0.56...\n \"\"\"\n check_consistent_length(y_true, pred_decision, sample_weight)\n pred_decision = check_array(pred_decision, ensure_2d=False)\n y_true = column_or_1d(y_true)\n y_true_unique = np.unique(labels if labels is not None else y_true)\n if y_true_unique.size > 2:\n if pred_decision.ndim <= 1:\n raise ValueError(f'The shape of pred_decision cannot be 1d arraywith a multiclass target. pred_decision shape must be (n_samples, n_classes), that is ({y_true.shape[0]}, {y_true_unique.size}). Got: {pred_decision.shape}')\n if y_true_unique.size != pred_decision.shape[1]:\n if labels is None:\n raise ValueError('Please include all labels in y_true or pass labels as third argument')\n else:\n raise ValueError(f'The shape of pred_decision is not consistent with the number of classes. With a multiclass target, pred_decision shape must be (n_samples, n_classes), that is ({y_true.shape[0]}, {y_true_unique.size}). Got: {pred_decision.shape}')\n if labels is None:\n labels = y_true_unique\n le = LabelEncoder()\n le.fit(labels)\n y_true = le.transform(y_true)\n mask = np.ones_like(pred_decision, dtype=bool)\n mask[np.arange(y_true.shape[0]), y_true] = False\n margin = pred_decision[~mask]\n margin -= np.max(pred_decision[mask].reshape(y_true.shape[0], -1), axis=1)\n else:\n pred_decision = column_or_1d(pred_decision)\n pred_decision = np.ravel(pred_decision)\n lbin = LabelBinarizer(neg_label=-1)\n y_true = lbin.fit_transform(y_true)[:, 0]\n try:\n margin = y_true * pred_decision\n except TypeError:\n raise TypeError('pred_decision should be an array of floats.')\n losses = 1 - margin\n np.clip(losses, 0, None, out=losses)\n return np.average(losses, weights=sample_weight)" }, { @@ -117644,7 +126370,8 @@ "docstring": { "type": "1d array-like, or label indicator array / sparse matrix", "description": "Ground truth (correct) labels." - } + }, + "refined_type": {} }, { "name": "y_pred", @@ -117654,7 +126381,8 @@ "docstring": { "type": "1d array-like, or label indicator array / sparse matrix", "description": "Predicted labels, as returned by a classifier." - } + }, + "refined_type": {} }, { "name": "labels", @@ -117664,7 +126392,8 @@ "docstring": { "type": "array-like of shape (n_classes,), default=None", "description": "The set of labels to include when ``average != 'binary'``, and their\norder if ``average is None``. Labels present in the data can be\nexcluded, for example to calculate a multiclass average ignoring a\nmajority negative class, while labels not present in the data will\nresult in 0 components in a macro average. 
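The hinge_loss entry defines the binary case through margin = y_true * pred_decision and notes that the cumulated loss upper-bounds the number of classification mistakes. The sketch below (editor-added; the decision values are made up for illustration and are not the LinearSVC outputs from the docstring example) computes mean(max(0, 1 - margin)) by hand and checks it against the function.

import numpy as np
from sklearn.metrics import hinge_loss

y_true = np.array([-1, 1, 1, -1])
decision = np.array([-2.0, 0.5, 3.0, 1.0])  # hypothetical decision_function outputs

# Per-sample hinge losses: max(0, 1 - y * d) = [0, 0.5, 0, 2.0], mean 0.625.
manual = np.mean(np.maximum(0.0, 1.0 - y_true * decision))
assert np.isclose(manual, hinge_loss(y_true, decision)) and np.isclose(manual, 0.625)

The last sample is a confidently wrong prediction (y = -1, decision = +1.0), so it alone contributes a loss of 2.0, illustrating why the summed loss bounds the mistake count from above.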
For multilabel targets,\nlabels are column indices. By default, all labels in ``y_true`` and\n``y_pred`` are used in sorted order." - } + }, + "refined_type": {} }, { "name": "pos_label", @@ -117674,7 +126403,8 @@ "docstring": { "type": "str or int, default=1", "description": "The class to report if ``average='binary'`` and the data is binary.\nIf the data are multiclass or multilabel, this will be ignored;\nsetting ``labels=[pos_label]`` and ``average != 'binary'`` will report\nscores for that label only." - } + }, + "refined_type": {} }, { "name": "average", @@ -117684,6 +126414,16 @@ "docstring": { "type": "{'micro', 'macro', 'samples', 'weighted', 'binary'} or None, default='binary'", "description": "If ``None``, the scores for each class are returned. Otherwise, this\ndetermines the type of averaging performed on the data:\n\n``'binary'``:\n Only report results for the class specified by ``pos_label``.\n This is applicable only if targets (``y_{true,pred}``) are binary.\n``'micro'``:\n Calculate metrics globally by counting the total true positives,\n false negatives and false positives.\n``'macro'``:\n Calculate metrics for each label, and find their unweighted\n mean. This does not take label imbalance into account.\n``'weighted'``:\n Calculate metrics for each label, and find their average, weighted\n by support (the number of true instances for each label). This\n alters 'macro' to account for label imbalance.\n``'samples'``:\n Calculate metrics for each instance, and find their average (only\n meaningful for multilabel classification)." + }, + "refined_type": { + "kind": "EnumType", + "values": [ + "samples", + "weighted", + "micro", + "macro", + "binary" + ] } }, { @@ -117694,7 +126434,8 @@ "docstring": { "type": "array-like of shape (n_samples,), default=None", "description": "Sample weights." - } + }, + "refined_type": {} }, { "name": "zero_division", @@ -117704,13 +126445,17 @@ "docstring": { "type": "\"warn\", {0.0, 1.0}, default=\"warn\"", "description": "Sets the value to return when there is a zero division, i.e. when there\nthere are no negative values in predictions and labels. If set to\n\"warn\", this acts like 0, but a warning is also raised." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } } ], "results": [], "is_public": true, - "description": "Jaccard similarity coefficient score.\n\nThe Jaccard index [1], or Jaccard similarity coefficient, defined as the size of the intersection divided by the size of the union of two label sets, is used to compare set of predicted labels for a sample to the corresponding set of labels in ``y_true``. Read more in the :ref:`User Guide `.", - "docstring": "Jaccard similarity coefficient score.\n\nThe Jaccard index [1], or Jaccard similarity coefficient, defined as\nthe size of the intersection divided by the size of the union of two label\nsets, is used to compare set of predicted labels for a sample to the\ncorresponding set of labels in ``y_true``.\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\ny_true : 1d array-like, or label indicator array / sparse matrix\n Ground truth (correct) labels.\n\ny_pred : 1d array-like, or label indicator array / sparse matrix\n Predicted labels, as returned by a classifier.\n\nlabels : array-like of shape (n_classes,), default=None\n The set of labels to include when ``average != 'binary'``, and their\n order if ``average is None``. 
Labels present in the data can be\n excluded, for example to calculate a multiclass average ignoring a\n majority negative class, while labels not present in the data will\n result in 0 components in a macro average. For multilabel targets,\n labels are column indices. By default, all labels in ``y_true`` and\n ``y_pred`` are used in sorted order.\n\npos_label : str or int, default=1\n The class to report if ``average='binary'`` and the data is binary.\n If the data are multiclass or multilabel, this will be ignored;\n setting ``labels=[pos_label]`` and ``average != 'binary'`` will report\n scores for that label only.\n\naverage : {'micro', 'macro', 'samples', 'weighted', 'binary'} or None, default='binary'\n If ``None``, the scores for each class are returned. Otherwise, this\n determines the type of averaging performed on the data:\n\n ``'binary'``:\n Only report results for the class specified by ``pos_label``.\n This is applicable only if targets (``y_{true,pred}``) are binary.\n ``'micro'``:\n Calculate metrics globally by counting the total true positives,\n false negatives and false positives.\n ``'macro'``:\n Calculate metrics for each label, and find their unweighted\n mean. This does not take label imbalance into account.\n ``'weighted'``:\n Calculate metrics for each label, and find their average, weighted\n by support (the number of true instances for each label). This\n alters 'macro' to account for label imbalance.\n ``'samples'``:\n Calculate metrics for each instance, and find their average (only\n meaningful for multilabel classification).\n\nsample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\nzero_division : \"warn\", {0.0, 1.0}, default=\"warn\"\n Sets the value to return when there is a zero division, i.e. when there\n there are no negative values in predictions and labels. If set to\n \"warn\", this acts like 0, but a warning is also raised.\n\nReturns\n-------\nscore : float (if average is not None) or array of floats, shape = [n_unique_labels]\n\nSee Also\n--------\naccuracy_score, f1_score, multilabel_confusion_matrix\n\nNotes\n-----\n:func:`jaccard_score` may be a poor metric if there are no\npositives for some samples or classes. Jaccard is undefined if there are\nno true or predicted labels, and our implementation will return a score\nof 0 with a warning.\n\nReferences\n----------\n.. [1] `Wikipedia entry for the Jaccard index\n `_.\n\nExamples\n--------\n>>> import numpy as np\n>>> from sklearn.metrics import jaccard_score\n>>> y_true = np.array([[0, 1, 1],\n... [1, 1, 0]])\n>>> y_pred = np.array([[1, 1, 1],\n... [1, 0, 0]])\n\nIn the binary case:\n\n>>> jaccard_score(y_true[0], y_pred[0])\n0.6666...\n\nIn the multilabel case:\n\n>>> jaccard_score(y_true, y_pred, average='samples')\n0.5833...\n>>> jaccard_score(y_true, y_pred, average='macro')\n0.6666...\n>>> jaccard_score(y_true, y_pred, average=None)\narray([0.5, 0.5, 1. ])\n\nIn the multiclass case:\n\n>>> y_pred = [0, 2, 1, 2]\n>>> y_true = [0, 1, 2, 2]\n>>> jaccard_score(y_true, y_pred, average=None)\narray([1. , 0. 
, 0.33...])", + "description": "Jaccard similarity coefficient score.\n\nThe Jaccard index [1], or Jaccard similarity coefficient, defined as\nthe size of the intersection divided by the size of the union of two label\nsets, is used to compare set of predicted labels for a sample to the\ncorresponding set of labels in ``y_true``.\n\nRead more in the :ref:`User Guide `.", + "docstring": "Jaccard similarity coefficient score.\n\n The Jaccard index [1], or Jaccard similarity coefficient, defined as\n the size of the intersection divided by the size of the union of two label\n sets, is used to compare set of predicted labels for a sample to the\n corresponding set of labels in ``y_true``.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n y_true : 1d array-like, or label indicator array / sparse matrix\n Ground truth (correct) labels.\n\n y_pred : 1d array-like, or label indicator array / sparse matrix\n Predicted labels, as returned by a classifier.\n\n labels : array-like of shape (n_classes,), default=None\n The set of labels to include when ``average != 'binary'``, and their\n order if ``average is None``. Labels present in the data can be\n excluded, for example to calculate a multiclass average ignoring a\n majority negative class, while labels not present in the data will\n result in 0 components in a macro average. For multilabel targets,\n labels are column indices. By default, all labels in ``y_true`` and\n ``y_pred`` are used in sorted order.\n\n pos_label : str or int, default=1\n The class to report if ``average='binary'`` and the data is binary.\n If the data are multiclass or multilabel, this will be ignored;\n setting ``labels=[pos_label]`` and ``average != 'binary'`` will report\n scores for that label only.\n\n average : {'micro', 'macro', 'samples', 'weighted', 'binary'} or None, default='binary'\n If ``None``, the scores for each class are returned. Otherwise, this\n determines the type of averaging performed on the data:\n\n ``'binary'``:\n Only report results for the class specified by ``pos_label``.\n This is applicable only if targets (``y_{true,pred}``) are binary.\n ``'micro'``:\n Calculate metrics globally by counting the total true positives,\n false negatives and false positives.\n ``'macro'``:\n Calculate metrics for each label, and find their unweighted\n mean. This does not take label imbalance into account.\n ``'weighted'``:\n Calculate metrics for each label, and find their average, weighted\n by support (the number of true instances for each label). This\n alters 'macro' to account for label imbalance.\n ``'samples'``:\n Calculate metrics for each instance, and find their average (only\n meaningful for multilabel classification).\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n zero_division : \"warn\", {0.0, 1.0}, default=\"warn\"\n Sets the value to return when there is a zero division, i.e. when there\n there are no negative values in predictions and labels. If set to\n \"warn\", this acts like 0, but a warning is also raised.\n\n Returns\n -------\n score : float (if average is not None) or array of floats, shape = [n_unique_labels]\n\n See Also\n --------\n accuracy_score, f1_score, multilabel_confusion_matrix\n\n Notes\n -----\n :func:`jaccard_score` may be a poor metric if there are no\n positives for some samples or classes. Jaccard is undefined if there are\n no true or predicted labels, and our implementation will return a score\n of 0 with a warning.\n\n References\n ----------\n .. 
[1] `Wikipedia entry for the Jaccard index\n `_.\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.metrics import jaccard_score\n >>> y_true = np.array([[0, 1, 1],\n ... [1, 1, 0]])\n >>> y_pred = np.array([[1, 1, 1],\n ... [1, 0, 0]])\n\n In the binary case:\n\n >>> jaccard_score(y_true[0], y_pred[0])\n 0.6666...\n\n In the multilabel case:\n\n >>> jaccard_score(y_true, y_pred, average='samples')\n 0.5833...\n >>> jaccard_score(y_true, y_pred, average='macro')\n 0.6666...\n >>> jaccard_score(y_true, y_pred, average=None)\n array([0.5, 0.5, 1. ])\n\n In the multiclass case:\n\n >>> y_pred = [0, 2, 1, 2]\n >>> y_true = [0, 1, 2, 2]\n >>> jaccard_score(y_true, y_pred, average=None)\n array([1. , 0. , 0.33...])\n ", "source_code": "\ndef jaccard_score(y_true, y_pred, *, labels=None, pos_label=1, average='binary', sample_weight=None, zero_division='warn'):\n \"\"\"Jaccard similarity coefficient score.\n\n The Jaccard index [1], or Jaccard similarity coefficient, defined as\n the size of the intersection divided by the size of the union of two label\n sets, is used to compare set of predicted labels for a sample to the\n corresponding set of labels in ``y_true``.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n y_true : 1d array-like, or label indicator array / sparse matrix\n Ground truth (correct) labels.\n\n y_pred : 1d array-like, or label indicator array / sparse matrix\n Predicted labels, as returned by a classifier.\n\n labels : array-like of shape (n_classes,), default=None\n The set of labels to include when ``average != 'binary'``, and their\n order if ``average is None``. Labels present in the data can be\n excluded, for example to calculate a multiclass average ignoring a\n majority negative class, while labels not present in the data will\n result in 0 components in a macro average. For multilabel targets,\n labels are column indices. By default, all labels in ``y_true`` and\n ``y_pred`` are used in sorted order.\n\n pos_label : str or int, default=1\n The class to report if ``average='binary'`` and the data is binary.\n If the data are multiclass or multilabel, this will be ignored;\n setting ``labels=[pos_label]`` and ``average != 'binary'`` will report\n scores for that label only.\n\n average : {'micro', 'macro', 'samples', 'weighted', 'binary'} or None, default='binary'\n If ``None``, the scores for each class are returned. Otherwise, this\n determines the type of averaging performed on the data:\n\n ``'binary'``:\n Only report results for the class specified by ``pos_label``.\n This is applicable only if targets (``y_{true,pred}``) are binary.\n ``'micro'``:\n Calculate metrics globally by counting the total true positives,\n false negatives and false positives.\n ``'macro'``:\n Calculate metrics for each label, and find their unweighted\n mean. This does not take label imbalance into account.\n ``'weighted'``:\n Calculate metrics for each label, and find their average, weighted\n by support (the number of true instances for each label). This\n alters 'macro' to account for label imbalance.\n ``'samples'``:\n Calculate metrics for each instance, and find their average (only\n meaningful for multilabel classification).\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n zero_division : \"warn\", {0.0, 1.0}, default=\"warn\"\n Sets the value to return when there is a zero division, i.e. when there\n there are no negative values in predictions and labels. 
If set to\n \"warn\", this acts like 0, but a warning is also raised.\n\n Returns\n -------\n score : float (if average is not None) or array of floats, shape = [n_unique_labels]\n\n See Also\n --------\n accuracy_score, f1_score, multilabel_confusion_matrix\n\n Notes\n -----\n :func:`jaccard_score` may be a poor metric if there are no\n positives for some samples or classes. Jaccard is undefined if there are\n no true or predicted labels, and our implementation will return a score\n of 0 with a warning.\n\n References\n ----------\n .. [1] `Wikipedia entry for the Jaccard index\n `_.\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.metrics import jaccard_score\n >>> y_true = np.array([[0, 1, 1],\n ... [1, 1, 0]])\n >>> y_pred = np.array([[1, 1, 1],\n ... [1, 0, 0]])\n\n In the binary case:\n\n >>> jaccard_score(y_true[0], y_pred[0])\n 0.6666...\n\n In the multilabel case:\n\n >>> jaccard_score(y_true, y_pred, average='samples')\n 0.5833...\n >>> jaccard_score(y_true, y_pred, average='macro')\n 0.6666...\n >>> jaccard_score(y_true, y_pred, average=None)\n array([0.5, 0.5, 1. ])\n\n In the multiclass case:\n\n >>> y_pred = [0, 2, 1, 2]\n >>> y_true = [0, 1, 2, 2]\n >>> jaccard_score(y_true, y_pred, average=None)\n array([1. , 0. , 0.33...])\n \"\"\"\n labels = _check_set_wise_labels(y_true, y_pred, average, labels, pos_label)\n samplewise = average == 'samples'\n MCM = multilabel_confusion_matrix(y_true, y_pred, sample_weight=sample_weight, labels=labels, samplewise=samplewise)\n numerator = MCM[:, 1, 1]\n denominator = MCM[:, 1, 1] + MCM[:, 0, 1] + MCM[:, 1, 0]\n if average == 'micro':\n numerator = np.array([numerator.sum()])\n denominator = np.array([denominator.sum()])\n jaccard = _prf_divide(numerator, denominator, 'jaccard', 'true or predicted', average, ('jaccard', ), zero_division=zero_division)\n if average is None:\n return jaccard\n if average == 'weighted':\n weights = MCM[:, 1, 0] + MCM[:, 1, 1]\n if not np.any(weights):\n weights = None\n elif average == 'samples' and sample_weight is not None:\n weights = sample_weight\n else:\n weights = None\n return np.average(jaccard, weights=weights)" }, { @@ -117728,7 +126473,8 @@ "docstring": { "type": "array-like or label indicator matrix", "description": "Ground truth (correct) labels for n_samples samples." - } + }, + "refined_type": {} }, { "name": "y_pred", @@ -117738,7 +126484,8 @@ "docstring": { "type": "array-like of float, shape = (n_samples, n_classes) or (n_samples,)", "description": "Predicted probabilities, as returned by a classifier's\npredict_proba method. If ``y_pred.shape = (n_samples,)``\nthe probabilities provided are assumed to be that of the\npositive class. The labels in ``y_pred`` are assumed to be\nordered alphabetically, as done by\n:class:`preprocessing.LabelBinarizer`." - } + }, + "refined_type": {} }, { "name": "eps", @@ -117748,7 +126495,8 @@ "docstring": { "type": "float, default=1e-15", "description": "Log loss is undefined for p=0 or p=1, so probabilities are\nclipped to max(eps, min(1 - eps, p))." - } + }, + "refined_type": {} }, { "name": "normalize", @@ -117758,7 +126506,8 @@ "docstring": { "type": "bool, default=True", "description": "If true, return the mean loss per sample.\nOtherwise, return the sum of the per-sample losses." - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -117768,7 +126517,8 @@ "docstring": { "type": "array-like of shape (n_samples,), default=None", "description": "Sample weights." 
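The jaccard_score entry defines the score as the size of the intersection divided by the size of the union of the true and predicted label sets. A short sketch (editor-added, not part of the diffed JSON) reproduces the binary 0.6666... value from the first example row by building those sets explicitly.

from sklearn.metrics import jaccard_score

y_true = [0, 1, 1]
y_pred = [1, 1, 1]

true_set = {i for i, v in enumerate(y_true) if v == 1}  # {1, 2}
pred_set = {i for i, v in enumerate(y_pred) if v == 1}  # {0, 1, 2}

# |intersection| / |union| = 2 / 3
manual = len(true_set & pred_set) / len(true_set | pred_set)
assert abs(manual - jaccard_score(y_true, y_pred)) < 1e-12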
- } + }, + "refined_type": {} }, { "name": "labels", @@ -117778,13 +126528,14 @@ "docstring": { "type": "array-like, default=None", "description": "If not provided, labels will be inferred from y_true. If ``labels``\nis ``None`` and ``y_pred`` has shape (n_samples,) the labels are\nassumed to be binary and are inferred from ``y_true``.\n\n.. versionadded:: 0.18" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Log loss, aka logistic loss or cross-entropy loss.\n\nThis is the loss function used in (multinomial) logistic regression and extensions of it such as neural networks, defined as the negative log-likelihood of a logistic model that returns ``y_pred`` probabilities for its training data ``y_true``. The log loss is only defined for two or more labels. For a single sample with true label :math:`y \\in \\{0,1\\}` and a probability estimate :math:`p = \\operatorname{Pr}(y = 1)`, the log loss is: .. math:: L_{\\log}(y, p) = -(y \\log (p) + (1 - y) \\log (1 - p)) Read more in the :ref:`User Guide `.", - "docstring": "Log loss, aka logistic loss or cross-entropy loss.\n\nThis is the loss function used in (multinomial) logistic regression\nand extensions of it such as neural networks, defined as the negative\nlog-likelihood of a logistic model that returns ``y_pred`` probabilities\nfor its training data ``y_true``.\nThe log loss is only defined for two or more labels.\nFor a single sample with true label :math:`y \\in \\{0,1\\}` and\na probability estimate :math:`p = \\operatorname{Pr}(y = 1)`, the log\nloss is:\n\n.. math::\n L_{\\log}(y, p) = -(y \\log (p) + (1 - y) \\log (1 - p))\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\ny_true : array-like or label indicator matrix\n Ground truth (correct) labels for n_samples samples.\n\ny_pred : array-like of float, shape = (n_samples, n_classes) or (n_samples,)\n Predicted probabilities, as returned by a classifier's\n predict_proba method. If ``y_pred.shape = (n_samples,)``\n the probabilities provided are assumed to be that of the\n positive class. The labels in ``y_pred`` are assumed to be\n ordered alphabetically, as done by\n :class:`preprocessing.LabelBinarizer`.\n\neps : float, default=1e-15\n Log loss is undefined for p=0 or p=1, so probabilities are\n clipped to max(eps, min(1 - eps, p)).\n\nnormalize : bool, default=True\n If true, return the mean loss per sample.\n Otherwise, return the sum of the per-sample losses.\n\nsample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\nlabels : array-like, default=None\n If not provided, labels will be inferred from y_true. If ``labels``\n is ``None`` and ``y_pred`` has shape (n_samples,) the labels are\n assumed to be binary and are inferred from ``y_true``.\n\n .. versionadded:: 0.18\n\nReturns\n-------\nloss : float\n\nNotes\n-----\nThe logarithm used is the natural logarithm (base-e).\n\nExamples\n--------\n>>> from sklearn.metrics import log_loss\n>>> log_loss([\"spam\", \"ham\", \"ham\", \"spam\"],\n... [[.1, .9], [.9, .1], [.8, .2], [.35, .65]])\n0.21616...\n\nReferences\n----------\nC.M. Bishop (2006). Pattern Recognition and Machine Learning. Springer,\np. 
209.", + "description": "Log loss, aka logistic loss or cross-entropy loss.\n\nThis is the loss function used in (multinomial) logistic regression\nand extensions of it such as neural networks, defined as the negative\nlog-likelihood of a logistic model that returns ``y_pred`` probabilities\nfor its training data ``y_true``.\nThe log loss is only defined for two or more labels.\nFor a single sample with true label :math:`y \\in \\{0,1\\}` and\na probability estimate :math:`p = \\operatorname{Pr}(y = 1)`, the log\nloss is:\n\n.. math::\n L_{\\log}(y, p) = -(y \\log (p) + (1 - y) \\log (1 - p))\n\nRead more in the :ref:`User Guide `.", + "docstring": "Log loss, aka logistic loss or cross-entropy loss.\n\n This is the loss function used in (multinomial) logistic regression\n and extensions of it such as neural networks, defined as the negative\n log-likelihood of a logistic model that returns ``y_pred`` probabilities\n for its training data ``y_true``.\n The log loss is only defined for two or more labels.\n For a single sample with true label :math:`y \\in \\{0,1\\}` and\n a probability estimate :math:`p = \\operatorname{Pr}(y = 1)`, the log\n loss is:\n\n .. math::\n L_{\\log}(y, p) = -(y \\log (p) + (1 - y) \\log (1 - p))\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n y_true : array-like or label indicator matrix\n Ground truth (correct) labels for n_samples samples.\n\n y_pred : array-like of float, shape = (n_samples, n_classes) or (n_samples,)\n Predicted probabilities, as returned by a classifier's\n predict_proba method. If ``y_pred.shape = (n_samples,)``\n the probabilities provided are assumed to be that of the\n positive class. The labels in ``y_pred`` are assumed to be\n ordered alphabetically, as done by\n :class:`preprocessing.LabelBinarizer`.\n\n eps : float, default=1e-15\n Log loss is undefined for p=0 or p=1, so probabilities are\n clipped to max(eps, min(1 - eps, p)).\n\n normalize : bool, default=True\n If true, return the mean loss per sample.\n Otherwise, return the sum of the per-sample losses.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n labels : array-like, default=None\n If not provided, labels will be inferred from y_true. If ``labels``\n is ``None`` and ``y_pred`` has shape (n_samples,) the labels are\n assumed to be binary and are inferred from ``y_true``.\n\n .. versionadded:: 0.18\n\n Returns\n -------\n loss : float\n\n Notes\n -----\n The logarithm used is the natural logarithm (base-e).\n\n Examples\n --------\n >>> from sklearn.metrics import log_loss\n >>> log_loss([\"spam\", \"ham\", \"ham\", \"spam\"],\n ... [[.1, .9], [.9, .1], [.8, .2], [.35, .65]])\n 0.21616...\n\n References\n ----------\n C.M. Bishop (2006). Pattern Recognition and Machine Learning. Springer,\n p. 209.\n ", "source_code": "\ndef log_loss(y_true, y_pred, *, eps=1e-15, normalize=True, sample_weight=None, labels=None):\n \"\"\"Log loss, aka logistic loss or cross-entropy loss.\n\n This is the loss function used in (multinomial) logistic regression\n and extensions of it such as neural networks, defined as the negative\n log-likelihood of a logistic model that returns ``y_pred`` probabilities\n for its training data ``y_true``.\n The log loss is only defined for two or more labels.\n For a single sample with true label :math:`y \\in \\{0,1\\}` and\n a probability estimate :math:`p = \\operatorname{Pr}(y = 1)`, the log\n loss is:\n\n .. 
math::\n L_{\\log}(y, p) = -(y \\log (p) + (1 - y) \\log (1 - p))\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n y_true : array-like or label indicator matrix\n Ground truth (correct) labels for n_samples samples.\n\n y_pred : array-like of float, shape = (n_samples, n_classes) or (n_samples,)\n Predicted probabilities, as returned by a classifier's\n predict_proba method. If ``y_pred.shape = (n_samples,)``\n the probabilities provided are assumed to be that of the\n positive class. The labels in ``y_pred`` are assumed to be\n ordered alphabetically, as done by\n :class:`preprocessing.LabelBinarizer`.\n\n eps : float, default=1e-15\n Log loss is undefined for p=0 or p=1, so probabilities are\n clipped to max(eps, min(1 - eps, p)).\n\n normalize : bool, default=True\n If true, return the mean loss per sample.\n Otherwise, return the sum of the per-sample losses.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n labels : array-like, default=None\n If not provided, labels will be inferred from y_true. If ``labels``\n is ``None`` and ``y_pred`` has shape (n_samples,) the labels are\n assumed to be binary and are inferred from ``y_true``.\n\n .. versionadded:: 0.18\n\n Returns\n -------\n loss : float\n\n Notes\n -----\n The logarithm used is the natural logarithm (base-e).\n\n Examples\n --------\n >>> from sklearn.metrics import log_loss\n >>> log_loss([\"spam\", \"ham\", \"ham\", \"spam\"],\n ... [[.1, .9], [.9, .1], [.8, .2], [.35, .65]])\n 0.21616...\n\n References\n ----------\n C.M. Bishop (2006). Pattern Recognition and Machine Learning. Springer,\n p. 209.\n \"\"\"\n y_pred = check_array(y_pred, ensure_2d=False)\n check_consistent_length(y_pred, y_true, sample_weight)\n lb = LabelBinarizer()\n if labels is not None:\n lb.fit(labels)\n else:\n lb.fit(y_true)\n if len(lb.classes_) == 1:\n if labels is None:\n raise ValueError('y_true contains only one label ({0}). Please provide the true labels explicitly through the labels argument.'.format(lb.classes_[0]))\n else:\n raise ValueError('The labels array needs to contain at least two labels for log_loss, got {0}.'.format(lb.classes_))\n transformed_labels = lb.transform(y_true)\n if transformed_labels.shape[1] == 1:\n transformed_labels = np.append(1 - transformed_labels, transformed_labels, axis=1)\n y_pred = np.clip(y_pred, eps, 1 - eps)\n if y_pred.ndim == 1:\n y_pred = y_pred[:, np.newaxis]\n if y_pred.shape[1] == 1:\n y_pred = np.append(1 - y_pred, y_pred, axis=1)\n transformed_labels = check_array(transformed_labels)\n if len(lb.classes_) != y_pred.shape[1]:\n if labels is None:\n raise ValueError('y_true and y_pred contain different number of classes {0}, {1}. Please provide the true labels explicitly through the labels argument. Classes found in y_true: {2}'.format(transformed_labels.shape[1], y_pred.shape[1], lb.classes_))\n else:\n raise ValueError('The number of classes in labels is different from that in y_pred. Classes found in labels: {0}'.format(lb.classes_))\n y_pred /= y_pred.sum(axis=1)[:, np.newaxis]\n loss = -(transformed_labels * np.log(y_pred)).sum(axis=1)\n return _weighted_sum(loss, sample_weight, normalize)" }, { @@ -117802,7 +126553,8 @@ "docstring": { "type": "array, shape = [n_samples]", "description": "Ground truth (correct) target values." - } + }, + "refined_type": {} }, { "name": "y_pred", @@ -117812,7 +126564,8 @@ "docstring": { "type": "array, shape = [n_samples]", "description": "Estimated targets as returned by a classifier." 
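Editor's note: the log_loss record above clips probabilities to [eps, 1 - eps], one-hot encodes the labels in sorted order, renormalizes the probability rows, and averages the per-sample negative log-likelihood using the natural logarithm. A minimal sketch follows, not part of the generated api-editor data; it assumes NumPy and scikit-learn are installed and reuses the spam/ham example from the docstring above.

# hedged sketch: log loss by hand vs. sklearn.metrics.log_loss
import numpy as np
from sklearn.metrics import log_loss

y_true = ["spam", "ham", "ham", "spam"]
y_pred = np.array([[.1, .9], [.9, .1], [.8, .2], [.35, .65]])

# one-hot encode labels in sorted order ("ham" < "spam"), as LabelBinarizer does;
# columns of y_pred are assumed to follow the same alphabetical order
classes = np.array(["ham", "spam"])
T = (np.asarray(y_true)[:, None] == classes).astype(float)

eps = 1e-15
P = np.clip(y_pred, eps, 1 - eps)
P = P / P.sum(axis=1, keepdims=True)           # renormalize rows
manual = -(T * np.log(P)).sum(axis=1).mean()   # natural log, mean over samples

assert np.isclose(manual, log_loss(y_true, y_pred))  # ~0.21616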
- } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -117822,13 +126575,14 @@ "docstring": { "type": "array-like of shape (n_samples,), default=None", "description": "Sample weights.\n\n.. versionadded:: 0.18" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Compute the Matthews correlation coefficient (MCC).\n\nThe Matthews correlation coefficient is used in machine learning as a measure of the quality of binary and multiclass classifications. It takes into account true and false positives and negatives and is generally regarded as a balanced measure which can be used even if the classes are of very different sizes. The MCC is in essence a correlation coefficient value between -1 and +1. A coefficient of +1 represents a perfect prediction, 0 an average random prediction and -1 an inverse prediction. The statistic is also known as the phi coefficient. [source: Wikipedia] Binary and multiclass labels are supported. Only in the binary case does this relate to information about true and false positives and negatives. See references below. Read more in the :ref:`User Guide `.", - "docstring": "Compute the Matthews correlation coefficient (MCC).\n\nThe Matthews correlation coefficient is used in machine learning as a\nmeasure of the quality of binary and multiclass classifications. It takes\ninto account true and false positives and negatives and is generally\nregarded as a balanced measure which can be used even if the classes are of\nvery different sizes. The MCC is in essence a correlation coefficient value\nbetween -1 and +1. A coefficient of +1 represents a perfect prediction, 0\nan average random prediction and -1 an inverse prediction. The statistic\nis also known as the phi coefficient. [source: Wikipedia]\n\nBinary and multiclass labels are supported. Only in the binary case does\nthis relate to information about true and false positives and negatives.\nSee references below.\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\ny_true : array, shape = [n_samples]\n Ground truth (correct) target values.\n\ny_pred : array, shape = [n_samples]\n Estimated targets as returned by a classifier.\n\nsample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n .. versionadded:: 0.18\n\nReturns\n-------\nmcc : float\n The Matthews correlation coefficient (+1 represents a perfect\n prediction, 0 an average random prediction and -1 and inverse\n prediction).\n\nReferences\n----------\n.. [1] `Baldi, Brunak, Chauvin, Andersen and Nielsen, (2000). Assessing the\n accuracy of prediction algorithms for classification: an overview\n `_.\n\n.. [2] `Wikipedia entry for the Matthews Correlation Coefficient\n `_.\n\n.. [3] `Gorodkin, (2004). Comparing two K-category assignments by a\n K-category correlation coefficient\n `_.\n\n.. [4] `Jurman, Riccadonna, Furlanello, (2012). A Comparison of MCC and CEN\n Error Measures in MultiClass Prediction\n `_.\n\nExamples\n--------\n>>> from sklearn.metrics import matthews_corrcoef\n>>> y_true = [+1, +1, +1, -1]\n>>> y_pred = [+1, -1, +1, +1]\n>>> matthews_corrcoef(y_true, y_pred)\n-0.33...", + "description": "Compute the Matthews correlation coefficient (MCC).\n\nThe Matthews correlation coefficient is used in machine learning as a\nmeasure of the quality of binary and multiclass classifications. It takes\ninto account true and false positives and negatives and is generally\nregarded as a balanced measure which can be used even if the classes are of\nvery different sizes. 
The MCC is in essence a correlation coefficient value\nbetween -1 and +1. A coefficient of +1 represents a perfect prediction, 0\nan average random prediction and -1 an inverse prediction. The statistic\nis also known as the phi coefficient. [source: Wikipedia]\n\nBinary and multiclass labels are supported. Only in the binary case does\nthis relate to information about true and false positives and negatives.\nSee references below.\n\nRead more in the :ref:`User Guide `.", + "docstring": "Compute the Matthews correlation coefficient (MCC).\n\n The Matthews correlation coefficient is used in machine learning as a\n measure of the quality of binary and multiclass classifications. It takes\n into account true and false positives and negatives and is generally\n regarded as a balanced measure which can be used even if the classes are of\n very different sizes. The MCC is in essence a correlation coefficient value\n between -1 and +1. A coefficient of +1 represents a perfect prediction, 0\n an average random prediction and -1 an inverse prediction. The statistic\n is also known as the phi coefficient. [source: Wikipedia]\n\n Binary and multiclass labels are supported. Only in the binary case does\n this relate to information about true and false positives and negatives.\n See references below.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n y_true : array, shape = [n_samples]\n Ground truth (correct) target values.\n\n y_pred : array, shape = [n_samples]\n Estimated targets as returned by a classifier.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n .. versionadded:: 0.18\n\n Returns\n -------\n mcc : float\n The Matthews correlation coefficient (+1 represents a perfect\n prediction, 0 an average random prediction and -1 and inverse\n prediction).\n\n References\n ----------\n .. [1] `Baldi, Brunak, Chauvin, Andersen and Nielsen, (2000). Assessing the\n accuracy of prediction algorithms for classification: an overview\n `_.\n\n .. [2] `Wikipedia entry for the Matthews Correlation Coefficient\n `_.\n\n .. [3] `Gorodkin, (2004). Comparing two K-category assignments by a\n K-category correlation coefficient\n `_.\n\n .. [4] `Jurman, Riccadonna, Furlanello, (2012). A Comparison of MCC and CEN\n Error Measures in MultiClass Prediction\n `_.\n\n Examples\n --------\n >>> from sklearn.metrics import matthews_corrcoef\n >>> y_true = [+1, +1, +1, -1]\n >>> y_pred = [+1, -1, +1, +1]\n >>> matthews_corrcoef(y_true, y_pred)\n -0.33...\n ", "source_code": "\ndef matthews_corrcoef(y_true, y_pred, *, sample_weight=None):\n \"\"\"Compute the Matthews correlation coefficient (MCC).\n\n The Matthews correlation coefficient is used in machine learning as a\n measure of the quality of binary and multiclass classifications. It takes\n into account true and false positives and negatives and is generally\n regarded as a balanced measure which can be used even if the classes are of\n very different sizes. The MCC is in essence a correlation coefficient value\n between -1 and +1. A coefficient of +1 represents a perfect prediction, 0\n an average random prediction and -1 an inverse prediction. The statistic\n is also known as the phi coefficient. [source: Wikipedia]\n\n Binary and multiclass labels are supported. 
Only in the binary case does\n this relate to information about true and false positives and negatives.\n See references below.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n y_true : array, shape = [n_samples]\n Ground truth (correct) target values.\n\n y_pred : array, shape = [n_samples]\n Estimated targets as returned by a classifier.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n .. versionadded:: 0.18\n\n Returns\n -------\n mcc : float\n The Matthews correlation coefficient (+1 represents a perfect\n prediction, 0 an average random prediction and -1 and inverse\n prediction).\n\n References\n ----------\n .. [1] `Baldi, Brunak, Chauvin, Andersen and Nielsen, (2000). Assessing the\n accuracy of prediction algorithms for classification: an overview\n `_.\n\n .. [2] `Wikipedia entry for the Matthews Correlation Coefficient\n `_.\n\n .. [3] `Gorodkin, (2004). Comparing two K-category assignments by a\n K-category correlation coefficient\n `_.\n\n .. [4] `Jurman, Riccadonna, Furlanello, (2012). A Comparison of MCC and CEN\n Error Measures in MultiClass Prediction\n `_.\n\n Examples\n --------\n >>> from sklearn.metrics import matthews_corrcoef\n >>> y_true = [+1, +1, +1, -1]\n >>> y_pred = [+1, -1, +1, +1]\n >>> matthews_corrcoef(y_true, y_pred)\n -0.33...\n \"\"\"\n (y_type, y_true, y_pred) = _check_targets(y_true, y_pred)\n check_consistent_length(y_true, y_pred, sample_weight)\n if y_type not in {'binary', 'multiclass'}:\n raise ValueError('%s is not supported' % y_type)\n lb = LabelEncoder()\n lb.fit(np.hstack([y_true, y_pred]))\n y_true = lb.transform(y_true)\n y_pred = lb.transform(y_pred)\n C = confusion_matrix(y_true, y_pred, sample_weight=sample_weight)\n t_sum = C.sum(axis=1, dtype=np.float64)\n p_sum = C.sum(axis=0, dtype=np.float64)\n n_correct = np.trace(C, dtype=np.float64)\n n_samples = p_sum.sum()\n cov_ytyp = n_correct * n_samples - np.dot(t_sum, p_sum)\n cov_ypyp = n_samples**2 - np.dot(p_sum, p_sum)\n cov_ytyt = n_samples**2 - np.dot(t_sum, t_sum)\n if cov_ypyp * cov_ytyt == 0:\n return 0.0\n else:\n return cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)" }, { @@ -117846,6 +126600,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_outputs) or (n_samples,)", "description": "Ground truth (correct) target values." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -117856,6 +126614,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_outputs) or (n_samples,)", "description": "Estimated targets as returned by a classifier." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -117866,7 +126628,8 @@ "docstring": { "type": "array-like of shape (n_samples,), default=None", "description": "Sample weights." - } + }, + "refined_type": {} }, { "name": "labels", @@ -117876,7 +126639,8 @@ "docstring": { "type": "array-like of shape (n_classes,), default=None", "description": "A list of classes or column indices to select some (or to force\ninclusion of classes absent from the data)." - } + }, + "refined_type": {} }, { "name": "samplewise", @@ -117886,13 +126650,14 @@ "docstring": { "type": "bool, default=False", "description": "In the multilabel case, this calculates a confusion matrix per sample." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Compute a confusion matrix for each class or sample.\n\n.. 
versionadded:: 0.21 Compute class-wise (default) or sample-wise (samplewise=True) multilabel confusion matrix to evaluate the accuracy of a classification, and output confusion matrices for each class or sample. In multilabel confusion matrix :math:`MCM`, the count of true negatives is :math:`MCM_{:,0,0}`, false negatives is :math:`MCM_{:,1,0}`, true positives is :math:`MCM_{:,1,1}` and false positives is :math:`MCM_{:,0,1}`. Multiclass data will be treated as if binarized under a one-vs-rest transformation. Returned confusion matrices will be in the order of sorted unique labels in the union of (y_true, y_pred). Read more in the :ref:`User Guide `.", - "docstring": "Compute a confusion matrix for each class or sample.\n\n.. versionadded:: 0.21\n\nCompute class-wise (default) or sample-wise (samplewise=True) multilabel\nconfusion matrix to evaluate the accuracy of a classification, and output\nconfusion matrices for each class or sample.\n\nIn multilabel confusion matrix :math:`MCM`, the count of true negatives\nis :math:`MCM_{:,0,0}`, false negatives is :math:`MCM_{:,1,0}`,\ntrue positives is :math:`MCM_{:,1,1}` and false positives is\n:math:`MCM_{:,0,1}`.\n\nMulticlass data will be treated as if binarized under a one-vs-rest\ntransformation. Returned confusion matrices will be in the order of\nsorted unique labels in the union of (y_true, y_pred).\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\ny_true : {array-like, sparse matrix} of shape (n_samples, n_outputs) or (n_samples,)\n Ground truth (correct) target values.\n\ny_pred : {array-like, sparse matrix} of shape (n_samples, n_outputs) or (n_samples,)\n Estimated targets as returned by a classifier.\n\nsample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\nlabels : array-like of shape (n_classes,), default=None\n A list of classes or column indices to select some (or to force\n inclusion of classes absent from the data).\n\nsamplewise : bool, default=False\n In the multilabel case, this calculates a confusion matrix per sample.\n\nReturns\n-------\nmulti_confusion : ndarray of shape (n_outputs, 2, 2)\n A 2x2 confusion matrix corresponding to each output in the input.\n When calculating class-wise multi_confusion (default), then\n n_outputs = n_labels; when calculating sample-wise multi_confusion\n (samplewise=True), n_outputs = n_samples. If ``labels`` is defined,\n the results will be returned in the order specified in ``labels``,\n otherwise the results will be returned in sorted order by default.\n\nSee Also\n--------\nconfusion_matrix : Compute confusion matrix to evaluate the accuracy of a\n classifier.\n\nNotes\n-----\nThe `multilabel_confusion_matrix` calculates class-wise or sample-wise\nmultilabel confusion matrices, and in multiclass tasks, labels are\nbinarized under a one-vs-rest way; while\n:func:`~sklearn.metrics.confusion_matrix` calculates one confusion matrix\nfor confusion between every two classes.\n\nExamples\n--------\nMultilabel-indicator case:\n\n>>> import numpy as np\n>>> from sklearn.metrics import multilabel_confusion_matrix\n>>> y_true = np.array([[1, 0, 1],\n... [0, 1, 0]])\n>>> y_pred = np.array([[1, 0, 0],\n... [0, 1, 1]])\n>>> multilabel_confusion_matrix(y_true, y_pred)\narray([[[1, 0],\n [0, 1]],\n\n [[1, 0],\n [0, 1]],\n\n [[0, 1],\n [1, 0]]])\n\nMulticlass case:\n\n>>> y_true = [\"cat\", \"ant\", \"cat\", \"cat\", \"ant\", \"bird\"]\n>>> y_pred = [\"ant\", \"ant\", \"cat\", \"cat\", \"ant\", \"cat\"]\n>>> multilabel_confusion_matrix(y_true, y_pred,\n... 
labels=[\"ant\", \"bird\", \"cat\"])\narray([[[3, 1],\n [0, 2]],\n\n [[5, 0],\n [1, 0]],\n\n [[2, 1],\n [1, 2]]])", + "description": "Compute a confusion matrix for each class or sample.\n\n.. versionadded:: 0.21\n\nCompute class-wise (default) or sample-wise (samplewise=True) multilabel\nconfusion matrix to evaluate the accuracy of a classification, and output\nconfusion matrices for each class or sample.\n\nIn multilabel confusion matrix :math:`MCM`, the count of true negatives\nis :math:`MCM_{:,0,0}`, false negatives is :math:`MCM_{:,1,0}`,\ntrue positives is :math:`MCM_{:,1,1}` and false positives is\n:math:`MCM_{:,0,1}`.\n\nMulticlass data will be treated as if binarized under a one-vs-rest\ntransformation. Returned confusion matrices will be in the order of\nsorted unique labels in the union of (y_true, y_pred).\n\nRead more in the :ref:`User Guide `.", + "docstring": "Compute a confusion matrix for each class or sample.\n\n .. versionadded:: 0.21\n\n Compute class-wise (default) or sample-wise (samplewise=True) multilabel\n confusion matrix to evaluate the accuracy of a classification, and output\n confusion matrices for each class or sample.\n\n In multilabel confusion matrix :math:`MCM`, the count of true negatives\n is :math:`MCM_{:,0,0}`, false negatives is :math:`MCM_{:,1,0}`,\n true positives is :math:`MCM_{:,1,1}` and false positives is\n :math:`MCM_{:,0,1}`.\n\n Multiclass data will be treated as if binarized under a one-vs-rest\n transformation. Returned confusion matrices will be in the order of\n sorted unique labels in the union of (y_true, y_pred).\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n y_true : {array-like, sparse matrix} of shape (n_samples, n_outputs) or (n_samples,)\n Ground truth (correct) target values.\n\n y_pred : {array-like, sparse matrix} of shape (n_samples, n_outputs) or (n_samples,)\n Estimated targets as returned by a classifier.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n labels : array-like of shape (n_classes,), default=None\n A list of classes or column indices to select some (or to force\n inclusion of classes absent from the data).\n\n samplewise : bool, default=False\n In the multilabel case, this calculates a confusion matrix per sample.\n\n Returns\n -------\n multi_confusion : ndarray of shape (n_outputs, 2, 2)\n A 2x2 confusion matrix corresponding to each output in the input.\n When calculating class-wise multi_confusion (default), then\n n_outputs = n_labels; when calculating sample-wise multi_confusion\n (samplewise=True), n_outputs = n_samples. If ``labels`` is defined,\n the results will be returned in the order specified in ``labels``,\n otherwise the results will be returned in sorted order by default.\n\n See Also\n --------\n confusion_matrix : Compute confusion matrix to evaluate the accuracy of a\n classifier.\n\n Notes\n -----\n The `multilabel_confusion_matrix` calculates class-wise or sample-wise\n multilabel confusion matrices, and in multiclass tasks, labels are\n binarized under a one-vs-rest way; while\n :func:`~sklearn.metrics.confusion_matrix` calculates one confusion matrix\n for confusion between every two classes.\n\n Examples\n --------\n Multilabel-indicator case:\n\n >>> import numpy as np\n >>> from sklearn.metrics import multilabel_confusion_matrix\n >>> y_true = np.array([[1, 0, 1],\n ... [0, 1, 0]])\n >>> y_pred = np.array([[1, 0, 0],\n ... 
[0, 1, 1]])\n >>> multilabel_confusion_matrix(y_true, y_pred)\n array([[[1, 0],\n [0, 1]],\n \n [[1, 0],\n [0, 1]],\n \n [[0, 1],\n [1, 0]]])\n\n Multiclass case:\n\n >>> y_true = [\"cat\", \"ant\", \"cat\", \"cat\", \"ant\", \"bird\"]\n >>> y_pred = [\"ant\", \"ant\", \"cat\", \"cat\", \"ant\", \"cat\"]\n >>> multilabel_confusion_matrix(y_true, y_pred,\n ... labels=[\"ant\", \"bird\", \"cat\"])\n array([[[3, 1],\n [0, 2]],\n \n [[5, 0],\n [1, 0]],\n \n [[2, 1],\n [1, 2]]])\n ", "source_code": "\ndef multilabel_confusion_matrix(y_true, y_pred, *, sample_weight=None, labels=None, samplewise=False):\n \"\"\"Compute a confusion matrix for each class or sample.\n\n .. versionadded:: 0.21\n\n Compute class-wise (default) or sample-wise (samplewise=True) multilabel\n confusion matrix to evaluate the accuracy of a classification, and output\n confusion matrices for each class or sample.\n\n In multilabel confusion matrix :math:`MCM`, the count of true negatives\n is :math:`MCM_{:,0,0}`, false negatives is :math:`MCM_{:,1,0}`,\n true positives is :math:`MCM_{:,1,1}` and false positives is\n :math:`MCM_{:,0,1}`.\n\n Multiclass data will be treated as if binarized under a one-vs-rest\n transformation. Returned confusion matrices will be in the order of\n sorted unique labels in the union of (y_true, y_pred).\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n y_true : {array-like, sparse matrix} of shape (n_samples, n_outputs) or (n_samples,)\n Ground truth (correct) target values.\n\n y_pred : {array-like, sparse matrix} of shape (n_samples, n_outputs) or (n_samples,)\n Estimated targets as returned by a classifier.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n labels : array-like of shape (n_classes,), default=None\n A list of classes or column indices to select some (or to force\n inclusion of classes absent from the data).\n\n samplewise : bool, default=False\n In the multilabel case, this calculates a confusion matrix per sample.\n\n Returns\n -------\n multi_confusion : ndarray of shape (n_outputs, 2, 2)\n A 2x2 confusion matrix corresponding to each output in the input.\n When calculating class-wise multi_confusion (default), then\n n_outputs = n_labels; when calculating sample-wise multi_confusion\n (samplewise=True), n_outputs = n_samples. If ``labels`` is defined,\n the results will be returned in the order specified in ``labels``,\n otherwise the results will be returned in sorted order by default.\n\n See Also\n --------\n confusion_matrix : Compute confusion matrix to evaluate the accuracy of a\n classifier.\n\n Notes\n -----\n The `multilabel_confusion_matrix` calculates class-wise or sample-wise\n multilabel confusion matrices, and in multiclass tasks, labels are\n binarized under a one-vs-rest way; while\n :func:`~sklearn.metrics.confusion_matrix` calculates one confusion matrix\n for confusion between every two classes.\n\n Examples\n --------\n Multilabel-indicator case:\n\n >>> import numpy as np\n >>> from sklearn.metrics import multilabel_confusion_matrix\n >>> y_true = np.array([[1, 0, 1],\n ... [0, 1, 0]])\n >>> y_pred = np.array([[1, 0, 0],\n ... [0, 1, 1]])\n >>> multilabel_confusion_matrix(y_true, y_pred)\n array([[[1, 0],\n [0, 1]],\n \n [[1, 0],\n [0, 1]],\n \n [[0, 1],\n [1, 0]]])\n\n Multiclass case:\n\n >>> y_true = [\"cat\", \"ant\", \"cat\", \"cat\", \"ant\", \"bird\"]\n >>> y_pred = [\"ant\", \"ant\", \"cat\", \"cat\", \"ant\", \"cat\"]\n >>> multilabel_confusion_matrix(y_true, y_pred,\n ... 
labels=[\"ant\", \"bird\", \"cat\"])\n array([[[3, 1],\n [0, 2]],\n \n [[5, 0],\n [1, 0]],\n \n [[2, 1],\n [1, 2]]])\n \"\"\"\n (y_type, y_true, y_pred) = _check_targets(y_true, y_pred)\n if sample_weight is not None:\n sample_weight = column_or_1d(sample_weight)\n check_consistent_length(y_true, y_pred, sample_weight)\n if y_type not in ('binary', 'multiclass', 'multilabel-indicator'):\n raise ValueError('%s is not supported' % y_type)\n present_labels = unique_labels(y_true, y_pred)\n if labels is None:\n labels = present_labels\n n_labels = None\n else:\n n_labels = len(labels)\n labels = np.hstack([labels, np.setdiff1d(present_labels, labels, assume_unique=True)])\n if y_true.ndim == 1:\n if samplewise:\n raise ValueError('Samplewise metrics are not available outside of multilabel classification.')\n le = LabelEncoder()\n le.fit(labels)\n y_true = le.transform(y_true)\n y_pred = le.transform(y_pred)\n sorted_labels = le.classes_\n tp = y_true == y_pred\n tp_bins = y_true[tp]\n if sample_weight is not None:\n tp_bins_weights = np.asarray(sample_weight)[tp]\n else:\n tp_bins_weights = None\n if len(tp_bins):\n tp_sum = np.bincount(tp_bins, weights=tp_bins_weights, minlength=len(labels))\n else:\n true_sum = pred_sum = tp_sum = np.zeros(len(labels))\n if len(y_pred):\n pred_sum = np.bincount(y_pred, weights=sample_weight, minlength=len(labels))\n if len(y_true):\n true_sum = np.bincount(y_true, weights=sample_weight, minlength=len(labels))\n indices = np.searchsorted(sorted_labels, labels[:n_labels])\n tp_sum = tp_sum[indices]\n true_sum = true_sum[indices]\n pred_sum = pred_sum[indices]\n else:\n sum_axis = 1 if samplewise else 0\n if not np.array_equal(labels, present_labels):\n if np.max(labels) > np.max(present_labels):\n raise ValueError('All labels must be in [0, n labels) for multilabel targets. Got %d > %d' % (np.max(labels), np.max(present_labels)))\n if np.min(labels) < 0:\n raise ValueError('All labels must be in [0, n labels) for multilabel targets. Got %d < 0' % np.min(labels))\n if n_labels is not None:\n y_true = y_true[:, labels[:n_labels]]\n y_pred = y_pred[:, labels[:n_labels]]\n true_and_pred = y_true.multiply(y_pred)\n tp_sum = count_nonzero(true_and_pred, axis=sum_axis, sample_weight=sample_weight)\n pred_sum = count_nonzero(y_pred, axis=sum_axis, sample_weight=sample_weight)\n true_sum = count_nonzero(y_true, axis=sum_axis, sample_weight=sample_weight)\n fp = pred_sum - tp_sum\n fn = true_sum - tp_sum\n tp = tp_sum\n if sample_weight is not None and samplewise:\n sample_weight = np.array(sample_weight)\n tp = np.array(tp)\n fp = np.array(fp)\n fn = np.array(fn)\n tn = sample_weight * y_true.shape[1] - tp - fp - fn\n elif sample_weight is not None:\n tn = sum(sample_weight) - tp - fp - fn\n elif samplewise:\n tn = y_true.shape[1] - tp - fp - fn\n else:\n tn = y_true.shape[0] - tp - fp - fn\n return np.array([tn, fp, fn, tp]).T.reshape(-1, 2, 2)" }, { @@ -117910,7 +126675,8 @@ "docstring": { "type": "1d array-like, or label indicator array / sparse matrix", "description": "Ground truth (correct) target values." - } + }, + "refined_type": {} }, { "name": "y_pred", @@ -117920,7 +126686,8 @@ "docstring": { "type": "1d array-like, or label indicator array / sparse matrix", "description": "Estimated targets as returned by a classifier." - } + }, + "refined_type": {} }, { "name": "beta", @@ -117930,7 +126697,8 @@ "docstring": { "type": "float, default=1.0", "description": "The strength of recall versus precision in the F-score." 
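Editor's note: the multilabel_confusion_matrix record above returns one 2x2 block per class in the layout [[tn, fp], [fn, tp]], and the matthews_corrcoef record is computed from the same confusion counts. The sketch below, not part of the generated api-editor data, unpacks those counts for an arbitrary toy binary example and checks the binary MCC identity (tp*tn - fp*fn) / sqrt((tp+fp)(tp+fn)(tn+fp)(tn+fn)) against the library function; it assumes NumPy and scikit-learn are installed.

# hedged sketch: per-class confusion counts and binary MCC from them
import numpy as np
from sklearn.metrics import multilabel_confusion_matrix, matthews_corrcoef

y_true = [1, 1, 1, 0, 0, 1]
y_pred = [1, 0, 1, 0, 1, 1]

# one 2x2 block per sorted label; block 1 belongs to the positive class
mcm = multilabel_confusion_matrix(y_true, y_pred)
tn, fp, fn, tp = mcm[1].ravel()               # layout [[tn, fp], [fn, tp]]

manual = (tp * tn - fp * fn) / np.sqrt(
    (tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)
)
assert np.isclose(manual, matthews_corrcoef(y_true, y_pred))  # 0.25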
- } + }, + "refined_type": {} }, { "name": "labels", @@ -117940,7 +126708,8 @@ "docstring": { "type": "array-like, default=None", "description": "The set of labels to include when ``average != 'binary'``, and their\norder if ``average is None``. Labels present in the data can be\nexcluded, for example to calculate a multiclass average ignoring a\nmajority negative class, while labels not present in the data will\nresult in 0 components in a macro average. For multilabel targets,\nlabels are column indices. By default, all labels in ``y_true`` and\n``y_pred`` are used in sorted order." - } + }, + "refined_type": {} }, { "name": "pos_label", @@ -117950,7 +126719,8 @@ "docstring": { "type": "str or int, default=1", "description": "The class to report if ``average='binary'`` and the data is binary.\nIf the data are multiclass or multilabel, this will be ignored;\nsetting ``labels=[pos_label]`` and ``average != 'binary'`` will report\nscores for that label only." - } + }, + "refined_type": {} }, { "name": "average", @@ -117960,6 +126730,16 @@ "docstring": { "type": "{'binary', 'micro', 'macro', 'samples','weighted'}, default=None", "description": "If ``None``, the scores for each class are returned. Otherwise, this\ndetermines the type of averaging performed on the data:\n\n``'binary'``:\n Only report results for the class specified by ``pos_label``.\n This is applicable only if targets (``y_{true,pred}``) are binary.\n``'micro'``:\n Calculate metrics globally by counting the total true positives,\n false negatives and false positives.\n``'macro'``:\n Calculate metrics for each label, and find their unweighted\n mean. This does not take label imbalance into account.\n``'weighted'``:\n Calculate metrics for each label, and find their average weighted\n by support (the number of true instances for each label). This\n alters 'macro' to account for label imbalance; it can result in an\n F-score that is not between precision and recall.\n``'samples'``:\n Calculate metrics for each instance, and find their average (only\n meaningful for multilabel classification where this differs from\n :func:`accuracy_score`)." + }, + "refined_type": { + "kind": "EnumType", + "values": [ + "samples", + "weighted", + "micro", + "macro", + "binary" + ] } }, { @@ -117970,7 +126750,8 @@ "docstring": { "type": "tuple or set, for internal use", "description": "This determines which warnings will be made in the case that this\nfunction is being used to return only one of its metrics." - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -117980,7 +126761,8 @@ "docstring": { "type": "array-like of shape (n_samples,), default=None", "description": "Sample weights." - } + }, + "refined_type": {} }, { "name": "zero_division", @@ -117990,13 +126772,14 @@ "docstring": { "type": "\"warn\", 0 or 1, default=\"warn\"", "description": "Sets the value to return when there is a zero division:\n - recall: when there are no positive labels\n - precision: when there are no positive predictions\n - f-score: both\n\nIf set to \"warn\", this acts as 0, but warnings are also raised." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Compute precision, recall, F-measure and support for each class.\n\nThe precision is the ratio ``tp / (tp + fp)`` where ``tp`` is the number of true positives and ``fp`` the number of false positives. The precision is intuitively the ability of the classifier not to label as positive a sample that is negative. 
The recall is the ratio ``tp / (tp + fn)`` where ``tp`` is the number of true positives and ``fn`` the number of false negatives. The recall is intuitively the ability of the classifier to find all the positive samples. The F-beta score can be interpreted as a weighted harmonic mean of the precision and recall, where an F-beta score reaches its best value at 1 and worst score at 0. The F-beta score weights recall more than precision by a factor of ``beta``. ``beta == 1.0`` means recall and precision are equally important. The support is the number of occurrences of each class in ``y_true``. If ``pos_label is None`` and in binary classification, this function returns the average precision, recall and F-measure if ``average`` is one of ``'micro'``, ``'macro'``, ``'weighted'`` or ``'samples'``. Read more in the :ref:`User Guide `.", - "docstring": "Compute precision, recall, F-measure and support for each class.\n\nThe precision is the ratio ``tp / (tp + fp)`` where ``tp`` is the number of\ntrue positives and ``fp`` the number of false positives. The precision is\nintuitively the ability of the classifier not to label as positive a sample\nthat is negative.\n\nThe recall is the ratio ``tp / (tp + fn)`` where ``tp`` is the number of\ntrue positives and ``fn`` the number of false negatives. The recall is\nintuitively the ability of the classifier to find all the positive samples.\n\nThe F-beta score can be interpreted as a weighted harmonic mean of\nthe precision and recall, where an F-beta score reaches its best\nvalue at 1 and worst score at 0.\n\nThe F-beta score weights recall more than precision by a factor of\n``beta``. ``beta == 1.0`` means recall and precision are equally important.\n\nThe support is the number of occurrences of each class in ``y_true``.\n\nIf ``pos_label is None`` and in binary classification, this function\nreturns the average precision, recall and F-measure if ``average``\nis one of ``'micro'``, ``'macro'``, ``'weighted'`` or ``'samples'``.\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\ny_true : 1d array-like, or label indicator array / sparse matrix\n Ground truth (correct) target values.\n\ny_pred : 1d array-like, or label indicator array / sparse matrix\n Estimated targets as returned by a classifier.\n\nbeta : float, default=1.0\n The strength of recall versus precision in the F-score.\n\nlabels : array-like, default=None\n The set of labels to include when ``average != 'binary'``, and their\n order if ``average is None``. Labels present in the data can be\n excluded, for example to calculate a multiclass average ignoring a\n majority negative class, while labels not present in the data will\n result in 0 components in a macro average. For multilabel targets,\n labels are column indices. By default, all labels in ``y_true`` and\n ``y_pred`` are used in sorted order.\n\npos_label : str or int, default=1\n The class to report if ``average='binary'`` and the data is binary.\n If the data are multiclass or multilabel, this will be ignored;\n setting ``labels=[pos_label]`` and ``average != 'binary'`` will report\n scores for that label only.\n\naverage : {'binary', 'micro', 'macro', 'samples','weighted'}, default=None\n If ``None``, the scores for each class are returned. 
Otherwise, this\n determines the type of averaging performed on the data:\n\n ``'binary'``:\n Only report results for the class specified by ``pos_label``.\n This is applicable only if targets (``y_{true,pred}``) are binary.\n ``'micro'``:\n Calculate metrics globally by counting the total true positives,\n false negatives and false positives.\n ``'macro'``:\n Calculate metrics for each label, and find their unweighted\n mean. This does not take label imbalance into account.\n ``'weighted'``:\n Calculate metrics for each label, and find their average weighted\n by support (the number of true instances for each label). This\n alters 'macro' to account for label imbalance; it can result in an\n F-score that is not between precision and recall.\n ``'samples'``:\n Calculate metrics for each instance, and find their average (only\n meaningful for multilabel classification where this differs from\n :func:`accuracy_score`).\n\nwarn_for : tuple or set, for internal use\n This determines which warnings will be made in the case that this\n function is being used to return only one of its metrics.\n\nsample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\nzero_division : \"warn\", 0 or 1, default=\"warn\"\n Sets the value to return when there is a zero division:\n - recall: when there are no positive labels\n - precision: when there are no positive predictions\n - f-score: both\n\n If set to \"warn\", this acts as 0, but warnings are also raised.\n\nReturns\n-------\nprecision : float (if average is not None) or array of float, shape = [n_unique_labels]\n\nrecall : float (if average is not None) or array of float, shape = [n_unique_labels]\n\nfbeta_score : float (if average is not None) or array of float, shape = [n_unique_labels]\n\nsupport : None (if average is not None) or array of int, shape = [n_unique_labels]\n The number of occurrences of each label in ``y_true``.\n\nNotes\n-----\nWhen ``true positive + false positive == 0``, precision is undefined.\nWhen ``true positive + false negative == 0``, recall is undefined.\nIn such cases, by default the metric will be set to 0, as will f-score,\nand ``UndefinedMetricWarning`` will be raised. This behavior can be\nmodified with ``zero_division``.\n\nReferences\n----------\n.. [1] `Wikipedia entry for the Precision and recall\n `_.\n\n.. [2] `Wikipedia entry for the F1-score\n `_.\n\n.. [3] `Discriminative Methods for Multi-labeled Classification Advances\n in Knowledge Discovery and Data Mining (2004), pp. 22-30 by Shantanu\n Godbole, Sunita Sarawagi\n `_.\n\nExamples\n--------\n>>> import numpy as np\n>>> from sklearn.metrics import precision_recall_fscore_support\n>>> y_true = np.array(['cat', 'dog', 'pig', 'cat', 'dog', 'pig'])\n>>> y_pred = np.array(['cat', 'pig', 'dog', 'cat', 'cat', 'dog'])\n>>> precision_recall_fscore_support(y_true, y_pred, average='macro')\n(0.22..., 0.33..., 0.26..., None)\n>>> precision_recall_fscore_support(y_true, y_pred, average='micro')\n(0.33..., 0.33..., 0.33..., None)\n>>> precision_recall_fscore_support(y_true, y_pred, average='weighted')\n(0.22..., 0.33..., 0.26..., None)\n\nIt is possible to compute per-label precisions, recalls, F1-scores and\nsupports instead of averaging:\n\n>>> precision_recall_fscore_support(y_true, y_pred, average=None,\n... labels=['pig', 'dog', 'cat'])\n(array([0. , 0. , 0.66...]),\n array([0., 0., 1.]), array([0. , 0. 
, 0.8]),\n array([2, 2, 2]))", + "description": "Compute precision, recall, F-measure and support for each class.\n\nThe precision is the ratio ``tp / (tp + fp)`` where ``tp`` is the number of\ntrue positives and ``fp`` the number of false positives. The precision is\nintuitively the ability of the classifier not to label as positive a sample\nthat is negative.\n\nThe recall is the ratio ``tp / (tp + fn)`` where ``tp`` is the number of\ntrue positives and ``fn`` the number of false negatives. The recall is\nintuitively the ability of the classifier to find all the positive samples.\n\nThe F-beta score can be interpreted as a weighted harmonic mean of\nthe precision and recall, where an F-beta score reaches its best\nvalue at 1 and worst score at 0.\n\nThe F-beta score weights recall more than precision by a factor of\n``beta``. ``beta == 1.0`` means recall and precision are equally important.\n\nThe support is the number of occurrences of each class in ``y_true``.\n\nIf ``pos_label is None`` and in binary classification, this function\nreturns the average precision, recall and F-measure if ``average``\nis one of ``'micro'``, ``'macro'``, ``'weighted'`` or ``'samples'``.\n\nRead more in the :ref:`User Guide `.", + "docstring": "Compute precision, recall, F-measure and support for each class.\n\n The precision is the ratio ``tp / (tp + fp)`` where ``tp`` is the number of\n true positives and ``fp`` the number of false positives. The precision is\n intuitively the ability of the classifier not to label as positive a sample\n that is negative.\n\n The recall is the ratio ``tp / (tp + fn)`` where ``tp`` is the number of\n true positives and ``fn`` the number of false negatives. The recall is\n intuitively the ability of the classifier to find all the positive samples.\n\n The F-beta score can be interpreted as a weighted harmonic mean of\n the precision and recall, where an F-beta score reaches its best\n value at 1 and worst score at 0.\n\n The F-beta score weights recall more than precision by a factor of\n ``beta``. ``beta == 1.0`` means recall and precision are equally important.\n\n The support is the number of occurrences of each class in ``y_true``.\n\n If ``pos_label is None`` and in binary classification, this function\n returns the average precision, recall and F-measure if ``average``\n is one of ``'micro'``, ``'macro'``, ``'weighted'`` or ``'samples'``.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n y_true : 1d array-like, or label indicator array / sparse matrix\n Ground truth (correct) target values.\n\n y_pred : 1d array-like, or label indicator array / sparse matrix\n Estimated targets as returned by a classifier.\n\n beta : float, default=1.0\n The strength of recall versus precision in the F-score.\n\n labels : array-like, default=None\n The set of labels to include when ``average != 'binary'``, and their\n order if ``average is None``. Labels present in the data can be\n excluded, for example to calculate a multiclass average ignoring a\n majority negative class, while labels not present in the data will\n result in 0 components in a macro average. For multilabel targets,\n labels are column indices. 
By default, all labels in ``y_true`` and\n ``y_pred`` are used in sorted order.\n\n pos_label : str or int, default=1\n The class to report if ``average='binary'`` and the data is binary.\n If the data are multiclass or multilabel, this will be ignored;\n setting ``labels=[pos_label]`` and ``average != 'binary'`` will report\n scores for that label only.\n\n average : {'binary', 'micro', 'macro', 'samples','weighted'}, default=None\n If ``None``, the scores for each class are returned. Otherwise, this\n determines the type of averaging performed on the data:\n\n ``'binary'``:\n Only report results for the class specified by ``pos_label``.\n This is applicable only if targets (``y_{true,pred}``) are binary.\n ``'micro'``:\n Calculate metrics globally by counting the total true positives,\n false negatives and false positives.\n ``'macro'``:\n Calculate metrics for each label, and find their unweighted\n mean. This does not take label imbalance into account.\n ``'weighted'``:\n Calculate metrics for each label, and find their average weighted\n by support (the number of true instances for each label). This\n alters 'macro' to account for label imbalance; it can result in an\n F-score that is not between precision and recall.\n ``'samples'``:\n Calculate metrics for each instance, and find their average (only\n meaningful for multilabel classification where this differs from\n :func:`accuracy_score`).\n\n warn_for : tuple or set, for internal use\n This determines which warnings will be made in the case that this\n function is being used to return only one of its metrics.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n zero_division : \"warn\", 0 or 1, default=\"warn\"\n Sets the value to return when there is a zero division:\n - recall: when there are no positive labels\n - precision: when there are no positive predictions\n - f-score: both\n\n If set to \"warn\", this acts as 0, but warnings are also raised.\n\n Returns\n -------\n precision : float (if average is not None) or array of float, shape = [n_unique_labels]\n\n recall : float (if average is not None) or array of float, shape = [n_unique_labels]\n\n fbeta_score : float (if average is not None) or array of float, shape = [n_unique_labels]\n\n support : None (if average is not None) or array of int, shape = [n_unique_labels]\n The number of occurrences of each label in ``y_true``.\n\n Notes\n -----\n When ``true positive + false positive == 0``, precision is undefined.\n When ``true positive + false negative == 0``, recall is undefined.\n In such cases, by default the metric will be set to 0, as will f-score,\n and ``UndefinedMetricWarning`` will be raised. This behavior can be\n modified with ``zero_division``.\n\n References\n ----------\n .. [1] `Wikipedia entry for the Precision and recall\n `_.\n\n .. [2] `Wikipedia entry for the F1-score\n `_.\n\n .. [3] `Discriminative Methods for Multi-labeled Classification Advances\n in Knowledge Discovery and Data Mining (2004), pp. 
22-30 by Shantanu\n Godbole, Sunita Sarawagi\n `_.\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.metrics import precision_recall_fscore_support\n >>> y_true = np.array(['cat', 'dog', 'pig', 'cat', 'dog', 'pig'])\n >>> y_pred = np.array(['cat', 'pig', 'dog', 'cat', 'cat', 'dog'])\n >>> precision_recall_fscore_support(y_true, y_pred, average='macro')\n (0.22..., 0.33..., 0.26..., None)\n >>> precision_recall_fscore_support(y_true, y_pred, average='micro')\n (0.33..., 0.33..., 0.33..., None)\n >>> precision_recall_fscore_support(y_true, y_pred, average='weighted')\n (0.22..., 0.33..., 0.26..., None)\n\n It is possible to compute per-label precisions, recalls, F1-scores and\n supports instead of averaging:\n\n >>> precision_recall_fscore_support(y_true, y_pred, average=None,\n ... labels=['pig', 'dog', 'cat'])\n (array([0. , 0. , 0.66...]),\n array([0., 0., 1.]), array([0. , 0. , 0.8]),\n array([2, 2, 2]))\n ", "source_code": "\ndef precision_recall_fscore_support(y_true, y_pred, *, beta=1.0, labels=None, pos_label=1, average=None, warn_for=('precision', 'recall', 'f-score'), sample_weight=None, zero_division='warn'):\n \"\"\"Compute precision, recall, F-measure and support for each class.\n\n The precision is the ratio ``tp / (tp + fp)`` where ``tp`` is the number of\n true positives and ``fp`` the number of false positives. The precision is\n intuitively the ability of the classifier not to label as positive a sample\n that is negative.\n\n The recall is the ratio ``tp / (tp + fn)`` where ``tp`` is the number of\n true positives and ``fn`` the number of false negatives. The recall is\n intuitively the ability of the classifier to find all the positive samples.\n\n The F-beta score can be interpreted as a weighted harmonic mean of\n the precision and recall, where an F-beta score reaches its best\n value at 1 and worst score at 0.\n\n The F-beta score weights recall more than precision by a factor of\n ``beta``. ``beta == 1.0`` means recall and precision are equally important.\n\n The support is the number of occurrences of each class in ``y_true``.\n\n If ``pos_label is None`` and in binary classification, this function\n returns the average precision, recall and F-measure if ``average``\n is one of ``'micro'``, ``'macro'``, ``'weighted'`` or ``'samples'``.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n y_true : 1d array-like, or label indicator array / sparse matrix\n Ground truth (correct) target values.\n\n y_pred : 1d array-like, or label indicator array / sparse matrix\n Estimated targets as returned by a classifier.\n\n beta : float, default=1.0\n The strength of recall versus precision in the F-score.\n\n labels : array-like, default=None\n The set of labels to include when ``average != 'binary'``, and their\n order if ``average is None``. Labels present in the data can be\n excluded, for example to calculate a multiclass average ignoring a\n majority negative class, while labels not present in the data will\n result in 0 components in a macro average. For multilabel targets,\n labels are column indices. 
By default, all labels in ``y_true`` and\n ``y_pred`` are used in sorted order.\n\n pos_label : str or int, default=1\n The class to report if ``average='binary'`` and the data is binary.\n If the data are multiclass or multilabel, this will be ignored;\n setting ``labels=[pos_label]`` and ``average != 'binary'`` will report\n scores for that label only.\n\n average : {'binary', 'micro', 'macro', 'samples','weighted'}, default=None\n If ``None``, the scores for each class are returned. Otherwise, this\n determines the type of averaging performed on the data:\n\n ``'binary'``:\n Only report results for the class specified by ``pos_label``.\n This is applicable only if targets (``y_{true,pred}``) are binary.\n ``'micro'``:\n Calculate metrics globally by counting the total true positives,\n false negatives and false positives.\n ``'macro'``:\n Calculate metrics for each label, and find their unweighted\n mean. This does not take label imbalance into account.\n ``'weighted'``:\n Calculate metrics for each label, and find their average weighted\n by support (the number of true instances for each label). This\n alters 'macro' to account for label imbalance; it can result in an\n F-score that is not between precision and recall.\n ``'samples'``:\n Calculate metrics for each instance, and find their average (only\n meaningful for multilabel classification where this differs from\n :func:`accuracy_score`).\n\n warn_for : tuple or set, for internal use\n This determines which warnings will be made in the case that this\n function is being used to return only one of its metrics.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n zero_division : \"warn\", 0 or 1, default=\"warn\"\n Sets the value to return when there is a zero division:\n - recall: when there are no positive labels\n - precision: when there are no positive predictions\n - f-score: both\n\n If set to \"warn\", this acts as 0, but warnings are also raised.\n\n Returns\n -------\n precision : float (if average is not None) or array of float, shape = [n_unique_labels]\n\n recall : float (if average is not None) or array of float, shape = [n_unique_labels]\n\n fbeta_score : float (if average is not None) or array of float, shape = [n_unique_labels]\n\n support : None (if average is not None) or array of int, shape = [n_unique_labels]\n The number of occurrences of each label in ``y_true``.\n\n Notes\n -----\n When ``true positive + false positive == 0``, precision is undefined.\n When ``true positive + false negative == 0``, recall is undefined.\n In such cases, by default the metric will be set to 0, as will f-score,\n and ``UndefinedMetricWarning`` will be raised. This behavior can be\n modified with ``zero_division``.\n\n References\n ----------\n .. [1] `Wikipedia entry for the Precision and recall\n `_.\n\n .. [2] `Wikipedia entry for the F1-score\n `_.\n\n .. [3] `Discriminative Methods for Multi-labeled Classification Advances\n in Knowledge Discovery and Data Mining (2004), pp. 
22-30 by Shantanu\n Godbole, Sunita Sarawagi\n `_.\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.metrics import precision_recall_fscore_support\n >>> y_true = np.array(['cat', 'dog', 'pig', 'cat', 'dog', 'pig'])\n >>> y_pred = np.array(['cat', 'pig', 'dog', 'cat', 'cat', 'dog'])\n >>> precision_recall_fscore_support(y_true, y_pred, average='macro')\n (0.22..., 0.33..., 0.26..., None)\n >>> precision_recall_fscore_support(y_true, y_pred, average='micro')\n (0.33..., 0.33..., 0.33..., None)\n >>> precision_recall_fscore_support(y_true, y_pred, average='weighted')\n (0.22..., 0.33..., 0.26..., None)\n\n It is possible to compute per-label precisions, recalls, F1-scores and\n supports instead of averaging:\n\n >>> precision_recall_fscore_support(y_true, y_pred, average=None,\n ... labels=['pig', 'dog', 'cat'])\n (array([0. , 0. , 0.66...]),\n array([0., 0., 1.]), array([0. , 0. , 0.8]),\n array([2, 2, 2]))\n \"\"\"\n _check_zero_division(zero_division)\n if beta < 0:\n raise ValueError('beta should be >=0 in the F-beta score')\n labels = _check_set_wise_labels(y_true, y_pred, average, labels, pos_label)\n samplewise = average == 'samples'\n MCM = multilabel_confusion_matrix(y_true, y_pred, sample_weight=sample_weight, labels=labels, samplewise=samplewise)\n tp_sum = MCM[:, 1, 1]\n pred_sum = tp_sum + MCM[:, 0, 1]\n true_sum = tp_sum + MCM[:, 1, 0]\n if average == 'micro':\n tp_sum = np.array([tp_sum.sum()])\n pred_sum = np.array([pred_sum.sum()])\n true_sum = np.array([true_sum.sum()])\n beta2 = beta**2\n precision = _prf_divide(tp_sum, pred_sum, 'precision', 'predicted', average, warn_for, zero_division)\n recall = _prf_divide(tp_sum, true_sum, 'recall', 'true', average, warn_for, zero_division)\n if zero_division == 'warn' and ('f-score', ) == warn_for:\n if (pred_sum[true_sum == 0] == 0).any():\n _warn_prf(average, 'true nor predicted', 'F-score is', len(true_sum))\n if np.isposinf(beta):\n f_score = recall\n else:\n denom = beta2 * precision + recall\n denom[denom == 0.0] = 1\n f_score = (1 + beta2) * precision * recall / denom\n if average == 'weighted':\n weights = true_sum\n if weights.sum() == 0:\n zero_division_value = np.float64(1.0)\n if zero_division in ['warn', 0]:\n zero_division_value = np.float64(0.0)\n if pred_sum.sum() == 0:\n return zero_division_value, zero_division_value, zero_division_value, None\n else:\n return np.float64(0.0), zero_division_value, np.float64(0.0), None\n elif average == 'samples':\n weights = sample_weight\n else:\n weights = None\n if average is not None:\n assert average != 'binary' or len(precision) == 1\n precision = np.average(precision, weights=weights)\n recall = np.average(recall, weights=weights)\n f_score = np.average(f_score, weights=weights)\n true_sum = None\n return precision, recall, f_score, true_sum" }, { @@ -118014,7 +126797,8 @@ "docstring": { "type": "1d array-like, or label indicator array / sparse matrix", "description": "Ground truth (correct) target values." - } + }, + "refined_type": {} }, { "name": "y_pred", @@ -118024,7 +126808,8 @@ "docstring": { "type": "1d array-like, or label indicator array / sparse matrix", "description": "Estimated targets as returned by a classifier." - } + }, + "refined_type": {} }, { "name": "labels", @@ -118034,7 +126819,8 @@ "docstring": { "type": "array-like, default=None", "description": "The set of labels to include when ``average != 'binary'``, and their\norder if ``average is None``. 
Labels present in the data can be\nexcluded, for example to calculate a multiclass average ignoring a\nmajority negative class, while labels not present in the data will\nresult in 0 components in a macro average. For multilabel targets,\nlabels are column indices. By default, all labels in ``y_true`` and\n``y_pred`` are used in sorted order.\n\n.. versionchanged:: 0.17\n Parameter `labels` improved for multiclass problem." - } + }, + "refined_type": {} }, { "name": "pos_label", @@ -118044,7 +126830,8 @@ "docstring": { "type": "str or int, default=1", "description": "The class to report if ``average='binary'`` and the data is binary.\nIf the data are multiclass or multilabel, this will be ignored;\nsetting ``labels=[pos_label]`` and ``average != 'binary'`` will report\nscores for that label only." - } + }, + "refined_type": {} }, { "name": "average", @@ -118054,6 +126841,16 @@ "docstring": { "type": "{'micro', 'macro', 'samples', 'weighted', 'binary'} or None, default='binary'", "description": "This parameter is required for multiclass/multilabel targets.\nIf ``None``, the scores for each class are returned. Otherwise, this\ndetermines the type of averaging performed on the data:\n\n``'binary'``:\n Only report results for the class specified by ``pos_label``.\n This is applicable only if targets (``y_{true,pred}``) are binary.\n``'micro'``:\n Calculate metrics globally by counting the total true positives,\n false negatives and false positives.\n``'macro'``:\n Calculate metrics for each label, and find their unweighted\n mean. This does not take label imbalance into account.\n``'weighted'``:\n Calculate metrics for each label, and find their average weighted\n by support (the number of true instances for each label). This\n alters 'macro' to account for label imbalance; it can result in an\n F-score that is not between precision and recall.\n``'samples'``:\n Calculate metrics for each instance, and find their average (only\n meaningful for multilabel classification where this differs from\n :func:`accuracy_score`)." + }, + "refined_type": { + "kind": "EnumType", + "values": [ + "samples", + "weighted", + "micro", + "macro", + "binary" + ] } }, { @@ -118064,7 +126861,8 @@ "docstring": { "type": "array-like of shape (n_samples,), default=None", "description": "Sample weights." - } + }, + "refined_type": {} }, { "name": "zero_division", @@ -118074,14 +126872,15 @@ "docstring": { "type": "\"warn\", 0 or 1, default=\"warn\"", "description": "Sets the value to return when there is a zero division. If set to\n\"warn\", this acts as 0, but warnings are also raised." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Compute the precision.\n\nThe precision is the ratio ``tp / (tp + fp)`` where ``tp`` is the number of true positives and ``fp`` the number of false positives. The precision is intuitively the ability of the classifier not to label as positive a sample that is negative. The best value is 1 and the worst value is 0. Read more in the :ref:`User Guide `.", - "docstring": "Compute the precision.\n\nThe precision is the ratio ``tp / (tp + fp)`` where ``tp`` is the number of\ntrue positives and ``fp`` the number of false positives. 
The precision is\nintuitively the ability of the classifier not to label as positive a sample\nthat is negative.\n\nThe best value is 1 and the worst value is 0.\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\ny_true : 1d array-like, or label indicator array / sparse matrix\n Ground truth (correct) target values.\n\ny_pred : 1d array-like, or label indicator array / sparse matrix\n Estimated targets as returned by a classifier.\n\nlabels : array-like, default=None\n The set of labels to include when ``average != 'binary'``, and their\n order if ``average is None``. Labels present in the data can be\n excluded, for example to calculate a multiclass average ignoring a\n majority negative class, while labels not present in the data will\n result in 0 components in a macro average. For multilabel targets,\n labels are column indices. By default, all labels in ``y_true`` and\n ``y_pred`` are used in sorted order.\n\n .. versionchanged:: 0.17\n Parameter `labels` improved for multiclass problem.\n\npos_label : str or int, default=1\n The class to report if ``average='binary'`` and the data is binary.\n If the data are multiclass or multilabel, this will be ignored;\n setting ``labels=[pos_label]`` and ``average != 'binary'`` will report\n scores for that label only.\n\naverage : {'micro', 'macro', 'samples', 'weighted', 'binary'} or None, default='binary'\n This parameter is required for multiclass/multilabel targets.\n If ``None``, the scores for each class are returned. Otherwise, this\n determines the type of averaging performed on the data:\n\n ``'binary'``:\n Only report results for the class specified by ``pos_label``.\n This is applicable only if targets (``y_{true,pred}``) are binary.\n ``'micro'``:\n Calculate metrics globally by counting the total true positives,\n false negatives and false positives.\n ``'macro'``:\n Calculate metrics for each label, and find their unweighted\n mean. This does not take label imbalance into account.\n ``'weighted'``:\n Calculate metrics for each label, and find their average weighted\n by support (the number of true instances for each label). This\n alters 'macro' to account for label imbalance; it can result in an\n F-score that is not between precision and recall.\n ``'samples'``:\n Calculate metrics for each instance, and find their average (only\n meaningful for multilabel classification where this differs from\n :func:`accuracy_score`).\n\nsample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\nzero_division : \"warn\", 0 or 1, default=\"warn\"\n Sets the value to return when there is a zero division. If set to\n \"warn\", this acts as 0, but warnings are also raised.\n\nReturns\n-------\nprecision : float (if average is not None) or array of float of shape (n_unique_labels,)\n Precision of the positive class in binary classification or weighted\n average of the precision of each class for the multiclass task.\n\nSee Also\n--------\nprecision_recall_fscore_support, multilabel_confusion_matrix\n\nNotes\n-----\nWhen ``true positive + false positive == 0``, precision returns 0 and\nraises ``UndefinedMetricWarning``. 
This behavior can be\nmodified with ``zero_division``.\n\nExamples\n--------\n>>> from sklearn.metrics import precision_score\n>>> y_true = [0, 1, 2, 0, 1, 2]\n>>> y_pred = [0, 2, 1, 0, 0, 1]\n>>> precision_score(y_true, y_pred, average='macro')\n0.22...\n>>> precision_score(y_true, y_pred, average='micro')\n0.33...\n>>> precision_score(y_true, y_pred, average='weighted')\n0.22...\n>>> precision_score(y_true, y_pred, average=None)\narray([0.66..., 0. , 0. ])\n>>> y_pred = [0, 0, 0, 0, 0, 0]\n>>> precision_score(y_true, y_pred, average=None)\narray([0.33..., 0. , 0. ])\n>>> precision_score(y_true, y_pred, average=None, zero_division=1)\narray([0.33..., 1. , 1. ])\n>>> # multilabel classification\n>>> y_true = [[0, 0, 0], [1, 1, 1], [0, 1, 1]]\n>>> y_pred = [[0, 0, 0], [1, 1, 1], [1, 1, 0]]\n>>> precision_score(y_true, y_pred, average=None)\narray([0.5, 1. , 1. ])", - "source_code": "\ndef precision_score(y_true, y_pred, *, labels=None, pos_label=1, average='binary', sample_weight=None, zero_division='warn'):\n \"\"\"Compute the precision.\n\n The precision is the ratio ``tp / (tp + fp)`` where ``tp`` is the number of\n true positives and ``fp`` the number of false positives. The precision is\n intuitively the ability of the classifier not to label as positive a sample\n that is negative.\n\n The best value is 1 and the worst value is 0.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n y_true : 1d array-like, or label indicator array / sparse matrix\n Ground truth (correct) target values.\n\n y_pred : 1d array-like, or label indicator array / sparse matrix\n Estimated targets as returned by a classifier.\n\n labels : array-like, default=None\n The set of labels to include when ``average != 'binary'``, and their\n order if ``average is None``. Labels present in the data can be\n excluded, for example to calculate a multiclass average ignoring a\n majority negative class, while labels not present in the data will\n result in 0 components in a macro average. For multilabel targets,\n labels are column indices. By default, all labels in ``y_true`` and\n ``y_pred`` are used in sorted order.\n\n .. versionchanged:: 0.17\n Parameter `labels` improved for multiclass problem.\n\n pos_label : str or int, default=1\n The class to report if ``average='binary'`` and the data is binary.\n If the data are multiclass or multilabel, this will be ignored;\n setting ``labels=[pos_label]`` and ``average != 'binary'`` will report\n scores for that label only.\n\n average : {'micro', 'macro', 'samples', 'weighted', 'binary'} or None, default='binary'\n This parameter is required for multiclass/multilabel targets.\n If ``None``, the scores for each class are returned. Otherwise, this\n determines the type of averaging performed on the data:\n\n ``'binary'``:\n Only report results for the class specified by ``pos_label``.\n This is applicable only if targets (``y_{true,pred}``) are binary.\n ``'micro'``:\n Calculate metrics globally by counting the total true positives,\n false negatives and false positives.\n ``'macro'``:\n Calculate metrics for each label, and find their unweighted\n mean. This does not take label imbalance into account.\n ``'weighted'``:\n Calculate metrics for each label, and find their average weighted\n by support (the number of true instances for each label). 
This\n alters 'macro' to account for label imbalance; it can result in an\n F-score that is not between precision and recall.\n ``'samples'``:\n Calculate metrics for each instance, and find their average (only\n meaningful for multilabel classification where this differs from\n :func:`accuracy_score`).\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n zero_division : \"warn\", 0 or 1, default=\"warn\"\n Sets the value to return when there is a zero division. If set to\n \"warn\", this acts as 0, but warnings are also raised.\n\n Returns\n -------\n precision : float (if average is not None) or array of float of shape (n_unique_labels,)\n Precision of the positive class in binary classification or weighted\n average of the precision of each class for the multiclass task.\n\n See Also\n --------\n precision_recall_fscore_support, multilabel_confusion_matrix\n\n Notes\n -----\n When ``true positive + false positive == 0``, precision returns 0 and\n raises ``UndefinedMetricWarning``. This behavior can be\n modified with ``zero_division``.\n\n Examples\n --------\n >>> from sklearn.metrics import precision_score\n >>> y_true = [0, 1, 2, 0, 1, 2]\n >>> y_pred = [0, 2, 1, 0, 0, 1]\n >>> precision_score(y_true, y_pred, average='macro')\n 0.22...\n >>> precision_score(y_true, y_pred, average='micro')\n 0.33...\n >>> precision_score(y_true, y_pred, average='weighted')\n 0.22...\n >>> precision_score(y_true, y_pred, average=None)\n array([0.66..., 0. , 0. ])\n >>> y_pred = [0, 0, 0, 0, 0, 0]\n >>> precision_score(y_true, y_pred, average=None)\n array([0.33..., 0. , 0. ])\n >>> precision_score(y_true, y_pred, average=None, zero_division=1)\n array([0.33..., 1. , 1. ])\n >>> # multilabel classification\n >>> y_true = [[0, 0, 0], [1, 1, 1], [0, 1, 1]]\n >>> y_pred = [[0, 0, 0], [1, 1, 1], [1, 1, 0]]\n >>> precision_score(y_true, y_pred, average=None)\n array([0.5, 1. , 1. ])\n \"\"\"\n (p, _, _, _) = precision_recall_fscore_support(y_true, y_pred, labels=labels, pos_label=pos_label, average=average, warn_for=('precision', ), sample_weight=sample_weight, zero_division=zero_division)\n return p" + "description": "Compute the precision.\n\nThe precision is the ratio ``tp / (tp + fp)`` where ``tp`` is the number of\ntrue positives and ``fp`` the number of false positives. The precision is\nintuitively the ability of the classifier not to label as positive a sample\nthat is negative.\n\nThe best value is 1 and the worst value is 0.\n\nRead more in the :ref:`User Guide `.", + "docstring": "Compute the precision.\n\n The precision is the ratio ``tp / (tp + fp)`` where ``tp`` is the number of\n true positives and ``fp`` the number of false positives. The precision is\n intuitively the ability of the classifier not to label as positive a sample\n that is negative.\n\n The best value is 1 and the worst value is 0.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n y_true : 1d array-like, or label indicator array / sparse matrix\n Ground truth (correct) target values.\n\n y_pred : 1d array-like, or label indicator array / sparse matrix\n Estimated targets as returned by a classifier.\n\n labels : array-like, default=None\n The set of labels to include when ``average != 'binary'``, and their\n order if ``average is None``. Labels present in the data can be\n excluded, for example to calculate a multiclass average ignoring a\n majority negative class, while labels not present in the data will\n result in 0 components in a macro average. 
For multilabel targets,\n labels are column indices. By default, all labels in ``y_true`` and\n ``y_pred`` are used in sorted order.\n\n .. versionchanged:: 0.17\n Parameter `labels` improved for multiclass problem.\n\n pos_label : str or int, default=1\n The class to report if ``average='binary'`` and the data is binary.\n If the data are multiclass or multilabel, this will be ignored;\n setting ``labels=[pos_label]`` and ``average != 'binary'`` will report\n scores for that label only.\n\n average : {'micro', 'macro', 'samples', 'weighted', 'binary'} or None, default='binary'\n This parameter is required for multiclass/multilabel targets.\n If ``None``, the scores for each class are returned. Otherwise, this\n determines the type of averaging performed on the data:\n\n ``'binary'``:\n Only report results for the class specified by ``pos_label``.\n This is applicable only if targets (``y_{true,pred}``) are binary.\n ``'micro'``:\n Calculate metrics globally by counting the total true positives,\n false negatives and false positives.\n ``'macro'``:\n Calculate metrics for each label, and find their unweighted\n mean. This does not take label imbalance into account.\n ``'weighted'``:\n Calculate metrics for each label, and find their average weighted\n by support (the number of true instances for each label). This\n alters 'macro' to account for label imbalance; it can result in an\n F-score that is not between precision and recall.\n ``'samples'``:\n Calculate metrics for each instance, and find their average (only\n meaningful for multilabel classification where this differs from\n :func:`accuracy_score`).\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n zero_division : \"warn\", 0 or 1, default=\"warn\"\n Sets the value to return when there is a zero division. If set to\n \"warn\", this acts as 0, but warnings are also raised.\n\n Returns\n -------\n precision : float (if average is not None) or array of float of shape (n_unique_labels,)\n Precision of the positive class in binary classification or weighted\n average of the precision of each class for the multiclass task.\n\n See Also\n --------\n precision_recall_fscore_support : Compute precision, recall, F-measure and\n support for each class.\n recall_score : Compute the ratio ``tp / (tp + fn)`` where ``tp`` is the\n number of true positives and ``fn`` the number of false negatives.\n PrecisionRecallDisplay.from_estimator : Plot precision-recall curve given\n an estimator and some data.\n PrecisionRecallDisplay.from_predictions : Plot precision-recall curve given\n binary class predictions.\n multilabel_confusion_matrix : Compute a confusion matrix for each class or\n sample.\n\n Notes\n -----\n When ``true positive + false positive == 0``, precision returns 0 and\n raises ``UndefinedMetricWarning``. This behavior can be\n modified with ``zero_division``.\n\n Examples\n --------\n >>> from sklearn.metrics import precision_score\n >>> y_true = [0, 1, 2, 0, 1, 2]\n >>> y_pred = [0, 2, 1, 0, 0, 1]\n >>> precision_score(y_true, y_pred, average='macro')\n 0.22...\n >>> precision_score(y_true, y_pred, average='micro')\n 0.33...\n >>> precision_score(y_true, y_pred, average='weighted')\n 0.22...\n >>> precision_score(y_true, y_pred, average=None)\n array([0.66..., 0. , 0. ])\n >>> y_pred = [0, 0, 0, 0, 0, 0]\n >>> precision_score(y_true, y_pred, average=None)\n array([0.33..., 0. , 0. ])\n >>> precision_score(y_true, y_pred, average=None, zero_division=1)\n array([0.33..., 1. , 1. 
])\n >>> # multilabel classification\n >>> y_true = [[0, 0, 0], [1, 1, 1], [0, 1, 1]]\n >>> y_pred = [[0, 0, 0], [1, 1, 1], [1, 1, 0]]\n >>> precision_score(y_true, y_pred, average=None)\n array([0.5, 1. , 1. ])\n ", + "source_code": "\ndef precision_score(y_true, y_pred, *, labels=None, pos_label=1, average='binary', sample_weight=None, zero_division='warn'):\n \"\"\"Compute the precision.\n\n The precision is the ratio ``tp / (tp + fp)`` where ``tp`` is the number of\n true positives and ``fp`` the number of false positives. The precision is\n intuitively the ability of the classifier not to label as positive a sample\n that is negative.\n\n The best value is 1 and the worst value is 0.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n y_true : 1d array-like, or label indicator array / sparse matrix\n Ground truth (correct) target values.\n\n y_pred : 1d array-like, or label indicator array / sparse matrix\n Estimated targets as returned by a classifier.\n\n labels : array-like, default=None\n The set of labels to include when ``average != 'binary'``, and their\n order if ``average is None``. Labels present in the data can be\n excluded, for example to calculate a multiclass average ignoring a\n majority negative class, while labels not present in the data will\n result in 0 components in a macro average. For multilabel targets,\n labels are column indices. By default, all labels in ``y_true`` and\n ``y_pred`` are used in sorted order.\n\n .. versionchanged:: 0.17\n Parameter `labels` improved for multiclass problem.\n\n pos_label : str or int, default=1\n The class to report if ``average='binary'`` and the data is binary.\n If the data are multiclass or multilabel, this will be ignored;\n setting ``labels=[pos_label]`` and ``average != 'binary'`` will report\n scores for that label only.\n\n average : {'micro', 'macro', 'samples', 'weighted', 'binary'} or None, default='binary'\n This parameter is required for multiclass/multilabel targets.\n If ``None``, the scores for each class are returned. Otherwise, this\n determines the type of averaging performed on the data:\n\n ``'binary'``:\n Only report results for the class specified by ``pos_label``.\n This is applicable only if targets (``y_{true,pred}``) are binary.\n ``'micro'``:\n Calculate metrics globally by counting the total true positives,\n false negatives and false positives.\n ``'macro'``:\n Calculate metrics for each label, and find their unweighted\n mean. This does not take label imbalance into account.\n ``'weighted'``:\n Calculate metrics for each label, and find their average weighted\n by support (the number of true instances for each label). This\n alters 'macro' to account for label imbalance; it can result in an\n F-score that is not between precision and recall.\n ``'samples'``:\n Calculate metrics for each instance, and find their average (only\n meaningful for multilabel classification where this differs from\n :func:`accuracy_score`).\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n zero_division : \"warn\", 0 or 1, default=\"warn\"\n Sets the value to return when there is a zero division. 
If set to\n \"warn\", this acts as 0, but warnings are also raised.\n\n Returns\n -------\n precision : float (if average is not None) or array of float of shape (n_unique_labels,)\n Precision of the positive class in binary classification or weighted\n average of the precision of each class for the multiclass task.\n\n See Also\n --------\n precision_recall_fscore_support : Compute precision, recall, F-measure and\n support for each class.\n recall_score : Compute the ratio ``tp / (tp + fn)`` where ``tp`` is the\n number of true positives and ``fn`` the number of false negatives.\n PrecisionRecallDisplay.from_estimator : Plot precision-recall curve given\n an estimator and some data.\n PrecisionRecallDisplay.from_predictions : Plot precision-recall curve given\n binary class predictions.\n multilabel_confusion_matrix : Compute a confusion matrix for each class or\n sample.\n\n Notes\n -----\n When ``true positive + false positive == 0``, precision returns 0 and\n raises ``UndefinedMetricWarning``. This behavior can be\n modified with ``zero_division``.\n\n Examples\n --------\n >>> from sklearn.metrics import precision_score\n >>> y_true = [0, 1, 2, 0, 1, 2]\n >>> y_pred = [0, 2, 1, 0, 0, 1]\n >>> precision_score(y_true, y_pred, average='macro')\n 0.22...\n >>> precision_score(y_true, y_pred, average='micro')\n 0.33...\n >>> precision_score(y_true, y_pred, average='weighted')\n 0.22...\n >>> precision_score(y_true, y_pred, average=None)\n array([0.66..., 0. , 0. ])\n >>> y_pred = [0, 0, 0, 0, 0, 0]\n >>> precision_score(y_true, y_pred, average=None)\n array([0.33..., 0. , 0. ])\n >>> precision_score(y_true, y_pred, average=None, zero_division=1)\n array([0.33..., 1. , 1. ])\n >>> # multilabel classification\n >>> y_true = [[0, 0, 0], [1, 1, 1], [0, 1, 1]]\n >>> y_pred = [[0, 0, 0], [1, 1, 1], [1, 1, 0]]\n >>> precision_score(y_true, y_pred, average=None)\n array([0.5, 1. , 1. ])\n \"\"\"\n (p, _, _, _) = precision_recall_fscore_support(y_true, y_pred, labels=labels, pos_label=pos_label, average=average, warn_for=('precision', ), sample_weight=sample_weight, zero_division=zero_division)\n return p" }, { "name": "recall_score", @@ -118098,7 +126897,8 @@ "docstring": { "type": "1d array-like, or label indicator array / sparse matrix", "description": "Ground truth (correct) target values." - } + }, + "refined_type": {} }, { "name": "y_pred", @@ -118108,7 +126908,8 @@ "docstring": { "type": "1d array-like, or label indicator array / sparse matrix", "description": "Estimated targets as returned by a classifier." - } + }, + "refined_type": {} }, { "name": "labels", @@ -118118,7 +126919,8 @@ "docstring": { "type": "array-like, default=None", "description": "The set of labels to include when ``average != 'binary'``, and their\norder if ``average is None``. Labels present in the data can be\nexcluded, for example to calculate a multiclass average ignoring a\nmajority negative class, while labels not present in the data will\nresult in 0 components in a macro average. For multilabel targets,\nlabels are column indices. By default, all labels in ``y_true`` and\n``y_pred`` are used in sorted order.\n\n.. versionchanged:: 0.17\n Parameter `labels` improved for multiclass problem." 
- } + }, + "refined_type": {} }, { "name": "pos_label", @@ -118128,7 +126930,8 @@ "docstring": { "type": "str or int, default=1", "description": "The class to report if ``average='binary'`` and the data is binary.\nIf the data are multiclass or multilabel, this will be ignored;\nsetting ``labels=[pos_label]`` and ``average != 'binary'`` will report\nscores for that label only." - } + }, + "refined_type": {} }, { "name": "average", @@ -118138,6 +126941,16 @@ "docstring": { "type": "{'micro', 'macro', 'samples', 'weighted', 'binary'} or None, default='binary'", "description": "This parameter is required for multiclass/multilabel targets.\nIf ``None``, the scores for each class are returned. Otherwise, this\ndetermines the type of averaging performed on the data:\n\n``'binary'``:\n Only report results for the class specified by ``pos_label``.\n This is applicable only if targets (``y_{true,pred}``) are binary.\n``'micro'``:\n Calculate metrics globally by counting the total true positives,\n false negatives and false positives.\n``'macro'``:\n Calculate metrics for each label, and find their unweighted\n mean. This does not take label imbalance into account.\n``'weighted'``:\n Calculate metrics for each label, and find their average weighted\n by support (the number of true instances for each label). This\n alters 'macro' to account for label imbalance; it can result in an\n F-score that is not between precision and recall. Weighted recall\n is equal to accuracy.\n``'samples'``:\n Calculate metrics for each instance, and find their average (only\n meaningful for multilabel classification where this differs from\n :func:`accuracy_score`)." + }, + "refined_type": { + "kind": "EnumType", + "values": [ + "samples", + "weighted", + "micro", + "macro", + "binary" + ] } }, { @@ -118148,7 +126961,8 @@ "docstring": { "type": "array-like of shape (n_samples,), default=None", "description": "Sample weights." - } + }, + "refined_type": {} }, { "name": "zero_division", @@ -118158,14 +126972,15 @@ "docstring": { "type": "\"warn\", 0 or 1, default=\"warn\"", "description": "Sets the value to return when there is a zero division. If set to\n\"warn\", this acts as 0, but warnings are also raised." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Compute the recall.\n\nThe recall is the ratio ``tp / (tp + fn)`` where ``tp`` is the number of true positives and ``fn`` the number of false negatives. The recall is intuitively the ability of the classifier to find all the positive samples. The best value is 1 and the worst value is 0. Read more in the :ref:`User Guide `.", - "docstring": "Compute the recall.\n\nThe recall is the ratio ``tp / (tp + fn)`` where ``tp`` is the number of\ntrue positives and ``fn`` the number of false negatives. The recall is\nintuitively the ability of the classifier to find all the positive samples.\n\nThe best value is 1 and the worst value is 0.\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\ny_true : 1d array-like, or label indicator array / sparse matrix\n Ground truth (correct) target values.\n\ny_pred : 1d array-like, or label indicator array / sparse matrix\n Estimated targets as returned by a classifier.\n\nlabels : array-like, default=None\n The set of labels to include when ``average != 'binary'``, and their\n order if ``average is None``. 
Labels present in the data can be\n excluded, for example to calculate a multiclass average ignoring a\n majority negative class, while labels not present in the data will\n result in 0 components in a macro average. For multilabel targets,\n labels are column indices. By default, all labels in ``y_true`` and\n ``y_pred`` are used in sorted order.\n\n .. versionchanged:: 0.17\n Parameter `labels` improved for multiclass problem.\n\npos_label : str or int, default=1\n The class to report if ``average='binary'`` and the data is binary.\n If the data are multiclass or multilabel, this will be ignored;\n setting ``labels=[pos_label]`` and ``average != 'binary'`` will report\n scores for that label only.\n\naverage : {'micro', 'macro', 'samples', 'weighted', 'binary'} or None, default='binary'\n This parameter is required for multiclass/multilabel targets.\n If ``None``, the scores for each class are returned. Otherwise, this\n determines the type of averaging performed on the data:\n\n ``'binary'``:\n Only report results for the class specified by ``pos_label``.\n This is applicable only if targets (``y_{true,pred}``) are binary.\n ``'micro'``:\n Calculate metrics globally by counting the total true positives,\n false negatives and false positives.\n ``'macro'``:\n Calculate metrics for each label, and find their unweighted\n mean. This does not take label imbalance into account.\n ``'weighted'``:\n Calculate metrics for each label, and find their average weighted\n by support (the number of true instances for each label). This\n alters 'macro' to account for label imbalance; it can result in an\n F-score that is not between precision and recall. Weighted recall\n is equal to accuracy.\n ``'samples'``:\n Calculate metrics for each instance, and find their average (only\n meaningful for multilabel classification where this differs from\n :func:`accuracy_score`).\n\nsample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\nzero_division : \"warn\", 0 or 1, default=\"warn\"\n Sets the value to return when there is a zero division. If set to\n \"warn\", this acts as 0, but warnings are also raised.\n\nReturns\n-------\nrecall : float (if average is not None) or array of float of shape (n_unique_labels,)\n Recall of the positive class in binary classification or weighted\n average of the recall of each class for the multiclass task.\n\nSee Also\n--------\nprecision_recall_fscore_support, balanced_accuracy_score,\nmultilabel_confusion_matrix\n\nNotes\n-----\nWhen ``true positive + false negative == 0``, recall returns 0 and raises\n``UndefinedMetricWarning``. This behavior can be modified with\n``zero_division``.\n\nExamples\n--------\n>>> from sklearn.metrics import recall_score\n>>> y_true = [0, 1, 2, 0, 1, 2]\n>>> y_pred = [0, 2, 1, 0, 0, 1]\n>>> recall_score(y_true, y_pred, average='macro')\n0.33...\n>>> recall_score(y_true, y_pred, average='micro')\n0.33...\n>>> recall_score(y_true, y_pred, average='weighted')\n0.33...\n>>> recall_score(y_true, y_pred, average=None)\narray([1., 0., 0.])\n>>> y_true = [0, 0, 0, 0, 0, 0]\n>>> recall_score(y_true, y_pred, average=None)\narray([0.5, 0. , 0. ])\n>>> recall_score(y_true, y_pred, average=None, zero_division=1)\narray([0.5, 1. , 1. ])\n>>> # multilabel classification\n>>> y_true = [[0, 0, 0], [1, 1, 1], [0, 1, 1]]\n>>> y_pred = [[0, 0, 0], [1, 1, 1], [1, 1, 0]]\n>>> recall_score(y_true, y_pred, average=None)\narray([1. , 1. 
, 0.5])", - "source_code": "\ndef recall_score(y_true, y_pred, *, labels=None, pos_label=1, average='binary', sample_weight=None, zero_division='warn'):\n \"\"\"Compute the recall.\n\n The recall is the ratio ``tp / (tp + fn)`` where ``tp`` is the number of\n true positives and ``fn`` the number of false negatives. The recall is\n intuitively the ability of the classifier to find all the positive samples.\n\n The best value is 1 and the worst value is 0.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n y_true : 1d array-like, or label indicator array / sparse matrix\n Ground truth (correct) target values.\n\n y_pred : 1d array-like, or label indicator array / sparse matrix\n Estimated targets as returned by a classifier.\n\n labels : array-like, default=None\n The set of labels to include when ``average != 'binary'``, and their\n order if ``average is None``. Labels present in the data can be\n excluded, for example to calculate a multiclass average ignoring a\n majority negative class, while labels not present in the data will\n result in 0 components in a macro average. For multilabel targets,\n labels are column indices. By default, all labels in ``y_true`` and\n ``y_pred`` are used in sorted order.\n\n .. versionchanged:: 0.17\n Parameter `labels` improved for multiclass problem.\n\n pos_label : str or int, default=1\n The class to report if ``average='binary'`` and the data is binary.\n If the data are multiclass or multilabel, this will be ignored;\n setting ``labels=[pos_label]`` and ``average != 'binary'`` will report\n scores for that label only.\n\n average : {'micro', 'macro', 'samples', 'weighted', 'binary'} or None, default='binary'\n This parameter is required for multiclass/multilabel targets.\n If ``None``, the scores for each class are returned. Otherwise, this\n determines the type of averaging performed on the data:\n\n ``'binary'``:\n Only report results for the class specified by ``pos_label``.\n This is applicable only if targets (``y_{true,pred}``) are binary.\n ``'micro'``:\n Calculate metrics globally by counting the total true positives,\n false negatives and false positives.\n ``'macro'``:\n Calculate metrics for each label, and find their unweighted\n mean. This does not take label imbalance into account.\n ``'weighted'``:\n Calculate metrics for each label, and find their average weighted\n by support (the number of true instances for each label). This\n alters 'macro' to account for label imbalance; it can result in an\n F-score that is not between precision and recall. Weighted recall\n is equal to accuracy.\n ``'samples'``:\n Calculate metrics for each instance, and find their average (only\n meaningful for multilabel classification where this differs from\n :func:`accuracy_score`).\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n zero_division : \"warn\", 0 or 1, default=\"warn\"\n Sets the value to return when there is a zero division. If set to\n \"warn\", this acts as 0, but warnings are also raised.\n\n Returns\n -------\n recall : float (if average is not None) or array of float of shape (n_unique_labels,)\n Recall of the positive class in binary classification or weighted\n average of the recall of each class for the multiclass task.\n\n See Also\n --------\n precision_recall_fscore_support, balanced_accuracy_score,\n multilabel_confusion_matrix\n\n Notes\n -----\n When ``true positive + false negative == 0``, recall returns 0 and raises\n ``UndefinedMetricWarning``. 
This behavior can be modified with\n ``zero_division``.\n\n Examples\n --------\n >>> from sklearn.metrics import recall_score\n >>> y_true = [0, 1, 2, 0, 1, 2]\n >>> y_pred = [0, 2, 1, 0, 0, 1]\n >>> recall_score(y_true, y_pred, average='macro')\n 0.33...\n >>> recall_score(y_true, y_pred, average='micro')\n 0.33...\n >>> recall_score(y_true, y_pred, average='weighted')\n 0.33...\n >>> recall_score(y_true, y_pred, average=None)\n array([1., 0., 0.])\n >>> y_true = [0, 0, 0, 0, 0, 0]\n >>> recall_score(y_true, y_pred, average=None)\n array([0.5, 0. , 0. ])\n >>> recall_score(y_true, y_pred, average=None, zero_division=1)\n array([0.5, 1. , 1. ])\n >>> # multilabel classification\n >>> y_true = [[0, 0, 0], [1, 1, 1], [0, 1, 1]]\n >>> y_pred = [[0, 0, 0], [1, 1, 1], [1, 1, 0]]\n >>> recall_score(y_true, y_pred, average=None)\n array([1. , 1. , 0.5])\n \"\"\"\n (_, r, _, _) = precision_recall_fscore_support(y_true, y_pred, labels=labels, pos_label=pos_label, average=average, warn_for=('recall', ), sample_weight=sample_weight, zero_division=zero_division)\n return r" + "description": "Compute the recall.\n\nThe recall is the ratio ``tp / (tp + fn)`` where ``tp`` is the number of\ntrue positives and ``fn`` the number of false negatives. The recall is\nintuitively the ability of the classifier to find all the positive samples.\n\nThe best value is 1 and the worst value is 0.\n\nRead more in the :ref:`User Guide `.", + "docstring": "Compute the recall.\n\n The recall is the ratio ``tp / (tp + fn)`` where ``tp`` is the number of\n true positives and ``fn`` the number of false negatives. The recall is\n intuitively the ability of the classifier to find all the positive samples.\n\n The best value is 1 and the worst value is 0.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n y_true : 1d array-like, or label indicator array / sparse matrix\n Ground truth (correct) target values.\n\n y_pred : 1d array-like, or label indicator array / sparse matrix\n Estimated targets as returned by a classifier.\n\n labels : array-like, default=None\n The set of labels to include when ``average != 'binary'``, and their\n order if ``average is None``. Labels present in the data can be\n excluded, for example to calculate a multiclass average ignoring a\n majority negative class, while labels not present in the data will\n result in 0 components in a macro average. For multilabel targets,\n labels are column indices. By default, all labels in ``y_true`` and\n ``y_pred`` are used in sorted order.\n\n .. versionchanged:: 0.17\n Parameter `labels` improved for multiclass problem.\n\n pos_label : str or int, default=1\n The class to report if ``average='binary'`` and the data is binary.\n If the data are multiclass or multilabel, this will be ignored;\n setting ``labels=[pos_label]`` and ``average != 'binary'`` will report\n scores for that label only.\n\n average : {'micro', 'macro', 'samples', 'weighted', 'binary'} or None, default='binary'\n This parameter is required for multiclass/multilabel targets.\n If ``None``, the scores for each class are returned. Otherwise, this\n determines the type of averaging performed on the data:\n\n ``'binary'``:\n Only report results for the class specified by ``pos_label``.\n This is applicable only if targets (``y_{true,pred}``) are binary.\n ``'micro'``:\n Calculate metrics globally by counting the total true positives,\n false negatives and false positives.\n ``'macro'``:\n Calculate metrics for each label, and find their unweighted\n mean. 
This does not take label imbalance into account.\n ``'weighted'``:\n Calculate metrics for each label, and find their average weighted\n by support (the number of true instances for each label). This\n alters 'macro' to account for label imbalance; it can result in an\n F-score that is not between precision and recall. Weighted recall\n is equal to accuracy.\n ``'samples'``:\n Calculate metrics for each instance, and find their average (only\n meaningful for multilabel classification where this differs from\n :func:`accuracy_score`).\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n zero_division : \"warn\", 0 or 1, default=\"warn\"\n Sets the value to return when there is a zero division. If set to\n \"warn\", this acts as 0, but warnings are also raised.\n\n Returns\n -------\n recall : float (if average is not None) or array of float of shape (n_unique_labels,)\n Recall of the positive class in binary classification or weighted\n average of the recall of each class for the multiclass task.\n\n See Also\n --------\n precision_recall_fscore_support : Compute precision, recall, F-measure and\n support for each class.\n precision_score : Compute the ratio ``tp / (tp + fp)`` where ``tp`` is the\n number of true positives and ``fp`` the number of false positives.\n balanced_accuracy_score : Compute balanced accuracy to deal with imbalanced\n datasets.\n multilabel_confusion_matrix : Compute a confusion matrix for each class or\n sample.\n PrecisionRecallDisplay.from_estimator : Plot precision-recall curve given\n an estimator and some data.\n PrecisionRecallDisplay.from_predictions : Plot precision-recall curve given\n binary class predictions.\n\n Notes\n -----\n When ``true positive + false negative == 0``, recall returns 0 and raises\n ``UndefinedMetricWarning``. This behavior can be modified with\n ``zero_division``.\n\n Examples\n --------\n >>> from sklearn.metrics import recall_score\n >>> y_true = [0, 1, 2, 0, 1, 2]\n >>> y_pred = [0, 2, 1, 0, 0, 1]\n >>> recall_score(y_true, y_pred, average='macro')\n 0.33...\n >>> recall_score(y_true, y_pred, average='micro')\n 0.33...\n >>> recall_score(y_true, y_pred, average='weighted')\n 0.33...\n >>> recall_score(y_true, y_pred, average=None)\n array([1., 0., 0.])\n >>> y_true = [0, 0, 0, 0, 0, 0]\n >>> recall_score(y_true, y_pred, average=None)\n array([0.5, 0. , 0. ])\n >>> recall_score(y_true, y_pred, average=None, zero_division=1)\n array([0.5, 1. , 1. ])\n >>> # multilabel classification\n >>> y_true = [[0, 0, 0], [1, 1, 1], [0, 1, 1]]\n >>> y_pred = [[0, 0, 0], [1, 1, 1], [1, 1, 0]]\n >>> recall_score(y_true, y_pred, average=None)\n array([1. , 1. , 0.5])\n ", + "source_code": "\ndef recall_score(y_true, y_pred, *, labels=None, pos_label=1, average='binary', sample_weight=None, zero_division='warn'):\n \"\"\"Compute the recall.\n\n The recall is the ratio ``tp / (tp + fn)`` where ``tp`` is the number of\n true positives and ``fn`` the number of false negatives. 
The recall is\n intuitively the ability of the classifier to find all the positive samples.\n\n The best value is 1 and the worst value is 0.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n y_true : 1d array-like, or label indicator array / sparse matrix\n Ground truth (correct) target values.\n\n y_pred : 1d array-like, or label indicator array / sparse matrix\n Estimated targets as returned by a classifier.\n\n labels : array-like, default=None\n The set of labels to include when ``average != 'binary'``, and their\n order if ``average is None``. Labels present in the data can be\n excluded, for example to calculate a multiclass average ignoring a\n majority negative class, while labels not present in the data will\n result in 0 components in a macro average. For multilabel targets,\n labels are column indices. By default, all labels in ``y_true`` and\n ``y_pred`` are used in sorted order.\n\n .. versionchanged:: 0.17\n Parameter `labels` improved for multiclass problem.\n\n pos_label : str or int, default=1\n The class to report if ``average='binary'`` and the data is binary.\n If the data are multiclass or multilabel, this will be ignored;\n setting ``labels=[pos_label]`` and ``average != 'binary'`` will report\n scores for that label only.\n\n average : {'micro', 'macro', 'samples', 'weighted', 'binary'} or None, default='binary'\n This parameter is required for multiclass/multilabel targets.\n If ``None``, the scores for each class are returned. Otherwise, this\n determines the type of averaging performed on the data:\n\n ``'binary'``:\n Only report results for the class specified by ``pos_label``.\n This is applicable only if targets (``y_{true,pred}``) are binary.\n ``'micro'``:\n Calculate metrics globally by counting the total true positives,\n false negatives and false positives.\n ``'macro'``:\n Calculate metrics for each label, and find their unweighted\n mean. This does not take label imbalance into account.\n ``'weighted'``:\n Calculate metrics for each label, and find their average weighted\n by support (the number of true instances for each label). This\n alters 'macro' to account for label imbalance; it can result in an\n F-score that is not between precision and recall. Weighted recall\n is equal to accuracy.\n ``'samples'``:\n Calculate metrics for each instance, and find their average (only\n meaningful for multilabel classification where this differs from\n :func:`accuracy_score`).\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n zero_division : \"warn\", 0 or 1, default=\"warn\"\n Sets the value to return when there is a zero division. 
If set to\n \"warn\", this acts as 0, but warnings are also raised.\n\n Returns\n -------\n recall : float (if average is not None) or array of float of shape (n_unique_labels,)\n Recall of the positive class in binary classification or weighted\n average of the recall of each class for the multiclass task.\n\n See Also\n --------\n precision_recall_fscore_support : Compute precision, recall, F-measure and\n support for each class.\n precision_score : Compute the ratio ``tp / (tp + fp)`` where ``tp`` is the\n number of true positives and ``fp`` the number of false positives.\n balanced_accuracy_score : Compute balanced accuracy to deal with imbalanced\n datasets.\n multilabel_confusion_matrix : Compute a confusion matrix for each class or\n sample.\n PrecisionRecallDisplay.from_estimator : Plot precision-recall curve given\n an estimator and some data.\n PrecisionRecallDisplay.from_predictions : Plot precision-recall curve given\n binary class predictions.\n\n Notes\n -----\n When ``true positive + false negative == 0``, recall returns 0 and raises\n ``UndefinedMetricWarning``. This behavior can be modified with\n ``zero_division``.\n\n Examples\n --------\n >>> from sklearn.metrics import recall_score\n >>> y_true = [0, 1, 2, 0, 1, 2]\n >>> y_pred = [0, 2, 1, 0, 0, 1]\n >>> recall_score(y_true, y_pred, average='macro')\n 0.33...\n >>> recall_score(y_true, y_pred, average='micro')\n 0.33...\n >>> recall_score(y_true, y_pred, average='weighted')\n 0.33...\n >>> recall_score(y_true, y_pred, average=None)\n array([1., 0., 0.])\n >>> y_true = [0, 0, 0, 0, 0, 0]\n >>> recall_score(y_true, y_pred, average=None)\n array([0.5, 0. , 0. ])\n >>> recall_score(y_true, y_pred, average=None, zero_division=1)\n array([0.5, 1. , 1. ])\n >>> # multilabel classification\n >>> y_true = [[0, 0, 0], [1, 1, 1], [0, 1, 1]]\n >>> y_pred = [[0, 0, 0], [1, 1, 1], [1, 1, 0]]\n >>> recall_score(y_true, y_pred, average=None)\n array([1. , 1. , 0.5])\n \"\"\"\n (_, r, _, _) = precision_recall_fscore_support(y_true, y_pred, labels=labels, pos_label=pos_label, average=average, warn_for=('recall', ), sample_weight=sample_weight, zero_division=zero_division)\n return r" }, { "name": "zero_one_loss", @@ -118182,7 +126997,8 @@ "docstring": { "type": "1d array-like, or label indicator array / sparse matrix", "description": "Ground truth (correct) labels." - } + }, + "refined_type": {} }, { "name": "y_pred", @@ -118192,7 +127008,8 @@ "docstring": { "type": "1d array-like, or label indicator array / sparse matrix", "description": "Predicted labels, as returned by a classifier." - } + }, + "refined_type": {} }, { "name": "normalize", @@ -118202,7 +127019,8 @@ "docstring": { "type": "bool, default=True", "description": "If ``False``, return the number of misclassifications.\nOtherwise, return the fraction of misclassifications." - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -118212,14 +127030,15 @@ "docstring": { "type": "array-like of shape (n_samples,), default=None", "description": "Sample weights." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Zero-one classification loss.\n\nIf normalize is ``True``, return the fraction of misclassifications (float), else it returns the number of misclassifications (int). The best performance is 0. Read more in the :ref:`User Guide `.", - "docstring": "Zero-one classification loss.\n\nIf normalize is ``True``, return the fraction of misclassifications\n(float), else it returns the number of misclassifications (int). 
The best\nperformance is 0.\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\ny_true : 1d array-like, or label indicator array / sparse matrix\n Ground truth (correct) labels.\n\ny_pred : 1d array-like, or label indicator array / sparse matrix\n Predicted labels, as returned by a classifier.\n\nnormalize : bool, default=True\n If ``False``, return the number of misclassifications.\n Otherwise, return the fraction of misclassifications.\n\nsample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\nReturns\n-------\nloss : float or int,\n If ``normalize == True``, return the fraction of misclassifications\n (float), else it returns the number of misclassifications (int).\n\nNotes\n-----\nIn multilabel classification, the zero_one_loss function corresponds to\nthe subset zero-one loss: for each sample, the entire set of labels must be\ncorrectly predicted, otherwise the loss for that sample is equal to one.\n\nSee Also\n--------\naccuracy_score, hamming_loss, jaccard_score\n\nExamples\n--------\n>>> from sklearn.metrics import zero_one_loss\n>>> y_pred = [1, 2, 3, 4]\n>>> y_true = [2, 2, 3, 4]\n>>> zero_one_loss(y_true, y_pred)\n0.25\n>>> zero_one_loss(y_true, y_pred, normalize=False)\n1\n\nIn the multilabel case with binary label indicators:\n\n>>> import numpy as np\n>>> zero_one_loss(np.array([[0, 1], [1, 1]]), np.ones((2, 2)))\n0.5", - "source_code": "\ndef zero_one_loss(y_true, y_pred, *, normalize=True, sample_weight=None):\n \"\"\"Zero-one classification loss.\n\n If normalize is ``True``, return the fraction of misclassifications\n (float), else it returns the number of misclassifications (int). The best\n performance is 0.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n y_true : 1d array-like, or label indicator array / sparse matrix\n Ground truth (correct) labels.\n\n y_pred : 1d array-like, or label indicator array / sparse matrix\n Predicted labels, as returned by a classifier.\n\n normalize : bool, default=True\n If ``False``, return the number of misclassifications.\n Otherwise, return the fraction of misclassifications.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n Returns\n -------\n loss : float or int,\n If ``normalize == True``, return the fraction of misclassifications\n (float), else it returns the number of misclassifications (int).\n\n Notes\n -----\n In multilabel classification, the zero_one_loss function corresponds to\n the subset zero-one loss: for each sample, the entire set of labels must be\n correctly predicted, otherwise the loss for that sample is equal to one.\n\n See Also\n --------\n accuracy_score, hamming_loss, jaccard_score\n\n Examples\n --------\n >>> from sklearn.metrics import zero_one_loss\n >>> y_pred = [1, 2, 3, 4]\n >>> y_true = [2, 2, 3, 4]\n >>> zero_one_loss(y_true, y_pred)\n 0.25\n >>> zero_one_loss(y_true, y_pred, normalize=False)\n 1\n\n In the multilabel case with binary label indicators:\n\n >>> import numpy as np\n >>> zero_one_loss(np.array([[0, 1], [1, 1]]), np.ones((2, 2)))\n 0.5\n \"\"\"\n score = accuracy_score(y_true, y_pred, normalize=normalize, sample_weight=sample_weight)\n if normalize:\n return 1 - score\n else:\n if sample_weight is not None:\n n_samples = np.sum(sample_weight)\n else:\n n_samples = _num_samples(y_true)\n return n_samples - score" + "description": "Zero-one classification loss.\n\nIf normalize is ``True``, return the fraction of misclassifications\n(float), else it returns the number of 
misclassifications (int). The best\nperformance is 0.\n\nRead more in the :ref:`User Guide `.", + "docstring": "Zero-one classification loss.\n\n If normalize is ``True``, return the fraction of misclassifications\n (float), else it returns the number of misclassifications (int). The best\n performance is 0.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n y_true : 1d array-like, or label indicator array / sparse matrix\n Ground truth (correct) labels.\n\n y_pred : 1d array-like, or label indicator array / sparse matrix\n Predicted labels, as returned by a classifier.\n\n normalize : bool, default=True\n If ``False``, return the number of misclassifications.\n Otherwise, return the fraction of misclassifications.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n Returns\n -------\n loss : float or int,\n If ``normalize == True``, return the fraction of misclassifications\n (float), else it returns the number of misclassifications (int).\n\n See Also\n --------\n accuracy_score : Compute the accuracy score. By default, the function will\n return the fraction of correct predictions divided by the total number\n of predictions.\n hamming_loss : Compute the average Hamming loss or Hamming distance between\n two sets of samples.\n jaccard_score : Compute the Jaccard similarity coefficient score.\n\n Notes\n -----\n In multilabel classification, the zero_one_loss function corresponds to\n the subset zero-one loss: for each sample, the entire set of labels must be\n correctly predicted, otherwise the loss for that sample is equal to one.\n\n Examples\n --------\n >>> from sklearn.metrics import zero_one_loss\n >>> y_pred = [1, 2, 3, 4]\n >>> y_true = [2, 2, 3, 4]\n >>> zero_one_loss(y_true, y_pred)\n 0.25\n >>> zero_one_loss(y_true, y_pred, normalize=False)\n 1\n\n In the multilabel case with binary label indicators:\n\n >>> import numpy as np\n >>> zero_one_loss(np.array([[0, 1], [1, 1]]), np.ones((2, 2)))\n 0.5\n ", + "source_code": "\ndef zero_one_loss(y_true, y_pred, *, normalize=True, sample_weight=None):\n \"\"\"Zero-one classification loss.\n\n If normalize is ``True``, return the fraction of misclassifications\n (float), else it returns the number of misclassifications (int). The best\n performance is 0.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n y_true : 1d array-like, or label indicator array / sparse matrix\n Ground truth (correct) labels.\n\n y_pred : 1d array-like, or label indicator array / sparse matrix\n Predicted labels, as returned by a classifier.\n\n normalize : bool, default=True\n If ``False``, return the number of misclassifications.\n Otherwise, return the fraction of misclassifications.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n Returns\n -------\n loss : float or int,\n If ``normalize == True``, return the fraction of misclassifications\n (float), else it returns the number of misclassifications (int).\n\n See Also\n --------\n accuracy_score : Compute the accuracy score. 
By default, the function will\n return the fraction of correct predictions divided by the total number\n of predictions.\n hamming_loss : Compute the average Hamming loss or Hamming distance between\n two sets of samples.\n jaccard_score : Compute the Jaccard similarity coefficient score.\n\n Notes\n -----\n In multilabel classification, the zero_one_loss function corresponds to\n the subset zero-one loss: for each sample, the entire set of labels must be\n correctly predicted, otherwise the loss for that sample is equal to one.\n\n Examples\n --------\n >>> from sklearn.metrics import zero_one_loss\n >>> y_pred = [1, 2, 3, 4]\n >>> y_true = [2, 2, 3, 4]\n >>> zero_one_loss(y_true, y_pred)\n 0.25\n >>> zero_one_loss(y_true, y_pred, normalize=False)\n 1\n\n In the multilabel case with binary label indicators:\n\n >>> import numpy as np\n >>> zero_one_loss(np.array([[0, 1], [1, 1]]), np.ones((2, 2)))\n 0.5\n \"\"\"\n score = accuracy_score(y_true, y_pred, normalize=normalize, sample_weight=sample_weight)\n if normalize:\n return 1 - score\n else:\n if sample_weight is not None:\n n_samples = np.sum(sample_weight)\n else:\n n_samples = _num_samples(y_true)\n return n_samples - score" }, { "name": "_check_classifier_response_method", @@ -118236,7 +127055,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "response_method", @@ -118246,13 +127066,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Return prediction method from the response_method", - "docstring": "Return prediction method from the response_method\n\nParameters\n----------\nestimator: object\n Classifier to check\n\nresponse_method: {'auto', 'predict_proba', 'decision_function'}\n Specifies whether to use :term:`predict_proba` or\n :term:`decision_function` as the target response. If set to 'auto',\n :term:`predict_proba` is tried first and if it does not exist\n :term:`decision_function` is tried next.\n\nReturns\n-------\nprediction_method: callable\n prediction method of estimator", + "docstring": "Return prediction method from the response_method\n\n Parameters\n ----------\n estimator: object\n Classifier to check\n\n response_method: {'auto', 'predict_proba', 'decision_function'}\n Specifies whether to use :term:`predict_proba` or\n :term:`decision_function` as the target response. If set to 'auto',\n :term:`predict_proba` is tried first and if it does not exist\n :term:`decision_function` is tried next.\n\n Returns\n -------\n prediction_method: callable\n prediction method of estimator\n ", "source_code": "\ndef _check_classifier_response_method(estimator, response_method):\n \"\"\"Return prediction method from the response_method\n\n Parameters\n ----------\n estimator: object\n Classifier to check\n\n response_method: {'auto', 'predict_proba', 'decision_function'}\n Specifies whether to use :term:`predict_proba` or\n :term:`decision_function` as the target response. 
If set to 'auto',\n :term:`predict_proba` is tried first and if it does not exist\n :term:`decision_function` is tried next.\n\n Returns\n -------\n prediction_method: callable\n prediction method of estimator\n \"\"\"\n if response_method not in ('predict_proba', 'decision_function', 'auto'):\n raise ValueError(\"response_method must be 'predict_proba', 'decision_function' or 'auto'\")\n error_msg = 'response method {} is not defined in {}'\n if response_method != 'auto':\n prediction_method = getattr(estimator, response_method, None)\n if prediction_method is None:\n raise ValueError(error_msg.format(response_method, estimator.__class__.__name__))\n else:\n predict_proba = getattr(estimator, 'predict_proba', None)\n decision_function = getattr(estimator, 'decision_function', None)\n prediction_method = predict_proba or decision_function\n if prediction_method is None:\n raise ValueError(error_msg.format('decision_function or predict_proba', estimator.__class__.__name__))\n return prediction_method" }, { @@ -118270,6 +127091,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "Input values." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -118280,7 +127105,8 @@ "docstring": { "type": "estimator instance", "description": "Fitted classifier or a fitted :class:`~sklearn.pipeline.Pipeline`\nin which the last estimator is a classifier." - } + }, + "refined_type": {} }, { "name": "response_method", @@ -118290,7 +127116,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "pos_label", @@ -118300,13 +127127,14 @@ "docstring": { "type": "str or int, default=None", "description": "The class considered as the positive class when computing\nthe metrics. By default, `estimators.classes_[1]` is\nconsidered as the positive class." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Return response and positive label.", - "docstring": "Return response and positive label.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n Input values.\n\nestimator : estimator instance\n Fitted classifier or a fitted :class:`~sklearn.pipeline.Pipeline`\n in which the last estimator is a classifier.\n\nresponse_method: {'auto', 'predict_proba', 'decision_function'}\n Specifies whether to use :term:`predict_proba` or\n :term:`decision_function` as the target response. If set to 'auto',\n :term:`predict_proba` is tried first and if it does not exist\n :term:`decision_function` is tried next.\n\npos_label : str or int, default=None\n The class considered as the positive class when computing\n the metrics. By default, `estimators.classes_[1]` is\n considered as the positive class.\n\nReturns\n-------\ny_pred: ndarray of shape (n_samples,)\n Target scores calculated from the provided response_method\n and pos_label.\n\npos_label: str or int\n The class considered as the positive class when computing\n the metrics.", + "docstring": "Return response and positive label.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Input values.\n\n estimator : estimator instance\n Fitted classifier or a fitted :class:`~sklearn.pipeline.Pipeline`\n in which the last estimator is a classifier.\n\n response_method: {'auto', 'predict_proba', 'decision_function'}\n Specifies whether to use :term:`predict_proba` or\n :term:`decision_function` as the target response. 
If set to 'auto',\n :term:`predict_proba` is tried first and if it does not exist\n :term:`decision_function` is tried next.\n\n pos_label : str or int, default=None\n The class considered as the positive class when computing\n the metrics. By default, `estimators.classes_[1]` is\n considered as the positive class.\n\n Returns\n -------\n y_pred: ndarray of shape (n_samples,)\n Target scores calculated from the provided response_method\n and pos_label.\n\n pos_label: str or int\n The class considered as the positive class when computing\n the metrics.\n ", "source_code": "\ndef _get_response(X, estimator, response_method, pos_label=None):\n \"\"\"Return response and positive label.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Input values.\n\n estimator : estimator instance\n Fitted classifier or a fitted :class:`~sklearn.pipeline.Pipeline`\n in which the last estimator is a classifier.\n\n response_method: {'auto', 'predict_proba', 'decision_function'}\n Specifies whether to use :term:`predict_proba` or\n :term:`decision_function` as the target response. If set to 'auto',\n :term:`predict_proba` is tried first and if it does not exist\n :term:`decision_function` is tried next.\n\n pos_label : str or int, default=None\n The class considered as the positive class when computing\n the metrics. By default, `estimators.classes_[1]` is\n considered as the positive class.\n\n Returns\n -------\n y_pred: ndarray of shape (n_samples,)\n Target scores calculated from the provided response_method\n and pos_label.\n\n pos_label: str or int\n The class considered as the positive class when computing\n the metrics.\n \"\"\"\n classification_error = f\"Expected 'estimator' to be a binary classifier, but got {estimator.__class__.__name__}\"\n if not is_classifier(estimator):\n raise ValueError(classification_error)\n prediction_method = _check_classifier_response_method(estimator, response_method)\n y_pred = prediction_method(X)\n if pos_label is not None:\n try:\n class_idx = estimator.classes_.tolist().index(pos_label)\n except ValueError as e:\n raise ValueError(f\"The class provided by 'pos_label' is unknown. Got {pos_label} instead of one of {set(estimator.classes_)}\") from e\n else:\n class_idx = 1\n pos_label = estimator.classes_[class_idx]\n if y_pred.ndim != 1:\n y_pred_shape = y_pred.shape[1]\n if y_pred_shape != 2:\n raise ValueError(f'{classification_error} fit on multiclass ({y_pred_shape} classes) data')\n y_pred = y_pred[:, class_idx]\n elif pos_label == estimator.classes_[0]:\n y_pred *= -1\n return y_pred, pos_label" }, { @@ -118324,7 +127152,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "confusion_matrix", @@ -118334,7 +127163,8 @@ "docstring": { "type": "ndarray of shape (n_classes, n_classes)", "description": "Confusion matrix." - } + }, + "refined_type": {} }, { "name": "display_labels", @@ -118344,13 +127174,14 @@ "docstring": { "type": "ndarray of shape (n_classes,), default=None", "description": "Display labels for plot. If None, display labels are set from 0 to\n`n_classes - 1`." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, confusion_matrix, *, display_labels=None):\n self.confusion_matrix = confusion_matrix\n self.display_labels = display_labels" }, { @@ -118368,7 +127199,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "estimator", @@ -118378,7 +127210,8 @@ "docstring": { "type": "estimator instance", "description": "Fitted classifier or a fitted :class:`~sklearn.pipeline.Pipeline`\nin which the last estimator is a classifier." - } + }, + "refined_type": {} }, { "name": "X", @@ -118388,6 +127221,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "Input values." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -118398,7 +127235,8 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "Target values." - } + }, + "refined_type": {} }, { "name": "labels", @@ -118408,7 +127246,8 @@ "docstring": { "type": "array-like of shape (n_classes,), default=None", "description": "List of labels to index the confusion matrix. This may be used to\nreorder or select a subset of labels. If `None` is given, those\nthat appear at least once in `y_true` or `y_pred` are used in\nsorted order." - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -118418,7 +127257,8 @@ "docstring": { "type": "array-like of shape (n_samples,), default=None", "description": "Sample weights." - } + }, + "refined_type": {} }, { "name": "normalize", @@ -118428,6 +127268,10 @@ "docstring": { "type": "{'true', 'pred', 'all'}, default=None", "description": "Either to normalize the counts display in the matrix:\n\n- if `'true'`, the confusion matrix is normalized over the true\n conditions (e.g. rows);\n- if `'pred'`, the confusion matrix is normalized over the\n predicted conditions (e.g. columns);\n- if `'all'`, the confusion matrix is normalized by the total\n number of samples;\n- if `None` (default), the confusion matrix will not be normalized." + }, + "refined_type": { + "kind": "EnumType", + "values": ["pred", "all", "true"] } }, { @@ -118438,7 +127282,8 @@ "docstring": { "type": "array-like of shape (n_classes,), default=None", "description": "Target names used for plotting. By default, `labels` will be used\nif it is defined, otherwise the unique labels of `y_true` and\n`y_pred` will be used." - } + }, + "refined_type": {} }, { "name": "include_values", @@ -118448,7 +127293,8 @@ "docstring": { "type": "bool, default=True", "description": "Includes values in confusion matrix." - } + }, + "refined_type": {} }, { "name": "xticks_rotation", @@ -118458,6 +127304,10 @@ "docstring": { "type": "{'vertical', 'horizontal'} or float, default='horizontal'", "description": "Rotation of xtick labels." + }, + "refined_type": { + "kind": "EnumType", + "values": ["horizontal", "vertical"] } }, { @@ -118468,7 +127318,8 @@ "docstring": { "type": "str, default=None", "description": "Format specification for values in confusion matrix. If `None`, the\nformat specification is 'd' or '.2g' whichever is shorter." - } + }, + "refined_type": {} }, { "name": "cmap", @@ -118478,7 +127329,8 @@ "docstring": { "type": "str or matplotlib Colormap, default='viridis'", "description": "Colormap recognized by matplotlib." 
- } + }, + "refined_type": {} }, { "name": "ax", @@ -118488,7 +127340,8 @@ "docstring": { "type": "matplotlib Axes, default=None", "description": "Axes object to plot on. If `None`, a new figure and axes is\ncreated." - } + }, + "refined_type": {} }, { "name": "colorbar", @@ -118498,13 +127351,14 @@ "docstring": { "type": "bool, default=True", "description": "Whether or not to add a colorbar to the plot." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Plot Confusion Matrix given an estimator and some data.\n\nRead more in the :ref:`User Guide `. .. versionadded:: 1.0", - "docstring": "Plot Confusion Matrix given an estimator and some data.\n\nRead more in the :ref:`User Guide `.\n\n.. versionadded:: 1.0\n\nParameters\n----------\nestimator : estimator instance\n Fitted classifier or a fitted :class:`~sklearn.pipeline.Pipeline`\n in which the last estimator is a classifier.\n\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n Input values.\n\ny : array-like of shape (n_samples,)\n Target values.\n\nlabels : array-like of shape (n_classes,), default=None\n List of labels to index the confusion matrix. This may be used to\n reorder or select a subset of labels. If `None` is given, those\n that appear at least once in `y_true` or `y_pred` are used in\n sorted order.\n\nsample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\nnormalize : {'true', 'pred', 'all'}, default=None\n Either to normalize the counts display in the matrix:\n\n - if `'true'`, the confusion matrix is normalized over the true\n conditions (e.g. rows);\n - if `'pred'`, the confusion matrix is normalized over the\n predicted conditions (e.g. columns);\n - if `'all'`, the confusion matrix is normalized by the total\n number of samples;\n - if `None` (default), the confusion matrix will not be normalized.\n\ndisplay_labels : array-like of shape (n_classes,), default=None\n Target names used for plotting. By default, `labels` will be used\n if it is defined, otherwise the unique labels of `y_true` and\n `y_pred` will be used.\n\ninclude_values : bool, default=True\n Includes values in confusion matrix.\n\nxticks_rotation : {'vertical', 'horizontal'} or float, default='horizontal'\n Rotation of xtick labels.\n\nvalues_format : str, default=None\n Format specification for values in confusion matrix. If `None`, the\n format specification is 'd' or '.2g' whichever is shorter.\n\ncmap : str or matplotlib Colormap, default='viridis'\n Colormap recognized by matplotlib.\n\nax : matplotlib Axes, default=None\n Axes object to plot on. If `None`, a new figure and axes is\n created.\n\ncolorbar : bool, default=True\n Whether or not to add a colorbar to the plot.\n\nReturns\n-------\ndisplay : :class:`~sklearn.metrics.ConfusionMatrixDisplay`\n\nSee Also\n--------\nConfusionMatrixDisplay.from_predictions : Plot the confusion matrix\n given the true and predicted labels.\n\nExamples\n--------\n>>> import matplotlib.pyplot as plt\n>>> from sklearn.datasets import make_classification\n>>> from sklearn.metrics import ConfusionMatrixDisplay\n>>> from sklearn.model_selection import train_test_split\n>>> from sklearn.svm import SVC\n>>> X, y = make_classification(random_state=0)\n>>> X_train, X_test, y_train, y_test = train_test_split(\n... X, y, random_state=0)\n>>> clf = SVC(random_state=0)\n>>> clf.fit(X_train, y_train)\nSVC(random_state=0)\n>>> ConfusionMatrixDisplay.from_estimator(\n... 
clf, X_test, y_test)\n<...>\n>>> plt.show()", + "description": "Plot Confusion Matrix given an estimator and some data.\n\nRead more in the :ref:`User Guide `.\n\n.. versionadded:: 1.0", + "docstring": "Plot Confusion Matrix given an estimator and some data.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 1.0\n\n Parameters\n ----------\n estimator : estimator instance\n Fitted classifier or a fitted :class:`~sklearn.pipeline.Pipeline`\n in which the last estimator is a classifier.\n\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Input values.\n\n y : array-like of shape (n_samples,)\n Target values.\n\n labels : array-like of shape (n_classes,), default=None\n List of labels to index the confusion matrix. This may be used to\n reorder or select a subset of labels. If `None` is given, those\n that appear at least once in `y_true` or `y_pred` are used in\n sorted order.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n normalize : {'true', 'pred', 'all'}, default=None\n Either to normalize the counts display in the matrix:\n\n - if `'true'`, the confusion matrix is normalized over the true\n conditions (e.g. rows);\n - if `'pred'`, the confusion matrix is normalized over the\n predicted conditions (e.g. columns);\n - if `'all'`, the confusion matrix is normalized by the total\n number of samples;\n - if `None` (default), the confusion matrix will not be normalized.\n\n display_labels : array-like of shape (n_classes,), default=None\n Target names used for plotting. By default, `labels` will be used\n if it is defined, otherwise the unique labels of `y_true` and\n `y_pred` will be used.\n\n include_values : bool, default=True\n Includes values in confusion matrix.\n\n xticks_rotation : {'vertical', 'horizontal'} or float, default='horizontal'\n Rotation of xtick labels.\n\n values_format : str, default=None\n Format specification for values in confusion matrix. If `None`, the\n format specification is 'd' or '.2g' whichever is shorter.\n\n cmap : str or matplotlib Colormap, default='viridis'\n Colormap recognized by matplotlib.\n\n ax : matplotlib Axes, default=None\n Axes object to plot on. If `None`, a new figure and axes is\n created.\n\n colorbar : bool, default=True\n Whether or not to add a colorbar to the plot.\n\n Returns\n -------\n display : :class:`~sklearn.metrics.ConfusionMatrixDisplay`\n\n See Also\n --------\n ConfusionMatrixDisplay.from_predictions : Plot the confusion matrix\n given the true and predicted labels.\n\n Examples\n --------\n >>> import matplotlib.pyplot as plt\n >>> from sklearn.datasets import make_classification\n >>> from sklearn.metrics import ConfusionMatrixDisplay\n >>> from sklearn.model_selection import train_test_split\n >>> from sklearn.svm import SVC\n >>> X, y = make_classification(random_state=0)\n >>> X_train, X_test, y_train, y_test = train_test_split(\n ... X, y, random_state=0)\n >>> clf = SVC(random_state=0)\n >>> clf.fit(X_train, y_train)\n SVC(random_state=0)\n >>> ConfusionMatrixDisplay.from_estimator(\n ... clf, X_test, y_test)\n <...>\n >>> plt.show()\n ", "source_code": "\n@classmethod\ndef from_estimator(cls, estimator, X, y, *, labels=None, sample_weight=None, normalize=None, display_labels=None, include_values=True, xticks_rotation='horizontal', values_format=None, cmap='viridis', ax=None, colorbar=True):\n \"\"\"Plot Confusion Matrix given an estimator and some data.\n\n Read more in the :ref:`User Guide `.\n\n .. 
versionadded:: 1.0\n\n Parameters\n ----------\n estimator : estimator instance\n Fitted classifier or a fitted :class:`~sklearn.pipeline.Pipeline`\n in which the last estimator is a classifier.\n\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Input values.\n\n y : array-like of shape (n_samples,)\n Target values.\n\n labels : array-like of shape (n_classes,), default=None\n List of labels to index the confusion matrix. This may be used to\n reorder or select a subset of labels. If `None` is given, those\n that appear at least once in `y_true` or `y_pred` are used in\n sorted order.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n normalize : {'true', 'pred', 'all'}, default=None\n Either to normalize the counts display in the matrix:\n\n - if `'true'`, the confusion matrix is normalized over the true\n conditions (e.g. rows);\n - if `'pred'`, the confusion matrix is normalized over the\n predicted conditions (e.g. columns);\n - if `'all'`, the confusion matrix is normalized by the total\n number of samples;\n - if `None` (default), the confusion matrix will not be normalized.\n\n display_labels : array-like of shape (n_classes,), default=None\n Target names used for plotting. By default, `labels` will be used\n if it is defined, otherwise the unique labels of `y_true` and\n `y_pred` will be used.\n\n include_values : bool, default=True\n Includes values in confusion matrix.\n\n xticks_rotation : {'vertical', 'horizontal'} or float, default='horizontal'\n Rotation of xtick labels.\n\n values_format : str, default=None\n Format specification for values in confusion matrix. If `None`, the\n format specification is 'd' or '.2g' whichever is shorter.\n\n cmap : str or matplotlib Colormap, default='viridis'\n Colormap recognized by matplotlib.\n\n ax : matplotlib Axes, default=None\n Axes object to plot on. If `None`, a new figure and axes is\n created.\n\n colorbar : bool, default=True\n Whether or not to add a colorbar to the plot.\n\n Returns\n -------\n display : :class:`~sklearn.metrics.ConfusionMatrixDisplay`\n\n See Also\n --------\n ConfusionMatrixDisplay.from_predictions : Plot the confusion matrix\n given the true and predicted labels.\n\n Examples\n --------\n >>> import matplotlib.pyplot as plt\n >>> from sklearn.datasets import make_classification\n >>> from sklearn.metrics import ConfusionMatrixDisplay\n >>> from sklearn.model_selection import train_test_split\n >>> from sklearn.svm import SVC\n >>> X, y = make_classification(random_state=0)\n >>> X_train, X_test, y_train, y_test = train_test_split(\n ... X, y, random_state=0)\n >>> clf = SVC(random_state=0)\n >>> clf.fit(X_train, y_train)\n SVC(random_state=0)\n >>> ConfusionMatrixDisplay.from_estimator(\n ... 
clf, X_test, y_test)\n <...>\n >>> plt.show()\n \"\"\"\n method_name = f'{cls.__name__}.from_estimator'\n check_matplotlib_support(method_name)\n if not is_classifier(estimator):\n raise ValueError(f'{method_name} only supports classifiers')\n y_pred = estimator.predict(X)\n return cls.from_predictions(y, y_pred, sample_weight=sample_weight, labels=labels, normalize=normalize, display_labels=display_labels, include_values=include_values, cmap=cmap, ax=ax, xticks_rotation=xticks_rotation, values_format=values_format, colorbar=colorbar)" }, { @@ -118522,7 +127376,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y_true", @@ -118532,7 +127387,8 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "True labels." - } + }, + "refined_type": {} }, { "name": "y_pred", @@ -118542,7 +127398,8 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "The predicted labels given by the method `predict` of an\nclassifier." - } + }, + "refined_type": {} }, { "name": "labels", @@ -118552,7 +127409,8 @@ "docstring": { "type": "array-like of shape (n_classes,), default=None", "description": "List of labels to index the confusion matrix. This may be used to\nreorder or select a subset of labels. If `None` is given, those\nthat appear at least once in `y_true` or `y_pred` are used in\nsorted order." - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -118562,7 +127420,8 @@ "docstring": { "type": "array-like of shape (n_samples,), default=None", "description": "Sample weights." - } + }, + "refined_type": {} }, { "name": "normalize", @@ -118572,6 +127431,10 @@ "docstring": { "type": "{'true', 'pred', 'all'}, default=None", "description": "Either to normalize the counts display in the matrix:\n\n- if `'true'`, the confusion matrix is normalized over the true\n conditions (e.g. rows);\n- if `'pred'`, the confusion matrix is normalized over the\n predicted conditions (e.g. columns);\n- if `'all'`, the confusion matrix is normalized by the total\n number of samples;\n- if `None` (default), the confusion matrix will not be normalized." + }, + "refined_type": { + "kind": "EnumType", + "values": ["pred", "all", "true"] } }, { @@ -118582,7 +127445,8 @@ "docstring": { "type": "array-like of shape (n_classes,), default=None", "description": "Target names used for plotting. By default, `labels` will be used\nif it is defined, otherwise the unique labels of `y_true` and\n`y_pred` will be used." - } + }, + "refined_type": {} }, { "name": "include_values", @@ -118592,7 +127456,8 @@ "docstring": { "type": "bool, default=True", "description": "Includes values in confusion matrix." - } + }, + "refined_type": {} }, { "name": "xticks_rotation", @@ -118602,6 +127467,10 @@ "docstring": { "type": "{'vertical', 'horizontal'} or float, default='horizontal'", "description": "Rotation of xtick labels." + }, + "refined_type": { + "kind": "EnumType", + "values": ["horizontal", "vertical"] } }, { @@ -118612,7 +127481,8 @@ "docstring": { "type": "str, default=None", "description": "Format specification for values in confusion matrix. If `None`, the\nformat specification is 'd' or '.2g' whichever is shorter." - } + }, + "refined_type": {} }, { "name": "cmap", @@ -118622,7 +127492,8 @@ "docstring": { "type": "str or matplotlib Colormap, default='viridis'", "description": "Colormap recognized by matplotlib." 
- } + }, + "refined_type": {} }, { "name": "ax", @@ -118632,7 +127503,8 @@ "docstring": { "type": "matplotlib Axes, default=None", "description": "Axes object to plot on. If `None`, a new figure and axes is\ncreated." - } + }, + "refined_type": {} }, { "name": "colorbar", @@ -118642,13 +127514,14 @@ "docstring": { "type": "bool, default=True", "description": "Whether or not to add a colorbar to the plot." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Plot Confusion Matrix given true and predicted labels.\n\nRead more in the :ref:`User Guide `. .. versionadded:: 0.24", - "docstring": "Plot Confusion Matrix given true and predicted labels.\n\nRead more in the :ref:`User Guide `.\n\n.. versionadded:: 0.24\n\nParameters\n----------\ny_true : array-like of shape (n_samples,)\n True labels.\n\ny_pred : array-like of shape (n_samples,)\n The predicted labels given by the method `predict` of an\n classifier.\n\nlabels : array-like of shape (n_classes,), default=None\n List of labels to index the confusion matrix. This may be used to\n reorder or select a subset of labels. If `None` is given, those\n that appear at least once in `y_true` or `y_pred` are used in\n sorted order.\n\nsample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\nnormalize : {'true', 'pred', 'all'}, default=None\n Either to normalize the counts display in the matrix:\n\n - if `'true'`, the confusion matrix is normalized over the true\n conditions (e.g. rows);\n - if `'pred'`, the confusion matrix is normalized over the\n predicted conditions (e.g. columns);\n - if `'all'`, the confusion matrix is normalized by the total\n number of samples;\n - if `None` (default), the confusion matrix will not be normalized.\n\ndisplay_labels : array-like of shape (n_classes,), default=None\n Target names used for plotting. By default, `labels` will be used\n if it is defined, otherwise the unique labels of `y_true` and\n `y_pred` will be used.\n\ninclude_values : bool, default=True\n Includes values in confusion matrix.\n\nxticks_rotation : {'vertical', 'horizontal'} or float, default='horizontal'\n Rotation of xtick labels.\n\nvalues_format : str, default=None\n Format specification for values in confusion matrix. If `None`, the\n format specification is 'd' or '.2g' whichever is shorter.\n\ncmap : str or matplotlib Colormap, default='viridis'\n Colormap recognized by matplotlib.\n\nax : matplotlib Axes, default=None\n Axes object to plot on. If `None`, a new figure and axes is\n created.\n\ncolorbar : bool, default=True\n Whether or not to add a colorbar to the plot.\n\nReturns\n-------\ndisplay : :class:`~sklearn.metrics.ConfusionMatrixDisplay`\n\nSee Also\n--------\nConfusionMatrixDisplay.from_estimator : Plot the confusion matrix\n given an estimator, the data, and the label.\n\nExamples\n--------\n>>> import matplotlib.pyplot as plt\n>>> from sklearn.datasets import make_classification\n>>> from sklearn.metrics import ConfusionMatrixDisplay\n>>> from sklearn.model_selection import train_test_split\n>>> from sklearn.svm import SVC\n>>> X, y = make_classification(random_state=0)\n>>> X_train, X_test, y_train, y_test = train_test_split(\n... X, y, random_state=0)\n>>> clf = SVC(random_state=0)\n>>> clf.fit(X_train, y_train)\nSVC(random_state=0)\n>>> y_pred = clf.predict(X_test)\n>>> ConfusionMatrixDisplay.from_predictions(\n... 
y_test, y_pred)\n<...>\n>>> plt.show()", + "description": "Plot Confusion Matrix given true and predicted labels.\n\nRead more in the :ref:`User Guide `.\n\n.. versionadded:: 0.24", + "docstring": "Plot Confusion Matrix given true and predicted labels.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.24\n\n Parameters\n ----------\n y_true : array-like of shape (n_samples,)\n True labels.\n\n y_pred : array-like of shape (n_samples,)\n The predicted labels given by the method `predict` of an\n classifier.\n\n labels : array-like of shape (n_classes,), default=None\n List of labels to index the confusion matrix. This may be used to\n reorder or select a subset of labels. If `None` is given, those\n that appear at least once in `y_true` or `y_pred` are used in\n sorted order.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n normalize : {'true', 'pred', 'all'}, default=None\n Either to normalize the counts display in the matrix:\n\n - if `'true'`, the confusion matrix is normalized over the true\n conditions (e.g. rows);\n - if `'pred'`, the confusion matrix is normalized over the\n predicted conditions (e.g. columns);\n - if `'all'`, the confusion matrix is normalized by the total\n number of samples;\n - if `None` (default), the confusion matrix will not be normalized.\n\n display_labels : array-like of shape (n_classes,), default=None\n Target names used for plotting. By default, `labels` will be used\n if it is defined, otherwise the unique labels of `y_true` and\n `y_pred` will be used.\n\n include_values : bool, default=True\n Includes values in confusion matrix.\n\n xticks_rotation : {'vertical', 'horizontal'} or float, default='horizontal'\n Rotation of xtick labels.\n\n values_format : str, default=None\n Format specification for values in confusion matrix. If `None`, the\n format specification is 'd' or '.2g' whichever is shorter.\n\n cmap : str or matplotlib Colormap, default='viridis'\n Colormap recognized by matplotlib.\n\n ax : matplotlib Axes, default=None\n Axes object to plot on. If `None`, a new figure and axes is\n created.\n\n colorbar : bool, default=True\n Whether or not to add a colorbar to the plot.\n\n Returns\n -------\n display : :class:`~sklearn.metrics.ConfusionMatrixDisplay`\n\n See Also\n --------\n ConfusionMatrixDisplay.from_estimator : Plot the confusion matrix\n given an estimator, the data, and the label.\n\n Examples\n --------\n >>> import matplotlib.pyplot as plt\n >>> from sklearn.datasets import make_classification\n >>> from sklearn.metrics import ConfusionMatrixDisplay\n >>> from sklearn.model_selection import train_test_split\n >>> from sklearn.svm import SVC\n >>> X, y = make_classification(random_state=0)\n >>> X_train, X_test, y_train, y_test = train_test_split(\n ... X, y, random_state=0)\n >>> clf = SVC(random_state=0)\n >>> clf.fit(X_train, y_train)\n SVC(random_state=0)\n >>> y_pred = clf.predict(X_test)\n >>> ConfusionMatrixDisplay.from_predictions(\n ... y_test, y_pred)\n <...>\n >>> plt.show()\n ", "source_code": "\n@classmethod\ndef from_predictions(cls, y_true, y_pred, *, labels=None, sample_weight=None, normalize=None, display_labels=None, include_values=True, xticks_rotation='horizontal', values_format=None, cmap='viridis', ax=None, colorbar=True):\n \"\"\"Plot Confusion Matrix given true and predicted labels.\n\n Read more in the :ref:`User Guide `.\n\n .. 
versionadded:: 0.24\n\n Parameters\n ----------\n y_true : array-like of shape (n_samples,)\n True labels.\n\n y_pred : array-like of shape (n_samples,)\n The predicted labels given by the method `predict` of an\n classifier.\n\n labels : array-like of shape (n_classes,), default=None\n List of labels to index the confusion matrix. This may be used to\n reorder or select a subset of labels. If `None` is given, those\n that appear at least once in `y_true` or `y_pred` are used in\n sorted order.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n normalize : {'true', 'pred', 'all'}, default=None\n Either to normalize the counts display in the matrix:\n\n - if `'true'`, the confusion matrix is normalized over the true\n conditions (e.g. rows);\n - if `'pred'`, the confusion matrix is normalized over the\n predicted conditions (e.g. columns);\n - if `'all'`, the confusion matrix is normalized by the total\n number of samples;\n - if `None` (default), the confusion matrix will not be normalized.\n\n display_labels : array-like of shape (n_classes,), default=None\n Target names used for plotting. By default, `labels` will be used\n if it is defined, otherwise the unique labels of `y_true` and\n `y_pred` will be used.\n\n include_values : bool, default=True\n Includes values in confusion matrix.\n\n xticks_rotation : {'vertical', 'horizontal'} or float, default='horizontal'\n Rotation of xtick labels.\n\n values_format : str, default=None\n Format specification for values in confusion matrix. If `None`, the\n format specification is 'd' or '.2g' whichever is shorter.\n\n cmap : str or matplotlib Colormap, default='viridis'\n Colormap recognized by matplotlib.\n\n ax : matplotlib Axes, default=None\n Axes object to plot on. If `None`, a new figure and axes is\n created.\n\n colorbar : bool, default=True\n Whether or not to add a colorbar to the plot.\n\n Returns\n -------\n display : :class:`~sklearn.metrics.ConfusionMatrixDisplay`\n\n See Also\n --------\n ConfusionMatrixDisplay.from_estimator : Plot the confusion matrix\n given an estimator, the data, and the label.\n\n Examples\n --------\n >>> import matplotlib.pyplot as plt\n >>> from sklearn.datasets import make_classification\n >>> from sklearn.metrics import ConfusionMatrixDisplay\n >>> from sklearn.model_selection import train_test_split\n >>> from sklearn.svm import SVC\n >>> X, y = make_classification(random_state=0)\n >>> X_train, X_test, y_train, y_test = train_test_split(\n ... X, y, random_state=0)\n >>> clf = SVC(random_state=0)\n >>> clf.fit(X_train, y_train)\n SVC(random_state=0)\n >>> y_pred = clf.predict(X_test)\n >>> ConfusionMatrixDisplay.from_predictions(\n ... y_test, y_pred)\n <...>\n >>> plt.show()\n \"\"\"\n check_matplotlib_support(f'{cls.__name__}.from_predictions')\n if display_labels is None:\n if labels is None:\n display_labels = unique_labels(y_true, y_pred)\n else:\n display_labels = labels\n cm = confusion_matrix(y_true, y_pred, sample_weight=sample_weight, labels=labels, normalize=normalize)\n disp = cls(confusion_matrix=cm, display_labels=display_labels)\n return disp.plot(include_values=include_values, cmap=cmap, ax=ax, xticks_rotation=xticks_rotation, values_format=values_format, colorbar=colorbar)" }, { @@ -118666,7 +127539,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "include_values", @@ -118676,7 +127550,8 @@ "docstring": { "type": "bool, default=True", "description": "Includes values in confusion matrix." 
- } + }, + "refined_type": {} }, { "name": "cmap", @@ -118686,7 +127561,8 @@ "docstring": { "type": "str or matplotlib Colormap, default='viridis'", "description": "Colormap recognized by matplotlib." - } + }, + "refined_type": {} }, { "name": "xticks_rotation", @@ -118696,6 +127572,10 @@ "docstring": { "type": "{'vertical', 'horizontal'} or float, default='horizontal'", "description": "Rotation of xtick labels." + }, + "refined_type": { + "kind": "EnumType", + "values": ["horizontal", "vertical"] } }, { @@ -118706,7 +127586,8 @@ "docstring": { "type": "str, default=None", "description": "Format specification for values in confusion matrix. If `None`,\nthe format specification is 'd' or '.2g' whichever is shorter." - } + }, + "refined_type": {} }, { "name": "ax", @@ -118716,7 +127597,8 @@ "docstring": { "type": "matplotlib axes, default=None", "description": "Axes object to plot on. If `None`, a new figure and axes is\ncreated." - } + }, + "refined_type": {} }, { "name": "colorbar", @@ -118726,13 +127608,14 @@ "docstring": { "type": "bool, default=True", "description": "Whether or not to add a colorbar to the plot." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Plot visualization.", - "docstring": "Plot visualization.\n\nParameters\n----------\ninclude_values : bool, default=True\n Includes values in confusion matrix.\n\ncmap : str or matplotlib Colormap, default='viridis'\n Colormap recognized by matplotlib.\n\nxticks_rotation : {'vertical', 'horizontal'} or float, default='horizontal'\n Rotation of xtick labels.\n\nvalues_format : str, default=None\n Format specification for values in confusion matrix. If `None`,\n the format specification is 'd' or '.2g' whichever is shorter.\n\nax : matplotlib axes, default=None\n Axes object to plot on. If `None`, a new figure and axes is\n created.\n\ncolorbar : bool, default=True\n Whether or not to add a colorbar to the plot.\n\nReturns\n-------\ndisplay : :class:`~sklearn.metrics.ConfusionMatrixDisplay`", + "docstring": "Plot visualization.\n\n Parameters\n ----------\n include_values : bool, default=True\n Includes values in confusion matrix.\n\n cmap : str or matplotlib Colormap, default='viridis'\n Colormap recognized by matplotlib.\n\n xticks_rotation : {'vertical', 'horizontal'} or float, default='horizontal'\n Rotation of xtick labels.\n\n values_format : str, default=None\n Format specification for values in confusion matrix. If `None`,\n the format specification is 'd' or '.2g' whichever is shorter.\n\n ax : matplotlib axes, default=None\n Axes object to plot on. If `None`, a new figure and axes is\n created.\n\n colorbar : bool, default=True\n Whether or not to add a colorbar to the plot.\n\n Returns\n -------\n display : :class:`~sklearn.metrics.ConfusionMatrixDisplay`\n ", "source_code": "\ndef plot(self, *, include_values=True, cmap='viridis', xticks_rotation='horizontal', values_format=None, ax=None, colorbar=True):\n \"\"\"Plot visualization.\n\n Parameters\n ----------\n include_values : bool, default=True\n Includes values in confusion matrix.\n\n cmap : str or matplotlib Colormap, default='viridis'\n Colormap recognized by matplotlib.\n\n xticks_rotation : {'vertical', 'horizontal'} or float, default='horizontal'\n Rotation of xtick labels.\n\n values_format : str, default=None\n Format specification for values in confusion matrix. If `None`,\n the format specification is 'd' or '.2g' whichever is shorter.\n\n ax : matplotlib axes, default=None\n Axes object to plot on. 
If `None`, a new figure and axes is\n created.\n\n colorbar : bool, default=True\n Whether or not to add a colorbar to the plot.\n\n Returns\n -------\n display : :class:`~sklearn.metrics.ConfusionMatrixDisplay`\n \"\"\"\n check_matplotlib_support('ConfusionMatrixDisplay.plot')\n import matplotlib.pyplot as plt\n if ax is None:\n (fig, ax) = plt.subplots()\n else:\n fig = ax.figure\n cm = self.confusion_matrix\n n_classes = cm.shape[0]\n self.im_ = ax.imshow(cm, interpolation='nearest', cmap=cmap)\n self.text_ = None\n (cmap_min, cmap_max) = (self.im_.cmap(0), self.im_.cmap(1.0))\n if include_values:\n self.text_ = np.empty_like(cm, dtype=object)\n thresh = (cm.max() + cm.min()) / 2.0\n for (i, j) in product(range(n_classes), range(n_classes)):\n color = cmap_max if cm[i, j] < thresh else cmap_min\n if values_format is None:\n text_cm = format(cm[i, j], '.2g')\n if cm.dtype.kind != 'f':\n text_d = format(cm[i, j], 'd')\n if len(text_d) < len(text_cm):\n text_cm = text_d\n else:\n text_cm = format(cm[i, j], values_format)\n self.text_[i, j] = ax.text(j, i, text_cm, ha='center', va='center', color=color)\n if self.display_labels is None:\n display_labels = np.arange(n_classes)\n else:\n display_labels = self.display_labels\n if colorbar:\n fig.colorbar(self.im_, ax=ax)\n ax.set(xticks=np.arange(n_classes), yticks=np.arange(n_classes), xticklabels=display_labels, yticklabels=display_labels, ylabel='True label', xlabel='Predicted label')\n ax.set_ylim((n_classes - 0.5, -0.5))\n plt.setp(ax.get_xticklabels(), rotation=xticks_rotation)\n self.figure_ = fig\n self.ax_ = ax\n return self" }, { @@ -118752,7 +127635,8 @@ "docstring": { "type": "estimator instance", "description": "Fitted classifier or a fitted :class:`~sklearn.pipeline.Pipeline`\nin which the last estimator is a classifier." - } + }, + "refined_type": {} }, { "name": "X", @@ -118762,6 +127646,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "Input values." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -118772,7 +127660,8 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "Target values." - } + }, + "refined_type": {} }, { "name": "labels", @@ -118782,7 +127671,8 @@ "docstring": { "type": "array-like of shape (n_classes,), default=None", "description": "List of labels to index the matrix. This may be used to reorder or\nselect a subset of labels. If `None` is given, those that appear at\nleast once in `y_true` or `y_pred` are used in sorted order." - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -118792,7 +127682,8 @@ "docstring": { "type": "array-like of shape (n_samples,), default=None", "description": "Sample weights." - } + }, + "refined_type": {} }, { "name": "normalize", @@ -118802,6 +127693,10 @@ "docstring": { "type": "{'true', 'pred', 'all'}, default=None", "description": "Either to normalize the counts display in the matrix:\n\n - if `'true'`, the confusion matrix is normalized over the true\n conditions (e.g. rows);\n - if `'pred'`, the confusion matrix is normalized over the\n predicted conditions (e.g. columns);\n - if `'all'`, the confusion matrix is normalized by the total\n number of samples;\n - if `None` (default), the confusion matrix will not be normalized." + }, + "refined_type": { + "kind": "EnumType", + "values": ["pred", "all", "true"] } }, { @@ -118812,7 +127707,8 @@ "docstring": { "type": "array-like of shape (n_classes,), default=None", "description": "Target names used for plotting. 
By default, `labels` will be used if\nit is defined, otherwise the unique labels of `y_true` and `y_pred`\nwill be used." - } + }, + "refined_type": {} }, { "name": "include_values", @@ -118822,7 +127718,8 @@ "docstring": { "type": "bool, default=True", "description": "Includes values in confusion matrix." - } + }, + "refined_type": {} }, { "name": "xticks_rotation", @@ -118832,6 +127729,10 @@ "docstring": { "type": "{'vertical', 'horizontal'} or float, default='horizontal'", "description": "Rotation of xtick labels." + }, + "refined_type": { + "kind": "EnumType", + "values": ["horizontal", "vertical"] } }, { @@ -118842,7 +127743,8 @@ "docstring": { "type": "str, default=None", "description": "Format specification for values in confusion matrix. If `None`,\nthe format specification is 'd' or '.2g' whichever is shorter." - } + }, + "refined_type": {} }, { "name": "cmap", @@ -118852,7 +127754,8 @@ "docstring": { "type": "str or matplotlib Colormap, default='viridis'", "description": "Colormap recognized by matplotlib." - } + }, + "refined_type": {} }, { "name": "ax", @@ -118862,7 +127765,8 @@ "docstring": { "type": "matplotlib Axes, default=None", "description": "Axes object to plot on. If `None`, a new figure and axes is\ncreated." - } + }, + "refined_type": {} }, { "name": "colorbar", @@ -118872,13 +127776,14 @@ "docstring": { "type": "bool, default=True", "description": "Whether or not to add a colorbar to the plot.\n\n.. versionadded:: 0.24" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Plot Confusion Matrix.\n\nRead more in the :ref:`User Guide `. .. deprecated:: 1.0 `plot_confusion_matrix` is deprecated in 1.0 and will be removed in 1.2. Use one of the following class methods: :func:`~sklearn.metrics.ConfusionMatrixDisplay.from_predictions` or :func:`~sklearn.metrics.ConfusionMatrixDisplay.from_estimator`.", - "docstring": "Plot Confusion Matrix.\n\nRead more in the :ref:`User Guide `.\n\n.. deprecated:: 1.0\n `plot_confusion_matrix` is deprecated in 1.0 and will be removed in\n 1.2. Use one of the following class methods:\n :func:`~sklearn.metrics.ConfusionMatrixDisplay.from_predictions` or\n :func:`~sklearn.metrics.ConfusionMatrixDisplay.from_estimator`.\n\nParameters\n----------\nestimator : estimator instance\n Fitted classifier or a fitted :class:`~sklearn.pipeline.Pipeline`\n in which the last estimator is a classifier.\n\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n Input values.\n\ny_true : array-like of shape (n_samples,)\n Target values.\n\nlabels : array-like of shape (n_classes,), default=None\n List of labels to index the matrix. This may be used to reorder or\n select a subset of labels. If `None` is given, those that appear at\n least once in `y_true` or `y_pred` are used in sorted order.\n\nsample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\nnormalize : {'true', 'pred', 'all'}, default=None\n Either to normalize the counts display in the matrix:\n\n - if `'true'`, the confusion matrix is normalized over the true\n conditions (e.g. rows);\n - if `'pred'`, the confusion matrix is normalized over the\n predicted conditions (e.g. columns);\n - if `'all'`, the confusion matrix is normalized by the total\n number of samples;\n - if `None` (default), the confusion matrix will not be normalized.\n\ndisplay_labels : array-like of shape (n_classes,), default=None\n Target names used for plotting. 
By default, `labels` will be used if\n it is defined, otherwise the unique labels of `y_true` and `y_pred`\n will be used.\n\ninclude_values : bool, default=True\n Includes values in confusion matrix.\n\nxticks_rotation : {'vertical', 'horizontal'} or float, default='horizontal'\n Rotation of xtick labels.\n\nvalues_format : str, default=None\n Format specification for values in confusion matrix. If `None`,\n the format specification is 'd' or '.2g' whichever is shorter.\n\ncmap : str or matplotlib Colormap, default='viridis'\n Colormap recognized by matplotlib.\n\nax : matplotlib Axes, default=None\n Axes object to plot on. If `None`, a new figure and axes is\n created.\n\ncolorbar : bool, default=True\n Whether or not to add a colorbar to the plot.\n\n .. versionadded:: 0.24\n\nReturns\n-------\ndisplay : :class:`~sklearn.metrics.ConfusionMatrixDisplay`\n\nSee Also\n--------\nconfusion_matrix : Compute Confusion Matrix to evaluate the accuracy of a\n classification.\nConfusionMatrixDisplay : Confusion Matrix visualization.\n\nExamples\n--------\n>>> import matplotlib.pyplot as plt\n>>> from sklearn.datasets import make_classification\n>>> from sklearn.metrics import plot_confusion_matrix\n>>> from sklearn.model_selection import train_test_split\n>>> from sklearn.svm import SVC\n>>> X, y = make_classification(random_state=0)\n>>> X_train, X_test, y_train, y_test = train_test_split(\n... X, y, random_state=0)\n>>> clf = SVC(random_state=0)\n>>> clf.fit(X_train, y_train)\nSVC(random_state=0)\n>>> plot_confusion_matrix(clf, X_test, y_test) # doctest: +SKIP\n>>> plt.show()", + "description": "Plot Confusion Matrix.\n\nRead more in the :ref:`User Guide `.\n\n.. deprecated:: 1.0\n `plot_confusion_matrix` is deprecated in 1.0 and will be removed in\n 1.2. Use one of the following class methods:\n :func:`~sklearn.metrics.ConfusionMatrixDisplay.from_predictions` or\n :func:`~sklearn.metrics.ConfusionMatrixDisplay.from_estimator`.", + "docstring": "Plot Confusion Matrix.\n\n Read more in the :ref:`User Guide `.\n\n .. deprecated:: 1.0\n `plot_confusion_matrix` is deprecated in 1.0 and will be removed in\n 1.2. Use one of the following class methods:\n :func:`~sklearn.metrics.ConfusionMatrixDisplay.from_predictions` or\n :func:`~sklearn.metrics.ConfusionMatrixDisplay.from_estimator`.\n\n Parameters\n ----------\n estimator : estimator instance\n Fitted classifier or a fitted :class:`~sklearn.pipeline.Pipeline`\n in which the last estimator is a classifier.\n\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Input values.\n\n y_true : array-like of shape (n_samples,)\n Target values.\n\n labels : array-like of shape (n_classes,), default=None\n List of labels to index the matrix. This may be used to reorder or\n select a subset of labels. If `None` is given, those that appear at\n least once in `y_true` or `y_pred` are used in sorted order.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n normalize : {'true', 'pred', 'all'}, default=None\n Either to normalize the counts display in the matrix:\n\n - if `'true'`, the confusion matrix is normalized over the true\n conditions (e.g. rows);\n - if `'pred'`, the confusion matrix is normalized over the\n predicted conditions (e.g. columns);\n - if `'all'`, the confusion matrix is normalized by the total\n number of samples;\n - if `None` (default), the confusion matrix will not be normalized.\n\n display_labels : array-like of shape (n_classes,), default=None\n Target names used for plotting. 
By default, `labels` will be used if\n it is defined, otherwise the unique labels of `y_true` and `y_pred`\n will be used.\n\n include_values : bool, default=True\n Includes values in confusion matrix.\n\n xticks_rotation : {'vertical', 'horizontal'} or float, default='horizontal'\n Rotation of xtick labels.\n\n values_format : str, default=None\n Format specification for values in confusion matrix. If `None`,\n the format specification is 'd' or '.2g' whichever is shorter.\n\n cmap : str or matplotlib Colormap, default='viridis'\n Colormap recognized by matplotlib.\n\n ax : matplotlib Axes, default=None\n Axes object to plot on. If `None`, a new figure and axes is\n created.\n\n colorbar : bool, default=True\n Whether or not to add a colorbar to the plot.\n\n .. versionadded:: 0.24\n\n Returns\n -------\n display : :class:`~sklearn.metrics.ConfusionMatrixDisplay`\n\n See Also\n --------\n confusion_matrix : Compute Confusion Matrix to evaluate the accuracy of a\n classification.\n ConfusionMatrixDisplay : Confusion Matrix visualization.\n\n Examples\n --------\n >>> import matplotlib.pyplot as plt\n >>> from sklearn.datasets import make_classification\n >>> from sklearn.metrics import plot_confusion_matrix\n >>> from sklearn.model_selection import train_test_split\n >>> from sklearn.svm import SVC\n >>> X, y = make_classification(random_state=0)\n >>> X_train, X_test, y_train, y_test = train_test_split(\n ... X, y, random_state=0)\n >>> clf = SVC(random_state=0)\n >>> clf.fit(X_train, y_train)\n SVC(random_state=0)\n >>> plot_confusion_matrix(clf, X_test, y_test) # doctest: +SKIP\n >>> plt.show()\n ", "source_code": "\n@deprecated('Function `plot_confusion_matrix` is deprecated in 1.0 and will be removed in 1.2. Use one of the class methods: ConfusionMatrixDisplay.from_predictions or ConfusionMatrixDisplay.from_estimator.')\ndef plot_confusion_matrix(estimator, X, y_true, *, labels=None, sample_weight=None, normalize=None, display_labels=None, include_values=True, xticks_rotation='horizontal', values_format=None, cmap='viridis', ax=None, colorbar=True):\n \"\"\"Plot Confusion Matrix.\n\n Read more in the :ref:`User Guide `.\n\n .. deprecated:: 1.0\n `plot_confusion_matrix` is deprecated in 1.0 and will be removed in\n 1.2. Use one of the following class methods:\n :func:`~sklearn.metrics.ConfusionMatrixDisplay.from_predictions` or\n :func:`~sklearn.metrics.ConfusionMatrixDisplay.from_estimator`.\n\n Parameters\n ----------\n estimator : estimator instance\n Fitted classifier or a fitted :class:`~sklearn.pipeline.Pipeline`\n in which the last estimator is a classifier.\n\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Input values.\n\n y_true : array-like of shape (n_samples,)\n Target values.\n\n labels : array-like of shape (n_classes,), default=None\n List of labels to index the matrix. This may be used to reorder or\n select a subset of labels. If `None` is given, those that appear at\n least once in `y_true` or `y_pred` are used in sorted order.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n normalize : {'true', 'pred', 'all'}, default=None\n Either to normalize the counts display in the matrix:\n\n - if `'true'`, the confusion matrix is normalized over the true\n conditions (e.g. rows);\n - if `'pred'`, the confusion matrix is normalized over the\n predicted conditions (e.g. 
columns);\n - if `'all'`, the confusion matrix is normalized by the total\n number of samples;\n - if `None` (default), the confusion matrix will not be normalized.\n\n display_labels : array-like of shape (n_classes,), default=None\n Target names used for plotting. By default, `labels` will be used if\n it is defined, otherwise the unique labels of `y_true` and `y_pred`\n will be used.\n\n include_values : bool, default=True\n Includes values in confusion matrix.\n\n xticks_rotation : {'vertical', 'horizontal'} or float, default='horizontal'\n Rotation of xtick labels.\n\n values_format : str, default=None\n Format specification for values in confusion matrix. If `None`,\n the format specification is 'd' or '.2g' whichever is shorter.\n\n cmap : str or matplotlib Colormap, default='viridis'\n Colormap recognized by matplotlib.\n\n ax : matplotlib Axes, default=None\n Axes object to plot on. If `None`, a new figure and axes is\n created.\n\n colorbar : bool, default=True\n Whether or not to add a colorbar to the plot.\n\n .. versionadded:: 0.24\n\n Returns\n -------\n display : :class:`~sklearn.metrics.ConfusionMatrixDisplay`\n\n See Also\n --------\n confusion_matrix : Compute Confusion Matrix to evaluate the accuracy of a\n classification.\n ConfusionMatrixDisplay : Confusion Matrix visualization.\n\n Examples\n --------\n >>> import matplotlib.pyplot as plt\n >>> from sklearn.datasets import make_classification\n >>> from sklearn.metrics import plot_confusion_matrix\n >>> from sklearn.model_selection import train_test_split\n >>> from sklearn.svm import SVC\n >>> X, y = make_classification(random_state=0)\n >>> X_train, X_test, y_train, y_test = train_test_split(\n ... X, y, random_state=0)\n >>> clf = SVC(random_state=0)\n >>> clf.fit(X_train, y_train)\n SVC(random_state=0)\n >>> plot_confusion_matrix(clf, X_test, y_test) # doctest: +SKIP\n >>> plt.show()\n \"\"\"\n check_matplotlib_support('plot_confusion_matrix')\n if not is_classifier(estimator):\n raise ValueError('plot_confusion_matrix only supports classifiers')\n y_pred = estimator.predict(X)\n cm = confusion_matrix(y_true, y_pred, sample_weight=sample_weight, labels=labels, normalize=normalize)\n if display_labels is None:\n if labels is None:\n display_labels = unique_labels(y_true, y_pred)\n else:\n display_labels = labels\n disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=display_labels)\n return disp.plot(include_values=include_values, cmap=cmap, ax=ax, xticks_rotation=xticks_rotation, values_format=values_format, colorbar=colorbar)" }, { @@ -118896,7 +127801,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "fpr", @@ -118906,7 +127812,8 @@ "docstring": { "type": "ndarray", "description": "False positive rate." - } + }, + "refined_type": {} }, { "name": "fnr", @@ -118916,7 +127823,8 @@ "docstring": { "type": "ndarray", "description": "False negative rate." - } + }, + "refined_type": {} }, { "name": "estimator_name", @@ -118926,7 +127834,8 @@ "docstring": { "type": "str, default=None", "description": "Name of estimator. If None, the estimator name is not shown." - } + }, + "refined_type": {} }, { "name": "pos_label", @@ -118936,13 +127845,14 @@ "docstring": { "type": "str or int, default=None", "description": "The label of the positive class." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, *, fpr, fnr, estimator_name=None, pos_label=None):\n self.fpr = fpr\n self.fnr = fnr\n self.estimator_name = estimator_name\n self.pos_label = pos_label" }, { @@ -118960,7 +127870,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "estimator", @@ -118970,7 +127881,8 @@ "docstring": { "type": "estimator instance", "description": "Fitted classifier or a fitted :class:`~sklearn.pipeline.Pipeline`\nin which the last estimator is a classifier." - } + }, + "refined_type": {} }, { "name": "X", @@ -118980,6 +127892,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "Input values." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -118990,7 +127906,8 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "Target values." - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -119000,7 +127917,8 @@ "docstring": { "type": "array-like of shape (n_samples,), default=None", "description": "Sample weights." - } + }, + "refined_type": {} }, { "name": "response_method", @@ -119010,6 +127928,10 @@ "docstring": { "type": "{'predict_proba', 'decision_function', 'auto'} default='auto'", "description": "Specifies whether to use :term:`predict_proba` or\n:term:`decision_function` as the predicted target response. If set\nto 'auto', :term:`predict_proba` is tried first and if it does not\nexist :term:`decision_function` is tried next." + }, + "refined_type": { + "kind": "EnumType", + "values": ["auto", "decision_function", "predict_proba"] } }, { @@ -119020,6 +127942,10 @@ "docstring": { "type": "str or int, default=None", "description": "The label of the positive class. When `pos_label=None`, if `y_true`\nis in {-1, 1} or {0, 1}, `pos_label` is set to 1, otherwise an\nerror will be raised." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -119030,7 +127956,8 @@ "docstring": { "type": "str, default=None", "description": "Name of DET curve for labeling. If `None`, use the name of the\nestimator." - } + }, + "refined_type": {} }, { "name": "ax", @@ -119040,13 +127967,14 @@ "docstring": { "type": "matplotlib axes, default=None", "description": "Axes object to plot on. If `None`, a new figure and axes is\ncreated." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Plot DET curve given an estimator and data.\n\nRead more in the :ref:`User Guide `. .. versionadded:: 1.0", - "docstring": "Plot DET curve given an estimator and data.\n\nRead more in the :ref:`User Guide `.\n\n.. versionadded:: 1.0\n\nParameters\n----------\nestimator : estimator instance\n Fitted classifier or a fitted :class:`~sklearn.pipeline.Pipeline`\n in which the last estimator is a classifier.\n\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n Input values.\n\ny : array-like of shape (n_samples,)\n Target values.\n\nsample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\nresponse_method : {'predict_proba', 'decision_function', 'auto'} default='auto'\n Specifies whether to use :term:`predict_proba` or\n :term:`decision_function` as the predicted target response. 
If set\n to 'auto', :term:`predict_proba` is tried first and if it does not\n exist :term:`decision_function` is tried next.\n\npos_label : str or int, default=None\n The label of the positive class. When `pos_label=None`, if `y_true`\n is in {-1, 1} or {0, 1}, `pos_label` is set to 1, otherwise an\n error will be raised.\n\nname : str, default=None\n Name of DET curve for labeling. If `None`, use the name of the\n estimator.\n\nax : matplotlib axes, default=None\n Axes object to plot on. If `None`, a new figure and axes is\n created.\n\n**kwargs : dict\n Additional keywords arguments passed to matplotlib `plot` function.\n\nReturns\n-------\ndisplay : :class:`~sklearn.metrics.DetCurveDisplay`\n Object that stores computed values.\n\nSee Also\n--------\ndet_curve : Compute error rates for different probability thresholds.\nDetCurveDisplay.from_predictions : Plot DET curve given the true and\n predicted labels.\nplot_roc_curve : Plot Receiver operating characteristic (ROC) curve.\n\nExamples\n--------\n>>> import matplotlib.pyplot as plt\n>>> from sklearn.datasets import make_classification\n>>> from sklearn.metrics import DetCurveDisplay\n>>> from sklearn.model_selection import train_test_split\n>>> from sklearn.svm import SVC\n>>> X, y = make_classification(n_samples=1000, random_state=0)\n>>> X_train, X_test, y_train, y_test = train_test_split(\n... X, y, test_size=0.4, random_state=0)\n>>> clf = SVC(random_state=0).fit(X_train, y_train)\n>>> DetCurveDisplay.from_estimator(\n... clf, X_test, y_test)\n<...>\n>>> plt.show()", + "description": "Plot DET curve given an estimator and data.\n\nRead more in the :ref:`User Guide `.\n\n.. versionadded:: 1.0", + "docstring": "Plot DET curve given an estimator and data.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 1.0\n\n Parameters\n ----------\n estimator : estimator instance\n Fitted classifier or a fitted :class:`~sklearn.pipeline.Pipeline`\n in which the last estimator is a classifier.\n\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Input values.\n\n y : array-like of shape (n_samples,)\n Target values.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n response_method : {'predict_proba', 'decision_function', 'auto'} default='auto'\n Specifies whether to use :term:`predict_proba` or\n :term:`decision_function` as the predicted target response. If set\n to 'auto', :term:`predict_proba` is tried first and if it does not\n exist :term:`decision_function` is tried next.\n\n pos_label : str or int, default=None\n The label of the positive class. When `pos_label=None`, if `y_true`\n is in {-1, 1} or {0, 1}, `pos_label` is set to 1, otherwise an\n error will be raised.\n\n name : str, default=None\n Name of DET curve for labeling. If `None`, use the name of the\n estimator.\n\n ax : matplotlib axes, default=None\n Axes object to plot on. 
If `None`, a new figure and axes is\n created.\n\n **kwargs : dict\n Additional keywords arguments passed to matplotlib `plot` function.\n\n Returns\n -------\n display : :class:`~sklearn.metrics.DetCurveDisplay`\n Object that stores computed values.\n\n See Also\n --------\n det_curve : Compute error rates for different probability thresholds.\n DetCurveDisplay.from_predictions : Plot DET curve given the true and\n predicted labels.\n plot_roc_curve : Plot Receiver operating characteristic (ROC) curve.\n\n Examples\n --------\n >>> import matplotlib.pyplot as plt\n >>> from sklearn.datasets import make_classification\n >>> from sklearn.metrics import DetCurveDisplay\n >>> from sklearn.model_selection import train_test_split\n >>> from sklearn.svm import SVC\n >>> X, y = make_classification(n_samples=1000, random_state=0)\n >>> X_train, X_test, y_train, y_test = train_test_split(\n ... X, y, test_size=0.4, random_state=0)\n >>> clf = SVC(random_state=0).fit(X_train, y_train)\n >>> DetCurveDisplay.from_estimator(\n ... clf, X_test, y_test)\n <...>\n >>> plt.show()\n ", "source_code": "\n@classmethod\ndef from_estimator(cls, estimator, X, y, *, sample_weight=None, response_method='auto', pos_label=None, name=None, ax=None, **kwargs):\n \"\"\"Plot DET curve given an estimator and data.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 1.0\n\n Parameters\n ----------\n estimator : estimator instance\n Fitted classifier or a fitted :class:`~sklearn.pipeline.Pipeline`\n in which the last estimator is a classifier.\n\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Input values.\n\n y : array-like of shape (n_samples,)\n Target values.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n response_method : {'predict_proba', 'decision_function', 'auto'} default='auto'\n Specifies whether to use :term:`predict_proba` or\n :term:`decision_function` as the predicted target response. If set\n to 'auto', :term:`predict_proba` is tried first and if it does not\n exist :term:`decision_function` is tried next.\n\n pos_label : str or int, default=None\n The label of the positive class. When `pos_label=None`, if `y_true`\n is in {-1, 1} or {0, 1}, `pos_label` is set to 1, otherwise an\n error will be raised.\n\n name : str, default=None\n Name of DET curve for labeling. If `None`, use the name of the\n estimator.\n\n ax : matplotlib axes, default=None\n Axes object to plot on. If `None`, a new figure and axes is\n created.\n\n **kwargs : dict\n Additional keywords arguments passed to matplotlib `plot` function.\n\n Returns\n -------\n display : :class:`~sklearn.metrics.DetCurveDisplay`\n Object that stores computed values.\n\n See Also\n --------\n det_curve : Compute error rates for different probability thresholds.\n DetCurveDisplay.from_predictions : Plot DET curve given the true and\n predicted labels.\n plot_roc_curve : Plot Receiver operating characteristic (ROC) curve.\n\n Examples\n --------\n >>> import matplotlib.pyplot as plt\n >>> from sklearn.datasets import make_classification\n >>> from sklearn.metrics import DetCurveDisplay\n >>> from sklearn.model_selection import train_test_split\n >>> from sklearn.svm import SVC\n >>> X, y = make_classification(n_samples=1000, random_state=0)\n >>> X_train, X_test, y_train, y_test = train_test_split(\n ... X, y, test_size=0.4, random_state=0)\n >>> clf = SVC(random_state=0).fit(X_train, y_train)\n >>> DetCurveDisplay.from_estimator(\n ... 
clf, X_test, y_test)\n <...>\n >>> plt.show()\n \"\"\"\n check_matplotlib_support(f'{cls.__name__}.from_estimator')\n name = estimator.__class__.__name__ if name is None else name\n (y_pred, pos_label) = _get_response(X, estimator, response_method, pos_label=pos_label)\n return cls.from_predictions(y_true=y, y_pred=y_pred, sample_weight=sample_weight, name=name, ax=ax, pos_label=pos_label, **kwargs)" }, { @@ -119064,7 +127992,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y_true", @@ -119074,7 +128003,8 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "True labels." - } + }, + "refined_type": {} }, { "name": "y_pred", @@ -119084,7 +128014,8 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "Target scores, can either be probability estimates of the positive\nclass, confidence values, or non-thresholded measure of decisions\n(as returned by `decision_function` on some classifiers)." - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -119094,7 +128025,8 @@ "docstring": { "type": "array-like of shape (n_samples,), default=None", "description": "Sample weights." - } + }, + "refined_type": {} }, { "name": "pos_label", @@ -119104,6 +128036,10 @@ "docstring": { "type": "str or int, default=None", "description": "The label of the positive class. When `pos_label=None`, if `y_true`\nis in {-1, 1} or {0, 1}, `pos_label` is set to 1, otherwise an\nerror will be raised." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -119114,7 +128050,8 @@ "docstring": { "type": "str, default=None", "description": "Name of DET curve for labeling. If `None`, name will be set to\n`\"Classifier\"`." - } + }, + "refined_type": {} }, { "name": "ax", @@ -119124,13 +128061,14 @@ "docstring": { "type": "matplotlib axes, default=None", "description": "Axes object to plot on. If `None`, a new figure and axes is\ncreated." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Plot DET curve given the true and predicted labels.\n\nRead more in the :ref:`User Guide `. .. versionadded:: 1.0", - "docstring": "Plot DET curve given the true and\npredicted labels.\n\nRead more in the :ref:`User Guide `.\n\n.. versionadded:: 1.0\n\nParameters\n----------\ny_true : array-like of shape (n_samples,)\n True labels.\n\ny_pred : array-like of shape (n_samples,)\n Target scores, can either be probability estimates of the positive\n class, confidence values, or non-thresholded measure of decisions\n (as returned by `decision_function` on some classifiers).\n\nsample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\npos_label : str or int, default=None\n The label of the positive class. When `pos_label=None`, if `y_true`\n is in {-1, 1} or {0, 1}, `pos_label` is set to 1, otherwise an\n error will be raised.\n\nname : str, default=None\n Name of DET curve for labeling. If `None`, name will be set to\n `\"Classifier\"`.\n\nax : matplotlib axes, default=None\n Axes object to plot on. 
If `None`, a new figure and axes is\n created.\n\n**kwargs : dict\n Additional keywords arguments passed to matplotlib `plot` function.\n\nReturns\n-------\ndisplay : :class:`~sklearn.metrics.DetCurveDisplay`\n Object that stores computed values.\n\nSee Also\n--------\ndet_curve : Compute error rates for different probability thresholds.\nDetCurveDisplay.from_estimator : Plot DET curve given an estimator and\n some data.\nplot_roc_curve : Plot Receiver operating characteristic (ROC) curve.\n\nExamples\n--------\n>>> import matplotlib.pyplot as plt\n>>> from sklearn.datasets import make_classification\n>>> from sklearn.metrics import DetCurveDisplay\n>>> from sklearn.model_selection import train_test_split\n>>> from sklearn.svm import SVC\n>>> X, y = make_classification(n_samples=1000, random_state=0)\n>>> X_train, X_test, y_train, y_test = train_test_split(\n... X, y, test_size=0.4, random_state=0)\n>>> clf = SVC(random_state=0).fit(X_train, y_train)\n>>> y_pred = clf.decision_function(X_test)\n>>> DetCurveDisplay.from_predictions(\n... y_test, y_pred)\n<...>\n>>> plt.show()", + "description": "Plot DET curve given the true and\npredicted labels.\n\nRead more in the :ref:`User Guide `.\n\n.. versionadded:: 1.0", + "docstring": "Plot DET curve given the true and\n predicted labels.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 1.0\n\n Parameters\n ----------\n y_true : array-like of shape (n_samples,)\n True labels.\n\n y_pred : array-like of shape (n_samples,)\n Target scores, can either be probability estimates of the positive\n class, confidence values, or non-thresholded measure of decisions\n (as returned by `decision_function` on some classifiers).\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n pos_label : str or int, default=None\n The label of the positive class. When `pos_label=None`, if `y_true`\n is in {-1, 1} or {0, 1}, `pos_label` is set to 1, otherwise an\n error will be raised.\n\n name : str, default=None\n Name of DET curve for labeling. If `None`, name will be set to\n `\"Classifier\"`.\n\n ax : matplotlib axes, default=None\n Axes object to plot on. If `None`, a new figure and axes is\n created.\n\n **kwargs : dict\n Additional keywords arguments passed to matplotlib `plot` function.\n\n Returns\n -------\n display : :class:`~sklearn.metrics.DetCurveDisplay`\n Object that stores computed values.\n\n See Also\n --------\n det_curve : Compute error rates for different probability thresholds.\n DetCurveDisplay.from_estimator : Plot DET curve given an estimator and\n some data.\n plot_roc_curve : Plot Receiver operating characteristic (ROC) curve.\n\n Examples\n --------\n >>> import matplotlib.pyplot as plt\n >>> from sklearn.datasets import make_classification\n >>> from sklearn.metrics import DetCurveDisplay\n >>> from sklearn.model_selection import train_test_split\n >>> from sklearn.svm import SVC\n >>> X, y = make_classification(n_samples=1000, random_state=0)\n >>> X_train, X_test, y_train, y_test = train_test_split(\n ... X, y, test_size=0.4, random_state=0)\n >>> clf = SVC(random_state=0).fit(X_train, y_train)\n >>> y_pred = clf.decision_function(X_test)\n >>> DetCurveDisplay.from_predictions(\n ... y_test, y_pred)\n <...>\n >>> plt.show()\n ", "source_code": "\n@classmethod\ndef from_predictions(cls, y_true, y_pred, *, sample_weight=None, pos_label=None, name=None, ax=None, **kwargs):\n \"\"\"Plot DET curve given the true and\n predicted labels.\n\n Read more in the :ref:`User Guide `.\n\n .. 
versionadded:: 1.0\n\n Parameters\n ----------\n y_true : array-like of shape (n_samples,)\n True labels.\n\n y_pred : array-like of shape (n_samples,)\n Target scores, can either be probability estimates of the positive\n class, confidence values, or non-thresholded measure of decisions\n (as returned by `decision_function` on some classifiers).\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n pos_label : str or int, default=None\n The label of the positive class. When `pos_label=None`, if `y_true`\n is in {-1, 1} or {0, 1}, `pos_label` is set to 1, otherwise an\n error will be raised.\n\n name : str, default=None\n Name of DET curve for labeling. If `None`, name will be set to\n `\"Classifier\"`.\n\n ax : matplotlib axes, default=None\n Axes object to plot on. If `None`, a new figure and axes is\n created.\n\n **kwargs : dict\n Additional keywords arguments passed to matplotlib `plot` function.\n\n Returns\n -------\n display : :class:`~sklearn.metrics.DetCurveDisplay`\n Object that stores computed values.\n\n See Also\n --------\n det_curve : Compute error rates for different probability thresholds.\n DetCurveDisplay.from_estimator : Plot DET curve given an estimator and\n some data.\n plot_roc_curve : Plot Receiver operating characteristic (ROC) curve.\n\n Examples\n --------\n >>> import matplotlib.pyplot as plt\n >>> from sklearn.datasets import make_classification\n >>> from sklearn.metrics import DetCurveDisplay\n >>> from sklearn.model_selection import train_test_split\n >>> from sklearn.svm import SVC\n >>> X, y = make_classification(n_samples=1000, random_state=0)\n >>> X_train, X_test, y_train, y_test = train_test_split(\n ... X, y, test_size=0.4, random_state=0)\n >>> clf = SVC(random_state=0).fit(X_train, y_train)\n >>> y_pred = clf.decision_function(X_test)\n >>> DetCurveDisplay.from_predictions(\n ... y_test, y_pred)\n <...>\n >>> plt.show()\n \"\"\"\n check_matplotlib_support(f'{cls.__name__}.from_predictions')\n (fpr, fnr, _) = det_curve(y_true, y_pred, pos_label=pos_label, sample_weight=sample_weight)\n pos_label = _check_pos_label_consistency(pos_label, y_true)\n name = 'Classifier' if name is None else name\n viz = DetCurveDisplay(fpr=fpr, fnr=fnr, estimator_name=name, pos_label=pos_label)\n return viz.plot(ax=ax, name=name, **kwargs)" }, { @@ -119148,7 +128086,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "ax", @@ -119158,7 +128097,8 @@ "docstring": { "type": "matplotlib axes, default=None", "description": "Axes object to plot on. If `None`, a new figure and axes is\ncreated." - } + }, + "refined_type": {} }, { "name": "name", @@ -119168,13 +128108,14 @@ "docstring": { "type": "str, default=None", "description": "Name of DET curve for labeling. If `None`, use `estimator_name` if\nit is not `None`, otherwise no labeling is shown." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Plot visualization.", - "docstring": "Plot visualization.\n\nParameters\n----------\nax : matplotlib axes, default=None\n Axes object to plot on. If `None`, a new figure and axes is\n created.\n\nname : str, default=None\n Name of DET curve for labeling. 
If `None`, use `estimator_name` if\n it is not `None`, otherwise no labeling is shown.\n\n**kwargs : dict\n Additional keywords arguments passed to matplotlib `plot` function.\n\nReturns\n-------\ndisplay : :class:`~sklearn.metrics.plot.DetCurveDisplay`\n Object that stores computed values.", + "docstring": "Plot visualization.\n\n Parameters\n ----------\n ax : matplotlib axes, default=None\n Axes object to plot on. If `None`, a new figure and axes is\n created.\n\n name : str, default=None\n Name of DET curve for labeling. If `None`, use `estimator_name` if\n it is not `None`, otherwise no labeling is shown.\n\n **kwargs : dict\n Additional keywords arguments passed to matplotlib `plot` function.\n\n Returns\n -------\n display : :class:`~sklearn.metrics.plot.DetCurveDisplay`\n Object that stores computed values.\n ", "source_code": "\ndef plot(self, ax=None, *, name=None, **kwargs):\n \"\"\"Plot visualization.\n\n Parameters\n ----------\n ax : matplotlib axes, default=None\n Axes object to plot on. If `None`, a new figure and axes is\n created.\n\n name : str, default=None\n Name of DET curve for labeling. If `None`, use `estimator_name` if\n it is not `None`, otherwise no labeling is shown.\n\n **kwargs : dict\n Additional keywords arguments passed to matplotlib `plot` function.\n\n Returns\n -------\n display : :class:`~sklearn.metrics.plot.DetCurveDisplay`\n Object that stores computed values.\n \"\"\"\n check_matplotlib_support('DetCurveDisplay.plot')\n name = self.estimator_name if name is None else name\n line_kwargs = {} if name is None else {'label': name}\n line_kwargs.update(**kwargs)\n import matplotlib.pyplot as plt\n if ax is None:\n (_, ax) = plt.subplots()\n (self.line_, ) = ax.plot(sp.stats.norm.ppf(self.fpr), sp.stats.norm.ppf(self.fnr), **line_kwargs)\n info_pos_label = f' (Positive label: {self.pos_label})' if self.pos_label is not None else ''\n xlabel = 'False Positive Rate' + info_pos_label\n ylabel = 'False Negative Rate' + info_pos_label\n ax.set(xlabel=xlabel, ylabel=ylabel)\n if 'label' in line_kwargs:\n ax.legend(loc='lower right')\n ticks = [0.001, 0.01, 0.05, 0.2, 0.5, 0.8, 0.95, 0.99, 0.999]\n tick_locations = sp.stats.norm.ppf(ticks)\n tick_labels = ['{:.0%}'.format(s) if (100 * s).is_integer() else '{:.1%}'.format(s) for s in ticks]\n ax.set_xticks(tick_locations)\n ax.set_xticklabels(tick_labels)\n ax.set_xlim(-3, 3)\n ax.set_yticks(tick_locations)\n ax.set_yticklabels(tick_labels)\n ax.set_ylim(-3, 3)\n self.ax_ = ax\n self.figure_ = ax.figure\n return self" }, { @@ -119194,7 +128135,8 @@ "docstring": { "type": "estimator instance", "description": "Fitted classifier or a fitted :class:`~sklearn.pipeline.Pipeline`\nin which the last estimator is a classifier." - } + }, + "refined_type": {} }, { "name": "X", @@ -119204,6 +128146,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "Input values." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -119214,7 +128160,8 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "Target values." - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -119224,7 +128171,8 @@ "docstring": { "type": "array-like of shape (n_samples,), default=None", "description": "Sample weights." 
- } + }, + "refined_type": {} }, { "name": "response_method", @@ -119234,6 +128182,10 @@ "docstring": { "type": "{'predict_proba', 'decision_function', 'auto'} default='auto'", "description": "Specifies whether to use :term:`predict_proba` or\n:term:`decision_function` as the predicted target response. If set to\n'auto', :term:`predict_proba` is tried first and if it does not exist\n:term:`decision_function` is tried next." + }, + "refined_type": { + "kind": "EnumType", + "values": ["auto", "decision_function", "predict_proba"] } }, { @@ -119244,7 +128196,8 @@ "docstring": { "type": "str, default=None", "description": "Name of DET curve for labeling. If `None`, use the name of the\nestimator." - } + }, + "refined_type": {} }, { "name": "ax", @@ -119254,7 +128207,8 @@ "docstring": { "type": "matplotlib axes, default=None", "description": "Axes object to plot on. If `None`, a new figure and axes is created." - } + }, + "refined_type": {} }, { "name": "pos_label", @@ -119264,13 +128218,17 @@ "docstring": { "type": "str or int, default=None", "description": "The label of the positive class.\nWhen `pos_label=None`, if `y_true` is in {-1, 1} or {0, 1},\n`pos_label` is set to 1, otherwise an error will be raised." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } } ], "results": [], "is_public": true, - "description": "Plot detection error tradeoff (DET) curve.\n\nExtra keyword arguments will be passed to matplotlib's `plot`. Read more in the :ref:`User Guide `. .. versionadded:: 0.24 .. deprecated:: 1.0 `plot_det_curve` is deprecated in 1.0 and will be removed in 1.2. Use one of the following class methods: :func:`~sklearn.metrics.DetCurveDisplay.from_predictions` or :func:`~sklearn.metrics.DetCurveDisplay.from_estimator`.", - "docstring": "Plot detection error tradeoff (DET) curve.\n\nExtra keyword arguments will be passed to matplotlib's `plot`.\n\nRead more in the :ref:`User Guide `.\n\n.. versionadded:: 0.24\n\n.. deprecated:: 1.0\n `plot_det_curve` is deprecated in 1.0 and will be removed in\n 1.2. Use one of the following class methods:\n :func:`~sklearn.metrics.DetCurveDisplay.from_predictions` or\n :func:`~sklearn.metrics.DetCurveDisplay.from_estimator`.\n\nParameters\n----------\nestimator : estimator instance\n Fitted classifier or a fitted :class:`~sklearn.pipeline.Pipeline`\n in which the last estimator is a classifier.\n\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n Input values.\n\ny : array-like of shape (n_samples,)\n Target values.\n\nsample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\nresponse_method : {'predict_proba', 'decision_function', 'auto'} default='auto'\n Specifies whether to use :term:`predict_proba` or\n :term:`decision_function` as the predicted target response. If set to\n 'auto', :term:`predict_proba` is tried first and if it does not exist\n :term:`decision_function` is tried next.\n\nname : str, default=None\n Name of DET curve for labeling. If `None`, use the name of the\n estimator.\n\nax : matplotlib axes, default=None\n Axes object to plot on. 
If `None`, a new figure and axes is created.\n\npos_label : str or int, default=None\n The label of the positive class.\n When `pos_label=None`, if `y_true` is in {-1, 1} or {0, 1},\n `pos_label` is set to 1, otherwise an error will be raised.\n\n**kwargs : dict\n Additional keywords arguments passed to matplotlib `plot` function.\n\nReturns\n-------\ndisplay : :class:`~sklearn.metrics.DetCurveDisplay`\n Object that stores computed values.\n\nSee Also\n--------\ndet_curve : Compute error rates for different probability thresholds.\nDetCurveDisplay : DET curve visualization.\nDetCurveDisplay.from_estimator : Plot DET curve given an estimator and\n some data.\nDetCurveDisplay.from_predictions : Plot DET curve given the true and\n predicted labels.\nRocCurveDisplay.from_estimator : Plot Receiver Operating Characteristic\n (ROC) curve given an estimator and some data.\nRocCurveDisplay.from_predictions : Plot Receiver Operating Characteristic\n (ROC) curve given the true and predicted values.\n\nExamples\n--------\n>>> import matplotlib.pyplot as plt\n>>> from sklearn.datasets import make_classification\n>>> from sklearn.metrics import plot_det_curve\n>>> from sklearn.model_selection import train_test_split\n>>> from sklearn.svm import SVC\n>>> X, y = make_classification(n_samples=1000, random_state=0)\n>>> X_train, X_test, y_train, y_test = train_test_split(\n... X, y, test_size=0.4, random_state=0)\n>>> clf = SVC(random_state=0).fit(X_train, y_train)\n>>> plot_det_curve(clf, X_test, y_test) # doctest: +SKIP\n<...>\n>>> plt.show()", + "description": "Plot detection error tradeoff (DET) curve.\n\nExtra keyword arguments will be passed to matplotlib's `plot`.\n\nRead more in the :ref:`User Guide `.\n\n.. versionadded:: 0.24\n\n.. deprecated:: 1.0\n `plot_det_curve` is deprecated in 1.0 and will be removed in\n 1.2. Use one of the following class methods:\n :func:`~sklearn.metrics.DetCurveDisplay.from_predictions` or\n :func:`~sklearn.metrics.DetCurveDisplay.from_estimator`.", + "docstring": "Plot detection error tradeoff (DET) curve.\n\n Extra keyword arguments will be passed to matplotlib's `plot`.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.24\n\n .. deprecated:: 1.0\n `plot_det_curve` is deprecated in 1.0 and will be removed in\n 1.2. Use one of the following class methods:\n :func:`~sklearn.metrics.DetCurveDisplay.from_predictions` or\n :func:`~sklearn.metrics.DetCurveDisplay.from_estimator`.\n\n Parameters\n ----------\n estimator : estimator instance\n Fitted classifier or a fitted :class:`~sklearn.pipeline.Pipeline`\n in which the last estimator is a classifier.\n\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Input values.\n\n y : array-like of shape (n_samples,)\n Target values.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n response_method : {'predict_proba', 'decision_function', 'auto'} default='auto'\n Specifies whether to use :term:`predict_proba` or\n :term:`decision_function` as the predicted target response. If set to\n 'auto', :term:`predict_proba` is tried first and if it does not exist\n :term:`decision_function` is tried next.\n\n name : str, default=None\n Name of DET curve for labeling. If `None`, use the name of the\n estimator.\n\n ax : matplotlib axes, default=None\n Axes object to plot on. 
If `None`, a new figure and axes is created.\n\n pos_label : str or int, default=None\n The label of the positive class.\n When `pos_label=None`, if `y_true` is in {-1, 1} or {0, 1},\n `pos_label` is set to 1, otherwise an error will be raised.\n\n **kwargs : dict\n Additional keywords arguments passed to matplotlib `plot` function.\n\n Returns\n -------\n display : :class:`~sklearn.metrics.DetCurveDisplay`\n Object that stores computed values.\n\n See Also\n --------\n det_curve : Compute error rates for different probability thresholds.\n DetCurveDisplay : DET curve visualization.\n DetCurveDisplay.from_estimator : Plot DET curve given an estimator and\n some data.\n DetCurveDisplay.from_predictions : Plot DET curve given the true and\n predicted labels.\n RocCurveDisplay.from_estimator : Plot Receiver Operating Characteristic\n (ROC) curve given an estimator and some data.\n RocCurveDisplay.from_predictions : Plot Receiver Operating Characteristic\n (ROC) curve given the true and predicted values.\n\n Examples\n --------\n >>> import matplotlib.pyplot as plt\n >>> from sklearn.datasets import make_classification\n >>> from sklearn.metrics import plot_det_curve\n >>> from sklearn.model_selection import train_test_split\n >>> from sklearn.svm import SVC\n >>> X, y = make_classification(n_samples=1000, random_state=0)\n >>> X_train, X_test, y_train, y_test = train_test_split(\n ... X, y, test_size=0.4, random_state=0)\n >>> clf = SVC(random_state=0).fit(X_train, y_train)\n >>> plot_det_curve(clf, X_test, y_test) # doctest: +SKIP\n <...>\n >>> plt.show()\n ", "source_code": "\n@deprecated('Function plot_det_curve is deprecated in 1.0 and will be removed in 1.2. Use one of the class methods: DetCurveDisplay.from_predictions or DetCurveDisplay.from_estimator.')\ndef plot_det_curve(estimator, X, y, *, sample_weight=None, response_method='auto', name=None, ax=None, pos_label=None, **kwargs):\n \"\"\"Plot detection error tradeoff (DET) curve.\n\n Extra keyword arguments will be passed to matplotlib's `plot`.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.24\n\n .. deprecated:: 1.0\n `plot_det_curve` is deprecated in 1.0 and will be removed in\n 1.2. Use one of the following class methods:\n :func:`~sklearn.metrics.DetCurveDisplay.from_predictions` or\n :func:`~sklearn.metrics.DetCurveDisplay.from_estimator`.\n\n Parameters\n ----------\n estimator : estimator instance\n Fitted classifier or a fitted :class:`~sklearn.pipeline.Pipeline`\n in which the last estimator is a classifier.\n\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Input values.\n\n y : array-like of shape (n_samples,)\n Target values.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n response_method : {'predict_proba', 'decision_function', 'auto'} default='auto'\n Specifies whether to use :term:`predict_proba` or\n :term:`decision_function` as the predicted target response. If set to\n 'auto', :term:`predict_proba` is tried first and if it does not exist\n :term:`decision_function` is tried next.\n\n name : str, default=None\n Name of DET curve for labeling. If `None`, use the name of the\n estimator.\n\n ax : matplotlib axes, default=None\n Axes object to plot on. 
If `None`, a new figure and axes is created.\n\n pos_label : str or int, default=None\n The label of the positive class.\n When `pos_label=None`, if `y_true` is in {-1, 1} or {0, 1},\n `pos_label` is set to 1, otherwise an error will be raised.\n\n **kwargs : dict\n Additional keywords arguments passed to matplotlib `plot` function.\n\n Returns\n -------\n display : :class:`~sklearn.metrics.DetCurveDisplay`\n Object that stores computed values.\n\n See Also\n --------\n det_curve : Compute error rates for different probability thresholds.\n DetCurveDisplay : DET curve visualization.\n DetCurveDisplay.from_estimator : Plot DET curve given an estimator and\n some data.\n DetCurveDisplay.from_predictions : Plot DET curve given the true and\n predicted labels.\n RocCurveDisplay.from_estimator : Plot Receiver Operating Characteristic\n (ROC) curve given an estimator and some data.\n RocCurveDisplay.from_predictions : Plot Receiver Operating Characteristic\n (ROC) curve given the true and predicted values.\n\n Examples\n --------\n >>> import matplotlib.pyplot as plt\n >>> from sklearn.datasets import make_classification\n >>> from sklearn.metrics import plot_det_curve\n >>> from sklearn.model_selection import train_test_split\n >>> from sklearn.svm import SVC\n >>> X, y = make_classification(n_samples=1000, random_state=0)\n >>> X_train, X_test, y_train, y_test = train_test_split(\n ... X, y, test_size=0.4, random_state=0)\n >>> clf = SVC(random_state=0).fit(X_train, y_train)\n >>> plot_det_curve(clf, X_test, y_test) # doctest: +SKIP\n <...>\n >>> plt.show()\n \"\"\"\n check_matplotlib_support('plot_det_curve')\n (y_pred, pos_label) = _get_response(X, estimator, response_method, pos_label=pos_label)\n (fpr, fnr, _) = det_curve(y, y_pred, pos_label=pos_label, sample_weight=sample_weight)\n name = estimator.__class__.__name__ if name is None else name\n viz = DetCurveDisplay(fpr=fpr, fnr=fnr, estimator_name=name, pos_label=pos_label)\n return viz.plot(ax=ax, name=name, **kwargs)" }, { @@ -119288,7 +128246,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "precision", @@ -119298,7 +128257,8 @@ "docstring": { "type": "ndarray", "description": "Precision values." - } + }, + "refined_type": {} }, { "name": "recall", @@ -119308,7 +128268,8 @@ "docstring": { "type": "ndarray", "description": "Recall values." - } + }, + "refined_type": {} }, { "name": "average_precision", @@ -119318,7 +128279,8 @@ "docstring": { "type": "float, default=None", "description": "Average precision. If None, the average precision is not shown." - } + }, + "refined_type": {} }, { "name": "estimator_name", @@ -119328,7 +128290,8 @@ "docstring": { "type": "str, default=None", "description": "Name of estimator. If None, then the estimator name is not shown." - } + }, + "refined_type": {} }, { "name": "pos_label", @@ -119338,13 +128301,14 @@ "docstring": { "type": "str or int, default=None", "description": "The class considered as the positive class. If None, the class will not\nbe shown in the legend.\n\n.. 
versionadded:: 0.24" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, precision, recall, *, average_precision=None, estimator_name=None, pos_label=None):\n self.estimator_name = estimator_name\n self.precision = precision\n self.recall = recall\n self.average_precision = average_precision\n self.pos_label = pos_label" }, { @@ -119362,7 +128326,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "estimator", @@ -119372,7 +128337,8 @@ "docstring": { "type": "estimator instance", "description": "Fitted classifier or a fitted :class:`~sklearn.pipeline.Pipeline`\nin which the last estimator is a classifier." - } + }, + "refined_type": {} }, { "name": "X", @@ -119382,6 +128348,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "Input values." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -119392,7 +128362,8 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "Target values." - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -119402,7 +128373,8 @@ "docstring": { "type": "array-like of shape (n_samples,), default=None", "description": "Sample weights." - } + }, + "refined_type": {} }, { "name": "pos_label", @@ -119412,7 +128384,8 @@ "docstring": { "type": "str or int, default=None", "description": "The class considered as the positive class when computing the\nprecision and recall metrics. By default, `estimators.classes_[1]`\nis considered as the positive class." - } + }, + "refined_type": {} }, { "name": "response_method", @@ -119422,6 +128395,10 @@ "docstring": { "type": "{'predict_proba', 'decision_function', 'auto'}, default='auto'", "description": "Specifies whether to use :term:`predict_proba` or\n:term:`decision_function` as the target response. If set to 'auto',\n:term:`predict_proba` is tried first and if it does not exist\n:term:`decision_function` is tried next." + }, + "refined_type": { + "kind": "EnumType", + "values": ["auto", "decision_function", "predict_proba"] } }, { @@ -119432,7 +128409,8 @@ "docstring": { "type": "str, default=None", "description": "Name for labeling curve. If `None`, no name is used." - } + }, + "refined_type": {} }, { "name": "ax", @@ -119442,13 +128420,14 @@ "docstring": { "type": "matplotlib axes, default=None", "description": "Axes object to plot on. If `None`, a new figure and axes is created." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Plot precision-recall curve given an estimator and some data.", - "docstring": "Plot precision-recall curve given an estimator and some data.\n\nParameters\n----------\nestimator : estimator instance\n Fitted classifier or a fitted :class:`~sklearn.pipeline.Pipeline`\n in which the last estimator is a classifier.\n\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n Input values.\n\ny : array-like of shape (n_samples,)\n Target values.\n\nsample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\npos_label : str or int, default=None\n The class considered as the positive class when computing the\n precision and recall metrics. 
By default, `estimators.classes_[1]`\n is considered as the positive class.\n\nresponse_method : {'predict_proba', 'decision_function', 'auto'}, default='auto'\n Specifies whether to use :term:`predict_proba` or\n :term:`decision_function` as the target response. If set to 'auto',\n :term:`predict_proba` is tried first and if it does not exist\n :term:`decision_function` is tried next.\n\nname : str, default=None\n Name for labeling curve. If `None`, no name is used.\n\nax : matplotlib axes, default=None\n Axes object to plot on. If `None`, a new figure and axes is created.\n\n**kwargs : dict\n Keyword arguments to be passed to matplotlib's `plot`.\n\nReturns\n-------\ndisplay : :class:`~sklearn.metrics.PrecisionRecallDisplay`\n\nSee Also\n--------\nPrecisionRecallDisplay.from_predictions : Plot precision-recall curve\n using estimated probabilities or output of decision function.\n\nExamples\n--------\n>>> import matplotlib.pyplot as plt\n>>> from sklearn.datasets import make_classification\n>>> from sklearn.metrics import PrecisionRecallDisplay\n>>> from sklearn.model_selection import train_test_split\n>>> from sklearn.linear_model import LogisticRegression\n>>> X, y = make_classification(random_state=0)\n>>> X_train, X_test, y_train, y_test = train_test_split(\n... X, y, random_state=0)\n>>> clf = LogisticRegression()\n>>> clf.fit(X_train, y_train)\nLogisticRegression()\n>>> PrecisionRecallDisplay.from_estimator(\n... clf, X_test, y_test)\n<...>\n>>> plt.show()", + "docstring": "Plot precision-recall curve given an estimator and some data.\n\n Parameters\n ----------\n estimator : estimator instance\n Fitted classifier or a fitted :class:`~sklearn.pipeline.Pipeline`\n in which the last estimator is a classifier.\n\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Input values.\n\n y : array-like of shape (n_samples,)\n Target values.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n pos_label : str or int, default=None\n The class considered as the positive class when computing the\n precision and recall metrics. By default, `estimators.classes_[1]`\n is considered as the positive class.\n\n response_method : {'predict_proba', 'decision_function', 'auto'}, default='auto'\n Specifies whether to use :term:`predict_proba` or\n :term:`decision_function` as the target response. If set to 'auto',\n :term:`predict_proba` is tried first and if it does not exist\n :term:`decision_function` is tried next.\n\n name : str, default=None\n Name for labeling curve. If `None`, no name is used.\n\n ax : matplotlib axes, default=None\n Axes object to plot on. If `None`, a new figure and axes is created.\n\n **kwargs : dict\n Keyword arguments to be passed to matplotlib's `plot`.\n\n Returns\n -------\n display : :class:`~sklearn.metrics.PrecisionRecallDisplay`\n\n See Also\n --------\n PrecisionRecallDisplay.from_predictions : Plot precision-recall curve\n using estimated probabilities or output of decision function.\n\n Examples\n --------\n >>> import matplotlib.pyplot as plt\n >>> from sklearn.datasets import make_classification\n >>> from sklearn.metrics import PrecisionRecallDisplay\n >>> from sklearn.model_selection import train_test_split\n >>> from sklearn.linear_model import LogisticRegression\n >>> X, y = make_classification(random_state=0)\n >>> X_train, X_test, y_train, y_test = train_test_split(\n ... 
X, y, random_state=0)\n >>> clf = LogisticRegression()\n >>> clf.fit(X_train, y_train)\n LogisticRegression()\n >>> PrecisionRecallDisplay.from_estimator(\n ... clf, X_test, y_test)\n <...>\n >>> plt.show()\n ", "source_code": "\n@classmethod\ndef from_estimator(cls, estimator, X, y, *, sample_weight=None, pos_label=None, response_method='auto', name=None, ax=None, **kwargs):\n \"\"\"Plot precision-recall curve given an estimator and some data.\n\n Parameters\n ----------\n estimator : estimator instance\n Fitted classifier or a fitted :class:`~sklearn.pipeline.Pipeline`\n in which the last estimator is a classifier.\n\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Input values.\n\n y : array-like of shape (n_samples,)\n Target values.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n pos_label : str or int, default=None\n The class considered as the positive class when computing the\n precision and recall metrics. By default, `estimators.classes_[1]`\n is considered as the positive class.\n\n response_method : {'predict_proba', 'decision_function', 'auto'}, default='auto'\n Specifies whether to use :term:`predict_proba` or\n :term:`decision_function` as the target response. If set to 'auto',\n :term:`predict_proba` is tried first and if it does not exist\n :term:`decision_function` is tried next.\n\n name : str, default=None\n Name for labeling curve. If `None`, no name is used.\n\n ax : matplotlib axes, default=None\n Axes object to plot on. If `None`, a new figure and axes is created.\n\n **kwargs : dict\n Keyword arguments to be passed to matplotlib's `plot`.\n\n Returns\n -------\n display : :class:`~sklearn.metrics.PrecisionRecallDisplay`\n\n See Also\n --------\n PrecisionRecallDisplay.from_predictions : Plot precision-recall curve\n using estimated probabilities or output of decision function.\n\n Examples\n --------\n >>> import matplotlib.pyplot as plt\n >>> from sklearn.datasets import make_classification\n >>> from sklearn.metrics import PrecisionRecallDisplay\n >>> from sklearn.model_selection import train_test_split\n >>> from sklearn.linear_model import LogisticRegression\n >>> X, y = make_classification(random_state=0)\n >>> X_train, X_test, y_train, y_test = train_test_split(\n ... X, y, random_state=0)\n >>> clf = LogisticRegression()\n >>> clf.fit(X_train, y_train)\n LogisticRegression()\n >>> PrecisionRecallDisplay.from_estimator(\n ... clf, X_test, y_test)\n <...>\n >>> plt.show()\n \"\"\"\n method_name = f'{cls.__name__}.from_estimator'\n check_matplotlib_support(method_name)\n if not is_classifier(estimator):\n raise ValueError(f'{method_name} only supports classifiers')\n (y_pred, pos_label) = _get_response(X, estimator, response_method, pos_label=pos_label)\n name = name if name is not None else estimator.__class__.__name__\n return cls.from_predictions(y, y_pred, sample_weight=sample_weight, name=name, pos_label=pos_label, ax=ax, **kwargs)" }, { @@ -119466,7 +128445,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y_true", @@ -119476,7 +128456,8 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "True binary labels." - } + }, + "refined_type": {} }, { "name": "y_pred", @@ -119486,7 +128467,8 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "Estimated probabilities or output of decision function." 
- } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -119496,7 +128478,8 @@ "docstring": { "type": "array-like of shape (n_samples,), default=None", "description": "Sample weights." - } + }, + "refined_type": {} }, { "name": "pos_label", @@ -119506,7 +128489,8 @@ "docstring": { "type": "str or int, default=None", "description": "The class considered as the positive class when computing the\nprecision and recall metrics." - } + }, + "refined_type": {} }, { "name": "name", @@ -119516,7 +128500,8 @@ "docstring": { "type": "str, default=None", "description": "Name for labeling curve. If `None`, name will be set to\n`\"Classifier\"`." - } + }, + "refined_type": {} }, { "name": "ax", @@ -119526,13 +128511,14 @@ "docstring": { "type": "matplotlib axes, default=None", "description": "Axes object to plot on. If `None`, a new figure and axes is created." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Plot precision-recall curve given binary class predictions.", - "docstring": "Plot precision-recall curve given binary class predictions.\n\nParameters\n----------\ny_true : array-like of shape (n_samples,)\n True binary labels.\n\ny_pred : array-like of shape (n_samples,)\n Estimated probabilities or output of decision function.\n\nsample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\npos_label : str or int, default=None\n The class considered as the positive class when computing the\n precision and recall metrics.\n\nname : str, default=None\n Name for labeling curve. If `None`, name will be set to\n `\"Classifier\"`.\n\nax : matplotlib axes, default=None\n Axes object to plot on. If `None`, a new figure and axes is created.\n\n**kwargs : dict\n Keyword arguments to be passed to matplotlib's `plot`.\n\nReturns\n-------\ndisplay : :class:`~sklearn.metrics.PrecisionRecallDisplay`\n\nSee Also\n--------\nPrecisionRecallDisplay.from_estimator : Plot precision-recall curve\n using an estimator.\n\nExamples\n--------\n>>> import matplotlib.pyplot as plt\n>>> from sklearn.datasets import make_classification\n>>> from sklearn.metrics import PrecisionRecallDisplay\n>>> from sklearn.model_selection import train_test_split\n>>> from sklearn.linear_model import LogisticRegression\n>>> X, y = make_classification(random_state=0)\n>>> X_train, X_test, y_train, y_test = train_test_split(\n... X, y, random_state=0)\n>>> clf = LogisticRegression()\n>>> clf.fit(X_train, y_train)\nLogisticRegression()\n>>> y_pred = clf.predict_proba(X_test)[:, 1]\n>>> PrecisionRecallDisplay.from_predictions(\n... y_test, y_pred)\n<...>\n>>> plt.show()", + "docstring": "Plot precision-recall curve given binary class predictions.\n\n Parameters\n ----------\n y_true : array-like of shape (n_samples,)\n True binary labels.\n\n y_pred : array-like of shape (n_samples,)\n Estimated probabilities or output of decision function.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n pos_label : str or int, default=None\n The class considered as the positive class when computing the\n precision and recall metrics.\n\n name : str, default=None\n Name for labeling curve. If `None`, name will be set to\n `\"Classifier\"`.\n\n ax : matplotlib axes, default=None\n Axes object to plot on. 
If `None`, a new figure and axes is created.\n\n **kwargs : dict\n Keyword arguments to be passed to matplotlib's `plot`.\n\n Returns\n -------\n display : :class:`~sklearn.metrics.PrecisionRecallDisplay`\n\n See Also\n --------\n PrecisionRecallDisplay.from_estimator : Plot precision-recall curve\n using an estimator.\n\n Examples\n --------\n >>> import matplotlib.pyplot as plt\n >>> from sklearn.datasets import make_classification\n >>> from sklearn.metrics import PrecisionRecallDisplay\n >>> from sklearn.model_selection import train_test_split\n >>> from sklearn.linear_model import LogisticRegression\n >>> X, y = make_classification(random_state=0)\n >>> X_train, X_test, y_train, y_test = train_test_split(\n ... X, y, random_state=0)\n >>> clf = LogisticRegression()\n >>> clf.fit(X_train, y_train)\n LogisticRegression()\n >>> y_pred = clf.predict_proba(X_test)[:, 1]\n >>> PrecisionRecallDisplay.from_predictions(\n ... y_test, y_pred)\n <...>\n >>> plt.show()\n ", "source_code": "\n@classmethod\ndef from_predictions(cls, y_true, y_pred, *, sample_weight=None, pos_label=None, name=None, ax=None, **kwargs):\n \"\"\"Plot precision-recall curve given binary class predictions.\n\n Parameters\n ----------\n y_true : array-like of shape (n_samples,)\n True binary labels.\n\n y_pred : array-like of shape (n_samples,)\n Estimated probabilities or output of decision function.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n pos_label : str or int, default=None\n The class considered as the positive class when computing the\n precision and recall metrics.\n\n name : str, default=None\n Name for labeling curve. If `None`, name will be set to\n `\"Classifier\"`.\n\n ax : matplotlib axes, default=None\n Axes object to plot on. If `None`, a new figure and axes is created.\n\n **kwargs : dict\n Keyword arguments to be passed to matplotlib's `plot`.\n\n Returns\n -------\n display : :class:`~sklearn.metrics.PrecisionRecallDisplay`\n\n See Also\n --------\n PrecisionRecallDisplay.from_estimator : Plot precision-recall curve\n using an estimator.\n\n Examples\n --------\n >>> import matplotlib.pyplot as plt\n >>> from sklearn.datasets import make_classification\n >>> from sklearn.metrics import PrecisionRecallDisplay\n >>> from sklearn.model_selection import train_test_split\n >>> from sklearn.linear_model import LogisticRegression\n >>> X, y = make_classification(random_state=0)\n >>> X_train, X_test, y_train, y_test = train_test_split(\n ... X, y, random_state=0)\n >>> clf = LogisticRegression()\n >>> clf.fit(X_train, y_train)\n LogisticRegression()\n >>> y_pred = clf.predict_proba(X_test)[:, 1]\n >>> PrecisionRecallDisplay.from_predictions(\n ... 
y_test, y_pred)\n <...>\n >>> plt.show()\n \"\"\"\n check_matplotlib_support(f'{cls.__name__}.from_predictions')\n check_consistent_length(y_true, y_pred, sample_weight)\n pos_label = _check_pos_label_consistency(pos_label, y_true)\n (precision, recall, _) = precision_recall_curve(y_true, y_pred, pos_label=pos_label, sample_weight=sample_weight)\n average_precision = average_precision_score(y_true, y_pred, pos_label=pos_label, sample_weight=sample_weight)\n name = name if name is not None else 'Classifier'\n viz = PrecisionRecallDisplay(precision=precision, recall=recall, average_precision=average_precision, estimator_name=name, pos_label=pos_label)\n return viz.plot(ax=ax, name=name, **kwargs)" }, { @@ -119550,7 +128536,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "ax", @@ -119560,7 +128547,8 @@ "docstring": { "type": "Matplotlib Axes, default=None", "description": "Axes object to plot on. If `None`, a new figure and axes is\ncreated." - } + }, + "refined_type": {} }, { "name": "name", @@ -119570,13 +128558,14 @@ "docstring": { "type": "str, default=None", "description": "Name of precision recall curve for labeling. If `None`, use\n`estimator_name` if not `None`, otherwise no labeling is shown." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Plot visualization.\n\nExtra keyword arguments will be passed to matplotlib's `plot`.", - "docstring": "Plot visualization.\n\nExtra keyword arguments will be passed to matplotlib's `plot`.\n\nParameters\n----------\nax : Matplotlib Axes, default=None\n Axes object to plot on. If `None`, a new figure and axes is\n created.\n\nname : str, default=None\n Name of precision recall curve for labeling. If `None`, use\n `estimator_name` if not `None`, otherwise no labeling is shown.\n\n**kwargs : dict\n Keyword arguments to be passed to matplotlib's `plot`.\n\nReturns\n-------\ndisplay : :class:`~sklearn.metrics.PrecisionRecallDisplay`\n Object that stores computed values.", + "docstring": "Plot visualization.\n\n Extra keyword arguments will be passed to matplotlib's `plot`.\n\n Parameters\n ----------\n ax : Matplotlib Axes, default=None\n Axes object to plot on. If `None`, a new figure and axes is\n created.\n\n name : str, default=None\n Name of precision recall curve for labeling. If `None`, use\n `estimator_name` if not `None`, otherwise no labeling is shown.\n\n **kwargs : dict\n Keyword arguments to be passed to matplotlib's `plot`.\n\n Returns\n -------\n display : :class:`~sklearn.metrics.PrecisionRecallDisplay`\n Object that stores computed values.\n ", "source_code": "\ndef plot(self, ax=None, *, name=None, **kwargs):\n \"\"\"Plot visualization.\n\n Extra keyword arguments will be passed to matplotlib's `plot`.\n\n Parameters\n ----------\n ax : Matplotlib Axes, default=None\n Axes object to plot on. If `None`, a new figure and axes is\n created.\n\n name : str, default=None\n Name of precision recall curve for labeling. 
If `None`, use\n `estimator_name` if not `None`, otherwise no labeling is shown.\n\n **kwargs : dict\n Keyword arguments to be passed to matplotlib's `plot`.\n\n Returns\n -------\n display : :class:`~sklearn.metrics.PrecisionRecallDisplay`\n Object that stores computed values.\n \"\"\"\n check_matplotlib_support('PrecisionRecallDisplay.plot')\n name = self.estimator_name if name is None else name\n line_kwargs = {'drawstyle': 'steps-post'}\n if self.average_precision is not None and name is not None:\n line_kwargs['label'] = f'{name} (AP = {self.average_precision:0.2f})'\n elif self.average_precision is not None:\n line_kwargs['label'] = f'AP = {self.average_precision:0.2f}'\n elif name is not None:\n line_kwargs['label'] = name\n line_kwargs.update(**kwargs)\n import matplotlib.pyplot as plt\n if ax is None:\n (fig, ax) = plt.subplots()\n (self.line_, ) = ax.plot(self.recall, self.precision, **line_kwargs)\n info_pos_label = f' (Positive label: {self.pos_label})' if self.pos_label is not None else ''\n xlabel = 'Recall' + info_pos_label\n ylabel = 'Precision' + info_pos_label\n ax.set(xlabel=xlabel, ylabel=ylabel)\n if 'label' in line_kwargs:\n ax.legend(loc='lower left')\n self.ax_ = ax\n self.figure_ = ax.figure\n return self" }, { @@ -119596,7 +128585,8 @@ "docstring": { "type": "estimator instance", "description": "Fitted classifier or a fitted :class:`~sklearn.pipeline.Pipeline`\nin which the last estimator is a classifier." - } + }, + "refined_type": {} }, { "name": "X", @@ -119606,6 +128596,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "Input values." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -119616,7 +128610,8 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "Binary target values." - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -119626,7 +128621,8 @@ "docstring": { "type": "array-like of shape (n_samples,), default=None", "description": "Sample weights." - } + }, + "refined_type": {} }, { "name": "response_method", @@ -119636,6 +128632,10 @@ "docstring": { "type": "{'predict_proba', 'decision_function', 'auto'}, default='auto'", "description": "Specifies whether to use :term:`predict_proba` or\n:term:`decision_function` as the target response. If set to 'auto',\n:term:`predict_proba` is tried first and if it does not exist\n:term:`decision_function` is tried next." + }, + "refined_type": { + "kind": "EnumType", + "values": ["auto", "decision_function", "predict_proba"] } }, { @@ -119646,7 +128646,8 @@ "docstring": { "type": "str, default=None", "description": "Name for labeling curve. If `None`, the name of the\nestimator is used." - } + }, + "refined_type": {} }, { "name": "ax", @@ -119656,7 +128657,8 @@ "docstring": { "type": "matplotlib axes, default=None", "description": "Axes object to plot on. If `None`, a new figure and axes is created." - } + }, + "refined_type": {} }, { "name": "pos_label", @@ -119666,13 +128668,14 @@ "docstring": { "type": "str or int, default=None", "description": "The class considered as the positive class when computing the precision\nand recall metrics. By default, `estimators.classes_[1]` is considered\nas the positive class.\n\n.. versionadded:: 0.24" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Plot Precision Recall Curve for binary classifiers.\n\nExtra keyword arguments will be passed to matplotlib's `plot`. Read more in the :ref:`User Guide `. .. 
deprecated:: 1.0 `plot_precision_recall_curve` is deprecated in 1.0 and will be removed in 1.2. Use one of the following class methods: :func:`~sklearn.metrics.PrecisionRecallDisplay.from_predictions` or :func:`~sklearn.metrics.PrecisionRecallDisplay.from_estimator`.", - "docstring": "Plot Precision Recall Curve for binary classifiers.\n\nExtra keyword arguments will be passed to matplotlib's `plot`.\n\nRead more in the :ref:`User Guide `.\n\n.. deprecated:: 1.0\n `plot_precision_recall_curve` is deprecated in 1.0 and will be removed in\n 1.2. Use one of the following class methods:\n :func:`~sklearn.metrics.PrecisionRecallDisplay.from_predictions` or\n :func:`~sklearn.metrics.PrecisionRecallDisplay.from_estimator`.\n\nParameters\n----------\nestimator : estimator instance\n Fitted classifier or a fitted :class:`~sklearn.pipeline.Pipeline`\n in which the last estimator is a classifier.\n\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n Input values.\n\ny : array-like of shape (n_samples,)\n Binary target values.\n\nsample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\nresponse_method : {'predict_proba', 'decision_function', 'auto'}, default='auto'\n Specifies whether to use :term:`predict_proba` or\n :term:`decision_function` as the target response. If set to 'auto',\n :term:`predict_proba` is tried first and if it does not exist\n :term:`decision_function` is tried next.\n\nname : str, default=None\n Name for labeling curve. If `None`, the name of the\n estimator is used.\n\nax : matplotlib axes, default=None\n Axes object to plot on. If `None`, a new figure and axes is created.\n\npos_label : str or int, default=None\n The class considered as the positive class when computing the precision\n and recall metrics. By default, `estimators.classes_[1]` is considered\n as the positive class.\n\n .. versionadded:: 0.24\n\n**kwargs : dict\n Keyword arguments to be passed to matplotlib's `plot`.\n\nReturns\n-------\ndisplay : :class:`~sklearn.metrics.PrecisionRecallDisplay`\n Object that stores computed values.\n\nSee Also\n--------\nprecision_recall_curve : Compute precision-recall pairs for different\n probability thresholds.\nPrecisionRecallDisplay : Precision Recall visualization.", + "description": "Plot Precision Recall Curve for binary classifiers.\n\nExtra keyword arguments will be passed to matplotlib's `plot`.\n\nRead more in the :ref:`User Guide `.\n\n.. deprecated:: 1.0\n `plot_precision_recall_curve` is deprecated in 1.0 and will be removed in\n 1.2. Use one of the following class methods:\n :func:`~sklearn.metrics.PrecisionRecallDisplay.from_predictions` or\n :func:`~sklearn.metrics.PrecisionRecallDisplay.from_estimator`.", + "docstring": "Plot Precision Recall Curve for binary classifiers.\n\n Extra keyword arguments will be passed to matplotlib's `plot`.\n\n Read more in the :ref:`User Guide `.\n\n .. deprecated:: 1.0\n `plot_precision_recall_curve` is deprecated in 1.0 and will be removed in\n 1.2. 
Use one of the following class methods:\n :func:`~sklearn.metrics.PrecisionRecallDisplay.from_predictions` or\n :func:`~sklearn.metrics.PrecisionRecallDisplay.from_estimator`.\n\n Parameters\n ----------\n estimator : estimator instance\n Fitted classifier or a fitted :class:`~sklearn.pipeline.Pipeline`\n in which the last estimator is a classifier.\n\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Input values.\n\n y : array-like of shape (n_samples,)\n Binary target values.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n response_method : {'predict_proba', 'decision_function', 'auto'}, default='auto'\n Specifies whether to use :term:`predict_proba` or\n :term:`decision_function` as the target response. If set to 'auto',\n :term:`predict_proba` is tried first and if it does not exist\n :term:`decision_function` is tried next.\n\n name : str, default=None\n Name for labeling curve. If `None`, the name of the\n estimator is used.\n\n ax : matplotlib axes, default=None\n Axes object to plot on. If `None`, a new figure and axes is created.\n\n pos_label : str or int, default=None\n The class considered as the positive class when computing the precision\n and recall metrics. By default, `estimators.classes_[1]` is considered\n as the positive class.\n\n .. versionadded:: 0.24\n\n **kwargs : dict\n Keyword arguments to be passed to matplotlib's `plot`.\n\n Returns\n -------\n display : :class:`~sklearn.metrics.PrecisionRecallDisplay`\n Object that stores computed values.\n\n See Also\n --------\n precision_recall_curve : Compute precision-recall pairs for different\n probability thresholds.\n PrecisionRecallDisplay : Precision Recall visualization.\n ", "source_code": "\n@deprecated('Function `plot_precision_recall_curve` is deprecated in 1.0 and will be removed in 1.2. Use one of the class methods: PrecisionRecallDisplay.from_predictions or PrecisionRecallDisplay.from_estimator.')\ndef plot_precision_recall_curve(estimator, X, y, *, sample_weight=None, response_method='auto', name=None, ax=None, pos_label=None, **kwargs):\n \"\"\"Plot Precision Recall Curve for binary classifiers.\n\n Extra keyword arguments will be passed to matplotlib's `plot`.\n\n Read more in the :ref:`User Guide `.\n\n .. deprecated:: 1.0\n `plot_precision_recall_curve` is deprecated in 1.0 and will be removed in\n 1.2. Use one of the following class methods:\n :func:`~sklearn.metrics.PrecisionRecallDisplay.from_predictions` or\n :func:`~sklearn.metrics.PrecisionRecallDisplay.from_estimator`.\n\n Parameters\n ----------\n estimator : estimator instance\n Fitted classifier or a fitted :class:`~sklearn.pipeline.Pipeline`\n in which the last estimator is a classifier.\n\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Input values.\n\n y : array-like of shape (n_samples,)\n Binary target values.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n response_method : {'predict_proba', 'decision_function', 'auto'}, default='auto'\n Specifies whether to use :term:`predict_proba` or\n :term:`decision_function` as the target response. If set to 'auto',\n :term:`predict_proba` is tried first and if it does not exist\n :term:`decision_function` is tried next.\n\n name : str, default=None\n Name for labeling curve. If `None`, the name of the\n estimator is used.\n\n ax : matplotlib axes, default=None\n Axes object to plot on. 
If `None`, a new figure and axes is created.\n\n pos_label : str or int, default=None\n The class considered as the positive class when computing the precision\n and recall metrics. By default, `estimators.classes_[1]` is considered\n as the positive class.\n\n .. versionadded:: 0.24\n\n **kwargs : dict\n Keyword arguments to be passed to matplotlib's `plot`.\n\n Returns\n -------\n display : :class:`~sklearn.metrics.PrecisionRecallDisplay`\n Object that stores computed values.\n\n See Also\n --------\n precision_recall_curve : Compute precision-recall pairs for different\n probability thresholds.\n PrecisionRecallDisplay : Precision Recall visualization.\n \"\"\"\n check_matplotlib_support('plot_precision_recall_curve')\n (y_pred, pos_label) = _get_response(X, estimator, response_method, pos_label=pos_label)\n (precision, recall, _) = precision_recall_curve(y, y_pred, pos_label=pos_label, sample_weight=sample_weight)\n average_precision = average_precision_score(y, y_pred, pos_label=pos_label, sample_weight=sample_weight)\n name = name if name is not None else estimator.__class__.__name__\n viz = PrecisionRecallDisplay(precision=precision, recall=recall, average_precision=average_precision, estimator_name=name, pos_label=pos_label)\n return viz.plot(ax=ax, name=name, **kwargs)" }, { @@ -119690,7 +128693,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "fpr", @@ -119700,7 +128704,8 @@ "docstring": { "type": "ndarray", "description": "False positive rate." - } + }, + "refined_type": {} }, { "name": "tpr", @@ -119710,7 +128715,8 @@ "docstring": { "type": "ndarray", "description": "True positive rate." - } + }, + "refined_type": {} }, { "name": "roc_auc", @@ -119720,7 +128726,8 @@ "docstring": { "type": "float, default=None", "description": "Area under ROC curve. If None, the roc_auc score is not shown." - } + }, + "refined_type": {} }, { "name": "estimator_name", @@ -119730,7 +128737,8 @@ "docstring": { "type": "str, default=None", "description": "Name of estimator. If None, the estimator name is not shown." - } + }, + "refined_type": {} }, { "name": "pos_label", @@ -119740,13 +128748,14 @@ "docstring": { "type": "str or int, default=None", "description": "The class considered as the positive class when computing the roc auc\nmetrics. By default, `estimators.classes_[1]` is considered\nas the positive class.\n\n.. versionadded:: 0.24" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, *, fpr, tpr, roc_auc=None, estimator_name=None, pos_label=None):\n self.estimator_name = estimator_name\n self.fpr = fpr\n self.tpr = tpr\n self.roc_auc = roc_auc\n self.pos_label = pos_label" }, { @@ -119764,7 +128773,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "estimator", @@ -119774,7 +128784,8 @@ "docstring": { "type": "estimator instance", "description": "Fitted classifier or a fitted :class:`~sklearn.pipeline.Pipeline`\nin which the last estimator is a classifier." - } + }, + "refined_type": {} }, { "name": "X", @@ -119784,6 +128795,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "Input values." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -119794,7 +128809,8 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "Target values." 
- } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -119804,7 +128820,8 @@ "docstring": { "type": "array-like of shape (n_samples,), default=None", "description": "Sample weights." - } + }, + "refined_type": {} }, { "name": "drop_intermediate", @@ -119814,7 +128831,8 @@ "docstring": { "type": "bool, default=True", "description": "Whether to drop some suboptimal thresholds which would not appear\non a plotted ROC curve. This is useful in order to create lighter\nROC curves." - } + }, + "refined_type": {} }, { "name": "response_method", @@ -119824,6 +128842,10 @@ "docstring": { "type": "{'predict_proba', 'decision_function', 'auto'} default='auto'", "description": "Specifies whether to use :term:`predict_proba` or\n:term:`decision_function` as the target response. If set to 'auto',\n:term:`predict_proba` is tried first and if it does not exist\n:term:`decision_function` is tried next." + }, + "refined_type": { + "kind": "EnumType", + "values": ["auto", "decision_function", "predict_proba"] } }, { @@ -119834,7 +128856,8 @@ "docstring": { "type": "str or int, default=None", "description": "The class considered as the positive class when computing the roc auc\nmetrics. By default, `estimators.classes_[1]` is considered\nas the positive class." - } + }, + "refined_type": {} }, { "name": "name", @@ -119844,7 +128867,8 @@ "docstring": { "type": "str, default=None", "description": "Name of ROC Curve for labeling. If `None`, use the name of the\nestimator." - } + }, + "refined_type": {} }, { "name": "ax", @@ -119854,13 +128878,14 @@ "docstring": { "type": "matplotlib axes, default=None", "description": "Axes object to plot on. If `None`, a new figure and axes is created." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Create a ROC Curve display from an estimator.", - "docstring": "Create a ROC Curve display from an estimator.\n\nParameters\n----------\nestimator : estimator instance\n Fitted classifier or a fitted :class:`~sklearn.pipeline.Pipeline`\n in which the last estimator is a classifier.\n\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n Input values.\n\ny : array-like of shape (n_samples,)\n Target values.\n\nsample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\ndrop_intermediate : bool, default=True\n Whether to drop some suboptimal thresholds which would not appear\n on a plotted ROC curve. This is useful in order to create lighter\n ROC curves.\n\nresponse_method : {'predict_proba', 'decision_function', 'auto'} default='auto'\n Specifies whether to use :term:`predict_proba` or\n :term:`decision_function` as the target response. If set to 'auto',\n :term:`predict_proba` is tried first and if it does not exist\n :term:`decision_function` is tried next.\n\npos_label : str or int, default=None\n The class considered as the positive class when computing the roc auc\n metrics. By default, `estimators.classes_[1]` is considered\n as the positive class.\n\nname : str, default=None\n Name of ROC Curve for labeling. If `None`, use the name of the\n estimator.\n\nax : matplotlib axes, default=None\n Axes object to plot on. 
If `None`, a new figure and axes is created.\n\n**kwargs : dict\n Keyword arguments to be passed to matplotlib's `plot`.\n\nReturns\n-------\ndisplay : :class:`~sklearn.metrics.plot.RocCurveDisplay`\n The ROC Curve display.\n\nSee Also\n--------\nroc_curve : Compute Receiver operating characteristic (ROC) curve.\nRocCurveDisplay.from_predictions : ROC Curve visualization given the\n probabilities of scores of a classifier.\nroc_auc_score : Compute the area under the ROC curve.\n\nExamples\n--------\n>>> import matplotlib.pyplot as plt\n>>> from sklearn.datasets import make_classification\n>>> from sklearn.metrics import RocCurveDisplay\n>>> from sklearn.model_selection import train_test_split\n>>> from sklearn.svm import SVC\n>>> X, y = make_classification(random_state=0)\n>>> X_train, X_test, y_train, y_test = train_test_split(\n... X, y, random_state=0)\n>>> clf = SVC(random_state=0).fit(X_train, y_train)\n>>> RocCurveDisplay.from_estimator(\n... clf, X_test, y_test)\n<...>\n>>> plt.show()", + "docstring": "Create a ROC Curve display from an estimator.\n\n Parameters\n ----------\n estimator : estimator instance\n Fitted classifier or a fitted :class:`~sklearn.pipeline.Pipeline`\n in which the last estimator is a classifier.\n\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Input values.\n\n y : array-like of shape (n_samples,)\n Target values.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n drop_intermediate : bool, default=True\n Whether to drop some suboptimal thresholds which would not appear\n on a plotted ROC curve. This is useful in order to create lighter\n ROC curves.\n\n response_method : {'predict_proba', 'decision_function', 'auto'} default='auto'\n Specifies whether to use :term:`predict_proba` or\n :term:`decision_function` as the target response. If set to 'auto',\n :term:`predict_proba` is tried first and if it does not exist\n :term:`decision_function` is tried next.\n\n pos_label : str or int, default=None\n The class considered as the positive class when computing the roc auc\n metrics. By default, `estimators.classes_[1]` is considered\n as the positive class.\n\n name : str, default=None\n Name of ROC Curve for labeling. If `None`, use the name of the\n estimator.\n\n ax : matplotlib axes, default=None\n Axes object to plot on. If `None`, a new figure and axes is created.\n\n **kwargs : dict\n Keyword arguments to be passed to matplotlib's `plot`.\n\n Returns\n -------\n display : :class:`~sklearn.metrics.plot.RocCurveDisplay`\n The ROC Curve display.\n\n See Also\n --------\n roc_curve : Compute Receiver operating characteristic (ROC) curve.\n RocCurveDisplay.from_predictions : ROC Curve visualization given the\n probabilities of scores of a classifier.\n roc_auc_score : Compute the area under the ROC curve.\n\n Examples\n --------\n >>> import matplotlib.pyplot as plt\n >>> from sklearn.datasets import make_classification\n >>> from sklearn.metrics import RocCurveDisplay\n >>> from sklearn.model_selection import train_test_split\n >>> from sklearn.svm import SVC\n >>> X, y = make_classification(random_state=0)\n >>> X_train, X_test, y_train, y_test = train_test_split(\n ... X, y, random_state=0)\n >>> clf = SVC(random_state=0).fit(X_train, y_train)\n >>> RocCurveDisplay.from_estimator(\n ... 
clf, X_test, y_test)\n <...>\n >>> plt.show()\n ", "source_code": "\n@classmethod\ndef from_estimator(cls, estimator, X, y, *, sample_weight=None, drop_intermediate=True, response_method='auto', pos_label=None, name=None, ax=None, **kwargs):\n \"\"\"Create a ROC Curve display from an estimator.\n\n Parameters\n ----------\n estimator : estimator instance\n Fitted classifier or a fitted :class:`~sklearn.pipeline.Pipeline`\n in which the last estimator is a classifier.\n\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Input values.\n\n y : array-like of shape (n_samples,)\n Target values.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n drop_intermediate : bool, default=True\n Whether to drop some suboptimal thresholds which would not appear\n on a plotted ROC curve. This is useful in order to create lighter\n ROC curves.\n\n response_method : {'predict_proba', 'decision_function', 'auto'} default='auto'\n Specifies whether to use :term:`predict_proba` or\n :term:`decision_function` as the target response. If set to 'auto',\n :term:`predict_proba` is tried first and if it does not exist\n :term:`decision_function` is tried next.\n\n pos_label : str or int, default=None\n The class considered as the positive class when computing the roc auc\n metrics. By default, `estimators.classes_[1]` is considered\n as the positive class.\n\n name : str, default=None\n Name of ROC Curve for labeling. If `None`, use the name of the\n estimator.\n\n ax : matplotlib axes, default=None\n Axes object to plot on. If `None`, a new figure and axes is created.\n\n **kwargs : dict\n Keyword arguments to be passed to matplotlib's `plot`.\n\n Returns\n -------\n display : :class:`~sklearn.metrics.plot.RocCurveDisplay`\n The ROC Curve display.\n\n See Also\n --------\n roc_curve : Compute Receiver operating characteristic (ROC) curve.\n RocCurveDisplay.from_predictions : ROC Curve visualization given the\n probabilities of scores of a classifier.\n roc_auc_score : Compute the area under the ROC curve.\n\n Examples\n --------\n >>> import matplotlib.pyplot as plt\n >>> from sklearn.datasets import make_classification\n >>> from sklearn.metrics import RocCurveDisplay\n >>> from sklearn.model_selection import train_test_split\n >>> from sklearn.svm import SVC\n >>> X, y = make_classification(random_state=0)\n >>> X_train, X_test, y_train, y_test = train_test_split(\n ... X, y, random_state=0)\n >>> clf = SVC(random_state=0).fit(X_train, y_train)\n >>> RocCurveDisplay.from_estimator(\n ... clf, X_test, y_test)\n <...>\n >>> plt.show()\n \"\"\"\n check_matplotlib_support(f'{cls.__name__}.from_estimator')\n name = estimator.__class__.__name__ if name is None else name\n (y_pred, pos_label) = _get_response(X, estimator, response_method=response_method, pos_label=pos_label)\n return cls.from_predictions(y_true=y, y_pred=y_pred, sample_weight=sample_weight, drop_intermediate=drop_intermediate, name=name, ax=ax, pos_label=pos_label, **kwargs)" }, { @@ -119878,7 +128903,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y_true", @@ -119888,7 +128914,8 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "True labels." 
- } + }, + "refined_type": {} }, { "name": "y_pred", @@ -119898,7 +128925,8 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "Target scores, can either be probability estimates of the positive\nclass, confidence values, or non-thresholded measure of decisions\n(as returned by \u201cdecision_function\u201d on some classifiers)." - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -119908,7 +128936,8 @@ "docstring": { "type": "array-like of shape (n_samples,), default=None", "description": "Sample weights." - } + }, + "refined_type": {} }, { "name": "drop_intermediate", @@ -119918,7 +128947,8 @@ "docstring": { "type": "bool, default=True", "description": "Whether to drop some suboptimal thresholds which would not appear\non a plotted ROC curve. This is useful in order to create lighter\nROC curves." - } + }, + "refined_type": {} }, { "name": "pos_label", @@ -119928,6 +128958,10 @@ "docstring": { "type": "str or int, default=None", "description": "The label of the positive class. When `pos_label=None`, if `y_true`\nis in {-1, 1} or {0, 1}, `pos_label` is set to 1, otherwise an\nerror will be raised." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -119938,7 +128972,8 @@ "docstring": { "type": "str, default=None", "description": "Name of ROC curve for labeling. If `None`, name will be set to\n`\"Classifier\"`." - } + }, + "refined_type": {} }, { "name": "ax", @@ -119948,13 +128983,14 @@ "docstring": { "type": "matplotlib axes, default=None", "description": "Axes object to plot on. If `None`, a new figure and axes is\ncreated." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Plot ROC curve given the true and predicted values.\n\nRead more in the :ref:`User Guide `. .. versionadded:: 1.0", - "docstring": "Plot ROC curve given the true and predicted values.\n\nRead more in the :ref:`User Guide `.\n\n.. versionadded:: 1.0\n\nParameters\n----------\ny_true : array-like of shape (n_samples,)\n True labels.\n\ny_pred : array-like of shape (n_samples,)\n Target scores, can either be probability estimates of the positive\n class, confidence values, or non-thresholded measure of decisions\n (as returned by \u201cdecision_function\u201d on some classifiers).\n\nsample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\ndrop_intermediate : bool, default=True\n Whether to drop some suboptimal thresholds which would not appear\n on a plotted ROC curve. This is useful in order to create lighter\n ROC curves.\n\npos_label : str or int, default=None\n The label of the positive class. When `pos_label=None`, if `y_true`\n is in {-1, 1} or {0, 1}, `pos_label` is set to 1, otherwise an\n error will be raised.\n\nname : str, default=None\n Name of ROC curve for labeling. If `None`, name will be set to\n `\"Classifier\"`.\n\nax : matplotlib axes, default=None\n Axes object to plot on. 
If `None`, a new figure and axes is\n created.\n\n**kwargs : dict\n Additional keywords arguments passed to matplotlib `plot` function.\n\nReturns\n-------\ndisplay : :class:`~sklearn.metrics.DetCurveDisplay`\n Object that stores computed values.\n\nSee Also\n--------\nroc_curve : Compute Receiver operating characteristic (ROC) curve.\nRocCurveDisplay.from_estimator : ROC Curve visualization given an\n estimator and some data.\nroc_auc_score : Compute the area under the ROC curve.\n\nExamples\n--------\n>>> import matplotlib.pyplot as plt\n>>> from sklearn.datasets import make_classification\n>>> from sklearn.metrics import RocCurveDisplay\n>>> from sklearn.model_selection import train_test_split\n>>> from sklearn.svm import SVC\n>>> X, y = make_classification(random_state=0)\n>>> X_train, X_test, y_train, y_test = train_test_split(\n... X, y, random_state=0)\n>>> clf = SVC(random_state=0).fit(X_train, y_train)\n>>> y_pred = clf.decision_function(X_test)\n>>> RocCurveDisplay.from_predictions(\n... y_test, y_pred)\n<...>\n>>> plt.show()", + "description": "Plot ROC curve given the true and predicted values.\n\nRead more in the :ref:`User Guide `.\n\n.. versionadded:: 1.0", + "docstring": "Plot ROC curve given the true and predicted values.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 1.0\n\n Parameters\n ----------\n y_true : array-like of shape (n_samples,)\n True labels.\n\n y_pred : array-like of shape (n_samples,)\n Target scores, can either be probability estimates of the positive\n class, confidence values, or non-thresholded measure of decisions\n (as returned by \u201cdecision_function\u201d on some classifiers).\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n drop_intermediate : bool, default=True\n Whether to drop some suboptimal thresholds which would not appear\n on a plotted ROC curve. This is useful in order to create lighter\n ROC curves.\n\n pos_label : str or int, default=None\n The label of the positive class. When `pos_label=None`, if `y_true`\n is in {-1, 1} or {0, 1}, `pos_label` is set to 1, otherwise an\n error will be raised.\n\n name : str, default=None\n Name of ROC curve for labeling. If `None`, name will be set to\n `\"Classifier\"`.\n\n ax : matplotlib axes, default=None\n Axes object to plot on. If `None`, a new figure and axes is\n created.\n\n **kwargs : dict\n Additional keywords arguments passed to matplotlib `plot` function.\n\n Returns\n -------\n display : :class:`~sklearn.metrics.DetCurveDisplay`\n Object that stores computed values.\n\n See Also\n --------\n roc_curve : Compute Receiver operating characteristic (ROC) curve.\n RocCurveDisplay.from_estimator : ROC Curve visualization given an\n estimator and some data.\n roc_auc_score : Compute the area under the ROC curve.\n\n Examples\n --------\n >>> import matplotlib.pyplot as plt\n >>> from sklearn.datasets import make_classification\n >>> from sklearn.metrics import RocCurveDisplay\n >>> from sklearn.model_selection import train_test_split\n >>> from sklearn.svm import SVC\n >>> X, y = make_classification(random_state=0)\n >>> X_train, X_test, y_train, y_test = train_test_split(\n ... X, y, random_state=0)\n >>> clf = SVC(random_state=0).fit(X_train, y_train)\n >>> y_pred = clf.decision_function(X_test)\n >>> RocCurveDisplay.from_predictions(\n ... 
y_test, y_pred)\n <...>\n >>> plt.show()\n ", "source_code": "\n@classmethod\ndef from_predictions(cls, y_true, y_pred, *, sample_weight=None, drop_intermediate=True, pos_label=None, name=None, ax=None, **kwargs):\n \"\"\"Plot ROC curve given the true and predicted values.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 1.0\n\n Parameters\n ----------\n y_true : array-like of shape (n_samples,)\n True labels.\n\n y_pred : array-like of shape (n_samples,)\n Target scores, can either be probability estimates of the positive\n class, confidence values, or non-thresholded measure of decisions\n (as returned by \u201cdecision_function\u201d on some classifiers).\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n drop_intermediate : bool, default=True\n Whether to drop some suboptimal thresholds which would not appear\n on a plotted ROC curve. This is useful in order to create lighter\n ROC curves.\n\n pos_label : str or int, default=None\n The label of the positive class. When `pos_label=None`, if `y_true`\n is in {-1, 1} or {0, 1}, `pos_label` is set to 1, otherwise an\n error will be raised.\n\n name : str, default=None\n Name of ROC curve for labeling. If `None`, name will be set to\n `\"Classifier\"`.\n\n ax : matplotlib axes, default=None\n Axes object to plot on. If `None`, a new figure and axes is\n created.\n\n **kwargs : dict\n Additional keywords arguments passed to matplotlib `plot` function.\n\n Returns\n -------\n display : :class:`~sklearn.metrics.DetCurveDisplay`\n Object that stores computed values.\n\n See Also\n --------\n roc_curve : Compute Receiver operating characteristic (ROC) curve.\n RocCurveDisplay.from_estimator : ROC Curve visualization given an\n estimator and some data.\n roc_auc_score : Compute the area under the ROC curve.\n\n Examples\n --------\n >>> import matplotlib.pyplot as plt\n >>> from sklearn.datasets import make_classification\n >>> from sklearn.metrics import RocCurveDisplay\n >>> from sklearn.model_selection import train_test_split\n >>> from sklearn.svm import SVC\n >>> X, y = make_classification(random_state=0)\n >>> X_train, X_test, y_train, y_test = train_test_split(\n ... X, y, random_state=0)\n >>> clf = SVC(random_state=0).fit(X_train, y_train)\n >>> y_pred = clf.decision_function(X_test)\n >>> RocCurveDisplay.from_predictions(\n ... y_test, y_pred)\n <...>\n >>> plt.show()\n \"\"\"\n check_matplotlib_support(f'{cls.__name__}.from_predictions')\n (fpr, tpr, _) = roc_curve(y_true, y_pred, pos_label=pos_label, sample_weight=sample_weight, drop_intermediate=drop_intermediate)\n roc_auc = auc(fpr, tpr)\n name = 'Classifier' if name is None else name\n pos_label = _check_pos_label_consistency(pos_label, y_true)\n viz = RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=roc_auc, estimator_name=name, pos_label=pos_label)\n return viz.plot(ax=ax, name=name, **kwargs)" }, { @@ -119972,7 +129008,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "ax", @@ -119982,7 +129019,8 @@ "docstring": { "type": "matplotlib axes, default=None", "description": "Axes object to plot on. If `None`, a new figure and axes is\ncreated." - } + }, + "refined_type": {} }, { "name": "name", @@ -119992,13 +129030,14 @@ "docstring": { "type": "str, default=None", "description": "Name of ROC Curve for labeling. If `None`, use `estimator_name` if\nnot `None`, otherwise no labeling is shown." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Plot visualization\n\nExtra keyword arguments will be passed to matplotlib's ``plot``.", - "docstring": "Plot visualization\n\nExtra keyword arguments will be passed to matplotlib's ``plot``.\n\nParameters\n----------\nax : matplotlib axes, default=None\n Axes object to plot on. If `None`, a new figure and axes is\n created.\n\nname : str, default=None\n Name of ROC Curve for labeling. If `None`, use `estimator_name` if\n not `None`, otherwise no labeling is shown.\n\nReturns\n-------\ndisplay : :class:`~sklearn.metrics.plot.RocCurveDisplay`\n Object that stores computed values.", + "docstring": "Plot visualization\n\n Extra keyword arguments will be passed to matplotlib's ``plot``.\n\n Parameters\n ----------\n ax : matplotlib axes, default=None\n Axes object to plot on. If `None`, a new figure and axes is\n created.\n\n name : str, default=None\n Name of ROC Curve for labeling. If `None`, use `estimator_name` if\n not `None`, otherwise no labeling is shown.\n\n Returns\n -------\n display : :class:`~sklearn.metrics.plot.RocCurveDisplay`\n Object that stores computed values.\n ", "source_code": "\ndef plot(self, ax=None, *, name=None, **kwargs):\n \"\"\"Plot visualization\n\n Extra keyword arguments will be passed to matplotlib's ``plot``.\n\n Parameters\n ----------\n ax : matplotlib axes, default=None\n Axes object to plot on. If `None`, a new figure and axes is\n created.\n\n name : str, default=None\n Name of ROC Curve for labeling. If `None`, use `estimator_name` if\n not `None`, otherwise no labeling is shown.\n\n Returns\n -------\n display : :class:`~sklearn.metrics.plot.RocCurveDisplay`\n Object that stores computed values.\n \"\"\"\n check_matplotlib_support('RocCurveDisplay.plot')\n name = self.estimator_name if name is None else name\n line_kwargs = {}\n if self.roc_auc is not None and name is not None:\n line_kwargs['label'] = f'{name} (AUC = {self.roc_auc:0.2f})'\n elif self.roc_auc is not None:\n line_kwargs['label'] = f'AUC = {self.roc_auc:0.2f}'\n elif name is not None:\n line_kwargs['label'] = name\n line_kwargs.update(**kwargs)\n import matplotlib.pyplot as plt\n if ax is None:\n (fig, ax) = plt.subplots()\n (self.line_, ) = ax.plot(self.fpr, self.tpr, **line_kwargs)\n info_pos_label = f' (Positive label: {self.pos_label})' if self.pos_label is not None else ''\n xlabel = 'False Positive Rate' + info_pos_label\n ylabel = 'True Positive Rate' + info_pos_label\n ax.set(xlabel=xlabel, ylabel=ylabel)\n if 'label' in line_kwargs:\n ax.legend(loc='lower right')\n self.ax_ = ax\n self.figure_ = ax.figure\n return self" }, { @@ -120007,7 +129046,7 @@ "qname": "sklearn.metrics._plot.roc_curve.plot_roc_curve", "unique_qname": "sklearn.metrics._plot.roc_curve.plot_roc_curve", "decorators": [ - "deprecated('Function `plot_roc_curve` is deprecated in 1.0 and will be removed in 1.2. Use one of the class methods: RocCurveDisplay.from_predictions or RocCurveDisplay.from_estimator.')" + "deprecated('Function :func:`plot_roc_curve` is deprecated in 1.0 and will be removed in 1.2. Use one of the class methods: :meth:`sklearn.metric.RocCurveDisplay.from_predictions` or :meth:`sklearn.metric.RocCurveDisplay.from_estimator`.')" ], "parameters": [ { @@ -120018,7 +129057,8 @@ "docstring": { "type": "estimator instance", "description": "Fitted classifier or a fitted :class:`~sklearn.pipeline.Pipeline`\nin which the last estimator is a classifier." 
- } + }, + "refined_type": {} }, { "name": "X", @@ -120028,6 +129068,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "Input values." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -120038,7 +129082,8 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "Target values." - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -120048,7 +129093,8 @@ "docstring": { "type": "array-like of shape (n_samples,), default=None", "description": "Sample weights." - } + }, + "refined_type": {} }, { "name": "drop_intermediate", @@ -120058,7 +129104,8 @@ "docstring": { "type": "bool, default=True", "description": "Whether to drop some suboptimal thresholds which would not appear\non a plotted ROC curve. This is useful in order to create lighter\nROC curves." - } + }, + "refined_type": {} }, { "name": "response_method", @@ -120068,6 +129115,10 @@ "docstring": { "type": "{'predict_proba', 'decision_function', 'auto'} default='auto'", "description": "Specifies whether to use :term:`predict_proba` or\n:term:`decision_function` as the target response. If set to 'auto',\n:term:`predict_proba` is tried first and if it does not exist\n:term:`decision_function` is tried next." + }, + "refined_type": { + "kind": "EnumType", + "values": ["auto", "decision_function", "predict_proba"] } }, { @@ -120078,7 +129129,8 @@ "docstring": { "type": "str, default=None", "description": "Name of ROC Curve for labeling. If `None`, use the name of the\nestimator." - } + }, + "refined_type": {} }, { "name": "ax", @@ -120088,7 +129140,8 @@ "docstring": { "type": "matplotlib axes, default=None", "description": "Axes object to plot on. If `None`, a new figure and axes is created." - } + }, + "refined_type": {} }, { "name": "pos_label", @@ -120097,15 +129150,16 @@ "assigned_by": "NAME_ONLY", "docstring": { "type": "str or int, default=None", - "description": "The class considered as the positive class when computing the roc auc\nmetrics. By default, `estimators.classes_[1]` is considered\nas the positive class.\n\n.. versionadded:: 0.24" - } + "description": "The class considered as the positive class when computing the roc auc\nmetrics. By default, `estimators.classes_[1]` is considered\nas the positive class." + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Plot Receiver operating characteristic (ROC) curve.\n\nExtra keyword arguments will be passed to matplotlib's `plot`. Read more in the :ref:`User Guide `. .. deprecated:: 1.0 `plot_roc_curve` is deprecated in 1.0 and will be removed in 1.2. Use one of the following class methods: :func:`~sklearn.metrics.RocCurveDisplay.from_predictions` or :func:`~sklearn.metrics.RocCurveDisplay.from_estimator`.", - "docstring": "Plot Receiver operating characteristic (ROC) curve.\n\nExtra keyword arguments will be passed to matplotlib's `plot`.\n\nRead more in the :ref:`User Guide `.\n\n.. deprecated:: 1.0\n `plot_roc_curve` is deprecated in 1.0 and will be removed in\n 1.2. 
Use one of the following class methods:\n :func:`~sklearn.metrics.RocCurveDisplay.from_predictions` or\n :func:`~sklearn.metrics.RocCurveDisplay.from_estimator`.\n\nParameters\n----------\nestimator : estimator instance\n Fitted classifier or a fitted :class:`~sklearn.pipeline.Pipeline`\n in which the last estimator is a classifier.\n\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n Input values.\n\ny : array-like of shape (n_samples,)\n Target values.\n\nsample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\ndrop_intermediate : bool, default=True\n Whether to drop some suboptimal thresholds which would not appear\n on a plotted ROC curve. This is useful in order to create lighter\n ROC curves.\n\nresponse_method : {'predict_proba', 'decision_function', 'auto'} default='auto'\n Specifies whether to use :term:`predict_proba` or\n :term:`decision_function` as the target response. If set to 'auto',\n :term:`predict_proba` is tried first and if it does not exist\n :term:`decision_function` is tried next.\n\nname : str, default=None\n Name of ROC Curve for labeling. If `None`, use the name of the\n estimator.\n\nax : matplotlib axes, default=None\n Axes object to plot on. If `None`, a new figure and axes is created.\n\npos_label : str or int, default=None\n The class considered as the positive class when computing the roc auc\n metrics. By default, `estimators.classes_[1]` is considered\n as the positive class.\n\n .. versionadded:: 0.24\n\nReturns\n-------\ndisplay : :class:`~sklearn.metrics.RocCurveDisplay`\n Object that stores computed values.\n\nSee Also\n--------\nroc_curve : Compute Receiver operating characteristic (ROC) curve.\nRocCurveDisplay.from_estimator : ROC Curve visualization given an estimator\n and some data.\nRocCurveDisplay.from_predictions : ROC Curve visualisation given the\n true and predicted values.\nroc_auc_score : Compute the area under the ROC curve.\n\nExamples\n--------\n>>> import matplotlib.pyplot as plt\n>>> from sklearn import datasets, metrics, model_selection, svm\n>>> X, y = datasets.make_classification(random_state=0)\n>>> X_train, X_test, y_train, y_test = model_selection.train_test_split(\n... X, y, random_state=0)\n>>> clf = svm.SVC(random_state=0)\n>>> clf.fit(X_train, y_train)\nSVC(random_state=0)\n>>> metrics.plot_roc_curve(clf, X_test, y_test) # doctest: +SKIP\n<...>\n>>> plt.show()", - "source_code": "\n@deprecated('Function `plot_roc_curve` is deprecated in 1.0 and will be removed in 1.2. Use one of the class methods: RocCurveDisplay.from_predictions or RocCurveDisplay.from_estimator.')\ndef plot_roc_curve(estimator, X, y, *, sample_weight=None, drop_intermediate=True, response_method='auto', name=None, ax=None, pos_label=None, **kwargs):\n \"\"\"Plot Receiver operating characteristic (ROC) curve.\n\n Extra keyword arguments will be passed to matplotlib's `plot`.\n\n Read more in the :ref:`User Guide `.\n\n .. deprecated:: 1.0\n `plot_roc_curve` is deprecated in 1.0 and will be removed in\n 1.2. 
Use one of the following class methods:\n :func:`~sklearn.metrics.RocCurveDisplay.from_predictions` or\n :func:`~sklearn.metrics.RocCurveDisplay.from_estimator`.\n\n Parameters\n ----------\n estimator : estimator instance\n Fitted classifier or a fitted :class:`~sklearn.pipeline.Pipeline`\n in which the last estimator is a classifier.\n\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Input values.\n\n y : array-like of shape (n_samples,)\n Target values.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n drop_intermediate : bool, default=True\n Whether to drop some suboptimal thresholds which would not appear\n on a plotted ROC curve. This is useful in order to create lighter\n ROC curves.\n\n response_method : {'predict_proba', 'decision_function', 'auto'} default='auto'\n Specifies whether to use :term:`predict_proba` or\n :term:`decision_function` as the target response. If set to 'auto',\n :term:`predict_proba` is tried first and if it does not exist\n :term:`decision_function` is tried next.\n\n name : str, default=None\n Name of ROC Curve for labeling. If `None`, use the name of the\n estimator.\n\n ax : matplotlib axes, default=None\n Axes object to plot on. If `None`, a new figure and axes is created.\n\n pos_label : str or int, default=None\n The class considered as the positive class when computing the roc auc\n metrics. By default, `estimators.classes_[1]` is considered\n as the positive class.\n\n .. versionadded:: 0.24\n\n Returns\n -------\n display : :class:`~sklearn.metrics.RocCurveDisplay`\n Object that stores computed values.\n\n See Also\n --------\n roc_curve : Compute Receiver operating characteristic (ROC) curve.\n RocCurveDisplay.from_estimator : ROC Curve visualization given an estimator\n and some data.\n RocCurveDisplay.from_predictions : ROC Curve visualisation given the\n true and predicted values.\n roc_auc_score : Compute the area under the ROC curve.\n\n Examples\n --------\n >>> import matplotlib.pyplot as plt\n >>> from sklearn import datasets, metrics, model_selection, svm\n >>> X, y = datasets.make_classification(random_state=0)\n >>> X_train, X_test, y_train, y_test = model_selection.train_test_split(\n ... 
X, y, random_state=0)\n >>> clf = svm.SVC(random_state=0)\n >>> clf.fit(X_train, y_train)\n SVC(random_state=0)\n >>> metrics.plot_roc_curve(clf, X_test, y_test) # doctest: +SKIP\n <...>\n >>> plt.show()\n \"\"\"\n check_matplotlib_support('plot_roc_curve')\n (y_pred, pos_label) = _get_response(X, estimator, response_method, pos_label=pos_label)\n (fpr, tpr, _) = roc_curve(y, y_pred, pos_label=pos_label, sample_weight=sample_weight, drop_intermediate=drop_intermediate)\n roc_auc = auc(fpr, tpr)\n name = estimator.__class__.__name__ if name is None else name\n viz = RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=roc_auc, estimator_name=name, pos_label=pos_label)\n return viz.plot(ax=ax, name=name, **kwargs)" + "description": "Plot Receiver operating characteristic (ROC) curve.\n\nExtra keyword arguments will be passed to matplotlib's `plot`.\n\nRead more in the :ref:`User Guide `.", + "docstring": "Plot Receiver operating characteristic (ROC) curve.\n\n Extra keyword arguments will be passed to matplotlib's `plot`.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n estimator : estimator instance\n Fitted classifier or a fitted :class:`~sklearn.pipeline.Pipeline`\n in which the last estimator is a classifier.\n\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Input values.\n\n y : array-like of shape (n_samples,)\n Target values.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n drop_intermediate : bool, default=True\n Whether to drop some suboptimal thresholds which would not appear\n on a plotted ROC curve. This is useful in order to create lighter\n ROC curves.\n\n response_method : {'predict_proba', 'decision_function', 'auto'} default='auto'\n Specifies whether to use :term:`predict_proba` or\n :term:`decision_function` as the target response. If set to 'auto',\n :term:`predict_proba` is tried first and if it does not exist\n :term:`decision_function` is tried next.\n\n name : str, default=None\n Name of ROC Curve for labeling. If `None`, use the name of the\n estimator.\n\n ax : matplotlib axes, default=None\n Axes object to plot on. If `None`, a new figure and axes is created.\n\n pos_label : str or int, default=None\n The class considered as the positive class when computing the roc auc\n metrics. By default, `estimators.classes_[1]` is considered\n as the positive class.\n\n **kwargs : dict\n Additional keywords arguments passed to matplotlib `plot` function.\n\n .. versionadded:: 0.24\n\n Returns\n -------\n display : :class:`~sklearn.metrics.RocCurveDisplay`\n Object that stores computed values.\n\n See Also\n --------\n roc_curve : Compute Receiver operating characteristic (ROC) curve.\n RocCurveDisplay.from_estimator : ROC Curve visualization given an estimator\n and some data.\n RocCurveDisplay.from_predictions : ROC Curve visualisation given the\n true and predicted values.\n roc_auc_score : Compute the area under the ROC curve.\n\n Examples\n --------\n >>> import matplotlib.pyplot as plt\n >>> from sklearn import datasets, metrics, model_selection, svm\n >>> X, y = datasets.make_classification(random_state=0)\n >>> X_train, X_test, y_train, y_test = model_selection.train_test_split(\n ... 
X, y, random_state=0)\n >>> clf = svm.SVC(random_state=0)\n >>> clf.fit(X_train, y_train)\n SVC(random_state=0)\n >>> metrics.plot_roc_curve(clf, X_test, y_test) # doctest: +SKIP\n <...>\n >>> plt.show()\n ", + "source_code": "\n@deprecated('Function :func:`plot_roc_curve` is deprecated in 1.0 and will be removed in 1.2. Use one of the class methods: :meth:`sklearn.metric.RocCurveDisplay.from_predictions` or :meth:`sklearn.metric.RocCurveDisplay.from_estimator`.')\ndef plot_roc_curve(estimator, X, y, *, sample_weight=None, drop_intermediate=True, response_method='auto', name=None, ax=None, pos_label=None, **kwargs):\n \"\"\"Plot Receiver operating characteristic (ROC) curve.\n\n Extra keyword arguments will be passed to matplotlib's `plot`.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n estimator : estimator instance\n Fitted classifier or a fitted :class:`~sklearn.pipeline.Pipeline`\n in which the last estimator is a classifier.\n\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Input values.\n\n y : array-like of shape (n_samples,)\n Target values.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n drop_intermediate : bool, default=True\n Whether to drop some suboptimal thresholds which would not appear\n on a plotted ROC curve. This is useful in order to create lighter\n ROC curves.\n\n response_method : {'predict_proba', 'decision_function', 'auto'} default='auto'\n Specifies whether to use :term:`predict_proba` or\n :term:`decision_function` as the target response. If set to 'auto',\n :term:`predict_proba` is tried first and if it does not exist\n :term:`decision_function` is tried next.\n\n name : str, default=None\n Name of ROC Curve for labeling. If `None`, use the name of the\n estimator.\n\n ax : matplotlib axes, default=None\n Axes object to plot on. If `None`, a new figure and axes is created.\n\n pos_label : str or int, default=None\n The class considered as the positive class when computing the roc auc\n metrics. By default, `estimators.classes_[1]` is considered\n as the positive class.\n\n **kwargs : dict\n Additional keywords arguments passed to matplotlib `plot` function.\n\n .. versionadded:: 0.24\n\n Returns\n -------\n display : :class:`~sklearn.metrics.RocCurveDisplay`\n Object that stores computed values.\n\n See Also\n --------\n roc_curve : Compute Receiver operating characteristic (ROC) curve.\n RocCurveDisplay.from_estimator : ROC Curve visualization given an estimator\n and some data.\n RocCurveDisplay.from_predictions : ROC Curve visualisation given the\n true and predicted values.\n roc_auc_score : Compute the area under the ROC curve.\n\n Examples\n --------\n >>> import matplotlib.pyplot as plt\n >>> from sklearn import datasets, metrics, model_selection, svm\n >>> X, y = datasets.make_classification(random_state=0)\n >>> X_train, X_test, y_train, y_test = model_selection.train_test_split(\n ... 
X, y, random_state=0)\n >>> clf = svm.SVC(random_state=0)\n >>> clf.fit(X_train, y_train)\n SVC(random_state=0)\n >>> metrics.plot_roc_curve(clf, X_test, y_test) # doctest: +SKIP\n <...>\n >>> plt.show()\n \"\"\"\n check_matplotlib_support('plot_roc_curve')\n (y_pred, pos_label) = _get_response(X, estimator, response_method, pos_label=pos_label)\n (fpr, tpr, _) = roc_curve(y, y_pred, pos_label=pos_label, sample_weight=sample_weight, drop_intermediate=drop_intermediate)\n roc_auc = auc(fpr, tpr)\n name = estimator.__class__.__name__ if name is None else name\n viz = RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=roc_auc, estimator_name=name, pos_label=pos_label)\n return viz.plot(ax=ax, name=name, **kwargs)" }, { "name": "_binary_clf_curve", @@ -120122,7 +129176,8 @@ "docstring": { "type": "ndarray of shape (n_samples,)", "description": "True targets of binary classification." - } + }, + "refined_type": {} }, { "name": "y_score", @@ -120132,7 +129187,8 @@ "docstring": { "type": "ndarray of shape (n_samples,)", "description": "Estimated probabilities or output of a decision function." - } + }, + "refined_type": {} }, { "name": "pos_label", @@ -120142,7 +129198,8 @@ "docstring": { "type": "int or str, default=None", "description": "The label of the positive class." - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -120152,13 +129209,14 @@ "docstring": { "type": "array-like of shape (n_samples,), default=None", "description": "Sample weights." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Calculate true and false positives per binary classification threshold.", - "docstring": "Calculate true and false positives per binary classification threshold.\n\nParameters\n----------\ny_true : ndarray of shape (n_samples,)\n True targets of binary classification.\n\ny_score : ndarray of shape (n_samples,)\n Estimated probabilities or output of a decision function.\n\npos_label : int or str, default=None\n The label of the positive class.\n\nsample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\nReturns\n-------\nfps : ndarray of shape (n_thresholds,)\n A count of false positives, at index i being the number of negative\n samples assigned a score >= thresholds[i]. The total number of\n negative samples is equal to fps[-1] (thus true negatives are given by\n fps[-1] - fps).\n\ntps : ndarray of shape (n_thresholds,)\n An increasing count of true positives, at index i being the number\n of positive samples assigned a score >= thresholds[i]. The total\n number of positive samples is equal to tps[-1] (thus false negatives\n are given by tps[-1] - tps).\n\nthresholds : ndarray of shape (n_thresholds,)\n Decreasing score values.", + "docstring": "Calculate true and false positives per binary classification threshold.\n\n Parameters\n ----------\n y_true : ndarray of shape (n_samples,)\n True targets of binary classification.\n\n y_score : ndarray of shape (n_samples,)\n Estimated probabilities or output of a decision function.\n\n pos_label : int or str, default=None\n The label of the positive class.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n Returns\n -------\n fps : ndarray of shape (n_thresholds,)\n A count of false positives, at index i being the number of negative\n samples assigned a score >= thresholds[i]. 
The total number of\n negative samples is equal to fps[-1] (thus true negatives are given by\n fps[-1] - fps).\n\n tps : ndarray of shape (n_thresholds,)\n An increasing count of true positives, at index i being the number\n of positive samples assigned a score >= thresholds[i]. The total\n number of positive samples is equal to tps[-1] (thus false negatives\n are given by tps[-1] - tps).\n\n thresholds : ndarray of shape (n_thresholds,)\n Decreasing score values.\n ", "source_code": "\ndef _binary_clf_curve(y_true, y_score, pos_label=None, sample_weight=None):\n \"\"\"Calculate true and false positives per binary classification threshold.\n\n Parameters\n ----------\n y_true : ndarray of shape (n_samples,)\n True targets of binary classification.\n\n y_score : ndarray of shape (n_samples,)\n Estimated probabilities or output of a decision function.\n\n pos_label : int or str, default=None\n The label of the positive class.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n Returns\n -------\n fps : ndarray of shape (n_thresholds,)\n A count of false positives, at index i being the number of negative\n samples assigned a score >= thresholds[i]. The total number of\n negative samples is equal to fps[-1] (thus true negatives are given by\n fps[-1] - fps).\n\n tps : ndarray of shape (n_thresholds,)\n An increasing count of true positives, at index i being the number\n of positive samples assigned a score >= thresholds[i]. The total\n number of positive samples is equal to tps[-1] (thus false negatives\n are given by tps[-1] - tps).\n\n thresholds : ndarray of shape (n_thresholds,)\n Decreasing score values.\n \"\"\"\n y_type = type_of_target(y_true)\n if not (y_type == 'binary' or y_type == 'multiclass' and pos_label is not None):\n raise ValueError('{0} format is not supported'.format(y_type))\n check_consistent_length(y_true, y_score, sample_weight)\n y_true = column_or_1d(y_true)\n y_score = column_or_1d(y_score)\n assert_all_finite(y_true)\n assert_all_finite(y_score)\n if sample_weight is not None:\n sample_weight = column_or_1d(sample_weight)\n sample_weight = _check_sample_weight(sample_weight, y_true)\n nonzero_weight_mask = sample_weight != 0\n y_true = y_true[nonzero_weight_mask]\n y_score = y_score[nonzero_weight_mask]\n sample_weight = sample_weight[nonzero_weight_mask]\n pos_label = _check_pos_label_consistency(pos_label, y_true)\n y_true = y_true == pos_label\n desc_score_indices = np.argsort(y_score, kind='mergesort')[::-1]\n y_score = y_score[desc_score_indices]\n y_true = y_true[desc_score_indices]\n if sample_weight is not None:\n weight = sample_weight[desc_score_indices]\n else:\n weight = 1.0\n distinct_value_indices = np.where(np.diff(y_score))[0]\n threshold_idxs = np.r_[distinct_value_indices, y_true.size - 1]\n tps = stable_cumsum(y_true * weight)[threshold_idxs]\n if sample_weight is not None:\n fps = stable_cumsum((1 - y_true) * weight)[threshold_idxs]\n else:\n fps = 1 + threshold_idxs - tps\n return fps, tps, y_score[threshold_idxs]" }, { @@ -120176,7 +129234,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y_score", @@ -120186,7 +129245,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -120196,7 +129256,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "max_fpr", @@ -120206,7 +129267,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": 
[], @@ -120230,13 +129292,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _check_dcg_target_type(y_true):\n y_type = type_of_target(y_true)\n supported_fmt = ('multilabel-indicator', 'continuous-multioutput', 'multiclass-multioutput')\n if y_type not in supported_fmt:\n raise ValueError('Only {} formats are supported. Got {} instead'.format(supported_fmt, y_type))" }, { @@ -120254,7 +129317,8 @@ "docstring": { "type": "ndarray of shape (n_samples, n_labels)", "description": "True targets of multilabel classification, or true scores of entities\nto be ranked." - } + }, + "refined_type": {} }, { "name": "y_score", @@ -120264,7 +129328,8 @@ "docstring": { "type": "ndarray of shape (n_samples, n_labels)", "description": "Target scores, can either be probability estimates, confidence values,\nor non-thresholded measure of decisions (as returned by\n\"decision_function\" on some classifiers)." - } + }, + "refined_type": {} }, { "name": "k", @@ -120274,7 +129339,8 @@ "docstring": { "type": "int, default=None", "description": "Only consider the highest k scores in the ranking. If `None`, use all\noutputs." - } + }, + "refined_type": {} }, { "name": "log_base", @@ -120284,7 +129350,8 @@ "docstring": { "type": "float, default=2", "description": "Base of the logarithm used for the discount. A low value means a\nsharper discount (top results are more important)." - } + }, + "refined_type": {} }, { "name": "ignore_ties", @@ -120294,13 +129361,14 @@ "docstring": { "type": "bool, default=False", "description": "Assume that there are no ties in y_score (which is likely to be the\ncase if y_score is continuous) for efficiency gains." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Compute Discounted Cumulative Gain.\n\nSum the true scores ranked in the order induced by the predicted scores, after applying a logarithmic discount. This ranking metric yields a high value if true labels are ranked high by ``y_score``.", - "docstring": "Compute Discounted Cumulative Gain.\n\nSum the true scores ranked in the order induced by the predicted scores,\nafter applying a logarithmic discount.\n\nThis ranking metric yields a high value if true labels are ranked high by\n``y_score``.\n\nParameters\n----------\ny_true : ndarray of shape (n_samples, n_labels)\n True targets of multilabel classification, or true scores of entities\n to be ranked.\n\ny_score : ndarray of shape (n_samples, n_labels)\n Target scores, can either be probability estimates, confidence values,\n or non-thresholded measure of decisions (as returned by\n \"decision_function\" on some classifiers).\n\nk : int, default=None\n Only consider the highest k scores in the ranking. If `None`, use all\n outputs.\n\nlog_base : float, default=2\n Base of the logarithm used for the discount. 
A low value means a\n sharper discount (top results are more important).\n\nignore_ties : bool, default=False\n Assume that there are no ties in y_score (which is likely to be the\n case if y_score is continuous) for efficiency gains.\n\nReturns\n-------\ndiscounted_cumulative_gain : ndarray of shape (n_samples,)\n The DCG score for each sample.\n\nSee Also\n--------\nndcg_score : The Discounted Cumulative Gain divided by the Ideal Discounted\n Cumulative Gain (the DCG obtained for a perfect ranking), in order to\n have a score between 0 and 1.", + "description": "Compute Discounted Cumulative Gain.\n\nSum the true scores ranked in the order induced by the predicted scores,\nafter applying a logarithmic discount.\n\nThis ranking metric yields a high value if true labels are ranked high by\n``y_score``.", + "docstring": "Compute Discounted Cumulative Gain.\n\n Sum the true scores ranked in the order induced by the predicted scores,\n after applying a logarithmic discount.\n\n This ranking metric yields a high value if true labels are ranked high by\n ``y_score``.\n\n Parameters\n ----------\n y_true : ndarray of shape (n_samples, n_labels)\n True targets of multilabel classification, or true scores of entities\n to be ranked.\n\n y_score : ndarray of shape (n_samples, n_labels)\n Target scores, can either be probability estimates, confidence values,\n or non-thresholded measure of decisions (as returned by\n \"decision_function\" on some classifiers).\n\n k : int, default=None\n Only consider the highest k scores in the ranking. If `None`, use all\n outputs.\n\n log_base : float, default=2\n Base of the logarithm used for the discount. A low value means a\n sharper discount (top results are more important).\n\n ignore_ties : bool, default=False\n Assume that there are no ties in y_score (which is likely to be the\n case if y_score is continuous) for efficiency gains.\n\n Returns\n -------\n discounted_cumulative_gain : ndarray of shape (n_samples,)\n The DCG score for each sample.\n\n See Also\n --------\n ndcg_score : The Discounted Cumulative Gain divided by the Ideal Discounted\n Cumulative Gain (the DCG obtained for a perfect ranking), in order to\n have a score between 0 and 1.\n ", "source_code": "\ndef _dcg_sample_scores(y_true, y_score, k=None, log_base=2, ignore_ties=False):\n \"\"\"Compute Discounted Cumulative Gain.\n\n Sum the true scores ranked in the order induced by the predicted scores,\n after applying a logarithmic discount.\n\n This ranking metric yields a high value if true labels are ranked high by\n ``y_score``.\n\n Parameters\n ----------\n y_true : ndarray of shape (n_samples, n_labels)\n True targets of multilabel classification, or true scores of entities\n to be ranked.\n\n y_score : ndarray of shape (n_samples, n_labels)\n Target scores, can either be probability estimates, confidence values,\n or non-thresholded measure of decisions (as returned by\n \"decision_function\" on some classifiers).\n\n k : int, default=None\n Only consider the highest k scores in the ranking. If `None`, use all\n outputs.\n\n log_base : float, default=2\n Base of the logarithm used for the discount. 
A low value means a\n sharper discount (top results are more important).\n\n ignore_ties : bool, default=False\n Assume that there are no ties in y_score (which is likely to be the\n case if y_score is continuous) for efficiency gains.\n\n Returns\n -------\n discounted_cumulative_gain : ndarray of shape (n_samples,)\n The DCG score for each sample.\n\n See Also\n --------\n ndcg_score : The Discounted Cumulative Gain divided by the Ideal Discounted\n Cumulative Gain (the DCG obtained for a perfect ranking), in order to\n have a score between 0 and 1.\n \"\"\"\n discount = 1 / (np.log(np.arange(y_true.shape[1]) + 2) / np.log(log_base))\n if k is not None:\n discount[k:] = 0\n if ignore_ties:\n ranking = np.argsort(y_score)[:, ::-1]\n ranked = y_true[np.arange(ranking.shape[0])[:, np.newaxis], ranking]\n cumulative_gains = discount.dot(ranked.T)\n else:\n discount_cumsum = np.cumsum(discount)\n cumulative_gains = [_tie_averaged_dcg(y_t, y_s, discount_cumsum) for (y_t, y_s) in zip(y_true, y_score)]\n cumulative_gains = np.asarray(cumulative_gains)\n return cumulative_gains" }, { @@ -120318,7 +129386,8 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "True multiclass labels." - } + }, + "refined_type": {} }, { "name": "y_score", @@ -120328,7 +129397,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_classes)", "description": "Target scores corresponding to probability estimates of a sample\nbelonging to a particular class" - } + }, + "refined_type": {} }, { "name": "labels", @@ -120338,7 +129408,8 @@ "docstring": { "type": "array-like of shape (n_classes,) or None", "description": "List of labels to index ``y_score`` used for multiclass. If ``None``,\nthe lexical order of ``y_true`` is used to index ``y_score``." - } + }, + "refined_type": {} }, { "name": "multi_class", @@ -120348,6 +129419,10 @@ "docstring": { "type": "{'ovr', 'ovo'}", "description": "Determines the type of multiclass configuration to use.\n``'ovr'``:\n Calculate metrics for the multiclass case using the one-vs-rest\n approach.\n``'ovo'``:\n Calculate metrics for the multiclass case using the one-vs-one\n approach." + }, + "refined_type": { + "kind": "EnumType", + "values": ["ovo", "ovr"] } }, { @@ -120358,6 +129433,10 @@ "docstring": { "type": "{'macro', 'weighted'}", "description": "Determines the type of averaging performed on the pairwise binary\nmetric scores\n``'macro'``:\n Calculate metrics for each label, and find their unweighted\n mean. This does not take label imbalance into account. Classes\n are assumed to be uniformly distributed.\n``'weighted'``:\n Calculate metrics for each label, taking into account the\n prevalence of the classes." + }, + "refined_type": { + "kind": "EnumType", + "values": ["macro", "weighted"] } }, { @@ -120368,13 +129447,14 @@ "docstring": { "type": "array-like of shape (n_samples,) or None", "description": "Sample weights." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Multiclass roc auc score.", - "docstring": "Multiclass roc auc score.\n\nParameters\n----------\ny_true : array-like of shape (n_samples,)\n True multiclass labels.\n\ny_score : array-like of shape (n_samples, n_classes)\n Target scores corresponding to probability estimates of a sample\n belonging to a particular class\n\nlabels : array-like of shape (n_classes,) or None\n List of labels to index ``y_score`` used for multiclass. 
If ``None``,\n the lexical order of ``y_true`` is used to index ``y_score``.\n\nmulti_class : {'ovr', 'ovo'}\n Determines the type of multiclass configuration to use.\n ``'ovr'``:\n Calculate metrics for the multiclass case using the one-vs-rest\n approach.\n ``'ovo'``:\n Calculate metrics for the multiclass case using the one-vs-one\n approach.\n\naverage : {'macro', 'weighted'}\n Determines the type of averaging performed on the pairwise binary\n metric scores\n ``'macro'``:\n Calculate metrics for each label, and find their unweighted\n mean. This does not take label imbalance into account. Classes\n are assumed to be uniformly distributed.\n ``'weighted'``:\n Calculate metrics for each label, taking into account the\n prevalence of the classes.\n\nsample_weight : array-like of shape (n_samples,) or None\n Sample weights.", + "docstring": "Multiclass roc auc score.\n\n Parameters\n ----------\n y_true : array-like of shape (n_samples,)\n True multiclass labels.\n\n y_score : array-like of shape (n_samples, n_classes)\n Target scores corresponding to probability estimates of a sample\n belonging to a particular class\n\n labels : array-like of shape (n_classes,) or None\n List of labels to index ``y_score`` used for multiclass. If ``None``,\n the lexical order of ``y_true`` is used to index ``y_score``.\n\n multi_class : {'ovr', 'ovo'}\n Determines the type of multiclass configuration to use.\n ``'ovr'``:\n Calculate metrics for the multiclass case using the one-vs-rest\n approach.\n ``'ovo'``:\n Calculate metrics for the multiclass case using the one-vs-one\n approach.\n\n average : {'macro', 'weighted'}\n Determines the type of averaging performed on the pairwise binary\n metric scores\n ``'macro'``:\n Calculate metrics for each label, and find their unweighted\n mean. This does not take label imbalance into account. Classes\n are assumed to be uniformly distributed.\n ``'weighted'``:\n Calculate metrics for each label, taking into account the\n prevalence of the classes.\n\n sample_weight : array-like of shape (n_samples,) or None\n Sample weights.\n\n ", "source_code": "\ndef _multiclass_roc_auc_score(y_true, y_score, labels, multi_class, average, sample_weight):\n \"\"\"Multiclass roc auc score.\n\n Parameters\n ----------\n y_true : array-like of shape (n_samples,)\n True multiclass labels.\n\n y_score : array-like of shape (n_samples, n_classes)\n Target scores corresponding to probability estimates of a sample\n belonging to a particular class\n\n labels : array-like of shape (n_classes,) or None\n List of labels to index ``y_score`` used for multiclass. If ``None``,\n the lexical order of ``y_true`` is used to index ``y_score``.\n\n multi_class : {'ovr', 'ovo'}\n Determines the type of multiclass configuration to use.\n ``'ovr'``:\n Calculate metrics for the multiclass case using the one-vs-rest\n approach.\n ``'ovo'``:\n Calculate metrics for the multiclass case using the one-vs-one\n approach.\n\n average : {'macro', 'weighted'}\n Determines the type of averaging performed on the pairwise binary\n metric scores\n ``'macro'``:\n Calculate metrics for each label, and find their unweighted\n mean. This does not take label imbalance into account. 
Classes\n are assumed to be uniformly distributed.\n ``'weighted'``:\n Calculate metrics for each label, taking into account the\n prevalence of the classes.\n\n sample_weight : array-like of shape (n_samples,) or None\n Sample weights.\n\n \"\"\"\n if not np.allclose(1, y_score.sum(axis=1)):\n raise ValueError('Target scores need to be probabilities for multiclass roc_auc, i.e. they should sum up to 1.0 over classes')\n average_options = ('macro', 'weighted')\n if average not in average_options:\n raise ValueError('average must be one of {0} for multiclass problems'.format(average_options))\n multiclass_options = ('ovo', 'ovr')\n if multi_class not in multiclass_options:\n raise ValueError(\"multi_class='{0}' is not supported for multiclass ROC AUC, multi_class must be in {1}\".format(multi_class, multiclass_options))\n if labels is not None:\n labels = column_or_1d(labels)\n classes = _unique(labels)\n if len(classes) != len(labels):\n raise ValueError(\"Parameter 'labels' must be unique\")\n if not np.array_equal(classes, labels):\n raise ValueError(\"Parameter 'labels' must be ordered\")\n if len(classes) != y_score.shape[1]:\n raise ValueError(\"Number of given labels, {0}, not equal to the number of columns in 'y_score', {1}\".format(len(classes), y_score.shape[1]))\n if len(np.setdiff1d(y_true, classes)):\n raise ValueError(\"'y_true' contains labels not in parameter 'labels'\")\n else:\n classes = _unique(y_true)\n if len(classes) != y_score.shape[1]:\n raise ValueError(\"Number of classes in y_true not equal to the number of columns in 'y_score'\")\n if multi_class == 'ovo':\n if sample_weight is not None:\n raise ValueError(\"sample_weight is not supported for multiclass one-vs-one ROC AUC, 'sample_weight' must be None in this case.\")\n y_true_encoded = _encode(y_true, uniques=classes)\n return _average_multiclass_ovo_score(_binary_roc_auc_score, y_true_encoded, y_score, average=average)\n else:\n y_true_multilabel = label_binarize(y_true, classes=classes)\n return _average_binary_score(_binary_roc_auc_score, y_true_multilabel, y_score, average, sample_weight=sample_weight)" }, { @@ -120392,7 +129472,8 @@ "docstring": { "type": "ndarray of shape (n_samples, n_labels)", "description": "True targets of multilabel classification, or true scores of entities\nto be ranked." - } + }, + "refined_type": {} }, { "name": "y_score", @@ -120402,7 +129483,8 @@ "docstring": { "type": "ndarray of shape (n_samples, n_labels)", "description": "Target scores, can either be probability estimates, confidence values,\nor non-thresholded measure of decisions (as returned by\n\"decision_function\" on some classifiers)." - } + }, + "refined_type": {} }, { "name": "k", @@ -120412,7 +129494,8 @@ "docstring": { "type": "int, default=None", "description": "Only consider the highest k scores in the ranking. If None, use all\noutputs." - } + }, + "refined_type": {} }, { "name": "ignore_ties", @@ -120422,13 +129505,14 @@ "docstring": { "type": "bool, default=False", "description": "Assume that there are no ties in y_score (which is likely to be the\ncase if y_score is continuous) for efficiency gains." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Compute Normalized Discounted Cumulative Gain.\n\nSum the true scores ranked in the order induced by the predicted scores, after applying a logarithmic discount. Then divide by the best possible score (Ideal DCG, obtained for a perfect ranking) to obtain a score between 0 and 1. 
This ranking metric yields a high value if true labels are ranked high by ``y_score``.", - "docstring": "Compute Normalized Discounted Cumulative Gain.\n\nSum the true scores ranked in the order induced by the predicted scores,\nafter applying a logarithmic discount. Then divide by the best possible\nscore (Ideal DCG, obtained for a perfect ranking) to obtain a score between\n0 and 1.\n\nThis ranking metric yields a high value if true labels are ranked high by\n``y_score``.\n\nParameters\n----------\ny_true : ndarray of shape (n_samples, n_labels)\n True targets of multilabel classification, or true scores of entities\n to be ranked.\n\ny_score : ndarray of shape (n_samples, n_labels)\n Target scores, can either be probability estimates, confidence values,\n or non-thresholded measure of decisions (as returned by\n \"decision_function\" on some classifiers).\n\nk : int, default=None\n Only consider the highest k scores in the ranking. If None, use all\n outputs.\n\nignore_ties : bool, default=False\n Assume that there are no ties in y_score (which is likely to be the\n case if y_score is continuous) for efficiency gains.\n\nReturns\n-------\nnormalized_discounted_cumulative_gain : ndarray of shape (n_samples,)\n The NDCG score for each sample (float in [0., 1.]).\n\nSee Also\n--------\ndcg_score : Discounted Cumulative Gain (not normalized).", + "description": "Compute Normalized Discounted Cumulative Gain.\n\nSum the true scores ranked in the order induced by the predicted scores,\nafter applying a logarithmic discount. Then divide by the best possible\nscore (Ideal DCG, obtained for a perfect ranking) to obtain a score between\n0 and 1.\n\nThis ranking metric yields a high value if true labels are ranked high by\n``y_score``.", + "docstring": "Compute Normalized Discounted Cumulative Gain.\n\n Sum the true scores ranked in the order induced by the predicted scores,\n after applying a logarithmic discount. Then divide by the best possible\n score (Ideal DCG, obtained for a perfect ranking) to obtain a score between\n 0 and 1.\n\n This ranking metric yields a high value if true labels are ranked high by\n ``y_score``.\n\n Parameters\n ----------\n y_true : ndarray of shape (n_samples, n_labels)\n True targets of multilabel classification, or true scores of entities\n to be ranked.\n\n y_score : ndarray of shape (n_samples, n_labels)\n Target scores, can either be probability estimates, confidence values,\n or non-thresholded measure of decisions (as returned by\n \"decision_function\" on some classifiers).\n\n k : int, default=None\n Only consider the highest k scores in the ranking. If None, use all\n outputs.\n\n ignore_ties : bool, default=False\n Assume that there are no ties in y_score (which is likely to be the\n case if y_score is continuous) for efficiency gains.\n\n Returns\n -------\n normalized_discounted_cumulative_gain : ndarray of shape (n_samples,)\n The NDCG score for each sample (float in [0., 1.]).\n\n See Also\n --------\n dcg_score : Discounted Cumulative Gain (not normalized).\n\n ", "source_code": "\ndef _ndcg_sample_scores(y_true, y_score, k=None, ignore_ties=False):\n \"\"\"Compute Normalized Discounted Cumulative Gain.\n\n Sum the true scores ranked in the order induced by the predicted scores,\n after applying a logarithmic discount. 
Then divide by the best possible\n score (Ideal DCG, obtained for a perfect ranking) to obtain a score between\n 0 and 1.\n\n This ranking metric yields a high value if true labels are ranked high by\n ``y_score``.\n\n Parameters\n ----------\n y_true : ndarray of shape (n_samples, n_labels)\n True targets of multilabel classification, or true scores of entities\n to be ranked.\n\n y_score : ndarray of shape (n_samples, n_labels)\n Target scores, can either be probability estimates, confidence values,\n or non-thresholded measure of decisions (as returned by\n \"decision_function\" on some classifiers).\n\n k : int, default=None\n Only consider the highest k scores in the ranking. If None, use all\n outputs.\n\n ignore_ties : bool, default=False\n Assume that there are no ties in y_score (which is likely to be the\n case if y_score is continuous) for efficiency gains.\n\n Returns\n -------\n normalized_discounted_cumulative_gain : ndarray of shape (n_samples,)\n The NDCG score for each sample (float in [0., 1.]).\n\n See Also\n --------\n dcg_score : Discounted Cumulative Gain (not normalized).\n\n \"\"\"\n gain = _dcg_sample_scores(y_true, y_score, k, ignore_ties=ignore_ties)\n normalizing_gain = _dcg_sample_scores(y_true, y_true, k, ignore_ties=True)\n all_irrelevant = normalizing_gain == 0\n gain[all_irrelevant] = 0\n gain[~all_irrelevant] /= normalizing_gain[~all_irrelevant]\n return gain" }, { @@ -120446,7 +129530,8 @@ "docstring": { "type": "ndarray", "description": "The true relevance scores." - } + }, + "refined_type": {} }, { "name": "y_score", @@ -120456,7 +129541,8 @@ "docstring": { "type": "ndarray", "description": "Predicted scores." - } + }, + "refined_type": {} }, { "name": "discount_cumsum", @@ -120466,13 +129552,14 @@ "docstring": { "type": "ndarray", "description": "Precomputed cumulative sum of the discounts." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Compute DCG by averaging over possible permutations of ties.\n\nThe gain (`y_true`) of an index falling inside a tied group (in the order induced by `y_score`) is replaced by the average gain within this group. The discounted gain for a tied group is then the average `y_true` within this group times the sum of discounts of the corresponding ranks. This amounts to averaging scores for all possible orderings of the tied groups. (note in the case of dcg@k the discount is 0 after index k)", - "docstring": "Compute DCG by averaging over possible permutations of ties.\n\nThe gain (`y_true`) of an index falling inside a tied group (in the order\ninduced by `y_score`) is replaced by the average gain within this group.\nThe discounted gain for a tied group is then the average `y_true` within\nthis group times the sum of discounts of the corresponding ranks.\n\nThis amounts to averaging scores for all possible orderings of the tied\ngroups.\n\n(note in the case of dcg@k the discount is 0 after index k)\n\nParameters\n----------\ny_true : ndarray\n The true relevance scores.\n\ny_score : ndarray\n Predicted scores.\n\ndiscount_cumsum : ndarray\n Precomputed cumulative sum of the discounts.\n\nReturns\n-------\ndiscounted_cumulative_gain : float\n The discounted cumulative gain.\n\nReferences\n----------\nMcSherry, F., & Najork, M. (2008, March). Computing information retrieval\nperformance measures efficiently in the presence of tied scores. In\nEuropean conference on information retrieval (pp. 414-421). 
Springer,\nBerlin, Heidelberg.", + "description": "Compute DCG by averaging over possible permutations of ties.\n\nThe gain (`y_true`) of an index falling inside a tied group (in the order\ninduced by `y_score`) is replaced by the average gain within this group.\nThe discounted gain for a tied group is then the average `y_true` within\nthis group times the sum of discounts of the corresponding ranks.\n\nThis amounts to averaging scores for all possible orderings of the tied\ngroups.\n\n(note in the case of dcg@k the discount is 0 after index k)", + "docstring": "\n Compute DCG by averaging over possible permutations of ties.\n\n The gain (`y_true`) of an index falling inside a tied group (in the order\n induced by `y_score`) is replaced by the average gain within this group.\n The discounted gain for a tied group is then the average `y_true` within\n this group times the sum of discounts of the corresponding ranks.\n\n This amounts to averaging scores for all possible orderings of the tied\n groups.\n\n (note in the case of dcg@k the discount is 0 after index k)\n\n Parameters\n ----------\n y_true : ndarray\n The true relevance scores.\n\n y_score : ndarray\n Predicted scores.\n\n discount_cumsum : ndarray\n Precomputed cumulative sum of the discounts.\n\n Returns\n -------\n discounted_cumulative_gain : float\n The discounted cumulative gain.\n\n References\n ----------\n McSherry, F., & Najork, M. (2008, March). Computing information retrieval\n performance measures efficiently in the presence of tied scores. In\n European conference on information retrieval (pp. 414-421). Springer,\n Berlin, Heidelberg.\n ", "source_code": "\ndef _tie_averaged_dcg(y_true, y_score, discount_cumsum):\n \"\"\"\n Compute DCG by averaging over possible permutations of ties.\n\n The gain (`y_true`) of an index falling inside a tied group (in the order\n induced by `y_score`) is replaced by the average gain within this group.\n The discounted gain for a tied group is then the average `y_true` within\n this group times the sum of discounts of the corresponding ranks.\n\n This amounts to averaging scores for all possible orderings of the tied\n groups.\n\n (note in the case of dcg@k the discount is 0 after index k)\n\n Parameters\n ----------\n y_true : ndarray\n The true relevance scores.\n\n y_score : ndarray\n Predicted scores.\n\n discount_cumsum : ndarray\n Precomputed cumulative sum of the discounts.\n\n Returns\n -------\n discounted_cumulative_gain : float\n The discounted cumulative gain.\n\n References\n ----------\n McSherry, F., & Najork, M. (2008, March). Computing information retrieval\n performance measures efficiently in the presence of tied scores. In\n European conference on information retrieval (pp. 414-421). Springer,\n Berlin, Heidelberg.\n \"\"\"\n (_, inv, counts) = np.unique(-y_score, return_inverse=True, return_counts=True)\n ranked = np.zeros(len(counts))\n np.add.at(ranked, inv, y_true)\n ranked /= counts\n groups = np.cumsum(counts) - 1\n discount_sums = np.empty(len(counts))\n discount_sums[0] = discount_cumsum[groups[0]]\n discount_sums[1:] = np.diff(discount_cumsum[groups])\n return (ranked * discount_sums).sum()" }, { @@ -120490,7 +129577,8 @@ "docstring": { "type": "ndarray of shape (n,)", "description": "x coordinates. These must be either monotonic increasing or monotonic\ndecreasing." - } + }, + "refined_type": {} }, { "name": "y", @@ -120500,13 +129588,14 @@ "docstring": { "type": "ndarray of shape, (n,)", "description": "y coordinates." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Compute Area Under the Curve (AUC) using the trapezoidal rule.\n\nThis is a general function, given points on a curve. For computing the area under the ROC-curve, see :func:`roc_auc_score`. For an alternative way to summarize a precision-recall curve, see :func:`average_precision_score`.", - "docstring": "Compute Area Under the Curve (AUC) using the trapezoidal rule.\n\nThis is a general function, given points on a curve. For computing the\narea under the ROC-curve, see :func:`roc_auc_score`. For an alternative\nway to summarize a precision-recall curve, see\n:func:`average_precision_score`.\n\nParameters\n----------\nx : ndarray of shape (n,)\n x coordinates. These must be either monotonic increasing or monotonic\n decreasing.\ny : ndarray of shape, (n,)\n y coordinates.\n\nReturns\n-------\nauc : float\n\nSee Also\n--------\nroc_auc_score : Compute the area under the ROC curve.\naverage_precision_score : Compute average precision from prediction scores.\nprecision_recall_curve : Compute precision-recall pairs for different\n probability thresholds.\n\nExamples\n--------\n>>> import numpy as np\n>>> from sklearn import metrics\n>>> y = np.array([1, 1, 2, 2])\n>>> pred = np.array([0.1, 0.4, 0.35, 0.8])\n>>> fpr, tpr, thresholds = metrics.roc_curve(y, pred, pos_label=2)\n>>> metrics.auc(fpr, tpr)\n0.75", + "description": "Compute Area Under the Curve (AUC) using the trapezoidal rule.\n\nThis is a general function, given points on a curve. For computing the\narea under the ROC-curve, see :func:`roc_auc_score`. For an alternative\nway to summarize a precision-recall curve, see\n:func:`average_precision_score`.", + "docstring": "Compute Area Under the Curve (AUC) using the trapezoidal rule.\n\n This is a general function, given points on a curve. For computing the\n area under the ROC-curve, see :func:`roc_auc_score`. For an alternative\n way to summarize a precision-recall curve, see\n :func:`average_precision_score`.\n\n Parameters\n ----------\n x : ndarray of shape (n,)\n x coordinates. These must be either monotonic increasing or monotonic\n decreasing.\n y : ndarray of shape, (n,)\n y coordinates.\n\n Returns\n -------\n auc : float\n\n See Also\n --------\n roc_auc_score : Compute the area under the ROC curve.\n average_precision_score : Compute average precision from prediction scores.\n precision_recall_curve : Compute precision-recall pairs for different\n probability thresholds.\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn import metrics\n >>> y = np.array([1, 1, 2, 2])\n >>> pred = np.array([0.1, 0.4, 0.35, 0.8])\n >>> fpr, tpr, thresholds = metrics.roc_curve(y, pred, pos_label=2)\n >>> metrics.auc(fpr, tpr)\n 0.75\n ", "source_code": "\ndef auc(x, y):\n \"\"\"Compute Area Under the Curve (AUC) using the trapezoidal rule.\n\n This is a general function, given points on a curve. For computing the\n area under the ROC-curve, see :func:`roc_auc_score`. For an alternative\n way to summarize a precision-recall curve, see\n :func:`average_precision_score`.\n\n Parameters\n ----------\n x : ndarray of shape (n,)\n x coordinates. 
These must be either monotonic increasing or monotonic\n decreasing.\n y : ndarray of shape, (n,)\n y coordinates.\n\n Returns\n -------\n auc : float\n\n See Also\n --------\n roc_auc_score : Compute the area under the ROC curve.\n average_precision_score : Compute average precision from prediction scores.\n precision_recall_curve : Compute precision-recall pairs for different\n probability thresholds.\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn import metrics\n >>> y = np.array([1, 1, 2, 2])\n >>> pred = np.array([0.1, 0.4, 0.35, 0.8])\n >>> fpr, tpr, thresholds = metrics.roc_curve(y, pred, pos_label=2)\n >>> metrics.auc(fpr, tpr)\n 0.75\n \"\"\"\n check_consistent_length(x, y)\n x = column_or_1d(x)\n y = column_or_1d(y)\n if x.shape[0] < 2:\n raise ValueError('At least 2 points are needed to compute area under curve, but x.shape = %s' % x.shape)\n direction = 1\n dx = np.diff(x)\n if np.any(dx < 0):\n if np.all(dx <= 0):\n direction = -1\n else:\n raise ValueError('x is neither increasing nor decreasing : {}.'.format(x))\n area = direction * np.trapz(y, x)\n if isinstance(area, np.memmap):\n area = area.dtype.type(area)\n return area" }, { @@ -120524,7 +129613,8 @@ "docstring": { "type": "ndarray of shape (n_samples,) or (n_samples, n_classes)", "description": "True binary labels or binary label indicators." - } + }, + "refined_type": {} }, { "name": "y_score", @@ -120534,7 +129624,8 @@ "docstring": { "type": "ndarray of shape (n_samples,) or (n_samples, n_classes)", "description": "Target scores, can either be probability estimates of the positive\nclass, confidence values, or non-thresholded measure of decisions\n(as returned by :term:`decision_function` on some classifiers)." - } + }, + "refined_type": {} }, { "name": "average", @@ -120544,6 +129635,10 @@ "docstring": { "type": "{'micro', 'samples', 'weighted', 'macro'} or None, default='macro'", "description": "If ``None``, the scores for each class are returned. Otherwise,\nthis determines the type of averaging performed on the data:\n\n``'micro'``:\n Calculate metrics globally by considering each element of the label\n indicator matrix as a label.\n``'macro'``:\n Calculate metrics for each label, and find their unweighted\n mean. This does not take label imbalance into account.\n``'weighted'``:\n Calculate metrics for each label, and find their average, weighted\n by support (the number of true instances for each label).\n``'samples'``:\n Calculate metrics for each instance, and find their average.\n\nWill be ignored when ``y_true`` is binary." + }, + "refined_type": { + "kind": "EnumType", + "values": ["samples", "micro", "weighted", "macro"] } }, { @@ -120554,7 +129649,8 @@ "docstring": { "type": "int or str, default=1", "description": "The label of the positive class. Only applied to binary ``y_true``.\nFor multilabel-indicator ``y_true``, ``pos_label`` is fixed to 1." - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -120564,13 +129660,14 @@ "docstring": { "type": "array-like of shape (n_samples,), default=None", "description": "Sample weights." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Compute average precision (AP) from prediction scores.\n\nAP summarizes a precision-recall curve as the weighted mean of precisions achieved at each threshold, with the increase in recall from the previous threshold used as the weight: .. 
math:: \\text{AP} = \\sum_n (R_n - R_{n-1}) P_n where :math:`P_n` and :math:`R_n` are the precision and recall at the nth threshold [1]_. This implementation is not interpolated and is different from computing the area under the precision-recall curve with the trapezoidal rule, which uses linear interpolation and can be too optimistic. Note: this implementation is restricted to the binary classification task or multilabel classification task. Read more in the :ref:`User Guide `.", - "docstring": "Compute average precision (AP) from prediction scores.\n\nAP summarizes a precision-recall curve as the weighted mean of precisions\nachieved at each threshold, with the increase in recall from the previous\nthreshold used as the weight:\n\n.. math::\n \\text{AP} = \\sum_n (R_n - R_{n-1}) P_n\n\nwhere :math:`P_n` and :math:`R_n` are the precision and recall at the nth\nthreshold [1]_. This implementation is not interpolated and is different\nfrom computing the area under the precision-recall curve with the\ntrapezoidal rule, which uses linear interpolation and can be too\noptimistic.\n\nNote: this implementation is restricted to the binary classification task\nor multilabel classification task.\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\ny_true : ndarray of shape (n_samples,) or (n_samples, n_classes)\n True binary labels or binary label indicators.\n\ny_score : ndarray of shape (n_samples,) or (n_samples, n_classes)\n Target scores, can either be probability estimates of the positive\n class, confidence values, or non-thresholded measure of decisions\n (as returned by :term:`decision_function` on some classifiers).\n\naverage : {'micro', 'samples', 'weighted', 'macro'} or None, default='macro'\n If ``None``, the scores for each class are returned. Otherwise,\n this determines the type of averaging performed on the data:\n\n ``'micro'``:\n Calculate metrics globally by considering each element of the label\n indicator matrix as a label.\n ``'macro'``:\n Calculate metrics for each label, and find their unweighted\n mean. This does not take label imbalance into account.\n ``'weighted'``:\n Calculate metrics for each label, and find their average, weighted\n by support (the number of true instances for each label).\n ``'samples'``:\n Calculate metrics for each instance, and find their average.\n\n Will be ignored when ``y_true`` is binary.\n\npos_label : int or str, default=1\n The label of the positive class. Only applied to binary ``y_true``.\n For multilabel-indicator ``y_true``, ``pos_label`` is fixed to 1.\n\nsample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\nReturns\n-------\naverage_precision : float\n\nSee Also\n--------\nroc_auc_score : Compute the area under the ROC curve.\nprecision_recall_curve : Compute precision-recall pairs for different\n probability thresholds.\n\nNotes\n-----\n.. versionchanged:: 0.19\n Instead of linearly interpolating between operating points, precisions\n are weighted by the change in recall since the last operating point.\n\nReferences\n----------\n.. 
[1] `Wikipedia entry for the Average precision\n `_\n\nExamples\n--------\n>>> import numpy as np\n>>> from sklearn.metrics import average_precision_score\n>>> y_true = np.array([0, 0, 1, 1])\n>>> y_scores = np.array([0.1, 0.4, 0.35, 0.8])\n>>> average_precision_score(y_true, y_scores)\n0.83...", + "description": "Compute average precision (AP) from prediction scores.\n\nAP summarizes a precision-recall curve as the weighted mean of precisions\nachieved at each threshold, with the increase in recall from the previous\nthreshold used as the weight:\n\n.. math::\n \\text{AP} = \\sum_n (R_n - R_{n-1}) P_n\n\nwhere :math:`P_n` and :math:`R_n` are the precision and recall at the nth\nthreshold [1]_. This implementation is not interpolated and is different\nfrom computing the area under the precision-recall curve with the\ntrapezoidal rule, which uses linear interpolation and can be too\noptimistic.\n\nNote: this implementation is restricted to the binary classification task\nor multilabel classification task.\n\nRead more in the :ref:`User Guide `.", + "docstring": "Compute average precision (AP) from prediction scores.\n\n AP summarizes a precision-recall curve as the weighted mean of precisions\n achieved at each threshold, with the increase in recall from the previous\n threshold used as the weight:\n\n .. math::\n \\text{AP} = \\sum_n (R_n - R_{n-1}) P_n\n\n where :math:`P_n` and :math:`R_n` are the precision and recall at the nth\n threshold [1]_. This implementation is not interpolated and is different\n from computing the area under the precision-recall curve with the\n trapezoidal rule, which uses linear interpolation and can be too\n optimistic.\n\n Note: this implementation is restricted to the binary classification task\n or multilabel classification task.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n y_true : ndarray of shape (n_samples,) or (n_samples, n_classes)\n True binary labels or binary label indicators.\n\n y_score : ndarray of shape (n_samples,) or (n_samples, n_classes)\n Target scores, can either be probability estimates of the positive\n class, confidence values, or non-thresholded measure of decisions\n (as returned by :term:`decision_function` on some classifiers).\n\n average : {'micro', 'samples', 'weighted', 'macro'} or None, default='macro'\n If ``None``, the scores for each class are returned. Otherwise,\n this determines the type of averaging performed on the data:\n\n ``'micro'``:\n Calculate metrics globally by considering each element of the label\n indicator matrix as a label.\n ``'macro'``:\n Calculate metrics for each label, and find their unweighted\n mean. This does not take label imbalance into account.\n ``'weighted'``:\n Calculate metrics for each label, and find their average, weighted\n by support (the number of true instances for each label).\n ``'samples'``:\n Calculate metrics for each instance, and find their average.\n\n Will be ignored when ``y_true`` is binary.\n\n pos_label : int or str, default=1\n The label of the positive class. Only applied to binary ``y_true``.\n For multilabel-indicator ``y_true``, ``pos_label`` is fixed to 1.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n Returns\n -------\n average_precision : float\n\n See Also\n --------\n roc_auc_score : Compute the area under the ROC curve.\n precision_recall_curve : Compute precision-recall pairs for different\n probability thresholds.\n\n Notes\n -----\n .. 
versionchanged:: 0.19\n Instead of linearly interpolating between operating points, precisions\n are weighted by the change in recall since the last operating point.\n\n References\n ----------\n .. [1] `Wikipedia entry for the Average precision\n `_\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.metrics import average_precision_score\n >>> y_true = np.array([0, 0, 1, 1])\n >>> y_scores = np.array([0.1, 0.4, 0.35, 0.8])\n >>> average_precision_score(y_true, y_scores)\n 0.83...\n ", "source_code": "\ndef average_precision_score(y_true, y_score, *, average='macro', pos_label=1, sample_weight=None):\n \"\"\"Compute average precision (AP) from prediction scores.\n\n AP summarizes a precision-recall curve as the weighted mean of precisions\n achieved at each threshold, with the increase in recall from the previous\n threshold used as the weight:\n\n .. math::\n \\text{AP} = \\sum_n (R_n - R_{n-1}) P_n\n\n where :math:`P_n` and :math:`R_n` are the precision and recall at the nth\n threshold [1]_. This implementation is not interpolated and is different\n from computing the area under the precision-recall curve with the\n trapezoidal rule, which uses linear interpolation and can be too\n optimistic.\n\n Note: this implementation is restricted to the binary classification task\n or multilabel classification task.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n y_true : ndarray of shape (n_samples,) or (n_samples, n_classes)\n True binary labels or binary label indicators.\n\n y_score : ndarray of shape (n_samples,) or (n_samples, n_classes)\n Target scores, can either be probability estimates of the positive\n class, confidence values, or non-thresholded measure of decisions\n (as returned by :term:`decision_function` on some classifiers).\n\n average : {'micro', 'samples', 'weighted', 'macro'} or None, default='macro'\n If ``None``, the scores for each class are returned. Otherwise,\n this determines the type of averaging performed on the data:\n\n ``'micro'``:\n Calculate metrics globally by considering each element of the label\n indicator matrix as a label.\n ``'macro'``:\n Calculate metrics for each label, and find their unweighted\n mean. This does not take label imbalance into account.\n ``'weighted'``:\n Calculate metrics for each label, and find their average, weighted\n by support (the number of true instances for each label).\n ``'samples'``:\n Calculate metrics for each instance, and find their average.\n\n Will be ignored when ``y_true`` is binary.\n\n pos_label : int or str, default=1\n The label of the positive class. Only applied to binary ``y_true``.\n For multilabel-indicator ``y_true``, ``pos_label`` is fixed to 1.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n Returns\n -------\n average_precision : float\n\n See Also\n --------\n roc_auc_score : Compute the area under the ROC curve.\n precision_recall_curve : Compute precision-recall pairs for different\n probability thresholds.\n\n Notes\n -----\n .. versionchanged:: 0.19\n Instead of linearly interpolating between operating points, precisions\n are weighted by the change in recall since the last operating point.\n\n References\n ----------\n .. 
[1] `Wikipedia entry for the Average precision\n `_\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.metrics import average_precision_score\n >>> y_true = np.array([0, 0, 1, 1])\n >>> y_scores = np.array([0.1, 0.4, 0.35, 0.8])\n >>> average_precision_score(y_true, y_scores)\n 0.83...\n \"\"\"\n \n def _binary_uninterpolated_average_precision(y_true, y_score, pos_label=1, sample_weight=None):\n (precision, recall, _) = precision_recall_curve(y_true, y_score, pos_label=pos_label, sample_weight=sample_weight)\n return -np.sum(np.diff(recall) * np.array(precision)[:-1])\n y_type = type_of_target(y_true)\n if y_type == 'multilabel-indicator' and pos_label != 1:\n raise ValueError('Parameter pos_label is fixed to 1 for multilabel-indicator y_true. Do not set pos_label or set pos_label to 1.')\n elif y_type == 'binary':\n present_labels = np.unique(y_true).tolist()\n if len(present_labels) == 2 and pos_label not in present_labels:\n raise ValueError(f'pos_label={pos_label} is not a valid label. It should be one of {present_labels}')\n average_precision = partial(_binary_uninterpolated_average_precision, pos_label=pos_label)\n return _average_binary_score(average_precision, y_true, y_score, average, sample_weight=sample_weight)" }, { @@ -120588,7 +129685,8 @@ "docstring": { "type": "ndarray of shape (n_samples, n_labels)", "description": "True binary labels in binary indicator format." - } + }, + "refined_type": {} }, { "name": "y_score", @@ -120598,7 +129696,8 @@ "docstring": { "type": "ndarray of shape (n_samples, n_labels)", "description": "Target scores, can either be probability estimates of the positive\nclass, confidence values, or non-thresholded measure of decisions\n(as returned by \"decision_function\" on some classifiers)." - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -120608,13 +129707,14 @@ "docstring": { "type": "array-like of shape (n_samples,), default=None", "description": "Sample weights." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Coverage error measure.\n\nCompute how far we need to go through the ranked scores to cover all true labels. The best value is equal to the average number of labels in ``y_true`` per sample. Ties in ``y_scores`` are broken by giving maximal rank that would have been assigned to all tied values. Note: Our implementation's score is 1 greater than the one given in Tsoumakas et al., 2010. This extends it to handle the degenerate case in which an instance has 0 true labels. Read more in the :ref:`User Guide `.", - "docstring": "Coverage error measure.\n\nCompute how far we need to go through the ranked scores to cover all\ntrue labels. The best value is equal to the average number\nof labels in ``y_true`` per sample.\n\nTies in ``y_scores`` are broken by giving maximal rank that would have\nbeen assigned to all tied values.\n\nNote: Our implementation's score is 1 greater than the one given in\nTsoumakas et al., 2010. 
This extends it to handle the degenerate case\nin which an instance has 0 true labels.\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\ny_true : ndarray of shape (n_samples, n_labels)\n True binary labels in binary indicator format.\n\ny_score : ndarray of shape (n_samples, n_labels)\n Target scores, can either be probability estimates of the positive\n class, confidence values, or non-thresholded measure of decisions\n (as returned by \"decision_function\" on some classifiers).\n\nsample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\nReturns\n-------\ncoverage_error : float\n\nReferences\n----------\n.. [1] Tsoumakas, G., Katakis, I., & Vlahavas, I. (2010).\n Mining multi-label data. In Data mining and knowledge discovery\n handbook (pp. 667-685). Springer US.", + "description": "Coverage error measure.\n\nCompute how far we need to go through the ranked scores to cover all\ntrue labels. The best value is equal to the average number\nof labels in ``y_true`` per sample.\n\nTies in ``y_scores`` are broken by giving maximal rank that would have\nbeen assigned to all tied values.\n\nNote: Our implementation's score is 1 greater than the one given in\nTsoumakas et al., 2010. This extends it to handle the degenerate case\nin which an instance has 0 true labels.\n\nRead more in the :ref:`User Guide `.", + "docstring": "Coverage error measure.\n\n Compute how far we need to go through the ranked scores to cover all\n true labels. The best value is equal to the average number\n of labels in ``y_true`` per sample.\n\n Ties in ``y_scores`` are broken by giving maximal rank that would have\n been assigned to all tied values.\n\n Note: Our implementation's score is 1 greater than the one given in\n Tsoumakas et al., 2010. This extends it to handle the degenerate case\n in which an instance has 0 true labels.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n y_true : ndarray of shape (n_samples, n_labels)\n True binary labels in binary indicator format.\n\n y_score : ndarray of shape (n_samples, n_labels)\n Target scores, can either be probability estimates of the positive\n class, confidence values, or non-thresholded measure of decisions\n (as returned by \"decision_function\" on some classifiers).\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n Returns\n -------\n coverage_error : float\n\n References\n ----------\n .. [1] Tsoumakas, G., Katakis, I., & Vlahavas, I. (2010).\n Mining multi-label data. In Data mining and knowledge discovery\n handbook (pp. 667-685). Springer US.\n\n ", "source_code": "\ndef coverage_error(y_true, y_score, *, sample_weight=None):\n \"\"\"Coverage error measure.\n\n Compute how far we need to go through the ranked scores to cover all\n true labels. The best value is equal to the average number\n of labels in ``y_true`` per sample.\n\n Ties in ``y_scores`` are broken by giving maximal rank that would have\n been assigned to all tied values.\n\n Note: Our implementation's score is 1 greater than the one given in\n Tsoumakas et al., 2010. 
This extends it to handle the degenerate case\n in which an instance has 0 true labels.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n y_true : ndarray of shape (n_samples, n_labels)\n True binary labels in binary indicator format.\n\n y_score : ndarray of shape (n_samples, n_labels)\n Target scores, can either be probability estimates of the positive\n class, confidence values, or non-thresholded measure of decisions\n (as returned by \"decision_function\" on some classifiers).\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n Returns\n -------\n coverage_error : float\n\n References\n ----------\n .. [1] Tsoumakas, G., Katakis, I., & Vlahavas, I. (2010).\n Mining multi-label data. In Data mining and knowledge discovery\n handbook (pp. 667-685). Springer US.\n\n \"\"\"\n y_true = check_array(y_true, ensure_2d=False)\n y_score = check_array(y_score, ensure_2d=False)\n check_consistent_length(y_true, y_score, sample_weight)\n y_type = type_of_target(y_true)\n if y_type != 'multilabel-indicator':\n raise ValueError('{0} format is not supported'.format(y_type))\n if y_true.shape != y_score.shape:\n raise ValueError('y_true and y_score have different shape')\n y_score_mask = np.ma.masked_array(y_score, mask=np.logical_not(y_true))\n y_min_relevant = y_score_mask.min(axis=1).reshape((-1, 1))\n coverage = (y_score >= y_min_relevant).sum(axis=1)\n coverage = coverage.filled(0)\n return np.average(coverage, weights=sample_weight)" }, { @@ -120632,7 +129732,8 @@ "docstring": { "type": "ndarray of shape (n_samples, n_labels)", "description": "True targets of multilabel classification, or true scores of entities\nto be ranked." - } + }, + "refined_type": {} }, { "name": "y_score", @@ -120642,7 +129743,8 @@ "docstring": { "type": "ndarray of shape (n_samples, n_labels)", "description": "Target scores, can either be probability estimates, confidence values,\nor non-thresholded measure of decisions (as returned by\n\"decision_function\" on some classifiers)." - } + }, + "refined_type": {} }, { "name": "k", @@ -120652,7 +129754,8 @@ "docstring": { "type": "int, default=None", "description": "Only consider the highest k scores in the ranking. If None, use all\noutputs." - } + }, + "refined_type": {} }, { "name": "log_base", @@ -120662,7 +129765,8 @@ "docstring": { "type": "float, default=2", "description": "Base of the logarithm used for the discount. A low value means a\nsharper discount (top results are more important)." - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -120672,7 +129776,8 @@ "docstring": { "type": "ndarray of shape (n_samples,), default=None", "description": "Sample weights. If `None`, all samples are given the same weight." - } + }, + "refined_type": {} }, { "name": "ignore_ties", @@ -120682,13 +129787,14 @@ "docstring": { "type": "bool, default=False", "description": "Assume that there are no ties in y_score (which is likely to be the\ncase if y_score is continuous) for efficiency gains." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Compute Discounted Cumulative Gain.\n\nSum the true scores ranked in the order induced by the predicted scores, after applying a logarithmic discount. This ranking metric yields a high value if true labels are ranked high by ``y_score``. 
Usually the Normalized Discounted Cumulative Gain (NDCG, computed by ndcg_score) is preferred.", - "docstring": "Compute Discounted Cumulative Gain.\n\nSum the true scores ranked in the order induced by the predicted scores,\nafter applying a logarithmic discount.\n\nThis ranking metric yields a high value if true labels are ranked high by\n``y_score``.\n\nUsually the Normalized Discounted Cumulative Gain (NDCG, computed by\nndcg_score) is preferred.\n\nParameters\n----------\ny_true : ndarray of shape (n_samples, n_labels)\n True targets of multilabel classification, or true scores of entities\n to be ranked.\n\ny_score : ndarray of shape (n_samples, n_labels)\n Target scores, can either be probability estimates, confidence values,\n or non-thresholded measure of decisions (as returned by\n \"decision_function\" on some classifiers).\n\nk : int, default=None\n Only consider the highest k scores in the ranking. If None, use all\n outputs.\n\nlog_base : float, default=2\n Base of the logarithm used for the discount. A low value means a\n sharper discount (top results are more important).\n\nsample_weight : ndarray of shape (n_samples,), default=None\n Sample weights. If `None`, all samples are given the same weight.\n\nignore_ties : bool, default=False\n Assume that there are no ties in y_score (which is likely to be the\n case if y_score is continuous) for efficiency gains.\n\nReturns\n-------\ndiscounted_cumulative_gain : float\n The averaged sample DCG scores.\n\nSee Also\n--------\nndcg_score : The Discounted Cumulative Gain divided by the Ideal Discounted\n Cumulative Gain (the DCG obtained for a perfect ranking), in order to\n have a score between 0 and 1.\n\nReferences\n----------\n`Wikipedia entry for Discounted Cumulative Gain\n`_.\n\nJarvelin, K., & Kekalainen, J. (2002).\nCumulated gain-based evaluation of IR techniques. ACM Transactions on\nInformation Systems (TOIS), 20(4), 422-446.\n\nWang, Y., Wang, L., Li, Y., He, D., Chen, W., & Liu, T. Y. (2013, May).\nA theoretical analysis of NDCG ranking measures. In Proceedings of the 26th\nAnnual Conference on Learning Theory (COLT 2013).\n\nMcSherry, F., & Najork, M. (2008, March). Computing information retrieval\nperformance measures efficiently in the presence of tied scores. In\nEuropean conference on information retrieval (pp. 414-421). Springer,\nBerlin, Heidelberg.\n\nExamples\n--------\n>>> import numpy as np\n>>> from sklearn.metrics import dcg_score\n>>> # we have groud-truth relevance of some answers to a query:\n>>> true_relevance = np.asarray([[10, 0, 0, 1, 5]])\n>>> # we predict scores for the answers\n>>> scores = np.asarray([[.1, .2, .3, 4, 70]])\n>>> dcg_score(true_relevance, scores)\n9.49...\n>>> # we can set k to truncate the sum; only top k answers contribute\n>>> dcg_score(true_relevance, scores, k=2)\n5.63...\n>>> # now we have some ties in our prediction\n>>> scores = np.asarray([[1, 0, 0, 0, 1]])\n>>> # by default ties are averaged, so here we get the average true\n>>> # relevance of our top predictions: (10 + 5) / 2 = 7.5\n>>> dcg_score(true_relevance, scores, k=1)\n7.5\n>>> # we can choose to ignore ties for faster results, but only\n>>> # if we know there aren't ties in our scores, otherwise we get\n>>> # wrong results:\n>>> dcg_score(true_relevance,\n... 
scores, k=1, ignore_ties=True)\n5.0", + "description": "Compute Discounted Cumulative Gain.\n\nSum the true scores ranked in the order induced by the predicted scores,\nafter applying a logarithmic discount.\n\nThis ranking metric yields a high value if true labels are ranked high by\n``y_score``.\n\nUsually the Normalized Discounted Cumulative Gain (NDCG, computed by\nndcg_score) is preferred.", + "docstring": "Compute Discounted Cumulative Gain.\n\n Sum the true scores ranked in the order induced by the predicted scores,\n after applying a logarithmic discount.\n\n This ranking metric yields a high value if true labels are ranked high by\n ``y_score``.\n\n Usually the Normalized Discounted Cumulative Gain (NDCG, computed by\n ndcg_score) is preferred.\n\n Parameters\n ----------\n y_true : ndarray of shape (n_samples, n_labels)\n True targets of multilabel classification, or true scores of entities\n to be ranked.\n\n y_score : ndarray of shape (n_samples, n_labels)\n Target scores, can either be probability estimates, confidence values,\n or non-thresholded measure of decisions (as returned by\n \"decision_function\" on some classifiers).\n\n k : int, default=None\n Only consider the highest k scores in the ranking. If None, use all\n outputs.\n\n log_base : float, default=2\n Base of the logarithm used for the discount. A low value means a\n sharper discount (top results are more important).\n\n sample_weight : ndarray of shape (n_samples,), default=None\n Sample weights. If `None`, all samples are given the same weight.\n\n ignore_ties : bool, default=False\n Assume that there are no ties in y_score (which is likely to be the\n case if y_score is continuous) for efficiency gains.\n\n Returns\n -------\n discounted_cumulative_gain : float\n The averaged sample DCG scores.\n\n See Also\n --------\n ndcg_score : The Discounted Cumulative Gain divided by the Ideal Discounted\n Cumulative Gain (the DCG obtained for a perfect ranking), in order to\n have a score between 0 and 1.\n\n References\n ----------\n `Wikipedia entry for Discounted Cumulative Gain\n `_.\n\n Jarvelin, K., & Kekalainen, J. (2002).\n Cumulated gain-based evaluation of IR techniques. ACM Transactions on\n Information Systems (TOIS), 20(4), 422-446.\n\n Wang, Y., Wang, L., Li, Y., He, D., Chen, W., & Liu, T. Y. (2013, May).\n A theoretical analysis of NDCG ranking measures. In Proceedings of the 26th\n Annual Conference on Learning Theory (COLT 2013).\n\n McSherry, F., & Najork, M. (2008, March). Computing information retrieval\n performance measures efficiently in the presence of tied scores. In\n European conference on information retrieval (pp. 414-421). 
Springer,\n Berlin, Heidelberg.\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.metrics import dcg_score\n >>> # we have groud-truth relevance of some answers to a query:\n >>> true_relevance = np.asarray([[10, 0, 0, 1, 5]])\n >>> # we predict scores for the answers\n >>> scores = np.asarray([[.1, .2, .3, 4, 70]])\n >>> dcg_score(true_relevance, scores)\n 9.49...\n >>> # we can set k to truncate the sum; only top k answers contribute\n >>> dcg_score(true_relevance, scores, k=2)\n 5.63...\n >>> # now we have some ties in our prediction\n >>> scores = np.asarray([[1, 0, 0, 0, 1]])\n >>> # by default ties are averaged, so here we get the average true\n >>> # relevance of our top predictions: (10 + 5) / 2 = 7.5\n >>> dcg_score(true_relevance, scores, k=1)\n 7.5\n >>> # we can choose to ignore ties for faster results, but only\n >>> # if we know there aren't ties in our scores, otherwise we get\n >>> # wrong results:\n >>> dcg_score(true_relevance,\n ... scores, k=1, ignore_ties=True)\n 5.0\n\n ", "source_code": "\ndef dcg_score(y_true, y_score, *, k=None, log_base=2, sample_weight=None, ignore_ties=False):\n \"\"\"Compute Discounted Cumulative Gain.\n\n Sum the true scores ranked in the order induced by the predicted scores,\n after applying a logarithmic discount.\n\n This ranking metric yields a high value if true labels are ranked high by\n ``y_score``.\n\n Usually the Normalized Discounted Cumulative Gain (NDCG, computed by\n ndcg_score) is preferred.\n\n Parameters\n ----------\n y_true : ndarray of shape (n_samples, n_labels)\n True targets of multilabel classification, or true scores of entities\n to be ranked.\n\n y_score : ndarray of shape (n_samples, n_labels)\n Target scores, can either be probability estimates, confidence values,\n or non-thresholded measure of decisions (as returned by\n \"decision_function\" on some classifiers).\n\n k : int, default=None\n Only consider the highest k scores in the ranking. If None, use all\n outputs.\n\n log_base : float, default=2\n Base of the logarithm used for the discount. A low value means a\n sharper discount (top results are more important).\n\n sample_weight : ndarray of shape (n_samples,), default=None\n Sample weights. If `None`, all samples are given the same weight.\n\n ignore_ties : bool, default=False\n Assume that there are no ties in y_score (which is likely to be the\n case if y_score is continuous) for efficiency gains.\n\n Returns\n -------\n discounted_cumulative_gain : float\n The averaged sample DCG scores.\n\n See Also\n --------\n ndcg_score : The Discounted Cumulative Gain divided by the Ideal Discounted\n Cumulative Gain (the DCG obtained for a perfect ranking), in order to\n have a score between 0 and 1.\n\n References\n ----------\n `Wikipedia entry for Discounted Cumulative Gain\n `_.\n\n Jarvelin, K., & Kekalainen, J. (2002).\n Cumulated gain-based evaluation of IR techniques. ACM Transactions on\n Information Systems (TOIS), 20(4), 422-446.\n\n Wang, Y., Wang, L., Li, Y., He, D., Chen, W., & Liu, T. Y. (2013, May).\n A theoretical analysis of NDCG ranking measures. In Proceedings of the 26th\n Annual Conference on Learning Theory (COLT 2013).\n\n McSherry, F., & Najork, M. (2008, March). Computing information retrieval\n performance measures efficiently in the presence of tied scores. In\n European conference on information retrieval (pp. 414-421). 
Springer,\n Berlin, Heidelberg.\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.metrics import dcg_score\n >>> # we have groud-truth relevance of some answers to a query:\n >>> true_relevance = np.asarray([[10, 0, 0, 1, 5]])\n >>> # we predict scores for the answers\n >>> scores = np.asarray([[.1, .2, .3, 4, 70]])\n >>> dcg_score(true_relevance, scores)\n 9.49...\n >>> # we can set k to truncate the sum; only top k answers contribute\n >>> dcg_score(true_relevance, scores, k=2)\n 5.63...\n >>> # now we have some ties in our prediction\n >>> scores = np.asarray([[1, 0, 0, 0, 1]])\n >>> # by default ties are averaged, so here we get the average true\n >>> # relevance of our top predictions: (10 + 5) / 2 = 7.5\n >>> dcg_score(true_relevance, scores, k=1)\n 7.5\n >>> # we can choose to ignore ties for faster results, but only\n >>> # if we know there aren't ties in our scores, otherwise we get\n >>> # wrong results:\n >>> dcg_score(true_relevance,\n ... scores, k=1, ignore_ties=True)\n 5.0\n\n \"\"\"\n y_true = check_array(y_true, ensure_2d=False)\n y_score = check_array(y_score, ensure_2d=False)\n check_consistent_length(y_true, y_score, sample_weight)\n _check_dcg_target_type(y_true)\n return np.average(_dcg_sample_scores(y_true, y_score, k=k, log_base=log_base, ignore_ties=ignore_ties), weights=sample_weight)" }, { @@ -120706,6 +129812,10 @@ "docstring": { "type": "ndarray of shape (n_samples,)", "description": "True binary labels. If labels are not either {-1, 1} or {0, 1}, then\npos_label should be explicitly given." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -120716,7 +129826,8 @@ "docstring": { "type": "ndarray of shape of (n_samples,)", "description": "Target scores, can either be probability estimates of the positive\nclass, confidence values, or non-thresholded measure of decisions\n(as returned by \"decision_function\" on some classifiers)." - } + }, + "refined_type": {} }, { "name": "pos_label", @@ -120726,6 +129837,10 @@ "docstring": { "type": "int or str, default=None", "description": "The label of the positive class.\nWhen ``pos_label=None``, if `y_true` is in {-1, 1} or {0, 1},\n``pos_label`` is set to 1, otherwise an error will be raised." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -120736,13 +129851,14 @@ "docstring": { "type": "array-like of shape (n_samples,), default=None", "description": "Sample weights." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Compute error rates for different probability thresholds.\n\n.. note:: This metric is used for evaluation of ranking and error tradeoffs of a binary classification task. Read more in the :ref:`User Guide `. .. versionadded:: 0.24", - "docstring": "Compute error rates for different probability thresholds.\n\n.. note::\n This metric is used for evaluation of ranking and error tradeoffs of\n a binary classification task.\n\nRead more in the :ref:`User Guide `.\n\n.. versionadded:: 0.24\n\nParameters\n----------\ny_true : ndarray of shape (n_samples,)\n True binary labels. 
If labels are not either {-1, 1} or {0, 1}, then\n pos_label should be explicitly given.\n\ny_score : ndarray of shape of (n_samples,)\n Target scores, can either be probability estimates of the positive\n class, confidence values, or non-thresholded measure of decisions\n (as returned by \"decision_function\" on some classifiers).\n\npos_label : int or str, default=None\n The label of the positive class.\n When ``pos_label=None``, if `y_true` is in {-1, 1} or {0, 1},\n ``pos_label`` is set to 1, otherwise an error will be raised.\n\nsample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\nReturns\n-------\nfpr : ndarray of shape (n_thresholds,)\n False positive rate (FPR) such that element i is the false positive\n rate of predictions with score >= thresholds[i]. This is occasionally\n referred to as false acceptance propability or fall-out.\n\nfnr : ndarray of shape (n_thresholds,)\n False negative rate (FNR) such that element i is the false negative\n rate of predictions with score >= thresholds[i]. This is occasionally\n referred to as false rejection or miss rate.\n\nthresholds : ndarray of shape (n_thresholds,)\n Decreasing score values.\n\nSee Also\n--------\nDetCurveDisplay.from_estimator : Plot DET curve given an estimator and\n some data.\nDetCurveDisplay.from_predictions : Plot DET curve given the true and\n predicted labels.\nDetCurveDisplay : DET curve visualization.\nroc_curve : Compute Receiver operating characteristic (ROC) curve.\nprecision_recall_curve : Compute precision-recall curve.\n\nExamples\n--------\n>>> import numpy as np\n>>> from sklearn.metrics import det_curve\n>>> y_true = np.array([0, 0, 1, 1])\n>>> y_scores = np.array([0.1, 0.4, 0.35, 0.8])\n>>> fpr, fnr, thresholds = det_curve(y_true, y_scores)\n>>> fpr\narray([0.5, 0.5, 0. ])\n>>> fnr\narray([0. , 0.5, 0.5])\n>>> thresholds\narray([0.35, 0.4 , 0.8 ])", + "description": "Compute error rates for different probability thresholds.\n\n.. note::\n This metric is used for evaluation of ranking and error tradeoffs of\n a binary classification task.\n\nRead more in the :ref:`User Guide `.\n\n.. versionadded:: 0.24", + "docstring": "Compute error rates for different probability thresholds.\n\n .. note::\n This metric is used for evaluation of ranking and error tradeoffs of\n a binary classification task.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.24\n\n Parameters\n ----------\n y_true : ndarray of shape (n_samples,)\n True binary labels. If labels are not either {-1, 1} or {0, 1}, then\n pos_label should be explicitly given.\n\n y_score : ndarray of shape of (n_samples,)\n Target scores, can either be probability estimates of the positive\n class, confidence values, or non-thresholded measure of decisions\n (as returned by \"decision_function\" on some classifiers).\n\n pos_label : int or str, default=None\n The label of the positive class.\n When ``pos_label=None``, if `y_true` is in {-1, 1} or {0, 1},\n ``pos_label`` is set to 1, otherwise an error will be raised.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n Returns\n -------\n fpr : ndarray of shape (n_thresholds,)\n False positive rate (FPR) such that element i is the false positive\n rate of predictions with score >= thresholds[i]. 
This is occasionally\n referred to as false acceptance propability or fall-out.\n\n fnr : ndarray of shape (n_thresholds,)\n False negative rate (FNR) such that element i is the false negative\n rate of predictions with score >= thresholds[i]. This is occasionally\n referred to as false rejection or miss rate.\n\n thresholds : ndarray of shape (n_thresholds,)\n Decreasing score values.\n\n See Also\n --------\n DetCurveDisplay.from_estimator : Plot DET curve given an estimator and\n some data.\n DetCurveDisplay.from_predictions : Plot DET curve given the true and\n predicted labels.\n DetCurveDisplay : DET curve visualization.\n roc_curve : Compute Receiver operating characteristic (ROC) curve.\n precision_recall_curve : Compute precision-recall curve.\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.metrics import det_curve\n >>> y_true = np.array([0, 0, 1, 1])\n >>> y_scores = np.array([0.1, 0.4, 0.35, 0.8])\n >>> fpr, fnr, thresholds = det_curve(y_true, y_scores)\n >>> fpr\n array([0.5, 0.5, 0. ])\n >>> fnr\n array([0. , 0.5, 0.5])\n >>> thresholds\n array([0.35, 0.4 , 0.8 ])\n ", "source_code": "\ndef det_curve(y_true, y_score, pos_label=None, sample_weight=None):\n \"\"\"Compute error rates for different probability thresholds.\n\n .. note::\n This metric is used for evaluation of ranking and error tradeoffs of\n a binary classification task.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.24\n\n Parameters\n ----------\n y_true : ndarray of shape (n_samples,)\n True binary labels. If labels are not either {-1, 1} or {0, 1}, then\n pos_label should be explicitly given.\n\n y_score : ndarray of shape of (n_samples,)\n Target scores, can either be probability estimates of the positive\n class, confidence values, or non-thresholded measure of decisions\n (as returned by \"decision_function\" on some classifiers).\n\n pos_label : int or str, default=None\n The label of the positive class.\n When ``pos_label=None``, if `y_true` is in {-1, 1} or {0, 1},\n ``pos_label`` is set to 1, otherwise an error will be raised.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n Returns\n -------\n fpr : ndarray of shape (n_thresholds,)\n False positive rate (FPR) such that element i is the false positive\n rate of predictions with score >= thresholds[i]. This is occasionally\n referred to as false acceptance propability or fall-out.\n\n fnr : ndarray of shape (n_thresholds,)\n False negative rate (FNR) such that element i is the false negative\n rate of predictions with score >= thresholds[i]. This is occasionally\n referred to as false rejection or miss rate.\n\n thresholds : ndarray of shape (n_thresholds,)\n Decreasing score values.\n\n See Also\n --------\n DetCurveDisplay.from_estimator : Plot DET curve given an estimator and\n some data.\n DetCurveDisplay.from_predictions : Plot DET curve given the true and\n predicted labels.\n DetCurveDisplay : DET curve visualization.\n roc_curve : Compute Receiver operating characteristic (ROC) curve.\n precision_recall_curve : Compute precision-recall curve.\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.metrics import det_curve\n >>> y_true = np.array([0, 0, 1, 1])\n >>> y_scores = np.array([0.1, 0.4, 0.35, 0.8])\n >>> fpr, fnr, thresholds = det_curve(y_true, y_scores)\n >>> fpr\n array([0.5, 0.5, 0. ])\n >>> fnr\n array([0. 
, 0.5, 0.5])\n >>> thresholds\n array([0.35, 0.4 , 0.8 ])\n \"\"\"\n (fps, tps, thresholds) = _binary_clf_curve(y_true, y_score, pos_label=pos_label, sample_weight=sample_weight)\n if len(np.unique(y_true)) != 2:\n raise ValueError('Only one class present in y_true. Detection error tradeoff curve is not defined in that case.')\n fns = tps[-1] - tps\n p_count = tps[-1]\n n_count = fps[-1]\n first_ind = fps.searchsorted(fps[0], side='right') - 1 if fps.searchsorted(fps[0], side='right') > 0 else None\n last_ind = tps.searchsorted(tps[-1]) + 1\n sl = slice(first_ind, last_ind)\n return fps[sl][::-1] / n_count, fns[sl][::-1] / p_count, thresholds[sl][::-1]" }, { @@ -120760,6 +129876,10 @@ "docstring": { "type": "{ndarray, sparse matrix} of shape (n_samples, n_labels)", "description": "True binary labels in binary indicator format." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -120770,7 +129890,8 @@ "docstring": { "type": "ndarray of shape (n_samples, n_labels)", "description": "Target scores, can either be probability estimates of the positive\nclass, confidence values, or non-thresholded measure of decisions\n(as returned by \"decision_function\" on some classifiers)." - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -120780,13 +129901,14 @@ "docstring": { "type": "array-like of shape (n_samples,), default=None", "description": "Sample weights.\n\n.. versionadded:: 0.20" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Compute ranking-based average precision.\n\nLabel ranking average precision (LRAP) is the average over each ground truth label assigned to each sample, of the ratio of true vs. total labels with lower score. This metric is used in multilabel ranking problem, where the goal is to give better rank to the labels associated to each sample. The obtained score is always strictly greater than 0 and the best value is 1. Read more in the :ref:`User Guide `.", - "docstring": "Compute ranking-based average precision.\n\nLabel ranking average precision (LRAP) is the average over each ground\ntruth label assigned to each sample, of the ratio of true vs. total\nlabels with lower score.\n\nThis metric is used in multilabel ranking problem, where the goal\nis to give better rank to the labels associated to each sample.\n\nThe obtained score is always strictly greater than 0 and\nthe best value is 1.\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\ny_true : {ndarray, sparse matrix} of shape (n_samples, n_labels)\n True binary labels in binary indicator format.\n\ny_score : ndarray of shape (n_samples, n_labels)\n Target scores, can either be probability estimates of the positive\n class, confidence values, or non-thresholded measure of decisions\n (as returned by \"decision_function\" on some classifiers).\n\nsample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n .. versionadded:: 0.20\n\nReturns\n-------\nscore : float\n\nExamples\n--------\n>>> import numpy as np\n>>> from sklearn.metrics import label_ranking_average_precision_score\n>>> y_true = np.array([[1, 0, 0], [0, 0, 1]])\n>>> y_score = np.array([[0.75, 0.5, 1], [1, 0.2, 0.1]])\n>>> label_ranking_average_precision_score(y_true, y_score)\n0.416...", + "description": "Compute ranking-based average precision.\n\nLabel ranking average precision (LRAP) is the average over each ground\ntruth label assigned to each sample, of the ratio of true vs. 
total\nlabels with lower score.\n\nThis metric is used in multilabel ranking problem, where the goal\nis to give better rank to the labels associated to each sample.\n\nThe obtained score is always strictly greater than 0 and\nthe best value is 1.\n\nRead more in the :ref:`User Guide `.", + "docstring": "Compute ranking-based average precision.\n\n Label ranking average precision (LRAP) is the average over each ground\n truth label assigned to each sample, of the ratio of true vs. total\n labels with lower score.\n\n This metric is used in multilabel ranking problem, where the goal\n is to give better rank to the labels associated to each sample.\n\n The obtained score is always strictly greater than 0 and\n the best value is 1.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n y_true : {ndarray, sparse matrix} of shape (n_samples, n_labels)\n True binary labels in binary indicator format.\n\n y_score : ndarray of shape (n_samples, n_labels)\n Target scores, can either be probability estimates of the positive\n class, confidence values, or non-thresholded measure of decisions\n (as returned by \"decision_function\" on some classifiers).\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n .. versionadded:: 0.20\n\n Returns\n -------\n score : float\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.metrics import label_ranking_average_precision_score\n >>> y_true = np.array([[1, 0, 0], [0, 0, 1]])\n >>> y_score = np.array([[0.75, 0.5, 1], [1, 0.2, 0.1]])\n >>> label_ranking_average_precision_score(y_true, y_score)\n 0.416...\n\n ", "source_code": "\ndef label_ranking_average_precision_score(y_true, y_score, *, sample_weight=None):\n \"\"\"Compute ranking-based average precision.\n\n Label ranking average precision (LRAP) is the average over each ground\n truth label assigned to each sample, of the ratio of true vs. total\n labels with lower score.\n\n This metric is used in multilabel ranking problem, where the goal\n is to give better rank to the labels associated to each sample.\n\n The obtained score is always strictly greater than 0 and\n the best value is 1.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n y_true : {ndarray, sparse matrix} of shape (n_samples, n_labels)\n True binary labels in binary indicator format.\n\n y_score : ndarray of shape (n_samples, n_labels)\n Target scores, can either be probability estimates of the positive\n class, confidence values, or non-thresholded measure of decisions\n (as returned by \"decision_function\" on some classifiers).\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n .. 
versionadded:: 0.20\n\n Returns\n -------\n score : float\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.metrics import label_ranking_average_precision_score\n >>> y_true = np.array([[1, 0, 0], [0, 0, 1]])\n >>> y_score = np.array([[0.75, 0.5, 1], [1, 0.2, 0.1]])\n >>> label_ranking_average_precision_score(y_true, y_score)\n 0.416...\n\n \"\"\"\n check_consistent_length(y_true, y_score, sample_weight)\n y_true = check_array(y_true, ensure_2d=False)\n y_score = check_array(y_score, ensure_2d=False)\n if y_true.shape != y_score.shape:\n raise ValueError('y_true and y_score have different shape')\n y_type = type_of_target(y_true)\n if y_type != 'multilabel-indicator' and not (y_type == 'binary' and y_true.ndim == 2):\n raise ValueError('{0} format is not supported'.format(y_type))\n y_true = csr_matrix(y_true)\n y_score = -y_score\n (n_samples, n_labels) = y_true.shape\n out = 0.0\n for (i, (start, stop)) in enumerate(zip(y_true.indptr, y_true.indptr[1:])):\n relevant = y_true.indices[start:stop]\n if relevant.size == 0 or relevant.size == n_labels:\n aux = 1.0\n else:\n scores_i = y_score[i]\n rank = rankdata(scores_i, 'max')[relevant]\n L = rankdata(scores_i[relevant], 'max')\n aux = (L / rank).mean()\n if sample_weight is not None:\n aux = aux * sample_weight[i]\n out += aux\n if sample_weight is None:\n out /= n_samples\n else:\n out /= np.sum(sample_weight)\n return out" }, { @@ -120804,6 +129926,10 @@ "docstring": { "type": "{ndarray, sparse matrix} of shape (n_samples, n_labels)", "description": "True binary labels in binary indicator format." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -120814,7 +129940,8 @@ "docstring": { "type": "ndarray of shape (n_samples, n_labels)", "description": "Target scores, can either be probability estimates of the positive\nclass, confidence values, or non-thresholded measure of decisions\n(as returned by \"decision_function\" on some classifiers)." - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -120824,13 +129951,14 @@ "docstring": { "type": "array-like of shape (n_samples,), default=None", "description": "Sample weights." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Compute Ranking loss measure.\n\nCompute the average number of label pairs that are incorrectly ordered given y_score weighted by the size of the label set and the number of labels not in the label set. This is similar to the error set size, but weighted by the number of relevant and irrelevant labels. The best performance is achieved with a ranking loss of zero. Read more in the :ref:`User Guide `. .. versionadded:: 0.17 A function *label_ranking_loss*", - "docstring": "Compute Ranking loss measure.\n\nCompute the average number of label pairs that are incorrectly ordered\ngiven y_score weighted by the size of the label set and the number of\nlabels not in the label set.\n\nThis is similar to the error set size, but weighted by the number of\nrelevant and irrelevant labels. The best performance is achieved with\na ranking loss of zero.\n\nRead more in the :ref:`User Guide `.\n\n.. 
versionadded:: 0.17\n A function *label_ranking_loss*\n\nParameters\n----------\ny_true : {ndarray, sparse matrix} of shape (n_samples, n_labels)\n True binary labels in binary indicator format.\n\ny_score : ndarray of shape (n_samples, n_labels)\n Target scores, can either be probability estimates of the positive\n class, confidence values, or non-thresholded measure of decisions\n (as returned by \"decision_function\" on some classifiers).\n\nsample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\nReturns\n-------\nloss : float\n\nReferences\n----------\n.. [1] Tsoumakas, G., Katakis, I., & Vlahavas, I. (2010).\n Mining multi-label data. In Data mining and knowledge discovery\n handbook (pp. 667-685). Springer US.", + "description": "Compute Ranking loss measure.\n\nCompute the average number of label pairs that are incorrectly ordered\ngiven y_score weighted by the size of the label set and the number of\nlabels not in the label set.\n\nThis is similar to the error set size, but weighted by the number of\nrelevant and irrelevant labels. The best performance is achieved with\na ranking loss of zero.\n\nRead more in the :ref:`User Guide `.\n\n.. versionadded:: 0.17\n A function *label_ranking_loss*", + "docstring": "Compute Ranking loss measure.\n\n Compute the average number of label pairs that are incorrectly ordered\n given y_score weighted by the size of the label set and the number of\n labels not in the label set.\n\n This is similar to the error set size, but weighted by the number of\n relevant and irrelevant labels. The best performance is achieved with\n a ranking loss of zero.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.17\n A function *label_ranking_loss*\n\n Parameters\n ----------\n y_true : {ndarray, sparse matrix} of shape (n_samples, n_labels)\n True binary labels in binary indicator format.\n\n y_score : ndarray of shape (n_samples, n_labels)\n Target scores, can either be probability estimates of the positive\n class, confidence values, or non-thresholded measure of decisions\n (as returned by \"decision_function\" on some classifiers).\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n Returns\n -------\n loss : float\n\n References\n ----------\n .. [1] Tsoumakas, G., Katakis, I., & Vlahavas, I. (2010).\n Mining multi-label data. In Data mining and knowledge discovery\n handbook (pp. 667-685). Springer US.\n ", "source_code": "\ndef label_ranking_loss(y_true, y_score, *, sample_weight=None):\n \"\"\"Compute Ranking loss measure.\n\n Compute the average number of label pairs that are incorrectly ordered\n given y_score weighted by the size of the label set and the number of\n labels not in the label set.\n\n This is similar to the error set size, but weighted by the number of\n relevant and irrelevant labels. The best performance is achieved with\n a ranking loss of zero.\n\n Read more in the :ref:`User Guide `.\n\n .. 
versionadded:: 0.17\n A function *label_ranking_loss*\n\n Parameters\n ----------\n y_true : {ndarray, sparse matrix} of shape (n_samples, n_labels)\n True binary labels in binary indicator format.\n\n y_score : ndarray of shape (n_samples, n_labels)\n Target scores, can either be probability estimates of the positive\n class, confidence values, or non-thresholded measure of decisions\n (as returned by \"decision_function\" on some classifiers).\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n Returns\n -------\n loss : float\n\n References\n ----------\n .. [1] Tsoumakas, G., Katakis, I., & Vlahavas, I. (2010).\n Mining multi-label data. In Data mining and knowledge discovery\n handbook (pp. 667-685). Springer US.\n \"\"\"\n y_true = check_array(y_true, ensure_2d=False, accept_sparse='csr')\n y_score = check_array(y_score, ensure_2d=False)\n check_consistent_length(y_true, y_score, sample_weight)\n y_type = type_of_target(y_true)\n if y_type not in ('multilabel-indicator', ):\n raise ValueError('{0} format is not supported'.format(y_type))\n if y_true.shape != y_score.shape:\n raise ValueError('y_true and y_score have different shape')\n (n_samples, n_labels) = y_true.shape\n y_true = csr_matrix(y_true)\n loss = np.zeros(n_samples)\n for (i, (start, stop)) in enumerate(zip(y_true.indptr, y_true.indptr[1:])):\n (unique_scores, unique_inverse) = np.unique(y_score[i], return_inverse=True)\n true_at_reversed_rank = np.bincount(unique_inverse[y_true.indices[start:stop]], minlength=len(unique_scores))\n all_at_reversed_rank = np.bincount(unique_inverse, minlength=len(unique_scores))\n false_at_reversed_rank = all_at_reversed_rank - true_at_reversed_rank\n loss[i] = np.dot(true_at_reversed_rank.cumsum(), false_at_reversed_rank)\n n_positives = count_nonzero(y_true, axis=1)\n with np.errstate(divide='ignore', invalid='ignore'):\n loss /= (n_labels - n_positives) * n_positives\n loss[np.logical_or(n_positives == 0, n_positives == n_labels)] = 0.0\n return np.average(loss, weights=sample_weight)" }, { @@ -120848,7 +129976,8 @@ "docstring": { "type": "ndarray of shape (n_samples, n_labels)", "description": "True targets of multilabel classification, or true scores of entities\nto be ranked." - } + }, + "refined_type": {} }, { "name": "y_score", @@ -120858,7 +129987,8 @@ "docstring": { "type": "ndarray of shape (n_samples, n_labels)", "description": "Target scores, can either be probability estimates, confidence values,\nor non-thresholded measure of decisions (as returned by\n\"decision_function\" on some classifiers)." - } + }, + "refined_type": {} }, { "name": "k", @@ -120868,7 +129998,8 @@ "docstring": { "type": "int, default=None", "description": "Only consider the highest k scores in the ranking. If `None`, use all\noutputs." - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -120878,7 +130009,8 @@ "docstring": { "type": "ndarray of shape (n_samples,), default=None", "description": "Sample weights. If `None`, all samples are given the same weight." - } + }, + "refined_type": {} }, { "name": "ignore_ties", @@ -120888,13 +130020,14 @@ "docstring": { "type": "bool, default=False", "description": "Assume that there are no ties in y_score (which is likely to be the\ncase if y_score is continuous) for efficiency gains." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Compute Normalized Discounted Cumulative Gain.\n\nSum the true scores ranked in the order induced by the predicted scores, after applying a logarithmic discount. Then divide by the best possible score (Ideal DCG, obtained for a perfect ranking) to obtain a score between 0 and 1. This ranking metric yields a high value if true labels are ranked high by ``y_score``.", - "docstring": "Compute Normalized Discounted Cumulative Gain.\n\nSum the true scores ranked in the order induced by the predicted scores,\nafter applying a logarithmic discount. Then divide by the best possible\nscore (Ideal DCG, obtained for a perfect ranking) to obtain a score between\n0 and 1.\n\nThis ranking metric yields a high value if true labels are ranked high by\n``y_score``.\n\nParameters\n----------\ny_true : ndarray of shape (n_samples, n_labels)\n True targets of multilabel classification, or true scores of entities\n to be ranked.\n\ny_score : ndarray of shape (n_samples, n_labels)\n Target scores, can either be probability estimates, confidence values,\n or non-thresholded measure of decisions (as returned by\n \"decision_function\" on some classifiers).\n\nk : int, default=None\n Only consider the highest k scores in the ranking. If `None`, use all\n outputs.\n\nsample_weight : ndarray of shape (n_samples,), default=None\n Sample weights. If `None`, all samples are given the same weight.\n\nignore_ties : bool, default=False\n Assume that there are no ties in y_score (which is likely to be the\n case if y_score is continuous) for efficiency gains.\n\nReturns\n-------\nnormalized_discounted_cumulative_gain : float in [0., 1.]\n The averaged NDCG scores for all samples.\n\nSee Also\n--------\ndcg_score : Discounted Cumulative Gain (not normalized).\n\nReferences\n----------\n`Wikipedia entry for Discounted Cumulative Gain\n`_\n\nJarvelin, K., & Kekalainen, J. (2002).\nCumulated gain-based evaluation of IR techniques. ACM Transactions on\nInformation Systems (TOIS), 20(4), 422-446.\n\nWang, Y., Wang, L., Li, Y., He, D., Chen, W., & Liu, T. Y. (2013, May).\nA theoretical analysis of NDCG ranking measures. In Proceedings of the 26th\nAnnual Conference on Learning Theory (COLT 2013)\n\nMcSherry, F., & Najork, M. (2008, March). Computing information retrieval\nperformance measures efficiently in the presence of tied scores. In\nEuropean conference on information retrieval (pp. 414-421). 
Springer,\nBerlin, Heidelberg.\n\nExamples\n--------\n>>> import numpy as np\n>>> from sklearn.metrics import ndcg_score\n>>> # we have groud-truth relevance of some answers to a query:\n>>> true_relevance = np.asarray([[10, 0, 0, 1, 5]])\n>>> # we predict some scores (relevance) for the answers\n>>> scores = np.asarray([[.1, .2, .3, 4, 70]])\n>>> ndcg_score(true_relevance, scores)\n0.69...\n>>> scores = np.asarray([[.05, 1.1, 1., .5, .0]])\n>>> ndcg_score(true_relevance, scores)\n0.49...\n>>> # we can set k to truncate the sum; only top k answers contribute.\n>>> ndcg_score(true_relevance, scores, k=4)\n0.35...\n>>> # the normalization takes k into account so a perfect answer\n>>> # would still get 1.0\n>>> ndcg_score(true_relevance, true_relevance, k=4)\n1.0\n>>> # now we have some ties in our prediction\n>>> scores = np.asarray([[1, 0, 0, 0, 1]])\n>>> # by default ties are averaged, so here we get the average (normalized)\n>>> # true relevance of our top predictions: (10 / 10 + 5 / 10) / 2 = .75\n>>> ndcg_score(true_relevance, scores, k=1)\n0.75\n>>> # we can choose to ignore ties for faster results, but only\n>>> # if we know there aren't ties in our scores, otherwise we get\n>>> # wrong results:\n>>> ndcg_score(true_relevance,\n... scores, k=1, ignore_ties=True)\n0.5", + "description": "Compute Normalized Discounted Cumulative Gain.\n\nSum the true scores ranked in the order induced by the predicted scores,\nafter applying a logarithmic discount. Then divide by the best possible\nscore (Ideal DCG, obtained for a perfect ranking) to obtain a score between\n0 and 1.\n\nThis ranking metric yields a high value if true labels are ranked high by\n``y_score``.", + "docstring": "Compute Normalized Discounted Cumulative Gain.\n\n Sum the true scores ranked in the order induced by the predicted scores,\n after applying a logarithmic discount. Then divide by the best possible\n score (Ideal DCG, obtained for a perfect ranking) to obtain a score between\n 0 and 1.\n\n This ranking metric yields a high value if true labels are ranked high by\n ``y_score``.\n\n Parameters\n ----------\n y_true : ndarray of shape (n_samples, n_labels)\n True targets of multilabel classification, or true scores of entities\n to be ranked.\n\n y_score : ndarray of shape (n_samples, n_labels)\n Target scores, can either be probability estimates, confidence values,\n or non-thresholded measure of decisions (as returned by\n \"decision_function\" on some classifiers).\n\n k : int, default=None\n Only consider the highest k scores in the ranking. If `None`, use all\n outputs.\n\n sample_weight : ndarray of shape (n_samples,), default=None\n Sample weights. If `None`, all samples are given the same weight.\n\n ignore_ties : bool, default=False\n Assume that there are no ties in y_score (which is likely to be the\n case if y_score is continuous) for efficiency gains.\n\n Returns\n -------\n normalized_discounted_cumulative_gain : float in [0., 1.]\n The averaged NDCG scores for all samples.\n\n See Also\n --------\n dcg_score : Discounted Cumulative Gain (not normalized).\n\n References\n ----------\n `Wikipedia entry for Discounted Cumulative Gain\n `_\n\n Jarvelin, K., & Kekalainen, J. (2002).\n Cumulated gain-based evaluation of IR techniques. ACM Transactions on\n Information Systems (TOIS), 20(4), 422-446.\n\n Wang, Y., Wang, L., Li, Y., He, D., Chen, W., & Liu, T. Y. (2013, May).\n A theoretical analysis of NDCG ranking measures. 
In Proceedings of the 26th\n Annual Conference on Learning Theory (COLT 2013)\n\n McSherry, F., & Najork, M. (2008, March). Computing information retrieval\n performance measures efficiently in the presence of tied scores. In\n European conference on information retrieval (pp. 414-421). Springer,\n Berlin, Heidelberg.\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.metrics import ndcg_score\n >>> # we have groud-truth relevance of some answers to a query:\n >>> true_relevance = np.asarray([[10, 0, 0, 1, 5]])\n >>> # we predict some scores (relevance) for the answers\n >>> scores = np.asarray([[.1, .2, .3, 4, 70]])\n >>> ndcg_score(true_relevance, scores)\n 0.69...\n >>> scores = np.asarray([[.05, 1.1, 1., .5, .0]])\n >>> ndcg_score(true_relevance, scores)\n 0.49...\n >>> # we can set k to truncate the sum; only top k answers contribute.\n >>> ndcg_score(true_relevance, scores, k=4)\n 0.35...\n >>> # the normalization takes k into account so a perfect answer\n >>> # would still get 1.0\n >>> ndcg_score(true_relevance, true_relevance, k=4)\n 1.0\n >>> # now we have some ties in our prediction\n >>> scores = np.asarray([[1, 0, 0, 0, 1]])\n >>> # by default ties are averaged, so here we get the average (normalized)\n >>> # true relevance of our top predictions: (10 / 10 + 5 / 10) / 2 = .75\n >>> ndcg_score(true_relevance, scores, k=1)\n 0.75\n >>> # we can choose to ignore ties for faster results, but only\n >>> # if we know there aren't ties in our scores, otherwise we get\n >>> # wrong results:\n >>> ndcg_score(true_relevance,\n ... scores, k=1, ignore_ties=True)\n 0.5\n\n ", "source_code": "\ndef ndcg_score(y_true, y_score, *, k=None, sample_weight=None, ignore_ties=False):\n \"\"\"Compute Normalized Discounted Cumulative Gain.\n\n Sum the true scores ranked in the order induced by the predicted scores,\n after applying a logarithmic discount. Then divide by the best possible\n score (Ideal DCG, obtained for a perfect ranking) to obtain a score between\n 0 and 1.\n\n This ranking metric yields a high value if true labels are ranked high by\n ``y_score``.\n\n Parameters\n ----------\n y_true : ndarray of shape (n_samples, n_labels)\n True targets of multilabel classification, or true scores of entities\n to be ranked.\n\n y_score : ndarray of shape (n_samples, n_labels)\n Target scores, can either be probability estimates, confidence values,\n or non-thresholded measure of decisions (as returned by\n \"decision_function\" on some classifiers).\n\n k : int, default=None\n Only consider the highest k scores in the ranking. If `None`, use all\n outputs.\n\n sample_weight : ndarray of shape (n_samples,), default=None\n Sample weights. If `None`, all samples are given the same weight.\n\n ignore_ties : bool, default=False\n Assume that there are no ties in y_score (which is likely to be the\n case if y_score is continuous) for efficiency gains.\n\n Returns\n -------\n normalized_discounted_cumulative_gain : float in [0., 1.]\n The averaged NDCG scores for all samples.\n\n See Also\n --------\n dcg_score : Discounted Cumulative Gain (not normalized).\n\n References\n ----------\n `Wikipedia entry for Discounted Cumulative Gain\n `_\n\n Jarvelin, K., & Kekalainen, J. (2002).\n Cumulated gain-based evaluation of IR techniques. ACM Transactions on\n Information Systems (TOIS), 20(4), 422-446.\n\n Wang, Y., Wang, L., Li, Y., He, D., Chen, W., & Liu, T. Y. (2013, May).\n A theoretical analysis of NDCG ranking measures. 
In Proceedings of the 26th\n Annual Conference on Learning Theory (COLT 2013)\n\n McSherry, F., & Najork, M. (2008, March). Computing information retrieval\n performance measures efficiently in the presence of tied scores. In\n European conference on information retrieval (pp. 414-421). Springer,\n Berlin, Heidelberg.\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.metrics import ndcg_score\n >>> # we have groud-truth relevance of some answers to a query:\n >>> true_relevance = np.asarray([[10, 0, 0, 1, 5]])\n >>> # we predict some scores (relevance) for the answers\n >>> scores = np.asarray([[.1, .2, .3, 4, 70]])\n >>> ndcg_score(true_relevance, scores)\n 0.69...\n >>> scores = np.asarray([[.05, 1.1, 1., .5, .0]])\n >>> ndcg_score(true_relevance, scores)\n 0.49...\n >>> # we can set k to truncate the sum; only top k answers contribute.\n >>> ndcg_score(true_relevance, scores, k=4)\n 0.35...\n >>> # the normalization takes k into account so a perfect answer\n >>> # would still get 1.0\n >>> ndcg_score(true_relevance, true_relevance, k=4)\n 1.0\n >>> # now we have some ties in our prediction\n >>> scores = np.asarray([[1, 0, 0, 0, 1]])\n >>> # by default ties are averaged, so here we get the average (normalized)\n >>> # true relevance of our top predictions: (10 / 10 + 5 / 10) / 2 = .75\n >>> ndcg_score(true_relevance, scores, k=1)\n 0.75\n >>> # we can choose to ignore ties for faster results, but only\n >>> # if we know there aren't ties in our scores, otherwise we get\n >>> # wrong results:\n >>> ndcg_score(true_relevance,\n ... scores, k=1, ignore_ties=True)\n 0.5\n\n \"\"\"\n y_true = check_array(y_true, ensure_2d=False)\n y_score = check_array(y_score, ensure_2d=False)\n check_consistent_length(y_true, y_score, sample_weight)\n _check_dcg_target_type(y_true)\n gain = _ndcg_sample_scores(y_true, y_score, k=k, ignore_ties=ignore_ties)\n return np.average(gain, weights=sample_weight)" }, { @@ -120912,6 +130045,10 @@ "docstring": { "type": "ndarray of shape (n_samples,)", "description": "True binary labels. If labels are not either {-1, 1} or {0, 1}, then\npos_label should be explicitly given." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -120922,7 +130059,8 @@ "docstring": { "type": "ndarray of shape (n_samples,)", "description": "Target scores, can either be probability estimates of the positive\nclass, or non-thresholded measure of decisions (as returned by\n`decision_function` on some classifiers)." - } + }, + "refined_type": {} }, { "name": "pos_label", @@ -120932,6 +130070,10 @@ "docstring": { "type": "int or str, default=None", "description": "The label of the positive class.\nWhen ``pos_label=None``, if y_true is in {-1, 1} or {0, 1},\n``pos_label`` is set to 1, otherwise an error will be raised." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -120942,13 +130084,14 @@ "docstring": { "type": "array-like of shape (n_samples,), default=None", "description": "Sample weights." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Compute precision-recall pairs for different probability thresholds.\n\nNote: this implementation is restricted to the binary classification task. The precision is the ratio ``tp / (tp + fp)`` where ``tp`` is the number of true positives and ``fp`` the number of false positives. The precision is intuitively the ability of the classifier not to label as positive a sample that is negative. 
The recall is the ratio ``tp / (tp + fn)`` where ``tp`` is the number of true positives and ``fn`` the number of false negatives. The recall is intuitively the ability of the classifier to find all the positive samples. The last precision and recall values are 1. and 0. respectively and do not have a corresponding threshold. This ensures that the graph starts on the y axis. Read more in the :ref:`User Guide `.", - "docstring": "Compute precision-recall pairs for different probability thresholds.\n\nNote: this implementation is restricted to the binary classification task.\n\nThe precision is the ratio ``tp / (tp + fp)`` where ``tp`` is the number of\ntrue positives and ``fp`` the number of false positives. The precision is\nintuitively the ability of the classifier not to label as positive a sample\nthat is negative.\n\nThe recall is the ratio ``tp / (tp + fn)`` where ``tp`` is the number of\ntrue positives and ``fn`` the number of false negatives. The recall is\nintuitively the ability of the classifier to find all the positive samples.\n\nThe last precision and recall values are 1. and 0. respectively and do not\nhave a corresponding threshold. This ensures that the graph starts on the\ny axis.\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\ny_true : ndarray of shape (n_samples,)\n True binary labels. If labels are not either {-1, 1} or {0, 1}, then\n pos_label should be explicitly given.\n\nprobas_pred : ndarray of shape (n_samples,)\n Target scores, can either be probability estimates of the positive\n class, or non-thresholded measure of decisions (as returned by\n `decision_function` on some classifiers).\n\npos_label : int or str, default=None\n The label of the positive class.\n When ``pos_label=None``, if y_true is in {-1, 1} or {0, 1},\n ``pos_label`` is set to 1, otherwise an error will be raised.\n\nsample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\nReturns\n-------\nprecision : ndarray of shape (n_thresholds + 1,)\n Precision values such that element i is the precision of\n predictions with score >= thresholds[i] and the last element is 1.\n\nrecall : ndarray of shape (n_thresholds + 1,)\n Decreasing recall values such that element i is the recall of\n predictions with score >= thresholds[i] and the last element is 0.\n\nthresholds : ndarray of shape (n_thresholds,)\n Increasing thresholds on the decision function used to compute\n precision and recall. n_thresholds <= len(np.unique(probas_pred)).\n\nSee Also\n--------\nPrecisionRecallDisplay.from_estimator : Plot Precision Recall Curve given\n a binary classifier.\nPrecisionRecallDisplay.from_predictions : Plot Precision Recall Curve\n using predictions from a binary classifier.\naverage_precision_score : Compute average precision from prediction scores.\ndet_curve: Compute error rates for different probability thresholds.\nroc_curve : Compute Receiver operating characteristic (ROC) curve.\n\nExamples\n--------\n>>> import numpy as np\n>>> from sklearn.metrics import precision_recall_curve\n>>> y_true = np.array([0, 0, 1, 1])\n>>> y_scores = np.array([0.1, 0.4, 0.35, 0.8])\n>>> precision, recall, thresholds = precision_recall_curve(\n... y_true, y_scores)\n>>> precision\narray([0.66666667, 0.5 , 1. , 1. ])\n>>> recall\narray([1. , 0.5, 0.5, 0. 
])\n>>> thresholds\narray([0.35, 0.4 , 0.8 ])", + "description": "Compute precision-recall pairs for different probability thresholds.\n\nNote: this implementation is restricted to the binary classification task.\n\nThe precision is the ratio ``tp / (tp + fp)`` where ``tp`` is the number of\ntrue positives and ``fp`` the number of false positives. The precision is\nintuitively the ability of the classifier not to label as positive a sample\nthat is negative.\n\nThe recall is the ratio ``tp / (tp + fn)`` where ``tp`` is the number of\ntrue positives and ``fn`` the number of false negatives. The recall is\nintuitively the ability of the classifier to find all the positive samples.\n\nThe last precision and recall values are 1. and 0. respectively and do not\nhave a corresponding threshold. This ensures that the graph starts on the\ny axis.\n\nRead more in the :ref:`User Guide `.", + "docstring": "Compute precision-recall pairs for different probability thresholds.\n\n Note: this implementation is restricted to the binary classification task.\n\n The precision is the ratio ``tp / (tp + fp)`` where ``tp`` is the number of\n true positives and ``fp`` the number of false positives. The precision is\n intuitively the ability of the classifier not to label as positive a sample\n that is negative.\n\n The recall is the ratio ``tp / (tp + fn)`` where ``tp`` is the number of\n true positives and ``fn`` the number of false negatives. The recall is\n intuitively the ability of the classifier to find all the positive samples.\n\n The last precision and recall values are 1. and 0. respectively and do not\n have a corresponding threshold. This ensures that the graph starts on the\n y axis.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n y_true : ndarray of shape (n_samples,)\n True binary labels. If labels are not either {-1, 1} or {0, 1}, then\n pos_label should be explicitly given.\n\n probas_pred : ndarray of shape (n_samples,)\n Target scores, can either be probability estimates of the positive\n class, or non-thresholded measure of decisions (as returned by\n `decision_function` on some classifiers).\n\n pos_label : int or str, default=None\n The label of the positive class.\n When ``pos_label=None``, if y_true is in {-1, 1} or {0, 1},\n ``pos_label`` is set to 1, otherwise an error will be raised.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n Returns\n -------\n precision : ndarray of shape (n_thresholds + 1,)\n Precision values such that element i is the precision of\n predictions with score >= thresholds[i] and the last element is 1.\n\n recall : ndarray of shape (n_thresholds + 1,)\n Decreasing recall values such that element i is the recall of\n predictions with score >= thresholds[i] and the last element is 0.\n\n thresholds : ndarray of shape (n_thresholds,)\n Increasing thresholds on the decision function used to compute\n precision and recall. 
n_thresholds <= len(np.unique(probas_pred)).\n\n See Also\n --------\n PrecisionRecallDisplay.from_estimator : Plot Precision Recall Curve given\n a binary classifier.\n PrecisionRecallDisplay.from_predictions : Plot Precision Recall Curve\n using predictions from a binary classifier.\n average_precision_score : Compute average precision from prediction scores.\n det_curve: Compute error rates for different probability thresholds.\n roc_curve : Compute Receiver operating characteristic (ROC) curve.\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.metrics import precision_recall_curve\n >>> y_true = np.array([0, 0, 1, 1])\n >>> y_scores = np.array([0.1, 0.4, 0.35, 0.8])\n >>> precision, recall, thresholds = precision_recall_curve(\n ... y_true, y_scores)\n >>> precision\n array([0.66666667, 0.5 , 1. , 1. ])\n >>> recall\n array([1. , 0.5, 0.5, 0. ])\n >>> thresholds\n array([0.35, 0.4 , 0.8 ])\n\n ", "source_code": "\ndef precision_recall_curve(y_true, probas_pred, *, pos_label=None, sample_weight=None):\n \"\"\"Compute precision-recall pairs for different probability thresholds.\n\n Note: this implementation is restricted to the binary classification task.\n\n The precision is the ratio ``tp / (tp + fp)`` where ``tp`` is the number of\n true positives and ``fp`` the number of false positives. The precision is\n intuitively the ability of the classifier not to label as positive a sample\n that is negative.\n\n The recall is the ratio ``tp / (tp + fn)`` where ``tp`` is the number of\n true positives and ``fn`` the number of false negatives. The recall is\n intuitively the ability of the classifier to find all the positive samples.\n\n The last precision and recall values are 1. and 0. respectively and do not\n have a corresponding threshold. This ensures that the graph starts on the\n y axis.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n y_true : ndarray of shape (n_samples,)\n True binary labels. If labels are not either {-1, 1} or {0, 1}, then\n pos_label should be explicitly given.\n\n probas_pred : ndarray of shape (n_samples,)\n Target scores, can either be probability estimates of the positive\n class, or non-thresholded measure of decisions (as returned by\n `decision_function` on some classifiers).\n\n pos_label : int or str, default=None\n The label of the positive class.\n When ``pos_label=None``, if y_true is in {-1, 1} or {0, 1},\n ``pos_label`` is set to 1, otherwise an error will be raised.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n Returns\n -------\n precision : ndarray of shape (n_thresholds + 1,)\n Precision values such that element i is the precision of\n predictions with score >= thresholds[i] and the last element is 1.\n\n recall : ndarray of shape (n_thresholds + 1,)\n Decreasing recall values such that element i is the recall of\n predictions with score >= thresholds[i] and the last element is 0.\n\n thresholds : ndarray of shape (n_thresholds,)\n Increasing thresholds on the decision function used to compute\n precision and recall. 
n_thresholds <= len(np.unique(probas_pred)).\n\n See Also\n --------\n PrecisionRecallDisplay.from_estimator : Plot Precision Recall Curve given\n a binary classifier.\n PrecisionRecallDisplay.from_predictions : Plot Precision Recall Curve\n using predictions from a binary classifier.\n average_precision_score : Compute average precision from prediction scores.\n det_curve: Compute error rates for different probability thresholds.\n roc_curve : Compute Receiver operating characteristic (ROC) curve.\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.metrics import precision_recall_curve\n >>> y_true = np.array([0, 0, 1, 1])\n >>> y_scores = np.array([0.1, 0.4, 0.35, 0.8])\n >>> precision, recall, thresholds = precision_recall_curve(\n ... y_true, y_scores)\n >>> precision\n array([0.66666667, 0.5 , 1. , 1. ])\n >>> recall\n array([1. , 0.5, 0.5, 0. ])\n >>> thresholds\n array([0.35, 0.4 , 0.8 ])\n\n \"\"\"\n (fps, tps, thresholds) = _binary_clf_curve(y_true, probas_pred, pos_label=pos_label, sample_weight=sample_weight)\n precision = tps / (tps + fps)\n precision[np.isnan(precision)] = 0\n recall = tps / tps[-1]\n last_ind = tps.searchsorted(tps[-1])\n sl = slice(last_ind, None, -1)\n return np.r_[precision[sl], 1], np.r_[recall[sl], 0], thresholds[sl]" }, { @@ -120966,7 +130109,8 @@ "docstring": { "type": "array-like of shape (n_samples,) or (n_samples, n_classes)", "description": "True labels or binary label indicators. The binary and multiclass cases\nexpect labels with shape (n_samples,) while the multilabel case expects\nbinary label indicators with shape (n_samples, n_classes)." - } + }, + "refined_type": {} }, { "name": "y_score", @@ -120976,7 +130120,8 @@ "docstring": { "type": "array-like of shape (n_samples,) or (n_samples, n_classes)", "description": "Target scores.\n\n* In the binary case, it corresponds to an array of shape\n `(n_samples,)`. Both probability estimates and non-thresholded\n decision values can be provided. The probability estimates correspond\n to the **probability of the class with the greater label**,\n i.e. `estimator.classes_[1]` and thus\n `estimator.predict_proba(X, y)[:, 1]`. The decision values\n corresponds to the output of `estimator.decision_function(X, y)`.\n See more information in the :ref:`User guide `;\n* In the multiclass case, it corresponds to an array of shape\n `(n_samples, n_classes)` of probability estimates provided by the\n `predict_proba` method. The probability estimates **must**\n sum to 1 across the possible classes. In addition, the order of the\n class scores must correspond to the order of ``labels``,\n if provided, or else to the numerical or lexicographical order of\n the labels in ``y_true``. See more information in the\n :ref:`User guide `;\n* In the multilabel case, it corresponds to an array of shape\n `(n_samples, n_classes)`. Probability estimates are provided by the\n `predict_proba` method and the non-thresholded decision values by\n the `decision_function` method. The probability estimates correspond\n to the **probability of the class with the greater label for each\n output** of the classifier. See more information in the\n :ref:`User guide `." - } + }, + "refined_type": {} }, { "name": "average", @@ -120986,6 +130131,10 @@ "docstring": { "type": "{'micro', 'macro', 'samples', 'weighted'} or None, default='macro'", "description": "If ``None``, the scores for each class are returned. 
Otherwise,\nthis determines the type of averaging performed on the data:\nNote: multiclass ROC AUC currently only handles the 'macro' and\n'weighted' averages.\n\n``'micro'``:\n Calculate metrics globally by considering each element of the label\n indicator matrix as a label.\n``'macro'``:\n Calculate metrics for each label, and find their unweighted\n mean. This does not take label imbalance into account.\n``'weighted'``:\n Calculate metrics for each label, and find their average, weighted\n by support (the number of true instances for each label).\n``'samples'``:\n Calculate metrics for each instance, and find their average.\n\nWill be ignored when ``y_true`` is binary." + }, + "refined_type": { + "kind": "EnumType", + "values": ["samples", "micro", "macro", "weighted"] } }, { @@ -120996,7 +130145,8 @@ "docstring": { "type": "array-like of shape (n_samples,), default=None", "description": "Sample weights." - } + }, + "refined_type": {} }, { "name": "max_fpr", @@ -121006,7 +130156,8 @@ "docstring": { "type": "float > 0 and <= 1, default=None", "description": "If not ``None``, the standardized partial AUC [2]_ over the range\n[0, max_fpr] is returned. For the multiclass case, ``max_fpr``,\nshould be either equal to ``None`` or ``1.0`` as AUC ROC partial\ncomputation currently is not supported for multiclass." - } + }, + "refined_type": {} }, { "name": "multi_class", @@ -121016,6 +130167,10 @@ "docstring": { "type": "{'raise', 'ovr', 'ovo'}, default='raise'", "description": "Only used for multiclass targets. Determines the type of configuration\nto use. The default value raises an error, so either\n``'ovr'`` or ``'ovo'`` must be passed explicitly.\n\n``'ovr'``:\n Stands for One-vs-rest. Computes the AUC of each class\n against the rest [3]_ [4]_. This\n treats the multiclass case in the same way as the multilabel case.\n Sensitive to class imbalance even when ``average == 'macro'``,\n because class imbalance affects the composition of each of the\n 'rest' groupings.\n``'ovo'``:\n Stands for One-vs-one. Computes the average AUC of all\n possible pairwise combinations of classes [5]_.\n Insensitive to class imbalance when\n ``average == 'macro'``." + }, + "refined_type": { + "kind": "EnumType", + "values": ["raise", "ovo", "ovr"] } }, { @@ -121026,13 +130181,14 @@ "docstring": { "type": "array-like of shape (n_classes,), default=None", "description": "Only used for multiclass targets. List of labels that index the\nclasses in ``y_score``. If ``None``, the numerical or lexicographical\norder of the labels in ``y_true`` is used." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Compute Area Under the Receiver Operating Characteristic Curve (ROC AUC) from prediction scores.\n\nNote: this implementation can be used with binary, multiclass and multilabel classification, but some restrictions apply (see Parameters). Read more in the :ref:`User Guide `.", - "docstring": "Compute Area Under the Receiver Operating Characteristic Curve (ROC AUC)\nfrom prediction scores.\n\nNote: this implementation can be used with binary, multiclass and\nmultilabel classification, but some restrictions apply (see Parameters).\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\ny_true : array-like of shape (n_samples,) or (n_samples, n_classes)\n True labels or binary label indicators. 
The binary and multiclass cases\n expect labels with shape (n_samples,) while the multilabel case expects\n binary label indicators with shape (n_samples, n_classes).\n\ny_score : array-like of shape (n_samples,) or (n_samples, n_classes)\n Target scores.\n\n * In the binary case, it corresponds to an array of shape\n `(n_samples,)`. Both probability estimates and non-thresholded\n decision values can be provided. The probability estimates correspond\n to the **probability of the class with the greater label**,\n i.e. `estimator.classes_[1]` and thus\n `estimator.predict_proba(X, y)[:, 1]`. The decision values\n corresponds to the output of `estimator.decision_function(X, y)`.\n See more information in the :ref:`User guide `;\n * In the multiclass case, it corresponds to an array of shape\n `(n_samples, n_classes)` of probability estimates provided by the\n `predict_proba` method. The probability estimates **must**\n sum to 1 across the possible classes. In addition, the order of the\n class scores must correspond to the order of ``labels``,\n if provided, or else to the numerical or lexicographical order of\n the labels in ``y_true``. See more information in the\n :ref:`User guide `;\n * In the multilabel case, it corresponds to an array of shape\n `(n_samples, n_classes)`. Probability estimates are provided by the\n `predict_proba` method and the non-thresholded decision values by\n the `decision_function` method. The probability estimates correspond\n to the **probability of the class with the greater label for each\n output** of the classifier. See more information in the\n :ref:`User guide `.\n\naverage : {'micro', 'macro', 'samples', 'weighted'} or None, default='macro'\n If ``None``, the scores for each class are returned. Otherwise,\n this determines the type of averaging performed on the data:\n Note: multiclass ROC AUC currently only handles the 'macro' and\n 'weighted' averages.\n\n ``'micro'``:\n Calculate metrics globally by considering each element of the label\n indicator matrix as a label.\n ``'macro'``:\n Calculate metrics for each label, and find their unweighted\n mean. This does not take label imbalance into account.\n ``'weighted'``:\n Calculate metrics for each label, and find their average, weighted\n by support (the number of true instances for each label).\n ``'samples'``:\n Calculate metrics for each instance, and find their average.\n\n Will be ignored when ``y_true`` is binary.\n\nsample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\nmax_fpr : float > 0 and <= 1, default=None\n If not ``None``, the standardized partial AUC [2]_ over the range\n [0, max_fpr] is returned. For the multiclass case, ``max_fpr``,\n should be either equal to ``None`` or ``1.0`` as AUC ROC partial\n computation currently is not supported for multiclass.\n\nmulti_class : {'raise', 'ovr', 'ovo'}, default='raise'\n Only used for multiclass targets. Determines the type of configuration\n to use. The default value raises an error, so either\n ``'ovr'`` or ``'ovo'`` must be passed explicitly.\n\n ``'ovr'``:\n Stands for One-vs-rest. Computes the AUC of each class\n against the rest [3]_ [4]_. This\n treats the multiclass case in the same way as the multilabel case.\n Sensitive to class imbalance even when ``average == 'macro'``,\n because class imbalance affects the composition of each of the\n 'rest' groupings.\n ``'ovo'``:\n Stands for One-vs-one. 
Computes the average AUC of all\n possible pairwise combinations of classes [5]_.\n Insensitive to class imbalance when\n ``average == 'macro'``.\n\nlabels : array-like of shape (n_classes,), default=None\n Only used for multiclass targets. List of labels that index the\n classes in ``y_score``. If ``None``, the numerical or lexicographical\n order of the labels in ``y_true`` is used.\n\nReturns\n-------\nauc : float\n\nReferences\n----------\n.. [1] `Wikipedia entry for the Receiver operating characteristic\n `_\n\n.. [2] `Analyzing a portion of the ROC curve. McClish, 1989\n `_\n\n.. [3] Provost, F., Domingos, P. (2000). Well-trained PETs: Improving\n probability estimation trees (Section 6.2), CeDER Working Paper\n #IS-00-04, Stern School of Business, New York University.\n\n.. [4] `Fawcett, T. (2006). An introduction to ROC analysis. Pattern\n Recognition Letters, 27(8), 861-874.\n `_\n\n.. [5] `Hand, D.J., Till, R.J. (2001). A Simple Generalisation of the Area\n Under the ROC Curve for Multiple Class Classification Problems.\n Machine Learning, 45(2), 171-186.\n `_\n\nSee Also\n--------\naverage_precision_score : Area under the precision-recall curve.\nroc_curve : Compute Receiver operating characteristic (ROC) curve.\nRocCurveDisplay.from_estimator : Plot Receiver Operating Characteristic\n (ROC) curve given an estimator and some data.\nRocCurveDisplay.from_predictions : Plot Receiver Operating Characteristic\n (ROC) curve given the true and predicted values.\n\nExamples\n--------\nBinary case:\n\n>>> from sklearn.datasets import load_breast_cancer\n>>> from sklearn.linear_model import LogisticRegression\n>>> from sklearn.metrics import roc_auc_score\n>>> X, y = load_breast_cancer(return_X_y=True)\n>>> clf = LogisticRegression(solver=\"liblinear\", random_state=0).fit(X, y)\n>>> roc_auc_score(y, clf.predict_proba(X)[:, 1])\n0.99...\n>>> roc_auc_score(y, clf.decision_function(X))\n0.99...\n\nMulticlass case:\n\n>>> from sklearn.datasets import load_iris\n>>> X, y = load_iris(return_X_y=True)\n>>> clf = LogisticRegression(solver=\"liblinear\").fit(X, y)\n>>> roc_auc_score(y, clf.predict_proba(X), multi_class='ovr')\n0.99...\n\nMultilabel case:\n\n>>> import numpy as np\n>>> from sklearn.datasets import make_multilabel_classification\n>>> from sklearn.multioutput import MultiOutputClassifier\n>>> X, y = make_multilabel_classification(random_state=0)\n>>> clf = MultiOutputClassifier(clf).fit(X, y)\n>>> # get a list of n_output containing probability arrays of shape\n>>> # (n_samples, n_classes)\n>>> y_pred = clf.predict_proba(X)\n>>> # extract the positive columns for each output\n>>> y_pred = np.transpose([pred[:, 1] for pred in y_pred])\n>>> roc_auc_score(y, y_pred, average=None)\narray([0.82..., 0.86..., 0.94..., 0.85... , 0.94...])\n>>> from sklearn.linear_model import RidgeClassifierCV\n>>> clf = RidgeClassifierCV().fit(X, y)\n>>> roc_auc_score(y, clf.decision_function(X), average=None)\narray([0.81..., 0.84... 
, 0.93..., 0.87..., 0.94...])", + "description": "Compute Area Under the Receiver Operating Characteristic Curve (ROC AUC)\nfrom prediction scores.\n\nNote: this implementation can be used with binary, multiclass and\nmultilabel classification, but some restrictions apply (see Parameters).\n\nRead more in the :ref:`User Guide `.", + "docstring": "Compute Area Under the Receiver Operating Characteristic Curve (ROC AUC)\n from prediction scores.\n\n Note: this implementation can be used with binary, multiclass and\n multilabel classification, but some restrictions apply (see Parameters).\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n y_true : array-like of shape (n_samples,) or (n_samples, n_classes)\n True labels or binary label indicators. The binary and multiclass cases\n expect labels with shape (n_samples,) while the multilabel case expects\n binary label indicators with shape (n_samples, n_classes).\n\n y_score : array-like of shape (n_samples,) or (n_samples, n_classes)\n Target scores.\n\n * In the binary case, it corresponds to an array of shape\n `(n_samples,)`. Both probability estimates and non-thresholded\n decision values can be provided. The probability estimates correspond\n to the **probability of the class with the greater label**,\n i.e. `estimator.classes_[1]` and thus\n `estimator.predict_proba(X, y)[:, 1]`. The decision values\n corresponds to the output of `estimator.decision_function(X, y)`.\n See more information in the :ref:`User guide `;\n * In the multiclass case, it corresponds to an array of shape\n `(n_samples, n_classes)` of probability estimates provided by the\n `predict_proba` method. The probability estimates **must**\n sum to 1 across the possible classes. In addition, the order of the\n class scores must correspond to the order of ``labels``,\n if provided, or else to the numerical or lexicographical order of\n the labels in ``y_true``. See more information in the\n :ref:`User guide `;\n * In the multilabel case, it corresponds to an array of shape\n `(n_samples, n_classes)`. Probability estimates are provided by the\n `predict_proba` method and the non-thresholded decision values by\n the `decision_function` method. The probability estimates correspond\n to the **probability of the class with the greater label for each\n output** of the classifier. See more information in the\n :ref:`User guide `.\n\n average : {'micro', 'macro', 'samples', 'weighted'} or None, default='macro'\n If ``None``, the scores for each class are returned. Otherwise,\n this determines the type of averaging performed on the data:\n Note: multiclass ROC AUC currently only handles the 'macro' and\n 'weighted' averages.\n\n ``'micro'``:\n Calculate metrics globally by considering each element of the label\n indicator matrix as a label.\n ``'macro'``:\n Calculate metrics for each label, and find their unweighted\n mean. This does not take label imbalance into account.\n ``'weighted'``:\n Calculate metrics for each label, and find their average, weighted\n by support (the number of true instances for each label).\n ``'samples'``:\n Calculate metrics for each instance, and find their average.\n\n Will be ignored when ``y_true`` is binary.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n max_fpr : float > 0 and <= 1, default=None\n If not ``None``, the standardized partial AUC [2]_ over the range\n [0, max_fpr] is returned. 
For the multiclass case, ``max_fpr``,\n should be either equal to ``None`` or ``1.0`` as AUC ROC partial\n computation currently is not supported for multiclass.\n\n multi_class : {'raise', 'ovr', 'ovo'}, default='raise'\n Only used for multiclass targets. Determines the type of configuration\n to use. The default value raises an error, so either\n ``'ovr'`` or ``'ovo'`` must be passed explicitly.\n\n ``'ovr'``:\n Stands for One-vs-rest. Computes the AUC of each class\n against the rest [3]_ [4]_. This\n treats the multiclass case in the same way as the multilabel case.\n Sensitive to class imbalance even when ``average == 'macro'``,\n because class imbalance affects the composition of each of the\n 'rest' groupings.\n ``'ovo'``:\n Stands for One-vs-one. Computes the average AUC of all\n possible pairwise combinations of classes [5]_.\n Insensitive to class imbalance when\n ``average == 'macro'``.\n\n labels : array-like of shape (n_classes,), default=None\n Only used for multiclass targets. List of labels that index the\n classes in ``y_score``. If ``None``, the numerical or lexicographical\n order of the labels in ``y_true`` is used.\n\n Returns\n -------\n auc : float\n\n References\n ----------\n .. [1] `Wikipedia entry for the Receiver operating characteristic\n `_\n\n .. [2] `Analyzing a portion of the ROC curve. McClish, 1989\n `_\n\n .. [3] Provost, F., Domingos, P. (2000). Well-trained PETs: Improving\n probability estimation trees (Section 6.2), CeDER Working Paper\n #IS-00-04, Stern School of Business, New York University.\n\n .. [4] `Fawcett, T. (2006). An introduction to ROC analysis. Pattern\n Recognition Letters, 27(8), 861-874.\n `_\n\n .. [5] `Hand, D.J., Till, R.J. (2001). A Simple Generalisation of the Area\n Under the ROC Curve for Multiple Class Classification Problems.\n Machine Learning, 45(2), 171-186.\n `_\n\n See Also\n --------\n average_precision_score : Area under the precision-recall curve.\n roc_curve : Compute Receiver operating characteristic (ROC) curve.\n RocCurveDisplay.from_estimator : Plot Receiver Operating Characteristic\n (ROC) curve given an estimator and some data.\n RocCurveDisplay.from_predictions : Plot Receiver Operating Characteristic\n (ROC) curve given the true and predicted values.\n\n Examples\n --------\n Binary case:\n\n >>> from sklearn.datasets import load_breast_cancer\n >>> from sklearn.linear_model import LogisticRegression\n >>> from sklearn.metrics import roc_auc_score\n >>> X, y = load_breast_cancer(return_X_y=True)\n >>> clf = LogisticRegression(solver=\"liblinear\", random_state=0).fit(X, y)\n >>> roc_auc_score(y, clf.predict_proba(X)[:, 1])\n 0.99...\n >>> roc_auc_score(y, clf.decision_function(X))\n 0.99...\n\n Multiclass case:\n\n >>> from sklearn.datasets import load_iris\n >>> X, y = load_iris(return_X_y=True)\n >>> clf = LogisticRegression(solver=\"liblinear\").fit(X, y)\n >>> roc_auc_score(y, clf.predict_proba(X), multi_class='ovr')\n 0.99...\n\n Multilabel case:\n\n >>> import numpy as np\n >>> from sklearn.datasets import make_multilabel_classification\n >>> from sklearn.multioutput import MultiOutputClassifier\n >>> X, y = make_multilabel_classification(random_state=0)\n >>> clf = MultiOutputClassifier(clf).fit(X, y)\n >>> # get a list of n_output containing probability arrays of shape\n >>> # (n_samples, n_classes)\n >>> y_pred = clf.predict_proba(X)\n >>> # extract the positive columns for each output\n >>> y_pred = np.transpose([pred[:, 1] for pred in y_pred])\n >>> roc_auc_score(y, y_pred, average=None)\n 
array([0.82..., 0.86..., 0.94..., 0.85... , 0.94...])\n >>> from sklearn.linear_model import RidgeClassifierCV\n >>> clf = RidgeClassifierCV().fit(X, y)\n >>> roc_auc_score(y, clf.decision_function(X), average=None)\n array([0.81..., 0.84... , 0.93..., 0.87..., 0.94...])\n ", "source_code": "\ndef roc_auc_score(y_true, y_score, *, average='macro', sample_weight=None, max_fpr=None, multi_class='raise', labels=None):\n \"\"\"Compute Area Under the Receiver Operating Characteristic Curve (ROC AUC)\n from prediction scores.\n\n Note: this implementation can be used with binary, multiclass and\n multilabel classification, but some restrictions apply (see Parameters).\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n y_true : array-like of shape (n_samples,) or (n_samples, n_classes)\n True labels or binary label indicators. The binary and multiclass cases\n expect labels with shape (n_samples,) while the multilabel case expects\n binary label indicators with shape (n_samples, n_classes).\n\n y_score : array-like of shape (n_samples,) or (n_samples, n_classes)\n Target scores.\n\n * In the binary case, it corresponds to an array of shape\n `(n_samples,)`. Both probability estimates and non-thresholded\n decision values can be provided. The probability estimates correspond\n to the **probability of the class with the greater label**,\n i.e. `estimator.classes_[1]` and thus\n `estimator.predict_proba(X, y)[:, 1]`. The decision values\n corresponds to the output of `estimator.decision_function(X, y)`.\n See more information in the :ref:`User guide `;\n * In the multiclass case, it corresponds to an array of shape\n `(n_samples, n_classes)` of probability estimates provided by the\n `predict_proba` method. The probability estimates **must**\n sum to 1 across the possible classes. In addition, the order of the\n class scores must correspond to the order of ``labels``,\n if provided, or else to the numerical or lexicographical order of\n the labels in ``y_true``. See more information in the\n :ref:`User guide `;\n * In the multilabel case, it corresponds to an array of shape\n `(n_samples, n_classes)`. Probability estimates are provided by the\n `predict_proba` method and the non-thresholded decision values by\n the `decision_function` method. The probability estimates correspond\n to the **probability of the class with the greater label for each\n output** of the classifier. See more information in the\n :ref:`User guide `.\n\n average : {'micro', 'macro', 'samples', 'weighted'} or None, default='macro'\n If ``None``, the scores for each class are returned. Otherwise,\n this determines the type of averaging performed on the data:\n Note: multiclass ROC AUC currently only handles the 'macro' and\n 'weighted' averages.\n\n ``'micro'``:\n Calculate metrics globally by considering each element of the label\n indicator matrix as a label.\n ``'macro'``:\n Calculate metrics for each label, and find their unweighted\n mean. This does not take label imbalance into account.\n ``'weighted'``:\n Calculate metrics for each label, and find their average, weighted\n by support (the number of true instances for each label).\n ``'samples'``:\n Calculate metrics for each instance, and find their average.\n\n Will be ignored when ``y_true`` is binary.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n max_fpr : float > 0 and <= 1, default=None\n If not ``None``, the standardized partial AUC [2]_ over the range\n [0, max_fpr] is returned. 
For the multiclass case, ``max_fpr``,\n should be either equal to ``None`` or ``1.0`` as AUC ROC partial\n computation currently is not supported for multiclass.\n\n multi_class : {'raise', 'ovr', 'ovo'}, default='raise'\n Only used for multiclass targets. Determines the type of configuration\n to use. The default value raises an error, so either\n ``'ovr'`` or ``'ovo'`` must be passed explicitly.\n\n ``'ovr'``:\n Stands for One-vs-rest. Computes the AUC of each class\n against the rest [3]_ [4]_. This\n treats the multiclass case in the same way as the multilabel case.\n Sensitive to class imbalance even when ``average == 'macro'``,\n because class imbalance affects the composition of each of the\n 'rest' groupings.\n ``'ovo'``:\n Stands for One-vs-one. Computes the average AUC of all\n possible pairwise combinations of classes [5]_.\n Insensitive to class imbalance when\n ``average == 'macro'``.\n\n labels : array-like of shape (n_classes,), default=None\n Only used for multiclass targets. List of labels that index the\n classes in ``y_score``. If ``None``, the numerical or lexicographical\n order of the labels in ``y_true`` is used.\n\n Returns\n -------\n auc : float\n\n References\n ----------\n .. [1] `Wikipedia entry for the Receiver operating characteristic\n `_\n\n .. [2] `Analyzing a portion of the ROC curve. McClish, 1989\n `_\n\n .. [3] Provost, F., Domingos, P. (2000). Well-trained PETs: Improving\n probability estimation trees (Section 6.2), CeDER Working Paper\n #IS-00-04, Stern School of Business, New York University.\n\n .. [4] `Fawcett, T. (2006). An introduction to ROC analysis. Pattern\n Recognition Letters, 27(8), 861-874.\n `_\n\n .. [5] `Hand, D.J., Till, R.J. (2001). A Simple Generalisation of the Area\n Under the ROC Curve for Multiple Class Classification Problems.\n Machine Learning, 45(2), 171-186.\n `_\n\n See Also\n --------\n average_precision_score : Area under the precision-recall curve.\n roc_curve : Compute Receiver operating characteristic (ROC) curve.\n RocCurveDisplay.from_estimator : Plot Receiver Operating Characteristic\n (ROC) curve given an estimator and some data.\n RocCurveDisplay.from_predictions : Plot Receiver Operating Characteristic\n (ROC) curve given the true and predicted values.\n\n Examples\n --------\n Binary case:\n\n >>> from sklearn.datasets import load_breast_cancer\n >>> from sklearn.linear_model import LogisticRegression\n >>> from sklearn.metrics import roc_auc_score\n >>> X, y = load_breast_cancer(return_X_y=True)\n >>> clf = LogisticRegression(solver=\"liblinear\", random_state=0).fit(X, y)\n >>> roc_auc_score(y, clf.predict_proba(X)[:, 1])\n 0.99...\n >>> roc_auc_score(y, clf.decision_function(X))\n 0.99...\n\n Multiclass case:\n\n >>> from sklearn.datasets import load_iris\n >>> X, y = load_iris(return_X_y=True)\n >>> clf = LogisticRegression(solver=\"liblinear\").fit(X, y)\n >>> roc_auc_score(y, clf.predict_proba(X), multi_class='ovr')\n 0.99...\n\n Multilabel case:\n\n >>> import numpy as np\n >>> from sklearn.datasets import make_multilabel_classification\n >>> from sklearn.multioutput import MultiOutputClassifier\n >>> X, y = make_multilabel_classification(random_state=0)\n >>> clf = MultiOutputClassifier(clf).fit(X, y)\n >>> # get a list of n_output containing probability arrays of shape\n >>> # (n_samples, n_classes)\n >>> y_pred = clf.predict_proba(X)\n >>> # extract the positive columns for each output\n >>> y_pred = np.transpose([pred[:, 1] for pred in y_pred])\n >>> roc_auc_score(y, y_pred, average=None)\n 
array([0.82..., 0.86..., 0.94..., 0.85... , 0.94...])\n >>> from sklearn.linear_model import RidgeClassifierCV\n >>> clf = RidgeClassifierCV().fit(X, y)\n >>> roc_auc_score(y, clf.decision_function(X), average=None)\n array([0.81..., 0.84... , 0.93..., 0.87..., 0.94...])\n \"\"\"\n y_type = type_of_target(y_true)\n y_true = check_array(y_true, ensure_2d=False, dtype=None)\n y_score = check_array(y_score, ensure_2d=False)\n if y_type == 'multiclass' or y_type == 'binary' and y_score.ndim == 2 and y_score.shape[1] > 2:\n if max_fpr is not None and max_fpr != 1.0:\n raise ValueError(\"Partial AUC computation not available in multiclass setting, 'max_fpr' must be set to `None`, received `max_fpr={0}` instead\".format(max_fpr))\n if multi_class == 'raise':\n raise ValueError(\"multi_class must be in ('ovo', 'ovr')\")\n return _multiclass_roc_auc_score(y_true, y_score, labels, multi_class, average, sample_weight)\n elif y_type == 'binary':\n labels = np.unique(y_true)\n y_true = label_binarize(y_true, classes=labels)[:, 0]\n return _average_binary_score(partial(_binary_roc_auc_score, max_fpr=max_fpr), y_true, y_score, average, sample_weight=sample_weight)\n else:\n return _average_binary_score(partial(_binary_roc_auc_score, max_fpr=max_fpr), y_true, y_score, average, sample_weight=sample_weight)" }, { @@ -121050,6 +130206,10 @@ "docstring": { "type": "ndarray of shape (n_samples,)", "description": "True binary labels. If labels are not either {-1, 1} or {0, 1}, then\npos_label should be explicitly given." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -121060,7 +130220,8 @@ "docstring": { "type": "ndarray of shape (n_samples,)", "description": "Target scores, can either be probability estimates of the positive\nclass, confidence values, or non-thresholded measure of decisions\n(as returned by \"decision_function\" on some classifiers)." - } + }, + "refined_type": {} }, { "name": "pos_label", @@ -121070,6 +130231,10 @@ "docstring": { "type": "int or str, default=None", "description": "The label of the positive class.\nWhen ``pos_label=None``, if `y_true` is in {-1, 1} or {0, 1},\n``pos_label`` is set to 1, otherwise an error will be raised." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -121080,7 +130245,8 @@ "docstring": { "type": "array-like of shape (n_samples,), default=None", "description": "Sample weights." - } + }, + "refined_type": {} }, { "name": "drop_intermediate", @@ -121090,13 +130256,14 @@ "docstring": { "type": "bool, default=True", "description": "Whether to drop some suboptimal thresholds which would not appear\non a plotted ROC curve. This is useful in order to create lighter\nROC curves.\n\n.. versionadded:: 0.17\n parameter *drop_intermediate*." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Compute Receiver operating characteristic (ROC).\n\nNote: this implementation is restricted to the binary classification task. Read more in the :ref:`User Guide `.", - "docstring": "Compute Receiver operating characteristic (ROC).\n\nNote: this implementation is restricted to the binary classification task.\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\ny_true : ndarray of shape (n_samples,)\n True binary labels. 
If labels are not either {-1, 1} or {0, 1}, then\n pos_label should be explicitly given.\n\ny_score : ndarray of shape (n_samples,)\n Target scores, can either be probability estimates of the positive\n class, confidence values, or non-thresholded measure of decisions\n (as returned by \"decision_function\" on some classifiers).\n\npos_label : int or str, default=None\n The label of the positive class.\n When ``pos_label=None``, if `y_true` is in {-1, 1} or {0, 1},\n ``pos_label`` is set to 1, otherwise an error will be raised.\n\nsample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\ndrop_intermediate : bool, default=True\n Whether to drop some suboptimal thresholds which would not appear\n on a plotted ROC curve. This is useful in order to create lighter\n ROC curves.\n\n .. versionadded:: 0.17\n parameter *drop_intermediate*.\n\nReturns\n-------\nfpr : ndarray of shape (>2,)\n Increasing false positive rates such that element i is the false\n positive rate of predictions with score >= `thresholds[i]`.\n\ntpr : ndarray of shape (>2,)\n Increasing true positive rates such that element `i` is the true\n positive rate of predictions with score >= `thresholds[i]`.\n\nthresholds : ndarray of shape = (n_thresholds,)\n Decreasing thresholds on the decision function used to compute\n fpr and tpr. `thresholds[0]` represents no instances being predicted\n and is arbitrarily set to `max(y_score) + 1`.\n\nSee Also\n--------\nRocCurveDisplay.from_estimator : Plot Receiver Operating Characteristic\n (ROC) curve given an estimator and some data.\nRocCurveDisplay.from_predictions : Plot Receiver Operating Characteristic\n (ROC) curve given the true and predicted values.\ndet_curve: Compute error rates for different probability thresholds.\nroc_auc_score : Compute the area under the ROC curve.\n\nNotes\n-----\nSince the thresholds are sorted from low to high values, they\nare reversed upon returning them to ensure they correspond to both ``fpr``\nand ``tpr``, which are sorted in reversed order during their calculation.\n\nReferences\n----------\n.. [1] `Wikipedia entry for the Receiver operating characteristic\n `_\n\n.. [2] Fawcett T. An introduction to ROC analysis[J]. Pattern Recognition\n Letters, 2006, 27(8):861-874.\n\nExamples\n--------\n>>> import numpy as np\n>>> from sklearn import metrics\n>>> y = np.array([1, 1, 2, 2])\n>>> scores = np.array([0.1, 0.4, 0.35, 0.8])\n>>> fpr, tpr, thresholds = metrics.roc_curve(y, scores, pos_label=2)\n>>> fpr\narray([0. , 0. , 0.5, 0.5, 1. ])\n>>> tpr\narray([0. , 0.5, 0.5, 1. , 1. ])\n>>> thresholds\narray([1.8 , 0.8 , 0.4 , 0.35, 0.1 ])", + "description": "Compute Receiver operating characteristic (ROC).\n\nNote: this implementation is restricted to the binary classification task.\n\nRead more in the :ref:`User Guide `.", + "docstring": "Compute Receiver operating characteristic (ROC).\n\n Note: this implementation is restricted to the binary classification task.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n y_true : ndarray of shape (n_samples,)\n True binary labels. 
If labels are not either {-1, 1} or {0, 1}, then\n pos_label should be explicitly given.\n\n y_score : ndarray of shape (n_samples,)\n Target scores, can either be probability estimates of the positive\n class, confidence values, or non-thresholded measure of decisions\n (as returned by \"decision_function\" on some classifiers).\n\n pos_label : int or str, default=None\n The label of the positive class.\n When ``pos_label=None``, if `y_true` is in {-1, 1} or {0, 1},\n ``pos_label`` is set to 1, otherwise an error will be raised.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n drop_intermediate : bool, default=True\n Whether to drop some suboptimal thresholds which would not appear\n on a plotted ROC curve. This is useful in order to create lighter\n ROC curves.\n\n .. versionadded:: 0.17\n parameter *drop_intermediate*.\n\n Returns\n -------\n fpr : ndarray of shape (>2,)\n Increasing false positive rates such that element i is the false\n positive rate of predictions with score >= `thresholds[i]`.\n\n tpr : ndarray of shape (>2,)\n Increasing true positive rates such that element `i` is the true\n positive rate of predictions with score >= `thresholds[i]`.\n\n thresholds : ndarray of shape = (n_thresholds,)\n Decreasing thresholds on the decision function used to compute\n fpr and tpr. `thresholds[0]` represents no instances being predicted\n and is arbitrarily set to `max(y_score) + 1`.\n\n See Also\n --------\n RocCurveDisplay.from_estimator : Plot Receiver Operating Characteristic\n (ROC) curve given an estimator and some data.\n RocCurveDisplay.from_predictions : Plot Receiver Operating Characteristic\n (ROC) curve given the true and predicted values.\n det_curve: Compute error rates for different probability thresholds.\n roc_auc_score : Compute the area under the ROC curve.\n\n Notes\n -----\n Since the thresholds are sorted from low to high values, they\n are reversed upon returning them to ensure they correspond to both ``fpr``\n and ``tpr``, which are sorted in reversed order during their calculation.\n\n References\n ----------\n .. [1] `Wikipedia entry for the Receiver operating characteristic\n `_\n\n .. [2] Fawcett T. An introduction to ROC analysis[J]. Pattern Recognition\n Letters, 2006, 27(8):861-874.\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn import metrics\n >>> y = np.array([1, 1, 2, 2])\n >>> scores = np.array([0.1, 0.4, 0.35, 0.8])\n >>> fpr, tpr, thresholds = metrics.roc_curve(y, scores, pos_label=2)\n >>> fpr\n array([0. , 0. , 0.5, 0.5, 1. ])\n >>> tpr\n array([0. , 0.5, 0.5, 1. , 1. ])\n >>> thresholds\n array([1.8 , 0.8 , 0.4 , 0.35, 0.1 ])\n\n ", "source_code": "\ndef roc_curve(y_true, y_score, *, pos_label=None, sample_weight=None, drop_intermediate=True):\n \"\"\"Compute Receiver operating characteristic (ROC).\n\n Note: this implementation is restricted to the binary classification task.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n y_true : ndarray of shape (n_samples,)\n True binary labels. 
If labels are not either {-1, 1} or {0, 1}, then\n pos_label should be explicitly given.\n\n y_score : ndarray of shape (n_samples,)\n Target scores, can either be probability estimates of the positive\n class, confidence values, or non-thresholded measure of decisions\n (as returned by \"decision_function\" on some classifiers).\n\n pos_label : int or str, default=None\n The label of the positive class.\n When ``pos_label=None``, if `y_true` is in {-1, 1} or {0, 1},\n ``pos_label`` is set to 1, otherwise an error will be raised.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n drop_intermediate : bool, default=True\n Whether to drop some suboptimal thresholds which would not appear\n on a plotted ROC curve. This is useful in order to create lighter\n ROC curves.\n\n .. versionadded:: 0.17\n parameter *drop_intermediate*.\n\n Returns\n -------\n fpr : ndarray of shape (>2,)\n Increasing false positive rates such that element i is the false\n positive rate of predictions with score >= `thresholds[i]`.\n\n tpr : ndarray of shape (>2,)\n Increasing true positive rates such that element `i` is the true\n positive rate of predictions with score >= `thresholds[i]`.\n\n thresholds : ndarray of shape = (n_thresholds,)\n Decreasing thresholds on the decision function used to compute\n fpr and tpr. `thresholds[0]` represents no instances being predicted\n and is arbitrarily set to `max(y_score) + 1`.\n\n See Also\n --------\n RocCurveDisplay.from_estimator : Plot Receiver Operating Characteristic\n (ROC) curve given an estimator and some data.\n RocCurveDisplay.from_predictions : Plot Receiver Operating Characteristic\n (ROC) curve given the true and predicted values.\n det_curve: Compute error rates for different probability thresholds.\n roc_auc_score : Compute the area under the ROC curve.\n\n Notes\n -----\n Since the thresholds are sorted from low to high values, they\n are reversed upon returning them to ensure they correspond to both ``fpr``\n and ``tpr``, which are sorted in reversed order during their calculation.\n\n References\n ----------\n .. [1] `Wikipedia entry for the Receiver operating characteristic\n `_\n\n .. [2] Fawcett T. An introduction to ROC analysis[J]. Pattern Recognition\n Letters, 2006, 27(8):861-874.\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn import metrics\n >>> y = np.array([1, 1, 2, 2])\n >>> scores = np.array([0.1, 0.4, 0.35, 0.8])\n >>> fpr, tpr, thresholds = metrics.roc_curve(y, scores, pos_label=2)\n >>> fpr\n array([0. , 0. , 0.5, 0.5, 1. ])\n >>> tpr\n array([0. , 0.5, 0.5, 1. , 1. 
])\n >>> thresholds\n array([1.8 , 0.8 , 0.4 , 0.35, 0.1 ])\n\n \"\"\"\n (fps, tps, thresholds) = _binary_clf_curve(y_true, y_score, pos_label=pos_label, sample_weight=sample_weight)\n if drop_intermediate and len(fps) > 2:\n optimal_idxs = np.where(np.r_[True, np.logical_or(np.diff(fps, 2), np.diff(tps, 2)), True])[0]\n fps = fps[optimal_idxs]\n tps = tps[optimal_idxs]\n thresholds = thresholds[optimal_idxs]\n tps = np.r_[0, tps]\n fps = np.r_[0, fps]\n thresholds = np.r_[thresholds[0] + 1, thresholds]\n if fps[-1] <= 0:\n warnings.warn('No negative samples in y_true, false positive value should be meaningless', UndefinedMetricWarning)\n fpr = np.repeat(np.nan, fps.shape)\n else:\n fpr = fps / fps[-1]\n if tps[-1] <= 0:\n warnings.warn('No positive samples in y_true, true positive value should be meaningless', UndefinedMetricWarning)\n tpr = np.repeat(np.nan, tps.shape)\n else:\n tpr = tps / tps[-1]\n return fpr, tpr, thresholds" }, { @@ -121114,7 +130281,8 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "True labels." - } + }, + "refined_type": {} }, { "name": "y_score", @@ -121123,8 +130291,9 @@ "assigned_by": "POSITION_OR_NAME", "docstring": { "type": "array-like of shape (n_samples,) or (n_samples, n_classes)", - "description": "Target scores. These can be either probability estimates or\nnon-thresholded decision values (as returned by\n:term:`decision_function` on some classifiers). The binary case expects\nscores with shape (n_samples,) while the multiclass case expects scores\nwith shape (n_samples, n_classes). In the multiclass case, the order of\nthe class scores must correspond to the order of ``labels``, if\nprovided, or else to the numerical or lexicographical order of the\nlabels in ``y_true``." - } + "description": "Target scores. These can be either probability estimates or\nnon-thresholded decision values (as returned by\n:term:`decision_function` on some classifiers).\nThe binary case expects scores with shape (n_samples,) while the\nmulticlass case expects scores with shape (n_samples, n_classes).\nIn the multiclass case, the order of the class scores must\ncorrespond to the order of ``labels``, if provided, or else to\nthe numerical or lexicographical order of the labels in ``y_true``.\nIf ``y_true`` does not contain all the labels, ``labels`` must be\nprovided." + }, + "refined_type": {} }, { "name": "k", @@ -121134,7 +130303,8 @@ "docstring": { "type": "int, default=2", "description": "Number of most likely outcomes considered to find the correct label." - } + }, + "refined_type": {} }, { "name": "normalize", @@ -121144,7 +130314,8 @@ "docstring": { "type": "bool, default=True", "description": "If `True`, return the fraction of correctly classified samples.\nOtherwise, return the number of correctly classified samples." - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -121154,7 +130325,8 @@ "docstring": { "type": "array-like of shape (n_samples,), default=None", "description": "Sample weights. If `None`, all samples are given the same weight." - } + }, + "refined_type": {} }, { "name": "labels", @@ -121163,15 +130335,16 @@ "assigned_by": "NAME_ONLY", "docstring": { "type": "array-like of shape (n_classes,), default=None", - "description": "Multiclass only. List of labels that index the classes in ``y_score``.\nIf ``None``, the numerical or lexicographical order of the labels in\n``y_true`` is used." - } + "description": "Multiclass only. 
List of labels that index the classes in ``y_score``.\nIf ``None``, the numerical or lexicographical order of the labels in\n``y_true`` is used. If ``y_true`` does not contain all the labels,\n``labels`` must be provided." + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Top-k Accuracy classification score.\n\nThis metric computes the number of times where the correct label is among the top `k` labels predicted (ranked by predicted scores). Note that the multilabel case isn't covered here. Read more in the :ref:`User Guide `", - "docstring": "Top-k Accuracy classification score.\n\nThis metric computes the number of times where the correct label is among\nthe top `k` labels predicted (ranked by predicted scores). Note that the\nmultilabel case isn't covered here.\n\nRead more in the :ref:`User Guide `\n\nParameters\n----------\ny_true : array-like of shape (n_samples,)\n True labels.\n\ny_score : array-like of shape (n_samples,) or (n_samples, n_classes)\n Target scores. These can be either probability estimates or\n non-thresholded decision values (as returned by\n :term:`decision_function` on some classifiers). The binary case expects\n scores with shape (n_samples,) while the multiclass case expects scores\n with shape (n_samples, n_classes). In the multiclass case, the order of\n the class scores must correspond to the order of ``labels``, if\n provided, or else to the numerical or lexicographical order of the\n labels in ``y_true``.\n\nk : int, default=2\n Number of most likely outcomes considered to find the correct label.\n\nnormalize : bool, default=True\n If `True`, return the fraction of correctly classified samples.\n Otherwise, return the number of correctly classified samples.\n\nsample_weight : array-like of shape (n_samples,), default=None\n Sample weights. If `None`, all samples are given the same weight.\n\nlabels : array-like of shape (n_classes,), default=None\n Multiclass only. List of labels that index the classes in ``y_score``.\n If ``None``, the numerical or lexicographical order of the labels in\n ``y_true`` is used.\n\nReturns\n-------\nscore : float\n The top-k accuracy score. The best performance is 1 with\n `normalize == True` and the number of samples with\n `normalize == False`.\n\nSee also\n--------\naccuracy_score\n\nNotes\n-----\nIn cases where two or more labels are assigned equal predicted scores,\nthe labels with the highest indices will be chosen first. This might\nimpact the result if the correct label falls after the threshold because\nof that.\n\nExamples\n--------\n>>> import numpy as np\n>>> from sklearn.metrics import top_k_accuracy_score\n>>> y_true = np.array([0, 1, 2, 2])\n>>> y_score = np.array([[0.5, 0.2, 0.2], # 0 is in top 2\n... [0.3, 0.4, 0.2], # 1 is in top 2\n... [0.2, 0.4, 0.3], # 2 is in top 2\n... [0.7, 0.2, 0.1]]) # 2 isn't in top 2\n>>> top_k_accuracy_score(y_true, y_score, k=2)\n0.75\n>>> # Not normalizing gives the number of \"correctly\" classified samples\n>>> top_k_accuracy_score(y_true, y_score, k=2, normalize=False)\n3", - "source_code": "\ndef top_k_accuracy_score(y_true, y_score, *, k=2, normalize=True, sample_weight=None, labels=None):\n \"\"\"Top-k Accuracy classification score.\n\n This metric computes the number of times where the correct label is among\n the top `k` labels predicted (ranked by predicted scores). 
Note that the\n multilabel case isn't covered here.\n\n Read more in the :ref:`User Guide `\n\n Parameters\n ----------\n y_true : array-like of shape (n_samples,)\n True labels.\n\n y_score : array-like of shape (n_samples,) or (n_samples, n_classes)\n Target scores. These can be either probability estimates or\n non-thresholded decision values (as returned by\n :term:`decision_function` on some classifiers). The binary case expects\n scores with shape (n_samples,) while the multiclass case expects scores\n with shape (n_samples, n_classes). In the multiclass case, the order of\n the class scores must correspond to the order of ``labels``, if\n provided, or else to the numerical or lexicographical order of the\n labels in ``y_true``.\n\n k : int, default=2\n Number of most likely outcomes considered to find the correct label.\n\n normalize : bool, default=True\n If `True`, return the fraction of correctly classified samples.\n Otherwise, return the number of correctly classified samples.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights. If `None`, all samples are given the same weight.\n\n labels : array-like of shape (n_classes,), default=None\n Multiclass only. List of labels that index the classes in ``y_score``.\n If ``None``, the numerical or lexicographical order of the labels in\n ``y_true`` is used.\n\n Returns\n -------\n score : float\n The top-k accuracy score. The best performance is 1 with\n `normalize == True` and the number of samples with\n `normalize == False`.\n\n See also\n --------\n accuracy_score\n\n Notes\n -----\n In cases where two or more labels are assigned equal predicted scores,\n the labels with the highest indices will be chosen first. This might\n impact the result if the correct label falls after the threshold because\n of that.\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.metrics import top_k_accuracy_score\n >>> y_true = np.array([0, 1, 2, 2])\n >>> y_score = np.array([[0.5, 0.2, 0.2], # 0 is in top 2\n ... [0.3, 0.4, 0.2], # 1 is in top 2\n ... [0.2, 0.4, 0.3], # 2 is in top 2\n ... 
[0.7, 0.2, 0.1]]) # 2 isn't in top 2\n >>> top_k_accuracy_score(y_true, y_score, k=2)\n 0.75\n >>> # Not normalizing gives the number of \"correctly\" classified samples\n >>> top_k_accuracy_score(y_true, y_score, k=2, normalize=False)\n 3\n\n \"\"\"\n y_true = check_array(y_true, ensure_2d=False, dtype=None)\n y_true = column_or_1d(y_true)\n y_type = type_of_target(y_true)\n if y_type == 'binary' and labels is not None and len(labels) > 2:\n y_type = 'multiclass'\n y_score = check_array(y_score, ensure_2d=False)\n y_score = column_or_1d(y_score) if y_type == 'binary' else y_score\n check_consistent_length(y_true, y_score, sample_weight)\n if y_type not in {'binary', 'multiclass'}:\n raise ValueError(f\"y type must be 'binary' or 'multiclass', got '{y_type}' instead.\")\n y_score_n_classes = y_score.shape[1] if y_score.ndim == 2 else 2\n if labels is None:\n classes = _unique(y_true)\n n_classes = len(classes)\n if n_classes != y_score_n_classes:\n raise ValueError(f\"Number of classes in 'y_true' ({n_classes}) not equal to the number of classes in 'y_score' ({y_score_n_classes}).\")\n else:\n labels = column_or_1d(labels)\n classes = _unique(labels)\n n_labels = len(labels)\n n_classes = len(classes)\n if n_classes != n_labels:\n raise ValueError(\"Parameter 'labels' must be unique.\")\n if not np.array_equal(classes, labels):\n raise ValueError(\"Parameter 'labels' must be ordered.\")\n if n_classes != y_score_n_classes:\n raise ValueError(f\"Number of given labels ({n_classes}) not equal to the number of classes in 'y_score' ({y_score_n_classes}).\")\n if len(np.setdiff1d(y_true, classes)):\n raise ValueError(\"'y_true' contains labels not in parameter 'labels'.\")\n if k >= n_classes:\n warnings.warn(f\"'k' ({k}) greater than or equal to 'n_classes' ({n_classes}) will result in a perfect score and is therefore meaningless.\", UndefinedMetricWarning)\n y_true_encoded = _encode(y_true, uniques=classes)\n if y_type == 'binary':\n if k == 1:\n threshold = 0.5 if y_score.min() >= 0 and y_score.max() <= 1 else 0\n y_pred = (y_score > threshold).astype(np.int64)\n hits = y_pred == y_true_encoded\n else:\n hits = np.ones_like(y_score, dtype=np.bool_)\n elif y_type == 'multiclass':\n sorted_pred = np.argsort(y_score, axis=1, kind='mergesort')[:, ::-1]\n hits = (y_true_encoded == sorted_pred[:, :k].T).any(axis=0)\n if normalize:\n return np.average(hits, weights=sample_weight)\n elif sample_weight is None:\n return np.sum(hits)\n else:\n return np.dot(hits, sample_weight)" + "description": "Top-k Accuracy classification score.\n\nThis metric computes the number of times where the correct label is among\nthe top `k` labels predicted (ranked by predicted scores). Note that the\nmultilabel case isn't covered here.\n\nRead more in the :ref:`User Guide `", + "docstring": "Top-k Accuracy classification score.\n\n This metric computes the number of times where the correct label is among\n the top `k` labels predicted (ranked by predicted scores). Note that the\n multilabel case isn't covered here.\n\n Read more in the :ref:`User Guide `\n\n Parameters\n ----------\n y_true : array-like of shape (n_samples,)\n True labels.\n\n y_score : array-like of shape (n_samples,) or (n_samples, n_classes)\n Target scores. 
These can be either probability estimates or\n non-thresholded decision values (as returned by\n :term:`decision_function` on some classifiers).\n The binary case expects scores with shape (n_samples,) while the\n multiclass case expects scores with shape (n_samples, n_classes).\n In the multiclass case, the order of the class scores must\n correspond to the order of ``labels``, if provided, or else to\n the numerical or lexicographical order of the labels in ``y_true``.\n If ``y_true`` does not contain all the labels, ``labels`` must be\n provided.\n\n k : int, default=2\n Number of most likely outcomes considered to find the correct label.\n\n normalize : bool, default=True\n If `True`, return the fraction of correctly classified samples.\n Otherwise, return the number of correctly classified samples.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights. If `None`, all samples are given the same weight.\n\n labels : array-like of shape (n_classes,), default=None\n Multiclass only. List of labels that index the classes in ``y_score``.\n If ``None``, the numerical or lexicographical order of the labels in\n ``y_true`` is used. If ``y_true`` does not contain all the labels,\n ``labels`` must be provided.\n\n Returns\n -------\n score : float\n The top-k accuracy score. The best performance is 1 with\n `normalize == True` and the number of samples with\n `normalize == False`.\n\n See also\n --------\n accuracy_score\n\n Notes\n -----\n In cases where two or more labels are assigned equal predicted scores,\n the labels with the highest indices will be chosen first. This might\n impact the result if the correct label falls after the threshold because\n of that.\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.metrics import top_k_accuracy_score\n >>> y_true = np.array([0, 1, 2, 2])\n >>> y_score = np.array([[0.5, 0.2, 0.2], # 0 is in top 2\n ... [0.3, 0.4, 0.2], # 1 is in top 2\n ... [0.2, 0.4, 0.3], # 2 is in top 2\n ... [0.7, 0.2, 0.1]]) # 2 isn't in top 2\n >>> top_k_accuracy_score(y_true, y_score, k=2)\n 0.75\n >>> # Not normalizing gives the number of \"correctly\" classified samples\n >>> top_k_accuracy_score(y_true, y_score, k=2, normalize=False)\n 3\n\n ", + "source_code": "\ndef top_k_accuracy_score(y_true, y_score, *, k=2, normalize=True, sample_weight=None, labels=None):\n \"\"\"Top-k Accuracy classification score.\n\n This metric computes the number of times where the correct label is among\n the top `k` labels predicted (ranked by predicted scores). Note that the\n multilabel case isn't covered here.\n\n Read more in the :ref:`User Guide `\n\n Parameters\n ----------\n y_true : array-like of shape (n_samples,)\n True labels.\n\n y_score : array-like of shape (n_samples,) or (n_samples, n_classes)\n Target scores. 
These can be either probability estimates or\n non-thresholded decision values (as returned by\n :term:`decision_function` on some classifiers).\n The binary case expects scores with shape (n_samples,) while the\n multiclass case expects scores with shape (n_samples, n_classes).\n In the multiclass case, the order of the class scores must\n correspond to the order of ``labels``, if provided, or else to\n the numerical or lexicographical order of the labels in ``y_true``.\n If ``y_true`` does not contain all the labels, ``labels`` must be\n provided.\n\n k : int, default=2\n Number of most likely outcomes considered to find the correct label.\n\n normalize : bool, default=True\n If `True`, return the fraction of correctly classified samples.\n Otherwise, return the number of correctly classified samples.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights. If `None`, all samples are given the same weight.\n\n labels : array-like of shape (n_classes,), default=None\n Multiclass only. List of labels that index the classes in ``y_score``.\n If ``None``, the numerical or lexicographical order of the labels in\n ``y_true`` is used. If ``y_true`` does not contain all the labels,\n ``labels`` must be provided.\n\n Returns\n -------\n score : float\n The top-k accuracy score. The best performance is 1 with\n `normalize == True` and the number of samples with\n `normalize == False`.\n\n See also\n --------\n accuracy_score\n\n Notes\n -----\n In cases where two or more labels are assigned equal predicted scores,\n the labels with the highest indices will be chosen first. This might\n impact the result if the correct label falls after the threshold because\n of that.\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.metrics import top_k_accuracy_score\n >>> y_true = np.array([0, 1, 2, 2])\n >>> y_score = np.array([[0.5, 0.2, 0.2], # 0 is in top 2\n ... [0.3, 0.4, 0.2], # 1 is in top 2\n ... [0.2, 0.4, 0.3], # 2 is in top 2\n ... 
[0.7, 0.2, 0.1]]) # 2 isn't in top 2\n >>> top_k_accuracy_score(y_true, y_score, k=2)\n 0.75\n >>> # Not normalizing gives the number of \"correctly\" classified samples\n >>> top_k_accuracy_score(y_true, y_score, k=2, normalize=False)\n 3\n\n \"\"\"\n y_true = check_array(y_true, ensure_2d=False, dtype=None)\n y_true = column_or_1d(y_true)\n y_type = type_of_target(y_true)\n if y_type == 'binary' and labels is not None and len(labels) > 2:\n y_type = 'multiclass'\n y_score = check_array(y_score, ensure_2d=False)\n y_score = column_or_1d(y_score) if y_type == 'binary' else y_score\n check_consistent_length(y_true, y_score, sample_weight)\n if y_type not in {'binary', 'multiclass'}:\n raise ValueError(f\"y type must be 'binary' or 'multiclass', got '{y_type}' instead.\")\n y_score_n_classes = y_score.shape[1] if y_score.ndim == 2 else 2\n if labels is None:\n classes = _unique(y_true)\n n_classes = len(classes)\n if n_classes != y_score_n_classes:\n raise ValueError(f\"Number of classes in 'y_true' ({n_classes}) not equal to the number of classes in 'y_score' ({y_score_n_classes}).\")\n else:\n labels = column_or_1d(labels)\n classes = _unique(labels)\n n_labels = len(labels)\n n_classes = len(classes)\n if n_classes != n_labels:\n raise ValueError(\"Parameter 'labels' must be unique.\")\n if not np.array_equal(classes, labels):\n raise ValueError(\"Parameter 'labels' must be ordered.\")\n if n_classes != y_score_n_classes:\n raise ValueError(f\"Number of given labels ({n_classes}) not equal to the number of classes in 'y_score' ({y_score_n_classes}).\")\n if len(np.setdiff1d(y_true, classes)):\n raise ValueError(\"'y_true' contains labels not in parameter 'labels'.\")\n if k >= n_classes:\n warnings.warn(f\"'k' ({k}) greater than or equal to 'n_classes' ({n_classes}) will result in a perfect score and is therefore meaningless.\", UndefinedMetricWarning)\n y_true_encoded = _encode(y_true, uniques=classes)\n if y_type == 'binary':\n if k == 1:\n threshold = 0.5 if y_score.min() >= 0 and y_score.max() <= 1 else 0\n y_pred = (y_score > threshold).astype(np.int64)\n hits = y_pred == y_true_encoded\n else:\n hits = np.ones_like(y_score, dtype=np.bool_)\n elif y_type == 'multiclass':\n sorted_pred = np.argsort(y_score, axis=1, kind='mergesort')[:, ::-1]\n hits = (y_true_encoded == sorted_pred[:, :k].T).any(axis=0)\n if normalize:\n return np.average(hits, weights=sample_weight)\n elif sample_weight is None:\n return np.sum(hits)\n else:\n return np.dot(hits, sample_weight)" }, { "name": "_check_reg_targets", @@ -121188,7 +130361,8 @@ "docstring": { "type": "array-like", "description": "" - } + }, + "refined_type": {} }, { "name": "y_pred", @@ -121198,7 +130372,8 @@ "docstring": { "type": "array-like", "description": "" - } + }, + "refined_type": {} }, { "name": "multioutput", @@ -121208,7 +130383,8 @@ "docstring": { "type": "array-like or string in ['raw_values', uniform_average',", "description": "'variance_weighted'] or None\nNone is accepted due to backward compatibility of r2_score()." 
- } + }, + "refined_type": {} }, { "name": "dtype", @@ -121218,13 +130394,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Check that y_true and y_pred belong to the same regression task.", - "docstring": "Check that y_true and y_pred belong to the same regression task.\n\nParameters\n----------\ny_true : array-like\n\ny_pred : array-like\n\nmultioutput : array-like or string in ['raw_values', uniform_average',\n 'variance_weighted'] or None\n None is accepted due to backward compatibility of r2_score().\n\nReturns\n-------\ntype_true : one of {'continuous', continuous-multioutput'}\n The type of the true target data, as output by\n 'utils.multiclass.type_of_target'.\n\ny_true : array-like of shape (n_samples, n_outputs)\n Ground truth (correct) target values.\n\ny_pred : array-like of shape (n_samples, n_outputs)\n Estimated target values.\n\nmultioutput : array-like of shape (n_outputs) or string in ['raw_values',\n uniform_average', 'variance_weighted'] or None\n Custom output weights if ``multioutput`` is array-like or\n just the corresponding argument if ``multioutput`` is a\n correct keyword.\n\ndtype : str or list, default=\"numeric\"\n the dtype argument passed to check_array.", + "docstring": "Check that y_true and y_pred belong to the same regression task.\n\n Parameters\n ----------\n y_true : array-like\n\n y_pred : array-like\n\n multioutput : array-like or string in ['raw_values', uniform_average',\n 'variance_weighted'] or None\n None is accepted due to backward compatibility of r2_score().\n\n Returns\n -------\n type_true : one of {'continuous', continuous-multioutput'}\n The type of the true target data, as output by\n 'utils.multiclass.type_of_target'.\n\n y_true : array-like of shape (n_samples, n_outputs)\n Ground truth (correct) target values.\n\n y_pred : array-like of shape (n_samples, n_outputs)\n Estimated target values.\n\n multioutput : array-like of shape (n_outputs) or string in ['raw_values',\n uniform_average', 'variance_weighted'] or None\n Custom output weights if ``multioutput`` is array-like or\n just the corresponding argument if ``multioutput`` is a\n correct keyword.\n\n dtype : str or list, default=\"numeric\"\n the dtype argument passed to check_array.\n ", "source_code": "\ndef _check_reg_targets(y_true, y_pred, multioutput, dtype='numeric'):\n \"\"\"Check that y_true and y_pred belong to the same regression task.\n\n Parameters\n ----------\n y_true : array-like\n\n y_pred : array-like\n\n multioutput : array-like or string in ['raw_values', uniform_average',\n 'variance_weighted'] or None\n None is accepted due to backward compatibility of r2_score().\n\n Returns\n -------\n type_true : one of {'continuous', continuous-multioutput'}\n The type of the true target data, as output by\n 'utils.multiclass.type_of_target'.\n\n y_true : array-like of shape (n_samples, n_outputs)\n Ground truth (correct) target values.\n\n y_pred : array-like of shape (n_samples, n_outputs)\n Estimated target values.\n\n multioutput : array-like of shape (n_outputs) or string in ['raw_values',\n uniform_average', 'variance_weighted'] or None\n Custom output weights if ``multioutput`` is array-like or\n just the corresponding argument if ``multioutput`` is a\n correct keyword.\n\n dtype : str or list, default=\"numeric\"\n the dtype argument passed to check_array.\n \"\"\"\n check_consistent_length(y_true, y_pred)\n y_true = check_array(y_true, ensure_2d=False, dtype=dtype)\n y_pred = 
check_array(y_pred, ensure_2d=False, dtype=dtype)\n if y_true.ndim == 1:\n y_true = y_true.reshape((-1, 1))\n if y_pred.ndim == 1:\n y_pred = y_pred.reshape((-1, 1))\n if y_true.shape[1] != y_pred.shape[1]:\n raise ValueError('y_true and y_pred have different number of output ({0}!={1})'.format(y_true.shape[1], y_pred.shape[1]))\n n_outputs = y_true.shape[1]\n allowed_multioutput_str = ('raw_values', 'uniform_average', 'variance_weighted')\n if isinstance(multioutput, str):\n if multioutput not in allowed_multioutput_str:\n raise ValueError(\"Allowed 'multioutput' string values are {}. You provided multioutput={!r}\".format(allowed_multioutput_str, multioutput))\n elif multioutput is not None:\n multioutput = check_array(multioutput, ensure_2d=False)\n if n_outputs == 1:\n raise ValueError('Custom weights are useful only in multi-output cases.')\n elif n_outputs != len(multioutput):\n raise ValueError('There must be equally many custom weights (%d) as outputs (%d).' % (len(multioutput), n_outputs))\n y_type = 'continuous' if n_outputs == 1 else 'continuous-multioutput'\n return y_type, y_true, y_pred, multioutput" }, { @@ -121242,7 +130419,8 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "Ground truth (correct) target values." - } + }, + "refined_type": {} }, { "name": "y_pred", @@ -121252,7 +130430,8 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "Estimated target values." - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -121262,7 +130441,8 @@ "docstring": { "type": "array-like of shape (n_samples,), optional", "description": "Sample weights." - } + }, + "refined_type": {} }, { "name": "power", @@ -121272,13 +130452,14 @@ "docstring": { "type": "float, default=0", "description": "Tweedie power parameter. Either power <= 0 or power >= 1.\n\nThe higher `p` the less weight is given to extreme\ndeviations between true and predicted targets.\n\n- power < 0: Extreme stable distribution. Requires: y_pred > 0.\n- power = 0 : Normal distribution, output corresponds to r2_score.\n y_true and y_pred can be any real numbers.\n- power = 1 : Poisson distribution. Requires: y_true >= 0 and\n y_pred > 0.\n- 1 < p < 2 : Compound Poisson distribution. Requires: y_true >= 0\n and y_pred > 0.\n- power = 2 : Gamma distribution. Requires: y_true > 0 and y_pred > 0.\n- power = 3 : Inverse Gaussian distribution. Requires: y_true > 0\n and y_pred > 0.\n- otherwise : Positive stable distribution. Requires: y_true > 0\n and y_pred > 0." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "D^2 regression score function, percentage of Tweedie deviance explained.\n\nBest possible score is 1.0 and it can be negative (because the model can be arbitrarily worse). A model that always uses the empirical mean of `y_true` as constant prediction, disregarding the input features, gets a D^2 score of 0.0. Read more in the :ref:`User Guide `. .. versionadded:: 1.0", - "docstring": "D^2 regression score function, percentage of Tweedie deviance explained.\n\nBest possible score is 1.0 and it can be negative (because the model can be\narbitrarily worse). A model that always uses the empirical mean of `y_true` as\nconstant prediction, disregarding the input features, gets a D^2 score of 0.0.\n\nRead more in the :ref:`User Guide `.\n\n.. 
versionadded:: 1.0\n\nParameters\n----------\ny_true : array-like of shape (n_samples,)\n Ground truth (correct) target values.\n\ny_pred : array-like of shape (n_samples,)\n Estimated target values.\n\nsample_weight : array-like of shape (n_samples,), optional\n Sample weights.\n\npower : float, default=0\n Tweedie power parameter. Either power <= 0 or power >= 1.\n\n The higher `p` the less weight is given to extreme\n deviations between true and predicted targets.\n\n - power < 0: Extreme stable distribution. Requires: y_pred > 0.\n - power = 0 : Normal distribution, output corresponds to r2_score.\n y_true and y_pred can be any real numbers.\n - power = 1 : Poisson distribution. Requires: y_true >= 0 and\n y_pred > 0.\n - 1 < p < 2 : Compound Poisson distribution. Requires: y_true >= 0\n and y_pred > 0.\n - power = 2 : Gamma distribution. Requires: y_true > 0 and y_pred > 0.\n - power = 3 : Inverse Gaussian distribution. Requires: y_true > 0\n and y_pred > 0.\n - otherwise : Positive stable distribution. Requires: y_true > 0\n and y_pred > 0.\n\nReturns\n-------\nz : float or ndarray of floats\n The D^2 score.\n\nNotes\n-----\nThis is not a symmetric function.\n\nLike R^2, D^2 score may be negative (it need not actually be the square of\na quantity D).\n\nThis metric is not well-defined for single samples and will return a NaN\nvalue if n_samples is less than two.\n\nReferences\n----------\n.. [1] Eq. (3.11) of Hastie, Trevor J., Robert Tibshirani and Martin J.\n Wainwright. \"Statistical Learning with Sparsity: The Lasso and\n Generalizations.\" (2015). https://trevorhastie.github.io\n\nExamples\n--------\n>>> from sklearn.metrics import d2_tweedie_score\n>>> y_true = [0.5, 1, 2.5, 7]\n>>> y_pred = [1, 1, 5, 3.5]\n>>> d2_tweedie_score(y_true, y_pred)\n0.285...\n>>> d2_tweedie_score(y_true, y_pred, power=1)\n0.487...\n>>> d2_tweedie_score(y_true, y_pred, power=2)\n0.630...\n>>> d2_tweedie_score(y_true, y_true, power=2)\n1.0", + "description": "D^2 regression score function, percentage of Tweedie deviance explained.\n\nBest possible score is 1.0 and it can be negative (because the model can be\narbitrarily worse). A model that always uses the empirical mean of `y_true` as\nconstant prediction, disregarding the input features, gets a D^2 score of 0.0.\n\nRead more in the :ref:`User Guide `.\n\n.. versionadded:: 1.0", + "docstring": "D^2 regression score function, percentage of Tweedie deviance explained.\n\n Best possible score is 1.0 and it can be negative (because the model can be\n arbitrarily worse). A model that always uses the empirical mean of `y_true` as\n constant prediction, disregarding the input features, gets a D^2 score of 0.0.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 1.0\n\n Parameters\n ----------\n y_true : array-like of shape (n_samples,)\n Ground truth (correct) target values.\n\n y_pred : array-like of shape (n_samples,)\n Estimated target values.\n\n sample_weight : array-like of shape (n_samples,), optional\n Sample weights.\n\n power : float, default=0\n Tweedie power parameter. Either power <= 0 or power >= 1.\n\n The higher `p` the less weight is given to extreme\n deviations between true and predicted targets.\n\n - power < 0: Extreme stable distribution. Requires: y_pred > 0.\n - power = 0 : Normal distribution, output corresponds to r2_score.\n y_true and y_pred can be any real numbers.\n - power = 1 : Poisson distribution. Requires: y_true >= 0 and\n y_pred > 0.\n - 1 < p < 2 : Compound Poisson distribution. 
Requires: y_true >= 0\n and y_pred > 0.\n - power = 2 : Gamma distribution. Requires: y_true > 0 and y_pred > 0.\n - power = 3 : Inverse Gaussian distribution. Requires: y_true > 0\n and y_pred > 0.\n - otherwise : Positive stable distribution. Requires: y_true > 0\n and y_pred > 0.\n\n Returns\n -------\n z : float or ndarray of floats\n The D^2 score.\n\n Notes\n -----\n This is not a symmetric function.\n\n Like R^2, D^2 score may be negative (it need not actually be the square of\n a quantity D).\n\n This metric is not well-defined for single samples and will return a NaN\n value if n_samples is less than two.\n\n References\n ----------\n .. [1] Eq. (3.11) of Hastie, Trevor J., Robert Tibshirani and Martin J.\n Wainwright. \"Statistical Learning with Sparsity: The Lasso and\n Generalizations.\" (2015). https://trevorhastie.github.io\n\n Examples\n --------\n >>> from sklearn.metrics import d2_tweedie_score\n >>> y_true = [0.5, 1, 2.5, 7]\n >>> y_pred = [1, 1, 5, 3.5]\n >>> d2_tweedie_score(y_true, y_pred)\n 0.285...\n >>> d2_tweedie_score(y_true, y_pred, power=1)\n 0.487...\n >>> d2_tweedie_score(y_true, y_pred, power=2)\n 0.630...\n >>> d2_tweedie_score(y_true, y_true, power=2)\n 1.0\n ", "source_code": "\ndef d2_tweedie_score(y_true, y_pred, *, sample_weight=None, power=0):\n \"\"\"D^2 regression score function, percentage of Tweedie deviance explained.\n\n Best possible score is 1.0 and it can be negative (because the model can be\n arbitrarily worse). A model that always uses the empirical mean of `y_true` as\n constant prediction, disregarding the input features, gets a D^2 score of 0.0.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 1.0\n\n Parameters\n ----------\n y_true : array-like of shape (n_samples,)\n Ground truth (correct) target values.\n\n y_pred : array-like of shape (n_samples,)\n Estimated target values.\n\n sample_weight : array-like of shape (n_samples,), optional\n Sample weights.\n\n power : float, default=0\n Tweedie power parameter. Either power <= 0 or power >= 1.\n\n The higher `p` the less weight is given to extreme\n deviations between true and predicted targets.\n\n - power < 0: Extreme stable distribution. Requires: y_pred > 0.\n - power = 0 : Normal distribution, output corresponds to r2_score.\n y_true and y_pred can be any real numbers.\n - power = 1 : Poisson distribution. Requires: y_true >= 0 and\n y_pred > 0.\n - 1 < p < 2 : Compound Poisson distribution. Requires: y_true >= 0\n and y_pred > 0.\n - power = 2 : Gamma distribution. Requires: y_true > 0 and y_pred > 0.\n - power = 3 : Inverse Gaussian distribution. Requires: y_true > 0\n and y_pred > 0.\n - otherwise : Positive stable distribution. Requires: y_true > 0\n and y_pred > 0.\n\n Returns\n -------\n z : float or ndarray of floats\n The D^2 score.\n\n Notes\n -----\n This is not a symmetric function.\n\n Like R^2, D^2 score may be negative (it need not actually be the square of\n a quantity D).\n\n This metric is not well-defined for single samples and will return a NaN\n value if n_samples is less than two.\n\n References\n ----------\n .. [1] Eq. (3.11) of Hastie, Trevor J., Robert Tibshirani and Martin J.\n Wainwright. \"Statistical Learning with Sparsity: The Lasso and\n Generalizations.\" (2015). 
https://trevorhastie.github.io\n\n Examples\n --------\n >>> from sklearn.metrics import d2_tweedie_score\n >>> y_true = [0.5, 1, 2.5, 7]\n >>> y_pred = [1, 1, 5, 3.5]\n >>> d2_tweedie_score(y_true, y_pred)\n 0.285...\n >>> d2_tweedie_score(y_true, y_pred, power=1)\n 0.487...\n >>> d2_tweedie_score(y_true, y_pred, power=2)\n 0.630...\n >>> d2_tweedie_score(y_true, y_true, power=2)\n 1.0\n \"\"\"\n (y_type, y_true, y_pred, _) = _check_reg_targets(y_true, y_pred, None, dtype=[np.float64, np.float32])\n if y_type == 'continuous-multioutput':\n raise ValueError('Multioutput not supported in d2_tweedie_score')\n check_consistent_length(y_true, y_pred, sample_weight)\n if _num_samples(y_pred) < 2:\n msg = 'D^2 score is not well-defined with less than two samples.'\n warnings.warn(msg, UndefinedMetricWarning)\n return float('nan')\n if sample_weight is not None:\n sample_weight = column_or_1d(sample_weight)\n sample_weight = sample_weight[:, np.newaxis]\n dist = TweedieDistribution(power=power)\n dev = dist.unit_deviance(y_true, y_pred, check_input=True)\n numerator = np.average(dev, weights=sample_weight)\n y_avg = np.average(y_true, weights=sample_weight)\n dev = dist.unit_deviance(y_true, y_avg, check_input=True)\n denominator = np.average(dev, weights=sample_weight)\n return 1 - numerator / denominator" }, { @@ -121296,7 +130477,8 @@ "docstring": { "type": "array-like of shape (n_samples,) or (n_samples, n_outputs)", "description": "Ground truth (correct) target values." - } + }, + "refined_type": {} }, { "name": "y_pred", @@ -121306,7 +130488,8 @@ "docstring": { "type": "array-like of shape (n_samples,) or (n_samples, n_outputs)", "description": "Estimated target values." - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -121316,7 +130499,8 @@ "docstring": { "type": "array-like of shape (n_samples,), default=None", "description": "Sample weights." - } + }, + "refined_type": {} }, { "name": "multioutput", @@ -121326,13 +130510,21 @@ "docstring": { "type": "{'raw_values', 'uniform_average', 'variance_weighted'} or array-like of shape (n_outputs,), default='uniform_average'", "description": "Defines aggregating of multiple output scores.\nArray-like value defines weights used to average scores.\n\n'raw_values' :\n Returns a full set of scores in case of multioutput input.\n\n'uniform_average' :\n Scores of all outputs are averaged with uniform weight.\n\n'variance_weighted' :\n Scores of all outputs are averaged, weighted by the variances\n of each individual output." + }, + "refined_type": { + "kind": "EnumType", + "values": [ + "raw_values", + "uniform_average", + "variance_weighted" + ] } } ], "results": [], "is_public": true, - "description": "Explained variance regression score function.\n\nBest possible score is 1.0, lower values are worse. 
Read more in the :ref:`User Guide `.", - "docstring": "Explained variance regression score function.\n\nBest possible score is 1.0, lower values are worse.\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\ny_true : array-like of shape (n_samples,) or (n_samples, n_outputs)\n Ground truth (correct) target values.\n\ny_pred : array-like of shape (n_samples,) or (n_samples, n_outputs)\n Estimated target values.\n\nsample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\nmultioutput : {'raw_values', 'uniform_average', 'variance_weighted'} or array-like of shape (n_outputs,), default='uniform_average'\n Defines aggregating of multiple output scores.\n Array-like value defines weights used to average scores.\n\n 'raw_values' :\n Returns a full set of scores in case of multioutput input.\n\n 'uniform_average' :\n Scores of all outputs are averaged with uniform weight.\n\n 'variance_weighted' :\n Scores of all outputs are averaged, weighted by the variances\n of each individual output.\n\nReturns\n-------\nscore : float or ndarray of floats\n The explained variance or ndarray if 'multioutput' is 'raw_values'.\n\nNotes\n-----\nThis is not a symmetric function.\n\nExamples\n--------\n>>> from sklearn.metrics import explained_variance_score\n>>> y_true = [3, -0.5, 2, 7]\n>>> y_pred = [2.5, 0.0, 2, 8]\n>>> explained_variance_score(y_true, y_pred)\n0.957...\n>>> y_true = [[0.5, 1], [-1, 1], [7, -6]]\n>>> y_pred = [[0, 2], [-1, 2], [8, -5]]\n>>> explained_variance_score(y_true, y_pred, multioutput='uniform_average')\n0.983...", + "description": "Explained variance regression score function.\n\nBest possible score is 1.0, lower values are worse.\n\nRead more in the :ref:`User Guide `.", + "docstring": "Explained variance regression score function.\n\n Best possible score is 1.0, lower values are worse.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n y_true : array-like of shape (n_samples,) or (n_samples, n_outputs)\n Ground truth (correct) target values.\n\n y_pred : array-like of shape (n_samples,) or (n_samples, n_outputs)\n Estimated target values.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n multioutput : {'raw_values', 'uniform_average', 'variance_weighted'} or array-like of shape (n_outputs,), default='uniform_average'\n Defines aggregating of multiple output scores.\n Array-like value defines weights used to average scores.\n\n 'raw_values' :\n Returns a full set of scores in case of multioutput input.\n\n 'uniform_average' :\n Scores of all outputs are averaged with uniform weight.\n\n 'variance_weighted' :\n Scores of all outputs are averaged, weighted by the variances\n of each individual output.\n\n Returns\n -------\n score : float or ndarray of floats\n The explained variance or ndarray if 'multioutput' is 'raw_values'.\n\n Notes\n -----\n This is not a symmetric function.\n\n Examples\n --------\n >>> from sklearn.metrics import explained_variance_score\n >>> y_true = [3, -0.5, 2, 7]\n >>> y_pred = [2.5, 0.0, 2, 8]\n >>> explained_variance_score(y_true, y_pred)\n 0.957...\n >>> y_true = [[0.5, 1], [-1, 1], [7, -6]]\n >>> y_pred = [[0, 2], [-1, 2], [8, -5]]\n >>> explained_variance_score(y_true, y_pred, multioutput='uniform_average')\n 0.983...\n ", "source_code": "\ndef explained_variance_score(y_true, y_pred, *, sample_weight=None, multioutput='uniform_average'):\n \"\"\"Explained variance regression score function.\n\n Best possible score is 1.0, lower values are worse.\n\n Read 
more in the :ref:`User Guide `.\n\n Parameters\n ----------\n y_true : array-like of shape (n_samples,) or (n_samples, n_outputs)\n Ground truth (correct) target values.\n\n y_pred : array-like of shape (n_samples,) or (n_samples, n_outputs)\n Estimated target values.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n multioutput : {'raw_values', 'uniform_average', 'variance_weighted'} or array-like of shape (n_outputs,), default='uniform_average'\n Defines aggregating of multiple output scores.\n Array-like value defines weights used to average scores.\n\n 'raw_values' :\n Returns a full set of scores in case of multioutput input.\n\n 'uniform_average' :\n Scores of all outputs are averaged with uniform weight.\n\n 'variance_weighted' :\n Scores of all outputs are averaged, weighted by the variances\n of each individual output.\n\n Returns\n -------\n score : float or ndarray of floats\n The explained variance or ndarray if 'multioutput' is 'raw_values'.\n\n Notes\n -----\n This is not a symmetric function.\n\n Examples\n --------\n >>> from sklearn.metrics import explained_variance_score\n >>> y_true = [3, -0.5, 2, 7]\n >>> y_pred = [2.5, 0.0, 2, 8]\n >>> explained_variance_score(y_true, y_pred)\n 0.957...\n >>> y_true = [[0.5, 1], [-1, 1], [7, -6]]\n >>> y_pred = [[0, 2], [-1, 2], [8, -5]]\n >>> explained_variance_score(y_true, y_pred, multioutput='uniform_average')\n 0.983...\n \"\"\"\n (y_type, y_true, y_pred, multioutput) = _check_reg_targets(y_true, y_pred, multioutput)\n check_consistent_length(y_true, y_pred, sample_weight)\n y_diff_avg = np.average(y_true - y_pred, weights=sample_weight, axis=0)\n numerator = np.average((y_true - y_pred - y_diff_avg)**2, weights=sample_weight, axis=0)\n y_true_avg = np.average(y_true, weights=sample_weight, axis=0)\n denominator = np.average((y_true - y_true_avg)**2, weights=sample_weight, axis=0)\n nonzero_numerator = numerator != 0\n nonzero_denominator = denominator != 0\n valid_score = nonzero_numerator & nonzero_denominator\n output_scores = np.ones(y_true.shape[1])\n output_scores[valid_score] = 1 - numerator[valid_score] / denominator[valid_score]\n output_scores[nonzero_numerator & ~nonzero_denominator] = 0.0\n if isinstance(multioutput, str):\n if multioutput == 'raw_values':\n return output_scores\n elif multioutput == 'uniform_average':\n avg_weights = None\n elif multioutput == 'variance_weighted':\n avg_weights = denominator\n else:\n avg_weights = multioutput\n return np.average(output_scores, weights=avg_weights)" }, { @@ -121350,7 +130542,8 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "Ground truth (correct) target values." - } + }, + "refined_type": {} }, { "name": "y_pred", @@ -121360,14 +130553,15 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "Estimated target values." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "max_error metric calculates the maximum residual error.\n\nRead more in the :ref:`User Guide `.", - "docstring": "max_error metric calculates the maximum residual error.\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\ny_true : array-like of shape (n_samples,)\n Ground truth (correct) target values.\n\ny_pred : array-like of shape (n_samples,)\n Estimated target values.\n\nReturns\n-------\nmax_error : float\n A positive floating point value (the best value is 0.0).\n\nExamples\n--------\n>>> from sklearn.metrics import max_error\n>>> y_true = [3, 2, 7, 1]\n>>> y_pred = [4, 2, 7, 1]\n>>> max_error(y_true, y_pred)\n1", - "source_code": "\ndef max_error(y_true, y_pred):\n \"\"\"\n max_error metric calculates the maximum residual error.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n y_true : array-like of shape (n_samples,)\n Ground truth (correct) target values.\n\n y_pred : array-like of shape (n_samples,)\n Estimated target values.\n\n Returns\n -------\n max_error : float\n A positive floating point value (the best value is 0.0).\n\n Examples\n --------\n >>> from sklearn.metrics import max_error\n >>> y_true = [3, 2, 7, 1]\n >>> y_pred = [4, 2, 7, 1]\n >>> max_error(y_true, y_pred)\n 1\n \"\"\"\n (y_type, y_true, y_pred, _) = _check_reg_targets(y_true, y_pred, None)\n if y_type == 'continuous-multioutput':\n raise ValueError('Multioutput not supported in max_error')\n return np.max(np.abs(y_true - y_pred))" + "description": "The max_error metric calculates the maximum residual error.\n\nRead more in the :ref:`User Guide `.", + "docstring": "\n The max_error metric calculates the maximum residual error.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n y_true : array-like of shape (n_samples,)\n Ground truth (correct) target values.\n\n y_pred : array-like of shape (n_samples,)\n Estimated target values.\n\n Returns\n -------\n max_error : float\n A positive floating point value (the best value is 0.0).\n\n Examples\n --------\n >>> from sklearn.metrics import max_error\n >>> y_true = [3, 2, 7, 1]\n >>> y_pred = [4, 2, 7, 1]\n >>> max_error(y_true, y_pred)\n 1\n ", + "source_code": "\ndef max_error(y_true, y_pred):\n \"\"\"\n The max_error metric calculates the maximum residual error.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n y_true : array-like of shape (n_samples,)\n Ground truth (correct) target values.\n\n y_pred : array-like of shape (n_samples,)\n Estimated target values.\n\n Returns\n -------\n max_error : float\n A positive floating point value (the best value is 0.0).\n\n Examples\n --------\n >>> from sklearn.metrics import max_error\n >>> y_true = [3, 2, 7, 1]\n >>> y_pred = [4, 2, 7, 1]\n >>> max_error(y_true, y_pred)\n 1\n \"\"\"\n (y_type, y_true, y_pred, _) = _check_reg_targets(y_true, y_pred, None)\n if y_type == 'continuous-multioutput':\n raise ValueError('Multioutput not supported in max_error')\n return np.max(np.abs(y_true - y_pred))" }, { "name": "mean_absolute_error", @@ -121384,7 +130578,8 @@ "docstring": { "type": "array-like of shape (n_samples,) or (n_samples, n_outputs)", "description": "Ground truth (correct) target values." - } + }, + "refined_type": {} }, { "name": "y_pred", @@ -121394,7 +130589,8 @@ "docstring": { "type": "array-like of shape (n_samples,) or (n_samples, n_outputs)", "description": "Estimated target values." 
- } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -121404,7 +130600,8 @@ "docstring": { "type": "array-like of shape (n_samples,), default=None", "description": "Sample weights." - } + }, + "refined_type": {} }, { "name": "multioutput", @@ -121414,13 +130611,17 @@ "docstring": { "type": "{'raw_values', 'uniform_average'} or array-like of shape (n_outputs,), default='uniform_average'", "description": "Defines aggregating of multiple output values.\nArray-like value defines weights used to average errors.\n\n'raw_values' :\n Returns a full set of errors in case of multioutput input.\n\n'uniform_average' :\n Errors of all outputs are averaged with uniform weight." + }, + "refined_type": { + "kind": "EnumType", + "values": ["raw_values", "uniform_average"] } } ], "results": [], "is_public": true, "description": "Mean absolute error regression loss.\n\nRead more in the :ref:`User Guide `.", - "docstring": "Mean absolute error regression loss.\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\ny_true : array-like of shape (n_samples,) or (n_samples, n_outputs)\n Ground truth (correct) target values.\n\ny_pred : array-like of shape (n_samples,) or (n_samples, n_outputs)\n Estimated target values.\n\nsample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\nmultioutput : {'raw_values', 'uniform_average'} or array-like of shape (n_outputs,), default='uniform_average'\n Defines aggregating of multiple output values.\n Array-like value defines weights used to average errors.\n\n 'raw_values' :\n Returns a full set of errors in case of multioutput input.\n\n 'uniform_average' :\n Errors of all outputs are averaged with uniform weight.\n\n\nReturns\n-------\nloss : float or ndarray of floats\n If multioutput is 'raw_values', then mean absolute error is returned\n for each output separately.\n If multioutput is 'uniform_average' or an ndarray of weights, then the\n weighted average of all output errors is returned.\n\n MAE output is non-negative floating point. The best value is 0.0.\n\nExamples\n--------\n>>> from sklearn.metrics import mean_absolute_error\n>>> y_true = [3, -0.5, 2, 7]\n>>> y_pred = [2.5, 0.0, 2, 8]\n>>> mean_absolute_error(y_true, y_pred)\n0.5\n>>> y_true = [[0.5, 1], [-1, 1], [7, -6]]\n>>> y_pred = [[0, 2], [-1, 2], [8, -5]]\n>>> mean_absolute_error(y_true, y_pred)\n0.75\n>>> mean_absolute_error(y_true, y_pred, multioutput='raw_values')\narray([0.5, 1. 
])\n>>> mean_absolute_error(y_true, y_pred, multioutput=[0.3, 0.7])\n0.85...", + "docstring": "Mean absolute error regression loss.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n y_true : array-like of shape (n_samples,) or (n_samples, n_outputs)\n Ground truth (correct) target values.\n\n y_pred : array-like of shape (n_samples,) or (n_samples, n_outputs)\n Estimated target values.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n multioutput : {'raw_values', 'uniform_average'} or array-like of shape (n_outputs,), default='uniform_average'\n Defines aggregating of multiple output values.\n Array-like value defines weights used to average errors.\n\n 'raw_values' :\n Returns a full set of errors in case of multioutput input.\n\n 'uniform_average' :\n Errors of all outputs are averaged with uniform weight.\n\n\n Returns\n -------\n loss : float or ndarray of floats\n If multioutput is 'raw_values', then mean absolute error is returned\n for each output separately.\n If multioutput is 'uniform_average' or an ndarray of weights, then the\n weighted average of all output errors is returned.\n\n MAE output is non-negative floating point. The best value is 0.0.\n\n Examples\n --------\n >>> from sklearn.metrics import mean_absolute_error\n >>> y_true = [3, -0.5, 2, 7]\n >>> y_pred = [2.5, 0.0, 2, 8]\n >>> mean_absolute_error(y_true, y_pred)\n 0.5\n >>> y_true = [[0.5, 1], [-1, 1], [7, -6]]\n >>> y_pred = [[0, 2], [-1, 2], [8, -5]]\n >>> mean_absolute_error(y_true, y_pred)\n 0.75\n >>> mean_absolute_error(y_true, y_pred, multioutput='raw_values')\n array([0.5, 1. ])\n >>> mean_absolute_error(y_true, y_pred, multioutput=[0.3, 0.7])\n 0.85...\n ", "source_code": "\ndef mean_absolute_error(y_true, y_pred, *, sample_weight=None, multioutput='uniform_average'):\n \"\"\"Mean absolute error regression loss.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n y_true : array-like of shape (n_samples,) or (n_samples, n_outputs)\n Ground truth (correct) target values.\n\n y_pred : array-like of shape (n_samples,) or (n_samples, n_outputs)\n Estimated target values.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n multioutput : {'raw_values', 'uniform_average'} or array-like of shape (n_outputs,), default='uniform_average'\n Defines aggregating of multiple output values.\n Array-like value defines weights used to average errors.\n\n 'raw_values' :\n Returns a full set of errors in case of multioutput input.\n\n 'uniform_average' :\n Errors of all outputs are averaged with uniform weight.\n\n\n Returns\n -------\n loss : float or ndarray of floats\n If multioutput is 'raw_values', then mean absolute error is returned\n for each output separately.\n If multioutput is 'uniform_average' or an ndarray of weights, then the\n weighted average of all output errors is returned.\n\n MAE output is non-negative floating point. The best value is 0.0.\n\n Examples\n --------\n >>> from sklearn.metrics import mean_absolute_error\n >>> y_true = [3, -0.5, 2, 7]\n >>> y_pred = [2.5, 0.0, 2, 8]\n >>> mean_absolute_error(y_true, y_pred)\n 0.5\n >>> y_true = [[0.5, 1], [-1, 1], [7, -6]]\n >>> y_pred = [[0, 2], [-1, 2], [8, -5]]\n >>> mean_absolute_error(y_true, y_pred)\n 0.75\n >>> mean_absolute_error(y_true, y_pred, multioutput='raw_values')\n array([0.5, 1. 
])\n >>> mean_absolute_error(y_true, y_pred, multioutput=[0.3, 0.7])\n 0.85...\n \"\"\"\n (y_type, y_true, y_pred, multioutput) = _check_reg_targets(y_true, y_pred, multioutput)\n check_consistent_length(y_true, y_pred, sample_weight)\n output_errors = np.average(np.abs(y_pred - y_true), weights=sample_weight, axis=0)\n if isinstance(multioutput, str):\n if multioutput == 'raw_values':\n return output_errors\n elif multioutput == 'uniform_average':\n multioutput = None\n return np.average(output_errors, weights=multioutput)" }, { @@ -121428,7 +130629,7 @@ "unique_name": "mean_absolute_percentage_error", "qname": "sklearn.metrics._regression.mean_absolute_percentage_error", "unique_qname": "sklearn.metrics._regression.mean_absolute_percentage_error", - "decorators": [], + "decorators": ["_deprecate_positional_args(version='1.1')"], "parameters": [ { "name": "y_true", @@ -121438,7 +130639,8 @@ "docstring": { "type": "array-like of shape (n_samples,) or (n_samples, n_outputs)", "description": "Ground truth (correct) target values." - } + }, + "refined_type": {} }, { "name": "y_pred", @@ -121448,34 +130650,40 @@ "docstring": { "type": "array-like of shape (n_samples,) or (n_samples, n_outputs)", "description": "Estimated target values." - } + }, + "refined_type": {} }, { "name": "sample_weight", "default_value": "None", "is_public": true, - "assigned_by": "POSITION_OR_NAME", + "assigned_by": "NAME_ONLY", "docstring": { "type": "array-like of shape (n_samples,), default=None", "description": "Sample weights." - } + }, + "refined_type": {} }, { "name": "multioutput", "default_value": "'uniform_average'", "is_public": true, - "assigned_by": "POSITION_OR_NAME", + "assigned_by": "NAME_ONLY", "docstring": { "type": "{'raw_values', 'uniform_average'} or array-like", "description": "Defines aggregating of multiple output values.\nArray-like value defines weights used to average errors.\nIf input is list then the shape must be (n_outputs,).\n\n'raw_values' :\n Returns a full set of errors in case of multioutput input.\n\n'uniform_average' :\n Errors of all outputs are averaged with uniform weight." + }, + "refined_type": { + "kind": "EnumType", + "values": ["raw_values", "uniform_average"] } } ], "results": [], "is_public": true, - "description": "Mean absolute percentage error regression loss.\n\nNote here that we do not represent the output as a percentage in range [0, 100]. Instead, we represent it in range [0, 1/eps]. Read more in the :ref:`User Guide `. .. versionadded:: 0.24", - "docstring": "Mean absolute percentage error regression loss.\n\nNote here that we do not represent the output as a percentage in range\n[0, 100]. Instead, we represent it in range [0, 1/eps]. Read more in the\n:ref:`User Guide `.\n\n.. 
versionadded:: 0.24\n\nParameters\n----------\ny_true : array-like of shape (n_samples,) or (n_samples, n_outputs)\n Ground truth (correct) target values.\n\ny_pred : array-like of shape (n_samples,) or (n_samples, n_outputs)\n Estimated target values.\n\nsample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\nmultioutput : {'raw_values', 'uniform_average'} or array-like\n Defines aggregating of multiple output values.\n Array-like value defines weights used to average errors.\n If input is list then the shape must be (n_outputs,).\n\n 'raw_values' :\n Returns a full set of errors in case of multioutput input.\n\n 'uniform_average' :\n Errors of all outputs are averaged with uniform weight.\n\nReturns\n-------\nloss : float or ndarray of floats in the range [0, 1/eps]\n If multioutput is 'raw_values', then mean absolute percentage error\n is returned for each output separately.\n If multioutput is 'uniform_average' or an ndarray of weights, then the\n weighted average of all output errors is returned.\n\n MAPE output is non-negative floating point. The best value is 0.0.\n But note the fact that bad predictions can lead to arbitrarily large\n MAPE values, especially if some y_true values are very close to zero.\n Note that we return a large value instead of `inf` when y_true is zero.\n\nExamples\n--------\n>>> from sklearn.metrics import mean_absolute_percentage_error\n>>> y_true = [3, -0.5, 2, 7]\n>>> y_pred = [2.5, 0.0, 2, 8]\n>>> mean_absolute_percentage_error(y_true, y_pred)\n0.3273...\n>>> y_true = [[0.5, 1], [-1, 1], [7, -6]]\n>>> y_pred = [[0, 2], [-1, 2], [8, -5]]\n>>> mean_absolute_percentage_error(y_true, y_pred)\n0.5515...\n>>> mean_absolute_percentage_error(y_true, y_pred, multioutput=[0.3, 0.7])\n0.6198...", - "source_code": "\ndef mean_absolute_percentage_error(y_true, y_pred, sample_weight=None, multioutput='uniform_average'):\n \"\"\"Mean absolute percentage error regression loss.\n\n Note here that we do not represent the output as a percentage in range\n [0, 100]. Instead, we represent it in range [0, 1/eps]. Read more in the\n :ref:`User Guide `.\n\n .. versionadded:: 0.24\n\n Parameters\n ----------\n y_true : array-like of shape (n_samples,) or (n_samples, n_outputs)\n Ground truth (correct) target values.\n\n y_pred : array-like of shape (n_samples,) or (n_samples, n_outputs)\n Estimated target values.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n multioutput : {'raw_values', 'uniform_average'} or array-like\n Defines aggregating of multiple output values.\n Array-like value defines weights used to average errors.\n If input is list then the shape must be (n_outputs,).\n\n 'raw_values' :\n Returns a full set of errors in case of multioutput input.\n\n 'uniform_average' :\n Errors of all outputs are averaged with uniform weight.\n\n Returns\n -------\n loss : float or ndarray of floats in the range [0, 1/eps]\n If multioutput is 'raw_values', then mean absolute percentage error\n is returned for each output separately.\n If multioutput is 'uniform_average' or an ndarray of weights, then the\n weighted average of all output errors is returned.\n\n MAPE output is non-negative floating point. 
The best value is 0.0.\n But note the fact that bad predictions can lead to arbitrarily large\n MAPE values, especially if some y_true values are very close to zero.\n Note that we return a large value instead of `inf` when y_true is zero.\n\n Examples\n --------\n >>> from sklearn.metrics import mean_absolute_percentage_error\n >>> y_true = [3, -0.5, 2, 7]\n >>> y_pred = [2.5, 0.0, 2, 8]\n >>> mean_absolute_percentage_error(y_true, y_pred)\n 0.3273...\n >>> y_true = [[0.5, 1], [-1, 1], [7, -6]]\n >>> y_pred = [[0, 2], [-1, 2], [8, -5]]\n >>> mean_absolute_percentage_error(y_true, y_pred)\n 0.5515...\n >>> mean_absolute_percentage_error(y_true, y_pred, multioutput=[0.3, 0.7])\n 0.6198...\n \"\"\"\n (y_type, y_true, y_pred, multioutput) = _check_reg_targets(y_true, y_pred, multioutput)\n check_consistent_length(y_true, y_pred, sample_weight)\n epsilon = np.finfo(np.float64).eps\n mape = np.abs(y_pred - y_true) / np.maximum(np.abs(y_true), epsilon)\n output_errors = np.average(mape, weights=sample_weight, axis=0)\n if isinstance(multioutput, str):\n if multioutput == 'raw_values':\n return output_errors\n elif multioutput == 'uniform_average':\n multioutput = None\n return np.average(output_errors, weights=multioutput)" + "description": "Mean absolute percentage error (MAPE) regression loss.\n\nNote here that the output is not a percentage in the range [0, 100]\nand a value of 100 does not mean 100% but 1e2. Furthermore, the output\ncan be arbitrarily high when `y_true` is small (which is specific to the\nmetric) or when `abs(y_true - y_pred)` is large (which is common for most\nregression metrics). Read more in the\n:ref:`User Guide `.\n\n.. versionadded:: 0.24", + "docstring": "Mean absolute percentage error (MAPE) regression loss.\n\n Note here that the output is not a percentage in the range [0, 100]\n and a value of 100 does not mean 100% but 1e2. Furthermore, the output\n can be arbitrarily high when `y_true` is small (which is specific to the\n metric) or when `abs(y_true - y_pred)` is large (which is common for most\n regression metrics). Read more in the\n :ref:`User Guide `.\n\n .. versionadded:: 0.24\n\n Parameters\n ----------\n y_true : array-like of shape (n_samples,) or (n_samples, n_outputs)\n Ground truth (correct) target values.\n\n y_pred : array-like of shape (n_samples,) or (n_samples, n_outputs)\n Estimated target values.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n multioutput : {'raw_values', 'uniform_average'} or array-like\n Defines aggregating of multiple output values.\n Array-like value defines weights used to average errors.\n If input is list then the shape must be (n_outputs,).\n\n 'raw_values' :\n Returns a full set of errors in case of multioutput input.\n\n 'uniform_average' :\n Errors of all outputs are averaged with uniform weight.\n\n Returns\n -------\n loss : float or ndarray of floats\n If multioutput is 'raw_values', then mean absolute percentage error\n is returned for each output separately.\n If multioutput is 'uniform_average' or an ndarray of weights, then the\n weighted average of all output errors is returned.\n\n MAPE output is non-negative floating point. 
The best value is 0.0.\n But note that bad predictions can lead to arbitrarily large\n MAPE values, especially if some `y_true` values are very close to zero.\n Note that we return a large value instead of `inf` when `y_true` is zero.\n\n Examples\n --------\n >>> from sklearn.metrics import mean_absolute_percentage_error\n >>> y_true = [3, -0.5, 2, 7]\n >>> y_pred = [2.5, 0.0, 2, 8]\n >>> mean_absolute_percentage_error(y_true, y_pred)\n 0.3273...\n >>> y_true = [[0.5, 1], [-1, 1], [7, -6]]\n >>> y_pred = [[0, 2], [-1, 2], [8, -5]]\n >>> mean_absolute_percentage_error(y_true, y_pred)\n 0.5515...\n >>> mean_absolute_percentage_error(y_true, y_pred, multioutput=[0.3, 0.7])\n 0.6198...\n >>> # the value when some element of the y_true is zero is arbitrarily high because\n >>> # of the division by epsilon\n >>> y_true = [1., 0., 2.4, 7.]\n >>> y_pred = [1.2, 0.1, 2.4, 8.]\n >>> mean_absolute_percentage_error(y_true, y_pred)\n 112589990684262.48\n ", + "source_code": "\n@_deprecate_positional_args(version='1.1')\ndef mean_absolute_percentage_error(y_true, y_pred, *, sample_weight=None, multioutput='uniform_average'):\n \"\"\"Mean absolute percentage error (MAPE) regression loss.\n\n Note here that the output is not a percentage in the range [0, 100]\n and a value of 100 does not mean 100% but 1e2. Furthermore, the output\n can be arbitrarily high when `y_true` is small (which is specific to the\n metric) or when `abs(y_true - y_pred)` is large (which is common for most\n regression metrics). Read more in the\n :ref:`User Guide `.\n\n .. versionadded:: 0.24\n\n Parameters\n ----------\n y_true : array-like of shape (n_samples,) or (n_samples, n_outputs)\n Ground truth (correct) target values.\n\n y_pred : array-like of shape (n_samples,) or (n_samples, n_outputs)\n Estimated target values.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n multioutput : {'raw_values', 'uniform_average'} or array-like\n Defines aggregating of multiple output values.\n Array-like value defines weights used to average errors.\n If input is list then the shape must be (n_outputs,).\n\n 'raw_values' :\n Returns a full set of errors in case of multioutput input.\n\n 'uniform_average' :\n Errors of all outputs are averaged with uniform weight.\n\n Returns\n -------\n loss : float or ndarray of floats\n If multioutput is 'raw_values', then mean absolute percentage error\n is returned for each output separately.\n If multioutput is 'uniform_average' or an ndarray of weights, then the\n weighted average of all output errors is returned.\n\n MAPE output is non-negative floating point. 
The best value is 0.0.\n But note that bad predictions can lead to arbitrarily large\n MAPE values, especially if some `y_true` values are very close to zero.\n Note that we return a large value instead of `inf` when `y_true` is zero.\n\n Examples\n --------\n >>> from sklearn.metrics import mean_absolute_percentage_error\n >>> y_true = [3, -0.5, 2, 7]\n >>> y_pred = [2.5, 0.0, 2, 8]\n >>> mean_absolute_percentage_error(y_true, y_pred)\n 0.3273...\n >>> y_true = [[0.5, 1], [-1, 1], [7, -6]]\n >>> y_pred = [[0, 2], [-1, 2], [8, -5]]\n >>> mean_absolute_percentage_error(y_true, y_pred)\n 0.5515...\n >>> mean_absolute_percentage_error(y_true, y_pred, multioutput=[0.3, 0.7])\n 0.6198...\n >>> # the value when some element of the y_true is zero is arbitrarily high because\n >>> # of the division by epsilon\n >>> y_true = [1., 0., 2.4, 7.]\n >>> y_pred = [1.2, 0.1, 2.4, 8.]\n >>> mean_absolute_percentage_error(y_true, y_pred)\n 112589990684262.48\n \"\"\"\n (y_type, y_true, y_pred, multioutput) = _check_reg_targets(y_true, y_pred, multioutput)\n check_consistent_length(y_true, y_pred, sample_weight)\n epsilon = np.finfo(np.float64).eps\n mape = np.abs(y_pred - y_true) / np.maximum(np.abs(y_true), epsilon)\n output_errors = np.average(mape, weights=sample_weight, axis=0)\n if isinstance(multioutput, str):\n if multioutput == 'raw_values':\n return output_errors\n elif multioutput == 'uniform_average':\n multioutput = None\n return np.average(output_errors, weights=multioutput)" }, { "name": "mean_gamma_deviance", @@ -121492,7 +130700,8 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "Ground truth (correct) target values. Requires y_true > 0." - } + }, + "refined_type": {} }, { "name": "y_pred", @@ -121502,7 +130711,8 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "Estimated target values. Requires y_pred > 0." - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -121512,13 +130722,14 @@ "docstring": { "type": "array-like of shape (n_samples,), default=None", "description": "Sample weights." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Mean Gamma deviance regression loss.\n\nGamma deviance is equivalent to the Tweedie deviance with the power parameter `power=2`. It is invariant to scaling of the target variable, and measures relative errors. Read more in the :ref:`User Guide `.", - "docstring": "Mean Gamma deviance regression loss.\n\nGamma deviance is equivalent to the Tweedie deviance with\nthe power parameter `power=2`. It is invariant to scaling of\nthe target variable, and measures relative errors.\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\ny_true : array-like of shape (n_samples,)\n Ground truth (correct) target values. Requires y_true > 0.\n\ny_pred : array-like of shape (n_samples,)\n Estimated target values. Requires y_pred > 0.\n\nsample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\nReturns\n-------\nloss : float\n A non-negative floating point value (the best value is 0.0).\n\nExamples\n--------\n>>> from sklearn.metrics import mean_gamma_deviance\n>>> y_true = [2, 0.5, 1, 4]\n>>> y_pred = [0.5, 0.5, 2., 2.]\n>>> mean_gamma_deviance(y_true, y_pred)\n1.0568...", + "description": "Mean Gamma deviance regression loss.\n\nGamma deviance is equivalent to the Tweedie deviance with\nthe power parameter `power=2`. 
It is invariant to scaling of\nthe target variable, and measures relative errors.\n\nRead more in the :ref:`User Guide `.", + "docstring": "Mean Gamma deviance regression loss.\n\n Gamma deviance is equivalent to the Tweedie deviance with\n the power parameter `power=2`. It is invariant to scaling of\n the target variable, and measures relative errors.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n y_true : array-like of shape (n_samples,)\n Ground truth (correct) target values. Requires y_true > 0.\n\n y_pred : array-like of shape (n_samples,)\n Estimated target values. Requires y_pred > 0.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n Returns\n -------\n loss : float\n A non-negative floating point value (the best value is 0.0).\n\n Examples\n --------\n >>> from sklearn.metrics import mean_gamma_deviance\n >>> y_true = [2, 0.5, 1, 4]\n >>> y_pred = [0.5, 0.5, 2., 2.]\n >>> mean_gamma_deviance(y_true, y_pred)\n 1.0568...\n ", "source_code": "\ndef mean_gamma_deviance(y_true, y_pred, *, sample_weight=None):\n \"\"\"Mean Gamma deviance regression loss.\n\n Gamma deviance is equivalent to the Tweedie deviance with\n the power parameter `power=2`. It is invariant to scaling of\n the target variable, and measures relative errors.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n y_true : array-like of shape (n_samples,)\n Ground truth (correct) target values. Requires y_true > 0.\n\n y_pred : array-like of shape (n_samples,)\n Estimated target values. Requires y_pred > 0.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n Returns\n -------\n loss : float\n A non-negative floating point value (the best value is 0.0).\n\n Examples\n --------\n >>> from sklearn.metrics import mean_gamma_deviance\n >>> y_true = [2, 0.5, 1, 4]\n >>> y_pred = [0.5, 0.5, 2., 2.]\n >>> mean_gamma_deviance(y_true, y_pred)\n 1.0568...\n \"\"\"\n return mean_tweedie_deviance(y_true, y_pred, sample_weight=sample_weight, power=2)" }, { @@ -121536,7 +130747,8 @@ "docstring": { "type": "array-like of shape (n_samples,) or (n_samples, n_outputs)", "description": "Ground truth (correct) target values." - } + }, + "refined_type": {} }, { "name": "y_pred", @@ -121546,7 +130758,8 @@ "docstring": { "type": "array-like of shape (n_samples,) or (n_samples, n_outputs)", "description": "Estimated target values." - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -121556,7 +130769,8 @@ "docstring": { "type": "array-like of shape (n_samples,), default=None", "description": "Sample weights." - } + }, + "refined_type": {} }, { "name": "alpha", @@ -121566,7 +130780,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "multioutput", @@ -121576,14 +130791,18 @@ "docstring": { "type": "{'raw_values', 'uniform_average'} or array-like of shape (n_outputs,), default='uniform_average'", "description": "Defines aggregating of multiple output values.\nArray-like value defines weights used to average errors.\n\n'raw_values' :\n Returns a full set of errors in case of multioutput input.\n\n'uniform_average' :\n Errors of all outputs are averaged with uniform weight." 
+ }, + "refined_type": { + "kind": "EnumType", + "values": ["raw_values", "uniform_average"] } } ], "results": [], "is_public": true, "description": "Pinball loss for quantile regression.\n\nRead more in the :ref:`User Guide `.", - "docstring": "Pinball loss for quantile regression.\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\ny_true : array-like of shape (n_samples,) or (n_samples, n_outputs)\n Ground truth (correct) target values.\n\ny_pred : array-like of shape (n_samples,) or (n_samples, n_outputs)\n Estimated target values.\n\nsample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\nalpha: double, slope of the pinball loss, default=0.5,\n this loss is equivalent to :ref:`mean_absolute_error` when `alpha=0.5`,\n `alpha=0.95` is minimized by estimators of the 95th percentile.\n\nmultioutput : {'raw_values', 'uniform_average'} or array-like of shape (n_outputs,), default='uniform_average'\n Defines aggregating of multiple output values.\n Array-like value defines weights used to average errors.\n\n 'raw_values' :\n Returns a full set of errors in case of multioutput input.\n\n 'uniform_average' :\n Errors of all outputs are averaged with uniform weight.\n\nReturns\n-------\nloss : float or ndarray of floats\n If multioutput is 'raw_values', then mean absolute error is returned\n for each output separately.\n If multioutput is 'uniform_average' or an ndarray of weights, then the\n weighted average of all output errors is returned.\n\n The pinball loss output is a non-negative floating point. The best\n value is 0.0.\n\nExamples\n--------\n>>> from sklearn.metrics import mean_pinball_loss\n>>> y_true = [1, 2, 3]\n>>> mean_pinball_loss(y_true, [0, 2, 3], alpha=0.1)\n0.03...\n>>> mean_pinball_loss(y_true, [1, 2, 4], alpha=0.1)\n0.3...\n>>> mean_pinball_loss(y_true, [0, 2, 3], alpha=0.9)\n0.3...\n>>> mean_pinball_loss(y_true, [1, 2, 4], alpha=0.9)\n0.03...\n>>> mean_pinball_loss(y_true, y_true, alpha=0.1)\n0.0\n>>> mean_pinball_loss(y_true, y_true, alpha=0.9)\n0.0", - "source_code": "\ndef mean_pinball_loss(y_true, y_pred, *, sample_weight=None, alpha=0.5, multioutput='uniform_average'):\n \"\"\"Pinball loss for quantile regression.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n y_true : array-like of shape (n_samples,) or (n_samples, n_outputs)\n Ground truth (correct) target values.\n\n y_pred : array-like of shape (n_samples,) or (n_samples, n_outputs)\n Estimated target values.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n alpha: double, slope of the pinball loss, default=0.5,\n this loss is equivalent to :ref:`mean_absolute_error` when `alpha=0.5`,\n `alpha=0.95` is minimized by estimators of the 95th percentile.\n\n multioutput : {'raw_values', 'uniform_average'} or array-like of shape (n_outputs,), default='uniform_average'\n Defines aggregating of multiple output values.\n Array-like value defines weights used to average errors.\n\n 'raw_values' :\n Returns a full set of errors in case of multioutput input.\n\n 'uniform_average' :\n Errors of all outputs are averaged with uniform weight.\n\n Returns\n -------\n loss : float or ndarray of floats\n If multioutput is 'raw_values', then mean absolute error is returned\n for each output separately.\n If multioutput is 'uniform_average' or an ndarray of weights, then the\n weighted average of all output errors is returned.\n\n The pinball loss output is a non-negative floating point. 
The best\n value is 0.0.\n\n Examples\n --------\n >>> from sklearn.metrics import mean_pinball_loss\n >>> y_true = [1, 2, 3]\n >>> mean_pinball_loss(y_true, [0, 2, 3], alpha=0.1)\n 0.03...\n >>> mean_pinball_loss(y_true, [1, 2, 4], alpha=0.1)\n 0.3...\n >>> mean_pinball_loss(y_true, [0, 2, 3], alpha=0.9)\n 0.3...\n >>> mean_pinball_loss(y_true, [1, 2, 4], alpha=0.9)\n 0.03...\n >>> mean_pinball_loss(y_true, y_true, alpha=0.1)\n 0.0\n >>> mean_pinball_loss(y_true, y_true, alpha=0.9)\n 0.0\n \"\"\"\n (y_type, y_true, y_pred, multioutput) = _check_reg_targets(y_true, y_pred, multioutput)\n check_consistent_length(y_true, y_pred, sample_weight)\n diff = y_true - y_pred\n sign = (diff >= 0).astype(diff.dtype)\n loss = alpha * sign * diff - (1 - alpha) * (1 - sign) * diff\n output_errors = np.average(loss, weights=sample_weight, axis=0)\n if isinstance(multioutput, str):\n if multioutput == 'raw_values':\n return output_errors\n elif multioutput == 'uniform_average':\n multioutput = None\n else:\n raise ValueError(\"multioutput is expected to be 'raw_values' or 'uniform_average' but we got %r instead.\" % multioutput)\n return np.average(output_errors, weights=multioutput)" + "docstring": "Pinball loss for quantile regression.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n y_true : array-like of shape (n_samples,) or (n_samples, n_outputs)\n Ground truth (correct) target values.\n\n y_pred : array-like of shape (n_samples,) or (n_samples, n_outputs)\n Estimated target values.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n alpha: float, slope of the pinball loss, default=0.5,\n this loss is equivalent to :ref:`mean_absolute_error` when `alpha=0.5`,\n `alpha=0.95` is minimized by estimators of the 95th percentile.\n\n multioutput : {'raw_values', 'uniform_average'} or array-like of shape (n_outputs,), default='uniform_average'\n Defines aggregating of multiple output values.\n Array-like value defines weights used to average errors.\n\n 'raw_values' :\n Returns a full set of errors in case of multioutput input.\n\n 'uniform_average' :\n Errors of all outputs are averaged with uniform weight.\n\n Returns\n -------\n loss : float or ndarray of floats\n If multioutput is 'raw_values', then mean absolute error is returned\n for each output separately.\n If multioutput is 'uniform_average' or an ndarray of weights, then the\n weighted average of all output errors is returned.\n\n The pinball loss output is a non-negative floating point. 
The best\n value is 0.0.\n\n Examples\n --------\n >>> from sklearn.metrics import mean_pinball_loss\n >>> y_true = [1, 2, 3]\n >>> mean_pinball_loss(y_true, [0, 2, 3], alpha=0.1)\n 0.03...\n >>> mean_pinball_loss(y_true, [1, 2, 4], alpha=0.1)\n 0.3...\n >>> mean_pinball_loss(y_true, [0, 2, 3], alpha=0.9)\n 0.3...\n >>> mean_pinball_loss(y_true, [1, 2, 4], alpha=0.9)\n 0.03...\n >>> mean_pinball_loss(y_true, y_true, alpha=0.1)\n 0.0\n >>> mean_pinball_loss(y_true, y_true, alpha=0.9)\n 0.0\n ", + "source_code": "\ndef mean_pinball_loss(y_true, y_pred, *, sample_weight=None, alpha=0.5, multioutput='uniform_average'):\n \"\"\"Pinball loss for quantile regression.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n y_true : array-like of shape (n_samples,) or (n_samples, n_outputs)\n Ground truth (correct) target values.\n\n y_pred : array-like of shape (n_samples,) or (n_samples, n_outputs)\n Estimated target values.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n alpha: float, slope of the pinball loss, default=0.5,\n this loss is equivalent to :ref:`mean_absolute_error` when `alpha=0.5`,\n `alpha=0.95` is minimized by estimators of the 95th percentile.\n\n multioutput : {'raw_values', 'uniform_average'} or array-like of shape (n_outputs,), default='uniform_average'\n Defines aggregating of multiple output values.\n Array-like value defines weights used to average errors.\n\n 'raw_values' :\n Returns a full set of errors in case of multioutput input.\n\n 'uniform_average' :\n Errors of all outputs are averaged with uniform weight.\n\n Returns\n -------\n loss : float or ndarray of floats\n If multioutput is 'raw_values', then mean absolute error is returned\n for each output separately.\n If multioutput is 'uniform_average' or an ndarray of weights, then the\n weighted average of all output errors is returned.\n\n The pinball loss output is a non-negative floating point. The best\n value is 0.0.\n\n Examples\n --------\n >>> from sklearn.metrics import mean_pinball_loss\n >>> y_true = [1, 2, 3]\n >>> mean_pinball_loss(y_true, [0, 2, 3], alpha=0.1)\n 0.03...\n >>> mean_pinball_loss(y_true, [1, 2, 4], alpha=0.1)\n 0.3...\n >>> mean_pinball_loss(y_true, [0, 2, 3], alpha=0.9)\n 0.3...\n >>> mean_pinball_loss(y_true, [1, 2, 4], alpha=0.9)\n 0.03...\n >>> mean_pinball_loss(y_true, y_true, alpha=0.1)\n 0.0\n >>> mean_pinball_loss(y_true, y_true, alpha=0.9)\n 0.0\n \"\"\"\n (y_type, y_true, y_pred, multioutput) = _check_reg_targets(y_true, y_pred, multioutput)\n check_consistent_length(y_true, y_pred, sample_weight)\n diff = y_true - y_pred\n sign = (diff >= 0).astype(diff.dtype)\n loss = alpha * sign * diff - (1 - alpha) * (1 - sign) * diff\n output_errors = np.average(loss, weights=sample_weight, axis=0)\n if isinstance(multioutput, str):\n if multioutput == 'raw_values':\n return output_errors\n elif multioutput == 'uniform_average':\n multioutput = None\n else:\n raise ValueError(\"multioutput is expected to be 'raw_values' or 'uniform_average' but we got %r instead.\" % multioutput)\n return np.average(output_errors, weights=multioutput)" }, { "name": "mean_poisson_deviance", @@ -121600,7 +130819,8 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "Ground truth (correct) target values. Requires y_true >= 0." - } + }, + "refined_type": {} }, { "name": "y_pred", @@ -121610,7 +130830,8 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "Estimated target values. Requires y_pred > 0." 
- } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -121620,13 +130841,14 @@ "docstring": { "type": "array-like of shape (n_samples,), default=None", "description": "Sample weights." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Mean Poisson deviance regression loss.\n\nPoisson deviance is equivalent to the Tweedie deviance with the power parameter `power=1`. Read more in the :ref:`User Guide `.", - "docstring": "Mean Poisson deviance regression loss.\n\nPoisson deviance is equivalent to the Tweedie deviance with\nthe power parameter `power=1`.\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\ny_true : array-like of shape (n_samples,)\n Ground truth (correct) target values. Requires y_true >= 0.\n\ny_pred : array-like of shape (n_samples,)\n Estimated target values. Requires y_pred > 0.\n\nsample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\nReturns\n-------\nloss : float\n A non-negative floating point value (the best value is 0.0).\n\nExamples\n--------\n>>> from sklearn.metrics import mean_poisson_deviance\n>>> y_true = [2, 0, 1, 4]\n>>> y_pred = [0.5, 0.5, 2., 2.]\n>>> mean_poisson_deviance(y_true, y_pred)\n1.4260...", + "description": "Mean Poisson deviance regression loss.\n\nPoisson deviance is equivalent to the Tweedie deviance with\nthe power parameter `power=1`.\n\nRead more in the :ref:`User Guide `.", + "docstring": "Mean Poisson deviance regression loss.\n\n Poisson deviance is equivalent to the Tweedie deviance with\n the power parameter `power=1`.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n y_true : array-like of shape (n_samples,)\n Ground truth (correct) target values. Requires y_true >= 0.\n\n y_pred : array-like of shape (n_samples,)\n Estimated target values. Requires y_pred > 0.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n Returns\n -------\n loss : float\n A non-negative floating point value (the best value is 0.0).\n\n Examples\n --------\n >>> from sklearn.metrics import mean_poisson_deviance\n >>> y_true = [2, 0, 1, 4]\n >>> y_pred = [0.5, 0.5, 2., 2.]\n >>> mean_poisson_deviance(y_true, y_pred)\n 1.4260...\n ", "source_code": "\ndef mean_poisson_deviance(y_true, y_pred, *, sample_weight=None):\n \"\"\"Mean Poisson deviance regression loss.\n\n Poisson deviance is equivalent to the Tweedie deviance with\n the power parameter `power=1`.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n y_true : array-like of shape (n_samples,)\n Ground truth (correct) target values. Requires y_true >= 0.\n\n y_pred : array-like of shape (n_samples,)\n Estimated target values. Requires y_pred > 0.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n Returns\n -------\n loss : float\n A non-negative floating point value (the best value is 0.0).\n\n Examples\n --------\n >>> from sklearn.metrics import mean_poisson_deviance\n >>> y_true = [2, 0, 1, 4]\n >>> y_pred = [0.5, 0.5, 2., 2.]\n >>> mean_poisson_deviance(y_true, y_pred)\n 1.4260...\n \"\"\"\n return mean_tweedie_deviance(y_true, y_pred, sample_weight=sample_weight, power=1)" }, { @@ -121644,7 +130866,8 @@ "docstring": { "type": "array-like of shape (n_samples,) or (n_samples, n_outputs)", "description": "Ground truth (correct) target values." 
- } + }, + "refined_type": {} }, { "name": "y_pred", @@ -121654,7 +130877,8 @@ "docstring": { "type": "array-like of shape (n_samples,) or (n_samples, n_outputs)", "description": "Estimated target values." - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -121664,7 +130888,8 @@ "docstring": { "type": "array-like of shape (n_samples,), default=None", "description": "Sample weights." - } + }, + "refined_type": {} }, { "name": "multioutput", @@ -121674,6 +130899,10 @@ "docstring": { "type": "{'raw_values', 'uniform_average'} or array-like of shape (n_outputs,), default='uniform_average'", "description": "Defines aggregating of multiple output values.\nArray-like value defines weights used to average errors.\n\n'raw_values' :\n Returns a full set of errors in case of multioutput input.\n\n'uniform_average' :\n Errors of all outputs are averaged with uniform weight." + }, + "refined_type": { + "kind": "EnumType", + "values": ["raw_values", "uniform_average"] } }, { @@ -121684,13 +130913,14 @@ "docstring": { "type": "bool, default=True", "description": "If True returns MSE value, if False returns RMSE value." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Mean squared error regression loss.\n\nRead more in the :ref:`User Guide `.", - "docstring": "Mean squared error regression loss.\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\ny_true : array-like of shape (n_samples,) or (n_samples, n_outputs)\n Ground truth (correct) target values.\n\ny_pred : array-like of shape (n_samples,) or (n_samples, n_outputs)\n Estimated target values.\n\nsample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\nmultioutput : {'raw_values', 'uniform_average'} or array-like of shape (n_outputs,), default='uniform_average'\n Defines aggregating of multiple output values.\n Array-like value defines weights used to average errors.\n\n 'raw_values' :\n Returns a full set of errors in case of multioutput input.\n\n 'uniform_average' :\n Errors of all outputs are averaged with uniform weight.\n\nsquared : bool, default=True\n If True returns MSE value, if False returns RMSE value.\n\nReturns\n-------\nloss : float or ndarray of floats\n A non-negative floating point value (the best value is 0.0), or an\n array of floating point values, one for each individual target.\n\nExamples\n--------\n>>> from sklearn.metrics import mean_squared_error\n>>> y_true = [3, -0.5, 2, 7]\n>>> y_pred = [2.5, 0.0, 2, 8]\n>>> mean_squared_error(y_true, y_pred)\n0.375\n>>> y_true = [3, -0.5, 2, 7]\n>>> y_pred = [2.5, 0.0, 2, 8]\n>>> mean_squared_error(y_true, y_pred, squared=False)\n0.612...\n>>> y_true = [[0.5, 1],[-1, 1],[7, -6]]\n>>> y_pred = [[0, 2],[-1, 2],[8, -5]]\n>>> mean_squared_error(y_true, y_pred)\n0.708...\n>>> mean_squared_error(y_true, y_pred, squared=False)\n0.822...\n>>> mean_squared_error(y_true, y_pred, multioutput='raw_values')\narray([0.41666667, 1. 
])\n>>> mean_squared_error(y_true, y_pred, multioutput=[0.3, 0.7])\n0.825...", + "docstring": "Mean squared error regression loss.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n y_true : array-like of shape (n_samples,) or (n_samples, n_outputs)\n Ground truth (correct) target values.\n\n y_pred : array-like of shape (n_samples,) or (n_samples, n_outputs)\n Estimated target values.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n multioutput : {'raw_values', 'uniform_average'} or array-like of shape (n_outputs,), default='uniform_average'\n Defines aggregating of multiple output values.\n Array-like value defines weights used to average errors.\n\n 'raw_values' :\n Returns a full set of errors in case of multioutput input.\n\n 'uniform_average' :\n Errors of all outputs are averaged with uniform weight.\n\n squared : bool, default=True\n If True returns MSE value, if False returns RMSE value.\n\n Returns\n -------\n loss : float or ndarray of floats\n A non-negative floating point value (the best value is 0.0), or an\n array of floating point values, one for each individual target.\n\n Examples\n --------\n >>> from sklearn.metrics import mean_squared_error\n >>> y_true = [3, -0.5, 2, 7]\n >>> y_pred = [2.5, 0.0, 2, 8]\n >>> mean_squared_error(y_true, y_pred)\n 0.375\n >>> y_true = [3, -0.5, 2, 7]\n >>> y_pred = [2.5, 0.0, 2, 8]\n >>> mean_squared_error(y_true, y_pred, squared=False)\n 0.612...\n >>> y_true = [[0.5, 1],[-1, 1],[7, -6]]\n >>> y_pred = [[0, 2],[-1, 2],[8, -5]]\n >>> mean_squared_error(y_true, y_pred)\n 0.708...\n >>> mean_squared_error(y_true, y_pred, squared=False)\n 0.822...\n >>> mean_squared_error(y_true, y_pred, multioutput='raw_values')\n array([0.41666667, 1. ])\n >>> mean_squared_error(y_true, y_pred, multioutput=[0.3, 0.7])\n 0.825...\n ", "source_code": "\ndef mean_squared_error(y_true, y_pred, *, sample_weight=None, multioutput='uniform_average', squared=True):\n \"\"\"Mean squared error regression loss.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n y_true : array-like of shape (n_samples,) or (n_samples, n_outputs)\n Ground truth (correct) target values.\n\n y_pred : array-like of shape (n_samples,) or (n_samples, n_outputs)\n Estimated target values.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n multioutput : {'raw_values', 'uniform_average'} or array-like of shape (n_outputs,), default='uniform_average'\n Defines aggregating of multiple output values.\n Array-like value defines weights used to average errors.\n\n 'raw_values' :\n Returns a full set of errors in case of multioutput input.\n\n 'uniform_average' :\n Errors of all outputs are averaged with uniform weight.\n\n squared : bool, default=True\n If True returns MSE value, if False returns RMSE value.\n\n Returns\n -------\n loss : float or ndarray of floats\n A non-negative floating point value (the best value is 0.0), or an\n array of floating point values, one for each individual target.\n\n Examples\n --------\n >>> from sklearn.metrics import mean_squared_error\n >>> y_true = [3, -0.5, 2, 7]\n >>> y_pred = [2.5, 0.0, 2, 8]\n >>> mean_squared_error(y_true, y_pred)\n 0.375\n >>> y_true = [3, -0.5, 2, 7]\n >>> y_pred = [2.5, 0.0, 2, 8]\n >>> mean_squared_error(y_true, y_pred, squared=False)\n 0.612...\n >>> y_true = [[0.5, 1],[-1, 1],[7, -6]]\n >>> y_pred = [[0, 2],[-1, 2],[8, -5]]\n >>> mean_squared_error(y_true, y_pred)\n 0.708...\n >>> mean_squared_error(y_true, y_pred, 
squared=False)\n 0.822...\n >>> mean_squared_error(y_true, y_pred, multioutput='raw_values')\n array([0.41666667, 1. ])\n >>> mean_squared_error(y_true, y_pred, multioutput=[0.3, 0.7])\n 0.825...\n \"\"\"\n (y_type, y_true, y_pred, multioutput) = _check_reg_targets(y_true, y_pred, multioutput)\n check_consistent_length(y_true, y_pred, sample_weight)\n output_errors = np.average((y_true - y_pred)**2, axis=0, weights=sample_weight)\n if not squared:\n output_errors = np.sqrt(output_errors)\n if isinstance(multioutput, str):\n if multioutput == 'raw_values':\n return output_errors\n elif multioutput == 'uniform_average':\n multioutput = None\n return np.average(output_errors, weights=multioutput)" }, { @@ -121708,7 +130938,8 @@ "docstring": { "type": "array-like of shape (n_samples,) or (n_samples, n_outputs)", "description": "Ground truth (correct) target values." - } + }, + "refined_type": {} }, { "name": "y_pred", @@ -121718,7 +130949,8 @@ "docstring": { "type": "array-like of shape (n_samples,) or (n_samples, n_outputs)", "description": "Estimated target values." - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -121728,7 +130960,8 @@ "docstring": { "type": "array-like of shape (n_samples,), default=None", "description": "Sample weights." - } + }, + "refined_type": {} }, { "name": "multioutput", @@ -121738,6 +130971,10 @@ "docstring": { "type": "{'raw_values', 'uniform_average'} or array-like of shape (n_outputs,), default='uniform_average'", "description": "Defines aggregating of multiple output values.\nArray-like value defines weights used to average errors.\n\n'raw_values' :\n Returns a full set of errors when the input is of multioutput\n format.\n\n'uniform_average' :\n Errors of all outputs are averaged with uniform weight." + }, + "refined_type": { + "kind": "EnumType", + "values": ["raw_values", "uniform_average"] } }, { @@ -121748,13 +130985,14 @@ "docstring": { "type": "bool, default=True", "description": "If True returns MSLE (mean squared log error) value.\nIf False returns RMSLE (root mean squared log error) value." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Mean squared logarithmic error regression loss.\n\nRead more in the :ref:`User Guide `.", - "docstring": "Mean squared logarithmic error regression loss.\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\ny_true : array-like of shape (n_samples,) or (n_samples, n_outputs)\n Ground truth (correct) target values.\n\ny_pred : array-like of shape (n_samples,) or (n_samples, n_outputs)\n Estimated target values.\n\nsample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\nmultioutput : {'raw_values', 'uniform_average'} or array-like of shape (n_outputs,), default='uniform_average'\n\n Defines aggregating of multiple output values.\n Array-like value defines weights used to average errors.\n\n 'raw_values' :\n Returns a full set of errors when the input is of multioutput\n format.\n\n 'uniform_average' :\n Errors of all outputs are averaged with uniform weight.\nsquared : bool, default=True\n If True returns MSLE (mean squared log error) value.\n If False returns RMSLE (root mean squared log error) value.\n\nReturns\n-------\nloss : float or ndarray of floats\n A non-negative floating point value (the best value is 0.0), or an\n array of floating point values, one for each individual target.\n\nExamples\n--------\n>>> from sklearn.metrics import mean_squared_log_error\n>>> y_true = [3, 5, 2.5, 7]\n>>> y_pred = [2.5, 5, 4, 8]\n>>> mean_squared_log_error(y_true, y_pred)\n0.039...\n>>> mean_squared_log_error(y_true, y_pred, squared=False)\n0.199...\n>>> y_true = [[0.5, 1], [1, 2], [7, 6]]\n>>> y_pred = [[0.5, 2], [1, 2.5], [8, 8]]\n>>> mean_squared_log_error(y_true, y_pred)\n0.044...\n>>> mean_squared_log_error(y_true, y_pred, multioutput='raw_values')\narray([0.00462428, 0.08377444])\n>>> mean_squared_log_error(y_true, y_pred, multioutput=[0.3, 0.7])\n0.060...", + "docstring": "Mean squared logarithmic error regression loss.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n y_true : array-like of shape (n_samples,) or (n_samples, n_outputs)\n Ground truth (correct) target values.\n\n y_pred : array-like of shape (n_samples,) or (n_samples, n_outputs)\n Estimated target values.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n multioutput : {'raw_values', 'uniform_average'} or array-like of shape (n_outputs,), default='uniform_average'\n\n Defines aggregating of multiple output values.\n Array-like value defines weights used to average errors.\n\n 'raw_values' :\n Returns a full set of errors when the input is of multioutput\n format.\n\n 'uniform_average' :\n Errors of all outputs are averaged with uniform weight.\n squared : bool, default=True\n If True returns MSLE (mean squared log error) value.\n If False returns RMSLE (root mean squared log error) value.\n\n Returns\n -------\n loss : float or ndarray of floats\n A non-negative floating point value (the best value is 0.0), or an\n array of floating point values, one for each individual target.\n\n Examples\n --------\n >>> from sklearn.metrics import mean_squared_log_error\n >>> y_true = [3, 5, 2.5, 7]\n >>> y_pred = [2.5, 5, 4, 8]\n >>> mean_squared_log_error(y_true, y_pred)\n 0.039...\n >>> mean_squared_log_error(y_true, y_pred, squared=False)\n 0.199...\n >>> y_true = [[0.5, 1], [1, 2], [7, 6]]\n >>> y_pred = [[0.5, 2], [1, 2.5], [8, 8]]\n >>> mean_squared_log_error(y_true, y_pred)\n 0.044...\n >>> mean_squared_log_error(y_true, y_pred, 
multioutput='raw_values')\n array([0.00462428, 0.08377444])\n >>> mean_squared_log_error(y_true, y_pred, multioutput=[0.3, 0.7])\n 0.060...\n ", "source_code": "\ndef mean_squared_log_error(y_true, y_pred, *, sample_weight=None, multioutput='uniform_average', squared=True):\n \"\"\"Mean squared logarithmic error regression loss.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n y_true : array-like of shape (n_samples,) or (n_samples, n_outputs)\n Ground truth (correct) target values.\n\n y_pred : array-like of shape (n_samples,) or (n_samples, n_outputs)\n Estimated target values.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n multioutput : {'raw_values', 'uniform_average'} or array-like of shape (n_outputs,), default='uniform_average'\n\n Defines aggregating of multiple output values.\n Array-like value defines weights used to average errors.\n\n 'raw_values' :\n Returns a full set of errors when the input is of multioutput\n format.\n\n 'uniform_average' :\n Errors of all outputs are averaged with uniform weight.\n squared : bool, default=True\n If True returns MSLE (mean squared log error) value.\n If False returns RMSLE (root mean squared log error) value.\n\n Returns\n -------\n loss : float or ndarray of floats\n A non-negative floating point value (the best value is 0.0), or an\n array of floating point values, one for each individual target.\n\n Examples\n --------\n >>> from sklearn.metrics import mean_squared_log_error\n >>> y_true = [3, 5, 2.5, 7]\n >>> y_pred = [2.5, 5, 4, 8]\n >>> mean_squared_log_error(y_true, y_pred)\n 0.039...\n >>> mean_squared_log_error(y_true, y_pred, squared=False)\n 0.199...\n >>> y_true = [[0.5, 1], [1, 2], [7, 6]]\n >>> y_pred = [[0.5, 2], [1, 2.5], [8, 8]]\n >>> mean_squared_log_error(y_true, y_pred)\n 0.044...\n >>> mean_squared_log_error(y_true, y_pred, multioutput='raw_values')\n array([0.00462428, 0.08377444])\n >>> mean_squared_log_error(y_true, y_pred, multioutput=[0.3, 0.7])\n 0.060...\n \"\"\"\n (y_type, y_true, y_pred, multioutput) = _check_reg_targets(y_true, y_pred, multioutput)\n check_consistent_length(y_true, y_pred, sample_weight)\n if (y_true < 0).any() or (y_pred < 0).any():\n raise ValueError('Mean Squared Logarithmic Error cannot be used when targets contain negative values.')\n return mean_squared_error(np.log1p(y_true), np.log1p(y_pred), sample_weight=sample_weight, multioutput=multioutput, squared=squared)" }, { @@ -121772,7 +131010,8 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "Ground truth (correct) target values." - } + }, + "refined_type": {} }, { "name": "y_pred", @@ -121782,7 +131021,8 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "Estimated target values." - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -121792,7 +131032,8 @@ "docstring": { "type": "array-like of shape (n_samples,), default=None", "description": "Sample weights." - } + }, + "refined_type": {} }, { "name": "power", @@ -121802,13 +131043,14 @@ "docstring": { "type": "float, default=0", "description": "Tweedie power parameter. Either power <= 0 or power >= 1.\n\nThe higher `p` the less weight is given to extreme\ndeviations between true and predicted targets.\n\n- power < 0: Extreme stable distribution. Requires: y_pred > 0.\n- power = 0 : Normal distribution, output corresponds to\n mean_squared_error. y_true and y_pred can be any real numbers.\n- power = 1 : Poisson distribution. 
Requires: y_true >= 0 and\n y_pred > 0.\n- 1 < p < 2 : Compound Poisson distribution. Requires: y_true >= 0\n and y_pred > 0.\n- power = 2 : Gamma distribution. Requires: y_true > 0 and y_pred > 0.\n- power = 3 : Inverse Gaussian distribution. Requires: y_true > 0\n and y_pred > 0.\n- otherwise : Positive stable distribution. Requires: y_true > 0\n and y_pred > 0." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Mean Tweedie deviance regression loss.\n\nRead more in the :ref:`User Guide `.", - "docstring": "Mean Tweedie deviance regression loss.\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\ny_true : array-like of shape (n_samples,)\n Ground truth (correct) target values.\n\ny_pred : array-like of shape (n_samples,)\n Estimated target values.\n\nsample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\npower : float, default=0\n Tweedie power parameter. Either power <= 0 or power >= 1.\n\n The higher `p` the less weight is given to extreme\n deviations between true and predicted targets.\n\n - power < 0: Extreme stable distribution. Requires: y_pred > 0.\n - power = 0 : Normal distribution, output corresponds to\n mean_squared_error. y_true and y_pred can be any real numbers.\n - power = 1 : Poisson distribution. Requires: y_true >= 0 and\n y_pred > 0.\n - 1 < p < 2 : Compound Poisson distribution. Requires: y_true >= 0\n and y_pred > 0.\n - power = 2 : Gamma distribution. Requires: y_true > 0 and y_pred > 0.\n - power = 3 : Inverse Gaussian distribution. Requires: y_true > 0\n and y_pred > 0.\n - otherwise : Positive stable distribution. Requires: y_true > 0\n and y_pred > 0.\n\nReturns\n-------\nloss : float\n A non-negative floating point value (the best value is 0.0).\n\nExamples\n--------\n>>> from sklearn.metrics import mean_tweedie_deviance\n>>> y_true = [2, 0, 1, 4]\n>>> y_pred = [0.5, 0.5, 2., 2.]\n>>> mean_tweedie_deviance(y_true, y_pred, power=1)\n1.4260...", + "docstring": "Mean Tweedie deviance regression loss.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n y_true : array-like of shape (n_samples,)\n Ground truth (correct) target values.\n\n y_pred : array-like of shape (n_samples,)\n Estimated target values.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n power : float, default=0\n Tweedie power parameter. Either power <= 0 or power >= 1.\n\n The higher `p` the less weight is given to extreme\n deviations between true and predicted targets.\n\n - power < 0: Extreme stable distribution. Requires: y_pred > 0.\n - power = 0 : Normal distribution, output corresponds to\n mean_squared_error. y_true and y_pred can be any real numbers.\n - power = 1 : Poisson distribution. Requires: y_true >= 0 and\n y_pred > 0.\n - 1 < p < 2 : Compound Poisson distribution. Requires: y_true >= 0\n and y_pred > 0.\n - power = 2 : Gamma distribution. Requires: y_true > 0 and y_pred > 0.\n - power = 3 : Inverse Gaussian distribution. Requires: y_true > 0\n and y_pred > 0.\n - otherwise : Positive stable distribution. 
Requires: y_true > 0\n and y_pred > 0.\n\n Returns\n -------\n loss : float\n A non-negative floating point value (the best value is 0.0).\n\n Examples\n --------\n >>> from sklearn.metrics import mean_tweedie_deviance\n >>> y_true = [2, 0, 1, 4]\n >>> y_pred = [0.5, 0.5, 2., 2.]\n >>> mean_tweedie_deviance(y_true, y_pred, power=1)\n 1.4260...\n ", "source_code": "\ndef mean_tweedie_deviance(y_true, y_pred, *, sample_weight=None, power=0):\n \"\"\"Mean Tweedie deviance regression loss.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n y_true : array-like of shape (n_samples,)\n Ground truth (correct) target values.\n\n y_pred : array-like of shape (n_samples,)\n Estimated target values.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n power : float, default=0\n Tweedie power parameter. Either power <= 0 or power >= 1.\n\n The higher `p` the less weight is given to extreme\n deviations between true and predicted targets.\n\n - power < 0: Extreme stable distribution. Requires: y_pred > 0.\n - power = 0 : Normal distribution, output corresponds to\n mean_squared_error. y_true and y_pred can be any real numbers.\n - power = 1 : Poisson distribution. Requires: y_true >= 0 and\n y_pred > 0.\n - 1 < p < 2 : Compound Poisson distribution. Requires: y_true >= 0\n and y_pred > 0.\n - power = 2 : Gamma distribution. Requires: y_true > 0 and y_pred > 0.\n - power = 3 : Inverse Gaussian distribution. Requires: y_true > 0\n and y_pred > 0.\n - otherwise : Positive stable distribution. Requires: y_true > 0\n and y_pred > 0.\n\n Returns\n -------\n loss : float\n A non-negative floating point value (the best value is 0.0).\n\n Examples\n --------\n >>> from sklearn.metrics import mean_tweedie_deviance\n >>> y_true = [2, 0, 1, 4]\n >>> y_pred = [0.5, 0.5, 2., 2.]\n >>> mean_tweedie_deviance(y_true, y_pred, power=1)\n 1.4260...\n \"\"\"\n (y_type, y_true, y_pred, _) = _check_reg_targets(y_true, y_pred, None, dtype=[np.float64, np.float32])\n if y_type == 'continuous-multioutput':\n raise ValueError('Multioutput not supported in mean_tweedie_deviance')\n check_consistent_length(y_true, y_pred, sample_weight)\n if sample_weight is not None:\n sample_weight = column_or_1d(sample_weight)\n sample_weight = sample_weight[:, np.newaxis]\n dist = TweedieDistribution(power=power)\n dev = dist.unit_deviance(y_true, y_pred, check_input=True)\n return np.average(dev, weights=sample_weight)" }, { @@ -121826,7 +131068,8 @@ "docstring": { "type": "array-like of shape = (n_samples) or (n_samples, n_outputs)", "description": "Ground truth (correct) target values." - } + }, + "refined_type": {} }, { "name": "y_pred", @@ -121836,7 +131079,8 @@ "docstring": { "type": "array-like of shape = (n_samples) or (n_samples, n_outputs)", "description": "Estimated target values." - } + }, + "refined_type": {} }, { "name": "multioutput", @@ -121846,6 +131090,10 @@ "docstring": { "type": "{'raw_values', 'uniform_average'} or array-like of shape (n_outputs,), default='uniform_average'", "description": "Defines aggregating of multiple output values. Array-like value defines\nweights used to average errors.\n\n'raw_values' :\n Returns a full set of errors in case of multioutput input.\n\n'uniform_average' :\n Errors of all outputs are averaged with uniform weight." 
+ }, + "refined_type": { + "kind": "EnumType", + "values": ["raw_values", "uniform_average"] } }, { @@ -121856,13 +131104,14 @@ "docstring": { "type": "array-like of shape (n_samples,), default=None", "description": "Sample weights.\n\n.. versionadded:: 0.24" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Median absolute error regression loss.\n\nMedian absolute error output is non-negative floating point. The best value is 0.0. Read more in the :ref:`User Guide `.", - "docstring": "Median absolute error regression loss.\n\nMedian absolute error output is non-negative floating point. The best value\nis 0.0. Read more in the :ref:`User Guide `.\n\nParameters\n----------\ny_true : array-like of shape = (n_samples) or (n_samples, n_outputs)\n Ground truth (correct) target values.\n\ny_pred : array-like of shape = (n_samples) or (n_samples, n_outputs)\n Estimated target values.\n\nmultioutput : {'raw_values', 'uniform_average'} or array-like of shape (n_outputs,), default='uniform_average'\n Defines aggregating of multiple output values. Array-like value defines\n weights used to average errors.\n\n 'raw_values' :\n Returns a full set of errors in case of multioutput input.\n\n 'uniform_average' :\n Errors of all outputs are averaged with uniform weight.\n\nsample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n .. versionadded:: 0.24\n\nReturns\n-------\nloss : float or ndarray of floats\n If multioutput is 'raw_values', then mean absolute error is returned\n for each output separately.\n If multioutput is 'uniform_average' or an ndarray of weights, then the\n weighted average of all output errors is returned.\n\nExamples\n--------\n>>> from sklearn.metrics import median_absolute_error\n>>> y_true = [3, -0.5, 2, 7]\n>>> y_pred = [2.5, 0.0, 2, 8]\n>>> median_absolute_error(y_true, y_pred)\n0.5\n>>> y_true = [[0.5, 1], [-1, 1], [7, -6]]\n>>> y_pred = [[0, 2], [-1, 2], [8, -5]]\n>>> median_absolute_error(y_true, y_pred)\n0.75\n>>> median_absolute_error(y_true, y_pred, multioutput='raw_values')\narray([0.5, 1. ])\n>>> median_absolute_error(y_true, y_pred, multioutput=[0.3, 0.7])\n0.85", + "description": "Median absolute error regression loss.\n\nMedian absolute error output is non-negative floating point. The best value\nis 0.0. Read more in the :ref:`User Guide `.", + "docstring": "Median absolute error regression loss.\n\n Median absolute error output is non-negative floating point. The best value\n is 0.0. Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n y_true : array-like of shape = (n_samples) or (n_samples, n_outputs)\n Ground truth (correct) target values.\n\n y_pred : array-like of shape = (n_samples) or (n_samples, n_outputs)\n Estimated target values.\n\n multioutput : {'raw_values', 'uniform_average'} or array-like of shape (n_outputs,), default='uniform_average'\n Defines aggregating of multiple output values. Array-like value defines\n weights used to average errors.\n\n 'raw_values' :\n Returns a full set of errors in case of multioutput input.\n\n 'uniform_average' :\n Errors of all outputs are averaged with uniform weight.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n .. 
versionadded:: 0.24\n\n Returns\n -------\n loss : float or ndarray of floats\n If multioutput is 'raw_values', then mean absolute error is returned\n for each output separately.\n If multioutput is 'uniform_average' or an ndarray of weights, then the\n weighted average of all output errors is returned.\n\n Examples\n --------\n >>> from sklearn.metrics import median_absolute_error\n >>> y_true = [3, -0.5, 2, 7]\n >>> y_pred = [2.5, 0.0, 2, 8]\n >>> median_absolute_error(y_true, y_pred)\n 0.5\n >>> y_true = [[0.5, 1], [-1, 1], [7, -6]]\n >>> y_pred = [[0, 2], [-1, 2], [8, -5]]\n >>> median_absolute_error(y_true, y_pred)\n 0.75\n >>> median_absolute_error(y_true, y_pred, multioutput='raw_values')\n array([0.5, 1. ])\n >>> median_absolute_error(y_true, y_pred, multioutput=[0.3, 0.7])\n 0.85\n ", "source_code": "\ndef median_absolute_error(y_true, y_pred, *, multioutput='uniform_average', sample_weight=None):\n \"\"\"Median absolute error regression loss.\n\n Median absolute error output is non-negative floating point. The best value\n is 0.0. Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n y_true : array-like of shape = (n_samples) or (n_samples, n_outputs)\n Ground truth (correct) target values.\n\n y_pred : array-like of shape = (n_samples) or (n_samples, n_outputs)\n Estimated target values.\n\n multioutput : {'raw_values', 'uniform_average'} or array-like of shape (n_outputs,), default='uniform_average'\n Defines aggregating of multiple output values. Array-like value defines\n weights used to average errors.\n\n 'raw_values' :\n Returns a full set of errors in case of multioutput input.\n\n 'uniform_average' :\n Errors of all outputs are averaged with uniform weight.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n .. versionadded:: 0.24\n\n Returns\n -------\n loss : float or ndarray of floats\n If multioutput is 'raw_values', then mean absolute error is returned\n for each output separately.\n If multioutput is 'uniform_average' or an ndarray of weights, then the\n weighted average of all output errors is returned.\n\n Examples\n --------\n >>> from sklearn.metrics import median_absolute_error\n >>> y_true = [3, -0.5, 2, 7]\n >>> y_pred = [2.5, 0.0, 2, 8]\n >>> median_absolute_error(y_true, y_pred)\n 0.5\n >>> y_true = [[0.5, 1], [-1, 1], [7, -6]]\n >>> y_pred = [[0, 2], [-1, 2], [8, -5]]\n >>> median_absolute_error(y_true, y_pred)\n 0.75\n >>> median_absolute_error(y_true, y_pred, multioutput='raw_values')\n array([0.5, 1. ])\n >>> median_absolute_error(y_true, y_pred, multioutput=[0.3, 0.7])\n 0.85\n \"\"\"\n (y_type, y_true, y_pred, multioutput) = _check_reg_targets(y_true, y_pred, multioutput)\n if sample_weight is None:\n output_errors = np.median(np.abs(y_pred - y_true), axis=0)\n else:\n sample_weight = _check_sample_weight(sample_weight, y_pred)\n output_errors = _weighted_percentile(np.abs(y_pred - y_true), sample_weight=sample_weight)\n if isinstance(multioutput, str):\n if multioutput == 'raw_values':\n return output_errors\n elif multioutput == 'uniform_average':\n multioutput = None\n return np.average(output_errors, weights=multioutput)" }, { @@ -121880,7 +131129,8 @@ "docstring": { "type": "array-like of shape (n_samples,) or (n_samples, n_outputs)", "description": "Ground truth (correct) target values." - } + }, + "refined_type": {} }, { "name": "y_pred", @@ -121890,7 +131140,8 @@ "docstring": { "type": "array-like of shape (n_samples,) or (n_samples, n_outputs)", "description": "Estimated target values." 
- } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -121900,7 +131151,8 @@ "docstring": { "type": "array-like of shape (n_samples,), default=None", "description": "Sample weights." - } + }, + "refined_type": {} }, { "name": "multioutput", @@ -121910,13 +131162,21 @@ "docstring": { "type": "{'raw_values', 'uniform_average', 'variance_weighted'}, array-like of shape (n_outputs,) or None, default='uniform_average'", "description": "Defines aggregating of multiple output scores.\nArray-like value defines weights used to average scores.\nDefault is \"uniform_average\".\n\n'raw_values' :\n Returns a full set of scores in case of multioutput input.\n\n'uniform_average' :\n Scores of all outputs are averaged with uniform weight.\n\n'variance_weighted' :\n Scores of all outputs are averaged, weighted by the variances\n of each individual output.\n\n.. versionchanged:: 0.19\n Default value of multioutput is 'uniform_average'." + }, + "refined_type": { + "kind": "EnumType", + "values": [ + "raw_values", + "uniform_average", + "variance_weighted" + ] } } ], "results": [], "is_public": true, - "description": ":math:`R^2` (coefficient of determination) regression score function.\n\nBest possible score is 1.0 and it can be negative (because the model can be arbitrarily worse). A constant model that always predicts the expected value of y, disregarding the input features, would get a :math:`R^2` score of 0.0. Read more in the :ref:`User Guide `.", - "docstring": ":math:`R^2` (coefficient of determination) regression score function.\n\nBest possible score is 1.0 and it can be negative (because the\nmodel can be arbitrarily worse). A constant model that always\npredicts the expected value of y, disregarding the input features,\nwould get a :math:`R^2` score of 0.0.\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\ny_true : array-like of shape (n_samples,) or (n_samples, n_outputs)\n Ground truth (correct) target values.\n\ny_pred : array-like of shape (n_samples,) or (n_samples, n_outputs)\n Estimated target values.\n\nsample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\nmultioutput : {'raw_values', 'uniform_average', 'variance_weighted'}, array-like of shape (n_outputs,) or None, default='uniform_average'\n\n Defines aggregating of multiple output scores.\n Array-like value defines weights used to average scores.\n Default is \"uniform_average\".\n\n 'raw_values' :\n Returns a full set of scores in case of multioutput input.\n\n 'uniform_average' :\n Scores of all outputs are averaged with uniform weight.\n\n 'variance_weighted' :\n Scores of all outputs are averaged, weighted by the variances\n of each individual output.\n\n .. versionchanged:: 0.19\n Default value of multioutput is 'uniform_average'.\n\nReturns\n-------\nz : float or ndarray of floats\n The :math:`R^2` score or ndarray of scores if 'multioutput' is\n 'raw_values'.\n\nNotes\n-----\nThis is not a symmetric function.\n\nUnlike most other scores, :math:`R^2` score may be negative (it need not\nactually be the square of a quantity R).\n\nThis metric is not well-defined for single samples and will return a NaN\nvalue if n_samples is less than two.\n\nReferences\n----------\n.. 
[1] `Wikipedia entry on the Coefficient of determination\n `_\n\nExamples\n--------\n>>> from sklearn.metrics import r2_score\n>>> y_true = [3, -0.5, 2, 7]\n>>> y_pred = [2.5, 0.0, 2, 8]\n>>> r2_score(y_true, y_pred)\n0.948...\n>>> y_true = [[0.5, 1], [-1, 1], [7, -6]]\n>>> y_pred = [[0, 2], [-1, 2], [8, -5]]\n>>> r2_score(y_true, y_pred,\n... multioutput='variance_weighted')\n0.938...\n>>> y_true = [1, 2, 3]\n>>> y_pred = [1, 2, 3]\n>>> r2_score(y_true, y_pred)\n1.0\n>>> y_true = [1, 2, 3]\n>>> y_pred = [2, 2, 2]\n>>> r2_score(y_true, y_pred)\n0.0\n>>> y_true = [1, 2, 3]\n>>> y_pred = [3, 2, 1]\n>>> r2_score(y_true, y_pred)\n-3.0", + "description": ":math:`R^2` (coefficient of determination) regression score function.\n\nBest possible score is 1.0 and it can be negative (because the\nmodel can be arbitrarily worse). A constant model that always\npredicts the expected value of y, disregarding the input features,\nwould get a :math:`R^2` score of 0.0.\n\nRead more in the :ref:`User Guide `.", + "docstring": ":math:`R^2` (coefficient of determination) regression score function.\n\n Best possible score is 1.0 and it can be negative (because the\n model can be arbitrarily worse). A constant model that always\n predicts the expected value of y, disregarding the input features,\n would get a :math:`R^2` score of 0.0.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n y_true : array-like of shape (n_samples,) or (n_samples, n_outputs)\n Ground truth (correct) target values.\n\n y_pred : array-like of shape (n_samples,) or (n_samples, n_outputs)\n Estimated target values.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n multioutput : {'raw_values', 'uniform_average', 'variance_weighted'}, array-like of shape (n_outputs,) or None, default='uniform_average'\n\n Defines aggregating of multiple output scores.\n Array-like value defines weights used to average scores.\n Default is \"uniform_average\".\n\n 'raw_values' :\n Returns a full set of scores in case of multioutput input.\n\n 'uniform_average' :\n Scores of all outputs are averaged with uniform weight.\n\n 'variance_weighted' :\n Scores of all outputs are averaged, weighted by the variances\n of each individual output.\n\n .. versionchanged:: 0.19\n Default value of multioutput is 'uniform_average'.\n\n Returns\n -------\n z : float or ndarray of floats\n The :math:`R^2` score or ndarray of scores if 'multioutput' is\n 'raw_values'.\n\n Notes\n -----\n This is not a symmetric function.\n\n Unlike most other scores, :math:`R^2` score may be negative (it need not\n actually be the square of a quantity R).\n\n This metric is not well-defined for single samples and will return a NaN\n value if n_samples is less than two.\n\n References\n ----------\n .. [1] `Wikipedia entry on the Coefficient of determination\n `_\n\n Examples\n --------\n >>> from sklearn.metrics import r2_score\n >>> y_true = [3, -0.5, 2, 7]\n >>> y_pred = [2.5, 0.0, 2, 8]\n >>> r2_score(y_true, y_pred)\n 0.948...\n >>> y_true = [[0.5, 1], [-1, 1], [7, -6]]\n >>> y_pred = [[0, 2], [-1, 2], [8, -5]]\n >>> r2_score(y_true, y_pred,\n ... 
multioutput='variance_weighted')\n 0.938...\n >>> y_true = [1, 2, 3]\n >>> y_pred = [1, 2, 3]\n >>> r2_score(y_true, y_pred)\n 1.0\n >>> y_true = [1, 2, 3]\n >>> y_pred = [2, 2, 2]\n >>> r2_score(y_true, y_pred)\n 0.0\n >>> y_true = [1, 2, 3]\n >>> y_pred = [3, 2, 1]\n >>> r2_score(y_true, y_pred)\n -3.0\n ", "source_code": "\ndef r2_score(y_true, y_pred, *, sample_weight=None, multioutput='uniform_average'):\n \"\"\":math:`R^2` (coefficient of determination) regression score function.\n\n Best possible score is 1.0 and it can be negative (because the\n model can be arbitrarily worse). A constant model that always\n predicts the expected value of y, disregarding the input features,\n would get a :math:`R^2` score of 0.0.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n y_true : array-like of shape (n_samples,) or (n_samples, n_outputs)\n Ground truth (correct) target values.\n\n y_pred : array-like of shape (n_samples,) or (n_samples, n_outputs)\n Estimated target values.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n multioutput : {'raw_values', 'uniform_average', 'variance_weighted'}, array-like of shape (n_outputs,) or None, default='uniform_average'\n\n Defines aggregating of multiple output scores.\n Array-like value defines weights used to average scores.\n Default is \"uniform_average\".\n\n 'raw_values' :\n Returns a full set of scores in case of multioutput input.\n\n 'uniform_average' :\n Scores of all outputs are averaged with uniform weight.\n\n 'variance_weighted' :\n Scores of all outputs are averaged, weighted by the variances\n of each individual output.\n\n .. versionchanged:: 0.19\n Default value of multioutput is 'uniform_average'.\n\n Returns\n -------\n z : float or ndarray of floats\n The :math:`R^2` score or ndarray of scores if 'multioutput' is\n 'raw_values'.\n\n Notes\n -----\n This is not a symmetric function.\n\n Unlike most other scores, :math:`R^2` score may be negative (it need not\n actually be the square of a quantity R).\n\n This metric is not well-defined for single samples and will return a NaN\n value if n_samples is less than two.\n\n References\n ----------\n .. [1] `Wikipedia entry on the Coefficient of determination\n `_\n\n Examples\n --------\n >>> from sklearn.metrics import r2_score\n >>> y_true = [3, -0.5, 2, 7]\n >>> y_pred = [2.5, 0.0, 2, 8]\n >>> r2_score(y_true, y_pred)\n 0.948...\n >>> y_true = [[0.5, 1], [-1, 1], [7, -6]]\n >>> y_pred = [[0, 2], [-1, 2], [8, -5]]\n >>> r2_score(y_true, y_pred,\n ... 
multioutput='variance_weighted')\n 0.938...\n >>> y_true = [1, 2, 3]\n >>> y_pred = [1, 2, 3]\n >>> r2_score(y_true, y_pred)\n 1.0\n >>> y_true = [1, 2, 3]\n >>> y_pred = [2, 2, 2]\n >>> r2_score(y_true, y_pred)\n 0.0\n >>> y_true = [1, 2, 3]\n >>> y_pred = [3, 2, 1]\n >>> r2_score(y_true, y_pred)\n -3.0\n \"\"\"\n (y_type, y_true, y_pred, multioutput) = _check_reg_targets(y_true, y_pred, multioutput)\n check_consistent_length(y_true, y_pred, sample_weight)\n if _num_samples(y_pred) < 2:\n msg = 'R^2 score is not well-defined with less than two samples.'\n warnings.warn(msg, UndefinedMetricWarning)\n return float('nan')\n if sample_weight is not None:\n sample_weight = column_or_1d(sample_weight)\n weight = sample_weight[:, np.newaxis]\n else:\n weight = 1.0\n numerator = (weight * (y_true - y_pred)**2).sum(axis=0, dtype=np.float64)\n denominator = (weight * (y_true - np.average(y_true, axis=0, weights=sample_weight))**2).sum(axis=0, dtype=np.float64)\n nonzero_denominator = denominator != 0\n nonzero_numerator = numerator != 0\n valid_score = nonzero_denominator & nonzero_numerator\n output_scores = np.ones([y_true.shape[1]])\n output_scores[valid_score] = 1 - numerator[valid_score] / denominator[valid_score]\n output_scores[nonzero_numerator & ~nonzero_denominator] = 0.0\n if isinstance(multioutput, str):\n if multioutput == 'raw_values':\n return output_scores\n elif multioutput == 'uniform_average':\n avg_weights = None\n elif multioutput == 'variance_weighted':\n avg_weights = denominator\n if not np.any(nonzero_denominator):\n if not np.any(nonzero_numerator):\n return 1.0\n else:\n return 0.0\n else:\n avg_weights = multioutput\n return np.average(output_scores, weights=avg_weights)" }, { @@ -121934,7 +131194,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "estimator", @@ -121944,7 +131205,8 @@ "docstring": { "type": "object", "description": "Trained estimator to use for scoring. Must have a predict_proba\nmethod; the output of that is used to compute the score." - } + }, + "refined_type": {} }, { "name": "X", @@ -121954,6 +131216,10 @@ "docstring": { "type": "{array-like, sparse matrix}", "description": "Test data that will be fed to estimator.predict." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -121964,7 +131230,8 @@ "docstring": { "type": "array-like", "description": "Gold standard target values for X." - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -121974,13 +131241,14 @@ "docstring": { "type": "array-like of shape (n_samples,), default=None", "description": "Sample weights." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Evaluate predicted target values for X relative to y_true.", - "docstring": "Evaluate predicted target values for X relative to y_true.\n\nParameters\n----------\nestimator : object\n Trained estimator to use for scoring. Must have a predict_proba\n method; the output of that is used to compute the score.\n\nX : {array-like, sparse matrix}\n Test data that will be fed to estimator.predict.\n\ny_true : array-like\n Gold standard target values for X.\n\nsample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\nReturns\n-------\nscore : float\n Score function applied to prediction of estimator on X.", + "docstring": "Evaluate predicted target values for X relative to y_true.\n\n Parameters\n ----------\n estimator : object\n Trained estimator to use for scoring. 
Must have a predict_proba\n method; the output of that is used to compute the score.\n\n X : {array-like, sparse matrix}\n Test data that will be fed to estimator.predict.\n\n y_true : array-like\n Gold standard target values for X.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n Returns\n -------\n score : float\n Score function applied to prediction of estimator on X.\n ", "source_code": "\ndef __call__(self, estimator, X, y_true, sample_weight=None):\n \"\"\"Evaluate predicted target values for X relative to y_true.\n\n Parameters\n ----------\n estimator : object\n Trained estimator to use for scoring. Must have a predict_proba\n method; the output of that is used to compute the score.\n\n X : {array-like, sparse matrix}\n Test data that will be fed to estimator.predict.\n\n y_true : array-like\n Gold standard target values for X.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n Returns\n -------\n score : float\n Score function applied to prediction of estimator on X.\n \"\"\"\n return self._score(partial(_cached_call, None), estimator, X, y_true, sample_weight=sample_weight)" }, { @@ -121998,7 +131266,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "score_func", @@ -122008,7 +131277,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "sign", @@ -122018,7 +131288,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "kwargs", @@ -122028,13 +131299,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, score_func, sign, kwargs):\n self._kwargs = kwargs\n self._score_func = score_func\n self._sign = sign" }, { @@ -122052,13 +131324,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __repr__(self):\n kwargs_string = ''.join([', %s=%s' % (str(k), str(v)) for (k, v) in self._kwargs.items()])\n return 'make_scorer(%s%s%s%s)' % (self._score_func.__name__, '' if self._sign > 0 else ', greater_is_better=False', self._factory_args(), kwargs_string)" }, { @@ -122076,7 +131349,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "classes", @@ -122086,13 +131360,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@staticmethod\ndef _check_pos_label(pos_label, classes):\n if pos_label not in list(classes):\n raise ValueError(f'pos_label={pos_label} is not a valid label: {classes}')" }, { @@ -122110,7 +131385,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -122134,7 +131410,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y_pred", @@ -122144,7 +131421,8 @@ "docstring": { "type": "ndarray of shape (n_samples, n_classes)", "description": "The prediction given by `predict_proba`." - } + }, + "refined_type": {} }, { "name": "classes", @@ -122154,13 +131432,14 @@ "docstring": { "type": "ndarray of shape (n_classes,)", "description": "The class labels for the estimator." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Select the column of the positive label in `y_pred` when probabilities are provided.", - "docstring": "Select the column of the positive label in `y_pred` when\nprobabilities are provided.\n\nParameters\n----------\ny_pred : ndarray of shape (n_samples, n_classes)\n The prediction given by `predict_proba`.\n\nclasses : ndarray of shape (n_classes,)\n The class labels for the estimator.\n\nReturns\n-------\ny_pred : ndarray of shape (n_samples,)\n Probability predictions of the positive class.", + "description": "Select the column of the positive label in `y_pred` when\nprobabilities are provided.", + "docstring": "Select the column of the positive label in `y_pred` when\n probabilities are provided.\n\n Parameters\n ----------\n y_pred : ndarray of shape (n_samples, n_classes)\n The prediction given by `predict_proba`.\n\n classes : ndarray of shape (n_classes,)\n The class labels for the estimator.\n\n Returns\n -------\n y_pred : ndarray of shape (n_samples,)\n Probability predictions of the positive class.\n ", "source_code": "\ndef _select_proba_binary(self, y_pred, classes):\n \"\"\"Select the column of the positive label in `y_pred` when\n probabilities are provided.\n\n Parameters\n ----------\n y_pred : ndarray of shape (n_samples, n_classes)\n The prediction given by `predict_proba`.\n\n classes : ndarray of shape (n_classes,)\n The class labels for the estimator.\n\n Returns\n -------\n y_pred : ndarray of shape (n_samples,)\n Probability predictions of the positive class.\n \"\"\"\n if y_pred.shape[1] == 2:\n pos_label = self._kwargs.get('pos_label', classes[1])\n self._check_pos_label(pos_label, classes)\n col_idx = np.flatnonzero(classes == pos_label)[0]\n return y_pred[:, col_idx]\n err_msg = f'Got predict_proba of shape {y_pred.shape}, but need classifier with two classes for {self._score_func.__name__} scoring'\n raise ValueError(err_msg)" }, { @@ -122178,7 +131457,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "estimator", @@ -122188,7 +131468,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -122212,13 +131493,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, **scorers):\n self._scorers = scorers" }, { @@ -122236,7 +131518,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "estimator", @@ -122246,13 +131529,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Return True if using a cache is beneficial.\n\nCaching may be beneficial when one of these conditions holds: - `_ProbaScorer` will be called twice. - `_PredictScorer` will be called twice. - `_ThresholdScorer` will be called twice. - `_ThresholdScorer` and `_PredictScorer` are called and estimator is a regressor. 
- `_ThresholdScorer` and `_ProbaScorer` are called and estimator does not have a `decision_function` attribute.", - "docstring": "Return True if using a cache is beneficial.\n\nCaching may be beneficial when one of these conditions holds:\n - `_ProbaScorer` will be called twice.\n - `_PredictScorer` will be called twice.\n - `_ThresholdScorer` will be called twice.\n - `_ThresholdScorer` and `_PredictScorer` are called and\n estimator is a regressor.\n - `_ThresholdScorer` and `_ProbaScorer` are called and\n estimator does not have a `decision_function` attribute.", + "description": "Return True if using a cache is beneficial.\n\nCaching may be beneficial when one of these conditions holds:\n - `_ProbaScorer` will be called twice.\n - `_PredictScorer` will be called twice.\n - `_ThresholdScorer` will be called twice.\n - `_ThresholdScorer` and `_PredictScorer` are called and\n estimator is a regressor.\n - `_ThresholdScorer` and `_ProbaScorer` are called and\n estimator does not have a `decision_function` attribute.", + "docstring": "Return True if using a cache is beneficial.\n\n Caching may be beneficial when one of these conditions holds:\n - `_ProbaScorer` will be called twice.\n - `_PredictScorer` will be called twice.\n - `_ThresholdScorer` will be called twice.\n - `_ThresholdScorer` and `_PredictScorer` are called and\n estimator is a regressor.\n - `_ThresholdScorer` and `_ProbaScorer` are called and\n estimator does not have a `decision_function` attribute.\n\n ", "source_code": "\ndef _use_cache(self, estimator):\n \"\"\"Return True if using a cache is beneficial.\n\n Caching may be beneficial when one of these conditions holds:\n - `_ProbaScorer` will be called twice.\n - `_PredictScorer` will be called twice.\n - `_ThresholdScorer` will be called twice.\n - `_ThresholdScorer` and `_PredictScorer` are called and\n estimator is a regressor.\n - `_ThresholdScorer` and `_ProbaScorer` are called and\n estimator does not have a `decision_function` attribute.\n\n \"\"\"\n if len(self._scorers) == 1:\n return False\n counter = Counter([type(v) for v in self._scorers.values()])\n if any((counter[known_type] > 1 for known_type in [_PredictScorer, _ProbaScorer, _ThresholdScorer])):\n return True\n if counter[_ThresholdScorer]:\n if is_regressor(estimator) and counter[_PredictScorer]:\n return True\n elif counter[_ProbaScorer] and not hasattr(estimator, 'decision_function'):\n return True\n return False" }, { @@ -122270,7 +131554,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "method_caller", @@ -122280,7 +131565,8 @@ "docstring": { "type": "callable", "description": "Returns predictions given an estimator, method name, and other\narguments, potentially caching results." - } + }, + "refined_type": {} }, { "name": "estimator", @@ -122290,7 +131576,8 @@ "docstring": { "type": "object", "description": "Trained estimator to use for scoring. Must have a `predict`\nmethod; the output of that is used to compute the score." - } + }, + "refined_type": {} }, { "name": "X", @@ -122300,6 +131587,10 @@ "docstring": { "type": "{array-like, sparse matrix}", "description": "Test data that will be fed to estimator.predict." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -122310,7 +131601,8 @@ "docstring": { "type": "array-like", "description": "Gold standard target values for X." 
- } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -122320,13 +131612,14 @@ "docstring": { "type": "array-like of shape (n_samples,), default=None", "description": "Sample weights." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Evaluate predicted target values for X relative to y_true.", - "docstring": "Evaluate predicted target values for X relative to y_true.\n\nParameters\n----------\nmethod_caller : callable\n Returns predictions given an estimator, method name, and other\n arguments, potentially caching results.\n\nestimator : object\n Trained estimator to use for scoring. Must have a `predict`\n method; the output of that is used to compute the score.\n\nX : {array-like, sparse matrix}\n Test data that will be fed to estimator.predict.\n\ny_true : array-like\n Gold standard target values for X.\n\nsample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\nReturns\n-------\nscore : float\n Score function applied to prediction of estimator on X.", + "docstring": "Evaluate predicted target values for X relative to y_true.\n\n Parameters\n ----------\n method_caller : callable\n Returns predictions given an estimator, method name, and other\n arguments, potentially caching results.\n\n estimator : object\n Trained estimator to use for scoring. Must have a `predict`\n method; the output of that is used to compute the score.\n\n X : {array-like, sparse matrix}\n Test data that will be fed to estimator.predict.\n\n y_true : array-like\n Gold standard target values for X.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n Returns\n -------\n score : float\n Score function applied to prediction of estimator on X.\n ", "source_code": "\ndef _score(self, method_caller, estimator, X, y_true, sample_weight=None):\n \"\"\"Evaluate predicted target values for X relative to y_true.\n\n Parameters\n ----------\n method_caller : callable\n Returns predictions given an estimator, method name, and other\n arguments, potentially caching results.\n\n estimator : object\n Trained estimator to use for scoring. Must have a `predict`\n method; the output of that is used to compute the score.\n\n X : {array-like, sparse matrix}\n Test data that will be fed to estimator.predict.\n\n y_true : array-like\n Gold standard target values for X.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n Returns\n -------\n score : float\n Score function applied to prediction of estimator on X.\n \"\"\"\n y_pred = method_caller(estimator, 'predict', X)\n if sample_weight is not None:\n return self._sign * self._score_func(y_true, y_pred, sample_weight=sample_weight, **self._kwargs)\n else:\n return self._sign * self._score_func(y_true, y_pred, **self._kwargs)" }, { @@ -122344,13 +131637,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _factory_args(self):\n return ', needs_proba=True'" }, { @@ -122368,7 +131662,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "method_caller", @@ -122378,7 +131673,8 @@ "docstring": { "type": "callable", "description": "Returns predictions given an estimator, method name, and other\narguments, potentially caching results." 
- } + }, + "refined_type": {} }, { "name": "clf", @@ -122388,7 +131684,8 @@ "docstring": { "type": "object", "description": "Trained classifier to use for scoring. Must have a `predict_proba`\nmethod; the output of that is used to compute the score." - } + }, + "refined_type": {} }, { "name": "X", @@ -122398,6 +131695,10 @@ "docstring": { "type": "{array-like, sparse matrix}", "description": "Test data that will be fed to clf.predict_proba." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -122408,7 +131709,8 @@ "docstring": { "type": "array-like", "description": "Gold standard target values for X. These must be class labels,\nnot probabilities." - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -122418,13 +131720,14 @@ "docstring": { "type": "array-like, default=None", "description": "Sample weights." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Evaluate predicted probabilities for X relative to y_true.", - "docstring": "Evaluate predicted probabilities for X relative to y_true.\n\nParameters\n----------\nmethod_caller : callable\n Returns predictions given an estimator, method name, and other\n arguments, potentially caching results.\n\nclf : object\n Trained classifier to use for scoring. Must have a `predict_proba`\n method; the output of that is used to compute the score.\n\nX : {array-like, sparse matrix}\n Test data that will be fed to clf.predict_proba.\n\ny : array-like\n Gold standard target values for X. These must be class labels,\n not probabilities.\n\nsample_weight : array-like, default=None\n Sample weights.\n\nReturns\n-------\nscore : float\n Score function applied to prediction of estimator on X.", + "docstring": "Evaluate predicted probabilities for X relative to y_true.\n\n Parameters\n ----------\n method_caller : callable\n Returns predictions given an estimator, method name, and other\n arguments, potentially caching results.\n\n clf : object\n Trained classifier to use for scoring. Must have a `predict_proba`\n method; the output of that is used to compute the score.\n\n X : {array-like, sparse matrix}\n Test data that will be fed to clf.predict_proba.\n\n y : array-like\n Gold standard target values for X. These must be class labels,\n not probabilities.\n\n sample_weight : array-like, default=None\n Sample weights.\n\n Returns\n -------\n score : float\n Score function applied to prediction of estimator on X.\n ", "source_code": "\ndef _score(self, method_caller, clf, X, y, sample_weight=None):\n \"\"\"Evaluate predicted probabilities for X relative to y_true.\n\n Parameters\n ----------\n method_caller : callable\n Returns predictions given an estimator, method name, and other\n arguments, potentially caching results.\n\n clf : object\n Trained classifier to use for scoring. Must have a `predict_proba`\n method; the output of that is used to compute the score.\n\n X : {array-like, sparse matrix}\n Test data that will be fed to clf.predict_proba.\n\n y : array-like\n Gold standard target values for X. 
These must be class labels,\n not probabilities.\n\n sample_weight : array-like, default=None\n Sample weights.\n\n Returns\n -------\n score : float\n Score function applied to prediction of estimator on X.\n \"\"\"\n y_type = type_of_target(y)\n y_pred = method_caller(clf, 'predict_proba', X)\n if y_type == 'binary' and y_pred.shape[1] <= 2:\n y_pred = self._select_proba_binary(y_pred, clf.classes_)\n if sample_weight is not None:\n return self._sign * self._score_func(y, y_pred, sample_weight=sample_weight, **self._kwargs)\n else:\n return self._sign * self._score_func(y, y_pred, **self._kwargs)" }, { @@ -122442,13 +131745,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _factory_args(self):\n return ', needs_threshold=True'" }, { @@ -122466,7 +131770,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "method_caller", @@ -122476,7 +131781,8 @@ "docstring": { "type": "callable", "description": "Returns predictions given an estimator, method name, and other\narguments, potentially caching results." - } + }, + "refined_type": {} }, { "name": "clf", @@ -122486,7 +131792,8 @@ "docstring": { "type": "object", "description": "Trained classifier to use for scoring. Must have either a\ndecision_function method or a predict_proba method; the output of\nthat is used to compute the score." - } + }, + "refined_type": {} }, { "name": "X", @@ -122496,6 +131803,10 @@ "docstring": { "type": "{array-like, sparse matrix}", "description": "Test data that will be fed to clf.decision_function or\nclf.predict_proba." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -122506,7 +131817,8 @@ "docstring": { "type": "array-like", "description": "Gold standard target values for X. These must be class labels,\nnot decision function values." - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -122516,13 +131828,14 @@ "docstring": { "type": "array-like, default=None", "description": "Sample weights." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Evaluate decision function output for X relative to y_true.", - "docstring": "Evaluate decision function output for X relative to y_true.\n\nParameters\n----------\nmethod_caller : callable\n Returns predictions given an estimator, method name, and other\n arguments, potentially caching results.\n\nclf : object\n Trained classifier to use for scoring. Must have either a\n decision_function method or a predict_proba method; the output of\n that is used to compute the score.\n\nX : {array-like, sparse matrix}\n Test data that will be fed to clf.decision_function or\n clf.predict_proba.\n\ny : array-like\n Gold standard target values for X. These must be class labels,\n not decision function values.\n\nsample_weight : array-like, default=None\n Sample weights.\n\nReturns\n-------\nscore : float\n Score function applied to prediction of estimator on X.", + "docstring": "Evaluate decision function output for X relative to y_true.\n\n Parameters\n ----------\n method_caller : callable\n Returns predictions given an estimator, method name, and other\n arguments, potentially caching results.\n\n clf : object\n Trained classifier to use for scoring. 
Must have either a\n decision_function method or a predict_proba method; the output of\n that is used to compute the score.\n\n X : {array-like, sparse matrix}\n Test data that will be fed to clf.decision_function or\n clf.predict_proba.\n\n y : array-like\n Gold standard target values for X. These must be class labels,\n not decision function values.\n\n sample_weight : array-like, default=None\n Sample weights.\n\n Returns\n -------\n score : float\n Score function applied to prediction of estimator on X.\n ", "source_code": "\ndef _score(self, method_caller, clf, X, y, sample_weight=None):\n \"\"\"Evaluate decision function output for X relative to y_true.\n\n Parameters\n ----------\n method_caller : callable\n Returns predictions given an estimator, method name, and other\n arguments, potentially caching results.\n\n clf : object\n Trained classifier to use for scoring. Must have either a\n decision_function method or a predict_proba method; the output of\n that is used to compute the score.\n\n X : {array-like, sparse matrix}\n Test data that will be fed to clf.decision_function or\n clf.predict_proba.\n\n y : array-like\n Gold standard target values for X. These must be class labels,\n not decision function values.\n\n sample_weight : array-like, default=None\n Sample weights.\n\n Returns\n -------\n score : float\n Score function applied to prediction of estimator on X.\n \"\"\"\n y_type = type_of_target(y)\n if y_type not in ('binary', 'multilabel-indicator'):\n raise ValueError('{0} format is not supported'.format(y_type))\n if is_regressor(clf):\n y_pred = method_caller(clf, 'predict', X)\n else:\n try:\n y_pred = method_caller(clf, 'decision_function', X)\n if isinstance(y_pred, list):\n y_pred = np.vstack([p for p in y_pred]).T\n elif y_type == 'binary' and 'pos_label' in self._kwargs:\n self._check_pos_label(self._kwargs['pos_label'], clf.classes_)\n if self._kwargs['pos_label'] == clf.classes_[0]:\n y_pred *= -1\n except (NotImplementedError, AttributeError):\n y_pred = method_caller(clf, 'predict_proba', X)\n if y_type == 'binary':\n y_pred = self._select_proba_binary(y_pred, clf.classes_)\n elif isinstance(y_pred, list):\n y_pred = np.vstack([p[:, -1] for p in y_pred]).T\n if sample_weight is not None:\n return self._sign * self._score_func(y, y_pred, sample_weight=sample_weight, **self._kwargs)\n else:\n return self._sign * self._score_func(y, y_pred, **self._kwargs)" }, { @@ -122540,7 +131853,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "estimator", @@ -122550,7 +131864,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "method", @@ -122560,7 +131875,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -122584,7 +131900,8 @@ "docstring": { "type": "sklearn estimator instance", "description": "The estimator for which the scoring will be applied." - } + }, + "refined_type": {} }, { "name": "scoring", @@ -122594,13 +131911,14 @@ "docstring": { "type": "list, tuple or dict", "description": "Strategy to evaluate the performance of the cross-validated model on\nthe test set.\n\nThe possibilities are:\n\n- a list or tuple of unique strings;\n- a callable returning a dictionary where they keys are the metric\n names and the values are the metric scores;\n- a dictionary with metric names as keys and callables a values.\n\nSee :ref:`multimetric_grid_search` for an example." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Check the scoring parameter in cases when multiple metrics are allowed.", - "docstring": "Check the scoring parameter in cases when multiple metrics are allowed.\n\nParameters\n----------\nestimator : sklearn estimator instance\n The estimator for which the scoring will be applied.\n\nscoring : list, tuple or dict\n Strategy to evaluate the performance of the cross-validated model on\n the test set.\n\n The possibilities are:\n\n - a list or tuple of unique strings;\n - a callable returning a dictionary where they keys are the metric\n names and the values are the metric scores;\n - a dictionary with metric names as keys and callables a values.\n\n See :ref:`multimetric_grid_search` for an example.\n\nReturns\n-------\nscorers_dict : dict\n A dict mapping each scorer name to its validated scorer.", + "docstring": "Check the scoring parameter in cases when multiple metrics are allowed.\n\n Parameters\n ----------\n estimator : sklearn estimator instance\n The estimator for which the scoring will be applied.\n\n scoring : list, tuple or dict\n Strategy to evaluate the performance of the cross-validated model on\n the test set.\n\n The possibilities are:\n\n - a list or tuple of unique strings;\n - a callable returning a dictionary where they keys are the metric\n names and the values are the metric scores;\n - a dictionary with metric names as keys and callables a values.\n\n See :ref:`multimetric_grid_search` for an example.\n\n Returns\n -------\n scorers_dict : dict\n A dict mapping each scorer name to its validated scorer.\n ", "source_code": "\ndef _check_multimetric_scoring(estimator, scoring):\n \"\"\"Check the scoring parameter in cases when multiple metrics are allowed.\n\n Parameters\n ----------\n estimator : sklearn estimator instance\n The estimator for which the scoring will be applied.\n\n scoring : list, tuple or dict\n Strategy to evaluate the performance of the cross-validated model on\n the test set.\n\n The possibilities are:\n\n - a list or tuple of unique strings;\n - a callable returning a dictionary where they keys are the metric\n names and the values are the metric scores;\n - a dictionary with metric names as keys and callables a values.\n\n See :ref:`multimetric_grid_search` for an example.\n\n Returns\n -------\n scorers_dict : dict\n A dict mapping each scorer name to its validated scorer.\n \"\"\"\n err_msg_generic = f'scoring is invalid (got {scoring!r}). Refer to the scoring glossary for details: https://scikit-learn.org/stable/glossary.html#term-scoring'\n if isinstance(scoring, (list, tuple, set)):\n err_msg = 'The list/tuple elements must be unique strings of predefined scorers. '\n try:\n keys = set(scoring)\n except TypeError as e:\n raise ValueError(err_msg) from e\n if len(keys) != len(scoring):\n raise ValueError(f'{err_msg} Duplicate elements were found in the given list. {scoring!r}')\n elif len(keys) > 0:\n if not all((isinstance(k, str) for k in keys)):\n if any((callable(k) for k in keys)):\n raise ValueError(f'{err_msg} One or more of the elements were callables. Use a dict of score name mapped to the scorer callable. Got {scoring!r}')\n else:\n raise ValueError(f'{err_msg} Non-string types were found in the given list. Got {scoring!r}')\n scorers = {scorer: check_scoring(estimator, scoring=scorer) for scorer in scoring}\n else:\n raise ValueError(f'{err_msg} Empty list was given. 
{scoring!r}')\n elif isinstance(scoring, dict):\n keys = set(scoring)\n if not all((isinstance(k, str) for k in keys)):\n raise ValueError(f'Non-string types were found in the keys of the given dict. scoring={scoring!r}')\n if len(keys) == 0:\n raise ValueError(f'An empty dict was passed. {scoring!r}')\n scorers = {key: check_scoring(estimator, scoring=scorer) for (key, scorer) in scoring.items()}\n else:\n raise ValueError(err_msg_generic)\n return scorers" }, { @@ -122618,7 +131936,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -122642,7 +131961,8 @@ "docstring": { "type": "estimator object implementing 'fit'", "description": "The object to use to fit the data." - } + }, + "refined_type": {} }, { "name": "scoring", @@ -122652,7 +131972,8 @@ "docstring": { "type": "str or callable, default=None", "description": "A string (see model evaluation documentation) or\na scorer callable object / function with signature\n``scorer(estimator, X, y)``.\nIf None, the provided estimator object's `score` method is used." - } + }, + "refined_type": {} }, { "name": "allow_none", @@ -122662,13 +131983,14 @@ "docstring": { "type": "bool, default=False", "description": "If no scoring is specified and the estimator has no score function, we\ncan either return None or raise an exception." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Determine scorer from user options.\n\nA TypeError will be thrown if the estimator cannot be scored.", - "docstring": "Determine scorer from user options.\n\nA TypeError will be thrown if the estimator cannot be scored.\n\nParameters\n----------\nestimator : estimator object implementing 'fit'\n The object to use to fit the data.\n\nscoring : str or callable, default=None\n A string (see model evaluation documentation) or\n a scorer callable object / function with signature\n ``scorer(estimator, X, y)``.\n If None, the provided estimator object's `score` method is used.\n\nallow_none : bool, default=False\n If no scoring is specified and the estimator has no score function, we\n can either return None or raise an exception.\n\nReturns\n-------\nscoring : callable\n A scorer callable object / function with signature\n ``scorer(estimator, X, y)``.", + "docstring": "Determine scorer from user options.\n\n A TypeError will be thrown if the estimator cannot be scored.\n\n Parameters\n ----------\n estimator : estimator object implementing 'fit'\n The object to use to fit the data.\n\n scoring : str or callable, default=None\n A string (see model evaluation documentation) or\n a scorer callable object / function with signature\n ``scorer(estimator, X, y)``.\n If None, the provided estimator object's `score` method is used.\n\n allow_none : bool, default=False\n If no scoring is specified and the estimator has no score function, we\n can either return None or raise an exception.\n\n Returns\n -------\n scoring : callable\n A scorer callable object / function with signature\n ``scorer(estimator, X, y)``.\n ", "source_code": "\ndef check_scoring(estimator, scoring=None, *, allow_none=False):\n \"\"\"Determine scorer from user options.\n\n A TypeError will be thrown if the estimator cannot be scored.\n\n Parameters\n ----------\n estimator : estimator object implementing 'fit'\n The object to use to fit the data.\n\n scoring : str or callable, default=None\n A string (see model evaluation documentation) or\n a scorer callable object / function with signature\n ``scorer(estimator, X, y)``.\n If 
None, the provided estimator object's `score` method is used.\n\n allow_none : bool, default=False\n If no scoring is specified and the estimator has no score function, we\n can either return None or raise an exception.\n\n Returns\n -------\n scoring : callable\n A scorer callable object / function with signature\n ``scorer(estimator, X, y)``.\n \"\"\"\n if not hasattr(estimator, 'fit'):\n raise TypeError(\"estimator should be an estimator implementing 'fit' method, %r was passed\" % estimator)\n if isinstance(scoring, str):\n return get_scorer(scoring)\n elif callable(scoring):\n module = getattr(scoring, '__module__', None)\n if hasattr(module, 'startswith') and module.startswith('sklearn.metrics.') and not module.startswith('sklearn.metrics._scorer') and not module.startswith('sklearn.metrics.tests.'):\n raise ValueError('scoring value %r looks like it is a metric function rather than a scorer. A scorer should require an estimator as its first parameter. Please use `make_scorer` to convert a metric to a scorer.' % scoring)\n return get_scorer(scoring)\n elif scoring is None:\n if hasattr(estimator, 'score'):\n return _passthrough_scorer\n elif allow_none:\n return None\n else:\n raise TypeError(\"If no scoring is specified, the estimator passed should have a 'score' method. The estimator %r does not.\" % estimator)\n elif isinstance(scoring, Iterable):\n raise ValueError('For evaluating multiple scores, use sklearn.model_selection.cross_validate instead. {0} was passed.'.format(scoring))\n else:\n raise ValueError('scoring value should either be a callable, string or None. %r was passed' % scoring)" }, { @@ -122686,13 +132008,14 @@ "docstring": { "type": "str or callable", "description": "Scoring method as string. If callable it is returned as is." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Get a scorer from string.\n\nRead more in the :ref:`User Guide `.", - "docstring": "Get a scorer from string.\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\nscoring : str or callable\n Scoring method as string. If callable it is returned as is.\n\nReturns\n-------\nscorer : callable\n The scorer.", + "docstring": "Get a scorer from string.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n scoring : str or callable\n Scoring method as string. If callable it is returned as is.\n\n Returns\n -------\n scorer : callable\n The scorer.\n ", "source_code": "\ndef get_scorer(scoring):\n \"\"\"Get a scorer from string.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n scoring : str or callable\n Scoring method as string. If callable it is returned as is.\n\n Returns\n -------\n scorer : callable\n The scorer.\n \"\"\"\n if isinstance(scoring, str):\n try:\n scorer = SCORERS[scoring]\n except KeyError:\n raise ValueError('%r is not a valid scoring value. Use sorted(sklearn.metrics.SCORERS.keys()) to get valid options.' % scoring)\n else:\n scorer = scoring\n return scorer" }, { @@ -122710,7 +132033,8 @@ "docstring": { "type": "callable", "description": "Score function (or loss function) with signature\n``score_func(y, y_pred, **kwargs)``." - } + }, + "refined_type": {} }, { "name": "greater_is_better", @@ -122720,7 +132044,8 @@ "docstring": { "type": "bool, default=True", "description": "Whether score_func is a score function (default), meaning high is good,\nor a loss function, meaning low is good. In the latter case, the\nscorer object will sign-flip the outcome of the score_func." 
- } + }, + "refined_type": {} }, { "name": "needs_proba", @@ -122730,7 +132055,8 @@ "docstring": { "type": "bool, default=False", "description": "Whether score_func requires predict_proba to get probability estimates\nout of a classifier.\n\nIf True, for binary `y_true`, the score function is supposed to accept\na 1D `y_pred` (i.e., probability of the positive class, shape\n`(n_samples,)`)." - } + }, + "refined_type": {} }, { "name": "needs_threshold", @@ -122740,13 +132066,14 @@ "docstring": { "type": "bool, default=False", "description": "Whether score_func takes a continuous decision certainty.\nThis only works for binary classification using estimators that\nhave either a decision_function or predict_proba method.\n\nIf True, for binary `y_true`, the score function is supposed to accept\na 1D `y_pred` (i.e., probability of the positive class or the decision\nfunction, shape `(n_samples,)`).\n\nFor example ``average_precision`` or the area under the roc curve\ncan not be computed using discrete predictions alone." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Make a scorer from a performance metric or loss function.\n\nThis factory function wraps scoring functions for use in :class:`~sklearn.model_selection.GridSearchCV` and :func:`~sklearn.model_selection.cross_val_score`. It takes a score function, such as :func:`~sklearn.metrics.accuracy_score`, :func:`~sklearn.metrics.mean_squared_error`, :func:`~sklearn.metrics.adjusted_rand_index` or :func:`~sklearn.metrics.average_precision` and returns a callable that scores an estimator's output. The signature of the call is `(estimator, X, y)` where `estimator` is the model to be evaluated, `X` is the data and `y` is the ground truth labeling (or `None` in the case of unsupervised models). Read more in the :ref:`User Guide `.", - "docstring": "Make a scorer from a performance metric or loss function.\n\nThis factory function wraps scoring functions for use in\n:class:`~sklearn.model_selection.GridSearchCV` and\n:func:`~sklearn.model_selection.cross_val_score`.\nIt takes a score function, such as :func:`~sklearn.metrics.accuracy_score`,\n:func:`~sklearn.metrics.mean_squared_error`,\n:func:`~sklearn.metrics.adjusted_rand_index` or\n:func:`~sklearn.metrics.average_precision`\nand returns a callable that scores an estimator's output.\nThe signature of the call is `(estimator, X, y)` where `estimator`\nis the model to be evaluated, `X` is the data and `y` is the\nground truth labeling (or `None` in the case of unsupervised models).\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\nscore_func : callable\n Score function (or loss function) with signature\n ``score_func(y, y_pred, **kwargs)``.\n\ngreater_is_better : bool, default=True\n Whether score_func is a score function (default), meaning high is good,\n or a loss function, meaning low is good. 
In the latter case, the\n scorer object will sign-flip the outcome of the score_func.\n\nneeds_proba : bool, default=False\n Whether score_func requires predict_proba to get probability estimates\n out of a classifier.\n\n If True, for binary `y_true`, the score function is supposed to accept\n a 1D `y_pred` (i.e., probability of the positive class, shape\n `(n_samples,)`).\n\nneeds_threshold : bool, default=False\n Whether score_func takes a continuous decision certainty.\n This only works for binary classification using estimators that\n have either a decision_function or predict_proba method.\n\n If True, for binary `y_true`, the score function is supposed to accept\n a 1D `y_pred` (i.e., probability of the positive class or the decision\n function, shape `(n_samples,)`).\n\n For example ``average_precision`` or the area under the roc curve\n can not be computed using discrete predictions alone.\n\n**kwargs : additional arguments\n Additional parameters to be passed to score_func.\n\nReturns\n-------\nscorer : callable\n Callable object that returns a scalar score; greater is better.\n\nExamples\n--------\n>>> from sklearn.metrics import fbeta_score, make_scorer\n>>> ftwo_scorer = make_scorer(fbeta_score, beta=2)\n>>> ftwo_scorer\nmake_scorer(fbeta_score, beta=2)\n>>> from sklearn.model_selection import GridSearchCV\n>>> from sklearn.svm import LinearSVC\n>>> grid = GridSearchCV(LinearSVC(), param_grid={'C': [1, 10]},\n... scoring=ftwo_scorer)\n\nNotes\n-----\nIf `needs_proba=False` and `needs_threshold=False`, the score\nfunction is supposed to accept the output of :term:`predict`. If\n`needs_proba=True`, the score function is supposed to accept the\noutput of :term:`predict_proba` (For binary `y_true`, the score function is\nsupposed to accept probability of the positive class). 
If\n`needs_threshold=True`, the score function is supposed to accept the\noutput of :term:`decision_function` or :term:`predict_proba` when\n:term:`decision_function` is not present.", + "description": "Make a scorer from a performance metric or loss function.\n\nThis factory function wraps scoring functions for use in\n:class:`~sklearn.model_selection.GridSearchCV` and\n:func:`~sklearn.model_selection.cross_val_score`.\nIt takes a score function, such as :func:`~sklearn.metrics.accuracy_score`,\n:func:`~sklearn.metrics.mean_squared_error`,\n:func:`~sklearn.metrics.adjusted_rand_index` or\n:func:`~sklearn.metrics.average_precision`\nand returns a callable that scores an estimator's output.\nThe signature of the call is `(estimator, X, y)` where `estimator`\nis the model to be evaluated, `X` is the data and `y` is the\nground truth labeling (or `None` in the case of unsupervised models).\n\nRead more in the :ref:`User Guide `.", + "docstring": "Make a scorer from a performance metric or loss function.\n\n This factory function wraps scoring functions for use in\n :class:`~sklearn.model_selection.GridSearchCV` and\n :func:`~sklearn.model_selection.cross_val_score`.\n It takes a score function, such as :func:`~sklearn.metrics.accuracy_score`,\n :func:`~sklearn.metrics.mean_squared_error`,\n :func:`~sklearn.metrics.adjusted_rand_index` or\n :func:`~sklearn.metrics.average_precision`\n and returns a callable that scores an estimator's output.\n The signature of the call is `(estimator, X, y)` where `estimator`\n is the model to be evaluated, `X` is the data and `y` is the\n ground truth labeling (or `None` in the case of unsupervised models).\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n score_func : callable\n Score function (or loss function) with signature\n ``score_func(y, y_pred, **kwargs)``.\n\n greater_is_better : bool, default=True\n Whether score_func is a score function (default), meaning high is good,\n or a loss function, meaning low is good. In the latter case, the\n scorer object will sign-flip the outcome of the score_func.\n\n needs_proba : bool, default=False\n Whether score_func requires predict_proba to get probability estimates\n out of a classifier.\n\n If True, for binary `y_true`, the score function is supposed to accept\n a 1D `y_pred` (i.e., probability of the positive class, shape\n `(n_samples,)`).\n\n needs_threshold : bool, default=False\n Whether score_func takes a continuous decision certainty.\n This only works for binary classification using estimators that\n have either a decision_function or predict_proba method.\n\n If True, for binary `y_true`, the score function is supposed to accept\n a 1D `y_pred` (i.e., probability of the positive class or the decision\n function, shape `(n_samples,)`).\n\n For example ``average_precision`` or the area under the roc curve\n can not be computed using discrete predictions alone.\n\n **kwargs : additional arguments\n Additional parameters to be passed to score_func.\n\n Returns\n -------\n scorer : callable\n Callable object that returns a scalar score; greater is better.\n\n Examples\n --------\n >>> from sklearn.metrics import fbeta_score, make_scorer\n >>> ftwo_scorer = make_scorer(fbeta_score, beta=2)\n >>> ftwo_scorer\n make_scorer(fbeta_score, beta=2)\n >>> from sklearn.model_selection import GridSearchCV\n >>> from sklearn.svm import LinearSVC\n >>> grid = GridSearchCV(LinearSVC(), param_grid={'C': [1, 10]},\n ... 
scoring=ftwo_scorer)\n\n Notes\n -----\n If `needs_proba=False` and `needs_threshold=False`, the score\n function is supposed to accept the output of :term:`predict`. If\n `needs_proba=True`, the score function is supposed to accept the\n output of :term:`predict_proba` (For binary `y_true`, the score function is\n supposed to accept probability of the positive class). If\n `needs_threshold=True`, the score function is supposed to accept the\n output of :term:`decision_function` or :term:`predict_proba` when\n :term:`decision_function` is not present.\n ", "source_code": "\ndef make_scorer(score_func, *, greater_is_better=True, needs_proba=False, needs_threshold=False, **kwargs):\n \"\"\"Make a scorer from a performance metric or loss function.\n\n This factory function wraps scoring functions for use in\n :class:`~sklearn.model_selection.GridSearchCV` and\n :func:`~sklearn.model_selection.cross_val_score`.\n It takes a score function, such as :func:`~sklearn.metrics.accuracy_score`,\n :func:`~sklearn.metrics.mean_squared_error`,\n :func:`~sklearn.metrics.adjusted_rand_index` or\n :func:`~sklearn.metrics.average_precision`\n and returns a callable that scores an estimator's output.\n The signature of the call is `(estimator, X, y)` where `estimator`\n is the model to be evaluated, `X` is the data and `y` is the\n ground truth labeling (or `None` in the case of unsupervised models).\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n score_func : callable\n Score function (or loss function) with signature\n ``score_func(y, y_pred, **kwargs)``.\n\n greater_is_better : bool, default=True\n Whether score_func is a score function (default), meaning high is good,\n or a loss function, meaning low is good. In the latter case, the\n scorer object will sign-flip the outcome of the score_func.\n\n needs_proba : bool, default=False\n Whether score_func requires predict_proba to get probability estimates\n out of a classifier.\n\n If True, for binary `y_true`, the score function is supposed to accept\n a 1D `y_pred` (i.e., probability of the positive class, shape\n `(n_samples,)`).\n\n needs_threshold : bool, default=False\n Whether score_func takes a continuous decision certainty.\n This only works for binary classification using estimators that\n have either a decision_function or predict_proba method.\n\n If True, for binary `y_true`, the score function is supposed to accept\n a 1D `y_pred` (i.e., probability of the positive class or the decision\n function, shape `(n_samples,)`).\n\n For example ``average_precision`` or the area under the roc curve\n can not be computed using discrete predictions alone.\n\n **kwargs : additional arguments\n Additional parameters to be passed to score_func.\n\n Returns\n -------\n scorer : callable\n Callable object that returns a scalar score; greater is better.\n\n Examples\n --------\n >>> from sklearn.metrics import fbeta_score, make_scorer\n >>> ftwo_scorer = make_scorer(fbeta_score, beta=2)\n >>> ftwo_scorer\n make_scorer(fbeta_score, beta=2)\n >>> from sklearn.model_selection import GridSearchCV\n >>> from sklearn.svm import LinearSVC\n >>> grid = GridSearchCV(LinearSVC(), param_grid={'C': [1, 10]},\n ... scoring=ftwo_scorer)\n\n Notes\n -----\n If `needs_proba=False` and `needs_threshold=False`, the score\n function is supposed to accept the output of :term:`predict`. 
If\n `needs_proba=True`, the score function is supposed to accept the\n output of :term:`predict_proba` (For binary `y_true`, the score function is\n supposed to accept probability of the positive class). If\n `needs_threshold=True`, the score function is supposed to accept the\n output of :term:`decision_function` or :term:`predict_proba` when\n :term:`decision_function` is not present.\n \"\"\"\n sign = 1 if greater_is_better else -1\n if needs_proba and needs_threshold:\n raise ValueError('Set either needs_proba or needs_threshold to True, but not both.')\n if needs_proba:\n cls = _ProbaScorer\n elif needs_threshold:\n cls = _ThresholdScorer\n else:\n cls = _PredictScorer\n return cls(score_func, sign, kwargs)" }, { @@ -122764,7 +132091,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "b", @@ -122774,7 +132102,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -122798,7 +132127,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "a_cols", @@ -122808,7 +132138,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "b_rows", @@ -122818,7 +132149,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "b_cols", @@ -122828,7 +132160,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -122852,7 +132185,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "b", @@ -122862,7 +132196,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "similarity", @@ -122872,13 +132207,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Computes pairwise similarity matrix.\n\nresult[i, j] is the Jaccard coefficient of a's bicluster i and b's bicluster j.", - "docstring": "Computes pairwise similarity matrix.\n\nresult[i, j] is the Jaccard coefficient of a's bicluster i and b's\nbicluster j.", + "description": "Computes pairwise similarity matrix.\n\nresult[i, j] is the Jaccard coefficient of a's bicluster i and b's\nbicluster j.", + "docstring": "Computes pairwise similarity matrix.\n\n result[i, j] is the Jaccard coefficient of a's bicluster i and b's\n bicluster j.\n\n ", "source_code": "\ndef _pairwise_similarity(a, b, similarity):\n \"\"\"Computes pairwise similarity matrix.\n\n result[i, j] is the Jaccard coefficient of a's bicluster i and b's\n bicluster j.\n\n \"\"\"\n (a_rows, a_cols, b_rows, b_cols) = _check_rows_and_columns(a, b)\n n_a = a_rows.shape[0]\n n_b = b_rows.shape[0]\n result = np.array(list((list((similarity(a_rows[i], a_cols[i], b_rows[j], b_cols[j]) for j in range(n_b))) for i in range(n_a))))\n return result" }, { @@ -122896,7 +132232,8 @@ "docstring": { "type": "(rows, columns)", "description": "Tuple of row and column indicators for a set of biclusters." - } + }, + "refined_type": {} }, { "name": "b", @@ -122906,7 +132243,8 @@ "docstring": { "type": "(rows, columns)", "description": "Another set of biclusters like ``a``." 
- } + }, + "refined_type": {} }, { "name": "similarity", @@ -122916,13 +132254,14 @@ "docstring": { "type": "'jaccard' or callable, default='jaccard'", "description": "May be the string \"jaccard\" to use the Jaccard coefficient, or\nany function that takes four arguments, each of which is a 1d\nindicator vector: (a_rows, a_columns, b_rows, b_columns)." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "The similarity of two sets of biclusters.\n\nSimilarity between individual biclusters is computed. Then the best matching between sets is found using the Hungarian algorithm. The final score is the sum of similarities divided by the size of the larger set. Read more in the :ref:`User Guide `.", - "docstring": "The similarity of two sets of biclusters.\n\nSimilarity between individual biclusters is computed. Then the\nbest matching between sets is found using the Hungarian algorithm.\nThe final score is the sum of similarities divided by the size of\nthe larger set.\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\na : (rows, columns)\n Tuple of row and column indicators for a set of biclusters.\n\nb : (rows, columns)\n Another set of biclusters like ``a``.\n\nsimilarity : 'jaccard' or callable, default='jaccard'\n May be the string \"jaccard\" to use the Jaccard coefficient, or\n any function that takes four arguments, each of which is a 1d\n indicator vector: (a_rows, a_columns, b_rows, b_columns).\n\nReferences\n----------\n\n* Hochreiter, Bodenhofer, et. al., 2010. `FABIA: factor analysis\n for bicluster acquisition\n `__.", + "description": "The similarity of two sets of biclusters.\n\nSimilarity between individual biclusters is computed. Then the\nbest matching between sets is found using the Hungarian algorithm.\nThe final score is the sum of similarities divided by the size of\nthe larger set.\n\nRead more in the :ref:`User Guide `.", + "docstring": "The similarity of two sets of biclusters.\n\n Similarity between individual biclusters is computed. Then the\n best matching between sets is found using the Hungarian algorithm.\n The final score is the sum of similarities divided by the size of\n the larger set.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n a : (rows, columns)\n Tuple of row and column indicators for a set of biclusters.\n\n b : (rows, columns)\n Another set of biclusters like ``a``.\n\n similarity : 'jaccard' or callable, default='jaccard'\n May be the string \"jaccard\" to use the Jaccard coefficient, or\n any function that takes four arguments, each of which is a 1d\n indicator vector: (a_rows, a_columns, b_rows, b_columns).\n\n References\n ----------\n\n * Hochreiter, Bodenhofer, et. al., 2010. `FABIA: factor analysis\n for bicluster acquisition\n `__.\n\n ", "source_code": "\ndef consensus_score(a, b, *, similarity='jaccard'):\n \"\"\"The similarity of two sets of biclusters.\n\n Similarity between individual biclusters is computed. 
Then the\n best matching between sets is found using the Hungarian algorithm.\n The final score is the sum of similarities divided by the size of\n the larger set.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n a : (rows, columns)\n Tuple of row and column indicators for a set of biclusters.\n\n b : (rows, columns)\n Another set of biclusters like ``a``.\n\n similarity : 'jaccard' or callable, default='jaccard'\n May be the string \"jaccard\" to use the Jaccard coefficient, or\n any function that takes four arguments, each of which is a 1d\n indicator vector: (a_rows, a_columns, b_rows, b_columns).\n\n References\n ----------\n\n * Hochreiter, Bodenhofer, et. al., 2010. `FABIA: factor analysis\n for bicluster acquisition\n `__.\n\n \"\"\"\n if similarity == 'jaccard':\n similarity = _jaccard\n matrix = _pairwise_similarity(a, b, similarity)\n (row_indices, col_indices) = linear_sum_assignment(1.0 - matrix)\n n_a = len(a[0])\n n_b = len(b[0])\n return matrix[row_indices, col_indices].sum() / max(n_a, n_b)" }, { @@ -122940,7 +132279,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "V", @@ -122950,7 +132290,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "average_method", @@ -122960,7 +132301,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -122984,7 +132326,8 @@ "docstring": { "type": "int array, shape = [n_samples]", "description": "A clustering of the data into disjoint subsets, called :math:`U` in\nthe above formula." - } + }, + "refined_type": {} }, { "name": "labels_pred", @@ -122994,7 +132337,8 @@ "docstring": { "type": "int array-like of shape (n_samples,)", "description": "A clustering of the data into disjoint subsets, called :math:`V` in\nthe above formula." - } + }, + "refined_type": {} }, { "name": "average_method", @@ -123004,13 +132348,14 @@ "docstring": { "type": "str, default='arithmetic'", "description": "How to compute the normalizer in the denominator. Possible options\nare 'min', 'geometric', 'arithmetic', and 'max'.\n\n.. versionadded:: 0.20\n\n.. versionchanged:: 0.22\n The default value of ``average_method`` changed from 'max' to\n 'arithmetic'." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Adjusted Mutual Information between two clusterings.\n\nAdjusted Mutual Information (AMI) is an adjustment of the Mutual Information (MI) score to account for chance. It accounts for the fact that the MI is generally higher for two clusterings with a larger number of clusters, regardless of whether there is actually more information shared. For two clusterings :math:`U` and :math:`V`, the AMI is given as:: AMI(U, V) = [MI(U, V) - E(MI(U, V))] / [avg(H(U), H(V)) - E(MI(U, V))] This metric is independent of the absolute values of the labels: a permutation of the class or cluster label values won't change the score value in any way. This metric is furthermore symmetric: switching :math:`U` (``label_true``) with :math:`V` (``labels_pred``) will return the same score value. This can be useful to measure the agreement of two independent label assignments strategies on the same dataset when the real ground truth is not known. Be mindful that this function is an order of magnitude slower than other metrics, such as the Adjusted Rand Index. 
Read more in the :ref:`User Guide `.", - "docstring": "Adjusted Mutual Information between two clusterings.\n\nAdjusted Mutual Information (AMI) is an adjustment of the Mutual\nInformation (MI) score to account for chance. It accounts for the fact that\nthe MI is generally higher for two clusterings with a larger number of\nclusters, regardless of whether there is actually more information shared.\nFor two clusterings :math:`U` and :math:`V`, the AMI is given as::\n\n AMI(U, V) = [MI(U, V) - E(MI(U, V))] / [avg(H(U), H(V)) - E(MI(U, V))]\n\nThis metric is independent of the absolute values of the labels:\na permutation of the class or cluster label values won't change the\nscore value in any way.\n\nThis metric is furthermore symmetric: switching :math:`U` (``label_true``)\nwith :math:`V` (``labels_pred``) will return the same score value. This can\nbe useful to measure the agreement of two independent label assignments\nstrategies on the same dataset when the real ground truth is not known.\n\nBe mindful that this function is an order of magnitude slower than other\nmetrics, such as the Adjusted Rand Index.\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\nlabels_true : int array, shape = [n_samples]\n A clustering of the data into disjoint subsets, called :math:`U` in\n the above formula.\n\nlabels_pred : int array-like of shape (n_samples,)\n A clustering of the data into disjoint subsets, called :math:`V` in\n the above formula.\n\naverage_method : str, default='arithmetic'\n How to compute the normalizer in the denominator. Possible options\n are 'min', 'geometric', 'arithmetic', and 'max'.\n\n .. versionadded:: 0.20\n\n .. versionchanged:: 0.22\n The default value of ``average_method`` changed from 'max' to\n 'arithmetic'.\n\nReturns\n-------\nami: float (upperlimited by 1.0)\n The AMI returns a value of 1 when the two partitions are identical\n (ie perfectly matched). Random partitions (independent labellings) have\n an expected AMI around 0 on average hence can be negative. The value is\n in adjusted nats (based on the natural logarithm).\n\nSee Also\n--------\nadjusted_rand_score : Adjusted Rand Index.\nmutual_info_score : Mutual Information (not adjusted for chance).\n\nExamples\n--------\n\nPerfect labelings are both homogeneous and complete, hence have\nscore 1.0::\n\n >>> from sklearn.metrics.cluster import adjusted_mutual_info_score\n >>> adjusted_mutual_info_score([0, 0, 1, 1], [0, 0, 1, 1])\n ... # doctest: +SKIP\n 1.0\n >>> adjusted_mutual_info_score([0, 0, 1, 1], [1, 1, 0, 0])\n ... # doctest: +SKIP\n 1.0\n\nIf classes members are completely split across different clusters,\nthe assignment is totally in-complete, hence the AMI is null::\n\n >>> adjusted_mutual_info_score([0, 0, 0, 0], [0, 1, 2, 3])\n ... # doctest: +SKIP\n 0.0\n\nReferences\n----------\n.. [1] `Vinh, Epps, and Bailey, (2010). Information Theoretic Measures for\n Clusterings Comparison: Variants, Properties, Normalization and\n Correction for Chance, JMLR\n `_\n\n.. [2] `Wikipedia entry for the Adjusted Mutual Information\n `_", + "description": "Adjusted Mutual Information between two clusterings.\n\nAdjusted Mutual Information (AMI) is an adjustment of the Mutual\nInformation (MI) score to account for chance. 
It accounts for the fact that\nthe MI is generally higher for two clusterings with a larger number of\nclusters, regardless of whether there is actually more information shared.\nFor two clusterings :math:`U` and :math:`V`, the AMI is given as::\n\n AMI(U, V) = [MI(U, V) - E(MI(U, V))] / [avg(H(U), H(V)) - E(MI(U, V))]\n\nThis metric is independent of the absolute values of the labels:\na permutation of the class or cluster label values won't change the\nscore value in any way.\n\nThis metric is furthermore symmetric: switching :math:`U` (``label_true``)\nwith :math:`V` (``labels_pred``) will return the same score value. This can\nbe useful to measure the agreement of two independent label assignments\nstrategies on the same dataset when the real ground truth is not known.\n\nBe mindful that this function is an order of magnitude slower than other\nmetrics, such as the Adjusted Rand Index.\n\nRead more in the :ref:`User Guide `.", + "docstring": "Adjusted Mutual Information between two clusterings.\n\n Adjusted Mutual Information (AMI) is an adjustment of the Mutual\n Information (MI) score to account for chance. It accounts for the fact that\n the MI is generally higher for two clusterings with a larger number of\n clusters, regardless of whether there is actually more information shared.\n For two clusterings :math:`U` and :math:`V`, the AMI is given as::\n\n AMI(U, V) = [MI(U, V) - E(MI(U, V))] / [avg(H(U), H(V)) - E(MI(U, V))]\n\n This metric is independent of the absolute values of the labels:\n a permutation of the class or cluster label values won't change the\n score value in any way.\n\n This metric is furthermore symmetric: switching :math:`U` (``label_true``)\n with :math:`V` (``labels_pred``) will return the same score value. This can\n be useful to measure the agreement of two independent label assignments\n strategies on the same dataset when the real ground truth is not known.\n\n Be mindful that this function is an order of magnitude slower than other\n metrics, such as the Adjusted Rand Index.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n labels_true : int array, shape = [n_samples]\n A clustering of the data into disjoint subsets, called :math:`U` in\n the above formula.\n\n labels_pred : int array-like of shape (n_samples,)\n A clustering of the data into disjoint subsets, called :math:`V` in\n the above formula.\n\n average_method : str, default='arithmetic'\n How to compute the normalizer in the denominator. Possible options\n are 'min', 'geometric', 'arithmetic', and 'max'.\n\n .. versionadded:: 0.20\n\n .. versionchanged:: 0.22\n The default value of ``average_method`` changed from 'max' to\n 'arithmetic'.\n\n Returns\n -------\n ami: float (upperlimited by 1.0)\n The AMI returns a value of 1 when the two partitions are identical\n (ie perfectly matched). Random partitions (independent labellings) have\n an expected AMI around 0 on average hence can be negative. The value is\n in adjusted nats (based on the natural logarithm).\n\n See Also\n --------\n adjusted_rand_score : Adjusted Rand Index.\n mutual_info_score : Mutual Information (not adjusted for chance).\n\n Examples\n --------\n\n Perfect labelings are both homogeneous and complete, hence have\n score 1.0::\n\n >>> from sklearn.metrics.cluster import adjusted_mutual_info_score\n >>> adjusted_mutual_info_score([0, 0, 1, 1], [0, 0, 1, 1])\n ... # doctest: +SKIP\n 1.0\n >>> adjusted_mutual_info_score([0, 0, 1, 1], [1, 1, 0, 0])\n ... 
# doctest: +SKIP\n 1.0\n\n If classes members are completely split across different clusters,\n the assignment is totally in-complete, hence the AMI is null::\n\n >>> adjusted_mutual_info_score([0, 0, 0, 0], [0, 1, 2, 3])\n ... # doctest: +SKIP\n 0.0\n\n References\n ----------\n .. [1] `Vinh, Epps, and Bailey, (2010). Information Theoretic Measures for\n Clusterings Comparison: Variants, Properties, Normalization and\n Correction for Chance, JMLR\n `_\n\n .. [2] `Wikipedia entry for the Adjusted Mutual Information\n `_\n ", "source_code": "\ndef adjusted_mutual_info_score(labels_true, labels_pred, *, average_method='arithmetic'):\n \"\"\"Adjusted Mutual Information between two clusterings.\n\n Adjusted Mutual Information (AMI) is an adjustment of the Mutual\n Information (MI) score to account for chance. It accounts for the fact that\n the MI is generally higher for two clusterings with a larger number of\n clusters, regardless of whether there is actually more information shared.\n For two clusterings :math:`U` and :math:`V`, the AMI is given as::\n\n AMI(U, V) = [MI(U, V) - E(MI(U, V))] / [avg(H(U), H(V)) - E(MI(U, V))]\n\n This metric is independent of the absolute values of the labels:\n a permutation of the class or cluster label values won't change the\n score value in any way.\n\n This metric is furthermore symmetric: switching :math:`U` (``label_true``)\n with :math:`V` (``labels_pred``) will return the same score value. This can\n be useful to measure the agreement of two independent label assignments\n strategies on the same dataset when the real ground truth is not known.\n\n Be mindful that this function is an order of magnitude slower than other\n metrics, such as the Adjusted Rand Index.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n labels_true : int array, shape = [n_samples]\n A clustering of the data into disjoint subsets, called :math:`U` in\n the above formula.\n\n labels_pred : int array-like of shape (n_samples,)\n A clustering of the data into disjoint subsets, called :math:`V` in\n the above formula.\n\n average_method : str, default='arithmetic'\n How to compute the normalizer in the denominator. Possible options\n are 'min', 'geometric', 'arithmetic', and 'max'.\n\n .. versionadded:: 0.20\n\n .. versionchanged:: 0.22\n The default value of ``average_method`` changed from 'max' to\n 'arithmetic'.\n\n Returns\n -------\n ami: float (upperlimited by 1.0)\n The AMI returns a value of 1 when the two partitions are identical\n (ie perfectly matched). Random partitions (independent labellings) have\n an expected AMI around 0 on average hence can be negative. The value is\n in adjusted nats (based on the natural logarithm).\n\n See Also\n --------\n adjusted_rand_score : Adjusted Rand Index.\n mutual_info_score : Mutual Information (not adjusted for chance).\n\n Examples\n --------\n\n Perfect labelings are both homogeneous and complete, hence have\n score 1.0::\n\n >>> from sklearn.metrics.cluster import adjusted_mutual_info_score\n >>> adjusted_mutual_info_score([0, 0, 1, 1], [0, 0, 1, 1])\n ... # doctest: +SKIP\n 1.0\n >>> adjusted_mutual_info_score([0, 0, 1, 1], [1, 1, 0, 0])\n ... # doctest: +SKIP\n 1.0\n\n If classes members are completely split across different clusters,\n the assignment is totally in-complete, hence the AMI is null::\n\n >>> adjusted_mutual_info_score([0, 0, 0, 0], [0, 1, 2, 3])\n ... # doctest: +SKIP\n 0.0\n\n References\n ----------\n .. [1] `Vinh, Epps, and Bailey, (2010). 
Information Theoretic Measures for\n Clusterings Comparison: Variants, Properties, Normalization and\n Correction for Chance, JMLR\n `_\n\n .. [2] `Wikipedia entry for the Adjusted Mutual Information\n `_\n \"\"\"\n (labels_true, labels_pred) = check_clusterings(labels_true, labels_pred)\n n_samples = labels_true.shape[0]\n classes = np.unique(labels_true)\n clusters = np.unique(labels_pred)\n if classes.shape[0] == clusters.shape[0] == 1 or classes.shape[0] == clusters.shape[0] == 0:\n return 1.0\n contingency = contingency_matrix(labels_true, labels_pred, sparse=True)\n contingency = contingency.astype(np.float64, **_astype_copy_false(contingency))\n mi = mutual_info_score(labels_true, labels_pred, contingency=contingency)\n emi = expected_mutual_information(contingency, n_samples)\n (h_true, h_pred) = (entropy(labels_true), entropy(labels_pred))\n normalizer = _generalized_average(h_true, h_pred, average_method)\n denominator = normalizer - emi\n if denominator < 0:\n denominator = min(denominator, -np.finfo('float64').eps)\n else:\n denominator = max(denominator, np.finfo('float64').eps)\n ami = (mi - emi) / denominator\n return ami" }, { @@ -123028,7 +132373,8 @@ "docstring": { "type": "int array, shape = [n_samples]", "description": "Ground truth class labels to be used as a reference" - } + }, + "refined_type": {} }, { "name": "labels_pred", @@ -123038,13 +132384,14 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "Cluster labels to evaluate" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Rand index adjusted for chance.\n\nThe Rand Index computes a similarity measure between two clusterings by considering all pairs of samples and counting pairs that are assigned in the same or different clusters in the predicted and true clusterings. The raw RI score is then \"adjusted for chance\" into the ARI score using the following scheme:: ARI = (RI - Expected_RI) / (max(RI) - Expected_RI) The adjusted Rand index is thus ensured to have a value close to 0.0 for random labeling independently of the number of clusters and samples and exactly 1.0 when the clusterings are identical (up to a permutation). ARI is a symmetric measure:: adjusted_rand_score(a, b) == adjusted_rand_score(b, a) Read more in the :ref:`User Guide `.", - "docstring": "Rand index adjusted for chance.\n\nThe Rand Index computes a similarity measure between two clusterings\nby considering all pairs of samples and counting pairs that are\nassigned in the same or different clusters in the predicted and\ntrue clusterings.\n\nThe raw RI score is then \"adjusted for chance\" into the ARI score\nusing the following scheme::\n\n ARI = (RI - Expected_RI) / (max(RI) - Expected_RI)\n\nThe adjusted Rand index is thus ensured to have a value close to\n0.0 for random labeling independently of the number of clusters and\nsamples and exactly 1.0 when the clusterings are identical (up to\na permutation).\n\nARI is a symmetric measure::\n\n adjusted_rand_score(a, b) == adjusted_rand_score(b, a)\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\nlabels_true : int array, shape = [n_samples]\n Ground truth class labels to be used as a reference\n\nlabels_pred : array-like of shape (n_samples,)\n Cluster labels to evaluate\n\nReturns\n-------\nARI : float\n Similarity score between -1.0 and 1.0. Random labelings have an ARI\n close to 0.0. 
1.0 stands for perfect match.\n\nExamples\n--------\nPerfectly matching labelings have a score of 1 even\n\n >>> from sklearn.metrics.cluster import adjusted_rand_score\n >>> adjusted_rand_score([0, 0, 1, 1], [0, 0, 1, 1])\n 1.0\n >>> adjusted_rand_score([0, 0, 1, 1], [1, 1, 0, 0])\n 1.0\n\nLabelings that assign all classes members to the same clusters\nare complete but may not always be pure, hence penalized::\n\n >>> adjusted_rand_score([0, 0, 1, 2], [0, 0, 1, 1])\n 0.57...\n\nARI is symmetric, so labelings that have pure clusters with members\ncoming from the same classes but unnecessary splits are penalized::\n\n >>> adjusted_rand_score([0, 0, 1, 1], [0, 0, 1, 2])\n 0.57...\n\nIf classes members are completely split across different clusters, the\nassignment is totally incomplete, hence the ARI is very low::\n\n >>> adjusted_rand_score([0, 0, 0, 0], [0, 1, 2, 3])\n 0.0\n\nReferences\n----------\n.. [Hubert1985] L. Hubert and P. Arabie, Comparing Partitions,\n Journal of Classification 1985\n https://link.springer.com/article/10.1007%2FBF01908075\n\n.. [Steinley2004] D. Steinley, Properties of the Hubert-Arabie\n adjusted Rand index, Psychological Methods 2004\n\n.. [wk] https://en.wikipedia.org/wiki/Rand_index#Adjusted_Rand_index\n\nSee Also\n--------\nadjusted_mutual_info_score : Adjusted Mutual Information.", + "description": "Rand index adjusted for chance.\n\nThe Rand Index computes a similarity measure between two clusterings\nby considering all pairs of samples and counting pairs that are\nassigned in the same or different clusters in the predicted and\ntrue clusterings.\n\nThe raw RI score is then \"adjusted for chance\" into the ARI score\nusing the following scheme::\n\n ARI = (RI - Expected_RI) / (max(RI) - Expected_RI)\n\nThe adjusted Rand index is thus ensured to have a value close to\n0.0 for random labeling independently of the number of clusters and\nsamples and exactly 1.0 when the clusterings are identical (up to\na permutation).\n\nARI is a symmetric measure::\n\n adjusted_rand_score(a, b) == adjusted_rand_score(b, a)\n\nRead more in the :ref:`User Guide `.", + "docstring": "Rand index adjusted for chance.\n\n The Rand Index computes a similarity measure between two clusterings\n by considering all pairs of samples and counting pairs that are\n assigned in the same or different clusters in the predicted and\n true clusterings.\n\n The raw RI score is then \"adjusted for chance\" into the ARI score\n using the following scheme::\n\n ARI = (RI - Expected_RI) / (max(RI) - Expected_RI)\n\n The adjusted Rand index is thus ensured to have a value close to\n 0.0 for random labeling independently of the number of clusters and\n samples and exactly 1.0 when the clusterings are identical (up to\n a permutation).\n\n ARI is a symmetric measure::\n\n adjusted_rand_score(a, b) == adjusted_rand_score(b, a)\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n labels_true : int array, shape = [n_samples]\n Ground truth class labels to be used as a reference\n\n labels_pred : array-like of shape (n_samples,)\n Cluster labels to evaluate\n\n Returns\n -------\n ARI : float\n Similarity score between -1.0 and 1.0. Random labelings have an ARI\n close to 0.0. 
1.0 stands for perfect match.\n\n Examples\n --------\n Perfectly matching labelings have a score of 1 even\n\n >>> from sklearn.metrics.cluster import adjusted_rand_score\n >>> adjusted_rand_score([0, 0, 1, 1], [0, 0, 1, 1])\n 1.0\n >>> adjusted_rand_score([0, 0, 1, 1], [1, 1, 0, 0])\n 1.0\n\n Labelings that assign all classes members to the same clusters\n are complete but may not always be pure, hence penalized::\n\n >>> adjusted_rand_score([0, 0, 1, 2], [0, 0, 1, 1])\n 0.57...\n\n ARI is symmetric, so labelings that have pure clusters with members\n coming from the same classes but unnecessary splits are penalized::\n\n >>> adjusted_rand_score([0, 0, 1, 1], [0, 0, 1, 2])\n 0.57...\n\n If classes members are completely split across different clusters, the\n assignment is totally incomplete, hence the ARI is very low::\n\n >>> adjusted_rand_score([0, 0, 0, 0], [0, 1, 2, 3])\n 0.0\n\n References\n ----------\n .. [Hubert1985] L. Hubert and P. Arabie, Comparing Partitions,\n Journal of Classification 1985\n https://link.springer.com/article/10.1007%2FBF01908075\n\n .. [Steinley2004] D. Steinley, Properties of the Hubert-Arabie\n adjusted Rand index, Psychological Methods 2004\n\n .. [wk] https://en.wikipedia.org/wiki/Rand_index#Adjusted_Rand_index\n\n See Also\n --------\n adjusted_mutual_info_score : Adjusted Mutual Information.\n ", "source_code": "\ndef adjusted_rand_score(labels_true, labels_pred):\n \"\"\"Rand index adjusted for chance.\n\n The Rand Index computes a similarity measure between two clusterings\n by considering all pairs of samples and counting pairs that are\n assigned in the same or different clusters in the predicted and\n true clusterings.\n\n The raw RI score is then \"adjusted for chance\" into the ARI score\n using the following scheme::\n\n ARI = (RI - Expected_RI) / (max(RI) - Expected_RI)\n\n The adjusted Rand index is thus ensured to have a value close to\n 0.0 for random labeling independently of the number of clusters and\n samples and exactly 1.0 when the clusterings are identical (up to\n a permutation).\n\n ARI is a symmetric measure::\n\n adjusted_rand_score(a, b) == adjusted_rand_score(b, a)\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n labels_true : int array, shape = [n_samples]\n Ground truth class labels to be used as a reference\n\n labels_pred : array-like of shape (n_samples,)\n Cluster labels to evaluate\n\n Returns\n -------\n ARI : float\n Similarity score between -1.0 and 1.0. Random labelings have an ARI\n close to 0.0. 1.0 stands for perfect match.\n\n Examples\n --------\n Perfectly matching labelings have a score of 1 even\n\n >>> from sklearn.metrics.cluster import adjusted_rand_score\n >>> adjusted_rand_score([0, 0, 1, 1], [0, 0, 1, 1])\n 1.0\n >>> adjusted_rand_score([0, 0, 1, 1], [1, 1, 0, 0])\n 1.0\n\n Labelings that assign all classes members to the same clusters\n are complete but may not always be pure, hence penalized::\n\n >>> adjusted_rand_score([0, 0, 1, 2], [0, 0, 1, 1])\n 0.57...\n\n ARI is symmetric, so labelings that have pure clusters with members\n coming from the same classes but unnecessary splits are penalized::\n\n >>> adjusted_rand_score([0, 0, 1, 1], [0, 0, 1, 2])\n 0.57...\n\n If classes members are completely split across different clusters, the\n assignment is totally incomplete, hence the ARI is very low::\n\n >>> adjusted_rand_score([0, 0, 0, 0], [0, 1, 2, 3])\n 0.0\n\n References\n ----------\n .. [Hubert1985] L. Hubert and P. 
Arabie, Comparing Partitions,\n Journal of Classification 1985\n https://link.springer.com/article/10.1007%2FBF01908075\n\n .. [Steinley2004] D. Steinley, Properties of the Hubert-Arabie\n adjusted Rand index, Psychological Methods 2004\n\n .. [wk] https://en.wikipedia.org/wiki/Rand_index#Adjusted_Rand_index\n\n See Also\n --------\n adjusted_mutual_info_score : Adjusted Mutual Information.\n \"\"\"\n ((tn, fp), (fn, tp)) = pair_confusion_matrix(labels_true, labels_pred)\n (tn, fp, fn, tp) = (int(tn), int(fp), int(fn), int(tp))\n if fn == 0 and fp == 0:\n return 1.0\n return 2.0 * (tp * tn - fn * fp) / ((tp + fn) * (fn + tn) + (tp + fp) * (fp + tn))" }, { @@ -123062,7 +132409,8 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "The true labels." - } + }, + "refined_type": {} }, { "name": "labels_pred", @@ -123072,13 +132420,14 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "The predicted labels." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Check that the labels arrays are 1D and of same dimension.", - "docstring": "Check that the labels arrays are 1D and of same dimension.\n\nParameters\n----------\nlabels_true : array-like of shape (n_samples,)\n The true labels.\n\nlabels_pred : array-like of shape (n_samples,)\n The predicted labels.", + "docstring": "Check that the labels arrays are 1D and of same dimension.\n\n Parameters\n ----------\n labels_true : array-like of shape (n_samples,)\n The true labels.\n\n labels_pred : array-like of shape (n_samples,)\n The predicted labels.\n ", "source_code": "\ndef check_clusterings(labels_true, labels_pred):\n \"\"\"Check that the labels arrays are 1D and of same dimension.\n\n Parameters\n ----------\n labels_true : array-like of shape (n_samples,)\n The true labels.\n\n labels_pred : array-like of shape (n_samples,)\n The predicted labels.\n \"\"\"\n labels_true = check_array(labels_true, ensure_2d=False, ensure_min_samples=0, dtype=None)\n labels_pred = check_array(labels_pred, ensure_2d=False, ensure_min_samples=0, dtype=None)\n type_label = type_of_target(labels_true)\n type_pred = type_of_target(labels_pred)\n if 'continuous' in (type_pred, type_label):\n msg = f'Clustering metrics expects discrete values but received {type_label} values for label, and {type_pred} values for target'\n warnings.warn(msg, UserWarning)\n if labels_true.ndim != 1:\n raise ValueError('labels_true must be 1D: shape is %r' % (labels_true.shape, ))\n if labels_pred.ndim != 1:\n raise ValueError('labels_pred must be 1D: shape is %r' % (labels_pred.shape, ))\n check_consistent_length(labels_true, labels_pred)\n return labels_true, labels_pred" }, { @@ -123096,7 +132445,8 @@ "docstring": { "type": "int array, shape = [n_samples]", "description": "ground truth class labels to be used as a reference" - } + }, + "refined_type": {} }, { "name": "labels_pred", @@ -123106,13 +132456,14 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "cluster labels to evaluate" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Completeness metric of a cluster labeling given a ground truth.\n\nA clustering result satisfies completeness if all the data points that are members of a given class are elements of the same cluster. This metric is independent of the absolute values of the labels: a permutation of the class or cluster label values won't change the score value in any way. 
This metric is not symmetric: switching ``label_true`` with ``label_pred`` will return the :func:`homogeneity_score` which will be different in general. Read more in the :ref:`User Guide `.", - "docstring": "Completeness metric of a cluster labeling given a ground truth.\n\nA clustering result satisfies completeness if all the data points\nthat are members of a given class are elements of the same cluster.\n\nThis metric is independent of the absolute values of the labels:\na permutation of the class or cluster label values won't change the\nscore value in any way.\n\nThis metric is not symmetric: switching ``label_true`` with ``label_pred``\nwill return the :func:`homogeneity_score` which will be different in\ngeneral.\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\nlabels_true : int array, shape = [n_samples]\n ground truth class labels to be used as a reference\n\nlabels_pred : array-like of shape (n_samples,)\n cluster labels to evaluate\n\nReturns\n-------\ncompleteness : float\n score between 0.0 and 1.0. 1.0 stands for perfectly complete labeling\n\nReferences\n----------\n\n.. [1] `Andrew Rosenberg and Julia Hirschberg, 2007. V-Measure: A\n conditional entropy-based external cluster evaluation measure\n `_\n\nSee Also\n--------\nhomogeneity_score\nv_measure_score\n\nExamples\n--------\n\nPerfect labelings are complete::\n\n >>> from sklearn.metrics.cluster import completeness_score\n >>> completeness_score([0, 0, 1, 1], [1, 1, 0, 0])\n 1.0\n\nNon-perfect labelings that assign all classes members to the same clusters\nare still complete::\n\n >>> print(completeness_score([0, 0, 1, 1], [0, 0, 0, 0]))\n 1.0\n >>> print(completeness_score([0, 1, 2, 3], [0, 0, 1, 1]))\n 0.999...\n\nIf classes members are split across different clusters, the\nassignment cannot be complete::\n\n >>> print(completeness_score([0, 0, 1, 1], [0, 1, 0, 1]))\n 0.0\n >>> print(completeness_score([0, 0, 0, 0], [0, 1, 2, 3]))\n 0.0", + "description": "Completeness metric of a cluster labeling given a ground truth.\n\nA clustering result satisfies completeness if all the data points\nthat are members of a given class are elements of the same cluster.\n\nThis metric is independent of the absolute values of the labels:\na permutation of the class or cluster label values won't change the\nscore value in any way.\n\nThis metric is not symmetric: switching ``label_true`` with ``label_pred``\nwill return the :func:`homogeneity_score` which will be different in\ngeneral.\n\nRead more in the :ref:`User Guide `.", + "docstring": "Completeness metric of a cluster labeling given a ground truth.\n\n A clustering result satisfies completeness if all the data points\n that are members of a given class are elements of the same cluster.\n\n This metric is independent of the absolute values of the labels:\n a permutation of the class or cluster label values won't change the\n score value in any way.\n\n This metric is not symmetric: switching ``label_true`` with ``label_pred``\n will return the :func:`homogeneity_score` which will be different in\n general.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n labels_true : int array, shape = [n_samples]\n ground truth class labels to be used as a reference\n\n labels_pred : array-like of shape (n_samples,)\n cluster labels to evaluate\n\n Returns\n -------\n completeness : float\n score between 0.0 and 1.0. 1.0 stands for perfectly complete labeling\n\n References\n ----------\n\n .. [1] `Andrew Rosenberg and Julia Hirschberg, 2007. 
V-Measure: A\n conditional entropy-based external cluster evaluation measure\n `_\n\n See Also\n --------\n homogeneity_score\n v_measure_score\n\n Examples\n --------\n\n Perfect labelings are complete::\n\n >>> from sklearn.metrics.cluster import completeness_score\n >>> completeness_score([0, 0, 1, 1], [1, 1, 0, 0])\n 1.0\n\n Non-perfect labelings that assign all classes members to the same clusters\n are still complete::\n\n >>> print(completeness_score([0, 0, 1, 1], [0, 0, 0, 0]))\n 1.0\n >>> print(completeness_score([0, 1, 2, 3], [0, 0, 1, 1]))\n 0.999...\n\n If classes members are split across different clusters, the\n assignment cannot be complete::\n\n >>> print(completeness_score([0, 0, 1, 1], [0, 1, 0, 1]))\n 0.0\n >>> print(completeness_score([0, 0, 0, 0], [0, 1, 2, 3]))\n 0.0\n ", "source_code": "\ndef completeness_score(labels_true, labels_pred):\n \"\"\"Completeness metric of a cluster labeling given a ground truth.\n\n A clustering result satisfies completeness if all the data points\n that are members of a given class are elements of the same cluster.\n\n This metric is independent of the absolute values of the labels:\n a permutation of the class or cluster label values won't change the\n score value in any way.\n\n This metric is not symmetric: switching ``label_true`` with ``label_pred``\n will return the :func:`homogeneity_score` which will be different in\n general.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n labels_true : int array, shape = [n_samples]\n ground truth class labels to be used as a reference\n\n labels_pred : array-like of shape (n_samples,)\n cluster labels to evaluate\n\n Returns\n -------\n completeness : float\n score between 0.0 and 1.0. 1.0 stands for perfectly complete labeling\n\n References\n ----------\n\n .. [1] `Andrew Rosenberg and Julia Hirschberg, 2007. V-Measure: A\n conditional entropy-based external cluster evaluation measure\n `_\n\n See Also\n --------\n homogeneity_score\n v_measure_score\n\n Examples\n --------\n\n Perfect labelings are complete::\n\n >>> from sklearn.metrics.cluster import completeness_score\n >>> completeness_score([0, 0, 1, 1], [1, 1, 0, 0])\n 1.0\n\n Non-perfect labelings that assign all classes members to the same clusters\n are still complete::\n\n >>> print(completeness_score([0, 0, 1, 1], [0, 0, 0, 0]))\n 1.0\n >>> print(completeness_score([0, 1, 2, 3], [0, 0, 1, 1]))\n 0.999...\n\n If classes members are split across different clusters, the\n assignment cannot be complete::\n\n >>> print(completeness_score([0, 0, 1, 1], [0, 1, 0, 1]))\n 0.0\n >>> print(completeness_score([0, 0, 0, 0], [0, 1, 2, 3]))\n 0.0\n \"\"\"\n return homogeneity_completeness_v_measure(labels_true, labels_pred)[1]" }, { @@ -123130,7 +132481,8 @@ "docstring": { "type": "int array, shape = [n_samples]", "description": "Ground truth class labels to be used as a reference." - } + }, + "refined_type": {} }, { "name": "labels_pred", @@ -123140,7 +132492,8 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "Cluster labels to evaluate." - } + }, + "refined_type": {} }, { "name": "eps", @@ -123150,7 +132503,8 @@ "docstring": { "type": "float, default=None", "description": "If a float, that value is added to all values in the contingency\nmatrix. This helps to stop NaN propagation.\nIf ``None``, nothing is adjusted." 
- } + }, + "refined_type": {} }, { "name": "sparse", @@ -123160,7 +132514,8 @@ "docstring": { "type": "bool, default=False", "description": "If `True`, return a sparse CSR continency matrix. If `eps` is not\n`None` and `sparse` is `True` will raise ValueError.\n\n.. versionadded:: 0.18" - } + }, + "refined_type": {} }, { "name": "dtype", @@ -123170,13 +132525,14 @@ "docstring": { "type": "numeric type, default=np.int64", "description": "Output dtype. Ignored if `eps` is not `None`.\n\n.. versionadded:: 0.24" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Build a contingency matrix describing the relationship between labels.", - "docstring": "Build a contingency matrix describing the relationship between labels.\n\nParameters\n----------\nlabels_true : int array, shape = [n_samples]\n Ground truth class labels to be used as a reference.\n\nlabels_pred : array-like of shape (n_samples,)\n Cluster labels to evaluate.\n\neps : float, default=None\n If a float, that value is added to all values in the contingency\n matrix. This helps to stop NaN propagation.\n If ``None``, nothing is adjusted.\n\nsparse : bool, default=False\n If `True`, return a sparse CSR continency matrix. If `eps` is not\n `None` and `sparse` is `True` will raise ValueError.\n\n .. versionadded:: 0.18\n\ndtype : numeric type, default=np.int64\n Output dtype. Ignored if `eps` is not `None`.\n\n .. versionadded:: 0.24\n\nReturns\n-------\ncontingency : {array-like, sparse}, shape=[n_classes_true, n_classes_pred]\n Matrix :math:`C` such that :math:`C_{i, j}` is the number of samples in\n true class :math:`i` and in predicted class :math:`j`. If\n ``eps is None``, the dtype of this array will be integer unless set\n otherwise with the ``dtype`` argument. If ``eps`` is given, the dtype\n will be float.\n Will be a ``sklearn.sparse.csr_matrix`` if ``sparse=True``.", + "docstring": "Build a contingency matrix describing the relationship between labels.\n\n Parameters\n ----------\n labels_true : int array, shape = [n_samples]\n Ground truth class labels to be used as a reference.\n\n labels_pred : array-like of shape (n_samples,)\n Cluster labels to evaluate.\n\n eps : float, default=None\n If a float, that value is added to all values in the contingency\n matrix. This helps to stop NaN propagation.\n If ``None``, nothing is adjusted.\n\n sparse : bool, default=False\n If `True`, return a sparse CSR continency matrix. If `eps` is not\n `None` and `sparse` is `True` will raise ValueError.\n\n .. versionadded:: 0.18\n\n dtype : numeric type, default=np.int64\n Output dtype. Ignored if `eps` is not `None`.\n\n .. versionadded:: 0.24\n\n Returns\n -------\n contingency : {array-like, sparse}, shape=[n_classes_true, n_classes_pred]\n Matrix :math:`C` such that :math:`C_{i, j}` is the number of samples in\n true class :math:`i` and in predicted class :math:`j`. If\n ``eps is None``, the dtype of this array will be integer unless set\n otherwise with the ``dtype`` argument. 
If ``eps`` is given, the dtype\n will be float.\n Will be a ``sklearn.sparse.csr_matrix`` if ``sparse=True``.\n ", "source_code": "\ndef contingency_matrix(labels_true, labels_pred, *, eps=None, sparse=False, dtype=np.int64):\n \"\"\"Build a contingency matrix describing the relationship between labels.\n\n Parameters\n ----------\n labels_true : int array, shape = [n_samples]\n Ground truth class labels to be used as a reference.\n\n labels_pred : array-like of shape (n_samples,)\n Cluster labels to evaluate.\n\n eps : float, default=None\n If a float, that value is added to all values in the contingency\n matrix. This helps to stop NaN propagation.\n If ``None``, nothing is adjusted.\n\n sparse : bool, default=False\n If `True`, return a sparse CSR continency matrix. If `eps` is not\n `None` and `sparse` is `True` will raise ValueError.\n\n .. versionadded:: 0.18\n\n dtype : numeric type, default=np.int64\n Output dtype. Ignored if `eps` is not `None`.\n\n .. versionadded:: 0.24\n\n Returns\n -------\n contingency : {array-like, sparse}, shape=[n_classes_true, n_classes_pred]\n Matrix :math:`C` such that :math:`C_{i, j}` is the number of samples in\n true class :math:`i` and in predicted class :math:`j`. If\n ``eps is None``, the dtype of this array will be integer unless set\n otherwise with the ``dtype`` argument. If ``eps`` is given, the dtype\n will be float.\n Will be a ``sklearn.sparse.csr_matrix`` if ``sparse=True``.\n \"\"\"\n if eps is not None and sparse:\n raise ValueError(\"Cannot set 'eps' when sparse=True\")\n (classes, class_idx) = np.unique(labels_true, return_inverse=True)\n (clusters, cluster_idx) = np.unique(labels_pred, return_inverse=True)\n n_classes = classes.shape[0]\n n_clusters = clusters.shape[0]\n contingency = sp.coo_matrix((np.ones(class_idx.shape[0]), (class_idx, cluster_idx)), shape=(n_classes, n_clusters), dtype=dtype)\n if sparse:\n contingency = contingency.tocsr()\n contingency.sum_duplicates()\n else:\n contingency = contingency.toarray()\n if eps is not None:\n contingency = contingency + eps\n return contingency" }, { @@ -123194,13 +132550,14 @@ "docstring": { "type": "int array, shape = [n_samples]", "description": "The labels" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Calculates the entropy for a labeling.", - "docstring": "Calculates the entropy for a labeling.\n\nParameters\n----------\nlabels : int array, shape = [n_samples]\n The labels\n\nNotes\n-----\nThe logarithm used is the natural logarithm (base-e).", + "docstring": "Calculates the entropy for a labeling.\n\n Parameters\n ----------\n labels : int array, shape = [n_samples]\n The labels\n\n Notes\n -----\n The logarithm used is the natural logarithm (base-e).\n ", "source_code": "\ndef entropy(labels):\n \"\"\"Calculates the entropy for a labeling.\n\n Parameters\n ----------\n labels : int array, shape = [n_samples]\n The labels\n\n Notes\n -----\n The logarithm used is the natural logarithm (base-e).\n \"\"\"\n if len(labels) == 0:\n return 1.0\n label_idx = np.unique(labels, return_inverse=True)[1]\n pi = np.bincount(label_idx).astype(np.float64)\n pi = pi[pi > 0]\n pi_sum = np.sum(pi)\n return -np.sum(pi / pi_sum * (np.log(pi) - log(pi_sum)))" }, { @@ -123218,7 +132575,8 @@ "docstring": { "type": "int array, shape = (``n_samples``,)", "description": "A clustering of the data into disjoint subsets." 
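A small illustration of the contingency_matrix output documented above; rows index the true classes and columns the predicted clusters. The labels are illustrative.

    from sklearn.metrics.cluster import contingency_matrix

    labels_true = [0, 0, 1, 1, 2, 2]
    labels_pred = [0, 0, 1, 1, 1, 2]

    # Entry (i, j) counts samples with true class i placed in predicted cluster j.
    print(contingency_matrix(labels_true, labels_pred))
    # [[2 0 0]
    #  [0 2 0]
    #  [0 1 1]]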
- } + }, + "refined_type": {} }, { "name": "labels_pred", @@ -123228,7 +132586,8 @@ "docstring": { "type": "array, shape = (``n_samples``, )", "description": "A clustering of the data into disjoint subsets." - } + }, + "refined_type": {} }, { "name": "sparse", @@ -123238,13 +132597,14 @@ "docstring": { "type": "bool, default=False", "description": "Compute contingency matrix internally with sparse matrix." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Measure the similarity of two clusterings of a set of points.\n\n.. versionadded:: 0.18 The Fowlkes-Mallows index (FMI) is defined as the geometric mean between of the precision and recall:: FMI = TP / sqrt((TP + FP) * (TP + FN)) Where ``TP`` is the number of **True Positive** (i.e. the number of pair of points that belongs in the same clusters in both ``labels_true`` and ``labels_pred``), ``FP`` is the number of **False Positive** (i.e. the number of pair of points that belongs in the same clusters in ``labels_true`` and not in ``labels_pred``) and ``FN`` is the number of **False Negative** (i.e the number of pair of points that belongs in the same clusters in ``labels_pred`` and not in ``labels_True``). The score ranges from 0 to 1. A high value indicates a good similarity between two clusters. Read more in the :ref:`User Guide `.", - "docstring": "Measure the similarity of two clusterings of a set of points.\n\n.. versionadded:: 0.18\n\nThe Fowlkes-Mallows index (FMI) is defined as the geometric mean between of\nthe precision and recall::\n\n FMI = TP / sqrt((TP + FP) * (TP + FN))\n\nWhere ``TP`` is the number of **True Positive** (i.e. the number of pair of\npoints that belongs in the same clusters in both ``labels_true`` and\n``labels_pred``), ``FP`` is the number of **False Positive** (i.e. the\nnumber of pair of points that belongs in the same clusters in\n``labels_true`` and not in ``labels_pred``) and ``FN`` is the number of\n**False Negative** (i.e the number of pair of points that belongs in the\nsame clusters in ``labels_pred`` and not in ``labels_True``).\n\nThe score ranges from 0 to 1. A high value indicates a good similarity\nbetween two clusters.\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\nlabels_true : int array, shape = (``n_samples``,)\n A clustering of the data into disjoint subsets.\n\nlabels_pred : array, shape = (``n_samples``, )\n A clustering of the data into disjoint subsets.\n\nsparse : bool, default=False\n Compute contingency matrix internally with sparse matrix.\n\nReturns\n-------\nscore : float\n The resulting Fowlkes-Mallows score.\n\nExamples\n--------\n\nPerfect labelings are both homogeneous and complete, hence have\nscore 1.0::\n\n >>> from sklearn.metrics.cluster import fowlkes_mallows_score\n >>> fowlkes_mallows_score([0, 0, 1, 1], [0, 0, 1, 1])\n 1.0\n >>> fowlkes_mallows_score([0, 0, 1, 1], [1, 1, 0, 0])\n 1.0\n\nIf classes members are completely split across different clusters,\nthe assignment is totally random, hence the FMI is null::\n\n >>> fowlkes_mallows_score([0, 0, 0, 0], [0, 1, 2, 3])\n 0.0\n\nReferences\n----------\n.. [1] `E. B. Fowkles and C. L. Mallows, 1983. \"A method for comparing two\n hierarchical clusterings\". Journal of the American Statistical\n Association\n `_\n\n.. [2] `Wikipedia entry for the Fowlkes-Mallows Index\n `_", + "description": "Measure the similarity of two clusterings of a set of points.\n\n.. 
versionadded:: 0.18\n\nThe Fowlkes-Mallows index (FMI) is defined as the geometric mean between of\nthe precision and recall::\n\n FMI = TP / sqrt((TP + FP) * (TP + FN))\n\nWhere ``TP`` is the number of **True Positive** (i.e. the number of pair of\npoints that belongs in the same clusters in both ``labels_true`` and\n``labels_pred``), ``FP`` is the number of **False Positive** (i.e. the\nnumber of pair of points that belongs in the same clusters in\n``labels_true`` and not in ``labels_pred``) and ``FN`` is the number of\n**False Negative** (i.e the number of pair of points that belongs in the\nsame clusters in ``labels_pred`` and not in ``labels_True``).\n\nThe score ranges from 0 to 1. A high value indicates a good similarity\nbetween two clusters.\n\nRead more in the :ref:`User Guide `.", + "docstring": "Measure the similarity of two clusterings of a set of points.\n\n .. versionadded:: 0.18\n\n The Fowlkes-Mallows index (FMI) is defined as the geometric mean between of\n the precision and recall::\n\n FMI = TP / sqrt((TP + FP) * (TP + FN))\n\n Where ``TP`` is the number of **True Positive** (i.e. the number of pair of\n points that belongs in the same clusters in both ``labels_true`` and\n ``labels_pred``), ``FP`` is the number of **False Positive** (i.e. the\n number of pair of points that belongs in the same clusters in\n ``labels_true`` and not in ``labels_pred``) and ``FN`` is the number of\n **False Negative** (i.e the number of pair of points that belongs in the\n same clusters in ``labels_pred`` and not in ``labels_True``).\n\n The score ranges from 0 to 1. A high value indicates a good similarity\n between two clusters.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n labels_true : int array, shape = (``n_samples``,)\n A clustering of the data into disjoint subsets.\n\n labels_pred : array, shape = (``n_samples``, )\n A clustering of the data into disjoint subsets.\n\n sparse : bool, default=False\n Compute contingency matrix internally with sparse matrix.\n\n Returns\n -------\n score : float\n The resulting Fowlkes-Mallows score.\n\n Examples\n --------\n\n Perfect labelings are both homogeneous and complete, hence have\n score 1.0::\n\n >>> from sklearn.metrics.cluster import fowlkes_mallows_score\n >>> fowlkes_mallows_score([0, 0, 1, 1], [0, 0, 1, 1])\n 1.0\n >>> fowlkes_mallows_score([0, 0, 1, 1], [1, 1, 0, 0])\n 1.0\n\n If classes members are completely split across different clusters,\n the assignment is totally random, hence the FMI is null::\n\n >>> fowlkes_mallows_score([0, 0, 0, 0], [0, 1, 2, 3])\n 0.0\n\n References\n ----------\n .. [1] `E. B. Fowkles and C. L. Mallows, 1983. \"A method for comparing two\n hierarchical clusterings\". Journal of the American Statistical\n Association\n `_\n\n .. [2] `Wikipedia entry for the Fowlkes-Mallows Index\n `_\n ", "source_code": "\ndef fowlkes_mallows_score(labels_true, labels_pred, *, sparse=False):\n \"\"\"Measure the similarity of two clusterings of a set of points.\n\n .. versionadded:: 0.18\n\n The Fowlkes-Mallows index (FMI) is defined as the geometric mean between of\n the precision and recall::\n\n FMI = TP / sqrt((TP + FP) * (TP + FN))\n\n Where ``TP`` is the number of **True Positive** (i.e. the number of pair of\n points that belongs in the same clusters in both ``labels_true`` and\n ``labels_pred``), ``FP`` is the number of **False Positive** (i.e. 
the\n number of pair of points that belongs in the same clusters in\n ``labels_true`` and not in ``labels_pred``) and ``FN`` is the number of\n **False Negative** (i.e the number of pair of points that belongs in the\n same clusters in ``labels_pred`` and not in ``labels_True``).\n\n The score ranges from 0 to 1. A high value indicates a good similarity\n between two clusters.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n labels_true : int array, shape = (``n_samples``,)\n A clustering of the data into disjoint subsets.\n\n labels_pred : array, shape = (``n_samples``, )\n A clustering of the data into disjoint subsets.\n\n sparse : bool, default=False\n Compute contingency matrix internally with sparse matrix.\n\n Returns\n -------\n score : float\n The resulting Fowlkes-Mallows score.\n\n Examples\n --------\n\n Perfect labelings are both homogeneous and complete, hence have\n score 1.0::\n\n >>> from sklearn.metrics.cluster import fowlkes_mallows_score\n >>> fowlkes_mallows_score([0, 0, 1, 1], [0, 0, 1, 1])\n 1.0\n >>> fowlkes_mallows_score([0, 0, 1, 1], [1, 1, 0, 0])\n 1.0\n\n If classes members are completely split across different clusters,\n the assignment is totally random, hence the FMI is null::\n\n >>> fowlkes_mallows_score([0, 0, 0, 0], [0, 1, 2, 3])\n 0.0\n\n References\n ----------\n .. [1] `E. B. Fowkles and C. L. Mallows, 1983. \"A method for comparing two\n hierarchical clusterings\". Journal of the American Statistical\n Association\n `_\n\n .. [2] `Wikipedia entry for the Fowlkes-Mallows Index\n `_\n \"\"\"\n (labels_true, labels_pred) = check_clusterings(labels_true, labels_pred)\n (n_samples, ) = labels_true.shape\n c = contingency_matrix(labels_true, labels_pred, sparse=True)\n c = c.astype(np.int64, **_astype_copy_false(c))\n tk = np.dot(c.data, c.data) - n_samples\n pk = np.sum(np.asarray(c.sum(axis=0)).ravel()**2) - n_samples\n qk = np.sum(np.asarray(c.sum(axis=1)).ravel()**2) - n_samples\n return np.sqrt(tk / pk) * np.sqrt(tk / qk) if tk != 0.0 else 0.0" }, { @@ -123262,7 +132622,8 @@ "docstring": { "type": "int array, shape = [n_samples]", "description": "ground truth class labels to be used as a reference" - } + }, + "refined_type": {} }, { "name": "labels_pred", @@ -123272,7 +132633,8 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "cluster labels to evaluate" - } + }, + "refined_type": {} }, { "name": "beta", @@ -123282,13 +132644,14 @@ "docstring": { "type": "float, default=1.0", "description": "Ratio of weight attributed to ``homogeneity`` vs ``completeness``.\nIf ``beta`` is greater than 1, ``completeness`` is weighted more\nstrongly in the calculation. If ``beta`` is less than 1,\n``homogeneity`` is weighted more strongly." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Compute the homogeneity and completeness and V-Measure scores at once.\n\nThose metrics are based on normalized conditional entropy measures of the clustering labeling to evaluate given the knowledge of a Ground Truth class labels of the same samples. A clustering result satisfies homogeneity if all of its clusters contain only data points which are members of a single class. A clustering result satisfies completeness if all the data points that are members of a given class are elements of the same cluster. Both scores have positive values between 0.0 and 1.0, larger values being desirable. 
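A cross-check of the FMI formula quoted above, recovering the TP/FP/FN pair counts from pair_confusion_matrix (documented further below); the shared factor of two in its ordered-pair counts cancels in the ratio. The labels are illustrative.

    import numpy as np
    from sklearn.metrics.cluster import fowlkes_mallows_score, pair_confusion_matrix

    labels_true = [0, 0, 1, 2]
    labels_pred = [0, 0, 1, 1]

    C = pair_confusion_matrix(labels_true, labels_pred)
    tp, fp, fn = C[1, 1], C[0, 1], C[1, 0]            # ordered-pair counts
    fmi_manual = tp / np.sqrt((tp + fp) * (tp + fn))  # TP / sqrt((TP + FP) * (TP + FN))

    print(np.isclose(fmi_manual, fowlkes_mallows_score(labels_true, labels_pred)))  # True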
Those 3 metrics are independent of the absolute values of the labels: a permutation of the class or cluster label values won't change the score values in any way. V-Measure is furthermore symmetric: swapping ``labels_true`` and ``label_pred`` will give the same score. This does not hold for homogeneity and completeness. V-Measure is identical to :func:`normalized_mutual_info_score` with the arithmetic averaging method. Read more in the :ref:`User Guide `.", - "docstring": "Compute the homogeneity and completeness and V-Measure scores at once.\n\nThose metrics are based on normalized conditional entropy measures of\nthe clustering labeling to evaluate given the knowledge of a Ground\nTruth class labels of the same samples.\n\nA clustering result satisfies homogeneity if all of its clusters\ncontain only data points which are members of a single class.\n\nA clustering result satisfies completeness if all the data points\nthat are members of a given class are elements of the same cluster.\n\nBoth scores have positive values between 0.0 and 1.0, larger values\nbeing desirable.\n\nThose 3 metrics are independent of the absolute values of the labels:\na permutation of the class or cluster label values won't change the\nscore values in any way.\n\nV-Measure is furthermore symmetric: swapping ``labels_true`` and\n``label_pred`` will give the same score. This does not hold for\nhomogeneity and completeness. V-Measure is identical to\n:func:`normalized_mutual_info_score` with the arithmetic averaging\nmethod.\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\nlabels_true : int array, shape = [n_samples]\n ground truth class labels to be used as a reference\n\nlabels_pred : array-like of shape (n_samples,)\n cluster labels to evaluate\n\nbeta : float, default=1.0\n Ratio of weight attributed to ``homogeneity`` vs ``completeness``.\n If ``beta`` is greater than 1, ``completeness`` is weighted more\n strongly in the calculation. If ``beta`` is less than 1,\n ``homogeneity`` is weighted more strongly.\n\nReturns\n-------\nhomogeneity : float\n score between 0.0 and 1.0. 1.0 stands for perfectly homogeneous labeling\n\ncompleteness : float\n score between 0.0 and 1.0. 1.0 stands for perfectly complete labeling\n\nv_measure : float\n harmonic mean of the first two\n\nSee Also\n--------\nhomogeneity_score\ncompleteness_score\nv_measure_score", + "description": "Compute the homogeneity and completeness and V-Measure scores at once.\n\nThose metrics are based on normalized conditional entropy measures of\nthe clustering labeling to evaluate given the knowledge of a Ground\nTruth class labels of the same samples.\n\nA clustering result satisfies homogeneity if all of its clusters\ncontain only data points which are members of a single class.\n\nA clustering result satisfies completeness if all the data points\nthat are members of a given class are elements of the same cluster.\n\nBoth scores have positive values between 0.0 and 1.0, larger values\nbeing desirable.\n\nThose 3 metrics are independent of the absolute values of the labels:\na permutation of the class or cluster label values won't change the\nscore values in any way.\n\nV-Measure is furthermore symmetric: swapping ``labels_true`` and\n``label_pred`` will give the same score. This does not hold for\nhomogeneity and completeness. 
V-Measure is identical to\n:func:`normalized_mutual_info_score` with the arithmetic averaging\nmethod.\n\nRead more in the :ref:`User Guide `.", + "docstring": "Compute the homogeneity and completeness and V-Measure scores at once.\n\n Those metrics are based on normalized conditional entropy measures of\n the clustering labeling to evaluate given the knowledge of a Ground\n Truth class labels of the same samples.\n\n A clustering result satisfies homogeneity if all of its clusters\n contain only data points which are members of a single class.\n\n A clustering result satisfies completeness if all the data points\n that are members of a given class are elements of the same cluster.\n\n Both scores have positive values between 0.0 and 1.0, larger values\n being desirable.\n\n Those 3 metrics are independent of the absolute values of the labels:\n a permutation of the class or cluster label values won't change the\n score values in any way.\n\n V-Measure is furthermore symmetric: swapping ``labels_true`` and\n ``label_pred`` will give the same score. This does not hold for\n homogeneity and completeness. V-Measure is identical to\n :func:`normalized_mutual_info_score` with the arithmetic averaging\n method.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n labels_true : int array, shape = [n_samples]\n ground truth class labels to be used as a reference\n\n labels_pred : array-like of shape (n_samples,)\n cluster labels to evaluate\n\n beta : float, default=1.0\n Ratio of weight attributed to ``homogeneity`` vs ``completeness``.\n If ``beta`` is greater than 1, ``completeness`` is weighted more\n strongly in the calculation. If ``beta`` is less than 1,\n ``homogeneity`` is weighted more strongly.\n\n Returns\n -------\n homogeneity : float\n score between 0.0 and 1.0. 1.0 stands for perfectly homogeneous labeling\n\n completeness : float\n score between 0.0 and 1.0. 1.0 stands for perfectly complete labeling\n\n v_measure : float\n harmonic mean of the first two\n\n See Also\n --------\n homogeneity_score\n completeness_score\n v_measure_score\n ", "source_code": "\ndef homogeneity_completeness_v_measure(labels_true, labels_pred, *, beta=1.0):\n \"\"\"Compute the homogeneity and completeness and V-Measure scores at once.\n\n Those metrics are based on normalized conditional entropy measures of\n the clustering labeling to evaluate given the knowledge of a Ground\n Truth class labels of the same samples.\n\n A clustering result satisfies homogeneity if all of its clusters\n contain only data points which are members of a single class.\n\n A clustering result satisfies completeness if all the data points\n that are members of a given class are elements of the same cluster.\n\n Both scores have positive values between 0.0 and 1.0, larger values\n being desirable.\n\n Those 3 metrics are independent of the absolute values of the labels:\n a permutation of the class or cluster label values won't change the\n score values in any way.\n\n V-Measure is furthermore symmetric: swapping ``labels_true`` and\n ``label_pred`` will give the same score. This does not hold for\n homogeneity and completeness. 
V-Measure is identical to\n :func:`normalized_mutual_info_score` with the arithmetic averaging\n method.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n labels_true : int array, shape = [n_samples]\n ground truth class labels to be used as a reference\n\n labels_pred : array-like of shape (n_samples,)\n cluster labels to evaluate\n\n beta : float, default=1.0\n Ratio of weight attributed to ``homogeneity`` vs ``completeness``.\n If ``beta`` is greater than 1, ``completeness`` is weighted more\n strongly in the calculation. If ``beta`` is less than 1,\n ``homogeneity`` is weighted more strongly.\n\n Returns\n -------\n homogeneity : float\n score between 0.0 and 1.0. 1.0 stands for perfectly homogeneous labeling\n\n completeness : float\n score between 0.0 and 1.0. 1.0 stands for perfectly complete labeling\n\n v_measure : float\n harmonic mean of the first two\n\n See Also\n --------\n homogeneity_score\n completeness_score\n v_measure_score\n \"\"\"\n (labels_true, labels_pred) = check_clusterings(labels_true, labels_pred)\n if len(labels_true) == 0:\n return 1.0, 1.0, 1.0\n entropy_C = entropy(labels_true)\n entropy_K = entropy(labels_pred)\n contingency = contingency_matrix(labels_true, labels_pred, sparse=True)\n MI = mutual_info_score(None, None, contingency=contingency)\n homogeneity = MI / entropy_C if entropy_C else 1.0\n completeness = MI / entropy_K if entropy_K else 1.0\n if homogeneity + completeness == 0.0:\n v_measure_score = 0.0\n else:\n v_measure_score = (1 + beta) * homogeneity * completeness / (beta * homogeneity + completeness)\n return homogeneity, completeness, v_measure_score" }, { @@ -123306,7 +132669,8 @@ "docstring": { "type": "int array, shape = [n_samples]", "description": "ground truth class labels to be used as a reference" - } + }, + "refined_type": {} }, { "name": "labels_pred", @@ -123316,13 +132680,14 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "cluster labels to evaluate" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Homogeneity metric of a cluster labeling given a ground truth.\n\nA clustering result satisfies homogeneity if all of its clusters contain only data points which are members of a single class. This metric is independent of the absolute values of the labels: a permutation of the class or cluster label values won't change the score value in any way. This metric is not symmetric: switching ``label_true`` with ``label_pred`` will return the :func:`completeness_score` which will be different in general. Read more in the :ref:`User Guide `.", - "docstring": "Homogeneity metric of a cluster labeling given a ground truth.\n\nA clustering result satisfies homogeneity if all of its clusters\ncontain only data points which are members of a single class.\n\nThis metric is independent of the absolute values of the labels:\na permutation of the class or cluster label values won't change the\nscore value in any way.\n\nThis metric is not symmetric: switching ``label_true`` with ``label_pred``\nwill return the :func:`completeness_score` which will be different in\ngeneral.\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\nlabels_true : int array, shape = [n_samples]\n ground truth class labels to be used as a reference\n\nlabels_pred : array-like of shape (n_samples,)\n cluster labels to evaluate\n\nReturns\n-------\nhomogeneity : float\n score between 0.0 and 1.0. 1.0 stands for perfectly homogeneous labeling\n\nReferences\n----------\n\n.. 
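A short sketch of the "harmonic mean of the first two" relationship stated in the homogeneity_completeness_v_measure entry above, with the default beta=1.0; the labels are illustrative.

    from sklearn.metrics.cluster import homogeneity_completeness_v_measure

    labels_true = [0, 0, 1, 1, 2, 2]
    labels_pred = [0, 0, 1, 2, 2, 2]

    h, c, v = homogeneity_completeness_v_measure(labels_true, labels_pred)
    print(abs(v - 2 * h * c / (h + c)) < 1e-12)  # True: harmonic mean when beta=1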
[1] `Andrew Rosenberg and Julia Hirschberg, 2007. V-Measure: A\n conditional entropy-based external cluster evaluation measure\n `_\n\nSee Also\n--------\ncompleteness_score\nv_measure_score\n\nExamples\n--------\n\nPerfect labelings are homogeneous::\n\n >>> from sklearn.metrics.cluster import homogeneity_score\n >>> homogeneity_score([0, 0, 1, 1], [1, 1, 0, 0])\n 1.0\n\nNon-perfect labelings that further split classes into more clusters can be\nperfectly homogeneous::\n\n >>> print(\"%.6f\" % homogeneity_score([0, 0, 1, 1], [0, 0, 1, 2]))\n 1.000000\n >>> print(\"%.6f\" % homogeneity_score([0, 0, 1, 1], [0, 1, 2, 3]))\n 1.000000\n\nClusters that include samples from different classes do not make for an\nhomogeneous labeling::\n\n >>> print(\"%.6f\" % homogeneity_score([0, 0, 1, 1], [0, 1, 0, 1]))\n 0.0...\n >>> print(\"%.6f\" % homogeneity_score([0, 0, 1, 1], [0, 0, 0, 0]))\n 0.0...", + "description": "Homogeneity metric of a cluster labeling given a ground truth.\n\nA clustering result satisfies homogeneity if all of its clusters\ncontain only data points which are members of a single class.\n\nThis metric is independent of the absolute values of the labels:\na permutation of the class or cluster label values won't change the\nscore value in any way.\n\nThis metric is not symmetric: switching ``label_true`` with ``label_pred``\nwill return the :func:`completeness_score` which will be different in\ngeneral.\n\nRead more in the :ref:`User Guide `.", + "docstring": "Homogeneity metric of a cluster labeling given a ground truth.\n\n A clustering result satisfies homogeneity if all of its clusters\n contain only data points which are members of a single class.\n\n This metric is independent of the absolute values of the labels:\n a permutation of the class or cluster label values won't change the\n score value in any way.\n\n This metric is not symmetric: switching ``label_true`` with ``label_pred``\n will return the :func:`completeness_score` which will be different in\n general.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n labels_true : int array, shape = [n_samples]\n ground truth class labels to be used as a reference\n\n labels_pred : array-like of shape (n_samples,)\n cluster labels to evaluate\n\n Returns\n -------\n homogeneity : float\n score between 0.0 and 1.0. 1.0 stands for perfectly homogeneous labeling\n\n References\n ----------\n\n .. [1] `Andrew Rosenberg and Julia Hirschberg, 2007. 
V-Measure: A\n conditional entropy-based external cluster evaluation measure\n `_\n\n See Also\n --------\n completeness_score\n v_measure_score\n\n Examples\n --------\n\n Perfect labelings are homogeneous::\n\n >>> from sklearn.metrics.cluster import homogeneity_score\n >>> homogeneity_score([0, 0, 1, 1], [1, 1, 0, 0])\n 1.0\n\n Non-perfect labelings that further split classes into more clusters can be\n perfectly homogeneous::\n\n >>> print(\"%.6f\" % homogeneity_score([0, 0, 1, 1], [0, 0, 1, 2]))\n 1.000000\n >>> print(\"%.6f\" % homogeneity_score([0, 0, 1, 1], [0, 1, 2, 3]))\n 1.000000\n\n Clusters that include samples from different classes do not make for an\n homogeneous labeling::\n\n >>> print(\"%.6f\" % homogeneity_score([0, 0, 1, 1], [0, 1, 0, 1]))\n 0.0...\n >>> print(\"%.6f\" % homogeneity_score([0, 0, 1, 1], [0, 0, 0, 0]))\n 0.0...\n ", "source_code": "\ndef homogeneity_score(labels_true, labels_pred):\n \"\"\"Homogeneity metric of a cluster labeling given a ground truth.\n\n A clustering result satisfies homogeneity if all of its clusters\n contain only data points which are members of a single class.\n\n This metric is independent of the absolute values of the labels:\n a permutation of the class or cluster label values won't change the\n score value in any way.\n\n This metric is not symmetric: switching ``label_true`` with ``label_pred``\n will return the :func:`completeness_score` which will be different in\n general.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n labels_true : int array, shape = [n_samples]\n ground truth class labels to be used as a reference\n\n labels_pred : array-like of shape (n_samples,)\n cluster labels to evaluate\n\n Returns\n -------\n homogeneity : float\n score between 0.0 and 1.0. 1.0 stands for perfectly homogeneous labeling\n\n References\n ----------\n\n .. [1] `Andrew Rosenberg and Julia Hirschberg, 2007. V-Measure: A\n conditional entropy-based external cluster evaluation measure\n `_\n\n See Also\n --------\n completeness_score\n v_measure_score\n\n Examples\n --------\n\n Perfect labelings are homogeneous::\n\n >>> from sklearn.metrics.cluster import homogeneity_score\n >>> homogeneity_score([0, 0, 1, 1], [1, 1, 0, 0])\n 1.0\n\n Non-perfect labelings that further split classes into more clusters can be\n perfectly homogeneous::\n\n >>> print(\"%.6f\" % homogeneity_score([0, 0, 1, 1], [0, 0, 1, 2]))\n 1.000000\n >>> print(\"%.6f\" % homogeneity_score([0, 0, 1, 1], [0, 1, 2, 3]))\n 1.000000\n\n Clusters that include samples from different classes do not make for an\n homogeneous labeling::\n\n >>> print(\"%.6f\" % homogeneity_score([0, 0, 1, 1], [0, 1, 0, 1]))\n 0.0...\n >>> print(\"%.6f\" % homogeneity_score([0, 0, 1, 1], [0, 0, 0, 0]))\n 0.0...\n \"\"\"\n return homogeneity_completeness_v_measure(labels_true, labels_pred)[0]" }, { @@ -123340,7 +132705,8 @@ "docstring": { "type": "int array, shape = [n_samples]", "description": "A clustering of the data into disjoint subsets, called :math:`U` in\nthe above formula." - } + }, + "refined_type": {} }, { "name": "labels_pred", @@ -123350,7 +132716,8 @@ "docstring": { "type": "int array-like of shape (n_samples,)", "description": "A clustering of the data into disjoint subsets, called :math:`V` in\nthe above formula." 
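A sketch of the normalization used for homogeneity, matching the source shown in the homogeneity_completeness_v_measure entry (MI divided by the entropy of the true labels). mutual_info_score and the entropy helper are the functions documented in this file; the import path and the labels below are assumptions made for illustration.

    import numpy as np
    from sklearn.metrics.cluster import entropy, homogeneity_score, mutual_info_score

    labels_true = [0, 0, 1, 1]
    labels_pred = [0, 0, 1, 2]

    # homogeneity = MI(labels_true, labels_pred) / H(labels_true)
    h_manual = mutual_info_score(labels_true, labels_pred) / entropy(labels_true)
    print(np.isclose(h_manual, homogeneity_score(labels_true, labels_pred)))  # True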
- } + }, + "refined_type": {} }, { "name": "contingency", @@ -123360,13 +132727,17 @@ "docstring": { "type": "{ndarray, sparse matrix} of shape (n_classes_true, n_classes_pred), default=None", "description": "A contingency matrix given by the :func:`contingency_matrix` function.\nIf value is ``None``, it will be computed, otherwise the given value is\nused, with ``labels_true`` and ``labels_pred`` ignored." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } } ], "results": [], "is_public": true, - "description": "Mutual Information between two clusterings.\n\nThe Mutual Information is a measure of the similarity between two labels of the same data. Where :math:`|U_i|` is the number of the samples in cluster :math:`U_i` and :math:`|V_j|` is the number of the samples in cluster :math:`V_j`, the Mutual Information between clusterings :math:`U` and :math:`V` is given as: .. math:: MI(U,V)=\\sum_{i=1}^{|U|} \\sum_{j=1}^{|V|} \\frac{|U_i\\cap V_j|}{N} \\log\\frac{N|U_i \\cap V_j|}{|U_i||V_j|} This metric is independent of the absolute values of the labels: a permutation of the class or cluster label values won't change the score value in any way. This metric is furthermore symmetric: switching :math:`U` (i.e ``label_true``) with :math:`V` (i.e. ``label_pred``) will return the same score value. This can be useful to measure the agreement of two independent label assignments strategies on the same dataset when the real ground truth is not known. Read more in the :ref:`User Guide `.", - "docstring": "Mutual Information between two clusterings.\n\nThe Mutual Information is a measure of the similarity between two labels\nof the same data. Where :math:`|U_i|` is the number of the samples\nin cluster :math:`U_i` and :math:`|V_j|` is the number of the\nsamples in cluster :math:`V_j`, the Mutual Information\nbetween clusterings :math:`U` and :math:`V` is given as:\n\n.. math::\n\n MI(U,V)=\\sum_{i=1}^{|U|} \\sum_{j=1}^{|V|} \\frac{|U_i\\cap V_j|}{N}\n \\log\\frac{N|U_i \\cap V_j|}{|U_i||V_j|}\n\nThis metric is independent of the absolute values of the labels:\na permutation of the class or cluster label values won't change the\nscore value in any way.\n\nThis metric is furthermore symmetric: switching :math:`U` (i.e\n``label_true``) with :math:`V` (i.e. ``label_pred``) will return the\nsame score value. 
This can be useful to measure the agreement of two\nindependent label assignments strategies on the same dataset when the\nreal ground truth is not known.\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\nlabels_true : int array, shape = [n_samples]\n A clustering of the data into disjoint subsets, called :math:`U` in\n the above formula.\n\nlabels_pred : int array-like of shape (n_samples,)\n A clustering of the data into disjoint subsets, called :math:`V` in\n the above formula.\n\ncontingency : {ndarray, sparse matrix} of shape (n_classes_true, n_classes_pred), default=None\n A contingency matrix given by the :func:`contingency_matrix` function.\n If value is ``None``, it will be computed, otherwise the given value is\n used, with ``labels_true`` and ``labels_pred`` ignored.\n\nReturns\n-------\nmi : float\n Mutual information, a non-negative value, measured in nats using the\n natural logarithm.\n\nNotes\n-----\nThe logarithm used is the natural logarithm (base-e).\n\nSee Also\n--------\nadjusted_mutual_info_score : Adjusted against chance Mutual Information.\nnormalized_mutual_info_score : Normalized Mutual Information.", + "description": "Mutual Information between two clusterings.\n\nThe Mutual Information is a measure of the similarity between two labels\nof the same data. Where :math:`|U_i|` is the number of the samples\nin cluster :math:`U_i` and :math:`|V_j|` is the number of the\nsamples in cluster :math:`V_j`, the Mutual Information\nbetween clusterings :math:`U` and :math:`V` is given as:\n\n.. math::\n\n MI(U,V)=\\sum_{i=1}^{|U|} \\sum_{j=1}^{|V|} \\frac{|U_i\\cap V_j|}{N}\n \\log\\frac{N|U_i \\cap V_j|}{|U_i||V_j|}\n\nThis metric is independent of the absolute values of the labels:\na permutation of the class or cluster label values won't change the\nscore value in any way.\n\nThis metric is furthermore symmetric: switching :math:`U` (i.e\n``label_true``) with :math:`V` (i.e. ``label_pred``) will return the\nsame score value. This can be useful to measure the agreement of two\nindependent label assignments strategies on the same dataset when the\nreal ground truth is not known.\n\nRead more in the :ref:`User Guide `.", + "docstring": "Mutual Information between two clusterings.\n\n The Mutual Information is a measure of the similarity between two labels\n of the same data. Where :math:`|U_i|` is the number of the samples\n in cluster :math:`U_i` and :math:`|V_j|` is the number of the\n samples in cluster :math:`V_j`, the Mutual Information\n between clusterings :math:`U` and :math:`V` is given as:\n\n .. math::\n\n MI(U,V)=\\sum_{i=1}^{|U|} \\sum_{j=1}^{|V|} \\frac{|U_i\\cap V_j|}{N}\n \\log\\frac{N|U_i \\cap V_j|}{|U_i||V_j|}\n\n This metric is independent of the absolute values of the labels:\n a permutation of the class or cluster label values won't change the\n score value in any way.\n\n This metric is furthermore symmetric: switching :math:`U` (i.e\n ``label_true``) with :math:`V` (i.e. ``label_pred``) will return the\n same score value. 
This can be useful to measure the agreement of two\n independent label assignments strategies on the same dataset when the\n real ground truth is not known.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n labels_true : int array, shape = [n_samples]\n A clustering of the data into disjoint subsets, called :math:`U` in\n the above formula.\n\n labels_pred : int array-like of shape (n_samples,)\n A clustering of the data into disjoint subsets, called :math:`V` in\n the above formula.\n\n contingency : {ndarray, sparse matrix} of shape (n_classes_true, n_classes_pred), default=None\n A contingency matrix given by the :func:`contingency_matrix` function.\n If value is ``None``, it will be computed, otherwise the given value is\n used, with ``labels_true`` and ``labels_pred`` ignored.\n\n Returns\n -------\n mi : float\n Mutual information, a non-negative value, measured in nats using the\n natural logarithm.\n\n Notes\n -----\n The logarithm used is the natural logarithm (base-e).\n\n See Also\n --------\n adjusted_mutual_info_score : Adjusted against chance Mutual Information.\n normalized_mutual_info_score : Normalized Mutual Information.\n ", "source_code": "\ndef mutual_info_score(labels_true, labels_pred, *, contingency=None):\n \"\"\"Mutual Information between two clusterings.\n\n The Mutual Information is a measure of the similarity between two labels\n of the same data. Where :math:`|U_i|` is the number of the samples\n in cluster :math:`U_i` and :math:`|V_j|` is the number of the\n samples in cluster :math:`V_j`, the Mutual Information\n between clusterings :math:`U` and :math:`V` is given as:\n\n .. math::\n\n MI(U,V)=\\sum_{i=1}^{|U|} \\sum_{j=1}^{|V|} \\frac{|U_i\\cap V_j|}{N}\n \\log\\frac{N|U_i \\cap V_j|}{|U_i||V_j|}\n\n This metric is independent of the absolute values of the labels:\n a permutation of the class or cluster label values won't change the\n score value in any way.\n\n This metric is furthermore symmetric: switching :math:`U` (i.e\n ``label_true``) with :math:`V` (i.e. ``label_pred``) will return the\n same score value. 
This can be useful to measure the agreement of two\n independent label assignments strategies on the same dataset when the\n real ground truth is not known.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n labels_true : int array, shape = [n_samples]\n A clustering of the data into disjoint subsets, called :math:`U` in\n the above formula.\n\n labels_pred : int array-like of shape (n_samples,)\n A clustering of the data into disjoint subsets, called :math:`V` in\n the above formula.\n\n contingency : {ndarray, sparse matrix} of shape (n_classes_true, n_classes_pred), default=None\n A contingency matrix given by the :func:`contingency_matrix` function.\n If value is ``None``, it will be computed, otherwise the given value is\n used, with ``labels_true`` and ``labels_pred`` ignored.\n\n Returns\n -------\n mi : float\n Mutual information, a non-negative value, measured in nats using the\n natural logarithm.\n\n Notes\n -----\n The logarithm used is the natural logarithm (base-e).\n\n See Also\n --------\n adjusted_mutual_info_score : Adjusted against chance Mutual Information.\n normalized_mutual_info_score : Normalized Mutual Information.\n \"\"\"\n if contingency is None:\n (labels_true, labels_pred) = check_clusterings(labels_true, labels_pred)\n contingency = contingency_matrix(labels_true, labels_pred, sparse=True)\n else:\n contingency = check_array(contingency, accept_sparse=['csr', 'csc', 'coo'], dtype=[int, np.int32, np.int64])\n if isinstance(contingency, np.ndarray):\n (nzx, nzy) = np.nonzero(contingency)\n nz_val = contingency[nzx, nzy]\n elif sp.issparse(contingency):\n (nzx, nzy, nz_val) = sp.find(contingency)\n else:\n raise ValueError(\"Unsupported type for 'contingency': %s\" % type(contingency))\n contingency_sum = contingency.sum()\n pi = np.ravel(contingency.sum(axis=1))\n pj = np.ravel(contingency.sum(axis=0))\n log_contingency_nm = np.log(nz_val)\n contingency_nm = nz_val / contingency_sum\n outer = pi.take(nzx).astype(np.int64, copy=False) * pj.take(nzy).astype(np.int64, copy=False)\n log_outer = -np.log(outer) + log(pi.sum()) + log(pj.sum())\n mi = contingency_nm * (log_contingency_nm - log(contingency_sum)) + contingency_nm * log_outer\n mi = np.where(np.abs(mi) < np.finfo(mi.dtype).eps, 0.0, mi)\n return np.clip(mi.sum(), 0.0, None)" }, { @@ -123384,7 +132755,8 @@ "docstring": { "type": "int array, shape = [n_samples]", "description": "A clustering of the data into disjoint subsets." - } + }, + "refined_type": {} }, { "name": "labels_pred", @@ -123394,7 +132766,8 @@ "docstring": { "type": "int array-like of shape (n_samples,)", "description": "A clustering of the data into disjoint subsets." - } + }, + "refined_type": {} }, { "name": "average_method", @@ -123404,13 +132777,14 @@ "docstring": { "type": "str, default='arithmetic'", "description": "How to compute the normalizer in the denominator. Possible options\nare 'min', 'geometric', 'arithmetic', and 'max'.\n\n.. versionadded:: 0.20\n\n.. versionchanged:: 0.22\n The default value of ``average_method`` changed from 'geometric' to\n 'arithmetic'." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Normalized Mutual Information between two clusterings.\n\nNormalized Mutual Information (NMI) is a normalization of the Mutual Information (MI) score to scale the results between 0 (no mutual information) and 1 (perfect correlation). 
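A direct evaluation of the MI formula quoted above from a dense contingency table, compared against mutual_info_score; the label arrays are illustrative.

    import numpy as np
    from sklearn.metrics.cluster import contingency_matrix, mutual_info_score

    labels_true = [0, 0, 1, 1, 2, 2]
    labels_pred = [0, 0, 1, 1, 1, 2]

    c = contingency_matrix(labels_true, labels_pred).astype(float)
    n = c.sum()
    outer = c.sum(axis=1, keepdims=True) * c.sum(axis=0, keepdims=True)
    nz = c > 0  # the sum runs only over non-empty intersections |U_i ∩ V_j|
    mi_manual = np.sum(c[nz] / n * np.log(n * c[nz] / outer[nz]))  # in nats

    print(np.isclose(mi_manual, mutual_info_score(labels_true, labels_pred)))  # True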
In this function, mutual information is normalized by some generalized mean of ``H(labels_true)`` and ``H(labels_pred))``, defined by the `average_method`. This measure is not adjusted for chance. Therefore :func:`adjusted_mutual_info_score` might be preferred. This metric is independent of the absolute values of the labels: a permutation of the class or cluster label values won't change the score value in any way. This metric is furthermore symmetric: switching ``label_true`` with ``label_pred`` will return the same score value. This can be useful to measure the agreement of two independent label assignments strategies on the same dataset when the real ground truth is not known. Read more in the :ref:`User Guide `.", - "docstring": "Normalized Mutual Information between two clusterings.\n\nNormalized Mutual Information (NMI) is a normalization of the Mutual\nInformation (MI) score to scale the results between 0 (no mutual\ninformation) and 1 (perfect correlation). In this function, mutual\ninformation is normalized by some generalized mean of ``H(labels_true)``\nand ``H(labels_pred))``, defined by the `average_method`.\n\nThis measure is not adjusted for chance. Therefore\n:func:`adjusted_mutual_info_score` might be preferred.\n\nThis metric is independent of the absolute values of the labels:\na permutation of the class or cluster label values won't change the\nscore value in any way.\n\nThis metric is furthermore symmetric: switching ``label_true`` with\n``label_pred`` will return the same score value. This can be useful to\nmeasure the agreement of two independent label assignments strategies\non the same dataset when the real ground truth is not known.\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\nlabels_true : int array, shape = [n_samples]\n A clustering of the data into disjoint subsets.\n\nlabels_pred : int array-like of shape (n_samples,)\n A clustering of the data into disjoint subsets.\n\naverage_method : str, default='arithmetic'\n How to compute the normalizer in the denominator. Possible options\n are 'min', 'geometric', 'arithmetic', and 'max'.\n\n .. versionadded:: 0.20\n\n .. versionchanged:: 0.22\n The default value of ``average_method`` changed from 'geometric' to\n 'arithmetic'.\n\nReturns\n-------\nnmi : float\n Score between 0.0 and 1.0 in normalized nats (based on the natural\n logarithm). 1.0 stands for perfectly complete labeling.\n\nSee Also\n--------\nv_measure_score : V-Measure (NMI with arithmetic mean option).\nadjusted_rand_score : Adjusted Rand Index.\nadjusted_mutual_info_score : Adjusted Mutual Information (adjusted\n against chance).\n\nExamples\n--------\n\nPerfect labelings are both homogeneous and complete, hence have\nscore 1.0::\n\n >>> from sklearn.metrics.cluster import normalized_mutual_info_score\n >>> normalized_mutual_info_score([0, 0, 1, 1], [0, 0, 1, 1])\n ... # doctest: +SKIP\n 1.0\n >>> normalized_mutual_info_score([0, 0, 1, 1], [1, 1, 0, 0])\n ... # doctest: +SKIP\n 1.0\n\nIf classes members are completely split across different clusters,\nthe assignment is totally in-complete, hence the NMI is null::\n\n >>> normalized_mutual_info_score([0, 0, 0, 0], [0, 1, 2, 3])\n ... # doctest: +SKIP\n 0.0", + "description": "Normalized Mutual Information between two clusterings.\n\nNormalized Mutual Information (NMI) is a normalization of the Mutual\nInformation (MI) score to scale the results between 0 (no mutual\ninformation) and 1 (perfect correlation). 
In this function, mutual\ninformation is normalized by some generalized mean of ``H(labels_true)``\nand ``H(labels_pred))``, defined by the `average_method`.\n\nThis measure is not adjusted for chance. Therefore\n:func:`adjusted_mutual_info_score` might be preferred.\n\nThis metric is independent of the absolute values of the labels:\na permutation of the class or cluster label values won't change the\nscore value in any way.\n\nThis metric is furthermore symmetric: switching ``label_true`` with\n``label_pred`` will return the same score value. This can be useful to\nmeasure the agreement of two independent label assignments strategies\non the same dataset when the real ground truth is not known.\n\nRead more in the :ref:`User Guide `.", + "docstring": "Normalized Mutual Information between two clusterings.\n\n Normalized Mutual Information (NMI) is a normalization of the Mutual\n Information (MI) score to scale the results between 0 (no mutual\n information) and 1 (perfect correlation). In this function, mutual\n information is normalized by some generalized mean of ``H(labels_true)``\n and ``H(labels_pred))``, defined by the `average_method`.\n\n This measure is not adjusted for chance. Therefore\n :func:`adjusted_mutual_info_score` might be preferred.\n\n This metric is independent of the absolute values of the labels:\n a permutation of the class or cluster label values won't change the\n score value in any way.\n\n This metric is furthermore symmetric: switching ``label_true`` with\n ``label_pred`` will return the same score value. This can be useful to\n measure the agreement of two independent label assignments strategies\n on the same dataset when the real ground truth is not known.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n labels_true : int array, shape = [n_samples]\n A clustering of the data into disjoint subsets.\n\n labels_pred : int array-like of shape (n_samples,)\n A clustering of the data into disjoint subsets.\n\n average_method : str, default='arithmetic'\n How to compute the normalizer in the denominator. Possible options\n are 'min', 'geometric', 'arithmetic', and 'max'.\n\n .. versionadded:: 0.20\n\n .. versionchanged:: 0.22\n The default value of ``average_method`` changed from 'geometric' to\n 'arithmetic'.\n\n Returns\n -------\n nmi : float\n Score between 0.0 and 1.0 in normalized nats (based on the natural\n logarithm). 1.0 stands for perfectly complete labeling.\n\n See Also\n --------\n v_measure_score : V-Measure (NMI with arithmetic mean option).\n adjusted_rand_score : Adjusted Rand Index.\n adjusted_mutual_info_score : Adjusted Mutual Information (adjusted\n against chance).\n\n Examples\n --------\n\n Perfect labelings are both homogeneous and complete, hence have\n score 1.0::\n\n >>> from sklearn.metrics.cluster import normalized_mutual_info_score\n >>> normalized_mutual_info_score([0, 0, 1, 1], [0, 0, 1, 1])\n ... # doctest: +SKIP\n 1.0\n >>> normalized_mutual_info_score([0, 0, 1, 1], [1, 1, 0, 0])\n ... # doctest: +SKIP\n 1.0\n\n If classes members are completely split across different clusters,\n the assignment is totally in-complete, hence the NMI is null::\n\n >>> normalized_mutual_info_score([0, 0, 0, 0], [0, 1, 2, 3])\n ... 
# doctest: +SKIP\n 0.0\n ", "source_code": "\ndef normalized_mutual_info_score(labels_true, labels_pred, *, average_method='arithmetic'):\n \"\"\"Normalized Mutual Information between two clusterings.\n\n Normalized Mutual Information (NMI) is a normalization of the Mutual\n Information (MI) score to scale the results between 0 (no mutual\n information) and 1 (perfect correlation). In this function, mutual\n information is normalized by some generalized mean of ``H(labels_true)``\n and ``H(labels_pred))``, defined by the `average_method`.\n\n This measure is not adjusted for chance. Therefore\n :func:`adjusted_mutual_info_score` might be preferred.\n\n This metric is independent of the absolute values of the labels:\n a permutation of the class or cluster label values won't change the\n score value in any way.\n\n This metric is furthermore symmetric: switching ``label_true`` with\n ``label_pred`` will return the same score value. This can be useful to\n measure the agreement of two independent label assignments strategies\n on the same dataset when the real ground truth is not known.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n labels_true : int array, shape = [n_samples]\n A clustering of the data into disjoint subsets.\n\n labels_pred : int array-like of shape (n_samples,)\n A clustering of the data into disjoint subsets.\n\n average_method : str, default='arithmetic'\n How to compute the normalizer in the denominator. Possible options\n are 'min', 'geometric', 'arithmetic', and 'max'.\n\n .. versionadded:: 0.20\n\n .. versionchanged:: 0.22\n The default value of ``average_method`` changed from 'geometric' to\n 'arithmetic'.\n\n Returns\n -------\n nmi : float\n Score between 0.0 and 1.0 in normalized nats (based on the natural\n logarithm). 1.0 stands for perfectly complete labeling.\n\n See Also\n --------\n v_measure_score : V-Measure (NMI with arithmetic mean option).\n adjusted_rand_score : Adjusted Rand Index.\n adjusted_mutual_info_score : Adjusted Mutual Information (adjusted\n against chance).\n\n Examples\n --------\n\n Perfect labelings are both homogeneous and complete, hence have\n score 1.0::\n\n >>> from sklearn.metrics.cluster import normalized_mutual_info_score\n >>> normalized_mutual_info_score([0, 0, 1, 1], [0, 0, 1, 1])\n ... # doctest: +SKIP\n 1.0\n >>> normalized_mutual_info_score([0, 0, 1, 1], [1, 1, 0, 0])\n ... # doctest: +SKIP\n 1.0\n\n If classes members are completely split across different clusters,\n the assignment is totally in-complete, hence the NMI is null::\n\n >>> normalized_mutual_info_score([0, 0, 0, 0], [0, 1, 2, 3])\n ... 
# doctest: +SKIP\n 0.0\n \"\"\"\n (labels_true, labels_pred) = check_clusterings(labels_true, labels_pred)\n classes = np.unique(labels_true)\n clusters = np.unique(labels_pred)\n if classes.shape[0] == clusters.shape[0] == 1 or classes.shape[0] == clusters.shape[0] == 0:\n return 1.0\n contingency = contingency_matrix(labels_true, labels_pred, sparse=True)\n contingency = contingency.astype(np.float64, **_astype_copy_false(contingency))\n mi = mutual_info_score(labels_true, labels_pred, contingency=contingency)\n (h_true, h_pred) = (entropy(labels_true), entropy(labels_pred))\n normalizer = _generalized_average(h_true, h_pred, average_method)\n normalizer = max(normalizer, np.finfo('float64').eps)\n nmi = mi / normalizer\n return nmi" }, { @@ -123428,7 +132802,8 @@ "docstring": { "type": "array-like of shape (n_samples,), dtype=integral", "description": "Ground truth class labels to be used as a reference." - } + }, + "refined_type": {} }, { "name": "labels_pred", @@ -123438,13 +132813,14 @@ "docstring": { "type": "array-like of shape (n_samples,), dtype=integral", "description": "Cluster labels to evaluate." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Pair confusion matrix arising from two clusterings.\n\nThe pair confusion matrix :math:`C` computes a 2 by 2 similarity matrix between two clusterings by considering all pairs of samples and counting pairs that are assigned into the same or into different clusters under the true and predicted clusterings. Considering a pair of samples that is clustered together a positive pair, then as in binary classification the count of true negatives is :math:`C_{00}`, false negatives is :math:`C_{10}`, true positives is :math:`C_{11}` and false positives is :math:`C_{01}`. Read more in the :ref:`User Guide `.", - "docstring": "Pair confusion matrix arising from two clusterings.\n\nThe pair confusion matrix :math:`C` computes a 2 by 2 similarity matrix\nbetween two clusterings by considering all pairs of samples and counting\npairs that are assigned into the same or into different clusters under\nthe true and predicted clusterings.\n\nConsidering a pair of samples that is clustered together a positive pair,\nthen as in binary classification the count of true negatives is\n:math:`C_{00}`, false negatives is :math:`C_{10}`, true positives is\n:math:`C_{11}` and false positives is :math:`C_{01}`.\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\nlabels_true : array-like of shape (n_samples,), dtype=integral\n Ground truth class labels to be used as a reference.\n\nlabels_pred : array-like of shape (n_samples,), dtype=integral\n Cluster labels to evaluate.\n\nReturns\n-------\nC : ndarray of shape (2, 2), dtype=np.int64\n The contingency matrix.\n\nSee Also\n--------\nrand_score: Rand Score\nadjusted_rand_score: Adjusted Rand Score\nadjusted_mutual_info_score: Adjusted Mutual Information\n\nExamples\n--------\nPerfectly matching labelings have all non-zero entries on the\ndiagonal regardless of actual label values:\n\n >>> from sklearn.metrics.cluster import pair_confusion_matrix\n >>> pair_confusion_matrix([0, 0, 1, 1], [1, 1, 0, 0])\n array([[8, 0],\n [0, 4]]...\n\nLabelings that assign all classes members to the same clusters\nare complete but may be not always pure, hence penalized, and\nhave some off-diagonal non-zero entries:\n\n >>> pair_confusion_matrix([0, 0, 1, 2], [0, 0, 1, 1])\n array([[8, 2],\n [0, 2]]...\n\nNote that the matrix is not symmetric.\n\nReferences\n----------\n.. L. 
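A quick check of the "V-Measure (NMI with arithmetic mean option)" cross-reference above: with the default average_method='arithmetic' the two scores coincide. The labels are illustrative.

    import numpy as np
    from sklearn.metrics.cluster import normalized_mutual_info_score, v_measure_score

    labels_true = [0, 0, 1, 1, 2, 2]
    labels_pred = [0, 0, 1, 2, 2, 2]

    nmi = normalized_mutual_info_score(labels_true, labels_pred)  # arithmetic averaging by default
    print(np.isclose(nmi, v_measure_score(labels_true, labels_pred)))  # True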
Hubert and P. Arabie, Comparing Partitions, Journal of\n Classification 1985\n https://link.springer.com/article/10.1007%2FBF01908075", + "description": "Pair confusion matrix arising from two clusterings.\n\nThe pair confusion matrix :math:`C` computes a 2 by 2 similarity matrix\nbetween two clusterings by considering all pairs of samples and counting\npairs that are assigned into the same or into different clusters under\nthe true and predicted clusterings.\n\nConsidering a pair of samples that is clustered together a positive pair,\nthen as in binary classification the count of true negatives is\n:math:`C_{00}`, false negatives is :math:`C_{10}`, true positives is\n:math:`C_{11}` and false positives is :math:`C_{01}`.\n\nRead more in the :ref:`User Guide `.", + "docstring": "Pair confusion matrix arising from two clusterings.\n\n The pair confusion matrix :math:`C` computes a 2 by 2 similarity matrix\n between two clusterings by considering all pairs of samples and counting\n pairs that are assigned into the same or into different clusters under\n the true and predicted clusterings.\n\n Considering a pair of samples that is clustered together a positive pair,\n then as in binary classification the count of true negatives is\n :math:`C_{00}`, false negatives is :math:`C_{10}`, true positives is\n :math:`C_{11}` and false positives is :math:`C_{01}`.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n labels_true : array-like of shape (n_samples,), dtype=integral\n Ground truth class labels to be used as a reference.\n\n labels_pred : array-like of shape (n_samples,), dtype=integral\n Cluster labels to evaluate.\n\n Returns\n -------\n C : ndarray of shape (2, 2), dtype=np.int64\n The contingency matrix.\n\n See Also\n --------\n rand_score: Rand Score\n adjusted_rand_score: Adjusted Rand Score\n adjusted_mutual_info_score: Adjusted Mutual Information\n\n Examples\n --------\n Perfectly matching labelings have all non-zero entries on the\n diagonal regardless of actual label values:\n\n >>> from sklearn.metrics.cluster import pair_confusion_matrix\n >>> pair_confusion_matrix([0, 0, 1, 1], [1, 1, 0, 0])\n array([[8, 0],\n [0, 4]]...\n\n Labelings that assign all classes members to the same clusters\n are complete but may be not always pure, hence penalized, and\n have some off-diagonal non-zero entries:\n\n >>> pair_confusion_matrix([0, 0, 1, 2], [0, 0, 1, 1])\n array([[8, 2],\n [0, 2]]...\n\n Note that the matrix is not symmetric.\n\n References\n ----------\n .. L. Hubert and P. 
Arabie, Comparing Partitions, Journal of\n Classification 1985\n https://link.springer.com/article/10.1007%2FBF01908075\n ", "source_code": "\ndef pair_confusion_matrix(labels_true, labels_pred):\n \"\"\"Pair confusion matrix arising from two clusterings.\n\n The pair confusion matrix :math:`C` computes a 2 by 2 similarity matrix\n between two clusterings by considering all pairs of samples and counting\n pairs that are assigned into the same or into different clusters under\n the true and predicted clusterings.\n\n Considering a pair of samples that is clustered together a positive pair,\n then as in binary classification the count of true negatives is\n :math:`C_{00}`, false negatives is :math:`C_{10}`, true positives is\n :math:`C_{11}` and false positives is :math:`C_{01}`.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n labels_true : array-like of shape (n_samples,), dtype=integral\n Ground truth class labels to be used as a reference.\n\n labels_pred : array-like of shape (n_samples,), dtype=integral\n Cluster labels to evaluate.\n\n Returns\n -------\n C : ndarray of shape (2, 2), dtype=np.int64\n The contingency matrix.\n\n See Also\n --------\n rand_score: Rand Score\n adjusted_rand_score: Adjusted Rand Score\n adjusted_mutual_info_score: Adjusted Mutual Information\n\n Examples\n --------\n Perfectly matching labelings have all non-zero entries on the\n diagonal regardless of actual label values:\n\n >>> from sklearn.metrics.cluster import pair_confusion_matrix\n >>> pair_confusion_matrix([0, 0, 1, 1], [1, 1, 0, 0])\n array([[8, 0],\n [0, 4]]...\n\n Labelings that assign all classes members to the same clusters\n are complete but may be not always pure, hence penalized, and\n have some off-diagonal non-zero entries:\n\n >>> pair_confusion_matrix([0, 0, 1, 2], [0, 0, 1, 1])\n array([[8, 2],\n [0, 2]]...\n\n Note that the matrix is not symmetric.\n\n References\n ----------\n .. L. Hubert and P. Arabie, Comparing Partitions, Journal of\n Classification 1985\n https://link.springer.com/article/10.1007%2FBF01908075\n \"\"\"\n (labels_true, labels_pred) = check_clusterings(labels_true, labels_pred)\n n_samples = np.int64(labels_true.shape[0])\n contingency = contingency_matrix(labels_true, labels_pred, sparse=True, dtype=np.int64)\n n_c = np.ravel(contingency.sum(axis=1))\n n_k = np.ravel(contingency.sum(axis=0))\n sum_squares = (contingency.data**2).sum()\n C = np.empty((2, 2), dtype=np.int64)\n C[1, 1] = sum_squares - n_samples\n C[0, 1] = contingency.dot(n_k).sum() - sum_squares\n C[1, 0] = contingency.transpose().dot(n_c).sum() - sum_squares\n C[0, 0] = n_samples**2 - C[0, 1] - C[1, 0] - sum_squares\n return C" }, { @@ -123462,7 +132838,8 @@ "docstring": { "type": "array-like of shape (n_samples,), dtype=integral", "description": "Ground truth class labels to be used as a reference." - } + }, + "refined_type": {} }, { "name": "labels_pred", @@ -123472,13 +132849,14 @@ "docstring": { "type": "array-like of shape (n_samples,), dtype=integral", "description": "Cluster labels to evaluate." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Rand index.\n\nThe Rand Index computes a similarity measure between two clusterings by considering all pairs of samples and counting pairs that are assigned in the same or different clusters in the predicted and true clusterings. 
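The pair counting that underlies this score is exactly what the ``pair_confusion_matrix`` entry above computes; a minimal usage sketch on made-up labelings (assuming scikit-learn is installed; entries are doubled because the matrix tallies ordered pairs)::

    import numpy as np
    from sklearn.metrics.cluster import pair_confusion_matrix

    labels_true = [0, 0, 1, 1]   # made-up ground-truth classes
    labels_pred = [0, 0, 1, 2]   # made-up clustering (one class split in two)

    C = pair_confusion_matrix(labels_true, labels_pred)
    # C[1, 1] counts pairs clustered together in both labelings (true positives);
    # C[1, 0] counts pairs together in labels_true but split in labels_pred
    # (false negatives). Counts are doubled relative to unordered pairs.
    print(C)
    # [[8 0]
    #  [2 2]]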
The raw RI score is: RI = (number of agreeing pairs) / (number of pairs) Read more in the :ref:`User Guide `.", - "docstring": "Rand index.\n\nThe Rand Index computes a similarity measure between two clusterings\nby considering all pairs of samples and counting pairs that are\nassigned in the same or different clusters in the predicted and\ntrue clusterings.\n\nThe raw RI score is:\n\n RI = (number of agreeing pairs) / (number of pairs)\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\nlabels_true : array-like of shape (n_samples,), dtype=integral\n Ground truth class labels to be used as a reference.\n\nlabels_pred : array-like of shape (n_samples,), dtype=integral\n Cluster labels to evaluate.\n\nReturns\n-------\nRI : float\n Similarity score between 0.0 and 1.0, inclusive, 1.0 stands for\n perfect match.\n\nSee Also\n--------\nadjusted_rand_score: Adjusted Rand Score\nadjusted_mutual_info_score: Adjusted Mutual Information\n\nExamples\n--------\nPerfectly matching labelings have a score of 1 even\n\n >>> from sklearn.metrics.cluster import rand_score\n >>> rand_score([0, 0, 1, 1], [1, 1, 0, 0])\n 1.0\n\nLabelings that assign all classes members to the same clusters\nare complete but may not always be pure, hence penalized:\n\n >>> rand_score([0, 0, 1, 2], [0, 0, 1, 1])\n 0.83...\n\nReferences\n----------\n.. L. Hubert and P. Arabie, Comparing Partitions, Journal of\n Classification 1985\n https://link.springer.com/article/10.1007%2FBF01908075\n\n.. https://en.wikipedia.org/wiki/Simple_matching_coefficient\n\n.. https://en.wikipedia.org/wiki/Rand_index", + "description": "Rand index.\n\nThe Rand Index computes a similarity measure between two clusterings\nby considering all pairs of samples and counting pairs that are\nassigned in the same or different clusters in the predicted and\ntrue clusterings.\n\nThe raw RI score is:\n\n RI = (number of agreeing pairs) / (number of pairs)\n\nRead more in the :ref:`User Guide `.", + "docstring": "Rand index.\n\n The Rand Index computes a similarity measure between two clusterings\n by considering all pairs of samples and counting pairs that are\n assigned in the same or different clusters in the predicted and\n true clusterings.\n\n The raw RI score is:\n\n RI = (number of agreeing pairs) / (number of pairs)\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n labels_true : array-like of shape (n_samples,), dtype=integral\n Ground truth class labels to be used as a reference.\n\n labels_pred : array-like of shape (n_samples,), dtype=integral\n Cluster labels to evaluate.\n\n Returns\n -------\n RI : float\n Similarity score between 0.0 and 1.0, inclusive, 1.0 stands for\n perfect match.\n\n See Also\n --------\n adjusted_rand_score: Adjusted Rand Score\n adjusted_mutual_info_score: Adjusted Mutual Information\n\n Examples\n --------\n Perfectly matching labelings have a score of 1 even\n\n >>> from sklearn.metrics.cluster import rand_score\n >>> rand_score([0, 0, 1, 1], [1, 1, 0, 0])\n 1.0\n\n Labelings that assign all classes members to the same clusters\n are complete but may not always be pure, hence penalized:\n\n >>> rand_score([0, 0, 1, 2], [0, 0, 1, 1])\n 0.83...\n\n References\n ----------\n .. L. Hubert and P. Arabie, Comparing Partitions, Journal of\n Classification 1985\n https://link.springer.com/article/10.1007%2FBF01908075\n\n .. https://en.wikipedia.org/wiki/Simple_matching_coefficient\n\n .. 
https://en.wikipedia.org/wiki/Rand_index\n ", "source_code": "\ndef rand_score(labels_true, labels_pred):\n \"\"\"Rand index.\n\n The Rand Index computes a similarity measure between two clusterings\n by considering all pairs of samples and counting pairs that are\n assigned in the same or different clusters in the predicted and\n true clusterings.\n\n The raw RI score is:\n\n RI = (number of agreeing pairs) / (number of pairs)\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n labels_true : array-like of shape (n_samples,), dtype=integral\n Ground truth class labels to be used as a reference.\n\n labels_pred : array-like of shape (n_samples,), dtype=integral\n Cluster labels to evaluate.\n\n Returns\n -------\n RI : float\n Similarity score between 0.0 and 1.0, inclusive, 1.0 stands for\n perfect match.\n\n See Also\n --------\n adjusted_rand_score: Adjusted Rand Score\n adjusted_mutual_info_score: Adjusted Mutual Information\n\n Examples\n --------\n Perfectly matching labelings have a score of 1 even\n\n >>> from sklearn.metrics.cluster import rand_score\n >>> rand_score([0, 0, 1, 1], [1, 1, 0, 0])\n 1.0\n\n Labelings that assign all classes members to the same clusters\n are complete but may not always be pure, hence penalized:\n\n >>> rand_score([0, 0, 1, 2], [0, 0, 1, 1])\n 0.83...\n\n References\n ----------\n .. L. Hubert and P. Arabie, Comparing Partitions, Journal of\n Classification 1985\n https://link.springer.com/article/10.1007%2FBF01908075\n\n .. https://en.wikipedia.org/wiki/Simple_matching_coefficient\n\n .. https://en.wikipedia.org/wiki/Rand_index\n \"\"\"\n contingency = pair_confusion_matrix(labels_true, labels_pred)\n numerator = contingency.diagonal().sum()\n denominator = contingency.sum()\n if numerator == denominator or denominator == 0:\n return 1.0\n return numerator / denominator" }, { @@ -123496,7 +132874,8 @@ "docstring": { "type": "int array, shape = [n_samples]", "description": "ground truth class labels to be used as a reference" - } + }, + "refined_type": {} }, { "name": "labels_pred", @@ -123506,7 +132885,8 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "cluster labels to evaluate" - } + }, + "refined_type": {} }, { "name": "beta", @@ -123516,13 +132896,14 @@ "docstring": { "type": "float, default=1.0", "description": "Ratio of weight attributed to ``homogeneity`` vs ``completeness``.\nIf ``beta`` is greater than 1, ``completeness`` is weighted more\nstrongly in the calculation. If ``beta`` is less than 1,\n``homogeneity`` is weighted more strongly." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "V-measure cluster labeling given a ground truth.\n\nThis score is identical to :func:`normalized_mutual_info_score` with the ``'arithmetic'`` option for averaging. The V-measure is the harmonic mean between homogeneity and completeness:: v = (1 + beta) * homogeneity * completeness / (beta * homogeneity + completeness) This metric is independent of the absolute values of the labels: a permutation of the class or cluster label values won't change the score value in any way. This metric is furthermore symmetric: switching ``label_true`` with ``label_pred`` will return the same score value. This can be useful to measure the agreement of two independent label assignments strategies on the same dataset when the real ground truth is not known. 
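For the ``rand_score`` entry above, a matching sketch; the value is simply the fraction of agreeing pairs taken from the pair confusion matrix (same made-up labelings as in its docstring)::

    from sklearn.metrics.cluster import pair_confusion_matrix, rand_score

    labels_true = [0, 0, 1, 2]
    labels_pred = [0, 0, 1, 1]

    C = pair_confusion_matrix(labels_true, labels_pred)
    ri = rand_score(labels_true, labels_pred)

    # RI = (number of agreeing pairs) / (number of pairs) = trace(C) / C.sum()
    assert ri == C.diagonal().sum() / C.sum()
    print(round(ri, 2))  # 0.83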
Read more in the :ref:`User Guide `.", - "docstring": "V-measure cluster labeling given a ground truth.\n\nThis score is identical to :func:`normalized_mutual_info_score` with\nthe ``'arithmetic'`` option for averaging.\n\nThe V-measure is the harmonic mean between homogeneity and completeness::\n\n v = (1 + beta) * homogeneity * completeness\n / (beta * homogeneity + completeness)\n\nThis metric is independent of the absolute values of the labels:\na permutation of the class or cluster label values won't change the\nscore value in any way.\n\nThis metric is furthermore symmetric: switching ``label_true`` with\n``label_pred`` will return the same score value. This can be useful to\nmeasure the agreement of two independent label assignments strategies\non the same dataset when the real ground truth is not known.\n\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\nlabels_true : int array, shape = [n_samples]\n ground truth class labels to be used as a reference\n\nlabels_pred : array-like of shape (n_samples,)\n cluster labels to evaluate\n\nbeta : float, default=1.0\n Ratio of weight attributed to ``homogeneity`` vs ``completeness``.\n If ``beta`` is greater than 1, ``completeness`` is weighted more\n strongly in the calculation. If ``beta`` is less than 1,\n ``homogeneity`` is weighted more strongly.\n\nReturns\n-------\nv_measure : float\n score between 0.0 and 1.0. 1.0 stands for perfectly complete labeling\n\nReferences\n----------\n\n.. [1] `Andrew Rosenberg and Julia Hirschberg, 2007. V-Measure: A\n conditional entropy-based external cluster evaluation measure\n `_\n\nSee Also\n--------\nhomogeneity_score\ncompleteness_score\nnormalized_mutual_info_score\n\nExamples\n--------\n\nPerfect labelings are both homogeneous and complete, hence have score 1.0::\n\n >>> from sklearn.metrics.cluster import v_measure_score\n >>> v_measure_score([0, 0, 1, 1], [0, 0, 1, 1])\n 1.0\n >>> v_measure_score([0, 0, 1, 1], [1, 1, 0, 0])\n 1.0\n\nLabelings that assign all classes members to the same clusters\nare complete be not homogeneous, hence penalized::\n\n >>> print(\"%.6f\" % v_measure_score([0, 0, 1, 2], [0, 0, 1, 1]))\n 0.8...\n >>> print(\"%.6f\" % v_measure_score([0, 1, 2, 3], [0, 0, 1, 1]))\n 0.66...\n\nLabelings that have pure clusters with members coming from the same\nclasses are homogeneous but un-necessary splits harms completeness\nand thus penalize V-measure as well::\n\n >>> print(\"%.6f\" % v_measure_score([0, 0, 1, 1], [0, 0, 1, 2]))\n 0.8...\n >>> print(\"%.6f\" % v_measure_score([0, 0, 1, 1], [0, 1, 2, 3]))\n 0.66...\n\nIf classes members are completely split across different clusters,\nthe assignment is totally incomplete, hence the V-Measure is null::\n\n >>> print(\"%.6f\" % v_measure_score([0, 0, 0, 0], [0, 1, 2, 3]))\n 0.0...\n\nClusters that include samples from totally different classes totally\ndestroy the homogeneity of the labeling, hence::\n\n >>> print(\"%.6f\" % v_measure_score([0, 0, 1, 1], [0, 0, 0, 0]))\n 0.0...", + "description": "V-measure cluster labeling given a ground truth.\n\nThis score is identical to :func:`normalized_mutual_info_score` with\nthe ``'arithmetic'`` option for averaging.\n\nThe V-measure is the harmonic mean between homogeneity and completeness::\n\n v = (1 + beta) * homogeneity * completeness\n / (beta * homogeneity + completeness)\n\nThis metric is independent of the absolute values of the labels:\na permutation of the class or cluster label values won't change the\nscore value in any way.\n\nThis metric is furthermore 
symmetric: switching ``label_true`` with\n``label_pred`` will return the same score value. This can be useful to\nmeasure the agreement of two independent label assignments strategies\non the same dataset when the real ground truth is not known.\n\nRead more in the :ref:`User Guide `.", + "docstring": "V-measure cluster labeling given a ground truth.\n\n This score is identical to :func:`normalized_mutual_info_score` with\n the ``'arithmetic'`` option for averaging.\n\n The V-measure is the harmonic mean between homogeneity and completeness::\n\n v = (1 + beta) * homogeneity * completeness\n / (beta * homogeneity + completeness)\n\n This metric is independent of the absolute values of the labels:\n a permutation of the class or cluster label values won't change the\n score value in any way.\n\n This metric is furthermore symmetric: switching ``label_true`` with\n ``label_pred`` will return the same score value. This can be useful to\n measure the agreement of two independent label assignments strategies\n on the same dataset when the real ground truth is not known.\n\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n labels_true : int array, shape = [n_samples]\n ground truth class labels to be used as a reference\n\n labels_pred : array-like of shape (n_samples,)\n cluster labels to evaluate\n\n beta : float, default=1.0\n Ratio of weight attributed to ``homogeneity`` vs ``completeness``.\n If ``beta`` is greater than 1, ``completeness`` is weighted more\n strongly in the calculation. If ``beta`` is less than 1,\n ``homogeneity`` is weighted more strongly.\n\n Returns\n -------\n v_measure : float\n score between 0.0 and 1.0. 1.0 stands for perfectly complete labeling\n\n References\n ----------\n\n .. [1] `Andrew Rosenberg and Julia Hirschberg, 2007. 
V-Measure: A\n conditional entropy-based external cluster evaluation measure\n `_\n\n See Also\n --------\n homogeneity_score\n completeness_score\n normalized_mutual_info_score\n\n Examples\n --------\n\n Perfect labelings are both homogeneous and complete, hence have score 1.0::\n\n >>> from sklearn.metrics.cluster import v_measure_score\n >>> v_measure_score([0, 0, 1, 1], [0, 0, 1, 1])\n 1.0\n >>> v_measure_score([0, 0, 1, 1], [1, 1, 0, 0])\n 1.0\n\n Labelings that assign all classes members to the same clusters\n are complete be not homogeneous, hence penalized::\n\n >>> print(\"%.6f\" % v_measure_score([0, 0, 1, 2], [0, 0, 1, 1]))\n 0.8...\n >>> print(\"%.6f\" % v_measure_score([0, 1, 2, 3], [0, 0, 1, 1]))\n 0.66...\n\n Labelings that have pure clusters with members coming from the same\n classes are homogeneous but un-necessary splits harms completeness\n and thus penalize V-measure as well::\n\n >>> print(\"%.6f\" % v_measure_score([0, 0, 1, 1], [0, 0, 1, 2]))\n 0.8...\n >>> print(\"%.6f\" % v_measure_score([0, 0, 1, 1], [0, 1, 2, 3]))\n 0.66...\n\n If classes members are completely split across different clusters,\n the assignment is totally incomplete, hence the V-Measure is null::\n\n >>> print(\"%.6f\" % v_measure_score([0, 0, 0, 0], [0, 1, 2, 3]))\n 0.0...\n\n Clusters that include samples from totally different classes totally\n destroy the homogeneity of the labeling, hence::\n\n >>> print(\"%.6f\" % v_measure_score([0, 0, 1, 1], [0, 0, 0, 0]))\n 0.0...\n ", "source_code": "\ndef v_measure_score(labels_true, labels_pred, *, beta=1.0):\n \"\"\"V-measure cluster labeling given a ground truth.\n\n This score is identical to :func:`normalized_mutual_info_score` with\n the ``'arithmetic'`` option for averaging.\n\n The V-measure is the harmonic mean between homogeneity and completeness::\n\n v = (1 + beta) * homogeneity * completeness\n / (beta * homogeneity + completeness)\n\n This metric is independent of the absolute values of the labels:\n a permutation of the class or cluster label values won't change the\n score value in any way.\n\n This metric is furthermore symmetric: switching ``label_true`` with\n ``label_pred`` will return the same score value. This can be useful to\n measure the agreement of two independent label assignments strategies\n on the same dataset when the real ground truth is not known.\n\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n labels_true : int array, shape = [n_samples]\n ground truth class labels to be used as a reference\n\n labels_pred : array-like of shape (n_samples,)\n cluster labels to evaluate\n\n beta : float, default=1.0\n Ratio of weight attributed to ``homogeneity`` vs ``completeness``.\n If ``beta`` is greater than 1, ``completeness`` is weighted more\n strongly in the calculation. If ``beta`` is less than 1,\n ``homogeneity`` is weighted more strongly.\n\n Returns\n -------\n v_measure : float\n score between 0.0 and 1.0. 1.0 stands for perfectly complete labeling\n\n References\n ----------\n\n .. [1] `Andrew Rosenberg and Julia Hirschberg, 2007. 
V-Measure: A\n conditional entropy-based external cluster evaluation measure\n `_\n\n See Also\n --------\n homogeneity_score\n completeness_score\n normalized_mutual_info_score\n\n Examples\n --------\n\n Perfect labelings are both homogeneous and complete, hence have score 1.0::\n\n >>> from sklearn.metrics.cluster import v_measure_score\n >>> v_measure_score([0, 0, 1, 1], [0, 0, 1, 1])\n 1.0\n >>> v_measure_score([0, 0, 1, 1], [1, 1, 0, 0])\n 1.0\n\n Labelings that assign all classes members to the same clusters\n are complete be not homogeneous, hence penalized::\n\n >>> print(\"%.6f\" % v_measure_score([0, 0, 1, 2], [0, 0, 1, 1]))\n 0.8...\n >>> print(\"%.6f\" % v_measure_score([0, 1, 2, 3], [0, 0, 1, 1]))\n 0.66...\n\n Labelings that have pure clusters with members coming from the same\n classes are homogeneous but un-necessary splits harms completeness\n and thus penalize V-measure as well::\n\n >>> print(\"%.6f\" % v_measure_score([0, 0, 1, 1], [0, 0, 1, 2]))\n 0.8...\n >>> print(\"%.6f\" % v_measure_score([0, 0, 1, 1], [0, 1, 2, 3]))\n 0.66...\n\n If classes members are completely split across different clusters,\n the assignment is totally incomplete, hence the V-Measure is null::\n\n >>> print(\"%.6f\" % v_measure_score([0, 0, 0, 0], [0, 1, 2, 3]))\n 0.0...\n\n Clusters that include samples from totally different classes totally\n destroy the homogeneity of the labeling, hence::\n\n >>> print(\"%.6f\" % v_measure_score([0, 0, 1, 1], [0, 0, 0, 0]))\n 0.0...\n \"\"\"\n return homogeneity_completeness_v_measure(labels_true, labels_pred, beta=beta)[2]" }, { @@ -123540,7 +132921,8 @@ "docstring": { "type": "array-like of shape (n_chunk_samples, n_samples)", "description": "Precomputed distances for a chunk." - } + }, + "refined_type": {} }, { "name": "start", @@ -123550,7 +132932,8 @@ "docstring": { "type": "int", "description": "First index in the chunk." - } + }, + "refined_type": {} }, { "name": "labels", @@ -123560,6 +132943,10 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "Corresponding cluster labels, encoded as {0, ..., n_clusters-1}." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -123570,13 +132957,14 @@ "docstring": { "type": "array-like", "description": "Distribution of cluster labels in ``labels``." 
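Returning to the ``v_measure_score`` entry above, a small sketch of the harmonic-mean relationship with homogeneity and completeness and of the symmetry noted in its docstring (labelings made up for illustration)::

    from sklearn.metrics.cluster import (
        completeness_score,
        homogeneity_score,
        v_measure_score,
    )

    labels_true = [0, 0, 1, 2]
    labels_pred = [0, 0, 1, 1]

    h = homogeneity_score(labels_true, labels_pred)
    c = completeness_score(labels_true, labels_pred)
    v = v_measure_score(labels_true, labels_pred)  # beta=1.0 by default

    # With beta=1, the V-measure is the harmonic mean of homogeneity and completeness.
    assert abs(v - 2 * h * c / (h + c)) < 1e-12

    # Swapping the two labelings leaves the score unchanged (symmetry).
    assert abs(v - v_measure_score(labels_pred, labels_true)) < 1e-12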
- } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Accumulate silhouette statistics for vertical chunk of X.", - "docstring": "Accumulate silhouette statistics for vertical chunk of X.\n\nParameters\n----------\nD_chunk : array-like of shape (n_chunk_samples, n_samples)\n Precomputed distances for a chunk.\nstart : int\n First index in the chunk.\nlabels : array-like of shape (n_samples,)\n Corresponding cluster labels, encoded as {0, ..., n_clusters-1}.\nlabel_freqs : array-like\n Distribution of cluster labels in ``labels``.", + "docstring": "Accumulate silhouette statistics for vertical chunk of X.\n\n Parameters\n ----------\n D_chunk : array-like of shape (n_chunk_samples, n_samples)\n Precomputed distances for a chunk.\n start : int\n First index in the chunk.\n labels : array-like of shape (n_samples,)\n Corresponding cluster labels, encoded as {0, ..., n_clusters-1}.\n label_freqs : array-like\n Distribution of cluster labels in ``labels``.\n ", "source_code": "\ndef _silhouette_reduce(D_chunk, start, labels, label_freqs):\n \"\"\"Accumulate silhouette statistics for vertical chunk of X.\n\n Parameters\n ----------\n D_chunk : array-like of shape (n_chunk_samples, n_samples)\n Precomputed distances for a chunk.\n start : int\n First index in the chunk.\n labels : array-like of shape (n_samples,)\n Corresponding cluster labels, encoded as {0, ..., n_clusters-1}.\n label_freqs : array-like\n Distribution of cluster labels in ``labels``.\n \"\"\"\n clust_dists = np.zeros((len(D_chunk), len(label_freqs)), dtype=D_chunk.dtype)\n for i in range(len(D_chunk)):\n clust_dists[i] += np.bincount(labels, weights=D_chunk[i], minlength=len(label_freqs))\n intra_index = (np.arange(len(D_chunk)), labels[start:start + len(D_chunk)])\n intra_clust_dists = clust_dists[intra_index]\n clust_dists[intra_index] = np.inf\n clust_dists /= label_freqs\n inter_clust_dists = clust_dists.min(axis=1)\n return intra_clust_dists, inter_clust_dists" }, { @@ -123594,7 +132982,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "A list of ``n_features``-dimensional data points. Each row corresponds\nto a single data point." - } + }, + "refined_type": {} }, { "name": "labels", @@ -123604,13 +132993,14 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "Predicted labels for each sample." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Compute the Calinski and Harabasz score.\n\nIt is also known as the Variance Ratio Criterion. The score is defined as ratio between the within-cluster dispersion and the between-cluster dispersion. Read more in the :ref:`User Guide `.", - "docstring": "Compute the Calinski and Harabasz score.\n\nIt is also known as the Variance Ratio Criterion.\n\nThe score is defined as ratio between the within-cluster dispersion and\nthe between-cluster dispersion.\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n A list of ``n_features``-dimensional data points. Each row corresponds\n to a single data point.\n\nlabels : array-like of shape (n_samples,)\n Predicted labels for each sample.\n\nReturns\n-------\nscore : float\n The resulting Calinski-Harabasz score.\n\nReferences\n----------\n.. [1] `T. Calinski and J. Harabasz, 1974. \"A dendrite method for cluster\n analysis\". 
Communications in Statistics\n `_", + "description": "Compute the Calinski and Harabasz score.\n\nIt is also known as the Variance Ratio Criterion.\n\nThe score is defined as ratio between the within-cluster dispersion and\nthe between-cluster dispersion.\n\nRead more in the :ref:`User Guide `.", + "docstring": "Compute the Calinski and Harabasz score.\n\n It is also known as the Variance Ratio Criterion.\n\n The score is defined as ratio between the within-cluster dispersion and\n the between-cluster dispersion.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n A list of ``n_features``-dimensional data points. Each row corresponds\n to a single data point.\n\n labels : array-like of shape (n_samples,)\n Predicted labels for each sample.\n\n Returns\n -------\n score : float\n The resulting Calinski-Harabasz score.\n\n References\n ----------\n .. [1] `T. Calinski and J. Harabasz, 1974. \"A dendrite method for cluster\n analysis\". Communications in Statistics\n `_\n ", "source_code": "\ndef calinski_harabasz_score(X, labels):\n \"\"\"Compute the Calinski and Harabasz score.\n\n It is also known as the Variance Ratio Criterion.\n\n The score is defined as ratio between the within-cluster dispersion and\n the between-cluster dispersion.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n A list of ``n_features``-dimensional data points. Each row corresponds\n to a single data point.\n\n labels : array-like of shape (n_samples,)\n Predicted labels for each sample.\n\n Returns\n -------\n score : float\n The resulting Calinski-Harabasz score.\n\n References\n ----------\n .. [1] `T. Calinski and J. Harabasz, 1974. \"A dendrite method for cluster\n analysis\". Communications in Statistics\n `_\n \"\"\"\n (X, labels) = check_X_y(X, labels)\n le = LabelEncoder()\n labels = le.fit_transform(labels)\n (n_samples, _) = X.shape\n n_labels = len(le.classes_)\n check_number_of_labels(n_labels, n_samples)\n (extra_disp, intra_disp) = (0.0, 0.0)\n mean = np.mean(X, axis=0)\n for k in range(n_labels):\n cluster_k = X[labels == k]\n mean_k = np.mean(cluster_k, axis=0)\n extra_disp += len(cluster_k) * np.sum((mean_k - mean)**2)\n intra_disp += np.sum((cluster_k - mean_k)**2)\n return 1.0 if intra_disp == 0.0 else extra_disp * (n_samples - n_labels) / (intra_disp * (n_labels - 1.0))" }, { @@ -123628,7 +133018,8 @@ "docstring": { "type": "int", "description": "Number of labels." - } + }, + "refined_type": {} }, { "name": "n_samples", @@ -123638,13 +133029,14 @@ "docstring": { "type": "int", "description": "Number of samples." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Check that number of labels are valid.", - "docstring": "Check that number of labels are valid.\n\nParameters\n----------\nn_labels : int\n Number of labels.\n\nn_samples : int\n Number of samples.", + "docstring": "Check that number of labels are valid.\n\n Parameters\n ----------\n n_labels : int\n Number of labels.\n\n n_samples : int\n Number of samples.\n ", "source_code": "\ndef check_number_of_labels(n_labels, n_samples):\n \"\"\"Check that number of labels are valid.\n\n Parameters\n ----------\n n_labels : int\n Number of labels.\n\n n_samples : int\n Number of samples.\n \"\"\"\n if not 1 < n_labels < n_samples:\n raise ValueError('Number of labels is %d. 
Valid values are 2 to n_samples - 1 (inclusive)' % n_labels)" }, { @@ -123662,7 +133054,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "A list of ``n_features``-dimensional data points. Each row corresponds\nto a single data point." - } + }, + "refined_type": {} }, { "name": "labels", @@ -123672,14 +133065,15 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "Predicted labels for each sample." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Computes the Davies-Bouldin score.\n\nThe score is defined as the average similarity measure of each cluster with its most similar cluster, where similarity is the ratio of within-cluster distances to between-cluster distances. Thus, clusters which are farther apart and less dispersed will result in a better score. The minimum score is zero, with lower values indicating better clustering. Read more in the :ref:`User Guide `. .. versionadded:: 0.20", - "docstring": "Computes the Davies-Bouldin score.\n\nThe score is defined as the average similarity measure of each cluster with\nits most similar cluster, where similarity is the ratio of within-cluster\ndistances to between-cluster distances. Thus, clusters which are farther\napart and less dispersed will result in a better score.\n\nThe minimum score is zero, with lower values indicating better clustering.\n\nRead more in the :ref:`User Guide `.\n\n.. versionadded:: 0.20\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n A list of ``n_features``-dimensional data points. Each row corresponds\n to a single data point.\n\nlabels : array-like of shape (n_samples,)\n Predicted labels for each sample.\n\nReturns\n-------\nscore: float\n The resulting Davies-Bouldin score.\n\nReferences\n----------\n.. [1] Davies, David L.; Bouldin, Donald W. (1979).\n `\"A Cluster Separation Measure\"\n `__.\n IEEE Transactions on Pattern Analysis and Machine Intelligence.\n PAMI-1 (2): 224-227", - "source_code": "\ndef davies_bouldin_score(X, labels):\n \"\"\"Computes the Davies-Bouldin score.\n\n The score is defined as the average similarity measure of each cluster with\n its most similar cluster, where similarity is the ratio of within-cluster\n distances to between-cluster distances. Thus, clusters which are farther\n apart and less dispersed will result in a better score.\n\n The minimum score is zero, with lower values indicating better clustering.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.20\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n A list of ``n_features``-dimensional data points. Each row corresponds\n to a single data point.\n\n labels : array-like of shape (n_samples,)\n Predicted labels for each sample.\n\n Returns\n -------\n score: float\n The resulting Davies-Bouldin score.\n\n References\n ----------\n .. [1] Davies, David L.; Bouldin, Donald W. 
(1979).\n `\"A Cluster Separation Measure\"\n `__.\n IEEE Transactions on Pattern Analysis and Machine Intelligence.\n PAMI-1 (2): 224-227\n \"\"\"\n (X, labels) = check_X_y(X, labels)\n le = LabelEncoder()\n labels = le.fit_transform(labels)\n (n_samples, _) = X.shape\n n_labels = len(le.classes_)\n check_number_of_labels(n_labels, n_samples)\n intra_dists = np.zeros(n_labels)\n centroids = np.zeros((n_labels, len(X[0])), dtype=float)\n for k in range(n_labels):\n cluster_k = _safe_indexing(X, labels == k)\n centroid = cluster_k.mean(axis=0)\n centroids[k] = centroid\n intra_dists[k] = np.average(pairwise_distances(cluster_k, [centroid]))\n centroid_distances = pairwise_distances(centroids)\n if np.allclose(intra_dists, 0) or np.allclose(centroid_distances, 0):\n return 0.0\n centroid_distances[centroid_distances == 0] = np.inf\n combined_intra_dists = intra_dists[:, None] + intra_dists\n scores = np.max(combined_intra_dists / centroid_distances, axis=1)\n return np.mean(scores)" + "description": "Compute the Davies-Bouldin score.\n\nThe score is defined as the average similarity measure of each cluster with\nits most similar cluster, where similarity is the ratio of within-cluster\ndistances to between-cluster distances. Thus, clusters which are farther\napart and less dispersed will result in a better score.\n\nThe minimum score is zero, with lower values indicating better clustering.\n\nRead more in the :ref:`User Guide `.\n\n.. versionadded:: 0.20", + "docstring": "Compute the Davies-Bouldin score.\n\n The score is defined as the average similarity measure of each cluster with\n its most similar cluster, where similarity is the ratio of within-cluster\n distances to between-cluster distances. Thus, clusters which are farther\n apart and less dispersed will result in a better score.\n\n The minimum score is zero, with lower values indicating better clustering.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.20\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n A list of ``n_features``-dimensional data points. Each row corresponds\n to a single data point.\n\n labels : array-like of shape (n_samples,)\n Predicted labels for each sample.\n\n Returns\n -------\n score: float\n The resulting Davies-Bouldin score.\n\n References\n ----------\n .. [1] Davies, David L.; Bouldin, Donald W. (1979).\n `\"A Cluster Separation Measure\"\n `__.\n IEEE Transactions on Pattern Analysis and Machine Intelligence.\n PAMI-1 (2): 224-227\n ", + "source_code": "\ndef davies_bouldin_score(X, labels):\n \"\"\"Compute the Davies-Bouldin score.\n\n The score is defined as the average similarity measure of each cluster with\n its most similar cluster, where similarity is the ratio of within-cluster\n distances to between-cluster distances. Thus, clusters which are farther\n apart and less dispersed will result in a better score.\n\n The minimum score is zero, with lower values indicating better clustering.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.20\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n A list of ``n_features``-dimensional data points. Each row corresponds\n to a single data point.\n\n labels : array-like of shape (n_samples,)\n Predicted labels for each sample.\n\n Returns\n -------\n score: float\n The resulting Davies-Bouldin score.\n\n References\n ----------\n .. [1] Davies, David L.; Bouldin, Donald W. 
(1979).\n `\"A Cluster Separation Measure\"\n `__.\n IEEE Transactions on Pattern Analysis and Machine Intelligence.\n PAMI-1 (2): 224-227\n \"\"\"\n (X, labels) = check_X_y(X, labels)\n le = LabelEncoder()\n labels = le.fit_transform(labels)\n (n_samples, _) = X.shape\n n_labels = len(le.classes_)\n check_number_of_labels(n_labels, n_samples)\n intra_dists = np.zeros(n_labels)\n centroids = np.zeros((n_labels, len(X[0])), dtype=float)\n for k in range(n_labels):\n cluster_k = _safe_indexing(X, labels == k)\n centroid = cluster_k.mean(axis=0)\n centroids[k] = centroid\n intra_dists[k] = np.average(pairwise_distances(cluster_k, [centroid]))\n centroid_distances = pairwise_distances(centroids)\n if np.allclose(intra_dists, 0) or np.allclose(centroid_distances, 0):\n return 0.0\n centroid_distances[centroid_distances == 0] = np.inf\n combined_intra_dists = intra_dists[:, None] + intra_dists\n scores = np.max(combined_intra_dists / centroid_distances, axis=1)\n return np.mean(scores)" }, { "name": "silhouette_samples", @@ -123696,7 +133090,8 @@ "docstring": { "type": "array-like of shape (n_samples_a, n_samples_a) if metric == \"precomputed\" or (n_samples_a, n_features) otherwise", "description": "An array of pairwise distances between samples, or a feature array." - } + }, + "refined_type": {} }, { "name": "labels", @@ -123706,7 +133101,8 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "Label values for each sample." - } + }, + "refined_type": {} }, { "name": "metric", @@ -123716,14 +133112,15 @@ "docstring": { "type": "str or callable, default='euclidean'", "description": "The metric to use when calculating distance between instances in a\nfeature array. If metric is a string, it must be one of the options\nallowed by :func:`sklearn.metrics.pairwise.pairwise_distances`.\nIf ``X`` is the distance array itself, use \"precomputed\" as the metric.\nPrecomputed distance matrices must have 0 along the diagonal." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Compute the Silhouette Coefficient for each sample.\n\nThe Silhouette Coefficient is a measure of how well samples are clustered with samples that are similar to themselves. Clustering models with a high Silhouette Coefficient are said to be dense, where samples in the same cluster are similar to each other, and well separated, where samples in different clusters are not very similar to each other. The Silhouette Coefficient is calculated using the mean intra-cluster distance (``a``) and the mean nearest-cluster distance (``b``) for each sample. The Silhouette Coefficient for a sample is ``(b - a) / max(a, b)``. Note that Silhouette Coefficient is only defined if number of labels is 2 ``<= n_labels <= n_samples - 1``. This function returns the Silhouette Coefficient for each sample. The best value is 1 and the worst value is -1. Values near 0 indicate overlapping clusters. Read more in the :ref:`User Guide `.", - "docstring": "Compute the Silhouette Coefficient for each sample.\n\nThe Silhouette Coefficient is a measure of how well samples are clustered\nwith samples that are similar to themselves. 
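The ``calinski_harabasz_score`` and ``davies_bouldin_score`` entries above both take a feature matrix plus predicted labels rather than two labelings; a minimal sketch with two made-up, well-separated clusters (higher is better for Calinski-Harabasz, lower is better for Davies-Bouldin)::

    import numpy as np
    from sklearn.metrics import calinski_harabasz_score, davies_bouldin_score

    # Two tight clusters far apart from each other.
    X = np.array([[0.0, 0.0], [0.0, 1.0], [10.0, 10.0], [10.0, 11.0]])
    labels = np.array([0, 0, 1, 1])

    ch = calinski_harabasz_score(X, labels)  # large: between-cluster dispersion dominates
    db = davies_bouldin_score(X, labels)     # small (~0.07): compact, well-separated clusters
    print(ch, db)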
Clustering models with a high\nSilhouette Coefficient are said to be dense, where samples in the same\ncluster are similar to each other, and well separated, where samples in\ndifferent clusters are not very similar to each other.\n\nThe Silhouette Coefficient is calculated using the mean intra-cluster\ndistance (``a``) and the mean nearest-cluster distance (``b``) for each\nsample. The Silhouette Coefficient for a sample is ``(b - a) / max(a,\nb)``.\nNote that Silhouette Coefficient is only defined if number of labels\nis 2 ``<= n_labels <= n_samples - 1``.\n\nThis function returns the Silhouette Coefficient for each sample.\n\nThe best value is 1 and the worst value is -1. Values near 0 indicate\noverlapping clusters.\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\nX : array-like of shape (n_samples_a, n_samples_a) if metric == \"precomputed\" or (n_samples_a, n_features) otherwise\n An array of pairwise distances between samples, or a feature array.\n\nlabels : array-like of shape (n_samples,)\n Label values for each sample.\n\nmetric : str or callable, default='euclidean'\n The metric to use when calculating distance between instances in a\n feature array. If metric is a string, it must be one of the options\n allowed by :func:`sklearn.metrics.pairwise.pairwise_distances`.\n If ``X`` is the distance array itself, use \"precomputed\" as the metric.\n Precomputed distance matrices must have 0 along the diagonal.\n\n`**kwds` : optional keyword parameters\n Any further parameters are passed directly to the distance function.\n If using a ``scipy.spatial.distance`` metric, the parameters are still\n metric dependent. See the scipy docs for usage examples.\n\nReturns\n-------\nsilhouette : array-like of shape (n_samples,)\n Silhouette Coefficients for each sample.\n\nReferences\n----------\n\n.. [1] `Peter J. Rousseeuw (1987). \"Silhouettes: a Graphical Aid to the\n Interpretation and Validation of Cluster Analysis\". Computational\n and Applied Mathematics 20: 53-65.\n `_\n\n.. [2] `Wikipedia entry on the Silhouette Coefficient\n `_", - "source_code": "\ndef silhouette_samples(X, labels, *, metric='euclidean', **kwds):\n \"\"\"Compute the Silhouette Coefficient for each sample.\n\n The Silhouette Coefficient is a measure of how well samples are clustered\n with samples that are similar to themselves. Clustering models with a high\n Silhouette Coefficient are said to be dense, where samples in the same\n cluster are similar to each other, and well separated, where samples in\n different clusters are not very similar to each other.\n\n The Silhouette Coefficient is calculated using the mean intra-cluster\n distance (``a``) and the mean nearest-cluster distance (``b``) for each\n sample. The Silhouette Coefficient for a sample is ``(b - a) / max(a,\n b)``.\n Note that Silhouette Coefficient is only defined if number of labels\n is 2 ``<= n_labels <= n_samples - 1``.\n\n This function returns the Silhouette Coefficient for each sample.\n\n The best value is 1 and the worst value is -1. 
Values near 0 indicate\n overlapping clusters.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n X : array-like of shape (n_samples_a, n_samples_a) if metric == \"precomputed\" or (n_samples_a, n_features) otherwise\n An array of pairwise distances between samples, or a feature array.\n\n labels : array-like of shape (n_samples,)\n Label values for each sample.\n\n metric : str or callable, default='euclidean'\n The metric to use when calculating distance between instances in a\n feature array. If metric is a string, it must be one of the options\n allowed by :func:`sklearn.metrics.pairwise.pairwise_distances`.\n If ``X`` is the distance array itself, use \"precomputed\" as the metric.\n Precomputed distance matrices must have 0 along the diagonal.\n\n `**kwds` : optional keyword parameters\n Any further parameters are passed directly to the distance function.\n If using a ``scipy.spatial.distance`` metric, the parameters are still\n metric dependent. See the scipy docs for usage examples.\n\n Returns\n -------\n silhouette : array-like of shape (n_samples,)\n Silhouette Coefficients for each sample.\n\n References\n ----------\n\n .. [1] `Peter J. Rousseeuw (1987). \"Silhouettes: a Graphical Aid to the\n Interpretation and Validation of Cluster Analysis\". Computational\n and Applied Mathematics 20: 53-65.\n `_\n\n .. [2] `Wikipedia entry on the Silhouette Coefficient\n `_\n\n \"\"\"\n (X, labels) = check_X_y(X, labels, accept_sparse=['csc', 'csr'])\n if metric == 'precomputed':\n atol = np.finfo(X.dtype).eps * 100\n if np.any(np.abs(np.diagonal(X)) > atol):\n raise ValueError('The precomputed distance matrix contains non-zero elements on the diagonal. Use np.fill_diagonal(X, 0).')\n le = LabelEncoder()\n labels = le.fit_transform(labels)\n n_samples = len(labels)\n label_freqs = np.bincount(labels)\n check_number_of_labels(len(le.classes_), n_samples)\n kwds['metric'] = metric\n reduce_func = functools.partial(_silhouette_reduce, labels=labels, label_freqs=label_freqs)\n results = zip(*pairwise_distances_chunked(X, reduce_func=reduce_func, **kwds))\n (intra_clust_dists, inter_clust_dists) = results\n intra_clust_dists = np.concatenate(intra_clust_dists)\n inter_clust_dists = np.concatenate(inter_clust_dists)\n denom = (label_freqs - 1).take(labels, mode='clip')\n with np.errstate(divide='ignore', invalid='ignore'):\n intra_clust_dists /= denom\n sil_samples = inter_clust_dists - intra_clust_dists\n with np.errstate(divide='ignore', invalid='ignore'):\n sil_samples /= np.maximum(intra_clust_dists, inter_clust_dists)\n return np.nan_to_num(sil_samples)" + "description": "Compute the Silhouette Coefficient for each sample.\n\nThe Silhouette Coefficient is a measure of how well samples are clustered\nwith samples that are similar to themselves. Clustering models with a high\nSilhouette Coefficient are said to be dense, where samples in the same\ncluster are similar to each other, and well separated, where samples in\ndifferent clusters are not very similar to each other.\n\nThe Silhouette Coefficient is calculated using the mean intra-cluster\ndistance (``a``) and the mean nearest-cluster distance (``b``) for each\nsample. The Silhouette Coefficient for a sample is ``(b - a) / max(a,\nb)``.\nNote that Silhouette Coefficient is only defined if number of labels\nis 2 ``<= n_labels <= n_samples - 1``.\n\nThis function returns the Silhouette Coefficient for each sample.\n\nThe best value is 1 and the worst value is -1. 
Values near 0 indicate\noverlapping clusters.\n\nRead more in the :ref:`User Guide `.", + "docstring": "Compute the Silhouette Coefficient for each sample.\n\n The Silhouette Coefficient is a measure of how well samples are clustered\n with samples that are similar to themselves. Clustering models with a high\n Silhouette Coefficient are said to be dense, where samples in the same\n cluster are similar to each other, and well separated, where samples in\n different clusters are not very similar to each other.\n\n The Silhouette Coefficient is calculated using the mean intra-cluster\n distance (``a``) and the mean nearest-cluster distance (``b``) for each\n sample. The Silhouette Coefficient for a sample is ``(b - a) / max(a,\n b)``.\n Note that Silhouette Coefficient is only defined if number of labels\n is 2 ``<= n_labels <= n_samples - 1``.\n\n This function returns the Silhouette Coefficient for each sample.\n\n The best value is 1 and the worst value is -1. Values near 0 indicate\n overlapping clusters.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n X : array-like of shape (n_samples_a, n_samples_a) if metric == \"precomputed\" or (n_samples_a, n_features) otherwise\n An array of pairwise distances between samples, or a feature array.\n\n labels : array-like of shape (n_samples,)\n Label values for each sample.\n\n metric : str or callable, default='euclidean'\n The metric to use when calculating distance between instances in a\n feature array. If metric is a string, it must be one of the options\n allowed by :func:`sklearn.metrics.pairwise.pairwise_distances`.\n If ``X`` is the distance array itself, use \"precomputed\" as the metric.\n Precomputed distance matrices must have 0 along the diagonal.\n\n **kwds : optional keyword parameters\n Any further parameters are passed directly to the distance function.\n If using a ``scipy.spatial.distance`` metric, the parameters are still\n metric dependent. See the scipy docs for usage examples.\n\n Returns\n -------\n silhouette : array-like of shape (n_samples,)\n Silhouette Coefficients for each sample.\n\n References\n ----------\n\n .. [1] `Peter J. Rousseeuw (1987). \"Silhouettes: a Graphical Aid to the\n Interpretation and Validation of Cluster Analysis\". Computational\n and Applied Mathematics 20: 53-65.\n `_\n\n .. [2] `Wikipedia entry on the Silhouette Coefficient\n `_\n ", + "source_code": "\ndef silhouette_samples(X, labels, *, metric='euclidean', **kwds):\n \"\"\"Compute the Silhouette Coefficient for each sample.\n\n The Silhouette Coefficient is a measure of how well samples are clustered\n with samples that are similar to themselves. Clustering models with a high\n Silhouette Coefficient are said to be dense, where samples in the same\n cluster are similar to each other, and well separated, where samples in\n different clusters are not very similar to each other.\n\n The Silhouette Coefficient is calculated using the mean intra-cluster\n distance (``a``) and the mean nearest-cluster distance (``b``) for each\n sample. The Silhouette Coefficient for a sample is ``(b - a) / max(a,\n b)``.\n Note that Silhouette Coefficient is only defined if number of labels\n is 2 ``<= n_labels <= n_samples - 1``.\n\n This function returns the Silhouette Coefficient for each sample.\n\n The best value is 1 and the worst value is -1. 
Values near 0 indicate\n overlapping clusters.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n X : array-like of shape (n_samples_a, n_samples_a) if metric == \"precomputed\" or (n_samples_a, n_features) otherwise\n An array of pairwise distances between samples, or a feature array.\n\n labels : array-like of shape (n_samples,)\n Label values for each sample.\n\n metric : str or callable, default='euclidean'\n The metric to use when calculating distance between instances in a\n feature array. If metric is a string, it must be one of the options\n allowed by :func:`sklearn.metrics.pairwise.pairwise_distances`.\n If ``X`` is the distance array itself, use \"precomputed\" as the metric.\n Precomputed distance matrices must have 0 along the diagonal.\n\n **kwds : optional keyword parameters\n Any further parameters are passed directly to the distance function.\n If using a ``scipy.spatial.distance`` metric, the parameters are still\n metric dependent. See the scipy docs for usage examples.\n\n Returns\n -------\n silhouette : array-like of shape (n_samples,)\n Silhouette Coefficients for each sample.\n\n References\n ----------\n\n .. [1] `Peter J. Rousseeuw (1987). \"Silhouettes: a Graphical Aid to the\n Interpretation and Validation of Cluster Analysis\". Computational\n and Applied Mathematics 20: 53-65.\n `_\n\n .. [2] `Wikipedia entry on the Silhouette Coefficient\n `_\n \"\"\"\n (X, labels) = check_X_y(X, labels, accept_sparse=['csc', 'csr'])\n if metric == 'precomputed':\n atol = np.finfo(X.dtype).eps * 100\n if np.any(np.abs(np.diagonal(X)) > atol):\n raise ValueError('The precomputed distance matrix contains non-zero elements on the diagonal. Use np.fill_diagonal(X, 0).')\n le = LabelEncoder()\n labels = le.fit_transform(labels)\n n_samples = len(labels)\n label_freqs = np.bincount(labels)\n check_number_of_labels(len(le.classes_), n_samples)\n kwds['metric'] = metric\n reduce_func = functools.partial(_silhouette_reduce, labels=labels, label_freqs=label_freqs)\n results = zip(*pairwise_distances_chunked(X, reduce_func=reduce_func, **kwds))\n (intra_clust_dists, inter_clust_dists) = results\n intra_clust_dists = np.concatenate(intra_clust_dists)\n inter_clust_dists = np.concatenate(inter_clust_dists)\n denom = (label_freqs - 1).take(labels, mode='clip')\n with np.errstate(divide='ignore', invalid='ignore'):\n intra_clust_dists /= denom\n sil_samples = inter_clust_dists - intra_clust_dists\n with np.errstate(divide='ignore', invalid='ignore'):\n sil_samples /= np.maximum(intra_clust_dists, inter_clust_dists)\n return np.nan_to_num(sil_samples)" }, { "name": "silhouette_score", @@ -123740,7 +133137,8 @@ "docstring": { "type": "array-like of shape (n_samples_a, n_samples_a) if metric == \"precomputed\" or (n_samples_a, n_features) otherwise", "description": "An array of pairwise distances between samples, or a feature array." - } + }, + "refined_type": {} }, { "name": "labels", @@ -123750,7 +133148,8 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "Predicted labels for each sample." - } + }, + "refined_type": {} }, { "name": "metric", @@ -123760,7 +133159,8 @@ "docstring": { "type": "str or callable, default='euclidean'", "description": "The metric to use when calculating distance between instances in a\nfeature array. If metric is a string, it must be one of the options\nallowed by :func:`metrics.pairwise.pairwise_distances\n`. If ``X`` is\nthe distance array itself, use ``metric=\"precomputed\"``." 
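A usage sketch for the ``silhouette_samples`` entry above on the same kind of made-up data; per-sample coefficients near +1 indicate points that sit well inside their own cluster::

    import numpy as np
    from sklearn.metrics import silhouette_samples

    X = np.array([[0.0, 0.0], [0.0, 1.0], [10.0, 10.0], [10.0, 11.0]])
    labels = np.array([0, 0, 1, 1])

    s = silhouette_samples(X, labels)  # shape (n_samples,)
    # Each value is (b - a) / max(a, b) for that sample; here all are close to 1
    # because the mean nearest-cluster distance b dwarfs the intra-cluster distance a.
    print(np.round(s, 3))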
- } + }, + "refined_type": {} }, { "name": "sample_size", @@ -123770,7 +133170,8 @@ "docstring": { "type": "int, default=None", "description": "The size of the sample to use when computing the Silhouette Coefficient\non a random subset of the data.\nIf ``sample_size is None``, no sampling is used." - } + }, + "refined_type": {} }, { "name": "random_state", @@ -123780,14 +133181,15 @@ "docstring": { "type": "int, RandomState instance or None, default=None", "description": "Determines random number generation for selecting a subset of samples.\nUsed when ``sample_size is not None``.\nPass an int for reproducible results across multiple function calls.\nSee :term:`Glossary `." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Compute the mean Silhouette Coefficient of all samples.\n\nThe Silhouette Coefficient is calculated using the mean intra-cluster distance (``a``) and the mean nearest-cluster distance (``b``) for each sample. The Silhouette Coefficient for a sample is ``(b - a) / max(a, b)``. To clarify, ``b`` is the distance between a sample and the nearest cluster that the sample is not a part of. Note that Silhouette Coefficient is only defined if number of labels is ``2 <= n_labels <= n_samples - 1``. This function returns the mean Silhouette Coefficient over all samples. To obtain the values for each sample, use :func:`silhouette_samples`. The best value is 1 and the worst value is -1. Values near 0 indicate overlapping clusters. Negative values generally indicate that a sample has been assigned to the wrong cluster, as a different cluster is more similar. Read more in the :ref:`User Guide `.", - "docstring": "Compute the mean Silhouette Coefficient of all samples.\n\nThe Silhouette Coefficient is calculated using the mean intra-cluster\ndistance (``a``) and the mean nearest-cluster distance (``b``) for each\nsample. The Silhouette Coefficient for a sample is ``(b - a) / max(a,\nb)``. To clarify, ``b`` is the distance between a sample and the nearest\ncluster that the sample is not a part of.\nNote that Silhouette Coefficient is only defined if number of labels\nis ``2 <= n_labels <= n_samples - 1``.\n\nThis function returns the mean Silhouette Coefficient over all samples.\nTo obtain the values for each sample, use :func:`silhouette_samples`.\n\nThe best value is 1 and the worst value is -1. Values near 0 indicate\noverlapping clusters. Negative values generally indicate that a sample has\nbeen assigned to the wrong cluster, as a different cluster is more similar.\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\nX : array-like of shape (n_samples_a, n_samples_a) if metric == \"precomputed\" or (n_samples_a, n_features) otherwise\n An array of pairwise distances between samples, or a feature array.\n\nlabels : array-like of shape (n_samples,)\n Predicted labels for each sample.\n\nmetric : str or callable, default='euclidean'\n The metric to use when calculating distance between instances in a\n feature array. If metric is a string, it must be one of the options\n allowed by :func:`metrics.pairwise.pairwise_distances\n `. 
If ``X`` is\n the distance array itself, use ``metric=\"precomputed\"``.\n\nsample_size : int, default=None\n The size of the sample to use when computing the Silhouette Coefficient\n on a random subset of the data.\n If ``sample_size is None``, no sampling is used.\n\nrandom_state : int, RandomState instance or None, default=None\n Determines random number generation for selecting a subset of samples.\n Used when ``sample_size is not None``.\n Pass an int for reproducible results across multiple function calls.\n See :term:`Glossary `.\n\n**kwds : optional keyword parameters\n Any further parameters are passed directly to the distance function.\n If using a scipy.spatial.distance metric, the parameters are still\n metric dependent. See the scipy docs for usage examples.\n\nReturns\n-------\nsilhouette : float\n Mean Silhouette Coefficient for all samples.\n\nReferences\n----------\n\n.. [1] `Peter J. Rousseeuw (1987). \"Silhouettes: a Graphical Aid to the\n Interpretation and Validation of Cluster Analysis\". Computational\n and Applied Mathematics 20: 53-65.\n `_\n\n.. [2] `Wikipedia entry on the Silhouette Coefficient\n `_", - "source_code": "\ndef silhouette_score(X, labels, *, metric='euclidean', sample_size=None, random_state=None, **kwds):\n \"\"\"Compute the mean Silhouette Coefficient of all samples.\n\n The Silhouette Coefficient is calculated using the mean intra-cluster\n distance (``a``) and the mean nearest-cluster distance (``b``) for each\n sample. The Silhouette Coefficient for a sample is ``(b - a) / max(a,\n b)``. To clarify, ``b`` is the distance between a sample and the nearest\n cluster that the sample is not a part of.\n Note that Silhouette Coefficient is only defined if number of labels\n is ``2 <= n_labels <= n_samples - 1``.\n\n This function returns the mean Silhouette Coefficient over all samples.\n To obtain the values for each sample, use :func:`silhouette_samples`.\n\n The best value is 1 and the worst value is -1. Values near 0 indicate\n overlapping clusters. Negative values generally indicate that a sample has\n been assigned to the wrong cluster, as a different cluster is more similar.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n X : array-like of shape (n_samples_a, n_samples_a) if metric == \"precomputed\" or (n_samples_a, n_features) otherwise\n An array of pairwise distances between samples, or a feature array.\n\n labels : array-like of shape (n_samples,)\n Predicted labels for each sample.\n\n metric : str or callable, default='euclidean'\n The metric to use when calculating distance between instances in a\n feature array. If metric is a string, it must be one of the options\n allowed by :func:`metrics.pairwise.pairwise_distances\n `. If ``X`` is\n the distance array itself, use ``metric=\"precomputed\"``.\n\n sample_size : int, default=None\n The size of the sample to use when computing the Silhouette Coefficient\n on a random subset of the data.\n If ``sample_size is None``, no sampling is used.\n\n random_state : int, RandomState instance or None, default=None\n Determines random number generation for selecting a subset of samples.\n Used when ``sample_size is not None``.\n Pass an int for reproducible results across multiple function calls.\n See :term:`Glossary `.\n\n **kwds : optional keyword parameters\n Any further parameters are passed directly to the distance function.\n If using a scipy.spatial.distance metric, the parameters are still\n metric dependent. 
See the scipy docs for usage examples.\n\n Returns\n -------\n silhouette : float\n Mean Silhouette Coefficient for all samples.\n\n References\n ----------\n\n .. [1] `Peter J. Rousseeuw (1987). \"Silhouettes: a Graphical Aid to the\n Interpretation and Validation of Cluster Analysis\". Computational\n and Applied Mathematics 20: 53-65.\n `_\n\n .. [2] `Wikipedia entry on the Silhouette Coefficient\n `_\n\n \"\"\"\n if sample_size is not None:\n (X, labels) = check_X_y(X, labels, accept_sparse=['csc', 'csr'])\n random_state = check_random_state(random_state)\n indices = random_state.permutation(X.shape[0])[:sample_size]\n if metric == 'precomputed':\n (X, labels) = (X[indices].T[indices].T, labels[indices])\n else:\n (X, labels) = (X[indices], labels[indices])\n return np.mean(silhouette_samples(X, labels, metric=metric, **kwds))" + "description": "Compute the mean Silhouette Coefficient of all samples.\n\nThe Silhouette Coefficient is calculated using the mean intra-cluster\ndistance (``a``) and the mean nearest-cluster distance (``b``) for each\nsample. The Silhouette Coefficient for a sample is ``(b - a) / max(a,\nb)``. To clarify, ``b`` is the distance between a sample and the nearest\ncluster that the sample is not a part of.\nNote that Silhouette Coefficient is only defined if number of labels\nis ``2 <= n_labels <= n_samples - 1``.\n\nThis function returns the mean Silhouette Coefficient over all samples.\nTo obtain the values for each sample, use :func:`silhouette_samples`.\n\nThe best value is 1 and the worst value is -1. Values near 0 indicate\noverlapping clusters. Negative values generally indicate that a sample has\nbeen assigned to the wrong cluster, as a different cluster is more similar.\n\nRead more in the :ref:`User Guide `.", + "docstring": "Compute the mean Silhouette Coefficient of all samples.\n\n The Silhouette Coefficient is calculated using the mean intra-cluster\n distance (``a``) and the mean nearest-cluster distance (``b``) for each\n sample. The Silhouette Coefficient for a sample is ``(b - a) / max(a,\n b)``. To clarify, ``b`` is the distance between a sample and the nearest\n cluster that the sample is not a part of.\n Note that Silhouette Coefficient is only defined if number of labels\n is ``2 <= n_labels <= n_samples - 1``.\n\n This function returns the mean Silhouette Coefficient over all samples.\n To obtain the values for each sample, use :func:`silhouette_samples`.\n\n The best value is 1 and the worst value is -1. Values near 0 indicate\n overlapping clusters. Negative values generally indicate that a sample has\n been assigned to the wrong cluster, as a different cluster is more similar.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n X : array-like of shape (n_samples_a, n_samples_a) if metric == \"precomputed\" or (n_samples_a, n_features) otherwise\n An array of pairwise distances between samples, or a feature array.\n\n labels : array-like of shape (n_samples,)\n Predicted labels for each sample.\n\n metric : str or callable, default='euclidean'\n The metric to use when calculating distance between instances in a\n feature array. If metric is a string, it must be one of the options\n allowed by :func:`metrics.pairwise.pairwise_distances\n `. 
If ``X`` is\n the distance array itself, use ``metric=\"precomputed\"``.\n\n sample_size : int, default=None\n The size of the sample to use when computing the Silhouette Coefficient\n on a random subset of the data.\n If ``sample_size is None``, no sampling is used.\n\n random_state : int, RandomState instance or None, default=None\n Determines random number generation for selecting a subset of samples.\n Used when ``sample_size is not None``.\n Pass an int for reproducible results across multiple function calls.\n See :term:`Glossary `.\n\n **kwds : optional keyword parameters\n Any further parameters are passed directly to the distance function.\n If using a scipy.spatial.distance metric, the parameters are still\n metric dependent. See the scipy docs for usage examples.\n\n Returns\n -------\n silhouette : float\n Mean Silhouette Coefficient for all samples.\n\n References\n ----------\n\n .. [1] `Peter J. Rousseeuw (1987). \"Silhouettes: a Graphical Aid to the\n Interpretation and Validation of Cluster Analysis\". Computational\n and Applied Mathematics 20: 53-65.\n `_\n\n .. [2] `Wikipedia entry on the Silhouette Coefficient\n `_\n ", + "source_code": "\ndef silhouette_score(X, labels, *, metric='euclidean', sample_size=None, random_state=None, **kwds):\n \"\"\"Compute the mean Silhouette Coefficient of all samples.\n\n The Silhouette Coefficient is calculated using the mean intra-cluster\n distance (``a``) and the mean nearest-cluster distance (``b``) for each\n sample. The Silhouette Coefficient for a sample is ``(b - a) / max(a,\n b)``. To clarify, ``b`` is the distance between a sample and the nearest\n cluster that the sample is not a part of.\n Note that Silhouette Coefficient is only defined if number of labels\n is ``2 <= n_labels <= n_samples - 1``.\n\n This function returns the mean Silhouette Coefficient over all samples.\n To obtain the values for each sample, use :func:`silhouette_samples`.\n\n The best value is 1 and the worst value is -1. Values near 0 indicate\n overlapping clusters. Negative values generally indicate that a sample has\n been assigned to the wrong cluster, as a different cluster is more similar.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n X : array-like of shape (n_samples_a, n_samples_a) if metric == \"precomputed\" or (n_samples_a, n_features) otherwise\n An array of pairwise distances between samples, or a feature array.\n\n labels : array-like of shape (n_samples,)\n Predicted labels for each sample.\n\n metric : str or callable, default='euclidean'\n The metric to use when calculating distance between instances in a\n feature array. If metric is a string, it must be one of the options\n allowed by :func:`metrics.pairwise.pairwise_distances\n `. If ``X`` is\n the distance array itself, use ``metric=\"precomputed\"``.\n\n sample_size : int, default=None\n The size of the sample to use when computing the Silhouette Coefficient\n on a random subset of the data.\n If ``sample_size is None``, no sampling is used.\n\n random_state : int, RandomState instance or None, default=None\n Determines random number generation for selecting a subset of samples.\n Used when ``sample_size is not None``.\n Pass an int for reproducible results across multiple function calls.\n See :term:`Glossary `.\n\n **kwds : optional keyword parameters\n Any further parameters are passed directly to the distance function.\n If using a scipy.spatial.distance metric, the parameters are still\n metric dependent. 
See the scipy docs for usage examples.\n\n Returns\n -------\n silhouette : float\n Mean Silhouette Coefficient for all samples.\n\n References\n ----------\n\n .. [1] `Peter J. Rousseeuw (1987). \"Silhouettes: a Graphical Aid to the\n Interpretation and Validation of Cluster Analysis\". Computational\n and Applied Mathematics 20: 53-65.\n `_\n\n .. [2] `Wikipedia entry on the Silhouette Coefficient\n `_\n \"\"\"\n if sample_size is not None:\n (X, labels) = check_X_y(X, labels, accept_sparse=['csc', 'csr'])\n random_state = check_random_state(random_state)\n indices = random_state.permutation(X.shape[0])[:sample_size]\n if metric == 'precomputed':\n (X, labels) = (X[indices].T[indices].T, labels[indices])\n else:\n (X, labels) = (X[indices], labels[indices])\n return np.mean(silhouette_samples(X, labels, metric=metric, **kwds))" }, { "name": "configuration", @@ -123804,7 +133206,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "top_path", @@ -123814,13 +133217,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef configuration(parent_package='', top_path=None):\n config = Configuration('cluster', parent_package, top_path)\n libraries = []\n if os.name == 'posix':\n libraries.append('m')\n config.add_extension('_expected_mutual_info_fast', sources=['_expected_mutual_info_fast.pyx'], include_dirs=[numpy.get_include()], libraries=libraries)\n config.add_subpackage('tests')\n return config" }, { @@ -123838,7 +133242,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "start", @@ -123848,13 +133253,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _argmin_min_reduce(dist, start):\n indices = dist.argmin(axis=1)\n values = dist[np.arange(dist.shape[0]), indices]\n return indices, values" }, { @@ -123872,7 +133278,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "chunk_size", @@ -123882,7 +133289,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -123906,7 +133314,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "dist_matrix", @@ -123916,7 +133325,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "slice_", @@ -123926,7 +133336,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -123950,7 +133361,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "Y", @@ -123960,7 +133372,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X_norm_squared", @@ -123970,7 +133383,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "Y_norm_squared", @@ -123980,7 +133394,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "squared", @@ -123990,13 +133405,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Computational part of euclidean_distances\n\nAssumes inputs are already checked. If norms are passed as float32, they are unused. 
If arrays are passed as float32, norms needs to be recomputed on upcast chunks. TODO: use a float64 accumulator in row_norms to avoid the latter.", - "docstring": "Computational part of euclidean_distances\n\nAssumes inputs are already checked.\n\nIf norms are passed as float32, they are unused. If arrays are passed as\nfloat32, norms needs to be recomputed on upcast chunks.\nTODO: use a float64 accumulator in row_norms to avoid the latter.", + "description": "Computational part of euclidean_distances\n\nAssumes inputs are already checked.\n\nIf norms are passed as float32, they are unused. If arrays are passed as\nfloat32, norms needs to be recomputed on upcast chunks.\nTODO: use a float64 accumulator in row_norms to avoid the latter.", + "docstring": "Computational part of euclidean_distances\n\n Assumes inputs are already checked.\n\n If norms are passed as float32, they are unused. If arrays are passed as\n float32, norms needs to be recomputed on upcast chunks.\n TODO: use a float64 accumulator in row_norms to avoid the latter.\n ", "source_code": "\ndef _euclidean_distances(X, Y, X_norm_squared=None, Y_norm_squared=None, squared=False):\n \"\"\"Computational part of euclidean_distances\n\n Assumes inputs are already checked.\n\n If norms are passed as float32, they are unused. If arrays are passed as\n float32, norms needs to be recomputed on upcast chunks.\n TODO: use a float64 accumulator in row_norms to avoid the latter.\n \"\"\"\n if X_norm_squared is not None:\n if X_norm_squared.dtype == np.float32:\n XX = None\n else:\n XX = X_norm_squared.reshape(-1, 1)\n elif X.dtype == np.float32:\n XX = None\n else:\n XX = row_norms(X, squared=True)[:, np.newaxis]\n if Y is X:\n YY = None if XX is None else XX.T\n elif Y_norm_squared is not None:\n if Y_norm_squared.dtype == np.float32:\n YY = None\n else:\n YY = Y_norm_squared.reshape(1, -1)\n elif Y.dtype == np.float32:\n YY = None\n else:\n YY = row_norms(Y, squared=True)[np.newaxis, :]\n if X.dtype == np.float32:\n distances = _euclidean_distances_upcast(X, XX, Y, YY)\n else:\n distances = -2 * safe_sparse_dot(X, Y.T, dense_output=True)\n distances += XX\n distances += YY\n np.maximum(distances, 0, out=distances)\n if X is Y:\n np.fill_diagonal(distances, 0)\n return distances if squared else np.sqrt(distances, out=distances)" }, { @@ -124014,7 +133430,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "XX", @@ -124024,7 +133441,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "Y", @@ -124034,7 +133452,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "YY", @@ -124044,7 +133463,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "batch_size", @@ -124054,13 +133474,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Euclidean distances between X and Y.\n\nAssumes X and Y have float32 dtype. Assumes XX and YY have float64 dtype or are None. 
X and Y are upcast to float64 by chunks, which size is chosen to limit memory increase by approximately 10% (at least 10MiB).", - "docstring": "Euclidean distances between X and Y.\n\nAssumes X and Y have float32 dtype.\nAssumes XX and YY have float64 dtype or are None.\n\nX and Y are upcast to float64 by chunks, which size is chosen to limit\nmemory increase by approximately 10% (at least 10MiB).", + "description": "Euclidean distances between X and Y.\n\nAssumes X and Y have float32 dtype.\nAssumes XX and YY have float64 dtype or are None.\n\nX and Y are upcast to float64 by chunks, which size is chosen to limit\nmemory increase by approximately 10% (at least 10MiB).", + "docstring": "Euclidean distances between X and Y.\n\n Assumes X and Y have float32 dtype.\n Assumes XX and YY have float64 dtype or are None.\n\n X and Y are upcast to float64 by chunks, which size is chosen to limit\n memory increase by approximately 10% (at least 10MiB).\n ", "source_code": "\ndef _euclidean_distances_upcast(X, XX=None, Y=None, YY=None, batch_size=None):\n \"\"\"Euclidean distances between X and Y.\n\n Assumes X and Y have float32 dtype.\n Assumes XX and YY have float64 dtype or are None.\n\n X and Y are upcast to float64 by chunks, which size is chosen to limit\n memory increase by approximately 10% (at least 10MiB).\n \"\"\"\n n_samples_X = X.shape[0]\n n_samples_Y = Y.shape[0]\n n_features = X.shape[1]\n distances = np.empty((n_samples_X, n_samples_Y), dtype=np.float32)\n if batch_size is None:\n x_density = X.nnz / np.prod(X.shape) if issparse(X) else 1\n y_density = Y.nnz / np.prod(Y.shape) if issparse(Y) else 1\n maxmem = max(((x_density * n_samples_X + y_density * n_samples_Y) * n_features + x_density * n_samples_X * y_density * n_samples_Y) / 10, 10 * 2**17)\n tmp = (x_density + y_density) * n_features\n batch_size = (-tmp + np.sqrt(tmp**2 + 4 * maxmem)) / 2\n batch_size = max(int(batch_size), 1)\n x_batches = gen_batches(n_samples_X, batch_size)\n for (i, x_slice) in enumerate(x_batches):\n X_chunk = X[x_slice].astype(np.float64)\n if XX is None:\n XX_chunk = row_norms(X_chunk, squared=True)[:, np.newaxis]\n else:\n XX_chunk = XX[x_slice]\n y_batches = gen_batches(n_samples_Y, batch_size)\n for (j, y_slice) in enumerate(y_batches):\n if X is Y and j < i:\n d = distances[y_slice, x_slice].T\n else:\n Y_chunk = Y[y_slice].astype(np.float64)\n if YY is None:\n YY_chunk = row_norms(Y_chunk, squared=True)[np.newaxis, :]\n else:\n YY_chunk = YY[:, y_slice]\n d = -2 * safe_sparse_dot(X_chunk, Y_chunk.T, dense_output=True)\n d += XX_chunk\n d += YY_chunk\n distances[x_slice, y_slice] = d.astype(np.float32, copy=False)\n return distances" }, { @@ -124078,7 +133499,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "Y", @@ -124088,7 +133510,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "metric", @@ -124098,7 +133521,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "force_all_finite", @@ -124108,7 +133532,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -124132,7 +133557,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "Y", @@ -124142,7 +133568,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "func", @@ -124152,7 +133579,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_jobs", @@ 
-124162,13 +133590,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Break the pairwise matrix in n_jobs even slices and compute them in parallel.", - "docstring": "Break the pairwise matrix in n_jobs even slices\nand compute them in parallel.", + "description": "Break the pairwise matrix in n_jobs even slices\nand compute them in parallel.", + "docstring": "Break the pairwise matrix in n_jobs even slices\n and compute them in parallel.", "source_code": "\ndef _parallel_pairwise(X, Y, func, n_jobs, **kwds):\n \"\"\"Break the pairwise matrix in n_jobs even slices\n and compute them in parallel.\"\"\"\n if Y is None:\n Y = X\n (X, Y, dtype) = _return_float_dtype(X, Y)\n if effective_n_jobs(n_jobs) == 1:\n return func(X, Y, **kwds)\n fd = delayed(_dist_wrapper)\n ret = np.empty((X.shape[0], Y.shape[0]), dtype=dtype, order='F')\n Parallel(backend='threading', n_jobs=n_jobs)((fd(func, ret, s, X, Y[s], **kwds) for s in gen_even_slices(_num_samples(Y), effective_n_jobs(n_jobs))))\n if (X is Y or Y is None) and func is euclidean_distances:\n np.fill_diagonal(ret, 0)\n return ret" }, { @@ -124186,7 +133615,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "Y", @@ -124196,7 +133626,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "metric", @@ -124206,7 +133637,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -124230,7 +133662,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "Y", @@ -124240,13 +133673,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "1. If dtype of X and Y is float32, then dtype float32 is returned. 2. Else dtype float is returned.", - "docstring": "1. If dtype of X and Y is float32, then dtype float32 is returned.\n2. Else dtype float is returned.", + "description": "1. If dtype of X and Y is float32, then dtype float32 is returned.\n2. Else dtype float is returned.", + "docstring": "\n 1. If dtype of X and Y is float32, then dtype float32 is returned.\n 2. Else dtype float is returned.\n ", "source_code": "\ndef _return_float_dtype(X, Y):\n \"\"\"\n 1. If dtype of X and Y is float32, then dtype float32 is returned.\n 2. Else dtype float is returned.\n \"\"\"\n if not issparse(X) and not isinstance(X, np.ndarray):\n X = np.asarray(X)\n if Y is None:\n Y_dtype = X.dtype\n elif not issparse(Y) and not isinstance(Y, np.ndarray):\n Y = np.asarray(Y)\n Y_dtype = Y.dtype\n else:\n Y_dtype = Y.dtype\n if X.dtype == Y_dtype == np.float32:\n dtype = np.float32\n else:\n dtype = float\n return X, Y, dtype" }, { @@ -124264,7 +133698,8 @@ "docstring": { "type": "array-like of shape (n_samples_X, n_features)", "description": "" - } + }, + "refined_type": {} }, { "name": "Y", @@ -124274,13 +133709,14 @@ "docstring": { "type": "ndarray of shape (n_samples_Y, n_features), default=None", "description": "If `None`, uses `Y=X`." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Computes the additive chi-squared kernel between observations in X and Y.\n\nThe chi-squared kernel is computed between each pair of rows in X and Y. X and Y have to be non-negative. This kernel is most commonly applied to histograms. 
The chi-squared kernel is given by:: k(x, y) = -Sum [(x - y)^2 / (x + y)] It can be interpreted as a weighted difference per entry. Read more in the :ref:`User Guide `.", - "docstring": "Computes the additive chi-squared kernel between observations in X and\nY.\n\nThe chi-squared kernel is computed between each pair of rows in X and Y. X\nand Y have to be non-negative. This kernel is most commonly applied to\nhistograms.\n\nThe chi-squared kernel is given by::\n\n k(x, y) = -Sum [(x - y)^2 / (x + y)]\n\nIt can be interpreted as a weighted difference per entry.\n\nRead more in the :ref:`User Guide `.\n\nNotes\n-----\nAs the negative of a distance, this kernel is only conditionally positive\ndefinite.\n\n\nParameters\n----------\nX : array-like of shape (n_samples_X, n_features)\n\nY : ndarray of shape (n_samples_Y, n_features), default=None\n If `None`, uses `Y=X`.\n\nReturns\n-------\nkernel_matrix : ndarray of shape (n_samples_X, n_samples_Y)\n\nSee Also\n--------\nchi2_kernel : The exponentiated version of the kernel, which is usually\n preferable.\nsklearn.kernel_approximation.AdditiveChi2Sampler : A Fourier approximation\n to this kernel.\n\nReferences\n----------\n* Zhang, J. and Marszalek, M. and Lazebnik, S. and Schmid, C.\n Local features and kernels for classification of texture and object\n categories: A comprehensive study\n International Journal of Computer Vision 2007\n https://research.microsoft.com/en-us/um/people/manik/projects/trade-off/papers/ZhangIJCV06.pdf", + "description": "Computes the additive chi-squared kernel between observations in X and\nY.\n\nThe chi-squared kernel is computed between each pair of rows in X and Y. X\nand Y have to be non-negative. This kernel is most commonly applied to\nhistograms.\n\nThe chi-squared kernel is given by::\n\n k(x, y) = -Sum [(x - y)^2 / (x + y)]\n\nIt can be interpreted as a weighted difference per entry.\n\nRead more in the :ref:`User Guide `.", + "docstring": "Computes the additive chi-squared kernel between observations in X and\n Y.\n\n The chi-squared kernel is computed between each pair of rows in X and Y. X\n and Y have to be non-negative. This kernel is most commonly applied to\n histograms.\n\n The chi-squared kernel is given by::\n\n k(x, y) = -Sum [(x - y)^2 / (x + y)]\n\n It can be interpreted as a weighted difference per entry.\n\n Read more in the :ref:`User Guide `.\n\n Notes\n -----\n As the negative of a distance, this kernel is only conditionally positive\n definite.\n\n\n Parameters\n ----------\n X : array-like of shape (n_samples_X, n_features)\n\n Y : ndarray of shape (n_samples_Y, n_features), default=None\n If `None`, uses `Y=X`.\n\n Returns\n -------\n kernel_matrix : ndarray of shape (n_samples_X, n_samples_Y)\n\n See Also\n --------\n chi2_kernel : The exponentiated version of the kernel, which is usually\n preferable.\n sklearn.kernel_approximation.AdditiveChi2Sampler : A Fourier approximation\n to this kernel.\n\n References\n ----------\n * Zhang, J. and Marszalek, M. and Lazebnik, S. and Schmid, C.\n Local features and kernels for classification of texture and object\n categories: A comprehensive study\n International Journal of Computer Vision 2007\n https://research.microsoft.com/en-us/um/people/manik/projects/trade-off/papers/ZhangIJCV06.pdf\n ", "source_code": "\ndef additive_chi2_kernel(X, Y=None):\n \"\"\"Computes the additive chi-squared kernel between observations in X and\n Y.\n\n The chi-squared kernel is computed between each pair of rows in X and Y. X\n and Y have to be non-negative. 
This kernel is most commonly applied to\n histograms.\n\n The chi-squared kernel is given by::\n\n k(x, y) = -Sum [(x - y)^2 / (x + y)]\n\n It can be interpreted as a weighted difference per entry.\n\n Read more in the :ref:`User Guide `.\n\n Notes\n -----\n As the negative of a distance, this kernel is only conditionally positive\n definite.\n\n\n Parameters\n ----------\n X : array-like of shape (n_samples_X, n_features)\n\n Y : ndarray of shape (n_samples_Y, n_features), default=None\n If `None`, uses `Y=X`.\n\n Returns\n -------\n kernel_matrix : ndarray of shape (n_samples_X, n_samples_Y)\n\n See Also\n --------\n chi2_kernel : The exponentiated version of the kernel, which is usually\n preferable.\n sklearn.kernel_approximation.AdditiveChi2Sampler : A Fourier approximation\n to this kernel.\n\n References\n ----------\n * Zhang, J. and Marszalek, M. and Lazebnik, S. and Schmid, C.\n Local features and kernels for classification of texture and object\n categories: A comprehensive study\n International Journal of Computer Vision 2007\n https://research.microsoft.com/en-us/um/people/manik/projects/trade-off/papers/ZhangIJCV06.pdf\n \"\"\"\n if issparse(X) or issparse(Y):\n raise ValueError('additive_chi2 does not support sparse matrices.')\n (X, Y) = check_pairwise_arrays(X, Y)\n if (X < 0).any():\n raise ValueError('X contains negative values.')\n if Y is not X and (Y < 0).any():\n raise ValueError('Y contains negative values.')\n result = np.zeros((X.shape[0], Y.shape[0]), dtype=X.dtype)\n _chi2_kernel_fast(X, Y, result)\n return result" }, { @@ -124298,6 +133734,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples_X, n_features)", "description": "" + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -124308,13 +133748,17 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples_Y, n_features)", "description": "" + }, + "refined_type": { + "kind": "EnumType", + "values": [] } } ], "results": [], "is_public": true, - "description": "Set X and Y appropriately and checks inputs for paired distances.\n\nAll paired distance metrics should use this function first to assert that the given parameters are correct and safe to use. Specifically, this function first ensures that both X and Y are arrays, then checks that they are at least two dimensional while ensuring that their elements are floats. Finally, the function checks that the size of the dimensions of the two arrays are equal.", - "docstring": "Set X and Y appropriately and checks inputs for paired distances.\n\nAll paired distance metrics should use this function first to assert that\nthe given parameters are correct and safe to use.\n\nSpecifically, this function first ensures that both X and Y are arrays,\nthen checks that they are at least two dimensional while ensuring that\ntheir elements are floats. 
Finally, the function checks that the size\nof the dimensions of the two arrays are equal.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples_X, n_features)\n\nY : {array-like, sparse matrix} of shape (n_samples_Y, n_features)\n\nReturns\n-------\nsafe_X : {array-like, sparse matrix} of shape (n_samples_X, n_features)\n An array equal to X, guaranteed to be a numpy array.\n\nsafe_Y : {array-like, sparse matrix} of shape (n_samples_Y, n_features)\n An array equal to Y if Y was not None, guaranteed to be a numpy array.\n If Y was None, safe_Y will be a pointer to X.", + "description": "Set X and Y appropriately and checks inputs for paired distances.\n\nAll paired distance metrics should use this function first to assert that\nthe given parameters are correct and safe to use.\n\nSpecifically, this function first ensures that both X and Y are arrays,\nthen checks that they are at least two dimensional while ensuring that\ntheir elements are floats. Finally, the function checks that the size\nof the dimensions of the two arrays are equal.", + "docstring": "Set X and Y appropriately and checks inputs for paired distances.\n\n All paired distance metrics should use this function first to assert that\n the given parameters are correct and safe to use.\n\n Specifically, this function first ensures that both X and Y are arrays,\n then checks that they are at least two dimensional while ensuring that\n their elements are floats. Finally, the function checks that the size\n of the dimensions of the two arrays are equal.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples_X, n_features)\n\n Y : {array-like, sparse matrix} of shape (n_samples_Y, n_features)\n\n Returns\n -------\n safe_X : {array-like, sparse matrix} of shape (n_samples_X, n_features)\n An array equal to X, guaranteed to be a numpy array.\n\n safe_Y : {array-like, sparse matrix} of shape (n_samples_Y, n_features)\n An array equal to Y if Y was not None, guaranteed to be a numpy array.\n If Y was None, safe_Y will be a pointer to X.\n\n ", "source_code": "\ndef check_paired_arrays(X, Y):\n \"\"\"Set X and Y appropriately and checks inputs for paired distances.\n\n All paired distance metrics should use this function first to assert that\n the given parameters are correct and safe to use.\n\n Specifically, this function first ensures that both X and Y are arrays,\n then checks that they are at least two dimensional while ensuring that\n their elements are floats. Finally, the function checks that the size\n of the dimensions of the two arrays are equal.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples_X, n_features)\n\n Y : {array-like, sparse matrix} of shape (n_samples_Y, n_features)\n\n Returns\n -------\n safe_X : {array-like, sparse matrix} of shape (n_samples_X, n_features)\n An array equal to X, guaranteed to be a numpy array.\n\n safe_Y : {array-like, sparse matrix} of shape (n_samples_Y, n_features)\n An array equal to Y if Y was not None, guaranteed to be a numpy array.\n If Y was None, safe_Y will be a pointer to X.\n\n \"\"\"\n (X, Y) = check_pairwise_arrays(X, Y)\n if X.shape != Y.shape:\n raise ValueError('X and Y should be of same shape. They were respectively %r and %r long.' 
% (X.shape, Y.shape))\n return X, Y" }, { @@ -124332,6 +133776,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples_X, n_features)", "description": "" + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -124342,6 +133790,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples_Y, n_features)", "description": "" + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -124352,7 +133804,8 @@ "docstring": { "type": "bool, default=False", "description": "True if X is to be treated as precomputed distances to the samples in\nY." - } + }, + "refined_type": {} }, { "name": "dtype", @@ -124362,7 +133815,8 @@ "docstring": { "type": "str, type, list of type, default=None", "description": "Data type required for X and Y. If None, the dtype will be an\nappropriate float type selected by _return_float_dtype.\n\n.. versionadded:: 0.18" - } + }, + "refined_type": {} }, { "name": "accept_sparse", @@ -124372,7 +133826,8 @@ "docstring": { "type": "str, bool or list/tuple of str, default='csr'", "description": "String[s] representing allowed sparse matrix formats, such as 'csc',\n'csr', etc. If the input is sparse but not in the allowed format,\nit will be converted to the first listed format. True allows the input\nto be any format. False means that a sparse matrix input will\nraise an error." - } + }, + "refined_type": {} }, { "name": "force_all_finite", @@ -124382,7 +133837,8 @@ "docstring": { "type": "bool or 'allow-nan', default=True", "description": "Whether to raise an error on np.inf, np.nan, pd.NA in array. The\npossibilities are:\n\n- True: Force all values of array to be finite.\n- False: accepts np.inf, np.nan, pd.NA in array.\n- 'allow-nan': accepts only np.nan and pd.NA values in array. Values\n cannot be infinite.\n\n.. versionadded:: 0.22\n ``force_all_finite`` accepts the string ``'allow-nan'``.\n\n.. versionchanged:: 0.23\n Accepts `pd.NA` and converts it into `np.nan`." - } + }, + "refined_type": {} }, { "name": "copy", @@ -124392,13 +133848,14 @@ "docstring": { "type": "bool, default=False", "description": "Whether a forced copy will be triggered. If copy=False, a copy might\nbe triggered by a conversion.\n\n.. versionadded:: 0.22" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Set X and Y appropriately and checks inputs.\n\nIf Y is None, it is set as a pointer to X (i.e. not a copy). If Y is given, this does not happen. All distance metrics should use this function first to assert that the given parameters are correct and safe to use. Specifically, this function first ensures that both X and Y are arrays, then checks that they are at least two dimensional while ensuring that their elements are floats (or dtype if provided). Finally, the function checks that the size of the second dimension of the two arrays is equal, or the equivalent check for a precomputed distance matrix.", - "docstring": "Set X and Y appropriately and checks inputs.\n\nIf Y is None, it is set as a pointer to X (i.e. not a copy).\nIf Y is given, this does not happen.\nAll distance metrics should use this function first to assert that the\ngiven parameters are correct and safe to use.\n\nSpecifically, this function first ensures that both X and Y are arrays,\nthen checks that they are at least two dimensional while ensuring that\ntheir elements are floats (or dtype if provided). 
Finally, the function\nchecks that the size of the second dimension of the two arrays is equal, or\nthe equivalent check for a precomputed distance matrix.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples_X, n_features)\n\nY : {array-like, sparse matrix} of shape (n_samples_Y, n_features)\n\nprecomputed : bool, default=False\n True if X is to be treated as precomputed distances to the samples in\n Y.\n\ndtype : str, type, list of type, default=None\n Data type required for X and Y. If None, the dtype will be an\n appropriate float type selected by _return_float_dtype.\n\n .. versionadded:: 0.18\n\naccept_sparse : str, bool or list/tuple of str, default='csr'\n String[s] representing allowed sparse matrix formats, such as 'csc',\n 'csr', etc. If the input is sparse but not in the allowed format,\n it will be converted to the first listed format. True allows the input\n to be any format. False means that a sparse matrix input will\n raise an error.\n\nforce_all_finite : bool or 'allow-nan', default=True\n Whether to raise an error on np.inf, np.nan, pd.NA in array. The\n possibilities are:\n\n - True: Force all values of array to be finite.\n - False: accepts np.inf, np.nan, pd.NA in array.\n - 'allow-nan': accepts only np.nan and pd.NA values in array. Values\n cannot be infinite.\n\n .. versionadded:: 0.22\n ``force_all_finite`` accepts the string ``'allow-nan'``.\n\n .. versionchanged:: 0.23\n Accepts `pd.NA` and converts it into `np.nan`.\n\ncopy : bool, default=False\n Whether a forced copy will be triggered. If copy=False, a copy might\n be triggered by a conversion.\n\n .. versionadded:: 0.22\n\nReturns\n-------\nsafe_X : {array-like, sparse matrix} of shape (n_samples_X, n_features)\n An array equal to X, guaranteed to be a numpy array.\n\nsafe_Y : {array-like, sparse matrix} of shape (n_samples_Y, n_features)\n An array equal to Y if Y was not None, guaranteed to be a numpy array.\n If Y was None, safe_Y will be a pointer to X.", + "description": "Set X and Y appropriately and checks inputs.\n\nIf Y is None, it is set as a pointer to X (i.e. not a copy).\nIf Y is given, this does not happen.\nAll distance metrics should use this function first to assert that the\ngiven parameters are correct and safe to use.\n\nSpecifically, this function first ensures that both X and Y are arrays,\nthen checks that they are at least two dimensional while ensuring that\ntheir elements are floats (or dtype if provided). Finally, the function\nchecks that the size of the second dimension of the two arrays is equal, or\nthe equivalent check for a precomputed distance matrix.", + "docstring": "Set X and Y appropriately and checks inputs.\n\n If Y is None, it is set as a pointer to X (i.e. not a copy).\n If Y is given, this does not happen.\n All distance metrics should use this function first to assert that the\n given parameters are correct and safe to use.\n\n Specifically, this function first ensures that both X and Y are arrays,\n then checks that they are at least two dimensional while ensuring that\n their elements are floats (or dtype if provided). 
Finally, the function\n checks that the size of the second dimension of the two arrays is equal, or\n the equivalent check for a precomputed distance matrix.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples_X, n_features)\n\n Y : {array-like, sparse matrix} of shape (n_samples_Y, n_features)\n\n precomputed : bool, default=False\n True if X is to be treated as precomputed distances to the samples in\n Y.\n\n dtype : str, type, list of type, default=None\n Data type required for X and Y. If None, the dtype will be an\n appropriate float type selected by _return_float_dtype.\n\n .. versionadded:: 0.18\n\n accept_sparse : str, bool or list/tuple of str, default='csr'\n String[s] representing allowed sparse matrix formats, such as 'csc',\n 'csr', etc. If the input is sparse but not in the allowed format,\n it will be converted to the first listed format. True allows the input\n to be any format. False means that a sparse matrix input will\n raise an error.\n\n force_all_finite : bool or 'allow-nan', default=True\n Whether to raise an error on np.inf, np.nan, pd.NA in array. The\n possibilities are:\n\n - True: Force all values of array to be finite.\n - False: accepts np.inf, np.nan, pd.NA in array.\n - 'allow-nan': accepts only np.nan and pd.NA values in array. Values\n cannot be infinite.\n\n .. versionadded:: 0.22\n ``force_all_finite`` accepts the string ``'allow-nan'``.\n\n .. versionchanged:: 0.23\n Accepts `pd.NA` and converts it into `np.nan`.\n\n copy : bool, default=False\n Whether a forced copy will be triggered. If copy=False, a copy might\n be triggered by a conversion.\n\n .. versionadded:: 0.22\n\n Returns\n -------\n safe_X : {array-like, sparse matrix} of shape (n_samples_X, n_features)\n An array equal to X, guaranteed to be a numpy array.\n\n safe_Y : {array-like, sparse matrix} of shape (n_samples_Y, n_features)\n An array equal to Y if Y was not None, guaranteed to be a numpy array.\n If Y was None, safe_Y will be a pointer to X.\n\n ", "source_code": "\ndef check_pairwise_arrays(X, Y, *, precomputed=False, dtype=None, accept_sparse='csr', force_all_finite=True, copy=False):\n \"\"\"Set X and Y appropriately and checks inputs.\n\n If Y is None, it is set as a pointer to X (i.e. not a copy).\n If Y is given, this does not happen.\n All distance metrics should use this function first to assert that the\n given parameters are correct and safe to use.\n\n Specifically, this function first ensures that both X and Y are arrays,\n then checks that they are at least two dimensional while ensuring that\n their elements are floats (or dtype if provided). Finally, the function\n checks that the size of the second dimension of the two arrays is equal, or\n the equivalent check for a precomputed distance matrix.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples_X, n_features)\n\n Y : {array-like, sparse matrix} of shape (n_samples_Y, n_features)\n\n precomputed : bool, default=False\n True if X is to be treated as precomputed distances to the samples in\n Y.\n\n dtype : str, type, list of type, default=None\n Data type required for X and Y. If None, the dtype will be an\n appropriate float type selected by _return_float_dtype.\n\n .. versionadded:: 0.18\n\n accept_sparse : str, bool or list/tuple of str, default='csr'\n String[s] representing allowed sparse matrix formats, such as 'csc',\n 'csr', etc. If the input is sparse but not in the allowed format,\n it will be converted to the first listed format. 
True allows the input\n to be any format. False means that a sparse matrix input will\n raise an error.\n\n force_all_finite : bool or 'allow-nan', default=True\n Whether to raise an error on np.inf, np.nan, pd.NA in array. The\n possibilities are:\n\n - True: Force all values of array to be finite.\n - False: accepts np.inf, np.nan, pd.NA in array.\n - 'allow-nan': accepts only np.nan and pd.NA values in array. Values\n cannot be infinite.\n\n .. versionadded:: 0.22\n ``force_all_finite`` accepts the string ``'allow-nan'``.\n\n .. versionchanged:: 0.23\n Accepts `pd.NA` and converts it into `np.nan`.\n\n copy : bool, default=False\n Whether a forced copy will be triggered. If copy=False, a copy might\n be triggered by a conversion.\n\n .. versionadded:: 0.22\n\n Returns\n -------\n safe_X : {array-like, sparse matrix} of shape (n_samples_X, n_features)\n An array equal to X, guaranteed to be a numpy array.\n\n safe_Y : {array-like, sparse matrix} of shape (n_samples_Y, n_features)\n An array equal to Y if Y was not None, guaranteed to be a numpy array.\n If Y was None, safe_Y will be a pointer to X.\n\n \"\"\"\n (X, Y, dtype_float) = _return_float_dtype(X, Y)\n estimator = 'check_pairwise_arrays'\n if dtype is None:\n dtype = dtype_float\n if Y is X or Y is None:\n X = Y = check_array(X, accept_sparse=accept_sparse, dtype=dtype, copy=copy, force_all_finite=force_all_finite, estimator=estimator)\n else:\n X = check_array(X, accept_sparse=accept_sparse, dtype=dtype, copy=copy, force_all_finite=force_all_finite, estimator=estimator)\n Y = check_array(Y, accept_sparse=accept_sparse, dtype=dtype, copy=copy, force_all_finite=force_all_finite, estimator=estimator)\n if precomputed:\n if X.shape[1] != Y.shape[0]:\n raise ValueError('Precomputed metric requires shape (n_queries, n_indexed). Got (%d, %d) for %d indexed.' % (X.shape[0], X.shape[1], Y.shape[0]))\n elif X.shape[1] != Y.shape[1]:\n raise ValueError('Incompatible dimension for X and Y matrices: X.shape[1] == %d while Y.shape[1] == %d' % (X.shape[1], Y.shape[1]))\n return X, Y" }, { @@ -124416,7 +133873,8 @@ "docstring": { "type": "array-like of shape (n_samples_X, n_features)", "description": "" - } + }, + "refined_type": {} }, { "name": "Y", @@ -124426,7 +133884,8 @@ "docstring": { "type": "ndarray of shape (n_samples_Y, n_features), default=None", "description": "" - } + }, + "refined_type": {} }, { "name": "gamma", @@ -124436,13 +133895,14 @@ "docstring": { "type": "float, default=1.", "description": "Scaling parameter of the chi2 kernel." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Computes the exponential chi-squared kernel X and Y.\n\nThe chi-squared kernel is computed between each pair of rows in X and Y. X and Y have to be non-negative. This kernel is most commonly applied to histograms. The chi-squared kernel is given by:: k(x, y) = exp(-gamma Sum [(x - y)^2 / (x + y)]) It can be interpreted as a weighted difference per entry. Read more in the :ref:`User Guide `.", - "docstring": "Computes the exponential chi-squared kernel X and Y.\n\nThe chi-squared kernel is computed between each pair of rows in X and Y. X\nand Y have to be non-negative. 
This kernel is most commonly applied to\nhistograms.\n\nThe chi-squared kernel is given by::\n\n k(x, y) = exp(-gamma Sum [(x - y)^2 / (x + y)])\n\nIt can be interpreted as a weighted difference per entry.\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\nX : array-like of shape (n_samples_X, n_features)\n\nY : ndarray of shape (n_samples_Y, n_features), default=None\n\ngamma : float, default=1.\n Scaling parameter of the chi2 kernel.\n\nReturns\n-------\nkernel_matrix : ndarray of shape (n_samples_X, n_samples_Y)\n\nSee Also\n--------\nadditive_chi2_kernel : The additive version of this kernel.\nsklearn.kernel_approximation.AdditiveChi2Sampler : A Fourier approximation\n to the additive version of this kernel.\n\nReferences\n----------\n* Zhang, J. and Marszalek, M. and Lazebnik, S. and Schmid, C.\n Local features and kernels for classification of texture and object\n categories: A comprehensive study\n International Journal of Computer Vision 2007\n https://research.microsoft.com/en-us/um/people/manik/projects/trade-off/papers/ZhangIJCV06.pdf", + "description": "Computes the exponential chi-squared kernel X and Y.\n\nThe chi-squared kernel is computed between each pair of rows in X and Y. X\nand Y have to be non-negative. This kernel is most commonly applied to\nhistograms.\n\nThe chi-squared kernel is given by::\n\n k(x, y) = exp(-gamma Sum [(x - y)^2 / (x + y)])\n\nIt can be interpreted as a weighted difference per entry.\n\nRead more in the :ref:`User Guide `.", + "docstring": "Computes the exponential chi-squared kernel X and Y.\n\n The chi-squared kernel is computed between each pair of rows in X and Y. X\n and Y have to be non-negative. This kernel is most commonly applied to\n histograms.\n\n The chi-squared kernel is given by::\n\n k(x, y) = exp(-gamma Sum [(x - y)^2 / (x + y)])\n\n It can be interpreted as a weighted difference per entry.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n X : array-like of shape (n_samples_X, n_features)\n\n Y : ndarray of shape (n_samples_Y, n_features), default=None\n\n gamma : float, default=1.\n Scaling parameter of the chi2 kernel.\n\n Returns\n -------\n kernel_matrix : ndarray of shape (n_samples_X, n_samples_Y)\n\n See Also\n --------\n additive_chi2_kernel : The additive version of this kernel.\n sklearn.kernel_approximation.AdditiveChi2Sampler : A Fourier approximation\n to the additive version of this kernel.\n\n References\n ----------\n * Zhang, J. and Marszalek, M. and Lazebnik, S. and Schmid, C.\n Local features and kernels for classification of texture and object\n categories: A comprehensive study\n International Journal of Computer Vision 2007\n https://research.microsoft.com/en-us/um/people/manik/projects/trade-off/papers/ZhangIJCV06.pdf\n ", "source_code": "\ndef chi2_kernel(X, Y=None, gamma=1.0):\n \"\"\"Computes the exponential chi-squared kernel X and Y.\n\n The chi-squared kernel is computed between each pair of rows in X and Y. X\n and Y have to be non-negative. 
This kernel is most commonly applied to\n histograms.\n\n The chi-squared kernel is given by::\n\n k(x, y) = exp(-gamma Sum [(x - y)^2 / (x + y)])\n\n It can be interpreted as a weighted difference per entry.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n X : array-like of shape (n_samples_X, n_features)\n\n Y : ndarray of shape (n_samples_Y, n_features), default=None\n\n gamma : float, default=1.\n Scaling parameter of the chi2 kernel.\n\n Returns\n -------\n kernel_matrix : ndarray of shape (n_samples_X, n_samples_Y)\n\n See Also\n --------\n additive_chi2_kernel : The additive version of this kernel.\n sklearn.kernel_approximation.AdditiveChi2Sampler : A Fourier approximation\n to the additive version of this kernel.\n\n References\n ----------\n * Zhang, J. and Marszalek, M. and Lazebnik, S. and Schmid, C.\n Local features and kernels for classification of texture and object\n categories: A comprehensive study\n International Journal of Computer Vision 2007\n https://research.microsoft.com/en-us/um/people/manik/projects/trade-off/papers/ZhangIJCV06.pdf\n \"\"\"\n K = additive_chi2_kernel(X, Y)\n K *= gamma\n return np.exp(K, K)" }, { @@ -124460,6 +133920,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples_X, n_features)", "description": "Matrix `X`." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -124470,13 +133934,17 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples_Y, n_features), default=None", "description": "Matrix `Y`." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } } ], "results": [], "is_public": true, - "description": "Compute cosine distance between samples in X and Y.\n\nCosine distance is defined as 1.0 minus the cosine similarity. 
Read more in the :ref:`User Guide `.", - "docstring": "Compute cosine distance between samples in X and Y.\n\nCosine distance is defined as 1.0 minus the cosine similarity.\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples_X, n_features)\n Matrix `X`.\n\nY : {array-like, sparse matrix} of shape (n_samples_Y, n_features), default=None\n Matrix `Y`.\n\nReturns\n-------\ndistance matrix : ndarray of shape (n_samples_X, n_samples_Y)\n\nSee Also\n--------\ncosine_similarity\nscipy.spatial.distance.cosine : Dense matrices only.", + "description": "Compute cosine distance between samples in X and Y.\n\nCosine distance is defined as 1.0 minus the cosine similarity.\n\nRead more in the :ref:`User Guide `.", + "docstring": "Compute cosine distance between samples in X and Y.\n\n Cosine distance is defined as 1.0 minus the cosine similarity.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples_X, n_features)\n Matrix `X`.\n\n Y : {array-like, sparse matrix} of shape (n_samples_Y, n_features), default=None\n Matrix `Y`.\n\n Returns\n -------\n distance matrix : ndarray of shape (n_samples_X, n_samples_Y)\n\n See Also\n --------\n cosine_similarity\n scipy.spatial.distance.cosine : Dense matrices only.\n ", "source_code": "\ndef cosine_distances(X, Y=None):\n \"\"\"Compute cosine distance between samples in X and Y.\n\n Cosine distance is defined as 1.0 minus the cosine similarity.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples_X, n_features)\n Matrix `X`.\n\n Y : {array-like, sparse matrix} of shape (n_samples_Y, n_features), default=None\n Matrix `Y`.\n\n Returns\n -------\n distance matrix : ndarray of shape (n_samples_X, n_samples_Y)\n\n See Also\n --------\n cosine_similarity\n scipy.spatial.distance.cosine : Dense matrices only.\n \"\"\"\n S = cosine_similarity(X, Y)\n S *= -1\n S += 1\n np.clip(S, 0, 2, out=S)\n if X is Y or Y is None:\n S[np.diag_indices_from(S)] = 0.0\n return S" }, { @@ -124494,6 +133962,10 @@ "docstring": { "type": "{ndarray, sparse matrix} of shape (n_samples_X, n_features)", "description": "Input data." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -124504,6 +133976,10 @@ "docstring": { "type": "{ndarray, sparse matrix} of shape (n_samples_Y, n_features), default=None", "description": "Input data. If ``None``, the output will be the pairwise\nsimilarities between all samples in ``X``." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -124514,13 +133990,14 @@ "docstring": { "type": "bool, default=True", "description": "Whether to return dense output even when the input is sparse. If\n``False``, the output is sparse if both input arrays are sparse.\n\n.. versionadded:: 0.17\n parameter ``dense_output`` for dense output." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Compute cosine similarity between samples in X and Y.\n\nCosine similarity, or the cosine kernel, computes similarity as the normalized dot product of X and Y: K(X, Y) = / (||X||*||Y||) On L2-normalized data, this function is equivalent to linear_kernel. 
Read more in the :ref:`User Guide `.", - "docstring": "Compute cosine similarity between samples in X and Y.\n\nCosine similarity, or the cosine kernel, computes similarity as the\nnormalized dot product of X and Y:\n\n K(X, Y) = / (||X||*||Y||)\n\nOn L2-normalized data, this function is equivalent to linear_kernel.\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\nX : {ndarray, sparse matrix} of shape (n_samples_X, n_features)\n Input data.\n\nY : {ndarray, sparse matrix} of shape (n_samples_Y, n_features), default=None\n Input data. If ``None``, the output will be the pairwise\n similarities between all samples in ``X``.\n\ndense_output : bool, default=True\n Whether to return dense output even when the input is sparse. If\n ``False``, the output is sparse if both input arrays are sparse.\n\n .. versionadded:: 0.17\n parameter ``dense_output`` for dense output.\n\nReturns\n-------\nkernel matrix : ndarray of shape (n_samples_X, n_samples_Y)", + "description": "Compute cosine similarity between samples in X and Y.\n\nCosine similarity, or the cosine kernel, computes similarity as the\nnormalized dot product of X and Y:\n\n K(X, Y) = / (||X||*||Y||)\n\nOn L2-normalized data, this function is equivalent to linear_kernel.\n\nRead more in the :ref:`User Guide `.", + "docstring": "Compute cosine similarity between samples in X and Y.\n\n Cosine similarity, or the cosine kernel, computes similarity as the\n normalized dot product of X and Y:\n\n K(X, Y) = / (||X||*||Y||)\n\n On L2-normalized data, this function is equivalent to linear_kernel.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n X : {ndarray, sparse matrix} of shape (n_samples_X, n_features)\n Input data.\n\n Y : {ndarray, sparse matrix} of shape (n_samples_Y, n_features), default=None\n Input data. If ``None``, the output will be the pairwise\n similarities between all samples in ``X``.\n\n dense_output : bool, default=True\n Whether to return dense output even when the input is sparse. If\n ``False``, the output is sparse if both input arrays are sparse.\n\n .. versionadded:: 0.17\n parameter ``dense_output`` for dense output.\n\n Returns\n -------\n kernel matrix : ndarray of shape (n_samples_X, n_samples_Y)\n ", "source_code": "\ndef cosine_similarity(X, Y=None, dense_output=True):\n \"\"\"Compute cosine similarity between samples in X and Y.\n\n Cosine similarity, or the cosine kernel, computes similarity as the\n normalized dot product of X and Y:\n\n K(X, Y) = / (||X||*||Y||)\n\n On L2-normalized data, this function is equivalent to linear_kernel.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n X : {ndarray, sparse matrix} of shape (n_samples_X, n_features)\n Input data.\n\n Y : {ndarray, sparse matrix} of shape (n_samples_Y, n_features), default=None\n Input data. If ``None``, the output will be the pairwise\n similarities between all samples in ``X``.\n\n dense_output : bool, default=True\n Whether to return dense output even when the input is sparse. If\n ``False``, the output is sparse if both input arrays are sparse.\n\n .. 
versionadded:: 0.17\n parameter ``dense_output`` for dense output.\n\n Returns\n -------\n kernel matrix : ndarray of shape (n_samples_X, n_samples_Y)\n \"\"\"\n (X, Y) = check_pairwise_arrays(X, Y)\n X_normalized = normalize(X, copy=True)\n if X is Y:\n Y_normalized = X_normalized\n else:\n Y_normalized = normalize(Y, copy=True)\n K = safe_sparse_dot(X_normalized, Y_normalized.T, dense_output=dense_output)\n return K" }, { @@ -124532,8 +134009,8 @@ "parameters": [], "results": [], "is_public": true, - "description": "Valid metrics for pairwise_distances.\n\nThis function simply returns the valid pairwise distance metrics. It exists to allow for a description of the mapping for each of the valid strings. The valid distance metrics, and the function they map to, are: =============== ======================================== metric Function =============== ======================================== 'cityblock' metrics.pairwise.manhattan_distances 'cosine' metrics.pairwise.cosine_distances 'euclidean' metrics.pairwise.euclidean_distances 'haversine' metrics.pairwise.haversine_distances 'l1' metrics.pairwise.manhattan_distances 'l2' metrics.pairwise.euclidean_distances 'manhattan' metrics.pairwise.manhattan_distances 'nan_euclidean' metrics.pairwise.nan_euclidean_distances =============== ======================================== Read more in the :ref:`User Guide `.", - "docstring": "Valid metrics for pairwise_distances.\n\nThis function simply returns the valid pairwise distance metrics.\nIt exists to allow for a description of the mapping for\neach of the valid strings.\n\nThe valid distance metrics, and the function they map to, are:\n\n=============== ========================================\nmetric Function\n=============== ========================================\n'cityblock' metrics.pairwise.manhattan_distances\n'cosine' metrics.pairwise.cosine_distances\n'euclidean' metrics.pairwise.euclidean_distances\n'haversine' metrics.pairwise.haversine_distances\n'l1' metrics.pairwise.manhattan_distances\n'l2' metrics.pairwise.euclidean_distances\n'manhattan' metrics.pairwise.manhattan_distances\n'nan_euclidean' metrics.pairwise.nan_euclidean_distances\n=============== ========================================\n\nRead more in the :ref:`User Guide `.", + "description": "Valid metrics for pairwise_distances.\n\nThis function simply returns the valid pairwise distance metrics.\nIt exists to allow for a description of the mapping for\neach of the valid strings.\n\nThe valid distance metrics, and the function they map to, are:\n\n=============== ========================================\nmetric Function\n=============== ========================================\n'cityblock' metrics.pairwise.manhattan_distances\n'cosine' metrics.pairwise.cosine_distances\n'euclidean' metrics.pairwise.euclidean_distances\n'haversine' metrics.pairwise.haversine_distances\n'l1' metrics.pairwise.manhattan_distances\n'l2' metrics.pairwise.euclidean_distances\n'manhattan' metrics.pairwise.manhattan_distances\n'nan_euclidean' metrics.pairwise.nan_euclidean_distances\n=============== ========================================\n\nRead more in the :ref:`User Guide `.", + "docstring": "Valid metrics for pairwise_distances.\n\n This function simply returns the valid pairwise distance metrics.\n It exists to allow for a description of the mapping for\n each of the valid strings.\n\n The valid distance metrics, and the function they map to, are:\n\n =============== ========================================\n metric Function\n =============== 
========================================\n 'cityblock' metrics.pairwise.manhattan_distances\n 'cosine' metrics.pairwise.cosine_distances\n 'euclidean' metrics.pairwise.euclidean_distances\n 'haversine' metrics.pairwise.haversine_distances\n 'l1' metrics.pairwise.manhattan_distances\n 'l2' metrics.pairwise.euclidean_distances\n 'manhattan' metrics.pairwise.manhattan_distances\n 'nan_euclidean' metrics.pairwise.nan_euclidean_distances\n =============== ========================================\n\n Read more in the :ref:`User Guide `.\n\n ", "source_code": "\ndef distance_metrics():\n \"\"\"Valid metrics for pairwise_distances.\n\n This function simply returns the valid pairwise distance metrics.\n It exists to allow for a description of the mapping for\n each of the valid strings.\n\n The valid distance metrics, and the function they map to, are:\n\n =============== ========================================\n metric Function\n =============== ========================================\n 'cityblock' metrics.pairwise.manhattan_distances\n 'cosine' metrics.pairwise.cosine_distances\n 'euclidean' metrics.pairwise.euclidean_distances\n 'haversine' metrics.pairwise.haversine_distances\n 'l1' metrics.pairwise.manhattan_distances\n 'l2' metrics.pairwise.euclidean_distances\n 'manhattan' metrics.pairwise.manhattan_distances\n 'nan_euclidean' metrics.pairwise.nan_euclidean_distances\n =============== ========================================\n\n Read more in the :ref:`User Guide `.\n\n \"\"\"\n return PAIRWISE_DISTANCE_FUNCTIONS" }, { @@ -124550,7 +134027,11 @@ "assigned_by": "POSITION_OR_NAME", "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples_X, n_features)", - "description": "" + "description": "An array where each row is a sample and each column is a feature." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -124560,7 +134041,11 @@ "assigned_by": "POSITION_OR_NAME", "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples_Y, n_features), default=None", - "description": "If `None`, uses `Y=X`." + "description": "An array where each row is a sample and each column is a feature.\nIf `None`, method uses `Y=X`." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -124571,7 +134056,8 @@ "docstring": { "type": "array-like of shape (n_samples_Y,) or (n_samples_Y, 1) or (1, n_samples_Y), default=None", "description": "Pre-computed dot-products of vectors in Y (e.g.,\n``(Y**2).sum(axis=1)``)\nMay be ignored in some cases, see the note below." - } + }, + "refined_type": {} }, { "name": "squared", @@ -124581,7 +134067,8 @@ "docstring": { "type": "bool, default=False", "description": "Return squared Euclidean distances." - } + }, + "refined_type": {} }, { "name": "X_norm_squared", @@ -124591,14 +134078,15 @@ "docstring": { "type": "array-like of shape (n_samples_X,) or (n_samples_X, 1) or (1, n_samples_X), default=None", "description": "Pre-computed dot-products of vectors in X (e.g.,\n``(X**2).sum(axis=1)``)\nMay be ignored in some cases, see the note below." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Considering the rows of X (and Y=X) as vectors, compute the distance matrix between each pair of vectors.\n\nFor efficiency reasons, the euclidean distance between a pair of row vector x and y is computed as:: dist(x, y) = sqrt(dot(x, x) - 2 * dot(x, y) + dot(y, y)) This formulation has two advantages over other ways of computing distances. 
First, it is computationally efficient when dealing with sparse data. Second, if one argument varies but the other remains unchanged, then `dot(x, x)` and/or `dot(y, y)` can be pre-computed. However, this is not the most precise way of doing this computation, because this equation potentially suffers from \"catastrophic cancellation\". Also, the distance matrix returned by this function may not be exactly symmetric as required by, e.g., ``scipy.spatial.distance`` functions. Read more in the :ref:`User Guide `.", - "docstring": "Considering the rows of X (and Y=X) as vectors, compute the\ndistance matrix between each pair of vectors.\n\nFor efficiency reasons, the euclidean distance between a pair of row\nvector x and y is computed as::\n\n dist(x, y) = sqrt(dot(x, x) - 2 * dot(x, y) + dot(y, y))\n\nThis formulation has two advantages over other ways of computing distances.\nFirst, it is computationally efficient when dealing with sparse data.\nSecond, if one argument varies but the other remains unchanged, then\n`dot(x, x)` and/or `dot(y, y)` can be pre-computed.\n\nHowever, this is not the most precise way of doing this computation,\nbecause this equation potentially suffers from \"catastrophic cancellation\".\nAlso, the distance matrix returned by this function may not be exactly\nsymmetric as required by, e.g., ``scipy.spatial.distance`` functions.\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples_X, n_features)\n\nY : {array-like, sparse matrix} of shape (n_samples_Y, n_features), default=None\n If `None`, uses `Y=X`.\n\nY_norm_squared : array-like of shape (n_samples_Y,) or (n_samples_Y, 1) or (1, n_samples_Y), default=None\n Pre-computed dot-products of vectors in Y (e.g.,\n ``(Y**2).sum(axis=1)``)\n May be ignored in some cases, see the note below.\n\nsquared : bool, default=False\n Return squared Euclidean distances.\n\nX_norm_squared : array-like of shape (n_samples_X,) or (n_samples_X, 1) or (1, n_samples_X), default=None\n Pre-computed dot-products of vectors in X (e.g.,\n ``(X**2).sum(axis=1)``)\n May be ignored in some cases, see the note below.\n\nNotes\n-----\nTo achieve better accuracy, `X_norm_squared`\u00a0and `Y_norm_squared` may be\nunused if they are passed as ``float32``.\n\nReturns\n-------\ndistances : ndarray of shape (n_samples_X, n_samples_Y)\n\nSee Also\n--------\npaired_distances : Distances betweens pairs of elements of X and Y.\n\nExamples\n--------\n>>> from sklearn.metrics.pairwise import euclidean_distances\n>>> X = [[0, 1], [1, 1]]\n>>> # distance between rows of X\n>>> euclidean_distances(X, X)\narray([[0., 1.],\n [1., 0.]])\n>>> # get distance to origin\n>>> euclidean_distances(X, [[0, 0]])\narray([[1. 
],\n [1.41421356]])", - "source_code": "\ndef euclidean_distances(X, Y=None, *, Y_norm_squared=None, squared=False, X_norm_squared=None):\n \"\"\"\n Considering the rows of X (and Y=X) as vectors, compute the\n distance matrix between each pair of vectors.\n\n For efficiency reasons, the euclidean distance between a pair of row\n vector x and y is computed as::\n\n dist(x, y) = sqrt(dot(x, x) - 2 * dot(x, y) + dot(y, y))\n\n This formulation has two advantages over other ways of computing distances.\n First, it is computationally efficient when dealing with sparse data.\n Second, if one argument varies but the other remains unchanged, then\n `dot(x, x)` and/or `dot(y, y)` can be pre-computed.\n\n However, this is not the most precise way of doing this computation,\n because this equation potentially suffers from \"catastrophic cancellation\".\n Also, the distance matrix returned by this function may not be exactly\n symmetric as required by, e.g., ``scipy.spatial.distance`` functions.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples_X, n_features)\n\n Y : {array-like, sparse matrix} of shape (n_samples_Y, n_features), default=None\n If `None`, uses `Y=X`.\n\n Y_norm_squared : array-like of shape (n_samples_Y,) or (n_samples_Y, 1) or (1, n_samples_Y), default=None\n Pre-computed dot-products of vectors in Y (e.g.,\n ``(Y**2).sum(axis=1)``)\n May be ignored in some cases, see the note below.\n\n squared : bool, default=False\n Return squared Euclidean distances.\n\n X_norm_squared : array-like of shape (n_samples_X,) or (n_samples_X, 1) or (1, n_samples_X), default=None\n Pre-computed dot-products of vectors in X (e.g.,\n ``(X**2).sum(axis=1)``)\n May be ignored in some cases, see the note below.\n\n Notes\n -----\n To achieve better accuracy, `X_norm_squared`\u00a0and `Y_norm_squared` may be\n unused if they are passed as ``float32``.\n\n Returns\n -------\n distances : ndarray of shape (n_samples_X, n_samples_Y)\n\n See Also\n --------\n paired_distances : Distances betweens pairs of elements of X and Y.\n\n Examples\n --------\n >>> from sklearn.metrics.pairwise import euclidean_distances\n >>> X = [[0, 1], [1, 1]]\n >>> # distance between rows of X\n >>> euclidean_distances(X, X)\n array([[0., 1.],\n [1., 0.]])\n >>> # get distance to origin\n >>> euclidean_distances(X, [[0, 0]])\n array([[1. 
],\n [1.41421356]])\n \"\"\"\n (X, Y) = check_pairwise_arrays(X, Y)\n if X_norm_squared is not None:\n X_norm_squared = check_array(X_norm_squared, ensure_2d=False)\n original_shape = X_norm_squared.shape\n if X_norm_squared.shape == (X.shape[0], ):\n X_norm_squared = X_norm_squared.reshape(-1, 1)\n if X_norm_squared.shape == (1, X.shape[0]):\n X_norm_squared = X_norm_squared.T\n if X_norm_squared.shape != (X.shape[0], 1):\n raise ValueError(f'Incompatible dimensions for X of shape {X.shape} and X_norm_squared of shape {original_shape}.')\n if Y_norm_squared is not None:\n Y_norm_squared = check_array(Y_norm_squared, ensure_2d=False)\n original_shape = Y_norm_squared.shape\n if Y_norm_squared.shape == (Y.shape[0], ):\n Y_norm_squared = Y_norm_squared.reshape(1, -1)\n if Y_norm_squared.shape == (Y.shape[0], 1):\n Y_norm_squared = Y_norm_squared.T\n if Y_norm_squared.shape != (1, Y.shape[0]):\n raise ValueError(f'Incompatible dimensions for Y of shape {Y.shape} and Y_norm_squared of shape {original_shape}.')\n return _euclidean_distances(X, Y, X_norm_squared, Y_norm_squared, squared)" + "description": "Compute the distance matrix between each pair from a vector array X and Y.\n\nFor efficiency reasons, the euclidean distance between a pair of row\nvector x and y is computed as::\n\n dist(x, y) = sqrt(dot(x, x) - 2 * dot(x, y) + dot(y, y))\n\nThis formulation has two advantages over other ways of computing distances.\nFirst, it is computationally efficient when dealing with sparse data.\nSecond, if one argument varies but the other remains unchanged, then\n`dot(x, x)` and/or `dot(y, y)` can be pre-computed.\n\nHowever, this is not the most precise way of doing this computation,\nbecause this equation potentially suffers from \"catastrophic cancellation\".\nAlso, the distance matrix returned by this function may not be exactly\nsymmetric as required by, e.g., ``scipy.spatial.distance`` functions.\n\nRead more in the :ref:`User Guide `.", + "docstring": "\n Compute the distance matrix between each pair from a vector array X and Y.\n\n For efficiency reasons, the euclidean distance between a pair of row\n vector x and y is computed as::\n\n dist(x, y) = sqrt(dot(x, x) - 2 * dot(x, y) + dot(y, y))\n\n This formulation has two advantages over other ways of computing distances.\n First, it is computationally efficient when dealing with sparse data.\n Second, if one argument varies but the other remains unchanged, then\n `dot(x, x)` and/or `dot(y, y)` can be pre-computed.\n\n However, this is not the most precise way of doing this computation,\n because this equation potentially suffers from \"catastrophic cancellation\".\n Also, the distance matrix returned by this function may not be exactly\n symmetric as required by, e.g., ``scipy.spatial.distance`` functions.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples_X, n_features)\n An array where each row is a sample and each column is a feature.\n\n Y : {array-like, sparse matrix} of shape (n_samples_Y, n_features), default=None\n An array where each row is a sample and each column is a feature.\n If `None`, method uses `Y=X`.\n\n Y_norm_squared : array-like of shape (n_samples_Y,) or (n_samples_Y, 1) or (1, n_samples_Y), default=None\n Pre-computed dot-products of vectors in Y (e.g.,\n ``(Y**2).sum(axis=1)``)\n May be ignored in some cases, see the note below.\n\n squared : bool, default=False\n Return squared Euclidean distances.\n\n X_norm_squared : array-like of shape 
(n_samples_X,) or (n_samples_X, 1) or (1, n_samples_X), default=None\n Pre-computed dot-products of vectors in X (e.g.,\n ``(X**2).sum(axis=1)``)\n May be ignored in some cases, see the note below.\n\n Returns\n -------\n distances : ndarray of shape (n_samples_X, n_samples_Y)\n Returns the distances between the row vectors of `X`\n and the row vectors of `Y`.\n\n See Also\n --------\n paired_distances : Distances betweens pairs of elements of X and Y.\n\n Notes\n -----\n To achieve a better accuracy, `X_norm_squared`\u00a0and `Y_norm_squared` may be\n unused if they are passed as `np.float32`.\n\n Examples\n --------\n >>> from sklearn.metrics.pairwise import euclidean_distances\n >>> X = [[0, 1], [1, 1]]\n >>> # distance between rows of X\n >>> euclidean_distances(X, X)\n array([[0., 1.],\n [1., 0.]])\n >>> # get distance to origin\n >>> euclidean_distances(X, [[0, 0]])\n array([[1. ],\n [1.41421356]])\n ", + "source_code": "\ndef euclidean_distances(X, Y=None, *, Y_norm_squared=None, squared=False, X_norm_squared=None):\n \"\"\"\n Compute the distance matrix between each pair from a vector array X and Y.\n\n For efficiency reasons, the euclidean distance between a pair of row\n vector x and y is computed as::\n\n dist(x, y) = sqrt(dot(x, x) - 2 * dot(x, y) + dot(y, y))\n\n This formulation has two advantages over other ways of computing distances.\n First, it is computationally efficient when dealing with sparse data.\n Second, if one argument varies but the other remains unchanged, then\n `dot(x, x)` and/or `dot(y, y)` can be pre-computed.\n\n However, this is not the most precise way of doing this computation,\n because this equation potentially suffers from \"catastrophic cancellation\".\n Also, the distance matrix returned by this function may not be exactly\n symmetric as required by, e.g., ``scipy.spatial.distance`` functions.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples_X, n_features)\n An array where each row is a sample and each column is a feature.\n\n Y : {array-like, sparse matrix} of shape (n_samples_Y, n_features), default=None\n An array where each row is a sample and each column is a feature.\n If `None`, method uses `Y=X`.\n\n Y_norm_squared : array-like of shape (n_samples_Y,) or (n_samples_Y, 1) or (1, n_samples_Y), default=None\n Pre-computed dot-products of vectors in Y (e.g.,\n ``(Y**2).sum(axis=1)``)\n May be ignored in some cases, see the note below.\n\n squared : bool, default=False\n Return squared Euclidean distances.\n\n X_norm_squared : array-like of shape (n_samples_X,) or (n_samples_X, 1) or (1, n_samples_X), default=None\n Pre-computed dot-products of vectors in X (e.g.,\n ``(X**2).sum(axis=1)``)\n May be ignored in some cases, see the note below.\n\n Returns\n -------\n distances : ndarray of shape (n_samples_X, n_samples_Y)\n Returns the distances between the row vectors of `X`\n and the row vectors of `Y`.\n\n See Also\n --------\n paired_distances : Distances betweens pairs of elements of X and Y.\n\n Notes\n -----\n To achieve a better accuracy, `X_norm_squared`\u00a0and `Y_norm_squared` may be\n unused if they are passed as `np.float32`.\n\n Examples\n --------\n >>> from sklearn.metrics.pairwise import euclidean_distances\n >>> X = [[0, 1], [1, 1]]\n >>> # distance between rows of X\n >>> euclidean_distances(X, X)\n array([[0., 1.],\n [1., 0.]])\n >>> # get distance to origin\n >>> euclidean_distances(X, [[0, 0]])\n array([[1. 
],\n [1.41421356]])\n \"\"\"\n (X, Y) = check_pairwise_arrays(X, Y)\n if X_norm_squared is not None:\n X_norm_squared = check_array(X_norm_squared, ensure_2d=False)\n original_shape = X_norm_squared.shape\n if X_norm_squared.shape == (X.shape[0], ):\n X_norm_squared = X_norm_squared.reshape(-1, 1)\n if X_norm_squared.shape == (1, X.shape[0]):\n X_norm_squared = X_norm_squared.T\n if X_norm_squared.shape != (X.shape[0], 1):\n raise ValueError(f'Incompatible dimensions for X of shape {X.shape} and X_norm_squared of shape {original_shape}.')\n if Y_norm_squared is not None:\n Y_norm_squared = check_array(Y_norm_squared, ensure_2d=False)\n original_shape = Y_norm_squared.shape\n if Y_norm_squared.shape == (Y.shape[0], ):\n Y_norm_squared = Y_norm_squared.reshape(1, -1)\n if Y_norm_squared.shape == (Y.shape[0], 1):\n Y_norm_squared = Y_norm_squared.T\n if Y_norm_squared.shape != (1, Y.shape[0]):\n raise ValueError(f'Incompatible dimensions for Y of shape {Y.shape} and Y_norm_squared of shape {original_shape}.')\n return _euclidean_distances(X, Y, X_norm_squared, Y_norm_squared, squared)" }, { "name": "haversine_distances", @@ -124615,7 +134103,8 @@ "docstring": { "type": "array-like of shape (n_samples_X, 2)", "description": "" - } + }, + "refined_type": {} }, { "name": "Y", @@ -124625,14 +134114,15 @@ "docstring": { "type": "array-like of shape (n_samples_Y, 2), default=None", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Compute the Haversine distance between samples in X and Y.\n\nThe Haversine (or great circle) distance is the angular distance between two points on the surface of a sphere. The first coordinate of each point is assumed to be the latitude, the second is the longitude, given in radians. The dimension of the data must be 2. .. math:: D(x, y) = 2\\arcsin[\\sqrt{\\sin^2((x1 - y1) / 2) + \\cos(x1)\\cos(y1)\\sin^2((x2 - y2) / 2)}]", - "docstring": "Compute the Haversine distance between samples in X and Y.\n\nThe Haversine (or great circle) distance is the angular distance between\ntwo points on the surface of a sphere. The first coordinate of each point\nis assumed to be the latitude, the second is the longitude, given\nin radians. The dimension of the data must be 2.\n\n.. math::\n D(x, y) = 2\\arcsin[\\sqrt{\\sin^2((x1 - y1) / 2)\n + \\cos(x1)\\cos(y1)\\sin^2((x2 - y2) / 2)}]\n\nParameters\n----------\nX : array-like of shape (n_samples_X, 2)\n\nY : array-like of shape (n_samples_Y, 2), default=None\n\nReturns\n-------\ndistance : ndarray of shape (n_samples_X, n_samples_Y)\n\nNotes\n-----\nAs the Earth is nearly spherical, the haversine formula provides a good\napproximation of the distance between two points of the Earth surface, with\na less than 1% error on average.\n\nExamples\n--------\nWe want to calculate the distance between the Ezeiza Airport\n(Buenos Aires, Argentina) and the Charles de Gaulle Airport (Paris,\nFrance).\n\n>>> from sklearn.metrics.pairwise import haversine_distances\n>>> from math import radians\n>>> bsas = [-34.83333, -58.5166646]\n>>> paris = [49.0083899664, 2.53844117956]\n>>> bsas_in_radians = [radians(_) for _ in bsas]\n>>> paris_in_radians = [radians(_) for _ in paris]\n>>> result = haversine_distances([bsas_in_radians, paris_in_radians])\n>>> result * 6371000/1000 # multiply by Earth radius to get kilometers\narray([[ 0. , 11099.54035582],\n [11099.54035582, 0. 
]])", - "source_code": "\ndef haversine_distances(X, Y=None):\n \"\"\"Compute the Haversine distance between samples in X and Y.\n\n The Haversine (or great circle) distance is the angular distance between\n two points on the surface of a sphere. The first coordinate of each point\n is assumed to be the latitude, the second is the longitude, given\n in radians. The dimension of the data must be 2.\n\n .. math::\n D(x, y) = 2\\arcsin[\\sqrt{\\sin^2((x1 - y1) / 2)\n + \\cos(x1)\\cos(y1)\\sin^2((x2 - y2) / 2)}]\n\n Parameters\n ----------\n X : array-like of shape (n_samples_X, 2)\n\n Y : array-like of shape (n_samples_Y, 2), default=None\n\n Returns\n -------\n distance : ndarray of shape (n_samples_X, n_samples_Y)\n\n Notes\n -----\n As the Earth is nearly spherical, the haversine formula provides a good\n approximation of the distance between two points of the Earth surface, with\n a less than 1% error on average.\n\n Examples\n --------\n We want to calculate the distance between the Ezeiza Airport\n (Buenos Aires, Argentina) and the Charles de Gaulle Airport (Paris,\n France).\n\n >>> from sklearn.metrics.pairwise import haversine_distances\n >>> from math import radians\n >>> bsas = [-34.83333, -58.5166646]\n >>> paris = [49.0083899664, 2.53844117956]\n >>> bsas_in_radians = [radians(_) for _ in bsas]\n >>> paris_in_radians = [radians(_) for _ in paris]\n >>> result = haversine_distances([bsas_in_radians, paris_in_radians])\n >>> result * 6371000/1000 # multiply by Earth radius to get kilometers\n array([[ 0. , 11099.54035582],\n [11099.54035582, 0. ]])\n \"\"\"\n from ..neighbors import DistanceMetric\n return DistanceMetric.get_metric('haversine').pairwise(X, Y)" + "description": "Compute the Haversine distance between samples in X and Y.\n\nThe Haversine (or great circle) distance is the angular distance between\ntwo points on the surface of a sphere. The first coordinate of each point\nis assumed to be the latitude, the second is the longitude, given\nin radians. The dimension of the data must be 2.\n\n.. math::\n D(x, y) = 2\\arcsin[\\sqrt{\\sin^2((x1 - y1) / 2)\n + \\cos(x1)\\cos(y1)\\sin^2((x2 - y2) / 2)}]", + "docstring": "Compute the Haversine distance between samples in X and Y.\n\n The Haversine (or great circle) distance is the angular distance between\n two points on the surface of a sphere. The first coordinate of each point\n is assumed to be the latitude, the second is the longitude, given\n in radians. The dimension of the data must be 2.\n\n .. 
math::\n D(x, y) = 2\\arcsin[\\sqrt{\\sin^2((x1 - y1) / 2)\n + \\cos(x1)\\cos(y1)\\sin^2((x2 - y2) / 2)}]\n\n Parameters\n ----------\n X : array-like of shape (n_samples_X, 2)\n\n Y : array-like of shape (n_samples_Y, 2), default=None\n\n Returns\n -------\n distance : ndarray of shape (n_samples_X, n_samples_Y)\n\n Notes\n -----\n As the Earth is nearly spherical, the haversine formula provides a good\n approximation of the distance between two points of the Earth surface, with\n a less than 1% error on average.\n\n Examples\n --------\n We want to calculate the distance between the Ezeiza Airport\n (Buenos Aires, Argentina) and the Charles de Gaulle Airport (Paris,\n France).\n\n >>> from sklearn.metrics.pairwise import haversine_distances\n >>> from math import radians\n >>> bsas = [-34.83333, -58.5166646]\n >>> paris = [49.0083899664, 2.53844117956]\n >>> bsas_in_radians = [radians(_) for _ in bsas]\n >>> paris_in_radians = [radians(_) for _ in paris]\n >>> result = haversine_distances([bsas_in_radians, paris_in_radians])\n >>> result * 6371000/1000 # multiply by Earth radius to get kilometers\n array([[ 0. , 11099.54035582],\n [11099.54035582, 0. ]])\n ", + "source_code": "\ndef haversine_distances(X, Y=None):\n \"\"\"Compute the Haversine distance between samples in X and Y.\n\n The Haversine (or great circle) distance is the angular distance between\n two points on the surface of a sphere. The first coordinate of each point\n is assumed to be the latitude, the second is the longitude, given\n in radians. The dimension of the data must be 2.\n\n .. math::\n D(x, y) = 2\\arcsin[\\sqrt{\\sin^2((x1 - y1) / 2)\n + \\cos(x1)\\cos(y1)\\sin^2((x2 - y2) / 2)}]\n\n Parameters\n ----------\n X : array-like of shape (n_samples_X, 2)\n\n Y : array-like of shape (n_samples_Y, 2), default=None\n\n Returns\n -------\n distance : ndarray of shape (n_samples_X, n_samples_Y)\n\n Notes\n -----\n As the Earth is nearly spherical, the haversine formula provides a good\n approximation of the distance between two points of the Earth surface, with\n a less than 1% error on average.\n\n Examples\n --------\n We want to calculate the distance between the Ezeiza Airport\n (Buenos Aires, Argentina) and the Charles de Gaulle Airport (Paris,\n France).\n\n >>> from sklearn.metrics.pairwise import haversine_distances\n >>> from math import radians\n >>> bsas = [-34.83333, -58.5166646]\n >>> paris = [49.0083899664, 2.53844117956]\n >>> bsas_in_radians = [radians(_) for _ in bsas]\n >>> paris_in_radians = [radians(_) for _ in paris]\n >>> result = haversine_distances([bsas_in_radians, paris_in_radians])\n >>> result * 6371000/1000 # multiply by Earth radius to get kilometers\n array([[ 0. , 11099.54035582],\n [11099.54035582, 0. ]])\n \"\"\"\n from ..metrics import DistanceMetric\n return DistanceMetric.get_metric('haversine').pairwise(X, Y)" }, { "name": "kernel_metrics", @@ -124643,8 +134133,8 @@ "parameters": [], "results": [], "is_public": true, - "description": "Valid metrics for pairwise_kernels.\n\nThis function simply returns the valid pairwise distance metrics. It exists, however, to allow for a verbose description of the mapping for each of the valid strings. 
The valid distance metrics, and the function they map to, are: =============== ======================================== metric Function =============== ======================================== 'additive_chi2' sklearn.pairwise.additive_chi2_kernel 'chi2' sklearn.pairwise.chi2_kernel 'linear' sklearn.pairwise.linear_kernel 'poly' sklearn.pairwise.polynomial_kernel 'polynomial' sklearn.pairwise.polynomial_kernel 'rbf' sklearn.pairwise.rbf_kernel 'laplacian' sklearn.pairwise.laplacian_kernel 'sigmoid' sklearn.pairwise.sigmoid_kernel 'cosine' sklearn.pairwise.cosine_similarity =============== ======================================== Read more in the :ref:`User Guide `.", - "docstring": "Valid metrics for pairwise_kernels.\n\nThis function simply returns the valid pairwise distance metrics.\nIt exists, however, to allow for a verbose description of the mapping for\neach of the valid strings.\n\nThe valid distance metrics, and the function they map to, are:\n =============== ========================================\n metric Function\n =============== ========================================\n 'additive_chi2' sklearn.pairwise.additive_chi2_kernel\n 'chi2' sklearn.pairwise.chi2_kernel\n 'linear' sklearn.pairwise.linear_kernel\n 'poly' sklearn.pairwise.polynomial_kernel\n 'polynomial' sklearn.pairwise.polynomial_kernel\n 'rbf' sklearn.pairwise.rbf_kernel\n 'laplacian' sklearn.pairwise.laplacian_kernel\n 'sigmoid' sklearn.pairwise.sigmoid_kernel\n 'cosine' sklearn.pairwise.cosine_similarity\n =============== ========================================\n\nRead more in the :ref:`User Guide `.", + "description": "Valid metrics for pairwise_kernels.\n\nThis function simply returns the valid pairwise distance metrics.\nIt exists, however, to allow for a verbose description of the mapping for\neach of the valid strings.\n\nThe valid distance metrics, and the function they map to, are:\n =============== ========================================\n metric Function\n =============== ========================================\n 'additive_chi2' sklearn.pairwise.additive_chi2_kernel\n 'chi2' sklearn.pairwise.chi2_kernel\n 'linear' sklearn.pairwise.linear_kernel\n 'poly' sklearn.pairwise.polynomial_kernel\n 'polynomial' sklearn.pairwise.polynomial_kernel\n 'rbf' sklearn.pairwise.rbf_kernel\n 'laplacian' sklearn.pairwise.laplacian_kernel\n 'sigmoid' sklearn.pairwise.sigmoid_kernel\n 'cosine' sklearn.pairwise.cosine_similarity\n =============== ========================================\n\nRead more in the :ref:`User Guide `.", + "docstring": "Valid metrics for pairwise_kernels.\n\n This function simply returns the valid pairwise distance metrics.\n It exists, however, to allow for a verbose description of the mapping for\n each of the valid strings.\n\n The valid distance metrics, and the function they map to, are:\n =============== ========================================\n metric Function\n =============== ========================================\n 'additive_chi2' sklearn.pairwise.additive_chi2_kernel\n 'chi2' sklearn.pairwise.chi2_kernel\n 'linear' sklearn.pairwise.linear_kernel\n 'poly' sklearn.pairwise.polynomial_kernel\n 'polynomial' sklearn.pairwise.polynomial_kernel\n 'rbf' sklearn.pairwise.rbf_kernel\n 'laplacian' sklearn.pairwise.laplacian_kernel\n 'sigmoid' sklearn.pairwise.sigmoid_kernel\n 'cosine' sklearn.pairwise.cosine_similarity\n =============== ========================================\n\n Read more in the :ref:`User Guide `.\n ", "source_code": "\ndef kernel_metrics():\n \"\"\"Valid metrics for 
pairwise_kernels.\n\n This function simply returns the valid pairwise distance metrics.\n It exists, however, to allow for a verbose description of the mapping for\n each of the valid strings.\n\n The valid distance metrics, and the function they map to, are:\n =============== ========================================\n metric Function\n =============== ========================================\n 'additive_chi2' sklearn.pairwise.additive_chi2_kernel\n 'chi2' sklearn.pairwise.chi2_kernel\n 'linear' sklearn.pairwise.linear_kernel\n 'poly' sklearn.pairwise.polynomial_kernel\n 'polynomial' sklearn.pairwise.polynomial_kernel\n 'rbf' sklearn.pairwise.rbf_kernel\n 'laplacian' sklearn.pairwise.laplacian_kernel\n 'sigmoid' sklearn.pairwise.sigmoid_kernel\n 'cosine' sklearn.pairwise.cosine_similarity\n =============== ========================================\n\n Read more in the :ref:`User Guide `.\n \"\"\"\n return PAIRWISE_KERNEL_FUNCTIONS" }, { @@ -124662,7 +134152,8 @@ "docstring": { "type": "ndarray of shape (n_samples_X, n_features)", "description": "" - } + }, + "refined_type": {} }, { "name": "Y", @@ -124672,7 +134163,8 @@ "docstring": { "type": "ndarray of shape (n_samples_Y, n_features), default=None", "description": "If `None`, uses `Y=X`." - } + }, + "refined_type": {} }, { "name": "gamma", @@ -124682,13 +134174,14 @@ "docstring": { "type": "float, default=None", "description": "If None, defaults to 1.0 / n_features." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Compute the laplacian kernel between X and Y.\n\nThe laplacian kernel is defined as:: K(x, y) = exp(-gamma ||x-y||_1) for each pair of rows x in X and y in Y. Read more in the :ref:`User Guide `. .. versionadded:: 0.17", - "docstring": "Compute the laplacian kernel between X and Y.\n\nThe laplacian kernel is defined as::\n\n K(x, y) = exp(-gamma ||x-y||_1)\n\nfor each pair of rows x in X and y in Y.\nRead more in the :ref:`User Guide `.\n\n.. versionadded:: 0.17\n\nParameters\n----------\nX : ndarray of shape (n_samples_X, n_features)\n\nY : ndarray of shape (n_samples_Y, n_features), default=None\n If `None`, uses `Y=X`.\n\ngamma : float, default=None\n If None, defaults to 1.0 / n_features.\n\nReturns\n-------\nkernel_matrix : ndarray of shape (n_samples_X, n_samples_Y)", + "description": "Compute the laplacian kernel between X and Y.\n\nThe laplacian kernel is defined as::\n\n K(x, y) = exp(-gamma ||x-y||_1)\n\nfor each pair of rows x in X and y in Y.\nRead more in the :ref:`User Guide `.\n\n.. versionadded:: 0.17", + "docstring": "Compute the laplacian kernel between X and Y.\n\n The laplacian kernel is defined as::\n\n K(x, y) = exp(-gamma ||x-y||_1)\n\n for each pair of rows x in X and y in Y.\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.17\n\n Parameters\n ----------\n X : ndarray of shape (n_samples_X, n_features)\n\n Y : ndarray of shape (n_samples_Y, n_features), default=None\n If `None`, uses `Y=X`.\n\n gamma : float, default=None\n If None, defaults to 1.0 / n_features.\n\n Returns\n -------\n kernel_matrix : ndarray of shape (n_samples_X, n_samples_Y)\n ", "source_code": "\ndef laplacian_kernel(X, Y=None, gamma=None):\n \"\"\"Compute the laplacian kernel between X and Y.\n\n The laplacian kernel is defined as::\n\n K(x, y) = exp(-gamma ||x-y||_1)\n\n for each pair of rows x in X and y in Y.\n Read more in the :ref:`User Guide `.\n\n .. 
versionadded:: 0.17\n\n Parameters\n ----------\n X : ndarray of shape (n_samples_X, n_features)\n\n Y : ndarray of shape (n_samples_Y, n_features), default=None\n If `None`, uses `Y=X`.\n\n gamma : float, default=None\n If None, defaults to 1.0 / n_features.\n\n Returns\n -------\n kernel_matrix : ndarray of shape (n_samples_X, n_samples_Y)\n \"\"\"\n (X, Y) = check_pairwise_arrays(X, Y)\n if gamma is None:\n gamma = 1.0 / X.shape[1]\n K = -gamma * manhattan_distances(X, Y)\n np.exp(K, K)\n return K" }, { @@ -124705,8 +134198,9 @@ "assigned_by": "POSITION_OR_NAME", "docstring": { "type": "ndarray of shape (n_samples_X, n_features)", - "description": "" - } + "description": "A feature array." + }, + "refined_type": {} }, { "name": "Y", @@ -124715,8 +134209,9 @@ "assigned_by": "POSITION_OR_NAME", "docstring": { "type": "ndarray of shape (n_samples_Y, n_features), default=None", - "description": "If `None`, uses `Y=X`." - } + "description": "An optional second feature array. If `None`, uses `Y=X`." + }, + "refined_type": {} }, { "name": "dense_output", @@ -124726,14 +134221,15 @@ "docstring": { "type": "bool, default=True", "description": "Whether to return dense output even when the input is sparse. If\n``False``, the output is sparse if both input arrays are sparse.\n\n.. versionadded:: 0.20" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Compute the linear kernel between X and Y.\n\nRead more in the :ref:`User Guide `.", - "docstring": "Compute the linear kernel between X and Y.\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\nX : ndarray of shape (n_samples_X, n_features)\n\nY : ndarray of shape (n_samples_Y, n_features), default=None\n If `None`, uses `Y=X`.\n\ndense_output : bool, default=True\n Whether to return dense output even when the input is sparse. If\n ``False``, the output is sparse if both input arrays are sparse.\n\n .. versionadded:: 0.20\n\nReturns\n-------\nGram matrix : ndarray of shape (n_samples_X, n_samples_Y)", - "source_code": "\ndef linear_kernel(X, Y=None, dense_output=True):\n \"\"\"\n Compute the linear kernel between X and Y.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n X : ndarray of shape (n_samples_X, n_features)\n\n Y : ndarray of shape (n_samples_Y, n_features), default=None\n If `None`, uses `Y=X`.\n\n dense_output : bool, default=True\n Whether to return dense output even when the input is sparse. If\n ``False``, the output is sparse if both input arrays are sparse.\n\n .. versionadded:: 0.20\n\n Returns\n -------\n Gram matrix : ndarray of shape (n_samples_X, n_samples_Y)\n \"\"\"\n (X, Y) = check_pairwise_arrays(X, Y)\n return safe_sparse_dot(X, Y.T, dense_output=dense_output)" + "docstring": "\n Compute the linear kernel between X and Y.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n X : ndarray of shape (n_samples_X, n_features)\n A feature array.\n\n Y : ndarray of shape (n_samples_Y, n_features), default=None\n An optional second feature array. If `None`, uses `Y=X`.\n\n dense_output : bool, default=True\n Whether to return dense output even when the input is sparse. If\n ``False``, the output is sparse if both input arrays are sparse.\n\n .. versionadded:: 0.20\n\n Returns\n -------\n Gram matrix : ndarray of shape (n_samples_X, n_samples_Y)\n The Gram matrix of the linear kernel, i.e. 
`X @ Y.T`.\n ", + "source_code": "\ndef linear_kernel(X, Y=None, dense_output=True):\n \"\"\"\n Compute the linear kernel between X and Y.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n X : ndarray of shape (n_samples_X, n_features)\n A feature array.\n\n Y : ndarray of shape (n_samples_Y, n_features), default=None\n An optional second feature array. If `None`, uses `Y=X`.\n\n dense_output : bool, default=True\n Whether to return dense output even when the input is sparse. If\n ``False``, the output is sparse if both input arrays are sparse.\n\n .. versionadded:: 0.20\n\n Returns\n -------\n Gram matrix : ndarray of shape (n_samples_X, n_samples_Y)\n The Gram matrix of the linear kernel, i.e. `X @ Y.T`.\n \"\"\"\n (X, Y) = check_pairwise_arrays(X, Y)\n return safe_sparse_dot(X, Y.T, dense_output=dense_output)" }, { "name": "manhattan_distances", @@ -124750,7 +134246,8 @@ "docstring": { "type": "array-like of shape (n_samples_X, n_features)", "description": "" - } + }, + "refined_type": {} }, { "name": "Y", @@ -124760,7 +134257,8 @@ "docstring": { "type": "array-like of shape (n_samples_Y, n_features), default=None", "description": "If `None`, uses `Y=X`." - } + }, + "refined_type": {} }, { "name": "sum_over_features", @@ -124770,13 +134268,14 @@ "docstring": { "type": "bool, default=True", "description": "If True the function returns the pairwise distance matrix\nelse it returns the componentwise L1 pairwise-distances.\nNot supported for sparse matrix inputs." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Compute the L1 distances between the vectors in X and Y.\n\nWith sum_over_features equal to False it returns the componentwise distances. Read more in the :ref:`User Guide `.", - "docstring": "Compute the L1 distances between the vectors in X and Y.\n\nWith sum_over_features equal to False it returns the componentwise\ndistances.\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\nX : array-like of shape (n_samples_X, n_features)\n\nY : array-like of shape (n_samples_Y, n_features), default=None\n If `None`, uses `Y=X`.\n\nsum_over_features : bool, default=True\n If True the function returns the pairwise distance matrix\n else it returns the componentwise L1 pairwise-distances.\n Not supported for sparse matrix inputs.\n\nReturns\n-------\nD : ndarray of shape (n_samples_X * n_samples_Y, n_features) or (n_samples_X, n_samples_Y)\n If sum_over_features is False shape is\n (n_samples_X * n_samples_Y, n_features) and D contains the\n componentwise L1 pairwise-distances (ie. 
absolute difference),\n else shape is (n_samples_X, n_samples_Y) and D contains\n the pairwise L1 distances.\n\nNotes\n--------\nWhen X and/or Y are CSR sparse matrices and they are not already\nin canonical format, this function modifies them in-place to\nmake them canonical.\n\nExamples\n--------\n>>> from sklearn.metrics.pairwise import manhattan_distances\n>>> manhattan_distances([[3]], [[3]])\narray([[0.]])\n>>> manhattan_distances([[3]], [[2]])\narray([[1.]])\n>>> manhattan_distances([[2]], [[3]])\narray([[1.]])\n>>> manhattan_distances([[1, 2], [3, 4]], [[1, 2], [0, 3]])\narray([[0., 2.],\n [4., 4.]])\n>>> import numpy as np\n>>> X = np.ones((1, 2))\n>>> y = np.full((2, 2), 2.)\n>>> manhattan_distances(X, y, sum_over_features=False)\narray([[1., 1.],\n [1., 1.]])", + "description": "Compute the L1 distances between the vectors in X and Y.\n\nWith sum_over_features equal to False it returns the componentwise\ndistances.\n\nRead more in the :ref:`User Guide `.", + "docstring": "Compute the L1 distances between the vectors in X and Y.\n\n With sum_over_features equal to False it returns the componentwise\n distances.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n X : array-like of shape (n_samples_X, n_features)\n\n Y : array-like of shape (n_samples_Y, n_features), default=None\n If `None`, uses `Y=X`.\n\n sum_over_features : bool, default=True\n If True the function returns the pairwise distance matrix\n else it returns the componentwise L1 pairwise-distances.\n Not supported for sparse matrix inputs.\n\n Returns\n -------\n D : ndarray of shape (n_samples_X * n_samples_Y, n_features) or (n_samples_X, n_samples_Y)\n If sum_over_features is False shape is\n (n_samples_X * n_samples_Y, n_features) and D contains the\n componentwise L1 pairwise-distances (ie. absolute difference),\n else shape is (n_samples_X, n_samples_Y) and D contains\n the pairwise L1 distances.\n\n Notes\n --------\n When X and/or Y are CSR sparse matrices and they are not already\n in canonical format, this function modifies them in-place to\n make them canonical.\n\n Examples\n --------\n >>> from sklearn.metrics.pairwise import manhattan_distances\n >>> manhattan_distances([[3]], [[3]])\n array([[0.]])\n >>> manhattan_distances([[3]], [[2]])\n array([[1.]])\n >>> manhattan_distances([[2]], [[3]])\n array([[1.]])\n >>> manhattan_distances([[1, 2], [3, 4]], [[1, 2], [0, 3]])\n array([[0., 2.],\n [4., 4.]])\n >>> import numpy as np\n >>> X = np.ones((1, 2))\n >>> y = np.full((2, 2), 2.)\n >>> manhattan_distances(X, y, sum_over_features=False)\n array([[1., 1.],\n [1., 1.]])\n ", "source_code": "\ndef manhattan_distances(X, Y=None, *, sum_over_features=True):\n \"\"\"Compute the L1 distances between the vectors in X and Y.\n\n With sum_over_features equal to False it returns the componentwise\n distances.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n X : array-like of shape (n_samples_X, n_features)\n\n Y : array-like of shape (n_samples_Y, n_features), default=None\n If `None`, uses `Y=X`.\n\n sum_over_features : bool, default=True\n If True the function returns the pairwise distance matrix\n else it returns the componentwise L1 pairwise-distances.\n Not supported for sparse matrix inputs.\n\n Returns\n -------\n D : ndarray of shape (n_samples_X * n_samples_Y, n_features) or (n_samples_X, n_samples_Y)\n If sum_over_features is False shape is\n (n_samples_X * n_samples_Y, n_features) and D contains the\n componentwise L1 pairwise-distances (ie. 
absolute difference),\n else shape is (n_samples_X, n_samples_Y) and D contains\n the pairwise L1 distances.\n\n Notes\n --------\n When X and/or Y are CSR sparse matrices and they are not already\n in canonical format, this function modifies them in-place to\n make them canonical.\n\n Examples\n --------\n >>> from sklearn.metrics.pairwise import manhattan_distances\n >>> manhattan_distances([[3]], [[3]])\n array([[0.]])\n >>> manhattan_distances([[3]], [[2]])\n array([[1.]])\n >>> manhattan_distances([[2]], [[3]])\n array([[1.]])\n >>> manhattan_distances([[1, 2], [3, 4]], [[1, 2], [0, 3]])\n array([[0., 2.],\n [4., 4.]])\n >>> import numpy as np\n >>> X = np.ones((1, 2))\n >>> y = np.full((2, 2), 2.)\n >>> manhattan_distances(X, y, sum_over_features=False)\n array([[1., 1.],\n [1., 1.]])\n \"\"\"\n (X, Y) = check_pairwise_arrays(X, Y)\n if issparse(X) or issparse(Y):\n if not sum_over_features:\n raise TypeError('sum_over_features=%r not supported for sparse matrices' % sum_over_features)\n X = csr_matrix(X, copy=False)\n Y = csr_matrix(Y, copy=False)\n X.sum_duplicates()\n Y.sum_duplicates()\n D = np.zeros((X.shape[0], Y.shape[0]))\n _sparse_manhattan(X.data, X.indices, X.indptr, Y.data, Y.indices, Y.indptr, D)\n return D\n if sum_over_features:\n return distance.cdist(X, Y, 'cityblock')\n D = X[:, np.newaxis, :] - Y[np.newaxis, :, :]\n D = np.abs(D, D)\n return D.reshape((-1, X.shape[1]))" }, { @@ -124794,7 +134293,8 @@ "docstring": { "type": "array-like of shape=(n_samples_X, n_features)", "description": "" - } + }, + "refined_type": {} }, { "name": "Y", @@ -124804,7 +134304,8 @@ "docstring": { "type": "array-like of shape=(n_samples_Y, n_features), default=None", "description": "" - } + }, + "refined_type": {} }, { "name": "squared", @@ -124814,7 +134315,8 @@ "docstring": { "type": "bool, default=False", "description": "Return squared Euclidean distances." - } + }, + "refined_type": {} }, { "name": "missing_values", @@ -124824,7 +134326,8 @@ "docstring": { "type": "np.nan or int, default=np.nan", "description": "Representation of missing value." - } + }, + "refined_type": {} }, { "name": "copy", @@ -124834,13 +134337,14 @@ "docstring": { "type": "bool, default=True", "description": "Make and use a deep copy of X and Y (if Y exists)." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Calculate the euclidean distances in the presence of missing values.\n\nCompute the euclidean distance between each pair of samples in X and Y, where Y=X is assumed if Y=None. When calculating the distance between a pair of samples, this formulation ignores feature coordinates with a missing value in either sample and scales up the weight of the remaining coordinates: dist(x,y) = sqrt(weight * sq. distance from present coordinates) where, weight = Total # of coordinates / # of present coordinates For example, the distance between ``[3, na, na, 6]`` and ``[1, na, 4, 5]`` is: .. math:: \\sqrt{\\frac{4}{2}((3-1)^2 + (6-5)^2)} If all the coordinates are missing or if there are no common present coordinates then NaN is returned for that pair. Read more in the :ref:`User Guide `. .. versionadded:: 0.22", - "docstring": "Calculate the euclidean distances in the presence of missing values.\n\nCompute the euclidean distance between each pair of samples in X and Y,\nwhere Y=X is assumed if Y=None. 
When calculating the distance between a\npair of samples, this formulation ignores feature coordinates with a\nmissing value in either sample and scales up the weight of the remaining\ncoordinates:\n\n dist(x,y) = sqrt(weight * sq. distance from present coordinates)\n where,\n weight = Total # of coordinates / # of present coordinates\n\nFor example, the distance between ``[3, na, na, 6]`` and ``[1, na, 4, 5]``\nis:\n\n .. math::\n \\sqrt{\\frac{4}{2}((3-1)^2 + (6-5)^2)}\n\nIf all the coordinates are missing or if there are no common present\ncoordinates then NaN is returned for that pair.\n\nRead more in the :ref:`User Guide `.\n\n.. versionadded:: 0.22\n\nParameters\n----------\nX : array-like of shape=(n_samples_X, n_features)\n\nY : array-like of shape=(n_samples_Y, n_features), default=None\n\nsquared : bool, default=False\n Return squared Euclidean distances.\n\nmissing_values : np.nan or int, default=np.nan\n Representation of missing value.\n\ncopy : bool, default=True\n Make and use a deep copy of X and Y (if Y exists).\n\nReturns\n-------\ndistances : ndarray of shape (n_samples_X, n_samples_Y)\n\nSee Also\n--------\npaired_distances : Distances between pairs of elements of X and Y.\n\nExamples\n--------\n>>> from sklearn.metrics.pairwise import nan_euclidean_distances\n>>> nan = float(\"NaN\")\n>>> X = [[0, 1], [1, nan]]\n>>> nan_euclidean_distances(X, X) # distance between rows of X\narray([[0. , 1.41421356],\n [1.41421356, 0. ]])\n\n>>> # get distance to origin\n>>> nan_euclidean_distances(X, [[0, 0]])\narray([[1. ],\n [1.41421356]])\n\nReferences\n----------\n* John K. Dixon, \"Pattern Recognition with Partly Missing Data\",\n IEEE Transactions on Systems, Man, and Cybernetics, Volume: 9, Issue:\n 10, pp. 617 - 621, Oct. 1979.\n http://ieeexplore.ieee.org/abstract/document/4310090/", + "description": "Calculate the euclidean distances in the presence of missing values.\n\nCompute the euclidean distance between each pair of samples in X and Y,\nwhere Y=X is assumed if Y=None. When calculating the distance between a\npair of samples, this formulation ignores feature coordinates with a\nmissing value in either sample and scales up the weight of the remaining\ncoordinates:\n\n dist(x,y) = sqrt(weight * sq. distance from present coordinates)\n where,\n weight = Total # of coordinates / # of present coordinates\n\nFor example, the distance between ``[3, na, na, 6]`` and ``[1, na, 4, 5]``\nis:\n\n .. math::\n \\sqrt{\\frac{4}{2}((3-1)^2 + (6-5)^2)}\n\nIf all the coordinates are missing or if there are no common present\ncoordinates then NaN is returned for that pair.\n\nRead more in the :ref:`User Guide `.\n\n.. versionadded:: 0.22", + "docstring": "Calculate the euclidean distances in the presence of missing values.\n\n Compute the euclidean distance between each pair of samples in X and Y,\n where Y=X is assumed if Y=None. When calculating the distance between a\n pair of samples, this formulation ignores feature coordinates with a\n missing value in either sample and scales up the weight of the remaining\n coordinates:\n\n dist(x,y) = sqrt(weight * sq. distance from present coordinates)\n where,\n weight = Total # of coordinates / # of present coordinates\n\n For example, the distance between ``[3, na, na, 6]`` and ``[1, na, 4, 5]``\n is:\n\n .. math::\n \\sqrt{\\frac{4}{2}((3-1)^2 + (6-5)^2)}\n\n If all the coordinates are missing or if there are no common present\n coordinates then NaN is returned for that pair.\n\n Read more in the :ref:`User Guide `.\n\n .. 
versionadded:: 0.22\n\n Parameters\n ----------\n X : array-like of shape=(n_samples_X, n_features)\n\n Y : array-like of shape=(n_samples_Y, n_features), default=None\n\n squared : bool, default=False\n Return squared Euclidean distances.\n\n missing_values : np.nan or int, default=np.nan\n Representation of missing value.\n\n copy : bool, default=True\n Make and use a deep copy of X and Y (if Y exists).\n\n Returns\n -------\n distances : ndarray of shape (n_samples_X, n_samples_Y)\n\n See Also\n --------\n paired_distances : Distances between pairs of elements of X and Y.\n\n Examples\n --------\n >>> from sklearn.metrics.pairwise import nan_euclidean_distances\n >>> nan = float(\"NaN\")\n >>> X = [[0, 1], [1, nan]]\n >>> nan_euclidean_distances(X, X) # distance between rows of X\n array([[0. , 1.41421356],\n [1.41421356, 0. ]])\n\n >>> # get distance to origin\n >>> nan_euclidean_distances(X, [[0, 0]])\n array([[1. ],\n [1.41421356]])\n\n References\n ----------\n * John K. Dixon, \"Pattern Recognition with Partly Missing Data\",\n IEEE Transactions on Systems, Man, and Cybernetics, Volume: 9, Issue:\n 10, pp. 617 - 621, Oct. 1979.\n http://ieeexplore.ieee.org/abstract/document/4310090/\n ", "source_code": "\ndef nan_euclidean_distances(X, Y=None, *, squared=False, missing_values=np.nan, copy=True):\n \"\"\"Calculate the euclidean distances in the presence of missing values.\n\n Compute the euclidean distance between each pair of samples in X and Y,\n where Y=X is assumed if Y=None. When calculating the distance between a\n pair of samples, this formulation ignores feature coordinates with a\n missing value in either sample and scales up the weight of the remaining\n coordinates:\n\n dist(x,y) = sqrt(weight * sq. distance from present coordinates)\n where,\n weight = Total # of coordinates / # of present coordinates\n\n For example, the distance between ``[3, na, na, 6]`` and ``[1, na, 4, 5]``\n is:\n\n .. math::\n \\sqrt{\\frac{4}{2}((3-1)^2 + (6-5)^2)}\n\n If all the coordinates are missing or if there are no common present\n coordinates then NaN is returned for that pair.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.22\n\n Parameters\n ----------\n X : array-like of shape=(n_samples_X, n_features)\n\n Y : array-like of shape=(n_samples_Y, n_features), default=None\n\n squared : bool, default=False\n Return squared Euclidean distances.\n\n missing_values : np.nan or int, default=np.nan\n Representation of missing value.\n\n copy : bool, default=True\n Make and use a deep copy of X and Y (if Y exists).\n\n Returns\n -------\n distances : ndarray of shape (n_samples_X, n_samples_Y)\n\n See Also\n --------\n paired_distances : Distances between pairs of elements of X and Y.\n\n Examples\n --------\n >>> from sklearn.metrics.pairwise import nan_euclidean_distances\n >>> nan = float(\"NaN\")\n >>> X = [[0, 1], [1, nan]]\n >>> nan_euclidean_distances(X, X) # distance between rows of X\n array([[0. , 1.41421356],\n [1.41421356, 0. ]])\n\n >>> # get distance to origin\n >>> nan_euclidean_distances(X, [[0, 0]])\n array([[1. ],\n [1.41421356]])\n\n References\n ----------\n * John K. Dixon, \"Pattern Recognition with Partly Missing Data\",\n IEEE Transactions on Systems, Man, and Cybernetics, Volume: 9, Issue:\n 10, pp. 617 - 621, Oct. 
1979.\n http://ieeexplore.ieee.org/abstract/document/4310090/\n \"\"\"\n force_all_finite = 'allow-nan' if is_scalar_nan(missing_values) else True\n (X, Y) = check_pairwise_arrays(X, Y, accept_sparse=False, force_all_finite=force_all_finite, copy=copy)\n missing_X = _get_mask(X, missing_values)\n missing_Y = missing_X if Y is X else _get_mask(Y, missing_values)\n X[missing_X] = 0\n Y[missing_Y] = 0\n distances = euclidean_distances(X, Y, squared=True)\n XX = X * X\n YY = Y * Y\n distances -= np.dot(XX, missing_Y.T)\n distances -= np.dot(missing_X, YY.T)\n np.clip(distances, 0, None, out=distances)\n if X is Y:\n np.fill_diagonal(distances, 0.0)\n present_X = 1 - missing_X\n present_Y = present_X if Y is X else ~missing_Y\n present_count = np.dot(present_X, present_Y.T)\n distances[present_count == 0] = np.nan\n np.maximum(1, present_count, out=present_count)\n distances /= present_count\n distances *= X.shape[1]\n if not squared:\n np.sqrt(distances, out=distances)\n return distances" }, { @@ -124858,7 +134362,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "" - } + }, + "refined_type": {} }, { "name": "Y", @@ -124868,13 +134373,14 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Computes the paired cosine distances between X and Y.\n\nRead more in the :ref:`User Guide `.", - "docstring": "Computes the paired cosine distances between X and Y.\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n\nY : array-like of shape (n_samples, n_features)\n\nReturns\n-------\ndistances : ndarray of shape (n_samples,)\n\nNotes\n-----\nThe cosine distance is equivalent to the half the squared\neuclidean distance if each sample is normalized to unit norm.", + "docstring": "\n Computes the paired cosine distances between X and Y.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n\n Y : array-like of shape (n_samples, n_features)\n\n Returns\n -------\n distances : ndarray of shape (n_samples,)\n\n Notes\n -----\n The cosine distance is equivalent to the half the squared\n euclidean distance if each sample is normalized to unit norm.\n ", "source_code": "\ndef paired_cosine_distances(X, Y):\n \"\"\"\n Computes the paired cosine distances between X and Y.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n\n Y : array-like of shape (n_samples, n_features)\n\n Returns\n -------\n distances : ndarray of shape (n_samples,)\n\n Notes\n -----\n The cosine distance is equivalent to the half the squared\n euclidean distance if each sample is normalized to unit norm.\n \"\"\"\n (X, Y) = check_paired_arrays(X, Y)\n return 0.5 * row_norms(normalize(X) - normalize(Y), squared=True)" }, { @@ -124892,7 +134398,8 @@ "docstring": { "type": "ndarray of shape (n_samples, n_features)", "description": "Array 1 for distance computation." - } + }, + "refined_type": {} }, { "name": "Y", @@ -124902,7 +134409,8 @@ "docstring": { "type": "ndarray of shape (n_samples, n_features)", "description": "Array 2 for distance computation." 
- } + }, + "refined_type": {} }, { "name": "metric", @@ -124912,13 +134420,14 @@ "docstring": { "type": "str or callable, default=\"euclidean\"", "description": "The metric to use when calculating distance between instances in a\nfeature array. If metric is a string, it must be one of the options\nspecified in PAIRED_DISTANCES, including \"euclidean\",\n\"manhattan\", or \"cosine\".\nAlternatively, if metric is a callable function, it is called on each\npair of instances (rows) and the resulting value recorded. The callable\nshould take two arrays from X as input and return a value indicating\nthe distance between them." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Computes the paired distances between X and Y.\n\nComputes the distances between (X[0], Y[0]), (X[1], Y[1]), etc... Read more in the :ref:`User Guide `.", - "docstring": "Computes the paired distances between X and Y.\n\nComputes the distances between (X[0], Y[0]), (X[1], Y[1]), etc...\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\nX : ndarray of shape (n_samples, n_features)\n Array 1 for distance computation.\n\nY : ndarray of shape (n_samples, n_features)\n Array 2 for distance computation.\n\nmetric : str or callable, default=\"euclidean\"\n The metric to use when calculating distance between instances in a\n feature array. If metric is a string, it must be one of the options\n specified in PAIRED_DISTANCES, including \"euclidean\",\n \"manhattan\", or \"cosine\".\n Alternatively, if metric is a callable function, it is called on each\n pair of instances (rows) and the resulting value recorded. The callable\n should take two arrays from X as input and return a value indicating\n the distance between them.\n\nReturns\n-------\ndistances : ndarray of shape (n_samples,)\n\nSee Also\n--------\npairwise_distances : Computes the distance between every pair of samples.\n\nExamples\n--------\n>>> from sklearn.metrics.pairwise import paired_distances\n>>> X = [[0, 1], [1, 1]]\n>>> Y = [[0, 1], [2, 1]]\n>>> paired_distances(X, Y)\narray([0., 1.])", + "description": "Computes the paired distances between X and Y.\n\nComputes the distances between (X[0], Y[0]), (X[1], Y[1]), etc...\n\nRead more in the :ref:`User Guide `.", + "docstring": "\n Computes the paired distances between X and Y.\n\n Computes the distances between (X[0], Y[0]), (X[1], Y[1]), etc...\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n X : ndarray of shape (n_samples, n_features)\n Array 1 for distance computation.\n\n Y : ndarray of shape (n_samples, n_features)\n Array 2 for distance computation.\n\n metric : str or callable, default=\"euclidean\"\n The metric to use when calculating distance between instances in a\n feature array. If metric is a string, it must be one of the options\n specified in PAIRED_DISTANCES, including \"euclidean\",\n \"manhattan\", or \"cosine\".\n Alternatively, if metric is a callable function, it is called on each\n pair of instances (rows) and the resulting value recorded. 
The callable\n should take two arrays from X as input and return a value indicating\n the distance between them.\n\n Returns\n -------\n distances : ndarray of shape (n_samples,)\n\n See Also\n --------\n pairwise_distances : Computes the distance between every pair of samples.\n\n Examples\n --------\n >>> from sklearn.metrics.pairwise import paired_distances\n >>> X = [[0, 1], [1, 1]]\n >>> Y = [[0, 1], [2, 1]]\n >>> paired_distances(X, Y)\n array([0., 1.])\n ", "source_code": "\ndef paired_distances(X, Y, *, metric='euclidean', **kwds):\n \"\"\"\n Computes the paired distances between X and Y.\n\n Computes the distances between (X[0], Y[0]), (X[1], Y[1]), etc...\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n X : ndarray of shape (n_samples, n_features)\n Array 1 for distance computation.\n\n Y : ndarray of shape (n_samples, n_features)\n Array 2 for distance computation.\n\n metric : str or callable, default=\"euclidean\"\n The metric to use when calculating distance between instances in a\n feature array. If metric is a string, it must be one of the options\n specified in PAIRED_DISTANCES, including \"euclidean\",\n \"manhattan\", or \"cosine\".\n Alternatively, if metric is a callable function, it is called on each\n pair of instances (rows) and the resulting value recorded. The callable\n should take two arrays from X as input and return a value indicating\n the distance between them.\n\n Returns\n -------\n distances : ndarray of shape (n_samples,)\n\n See Also\n --------\n pairwise_distances : Computes the distance between every pair of samples.\n\n Examples\n --------\n >>> from sklearn.metrics.pairwise import paired_distances\n >>> X = [[0, 1], [1, 1]]\n >>> Y = [[0, 1], [2, 1]]\n >>> paired_distances(X, Y)\n array([0., 1.])\n \"\"\"\n if metric in PAIRED_DISTANCES:\n func = PAIRED_DISTANCES[metric]\n return func(X, Y)\n elif callable(metric):\n (X, Y) = check_paired_arrays(X, Y)\n distances = np.zeros(len(X))\n for i in range(len(X)):\n distances[i] = metric(X[i], Y[i])\n return distances\n else:\n raise ValueError('Unknown distance %s' % metric)" }, { @@ -124936,7 +134445,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "" - } + }, + "refined_type": {} }, { "name": "Y", @@ -124946,13 +134456,14 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Computes the paired euclidean distances between X and Y.\n\nRead more in the :ref:`User Guide `.", - "docstring": "Computes the paired euclidean distances between X and Y.\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n\nY : array-like of shape (n_samples, n_features)\n\nReturns\n-------\ndistances : ndarray of shape (n_samples,)", + "docstring": "\n Computes the paired euclidean distances between X and Y.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n\n Y : array-like of shape (n_samples, n_features)\n\n Returns\n -------\n distances : ndarray of shape (n_samples,)\n ", "source_code": "\ndef paired_euclidean_distances(X, Y):\n \"\"\"\n Computes the paired euclidean distances between X and Y.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n\n Y : array-like of shape (n_samples, n_features)\n\n Returns\n -------\n distances : ndarray of 
shape (n_samples,)\n \"\"\"\n (X, Y) = check_paired_arrays(X, Y)\n return row_norms(X - Y)" }, { @@ -124970,7 +134481,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "" - } + }, + "refined_type": {} }, { "name": "Y", @@ -124980,13 +134492,14 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Compute the L1 distances between the vectors in X and Y.\n\nRead more in the :ref:`User Guide `.", - "docstring": "Compute the L1 distances between the vectors in X and Y.\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n\nY : array-like of shape (n_samples, n_features)\n\nReturns\n-------\ndistances : ndarray of shape (n_samples,)", + "docstring": "Compute the L1 distances between the vectors in X and Y.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n\n Y : array-like of shape (n_samples, n_features)\n\n Returns\n -------\n distances : ndarray of shape (n_samples,)\n ", "source_code": "\ndef paired_manhattan_distances(X, Y):\n \"\"\"Compute the L1 distances between the vectors in X and Y.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n\n Y : array-like of shape (n_samples, n_features)\n\n Returns\n -------\n distances : ndarray of shape (n_samples,)\n \"\"\"\n (X, Y) = check_paired_arrays(X, Y)\n diff = X - Y\n if issparse(diff):\n diff.data = np.abs(diff.data)\n return np.squeeze(np.array(diff.sum(axis=1)))\n else:\n return np.abs(diff).sum(axis=-1)" }, { @@ -125004,7 +134517,8 @@ "docstring": { "type": "ndarray of shape (n_samples_X, n_samples_X) or (n_samples_X, n_features)", "description": "Array of pairwise distances between samples, or a feature array.\nThe shape of the array should be (n_samples_X, n_samples_X) if\nmetric == \"precomputed\" and (n_samples_X, n_features) otherwise." - } + }, + "refined_type": {} }, { "name": "Y", @@ -125014,7 +134528,8 @@ "docstring": { "type": "ndarray of shape (n_samples_Y, n_features), default=None", "description": "An optional second feature array. Only allowed if\nmetric != \"precomputed\"." - } + }, + "refined_type": {} }, { "name": "metric", @@ -125024,7 +134539,8 @@ "docstring": { "type": "str or callable, default='euclidean'", "description": "The metric to use when calculating distance between instances in a\nfeature array. If metric is a string, it must be one of the options\nallowed by scipy.spatial.distance.pdist for its metric parameter, or\na metric listed in ``pairwise.PAIRWISE_DISTANCE_FUNCTIONS``.\nIf metric is \"precomputed\", X is assumed to be a distance matrix.\nAlternatively, if metric is a callable function, it is called on each\npair of instances (rows) and the resulting value recorded. The callable\nshould take two arrays from X as input and return a value indicating\nthe distance between them." - } + }, + "refined_type": {} }, { "name": "n_jobs", @@ -125034,7 +134550,8 @@ "docstring": { "type": "int, default=None", "description": "The number of jobs to use for the computation. This works by breaking\ndown the pairwise matrix into n_jobs even slices and computing them in\nparallel.\n\n``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n``-1`` means using all processors. See :term:`Glossary `\nfor more details." 
- } + }, + "refined_type": {} }, { "name": "force_all_finite", @@ -125044,13 +134561,14 @@ "docstring": { "type": "bool or 'allow-nan', default=True", "description": "Whether to raise an error on np.inf, np.nan, pd.NA in array. Ignored\nfor a metric listed in ``pairwise.PAIRWISE_DISTANCE_FUNCTIONS``. The\npossibilities are:\n\n- True: Force all values of array to be finite.\n- False: accepts np.inf, np.nan, pd.NA in array.\n- 'allow-nan': accepts only np.nan and pd.NA values in array. Values\n cannot be infinite.\n\n.. versionadded:: 0.22\n ``force_all_finite`` accepts the string ``'allow-nan'``.\n\n.. versionchanged:: 0.23\n Accepts `pd.NA` and converts it into `np.nan`." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Compute the distance matrix from a vector array X and optional Y.\n\nThis method takes either a vector array or a distance matrix, and returns a distance matrix. If the input is a vector array, the distances are computed. If the input is a distances matrix, it is returned instead. This method provides a safe way to take a distance matrix as input, while preserving compatibility with many other algorithms that take a vector array. If Y is given (default is None), then the returned matrix is the pairwise distance between the arrays from both X and Y. Valid values for metric are: - From scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2', 'manhattan']. These metrics support sparse matrix inputs. ['nan_euclidean'] but it does not yet support sparse matrices. - From scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev', 'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski', 'mahalanobis', 'minkowski', 'rogerstanimoto', 'russellrao', 'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean', 'yule'] See the documentation for scipy.spatial.distance for details on these metrics. These metrics do not support sparse matrix inputs. Note that in the case of 'cityblock', 'cosine' and 'euclidean' (which are valid scipy.spatial.distance metrics), the scikit-learn implementation will be used, which is faster and has support for sparse matrices (except for 'cityblock'). For a verbose description of the metrics from scikit-learn, see the __doc__ of the sklearn.pairwise.distance_metrics function. Read more in the :ref:`User Guide `.", - "docstring": "Compute the distance matrix from a vector array X and optional Y.\n\nThis method takes either a vector array or a distance matrix, and returns\na distance matrix. If the input is a vector array, the distances are\ncomputed. If the input is a distances matrix, it is returned instead.\n\nThis method provides a safe way to take a distance matrix as input, while\npreserving compatibility with many other algorithms that take a vector\narray.\n\nIf Y is given (default is None), then the returned matrix is the pairwise\ndistance between the arrays from both X and Y.\n\nValid values for metric are:\n\n- From scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2',\n 'manhattan']. These metrics support sparse matrix\n inputs.\n ['nan_euclidean'] but it does not yet support sparse matrices.\n\n- From scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev',\n 'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski', 'mahalanobis',\n 'minkowski', 'rogerstanimoto', 'russellrao', 'seuclidean',\n 'sokalmichener', 'sokalsneath', 'sqeuclidean', 'yule']\n See the documentation for scipy.spatial.distance for details on these\n metrics. 
These metrics do not support sparse matrix inputs.\n\nNote that in the case of 'cityblock', 'cosine' and 'euclidean' (which are\nvalid scipy.spatial.distance metrics), the scikit-learn implementation\nwill be used, which is faster and has support for sparse matrices (except\nfor 'cityblock'). For a verbose description of the metrics from\nscikit-learn, see the __doc__ of the sklearn.pairwise.distance_metrics\nfunction.\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\nX : ndarray of shape (n_samples_X, n_samples_X) or (n_samples_X, n_features)\n Array of pairwise distances between samples, or a feature array.\n The shape of the array should be (n_samples_X, n_samples_X) if\n metric == \"precomputed\" and (n_samples_X, n_features) otherwise.\n\nY : ndarray of shape (n_samples_Y, n_features), default=None\n An optional second feature array. Only allowed if\n metric != \"precomputed\".\n\nmetric : str or callable, default='euclidean'\n The metric to use when calculating distance between instances in a\n feature array. If metric is a string, it must be one of the options\n allowed by scipy.spatial.distance.pdist for its metric parameter, or\n a metric listed in ``pairwise.PAIRWISE_DISTANCE_FUNCTIONS``.\n If metric is \"precomputed\", X is assumed to be a distance matrix.\n Alternatively, if metric is a callable function, it is called on each\n pair of instances (rows) and the resulting value recorded. The callable\n should take two arrays from X as input and return a value indicating\n the distance between them.\n\nn_jobs : int, default=None\n The number of jobs to use for the computation. This works by breaking\n down the pairwise matrix into n_jobs even slices and computing them in\n parallel.\n\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\nforce_all_finite : bool or 'allow-nan', default=True\n Whether to raise an error on np.inf, np.nan, pd.NA in array. Ignored\n for a metric listed in ``pairwise.PAIRWISE_DISTANCE_FUNCTIONS``. The\n possibilities are:\n\n - True: Force all values of array to be finite.\n - False: accepts np.inf, np.nan, pd.NA in array.\n - 'allow-nan': accepts only np.nan and pd.NA values in array. Values\n cannot be infinite.\n\n .. versionadded:: 0.22\n ``force_all_finite`` accepts the string ``'allow-nan'``.\n\n .. versionchanged:: 0.23\n Accepts `pd.NA` and converts it into `np.nan`.\n\n**kwds : optional keyword parameters\n Any further parameters are passed directly to the distance function.\n If using a scipy.spatial.distance metric, the parameters are still\n metric dependent. See the scipy docs for usage examples.\n\nReturns\n-------\nD : ndarray of shape (n_samples_X, n_samples_X) or (n_samples_X, n_samples_Y)\n A distance matrix D such that D_{i, j} is the distance between the\n ith and jth vectors of the given matrix X, if Y is None.\n If Y is not None, then D_{i, j} is the distance between the ith array\n from X and the jth array from Y.\n\nSee Also\n--------\npairwise_distances_chunked : Performs the same calculation as this\n function, but returns a generator of chunks of the distance matrix, in\n order to limit memory usage.\npaired_distances : Computes the distances between corresponding elements\n of two arrays.", + "description": "Compute the distance matrix from a vector array X and optional Y.\n\nThis method takes either a vector array or a distance matrix, and returns\na distance matrix. 
If the input is a vector array, the distances are\ncomputed. If the input is a distances matrix, it is returned instead.\n\nThis method provides a safe way to take a distance matrix as input, while\npreserving compatibility with many other algorithms that take a vector\narray.\n\nIf Y is given (default is None), then the returned matrix is the pairwise\ndistance between the arrays from both X and Y.\n\nValid values for metric are:\n\n- From scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2',\n 'manhattan']. These metrics support sparse matrix\n inputs.\n ['nan_euclidean'] but it does not yet support sparse matrices.\n\n- From scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev',\n 'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski', 'mahalanobis',\n 'minkowski', 'rogerstanimoto', 'russellrao', 'seuclidean',\n 'sokalmichener', 'sokalsneath', 'sqeuclidean', 'yule']\n See the documentation for scipy.spatial.distance for details on these\n metrics. These metrics do not support sparse matrix inputs.\n\nNote that in the case of 'cityblock', 'cosine' and 'euclidean' (which are\nvalid scipy.spatial.distance metrics), the scikit-learn implementation\nwill be used, which is faster and has support for sparse matrices (except\nfor 'cityblock'). For a verbose description of the metrics from\nscikit-learn, see the __doc__ of the sklearn.pairwise.distance_metrics\nfunction.\n\nRead more in the :ref:`User Guide `.", + "docstring": "Compute the distance matrix from a vector array X and optional Y.\n\n This method takes either a vector array or a distance matrix, and returns\n a distance matrix. If the input is a vector array, the distances are\n computed. If the input is a distances matrix, it is returned instead.\n\n This method provides a safe way to take a distance matrix as input, while\n preserving compatibility with many other algorithms that take a vector\n array.\n\n If Y is given (default is None), then the returned matrix is the pairwise\n distance between the arrays from both X and Y.\n\n Valid values for metric are:\n\n - From scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2',\n 'manhattan']. These metrics support sparse matrix\n inputs.\n ['nan_euclidean'] but it does not yet support sparse matrices.\n\n - From scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev',\n 'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski', 'mahalanobis',\n 'minkowski', 'rogerstanimoto', 'russellrao', 'seuclidean',\n 'sokalmichener', 'sokalsneath', 'sqeuclidean', 'yule']\n See the documentation for scipy.spatial.distance for details on these\n metrics. These metrics do not support sparse matrix inputs.\n\n Note that in the case of 'cityblock', 'cosine' and 'euclidean' (which are\n valid scipy.spatial.distance metrics), the scikit-learn implementation\n will be used, which is faster and has support for sparse matrices (except\n for 'cityblock'). For a verbose description of the metrics from\n scikit-learn, see the __doc__ of the sklearn.pairwise.distance_metrics\n function.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n X : ndarray of shape (n_samples_X, n_samples_X) or (n_samples_X, n_features)\n Array of pairwise distances between samples, or a feature array.\n The shape of the array should be (n_samples_X, n_samples_X) if\n metric == \"precomputed\" and (n_samples_X, n_features) otherwise.\n\n Y : ndarray of shape (n_samples_Y, n_features), default=None\n An optional second feature array. 
Only allowed if\n metric != \"precomputed\".\n\n metric : str or callable, default='euclidean'\n The metric to use when calculating distance between instances in a\n feature array. If metric is a string, it must be one of the options\n allowed by scipy.spatial.distance.pdist for its metric parameter, or\n a metric listed in ``pairwise.PAIRWISE_DISTANCE_FUNCTIONS``.\n If metric is \"precomputed\", X is assumed to be a distance matrix.\n Alternatively, if metric is a callable function, it is called on each\n pair of instances (rows) and the resulting value recorded. The callable\n should take two arrays from X as input and return a value indicating\n the distance between them.\n\n n_jobs : int, default=None\n The number of jobs to use for the computation. This works by breaking\n down the pairwise matrix into n_jobs even slices and computing them in\n parallel.\n\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n force_all_finite : bool or 'allow-nan', default=True\n Whether to raise an error on np.inf, np.nan, pd.NA in array. Ignored\n for a metric listed in ``pairwise.PAIRWISE_DISTANCE_FUNCTIONS``. The\n possibilities are:\n\n - True: Force all values of array to be finite.\n - False: accepts np.inf, np.nan, pd.NA in array.\n - 'allow-nan': accepts only np.nan and pd.NA values in array. Values\n cannot be infinite.\n\n .. versionadded:: 0.22\n ``force_all_finite`` accepts the string ``'allow-nan'``.\n\n .. versionchanged:: 0.23\n Accepts `pd.NA` and converts it into `np.nan`.\n\n **kwds : optional keyword parameters\n Any further parameters are passed directly to the distance function.\n If using a scipy.spatial.distance metric, the parameters are still\n metric dependent. See the scipy docs for usage examples.\n\n Returns\n -------\n D : ndarray of shape (n_samples_X, n_samples_X) or (n_samples_X, n_samples_Y)\n A distance matrix D such that D_{i, j} is the distance between the\n ith and jth vectors of the given matrix X, if Y is None.\n If Y is not None, then D_{i, j} is the distance between the ith array\n from X and the jth array from Y.\n\n See Also\n --------\n pairwise_distances_chunked : Performs the same calculation as this\n function, but returns a generator of chunks of the distance matrix, in\n order to limit memory usage.\n paired_distances : Computes the distances between corresponding elements\n of two arrays.\n ", "source_code": "\ndef pairwise_distances(X, Y=None, metric='euclidean', *, n_jobs=None, force_all_finite=True, **kwds):\n \"\"\"Compute the distance matrix from a vector array X and optional Y.\n\n This method takes either a vector array or a distance matrix, and returns\n a distance matrix. If the input is a vector array, the distances are\n computed. If the input is a distances matrix, it is returned instead.\n\n This method provides a safe way to take a distance matrix as input, while\n preserving compatibility with many other algorithms that take a vector\n array.\n\n If Y is given (default is None), then the returned matrix is the pairwise\n distance between the arrays from both X and Y.\n\n Valid values for metric are:\n\n - From scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2',\n 'manhattan']. 
These metrics support sparse matrix\n inputs.\n ['nan_euclidean'] but it does not yet support sparse matrices.\n\n - From scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev',\n 'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski', 'mahalanobis',\n 'minkowski', 'rogerstanimoto', 'russellrao', 'seuclidean',\n 'sokalmichener', 'sokalsneath', 'sqeuclidean', 'yule']\n See the documentation for scipy.spatial.distance for details on these\n metrics. These metrics do not support sparse matrix inputs.\n\n Note that in the case of 'cityblock', 'cosine' and 'euclidean' (which are\n valid scipy.spatial.distance metrics), the scikit-learn implementation\n will be used, which is faster and has support for sparse matrices (except\n for 'cityblock'). For a verbose description of the metrics from\n scikit-learn, see the __doc__ of the sklearn.pairwise.distance_metrics\n function.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n X : ndarray of shape (n_samples_X, n_samples_X) or (n_samples_X, n_features)\n Array of pairwise distances between samples, or a feature array.\n The shape of the array should be (n_samples_X, n_samples_X) if\n metric == \"precomputed\" and (n_samples_X, n_features) otherwise.\n\n Y : ndarray of shape (n_samples_Y, n_features), default=None\n An optional second feature array. Only allowed if\n metric != \"precomputed\".\n\n metric : str or callable, default='euclidean'\n The metric to use when calculating distance between instances in a\n feature array. If metric is a string, it must be one of the options\n allowed by scipy.spatial.distance.pdist for its metric parameter, or\n a metric listed in ``pairwise.PAIRWISE_DISTANCE_FUNCTIONS``.\n If metric is \"precomputed\", X is assumed to be a distance matrix.\n Alternatively, if metric is a callable function, it is called on each\n pair of instances (rows) and the resulting value recorded. The callable\n should take two arrays from X as input and return a value indicating\n the distance between them.\n\n n_jobs : int, default=None\n The number of jobs to use for the computation. This works by breaking\n down the pairwise matrix into n_jobs even slices and computing them in\n parallel.\n\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n force_all_finite : bool or 'allow-nan', default=True\n Whether to raise an error on np.inf, np.nan, pd.NA in array. Ignored\n for a metric listed in ``pairwise.PAIRWISE_DISTANCE_FUNCTIONS``. The\n possibilities are:\n\n - True: Force all values of array to be finite.\n - False: accepts np.inf, np.nan, pd.NA in array.\n - 'allow-nan': accepts only np.nan and pd.NA values in array. Values\n cannot be infinite.\n\n .. versionadded:: 0.22\n ``force_all_finite`` accepts the string ``'allow-nan'``.\n\n .. versionchanged:: 0.23\n Accepts `pd.NA` and converts it into `np.nan`.\n\n **kwds : optional keyword parameters\n Any further parameters are passed directly to the distance function.\n If using a scipy.spatial.distance metric, the parameters are still\n metric dependent. 
See the scipy docs for usage examples.\n\n Returns\n -------\n D : ndarray of shape (n_samples_X, n_samples_X) or (n_samples_X, n_samples_Y)\n A distance matrix D such that D_{i, j} is the distance between the\n ith and jth vectors of the given matrix X, if Y is None.\n If Y is not None, then D_{i, j} is the distance between the ith array\n from X and the jth array from Y.\n\n See Also\n --------\n pairwise_distances_chunked : Performs the same calculation as this\n function, but returns a generator of chunks of the distance matrix, in\n order to limit memory usage.\n paired_distances : Computes the distances between corresponding elements\n of two arrays.\n \"\"\"\n if metric not in _VALID_METRICS and not callable(metric) and metric != 'precomputed':\n raise ValueError(\"Unknown metric %s. Valid metrics are %s, or 'precomputed', or a callable\" % (metric, _VALID_METRICS))\n if metric == 'precomputed':\n (X, _) = check_pairwise_arrays(X, Y, precomputed=True, force_all_finite=force_all_finite)\n whom = '`pairwise_distances`. Precomputed distance need to have non-negative values.'\n check_non_negative(X, whom=whom)\n return X\n elif metric in PAIRWISE_DISTANCE_FUNCTIONS:\n func = PAIRWISE_DISTANCE_FUNCTIONS[metric]\n elif callable(metric):\n func = partial(_pairwise_callable, metric=metric, force_all_finite=force_all_finite, **kwds)\n else:\n if issparse(X) or issparse(Y):\n raise TypeError('scipy distance metrics do not support sparse matrices.')\n dtype = bool if metric in PAIRWISE_BOOLEAN_FUNCTIONS else None\n if dtype == bool and (X.dtype != bool or Y is not None and Y.dtype != bool):\n msg = 'Data was converted to boolean for metric %s' % metric\n warnings.warn(msg, DataConversionWarning)\n (X, Y) = check_pairwise_arrays(X, Y, dtype=dtype, force_all_finite=force_all_finite)\n params = _precompute_metric_params(X, Y, metric=metric, **kwds)\n kwds.update(**params)\n if effective_n_jobs(n_jobs) == 1 and X is Y:\n return distance.squareform(distance.pdist(X, metric=metric, **kwds))\n func = partial(distance.cdist, metric=metric, **kwds)\n return _parallel_pairwise(X, Y, func, n_jobs, **kwds)" }, { @@ -125068,7 +134586,8 @@ "docstring": { "type": "array-like of shape (n_samples_X, n_features)", "description": "Array containing points." - } + }, + "refined_type": {} }, { "name": "Y", @@ -125078,7 +134597,8 @@ "docstring": { "type": "array-like of shape (n_samples_Y, n_features)", "description": "Arrays containing points." - } + }, + "refined_type": {} }, { "name": "axis", @@ -125088,7 +134608,8 @@ "docstring": { "type": "int, default=1", "description": "Axis along which the argmin and distances are to be computed." - } + }, + "refined_type": {} }, { "name": "metric", @@ -125098,7 +134619,8 @@ "docstring": { "type": "str or callable, default=\"euclidean\"", "description": "Metric to use for distance computation. Any metric from scikit-learn\nor scipy.spatial.distance can be used.\n\nIf metric is a callable function, it is called on each\npair of instances (rows) and the resulting value recorded. The callable\nshould take two arrays as input and return one value indicating the\ndistance between them. 
This works for Scipy's metrics, but is less\nefficient than passing the metric name as a string.\n\nDistance matrices are not supported.\n\nValid values for metric are:\n\n- from scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2',\n 'manhattan']\n\n- from scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev',\n 'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski',\n 'mahalanobis', 'minkowski', 'rogerstanimoto', 'russellrao',\n 'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean',\n 'yule']\n\nSee the documentation for scipy.spatial.distance for details on these\nmetrics." - } + }, + "refined_type": {} }, { "name": "metric_kwargs", @@ -125108,13 +134630,14 @@ "docstring": { "type": "dict, default=None", "description": "Keyword arguments to pass to specified metric function." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Compute minimum distances between one point and a set of points.\n\nThis function computes for each row in X, the index of the row of Y which is closest (according to the specified distance). This is mostly equivalent to calling: pairwise_distances(X, Y=Y, metric=metric).argmin(axis=axis) but uses much less memory, and is faster for large arrays. This function works with dense 2D arrays only.", - "docstring": "Compute minimum distances between one point and a set of points.\n\nThis function computes for each row in X, the index of the row of Y which\nis closest (according to the specified distance).\n\nThis is mostly equivalent to calling:\n\n pairwise_distances(X, Y=Y, metric=metric).argmin(axis=axis)\n\nbut uses much less memory, and is faster for large arrays.\n\nThis function works with dense 2D arrays only.\n\nParameters\n----------\nX : array-like of shape (n_samples_X, n_features)\n Array containing points.\n\nY : array-like of shape (n_samples_Y, n_features)\n Arrays containing points.\n\naxis : int, default=1\n Axis along which the argmin and distances are to be computed.\n\nmetric : str or callable, default=\"euclidean\"\n Metric to use for distance computation. Any metric from scikit-learn\n or scipy.spatial.distance can be used.\n\n If metric is a callable function, it is called on each\n pair of instances (rows) and the resulting value recorded. The callable\n should take two arrays as input and return one value indicating the\n distance between them. 
This works for Scipy's metrics, but is less\n efficient than passing the metric name as a string.\n\n Distance matrices are not supported.\n\n Valid values for metric are:\n\n - from scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2',\n 'manhattan']\n\n - from scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev',\n 'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski',\n 'mahalanobis', 'minkowski', 'rogerstanimoto', 'russellrao',\n 'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean',\n 'yule']\n\n See the documentation for scipy.spatial.distance for details on these\n metrics.\n\nmetric_kwargs : dict, default=None\n Keyword arguments to pass to specified metric function.\n\nReturns\n-------\nargmin : numpy.ndarray\n Y[argmin[i], :] is the row in Y that is closest to X[i, :].\n\nSee Also\n--------\nsklearn.metrics.pairwise_distances\nsklearn.metrics.pairwise_distances_argmin_min", + "description": "Compute minimum distances between one point and a set of points.\n\nThis function computes for each row in X, the index of the row of Y which\nis closest (according to the specified distance).\n\nThis is mostly equivalent to calling:\n\n pairwise_distances(X, Y=Y, metric=metric).argmin(axis=axis)\n\nbut uses much less memory, and is faster for large arrays.\n\nThis function works with dense 2D arrays only.", + "docstring": "Compute minimum distances between one point and a set of points.\n\n This function computes for each row in X, the index of the row of Y which\n is closest (according to the specified distance).\n\n This is mostly equivalent to calling:\n\n pairwise_distances(X, Y=Y, metric=metric).argmin(axis=axis)\n\n but uses much less memory, and is faster for large arrays.\n\n This function works with dense 2D arrays only.\n\n Parameters\n ----------\n X : array-like of shape (n_samples_X, n_features)\n Array containing points.\n\n Y : array-like of shape (n_samples_Y, n_features)\n Arrays containing points.\n\n axis : int, default=1\n Axis along which the argmin and distances are to be computed.\n\n metric : str or callable, default=\"euclidean\"\n Metric to use for distance computation. Any metric from scikit-learn\n or scipy.spatial.distance can be used.\n\n If metric is a callable function, it is called on each\n pair of instances (rows) and the resulting value recorded. The callable\n should take two arrays as input and return one value indicating the\n distance between them. 
This works for Scipy's metrics, but is less\n efficient than passing the metric name as a string.\n\n Distance matrices are not supported.\n\n Valid values for metric are:\n\n - from scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2',\n 'manhattan']\n\n - from scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev',\n 'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski',\n 'mahalanobis', 'minkowski', 'rogerstanimoto', 'russellrao',\n 'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean',\n 'yule']\n\n See the documentation for scipy.spatial.distance for details on these\n metrics.\n\n metric_kwargs : dict, default=None\n Keyword arguments to pass to specified metric function.\n\n Returns\n -------\n argmin : numpy.ndarray\n Y[argmin[i], :] is the row in Y that is closest to X[i, :].\n\n See Also\n --------\n sklearn.metrics.pairwise_distances\n sklearn.metrics.pairwise_distances_argmin_min\n ", "source_code": "\ndef pairwise_distances_argmin(X, Y, *, axis=1, metric='euclidean', metric_kwargs=None):\n \"\"\"Compute minimum distances between one point and a set of points.\n\n This function computes for each row in X, the index of the row of Y which\n is closest (according to the specified distance).\n\n This is mostly equivalent to calling:\n\n pairwise_distances(X, Y=Y, metric=metric).argmin(axis=axis)\n\n but uses much less memory, and is faster for large arrays.\n\n This function works with dense 2D arrays only.\n\n Parameters\n ----------\n X : array-like of shape (n_samples_X, n_features)\n Array containing points.\n\n Y : array-like of shape (n_samples_Y, n_features)\n Arrays containing points.\n\n axis : int, default=1\n Axis along which the argmin and distances are to be computed.\n\n metric : str or callable, default=\"euclidean\"\n Metric to use for distance computation. Any metric from scikit-learn\n or scipy.spatial.distance can be used.\n\n If metric is a callable function, it is called on each\n pair of instances (rows) and the resulting value recorded. The callable\n should take two arrays as input and return one value indicating the\n distance between them. This works for Scipy's metrics, but is less\n efficient than passing the metric name as a string.\n\n Distance matrices are not supported.\n\n Valid values for metric are:\n\n - from scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2',\n 'manhattan']\n\n - from scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev',\n 'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski',\n 'mahalanobis', 'minkowski', 'rogerstanimoto', 'russellrao',\n 'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean',\n 'yule']\n\n See the documentation for scipy.spatial.distance for details on these\n metrics.\n\n metric_kwargs : dict, default=None\n Keyword arguments to pass to specified metric function.\n\n Returns\n -------\n argmin : numpy.ndarray\n Y[argmin[i], :] is the row in Y that is closest to X[i, :].\n\n See Also\n --------\n sklearn.metrics.pairwise_distances\n sklearn.metrics.pairwise_distances_argmin_min\n \"\"\"\n if metric_kwargs is None:\n metric_kwargs = {}\n return pairwise_distances_argmin_min(X, Y, axis=axis, metric=metric, metric_kwargs=metric_kwargs)[0]" }, { @@ -125132,6 +134655,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples_X, n_features)", "description": "Array containing points." 
+ }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -125142,6 +134669,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples_Y, n_features)", "description": "Array containing points." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -125152,7 +134683,8 @@ "docstring": { "type": "int, default=1", "description": "Axis along which the argmin and distances are to be computed." - } + }, + "refined_type": {} }, { "name": "metric", @@ -125162,7 +134694,8 @@ "docstring": { "type": "str or callable, default='euclidean'", "description": "Metric to use for distance computation. Any metric from scikit-learn\nor scipy.spatial.distance can be used.\n\nIf metric is a callable function, it is called on each\npair of instances (rows) and the resulting value recorded. The callable\nshould take two arrays as input and return one value indicating the\ndistance between them. This works for Scipy's metrics, but is less\nefficient than passing the metric name as a string.\n\nDistance matrices are not supported.\n\nValid values for metric are:\n\n- from scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2',\n 'manhattan']\n\n- from scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev',\n 'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski',\n 'mahalanobis', 'minkowski', 'rogerstanimoto', 'russellrao',\n 'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean',\n 'yule']\n\nSee the documentation for scipy.spatial.distance for details on these\nmetrics." - } + }, + "refined_type": {} }, { "name": "metric_kwargs", @@ -125172,13 +134705,14 @@ "docstring": { "type": "dict, default=None", "description": "Keyword arguments to pass to specified metric function." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Compute minimum distances between one point and a set of points.\n\nThis function computes for each row in X, the index of the row of Y which is closest (according to the specified distance). The minimal distances are also returned. This is mostly equivalent to calling: (pairwise_distances(X, Y=Y, metric=metric).argmin(axis=axis), pairwise_distances(X, Y=Y, metric=metric).min(axis=axis)) but uses much less memory, and is faster for large arrays.", - "docstring": "Compute minimum distances between one point and a set of points.\n\nThis function computes for each row in X, the index of the row of Y which\nis closest (according to the specified distance). The minimal distances are\nalso returned.\n\nThis is mostly equivalent to calling:\n\n (pairwise_distances(X, Y=Y, metric=metric).argmin(axis=axis),\n pairwise_distances(X, Y=Y, metric=metric).min(axis=axis))\n\nbut uses much less memory, and is faster for large arrays.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples_X, n_features)\n Array containing points.\n\nY : {array-like, sparse matrix} of shape (n_samples_Y, n_features)\n Array containing points.\n\naxis : int, default=1\n Axis along which the argmin and distances are to be computed.\n\nmetric : str or callable, default='euclidean'\n Metric to use for distance computation. Any metric from scikit-learn\n or scipy.spatial.distance can be used.\n\n If metric is a callable function, it is called on each\n pair of instances (rows) and the resulting value recorded. The callable\n should take two arrays as input and return one value indicating the\n distance between them. 
This works for Scipy's metrics, but is less\n efficient than passing the metric name as a string.\n\n Distance matrices are not supported.\n\n Valid values for metric are:\n\n - from scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2',\n 'manhattan']\n\n - from scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev',\n 'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski',\n 'mahalanobis', 'minkowski', 'rogerstanimoto', 'russellrao',\n 'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean',\n 'yule']\n\n See the documentation for scipy.spatial.distance for details on these\n metrics.\n\nmetric_kwargs : dict, default=None\n Keyword arguments to pass to specified metric function.\n\nReturns\n-------\nargmin : ndarray\n Y[argmin[i], :] is the row in Y that is closest to X[i, :].\n\ndistances : ndarray\n distances[i] is the distance between the i-th row in X and the\n argmin[i]-th row in Y.\n\nSee Also\n--------\nsklearn.metrics.pairwise_distances\nsklearn.metrics.pairwise_distances_argmin", + "description": "Compute minimum distances between one point and a set of points.\n\nThis function computes for each row in X, the index of the row of Y which\nis closest (according to the specified distance). The minimal distances are\nalso returned.\n\nThis is mostly equivalent to calling:\n\n (pairwise_distances(X, Y=Y, metric=metric).argmin(axis=axis),\n pairwise_distances(X, Y=Y, metric=metric).min(axis=axis))\n\nbut uses much less memory, and is faster for large arrays.", + "docstring": "Compute minimum distances between one point and a set of points.\n\n This function computes for each row in X, the index of the row of Y which\n is closest (according to the specified distance). The minimal distances are\n also returned.\n\n This is mostly equivalent to calling:\n\n (pairwise_distances(X, Y=Y, metric=metric).argmin(axis=axis),\n pairwise_distances(X, Y=Y, metric=metric).min(axis=axis))\n\n but uses much less memory, and is faster for large arrays.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples_X, n_features)\n Array containing points.\n\n Y : {array-like, sparse matrix} of shape (n_samples_Y, n_features)\n Array containing points.\n\n axis : int, default=1\n Axis along which the argmin and distances are to be computed.\n\n metric : str or callable, default='euclidean'\n Metric to use for distance computation. Any metric from scikit-learn\n or scipy.spatial.distance can be used.\n\n If metric is a callable function, it is called on each\n pair of instances (rows) and the resulting value recorded. The callable\n should take two arrays as input and return one value indicating the\n distance between them. 
This works for Scipy's metrics, but is less\n efficient than passing the metric name as a string.\n\n Distance matrices are not supported.\n\n Valid values for metric are:\n\n - from scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2',\n 'manhattan']\n\n - from scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev',\n 'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski',\n 'mahalanobis', 'minkowski', 'rogerstanimoto', 'russellrao',\n 'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean',\n 'yule']\n\n See the documentation for scipy.spatial.distance for details on these\n metrics.\n\n metric_kwargs : dict, default=None\n Keyword arguments to pass to specified metric function.\n\n Returns\n -------\n argmin : ndarray\n Y[argmin[i], :] is the row in Y that is closest to X[i, :].\n\n distances : ndarray\n distances[i] is the distance between the i-th row in X and the\n argmin[i]-th row in Y.\n\n See Also\n --------\n sklearn.metrics.pairwise_distances\n sklearn.metrics.pairwise_distances_argmin\n ", "source_code": "\ndef pairwise_distances_argmin_min(X, Y, *, axis=1, metric='euclidean', metric_kwargs=None):\n \"\"\"Compute minimum distances between one point and a set of points.\n\n This function computes for each row in X, the index of the row of Y which\n is closest (according to the specified distance). The minimal distances are\n also returned.\n\n This is mostly equivalent to calling:\n\n (pairwise_distances(X, Y=Y, metric=metric).argmin(axis=axis),\n pairwise_distances(X, Y=Y, metric=metric).min(axis=axis))\n\n but uses much less memory, and is faster for large arrays.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples_X, n_features)\n Array containing points.\n\n Y : {array-like, sparse matrix} of shape (n_samples_Y, n_features)\n Array containing points.\n\n axis : int, default=1\n Axis along which the argmin and distances are to be computed.\n\n metric : str or callable, default='euclidean'\n Metric to use for distance computation. Any metric from scikit-learn\n or scipy.spatial.distance can be used.\n\n If metric is a callable function, it is called on each\n pair of instances (rows) and the resulting value recorded. The callable\n should take two arrays as input and return one value indicating the\n distance between them. 
This works for Scipy's metrics, but is less\n efficient than passing the metric name as a string.\n\n Distance matrices are not supported.\n\n Valid values for metric are:\n\n - from scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2',\n 'manhattan']\n\n - from scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev',\n 'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski',\n 'mahalanobis', 'minkowski', 'rogerstanimoto', 'russellrao',\n 'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean',\n 'yule']\n\n See the documentation for scipy.spatial.distance for details on these\n metrics.\n\n metric_kwargs : dict, default=None\n Keyword arguments to pass to specified metric function.\n\n Returns\n -------\n argmin : ndarray\n Y[argmin[i], :] is the row in Y that is closest to X[i, :].\n\n distances : ndarray\n distances[i] is the distance between the i-th row in X and the\n argmin[i]-th row in Y.\n\n See Also\n --------\n sklearn.metrics.pairwise_distances\n sklearn.metrics.pairwise_distances_argmin\n \"\"\"\n (X, Y) = check_pairwise_arrays(X, Y)\n if metric_kwargs is None:\n metric_kwargs = {}\n if axis == 0:\n (X, Y) = (Y, X)\n (indices, values) = zip(*pairwise_distances_chunked(X, Y, reduce_func=_argmin_min_reduce, metric=metric, **metric_kwargs))\n indices = np.concatenate(indices)\n values = np.concatenate(values)\n return indices, values" }, { @@ -125196,7 +134730,8 @@ "docstring": { "type": "ndarray of shape (n_samples_X, n_samples_X) or (n_samples_X, n_features)", "description": "Array of pairwise distances between samples, or a feature array.\nThe shape the array should be (n_samples_X, n_samples_X) if\nmetric='precomputed' and (n_samples_X, n_features) otherwise." - } + }, + "refined_type": {} }, { "name": "Y", @@ -125206,7 +134741,8 @@ "docstring": { "type": "ndarray of shape (n_samples_Y, n_features), default=None", "description": "An optional second feature array. Only allowed if\nmetric != \"precomputed\"." - } + }, + "refined_type": {} }, { "name": "reduce_func", @@ -125216,7 +134752,8 @@ "docstring": { "type": "callable, default=None", "description": "The function which is applied on each chunk of the distance matrix,\nreducing it to needed values. ``reduce_func(D_chunk, start)``\nis called repeatedly, where ``D_chunk`` is a contiguous vertical\nslice of the pairwise distance matrix, starting at row ``start``.\nIt should return one of: None; an array, a list, or a sparse matrix\nof length ``D_chunk.shape[0]``; or a tuple of such objects. Returning\nNone is useful for in-place operations, rather than reductions.\n\nIf None, pairwise_distances_chunked returns a generator of vertical\nchunks of the distance matrix." - } + }, + "refined_type": {} }, { "name": "metric", @@ -125226,7 +134763,8 @@ "docstring": { "type": "str or callable, default='euclidean'", "description": "The metric to use when calculating distance between instances in a\nfeature array. If metric is a string, it must be one of the options\nallowed by scipy.spatial.distance.pdist for its metric parameter, or\na metric listed in pairwise.PAIRWISE_DISTANCE_FUNCTIONS.\nIf metric is \"precomputed\", X is assumed to be a distance matrix.\nAlternatively, if metric is a callable function, it is called on each\npair of instances (rows) and the resulting value recorded. The callable\nshould take two arrays from X as input and return a value indicating\nthe distance between them." 
- } + }, + "refined_type": {} }, { "name": "n_jobs", @@ -125236,7 +134774,8 @@ "docstring": { "type": "int, default=None", "description": "The number of jobs to use for the computation. This works by breaking\ndown the pairwise matrix into n_jobs even slices and computing them in\nparallel.\n\n``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n``-1`` means using all processors. See :term:`Glossary `\nfor more details." - } + }, + "refined_type": {} }, { "name": "working_memory", @@ -125246,13 +134785,14 @@ "docstring": { "type": "int, default=None", "description": "The sought maximum memory for temporary distance matrix chunks.\nWhen None (default), the value of\n``sklearn.get_config()['working_memory']`` is used." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Generate a distance matrix chunk by chunk with optional reduction.\n\nIn cases where not all of a pairwise distance matrix needs to be stored at once, this is used to calculate pairwise distances in ``working_memory``-sized chunks. If ``reduce_func`` is given, it is run on each chunk and its return values are concatenated into lists, arrays or sparse matrices.", - "docstring": "Generate a distance matrix chunk by chunk with optional reduction.\n\nIn cases where not all of a pairwise distance matrix needs to be stored at\nonce, this is used to calculate pairwise distances in\n``working_memory``-sized chunks. If ``reduce_func`` is given, it is run\non each chunk and its return values are concatenated into lists, arrays\nor sparse matrices.\n\nParameters\n----------\nX : ndarray of shape (n_samples_X, n_samples_X) or (n_samples_X, n_features)\n Array of pairwise distances between samples, or a feature array.\n The shape the array should be (n_samples_X, n_samples_X) if\n metric='precomputed' and (n_samples_X, n_features) otherwise.\n\nY : ndarray of shape (n_samples_Y, n_features), default=None\n An optional second feature array. Only allowed if\n metric != \"precomputed\".\n\nreduce_func : callable, default=None\n The function which is applied on each chunk of the distance matrix,\n reducing it to needed values. ``reduce_func(D_chunk, start)``\n is called repeatedly, where ``D_chunk`` is a contiguous vertical\n slice of the pairwise distance matrix, starting at row ``start``.\n It should return one of: None; an array, a list, or a sparse matrix\n of length ``D_chunk.shape[0]``; or a tuple of such objects. Returning\n None is useful for in-place operations, rather than reductions.\n\n If None, pairwise_distances_chunked returns a generator of vertical\n chunks of the distance matrix.\n\nmetric : str or callable, default='euclidean'\n The metric to use when calculating distance between instances in a\n feature array. If metric is a string, it must be one of the options\n allowed by scipy.spatial.distance.pdist for its metric parameter, or\n a metric listed in pairwise.PAIRWISE_DISTANCE_FUNCTIONS.\n If metric is \"precomputed\", X is assumed to be a distance matrix.\n Alternatively, if metric is a callable function, it is called on each\n pair of instances (rows) and the resulting value recorded. The callable\n should take two arrays from X as input and return a value indicating\n the distance between them.\n\nn_jobs : int, default=None\n The number of jobs to use for the computation. 
This works by breaking\n down the pairwise matrix into n_jobs even slices and computing them in\n parallel.\n\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\nworking_memory : int, default=None\n The sought maximum memory for temporary distance matrix chunks.\n When None (default), the value of\n ``sklearn.get_config()['working_memory']`` is used.\n\n`**kwds` : optional keyword parameters\n Any further parameters are passed directly to the distance function.\n If using a scipy.spatial.distance metric, the parameters are still\n metric dependent. See the scipy docs for usage examples.\n\nYields\n------\nD_chunk : {ndarray, sparse matrix}\n A contiguous slice of distance matrix, optionally processed by\n ``reduce_func``.\n\nExamples\n--------\nWithout reduce_func:\n\n>>> import numpy as np\n>>> from sklearn.metrics import pairwise_distances_chunked\n>>> X = np.random.RandomState(0).rand(5, 3)\n>>> D_chunk = next(pairwise_distances_chunked(X))\n>>> D_chunk\narray([[0. ..., 0.29..., 0.41..., 0.19..., 0.57...],\n [0.29..., 0. ..., 0.57..., 0.41..., 0.76...],\n [0.41..., 0.57..., 0. ..., 0.44..., 0.90...],\n [0.19..., 0.41..., 0.44..., 0. ..., 0.51...],\n [0.57..., 0.76..., 0.90..., 0.51..., 0. ...]])\n\nRetrieve all neighbors and average distance within radius r:\n\n>>> r = .2\n>>> def reduce_func(D_chunk, start):\n... neigh = [np.flatnonzero(d < r) for d in D_chunk]\n... avg_dist = (D_chunk * (D_chunk < r)).mean(axis=1)\n... return neigh, avg_dist\n>>> gen = pairwise_distances_chunked(X, reduce_func=reduce_func)\n>>> neigh, avg_dist = next(gen)\n>>> neigh\n[array([0, 3]), array([1]), array([2]), array([0, 3]), array([4])]\n>>> avg_dist\narray([0.039..., 0. , 0. , 0.039..., 0. ])\n\nWhere r is defined per sample, we need to make use of ``start``:\n\n>>> r = [.2, .4, .4, .3, .1]\n>>> def reduce_func(D_chunk, start):\n... neigh = [np.flatnonzero(d < r[i])\n... for i, d in enumerate(D_chunk, start)]\n... return neigh\n>>> neigh = next(pairwise_distances_chunked(X, reduce_func=reduce_func))\n>>> neigh\n[array([0, 3]), array([0, 1]), array([2]), array([0, 3]), array([4])]\n\nForce row-by-row generation by reducing ``working_memory``:\n\n>>> gen = pairwise_distances_chunked(X, reduce_func=reduce_func,\n... working_memory=0)\n>>> next(gen)\n[array([0, 3])]\n>>> next(gen)\n[array([0, 1])]", + "description": "Generate a distance matrix chunk by chunk with optional reduction.\n\nIn cases where not all of a pairwise distance matrix needs to be stored at\nonce, this is used to calculate pairwise distances in\n``working_memory``-sized chunks. If ``reduce_func`` is given, it is run\non each chunk and its return values are concatenated into lists, arrays\nor sparse matrices.", + "docstring": "Generate a distance matrix chunk by chunk with optional reduction.\n\n In cases where not all of a pairwise distance matrix needs to be stored at\n once, this is used to calculate pairwise distances in\n ``working_memory``-sized chunks. 
If ``reduce_func`` is given, it is run\n on each chunk and its return values are concatenated into lists, arrays\n or sparse matrices.\n\n Parameters\n ----------\n X : ndarray of shape (n_samples_X, n_samples_X) or (n_samples_X, n_features)\n Array of pairwise distances between samples, or a feature array.\n The shape the array should be (n_samples_X, n_samples_X) if\n metric='precomputed' and (n_samples_X, n_features) otherwise.\n\n Y : ndarray of shape (n_samples_Y, n_features), default=None\n An optional second feature array. Only allowed if\n metric != \"precomputed\".\n\n reduce_func : callable, default=None\n The function which is applied on each chunk of the distance matrix,\n reducing it to needed values. ``reduce_func(D_chunk, start)``\n is called repeatedly, where ``D_chunk`` is a contiguous vertical\n slice of the pairwise distance matrix, starting at row ``start``.\n It should return one of: None; an array, a list, or a sparse matrix\n of length ``D_chunk.shape[0]``; or a tuple of such objects. Returning\n None is useful for in-place operations, rather than reductions.\n\n If None, pairwise_distances_chunked returns a generator of vertical\n chunks of the distance matrix.\n\n metric : str or callable, default='euclidean'\n The metric to use when calculating distance between instances in a\n feature array. If metric is a string, it must be one of the options\n allowed by scipy.spatial.distance.pdist for its metric parameter, or\n a metric listed in pairwise.PAIRWISE_DISTANCE_FUNCTIONS.\n If metric is \"precomputed\", X is assumed to be a distance matrix.\n Alternatively, if metric is a callable function, it is called on each\n pair of instances (rows) and the resulting value recorded. The callable\n should take two arrays from X as input and return a value indicating\n the distance between them.\n\n n_jobs : int, default=None\n The number of jobs to use for the computation. This works by breaking\n down the pairwise matrix into n_jobs even slices and computing them in\n parallel.\n\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n working_memory : int, default=None\n The sought maximum memory for temporary distance matrix chunks.\n When None (default), the value of\n ``sklearn.get_config()['working_memory']`` is used.\n\n `**kwds` : optional keyword parameters\n Any further parameters are passed directly to the distance function.\n If using a scipy.spatial.distance metric, the parameters are still\n metric dependent. See the scipy docs for usage examples.\n\n Yields\n ------\n D_chunk : {ndarray, sparse matrix}\n A contiguous slice of distance matrix, optionally processed by\n ``reduce_func``.\n\n Examples\n --------\n Without reduce_func:\n\n >>> import numpy as np\n >>> from sklearn.metrics import pairwise_distances_chunked\n >>> X = np.random.RandomState(0).rand(5, 3)\n >>> D_chunk = next(pairwise_distances_chunked(X))\n >>> D_chunk\n array([[0. ..., 0.29..., 0.41..., 0.19..., 0.57...],\n [0.29..., 0. ..., 0.57..., 0.41..., 0.76...],\n [0.41..., 0.57..., 0. ..., 0.44..., 0.90...],\n [0.19..., 0.41..., 0.44..., 0. ..., 0.51...],\n [0.57..., 0.76..., 0.90..., 0.51..., 0. ...]])\n\n Retrieve all neighbors and average distance within radius r:\n\n >>> r = .2\n >>> def reduce_func(D_chunk, start):\n ... neigh = [np.flatnonzero(d < r) for d in D_chunk]\n ... avg_dist = (D_chunk * (D_chunk < r)).mean(axis=1)\n ... 
return neigh, avg_dist\n >>> gen = pairwise_distances_chunked(X, reduce_func=reduce_func)\n >>> neigh, avg_dist = next(gen)\n >>> neigh\n [array([0, 3]), array([1]), array([2]), array([0, 3]), array([4])]\n >>> avg_dist\n array([0.039..., 0. , 0. , 0.039..., 0. ])\n\n Where r is defined per sample, we need to make use of ``start``:\n\n >>> r = [.2, .4, .4, .3, .1]\n >>> def reduce_func(D_chunk, start):\n ... neigh = [np.flatnonzero(d < r[i])\n ... for i, d in enumerate(D_chunk, start)]\n ... return neigh\n >>> neigh = next(pairwise_distances_chunked(X, reduce_func=reduce_func))\n >>> neigh\n [array([0, 3]), array([0, 1]), array([2]), array([0, 3]), array([4])]\n\n Force row-by-row generation by reducing ``working_memory``:\n\n >>> gen = pairwise_distances_chunked(X, reduce_func=reduce_func,\n ... working_memory=0)\n >>> next(gen)\n [array([0, 3])]\n >>> next(gen)\n [array([0, 1])]\n ", "source_code": "\ndef pairwise_distances_chunked(X, Y=None, *, reduce_func=None, metric='euclidean', n_jobs=None, working_memory=None, **kwds):\n \"\"\"Generate a distance matrix chunk by chunk with optional reduction.\n\n In cases where not all of a pairwise distance matrix needs to be stored at\n once, this is used to calculate pairwise distances in\n ``working_memory``-sized chunks. If ``reduce_func`` is given, it is run\n on each chunk and its return values are concatenated into lists, arrays\n or sparse matrices.\n\n Parameters\n ----------\n X : ndarray of shape (n_samples_X, n_samples_X) or (n_samples_X, n_features)\n Array of pairwise distances between samples, or a feature array.\n The shape the array should be (n_samples_X, n_samples_X) if\n metric='precomputed' and (n_samples_X, n_features) otherwise.\n\n Y : ndarray of shape (n_samples_Y, n_features), default=None\n An optional second feature array. Only allowed if\n metric != \"precomputed\".\n\n reduce_func : callable, default=None\n The function which is applied on each chunk of the distance matrix,\n reducing it to needed values. ``reduce_func(D_chunk, start)``\n is called repeatedly, where ``D_chunk`` is a contiguous vertical\n slice of the pairwise distance matrix, starting at row ``start``.\n It should return one of: None; an array, a list, or a sparse matrix\n of length ``D_chunk.shape[0]``; or a tuple of such objects. Returning\n None is useful for in-place operations, rather than reductions.\n\n If None, pairwise_distances_chunked returns a generator of vertical\n chunks of the distance matrix.\n\n metric : str or callable, default='euclidean'\n The metric to use when calculating distance between instances in a\n feature array. If metric is a string, it must be one of the options\n allowed by scipy.spatial.distance.pdist for its metric parameter, or\n a metric listed in pairwise.PAIRWISE_DISTANCE_FUNCTIONS.\n If metric is \"precomputed\", X is assumed to be a distance matrix.\n Alternatively, if metric is a callable function, it is called on each\n pair of instances (rows) and the resulting value recorded. The callable\n should take two arrays from X as input and return a value indicating\n the distance between them.\n\n n_jobs : int, default=None\n The number of jobs to use for the computation. This works by breaking\n down the pairwise matrix into n_jobs even slices and computing them in\n parallel.\n\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. 
See :term:`Glossary `\n for more details.\n\n working_memory : int, default=None\n The sought maximum memory for temporary distance matrix chunks.\n When None (default), the value of\n ``sklearn.get_config()['working_memory']`` is used.\n\n `**kwds` : optional keyword parameters\n Any further parameters are passed directly to the distance function.\n If using a scipy.spatial.distance metric, the parameters are still\n metric dependent. See the scipy docs for usage examples.\n\n Yields\n ------\n D_chunk : {ndarray, sparse matrix}\n A contiguous slice of distance matrix, optionally processed by\n ``reduce_func``.\n\n Examples\n --------\n Without reduce_func:\n\n >>> import numpy as np\n >>> from sklearn.metrics import pairwise_distances_chunked\n >>> X = np.random.RandomState(0).rand(5, 3)\n >>> D_chunk = next(pairwise_distances_chunked(X))\n >>> D_chunk\n array([[0. ..., 0.29..., 0.41..., 0.19..., 0.57...],\n [0.29..., 0. ..., 0.57..., 0.41..., 0.76...],\n [0.41..., 0.57..., 0. ..., 0.44..., 0.90...],\n [0.19..., 0.41..., 0.44..., 0. ..., 0.51...],\n [0.57..., 0.76..., 0.90..., 0.51..., 0. ...]])\n\n Retrieve all neighbors and average distance within radius r:\n\n >>> r = .2\n >>> def reduce_func(D_chunk, start):\n ... neigh = [np.flatnonzero(d < r) for d in D_chunk]\n ... avg_dist = (D_chunk * (D_chunk < r)).mean(axis=1)\n ... return neigh, avg_dist\n >>> gen = pairwise_distances_chunked(X, reduce_func=reduce_func)\n >>> neigh, avg_dist = next(gen)\n >>> neigh\n [array([0, 3]), array([1]), array([2]), array([0, 3]), array([4])]\n >>> avg_dist\n array([0.039..., 0. , 0. , 0.039..., 0. ])\n\n Where r is defined per sample, we need to make use of ``start``:\n\n >>> r = [.2, .4, .4, .3, .1]\n >>> def reduce_func(D_chunk, start):\n ... neigh = [np.flatnonzero(d < r[i])\n ... for i, d in enumerate(D_chunk, start)]\n ... return neigh\n >>> neigh = next(pairwise_distances_chunked(X, reduce_func=reduce_func))\n >>> neigh\n [array([0, 3]), array([0, 1]), array([2]), array([0, 3]), array([4])]\n\n Force row-by-row generation by reducing ``working_memory``:\n\n >>> gen = pairwise_distances_chunked(X, reduce_func=reduce_func,\n ... working_memory=0)\n >>> next(gen)\n [array([0, 3])]\n >>> next(gen)\n [array([0, 1])]\n \"\"\"\n n_samples_X = _num_samples(X)\n if metric == 'precomputed':\n slices = (slice(0, n_samples_X), )\n else:\n if Y is None:\n Y = X\n chunk_n_rows = get_chunk_n_rows(row_bytes=8 * _num_samples(Y), max_n_rows=n_samples_X, working_memory=working_memory)\n slices = gen_batches(n_samples_X, chunk_n_rows)\n params = _precompute_metric_params(X, Y, metric=metric, **kwds)\n kwds.update(**params)\n for sl in slices:\n if sl.start == 0 and sl.stop == n_samples_X:\n X_chunk = X\n else:\n X_chunk = X[sl]\n D_chunk = pairwise_distances(X_chunk, Y, metric=metric, n_jobs=n_jobs, **kwds)\n if (X is Y or Y is None) and PAIRWISE_DISTANCE_FUNCTIONS.get(metric, None) is euclidean_distances:\n D_chunk.flat[sl.start::_num_samples(X) + 1] = 0\n if reduce_func is not None:\n chunk_size = D_chunk.shape[0]\n D_chunk = reduce_func(D_chunk, sl.start)\n _check_chunk_size(D_chunk, chunk_size)\n yield D_chunk" }, { @@ -125270,7 +134810,8 @@ "docstring": { "type": "ndarray of shape (n_samples_X, n_samples_X) or (n_samples_X, n_features)", "description": "Array of pairwise kernels between samples, or a feature array.\nThe shape of the array should be (n_samples_X, n_samples_X) if\nmetric == \"precomputed\" and (n_samples_X, n_features) otherwise." 
- } + }, + "refined_type": {} }, { "name": "Y", @@ -125280,7 +134821,8 @@ "docstring": { "type": "ndarray of shape (n_samples_Y, n_features), default=None", "description": "A second feature array only if X has shape (n_samples_X, n_features)." - } + }, + "refined_type": {} }, { "name": "metric", @@ -125290,7 +134832,8 @@ "docstring": { "type": "str or callable, default=\"linear\"", "description": "The metric to use when calculating kernel between instances in a\nfeature array. If metric is a string, it must be one of the metrics\nin pairwise.PAIRWISE_KERNEL_FUNCTIONS.\nIf metric is \"precomputed\", X is assumed to be a kernel matrix.\nAlternatively, if metric is a callable function, it is called on each\npair of instances (rows) and the resulting value recorded. The callable\nshould take two rows from X as input and return the corresponding\nkernel value as a single number. This means that callables from\n:mod:`sklearn.metrics.pairwise` are not allowed, as they operate on\nmatrices, not single samples. Use the string identifying the kernel\ninstead." - } + }, + "refined_type": {} }, { "name": "filter_params", @@ -125300,7 +134843,8 @@ "docstring": { "type": "bool, default=False", "description": "Whether to filter invalid parameters or not." - } + }, + "refined_type": {} }, { "name": "n_jobs", @@ -125310,13 +134854,14 @@ "docstring": { "type": "int, default=None", "description": "The number of jobs to use for the computation. This works by breaking\ndown the pairwise matrix into n_jobs even slices and computing them in\nparallel.\n\n``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n``-1`` means using all processors. See :term:`Glossary `\nfor more details." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Compute the kernel between arrays X and optional array Y.\n\nThis method takes either a vector array or a kernel matrix, and returns a kernel matrix. If the input is a vector array, the kernels are computed. If the input is a kernel matrix, it is returned instead. This method provides a safe way to take a kernel matrix as input, while preserving compatibility with many other algorithms that take a vector array. If Y is given (default is None), then the returned matrix is the pairwise kernel between the arrays from both X and Y. Valid values for metric are: ['additive_chi2', 'chi2', 'linear', 'poly', 'polynomial', 'rbf', 'laplacian', 'sigmoid', 'cosine'] Read more in the :ref:`User Guide `.", - "docstring": "Compute the kernel between arrays X and optional array Y.\n\nThis method takes either a vector array or a kernel matrix, and returns\na kernel matrix. If the input is a vector array, the kernels are\ncomputed. 
If the input is a kernel matrix, it is returned instead.\n\nThis method provides a safe way to take a kernel matrix as input, while\npreserving compatibility with many other algorithms that take a vector\narray.\n\nIf Y is given (default is None), then the returned matrix is the pairwise\nkernel between the arrays from both X and Y.\n\nValid values for metric are:\n ['additive_chi2', 'chi2', 'linear', 'poly', 'polynomial', 'rbf',\n 'laplacian', 'sigmoid', 'cosine']\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\nX : ndarray of shape (n_samples_X, n_samples_X) or (n_samples_X, n_features)\n Array of pairwise kernels between samples, or a feature array.\n The shape of the array should be (n_samples_X, n_samples_X) if\n metric == \"precomputed\" and (n_samples_X, n_features) otherwise.\n\nY : ndarray of shape (n_samples_Y, n_features), default=None\n A second feature array only if X has shape (n_samples_X, n_features).\n\nmetric : str or callable, default=\"linear\"\n The metric to use when calculating kernel between instances in a\n feature array. If metric is a string, it must be one of the metrics\n in pairwise.PAIRWISE_KERNEL_FUNCTIONS.\n If metric is \"precomputed\", X is assumed to be a kernel matrix.\n Alternatively, if metric is a callable function, it is called on each\n pair of instances (rows) and the resulting value recorded. The callable\n should take two rows from X as input and return the corresponding\n kernel value as a single number. This means that callables from\n :mod:`sklearn.metrics.pairwise` are not allowed, as they operate on\n matrices, not single samples. Use the string identifying the kernel\n instead.\n\nfilter_params : bool, default=False\n Whether to filter invalid parameters or not.\n\nn_jobs : int, default=None\n The number of jobs to use for the computation. This works by breaking\n down the pairwise matrix into n_jobs even slices and computing them in\n parallel.\n\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n**kwds : optional keyword parameters\n Any further parameters are passed directly to the kernel function.\n\nReturns\n-------\nK : ndarray of shape (n_samples_X, n_samples_X) or (n_samples_X, n_samples_Y)\n A kernel matrix K such that K_{i, j} is the kernel between the\n ith and jth vectors of the given matrix X, if Y is None.\n If Y is not None, then K_{i, j} is the kernel between the ith array\n from X and the jth array from Y.\n\nNotes\n-----\nIf metric is 'precomputed', Y is ignored and X is returned.", + "description": "Compute the kernel between arrays X and optional array Y.\n\nThis method takes either a vector array or a kernel matrix, and returns\na kernel matrix. If the input is a vector array, the kernels are\ncomputed. If the input is a kernel matrix, it is returned instead.\n\nThis method provides a safe way to take a kernel matrix as input, while\npreserving compatibility with many other algorithms that take a vector\narray.\n\nIf Y is given (default is None), then the returned matrix is the pairwise\nkernel between the arrays from both X and Y.\n\nValid values for metric are:\n ['additive_chi2', 'chi2', 'linear', 'poly', 'polynomial', 'rbf',\n 'laplacian', 'sigmoid', 'cosine']\n\nRead more in the :ref:`User Guide `.", + "docstring": "Compute the kernel between arrays X and optional array Y.\n\n This method takes either a vector array or a kernel matrix, and returns\n a kernel matrix. 
If the input is a vector array, the kernels are\n computed. If the input is a kernel matrix, it is returned instead.\n\n This method provides a safe way to take a kernel matrix as input, while\n preserving compatibility with many other algorithms that take a vector\n array.\n\n If Y is given (default is None), then the returned matrix is the pairwise\n kernel between the arrays from both X and Y.\n\n Valid values for metric are:\n ['additive_chi2', 'chi2', 'linear', 'poly', 'polynomial', 'rbf',\n 'laplacian', 'sigmoid', 'cosine']\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n X : ndarray of shape (n_samples_X, n_samples_X) or (n_samples_X, n_features)\n Array of pairwise kernels between samples, or a feature array.\n The shape of the array should be (n_samples_X, n_samples_X) if\n metric == \"precomputed\" and (n_samples_X, n_features) otherwise.\n\n Y : ndarray of shape (n_samples_Y, n_features), default=None\n A second feature array only if X has shape (n_samples_X, n_features).\n\n metric : str or callable, default=\"linear\"\n The metric to use when calculating kernel between instances in a\n feature array. If metric is a string, it must be one of the metrics\n in pairwise.PAIRWISE_KERNEL_FUNCTIONS.\n If metric is \"precomputed\", X is assumed to be a kernel matrix.\n Alternatively, if metric is a callable function, it is called on each\n pair of instances (rows) and the resulting value recorded. The callable\n should take two rows from X as input and return the corresponding\n kernel value as a single number. This means that callables from\n :mod:`sklearn.metrics.pairwise` are not allowed, as they operate on\n matrices, not single samples. Use the string identifying the kernel\n instead.\n\n filter_params : bool, default=False\n Whether to filter invalid parameters or not.\n\n n_jobs : int, default=None\n The number of jobs to use for the computation. This works by breaking\n down the pairwise matrix into n_jobs even slices and computing them in\n parallel.\n\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n **kwds : optional keyword parameters\n Any further parameters are passed directly to the kernel function.\n\n Returns\n -------\n K : ndarray of shape (n_samples_X, n_samples_X) or (n_samples_X, n_samples_Y)\n A kernel matrix K such that K_{i, j} is the kernel between the\n ith and jth vectors of the given matrix X, if Y is None.\n If Y is not None, then K_{i, j} is the kernel between the ith array\n from X and the jth array from Y.\n\n Notes\n -----\n If metric is 'precomputed', Y is ignored and X is returned.\n\n ", "source_code": "\ndef pairwise_kernels(X, Y=None, metric='linear', *, filter_params=False, n_jobs=None, **kwds):\n \"\"\"Compute the kernel between arrays X and optional array Y.\n\n This method takes either a vector array or a kernel matrix, and returns\n a kernel matrix. If the input is a vector array, the kernels are\n computed. 
If the input is a kernel matrix, it is returned instead.\n\n This method provides a safe way to take a kernel matrix as input, while\n preserving compatibility with many other algorithms that take a vector\n array.\n\n If Y is given (default is None), then the returned matrix is the pairwise\n kernel between the arrays from both X and Y.\n\n Valid values for metric are:\n ['additive_chi2', 'chi2', 'linear', 'poly', 'polynomial', 'rbf',\n 'laplacian', 'sigmoid', 'cosine']\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n X : ndarray of shape (n_samples_X, n_samples_X) or (n_samples_X, n_features)\n Array of pairwise kernels between samples, or a feature array.\n The shape of the array should be (n_samples_X, n_samples_X) if\n metric == \"precomputed\" and (n_samples_X, n_features) otherwise.\n\n Y : ndarray of shape (n_samples_Y, n_features), default=None\n A second feature array only if X has shape (n_samples_X, n_features).\n\n metric : str or callable, default=\"linear\"\n The metric to use when calculating kernel between instances in a\n feature array. If metric is a string, it must be one of the metrics\n in pairwise.PAIRWISE_KERNEL_FUNCTIONS.\n If metric is \"precomputed\", X is assumed to be a kernel matrix.\n Alternatively, if metric is a callable function, it is called on each\n pair of instances (rows) and the resulting value recorded. The callable\n should take two rows from X as input and return the corresponding\n kernel value as a single number. This means that callables from\n :mod:`sklearn.metrics.pairwise` are not allowed, as they operate on\n matrices, not single samples. Use the string identifying the kernel\n instead.\n\n filter_params : bool, default=False\n Whether to filter invalid parameters or not.\n\n n_jobs : int, default=None\n The number of jobs to use for the computation. This works by breaking\n down the pairwise matrix into n_jobs even slices and computing them in\n parallel.\n\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. 
See :term:`Glossary `\n for more details.\n\n **kwds : optional keyword parameters\n Any further parameters are passed directly to the kernel function.\n\n Returns\n -------\n K : ndarray of shape (n_samples_X, n_samples_X) or (n_samples_X, n_samples_Y)\n A kernel matrix K such that K_{i, j} is the kernel between the\n ith and jth vectors of the given matrix X, if Y is None.\n If Y is not None, then K_{i, j} is the kernel between the ith array\n from X and the jth array from Y.\n\n Notes\n -----\n If metric is 'precomputed', Y is ignored and X is returned.\n\n \"\"\"\n from ..gaussian_process.kernels import Kernel as GPKernel\n if metric == 'precomputed':\n (X, _) = check_pairwise_arrays(X, Y, precomputed=True)\n return X\n elif isinstance(metric, GPKernel):\n func = metric.__call__\n elif metric in PAIRWISE_KERNEL_FUNCTIONS:\n if filter_params:\n kwds = {k: kwds[k] for k in kwds if k in KERNEL_PARAMS[metric]}\n func = PAIRWISE_KERNEL_FUNCTIONS[metric]\n elif callable(metric):\n func = partial(_pairwise_callable, metric=metric, **kwds)\n else:\n raise ValueError('Unknown kernel %r' % metric)\n return _parallel_pairwise(X, Y, func, n_jobs, **kwds)" }, { @@ -125334,7 +134879,8 @@ "docstring": { "type": "ndarray of shape (n_samples_X, n_features)", "description": "" - } + }, + "refined_type": {} }, { "name": "Y", @@ -125344,7 +134890,8 @@ "docstring": { "type": "ndarray of shape (n_samples_Y, n_features), default=None", "description": "" - } + }, + "refined_type": {} }, { "name": "degree", @@ -125354,7 +134901,8 @@ "docstring": { "type": "int, default=3", "description": "" - } + }, + "refined_type": {} }, { "name": "gamma", @@ -125364,7 +134912,8 @@ "docstring": { "type": "float, default=None", "description": "If None, defaults to 1.0 / n_features." - } + }, + "refined_type": {} }, { "name": "coef0", @@ -125374,13 +134923,14 @@ "docstring": { "type": "float, default=1", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Compute the polynomial kernel between X and Y::\n\n K(X, Y) = (gamma + coef0)^degree Read more in the :ref:`User Guide `.", - "docstring": "Compute the polynomial kernel between X and Y::\n\n K(X, Y) = (gamma + coef0)^degree\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\nX : ndarray of shape (n_samples_X, n_features)\n\nY : ndarray of shape (n_samples_Y, n_features), default=None\n\ndegree : int, default=3\n\ngamma : float, default=None\n If None, defaults to 1.0 / n_features.\n\ncoef0 : float, default=1\n\nReturns\n-------\nGram matrix : ndarray of shape (n_samples_X, n_samples_Y)", + "description": "Compute the polynomial kernel between X and Y::\n\n K(X, Y) = (gamma + coef0)^degree\n\nRead more in the :ref:`User Guide `.", + "docstring": "\n Compute the polynomial kernel between X and Y::\n\n K(X, Y) = (gamma + coef0)^degree\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n X : ndarray of shape (n_samples_X, n_features)\n\n Y : ndarray of shape (n_samples_Y, n_features), default=None\n\n degree : int, default=3\n\n gamma : float, default=None\n If None, defaults to 1.0 / n_features.\n\n coef0 : float, default=1\n\n Returns\n -------\n Gram matrix : ndarray of shape (n_samples_X, n_samples_Y)\n ", "source_code": "\ndef polynomial_kernel(X, Y=None, degree=3, gamma=None, coef0=1):\n \"\"\"\n Compute the polynomial kernel between X and Y::\n\n K(X, Y) = (gamma + coef0)^degree\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n X : ndarray of shape (n_samples_X, 
n_features)\n\n Y : ndarray of shape (n_samples_Y, n_features), default=None\n\n degree : int, default=3\n\n gamma : float, default=None\n If None, defaults to 1.0 / n_features.\n\n coef0 : float, default=1\n\n Returns\n -------\n Gram matrix : ndarray of shape (n_samples_X, n_samples_Y)\n \"\"\"\n (X, Y) = check_pairwise_arrays(X, Y)\n if gamma is None:\n gamma = 1.0 / X.shape[1]\n K = safe_sparse_dot(X, Y.T, dense_output=True)\n K *= gamma\n K += coef0\n K **= degree\n return K" }, { @@ -125398,7 +134948,8 @@ "docstring": { "type": "ndarray of shape (n_samples_X, n_features)", "description": "" - } + }, + "refined_type": {} }, { "name": "Y", @@ -125408,7 +134959,8 @@ "docstring": { "type": "ndarray of shape (n_samples_Y, n_features), default=None", "description": "If `None`, uses `Y=X`." - } + }, + "refined_type": {} }, { "name": "gamma", @@ -125418,13 +134970,14 @@ "docstring": { "type": "float, default=None", "description": "If None, defaults to 1.0 / n_features." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Compute the rbf (gaussian) kernel between X and Y::\n\n K(x, y) = exp(-gamma ||x-y||^2) for each pair of rows x in X and y in Y. Read more in the :ref:`User Guide `.", - "docstring": "Compute the rbf (gaussian) kernel between X and Y::\n\n K(x, y) = exp(-gamma ||x-y||^2)\n\nfor each pair of rows x in X and y in Y.\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\nX : ndarray of shape (n_samples_X, n_features)\n\nY : ndarray of shape (n_samples_Y, n_features), default=None\n If `None`, uses `Y=X`.\n\ngamma : float, default=None\n If None, defaults to 1.0 / n_features.\n\nReturns\n-------\nkernel_matrix : ndarray of shape (n_samples_X, n_samples_Y)", + "description": "Compute the rbf (gaussian) kernel between X and Y::\n\n K(x, y) = exp(-gamma ||x-y||^2)\n\nfor each pair of rows x in X and y in Y.\n\nRead more in the :ref:`User Guide `.", + "docstring": "\n Compute the rbf (gaussian) kernel between X and Y::\n\n K(x, y) = exp(-gamma ||x-y||^2)\n\n for each pair of rows x in X and y in Y.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n X : ndarray of shape (n_samples_X, n_features)\n\n Y : ndarray of shape (n_samples_Y, n_features), default=None\n If `None`, uses `Y=X`.\n\n gamma : float, default=None\n If None, defaults to 1.0 / n_features.\n\n Returns\n -------\n kernel_matrix : ndarray of shape (n_samples_X, n_samples_Y)\n ", "source_code": "\ndef rbf_kernel(X, Y=None, gamma=None):\n \"\"\"\n Compute the rbf (gaussian) kernel between X and Y::\n\n K(x, y) = exp(-gamma ||x-y||^2)\n\n for each pair of rows x in X and y in Y.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n X : ndarray of shape (n_samples_X, n_features)\n\n Y : ndarray of shape (n_samples_Y, n_features), default=None\n If `None`, uses `Y=X`.\n\n gamma : float, default=None\n If None, defaults to 1.0 / n_features.\n\n Returns\n -------\n kernel_matrix : ndarray of shape (n_samples_X, n_samples_Y)\n \"\"\"\n (X, Y) = check_pairwise_arrays(X, Y)\n if gamma is None:\n gamma = 1.0 / X.shape[1]\n K = euclidean_distances(X, Y, squared=True)\n K *= -gamma\n np.exp(K, K)\n return K" }, { @@ -125442,7 +134995,8 @@ "docstring": { "type": "ndarray of shape (n_samples_X, n_features)", "description": "" - } + }, + "refined_type": {} }, { "name": "Y", @@ -125452,7 +135006,8 @@ "docstring": { "type": "ndarray of shape (n_samples_Y, n_features), default=None", "description": "If `None`, uses `Y=X`." 
- } + }, + "refined_type": {} }, { "name": "gamma", @@ -125462,7 +135017,8 @@ "docstring": { "type": "float, default=None", "description": "If None, defaults to 1.0 / n_features." - } + }, + "refined_type": {} }, { "name": "coef0", @@ -125472,13 +135028,14 @@ "docstring": { "type": "float, default=1", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Compute the sigmoid kernel between X and Y::\n\n K(X, Y) = tanh(gamma + coef0) Read more in the :ref:`User Guide `.", - "docstring": "Compute the sigmoid kernel between X and Y::\n\n K(X, Y) = tanh(gamma + coef0)\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\nX : ndarray of shape (n_samples_X, n_features)\n\nY : ndarray of shape (n_samples_Y, n_features), default=None\n If `None`, uses `Y=X`.\n\ngamma : float, default=None\n If None, defaults to 1.0 / n_features.\n\ncoef0 : float, default=1\n\nReturns\n-------\nGram matrix : ndarray of shape (n_samples_X, n_samples_Y)", + "description": "Compute the sigmoid kernel between X and Y::\n\n K(X, Y) = tanh(gamma + coef0)\n\nRead more in the :ref:`User Guide `.", + "docstring": "\n Compute the sigmoid kernel between X and Y::\n\n K(X, Y) = tanh(gamma + coef0)\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n X : ndarray of shape (n_samples_X, n_features)\n\n Y : ndarray of shape (n_samples_Y, n_features), default=None\n If `None`, uses `Y=X`.\n\n gamma : float, default=None\n If None, defaults to 1.0 / n_features.\n\n coef0 : float, default=1\n\n Returns\n -------\n Gram matrix : ndarray of shape (n_samples_X, n_samples_Y)\n ", "source_code": "\ndef sigmoid_kernel(X, Y=None, gamma=None, coef0=1):\n \"\"\"\n Compute the sigmoid kernel between X and Y::\n\n K(X, Y) = tanh(gamma + coef0)\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n X : ndarray of shape (n_samples_X, n_features)\n\n Y : ndarray of shape (n_samples_Y, n_features), default=None\n If `None`, uses `Y=X`.\n\n gamma : float, default=None\n If None, defaults to 1.0 / n_features.\n\n coef0 : float, default=1\n\n Returns\n -------\n Gram matrix : ndarray of shape (n_samples_X, n_samples_Y)\n \"\"\"\n (X, Y) = check_pairwise_arrays(X, Y)\n if gamma is None:\n gamma = 1.0 / X.shape[1]\n K = safe_sparse_dot(X, Y.T, dense_output=True)\n K *= gamma\n K += coef0\n np.tanh(K, K)\n return K" }, { @@ -125496,7 +135053,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "top_path", @@ -125506,14 +135064,15 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", - "source_code": "\ndef configuration(parent_package='', top_path=None):\n config = Configuration('metrics', parent_package, top_path)\n libraries = []\n if os.name == 'posix':\n libraries.append('m')\n config.add_subpackage('_plot')\n config.add_subpackage('_plot.tests')\n config.add_subpackage('cluster')\n config.add_extension('_pairwise_fast', sources=['_pairwise_fast.pyx'], libraries=libraries)\n config.add_subpackage('tests')\n return config" + "docstring": null, + "source_code": "\ndef configuration(parent_package='', top_path=None):\n config = Configuration('metrics', parent_package, top_path)\n libraries = []\n if os.name == 'posix':\n libraries.append('m')\n config.add_subpackage('_plot')\n config.add_subpackage('_plot.tests')\n config.add_subpackage('cluster')\n config.add_extension('_pairwise_fast', sources=['_pairwise_fast.pyx'], 
libraries=libraries)\n config.add_extension('_dist_metrics', sources=['_dist_metrics.pyx'], include_dirs=[np.get_include(), os.path.join(np.get_include(), 'numpy')], libraries=libraries)\n config.add_subpackage('tests')\n return config" }, { "name": "__init__", @@ -125530,7 +135089,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_components", @@ -125540,7 +135100,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "tol", @@ -125550,7 +135111,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "reg_covar", @@ -125560,7 +135122,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "max_iter", @@ -125570,7 +135133,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_init", @@ -125580,7 +135144,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "init_params", @@ -125590,7 +135155,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "random_state", @@ -125600,7 +135166,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "warm_start", @@ -125610,7 +135177,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "verbose", @@ -125620,7 +135188,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "verbose_interval", @@ -125630,13 +135199,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, n_components, tol, reg_covar, max_iter, n_init, init_params, random_state, warm_start, verbose, verbose_interval):\n self.n_components = n_components\n self.tol = tol\n self.reg_covar = reg_covar\n self.max_iter = max_iter\n self.n_init = n_init\n self.init_params = init_params\n self.random_state = random_state\n self.warm_start = warm_start\n self.verbose = verbose\n self.verbose_interval = verbose_interval" }, { @@ -125654,7 +135224,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -125664,13 +135235,14 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Check values of the basic parameters.", - "docstring": "Check values of the basic parameters.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)", + "docstring": "Check values of the basic parameters.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n ", "source_code": "\ndef _check_initial_parameters(self, X):\n \"\"\"Check values of the basic parameters.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n \"\"\"\n if self.n_components < 1:\n raise ValueError(\"Invalid value for 'n_components': %d Estimation requires at least one component\" % self.n_components)\n if self.tol < 0.0:\n raise ValueError(\"Invalid value for 'tol': %.5f Tolerance used by the EM must be non-negative\" % self.tol)\n if self.n_init < 1:\n raise ValueError(\"Invalid value for 'n_init': %d Estimation requires at least one run\" % self.n_init)\n if self.max_iter < 1:\n raise ValueError(\"Invalid value for 'max_iter': %d Estimation requires at least one iteration\" % 
self.max_iter)\n if self.reg_covar < 0.0:\n raise ValueError(\"Invalid value for 'reg_covar': %.5f regularization on covariance must be non-negative\" % self.reg_covar)\n self._check_parameters(X)" }, { @@ -125688,7 +135260,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -125698,13 +135271,14 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Check initial parameters of the derived class.", - "docstring": "Check initial parameters of the derived class.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)", + "docstring": "Check initial parameters of the derived class.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n ", "source_code": "\n@abstractmethod\ndef _check_parameters(self, X):\n \"\"\"Check initial parameters of the derived class.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n \"\"\"\n pass" }, { @@ -125722,7 +135296,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -125732,13 +135307,14 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "E step.", - "docstring": "E step.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n\nReturns\n-------\nlog_prob_norm : float\n Mean of the logarithms of the probabilities of each sample in X\n\nlog_responsibility : array, shape (n_samples, n_components)\n Logarithm of the posterior probabilities (or responsibilities) of\n the point of each sample in X.", + "docstring": "E step.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n\n Returns\n -------\n log_prob_norm : float\n Mean of the logarithms of the probabilities of each sample in X\n\n log_responsibility : array, shape (n_samples, n_components)\n Logarithm of the posterior probabilities (or responsibilities) of\n the point of each sample in X.\n ", "source_code": "\ndef _e_step(self, X):\n \"\"\"E step.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n\n Returns\n -------\n log_prob_norm : float\n Mean of the logarithms of the probabilities of each sample in X\n\n log_responsibility : array, shape (n_samples, n_components)\n Logarithm of the posterior probabilities (or responsibilities) of\n the point of each sample in X.\n \"\"\"\n (log_prob_norm, log_resp) = self._estimate_log_prob_resp(X)\n return np.mean(log_prob_norm), log_resp" }, { @@ -125756,7 +135332,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -125766,13 +135343,14 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Estimate the log-probabilities log P(X | Z).\n\nCompute the log-probabilities per each component for each sample.", - "docstring": "Estimate the log-probabilities log P(X | Z).\n\nCompute the log-probabilities per each component for each sample.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n\nReturns\n-------\nlog_prob : array, shape (n_samples, n_component)", + "docstring": "Estimate the log-probabilities log P(X | Z).\n\n Compute the log-probabilities per each component for each sample.\n\n 
Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n\n Returns\n -------\n log_prob : array, shape (n_samples, n_component)\n ", "source_code": "\n@abstractmethod\ndef _estimate_log_prob(self, X):\n \"\"\"Estimate the log-probabilities log P(X | Z).\n\n Compute the log-probabilities per each component for each sample.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n\n Returns\n -------\n log_prob : array, shape (n_samples, n_component)\n \"\"\"\n pass" }, { @@ -125790,7 +135368,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -125800,13 +135379,14 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Estimate log probabilities and responsibilities for each sample.\n\nCompute the log probabilities, weighted log probabilities per component and responsibilities for each sample in X with respect to the current state of the model.", - "docstring": "Estimate log probabilities and responsibilities for each sample.\n\nCompute the log probabilities, weighted log probabilities per\ncomponent and responsibilities for each sample in X with respect to\nthe current state of the model.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n\nReturns\n-------\nlog_prob_norm : array, shape (n_samples,)\n log p(X)\n\nlog_responsibilities : array, shape (n_samples, n_components)\n logarithm of the responsibilities", + "description": "Estimate log probabilities and responsibilities for each sample.\n\nCompute the log probabilities, weighted log probabilities per\ncomponent and responsibilities for each sample in X with respect to\nthe current state of the model.", + "docstring": "Estimate log probabilities and responsibilities for each sample.\n\n Compute the log probabilities, weighted log probabilities per\n component and responsibilities for each sample in X with respect to\n the current state of the model.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n\n Returns\n -------\n log_prob_norm : array, shape (n_samples,)\n log p(X)\n\n log_responsibilities : array, shape (n_samples, n_components)\n logarithm of the responsibilities\n ", "source_code": "\ndef _estimate_log_prob_resp(self, X):\n \"\"\"Estimate log probabilities and responsibilities for each sample.\n\n Compute the log probabilities, weighted log probabilities per\n component and responsibilities for each sample in X with respect to\n the current state of the model.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n\n Returns\n -------\n log_prob_norm : array, shape (n_samples,)\n log p(X)\n\n log_responsibilities : array, shape (n_samples, n_components)\n logarithm of the responsibilities\n \"\"\"\n weighted_log_prob = self._estimate_weighted_log_prob(X)\n log_prob_norm = logsumexp(weighted_log_prob, axis=1)\n with np.errstate(under='ignore'):\n log_resp = weighted_log_prob - log_prob_norm[:, np.newaxis]\n return log_prob_norm, log_resp" }, { @@ -125824,13 +135404,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Estimate log-weights in EM algorithm, E[ log pi ] in VB algorithm.", - "docstring": "Estimate log-weights in EM algorithm, E[ log pi ] in VB algorithm.\n\nReturns\n-------\nlog_weight : array, shape (n_components, )", + "docstring": "Estimate 
log-weights in EM algorithm, E[ log pi ] in VB algorithm.\n\n Returns\n -------\n log_weight : array, shape (n_components, )\n ", "source_code": "\n@abstractmethod\ndef _estimate_log_weights(self):\n \"\"\"Estimate log-weights in EM algorithm, E[ log pi ] in VB algorithm.\n\n Returns\n -------\n log_weight : array, shape (n_components, )\n \"\"\"\n pass" }, { @@ -125848,7 +135429,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -125858,13 +135440,14 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Estimate the weighted log-probabilities, log P(X | Z) + log weights.", - "docstring": "Estimate the weighted log-probabilities, log P(X | Z) + log weights.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n\nReturns\n-------\nweighted_log_prob : array, shape (n_samples, n_component)", + "docstring": "Estimate the weighted log-probabilities, log P(X | Z) + log weights.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n\n Returns\n -------\n weighted_log_prob : array, shape (n_samples, n_component)\n ", "source_code": "\ndef _estimate_weighted_log_prob(self, X):\n \"\"\"Estimate the weighted log-probabilities, log P(X | Z) + log weights.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n\n Returns\n -------\n weighted_log_prob : array, shape (n_samples, n_component)\n \"\"\"\n return self._estimate_log_prob(X) + self._estimate_log_weights()" }, { @@ -125882,13 +135465,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@abstractmethod\ndef _get_parameters(self):\n pass" }, { @@ -125906,7 +135490,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -125916,7 +135501,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "" - } + }, + "refined_type": {} }, { "name": "resp", @@ -125926,13 +135512,14 @@ "docstring": { "type": "array-like of shape (n_samples, n_components)", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Initialize the model parameters of the derived class.", - "docstring": "Initialize the model parameters of the derived class.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n\nresp : array-like of shape (n_samples, n_components)", + "docstring": "Initialize the model parameters of the derived class.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n\n resp : array-like of shape (n_samples, n_components)\n ", "source_code": "\n@abstractmethod\ndef _initialize(self, X, resp):\n \"\"\"Initialize the model parameters of the derived class.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n\n resp : array-like of shape (n_samples, n_components)\n \"\"\"\n pass" }, { @@ -125950,7 +135537,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -125960,7 +135548,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "" - } + }, + "refined_type": {} }, { "name": "random_state", @@ -125970,13 +135559,14 @@ "docstring": { "type": "RandomState", "description": "A random number generator instance 
that controls the random seed\nused for the method chosen to initialize the parameters." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Initialize the model parameters.", - "docstring": "Initialize the model parameters.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n\nrandom_state : RandomState\n A random number generator instance that controls the random seed\n used for the method chosen to initialize the parameters.", + "docstring": "Initialize the model parameters.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n\n random_state : RandomState\n A random number generator instance that controls the random seed\n used for the method chosen to initialize the parameters.\n ", "source_code": "\ndef _initialize_parameters(self, X, random_state):\n \"\"\"Initialize the model parameters.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n\n random_state : RandomState\n A random number generator instance that controls the random seed\n used for the method chosen to initialize the parameters.\n \"\"\"\n (n_samples, _) = X.shape\n if self.init_params == 'kmeans':\n resp = np.zeros((n_samples, self.n_components))\n label = cluster.KMeans(n_clusters=self.n_components, n_init=1, random_state=random_state).fit(X).labels_\n resp[np.arange(n_samples), label] = 1\n elif self.init_params == 'random':\n resp = random_state.rand(n_samples, self.n_components)\n resp /= resp.sum(axis=1)[:, np.newaxis]\n else:\n raise ValueError(\"Unimplemented initialization method '%s'\" % self.init_params)\n self._initialize(X, resp)" }, { @@ -125994,7 +135584,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -126004,7 +135595,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "" - } + }, + "refined_type": {} }, { "name": "log_resp", @@ -126014,13 +135606,14 @@ "docstring": { "type": "array-like of shape (n_samples, n_components)", "description": "Logarithm of the posterior probabilities (or responsibilities) of\nthe point of each sample in X." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "M step.", - "docstring": "M step.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n\nlog_resp : array-like of shape (n_samples, n_components)\n Logarithm of the posterior probabilities (or responsibilities) of\n the point of each sample in X.", + "docstring": "M step.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n\n log_resp : array-like of shape (n_samples, n_components)\n Logarithm of the posterior probabilities (or responsibilities) of\n the point of each sample in X.\n ", "source_code": "\n@abstractmethod\ndef _m_step(self, X, log_resp):\n \"\"\"M step.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n\n log_resp : array-like of shape (n_samples, n_components)\n Logarithm of the posterior probabilities (or responsibilities) of\n the point of each sample in X.\n \"\"\"\n pass" }, { @@ -126038,7 +135631,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_init", @@ -126048,7 +135642,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -126072,7 +135667,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "ll", @@ -126082,7 +135678,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -126106,7 +135703,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_iter", @@ -126116,7 +135714,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "diff_ll", @@ -126126,7 +135725,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -126150,7 +135750,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "params", @@ -126160,13 +135761,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@abstractmethod\ndef _set_parameters(self, params):\n pass" }, { @@ -126184,7 +135786,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -126194,7 +135797,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "List of n_features-dimensional data points. Each row\ncorresponds to a single data point." - } + }, + "refined_type": {} }, { "name": "y", @@ -126204,13 +135808,14 @@ "docstring": { "type": "Ignored", "description": "Not used, present for API consistency by convention." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Estimate model parameters with the EM algorithm.\n\nThe method fits the model ``n_init`` times and sets the parameters with which the model has the largest likelihood or lower bound. Within each trial, the method iterates between E-step and M-step for ``max_iter`` times until the change of likelihood or lower bound is less than ``tol``, otherwise, a ``ConvergenceWarning`` is raised. If ``warm_start`` is ``True``, then ``n_init`` is ignored and a single initialization is performed upon the first call. 
Upon consecutive calls, training starts where it left off.", - "docstring": "Estimate model parameters with the EM algorithm.\n\nThe method fits the model ``n_init`` times and sets the parameters with\nwhich the model has the largest likelihood or lower bound. Within each\ntrial, the method iterates between E-step and M-step for ``max_iter``\ntimes until the change of likelihood or lower bound is less than\n``tol``, otherwise, a ``ConvergenceWarning`` is raised.\nIf ``warm_start`` is ``True``, then ``n_init`` is ignored and a single\ninitialization is performed upon the first call. Upon consecutive\ncalls, training starts where it left off.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n List of n_features-dimensional data points. Each row\n corresponds to a single data point.\n\ny : Ignored\n Not used, present for API consistency by convention.\n\nReturns\n-------\nself : object\n The fitted mixture.", + "description": "Estimate model parameters with the EM algorithm.\n\nThe method fits the model ``n_init`` times and sets the parameters with\nwhich the model has the largest likelihood or lower bound. Within each\ntrial, the method iterates between E-step and M-step for ``max_iter``\ntimes until the change of likelihood or lower bound is less than\n``tol``, otherwise, a ``ConvergenceWarning`` is raised.\nIf ``warm_start`` is ``True``, then ``n_init`` is ignored and a single\ninitialization is performed upon the first call. Upon consecutive\ncalls, training starts where it left off.", + "docstring": "Estimate model parameters with the EM algorithm.\n\n The method fits the model ``n_init`` times and sets the parameters with\n which the model has the largest likelihood or lower bound. Within each\n trial, the method iterates between E-step and M-step for ``max_iter``\n times until the change of likelihood or lower bound is less than\n ``tol``, otherwise, a ``ConvergenceWarning`` is raised.\n If ``warm_start`` is ``True``, then ``n_init`` is ignored and a single\n initialization is performed upon the first call. Upon consecutive\n calls, training starts where it left off.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n List of n_features-dimensional data points. Each row\n corresponds to a single data point.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n Returns\n -------\n self : object\n The fitted mixture.\n ", "source_code": "\ndef fit(self, X, y=None):\n \"\"\"Estimate model parameters with the EM algorithm.\n\n The method fits the model ``n_init`` times and sets the parameters with\n which the model has the largest likelihood or lower bound. Within each\n trial, the method iterates between E-step and M-step for ``max_iter``\n times until the change of likelihood or lower bound is less than\n ``tol``, otherwise, a ``ConvergenceWarning`` is raised.\n If ``warm_start`` is ``True``, then ``n_init`` is ignored and a single\n initialization is performed upon the first call. Upon consecutive\n calls, training starts where it left off.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n List of n_features-dimensional data points. 
Each row\n corresponds to a single data point.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n Returns\n -------\n self : object\n The fitted mixture.\n \"\"\"\n self.fit_predict(X, y)\n return self" }, { @@ -126228,7 +135833,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -126238,7 +135844,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "List of n_features-dimensional data points. Each row\ncorresponds to a single data point." - } + }, + "refined_type": {} }, { "name": "y", @@ -126248,13 +135855,14 @@ "docstring": { "type": "Ignored", "description": "Not used, present for API consistency by convention." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Estimate model parameters using X and predict the labels for X.\n\nThe method fits the model n_init times and sets the parameters with which the model has the largest likelihood or lower bound. Within each trial, the method iterates between E-step and M-step for `max_iter` times until the change of likelihood or lower bound is less than `tol`, otherwise, a :class:`~sklearn.exceptions.ConvergenceWarning` is raised. After fitting, it predicts the most probable label for the input data points. .. versionadded:: 0.20", - "docstring": "Estimate model parameters using X and predict the labels for X.\n\nThe method fits the model n_init times and sets the parameters with\nwhich the model has the largest likelihood or lower bound. Within each\ntrial, the method iterates between E-step and M-step for `max_iter`\ntimes until the change of likelihood or lower bound is less than\n`tol`, otherwise, a :class:`~sklearn.exceptions.ConvergenceWarning` is\nraised. After fitting, it predicts the most probable label for the\ninput data points.\n\n.. versionadded:: 0.20\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n List of n_features-dimensional data points. Each row\n corresponds to a single data point.\n\ny : Ignored\n Not used, present for API consistency by convention.\n\nReturns\n-------\nlabels : array, shape (n_samples,)\n Component labels.", + "description": "Estimate model parameters using X and predict the labels for X.\n\nThe method fits the model n_init times and sets the parameters with\nwhich the model has the largest likelihood or lower bound. Within each\ntrial, the method iterates between E-step and M-step for `max_iter`\ntimes until the change of likelihood or lower bound is less than\n`tol`, otherwise, a :class:`~sklearn.exceptions.ConvergenceWarning` is\nraised. After fitting, it predicts the most probable label for the\ninput data points.\n\n.. versionadded:: 0.20", + "docstring": "Estimate model parameters using X and predict the labels for X.\n\n The method fits the model n_init times and sets the parameters with\n which the model has the largest likelihood or lower bound. Within each\n trial, the method iterates between E-step and M-step for `max_iter`\n times until the change of likelihood or lower bound is less than\n `tol`, otherwise, a :class:`~sklearn.exceptions.ConvergenceWarning` is\n raised. After fitting, it predicts the most probable label for the\n input data points.\n\n .. versionadded:: 0.20\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n List of n_features-dimensional data points. 
Each row\n corresponds to a single data point.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n Returns\n -------\n labels : array, shape (n_samples,)\n Component labels.\n ", "source_code": "\ndef fit_predict(self, X, y=None):\n \"\"\"Estimate model parameters using X and predict the labels for X.\n\n The method fits the model n_init times and sets the parameters with\n which the model has the largest likelihood or lower bound. Within each\n trial, the method iterates between E-step and M-step for `max_iter`\n times until the change of likelihood or lower bound is less than\n `tol`, otherwise, a :class:`~sklearn.exceptions.ConvergenceWarning` is\n raised. After fitting, it predicts the most probable label for the\n input data points.\n\n .. versionadded:: 0.20\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n List of n_features-dimensional data points. Each row\n corresponds to a single data point.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n Returns\n -------\n labels : array, shape (n_samples,)\n Component labels.\n \"\"\"\n X = self._validate_data(X, dtype=[np.float64, np.float32], ensure_min_samples=2)\n if X.shape[0] < self.n_components:\n raise ValueError(f'Expected n_samples >= n_components but got n_components = {self.n_components}, n_samples = {X.shape[0]}')\n self._check_initial_parameters(X)\n do_init = not (self.warm_start and hasattr(self, 'converged_'))\n n_init = self.n_init if do_init else 1\n max_lower_bound = -np.inf\n self.converged_ = False\n random_state = check_random_state(self.random_state)\n (n_samples, _) = X.shape\n for init in range(n_init):\n self._print_verbose_msg_init_beg(init)\n if do_init:\n self._initialize_parameters(X, random_state)\n lower_bound = -np.inf if do_init else self.lower_bound_\n for n_iter in range(1, self.max_iter + 1):\n prev_lower_bound = lower_bound\n (log_prob_norm, log_resp) = self._e_step(X)\n self._m_step(X, log_resp)\n lower_bound = self._compute_lower_bound(log_resp, log_prob_norm)\n change = lower_bound - prev_lower_bound\n self._print_verbose_msg_iter_end(n_iter, change)\n if abs(change) < self.tol:\n self.converged_ = True\n break\n self._print_verbose_msg_init_end(lower_bound)\n if lower_bound > max_lower_bound or max_lower_bound == -np.inf:\n max_lower_bound = lower_bound\n best_params = self._get_parameters()\n best_n_iter = n_iter\n if not self.converged_:\n warnings.warn('Initialization %d did not converge. Try different init parameters, or increase max_iter, tol or check for degenerate data.' % (init + 1), ConvergenceWarning)\n self._set_parameters(best_params)\n self.n_iter_ = best_n_iter\n self.lower_bound_ = max_lower_bound\n (_, log_resp) = self._e_step(X)\n return log_resp.argmax(axis=1)" }, { @@ -126272,7 +135880,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -126282,13 +135891,14 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "List of n_features-dimensional data points. Each row\ncorresponds to a single data point." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Predict the labels for the data samples in X using trained model.", - "docstring": "Predict the labels for the data samples in X using trained model.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n List of n_features-dimensional data points. 
Each row\n corresponds to a single data point.\n\nReturns\n-------\nlabels : array, shape (n_samples,)\n Component labels.", + "docstring": "Predict the labels for the data samples in X using trained model.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n List of n_features-dimensional data points. Each row\n corresponds to a single data point.\n\n Returns\n -------\n labels : array, shape (n_samples,)\n Component labels.\n ", "source_code": "\ndef predict(self, X):\n \"\"\"Predict the labels for the data samples in X using trained model.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n List of n_features-dimensional data points. Each row\n corresponds to a single data point.\n\n Returns\n -------\n labels : array, shape (n_samples,)\n Component labels.\n \"\"\"\n check_is_fitted(self)\n X = self._validate_data(X, reset=False)\n return self._estimate_weighted_log_prob(X).argmax(axis=1)" }, { @@ -126306,7 +135916,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -126316,13 +135927,14 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "List of n_features-dimensional data points. Each row\ncorresponds to a single data point." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Evaluate the components' density for each sample.", - "docstring": "Evaluate the components' density for each sample.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n List of n_features-dimensional data points. Each row\n corresponds to a single data point.\n\nReturns\n-------\nresp : array, shape (n_samples, n_components)\n Density of each Gaussian component for each sample in X.", + "docstring": "Evaluate the components' density for each sample.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n List of n_features-dimensional data points. Each row\n corresponds to a single data point.\n\n Returns\n -------\n resp : array, shape (n_samples, n_components)\n Density of each Gaussian component for each sample in X.\n ", "source_code": "\ndef predict_proba(self, X):\n \"\"\"Evaluate the components' density for each sample.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n List of n_features-dimensional data points. Each row\n corresponds to a single data point.\n\n Returns\n -------\n resp : array, shape (n_samples, n_components)\n Density of each Gaussian component for each sample in X.\n \"\"\"\n check_is_fitted(self)\n X = self._validate_data(X, reset=False)\n (_, log_resp) = self._estimate_log_prob_resp(X)\n return np.exp(log_resp)" }, { @@ -126340,7 +135952,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_samples", @@ -126350,13 +135963,14 @@ "docstring": { "type": "int, default=1", "description": "Number of samples to generate." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Generate random samples from the fitted Gaussian distribution.", - "docstring": "Generate random samples from the fitted Gaussian distribution.\n\nParameters\n----------\nn_samples : int, default=1\n Number of samples to generate.\n\nReturns\n-------\nX : array, shape (n_samples, n_features)\n Randomly generated sample.\n\ny : array, shape (nsamples,)\n Component labels.", + "docstring": "Generate random samples from the fitted Gaussian distribution.\n\n Parameters\n ----------\n n_samples : int, default=1\n Number of samples to generate.\n\n Returns\n -------\n X : array, shape (n_samples, n_features)\n Randomly generated sample.\n\n y : array, shape (nsamples,)\n Component labels.\n ", "source_code": "\ndef sample(self, n_samples=1):\n \"\"\"Generate random samples from the fitted Gaussian distribution.\n\n Parameters\n ----------\n n_samples : int, default=1\n Number of samples to generate.\n\n Returns\n -------\n X : array, shape (n_samples, n_features)\n Randomly generated sample.\n\n y : array, shape (nsamples,)\n Component labels.\n \"\"\"\n check_is_fitted(self)\n if n_samples < 1:\n raise ValueError(\"Invalid value for 'n_samples': %d . The sampling requires at least one sample.\" % self.n_components)\n (_, n_features) = self.means_.shape\n rng = check_random_state(self.random_state)\n n_samples_comp = rng.multinomial(n_samples, self.weights_)\n if self.covariance_type == 'full':\n X = np.vstack([rng.multivariate_normal(mean, covariance, int(sample)) for (mean, covariance, sample) in zip(self.means_, self.covariances_, n_samples_comp)])\n elif self.covariance_type == 'tied':\n X = np.vstack([rng.multivariate_normal(mean, self.covariances_, int(sample)) for (mean, sample) in zip(self.means_, n_samples_comp)])\n else:\n X = np.vstack([mean + rng.randn(sample, n_features) * np.sqrt(covariance) for (mean, covariance, sample) in zip(self.means_, self.covariances_, n_samples_comp)])\n y = np.concatenate([np.full(sample, j, dtype=int) for (j, sample) in enumerate(n_samples_comp)])\n return X, y" }, { @@ -126374,7 +135988,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -126384,7 +135999,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_dimensions)", "description": "List of n_features-dimensional data points. Each row\ncorresponds to a single data point." - } + }, + "refined_type": {} }, { "name": "y", @@ -126394,13 +136010,14 @@ "docstring": { "type": "Ignored", "description": "Not used, present for API consistency by convention." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Compute the per-sample average log-likelihood of the given data X.", - "docstring": "Compute the per-sample average log-likelihood of the given data X.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_dimensions)\n List of n_features-dimensional data points. Each row\n corresponds to a single data point.\n\ny : Ignored\n Not used, present for API consistency by convention.\n\nReturns\n-------\nlog_likelihood : float\n Log-likelihood of `X` under the Gaussian mixture model.", + "docstring": "Compute the per-sample average log-likelihood of the given data X.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_dimensions)\n List of n_features-dimensional data points. 
Each row\n corresponds to a single data point.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n Returns\n -------\n log_likelihood : float\n Log-likelihood of `X` under the Gaussian mixture model.\n ", "source_code": "\ndef score(self, X, y=None):\n \"\"\"Compute the per-sample average log-likelihood of the given data X.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_dimensions)\n List of n_features-dimensional data points. Each row\n corresponds to a single data point.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n Returns\n -------\n log_likelihood : float\n Log-likelihood of `X` under the Gaussian mixture model.\n \"\"\"\n return self.score_samples(X).mean()" }, { @@ -126418,7 +136035,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -126428,13 +136046,14 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "List of n_features-dimensional data points. Each row\ncorresponds to a single data point." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Compute the log-likelihood of each sample.", - "docstring": "Compute the log-likelihood of each sample.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n List of n_features-dimensional data points. Each row\n corresponds to a single data point.\n\nReturns\n-------\nlog_prob : array, shape (n_samples,)\n Log-likelihood of each sample in `X` under the current model.", + "docstring": "Compute the log-likelihood of each sample.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n List of n_features-dimensional data points. Each row\n corresponds to a single data point.\n\n Returns\n -------\n log_prob : array, shape (n_samples,)\n Log-likelihood of each sample in `X` under the current model.\n ", "source_code": "\ndef score_samples(self, X):\n \"\"\"Compute the log-likelihood of each sample.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n List of n_features-dimensional data points. 
Each row\n corresponds to a single data point.\n\n Returns\n -------\n log_prob : array, shape (n_samples,)\n Log-likelihood of each sample in `X` under the current model.\n \"\"\"\n check_is_fitted(self)\n X = self._validate_data(X, reset=False)\n return logsumexp(self._estimate_weighted_log_prob(X), axis=1)" }, { @@ -126452,7 +136071,8 @@ "docstring": { "type": "array", "description": "" - } + }, + "refined_type": {} }, { "name": "param_shape", @@ -126462,7 +136082,8 @@ "docstring": { "type": "tuple", "description": "" - } + }, + "refined_type": {} }, { "name": "name", @@ -126472,13 +136093,14 @@ "docstring": { "type": "str", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Validate the shape of the input parameter 'param'.", - "docstring": "Validate the shape of the input parameter 'param'.\n\nParameters\n----------\nparam : array\n\nparam_shape : tuple\n\nname : str", + "docstring": "Validate the shape of the input parameter 'param'.\n\n Parameters\n ----------\n param : array\n\n param_shape : tuple\n\n name : str\n ", "source_code": "\ndef _check_shape(param, param_shape, name):\n \"\"\"Validate the shape of the input parameter 'param'.\n\n Parameters\n ----------\n param : array\n\n param_shape : tuple\n\n name : str\n \"\"\"\n param = np.array(param)\n if param.shape != param_shape:\n raise ValueError(\"The parameter '%s' should have the shape of %s, but got %s\" % (name, param_shape, param.shape))" }, { @@ -126496,7 +136118,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_components", @@ -126506,7 +136129,8 @@ "docstring": { "type": "int, default=1", "description": "The number of mixture components. Depending on the data and the value\nof the `weight_concentration_prior` the model can decide to not use\nall the components by setting some component `weights_` to values very\nclose to zero. The number of effective components is therefore smaller\nthan n_components." - } + }, + "refined_type": {} }, { "name": "covariance_type", @@ -126516,6 +136140,10 @@ "docstring": { "type": "{'full', 'tied', 'diag', 'spherical'}, default='full'", "description": "String describing the type of covariance parameters to use.\nMust be one of::\n\n 'full' (each component has its own general covariance matrix),\n 'tied' (all components share the same general covariance matrix),\n 'diag' (each component has its own diagonal covariance matrix),\n 'spherical' (each component has its own single variance)." + }, + "refined_type": { + "kind": "EnumType", + "values": ["tied", "full", "diag", "spherical"] } }, { @@ -126526,7 +136154,8 @@ "docstring": { "type": "float, default=1e-3", "description": "The convergence threshold. EM iterations will stop when the\nlower bound average gain on the likelihood (of the training data with\nrespect to the model) is below this threshold." - } + }, + "refined_type": {} }, { "name": "reg_covar", @@ -126536,7 +136165,8 @@ "docstring": { "type": "float, default=1e-6", "description": "Non-negative regularization added to the diagonal of covariance.\nAllows to assure that the covariance matrices are all positive." - } + }, + "refined_type": {} }, { "name": "max_iter", @@ -126546,7 +136176,8 @@ "docstring": { "type": "int, default=100", "description": "The number of EM iterations to perform." - } + }, + "refined_type": {} }, { "name": "n_init", @@ -126556,7 +136187,8 @@ "docstring": { "type": "int, default=1", "description": "The number of initializations to perform. 
The result with the highest\nlower bound value on the likelihood is kept." - } + }, + "refined_type": {} }, { "name": "init_params", @@ -126566,6 +136198,10 @@ "docstring": { "type": "{'kmeans', 'random'}, default='kmeans'", "description": "The method used to initialize the weights, the means and the\ncovariances.\nMust be one of::\n\n 'kmeans' : responsibilities are initialized using kmeans.\n 'random' : responsibilities are initialized randomly." + }, + "refined_type": { + "kind": "EnumType", + "values": ["random", "kmeans"] } }, { @@ -126576,7 +136212,8 @@ "docstring": { "type": "str, default='dirichlet_process'", "description": "String describing the type of the weight concentration prior.\nMust be one of::\n\n 'dirichlet_process' (using the Stick-breaking representation),\n 'dirichlet_distribution' (can favor more uniform weights)." - } + }, + "refined_type": {} }, { "name": "weight_concentration_prior", @@ -126586,7 +136223,8 @@ "docstring": { "type": "float or None, default=None", "description": "The dirichlet concentration of each component on the weight\ndistribution (Dirichlet). This is commonly called gamma in the\nliterature. The higher concentration puts more mass in\nthe center and will lead to more components being active, while a lower\nconcentration parameter will lead to more mass at the edge of the\nmixture weights simplex. The value of the parameter must be greater\nthan 0. If it is None, it's set to ``1. / n_components``." - } + }, + "refined_type": {} }, { "name": "mean_precision_prior", @@ -126596,7 +136234,8 @@ "docstring": { "type": "float or None, default=None", "description": "The precision prior on the mean distribution (Gaussian).\nControls the extent of where means can be placed. Larger\nvalues concentrate the cluster means around `mean_prior`.\nThe value of the parameter must be greater than 0.\nIf it is None, it is set to 1." - } + }, + "refined_type": {} }, { "name": "mean_prior", @@ -126606,7 +136245,8 @@ "docstring": { "type": "array-like, shape (n_features,), default=None", "description": "The prior on the mean distribution (Gaussian).\nIf it is None, it is set to the mean of X." - } + }, + "refined_type": {} }, { "name": "degrees_of_freedom_prior", @@ -126616,7 +136256,8 @@ "docstring": { "type": "float or None, default=None", "description": "The prior of the number of degrees of freedom on the covariance\ndistributions (Wishart). If it is None, it's set to `n_features`." - } + }, + "refined_type": {} }, { "name": "covariance_prior", @@ -126626,7 +136267,8 @@ "docstring": { "type": "float or array-like, default=None", "description": "The prior on the covariance distribution (Wishart).\nIf it is None, the emiprical covariance prior is initialized using the\ncovariance of X. The shape depends on `covariance_type`::\n\n (n_features, n_features) if 'full',\n (n_features, n_features) if 'tied',\n (n_features) if 'diag',\n float if 'spherical'" - } + }, + "refined_type": {} }, { "name": "random_state", @@ -126636,7 +136278,8 @@ "docstring": { "type": "int, RandomState instance or None, default=None", "description": "Controls the random seed given to the method chosen to initialize the\nparameters (see `init_params`).\nIn addition, it controls the generation of random samples from the\nfitted distribution (see the method `sample`).\nPass an int for reproducible output across multiple function calls.\nSee :term:`Glossary `." 
- } + }, + "refined_type": {} }, { "name": "warm_start", @@ -126646,7 +136289,8 @@ "docstring": { "type": "bool, default=False", "description": "If 'warm_start' is True, the solution of the last fitting is used as\ninitialization for the next call of fit(). This can speed up\nconvergence when fit is called several times on similar problems.\nSee :term:`the Glossary `." - } + }, + "refined_type": {} }, { "name": "verbose", @@ -126656,7 +136300,8 @@ "docstring": { "type": "int, default=0", "description": "Enable verbose output. If 1 then it prints the current\ninitialization and each iteration step. If greater than 1 then\nit prints also the log probability and the time needed\nfor each step." - } + }, + "refined_type": {} }, { "name": "verbose_interval", @@ -126666,13 +136311,14 @@ "docstring": { "type": "int, default=10", "description": "Number of iteration done before the next print." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, *, n_components=1, covariance_type='full', tol=0.001, reg_covar=1e-06, max_iter=100, n_init=1, init_params='kmeans', weight_concentration_prior_type='dirichlet_process', weight_concentration_prior=None, mean_precision_prior=None, mean_prior=None, degrees_of_freedom_prior=None, covariance_prior=None, random_state=None, warm_start=False, verbose=0, verbose_interval=10):\n super().__init__(n_components=n_components, tol=tol, reg_covar=reg_covar, max_iter=max_iter, n_init=n_init, init_params=init_params, random_state=random_state, warm_start=warm_start, verbose=verbose, verbose_interval=verbose_interval)\n self.covariance_type = covariance_type\n self.weight_concentration_prior_type = weight_concentration_prior_type\n self.weight_concentration_prior = weight_concentration_prior\n self.mean_precision_prior = mean_precision_prior\n self.mean_prior = mean_prior\n self.degrees_of_freedom_prior = degrees_of_freedom_prior\n self.covariance_prior = covariance_prior" }, { @@ -126690,7 +136336,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -126700,13 +136347,14 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Check the parameters of the Gaussian distribution.", - "docstring": "Check the parameters of the Gaussian distribution.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)", + "docstring": "Check the parameters of the Gaussian distribution.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n ", "source_code": "\ndef _check_means_parameters(self, X):\n \"\"\"Check the parameters of the Gaussian distribution.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n \"\"\"\n (_, n_features) = X.shape\n if self.mean_precision_prior is None:\n self.mean_precision_prior_ = 1.0\n elif self.mean_precision_prior > 0.0:\n self.mean_precision_prior_ = self.mean_precision_prior\n else:\n raise ValueError(\"The parameter 'mean_precision_prior' should be greater than 0., but got %.3f.\" % self.mean_precision_prior)\n if self.mean_prior is None:\n self.mean_prior_ = X.mean(axis=0)\n else:\n self.mean_prior_ = check_array(self.mean_prior, dtype=[np.float64, np.float32], ensure_2d=False)\n _check_shape(self.mean_prior_, (n_features, ), 'means')" }, { @@ -126724,7 +136372,8 @@ "docstring": { "type": "", "description": "" 
- } + }, + "refined_type": {} }, { "name": "X", @@ -126734,13 +136383,14 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Check that the parameters are well defined.", - "docstring": "Check that the parameters are well defined.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)", + "docstring": "Check that the parameters are well defined.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n ", "source_code": "\ndef _check_parameters(self, X):\n \"\"\"Check that the parameters are well defined.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n \"\"\"\n if self.covariance_type not in ['spherical', 'tied', 'diag', 'full']:\n raise ValueError(\"Invalid value for 'covariance_type': %s 'covariance_type' should be in ['spherical', 'tied', 'diag', 'full']\" % self.covariance_type)\n if self.weight_concentration_prior_type not in ['dirichlet_process', 'dirichlet_distribution']:\n raise ValueError(\"Invalid value for 'weight_concentration_prior_type': %s 'weight_concentration_prior_type' should be in ['dirichlet_process', 'dirichlet_distribution']\" % self.weight_concentration_prior_type)\n self._check_weights_parameters()\n self._check_means_parameters(X)\n self._check_precision_parameters(X)\n self._checkcovariance_prior_parameter(X)" }, { @@ -126758,7 +136408,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -126768,13 +136419,14 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Check the prior parameters of the precision distribution.", - "docstring": "Check the prior parameters of the precision distribution.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)", + "docstring": "Check the prior parameters of the precision distribution.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n ", "source_code": "\ndef _check_precision_parameters(self, X):\n \"\"\"Check the prior parameters of the precision distribution.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n \"\"\"\n (_, n_features) = X.shape\n if self.degrees_of_freedom_prior is None:\n self.degrees_of_freedom_prior_ = n_features\n elif self.degrees_of_freedom_prior > n_features - 1.0:\n self.degrees_of_freedom_prior_ = self.degrees_of_freedom_prior\n else:\n raise ValueError(\"The parameter 'degrees_of_freedom_prior' should be greater than %d, but got %.3f.\" % (n_features - 1, self.degrees_of_freedom_prior))" }, { @@ -126792,7 +136444,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -126816,7 +136469,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -126826,13 +136480,14 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Check the `covariance_prior_`.", - "docstring": "Check the `covariance_prior_`.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)", + "docstring": "Check the `covariance_prior_`.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n ", "source_code": "\ndef 
_checkcovariance_prior_parameter(self, X):\n \"\"\"Check the `covariance_prior_`.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n \"\"\"\n (_, n_features) = X.shape\n if self.covariance_prior is None:\n self.covariance_prior_ = {'full': np.atleast_2d(np.cov(X.T)), 'tied': np.atleast_2d(np.cov(X.T)), 'diag': np.var(X, axis=0, ddof=1), 'spherical': np.var(X, axis=0, ddof=1).mean()}[self.covariance_type]\n elif self.covariance_type in ['full', 'tied']:\n self.covariance_prior_ = check_array(self.covariance_prior, dtype=[np.float64, np.float32], ensure_2d=False)\n _check_shape(self.covariance_prior_, (n_features, n_features), '%s covariance_prior' % self.covariance_type)\n _check_precision_matrix(self.covariance_prior_, self.covariance_type)\n elif self.covariance_type == 'diag':\n self.covariance_prior_ = check_array(self.covariance_prior, dtype=[np.float64, np.float32], ensure_2d=False)\n _check_shape(self.covariance_prior_, (n_features, ), '%s covariance_prior' % self.covariance_type)\n _check_precision_positivity(self.covariance_prior_, self.covariance_type)\n elif self.covariance_prior > 0.0:\n self.covariance_prior_ = self.covariance_prior\n else:\n raise ValueError(\"The parameter 'spherical covariance_prior' should be greater than 0., but got %.3f.\" % self.covariance_prior)" }, { @@ -126850,7 +136505,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "log_resp", @@ -126860,7 +136516,8 @@ "docstring": { "type": "array, shape (n_samples, n_components)", "description": "Logarithm of the posterior probabilities (or responsibilities) of\nthe point of each sample in X." - } + }, + "refined_type": {} }, { "name": "log_prob_norm", @@ -126870,13 +136527,14 @@ "docstring": { "type": "float", "description": "Logarithm of the probability of each sample in X." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Estimate the lower bound of the model.\n\nThe lower bound on the likelihood (of the training data with respect to the model) is used to detect the convergence and has to increase at each iteration.", - "docstring": "Estimate the lower bound of the model.\n\nThe lower bound on the likelihood (of the training data with respect to\nthe model) is used to detect the convergence and has to increase at\neach iteration.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n\nlog_resp : array, shape (n_samples, n_components)\n Logarithm of the posterior probabilities (or responsibilities) of\n the point of each sample in X.\n\nlog_prob_norm : float\n Logarithm of the probability of each sample in X.\n\nReturns\n-------\nlower_bound : float", + "description": "Estimate the lower bound of the model.\n\nThe lower bound on the likelihood (of the training data with respect to\nthe model) is used to detect the convergence and has to increase at\neach iteration.", + "docstring": "Estimate the lower bound of the model.\n\n The lower bound on the likelihood (of the training data with respect to\n the model) is used to detect the convergence and has to increase at\n each iteration.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n\n log_resp : array, shape (n_samples, n_components)\n Logarithm of the posterior probabilities (or responsibilities) of\n the point of each sample in X.\n\n log_prob_norm : float\n Logarithm of the probability of each sample in X.\n\n Returns\n -------\n lower_bound : float\n ", "source_code": "\ndef _compute_lower_bound(self, log_resp, log_prob_norm):\n \"\"\"Estimate the lower bound of the model.\n\n The lower bound on the likelihood (of the training data with respect to\n the model) is used to detect the convergence and has to increase at\n each iteration.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n\n log_resp : array, shape (n_samples, n_components)\n Logarithm of the posterior probabilities (or responsibilities) of\n the point of each sample in X.\n\n log_prob_norm : float\n Logarithm of the probability of each sample in X.\n\n Returns\n -------\n lower_bound : float\n \"\"\"\n (n_features, ) = self.mean_prior_.shape\n log_det_precisions_chol = _compute_log_det_cholesky(self.precisions_cholesky_, self.covariance_type, n_features) - 0.5 * n_features * np.log(self.degrees_of_freedom_)\n if self.covariance_type == 'tied':\n log_wishart = self.n_components * np.float64(_log_wishart_norm(self.degrees_of_freedom_, log_det_precisions_chol, n_features))\n else:\n log_wishart = np.sum(_log_wishart_norm(self.degrees_of_freedom_, log_det_precisions_chol, n_features))\n if self.weight_concentration_prior_type == 'dirichlet_process':\n log_norm_weight = -np.sum(betaln(self.weight_concentration_[0], self.weight_concentration_[1]))\n else:\n log_norm_weight = _log_dirichlet_norm(self.weight_concentration_)\n return -np.sum(np.exp(log_resp) * log_resp) - log_wishart - log_norm_weight - 0.5 * n_features * np.sum(np.log(self.mean_precision_))" }, { @@ -126894,7 +136552,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -126904,13 +136563,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _estimate_log_prob(self, X):\n (_, 
n_features) = X.shape\n log_gauss = _estimate_log_gaussian_prob(X, self.means_, self.precisions_cholesky_, self.covariance_type) - 0.5 * n_features * np.log(self.degrees_of_freedom_)\n log_lambda = n_features * np.log(2.0) + np.sum(digamma(0.5 * (self.degrees_of_freedom_ - np.arange(0, n_features)[:, np.newaxis])), 0)\n return log_gauss + 0.5 * (log_lambda - n_features / self.mean_precision_)" }, { @@ -126928,13 +136588,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _estimate_log_weights(self):\n if self.weight_concentration_prior_type == 'dirichlet_process':\n digamma_sum = digamma(self.weight_concentration_[0] + self.weight_concentration_[1])\n digamma_a = digamma(self.weight_concentration_[0])\n digamma_b = digamma(self.weight_concentration_[1])\n return digamma_a - digamma_sum + np.hstack((0, np.cumsum(digamma_b - digamma_sum)[:-1]))\n else:\n return digamma(self.weight_concentration_) - digamma(np.sum(self.weight_concentration_))" }, { @@ -126952,7 +136613,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "nk", @@ -126962,7 +136624,8 @@ "docstring": { "type": "array-like of shape (n_components,)", "description": "" - } + }, + "refined_type": {} }, { "name": "xk", @@ -126972,13 +136635,14 @@ "docstring": { "type": "array-like of shape (n_components, n_features)", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Estimate the parameters of the Gaussian distribution.", - "docstring": "Estimate the parameters of the Gaussian distribution.\n\nParameters\n----------\nnk : array-like of shape (n_components,)\n\nxk : array-like of shape (n_components, n_features)", + "docstring": "Estimate the parameters of the Gaussian distribution.\n\n Parameters\n ----------\n nk : array-like of shape (n_components,)\n\n xk : array-like of shape (n_components, n_features)\n ", "source_code": "\ndef _estimate_means(self, nk, xk):\n \"\"\"Estimate the parameters of the Gaussian distribution.\n\n Parameters\n ----------\n nk : array-like of shape (n_components,)\n\n xk : array-like of shape (n_components, n_features)\n \"\"\"\n self.mean_precision_ = self.mean_precision_prior_ + nk\n self.means_ = (self.mean_precision_prior_ * self.mean_prior_ + nk[:, np.newaxis] * xk) / self.mean_precision_[:, np.newaxis]" }, { @@ -126996,7 +136660,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "nk", @@ -127006,7 +136671,8 @@ "docstring": { "type": "array-like of shape (n_components,)", "description": "" - } + }, + "refined_type": {} }, { "name": "xk", @@ -127016,7 +136682,8 @@ "docstring": { "type": "array-like of shape (n_components, n_features)", "description": "" - } + }, + "refined_type": {} }, { "name": "sk", @@ -127026,13 +136693,14 @@ "docstring": { "type": "array-like", "description": "The shape depends of `covariance_type`:\n'full' : (n_components, n_features, n_features)\n'tied' : (n_features, n_features)\n'diag' : (n_components, n_features)\n'spherical' : (n_components,)" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Estimate the precisions parameters of the precision distribution.", - "docstring": "Estimate the precisions parameters of the precision distribution.\n\nParameters\n----------\nnk : array-like of shape (n_components,)\n\nxk : array-like of shape (n_components, 
n_features)\n\nsk : array-like\n The shape depends of `covariance_type`:\n 'full' : (n_components, n_features, n_features)\n 'tied' : (n_features, n_features)\n 'diag' : (n_components, n_features)\n 'spherical' : (n_components,)", + "docstring": "Estimate the precisions parameters of the precision distribution.\n\n Parameters\n ----------\n nk : array-like of shape (n_components,)\n\n xk : array-like of shape (n_components, n_features)\n\n sk : array-like\n The shape depends of `covariance_type`:\n 'full' : (n_components, n_features, n_features)\n 'tied' : (n_features, n_features)\n 'diag' : (n_components, n_features)\n 'spherical' : (n_components,)\n ", "source_code": "\ndef _estimate_precisions(self, nk, xk, sk):\n \"\"\"Estimate the precisions parameters of the precision distribution.\n\n Parameters\n ----------\n nk : array-like of shape (n_components,)\n\n xk : array-like of shape (n_components, n_features)\n\n sk : array-like\n The shape depends of `covariance_type`:\n 'full' : (n_components, n_features, n_features)\n 'tied' : (n_features, n_features)\n 'diag' : (n_components, n_features)\n 'spherical' : (n_components,)\n \"\"\"\n {'full': self._estimate_wishart_full, 'tied': self._estimate_wishart_tied, 'diag': self._estimate_wishart_diag, 'spherical': self._estimate_wishart_spherical}[self.covariance_type](nk, xk, sk)\n self.precisions_cholesky_ = _compute_precision_cholesky(self.covariances_, self.covariance_type)" }, { @@ -127050,7 +136718,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "nk", @@ -127060,13 +136729,14 @@ "docstring": { "type": "array-like of shape (n_components,)", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Estimate the parameters of the Dirichlet distribution.", - "docstring": "Estimate the parameters of the Dirichlet distribution.\n\nParameters\n----------\nnk : array-like of shape (n_components,)", + "docstring": "Estimate the parameters of the Dirichlet distribution.\n\n Parameters\n ----------\n nk : array-like of shape (n_components,)\n ", "source_code": "\ndef _estimate_weights(self, nk):\n \"\"\"Estimate the parameters of the Dirichlet distribution.\n\n Parameters\n ----------\n nk : array-like of shape (n_components,)\n \"\"\"\n if self.weight_concentration_prior_type == 'dirichlet_process':\n self.weight_concentration_ = (1.0 + nk, self.weight_concentration_prior_ + np.hstack((np.cumsum(nk[::-1])[-2::-1], 0)))\n else:\n self.weight_concentration_ = self.weight_concentration_prior_ + nk" }, { @@ -127084,7 +136754,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "nk", @@ -127094,7 +136765,8 @@ "docstring": { "type": "array-like of shape (n_components,)", "description": "" - } + }, + "refined_type": {} }, { "name": "xk", @@ -127104,7 +136776,8 @@ "docstring": { "type": "array-like of shape (n_components, n_features)", "description": "" - } + }, + "refined_type": {} }, { "name": "sk", @@ -127114,13 +136787,14 @@ "docstring": { "type": "array-like of shape (n_components, n_features)", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Estimate the diag Wishart distribution parameters.", - "docstring": "Estimate the diag Wishart distribution parameters.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n\nnk : array-like of shape (n_components,)\n\nxk : array-like of shape (n_components, n_features)\n\nsk : array-like of shape 
(n_components, n_features)", + "docstring": "Estimate the diag Wishart distribution parameters.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n\n nk : array-like of shape (n_components,)\n\n xk : array-like of shape (n_components, n_features)\n\n sk : array-like of shape (n_components, n_features)\n ", "source_code": "\ndef _estimate_wishart_diag(self, nk, xk, sk):\n \"\"\"Estimate the diag Wishart distribution parameters.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n\n nk : array-like of shape (n_components,)\n\n xk : array-like of shape (n_components, n_features)\n\n sk : array-like of shape (n_components, n_features)\n \"\"\"\n (_, n_features) = xk.shape\n self.degrees_of_freedom_ = self.degrees_of_freedom_prior_ + nk\n diff = xk - self.mean_prior_\n self.covariances_ = self.covariance_prior_ + nk[:, np.newaxis] * (sk + (self.mean_precision_prior_ / self.mean_precision_)[:, np.newaxis] * np.square(diff))\n self.covariances_ /= self.degrees_of_freedom_[:, np.newaxis]" }, { @@ -127138,7 +136812,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "nk", @@ -127148,7 +136823,8 @@ "docstring": { "type": "array-like of shape (n_components,)", "description": "" - } + }, + "refined_type": {} }, { "name": "xk", @@ -127158,7 +136834,8 @@ "docstring": { "type": "array-like of shape (n_components, n_features)", "description": "" - } + }, + "refined_type": {} }, { "name": "sk", @@ -127168,13 +136845,14 @@ "docstring": { "type": "array-like of shape (n_components, n_features, n_features)", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Estimate the full Wishart distribution parameters.", - "docstring": "Estimate the full Wishart distribution parameters.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n\nnk : array-like of shape (n_components,)\n\nxk : array-like of shape (n_components, n_features)\n\nsk : array-like of shape (n_components, n_features, n_features)", + "docstring": "Estimate the full Wishart distribution parameters.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n\n nk : array-like of shape (n_components,)\n\n xk : array-like of shape (n_components, n_features)\n\n sk : array-like of shape (n_components, n_features, n_features)\n ", "source_code": "\ndef _estimate_wishart_full(self, nk, xk, sk):\n \"\"\"Estimate the full Wishart distribution parameters.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n\n nk : array-like of shape (n_components,)\n\n xk : array-like of shape (n_components, n_features)\n\n sk : array-like of shape (n_components, n_features, n_features)\n \"\"\"\n (_, n_features) = xk.shape\n self.degrees_of_freedom_ = self.degrees_of_freedom_prior_ + nk\n self.covariances_ = np.empty((self.n_components, n_features, n_features))\n for k in range(self.n_components):\n diff = xk[k] - self.mean_prior_\n self.covariances_[k] = self.covariance_prior_ + nk[k] * sk[k] + nk[k] * self.mean_precision_prior_ / self.mean_precision_[k] * np.outer(diff, diff)\n self.covariances_ /= self.degrees_of_freedom_[:, np.newaxis, np.newaxis]" }, { @@ -127192,7 +136870,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "nk", @@ -127202,7 +136881,8 @@ "docstring": { "type": "array-like of shape (n_components,)", "description": "" - } + }, + "refined_type": {} }, { "name": "xk", @@ -127212,7 +136892,8 @@ 
"docstring": { "type": "array-like of shape (n_components, n_features)", "description": "" - } + }, + "refined_type": {} }, { "name": "sk", @@ -127222,13 +136903,14 @@ "docstring": { "type": "array-like of shape (n_components,)", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Estimate the spherical Wishart distribution parameters.", - "docstring": "Estimate the spherical Wishart distribution parameters.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n\nnk : array-like of shape (n_components,)\n\nxk : array-like of shape (n_components, n_features)\n\nsk : array-like of shape (n_components,)", + "docstring": "Estimate the spherical Wishart distribution parameters.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n\n nk : array-like of shape (n_components,)\n\n xk : array-like of shape (n_components, n_features)\n\n sk : array-like of shape (n_components,)\n ", "source_code": "\ndef _estimate_wishart_spherical(self, nk, xk, sk):\n \"\"\"Estimate the spherical Wishart distribution parameters.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n\n nk : array-like of shape (n_components,)\n\n xk : array-like of shape (n_components, n_features)\n\n sk : array-like of shape (n_components,)\n \"\"\"\n (_, n_features) = xk.shape\n self.degrees_of_freedom_ = self.degrees_of_freedom_prior_ + nk\n diff = xk - self.mean_prior_\n self.covariances_ = self.covariance_prior_ + nk * (sk + self.mean_precision_prior_ / self.mean_precision_ * np.mean(np.square(diff), 1))\n self.covariances_ /= self.degrees_of_freedom_" }, { @@ -127246,7 +136928,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "nk", @@ -127256,7 +136939,8 @@ "docstring": { "type": "array-like of shape (n_components,)", "description": "" - } + }, + "refined_type": {} }, { "name": "xk", @@ -127266,7 +136950,8 @@ "docstring": { "type": "array-like of shape (n_components, n_features)", "description": "" - } + }, + "refined_type": {} }, { "name": "sk", @@ -127276,13 +136961,14 @@ "docstring": { "type": "array-like of shape (n_features, n_features)", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Estimate the tied Wishart distribution parameters.", - "docstring": "Estimate the tied Wishart distribution parameters.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n\nnk : array-like of shape (n_components,)\n\nxk : array-like of shape (n_components, n_features)\n\nsk : array-like of shape (n_features, n_features)", + "docstring": "Estimate the tied Wishart distribution parameters.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n\n nk : array-like of shape (n_components,)\n\n xk : array-like of shape (n_components, n_features)\n\n sk : array-like of shape (n_features, n_features)\n ", "source_code": "\ndef _estimate_wishart_tied(self, nk, xk, sk):\n \"\"\"Estimate the tied Wishart distribution parameters.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n\n nk : array-like of shape (n_components,)\n\n xk : array-like of shape (n_components, n_features)\n\n sk : array-like of shape (n_features, n_features)\n \"\"\"\n (_, n_features) = xk.shape\n self.degrees_of_freedom_ = self.degrees_of_freedom_prior_ + nk.sum() / self.n_components\n diff = xk - self.mean_prior_\n self.covariances_ = self.covariance_prior_ + sk * nk.sum() / 
self.n_components + self.mean_precision_prior_ / self.n_components * np.dot(nk / self.mean_precision_ * diff.T, diff)\n self.covariances_ /= self.degrees_of_freedom_" }, { @@ -127300,13 +136986,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _get_parameters(self):\n return self.weight_concentration_, self.mean_precision_, self.means_, self.degrees_of_freedom_, self.covariances_, self.precisions_cholesky_" }, { @@ -127324,7 +137011,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -127334,7 +137022,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "" - } + }, + "refined_type": {} }, { "name": "resp", @@ -127344,13 +137033,14 @@ "docstring": { "type": "array-like of shape (n_samples, n_components)", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Initialization of the mixture parameters.", - "docstring": "Initialization of the mixture parameters.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n\nresp : array-like of shape (n_samples, n_components)", + "docstring": "Initialization of the mixture parameters.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n\n resp : array-like of shape (n_samples, n_components)\n ", "source_code": "\ndef _initialize(self, X, resp):\n \"\"\"Initialization of the mixture parameters.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n\n resp : array-like of shape (n_samples, n_components)\n \"\"\"\n (nk, xk, sk) = _estimate_gaussian_parameters(X, resp, self.reg_covar, self.covariance_type)\n self._estimate_weights(nk)\n self._estimate_means(nk, xk)\n self._estimate_precisions(nk, xk, sk)" }, { @@ -127368,7 +137058,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -127378,7 +137069,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "" - } + }, + "refined_type": {} }, { "name": "log_resp", @@ -127388,13 +137080,14 @@ "docstring": { "type": "array-like of shape (n_samples, n_components)", "description": "Logarithm of the posterior probabilities (or responsibilities) of\nthe point of each sample in X." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "M step.", - "docstring": "M step.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n\nlog_resp : array-like of shape (n_samples, n_components)\n Logarithm of the posterior probabilities (or responsibilities) of\n the point of each sample in X.", + "docstring": "M step.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n\n log_resp : array-like of shape (n_samples, n_components)\n Logarithm of the posterior probabilities (or responsibilities) of\n the point of each sample in X.\n ", "source_code": "\ndef _m_step(self, X, log_resp):\n \"\"\"M step.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n\n log_resp : array-like of shape (n_samples, n_components)\n Logarithm of the posterior probabilities (or responsibilities) of\n the point of each sample in X.\n \"\"\"\n (n_samples, _) = X.shape\n (nk, xk, sk) = _estimate_gaussian_parameters(X, np.exp(log_resp), self.reg_covar, self.covariance_type)\n self._estimate_weights(nk)\n self._estimate_means(nk, xk)\n self._estimate_precisions(nk, xk, sk)" }, { @@ -127412,7 +137105,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "params", @@ -127422,13 +137116,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _set_parameters(self, params):\n (self.weight_concentration_, self.mean_precision_, self.means_, self.degrees_of_freedom_, self.covariances_, self.precisions_cholesky_) = params\n if self.weight_concentration_prior_type == 'dirichlet_process':\n weight_dirichlet_sum = self.weight_concentration_[0] + self.weight_concentration_[1]\n tmp = self.weight_concentration_[1] / weight_dirichlet_sum\n self.weights_ = self.weight_concentration_[0] / weight_dirichlet_sum * np.hstack((1, np.cumprod(tmp[:-1])))\n self.weights_ /= np.sum(self.weights_)\n else:\n self.weights_ = self.weight_concentration_ / np.sum(self.weight_concentration_)\n if self.covariance_type == 'full':\n self.precisions_ = np.array([np.dot(prec_chol, prec_chol.T) for prec_chol in self.precisions_cholesky_])\n elif self.covariance_type == 'tied':\n self.precisions_ = np.dot(self.precisions_cholesky_, self.precisions_cholesky_.T)\n else:\n self.precisions_ = self.precisions_cholesky_**2" }, { @@ -127446,13 +137141,14 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "The parameters values of the Dirichlet distribution." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Compute the log of the Dirichlet distribution normalization term.", - "docstring": "Compute the log of the Dirichlet distribution normalization term.\n\nParameters\n----------\ndirichlet_concentration : array-like of shape (n_samples,)\n The parameters values of the Dirichlet distribution.\n\nReturns\n-------\nlog_dirichlet_norm : float\n The log normalization of the Dirichlet distribution.", + "docstring": "Compute the log of the Dirichlet distribution normalization term.\n\n Parameters\n ----------\n dirichlet_concentration : array-like of shape (n_samples,)\n The parameters values of the Dirichlet distribution.\n\n Returns\n -------\n log_dirichlet_norm : float\n The log normalization of the Dirichlet distribution.\n ", "source_code": "\ndef _log_dirichlet_norm(dirichlet_concentration):\n \"\"\"Compute the log of the Dirichlet distribution normalization term.\n\n Parameters\n ----------\n dirichlet_concentration : array-like of shape (n_samples,)\n The parameters values of the Dirichlet distribution.\n\n Returns\n -------\n log_dirichlet_norm : float\n The log normalization of the Dirichlet distribution.\n \"\"\"\n return gammaln(np.sum(dirichlet_concentration)) - np.sum(gammaln(dirichlet_concentration))" }, { @@ -127470,7 +137166,8 @@ "docstring": { "type": "array-like of shape (n_components,)", "description": "The number of degrees of freedom on the covariance Wishart\ndistributions." - } + }, + "refined_type": {} }, { "name": "log_det_precisions_chol", @@ -127480,7 +137177,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_features", @@ -127490,13 +137188,14 @@ "docstring": { "type": "int", "description": "The number of features." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Compute the log of the Wishart distribution normalization term.", - "docstring": "Compute the log of the Wishart distribution normalization term.\n\nParameters\n----------\ndegrees_of_freedom : array-like of shape (n_components,)\n The number of degrees of freedom on the covariance Wishart\n distributions.\n\nlog_det_precision_chol : array-like of shape (n_components,)\n The determinant of the precision matrix for each component.\n\nn_features : int\n The number of features.\n\nReturn\n------\nlog_wishart_norm : array-like of shape (n_components,)\n The log normalization of the Wishart distribution.", + "docstring": "Compute the log of the Wishart distribution normalization term.\n\n Parameters\n ----------\n degrees_of_freedom : array-like of shape (n_components,)\n The number of degrees of freedom on the covariance Wishart\n distributions.\n\n log_det_precision_chol : array-like of shape (n_components,)\n The determinant of the precision matrix for each component.\n\n n_features : int\n The number of features.\n\n Return\n ------\n log_wishart_norm : array-like of shape (n_components,)\n The log normalization of the Wishart distribution.\n ", "source_code": "\ndef _log_wishart_norm(degrees_of_freedom, log_det_precisions_chol, n_features):\n \"\"\"Compute the log of the Wishart distribution normalization term.\n\n Parameters\n ----------\n degrees_of_freedom : array-like of shape (n_components,)\n The number of degrees of freedom on the covariance Wishart\n distributions.\n\n log_det_precision_chol : array-like of shape (n_components,)\n The determinant of the precision matrix for each component.\n\n n_features : int\n The number of features.\n\n Return\n ------\n log_wishart_norm : array-like of shape (n_components,)\n The log normalization of the Wishart distribution.\n \"\"\"\n return -(degrees_of_freedom * log_det_precisions_chol + degrees_of_freedom * n_features * 0.5 * math.log(2.0) + np.sum(gammaln(0.5 * (degrees_of_freedom - np.arange(n_features)[:, np.newaxis])), 0))" }, { @@ -127514,7 +137213,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_components", @@ -127524,7 +137224,8 @@ "docstring": { "type": "int, default=1", "description": "The number of mixture components." - } + }, + "refined_type": {} }, { "name": "covariance_type", @@ -127533,7 +137234,11 @@ "assigned_by": "NAME_ONLY", "docstring": { "type": "{'full', 'tied', 'diag', 'spherical'}, default='full'", - "description": "String describing the type of covariance parameters to use.\nMust be one of:\n\n'full'\n each component has its own general covariance matrix\n'tied'\n all components share the same general covariance matrix\n'diag'\n each component has its own diagonal covariance matrix\n'spherical'\n each component has its own single variance" + "description": "String describing the type of covariance parameters to use.\nMust be one of:\n\n- 'full': each component has its own general covariance matrix.\n- 'tied': all components share the same general covariance matrix.\n- 'diag': each component has its own diagonal covariance matrix.\n- 'spherical': each component has its own single variance." + }, + "refined_type": { + "kind": "EnumType", + "values": ["tied", "full", "diag", "spherical"] } }, { @@ -127544,7 +137249,8 @@ "docstring": { "type": "float, default=1e-3", "description": "The convergence threshold. 
EM iterations will stop when the\nlower bound average gain is below this threshold." - } + }, + "refined_type": {} }, { "name": "reg_covar", @@ -127554,7 +137260,8 @@ "docstring": { "type": "float, default=1e-6", "description": "Non-negative regularization added to the diagonal of covariance.\nAllows to assure that the covariance matrices are all positive." - } + }, + "refined_type": {} }, { "name": "max_iter", @@ -127564,7 +137271,8 @@ "docstring": { "type": "int, default=100", "description": "The number of EM iterations to perform." - } + }, + "refined_type": {} }, { "name": "n_init", @@ -127574,7 +137282,8 @@ "docstring": { "type": "int, default=1", "description": "The number of initializations to perform. The best results are kept." - } + }, + "refined_type": {} }, { "name": "init_params", @@ -127584,6 +137293,10 @@ "docstring": { "type": "{'kmeans', 'random'}, default='kmeans'", "description": "The method used to initialize the weights, the means and the\nprecisions.\nMust be one of::\n\n 'kmeans' : responsibilities are initialized using kmeans.\n 'random' : responsibilities are initialized randomly." + }, + "refined_type": { + "kind": "EnumType", + "values": ["random", "kmeans"] } }, { @@ -127594,7 +137307,8 @@ "docstring": { "type": "array-like of shape (n_components, ), default=None", "description": "The user-provided initial weights.\nIf it is None, weights are initialized using the `init_params` method." - } + }, + "refined_type": {} }, { "name": "means_init", @@ -127604,7 +137318,8 @@ "docstring": { "type": "array-like of shape (n_components, n_features), default=None", "description": "The user-provided initial means,\nIf it is None, means are initialized using the `init_params` method." - } + }, + "refined_type": {} }, { "name": "precisions_init", @@ -127614,7 +137329,8 @@ "docstring": { "type": "array-like, default=None", "description": "The user-provided initial precisions (inverse of the covariance\nmatrices).\nIf it is None, precisions are initialized using the 'init_params'\nmethod.\nThe shape depends on 'covariance_type'::\n\n (n_components,) if 'spherical',\n (n_features, n_features) if 'tied',\n (n_components, n_features) if 'diag',\n (n_components, n_features, n_features) if 'full'" - } + }, + "refined_type": {} }, { "name": "random_state", @@ -127624,7 +137340,8 @@ "docstring": { "type": "int, RandomState instance or None, default=None", "description": "Controls the random seed given to the method chosen to initialize the\nparameters (see `init_params`).\nIn addition, it controls the generation of random samples from the\nfitted distribution (see the method `sample`).\nPass an int for reproducible output across multiple function calls.\nSee :term:`Glossary `." - } + }, + "refined_type": {} }, { "name": "warm_start", @@ -127634,7 +137351,8 @@ "docstring": { "type": "bool, default=False", "description": "If 'warm_start' is True, the solution of the last fitting is used as\ninitialization for the next call of fit(). This can speed up\nconvergence when fit is called several times on similar problems.\nIn that case, 'n_init' is ignored and only a single initialization\noccurs upon the first call.\nSee :term:`the Glossary `." - } + }, + "refined_type": {} }, { "name": "verbose", @@ -127644,7 +137362,8 @@ "docstring": { "type": "int, default=0", "description": "Enable verbose output. If 1 then it prints the current\ninitialization and each iteration step. If greater than 1 then\nit prints also the log probability and the time needed\nfor each step." 
- } + }, + "refined_type": {} }, { "name": "verbose_interval", @@ -127654,13 +137373,14 @@ "docstring": { "type": "int, default=10", "description": "Number of iteration done before the next print." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, n_components=1, *, covariance_type='full', tol=0.001, reg_covar=1e-06, max_iter=100, n_init=1, init_params='kmeans', weights_init=None, means_init=None, precisions_init=None, random_state=None, warm_start=False, verbose=0, verbose_interval=10):\n super().__init__(n_components=n_components, tol=tol, reg_covar=reg_covar, max_iter=max_iter, n_init=n_init, init_params=init_params, random_state=random_state, warm_start=warm_start, verbose=verbose, verbose_interval=verbose_interval)\n self.covariance_type = covariance_type\n self.weights_init = weights_init\n self.means_init = means_init\n self.precisions_init = precisions_init" }, { @@ -127678,7 +137398,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -127688,7 +137409,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -127712,7 +137434,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "_", @@ -127722,7 +137445,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "log_prob_norm", @@ -127732,13 +137456,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _compute_lower_bound(self, _, log_prob_norm):\n return log_prob_norm" }, { @@ -127756,7 +137481,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -127766,13 +137492,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _estimate_log_prob(self, X):\n return _estimate_log_gaussian_prob(X, self.means_, self.precisions_cholesky_, self.covariance_type)" }, { @@ -127790,13 +137517,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _estimate_log_weights(self):\n return np.log(self.weights_)" }, { @@ -127814,13 +137542,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _get_parameters(self):\n return self.weights_, self.means_, self.covariances_, self.precisions_cholesky_" }, { @@ -127838,7 +137567,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -127848,7 +137578,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "" - } + }, + "refined_type": {} }, { "name": "resp", @@ -127858,14 +137589,15 @@ "docstring": { "type": "array-like of shape (n_samples, n_components)", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Initialization of the Gaussian mixture parameters.", - "docstring": "Initialization of the Gaussian mixture parameters.\n\nParameters\n----------\nX : array-like of shape 
(n_samples, n_features)\n\nresp : array-like of shape (n_samples, n_components)", - "source_code": "\ndef _initialize(self, X, resp):\n \"\"\"Initialization of the Gaussian mixture parameters.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n\n resp : array-like of shape (n_samples, n_components)\n \"\"\"\n (n_samples, _) = X.shape\n (weights, means, covariances) = _estimate_gaussian_parameters(X, resp, self.reg_covar, self.covariance_type)\n weights /= n_samples\n self.weights_ = weights if self.weights_init is None else self.weights_init\n self.means_ = means if self.means_init is None else self.means_init\n if self.precisions_init is None:\n self.covariances_ = covariances\n self.precisions_cholesky_ = _compute_precision_cholesky(covariances, self.covariance_type)\n elif self.covariance_type == 'full':\n self.precisions_cholesky_ = np.array([linalg.cholesky(prec_init, lower=True) for prec_init in self.precisions_init])\n elif self.covariance_type == 'tied':\n self.precisions_cholesky_ = linalg.cholesky(self.precisions_init, lower=True)\n else:\n self.precisions_cholesky_ = self.precisions_init" + "docstring": "Initialization of the Gaussian mixture parameters.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n\n resp : array-like of shape (n_samples, n_components)\n ", + "source_code": "\ndef _initialize(self, X, resp):\n \"\"\"Initialization of the Gaussian mixture parameters.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n\n resp : array-like of shape (n_samples, n_components)\n \"\"\"\n (n_samples, _) = X.shape\n (weights, means, covariances) = _estimate_gaussian_parameters(X, resp, self.reg_covar, self.covariance_type)\n weights /= n_samples\n self.weights_ = weights if self.weights_init is None else self.weights_init\n self.means_ = means if self.means_init is None else self.means_init\n if self.precisions_init is None:\n self.covariances_ = covariances\n self.precisions_cholesky_ = _compute_precision_cholesky(covariances, self.covariance_type)\n elif self.covariance_type == 'full':\n self.precisions_cholesky_ = np.array([linalg.cholesky(prec_init, lower=True) for prec_init in self.precisions_init])\n elif self.covariance_type == 'tied':\n self.precisions_cholesky_ = linalg.cholesky(self.precisions_init, lower=True)\n else:\n self.precisions_cholesky_ = np.sqrt(self.precisions_init)" }, { "name": "_m_step", @@ -127882,7 +137614,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -127892,7 +137625,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "" - } + }, + "refined_type": {} }, { "name": "log_resp", @@ -127902,13 +137636,14 @@ "docstring": { "type": "array-like of shape (n_samples, n_components)", "description": "Logarithm of the posterior probabilities (or responsibilities) of\nthe point of each sample in X." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "M step.", - "docstring": "M step.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n\nlog_resp : array-like of shape (n_samples, n_components)\n Logarithm of the posterior probabilities (or responsibilities) of\n the point of each sample in X.", + "docstring": "M step.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n\n log_resp : array-like of shape (n_samples, n_components)\n Logarithm of the posterior probabilities (or responsibilities) of\n the point of each sample in X.\n ", "source_code": "\ndef _m_step(self, X, log_resp):\n \"\"\"M step.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n\n log_resp : array-like of shape (n_samples, n_components)\n Logarithm of the posterior probabilities (or responsibilities) of\n the point of each sample in X.\n \"\"\"\n (n_samples, _) = X.shape\n (self.weights_, self.means_, self.covariances_) = _estimate_gaussian_parameters(X, np.exp(log_resp), self.reg_covar, self.covariance_type)\n self.weights_ /= n_samples\n self.precisions_cholesky_ = _compute_precision_cholesky(self.covariances_, self.covariance_type)" }, { @@ -127926,7 +137661,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -127950,7 +137686,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "params", @@ -127960,13 +137697,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _set_parameters(self, params):\n (self.weights_, self.means_, self.covariances_, self.precisions_cholesky_) = params\n (_, n_features) = self.means_.shape\n if self.covariance_type == 'full':\n self.precisions_ = np.empty(self.precisions_cholesky_.shape)\n for (k, prec_chol) in enumerate(self.precisions_cholesky_):\n self.precisions_[k] = np.dot(prec_chol, prec_chol.T)\n elif self.covariance_type == 'tied':\n self.precisions_ = np.dot(self.precisions_cholesky_, self.precisions_cholesky_.T)\n else:\n self.precisions_ = self.precisions_cholesky_**2" }, { @@ -127984,7 +137722,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -127994,14 +137733,15 @@ "docstring": { "type": "array of shape (n_samples, n_dimensions)", "description": "The input samples." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Akaike information criterion for the current model on the input X.", - "docstring": "Akaike information criterion for the current model on the input X.\n\nParameters\n----------\nX : array of shape (n_samples, n_dimensions)\n The input samples.\n\nReturns\n-------\naic : float\n The lower the better.", - "source_code": "\ndef aic(self, X):\n \"\"\"Akaike information criterion for the current model on the input X.\n\n Parameters\n ----------\n X : array of shape (n_samples, n_dimensions)\n The input samples.\n\n Returns\n -------\n aic : float\n The lower the better.\n \"\"\"\n return -2 * self.score(X) * X.shape[0] + 2 * self._n_parameters()" + "description": "Akaike information criterion for the current model on the input X.\n\nYou can refer to this :ref:`mathematical section ` for more\ndetails regarding the formulation of the AIC used.", + "docstring": "Akaike information criterion for the current model on the input X.\n\n You can refer to this :ref:`mathematical section ` for more\n details regarding the formulation of the AIC used.\n\n Parameters\n ----------\n X : array of shape (n_samples, n_dimensions)\n The input samples.\n\n Returns\n -------\n aic : float\n The lower the better.\n ", + "source_code": "\ndef aic(self, X):\n \"\"\"Akaike information criterion for the current model on the input X.\n\n You can refer to this :ref:`mathematical section ` for more\n details regarding the formulation of the AIC used.\n\n Parameters\n ----------\n X : array of shape (n_samples, n_dimensions)\n The input samples.\n\n Returns\n -------\n aic : float\n The lower the better.\n \"\"\"\n return -2 * self.score(X) * X.shape[0] + 2 * self._n_parameters()" }, { "name": "bic", @@ -128018,7 +137758,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -128028,14 +137769,15 @@ "docstring": { "type": "array of shape (n_samples, n_dimensions)", "description": "The input samples." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Bayesian information criterion for the current model on the input X.", - "docstring": "Bayesian information criterion for the current model on the input X.\n\nParameters\n----------\nX : array of shape (n_samples, n_dimensions)\n The input samples.\n\nReturns\n-------\nbic : float\n The lower the better.", - "source_code": "\ndef bic(self, X):\n \"\"\"Bayesian information criterion for the current model on the input X.\n\n Parameters\n ----------\n X : array of shape (n_samples, n_dimensions)\n The input samples.\n\n Returns\n -------\n bic : float\n The lower the better.\n \"\"\"\n return -2 * self.score(X) * X.shape[0] + self._n_parameters() * np.log(X.shape[0])" + "description": "Bayesian information criterion for the current model on the input X.\n\nYou can refer to this :ref:`mathematical section ` for more\ndetails regarding the formulation of the BIC used.", + "docstring": "Bayesian information criterion for the current model on the input X.\n\n You can refer to this :ref:`mathematical section ` for more\n details regarding the formulation of the BIC used.\n\n Parameters\n ----------\n X : array of shape (n_samples, n_dimensions)\n The input samples.\n\n Returns\n -------\n bic : float\n The lower the better.\n ", + "source_code": "\ndef bic(self, X):\n \"\"\"Bayesian information criterion for the current model on the input X.\n\n You can refer to this :ref:`mathematical section ` for more\n details regarding the formulation of the BIC used.\n\n Parameters\n ----------\n X : array of shape (n_samples, n_dimensions)\n The input samples.\n\n Returns\n -------\n bic : float\n The lower the better.\n \"\"\"\n return -2 * self.score(X) * X.shape[0] + self._n_parameters() * np.log(X.shape[0])" }, { "name": "_check_means", @@ -128052,7 +137794,8 @@ "docstring": { "type": "array-like of shape (n_components, n_features)", "description": "The centers of the current components." - } + }, + "refined_type": {} }, { "name": "n_components", @@ -128062,7 +137805,8 @@ "docstring": { "type": "int", "description": "Number of components." - } + }, + "refined_type": {} }, { "name": "n_features", @@ -128072,13 +137816,14 @@ "docstring": { "type": "int", "description": "Number of features." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Validate the provided 'means'.", - "docstring": "Validate the provided 'means'.\n\nParameters\n----------\nmeans : array-like of shape (n_components, n_features)\n The centers of the current components.\n\nn_components : int\n Number of components.\n\nn_features : int\n Number of features.\n\nReturns\n-------\nmeans : array, (n_components, n_features)", + "docstring": "Validate the provided 'means'.\n\n Parameters\n ----------\n means : array-like of shape (n_components, n_features)\n The centers of the current components.\n\n n_components : int\n Number of components.\n\n n_features : int\n Number of features.\n\n Returns\n -------\n means : array, (n_components, n_features)\n ", "source_code": "\ndef _check_means(means, n_components, n_features):\n \"\"\"Validate the provided 'means'.\n\n Parameters\n ----------\n means : array-like of shape (n_components, n_features)\n The centers of the current components.\n\n n_components : int\n Number of components.\n\n n_features : int\n Number of features.\n\n Returns\n -------\n means : array, (n_components, n_features)\n \"\"\"\n means = check_array(means, dtype=[np.float64, np.float32], ensure_2d=False)\n _check_shape(means, (n_components, n_features), 'means')\n return means" }, { @@ -128096,7 +137841,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "covariance_type", @@ -128106,7 +137852,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -128130,7 +137877,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "covariance_type", @@ -128140,7 +137888,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -128164,7 +137913,8 @@ "docstring": { "type": "array-like", "description": "'full' : shape of (n_components, n_features, n_features)\n'tied' : shape of (n_features, n_features)\n'diag' : shape of (n_components, n_features)\n'spherical' : shape of (n_components,)" - } + }, + "refined_type": {} }, { "name": "covariance_type", @@ -128174,7 +137924,8 @@ "docstring": { "type": "str", "description": "" - } + }, + "refined_type": {} }, { "name": "n_components", @@ -128184,7 +137935,8 @@ "docstring": { "type": "int", "description": "Number of components." - } + }, + "refined_type": {} }, { "name": "n_features", @@ -128194,13 +137946,14 @@ "docstring": { "type": "int", "description": "Number of features." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Validate user provided precisions.", - "docstring": "Validate user provided precisions.\n\nParameters\n----------\nprecisions : array-like\n 'full' : shape of (n_components, n_features, n_features)\n 'tied' : shape of (n_features, n_features)\n 'diag' : shape of (n_components, n_features)\n 'spherical' : shape of (n_components,)\n\ncovariance_type : str\n\nn_components : int\n Number of components.\n\nn_features : int\n Number of features.\n\nReturns\n-------\nprecisions : array", + "docstring": "Validate user provided precisions.\n\n Parameters\n ----------\n precisions : array-like\n 'full' : shape of (n_components, n_features, n_features)\n 'tied' : shape of (n_features, n_features)\n 'diag' : shape of (n_components, n_features)\n 'spherical' : shape of (n_components,)\n\n covariance_type : str\n\n n_components : int\n Number of components.\n\n n_features : int\n Number of features.\n\n Returns\n -------\n precisions : array\n ", "source_code": "\ndef _check_precisions(precisions, covariance_type, n_components, n_features):\n \"\"\"Validate user provided precisions.\n\n Parameters\n ----------\n precisions : array-like\n 'full' : shape of (n_components, n_features, n_features)\n 'tied' : shape of (n_features, n_features)\n 'diag' : shape of (n_components, n_features)\n 'spherical' : shape of (n_components,)\n\n covariance_type : str\n\n n_components : int\n Number of components.\n\n n_features : int\n Number of features.\n\n Returns\n -------\n precisions : array\n \"\"\"\n precisions = check_array(precisions, dtype=[np.float64, np.float32], ensure_2d=False, allow_nd=covariance_type == 'full')\n precisions_shape = {'full': (n_components, n_features, n_features), 'tied': (n_features, n_features), 'diag': (n_components, n_features), 'spherical': (n_components, )}\n _check_shape(precisions, precisions_shape[covariance_type], '%s precision' % covariance_type)\n _check_precisions = {'full': _check_precisions_full, 'tied': _check_precision_matrix, 'diag': _check_precision_positivity, 'spherical': _check_precision_positivity}\n _check_precisions[covariance_type](precisions, covariance_type)\n return precisions" }, { @@ -128218,7 +137971,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "covariance_type", @@ -128228,7 +137982,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -128252,7 +138007,8 @@ "docstring": { "type": "array-like of shape (n_components,)", "description": "The proportions of components of each mixture." - } + }, + "refined_type": {} }, { "name": "n_components", @@ -128262,13 +138018,14 @@ "docstring": { "type": "int", "description": "Number of components." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Check the user provided 'weights'.", - "docstring": "Check the user provided 'weights'.\n\nParameters\n----------\nweights : array-like of shape (n_components,)\n The proportions of components of each mixture.\n\nn_components : int\n Number of components.\n\nReturns\n-------\nweights : array, shape (n_components,)", + "docstring": "Check the user provided 'weights'.\n\n Parameters\n ----------\n weights : array-like of shape (n_components,)\n The proportions of components of each mixture.\n\n n_components : int\n Number of components.\n\n Returns\n -------\n weights : array, shape (n_components,)\n ", "source_code": "\ndef _check_weights(weights, n_components):\n \"\"\"Check the user provided 'weights'.\n\n Parameters\n ----------\n weights : array-like of shape (n_components,)\n The proportions of components of each mixture.\n\n n_components : int\n Number of components.\n\n Returns\n -------\n weights : array, shape (n_components,)\n \"\"\"\n weights = check_array(weights, dtype=[np.float64, np.float32], ensure_2d=False)\n _check_shape(weights, (n_components, ), 'weights')\n if any(np.less(weights, 0.0)) or any(np.greater(weights, 1.0)):\n raise ValueError(\"The parameter 'weights' should be in the range [0, 1], but got max value %.5f, min value %.5f\" % (np.min(weights), np.max(weights)))\n if not np.allclose(np.abs(1.0 - np.sum(weights)), 0.0):\n raise ValueError(\"The parameter 'weights' should be normalized, but got sum(weights) = %.5f\" % np.sum(weights))\n return weights" }, { @@ -128286,7 +138043,8 @@ "docstring": { "type": "array-like", "description": "Cholesky decompositions of the matrices.\n'full' : shape of (n_components, n_features, n_features)\n'tied' : shape of (n_features, n_features)\n'diag' : shape of (n_components, n_features)\n'spherical' : shape of (n_components,)" - } + }, + "refined_type": {} }, { "name": "covariance_type", @@ -128296,6 +138054,10 @@ "docstring": { "type": "{'full', 'tied', 'diag', 'spherical'}", "description": "" + }, + "refined_type": { + "kind": "EnumType", + "values": ["tied", "full", "diag", "spherical"] } }, { @@ -128306,13 +138068,14 @@ "docstring": { "type": "int", "description": "Number of features." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Compute the log-det of the cholesky decomposition of matrices.", - "docstring": "Compute the log-det of the cholesky decomposition of matrices.\n\nParameters\n----------\nmatrix_chol : array-like\n Cholesky decompositions of the matrices.\n 'full' : shape of (n_components, n_features, n_features)\n 'tied' : shape of (n_features, n_features)\n 'diag' : shape of (n_components, n_features)\n 'spherical' : shape of (n_components,)\n\ncovariance_type : {'full', 'tied', 'diag', 'spherical'}\n\nn_features : int\n Number of features.\n\nReturns\n-------\nlog_det_precision_chol : array-like of shape (n_components,)\n The determinant of the precision matrix for each component.", + "docstring": "Compute the log-det of the cholesky decomposition of matrices.\n\n Parameters\n ----------\n matrix_chol : array-like\n Cholesky decompositions of the matrices.\n 'full' : shape of (n_components, n_features, n_features)\n 'tied' : shape of (n_features, n_features)\n 'diag' : shape of (n_components, n_features)\n 'spherical' : shape of (n_components,)\n\n covariance_type : {'full', 'tied', 'diag', 'spherical'}\n\n n_features : int\n Number of features.\n\n Returns\n -------\n log_det_precision_chol : array-like of shape (n_components,)\n The determinant of the precision matrix for each component.\n ", "source_code": "\ndef _compute_log_det_cholesky(matrix_chol, covariance_type, n_features):\n \"\"\"Compute the log-det of the cholesky decomposition of matrices.\n\n Parameters\n ----------\n matrix_chol : array-like\n Cholesky decompositions of the matrices.\n 'full' : shape of (n_components, n_features, n_features)\n 'tied' : shape of (n_features, n_features)\n 'diag' : shape of (n_components, n_features)\n 'spherical' : shape of (n_components,)\n\n covariance_type : {'full', 'tied', 'diag', 'spherical'}\n\n n_features : int\n Number of features.\n\n Returns\n -------\n log_det_precision_chol : array-like of shape (n_components,)\n The determinant of the precision matrix for each component.\n \"\"\"\n if covariance_type == 'full':\n (n_components, _, _) = matrix_chol.shape\n log_det_chol = np.sum(np.log(matrix_chol.reshape(n_components, -1)[:, ::n_features + 1]), 1)\n elif covariance_type == 'tied':\n log_det_chol = np.sum(np.log(np.diag(matrix_chol)))\n elif covariance_type == 'diag':\n log_det_chol = np.sum(np.log(matrix_chol), axis=1)\n else:\n log_det_chol = n_features * np.log(matrix_chol)\n return log_det_chol" }, { @@ -128330,7 +138093,8 @@ "docstring": { "type": "array-like", "description": "The covariance matrix of the current components.\nThe shape depends of the covariance_type." - } + }, + "refined_type": {} }, { "name": "covariance_type", @@ -128340,13 +138104,17 @@ "docstring": { "type": "{'full', 'tied', 'diag', 'spherical'}", "description": "The type of precision matrices." 
+ }, + "refined_type": { + "kind": "EnumType", + "values": ["tied", "full", "diag", "spherical"] } } ], "results": [], "is_public": false, "description": "Compute the Cholesky decomposition of the precisions.", - "docstring": "Compute the Cholesky decomposition of the precisions.\n\nParameters\n----------\ncovariances : array-like\n The covariance matrix of the current components.\n The shape depends of the covariance_type.\n\ncovariance_type : {'full', 'tied', 'diag', 'spherical'}\n The type of precision matrices.\n\nReturns\n-------\nprecisions_cholesky : array-like\n The cholesky decomposition of sample precisions of the current\n components. The shape depends of the covariance_type.", + "docstring": "Compute the Cholesky decomposition of the precisions.\n\n Parameters\n ----------\n covariances : array-like\n The covariance matrix of the current components.\n The shape depends of the covariance_type.\n\n covariance_type : {'full', 'tied', 'diag', 'spherical'}\n The type of precision matrices.\n\n Returns\n -------\n precisions_cholesky : array-like\n The cholesky decomposition of sample precisions of the current\n components. The shape depends of the covariance_type.\n ", "source_code": "\ndef _compute_precision_cholesky(covariances, covariance_type):\n \"\"\"Compute the Cholesky decomposition of the precisions.\n\n Parameters\n ----------\n covariances : array-like\n The covariance matrix of the current components.\n The shape depends of the covariance_type.\n\n covariance_type : {'full', 'tied', 'diag', 'spherical'}\n The type of precision matrices.\n\n Returns\n -------\n precisions_cholesky : array-like\n The cholesky decomposition of sample precisions of the current\n components. The shape depends of the covariance_type.\n \"\"\"\n estimate_precision_error_message = 'Fitting the mixture model failed because some components have ill-defined empirical covariance (for instance caused by singleton or collapsed samples). 
Try to decrease the number of components, or increase reg_covar.'\n if covariance_type == 'full':\n (n_components, n_features, _) = covariances.shape\n precisions_chol = np.empty((n_components, n_features, n_features))\n for (k, covariance) in enumerate(covariances):\n try:\n cov_chol = linalg.cholesky(covariance, lower=True)\n except linalg.LinAlgError:\n raise ValueError(estimate_precision_error_message)\n precisions_chol[k] = linalg.solve_triangular(cov_chol, np.eye(n_features), lower=True).T\n elif covariance_type == 'tied':\n (_, n_features) = covariances.shape\n try:\n cov_chol = linalg.cholesky(covariances, lower=True)\n except linalg.LinAlgError:\n raise ValueError(estimate_precision_error_message)\n precisions_chol = linalg.solve_triangular(cov_chol, np.eye(n_features), lower=True).T\n else:\n if np.any(np.less_equal(covariances, 0.0)):\n raise ValueError(estimate_precision_error_message)\n precisions_chol = 1.0 / np.sqrt(covariances)\n return precisions_chol" }, { @@ -128364,7 +138132,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -128374,7 +138143,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "" - } + }, + "refined_type": {} }, { "name": "nk", @@ -128384,7 +138154,8 @@ "docstring": { "type": "array-like of shape (n_components,)", "description": "" - } + }, + "refined_type": {} }, { "name": "means", @@ -128394,7 +138165,8 @@ "docstring": { "type": "array-like of shape (n_components, n_features)", "description": "" - } + }, + "refined_type": {} }, { "name": "reg_covar", @@ -128404,13 +138176,14 @@ "docstring": { "type": "float", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Estimate the diagonal covariance vectors.", - "docstring": "Estimate the diagonal covariance vectors.\n\nParameters\n----------\nresponsibilities : array-like of shape (n_samples, n_components)\n\nX : array-like of shape (n_samples, n_features)\n\nnk : array-like of shape (n_components,)\n\nmeans : array-like of shape (n_components, n_features)\n\nreg_covar : float\n\nReturns\n-------\ncovariances : array, shape (n_components, n_features)\n The covariance vector of the current components.", + "docstring": "Estimate the diagonal covariance vectors.\n\n Parameters\n ----------\n responsibilities : array-like of shape (n_samples, n_components)\n\n X : array-like of shape (n_samples, n_features)\n\n nk : array-like of shape (n_components,)\n\n means : array-like of shape (n_components, n_features)\n\n reg_covar : float\n\n Returns\n -------\n covariances : array, shape (n_components, n_features)\n The covariance vector of the current components.\n ", "source_code": "\ndef _estimate_gaussian_covariances_diag(resp, X, nk, means, reg_covar):\n \"\"\"Estimate the diagonal covariance vectors.\n\n Parameters\n ----------\n responsibilities : array-like of shape (n_samples, n_components)\n\n X : array-like of shape (n_samples, n_features)\n\n nk : array-like of shape (n_components,)\n\n means : array-like of shape (n_components, n_features)\n\n reg_covar : float\n\n Returns\n -------\n covariances : array, shape (n_components, n_features)\n The covariance vector of the current components.\n \"\"\"\n avg_X2 = np.dot(resp.T, X * X) / nk[:, np.newaxis]\n avg_means2 = means**2\n avg_X_means = means * np.dot(resp.T, X) / nk[:, np.newaxis]\n return avg_X2 - 2 * avg_X_means + avg_means2 + reg_covar" }, { @@ -128428,7 +138201,8 @@ "docstring": { "type": "array-like of shape 
(n_samples, n_components)", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -128438,7 +138212,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "" - } + }, + "refined_type": {} }, { "name": "nk", @@ -128448,7 +138223,8 @@ "docstring": { "type": "array-like of shape (n_components,)", "description": "" - } + }, + "refined_type": {} }, { "name": "means", @@ -128458,7 +138234,8 @@ "docstring": { "type": "array-like of shape (n_components, n_features)", "description": "" - } + }, + "refined_type": {} }, { "name": "reg_covar", @@ -128468,13 +138245,14 @@ "docstring": { "type": "float", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Estimate the full covariance matrices.", - "docstring": "Estimate the full covariance matrices.\n\nParameters\n----------\nresp : array-like of shape (n_samples, n_components)\n\nX : array-like of shape (n_samples, n_features)\n\nnk : array-like of shape (n_components,)\n\nmeans : array-like of shape (n_components, n_features)\n\nreg_covar : float\n\nReturns\n-------\ncovariances : array, shape (n_components, n_features, n_features)\n The covariance matrix of the current components.", + "docstring": "Estimate the full covariance matrices.\n\n Parameters\n ----------\n resp : array-like of shape (n_samples, n_components)\n\n X : array-like of shape (n_samples, n_features)\n\n nk : array-like of shape (n_components,)\n\n means : array-like of shape (n_components, n_features)\n\n reg_covar : float\n\n Returns\n -------\n covariances : array, shape (n_components, n_features, n_features)\n The covariance matrix of the current components.\n ", "source_code": "\ndef _estimate_gaussian_covariances_full(resp, X, nk, means, reg_covar):\n \"\"\"Estimate the full covariance matrices.\n\n Parameters\n ----------\n resp : array-like of shape (n_samples, n_components)\n\n X : array-like of shape (n_samples, n_features)\n\n nk : array-like of shape (n_components,)\n\n means : array-like of shape (n_components, n_features)\n\n reg_covar : float\n\n Returns\n -------\n covariances : array, shape (n_components, n_features, n_features)\n The covariance matrix of the current components.\n \"\"\"\n (n_components, n_features) = means.shape\n covariances = np.empty((n_components, n_features, n_features))\n for k in range(n_components):\n diff = X - means[k]\n covariances[k] = np.dot(resp[:, k] * diff.T, diff) / nk[k]\n covariances[k].flat[::n_features + 1] += reg_covar\n return covariances" }, { @@ -128492,7 +138270,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -128502,7 +138281,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "" - } + }, + "refined_type": {} }, { "name": "nk", @@ -128512,7 +138292,8 @@ "docstring": { "type": "array-like of shape (n_components,)", "description": "" - } + }, + "refined_type": {} }, { "name": "means", @@ -128522,7 +138303,8 @@ "docstring": { "type": "array-like of shape (n_components, n_features)", "description": "" - } + }, + "refined_type": {} }, { "name": "reg_covar", @@ -128532,13 +138314,14 @@ "docstring": { "type": "float", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Estimate the spherical variance values.", - "docstring": "Estimate the spherical variance values.\n\nParameters\n----------\nresponsibilities : array-like of shape (n_samples, n_components)\n\nX : array-like of shape 
(n_samples, n_features)\n\nnk : array-like of shape (n_components,)\n\nmeans : array-like of shape (n_components, n_features)\n\nreg_covar : float\n\nReturns\n-------\nvariances : array, shape (n_components,)\n The variance values of each components.", + "docstring": "Estimate the spherical variance values.\n\n Parameters\n ----------\n responsibilities : array-like of shape (n_samples, n_components)\n\n X : array-like of shape (n_samples, n_features)\n\n nk : array-like of shape (n_components,)\n\n means : array-like of shape (n_components, n_features)\n\n reg_covar : float\n\n Returns\n -------\n variances : array, shape (n_components,)\n The variance values of each components.\n ", "source_code": "\ndef _estimate_gaussian_covariances_spherical(resp, X, nk, means, reg_covar):\n \"\"\"Estimate the spherical variance values.\n\n Parameters\n ----------\n responsibilities : array-like of shape (n_samples, n_components)\n\n X : array-like of shape (n_samples, n_features)\n\n nk : array-like of shape (n_components,)\n\n means : array-like of shape (n_components, n_features)\n\n reg_covar : float\n\n Returns\n -------\n variances : array, shape (n_components,)\n The variance values of each components.\n \"\"\"\n return _estimate_gaussian_covariances_diag(resp, X, nk, means, reg_covar).mean(1)" }, { @@ -128556,7 +138339,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_components)", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -128566,7 +138350,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "" - } + }, + "refined_type": {} }, { "name": "nk", @@ -128576,7 +138361,8 @@ "docstring": { "type": "array-like of shape (n_components,)", "description": "" - } + }, + "refined_type": {} }, { "name": "means", @@ -128586,7 +138372,8 @@ "docstring": { "type": "array-like of shape (n_components, n_features)", "description": "" - } + }, + "refined_type": {} }, { "name": "reg_covar", @@ -128596,13 +138383,14 @@ "docstring": { "type": "float", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Estimate the tied covariance matrix.", - "docstring": "Estimate the tied covariance matrix.\n\nParameters\n----------\nresp : array-like of shape (n_samples, n_components)\n\nX : array-like of shape (n_samples, n_features)\n\nnk : array-like of shape (n_components,)\n\nmeans : array-like of shape (n_components, n_features)\n\nreg_covar : float\n\nReturns\n-------\ncovariance : array, shape (n_features, n_features)\n The tied covariance matrix of the components.", + "docstring": "Estimate the tied covariance matrix.\n\n Parameters\n ----------\n resp : array-like of shape (n_samples, n_components)\n\n X : array-like of shape (n_samples, n_features)\n\n nk : array-like of shape (n_components,)\n\n means : array-like of shape (n_components, n_features)\n\n reg_covar : float\n\n Returns\n -------\n covariance : array, shape (n_features, n_features)\n The tied covariance matrix of the components.\n ", "source_code": "\ndef _estimate_gaussian_covariances_tied(resp, X, nk, means, reg_covar):\n \"\"\"Estimate the tied covariance matrix.\n\n Parameters\n ----------\n resp : array-like of shape (n_samples, n_components)\n\n X : array-like of shape (n_samples, n_features)\n\n nk : array-like of shape (n_components,)\n\n means : array-like of shape (n_components, n_features)\n\n reg_covar : float\n\n Returns\n -------\n covariance : array, shape (n_features, n_features)\n The tied covariance matrix of the 
components.\n \"\"\"\n avg_X2 = np.dot(X.T, X)\n avg_means2 = np.dot(nk * means.T, means)\n covariance = avg_X2 - avg_means2\n covariance /= nk.sum()\n covariance.flat[::len(covariance) + 1] += reg_covar\n return covariance" }, { @@ -128620,7 +138408,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "The input data array." - } + }, + "refined_type": {} }, { "name": "resp", @@ -128630,7 +138419,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_components)", "description": "The responsibilities for each data sample in X." - } + }, + "refined_type": {} }, { "name": "reg_covar", @@ -128640,7 +138430,8 @@ "docstring": { "type": "float", "description": "The regularization added to the diagonal of the covariance matrices." - } + }, + "refined_type": {} }, { "name": "covariance_type", @@ -128650,13 +138441,17 @@ "docstring": { "type": "{'full', 'tied', 'diag', 'spherical'}", "description": "The type of precision matrices." + }, + "refined_type": { + "kind": "EnumType", + "values": ["tied", "full", "diag", "spherical"] } } ], "results": [], "is_public": false, "description": "Estimate the Gaussian distribution parameters.", - "docstring": "Estimate the Gaussian distribution parameters.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n The input data array.\n\nresp : array-like of shape (n_samples, n_components)\n The responsibilities for each data sample in X.\n\nreg_covar : float\n The regularization added to the diagonal of the covariance matrices.\n\ncovariance_type : {'full', 'tied', 'diag', 'spherical'}\n The type of precision matrices.\n\nReturns\n-------\nnk : array-like of shape (n_components,)\n The numbers of data samples in the current components.\n\nmeans : array-like of shape (n_components, n_features)\n The centers of the current components.\n\ncovariances : array-like\n The covariance matrix of the current components.\n The shape depends of the covariance_type.", + "docstring": "Estimate the Gaussian distribution parameters.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The input data array.\n\n resp : array-like of shape (n_samples, n_components)\n The responsibilities for each data sample in X.\n\n reg_covar : float\n The regularization added to the diagonal of the covariance matrices.\n\n covariance_type : {'full', 'tied', 'diag', 'spherical'}\n The type of precision matrices.\n\n Returns\n -------\n nk : array-like of shape (n_components,)\n The numbers of data samples in the current components.\n\n means : array-like of shape (n_components, n_features)\n The centers of the current components.\n\n covariances : array-like\n The covariance matrix of the current components.\n The shape depends of the covariance_type.\n ", "source_code": "\ndef _estimate_gaussian_parameters(X, resp, reg_covar, covariance_type):\n \"\"\"Estimate the Gaussian distribution parameters.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The input data array.\n\n resp : array-like of shape (n_samples, n_components)\n The responsibilities for each data sample in X.\n\n reg_covar : float\n The regularization added to the diagonal of the covariance matrices.\n\n covariance_type : {'full', 'tied', 'diag', 'spherical'}\n The type of precision matrices.\n\n Returns\n -------\n nk : array-like of shape (n_components,)\n The numbers of data samples in the current components.\n\n means : array-like of shape (n_components, n_features)\n The centers of the current 
components.\n\n covariances : array-like\n The covariance matrix of the current components.\n The shape depends of the covariance_type.\n \"\"\"\n nk = resp.sum(axis=0) + 10 * np.finfo(resp.dtype).eps\n means = np.dot(resp.T, X) / nk[:, np.newaxis]\n covariances = {'full': _estimate_gaussian_covariances_full, 'tied': _estimate_gaussian_covariances_tied, 'diag': _estimate_gaussian_covariances_diag, 'spherical': _estimate_gaussian_covariances_spherical}[covariance_type](resp, X, nk, means, reg_covar)\n return nk, means, covariances" }, { @@ -128674,7 +138469,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "" - } + }, + "refined_type": {} }, { "name": "means", @@ -128684,7 +138480,8 @@ "docstring": { "type": "array-like of shape (n_components, n_features)", "description": "" - } + }, + "refined_type": {} }, { "name": "precisions_chol", @@ -128694,7 +138491,8 @@ "docstring": { "type": "array-like", "description": "Cholesky decompositions of the precision matrices.\n'full' : shape of (n_components, n_features, n_features)\n'tied' : shape of (n_features, n_features)\n'diag' : shape of (n_components, n_features)\n'spherical' : shape of (n_components,)" - } + }, + "refined_type": {} }, { "name": "covariance_type", @@ -128704,13 +138502,17 @@ "docstring": { "type": "{'full', 'tied', 'diag', 'spherical'}", "description": "" + }, + "refined_type": { + "kind": "EnumType", + "values": ["tied", "full", "diag", "spherical"] } } ], "results": [], "is_public": false, "description": "Estimate the log Gaussian probability.", - "docstring": "Estimate the log Gaussian probability.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n\nmeans : array-like of shape (n_components, n_features)\n\nprecisions_chol : array-like\n Cholesky decompositions of the precision matrices.\n 'full' : shape of (n_components, n_features, n_features)\n 'tied' : shape of (n_features, n_features)\n 'diag' : shape of (n_components, n_features)\n 'spherical' : shape of (n_components,)\n\ncovariance_type : {'full', 'tied', 'diag', 'spherical'}\n\nReturns\n-------\nlog_prob : array, shape (n_samples, n_components)", + "docstring": "Estimate the log Gaussian probability.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n\n means : array-like of shape (n_components, n_features)\n\n precisions_chol : array-like\n Cholesky decompositions of the precision matrices.\n 'full' : shape of (n_components, n_features, n_features)\n 'tied' : shape of (n_features, n_features)\n 'diag' : shape of (n_components, n_features)\n 'spherical' : shape of (n_components,)\n\n covariance_type : {'full', 'tied', 'diag', 'spherical'}\n\n Returns\n -------\n log_prob : array, shape (n_samples, n_components)\n ", "source_code": "\ndef _estimate_log_gaussian_prob(X, means, precisions_chol, covariance_type):\n \"\"\"Estimate the log Gaussian probability.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n\n means : array-like of shape (n_components, n_features)\n\n precisions_chol : array-like\n Cholesky decompositions of the precision matrices.\n 'full' : shape of (n_components, n_features, n_features)\n 'tied' : shape of (n_features, n_features)\n 'diag' : shape of (n_components, n_features)\n 'spherical' : shape of (n_components,)\n\n covariance_type : {'full', 'tied', 'diag', 'spherical'}\n\n Returns\n -------\n log_prob : array, shape (n_samples, n_components)\n \"\"\"\n (n_samples, n_features) = X.shape\n (n_components, _) = means.shape\n 
log_det = _compute_log_det_cholesky(precisions_chol, covariance_type, n_features)\n if covariance_type == 'full':\n log_prob = np.empty((n_samples, n_components))\n for (k, (mu, prec_chol)) in enumerate(zip(means, precisions_chol)):\n y = np.dot(X, prec_chol) - np.dot(mu, prec_chol)\n log_prob[:, k] = np.sum(np.square(y), axis=1)\n elif covariance_type == 'tied':\n log_prob = np.empty((n_samples, n_components))\n for (k, mu) in enumerate(means):\n y = np.dot(X, precisions_chol) - np.dot(mu, precisions_chol)\n log_prob[:, k] = np.sum(np.square(y), axis=1)\n elif covariance_type == 'diag':\n precisions = precisions_chol**2\n log_prob = np.sum(means**2 * precisions, 1) - 2.0 * np.dot(X, (means * precisions).T) + np.dot(X**2, precisions.T)\n elif covariance_type == 'spherical':\n precisions = precisions_chol**2\n log_prob = np.sum(means**2, 1) * precisions - 2 * np.dot(X, means.T * precisions) + np.outer(row_norms(X, squared=True), precisions)\n return -0.5 * (n_features * np.log(2 * np.pi) + log_prob) + log_det" }, { @@ -128728,7 +138530,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "estimator", @@ -128738,7 +138541,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "scoring", @@ -128748,7 +138552,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_jobs", @@ -128758,7 +138563,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "refit", @@ -128768,7 +138574,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "cv", @@ -128778,7 +138585,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "verbose", @@ -128788,7 +138596,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "pre_dispatch", @@ -128798,7 +138607,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "error_score", @@ -128808,7 +138618,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "return_train_score", @@ -128818,13 +138629,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@abstractmethod\ndef __init__(self, estimator, *, scoring=None, n_jobs=None, refit=True, cv=None, verbose=0, pre_dispatch='2*n_jobs', error_score=np.nan, return_train_score=True):\n self.scoring = scoring\n self.estimator = estimator\n self.n_jobs = n_jobs\n self.refit = refit\n self.cv = cv\n self.verbose = verbose\n self.pre_dispatch = pre_dispatch\n self.error_score = error_score\n self.return_train_score = return_train_score" }, { @@ -128842,7 +138654,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "scores", @@ -128852,7 +138665,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -128876,13 +138690,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@property\ndef _estimator_type(self):\n return self.estimator._estimator_type" }, { @@ -128900,7 +138715,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "candidate_params", @@ -128910,7 +138726,8 @@ "docstring": { 
"type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_splits", @@ -128920,7 +138737,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "out", @@ -128930,7 +138748,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "more_results", @@ -128940,13 +138759,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _format_results(self, candidate_params, n_splits, out, more_results=None):\n n_candidates = len(candidate_params)\n out = _aggregate_score_dicts(out)\n results = dict(more_results or {})\n for (key, val) in results.items():\n results[key] = np.asarray(val)\n \n def _store(key_name, array, weights=None, splits=False, rank=False):\n \"\"\"A small helper to store the scores/times to the cv_results_\"\"\"\n array = np.array(array, dtype=np.float64).reshape(n_candidates, n_splits)\n if splits:\n for split_idx in range(n_splits):\n results['split%d_%s' % (split_idx, key_name)] = array[:, split_idx]\n array_means = np.average(array, axis=1, weights=weights)\n results['mean_%s' % key_name] = array_means\n if key_name.startswith(('train_', 'test_')) and np.any(~np.isfinite(array_means)):\n warnings.warn(f\"One or more of the {key_name.split('_')[0]} scores are non-finite: {array_means}\", category=UserWarning)\n array_stds = np.sqrt(np.average((array - array_means[:, np.newaxis])**2, axis=1, weights=weights))\n results['std_%s' % key_name] = array_stds\n if rank:\n results['rank_%s' % key_name] = np.asarray(rankdata(-array_means, method='min'), dtype=np.int32)\n _store('fit_time', out['fit_time'])\n _store('score_time', out['score_time'])\n param_results = defaultdict(partial(MaskedArray, np.empty(n_candidates), mask=True, dtype=object))\n for (cand_idx, params) in enumerate(candidate_params):\n for (name, value) in params.items():\n param_results['param_%s' % name][cand_idx] = value\n results.update(param_results)\n results['params'] = candidate_params\n test_scores_dict = _normalize_score_results(out['test_scores'])\n if self.return_train_score:\n train_scores_dict = _normalize_score_results(out['train_scores'])\n for scorer_name in test_scores_dict:\n _store('test_%s' % scorer_name, test_scores_dict[scorer_name], splits=True, rank=True, weights=None)\n if self.return_train_score:\n _store('train_%s' % scorer_name, train_scores_dict[scorer_name], splits=True)\n return results" }, { @@ -128964,13 +138784,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _more_tags(self):\n return {'pairwise': _safe_tags(self.estimator, 'pairwise'), '_xfail_checks': {'check_supervised_y_2d': 'DataConversionWarning not caught'}}" }, { @@ -128991,13 +138812,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@deprecated('Attribute `_pairwise` was deprecated in version 0.24 and will be removed in 1.1 (renaming of 0.26).')\n@property\ndef _pairwise(self):\n return getattr(self.estimator, '_pairwise', False)" }, { @@ -129015,7 +138837,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "evaluate_candidates", @@ -129025,13 +138848,14 @@ 
"docstring": { "type": "callable", "description": "This callback accepts:\n - a list of candidates, where each candidate is a dict of\n parameter settings.\n - an optional `cv` parameter which can be used to e.g.\n evaluate candidates on different dataset splits, or\n evaluate candidates on subsampled data (as done in the\n SucessiveHaling estimators). By default, the original `cv`\n parameter is used, and it is available as a private\n `_checked_cv_orig` attribute.\n - an optional `more_results` dict. Each key will be added to\n the `cv_results_` attribute. Values should be lists of\n length `n_candidates`\n\nIt returns a dict of all results so far, formatted like\n``cv_results_``.\n\nImportant note (relevant whether the default cv is used or not):\nin randomized splitters, and unless the random_state parameter of\ncv was set to an int, calling cv.split() multiple times will\nyield different splits. Since cv.split() is called in\nevaluate_candidates, this means that candidates will be evaluated\non different splits each time evaluate_candidates is called. This\nmight be a methodological issue depending on the search strategy\nthat you're implementing. To prevent randomized splitters from\nbeing used, you may use _split._yields_constant_splits()" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Repeatedly calls `evaluate_candidates` to conduct a search.\n\nThis method, implemented in sub-classes, makes it possible to customize the the scheduling of evaluations: GridSearchCV and RandomizedSearchCV schedule evaluations for their whole parameter search space at once but other more sequential approaches are also possible: for instance is possible to iteratively schedule evaluations for new regions of the parameter search space based on previously collected evaluation results. This makes it possible to implement Bayesian optimization or more generally sequential model-based optimization by deriving from the BaseSearchCV abstract base class. For example, Successive Halving is implemented by calling `evaluate_candidates` multiples times (once per iteration of the SH process), each time passing a different set of candidates with `X` and `y` of increasing sizes.", - "docstring": "Repeatedly calls `evaluate_candidates` to conduct a search.\n\nThis method, implemented in sub-classes, makes it possible to\ncustomize the the scheduling of evaluations: GridSearchCV and\nRandomizedSearchCV schedule evaluations for their whole parameter\nsearch space at once but other more sequential approaches are also\npossible: for instance is possible to iteratively schedule evaluations\nfor new regions of the parameter search space based on previously\ncollected evaluation results. This makes it possible to implement\nBayesian optimization or more generally sequential model-based\noptimization by deriving from the BaseSearchCV abstract base class.\nFor example, Successive Halving is implemented by calling\n`evaluate_candidates` multiples times (once per iteration of the SH\nprocess), each time passing a different set of candidates with `X`\nand `y` of increasing sizes.\n\nParameters\n----------\nevaluate_candidates : callable\n This callback accepts:\n - a list of candidates, where each candidate is a dict of\n parameter settings.\n - an optional `cv` parameter which can be used to e.g.\n evaluate candidates on different dataset splits, or\n evaluate candidates on subsampled data (as done in the\n SucessiveHaling estimators). 
By default, the original `cv`\n parameter is used, and it is available as a private\n `_checked_cv_orig` attribute.\n - an optional `more_results` dict. Each key will be added to\n the `cv_results_` attribute. Values should be lists of\n length `n_candidates`\n\n It returns a dict of all results so far, formatted like\n ``cv_results_``.\n\n Important note (relevant whether the default cv is used or not):\n in randomized splitters, and unless the random_state parameter of\n cv was set to an int, calling cv.split() multiple times will\n yield different splits. Since cv.split() is called in\n evaluate_candidates, this means that candidates will be evaluated\n on different splits each time evaluate_candidates is called. This\n might be a methodological issue depending on the search strategy\n that you're implementing. To prevent randomized splitters from\n being used, you may use _split._yields_constant_splits()\n\nExamples\n--------\n\n::\n\n def _run_search(self, evaluate_candidates):\n 'Try C=0.1 only if C=1 is better than C=10'\n all_results = evaluate_candidates([{'C': 1}, {'C': 10}])\n score = all_results['mean_test_score']\n if score[0] < score[1]:\n evaluate_candidates([{'C': 0.1}])", + "description": "Repeatedly calls `evaluate_candidates` to conduct a search.\n\nThis method, implemented in sub-classes, makes it possible to\ncustomize the the scheduling of evaluations: GridSearchCV and\nRandomizedSearchCV schedule evaluations for their whole parameter\nsearch space at once but other more sequential approaches are also\npossible: for instance is possible to iteratively schedule evaluations\nfor new regions of the parameter search space based on previously\ncollected evaluation results. This makes it possible to implement\nBayesian optimization or more generally sequential model-based\noptimization by deriving from the BaseSearchCV abstract base class.\nFor example, Successive Halving is implemented by calling\n`evaluate_candidates` multiples times (once per iteration of the SH\nprocess), each time passing a different set of candidates with `X`\nand `y` of increasing sizes.", + "docstring": "Repeatedly calls `evaluate_candidates` to conduct a search.\n\n This method, implemented in sub-classes, makes it possible to\n customize the the scheduling of evaluations: GridSearchCV and\n RandomizedSearchCV schedule evaluations for their whole parameter\n search space at once but other more sequential approaches are also\n possible: for instance is possible to iteratively schedule evaluations\n for new regions of the parameter search space based on previously\n collected evaluation results. This makes it possible to implement\n Bayesian optimization or more generally sequential model-based\n optimization by deriving from the BaseSearchCV abstract base class.\n For example, Successive Halving is implemented by calling\n `evaluate_candidates` multiples times (once per iteration of the SH\n process), each time passing a different set of candidates with `X`\n and `y` of increasing sizes.\n\n Parameters\n ----------\n evaluate_candidates : callable\n This callback accepts:\n - a list of candidates, where each candidate is a dict of\n parameter settings.\n - an optional `cv` parameter which can be used to e.g.\n evaluate candidates on different dataset splits, or\n evaluate candidates on subsampled data (as done in the\n SucessiveHaling estimators). By default, the original `cv`\n parameter is used, and it is available as a private\n `_checked_cv_orig` attribute.\n - an optional `more_results` dict. 
Each key will be added to\n the `cv_results_` attribute. Values should be lists of\n length `n_candidates`\n\n It returns a dict of all results so far, formatted like\n ``cv_results_``.\n\n Important note (relevant whether the default cv is used or not):\n in randomized splitters, and unless the random_state parameter of\n cv was set to an int, calling cv.split() multiple times will\n yield different splits. Since cv.split() is called in\n evaluate_candidates, this means that candidates will be evaluated\n on different splits each time evaluate_candidates is called. This\n might be a methodological issue depending on the search strategy\n that you're implementing. To prevent randomized splitters from\n being used, you may use _split._yields_constant_splits()\n\n Examples\n --------\n\n ::\n\n def _run_search(self, evaluate_candidates):\n 'Try C=0.1 only if C=1 is better than C=10'\n all_results = evaluate_candidates([{'C': 1}, {'C': 10}])\n score = all_results['mean_test_score']\n if score[0] < score[1]:\n evaluate_candidates([{'C': 0.1}])\n ", "source_code": "\ndef _run_search(self, evaluate_candidates):\n \"\"\"Repeatedly calls `evaluate_candidates` to conduct a search.\n\n This method, implemented in sub-classes, makes it possible to\n customize the the scheduling of evaluations: GridSearchCV and\n RandomizedSearchCV schedule evaluations for their whole parameter\n search space at once but other more sequential approaches are also\n possible: for instance is possible to iteratively schedule evaluations\n for new regions of the parameter search space based on previously\n collected evaluation results. This makes it possible to implement\n Bayesian optimization or more generally sequential model-based\n optimization by deriving from the BaseSearchCV abstract base class.\n For example, Successive Halving is implemented by calling\n `evaluate_candidates` multiples times (once per iteration of the SH\n process), each time passing a different set of candidates with `X`\n and `y` of increasing sizes.\n\n Parameters\n ----------\n evaluate_candidates : callable\n This callback accepts:\n - a list of candidates, where each candidate is a dict of\n parameter settings.\n - an optional `cv` parameter which can be used to e.g.\n evaluate candidates on different dataset splits, or\n evaluate candidates on subsampled data (as done in the\n SucessiveHaling estimators). By default, the original `cv`\n parameter is used, and it is available as a private\n `_checked_cv_orig` attribute.\n - an optional `more_results` dict. Each key will be added to\n the `cv_results_` attribute. Values should be lists of\n length `n_candidates`\n\n It returns a dict of all results so far, formatted like\n ``cv_results_``.\n\n Important note (relevant whether the default cv is used or not):\n in randomized splitters, and unless the random_state parameter of\n cv was set to an int, calling cv.split() multiple times will\n yield different splits. Since cv.split() is called in\n evaluate_candidates, this means that candidates will be evaluated\n on different splits each time evaluate_candidates is called. This\n might be a methodological issue depending on the search strategy\n that you're implementing. 
To prevent randomized splitters from\n being used, you may use _split._yields_constant_splits()\n\n Examples\n --------\n\n ::\n\n def _run_search(self, evaluate_candidates):\n 'Try C=0.1 only if C=1 is better than C=10'\n all_results = evaluate_candidates([{'C': 1}, {'C': 10}])\n score = all_results['mean_test_score']\n if score[0] < score[1]:\n evaluate_candidates([{'C': 0.1}])\n \"\"\"\n raise NotImplementedError('_run_search not implemented.')" }, { @@ -129049,7 +138873,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "refit_metric", @@ -129059,7 +138884,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "results", @@ -129069,7 +138895,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -129093,13 +138920,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Class labels.\n\nOnly available when `refit=True` and the estimator is a classifier.", - "docstring": "Class labels.\n\nOnly available when `refit=True` and the estimator is a classifier.", + "docstring": "Class labels.\n\n Only available when `refit=True` and the estimator is a classifier.\n ", "source_code": "\n@property\ndef classes_(self):\n \"\"\"Class labels.\n\n Only available when `refit=True` and the estimator is a classifier.\n \"\"\"\n _estimator_has('classes_')(self)\n return self.best_estimator_.classes_" }, { @@ -129117,7 +138945,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -129127,13 +138956,14 @@ "docstring": { "type": "indexable, length n_samples", "description": "Must fulfill the input assumptions of the\nunderlying estimator." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Call decision_function on the estimator with the best found parameters.\n\nOnly available if ``refit=True`` and the underlying estimator supports ``decision_function``.", - "docstring": "Call decision_function on the estimator with the best found parameters.\n\nOnly available if ``refit=True`` and the underlying estimator supports\n``decision_function``.\n\nParameters\n----------\nX : indexable, length n_samples\n Must fulfill the input assumptions of the\n underlying estimator.\n\nReturns\n-------\ny_score : ndarray of shape (n_samples,) or (n_samples, n_classes) or (n_samples, n_classes * (n_classes-1) / 2)\n Result of the decision function for `X` based on the estimator with\n the best found parameters.", + "description": "Call decision_function on the estimator with the best found parameters.\n\nOnly available if ``refit=True`` and the underlying estimator supports\n``decision_function``.", + "docstring": "Call decision_function on the estimator with the best found parameters.\n\n Only available if ``refit=True`` and the underlying estimator supports\n ``decision_function``.\n\n Parameters\n ----------\n X : indexable, length n_samples\n Must fulfill the input assumptions of the\n underlying estimator.\n\n Returns\n -------\n y_score : ndarray of shape (n_samples,) or (n_samples, n_classes) or (n_samples, n_classes * (n_classes-1) / 2)\n Result of the decision function for `X` based on the estimator with\n the best found parameters.\n ", "source_code": "\n@available_if(_estimator_has('decision_function'))\ndef decision_function(self, X):\n \"\"\"Call decision_function on the estimator with the best found parameters.\n\n Only available if ``refit=True`` and the underlying estimator supports\n ``decision_function``.\n\n Parameters\n ----------\n X : indexable, length n_samples\n Must fulfill the input assumptions of the\n underlying estimator.\n\n Returns\n -------\n y_score : ndarray of shape (n_samples,) or (n_samples, n_classes) or (n_samples, n_classes * (n_classes-1) / 2)\n Result of the decision function for `X` based on the estimator with\n the best found parameters.\n \"\"\"\n check_is_fitted(self)\n return self.best_estimator_.decision_function(X)" }, { @@ -129151,7 +138981,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -129161,7 +138992,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "Training vector, where `n_samples` is the number of samples and\n`n_features` is the number of features." - } + }, + "refined_type": {} }, { "name": "y", @@ -129171,7 +139003,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_output) or (n_samples,), default=None", "description": "Target relative to X for classification or regression;\nNone for unsupervised learning." - } + }, + "refined_type": {} }, { "name": "groups", @@ -129181,13 +139014,14 @@ "docstring": { "type": "array-like of shape (n_samples,), default=None", "description": "Group labels for the samples used while splitting the dataset into\ntrain/test set. Only used in conjunction with a \"Group\" :term:`cv`\ninstance (e.g., :class:`~sklearn.model_selection.GroupKFold`)." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Run fit with all sets of parameters.", - "docstring": "Run fit with all sets of parameters.\n\nParameters\n----------\n\nX : array-like of shape (n_samples, n_features)\n Training vector, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\ny : array-like of shape (n_samples, n_output) or (n_samples,), default=None\n Target relative to X for classification or regression;\n None for unsupervised learning.\n\ngroups : array-like of shape (n_samples,), default=None\n Group labels for the samples used while splitting the dataset into\n train/test set. Only used in conjunction with a \"Group\" :term:`cv`\n instance (e.g., :class:`~sklearn.model_selection.GroupKFold`).\n\n**fit_params : dict of str -> object\n Parameters passed to the ``fit`` method of the estimator.\n\nReturns\n-------\nself : object\n Instance of fitted estimator.", + "docstring": "Run fit with all sets of parameters.\n\n Parameters\n ----------\n\n X : array-like of shape (n_samples, n_features)\n Training vector, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\n y : array-like of shape (n_samples, n_output) or (n_samples,), default=None\n Target relative to X for classification or regression;\n None for unsupervised learning.\n\n groups : array-like of shape (n_samples,), default=None\n Group labels for the samples used while splitting the dataset into\n train/test set. Only used in conjunction with a \"Group\" :term:`cv`\n instance (e.g., :class:`~sklearn.model_selection.GroupKFold`).\n\n **fit_params : dict of str -> object\n Parameters passed to the ``fit`` method of the estimator.\n\n Returns\n -------\n self : object\n Instance of fitted estimator.\n ", "source_code": "\ndef fit(self, X, y=None, *, groups=None, **fit_params):\n \"\"\"Run fit with all sets of parameters.\n\n Parameters\n ----------\n\n X : array-like of shape (n_samples, n_features)\n Training vector, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\n y : array-like of shape (n_samples, n_output) or (n_samples,), default=None\n Target relative to X for classification or regression;\n None for unsupervised learning.\n\n groups : array-like of shape (n_samples,), default=None\n Group labels for the samples used while splitting the dataset into\n train/test set. 
Only used in conjunction with a \"Group\" :term:`cv`\n instance (e.g., :class:`~sklearn.model_selection.GroupKFold`).\n\n **fit_params : dict of str -> object\n Parameters passed to the ``fit`` method of the estimator.\n\n Returns\n -------\n self : object\n Instance of fitted estimator.\n \"\"\"\n estimator = self.estimator\n refit_metric = 'score'\n if callable(self.scoring):\n scorers = self.scoring\n elif self.scoring is None or isinstance(self.scoring, str):\n scorers = check_scoring(self.estimator, self.scoring)\n else:\n scorers = _check_multimetric_scoring(self.estimator, self.scoring)\n self._check_refit_for_multimetric(scorers)\n refit_metric = self.refit\n (X, y, groups) = indexable(X, y, groups)\n fit_params = _check_fit_params(X, fit_params)\n cv_orig = check_cv(self.cv, y, classifier=is_classifier(estimator))\n n_splits = cv_orig.get_n_splits(X, y, groups)\n base_estimator = clone(self.estimator)\n parallel = Parallel(n_jobs=self.n_jobs, pre_dispatch=self.pre_dispatch)\n fit_and_score_kwargs = dict(scorer=scorers, fit_params=fit_params, return_train_score=self.return_train_score, return_n_test_samples=True, return_times=True, return_parameters=False, error_score=self.error_score, verbose=self.verbose)\n results = {}\n with parallel:\n all_candidate_params = []\n all_out = []\n all_more_results = defaultdict(list)\n \n def evaluate_candidates(candidate_params, cv=None, more_results=None):\n cv = cv or cv_orig\n candidate_params = list(candidate_params)\n n_candidates = len(candidate_params)\n if self.verbose > 0:\n print('Fitting {0} folds for each of {1} candidates, totalling {2} fits'.format(n_splits, n_candidates, n_candidates * n_splits))\n out = parallel((delayed(_fit_and_score)(clone(base_estimator), X, y, train=train, test=test, parameters=parameters, split_progress=(split_idx, n_splits), candidate_progress=(cand_idx, n_candidates), **fit_and_score_kwargs) for ((cand_idx, parameters), (split_idx, (train, test))) in product(enumerate(candidate_params), enumerate(cv.split(X, y, groups)))))\n if len(out) < 1:\n raise ValueError('No fits were performed. Was the CV iterator empty? Were there no candidates?')\n elif len(out) != n_candidates * n_splits:\n raise ValueError('cv.split and cv.get_n_splits returned inconsistent results. 
Expected {} splits, got {}'.format(n_splits, len(out) // n_candidates))\n _warn_about_fit_failures(out, self.error_score)\n if callable(self.scoring):\n _insert_error_scores(out, self.error_score)\n all_candidate_params.extend(candidate_params)\n all_out.extend(out)\n if more_results is not None:\n for (key, value) in more_results.items():\n all_more_results[key].extend(value)\n nonlocal results\n results = self._format_results(all_candidate_params, n_splits, all_out, all_more_results)\n return results\n self._run_search(evaluate_candidates)\n first_test_score = all_out[0]['test_scores']\n self.multimetric_ = isinstance(first_test_score, dict)\n if callable(self.scoring) and self.multimetric_:\n self._check_refit_for_multimetric(first_test_score)\n refit_metric = self.refit\n if self.refit or not self.multimetric_:\n self.best_index_ = self._select_best_index(self.refit, refit_metric, results)\n if not callable(self.refit):\n self.best_score_ = results[f'mean_test_{refit_metric}'][self.best_index_]\n self.best_params_ = results['params'][self.best_index_]\n if self.refit:\n self.best_estimator_ = clone(clone(base_estimator).set_params(**self.best_params_))\n refit_start_time = time.time()\n if y is not None:\n self.best_estimator_.fit(X, y, **fit_params)\n else:\n self.best_estimator_.fit(X, **fit_params)\n refit_end_time = time.time()\n self.refit_time_ = refit_end_time - refit_start_time\n if hasattr(self.best_estimator_, 'feature_names_in_'):\n self.feature_names_in_ = self.best_estimator_.feature_names_in_\n self.scorer_ = scorers\n self.cv_results_ = results\n self.n_splits_ = n_splits\n return self" }, { @@ -129205,7 +139039,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "Xt", @@ -129215,13 +139050,14 @@ "docstring": { "type": "indexable, length n_samples", "description": "Must fulfill the input assumptions of the\nunderlying estimator." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Call inverse_transform on the estimator with the best found params.\n\nOnly available if the underlying estimator implements ``inverse_transform`` and ``refit=True``.", - "docstring": "Call inverse_transform on the estimator with the best found params.\n\nOnly available if the underlying estimator implements\n``inverse_transform`` and ``refit=True``.\n\nParameters\n----------\nXt : indexable, length n_samples\n Must fulfill the input assumptions of the\n underlying estimator.\n\nReturns\n-------\nX : {ndarray, sparse matrix} of shape (n_samples, n_features)\n Result of the `inverse_transform` function for `Xt` based on the\n estimator with the best found parameters.", + "description": "Call inverse_transform on the estimator with the best found params.\n\nOnly available if the underlying estimator implements\n``inverse_transform`` and ``refit=True``.", + "docstring": "Call inverse_transform on the estimator with the best found params.\n\n Only available if the underlying estimator implements\n ``inverse_transform`` and ``refit=True``.\n\n Parameters\n ----------\n Xt : indexable, length n_samples\n Must fulfill the input assumptions of the\n underlying estimator.\n\n Returns\n -------\n X : {ndarray, sparse matrix} of shape (n_samples, n_features)\n Result of the `inverse_transform` function for `Xt` based on the\n estimator with the best found parameters.\n ", "source_code": "\n@available_if(_estimator_has('inverse_transform'))\ndef inverse_transform(self, Xt):\n \"\"\"Call inverse_transform on the estimator with the best found params.\n\n Only available if the underlying estimator implements\n ``inverse_transform`` and ``refit=True``.\n\n Parameters\n ----------\n Xt : indexable, length n_samples\n Must fulfill the input assumptions of the\n underlying estimator.\n\n Returns\n -------\n X : {ndarray, sparse matrix} of shape (n_samples, n_features)\n Result of the `inverse_transform` function for `Xt` based on the\n estimator with the best found parameters.\n \"\"\"\n check_is_fitted(self)\n return self.best_estimator_.inverse_transform(Xt)" }, { @@ -129239,13 +139075,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Number of features seen during :term:`fit`.\n\nOnly available when `refit=True`.", - "docstring": "Number of features seen during :term:`fit`.\n\nOnly available when `refit=True`.", + "docstring": "Number of features seen during :term:`fit`.\n\n Only available when `refit=True`.\n ", "source_code": "\n@property\ndef n_features_in_(self):\n \"\"\"Number of features seen during :term:`fit`.\n\n Only available when `refit=True`.\n \"\"\"\n try:\n check_is_fitted(self)\n except NotFittedError as nfe:\n raise AttributeError('{} object has no n_features_in_ attribute.'.format(self.__class__.__name__)) from nfe\n return self.best_estimator_.n_features_in_" }, { @@ -129263,7 +139100,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -129273,13 +139111,14 @@ "docstring": { "type": "indexable, length n_samples", "description": "Must fulfill the input assumptions of the\nunderlying estimator." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Call predict on the estimator with the best found parameters.\n\nOnly available if ``refit=True`` and the underlying estimator supports ``predict``.", - "docstring": "Call predict on the estimator with the best found parameters.\n\nOnly available if ``refit=True`` and the underlying estimator supports\n``predict``.\n\nParameters\n----------\nX : indexable, length n_samples\n Must fulfill the input assumptions of the\n underlying estimator.\n\nReturns\n-------\ny_pred : ndarray of shape (n_samples,)\n The predicted labels or values for `X` based on the estimator with\n the best found parameters.", + "description": "Call predict on the estimator with the best found parameters.\n\nOnly available if ``refit=True`` and the underlying estimator supports\n``predict``.", + "docstring": "Call predict on the estimator with the best found parameters.\n\n Only available if ``refit=True`` and the underlying estimator supports\n ``predict``.\n\n Parameters\n ----------\n X : indexable, length n_samples\n Must fulfill the input assumptions of the\n underlying estimator.\n\n Returns\n -------\n y_pred : ndarray of shape (n_samples,)\n The predicted labels or values for `X` based on the estimator with\n the best found parameters.\n ", "source_code": "\n@available_if(_estimator_has('predict'))\ndef predict(self, X):\n \"\"\"Call predict on the estimator with the best found parameters.\n\n Only available if ``refit=True`` and the underlying estimator supports\n ``predict``.\n\n Parameters\n ----------\n X : indexable, length n_samples\n Must fulfill the input assumptions of the\n underlying estimator.\n\n Returns\n -------\n y_pred : ndarray of shape (n_samples,)\n The predicted labels or values for `X` based on the estimator with\n the best found parameters.\n \"\"\"\n check_is_fitted(self)\n return self.best_estimator_.predict(X)" }, { @@ -129297,7 +139136,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -129307,13 +139147,14 @@ "docstring": { "type": "indexable, length n_samples", "description": "Must fulfill the input assumptions of the\nunderlying estimator." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Call predict_log_proba on the estimator with the best found parameters.\n\nOnly available if ``refit=True`` and the underlying estimator supports ``predict_log_proba``.", - "docstring": "Call predict_log_proba on the estimator with the best found parameters.\n\nOnly available if ``refit=True`` and the underlying estimator supports\n``predict_log_proba``.\n\nParameters\n----------\nX : indexable, length n_samples\n Must fulfill the input assumptions of the\n underlying estimator.\n\nReturns\n-------\ny_pred : ndarray of shape (n_samples,) or (n_samples, n_classes)\n Predicted class log-probabilities for `X` based on the estimator\n with the best found parameters. 
The order of the classes\n corresponds to that in the fitted attribute :term:`classes_`.", + "description": "Call predict_log_proba on the estimator with the best found parameters.\n\nOnly available if ``refit=True`` and the underlying estimator supports\n``predict_log_proba``.", + "docstring": "Call predict_log_proba on the estimator with the best found parameters.\n\n Only available if ``refit=True`` and the underlying estimator supports\n ``predict_log_proba``.\n\n Parameters\n ----------\n X : indexable, length n_samples\n Must fulfill the input assumptions of the\n underlying estimator.\n\n Returns\n -------\n y_pred : ndarray of shape (n_samples,) or (n_samples, n_classes)\n Predicted class log-probabilities for `X` based on the estimator\n with the best found parameters. The order of the classes\n corresponds to that in the fitted attribute :term:`classes_`.\n ", "source_code": "\n@available_if(_estimator_has('predict_log_proba'))\ndef predict_log_proba(self, X):\n \"\"\"Call predict_log_proba on the estimator with the best found parameters.\n\n Only available if ``refit=True`` and the underlying estimator supports\n ``predict_log_proba``.\n\n Parameters\n ----------\n X : indexable, length n_samples\n Must fulfill the input assumptions of the\n underlying estimator.\n\n Returns\n -------\n y_pred : ndarray of shape (n_samples,) or (n_samples, n_classes)\n Predicted class log-probabilities for `X` based on the estimator\n with the best found parameters. The order of the classes\n corresponds to that in the fitted attribute :term:`classes_`.\n \"\"\"\n check_is_fitted(self)\n return self.best_estimator_.predict_log_proba(X)" }, { @@ -129331,7 +139172,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -129341,13 +139183,14 @@ "docstring": { "type": "indexable, length n_samples", "description": "Must fulfill the input assumptions of the\nunderlying estimator." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Call predict_proba on the estimator with the best found parameters.\n\nOnly available if ``refit=True`` and the underlying estimator supports ``predict_proba``.", - "docstring": "Call predict_proba on the estimator with the best found parameters.\n\nOnly available if ``refit=True`` and the underlying estimator supports\n``predict_proba``.\n\nParameters\n----------\nX : indexable, length n_samples\n Must fulfill the input assumptions of the\n underlying estimator.\n\nReturns\n-------\ny_pred : ndarray of shape (n_samples,) or (n_samples, n_classes)\n Predicted class probabilities for `X` based on the estimator with\n the best found parameters. The order of the classes corresponds\n to that in the fitted attribute :term:`classes_`.", + "description": "Call predict_proba on the estimator with the best found parameters.\n\nOnly available if ``refit=True`` and the underlying estimator supports\n``predict_proba``.", + "docstring": "Call predict_proba on the estimator with the best found parameters.\n\n Only available if ``refit=True`` and the underlying estimator supports\n ``predict_proba``.\n\n Parameters\n ----------\n X : indexable, length n_samples\n Must fulfill the input assumptions of the\n underlying estimator.\n\n Returns\n -------\n y_pred : ndarray of shape (n_samples,) or (n_samples, n_classes)\n Predicted class probabilities for `X` based on the estimator with\n the best found parameters. 
The order of the classes corresponds\n to that in the fitted attribute :term:`classes_`.\n ", "source_code": "\n@available_if(_estimator_has('predict_proba'))\ndef predict_proba(self, X):\n \"\"\"Call predict_proba on the estimator with the best found parameters.\n\n Only available if ``refit=True`` and the underlying estimator supports\n ``predict_proba``.\n\n Parameters\n ----------\n X : indexable, length n_samples\n Must fulfill the input assumptions of the\n underlying estimator.\n\n Returns\n -------\n y_pred : ndarray of shape (n_samples,) or (n_samples, n_classes)\n Predicted class probabilities for `X` based on the estimator with\n the best found parameters. The order of the classes corresponds\n to that in the fitted attribute :term:`classes_`.\n \"\"\"\n check_is_fitted(self)\n return self.best_estimator_.predict_proba(X)" }, { @@ -129365,7 +139208,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -129375,7 +139219,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "Input data, where `n_samples` is the number of samples and\n`n_features` is the number of features." - } + }, + "refined_type": {} }, { "name": "y", @@ -129385,13 +139230,14 @@ "docstring": { "type": "array-like of shape (n_samples, n_output) or (n_samples,), default=None", "description": "Target relative to X for classification or regression;\nNone for unsupervised learning." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Return the score on the given data, if the estimator has been refit.\n\nThis uses the score defined by ``scoring`` where provided, and the ``best_estimator_.score`` method otherwise.", - "docstring": "Return the score on the given data, if the estimator has been refit.\n\nThis uses the score defined by ``scoring`` where provided, and the\n``best_estimator_.score`` method otherwise.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n Input data, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\ny : array-like of shape (n_samples, n_output) or (n_samples,), default=None\n Target relative to X for classification or regression;\n None for unsupervised learning.\n\nReturns\n-------\nscore : float\n The score defined by ``scoring`` if provided, and the\n ``best_estimator_.score`` method otherwise.", + "description": "Return the score on the given data, if the estimator has been refit.\n\nThis uses the score defined by ``scoring`` where provided, and the\n``best_estimator_.score`` method otherwise.", + "docstring": "Return the score on the given data, if the estimator has been refit.\n\n This uses the score defined by ``scoring`` where provided, and the\n ``best_estimator_.score`` method otherwise.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Input data, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\n y : array-like of shape (n_samples, n_output) or (n_samples,), default=None\n Target relative to X for classification or regression;\n None for unsupervised learning.\n\n Returns\n -------\n score : float\n The score defined by ``scoring`` if provided, and the\n ``best_estimator_.score`` method otherwise.\n ", "source_code": "\ndef score(self, X, y=None):\n \"\"\"Return the score on the given data, if the estimator has been refit.\n\n This uses the score defined by ``scoring`` where provided, and the\n ``best_estimator_.score`` method 
otherwise.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Input data, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\n y : array-like of shape (n_samples, n_output) or (n_samples,), default=None\n Target relative to X for classification or regression;\n None for unsupervised learning.\n\n Returns\n -------\n score : float\n The score defined by ``scoring`` if provided, and the\n ``best_estimator_.score`` method otherwise.\n \"\"\"\n _check_refit(self, 'score')\n check_is_fitted(self)\n if self.scorer_ is None:\n raise ValueError(\"No score function explicitly defined, and the estimator doesn't provide one %s\" % self.best_estimator_)\n if isinstance(self.scorer_, dict):\n if self.multimetric_:\n scorer = self.scorer_[self.refit]\n else:\n scorer = self.scorer_\n return scorer(self.best_estimator_, X, y)\n score = self.scorer_(self.best_estimator_, X, y)\n if self.multimetric_:\n score = score[self.refit]\n return score" }, { @@ -129409,7 +139255,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -129419,13 +139266,14 @@ "docstring": { "type": "iterable", "description": "Data to predict on. Must fulfill input requirements\nof the underlying estimator." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Call score_samples on the estimator with the best found parameters.\n\nOnly available if ``refit=True`` and the underlying estimator supports ``score_samples``. .. versionadded:: 0.24", - "docstring": "Call score_samples on the estimator with the best found parameters.\n\nOnly available if ``refit=True`` and the underlying estimator supports\n``score_samples``.\n\n.. versionadded:: 0.24\n\nParameters\n----------\nX : iterable\n Data to predict on. Must fulfill input requirements\n of the underlying estimator.\n\nReturns\n-------\ny_score : ndarray of shape (n_samples,)\n The ``best_estimator_.score_samples`` method.", + "description": "Call score_samples on the estimator with the best found parameters.\n\nOnly available if ``refit=True`` and the underlying estimator supports\n``score_samples``.\n\n.. versionadded:: 0.24", + "docstring": "Call score_samples on the estimator with the best found parameters.\n\n Only available if ``refit=True`` and the underlying estimator supports\n ``score_samples``.\n\n .. versionadded:: 0.24\n\n Parameters\n ----------\n X : iterable\n Data to predict on. Must fulfill input requirements\n of the underlying estimator.\n\n Returns\n -------\n y_score : ndarray of shape (n_samples,)\n The ``best_estimator_.score_samples`` method.\n ", "source_code": "\n@available_if(_estimator_has('score_samples'))\ndef score_samples(self, X):\n \"\"\"Call score_samples on the estimator with the best found parameters.\n\n Only available if ``refit=True`` and the underlying estimator supports\n ``score_samples``.\n\n .. versionadded:: 0.24\n\n Parameters\n ----------\n X : iterable\n Data to predict on. 
Must fulfill input requirements\n of the underlying estimator.\n\n Returns\n -------\n y_score : ndarray of shape (n_samples,)\n The ``best_estimator_.score_samples`` method.\n \"\"\"\n check_is_fitted(self)\n return self.best_estimator_.score_samples(X)" }, { @@ -129443,7 +139291,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -129453,13 +139302,14 @@ "docstring": { "type": "indexable, length n_samples", "description": "Must fulfill the input assumptions of the\nunderlying estimator." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Call transform on the estimator with the best found parameters.\n\nOnly available if the underlying estimator supports ``transform`` and ``refit=True``.", - "docstring": "Call transform on the estimator with the best found parameters.\n\nOnly available if the underlying estimator supports ``transform`` and\n``refit=True``.\n\nParameters\n----------\nX : indexable, length n_samples\n Must fulfill the input assumptions of the\n underlying estimator.\n\nReturns\n-------\nXt : {ndarray, sparse matrix} of shape (n_samples, n_features)\n `X` transformed in the new space based on the estimator with\n the best found parameters.", + "description": "Call transform on the estimator with the best found parameters.\n\nOnly available if the underlying estimator supports ``transform`` and\n``refit=True``.", + "docstring": "Call transform on the estimator with the best found parameters.\n\n Only available if the underlying estimator supports ``transform`` and\n ``refit=True``.\n\n Parameters\n ----------\n X : indexable, length n_samples\n Must fulfill the input assumptions of the\n underlying estimator.\n\n Returns\n -------\n Xt : {ndarray, sparse matrix} of shape (n_samples, n_features)\n `X` transformed in the new space based on the estimator with\n the best found parameters.\n ", "source_code": "\n@available_if(_estimator_has('transform'))\ndef transform(self, X):\n \"\"\"Call transform on the estimator with the best found parameters.\n\n Only available if the underlying estimator supports ``transform`` and\n ``refit=True``.\n\n Parameters\n ----------\n X : indexable, length n_samples\n Must fulfill the input assumptions of the\n underlying estimator.\n\n Returns\n -------\n Xt : {ndarray, sparse matrix} of shape (n_samples, n_features)\n `X` transformed in the new space based on the estimator with\n the best found parameters.\n \"\"\"\n check_is_fitted(self)\n return self.best_estimator_.transform(X)" }, { @@ -129477,7 +139327,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "estimator", @@ -129487,7 +139338,8 @@ "docstring": { "type": "estimator object", "description": "This is assumed to implement the scikit-learn estimator interface.\nEither estimator needs to provide a ``score`` function,\nor ``scoring`` must be passed." - } + }, + "refined_type": {} }, { "name": "param_grid", @@ -129497,7 +139349,8 @@ "docstring": { "type": "dict or list of dictionaries", "description": "Dictionary with parameters names (`str`) as keys and lists of\nparameter settings to try as values, or a list of such\ndictionaries, in which case the grids spanned by each dictionary\nin the list are explored. This enables searching over any sequence\nof parameter settings." 
- } + }, + "refined_type": {} }, { "name": "scoring", @@ -129507,7 +139360,8 @@ "docstring": { "type": "str, callable, list, tuple or dict, default=None", "description": "Strategy to evaluate the performance of the cross-validated model on\nthe test set.\n\nIf `scoring` represents a single score, one can use:\n\n- a single string (see :ref:`scoring_parameter`);\n- a callable (see :ref:`scoring`) that returns a single value.\n\nIf `scoring` represents multiple scores, one can use:\n\n- a list or tuple of unique strings;\n- a callable returning a dictionary where the keys are the metric\n names and the values are the metric scores;\n- a dictionary with metric names as keys and callables a values.\n\nSee :ref:`multimetric_grid_search` for an example." - } + }, + "refined_type": {} }, { "name": "n_jobs", @@ -129517,7 +139371,8 @@ "docstring": { "type": "int, default=None", "description": "Number of jobs to run in parallel.\n``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n``-1`` means using all processors. See :term:`Glossary `\nfor more details.\n\n.. versionchanged:: v0.20\n `n_jobs` default changed from 1 to None" - } + }, + "refined_type": {} }, { "name": "refit", @@ -129527,7 +139382,8 @@ "docstring": { "type": "bool, str, or callable, default=True", "description": "Refit an estimator using the best found parameters on the whole\ndataset.\n\nFor multiple metric evaluation, this needs to be a `str` denoting the\nscorer that would be used to find the best parameters for refitting\nthe estimator at the end.\n\nWhere there are considerations other than maximum score in\nchoosing a best estimator, ``refit`` can be set to a function which\nreturns the selected ``best_index_`` given ``cv_results_``. In that\ncase, the ``best_estimator_`` and ``best_params_`` will be set\naccording to the returned ``best_index_`` while the ``best_score_``\nattribute will not be available.\n\nThe refitted estimator is made available at the ``best_estimator_``\nattribute and permits using ``predict`` directly on this\n``GridSearchCV`` instance.\n\nAlso for multiple metric evaluation, the attributes ``best_index_``,\n``best_score_`` and ``best_params_`` will only be available if\n``refit`` is set and all of them will be determined w.r.t this specific\nscorer.\n\nSee ``scoring`` parameter to know more about multiple metric\nevaluation.\n\n.. versionchanged:: 0.20\n Support for callable added." - } + }, + "refined_type": {} }, { "name": "cv", @@ -129537,7 +139393,8 @@ "docstring": { "type": "int, cross-validation generator or an iterable, default=None", "description": "Determines the cross-validation splitting strategy.\nPossible inputs for cv are:\n\n- None, to use the default 5-fold cross validation,\n- integer, to specify the number of folds in a `(Stratified)KFold`,\n- :term:`CV splitter`,\n- An iterable yielding (train, test) splits as arrays of indices.\n\nFor integer/None inputs, if the estimator is a classifier and ``y`` is\neither binary or multiclass, :class:`StratifiedKFold` is used. In all\nother cases, :class:`KFold` is used. These splitters are instantiated\nwith `shuffle=False` so the splits will be the same across calls.\n\nRefer :ref:`User Guide ` for the various\ncross-validation strategies that can be used here.\n\n.. versionchanged:: 0.22\n ``cv`` default value if None changed from 3-fold to 5-fold." 
- } + }, + "refined_type": {} }, { "name": "verbose", @@ -129547,7 +139404,8 @@ "docstring": { "type": "int", "description": "Controls the verbosity: the higher, the more messages.\n\n- >1 : the computation time for each fold and parameter candidate is\n displayed;\n- >2 : the score is also displayed;\n- >3 : the fold and candidate parameter indexes are also displayed\n together with the starting time of the computation." - } + }, + "refined_type": {} }, { "name": "pre_dispatch", @@ -129557,7 +139415,8 @@ "docstring": { "type": "int, or str, default='2*n_jobs'", "description": "Controls the number of jobs that get dispatched during parallel\nexecution. Reducing this number can be useful to avoid an\nexplosion of memory consumption when more jobs get dispatched\nthan CPUs can process. This parameter can be:\n\n - None, in which case all the jobs are immediately\n created and spawned. Use this for lightweight and\n fast-running jobs, to avoid delays due to on-demand\n spawning of the jobs\n\n - An int, giving the exact number of total jobs that are\n spawned\n\n - A str, giving an expression as a function of n_jobs,\n as in '2*n_jobs'" - } + }, + "refined_type": {} }, { "name": "error_score", @@ -129567,7 +139426,8 @@ "docstring": { "type": "'raise' or numeric, default=np.nan", "description": "Value to assign to the score if an error occurs in estimator fitting.\nIf set to 'raise', the error is raised. If a numeric value is given,\nFitFailedWarning is raised. This parameter does not affect the refit\nstep, which will always raise the error." - } + }, + "refined_type": {} }, { "name": "return_train_score", @@ -129577,13 +139437,14 @@ "docstring": { "type": "bool, default=False", "description": "If ``False``, the ``cv_results_`` attribute will not include training\nscores.\nComputing training scores is used to get insights on how different\nparameter settings impact the overfitting/underfitting trade-off.\nHowever computing the scores on the training set can be computationally\nexpensive and is not strictly required to select the parameters that\nyield the best generalization performance.\n\n.. versionadded:: 0.19\n\n.. 
versionchanged:: 0.21\n Default value was changed from ``True`` to ``False``" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, estimator, param_grid, *, scoring=None, n_jobs=None, refit=True, cv=None, verbose=0, pre_dispatch='2*n_jobs', error_score=np.nan, return_train_score=False):\n super().__init__(estimator=estimator, scoring=scoring, n_jobs=n_jobs, refit=refit, cv=cv, verbose=verbose, pre_dispatch=pre_dispatch, error_score=error_score, return_train_score=return_train_score)\n self.param_grid = param_grid\n _check_param_grid(param_grid)" }, { @@ -129601,7 +139462,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "evaluate_candidates", @@ -129611,7 +139473,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -129635,7 +139498,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "ind", @@ -129645,13 +139509,14 @@ "docstring": { "type": "int", "description": "The iteration index" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Get the parameters that would be ``ind``th in iteration", - "docstring": "Get the parameters that would be ``ind``th in iteration\n\nParameters\n----------\nind : int\n The iteration index\n\nReturns\n-------\nparams : dict of str to any\n Equal to list(self)[ind]", + "docstring": "Get the parameters that would be ``ind``th in iteration\n\n Parameters\n ----------\n ind : int\n The iteration index\n\n Returns\n -------\n params : dict of str to any\n Equal to list(self)[ind]\n ", "source_code": "\ndef __getitem__(self, ind):\n \"\"\"Get the parameters that would be ``ind``th in iteration\n\n Parameters\n ----------\n ind : int\n The iteration index\n\n Returns\n -------\n params : dict of str to any\n Equal to list(self)[ind]\n \"\"\"\n for sub_grid in self.param_grid:\n if not sub_grid:\n if ind == 0:\n return {}\n else:\n ind -= 1\n continue\n (keys, values_lists) = zip(*sorted(sub_grid.items())[::-1])\n sizes = [len(v_list) for v_list in values_lists]\n total = np.product(sizes)\n if ind >= total:\n ind -= total\n else:\n out = {}\n for (key, v_list, n) in zip(keys, values_lists, sizes):\n (ind, offset) = divmod(ind, n)\n out[key] = v_list[offset]\n return out\n raise IndexError('ParameterGrid index out of range')" }, { @@ -129669,7 +139534,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "param_grid", @@ -129679,13 +139545,14 @@ "docstring": { "type": "dict of str to sequence, or sequence of such", "description": "The parameter grid to explore, as a dictionary mapping estimator\nparameters to sequences of allowed values.\n\nAn empty dict signifies default parameters.\n\nA sequence of dicts signifies a sequence of grids to search, and is\nuseful to avoid exploring parameter combinations that make no sense\nor have no effect. See the examples below." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, param_grid):\n if not isinstance(param_grid, (Mapping, Iterable)):\n raise TypeError('Parameter grid is not a dict or a list ({!r})'.format(param_grid))\n if isinstance(param_grid, Mapping):\n param_grid = [param_grid]\n for grid in param_grid:\n if not isinstance(grid, dict):\n raise TypeError('Parameter grid is not a dict ({!r})'.format(grid))\n for key in grid:\n if not isinstance(grid[key], Iterable):\n raise TypeError('Parameter grid value is not iterable (key={!r}, value={!r})'.format(key, grid[key]))\n self.param_grid = param_grid" }, { @@ -129703,13 +139570,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Iterate over the points in the grid.", - "docstring": "Iterate over the points in the grid.\n\nReturns\n-------\nparams : iterator over dict of str to any\n Yields dictionaries mapping each estimator parameter to one of its\n allowed values.", + "docstring": "Iterate over the points in the grid.\n\n Returns\n -------\n params : iterator over dict of str to any\n Yields dictionaries mapping each estimator parameter to one of its\n allowed values.\n ", "source_code": "\ndef __iter__(self):\n \"\"\"Iterate over the points in the grid.\n\n Returns\n -------\n params : iterator over dict of str to any\n Yields dictionaries mapping each estimator parameter to one of its\n allowed values.\n \"\"\"\n for p in self.param_grid:\n items = sorted(p.items())\n if not items:\n yield {}\n else:\n (keys, values) = zip(*items)\n for v in product(*values):\n params = dict(zip(keys, v))\n yield params" }, { @@ -129727,7 +139595,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -129751,7 +139620,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "param_distributions", @@ -129761,7 +139631,8 @@ "docstring": { "type": "dict", "description": "Dictionary with parameters names (`str`) as keys and distributions\nor lists of parameters to try. Distributions must provide a ``rvs``\nmethod for sampling (such as those from scipy.stats.distributions).\nIf a list is given, it is sampled uniformly.\nIf a list of dicts is given, first a dict is sampled uniformly, and\nthen a parameter is sampled using that dict as above." - } + }, + "refined_type": {} }, { "name": "n_iter", @@ -129771,7 +139642,8 @@ "docstring": { "type": "int", "description": "Number of parameter settings that are produced." - } + }, + "refined_type": {} }, { "name": "random_state", @@ -129781,13 +139653,14 @@ "docstring": { "type": "int, RandomState instance or None, default=None", "description": "Pseudo random number generator state used for random uniform sampling\nfrom lists of possible values instead of scipy.stats distributions.\nPass an int for reproducible output across multiple\nfunction calls.\nSee :term:`Glossary `." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, param_distributions, n_iter, *, random_state=None):\n if not isinstance(param_distributions, (Mapping, Iterable)):\n raise TypeError('Parameter distribution is not a dict or a list ({!r})'.format(param_distributions))\n if isinstance(param_distributions, Mapping):\n param_distributions = [param_distributions]\n for dist in param_distributions:\n if not isinstance(dist, dict):\n raise TypeError('Parameter distribution is not a dict ({!r})'.format(dist))\n for key in dist:\n if not isinstance(dist[key], Iterable) and not hasattr(dist[key], 'rvs'):\n raise TypeError('Parameter value is not iterable or distribution (key={!r}, value={!r})'.format(key, dist[key]))\n self.n_iter = n_iter\n self.random_state = random_state\n self.param_distributions = param_distributions" }, { @@ -129805,13 +139678,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __iter__(self):\n rng = check_random_state(self.random_state)\n if self._is_all_lists():\n param_grid = ParameterGrid(self.param_distributions)\n grid_size = len(param_grid)\n n_iter = self.n_iter\n if grid_size < n_iter:\n warnings.warn('The total space of parameters %d is smaller than n_iter=%d. Running %d iterations. For exhaustive searches, use GridSearchCV.' % (grid_size, self.n_iter, grid_size), UserWarning)\n n_iter = grid_size\n for i in sample_without_replacement(grid_size, n_iter, random_state=rng):\n yield param_grid[i]\n else:\n for _ in range(self.n_iter):\n dist = rng.choice(self.param_distributions)\n items = sorted(dist.items())\n params = dict()\n for (k, v) in items:\n if hasattr(v, 'rvs'):\n params[k] = v.rvs(random_state=rng)\n else:\n params[k] = v[rng.randint(len(v))]\n yield params" }, { @@ -129829,7 +139703,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -129853,13 +139728,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _is_all_lists(self):\n return all((all((not hasattr(v, 'rvs') for v in dist.values())) for dist in self.param_distributions))" }, { @@ -129877,7 +139753,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "estimator", @@ -129887,7 +139764,8 @@ "docstring": { "type": "estimator object", "description": "A object of that type is instantiated for each grid point.\nThis is assumed to implement the scikit-learn estimator interface.\nEither estimator needs to provide a ``score`` function,\nor ``scoring`` must be passed." - } + }, + "refined_type": {} }, { "name": "param_distributions", @@ -129897,7 +139775,8 @@ "docstring": { "type": "dict or list of dicts", "description": "Dictionary with parameters names (`str`) as keys and distributions\nor lists of parameters to try. Distributions must provide a ``rvs``\nmethod for sampling (such as those from scipy.stats.distributions).\nIf a list is given, it is sampled uniformly.\nIf a list of dicts is given, first a dict is sampled uniformly, and\nthen a parameter is sampled using that dict as above." 
- } + }, + "refined_type": {} }, { "name": "n_iter", @@ -129907,7 +139786,8 @@ "docstring": { "type": "int, default=10", "description": "Number of parameter settings that are sampled. n_iter trades\noff runtime vs quality of the solution." - } + }, + "refined_type": {} }, { "name": "scoring", @@ -129917,7 +139797,8 @@ "docstring": { "type": "str, callable, list, tuple or dict, default=None", "description": "Strategy to evaluate the performance of the cross-validated model on\nthe test set.\n\nIf `scoring` represents a single score, one can use:\n\n- a single string (see :ref:`scoring_parameter`);\n- a callable (see :ref:`scoring`) that returns a single value.\n\nIf `scoring` represents multiple scores, one can use:\n\n- a list or tuple of unique strings;\n- a callable returning a dictionary where the keys are the metric\n names and the values are the metric scores;\n- a dictionary with metric names as keys and callables a values.\n\nSee :ref:`multimetric_grid_search` for an example.\n\nIf None, the estimator's score method is used." - } + }, + "refined_type": {} }, { "name": "n_jobs", @@ -129927,7 +139808,8 @@ "docstring": { "type": "int, default=None", "description": "Number of jobs to run in parallel.\n``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n``-1`` means using all processors. See :term:`Glossary `\nfor more details.\n\n.. versionchanged:: v0.20\n `n_jobs` default changed from 1 to None" - } + }, + "refined_type": {} }, { "name": "refit", @@ -129937,7 +139819,8 @@ "docstring": { "type": "bool, str, or callable, default=True", "description": "Refit an estimator using the best found parameters on the whole\ndataset.\n\nFor multiple metric evaluation, this needs to be a `str` denoting the\nscorer that would be used to find the best parameters for refitting\nthe estimator at the end.\n\nWhere there are considerations other than maximum score in\nchoosing a best estimator, ``refit`` can be set to a function which\nreturns the selected ``best_index_`` given the ``cv_results``. In that\ncase, the ``best_estimator_`` and ``best_params_`` will be set\naccording to the returned ``best_index_`` while the ``best_score_``\nattribute will not be available.\n\nThe refitted estimator is made available at the ``best_estimator_``\nattribute and permits using ``predict`` directly on this\n``RandomizedSearchCV`` instance.\n\nAlso for multiple metric evaluation, the attributes ``best_index_``,\n``best_score_`` and ``best_params_`` will only be available if\n``refit`` is set and all of them will be determined w.r.t this specific\nscorer.\n\nSee ``scoring`` parameter to know more about multiple metric\nevaluation.\n\n.. versionchanged:: 0.20\n Support for callable added." - } + }, + "refined_type": {} }, { "name": "cv", @@ -129947,7 +139830,8 @@ "docstring": { "type": "int, cross-validation generator or an iterable, default=None", "description": "Determines the cross-validation splitting strategy.\nPossible inputs for cv are:\n\n- None, to use the default 5-fold cross validation,\n- integer, to specify the number of folds in a `(Stratified)KFold`,\n- :term:`CV splitter`,\n- An iterable yielding (train, test) splits as arrays of indices.\n\nFor integer/None inputs, if the estimator is a classifier and ``y`` is\neither binary or multiclass, :class:`StratifiedKFold` is used. In all\nother cases, :class:`KFold` is used. 
These splitters are instantiated\nwith `shuffle=False` so the splits will be the same across calls.\n\nRefer :ref:`User Guide ` for the various\ncross-validation strategies that can be used here.\n\n.. versionchanged:: 0.22\n ``cv`` default value if None changed from 3-fold to 5-fold." - } + }, + "refined_type": {} }, { "name": "verbose", @@ -129957,7 +139841,8 @@ "docstring": { "type": "int", "description": "Controls the verbosity: the higher, the more messages." - } + }, + "refined_type": {} }, { "name": "pre_dispatch", @@ -129967,7 +139852,8 @@ "docstring": { "type": "int, or str, default='2*n_jobs'", "description": "Controls the number of jobs that get dispatched during parallel\nexecution. Reducing this number can be useful to avoid an\nexplosion of memory consumption when more jobs get dispatched\nthan CPUs can process. This parameter can be:\n\n - None, in which case all the jobs are immediately\n created and spawned. Use this for lightweight and\n fast-running jobs, to avoid delays due to on-demand\n spawning of the jobs\n\n - An int, giving the exact number of total jobs that are\n spawned\n\n - A str, giving an expression as a function of n_jobs,\n as in '2*n_jobs'" - } + }, + "refined_type": {} }, { "name": "random_state", @@ -129977,7 +139863,8 @@ "docstring": { "type": "int, RandomState instance or None, default=None", "description": "Pseudo random number generator state used for random uniform sampling\nfrom lists of possible values instead of scipy.stats distributions.\nPass an int for reproducible output across multiple\nfunction calls.\nSee :term:`Glossary `." - } + }, + "refined_type": {} }, { "name": "error_score", @@ -129987,7 +139874,8 @@ "docstring": { "type": "'raise' or numeric, default=np.nan", "description": "Value to assign to the score if an error occurs in estimator fitting.\nIf set to 'raise', the error is raised. If a numeric value is given,\nFitFailedWarning is raised. This parameter does not affect the refit\nstep, which will always raise the error." - } + }, + "refined_type": {} }, { "name": "return_train_score", @@ -129997,13 +139885,14 @@ "docstring": { "type": "bool, default=False", "description": "If ``False``, the ``cv_results_`` attribute will not include training\nscores.\nComputing training scores is used to get insights on how different\nparameter settings impact the overfitting/underfitting trade-off.\nHowever computing the scores on the training set can be computationally\nexpensive and is not strictly required to select the parameters that\nyield the best generalization performance.\n\n.. versionadded:: 0.19\n\n.. 
versionchanged:: 0.21\n Default value was changed from ``True`` to ``False``" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, estimator, param_distributions, *, n_iter=10, scoring=None, n_jobs=None, refit=True, cv=None, verbose=0, pre_dispatch='2*n_jobs', random_state=None, error_score=np.nan, return_train_score=False):\n self.param_distributions = param_distributions\n self.n_iter = n_iter\n self.random_state = random_state\n super().__init__(estimator=estimator, scoring=scoring, n_jobs=n_jobs, refit=refit, cv=cv, verbose=verbose, pre_dispatch=pre_dispatch, error_score=error_score, return_train_score=return_train_score)" }, { @@ -130021,7 +139910,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "evaluate_candidates", @@ -130031,7 +139921,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -130055,13 +139946,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _check_param_grid(param_grid):\n if hasattr(param_grid, 'items'):\n param_grid = [param_grid]\n for p in param_grid:\n for (name, v) in p.items():\n if isinstance(v, np.ndarray) and v.ndim > 1:\n raise ValueError('Parameter array should be one-dimensional.')\n if isinstance(v, str) or not isinstance(v, (np.ndarray, Sequence)):\n raise ValueError('Parameter grid for parameter ({0}) needs to be a list or numpy array, but got ({1}). Single values need to be wrapped in a list with one element.'.format(name, type(v)))\n if len(v) == 0:\n raise ValueError('Parameter values for parameter ({0}) need to be a non-empty sequence.'.format(name))" }, { @@ -130079,7 +139971,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "attr", @@ -130089,13 +139982,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _check_refit(search_cv, attr):\n if not search_cv.refit:\n raise AttributeError(f'This {type(search_cv).__name__} instance was initialized with `refit=False`. {attr} is available only after refitting on the best parameters. You can refit an estimator manually using the `best_params_` attribute')" }, { @@ -130113,13 +140007,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Check if we can delegate a method to the underlying estimator.\n\nCalling a prediction method will only be available if `refit=True`. In such case, we check first the fitted best estimator. If it is not fitted, we check the unfitted estimator. Checking the unfitted estimator allows to use `hasattr` on the `SearchCV` instance even before calling `fit`.", - "docstring": "Check if we can delegate a method to the underlying estimator.\n\nCalling a prediction method will only be available if `refit=True`. In\nsuch case, we check first the fitted best estimator. 
If it is not\nfitted, we check the unfitted estimator.\n\nChecking the unfitted estimator allows to use `hasattr` on the `SearchCV`\ninstance even before calling `fit`.", + "description": "Check if we can delegate a method to the underlying estimator.\n\nCalling a prediction method will only be available if `refit=True`. In\nsuch case, we check first the fitted best estimator. If it is not\nfitted, we check the unfitted estimator.\n\nChecking the unfitted estimator allows to use `hasattr` on the `SearchCV`\ninstance even before calling `fit`.", + "docstring": "Check if we can delegate a method to the underlying estimator.\n\n Calling a prediction method will only be available if `refit=True`. In\n such case, we check first the fitted best estimator. If it is not\n fitted, we check the unfitted estimator.\n\n Checking the unfitted estimator allows to use `hasattr` on the `SearchCV`\n instance even before calling `fit`.\n ", "source_code": "\ndef _estimator_has(attr):\n \"\"\"Check if we can delegate a method to the underlying estimator.\n\n Calling a prediction method will only be available if `refit=True`. In\n such case, we check first the fitted best estimator. If it is not\n fitted, we check the unfitted estimator.\n\n Checking the unfitted estimator allows to use `hasattr` on the `SearchCV`\n instance even before calling `fit`.\n \"\"\"\n \n def check(self):\n _check_refit(self, attr)\n if hasattr(self, 'best_estimator_'):\n getattr(self.best_estimator_, attr)\n return True\n getattr(self.estimator, attr)\n return True\n return check" }, { @@ -130137,7 +140032,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "estimator", @@ -130147,7 +140043,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "scoring", @@ -130157,7 +140054,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_jobs", @@ -130167,7 +140065,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "refit", @@ -130177,7 +140076,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "cv", @@ -130187,7 +140087,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "verbose", @@ -130197,7 +140098,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "random_state", @@ -130207,7 +140109,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "error_score", @@ -130217,7 +140120,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "return_train_score", @@ -130227,7 +140131,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "max_resources", @@ -130237,7 +140142,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "min_resources", @@ -130247,7 +140153,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "resource", @@ -130257,7 +140164,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "factor", @@ -130267,7 +140175,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "aggressive_elimination", @@ -130277,13 +140186,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + 
"docstring": null, "source_code": "\ndef __init__(self, estimator, *, scoring=None, n_jobs=None, refit=True, cv=5, verbose=0, random_state=None, error_score=np.nan, return_train_score=True, max_resources='auto', min_resources='exhaust', resource='n_samples', factor=3, aggressive_elimination=False):\n super().__init__(estimator, scoring=scoring, n_jobs=n_jobs, refit=refit, cv=cv, verbose=verbose, error_score=error_score, return_train_score=return_train_score)\n self.random_state = random_state\n self.max_resources = max_resources\n self.resource = resource\n self.factor = factor\n self.min_resources = min_resources\n self.aggressive_elimination = aggressive_elimination" }, { @@ -130301,7 +140211,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -130311,7 +140222,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -130321,7 +140233,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "groups", @@ -130331,13 +140244,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _check_input_parameters(self, X, y, groups):\n if self.scoring is not None and not (isinstance(self.scoring, str) or callable(self.scoring)):\n raise ValueError('scoring parameter must be a string, a callable or None. Multimetric scoring is not supported.')\n if not _yields_constant_splits(self._checked_cv_orig):\n raise ValueError('The cv parameter must yield consistent folds across calls to split(). Set its random_state to an int, or set shuffle=False.')\n if self.resource != 'n_samples' and self.resource not in self.estimator.get_params():\n raise ValueError(f'Cannot use resource={self.resource} which is not supported by estimator {self.estimator.__class__.__name__}')\n if isinstance(self.max_resources, str) and self.max_resources != 'auto':\n raise ValueError(\"max_resources must be either 'auto' or a positive integer\")\n if self.max_resources != 'auto' and (not isinstance(self.max_resources, Integral) or self.max_resources <= 0):\n raise ValueError(\"max_resources must be either 'auto' or a positive integer\")\n if self.min_resources not in ('smallest', 'exhaust') and (not isinstance(self.min_resources, Integral) or self.min_resources <= 0):\n raise ValueError(\"min_resources must be either 'smallest', 'exhaust', or a positive integer no greater than max_resources.\")\n if isinstance(self, HalvingRandomSearchCV):\n if self.min_resources == self.n_candidates == 'exhaust':\n raise ValueError(\"n_candidates and min_resources cannot be both set to 'exhaust'.\")\n if self.n_candidates != 'exhaust' and (not isinstance(self.n_candidates, Integral) or self.n_candidates <= 0):\n raise ValueError(\"n_candidates must be either 'exhaust' or a positive integer\")\n self.min_resources_ = self.min_resources\n if self.min_resources_ in ('smallest', 'exhaust'):\n if self.resource == 'n_samples':\n n_splits = self._checked_cv_orig.get_n_splits(X, y, groups)\n magic_factor = 2\n self.min_resources_ = n_splits * magic_factor\n if is_classifier(self.estimator):\n y = self._validate_data(X='no_validation', y=y)\n check_classification_targets(y)\n n_classes = np.unique(y).shape[0]\n self.min_resources_ *= n_classes\n else:\n self.min_resources_ = 1\n self.max_resources_ = self.max_resources\n if self.max_resources_ == 'auto':\n if not self.resource == 
'n_samples':\n raise ValueError(\"max_resources can only be 'auto' if resource='n_samples'\")\n self.max_resources_ = _num_samples(X)\n if self.min_resources_ > self.max_resources_:\n raise ValueError(f'min_resources_={self.min_resources_} is greater than max_resources_={self.max_resources_}.')\n if self.min_resources_ == 0:\n raise ValueError(f'min_resources_={self.min_resources_}: you might have passed an empty dataset X.')\n if not isinstance(self.refit, bool):\n raise ValueError(f'refit is expected to be a boolean. Got {type(self.refit)} instead.')" }, { @@ -130355,13 +140269,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@abstractmethod\ndef _generate_candidate_params(self):\n pass" }, { @@ -130379,13 +140294,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _more_tags(self):\n tags = deepcopy(super()._more_tags())\n tags['_xfail_checks'].update({'check_fit2d_1sample': 'Fail during parameter check since min/max resources requires more samples'})\n return tags" }, { @@ -130403,7 +140319,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "evaluate_candidates", @@ -130413,13 +140330,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _run_search(self, evaluate_candidates):\n candidate_params = self._generate_candidate_params()\n if self.resource != 'n_samples' and any((self.resource in candidate for candidate in candidate_params)):\n raise ValueError(f'Cannot use parameter {self.resource} as the resource since it is part of the searched parameters.')\n n_required_iterations = 1 + floor(log(len(candidate_params), self.factor))\n if self.min_resources == 'exhaust':\n last_iteration = n_required_iterations - 1\n self.min_resources_ = max(self.min_resources_, self.max_resources_ // self.factor**last_iteration)\n n_possible_iterations = 1 + floor(log(self.max_resources_ // self.min_resources_, self.factor))\n if self.aggressive_elimination:\n n_iterations = n_required_iterations\n else:\n n_iterations = min(n_possible_iterations, n_required_iterations)\n if self.verbose:\n print(f'n_iterations: {n_iterations}')\n print(f'n_required_iterations: {n_required_iterations}')\n print(f'n_possible_iterations: {n_possible_iterations}')\n print(f'min_resources_: {self.min_resources_}')\n print(f'max_resources_: {self.max_resources_}')\n print(f'aggressive_elimination: {self.aggressive_elimination}')\n print(f'factor: {self.factor}')\n self.n_resources_ = []\n self.n_candidates_ = []\n for itr in range(n_iterations):\n power = itr\n if self.aggressive_elimination:\n power = max(0, itr - n_required_iterations + n_possible_iterations)\n n_resources = int(self.factor**power * self.min_resources_)\n n_resources = min(n_resources, self.max_resources_)\n self.n_resources_.append(n_resources)\n n_candidates = len(candidate_params)\n self.n_candidates_.append(n_candidates)\n if self.verbose:\n print('-' * 10)\n print(f'iter: {itr}')\n print(f'n_candidates: {n_candidates}')\n print(f'n_resources: {n_resources}')\n if self.resource == 'n_samples':\n cv = _SubsampleMetaSplitter(base_cv=self._checked_cv_orig, fraction=n_resources 
/ self._n_samples_orig, subsample_test=True, random_state=self.random_state)\n else:\n candidate_params = [c.copy() for c in candidate_params]\n for candidate in candidate_params:\n candidate[self.resource] = n_resources\n cv = self._checked_cv_orig\n more_results = {'iter': [itr] * n_candidates, 'n_resources': [n_resources] * n_candidates}\n results = evaluate_candidates(candidate_params, cv, more_results=more_results)\n n_candidates_to_keep = ceil(n_candidates / self.factor)\n candidate_params = _top_k(results, n_candidates_to_keep, itr)\n self.n_remaining_candidates_ = len(candidate_params)\n self.n_required_iterations_ = n_required_iterations\n self.n_possible_iterations_ = n_possible_iterations\n self.n_iterations_ = n_iterations" }, { @@ -130437,7 +140355,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "refit_metric", @@ -130447,7 +140366,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "results", @@ -130457,13 +140377,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Custom refit callable to return the index of the best candidate.\n\nWe want the best candidate out of the last iteration. By default BaseSearchCV would return the best candidate out of all iterations. Currently, we only support for a single metric thus `refit` and `refit_metric` are not required.", - "docstring": "Custom refit callable to return the index of the best candidate.\n\nWe want the best candidate out of the last iteration. By default\nBaseSearchCV would return the best candidate out of all iterations.\n\nCurrently, we only support for a single metric thus `refit` and\n`refit_metric` are not required.", + "description": "Custom refit callable to return the index of the best candidate.\n\nWe want the best candidate out of the last iteration. By default\nBaseSearchCV would return the best candidate out of all iterations.\n\nCurrently, we only support for a single metric thus `refit` and\n`refit_metric` are not required.", + "docstring": "Custom refit callable to return the index of the best candidate.\n\n We want the best candidate out of the last iteration. By default\n BaseSearchCV would return the best candidate out of all iterations.\n\n Currently, we only support for a single metric thus `refit` and\n `refit_metric` are not required.\n ", "source_code": "\n@staticmethod\ndef _select_best_index(refit, refit_metric, results):\n \"\"\"Custom refit callable to return the index of the best candidate.\n\n We want the best candidate out of the last iteration. By default\n BaseSearchCV would return the best candidate out of all iterations.\n\n Currently, we only support for a single metric thus `refit` and\n `refit_metric` are not required.\n \"\"\"\n last_iter = np.max(results['iter'])\n last_iter_indices = np.flatnonzero(results['iter'] == last_iter)\n best_idx = np.argmax(results['mean_test_score'][last_iter_indices])\n return last_iter_indices[best_idx]" }, { @@ -130481,7 +140402,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -130491,7 +140413,8 @@ "docstring": { "type": "array-like, shape (n_samples, n_features)", "description": "Training vector, where `n_samples` is the number of samples and\n`n_features` is the number of features." 
- } + }, + "refined_type": {} }, { "name": "y", @@ -130501,7 +140424,8 @@ "docstring": { "type": "array-like, shape (n_samples,) or (n_samples, n_output), optional", "description": "Target relative to X for classification or regression;\nNone for unsupervised learning." - } + }, + "refined_type": {} }, { "name": "groups", @@ -130511,13 +140435,14 @@ "docstring": { "type": "array-like of shape (n_samples,), default=None", "description": "Group labels for the samples used while splitting the dataset into\ntrain/test set. Only used in conjunction with a \"Group\" :term:`cv`\ninstance (e.g., :class:`~sklearn.model_selection.GroupKFold`)." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Run fit with all sets of parameters.", - "docstring": "Run fit with all sets of parameters.\n\nParameters\n----------\n\nX : array-like, shape (n_samples, n_features)\n Training vector, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\ny : array-like, shape (n_samples,) or (n_samples, n_output), optional\n Target relative to X for classification or regression;\n None for unsupervised learning.\n\ngroups : array-like of shape (n_samples,), default=None\n Group labels for the samples used while splitting the dataset into\n train/test set. Only used in conjunction with a \"Group\" :term:`cv`\n instance (e.g., :class:`~sklearn.model_selection.GroupKFold`).\n\n**fit_params : dict of string -> object\n Parameters passed to the ``fit`` method of the estimator.\n\nReturns\n-------\nself : object\n Instance of fitted estimator.", + "docstring": "Run fit with all sets of parameters.\n\n Parameters\n ----------\n\n X : array-like, shape (n_samples, n_features)\n Training vector, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\n y : array-like, shape (n_samples,) or (n_samples, n_output), optional\n Target relative to X for classification or regression;\n None for unsupervised learning.\n\n groups : array-like of shape (n_samples,), default=None\n Group labels for the samples used while splitting the dataset into\n train/test set. Only used in conjunction with a \"Group\" :term:`cv`\n instance (e.g., :class:`~sklearn.model_selection.GroupKFold`).\n\n **fit_params : dict of string -> object\n Parameters passed to the ``fit`` method of the estimator.\n\n Returns\n -------\n self : object\n Instance of fitted estimator.\n ", "source_code": "\ndef fit(self, X, y=None, groups=None, **fit_params):\n \"\"\"Run fit with all sets of parameters.\n\n Parameters\n ----------\n\n X : array-like, shape (n_samples, n_features)\n Training vector, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\n y : array-like, shape (n_samples,) or (n_samples, n_output), optional\n Target relative to X for classification or regression;\n None for unsupervised learning.\n\n groups : array-like of shape (n_samples,), default=None\n Group labels for the samples used while splitting the dataset into\n train/test set. 
Only used in conjunction with a \"Group\" :term:`cv`\n instance (e.g., :class:`~sklearn.model_selection.GroupKFold`).\n\n **fit_params : dict of string -> object\n Parameters passed to the ``fit`` method of the estimator.\n\n Returns\n -------\n self : object\n Instance of fitted estimator.\n \"\"\"\n self._checked_cv_orig = check_cv(self.cv, y, classifier=is_classifier(self.estimator))\n self._check_input_parameters(X=X, y=y, groups=groups)\n self._n_samples_orig = _num_samples(X)\n super().fit(X, y=y, groups=groups, **fit_params)\n self.best_score_ = self.cv_results_['mean_test_score'][self.best_index_]\n return self" }, { @@ -130535,7 +140460,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "estimator", @@ -130545,7 +140471,8 @@ "docstring": { "type": "estimator object", "description": "This is assumed to implement the scikit-learn estimator interface.\nEither estimator needs to provide a ``score`` function,\nor ``scoring`` must be passed." - } + }, + "refined_type": {} }, { "name": "param_grid", @@ -130555,7 +140482,8 @@ "docstring": { "type": "dict or list of dictionaries", "description": "Dictionary with parameters names (string) as keys and lists of\nparameter settings to try as values, or a list of such\ndictionaries, in which case the grids spanned by each dictionary\nin the list are explored. This enables searching over any sequence\nof parameter settings." - } + }, + "refined_type": {} }, { "name": "factor", @@ -130565,7 +140493,8 @@ "docstring": { "type": "int or float, default=3", "description": "The 'halving' parameter, which determines the proportion of candidates\nthat are selected for each subsequent iteration. For example,\n``factor=3`` means that only one third of the candidates are selected." - } + }, + "refined_type": {} }, { "name": "resource", @@ -130575,7 +140504,8 @@ "docstring": { "type": "``'n_samples'`` or str, default='n_samples'", "description": "Defines the resource that increases with each iteration. By default,\nthe resource is the number of samples. It can also be set to any\nparameter of the base estimator that accepts positive integer\nvalues, e.g. 'n_iterations' or 'n_estimators' for a gradient\nboosting estimator. In this case ``max_resources`` cannot be 'auto'\nand must be set explicitly." - } + }, + "refined_type": {} }, { "name": "max_resources", @@ -130585,7 +140515,8 @@ "docstring": { "type": "int, default='auto'", "description": "The maximum amount of resource that any candidate is allowed to use\nfor a given iteration. By default, this is set to ``n_samples`` when\n``resource='n_samples'`` (default), else an error is raised." - } + }, + "refined_type": {} }, { "name": "min_resources", @@ -130595,6 +140526,10 @@ "docstring": { "type": "{'exhaust', 'smallest'} or int, default='exhaust'", "description": "The minimum amount of resource that any candidate is allowed to use\nfor a given iteration. Equivalently, this defines the amount of\nresources `r0` that are allocated for each candidate at the first\niteration.\n\n- 'smallest' is a heuristic that sets `r0` to a small value:\n\n - ``n_splits * 2`` when ``resource='n_samples'`` for a regression\n problem\n - ``n_classes * n_splits * 2`` when ``resource='n_samples'`` for a\n classification problem\n - ``1`` when ``resource != 'n_samples'``\n\n- 'exhaust' will set `r0` such that the **last** iteration uses as\n much resources as possible. 
Namely, the last iteration will use the\n highest value smaller than ``max_resources`` that is a multiple of\n both ``min_resources`` and ``factor``. In general, using 'exhaust'\n leads to a more accurate estimator, but is slightly more time\n consuming.\n\nNote that the amount of resources used at each iteration is always a\nmultiple of ``min_resources``." + }, + "refined_type": { + "kind": "EnumType", + "values": ["exhaust", "smallest"] } }, { @@ -130605,7 +140540,8 @@ "docstring": { "type": "bool, default=False", "description": "This is only relevant in cases where there isn't enough resources to\nreduce the remaining candidates to at most `factor` after the last\niteration. If ``True``, then the search process will 'replay' the\nfirst iteration for as long as needed until the number of candidates\nis small enough. This is ``False`` by default, which means that the\nlast iteration may evaluate more than ``factor`` candidates. See\n:ref:`aggressive_elimination` for more details." - } + }, + "refined_type": {} }, { "name": "cv", @@ -130615,7 +140551,8 @@ "docstring": { "type": "int, cross-validation generator or iterable, default=5", "description": "Determines the cross-validation splitting strategy.\nPossible inputs for cv are:\n\n- integer, to specify the number of folds in a `(Stratified)KFold`,\n- :term:`CV splitter`,\n- An iterable yielding (train, test) splits as arrays of indices.\n\nFor integer/None inputs, if the estimator is a classifier and ``y`` is\neither binary or multiclass, :class:`StratifiedKFold` is used. In all\nother cases, :class:`KFold` is used. These splitters are instantiated\nwith `shuffle=False` so the splits will be the same across calls.\n\nRefer :ref:`User Guide ` for the various\ncross-validation strategies that can be used here.\n\n.. note::\n Due to implementation details, the folds produced by `cv` must be\n the same across multiple calls to `cv.split()`. For\n built-in `scikit-learn` iterators, this can be achieved by\n deactivating shuffling (`shuffle=False`), or by setting the\n `cv`'s `random_state` parameter to an integer." - } + }, + "refined_type": {} }, { "name": "scoring", @@ -130625,7 +140562,8 @@ "docstring": { "type": "str, callable, or None, default=None", "description": "A single string (see :ref:`scoring_parameter`) or a callable\n(see :ref:`scoring`) to evaluate the predictions on the test set.\nIf None, the estimator's score method is used." - } + }, + "refined_type": {} }, { "name": "refit", @@ -130635,7 +140573,8 @@ "docstring": { "type": "bool, default=True", "description": "If True, refit an estimator using the best found parameters on the\nwhole dataset.\n\nThe refitted estimator is made available at the ``best_estimator_``\nattribute and permits using ``predict`` directly on this\n``HalvingGridSearchCV`` instance." - } + }, + "refined_type": {} }, { "name": "error_score", @@ -130645,7 +140584,8 @@ "docstring": { "type": "'raise' or numeric", "description": "Value to assign to the score if an error occurs in estimator fitting.\nIf set to 'raise', the error is raised. If a numeric value is given,\nFitFailedWarning is raised. This parameter does not affect the refit\nstep, which will always raise the error. Default is ``np.nan``." 
- } + }, + "refined_type": {} }, { "name": "return_train_score", @@ -130655,7 +140595,8 @@ "docstring": { "type": "bool, default=False", "description": "If ``False``, the ``cv_results_`` attribute will not include training\nscores.\nComputing training scores is used to get insights on how different\nparameter settings impact the overfitting/underfitting trade-off.\nHowever computing the scores on the training set can be computationally\nexpensive and is not strictly required to select the parameters that\nyield the best generalization performance." - } + }, + "refined_type": {} }, { "name": "random_state", @@ -130665,7 +140606,8 @@ "docstring": { "type": "int, RandomState instance or None, default=None", "description": "Pseudo random number generator state used for subsampling the dataset\nwhen `resources != 'n_samples'`. Ignored otherwise.\nPass an int for reproducible output across multiple function calls.\nSee :term:`Glossary `." - } + }, + "refined_type": {} }, { "name": "n_jobs", @@ -130675,7 +140617,8 @@ "docstring": { "type": "int or None, default=None", "description": "Number of jobs to run in parallel.\n``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n``-1`` means using all processors. See :term:`Glossary `\nfor more details." - } + }, + "refined_type": {} }, { "name": "verbose", @@ -130685,13 +140628,14 @@ "docstring": { "type": "int", "description": "Controls the verbosity: the higher, the more messages." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, estimator, param_grid, *, factor=3, resource='n_samples', max_resources='auto', min_resources='exhaust', aggressive_elimination=False, cv=5, scoring=None, refit=True, error_score=np.nan, return_train_score=True, random_state=None, n_jobs=None, verbose=0):\n super().__init__(estimator, scoring=scoring, n_jobs=n_jobs, refit=refit, verbose=verbose, cv=cv, random_state=random_state, error_score=error_score, return_train_score=return_train_score, max_resources=max_resources, resource=resource, factor=factor, min_resources=min_resources, aggressive_elimination=aggressive_elimination)\n self.param_grid = param_grid\n _check_param_grid(self.param_grid)" }, { @@ -130709,13 +140653,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _generate_candidate_params(self):\n return ParameterGrid(self.param_grid)" }, { @@ -130733,7 +140678,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "estimator", @@ -130743,7 +140689,8 @@ "docstring": { "type": "estimator object", "description": "This is assumed to implement the scikit-learn estimator interface.\nEither estimator needs to provide a ``score`` function,\nor ``scoring`` must be passed." - } + }, + "refined_type": {} }, { "name": "param_distributions", @@ -130753,7 +140700,8 @@ "docstring": { "type": "dict", "description": "Dictionary with parameters names (string) as keys and distributions\nor lists of parameters to try. Distributions must provide a ``rvs``\nmethod for sampling (such as those from scipy.stats.distributions).\nIf a list is given, it is sampled uniformly." 
- } + }, + "refined_type": {} }, { "name": "n_candidates", @@ -130763,7 +140711,8 @@ "docstring": { "type": "int, default='exhaust'", "description": "The number of candidate parameters to sample, at the first\niteration. Using 'exhaust' will sample enough candidates so that the\nlast iteration uses as many resources as possible, based on\n`min_resources`, `max_resources` and `factor`. In this case,\n`min_resources` cannot be 'exhaust'." - } + }, + "refined_type": {} }, { "name": "factor", @@ -130773,7 +140722,8 @@ "docstring": { "type": "int or float, default=3", "description": "The 'halving' parameter, which determines the proportion of candidates\nthat are selected for each subsequent iteration. For example,\n``factor=3`` means that only one third of the candidates are selected." - } + }, + "refined_type": {} }, { "name": "resource", @@ -130783,7 +140733,8 @@ "docstring": { "type": "``'n_samples'`` or str, default='n_samples'", "description": "Defines the resource that increases with each iteration. By default,\nthe resource is the number of samples. It can also be set to any\nparameter of the base estimator that accepts positive integer\nvalues, e.g. 'n_iterations' or 'n_estimators' for a gradient\nboosting estimator. In this case ``max_resources`` cannot be 'auto'\nand must be set explicitly." - } + }, + "refined_type": {} }, { "name": "max_resources", @@ -130793,7 +140744,8 @@ "docstring": { "type": "int, default='auto'", "description": "The maximum number of resources that any candidate is allowed to use\nfor a given iteration. By default, this is set ``n_samples`` when\n``resource='n_samples'`` (default), else an error is raised." - } + }, + "refined_type": {} }, { "name": "min_resources", @@ -130803,6 +140755,10 @@ "docstring": { "type": "{'exhaust', 'smallest'} or int, default='smallest'", "description": "The minimum amount of resource that any candidate is allowed to use\nfor a given iteration. Equivalently, this defines the amount of\nresources `r0` that are allocated for each candidate at the first\niteration.\n\n- 'smallest' is a heuristic that sets `r0` to a small value:\n\n - ``n_splits * 2`` when ``resource='n_samples'`` for a regression\n problem\n - ``n_classes * n_splits * 2`` when ``resource='n_samples'`` for a\n classification problem\n - ``1`` when ``resource != 'n_samples'``\n\n- 'exhaust' will set `r0` such that the **last** iteration uses as\n much resources as possible. Namely, the last iteration will use the\n highest value smaller than ``max_resources`` that is a multiple of\n both ``min_resources`` and ``factor``. In general, using 'exhaust'\n leads to a more accurate estimator, but is slightly more time\n consuming. 'exhaust' isn't available when `n_candidates='exhaust'`.\n\nNote that the amount of resources used at each iteration is always a\nmultiple of ``min_resources``." + }, + "refined_type": { + "kind": "EnumType", + "values": ["exhaust", "smallest"] } }, { @@ -130813,7 +140769,8 @@ "docstring": { "type": "bool, default=False", "description": "This is only relevant in cases where there isn't enough resources to\nreduce the remaining candidates to at most `factor` after the last\niteration. If ``True``, then the search process will 'replay' the\nfirst iteration for as long as needed until the number of candidates\nis small enough. This is ``False`` by default, which means that the\nlast iteration may evaluate more than ``factor`` candidates. See\n:ref:`aggressive_elimination` for more details." 
- } + }, + "refined_type": {} }, { "name": "cv", @@ -130823,7 +140780,8 @@ "docstring": { "type": "int, cross-validation generator or an iterable, default=5", "description": "Determines the cross-validation splitting strategy.\nPossible inputs for cv are:\n\n- integer, to specify the number of folds in a `(Stratified)KFold`,\n- :term:`CV splitter`,\n- An iterable yielding (train, test) splits as arrays of indices.\n\nFor integer/None inputs, if the estimator is a classifier and ``y`` is\neither binary or multiclass, :class:`StratifiedKFold` is used. In all\nother cases, :class:`KFold` is used. These splitters are instantiated\nwith `shuffle=False` so the splits will be the same across calls.\n\nRefer :ref:`User Guide ` for the various\ncross-validation strategies that can be used here.\n\n.. note::\n Due to implementation details, the folds produced by `cv` must be\n the same across multiple calls to `cv.split()`. For\n built-in `scikit-learn` iterators, this can be achieved by\n deactivating shuffling (`shuffle=False`), or by setting the\n `cv`'s `random_state` parameter to an integer." - } + }, + "refined_type": {} }, { "name": "scoring", @@ -130833,7 +140791,8 @@ "docstring": { "type": "str, callable, or None, default=None", "description": "A single string (see :ref:`scoring_parameter`) or a callable\n(see :ref:`scoring`) to evaluate the predictions on the test set.\nIf None, the estimator's score method is used." - } + }, + "refined_type": {} }, { "name": "refit", @@ -130843,7 +140802,8 @@ "docstring": { "type": "bool, default=True", "description": "If True, refit an estimator using the best found parameters on the\nwhole dataset.\n\nThe refitted estimator is made available at the ``best_estimator_``\nattribute and permits using ``predict`` directly on this\n``HalvingRandomSearchCV`` instance." - } + }, + "refined_type": {} }, { "name": "error_score", @@ -130853,7 +140813,8 @@ "docstring": { "type": "'raise' or numeric", "description": "Value to assign to the score if an error occurs in estimator fitting.\nIf set to 'raise', the error is raised. If a numeric value is given,\nFitFailedWarning is raised. This parameter does not affect the refit\nstep, which will always raise the error. Default is ``np.nan``." - } + }, + "refined_type": {} }, { "name": "return_train_score", @@ -130863,7 +140824,8 @@ "docstring": { "type": "bool, default=False", "description": "If ``False``, the ``cv_results_`` attribute will not include training\nscores.\nComputing training scores is used to get insights on how different\nparameter settings impact the overfitting/underfitting trade-off.\nHowever computing the scores on the training set can be computationally\nexpensive and is not strictly required to select the parameters that\nyield the best generalization performance." - } + }, + "refined_type": {} }, { "name": "random_state", @@ -130873,7 +140835,8 @@ "docstring": { "type": "int, RandomState instance or None, default=None", "description": "Pseudo random number generator state used for subsampling the dataset\nwhen `resources != 'n_samples'`. Also used for random uniform\nsampling from lists of possible values instead of scipy.stats\ndistributions.\nPass an int for reproducible output across multiple function calls.\nSee :term:`Glossary `." 
- } + }, + "refined_type": {} }, { "name": "n_jobs", @@ -130883,7 +140846,8 @@ "docstring": { "type": "int or None, default=None", "description": "Number of jobs to run in parallel.\n``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n``-1`` means using all processors. See :term:`Glossary `\nfor more details." - } + }, + "refined_type": {} }, { "name": "verbose", @@ -130893,13 +140857,14 @@ "docstring": { "type": "int", "description": "Controls the verbosity: the higher, the more messages." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, estimator, param_distributions, *, n_candidates='exhaust', factor=3, resource='n_samples', max_resources='auto', min_resources='smallest', aggressive_elimination=False, cv=5, scoring=None, refit=True, error_score=np.nan, return_train_score=True, random_state=None, n_jobs=None, verbose=0):\n super().__init__(estimator, scoring=scoring, n_jobs=n_jobs, refit=refit, verbose=verbose, cv=cv, random_state=random_state, error_score=error_score, return_train_score=return_train_score, max_resources=max_resources, resource=resource, factor=factor, min_resources=min_resources, aggressive_elimination=aggressive_elimination)\n self.param_distributions = param_distributions\n self.n_candidates = n_candidates" }, { @@ -130917,13 +140882,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _generate_candidate_params(self):\n n_candidates_first_iter = self.n_candidates\n if n_candidates_first_iter == 'exhaust':\n n_candidates_first_iter = self.max_resources_ // self.min_resources_\n return ParameterSampler(self.param_distributions, n_candidates_first_iter, random_state=self.random_state)" }, { @@ -130941,7 +140907,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "base_cv", @@ -130951,7 +140918,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "fraction", @@ -130961,7 +140929,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "subsample_test", @@ -130971,7 +140940,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "random_state", @@ -130981,13 +140951,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, *, base_cv, fraction, subsample_test, random_state):\n self.base_cv = base_cv\n self.fraction = fraction\n self.subsample_test = subsample_test\n self.random_state = random_state" }, { @@ -131005,7 +140976,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -131015,7 +140987,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -131025,7 +140998,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "groups", @@ -131035,13 +141009,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef split(self, X, y, groups=None):\n for (train_idx, test_idx) in self.base_cv.split(X, y, groups):\n train_idx = 
resample(train_idx, replace=False, random_state=self.random_state, n_samples=int(self.fraction * train_idx.shape[0]))\n if self.subsample_test:\n test_idx = resample(test_idx, replace=False, random_state=self.random_state, n_samples=int(self.fraction * test_idx.shape[0]))\n yield (train_idx, test_idx)" }, { @@ -131059,7 +141034,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "k", @@ -131069,7 +141045,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "itr", @@ -131079,13 +141056,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _top_k(results, k, itr):\n (iteration, mean_test_score, params) = (np.asarray(a) for a in (results['iter'], results['mean_test_score'], results['params']))\n iter_indices = np.flatnonzero(iteration == itr)\n sorted_indices = np.argsort(mean_test_score[iter_indices])\n return np.array(params[iter_indices][sorted_indices[-k:]])" }, { @@ -131103,13 +141081,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __repr__(self):\n return _build_repr(self)" }, { @@ -131127,7 +141106,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -131137,7 +141117,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -131147,7 +141128,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "groups", @@ -131157,7 +141139,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -131181,7 +141164,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -131191,7 +141175,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -131201,7 +141186,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "groups", @@ -131211,13 +141197,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Generates boolean masks corresponding to test sets.\n\nBy default, delegates to _iter_test_indices(X, y, groups)", - "docstring": "Generates boolean masks corresponding to test sets.\n\nBy default, delegates to _iter_test_indices(X, y, groups)", + "docstring": "Generates boolean masks corresponding to test sets.\n\n By default, delegates to _iter_test_indices(X, y, groups)\n ", "source_code": "\ndef _iter_test_masks(self, X=None, y=None, groups=None):\n \"\"\"Generates boolean masks corresponding to test sets.\n\n By default, delegates to _iter_test_indices(X, y, groups)\n \"\"\"\n for test_index in self._iter_test_indices(X, y, groups):\n test_mask = np.zeros(_num_samples(X), dtype=bool)\n test_mask[test_index] = True\n yield test_mask" }, { @@ -131235,7 +141222,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -131245,7 +141233,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -131255,7 +141244,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "groups", @@ -131265,7 +141255,8 @@ "docstring": { "type": "", 
"description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -131289,7 +141280,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -131299,7 +141291,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "Training data, where `n_samples` is the number of samples\nand `n_features` is the number of features." - } + }, + "refined_type": {} }, { "name": "y", @@ -131309,7 +141302,8 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "The target variable for supervised learning problems." - } + }, + "refined_type": {} }, { "name": "groups", @@ -131319,13 +141313,14 @@ "docstring": { "type": "array-like of shape (n_samples,), default=None", "description": "Group labels for the samples used while splitting the dataset into\ntrain/test set." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Generate indices to split data into training and test set.", - "docstring": "Generate indices to split data into training and test set.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n Training data, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\ny : array-like of shape (n_samples,)\n The target variable for supervised learning problems.\n\ngroups : array-like of shape (n_samples,), default=None\n Group labels for the samples used while splitting the dataset into\n train/test set.\n\nYields\n------\ntrain : ndarray\n The training set indices for that split.\n\ntest : ndarray\n The testing set indices for that split.", + "docstring": "Generate indices to split data into training and test set.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training data, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n y : array-like of shape (n_samples,)\n The target variable for supervised learning problems.\n\n groups : array-like of shape (n_samples,), default=None\n Group labels for the samples used while splitting the dataset into\n train/test set.\n\n Yields\n ------\n train : ndarray\n The training set indices for that split.\n\n test : ndarray\n The testing set indices for that split.\n ", "source_code": "\ndef split(self, X, y=None, groups=None):\n \"\"\"Generate indices to split data into training and test set.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training data, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n y : array-like of shape (n_samples,)\n The target variable for supervised learning problems.\n\n groups : array-like of shape (n_samples,), default=None\n Group labels for the samples used while splitting the dataset into\n train/test set.\n\n Yields\n ------\n train : ndarray\n The training set indices for that split.\n\n test : ndarray\n The testing set indices for that split.\n \"\"\"\n (X, y, groups) = indexable(X, y, groups)\n indices = np.arange(_num_samples(X))\n for test_index in self._iter_test_masks(X, y, groups):\n train_index = indices[np.logical_not(test_index)]\n test_index = indices[test_index]\n yield (train_index, test_index)" }, { @@ -131343,7 +141338,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_splits", @@ -131353,7 +141349,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "test_size", @@ -131363,7 +141360,8 @@ 
"docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "train_size", @@ -131373,7 +141371,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "random_state", @@ -131383,13 +141382,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, n_splits=10, *, test_size=None, train_size=None, random_state=None):\n self.n_splits = n_splits\n self.test_size = test_size\n self.train_size = train_size\n self.random_state = random_state\n self._default_test_size = 0.1" }, { @@ -131407,13 +141407,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __repr__(self):\n return _build_repr(self)" }, { @@ -131431,7 +141432,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -131441,7 +141443,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -131451,7 +141454,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "groups", @@ -131461,7 +141465,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -131485,7 +141490,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -131495,7 +141501,8 @@ "docstring": { "type": "object", "description": "Always ignored, exists for compatibility." - } + }, + "refined_type": {} }, { "name": "y", @@ -131505,7 +141512,8 @@ "docstring": { "type": "object", "description": "Always ignored, exists for compatibility." - } + }, + "refined_type": {} }, { "name": "groups", @@ -131515,13 +141523,14 @@ "docstring": { "type": "object", "description": "Always ignored, exists for compatibility." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Returns the number of splitting iterations in the cross-validator", - "docstring": "Returns the number of splitting iterations in the cross-validator\n\nParameters\n----------\nX : object\n Always ignored, exists for compatibility.\n\ny : object\n Always ignored, exists for compatibility.\n\ngroups : object\n Always ignored, exists for compatibility.\n\nReturns\n-------\nn_splits : int\n Returns the number of splitting iterations in the cross-validator.", + "docstring": "Returns the number of splitting iterations in the cross-validator\n\n Parameters\n ----------\n X : object\n Always ignored, exists for compatibility.\n\n y : object\n Always ignored, exists for compatibility.\n\n groups : object\n Always ignored, exists for compatibility.\n\n Returns\n -------\n n_splits : int\n Returns the number of splitting iterations in the cross-validator.\n ", "source_code": "\ndef get_n_splits(self, X=None, y=None, groups=None):\n \"\"\"Returns the number of splitting iterations in the cross-validator\n\n Parameters\n ----------\n X : object\n Always ignored, exists for compatibility.\n\n y : object\n Always ignored, exists for compatibility.\n\n groups : object\n Always ignored, exists for compatibility.\n\n Returns\n -------\n n_splits : int\n Returns the number of splitting iterations in the cross-validator.\n \"\"\"\n return self.n_splits" }, { @@ -131539,7 +141548,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -131549,7 +141559,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "Training data, where `n_samples` is the number of samples\nand `n_features` is the number of features." - } + }, + "refined_type": {} }, { "name": "y", @@ -131559,7 +141570,8 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "The target variable for supervised learning problems." - } + }, + "refined_type": {} }, { "name": "groups", @@ -131569,13 +141581,14 @@ "docstring": { "type": "array-like of shape (n_samples,), default=None", "description": "Group labels for the samples used while splitting the dataset into\ntrain/test set." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Generate indices to split data into training and test set.", - "docstring": "Generate indices to split data into training and test set.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n Training data, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\ny : array-like of shape (n_samples,)\n The target variable for supervised learning problems.\n\ngroups : array-like of shape (n_samples,), default=None\n Group labels for the samples used while splitting the dataset into\n train/test set.\n\nYields\n------\ntrain : ndarray\n The training set indices for that split.\n\ntest : ndarray\n The testing set indices for that split.\n\nNotes\n-----\nRandomized CV splitters may return different results for each call of\nsplit. 
You can make the results identical by setting `random_state`\nto an integer.", + "docstring": "Generate indices to split data into training and test set.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training data, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n y : array-like of shape (n_samples,)\n The target variable for supervised learning problems.\n\n groups : array-like of shape (n_samples,), default=None\n Group labels for the samples used while splitting the dataset into\n train/test set.\n\n Yields\n ------\n train : ndarray\n The training set indices for that split.\n\n test : ndarray\n The testing set indices for that split.\n\n Notes\n -----\n Randomized CV splitters may return different results for each call of\n split. You can make the results identical by setting `random_state`\n to an integer.\n ", "source_code": "\ndef split(self, X, y=None, groups=None):\n \"\"\"Generate indices to split data into training and test set.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training data, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n y : array-like of shape (n_samples,)\n The target variable for supervised learning problems.\n\n groups : array-like of shape (n_samples,), default=None\n Group labels for the samples used while splitting the dataset into\n train/test set.\n\n Yields\n ------\n train : ndarray\n The training set indices for that split.\n\n test : ndarray\n The testing set indices for that split.\n\n Notes\n -----\n Randomized CV splitters may return different results for each call of\n split. You can make the results identical by setting `random_state`\n to an integer.\n \"\"\"\n (X, y, groups) = indexable(X, y, groups)\n for (train, test) in self._iter_indices(X, y, groups):\n yield (train, test)" }, { @@ -131593,7 +141606,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_splits", @@ -131603,13 +141617,14 @@ "docstring": { "type": "int, default=5", "description": "Number of folds. Must be at least 2.\n\n.. versionchanged:: 0.22\n ``n_splits`` default value changed from 3 to 5." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, n_splits=5):\n super().__init__(n_splits, shuffle=False, random_state=None)" }, { @@ -131627,7 +141642,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -131637,7 +141653,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -131647,7 +141664,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "groups", @@ -131657,13 +141675,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _iter_test_indices(self, X, y, groups):\n if groups is None:\n raise ValueError(\"The 'groups' parameter should not be None.\")\n groups = check_array(groups, ensure_2d=False, dtype=None)\n (unique_groups, groups) = np.unique(groups, return_inverse=True)\n n_groups = len(unique_groups)\n if self.n_splits > n_groups:\n raise ValueError('Cannot have number of splits n_splits=%d greater than the number of groups: %d.' 
% (self.n_splits, n_groups))\n n_samples_per_group = np.bincount(groups)\n indices = np.argsort(n_samples_per_group)[::-1]\n n_samples_per_group = n_samples_per_group[indices]\n n_samples_per_fold = np.zeros(self.n_splits)\n group_to_fold = np.zeros(len(unique_groups))\n for (group_index, weight) in enumerate(n_samples_per_group):\n lightest_fold = np.argmin(n_samples_per_fold)\n n_samples_per_fold[lightest_fold] += weight\n group_to_fold[indices[group_index]] = lightest_fold\n indices = group_to_fold[groups]\n for f in range(self.n_splits):\n yield np.where(indices == f)[0]" }, { @@ -131681,7 +141700,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -131691,7 +141711,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "Training data, where `n_samples` is the number of samples\nand `n_features` is the number of features." - } + }, + "refined_type": {} }, { "name": "y", @@ -131701,7 +141722,8 @@ "docstring": { "type": "array-like of shape (n_samples,), default=None", "description": "The target variable for supervised learning problems." - } + }, + "refined_type": {} }, { "name": "groups", @@ -131711,13 +141733,14 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "Group labels for the samples used while splitting the dataset into\ntrain/test set." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Generate indices to split data into training and test set.", - "docstring": "Generate indices to split data into training and test set.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n Training data, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\ny : array-like of shape (n_samples,), default=None\n The target variable for supervised learning problems.\n\ngroups : array-like of shape (n_samples,)\n Group labels for the samples used while splitting the dataset into\n train/test set.\n\nYields\n------\ntrain : ndarray\n The training set indices for that split.\n\ntest : ndarray\n The testing set indices for that split.", + "docstring": "Generate indices to split data into training and test set.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training data, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n y : array-like of shape (n_samples,), default=None\n The target variable for supervised learning problems.\n\n groups : array-like of shape (n_samples,)\n Group labels for the samples used while splitting the dataset into\n train/test set.\n\n Yields\n ------\n train : ndarray\n The training set indices for that split.\n\n test : ndarray\n The testing set indices for that split.\n ", "source_code": "\ndef split(self, X, y=None, groups=None):\n \"\"\"Generate indices to split data into training and test set.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training data, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n y : array-like of shape (n_samples,), default=None\n The target variable for supervised learning problems.\n\n groups : array-like of shape (n_samples,)\n Group labels for the samples used while splitting the dataset into\n train/test set.\n\n Yields\n ------\n train : ndarray\n The training set indices for that split.\n\n test : ndarray\n The testing set indices for that split.\n \"\"\"\n return super().split(X, y, 
groups)" }, { @@ -131735,7 +141758,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_splits", @@ -131745,7 +141769,8 @@ "docstring": { "type": "int, default=5", "description": "Number of re-shuffling & splitting iterations." - } + }, + "refined_type": {} }, { "name": "test_size", @@ -131755,7 +141780,8 @@ "docstring": { "type": "float, int, default=0.2", "description": "If float, should be between 0.0 and 1.0 and represent the proportion\nof groups to include in the test split (rounded up). If int,\nrepresents the absolute number of test groups. If None, the value is\nset to the complement of the train size.\nThe default will change in version 0.21. It will remain 0.2 only\nif ``train_size`` is unspecified, otherwise it will complement\nthe specified ``train_size``." - } + }, + "refined_type": {} }, { "name": "train_size", @@ -131765,7 +141791,8 @@ "docstring": { "type": "float or int, default=None", "description": "If float, should be between 0.0 and 1.0 and represent the\nproportion of the groups to include in the train split. If\nint, represents the absolute number of train groups. If None,\nthe value is automatically set to the complement of the test size." - } + }, + "refined_type": {} }, { "name": "random_state", @@ -131775,13 +141802,14 @@ "docstring": { "type": "int, RandomState instance or None, default=None", "description": "Controls the randomness of the training and testing indices produced.\nPass an int for reproducible output across multiple function calls.\nSee :term:`Glossary `." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, n_splits=5, *, test_size=None, train_size=None, random_state=None):\n super().__init__(n_splits=n_splits, test_size=test_size, train_size=train_size, random_state=random_state)\n self._default_test_size = 0.2" }, { @@ -131799,7 +141827,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -131809,7 +141838,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -131819,7 +141849,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "groups", @@ -131829,13 +141860,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _iter_indices(self, X, y, groups):\n if groups is None:\n raise ValueError(\"The 'groups' parameter should not be None.\")\n groups = check_array(groups, ensure_2d=False, dtype=None)\n (classes, group_indices) = np.unique(groups, return_inverse=True)\n for (group_train, group_test) in super()._iter_indices(X=classes):\n train = np.flatnonzero(np.in1d(group_indices, group_train))\n test = np.flatnonzero(np.in1d(group_indices, group_test))\n yield (train, test)" }, { @@ -131853,7 +141885,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -131863,7 +141896,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "Training data, where `n_samples` is the number of samples\nand `n_features` is the number of features." 
- } + }, + "refined_type": {} }, { "name": "y", @@ -131873,7 +141907,8 @@ "docstring": { "type": "array-like of shape (n_samples,), default=None", "description": "The target variable for supervised learning problems." - } + }, + "refined_type": {} }, { "name": "groups", @@ -131883,13 +141918,14 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "Group labels for the samples used while splitting the dataset into\ntrain/test set." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Generate indices to split data into training and test set.", - "docstring": "Generate indices to split data into training and test set.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n Training data, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\ny : array-like of shape (n_samples,), default=None\n The target variable for supervised learning problems.\n\ngroups : array-like of shape (n_samples,)\n Group labels for the samples used while splitting the dataset into\n train/test set.\n\nYields\n------\ntrain : ndarray\n The training set indices for that split.\n\ntest : ndarray\n The testing set indices for that split.\n\nNotes\n-----\nRandomized CV splitters may return different results for each call of\nsplit. You can make the results identical by setting `random_state`\nto an integer.", + "docstring": "Generate indices to split data into training and test set.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training data, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n y : array-like of shape (n_samples,), default=None\n The target variable for supervised learning problems.\n\n groups : array-like of shape (n_samples,)\n Group labels for the samples used while splitting the dataset into\n train/test set.\n\n Yields\n ------\n train : ndarray\n The training set indices for that split.\n\n test : ndarray\n The testing set indices for that split.\n\n Notes\n -----\n Randomized CV splitters may return different results for each call of\n split. You can make the results identical by setting `random_state`\n to an integer.\n ", "source_code": "\ndef split(self, X, y=None, groups=None):\n \"\"\"Generate indices to split data into training and test set.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training data, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n y : array-like of shape (n_samples,), default=None\n The target variable for supervised learning problems.\n\n groups : array-like of shape (n_samples,)\n Group labels for the samples used while splitting the dataset into\n train/test set.\n\n Yields\n ------\n train : ndarray\n The training set indices for that split.\n\n test : ndarray\n The testing set indices for that split.\n\n Notes\n -----\n Randomized CV splitters may return different results for each call of\n split. You can make the results identical by setting `random_state`\n to an integer.\n \"\"\"\n return super().split(X, y, groups)" }, { @@ -131907,7 +141943,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_splits", @@ -131917,7 +141954,8 @@ "docstring": { "type": "int, default=5", "description": "Number of folds. Must be at least 2.\n\n.. versionchanged:: 0.22\n ``n_splits`` default value changed from 3 to 5." 
- } + }, + "refined_type": {} }, { "name": "shuffle", @@ -131927,7 +141965,8 @@ "docstring": { "type": "bool, default=False", "description": "Whether to shuffle the data before splitting into batches.\nNote that the samples within each split will not be shuffled." - } + }, + "refined_type": {} }, { "name": "random_state", @@ -131937,13 +141976,14 @@ "docstring": { "type": "int, RandomState instance or None, default=None", "description": "When `shuffle` is True, `random_state` affects the ordering of the\nindices, which controls the randomness of each fold. Otherwise, this\nparameter has no effect.\nPass an int for reproducible output across multiple function calls.\nSee :term:`Glossary `." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, n_splits=5, *, shuffle=False, random_state=None):\n super().__init__(n_splits=n_splits, shuffle=shuffle, random_state=random_state)" }, { @@ -131961,7 +142001,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -131971,7 +142012,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -131981,7 +142023,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "groups", @@ -131991,13 +142034,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _iter_test_indices(self, X, y=None, groups=None):\n n_samples = _num_samples(X)\n indices = np.arange(n_samples)\n if self.shuffle:\n check_random_state(self.random_state).shuffle(indices)\n n_splits = self.n_splits\n fold_sizes = np.full(n_splits, n_samples // n_splits, dtype=int)\n fold_sizes[:n_samples % n_splits] += 1\n current = 0\n for fold_size in fold_sizes:\n (start, stop) = (current, current + fold_size)\n yield indices[start:stop]\n current = stop" }, { @@ -132015,7 +142059,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -132025,7 +142070,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -132035,7 +142081,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "groups", @@ -132045,13 +142092,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _iter_test_masks(self, X, y, groups):\n if groups is None:\n raise ValueError(\"The 'groups' parameter should not be None.\")\n groups = check_array(groups, copy=True, ensure_2d=False, dtype=None)\n unique_groups = np.unique(groups)\n if len(unique_groups) <= 1:\n raise ValueError('The groups parameter contains fewer than 2 unique groups (%s). LeaveOneGroupOut expects at least 2.' % unique_groups)\n for i in unique_groups:\n yield groups == i" }, { @@ -132069,7 +142117,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -132079,7 +142128,8 @@ "docstring": { "type": "object", "description": "Always ignored, exists for compatibility." - } + }, + "refined_type": {} }, { "name": "y", @@ -132089,7 +142139,8 @@ "docstring": { "type": "object", "description": "Always ignored, exists for compatibility." 
- } + }, + "refined_type": {} }, { "name": "groups", @@ -132099,13 +142150,14 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "Group labels for the samples used while splitting the dataset into\ntrain/test set. This 'groups' parameter must always be specified to\ncalculate the number of splits, though the other parameters can be\nomitted." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Returns the number of splitting iterations in the cross-validator", - "docstring": "Returns the number of splitting iterations in the cross-validator\n\nParameters\n----------\nX : object\n Always ignored, exists for compatibility.\n\ny : object\n Always ignored, exists for compatibility.\n\ngroups : array-like of shape (n_samples,)\n Group labels for the samples used while splitting the dataset into\n train/test set. This 'groups' parameter must always be specified to\n calculate the number of splits, though the other parameters can be\n omitted.\n\nReturns\n-------\nn_splits : int\n Returns the number of splitting iterations in the cross-validator.", + "docstring": "Returns the number of splitting iterations in the cross-validator\n\n Parameters\n ----------\n X : object\n Always ignored, exists for compatibility.\n\n y : object\n Always ignored, exists for compatibility.\n\n groups : array-like of shape (n_samples,)\n Group labels for the samples used while splitting the dataset into\n train/test set. This 'groups' parameter must always be specified to\n calculate the number of splits, though the other parameters can be\n omitted.\n\n Returns\n -------\n n_splits : int\n Returns the number of splitting iterations in the cross-validator.\n ", "source_code": "\ndef get_n_splits(self, X=None, y=None, groups=None):\n \"\"\"Returns the number of splitting iterations in the cross-validator\n\n Parameters\n ----------\n X : object\n Always ignored, exists for compatibility.\n\n y : object\n Always ignored, exists for compatibility.\n\n groups : array-like of shape (n_samples,)\n Group labels for the samples used while splitting the dataset into\n train/test set. This 'groups' parameter must always be specified to\n calculate the number of splits, though the other parameters can be\n omitted.\n\n Returns\n -------\n n_splits : int\n Returns the number of splitting iterations in the cross-validator.\n \"\"\"\n if groups is None:\n raise ValueError(\"The 'groups' parameter should not be None.\")\n groups = check_array(groups, ensure_2d=False, dtype=None)\n return len(np.unique(groups))" }, { @@ -132123,7 +142175,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -132133,7 +142186,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "Training data, where `n_samples` is the number of samples\nand `n_features` is the number of features." - } + }, + "refined_type": {} }, { "name": "y", @@ -132143,7 +142197,8 @@ "docstring": { "type": "array-like of shape (n_samples,), default=None", "description": "The target variable for supervised learning problems." - } + }, + "refined_type": {} }, { "name": "groups", @@ -132153,13 +142208,14 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "Group labels for the samples used while splitting the dataset into\ntrain/test set." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Generate indices to split data into training and test set.", - "docstring": "Generate indices to split data into training and test set.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n Training data, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\ny : array-like of shape (n_samples,), default=None\n The target variable for supervised learning problems.\n\ngroups : array-like of shape (n_samples,)\n Group labels for the samples used while splitting the dataset into\n train/test set.\n\nYields\n------\ntrain : ndarray\n The training set indices for that split.\n\ntest : ndarray\n The testing set indices for that split.", + "docstring": "Generate indices to split data into training and test set.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training data, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n y : array-like of shape (n_samples,), default=None\n The target variable for supervised learning problems.\n\n groups : array-like of shape (n_samples,)\n Group labels for the samples used while splitting the dataset into\n train/test set.\n\n Yields\n ------\n train : ndarray\n The training set indices for that split.\n\n test : ndarray\n The testing set indices for that split.\n ", "source_code": "\ndef split(self, X, y=None, groups=None):\n \"\"\"Generate indices to split data into training and test set.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training data, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n y : array-like of shape (n_samples,), default=None\n The target variable for supervised learning problems.\n\n groups : array-like of shape (n_samples,)\n Group labels for the samples used while splitting the dataset into\n train/test set.\n\n Yields\n ------\n train : ndarray\n The training set indices for that split.\n\n test : ndarray\n The testing set indices for that split.\n \"\"\"\n return super().split(X, y, groups)" }, { @@ -132177,7 +142233,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -132187,7 +142244,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -132197,7 +142255,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "groups", @@ -132207,13 +142266,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _iter_test_indices(self, X, y=None, groups=None):\n n_samples = _num_samples(X)\n if n_samples <= 1:\n raise ValueError('Cannot perform LeaveOneOut with n_samples={}.'.format(n_samples))\n return range(n_samples)" }, { @@ -132231,7 +142291,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -132241,7 +142302,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "Training data, where `n_samples` is the number of samples\nand `n_features` is the number of features." - } + }, + "refined_type": {} }, { "name": "y", @@ -132251,7 +142313,8 @@ "docstring": { "type": "object", "description": "Always ignored, exists for compatibility." 
- } + }, + "refined_type": {} }, { "name": "groups", @@ -132261,13 +142324,14 @@ "docstring": { "type": "object", "description": "Always ignored, exists for compatibility." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Returns the number of splitting iterations in the cross-validator", - "docstring": "Returns the number of splitting iterations in the cross-validator\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n Training data, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\ny : object\n Always ignored, exists for compatibility.\n\ngroups : object\n Always ignored, exists for compatibility.\n\nReturns\n-------\nn_splits : int\n Returns the number of splitting iterations in the cross-validator.", + "docstring": "Returns the number of splitting iterations in the cross-validator\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training data, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n y : object\n Always ignored, exists for compatibility.\n\n groups : object\n Always ignored, exists for compatibility.\n\n Returns\n -------\n n_splits : int\n Returns the number of splitting iterations in the cross-validator.\n ", "source_code": "\ndef get_n_splits(self, X, y=None, groups=None):\n \"\"\"Returns the number of splitting iterations in the cross-validator\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training data, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n y : object\n Always ignored, exists for compatibility.\n\n groups : object\n Always ignored, exists for compatibility.\n\n Returns\n -------\n n_splits : int\n Returns the number of splitting iterations in the cross-validator.\n \"\"\"\n if X is None:\n raise ValueError(\"The 'X' parameter should not be None.\")\n return _num_samples(X)" }, { @@ -132285,7 +142349,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_groups", @@ -132295,13 +142360,14 @@ "docstring": { "type": "int", "description": "Number of groups (``p``) to leave out in the test split." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, n_groups):\n self.n_groups = n_groups" }, { @@ -132319,7 +142385,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -132329,7 +142396,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -132339,7 +142407,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "groups", @@ -132349,13 +142418,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _iter_test_masks(self, X, y, groups):\n if groups is None:\n raise ValueError(\"The 'groups' parameter should not be None.\")\n groups = check_array(groups, copy=True, ensure_2d=False, dtype=None)\n unique_groups = np.unique(groups)\n if self.n_groups >= len(unique_groups):\n raise ValueError('The groups parameter contains fewer than (or equal to) n_groups (%d) numbers of unique groups (%s). 
LeavePGroupsOut expects that at least n_groups + 1 (%d) unique groups be present' % (self.n_groups, unique_groups, self.n_groups + 1))\n combi = combinations(range(len(unique_groups)), self.n_groups)\n for indices in combi:\n test_index = np.zeros(_num_samples(X), dtype=bool)\n for l in unique_groups[np.array(indices)]:\n test_index[groups == l] = True\n yield test_index" }, { @@ -132373,7 +142443,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -132383,7 +142454,8 @@ "docstring": { "type": "object", "description": "Always ignored, exists for compatibility." - } + }, + "refined_type": {} }, { "name": "y", @@ -132393,7 +142465,8 @@ "docstring": { "type": "object", "description": "Always ignored, exists for compatibility." - } + }, + "refined_type": {} }, { "name": "groups", @@ -132403,13 +142476,14 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "Group labels for the samples used while splitting the dataset into\ntrain/test set. This 'groups' parameter must always be specified to\ncalculate the number of splits, though the other parameters can be\nomitted." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Returns the number of splitting iterations in the cross-validator", - "docstring": "Returns the number of splitting iterations in the cross-validator\n\nParameters\n----------\nX : object\n Always ignored, exists for compatibility.\n\ny : object\n Always ignored, exists for compatibility.\n\ngroups : array-like of shape (n_samples,)\n Group labels for the samples used while splitting the dataset into\n train/test set. This 'groups' parameter must always be specified to\n calculate the number of splits, though the other parameters can be\n omitted.\n\nReturns\n-------\nn_splits : int\n Returns the number of splitting iterations in the cross-validator.", + "docstring": "Returns the number of splitting iterations in the cross-validator\n\n Parameters\n ----------\n X : object\n Always ignored, exists for compatibility.\n\n y : object\n Always ignored, exists for compatibility.\n\n groups : array-like of shape (n_samples,)\n Group labels for the samples used while splitting the dataset into\n train/test set. This 'groups' parameter must always be specified to\n calculate the number of splits, though the other parameters can be\n omitted.\n\n Returns\n -------\n n_splits : int\n Returns the number of splitting iterations in the cross-validator.\n ", "source_code": "\ndef get_n_splits(self, X=None, y=None, groups=None):\n \"\"\"Returns the number of splitting iterations in the cross-validator\n\n Parameters\n ----------\n X : object\n Always ignored, exists for compatibility.\n\n y : object\n Always ignored, exists for compatibility.\n\n groups : array-like of shape (n_samples,)\n Group labels for the samples used while splitting the dataset into\n train/test set. 
This 'groups' parameter must always be specified to\n calculate the number of splits, though the other parameters can be\n omitted.\n\n Returns\n -------\n n_splits : int\n Returns the number of splitting iterations in the cross-validator.\n \"\"\"\n if groups is None:\n raise ValueError(\"The 'groups' parameter should not be None.\")\n groups = check_array(groups, ensure_2d=False, dtype=None)\n return int(comb(len(np.unique(groups)), self.n_groups, exact=True))" }, { @@ -132427,7 +142501,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -132437,7 +142512,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "Training data, where `n_samples` is the number of samples\nand `n_features` is the number of features." - } + }, + "refined_type": {} }, { "name": "y", @@ -132447,7 +142523,8 @@ "docstring": { "type": "array-like of shape (n_samples,), default=None", "description": "The target variable for supervised learning problems." - } + }, + "refined_type": {} }, { "name": "groups", @@ -132457,13 +142534,14 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "Group labels for the samples used while splitting the dataset into\ntrain/test set." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Generate indices to split data into training and test set.", - "docstring": "Generate indices to split data into training and test set.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n Training data, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\ny : array-like of shape (n_samples,), default=None\n The target variable for supervised learning problems.\n\ngroups : array-like of shape (n_samples,)\n Group labels for the samples used while splitting the dataset into\n train/test set.\n\nYields\n------\ntrain : ndarray\n The training set indices for that split.\n\ntest : ndarray\n The testing set indices for that split.", + "docstring": "Generate indices to split data into training and test set.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training data, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n y : array-like of shape (n_samples,), default=None\n The target variable for supervised learning problems.\n\n groups : array-like of shape (n_samples,)\n Group labels for the samples used while splitting the dataset into\n train/test set.\n\n Yields\n ------\n train : ndarray\n The training set indices for that split.\n\n test : ndarray\n The testing set indices for that split.\n ", "source_code": "\ndef split(self, X, y=None, groups=None):\n \"\"\"Generate indices to split data into training and test set.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training data, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n y : array-like of shape (n_samples,), default=None\n The target variable for supervised learning problems.\n\n groups : array-like of shape (n_samples,)\n Group labels for the samples used while splitting the dataset into\n train/test set.\n\n Yields\n ------\n train : ndarray\n The training set indices for that split.\n\n test : ndarray\n The testing set indices for that split.\n \"\"\"\n return super().split(X, y, groups)" }, { @@ -132481,7 +142559,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { 
"name": "p", @@ -132491,13 +142570,14 @@ "docstring": { "type": "int", "description": "Size of the test sets. Must be strictly less than the number of\nsamples." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, p):\n self.p = p" }, { @@ -132515,7 +142595,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -132525,7 +142606,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -132535,7 +142617,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "groups", @@ -132545,13 +142628,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _iter_test_indices(self, X, y=None, groups=None):\n n_samples = _num_samples(X)\n if n_samples <= self.p:\n raise ValueError('p={} must be strictly less than the number of samples={}'.format(self.p, n_samples))\n for combination in combinations(range(n_samples), self.p):\n yield np.array(combination)" }, { @@ -132569,7 +142653,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -132579,7 +142664,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "Training data, where `n_samples` is the number of samples\nand `n_features` is the number of features." - } + }, + "refined_type": {} }, { "name": "y", @@ -132589,7 +142675,8 @@ "docstring": { "type": "object", "description": "Always ignored, exists for compatibility." - } + }, + "refined_type": {} }, { "name": "groups", @@ -132599,13 +142686,14 @@ "docstring": { "type": "object", "description": "Always ignored, exists for compatibility." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Returns the number of splitting iterations in the cross-validator", - "docstring": "Returns the number of splitting iterations in the cross-validator\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n Training data, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\ny : object\n Always ignored, exists for compatibility.\n\ngroups : object\n Always ignored, exists for compatibility.", + "docstring": "Returns the number of splitting iterations in the cross-validator\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training data, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n y : object\n Always ignored, exists for compatibility.\n\n groups : object\n Always ignored, exists for compatibility.\n ", "source_code": "\ndef get_n_splits(self, X, y=None, groups=None):\n \"\"\"Returns the number of splitting iterations in the cross-validator\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training data, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n y : object\n Always ignored, exists for compatibility.\n\n groups : object\n Always ignored, exists for compatibility.\n \"\"\"\n if X is None:\n raise ValueError(\"The 'X' parameter should not be None.\")\n return int(comb(_num_samples(X), self.p, exact=True))" }, { @@ -132623,7 +142711,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "test_fold", @@ -132633,13 +142722,14 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "The entry ``test_fold[i]`` represents the index of the test set that\nsample ``i`` belongs to. It is possible to exclude sample ``i`` from\nany test set (i.e. include sample ``i`` in every training set) by\nsetting ``test_fold[i]`` equal to -1." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, test_fold):\n self.test_fold = np.array(test_fold, dtype=int)\n self.test_fold = column_or_1d(self.test_fold)\n self.unique_folds = np.unique(self.test_fold)\n self.unique_folds = self.unique_folds[self.unique_folds != -1]" }, { @@ -132657,7 +142747,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -132681,7 +142772,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -132691,7 +142783,8 @@ "docstring": { "type": "object", "description": "Always ignored, exists for compatibility." - } + }, + "refined_type": {} }, { "name": "y", @@ -132701,7 +142794,8 @@ "docstring": { "type": "object", "description": "Always ignored, exists for compatibility." - } + }, + "refined_type": {} }, { "name": "groups", @@ -132711,13 +142805,14 @@ "docstring": { "type": "object", "description": "Always ignored, exists for compatibility." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Returns the number of splitting iterations in the cross-validator", - "docstring": "Returns the number of splitting iterations in the cross-validator\n\nParameters\n----------\nX : object\n Always ignored, exists for compatibility.\n\ny : object\n Always ignored, exists for compatibility.\n\ngroups : object\n Always ignored, exists for compatibility.\n\nReturns\n-------\nn_splits : int\n Returns the number of splitting iterations in the cross-validator.", + "docstring": "Returns the number of splitting iterations in the cross-validator\n\n Parameters\n ----------\n X : object\n Always ignored, exists for compatibility.\n\n y : object\n Always ignored, exists for compatibility.\n\n groups : object\n Always ignored, exists for compatibility.\n\n Returns\n -------\n n_splits : int\n Returns the number of splitting iterations in the cross-validator.\n ", "source_code": "\ndef get_n_splits(self, X=None, y=None, groups=None):\n \"\"\"Returns the number of splitting iterations in the cross-validator\n\n Parameters\n ----------\n X : object\n Always ignored, exists for compatibility.\n\n y : object\n Always ignored, exists for compatibility.\n\n groups : object\n Always ignored, exists for compatibility.\n\n Returns\n -------\n n_splits : int\n Returns the number of splitting iterations in the cross-validator.\n \"\"\"\n return len(self.unique_folds)" }, { @@ -132735,7 +142830,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -132745,7 +142841,8 @@ "docstring": { "type": "object", "description": "Always ignored, exists for compatibility." - } + }, + "refined_type": {} }, { "name": "y", @@ -132755,7 +142852,8 @@ "docstring": { "type": "object", "description": "Always ignored, exists for compatibility." - } + }, + "refined_type": {} }, { "name": "groups", @@ -132765,13 +142863,14 @@ "docstring": { "type": "object", "description": "Always ignored, exists for compatibility." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Generate indices to split data into training and test set.", - "docstring": "Generate indices to split data into training and test set.\n\nParameters\n----------\nX : object\n Always ignored, exists for compatibility.\n\ny : object\n Always ignored, exists for compatibility.\n\ngroups : object\n Always ignored, exists for compatibility.\n\nYields\n------\ntrain : ndarray\n The training set indices for that split.\n\ntest : ndarray\n The testing set indices for that split.", + "docstring": "Generate indices to split data into training and test set.\n\n Parameters\n ----------\n X : object\n Always ignored, exists for compatibility.\n\n y : object\n Always ignored, exists for compatibility.\n\n groups : object\n Always ignored, exists for compatibility.\n\n Yields\n ------\n train : ndarray\n The training set indices for that split.\n\n test : ndarray\n The testing set indices for that split.\n ", "source_code": "\ndef split(self, X=None, y=None, groups=None):\n \"\"\"Generate indices to split data into training and test set.\n\n Parameters\n ----------\n X : object\n Always ignored, exists for compatibility.\n\n y : object\n Always ignored, exists for compatibility.\n\n groups : object\n Always ignored, exists for compatibility.\n\n Yields\n ------\n train : ndarray\n The training set indices for that split.\n\n test : ndarray\n The testing set indices for that split.\n \"\"\"\n ind = np.arange(len(self.test_fold))\n for test_index in self._iter_test_masks():\n train_index = ind[np.logical_not(test_index)]\n test_index = ind[test_index]\n yield (train_index, test_index)" }, { @@ -132789,7 +142888,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_splits", @@ -132799,7 +142899,8 @@ "docstring": { "type": "int, default=5", "description": "Number of folds. Must be at least 2." - } + }, + "refined_type": {} }, { "name": "n_repeats", @@ -132809,7 +142910,8 @@ "docstring": { "type": "int, default=10", "description": "Number of times cross-validator needs to be repeated." - } + }, + "refined_type": {} }, { "name": "random_state", @@ -132819,13 +142921,14 @@ "docstring": { "type": "int, RandomState instance or None, default=None", "description": "Controls the randomness of each repeated cross-validation instance.\nPass an int for reproducible output across multiple function calls.\nSee :term:`Glossary `." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, *, n_splits=5, n_repeats=10, random_state=None):\n super().__init__(KFold, n_repeats=n_repeats, random_state=random_state, n_splits=n_splits)" }, { @@ -132843,7 +142946,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_splits", @@ -132853,7 +142957,8 @@ "docstring": { "type": "int, default=5", "description": "Number of folds. Must be at least 2." - } + }, + "refined_type": {} }, { "name": "n_repeats", @@ -132863,7 +142968,8 @@ "docstring": { "type": "int, default=10", "description": "Number of times cross-validator needs to be repeated." 
- } + }, + "refined_type": {} }, { "name": "random_state", @@ -132873,13 +142979,14 @@ "docstring": { "type": "int, RandomState instance or None, default=None", "description": "Controls the generation of the random states for each repetition.\nPass an int for reproducible output across multiple function calls.\nSee :term:`Glossary `." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, *, n_splits=5, n_repeats=10, random_state=None):\n super().__init__(StratifiedKFold, n_repeats=n_repeats, random_state=random_state, n_splits=n_splits)" }, { @@ -132897,7 +143004,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_splits", @@ -132907,7 +143015,8 @@ "docstring": { "type": "int, default=10", "description": "Number of re-shuffling & splitting iterations." - } + }, + "refined_type": {} }, { "name": "test_size", @@ -132917,7 +143026,8 @@ "docstring": { "type": "float or int, default=None", "description": "If float, should be between 0.0 and 1.0 and represent the proportion\nof the dataset to include in the test split. If int, represents the\nabsolute number of test samples. If None, the value is set to the\ncomplement of the train size. If ``train_size`` is also None, it will\nbe set to 0.1." - } + }, + "refined_type": {} }, { "name": "train_size", @@ -132927,7 +143037,8 @@ "docstring": { "type": "float or int, default=None", "description": "If float, should be between 0.0 and 1.0 and represent the\nproportion of the dataset to include in the train split. If\nint, represents the absolute number of train samples. If None,\nthe value is automatically set to the complement of the test size." - } + }, + "refined_type": {} }, { "name": "random_state", @@ -132937,13 +143048,14 @@ "docstring": { "type": "int, RandomState instance or None, default=None", "description": "Controls the randomness of the training and testing indices produced.\nPass an int for reproducible output across multiple function calls.\nSee :term:`Glossary `." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, n_splits=10, *, test_size=None, train_size=None, random_state=None):\n super().__init__(n_splits=n_splits, test_size=test_size, train_size=train_size, random_state=random_state)\n self._default_test_size = 0.1" }, { @@ -132961,7 +143073,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -132971,7 +143084,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -132981,7 +143095,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "groups", @@ -132991,13 +143106,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _iter_indices(self, X, y=None, groups=None):\n n_samples = _num_samples(X)\n (n_train, n_test) = _validate_shuffle_split(n_samples, self.test_size, self.train_size, default_test_size=self._default_test_size)\n rng = check_random_state(self.random_state)\n for i in range(self.n_splits):\n permutation = rng.permutation(n_samples)\n ind_test = permutation[:n_test]\n ind_train = permutation[n_test:n_test + n_train]\n yield (ind_train, ind_test)" }, { @@ -133015,7 +143131,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_splits", @@ -133025,7 +143142,8 @@ "docstring": { "type": "int, default=5", "description": "Number of folds. Must be at least 2." - } + }, + "refined_type": {} }, { "name": "shuffle", @@ -133035,7 +143153,8 @@ "docstring": { "type": "bool, default=False", "description": "Whether to shuffle each class's samples before splitting into batches.\nNote that the samples within each split will not be shuffled.\nThis implementation can only shuffle groups that have approximately the\nsame y distribution, no global shuffle will be performed." - } + }, + "refined_type": {} }, { "name": "random_state", @@ -133045,13 +143164,14 @@ "docstring": { "type": "int or RandomState instance, default=None", "description": "When `shuffle` is True, `random_state` affects the ordering of the\nindices, which controls the randomness of each fold for each class.\nOtherwise, leave `random_state` as `None`.\nPass an int for reproducible output across multiple function calls.\nSee :term:`Glossary `." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, n_splits=5, shuffle=False, random_state=None):\n super().__init__(n_splits=n_splits, shuffle=shuffle, random_state=random_state)" }, { @@ -133069,7 +143189,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y_counts_per_fold", @@ -133079,7 +143200,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y_cnt", @@ -133089,7 +143211,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "group_y_counts", @@ -133099,13 +143222,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _find_best_fold(self, y_counts_per_fold, y_cnt, group_y_counts):\n best_fold = None\n min_eval = np.inf\n min_samples_in_fold = np.inf\n for i in range(self.n_splits):\n y_counts_per_fold[i] += group_y_counts\n std_per_class = np.std(y_counts_per_fold / y_cnt.reshape(1, -1), axis=0)\n y_counts_per_fold[i] -= group_y_counts\n fold_eval = np.mean(std_per_class)\n samples_in_fold = np.sum(y_counts_per_fold[i])\n is_current_fold_better = fold_eval < min_eval or np.isclose(fold_eval, min_eval) and samples_in_fold < min_samples_in_fold\n if is_current_fold_better:\n min_eval = fold_eval\n min_samples_in_fold = samples_in_fold\n best_fold = i\n return best_fold" }, { @@ -133123,7 +143247,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -133133,7 +143258,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -133143,7 +143269,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "groups", @@ -133153,13 +143280,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _iter_test_indices(self, X, y, groups):\n rng = check_random_state(self.random_state)\n y = np.asarray(y)\n type_of_target_y = type_of_target(y)\n allowed_target_types = ('binary', 'multiclass')\n if type_of_target_y not in allowed_target_types:\n raise ValueError('Supported target types are: {}. Got {!r} instead.'.format(allowed_target_types, type_of_target_y))\n y = column_or_1d(y)\n (_, y_inv, y_cnt) = np.unique(y, return_inverse=True, return_counts=True)\n if np.all(self.n_splits > y_cnt):\n raise ValueError('n_splits=%d cannot be greater than the number of members in each class.' % self.n_splits)\n n_smallest_class = np.min(y_cnt)\n if self.n_splits > n_smallest_class:\n warnings.warn('The least populated class in y has only %d members, which is less than n_splits=%d.' 
% (n_smallest_class, self.n_splits), UserWarning)\n n_classes = len(y_cnt)\n (_, groups_inv, groups_cnt) = np.unique(groups, return_inverse=True, return_counts=True)\n y_counts_per_group = np.zeros((len(groups_cnt), n_classes))\n for (class_idx, group_idx) in zip(y_inv, groups_inv):\n y_counts_per_group[group_idx, class_idx] += 1\n y_counts_per_fold = np.zeros((self.n_splits, n_classes))\n groups_per_fold = defaultdict(set)\n if self.shuffle:\n rng.shuffle(y_counts_per_group)\n sorted_groups_idx = np.argsort(-np.std(y_counts_per_group, axis=1), kind='mergesort')\n for group_idx in sorted_groups_idx:\n group_y_counts = y_counts_per_group[group_idx]\n best_fold = self._find_best_fold(y_counts_per_fold=y_counts_per_fold, y_cnt=y_cnt, group_y_counts=group_y_counts)\n y_counts_per_fold[best_fold] += group_y_counts\n groups_per_fold[best_fold].add(group_idx)\n for i in range(self.n_splits):\n test_indices = [idx for (idx, group_idx) in enumerate(groups_inv) if group_idx in groups_per_fold[i]]\n yield test_indices" }, { @@ -133177,7 +143305,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_splits", @@ -133187,7 +143316,8 @@ "docstring": { "type": "int, default=5", "description": "Number of folds. Must be at least 2.\n\n.. versionchanged:: 0.22\n ``n_splits`` default value changed from 3 to 5." - } + }, + "refined_type": {} }, { "name": "shuffle", @@ -133197,7 +143327,8 @@ "docstring": { "type": "bool, default=False", "description": "Whether to shuffle each class's samples before splitting into batches.\nNote that the samples within each split will not be shuffled." - } + }, + "refined_type": {} }, { "name": "random_state", @@ -133207,13 +143338,14 @@ "docstring": { "type": "int, RandomState instance or None, default=None", "description": "When `shuffle` is True, `random_state` affects the ordering of the\nindices, which controls the randomness of each fold for each class.\nOtherwise, leave `random_state` as `None`.\nPass an int for reproducible output across multiple function calls.\nSee :term:`Glossary `." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, n_splits=5, *, shuffle=False, random_state=None):\n super().__init__(n_splits=n_splits, shuffle=shuffle, random_state=random_state)" }, { @@ -133231,7 +143363,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -133241,7 +143374,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -133251,7 +143385,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "groups", @@ -133261,13 +143396,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _iter_test_masks(self, X, y=None, groups=None):\n test_folds = self._make_test_folds(X, y)\n for i in range(self.n_splits):\n yield test_folds == i" }, { @@ -133285,7 +143421,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -133295,7 +143432,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -133305,13 +143443,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _make_test_folds(self, X, y=None):\n rng = check_random_state(self.random_state)\n y = np.asarray(y)\n type_of_target_y = type_of_target(y)\n allowed_target_types = ('binary', 'multiclass')\n if type_of_target_y not in allowed_target_types:\n raise ValueError('Supported target types are: {}. Got {!r} instead.'.format(allowed_target_types, type_of_target_y))\n y = column_or_1d(y)\n (_, y_idx, y_inv) = np.unique(y, return_index=True, return_inverse=True)\n (_, class_perm) = np.unique(y_idx, return_inverse=True)\n y_encoded = class_perm[y_inv]\n n_classes = len(y_idx)\n y_counts = np.bincount(y_encoded)\n min_groups = np.min(y_counts)\n if np.all(self.n_splits > y_counts):\n raise ValueError('n_splits=%d cannot be greater than the number of members in each class.' % self.n_splits)\n if self.n_splits > min_groups:\n warnings.warn('The least populated class in y has only %d members, which is less than n_splits=%d.' % (min_groups, self.n_splits), UserWarning)\n y_order = np.sort(y_encoded)\n allocation = np.asarray([np.bincount(y_order[i::self.n_splits], minlength=n_classes) for i in range(self.n_splits)])\n test_folds = np.empty(len(y), dtype='i')\n for k in range(n_classes):\n folds_for_class = np.arange(self.n_splits).repeat(allocation[:, k])\n if self.shuffle:\n rng.shuffle(folds_for_class)\n test_folds[y_encoded == k] = folds_for_class\n return test_folds" }, { @@ -133329,7 +143468,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -133339,7 +143479,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "Training data, where `n_samples` is the number of samples\nand `n_features` is the number of features.\n\nNote that providing ``y`` is sufficient to generate the splits and\nhence ``np.zeros(n_samples)`` may be used as a placeholder for\n``X`` instead of actual training data." 
- } + }, + "refined_type": {} }, { "name": "y", @@ -133349,7 +143490,8 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "The target variable for supervised learning problems.\nStratification is done based on the y labels." - } + }, + "refined_type": {} }, { "name": "groups", @@ -133359,13 +143501,14 @@ "docstring": { "type": "object", "description": "Always ignored, exists for compatibility." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Generate indices to split data into training and test set.", - "docstring": "Generate indices to split data into training and test set.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n Training data, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n Note that providing ``y`` is sufficient to generate the splits and\n hence ``np.zeros(n_samples)`` may be used as a placeholder for\n ``X`` instead of actual training data.\n\ny : array-like of shape (n_samples,)\n The target variable for supervised learning problems.\n Stratification is done based on the y labels.\n\ngroups : object\n Always ignored, exists for compatibility.\n\nYields\n------\ntrain : ndarray\n The training set indices for that split.\n\ntest : ndarray\n The testing set indices for that split.\n\nNotes\n-----\nRandomized CV splitters may return different results for each call of\nsplit. You can make the results identical by setting `random_state`\nto an integer.", + "docstring": "Generate indices to split data into training and test set.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training data, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n Note that providing ``y`` is sufficient to generate the splits and\n hence ``np.zeros(n_samples)`` may be used as a placeholder for\n ``X`` instead of actual training data.\n\n y : array-like of shape (n_samples,)\n The target variable for supervised learning problems.\n Stratification is done based on the y labels.\n\n groups : object\n Always ignored, exists for compatibility.\n\n Yields\n ------\n train : ndarray\n The training set indices for that split.\n\n test : ndarray\n The testing set indices for that split.\n\n Notes\n -----\n Randomized CV splitters may return different results for each call of\n split. You can make the results identical by setting `random_state`\n to an integer.\n ", "source_code": "\ndef split(self, X, y, groups=None):\n \"\"\"Generate indices to split data into training and test set.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training data, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n Note that providing ``y`` is sufficient to generate the splits and\n hence ``np.zeros(n_samples)`` may be used as a placeholder for\n ``X`` instead of actual training data.\n\n y : array-like of shape (n_samples,)\n The target variable for supervised learning problems.\n Stratification is done based on the y labels.\n\n groups : object\n Always ignored, exists for compatibility.\n\n Yields\n ------\n train : ndarray\n The training set indices for that split.\n\n test : ndarray\n The testing set indices for that split.\n\n Notes\n -----\n Randomized CV splitters may return different results for each call of\n split. 
You can make the results identical by setting `random_state`\n to an integer.\n \"\"\"\n y = check_array(y, ensure_2d=False, dtype=None)\n return super().split(X, y, groups)" }, { @@ -133383,7 +143526,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_splits", @@ -133393,7 +143537,8 @@ "docstring": { "type": "int, default=10", "description": "Number of re-shuffling & splitting iterations." - } + }, + "refined_type": {} }, { "name": "test_size", @@ -133403,7 +143548,8 @@ "docstring": { "type": "float or int, default=None", "description": "If float, should be between 0.0 and 1.0 and represent the proportion\nof the dataset to include in the test split. If int, represents the\nabsolute number of test samples. If None, the value is set to the\ncomplement of the train size. If ``train_size`` is also None, it will\nbe set to 0.1." - } + }, + "refined_type": {} }, { "name": "train_size", @@ -133413,7 +143559,8 @@ "docstring": { "type": "float or int, default=None", "description": "If float, should be between 0.0 and 1.0 and represent the\nproportion of the dataset to include in the train split. If\nint, represents the absolute number of train samples. If None,\nthe value is automatically set to the complement of the test size." - } + }, + "refined_type": {} }, { "name": "random_state", @@ -133423,13 +143570,14 @@ "docstring": { "type": "int, RandomState instance or None, default=None", "description": "Controls the randomness of the training and testing indices produced.\nPass an int for reproducible output across multiple function calls.\nSee :term:`Glossary `." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, n_splits=10, *, test_size=None, train_size=None, random_state=None):\n super().__init__(n_splits=n_splits, test_size=test_size, train_size=train_size, random_state=random_state)\n self._default_test_size = 0.1" }, { @@ -133447,7 +143595,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -133457,7 +143606,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -133467,7 +143617,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "groups", @@ -133477,13 +143628,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _iter_indices(self, X, y, groups=None):\n n_samples = _num_samples(X)\n y = check_array(y, ensure_2d=False, dtype=None)\n (n_train, n_test) = _validate_shuffle_split(n_samples, self.test_size, self.train_size, default_test_size=self._default_test_size)\n if y.ndim == 2:\n y = np.array([' '.join(row.astype('str')) for row in y])\n (classes, y_indices) = np.unique(y, return_inverse=True)\n n_classes = classes.shape[0]\n class_counts = np.bincount(y_indices)\n if np.min(class_counts) < 2:\n raise ValueError('The least populated class in y has only 1 member, which is too few. 
The minimum number of groups for any class cannot be less than 2.')\n if n_train < n_classes:\n raise ValueError('The train_size = %d should be greater or equal to the number of classes = %d' % (n_train, n_classes))\n if n_test < n_classes:\n raise ValueError('The test_size = %d should be greater or equal to the number of classes = %d' % (n_test, n_classes))\n class_indices = np.split(np.argsort(y_indices, kind='mergesort'), np.cumsum(class_counts)[:-1])\n rng = check_random_state(self.random_state)\n for _ in range(self.n_splits):\n n_i = _approximate_mode(class_counts, n_train, rng)\n class_counts_remaining = class_counts - n_i\n t_i = _approximate_mode(class_counts_remaining, n_test, rng)\n train = []\n test = []\n for i in range(n_classes):\n permutation = rng.permutation(class_counts[i])\n perm_indices_class_i = class_indices[i].take(permutation, mode='clip')\n train.extend(perm_indices_class_i[:n_i[i]])\n test.extend(perm_indices_class_i[n_i[i]:n_i[i] + t_i[i]])\n train = rng.permutation(train)\n test = rng.permutation(test)\n yield (train, test)" }, { @@ -133501,7 +143653,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -133511,7 +143664,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "Training data, where `n_samples` is the number of samples\nand `n_features` is the number of features.\n\nNote that providing ``y`` is sufficient to generate the splits and\nhence ``np.zeros(n_samples)`` may be used as a placeholder for\n``X`` instead of actual training data." - } + }, + "refined_type": {} }, { "name": "y", @@ -133521,7 +143675,8 @@ "docstring": { "type": "array-like of shape (n_samples,) or (n_samples, n_labels)", "description": "The target variable for supervised learning problems.\nStratification is done based on the y labels." - } + }, + "refined_type": {} }, { "name": "groups", @@ -133531,13 +143686,14 @@ "docstring": { "type": "object", "description": "Always ignored, exists for compatibility." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Generate indices to split data into training and test set.", - "docstring": "Generate indices to split data into training and test set.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n Training data, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n Note that providing ``y`` is sufficient to generate the splits and\n hence ``np.zeros(n_samples)`` may be used as a placeholder for\n ``X`` instead of actual training data.\n\ny : array-like of shape (n_samples,) or (n_samples, n_labels)\n The target variable for supervised learning problems.\n Stratification is done based on the y labels.\n\ngroups : object\n Always ignored, exists for compatibility.\n\nYields\n------\ntrain : ndarray\n The training set indices for that split.\n\ntest : ndarray\n The testing set indices for that split.\n\nNotes\n-----\nRandomized CV splitters may return different results for each call of\nsplit. 
You can make the results identical by setting `random_state`\nto an integer.", + "docstring": "Generate indices to split data into training and test set.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training data, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n Note that providing ``y`` is sufficient to generate the splits and\n hence ``np.zeros(n_samples)`` may be used as a placeholder for\n ``X`` instead of actual training data.\n\n y : array-like of shape (n_samples,) or (n_samples, n_labels)\n The target variable for supervised learning problems.\n Stratification is done based on the y labels.\n\n groups : object\n Always ignored, exists for compatibility.\n\n Yields\n ------\n train : ndarray\n The training set indices for that split.\n\n test : ndarray\n The testing set indices for that split.\n\n Notes\n -----\n Randomized CV splitters may return different results for each call of\n split. You can make the results identical by setting `random_state`\n to an integer.\n ", "source_code": "\ndef split(self, X, y, groups=None):\n \"\"\"Generate indices to split data into training and test set.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training data, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n Note that providing ``y`` is sufficient to generate the splits and\n hence ``np.zeros(n_samples)`` may be used as a placeholder for\n ``X`` instead of actual training data.\n\n y : array-like of shape (n_samples,) or (n_samples, n_labels)\n The target variable for supervised learning problems.\n Stratification is done based on the y labels.\n\n groups : object\n Always ignored, exists for compatibility.\n\n Yields\n ------\n train : ndarray\n The training set indices for that split.\n\n test : ndarray\n The testing set indices for that split.\n\n Notes\n -----\n Randomized CV splitters may return different results for each call of\n split. You can make the results identical by setting `random_state`\n to an integer.\n \"\"\"\n y = check_array(y, ensure_2d=False, dtype=None)\n return super().split(X, y, groups)" }, { @@ -133555,7 +143711,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_splits", @@ -133565,7 +143722,8 @@ "docstring": { "type": "int, default=5", "description": "Number of splits. Must be at least 2.\n\n.. versionchanged:: 0.22\n ``n_splits`` default value changed from 3 to 5." - } + }, + "refined_type": {} }, { "name": "max_train_size", @@ -133575,7 +143733,8 @@ "docstring": { "type": "int, default=None", "description": "Maximum size for a single training set." - } + }, + "refined_type": {} }, { "name": "test_size", @@ -133585,7 +143744,8 @@ "docstring": { "type": "int, default=None", "description": "Used to limit the size of the test set. Defaults to\n``n_samples // (n_splits + 1)``, which is the maximum allowed value\nwith ``gap=0``.\n\n.. versionadded:: 0.24" - } + }, + "refined_type": {} }, { "name": "gap", @@ -133595,13 +143755,14 @@ "docstring": { "type": "int, default=0", "description": "Number of samples to exclude from the end of each train set before\nthe test set.\n\n.. 
versionadded:: 0.24" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, n_splits=5, *, max_train_size=None, test_size=None, gap=0):\n super().__init__(n_splits, shuffle=False, random_state=None)\n self.max_train_size = max_train_size\n self.test_size = test_size\n self.gap = gap" }, { @@ -133619,7 +143780,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -133629,7 +143791,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "Training data, where `n_samples` is the number of samples\nand `n_features` is the number of features." - } + }, + "refined_type": {} }, { "name": "y", @@ -133639,7 +143802,8 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "Always ignored, exists for compatibility." - } + }, + "refined_type": {} }, { "name": "groups", @@ -133649,13 +143813,14 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "Always ignored, exists for compatibility." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Generate indices to split data into training and test set.", - "docstring": "Generate indices to split data into training and test set.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n Training data, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\ny : array-like of shape (n_samples,)\n Always ignored, exists for compatibility.\n\ngroups : array-like of shape (n_samples,)\n Always ignored, exists for compatibility.\n\nYields\n------\ntrain : ndarray\n The training set indices for that split.\n\ntest : ndarray\n The testing set indices for that split.", + "docstring": "Generate indices to split data into training and test set.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training data, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n y : array-like of shape (n_samples,)\n Always ignored, exists for compatibility.\n\n groups : array-like of shape (n_samples,)\n Always ignored, exists for compatibility.\n\n Yields\n ------\n train : ndarray\n The training set indices for that split.\n\n test : ndarray\n The testing set indices for that split.\n ", "source_code": "\ndef split(self, X, y=None, groups=None):\n \"\"\"Generate indices to split data into training and test set.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training data, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n y : array-like of shape (n_samples,)\n Always ignored, exists for compatibility.\n\n groups : array-like of shape (n_samples,)\n Always ignored, exists for compatibility.\n\n Yields\n ------\n train : ndarray\n The training set indices for that split.\n\n test : ndarray\n The testing set indices for that split.\n \"\"\"\n (X, y, groups) = indexable(X, y, groups)\n n_samples = _num_samples(X)\n n_splits = self.n_splits\n n_folds = n_splits + 1\n gap = self.gap\n test_size = self.test_size if self.test_size is not None else n_samples // n_folds\n if n_folds > n_samples:\n raise ValueError(f'Cannot have number of folds={n_folds} greater than the number of samples={n_samples}.')\n if n_samples - gap - test_size * n_splits <= 0:\n raise ValueError(f'Too many splits={n_splits} for number of samples={n_samples} 
with test_size={test_size} and gap={gap}.')\n indices = np.arange(n_samples)\n test_starts = range(n_samples - n_splits * test_size, n_samples, test_size)\n for test_start in test_starts:\n train_end = test_start - gap\n if self.max_train_size and self.max_train_size < train_end:\n yield (indices[train_end - self.max_train_size:train_end], indices[test_start:test_start + test_size])\n else:\n yield (indices[:train_end], indices[test_start:test_start + test_size])" }, { @@ -133673,7 +143838,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_splits", @@ -133683,7 +143849,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "shuffle", @@ -133693,7 +143860,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "random_state", @@ -133703,13 +143871,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@abstractmethod\ndef __init__(self, n_splits, *, shuffle, random_state):\n if not isinstance(n_splits, numbers.Integral):\n raise ValueError('The number of folds must be of Integral type. %s of type %s was passed.' % (n_splits, type(n_splits)))\n n_splits = int(n_splits)\n if n_splits <= 1:\n raise ValueError('k-fold cross-validation requires at least one train/test split by setting n_splits=2 or more, got n_splits={0}.'.format(n_splits))\n if not isinstance(shuffle, bool):\n raise TypeError('shuffle must be True or False; got {0}'.format(shuffle))\n if not shuffle and random_state is not None:\n raise ValueError('Setting a random_state has no effect since shuffle is False. You should leave random_state to its default (None), or set shuffle=True.')\n self.n_splits = n_splits\n self.shuffle = shuffle\n self.random_state = random_state" }, { @@ -133727,7 +143896,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -133737,7 +143907,8 @@ "docstring": { "type": "object", "description": "Always ignored, exists for compatibility." - } + }, + "refined_type": {} }, { "name": "y", @@ -133747,7 +143918,8 @@ "docstring": { "type": "object", "description": "Always ignored, exists for compatibility." - } + }, + "refined_type": {} }, { "name": "groups", @@ -133757,13 +143929,14 @@ "docstring": { "type": "object", "description": "Always ignored, exists for compatibility." 
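The entry above records TimeSeriesSplit.split. As a quick illustration of the documented test_size/gap behaviour, here is a minimal sketch using the public API; the 8-sample toy array and the parameter values are illustrative, not taken from the data above.

import numpy as np
from sklearn.model_selection import TimeSeriesSplit

X = np.arange(8).reshape(8, 1)           # 8 ordered samples
tscv = TimeSeriesSplit(n_splits=3, test_size=2, gap=1)
for train_idx, test_idx in tscv.split(X):
    # one sample (gap=1) is dropped from the end of every training window
    print(train_idx, test_idx)
# [0]           [2 3]
# [0 1 2]       [4 5]
# [0 1 2 3 4]   [6 7]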
- } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Returns the number of splitting iterations in the cross-validator", - "docstring": "Returns the number of splitting iterations in the cross-validator\n\nParameters\n----------\nX : object\n Always ignored, exists for compatibility.\n\ny : object\n Always ignored, exists for compatibility.\n\ngroups : object\n Always ignored, exists for compatibility.\n\nReturns\n-------\nn_splits : int\n Returns the number of splitting iterations in the cross-validator.", + "docstring": "Returns the number of splitting iterations in the cross-validator\n\n Parameters\n ----------\n X : object\n Always ignored, exists for compatibility.\n\n y : object\n Always ignored, exists for compatibility.\n\n groups : object\n Always ignored, exists for compatibility.\n\n Returns\n -------\n n_splits : int\n Returns the number of splitting iterations in the cross-validator.\n ", "source_code": "\ndef get_n_splits(self, X=None, y=None, groups=None):\n \"\"\"Returns the number of splitting iterations in the cross-validator\n\n Parameters\n ----------\n X : object\n Always ignored, exists for compatibility.\n\n y : object\n Always ignored, exists for compatibility.\n\n groups : object\n Always ignored, exists for compatibility.\n\n Returns\n -------\n n_splits : int\n Returns the number of splitting iterations in the cross-validator.\n \"\"\"\n return self.n_splits" }, { @@ -133781,7 +143954,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -133791,7 +143965,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "Training data, where `n_samples` is the number of samples\nand `n_features` is the number of features." - } + }, + "refined_type": {} }, { "name": "y", @@ -133801,7 +143976,8 @@ "docstring": { "type": "array-like of shape (n_samples,), default=None", "description": "The target variable for supervised learning problems." - } + }, + "refined_type": {} }, { "name": "groups", @@ -133811,13 +143987,14 @@ "docstring": { "type": "array-like of shape (n_samples,), default=None", "description": "Group labels for the samples used while splitting the dataset into\ntrain/test set." 
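The abstract __init__ recorded above validates its arguments at construction time. A minimal sketch of that behaviour through the public KFold class (the parameter values are illustrative):

from sklearn.model_selection import KFold

# A random_state combined with shuffle=False is rejected immediately,
# matching the check in the __init__ source above.
try:
    KFold(n_splits=3, shuffle=False, random_state=0)
except ValueError as exc:
    print(exc)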
- } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Generate indices to split data into training and test set.", - "docstring": "Generate indices to split data into training and test set.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n Training data, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\ny : array-like of shape (n_samples,), default=None\n The target variable for supervised learning problems.\n\ngroups : array-like of shape (n_samples,), default=None\n Group labels for the samples used while splitting the dataset into\n train/test set.\n\nYields\n------\ntrain : ndarray\n The training set indices for that split.\n\ntest : ndarray\n The testing set indices for that split.", + "docstring": "Generate indices to split data into training and test set.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training data, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n y : array-like of shape (n_samples,), default=None\n The target variable for supervised learning problems.\n\n groups : array-like of shape (n_samples,), default=None\n Group labels for the samples used while splitting the dataset into\n train/test set.\n\n Yields\n ------\n train : ndarray\n The training set indices for that split.\n\n test : ndarray\n The testing set indices for that split.\n ", "source_code": "\ndef split(self, X, y=None, groups=None):\n \"\"\"Generate indices to split data into training and test set.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training data, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n y : array-like of shape (n_samples,), default=None\n The target variable for supervised learning problems.\n\n groups : array-like of shape (n_samples,), default=None\n Group labels for the samples used while splitting the dataset into\n train/test set.\n\n Yields\n ------\n train : ndarray\n The training set indices for that split.\n\n test : ndarray\n The testing set indices for that split.\n \"\"\"\n (X, y, groups) = indexable(X, y, groups)\n n_samples = _num_samples(X)\n if self.n_splits > n_samples:\n raise ValueError('Cannot have number of splits n_splits={0} greater than the number of samples: n_samples={1}.'.format(self.n_splits, n_samples))\n for (train, test) in super().split(X, y, groups):\n yield (train, test)" }, { @@ -133835,7 +144012,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "cv", @@ -133845,13 +144023,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, cv):\n self.cv = list(cv)" }, { @@ -133869,7 +144048,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -133879,7 +144059,8 @@ "docstring": { "type": "object", "description": "Always ignored, exists for compatibility." - } + }, + "refined_type": {} }, { "name": "y", @@ -133889,7 +144070,8 @@ "docstring": { "type": "object", "description": "Always ignored, exists for compatibility." - } + }, + "refined_type": {} }, { "name": "groups", @@ -133899,13 +144081,14 @@ "docstring": { "type": "object", "description": "Always ignored, exists for compatibility." 
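The split source above also guards against asking for more folds than there are samples; since split is a generator, the error surfaces on iteration rather than on the call. A small sketch with toy data:

import numpy as np
from sklearn.model_selection import KFold

X = np.zeros((3, 1))                      # only 3 samples
try:
    list(KFold(n_splits=5).split(X))      # error raised when the generator runs
except ValueError as exc:
    print(exc)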
- } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Returns the number of splitting iterations in the cross-validator", - "docstring": "Returns the number of splitting iterations in the cross-validator\n\nParameters\n----------\nX : object\n Always ignored, exists for compatibility.\n\ny : object\n Always ignored, exists for compatibility.\n\ngroups : object\n Always ignored, exists for compatibility.\n\nReturns\n-------\nn_splits : int\n Returns the number of splitting iterations in the cross-validator.", + "docstring": "Returns the number of splitting iterations in the cross-validator\n\n Parameters\n ----------\n X : object\n Always ignored, exists for compatibility.\n\n y : object\n Always ignored, exists for compatibility.\n\n groups : object\n Always ignored, exists for compatibility.\n\n Returns\n -------\n n_splits : int\n Returns the number of splitting iterations in the cross-validator.\n ", "source_code": "\ndef get_n_splits(self, X=None, y=None, groups=None):\n \"\"\"Returns the number of splitting iterations in the cross-validator\n\n Parameters\n ----------\n X : object\n Always ignored, exists for compatibility.\n\n y : object\n Always ignored, exists for compatibility.\n\n groups : object\n Always ignored, exists for compatibility.\n\n Returns\n -------\n n_splits : int\n Returns the number of splitting iterations in the cross-validator.\n \"\"\"\n return len(self.cv)" }, { @@ -133923,7 +144106,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -133933,7 +144117,8 @@ "docstring": { "type": "object", "description": "Always ignored, exists for compatibility." - } + }, + "refined_type": {} }, { "name": "y", @@ -133943,7 +144128,8 @@ "docstring": { "type": "object", "description": "Always ignored, exists for compatibility." - } + }, + "refined_type": {} }, { "name": "groups", @@ -133953,13 +144139,14 @@ "docstring": { "type": "object", "description": "Always ignored, exists for compatibility." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Generate indices to split data into training and test set.", - "docstring": "Generate indices to split data into training and test set.\n\nParameters\n----------\nX : object\n Always ignored, exists for compatibility.\n\ny : object\n Always ignored, exists for compatibility.\n\ngroups : object\n Always ignored, exists for compatibility.\n\nYields\n------\ntrain : ndarray\n The training set indices for that split.\n\ntest : ndarray\n The testing set indices for that split.", + "docstring": "Generate indices to split data into training and test set.\n\n Parameters\n ----------\n X : object\n Always ignored, exists for compatibility.\n\n y : object\n Always ignored, exists for compatibility.\n\n groups : object\n Always ignored, exists for compatibility.\n\n Yields\n ------\n train : ndarray\n The training set indices for that split.\n\n test : ndarray\n The testing set indices for that split.\n ", "source_code": "\ndef split(self, X=None, y=None, groups=None):\n \"\"\"Generate indices to split data into training and test set.\n\n Parameters\n ----------\n X : object\n Always ignored, exists for compatibility.\n\n y : object\n Always ignored, exists for compatibility.\n\n groups : object\n Always ignored, exists for compatibility.\n\n Yields\n ------\n train : ndarray\n The training set indices for that split.\n\n test : ndarray\n The testing set indices for that split.\n \"\"\"\n for (train, test) in self.cv:\n yield (train, test)" }, { @@ -133977,7 +144164,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "cv", @@ -133987,7 +144175,8 @@ "docstring": { "type": "callable", "description": "Cross-validator class." - } + }, + "refined_type": {} }, { "name": "n_repeats", @@ -133997,7 +144186,8 @@ "docstring": { "type": "int, default=10", "description": "Number of times cross-validator needs to be repeated." - } + }, + "refined_type": {} }, { "name": "random_state", @@ -134007,13 +144197,14 @@ "docstring": { "type": "int, RandomState instance or None, default=None", "description": "Passes `random_state` to the arbitrary repeating cross validator.\nPass an int for reproducible output across multiple function calls.\nSee :term:`Glossary `." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, cv, *, n_repeats=10, random_state=None, **cvargs):\n if not isinstance(n_repeats, numbers.Integral):\n raise ValueError('Number of repetitions must be of Integral type.')\n if n_repeats <= 0:\n raise ValueError('Number of repetitions must be greater than 0.')\n if any((key in cvargs for key in ('random_state', 'shuffle'))):\n raise ValueError('cvargs must not contain random_state or shuffle.')\n self.cv = cv\n self.n_repeats = n_repeats\n self.random_state = random_state\n self.cvargs = cvargs" }, { @@ -134031,13 +144222,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __repr__(self):\n return _build_repr(self)" }, { @@ -134055,7 +144247,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -134065,7 +144258,8 @@ "docstring": { "type": "object", "description": "Always ignored, exists for compatibility.\n``np.zeros(n_samples)`` may be used as a placeholder." 
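_CVIterableWrapper, whose split source appears above, is what a plain iterable of (train, test) index pairs gets wrapped in; the public check_cv helper (documented further down in this file) performs that wrapping. A sketch with hand-written splits (the toy index arrays are illustrative):

import numpy as np
from sklearn.model_selection import check_cv

custom_splits = [(np.array([0, 1, 2]), np.array([3, 4])),
                 (np.array([3, 4, 0]), np.array([1, 2]))]
cv = check_cv(custom_splits)              # wraps the list for re-iteration
print(cv.get_n_splits())                  # 2, i.e. len(cv) as in the source above
for train, test in cv.split():
    print(train, test)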
- } + }, + "refined_type": {} }, { "name": "y", @@ -134075,7 +144269,8 @@ "docstring": { "type": "object", "description": "Always ignored, exists for compatibility.\n``np.zeros(n_samples)`` may be used as a placeholder." - } + }, + "refined_type": {} }, { "name": "groups", @@ -134085,13 +144280,14 @@ "docstring": { "type": "array-like of shape (n_samples,), default=None", "description": "Group labels for the samples used while splitting the dataset into\ntrain/test set." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Returns the number of splitting iterations in the cross-validator", - "docstring": "Returns the number of splitting iterations in the cross-validator\n\nParameters\n----------\nX : object\n Always ignored, exists for compatibility.\n ``np.zeros(n_samples)`` may be used as a placeholder.\n\ny : object\n Always ignored, exists for compatibility.\n ``np.zeros(n_samples)`` may be used as a placeholder.\n\ngroups : array-like of shape (n_samples,), default=None\n Group labels for the samples used while splitting the dataset into\n train/test set.\n\nReturns\n-------\nn_splits : int\n Returns the number of splitting iterations in the cross-validator.", + "docstring": "Returns the number of splitting iterations in the cross-validator\n\n Parameters\n ----------\n X : object\n Always ignored, exists for compatibility.\n ``np.zeros(n_samples)`` may be used as a placeholder.\n\n y : object\n Always ignored, exists for compatibility.\n ``np.zeros(n_samples)`` may be used as a placeholder.\n\n groups : array-like of shape (n_samples,), default=None\n Group labels for the samples used while splitting the dataset into\n train/test set.\n\n Returns\n -------\n n_splits : int\n Returns the number of splitting iterations in the cross-validator.\n ", "source_code": "\ndef get_n_splits(self, X=None, y=None, groups=None):\n \"\"\"Returns the number of splitting iterations in the cross-validator\n\n Parameters\n ----------\n X : object\n Always ignored, exists for compatibility.\n ``np.zeros(n_samples)`` may be used as a placeholder.\n\n y : object\n Always ignored, exists for compatibility.\n ``np.zeros(n_samples)`` may be used as a placeholder.\n\n groups : array-like of shape (n_samples,), default=None\n Group labels for the samples used while splitting the dataset into\n train/test set.\n\n Returns\n -------\n n_splits : int\n Returns the number of splitting iterations in the cross-validator.\n \"\"\"\n rng = check_random_state(self.random_state)\n cv = self.cv(random_state=rng, shuffle=True, **self.cvargs)\n return cv.get_n_splits(X, y, groups) * self.n_repeats" }, { @@ -134109,7 +144305,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -134119,7 +144316,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "Training data, where `n_samples` is the number of samples\nand `n_features` is the number of features." - } + }, + "refined_type": {} }, { "name": "y", @@ -134129,7 +144327,8 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "The target variable for supervised learning problems." - } + }, + "refined_type": {} }, { "name": "groups", @@ -134139,13 +144338,14 @@ "docstring": { "type": "array-like of shape (n_samples,), default=None", "description": "Group labels for the samples used while splitting the dataset into\ntrain/test set." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Generates indices to split data into training and test set.", - "docstring": "Generates indices to split data into training and test set.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n Training data, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\ny : array-like of shape (n_samples,)\n The target variable for supervised learning problems.\n\ngroups : array-like of shape (n_samples,), default=None\n Group labels for the samples used while splitting the dataset into\n train/test set.\n\nYields\n------\ntrain : ndarray\n The training set indices for that split.\n\ntest : ndarray\n The testing set indices for that split.", + "docstring": "Generates indices to split data into training and test set.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training data, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n y : array-like of shape (n_samples,)\n The target variable for supervised learning problems.\n\n groups : array-like of shape (n_samples,), default=None\n Group labels for the samples used while splitting the dataset into\n train/test set.\n\n Yields\n ------\n train : ndarray\n The training set indices for that split.\n\n test : ndarray\n The testing set indices for that split.\n ", "source_code": "\ndef split(self, X, y=None, groups=None):\n \"\"\"Generates indices to split data into training and test set.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training data, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n y : array-like of shape (n_samples,)\n The target variable for supervised learning problems.\n\n groups : array-like of shape (n_samples,), default=None\n Group labels for the samples used while splitting the dataset into\n train/test set.\n\n Yields\n ------\n train : ndarray\n The training set indices for that split.\n\n test : ndarray\n The testing set indices for that split.\n \"\"\"\n n_repeats = self.n_repeats\n rng = check_random_state(self.random_state)\n for idx in range(n_repeats):\n cv = self.cv(random_state=rng, shuffle=True, **self.cvargs)\n for (train_index, test_index) in cv.split(X, y, groups):\n yield (train_index, test_index)" }, { @@ -134163,13 +144363,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _build_repr(self):\n cls = self.__class__\n init = getattr(cls.__init__, 'deprecated_original', cls.__init__)\n init_signature = signature(init)\n if init is object.__init__:\n args = []\n else:\n args = sorted([p.name for p in init_signature.parameters.values() if p.name != 'self' and p.kind != p.VAR_KEYWORD])\n class_name = self.__class__.__name__\n params = dict()\n for key in args:\n warnings.simplefilter('always', FutureWarning)\n try:\n with warnings.catch_warnings(record=True) as w:\n value = getattr(self, key, None)\n if value is None and hasattr(self, 'cvargs'):\n value = self.cvargs.get(key, None)\n if len(w) and w[0].category == FutureWarning:\n continue\n finally:\n warnings.filters.pop(0)\n params[key] = value\n return '%s(%s)' % (class_name, _pprint(params, offset=len(class_name)))" }, { @@ -134187,7 +144388,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { 
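The repeated-splits machinery above underlies RepeatedKFold and RepeatedStratifiedKFold: the total number of iterations is n_splits * n_repeats, and an integer random_state makes the repeats reproducible. A minimal sketch (toy data and parameters are illustrative):

import numpy as np
from sklearn.model_selection import RepeatedKFold

X = np.arange(12).reshape(6, 2)
rkf = RepeatedKFold(n_splits=3, n_repeats=2, random_state=0)
print(rkf.get_n_splits())                 # 6 = n_splits * n_repeats
print(len(list(rkf.split(X))))            # 6; each repeat reshuffles the folds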
"name": "test_size", @@ -134197,7 +144399,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "train_size", @@ -134207,7 +144410,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "default_test_size", @@ -134217,13 +144421,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Validation helper to check if the test/test sizes are meaningful wrt to the size of the data (n_samples)", - "docstring": "Validation helper to check if the test/test sizes are meaningful wrt to the\nsize of the data (n_samples)", + "description": "Validation helper to check if the test/test sizes are meaningful wrt to the\nsize of the data (n_samples)", + "docstring": "\n Validation helper to check if the test/test sizes are meaningful wrt to the\n size of the data (n_samples)\n ", "source_code": "\ndef _validate_shuffle_split(n_samples, test_size, train_size, default_test_size=None):\n \"\"\"\n Validation helper to check if the test/test sizes are meaningful wrt to the\n size of the data (n_samples)\n \"\"\"\n if test_size is None and train_size is None:\n test_size = default_test_size\n test_size_type = np.asarray(test_size).dtype.kind\n train_size_type = np.asarray(train_size).dtype.kind\n if test_size_type == 'i' and (test_size >= n_samples or test_size <= 0) or test_size_type == 'f' and (test_size <= 0 or test_size >= 1):\n raise ValueError('test_size={0} should be either positive and smaller than the number of samples {1} or a float in the (0, 1) range'.format(test_size, n_samples))\n if train_size_type == 'i' and (train_size >= n_samples or train_size <= 0) or train_size_type == 'f' and (train_size <= 0 or train_size >= 1):\n raise ValueError('train_size={0} should be either positive and smaller than the number of samples {1} or a float in the (0, 1) range'.format(train_size, n_samples))\n if train_size is not None and train_size_type not in ('i', 'f'):\n raise ValueError('Invalid value for train_size: {}'.format(train_size))\n if test_size is not None and test_size_type not in ('i', 'f'):\n raise ValueError('Invalid value for test_size: {}'.format(test_size))\n if train_size_type == 'f' and test_size_type == 'f' and train_size + test_size > 1:\n raise ValueError('The sum of test_size and train_size = {}, should be in the (0, 1) range. Reduce test_size and/or train_size.'.format(train_size + test_size))\n if test_size_type == 'f':\n n_test = ceil(test_size * n_samples)\n elif test_size_type == 'i':\n n_test = float(test_size)\n if train_size_type == 'f':\n n_train = floor(train_size * n_samples)\n elif train_size_type == 'i':\n n_train = float(train_size)\n if train_size is None:\n n_train = n_samples - n_test\n elif test_size is None:\n n_test = n_samples - n_train\n if n_train + n_test > n_samples:\n raise ValueError('The sum of train_size and test_size = %d, should be smaller than the number of samples %d. Reduce test_size and/or train_size.' % (n_train + n_test, n_samples))\n (n_train, n_test) = (int(n_train), int(n_test))\n if n_train == 0:\n raise ValueError('With n_samples={}, test_size={} and train_size={}, the resulting train set will be empty. 
Adjust any of the aforementioned parameters.'.format(n_samples, test_size, train_size))\n return n_train, n_test" }, { @@ -134241,13 +144446,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _yields_constant_splits(cv):\n shuffle = getattr(cv, 'shuffle', True)\n random_state = getattr(cv, 'random_state', 0)\n return isinstance(random_state, numbers.Integral) or not shuffle" }, { @@ -134265,7 +144471,8 @@ "docstring": { "type": "int, cross-validation generator or an iterable, default=None", "description": "Determines the cross-validation splitting strategy.\nPossible inputs for cv are:\n- None, to use the default 5-fold cross validation,\n- integer, to specify the number of folds.\n- :term:`CV splitter`,\n- An iterable yielding (train, test) splits as arrays of indices.\n\nFor integer/None inputs, if classifier is True and ``y`` is either\nbinary or multiclass, :class:`StratifiedKFold` is used. In all other\ncases, :class:`KFold` is used.\n\nRefer :ref:`User Guide ` for the various\ncross-validation strategies that can be used here.\n\n.. versionchanged:: 0.22\n ``cv`` default value changed from 3-fold to 5-fold." - } + }, + "refined_type": {} }, { "name": "y", @@ -134275,7 +144482,8 @@ "docstring": { "type": "array-like, default=None", "description": "The target variable for supervised learning problems." - } + }, + "refined_type": {} }, { "name": "classifier", @@ -134285,13 +144493,14 @@ "docstring": { "type": "bool, default=False", "description": "Whether the task is a classification task, in which case\nstratified KFold will be used." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Input checker utility for building a cross-validator", - "docstring": "Input checker utility for building a cross-validator\n\nParameters\n----------\ncv : int, cross-validation generator or an iterable, default=None\n Determines the cross-validation splitting strategy.\n Possible inputs for cv are:\n - None, to use the default 5-fold cross validation,\n - integer, to specify the number of folds.\n - :term:`CV splitter`,\n - An iterable yielding (train, test) splits as arrays of indices.\n\n For integer/None inputs, if classifier is True and ``y`` is either\n binary or multiclass, :class:`StratifiedKFold` is used. In all other\n cases, :class:`KFold` is used.\n\n Refer :ref:`User Guide ` for the various\n cross-validation strategies that can be used here.\n\n .. 
versionchanged:: 0.22\n ``cv`` default value changed from 3-fold to 5-fold.\n\ny : array-like, default=None\n The target variable for supervised learning problems.\n\nclassifier : bool, default=False\n Whether the task is a classification task, in which case\n stratified KFold will be used.\n\nReturns\n-------\nchecked_cv : a cross-validator instance.\n The return value is a cross-validator which generates the train/test\n splits via the ``split`` method.", + "docstring": "Input checker utility for building a cross-validator\n\n Parameters\n ----------\n cv : int, cross-validation generator or an iterable, default=None\n Determines the cross-validation splitting strategy.\n Possible inputs for cv are:\n - None, to use the default 5-fold cross validation,\n - integer, to specify the number of folds.\n - :term:`CV splitter`,\n - An iterable yielding (train, test) splits as arrays of indices.\n\n For integer/None inputs, if classifier is True and ``y`` is either\n binary or multiclass, :class:`StratifiedKFold` is used. In all other\n cases, :class:`KFold` is used.\n\n Refer :ref:`User Guide ` for the various\n cross-validation strategies that can be used here.\n\n .. versionchanged:: 0.22\n ``cv`` default value changed from 3-fold to 5-fold.\n\n y : array-like, default=None\n The target variable for supervised learning problems.\n\n classifier : bool, default=False\n Whether the task is a classification task, in which case\n stratified KFold will be used.\n\n Returns\n -------\n checked_cv : a cross-validator instance.\n The return value is a cross-validator which generates the train/test\n splits via the ``split`` method.\n ", "source_code": "\ndef check_cv(cv=5, y=None, *, classifier=False):\n \"\"\"Input checker utility for building a cross-validator\n\n Parameters\n ----------\n cv : int, cross-validation generator or an iterable, default=None\n Determines the cross-validation splitting strategy.\n Possible inputs for cv are:\n - None, to use the default 5-fold cross validation,\n - integer, to specify the number of folds.\n - :term:`CV splitter`,\n - An iterable yielding (train, test) splits as arrays of indices.\n\n For integer/None inputs, if classifier is True and ``y`` is either\n binary or multiclass, :class:`StratifiedKFold` is used. In all other\n cases, :class:`KFold` is used.\n\n Refer :ref:`User Guide ` for the various\n cross-validation strategies that can be used here.\n\n .. versionchanged:: 0.22\n ``cv`` default value changed from 3-fold to 5-fold.\n\n y : array-like, default=None\n The target variable for supervised learning problems.\n\n classifier : bool, default=False\n Whether the task is a classification task, in which case\n stratified KFold will be used.\n\n Returns\n -------\n checked_cv : a cross-validator instance.\n The return value is a cross-validator which generates the train/test\n splits via the ``split`` method.\n \"\"\"\n cv = 5 if cv is None else cv\n if isinstance(cv, numbers.Integral):\n if classifier and y is not None and type_of_target(y) in ('binary', 'multiclass'):\n return StratifiedKFold(cv)\n else:\n return KFold(cv)\n if not hasattr(cv, 'split') or isinstance(cv, str):\n if not isinstance(cv, Iterable) or isinstance(cv, str):\n raise ValueError('Expected cv as an integer, cross-validation object (from sklearn.model_selection) or an iterable. Got %s.' 
% cv)\n return _CVIterableWrapper(cv)\n return cv" }, { @@ -134309,7 +144518,8 @@ "docstring": { "type": "float or int, default=None", "description": "If float, should be between 0.0 and 1.0 and represent the proportion\nof the dataset to include in the test split. If int, represents the\nabsolute number of test samples. If None, the value is set to the\ncomplement of the train size. If ``train_size`` is also None, it will\nbe set to 0.25." - } + }, + "refined_type": {} }, { "name": "train_size", @@ -134319,7 +144529,8 @@ "docstring": { "type": "float or int, default=None", "description": "If float, should be between 0.0 and 1.0 and represent the\nproportion of the dataset to include in the train split. If\nint, represents the absolute number of train samples. If None,\nthe value is automatically set to the complement of the test size." - } + }, + "refined_type": {} }, { "name": "random_state", @@ -134329,7 +144540,8 @@ "docstring": { "type": "int, RandomState instance or None, default=None", "description": "Controls the shuffling applied to the data before applying the split.\nPass an int for reproducible output across multiple function calls.\nSee :term:`Glossary `." - } + }, + "refined_type": {} }, { "name": "shuffle", @@ -134339,7 +144551,8 @@ "docstring": { "type": "bool, default=True", "description": "Whether or not to shuffle the data before splitting. If shuffle=False\nthen stratify must be None." - } + }, + "refined_type": {} }, { "name": "stratify", @@ -134349,14 +144562,15 @@ "docstring": { "type": "array-like, default=None", "description": "If not None, data is split in a stratified fashion, using this as\nthe class labels.\nRead more in the :ref:`User Guide `." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Split arrays or matrices into random train and test subsets\n\nQuick utility that wraps input validation and ``next(ShuffleSplit().split(X, y))`` and application to input data into a single call for splitting (and optionally subsampling) data in a oneliner. Read more in the :ref:`User Guide `.", - "docstring": "Split arrays or matrices into random train and test subsets\n\nQuick utility that wraps input validation and\n``next(ShuffleSplit().split(X, y))`` and application to input data\ninto a single call for splitting (and optionally subsampling) data in a\noneliner.\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\n*arrays : sequence of indexables with same length / shape[0]\n Allowed inputs are lists, numpy arrays, scipy-sparse\n matrices or pandas dataframes.\n\ntest_size : float or int, default=None\n If float, should be between 0.0 and 1.0 and represent the proportion\n of the dataset to include in the test split. If int, represents the\n absolute number of test samples. If None, the value is set to the\n complement of the train size. If ``train_size`` is also None, it will\n be set to 0.25.\n\ntrain_size : float or int, default=None\n If float, should be between 0.0 and 1.0 and represent the\n proportion of the dataset to include in the train split. If\n int, represents the absolute number of train samples. If None,\n the value is automatically set to the complement of the test size.\n\nrandom_state : int, RandomState instance or None, default=None\n Controls the shuffling applied to the data before applying the split.\n Pass an int for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n\nshuffle : bool, default=True\n Whether or not to shuffle the data before splitting. 
If shuffle=False\n then stratify must be None.\n\nstratify : array-like, default=None\n If not None, data is split in a stratified fashion, using this as\n the class labels.\n Read more in the :ref:`User Guide `.\n\nReturns\n-------\nsplitting : list, length=2 * len(arrays)\n List containing train-test split of inputs.\n\n .. versionadded:: 0.16\n If the input is sparse, the output will be a\n ``scipy.sparse.csr_matrix``. Else, output type is the same as the\n input type.\n\nExamples\n--------\n>>> import numpy as np\n>>> from sklearn.model_selection import train_test_split\n>>> X, y = np.arange(10).reshape((5, 2)), range(5)\n>>> X\narray([[0, 1],\n [2, 3],\n [4, 5],\n [6, 7],\n [8, 9]])\n>>> list(y)\n[0, 1, 2, 3, 4]\n\n>>> X_train, X_test, y_train, y_test = train_test_split(\n... X, y, test_size=0.33, random_state=42)\n...\n>>> X_train\narray([[4, 5],\n [0, 1],\n [6, 7]])\n>>> y_train\n[2, 0, 3]\n>>> X_test\narray([[2, 3],\n [8, 9]])\n>>> y_test\n[1, 4]\n\n>>> train_test_split(y, shuffle=False)\n[[0, 1, 2], [3, 4]]", - "source_code": "\ndef train_test_split(*arrays, test_size=None, train_size=None, random_state=None, shuffle=True, stratify=None):\n \"\"\"Split arrays or matrices into random train and test subsets\n\n Quick utility that wraps input validation and\n ``next(ShuffleSplit().split(X, y))`` and application to input data\n into a single call for splitting (and optionally subsampling) data in a\n oneliner.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n *arrays : sequence of indexables with same length / shape[0]\n Allowed inputs are lists, numpy arrays, scipy-sparse\n matrices or pandas dataframes.\n\n test_size : float or int, default=None\n If float, should be between 0.0 and 1.0 and represent the proportion\n of the dataset to include in the test split. If int, represents the\n absolute number of test samples. If None, the value is set to the\n complement of the train size. If ``train_size`` is also None, it will\n be set to 0.25.\n\n train_size : float or int, default=None\n If float, should be between 0.0 and 1.0 and represent the\n proportion of the dataset to include in the train split. If\n int, represents the absolute number of train samples. If None,\n the value is automatically set to the complement of the test size.\n\n random_state : int, RandomState instance or None, default=None\n Controls the shuffling applied to the data before applying the split.\n Pass an int for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n\n shuffle : bool, default=True\n Whether or not to shuffle the data before splitting. If shuffle=False\n then stratify must be None.\n\n stratify : array-like, default=None\n If not None, data is split in a stratified fashion, using this as\n the class labels.\n Read more in the :ref:`User Guide `.\n\n Returns\n -------\n splitting : list, length=2 * len(arrays)\n List containing train-test split of inputs.\n\n .. versionadded:: 0.16\n If the input is sparse, the output will be a\n ``scipy.sparse.csr_matrix``. Else, output type is the same as the\n input type.\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.model_selection import train_test_split\n >>> X, y = np.arange(10).reshape((5, 2)), range(5)\n >>> X\n array([[0, 1],\n [2, 3],\n [4, 5],\n [6, 7],\n [8, 9]])\n >>> list(y)\n [0, 1, 2, 3, 4]\n\n >>> X_train, X_test, y_train, y_test = train_test_split(\n ... 
X, y, test_size=0.33, random_state=42)\n ...\n >>> X_train\n array([[4, 5],\n [0, 1],\n [6, 7]])\n >>> y_train\n [2, 0, 3]\n >>> X_test\n array([[2, 3],\n [8, 9]])\n >>> y_test\n [1, 4]\n\n >>> train_test_split(y, shuffle=False)\n [[0, 1, 2], [3, 4]]\n\n \"\"\"\n n_arrays = len(arrays)\n if n_arrays == 0:\n raise ValueError('At least one array required as input')\n arrays = indexable(*arrays)\n n_samples = _num_samples(arrays[0])\n (n_train, n_test) = _validate_shuffle_split(n_samples, test_size, train_size, default_test_size=0.25)\n if shuffle is False:\n if stratify is not None:\n raise ValueError('Stratified train/test split is not implemented for shuffle=False')\n train = np.arange(n_train)\n test = np.arange(n_train, n_train + n_test)\n else:\n if stratify is not None:\n CVClass = StratifiedShuffleSplit\n else:\n CVClass = ShuffleSplit\n cv = CVClass(test_size=n_test, train_size=n_train, random_state=random_state)\n (train, test) = next(cv.split(X=arrays[0], y=stratify))\n return list(chain.from_iterable(((_safe_indexing(a, train), _safe_indexing(a, test)) for a in arrays)))" + "description": "Split arrays or matrices into random train and test subsets.\n\nQuick utility that wraps input validation and\n``next(ShuffleSplit().split(X, y))`` and application to input data\ninto a single call for splitting (and optionally subsampling) data in a\noneliner.\n\nRead more in the :ref:`User Guide `.", + "docstring": "Split arrays or matrices into random train and test subsets.\n\n Quick utility that wraps input validation and\n ``next(ShuffleSplit().split(X, y))`` and application to input data\n into a single call for splitting (and optionally subsampling) data in a\n oneliner.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n *arrays : sequence of indexables with same length / shape[0]\n Allowed inputs are lists, numpy arrays, scipy-sparse\n matrices or pandas dataframes.\n\n test_size : float or int, default=None\n If float, should be between 0.0 and 1.0 and represent the proportion\n of the dataset to include in the test split. If int, represents the\n absolute number of test samples. If None, the value is set to the\n complement of the train size. If ``train_size`` is also None, it will\n be set to 0.25.\n\n train_size : float or int, default=None\n If float, should be between 0.0 and 1.0 and represent the\n proportion of the dataset to include in the train split. If\n int, represents the absolute number of train samples. If None,\n the value is automatically set to the complement of the test size.\n\n random_state : int, RandomState instance or None, default=None\n Controls the shuffling applied to the data before applying the split.\n Pass an int for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n shuffle : bool, default=True\n Whether or not to shuffle the data before splitting. If shuffle=False\n then stratify must be None.\n\n stratify : array-like, default=None\n If not None, data is split in a stratified fashion, using this as\n the class labels.\n Read more in the :ref:`User Guide `.\n\n Returns\n -------\n splitting : list, length=2 * len(arrays)\n List containing train-test split of inputs.\n\n .. versionadded:: 0.16\n If the input is sparse, the output will be a\n ``scipy.sparse.csr_matrix``. 
Else, output type is the same as the\n input type.\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.model_selection import train_test_split\n >>> X, y = np.arange(10).reshape((5, 2)), range(5)\n >>> X\n array([[0, 1],\n [2, 3],\n [4, 5],\n [6, 7],\n [8, 9]])\n >>> list(y)\n [0, 1, 2, 3, 4]\n\n >>> X_train, X_test, y_train, y_test = train_test_split(\n ... X, y, test_size=0.33, random_state=42)\n ...\n >>> X_train\n array([[4, 5],\n [0, 1],\n [6, 7]])\n >>> y_train\n [2, 0, 3]\n >>> X_test\n array([[2, 3],\n [8, 9]])\n >>> y_test\n [1, 4]\n\n >>> train_test_split(y, shuffle=False)\n [[0, 1, 2], [3, 4]]\n ", + "source_code": "\ndef train_test_split(*arrays, test_size=None, train_size=None, random_state=None, shuffle=True, stratify=None):\n \"\"\"Split arrays or matrices into random train and test subsets.\n\n Quick utility that wraps input validation and\n ``next(ShuffleSplit().split(X, y))`` and application to input data\n into a single call for splitting (and optionally subsampling) data in a\n oneliner.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n *arrays : sequence of indexables with same length / shape[0]\n Allowed inputs are lists, numpy arrays, scipy-sparse\n matrices or pandas dataframes.\n\n test_size : float or int, default=None\n If float, should be between 0.0 and 1.0 and represent the proportion\n of the dataset to include in the test split. If int, represents the\n absolute number of test samples. If None, the value is set to the\n complement of the train size. If ``train_size`` is also None, it will\n be set to 0.25.\n\n train_size : float or int, default=None\n If float, should be between 0.0 and 1.0 and represent the\n proportion of the dataset to include in the train split. If\n int, represents the absolute number of train samples. If None,\n the value is automatically set to the complement of the test size.\n\n random_state : int, RandomState instance or None, default=None\n Controls the shuffling applied to the data before applying the split.\n Pass an int for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n shuffle : bool, default=True\n Whether or not to shuffle the data before splitting. If shuffle=False\n then stratify must be None.\n\n stratify : array-like, default=None\n If not None, data is split in a stratified fashion, using this as\n the class labels.\n Read more in the :ref:`User Guide `.\n\n Returns\n -------\n splitting : list, length=2 * len(arrays)\n List containing train-test split of inputs.\n\n .. versionadded:: 0.16\n If the input is sparse, the output will be a\n ``scipy.sparse.csr_matrix``. Else, output type is the same as the\n input type.\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.model_selection import train_test_split\n >>> X, y = np.arange(10).reshape((5, 2)), range(5)\n >>> X\n array([[0, 1],\n [2, 3],\n [4, 5],\n [6, 7],\n [8, 9]])\n >>> list(y)\n [0, 1, 2, 3, 4]\n\n >>> X_train, X_test, y_train, y_test = train_test_split(\n ... 
X, y, test_size=0.33, random_state=42)\n ...\n >>> X_train\n array([[4, 5],\n [0, 1],\n [6, 7]])\n >>> y_train\n [2, 0, 3]\n >>> X_test\n array([[2, 3],\n [8, 9]])\n >>> y_test\n [1, 4]\n\n >>> train_test_split(y, shuffle=False)\n [[0, 1, 2], [3, 4]]\n \"\"\"\n n_arrays = len(arrays)\n if n_arrays == 0:\n raise ValueError('At least one array required as input')\n arrays = indexable(*arrays)\n n_samples = _num_samples(arrays[0])\n (n_train, n_test) = _validate_shuffle_split(n_samples, test_size, train_size, default_test_size=0.25)\n if shuffle is False:\n if stratify is not None:\n raise ValueError('Stratified train/test split is not implemented for shuffle=False')\n train = np.arange(n_train)\n test = np.arange(n_train, n_train + n_test)\n else:\n if stratify is not None:\n CVClass = StratifiedShuffleSplit\n else:\n CVClass = ShuffleSplit\n cv = CVClass(test_size=n_test, train_size=n_train, random_state=random_state)\n (train, test) = next(cv.split(X=arrays[0], y=stratify))\n return list(chain.from_iterable(((_safe_indexing(a, train), _safe_indexing(a, test)) for a in arrays)))" }, { "name": "_aggregate_score_dicts", @@ -134373,13 +144587,14 @@ "docstring": { "type": "list of dict", "description": "List of dicts of the scores for all scorers. This is a flat list,\nassumed originally to be of row major order." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Aggregate the list of dict to dict of np ndarray\n\nThe aggregated output of _aggregate_score_dicts will be a list of dict of form [{'prec': 0.1, 'acc':1.0}, {'prec': 0.1, 'acc':1.0}, ...] Convert it to a dict of array {'prec': np.array([0.1 ...]), ...}", - "docstring": "Aggregate the list of dict to dict of np ndarray\n\nThe aggregated output of _aggregate_score_dicts will be a list of dict\nof form [{'prec': 0.1, 'acc':1.0}, {'prec': 0.1, 'acc':1.0}, ...]\nConvert it to a dict of array {'prec': np.array([0.1 ...]), ...}\n\nParameters\n----------\n\nscores : list of dict\n List of dicts of the scores for all scorers. This is a flat list,\n assumed originally to be of row major order.\n\nExample\n-------\n\n>>> scores = [{'a': 1, 'b':10}, {'a': 2, 'b':2}, {'a': 3, 'b':3},\n... {'a': 10, 'b': 10}] # doctest: +SKIP\n>>> _aggregate_score_dicts(scores) # doctest: +SKIP\n{'a': array([1, 2, 3, 10]),\n 'b': array([10, 2, 3, 10])}", + "description": "Aggregate the list of dict to dict of np ndarray\n\nThe aggregated output of _aggregate_score_dicts will be a list of dict\nof form [{'prec': 0.1, 'acc':1.0}, {'prec': 0.1, 'acc':1.0}, ...]\nConvert it to a dict of array {'prec': np.array([0.1 ...]), ...}", + "docstring": "Aggregate the list of dict to dict of np ndarray\n\n The aggregated output of _aggregate_score_dicts will be a list of dict\n of form [{'prec': 0.1, 'acc':1.0}, {'prec': 0.1, 'acc':1.0}, ...]\n Convert it to a dict of array {'prec': np.array([0.1 ...]), ...}\n\n Parameters\n ----------\n\n scores : list of dict\n List of dicts of the scores for all scorers. This is a flat list,\n assumed originally to be of row major order.\n\n Example\n -------\n\n >>> scores = [{'a': 1, 'b':10}, {'a': 2, 'b':2}, {'a': 3, 'b':3},\n ... 
{'a': 10, 'b': 10}] # doctest: +SKIP\n >>> _aggregate_score_dicts(scores) # doctest: +SKIP\n {'a': array([1, 2, 3, 10]),\n 'b': array([10, 2, 3, 10])}\n ", "source_code": "\ndef _aggregate_score_dicts(scores):\n \"\"\"Aggregate the list of dict to dict of np ndarray\n\n The aggregated output of _aggregate_score_dicts will be a list of dict\n of form [{'prec': 0.1, 'acc':1.0}, {'prec': 0.1, 'acc':1.0}, ...]\n Convert it to a dict of array {'prec': np.array([0.1 ...]), ...}\n\n Parameters\n ----------\n\n scores : list of dict\n List of dicts of the scores for all scorers. This is a flat list,\n assumed originally to be of row major order.\n\n Example\n -------\n\n >>> scores = [{'a': 1, 'b':10}, {'a': 2, 'b':2}, {'a': 3, 'b':3},\n ... {'a': 10, 'b': 10}] # doctest: +SKIP\n >>> _aggregate_score_dicts(scores) # doctest: +SKIP\n {'a': array([1, 2, 3, 10]),\n 'b': array([10, 2, 3, 10])}\n \"\"\"\n return {key: np.asarray([score[key] for score in scores]) if isinstance(scores[0][key], numbers.Number) else [score[key] for score in scores] for key in scores[0]}" }, { @@ -134397,7 +144612,8 @@ "docstring": { "type": "ndarray", "description": "int array to test" - } + }, + "refined_type": {} }, { "name": "n_samples", @@ -134407,13 +144623,14 @@ "docstring": { "type": "int", "description": "number of expected elements" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Check whether indices is a reordering of the array np.arange(n_samples)", - "docstring": "Check whether indices is a reordering of the array np.arange(n_samples)\n\nParameters\n----------\nindices : ndarray\n int array to test\nn_samples : int\n number of expected elements\n\nReturns\n-------\nis_partition : bool\n True iff sorted(indices) is np.arange(n)", + "docstring": "Check whether indices is a reordering of the array np.arange(n_samples)\n\n Parameters\n ----------\n indices : ndarray\n int array to test\n n_samples : int\n number of expected elements\n\n Returns\n -------\n is_partition : bool\n True iff sorted(indices) is np.arange(n)\n ", "source_code": "\ndef _check_is_permutation(indices, n_samples):\n \"\"\"Check whether indices is a reordering of the array np.arange(n_samples)\n\n Parameters\n ----------\n indices : ndarray\n int array to test\n n_samples : int\n number of expected elements\n\n Returns\n -------\n is_partition : bool\n True iff sorted(indices) is np.arange(n)\n \"\"\"\n if len(indices) != n_samples:\n return False\n hit = np.zeros(n_samples, dtype=bool)\n hit[indices] = True\n if not np.all(hit):\n return False\n return True" }, { @@ -134431,7 +144648,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "predictions", @@ -134441,7 +144659,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_classes", @@ -134451,7 +144670,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "method", @@ -134461,13 +144681,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Ensure that prediction arrays have correct column order\n\nWhen doing cross-validation, if one or more classes are not present in the subset of data used for training, then the output prediction array might not have the same columns as other folds. Use the list of class names (assumed to be ints) to enforce the correct column order. 
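_aggregate_score_dicts, recorded above, turns a per-split list of score dicts into a dict of arrays while leaving non-numeric values (for example fitted estimators) as plain lists. The sketch below is a hypothetical standalone mirror of that private helper, not an import from sklearn:

import numbers
import numpy as np

def aggregate_score_dicts(scores):
    # numeric entries become arrays, everything else stays a list
    return {key: np.asarray([s[key] for s in scores])
            if isinstance(scores[0][key], numbers.Number)
            else [s[key] for s in scores]
            for key in scores[0]}

scores = [{'fit_time': 0.1, 'estimator': 'clf_a'},
          {'fit_time': 0.2, 'estimator': 'clf_b'}]
print(aggregate_score_dicts(scores))
# {'fit_time': array([0.1, 0.2]), 'estimator': ['clf_a', 'clf_b']}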
Note that `classes` is the list of classes in this fold (a subset of the classes in the full training set) and `n_classes` is the number of classes in the full training set.", - "docstring": "Ensure that prediction arrays have correct column order\n\nWhen doing cross-validation, if one or more classes are\nnot present in the subset of data used for training,\nthen the output prediction array might not have the same\ncolumns as other folds. Use the list of class names\n(assumed to be ints) to enforce the correct column order.\n\nNote that `classes` is the list of classes in this fold\n(a subset of the classes in the full training set)\nand `n_classes` is the number of classes in the full training set.", + "description": "Ensure that prediction arrays have correct column order\n\nWhen doing cross-validation, if one or more classes are\nnot present in the subset of data used for training,\nthen the output prediction array might not have the same\ncolumns as other folds. Use the list of class names\n(assumed to be ints) to enforce the correct column order.\n\nNote that `classes` is the list of classes in this fold\n(a subset of the classes in the full training set)\nand `n_classes` is the number of classes in the full training set.", + "docstring": "Ensure that prediction arrays have correct column order\n\n When doing cross-validation, if one or more classes are\n not present in the subset of data used for training,\n then the output prediction array might not have the same\n columns as other folds. Use the list of class names\n (assumed to be ints) to enforce the correct column order.\n\n Note that `classes` is the list of classes in this fold\n (a subset of the classes in the full training set)\n and `n_classes` is the number of classes in the full training set.\n ", "source_code": "\ndef _enforce_prediction_order(classes, predictions, n_classes, method):\n \"\"\"Ensure that prediction arrays have correct column order\n\n When doing cross-validation, if one or more classes are\n not present in the subset of data used for training,\n then the output prediction array might not have the same\n columns as other folds. Use the list of class names\n (assumed to be ints) to enforce the correct column order.\n\n Note that `classes` is the list of classes in this fold\n (a subset of the classes in the full training set)\n and `n_classes` is the number of classes in the full training set.\n \"\"\"\n if n_classes != len(classes):\n recommendation = 'To fix this, use a cross-validation technique resulting in properly stratified folds'\n warnings.warn('Number of classes in training fold ({}) does not match total number of classes ({}). Results may not be appropriate for your use case. {}'.format(len(classes), n_classes, recommendation), RuntimeWarning)\n if method == 'decision_function':\n if predictions.ndim == 2 and predictions.shape[1] != len(classes):\n raise ValueError('Output shape {} of {} does not match number of classes ({}) in fold. Irregular decision_function outputs are not currently supported by cross_val_predict'.format(predictions.shape, method, len(classes)))\n if len(classes) <= 2:\n raise ValueError('Only {} class/es in training fold, but {} in overall dataset. This is not supported for decision_function with imbalanced folds. 
{}'.format(len(classes), n_classes, recommendation))\n float_min = np.finfo(predictions.dtype).min\n default_values = {'decision_function': float_min, 'predict_log_proba': float_min, 'predict_proba': 0}\n predictions_for_all_classes = np.full((_num_samples(predictions), n_classes), default_values[method], dtype=predictions.dtype)\n predictions_for_all_classes[:, classes] = predictions\n predictions = predictions_for_all_classes\n return predictions" }, { @@ -134485,7 +144706,8 @@ "docstring": { "type": "estimator object implementing 'fit' and 'predict'", "description": "The object to use to fit the data." - } + }, + "refined_type": {} }, { "name": "X", @@ -134495,7 +144717,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "The data to fit.\n\n.. versionchanged:: 0.20\n X is only required to be an object with finite length or shape now" - } + }, + "refined_type": {} }, { "name": "y", @@ -134505,7 +144728,8 @@ "docstring": { "type": "array-like of shape (n_samples,) or (n_samples, n_outputs) or None", "description": "The target variable to try to predict in the case of\nsupervised learning." - } + }, + "refined_type": {} }, { "name": "train", @@ -134515,7 +144739,8 @@ "docstring": { "type": "array-like of shape (n_train_samples,)", "description": "Indices of training samples." - } + }, + "refined_type": {} }, { "name": "test", @@ -134525,7 +144750,8 @@ "docstring": { "type": "array-like of shape (n_test_samples,)", "description": "Indices of test samples." - } + }, + "refined_type": {} }, { "name": "verbose", @@ -134535,7 +144761,8 @@ "docstring": { "type": "int", "description": "The verbosity level." - } + }, + "refined_type": {} }, { "name": "fit_params", @@ -134545,7 +144772,8 @@ "docstring": { "type": "dict or None", "description": "Parameters that will be passed to ``estimator.fit``." - } + }, + "refined_type": {} }, { "name": "method", @@ -134555,13 +144783,14 @@ "docstring": { "type": "str", "description": "Invokes the passed method name of the passed estimator." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Fit estimator and predict values for a given dataset split.\n\nRead more in the :ref:`User Guide `.", - "docstring": "Fit estimator and predict values for a given dataset split.\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\nestimator : estimator object implementing 'fit' and 'predict'\n The object to use to fit the data.\n\nX : array-like of shape (n_samples, n_features)\n The data to fit.\n\n .. 
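_enforce_prediction_order, whose source appears above, is what lets cross_val_predict return full-width probability matrices even when an unstratified fold never sees some class: the missing predict_proba columns are padded with 0 and a RuntimeWarning is emitted. A small sketch with toy data (the array values are illustrative):

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, cross_val_predict

X = np.arange(12).reshape(6, 2)
y = np.array([0, 0, 1, 1, 2, 2])
# KFold (unstratified) drops one class from every training fold here
proba = cross_val_predict(LogisticRegression(), X, y,
                          cv=KFold(n_splits=3), method='predict_proba')
print(proba.shape)                        # (6, 3): one column per class overall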
versionchanged:: 0.20\n X is only required to be an object with finite length or shape now\n\ny : array-like of shape (n_samples,) or (n_samples, n_outputs) or None\n The target variable to try to predict in the case of\n supervised learning.\n\ntrain : array-like of shape (n_train_samples,)\n Indices of training samples.\n\ntest : array-like of shape (n_test_samples,)\n Indices of test samples.\n\nverbose : int\n The verbosity level.\n\nfit_params : dict or None\n Parameters that will be passed to ``estimator.fit``.\n\nmethod : str\n Invokes the passed method name of the passed estimator.\n\nReturns\n-------\npredictions : sequence\n Result of calling 'estimator.method'", + "docstring": "Fit estimator and predict values for a given dataset split.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n estimator : estimator object implementing 'fit' and 'predict'\n The object to use to fit the data.\n\n X : array-like of shape (n_samples, n_features)\n The data to fit.\n\n .. versionchanged:: 0.20\n X is only required to be an object with finite length or shape now\n\n y : array-like of shape (n_samples,) or (n_samples, n_outputs) or None\n The target variable to try to predict in the case of\n supervised learning.\n\n train : array-like of shape (n_train_samples,)\n Indices of training samples.\n\n test : array-like of shape (n_test_samples,)\n Indices of test samples.\n\n verbose : int\n The verbosity level.\n\n fit_params : dict or None\n Parameters that will be passed to ``estimator.fit``.\n\n method : str\n Invokes the passed method name of the passed estimator.\n\n Returns\n -------\n predictions : sequence\n Result of calling 'estimator.method'\n ", "source_code": "\ndef _fit_and_predict(estimator, X, y, train, test, verbose, fit_params, method):\n \"\"\"Fit estimator and predict values for a given dataset split.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n estimator : estimator object implementing 'fit' and 'predict'\n The object to use to fit the data.\n\n X : array-like of shape (n_samples, n_features)\n The data to fit.\n\n .. 
versionchanged:: 0.20\n X is only required to be an object with finite length or shape now\n\n y : array-like of shape (n_samples,) or (n_samples, n_outputs) or None\n The target variable to try to predict in the case of\n supervised learning.\n\n train : array-like of shape (n_train_samples,)\n Indices of training samples.\n\n test : array-like of shape (n_test_samples,)\n Indices of test samples.\n\n verbose : int\n The verbosity level.\n\n fit_params : dict or None\n Parameters that will be passed to ``estimator.fit``.\n\n method : str\n Invokes the passed method name of the passed estimator.\n\n Returns\n -------\n predictions : sequence\n Result of calling 'estimator.method'\n \"\"\"\n fit_params = fit_params if fit_params is not None else {}\n fit_params = _check_fit_params(X, fit_params, train)\n (X_train, y_train) = _safe_split(estimator, X, y, train)\n (X_test, _) = _safe_split(estimator, X, y, test, train)\n if y_train is None:\n estimator.fit(X_train, **fit_params)\n else:\n estimator.fit(X_train, y_train, **fit_params)\n func = getattr(estimator, method)\n predictions = func(X_test)\n encode = method in ['decision_function', 'predict_proba', 'predict_log_proba'] and y is not None\n if encode:\n if isinstance(predictions, list):\n predictions = [_enforce_prediction_order(estimator.classes_[i_label], predictions[i_label], n_classes=len(set(y[:, i_label])), method=method) for i_label in range(len(predictions))]\n else:\n n_classes = len(set(y)) if y.ndim == 1 else y.shape[1]\n predictions = _enforce_prediction_order(estimator.classes_, predictions, n_classes, method)\n return predictions" }, { @@ -134579,7 +144808,8 @@ "docstring": { "type": "estimator object implementing 'fit'", "description": "The object to use to fit the data." - } + }, + "refined_type": {} }, { "name": "X", @@ -134589,7 +144819,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "The data to fit." - } + }, + "refined_type": {} }, { "name": "y", @@ -134599,7 +144830,8 @@ "docstring": { "type": "array-like of shape (n_samples,) or (n_samples, n_outputs) or None", "description": "The target variable to try to predict in the case of\nsupervised learning." - } + }, + "refined_type": {} }, { "name": "scorer", @@ -134609,7 +144841,8 @@ "docstring": { "type": "A single callable or dict mapping scorer name to the callable", "description": "If it is a single callable, the return value for ``train_scores`` and\n``test_scores`` is a single float.\n\nFor a dict, it should be one mapping the scorer name to the scorer\ncallable object / function.\n\nThe callable object / fn should have signature\n``scorer(estimator, X, y)``." - } + }, + "refined_type": {} }, { "name": "train", @@ -134619,7 +144852,8 @@ "docstring": { "type": "array-like of shape (n_train_samples,)", "description": "Indices of training samples." - } + }, + "refined_type": {} }, { "name": "test", @@ -134629,7 +144863,8 @@ "docstring": { "type": "array-like of shape (n_test_samples,)", "description": "Indices of test samples." - } + }, + "refined_type": {} }, { "name": "verbose", @@ -134639,7 +144874,8 @@ "docstring": { "type": "int", "description": "The verbosity level." - } + }, + "refined_type": {} }, { "name": "parameters", @@ -134649,7 +144885,8 @@ "docstring": { "type": "dict or None", "description": "Parameters to be set on the estimator." 
- } + }, + "refined_type": {} }, { "name": "fit_params", @@ -134659,7 +144896,8 @@ "docstring": { "type": "dict or None", "description": "Parameters that will be passed to ``estimator.fit``." - } + }, + "refined_type": {} }, { "name": "return_train_score", @@ -134669,7 +144907,8 @@ "docstring": { "type": "bool, default=False", "description": "Compute and return score on training set." - } + }, + "refined_type": {} }, { "name": "return_parameters", @@ -134679,7 +144918,8 @@ "docstring": { "type": "bool, default=False", "description": "Return parameters that has been used for the estimator." - } + }, + "refined_type": {} }, { "name": "return_n_test_samples", @@ -134689,7 +144929,8 @@ "docstring": { "type": "bool, default=False", "description": "Whether to return the ``n_test_samples``." - } + }, + "refined_type": {} }, { "name": "return_times", @@ -134699,7 +144940,8 @@ "docstring": { "type": "bool, default=False", "description": "Whether to return the fit/score times." - } + }, + "refined_type": {} }, { "name": "return_estimator", @@ -134709,7 +144951,8 @@ "docstring": { "type": "bool, default=False", "description": "Whether to return the fitted estimator." - } + }, + "refined_type": {} }, { "name": "split_progress", @@ -134719,6 +144962,10 @@ "docstring": { "type": "{list, tuple} of int, default=None", "description": "A list or tuple of format (, )." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -134729,6 +144976,10 @@ "docstring": { "type": "{list, tuple} of int, default=None", "description": "A list or tuple of format\n(, )." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -134739,13 +144990,14 @@ "docstring": { "type": "'raise' or numeric, default=np.nan", "description": "Value to assign to the score if an error occurs in estimator fitting.\nIf set to 'raise', the error is raised.\nIf a numeric value is given, FitFailedWarning is raised." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Fit estimator and compute scores for a given dataset split.", - "docstring": "Fit estimator and compute scores for a given dataset split.\n\nParameters\n----------\nestimator : estimator object implementing 'fit'\n The object to use to fit the data.\n\nX : array-like of shape (n_samples, n_features)\n The data to fit.\n\ny : array-like of shape (n_samples,) or (n_samples, n_outputs) or None\n The target variable to try to predict in the case of\n supervised learning.\n\nscorer : A single callable or dict mapping scorer name to the callable\n If it is a single callable, the return value for ``train_scores`` and\n ``test_scores`` is a single float.\n\n For a dict, it should be one mapping the scorer name to the scorer\n callable object / function.\n\n The callable object / fn should have signature\n ``scorer(estimator, X, y)``.\n\ntrain : array-like of shape (n_train_samples,)\n Indices of training samples.\n\ntest : array-like of shape (n_test_samples,)\n Indices of test samples.\n\nverbose : int\n The verbosity level.\n\nerror_score : 'raise' or numeric, default=np.nan\n Value to assign to the score if an error occurs in estimator fitting.\n If set to 'raise', the error is raised.\n If a numeric value is given, FitFailedWarning is raised.\n\nparameters : dict or None\n Parameters to be set on the estimator.\n\nfit_params : dict or None\n Parameters that will be passed to ``estimator.fit``.\n\nreturn_train_score : bool, default=False\n Compute and return score on training set.\n\nreturn_parameters : bool, default=False\n Return parameters that has been used for the estimator.\n\nsplit_progress : {list, tuple} of int, default=None\n A list or tuple of format (, ).\n\ncandidate_progress : {list, tuple} of int, default=None\n A list or tuple of format\n (, ).\n\nreturn_n_test_samples : bool, default=False\n Whether to return the ``n_test_samples``.\n\nreturn_times : bool, default=False\n Whether to return the fit/score times.\n\nreturn_estimator : bool, default=False\n Whether to return the fitted estimator.\n\nReturns\n-------\nresult : dict with the following attributes\n train_scores : dict of scorer name -> float\n Score on training set (for all the scorers),\n returned only if `return_train_score` is `True`.\n test_scores : dict of scorer name -> float\n Score on testing set (for all the scorers).\n n_test_samples : int\n Number of test samples.\n fit_time : float\n Time spent for fitting in seconds.\n score_time : float\n Time spent for scoring in seconds.\n parameters : dict or None\n The parameters that have been evaluated.\n estimator : estimator object\n The fitted estimator.\n fit_error : str or None\n Traceback str if the fit failed, None if the fit succeeded.", + "docstring": "Fit estimator and compute scores for a given dataset split.\n\n Parameters\n ----------\n estimator : estimator object implementing 'fit'\n The object to use to fit the data.\n\n X : array-like of shape (n_samples, n_features)\n The data to fit.\n\n y : array-like of shape (n_samples,) or (n_samples, n_outputs) or None\n The target variable to try to predict in the case of\n supervised learning.\n\n scorer : A single callable or dict mapping scorer name to the callable\n If it is a single callable, the return value for ``train_scores`` and\n ``test_scores`` is a single float.\n\n For a dict, it should be one mapping the scorer name to the scorer\n callable object / function.\n\n The callable object / fn should have 
signature\n ``scorer(estimator, X, y)``.\n\n train : array-like of shape (n_train_samples,)\n Indices of training samples.\n\n test : array-like of shape (n_test_samples,)\n Indices of test samples.\n\n verbose : int\n The verbosity level.\n\n error_score : 'raise' or numeric, default=np.nan\n Value to assign to the score if an error occurs in estimator fitting.\n If set to 'raise', the error is raised.\n If a numeric value is given, FitFailedWarning is raised.\n\n parameters : dict or None\n Parameters to be set on the estimator.\n\n fit_params : dict or None\n Parameters that will be passed to ``estimator.fit``.\n\n return_train_score : bool, default=False\n Compute and return score on training set.\n\n return_parameters : bool, default=False\n Return parameters that has been used for the estimator.\n\n split_progress : {list, tuple} of int, default=None\n A list or tuple of format (, ).\n\n candidate_progress : {list, tuple} of int, default=None\n A list or tuple of format\n (, ).\n\n return_n_test_samples : bool, default=False\n Whether to return the ``n_test_samples``.\n\n return_times : bool, default=False\n Whether to return the fit/score times.\n\n return_estimator : bool, default=False\n Whether to return the fitted estimator.\n\n Returns\n -------\n result : dict with the following attributes\n train_scores : dict of scorer name -> float\n Score on training set (for all the scorers),\n returned only if `return_train_score` is `True`.\n test_scores : dict of scorer name -> float\n Score on testing set (for all the scorers).\n n_test_samples : int\n Number of test samples.\n fit_time : float\n Time spent for fitting in seconds.\n score_time : float\n Time spent for scoring in seconds.\n parameters : dict or None\n The parameters that have been evaluated.\n estimator : estimator object\n The fitted estimator.\n fit_error : str or None\n Traceback str if the fit failed, None if the fit succeeded.\n ", "source_code": "\ndef _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score=False, return_parameters=False, return_n_test_samples=False, return_times=False, return_estimator=False, split_progress=None, candidate_progress=None, error_score=np.nan):\n \"\"\"Fit estimator and compute scores for a given dataset split.\n\n Parameters\n ----------\n estimator : estimator object implementing 'fit'\n The object to use to fit the data.\n\n X : array-like of shape (n_samples, n_features)\n The data to fit.\n\n y : array-like of shape (n_samples,) or (n_samples, n_outputs) or None\n The target variable to try to predict in the case of\n supervised learning.\n\n scorer : A single callable or dict mapping scorer name to the callable\n If it is a single callable, the return value for ``train_scores`` and\n ``test_scores`` is a single float.\n\n For a dict, it should be one mapping the scorer name to the scorer\n callable object / function.\n\n The callable object / fn should have signature\n ``scorer(estimator, X, y)``.\n\n train : array-like of shape (n_train_samples,)\n Indices of training samples.\n\n test : array-like of shape (n_test_samples,)\n Indices of test samples.\n\n verbose : int\n The verbosity level.\n\n error_score : 'raise' or numeric, default=np.nan\n Value to assign to the score if an error occurs in estimator fitting.\n If set to 'raise', the error is raised.\n If a numeric value is given, FitFailedWarning is raised.\n\n parameters : dict or None\n Parameters to be set on the estimator.\n\n fit_params : dict or None\n Parameters 
that will be passed to ``estimator.fit``.\n\n return_train_score : bool, default=False\n Compute and return score on training set.\n\n return_parameters : bool, default=False\n Return parameters that has been used for the estimator.\n\n split_progress : {list, tuple} of int, default=None\n A list or tuple of format (, ).\n\n candidate_progress : {list, tuple} of int, default=None\n A list or tuple of format\n (, ).\n\n return_n_test_samples : bool, default=False\n Whether to return the ``n_test_samples``.\n\n return_times : bool, default=False\n Whether to return the fit/score times.\n\n return_estimator : bool, default=False\n Whether to return the fitted estimator.\n\n Returns\n -------\n result : dict with the following attributes\n train_scores : dict of scorer name -> float\n Score on training set (for all the scorers),\n returned only if `return_train_score` is `True`.\n test_scores : dict of scorer name -> float\n Score on testing set (for all the scorers).\n n_test_samples : int\n Number of test samples.\n fit_time : float\n Time spent for fitting in seconds.\n score_time : float\n Time spent for scoring in seconds.\n parameters : dict or None\n The parameters that have been evaluated.\n estimator : estimator object\n The fitted estimator.\n fit_error : str or None\n Traceback str if the fit failed, None if the fit succeeded.\n \"\"\"\n if not isinstance(error_score, numbers.Number) and error_score != 'raise':\n raise ValueError(\"error_score must be the string 'raise' or a numeric value. (Hint: if using 'raise', please make sure that it has been spelled correctly.)\")\n progress_msg = ''\n if verbose > 2:\n if split_progress is not None:\n progress_msg = f' {split_progress[0] + 1}/{split_progress[1]}'\n if candidate_progress and verbose > 9:\n progress_msg += f'; {candidate_progress[0] + 1}/{candidate_progress[1]}'\n if verbose > 1:\n if parameters is None:\n params_msg = ''\n else:\n sorted_keys = sorted(parameters)\n params_msg = ', '.join((f'{k}={parameters[k]}' for k in sorted_keys))\n if verbose > 9:\n start_msg = f'[CV{progress_msg}] START {params_msg}'\n print(f\"{start_msg}{(80 - len(start_msg)) * '.'}\")\n fit_params = fit_params if fit_params is not None else {}\n fit_params = _check_fit_params(X, fit_params, train)\n if parameters is not None:\n cloned_parameters = {}\n for (k, v) in parameters.items():\n cloned_parameters[k] = clone(v, safe=False)\n estimator = estimator.set_params(**cloned_parameters)\n start_time = time.time()\n (X_train, y_train) = _safe_split(estimator, X, y, train)\n (X_test, y_test) = _safe_split(estimator, X, y, test, train)\n result = {}\n try:\n if y_train is None:\n estimator.fit(X_train, **fit_params)\n else:\n estimator.fit(X_train, y_train, **fit_params)\n except Exception:\n fit_time = time.time() - start_time\n score_time = 0.0\n if error_score == 'raise':\n raise\n elif isinstance(error_score, numbers.Number):\n if isinstance(scorer, dict):\n test_scores = {name: error_score for name in scorer}\n if return_train_score:\n train_scores = test_scores.copy()\n else:\n test_scores = error_score\n if return_train_score:\n train_scores = error_score\n result['fit_error'] = format_exc()\n else:\n result['fit_error'] = None\n fit_time = time.time() - start_time\n test_scores = _score(estimator, X_test, y_test, scorer, error_score)\n score_time = time.time() - start_time - fit_time\n if return_train_score:\n train_scores = _score(estimator, X_train, y_train, scorer, error_score)\n if verbose > 1:\n total_time = score_time + fit_time\n end_msg = 
f'[CV{progress_msg}] END '\n result_msg = params_msg + (';' if params_msg else '')\n if verbose > 2:\n if isinstance(test_scores, dict):\n for scorer_name in sorted(test_scores):\n result_msg += f' {scorer_name}: ('\n if return_train_score:\n scorer_scores = train_scores[scorer_name]\n result_msg += f'train={scorer_scores:.3f}, '\n result_msg += f'test={test_scores[scorer_name]:.3f})'\n else:\n result_msg += ', score='\n if return_train_score:\n result_msg += f'(train={train_scores:.3f}, test={test_scores:.3f})'\n else:\n result_msg += f'{test_scores:.3f}'\n result_msg += f' total time={logger.short_format_time(total_time)}'\n end_msg += '.' * (80 - len(end_msg) - len(result_msg))\n end_msg += result_msg\n print(end_msg)\n result['test_scores'] = test_scores\n if return_train_score:\n result['train_scores'] = train_scores\n if return_n_test_samples:\n result['n_test_samples'] = _num_samples(X_test)\n if return_times:\n result['fit_time'] = fit_time\n result['score_time'] = score_time\n if return_parameters:\n result['parameters'] = parameters\n if return_estimator:\n result['estimator'] = estimator\n return result" }, { @@ -134763,7 +145015,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -134773,7 +145026,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -134783,7 +145037,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "classes", @@ -134793,7 +145048,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "train", @@ -134803,7 +145059,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "test", @@ -134813,7 +145070,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "train_sizes", @@ -134823,7 +145081,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "scorer", @@ -134833,7 +145092,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "verbose", @@ -134843,7 +145103,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "return_times", @@ -134853,7 +145114,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "error_score", @@ -134863,7 +145125,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "fit_params", @@ -134873,7 +145136,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -134897,7 +145161,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "error_score", @@ -134907,13 +145172,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Insert error in `results` by replacing them inplace with `error_score`.\n\nThis only applies to multimetric scores because `_fit_and_score` will handle the single metric case.", - "docstring": "Insert error in `results` by replacing them inplace with `error_score`.\n\nThis only applies to multimetric scores because `_fit_and_score` will\nhandle the single metric case.", + "description": "Insert error in `results` by replacing them inplace with `error_score`.\n\nThis only applies to multimetric scores because `_fit_and_score` will\nhandle the single metric case.", + "docstring": "Insert error in `results` by replacing them 
inplace with `error_score`.\n\n This only applies to multimetric scores because `_fit_and_score` will\n handle the single metric case.\n ", "source_code": "\ndef _insert_error_scores(results, error_score):\n \"\"\"Insert error in `results` by replacing them inplace with `error_score`.\n\n This only applies to multimetric scores because `_fit_and_score` will\n handle the single metric case.\n \"\"\"\n successful_score = None\n failed_indices = []\n for (i, result) in enumerate(results):\n if result['fit_error'] is not None:\n failed_indices.append(i)\n elif successful_score is None:\n successful_score = result['test_scores']\n if successful_score is None:\n raise NotFittedError('All estimators failed to fit')\n if isinstance(successful_score, dict):\n formatted_error = {name: error_score for name in successful_score}\n for i in failed_indices:\n results[i]['test_scores'] = formatted_error.copy()\n if 'train_scores' in results[i]:\n results[i]['train_scores'] = formatted_error.copy()" }, { @@ -134931,7 +145197,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "scaler_score_key", @@ -134941,7 +145208,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -134965,7 +145233,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -134975,7 +145244,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -134985,7 +145255,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "groups", @@ -134995,7 +145266,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "cv", @@ -135005,7 +145277,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "scorer", @@ -135015,7 +145288,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "fit_params", @@ -135025,7 +145299,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -135049,7 +145324,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X_test", @@ -135059,7 +145335,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y_test", @@ -135069,7 +145346,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "scorer", @@ -135079,7 +145357,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "error_score", @@ -135089,13 +145368,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Compute the score(s) of an estimator on a given test set.\n\nWill return a dict of floats if `scorer` is a dict, otherwise a single float is returned.", - "docstring": "Compute the score(s) of an estimator on a given test set.\n\nWill return a dict of floats if `scorer` is a dict, otherwise a single\nfloat is returned.", + "description": "Compute the score(s) of an estimator on a given test set.\n\nWill return a dict of floats if `scorer` is a dict, otherwise a single\nfloat is returned.", + "docstring": "Compute the score(s) of an estimator on a given test set.\n\n Will return a dict of floats if `scorer` is a dict, otherwise a single\n float is returned.\n ", "source_code": "\ndef _score(estimator, X_test, y_test, scorer, error_score='raise'):\n \"\"\"Compute the 
score(s) of an estimator on a given test set.\n\n Will return a dict of floats if `scorer` is a dict, otherwise a single\n float is returned.\n \"\"\"\n if isinstance(scorer, dict):\n scorer = _MultimetricScorer(**scorer)\n try:\n if y_test is None:\n scores = scorer(estimator, X_test)\n else:\n scores = scorer(estimator, X_test, y_test)\n except Exception:\n if error_score == 'raise':\n raise\n else:\n if isinstance(scorer, _MultimetricScorer):\n scores = {name: error_score for name in scorer._scorers}\n else:\n scores = error_score\n warnings.warn(f'Scoring failed. The score on this train-test partition for these parameters will be set to {error_score}. Details: \\n{format_exc()}', UserWarning)\n error_msg = 'scoring must return a number, got %s (%s) instead. (scorer=%s)'\n if isinstance(scores, dict):\n for (name, score) in scores.items():\n if hasattr(score, 'item'):\n with suppress(ValueError):\n score = score.item()\n if not isinstance(score, numbers.Number):\n raise ValueError(error_msg % (score, type(score), name))\n scores[name] = score\n else:\n if hasattr(scores, 'item'):\n with suppress(ValueError):\n scores = scores.item()\n if not isinstance(scores, numbers.Number):\n raise ValueError(error_msg % (scores, type(scores), scorer))\n return scores" }, { @@ -135113,7 +145393,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "groups", @@ -135123,7 +145404,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "random_state", @@ -135133,7 +145415,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -135157,7 +145440,8 @@ "docstring": { "type": "array-like of shape (n_ticks,)", "description": "Numbers of training examples that will be used to generate the\nlearning curve. If the dtype is float, it is regarded as a\nfraction of 'n_max_training_samples', i.e. it has to be within (0, 1]." - } + }, + "refined_type": {} }, { "name": "n_max_training_samples", @@ -135167,13 +145451,14 @@ "docstring": { "type": "int", "description": "Maximum number of training samples (upper bound of 'train_sizes')." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Determine absolute sizes of training subsets and validate 'train_sizes'.\n\nExamples: _translate_train_sizes([0.5, 1.0], 10) -> [5, 10] _translate_train_sizes([5, 10], 10) -> [5, 10]", - "docstring": "Determine absolute sizes of training subsets and validate 'train_sizes'.\n\nExamples:\n _translate_train_sizes([0.5, 1.0], 10) -> [5, 10]\n _translate_train_sizes([5, 10], 10) -> [5, 10]\n\nParameters\n----------\ntrain_sizes : array-like of shape (n_ticks,)\n Numbers of training examples that will be used to generate the\n learning curve. If the dtype is float, it is regarded as a\n fraction of 'n_max_training_samples', i.e. it has to be within (0, 1].\n\nn_max_training_samples : int\n Maximum number of training samples (upper bound of 'train_sizes').\n\nReturns\n-------\ntrain_sizes_abs : array of shape (n_unique_ticks,)\n Numbers of training examples that will be used to generate the\n learning curve. 
Note that the number of ticks might be less\n than n_ticks because duplicate entries will be removed.", + "description": "Determine absolute sizes of training subsets and validate 'train_sizes'.\n\nExamples:\n _translate_train_sizes([0.5, 1.0], 10) -> [5, 10]\n _translate_train_sizes([5, 10], 10) -> [5, 10]", + "docstring": "Determine absolute sizes of training subsets and validate 'train_sizes'.\n\n Examples:\n _translate_train_sizes([0.5, 1.0], 10) -> [5, 10]\n _translate_train_sizes([5, 10], 10) -> [5, 10]\n\n Parameters\n ----------\n train_sizes : array-like of shape (n_ticks,)\n Numbers of training examples that will be used to generate the\n learning curve. If the dtype is float, it is regarded as a\n fraction of 'n_max_training_samples', i.e. it has to be within (0, 1].\n\n n_max_training_samples : int\n Maximum number of training samples (upper bound of 'train_sizes').\n\n Returns\n -------\n train_sizes_abs : array of shape (n_unique_ticks,)\n Numbers of training examples that will be used to generate the\n learning curve. Note that the number of ticks might be less\n than n_ticks because duplicate entries will be removed.\n ", "source_code": "\ndef _translate_train_sizes(train_sizes, n_max_training_samples):\n \"\"\"Determine absolute sizes of training subsets and validate 'train_sizes'.\n\n Examples:\n _translate_train_sizes([0.5, 1.0], 10) -> [5, 10]\n _translate_train_sizes([5, 10], 10) -> [5, 10]\n\n Parameters\n ----------\n train_sizes : array-like of shape (n_ticks,)\n Numbers of training examples that will be used to generate the\n learning curve. If the dtype is float, it is regarded as a\n fraction of 'n_max_training_samples', i.e. it has to be within (0, 1].\n\n n_max_training_samples : int\n Maximum number of training samples (upper bound of 'train_sizes').\n\n Returns\n -------\n train_sizes_abs : array of shape (n_unique_ticks,)\n Numbers of training examples that will be used to generate the\n learning curve. Note that the number of ticks might be less\n than n_ticks because duplicate entries will be removed.\n \"\"\"\n train_sizes_abs = np.asarray(train_sizes)\n n_ticks = train_sizes_abs.shape[0]\n n_min_required_samples = np.min(train_sizes_abs)\n n_max_required_samples = np.max(train_sizes_abs)\n if np.issubdtype(train_sizes_abs.dtype, np.floating):\n if n_min_required_samples <= 0.0 or n_max_required_samples > 1.0:\n raise ValueError('train_sizes has been interpreted as fractions of the maximum number of training samples and must be within (0, 1], but is within [%f, %f].' % (n_min_required_samples, n_max_required_samples))\n train_sizes_abs = (train_sizes_abs * n_max_training_samples).astype(dtype=int, copy=False)\n train_sizes_abs = np.clip(train_sizes_abs, 1, n_max_training_samples)\n elif n_min_required_samples <= 0 or n_max_required_samples > n_max_training_samples:\n raise ValueError('train_sizes has been interpreted as absolute numbers of training samples and must be within (0, %d], but is within [%d, %d].' % (n_max_training_samples, n_min_required_samples, n_max_required_samples))\n train_sizes_abs = np.unique(train_sizes_abs)\n if n_ticks > train_sizes_abs.shape[0]:\n warnings.warn(\"Removed duplicate entries from 'train_sizes'. 
Number of ticks will be less than the size of 'train_sizes': %d instead of %d.\" % (train_sizes_abs.shape[0], n_ticks), RuntimeWarning)\n return train_sizes_abs" }, { @@ -135191,7 +145476,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "error_score", @@ -135201,13 +145487,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _warn_about_fit_failures(results, error_score):\n fit_errors = [result['fit_error'] for result in results if result['fit_error'] is not None]\n if fit_errors:\n num_failed_fits = len(fit_errors)\n num_fits = len(results)\n fit_errors_counter = Counter(fit_errors)\n delimiter = '-' * 80 + '\\n'\n fit_errors_summary = '\\n'.join((f'{delimiter}{n} fits failed with the following error:\\n{error}' for (error, n) in fit_errors_counter.items()))\n some_fits_failed_message = f\"\\n{num_failed_fits} fits failed out of a total of {num_fits}.\\nThe score on these train-test partitions for these parameters will be set to {error_score}.\\nIf these failures are not expected, you can try to debug them by setting error_score='raise'.\\n\\nBelow are more details about the failures:\\n{fit_errors_summary}\"\n warnings.warn(some_fits_failed_message, FitFailedWarning)" }, { @@ -135225,7 +145512,8 @@ "docstring": { "type": "estimator object implementing 'fit' and 'predict'", "description": "The object to use to fit the data." - } + }, + "refined_type": {} }, { "name": "X", @@ -135235,7 +145523,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "The data to fit. Can be, for example a list, or an array at least 2d." - } + }, + "refined_type": {} }, { "name": "y", @@ -135245,7 +145534,8 @@ "docstring": { "type": "array-like of shape (n_samples,) or (n_samples, n_outputs), default=None", "description": "The target variable to try to predict in the case of\nsupervised learning." - } + }, + "refined_type": {} }, { "name": "groups", @@ -135255,7 +145545,8 @@ "docstring": { "type": "array-like of shape (n_samples,), default=None", "description": "Group labels for the samples used while splitting the dataset into\ntrain/test set. Only used in conjunction with a \"Group\" :term:`cv`\ninstance (e.g., :class:`GroupKFold`)." - } + }, + "refined_type": {} }, { "name": "cv", @@ -135264,8 +145555,9 @@ "assigned_by": "NAME_ONLY", "docstring": { "type": "int, cross-validation generator or an iterable, default=None", - "description": "Determines the cross-validation splitting strategy.\nPossible inputs for cv are:\n\n- None, to use the default 5-fold cross validation,\n- int, to specify the number of folds in a `(Stratified)KFold`,\n- :term:`CV splitter`,\n- An iterable yielding (train, test) splits as arrays of indices.\n\nFor int/None inputs, if the estimator is a classifier and ``y`` is\neither binary or multiclass, :class:`StratifiedKFold` is used. In all\nother cases, :class:`KFold` is used. These splitters are instantiated\nwith `shuffle=False` so the splits will be the same across calls.\n\nRefer :ref:`User Guide ` for the various\ncross-validation strategies that can be used here.\n\n.. versionchanged:: 0.22\n ``cv`` default value if None changed from 3-fold to 5-fold." 
- } + "description": "Determines the cross-validation splitting strategy.\nPossible inputs for cv are:\n\n- None, to use the default 5-fold cross validation,\n- int, to specify the number of folds in a `(Stratified)KFold`,\n- :term:`CV splitter`,\n- An iterable that generates (train, test) splits as arrays of indices.\n\nFor int/None inputs, if the estimator is a classifier and ``y`` is\neither binary or multiclass, :class:`StratifiedKFold` is used. In all\nother cases, :class:`KFold` is used. These splitters are instantiated\nwith `shuffle=False` so the splits will be the same across calls.\n\nRefer :ref:`User Guide ` for the various\ncross-validation strategies that can be used here.\n\n.. versionchanged:: 0.22\n ``cv`` default value if None changed from 3-fold to 5-fold." + }, + "refined_type": {} }, { "name": "n_jobs", @@ -135275,7 +145567,8 @@ "docstring": { "type": "int, default=None", "description": "Number of jobs to run in parallel. Training the estimator and\npredicting are parallelized over the cross-validation splits.\n``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n``-1`` means using all processors. See :term:`Glossary `\nfor more details." - } + }, + "refined_type": {} }, { "name": "verbose", @@ -135285,7 +145578,8 @@ "docstring": { "type": "int, default=0", "description": "The verbosity level." - } + }, + "refined_type": {} }, { "name": "fit_params", @@ -135295,7 +145589,8 @@ "docstring": { "type": "dict, default=None", "description": "Parameters to pass to the fit method of the estimator." - } + }, + "refined_type": {} }, { "name": "pre_dispatch", @@ -135305,7 +145600,8 @@ "docstring": { "type": "int or str, default='2*n_jobs'", "description": "Controls the number of jobs that get dispatched during parallel\nexecution. Reducing this number can be useful to avoid an\nexplosion of memory consumption when more jobs get dispatched\nthan CPUs can process. This parameter can be:\n\n - None, in which case all the jobs are immediately\n created and spawned. Use this for lightweight and\n fast-running jobs, to avoid delays due to on-demand\n spawning of the jobs\n\n - An int, giving the exact number of total jobs that are\n spawned\n\n - A str, giving an expression as a function of n_jobs,\n as in '2*n_jobs'" - } + }, + "refined_type": {} }, { "name": "method", @@ -135315,14 +145611,23 @@ "docstring": { "type": "{'predict', 'predict_proba', 'predict_log_proba', 'decision_function'}, default='predict'", "description": "The method to be invoked by `estimator`." + }, + "refined_type": { + "kind": "EnumType", + "values": [ + "predict_log_proba", + "predict", + "decision_function", + "predict_proba" + ] } } ], "results": [], "is_public": true, - "description": "Generate cross-validated estimates for each input data point\n\nThe data is split according to the cv parameter. Each sample belongs to exactly one test set, and its prediction is computed with an estimator fitted on the corresponding training set. Passing these predictions into an evaluation metric may not be a valid way to measure generalization performance. Results can differ from :func:`cross_validate` and :func:`cross_val_score` unless all tests sets have equal size and the metric decomposes over samples. Read more in the :ref:`User Guide `.", - "docstring": "Generate cross-validated estimates for each input data point\n\nThe data is split according to the cv parameter. 
Each sample belongs\nto exactly one test set, and its prediction is computed with an\nestimator fitted on the corresponding training set.\n\nPassing these predictions into an evaluation metric may not be a valid\nway to measure generalization performance. Results can differ from\n:func:`cross_validate` and :func:`cross_val_score` unless all tests sets\nhave equal size and the metric decomposes over samples.\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\nestimator : estimator object implementing 'fit' and 'predict'\n The object to use to fit the data.\n\nX : array-like of shape (n_samples, n_features)\n The data to fit. Can be, for example a list, or an array at least 2d.\n\ny : array-like of shape (n_samples,) or (n_samples, n_outputs), default=None\n The target variable to try to predict in the case of\n supervised learning.\n\ngroups : array-like of shape (n_samples,), default=None\n Group labels for the samples used while splitting the dataset into\n train/test set. Only used in conjunction with a \"Group\" :term:`cv`\n instance (e.g., :class:`GroupKFold`).\n\ncv : int, cross-validation generator or an iterable, default=None\n Determines the cross-validation splitting strategy.\n Possible inputs for cv are:\n\n - None, to use the default 5-fold cross validation,\n - int, to specify the number of folds in a `(Stratified)KFold`,\n - :term:`CV splitter`,\n - An iterable yielding (train, test) splits as arrays of indices.\n\n For int/None inputs, if the estimator is a classifier and ``y`` is\n either binary or multiclass, :class:`StratifiedKFold` is used. In all\n other cases, :class:`KFold` is used. These splitters are instantiated\n with `shuffle=False` so the splits will be the same across calls.\n\n Refer :ref:`User Guide ` for the various\n cross-validation strategies that can be used here.\n\n .. versionchanged:: 0.22\n ``cv`` default value if None changed from 3-fold to 5-fold.\n\nn_jobs : int, default=None\n Number of jobs to run in parallel. Training the estimator and\n predicting are parallelized over the cross-validation splits.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\nverbose : int, default=0\n The verbosity level.\n\nfit_params : dict, default=None\n Parameters to pass to the fit method of the estimator.\n\npre_dispatch : int or str, default='2*n_jobs'\n Controls the number of jobs that get dispatched during parallel\n execution. Reducing this number can be useful to avoid an\n explosion of memory consumption when more jobs get dispatched\n than CPUs can process. This parameter can be:\n\n - None, in which case all the jobs are immediately\n created and spawned. Use this for lightweight and\n fast-running jobs, to avoid delays due to on-demand\n spawning of the jobs\n\n - An int, giving the exact number of total jobs that are\n spawned\n\n - A str, giving an expression as a function of n_jobs,\n as in '2*n_jobs'\n\nmethod : {'predict', 'predict_proba', 'predict_log_proba', 'decision_function'}, default='predict'\n The method to be invoked by `estimator`.\n\nReturns\n-------\npredictions : ndarray\n This is the result of calling `method`. 
Shape:\n\n - When `method` is 'predict' and in special case where `method` is\n 'decision_function' and the target is binary: (n_samples,)\n - When `method` is one of {'predict_proba', 'predict_log_proba',\n 'decision_function'} (unless special case above):\n (n_samples, n_classes)\n - If `estimator` is :term:`multioutput`, an extra dimension\n 'n_outputs' is added to the end of each shape above.\n\nSee Also\n--------\ncross_val_score : Calculate score for each CV split.\ncross_validate : Calculate one or more scores and timings for each CV\n split.\n\nNotes\n-----\nIn the case that one or more classes are absent in a training portion, a\ndefault score needs to be assigned to all instances for that class if\n``method`` produces columns per class, as in {'decision_function',\n'predict_proba', 'predict_log_proba'}. For ``predict_proba`` this value is\n0. In order to ensure finite output, we approximate negative infinity by\nthe minimum finite float value for the dtype in other cases.\n\nExamples\n--------\n>>> from sklearn import datasets, linear_model\n>>> from sklearn.model_selection import cross_val_predict\n>>> diabetes = datasets.load_diabetes()\n>>> X = diabetes.data[:150]\n>>> y = diabetes.target[:150]\n>>> lasso = linear_model.Lasso()\n>>> y_pred = cross_val_predict(lasso, X, y, cv=3)", - "source_code": "\ndef cross_val_predict(estimator, X, y=None, *, groups=None, cv=None, n_jobs=None, verbose=0, fit_params=None, pre_dispatch='2*n_jobs', method='predict'):\n \"\"\"Generate cross-validated estimates for each input data point\n\n The data is split according to the cv parameter. Each sample belongs\n to exactly one test set, and its prediction is computed with an\n estimator fitted on the corresponding training set.\n\n Passing these predictions into an evaluation metric may not be a valid\n way to measure generalization performance. Results can differ from\n :func:`cross_validate` and :func:`cross_val_score` unless all tests sets\n have equal size and the metric decomposes over samples.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n estimator : estimator object implementing 'fit' and 'predict'\n The object to use to fit the data.\n\n X : array-like of shape (n_samples, n_features)\n The data to fit. Can be, for example a list, or an array at least 2d.\n\n y : array-like of shape (n_samples,) or (n_samples, n_outputs), default=None\n The target variable to try to predict in the case of\n supervised learning.\n\n groups : array-like of shape (n_samples,), default=None\n Group labels for the samples used while splitting the dataset into\n train/test set. Only used in conjunction with a \"Group\" :term:`cv`\n instance (e.g., :class:`GroupKFold`).\n\n cv : int, cross-validation generator or an iterable, default=None\n Determines the cross-validation splitting strategy.\n Possible inputs for cv are:\n\n - None, to use the default 5-fold cross validation,\n - int, to specify the number of folds in a `(Stratified)KFold`,\n - :term:`CV splitter`,\n - An iterable yielding (train, test) splits as arrays of indices.\n\n For int/None inputs, if the estimator is a classifier and ``y`` is\n either binary or multiclass, :class:`StratifiedKFold` is used. In all\n other cases, :class:`KFold` is used. These splitters are instantiated\n with `shuffle=False` so the splits will be the same across calls.\n\n Refer :ref:`User Guide ` for the various\n cross-validation strategies that can be used here.\n\n .. 
versionchanged:: 0.22\n ``cv`` default value if None changed from 3-fold to 5-fold.\n\n n_jobs : int, default=None\n Number of jobs to run in parallel. Training the estimator and\n predicting are parallelized over the cross-validation splits.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n verbose : int, default=0\n The verbosity level.\n\n fit_params : dict, default=None\n Parameters to pass to the fit method of the estimator.\n\n pre_dispatch : int or str, default='2*n_jobs'\n Controls the number of jobs that get dispatched during parallel\n execution. Reducing this number can be useful to avoid an\n explosion of memory consumption when more jobs get dispatched\n than CPUs can process. This parameter can be:\n\n - None, in which case all the jobs are immediately\n created and spawned. Use this for lightweight and\n fast-running jobs, to avoid delays due to on-demand\n spawning of the jobs\n\n - An int, giving the exact number of total jobs that are\n spawned\n\n - A str, giving an expression as a function of n_jobs,\n as in '2*n_jobs'\n\n method : {'predict', 'predict_proba', 'predict_log_proba', 'decision_function'}, default='predict'\n The method to be invoked by `estimator`.\n\n Returns\n -------\n predictions : ndarray\n This is the result of calling `method`. Shape:\n\n - When `method` is 'predict' and in special case where `method` is\n 'decision_function' and the target is binary: (n_samples,)\n - When `method` is one of {'predict_proba', 'predict_log_proba',\n 'decision_function'} (unless special case above):\n (n_samples, n_classes)\n - If `estimator` is :term:`multioutput`, an extra dimension\n 'n_outputs' is added to the end of each shape above.\n\n See Also\n --------\n cross_val_score : Calculate score for each CV split.\n cross_validate : Calculate one or more scores and timings for each CV\n split.\n\n Notes\n -----\n In the case that one or more classes are absent in a training portion, a\n default score needs to be assigned to all instances for that class if\n ``method`` produces columns per class, as in {'decision_function',\n 'predict_proba', 'predict_log_proba'}. For ``predict_proba`` this value is\n 0. 
In order to ensure finite output, we approximate negative infinity by\n the minimum finite float value for the dtype in other cases.\n\n Examples\n --------\n >>> from sklearn import datasets, linear_model\n >>> from sklearn.model_selection import cross_val_predict\n >>> diabetes = datasets.load_diabetes()\n >>> X = diabetes.data[:150]\n >>> y = diabetes.target[:150]\n >>> lasso = linear_model.Lasso()\n >>> y_pred = cross_val_predict(lasso, X, y, cv=3)\n \"\"\"\n (X, y, groups) = indexable(X, y, groups)\n cv = check_cv(cv, y, classifier=is_classifier(estimator))\n splits = list(cv.split(X, y, groups))\n test_indices = np.concatenate([test for (_, test) in splits])\n if not _check_is_permutation(test_indices, _num_samples(X)):\n raise ValueError('cross_val_predict only works for partitions')\n encode = method in ['decision_function', 'predict_proba', 'predict_log_proba'] and y is not None\n if encode:\n y = np.asarray(y)\n if y.ndim == 1:\n le = LabelEncoder()\n y = le.fit_transform(y)\n elif y.ndim == 2:\n y_enc = np.zeros_like(y, dtype=int)\n for i_label in range(y.shape[1]):\n y_enc[:, i_label] = LabelEncoder().fit_transform(y[:, i_label])\n y = y_enc\n parallel = Parallel(n_jobs=n_jobs, verbose=verbose, pre_dispatch=pre_dispatch)\n predictions = parallel((delayed(_fit_and_predict)(clone(estimator), X, y, train, test, verbose, fit_params, method) for (train, test) in splits))\n inv_test_indices = np.empty(len(test_indices), dtype=int)\n inv_test_indices[test_indices] = np.arange(len(test_indices))\n if sp.issparse(predictions[0]):\n predictions = sp.vstack(predictions, format=predictions[0].format)\n elif encode and isinstance(predictions[0], list):\n n_labels = y.shape[1]\n concat_pred = []\n for i_label in range(n_labels):\n label_preds = np.concatenate([p[i_label] for p in predictions])\n concat_pred.append(label_preds)\n predictions = concat_pred\n else:\n predictions = np.concatenate(predictions)\n if isinstance(predictions, list):\n return [p[inv_test_indices] for p in predictions]\n else:\n return predictions[inv_test_indices]" + "description": "Generate cross-validated estimates for each input data point.\n\nThe data is split according to the cv parameter. Each sample belongs\nto exactly one test set, and its prediction is computed with an\nestimator fitted on the corresponding training set.\n\nPassing these predictions into an evaluation metric may not be a valid\nway to measure generalization performance. Results can differ from\n:func:`cross_validate` and :func:`cross_val_score` unless all tests sets\nhave equal size and the metric decomposes over samples.\n\nRead more in the :ref:`User Guide `.", + "docstring": "Generate cross-validated estimates for each input data point.\n\n The data is split according to the cv parameter. Each sample belongs\n to exactly one test set, and its prediction is computed with an\n estimator fitted on the corresponding training set.\n\n Passing these predictions into an evaluation metric may not be a valid\n way to measure generalization performance. Results can differ from\n :func:`cross_validate` and :func:`cross_val_score` unless all tests sets\n have equal size and the metric decomposes over samples.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n estimator : estimator object implementing 'fit' and 'predict'\n The object to use to fit the data.\n\n X : array-like of shape (n_samples, n_features)\n The data to fit. 
Can be, for example a list, or an array at least 2d.\n\n y : array-like of shape (n_samples,) or (n_samples, n_outputs), default=None\n The target variable to try to predict in the case of\n supervised learning.\n\n groups : array-like of shape (n_samples,), default=None\n Group labels for the samples used while splitting the dataset into\n train/test set. Only used in conjunction with a \"Group\" :term:`cv`\n instance (e.g., :class:`GroupKFold`).\n\n cv : int, cross-validation generator or an iterable, default=None\n Determines the cross-validation splitting strategy.\n Possible inputs for cv are:\n\n - None, to use the default 5-fold cross validation,\n - int, to specify the number of folds in a `(Stratified)KFold`,\n - :term:`CV splitter`,\n - An iterable that generates (train, test) splits as arrays of indices.\n\n For int/None inputs, if the estimator is a classifier and ``y`` is\n either binary or multiclass, :class:`StratifiedKFold` is used. In all\n other cases, :class:`KFold` is used. These splitters are instantiated\n with `shuffle=False` so the splits will be the same across calls.\n\n Refer :ref:`User Guide ` for the various\n cross-validation strategies that can be used here.\n\n .. versionchanged:: 0.22\n ``cv`` default value if None changed from 3-fold to 5-fold.\n\n n_jobs : int, default=None\n Number of jobs to run in parallel. Training the estimator and\n predicting are parallelized over the cross-validation splits.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n verbose : int, default=0\n The verbosity level.\n\n fit_params : dict, default=None\n Parameters to pass to the fit method of the estimator.\n\n pre_dispatch : int or str, default='2*n_jobs'\n Controls the number of jobs that get dispatched during parallel\n execution. Reducing this number can be useful to avoid an\n explosion of memory consumption when more jobs get dispatched\n than CPUs can process. This parameter can be:\n\n - None, in which case all the jobs are immediately\n created and spawned. Use this for lightweight and\n fast-running jobs, to avoid delays due to on-demand\n spawning of the jobs\n\n - An int, giving the exact number of total jobs that are\n spawned\n\n - A str, giving an expression as a function of n_jobs,\n as in '2*n_jobs'\n\n method : {'predict', 'predict_proba', 'predict_log_proba', 'decision_function'}, default='predict'\n The method to be invoked by `estimator`.\n\n Returns\n -------\n predictions : ndarray\n This is the result of calling `method`. Shape:\n\n - When `method` is 'predict' and in special case where `method` is\n 'decision_function' and the target is binary: (n_samples,)\n - When `method` is one of {'predict_proba', 'predict_log_proba',\n 'decision_function'} (unless special case above):\n (n_samples, n_classes)\n - If `estimator` is :term:`multioutput`, an extra dimension\n 'n_outputs' is added to the end of each shape above.\n\n See Also\n --------\n cross_val_score : Calculate score for each CV split.\n cross_validate : Calculate one or more scores and timings for each CV\n split.\n\n Notes\n -----\n In the case that one or more classes are absent in a training portion, a\n default score needs to be assigned to all instances for that class if\n ``method`` produces columns per class, as in {'decision_function',\n 'predict_proba', 'predict_log_proba'}. For ``predict_proba`` this value is\n 0. 
In order to ensure finite output, we approximate negative infinity by\n the minimum finite float value for the dtype in other cases.\n\n Examples\n --------\n >>> from sklearn import datasets, linear_model\n >>> from sklearn.model_selection import cross_val_predict\n >>> diabetes = datasets.load_diabetes()\n >>> X = diabetes.data[:150]\n >>> y = diabetes.target[:150]\n >>> lasso = linear_model.Lasso()\n >>> y_pred = cross_val_predict(lasso, X, y, cv=3)\n ", + "source_code": "\ndef cross_val_predict(estimator, X, y=None, *, groups=None, cv=None, n_jobs=None, verbose=0, fit_params=None, pre_dispatch='2*n_jobs', method='predict'):\n \"\"\"Generate cross-validated estimates for each input data point.\n\n The data is split according to the cv parameter. Each sample belongs\n to exactly one test set, and its prediction is computed with an\n estimator fitted on the corresponding training set.\n\n Passing these predictions into an evaluation metric may not be a valid\n way to measure generalization performance. Results can differ from\n :func:`cross_validate` and :func:`cross_val_score` unless all tests sets\n have equal size and the metric decomposes over samples.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n estimator : estimator object implementing 'fit' and 'predict'\n The object to use to fit the data.\n\n X : array-like of shape (n_samples, n_features)\n The data to fit. Can be, for example a list, or an array at least 2d.\n\n y : array-like of shape (n_samples,) or (n_samples, n_outputs), default=None\n The target variable to try to predict in the case of\n supervised learning.\n\n groups : array-like of shape (n_samples,), default=None\n Group labels for the samples used while splitting the dataset into\n train/test set. Only used in conjunction with a \"Group\" :term:`cv`\n instance (e.g., :class:`GroupKFold`).\n\n cv : int, cross-validation generator or an iterable, default=None\n Determines the cross-validation splitting strategy.\n Possible inputs for cv are:\n\n - None, to use the default 5-fold cross validation,\n - int, to specify the number of folds in a `(Stratified)KFold`,\n - :term:`CV splitter`,\n - An iterable that generates (train, test) splits as arrays of indices.\n\n For int/None inputs, if the estimator is a classifier and ``y`` is\n either binary or multiclass, :class:`StratifiedKFold` is used. In all\n other cases, :class:`KFold` is used. These splitters are instantiated\n with `shuffle=False` so the splits will be the same across calls.\n\n Refer :ref:`User Guide ` for the various\n cross-validation strategies that can be used here.\n\n .. versionchanged:: 0.22\n ``cv`` default value if None changed from 3-fold to 5-fold.\n\n n_jobs : int, default=None\n Number of jobs to run in parallel. Training the estimator and\n predicting are parallelized over the cross-validation splits.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n verbose : int, default=0\n The verbosity level.\n\n fit_params : dict, default=None\n Parameters to pass to the fit method of the estimator.\n\n pre_dispatch : int or str, default='2*n_jobs'\n Controls the number of jobs that get dispatched during parallel\n execution. Reducing this number can be useful to avoid an\n explosion of memory consumption when more jobs get dispatched\n than CPUs can process. This parameter can be:\n\n - None, in which case all the jobs are immediately\n created and spawned. 
Use this for lightweight and\n fast-running jobs, to avoid delays due to on-demand\n spawning of the jobs\n\n - An int, giving the exact number of total jobs that are\n spawned\n\n - A str, giving an expression as a function of n_jobs,\n as in '2*n_jobs'\n\n method : {'predict', 'predict_proba', 'predict_log_proba', 'decision_function'}, default='predict'\n The method to be invoked by `estimator`.\n\n Returns\n -------\n predictions : ndarray\n This is the result of calling `method`. Shape:\n\n - When `method` is 'predict' and in special case where `method` is\n 'decision_function' and the target is binary: (n_samples,)\n - When `method` is one of {'predict_proba', 'predict_log_proba',\n 'decision_function'} (unless special case above):\n (n_samples, n_classes)\n - If `estimator` is :term:`multioutput`, an extra dimension\n 'n_outputs' is added to the end of each shape above.\n\n See Also\n --------\n cross_val_score : Calculate score for each CV split.\n cross_validate : Calculate one or more scores and timings for each CV\n split.\n\n Notes\n -----\n In the case that one or more classes are absent in a training portion, a\n default score needs to be assigned to all instances for that class if\n ``method`` produces columns per class, as in {'decision_function',\n 'predict_proba', 'predict_log_proba'}. For ``predict_proba`` this value is\n 0. In order to ensure finite output, we approximate negative infinity by\n the minimum finite float value for the dtype in other cases.\n\n Examples\n --------\n >>> from sklearn import datasets, linear_model\n >>> from sklearn.model_selection import cross_val_predict\n >>> diabetes = datasets.load_diabetes()\n >>> X = diabetes.data[:150]\n >>> y = diabetes.target[:150]\n >>> lasso = linear_model.Lasso()\n >>> y_pred = cross_val_predict(lasso, X, y, cv=3)\n \"\"\"\n (X, y, groups) = indexable(X, y, groups)\n cv = check_cv(cv, y, classifier=is_classifier(estimator))\n splits = list(cv.split(X, y, groups))\n test_indices = np.concatenate([test for (_, test) in splits])\n if not _check_is_permutation(test_indices, _num_samples(X)):\n raise ValueError('cross_val_predict only works for partitions')\n encode = method in ['decision_function', 'predict_proba', 'predict_log_proba'] and y is not None\n if encode:\n y = np.asarray(y)\n if y.ndim == 1:\n le = LabelEncoder()\n y = le.fit_transform(y)\n elif y.ndim == 2:\n y_enc = np.zeros_like(y, dtype=int)\n for i_label in range(y.shape[1]):\n y_enc[:, i_label] = LabelEncoder().fit_transform(y[:, i_label])\n y = y_enc\n parallel = Parallel(n_jobs=n_jobs, verbose=verbose, pre_dispatch=pre_dispatch)\n predictions = parallel((delayed(_fit_and_predict)(clone(estimator), X, y, train, test, verbose, fit_params, method) for (train, test) in splits))\n inv_test_indices = np.empty(len(test_indices), dtype=int)\n inv_test_indices[test_indices] = np.arange(len(test_indices))\n if sp.issparse(predictions[0]):\n predictions = sp.vstack(predictions, format=predictions[0].format)\n elif encode and isinstance(predictions[0], list):\n n_labels = y.shape[1]\n concat_pred = []\n for i_label in range(n_labels):\n label_preds = np.concatenate([p[i_label] for p in predictions])\n concat_pred.append(label_preds)\n predictions = concat_pred\n else:\n predictions = np.concatenate(predictions)\n if isinstance(predictions, list):\n return [p[inv_test_indices] for p in predictions]\n else:\n return predictions[inv_test_indices]" }, { "name": "cross_val_score", @@ -135339,7 +145644,8 @@ "docstring": { "type": "estimator object implementing 
'fit'", "description": "The object to use to fit the data." - } + }, + "refined_type": {} }, { "name": "X", @@ -135349,7 +145655,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "The data to fit. Can be for example a list, or an array." - } + }, + "refined_type": {} }, { "name": "y", @@ -135359,7 +145666,8 @@ "docstring": { "type": "array-like of shape (n_samples,) or (n_samples, n_outputs), default=None", "description": "The target variable to try to predict in the case of\nsupervised learning." - } + }, + "refined_type": {} }, { "name": "groups", @@ -135369,7 +145677,8 @@ "docstring": { "type": "array-like of shape (n_samples,), default=None", "description": "Group labels for the samples used while splitting the dataset into\ntrain/test set. Only used in conjunction with a \"Group\" :term:`cv`\ninstance (e.g., :class:`GroupKFold`)." - } + }, + "refined_type": {} }, { "name": "scoring", @@ -135379,7 +145688,8 @@ "docstring": { "type": "str or callable, default=None", "description": "A str (see model evaluation documentation) or\na scorer callable object / function with signature\n``scorer(estimator, X, y)`` which should return only\na single value.\n\nSimilar to :func:`cross_validate`\nbut only a single metric is permitted.\n\nIf `None`, the estimator's default scorer (if available) is used." - } + }, + "refined_type": {} }, { "name": "cv", @@ -135388,8 +145698,9 @@ "assigned_by": "NAME_ONLY", "docstring": { "type": "int, cross-validation generator or an iterable, default=None", - "description": "Determines the cross-validation splitting strategy.\nPossible inputs for cv are:\n\n- `None`, to use the default 5-fold cross validation,\n- int, to specify the number of folds in a `(Stratified)KFold`,\n- :term:`CV splitter`,\n- An iterable yielding (train, test) splits as arrays of indices.\n\nFor `int`/`None` inputs, if the estimator is a classifier and `y` is\neither binary or multiclass, :class:`StratifiedKFold` is used. In all\nother cases, :class:`KFold` is used. These splitters are instantiated\nwith `shuffle=False` so the splits will be the same across calls.\n\nRefer :ref:`User Guide ` for the various\ncross-validation strategies that can be used here.\n\n.. versionchanged:: 0.22\n `cv` default value if `None` changed from 3-fold to 5-fold." - } + "description": "Determines the cross-validation splitting strategy.\nPossible inputs for cv are:\n\n- `None`, to use the default 5-fold cross validation,\n- int, to specify the number of folds in a `(Stratified)KFold`,\n- :term:`CV splitter`,\n- An iterable that generates (train, test) splits as arrays of indices.\n\nFor `int`/`None` inputs, if the estimator is a classifier and `y` is\neither binary or multiclass, :class:`StratifiedKFold` is used. In all\nother cases, :class:`KFold` is used. These splitters are instantiated\nwith `shuffle=False` so the splits will be the same across calls.\n\nRefer :ref:`User Guide ` for the various\ncross-validation strategies that can be used here.\n\n.. versionchanged:: 0.22\n `cv` default value if `None` changed from 3-fold to 5-fold." + }, + "refined_type": {} }, { "name": "n_jobs", @@ -135399,7 +145710,8 @@ "docstring": { "type": "int, default=None", "description": "Number of jobs to run in parallel. Training the estimator and computing\nthe score are parallelized over the cross-validation splits.\n``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n``-1`` means using all processors. See :term:`Glossary `\nfor more details." 
- } + }, + "refined_type": {} }, { "name": "verbose", @@ -135409,7 +145721,8 @@ "docstring": { "type": "int, default=0", "description": "The verbosity level." - } + }, + "refined_type": {} }, { "name": "fit_params", @@ -135419,7 +145732,8 @@ "docstring": { "type": "dict, default=None", "description": "Parameters to pass to the fit method of the estimator." - } + }, + "refined_type": {} }, { "name": "pre_dispatch", @@ -135429,7 +145743,8 @@ "docstring": { "type": "int or str, default='2*n_jobs'", "description": "Controls the number of jobs that get dispatched during parallel\nexecution. Reducing this number can be useful to avoid an\nexplosion of memory consumption when more jobs get dispatched\nthan CPUs can process. This parameter can be:\n\n - ``None``, in which case all the jobs are immediately\n created and spawned. Use this for lightweight and\n fast-running jobs, to avoid delays due to on-demand\n spawning of the jobs\n\n - An int, giving the exact number of total jobs that are\n spawned\n\n - A str, giving an expression as a function of n_jobs,\n as in '2*n_jobs'" - } + }, + "refined_type": {} }, { "name": "error_score", @@ -135439,14 +145754,15 @@ "docstring": { "type": "'raise' or numeric, default=np.nan", "description": "Value to assign to the score if an error occurs in estimator fitting.\nIf set to 'raise', the error is raised.\nIf a numeric value is given, FitFailedWarning is raised.\n\n.. versionadded:: 0.20" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Evaluate a score by cross-validation\n\nRead more in the :ref:`User Guide `.", - "docstring": "Evaluate a score by cross-validation\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\nestimator : estimator object implementing 'fit'\n The object to use to fit the data.\n\nX : array-like of shape (n_samples, n_features)\n The data to fit. Can be for example a list, or an array.\n\ny : array-like of shape (n_samples,) or (n_samples, n_outputs), default=None\n The target variable to try to predict in the case of\n supervised learning.\n\ngroups : array-like of shape (n_samples,), default=None\n Group labels for the samples used while splitting the dataset into\n train/test set. Only used in conjunction with a \"Group\" :term:`cv`\n instance (e.g., :class:`GroupKFold`).\n\nscoring : str or callable, default=None\n A str (see model evaluation documentation) or\n a scorer callable object / function with signature\n ``scorer(estimator, X, y)`` which should return only\n a single value.\n\n Similar to :func:`cross_validate`\n but only a single metric is permitted.\n\n If `None`, the estimator's default scorer (if available) is used.\n\ncv : int, cross-validation generator or an iterable, default=None\n Determines the cross-validation splitting strategy.\n Possible inputs for cv are:\n\n - `None`, to use the default 5-fold cross validation,\n - int, to specify the number of folds in a `(Stratified)KFold`,\n - :term:`CV splitter`,\n - An iterable yielding (train, test) splits as arrays of indices.\n\n For `int`/`None` inputs, if the estimator is a classifier and `y` is\n either binary or multiclass, :class:`StratifiedKFold` is used. In all\n other cases, :class:`KFold` is used. These splitters are instantiated\n with `shuffle=False` so the splits will be the same across calls.\n\n Refer :ref:`User Guide ` for the various\n cross-validation strategies that can be used here.\n\n .. 
versionchanged:: 0.22\n `cv` default value if `None` changed from 3-fold to 5-fold.\n\nn_jobs : int, default=None\n Number of jobs to run in parallel. Training the estimator and computing\n the score are parallelized over the cross-validation splits.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\nverbose : int, default=0\n The verbosity level.\n\nfit_params : dict, default=None\n Parameters to pass to the fit method of the estimator.\n\npre_dispatch : int or str, default='2*n_jobs'\n Controls the number of jobs that get dispatched during parallel\n execution. Reducing this number can be useful to avoid an\n explosion of memory consumption when more jobs get dispatched\n than CPUs can process. This parameter can be:\n\n - ``None``, in which case all the jobs are immediately\n created and spawned. Use this for lightweight and\n fast-running jobs, to avoid delays due to on-demand\n spawning of the jobs\n\n - An int, giving the exact number of total jobs that are\n spawned\n\n - A str, giving an expression as a function of n_jobs,\n as in '2*n_jobs'\n\nerror_score : 'raise' or numeric, default=np.nan\n Value to assign to the score if an error occurs in estimator fitting.\n If set to 'raise', the error is raised.\n If a numeric value is given, FitFailedWarning is raised.\n\n .. versionadded:: 0.20\n\nReturns\n-------\nscores : ndarray of float of shape=(len(list(cv)),)\n Array of scores of the estimator for each run of the cross validation.\n\nExamples\n--------\n>>> from sklearn import datasets, linear_model\n>>> from sklearn.model_selection import cross_val_score\n>>> diabetes = datasets.load_diabetes()\n>>> X = diabetes.data[:150]\n>>> y = diabetes.target[:150]\n>>> lasso = linear_model.Lasso()\n>>> print(cross_val_score(lasso, X, y, cv=3))\n[0.33150734 0.08022311 0.03531764]\n\nSee Also\n---------\ncross_validate : To run cross-validation on multiple metrics and also to\n return train scores, fit times and score times.\n\ncross_val_predict : Get predictions from each split of cross-validation for\n diagnostic purposes.\n\nsklearn.metrics.make_scorer : Make a scorer from a performance metric or\n loss function.", - "source_code": "\ndef cross_val_score(estimator, X, y=None, *, groups=None, scoring=None, cv=None, n_jobs=None, verbose=0, fit_params=None, pre_dispatch='2*n_jobs', error_score=np.nan):\n \"\"\"Evaluate a score by cross-validation\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n estimator : estimator object implementing 'fit'\n The object to use to fit the data.\n\n X : array-like of shape (n_samples, n_features)\n The data to fit. Can be for example a list, or an array.\n\n y : array-like of shape (n_samples,) or (n_samples, n_outputs), default=None\n The target variable to try to predict in the case of\n supervised learning.\n\n groups : array-like of shape (n_samples,), default=None\n Group labels for the samples used while splitting the dataset into\n train/test set. 
Only used in conjunction with a \"Group\" :term:`cv`\n instance (e.g., :class:`GroupKFold`).\n\n scoring : str or callable, default=None\n A str (see model evaluation documentation) or\n a scorer callable object / function with signature\n ``scorer(estimator, X, y)`` which should return only\n a single value.\n\n Similar to :func:`cross_validate`\n but only a single metric is permitted.\n\n If `None`, the estimator's default scorer (if available) is used.\n\n cv : int, cross-validation generator or an iterable, default=None\n Determines the cross-validation splitting strategy.\n Possible inputs for cv are:\n\n - `None`, to use the default 5-fold cross validation,\n - int, to specify the number of folds in a `(Stratified)KFold`,\n - :term:`CV splitter`,\n - An iterable yielding (train, test) splits as arrays of indices.\n\n For `int`/`None` inputs, if the estimator is a classifier and `y` is\n either binary or multiclass, :class:`StratifiedKFold` is used. In all\n other cases, :class:`KFold` is used. These splitters are instantiated\n with `shuffle=False` so the splits will be the same across calls.\n\n Refer :ref:`User Guide ` for the various\n cross-validation strategies that can be used here.\n\n .. versionchanged:: 0.22\n `cv` default value if `None` changed from 3-fold to 5-fold.\n\n n_jobs : int, default=None\n Number of jobs to run in parallel. Training the estimator and computing\n the score are parallelized over the cross-validation splits.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n verbose : int, default=0\n The verbosity level.\n\n fit_params : dict, default=None\n Parameters to pass to the fit method of the estimator.\n\n pre_dispatch : int or str, default='2*n_jobs'\n Controls the number of jobs that get dispatched during parallel\n execution. Reducing this number can be useful to avoid an\n explosion of memory consumption when more jobs get dispatched\n than CPUs can process. This parameter can be:\n\n - ``None``, in which case all the jobs are immediately\n created and spawned. Use this for lightweight and\n fast-running jobs, to avoid delays due to on-demand\n spawning of the jobs\n\n - An int, giving the exact number of total jobs that are\n spawned\n\n - A str, giving an expression as a function of n_jobs,\n as in '2*n_jobs'\n\n error_score : 'raise' or numeric, default=np.nan\n Value to assign to the score if an error occurs in estimator fitting.\n If set to 'raise', the error is raised.\n If a numeric value is given, FitFailedWarning is raised.\n\n .. 
versionadded:: 0.20\n\n Returns\n -------\n scores : ndarray of float of shape=(len(list(cv)),)\n Array of scores of the estimator for each run of the cross validation.\n\n Examples\n --------\n >>> from sklearn import datasets, linear_model\n >>> from sklearn.model_selection import cross_val_score\n >>> diabetes = datasets.load_diabetes()\n >>> X = diabetes.data[:150]\n >>> y = diabetes.target[:150]\n >>> lasso = linear_model.Lasso()\n >>> print(cross_val_score(lasso, X, y, cv=3))\n [0.33150734 0.08022311 0.03531764]\n\n See Also\n ---------\n cross_validate : To run cross-validation on multiple metrics and also to\n return train scores, fit times and score times.\n\n cross_val_predict : Get predictions from each split of cross-validation for\n diagnostic purposes.\n\n sklearn.metrics.make_scorer : Make a scorer from a performance metric or\n loss function.\n\n \"\"\"\n scorer = check_scoring(estimator, scoring=scoring)\n cv_results = cross_validate(estimator=estimator, X=X, y=y, groups=groups, scoring={'score': scorer}, cv=cv, n_jobs=n_jobs, verbose=verbose, fit_params=fit_params, pre_dispatch=pre_dispatch, error_score=error_score)\n return cv_results['test_score']" + "description": "Evaluate a score by cross-validation.\n\nRead more in the :ref:`User Guide `.", + "docstring": "Evaluate a score by cross-validation.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n estimator : estimator object implementing 'fit'\n The object to use to fit the data.\n\n X : array-like of shape (n_samples, n_features)\n The data to fit. Can be for example a list, or an array.\n\n y : array-like of shape (n_samples,) or (n_samples, n_outputs), default=None\n The target variable to try to predict in the case of\n supervised learning.\n\n groups : array-like of shape (n_samples,), default=None\n Group labels for the samples used while splitting the dataset into\n train/test set. Only used in conjunction with a \"Group\" :term:`cv`\n instance (e.g., :class:`GroupKFold`).\n\n scoring : str or callable, default=None\n A str (see model evaluation documentation) or\n a scorer callable object / function with signature\n ``scorer(estimator, X, y)`` which should return only\n a single value.\n\n Similar to :func:`cross_validate`\n but only a single metric is permitted.\n\n If `None`, the estimator's default scorer (if available) is used.\n\n cv : int, cross-validation generator or an iterable, default=None\n Determines the cross-validation splitting strategy.\n Possible inputs for cv are:\n\n - `None`, to use the default 5-fold cross validation,\n - int, to specify the number of folds in a `(Stratified)KFold`,\n - :term:`CV splitter`,\n - An iterable that generates (train, test) splits as arrays of indices.\n\n For `int`/`None` inputs, if the estimator is a classifier and `y` is\n either binary or multiclass, :class:`StratifiedKFold` is used. In all\n other cases, :class:`KFold` is used. These splitters are instantiated\n with `shuffle=False` so the splits will be the same across calls.\n\n Refer :ref:`User Guide ` for the various\n cross-validation strategies that can be used here.\n\n .. versionchanged:: 0.22\n `cv` default value if `None` changed from 3-fold to 5-fold.\n\n n_jobs : int, default=None\n Number of jobs to run in parallel. Training the estimator and computing\n the score are parallelized over the cross-validation splits.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. 
See :term:`Glossary `\n for more details.\n\n verbose : int, default=0\n The verbosity level.\n\n fit_params : dict, default=None\n Parameters to pass to the fit method of the estimator.\n\n pre_dispatch : int or str, default='2*n_jobs'\n Controls the number of jobs that get dispatched during parallel\n execution. Reducing this number can be useful to avoid an\n explosion of memory consumption when more jobs get dispatched\n than CPUs can process. This parameter can be:\n\n - ``None``, in which case all the jobs are immediately\n created and spawned. Use this for lightweight and\n fast-running jobs, to avoid delays due to on-demand\n spawning of the jobs\n\n - An int, giving the exact number of total jobs that are\n spawned\n\n - A str, giving an expression as a function of n_jobs,\n as in '2*n_jobs'\n\n error_score : 'raise' or numeric, default=np.nan\n Value to assign to the score if an error occurs in estimator fitting.\n If set to 'raise', the error is raised.\n If a numeric value is given, FitFailedWarning is raised.\n\n .. versionadded:: 0.20\n\n Returns\n -------\n scores : ndarray of float of shape=(len(list(cv)),)\n Array of scores of the estimator for each run of the cross validation.\n\n Examples\n --------\n >>> from sklearn import datasets, linear_model\n >>> from sklearn.model_selection import cross_val_score\n >>> diabetes = datasets.load_diabetes()\n >>> X = diabetes.data[:150]\n >>> y = diabetes.target[:150]\n >>> lasso = linear_model.Lasso()\n >>> print(cross_val_score(lasso, X, y, cv=3))\n [0.33150734 0.08022311 0.03531764]\n\n See Also\n ---------\n cross_validate : To run cross-validation on multiple metrics and also to\n return train scores, fit times and score times.\n\n cross_val_predict : Get predictions from each split of cross-validation for\n diagnostic purposes.\n\n sklearn.metrics.make_scorer : Make a scorer from a performance metric or\n loss function.\n ", + "source_code": "\ndef cross_val_score(estimator, X, y=None, *, groups=None, scoring=None, cv=None, n_jobs=None, verbose=0, fit_params=None, pre_dispatch='2*n_jobs', error_score=np.nan):\n \"\"\"Evaluate a score by cross-validation.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n estimator : estimator object implementing 'fit'\n The object to use to fit the data.\n\n X : array-like of shape (n_samples, n_features)\n The data to fit. Can be for example a list, or an array.\n\n y : array-like of shape (n_samples,) or (n_samples, n_outputs), default=None\n The target variable to try to predict in the case of\n supervised learning.\n\n groups : array-like of shape (n_samples,), default=None\n Group labels for the samples used while splitting the dataset into\n train/test set. 
Only used in conjunction with a \"Group\" :term:`cv`\n instance (e.g., :class:`GroupKFold`).\n\n scoring : str or callable, default=None\n A str (see model evaluation documentation) or\n a scorer callable object / function with signature\n ``scorer(estimator, X, y)`` which should return only\n a single value.\n\n Similar to :func:`cross_validate`\n but only a single metric is permitted.\n\n If `None`, the estimator's default scorer (if available) is used.\n\n cv : int, cross-validation generator or an iterable, default=None\n Determines the cross-validation splitting strategy.\n Possible inputs for cv are:\n\n - `None`, to use the default 5-fold cross validation,\n - int, to specify the number of folds in a `(Stratified)KFold`,\n - :term:`CV splitter`,\n - An iterable that generates (train, test) splits as arrays of indices.\n\n For `int`/`None` inputs, if the estimator is a classifier and `y` is\n either binary or multiclass, :class:`StratifiedKFold` is used. In all\n other cases, :class:`KFold` is used. These splitters are instantiated\n with `shuffle=False` so the splits will be the same across calls.\n\n Refer :ref:`User Guide ` for the various\n cross-validation strategies that can be used here.\n\n .. versionchanged:: 0.22\n `cv` default value if `None` changed from 3-fold to 5-fold.\n\n n_jobs : int, default=None\n Number of jobs to run in parallel. Training the estimator and computing\n the score are parallelized over the cross-validation splits.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n verbose : int, default=0\n The verbosity level.\n\n fit_params : dict, default=None\n Parameters to pass to the fit method of the estimator.\n\n pre_dispatch : int or str, default='2*n_jobs'\n Controls the number of jobs that get dispatched during parallel\n execution. Reducing this number can be useful to avoid an\n explosion of memory consumption when more jobs get dispatched\n than CPUs can process. This parameter can be:\n\n - ``None``, in which case all the jobs are immediately\n created and spawned. Use this for lightweight and\n fast-running jobs, to avoid delays due to on-demand\n spawning of the jobs\n\n - An int, giving the exact number of total jobs that are\n spawned\n\n - A str, giving an expression as a function of n_jobs,\n as in '2*n_jobs'\n\n error_score : 'raise' or numeric, default=np.nan\n Value to assign to the score if an error occurs in estimator fitting.\n If set to 'raise', the error is raised.\n If a numeric value is given, FitFailedWarning is raised.\n\n .. 
versionadded:: 0.20\n\n Returns\n -------\n scores : ndarray of float of shape=(len(list(cv)),)\n Array of scores of the estimator for each run of the cross validation.\n\n Examples\n --------\n >>> from sklearn import datasets, linear_model\n >>> from sklearn.model_selection import cross_val_score\n >>> diabetes = datasets.load_diabetes()\n >>> X = diabetes.data[:150]\n >>> y = diabetes.target[:150]\n >>> lasso = linear_model.Lasso()\n >>> print(cross_val_score(lasso, X, y, cv=3))\n [0.33150734 0.08022311 0.03531764]\n\n See Also\n ---------\n cross_validate : To run cross-validation on multiple metrics and also to\n return train scores, fit times and score times.\n\n cross_val_predict : Get predictions from each split of cross-validation for\n diagnostic purposes.\n\n sklearn.metrics.make_scorer : Make a scorer from a performance metric or\n loss function.\n \"\"\"\n scorer = check_scoring(estimator, scoring=scoring)\n cv_results = cross_validate(estimator=estimator, X=X, y=y, groups=groups, scoring={'score': scorer}, cv=cv, n_jobs=n_jobs, verbose=verbose, fit_params=fit_params, pre_dispatch=pre_dispatch, error_score=error_score)\n return cv_results['test_score']" }, { "name": "cross_validate", @@ -135463,7 +145779,8 @@ "docstring": { "type": "estimator object implementing 'fit'", "description": "The object to use to fit the data." - } + }, + "refined_type": {} }, { "name": "X", @@ -135473,7 +145790,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "The data to fit. Can be for example a list, or an array." - } + }, + "refined_type": {} }, { "name": "y", @@ -135483,7 +145801,8 @@ "docstring": { "type": "array-like of shape (n_samples,) or (n_samples, n_outputs), default=None", "description": "The target variable to try to predict in the case of\nsupervised learning." - } + }, + "refined_type": {} }, { "name": "groups", @@ -135493,7 +145812,8 @@ "docstring": { "type": "array-like of shape (n_samples,), default=None", "description": "Group labels for the samples used while splitting the dataset into\ntrain/test set. Only used in conjunction with a \"Group\" :term:`cv`\ninstance (e.g., :class:`GroupKFold`)." - } + }, + "refined_type": {} }, { "name": "scoring", @@ -135503,7 +145823,8 @@ "docstring": { "type": "str, callable, list, tuple, or dict, default=None", "description": "Strategy to evaluate the performance of the cross-validated model on\nthe test set.\n\nIf `scoring` represents a single score, one can use:\n\n- a single string (see :ref:`scoring_parameter`);\n- a callable (see :ref:`scoring`) that returns a single value.\n\nIf `scoring` represents multiple scores, one can use:\n\n- a list or tuple of unique strings;\n- a callable returning a dictionary where the keys are the metric\n names and the values are the metric scores;\n- a dictionary with metric names as keys and callables a values.\n\nSee :ref:`multimetric_grid_search` for an example." - } + }, + "refined_type": {} }, { "name": "cv", @@ -135513,7 +145834,8 @@ "docstring": { "type": "int, cross-validation generator or an iterable, default=None", "description": "Determines the cross-validation splitting strategy.\nPossible inputs for cv are:\n\n- None, to use the default 5-fold cross validation,\n- int, to specify the number of folds in a `(Stratified)KFold`,\n- :term:`CV splitter`,\n- An iterable yielding (train, test) splits as arrays of indices.\n\nFor int/None inputs, if the estimator is a classifier and ``y`` is\neither binary or multiclass, :class:`StratifiedKFold` is used. 
In all\nother cases, :class:`.Fold` is used. These splitters are instantiated\nwith `shuffle=False` so the splits will be the same across calls.\n\nRefer :ref:`User Guide ` for the various\ncross-validation strategies that can be used here.\n\n.. versionchanged:: 0.22\n ``cv`` default value if None changed from 3-fold to 5-fold." - } + }, + "refined_type": {} }, { "name": "n_jobs", @@ -135523,7 +145845,8 @@ "docstring": { "type": "int, default=None", "description": "Number of jobs to run in parallel. Training the estimator and computing\nthe score are parallelized over the cross-validation splits.\n``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n``-1`` means using all processors. See :term:`Glossary `\nfor more details." - } + }, + "refined_type": {} }, { "name": "verbose", @@ -135533,7 +145856,8 @@ "docstring": { "type": "int, default=0", "description": "The verbosity level." - } + }, + "refined_type": {} }, { "name": "fit_params", @@ -135543,7 +145867,8 @@ "docstring": { "type": "dict, default=None", "description": "Parameters to pass to the fit method of the estimator." - } + }, + "refined_type": {} }, { "name": "pre_dispatch", @@ -135553,7 +145878,8 @@ "docstring": { "type": "int or str, default='2*n_jobs'", "description": "Controls the number of jobs that get dispatched during parallel\nexecution. Reducing this number can be useful to avoid an\nexplosion of memory consumption when more jobs get dispatched\nthan CPUs can process. This parameter can be:\n\n - None, in which case all the jobs are immediately\n created and spawned. Use this for lightweight and\n fast-running jobs, to avoid delays due to on-demand\n spawning of the jobs\n\n - An int, giving the exact number of total jobs that are\n spawned\n\n - A str, giving an expression as a function of n_jobs,\n as in '2*n_jobs'" - } + }, + "refined_type": {} }, { "name": "return_train_score", @@ -135563,7 +145889,8 @@ "docstring": { "type": "bool, default=False", "description": "Whether to include train scores.\nComputing training scores is used to get insights on how different\nparameter settings impact the overfitting/underfitting trade-off.\nHowever computing the scores on the training set can be computationally\nexpensive and is not strictly required to select the parameters that\nyield the best generalization performance.\n\n.. versionadded:: 0.19\n\n.. versionchanged:: 0.21\n Default value was changed from ``True`` to ``False``" - } + }, + "refined_type": {} }, { "name": "return_estimator", @@ -135573,7 +145900,8 @@ "docstring": { "type": "bool, default=False", "description": "Whether to return the estimators fitted on each split.\n\n.. versionadded:: 0.20" - } + }, + "refined_type": {} }, { "name": "error_score", @@ -135583,13 +145911,14 @@ "docstring": { "type": "'raise' or numeric, default=np.nan", "description": "Value to assign to the score if an error occurs in estimator fitting.\nIf set to 'raise', the error is raised.\nIf a numeric value is given, FitFailedWarning is raised.\n\n.. 
versionadded:: 0.20" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Evaluate metric(s) by cross-validation and also record fit/score times.\n\nRead more in the :ref:`User Guide `.", - "docstring": "Evaluate metric(s) by cross-validation and also record fit/score times.\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\nestimator : estimator object implementing 'fit'\n The object to use to fit the data.\n\nX : array-like of shape (n_samples, n_features)\n The data to fit. Can be for example a list, or an array.\n\ny : array-like of shape (n_samples,) or (n_samples, n_outputs), default=None\n The target variable to try to predict in the case of\n supervised learning.\n\ngroups : array-like of shape (n_samples,), default=None\n Group labels for the samples used while splitting the dataset into\n train/test set. Only used in conjunction with a \"Group\" :term:`cv`\n instance (e.g., :class:`GroupKFold`).\n\nscoring : str, callable, list, tuple, or dict, default=None\n Strategy to evaluate the performance of the cross-validated model on\n the test set.\n\n If `scoring` represents a single score, one can use:\n\n - a single string (see :ref:`scoring_parameter`);\n - a callable (see :ref:`scoring`) that returns a single value.\n\n If `scoring` represents multiple scores, one can use:\n\n - a list or tuple of unique strings;\n - a callable returning a dictionary where the keys are the metric\n names and the values are the metric scores;\n - a dictionary with metric names as keys and callables a values.\n\n See :ref:`multimetric_grid_search` for an example.\n\ncv : int, cross-validation generator or an iterable, default=None\n Determines the cross-validation splitting strategy.\n Possible inputs for cv are:\n\n - None, to use the default 5-fold cross validation,\n - int, to specify the number of folds in a `(Stratified)KFold`,\n - :term:`CV splitter`,\n - An iterable yielding (train, test) splits as arrays of indices.\n\n For int/None inputs, if the estimator is a classifier and ``y`` is\n either binary or multiclass, :class:`StratifiedKFold` is used. In all\n other cases, :class:`.Fold` is used. These splitters are instantiated\n with `shuffle=False` so the splits will be the same across calls.\n\n Refer :ref:`User Guide ` for the various\n cross-validation strategies that can be used here.\n\n .. versionchanged:: 0.22\n ``cv`` default value if None changed from 3-fold to 5-fold.\n\nn_jobs : int, default=None\n Number of jobs to run in parallel. Training the estimator and computing\n the score are parallelized over the cross-validation splits.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\nverbose : int, default=0\n The verbosity level.\n\nfit_params : dict, default=None\n Parameters to pass to the fit method of the estimator.\n\npre_dispatch : int or str, default='2*n_jobs'\n Controls the number of jobs that get dispatched during parallel\n execution. Reducing this number can be useful to avoid an\n explosion of memory consumption when more jobs get dispatched\n than CPUs can process. This parameter can be:\n\n - None, in which case all the jobs are immediately\n created and spawned. 
Use this for lightweight and\n fast-running jobs, to avoid delays due to on-demand\n spawning of the jobs\n\n - An int, giving the exact number of total jobs that are\n spawned\n\n - A str, giving an expression as a function of n_jobs,\n as in '2*n_jobs'\n\nreturn_train_score : bool, default=False\n Whether to include train scores.\n Computing training scores is used to get insights on how different\n parameter settings impact the overfitting/underfitting trade-off.\n However computing the scores on the training set can be computationally\n expensive and is not strictly required to select the parameters that\n yield the best generalization performance.\n\n .. versionadded:: 0.19\n\n .. versionchanged:: 0.21\n Default value was changed from ``True`` to ``False``\n\nreturn_estimator : bool, default=False\n Whether to return the estimators fitted on each split.\n\n .. versionadded:: 0.20\n\nerror_score : 'raise' or numeric, default=np.nan\n Value to assign to the score if an error occurs in estimator fitting.\n If set to 'raise', the error is raised.\n If a numeric value is given, FitFailedWarning is raised.\n\n .. versionadded:: 0.20\n\nReturns\n-------\nscores : dict of float arrays of shape (n_splits,)\n Array of scores of the estimator for each run of the cross validation.\n\n A dict of arrays containing the score/time arrays for each scorer is\n returned. The possible keys for this ``dict`` are:\n\n ``test_score``\n The score array for test scores on each cv split.\n Suffix ``_score`` in ``test_score`` changes to a specific\n metric like ``test_r2`` or ``test_auc`` if there are\n multiple scoring metrics in the scoring parameter.\n ``train_score``\n The score array for train scores on each cv split.\n Suffix ``_score`` in ``train_score`` changes to a specific\n metric like ``train_r2`` or ``train_auc`` if there are\n multiple scoring metrics in the scoring parameter.\n This is available only if ``return_train_score`` parameter\n is ``True``.\n ``fit_time``\n The time for fitting the estimator on the train\n set for each cv split.\n ``score_time``\n The time for scoring the estimator on the test set for each\n cv split. (Note time for scoring on the train set is not\n included even if ``return_train_score`` is set to ``True``\n ``estimator``\n The estimator objects for each cv split.\n This is available only if ``return_estimator`` parameter\n is set to ``True``.\n\nExamples\n--------\n>>> from sklearn import datasets, linear_model\n>>> from sklearn.model_selection import cross_validate\n>>> from sklearn.metrics import make_scorer\n>>> from sklearn.metrics import confusion_matrix\n>>> from sklearn.svm import LinearSVC\n>>> diabetes = datasets.load_diabetes()\n>>> X = diabetes.data[:150]\n>>> y = diabetes.target[:150]\n>>> lasso = linear_model.Lasso()\n\nSingle metric evaluation using ``cross_validate``\n\n>>> cv_results = cross_validate(lasso, X, y, cv=3)\n>>> sorted(cv_results.keys())\n['fit_time', 'score_time', 'test_score']\n>>> cv_results['test_score']\narray([0.33150734, 0.08022311, 0.03531764])\n\nMultiple metric evaluation using ``cross_validate``\n(please refer the ``scoring`` parameter doc for more information)\n\n>>> scores = cross_validate(lasso, X, y, cv=3,\n... scoring=('r2', 'neg_mean_squared_error'),\n... return_train_score=True)\n>>> print(scores['test_neg_mean_squared_error'])\n[-3635.5... -3573.3... 
-6114.7...]\n>>> print(scores['train_r2'])\n[0.28010158 0.39088426 0.22784852]\n\nSee Also\n---------\ncross_val_score : Run cross-validation for single metric evaluation.\n\ncross_val_predict : Get predictions from each split of cross-validation for\n diagnostic purposes.\n\nsklearn.metrics.make_scorer : Make a scorer from a performance metric or\n loss function.", + "docstring": "Evaluate metric(s) by cross-validation and also record fit/score times.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n estimator : estimator object implementing 'fit'\n The object to use to fit the data.\n\n X : array-like of shape (n_samples, n_features)\n The data to fit. Can be for example a list, or an array.\n\n y : array-like of shape (n_samples,) or (n_samples, n_outputs), default=None\n The target variable to try to predict in the case of\n supervised learning.\n\n groups : array-like of shape (n_samples,), default=None\n Group labels for the samples used while splitting the dataset into\n train/test set. Only used in conjunction with a \"Group\" :term:`cv`\n instance (e.g., :class:`GroupKFold`).\n\n scoring : str, callable, list, tuple, or dict, default=None\n Strategy to evaluate the performance of the cross-validated model on\n the test set.\n\n If `scoring` represents a single score, one can use:\n\n - a single string (see :ref:`scoring_parameter`);\n - a callable (see :ref:`scoring`) that returns a single value.\n\n If `scoring` represents multiple scores, one can use:\n\n - a list or tuple of unique strings;\n - a callable returning a dictionary where the keys are the metric\n names and the values are the metric scores;\n - a dictionary with metric names as keys and callables a values.\n\n See :ref:`multimetric_grid_search` for an example.\n\n cv : int, cross-validation generator or an iterable, default=None\n Determines the cross-validation splitting strategy.\n Possible inputs for cv are:\n\n - None, to use the default 5-fold cross validation,\n - int, to specify the number of folds in a `(Stratified)KFold`,\n - :term:`CV splitter`,\n - An iterable yielding (train, test) splits as arrays of indices.\n\n For int/None inputs, if the estimator is a classifier and ``y`` is\n either binary or multiclass, :class:`StratifiedKFold` is used. In all\n other cases, :class:`.Fold` is used. These splitters are instantiated\n with `shuffle=False` so the splits will be the same across calls.\n\n Refer :ref:`User Guide ` for the various\n cross-validation strategies that can be used here.\n\n .. versionchanged:: 0.22\n ``cv`` default value if None changed from 3-fold to 5-fold.\n\n n_jobs : int, default=None\n Number of jobs to run in parallel. Training the estimator and computing\n the score are parallelized over the cross-validation splits.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n verbose : int, default=0\n The verbosity level.\n\n fit_params : dict, default=None\n Parameters to pass to the fit method of the estimator.\n\n pre_dispatch : int or str, default='2*n_jobs'\n Controls the number of jobs that get dispatched during parallel\n execution. Reducing this number can be useful to avoid an\n explosion of memory consumption when more jobs get dispatched\n than CPUs can process. This parameter can be:\n\n - None, in which case all the jobs are immediately\n created and spawned. 
Use this for lightweight and\n fast-running jobs, to avoid delays due to on-demand\n spawning of the jobs\n\n - An int, giving the exact number of total jobs that are\n spawned\n\n - A str, giving an expression as a function of n_jobs,\n as in '2*n_jobs'\n\n return_train_score : bool, default=False\n Whether to include train scores.\n Computing training scores is used to get insights on how different\n parameter settings impact the overfitting/underfitting trade-off.\n However computing the scores on the training set can be computationally\n expensive and is not strictly required to select the parameters that\n yield the best generalization performance.\n\n .. versionadded:: 0.19\n\n .. versionchanged:: 0.21\n Default value was changed from ``True`` to ``False``\n\n return_estimator : bool, default=False\n Whether to return the estimators fitted on each split.\n\n .. versionadded:: 0.20\n\n error_score : 'raise' or numeric, default=np.nan\n Value to assign to the score if an error occurs in estimator fitting.\n If set to 'raise', the error is raised.\n If a numeric value is given, FitFailedWarning is raised.\n\n .. versionadded:: 0.20\n\n Returns\n -------\n scores : dict of float arrays of shape (n_splits,)\n Array of scores of the estimator for each run of the cross validation.\n\n A dict of arrays containing the score/time arrays for each scorer is\n returned. The possible keys for this ``dict`` are:\n\n ``test_score``\n The score array for test scores on each cv split.\n Suffix ``_score`` in ``test_score`` changes to a specific\n metric like ``test_r2`` or ``test_auc`` if there are\n multiple scoring metrics in the scoring parameter.\n ``train_score``\n The score array for train scores on each cv split.\n Suffix ``_score`` in ``train_score`` changes to a specific\n metric like ``train_r2`` or ``train_auc`` if there are\n multiple scoring metrics in the scoring parameter.\n This is available only if ``return_train_score`` parameter\n is ``True``.\n ``fit_time``\n The time for fitting the estimator on the train\n set for each cv split.\n ``score_time``\n The time for scoring the estimator on the test set for each\n cv split. (Note time for scoring on the train set is not\n included even if ``return_train_score`` is set to ``True``\n ``estimator``\n The estimator objects for each cv split.\n This is available only if ``return_estimator`` parameter\n is set to ``True``.\n\n Examples\n --------\n >>> from sklearn import datasets, linear_model\n >>> from sklearn.model_selection import cross_validate\n >>> from sklearn.metrics import make_scorer\n >>> from sklearn.metrics import confusion_matrix\n >>> from sklearn.svm import LinearSVC\n >>> diabetes = datasets.load_diabetes()\n >>> X = diabetes.data[:150]\n >>> y = diabetes.target[:150]\n >>> lasso = linear_model.Lasso()\n\n Single metric evaluation using ``cross_validate``\n\n >>> cv_results = cross_validate(lasso, X, y, cv=3)\n >>> sorted(cv_results.keys())\n ['fit_time', 'score_time', 'test_score']\n >>> cv_results['test_score']\n array([0.33150734, 0.08022311, 0.03531764])\n\n Multiple metric evaluation using ``cross_validate``\n (please refer the ``scoring`` parameter doc for more information)\n\n >>> scores = cross_validate(lasso, X, y, cv=3,\n ... scoring=('r2', 'neg_mean_squared_error'),\n ... return_train_score=True)\n >>> print(scores['test_neg_mean_squared_error'])\n [-3635.5... -3573.3... 
-6114.7...]\n >>> print(scores['train_r2'])\n [0.28010158 0.39088426 0.22784852]\n\n See Also\n ---------\n cross_val_score : Run cross-validation for single metric evaluation.\n\n cross_val_predict : Get predictions from each split of cross-validation for\n diagnostic purposes.\n\n sklearn.metrics.make_scorer : Make a scorer from a performance metric or\n loss function.\n\n ", "source_code": "\ndef cross_validate(estimator, X, y=None, *, groups=None, scoring=None, cv=None, n_jobs=None, verbose=0, fit_params=None, pre_dispatch='2*n_jobs', return_train_score=False, return_estimator=False, error_score=np.nan):\n \"\"\"Evaluate metric(s) by cross-validation and also record fit/score times.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n estimator : estimator object implementing 'fit'\n The object to use to fit the data.\n\n X : array-like of shape (n_samples, n_features)\n The data to fit. Can be for example a list, or an array.\n\n y : array-like of shape (n_samples,) or (n_samples, n_outputs), default=None\n The target variable to try to predict in the case of\n supervised learning.\n\n groups : array-like of shape (n_samples,), default=None\n Group labels for the samples used while splitting the dataset into\n train/test set. Only used in conjunction with a \"Group\" :term:`cv`\n instance (e.g., :class:`GroupKFold`).\n\n scoring : str, callable, list, tuple, or dict, default=None\n Strategy to evaluate the performance of the cross-validated model on\n the test set.\n\n If `scoring` represents a single score, one can use:\n\n - a single string (see :ref:`scoring_parameter`);\n - a callable (see :ref:`scoring`) that returns a single value.\n\n If `scoring` represents multiple scores, one can use:\n\n - a list or tuple of unique strings;\n - a callable returning a dictionary where the keys are the metric\n names and the values are the metric scores;\n - a dictionary with metric names as keys and callables a values.\n\n See :ref:`multimetric_grid_search` for an example.\n\n cv : int, cross-validation generator or an iterable, default=None\n Determines the cross-validation splitting strategy.\n Possible inputs for cv are:\n\n - None, to use the default 5-fold cross validation,\n - int, to specify the number of folds in a `(Stratified)KFold`,\n - :term:`CV splitter`,\n - An iterable yielding (train, test) splits as arrays of indices.\n\n For int/None inputs, if the estimator is a classifier and ``y`` is\n either binary or multiclass, :class:`StratifiedKFold` is used. In all\n other cases, :class:`.Fold` is used. These splitters are instantiated\n with `shuffle=False` so the splits will be the same across calls.\n\n Refer :ref:`User Guide ` for the various\n cross-validation strategies that can be used here.\n\n .. versionchanged:: 0.22\n ``cv`` default value if None changed from 3-fold to 5-fold.\n\n n_jobs : int, default=None\n Number of jobs to run in parallel. Training the estimator and computing\n the score are parallelized over the cross-validation splits.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n verbose : int, default=0\n The verbosity level.\n\n fit_params : dict, default=None\n Parameters to pass to the fit method of the estimator.\n\n pre_dispatch : int or str, default='2*n_jobs'\n Controls the number of jobs that get dispatched during parallel\n execution. 
Reducing this number can be useful to avoid an\n explosion of memory consumption when more jobs get dispatched\n than CPUs can process. This parameter can be:\n\n - None, in which case all the jobs are immediately\n created and spawned. Use this for lightweight and\n fast-running jobs, to avoid delays due to on-demand\n spawning of the jobs\n\n - An int, giving the exact number of total jobs that are\n spawned\n\n - A str, giving an expression as a function of n_jobs,\n as in '2*n_jobs'\n\n return_train_score : bool, default=False\n Whether to include train scores.\n Computing training scores is used to get insights on how different\n parameter settings impact the overfitting/underfitting trade-off.\n However computing the scores on the training set can be computationally\n expensive and is not strictly required to select the parameters that\n yield the best generalization performance.\n\n .. versionadded:: 0.19\n\n .. versionchanged:: 0.21\n Default value was changed from ``True`` to ``False``\n\n return_estimator : bool, default=False\n Whether to return the estimators fitted on each split.\n\n .. versionadded:: 0.20\n\n error_score : 'raise' or numeric, default=np.nan\n Value to assign to the score if an error occurs in estimator fitting.\n If set to 'raise', the error is raised.\n If a numeric value is given, FitFailedWarning is raised.\n\n .. versionadded:: 0.20\n\n Returns\n -------\n scores : dict of float arrays of shape (n_splits,)\n Array of scores of the estimator for each run of the cross validation.\n\n A dict of arrays containing the score/time arrays for each scorer is\n returned. The possible keys for this ``dict`` are:\n\n ``test_score``\n The score array for test scores on each cv split.\n Suffix ``_score`` in ``test_score`` changes to a specific\n metric like ``test_r2`` or ``test_auc`` if there are\n multiple scoring metrics in the scoring parameter.\n ``train_score``\n The score array for train scores on each cv split.\n Suffix ``_score`` in ``train_score`` changes to a specific\n metric like ``train_r2`` or ``train_auc`` if there are\n multiple scoring metrics in the scoring parameter.\n This is available only if ``return_train_score`` parameter\n is ``True``.\n ``fit_time``\n The time for fitting the estimator on the train\n set for each cv split.\n ``score_time``\n The time for scoring the estimator on the test set for each\n cv split. (Note time for scoring on the train set is not\n included even if ``return_train_score`` is set to ``True``\n ``estimator``\n The estimator objects for each cv split.\n This is available only if ``return_estimator`` parameter\n is set to ``True``.\n\n Examples\n --------\n >>> from sklearn import datasets, linear_model\n >>> from sklearn.model_selection import cross_validate\n >>> from sklearn.metrics import make_scorer\n >>> from sklearn.metrics import confusion_matrix\n >>> from sklearn.svm import LinearSVC\n >>> diabetes = datasets.load_diabetes()\n >>> X = diabetes.data[:150]\n >>> y = diabetes.target[:150]\n >>> lasso = linear_model.Lasso()\n\n Single metric evaluation using ``cross_validate``\n\n >>> cv_results = cross_validate(lasso, X, y, cv=3)\n >>> sorted(cv_results.keys())\n ['fit_time', 'score_time', 'test_score']\n >>> cv_results['test_score']\n array([0.33150734, 0.08022311, 0.03531764])\n\n Multiple metric evaluation using ``cross_validate``\n (please refer the ``scoring`` parameter doc for more information)\n\n >>> scores = cross_validate(lasso, X, y, cv=3,\n ... scoring=('r2', 'neg_mean_squared_error'),\n ... 
return_train_score=True)\n >>> print(scores['test_neg_mean_squared_error'])\n [-3635.5... -3573.3... -6114.7...]\n >>> print(scores['train_r2'])\n [0.28010158 0.39088426 0.22784852]\n\n See Also\n ---------\n cross_val_score : Run cross-validation for single metric evaluation.\n\n cross_val_predict : Get predictions from each split of cross-validation for\n diagnostic purposes.\n\n sklearn.metrics.make_scorer : Make a scorer from a performance metric or\n loss function.\n\n \"\"\"\n (X, y, groups) = indexable(X, y, groups)\n cv = check_cv(cv, y, classifier=is_classifier(estimator))\n if callable(scoring):\n scorers = scoring\n elif scoring is None or isinstance(scoring, str):\n scorers = check_scoring(estimator, scoring)\n else:\n scorers = _check_multimetric_scoring(estimator, scoring)\n parallel = Parallel(n_jobs=n_jobs, verbose=verbose, pre_dispatch=pre_dispatch)\n results = parallel((delayed(_fit_and_score)(clone(estimator), X, y, scorers, train, test, verbose, None, fit_params, return_train_score=return_train_score, return_times=True, return_estimator=return_estimator, error_score=error_score) for (train, test) in cv.split(X, y, groups)))\n _warn_about_fit_failures(results, error_score)\n if callable(scoring):\n _insert_error_scores(results, error_score)\n results = _aggregate_score_dicts(results)\n ret = {}\n ret['fit_time'] = results['fit_time']\n ret['score_time'] = results['score_time']\n if return_estimator:\n ret['estimator'] = results['estimator']\n test_scores_dict = _normalize_score_results(results['test_scores'])\n if return_train_score:\n train_scores_dict = _normalize_score_results(results['train_scores'])\n for name in test_scores_dict:\n ret['test_%s' % name] = test_scores_dict[name]\n if return_train_score:\n key = 'train_%s' % name\n ret[key] = train_scores_dict[name]\n return ret" }, { @@ -135607,7 +145936,8 @@ "docstring": { "type": "object type that implements the \"fit\" and \"predict\" methods", "description": "An object of that type which is cloned for each validation." - } + }, + "refined_type": {} }, { "name": "X", @@ -135617,7 +145947,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "Training vector, where `n_samples` is the number of samples and\n`n_features` is the number of features." - } + }, + "refined_type": {} }, { "name": "y", @@ -135627,7 +145958,8 @@ "docstring": { "type": "array-like of shape (n_samples,) or (n_samples, n_outputs)", "description": "Target relative to X for classification or regression;\nNone for unsupervised learning." - } + }, + "refined_type": {} }, { "name": "groups", @@ -135637,7 +145969,8 @@ "docstring": { "type": "array-like of shape (n_samples,), default=None", "description": "Group labels for the samples used while splitting the dataset into\ntrain/test set. Only used in conjunction with a \"Group\" :term:`cv`\ninstance (e.g., :class:`GroupKFold`)." - } + }, + "refined_type": {} }, { "name": "train_sizes", @@ -135647,7 +145980,8 @@ "docstring": { "type": "array-like of shape (n_ticks,), default=np.linspace(0.1, 1.0, 5)", "description": "Relative or absolute numbers of training examples that will be used to\ngenerate the learning curve. If the dtype is float, it is regarded as a\nfraction of the maximum size of the training set (that is determined\nby the selected validation method), i.e. 
it has to be within (0, 1].\nOtherwise it is interpreted as absolute sizes of the training sets.\nNote that for classification the number of samples usually have to\nbe big enough to contain at least one sample from each class." - } + }, + "refined_type": {} }, { "name": "cv", @@ -135657,7 +145991,8 @@ "docstring": { "type": "int, cross-validation generator or an iterable, default=None", "description": "Determines the cross-validation splitting strategy.\nPossible inputs for cv are:\n\n- None, to use the default 5-fold cross validation,\n- int, to specify the number of folds in a `(Stratified)KFold`,\n- :term:`CV splitter`,\n- An iterable yielding (train, test) splits as arrays of indices.\n\nFor int/None inputs, if the estimator is a classifier and ``y`` is\neither binary or multiclass, :class:`StratifiedKFold` is used. In all\nother cases, :class:`KFold` is used. These splitters are instantiated\nwith `shuffle=False` so the splits will be the same across calls.\n\nRefer :ref:`User Guide ` for the various\ncross-validation strategies that can be used here.\n\n.. versionchanged:: 0.22\n ``cv`` default value if None changed from 3-fold to 5-fold." - } + }, + "refined_type": {} }, { "name": "scoring", @@ -135667,7 +146002,8 @@ "docstring": { "type": "str or callable, default=None", "description": "A str (see model evaluation documentation) or\na scorer callable object / function with signature\n``scorer(estimator, X, y)``." - } + }, + "refined_type": {} }, { "name": "exploit_incremental_learning", @@ -135677,7 +146013,8 @@ "docstring": { "type": "bool, default=False", "description": "If the estimator supports incremental learning, this will be\nused to speed up fitting for different training set sizes." - } + }, + "refined_type": {} }, { "name": "n_jobs", @@ -135687,7 +146024,8 @@ "docstring": { "type": "int, default=None", "description": "Number of jobs to run in parallel. Training the estimator and computing\nthe score are parallelized over the different training and test sets.\n``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n``-1`` means using all processors. See :term:`Glossary `\nfor more details." - } + }, + "refined_type": {} }, { "name": "pre_dispatch", @@ -135697,7 +146035,8 @@ "docstring": { "type": "int or str, default='all'", "description": "Number of predispatched jobs for parallel execution (default is\nall). The option can reduce the allocated memory. The str can\nbe an expression like '2*n_jobs'." - } + }, + "refined_type": {} }, { "name": "verbose", @@ -135707,7 +146046,8 @@ "docstring": { "type": "int, default=0", "description": "Controls the verbosity: the higher, the more messages." - } + }, + "refined_type": {} }, { "name": "shuffle", @@ -135717,7 +146057,8 @@ "docstring": { "type": "bool, default=False", "description": "Whether to shuffle training data before taking prefixes of it\nbased on``train_sizes``." - } + }, + "refined_type": {} }, { "name": "random_state", @@ -135727,7 +146068,8 @@ "docstring": { "type": "int, RandomState instance or None, default=None", "description": "Used when ``shuffle`` is True. Pass an int for reproducible\noutput across multiple function calls.\nSee :term:`Glossary `." - } + }, + "refined_type": {} }, { "name": "error_score", @@ -135737,7 +146079,8 @@ "docstring": { "type": "'raise' or numeric, default=np.nan", "description": "Value to assign to the score if an error occurs in estimator fitting.\nIf set to 'raise', the error is raised.\nIf a numeric value is given, FitFailedWarning is raised.\n\n.. 
versionadded:: 0.20" - } + }, + "refined_type": {} }, { "name": "return_times", @@ -135747,7 +146090,8 @@ "docstring": { "type": "bool, default=False", "description": "Whether to return the fit and score times." - } + }, + "refined_type": {} }, { "name": "fit_params", @@ -135757,13 +146101,14 @@ "docstring": { "type": "dict, default=None", "description": "Parameters to pass to the fit method of the estimator.\n\n.. versionadded:: 0.24" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Learning curve.\n\nDetermines cross-validated training and test scores for different training set sizes. A cross-validation generator splits the whole dataset k times in training and test data. Subsets of the training set with varying sizes will be used to train the estimator and a score for each training subset size and the test set will be computed. Afterwards, the scores will be averaged over all k runs for each training subset size. Read more in the :ref:`User Guide `.", - "docstring": "Learning curve.\n\nDetermines cross-validated training and test scores for different training\nset sizes.\n\nA cross-validation generator splits the whole dataset k times in training\nand test data. Subsets of the training set with varying sizes will be used\nto train the estimator and a score for each training subset size and the\ntest set will be computed. Afterwards, the scores will be averaged over\nall k runs for each training subset size.\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\nestimator : object type that implements the \"fit\" and \"predict\" methods\n An object of that type which is cloned for each validation.\n\nX : array-like of shape (n_samples, n_features)\n Training vector, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\ny : array-like of shape (n_samples,) or (n_samples, n_outputs)\n Target relative to X for classification or regression;\n None for unsupervised learning.\n\ngroups : array-like of shape (n_samples,), default=None\n Group labels for the samples used while splitting the dataset into\n train/test set. Only used in conjunction with a \"Group\" :term:`cv`\n instance (e.g., :class:`GroupKFold`).\n\ntrain_sizes : array-like of shape (n_ticks,), default=np.linspace(0.1, 1.0, 5)\n Relative or absolute numbers of training examples that will be used to\n generate the learning curve. If the dtype is float, it is regarded as a\n fraction of the maximum size of the training set (that is determined\n by the selected validation method), i.e. it has to be within (0, 1].\n Otherwise it is interpreted as absolute sizes of the training sets.\n Note that for classification the number of samples usually have to\n be big enough to contain at least one sample from each class.\n\ncv : int, cross-validation generator or an iterable, default=None\n Determines the cross-validation splitting strategy.\n Possible inputs for cv are:\n\n - None, to use the default 5-fold cross validation,\n - int, to specify the number of folds in a `(Stratified)KFold`,\n - :term:`CV splitter`,\n - An iterable yielding (train, test) splits as arrays of indices.\n\n For int/None inputs, if the estimator is a classifier and ``y`` is\n either binary or multiclass, :class:`StratifiedKFold` is used. In all\n other cases, :class:`KFold` is used. 
These splitters are instantiated\n with `shuffle=False` so the splits will be the same across calls.\n\n Refer :ref:`User Guide ` for the various\n cross-validation strategies that can be used here.\n\n .. versionchanged:: 0.22\n ``cv`` default value if None changed from 3-fold to 5-fold.\n\nscoring : str or callable, default=None\n A str (see model evaluation documentation) or\n a scorer callable object / function with signature\n ``scorer(estimator, X, y)``.\n\nexploit_incremental_learning : bool, default=False\n If the estimator supports incremental learning, this will be\n used to speed up fitting for different training set sizes.\n\nn_jobs : int, default=None\n Number of jobs to run in parallel. Training the estimator and computing\n the score are parallelized over the different training and test sets.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\npre_dispatch : int or str, default='all'\n Number of predispatched jobs for parallel execution (default is\n all). The option can reduce the allocated memory. The str can\n be an expression like '2*n_jobs'.\n\nverbose : int, default=0\n Controls the verbosity: the higher, the more messages.\n\nshuffle : bool, default=False\n Whether to shuffle training data before taking prefixes of it\n based on``train_sizes``.\n\nrandom_state : int, RandomState instance or None, default=None\n Used when ``shuffle`` is True. Pass an int for reproducible\n output across multiple function calls.\n See :term:`Glossary `.\n\nerror_score : 'raise' or numeric, default=np.nan\n Value to assign to the score if an error occurs in estimator fitting.\n If set to 'raise', the error is raised.\n If a numeric value is given, FitFailedWarning is raised.\n\n .. versionadded:: 0.20\n\nreturn_times : bool, default=False\n Whether to return the fit and score times.\n\nfit_params : dict, default=None\n Parameters to pass to the fit method of the estimator.\n\n .. versionadded:: 0.24\n\nReturns\n-------\ntrain_sizes_abs : array of shape (n_unique_ticks,)\n Numbers of training examples that has been used to generate the\n learning curve. Note that the number of ticks might be less\n than n_ticks because duplicate entries will be removed.\n\ntrain_scores : array of shape (n_ticks, n_cv_folds)\n Scores on training sets.\n\ntest_scores : array of shape (n_ticks, n_cv_folds)\n Scores on test set.\n\nfit_times : array of shape (n_ticks, n_cv_folds)\n Times spent for fitting in seconds. Only present if ``return_times``\n is True.\n\nscore_times : array of shape (n_ticks, n_cv_folds)\n Times spent for scoring in seconds. Only present if ``return_times``\n is True.\n\nNotes\n-----\nSee :ref:`examples/model_selection/plot_learning_curve.py\n`", + "description": "Learning curve.\n\nDetermines cross-validated training and test scores for different training\nset sizes.\n\nA cross-validation generator splits the whole dataset k times in training\nand test data. Subsets of the training set with varying sizes will be used\nto train the estimator and a score for each training subset size and the\ntest set will be computed. Afterwards, the scores will be averaged over\nall k runs for each training subset size.\n\nRead more in the :ref:`User Guide `.", + "docstring": "Learning curve.\n\n Determines cross-validated training and test scores for different training\n set sizes.\n\n A cross-validation generator splits the whole dataset k times in training\n and test data. 
Subsets of the training set with varying sizes will be used\n to train the estimator and a score for each training subset size and the\n test set will be computed. Afterwards, the scores will be averaged over\n all k runs for each training subset size.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n estimator : object type that implements the \"fit\" and \"predict\" methods\n An object of that type which is cloned for each validation.\n\n X : array-like of shape (n_samples, n_features)\n Training vector, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\n y : array-like of shape (n_samples,) or (n_samples, n_outputs)\n Target relative to X for classification or regression;\n None for unsupervised learning.\n\n groups : array-like of shape (n_samples,), default=None\n Group labels for the samples used while splitting the dataset into\n train/test set. Only used in conjunction with a \"Group\" :term:`cv`\n instance (e.g., :class:`GroupKFold`).\n\n train_sizes : array-like of shape (n_ticks,), default=np.linspace(0.1, 1.0, 5)\n Relative or absolute numbers of training examples that will be used to\n generate the learning curve. If the dtype is float, it is regarded as a\n fraction of the maximum size of the training set (that is determined\n by the selected validation method), i.e. it has to be within (0, 1].\n Otherwise it is interpreted as absolute sizes of the training sets.\n Note that for classification the number of samples usually have to\n be big enough to contain at least one sample from each class.\n\n cv : int, cross-validation generator or an iterable, default=None\n Determines the cross-validation splitting strategy.\n Possible inputs for cv are:\n\n - None, to use the default 5-fold cross validation,\n - int, to specify the number of folds in a `(Stratified)KFold`,\n - :term:`CV splitter`,\n - An iterable yielding (train, test) splits as arrays of indices.\n\n For int/None inputs, if the estimator is a classifier and ``y`` is\n either binary or multiclass, :class:`StratifiedKFold` is used. In all\n other cases, :class:`KFold` is used. These splitters are instantiated\n with `shuffle=False` so the splits will be the same across calls.\n\n Refer :ref:`User Guide ` for the various\n cross-validation strategies that can be used here.\n\n .. versionchanged:: 0.22\n ``cv`` default value if None changed from 3-fold to 5-fold.\n\n scoring : str or callable, default=None\n A str (see model evaluation documentation) or\n a scorer callable object / function with signature\n ``scorer(estimator, X, y)``.\n\n exploit_incremental_learning : bool, default=False\n If the estimator supports incremental learning, this will be\n used to speed up fitting for different training set sizes.\n\n n_jobs : int, default=None\n Number of jobs to run in parallel. Training the estimator and computing\n the score are parallelized over the different training and test sets.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n pre_dispatch : int or str, default='all'\n Number of predispatched jobs for parallel execution (default is\n all). The option can reduce the allocated memory. 
The str can\n be an expression like '2*n_jobs'.\n\n verbose : int, default=0\n Controls the verbosity: the higher, the more messages.\n\n shuffle : bool, default=False\n Whether to shuffle training data before taking prefixes of it\n based on``train_sizes``.\n\n random_state : int, RandomState instance or None, default=None\n Used when ``shuffle`` is True. Pass an int for reproducible\n output across multiple function calls.\n See :term:`Glossary `.\n\n error_score : 'raise' or numeric, default=np.nan\n Value to assign to the score if an error occurs in estimator fitting.\n If set to 'raise', the error is raised.\n If a numeric value is given, FitFailedWarning is raised.\n\n .. versionadded:: 0.20\n\n return_times : bool, default=False\n Whether to return the fit and score times.\n\n fit_params : dict, default=None\n Parameters to pass to the fit method of the estimator.\n\n .. versionadded:: 0.24\n\n Returns\n -------\n train_sizes_abs : array of shape (n_unique_ticks,)\n Numbers of training examples that has been used to generate the\n learning curve. Note that the number of ticks might be less\n than n_ticks because duplicate entries will be removed.\n\n train_scores : array of shape (n_ticks, n_cv_folds)\n Scores on training sets.\n\n test_scores : array of shape (n_ticks, n_cv_folds)\n Scores on test set.\n\n fit_times : array of shape (n_ticks, n_cv_folds)\n Times spent for fitting in seconds. Only present if ``return_times``\n is True.\n\n score_times : array of shape (n_ticks, n_cv_folds)\n Times spent for scoring in seconds. Only present if ``return_times``\n is True.\n\n Notes\n -----\n See :ref:`examples/model_selection/plot_learning_curve.py\n `\n ", "source_code": "\ndef learning_curve(estimator, X, y, *, groups=None, train_sizes=np.linspace(0.1, 1.0, 5), cv=None, scoring=None, exploit_incremental_learning=False, n_jobs=None, pre_dispatch='all', verbose=0, shuffle=False, random_state=None, error_score=np.nan, return_times=False, fit_params=None):\n \"\"\"Learning curve.\n\n Determines cross-validated training and test scores for different training\n set sizes.\n\n A cross-validation generator splits the whole dataset k times in training\n and test data. Subsets of the training set with varying sizes will be used\n to train the estimator and a score for each training subset size and the\n test set will be computed. Afterwards, the scores will be averaged over\n all k runs for each training subset size.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n estimator : object type that implements the \"fit\" and \"predict\" methods\n An object of that type which is cloned for each validation.\n\n X : array-like of shape (n_samples, n_features)\n Training vector, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\n y : array-like of shape (n_samples,) or (n_samples, n_outputs)\n Target relative to X for classification or regression;\n None for unsupervised learning.\n\n groups : array-like of shape (n_samples,), default=None\n Group labels for the samples used while splitting the dataset into\n train/test set. Only used in conjunction with a \"Group\" :term:`cv`\n instance (e.g., :class:`GroupKFold`).\n\n train_sizes : array-like of shape (n_ticks,), default=np.linspace(0.1, 1.0, 5)\n Relative or absolute numbers of training examples that will be used to\n generate the learning curve. 
If the dtype is float, it is regarded as a\n fraction of the maximum size of the training set (that is determined\n by the selected validation method), i.e. it has to be within (0, 1].\n Otherwise it is interpreted as absolute sizes of the training sets.\n Note that for classification the number of samples usually have to\n be big enough to contain at least one sample from each class.\n\n cv : int, cross-validation generator or an iterable, default=None\n Determines the cross-validation splitting strategy.\n Possible inputs for cv are:\n\n - None, to use the default 5-fold cross validation,\n - int, to specify the number of folds in a `(Stratified)KFold`,\n - :term:`CV splitter`,\n - An iterable yielding (train, test) splits as arrays of indices.\n\n For int/None inputs, if the estimator is a classifier and ``y`` is\n either binary or multiclass, :class:`StratifiedKFold` is used. In all\n other cases, :class:`KFold` is used. These splitters are instantiated\n with `shuffle=False` so the splits will be the same across calls.\n\n Refer :ref:`User Guide ` for the various\n cross-validation strategies that can be used here.\n\n .. versionchanged:: 0.22\n ``cv`` default value if None changed from 3-fold to 5-fold.\n\n scoring : str or callable, default=None\n A str (see model evaluation documentation) or\n a scorer callable object / function with signature\n ``scorer(estimator, X, y)``.\n\n exploit_incremental_learning : bool, default=False\n If the estimator supports incremental learning, this will be\n used to speed up fitting for different training set sizes.\n\n n_jobs : int, default=None\n Number of jobs to run in parallel. Training the estimator and computing\n the score are parallelized over the different training and test sets.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n pre_dispatch : int or str, default='all'\n Number of predispatched jobs for parallel execution (default is\n all). The option can reduce the allocated memory. The str can\n be an expression like '2*n_jobs'.\n\n verbose : int, default=0\n Controls the verbosity: the higher, the more messages.\n\n shuffle : bool, default=False\n Whether to shuffle training data before taking prefixes of it\n based on``train_sizes``.\n\n random_state : int, RandomState instance or None, default=None\n Used when ``shuffle`` is True. Pass an int for reproducible\n output across multiple function calls.\n See :term:`Glossary `.\n\n error_score : 'raise' or numeric, default=np.nan\n Value to assign to the score if an error occurs in estimator fitting.\n If set to 'raise', the error is raised.\n If a numeric value is given, FitFailedWarning is raised.\n\n .. versionadded:: 0.20\n\n return_times : bool, default=False\n Whether to return the fit and score times.\n\n fit_params : dict, default=None\n Parameters to pass to the fit method of the estimator.\n\n .. versionadded:: 0.24\n\n Returns\n -------\n train_sizes_abs : array of shape (n_unique_ticks,)\n Numbers of training examples that has been used to generate the\n learning curve. Note that the number of ticks might be less\n than n_ticks because duplicate entries will be removed.\n\n train_scores : array of shape (n_ticks, n_cv_folds)\n Scores on training sets.\n\n test_scores : array of shape (n_ticks, n_cv_folds)\n Scores on test set.\n\n fit_times : array of shape (n_ticks, n_cv_folds)\n Times spent for fitting in seconds. 
Only present if ``return_times``\n is True.\n\n score_times : array of shape (n_ticks, n_cv_folds)\n Times spent for scoring in seconds. Only present if ``return_times``\n is True.\n\n Notes\n -----\n See :ref:`examples/model_selection/plot_learning_curve.py\n `\n \"\"\"\n if exploit_incremental_learning and not hasattr(estimator, 'partial_fit'):\n raise ValueError('An estimator must support the partial_fit interface to exploit incremental learning')\n (X, y, groups) = indexable(X, y, groups)\n cv = check_cv(cv, y, classifier=is_classifier(estimator))\n cv_iter = list(cv.split(X, y, groups))\n scorer = check_scoring(estimator, scoring=scoring)\n n_max_training_samples = len(cv_iter[0][0])\n train_sizes_abs = _translate_train_sizes(train_sizes, n_max_training_samples)\n n_unique_ticks = train_sizes_abs.shape[0]\n if verbose > 0:\n print('[learning_curve] Training set sizes: ' + str(train_sizes_abs))\n parallel = Parallel(n_jobs=n_jobs, pre_dispatch=pre_dispatch, verbose=verbose)\n if shuffle:\n rng = check_random_state(random_state)\n cv_iter = ((rng.permutation(train), test) for (train, test) in cv_iter)\n if exploit_incremental_learning:\n classes = np.unique(y) if is_classifier(estimator) else None\n out = parallel((delayed(_incremental_fit_estimator)(clone(estimator), X, y, classes, train, test, train_sizes_abs, scorer, verbose, return_times, error_score=error_score, fit_params=fit_params) for (train, test) in cv_iter))\n out = np.asarray(out).transpose((2, 1, 0))\n else:\n train_test_proportions = []\n for (train, test) in cv_iter:\n for n_train_samples in train_sizes_abs:\n train_test_proportions.append((train[:n_train_samples], test))\n results = parallel((delayed(_fit_and_score)(clone(estimator), X, y, scorer, train, test, verbose, parameters=None, fit_params=fit_params, return_train_score=True, error_score=error_score, return_times=return_times) for (train, test) in train_test_proportions))\n results = _aggregate_score_dicts(results)\n train_scores = results['train_scores'].reshape(-1, n_unique_ticks).T\n test_scores = results['test_scores'].reshape(-1, n_unique_ticks).T\n out = [train_scores, test_scores]\n if return_times:\n fit_times = results['fit_time'].reshape(-1, n_unique_ticks).T\n score_times = results['score_time'].reshape(-1, n_unique_ticks).T\n out.extend([fit_times, score_times])\n ret = (train_sizes_abs, out[0], out[1])\n if return_times:\n ret = ret + (out[2], out[3])\n return ret" }, { @@ -135781,7 +146126,8 @@ "docstring": { "type": "estimator object implementing 'fit'", "description": "The object to use to fit the data." - } + }, + "refined_type": {} }, { "name": "X", @@ -135791,7 +146137,8 @@ "docstring": { "type": "array-like of shape at least 2D", "description": "The data to fit." - } + }, + "refined_type": {} }, { "name": "y", @@ -135801,7 +146148,8 @@ "docstring": { "type": "array-like of shape (n_samples,) or (n_samples, n_outputs) or None", "description": "The target variable to try to predict in the case of\nsupervised learning." - } + }, + "refined_type": {} }, { "name": "groups", @@ -135811,7 +146159,8 @@ "docstring": { "type": "array-like of shape (n_samples,), default=None", "description": "Labels to constrain permutation within groups, i.e. ``y`` values\nare permuted among samples with the same group identifier.\nWhen not specified, ``y`` values are permuted among all samples.\n\nWhen a grouped cross-validator is used, the group labels are\nalso passed on to the ``split`` method of the cross-validator. 
The\ncross-validator uses them for grouping the samples while splitting\nthe dataset into train/test set." - } + }, + "refined_type": {} }, { "name": "cv", @@ -135821,7 +146170,8 @@ "docstring": { "type": "int, cross-validation generator or an iterable, default=None", "description": "Determines the cross-validation splitting strategy.\nPossible inputs for cv are:\n\n- `None`, to use the default 5-fold cross validation,\n- int, to specify the number of folds in a `(Stratified)KFold`,\n- :term:`CV splitter`,\n- An iterable yielding (train, test) splits as arrays of indices.\n\nFor `int`/`None` inputs, if the estimator is a classifier and `y` is\neither binary or multiclass, :class:`StratifiedKFold` is used. In all\nother cases, :class:`KFold` is used. These splitters are instantiated\nwith `shuffle=False` so the splits will be the same across calls.\n\nRefer :ref:`User Guide ` for the various\ncross-validation strategies that can be used here.\n\n.. versionchanged:: 0.22\n `cv` default value if `None` changed from 3-fold to 5-fold." - } + }, + "refined_type": {} }, { "name": "n_permutations", @@ -135831,7 +146181,8 @@ "docstring": { "type": "int, default=100", "description": "Number of times to permute ``y``." - } + }, + "refined_type": {} }, { "name": "n_jobs", @@ -135841,7 +146192,8 @@ "docstring": { "type": "int, default=None", "description": "Number of jobs to run in parallel. Training the estimator and computing\nthe cross-validated score are parallelized over the permutations.\n``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n``-1`` means using all processors. See :term:`Glossary `\nfor more details." - } + }, + "refined_type": {} }, { "name": "random_state", @@ -135851,7 +146203,8 @@ "docstring": { "type": "int, RandomState instance or None, default=0", "description": "Pass an int for reproducible output for permutation of\n``y`` values among samples. See :term:`Glossary `." - } + }, + "refined_type": {} }, { "name": "verbose", @@ -135861,7 +146214,8 @@ "docstring": { "type": "int, default=0", "description": "The verbosity level." - } + }, + "refined_type": {} }, { "name": "scoring", @@ -135871,7 +146225,8 @@ "docstring": { "type": "str or callable, default=None", "description": "A single str (see :ref:`scoring_parameter`) or a callable\n(see :ref:`scoring`) to evaluate the predictions on the test set.\n\nIf `None` the estimator's score method is used." - } + }, + "refined_type": {} }, { "name": "fit_params", @@ -135881,13 +146236,14 @@ "docstring": { "type": "dict, default=None", "description": "Parameters to pass to the fit method of the estimator.\n\n.. versionadded:: 0.24" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Evaluate the significance of a cross-validated score with permutations\n\nPermutes targets to generate 'randomized data' and compute the empirical p-value against the null hypothesis that features and targets are independent. The p-value represents the fraction of randomized data sets where the estimator performed as well or better than in the original data. A small p-value suggests that there is a real dependency between features and targets which has been used by the estimator to give good predictions. A large p-value may be due to lack of real dependency between features and targets or the estimator was not able to use the dependency to give good predictions. 
Read more in the :ref:`User Guide `.", - "docstring": "Evaluate the significance of a cross-validated score with permutations\n\nPermutes targets to generate 'randomized data' and compute the empirical\np-value against the null hypothesis that features and targets are\nindependent.\n\nThe p-value represents the fraction of randomized data sets where the\nestimator performed as well or better than in the original data. A small\np-value suggests that there is a real dependency between features and\ntargets which has been used by the estimator to give good predictions.\nA large p-value may be due to lack of real dependency between features\nand targets or the estimator was not able to use the dependency to\ngive good predictions.\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\nestimator : estimator object implementing 'fit'\n The object to use to fit the data.\n\nX : array-like of shape at least 2D\n The data to fit.\n\ny : array-like of shape (n_samples,) or (n_samples, n_outputs) or None\n The target variable to try to predict in the case of\n supervised learning.\n\ngroups : array-like of shape (n_samples,), default=None\n Labels to constrain permutation within groups, i.e. ``y`` values\n are permuted among samples with the same group identifier.\n When not specified, ``y`` values are permuted among all samples.\n\n When a grouped cross-validator is used, the group labels are\n also passed on to the ``split`` method of the cross-validator. The\n cross-validator uses them for grouping the samples while splitting\n the dataset into train/test set.\n\nscoring : str or callable, default=None\n A single str (see :ref:`scoring_parameter`) or a callable\n (see :ref:`scoring`) to evaluate the predictions on the test set.\n\n If `None` the estimator's score method is used.\n\ncv : int, cross-validation generator or an iterable, default=None\n Determines the cross-validation splitting strategy.\n Possible inputs for cv are:\n\n - `None`, to use the default 5-fold cross validation,\n - int, to specify the number of folds in a `(Stratified)KFold`,\n - :term:`CV splitter`,\n - An iterable yielding (train, test) splits as arrays of indices.\n\n For `int`/`None` inputs, if the estimator is a classifier and `y` is\n either binary or multiclass, :class:`StratifiedKFold` is used. In all\n other cases, :class:`KFold` is used. These splitters are instantiated\n with `shuffle=False` so the splits will be the same across calls.\n\n Refer :ref:`User Guide ` for the various\n cross-validation strategies that can be used here.\n\n .. versionchanged:: 0.22\n `cv` default value if `None` changed from 3-fold to 5-fold.\n\nn_permutations : int, default=100\n Number of times to permute ``y``.\n\nn_jobs : int, default=None\n Number of jobs to run in parallel. Training the estimator and computing\n the cross-validated score are parallelized over the permutations.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\nrandom_state : int, RandomState instance or None, default=0\n Pass an int for reproducible output for permutation of\n ``y`` values among samples. See :term:`Glossary `.\n\nverbose : int, default=0\n The verbosity level.\n\nfit_params : dict, default=None\n Parameters to pass to the fit method of the estimator.\n\n .. 
versionadded:: 0.24\n\nReturns\n-------\nscore : float\n The true score without permuting targets.\n\npermutation_scores : array of shape (n_permutations,)\n The scores obtained for each permutations.\n\npvalue : float\n The p-value, which approximates the probability that the score would\n be obtained by chance. This is calculated as:\n\n `(C + 1) / (n_permutations + 1)`\n\n Where C is the number of permutations whose score >= the true score.\n\n The best possible p-value is 1/(n_permutations + 1), the worst is 1.0.\n\nNotes\n-----\nThis function implements Test 1 in:\n\n Ojala and Garriga. `Permutation Tests for Studying Classifier\n Performance\n `_. The\n Journal of Machine Learning Research (2010) vol. 11", + "description": "Evaluate the significance of a cross-validated score with permutations\n\nPermutes targets to generate 'randomized data' and compute the empirical\np-value against the null hypothesis that features and targets are\nindependent.\n\nThe p-value represents the fraction of randomized data sets where the\nestimator performed as well or better than in the original data. A small\np-value suggests that there is a real dependency between features and\ntargets which has been used by the estimator to give good predictions.\nA large p-value may be due to lack of real dependency between features\nand targets or the estimator was not able to use the dependency to\ngive good predictions.\n\nRead more in the :ref:`User Guide `.", + "docstring": "Evaluate the significance of a cross-validated score with permutations\n\n Permutes targets to generate 'randomized data' and compute the empirical\n p-value against the null hypothesis that features and targets are\n independent.\n\n The p-value represents the fraction of randomized data sets where the\n estimator performed as well or better than in the original data. A small\n p-value suggests that there is a real dependency between features and\n targets which has been used by the estimator to give good predictions.\n A large p-value may be due to lack of real dependency between features\n and targets or the estimator was not able to use the dependency to\n give good predictions.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n estimator : estimator object implementing 'fit'\n The object to use to fit the data.\n\n X : array-like of shape at least 2D\n The data to fit.\n\n y : array-like of shape (n_samples,) or (n_samples, n_outputs) or None\n The target variable to try to predict in the case of\n supervised learning.\n\n groups : array-like of shape (n_samples,), default=None\n Labels to constrain permutation within groups, i.e. ``y`` values\n are permuted among samples with the same group identifier.\n When not specified, ``y`` values are permuted among all samples.\n\n When a grouped cross-validator is used, the group labels are\n also passed on to the ``split`` method of the cross-validator. 
The\n cross-validator uses them for grouping the samples while splitting\n the dataset into train/test set.\n\n scoring : str or callable, default=None\n A single str (see :ref:`scoring_parameter`) or a callable\n (see :ref:`scoring`) to evaluate the predictions on the test set.\n\n If `None` the estimator's score method is used.\n\n cv : int, cross-validation generator or an iterable, default=None\n Determines the cross-validation splitting strategy.\n Possible inputs for cv are:\n\n - `None`, to use the default 5-fold cross validation,\n - int, to specify the number of folds in a `(Stratified)KFold`,\n - :term:`CV splitter`,\n - An iterable yielding (train, test) splits as arrays of indices.\n\n For `int`/`None` inputs, if the estimator is a classifier and `y` is\n either binary or multiclass, :class:`StratifiedKFold` is used. In all\n other cases, :class:`KFold` is used. These splitters are instantiated\n with `shuffle=False` so the splits will be the same across calls.\n\n Refer :ref:`User Guide ` for the various\n cross-validation strategies that can be used here.\n\n .. versionchanged:: 0.22\n `cv` default value if `None` changed from 3-fold to 5-fold.\n\n n_permutations : int, default=100\n Number of times to permute ``y``.\n\n n_jobs : int, default=None\n Number of jobs to run in parallel. Training the estimator and computing\n the cross-validated score are parallelized over the permutations.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n random_state : int, RandomState instance or None, default=0\n Pass an int for reproducible output for permutation of\n ``y`` values among samples. See :term:`Glossary `.\n\n verbose : int, default=0\n The verbosity level.\n\n fit_params : dict, default=None\n Parameters to pass to the fit method of the estimator.\n\n .. versionadded:: 0.24\n\n Returns\n -------\n score : float\n The true score without permuting targets.\n\n permutation_scores : array of shape (n_permutations,)\n The scores obtained for each permutations.\n\n pvalue : float\n The p-value, which approximates the probability that the score would\n be obtained by chance. This is calculated as:\n\n `(C + 1) / (n_permutations + 1)`\n\n Where C is the number of permutations whose score >= the true score.\n\n The best possible p-value is 1/(n_permutations + 1), the worst is 1.0.\n\n Notes\n -----\n This function implements Test 1 in:\n\n Ojala and Garriga. `Permutation Tests for Studying Classifier\n Performance\n `_. The\n Journal of Machine Learning Research (2010) vol. 11\n\n ", "source_code": "\ndef permutation_test_score(estimator, X, y, *, groups=None, cv=None, n_permutations=100, n_jobs=None, random_state=0, verbose=0, scoring=None, fit_params=None):\n \"\"\"Evaluate the significance of a cross-validated score with permutations\n\n Permutes targets to generate 'randomized data' and compute the empirical\n p-value against the null hypothesis that features and targets are\n independent.\n\n The p-value represents the fraction of randomized data sets where the\n estimator performed as well or better than in the original data. 
A small\n p-value suggests that there is a real dependency between features and\n targets which has been used by the estimator to give good predictions.\n A large p-value may be due to lack of real dependency between features\n and targets or the estimator was not able to use the dependency to\n give good predictions.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n estimator : estimator object implementing 'fit'\n The object to use to fit the data.\n\n X : array-like of shape at least 2D\n The data to fit.\n\n y : array-like of shape (n_samples,) or (n_samples, n_outputs) or None\n The target variable to try to predict in the case of\n supervised learning.\n\n groups : array-like of shape (n_samples,), default=None\n Labels to constrain permutation within groups, i.e. ``y`` values\n are permuted among samples with the same group identifier.\n When not specified, ``y`` values are permuted among all samples.\n\n When a grouped cross-validator is used, the group labels are\n also passed on to the ``split`` method of the cross-validator. The\n cross-validator uses them for grouping the samples while splitting\n the dataset into train/test set.\n\n scoring : str or callable, default=None\n A single str (see :ref:`scoring_parameter`) or a callable\n (see :ref:`scoring`) to evaluate the predictions on the test set.\n\n If `None` the estimator's score method is used.\n\n cv : int, cross-validation generator or an iterable, default=None\n Determines the cross-validation splitting strategy.\n Possible inputs for cv are:\n\n - `None`, to use the default 5-fold cross validation,\n - int, to specify the number of folds in a `(Stratified)KFold`,\n - :term:`CV splitter`,\n - An iterable yielding (train, test) splits as arrays of indices.\n\n For `int`/`None` inputs, if the estimator is a classifier and `y` is\n either binary or multiclass, :class:`StratifiedKFold` is used. In all\n other cases, :class:`KFold` is used. These splitters are instantiated\n with `shuffle=False` so the splits will be the same across calls.\n\n Refer :ref:`User Guide ` for the various\n cross-validation strategies that can be used here.\n\n .. versionchanged:: 0.22\n `cv` default value if `None` changed from 3-fold to 5-fold.\n\n n_permutations : int, default=100\n Number of times to permute ``y``.\n\n n_jobs : int, default=None\n Number of jobs to run in parallel. Training the estimator and computing\n the cross-validated score are parallelized over the permutations.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n random_state : int, RandomState instance or None, default=0\n Pass an int for reproducible output for permutation of\n ``y`` values among samples. See :term:`Glossary `.\n\n verbose : int, default=0\n The verbosity level.\n\n fit_params : dict, default=None\n Parameters to pass to the fit method of the estimator.\n\n .. versionadded:: 0.24\n\n Returns\n -------\n score : float\n The true score without permuting targets.\n\n permutation_scores : array of shape (n_permutations,)\n The scores obtained for each permutations.\n\n pvalue : float\n The p-value, which approximates the probability that the score would\n be obtained by chance. 
This is calculated as:\n\n `(C + 1) / (n_permutations + 1)`\n\n Where C is the number of permutations whose score >= the true score.\n\n The best possible p-value is 1/(n_permutations + 1), the worst is 1.0.\n\n Notes\n -----\n This function implements Test 1 in:\n\n Ojala and Garriga. `Permutation Tests for Studying Classifier\n Performance\n `_. The\n Journal of Machine Learning Research (2010) vol. 11\n\n \"\"\"\n (X, y, groups) = indexable(X, y, groups)\n cv = check_cv(cv, y, classifier=is_classifier(estimator))\n scorer = check_scoring(estimator, scoring=scoring)\n random_state = check_random_state(random_state)\n score = _permutation_test_score(clone(estimator), X, y, groups, cv, scorer, fit_params=fit_params)\n permutation_scores = Parallel(n_jobs=n_jobs, verbose=verbose)((delayed(_permutation_test_score)(clone(estimator), X, _shuffle(y, groups, random_state), groups, cv, scorer, fit_params=fit_params) for _ in range(n_permutations)))\n permutation_scores = np.array(permutation_scores)\n pvalue = (np.sum(permutation_scores >= score) + 1.0) / (n_permutations + 1)\n return score, permutation_scores, pvalue" }, { @@ -135905,7 +146261,8 @@ "docstring": { "type": "object type that implements the \"fit\" and \"predict\" methods", "description": "An object of that type which is cloned for each validation." - } + }, + "refined_type": {} }, { "name": "X", @@ -135915,7 +146272,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "Training vector, where `n_samples` is the number of samples and\n`n_features` is the number of features." - } + }, + "refined_type": {} }, { "name": "y", @@ -135925,7 +146283,8 @@ "docstring": { "type": "array-like of shape (n_samples,) or (n_samples, n_outputs) or None", "description": "Target relative to X for classification or regression;\nNone for unsupervised learning." - } + }, + "refined_type": {} }, { "name": "param_name", @@ -135935,7 +146294,8 @@ "docstring": { "type": "str", "description": "Name of the parameter that will be varied." - } + }, + "refined_type": {} }, { "name": "param_range", @@ -135945,7 +146305,8 @@ "docstring": { "type": "array-like of shape (n_values,)", "description": "The values of the parameter that will be evaluated." - } + }, + "refined_type": {} }, { "name": "groups", @@ -135955,7 +146316,8 @@ "docstring": { "type": "array-like of shape (n_samples,), default=None", "description": "Group labels for the samples used while splitting the dataset into\ntrain/test set. Only used in conjunction with a \"Group\" :term:`cv`\ninstance (e.g., :class:`GroupKFold`)." - } + }, + "refined_type": {} }, { "name": "cv", @@ -135965,7 +146327,8 @@ "docstring": { "type": "int, cross-validation generator or an iterable, default=None", "description": "Determines the cross-validation splitting strategy.\nPossible inputs for cv are:\n\n- None, to use the default 5-fold cross validation,\n- int, to specify the number of folds in a `(Stratified)KFold`,\n- :term:`CV splitter`,\n- An iterable yielding (train, test) splits as arrays of indices.\n\nFor int/None inputs, if the estimator is a classifier and ``y`` is\neither binary or multiclass, :class:`StratifiedKFold` is used. In all\nother cases, :class:`KFold` is used. These splitters are instantiated\nwith `shuffle=False` so the splits will be the same across calls.\n\nRefer :ref:`User Guide ` for the various\ncross-validation strategies that can be used here.\n\n.. versionchanged:: 0.22\n ``cv`` default value if None changed from 3-fold to 5-fold." 
- } + }, + "refined_type": {} }, { "name": "scoring", @@ -135975,7 +146338,8 @@ "docstring": { "type": "str or callable, default=None", "description": "A str (see model evaluation documentation) or\na scorer callable object / function with signature\n``scorer(estimator, X, y)``." - } + }, + "refined_type": {} }, { "name": "n_jobs", @@ -135985,7 +146349,8 @@ "docstring": { "type": "int, default=None", "description": "Number of jobs to run in parallel. Training the estimator and computing\nthe score are parallelized over the combinations of each parameter\nvalue and each cross-validation split.\n``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n``-1`` means using all processors. See :term:`Glossary `\nfor more details." - } + }, + "refined_type": {} }, { "name": "pre_dispatch", @@ -135995,7 +146360,8 @@ "docstring": { "type": "int or str, default='all'", "description": "Number of predispatched jobs for parallel execution (default is\nall). The option can reduce the allocated memory. The str can\nbe an expression like '2*n_jobs'." - } + }, + "refined_type": {} }, { "name": "verbose", @@ -136005,7 +146371,8 @@ "docstring": { "type": "int, default=0", "description": "Controls the verbosity: the higher, the more messages." - } + }, + "refined_type": {} }, { "name": "error_score", @@ -136015,7 +146382,8 @@ "docstring": { "type": "'raise' or numeric, default=np.nan", "description": "Value to assign to the score if an error occurs in estimator fitting.\nIf set to 'raise', the error is raised.\nIf a numeric value is given, FitFailedWarning is raised.\n\n.. versionadded:: 0.20" - } + }, + "refined_type": {} }, { "name": "fit_params", @@ -136025,13 +146393,14 @@ "docstring": { "type": "dict, default=None", "description": "Parameters to pass to the fit method of the estimator.\n\n.. versionadded:: 0.24" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Validation curve.\n\nDetermine training and test scores for varying parameter values. Compute scores for an estimator with different values of a specified parameter. This is similar to grid search with one parameter. However, this will also compute training scores and is merely a utility for plotting the results. Read more in the :ref:`User Guide `.", - "docstring": "Validation curve.\n\nDetermine training and test scores for varying parameter values.\n\nCompute scores for an estimator with different values of a specified\nparameter. This is similar to grid search with one parameter. However, this\nwill also compute training scores and is merely a utility for plotting the\nresults.\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\nestimator : object type that implements the \"fit\" and \"predict\" methods\n An object of that type which is cloned for each validation.\n\nX : array-like of shape (n_samples, n_features)\n Training vector, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\ny : array-like of shape (n_samples,) or (n_samples, n_outputs) or None\n Target relative to X for classification or regression;\n None for unsupervised learning.\n\nparam_name : str\n Name of the parameter that will be varied.\n\nparam_range : array-like of shape (n_values,)\n The values of the parameter that will be evaluated.\n\ngroups : array-like of shape (n_samples,), default=None\n Group labels for the samples used while splitting the dataset into\n train/test set. 
Only used in conjunction with a \"Group\" :term:`cv`\n instance (e.g., :class:`GroupKFold`).\n\ncv : int, cross-validation generator or an iterable, default=None\n Determines the cross-validation splitting strategy.\n Possible inputs for cv are:\n\n - None, to use the default 5-fold cross validation,\n - int, to specify the number of folds in a `(Stratified)KFold`,\n - :term:`CV splitter`,\n - An iterable yielding (train, test) splits as arrays of indices.\n\n For int/None inputs, if the estimator is a classifier and ``y`` is\n either binary or multiclass, :class:`StratifiedKFold` is used. In all\n other cases, :class:`KFold` is used. These splitters are instantiated\n with `shuffle=False` so the splits will be the same across calls.\n\n Refer :ref:`User Guide ` for the various\n cross-validation strategies that can be used here.\n\n .. versionchanged:: 0.22\n ``cv`` default value if None changed from 3-fold to 5-fold.\n\nscoring : str or callable, default=None\n A str (see model evaluation documentation) or\n a scorer callable object / function with signature\n ``scorer(estimator, X, y)``.\n\nn_jobs : int, default=None\n Number of jobs to run in parallel. Training the estimator and computing\n the score are parallelized over the combinations of each parameter\n value and each cross-validation split.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\npre_dispatch : int or str, default='all'\n Number of predispatched jobs for parallel execution (default is\n all). The option can reduce the allocated memory. The str can\n be an expression like '2*n_jobs'.\n\nverbose : int, default=0\n Controls the verbosity: the higher, the more messages.\n\nfit_params : dict, default=None\n Parameters to pass to the fit method of the estimator.\n\n .. versionadded:: 0.24\n\nerror_score : 'raise' or numeric, default=np.nan\n Value to assign to the score if an error occurs in estimator fitting.\n If set to 'raise', the error is raised.\n If a numeric value is given, FitFailedWarning is raised.\n\n .. versionadded:: 0.20\n\nReturns\n-------\ntrain_scores : array of shape (n_ticks, n_cv_folds)\n Scores on training sets.\n\ntest_scores : array of shape (n_ticks, n_cv_folds)\n Scores on test set.\n\nNotes\n-----\nSee :ref:`sphx_glr_auto_examples_model_selection_plot_validation_curve.py`", + "description": "Validation curve.\n\nDetermine training and test scores for varying parameter values.\n\nCompute scores for an estimator with different values of a specified\nparameter. This is similar to grid search with one parameter. However, this\nwill also compute training scores and is merely a utility for plotting the\nresults.\n\nRead more in the :ref:`User Guide `.", + "docstring": "Validation curve.\n\n Determine training and test scores for varying parameter values.\n\n Compute scores for an estimator with different values of a specified\n parameter. This is similar to grid search with one parameter. 
However, this\n will also compute training scores and is merely a utility for plotting the\n results.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n estimator : object type that implements the \"fit\" and \"predict\" methods\n An object of that type which is cloned for each validation.\n\n X : array-like of shape (n_samples, n_features)\n Training vector, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\n y : array-like of shape (n_samples,) or (n_samples, n_outputs) or None\n Target relative to X for classification or regression;\n None for unsupervised learning.\n\n param_name : str\n Name of the parameter that will be varied.\n\n param_range : array-like of shape (n_values,)\n The values of the parameter that will be evaluated.\n\n groups : array-like of shape (n_samples,), default=None\n Group labels for the samples used while splitting the dataset into\n train/test set. Only used in conjunction with a \"Group\" :term:`cv`\n instance (e.g., :class:`GroupKFold`).\n\n cv : int, cross-validation generator or an iterable, default=None\n Determines the cross-validation splitting strategy.\n Possible inputs for cv are:\n\n - None, to use the default 5-fold cross validation,\n - int, to specify the number of folds in a `(Stratified)KFold`,\n - :term:`CV splitter`,\n - An iterable yielding (train, test) splits as arrays of indices.\n\n For int/None inputs, if the estimator is a classifier and ``y`` is\n either binary or multiclass, :class:`StratifiedKFold` is used. In all\n other cases, :class:`KFold` is used. These splitters are instantiated\n with `shuffle=False` so the splits will be the same across calls.\n\n Refer :ref:`User Guide ` for the various\n cross-validation strategies that can be used here.\n\n .. versionchanged:: 0.22\n ``cv`` default value if None changed from 3-fold to 5-fold.\n\n scoring : str or callable, default=None\n A str (see model evaluation documentation) or\n a scorer callable object / function with signature\n ``scorer(estimator, X, y)``.\n\n n_jobs : int, default=None\n Number of jobs to run in parallel. Training the estimator and computing\n the score are parallelized over the combinations of each parameter\n value and each cross-validation split.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n pre_dispatch : int or str, default='all'\n Number of predispatched jobs for parallel execution (default is\n all). The option can reduce the allocated memory. The str can\n be an expression like '2*n_jobs'.\n\n verbose : int, default=0\n Controls the verbosity: the higher, the more messages.\n\n fit_params : dict, default=None\n Parameters to pass to the fit method of the estimator.\n\n .. versionadded:: 0.24\n\n error_score : 'raise' or numeric, default=np.nan\n Value to assign to the score if an error occurs in estimator fitting.\n If set to 'raise', the error is raised.\n If a numeric value is given, FitFailedWarning is raised.\n\n .. 
versionadded:: 0.20\n\n Returns\n -------\n train_scores : array of shape (n_ticks, n_cv_folds)\n Scores on training sets.\n\n test_scores : array of shape (n_ticks, n_cv_folds)\n Scores on test set.\n\n Notes\n -----\n See :ref:`sphx_glr_auto_examples_model_selection_plot_validation_curve.py`\n\n ", "source_code": "\ndef validation_curve(estimator, X, y, *, param_name, param_range, groups=None, cv=None, scoring=None, n_jobs=None, pre_dispatch='all', verbose=0, error_score=np.nan, fit_params=None):\n \"\"\"Validation curve.\n\n Determine training and test scores for varying parameter values.\n\n Compute scores for an estimator with different values of a specified\n parameter. This is similar to grid search with one parameter. However, this\n will also compute training scores and is merely a utility for plotting the\n results.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n estimator : object type that implements the \"fit\" and \"predict\" methods\n An object of that type which is cloned for each validation.\n\n X : array-like of shape (n_samples, n_features)\n Training vector, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\n y : array-like of shape (n_samples,) or (n_samples, n_outputs) or None\n Target relative to X for classification or regression;\n None for unsupervised learning.\n\n param_name : str\n Name of the parameter that will be varied.\n\n param_range : array-like of shape (n_values,)\n The values of the parameter that will be evaluated.\n\n groups : array-like of shape (n_samples,), default=None\n Group labels for the samples used while splitting the dataset into\n train/test set. Only used in conjunction with a \"Group\" :term:`cv`\n instance (e.g., :class:`GroupKFold`).\n\n cv : int, cross-validation generator or an iterable, default=None\n Determines the cross-validation splitting strategy.\n Possible inputs for cv are:\n\n - None, to use the default 5-fold cross validation,\n - int, to specify the number of folds in a `(Stratified)KFold`,\n - :term:`CV splitter`,\n - An iterable yielding (train, test) splits as arrays of indices.\n\n For int/None inputs, if the estimator is a classifier and ``y`` is\n either binary or multiclass, :class:`StratifiedKFold` is used. In all\n other cases, :class:`KFold` is used. These splitters are instantiated\n with `shuffle=False` so the splits will be the same across calls.\n\n Refer :ref:`User Guide ` for the various\n cross-validation strategies that can be used here.\n\n .. versionchanged:: 0.22\n ``cv`` default value if None changed from 3-fold to 5-fold.\n\n scoring : str or callable, default=None\n A str (see model evaluation documentation) or\n a scorer callable object / function with signature\n ``scorer(estimator, X, y)``.\n\n n_jobs : int, default=None\n Number of jobs to run in parallel. Training the estimator and computing\n the score are parallelized over the combinations of each parameter\n value and each cross-validation split.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n pre_dispatch : int or str, default='all'\n Number of predispatched jobs for parallel execution (default is\n all). The option can reduce the allocated memory. 
The str can\n be an expression like '2*n_jobs'.\n\n verbose : int, default=0\n Controls the verbosity: the higher, the more messages.\n\n fit_params : dict, default=None\n Parameters to pass to the fit method of the estimator.\n\n .. versionadded:: 0.24\n\n error_score : 'raise' or numeric, default=np.nan\n Value to assign to the score if an error occurs in estimator fitting.\n If set to 'raise', the error is raised.\n If a numeric value is given, FitFailedWarning is raised.\n\n .. versionadded:: 0.20\n\n Returns\n -------\n train_scores : array of shape (n_ticks, n_cv_folds)\n Scores on training sets.\n\n test_scores : array of shape (n_ticks, n_cv_folds)\n Scores on test set.\n\n Notes\n -----\n See :ref:`sphx_glr_auto_examples_model_selection_plot_validation_curve.py`\n\n \"\"\"\n (X, y, groups) = indexable(X, y, groups)\n cv = check_cv(cv, y, classifier=is_classifier(estimator))\n scorer = check_scoring(estimator, scoring=scoring)\n parallel = Parallel(n_jobs=n_jobs, pre_dispatch=pre_dispatch, verbose=verbose)\n results = parallel((delayed(_fit_and_score)(clone(estimator), X, y, scorer, train, test, verbose, parameters={param_name: v}, fit_params=fit_params, return_train_score=True, error_score=error_score) for (train, test) in cv.split(X, y, groups) for v in param_range))\n n_params = len(param_range)\n results = _aggregate_score_dicts(results)\n train_scores = results['train_scores'].reshape(-1, n_params).T\n test_scores = results['test_scores'].reshape(-1, n_params).T\n return train_scores, test_scores" }, { @@ -136049,7 +146418,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "estimator", @@ -136059,7 +146429,8 @@ "docstring": { "type": "estimator object", "description": "An estimator object implementing :term:`fit` and one of\n:term:`decision_function` or :term:`predict_proba`." - } + }, + "refined_type": {} }, { "name": "n_jobs", @@ -136069,13 +146440,14 @@ "docstring": { "type": "int, default=None", "description": "The number of jobs to use for the computation: the `n_classes * (\nn_classes - 1) / 2` OVO problems are computed in parallel.\n\n``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n``-1`` means using all processors. See :term:`Glossary `\nfor more details." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, estimator, *, n_jobs=None):\n self.estimator = estimator\n self.n_jobs = n_jobs" }, { @@ -136093,7 +146465,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -136120,7 +146493,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -136144,7 +146518,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -136154,13 +146529,14 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "Input data." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Decision function for the OneVsOneClassifier.\n\nThe decision values for the samples are computed by adding the normalized sum of pair-wise classification confidence levels to the votes in order to disambiguate between the decision values when the votes for all the classes are equal leading to a tie.", - "docstring": "Decision function for the OneVsOneClassifier.\n\nThe decision values for the samples are computed by adding the\nnormalized sum of pair-wise classification confidence levels to the\nvotes in order to disambiguate between the decision values when the\nvotes for all the classes are equal leading to a tie.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n Input data.\n\nReturns\n-------\nY : array-like of shape (n_samples, n_classes) or (n_samples,)\n Result of calling `decision_function` on the final estimator.\n\n .. versionchanged:: 0.19\n output shape changed to ``(n_samples,)`` to conform to\n scikit-learn conventions for binary classification.", + "description": "Decision function for the OneVsOneClassifier.\n\nThe decision values for the samples are computed by adding the\nnormalized sum of pair-wise classification confidence levels to the\nvotes in order to disambiguate between the decision values when the\nvotes for all the classes are equal leading to a tie.", + "docstring": "Decision function for the OneVsOneClassifier.\n\n The decision values for the samples are computed by adding the\n normalized sum of pair-wise classification confidence levels to the\n votes in order to disambiguate between the decision values when the\n votes for all the classes are equal leading to a tie.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Input data.\n\n Returns\n -------\n Y : array-like of shape (n_samples, n_classes) or (n_samples,)\n Result of calling `decision_function` on the final estimator.\n\n .. versionchanged:: 0.19\n output shape changed to ``(n_samples,)`` to conform to\n scikit-learn conventions for binary classification.\n ", "source_code": "\ndef decision_function(self, X):\n \"\"\"Decision function for the OneVsOneClassifier.\n\n The decision values for the samples are computed by adding the\n normalized sum of pair-wise classification confidence levels to the\n votes in order to disambiguate between the decision values when the\n votes for all the classes are equal leading to a tie.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Input data.\n\n Returns\n -------\n Y : array-like of shape (n_samples, n_classes) or (n_samples,)\n Result of calling `decision_function` on the final estimator.\n\n .. 
versionchanged:: 0.19\n output shape changed to ``(n_samples,)`` to conform to\n scikit-learn conventions for binary classification.\n \"\"\"\n check_is_fitted(self)\n X = self._validate_data(X, accept_sparse=True, force_all_finite=False, reset=False)\n indices = self.pairwise_indices_\n if indices is None:\n Xs = [X] * len(self.estimators_)\n else:\n Xs = [X[:, idx] for idx in indices]\n predictions = np.vstack([est.predict(Xi) for (est, Xi) in zip(self.estimators_, Xs)]).T\n confidences = np.vstack([_predict_binary(est, Xi) for (est, Xi) in zip(self.estimators_, Xs)]).T\n Y = _ovr_decision_function(predictions, confidences, len(self.classes_))\n if self.n_classes_ == 2:\n return Y[:, 1]\n return Y" }, { @@ -136178,7 +146554,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -136188,7 +146565,8 @@ "docstring": { "type": "(sparse) array-like of shape (n_samples, n_features)", "description": "Data." - } + }, + "refined_type": {} }, { "name": "y", @@ -136198,13 +146576,14 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "Multi-class targets." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Fit underlying estimators.", - "docstring": "Fit underlying estimators.\n\nParameters\n----------\nX : (sparse) array-like of shape (n_samples, n_features)\n Data.\n\ny : array-like of shape (n_samples,)\n Multi-class targets.\n\nReturns\n-------\nself : object\n The fitted underlying estimator.", + "docstring": "Fit underlying estimators.\n\n Parameters\n ----------\n X : (sparse) array-like of shape (n_samples, n_features)\n Data.\n\n y : array-like of shape (n_samples,)\n Multi-class targets.\n\n Returns\n -------\n self : object\n The fitted underlying estimator.\n ", "source_code": "\ndef fit(self, X, y):\n \"\"\"Fit underlying estimators.\n\n Parameters\n ----------\n X : (sparse) array-like of shape (n_samples, n_features)\n Data.\n\n y : array-like of shape (n_samples,)\n Multi-class targets.\n\n Returns\n -------\n self : object\n The fitted underlying estimator.\n \"\"\"\n (X, y) = self._validate_data(X, y, accept_sparse=['csr', 'csc'], force_all_finite=False)\n check_classification_targets(y)\n self.classes_ = np.unique(y)\n if len(self.classes_) == 1:\n raise ValueError('OneVsOneClassifier can not be fit when only one class is present.')\n n_classes = self.classes_.shape[0]\n estimators_indices = list(zip(*Parallel(n_jobs=self.n_jobs)((delayed(_fit_ovo_binary)(self.estimator, X, y, self.classes_[i], self.classes_[j]) for i in range(n_classes) for j in range(i + 1, n_classes)))))\n self.estimators_ = estimators_indices[0]\n pairwise = _is_pairwise(self)\n self.pairwise_indices_ = estimators_indices[1] if pairwise else None\n return self" }, { @@ -136222,7 +146601,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -136246,7 +146626,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -136256,7 +146637,8 @@ "docstring": { "type": "(sparse) array-like of shape (n_samples, n_features)", "description": "Data." - } + }, + "refined_type": {} }, { "name": "y", @@ -136266,7 +146648,8 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "Multi-class targets." 
- } + }, + "refined_type": {} }, { "name": "classes", @@ -136276,13 +146659,14 @@ "docstring": { "type": "array, shape (n_classes, )", "description": "Classes across all calls to partial_fit.\nCan be obtained via `np.unique(y_all)`, where y_all is the\ntarget vector of the entire dataset.\nThis argument is only required in the first call of partial_fit\nand can be omitted in the subsequent calls." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Partially fit underlying estimators.\n\nShould be used when memory is inefficient to train all data. Chunks of data can be passed in several iteration, where the first call should have an array of all target variables.", - "docstring": "Partially fit underlying estimators.\n\nShould be used when memory is inefficient to train all data. Chunks\nof data can be passed in several iteration, where the first call\nshould have an array of all target variables.\n\nParameters\n----------\nX : (sparse) array-like of shape (n_samples, n_features)\n Data.\n\ny : array-like of shape (n_samples,)\n Multi-class targets.\n\nclasses : array, shape (n_classes, )\n Classes across all calls to partial_fit.\n Can be obtained via `np.unique(y_all)`, where y_all is the\n target vector of the entire dataset.\n This argument is only required in the first call of partial_fit\n and can be omitted in the subsequent calls.\n\nReturns\n-------\nself : object\n The partially fitted underlying estimator.", + "description": "Partially fit underlying estimators.\n\nShould be used when memory is inefficient to train all data. Chunks\nof data can be passed in several iteration, where the first call\nshould have an array of all target variables.", + "docstring": "Partially fit underlying estimators.\n\n Should be used when memory is inefficient to train all data. Chunks\n of data can be passed in several iteration, where the first call\n should have an array of all target variables.\n\n Parameters\n ----------\n X : (sparse) array-like of shape (n_samples, n_features)\n Data.\n\n y : array-like of shape (n_samples,)\n Multi-class targets.\n\n classes : array, shape (n_classes, )\n Classes across all calls to partial_fit.\n Can be obtained via `np.unique(y_all)`, where y_all is the\n target vector of the entire dataset.\n This argument is only required in the first call of partial_fit\n and can be omitted in the subsequent calls.\n\n Returns\n -------\n self : object\n The partially fitted underlying estimator.\n ", "source_code": "\n@available_if(_estimators_has('partial_fit'))\ndef partial_fit(self, X, y, classes=None):\n \"\"\"Partially fit underlying estimators.\n\n Should be used when memory is inefficient to train all data. 
Chunks\n of data can be passed in several iteration, where the first call\n should have an array of all target variables.\n\n Parameters\n ----------\n X : (sparse) array-like of shape (n_samples, n_features)\n Data.\n\n y : array-like of shape (n_samples,)\n Multi-class targets.\n\n classes : array, shape (n_classes, )\n Classes across all calls to partial_fit.\n Can be obtained via `np.unique(y_all)`, where y_all is the\n target vector of the entire dataset.\n This argument is only required in the first call of partial_fit\n and can be omitted in the subsequent calls.\n\n Returns\n -------\n self : object\n The partially fitted underlying estimator.\n \"\"\"\n first_call = _check_partial_fit_first_call(self, classes)\n if first_call:\n self.estimators_ = [clone(self.estimator) for _ in range(self.n_classes_ * (self.n_classes_ - 1) // 2)]\n if len(np.setdiff1d(y, self.classes_)):\n raise ValueError('Mini-batch contains {0} while it must be subset of {1}'.format(np.unique(y), self.classes_))\n (X, y) = self._validate_data(X, y, accept_sparse=['csr', 'csc'], force_all_finite=False, reset=first_call)\n check_classification_targets(y)\n combinations = itertools.combinations(range(self.n_classes_), 2)\n self.estimators_ = Parallel(n_jobs=self.n_jobs)((delayed(_partial_fit_ovo_binary)(estimator, X, y, self.classes_[i], self.classes_[j]) for (estimator, (i, j)) in zip(self.estimators_, combinations)))\n self.pairwise_indices_ = None\n if hasattr(self.estimators_[0], 'n_features_in_'):\n self.n_features_in_ = self.estimators_[0].n_features_in_\n return self" }, { @@ -136300,7 +146684,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -136310,13 +146695,14 @@ "docstring": { "type": "(sparse) array-like of shape (n_samples, n_features)", "description": "Data." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Estimate the best class label for each sample in X.\n\nThis is implemented as ``argmax(decision_function(X), axis=1)`` which will return the label of the class with most votes by estimators predicting the outcome of a decision for each possible class pair.", - "docstring": "Estimate the best class label for each sample in X.\n\nThis is implemented as ``argmax(decision_function(X), axis=1)`` which\nwill return the label of the class with most votes by estimators\npredicting the outcome of a decision for each possible class pair.\n\nParameters\n----------\nX : (sparse) array-like of shape (n_samples, n_features)\n Data.\n\nReturns\n-------\ny : numpy array of shape [n_samples]\n Predicted multi-class targets.", + "description": "Estimate the best class label for each sample in X.\n\nThis is implemented as ``argmax(decision_function(X), axis=1)`` which\nwill return the label of the class with most votes by estimators\npredicting the outcome of a decision for each possible class pair.", + "docstring": "Estimate the best class label for each sample in X.\n\n This is implemented as ``argmax(decision_function(X), axis=1)`` which\n will return the label of the class with most votes by estimators\n predicting the outcome of a decision for each possible class pair.\n\n Parameters\n ----------\n X : (sparse) array-like of shape (n_samples, n_features)\n Data.\n\n Returns\n -------\n y : numpy array of shape [n_samples]\n Predicted multi-class targets.\n ", "source_code": "\ndef predict(self, X):\n \"\"\"Estimate the best class label for each sample in X.\n\n This is implemented as ``argmax(decision_function(X), axis=1)`` which\n will return the label of the class with most votes by estimators\n predicting the outcome of a decision for each possible class pair.\n\n Parameters\n ----------\n X : (sparse) array-like of shape (n_samples, n_features)\n Data.\n\n Returns\n -------\n y : numpy array of shape [n_samples]\n Predicted multi-class targets.\n \"\"\"\n Y = self.decision_function(X)\n if self.n_classes_ == 2:\n return self.classes_[(Y > 0).astype(int)]\n return self.classes_[Y.argmax(axis=1)]" }, { @@ -136334,7 +146720,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "estimator", @@ -136344,7 +146731,8 @@ "docstring": { "type": "estimator object", "description": "An estimator object implementing :term:`fit` and one of\n:term:`decision_function` or :term:`predict_proba`." - } + }, + "refined_type": {} }, { "name": "n_jobs", @@ -136354,13 +146742,14 @@ "docstring": { "type": "int, default=None", "description": "The number of jobs to use for the computation: the `n_classes`\none-vs-rest problems are computed in parallel.\n\n``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n``-1`` means using all processors. See :term:`Glossary `\nfor more details.\n\n.. 
versionchanged:: v0.20\n `n_jobs` default changed from 1 to None" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, estimator, *, n_jobs=None):\n self.estimator = estimator\n self.n_jobs = n_jobs" }, { @@ -136378,7 +146767,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -136405,7 +146795,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -136432,13 +146823,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@deprecated('Attribute `coef_` was deprecated in version 0.24 and will be removed in 1.1 (renaming of 0.26). If you observe this warning while using RFE or SelectFromModel, use the importance_getter parameter instead.')\n@property\ndef coef_(self):\n check_is_fitted(self)\n if not hasattr(self.estimators_[0], 'coef_'):\n raise AttributeError(\"Base estimator doesn't have a coef_ attribute.\")\n coefs = [e.coef_ for e in self.estimators_]\n if sp.issparse(coefs[0]):\n return sp.vstack(coefs)\n return np.vstack(coefs)" }, { @@ -136458,7 +146850,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -136468,13 +146861,14 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "Input data." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Decision function for the OneVsRestClassifier.\n\nReturn the distance of each sample from the decision boundary for each class. This can only be used with estimators which implement the `decision_function` method.", - "docstring": "Decision function for the OneVsRestClassifier.\n\nReturn the distance of each sample from the decision boundary for each\nclass. This can only be used with estimators which implement the\n`decision_function` method.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n Input data.\n\nReturns\n-------\nT : array-like of shape (n_samples, n_classes) or (n_samples,) for binary classification.\n Result of calling `decision_function` on the final estimator.\n\n .. versionchanged:: 0.19\n output shape changed to ``(n_samples,)`` to conform to\n scikit-learn conventions for binary classification.", + "description": "Decision function for the OneVsRestClassifier.\n\nReturn the distance of each sample from the decision boundary for each\nclass. This can only be used with estimators which implement the\n`decision_function` method.", + "docstring": "Decision function for the OneVsRestClassifier.\n\n Return the distance of each sample from the decision boundary for each\n class. This can only be used with estimators which implement the\n `decision_function` method.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Input data.\n\n Returns\n -------\n T : array-like of shape (n_samples, n_classes) or (n_samples,) for binary classification.\n Result of calling `decision_function` on the final estimator.\n\n .. 
versionchanged:: 0.19\n output shape changed to ``(n_samples,)`` to conform to\n scikit-learn conventions for binary classification.\n ", "source_code": "\n@available_if(_estimators_has('decision_function'))\ndef decision_function(self, X):\n \"\"\"Decision function for the OneVsRestClassifier.\n\n Return the distance of each sample from the decision boundary for each\n class. This can only be used with estimators which implement the\n `decision_function` method.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Input data.\n\n Returns\n -------\n T : array-like of shape (n_samples, n_classes) or (n_samples,) for binary classification.\n Result of calling `decision_function` on the final estimator.\n\n .. versionchanged:: 0.19\n output shape changed to ``(n_samples,)`` to conform to\n scikit-learn conventions for binary classification.\n \"\"\"\n check_is_fitted(self)\n if len(self.estimators_) == 1:\n return self.estimators_[0].decision_function(X)\n return np.array([est.decision_function(X).ravel() for est in self.estimators_]).T" }, { @@ -136492,7 +146886,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -136502,7 +146897,8 @@ "docstring": { "type": "(sparse) array-like of shape (n_samples, n_features)", "description": "Data." - } + }, + "refined_type": {} }, { "name": "y", @@ -136512,13 +146908,14 @@ "docstring": { "type": "(sparse) array-like of shape (n_samples,) or (n_samples, n_classes)", "description": "Multi-class targets. An indicator matrix turns on multilabel\nclassification." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Fit underlying estimators.", - "docstring": "Fit underlying estimators.\n\nParameters\n----------\nX : (sparse) array-like of shape (n_samples, n_features)\n Data.\n\ny : (sparse) array-like of shape (n_samples,) or (n_samples, n_classes)\n Multi-class targets. An indicator matrix turns on multilabel\n classification.\n\nReturns\n-------\nself : object\n Instance of fitted estimator.", + "docstring": "Fit underlying estimators.\n\n Parameters\n ----------\n X : (sparse) array-like of shape (n_samples, n_features)\n Data.\n\n y : (sparse) array-like of shape (n_samples,) or (n_samples, n_classes)\n Multi-class targets. An indicator matrix turns on multilabel\n classification.\n\n Returns\n -------\n self : object\n Instance of fitted estimator.\n ", "source_code": "\ndef fit(self, X, y):\n \"\"\"Fit underlying estimators.\n\n Parameters\n ----------\n X : (sparse) array-like of shape (n_samples, n_features)\n Data.\n\n y : (sparse) array-like of shape (n_samples,) or (n_samples, n_classes)\n Multi-class targets. 
An indicator matrix turns on multilabel\n classification.\n\n Returns\n -------\n self : object\n Instance of fitted estimator.\n \"\"\"\n self.label_binarizer_ = LabelBinarizer(sparse_output=True)\n Y = self.label_binarizer_.fit_transform(y)\n Y = Y.tocsc()\n self.classes_ = self.label_binarizer_.classes_\n columns = (col.toarray().ravel() for col in Y.T)\n self.estimators_ = Parallel(n_jobs=self.n_jobs)((delayed(_fit_binary)(self.estimator, X, column, classes=['not %s' % self.label_binarizer_.classes_[i], self.label_binarizer_.classes_[i]]) for (i, column) in enumerate(columns)))\n if hasattr(self.estimators_[0], 'n_features_in_'):\n self.n_features_in_ = self.estimators_[0].n_features_in_\n if hasattr(self.estimators_[0], 'feature_names_in_'):\n self.feature_names_in_ = self.estimators_[0].feature_names_in_\n return self" }, { @@ -136539,13 +146936,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@deprecated('Attribute `intercept_` was deprecated in version 0.24 and will be removed in 1.1 (renaming of 0.26). If you observe this warning while using RFE or SelectFromModel, use the importance_getter parameter instead.')\n@property\ndef intercept_(self):\n check_is_fitted(self)\n if not hasattr(self.estimators_[0], 'intercept_'):\n raise AttributeError(\"Base estimator doesn't have an intercept_ attribute.\")\n return np.array([e.intercept_.ravel() for e in self.estimators_])" }, { @@ -136563,7 +146961,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -136587,7 +146986,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -136611,7 +147011,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -136621,7 +147022,8 @@ "docstring": { "type": "(sparse) array-like of shape (n_samples, n_features)", "description": "Data." - } + }, + "refined_type": {} }, { "name": "y", @@ -136631,7 +147033,8 @@ "docstring": { "type": "(sparse) array-like of shape (n_samples,) or (n_samples, n_classes)", "description": "Multi-class targets. An indicator matrix turns on multilabel\nclassification." - } + }, + "refined_type": {} }, { "name": "classes", @@ -136641,13 +147044,14 @@ "docstring": { "type": "array, shape (n_classes, )", "description": "Classes across all calls to partial_fit.\nCan be obtained via `np.unique(y_all)`, where y_all is the\ntarget vector of the entire dataset.\nThis argument is only required in the first call of partial_fit\nand can be omitted in the subsequent calls." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Partially fit underlying estimators.\n\nShould be used when memory is inefficient to train all data. Chunks of data can be passed in several iteration.", - "docstring": "Partially fit underlying estimators.\n\nShould be used when memory is inefficient to train all data.\nChunks of data can be passed in several iteration.\n\nParameters\n----------\nX : (sparse) array-like of shape (n_samples, n_features)\n Data.\n\ny : (sparse) array-like of shape (n_samples,) or (n_samples, n_classes)\n Multi-class targets. 
An indicator matrix turns on multilabel\n classification.\n\nclasses : array, shape (n_classes, )\n Classes across all calls to partial_fit.\n Can be obtained via `np.unique(y_all)`, where y_all is the\n target vector of the entire dataset.\n This argument is only required in the first call of partial_fit\n and can be omitted in the subsequent calls.\n\nReturns\n-------\nself : object\n Instance of partially fitted estimator.", + "description": "Partially fit underlying estimators.\n\nShould be used when memory is inefficient to train all data.\nChunks of data can be passed in several iteration.", + "docstring": "Partially fit underlying estimators.\n\n Should be used when memory is inefficient to train all data.\n Chunks of data can be passed in several iteration.\n\n Parameters\n ----------\n X : (sparse) array-like of shape (n_samples, n_features)\n Data.\n\n y : (sparse) array-like of shape (n_samples,) or (n_samples, n_classes)\n Multi-class targets. An indicator matrix turns on multilabel\n classification.\n\n classes : array, shape (n_classes, )\n Classes across all calls to partial_fit.\n Can be obtained via `np.unique(y_all)`, where y_all is the\n target vector of the entire dataset.\n This argument is only required in the first call of partial_fit\n and can be omitted in the subsequent calls.\n\n Returns\n -------\n self : object\n Instance of partially fitted estimator.\n ", "source_code": "\n@available_if(_estimators_has('partial_fit'))\ndef partial_fit(self, X, y, classes=None):\n \"\"\"Partially fit underlying estimators.\n\n Should be used when memory is inefficient to train all data.\n Chunks of data can be passed in several iteration.\n\n Parameters\n ----------\n X : (sparse) array-like of shape (n_samples, n_features)\n Data.\n\n y : (sparse) array-like of shape (n_samples,) or (n_samples, n_classes)\n Multi-class targets. An indicator matrix turns on multilabel\n classification.\n\n classes : array, shape (n_classes, )\n Classes across all calls to partial_fit.\n Can be obtained via `np.unique(y_all)`, where y_all is the\n target vector of the entire dataset.\n This argument is only required in the first call of partial_fit\n and can be omitted in the subsequent calls.\n\n Returns\n -------\n self : object\n Instance of partially fitted estimator.\n \"\"\"\n if _check_partial_fit_first_call(self, classes):\n if not hasattr(self.estimator, 'partial_fit'):\n raise ValueError(\"Base estimator {0}, doesn't have partial_fit method\".format(self.estimator))\n self.estimators_ = [clone(self.estimator) for _ in range(self.n_classes_)]\n self.label_binarizer_ = LabelBinarizer(sparse_output=True)\n self.label_binarizer_.fit(self.classes_)\n if len(np.setdiff1d(y, self.classes_)):\n raise ValueError(('Mini-batch contains {0} while classes ' + 'must be subset of {1}').format(np.unique(y), self.classes_))\n Y = self.label_binarizer_.transform(y)\n Y = Y.tocsc()\n columns = (col.toarray().ravel() for col in Y.T)\n self.estimators_ = Parallel(n_jobs=self.n_jobs)((delayed(_partial_fit_binary)(estimator, X, column) for (estimator, column) in zip(self.estimators_, columns)))\n if hasattr(self.estimators_[0], 'n_features_in_'):\n self.n_features_in_ = self.estimators_[0].n_features_in_\n return self" }, { @@ -136665,7 +147069,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -136675,13 +147080,14 @@ "docstring": { "type": "(sparse) array-like of shape (n_samples, n_features)", "description": "Data." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Predict multi-class targets using underlying estimators.", - "docstring": "Predict multi-class targets using underlying estimators.\n\nParameters\n----------\nX : (sparse) array-like of shape (n_samples, n_features)\n Data.\n\nReturns\n-------\ny : (sparse) array-like of shape (n_samples,) or (n_samples, n_classes)\n Predicted multi-class targets.", + "docstring": "Predict multi-class targets using underlying estimators.\n\n Parameters\n ----------\n X : (sparse) array-like of shape (n_samples, n_features)\n Data.\n\n Returns\n -------\n y : (sparse) array-like of shape (n_samples,) or (n_samples, n_classes)\n Predicted multi-class targets.\n ", "source_code": "\ndef predict(self, X):\n \"\"\"Predict multi-class targets using underlying estimators.\n\n Parameters\n ----------\n X : (sparse) array-like of shape (n_samples, n_features)\n Data.\n\n Returns\n -------\n y : (sparse) array-like of shape (n_samples,) or (n_samples, n_classes)\n Predicted multi-class targets.\n \"\"\"\n check_is_fitted(self)\n n_samples = _num_samples(X)\n if self.label_binarizer_.y_type_ == 'multiclass':\n maxima = np.empty(n_samples, dtype=float)\n maxima.fill(-np.inf)\n argmaxima = np.zeros(n_samples, dtype=int)\n for (i, e) in enumerate(self.estimators_):\n pred = _predict_binary(e, X)\n np.maximum(maxima, pred, out=maxima)\n argmaxima[maxima == pred] = i\n return self.classes_[argmaxima]\n else:\n if hasattr(self.estimators_[0], 'decision_function') and is_classifier(self.estimators_[0]):\n thresh = 0\n else:\n thresh = 0.5\n indices = array.array('i')\n indptr = array.array('i', [0])\n for e in self.estimators_:\n indices.extend(np.where(_predict_binary(e, X) > thresh)[0])\n indptr.append(len(indices))\n data = np.ones(len(indices), dtype=int)\n indicator = sp.csc_matrix((data, indices, indptr), shape=(n_samples, len(self.estimators_)))\n return self.label_binarizer_.inverse_transform(indicator)" }, { @@ -136699,7 +147105,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -136709,13 +147116,14 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "Input data." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Probability estimates.\n\nThe returned estimates for all classes are ordered by label of classes. Note that in the multilabel case, each sample can have any number of labels. This returns the marginal probability that the given sample has the label in question. For example, it is entirely consistent that two labels both have a 90% probability of applying to a given sample. In the single label multiclass case, the rows of the returned matrix sum to 1.", - "docstring": "Probability estimates.\n\nThe returned estimates for all classes are ordered by label of classes.\n\nNote that in the multilabel case, each sample can have any number of\nlabels. This returns the marginal probability that the given sample has\nthe label in question. 
For example, it is entirely consistent that two\nlabels both have a 90% probability of applying to a given sample.\n\nIn the single label multiclass case, the rows of the returned matrix\nsum to 1.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n Input data.\n\nReturns\n-------\nT : (sparse) array-like of shape (n_samples, n_classes)\n Returns the probability of the sample for each class in the model,\n where classes are ordered as they are in `self.classes_`.", + "description": "Probability estimates.\n\nThe returned estimates for all classes are ordered by label of classes.\n\nNote that in the multilabel case, each sample can have any number of\nlabels. This returns the marginal probability that the given sample has\nthe label in question. For example, it is entirely consistent that two\nlabels both have a 90% probability of applying to a given sample.\n\nIn the single label multiclass case, the rows of the returned matrix\nsum to 1.", + "docstring": "Probability estimates.\n\n The returned estimates for all classes are ordered by label of classes.\n\n Note that in the multilabel case, each sample can have any number of\n labels. This returns the marginal probability that the given sample has\n the label in question. For example, it is entirely consistent that two\n labels both have a 90% probability of applying to a given sample.\n\n In the single label multiclass case, the rows of the returned matrix\n sum to 1.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Input data.\n\n Returns\n -------\n T : (sparse) array-like of shape (n_samples, n_classes)\n Returns the probability of the sample for each class in the model,\n where classes are ordered as they are in `self.classes_`.\n ", "source_code": "\n@available_if(_estimators_has('predict_proba'))\ndef predict_proba(self, X):\n \"\"\"Probability estimates.\n\n The returned estimates for all classes are ordered by label of classes.\n\n Note that in the multilabel case, each sample can have any number of\n labels. This returns the marginal probability that the given sample has\n the label in question. For example, it is entirely consistent that two\n labels both have a 90% probability of applying to a given sample.\n\n In the single label multiclass case, the rows of the returned matrix\n sum to 1.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Input data.\n\n Returns\n -------\n T : (sparse) array-like of shape (n_samples, n_classes)\n Returns the probability of the sample for each class in the model,\n where classes are ordered as they are in `self.classes_`.\n \"\"\"\n check_is_fitted(self)\n Y = np.array([e.predict_proba(X)[:, 1] for e in self.estimators_]).T\n if len(self.estimators_) == 1:\n Y = np.concatenate((1 - Y, Y), axis=1)\n if not self.multilabel_:\n Y /= np.sum(Y, axis=1)[:, np.newaxis]\n return Y" }, { @@ -136733,7 +147141,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "estimator", @@ -136743,7 +147152,8 @@ "docstring": { "type": "estimator object", "description": "An estimator object implementing :term:`fit` and one of\n:term:`decision_function` or :term:`predict_proba`." 
- } + }, + "refined_type": {} }, { "name": "code_size", @@ -136751,9 +147161,10 @@ "is_public": true, "assigned_by": "NAME_ONLY", "docstring": { - "type": "float", + "type": "float, default=1.5", "description": "Percentage of the number of classes to be used to create the code book.\nA number between 0 and 1 will require fewer classifiers than\none-vs-the-rest. A number greater than 1 will require more classifiers\nthan one-vs-the-rest." - } + }, + "refined_type": {} }, { "name": "random_state", @@ -136763,7 +147174,8 @@ "docstring": { "type": "int, RandomState instance, default=None", "description": "The generator used to initialize the codebook.\nPass an int for reproducible output across multiple function calls.\nSee :term:`Glossary `." - } + }, + "refined_type": {} }, { "name": "n_jobs", @@ -136773,13 +147185,14 @@ "docstring": { "type": "int, default=None", "description": "The number of jobs to use for the computation: the multiclass problems\nare computed in parallel.\n\n``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n``-1`` means using all processors. See :term:`Glossary `\nfor more details." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, estimator, *, code_size=1.5, random_state=None, n_jobs=None):\n self.estimator = estimator\n self.code_size = code_size\n self.random_state = random_state\n self.n_jobs = n_jobs" }, { @@ -136797,7 +147210,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -136807,7 +147221,8 @@ "docstring": { "type": "(sparse) array-like of shape (n_samples, n_features)", "description": "Data." - } + }, + "refined_type": {} }, { "name": "y", @@ -136817,13 +147232,14 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "Multi-class targets." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Fit underlying estimators.", - "docstring": "Fit underlying estimators.\n\nParameters\n----------\nX : (sparse) array-like of shape (n_samples, n_features)\n Data.\n\ny : array-like of shape (n_samples,)\n Multi-class targets.\n\nReturns\n-------\nself : object\n Returns a fitted instance of self.", + "docstring": "Fit underlying estimators.\n\n Parameters\n ----------\n X : (sparse) array-like of shape (n_samples, n_features)\n Data.\n\n y : array-like of shape (n_samples,)\n Multi-class targets.\n\n Returns\n -------\n self : object\n Returns a fitted instance of self.\n ", "source_code": "\ndef fit(self, X, y):\n \"\"\"Fit underlying estimators.\n\n Parameters\n ----------\n X : (sparse) array-like of shape (n_samples, n_features)\n Data.\n\n y : array-like of shape (n_samples,)\n Multi-class targets.\n\n Returns\n -------\n self : object\n Returns a fitted instance of self.\n \"\"\"\n y = self._validate_data(X='no_validation', y=y)\n if self.code_size <= 0:\n raise ValueError('code_size should be greater than 0, got {0}'.format(self.code_size))\n _check_estimator(self.estimator)\n random_state = check_random_state(self.random_state)\n check_classification_targets(y)\n self.classes_ = np.unique(y)\n n_classes = self.classes_.shape[0]\n if n_classes == 0:\n raise ValueError('OutputCodeClassifier can not be fit when no class is present.')\n code_size_ = int(n_classes * self.code_size)\n self.code_book_ = random_state.random_sample((n_classes, code_size_))\n self.code_book_[self.code_book_ > 0.5] = 1\n if hasattr(self.estimator, 'decision_function'):\n self.code_book_[self.code_book_ != 1] = -1\n else:\n self.code_book_[self.code_book_ != 1] = 0\n classes_index = {c: i for (i, c) in enumerate(self.classes_)}\n Y = np.array([self.code_book_[classes_index[y[i]]] for i in range(_num_samples(y))], dtype=int)\n self.estimators_ = Parallel(n_jobs=self.n_jobs)((delayed(_fit_binary)(self.estimator, X, Y[:, i]) for i in range(Y.shape[1])))\n if hasattr(self.estimators_[0], 'n_features_in_'):\n self.n_features_in_ = self.estimators_[0].n_features_in_\n if hasattr(self.estimators_[0], 'feature_names_in_'):\n self.feature_names_in_ = self.estimators_[0].feature_names_in_\n return self" }, { @@ -136841,7 +147257,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -136851,13 +147268,14 @@ "docstring": { "type": "(sparse) array-like of shape (n_samples, n_features)", "description": "Data." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Predict multi-class targets using underlying estimators.", - "docstring": "Predict multi-class targets using underlying estimators.\n\nParameters\n----------\nX : (sparse) array-like of shape (n_samples, n_features)\n Data.\n\nReturns\n-------\ny : ndarray of shape (n_samples,)\n Predicted multi-class targets.", + "docstring": "Predict multi-class targets using underlying estimators.\n\n Parameters\n ----------\n X : (sparse) array-like of shape (n_samples, n_features)\n Data.\n\n Returns\n -------\n y : ndarray of shape (n_samples,)\n Predicted multi-class targets.\n ", "source_code": "\ndef predict(self, X):\n \"\"\"Predict multi-class targets using underlying estimators.\n\n Parameters\n ----------\n X : (sparse) array-like of shape (n_samples, n_features)\n Data.\n\n Returns\n -------\n y : ndarray of shape (n_samples,)\n Predicted multi-class targets.\n \"\"\"\n check_is_fitted(self)\n Y = np.array([_predict_binary(e, X) for e in self.estimators_]).T\n pred = euclidean_distances(Y, self.code_book_).argmin(axis=1)\n return self.classes_[pred]" }, { @@ -136875,7 +147293,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -136885,13 +147304,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef decision_function(self, X):\n check_is_fitted(self)\n self._validate_data(X, force_all_finite=False, dtype=None, accept_sparse=True, ensure_2d=False, reset=False)\n return np.repeat(self.y_, _num_samples(X))" }, { @@ -136909,7 +147329,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -136919,7 +147340,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -136929,13 +147351,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef fit(self, X, y):\n check_params = dict(force_all_finite=False, dtype=None, ensure_2d=False, accept_sparse=True)\n self._validate_data(X, y, reset=True, validate_separately=(check_params, check_params))\n self.y_ = y\n return self" }, { @@ -136953,7 +147376,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -136963,13 +147387,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef predict(self, X):\n check_is_fitted(self)\n self._validate_data(X, force_all_finite=False, dtype=None, accept_sparse=True, ensure_2d=False, reset=False)\n return np.repeat(self.y_, _num_samples(X))" }, { @@ -136987,7 +147412,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -136997,13 +147423,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef predict_proba(self, X):\n check_is_fitted(self)\n self._validate_data(X, force_all_finite=False, dtype=None, accept_sparse=True, ensure_2d=False, reset=False)\n return np.repeat([np.hstack([1 - self.y_, self.y_])], _num_samples(X), axis=0)" }, { @@ 
-137021,7 +147448,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -137045,13 +147473,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Check if self.estimator or self.estimators_[0] has attr.\n\nIf `self.estimators_[0]` has the attr, then its safe to assume that other values has it too. This function is used together with `avaliable_if`.", - "docstring": "Check if self.estimator or self.estimators_[0] has attr.\n\nIf `self.estimators_[0]` has the attr, then its safe to assume that other\nvalues has it too. This function is used together with `avaliable_if`.", + "description": "Check if self.estimator or self.estimators_[0] has attr.\n\nIf `self.estimators_[0]` has the attr, then its safe to assume that other\nvalues has it too. This function is used together with `avaliable_if`.", + "docstring": "Check if self.estimator or self.estimators_[0] has attr.\n\n If `self.estimators_[0]` has the attr, then its safe to assume that other\n values has it too. This function is used together with `avaliable_if`.\n ", "source_code": "\ndef _estimators_has(attr):\n \"\"\"Check if self.estimator or self.estimators_[0] has attr.\n\n If `self.estimators_[0]` has the attr, then its safe to assume that other\n values has it too. This function is used together with `avaliable_if`.\n \"\"\"\n return lambda self: hasattr(self.estimator, attr) or hasattr(self, 'estimators_') and hasattr(self.estimators_[0], attr)" }, { @@ -137069,7 +147498,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -137079,7 +147509,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -137089,7 +147520,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "classes", @@ -137099,7 +147531,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -137123,7 +147556,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -137133,7 +147567,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -137143,7 +147578,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "i", @@ -137153,7 +147589,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "j", @@ -137163,7 +147600,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -137187,7 +147625,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -137197,7 +147636,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -137207,7 +147647,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -137231,7 +147672,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -137241,7 +147683,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -137251,7 +147694,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "i", @@ -137261,7 +147705,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "j", @@ -137271,7 +147716,8 @@ "docstring": { "type": "", "description": "" - } 
+ }, + "refined_type": {} } ], "results": [], @@ -137295,7 +147741,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -137305,7 +147752,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -137329,13 +147777,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _more_tags(self):\n return {'_skip_test': True, 'multioutput_only': True}" }, { @@ -137355,7 +147804,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -137365,13 +147815,14 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "The input data." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Evaluate the decision_function of the models in the chain.", - "docstring": "Evaluate the decision_function of the models in the chain.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n The input data.\n\nReturns\n-------\nY_decision : array-like of shape (n_samples, n_classes)\n Returns the decision function of the sample for each model\n in the chain.", + "docstring": "Evaluate the decision_function of the models in the chain.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The input data.\n\n Returns\n -------\n Y_decision : array-like of shape (n_samples, n_classes)\n Returns the decision function of the sample for each model\n in the chain.\n ", "source_code": "\n@_available_if_base_estimator_has('decision_function')\ndef decision_function(self, X):\n \"\"\"Evaluate the decision_function of the models in the chain.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The input data.\n\n Returns\n -------\n Y_decision : array-like of shape (n_samples, n_classes)\n Returns the decision function of the sample for each model\n in the chain.\n \"\"\"\n X = self._validate_data(X, accept_sparse=True, reset=False)\n Y_decision_chain = np.zeros((X.shape[0], len(self.estimators_)))\n Y_pred_chain = np.zeros((X.shape[0], len(self.estimators_)))\n for (chain_idx, estimator) in enumerate(self.estimators_):\n previous_predictions = Y_pred_chain[:, :chain_idx]\n if sp.issparse(X):\n X_aug = sp.hstack((X, previous_predictions))\n else:\n X_aug = np.hstack((X, previous_predictions))\n Y_decision_chain[:, chain_idx] = estimator.decision_function(X_aug)\n Y_pred_chain[:, chain_idx] = estimator.predict(X_aug)\n inv_order = np.empty_like(self.order_)\n inv_order[self.order_] = np.arange(len(self.order_))\n Y_decision = Y_decision_chain[:, inv_order]\n return Y_decision" }, { @@ -137389,7 +147840,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -137399,6 +147851,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "The input data." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -137409,13 +147865,14 @@ "docstring": { "type": "array-like of shape (n_samples, n_classes)", "description": "The target values." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Fit the model to data matrix X and targets Y.", - "docstring": "Fit the model to data matrix X and targets Y.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input data.\n\nY : array-like of shape (n_samples, n_classes)\n The target values.\n\nReturns\n-------\nself : object\n Class instance.", + "docstring": "Fit the model to data matrix X and targets Y.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input data.\n\n Y : array-like of shape (n_samples, n_classes)\n The target values.\n\n Returns\n -------\n self : object\n Class instance.\n ", "source_code": "\ndef fit(self, X, Y):\n \"\"\"Fit the model to data matrix X and targets Y.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input data.\n\n Y : array-like of shape (n_samples, n_classes)\n The target values.\n\n Returns\n -------\n self : object\n Class instance.\n \"\"\"\n super().fit(X, Y)\n self.classes_ = [estimator.classes_ for (chain_idx, estimator) in enumerate(self.estimators_)]\n return self" }, { @@ -137433,7 +147890,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -137443,13 +147901,17 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "The input data." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } } ], "results": [], "is_public": true, "description": "Predict probability estimates.", - "docstring": "Predict probability estimates.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input data.\n\nReturns\n-------\nY_prob : array-like of shape (n_samples, n_classes)\n The predicted probabilities.", + "docstring": "Predict probability estimates.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input data.\n\n Returns\n -------\n Y_prob : array-like of shape (n_samples, n_classes)\n The predicted probabilities.\n ", "source_code": "\n@_available_if_base_estimator_has('predict_proba')\ndef predict_proba(self, X):\n \"\"\"Predict probability estimates.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input data.\n\n Returns\n -------\n Y_prob : array-like of shape (n_samples, n_classes)\n The predicted probabilities.\n \"\"\"\n X = self._validate_data(X, accept_sparse=True, reset=False)\n Y_prob_chain = np.zeros((X.shape[0], len(self.estimators_)))\n Y_pred_chain = np.zeros((X.shape[0], len(self.estimators_)))\n for (chain_idx, estimator) in enumerate(self.estimators_):\n previous_predictions = Y_pred_chain[:, :chain_idx]\n if sp.issparse(X):\n X_aug = sp.hstack((X, previous_predictions))\n else:\n X_aug = np.hstack((X, previous_predictions))\n Y_prob_chain[:, chain_idx] = estimator.predict_proba(X_aug)[:, 1]\n Y_pred_chain[:, chain_idx] = estimator.predict(X_aug)\n inv_order = np.empty_like(self.order_)\n inv_order[self.order_] = np.arange(len(self.order_))\n Y_prob = Y_prob_chain[:, inv_order]\n return Y_prob" }, { @@ -137467,7 +147929,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "estimator", @@ -137477,7 +147940,8 @@ "docstring": { "type": "estimator object", "description": "An estimator object implementing :term:`fit`, :term:`score` and\n:term:`predict_proba`." 
- } + }, + "refined_type": {} }, { "name": "n_jobs", @@ -137487,13 +147951,14 @@ "docstring": { "type": "int or None, optional (default=None)", "description": "The number of jobs to run in parallel.\n:meth:`fit`, :meth:`predict` and :meth:`partial_fit` (if supported\nby the passed estimator) will be parallelized for each target.\n\nWhen individual estimators are fast to train or predict,\nusing ``n_jobs > 1`` can result in slower performance due\nto the parallelism overhead.\n\n``None`` means `1` unless in a :obj:`joblib.parallel_backend` context.\n``-1`` means using all available processes / threads.\nSee :term:`Glossary ` for more details.\n\n.. versionchanged:: 0.20\n `n_jobs` default changed from `1` to `None`." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, estimator, *, n_jobs=None):\n super().__init__(estimator, n_jobs=n_jobs)" }, { @@ -137511,13 +147976,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _check_predict_proba(self):\n if hasattr(self, 'estimators_'):\n [getattr(est, 'predict_proba') for est in self.estimators_]\n return True\n getattr(self.estimator, 'predict_proba')\n return True" }, { @@ -137535,13 +148001,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _more_tags(self):\n return {'_skip_test': True}" }, { @@ -137559,7 +148026,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -137569,6 +148037,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "The input data." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -137579,7 +148051,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_classes)", "description": "The target values." - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -137589,13 +148062,14 @@ "docstring": { "type": "array-like of shape (n_samples,), default=None", "description": "Sample weights. If `None`, then samples are equally weighted.\nOnly supported if the underlying classifier supports sample\nweights." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Fit the model to data matrix X and targets Y.", - "docstring": "Fit the model to data matrix X and targets Y.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input data.\n\nY : array-like of shape (n_samples, n_classes)\n The target values.\n\nsample_weight : array-like of shape (n_samples,), default=None\n Sample weights. If `None`, then samples are equally weighted.\n Only supported if the underlying classifier supports sample\n weights.\n\n**fit_params : dict of string -> object\n Parameters passed to the ``estimator.fit`` method of each step.\n\n .. 
versionadded:: 0.23\n\nReturns\n-------\nself : object\n Returns a fitted instance.", + "docstring": "Fit the model to data matrix X and targets Y.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input data.\n\n Y : array-like of shape (n_samples, n_classes)\n The target values.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights. If `None`, then samples are equally weighted.\n Only supported if the underlying classifier supports sample\n weights.\n\n **fit_params : dict of string -> object\n Parameters passed to the ``estimator.fit`` method of each step.\n\n .. versionadded:: 0.23\n\n Returns\n -------\n self : object\n Returns a fitted instance.\n ", "source_code": "\ndef fit(self, X, Y, sample_weight=None, **fit_params):\n \"\"\"Fit the model to data matrix X and targets Y.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input data.\n\n Y : array-like of shape (n_samples, n_classes)\n The target values.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights. If `None`, then samples are equally weighted.\n Only supported if the underlying classifier supports sample\n weights.\n\n **fit_params : dict of string -> object\n Parameters passed to the ``estimator.fit`` method of each step.\n\n .. versionadded:: 0.23\n\n Returns\n -------\n self : object\n Returns a fitted instance.\n \"\"\"\n super().fit(X, Y, sample_weight, **fit_params)\n self.classes_ = [estimator.classes_ for estimator in self.estimators_]\n return self" }, { @@ -137613,7 +148087,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -137623,13 +148098,14 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "The input data." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Return prediction probabilities for each class of each output.\n\nThis method will raise a ``ValueError`` if any of the estimators do not have ``predict_proba``.", - "docstring": "Return prediction probabilities for each class of each output.\n\nThis method will raise a ``ValueError`` if any of the\nestimators do not have ``predict_proba``.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n The input data.\n\nReturns\n-------\np : array of shape (n_samples, n_classes), or a list of n_outputs such arrays if n_outputs > 1.\n The class probabilities of the input samples. The order of the\n classes corresponds to that in the attribute :term:`classes_`.\n\n .. versionchanged:: 0.19\n This function now returns a list of arrays where the length of\n the list is ``n_outputs``, and each array is (``n_samples``,\n ``n_classes``) for that particular output.", + "description": "Return prediction probabilities for each class of each output.\n\nThis method will raise a ``ValueError`` if any of the\nestimators do not have ``predict_proba``.", + "docstring": "Return prediction probabilities for each class of each output.\n\n This method will raise a ``ValueError`` if any of the\n estimators do not have ``predict_proba``.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The input data.\n\n Returns\n -------\n p : array of shape (n_samples, n_classes), or a list of n_outputs such arrays if n_outputs > 1.\n The class probabilities of the input samples. 
The order of the\n classes corresponds to that in the attribute :term:`classes_`.\n\n .. versionchanged:: 0.19\n This function now returns a list of arrays where the length of\n the list is ``n_outputs``, and each array is (``n_samples``,\n ``n_classes``) for that particular output.\n ", "source_code": "\n@available_if(_check_predict_proba)\ndef predict_proba(self, X):\n \"\"\"Return prediction probabilities for each class of each output.\n\n This method will raise a ``ValueError`` if any of the\n estimators do not have ``predict_proba``.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The input data.\n\n Returns\n -------\n p : array of shape (n_samples, n_classes), or a list of n_outputs such arrays if n_outputs > 1.\n The class probabilities of the input samples. The order of the\n classes corresponds to that in the attribute :term:`classes_`.\n\n .. versionchanged:: 0.19\n This function now returns a list of arrays where the length of\n the list is ``n_outputs``, and each array is (``n_samples``,\n ``n_classes``) for that particular output.\n \"\"\"\n check_is_fitted(self)\n results = [estimator.predict_proba(X) for estimator in self.estimators_]\n return results" }, { @@ -137647,7 +148123,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -137657,7 +148134,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "Test samples." - } + }, + "refined_type": {} }, { "name": "y", @@ -137667,13 +148145,14 @@ "docstring": { "type": "array-like of shape (n_samples, n_outputs)", "description": "True values for X." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Return the mean accuracy on the given test data and labels.", - "docstring": "Return the mean accuracy on the given test data and labels.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n Test samples.\n\ny : array-like of shape (n_samples, n_outputs)\n True values for X.\n\nReturns\n-------\nscores : float\n Mean accuracy of predicted target versus true target.", + "docstring": "Return the mean accuracy on the given test data and labels.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Test samples.\n\n y : array-like of shape (n_samples, n_outputs)\n True values for X.\n\n Returns\n -------\n scores : float\n Mean accuracy of predicted target versus true target.\n ", "source_code": "\ndef score(self, X, y):\n \"\"\"Return the mean accuracy on the given test data and labels.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Test samples.\n\n y : array-like of shape (n_samples, n_outputs)\n True values for X.\n\n Returns\n -------\n scores : float\n Mean accuracy of predicted target versus true target.\n \"\"\"\n check_is_fitted(self)\n n_outputs_ = len(self.estimators_)\n if y.ndim == 1:\n raise ValueError('y must have at least two dimensions for multi target classification but has only one')\n if y.shape[1] != n_outputs_:\n raise ValueError('The number of outputs of Y for fit {0} and score {1} should be same'.format(n_outputs_, y.shape[1]))\n y_pred = self.predict(X)\n return np.mean(np.all(y == y_pred, axis=1))" }, { @@ -137691,7 +148170,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "estimator", @@ -137701,7 +148181,8 @@ "docstring": { "type": "estimator object", "description": "An estimator object implementing :term:`fit` and :term:`predict`." 
- } + }, + "refined_type": {} }, { "name": "n_jobs", @@ -137711,13 +148192,14 @@ "docstring": { "type": "int or None, optional (default=None)", "description": "The number of jobs to run in parallel.\n:meth:`fit`, :meth:`predict` and :meth:`partial_fit` (if supported\nby the passed estimator) will be parallelized for each target.\n\nWhen individual estimators are fast to train or predict,\nusing ``n_jobs > 1`` can result in slower performance due\nto the parallelism overhead.\n\n``None`` means `1` unless in a :obj:`joblib.parallel_backend` context.\n``-1`` means using all available processes / threads.\nSee :term:`Glossary ` for more details.\n\n.. versionchanged:: 0.20\n `n_jobs` default changed from `1` to `None`." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, estimator, *, n_jobs=None):\n super().__init__(estimator, n_jobs=n_jobs)" }, { @@ -137735,7 +148217,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -137745,6 +148228,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "The input data." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -137755,6 +148242,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_outputs)", "description": "Multi-output targets." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -137765,13 +148256,14 @@ "docstring": { "type": "array-like of shape (n_samples,), default=None", "description": "Sample weights. If `None`, then samples are equally weighted.\nOnly supported if the underlying regressor supports sample\nweights." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Incrementally fit the model to data, for each output variable.", - "docstring": "Incrementally fit the model to data, for each output variable.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input data.\n\ny : {array-like, sparse matrix} of shape (n_samples, n_outputs)\n Multi-output targets.\n\nsample_weight : array-like of shape (n_samples,), default=None\n Sample weights. If `None`, then samples are equally weighted.\n Only supported if the underlying regressor supports sample\n weights.\n\nReturns\n-------\nself : object\n Returns a fitted instance.", + "docstring": "Incrementally fit the model to data, for each output variable.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input data.\n\n y : {array-like, sparse matrix} of shape (n_samples, n_outputs)\n Multi-output targets.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights. If `None`, then samples are equally weighted.\n Only supported if the underlying regressor supports sample\n weights.\n\n Returns\n -------\n self : object\n Returns a fitted instance.\n ", "source_code": "\n@_available_if_estimator_has('partial_fit')\ndef partial_fit(self, X, y, sample_weight=None):\n \"\"\"Incrementally fit the model to data, for each output variable.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input data.\n\n y : {array-like, sparse matrix} of shape (n_samples, n_outputs)\n Multi-output targets.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights. 
If `None`, then samples are equally weighted.\n Only supported if the underlying regressor supports sample\n weights.\n\n Returns\n -------\n self : object\n Returns a fitted instance.\n \"\"\"\n super().partial_fit(X, y, sample_weight=sample_weight)" }, { @@ -137789,13 +148281,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _more_tags(self):\n return {'multioutput_only': True}" }, { @@ -137813,7 +148306,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -137823,6 +148317,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "The input data." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -137833,13 +148331,14 @@ "docstring": { "type": "array-like of shape (n_samples, n_classes)", "description": "The target values." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Fit the model to data matrix X and targets Y.", - "docstring": "Fit the model to data matrix X and targets Y.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input data.\n\nY : array-like of shape (n_samples, n_classes)\n The target values.\n\n**fit_params : dict of string -> object\n Parameters passed to the `fit` method at each step\n of the regressor chain.\n\n .. versionadded:: 0.23\n\nReturns\n-------\nself : object\n Returns a fitted instance.", + "docstring": "Fit the model to data matrix X and targets Y.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input data.\n\n Y : array-like of shape (n_samples, n_classes)\n The target values.\n\n **fit_params : dict of string -> object\n Parameters passed to the `fit` method at each step\n of the regressor chain.\n\n .. versionadded:: 0.23\n\n Returns\n -------\n self : object\n Returns a fitted instance.\n ", "source_code": "\ndef fit(self, X, Y, **fit_params):\n \"\"\"Fit the model to data matrix X and targets Y.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input data.\n\n Y : array-like of shape (n_samples, n_classes)\n The target values.\n\n **fit_params : dict of string -> object\n Parameters passed to the `fit` method at each step\n of the regressor chain.\n\n .. 
versionadded:: 0.23\n\n Returns\n -------\n self : object\n Returns a fitted instance.\n \"\"\"\n super().fit(X, Y, **fit_params)\n return self" }, { @@ -137857,7 +148356,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "base_estimator", @@ -137867,7 +148367,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "order", @@ -137877,7 +148378,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "cv", @@ -137887,7 +148389,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "random_state", @@ -137897,13 +148400,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, base_estimator, *, order=None, cv=None, random_state=None):\n self.base_estimator = base_estimator\n self.order = order\n self.cv = cv\n self.random_state = random_state" }, { @@ -137921,7 +148425,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -137931,6 +148436,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "The input data." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -137941,13 +148450,14 @@ "docstring": { "type": "array-like of shape (n_samples, n_classes)", "description": "The target values." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Fit the model to data matrix X and targets Y.", - "docstring": "Fit the model to data matrix X and targets Y.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input data.\n\nY : array-like of shape (n_samples, n_classes)\n The target values.\n\n**fit_params : dict of string -> object\n Parameters passed to the `fit` method of each step.\n\n .. versionadded:: 0.23\n\nReturns\n-------\nself : object\n Returns a fitted instance.", + "docstring": "Fit the model to data matrix X and targets Y.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input data.\n\n Y : array-like of shape (n_samples, n_classes)\n The target values.\n\n **fit_params : dict of string -> object\n Parameters passed to the `fit` method of each step.\n\n .. versionadded:: 0.23\n\n Returns\n -------\n self : object\n Returns a fitted instance.\n ", "source_code": "\n@abstractmethod\ndef fit(self, X, Y, **fit_params):\n \"\"\"Fit the model to data matrix X and targets Y.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input data.\n\n Y : array-like of shape (n_samples, n_classes)\n The target values.\n\n **fit_params : dict of string -> object\n Parameters passed to the `fit` method of each step.\n\n .. 
versionadded:: 0.23\n\n Returns\n -------\n self : object\n Returns a fitted instance.\n \"\"\"\n (X, Y) = self._validate_data(X, Y, multi_output=True, accept_sparse=True)\n random_state = check_random_state(self.random_state)\n self.order_ = self.order\n if isinstance(self.order_, tuple):\n self.order_ = np.array(self.order_)\n if self.order_ is None:\n self.order_ = np.array(range(Y.shape[1]))\n elif isinstance(self.order_, str):\n if self.order_ == 'random':\n self.order_ = random_state.permutation(Y.shape[1])\n elif sorted(self.order_) != list(range(Y.shape[1])):\n raise ValueError('invalid order')\n self.estimators_ = [clone(self.base_estimator) for _ in range(Y.shape[1])]\n if self.cv is None:\n Y_pred_chain = Y[:, self.order_]\n if sp.issparse(X):\n X_aug = sp.hstack((X, Y_pred_chain), format='lil')\n X_aug = X_aug.tocsr()\n else:\n X_aug = np.hstack((X, Y_pred_chain))\n elif sp.issparse(X):\n Y_pred_chain = sp.lil_matrix((X.shape[0], Y.shape[1]))\n X_aug = sp.hstack((X, Y_pred_chain), format='lil')\n else:\n Y_pred_chain = np.zeros((X.shape[0], Y.shape[1]))\n X_aug = np.hstack((X, Y_pred_chain))\n del Y_pred_chain\n for (chain_idx, estimator) in enumerate(self.estimators_):\n y = Y[:, self.order_[chain_idx]]\n estimator.fit(X_aug[:, :X.shape[1] + chain_idx], y, **fit_params)\n if self.cv is not None and chain_idx < len(self.estimators_) - 1:\n col_idx = X.shape[1] + chain_idx\n cv_result = cross_val_predict(self.base_estimator, X_aug[:, :col_idx], y=y, cv=self.cv)\n if sp.issparse(X_aug):\n X_aug[:, col_idx] = np.expand_dims(cv_result, 1)\n else:\n X_aug[:, col_idx] = cv_result\n return self" }, { @@ -137965,7 +148475,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -137975,13 +148486,17 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "The input data." 
+ }, + "refined_type": { + "kind": "EnumType", + "values": [] } } ], "results": [], "is_public": false, "description": "Predict on the data matrix X using the ClassifierChain model.", - "docstring": "Predict on the data matrix X using the ClassifierChain model.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input data.\n\nReturns\n-------\nY_pred : array-like of shape (n_samples, n_classes)\n The predicted values.", + "docstring": "Predict on the data matrix X using the ClassifierChain model.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input data.\n\n Returns\n -------\n Y_pred : array-like of shape (n_samples, n_classes)\n The predicted values.\n ", "source_code": "\ndef predict(self, X):\n \"\"\"Predict on the data matrix X using the ClassifierChain model.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input data.\n\n Returns\n -------\n Y_pred : array-like of shape (n_samples, n_classes)\n The predicted values.\n \"\"\"\n check_is_fitted(self)\n X = self._validate_data(X, accept_sparse=True, reset=False)\n Y_pred_chain = np.zeros((X.shape[0], len(self.estimators_)))\n for (chain_idx, estimator) in enumerate(self.estimators_):\n previous_predictions = Y_pred_chain[:, :chain_idx]\n if sp.issparse(X):\n if chain_idx == 0:\n X_aug = X\n else:\n X_aug = sp.hstack((X, previous_predictions))\n else:\n X_aug = np.hstack((X, previous_predictions))\n Y_pred_chain[:, chain_idx] = estimator.predict(X_aug)\n inv_order = np.empty_like(self.order_)\n inv_order[self.order_] = np.arange(len(self.order_))\n Y_pred = Y_pred_chain[:, inv_order]\n return Y_pred" }, { @@ -137999,7 +148514,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "estimator", @@ -138009,7 +148525,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_jobs", @@ -138019,13 +148536,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@abstractmethod\ndef __init__(self, estimator, *, n_jobs=None):\n self.estimator = estimator\n self.n_jobs = n_jobs" }, { @@ -138043,13 +148561,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _more_tags(self):\n return {'multioutput_only': True}" }, { @@ -138067,7 +148586,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -138077,6 +148597,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "The input data." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -138087,6 +148611,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_outputs)", "description": "Multi-output targets. An indicator matrix turns on multilabel\nestimation." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -138097,13 +148625,14 @@ "docstring": { "type": "array-like of shape (n_samples,), default=None", "description": "Sample weights. If `None`, then samples are equally weighted.\nOnly supported if the underlying regressor supports sample\nweights." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Fit the model to data, separately for each output variable.", - "docstring": "Fit the model to data, separately for each output variable.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input data.\n\ny : {array-like, sparse matrix} of shape (n_samples, n_outputs)\n Multi-output targets. An indicator matrix turns on multilabel\n estimation.\n\nsample_weight : array-like of shape (n_samples,), default=None\n Sample weights. If `None`, then samples are equally weighted.\n Only supported if the underlying regressor supports sample\n weights.\n\n**fit_params : dict of string -> object\n Parameters passed to the ``estimator.fit`` method of each step.\n\n .. versionadded:: 0.23\n\nReturns\n-------\nself : object\n Returns a fitted instance.", + "docstring": "Fit the model to data, separately for each output variable.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input data.\n\n y : {array-like, sparse matrix} of shape (n_samples, n_outputs)\n Multi-output targets. An indicator matrix turns on multilabel\n estimation.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights. If `None`, then samples are equally weighted.\n Only supported if the underlying regressor supports sample\n weights.\n\n **fit_params : dict of string -> object\n Parameters passed to the ``estimator.fit`` method of each step.\n\n .. versionadded:: 0.23\n\n Returns\n -------\n self : object\n Returns a fitted instance.\n ", "source_code": "\ndef fit(self, X, y, sample_weight=None, **fit_params):\n \"\"\"Fit the model to data, separately for each output variable.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input data.\n\n y : {array-like, sparse matrix} of shape (n_samples, n_outputs)\n Multi-output targets. An indicator matrix turns on multilabel\n estimation.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights. If `None`, then samples are equally weighted.\n Only supported if the underlying regressor supports sample\n weights.\n\n **fit_params : dict of string -> object\n Parameters passed to the ``estimator.fit`` method of each step.\n\n .. 
versionadded:: 0.23\n\n Returns\n -------\n self : object\n Returns a fitted instance.\n \"\"\"\n if not hasattr(self.estimator, 'fit'):\n raise ValueError('The base estimator should implement a fit method')\n y = self._validate_data(X='no_validation', y=y, multi_output=True)\n if is_classifier(self):\n check_classification_targets(y)\n if y.ndim == 1:\n raise ValueError('y must have at least two dimensions for multi-output regression but has only one.')\n if sample_weight is not None and not has_fit_parameter(self.estimator, 'sample_weight'):\n raise ValueError('Underlying estimator does not support sample weights.')\n fit_params_validated = _check_fit_params(X, fit_params)\n self.estimators_ = Parallel(n_jobs=self.n_jobs)((delayed(_fit_estimator)(self.estimator, X, y[:, i], sample_weight, **fit_params_validated) for i in range(y.shape[1])))\n if hasattr(self.estimators_[0], 'n_features_in_'):\n self.n_features_in_ = self.estimators_[0].n_features_in_\n if hasattr(self.estimators_[0], 'feature_names_in_'):\n self.feature_names_in_ = self.estimators_[0].feature_names_in_\n return self" }, { @@ -138121,7 +148650,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -138131,6 +148661,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "The input data." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -138141,6 +148675,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_outputs)", "description": "Multi-output targets." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -138151,7 +148689,8 @@ "docstring": { "type": "list of ndarray of shape (n_outputs,), default=None", "description": "Each array is unique classes for one output in str/int.\nCan be obtained via\n``[np.unique(y[:, i]) for i in range(y.shape[1])]``, where `y`\nis the target matrix of the entire dataset.\nThis argument is required for the first call to partial_fit\nand can be omitted in the subsequent calls.\nNote that `y` doesn't need to contain all labels in `classes`." - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -138161,13 +148700,14 @@ "docstring": { "type": "array-like of shape (n_samples,), default=None", "description": "Sample weights. If `None`, then samples are equally weighted.\nOnly supported if the underlying regressor supports sample\nweights." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Incrementally fit a separate model for each class output.", - "docstring": "Incrementally fit a separate model for each class output.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input data.\n\ny : {array-like, sparse matrix} of shape (n_samples, n_outputs)\n Multi-output targets.\n\nclasses : list of ndarray of shape (n_outputs,), default=None\n Each array is unique classes for one output in str/int.\n Can be obtained via\n ``[np.unique(y[:, i]) for i in range(y.shape[1])]``, where `y`\n is the target matrix of the entire dataset.\n This argument is required for the first call to partial_fit\n and can be omitted in the subsequent calls.\n Note that `y` doesn't need to contain all labels in `classes`.\n\nsample_weight : array-like of shape (n_samples,), default=None\n Sample weights. 
If `None`, then samples are equally weighted.\n Only supported if the underlying regressor supports sample\n weights.\n\nReturns\n-------\nself : object\n Returns a fitted instance.", + "docstring": "Incrementally fit a separate model for each class output.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input data.\n\n y : {array-like, sparse matrix} of shape (n_samples, n_outputs)\n Multi-output targets.\n\n classes : list of ndarray of shape (n_outputs,), default=None\n Each array is unique classes for one output in str/int.\n Can be obtained via\n ``[np.unique(y[:, i]) for i in range(y.shape[1])]``, where `y`\n is the target matrix of the entire dataset.\n This argument is required for the first call to partial_fit\n and can be omitted in the subsequent calls.\n Note that `y` doesn't need to contain all labels in `classes`.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights. If `None`, then samples are equally weighted.\n Only supported if the underlying regressor supports sample\n weights.\n\n Returns\n -------\n self : object\n Returns a fitted instance.\n ", "source_code": "\n@_available_if_estimator_has('partial_fit')\ndef partial_fit(self, X, y, classes=None, sample_weight=None):\n \"\"\"Incrementally fit a separate model for each class output.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input data.\n\n y : {array-like, sparse matrix} of shape (n_samples, n_outputs)\n Multi-output targets.\n\n classes : list of ndarray of shape (n_outputs,), default=None\n Each array is unique classes for one output in str/int.\n Can be obtained via\n ``[np.unique(y[:, i]) for i in range(y.shape[1])]``, where `y`\n is the target matrix of the entire dataset.\n This argument is required for the first call to partial_fit\n and can be omitted in the subsequent calls.\n Note that `y` doesn't need to contain all labels in `classes`.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights. If `None`, then samples are equally weighted.\n Only supported if the underlying regressor supports sample\n weights.\n\n Returns\n -------\n self : object\n Returns a fitted instance.\n \"\"\"\n first_time = not hasattr(self, 'estimators_')\n y = self._validate_data(X='no_validation', y=y, multi_output=True)\n if y.ndim == 1:\n raise ValueError('y must have at least two dimensions for multi-output regression but has only one.')\n if sample_weight is not None and not has_fit_parameter(self.estimator, 'sample_weight'):\n raise ValueError('Underlying estimator does not support sample weights.')\n first_time = not hasattr(self, 'estimators_')\n self.estimators_ = Parallel(n_jobs=self.n_jobs)((delayed(_partial_fit_estimator)(self.estimators_[i] if not first_time else self.estimator, X, y[:, i], classes[i] if classes is not None else None, sample_weight, first_time) for i in range(y.shape[1])))\n if first_time and hasattr(self.estimators_[0], 'n_features_in_'):\n self.n_features_in_ = self.estimators_[0].n_features_in_\n if first_time and hasattr(self.estimators_[0], 'feature_names_in_'):\n self.feature_names_in_ = self.estimators_[0].feature_names_in_\n return self" }, { @@ -138185,7 +148725,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -138195,13 +148736,17 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "The input data." 
+ }, + "refined_type": { + "kind": "EnumType", + "values": [] } } ], "results": [], "is_public": false, "description": "Predict multi-output variable using model for each target variable.", - "docstring": "Predict multi-output variable using model for each target variable.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input data.\n\nReturns\n-------\ny : {array-like, sparse matrix} of shape (n_samples, n_outputs)\n Multi-output targets predicted across multiple predictors.\n Note: Separate models are generated for each predictor.", + "docstring": "Predict multi-output variable using model for each target variable.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input data.\n\n Returns\n -------\n y : {array-like, sparse matrix} of shape (n_samples, n_outputs)\n Multi-output targets predicted across multiple predictors.\n Note: Separate models are generated for each predictor.\n ", "source_code": "\ndef predict(self, X):\n \"\"\"Predict multi-output variable using model for each target variable.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input data.\n\n Returns\n -------\n y : {array-like, sparse matrix} of shape (n_samples, n_outputs)\n Multi-output targets predicted across multiple predictors.\n Note: Separate models are generated for each predictor.\n \"\"\"\n check_is_fitted(self)\n if not hasattr(self.estimators_[0], 'predict'):\n raise ValueError('The base estimator should implement a predict method')\n y = Parallel(n_jobs=self.n_jobs)((delayed(e.predict)(X) for e in self.estimators_))\n return np.asarray(y).T" }, { @@ -138219,13 +148764,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Return a function to check if `base_estimator` or `estimators_` has `attr`.\n\nHelper for Chain implementations.", - "docstring": "Return a function to check if `base_estimator` or `estimators_` has `attr`.\n\nHelper for Chain implementations.", + "docstring": "Return a function to check if `base_estimator` or `estimators_` has `attr`.\n\n Helper for Chain implementations.\n ", "source_code": "\ndef _available_if_base_estimator_has(attr):\n \"\"\"Return a function to check if `base_estimator` or `estimators_` has `attr`.\n\n Helper for Chain implementations.\n \"\"\"\n \n def _check(self):\n return hasattr(self.base_estimator, attr) or all((hasattr(est, attr) for est in self.estimators_))\n return available_if(_check)" }, { @@ -138243,13 +148789,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Return a function to check if `estimator` or `estimators_` has `attr`.\n\nHelper for Chain implementations.", - "docstring": "Return a function to check if `estimator` or `estimators_` has `attr`.\n\nHelper for Chain implementations.", + "docstring": "Return a function to check if `estimator` or `estimators_` has `attr`.\n\n Helper for Chain implementations.\n ", "source_code": "\ndef _available_if_estimator_has(attr):\n \"\"\"Return a function to check if `estimator` or `estimators_` has `attr`.\n\n Helper for Chain implementations.\n \"\"\"\n \n def _check(self):\n return hasattr(self.estimator, attr) or all((hasattr(est, attr) for est in self.estimators_))\n return available_if(_check)" }, { @@ -138267,7 +148814,8 @@ "docstring": { "type": "", "description": "" - } + }, + 
"refined_type": {} }, { "name": "X", @@ -138277,7 +148825,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -138287,7 +148836,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -138297,13 +148847,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _fit_estimator(estimator, X, y, sample_weight=None, **fit_params):\n estimator = clone(estimator)\n if sample_weight is not None:\n estimator.fit(X, y, sample_weight=sample_weight, **fit_params)\n else:\n estimator.fit(X, y, **fit_params)\n return estimator" }, { @@ -138321,7 +148872,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -138331,7 +148883,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -138341,7 +148894,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "classes", @@ -138351,7 +148905,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -138361,7 +148916,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "first_time", @@ -138371,13 +148927,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _partial_fit_estimator(estimator, X, y, classes=None, sample_weight=None, first_time=True):\n if first_time:\n estimator = clone(estimator)\n if sample_weight is not None:\n if classes is not None:\n estimator.partial_fit(X, y, classes=classes, sample_weight=sample_weight)\n else:\n estimator.partial_fit(X, y, sample_weight=sample_weight)\n elif classes is not None:\n estimator.partial_fit(X, y, classes=classes)\n else:\n estimator.partial_fit(X, y)\n return estimator" }, { @@ -138395,7 +148952,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "alpha", @@ -138405,7 +148963,8 @@ "docstring": { "type": "float, default=1.0", "description": "Additive (Laplace/Lidstone) smoothing parameter\n(0 for no smoothing)." - } + }, + "refined_type": {} }, { "name": "binarize", @@ -138415,7 +148974,8 @@ "docstring": { "type": "float or None, default=0.0", "description": "Threshold for binarizing (mapping to booleans) of sample features.\nIf None, input is presumed to already consist of binary vectors." - } + }, + "refined_type": {} }, { "name": "fit_prior", @@ -138425,7 +148985,8 @@ "docstring": { "type": "bool, default=True", "description": "Whether to learn class prior probabilities or not.\nIf false, a uniform prior will be used." - } + }, + "refined_type": {} }, { "name": "class_prior", @@ -138435,13 +148996,14 @@ "docstring": { "type": "array-like of shape (n_classes,), default=None", "description": "Prior probabilities of the classes. If specified the priors are not\nadjusted according to the data." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, *, alpha=1.0, binarize=0.0, fit_prior=True, class_prior=None):\n self.alpha = alpha\n self.binarize = binarize\n self.fit_prior = fit_prior\n self.class_prior = class_prior" }, { @@ -138459,7 +149021,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -138469,7 +149032,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -138493,7 +149057,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -138503,7 +149068,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -138513,7 +149079,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "reset", @@ -138523,13 +149090,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _check_X_y(self, X, y, reset=True):\n (X, y) = super()._check_X_y(X, y, reset=reset)\n if self.binarize is not None:\n X = binarize(X, threshold=self.binarize)\n return X, y" }, { @@ -138547,7 +149115,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -138557,7 +149126,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "Y", @@ -138567,7 +149137,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -138591,7 +149162,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -138601,7 +149173,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -138625,7 +149198,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "alpha", @@ -138635,7 +149209,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -138659,7 +149234,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "alpha", @@ -138669,7 +149245,8 @@ "docstring": { "type": "float, default=1.0", "description": "Additive (Laplace/Lidstone) smoothing parameter\n(0 for no smoothing)." - } + }, + "refined_type": {} }, { "name": "fit_prior", @@ -138679,7 +149256,8 @@ "docstring": { "type": "bool, default=True", "description": "Whether to learn class prior probabilities or not.\nIf false, a uniform prior will be used." - } + }, + "refined_type": {} }, { "name": "class_prior", @@ -138689,7 +149267,8 @@ "docstring": { "type": "array-like of shape (n_classes,), default=None", "description": "Prior probabilities of the classes. If specified the priors are not\nadjusted according to the data." - } + }, + "refined_type": {} }, { "name": "min_categories", @@ -138699,13 +149278,14 @@ "docstring": { "type": "int or array-like of shape (n_features,), default=None", "description": "Minimum number of categories per feature.\n\n- integer: Sets the minimum number of categories per feature to\n `n_categories` for each features.\n- array-like: shape (n_features,) where `n_categories[i]` holds the\n minimum number of categories for the ith column of the input.\n- None (default): Determines the number of categories automatically\n from the training data.\n\n.. 
versionadded:: 0.24" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, *, alpha=1.0, fit_prior=True, class_prior=None, min_categories=None):\n self.alpha = alpha\n self.fit_prior = fit_prior\n self.class_prior = class_prior\n self.min_categories = min_categories" }, { @@ -138723,7 +149303,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -138733,7 +149314,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -138757,7 +149339,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -138767,7 +149350,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -138777,7 +149361,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "reset", @@ -138787,13 +149372,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _check_X_y(self, X, y, reset=True):\n (X, y) = self._validate_data(X, y, dtype='int', accept_sparse=False, force_all_finite=True, reset=reset)\n check_non_negative(X, 'CategoricalNB (input X)')\n return X, y" }, { @@ -138811,7 +149397,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -138821,7 +149408,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "Y", @@ -138831,13 +149419,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _count(self, X, Y):\n \n def _update_cat_count_dims(cat_count, highest_feature):\n diff = highest_feature + 1 - cat_count.shape[1]\n if diff > 0:\n return np.pad(cat_count, [(0, 0), (0, diff)], 'constant')\n return cat_count\n \n def _update_cat_count(X_feature, Y, cat_count, n_classes):\n for j in range(n_classes):\n mask = Y[:, j].astype(bool)\n if Y.dtype.type == np.int64:\n weights = None\n else:\n weights = Y[mask, j]\n counts = np.bincount(X_feature[mask], weights=weights)\n indices = np.nonzero(counts)[0]\n cat_count[j, indices] += counts[indices]\n self.class_count_ += Y.sum(axis=0)\n self.n_categories_ = self._validate_n_categories(X, self.min_categories)\n for i in range(self.n_features_in_):\n X_feature = X[:, i]\n self.category_count_[i] = _update_cat_count_dims(self.category_count_[i], self.n_categories_[i] - 1)\n _update_cat_count(X_feature, Y, self.category_count_[i], self.class_count_.shape[0])" }, { @@ -138855,7 +149444,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_classes", @@ -138865,7 +149455,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_features", @@ -138875,13 +149466,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _init_counters(self, n_classes, n_features):\n self.class_count_ = np.zeros(n_classes, dtype=np.float64)\n self.category_count_ = [np.zeros((n_classes, 0)) for _ in range(n_features)]" }, { @@ -138899,7 +149491,8 @@ "docstring": { "type": "", "description": "" 
- } + }, + "refined_type": {} }, { "name": "X", @@ -138909,13 +149502,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _joint_log_likelihood(self, X):\n self._check_n_features(X, reset=False)\n jll = np.zeros((X.shape[0], self.class_count_.shape[0]))\n for i in range(self.n_features_in_):\n indices = X[:, i]\n jll += self.feature_log_prob_[i][:, indices].T\n total_ll = jll + self.class_log_prior_\n return total_ll" }, { @@ -138933,13 +149527,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _more_tags(self):\n return {'requires_positive_X': True}" }, { @@ -138957,7 +149552,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "alpha", @@ -138967,13 +149563,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _update_feature_log_prob(self, alpha):\n feature_log_prob = []\n for i in range(self.n_features_in_):\n smoothed_cat_count = self.category_count_[i] + alpha\n smoothed_class_count = smoothed_cat_count.sum(axis=1)\n feature_log_prob.append(np.log(smoothed_cat_count) - np.log(smoothed_class_count.reshape(-1, 1)))\n self.feature_log_prob_ = feature_log_prob" }, { @@ -138991,7 +149588,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "min_categories", @@ -139001,13 +149599,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@staticmethod\ndef _validate_n_categories(X, min_categories):\n n_categories_X = X.max(axis=0) + 1\n min_categories_ = np.array(min_categories)\n if min_categories is not None:\n if not np.issubdtype(min_categories_.dtype, np.signedinteger):\n raise ValueError(f\"'min_categories' should have integral type. Got {min_categories_.dtype} instead.\")\n n_categories_ = np.maximum(n_categories_X, min_categories_, dtype=np.int64)\n if n_categories_.shape != n_categories_X.shape:\n raise ValueError(f\"'min_categories' should have shape ({X.shape[1]},) when an array-like is provided. Got {min_categories_.shape} instead.\")\n return n_categories_\n else:\n return n_categories_X" }, { @@ -139025,7 +149624,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -139035,6 +149635,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "Training vectors, where `n_samples` is the number of samples and\n`n_features` is the number of features. Here, each feature of X is\nassumed to be from a different categorical distribution.\nIt is further assumed that all categories of each feature are\nrepresented by the numbers 0, ..., n - 1, where n refers to the\ntotal number of categories for the given feature. This can, for\ninstance, be achieved with the help of OrdinalEncoder." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -139045,7 +149649,8 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "Target values." 
- } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -139055,13 +149660,14 @@ "docstring": { "type": "array-like of shape (n_samples,), default=None", "description": "Weights applied to individual samples (1. for unweighted)." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Fit Naive Bayes classifier according to X, y.", - "docstring": "Fit Naive Bayes classifier according to X, y.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training vectors, where `n_samples` is the number of samples and\n `n_features` is the number of features. Here, each feature of X is\n assumed to be from a different categorical distribution.\n It is further assumed that all categories of each feature are\n represented by the numbers 0, ..., n - 1, where n refers to the\n total number of categories for the given feature. This can, for\n instance, be achieved with the help of OrdinalEncoder.\n\ny : array-like of shape (n_samples,)\n Target values.\n\nsample_weight : array-like of shape (n_samples,), default=None\n Weights applied to individual samples (1. for unweighted).\n\nReturns\n-------\nself : object\n Returns the instance itself.", + "docstring": "Fit Naive Bayes classifier according to X, y.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training vectors, where `n_samples` is the number of samples and\n `n_features` is the number of features. Here, each feature of X is\n assumed to be from a different categorical distribution.\n It is further assumed that all categories of each feature are\n represented by the numbers 0, ..., n - 1, where n refers to the\n total number of categories for the given feature. This can, for\n instance, be achieved with the help of OrdinalEncoder.\n\n y : array-like of shape (n_samples,)\n Target values.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Weights applied to individual samples (1. for unweighted).\n\n Returns\n -------\n self : object\n Returns the instance itself.\n ", "source_code": "\ndef fit(self, X, y, sample_weight=None):\n \"\"\"Fit Naive Bayes classifier according to X, y.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training vectors, where `n_samples` is the number of samples and\n `n_features` is the number of features. Here, each feature of X is\n assumed to be from a different categorical distribution.\n It is further assumed that all categories of each feature are\n represented by the numbers 0, ..., n - 1, where n refers to the\n total number of categories for the given feature. This can, for\n instance, be achieved with the help of OrdinalEncoder.\n\n y : array-like of shape (n_samples,)\n Target values.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Weights applied to individual samples (1. for unweighted).\n\n Returns\n -------\n self : object\n Returns the instance itself.\n \"\"\"\n return super().fit(X, y, sample_weight=sample_weight)" }, { @@ -139079,7 +149685,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -139089,6 +149696,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "Training vectors, where `n_samples` is the number of samples and\n`n_features` is the number of features. 
Here, each feature of X is\nassumed to be from a different categorical distribution.\nIt is further assumed that all categories of each feature are\nrepresented by the numbers 0, ..., n - 1, where n refers to the\ntotal number of categories for the given feature. This can, for\ninstance, be achieved with the help of OrdinalEncoder." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -139099,7 +149710,8 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "Target values." - } + }, + "refined_type": {} }, { "name": "classes", @@ -139109,7 +149721,8 @@ "docstring": { "type": "array-like of shape (n_classes,), default=None", "description": "List of all the classes that can possibly appear in the y vector.\n\nMust be provided at the first call to partial_fit, can be omitted\nin subsequent calls." - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -139119,13 +149732,14 @@ "docstring": { "type": "array-like of shape (n_samples,), default=None", "description": "Weights applied to individual samples (1. for unweighted)." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Incremental fit on a batch of samples.\n\nThis method is expected to be called several times consecutively on different chunks of a dataset so as to implement out-of-core or online learning. This is especially useful when the whole dataset is too big to fit in memory at once. This method has some performance overhead hence it is better to call partial_fit on chunks of data that are as large as possible (as long as fitting in the memory budget) to hide the overhead.", - "docstring": "Incremental fit on a batch of samples.\n\nThis method is expected to be called several times consecutively\non different chunks of a dataset so as to implement out-of-core\nor online learning.\n\nThis is especially useful when the whole dataset is too big to fit in\nmemory at once.\n\nThis method has some performance overhead hence it is better to call\npartial_fit on chunks of data that are as large as possible\n(as long as fitting in the memory budget) to hide the overhead.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training vectors, where `n_samples` is the number of samples and\n `n_features` is the number of features. Here, each feature of X is\n assumed to be from a different categorical distribution.\n It is further assumed that all categories of each feature are\n represented by the numbers 0, ..., n - 1, where n refers to the\n total number of categories for the given feature. This can, for\n instance, be achieved with the help of OrdinalEncoder.\n\ny : array-like of shape (n_samples,)\n Target values.\n\nclasses : array-like of shape (n_classes,), default=None\n List of all the classes that can possibly appear in the y vector.\n\n Must be provided at the first call to partial_fit, can be omitted\n in subsequent calls.\n\nsample_weight : array-like of shape (n_samples,), default=None\n Weights applied to individual samples (1. 
for unweighted).\n\nReturns\n-------\nself : object\n Returns the instance itself.", + "description": "Incremental fit on a batch of samples.\n\nThis method is expected to be called several times consecutively\non different chunks of a dataset so as to implement out-of-core\nor online learning.\n\nThis is especially useful when the whole dataset is too big to fit in\nmemory at once.\n\nThis method has some performance overhead hence it is better to call\npartial_fit on chunks of data that are as large as possible\n(as long as fitting in the memory budget) to hide the overhead.", + "docstring": "Incremental fit on a batch of samples.\n\n This method is expected to be called several times consecutively\n on different chunks of a dataset so as to implement out-of-core\n or online learning.\n\n This is especially useful when the whole dataset is too big to fit in\n memory at once.\n\n This method has some performance overhead hence it is better to call\n partial_fit on chunks of data that are as large as possible\n (as long as fitting in the memory budget) to hide the overhead.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training vectors, where `n_samples` is the number of samples and\n `n_features` is the number of features. Here, each feature of X is\n assumed to be from a different categorical distribution.\n It is further assumed that all categories of each feature are\n represented by the numbers 0, ..., n - 1, where n refers to the\n total number of categories for the given feature. This can, for\n instance, be achieved with the help of OrdinalEncoder.\n\n y : array-like of shape (n_samples,)\n Target values.\n\n classes : array-like of shape (n_classes,), default=None\n List of all the classes that can possibly appear in the y vector.\n\n Must be provided at the first call to partial_fit, can be omitted\n in subsequent calls.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Weights applied to individual samples (1. for unweighted).\n\n Returns\n -------\n self : object\n Returns the instance itself.\n ", "source_code": "\ndef partial_fit(self, X, y, classes=None, sample_weight=None):\n \"\"\"Incremental fit on a batch of samples.\n\n This method is expected to be called several times consecutively\n on different chunks of a dataset so as to implement out-of-core\n or online learning.\n\n This is especially useful when the whole dataset is too big to fit in\n memory at once.\n\n This method has some performance overhead hence it is better to call\n partial_fit on chunks of data that are as large as possible\n (as long as fitting in the memory budget) to hide the overhead.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training vectors, where `n_samples` is the number of samples and\n `n_features` is the number of features. Here, each feature of X is\n assumed to be from a different categorical distribution.\n It is further assumed that all categories of each feature are\n represented by the numbers 0, ..., n - 1, where n refers to the\n total number of categories for the given feature. 
This can, for\n instance, be achieved with the help of OrdinalEncoder.\n\n y : array-like of shape (n_samples,)\n Target values.\n\n classes : array-like of shape (n_classes,), default=None\n List of all the classes that can possibly appear in the y vector.\n\n Must be provided at the first call to partial_fit, can be omitted\n in subsequent calls.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Weights applied to individual samples (1. for unweighted).\n\n Returns\n -------\n self : object\n Returns the instance itself.\n \"\"\"\n return super().partial_fit(X, y, classes, sample_weight=sample_weight)" }, { @@ -139143,7 +149757,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "alpha", @@ -139153,7 +149768,8 @@ "docstring": { "type": "float, default=1.0", "description": "Additive (Laplace/Lidstone) smoothing parameter (0 for no smoothing)." - } + }, + "refined_type": {} }, { "name": "fit_prior", @@ -139163,7 +149779,8 @@ "docstring": { "type": "bool, default=True", "description": "Only used in edge case with a single class in the training set." - } + }, + "refined_type": {} }, { "name": "class_prior", @@ -139173,7 +149790,8 @@ "docstring": { "type": "array-like of shape (n_classes,), default=None", "description": "Prior probabilities of the classes. Not used." - } + }, + "refined_type": {} }, { "name": "norm", @@ -139183,13 +149801,14 @@ "docstring": { "type": "bool, default=False", "description": "Whether or not a second normalization of the weights is performed. The\ndefault behavior mirrors the implementations found in Mahout and Weka,\nwhich do not follow the full algorithm described in Table 9 of the\npaper." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, *, alpha=1.0, fit_prior=True, class_prior=None, norm=False):\n self.alpha = alpha\n self.fit_prior = fit_prior\n self.class_prior = class_prior\n self.norm = norm" }, { @@ -139207,7 +149826,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -139217,7 +149837,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "Y", @@ -139227,7 +149848,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -139251,7 +149873,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -139261,7 +149884,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -139285,13 +149909,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _more_tags(self):\n return {'requires_positive_X': True}" }, { @@ -139309,7 +149934,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "alpha", @@ -139319,7 +149945,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -139343,7 +149970,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "priors", @@ -139353,7 +149981,8 @@ "docstring": { "type": "array-like of shape (n_classes,)", "description": "Prior probabilities of the classes. If specified the priors are not\nadjusted according to the data." 
- } + }, + "refined_type": {} }, { "name": "var_smoothing", @@ -139363,13 +149992,14 @@ "docstring": { "type": "float, default=1e-9", "description": "Portion of the largest variance of all features that is added to\nvariances for calculation stability.\n\n.. versionadded:: 0.20" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, *, priors=None, var_smoothing=1e-09):\n self.priors = priors\n self.var_smoothing = var_smoothing" }, { @@ -139387,7 +150017,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -139397,7 +150028,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -139421,7 +150053,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -139431,13 +150064,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _joint_log_likelihood(self, X):\n joint_log_likelihood = []\n for i in range(np.size(self.classes_)):\n jointi = np.log(self.class_prior_[i])\n n_ij = -0.5 * np.sum(np.log(2.0 * np.pi * self.var_[i, :]))\n n_ij -= 0.5 * np.sum((X - self.theta_[i, :])**2 / self.var_[i, :], 1)\n joint_log_likelihood.append(jointi + n_ij)\n joint_log_likelihood = np.array(joint_log_likelihood).T\n return joint_log_likelihood" }, { @@ -139455,7 +150089,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -139465,7 +150100,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "Training vectors, where `n_samples` is the number of samples and\n`n_features` is the number of features." - } + }, + "refined_type": {} }, { "name": "y", @@ -139475,7 +150111,8 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "Target values." - } + }, + "refined_type": {} }, { "name": "classes", @@ -139485,7 +150122,8 @@ "docstring": { "type": "array-like of shape (n_classes,), default=None", "description": "List of all the classes that can possibly appear in the y vector.\n\nMust be provided at the first call to partial_fit, can be omitted\nin subsequent calls." - } + }, + "refined_type": {} }, { "name": "_refit", @@ -139495,7 +150133,8 @@ "docstring": { "type": "bool, default=False", "description": "If true, act as though this were the first time we called\n_partial_fit (ie, throw away any past fitting and start over)." - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -139505,13 +150144,14 @@ "docstring": { "type": "array-like of shape (n_samples,), default=None", "description": "Weights applied to individual samples (1. for unweighted)." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Actual implementation of Gaussian NB fitting.", - "docstring": "Actual implementation of Gaussian NB fitting.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n Training vectors, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\ny : array-like of shape (n_samples,)\n Target values.\n\nclasses : array-like of shape (n_classes,), default=None\n List of all the classes that can possibly appear in the y vector.\n\n Must be provided at the first call to partial_fit, can be omitted\n in subsequent calls.\n\n_refit : bool, default=False\n If true, act as though this were the first time we called\n _partial_fit (ie, throw away any past fitting and start over).\n\nsample_weight : array-like of shape (n_samples,), default=None\n Weights applied to individual samples (1. for unweighted).\n\nReturns\n-------\nself : object", + "docstring": "Actual implementation of Gaussian NB fitting.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training vectors, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\n y : array-like of shape (n_samples,)\n Target values.\n\n classes : array-like of shape (n_classes,), default=None\n List of all the classes that can possibly appear in the y vector.\n\n Must be provided at the first call to partial_fit, can be omitted\n in subsequent calls.\n\n _refit : bool, default=False\n If true, act as though this were the first time we called\n _partial_fit (ie, throw away any past fitting and start over).\n\n sample_weight : array-like of shape (n_samples,), default=None\n Weights applied to individual samples (1. for unweighted).\n\n Returns\n -------\n self : object\n ", "source_code": "\ndef _partial_fit(self, X, y, classes=None, _refit=False, sample_weight=None):\n \"\"\"Actual implementation of Gaussian NB fitting.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training vectors, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\n y : array-like of shape (n_samples,)\n Target values.\n\n classes : array-like of shape (n_classes,), default=None\n List of all the classes that can possibly appear in the y vector.\n\n Must be provided at the first call to partial_fit, can be omitted\n in subsequent calls.\n\n _refit : bool, default=False\n If true, act as though this were the first time we called\n _partial_fit (ie, throw away any past fitting and start over).\n\n sample_weight : array-like of shape (n_samples,), default=None\n Weights applied to individual samples (1. 
for unweighted).\n\n Returns\n -------\n self : object\n \"\"\"\n if _refit:\n self.classes_ = None\n first_call = _check_partial_fit_first_call(self, classes)\n (X, y) = self._validate_data(X, y, reset=first_call)\n if sample_weight is not None:\n sample_weight = _check_sample_weight(sample_weight, X)\n self.epsilon_ = self.var_smoothing * np.var(X, axis=0).max()\n if first_call:\n n_features = X.shape[1]\n n_classes = len(self.classes_)\n self.theta_ = np.zeros((n_classes, n_features))\n self.var_ = np.zeros((n_classes, n_features))\n self.class_count_ = np.zeros(n_classes, dtype=np.float64)\n if self.priors is not None:\n priors = np.asarray(self.priors)\n if len(priors) != n_classes:\n raise ValueError('Number of priors must match number of classes.')\n if not np.isclose(priors.sum(), 1.0):\n raise ValueError('The sum of the priors should be 1.')\n if (priors < 0).any():\n raise ValueError('Priors must be non-negative.')\n self.class_prior_ = priors\n else:\n self.class_prior_ = np.zeros(len(self.classes_), dtype=np.float64)\n else:\n if X.shape[1] != self.theta_.shape[1]:\n msg = 'Number of features %d does not match previous data %d.'\n raise ValueError(msg % (X.shape[1], self.theta_.shape[1]))\n self.var_[:, :] -= self.epsilon_\n classes = self.classes_\n unique_y = np.unique(y)\n unique_y_in_classes = np.in1d(unique_y, classes)\n if not np.all(unique_y_in_classes):\n raise ValueError('The target label(s) %s in y do not exist in the initial classes %s' % (unique_y[~unique_y_in_classes], classes))\n for y_i in unique_y:\n i = classes.searchsorted(y_i)\n X_i = X[y == y_i, :]\n if sample_weight is not None:\n sw_i = sample_weight[y == y_i]\n N_i = sw_i.sum()\n else:\n sw_i = None\n N_i = X_i.shape[0]\n (new_theta, new_sigma) = self._update_mean_variance(self.class_count_[i], self.theta_[i, :], self.var_[i, :], X_i, sw_i)\n self.theta_[i, :] = new_theta\n self.var_[i, :] = new_sigma\n self.class_count_[i] += N_i\n self.var_[:, :] += self.epsilon_\n if self.priors is None:\n self.class_prior_ = self.class_count_ / self.class_count_.sum()\n return self" }, { @@ -139529,7 +150169,8 @@ "docstring": { "type": "int", "description": "Number of samples represented in old mean and variance. If sample\nweights were given, this should contain the sum of sample\nweights represented in old mean and variance." - } + }, + "refined_type": {} }, { "name": "mu", @@ -139539,7 +150180,8 @@ "docstring": { "type": "array-like of shape (number of Gaussians,)", "description": "Means for Gaussians in original set." - } + }, + "refined_type": {} }, { "name": "var", @@ -139549,7 +150191,8 @@ "docstring": { "type": "array-like of shape (number of Gaussians,)", "description": "Variances for Gaussians in original set." - } + }, + "refined_type": {} }, { "name": "X", @@ -139559,7 +150202,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -139569,13 +150213,14 @@ "docstring": { "type": "array-like of shape (n_samples,), default=None", "description": "Weights applied to individual samples (1. for unweighted)." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Compute online update of Gaussian mean and variance.\n\nGiven starting sample count, mean, and variance, a new set of points X, and optionally sample weights, return the updated mean and variance. (NB - each dimension (column) in X is treated as independent -- you get variance, not covariance). 
Can take scalar mean and variance, or vector mean and variance to simultaneously update a number of independent Gaussians. See Stanford CS tech report STAN-CS-79-773 by Chan, Golub, and LeVeque: http://i.stanford.edu/pub/cstr/reports/cs/tr/79/773/CS-TR-79-773.pdf", - "docstring": "Compute online update of Gaussian mean and variance.\n\nGiven starting sample count, mean, and variance, a new set of\npoints X, and optionally sample weights, return the updated mean and\nvariance. (NB - each dimension (column) in X is treated as independent\n-- you get variance, not covariance).\n\nCan take scalar mean and variance, or vector mean and variance to\nsimultaneously update a number of independent Gaussians.\n\nSee Stanford CS tech report STAN-CS-79-773 by Chan, Golub, and LeVeque:\n\nhttp://i.stanford.edu/pub/cstr/reports/cs/tr/79/773/CS-TR-79-773.pdf\n\nParameters\n----------\nn_past : int\n Number of samples represented in old mean and variance. If sample\n weights were given, this should contain the sum of sample\n weights represented in old mean and variance.\n\nmu : array-like of shape (number of Gaussians,)\n Means for Gaussians in original set.\n\nvar : array-like of shape (number of Gaussians,)\n Variances for Gaussians in original set.\n\nsample_weight : array-like of shape (n_samples,), default=None\n Weights applied to individual samples (1. for unweighted).\n\nReturns\n-------\ntotal_mu : array-like of shape (number of Gaussians,)\n Updated mean for each Gaussian over the combined set.\n\ntotal_var : array-like of shape (number of Gaussians,)\n Updated variance for each Gaussian over the combined set.", + "description": "Compute online update of Gaussian mean and variance.\n\nGiven starting sample count, mean, and variance, a new set of\npoints X, and optionally sample weights, return the updated mean and\nvariance. (NB - each dimension (column) in X is treated as independent\n-- you get variance, not covariance).\n\nCan take scalar mean and variance, or vector mean and variance to\nsimultaneously update a number of independent Gaussians.\n\nSee Stanford CS tech report STAN-CS-79-773 by Chan, Golub, and LeVeque:\n\nhttp://i.stanford.edu/pub/cstr/reports/cs/tr/79/773/CS-TR-79-773.pdf", + "docstring": "Compute online update of Gaussian mean and variance.\n\n Given starting sample count, mean, and variance, a new set of\n points X, and optionally sample weights, return the updated mean and\n variance. (NB - each dimension (column) in X is treated as independent\n -- you get variance, not covariance).\n\n Can take scalar mean and variance, or vector mean and variance to\n simultaneously update a number of independent Gaussians.\n\n See Stanford CS tech report STAN-CS-79-773 by Chan, Golub, and LeVeque:\n\n http://i.stanford.edu/pub/cstr/reports/cs/tr/79/773/CS-TR-79-773.pdf\n\n Parameters\n ----------\n n_past : int\n Number of samples represented in old mean and variance. If sample\n weights were given, this should contain the sum of sample\n weights represented in old mean and variance.\n\n mu : array-like of shape (number of Gaussians,)\n Means for Gaussians in original set.\n\n var : array-like of shape (number of Gaussians,)\n Variances for Gaussians in original set.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Weights applied to individual samples (1. 
for unweighted).\n\n Returns\n -------\n total_mu : array-like of shape (number of Gaussians,)\n Updated mean for each Gaussian over the combined set.\n\n total_var : array-like of shape (number of Gaussians,)\n Updated variance for each Gaussian over the combined set.\n ", "source_code": "\n@staticmethod\ndef _update_mean_variance(n_past, mu, var, X, sample_weight=None):\n \"\"\"Compute online update of Gaussian mean and variance.\n\n Given starting sample count, mean, and variance, a new set of\n points X, and optionally sample weights, return the updated mean and\n variance. (NB - each dimension (column) in X is treated as independent\n -- you get variance, not covariance).\n\n Can take scalar mean and variance, or vector mean and variance to\n simultaneously update a number of independent Gaussians.\n\n See Stanford CS tech report STAN-CS-79-773 by Chan, Golub, and LeVeque:\n\n http://i.stanford.edu/pub/cstr/reports/cs/tr/79/773/CS-TR-79-773.pdf\n\n Parameters\n ----------\n n_past : int\n Number of samples represented in old mean and variance. If sample\n weights were given, this should contain the sum of sample\n weights represented in old mean and variance.\n\n mu : array-like of shape (number of Gaussians,)\n Means for Gaussians in original set.\n\n var : array-like of shape (number of Gaussians,)\n Variances for Gaussians in original set.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Weights applied to individual samples (1. for unweighted).\n\n Returns\n -------\n total_mu : array-like of shape (number of Gaussians,)\n Updated mean for each Gaussian over the combined set.\n\n total_var : array-like of shape (number of Gaussians,)\n Updated variance for each Gaussian over the combined set.\n \"\"\"\n if X.shape[0] == 0:\n return mu, var\n if sample_weight is not None:\n n_new = float(sample_weight.sum())\n new_mu = np.average(X, axis=0, weights=sample_weight)\n new_var = np.average((X - new_mu)**2, axis=0, weights=sample_weight)\n else:\n n_new = X.shape[0]\n new_var = np.var(X, axis=0)\n new_mu = np.mean(X, axis=0)\n if n_past == 0:\n return new_mu, new_var\n n_total = float(n_past + n_new)\n total_mu = (n_new * new_mu + n_past * mu) / n_total\n old_ssd = n_past * var\n new_ssd = n_new * new_var\n total_ssd = old_ssd + new_ssd + n_new * n_past / n_total * (mu - new_mu)**2\n total_var = total_ssd / n_total\n return total_mu, total_var" }, { @@ -139593,7 +150238,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -139603,7 +150249,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "Training vectors, where `n_samples` is the number of samples\nand `n_features` is the number of features." - } + }, + "refined_type": {} }, { "name": "y", @@ -139613,7 +150260,8 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "Target values." - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -139623,13 +150271,14 @@ "docstring": { "type": "array-like of shape (n_samples,), default=None", "description": "Weights applied to individual samples (1. for unweighted).\n\n.. versionadded:: 0.17\n Gaussian Naive Bayes supports fitting with *sample_weight*." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Fit Gaussian Naive Bayes according to X, y.", - "docstring": "Fit Gaussian Naive Bayes according to X, y.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n Training vectors, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\ny : array-like of shape (n_samples,)\n Target values.\n\nsample_weight : array-like of shape (n_samples,), default=None\n Weights applied to individual samples (1. for unweighted).\n\n .. versionadded:: 0.17\n Gaussian Naive Bayes supports fitting with *sample_weight*.\n\nReturns\n-------\nself : object\n Returns the instance itself.", + "docstring": "Fit Gaussian Naive Bayes according to X, y.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training vectors, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n y : array-like of shape (n_samples,)\n Target values.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Weights applied to individual samples (1. for unweighted).\n\n .. versionadded:: 0.17\n Gaussian Naive Bayes supports fitting with *sample_weight*.\n\n Returns\n -------\n self : object\n Returns the instance itself.\n ", "source_code": "\ndef fit(self, X, y, sample_weight=None):\n \"\"\"Fit Gaussian Naive Bayes according to X, y.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training vectors, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n y : array-like of shape (n_samples,)\n Target values.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Weights applied to individual samples (1. for unweighted).\n\n .. versionadded:: 0.17\n Gaussian Naive Bayes supports fitting with *sample_weight*.\n\n Returns\n -------\n self : object\n Returns the instance itself.\n \"\"\"\n y = self._validate_data(y=y)\n return self._partial_fit(X, y, np.unique(y), _refit=True, sample_weight=sample_weight)" }, { @@ -139647,7 +150296,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -139657,7 +150307,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "Training vectors, where `n_samples` is the number of samples and\n`n_features` is the number of features." - } + }, + "refined_type": {} }, { "name": "y", @@ -139667,7 +150318,8 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "Target values." - } + }, + "refined_type": {} }, { "name": "classes", @@ -139677,7 +150329,8 @@ "docstring": { "type": "array-like of shape (n_classes,), default=None", "description": "List of all the classes that can possibly appear in the y vector.\n\nMust be provided at the first call to partial_fit, can be omitted\nin subsequent calls." - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -139687,13 +150340,14 @@ "docstring": { "type": "array-like of shape (n_samples,), default=None", "description": "Weights applied to individual samples (1. for unweighted).\n\n.. versionadded:: 0.17" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Incremental fit on a batch of samples.\n\nThis method is expected to be called several times consecutively on different chunks of a dataset so as to implement out-of-core or online learning. This is especially useful when the whole dataset is too big to fit in memory at once. 
This method has some performance and numerical stability overhead, hence it is better to call partial_fit on chunks of data that are as large as possible (as long as fitting in the memory budget) to hide the overhead.", - "docstring": "Incremental fit on a batch of samples.\n\nThis method is expected to be called several times consecutively\non different chunks of a dataset so as to implement out-of-core\nor online learning.\n\nThis is especially useful when the whole dataset is too big to fit in\nmemory at once.\n\nThis method has some performance and numerical stability overhead,\nhence it is better to call partial_fit on chunks of data that are\nas large as possible (as long as fitting in the memory budget) to\nhide the overhead.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n Training vectors, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\ny : array-like of shape (n_samples,)\n Target values.\n\nclasses : array-like of shape (n_classes,), default=None\n List of all the classes that can possibly appear in the y vector.\n\n Must be provided at the first call to partial_fit, can be omitted\n in subsequent calls.\n\nsample_weight : array-like of shape (n_samples,), default=None\n Weights applied to individual samples (1. for unweighted).\n\n .. versionadded:: 0.17\n\nReturns\n-------\nself : object\n Returns the instance itself.", + "description": "Incremental fit on a batch of samples.\n\nThis method is expected to be called several times consecutively\non different chunks of a dataset so as to implement out-of-core\nor online learning.\n\nThis is especially useful when the whole dataset is too big to fit in\nmemory at once.\n\nThis method has some performance and numerical stability overhead,\nhence it is better to call partial_fit on chunks of data that are\nas large as possible (as long as fitting in the memory budget) to\nhide the overhead.", + "docstring": "Incremental fit on a batch of samples.\n\n This method is expected to be called several times consecutively\n on different chunks of a dataset so as to implement out-of-core\n or online learning.\n\n This is especially useful when the whole dataset is too big to fit in\n memory at once.\n\n This method has some performance and numerical stability overhead,\n hence it is better to call partial_fit on chunks of data that are\n as large as possible (as long as fitting in the memory budget) to\n hide the overhead.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training vectors, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\n y : array-like of shape (n_samples,)\n Target values.\n\n classes : array-like of shape (n_classes,), default=None\n List of all the classes that can possibly appear in the y vector.\n\n Must be provided at the first call to partial_fit, can be omitted\n in subsequent calls.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Weights applied to individual samples (1. for unweighted).\n\n .. 
versionadded:: 0.17\n\n Returns\n -------\n self : object\n Returns the instance itself.\n ", "source_code": "\ndef partial_fit(self, X, y, classes=None, sample_weight=None):\n \"\"\"Incremental fit on a batch of samples.\n\n This method is expected to be called several times consecutively\n on different chunks of a dataset so as to implement out-of-core\n or online learning.\n\n This is especially useful when the whole dataset is too big to fit in\n memory at once.\n\n This method has some performance and numerical stability overhead,\n hence it is better to call partial_fit on chunks of data that are\n as large as possible (as long as fitting in the memory budget) to\n hide the overhead.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training vectors, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\n y : array-like of shape (n_samples,)\n Target values.\n\n classes : array-like of shape (n_classes,), default=None\n List of all the classes that can possibly appear in the y vector.\n\n Must be provided at the first call to partial_fit, can be omitted\n in subsequent calls.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Weights applied to individual samples (1. for unweighted).\n\n .. versionadded:: 0.17\n\n Returns\n -------\n self : object\n Returns the instance itself.\n \"\"\"\n return self._partial_fit(X, y, classes, _refit=False, sample_weight=sample_weight)" }, { @@ -139714,13 +150368,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@deprecated('Attribute `sigma_` was deprecated in 1.0 and will be removed in1.2. Use `var_` instead.')\n@property\ndef sigma_(self):\n return self.var_" }, { @@ -139738,7 +150393,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "alpha", @@ -139748,7 +150404,8 @@ "docstring": { "type": "float, default=1.0", "description": "Additive (Laplace/Lidstone) smoothing parameter\n(0 for no smoothing)." - } + }, + "refined_type": {} }, { "name": "fit_prior", @@ -139758,7 +150415,8 @@ "docstring": { "type": "bool, default=True", "description": "Whether to learn class prior probabilities or not.\nIf false, a uniform prior will be used." - } + }, + "refined_type": {} }, { "name": "class_prior", @@ -139768,13 +150426,14 @@ "docstring": { "type": "array-like of shape (n_classes,), default=None", "description": "Prior probabilities of the classes. If specified the priors are not\nadjusted according to the data." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, *, alpha=1.0, fit_prior=True, class_prior=None):\n self.alpha = alpha\n self.fit_prior = fit_prior\n self.class_prior = class_prior" }, { @@ -139792,7 +150451,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -139802,7 +150462,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "Y", @@ -139812,7 +150473,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -139836,7 +150498,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -139846,7 +150509,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -139870,13 +150534,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _more_tags(self):\n return {'requires_positive_X': True}" }, { @@ -139894,7 +150559,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "alpha", @@ -139904,7 +150570,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -139928,7 +150595,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -139938,7 +150606,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -139962,7 +150631,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -139972,7 +150642,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -139982,7 +150653,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "reset", @@ -139992,7 +150664,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -140016,13 +150689,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _check_alpha(self):\n if np.min(self.alpha) < 0:\n raise ValueError('Smoothing parameter alpha = %.1e. alpha should be > 0.' 
% np.min(self.alpha))\n if isinstance(self.alpha, np.ndarray):\n if not self.alpha.shape[0] == self.n_features_in_:\n raise ValueError('alpha should be a scalar or a numpy array with shape [n_features]')\n if np.min(self.alpha) < _ALPHA_MIN:\n warnings.warn('alpha too small will result in numeric errors, setting alpha = %.1e' % _ALPHA_MIN)\n return np.maximum(self.alpha, _ALPHA_MIN)\n return self.alpha" }, { @@ -140040,7 +150714,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_classes", @@ -140050,7 +150725,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_features", @@ -140060,13 +150736,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _init_counters(self, n_classes, n_features):\n self.class_count_ = np.zeros(n_classes, dtype=np.float64)\n self.feature_count_ = np.zeros((n_classes, n_features), dtype=np.float64)" }, { @@ -140084,13 +150761,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _more_tags(self):\n return {'poor_score': True}" }, { @@ -140108,7 +150786,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "class_prior", @@ -140118,13 +150797,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _update_class_log_prior(self, class_prior=None):\n n_classes = len(self.classes_)\n if class_prior is not None:\n if len(class_prior) != n_classes:\n raise ValueError('Number of priors must match number of classes.')\n self.class_log_prior_ = np.log(class_prior)\n elif self.fit_prior:\n with warnings.catch_warnings():\n warnings.simplefilter('ignore', RuntimeWarning)\n log_class_count = np.log(self.class_count_)\n self.class_log_prior_ = log_class_count - np.log(self.class_count_.sum())\n else:\n self.class_log_prior_ = np.full(n_classes, -np.log(n_classes))" }, { @@ -140145,13 +150825,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@deprecated('Attribute `coef_` was deprecated in version 0.24 and will be removed in 1.1 (renaming of 0.26).')\n@property\ndef coef_(self):\n return self.feature_log_prob_[1:] if len(self.classes_) == 2 else self.feature_log_prob_" }, { @@ -140169,7 +150850,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -140179,6 +150861,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "Training vectors, where `n_samples` is the number of samples and\n`n_features` is the number of features." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -140189,7 +150875,8 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "Target values." - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -140199,13 +150886,14 @@ "docstring": { "type": "array-like of shape (n_samples,), default=None", "description": "Weights applied to individual samples (1. for unweighted)." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Fit Naive Bayes classifier according to X, y.", - "docstring": "Fit Naive Bayes classifier according to X, y.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training vectors, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\ny : array-like of shape (n_samples,)\n Target values.\n\nsample_weight : array-like of shape (n_samples,), default=None\n Weights applied to individual samples (1. for unweighted).\n\nReturns\n-------\nself : object\n Returns the instance itself.", + "docstring": "Fit Naive Bayes classifier according to X, y.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training vectors, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\n y : array-like of shape (n_samples,)\n Target values.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Weights applied to individual samples (1. for unweighted).\n\n Returns\n -------\n self : object\n Returns the instance itself.\n ", "source_code": "\ndef fit(self, X, y, sample_weight=None):\n \"\"\"Fit Naive Bayes classifier according to X, y.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training vectors, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\n y : array-like of shape (n_samples,)\n Target values.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Weights applied to individual samples (1. for unweighted).\n\n Returns\n -------\n self : object\n Returns the instance itself.\n \"\"\"\n (X, y) = self._check_X_y(X, y)\n (_, n_features) = X.shape\n labelbin = LabelBinarizer()\n Y = labelbin.fit_transform(y)\n self.classes_ = labelbin.classes_\n if Y.shape[1] == 1:\n if len(self.classes_) == 2:\n Y = np.concatenate((1 - Y, Y), axis=1)\n else:\n Y = np.ones_like(Y)\n if sample_weight is not None:\n Y = Y.astype(np.float64, copy=False)\n sample_weight = _check_sample_weight(sample_weight, X)\n sample_weight = np.atleast_2d(sample_weight)\n Y *= sample_weight.T\n class_prior = self.class_prior\n n_classes = Y.shape[1]\n self._init_counters(n_classes, n_features)\n self._count(X, Y)\n alpha = self._check_alpha()\n self._update_feature_log_prob(alpha)\n self._update_class_log_prior(class_prior=class_prior)\n return self" }, { @@ -140226,13 +150914,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@deprecated('Attribute `intercept_` was deprecated in version 0.24 and will be removed in 1.1 (renaming of 0.26).')\n@property\ndef intercept_(self):\n return self.class_log_prior_[1:] if len(self.classes_) == 2 else self.class_log_prior_" }, { @@ -140253,13 +150942,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@deprecated('Attribute `n_features_` was deprecated in version 1.0 and will be removed in 1.2. 
Use `n_features_in_` instead.')\n@property\ndef n_features_(self):\n return self.n_features_in_" }, { @@ -140277,7 +150967,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -140287,6 +150978,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "Training vectors, where `n_samples` is the number of samples and\n`n_features` is the number of features." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -140297,7 +150992,8 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "Target values." - } + }, + "refined_type": {} }, { "name": "classes", @@ -140307,7 +151003,8 @@ "docstring": { "type": "array-like of shape (n_classes,), default=None", "description": "List of all the classes that can possibly appear in the y vector.\n\nMust be provided at the first call to partial_fit, can be omitted\nin subsequent calls." - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -140317,13 +151014,14 @@ "docstring": { "type": "array-like of shape (n_samples,), default=None", "description": "Weights applied to individual samples (1. for unweighted)." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Incremental fit on a batch of samples.\n\nThis method is expected to be called several times consecutively on different chunks of a dataset so as to implement out-of-core or online learning. This is especially useful when the whole dataset is too big to fit in memory at once. This method has some performance overhead hence it is better to call partial_fit on chunks of data that are as large as possible (as long as fitting in the memory budget) to hide the overhead.", - "docstring": "Incremental fit on a batch of samples.\n\nThis method is expected to be called several times consecutively\non different chunks of a dataset so as to implement out-of-core\nor online learning.\n\nThis is especially useful when the whole dataset is too big to fit in\nmemory at once.\n\nThis method has some performance overhead hence it is better to call\npartial_fit on chunks of data that are as large as possible\n(as long as fitting in the memory budget) to hide the overhead.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training vectors, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\ny : array-like of shape (n_samples,)\n Target values.\n\nclasses : array-like of shape (n_classes,), default=None\n List of all the classes that can possibly appear in the y vector.\n\n Must be provided at the first call to partial_fit, can be omitted\n in subsequent calls.\n\nsample_weight : array-like of shape (n_samples,), default=None\n Weights applied to individual samples (1. 
for unweighted).\n\nReturns\n-------\nself : object\n Returns the instance itself.", + "description": "Incremental fit on a batch of samples.\n\nThis method is expected to be called several times consecutively\non different chunks of a dataset so as to implement out-of-core\nor online learning.\n\nThis is especially useful when the whole dataset is too big to fit in\nmemory at once.\n\nThis method has some performance overhead hence it is better to call\npartial_fit on chunks of data that are as large as possible\n(as long as fitting in the memory budget) to hide the overhead.", + "docstring": "Incremental fit on a batch of samples.\n\n This method is expected to be called several times consecutively\n on different chunks of a dataset so as to implement out-of-core\n or online learning.\n\n This is especially useful when the whole dataset is too big to fit in\n memory at once.\n\n This method has some performance overhead hence it is better to call\n partial_fit on chunks of data that are as large as possible\n (as long as fitting in the memory budget) to hide the overhead.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training vectors, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\n y : array-like of shape (n_samples,)\n Target values.\n\n classes : array-like of shape (n_classes,), default=None\n List of all the classes that can possibly appear in the y vector.\n\n Must be provided at the first call to partial_fit, can be omitted\n in subsequent calls.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Weights applied to individual samples (1. for unweighted).\n\n Returns\n -------\n self : object\n Returns the instance itself.\n ", "source_code": "\ndef partial_fit(self, X, y, classes=None, sample_weight=None):\n \"\"\"Incremental fit on a batch of samples.\n\n This method is expected to be called several times consecutively\n on different chunks of a dataset so as to implement out-of-core\n or online learning.\n\n This is especially useful when the whole dataset is too big to fit in\n memory at once.\n\n This method has some performance overhead hence it is better to call\n partial_fit on chunks of data that are as large as possible\n (as long as fitting in the memory budget) to hide the overhead.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training vectors, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\n y : array-like of shape (n_samples,)\n Target values.\n\n classes : array-like of shape (n_classes,), default=None\n List of all the classes that can possibly appear in the y vector.\n\n Must be provided at the first call to partial_fit, can be omitted\n in subsequent calls.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Weights applied to individual samples (1. 
for unweighted).\n\n Returns\n -------\n self : object\n Returns the instance itself.\n \"\"\"\n first_call = not hasattr(self, 'classes_')\n (X, y) = self._check_X_y(X, y, reset=first_call)\n (_, n_features) = X.shape\n if _check_partial_fit_first_call(self, classes):\n n_classes = len(classes)\n self._init_counters(n_classes, n_features)\n Y = label_binarize(y, classes=self.classes_)\n if Y.shape[1] == 1:\n if len(self.classes_) == 2:\n Y = np.concatenate((1 - Y, Y), axis=1)\n else:\n Y = np.ones_like(Y)\n if X.shape[0] != Y.shape[0]:\n msg = 'X.shape[0]=%d and y.shape[0]=%d are incompatible.'\n raise ValueError(msg % (X.shape[0], y.shape[0]))\n Y = Y.astype(np.float64, copy=False)\n if sample_weight is not None:\n sample_weight = _check_sample_weight(sample_weight, X)\n sample_weight = np.atleast_2d(sample_weight)\n Y *= sample_weight.T\n class_prior = self.class_prior\n self._count(X, Y)\n alpha = self._check_alpha()\n self._update_feature_log_prob(alpha)\n self._update_class_log_prior(class_prior=class_prior)\n return self" }, { @@ -140341,7 +151039,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -140351,13 +151050,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "To be overridden in subclasses with the actual checks.\n\nOnly used in predict* methods.", - "docstring": "To be overridden in subclasses with the actual checks.\n\nOnly used in predict* methods.", + "docstring": "To be overridden in subclasses with the actual checks.\n\n Only used in predict* methods.\n ", "source_code": "\n@abstractmethod\ndef _check_X(self, X):\n \"\"\"To be overridden in subclasses with the actual checks.\n\n Only used in predict* methods.\n \"\"\"\n " }, { @@ -140375,7 +151075,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -140385,13 +151086,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Compute the unnormalized posterior log probability of X\n\nI.e. ``log P(c) + log P(x|c)`` for all rows x of X, as an array-like of shape (n_classes, n_samples). Input is passed to _joint_log_likelihood as-is by predict, predict_proba and predict_log_proba.", - "docstring": "Compute the unnormalized posterior log probability of X\n\nI.e. ``log P(c) + log P(x|c)`` for all rows x of X, as an array-like of\nshape (n_classes, n_samples).\n\nInput is passed to _joint_log_likelihood as-is by predict,\npredict_proba and predict_log_proba.", + "description": "Compute the unnormalized posterior log probability of X\n\nI.e. ``log P(c) + log P(x|c)`` for all rows x of X, as an array-like of\nshape (n_classes, n_samples).\n\nInput is passed to _joint_log_likelihood as-is by predict,\npredict_proba and predict_log_proba.", + "docstring": "Compute the unnormalized posterior log probability of X\n\n I.e. ``log P(c) + log P(x|c)`` for all rows x of X, as an array-like of\n shape (n_classes, n_samples).\n\n Input is passed to _joint_log_likelihood as-is by predict,\n predict_proba and predict_log_proba.\n ", "source_code": "\n@abstractmethod\ndef _joint_log_likelihood(self, X):\n \"\"\"Compute the unnormalized posterior log probability of X\n\n I.e. 
``log P(c) + log P(x|c)`` for all rows x of X, as an array-like of\n shape (n_classes, n_samples).\n\n Input is passed to _joint_log_likelihood as-is by predict,\n predict_proba and predict_log_proba.\n \"\"\"\n " }, { @@ -140409,7 +151111,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -140419,13 +151122,14 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "The input samples." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Perform classification on an array of test vectors X.", - "docstring": "Perform classification on an array of test vectors X.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n The input samples.\n\nReturns\n-------\nC : ndarray of shape (n_samples,)\n Predicted target values for X.", + "docstring": "\n Perform classification on an array of test vectors X.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The input samples.\n\n Returns\n -------\n C : ndarray of shape (n_samples,)\n Predicted target values for X.\n ", "source_code": "\ndef predict(self, X):\n \"\"\"\n Perform classification on an array of test vectors X.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The input samples.\n\n Returns\n -------\n C : ndarray of shape (n_samples,)\n Predicted target values for X.\n \"\"\"\n check_is_fitted(self)\n X = self._check_X(X)\n jll = self._joint_log_likelihood(X)\n return self.classes_[np.argmax(jll, axis=1)]" }, { @@ -140443,7 +151147,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -140453,13 +151158,14 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "The input samples." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Return log-probability estimates for the test vector X.", - "docstring": "Return log-probability estimates for the test vector X.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n The input samples.\n\nReturns\n-------\nC : array-like of shape (n_samples, n_classes)\n Returns the log-probability of the samples for each class in\n the model. The columns correspond to the classes in sorted\n order, as they appear in the attribute :term:`classes_`.", + "docstring": "\n Return log-probability estimates for the test vector X.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The input samples.\n\n Returns\n -------\n C : array-like of shape (n_samples, n_classes)\n Returns the log-probability of the samples for each class in\n the model. The columns correspond to the classes in sorted\n order, as they appear in the attribute :term:`classes_`.\n ", "source_code": "\ndef predict_log_proba(self, X):\n \"\"\"\n Return log-probability estimates for the test vector X.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The input samples.\n\n Returns\n -------\n C : array-like of shape (n_samples, n_classes)\n Returns the log-probability of the samples for each class in\n the model. 
The columns correspond to the classes in sorted\n order, as they appear in the attribute :term:`classes_`.\n \"\"\"\n check_is_fitted(self)\n X = self._check_X(X)\n jll = self._joint_log_likelihood(X)\n log_prob_x = logsumexp(jll, axis=1)\n return jll - np.atleast_2d(log_prob_x).T" }, { @@ -140477,7 +151183,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -140487,13 +151194,14 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "The input samples." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Return probability estimates for the test vector X.", - "docstring": "Return probability estimates for the test vector X.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n The input samples.\n\nReturns\n-------\nC : array-like of shape (n_samples, n_classes)\n Returns the probability of the samples for each class in\n the model. The columns correspond to the classes in sorted\n order, as they appear in the attribute :term:`classes_`.", + "docstring": "\n Return probability estimates for the test vector X.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The input samples.\n\n Returns\n -------\n C : array-like of shape (n_samples, n_classes)\n Returns the probability of the samples for each class in\n the model. The columns correspond to the classes in sorted\n order, as they appear in the attribute :term:`classes_`.\n ", "source_code": "\ndef predict_proba(self, X):\n \"\"\"\n Return probability estimates for the test vector X.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The input samples.\n\n Returns\n -------\n C : array-like of shape (n_samples, n_classes)\n Returns the probability of the samples for each class in\n the model. The columns correspond to the classes in sorted\n order, as they appear in the attribute :term:`classes_`.\n \"\"\"\n return np.exp(self.predict_log_proba(X))" }, { @@ -140511,7 +151219,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "dist", @@ -140521,7 +151230,8 @@ "docstring": { "type": "ndarray of shape (n_samples_chunk, n_samples)", "description": "The distance matrix." - } + }, + "refined_type": {} }, { "name": "start", @@ -140531,7 +151241,8 @@ "docstring": { "type": "int", "description": "The index in X which the first row of dist corresponds to." - } + }, + "refined_type": {} }, { "name": "n_neighbors", @@ -140541,7 +151252,8 @@ "docstring": { "type": "int", "description": "Number of neighbors required for each sample." - } + }, + "refined_type": {} }, { "name": "return_distance", @@ -140551,13 +151263,14 @@ "docstring": { "type": "bool", "description": "Whether or not to return the distances." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Reduce a chunk of distances to the nearest neighbors.\n\nCallback to :func:`sklearn.metrics.pairwise.pairwise_distances_chunked`", - "docstring": "Reduce a chunk of distances to the nearest neighbors.\n\nCallback to :func:`sklearn.metrics.pairwise.pairwise_distances_chunked`\n\nParameters\n----------\ndist : ndarray of shape (n_samples_chunk, n_samples)\n The distance matrix.\n\nstart : int\n The index in X which the first row of dist corresponds to.\n\nn_neighbors : int\n Number of neighbors required for each sample.\n\nreturn_distance : bool\n Whether or not to return the distances.\n\nReturns\n-------\ndist : array of shape (n_samples_chunk, n_neighbors)\n Returned only if `return_distance=True`.\n\nneigh : array of shape (n_samples_chunk, n_neighbors)\n The neighbors indices.", + "docstring": "Reduce a chunk of distances to the nearest neighbors.\n\n Callback to :func:`sklearn.metrics.pairwise.pairwise_distances_chunked`\n\n Parameters\n ----------\n dist : ndarray of shape (n_samples_chunk, n_samples)\n The distance matrix.\n\n start : int\n The index in X which the first row of dist corresponds to.\n\n n_neighbors : int\n Number of neighbors required for each sample.\n\n return_distance : bool\n Whether or not to return the distances.\n\n Returns\n -------\n dist : array of shape (n_samples_chunk, n_neighbors)\n Returned only if `return_distance=True`.\n\n neigh : array of shape (n_samples_chunk, n_neighbors)\n The neighbors indices.\n ", "source_code": "\ndef _kneighbors_reduce_func(self, dist, start, n_neighbors, return_distance):\n \"\"\"Reduce a chunk of distances to the nearest neighbors.\n\n Callback to :func:`sklearn.metrics.pairwise.pairwise_distances_chunked`\n\n Parameters\n ----------\n dist : ndarray of shape (n_samples_chunk, n_samples)\n The distance matrix.\n\n start : int\n The index in X which the first row of dist corresponds to.\n\n n_neighbors : int\n Number of neighbors required for each sample.\n\n return_distance : bool\n Whether or not to return the distances.\n\n Returns\n -------\n dist : array of shape (n_samples_chunk, n_neighbors)\n Returned only if `return_distance=True`.\n\n neigh : array of shape (n_samples_chunk, n_neighbors)\n The neighbors indices.\n \"\"\"\n sample_range = np.arange(dist.shape[0])[:, None]\n neigh_ind = np.argpartition(dist, n_neighbors - 1, axis=1)\n neigh_ind = neigh_ind[:, :n_neighbors]\n neigh_ind = neigh_ind[sample_range, np.argsort(dist[sample_range, neigh_ind])]\n if return_distance:\n if self.effective_metric_ == 'euclidean':\n result = (np.sqrt(dist[sample_range, neigh_ind]), neigh_ind)\n else:\n result = (dist[sample_range, neigh_ind], neigh_ind)\n else:\n result = neigh_ind\n return result" }, { @@ -140575,7 +151288,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -140585,7 +151299,8 @@ "docstring": { "type": "array-like, shape (n_queries, n_features), or (n_queries, n_indexed) if metric == 'precomputed', default=None", "description": "The query point or points.\nIf not provided, neighbors of each indexed point are returned.\nIn this case, the query point is not considered its own neighbor." - } + }, + "refined_type": {} }, { "name": "n_neighbors", @@ -140595,7 +151310,8 @@ "docstring": { "type": "int, default=None", "description": "Number of neighbors required for each sample. The default is the\nvalue passed to the constructor." 
- } + }, + "refined_type": {} }, { "name": "return_distance", @@ -140605,13 +151321,14 @@ "docstring": { "type": "bool, default=True", "description": "Whether or not to return the distances." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Find the K-neighbors of a point.\n\nReturns indices of and distances to the neighbors of each point.", - "docstring": "Find the K-neighbors of a point.\n\nReturns indices of and distances to the neighbors of each point.\n\nParameters\n----------\nX : array-like, shape (n_queries, n_features), or (n_queries, n_indexed) if metric == 'precomputed', default=None\n The query point or points.\n If not provided, neighbors of each indexed point are returned.\n In this case, the query point is not considered its own neighbor.\n\nn_neighbors : int, default=None\n Number of neighbors required for each sample. The default is the\n value passed to the constructor.\n\nreturn_distance : bool, default=True\n Whether or not to return the distances.\n\nReturns\n-------\nneigh_dist : ndarray of shape (n_queries, n_neighbors)\n Array representing the lengths to points, only present if\n return_distance=True.\n\nneigh_ind : ndarray of shape (n_queries, n_neighbors)\n Indices of the nearest points in the population matrix.\n\nExamples\n--------\nIn the following example, we construct a NearestNeighbors\nclass from an array representing our data set and ask who's\nthe closest point to [1,1,1]\n\n>>> samples = [[0., 0., 0.], [0., .5, 0.], [1., 1., .5]]\n>>> from sklearn.neighbors import NearestNeighbors\n>>> neigh = NearestNeighbors(n_neighbors=1)\n>>> neigh.fit(samples)\nNearestNeighbors(n_neighbors=1)\n>>> print(neigh.kneighbors([[1., 1., 1.]]))\n(array([[0.5]]), array([[2]]))\n\nAs you can see, it returns [[0.5]], and [[2]], which means that the\nelement is at distance 0.5 and is the third element of samples\n(indexes start at 0). You can also query for multiple points:\n\n>>> X = [[0., 1., 0.], [1., 0., 1.]]\n>>> neigh.kneighbors(X, return_distance=False)\narray([[1],\n [2]]...)", + "docstring": "Find the K-neighbors of a point.\n\n Returns indices of and distances to the neighbors of each point.\n\n Parameters\n ----------\n X : array-like, shape (n_queries, n_features), or (n_queries, n_indexed) if metric == 'precomputed', default=None\n The query point or points.\n If not provided, neighbors of each indexed point are returned.\n In this case, the query point is not considered its own neighbor.\n\n n_neighbors : int, default=None\n Number of neighbors required for each sample. 
The default is the\n value passed to the constructor.\n\n return_distance : bool, default=True\n Whether or not to return the distances.\n\n Returns\n -------\n neigh_dist : ndarray of shape (n_queries, n_neighbors)\n Array representing the lengths to points, only present if\n return_distance=True.\n\n neigh_ind : ndarray of shape (n_queries, n_neighbors)\n Indices of the nearest points in the population matrix.\n\n Examples\n --------\n In the following example, we construct a NearestNeighbors\n class from an array representing our data set and ask who's\n the closest point to [1,1,1]\n\n >>> samples = [[0., 0., 0.], [0., .5, 0.], [1., 1., .5]]\n >>> from sklearn.neighbors import NearestNeighbors\n >>> neigh = NearestNeighbors(n_neighbors=1)\n >>> neigh.fit(samples)\n NearestNeighbors(n_neighbors=1)\n >>> print(neigh.kneighbors([[1., 1., 1.]]))\n (array([[0.5]]), array([[2]]))\n\n As you can see, it returns [[0.5]], and [[2]], which means that the\n element is at distance 0.5 and is the third element of samples\n (indexes start at 0). You can also query for multiple points:\n\n >>> X = [[0., 1., 0.], [1., 0., 1.]]\n >>> neigh.kneighbors(X, return_distance=False)\n array([[1],\n [2]]...)\n ", "source_code": "\ndef kneighbors(self, X=None, n_neighbors=None, return_distance=True):\n \"\"\"Find the K-neighbors of a point.\n\n Returns indices of and distances to the neighbors of each point.\n\n Parameters\n ----------\n X : array-like, shape (n_queries, n_features), or (n_queries, n_indexed) if metric == 'precomputed', default=None\n The query point or points.\n If not provided, neighbors of each indexed point are returned.\n In this case, the query point is not considered its own neighbor.\n\n n_neighbors : int, default=None\n Number of neighbors required for each sample. The default is the\n value passed to the constructor.\n\n return_distance : bool, default=True\n Whether or not to return the distances.\n\n Returns\n -------\n neigh_dist : ndarray of shape (n_queries, n_neighbors)\n Array representing the lengths to points, only present if\n return_distance=True.\n\n neigh_ind : ndarray of shape (n_queries, n_neighbors)\n Indices of the nearest points in the population matrix.\n\n Examples\n --------\n In the following example, we construct a NearestNeighbors\n class from an array representing our data set and ask who's\n the closest point to [1,1,1]\n\n >>> samples = [[0., 0., 0.], [0., .5, 0.], [1., 1., .5]]\n >>> from sklearn.neighbors import NearestNeighbors\n >>> neigh = NearestNeighbors(n_neighbors=1)\n >>> neigh.fit(samples)\n NearestNeighbors(n_neighbors=1)\n >>> print(neigh.kneighbors([[1., 1., 1.]]))\n (array([[0.5]]), array([[2]]))\n\n As you can see, it returns [[0.5]], and [[2]], which means that the\n element is at distance 0.5 and is the third element of samples\n (indexes start at 0). You can also query for multiple points:\n\n >>> X = [[0., 1., 0.], [1., 0., 1.]]\n >>> neigh.kneighbors(X, return_distance=False)\n array([[1],\n [2]]...)\n \"\"\"\n check_is_fitted(self)\n if n_neighbors is None:\n n_neighbors = self.n_neighbors\n elif n_neighbors <= 0:\n raise ValueError('Expected n_neighbors > 0. 
Got %d' % n_neighbors)\n elif not isinstance(n_neighbors, numbers.Integral):\n raise TypeError('n_neighbors does not take %s value, enter integer value' % type(n_neighbors))\n if X is not None:\n query_is_train = False\n if self.metric == 'precomputed':\n X = _check_precomputed(X)\n else:\n X = self._validate_data(X, accept_sparse='csr', reset=False)\n else:\n query_is_train = True\n X = self._fit_X\n n_neighbors += 1\n n_samples_fit = self.n_samples_fit_\n if n_neighbors > n_samples_fit:\n raise ValueError('Expected n_neighbors <= n_samples, but n_samples = %d, n_neighbors = %d' % (n_samples_fit, n_neighbors))\n n_jobs = effective_n_jobs(self.n_jobs)\n chunked_results = None\n if self._fit_method == 'brute' and self.metric == 'precomputed' and issparse(X):\n results = _kneighbors_from_graph(X, n_neighbors=n_neighbors, return_distance=return_distance)\n elif self._fit_method == 'brute':\n reduce_func = partial(self._kneighbors_reduce_func, n_neighbors=n_neighbors, return_distance=return_distance)\n if self.effective_metric_ == 'euclidean':\n kwds = {'squared': True}\n else:\n kwds = self.effective_metric_params_\n chunked_results = list(pairwise_distances_chunked(X, self._fit_X, reduce_func=reduce_func, metric=self.effective_metric_, n_jobs=n_jobs, **kwds))\n elif self._fit_method in ['ball_tree', 'kd_tree']:\n if issparse(X):\n raise ValueError(\"%s does not work with sparse matrices. Densify the data, or set algorithm='brute'\" % self._fit_method)\n old_joblib = parse_version(joblib.__version__) < parse_version('0.12')\n if old_joblib:\n parallel_kwargs = {'backend': 'threading'}\n else:\n parallel_kwargs = {'prefer': 'threads'}\n chunked_results = Parallel(n_jobs, **parallel_kwargs)((delayed(_tree_query_parallel_helper)(self._tree, X[s], n_neighbors, return_distance) for s in gen_even_slices(X.shape[0], n_jobs)))\n else:\n raise ValueError('internal: _fit_method not recognized')\n if chunked_results is not None:\n if return_distance:\n (neigh_dist, neigh_ind) = zip(*chunked_results)\n results = (np.vstack(neigh_dist), np.vstack(neigh_ind))\n else:\n results = np.vstack(chunked_results)\n if not query_is_train:\n return results\n else:\n if return_distance:\n (neigh_dist, neigh_ind) = results\n else:\n neigh_ind = results\n (n_queries, _) = X.shape\n sample_range = np.arange(n_queries)[:, None]\n sample_mask = neigh_ind != sample_range\n dup_gr_nbrs = np.all(sample_mask, axis=1)\n sample_mask[:, 0][dup_gr_nbrs] = False\n neigh_ind = np.reshape(neigh_ind[sample_mask], (n_queries, n_neighbors - 1))\n if return_distance:\n neigh_dist = np.reshape(neigh_dist[sample_mask], (n_queries, n_neighbors - 1))\n return neigh_dist, neigh_ind\n return neigh_ind" }, { @@ -140629,7 +151346,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -140639,7 +151357,8 @@ "docstring": { "type": "array-like of shape (n_queries, n_features), or (n_queries, n_indexed) if metric == 'precomputed', default=None", "description": "The query point or points.\nIf not provided, neighbors of each indexed point are returned.\nIn this case, the query point is not considered its own neighbor.\nFor ``metric='precomputed'`` the shape should be\n(n_queries, n_indexed). Otherwise the shape should be\n(n_queries, n_features)." - } + }, + "refined_type": {} }, { "name": "n_neighbors", @@ -140649,7 +151368,8 @@ "docstring": { "type": "int, default=None", "description": "Number of neighbors for each sample. The default is the value\npassed to the constructor." 
- } + }, + "refined_type": {} }, { "name": "mode", @@ -140659,13 +151379,17 @@ "docstring": { "type": "{'connectivity', 'distance'}, default='connectivity'", "description": "Type of returned matrix: 'connectivity' will return the\nconnectivity matrix with ones and zeros, in 'distance' the\nedges are distances between points, type of distance\ndepends on the selected metric parameter in\nNearestNeighbors class." + }, + "refined_type": { + "kind": "EnumType", + "values": ["distance", "connectivity"] } } ], "results": [], "is_public": false, "description": "Compute the (weighted) graph of k-Neighbors for points in X.", - "docstring": "Compute the (weighted) graph of k-Neighbors for points in X.\n\nParameters\n----------\nX : array-like of shape (n_queries, n_features), or (n_queries, n_indexed) if metric == 'precomputed', default=None\n The query point or points.\n If not provided, neighbors of each indexed point are returned.\n In this case, the query point is not considered its own neighbor.\n For ``metric='precomputed'`` the shape should be\n (n_queries, n_indexed). Otherwise the shape should be\n (n_queries, n_features).\n\nn_neighbors : int, default=None\n Number of neighbors for each sample. The default is the value\n passed to the constructor.\n\nmode : {'connectivity', 'distance'}, default='connectivity'\n Type of returned matrix: 'connectivity' will return the\n connectivity matrix with ones and zeros, in 'distance' the\n edges are distances between points, type of distance\n depends on the selected metric parameter in\n NearestNeighbors class.\n\nReturns\n-------\nA : sparse-matrix of shape (n_queries, n_samples_fit)\n `n_samples_fit` is the number of samples in the fitted data.\n `A[i, j]` gives the weight of the edge connecting `i` to `j`.\n The matrix is of CSR format.\n\nSee Also\n--------\nNearestNeighbors.radius_neighbors_graph : Compute the (weighted) graph\n of Neighbors for points in X.\n\nExamples\n--------\n>>> X = [[0], [3], [1]]\n>>> from sklearn.neighbors import NearestNeighbors\n>>> neigh = NearestNeighbors(n_neighbors=2)\n>>> neigh.fit(X)\nNearestNeighbors(n_neighbors=2)\n>>> A = neigh.kneighbors_graph(X)\n>>> A.toarray()\narray([[1., 0., 1.],\n [0., 1., 1.],\n [1., 0., 1.]])", + "docstring": "Compute the (weighted) graph of k-Neighbors for points in X.\n\n Parameters\n ----------\n X : array-like of shape (n_queries, n_features), or (n_queries, n_indexed) if metric == 'precomputed', default=None\n The query point or points.\n If not provided, neighbors of each indexed point are returned.\n In this case, the query point is not considered its own neighbor.\n For ``metric='precomputed'`` the shape should be\n (n_queries, n_indexed). Otherwise the shape should be\n (n_queries, n_features).\n\n n_neighbors : int, default=None\n Number of neighbors for each sample. 
The default is the value\n passed to the constructor.\n\n mode : {'connectivity', 'distance'}, default='connectivity'\n Type of returned matrix: 'connectivity' will return the\n connectivity matrix with ones and zeros, in 'distance' the\n edges are distances between points, type of distance\n depends on the selected metric parameter in\n NearestNeighbors class.\n\n Returns\n -------\n A : sparse-matrix of shape (n_queries, n_samples_fit)\n `n_samples_fit` is the number of samples in the fitted data.\n `A[i, j]` gives the weight of the edge connecting `i` to `j`.\n The matrix is of CSR format.\n\n See Also\n --------\n NearestNeighbors.radius_neighbors_graph : Compute the (weighted) graph\n of Neighbors for points in X.\n\n Examples\n --------\n >>> X = [[0], [3], [1]]\n >>> from sklearn.neighbors import NearestNeighbors\n >>> neigh = NearestNeighbors(n_neighbors=2)\n >>> neigh.fit(X)\n NearestNeighbors(n_neighbors=2)\n >>> A = neigh.kneighbors_graph(X)\n >>> A.toarray()\n array([[1., 0., 1.],\n [0., 1., 1.],\n [1., 0., 1.]])\n ", "source_code": "\ndef kneighbors_graph(self, X=None, n_neighbors=None, mode='connectivity'):\n \"\"\"Compute the (weighted) graph of k-Neighbors for points in X.\n\n Parameters\n ----------\n X : array-like of shape (n_queries, n_features), or (n_queries, n_indexed) if metric == 'precomputed', default=None\n The query point or points.\n If not provided, neighbors of each indexed point are returned.\n In this case, the query point is not considered its own neighbor.\n For ``metric='precomputed'`` the shape should be\n (n_queries, n_indexed). Otherwise the shape should be\n (n_queries, n_features).\n\n n_neighbors : int, default=None\n Number of neighbors for each sample. The default is the value\n passed to the constructor.\n\n mode : {'connectivity', 'distance'}, default='connectivity'\n Type of returned matrix: 'connectivity' will return the\n connectivity matrix with ones and zeros, in 'distance' the\n edges are distances between points, type of distance\n depends on the selected metric parameter in\n NearestNeighbors class.\n\n Returns\n -------\n A : sparse-matrix of shape (n_queries, n_samples_fit)\n `n_samples_fit` is the number of samples in the fitted data.\n `A[i, j]` gives the weight of the edge connecting `i` to `j`.\n The matrix is of CSR format.\n\n See Also\n --------\n NearestNeighbors.radius_neighbors_graph : Compute the (weighted) graph\n of Neighbors for points in X.\n\n Examples\n --------\n >>> X = [[0], [3], [1]]\n >>> from sklearn.neighbors import NearestNeighbors\n >>> neigh = NearestNeighbors(n_neighbors=2)\n >>> neigh.fit(X)\n NearestNeighbors(n_neighbors=2)\n >>> A = neigh.kneighbors_graph(X)\n >>> A.toarray()\n array([[1., 0., 1.],\n [0., 1., 1.],\n [1., 0., 1.]])\n \"\"\"\n check_is_fitted(self)\n if n_neighbors is None:\n n_neighbors = self.n_neighbors\n if mode == 'connectivity':\n A_ind = self.kneighbors(X, n_neighbors, return_distance=False)\n n_queries = A_ind.shape[0]\n A_data = np.ones(n_queries * n_neighbors)\n elif mode == 'distance':\n (A_data, A_ind) = self.kneighbors(X, n_neighbors, return_distance=True)\n A_data = np.ravel(A_data)\n else:\n raise ValueError('Unsupported mode, must be one of \"connectivity\" or \"distance\" but got \"%s\" instead' % mode)\n n_queries = A_ind.shape[0]\n n_samples_fit = self.n_samples_fit_\n n_nonzero = n_queries * n_neighbors\n A_indptr = np.arange(0, n_nonzero + 1, n_neighbors)\n kneighbors_graph = csr_matrix((A_data, A_ind.ravel(), A_indptr), shape=(n_queries, n_samples_fit))\n return 
kneighbors_graph" }, { @@ -140683,7 +151407,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_neighbors", @@ -140693,7 +151418,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "radius", @@ -140703,7 +151429,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "algorithm", @@ -140713,7 +151440,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "leaf_size", @@ -140723,7 +151451,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "metric", @@ -140733,7 +151462,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "p", @@ -140743,7 +151473,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "metric_params", @@ -140753,7 +151484,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_jobs", @@ -140763,13 +151495,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@abstractmethod\ndef __init__(self, n_neighbors=None, radius=None, algorithm='auto', leaf_size=30, metric='minkowski', p=2, metric_params=None, n_jobs=None):\n self.n_neighbors = n_neighbors\n self.radius = radius\n self.algorithm = algorithm\n self.leaf_size = leaf_size\n self.metric = metric\n self.metric_params = metric_params\n self.p = p\n self.n_jobs = n_jobs" }, { @@ -140787,13 +151520,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _check_algorithm_metric(self):\n if self.algorithm not in ['auto', 'brute', 'kd_tree', 'ball_tree']:\n raise ValueError(\"unrecognized algorithm: '%s'\" % self.algorithm)\n if self.algorithm == 'auto':\n if self.metric == 'precomputed':\n alg_check = 'brute'\n elif callable(self.metric) or self.metric in VALID_METRICS['ball_tree']:\n alg_check = 'ball_tree'\n else:\n alg_check = 'brute'\n else:\n alg_check = self.algorithm\n if callable(self.metric):\n if self.algorithm == 'kd_tree':\n raise ValueError(\"kd_tree does not support callable metric '%s'Function call overhead will resultin very poor performance.\" % self.metric)\n elif self.metric not in VALID_METRICS[alg_check]:\n raise ValueError(\"Metric '%s' not valid. Use sorted(sklearn.neighbors.VALID_METRICS['%s']) to get valid options. Metric can also be a callable function.\" % (self.metric, alg_check))\n if self.metric_params is not None and 'p' in self.metric_params:\n if self.p is not None:\n warnings.warn('Parameter p is found in metric_params. 
The corresponding parameter from __init__ is ignored.', SyntaxWarning, stacklevel=3)\n effective_p = self.metric_params['p']\n else:\n effective_p = self.p\n if self.metric in ['wminkowski', 'minkowski'] and effective_p < 1:\n raise ValueError('p must be greater or equal to one for minkowski metric')" }, { @@ -140811,7 +151545,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -140821,7 +151556,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -140831,14 +151567,15 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", - "source_code": "\ndef _fit(self, X, y=None):\n if self._get_tags()['requires_y']:\n if not isinstance(X, (KDTree, BallTree, NeighborsBase)):\n (X, y) = self._validate_data(X, y, accept_sparse='csr', multi_output=True)\n if is_classifier(self):\n if y.ndim == 1 or y.ndim == 2 and y.shape[1] == 1:\n if y.ndim != 1:\n warnings.warn('A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().', DataConversionWarning, stacklevel=2)\n self.outputs_2d_ = False\n y = y.reshape((-1, 1))\n else:\n self.outputs_2d_ = True\n check_classification_targets(y)\n self.classes_ = []\n self._y = np.empty(y.shape, dtype=int)\n for k in range(self._y.shape[1]):\n (classes, self._y[:, k]) = np.unique(y[:, k], return_inverse=True)\n self.classes_.append(classes)\n if not self.outputs_2d_:\n self.classes_ = self.classes_[0]\n self._y = self._y.ravel()\n else:\n self._y = y\n elif not isinstance(X, (KDTree, BallTree, NeighborsBase)):\n X = self._validate_data(X, accept_sparse='csr')\n self._check_algorithm_metric()\n if self.metric_params is None:\n self.effective_metric_params_ = {}\n else:\n self.effective_metric_params_ = self.metric_params.copy()\n effective_p = self.effective_metric_params_.get('p', self.p)\n if self.metric in ['wminkowski', 'minkowski']:\n self.effective_metric_params_['p'] = effective_p\n self.effective_metric_ = self.metric\n if self.metric == 'minkowski':\n p = self.effective_metric_params_.pop('p', 2)\n if p < 1:\n raise ValueError('p must be greater or equal to one for minkowski metric')\n elif p == 1:\n self.effective_metric_ = 'manhattan'\n elif p == 2:\n self.effective_metric_ = 'euclidean'\n elif p == np.inf:\n self.effective_metric_ = 'chebyshev'\n else:\n self.effective_metric_params_['p'] = p\n if isinstance(X, NeighborsBase):\n self._fit_X = X._fit_X\n self._tree = X._tree\n self._fit_method = X._fit_method\n self.n_samples_fit_ = X.n_samples_fit_\n return self\n elif isinstance(X, BallTree):\n self._fit_X = X.data\n self._tree = X\n self._fit_method = 'ball_tree'\n self.n_samples_fit_ = X.data.shape[0]\n return self\n elif isinstance(X, KDTree):\n self._fit_X = X.data\n self._tree = X\n self._fit_method = 'kd_tree'\n self.n_samples_fit_ = X.data.shape[0]\n return self\n if self.metric == 'precomputed':\n X = _check_precomputed(X)\n if X.shape[0] != X.shape[1]:\n raise ValueError('Precomputed matrix must be square. 
Input is a {}x{} matrix.'.format(X.shape[0], X.shape[1]))\n self.n_features_in_ = X.shape[1]\n n_samples = X.shape[0]\n if n_samples == 0:\n raise ValueError('n_samples must be greater than 0')\n if issparse(X):\n if self.algorithm not in ('auto', 'brute'):\n warnings.warn('cannot use tree with sparse input: using brute force')\n if self.effective_metric_ not in VALID_METRICS_SPARSE['brute'] and not callable(self.effective_metric_):\n raise ValueError(\"Metric '%s' not valid for sparse input. Use sorted(sklearn.neighbors.VALID_METRICS_SPARSE['brute']) to get valid options. Metric can also be a callable function.\" % self.effective_metric_)\n self._fit_X = X.copy()\n self._tree = None\n self._fit_method = 'brute'\n self.n_samples_fit_ = X.shape[0]\n return self\n self._fit_method = self.algorithm\n self._fit_X = X\n self.n_samples_fit_ = X.shape[0]\n if self._fit_method == 'auto':\n if self.metric == 'precomputed' or self._fit_X.shape[1] > 15 or self.n_neighbors is not None and self.n_neighbors >= self._fit_X.shape[0] // 2:\n self._fit_method = 'brute'\n elif self.effective_metric_ in VALID_METRICS['kd_tree']:\n self._fit_method = 'kd_tree'\n elif callable(self.effective_metric_) or self.effective_metric_ in VALID_METRICS['ball_tree']:\n self._fit_method = 'ball_tree'\n else:\n self._fit_method = 'brute'\n if self._fit_method == 'ball_tree':\n self._tree = BallTree(X, self.leaf_size, metric=self.effective_metric_, **self.effective_metric_params_)\n elif self._fit_method == 'kd_tree':\n self._tree = KDTree(X, self.leaf_size, metric=self.effective_metric_, **self.effective_metric_params_)\n elif self._fit_method == 'brute':\n self._tree = None\n else:\n raise ValueError(\"algorithm = '%s' not recognized\" % self.algorithm)\n if self.n_neighbors is not None:\n if self.n_neighbors <= 0:\n raise ValueError('Expected n_neighbors > 0. Got %d' % self.n_neighbors)\n elif not isinstance(self.n_neighbors, numbers.Integral):\n raise TypeError('n_neighbors does not take %s value, enter integer value' % type(self.n_neighbors))\n return self" + "docstring": null, + "source_code": "\ndef _fit(self, X, y=None):\n if self._get_tags()['requires_y']:\n if not isinstance(X, (KDTree, BallTree, NeighborsBase)):\n (X, y) = self._validate_data(X, y, accept_sparse='csr', multi_output=True)\n if is_classifier(self):\n if y.ndim == 1 or y.ndim == 2 and y.shape[1] == 1:\n if y.ndim != 1:\n warnings.warn('A column-vector y was passed when a 1d array was expected. 
Please change the shape of y to (n_samples,), for example using ravel().', DataConversionWarning, stacklevel=2)\n self.outputs_2d_ = False\n y = y.reshape((-1, 1))\n else:\n self.outputs_2d_ = True\n check_classification_targets(y)\n self.classes_ = []\n self._y = np.empty(y.shape, dtype=int)\n for k in range(self._y.shape[1]):\n (classes, self._y[:, k]) = np.unique(y[:, k], return_inverse=True)\n self.classes_.append(classes)\n if not self.outputs_2d_:\n self.classes_ = self.classes_[0]\n self._y = self._y.ravel()\n else:\n self._y = y\n elif not isinstance(X, (KDTree, BallTree, NeighborsBase)):\n X = self._validate_data(X, accept_sparse='csr')\n self._check_algorithm_metric()\n if self.metric_params is None:\n self.effective_metric_params_ = {}\n else:\n self.effective_metric_params_ = self.metric_params.copy()\n effective_p = self.effective_metric_params_.get('p', self.p)\n if self.metric in ['wminkowski', 'minkowski']:\n self.effective_metric_params_['p'] = effective_p\n self.effective_metric_ = self.metric\n if self.metric == 'minkowski':\n p = self.effective_metric_params_.pop('p', 2)\n w = self.effective_metric_params_.pop('w', None)\n if p < 1:\n raise ValueError('p must be greater or equal to one for minkowski metric')\n elif p == 1 and w is None:\n self.effective_metric_ = 'manhattan'\n elif p == 2 and w is None:\n self.effective_metric_ = 'euclidean'\n elif p == np.inf and w is None:\n self.effective_metric_ = 'chebyshev'\n else:\n self.effective_metric_params_['p'] = p\n self.effective_metric_params_['w'] = w\n if isinstance(X, NeighborsBase):\n self._fit_X = X._fit_X\n self._tree = X._tree\n self._fit_method = X._fit_method\n self.n_samples_fit_ = X.n_samples_fit_\n return self\n elif isinstance(X, BallTree):\n self._fit_X = X.data\n self._tree = X\n self._fit_method = 'ball_tree'\n self.n_samples_fit_ = X.data.shape[0]\n return self\n elif isinstance(X, KDTree):\n self._fit_X = X.data\n self._tree = X\n self._fit_method = 'kd_tree'\n self.n_samples_fit_ = X.data.shape[0]\n return self\n if self.metric == 'precomputed':\n X = _check_precomputed(X)\n if X.shape[0] != X.shape[1]:\n raise ValueError('Precomputed matrix must be square. Input is a {}x{} matrix.'.format(X.shape[0], X.shape[1]))\n self.n_features_in_ = X.shape[1]\n n_samples = X.shape[0]\n if n_samples == 0:\n raise ValueError('n_samples must be greater than 0')\n if issparse(X):\n if self.algorithm not in ('auto', 'brute'):\n warnings.warn('cannot use tree with sparse input: using brute force')\n if self.effective_metric_ not in VALID_METRICS_SPARSE['brute'] and not callable(self.effective_metric_):\n raise ValueError(\"Metric '%s' not valid for sparse input. Use sorted(sklearn.neighbors.VALID_METRICS_SPARSE['brute']) to get valid options. 
Metric can also be a callable function.\" % self.effective_metric_)\n self._fit_X = X.copy()\n self._tree = None\n self._fit_method = 'brute'\n self.n_samples_fit_ = X.shape[0]\n return self\n self._fit_method = self.algorithm\n self._fit_X = X\n self.n_samples_fit_ = X.shape[0]\n if self._fit_method == 'auto':\n if self.metric == 'precomputed' or self._fit_X.shape[1] > 15 or self.n_neighbors is not None and self.n_neighbors >= self._fit_X.shape[0] // 2:\n self._fit_method = 'brute'\n elif self.effective_metric_ in VALID_METRICS['kd_tree']:\n self._fit_method = 'kd_tree'\n elif callable(self.effective_metric_) or self.effective_metric_ in VALID_METRICS['ball_tree']:\n self._fit_method = 'ball_tree'\n else:\n self._fit_method = 'brute'\n if self._fit_method == 'ball_tree':\n self._tree = BallTree(X, self.leaf_size, metric=self.effective_metric_, **self.effective_metric_params_)\n elif self._fit_method == 'kd_tree':\n self._tree = KDTree(X, self.leaf_size, metric=self.effective_metric_, **self.effective_metric_params_)\n elif self._fit_method == 'brute':\n self._tree = None\n else:\n raise ValueError(\"algorithm = '%s' not recognized\" % self.algorithm)\n if self.n_neighbors is not None:\n if self.n_neighbors <= 0:\n raise ValueError('Expected n_neighbors > 0. Got %d' % self.n_neighbors)\n elif not isinstance(self.n_neighbors, numbers.Integral):\n raise TypeError('n_neighbors does not take %s value, enter integer value' % type(self.n_neighbors))\n return self" }, { "name": "_more_tags", @@ -140855,13 +151592,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _more_tags(self):\n return {'pairwise': self.metric == 'precomputed'}" }, { @@ -140882,13 +151620,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@deprecated('Attribute `_pairwise` was deprecated in version 0.24 and will be removed in 1.1 (renaming of 0.26).')\n@property\ndef _pairwise(self):\n return self.metric == 'precomputed'" }, { @@ -140906,7 +151645,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "dist", @@ -140916,7 +151656,8 @@ "docstring": { "type": "ndarray of shape (n_samples_chunk, n_samples)", "description": "The distance matrix." - } + }, + "refined_type": {} }, { "name": "start", @@ -140926,7 +151667,8 @@ "docstring": { "type": "int", "description": "The index in X which the first row of dist corresponds to." - } + }, + "refined_type": {} }, { "name": "radius", @@ -140936,7 +151678,8 @@ "docstring": { "type": "float", "description": "The radius considered when making the nearest neighbors search." - } + }, + "refined_type": {} }, { "name": "return_distance", @@ -140946,13 +151689,14 @@ "docstring": { "type": "bool", "description": "Whether or not to return the distances." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Reduce a chunk of distances to the nearest neighbors.\n\nCallback to :func:`sklearn.metrics.pairwise.pairwise_distances_chunked`", - "docstring": "Reduce a chunk of distances to the nearest neighbors.\n\nCallback to :func:`sklearn.metrics.pairwise.pairwise_distances_chunked`\n\nParameters\n----------\ndist : ndarray of shape (n_samples_chunk, n_samples)\n The distance matrix.\n\nstart : int\n The index in X which the first row of dist corresponds to.\n\nradius : float\n The radius considered when making the nearest neighbors search.\n\nreturn_distance : bool\n Whether or not to return the distances.\n\nReturns\n-------\ndist : list of ndarray of shape (n_samples_chunk,)\n Returned only if `return_distance=True`.\n\nneigh : list of ndarray of shape (n_samples_chunk,)\n The neighbors indices.", + "docstring": "Reduce a chunk of distances to the nearest neighbors.\n\n Callback to :func:`sklearn.metrics.pairwise.pairwise_distances_chunked`\n\n Parameters\n ----------\n dist : ndarray of shape (n_samples_chunk, n_samples)\n The distance matrix.\n\n start : int\n The index in X which the first row of dist corresponds to.\n\n radius : float\n The radius considered when making the nearest neighbors search.\n\n return_distance : bool\n Whether or not to return the distances.\n\n Returns\n -------\n dist : list of ndarray of shape (n_samples_chunk,)\n Returned only if `return_distance=True`.\n\n neigh : list of ndarray of shape (n_samples_chunk,)\n The neighbors indices.\n ", "source_code": "\ndef _radius_neighbors_reduce_func(self, dist, start, radius, return_distance):\n \"\"\"Reduce a chunk of distances to the nearest neighbors.\n\n Callback to :func:`sklearn.metrics.pairwise.pairwise_distances_chunked`\n\n Parameters\n ----------\n dist : ndarray of shape (n_samples_chunk, n_samples)\n The distance matrix.\n\n start : int\n The index in X which the first row of dist corresponds to.\n\n radius : float\n The radius considered when making the nearest neighbors search.\n\n return_distance : bool\n Whether or not to return the distances.\n\n Returns\n -------\n dist : list of ndarray of shape (n_samples_chunk,)\n Returned only if `return_distance=True`.\n\n neigh : list of ndarray of shape (n_samples_chunk,)\n The neighbors indices.\n \"\"\"\n neigh_ind = [np.where(d <= radius)[0] for d in dist]\n if return_distance:\n if self.effective_metric_ == 'euclidean':\n dist = [np.sqrt(d[neigh_ind[i]]) for (i, d) in enumerate(dist)]\n else:\n dist = [d[neigh_ind[i]] for (i, d) in enumerate(dist)]\n results = (dist, neigh_ind)\n else:\n results = neigh_ind\n return results" }, { @@ -140970,7 +151714,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -140980,7 +151725,8 @@ "docstring": { "type": "array-like of (n_samples, n_features), default=None", "description": "The query point or points.\nIf not provided, neighbors of each indexed point are returned.\nIn this case, the query point is not considered its own neighbor." - } + }, + "refined_type": {} }, { "name": "radius", @@ -140990,7 +151736,8 @@ "docstring": { "type": "float, default=None", "description": "Limiting distance of neighbors to return. The default is the value\npassed to the constructor." - } + }, + "refined_type": {} }, { "name": "return_distance", @@ -141000,7 +151747,8 @@ "docstring": { "type": "bool, default=True", "description": "Whether or not to return the distances." 
- } + }, + "refined_type": {} }, { "name": "sort_results", @@ -141010,13 +151758,14 @@ "docstring": { "type": "bool, default=False", "description": "If True, the distances and indices will be sorted by increasing\ndistances before being returned. If False, the results may not\nbe sorted. If `return_distance=False`, setting `sort_results=True`\nwill result in an error.\n\n.. versionadded:: 0.22" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Find the neighbors within a given radius of a point or points.\n\nReturn the indices and distances of each point from the dataset lying in a ball with size ``radius`` around the points of the query array. Points lying on the boundary are included in the results. The result points are *not* necessarily sorted by distance to their query point.", - "docstring": "Find the neighbors within a given radius of a point or points.\n\nReturn the indices and distances of each point from the dataset\nlying in a ball with size ``radius`` around the points of the query\narray. Points lying on the boundary are included in the results.\n\nThe result points are *not* necessarily sorted by distance to their\nquery point.\n\nParameters\n----------\nX : array-like of (n_samples, n_features), default=None\n The query point or points.\n If not provided, neighbors of each indexed point are returned.\n In this case, the query point is not considered its own neighbor.\n\nradius : float, default=None\n Limiting distance of neighbors to return. The default is the value\n passed to the constructor.\n\nreturn_distance : bool, default=True\n Whether or not to return the distances.\n\nsort_results : bool, default=False\n If True, the distances and indices will be sorted by increasing\n distances before being returned. If False, the results may not\n be sorted. If `return_distance=False`, setting `sort_results=True`\n will result in an error.\n\n .. versionadded:: 0.22\n\nReturns\n-------\nneigh_dist : ndarray of shape (n_samples,) of arrays\n Array representing the distances to each point, only present if\n `return_distance=True`. The distance values are computed according\n to the ``metric`` constructor parameter.\n\nneigh_ind : ndarray of shape (n_samples,) of arrays\n An array of arrays of indices of the approximate nearest points\n from the population matrix that lie within a ball of size\n ``radius`` around the query points.\n\nNotes\n-----\nBecause the number of neighbors of each point is not necessarily\nequal, the results for multiple query points cannot be fit in a\nstandard data array.\nFor efficiency, `radius_neighbors` returns arrays of objects, where\neach object is a 1D array of indices or distances.\n\nExamples\n--------\nIn the following example, we construct a NeighborsClassifier\nclass from an array representing our data set and ask who's\nthe closest point to [1, 1, 1]:\n\n>>> import numpy as np\n>>> samples = [[0., 0., 0.], [0., .5, 0.], [1., 1., .5]]\n>>> from sklearn.neighbors import NearestNeighbors\n>>> neigh = NearestNeighbors(radius=1.6)\n>>> neigh.fit(samples)\nNearestNeighbors(radius=1.6)\n>>> rng = neigh.radius_neighbors([[1., 1., 1.]])\n>>> print(np.asarray(rng[0][0]))\n[1.5 0.5]\n>>> print(np.asarray(rng[1][0]))\n[1 2]\n\nThe first array returned contains the distances to all points which\nare closer than 1.6, while the second array returned contains their\nindices. 
In general, multiple points can be queried at the same time.", + "description": "Find the neighbors within a given radius of a point or points.\n\nReturn the indices and distances of each point from the dataset\nlying in a ball with size ``radius`` around the points of the query\narray. Points lying on the boundary are included in the results.\n\nThe result points are *not* necessarily sorted by distance to their\nquery point.", + "docstring": "Find the neighbors within a given radius of a point or points.\n\n Return the indices and distances of each point from the dataset\n lying in a ball with size ``radius`` around the points of the query\n array. Points lying on the boundary are included in the results.\n\n The result points are *not* necessarily sorted by distance to their\n query point.\n\n Parameters\n ----------\n X : array-like of (n_samples, n_features), default=None\n The query point or points.\n If not provided, neighbors of each indexed point are returned.\n In this case, the query point is not considered its own neighbor.\n\n radius : float, default=None\n Limiting distance of neighbors to return. The default is the value\n passed to the constructor.\n\n return_distance : bool, default=True\n Whether or not to return the distances.\n\n sort_results : bool, default=False\n If True, the distances and indices will be sorted by increasing\n distances before being returned. If False, the results may not\n be sorted. If `return_distance=False`, setting `sort_results=True`\n will result in an error.\n\n .. versionadded:: 0.22\n\n Returns\n -------\n neigh_dist : ndarray of shape (n_samples,) of arrays\n Array representing the distances to each point, only present if\n `return_distance=True`. The distance values are computed according\n to the ``metric`` constructor parameter.\n\n neigh_ind : ndarray of shape (n_samples,) of arrays\n An array of arrays of indices of the approximate nearest points\n from the population matrix that lie within a ball of size\n ``radius`` around the query points.\n\n Notes\n -----\n Because the number of neighbors of each point is not necessarily\n equal, the results for multiple query points cannot be fit in a\n standard data array.\n For efficiency, `radius_neighbors` returns arrays of objects, where\n each object is a 1D array of indices or distances.\n\n Examples\n --------\n In the following example, we construct a NeighborsClassifier\n class from an array representing our data set and ask who's\n the closest point to [1, 1, 1]:\n\n >>> import numpy as np\n >>> samples = [[0., 0., 0.], [0., .5, 0.], [1., 1., .5]]\n >>> from sklearn.neighbors import NearestNeighbors\n >>> neigh = NearestNeighbors(radius=1.6)\n >>> neigh.fit(samples)\n NearestNeighbors(radius=1.6)\n >>> rng = neigh.radius_neighbors([[1., 1., 1.]])\n >>> print(np.asarray(rng[0][0]))\n [1.5 0.5]\n >>> print(np.asarray(rng[1][0]))\n [1 2]\n\n The first array returned contains the distances to all points which\n are closer than 1.6, while the second array returned contains their\n indices. In general, multiple points can be queried at the same time.\n ", "source_code": "\ndef radius_neighbors(self, X=None, radius=None, return_distance=True, sort_results=False):\n \"\"\"Find the neighbors within a given radius of a point or points.\n\n Return the indices and distances of each point from the dataset\n lying in a ball with size ``radius`` around the points of the query\n array. 
Points lying on the boundary are included in the results.\n\n The result points are *not* necessarily sorted by distance to their\n query point.\n\n Parameters\n ----------\n X : array-like of (n_samples, n_features), default=None\n The query point or points.\n If not provided, neighbors of each indexed point are returned.\n In this case, the query point is not considered its own neighbor.\n\n radius : float, default=None\n Limiting distance of neighbors to return. The default is the value\n passed to the constructor.\n\n return_distance : bool, default=True\n Whether or not to return the distances.\n\n sort_results : bool, default=False\n If True, the distances and indices will be sorted by increasing\n distances before being returned. If False, the results may not\n be sorted. If `return_distance=False`, setting `sort_results=True`\n will result in an error.\n\n .. versionadded:: 0.22\n\n Returns\n -------\n neigh_dist : ndarray of shape (n_samples,) of arrays\n Array representing the distances to each point, only present if\n `return_distance=True`. The distance values are computed according\n to the ``metric`` constructor parameter.\n\n neigh_ind : ndarray of shape (n_samples,) of arrays\n An array of arrays of indices of the approximate nearest points\n from the population matrix that lie within a ball of size\n ``radius`` around the query points.\n\n Notes\n -----\n Because the number of neighbors of each point is not necessarily\n equal, the results for multiple query points cannot be fit in a\n standard data array.\n For efficiency, `radius_neighbors` returns arrays of objects, where\n each object is a 1D array of indices or distances.\n\n Examples\n --------\n In the following example, we construct a NeighborsClassifier\n class from an array representing our data set and ask who's\n the closest point to [1, 1, 1]:\n\n >>> import numpy as np\n >>> samples = [[0., 0., 0.], [0., .5, 0.], [1., 1., .5]]\n >>> from sklearn.neighbors import NearestNeighbors\n >>> neigh = NearestNeighbors(radius=1.6)\n >>> neigh.fit(samples)\n NearestNeighbors(radius=1.6)\n >>> rng = neigh.radius_neighbors([[1., 1., 1.]])\n >>> print(np.asarray(rng[0][0]))\n [1.5 0.5]\n >>> print(np.asarray(rng[1][0]))\n [1 2]\n\n The first array returned contains the distances to all points which\n are closer than 1.6, while the second array returned contains their\n indices. 
In general, multiple points can be queried at the same time.\n \"\"\"\n check_is_fitted(self)\n if X is not None:\n query_is_train = False\n if self.metric == 'precomputed':\n X = _check_precomputed(X)\n else:\n X = self._validate_data(X, accept_sparse='csr', reset=False)\n else:\n query_is_train = True\n X = self._fit_X\n if radius is None:\n radius = self.radius\n if self._fit_method == 'brute' and self.metric == 'precomputed' and issparse(X):\n results = _radius_neighbors_from_graph(X, radius=radius, return_distance=return_distance)\n elif self._fit_method == 'brute':\n if self.effective_metric_ == 'euclidean':\n radius *= radius\n kwds = {'squared': True}\n else:\n kwds = self.effective_metric_params_\n reduce_func = partial(self._radius_neighbors_reduce_func, radius=radius, return_distance=return_distance)\n chunked_results = pairwise_distances_chunked(X, self._fit_X, reduce_func=reduce_func, metric=self.effective_metric_, n_jobs=self.n_jobs, **kwds)\n if return_distance:\n (neigh_dist_chunks, neigh_ind_chunks) = zip(*chunked_results)\n neigh_dist_list = sum(neigh_dist_chunks, [])\n neigh_ind_list = sum(neigh_ind_chunks, [])\n neigh_dist = _to_object_array(neigh_dist_list)\n neigh_ind = _to_object_array(neigh_ind_list)\n results = (neigh_dist, neigh_ind)\n else:\n neigh_ind_list = sum(chunked_results, [])\n results = _to_object_array(neigh_ind_list)\n if sort_results:\n if not return_distance:\n raise ValueError('return_distance must be True if sort_results is True.')\n for ii in range(len(neigh_dist)):\n order = np.argsort(neigh_dist[ii], kind='mergesort')\n neigh_ind[ii] = neigh_ind[ii][order]\n neigh_dist[ii] = neigh_dist[ii][order]\n results = (neigh_dist, neigh_ind)\n elif self._fit_method in ['ball_tree', 'kd_tree']:\n if issparse(X):\n raise ValueError(\"%s does not work with sparse matrices. Densify the data, or set algorithm='brute'\" % self._fit_method)\n n_jobs = effective_n_jobs(self.n_jobs)\n delayed_query = delayed(_tree_query_radius_parallel_helper)\n if parse_version(joblib.__version__) < parse_version('0.12'):\n parallel_kwargs = {'backend': 'threading'}\n else:\n parallel_kwargs = {'prefer': 'threads'}\n chunked_results = Parallel(n_jobs, **parallel_kwargs)((delayed_query(self._tree, X[s], radius, return_distance, sort_results=sort_results) for s in gen_even_slices(X.shape[0], n_jobs)))\n if return_distance:\n (neigh_ind, neigh_dist) = tuple(zip(*chunked_results))\n results = (np.hstack(neigh_dist), np.hstack(neigh_ind))\n else:\n results = np.hstack(chunked_results)\n else:\n raise ValueError('internal: _fit_method not recognized')\n if not query_is_train:\n return results\n else:\n if return_distance:\n (neigh_dist, neigh_ind) = results\n else:\n neigh_ind = results\n for (ind, ind_neighbor) in enumerate(neigh_ind):\n mask = ind_neighbor != ind\n neigh_ind[ind] = ind_neighbor[mask]\n if return_distance:\n neigh_dist[ind] = neigh_dist[ind][mask]\n if return_distance:\n return neigh_dist, neigh_ind\n return neigh_ind" }, { @@ -141034,7 +151783,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -141044,7 +151794,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features), default=None", "description": "The query point or points.\nIf not provided, neighbors of each indexed point are returned.\nIn this case, the query point is not considered its own neighbor." 
- } + }, + "refined_type": {} }, { "name": "radius", @@ -141054,7 +151805,8 @@ "docstring": { "type": "float, default=None", "description": "Radius of neighborhoods. The default is the value passed to the\nconstructor." - } + }, + "refined_type": {} }, { "name": "mode", @@ -141064,6 +151816,10 @@ "docstring": { "type": "{'connectivity', 'distance'}, default='connectivity'", "description": "Type of returned matrix: 'connectivity' will return the\nconnectivity matrix with ones and zeros, in 'distance' the\nedges are distances between points, type of distance\ndepends on the selected metric parameter in\nNearestNeighbors class." + }, + "refined_type": { + "kind": "EnumType", + "values": ["distance", "connectivity"] } }, { @@ -141074,13 +151830,14 @@ "docstring": { "type": "bool, default=False", "description": "If True, in each row of the result, the non-zero entries will be\nsorted by increasing distances. If False, the non-zero entries may\nnot be sorted. Only used with mode='distance'.\n\n.. versionadded:: 0.22" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Compute the (weighted) graph of Neighbors for points in X.\n\nNeighborhoods are restricted the points at a distance lower than radius.", - "docstring": "Compute the (weighted) graph of Neighbors for points in X.\n\nNeighborhoods are restricted the points at a distance lower than\nradius.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features), default=None\n The query point or points.\n If not provided, neighbors of each indexed point are returned.\n In this case, the query point is not considered its own neighbor.\n\nradius : float, default=None\n Radius of neighborhoods. The default is the value passed to the\n constructor.\n\nmode : {'connectivity', 'distance'}, default='connectivity'\n Type of returned matrix: 'connectivity' will return the\n connectivity matrix with ones and zeros, in 'distance' the\n edges are distances between points, type of distance\n depends on the selected metric parameter in\n NearestNeighbors class.\n\nsort_results : bool, default=False\n If True, in each row of the result, the non-zero entries will be\n sorted by increasing distances. If False, the non-zero entries may\n not be sorted. Only used with mode='distance'.\n\n .. 
versionadded:: 0.22\n\nReturns\n-------\nA : sparse-matrix of shape (n_queries, n_samples_fit)\n `n_samples_fit` is the number of samples in the fitted data.\n `A[i, j]` gives the weight of the edge connecting `i` to `j`.\n The matrix is of CSR format.\n\nSee Also\n--------\nkneighbors_graph : Compute the (weighted) graph of k-Neighbors for\n points in X.\n\nExamples\n--------\n>>> X = [[0], [3], [1]]\n>>> from sklearn.neighbors import NearestNeighbors\n>>> neigh = NearestNeighbors(radius=1.5)\n>>> neigh.fit(X)\nNearestNeighbors(radius=1.5)\n>>> A = neigh.radius_neighbors_graph(X)\n>>> A.toarray()\narray([[1., 0., 1.],\n [0., 1., 0.],\n [1., 0., 1.]])", + "description": "Compute the (weighted) graph of Neighbors for points in X.\n\nNeighborhoods are restricted the points at a distance lower than\nradius.", + "docstring": "Compute the (weighted) graph of Neighbors for points in X.\n\n Neighborhoods are restricted the points at a distance lower than\n radius.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features), default=None\n The query point or points.\n If not provided, neighbors of each indexed point are returned.\n In this case, the query point is not considered its own neighbor.\n\n radius : float, default=None\n Radius of neighborhoods. The default is the value passed to the\n constructor.\n\n mode : {'connectivity', 'distance'}, default='connectivity'\n Type of returned matrix: 'connectivity' will return the\n connectivity matrix with ones and zeros, in 'distance' the\n edges are distances between points, type of distance\n depends on the selected metric parameter in\n NearestNeighbors class.\n\n sort_results : bool, default=False\n If True, in each row of the result, the non-zero entries will be\n sorted by increasing distances. If False, the non-zero entries may\n not be sorted. Only used with mode='distance'.\n\n .. versionadded:: 0.22\n\n Returns\n -------\n A : sparse-matrix of shape (n_queries, n_samples_fit)\n `n_samples_fit` is the number of samples in the fitted data.\n `A[i, j]` gives the weight of the edge connecting `i` to `j`.\n The matrix is of CSR format.\n\n See Also\n --------\n kneighbors_graph : Compute the (weighted) graph of k-Neighbors for\n points in X.\n\n Examples\n --------\n >>> X = [[0], [3], [1]]\n >>> from sklearn.neighbors import NearestNeighbors\n >>> neigh = NearestNeighbors(radius=1.5)\n >>> neigh.fit(X)\n NearestNeighbors(radius=1.5)\n >>> A = neigh.radius_neighbors_graph(X)\n >>> A.toarray()\n array([[1., 0., 1.],\n [0., 1., 0.],\n [1., 0., 1.]])\n ", "source_code": "\ndef radius_neighbors_graph(self, X=None, radius=None, mode='connectivity', sort_results=False):\n \"\"\"Compute the (weighted) graph of Neighbors for points in X.\n\n Neighborhoods are restricted the points at a distance lower than\n radius.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features), default=None\n The query point or points.\n If not provided, neighbors of each indexed point are returned.\n In this case, the query point is not considered its own neighbor.\n\n radius : float, default=None\n Radius of neighborhoods. 
The default is the value passed to the\n constructor.\n\n mode : {'connectivity', 'distance'}, default='connectivity'\n Type of returned matrix: 'connectivity' will return the\n connectivity matrix with ones and zeros, in 'distance' the\n edges are distances between points, type of distance\n depends on the selected metric parameter in\n NearestNeighbors class.\n\n sort_results : bool, default=False\n If True, in each row of the result, the non-zero entries will be\n sorted by increasing distances. If False, the non-zero entries may\n not be sorted. Only used with mode='distance'.\n\n .. versionadded:: 0.22\n\n Returns\n -------\n A : sparse-matrix of shape (n_queries, n_samples_fit)\n `n_samples_fit` is the number of samples in the fitted data.\n `A[i, j]` gives the weight of the edge connecting `i` to `j`.\n The matrix is of CSR format.\n\n See Also\n --------\n kneighbors_graph : Compute the (weighted) graph of k-Neighbors for\n points in X.\n\n Examples\n --------\n >>> X = [[0], [3], [1]]\n >>> from sklearn.neighbors import NearestNeighbors\n >>> neigh = NearestNeighbors(radius=1.5)\n >>> neigh.fit(X)\n NearestNeighbors(radius=1.5)\n >>> A = neigh.radius_neighbors_graph(X)\n >>> A.toarray()\n array([[1., 0., 1.],\n [0., 1., 0.],\n [1., 0., 1.]])\n \"\"\"\n check_is_fitted(self)\n if radius is None:\n radius = self.radius\n if mode == 'connectivity':\n A_ind = self.radius_neighbors(X, radius, return_distance=False)\n A_data = None\n elif mode == 'distance':\n (dist, A_ind) = self.radius_neighbors(X, radius, return_distance=True, sort_results=sort_results)\n A_data = np.concatenate(list(dist))\n else:\n raise ValueError('Unsupported mode, must be one of \"connectivity\", or \"distance\" but got %s instead' % mode)\n n_queries = A_ind.shape[0]\n n_samples_fit = self.n_samples_fit_\n n_neighbors = np.array([len(a) for a in A_ind])\n A_ind = np.concatenate(list(A_ind))\n if A_data is None:\n A_data = np.ones(len(A_ind))\n A_indptr = np.concatenate((np.zeros(1, dtype=int), np.cumsum(n_neighbors)))\n return csr_matrix((A_data, A_ind, A_indptr), shape=(n_queries, n_samples_fit))" }, { @@ -141098,13 +151855,17 @@ "docstring": { "type": "{sparse matrix, array-like}, (n_samples, n_samples)", "description": "Distance matrix to other samples. X may be a sparse matrix, in which\ncase only non-zero elements may be considered neighbors." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } } ], "results": [], "is_public": false, - "description": "Check precomputed distance matrix.\n\nIf the precomputed distance matrix is sparse, it checks that the non-zero entries are sorted by distances. If not, the matrix is copied and sorted.", - "docstring": "Check precomputed distance matrix.\n\nIf the precomputed distance matrix is sparse, it checks that the non-zero\nentries are sorted by distances. If not, the matrix is copied and sorted.\n\nParameters\n----------\nX : {sparse matrix, array-like}, (n_samples, n_samples)\n Distance matrix to other samples. X may be a sparse matrix, in which\n case only non-zero elements may be considered neighbors.\n\nReturns\n-------\nX : {sparse matrix, array-like}, (n_samples, n_samples)\n Distance matrix to other samples. X may be a sparse matrix, in which\n case only non-zero elements may be considered neighbors.", + "description": "Check precomputed distance matrix.\n\nIf the precomputed distance matrix is sparse, it checks that the non-zero\nentries are sorted by distances. 
If not, the matrix is copied and sorted.", + "docstring": "Check precomputed distance matrix.\n\n If the precomputed distance matrix is sparse, it checks that the non-zero\n entries are sorted by distances. If not, the matrix is copied and sorted.\n\n Parameters\n ----------\n X : {sparse matrix, array-like}, (n_samples, n_samples)\n Distance matrix to other samples. X may be a sparse matrix, in which\n case only non-zero elements may be considered neighbors.\n\n Returns\n -------\n X : {sparse matrix, array-like}, (n_samples, n_samples)\n Distance matrix to other samples. X may be a sparse matrix, in which\n case only non-zero elements may be considered neighbors.\n ", "source_code": "\ndef _check_precomputed(X):\n \"\"\"Check precomputed distance matrix.\n\n If the precomputed distance matrix is sparse, it checks that the non-zero\n entries are sorted by distances. If not, the matrix is copied and sorted.\n\n Parameters\n ----------\n X : {sparse matrix, array-like}, (n_samples, n_samples)\n Distance matrix to other samples. X may be a sparse matrix, in which\n case only non-zero elements may be considered neighbors.\n\n Returns\n -------\n X : {sparse matrix, array-like}, (n_samples, n_samples)\n Distance matrix to other samples. X may be a sparse matrix, in which\n case only non-zero elements may be considered neighbors.\n \"\"\"\n if not issparse(X):\n X = check_array(X)\n check_non_negative(X, whom='precomputed distance matrix.')\n return X\n else:\n graph = X\n if graph.format not in ('csr', 'csc', 'coo', 'lil'):\n raise TypeError('Sparse matrix in {!r} format is not supported due to its handling of explicit zeros'.format(graph.format))\n copied = graph.format != 'csr'\n graph = check_array(graph, accept_sparse='csr')\n check_non_negative(graph, whom='precomputed distance matrix.')\n if not _is_sorted_by_data(graph):\n warnings.warn('Precomputed sparse input was not sorted by data.', EfficiencyWarning)\n if not copied:\n graph = graph.copy()\n row_nnz = np.diff(graph.indptr)\n if row_nnz.max() == row_nnz.min():\n n_samples = graph.shape[0]\n distances = graph.data.reshape(n_samples, -1)\n order = np.argsort(distances, kind='mergesort')\n order += np.arange(n_samples)[:, None] * row_nnz[0]\n order = order.ravel()\n graph.data = graph.data[order]\n graph.indices = graph.indices[order]\n else:\n for (start, stop) in zip(graph.indptr, graph.indptr[1:]):\n order = np.argsort(graph.data[start:stop], kind='mergesort')\n graph.data[start:stop] = graph.data[start:stop][order]\n graph.indices[start:stop] = graph.indices[start:stop][order]\n return graph" }, { @@ -141122,7 +151883,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -141146,7 +151908,8 @@ "docstring": { "type": "ndarray", "description": "The input distances." - } + }, + "refined_type": {} }, { "name": "weights", @@ -141156,13 +151919,17 @@ "docstring": { "type": "{'uniform', 'distance' or a callable}", "description": "The kind of weighting used." 
+ }, + "refined_type": { + "kind": "EnumType", + "values": ["uniform", "distance"] } } ], "results": [], "is_public": false, "description": "Get the weights from an array of distances and a parameter ``weights``.", - "docstring": "Get the weights from an array of distances and a parameter ``weights``.\n\nParameters\n----------\ndist : ndarray\n The input distances.\n\nweights : {'uniform', 'distance' or a callable}\n The kind of weighting used.\n\nReturns\n-------\nweights_arr : array of the same shape as ``dist``\n If ``weights == 'uniform'``, then returns None.", + "docstring": "Get the weights from an array of distances and a parameter ``weights``.\n\n Parameters\n ----------\n dist : ndarray\n The input distances.\n\n weights : {'uniform', 'distance' or a callable}\n The kind of weighting used.\n\n Returns\n -------\n weights_arr : array of the same shape as ``dist``\n If ``weights == 'uniform'``, then returns None.\n ", "source_code": "\ndef _get_weights(dist, weights):\n \"\"\"Get the weights from an array of distances and a parameter ``weights``.\n\n Parameters\n ----------\n dist : ndarray\n The input distances.\n\n weights : {'uniform', 'distance' or a callable}\n The kind of weighting used.\n\n Returns\n -------\n weights_arr : array of the same shape as ``dist``\n If ``weights == 'uniform'``, then returns None.\n \"\"\"\n if weights in (None, 'uniform'):\n return None\n elif weights == 'distance':\n if dist.dtype is np.dtype(object):\n for (point_dist_i, point_dist) in enumerate(dist):\n if hasattr(point_dist, '__contains__') and 0.0 in point_dist:\n dist[point_dist_i] = point_dist == 0.0\n else:\n dist[point_dist_i] = 1.0 / point_dist\n else:\n with np.errstate(divide='ignore'):\n dist = 1.0 / dist\n inf_mask = np.isinf(dist)\n inf_row = np.any(inf_mask, axis=1)\n dist[inf_row] = inf_mask[inf_row]\n return dist\n elif callable(weights):\n return weights(dist)\n else:\n raise ValueError(\"weights not recognized: should be 'uniform', 'distance', or a callable function\")" }, { @@ -141180,13 +151947,14 @@ "docstring": { "type": "sparse matrix of shape (n_samples, n_samples)", "description": "Neighbors graph as given by `kneighbors_graph` or\n`radius_neighbors_graph`. Matrix should be of format CSR format." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Return whether the graph's non-zero entries are sorted by data.\n\nThe non-zero entries are stored in graph.data and graph.indices. For each row (or sample), the non-zero entries can be either: - sorted by indices, as after graph.sort_indices(); - sorted by data, as after _check_precomputed(graph); - not sorted.", - "docstring": "Return whether the graph's non-zero entries are sorted by data.\n\nThe non-zero entries are stored in graph.data and graph.indices.\nFor each row (or sample), the non-zero entries can be either:\n - sorted by indices, as after graph.sort_indices();\n - sorted by data, as after _check_precomputed(graph);\n - not sorted.\n\nParameters\n----------\ngraph : sparse matrix of shape (n_samples, n_samples)\n Neighbors graph as given by `kneighbors_graph` or\n `radius_neighbors_graph`. 
Matrix should be of format CSR format.\n\nReturns\n-------\nres : bool\n Whether input graph is sorted by data.", + "description": "Return whether the graph's non-zero entries are sorted by data.\n\nThe non-zero entries are stored in graph.data and graph.indices.\nFor each row (or sample), the non-zero entries can be either:\n - sorted by indices, as after graph.sort_indices();\n - sorted by data, as after _check_precomputed(graph);\n - not sorted.", + "docstring": "Return whether the graph's non-zero entries are sorted by data.\n\n The non-zero entries are stored in graph.data and graph.indices.\n For each row (or sample), the non-zero entries can be either:\n - sorted by indices, as after graph.sort_indices();\n - sorted by data, as after _check_precomputed(graph);\n - not sorted.\n\n Parameters\n ----------\n graph : sparse matrix of shape (n_samples, n_samples)\n Neighbors graph as given by `kneighbors_graph` or\n `radius_neighbors_graph`. Matrix should be of format CSR format.\n\n Returns\n -------\n res : bool\n Whether input graph is sorted by data.\n ", "source_code": "\ndef _is_sorted_by_data(graph):\n \"\"\"Return whether the graph's non-zero entries are sorted by data.\n\n The non-zero entries are stored in graph.data and graph.indices.\n For each row (or sample), the non-zero entries can be either:\n - sorted by indices, as after graph.sort_indices();\n - sorted by data, as after _check_precomputed(graph);\n - not sorted.\n\n Parameters\n ----------\n graph : sparse matrix of shape (n_samples, n_samples)\n Neighbors graph as given by `kneighbors_graph` or\n `radius_neighbors_graph`. Matrix should be of format CSR format.\n\n Returns\n -------\n res : bool\n Whether input graph is sorted by data.\n \"\"\"\n assert graph.format == 'csr'\n out_of_order = graph.data[:-1] > graph.data[1:]\n line_change = np.unique(graph.indptr[1:-1] - 1)\n line_change = line_change[line_change < out_of_order.shape[0]]\n return out_of_order.sum() == out_of_order[line_change].sum()" }, { @@ -141204,7 +151972,8 @@ "docstring": { "type": "sparse matrix of shape (n_samples, n_samples)", "description": "Neighbors graph as given by `kneighbors_graph` or\n`radius_neighbors_graph`. Matrix should be of format CSR format." - } + }, + "refined_type": {} }, { "name": "n_neighbors", @@ -141214,7 +151983,8 @@ "docstring": { "type": "int", "description": "Number of neighbors required for each sample." - } + }, + "refined_type": {} }, { "name": "return_distance", @@ -141224,13 +151994,14 @@ "docstring": { "type": "bool", "description": "Whether or not to return the distances." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Decompose a nearest neighbors sparse graph into distances and indices.", - "docstring": "Decompose a nearest neighbors sparse graph into distances and indices.\n\nParameters\n----------\ngraph : sparse matrix of shape (n_samples, n_samples)\n Neighbors graph as given by `kneighbors_graph` or\n `radius_neighbors_graph`. Matrix should be of format CSR format.\n\nn_neighbors : int\n Number of neighbors required for each sample.\n\nreturn_distance : bool\n Whether or not to return the distances.\n\nReturns\n-------\nneigh_dist : ndarray of shape (n_samples, n_neighbors)\n Distances to nearest neighbors. 
Only present if `return_distance=True`.\n\nneigh_ind : ndarray of shape (n_samples, n_neighbors)\n Indices of nearest neighbors.", + "docstring": "Decompose a nearest neighbors sparse graph into distances and indices.\n\n Parameters\n ----------\n graph : sparse matrix of shape (n_samples, n_samples)\n Neighbors graph as given by `kneighbors_graph` or\n `radius_neighbors_graph`. Matrix should be of format CSR format.\n\n n_neighbors : int\n Number of neighbors required for each sample.\n\n return_distance : bool\n Whether or not to return the distances.\n\n Returns\n -------\n neigh_dist : ndarray of shape (n_samples, n_neighbors)\n Distances to nearest neighbors. Only present if `return_distance=True`.\n\n neigh_ind : ndarray of shape (n_samples, n_neighbors)\n Indices of nearest neighbors.\n ", "source_code": "\ndef _kneighbors_from_graph(graph, n_neighbors, return_distance):\n \"\"\"Decompose a nearest neighbors sparse graph into distances and indices.\n\n Parameters\n ----------\n graph : sparse matrix of shape (n_samples, n_samples)\n Neighbors graph as given by `kneighbors_graph` or\n `radius_neighbors_graph`. Matrix should be of format CSR format.\n\n n_neighbors : int\n Number of neighbors required for each sample.\n\n return_distance : bool\n Whether or not to return the distances.\n\n Returns\n -------\n neigh_dist : ndarray of shape (n_samples, n_neighbors)\n Distances to nearest neighbors. Only present if `return_distance=True`.\n\n neigh_ind : ndarray of shape (n_samples, n_neighbors)\n Indices of nearest neighbors.\n \"\"\"\n n_samples = graph.shape[0]\n assert graph.format == 'csr'\n row_nnz = np.diff(graph.indptr)\n row_nnz_min = row_nnz.min()\n if n_neighbors is not None and row_nnz_min < n_neighbors:\n raise ValueError('%d neighbors per samples are required, but some samples have only %d neighbors in precomputed graph matrix. Decrease number of neighbors used or recompute the graph with more neighbors.' % (n_neighbors, row_nnz_min))\n \n def extract(a):\n if row_nnz.max() == row_nnz_min:\n return a.reshape(n_samples, -1)[:, :n_neighbors]\n else:\n idx = np.tile(np.arange(n_neighbors), (n_samples, 1))\n idx += graph.indptr[:-1, None]\n return a.take(idx, mode='clip').reshape(n_samples, n_neighbors)\n if return_distance:\n return extract(graph.data), extract(graph.indices)\n else:\n return extract(graph.indices)" }, { @@ -141248,7 +152019,8 @@ "docstring": { "type": "sparse matrix of shape (n_samples, n_samples)", "description": "Neighbors graph as given by `kneighbors_graph` or\n`radius_neighbors_graph`. Matrix should be of format CSR format." - } + }, + "refined_type": {} }, { "name": "radius", @@ -141258,7 +152030,8 @@ "docstring": { "type": "float", "description": "Radius of neighborhoods which should be strictly positive." - } + }, + "refined_type": {} }, { "name": "return_distance", @@ -141268,13 +152041,14 @@ "docstring": { "type": "bool", "description": "Whether or not to return the distances." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Decompose a nearest neighbors sparse graph into distances and indices.", - "docstring": "Decompose a nearest neighbors sparse graph into distances and indices.\n\nParameters\n----------\ngraph : sparse matrix of shape (n_samples, n_samples)\n Neighbors graph as given by `kneighbors_graph` or\n `radius_neighbors_graph`. 
Matrix should be of format CSR format.\n\nradius : float\n Radius of neighborhoods which should be strictly positive.\n\nreturn_distance : bool\n Whether or not to return the distances.\n\nReturns\n-------\nneigh_dist : ndarray of shape (n_samples,) of arrays\n Distances to nearest neighbors. Only present if `return_distance=True`.\n\nneigh_ind : ndarray of shape (n_samples,) of arrays\n Indices of nearest neighbors.", + "docstring": "Decompose a nearest neighbors sparse graph into distances and indices.\n\n Parameters\n ----------\n graph : sparse matrix of shape (n_samples, n_samples)\n Neighbors graph as given by `kneighbors_graph` or\n `radius_neighbors_graph`. Matrix should be of format CSR format.\n\n radius : float\n Radius of neighborhoods which should be strictly positive.\n\n return_distance : bool\n Whether or not to return the distances.\n\n Returns\n -------\n neigh_dist : ndarray of shape (n_samples,) of arrays\n Distances to nearest neighbors. Only present if `return_distance=True`.\n\n neigh_ind : ndarray of shape (n_samples,) of arrays\n Indices of nearest neighbors.\n ", "source_code": "\ndef _radius_neighbors_from_graph(graph, radius, return_distance):\n \"\"\"Decompose a nearest neighbors sparse graph into distances and indices.\n\n Parameters\n ----------\n graph : sparse matrix of shape (n_samples, n_samples)\n Neighbors graph as given by `kneighbors_graph` or\n `radius_neighbors_graph`. Matrix should be of format CSR format.\n\n radius : float\n Radius of neighborhoods which should be strictly positive.\n\n return_distance : bool\n Whether or not to return the distances.\n\n Returns\n -------\n neigh_dist : ndarray of shape (n_samples,) of arrays\n Distances to nearest neighbors. Only present if `return_distance=True`.\n\n neigh_ind : ndarray of shape (n_samples,) of arrays\n Indices of nearest neighbors.\n \"\"\"\n assert graph.format == 'csr'\n no_filter_needed = bool(graph.data.max() <= radius)\n if no_filter_needed:\n (data, indices, indptr) = (graph.data, graph.indices, graph.indptr)\n else:\n mask = graph.data <= radius\n if return_distance:\n data = np.compress(mask, graph.data)\n indices = np.compress(mask, graph.indices)\n indptr = np.concatenate(([0], np.cumsum(mask)))[graph.indptr]\n indices = indices.astype(np.intp, copy=no_filter_needed)\n if return_distance:\n neigh_dist = _to_object_array(np.split(data, indptr[1:-1]))\n neigh_ind = _to_object_array(np.split(indices, indptr[1:-1]))\n if return_distance:\n return neigh_dist, neigh_ind\n else:\n return neigh_ind" }, { @@ -141292,13 +152066,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Helper for the Parallel calls in KNeighborsMixin.kneighbors.\n\nThe Cython method tree.query is not directly picklable by cloudpickle under PyPy.", - "docstring": "Helper for the Parallel calls in KNeighborsMixin.kneighbors.\n\nThe Cython method tree.query is not directly picklable by cloudpickle\nunder PyPy.", + "description": "Helper for the Parallel calls in KNeighborsMixin.kneighbors.\n\nThe Cython method tree.query is not directly picklable by cloudpickle\nunder PyPy.", + "docstring": "Helper for the Parallel calls in KNeighborsMixin.kneighbors.\n\n The Cython method tree.query is not directly picklable by cloudpickle\n under PyPy.\n ", "source_code": "\ndef _tree_query_parallel_helper(tree, *args, **kwargs):\n \"\"\"Helper for the Parallel calls in KNeighborsMixin.kneighbors.\n\n The Cython method tree.query is not directly 
picklable by cloudpickle\n under PyPy.\n \"\"\"\n return tree.query(*args, **kwargs)" }, { @@ -141316,13 +152091,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Helper for the Parallel calls in RadiusNeighborsMixin.radius_neighbors.\n\nThe Cython method tree.query_radius is not directly picklable by cloudpickle under PyPy.", - "docstring": "Helper for the Parallel calls in RadiusNeighborsMixin.radius_neighbors.\n\nThe Cython method tree.query_radius is not directly picklable by\ncloudpickle under PyPy.", + "description": "Helper for the Parallel calls in RadiusNeighborsMixin.radius_neighbors.\n\nThe Cython method tree.query_radius is not directly picklable by\ncloudpickle under PyPy.", + "docstring": "Helper for the Parallel calls in RadiusNeighborsMixin.radius_neighbors.\n\n The Cython method tree.query_radius is not directly picklable by\n cloudpickle under PyPy.\n ", "source_code": "\ndef _tree_query_radius_parallel_helper(tree, *args, **kwargs):\n \"\"\"Helper for the Parallel calls in RadiusNeighborsMixin.radius_neighbors.\n\n The Cython method tree.query_radius is not directly picklable by\n cloudpickle under PyPy.\n \"\"\"\n return tree.query_radius(*args, **kwargs)" }, { @@ -141340,7 +152116,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_neighbors", @@ -141350,7 +152127,8 @@ "docstring": { "type": "int, default=5", "description": "Number of neighbors to use by default for :meth:`kneighbors` queries." - } + }, + "refined_type": {} }, { "name": "weights", @@ -141360,6 +152138,10 @@ "docstring": { "type": "{'uniform', 'distance'} or callable, default='uniform'", "description": "Weight function used in prediction. Possible values:\n\n- 'uniform' : uniform weights. All points in each neighborhood\n are weighted equally.\n- 'distance' : weight points by the inverse of their distance.\n in this case, closer neighbors of a query point will have a\n greater influence than neighbors which are further away.\n- [callable] : a user-defined function which accepts an\n array of distances, and returns an array of the same shape\n containing the weights." + }, + "refined_type": { + "kind": "EnumType", + "values": ["uniform", "distance"] } }, { @@ -141370,6 +152152,10 @@ "docstring": { "type": "{'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto'", "description": "Algorithm used to compute the nearest neighbors:\n\n- 'ball_tree' will use :class:`BallTree`\n- 'kd_tree' will use :class:`KDTree`\n- 'brute' will use a brute-force search.\n- 'auto' will attempt to decide the most appropriate algorithm\n based on the values passed to :meth:`fit` method.\n\nNote: fitting on sparse input will override the setting of\nthis parameter, using brute force." + }, + "refined_type": { + "kind": "EnumType", + "values": ["auto", "kd_tree", "brute", "ball_tree"] } }, { @@ -141380,7 +152166,8 @@ "docstring": { "type": "int, default=30", "description": "Leaf size passed to BallTree or KDTree. This can affect the\nspeed of the construction and query, as well as the memory\nrequired to store the tree. The optimal value depends on the\nnature of the problem." - } + }, + "refined_type": {} }, { "name": "p", @@ -141390,7 +152177,8 @@ "docstring": { "type": "int, default=2", "description": "Power parameter for the Minkowski metric. When p = 1, this is\nequivalent to using manhattan_distance (l1), and euclidean_distance\n(l2) for p = 2. 
For arbitrary p, minkowski_distance (l_p) is used." - } + }, + "refined_type": {} }, { "name": "metric", @@ -141399,8 +152187,9 @@ "assigned_by": "NAME_ONLY", "docstring": { "type": "str or callable, default='minkowski'", - "description": "The distance metric to use for the tree. The default metric is\nminkowski, and with p=2 is equivalent to the standard Euclidean\nmetric. See the documentation of :class:`DistanceMetric` for a\nlist of available metrics.\nIf metric is \"precomputed\", X is assumed to be a distance matrix and\nmust be square during fit. X may be a :term:`sparse graph`,\nin which case only \"nonzero\" elements may be considered neighbors." - } + "description": "The distance metric to use for the tree. The default metric is\nminkowski, and with p=2 is equivalent to the standard Euclidean\nmetric. For a list of available metrics, see the documentation of\n:class:`~sklearn.metrics.DistanceMetric`.\nIf metric is \"precomputed\", X is assumed to be a distance matrix and\nmust be square during fit. X may be a :term:`sparse graph`,\nin which case only \"nonzero\" elements may be considered neighbors." + }, + "refined_type": {} }, { "name": "metric_params", @@ -141410,7 +152199,8 @@ "docstring": { "type": "dict, default=None", "description": "Additional keyword arguments for the metric function." - } + }, + "refined_type": {} }, { "name": "n_jobs", @@ -141420,13 +152210,14 @@ "docstring": { "type": "int, default=None", "description": "The number of parallel jobs to run for neighbors search.\n``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n``-1`` means using all processors. See :term:`Glossary `\nfor more details.\nDoesn't affect :meth:`fit` method." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, n_neighbors=5, *, weights='uniform', algorithm='auto', leaf_size=30, p=2, metric='minkowski', metric_params=None, n_jobs=None):\n super().__init__(n_neighbors=n_neighbors, algorithm=algorithm, leaf_size=leaf_size, metric=metric, p=p, metric_params=metric_params, n_jobs=n_jobs)\n self.weights = weights" }, { @@ -141444,13 +152235,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _more_tags(self):\n return {'multilabel': True}" }, { @@ -141468,7 +152260,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -141478,6 +152271,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features) or (n_samples, n_samples) if metric='precomputed'", "description": "Training data." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -141488,13 +152285,17 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples,) or (n_samples, n_outputs)", "description": "Target values." 
+ }, + "refined_type": { + "kind": "EnumType", + "values": [] } } ], "results": [], "is_public": true, "description": "Fit the k-nearest neighbors classifier from the training dataset.", - "docstring": "Fit the k-nearest neighbors classifier from the training dataset.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features) or (n_samples, n_samples) if metric='precomputed'\n Training data.\n\ny : {array-like, sparse matrix} of shape (n_samples,) or (n_samples, n_outputs)\n Target values.\n\nReturns\n-------\nself : KNeighborsClassifier\n The fitted k-nearest neighbors classifier.", + "docstring": "Fit the k-nearest neighbors classifier from the training dataset.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features) or (n_samples, n_samples) if metric='precomputed'\n Training data.\n\n y : {array-like, sparse matrix} of shape (n_samples,) or (n_samples, n_outputs)\n Target values.\n\n Returns\n -------\n self : KNeighborsClassifier\n The fitted k-nearest neighbors classifier.\n ", "source_code": "\ndef fit(self, X, y):\n \"\"\"Fit the k-nearest neighbors classifier from the training dataset.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features) or (n_samples, n_samples) if metric='precomputed'\n Training data.\n\n y : {array-like, sparse matrix} of shape (n_samples,) or (n_samples, n_outputs)\n Target values.\n\n Returns\n -------\n self : KNeighborsClassifier\n The fitted k-nearest neighbors classifier.\n \"\"\"\n self.weights = _check_weights(self.weights)\n return self._fit(X, y)" }, { @@ -141512,7 +152313,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -141522,13 +152324,14 @@ "docstring": { "type": "array-like of shape (n_queries, n_features), or (n_queries, n_indexed) if metric == 'precomputed'", "description": "Test samples." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Predict the class labels for the provided data.", - "docstring": "Predict the class labels for the provided data.\n\nParameters\n----------\nX : array-like of shape (n_queries, n_features), or (n_queries, n_indexed) if metric == 'precomputed'\n Test samples.\n\nReturns\n-------\ny : ndarray of shape (n_queries,) or (n_queries, n_outputs)\n Class labels for each data sample.", + "docstring": "Predict the class labels for the provided data.\n\n Parameters\n ----------\n X : array-like of shape (n_queries, n_features), or (n_queries, n_indexed) if metric == 'precomputed'\n Test samples.\n\n Returns\n -------\n y : ndarray of shape (n_queries,) or (n_queries, n_outputs)\n Class labels for each data sample.\n ", "source_code": "\ndef predict(self, X):\n \"\"\"Predict the class labels for the provided data.\n\n Parameters\n ----------\n X : array-like of shape (n_queries, n_features), or (n_queries, n_indexed) if metric == 'precomputed'\n Test samples.\n\n Returns\n -------\n y : ndarray of shape (n_queries,) or (n_queries, n_outputs)\n Class labels for each data sample.\n \"\"\"\n (neigh_dist, neigh_ind) = self.kneighbors(X)\n classes_ = self.classes_\n _y = self._y\n if not self.outputs_2d_:\n _y = self._y.reshape((-1, 1))\n classes_ = [self.classes_]\n n_outputs = len(classes_)\n n_queries = _num_samples(X)\n weights = _get_weights(neigh_dist, self.weights)\n y_pred = np.empty((n_queries, n_outputs), dtype=classes_[0].dtype)\n for (k, classes_k) in enumerate(classes_):\n if weights is None:\n (mode, _) = stats.mode(_y[neigh_ind, k], axis=1)\n else:\n (mode, _) = weighted_mode(_y[neigh_ind, k], weights, axis=1)\n mode = np.asarray(mode.ravel(), dtype=np.intp)\n y_pred[:, k] = classes_k.take(mode)\n if not self.outputs_2d_:\n y_pred = y_pred.ravel()\n return y_pred" }, { @@ -141546,7 +152349,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -141556,13 +152360,14 @@ "docstring": { "type": "array-like of shape (n_queries, n_features), or (n_queries, n_indexed) if metric == 'precomputed'", "description": "Test samples." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Return probability estimates for the test data X.", - "docstring": "Return probability estimates for the test data X.\n\nParameters\n----------\nX : array-like of shape (n_queries, n_features), or (n_queries, n_indexed) if metric == 'precomputed'\n Test samples.\n\nReturns\n-------\np : ndarray of shape (n_queries, n_classes), or a list of n_outputs of such arrays if n_outputs > 1.\n The class probabilities of the input samples. Classes are ordered\n by lexicographic order.", + "docstring": "Return probability estimates for the test data X.\n\n Parameters\n ----------\n X : array-like of shape (n_queries, n_features), or (n_queries, n_indexed) if metric == 'precomputed'\n Test samples.\n\n Returns\n -------\n p : ndarray of shape (n_queries, n_classes), or a list of n_outputs of such arrays if n_outputs > 1.\n The class probabilities of the input samples. 
Classes are ordered\n by lexicographic order.\n ", "source_code": "\ndef predict_proba(self, X):\n \"\"\"Return probability estimates for the test data X.\n\n Parameters\n ----------\n X : array-like of shape (n_queries, n_features), or (n_queries, n_indexed) if metric == 'precomputed'\n Test samples.\n\n Returns\n -------\n p : ndarray of shape (n_queries, n_classes), or a list of n_outputs of such arrays if n_outputs > 1.\n The class probabilities of the input samples. Classes are ordered\n by lexicographic order.\n \"\"\"\n (neigh_dist, neigh_ind) = self.kneighbors(X)\n classes_ = self.classes_\n _y = self._y\n if not self.outputs_2d_:\n _y = self._y.reshape((-1, 1))\n classes_ = [self.classes_]\n n_queries = _num_samples(X)\n weights = _get_weights(neigh_dist, self.weights)\n if weights is None:\n weights = np.ones_like(neigh_ind)\n all_rows = np.arange(n_queries)\n probabilities = []\n for (k, classes_k) in enumerate(classes_):\n pred_labels = _y[:, k][neigh_ind]\n proba_k = np.zeros((n_queries, classes_k.size))\n for (i, idx) in enumerate(pred_labels.T):\n proba_k[all_rows, idx] += weights[:, i]\n normalizer = proba_k.sum(axis=1)[:, np.newaxis]\n normalizer[normalizer == 0.0] = 1.0\n proba_k /= normalizer\n probabilities.append(proba_k)\n if not self.outputs_2d_:\n probabilities = probabilities[0]\n return probabilities" }, { @@ -141580,7 +152385,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "radius", @@ -141590,7 +152396,8 @@ "docstring": { "type": "float, default=1.0", "description": "Range of parameter space to use by default for :meth:`radius_neighbors`\nqueries." - } + }, + "refined_type": {} }, { "name": "weights", @@ -141600,6 +152407,10 @@ "docstring": { "type": "{'uniform', 'distance'} or callable, default='uniform'", "description": "Weight function used in prediction. Possible values:\n\n- 'uniform' : uniform weights. All points in each neighborhood\n are weighted equally.\n- 'distance' : weight points by the inverse of their distance.\n in this case, closer neighbors of a query point will have a\n greater influence than neighbors which are further away.\n- [callable] : a user-defined function which accepts an\n array of distances, and returns an array of the same shape\n containing the weights.\n\nUniform weights are used by default." + }, + "refined_type": { + "kind": "EnumType", + "values": ["uniform", "distance"] } }, { @@ -141610,6 +152421,10 @@ "docstring": { "type": "{'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto'", "description": "Algorithm used to compute the nearest neighbors:\n\n- 'ball_tree' will use :class:`BallTree`\n- 'kd_tree' will use :class:`KDTree`\n- 'brute' will use a brute-force search.\n- 'auto' will attempt to decide the most appropriate algorithm\n based on the values passed to :meth:`fit` method.\n\nNote: fitting on sparse input will override the setting of\nthis parameter, using brute force." + }, + "refined_type": { + "kind": "EnumType", + "values": ["auto", "kd_tree", "brute", "ball_tree"] } }, { @@ -141620,7 +152435,8 @@ "docstring": { "type": "int, default=30", "description": "Leaf size passed to BallTree or KDTree. This can affect the\nspeed of the construction and query, as well as the memory\nrequired to store the tree. The optimal value depends on the\nnature of the problem." - } + }, + "refined_type": {} }, { "name": "p", @@ -141630,7 +152446,8 @@ "docstring": { "type": "int, default=2", "description": "Power parameter for the Minkowski metric. 
When p = 1, this is\nequivalent to using manhattan_distance (l1), and euclidean_distance\n(l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used." - } + }, + "refined_type": {} }, { "name": "metric", @@ -141639,8 +152456,9 @@ "assigned_by": "NAME_ONLY", "docstring": { "type": "str or callable, default='minkowski'", - "description": "Distance metric to use for the tree. The default metric is\nminkowski, and with p=2 is equivalent to the standard Euclidean\nmetric. See the documentation of :class:`DistanceMetric` for a\nlist of available metrics.\nIf metric is \"precomputed\", X is assumed to be a distance matrix and\nmust be square during fit. X may be a :term:`sparse graph`,\nin which case only \"nonzero\" elements may be considered neighbors." - } + "description": "Distance metric to use for the tree. The default metric is\nminkowski, and with p=2 is equivalent to the standard Euclidean\nmetric. For a list of available metrics, see the documentation of\n:class:`~sklearn.metrics.DistanceMetric`.\nIf metric is \"precomputed\", X is assumed to be a distance matrix and\nmust be square during fit. X may be a :term:`sparse graph`,\nin which case only \"nonzero\" elements may be considered neighbors." + }, + "refined_type": {} }, { "name": "outlier_label", @@ -141650,6 +152468,10 @@ "docstring": { "type": "{manual label, 'most_frequent'}, default=None", "description": "Label for outlier samples (samples with no neighbors in given radius).\n\n- manual label: str or int label (should be the same type as y)\n or list of manual labels if multi-output is used.\n- 'most_frequent' : assign the most frequent label of y to outliers.\n- None : when any outlier is detected, ValueError will be raised." + }, + "refined_type": { + "kind": "EnumType", + "values": ["most_frequent"] } }, { @@ -141660,7 +152482,8 @@ "docstring": { "type": "dict, default=None", "description": "Additional keyword arguments for the metric function." - } + }, + "refined_type": {} }, { "name": "n_jobs", @@ -141670,13 +152493,14 @@ "docstring": { "type": "int, default=None", "description": "The number of parallel jobs to run for neighbors search.\n``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n``-1`` means using all processors. See :term:`Glossary `\nfor more details." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, radius=1.0, *, weights='uniform', algorithm='auto', leaf_size=30, p=2, metric='minkowski', outlier_label=None, metric_params=None, n_jobs=None, **kwargs):\n if len(kwargs) > 0:\n warnings.warn(f'Passing additional keyword parameters has no effect and is deprecated in 1.0. An error will be raised from 1.2 and beyond. 
The ignored keyword parameter(s) are: {kwargs.keys()}.', FutureWarning)\n super().__init__(radius=radius, algorithm=algorithm, leaf_size=leaf_size, metric=metric, p=p, metric_params=metric_params, n_jobs=n_jobs)\n self.weights = weights\n self.outlier_label = outlier_label" }, { @@ -141694,13 +152518,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _more_tags(self):\n return {'multilabel': True}" }, { @@ -141718,7 +152543,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -141728,6 +152554,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features) or (n_samples, n_samples) if metric='precomputed'", "description": "Training data." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -141738,13 +152568,17 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples,) or (n_samples, n_outputs)", "description": "Target values." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } } ], "results": [], "is_public": true, "description": "Fit the radius neighbors classifier from the training dataset.", - "docstring": "Fit the radius neighbors classifier from the training dataset.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features) or (n_samples, n_samples) if metric='precomputed'\n Training data.\n\ny : {array-like, sparse matrix} of shape (n_samples,) or (n_samples, n_outputs)\n Target values.\n\nReturns\n-------\nself : RadiusNeighborsClassifier\n The fitted radius neighbors classifier.", + "docstring": "Fit the radius neighbors classifier from the training dataset.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features) or (n_samples, n_samples) if metric='precomputed'\n Training data.\n\n y : {array-like, sparse matrix} of shape (n_samples,) or (n_samples, n_outputs)\n Target values.\n\n Returns\n -------\n self : RadiusNeighborsClassifier\n The fitted radius neighbors classifier.\n ", "source_code": "\ndef fit(self, X, y):\n \"\"\"Fit the radius neighbors classifier from the training dataset.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features) or (n_samples, n_samples) if metric='precomputed'\n Training data.\n\n y : {array-like, sparse matrix} of shape (n_samples,) or (n_samples, n_outputs)\n Target values.\n\n Returns\n -------\n self : RadiusNeighborsClassifier\n The fitted radius neighbors classifier.\n \"\"\"\n self.weights = _check_weights(self.weights)\n self._fit(X, y)\n classes_ = self.classes_\n _y = self._y\n if not self.outputs_2d_:\n _y = self._y.reshape((-1, 1))\n classes_ = [self.classes_]\n if self.outlier_label is None:\n outlier_label_ = None\n elif self.outlier_label == 'most_frequent':\n outlier_label_ = []\n for (k, classes_k) in enumerate(classes_):\n label_count = np.bincount(_y[:, k])\n outlier_label_.append(classes_k[label_count.argmax()])\n else:\n if _is_arraylike(self.outlier_label) and not isinstance(self.outlier_label, str):\n if len(self.outlier_label) != len(classes_):\n raise ValueError('The length of outlier_label: {} is inconsistent with the output length: {}'.format(self.outlier_label, len(classes_)))\n outlier_label_ = self.outlier_label\n else:\n outlier_label_ = [self.outlier_label] * len(classes_)\n for (classes, label) in zip(classes_, outlier_label_):\n if 
_is_arraylike(label) and not isinstance(label, str):\n raise TypeError('The outlier_label of classes {} is supposed to be a scalar, got {}.'.format(classes, label))\n if np.append(classes, label).dtype != classes.dtype:\n raise TypeError('The dtype of outlier_label {} is inconsistent with classes {} in y.'.format(label, classes))\n self.outlier_label_ = outlier_label_\n return self" }, { @@ -141762,7 +152596,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -141772,13 +152607,14 @@ "docstring": { "type": "array-like of shape (n_queries, n_features), or (n_queries, n_indexed) if metric == 'precomputed'", "description": "Test samples." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Predict the class labels for the provided data.", - "docstring": "Predict the class labels for the provided data.\n\nParameters\n----------\nX : array-like of shape (n_queries, n_features), or (n_queries, n_indexed) if metric == 'precomputed'\n Test samples.\n\nReturns\n-------\ny : ndarray of shape (n_queries,) or (n_queries, n_outputs)\n Class labels for each data sample.", + "docstring": "Predict the class labels for the provided data.\n\n Parameters\n ----------\n X : array-like of shape (n_queries, n_features), or (n_queries, n_indexed) if metric == 'precomputed'\n Test samples.\n\n Returns\n -------\n y : ndarray of shape (n_queries,) or (n_queries, n_outputs)\n Class labels for each data sample.\n ", "source_code": "\ndef predict(self, X):\n \"\"\"Predict the class labels for the provided data.\n\n Parameters\n ----------\n X : array-like of shape (n_queries, n_features), or (n_queries, n_indexed) if metric == 'precomputed'\n Test samples.\n\n Returns\n -------\n y : ndarray of shape (n_queries,) or (n_queries, n_outputs)\n Class labels for each data sample.\n \"\"\"\n probs = self.predict_proba(X)\n classes_ = self.classes_\n if not self.outputs_2d_:\n probs = [probs]\n classes_ = [self.classes_]\n n_outputs = len(classes_)\n n_queries = probs[0].shape[0]\n y_pred = np.empty((n_queries, n_outputs), dtype=classes_[0].dtype)\n for (k, prob) in enumerate(probs):\n max_prob_index = prob.argmax(axis=1)\n y_pred[:, k] = classes_[k].take(max_prob_index)\n outlier_zero_probs = (prob == 0).all(axis=1)\n if outlier_zero_probs.any():\n zero_prob_index = np.flatnonzero(outlier_zero_probs)\n y_pred[zero_prob_index, k] = self.outlier_label_[k]\n if not self.outputs_2d_:\n y_pred = y_pred.ravel()\n return y_pred" }, { @@ -141796,7 +152632,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -141806,15 +152643,77 @@ "docstring": { "type": "array-like of shape (n_queries, n_features), or (n_queries, n_indexed) if metric == 'precomputed'", "description": "Test samples." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Return probability estimates for the test data X.", - "docstring": "Return probability estimates for the test data X.\n\nParameters\n----------\nX : array-like of shape (n_queries, n_features), or (n_queries, n_indexed) if metric == 'precomputed'\n Test samples.\n\nReturns\n-------\np : ndarray of shape (n_queries, n_classes), or a list of n_outputs of such arrays if n_outputs > 1.\n The class probabilities of the input samples. 
Classes are ordered\n by lexicographic order.", + "docstring": "Return probability estimates for the test data X.\n\n Parameters\n ----------\n X : array-like of shape (n_queries, n_features), or (n_queries, n_indexed) if metric == 'precomputed'\n Test samples.\n\n Returns\n -------\n p : ndarray of shape (n_queries, n_classes), or a list of n_outputs of such arrays if n_outputs > 1.\n The class probabilities of the input samples. Classes are ordered\n by lexicographic order.\n ", "source_code": "\ndef predict_proba(self, X):\n \"\"\"Return probability estimates for the test data X.\n\n Parameters\n ----------\n X : array-like of shape (n_queries, n_features), or (n_queries, n_indexed) if metric == 'precomputed'\n Test samples.\n\n Returns\n -------\n p : ndarray of shape (n_queries, n_classes), or a list of n_outputs of such arrays if n_outputs > 1.\n The class probabilities of the input samples. Classes are ordered\n by lexicographic order.\n \"\"\"\n n_queries = _num_samples(X)\n (neigh_dist, neigh_ind) = self.radius_neighbors(X)\n outlier_mask = np.zeros(n_queries, dtype=bool)\n outlier_mask[:] = [len(nind) == 0 for nind in neigh_ind]\n outliers = np.flatnonzero(outlier_mask)\n inliers = np.flatnonzero(~outlier_mask)\n classes_ = self.classes_\n _y = self._y\n if not self.outputs_2d_:\n _y = self._y.reshape((-1, 1))\n classes_ = [self.classes_]\n if self.outlier_label_ is None and outliers.size > 0:\n raise ValueError('No neighbors found for test samples %r, you can try using larger radius, giving a label for outliers, or considering removing them from your dataset.' % outliers)\n weights = _get_weights(neigh_dist, self.weights)\n if weights is not None:\n weights = weights[inliers]\n probabilities = []\n for (k, classes_k) in enumerate(classes_):\n pred_labels = np.zeros(len(neigh_ind), dtype=object)\n pred_labels[:] = [_y[ind, k] for ind in neigh_ind]\n proba_k = np.zeros((n_queries, classes_k.size))\n proba_inl = np.zeros((len(inliers), classes_k.size))\n if weights is None:\n for (i, idx) in enumerate(pred_labels[inliers]):\n proba_inl[i, :] = np.bincount(idx, minlength=classes_k.size)\n else:\n for (i, idx) in enumerate(pred_labels[inliers]):\n proba_inl[i, :] = np.bincount(idx, weights[i], minlength=classes_k.size)\n proba_k[inliers, :] = proba_inl\n if outliers.size > 0:\n _outlier_label = self.outlier_label_[k]\n label_index = np.flatnonzero(classes_k == _outlier_label)\n if label_index.size == 1:\n proba_k[outliers, label_index[0]] = 1.0\n else:\n warnings.warn('Outlier label {} is not in training classes. 
All class probabilities of outliers will be assigned with 0.'.format(self.outlier_label_[k]))\n normalizer = proba_k.sum(axis=1)[:, np.newaxis]\n normalizer[normalizer == 0.0] = 1.0\n proba_k /= normalizer\n probabilities.append(proba_k)\n if not self.outputs_2d_:\n probabilities = probabilities[0]\n return probabilities" }, + { + "name": "_warn", + "unique_name": "_warn", + "qname": "sklearn.neighbors._distance_metric.DistanceMetric._warn", + "unique_qname": "sklearn.neighbors._distance_metric.DistanceMetric._warn", + "decorators": ["classmethod"], + "parameters": [ + { + "name": "cls", + "default_value": null, + "is_public": false, + "assigned_by": "POSITION_OR_NAME", + "docstring": { + "type": "", + "description": "" + }, + "refined_type": {} + } + ], + "results": [], + "is_public": false, + "description": "", + "docstring": null, + "source_code": "\n@classmethod\ndef _warn(cls):\n warnings.warn('sklearn.neighbors.DistanceMetric has been moved to sklearn.metrics.DistanceMetric in 1.0. This import path will be removed in 1.3', category=FutureWarning)" + }, + { + "name": "get_metric", + "unique_name": "get_metric", + "qname": "sklearn.neighbors._distance_metric.DistanceMetric.get_metric", + "unique_qname": "sklearn.neighbors._distance_metric.DistanceMetric.get_metric", + "decorators": ["classmethod"], + "parameters": [ + { + "name": "cls", + "default_value": null, + "is_public": true, + "assigned_by": "POSITION_OR_NAME", + "docstring": { + "type": "", + "description": "" + }, + "refined_type": {} + }, + { + "name": "metric", + "default_value": null, + "is_public": true, + "assigned_by": "POSITION_OR_NAME", + "docstring": { + "type": "", + "description": "" + }, + "refined_type": {} + } + ], + "results": [], + "is_public": true, + "description": "", + "docstring": null, + "source_code": "\n@classmethod\ndef get_metric(cls, metric, **kwargs):\n DistanceMetric._warn()\n return _DistanceMetric.get_metric(metric, **kwargs)" + }, { "name": "__init__", "unique_name": "__init__", @@ -141830,7 +152729,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "mode", @@ -141840,6 +152740,10 @@ "docstring": { "type": "{'distance', 'connectivity'}, default='distance'", "description": "Type of returned matrix: 'connectivity' will return the connectivity\nmatrix with ones and zeros, and 'distance' will return the distances\nbetween neighbors according to the given metric." + }, + "refined_type": { + "kind": "EnumType", + "values": ["distance", "connectivity"] } }, { @@ -141850,7 +152754,8 @@ "docstring": { "type": "int, default=5", "description": "Number of neighbors for each sample in the transformed sparse graph.\nFor compatibility reasons, as each sample is considered as its own\nneighbor, one extra neighbor will be computed when mode == 'distance'.\nIn this case, the sparse graph contains (n_neighbors + 1) neighbors." - } + }, + "refined_type": {} }, { "name": "algorithm", @@ -141860,6 +152765,10 @@ "docstring": { "type": "{'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto'", "description": "Algorithm used to compute the nearest neighbors:\n\n- 'ball_tree' will use :class:`BallTree`\n- 'kd_tree' will use :class:`KDTree`\n- 'brute' will use a brute-force search.\n- 'auto' will attempt to decide the most appropriate algorithm\n based on the values passed to :meth:`fit` method.\n\nNote: fitting on sparse input will override the setting of\nthis parameter, using brute force." 
+ }, + "refined_type": { + "kind": "EnumType", + "values": ["auto", "kd_tree", "brute", "ball_tree"] } }, { @@ -141870,7 +152779,8 @@ "docstring": { "type": "int, default=30", "description": "Leaf size passed to BallTree or KDTree. This can affect the\nspeed of the construction and query, as well as the memory\nrequired to store the tree. The optimal value depends on the\nnature of the problem." - } + }, + "refined_type": {} }, { "name": "metric", @@ -141880,7 +152790,8 @@ "docstring": { "type": "str or callable, default='minkowski'", "description": "Metric to use for distance computation. Any metric from scikit-learn\nor scipy.spatial.distance can be used.\n\nIf metric is a callable function, it is called on each\npair of instances (rows) and the resulting value recorded. The callable\nshould take two arrays as input and return one value indicating the\ndistance between them. This works for Scipy's metrics, but is less\nefficient than passing the metric name as a string.\n\nDistance matrices are not supported.\n\nValid values for metric are:\n\n- from scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2',\n 'manhattan']\n\n- from scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev',\n 'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski',\n 'mahalanobis', 'minkowski', 'rogerstanimoto', 'russellrao',\n 'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean',\n 'yule']\n\nSee the documentation for scipy.spatial.distance for details on these\nmetrics." - } + }, + "refined_type": {} }, { "name": "p", @@ -141890,7 +152801,8 @@ "docstring": { "type": "int, default=2", "description": "Parameter for the Minkowski metric from\nsklearn.metrics.pairwise.pairwise_distances. When p = 1, this is\nequivalent to using manhattan_distance (l1), and euclidean_distance\n(l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used." - } + }, + "refined_type": {} }, { "name": "metric_params", @@ -141900,7 +152812,8 @@ "docstring": { "type": "dict, default=None", "description": "Additional keyword arguments for the metric function." - } + }, + "refined_type": {} }, { "name": "n_jobs", @@ -141910,13 +152823,14 @@ "docstring": { "type": "int, default=1", "description": "The number of parallel jobs to run for neighbors search.\nIf ``-1``, then the number of jobs is set to the number of CPU cores." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, *, mode='distance', n_neighbors=5, algorithm='auto', leaf_size=30, metric='minkowski', p=2, metric_params=None, n_jobs=1):\n super(KNeighborsTransformer, self).__init__(n_neighbors=n_neighbors, radius=None, algorithm=algorithm, leaf_size=leaf_size, metric=metric, p=p, metric_params=metric_params, n_jobs=n_jobs)\n self.mode = mode" }, { @@ -141934,13 +152848,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _more_tags(self):\n return {'_xfail_checks': {'check_methods_sample_order_invariance': 'check is not applicable.'}}" }, { @@ -141958,7 +152873,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -141968,6 +152884,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features) or (n_samples, n_samples) if metric='precomputed'", "description": "Training data." 
+ }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -141978,13 +152898,14 @@ "docstring": { "type": "Ignored", "description": "Not used, present for API consistency by convention." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Fit the k-nearest neighbors transformer from the training dataset.", - "docstring": "Fit the k-nearest neighbors transformer from the training dataset.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features) or (n_samples, n_samples) if metric='precomputed'\n Training data.\ny : Ignored\n Not used, present for API consistency by convention.\n\nReturns\n-------\nself : KNeighborsTransformer\n The fitted k-nearest neighbors transformer.", + "docstring": "Fit the k-nearest neighbors transformer from the training dataset.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features) or (n_samples, n_samples) if metric='precomputed'\n Training data.\n y : Ignored\n Not used, present for API consistency by convention.\n\n Returns\n -------\n self : KNeighborsTransformer\n The fitted k-nearest neighbors transformer.\n ", "source_code": "\ndef fit(self, X, y=None):\n \"\"\"Fit the k-nearest neighbors transformer from the training dataset.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features) or (n_samples, n_samples) if metric='precomputed'\n Training data.\n y : Ignored\n Not used, present for API consistency by convention.\n\n Returns\n -------\n self : KNeighborsTransformer\n The fitted k-nearest neighbors transformer.\n \"\"\"\n return self._fit(X)" }, { @@ -142002,7 +152923,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -142012,7 +152934,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "Training set." - } + }, + "refined_type": {} }, { "name": "y", @@ -142022,13 +152945,14 @@ "docstring": { "type": "Ignored", "description": "Not used, present for API consistency by convention." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Fit to data, then transform it.\n\nFits transformer to X and y with optional parameters fit_params and returns a transformed version of X.", - "docstring": "Fit to data, then transform it.\n\nFits transformer to X and y with optional parameters fit_params\nand returns a transformed version of X.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n Training set.\n\ny : Ignored\n Not used, present for API consistency by convention.\n\nReturns\n-------\nXt : sparse matrix of shape (n_samples, n_samples)\n Xt[i, j] is assigned the weight of edge that connects i to j.\n Only the neighbors have an explicit value.\n The diagonal is always explicit.\n The matrix is of CSR format.", + "description": "Fit to data, then transform it.\n\nFits transformer to X and y with optional parameters fit_params\nand returns a transformed version of X.", + "docstring": "Fit to data, then transform it.\n\n Fits transformer to X and y with optional parameters fit_params\n and returns a transformed version of X.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training set.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n Returns\n -------\n Xt : sparse matrix of shape (n_samples, n_samples)\n Xt[i, j] is assigned the weight of edge that connects i to j.\n Only the neighbors have an explicit value.\n The diagonal is always explicit.\n The matrix is of CSR format.\n ", "source_code": "\ndef fit_transform(self, X, y=None):\n \"\"\"Fit to data, then transform it.\n\n Fits transformer to X and y with optional parameters fit_params\n and returns a transformed version of X.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training set.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n Returns\n -------\n Xt : sparse matrix of shape (n_samples, n_samples)\n Xt[i, j] is assigned the weight of edge that connects i to j.\n Only the neighbors have an explicit value.\n The diagonal is always explicit.\n The matrix is of CSR format.\n \"\"\"\n return self.fit(X).transform(X)" }, { @@ -142046,7 +152970,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -142056,13 +152981,14 @@ "docstring": { "type": "array-like of shape (n_samples_transform, n_features)", "description": "Sample data." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Compute the (weighted) graph of Neighbors for points in X.", - "docstring": "Compute the (weighted) graph of Neighbors for points in X.\n\nParameters\n----------\nX : array-like of shape (n_samples_transform, n_features)\n Sample data.\n\nReturns\n-------\nXt : sparse matrix of shape (n_samples_transform, n_samples_fit)\n Xt[i, j] is assigned the weight of edge that connects i to j.\n Only the neighbors have an explicit value.\n The diagonal is always explicit.\n The matrix is of CSR format.", + "docstring": "Compute the (weighted) graph of Neighbors for points in X.\n\n Parameters\n ----------\n X : array-like of shape (n_samples_transform, n_features)\n Sample data.\n\n Returns\n -------\n Xt : sparse matrix of shape (n_samples_transform, n_samples_fit)\n Xt[i, j] is assigned the weight of edge that connects i to j.\n Only the neighbors have an explicit value.\n The diagonal is always explicit.\n The matrix is of CSR format.\n ", "source_code": "\ndef transform(self, X):\n \"\"\"Compute the (weighted) graph of Neighbors for points in X.\n\n Parameters\n ----------\n X : array-like of shape (n_samples_transform, n_features)\n Sample data.\n\n Returns\n -------\n Xt : sparse matrix of shape (n_samples_transform, n_samples_fit)\n Xt[i, j] is assigned the weight of edge that connects i to j.\n Only the neighbors have an explicit value.\n The diagonal is always explicit.\n The matrix is of CSR format.\n \"\"\"\n check_is_fitted(self)\n add_one = self.mode == 'distance'\n return self.kneighbors_graph(X, mode=self.mode, n_neighbors=self.n_neighbors + add_one)" }, { @@ -142080,7 +153006,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "mode", @@ -142090,6 +153017,10 @@ "docstring": { "type": "{'distance', 'connectivity'}, default='distance'", "description": "Type of returned matrix: 'connectivity' will return the connectivity\nmatrix with ones and zeros, and 'distance' will return the distances\nbetween neighbors according to the given metric." + }, + "refined_type": { + "kind": "EnumType", + "values": ["distance", "connectivity"] } }, { @@ -142100,7 +153031,8 @@ "docstring": { "type": "float, default=1.0", "description": "Radius of neighborhood in the transformed sparse graph." - } + }, + "refined_type": {} }, { "name": "algorithm", @@ -142110,6 +153042,10 @@ "docstring": { "type": "{'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto'", "description": "Algorithm used to compute the nearest neighbors:\n\n- 'ball_tree' will use :class:`BallTree`\n- 'kd_tree' will use :class:`KDTree`\n- 'brute' will use a brute-force search.\n- 'auto' will attempt to decide the most appropriate algorithm\n based on the values passed to :meth:`fit` method.\n\nNote: fitting on sparse input will override the setting of\nthis parameter, using brute force." + }, + "refined_type": { + "kind": "EnumType", + "values": ["auto", "kd_tree", "brute", "ball_tree"] } }, { @@ -142120,7 +153056,8 @@ "docstring": { "type": "int, default=30", "description": "Leaf size passed to BallTree or KDTree. This can affect the\nspeed of the construction and query, as well as the memory\nrequired to store the tree. The optimal value depends on the\nnature of the problem." - } + }, + "refined_type": {} }, { "name": "metric", @@ -142130,7 +153067,8 @@ "docstring": { "type": "str or callable, default='minkowski'", "description": "Metric to use for distance computation. 
Any metric from scikit-learn\nor scipy.spatial.distance can be used.\n\nIf metric is a callable function, it is called on each\npair of instances (rows) and the resulting value recorded. The callable\nshould take two arrays as input and return one value indicating the\ndistance between them. This works for Scipy's metrics, but is less\nefficient than passing the metric name as a string.\n\nDistance matrices are not supported.\n\nValid values for metric are:\n\n- from scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2',\n 'manhattan']\n\n- from scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev',\n 'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski',\n 'mahalanobis', 'minkowski', 'rogerstanimoto', 'russellrao',\n 'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean',\n 'yule']\n\nSee the documentation for scipy.spatial.distance for details on these\nmetrics." - } + }, + "refined_type": {} }, { "name": "p", @@ -142140,7 +153078,8 @@ "docstring": { "type": "int, default=2", "description": "Parameter for the Minkowski metric from\nsklearn.metrics.pairwise.pairwise_distances. When p = 1, this is\nequivalent to using manhattan_distance (l1), and euclidean_distance\n(l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used." - } + }, + "refined_type": {} }, { "name": "metric_params", @@ -142150,7 +153089,8 @@ "docstring": { "type": "dict, default=None", "description": "Additional keyword arguments for the metric function." - } + }, + "refined_type": {} }, { "name": "n_jobs", @@ -142160,13 +153100,14 @@ "docstring": { "type": "int, default=1", "description": "The number of parallel jobs to run for neighbors search.\nIf ``-1``, then the number of jobs is set to the number of CPU cores." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, *, mode='distance', radius=1.0, algorithm='auto', leaf_size=30, metric='minkowski', p=2, metric_params=None, n_jobs=1):\n super(RadiusNeighborsTransformer, self).__init__(n_neighbors=None, radius=radius, algorithm=algorithm, leaf_size=leaf_size, metric=metric, p=p, metric_params=metric_params, n_jobs=n_jobs)\n self.mode = mode" }, { @@ -142184,13 +153125,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _more_tags(self):\n return {'_xfail_checks': {'check_methods_sample_order_invariance': 'check is not applicable.'}}" }, { @@ -142208,7 +153150,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -142218,6 +153161,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features) or (n_samples, n_samples) if metric='precomputed'", "description": "Training data." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -142228,13 +153175,14 @@ "docstring": { "type": "Ignored", "description": "Not used, present for API consistency by convention." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Fit the radius neighbors transformer from the training dataset.", - "docstring": "Fit the radius neighbors transformer from the training dataset.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features) or (n_samples, n_samples) if metric='precomputed'\n Training data.\n\ny : Ignored\n Not used, present for API consistency by convention.\n\nReturns\n-------\nself : RadiusNeighborsTransformer\n The fitted radius neighbors transformer.", + "docstring": "Fit the radius neighbors transformer from the training dataset.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features) or (n_samples, n_samples) if metric='precomputed'\n Training data.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n Returns\n -------\n self : RadiusNeighborsTransformer\n The fitted radius neighbors transformer.\n ", "source_code": "\ndef fit(self, X, y=None):\n \"\"\"Fit the radius neighbors transformer from the training dataset.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features) or (n_samples, n_samples) if metric='precomputed'\n Training data.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n Returns\n -------\n self : RadiusNeighborsTransformer\n The fitted radius neighbors transformer.\n \"\"\"\n return self._fit(X)" }, { @@ -142252,7 +153200,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -142262,7 +153211,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "Training set." - } + }, + "refined_type": {} }, { "name": "y", @@ -142272,13 +153222,14 @@ "docstring": { "type": "Ignored", "description": "Not used, present for API consistency by convention." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Fit to data, then transform it.\n\nFits transformer to X and y with optional parameters fit_params and returns a transformed version of X.", - "docstring": "Fit to data, then transform it.\n\nFits transformer to X and y with optional parameters fit_params\nand returns a transformed version of X.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n Training set.\n\ny : Ignored\n Not used, present for API consistency by convention.\n\nReturns\n-------\nXt : sparse matrix of shape (n_samples, n_samples)\n Xt[i, j] is assigned the weight of edge that connects i to j.\n Only the neighbors have an explicit value.\n The diagonal is always explicit.\n The matrix is of CSR format.", + "description": "Fit to data, then transform it.\n\nFits transformer to X and y with optional parameters fit_params\nand returns a transformed version of X.", + "docstring": "Fit to data, then transform it.\n\n Fits transformer to X and y with optional parameters fit_params\n and returns a transformed version of X.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training set.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n Returns\n -------\n Xt : sparse matrix of shape (n_samples, n_samples)\n Xt[i, j] is assigned the weight of edge that connects i to j.\n Only the neighbors have an explicit value.\n The diagonal is always explicit.\n The matrix is of CSR format.\n ", "source_code": "\ndef fit_transform(self, X, y=None):\n \"\"\"Fit to data, then transform it.\n\n Fits transformer to X and y with optional parameters fit_params\n and returns a transformed version of X.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training set.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n Returns\n -------\n Xt : sparse matrix of shape (n_samples, n_samples)\n Xt[i, j] is assigned the weight of edge that connects i to j.\n Only the neighbors have an explicit value.\n The diagonal is always explicit.\n The matrix is of CSR format.\n \"\"\"\n return self.fit(X).transform(X)" }, { @@ -142296,7 +153247,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -142306,13 +153258,14 @@ "docstring": { "type": "array-like of shape (n_samples_transform, n_features)", "description": "Sample data." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Compute the (weighted) graph of Neighbors for points in X.", - "docstring": "Compute the (weighted) graph of Neighbors for points in X.\n\nParameters\n----------\nX : array-like of shape (n_samples_transform, n_features)\n Sample data.\n\nReturns\n-------\nXt : sparse matrix of shape (n_samples_transform, n_samples_fit)\n Xt[i, j] is assigned the weight of edge that connects i to j.\n Only the neighbors have an explicit value.\n The diagonal is always explicit.\n The matrix is of CSR format.", + "docstring": "Compute the (weighted) graph of Neighbors for points in X.\n\n Parameters\n ----------\n X : array-like of shape (n_samples_transform, n_features)\n Sample data.\n\n Returns\n -------\n Xt : sparse matrix of shape (n_samples_transform, n_samples_fit)\n Xt[i, j] is assigned the weight of edge that connects i to j.\n Only the neighbors have an explicit value.\n The diagonal is always explicit.\n The matrix is of CSR format.\n ", "source_code": "\ndef transform(self, X):\n \"\"\"Compute the (weighted) graph of Neighbors for points in X.\n\n Parameters\n ----------\n X : array-like of shape (n_samples_transform, n_features)\n Sample data.\n\n Returns\n -------\n Xt : sparse matrix of shape (n_samples_transform, n_samples_fit)\n Xt[i, j] is assigned the weight of edge that connects i to j.\n Only the neighbors have an explicit value.\n The diagonal is always explicit.\n The matrix is of CSR format.\n \"\"\"\n check_is_fitted(self)\n return self.radius_neighbors_graph(X, mode=self.mode, sort_results=True)" }, { @@ -142330,7 +153283,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "metric", @@ -142340,7 +153294,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "p", @@ -142350,7 +153305,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "metric_params", @@ -142360,7 +153316,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -142384,7 +153341,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "include_self", @@ -142394,7 +153352,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "mode", @@ -142404,7 +153363,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -142428,7 +153388,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features) or BallTree", "description": "Sample data, in the form of a numpy array or a precomputed\n:class:`BallTree`." - } + }, + "refined_type": {} }, { "name": "n_neighbors", @@ -142438,7 +153399,8 @@ "docstring": { "type": "int", "description": "Number of neighbors for each sample." - } + }, + "refined_type": {} }, { "name": "mode", @@ -142448,6 +153410,10 @@ "docstring": { "type": "{'connectivity', 'distance'}, default='connectivity'", "description": "Type of returned matrix: 'connectivity' will return the connectivity\nmatrix with ones and zeros, and 'distance' will return the distances\nbetween neighbors according to the given metric." + }, + "refined_type": { + "kind": "EnumType", + "values": ["distance", "connectivity"] } }, { @@ -142457,8 +153423,9 @@ "assigned_by": "NAME_ONLY", "docstring": { "type": "str, default='minkowski'", - "description": "The distance metric used to calculate the k-Neighbors for each sample\npoint. 
The DistanceMetric class gives a list of available metrics.\nThe default distance is 'euclidean' ('minkowski' metric with the p\nparam equal to 2.)" - } + "description": "The distance metric to use for the tree. The default metric is\nminkowski, and with p=2 is equivalent to the standard Euclidean\nmetric.\nFor a list of available metrics, see the documentation of\n:class:`~sklearn.metrics.DistanceMetric`." + }, + "refined_type": {} }, { "name": "p", @@ -142468,7 +153435,8 @@ "docstring": { "type": "int, default=2", "description": "Power parameter for the Minkowski metric. When p = 1, this is\nequivalent to using manhattan_distance (l1), and euclidean_distance\n(l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used." - } + }, + "refined_type": {} }, { "name": "metric_params", @@ -142478,7 +153446,8 @@ "docstring": { "type": "dict, default=None", "description": "additional keyword arguments for the metric function." - } + }, + "refined_type": {} }, { "name": "include_self", @@ -142488,7 +153457,8 @@ "docstring": { "type": "bool or 'auto', default=False", "description": "Whether or not to mark each sample as the first nearest neighbor to\nitself. If 'auto', then True is used for mode='connectivity' and False\nfor mode='distance'." - } + }, + "refined_type": {} }, { "name": "n_jobs", @@ -142498,14 +153468,15 @@ "docstring": { "type": "int, default=None", "description": "The number of parallel jobs to run for neighbors search.\n``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n``-1`` means using all processors. See :term:`Glossary `\nfor more details." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Computes the (weighted) graph of k-Neighbors for points in X\n\nRead more in the :ref:`User Guide `.", - "docstring": "Computes the (weighted) graph of k-Neighbors for points in X\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features) or BallTree\n Sample data, in the form of a numpy array or a precomputed\n :class:`BallTree`.\n\nn_neighbors : int\n Number of neighbors for each sample.\n\nmode : {'connectivity', 'distance'}, default='connectivity'\n Type of returned matrix: 'connectivity' will return the connectivity\n matrix with ones and zeros, and 'distance' will return the distances\n between neighbors according to the given metric.\n\nmetric : str, default='minkowski'\n The distance metric used to calculate the k-Neighbors for each sample\n point. The DistanceMetric class gives a list of available metrics.\n The default distance is 'euclidean' ('minkowski' metric with the p\n param equal to 2.)\n\np : int, default=2\n Power parameter for the Minkowski metric. When p = 1, this is\n equivalent to using manhattan_distance (l1), and euclidean_distance\n (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.\n\nmetric_params : dict, default=None\n additional keyword arguments for the metric function.\n\ninclude_self : bool or 'auto', default=False\n Whether or not to mark each sample as the first nearest neighbor to\n itself. If 'auto', then True is used for mode='connectivity' and False\n for mode='distance'.\n\nn_jobs : int, default=None\n The number of parallel jobs to run for neighbors search.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. 
See :term:`Glossary `\n for more details.\n\nReturns\n-------\nA : sparse matrix of shape (n_samples, n_samples)\n Graph where A[i, j] is assigned the weight of edge that\n connects i to j. The matrix is of CSR format.\n\nExamples\n--------\n>>> X = [[0], [3], [1]]\n>>> from sklearn.neighbors import kneighbors_graph\n>>> A = kneighbors_graph(X, 2, mode='connectivity', include_self=True)\n>>> A.toarray()\narray([[1., 0., 1.],\n [0., 1., 1.],\n [1., 0., 1.]])\n\nSee Also\n--------\nradius_neighbors_graph", - "source_code": "\ndef kneighbors_graph(X, n_neighbors, *, mode='connectivity', metric='minkowski', p=2, metric_params=None, include_self=False, n_jobs=None):\n \"\"\"Computes the (weighted) graph of k-Neighbors for points in X\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features) or BallTree\n Sample data, in the form of a numpy array or a precomputed\n :class:`BallTree`.\n\n n_neighbors : int\n Number of neighbors for each sample.\n\n mode : {'connectivity', 'distance'}, default='connectivity'\n Type of returned matrix: 'connectivity' will return the connectivity\n matrix with ones and zeros, and 'distance' will return the distances\n between neighbors according to the given metric.\n\n metric : str, default='minkowski'\n The distance metric used to calculate the k-Neighbors for each sample\n point. The DistanceMetric class gives a list of available metrics.\n The default distance is 'euclidean' ('minkowski' metric with the p\n param equal to 2.)\n\n p : int, default=2\n Power parameter for the Minkowski metric. When p = 1, this is\n equivalent to using manhattan_distance (l1), and euclidean_distance\n (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.\n\n metric_params : dict, default=None\n additional keyword arguments for the metric function.\n\n include_self : bool or 'auto', default=False\n Whether or not to mark each sample as the first nearest neighbor to\n itself. If 'auto', then True is used for mode='connectivity' and False\n for mode='distance'.\n\n n_jobs : int, default=None\n The number of parallel jobs to run for neighbors search.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n Returns\n -------\n A : sparse matrix of shape (n_samples, n_samples)\n Graph where A[i, j] is assigned the weight of edge that\n connects i to j. 
The matrix is of CSR format.\n\n Examples\n --------\n >>> X = [[0], [3], [1]]\n >>> from sklearn.neighbors import kneighbors_graph\n >>> A = kneighbors_graph(X, 2, mode='connectivity', include_self=True)\n >>> A.toarray()\n array([[1., 0., 1.],\n [0., 1., 1.],\n [1., 0., 1.]])\n\n See Also\n --------\n radius_neighbors_graph\n \"\"\"\n if not isinstance(X, KNeighborsMixin):\n X = NearestNeighbors(n_neighbors=n_neighbors, metric=metric, p=p, metric_params=metric_params, n_jobs=n_jobs).fit(X)\n else:\n _check_params(X, metric, p, metric_params)\n query = _query_include_self(X._fit_X, include_self, mode)\n return X.kneighbors_graph(X=query, n_neighbors=n_neighbors, mode=mode)" + "docstring": "Computes the (weighted) graph of k-Neighbors for points in X\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features) or BallTree\n Sample data, in the form of a numpy array or a precomputed\n :class:`BallTree`.\n\n n_neighbors : int\n Number of neighbors for each sample.\n\n mode : {'connectivity', 'distance'}, default='connectivity'\n Type of returned matrix: 'connectivity' will return the connectivity\n matrix with ones and zeros, and 'distance' will return the distances\n between neighbors according to the given metric.\n\n metric : str, default='minkowski'\n The distance metric to use for the tree. The default metric is\n minkowski, and with p=2 is equivalent to the standard Euclidean\n metric.\n For a list of available metrics, see the documentation of\n :class:`~sklearn.metrics.DistanceMetric`.\n\n p : int, default=2\n Power parameter for the Minkowski metric. When p = 1, this is\n equivalent to using manhattan_distance (l1), and euclidean_distance\n (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.\n\n metric_params : dict, default=None\n additional keyword arguments for the metric function.\n\n include_self : bool or 'auto', default=False\n Whether or not to mark each sample as the first nearest neighbor to\n itself. If 'auto', then True is used for mode='connectivity' and False\n for mode='distance'.\n\n n_jobs : int, default=None\n The number of parallel jobs to run for neighbors search.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n Returns\n -------\n A : sparse matrix of shape (n_samples, n_samples)\n Graph where A[i, j] is assigned the weight of edge that\n connects i to j. 
The matrix is of CSR format.\n\n Examples\n --------\n >>> X = [[0], [3], [1]]\n >>> from sklearn.neighbors import kneighbors_graph\n >>> A = kneighbors_graph(X, 2, mode='connectivity', include_self=True)\n >>> A.toarray()\n array([[1., 0., 1.],\n [0., 1., 1.],\n [1., 0., 1.]])\n\n See Also\n --------\n radius_neighbors_graph\n ", + "source_code": "\ndef kneighbors_graph(X, n_neighbors, *, mode='connectivity', metric='minkowski', p=2, metric_params=None, include_self=False, n_jobs=None):\n \"\"\"Computes the (weighted) graph of k-Neighbors for points in X\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features) or BallTree\n Sample data, in the form of a numpy array or a precomputed\n :class:`BallTree`.\n\n n_neighbors : int\n Number of neighbors for each sample.\n\n mode : {'connectivity', 'distance'}, default='connectivity'\n Type of returned matrix: 'connectivity' will return the connectivity\n matrix with ones and zeros, and 'distance' will return the distances\n between neighbors according to the given metric.\n\n metric : str, default='minkowski'\n The distance metric to use for the tree. The default metric is\n minkowski, and with p=2 is equivalent to the standard Euclidean\n metric.\n For a list of available metrics, see the documentation of\n :class:`~sklearn.metrics.DistanceMetric`.\n\n p : int, default=2\n Power parameter for the Minkowski metric. When p = 1, this is\n equivalent to using manhattan_distance (l1), and euclidean_distance\n (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.\n\n metric_params : dict, default=None\n additional keyword arguments for the metric function.\n\n include_self : bool or 'auto', default=False\n Whether or not to mark each sample as the first nearest neighbor to\n itself. If 'auto', then True is used for mode='connectivity' and False\n for mode='distance'.\n\n n_jobs : int, default=None\n The number of parallel jobs to run for neighbors search.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n Returns\n -------\n A : sparse matrix of shape (n_samples, n_samples)\n Graph where A[i, j] is assigned the weight of edge that\n connects i to j. The matrix is of CSR format.\n\n Examples\n --------\n >>> X = [[0], [3], [1]]\n >>> from sklearn.neighbors import kneighbors_graph\n >>> A = kneighbors_graph(X, 2, mode='connectivity', include_self=True)\n >>> A.toarray()\n array([[1., 0., 1.],\n [0., 1., 1.],\n [1., 0., 1.]])\n\n See Also\n --------\n radius_neighbors_graph\n \"\"\"\n if not isinstance(X, KNeighborsMixin):\n X = NearestNeighbors(n_neighbors=n_neighbors, metric=metric, p=p, metric_params=metric_params, n_jobs=n_jobs).fit(X)\n else:\n _check_params(X, metric, p, metric_params)\n query = _query_include_self(X._fit_X, include_self, mode)\n return X.kneighbors_graph(X=query, n_neighbors=n_neighbors, mode=mode)" }, { "name": "radius_neighbors_graph", @@ -142522,7 +153493,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features) or BallTree", "description": "Sample data, in the form of a numpy array or a precomputed\n:class:`BallTree`." - } + }, + "refined_type": {} }, { "name": "radius", @@ -142532,7 +153504,8 @@ "docstring": { "type": "float", "description": "Radius of neighborhoods." 
- } + }, + "refined_type": {} }, { "name": "mode", @@ -142542,6 +153515,10 @@ "docstring": { "type": "{'connectivity', 'distance'}, default='connectivity'", "description": "Type of returned matrix: 'connectivity' will return the connectivity\nmatrix with ones and zeros, and 'distance' will return the distances\nbetween neighbors according to the given metric." + }, + "refined_type": { + "kind": "EnumType", + "values": ["distance", "connectivity"] } }, { @@ -142551,8 +153528,9 @@ "assigned_by": "NAME_ONLY", "docstring": { "type": "str, default='minkowski'", - "description": "The distance metric used to calculate the neighbors within a\ngiven radius for each sample point. The DistanceMetric class\ngives a list of available metrics. The default distance is\n'euclidean' ('minkowski' metric with the param equal to 2.)" - } + "description": "The distance metric to use for the tree. The default metric is\nminkowski, and with p=2 is equivalent to the standard Euclidean\nmetric.\nFor a list of available metrics, see the documentation of\n:class:`~sklearn.metrics.DistanceMetric`." + }, + "refined_type": {} }, { "name": "p", @@ -142562,7 +153540,8 @@ "docstring": { "type": "int, default=2", "description": "Power parameter for the Minkowski metric. When p = 1, this is\nequivalent to using manhattan_distance (l1), and euclidean_distance\n(l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used." - } + }, + "refined_type": {} }, { "name": "metric_params", @@ -142572,7 +153551,8 @@ "docstring": { "type": "dict, default=None", "description": "additional keyword arguments for the metric function." - } + }, + "refined_type": {} }, { "name": "include_self", @@ -142582,7 +153562,8 @@ "docstring": { "type": "bool or 'auto', default=False", "description": "Whether or not to mark each sample as the first nearest neighbor to\nitself. If 'auto', then True is used for mode='connectivity' and False\nfor mode='distance'." - } + }, + "refined_type": {} }, { "name": "n_jobs", @@ -142592,14 +153573,15 @@ "docstring": { "type": "int, default=None", "description": "The number of parallel jobs to run for neighbors search.\n``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n``-1`` means using all processors. See :term:`Glossary `\nfor more details." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Computes the (weighted) graph of Neighbors for points in X\n\nNeighborhoods are restricted the points at a distance lower than radius. Read more in the :ref:`User Guide `.", - "docstring": "Computes the (weighted) graph of Neighbors for points in X\n\nNeighborhoods are restricted the points at a distance lower than\nradius.\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features) or BallTree\n Sample data, in the form of a numpy array or a precomputed\n :class:`BallTree`.\n\nradius : float\n Radius of neighborhoods.\n\nmode : {'connectivity', 'distance'}, default='connectivity'\n Type of returned matrix: 'connectivity' will return the connectivity\n matrix with ones and zeros, and 'distance' will return the distances\n between neighbors according to the given metric.\n\nmetric : str, default='minkowski'\n The distance metric used to calculate the neighbors within a\n given radius for each sample point. The DistanceMetric class\n gives a list of available metrics. 
The default distance is\n 'euclidean' ('minkowski' metric with the param equal to 2.)\n\np : int, default=2\n Power parameter for the Minkowski metric. When p = 1, this is\n equivalent to using manhattan_distance (l1), and euclidean_distance\n (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.\n\nmetric_params : dict, default=None\n additional keyword arguments for the metric function.\n\ninclude_self : bool or 'auto', default=False\n Whether or not to mark each sample as the first nearest neighbor to\n itself. If 'auto', then True is used for mode='connectivity' and False\n for mode='distance'.\n\nn_jobs : int, default=None\n The number of parallel jobs to run for neighbors search.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\nReturns\n-------\nA : sparse matrix of shape (n_samples, n_samples)\n Graph where A[i, j] is assigned the weight of edge that connects\n i to j. The matrix is of CSR format.\n\nExamples\n--------\n>>> X = [[0], [3], [1]]\n>>> from sklearn.neighbors import radius_neighbors_graph\n>>> A = radius_neighbors_graph(X, 1.5, mode='connectivity',\n... include_self=True)\n>>> A.toarray()\narray([[1., 0., 1.],\n [0., 1., 0.],\n [1., 0., 1.]])\n\nSee Also\n--------\nkneighbors_graph", - "source_code": "\ndef radius_neighbors_graph(X, radius, *, mode='connectivity', metric='minkowski', p=2, metric_params=None, include_self=False, n_jobs=None):\n \"\"\"Computes the (weighted) graph of Neighbors for points in X\n\n Neighborhoods are restricted the points at a distance lower than\n radius.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features) or BallTree\n Sample data, in the form of a numpy array or a precomputed\n :class:`BallTree`.\n\n radius : float\n Radius of neighborhoods.\n\n mode : {'connectivity', 'distance'}, default='connectivity'\n Type of returned matrix: 'connectivity' will return the connectivity\n matrix with ones and zeros, and 'distance' will return the distances\n between neighbors according to the given metric.\n\n metric : str, default='minkowski'\n The distance metric used to calculate the neighbors within a\n given radius for each sample point. The DistanceMetric class\n gives a list of available metrics. The default distance is\n 'euclidean' ('minkowski' metric with the param equal to 2.)\n\n p : int, default=2\n Power parameter for the Minkowski metric. When p = 1, this is\n equivalent to using manhattan_distance (l1), and euclidean_distance\n (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.\n\n metric_params : dict, default=None\n additional keyword arguments for the metric function.\n\n include_self : bool or 'auto', default=False\n Whether or not to mark each sample as the first nearest neighbor to\n itself. If 'auto', then True is used for mode='connectivity' and False\n for mode='distance'.\n\n n_jobs : int, default=None\n The number of parallel jobs to run for neighbors search.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n Returns\n -------\n A : sparse matrix of shape (n_samples, n_samples)\n Graph where A[i, j] is assigned the weight of edge that connects\n i to j. 
The matrix is of CSR format.\n\n Examples\n --------\n >>> X = [[0], [3], [1]]\n >>> from sklearn.neighbors import radius_neighbors_graph\n >>> A = radius_neighbors_graph(X, 1.5, mode='connectivity',\n ... include_self=True)\n >>> A.toarray()\n array([[1., 0., 1.],\n [0., 1., 0.],\n [1., 0., 1.]])\n\n See Also\n --------\n kneighbors_graph\n \"\"\"\n if not isinstance(X, RadiusNeighborsMixin):\n X = NearestNeighbors(radius=radius, metric=metric, p=p, metric_params=metric_params, n_jobs=n_jobs).fit(X)\n else:\n _check_params(X, metric, p, metric_params)\n query = _query_include_self(X._fit_X, include_self, mode)\n return X.radius_neighbors_graph(query, radius, mode)" + "description": "Computes the (weighted) graph of Neighbors for points in X\n\nNeighborhoods are restricted the points at a distance lower than\nradius.\n\nRead more in the :ref:`User Guide `.", + "docstring": "Computes the (weighted) graph of Neighbors for points in X\n\n Neighborhoods are restricted the points at a distance lower than\n radius.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features) or BallTree\n Sample data, in the form of a numpy array or a precomputed\n :class:`BallTree`.\n\n radius : float\n Radius of neighborhoods.\n\n mode : {'connectivity', 'distance'}, default='connectivity'\n Type of returned matrix: 'connectivity' will return the connectivity\n matrix with ones and zeros, and 'distance' will return the distances\n between neighbors according to the given metric.\n\n metric : str, default='minkowski'\n The distance metric to use for the tree. The default metric is\n minkowski, and with p=2 is equivalent to the standard Euclidean\n metric.\n For a list of available metrics, see the documentation of\n :class:`~sklearn.metrics.DistanceMetric`.\n\n p : int, default=2\n Power parameter for the Minkowski metric. When p = 1, this is\n equivalent to using manhattan_distance (l1), and euclidean_distance\n (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.\n\n metric_params : dict, default=None\n additional keyword arguments for the metric function.\n\n include_self : bool or 'auto', default=False\n Whether or not to mark each sample as the first nearest neighbor to\n itself. If 'auto', then True is used for mode='connectivity' and False\n for mode='distance'.\n\n n_jobs : int, default=None\n The number of parallel jobs to run for neighbors search.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n Returns\n -------\n A : sparse matrix of shape (n_samples, n_samples)\n Graph where A[i, j] is assigned the weight of edge that connects\n i to j. The matrix is of CSR format.\n\n Examples\n --------\n >>> X = [[0], [3], [1]]\n >>> from sklearn.neighbors import radius_neighbors_graph\n >>> A = radius_neighbors_graph(X, 1.5, mode='connectivity',\n ... 
include_self=True)\n >>> A.toarray()\n array([[1., 0., 1.],\n [0., 1., 0.],\n [1., 0., 1.]])\n\n See Also\n --------\n kneighbors_graph\n ", + "source_code": "\ndef radius_neighbors_graph(X, radius, *, mode='connectivity', metric='minkowski', p=2, metric_params=None, include_self=False, n_jobs=None):\n \"\"\"Computes the (weighted) graph of Neighbors for points in X\n\n Neighborhoods are restricted the points at a distance lower than\n radius.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features) or BallTree\n Sample data, in the form of a numpy array or a precomputed\n :class:`BallTree`.\n\n radius : float\n Radius of neighborhoods.\n\n mode : {'connectivity', 'distance'}, default='connectivity'\n Type of returned matrix: 'connectivity' will return the connectivity\n matrix with ones and zeros, and 'distance' will return the distances\n between neighbors according to the given metric.\n\n metric : str, default='minkowski'\n The distance metric to use for the tree. The default metric is\n minkowski, and with p=2 is equivalent to the standard Euclidean\n metric.\n For a list of available metrics, see the documentation of\n :class:`~sklearn.metrics.DistanceMetric`.\n\n p : int, default=2\n Power parameter for the Minkowski metric. When p = 1, this is\n equivalent to using manhattan_distance (l1), and euclidean_distance\n (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.\n\n metric_params : dict, default=None\n additional keyword arguments for the metric function.\n\n include_self : bool or 'auto', default=False\n Whether or not to mark each sample as the first nearest neighbor to\n itself. If 'auto', then True is used for mode='connectivity' and False\n for mode='distance'.\n\n n_jobs : int, default=None\n The number of parallel jobs to run for neighbors search.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n Returns\n -------\n A : sparse matrix of shape (n_samples, n_samples)\n Graph where A[i, j] is assigned the weight of edge that connects\n i to j. The matrix is of CSR format.\n\n Examples\n --------\n >>> X = [[0], [3], [1]]\n >>> from sklearn.neighbors import radius_neighbors_graph\n >>> A = radius_neighbors_graph(X, 1.5, mode='connectivity',\n ... include_self=True)\n >>> A.toarray()\n array([[1., 0., 1.],\n [0., 1., 0.],\n [1., 0., 1.]])\n\n See Also\n --------\n kneighbors_graph\n \"\"\"\n if not isinstance(X, RadiusNeighborsMixin):\n X = NearestNeighbors(radius=radius, metric=metric, p=p, metric_params=metric_params, n_jobs=n_jobs).fit(X)\n else:\n _check_params(X, metric, p, metric_params)\n query = _query_include_self(X._fit_X, include_self, mode)\n return X.radius_neighbors_graph(query, radius, mode)" }, { "name": "__init__", @@ -142616,7 +153598,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "bandwidth", @@ -142626,7 +153609,8 @@ "docstring": { "type": "float, default=1.0", "description": "The bandwidth of the kernel." - } + }, + "refined_type": {} }, { "name": "algorithm", @@ -142636,6 +153620,10 @@ "docstring": { "type": "{'kd_tree', 'ball_tree', 'auto'}, default='auto'", "description": "The tree algorithm to use." 
+ }, + "refined_type": { + "kind": "EnumType", + "values": ["auto", "kd_tree", "ball_tree"] } }, { @@ -142646,6 +153634,17 @@ "docstring": { "type": "{'gaussian', 'tophat', 'epanechnikov', 'exponential', 'linear', 'cosine'}, default='gaussian'", "description": "The kernel to use." + }, + "refined_type": { + "kind": "EnumType", + "values": [ + "tophat", + "cosine", + "exponential", + "epanechnikov", + "gaussian", + "linear" + ] } }, { @@ -142656,7 +153655,8 @@ "docstring": { "type": "str, default='euclidean'", "description": "The distance metric to use. Note that not all metrics are\nvalid with all algorithms. Refer to the documentation of\n:class:`BallTree` and :class:`KDTree` for a description of\navailable algorithms. Note that the normalization of the density\noutput is correct only for the Euclidean distance metric. Default\nis 'euclidean'." - } + }, + "refined_type": {} }, { "name": "atol", @@ -142666,7 +153666,8 @@ "docstring": { "type": "float, default=0", "description": "The desired absolute tolerance of the result. A larger tolerance will\ngenerally lead to faster execution." - } + }, + "refined_type": {} }, { "name": "rtol", @@ -142676,7 +153677,8 @@ "docstring": { "type": "float, default=0", "description": "The desired relative tolerance of the result. A larger tolerance will\ngenerally lead to faster execution." - } + }, + "refined_type": {} }, { "name": "breadth_first", @@ -142686,7 +153688,8 @@ "docstring": { "type": "bool, default=True", "description": "If true (default), use a breadth-first approach to the problem.\nOtherwise use a depth-first approach." - } + }, + "refined_type": {} }, { "name": "leaf_size", @@ -142696,7 +153699,8 @@ "docstring": { "type": "int, default=40", "description": "Specify the leaf size of the underlying tree. See :class:`BallTree`\nor :class:`KDTree` for details." - } + }, + "refined_type": {} }, { "name": "metric_params", @@ -142706,13 +153710,14 @@ "docstring": { "type": "dict, default=None", "description": "Additional parameters to be passed to the tree for use with the\nmetric. For more information, see the documentation of\n:class:`BallTree` or :class:`KDTree`." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, *, bandwidth=1.0, algorithm='auto', kernel='gaussian', metric='euclidean', atol=0, rtol=0, breadth_first=True, leaf_size=40, metric_params=None):\n self.algorithm = algorithm\n self.bandwidth = bandwidth\n self.kernel = kernel\n self.metric = metric\n self.atol = atol\n self.rtol = rtol\n self.breadth_first = breadth_first\n self.leaf_size = leaf_size\n self.metric_params = metric_params\n self._choose_algorithm(self.algorithm, self.metric)\n if bandwidth <= 0:\n raise ValueError('bandwidth must be positive')\n if kernel not in VALID_KERNELS:\n raise ValueError(\"invalid kernel: '{0}'\".format(kernel))" }, { @@ -142730,7 +153735,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "algorithm", @@ -142740,7 +153746,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "metric", @@ -142750,13 +153757,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _choose_algorithm(self, algorithm, metric):\n if algorithm == 'auto':\n if metric in KDTree.valid_metrics:\n return 'kd_tree'\n elif metric in BallTree.valid_metrics:\n return 'ball_tree'\n else:\n raise ValueError(\"invalid metric: '{0}'\".format(metric))\n elif algorithm in TREE_DICT:\n if metric not in TREE_DICT[algorithm].valid_metrics:\n raise ValueError(\"invalid metric for {0}: '{1}'\".format(TREE_DICT[algorithm], metric))\n return algorithm\n else:\n raise ValueError(\"invalid algorithm: '{0}'\".format(algorithm))" }, { @@ -142774,13 +153782,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _more_tags(self):\n return {'_xfail_checks': {'check_sample_weights_invariance': 'sample_weight must have positive values'}}" }, { @@ -142798,7 +153807,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -142808,7 +153818,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "List of n_features-dimensional data points. Each row\ncorresponds to a single data point." - } + }, + "refined_type": {} }, { "name": "y", @@ -142818,7 +153829,8 @@ "docstring": { "type": "None", "description": "Ignored. This parameter exists only for compatibility with\n:class:`~sklearn.pipeline.Pipeline`." - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -142828,13 +153840,14 @@ "docstring": { "type": "array-like of shape (n_samples,), default=None", "description": "List of sample weights attached to the data X.\n\n.. versionadded:: 0.20" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Fit the Kernel Density model on the data.", - "docstring": "Fit the Kernel Density model on the data.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n List of n_features-dimensional data points. Each row\n corresponds to a single data point.\n\ny : None\n Ignored. This parameter exists only for compatibility with\n :class:`~sklearn.pipeline.Pipeline`.\n\nsample_weight : array-like of shape (n_samples,), default=None\n List of sample weights attached to the data X.\n\n .. 
versionadded:: 0.20\n\nReturns\n-------\nself : object\n Returns the instance itself.", + "docstring": "Fit the Kernel Density model on the data.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n List of n_features-dimensional data points. Each row\n corresponds to a single data point.\n\n y : None\n Ignored. This parameter exists only for compatibility with\n :class:`~sklearn.pipeline.Pipeline`.\n\n sample_weight : array-like of shape (n_samples,), default=None\n List of sample weights attached to the data X.\n\n .. versionadded:: 0.20\n\n Returns\n -------\n self : object\n Returns the instance itself.\n ", "source_code": "\ndef fit(self, X, y=None, sample_weight=None):\n \"\"\"Fit the Kernel Density model on the data.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n List of n_features-dimensional data points. Each row\n corresponds to a single data point.\n\n y : None\n Ignored. This parameter exists only for compatibility with\n :class:`~sklearn.pipeline.Pipeline`.\n\n sample_weight : array-like of shape (n_samples,), default=None\n List of sample weights attached to the data X.\n\n .. versionadded:: 0.20\n\n Returns\n -------\n self : object\n Returns the instance itself.\n \"\"\"\n algorithm = self._choose_algorithm(self.algorithm, self.metric)\n X = self._validate_data(X, order='C', dtype=DTYPE)\n if sample_weight is not None:\n sample_weight = _check_sample_weight(sample_weight, X, DTYPE)\n if sample_weight.min() <= 0:\n raise ValueError('sample_weight must have positive values')\n kwargs = self.metric_params\n if kwargs is None:\n kwargs = {}\n self.tree_ = TREE_DICT[algorithm](X, metric=self.metric, leaf_size=self.leaf_size, sample_weight=sample_weight, **kwargs)\n return self" }, { @@ -142852,7 +153865,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_samples", @@ -142862,7 +153876,8 @@ "docstring": { "type": "int, default=1", "description": "Number of samples to generate." - } + }, + "refined_type": {} }, { "name": "random_state", @@ -142872,13 +153887,14 @@ "docstring": { "type": "int, RandomState instance or None, default=None", "description": "Determines random number generation used to generate\nrandom samples. Pass an int for reproducible results\nacross multiple function calls.\nSee :term:`Glossary `." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Generate random samples from the model.\n\nCurrently, this is implemented only for gaussian and tophat kernels.", - "docstring": "Generate random samples from the model.\n\nCurrently, this is implemented only for gaussian and tophat kernels.\n\nParameters\n----------\nn_samples : int, default=1\n Number of samples to generate.\n\nrandom_state : int, RandomState instance or None, default=None\n Determines random number generation used to generate\n random samples. Pass an int for reproducible results\n across multiple function calls.\n See :term:`Glossary `.\n\nReturns\n-------\nX : array-like of shape (n_samples, n_features)\n List of samples.", + "docstring": "Generate random samples from the model.\n\n Currently, this is implemented only for gaussian and tophat kernels.\n\n Parameters\n ----------\n n_samples : int, default=1\n Number of samples to generate.\n\n random_state : int, RandomState instance or None, default=None\n Determines random number generation used to generate\n random samples. 
Pass an int for reproducible results\n across multiple function calls.\n See :term:`Glossary `.\n\n Returns\n -------\n X : array-like of shape (n_samples, n_features)\n List of samples.\n ", "source_code": "\ndef sample(self, n_samples=1, random_state=None):\n \"\"\"Generate random samples from the model.\n\n Currently, this is implemented only for gaussian and tophat kernels.\n\n Parameters\n ----------\n n_samples : int, default=1\n Number of samples to generate.\n\n random_state : int, RandomState instance or None, default=None\n Determines random number generation used to generate\n random samples. Pass an int for reproducible results\n across multiple function calls.\n See :term:`Glossary `.\n\n Returns\n -------\n X : array-like of shape (n_samples, n_features)\n List of samples.\n \"\"\"\n check_is_fitted(self)\n if self.kernel not in ['gaussian', 'tophat']:\n raise NotImplementedError()\n data = np.asarray(self.tree_.data)\n rng = check_random_state(random_state)\n u = rng.uniform(0, 1, size=n_samples)\n if self.tree_.sample_weight is None:\n i = (u * data.shape[0]).astype(np.int64)\n else:\n cumsum_weight = np.cumsum(np.asarray(self.tree_.sample_weight))\n sum_weight = cumsum_weight[-1]\n i = np.searchsorted(cumsum_weight, u * sum_weight)\n if self.kernel == 'gaussian':\n return np.atleast_2d(rng.normal(data[i], self.bandwidth))\n elif self.kernel == 'tophat':\n dim = data.shape[1]\n X = rng.normal(size=(n_samples, dim))\n s_sq = row_norms(X, squared=True)\n correction = gammainc(0.5 * dim, 0.5 * s_sq)**(1.0 / dim) * self.bandwidth / np.sqrt(s_sq)\n return data[i] + X * correction[:, np.newaxis]" }, { @@ -142896,7 +153912,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -142906,7 +153923,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "List of n_features-dimensional data points. Each row\ncorresponds to a single data point." - } + }, + "refined_type": {} }, { "name": "y", @@ -142916,13 +153934,14 @@ "docstring": { "type": "None", "description": "Ignored. This parameter exists only for compatibility with\n:class:`~sklearn.pipeline.Pipeline`." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Compute the total log-likelihood under the model.", - "docstring": "Compute the total log-likelihood under the model.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n List of n_features-dimensional data points. Each row\n corresponds to a single data point.\n\ny : None\n Ignored. This parameter exists only for compatibility with\n :class:`~sklearn.pipeline.Pipeline`.\n\nReturns\n-------\nlogprob : float\n Total log-likelihood of the data in X. This is normalized to be a\n probability density, so the value will be low for high-dimensional\n data.", + "docstring": "Compute the total log-likelihood under the model.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n List of n_features-dimensional data points. Each row\n corresponds to a single data point.\n\n y : None\n Ignored. This parameter exists only for compatibility with\n :class:`~sklearn.pipeline.Pipeline`.\n\n Returns\n -------\n logprob : float\n Total log-likelihood of the data in X. 
This is normalized to be a\n probability density, so the value will be low for high-dimensional\n data.\n ", "source_code": "\ndef score(self, X, y=None):\n \"\"\"Compute the total log-likelihood under the model.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n List of n_features-dimensional data points. Each row\n corresponds to a single data point.\n\n y : None\n Ignored. This parameter exists only for compatibility with\n :class:`~sklearn.pipeline.Pipeline`.\n\n Returns\n -------\n logprob : float\n Total log-likelihood of the data in X. This is normalized to be a\n probability density, so the value will be low for high-dimensional\n data.\n \"\"\"\n return np.sum(self.score_samples(X))" }, { @@ -142940,7 +153959,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -142950,13 +153970,14 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "An array of points to query. Last dimension should match dimension\nof training data (n_features)." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Compute the log-likelihood of each sample under the model.", - "docstring": "Compute the log-likelihood of each sample under the model.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n An array of points to query. Last dimension should match dimension\n of training data (n_features).\n\nReturns\n-------\ndensity : ndarray of shape (n_samples,)\n Log-likelihood of each sample in `X`. These are normalized to be\n probability densities, so values will be low for high-dimensional\n data.", + "docstring": "Compute the log-likelihood of each sample under the model.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n An array of points to query. Last dimension should match dimension\n of training data (n_features).\n\n Returns\n -------\n density : ndarray of shape (n_samples,)\n Log-likelihood of each sample in `X`. These are normalized to be\n probability densities, so values will be low for high-dimensional\n data.\n ", "source_code": "\ndef score_samples(self, X):\n \"\"\"Compute the log-likelihood of each sample under the model.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n An array of points to query. Last dimension should match dimension\n of training data (n_features).\n\n Returns\n -------\n density : ndarray of shape (n_samples,)\n Log-likelihood of each sample in `X`. These are normalized to be\n probability densities, so values will be low for high-dimensional\n data.\n \"\"\"\n check_is_fitted(self)\n X = self._validate_data(X, order='C', dtype=DTYPE, reset=False)\n if self.tree_.sample_weight is None:\n N = self.tree_.data.shape[0]\n else:\n N = self.tree_.sum_weight\n atol_N = self.atol * N\n log_density = self.tree_.kernel_density(X, h=self.bandwidth, kernel=self.kernel, atol=atol_N, rtol=self.rtol, breadth_first=self.breadth_first, return_log=True)\n log_density -= np.log(N)\n return log_density" }, { @@ -142974,7 +153995,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_neighbors", @@ -142984,7 +154006,8 @@ "docstring": { "type": "int, default=20", "description": "Number of neighbors to use by default for :meth:`kneighbors` queries.\nIf n_neighbors is larger than the number of samples provided,\nall samples will be used." 
- } + }, + "refined_type": {} }, { "name": "algorithm", @@ -142994,6 +154017,10 @@ "docstring": { "type": "{'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto'", "description": "Algorithm used to compute the nearest neighbors:\n\n- 'ball_tree' will use :class:`BallTree`\n- 'kd_tree' will use :class:`KDTree`\n- 'brute' will use a brute-force search.\n- 'auto' will attempt to decide the most appropriate algorithm\n based on the values passed to :meth:`fit` method.\n\nNote: fitting on sparse input will override the setting of\nthis parameter, using brute force." + }, + "refined_type": { + "kind": "EnumType", + "values": ["auto", "kd_tree", "brute", "ball_tree"] } }, { @@ -143004,7 +154031,8 @@ "docstring": { "type": "int, default=30", "description": "Leaf is size passed to :class:`BallTree` or :class:`KDTree`. This can\naffect the speed of the construction and query, as well as the memory\nrequired to store the tree. The optimal value depends on the\nnature of the problem." - } + }, + "refined_type": {} }, { "name": "metric", @@ -143014,7 +154042,8 @@ "docstring": { "type": "str or callable, default='minkowski'", "description": "The metric is used for distance computation. Any metric from scikit-learn\nor scipy.spatial.distance can be used.\n\nIf metric is \"precomputed\", X is assumed to be a distance matrix and\nmust be square. X may be a sparse matrix, in which case only \"nonzero\"\nelements may be considered neighbors.\n\nIf metric is a callable function, it is called on each\npair of instances (rows) and the resulting value recorded. The callable\nshould take two arrays as input and return one value indicating the\ndistance between them. This works for Scipy's metrics, but is less\nefficient than passing the metric name as a string.\n\nValid values for metric are:\n\n- from scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2',\n 'manhattan']\n\n- from scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev',\n 'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski',\n 'mahalanobis', 'minkowski', 'rogerstanimoto', 'russellrao',\n 'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean',\n 'yule']\n\nSee the documentation for scipy.spatial.distance for details on these\nmetrics:\nhttps://docs.scipy.org/doc/scipy/reference/spatial.distance.html." - } + }, + "refined_type": {} }, { "name": "p", @@ -143024,7 +154053,8 @@ "docstring": { "type": "int, default=2", "description": "Parameter for the Minkowski metric from\n:func:`sklearn.metrics.pairwise.pairwise_distances`. When p = 1, this\nis equivalent to using manhattan_distance (l1), and euclidean_distance\n(l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used." - } + }, + "refined_type": {} }, { "name": "metric_params", @@ -143034,7 +154064,8 @@ "docstring": { "type": "dict, default=None", "description": "Additional keyword arguments for the metric function." - } + }, + "refined_type": {} }, { "name": "contamination", @@ -143044,6 +154075,14 @@ "docstring": { "type": "'auto' or float, default='auto'", "description": "The amount of contamination of the data set, i.e. the proportion\nof outliers in the data set. When fitting this is used to define the\nthreshold on the scores of the samples.\n\n- if 'auto', the threshold is determined as in the\n original paper,\n- if a float, the contamination should be in the range (0, 0.5].\n\n.. versionchanged:: 0.22\n The default value of ``contamination`` changed from 0.1\n to ``'auto'``." 
+ }, + "refined_type": { + "kind": "BoundaryType", + "base_type": "float", + "min": 0.0, + "max": 0.5, + "min_inclusive": false, + "max_inclusive": true } }, { @@ -143054,7 +154093,8 @@ "docstring": { "type": "bool, default=False", "description": "By default, LocalOutlierFactor is only meant to be used for outlier\ndetection (novelty=False). Set novelty to True if you want to use\nLocalOutlierFactor for novelty detection. In this case be aware that\nyou should only use predict, decision_function and score_samples\non new unseen data and not on the training set.\n\n.. versionadded:: 0.20" - } + }, + "refined_type": {} }, { "name": "n_jobs", @@ -143064,13 +154104,14 @@ "docstring": { "type": "int, default=None", "description": "The number of parallel jobs to run for neighbors search.\n``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n``-1`` means using all processors. See :term:`Glossary `\nfor more details." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, n_neighbors=20, *, algorithm='auto', leaf_size=30, metric='minkowski', p=2, metric_params=None, contamination='auto', novelty=False, n_jobs=None):\n super().__init__(n_neighbors=n_neighbors, algorithm=algorithm, leaf_size=leaf_size, metric=metric, p=p, metric_params=metric_params, n_jobs=n_jobs)\n self.contamination = contamination\n self.novelty = novelty" }, { @@ -143088,13 +154129,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _check_novelty_decision_function(self):\n if not self.novelty:\n msg = 'decision_function is not available when novelty=False. Use novelty=True if you want to use LOF for novelty detection and compute decision_function for new unseen data. Note that the opposite LOF of the training samples is always available by considering the negative_outlier_factor_ attribute.'\n raise AttributeError(msg)\n return True" }, { @@ -143112,13 +154154,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _check_novelty_fit_predict(self):\n if self.novelty:\n msg = 'fit_predict is not available when novelty=True. Use novelty=False if you want to predict on the training set.'\n raise AttributeError(msg)\n return True" }, { @@ -143136,13 +154179,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _check_novelty_predict(self):\n if not self.novelty:\n msg = 'predict is not available when novelty=False, use fit_predict if you want to predict on training data. Use novelty=True if you want to use LOF for novelty detection and predict on new unseen data.'\n raise AttributeError(msg)\n return True" }, { @@ -143160,13 +154204,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _check_novelty_score_samples(self):\n if not self.novelty:\n msg = 'score_samples is not available when novelty=False. The scores of the training samples are always available through the negative_outlier_factor_ attribute. 
Use novelty=True if you want to use LOF for novelty detection and compute score_samples for new unseen data.'\n raise AttributeError(msg)\n return True" }, { @@ -143184,7 +154229,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "distances_X", @@ -143194,7 +154240,8 @@ "docstring": { "type": "ndarray of shape (n_queries, self.n_neighbors)", "description": "Distances to the neighbors (in the training samples `self._fit_X`)\nof each query point to compute the LRD." - } + }, + "refined_type": {} }, { "name": "neighbors_indices", @@ -143204,13 +154251,14 @@ "docstring": { "type": "ndarray of shape (n_queries, self.n_neighbors)", "description": "Neighbors indices (of each query point) among training samples\nself._fit_X." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "The local reachability density (LRD)\n\nThe LRD of a sample is the inverse of the average reachability distance of its k-nearest neighbors.", - "docstring": "The local reachability density (LRD)\n\nThe LRD of a sample is the inverse of the average reachability\ndistance of its k-nearest neighbors.\n\nParameters\n----------\ndistances_X : ndarray of shape (n_queries, self.n_neighbors)\n Distances to the neighbors (in the training samples `self._fit_X`)\n of each query point to compute the LRD.\n\nneighbors_indices : ndarray of shape (n_queries, self.n_neighbors)\n Neighbors indices (of each query point) among training samples\n self._fit_X.\n\nReturns\n-------\nlocal_reachability_density : ndarray of shape (n_queries,)\n The local reachability density of each sample.", + "description": "The local reachability density (LRD)\n\nThe LRD of a sample is the inverse of the average reachability\ndistance of its k-nearest neighbors.", + "docstring": "The local reachability density (LRD)\n\n The LRD of a sample is the inverse of the average reachability\n distance of its k-nearest neighbors.\n\n Parameters\n ----------\n distances_X : ndarray of shape (n_queries, self.n_neighbors)\n Distances to the neighbors (in the training samples `self._fit_X`)\n of each query point to compute the LRD.\n\n neighbors_indices : ndarray of shape (n_queries, self.n_neighbors)\n Neighbors indices (of each query point) among training samples\n self._fit_X.\n\n Returns\n -------\n local_reachability_density : ndarray of shape (n_queries,)\n The local reachability density of each sample.\n ", "source_code": "\ndef _local_reachability_density(self, distances_X, neighbors_indices):\n \"\"\"The local reachability density (LRD)\n\n The LRD of a sample is the inverse of the average reachability\n distance of its k-nearest neighbors.\n\n Parameters\n ----------\n distances_X : ndarray of shape (n_queries, self.n_neighbors)\n Distances to the neighbors (in the training samples `self._fit_X`)\n of each query point to compute the LRD.\n\n neighbors_indices : ndarray of shape (n_queries, self.n_neighbors)\n Neighbors indices (of each query point) among training samples\n self._fit_X.\n\n Returns\n -------\n local_reachability_density : ndarray of shape (n_queries,)\n The local reachability density of each sample.\n \"\"\"\n dist_k = self._distances_fit_X_[neighbors_indices, self.n_neighbors_ - 1]\n reach_dist_array = np.maximum(distances_X, dist_k)\n return 1.0 / (np.mean(reach_dist_array, axis=1) + 1e-10)" }, { @@ -143228,7 +154276,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -143238,13 +154287,14 @@ "docstring": { "type": 
"array-like of shape (n_samples, n_features), default=None", "description": "The query sample or samples to compute the Local Outlier Factor\nw.r.t. to the training samples. If None, makes prediction on the\ntraining data without considering them as their own neighbors." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Predict the labels (1 inlier, -1 outlier) of X according to LOF.\n\nIf X is None, returns the same as fit_predict(X_train).", - "docstring": "Predict the labels (1 inlier, -1 outlier) of X according to LOF.\n\nIf X is None, returns the same as fit_predict(X_train).\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features), default=None\n The query sample or samples to compute the Local Outlier Factor\n w.r.t. to the training samples. If None, makes prediction on the\n training data without considering them as their own neighbors.\n\nReturns\n-------\nis_inlier : ndarray of shape (n_samples,)\n Returns -1 for anomalies/outliers and +1 for inliers.", + "docstring": "Predict the labels (1 inlier, -1 outlier) of X according to LOF.\n\n If X is None, returns the same as fit_predict(X_train).\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features), default=None\n The query sample or samples to compute the Local Outlier Factor\n w.r.t. to the training samples. If None, makes prediction on the\n training data without considering them as their own neighbors.\n\n Returns\n -------\n is_inlier : ndarray of shape (n_samples,)\n Returns -1 for anomalies/outliers and +1 for inliers.\n ", "source_code": "\ndef _predict(self, X=None):\n \"\"\"Predict the labels (1 inlier, -1 outlier) of X according to LOF.\n\n If X is None, returns the same as fit_predict(X_train).\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features), default=None\n The query sample or samples to compute the Local Outlier Factor\n w.r.t. to the training samples. If None, makes prediction on the\n training data without considering them as their own neighbors.\n\n Returns\n -------\n is_inlier : ndarray of shape (n_samples,)\n Returns -1 for anomalies/outliers and +1 for inliers.\n \"\"\"\n check_is_fitted(self)\n if X is not None:\n X = check_array(X, accept_sparse='csr')\n is_inlier = np.ones(X.shape[0], dtype=int)\n is_inlier[self.decision_function(X) < 0] = -1\n else:\n is_inlier = np.ones(self.n_samples_fit_, dtype=int)\n is_inlier[self.negative_outlier_factor_ < self.offset_] = -1\n return is_inlier" }, { @@ -143262,7 +154312,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -143272,13 +154323,14 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "The query sample or samples to compute the Local Outlier Factor\nw.r.t. the training samples." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Shifted opposite of the Local Outlier Factor of X.\n\nBigger is better, i.e. large values correspond to inliers. **Only available for novelty detection (when novelty is set to True).** The shift offset allows a zero threshold for being an outlier. The argument X is supposed to contain *new data*: if X contains a point from training, it considers the later in its own neighborhood. Also, the samples in X are not considered in the neighborhood of any point.", - "docstring": "Shifted opposite of the Local Outlier Factor of X.\n\nBigger is better, i.e. 
large values correspond to inliers.\n\n**Only available for novelty detection (when novelty is set to True).**\nThe shift offset allows a zero threshold for being an outlier.\nThe argument X is supposed to contain *new data*: if X contains a\npoint from training, it considers the later in its own neighborhood.\nAlso, the samples in X are not considered in the neighborhood of any\npoint.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n The query sample or samples to compute the Local Outlier Factor\n w.r.t. the training samples.\n\nReturns\n-------\nshifted_opposite_lof_scores : ndarray of shape (n_samples,)\n The shifted opposite of the Local Outlier Factor of each input\n samples. The lower, the more abnormal. Negative scores represent\n outliers, positive scores represent inliers.", + "description": "Shifted opposite of the Local Outlier Factor of X.\n\nBigger is better, i.e. large values correspond to inliers.\n\n**Only available for novelty detection (when novelty is set to True).**\nThe shift offset allows a zero threshold for being an outlier.\nThe argument X is supposed to contain *new data*: if X contains a\npoint from training, it considers the later in its own neighborhood.\nAlso, the samples in X are not considered in the neighborhood of any\npoint.", + "docstring": "Shifted opposite of the Local Outlier Factor of X.\n\n Bigger is better, i.e. large values correspond to inliers.\n\n **Only available for novelty detection (when novelty is set to True).**\n The shift offset allows a zero threshold for being an outlier.\n The argument X is supposed to contain *new data*: if X contains a\n point from training, it considers the later in its own neighborhood.\n Also, the samples in X are not considered in the neighborhood of any\n point.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The query sample or samples to compute the Local Outlier Factor\n w.r.t. the training samples.\n\n Returns\n -------\n shifted_opposite_lof_scores : ndarray of shape (n_samples,)\n The shifted opposite of the Local Outlier Factor of each input\n samples. The lower, the more abnormal. Negative scores represent\n outliers, positive scores represent inliers.\n ", "source_code": "\n@available_if(_check_novelty_decision_function)\ndef decision_function(self, X):\n \"\"\"Shifted opposite of the Local Outlier Factor of X.\n\n Bigger is better, i.e. large values correspond to inliers.\n\n **Only available for novelty detection (when novelty is set to True).**\n The shift offset allows a zero threshold for being an outlier.\n The argument X is supposed to contain *new data*: if X contains a\n point from training, it considers the later in its own neighborhood.\n Also, the samples in X are not considered in the neighborhood of any\n point.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The query sample or samples to compute the Local Outlier Factor\n w.r.t. the training samples.\n\n Returns\n -------\n shifted_opposite_lof_scores : ndarray of shape (n_samples,)\n The shifted opposite of the Local Outlier Factor of each input\n samples. The lower, the more abnormal. 
Negative scores represent\n outliers, positive scores represent inliers.\n \"\"\"\n return self.score_samples(X) - self.offset_" }, { @@ -143296,7 +154348,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -143306,6 +154359,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features) or (n_samples, n_samples) if metric='precomputed'", "description": "Training data." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -143316,13 +154373,14 @@ "docstring": { "type": "Ignored", "description": "Not used, present for API consistency by convention." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Fit the local outlier factor detector from the training dataset.", - "docstring": "Fit the local outlier factor detector from the training dataset.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features) or (n_samples, n_samples) if metric='precomputed'\n Training data.\n\ny : Ignored\n Not used, present for API consistency by convention.\n\nReturns\n-------\nself : LocalOutlierFactor\n The fitted local outlier factor detector.", + "docstring": "Fit the local outlier factor detector from the training dataset.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features) or (n_samples, n_samples) if metric='precomputed'\n Training data.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n Returns\n -------\n self : LocalOutlierFactor\n The fitted local outlier factor detector.\n ", "source_code": "\ndef fit(self, X, y=None):\n \"\"\"Fit the local outlier factor detector from the training dataset.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features) or (n_samples, n_samples) if metric='precomputed'\n Training data.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n Returns\n -------\n self : LocalOutlierFactor\n The fitted local outlier factor detector.\n \"\"\"\n self._fit(X)\n if self.contamination != 'auto':\n if not 0.0 < self.contamination <= 0.5:\n raise ValueError('contamination must be in (0, 0.5], got: %f' % self.contamination)\n n_samples = self.n_samples_fit_\n if self.n_neighbors > n_samples:\n warnings.warn('n_neighbors (%s) is greater than the total number of samples (%s). n_neighbors will be set to (n_samples - 1) for estimation.' % (self.n_neighbors, n_samples))\n self.n_neighbors_ = max(1, min(self.n_neighbors, n_samples - 1))\n (self._distances_fit_X_, _neighbors_indices_fit_X_) = self.kneighbors(n_neighbors=self.n_neighbors_)\n self._lrd = self._local_reachability_density(self._distances_fit_X_, _neighbors_indices_fit_X_)\n lrd_ratios_array = self._lrd[_neighbors_indices_fit_X_] / self._lrd[:, np.newaxis]\n self.negative_outlier_factor_ = -np.mean(lrd_ratios_array, axis=1)\n if self.contamination == 'auto':\n self.offset_ = -1.5\n else:\n self.offset_ = np.percentile(self.negative_outlier_factor_, 100.0 * self.contamination)\n return self" }, { @@ -143340,7 +154398,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -143350,7 +154409,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features), default=None", "description": "The query sample or samples to compute the Local Outlier Factor\nw.r.t. to the training samples." 
- } + }, + "refined_type": {} }, { "name": "y", @@ -143360,13 +154420,14 @@ "docstring": { "type": "Ignored", "description": "Not used, present for API consistency by convention." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Fit the model to the training set X and return the labels.\n\n**Not available for novelty detection (when novelty is set to True).** Label is 1 for an inlier and -1 for an outlier according to the LOF score and the contamination parameter.", - "docstring": "Fit the model to the training set X and return the labels.\n\n**Not available for novelty detection (when novelty is set to True).**\nLabel is 1 for an inlier and -1 for an outlier according to the LOF\nscore and the contamination parameter.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features), default=None\n The query sample or samples to compute the Local Outlier Factor\n w.r.t. to the training samples.\n\ny : Ignored\n Not used, present for API consistency by convention.\n\nReturns\n-------\nis_inlier : ndarray of shape (n_samples,)\n Returns -1 for anomalies/outliers and 1 for inliers.", + "description": "Fit the model to the training set X and return the labels.\n\n**Not available for novelty detection (when novelty is set to True).**\nLabel is 1 for an inlier and -1 for an outlier according to the LOF\nscore and the contamination parameter.", + "docstring": "Fit the model to the training set X and return the labels.\n\n **Not available for novelty detection (when novelty is set to True).**\n Label is 1 for an inlier and -1 for an outlier according to the LOF\n score and the contamination parameter.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features), default=None\n The query sample or samples to compute the Local Outlier Factor\n w.r.t. to the training samples.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n Returns\n -------\n is_inlier : ndarray of shape (n_samples,)\n Returns -1 for anomalies/outliers and 1 for inliers.\n ", "source_code": "\n@available_if(_check_novelty_fit_predict)\ndef fit_predict(self, X, y=None):\n \"\"\"Fit the model to the training set X and return the labels.\n\n **Not available for novelty detection (when novelty is set to True).**\n Label is 1 for an inlier and -1 for an outlier according to the LOF\n score and the contamination parameter.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features), default=None\n The query sample or samples to compute the Local Outlier Factor\n w.r.t. to the training samples.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n Returns\n -------\n is_inlier : ndarray of shape (n_samples,)\n Returns -1 for anomalies/outliers and 1 for inliers.\n \"\"\"\n return self.fit(X)._predict()" }, { @@ -143384,7 +154445,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -143394,13 +154456,14 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "The query sample or samples to compute the Local Outlier Factor\nw.r.t. to the training samples." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Predict the labels (1 inlier, -1 outlier) of X according to LOF.\n\n**Only available for novelty detection (when novelty is set to True).** This method allows to generalize prediction to *new observations* (not in the training set).", - "docstring": "Predict the labels (1 inlier, -1 outlier) of X according to LOF.\n\n**Only available for novelty detection (when novelty is set to True).**\nThis method allows to generalize prediction to *new observations* (not\nin the training set).\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n The query sample or samples to compute the Local Outlier Factor\n w.r.t. to the training samples.\n\nReturns\n-------\nis_inlier : ndarray of shape (n_samples,)\n Returns -1 for anomalies/outliers and +1 for inliers.", + "description": "Predict the labels (1 inlier, -1 outlier) of X according to LOF.\n\n**Only available for novelty detection (when novelty is set to True).**\nThis method allows to generalize prediction to *new observations* (not\nin the training set).", + "docstring": "Predict the labels (1 inlier, -1 outlier) of X according to LOF.\n\n **Only available for novelty detection (when novelty is set to True).**\n This method allows to generalize prediction to *new observations* (not\n in the training set).\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The query sample or samples to compute the Local Outlier Factor\n w.r.t. to the training samples.\n\n Returns\n -------\n is_inlier : ndarray of shape (n_samples,)\n Returns -1 for anomalies/outliers and +1 for inliers.\n ", "source_code": "\n@available_if(_check_novelty_predict)\ndef predict(self, X=None):\n \"\"\"Predict the labels (1 inlier, -1 outlier) of X according to LOF.\n\n **Only available for novelty detection (when novelty is set to True).**\n This method allows to generalize prediction to *new observations* (not\n in the training set).\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The query sample or samples to compute the Local Outlier Factor\n w.r.t. to the training samples.\n\n Returns\n -------\n is_inlier : ndarray of shape (n_samples,)\n Returns -1 for anomalies/outliers and +1 for inliers.\n \"\"\"\n return self._predict(X)" }, { @@ -143418,7 +154481,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -143428,13 +154492,14 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "The query sample or samples to compute the Local Outlier Factor\nw.r.t. the training samples." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Opposite of the Local Outlier Factor of X.\n\nIt is the opposite as bigger is better, i.e. large values correspond to inliers. **Only available for novelty detection (when novelty is set to True).** The argument X is supposed to contain *new data*: if X contains a point from training, it considers the later in its own neighborhood. Also, the samples in X are not considered in the neighborhood of any point. The score_samples on training data is available by considering the the ``negative_outlier_factor_`` attribute.", - "docstring": "Opposite of the Local Outlier Factor of X.\n\nIt is the opposite as bigger is better, i.e. 
large values correspond\nto inliers.\n\n**Only available for novelty detection (when novelty is set to True).**\nThe argument X is supposed to contain *new data*: if X contains a\npoint from training, it considers the later in its own neighborhood.\nAlso, the samples in X are not considered in the neighborhood of any\npoint.\nThe score_samples on training data is available by considering the\nthe ``negative_outlier_factor_`` attribute.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n The query sample or samples to compute the Local Outlier Factor\n w.r.t. the training samples.\n\nReturns\n-------\nopposite_lof_scores : ndarray of shape (n_samples,)\n The opposite of the Local Outlier Factor of each input samples.\n The lower, the more abnormal.", + "description": "Opposite of the Local Outlier Factor of X.\n\nIt is the opposite as bigger is better, i.e. large values correspond\nto inliers.\n\n**Only available for novelty detection (when novelty is set to True).**\nThe argument X is supposed to contain *new data*: if X contains a\npoint from training, it considers the later in its own neighborhood.\nAlso, the samples in X are not considered in the neighborhood of any\npoint.\nThe score_samples on training data is available by considering the\nthe ``negative_outlier_factor_`` attribute.", + "docstring": "Opposite of the Local Outlier Factor of X.\n\n It is the opposite as bigger is better, i.e. large values correspond\n to inliers.\n\n **Only available for novelty detection (when novelty is set to True).**\n The argument X is supposed to contain *new data*: if X contains a\n point from training, it considers the later in its own neighborhood.\n Also, the samples in X are not considered in the neighborhood of any\n point.\n The score_samples on training data is available by considering the\n the ``negative_outlier_factor_`` attribute.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The query sample or samples to compute the Local Outlier Factor\n w.r.t. the training samples.\n\n Returns\n -------\n opposite_lof_scores : ndarray of shape (n_samples,)\n The opposite of the Local Outlier Factor of each input samples.\n The lower, the more abnormal.\n ", "source_code": "\n@available_if(_check_novelty_score_samples)\ndef score_samples(self, X):\n \"\"\"Opposite of the Local Outlier Factor of X.\n\n It is the opposite as bigger is better, i.e. large values correspond\n to inliers.\n\n **Only available for novelty detection (when novelty is set to True).**\n The argument X is supposed to contain *new data*: if X contains a\n point from training, it considers the later in its own neighborhood.\n Also, the samples in X are not considered in the neighborhood of any\n point.\n The score_samples on training data is available by considering the\n the ``negative_outlier_factor_`` attribute.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The query sample or samples to compute the Local Outlier Factor\n w.r.t. 
the training samples.\n\n Returns\n -------\n opposite_lof_scores : ndarray of shape (n_samples,)\n The opposite of the Local Outlier Factor of each input samples.\n The lower, the more abnormal.\n \"\"\"\n check_is_fitted(self)\n X = check_array(X, accept_sparse='csr')\n (distances_X, neighbors_indices_X) = self.kneighbors(X, n_neighbors=self.n_neighbors_)\n X_lrd = self._local_reachability_density(distances_X, neighbors_indices_X)\n lrd_ratios_array = self._lrd[neighbors_indices_X] / X_lrd[:, np.newaxis]\n return -np.mean(lrd_ratios_array, axis=1)" }, { @@ -143452,7 +154517,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_components", @@ -143462,7 +154528,8 @@ "docstring": { "type": "int, default=None", "description": "Preferred dimensionality of the projected space.\nIf None it will be set to `n_features`." - } + }, + "refined_type": {} }, { "name": "init", @@ -143472,6 +154539,10 @@ "docstring": { "type": "{'auto', 'pca', 'lda', 'identity', 'random'} or ndarray of shape (n_features_a, n_features_b), default='auto'", "description": "Initialization of the linear transformation. Possible options are\n`'auto'`, `'pca'`, `'lda'`, `'identity'`, `'random'`, and a numpy\narray of shape `(n_features_a, n_features_b)`.\n\n- `'auto'`\n Depending on `n_components`, the most reasonable initialization\n will be chosen. If `n_components <= n_classes` we use `'lda'`, as\n it uses labels information. If not, but\n `n_components < min(n_features, n_samples)`, we use `'pca'`, as\n it projects data in meaningful directions (those of higher\n variance). Otherwise, we just use `'identity'`.\n\n- `'pca'`\n `n_components` principal components of the inputs passed\n to :meth:`fit` will be used to initialize the transformation.\n (See :class:`~sklearn.decomposition.PCA`)\n\n- `'lda'`\n `min(n_components, n_classes)` most discriminative\n components of the inputs passed to :meth:`fit` will be used to\n initialize the transformation. (If `n_components > n_classes`,\n the rest of the components will be zero.) (See\n :class:`~sklearn.discriminant_analysis.LinearDiscriminantAnalysis`)\n\n- `'identity'`\n If `n_components` is strictly smaller than the\n dimensionality of the inputs passed to :meth:`fit`, the identity\n matrix will be truncated to the first `n_components` rows.\n\n- `'random'`\n The initial transformation will be a random array of shape\n `(n_components, n_features)`. Each value is sampled from the\n standard normal distribution.\n\n- numpy array\n `n_features_b` must match the dimensionality of the inputs passed\n to :meth:`fit` and n_features_a must be less than or equal to that.\n If `n_components` is not `None`, `n_features_a` must match it." + }, + "refined_type": { + "kind": "EnumType", + "values": ["random", "lda", "pca", "auto", "identity"] } }, { @@ -143482,7 +154553,8 @@ "docstring": { "type": "bool, default=False", "description": "If `True` and :meth:`fit` has been called before, the solution of the\nprevious call to :meth:`fit` is used as the initial linear\ntransformation (`n_components` and `init` will be ignored)." - } + }, + "refined_type": {} }, { "name": "max_iter", @@ -143492,7 +154564,8 @@ "docstring": { "type": "int, default=50", "description": "Maximum number of iterations in the optimization." - } + }, + "refined_type": {} }, { "name": "tol", @@ -143502,7 +154575,8 @@ "docstring": { "type": "float, default=1e-5", "description": "Convergence tolerance for the optimization." 
- } + }, + "refined_type": {} }, { "name": "callback", @@ -143512,7 +154586,8 @@ "docstring": { "type": "callable, default=None", "description": "If not `None`, this function is called after every iteration of the\noptimizer, taking as arguments the current solution (flattened\ntransformation matrix) and the number of iterations. This might be\nuseful in case one wants to examine or store the transformation\nfound after each iteration." - } + }, + "refined_type": {} }, { "name": "verbose", @@ -143522,7 +154597,8 @@ "docstring": { "type": "int, default=0", "description": "If 0, no progress messages will be printed.\nIf 1, progress messages will be printed to stdout.\nIf > 1, progress messages will be printed and the `disp`\nparameter of :func:`scipy.optimize.minimize` will be set to\n`verbose - 2`." - } + }, + "refined_type": {} }, { "name": "random_state", @@ -143532,13 +154608,14 @@ "docstring": { "type": "int or numpy.RandomState, default=None", "description": "A pseudo random number generator object or a seed for it if int. If\n`init='random'`, `random_state` is used to initialize the random\ntransformation. If `init='pca'`, `random_state` is passed as an\nargument to PCA when initializing the transformation. Pass an int\nfor reproducible results across multiple function calls.\nSee :term:`Glossary `." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, n_components=None, *, init='auto', warm_start=False, max_iter=50, tol=1e-05, callback=None, verbose=0, random_state=None):\n self.n_components = n_components\n self.init = init\n self.warm_start = warm_start\n self.max_iter = max_iter\n self.tol = tol\n self.callback = callback\n self.verbose = verbose\n self.random_state = random_state" }, { @@ -143556,7 +154633,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "transformation", @@ -143566,13 +154644,14 @@ "docstring": { "type": "ndarray of shape (n_components * n_features,)", "description": "The solution computed by the optimizer in this iteration." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Called after each iteration of the optimizer.", - "docstring": "Called after each iteration of the optimizer.\n\nParameters\n----------\ntransformation : ndarray of shape (n_components * n_features,)\n The solution computed by the optimizer in this iteration.", + "docstring": "Called after each iteration of the optimizer.\n\n Parameters\n ----------\n transformation : ndarray of shape (n_components * n_features,)\n The solution computed by the optimizer in this iteration.\n ", "source_code": "\ndef _callback(self, transformation):\n \"\"\"Called after each iteration of the optimizer.\n\n Parameters\n ----------\n transformation : ndarray of shape (n_components * n_features,)\n The solution computed by the optimizer in this iteration.\n \"\"\"\n if self.callback is not None:\n self.callback(transformation, self.n_iter_)\n self.n_iter_ += 1" }, { @@ -143590,7 +154669,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -143600,7 +154680,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "The training samples." - } + }, + "refined_type": {} }, { "name": "y", @@ -143610,7 +154691,8 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "The training labels." 
- } + }, + "refined_type": {} }, { "name": "init", @@ -143620,13 +154702,14 @@ "docstring": { "type": "str or ndarray of shape (n_features_a, n_features_b)", "description": "The validated initialization of the linear transformation." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Initialize the transformation.", - "docstring": "Initialize the transformation.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n The training samples.\n\ny : array-like of shape (n_samples,)\n The training labels.\n\ninit : str or ndarray of shape (n_features_a, n_features_b)\n The validated initialization of the linear transformation.\n\nReturns\n-------\ntransformation : ndarray of shape (n_components, n_features)\n The initialized linear transformation.", + "docstring": "Initialize the transformation.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The training samples.\n\n y : array-like of shape (n_samples,)\n The training labels.\n\n init : str or ndarray of shape (n_features_a, n_features_b)\n The validated initialization of the linear transformation.\n\n Returns\n -------\n transformation : ndarray of shape (n_components, n_features)\n The initialized linear transformation.\n\n ", "source_code": "\ndef _initialize(self, X, y, init):\n \"\"\"Initialize the transformation.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The training samples.\n\n y : array-like of shape (n_samples,)\n The training labels.\n\n init : str or ndarray of shape (n_features_a, n_features_b)\n The validated initialization of the linear transformation.\n\n Returns\n -------\n transformation : ndarray of shape (n_components, n_features)\n The initialized linear transformation.\n\n \"\"\"\n transformation = init\n if self.warm_start and hasattr(self, 'components_'):\n transformation = self.components_\n elif isinstance(init, np.ndarray):\n pass\n else:\n (n_samples, n_features) = X.shape\n n_components = self.n_components or n_features\n if init == 'auto':\n n_classes = len(np.unique(y))\n if n_components <= min(n_features, n_classes - 1):\n init = 'lda'\n elif n_components < min(n_features, n_samples):\n init = 'pca'\n else:\n init = 'identity'\n if init == 'identity':\n transformation = np.eye(n_components, X.shape[1])\n elif init == 'random':\n transformation = self.random_state_.randn(n_components, X.shape[1])\n elif init in {'pca', 'lda'}:\n init_time = time.time()\n if init == 'pca':\n pca = PCA(n_components=n_components, random_state=self.random_state_)\n if self.verbose:\n print('Finding principal components... ', end='')\n sys.stdout.flush()\n pca.fit(X)\n transformation = pca.components_\n elif init == 'lda':\n from ..discriminant_analysis import LinearDiscriminantAnalysis\n lda = LinearDiscriminantAnalysis(n_components=n_components)\n if self.verbose:\n print('Finding most discriminative components... ', end='')\n sys.stdout.flush()\n lda.fit(X, y)\n transformation = lda.scalings_.T[:n_components]\n if self.verbose:\n print('done in {:5.2f}s'.format(time.time() - init_time))\n return transformation" }, { @@ -143644,7 +154727,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "transformation", @@ -143654,7 +154738,8 @@ "docstring": { "type": "ndarray of shape (n_components * n_features,)", "description": "The raveled linear transformation on which to compute loss and\nevaluate gradient." 
- } + }, + "refined_type": {} }, { "name": "X", @@ -143664,7 +154749,8 @@ "docstring": { "type": "ndarray of shape (n_samples, n_features)", "description": "The training samples." - } + }, + "refined_type": {} }, { "name": "same_class_mask", @@ -143674,7 +154760,8 @@ "docstring": { "type": "ndarray of shape (n_samples, n_samples)", "description": "A mask where `mask[i, j] == 1` if `X[i]` and `X[j]` belong\nto the same class, and `0` otherwise." - } + }, + "refined_type": {} }, { "name": "sign", @@ -143684,13 +154771,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Compute the loss and the loss gradient w.r.t. `transformation`.", - "docstring": "Compute the loss and the loss gradient w.r.t. `transformation`.\n\nParameters\n----------\ntransformation : ndarray of shape (n_components * n_features,)\n The raveled linear transformation on which to compute loss and\n evaluate gradient.\n\nX : ndarray of shape (n_samples, n_features)\n The training samples.\n\nsame_class_mask : ndarray of shape (n_samples, n_samples)\n A mask where `mask[i, j] == 1` if `X[i]` and `X[j]` belong\n to the same class, and `0` otherwise.\n\nReturns\n-------\nloss : float\n The loss computed for the given transformation.\n\ngradient : ndarray of shape (n_components * n_features,)\n The new (flattened) gradient of the loss.", + "docstring": "Compute the loss and the loss gradient w.r.t. `transformation`.\n\n Parameters\n ----------\n transformation : ndarray of shape (n_components * n_features,)\n The raveled linear transformation on which to compute loss and\n evaluate gradient.\n\n X : ndarray of shape (n_samples, n_features)\n The training samples.\n\n same_class_mask : ndarray of shape (n_samples, n_samples)\n A mask where `mask[i, j] == 1` if `X[i]` and `X[j]` belong\n to the same class, and `0` otherwise.\n\n Returns\n -------\n loss : float\n The loss computed for the given transformation.\n\n gradient : ndarray of shape (n_components * n_features,)\n The new (flattened) gradient of the loss.\n ", "source_code": "\ndef _loss_grad_lbfgs(self, transformation, X, same_class_mask, sign=1.0):\n \"\"\"Compute the loss and the loss gradient w.r.t. 
`transformation`.\n\n Parameters\n ----------\n transformation : ndarray of shape (n_components * n_features,)\n The raveled linear transformation on which to compute loss and\n evaluate gradient.\n\n X : ndarray of shape (n_samples, n_features)\n The training samples.\n\n same_class_mask : ndarray of shape (n_samples, n_samples)\n A mask where `mask[i, j] == 1` if `X[i]` and `X[j]` belong\n to the same class, and `0` otherwise.\n\n Returns\n -------\n loss : float\n The loss computed for the given transformation.\n\n gradient : ndarray of shape (n_components * n_features,)\n The new (flattened) gradient of the loss.\n \"\"\"\n if self.n_iter_ == 0:\n self.n_iter_ += 1\n if self.verbose:\n header_fields = ['Iteration', 'Objective Value', 'Time(s)']\n header_fmt = '{:>10} {:>20} {:>10}'\n header = header_fmt.format(*header_fields)\n cls_name = self.__class__.__name__\n print('[{}]'.format(cls_name))\n print('[{}] {}\\n[{}] {}'.format(cls_name, header, cls_name, '-' * len(header)))\n t_funcall = time.time()\n transformation = transformation.reshape(-1, X.shape[1])\n X_embedded = np.dot(X, transformation.T)\n p_ij = pairwise_distances(X_embedded, squared=True)\n np.fill_diagonal(p_ij, np.inf)\n p_ij = softmax(-p_ij)\n masked_p_ij = p_ij * same_class_mask\n p = np.sum(masked_p_ij, axis=1, keepdims=True)\n loss = np.sum(p)\n weighted_p_ij = masked_p_ij - p_ij * p\n weighted_p_ij_sym = weighted_p_ij + weighted_p_ij.T\n np.fill_diagonal(weighted_p_ij_sym, -weighted_p_ij.sum(axis=0))\n gradient = 2 * X_embedded.T.dot(weighted_p_ij_sym).dot(X)\n if self.verbose:\n t_funcall = time.time() - t_funcall\n values_fmt = '[{}] {:>10} {:>20.6e} {:>10.2f}'\n print(values_fmt.format(self.__class__.__name__, self.n_iter_, loss, t_funcall))\n sys.stdout.flush()\n return sign * loss, sign * gradient.ravel()" }, { @@ -143708,13 +154796,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _more_tags(self):\n return {'requires_y': True}" }, { @@ -143732,7 +154821,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -143742,7 +154832,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "The training samples." - } + }, + "refined_type": {} }, { "name": "y", @@ -143752,13 +154843,14 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "The corresponding training labels." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Validate parameters as soon as :meth:`fit` is called.", - "docstring": "Validate parameters as soon as :meth:`fit` is called.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n The training samples.\n\ny : array-like of shape (n_samples,)\n The corresponding training labels.\n\nReturns\n-------\nX : ndarray of shape (n_samples, n_features)\n The validated training samples.\n\ny : ndarray of shape (n_samples,)\n The validated training labels, encoded to be integers in\n the `range(0, n_classes)`.\n\ninit : str or ndarray of shape (n_features_a, n_features_b)\n The validated initialization of the linear transformation.\n\nRaises\n-------\nTypeError\n If a parameter is not an instance of the desired type.\n\nValueError\n If a parameter's value violates its legal value range or if the\n combination of two or more given parameters is incompatible.", + "docstring": "Validate parameters as soon as :meth:`fit` is called.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The training samples.\n\n y : array-like of shape (n_samples,)\n The corresponding training labels.\n\n Returns\n -------\n X : ndarray of shape (n_samples, n_features)\n The validated training samples.\n\n y : ndarray of shape (n_samples,)\n The validated training labels, encoded to be integers in\n the `range(0, n_classes)`.\n\n init : str or ndarray of shape (n_features_a, n_features_b)\n The validated initialization of the linear transformation.\n\n Raises\n -------\n TypeError\n If a parameter is not an instance of the desired type.\n\n ValueError\n If a parameter's value violates its legal value range or if the\n combination of two or more given parameters is incompatible.\n ", "source_code": "\ndef _validate_params(self, X, y):\n \"\"\"Validate parameters as soon as :meth:`fit` is called.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The training samples.\n\n y : array-like of shape (n_samples,)\n The corresponding training labels.\n\n Returns\n -------\n X : ndarray of shape (n_samples, n_features)\n The validated training samples.\n\n y : ndarray of shape (n_samples,)\n The validated training labels, encoded to be integers in\n the `range(0, n_classes)`.\n\n init : str or ndarray of shape (n_features_a, n_features_b)\n The validated initialization of the linear transformation.\n\n Raises\n -------\n TypeError\n If a parameter is not an instance of the desired type.\n\n ValueError\n If a parameter's value violates its legal value range or if the\n combination of two or more given parameters is incompatible.\n \"\"\"\n (X, y) = self._validate_data(X, y, ensure_min_samples=2)\n check_classification_targets(y)\n y = LabelEncoder().fit_transform(y)\n if self.n_components is not None:\n check_scalar(self.n_components, 'n_components', numbers.Integral, min_val=1)\n if self.n_components > X.shape[1]:\n raise ValueError('The preferred dimensionality of the projected space `n_components` ({}) cannot be greater than the given data dimensionality ({})!'.format(self.n_components, X.shape[1]))\n check_scalar(self.warm_start, 'warm_start', bool)\n if self.warm_start and hasattr(self, 'components_'):\n if self.components_.shape[1] != X.shape[1]:\n raise ValueError('The new inputs dimensionality ({}) does not match the input dimensionality of the previously learned transformation ({}).'.format(X.shape[1], self.components_.shape[1]))\n check_scalar(self.max_iter, 
'max_iter', numbers.Integral, min_val=1)\n check_scalar(self.tol, 'tol', numbers.Real, min_val=0.0)\n check_scalar(self.verbose, 'verbose', numbers.Integral, min_val=0)\n if self.callback is not None:\n if not callable(self.callback):\n raise ValueError('`callback` is not callable.')\n init = self.init\n if isinstance(init, np.ndarray):\n init = check_array(init)\n if init.shape[1] != X.shape[1]:\n raise ValueError('The input dimensionality ({}) of the given linear transformation `init` must match the dimensionality of the given inputs `X` ({}).'.format(init.shape[1], X.shape[1]))\n if init.shape[0] > init.shape[1]:\n raise ValueError('The output dimensionality ({}) of the given linear transformation `init` cannot be greater than its input dimensionality ({}).'.format(init.shape[0], init.shape[1]))\n if self.n_components is not None:\n if self.n_components != init.shape[0]:\n raise ValueError('The preferred dimensionality of the projected space `n_components` ({}) does not match the output dimensionality of the given linear transformation `init` ({})!'.format(self.n_components, init.shape[0]))\n elif init in ['auto', 'pca', 'lda', 'identity', 'random']:\n pass\n else:\n raise ValueError(\"`init` must be 'auto', 'pca', 'lda', 'identity', 'random' or a numpy array of shape (n_components, n_features).\")\n return X, y, init" }, { @@ -143776,7 +154868,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -143786,7 +154879,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "The training samples." - } + }, + "refined_type": {} }, { "name": "y", @@ -143796,13 +154890,14 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "The corresponding training labels." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Fit the model according to the given training data.", - "docstring": "Fit the model according to the given training data.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n The training samples.\n\ny : array-like of shape (n_samples,)\n The corresponding training labels.\n\nReturns\n-------\nself : object\n Fitted estimator.", + "docstring": "Fit the model according to the given training data.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The training samples.\n\n y : array-like of shape (n_samples,)\n The corresponding training labels.\n\n Returns\n -------\n self : object\n Fitted estimator.\n ", "source_code": "\ndef fit(self, X, y):\n \"\"\"Fit the model according to the given training data.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The training samples.\n\n y : array-like of shape (n_samples,)\n The corresponding training labels.\n\n Returns\n -------\n self : object\n Fitted estimator.\n \"\"\"\n (X, y, init) = self._validate_params(X, y)\n self.random_state_ = check_random_state(self.random_state)\n t_train = time.time()\n same_class_mask = y[:, np.newaxis] == y[np.newaxis, :]\n transformation = self._initialize(X, y, init)\n disp = self.verbose - 2 if self.verbose > 1 else -1\n optimizer_params = {'method': 'L-BFGS-B', 'fun': self._loss_grad_lbfgs, 'args': (X, same_class_mask, -1.0), 'jac': True, 'x0': transformation, 'tol': self.tol, 'options': dict(maxiter=self.max_iter, disp=disp), 'callback': self._callback}\n self.n_iter_ = 0\n opt_result = minimize(**optimizer_params)\n self.components_ = opt_result.x.reshape(-1, X.shape[1])\n t_train = time.time() - t_train\n if self.verbose:\n cls_name = self.__class__.__name__\n if not opt_result.success:\n warn('[{}] NCA did not converge: {}'.format(cls_name, opt_result.message), ConvergenceWarning)\n print('[{}] Training took {:8.2f}s.'.format(cls_name, t_train))\n return self" }, { @@ -143820,7 +154915,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -143830,13 +154926,14 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "Data samples." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Apply the learned transformation to the given data.", - "docstring": "Apply the learned transformation to the given data.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n Data samples.\n\nReturns\n-------\nX_embedded: ndarray of shape (n_samples, n_components)\n The data samples transformed.\n\nRaises\n------\nNotFittedError\n If :meth:`fit` has not been called before.", + "docstring": "Apply the learned transformation to the given data.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Data samples.\n\n Returns\n -------\n X_embedded: ndarray of shape (n_samples, n_components)\n The data samples transformed.\n\n Raises\n ------\n NotFittedError\n If :meth:`fit` has not been called before.\n ", "source_code": "\ndef transform(self, X):\n \"\"\"Apply the learned transformation to the given data.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Data samples.\n\n Returns\n -------\n X_embedded: ndarray of shape (n_samples, n_components)\n The data samples transformed.\n\n Raises\n ------\n NotFittedError\n If :meth:`fit` has not been called before.\n \"\"\"\n check_is_fitted(self)\n X = self._validate_data(X, reset=False)\n return np.dot(X, self.components_.T)" }, { @@ -143854,7 +154951,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "metric", @@ -143862,9 +154960,10 @@ "is_public": true, "assigned_by": "POSITION_OR_NAME", "docstring": { - "type": "str or callable", + "type": "str or callable, default=\"euclidian\"", "description": "The metric to use when calculating distance between instances in a\nfeature array. If metric is a string or callable, it must be one of\nthe options allowed by\n:func:`~sklearn.metrics.pairwise_distances` for its metric\nparameter. The centroids for the samples corresponding to each class is\nthe point from which the sum of the distances (according to the metric)\nof all samples that belong to that particular class are minimized.\nIf the `\"manhattan\"` metric is provided, this centroid is the median\nand for all other metrics, the centroid is now set to be the mean.\n\n.. versionchanged:: 0.19\n `metric='precomputed'` was deprecated and now raises an error" - } + }, + "refined_type": {} }, { "name": "shrink_threshold", @@ -143874,13 +154973,14 @@ "docstring": { "type": "float, default=None", "description": "Threshold for shrinking centroids to remove features." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, metric='euclidean', *, shrink_threshold=None):\n self.metric = metric\n self.shrink_threshold = shrink_threshold" }, { @@ -143898,7 +154998,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -143908,6 +155009,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "Training vector, where `n_samples` is the number of samples and\n`n_features` is the number of features.\nNote that centroid shrinking cannot be used with sparse matrices." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -143918,13 +155023,14 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "Target values." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Fit the NearestCentroid model according to the given training data.", - "docstring": "Fit the NearestCentroid model according to the given training data.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training vector, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n Note that centroid shrinking cannot be used with sparse matrices.\ny : array-like of shape (n_samples,)\n Target values.\n\nReturns\n-------\nself : object\n Fitted estimator.", + "docstring": "\n Fit the NearestCentroid model according to the given training data.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training vector, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n Note that centroid shrinking cannot be used with sparse matrices.\n y : array-like of shape (n_samples,)\n Target values.\n\n Returns\n -------\n self : object\n Fitted estimator.\n ", "source_code": "\ndef fit(self, X, y):\n \"\"\"\n Fit the NearestCentroid model according to the given training data.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training vector, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n Note that centroid shrinking cannot be used with sparse matrices.\n y : array-like of shape (n_samples,)\n Target values.\n\n Returns\n -------\n self : object\n Fitted estimator.\n \"\"\"\n if self.metric == 'precomputed':\n raise ValueError('Precomputed is not supported.')\n if self.metric == 'manhattan':\n (X, y) = self._validate_data(X, y, accept_sparse=['csc'])\n else:\n (X, y) = self._validate_data(X, y, accept_sparse=['csr', 'csc'])\n is_X_sparse = sp.issparse(X)\n if is_X_sparse and self.shrink_threshold:\n raise ValueError('threshold shrinking not supported for sparse input')\n check_classification_targets(y)\n (n_samples, n_features) = X.shape\n le = LabelEncoder()\n y_ind = le.fit_transform(y)\n self.classes_ = classes = le.classes_\n n_classes = classes.size\n if n_classes < 2:\n raise ValueError('The number of classes has to be greater than one; got %d class' % n_classes)\n self.centroids_ = np.empty((n_classes, n_features), dtype=np.float64)\n nk = np.zeros(n_classes)\n for cur_class in range(n_classes):\n center_mask = y_ind == cur_class\n nk[cur_class] = np.sum(center_mask)\n if is_X_sparse:\n center_mask = np.where(center_mask)[0]\n if self.metric == 'manhattan':\n if not is_X_sparse:\n self.centroids_[cur_class] = np.median(X[center_mask], axis=0)\n else:\n self.centroids_[cur_class] = csc_median_axis_0(X[center_mask])\n else:\n if self.metric != 'euclidean':\n warnings.warn('Averaging for metrics other than euclidean and manhattan not supported. The average is set to be the mean.')\n self.centroids_[cur_class] = X[center_mask].mean(axis=0)\n if self.shrink_threshold:\n if np.all(np.ptp(X, axis=0) == 0):\n raise ValueError('All features have zero variance. 
Division by zero.')\n dataset_centroid_ = np.mean(X, axis=0)\n m = np.sqrt(1.0 / nk - 1.0 / n_samples)\n variance = (X - self.centroids_[y_ind])**2\n variance = variance.sum(axis=0)\n s = np.sqrt(variance / (n_samples - n_classes))\n s += np.median(s)\n mm = m.reshape(len(m), 1)\n ms = mm * s\n deviation = (self.centroids_ - dataset_centroid_) / ms\n signs = np.sign(deviation)\n deviation = np.abs(deviation) - self.shrink_threshold\n np.clip(deviation, 0, None, out=deviation)\n deviation *= signs\n msd = ms * deviation\n self.centroids_ = dataset_centroid_[np.newaxis, :] + msd\n return self" }, { @@ -143942,7 +155048,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -143952,13 +155059,17 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "Test samples." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } } ], "results": [], "is_public": true, "description": "Perform classification on an array of test vectors `X`.\n\nThe predicted class `C` for each sample in `X` is returned.", - "docstring": "Perform classification on an array of test vectors `X`.\n\nThe predicted class `C` for each sample in `X` is returned.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n Test samples.\n\nReturns\n-------\nC : ndarray of shape (n_samples,)\n The predicted classes.\n\nNotes\n-----\nIf the metric constructor parameter is `\"precomputed\"`, `X` is assumed\nto be the distance matrix between the data to be predicted and\n`self.centroids_`.", + "docstring": "Perform classification on an array of test vectors `X`.\n\n The predicted class `C` for each sample in `X` is returned.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Test samples.\n\n Returns\n -------\n C : ndarray of shape (n_samples,)\n The predicted classes.\n\n Notes\n -----\n If the metric constructor parameter is `\"precomputed\"`, `X` is assumed\n to be the distance matrix between the data to be predicted and\n `self.centroids_`.\n ", "source_code": "\ndef predict(self, X):\n \"\"\"Perform classification on an array of test vectors `X`.\n\n The predicted class `C` for each sample in `X` is returned.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Test samples.\n\n Returns\n -------\n C : ndarray of shape (n_samples,)\n The predicted classes.\n\n Notes\n -----\n If the metric constructor parameter is `\"precomputed\"`, `X` is assumed\n to be the distance matrix between the data to be predicted and\n `self.centroids_`.\n \"\"\"\n check_is_fitted(self)\n X = self._validate_data(X, accept_sparse='csr', reset=False)\n return self.classes_[pairwise_distances(X, self.centroids_, metric=self.metric).argmin(axis=1)]" }, { @@ -143976,7 +155087,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_neighbors", @@ -143986,7 +155098,8 @@ "docstring": { "type": "int, default=5", "description": "Number of neighbors to use by default for :meth:`kneighbors` queries." - } + }, + "refined_type": {} }, { "name": "weights", @@ -143996,6 +155109,10 @@ "docstring": { "type": "{'uniform', 'distance'} or callable, default='uniform'", "description": "Weight function used in prediction. Possible values:\n\n- 'uniform' : uniform weights. 
All points in each neighborhood\n are weighted equally.\n- 'distance' : weight points by the inverse of their distance.\n in this case, closer neighbors of a query point will have a\n greater influence than neighbors which are further away.\n- [callable] : a user-defined function which accepts an\n array of distances, and returns an array of the same shape\n containing the weights.\n\nUniform weights are used by default." + }, + "refined_type": { + "kind": "EnumType", + "values": ["uniform", "distance"] } }, { @@ -144006,6 +155123,10 @@ "docstring": { "type": "{'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto'", "description": "Algorithm used to compute the nearest neighbors:\n\n- 'ball_tree' will use :class:`BallTree`\n- 'kd_tree' will use :class:`KDTree`\n- 'brute' will use a brute-force search.\n- 'auto' will attempt to decide the most appropriate algorithm\n based on the values passed to :meth:`fit` method.\n\nNote: fitting on sparse input will override the setting of\nthis parameter, using brute force." + }, + "refined_type": { + "kind": "EnumType", + "values": ["auto", "kd_tree", "brute", "ball_tree"] } }, { @@ -144016,7 +155137,8 @@ "docstring": { "type": "int, default=30", "description": "Leaf size passed to BallTree or KDTree. This can affect the\nspeed of the construction and query, as well as the memory\nrequired to store the tree. The optimal value depends on the\nnature of the problem." - } + }, + "refined_type": {} }, { "name": "p", @@ -144026,7 +155148,8 @@ "docstring": { "type": "int, default=2", "description": "Power parameter for the Minkowski metric. When p = 1, this is\nequivalent to using manhattan_distance (l1), and euclidean_distance\n(l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used." - } + }, + "refined_type": {} }, { "name": "metric", @@ -144036,7 +155159,8 @@ "docstring": { "type": "str or callable, default='minkowski'", "description": "The distance metric to use for the tree. The default metric is\nminkowski, and with p=2 is equivalent to the standard Euclidean\nmetric. See the documentation of :class:`DistanceMetric` for a\nlist of available metrics.\nIf metric is \"precomputed\", X is assumed to be a distance matrix and\nmust be square during fit. X may be a :term:`sparse graph`,\nin which case only \"nonzero\" elements may be considered neighbors." - } + }, + "refined_type": {} }, { "name": "metric_params", @@ -144046,7 +155170,8 @@ "docstring": { "type": "dict, default=None", "description": "Additional keyword arguments for the metric function." - } + }, + "refined_type": {} }, { "name": "n_jobs", @@ -144056,13 +155181,14 @@ "docstring": { "type": "int, default=None", "description": "The number of parallel jobs to run for neighbors search.\n``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n``-1`` means using all processors. See :term:`Glossary `\nfor more details.\nDoesn't affect :meth:`fit` method." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, n_neighbors=5, *, weights='uniform', algorithm='auto', leaf_size=30, p=2, metric='minkowski', metric_params=None, n_jobs=None):\n super().__init__(n_neighbors=n_neighbors, algorithm=algorithm, leaf_size=leaf_size, metric=metric, p=p, metric_params=metric_params, n_jobs=n_jobs)\n self.weights = weights" }, { @@ -144080,13 +155206,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _more_tags(self):\n return {'pairwise': self.metric == 'precomputed'}" }, { @@ -144107,13 +155234,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@deprecated('Attribute `_pairwise` was deprecated in version 0.24 and will be removed in 1.1 (renaming of 0.26).')\n@property\ndef _pairwise(self):\n return self.metric == 'precomputed'" }, { @@ -144131,7 +155259,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -144141,6 +155270,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features) or (n_samples, n_samples) if metric='precomputed'", "description": "Training data." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -144151,13 +155284,17 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples,) or (n_samples, n_outputs)", "description": "Target values." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } } ], "results": [], "is_public": true, "description": "Fit the k-nearest neighbors regressor from the training dataset.", - "docstring": "Fit the k-nearest neighbors regressor from the training dataset.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features) or (n_samples, n_samples) if metric='precomputed'\n Training data.\n\ny : {array-like, sparse matrix} of shape (n_samples,) or (n_samples, n_outputs)\n Target values.\n\nReturns\n-------\nself : KNeighborsRegressor\n The fitted k-nearest neighbors regressor.", + "docstring": "Fit the k-nearest neighbors regressor from the training dataset.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features) or (n_samples, n_samples) if metric='precomputed'\n Training data.\n\n y : {array-like, sparse matrix} of shape (n_samples,) or (n_samples, n_outputs)\n Target values.\n\n Returns\n -------\n self : KNeighborsRegressor\n The fitted k-nearest neighbors regressor.\n ", "source_code": "\ndef fit(self, X, y):\n \"\"\"Fit the k-nearest neighbors regressor from the training dataset.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features) or (n_samples, n_samples) if metric='precomputed'\n Training data.\n\n y : {array-like, sparse matrix} of shape (n_samples,) or (n_samples, n_outputs)\n Target values.\n\n Returns\n -------\n self : KNeighborsRegressor\n The fitted k-nearest neighbors regressor.\n \"\"\"\n self.weights = _check_weights(self.weights)\n return self._fit(X, y)" }, { @@ -144175,7 +155312,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -144185,13 +155323,14 @@ "docstring": { "type": "array-like of shape (n_queries, 
n_features), or (n_queries, n_indexed) if metric == 'precomputed'", "description": "Test samples." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Predict the target for the provided data.", - "docstring": "Predict the target for the provided data.\n\nParameters\n----------\nX : array-like of shape (n_queries, n_features), or (n_queries, n_indexed) if metric == 'precomputed'\n Test samples.\n\nReturns\n-------\ny : ndarray of shape (n_queries,) or (n_queries, n_outputs), dtype=int\n Target values.", + "docstring": "Predict the target for the provided data.\n\n Parameters\n ----------\n X : array-like of shape (n_queries, n_features), or (n_queries, n_indexed) if metric == 'precomputed'\n Test samples.\n\n Returns\n -------\n y : ndarray of shape (n_queries,) or (n_queries, n_outputs), dtype=int\n Target values.\n ", "source_code": "\ndef predict(self, X):\n \"\"\"Predict the target for the provided data.\n\n Parameters\n ----------\n X : array-like of shape (n_queries, n_features), or (n_queries, n_indexed) if metric == 'precomputed'\n Test samples.\n\n Returns\n -------\n y : ndarray of shape (n_queries,) or (n_queries, n_outputs), dtype=int\n Target values.\n \"\"\"\n (neigh_dist, neigh_ind) = self.kneighbors(X)\n weights = _get_weights(neigh_dist, self.weights)\n _y = self._y\n if _y.ndim == 1:\n _y = _y.reshape((-1, 1))\n if weights is None:\n y_pred = np.mean(_y[neigh_ind], axis=1)\n else:\n y_pred = np.empty((X.shape[0], _y.shape[1]), dtype=np.float64)\n denom = np.sum(weights, axis=1)\n for j in range(_y.shape[1]):\n num = np.sum(_y[neigh_ind, j] * weights, axis=1)\n y_pred[:, j] = num / denom\n if self._y.ndim == 1:\n y_pred = y_pred.ravel()\n return y_pred" }, { @@ -144209,7 +155348,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "radius", @@ -144219,7 +155359,8 @@ "docstring": { "type": "float, default=1.0", "description": "Range of parameter space to use by default for :meth:`radius_neighbors`\nqueries." - } + }, + "refined_type": {} }, { "name": "weights", @@ -144229,6 +155370,10 @@ "docstring": { "type": "{'uniform', 'distance'} or callable, default='uniform'", "description": "Weight function used in prediction. Possible values:\n\n- 'uniform' : uniform weights. All points in each neighborhood\n are weighted equally.\n- 'distance' : weight points by the inverse of their distance.\n in this case, closer neighbors of a query point will have a\n greater influence than neighbors which are further away.\n- [callable] : a user-defined function which accepts an\n array of distances, and returns an array of the same shape\n containing the weights.\n\nUniform weights are used by default." + }, + "refined_type": { + "kind": "EnumType", + "values": ["uniform", "distance"] } }, { @@ -144239,6 +155384,10 @@ "docstring": { "type": "{'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto'", "description": "Algorithm used to compute the nearest neighbors:\n\n- 'ball_tree' will use :class:`BallTree`\n- 'kd_tree' will use :class:`KDTree`\n- 'brute' will use a brute-force search.\n- 'auto' will attempt to decide the most appropriate algorithm\n based on the values passed to :meth:`fit` method.\n\nNote: fitting on sparse input will override the setting of\nthis parameter, using brute force." 
+ }, + "refined_type": { + "kind": "EnumType", + "values": ["auto", "kd_tree", "brute", "ball_tree"] } }, { @@ -144249,7 +155398,8 @@ "docstring": { "type": "int, default=30", "description": "Leaf size passed to BallTree or KDTree. This can affect the\nspeed of the construction and query, as well as the memory\nrequired to store the tree. The optimal value depends on the\nnature of the problem." - } + }, + "refined_type": {} }, { "name": "p", @@ -144259,7 +155409,8 @@ "docstring": { "type": "int, default=2", "description": "Power parameter for the Minkowski metric. When p = 1, this is\nequivalent to using manhattan_distance (l1), and euclidean_distance\n(l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used." - } + }, + "refined_type": {} }, { "name": "metric", @@ -144269,7 +155420,8 @@ "docstring": { "type": "str or callable, default='minkowski'", "description": "The distance metric to use for the tree. The default metric is\nminkowski, and with p=2 is equivalent to the standard Euclidean\nmetric. See the documentation of :class:`DistanceMetric` for a\nlist of available metrics.\nIf metric is \"precomputed\", X is assumed to be a distance matrix and\nmust be square during fit. X may be a :term:`sparse graph`,\nin which case only \"nonzero\" elements may be considered neighbors." - } + }, + "refined_type": {} }, { "name": "metric_params", @@ -144279,7 +155431,8 @@ "docstring": { "type": "dict, default=None", "description": "Additional keyword arguments for the metric function." - } + }, + "refined_type": {} }, { "name": "n_jobs", @@ -144289,13 +155442,14 @@ "docstring": { "type": "int, default=None", "description": "The number of parallel jobs to run for neighbors search.\n``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n``-1`` means using all processors. See :term:`Glossary `\nfor more details." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, radius=1.0, *, weights='uniform', algorithm='auto', leaf_size=30, p=2, metric='minkowski', metric_params=None, n_jobs=None):\n super().__init__(radius=radius, algorithm=algorithm, leaf_size=leaf_size, p=p, metric=metric, metric_params=metric_params, n_jobs=n_jobs)\n self.weights = weights" }, { @@ -144313,7 +155467,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -144323,6 +155478,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features) or (n_samples, n_samples) if metric='precomputed'", "description": "Training data." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -144333,13 +155492,17 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples,) or (n_samples, n_outputs)", "description": "Target values." 
+ }, + "refined_type": { + "kind": "EnumType", + "values": [] } } ], "results": [], "is_public": true, "description": "Fit the radius neighbors regressor from the training dataset.", - "docstring": "Fit the radius neighbors regressor from the training dataset.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features) or (n_samples, n_samples) if metric='precomputed'\n Training data.\n\ny : {array-like, sparse matrix} of shape (n_samples,) or (n_samples, n_outputs)\n Target values.\n\nReturns\n-------\nself : RadiusNeighborsRegressor\n The fitted radius neighbors regressor.", + "docstring": "Fit the radius neighbors regressor from the training dataset.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features) or (n_samples, n_samples) if metric='precomputed'\n Training data.\n\n y : {array-like, sparse matrix} of shape (n_samples,) or (n_samples, n_outputs)\n Target values.\n\n Returns\n -------\n self : RadiusNeighborsRegressor\n The fitted radius neighbors regressor.\n ", "source_code": "\ndef fit(self, X, y):\n \"\"\"Fit the radius neighbors regressor from the training dataset.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features) or (n_samples, n_samples) if metric='precomputed'\n Training data.\n\n y : {array-like, sparse matrix} of shape (n_samples,) or (n_samples, n_outputs)\n Target values.\n\n Returns\n -------\n self : RadiusNeighborsRegressor\n The fitted radius neighbors regressor.\n \"\"\"\n self.weights = _check_weights(self.weights)\n return self._fit(X, y)" }, { @@ -144357,7 +155520,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -144367,13 +155531,14 @@ "docstring": { "type": "array-like of shape (n_queries, n_features), or (n_queries, n_indexed) if metric == 'precomputed'", "description": "Test samples." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Predict the target for the provided data.", - "docstring": "Predict the target for the provided data.\n\nParameters\n----------\nX : array-like of shape (n_queries, n_features), or (n_queries, n_indexed) if metric == 'precomputed'\n Test samples.\n\nReturns\n-------\ny : ndarray of shape (n_queries,) or (n_queries, n_outputs), dtype=double\n Target values.", + "docstring": "Predict the target for the provided data.\n\n Parameters\n ----------\n X : array-like of shape (n_queries, n_features), or (n_queries, n_indexed) if metric == 'precomputed'\n Test samples.\n\n Returns\n -------\n y : ndarray of shape (n_queries,) or (n_queries, n_outputs), dtype=double\n Target values.\n ", "source_code": "\ndef predict(self, X):\n \"\"\"Predict the target for the provided data.\n\n Parameters\n ----------\n X : array-like of shape (n_queries, n_features), or (n_queries, n_indexed) if metric == 'precomputed'\n Test samples.\n\n Returns\n -------\n y : ndarray of shape (n_queries,) or (n_queries, n_outputs), dtype=double\n Target values.\n \"\"\"\n (neigh_dist, neigh_ind) = self.radius_neighbors(X)\n weights = _get_weights(neigh_dist, self.weights)\n _y = self._y\n if _y.ndim == 1:\n _y = _y.reshape((-1, 1))\n empty_obs = np.full_like(_y[0], np.nan)\n if weights is None:\n y_pred = np.array([np.mean(_y[ind, :], axis=0) if len(ind) else empty_obs for (i, ind) in enumerate(neigh_ind)])\n else:\n y_pred = np.array([np.average(_y[ind, :], axis=0, weights=weights[i]) if len(ind) else empty_obs for (i, ind) in enumerate(neigh_ind)])\n if np.any(np.isnan(y_pred)):\n empty_warning_msg = 'One or more samples have no neighbors within specified radius; predicting NaN.'\n warnings.warn(empty_warning_msg)\n if self._y.ndim == 1:\n y_pred = y_pred.ravel()\n return y_pred" }, { @@ -144391,7 +155556,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_neighbors", @@ -144401,7 +155567,8 @@ "docstring": { "type": "int, default=5", "description": "Number of neighbors to use by default for :meth:`kneighbors` queries." - } + }, + "refined_type": {} }, { "name": "radius", @@ -144411,7 +155578,8 @@ "docstring": { "type": "float, default=1.0", "description": "Range of parameter space to use by default for :meth:`radius_neighbors`\nqueries." - } + }, + "refined_type": {} }, { "name": "algorithm", @@ -144421,6 +155589,10 @@ "docstring": { "type": "{'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto'", "description": "Algorithm used to compute the nearest neighbors:\n\n- 'ball_tree' will use :class:`BallTree`\n- 'kd_tree' will use :class:`KDTree`\n- 'brute' will use a brute-force search.\n- 'auto' will attempt to decide the most appropriate algorithm\n based on the values passed to :meth:`fit` method.\n\nNote: fitting on sparse input will override the setting of\nthis parameter, using brute force." + }, + "refined_type": { + "kind": "EnumType", + "values": ["auto", "kd_tree", "brute", "ball_tree"] } }, { @@ -144431,7 +155603,8 @@ "docstring": { "type": "int, default=30", "description": "Leaf size passed to BallTree or KDTree. This can affect the\nspeed of the construction and query, as well as the memory\nrequired to store the tree. The optimal value depends on the\nnature of the problem." 
- } + }, + "refined_type": {} }, { "name": "metric", @@ -144440,8 +155613,9 @@ "assigned_by": "NAME_ONLY", "docstring": { "type": "str or callable, default='minkowski'", - "description": "The distance metric to use for the tree. The default metric is\nminkowski, and with p=2 is equivalent to the standard Euclidean\nmetric. See the documentation of :class:`DistanceMetric` for a\nlist of available metrics.\nIf metric is \"precomputed\", X is assumed to be a distance matrix and\nmust be square during fit. X may be a :term:`sparse graph`,\nin which case only \"nonzero\" elements may be considered neighbors." - } + "description": "The distance metric to use for the tree. The default metric is\nminkowski, and with p=2 is equivalent to the standard Euclidean\nmetric. For a list of available metrics, see the documentation of\n:class:`~sklearn.metrics.DistanceMetric`.\nIf metric is \"precomputed\", X is assumed to be a distance matrix and\nmust be square during fit. X may be a :term:`sparse graph`,\nin which case only \"nonzero\" elements may be considered neighbors." + }, + "refined_type": {} }, { "name": "p", @@ -144451,7 +155625,8 @@ "docstring": { "type": "int, default=2", "description": "Parameter for the Minkowski metric from\nsklearn.metrics.pairwise.pairwise_distances. When p = 1, this is\nequivalent to using manhattan_distance (l1), and euclidean_distance\n(l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used." - } + }, + "refined_type": {} }, { "name": "metric_params", @@ -144461,7 +155636,8 @@ "docstring": { "type": "dict, default=None", "description": "Additional keyword arguments for the metric function." - } + }, + "refined_type": {} }, { "name": "n_jobs", @@ -144471,13 +155647,14 @@ "docstring": { "type": "int, default=None", "description": "The number of parallel jobs to run for neighbors search.\n``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n``-1`` means using all processors. See :term:`Glossary `\nfor more details." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, *, n_neighbors=5, radius=1.0, algorithm='auto', leaf_size=30, metric='minkowski', p=2, metric_params=None, n_jobs=None):\n super().__init__(n_neighbors=n_neighbors, radius=radius, algorithm=algorithm, leaf_size=leaf_size, metric=metric, p=p, metric_params=metric_params, n_jobs=n_jobs)" }, { @@ -144495,7 +155672,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -144505,6 +155683,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features) or (n_samples, n_samples) if metric='precomputed'", "description": "Training data." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -144515,13 +155697,14 @@ "docstring": { "type": "Ignored", "description": "Not used, present for API consistency by convention." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Fit the nearest neighbors estimator from the training dataset.", - "docstring": "Fit the nearest neighbors estimator from the training dataset.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features) or (n_samples, n_samples) if metric='precomputed'\n Training data.\n\ny : Ignored\n Not used, present for API consistency by convention.\n\nReturns\n-------\nself : NearestNeighbors\n The fitted nearest neighbors estimator.", + "docstring": "Fit the nearest neighbors estimator from the training dataset.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features) or (n_samples, n_samples) if metric='precomputed'\n Training data.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n Returns\n -------\n self : NearestNeighbors\n The fitted nearest neighbors estimator.\n ", "source_code": "\ndef fit(self, X, y=None):\n \"\"\"Fit the nearest neighbors estimator from the training dataset.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features) or (n_samples, n_samples) if metric='precomputed'\n Training data.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n Returns\n -------\n self : NearestNeighbors\n The fitted nearest neighbors estimator.\n \"\"\"\n return self._fit(X)" }, { @@ -144539,7 +155722,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "top_path", @@ -144549,14 +155733,15 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", - "source_code": "\ndef configuration(parent_package='', top_path=None):\n import numpy\n from numpy.distutils.misc_util import Configuration\n config = Configuration('neighbors', parent_package, top_path)\n libraries = []\n if os.name == 'posix':\n libraries.append('m')\n config.add_extension('_ball_tree', sources=['_ball_tree.pyx'], include_dirs=[numpy.get_include()], libraries=libraries)\n config.add_extension('_kd_tree', sources=['_kd_tree.pyx'], include_dirs=[numpy.get_include()], libraries=libraries)\n config.add_extension('_partition_nodes', sources=['_partition_nodes.pyx'], include_dirs=[numpy.get_include()], language='c++', libraries=libraries)\n config.add_extension('_dist_metrics', sources=['_dist_metrics.pyx'], include_dirs=[numpy.get_include(), os.path.join(numpy.get_include(), 'numpy')], libraries=libraries)\n config.add_extension('_typedefs', sources=['_typedefs.pyx'], include_dirs=[numpy.get_include()], libraries=libraries)\n config.add_extension('_quad_tree', sources=['_quad_tree.pyx'], include_dirs=[numpy.get_include()], libraries=libraries)\n config.add_subpackage('tests')\n return config" + "docstring": null, + "source_code": "\ndef configuration(parent_package='', top_path=None):\n import numpy\n from numpy.distutils.misc_util import Configuration\n config = Configuration('neighbors', parent_package, top_path)\n libraries = []\n if os.name == 'posix':\n libraries.append('m')\n config.add_extension('_ball_tree', sources=['_ball_tree.pyx'], include_dirs=[numpy.get_include()], libraries=libraries)\n config.add_extension('_kd_tree', sources=['_kd_tree.pyx'], include_dirs=[numpy.get_include()], libraries=libraries)\n config.add_extension('_partition_nodes', sources=['_partition_nodes.pyx'], include_dirs=[numpy.get_include()], language='c++', libraries=libraries)\n 
config.add_extension('_quad_tree', sources=['_quad_tree.pyx'], include_dirs=[numpy.get_include()], libraries=libraries)\n config.add_subpackage('tests')\n return config" }, { "name": "binary_log_loss", @@ -144573,7 +155758,8 @@ "docstring": { "type": "array-like or label indicator matrix", "description": "Ground truth (correct) labels." - } + }, + "refined_type": {} }, { "name": "y_prob", @@ -144583,13 +155769,14 @@ "docstring": { "type": "array-like of float, shape = (n_samples, 1)", "description": "Predicted probabilities, as returned by a classifier's\npredict_proba method." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Compute binary logistic loss for classification.\n\nThis is identical to log_loss in binary classification case, but is kept for its use in multilabel case.", - "docstring": "Compute binary logistic loss for classification.\n\nThis is identical to log_loss in binary classification case,\nbut is kept for its use in multilabel case.\n\nParameters\n----------\ny_true : array-like or label indicator matrix\n Ground truth (correct) labels.\n\ny_prob : array-like of float, shape = (n_samples, 1)\n Predicted probabilities, as returned by a classifier's\n predict_proba method.\n\nReturns\n-------\nloss : float\n The degree to which the samples are correctly predicted.", + "description": "Compute binary logistic loss for classification.\n\nThis is identical to log_loss in binary classification case,\nbut is kept for its use in multilabel case.", + "docstring": "Compute binary logistic loss for classification.\n\n This is identical to log_loss in binary classification case,\n but is kept for its use in multilabel case.\n\n Parameters\n ----------\n y_true : array-like or label indicator matrix\n Ground truth (correct) labels.\n\n y_prob : array-like of float, shape = (n_samples, 1)\n Predicted probabilities, as returned by a classifier's\n predict_proba method.\n\n Returns\n -------\n loss : float\n The degree to which the samples are correctly predicted.\n ", "source_code": "\ndef binary_log_loss(y_true, y_prob):\n \"\"\"Compute binary logistic loss for classification.\n\n This is identical to log_loss in binary classification case,\n but is kept for its use in multilabel case.\n\n Parameters\n ----------\n y_true : array-like or label indicator matrix\n Ground truth (correct) labels.\n\n y_prob : array-like of float, shape = (n_samples, 1)\n Predicted probabilities, as returned by a classifier's\n predict_proba method.\n\n Returns\n -------\n loss : float\n The degree to which the samples are correctly predicted.\n \"\"\"\n eps = np.finfo(y_prob.dtype).eps\n y_prob = np.clip(y_prob, eps, 1 - eps)\n return -(xlogy(y_true, y_prob).sum() + xlogy(1 - y_true, 1 - y_prob).sum()) / y_prob.shape[0]" }, { @@ -144607,13 +155794,17 @@ "docstring": { "type": "{array-like, sparse matrix}, shape (n_samples, n_features)", "description": "Data, where `n_samples` is the number of samples\nand `n_features` is the number of features." 
+ }, + "refined_type": { + "kind": "EnumType", + "values": [] } } ], "results": [], "is_public": false, "description": "Simply leave the input array unchanged.", - "docstring": "Simply leave the input array unchanged.\n\nParameters\n----------\nX : {array-like, sparse matrix}, shape (n_samples, n_features)\n Data, where `n_samples` is the number of samples\n and `n_features` is the number of features.", + "docstring": "Simply leave the input array unchanged.\n\n Parameters\n ----------\n X : {array-like, sparse matrix}, shape (n_samples, n_features)\n Data, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n ", "source_code": "\ndef inplace_identity(X):\n \"\"\"Simply leave the input array unchanged.\n\n Parameters\n ----------\n X : {array-like, sparse matrix}, shape (n_samples, n_features)\n Data, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n \"\"\"\n " }, { @@ -144631,6 +155822,10 @@ "docstring": { "type": "{array-like, sparse matrix}, shape (n_samples, n_features)", "description": "The data which was output from the identity activation function during\nthe forward pass." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -144641,13 +155836,17 @@ "docstring": { "type": "{array-like}, shape (n_samples, n_features)", "description": "The backpropagated error signal to be modified inplace." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } } ], "results": [], "is_public": false, "description": "Apply the derivative of the identity function: do nothing.", - "docstring": "Apply the derivative of the identity function: do nothing.\n\nParameters\n----------\nZ : {array-like, sparse matrix}, shape (n_samples, n_features)\n The data which was output from the identity activation function during\n the forward pass.\n\ndelta : {array-like}, shape (n_samples, n_features)\n The backpropagated error signal to be modified inplace.", + "docstring": "Apply the derivative of the identity function: do nothing.\n\n Parameters\n ----------\n Z : {array-like, sparse matrix}, shape (n_samples, n_features)\n The data which was output from the identity activation function during\n the forward pass.\n\n delta : {array-like}, shape (n_samples, n_features)\n The backpropagated error signal to be modified inplace.\n ", "source_code": "\ndef inplace_identity_derivative(Z, delta):\n \"\"\"Apply the derivative of the identity function: do nothing.\n\n Parameters\n ----------\n Z : {array-like, sparse matrix}, shape (n_samples, n_features)\n The data which was output from the identity activation function during\n the forward pass.\n\n delta : {array-like}, shape (n_samples, n_features)\n The backpropagated error signal to be modified inplace.\n \"\"\"\n " }, { @@ -144665,13 +155864,17 @@ "docstring": { "type": "{array-like, sparse matrix}, shape (n_samples, n_features)", "description": "The input data." 
+ }, + "refined_type": { + "kind": "EnumType", + "values": [] } } ], "results": [], "is_public": false, "description": "Compute the logistic function inplace.", - "docstring": "Compute the logistic function inplace.\n\nParameters\n----------\nX : {array-like, sparse matrix}, shape (n_samples, n_features)\n The input data.", + "docstring": "Compute the logistic function inplace.\n\n Parameters\n ----------\n X : {array-like, sparse matrix}, shape (n_samples, n_features)\n The input data.\n ", "source_code": "\ndef inplace_logistic(X):\n \"\"\"Compute the logistic function inplace.\n\n Parameters\n ----------\n X : {array-like, sparse matrix}, shape (n_samples, n_features)\n The input data.\n \"\"\"\n logistic_sigmoid(X, out=X)" }, { @@ -144689,6 +155892,10 @@ "docstring": { "type": "{array-like, sparse matrix}, shape (n_samples, n_features)", "description": "The data which was output from the logistic activation function during\nthe forward pass." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -144699,13 +155906,17 @@ "docstring": { "type": "{array-like}, shape (n_samples, n_features)", "description": "The backpropagated error signal to be modified inplace." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } } ], "results": [], "is_public": false, - "description": "Apply the derivative of the logistic sigmoid function.\n\nIt exploits the fact that the derivative is a simple function of the output value from logistic function.", - "docstring": "Apply the derivative of the logistic sigmoid function.\n\nIt exploits the fact that the derivative is a simple function of the output\nvalue from logistic function.\n\nParameters\n----------\nZ : {array-like, sparse matrix}, shape (n_samples, n_features)\n The data which was output from the logistic activation function during\n the forward pass.\n\ndelta : {array-like}, shape (n_samples, n_features)\n The backpropagated error signal to be modified inplace.", + "description": "Apply the derivative of the logistic sigmoid function.\n\nIt exploits the fact that the derivative is a simple function of the output\nvalue from logistic function.", + "docstring": "Apply the derivative of the logistic sigmoid function.\n\n It exploits the fact that the derivative is a simple function of the output\n value from logistic function.\n\n Parameters\n ----------\n Z : {array-like, sparse matrix}, shape (n_samples, n_features)\n The data which was output from the logistic activation function during\n the forward pass.\n\n delta : {array-like}, shape (n_samples, n_features)\n The backpropagated error signal to be modified inplace.\n ", "source_code": "\ndef inplace_logistic_derivative(Z, delta):\n \"\"\"Apply the derivative of the logistic sigmoid function.\n\n It exploits the fact that the derivative is a simple function of the output\n value from logistic function.\n\n Parameters\n ----------\n Z : {array-like, sparse matrix}, shape (n_samples, n_features)\n The data which was output from the logistic activation function during\n the forward pass.\n\n delta : {array-like}, shape (n_samples, n_features)\n The backpropagated error signal to be modified inplace.\n \"\"\"\n delta *= Z\n delta *= 1 - Z" }, { @@ -144723,13 +155934,17 @@ "docstring": { "type": "{array-like, sparse matrix}, shape (n_samples, n_features)", "description": "The input data." 
+ }, + "refined_type": { + "kind": "EnumType", + "values": [] } } ], "results": [], "is_public": false, "description": "Compute the rectified linear unit function inplace.", - "docstring": "Compute the rectified linear unit function inplace.\n\nParameters\n----------\nX : {array-like, sparse matrix}, shape (n_samples, n_features)\n The input data.", + "docstring": "Compute the rectified linear unit function inplace.\n\n Parameters\n ----------\n X : {array-like, sparse matrix}, shape (n_samples, n_features)\n The input data.\n ", "source_code": "\ndef inplace_relu(X):\n \"\"\"Compute the rectified linear unit function inplace.\n\n Parameters\n ----------\n X : {array-like, sparse matrix}, shape (n_samples, n_features)\n The input data.\n \"\"\"\n np.maximum(X, 0, out=X)" }, { @@ -144747,6 +155962,10 @@ "docstring": { "type": "{array-like, sparse matrix}, shape (n_samples, n_features)", "description": "The data which was output from the rectified linear units activation\nfunction during the forward pass." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -144757,13 +155976,17 @@ "docstring": { "type": "{array-like}, shape (n_samples, n_features)", "description": "The backpropagated error signal to be modified inplace." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } } ], "results": [], "is_public": false, - "description": "Apply the derivative of the relu function.\n\nIt exploits the fact that the derivative is a simple function of the output value from rectified linear units activation function.", - "docstring": "Apply the derivative of the relu function.\n\nIt exploits the fact that the derivative is a simple function of the output\nvalue from rectified linear units activation function.\n\nParameters\n----------\nZ : {array-like, sparse matrix}, shape (n_samples, n_features)\n The data which was output from the rectified linear units activation\n function during the forward pass.\n\ndelta : {array-like}, shape (n_samples, n_features)\n The backpropagated error signal to be modified inplace.", + "description": "Apply the derivative of the relu function.\n\nIt exploits the fact that the derivative is a simple function of the output\nvalue from rectified linear units activation function.", + "docstring": "Apply the derivative of the relu function.\n\n It exploits the fact that the derivative is a simple function of the output\n value from rectified linear units activation function.\n\n Parameters\n ----------\n Z : {array-like, sparse matrix}, shape (n_samples, n_features)\n The data which was output from the rectified linear units activation\n function during the forward pass.\n\n delta : {array-like}, shape (n_samples, n_features)\n The backpropagated error signal to be modified inplace.\n ", "source_code": "\ndef inplace_relu_derivative(Z, delta):\n \"\"\"Apply the derivative of the relu function.\n\n It exploits the fact that the derivative is a simple function of the output\n value from rectified linear units activation function.\n\n Parameters\n ----------\n Z : {array-like, sparse matrix}, shape (n_samples, n_features)\n The data which was output from the rectified linear units activation\n function during the forward pass.\n\n delta : {array-like}, shape (n_samples, n_features)\n The backpropagated error signal to be modified inplace.\n \"\"\"\n delta[Z == 0] = 0" }, { @@ -144781,13 +156004,17 @@ "docstring": { "type": "{array-like, sparse matrix}, shape (n_samples, n_features)", "description": "The input data." 
+ }, + "refined_type": { + "kind": "EnumType", + "values": [] } } ], "results": [], "is_public": false, "description": "Compute the K-way softmax function inplace.", - "docstring": "Compute the K-way softmax function inplace.\n\nParameters\n----------\nX : {array-like, sparse matrix}, shape (n_samples, n_features)\n The input data.", + "docstring": "Compute the K-way softmax function inplace.\n\n Parameters\n ----------\n X : {array-like, sparse matrix}, shape (n_samples, n_features)\n The input data.\n ", "source_code": "\ndef inplace_softmax(X):\n \"\"\"Compute the K-way softmax function inplace.\n\n Parameters\n ----------\n X : {array-like, sparse matrix}, shape (n_samples, n_features)\n The input data.\n \"\"\"\n tmp = X - X.max(axis=1)[:, np.newaxis]\n np.exp(tmp, out=X)\n X /= X.sum(axis=1)[:, np.newaxis]" }, { @@ -144805,13 +156032,17 @@ "docstring": { "type": "{array-like, sparse matrix}, shape (n_samples, n_features)", "description": "The input data." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } } ], "results": [], "is_public": false, "description": "Compute the hyperbolic tan function inplace.", - "docstring": "Compute the hyperbolic tan function inplace.\n\nParameters\n----------\nX : {array-like, sparse matrix}, shape (n_samples, n_features)\n The input data.", + "docstring": "Compute the hyperbolic tan function inplace.\n\n Parameters\n ----------\n X : {array-like, sparse matrix}, shape (n_samples, n_features)\n The input data.\n ", "source_code": "\ndef inplace_tanh(X):\n \"\"\"Compute the hyperbolic tan function inplace.\n\n Parameters\n ----------\n X : {array-like, sparse matrix}, shape (n_samples, n_features)\n The input data.\n \"\"\"\n np.tanh(X, out=X)" }, { @@ -144829,6 +156060,10 @@ "docstring": { "type": "{array-like, sparse matrix}, shape (n_samples, n_features)", "description": "The data which was output from the hyperbolic tangent activation\nfunction during the forward pass." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -144839,13 +156074,17 @@ "docstring": { "type": "{array-like}, shape (n_samples, n_features)", "description": "The backpropagated error signal to be modified inplace." 
+ }, + "refined_type": { + "kind": "EnumType", + "values": [] } } ], "results": [], "is_public": false, - "description": "Apply the derivative of the hyperbolic tanh function.\n\nIt exploits the fact that the derivative is a simple function of the output value from hyperbolic tangent.", - "docstring": "Apply the derivative of the hyperbolic tanh function.\n\nIt exploits the fact that the derivative is a simple function of the output\nvalue from hyperbolic tangent.\n\nParameters\n----------\nZ : {array-like, sparse matrix}, shape (n_samples, n_features)\n The data which was output from the hyperbolic tangent activation\n function during the forward pass.\n\ndelta : {array-like}, shape (n_samples, n_features)\n The backpropagated error signal to be modified inplace.", + "description": "Apply the derivative of the hyperbolic tanh function.\n\nIt exploits the fact that the derivative is a simple function of the output\nvalue from hyperbolic tangent.", + "docstring": "Apply the derivative of the hyperbolic tanh function.\n\n It exploits the fact that the derivative is a simple function of the output\n value from hyperbolic tangent.\n\n Parameters\n ----------\n Z : {array-like, sparse matrix}, shape (n_samples, n_features)\n The data which was output from the hyperbolic tangent activation\n function during the forward pass.\n\n delta : {array-like}, shape (n_samples, n_features)\n The backpropagated error signal to be modified inplace.\n ", "source_code": "\ndef inplace_tanh_derivative(Z, delta):\n \"\"\"Apply the derivative of the hyperbolic tanh function.\n\n It exploits the fact that the derivative is a simple function of the output\n value from hyperbolic tangent.\n\n Parameters\n ----------\n Z : {array-like, sparse matrix}, shape (n_samples, n_features)\n The data which was output from the hyperbolic tangent activation\n function during the forward pass.\n\n delta : {array-like}, shape (n_samples, n_features)\n The backpropagated error signal to be modified inplace.\n \"\"\"\n delta *= 1 - Z**2" }, { @@ -144863,7 +156102,8 @@ "docstring": { "type": "array-like or label indicator matrix", "description": "Ground truth (correct) labels." - } + }, + "refined_type": {} }, { "name": "y_prob", @@ -144873,13 +156113,14 @@ "docstring": { "type": "array-like of float, shape = (n_samples, n_classes)", "description": "Predicted probabilities, as returned by a classifier's\npredict_proba method." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Compute Logistic loss for classification.", - "docstring": "Compute Logistic loss for classification.\n\nParameters\n----------\ny_true : array-like or label indicator matrix\n Ground truth (correct) labels.\n\ny_prob : array-like of float, shape = (n_samples, n_classes)\n Predicted probabilities, as returned by a classifier's\n predict_proba method.\n\nReturns\n-------\nloss : float\n The degree to which the samples are correctly predicted.", + "docstring": "Compute Logistic loss for classification.\n\n Parameters\n ----------\n y_true : array-like or label indicator matrix\n Ground truth (correct) labels.\n\n y_prob : array-like of float, shape = (n_samples, n_classes)\n Predicted probabilities, as returned by a classifier's\n predict_proba method.\n\n Returns\n -------\n loss : float\n The degree to which the samples are correctly predicted.\n ", "source_code": "\ndef log_loss(y_true, y_prob):\n \"\"\"Compute Logistic loss for classification.\n\n Parameters\n ----------\n y_true : array-like or label indicator matrix\n Ground truth (correct) labels.\n\n y_prob : array-like of float, shape = (n_samples, n_classes)\n Predicted probabilities, as returned by a classifier's\n predict_proba method.\n\n Returns\n -------\n loss : float\n The degree to which the samples are correctly predicted.\n \"\"\"\n eps = np.finfo(y_prob.dtype).eps\n y_prob = np.clip(y_prob, eps, 1 - eps)\n if y_prob.shape[1] == 1:\n y_prob = np.append(1 - y_prob, y_prob, axis=1)\n if y_true.shape[1] == 1:\n y_true = np.append(1 - y_true, y_true, axis=1)\n return -xlogy(y_true, y_prob).sum() / y_prob.shape[0]" }, { @@ -144897,7 +156138,8 @@ "docstring": { "type": "array-like or label indicator matrix", "description": "Ground truth (correct) values." - } + }, + "refined_type": {} }, { "name": "y_pred", @@ -144907,13 +156149,14 @@ "docstring": { "type": "array-like or label indicator matrix", "description": "Predicted values, as returned by a regression estimator." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Compute the squared loss for regression.", - "docstring": "Compute the squared loss for regression.\n\nParameters\n----------\ny_true : array-like or label indicator matrix\n Ground truth (correct) values.\n\ny_pred : array-like or label indicator matrix\n Predicted values, as returned by a regression estimator.\n\nReturns\n-------\nloss : float\n The degree to which the samples are correctly predicted.", + "docstring": "Compute the squared loss for regression.\n\n Parameters\n ----------\n y_true : array-like or label indicator matrix\n Ground truth (correct) values.\n\n y_pred : array-like or label indicator matrix\n Predicted values, as returned by a regression estimator.\n\n Returns\n -------\n loss : float\n The degree to which the samples are correctly predicted.\n ", "source_code": "\ndef squared_loss(y_true, y_pred):\n \"\"\"Compute the squared loss for regression.\n\n Parameters\n ----------\n y_true : array-like or label indicator matrix\n Ground truth (correct) values.\n\n y_pred : array-like or label indicator matrix\n Predicted values, as returned by a regression estimator.\n\n Returns\n -------\n loss : float\n The degree to which the samples are correctly predicted.\n \"\"\"\n return ((y_true - y_pred)**2).mean() / 2" }, { @@ -144931,7 +156174,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "hidden_layer_sizes", @@ -144941,7 +156185,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "activation", @@ -144951,7 +156196,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "solver", @@ -144961,7 +156207,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "alpha", @@ -144971,7 +156218,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "batch_size", @@ -144981,7 +156229,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "learning_rate", @@ -144991,7 +156240,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "learning_rate_init", @@ -145001,7 +156251,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "power_t", @@ -145011,7 +156262,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "max_iter", @@ -145021,7 +156273,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "loss", @@ -145031,7 +156284,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "shuffle", @@ -145041,7 +156295,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "random_state", @@ -145051,7 +156306,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "tol", @@ -145061,7 +156317,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "verbose", @@ -145071,7 +156328,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "warm_start", @@ -145081,7 +156339,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "momentum", @@ -145091,7 +156350,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "nesterovs_momentum", @@ -145101,7 +156361,8 @@ "docstring": { 
"type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "early_stopping", @@ -145111,7 +156372,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "validation_fraction", @@ -145121,7 +156383,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "beta_1", @@ -145131,7 +156394,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "beta_2", @@ -145141,7 +156405,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "epsilon", @@ -145151,7 +156416,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_iter_no_change", @@ -145161,7 +156427,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "max_fun", @@ -145171,13 +156438,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@abstractmethod\ndef __init__(self, hidden_layer_sizes, activation, solver, alpha, batch_size, learning_rate, learning_rate_init, power_t, max_iter, loss, shuffle, random_state, tol, verbose, warm_start, momentum, nesterovs_momentum, early_stopping, validation_fraction, beta_1, beta_2, epsilon, n_iter_no_change, max_fun):\n self.activation = activation\n self.solver = solver\n self.alpha = alpha\n self.batch_size = batch_size\n self.learning_rate = learning_rate\n self.learning_rate_init = learning_rate_init\n self.power_t = power_t\n self.max_iter = max_iter\n self.loss = loss\n self.hidden_layer_sizes = hidden_layer_sizes\n self.shuffle = shuffle\n self.random_state = random_state\n self.tol = tol\n self.verbose = verbose\n self.warm_start = warm_start\n self.momentum = momentum\n self.nesterovs_momentum = nesterovs_momentum\n self.early_stopping = early_stopping\n self.validation_fraction = validation_fraction\n self.beta_1 = beta_1\n self.beta_2 = beta_2\n self.epsilon = epsilon\n self.n_iter_no_change = n_iter_no_change\n self.max_fun = max_fun" }, { @@ -145195,7 +156463,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -145205,6 +156474,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "The input data." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -145215,7 +156488,8 @@ "docstring": { "type": "ndarray of shape (n_samples,)", "description": "The target values." - } + }, + "refined_type": {} }, { "name": "activations", @@ -145225,7 +156499,8 @@ "docstring": { "type": "list, length = n_layers - 1", "description": "The ith element of the list holds the values of the ith layer." 
- } + }, + "refined_type": {} }, { "name": "deltas", @@ -145235,7 +156510,8 @@ "docstring": { "type": "list, length = n_layers - 1", "description": "The ith element of the list holds the difference between the\nactivations of the i + 1 layer and the backpropagated error.\nMore specifically, deltas are gradients of loss with respect to z\nin each layer, where z = wx + b is the value of a particular layer\nbefore passing through the activation function" - } + }, + "refined_type": {} }, { "name": "coef_grads", @@ -145245,7 +156521,8 @@ "docstring": { "type": "list, length = n_layers - 1", "description": "The ith element contains the amount of change used to update the\ncoefficient parameters of the ith layer in an iteration." - } + }, + "refined_type": {} }, { "name": "intercept_grads", @@ -145255,13 +156532,14 @@ "docstring": { "type": "list, length = n_layers - 1", "description": "The ith element contains the amount of change used to update the\nintercept parameters of the ith layer in an iteration." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Compute the MLP loss function and its corresponding derivatives with respect to each parameter: weights and bias vectors.", - "docstring": "Compute the MLP loss function and its corresponding derivatives\nwith respect to each parameter: weights and bias vectors.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input data.\n\ny : ndarray of shape (n_samples,)\n The target values.\n\nactivations : list, length = n_layers - 1\n The ith element of the list holds the values of the ith layer.\n\ndeltas : list, length = n_layers - 1\n The ith element of the list holds the difference between the\n activations of the i + 1 layer and the backpropagated error.\n More specifically, deltas are gradients of loss with respect to z\n in each layer, where z = wx + b is the value of a particular layer\n before passing through the activation function\n\ncoef_grads : list, length = n_layers - 1\n The ith element contains the amount of change used to update the\n coefficient parameters of the ith layer in an iteration.\n\nintercept_grads : list, length = n_layers - 1\n The ith element contains the amount of change used to update the\n intercept parameters of the ith layer in an iteration.\n\nReturns\n-------\nloss : float\ncoef_grads : list, length = n_layers - 1\nintercept_grads : list, length = n_layers - 1", + "description": "Compute the MLP loss function and its corresponding derivatives\nwith respect to each parameter: weights and bias vectors.", + "docstring": "Compute the MLP loss function and its corresponding derivatives\n with respect to each parameter: weights and bias vectors.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input data.\n\n y : ndarray of shape (n_samples,)\n The target values.\n\n activations : list, length = n_layers - 1\n The ith element of the list holds the values of the ith layer.\n\n deltas : list, length = n_layers - 1\n The ith element of the list holds the difference between the\n activations of the i + 1 layer and the backpropagated error.\n More specifically, deltas are gradients of loss with respect to z\n in each layer, where z = wx + b is the value of a particular layer\n before passing through the activation function\n\n coef_grads : list, length = n_layers - 1\n The ith element contains the amount of change used to update the\n coefficient parameters of the ith layer in an 
iteration.\n\n intercept_grads : list, length = n_layers - 1\n The ith element contains the amount of change used to update the\n intercept parameters of the ith layer in an iteration.\n\n Returns\n -------\n loss : float\n coef_grads : list, length = n_layers - 1\n intercept_grads : list, length = n_layers - 1\n ", "source_code": "\ndef _backprop(self, X, y, activations, deltas, coef_grads, intercept_grads):\n \"\"\"Compute the MLP loss function and its corresponding derivatives\n with respect to each parameter: weights and bias vectors.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input data.\n\n y : ndarray of shape (n_samples,)\n The target values.\n\n activations : list, length = n_layers - 1\n The ith element of the list holds the values of the ith layer.\n\n deltas : list, length = n_layers - 1\n The ith element of the list holds the difference between the\n activations of the i + 1 layer and the backpropagated error.\n More specifically, deltas are gradients of loss with respect to z\n in each layer, where z = wx + b is the value of a particular layer\n before passing through the activation function\n\n coef_grads : list, length = n_layers - 1\n The ith element contains the amount of change used to update the\n coefficient parameters of the ith layer in an iteration.\n\n intercept_grads : list, length = n_layers - 1\n The ith element contains the amount of change used to update the\n intercept parameters of the ith layer in an iteration.\n\n Returns\n -------\n loss : float\n coef_grads : list, length = n_layers - 1\n intercept_grads : list, length = n_layers - 1\n \"\"\"\n n_samples = X.shape[0]\n activations = self._forward_pass(activations)\n loss_func_name = self.loss\n if loss_func_name == 'log_loss' and self.out_activation_ == 'logistic':\n loss_func_name = 'binary_log_loss'\n loss = LOSS_FUNCTIONS[loss_func_name](y, activations[-1])\n values = 0\n for s in self.coefs_:\n s = s.ravel()\n values += np.dot(s, s)\n loss += 0.5 * self.alpha * values / n_samples\n last = self.n_layers_ - 2\n deltas[last] = activations[-1] - y\n self._compute_loss_grad(last, n_samples, activations, deltas, coef_grads, intercept_grads)\n inplace_derivative = DERIVATIVES[self.activation]\n for i in range(self.n_layers_ - 2, 0, -1):\n deltas[i - 1] = safe_sparse_dot(deltas[i], self.coefs_[i].T)\n inplace_derivative(activations[i], deltas[i - 1])\n self._compute_loss_grad(i - 1, n_samples, activations, deltas, coef_grads, intercept_grads)\n return loss, coef_grads, intercept_grads" }, { @@ -145279,13 +156557,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _check_solver(self):\n if self.solver not in _STOCHASTIC_SOLVERS:\n raise AttributeError('partial_fit is only available for stochastic optimizers. %s is not stochastic.' 
% self.solver)\n return True" }, { @@ -145303,7 +156582,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "layer", @@ -145313,7 +156593,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_samples", @@ -145323,7 +156604,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "activations", @@ -145333,7 +156615,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "deltas", @@ -145343,7 +156626,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "coef_grads", @@ -145353,7 +156637,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "intercept_grads", @@ -145363,13 +156648,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Compute the gradient of loss with respect to coefs and intercept for specified layer.\n\nThis function does backpropagation for the specified one layer.", - "docstring": "Compute the gradient of loss with respect to coefs and intercept for\nspecified layer.\n\nThis function does backpropagation for the specified one layer.", + "description": "Compute the gradient of loss with respect to coefs and intercept for\nspecified layer.\n\nThis function does backpropagation for the specified one layer.", + "docstring": "Compute the gradient of loss with respect to coefs and intercept for\n specified layer.\n\n This function does backpropagation for the specified one layer.\n ", "source_code": "\ndef _compute_loss_grad(self, layer, n_samples, activations, deltas, coef_grads, intercept_grads):\n \"\"\"Compute the gradient of loss with respect to coefs and intercept for\n specified layer.\n\n This function does backpropagation for the specified one layer.\n \"\"\"\n coef_grads[layer] = safe_sparse_dot(activations[layer].T, deltas[layer])\n coef_grads[layer] += self.alpha * self.coefs_[layer]\n coef_grads[layer] /= n_samples\n intercept_grads[layer] = np.mean(deltas[layer], 0)" }, { @@ -145387,7 +156673,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -145397,7 +156684,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -145407,7 +156695,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "incremental", @@ -145417,13 +156706,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _fit(self, X, y, incremental=False):\n hidden_layer_sizes = self.hidden_layer_sizes\n if not hasattr(hidden_layer_sizes, '__iter__'):\n hidden_layer_sizes = [hidden_layer_sizes]\n hidden_layer_sizes = list(hidden_layer_sizes)\n self._validate_hyperparameters()\n if np.any(np.array(hidden_layer_sizes) <= 0):\n raise ValueError('hidden_layer_sizes must be > 0, got %s.' 
% hidden_layer_sizes)\n first_pass = not hasattr(self, 'coefs_') or not self.warm_start and not incremental\n (X, y) = self._validate_input(X, y, incremental, reset=first_pass)\n (n_samples, n_features) = X.shape\n if y.ndim == 1:\n y = y.reshape((-1, 1))\n self.n_outputs_ = y.shape[1]\n layer_units = [n_features] + hidden_layer_sizes + [self.n_outputs_]\n self._random_state = check_random_state(self.random_state)\n if first_pass:\n self._initialize(y, layer_units, X.dtype)\n activations = [X] + [None] * (len(layer_units) - 1)\n deltas = [None] * (len(activations) - 1)\n coef_grads = [np.empty((n_fan_in_, n_fan_out_), dtype=X.dtype) for (n_fan_in_, n_fan_out_) in zip(layer_units[:-1], layer_units[1:])]\n intercept_grads = [np.empty(n_fan_out_, dtype=X.dtype) for n_fan_out_ in layer_units[1:]]\n if self.solver in _STOCHASTIC_SOLVERS:\n self._fit_stochastic(X, y, activations, deltas, coef_grads, intercept_grads, layer_units, incremental)\n elif self.solver == 'lbfgs':\n self._fit_lbfgs(X, y, activations, deltas, coef_grads, intercept_grads, layer_units)\n return self" }, { @@ -145441,7 +156731,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -145451,7 +156742,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -145461,7 +156753,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "activations", @@ -145471,7 +156764,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "deltas", @@ -145481,7 +156775,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "coef_grads", @@ -145491,7 +156786,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "intercept_grads", @@ -145501,7 +156797,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "layer_units", @@ -145511,13 +156808,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _fit_lbfgs(self, X, y, activations, deltas, coef_grads, intercept_grads, layer_units):\n self._coef_indptr = []\n self._intercept_indptr = []\n start = 0\n for i in range(self.n_layers_ - 1):\n (n_fan_in, n_fan_out) = (layer_units[i], layer_units[i + 1])\n end = start + n_fan_in * n_fan_out\n self._coef_indptr.append((start, end, (n_fan_in, n_fan_out)))\n start = end\n for i in range(self.n_layers_ - 1):\n end = start + layer_units[i + 1]\n self._intercept_indptr.append((start, end))\n start = end\n packed_coef_inter = _pack(self.coefs_, self.intercepts_)\n if self.verbose is True or self.verbose >= 1:\n iprint = 1\n else:\n iprint = -1\n opt_res = scipy.optimize.minimize(self._loss_grad_lbfgs, packed_coef_inter, method='L-BFGS-B', jac=True, options={'maxfun': self.max_fun, 'maxiter': self.max_iter, 'iprint': iprint, 'gtol': self.tol}, args=(X, y, activations, deltas, coef_grads, intercept_grads))\n self.n_iter_ = _check_optimize_result('lbfgs', opt_res, self.max_iter)\n self.loss_ = opt_res.fun\n self._unpack(opt_res.x)" }, { @@ -145535,7 +156833,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -145545,7 +156844,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -145555,7 +156855,8 @@ "docstring": { "type": "", "description": "" - } + }, 
+ "refined_type": {} }, { "name": "activations", @@ -145565,7 +156866,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "deltas", @@ -145575,7 +156877,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "coef_grads", @@ -145585,7 +156888,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "intercept_grads", @@ -145595,7 +156899,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "layer_units", @@ -145605,7 +156910,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "incremental", @@ -145615,13 +156921,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _fit_stochastic(self, X, y, activations, deltas, coef_grads, intercept_grads, layer_units, incremental):\n params = self.coefs_ + self.intercepts_\n if not incremental or not hasattr(self, '_optimizer'):\n if self.solver == 'sgd':\n self._optimizer = SGDOptimizer(params, self.learning_rate_init, self.learning_rate, self.momentum, self.nesterovs_momentum, self.power_t)\n elif self.solver == 'adam':\n self._optimizer = AdamOptimizer(params, self.learning_rate_init, self.beta_1, self.beta_2, self.epsilon)\n early_stopping = self.early_stopping and not incremental\n if early_stopping:\n should_stratify = is_classifier(self) and self.n_outputs_ == 1\n stratify = y if should_stratify else None\n (X, X_val, y, y_val) = train_test_split(X, y, random_state=self._random_state, test_size=self.validation_fraction, stratify=stratify)\n if is_classifier(self):\n y_val = self._label_binarizer.inverse_transform(y_val)\n else:\n X_val = None\n y_val = None\n n_samples = X.shape[0]\n sample_idx = np.arange(n_samples, dtype=int)\n if self.batch_size == 'auto':\n batch_size = min(200, n_samples)\n else:\n if self.batch_size < 1 or self.batch_size > n_samples:\n warnings.warn('Got `batch_size` less than 1 or larger than sample size. It is going to be clipped')\n batch_size = np.clip(self.batch_size, 1, n_samples)\n try:\n for it in range(self.max_iter):\n if self.shuffle:\n sample_idx = shuffle(sample_idx, random_state=self._random_state)\n accumulated_loss = 0.0\n for batch_slice in gen_batches(n_samples, batch_size):\n if self.shuffle:\n X_batch = _safe_indexing(X, sample_idx[batch_slice])\n y_batch = y[sample_idx[batch_slice]]\n else:\n X_batch = X[batch_slice]\n y_batch = y[batch_slice]\n activations[0] = X_batch\n (batch_loss, coef_grads, intercept_grads) = self._backprop(X_batch, y_batch, activations, deltas, coef_grads, intercept_grads)\n accumulated_loss += batch_loss * (batch_slice.stop - batch_slice.start)\n grads = coef_grads + intercept_grads\n self._optimizer.update_params(params, grads)\n self.n_iter_ += 1\n self.loss_ = accumulated_loss / X.shape[0]\n self.t_ += n_samples\n self.loss_curve_.append(self.loss_)\n if self.verbose:\n print('Iteration %d, loss = %.8f' % (self.n_iter_, self.loss_))\n self._update_no_improvement_count(early_stopping, X_val, y_val)\n self._optimizer.iteration_ends(self.t_)\n if self._no_improvement_count > self.n_iter_no_change:\n if early_stopping:\n msg = 'Validation score did not improve more than tol=%f for %d consecutive epochs.' % (self.tol, self.n_iter_no_change)\n else:\n msg = 'Training loss did not improve more than tol=%f for %d consecutive epochs.' 
% (self.tol, self.n_iter_no_change)\n is_stopping = self._optimizer.trigger_stopping(msg, self.verbose)\n if is_stopping:\n break\n else:\n self._no_improvement_count = 0\n if incremental:\n break\n if self.n_iter_ == self.max_iter:\n warnings.warn(\"Stochastic Optimizer: Maximum iterations (%d) reached and the optimization hasn't converged yet.\" % self.max_iter, ConvergenceWarning)\n except KeyboardInterrupt:\n warnings.warn('Training interrupted by user.')\n if early_stopping:\n self.coefs_ = self._best_coefs\n self.intercepts_ = self._best_intercepts" }, { @@ -145639,7 +156946,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "activations", @@ -145649,13 +156957,14 @@ "docstring": { "type": "list, length = n_layers - 1", "description": "The ith element of the list holds the values of the ith layer." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Perform a forward pass on the network by computing the values of the neurons in the hidden layers and the output layer.", - "docstring": "Perform a forward pass on the network by computing the values\nof the neurons in the hidden layers and the output layer.\n\nParameters\n----------\nactivations : list, length = n_layers - 1\n The ith element of the list holds the values of the ith layer.", + "description": "Perform a forward pass on the network by computing the values\nof the neurons in the hidden layers and the output layer.", + "docstring": "Perform a forward pass on the network by computing the values\n of the neurons in the hidden layers and the output layer.\n\n Parameters\n ----------\n activations : list, length = n_layers - 1\n The ith element of the list holds the values of the ith layer.\n ", "source_code": "\ndef _forward_pass(self, activations):\n \"\"\"Perform a forward pass on the network by computing the values\n of the neurons in the hidden layers and the output layer.\n\n Parameters\n ----------\n activations : list, length = n_layers - 1\n The ith element of the list holds the values of the ith layer.\n \"\"\"\n hidden_activation = ACTIVATIONS[self.activation]\n for i in range(self.n_layers_ - 1):\n activations[i + 1] = safe_sparse_dot(activations[i], self.coefs_[i])\n activations[i + 1] += self.intercepts_[i]\n if i + 1 != self.n_layers_ - 1:\n hidden_activation(activations[i + 1])\n output_activation = ACTIVATIONS[self.out_activation_]\n output_activation(activations[i + 1])\n return activations" }, { @@ -145673,7 +156982,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -145683,13 +156993,17 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "The input data." 
+ }, + "refined_type": { + "kind": "EnumType", + "values": [] } } ], "results": [], "is_public": false, - "description": "Predict using the trained model\n\nThis is the same as _forward_pass but does not record the activations of all layers and only returns the last layer's activation.", - "docstring": "Predict using the trained model\n\nThis is the same as _forward_pass but does not record the activations\nof all layers and only returns the last layer's activation.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input data.\n\nReturns\n-------\ny_pred : ndarray of shape (n_samples,) or (n_samples, n_outputs)\n The decision function of the samples for each class in the model.", + "description": "Predict using the trained model\n\nThis is the same as _forward_pass but does not record the activations\nof all layers and only returns the last layer's activation.", + "docstring": "Predict using the trained model\n\n This is the same as _forward_pass but does not record the activations\n of all layers and only returns the last layer's activation.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input data.\n\n Returns\n -------\n y_pred : ndarray of shape (n_samples,) or (n_samples, n_outputs)\n The decision function of the samples for each class in the model.\n ", "source_code": "\ndef _forward_pass_fast(self, X):\n \"\"\"Predict using the trained model\n\n This is the same as _forward_pass but does not record the activations\n of all layers and only returns the last layer's activation.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input data.\n\n Returns\n -------\n y_pred : ndarray of shape (n_samples,) or (n_samples, n_outputs)\n The decision function of the samples for each class in the model.\n \"\"\"\n X = self._validate_data(X, accept_sparse=['csr', 'csc'], reset=False)\n activation = X\n hidden_activation = ACTIVATIONS[self.activation]\n for i in range(self.n_layers_ - 1):\n activation = safe_sparse_dot(activation, self.coefs_[i])\n activation += self.intercepts_[i]\n if i != self.n_layers_ - 2:\n hidden_activation(activation)\n output_activation = ACTIVATIONS[self.out_activation_]\n output_activation(activation)\n return activation" }, { @@ -145707,7 +157021,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "fan_in", @@ -145717,7 +157032,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "fan_out", @@ -145727,7 +157043,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "dtype", @@ -145737,13 +157054,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _init_coef(self, fan_in, fan_out, dtype):\n factor = 6.0\n if self.activation == 'logistic':\n factor = 2.0\n init_bound = np.sqrt(factor / (fan_in + fan_out))\n coef_init = self._random_state.uniform(-init_bound, init_bound, (fan_in, fan_out))\n intercept_init = self._random_state.uniform(-init_bound, init_bound, fan_out)\n coef_init = coef_init.astype(dtype, copy=False)\n intercept_init = intercept_init.astype(dtype, copy=False)\n return coef_init, intercept_init" }, { @@ -145761,7 +157079,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -145771,7 +157090,8 @@ 
"docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "layer_units", @@ -145781,7 +157101,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "dtype", @@ -145791,13 +157112,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _initialize(self, y, layer_units, dtype):\n self.n_iter_ = 0\n self.t_ = 0\n self.n_outputs_ = y.shape[1]\n self.n_layers_ = len(layer_units)\n if not is_classifier(self):\n self.out_activation_ = 'identity'\n elif self._label_binarizer.y_type_ == 'multiclass':\n self.out_activation_ = 'softmax'\n else:\n self.out_activation_ = 'logistic'\n self.coefs_ = []\n self.intercepts_ = []\n for i in range(self.n_layers_ - 1):\n (coef_init, intercept_init) = self._init_coef(layer_units[i], layer_units[i + 1], dtype)\n self.coefs_.append(coef_init)\n self.intercepts_.append(intercept_init)\n if self.solver in _STOCHASTIC_SOLVERS:\n self.loss_curve_ = []\n self._no_improvement_count = 0\n if self.early_stopping:\n self.validation_scores_ = []\n self.best_validation_score_ = -np.inf\n else:\n self.best_loss_ = np.inf" }, { @@ -145815,7 +157137,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "packed_coef_inter", @@ -145825,7 +157148,8 @@ "docstring": { "type": "ndarray", "description": "A vector comprising the flattened coefficients and intercepts." - } + }, + "refined_type": {} }, { "name": "X", @@ -145835,6 +157159,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "The input data." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -145845,7 +157173,8 @@ "docstring": { "type": "ndarray of shape (n_samples,)", "description": "The target values." - } + }, + "refined_type": {} }, { "name": "activations", @@ -145855,7 +157184,8 @@ "docstring": { "type": "list, length = n_layers - 1", "description": "The ith element of the list holds the values of the ith layer." - } + }, + "refined_type": {} }, { "name": "deltas", @@ -145865,7 +157195,8 @@ "docstring": { "type": "list, length = n_layers - 1", "description": "The ith element of the list holds the difference between the\nactivations of the i + 1 layer and the backpropagated error.\nMore specifically, deltas are gradients of loss with respect to z\nin each layer, where z = wx + b is the value of a particular layer\nbefore passing through the activation function" - } + }, + "refined_type": {} }, { "name": "coef_grads", @@ -145875,7 +157206,8 @@ "docstring": { "type": "list, length = n_layers - 1", "description": "The ith element contains the amount of change used to update the\ncoefficient parameters of the ith layer in an iteration." - } + }, + "refined_type": {} }, { "name": "intercept_grads", @@ -145885,13 +157217,14 @@ "docstring": { "type": "list, length = n_layers - 1", "description": "The ith element contains the amount of change used to update the\nintercept parameters of the ith layer in an iteration." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Compute the MLP loss function and its corresponding derivatives with respect to the different parameters given in the initialization.\n\nReturned gradients are packed in a single vector so it can be used in lbfgs", - "docstring": "Compute the MLP loss function and its corresponding derivatives\nwith respect to the different parameters given in the initialization.\n\nReturned gradients are packed in a single vector so it can be used\nin lbfgs\n\nParameters\n----------\npacked_coef_inter : ndarray\n A vector comprising the flattened coefficients and intercepts.\n\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input data.\n\ny : ndarray of shape (n_samples,)\n The target values.\n\nactivations : list, length = n_layers - 1\n The ith element of the list holds the values of the ith layer.\n\ndeltas : list, length = n_layers - 1\n The ith element of the list holds the difference between the\n activations of the i + 1 layer and the backpropagated error.\n More specifically, deltas are gradients of loss with respect to z\n in each layer, where z = wx + b is the value of a particular layer\n before passing through the activation function\n\ncoef_grads : list, length = n_layers - 1\n The ith element contains the amount of change used to update the\n coefficient parameters of the ith layer in an iteration.\n\nintercept_grads : list, length = n_layers - 1\n The ith element contains the amount of change used to update the\n intercept parameters of the ith layer in an iteration.\n\nReturns\n-------\nloss : float\ngrad : array-like, shape (number of nodes of all layers,)", + "description": "Compute the MLP loss function and its corresponding derivatives\nwith respect to the different parameters given in the initialization.\n\nReturned gradients are packed in a single vector so it can be used\nin lbfgs", + "docstring": "Compute the MLP loss function and its corresponding derivatives\n with respect to the different parameters given in the initialization.\n\n Returned gradients are packed in a single vector so it can be used\n in lbfgs\n\n Parameters\n ----------\n packed_coef_inter : ndarray\n A vector comprising the flattened coefficients and intercepts.\n\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input data.\n\n y : ndarray of shape (n_samples,)\n The target values.\n\n activations : list, length = n_layers - 1\n The ith element of the list holds the values of the ith layer.\n\n deltas : list, length = n_layers - 1\n The ith element of the list holds the difference between the\n activations of the i + 1 layer and the backpropagated error.\n More specifically, deltas are gradients of loss with respect to z\n in each layer, where z = wx + b is the value of a particular layer\n before passing through the activation function\n\n coef_grads : list, length = n_layers - 1\n The ith element contains the amount of change used to update the\n coefficient parameters of the ith layer in an iteration.\n\n intercept_grads : list, length = n_layers - 1\n The ith element contains the amount of change used to update the\n intercept parameters of the ith layer in an iteration.\n\n Returns\n -------\n loss : float\n grad : array-like, shape (number of nodes of all layers,)\n ", "source_code": "\ndef _loss_grad_lbfgs(self, packed_coef_inter, X, y, activations, deltas, coef_grads, intercept_grads):\n \"\"\"Compute the MLP loss function and its corresponding derivatives\n with respect 
to the different parameters given in the initialization.\n\n Returned gradients are packed in a single vector so it can be used\n in lbfgs\n\n Parameters\n ----------\n packed_coef_inter : ndarray\n A vector comprising the flattened coefficients and intercepts.\n\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input data.\n\n y : ndarray of shape (n_samples,)\n The target values.\n\n activations : list, length = n_layers - 1\n The ith element of the list holds the values of the ith layer.\n\n deltas : list, length = n_layers - 1\n The ith element of the list holds the difference between the\n activations of the i + 1 layer and the backpropagated error.\n More specifically, deltas are gradients of loss with respect to z\n in each layer, where z = wx + b is the value of a particular layer\n before passing through the activation function\n\n coef_grads : list, length = n_layers - 1\n The ith element contains the amount of change used to update the\n coefficient parameters of the ith layer in an iteration.\n\n intercept_grads : list, length = n_layers - 1\n The ith element contains the amount of change used to update the\n intercept parameters of the ith layer in an iteration.\n\n Returns\n -------\n loss : float\n grad : array-like, shape (number of nodes of all layers,)\n \"\"\"\n self._unpack(packed_coef_inter)\n (loss, coef_grads, intercept_grads) = self._backprop(X, y, activations, deltas, coef_grads, intercept_grads)\n grad = _pack(coef_grads, intercept_grads)\n return loss, grad" }, { @@ -145909,7 +157242,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "packed_parameters", @@ -145919,7 +157253,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -145943,7 +157278,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "early_stopping", @@ -145953,7 +157289,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X_val", @@ -145963,7 +157300,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y_val", @@ -145973,13 +157311,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _update_no_improvement_count(self, early_stopping, X_val, y_val):\n if early_stopping:\n self.validation_scores_.append(self.score(X_val, y_val))\n if self.verbose:\n print('Validation score: %f' % self.validation_scores_[-1])\n last_valid_score = self.validation_scores_[-1]\n if last_valid_score < self.best_validation_score_ + self.tol:\n self._no_improvement_count += 1\n else:\n self._no_improvement_count = 0\n if last_valid_score > self.best_validation_score_:\n self.best_validation_score_ = last_valid_score\n self._best_coefs = [c.copy() for c in self.coefs_]\n self._best_intercepts = [i.copy() for i in self.intercepts_]\n else:\n if self.loss_curve_[-1] > self.best_loss_ - self.tol:\n self._no_improvement_count += 1\n else:\n self._no_improvement_count = 0\n if self.loss_curve_[-1] < self.best_loss_:\n self.best_loss_ = self.loss_curve_[-1]" }, { @@ -145997,13 +157336,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _validate_hyperparameters(self):\n if not isinstance(self.shuffle, 
bool):\n raise ValueError('shuffle must be either True or False, got %s.' % self.shuffle)\n if self.max_iter <= 0:\n raise ValueError('max_iter must be > 0, got %s.' % self.max_iter)\n if self.max_fun <= 0:\n raise ValueError('max_fun must be > 0, got %s.' % self.max_fun)\n if self.alpha < 0.0:\n raise ValueError('alpha must be >= 0, got %s.' % self.alpha)\n if self.learning_rate in ['constant', 'invscaling', 'adaptive'] and self.learning_rate_init <= 0.0:\n raise ValueError('learning_rate_init must be > 0, got %s.' % self.learning_rate)\n if self.momentum > 1 or self.momentum < 0:\n raise ValueError('momentum must be >= 0 and <= 1, got %s' % self.momentum)\n if not isinstance(self.nesterovs_momentum, bool):\n raise ValueError('nesterovs_momentum must be either True or False, got %s.' % self.nesterovs_momentum)\n if not isinstance(self.early_stopping, bool):\n raise ValueError('early_stopping must be either True or False, got %s.' % self.early_stopping)\n if self.validation_fraction < 0 or self.validation_fraction >= 1:\n raise ValueError('validation_fraction must be >= 0 and < 1, got %s' % self.validation_fraction)\n if self.beta_1 < 0 or self.beta_1 >= 1:\n raise ValueError('beta_1 must be >= 0 and < 1, got %s' % self.beta_1)\n if self.beta_2 < 0 or self.beta_2 >= 1:\n raise ValueError('beta_2 must be >= 0 and < 1, got %s' % self.beta_2)\n if self.epsilon <= 0.0:\n raise ValueError('epsilon must be > 0, got %s.' % self.epsilon)\n if self.n_iter_no_change <= 0:\n raise ValueError('n_iter_no_change must be > 0, got %s.' % self.n_iter_no_change)\n if self.activation not in ACTIVATIONS:\n raise ValueError(\"The activation '%s' is not supported. Supported activations are %s.\" % (self.activation, list(sorted(ACTIVATIONS))))\n if self.learning_rate not in ['constant', 'invscaling', 'adaptive']:\n raise ValueError('learning rate %s is not supported. ' % self.learning_rate)\n supported_solvers = _STOCHASTIC_SOLVERS + ['lbfgs']\n if self.solver not in supported_solvers:\n raise ValueError('The solver %s is not supported. Expected one of: %s' % (self.solver, ', '.join(supported_solvers)))" }, { @@ -146021,7 +157361,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -146031,7 +157372,8 @@ "docstring": { "type": "ndarray or sparse matrix of shape (n_samples, n_features)", "description": "The input data." - } + }, + "refined_type": {} }, { "name": "y", @@ -146041,13 +157383,14 @@ "docstring": { "type": "ndarray of shape (n_samples,) or (n_samples, n_outputs)", "description": "The target values (class labels in classification, real numbers in\nregression)." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Fit the model to data matrix X and target(s) y.", - "docstring": "Fit the model to data matrix X and target(s) y.\n\nParameters\n----------\nX : ndarray or sparse matrix of shape (n_samples, n_features)\n The input data.\n\ny : ndarray of shape (n_samples,) or (n_samples, n_outputs)\n The target values (class labels in classification, real numbers in\n regression).\n\nReturns\n-------\nself : object\n Returns a trained MLP model.", + "docstring": "Fit the model to data matrix X and target(s) y.\n\n Parameters\n ----------\n X : ndarray or sparse matrix of shape (n_samples, n_features)\n The input data.\n\n y : ndarray of shape (n_samples,) or (n_samples, n_outputs)\n The target values (class labels in classification, real numbers in\n regression).\n\n Returns\n -------\n self : object\n Returns a trained MLP model.\n ", "source_code": "\ndef fit(self, X, y):\n \"\"\"Fit the model to data matrix X and target(s) y.\n\n Parameters\n ----------\n X : ndarray or sparse matrix of shape (n_samples, n_features)\n The input data.\n\n y : ndarray of shape (n_samples,) or (n_samples, n_outputs)\n The target values (class labels in classification, real numbers in\n regression).\n\n Returns\n -------\n self : object\n Returns a trained MLP model.\n \"\"\"\n return self._fit(X, y, incremental=False)" }, { @@ -146065,7 +157408,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -146075,6 +157419,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "The input data." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -146085,13 +157433,14 @@ "docstring": { "type": "ndarray of shape (n_samples,)", "description": "The target values." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Update the model with a single iteration over the given data.", - "docstring": "Update the model with a single iteration over the given data.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input data.\n\ny : ndarray of shape (n_samples,)\n The target values.\n\nReturns\n-------\nself : object\n Trained MLP model.", + "docstring": "Update the model with a single iteration over the given data.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input data.\n\n y : ndarray of shape (n_samples,)\n The target values.\n\n Returns\n -------\n self : object\n Trained MLP model.\n ", "source_code": "\n@available_if(_check_solver)\ndef partial_fit(self, X, y):\n \"\"\"Update the model with a single iteration over the given data.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input data.\n\n y : ndarray of shape (n_samples,)\n The target values.\n\n Returns\n -------\n self : object\n Trained MLP model.\n \"\"\"\n return self._fit(X, y, incremental=True)" }, { @@ -146109,7 +157458,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "hidden_layer_sizes", @@ -146119,7 +157469,8 @@ "docstring": { "type": "tuple, length = n_layers - 2, default=(100,)", "description": "The ith element represents the number of neurons in the ith\nhidden layer." 
- } + }, + "refined_type": {} }, { "name": "activation", @@ -146129,6 +157480,10 @@ "docstring": { "type": "{'identity', 'logistic', 'tanh', 'relu'}, default='relu'", "description": "Activation function for the hidden layer.\n\n- 'identity', no-op activation, useful to implement linear bottleneck,\n returns f(x) = x\n\n- 'logistic', the logistic sigmoid function,\n returns f(x) = 1 / (1 + exp(-x)).\n\n- 'tanh', the hyperbolic tan function,\n returns f(x) = tanh(x).\n\n- 'relu', the rectified linear unit function,\n returns f(x) = max(0, x)" + }, + "refined_type": { + "kind": "EnumType", + "values": ["logistic", "identity", "tanh", "relu"] } }, { @@ -146139,6 +157494,10 @@ "docstring": { "type": "{'lbfgs', 'sgd', 'adam'}, default='adam'", "description": "The solver for weight optimization.\n\n- 'lbfgs' is an optimizer in the family of quasi-Newton methods.\n\n- 'sgd' refers to stochastic gradient descent.\n\n- 'adam' refers to a stochastic gradient-based optimizer proposed\n by Kingma, Diederik, and Jimmy Ba\n\nNote: The default solver 'adam' works pretty well on relatively\nlarge datasets (with thousands of training samples or more) in terms of\nboth training time and validation score.\nFor small datasets, however, 'lbfgs' can converge faster and perform\nbetter." + }, + "refined_type": { + "kind": "EnumType", + "values": ["lbfgs", "sgd", "adam"] } }, { @@ -146149,7 +157508,8 @@ "docstring": { "type": "float, default=0.0001", "description": "L2 penalty (regularization term) parameter." - } + }, + "refined_type": {} }, { "name": "batch_size", @@ -146159,7 +157519,8 @@ "docstring": { "type": "int, default='auto'", "description": "Size of minibatches for stochastic optimizers.\nIf the solver is 'lbfgs', the classifier will not use minibatch.\nWhen set to \"auto\", `batch_size=min(200, n_samples)`." - } + }, + "refined_type": {} }, { "name": "learning_rate", @@ -146169,6 +157530,10 @@ "docstring": { "type": "{'constant', 'invscaling', 'adaptive'}, default='constant'", "description": "Learning rate schedule for weight updates.\n\n- 'constant' is a constant learning rate given by\n 'learning_rate_init'.\n\n- 'invscaling' gradually decreases the learning rate at each\n time step 't' using an inverse scaling exponent of 'power_t'.\n effective_learning_rate = learning_rate_init / pow(t, power_t)\n\n- 'adaptive' keeps the learning rate constant to\n 'learning_rate_init' as long as training loss keeps decreasing.\n Each time two consecutive epochs fail to decrease training loss by at\n least tol, or fail to increase validation score by at least tol if\n 'early_stopping' is on, the current learning rate is divided by 5.\n\nOnly used when ``solver='sgd'``." + }, + "refined_type": { + "kind": "EnumType", + "values": ["adaptive", "constant", "invscaling"] } }, { @@ -146177,9 +157542,10 @@ "is_public": true, "assigned_by": "NAME_ONLY", "docstring": { - "type": "double, default=0.001", + "type": "float, default=0.001", "description": "The initial learning rate used. It controls the step-size\nin updating the weights. Only used when solver='sgd' or 'adam'." - } + }, + "refined_type": {} }, { "name": "power_t", @@ -146187,9 +157553,10 @@ "is_public": true, "assigned_by": "NAME_ONLY", "docstring": { - "type": "double, default=0.5", + "type": "float, default=0.5", "description": "The exponent for inverse scaling learning rate.\nIt is used in updating effective learning rate when the learning_rate\nis set to 'invscaling'. Only used when solver='sgd'." 
- } + }, + "refined_type": {} }, { "name": "max_iter", @@ -146199,7 +157566,8 @@ "docstring": { "type": "int, default=200", "description": "Maximum number of iterations. The solver iterates until convergence\n(determined by 'tol') or this number of iterations. For stochastic\nsolvers ('sgd', 'adam'), note that this determines the number of epochs\n(how many times each data point will be used), not the number of\ngradient steps." - } + }, + "refined_type": {} }, { "name": "shuffle", @@ -146209,7 +157577,8 @@ "docstring": { "type": "bool, default=True", "description": "Whether to shuffle samples in each iteration. Only used when\nsolver='sgd' or 'adam'." - } + }, + "refined_type": {} }, { "name": "random_state", @@ -146219,7 +157588,8 @@ "docstring": { "type": "int, RandomState instance, default=None", "description": "Determines random number generation for weights and bias\ninitialization, train-test split if early stopping is used, and batch\nsampling when solver='sgd' or 'adam'.\nPass an int for reproducible results across multiple function calls.\nSee :term:`Glossary `." - } + }, + "refined_type": {} }, { "name": "tol", @@ -146229,7 +157599,8 @@ "docstring": { "type": "float, default=1e-4", "description": "Tolerance for the optimization. When the loss or score is not improving\nby at least ``tol`` for ``n_iter_no_change`` consecutive iterations,\nunless ``learning_rate`` is set to 'adaptive', convergence is\nconsidered to be reached and training stops." - } + }, + "refined_type": {} }, { "name": "verbose", @@ -146239,7 +157610,8 @@ "docstring": { "type": "bool, default=False", "description": "Whether to print progress messages to stdout." - } + }, + "refined_type": {} }, { "name": "warm_start", @@ -146249,7 +157621,8 @@ "docstring": { "type": "bool, default=False", "description": "When set to True, reuse the solution of the previous\ncall to fit as initialization, otherwise, just erase the\nprevious solution. See :term:`the Glossary `." - } + }, + "refined_type": {} }, { "name": "momentum", @@ -146259,7 +157632,8 @@ "docstring": { "type": "float, default=0.9", "description": "Momentum for gradient descent update. Should be between 0 and 1. Only\nused when solver='sgd'." - } + }, + "refined_type": {} }, { "name": "nesterovs_momentum", @@ -146269,7 +157643,8 @@ "docstring": { "type": "bool, default=True", "description": "Whether to use Nesterov's momentum. Only used when solver='sgd' and\nmomentum > 0." - } + }, + "refined_type": {} }, { "name": "early_stopping", @@ -146279,7 +157654,8 @@ "docstring": { "type": "bool, default=False", "description": "Whether to use early stopping to terminate training when validation\nscore is not improving. If set to true, it will automatically set\naside 10% of training data as validation and terminate training when\nvalidation score is not improving by at least tol for\n``n_iter_no_change`` consecutive epochs. The split is stratified,\nexcept in a multilabel setting.\nIf early stopping is False, then the training stops when the training\nloss does not improve by more than tol for n_iter_no_change consecutive\npasses over the training set.\nOnly effective when solver='sgd' or 'adam'." - } + }, + "refined_type": {} }, { "name": "validation_fraction", @@ -146289,7 +157665,8 @@ "docstring": { "type": "float, default=0.1", "description": "The proportion of training data to set aside as validation set for\nearly stopping. Must be between 0 and 1.\nOnly used if early_stopping is True." 
- } + }, + "refined_type": {} }, { "name": "beta_1", @@ -146299,7 +157676,8 @@ "docstring": { "type": "float, default=0.9", "description": "Exponential decay rate for estimates of first moment vector in adam,\nshould be in [0, 1). Only used when solver='adam'." - } + }, + "refined_type": {} }, { "name": "beta_2", @@ -146309,7 +157687,8 @@ "docstring": { "type": "float, default=0.999", "description": "Exponential decay rate for estimates of second moment vector in adam,\nshould be in [0, 1). Only used when solver='adam'." - } + }, + "refined_type": {} }, { "name": "epsilon", @@ -146319,7 +157698,8 @@ "docstring": { "type": "float, default=1e-8", "description": "Value for numerical stability in adam. Only used when solver='adam'." - } + }, + "refined_type": {} }, { "name": "n_iter_no_change", @@ -146329,7 +157709,8 @@ "docstring": { "type": "int, default=10", "description": "Maximum number of epochs to not meet ``tol`` improvement.\nOnly effective when solver='sgd' or 'adam'.\n\n.. versionadded:: 0.20" - } + }, + "refined_type": {} }, { "name": "max_fun", @@ -146339,13 +157720,14 @@ "docstring": { "type": "int, default=15000", "description": "Only used when solver='lbfgs'. Maximum number of loss function calls.\nThe solver iterates until convergence (determined by 'tol'), number\nof iterations reaches max_iter, or this number of loss function calls.\nNote that number of loss function calls will be greater than or equal\nto the number of iterations for the `MLPClassifier`.\n\n.. versionadded:: 0.22" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, hidden_layer_sizes=(100, ), activation='relu', *, solver='adam', alpha=0.0001, batch_size='auto', learning_rate='constant', learning_rate_init=0.001, power_t=0.5, max_iter=200, shuffle=True, random_state=None, tol=0.0001, verbose=False, warm_start=False, momentum=0.9, nesterovs_momentum=True, early_stopping=False, validation_fraction=0.1, beta_1=0.9, beta_2=0.999, epsilon=1e-08, n_iter_no_change=10, max_fun=15000):\n super().__init__(hidden_layer_sizes=hidden_layer_sizes, activation=activation, solver=solver, alpha=alpha, batch_size=batch_size, learning_rate=learning_rate, learning_rate_init=learning_rate_init, power_t=power_t, max_iter=max_iter, loss='log_loss', shuffle=shuffle, random_state=random_state, tol=tol, verbose=verbose, warm_start=warm_start, momentum=momentum, nesterovs_momentum=nesterovs_momentum, early_stopping=early_stopping, validation_fraction=validation_fraction, beta_1=beta_1, beta_2=beta_2, epsilon=epsilon, n_iter_no_change=n_iter_no_change, max_fun=max_fun)" }, { @@ -146363,13 +157745,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _more_tags(self):\n return {'multilabel': True}" }, { @@ -146387,7 +157770,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -146397,7 +157781,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -146407,7 +157792,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "incremental", @@ -146417,7 +157803,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "reset", @@ -146427,13 +157814,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": 
{} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _validate_input(self, X, y, incremental, reset):\n (X, y) = self._validate_data(X, y, accept_sparse=['csr', 'csc'], multi_output=True, dtype=(np.float64, np.float32), reset=reset)\n if y.ndim == 2 and y.shape[1] == 1:\n y = column_or_1d(y, warn=True)\n if not hasattr(self, 'classes_') or not self.warm_start and not incremental:\n self._label_binarizer = LabelBinarizer()\n self._label_binarizer.fit(y)\n self.classes_ = self._label_binarizer.classes_\n else:\n classes = unique_labels(y)\n if self.warm_start:\n if set(classes) != set(self.classes_):\n raise ValueError(f'warm_start can only be used where `y` has the same classes as in the previous call to fit. Previously got {self.classes_}, `y` has {classes}')\n elif len(np.setdiff1d(classes, self.classes_, assume_unique=True)):\n raise ValueError(f\"`y` has classes not in `self.classes_`. `self.classes_` has {self.classes_}. 'y' has {classes}.\")\n y = self._label_binarizer.transform(y).astype(bool)\n return X, y" }, { @@ -146451,7 +157839,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -146461,6 +157850,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "The input data." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -146471,7 +157864,8 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "The target values." - } + }, + "refined_type": {} }, { "name": "classes", @@ -146481,13 +157875,14 @@ "docstring": { "type": "array of shape (n_classes,), default=None", "description": "Classes across all calls to partial_fit.\nCan be obtained via `np.unique(y_all)`, where y_all is the\ntarget vector of the entire dataset.\nThis argument is required for the first call to partial_fit\nand can be omitted in the subsequent calls.\nNote that y doesn't need to contain all labels in `classes`." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Update the model with a single iteration over the given data.", - "docstring": "Update the model with a single iteration over the given data.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input data.\n\ny : array-like of shape (n_samples,)\n The target values.\n\nclasses : array of shape (n_classes,), default=None\n Classes across all calls to partial_fit.\n Can be obtained via `np.unique(y_all)`, where y_all is the\n target vector of the entire dataset.\n This argument is required for the first call to partial_fit\n and can be omitted in the subsequent calls.\n Note that y doesn't need to contain all labels in `classes`.\n\nReturns\n-------\nself : object\n Trained MLP model.", + "docstring": "Update the model with a single iteration over the given data.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input data.\n\n y : array-like of shape (n_samples,)\n The target values.\n\n classes : array of shape (n_classes,), default=None\n Classes across all calls to partial_fit.\n Can be obtained via `np.unique(y_all)`, where y_all is the\n target vector of the entire dataset.\n This argument is required for the first call to partial_fit\n and can be omitted in the subsequent calls.\n Note that y doesn't need to contain all labels in `classes`.\n\n Returns\n -------\n self : object\n Trained MLP model.\n ", "source_code": "\n@available_if(lambda est: est._check_solver())\ndef partial_fit(self, X, y, classes=None):\n \"\"\"Update the model with a single iteration over the given data.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input data.\n\n y : array-like of shape (n_samples,)\n The target values.\n\n classes : array of shape (n_classes,), default=None\n Classes across all calls to partial_fit.\n Can be obtained via `np.unique(y_all)`, where y_all is the\n target vector of the entire dataset.\n This argument is required for the first call to partial_fit\n and can be omitted in the subsequent calls.\n Note that y doesn't need to contain all labels in `classes`.\n\n Returns\n -------\n self : object\n Trained MLP model.\n \"\"\"\n if _check_partial_fit_first_call(self, classes):\n self._label_binarizer = LabelBinarizer()\n if type_of_target(y).startswith('multilabel'):\n self._label_binarizer.fit(y)\n else:\n self._label_binarizer.fit(classes)\n super().partial_fit(X, y)\n return self" }, { @@ -146505,7 +157900,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -146515,13 +157911,17 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "The input data." 
+ }, + "refined_type": { + "kind": "EnumType", + "values": [] } } ], "results": [], "is_public": true, "description": "Predict using the multi-layer perceptron classifier.", - "docstring": "Predict using the multi-layer perceptron classifier.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input data.\n\nReturns\n-------\ny : ndarray, shape (n_samples,) or (n_samples, n_classes)\n The predicted classes.", + "docstring": "Predict using the multi-layer perceptron classifier.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input data.\n\n Returns\n -------\n y : ndarray, shape (n_samples,) or (n_samples, n_classes)\n The predicted classes.\n ", "source_code": "\ndef predict(self, X):\n \"\"\"Predict using the multi-layer perceptron classifier.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input data.\n\n Returns\n -------\n y : ndarray, shape (n_samples,) or (n_samples, n_classes)\n The predicted classes.\n \"\"\"\n check_is_fitted(self)\n y_pred = self._forward_pass_fast(X)\n if self.n_outputs_ == 1:\n y_pred = y_pred.ravel()\n return self._label_binarizer.inverse_transform(y_pred)" }, { @@ -146539,7 +157939,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -146549,13 +157950,14 @@ "docstring": { "type": "ndarray of shape (n_samples, n_features)", "description": "The input data." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Return the log of probability estimates.", - "docstring": "Return the log of probability estimates.\n\nParameters\n----------\nX : ndarray of shape (n_samples, n_features)\n The input data.\n\nReturns\n-------\nlog_y_prob : ndarray of shape (n_samples, n_classes)\n The predicted log-probability of the sample for each class\n in the model, where classes are ordered as they are in\n `self.classes_`. Equivalent to `log(predict_proba(X))`.", + "docstring": "Return the log of probability estimates.\n\n Parameters\n ----------\n X : ndarray of shape (n_samples, n_features)\n The input data.\n\n Returns\n -------\n log_y_prob : ndarray of shape (n_samples, n_classes)\n The predicted log-probability of the sample for each class\n in the model, where classes are ordered as they are in\n `self.classes_`. Equivalent to `log(predict_proba(X))`.\n ", "source_code": "\ndef predict_log_proba(self, X):\n \"\"\"Return the log of probability estimates.\n\n Parameters\n ----------\n X : ndarray of shape (n_samples, n_features)\n The input data.\n\n Returns\n -------\n log_y_prob : ndarray of shape (n_samples, n_classes)\n The predicted log-probability of the sample for each class\n in the model, where classes are ordered as they are in\n `self.classes_`. Equivalent to `log(predict_proba(X))`.\n \"\"\"\n y_prob = self.predict_proba(X)\n return np.log(y_prob, out=y_prob)" }, { @@ -146573,7 +157975,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -146583,13 +157986,17 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "The input data." 
+ }, + "refined_type": { + "kind": "EnumType", + "values": [] } } ], "results": [], "is_public": true, "description": "Probability estimates.", - "docstring": "Probability estimates.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input data.\n\nReturns\n-------\ny_prob : ndarray of shape (n_samples, n_classes)\n The predicted probability of the sample for each class in the\n model, where classes are ordered as they are in `self.classes_`.", + "docstring": "Probability estimates.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input data.\n\n Returns\n -------\n y_prob : ndarray of shape (n_samples, n_classes)\n The predicted probability of the sample for each class in the\n model, where classes are ordered as they are in `self.classes_`.\n ", "source_code": "\ndef predict_proba(self, X):\n \"\"\"Probability estimates.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input data.\n\n Returns\n -------\n y_prob : ndarray of shape (n_samples, n_classes)\n The predicted probability of the sample for each class in the\n model, where classes are ordered as they are in `self.classes_`.\n \"\"\"\n check_is_fitted(self)\n y_pred = self._forward_pass_fast(X)\n if self.n_outputs_ == 1:\n y_pred = y_pred.ravel()\n if y_pred.ndim == 1:\n return np.vstack([1 - y_pred, y_pred]).T\n else:\n return y_pred" }, { @@ -146607,7 +158014,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "hidden_layer_sizes", @@ -146617,7 +158025,8 @@ "docstring": { "type": "tuple, length = n_layers - 2, default=(100,)", "description": "The ith element represents the number of neurons in the ith\nhidden layer." - } + }, + "refined_type": {} }, { "name": "activation", @@ -146627,6 +158036,10 @@ "docstring": { "type": "{'identity', 'logistic', 'tanh', 'relu'}, default='relu'", "description": "Activation function for the hidden layer.\n\n- 'identity', no-op activation, useful to implement linear bottleneck,\n returns f(x) = x\n\n- 'logistic', the logistic sigmoid function,\n returns f(x) = 1 / (1 + exp(-x)).\n\n- 'tanh', the hyperbolic tan function,\n returns f(x) = tanh(x).\n\n- 'relu', the rectified linear unit function,\n returns f(x) = max(0, x)" + }, + "refined_type": { + "kind": "EnumType", + "values": ["logistic", "identity", "tanh", "relu"] } }, { @@ -146637,6 +158050,10 @@ "docstring": { "type": "{'lbfgs', 'sgd', 'adam'}, default='adam'", "description": "The solver for weight optimization.\n\n- 'lbfgs' is an optimizer in the family of quasi-Newton methods.\n\n- 'sgd' refers to stochastic gradient descent.\n\n- 'adam' refers to a stochastic gradient-based optimizer proposed by\n Kingma, Diederik, and Jimmy Ba\n\nNote: The default solver 'adam' works pretty well on relatively\nlarge datasets (with thousands of training samples or more) in terms of\nboth training time and validation score.\nFor small datasets, however, 'lbfgs' can converge faster and perform\nbetter." + }, + "refined_type": { + "kind": "EnumType", + "values": ["lbfgs", "sgd", "adam"] } }, { @@ -146647,7 +158064,8 @@ "docstring": { "type": "float, default=0.0001", "description": "L2 penalty (regularization term) parameter." 
- } + }, + "refined_type": {} }, { "name": "batch_size", @@ -146657,7 +158075,8 @@ "docstring": { "type": "int, default='auto'", "description": "Size of minibatches for stochastic optimizers.\nIf the solver is 'lbfgs', the classifier will not use minibatch.\nWhen set to \"auto\", `batch_size=min(200, n_samples)`." - } + }, + "refined_type": {} }, { "name": "learning_rate", @@ -146667,6 +158086,10 @@ "docstring": { "type": "{'constant', 'invscaling', 'adaptive'}, default='constant'", "description": "Learning rate schedule for weight updates.\n\n- 'constant' is a constant learning rate given by\n 'learning_rate_init'.\n\n- 'invscaling' gradually decreases the learning rate ``learning_rate_``\n at each time step 't' using an inverse scaling exponent of 'power_t'.\n effective_learning_rate = learning_rate_init / pow(t, power_t)\n\n- 'adaptive' keeps the learning rate constant to\n 'learning_rate_init' as long as training loss keeps decreasing.\n Each time two consecutive epochs fail to decrease training loss by at\n least tol, or fail to increase validation score by at least tol if\n 'early_stopping' is on, the current learning rate is divided by 5.\n\nOnly used when solver='sgd'." + }, + "refined_type": { + "kind": "EnumType", + "values": ["adaptive", "constant", "invscaling"] } }, { @@ -146675,9 +158098,10 @@ "is_public": true, "assigned_by": "NAME_ONLY", "docstring": { - "type": "double, default=0.001", + "type": "float, default=0.001", "description": "The initial learning rate used. It controls the step-size\nin updating the weights. Only used when solver='sgd' or 'adam'." - } + }, + "refined_type": {} }, { "name": "power_t", @@ -146685,9 +158109,10 @@ "is_public": true, "assigned_by": "NAME_ONLY", "docstring": { - "type": "double, default=0.5", + "type": "float, default=0.5", "description": "The exponent for inverse scaling learning rate.\nIt is used in updating effective learning rate when the learning_rate\nis set to 'invscaling'. Only used when solver='sgd'." - } + }, + "refined_type": {} }, { "name": "max_iter", @@ -146697,7 +158122,8 @@ "docstring": { "type": "int, default=200", "description": "Maximum number of iterations. The solver iterates until convergence\n(determined by 'tol') or this number of iterations. For stochastic\nsolvers ('sgd', 'adam'), note that this determines the number of epochs\n(how many times each data point will be used), not the number of\ngradient steps." - } + }, + "refined_type": {} }, { "name": "shuffle", @@ -146707,7 +158133,8 @@ "docstring": { "type": "bool, default=True", "description": "Whether to shuffle samples in each iteration. Only used when\nsolver='sgd' or 'adam'." - } + }, + "refined_type": {} }, { "name": "random_state", @@ -146717,7 +158144,8 @@ "docstring": { "type": "int, RandomState instance, default=None", "description": "Determines random number generation for weights and bias\ninitialization, train-test split if early stopping is used, and batch\nsampling when solver='sgd' or 'adam'.\nPass an int for reproducible results across multiple function calls.\nSee :term:`Glossary `." - } + }, + "refined_type": {} }, { "name": "tol", @@ -146727,7 +158155,8 @@ "docstring": { "type": "float, default=1e-4", "description": "Tolerance for the optimization. When the loss or score is not improving\nby at least ``tol`` for ``n_iter_no_change`` consecutive iterations,\nunless ``learning_rate`` is set to 'adaptive', convergence is\nconsidered to be reached and training stops." 
- } + }, + "refined_type": {} }, { "name": "verbose", @@ -146737,7 +158166,8 @@ "docstring": { "type": "bool, default=False", "description": "Whether to print progress messages to stdout." - } + }, + "refined_type": {} }, { "name": "warm_start", @@ -146747,7 +158177,8 @@ "docstring": { "type": "bool, default=False", "description": "When set to True, reuse the solution of the previous\ncall to fit as initialization, otherwise, just erase the\nprevious solution. See :term:`the Glossary `." - } + }, + "refined_type": {} }, { "name": "momentum", @@ -146757,7 +158188,8 @@ "docstring": { "type": "float, default=0.9", "description": "Momentum for gradient descent update. Should be between 0 and 1. Only\nused when solver='sgd'." - } + }, + "refined_type": {} }, { "name": "nesterovs_momentum", @@ -146767,7 +158199,8 @@ "docstring": { "type": "bool, default=True", "description": "Whether to use Nesterov's momentum. Only used when solver='sgd' and\nmomentum > 0." - } + }, + "refined_type": {} }, { "name": "early_stopping", @@ -146777,7 +158210,8 @@ "docstring": { "type": "bool, default=False", "description": "Whether to use early stopping to terminate training when validation\nscore is not improving. If set to true, it will automatically set\naside 10% of training data as validation and terminate training when\nvalidation score is not improving by at least ``tol`` for\n``n_iter_no_change`` consecutive epochs.\nOnly effective when solver='sgd' or 'adam'." - } + }, + "refined_type": {} }, { "name": "validation_fraction", @@ -146787,7 +158221,8 @@ "docstring": { "type": "float, default=0.1", "description": "The proportion of training data to set aside as validation set for\nearly stopping. Must be between 0 and 1.\nOnly used if early_stopping is True." - } + }, + "refined_type": {} }, { "name": "beta_1", @@ -146797,7 +158232,8 @@ "docstring": { "type": "float, default=0.9", "description": "Exponential decay rate for estimates of first moment vector in adam,\nshould be in [0, 1). Only used when solver='adam'." - } + }, + "refined_type": {} }, { "name": "beta_2", @@ -146807,7 +158243,8 @@ "docstring": { "type": "float, default=0.999", "description": "Exponential decay rate for estimates of second moment vector in adam,\nshould be in [0, 1). Only used when solver='adam'." - } + }, + "refined_type": {} }, { "name": "epsilon", @@ -146817,7 +158254,8 @@ "docstring": { "type": "float, default=1e-8", "description": "Value for numerical stability in adam. Only used when solver='adam'." - } + }, + "refined_type": {} }, { "name": "n_iter_no_change", @@ -146827,7 +158265,8 @@ "docstring": { "type": "int, default=10", "description": "Maximum number of epochs to not meet ``tol`` improvement.\nOnly effective when solver='sgd' or 'adam'.\n\n.. versionadded:: 0.20" - } + }, + "refined_type": {} }, { "name": "max_fun", @@ -146837,13 +158276,14 @@ "docstring": { "type": "int, default=15000", "description": "Only used when solver='lbfgs'. Maximum number of function calls.\nThe solver iterates until convergence (determined by 'tol'), number\nof iterations reaches max_iter, or this number of function calls.\nNote that number of function calls will be greater than or equal to\nthe number of iterations for the MLPRegressor.\n\n.. 
versionadded:: 0.22" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, hidden_layer_sizes=(100, ), activation='relu', *, solver='adam', alpha=0.0001, batch_size='auto', learning_rate='constant', learning_rate_init=0.001, power_t=0.5, max_iter=200, shuffle=True, random_state=None, tol=0.0001, verbose=False, warm_start=False, momentum=0.9, nesterovs_momentum=True, early_stopping=False, validation_fraction=0.1, beta_1=0.9, beta_2=0.999, epsilon=1e-08, n_iter_no_change=10, max_fun=15000):\n super().__init__(hidden_layer_sizes=hidden_layer_sizes, activation=activation, solver=solver, alpha=alpha, batch_size=batch_size, learning_rate=learning_rate, learning_rate_init=learning_rate_init, power_t=power_t, max_iter=max_iter, loss='squared_error', shuffle=shuffle, random_state=random_state, tol=tol, verbose=verbose, warm_start=warm_start, momentum=momentum, nesterovs_momentum=nesterovs_momentum, early_stopping=early_stopping, validation_fraction=validation_fraction, beta_1=beta_1, beta_2=beta_2, epsilon=epsilon, n_iter_no_change=n_iter_no_change, max_fun=max_fun)" }, { @@ -146861,7 +158301,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -146871,7 +158312,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -146881,7 +158323,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "incremental", @@ -146891,7 +158334,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "reset", @@ -146901,13 +158345,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _validate_input(self, X, y, incremental, reset):\n (X, y) = self._validate_data(X, y, accept_sparse=['csr', 'csc'], multi_output=True, y_numeric=True, dtype=(np.float64, np.float32), reset=reset)\n if y.ndim == 2 and y.shape[1] == 1:\n y = column_or_1d(y, warn=True)\n return X, y" }, { @@ -146925,7 +158370,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -146935,13 +158381,17 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "The input data." 
+ }, + "refined_type": { + "kind": "EnumType", + "values": [] } } ], "results": [], "is_public": true, "description": "Predict using the multi-layer perceptron model.", - "docstring": "Predict using the multi-layer perceptron model.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input data.\n\nReturns\n-------\ny : ndarray of shape (n_samples, n_outputs)\n The predicted values.", + "docstring": "Predict using the multi-layer perceptron model.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input data.\n\n Returns\n -------\n y : ndarray of shape (n_samples, n_outputs)\n The predicted values.\n ", "source_code": "\ndef predict(self, X):\n \"\"\"Predict using the multi-layer perceptron model.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input data.\n\n Returns\n -------\n y : ndarray of shape (n_samples, n_outputs)\n The predicted values.\n \"\"\"\n check_is_fitted(self)\n y_pred = self._forward_pass_fast(X)\n if y_pred.shape[1] == 1:\n return y_pred.ravel()\n return y_pred" }, { @@ -146959,7 +158409,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "intercepts_", @@ -146969,7 +158420,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -146993,7 +158445,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_components", @@ -147003,7 +158456,8 @@ "docstring": { "type": "int, default=256", "description": "Number of binary hidden units." - } + }, + "refined_type": {} }, { "name": "learning_rate", @@ -147013,7 +158467,8 @@ "docstring": { "type": "float, default=0.1", "description": "The learning rate for weight updates. It is *highly* recommended\nto tune this hyper-parameter. Reasonable values are in the\n10**[0., -3.] range." - } + }, + "refined_type": {} }, { "name": "batch_size", @@ -147023,7 +158478,8 @@ "docstring": { "type": "int, default=10", "description": "Number of examples per minibatch." - } + }, + "refined_type": {} }, { "name": "n_iter", @@ -147033,7 +158489,8 @@ "docstring": { "type": "int, default=10", "description": "Number of iterations/sweeps over the training dataset to perform\nduring training." - } + }, + "refined_type": {} }, { "name": "verbose", @@ -147043,7 +158500,8 @@ "docstring": { "type": "int, default=0", "description": "The verbosity level. The default, zero, means silent mode. Range\nof values is [0, inf]." - } + }, + "refined_type": {} }, { "name": "random_state", @@ -147053,13 +158511,14 @@ "docstring": { "type": "int, RandomState instance or None, default=None", "description": "Determines random number generation for:\n\n- Gibbs sampling from visible and hidden layers.\n\n- Initializing components, sampling from layers during fit.\n\n- Corrupting the data when scoring samples.\n\nPass an int for reproducible results across multiple function calls.\nSee :term:`Glossary `." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, n_components=256, *, learning_rate=0.1, batch_size=10, n_iter=10, verbose=0, random_state=None):\n self.n_components = n_components\n self.learning_rate = learning_rate\n self.batch_size = batch_size\n self.n_iter = n_iter\n self.verbose = verbose\n self.random_state = random_state" }, { @@ -147077,7 +158536,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "v_pos", @@ -147087,7 +158547,8 @@ "docstring": { "type": "ndarray of shape (n_samples, n_features)", "description": "The data to use for training." - } + }, + "refined_type": {} }, { "name": "rng", @@ -147097,13 +158558,14 @@ "docstring": { "type": "RandomState instance", "description": "Random number generator to use for sampling." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Inner fit for one mini-batch.\n\nAdjust the parameters to maximize the likelihood of v using Stochastic Maximum Likelihood (SML).", - "docstring": "Inner fit for one mini-batch.\n\nAdjust the parameters to maximize the likelihood of v using\nStochastic Maximum Likelihood (SML).\n\nParameters\n----------\nv_pos : ndarray of shape (n_samples, n_features)\n The data to use for training.\n\nrng : RandomState instance\n Random number generator to use for sampling.", + "description": "Inner fit for one mini-batch.\n\nAdjust the parameters to maximize the likelihood of v using\nStochastic Maximum Likelihood (SML).", + "docstring": "Inner fit for one mini-batch.\n\n Adjust the parameters to maximize the likelihood of v using\n Stochastic Maximum Likelihood (SML).\n\n Parameters\n ----------\n v_pos : ndarray of shape (n_samples, n_features)\n The data to use for training.\n\n rng : RandomState instance\n Random number generator to use for sampling.\n ", "source_code": "\ndef _fit(self, v_pos, rng):\n \"\"\"Inner fit for one mini-batch.\n\n Adjust the parameters to maximize the likelihood of v using\n Stochastic Maximum Likelihood (SML).\n\n Parameters\n ----------\n v_pos : ndarray of shape (n_samples, n_features)\n The data to use for training.\n\n rng : RandomState instance\n Random number generator to use for sampling.\n \"\"\"\n h_pos = self._mean_hiddens(v_pos)\n v_neg = self._sample_visibles(self.h_samples_, rng)\n h_neg = self._mean_hiddens(v_neg)\n lr = float(self.learning_rate) / v_pos.shape[0]\n update = safe_sparse_dot(v_pos.T, h_pos, dense_output=True).T\n update -= np.dot(h_neg.T, v_neg)\n self.components_ += lr * update\n self.intercept_hidden_ += lr * (h_pos.sum(axis=0) - h_neg.sum(axis=0))\n self.intercept_visible_ += lr * (np.asarray(v_pos.sum(axis=0)).squeeze() - v_neg.sum(axis=0))\n h_neg[rng.uniform(size=h_neg.shape) < h_neg] = 1.0\n self.h_samples_ = np.floor(h_neg, h_neg)" }, { @@ -147121,7 +158583,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "v", @@ -147131,13 +158594,14 @@ "docstring": { "type": "ndarray of shape (n_samples, n_features)", "description": "Values of the visible layer." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Computes the free energy F(v) = - log sum_h exp(-E(v,h)).", - "docstring": "Computes the free energy F(v) = - log sum_h exp(-E(v,h)).\n\nParameters\n----------\nv : ndarray of shape (n_samples, n_features)\n Values of the visible layer.\n\nReturns\n-------\nfree_energy : ndarray of shape (n_samples,)\n The value of the free energy.", + "docstring": "Computes the free energy F(v) = - log sum_h exp(-E(v,h)).\n\n Parameters\n ----------\n v : ndarray of shape (n_samples, n_features)\n Values of the visible layer.\n\n Returns\n -------\n free_energy : ndarray of shape (n_samples,)\n The value of the free energy.\n ", "source_code": "\ndef _free_energy(self, v):\n \"\"\"Computes the free energy F(v) = - log sum_h exp(-E(v,h)).\n\n Parameters\n ----------\n v : ndarray of shape (n_samples, n_features)\n Values of the visible layer.\n\n Returns\n -------\n free_energy : ndarray of shape (n_samples,)\n The value of the free energy.\n \"\"\"\n return -safe_sparse_dot(v, self.intercept_visible_) - np.logaddexp(0, safe_sparse_dot(v, self.components_.T) + self.intercept_hidden_).sum(axis=1)" }, { @@ -147155,7 +158619,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "v", @@ -147165,13 +158630,14 @@ "docstring": { "type": "ndarray of shape (n_samples, n_features)", "description": "Values of the visible layer." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Computes the probabilities P(h=1|v).", - "docstring": "Computes the probabilities P(h=1|v).\n\nParameters\n----------\nv : ndarray of shape (n_samples, n_features)\n Values of the visible layer.\n\nReturns\n-------\nh : ndarray of shape (n_samples, n_components)\n Corresponding mean field values for the hidden layer.", + "docstring": "Computes the probabilities P(h=1|v).\n\n Parameters\n ----------\n v : ndarray of shape (n_samples, n_features)\n Values of the visible layer.\n\n Returns\n -------\n h : ndarray of shape (n_samples, n_components)\n Corresponding mean field values for the hidden layer.\n ", "source_code": "\ndef _mean_hiddens(self, v):\n \"\"\"Computes the probabilities P(h=1|v).\n\n Parameters\n ----------\n v : ndarray of shape (n_samples, n_features)\n Values of the visible layer.\n\n Returns\n -------\n h : ndarray of shape (n_samples, n_components)\n Corresponding mean field values for the hidden layer.\n \"\"\"\n p = safe_sparse_dot(v, self.components_.T)\n p += self.intercept_hidden_\n return expit(p, out=p)" }, { @@ -147189,13 +158655,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _more_tags(self):\n return {'_xfail_checks': {'check_methods_subset_invariance': 'fails for the decision_function method', 'check_methods_sample_order_invariance': 'fails for the score_samples method'}}" }, { @@ -147213,7 +158680,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "v", @@ -147223,7 +158691,8 @@ "docstring": { "type": "ndarray of shape (n_samples, n_features)", "description": "Values of the visible layer to sample from." - } + }, + "refined_type": {} }, { "name": "rng", @@ -147233,13 +158702,14 @@ "docstring": { "type": "RandomState instance", "description": "Random number generator to use." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Sample from the distribution P(h|v).", - "docstring": "Sample from the distribution P(h|v).\n\nParameters\n----------\nv : ndarray of shape (n_samples, n_features)\n Values of the visible layer to sample from.\n\nrng : RandomState instance\n Random number generator to use.\n\nReturns\n-------\nh : ndarray of shape (n_samples, n_components)\n Values of the hidden layer.", + "docstring": "Sample from the distribution P(h|v).\n\n Parameters\n ----------\n v : ndarray of shape (n_samples, n_features)\n Values of the visible layer to sample from.\n\n rng : RandomState instance\n Random number generator to use.\n\n Returns\n -------\n h : ndarray of shape (n_samples, n_components)\n Values of the hidden layer.\n ", "source_code": "\ndef _sample_hiddens(self, v, rng):\n \"\"\"Sample from the distribution P(h|v).\n\n Parameters\n ----------\n v : ndarray of shape (n_samples, n_features)\n Values of the visible layer to sample from.\n\n rng : RandomState instance\n Random number generator to use.\n\n Returns\n -------\n h : ndarray of shape (n_samples, n_components)\n Values of the hidden layer.\n \"\"\"\n p = self._mean_hiddens(v)\n return rng.random_sample(size=p.shape) < p" }, { @@ -147257,7 +158727,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "h", @@ -147267,7 +158738,8 @@ "docstring": { "type": "ndarray of shape (n_samples, n_components)", "description": "Values of the hidden layer to sample from." - } + }, + "refined_type": {} }, { "name": "rng", @@ -147277,13 +158749,14 @@ "docstring": { "type": "RandomState instance", "description": "Random number generator to use." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Sample from the distribution P(v|h).", - "docstring": "Sample from the distribution P(v|h).\n\nParameters\n----------\nh : ndarray of shape (n_samples, n_components)\n Values of the hidden layer to sample from.\n\nrng : RandomState instance\n Random number generator to use.\n\nReturns\n-------\nv : ndarray of shape (n_samples, n_features)\n Values of the visible layer.", + "docstring": "Sample from the distribution P(v|h).\n\n Parameters\n ----------\n h : ndarray of shape (n_samples, n_components)\n Values of the hidden layer to sample from.\n\n rng : RandomState instance\n Random number generator to use.\n\n Returns\n -------\n v : ndarray of shape (n_samples, n_features)\n Values of the visible layer.\n ", "source_code": "\ndef _sample_visibles(self, h, rng):\n \"\"\"Sample from the distribution P(v|h).\n\n Parameters\n ----------\n h : ndarray of shape (n_samples, n_components)\n Values of the hidden layer to sample from.\n\n rng : RandomState instance\n Random number generator to use.\n\n Returns\n -------\n v : ndarray of shape (n_samples, n_features)\n Values of the visible layer.\n \"\"\"\n p = np.dot(h, self.components_)\n p += self.intercept_visible_\n expit(p, out=p)\n return rng.random_sample(size=p.shape) < p" }, { @@ -147301,7 +158774,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -147311,6 +158785,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "Training data." 
+ }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -147321,13 +158799,14 @@ "docstring": { "type": "array-like of shape (n_samples,) or (n_samples, n_outputs), default=None", "description": "Target values (None for unsupervised transformations)." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Fit the model to the data X.", - "docstring": "Fit the model to the data X.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training data.\n\ny : array-like of shape (n_samples,) or (n_samples, n_outputs), default=None\n Target values (None for unsupervised transformations).\n\nReturns\n-------\nself : BernoulliRBM\n The fitted model.", + "docstring": "Fit the model to the data X.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training data.\n\n y : array-like of shape (n_samples,) or (n_samples, n_outputs), default=None\n Target values (None for unsupervised transformations).\n\n Returns\n -------\n self : BernoulliRBM\n The fitted model.\n ", "source_code": "\ndef fit(self, X, y=None):\n \"\"\"Fit the model to the data X.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training data.\n\n y : array-like of shape (n_samples,) or (n_samples, n_outputs), default=None\n Target values (None for unsupervised transformations).\n\n Returns\n -------\n self : BernoulliRBM\n The fitted model.\n \"\"\"\n X = self._validate_data(X, accept_sparse='csr', dtype=(np.float64, np.float32))\n n_samples = X.shape[0]\n rng = check_random_state(self.random_state)\n self.components_ = np.asarray(rng.normal(0, 0.01, (self.n_components, X.shape[1])), order='F', dtype=X.dtype)\n self.intercept_hidden_ = np.zeros(self.n_components, dtype=X.dtype)\n self.intercept_visible_ = np.zeros(X.shape[1], dtype=X.dtype)\n self.h_samples_ = np.zeros((self.batch_size, self.n_components), dtype=X.dtype)\n n_batches = int(np.ceil(float(n_samples) / self.batch_size))\n batch_slices = list(gen_even_slices(n_batches * self.batch_size, n_batches, n_samples=n_samples))\n verbose = self.verbose\n begin = time.time()\n for iteration in range(1, self.n_iter + 1):\n for batch_slice in batch_slices:\n self._fit(X[batch_slice], rng)\n if verbose:\n end = time.time()\n print('[%s] Iteration %d, pseudo-likelihood = %.2f, time = %.2fs' % (type(self).__name__, iteration, self.score_samples(X).mean(), end - begin))\n begin = end\n return self" }, { @@ -147345,7 +158824,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "v", @@ -147355,13 +158835,14 @@ "docstring": { "type": "ndarray of shape (n_samples, n_features)", "description": "Values of the visible layer to start from." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Perform one Gibbs sampling step.", - "docstring": "Perform one Gibbs sampling step.\n\nParameters\n----------\nv : ndarray of shape (n_samples, n_features)\n Values of the visible layer to start from.\n\nReturns\n-------\nv_new : ndarray of shape (n_samples, n_features)\n Values of the visible layer after one Gibbs step.", + "docstring": "Perform one Gibbs sampling step.\n\n Parameters\n ----------\n v : ndarray of shape (n_samples, n_features)\n Values of the visible layer to start from.\n\n Returns\n -------\n v_new : ndarray of shape (n_samples, n_features)\n Values of the visible layer after one Gibbs step.\n ", "source_code": "\ndef gibbs(self, v):\n \"\"\"Perform one Gibbs sampling step.\n\n Parameters\n ----------\n v : ndarray of shape (n_samples, n_features)\n Values of the visible layer to start from.\n\n Returns\n -------\n v_new : ndarray of shape (n_samples, n_features)\n Values of the visible layer after one Gibbs step.\n \"\"\"\n check_is_fitted(self)\n if not hasattr(self, 'random_state_'):\n self.random_state_ = check_random_state(self.random_state)\n h_ = self._sample_hiddens(v, self.random_state_)\n v_ = self._sample_visibles(h_, self.random_state_)\n return v_" }, { @@ -147379,7 +158860,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -147389,7 +158871,8 @@ "docstring": { "type": "ndarray of shape (n_samples, n_features)", "description": "Training data." - } + }, + "refined_type": {} }, { "name": "y", @@ -147399,13 +158882,14 @@ "docstring": { "type": "array-like of shape (n_samples,) or (n_samples, n_outputs), default=None", "description": "Target values (None for unsupervised transformations)." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Fit the model to the partial segment of the data X.", - "docstring": "Fit the model to the partial segment of the data X.\n\nParameters\n----------\nX : ndarray of shape (n_samples, n_features)\n Training data.\n\ny : array-like of shape (n_samples,) or (n_samples, n_outputs), default=None\n Target values (None for unsupervised transformations).\n\nReturns\n-------\nself : BernoulliRBM\n The fitted model.", + "docstring": "Fit the model to the partial segment of the data X.\n\n Parameters\n ----------\n X : ndarray of shape (n_samples, n_features)\n Training data.\n\n y : array-like of shape (n_samples,) or (n_samples, n_outputs), default=None\n Target values (None for unsupervised transformations).\n\n Returns\n -------\n self : BernoulliRBM\n The fitted model.\n ", "source_code": "\ndef partial_fit(self, X, y=None):\n \"\"\"Fit the model to the partial segment of the data X.\n\n Parameters\n ----------\n X : ndarray of shape (n_samples, n_features)\n Training data.\n\n y : array-like of shape (n_samples,) or (n_samples, n_outputs), default=None\n Target values (None for unsupervised transformations).\n\n Returns\n -------\n self : BernoulliRBM\n The fitted model.\n \"\"\"\n first_pass = not hasattr(self, 'components_')\n X = self._validate_data(X, accept_sparse='csr', dtype=np.float64, reset=first_pass)\n if not hasattr(self, 'random_state_'):\n self.random_state_ = check_random_state(self.random_state)\n if not hasattr(self, 'components_'):\n self.components_ = np.asarray(self.random_state_.normal(0, 0.01, (self.n_components, X.shape[1])), order='F')\n if not hasattr(self, 'intercept_hidden_'):\n self.intercept_hidden_ = np.zeros(self.n_components)\n if not hasattr(self, 'intercept_visible_'):\n self.intercept_visible_ = np.zeros(X.shape[1])\n if not hasattr(self, 'h_samples_'):\n self.h_samples_ = np.zeros((self.batch_size, self.n_components))\n self._fit(X, self.random_state_)" }, { @@ -147423,7 +158907,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -147433,13 +158918,17 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "Values of the visible layer. Must be all-boolean (not checked)." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } } ], "results": [], "is_public": true, "description": "Compute the pseudo-likelihood of X.", - "docstring": "Compute the pseudo-likelihood of X.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n Values of the visible layer. Must be all-boolean (not checked).\n\nReturns\n-------\npseudo_likelihood : ndarray of shape (n_samples,)\n Value of the pseudo-likelihood (proxy for likelihood).\n\nNotes\n-----\nThis method is not deterministic: it computes a quantity called the\nfree energy on X, then on a randomly corrupted version of X, and\nreturns the log of the logistic function of the difference.", + "docstring": "Compute the pseudo-likelihood of X.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Values of the visible layer. 
Must be all-boolean (not checked).\n\n Returns\n -------\n pseudo_likelihood : ndarray of shape (n_samples,)\n Value of the pseudo-likelihood (proxy for likelihood).\n\n Notes\n -----\n This method is not deterministic: it computes a quantity called the\n free energy on X, then on a randomly corrupted version of X, and\n returns the log of the logistic function of the difference.\n ", "source_code": "\ndef score_samples(self, X):\n \"\"\"Compute the pseudo-likelihood of X.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Values of the visible layer. Must be all-boolean (not checked).\n\n Returns\n -------\n pseudo_likelihood : ndarray of shape (n_samples,)\n Value of the pseudo-likelihood (proxy for likelihood).\n\n Notes\n -----\n This method is not deterministic: it computes a quantity called the\n free energy on X, then on a randomly corrupted version of X, and\n returns the log of the logistic function of the difference.\n \"\"\"\n check_is_fitted(self)\n v = self._validate_data(X, accept_sparse='csr', reset=False)\n rng = check_random_state(self.random_state)\n ind = (np.arange(v.shape[0]), rng.randint(0, v.shape[1], v.shape[0]))\n if sp.issparse(v):\n data = -2 * v[ind] + 1\n v_ = v + sp.csr_matrix((data.A.ravel(), ind), shape=v.shape)\n else:\n v_ = v.copy()\n v_[ind] = 1 - v_[ind]\n fe = self._free_energy(v)\n fe_ = self._free_energy(v_)\n return v.shape[1] * log_logistic(fe_ - fe)" }, { @@ -147457,7 +158946,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -147467,13 +158957,17 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "The data to be transformed." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } } ], "results": [], "is_public": true, "description": "Compute the hidden layer activation probabilities, P(h=1|v=X).", - "docstring": "Compute the hidden layer activation probabilities, P(h=1|v=X).\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n The data to be transformed.\n\nReturns\n-------\nh : ndarray of shape (n_samples, n_components)\n Latent representations of the data.", + "docstring": "Compute the hidden layer activation probabilities, P(h=1|v=X).\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The data to be transformed.\n\n Returns\n -------\n h : ndarray of shape (n_samples, n_components)\n Latent representations of the data.\n ", "source_code": "\ndef transform(self, X):\n \"\"\"Compute the hidden layer activation probabilities, P(h=1|v=X).\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The data to be transformed.\n\n Returns\n -------\n h : ndarray of shape (n_samples, n_components)\n Latent representations of the data.\n \"\"\"\n check_is_fitted(self)\n X = self._validate_data(X, accept_sparse='csr', reset=False, dtype=(np.float64, np.float32))\n return self._mean_hiddens(X)" }, { @@ -147491,7 +158985,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "params", @@ -147501,7 +158996,8 @@ "docstring": { "type": "list, length = len(coefs_) + len(intercepts_)", "description": "The concatenated list containing coefs_ and intercepts_ in MLP model.\nUsed for initializing velocities and updating params" - } + }, + "refined_type": {} }, { "name": "learning_rate_init", @@ -147511,7 +159007,8 @@ "docstring": { "type": "float, 
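A minimal usage sketch of the BernoulliRBM methods catalogued above (partial_fit, gibbs, score_samples, transform); the toy data and hyperparameters are illustrative assumptions, not taken from this diff.

    import numpy as np
    from sklearn.neural_network import BernoulliRBM

    rng = np.random.RandomState(0)
    X = (rng.rand(60, 16) > 0.5).astype(np.float64)   # binary visible units

    rbm = BernoulliRBM(n_components=8, learning_rate=0.05, batch_size=10, random_state=0)
    rbm.partial_fit(X[:30])      # fit on a first segment of the data
    rbm.partial_fit(X[30:])      # continue on a second segment

    h = rbm.transform(X)         # P(h=1|v=X), shape (60, 8)
    v_new = rbm.gibbs(X[:5])     # one Gibbs step from five starting samples
    pl = rbm.score_samples(X)    # stochastic pseudo-likelihood proxy, shape (60,)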
default=0.001", "description": "The initial learning rate used. It controls the step-size in updating\nthe weights" - } + }, + "refined_type": {} }, { "name": "beta_1", @@ -147521,7 +159018,8 @@ "docstring": { "type": "float, default=0.9", "description": "Exponential decay rate for estimates of first moment vector, should be\nin [0, 1)" - } + }, + "refined_type": {} }, { "name": "beta_2", @@ -147531,7 +159029,8 @@ "docstring": { "type": "float, default=0.999", "description": "Exponential decay rate for estimates of second moment vector, should be\nin [0, 1)" - } + }, + "refined_type": {} }, { "name": "epsilon", @@ -147541,13 +159040,14 @@ "docstring": { "type": "float, default=1e-8", "description": "Value for numerical stability" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, params, learning_rate_init=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08):\n super().__init__(learning_rate_init)\n self.beta_1 = beta_1\n self.beta_2 = beta_2\n self.epsilon = epsilon\n self.t = 0\n self.ms = [np.zeros_like(param) for param in params]\n self.vs = [np.zeros_like(param) for param in params]" }, { @@ -147565,7 +159065,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "grads", @@ -147575,13 +159076,14 @@ "docstring": { "type": "list, length = len(coefs_) + len(intercepts_)", "description": "Containing gradients with respect to coefs_ and intercepts_ in MLP\nmodel. So length should be aligned with params" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Get the values used to update params with given gradients", - "docstring": "Get the values used to update params with given gradients\n\nParameters\n----------\ngrads : list, length = len(coefs_) + len(intercepts_)\n Containing gradients with respect to coefs_ and intercepts_ in MLP\n model. So length should be aligned with params\n\nReturns\n-------\nupdates : list, length = len(grads)\n The values to add to params", + "docstring": "Get the values used to update params with given gradients\n\n Parameters\n ----------\n grads : list, length = len(coefs_) + len(intercepts_)\n Containing gradients with respect to coefs_ and intercepts_ in MLP\n model. So length should be aligned with params\n\n Returns\n -------\n updates : list, length = len(grads)\n The values to add to params\n ", "source_code": "\ndef _get_updates(self, grads):\n \"\"\"Get the values used to update params with given gradients\n\n Parameters\n ----------\n grads : list, length = len(coefs_) + len(intercepts_)\n Containing gradients with respect to coefs_ and intercepts_ in MLP\n model. 
So length should be aligned with params\n\n Returns\n -------\n updates : list, length = len(grads)\n The values to add to params\n \"\"\"\n self.t += 1\n self.ms = [self.beta_1 * m + (1 - self.beta_1) * grad for (m, grad) in zip(self.ms, grads)]\n self.vs = [self.beta_2 * v + (1 - self.beta_2) * grad**2 for (v, grad) in zip(self.vs, grads)]\n self.learning_rate = self.learning_rate_init * np.sqrt(1 - self.beta_2**self.t) / (1 - self.beta_1**self.t)\n updates = [-self.learning_rate * m / (np.sqrt(v) + self.epsilon) for (m, v) in zip(self.ms, self.vs)]\n return updates" }, { @@ -147599,7 +159101,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "learning_rate_init", @@ -147609,13 +159112,14 @@ "docstring": { "type": "float, default=0.1", "description": "The initial learning rate used. It controls the step-size in updating\nthe weights" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, learning_rate_init=0.1):\n self.learning_rate_init = learning_rate_init\n self.learning_rate = float(learning_rate_init)" }, { @@ -147633,7 +159137,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "time_step", @@ -147643,13 +159148,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Perform update to learning rate and potentially other states at the end of an iteration", - "docstring": "Perform update to learning rate and potentially other states at the\nend of an iteration", + "description": "Perform update to learning rate and potentially other states at the\nend of an iteration", + "docstring": "Perform update to learning rate and potentially other states at the\n end of an iteration\n ", "source_code": "\ndef iteration_ends(self, time_step):\n \"\"\"Perform update to learning rate and potentially other states at the\n end of an iteration\n \"\"\"\n pass" }, { @@ -147667,7 +159173,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "msg", @@ -147677,7 +159184,8 @@ "docstring": { "type": "str", "description": "Message passed in for verbose output" - } + }, + "refined_type": {} }, { "name": "verbose", @@ -147687,13 +159195,14 @@ "docstring": { "type": "bool", "description": "Print message to stdin if True" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Decides whether it is time to stop training", - "docstring": "Decides whether it is time to stop training\n\nParameters\n----------\nmsg : str\n Message passed in for verbose output\n\nverbose : bool\n Print message to stdin if True\n\nReturns\n-------\nis_stopping : bool\n True if training needs to stop", + "docstring": "Decides whether it is time to stop training\n\n Parameters\n ----------\n msg : str\n Message passed in for verbose output\n\n verbose : bool\n Print message to stdin if True\n\n Returns\n -------\n is_stopping : bool\n True if training needs to stop\n ", "source_code": "\ndef trigger_stopping(self, msg, verbose):\n \"\"\"Decides whether it is time to stop training\n\n Parameters\n ----------\n msg : str\n Message passed in for verbose output\n\n verbose : bool\n Print message to stdin if True\n\n Returns\n -------\n is_stopping : bool\n True if training needs to stop\n \"\"\"\n if verbose:\n print(msg + ' Stopping.')\n return True" }, { @@ -147711,7 +159220,8 @@ "docstring": { 
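The Adam update quoted in _get_updates above can be reproduced in a few lines of NumPy; this standalone sketch mirrors the formulas from the quoted source (the gradient values are made up for illustration).

    import numpy as np

    def adam_step(grads, ms, vs, t, lr_init=0.001, beta_1=0.9, beta_2=0.999, eps=1e-8):
        # Update biased first/second moment estimates, then apply the
        # bias-corrected step size, exactly as in the quoted _get_updates.
        t += 1
        ms = [beta_1 * m + (1 - beta_1) * g for m, g in zip(ms, grads)]
        vs = [beta_2 * v + (1 - beta_2) * g**2 for v, g in zip(vs, grads)]
        lr = lr_init * np.sqrt(1 - beta_2**t) / (1 - beta_1**t)
        updates = [-lr * m / (np.sqrt(v) + eps) for m, v in zip(ms, vs)]
        return updates, ms, vs, t

    grads = [np.array([0.1, -0.2]), np.array([[0.05]])]
    ms = [np.zeros_like(g) for g in grads]
    vs = [np.zeros_like(g) for g in grads]
    updates, ms, vs, t = adam_step(grads, ms, vs, t=0)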
"type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "params", @@ -147721,7 +159231,8 @@ "docstring": { "type": "list of length = len(coefs_) + len(intercepts_)", "description": "The concatenated list containing coefs_ and intercepts_ in MLP\nmodel. Used for initializing velocities and updating params" - } + }, + "refined_type": {} }, { "name": "grads", @@ -147731,13 +159242,14 @@ "docstring": { "type": "list of length = len(params)", "description": "Containing gradients with respect to coefs_ and intercepts_ in MLP\nmodel. So length should be aligned with params" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Update parameters with given gradients", - "docstring": "Update parameters with given gradients\n\nParameters\n----------\nparams : list of length = len(coefs_) + len(intercepts_)\n The concatenated list containing coefs_ and intercepts_ in MLP\n model. Used for initializing velocities and updating params\n\ngrads : list of length = len(params)\n Containing gradients with respect to coefs_ and intercepts_ in MLP\n model. So length should be aligned with params", + "docstring": "Update parameters with given gradients\n\n Parameters\n ----------\n params : list of length = len(coefs_) + len(intercepts_)\n The concatenated list containing coefs_ and intercepts_ in MLP\n model. Used for initializing velocities and updating params\n\n grads : list of length = len(params)\n Containing gradients with respect to coefs_ and intercepts_ in MLP\n model. So length should be aligned with params\n ", "source_code": "\ndef update_params(self, params, grads):\n \"\"\"Update parameters with given gradients\n\n Parameters\n ----------\n params : list of length = len(coefs_) + len(intercepts_)\n The concatenated list containing coefs_ and intercepts_ in MLP\n model. Used for initializing velocities and updating params\n\n grads : list of length = len(params)\n Containing gradients with respect to coefs_ and intercepts_ in MLP\n model. So length should be aligned with params\n \"\"\"\n updates = self._get_updates(grads)\n for (param, update) in zip((p for p in params), updates):\n param += update" }, { @@ -147755,7 +159267,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "params", @@ -147765,7 +159278,8 @@ "docstring": { "type": "list, length = len(coefs_) + len(intercepts_)", "description": "The concatenated list containing coefs_ and intercepts_ in MLP model.\nUsed for initializing velocities and updating params" - } + }, + "refined_type": {} }, { "name": "learning_rate_init", @@ -147775,7 +159289,8 @@ "docstring": { "type": "float, default=0.1", "description": "The initial learning rate used. 
It controls the step-size in updating\nthe weights" - } + }, + "refined_type": {} }, { "name": "lr_schedule", @@ -147785,6 +159300,10 @@ "docstring": { "type": "{'constant', 'adaptive', 'invscaling'}, default='constant'", "description": "Learning rate schedule for weight updates.\n\n-'constant', is a constant learning rate given by\n 'learning_rate_init'.\n\n-'invscaling' gradually decreases the learning rate 'learning_rate_' at\n each time step 't' using an inverse scaling exponent of 'power_t'.\n learning_rate_ = learning_rate_init / pow(t, power_t)\n\n-'adaptive', keeps the learning rate constant to\n 'learning_rate_init' as long as the training keeps decreasing.\n Each time 2 consecutive epochs fail to decrease the training loss by\n tol, or fail to increase validation score by tol if 'early_stopping'\n is on, the current learning rate is divided by 5." + }, + "refined_type": { + "kind": "EnumType", + "values": ["adaptive", "constant", "invscaling"] } }, { @@ -147795,7 +159314,8 @@ "docstring": { "type": "float, default=0.9", "description": "Value of momentum used, must be larger than or equal to 0" - } + }, + "refined_type": {} }, { "name": "nesterov", @@ -147805,7 +159325,8 @@ "docstring": { "type": "bool, default=True", "description": "Whether to use nesterov's momentum or not. Use nesterov's if True" - } + }, + "refined_type": {} }, { "name": "power_t", @@ -147815,13 +159336,14 @@ "docstring": { "type": "float, default=0.5", "description": "Power of time step 't' in inverse scaling. See `lr_schedule` for\nmore details." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, params, learning_rate_init=0.1, lr_schedule='constant', momentum=0.9, nesterov=True, power_t=0.5):\n super().__init__(learning_rate_init)\n self.lr_schedule = lr_schedule\n self.momentum = momentum\n self.nesterov = nesterov\n self.power_t = power_t\n self.velocities = [np.zeros_like(param) for param in params]" }, { @@ -147839,7 +159361,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "grads", @@ -147849,13 +159372,14 @@ "docstring": { "type": "list, length = len(coefs_) + len(intercepts_)", "description": "Containing gradients with respect to coefs_ and intercepts_ in MLP\nmodel. So length should be aligned with params" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Get the values used to update params with given gradients", - "docstring": "Get the values used to update params with given gradients\n\nParameters\n----------\ngrads : list, length = len(coefs_) + len(intercepts_)\n Containing gradients with respect to coefs_ and intercepts_ in MLP\n model. So length should be aligned with params\n\nReturns\n-------\nupdates : list, length = len(grads)\n The values to add to params", + "docstring": "Get the values used to update params with given gradients\n\n Parameters\n ----------\n grads : list, length = len(coefs_) + len(intercepts_)\n Containing gradients with respect to coefs_ and intercepts_ in MLP\n model. So length should be aligned with params\n\n Returns\n -------\n updates : list, length = len(grads)\n The values to add to params\n ", "source_code": "\ndef _get_updates(self, grads):\n \"\"\"Get the values used to update params with given gradients\n\n Parameters\n ----------\n grads : list, length = len(coefs_) + len(intercepts_)\n Containing gradients with respect to coefs_ and intercepts_ in MLP\n model. 
So length should be aligned with params\n\n Returns\n -------\n updates : list, length = len(grads)\n The values to add to params\n \"\"\"\n updates = [self.momentum * velocity - self.learning_rate * grad for (velocity, grad) in zip(self.velocities, grads)]\n self.velocities = updates\n if self.nesterov:\n updates = [self.momentum * velocity - self.learning_rate * grad for (velocity, grad) in zip(self.velocities, grads)]\n return updates" }, { @@ -147873,7 +159397,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "time_step", @@ -147883,13 +159408,14 @@ "docstring": { "type": "int", "description": "number of training samples trained on so far, used to update\nlearning rate for 'invscaling'" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Perform updates to learning rate and potential other states at the end of an iteration", - "docstring": "Perform updates to learning rate and potential other states at the\nend of an iteration\n\nParameters\n----------\ntime_step : int\n number of training samples trained on so far, used to update\n learning rate for 'invscaling'", + "description": "Perform updates to learning rate and potential other states at the\nend of an iteration", + "docstring": "Perform updates to learning rate and potential other states at the\n end of an iteration\n\n Parameters\n ----------\n time_step : int\n number of training samples trained on so far, used to update\n learning rate for 'invscaling'\n ", "source_code": "\ndef iteration_ends(self, time_step):\n \"\"\"Perform updates to learning rate and potential other states at the\n end of an iteration\n\n Parameters\n ----------\n time_step : int\n number of training samples trained on so far, used to update\n learning rate for 'invscaling'\n \"\"\"\n if self.lr_schedule == 'invscaling':\n self.learning_rate = float(self.learning_rate_init) / (time_step + 1)**self.power_t" }, { @@ -147907,7 +159433,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "msg", @@ -147917,7 +159444,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "verbose", @@ -147927,13 +159455,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef trigger_stopping(self, msg, verbose):\n if self.lr_schedule != 'adaptive':\n if verbose:\n print(msg + ' Stopping.')\n return True\n if self.learning_rate <= 1e-06:\n if verbose:\n print(msg + ' Learning rate too small. Stopping.')\n return True\n self.learning_rate /= 5.0\n if verbose:\n print(msg + ' Setting learning rate to %f' % self.learning_rate)\n return False" }, { @@ -147951,7 +159480,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "transformer_list", @@ -147961,7 +159491,8 @@ "docstring": { "type": "list of tuple", "description": "List of tuple containing `(str, transformer)`. The first element\nof the tuple is name affected to the transformer while the\nsecond element is a scikit-learn transformer instance.\nThe transformer instance can also be `\"drop\"` for it to be\nignored.\n\n.. versionchanged:: 0.22\n Deprecated `None` as a transformer in favor of 'drop'." 
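A small NumPy sketch of the momentum/Nesterov update and the 'invscaling' schedule described for SGDOptimizer above; variable names follow the quoted source, the sample values are illustrative assumptions.

    import numpy as np

    def sgd_updates(grads, velocities, lr, momentum=0.9, nesterov=True):
        # velocity = momentum * velocity - lr * grad; with Nesterov the step is
        # recomputed against the updated velocities, as in the quoted _get_updates.
        velocities = [momentum * v - lr * g for v, g in zip(velocities, grads)]
        updates = velocities
        if nesterov:
            updates = [momentum * v - lr * g for v, g in zip(velocities, grads)]
        return updates, velocities

    def invscaling_lr(lr_init, time_step, power_t=0.5):
        # learning_rate = learning_rate_init / (time_step + 1) ** power_t
        return float(lr_init) / (time_step + 1) ** power_t

    grads = [np.array([0.3, -0.1])]
    updates, velocities = sgd_updates(grads, [np.zeros(2)], lr=invscaling_lr(0.1, time_step=4))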
- } + }, + "refined_type": {} }, { "name": "n_jobs", @@ -147971,7 +159502,8 @@ "docstring": { "type": "int, default=None", "description": "Number of jobs to run in parallel.\n``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n``-1`` means using all processors. See :term:`Glossary `\nfor more details.\n\n.. versionchanged:: v0.20\n `n_jobs` default changed from 1 to None" - } + }, + "refined_type": {} }, { "name": "transformer_weights", @@ -147981,7 +159513,8 @@ "docstring": { "type": "dict, default=None", "description": "Multiplicative weights for features per transformer.\nKeys are transformer names, values the weights.\nRaises ValueError if key not present in ``transformer_list``." - } + }, + "refined_type": {} }, { "name": "verbose", @@ -147991,13 +159524,14 @@ "docstring": { "type": "bool, default=False", "description": "If True, the time elapsed while fitting each transformer will be\nprinted as it is completed." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, transformer_list, *, n_jobs=None, transformer_weights=None, verbose=False):\n self.transformer_list = transformer_list\n self.n_jobs = n_jobs\n self.transformer_weights = transformer_weights\n self.verbose = verbose\n self._validate_transformers()" }, { @@ -148015,7 +159549,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "Xs", @@ -148025,13 +159560,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _hstack(self, Xs):\n if any((sparse.issparse(f) for f in Xs)):\n Xs = sparse.hstack(Xs).tocsr()\n else:\n Xs = np.hstack(Xs)\n return Xs" }, { @@ -148049,13 +159585,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Generate (name, trans, weight) tuples excluding None and 'drop' transformers.", - "docstring": "Generate (name, trans, weight) tuples excluding None and\n'drop' transformers.", + "description": "Generate (name, trans, weight) tuples excluding None and\n'drop' transformers.", + "docstring": "\n Generate (name, trans, weight) tuples excluding None and\n 'drop' transformers.\n ", "source_code": "\ndef _iter(self):\n \"\"\"\n Generate (name, trans, weight) tuples excluding None and\n 'drop' transformers.\n \"\"\"\n get_weight = (self.transformer_weights or {}).get\n return ((name, trans, get_weight(name)) for (name, trans) in self.transformer_list if trans != 'drop')" }, { @@ -148073,7 +159610,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "name", @@ -148083,7 +159621,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "idx", @@ -148093,7 +159632,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "total", @@ -148103,13 +159643,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _log_message(self, name, idx, total):\n if not self.verbose:\n return None\n return '(step %d of %d) Processing %s' % (idx, total, name)" }, { @@ -148127,7 +159668,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ 
-148137,7 +159679,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -148147,7 +159690,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "fit_params", @@ -148157,7 +159701,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "func", @@ -148167,7 +159712,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -148191,13 +159737,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _sk_visual_block_(self):\n (names, transformers) = zip(*self.transformer_list)\n return _VisualBlock('parallel', transformers, names=names)" }, { @@ -148215,7 +159762,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "transformers", @@ -148225,13 +159773,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _update_transformer_list(self, transformers):\n transformers = iter(transformers)\n self.transformer_list[:] = [(name, old if old == 'drop' else next(transformers)) for (name, old) in self.transformer_list]" }, { @@ -148249,13 +159798,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _validate_transformer_weights(self):\n if not self.transformer_weights:\n return\n transformer_names = set((name for (name, _) in self.transformer_list))\n for name in self.transformer_weights:\n if name not in transformer_names:\n raise ValueError(f'Attempting to weight transformer \"{name}\", but it is not present in transformer_list.')" }, { @@ -148273,13 +159823,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _validate_transformers(self):\n (names, transformers) = zip(*self.transformer_list)\n self._validate_names(names)\n for t in transformers:\n if t == 'drop':\n continue\n if not (hasattr(t, 'fit') or hasattr(t, 'fit_transform')) or not hasattr(t, 'transform'):\n raise TypeError(\"All estimators should implement fit and transform. '%s' (type %s) doesn't\" % (t, type(t)))" }, { @@ -148297,7 +159848,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -148307,7 +159859,8 @@ "docstring": { "type": "iterable or array-like, depending on transformers", "description": "Input data, used to fit transformers." - } + }, + "refined_type": {} }, { "name": "y", @@ -148317,13 +159870,14 @@ "docstring": { "type": "array-like of shape (n_samples, n_outputs), default=None", "description": "Targets for supervised learning." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Fit all transformers using X.", - "docstring": "Fit all transformers using X.\n\nParameters\n----------\nX : iterable or array-like, depending on transformers\n Input data, used to fit transformers.\n\ny : array-like of shape (n_samples, n_outputs), default=None\n Targets for supervised learning.\n\n**fit_params : dict, default=None\n Parameters to pass to the fit method of the estimator.\n\nReturns\n-------\nself : object\n FeatureUnion class instance.", + "docstring": "Fit all transformers using X.\n\n Parameters\n ----------\n X : iterable or array-like, depending on transformers\n Input data, used to fit transformers.\n\n y : array-like of shape (n_samples, n_outputs), default=None\n Targets for supervised learning.\n\n **fit_params : dict, default=None\n Parameters to pass to the fit method of the estimator.\n\n Returns\n -------\n self : object\n FeatureUnion class instance.\n ", "source_code": "\ndef fit(self, X, y=None, **fit_params):\n \"\"\"Fit all transformers using X.\n\n Parameters\n ----------\n X : iterable or array-like, depending on transformers\n Input data, used to fit transformers.\n\n y : array-like of shape (n_samples, n_outputs), default=None\n Targets for supervised learning.\n\n **fit_params : dict, default=None\n Parameters to pass to the fit method of the estimator.\n\n Returns\n -------\n self : object\n FeatureUnion class instance.\n \"\"\"\n transformers = self._parallel_func(X, y, fit_params, _fit_one)\n if not transformers:\n return self\n self._update_transformer_list(transformers)\n return self" }, { @@ -148341,7 +159895,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -148351,7 +159906,8 @@ "docstring": { "type": "iterable or array-like, depending on transformers", "description": "Input data to be transformed." - } + }, + "refined_type": {} }, { "name": "y", @@ -148361,13 +159917,14 @@ "docstring": { "type": "array-like of shape (n_samples, n_outputs), default=None", "description": "Targets for supervised learning." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Fit all transformers, transform the data and concatenate results.", - "docstring": "Fit all transformers, transform the data and concatenate results.\n\nParameters\n----------\nX : iterable or array-like, depending on transformers\n Input data to be transformed.\n\ny : array-like of shape (n_samples, n_outputs), default=None\n Targets for supervised learning.\n\n**fit_params : dict, default=None\n Parameters to pass to the fit method of the estimator.\n\nReturns\n-------\nX_t : array-like or sparse matrix of shape (n_samples, sum_n_components)\n The `hstack` of results of transformers. `sum_n_components` is the\n sum of `n_components` (output dimension) over transformers.", + "docstring": "Fit all transformers, transform the data and concatenate results.\n\n Parameters\n ----------\n X : iterable or array-like, depending on transformers\n Input data to be transformed.\n\n y : array-like of shape (n_samples, n_outputs), default=None\n Targets for supervised learning.\n\n **fit_params : dict, default=None\n Parameters to pass to the fit method of the estimator.\n\n Returns\n -------\n X_t : array-like or sparse matrix of shape (n_samples, sum_n_components)\n The `hstack` of results of transformers. 
`sum_n_components` is the\n sum of `n_components` (output dimension) over transformers.\n ", "source_code": "\ndef fit_transform(self, X, y=None, **fit_params):\n \"\"\"Fit all transformers, transform the data and concatenate results.\n\n Parameters\n ----------\n X : iterable or array-like, depending on transformers\n Input data to be transformed.\n\n y : array-like of shape (n_samples, n_outputs), default=None\n Targets for supervised learning.\n\n **fit_params : dict, default=None\n Parameters to pass to the fit method of the estimator.\n\n Returns\n -------\n X_t : array-like or sparse matrix of shape (n_samples, sum_n_components)\n The `hstack` of results of transformers. `sum_n_components` is the\n sum of `n_components` (output dimension) over transformers.\n \"\"\"\n results = self._parallel_func(X, y, fit_params, _fit_transform_one)\n if not results:\n return np.zeros((X.shape[0], 0))\n (Xs, transformers) = zip(*results)\n self._update_transformer_list(transformers)\n return self._hstack(Xs)" }, { @@ -148387,13 +159944,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Get feature names from all transformers.", - "docstring": "Get feature names from all transformers.\n\nReturns\n-------\nfeature_names : list of strings\n Names of the features produced by transform.", + "docstring": "Get feature names from all transformers.\n\n Returns\n -------\n feature_names : list of strings\n Names of the features produced by transform.\n ", "source_code": "\n@deprecated('get_feature_names is deprecated in 1.0 and will be removed in 1.2. Please use get_feature_names_out instead.')\ndef get_feature_names(self):\n \"\"\"Get feature names from all transformers.\n\n Returns\n -------\n feature_names : list of strings\n Names of the features produced by transform.\n \"\"\"\n feature_names = []\n for (name, trans, weight) in self._iter():\n if not hasattr(trans, 'get_feature_names'):\n raise AttributeError('Transformer %s (type %s) does not provide get_feature_names.' % (str(name), type(trans).__name__))\n feature_names.extend([name + '__' + f for f in trans.get_feature_names()])\n return feature_names" }, { @@ -148411,7 +159969,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "input_features", @@ -148421,13 +159980,14 @@ "docstring": { "type": "array-like of str or None, default=None", "description": "Input features." 
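To make the FeatureUnion behaviour described above concrete (transformers fitted in parallel, outputs hstacked, optional per-transformer weights), a brief sketch on toy data; the choice of PCA and SelectKBest is an arbitrary assumption.

    import numpy as np
    from sklearn.pipeline import FeatureUnion
    from sklearn.decomposition import PCA
    from sklearn.feature_selection import SelectKBest, f_classif

    X = np.random.RandomState(0).rand(20, 6)
    y = (X[:, 0] > 0.5).astype(int)

    union = FeatureUnion(
        [("pca", PCA(n_components=2)), ("kbest", SelectKBest(f_classif, k=1))],
        transformer_weights={"pca": 1.0, "kbest": 2.0},
    )
    Xt = union.fit_transform(X, y)          # hstack of both outputs: shape (20, 3)
    names = union.get_feature_names_out()   # e.g. ['pca__pca0', 'pca__pca1', 'kbest__x0']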
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Get output feature names for transformation.", - "docstring": "Get output feature names for transformation.\n\nParameters\n----------\ninput_features : array-like of str or None, default=None\n Input features.\n\nReturns\n-------\nfeature_names_out : ndarray of str objects\n Transformed feature names.", + "docstring": "Get output feature names for transformation.\n\n Parameters\n ----------\n input_features : array-like of str or None, default=None\n Input features.\n\n Returns\n -------\n feature_names_out : ndarray of str objects\n Transformed feature names.\n ", "source_code": "\ndef get_feature_names_out(self, input_features=None):\n \"\"\"Get output feature names for transformation.\n\n Parameters\n ----------\n input_features : array-like of str or None, default=None\n Input features.\n\n Returns\n -------\n feature_names_out : ndarray of str objects\n Transformed feature names.\n \"\"\"\n feature_names = []\n for (name, trans, _) in self._iter():\n if not hasattr(trans, 'get_feature_names_out'):\n raise AttributeError('Transformer %s (type %s) does not provide get_feature_names_out.' % (str(name), type(trans).__name__))\n feature_names.extend([f'{name}__{f}' for f in trans.get_feature_names_out(input_features)])\n return np.asarray(feature_names, dtype=object)" }, { @@ -148445,7 +160005,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "deep", @@ -148455,13 +160016,14 @@ "docstring": { "type": "bool, default=True", "description": "If True, will return the parameters for this estimator and\ncontained subobjects that are estimators." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Get parameters for this estimator.\n\nReturns the parameters given in the constructor as well as the estimators contained within the `transformer_list` of the `FeatureUnion`.", - "docstring": "Get parameters for this estimator.\n\nReturns the parameters given in the constructor as well as the\nestimators contained within the `transformer_list` of the\n`FeatureUnion`.\n\nParameters\n----------\ndeep : bool, default=True\n If True, will return the parameters for this estimator and\n contained subobjects that are estimators.\n\nReturns\n-------\nparams : mapping of string to any\n Parameter names mapped to their values.", + "description": "Get parameters for this estimator.\n\nReturns the parameters given in the constructor as well as the\nestimators contained within the `transformer_list` of the\n`FeatureUnion`.", + "docstring": "Get parameters for this estimator.\n\n Returns the parameters given in the constructor as well as the\n estimators contained within the `transformer_list` of the\n `FeatureUnion`.\n\n Parameters\n ----------\n deep : bool, default=True\n If True, will return the parameters for this estimator and\n contained subobjects that are estimators.\n\n Returns\n -------\n params : mapping of string to any\n Parameter names mapped to their values.\n ", "source_code": "\ndef get_params(self, deep=True):\n \"\"\"Get parameters for this estimator.\n\n Returns the parameters given in the constructor as well as the\n estimators contained within the `transformer_list` of the\n `FeatureUnion`.\n\n Parameters\n ----------\n deep : bool, default=True\n If True, will return the parameters for this estimator and\n contained subobjects that are estimators.\n\n Returns\n -------\n params : mapping of string to any\n Parameter names mapped to 
their values.\n \"\"\"\n return self._get_params('transformer_list', deep=deep)" }, { @@ -148479,7 +160041,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -148503,13 +160066,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Set the parameters of this estimator.\n\nValid parameter keys can be listed with ``get_params()``. Note that you can directly set the parameters of the estimators contained in `tranformer_list`.", - "docstring": "Set the parameters of this estimator.\n\nValid parameter keys can be listed with ``get_params()``. Note that\nyou can directly set the parameters of the estimators contained in\n`tranformer_list`.\n\nParameters\n----------\n**kwargs : dict\n Parameters of this estimator or parameters of estimators contained\n in `transform_list`. Parameters of the transformers may be set\n using its name and the parameter name separated by a '__'.\n\nReturns\n-------\nself : object\n FeatureUnion class instance.", + "description": "Set the parameters of this estimator.\n\nValid parameter keys can be listed with ``get_params()``. Note that\nyou can directly set the parameters of the estimators contained in\n`tranformer_list`.", + "docstring": "Set the parameters of this estimator.\n\n Valid parameter keys can be listed with ``get_params()``. Note that\n you can directly set the parameters of the estimators contained in\n `tranformer_list`.\n\n Parameters\n ----------\n **kwargs : dict\n Parameters of this estimator or parameters of estimators contained\n in `transform_list`. Parameters of the transformers may be set\n using its name and the parameter name separated by a '__'.\n\n Returns\n -------\n self : object\n FeatureUnion class instance.\n ", "source_code": "\ndef set_params(self, **kwargs):\n \"\"\"Set the parameters of this estimator.\n\n Valid parameter keys can be listed with ``get_params()``. Note that\n you can directly set the parameters of the estimators contained in\n `tranformer_list`.\n\n Parameters\n ----------\n **kwargs : dict\n Parameters of this estimator or parameters of estimators contained\n in `transform_list`. Parameters of the transformers may be set\n using its name and the parameter name separated by a '__'.\n\n Returns\n -------\n self : object\n FeatureUnion class instance.\n \"\"\"\n self._set_params('transformer_list', **kwargs)\n return self" }, { @@ -148527,7 +160091,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -148537,13 +160102,14 @@ "docstring": { "type": "iterable or array-like, depending on transformers", "description": "Input data to be transformed." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Transform X separately by each transformer, concatenate results.", - "docstring": "Transform X separately by each transformer, concatenate results.\n\nParameters\n----------\nX : iterable or array-like, depending on transformers\n Input data to be transformed.\n\nReturns\n-------\nX_t : array-like or sparse matrix of shape (n_samples, sum_n_components)\n The `hstack` of results of transformers. 
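The get_params / set_params convention documented above, where nested estimators are addressed as name__parameter and a whole transformer can be swapped out by name, in a short sketch; the parameter values are arbitrary.

    from sklearn.pipeline import FeatureUnion
    from sklearn.decomposition import PCA, TruncatedSVD

    union = FeatureUnion([("pca", PCA(n_components=2)), ("svd", TruncatedSVD(n_components=2))])
    union.set_params(pca__n_components=3)   # reach into a contained transformer
    union.set_params(svd="drop")            # replace a transformer by the 'drop' sentinel
    params = union.get_params(deep=True)    # includes 'pca__n_components', 'n_jobs', ...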
`sum_n_components` is the\n sum of `n_components` (output dimension) over transformers.", + "docstring": "Transform X separately by each transformer, concatenate results.\n\n Parameters\n ----------\n X : iterable or array-like, depending on transformers\n Input data to be transformed.\n\n Returns\n -------\n X_t : array-like or sparse matrix of shape (n_samples, sum_n_components)\n The `hstack` of results of transformers. `sum_n_components` is the\n sum of `n_components` (output dimension) over transformers.\n ", "source_code": "\ndef transform(self, X):\n \"\"\"Transform X separately by each transformer, concatenate results.\n\n Parameters\n ----------\n X : iterable or array-like, depending on transformers\n Input data to be transformed.\n\n Returns\n -------\n X_t : array-like or sparse matrix of shape (n_samples, sum_n_components)\n The `hstack` of results of transformers. `sum_n_components` is the\n sum of `n_components` (output dimension) over transformers.\n \"\"\"\n Xs = Parallel(n_jobs=self.n_jobs)((delayed(_transform_one)(trans, X, None, weight) for (name, trans, weight) in self._iter()))\n if not Xs:\n return np.zeros((X.shape[0], 0))\n return self._hstack(Xs)" }, { @@ -148561,7 +160127,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "ind", @@ -148571,13 +160138,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Returns a sub-pipeline or a single estimator in the pipeline\n\nIndexing with an integer will return an estimator; using a slice returns another Pipeline instance which copies a slice of this Pipeline. This copy is shallow: modifying (or fitting) estimators in the sub-pipeline will affect the larger pipeline and vice-versa. However, replacing a value in `step` will not affect a copy.", - "docstring": "Returns a sub-pipeline or a single estimator in the pipeline\n\nIndexing with an integer will return an estimator; using a slice\nreturns another Pipeline instance which copies a slice of this\nPipeline. This copy is shallow: modifying (or fitting) estimators in\nthe sub-pipeline will affect the larger pipeline and vice-versa.\nHowever, replacing a value in `step` will not affect a copy.", + "description": "Returns a sub-pipeline or a single estimator in the pipeline\n\nIndexing with an integer will return an estimator; using a slice\nreturns another Pipeline instance which copies a slice of this\nPipeline. This copy is shallow: modifying (or fitting) estimators in\nthe sub-pipeline will affect the larger pipeline and vice-versa.\nHowever, replacing a value in `step` will not affect a copy.", + "docstring": "Returns a sub-pipeline or a single estimator in the pipeline\n\n Indexing with an integer will return an estimator; using a slice\n returns another Pipeline instance which copies a slice of this\n Pipeline. This copy is shallow: modifying (or fitting) estimators in\n the sub-pipeline will affect the larger pipeline and vice-versa.\n However, replacing a value in `step` will not affect a copy.\n ", "source_code": "\ndef __getitem__(self, ind):\n \"\"\"Returns a sub-pipeline or a single estimator in the pipeline\n\n Indexing with an integer will return an estimator; using a slice\n returns another Pipeline instance which copies a slice of this\n Pipeline. 
This copy is shallow: modifying (or fitting) estimators in\n the sub-pipeline will affect the larger pipeline and vice-versa.\n However, replacing a value in `step` will not affect a copy.\n \"\"\"\n if isinstance(ind, slice):\n if ind.step not in (1, None):\n raise ValueError('Pipeline slicing only supports a step of 1')\n return self.__class__(self.steps[ind], memory=self.memory, verbose=self.verbose)\n try:\n (name, est) = self.steps[ind]\n except TypeError:\n return self.named_steps[ind]\n return est" }, { @@ -148595,7 +160163,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "steps", @@ -148605,7 +160174,8 @@ "docstring": { "type": "list of tuple", "description": "List of (name, transform) tuples (implementing `fit`/`transform`) that\nare chained, in the order in which they are chained, with the last\nobject an estimator." - } + }, + "refined_type": {} }, { "name": "memory", @@ -148615,7 +160185,8 @@ "docstring": { "type": "str or object with the joblib.Memory interface, default=None", "description": "Used to cache the fitted transformers of the pipeline. By default,\nno caching is performed. If a string is given, it is the path to\nthe caching directory. Enabling caching triggers a clone of\nthe transformers before fitting. Therefore, the transformer\ninstance given to the pipeline cannot be inspected\ndirectly. Use the attribute ``named_steps`` or ``steps`` to\ninspect estimators within the pipeline. Caching the\ntransformers is advantageous when fitting is time consuming." - } + }, + "refined_type": {} }, { "name": "verbose", @@ -148625,13 +160196,14 @@ "docstring": { "type": "bool, default=False", "description": "If True, the time elapsed while fitting each step will be printed as it\nis completed." 
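A short sketch of the indexing and slicing behaviour documented for Pipeline.__getitem__ above; the concrete steps (scaler plus logistic regression) are assumptions for illustration only.

    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import StandardScaler
    from sklearn.linear_model import LogisticRegression

    pipe = Pipeline([("scale", StandardScaler()), ("clf", LogisticRegression())])

    len(pipe)       # 2
    pipe[0]         # integer index -> the StandardScaler instance
    pipe["clf"]     # lookup by step name via named_steps
    sub = pipe[:1]  # a new Pipeline containing only the scaler (shallow copy of the step)
    # pipe[::2] would raise ValueError: Pipeline slicing only supports a step of 1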
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, steps, *, memory=None, verbose=False):\n self.steps = steps\n self.memory = memory\n self.verbose = verbose\n self._validate_steps()" }, { @@ -148649,13 +160221,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Returns the length of the Pipeline", - "docstring": "Returns the length of the Pipeline", + "docstring": "\n Returns the length of the Pipeline\n ", "source_code": "\ndef __len__(self):\n \"\"\"\n Returns the length of the Pipeline\n \"\"\"\n return len(self.steps)" }, { @@ -148673,7 +160246,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -148697,13 +160271,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _can_inverse_transform(self):\n return all((hasattr(t, 'inverse_transform') for (_, _, t) in self._iter()))" }, { @@ -148721,13 +160296,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _can_transform(self):\n return self._final_estimator == 'passthrough' or hasattr(self._final_estimator, 'transform')" }, { @@ -148745,13 +160321,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _check_fit_params(self, **fit_params):\n fit_params_steps = {name: {} for (name, step) in self.steps if step is not None}\n for (pname, pval) in fit_params.items():\n if '__' not in pname:\n raise ValueError('Pipeline.fit does not accept the {} parameter. You can pass parameters to specific steps of your pipeline using the stepname__parameter format, e.g. 
`Pipeline.fit(X, y, logisticregression__sample_weight=sample_weight)`.'.format(pname))\n (step, param) = pname.split('__', 1)\n fit_params_steps[step][param] = pval\n return fit_params_steps" }, { @@ -148769,13 +160346,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@property\ndef _estimator_type(self):\n return self.steps[-1][1]._estimator_type" }, { @@ -148793,13 +160371,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@property\ndef _final_estimator(self):\n estimator = self.steps[-1][1]\n return 'passthrough' if estimator is None else estimator" }, { @@ -148817,7 +160396,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -148827,7 +160407,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -148837,13 +160418,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _fit(self, X, y=None, **fit_params_steps):\n self.steps = list(self.steps)\n self._validate_steps()\n memory = check_memory(self.memory)\n fit_transform_one_cached = memory.cache(_fit_transform_one)\n for (step_idx, name, transformer) in self._iter(with_final=False, filter_passthrough=False):\n if transformer is None or transformer == 'passthrough':\n with _print_elapsed_time('Pipeline', self._log_message(step_idx)):\n continue\n if hasattr(memory, 'location'):\n if memory.location is None:\n cloned_transformer = transformer\n else:\n cloned_transformer = clone(transformer)\n elif hasattr(memory, 'cachedir'):\n if memory.cachedir is None:\n cloned_transformer = transformer\n else:\n cloned_transformer = clone(transformer)\n else:\n cloned_transformer = clone(transformer)\n (X, fitted_transformer) = fit_transform_one_cached(cloned_transformer, X, y, None, message_clsname='Pipeline', message=self._log_message(step_idx), **fit_params_steps[name])\n self.steps[step_idx] = (name, fitted_transformer)\n return X" }, { @@ -148861,7 +160443,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "with_final", @@ -148871,7 +160454,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "filter_passthrough", @@ -148881,13 +160465,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Generate (idx, (name, trans)) tuples from self.steps\n\nWhen filter_passthrough is True, 'passthrough' and None transformers are filtered out.", - "docstring": "Generate (idx, (name, trans)) tuples from self.steps\n\nWhen filter_passthrough is True, 'passthrough' and None transformers\nare filtered out.", + "description": "Generate (idx, (name, trans)) tuples from self.steps\n\nWhen filter_passthrough is True, 'passthrough' and None transformers\nare filtered out.", + "docstring": "\n Generate (idx, (name, trans)) tuples from self.steps\n\n When filter_passthrough is True, 'passthrough' and None transformers\n are filtered out.\n ", "source_code": "\ndef _iter(self, with_final=True, filter_passthrough=True):\n \"\"\"\n Generate (idx, (name, trans)) tuples from 
self.steps\n\n When filter_passthrough is True, 'passthrough' and None transformers\n are filtered out.\n \"\"\"\n stop = len(self.steps)\n if not with_final:\n stop -= 1\n for (idx, (name, trans)) in enumerate(islice(self.steps, 0, stop)):\n if not filter_passthrough:\n yield (idx, name, trans)\n elif trans is not None and trans != 'passthrough':\n yield (idx, name, trans)" }, { @@ -148905,7 +160490,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "step_idx", @@ -148915,13 +160501,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _log_message(self, step_idx):\n if not self.verbose:\n return None\n (name, _) = self.steps[step_idx]\n return '(step %d of %d) Processing %s' % (step_idx + 1, len(self.steps), name)" }, { @@ -148939,13 +160526,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _more_tags(self):\n return {'pairwise': _safe_tags(self.steps[0][1], 'pairwise')}" }, { @@ -148966,13 +160554,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@deprecated('Attribute `_pairwise` was deprecated in version 0.24 and will be removed in 1.1 (renaming of 0.26).')\n@property\ndef _pairwise(self):\n return getattr(self.steps[0][1], '_pairwise', False)" }, { @@ -148990,13 +160579,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _sk_visual_block_(self):\n (_, estimators) = zip(*self.steps)\n \n def _get_name(name, est):\n if est is None or est == 'passthrough':\n return f'{name}: passthrough'\n return f'{name}: {est.__class__.__name__}'\n names = [_get_name(name, est) for (name, est) in self.steps]\n name_details = [str(est) for est in estimators]\n return _VisualBlock('serial', estimators, names=names, name_details=name_details, dash_wrapped=False)" }, { @@ -149014,13 +160604,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _validate_steps(self):\n (names, estimators) = zip(*self.steps)\n self._validate_names(names)\n transformers = estimators[:-1]\n estimator = estimators[-1]\n for t in transformers:\n if t is None or t == 'passthrough':\n continue\n if not (hasattr(t, 'fit') or hasattr(t, 'fit_transform')) or not hasattr(t, 'transform'):\n raise TypeError(\"All intermediate steps should be transformers and implement fit and transform or be the string 'passthrough' '%s' (type %s) doesn't\" % (t, type(t)))\n if estimator is not None and estimator != 'passthrough' and not hasattr(estimator, 'fit'):\n raise TypeError(\"Last step of Pipeline should implement fit or be the string 'passthrough'. 
'%s' (type %s) doesn't\" % (estimator, type(estimator)))" }, { @@ -149038,7 +160629,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -149064,7 +160656,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -149074,13 +160667,14 @@ "docstring": { "type": "iterable", "description": "Data to predict on. Must fulfill input requirements of first step\nof the pipeline." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Transform the data, and apply `decision_function` with the final estimator.\n\nCall `transform` of each transformer in the pipeline. The transformed data are finally passed to the final estimator that calls `decision_function` method. Only valid if the final estimator implements `decision_function`.", - "docstring": "Transform the data, and apply `decision_function` with the final estimator.\n\nCall `transform` of each transformer in the pipeline. The transformed\ndata are finally passed to the final estimator that calls\n`decision_function` method. Only valid if the final estimator\nimplements `decision_function`.\n\nParameters\n----------\nX : iterable\n Data to predict on. Must fulfill input requirements of first step\n of the pipeline.\n\nReturns\n-------\ny_score : ndarray of shape (n_samples, n_classes)\n Result of calling `decision_function` on the final estimator.", + "description": "Transform the data, and apply `decision_function` with the final estimator.\n\nCall `transform` of each transformer in the pipeline. The transformed\ndata are finally passed to the final estimator that calls\n`decision_function` method. Only valid if the final estimator\nimplements `decision_function`.", + "docstring": "Transform the data, and apply `decision_function` with the final estimator.\n\n Call `transform` of each transformer in the pipeline. The transformed\n data are finally passed to the final estimator that calls\n `decision_function` method. Only valid if the final estimator\n implements `decision_function`.\n\n Parameters\n ----------\n X : iterable\n Data to predict on. Must fulfill input requirements of first step\n of the pipeline.\n\n Returns\n -------\n y_score : ndarray of shape (n_samples, n_classes)\n Result of calling `decision_function` on the final estimator.\n ", "source_code": "\n@available_if(_final_estimator_has('decision_function'))\ndef decision_function(self, X):\n \"\"\"Transform the data, and apply `decision_function` with the final estimator.\n\n Call `transform` of each transformer in the pipeline. The transformed\n data are finally passed to the final estimator that calls\n `decision_function` method. Only valid if the final estimator\n implements `decision_function`.\n\n Parameters\n ----------\n X : iterable\n Data to predict on. Must fulfill input requirements of first step\n of the pipeline.\n\n Returns\n -------\n y_score : ndarray of shape (n_samples, n_classes)\n Result of calling `decision_function` on the final estimator.\n \"\"\"\n Xt = X\n for (_, name, transform) in self._iter(with_final=False):\n Xt = transform.transform(Xt)\n return self.steps[-1][1].decision_function(Xt)" }, { @@ -149098,7 +160692,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -149122,7 +160717,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -149132,7 +160728,8 @@ "docstring": { "type": "iterable", "description": "Training data. 
Must fulfill input requirements of first step of the\npipeline." - } + }, + "refined_type": {} }, { "name": "y", @@ -149142,13 +160739,14 @@ "docstring": { "type": "iterable, default=None", "description": "Training targets. Must fulfill label requirements for all steps of\nthe pipeline." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Fit the model.\n\nFit all the transformers one after the other and transform the data. Finally, fit the transformed data using the final estimator.", - "docstring": "Fit the model.\n\nFit all the transformers one after the other and transform the\ndata. Finally, fit the transformed data using the final estimator.\n\nParameters\n----------\nX : iterable\n Training data. Must fulfill input requirements of first step of the\n pipeline.\n\ny : iterable, default=None\n Training targets. Must fulfill label requirements for all steps of\n the pipeline.\n\n**fit_params : dict of string -> object\n Parameters passed to the ``fit`` method of each step, where\n each parameter name is prefixed such that parameter ``p`` for step\n ``s`` has key ``s__p``.\n\nReturns\n-------\nself : object\n Pipeline with fitted steps.", + "description": "Fit the model.\n\nFit all the transformers one after the other and transform the\ndata. Finally, fit the transformed data using the final estimator.", + "docstring": "Fit the model.\n\n Fit all the transformers one after the other and transform the\n data. Finally, fit the transformed data using the final estimator.\n\n Parameters\n ----------\n X : iterable\n Training data. Must fulfill input requirements of first step of the\n pipeline.\n\n y : iterable, default=None\n Training targets. Must fulfill label requirements for all steps of\n the pipeline.\n\n **fit_params : dict of string -> object\n Parameters passed to the ``fit`` method of each step, where\n each parameter name is prefixed such that parameter ``p`` for step\n ``s`` has key ``s__p``.\n\n Returns\n -------\n self : object\n Pipeline with fitted steps.\n ", "source_code": "\ndef fit(self, X, y=None, **fit_params):\n \"\"\"Fit the model.\n\n Fit all the transformers one after the other and transform the\n data. Finally, fit the transformed data using the final estimator.\n\n Parameters\n ----------\n X : iterable\n Training data. Must fulfill input requirements of first step of the\n pipeline.\n\n y : iterable, default=None\n Training targets. Must fulfill label requirements for all steps of\n the pipeline.\n\n **fit_params : dict of string -> object\n Parameters passed to the ``fit`` method of each step, where\n each parameter name is prefixed such that parameter ``p`` for step\n ``s`` has key ``s__p``.\n\n Returns\n -------\n self : object\n Pipeline with fitted steps.\n \"\"\"\n fit_params_steps = self._check_fit_params(**fit_params)\n Xt = self._fit(X, y, **fit_params_steps)\n with _print_elapsed_time('Pipeline', self._log_message(len(self.steps) - 1)):\n if self._final_estimator != 'passthrough':\n fit_params_last_step = fit_params_steps[self.steps[-1][0]]\n self._final_estimator.fit(Xt, y, **fit_params_last_step)\n return self" }, { @@ -149166,7 +160764,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -149176,7 +160775,8 @@ "docstring": { "type": "iterable", "description": "Training data. Must fulfill input requirements of first step of\nthe pipeline." 
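An end-to-end sketch of the Pipeline.fit / decision_function flow documented above, including the step-name prefix used to route fit parameters; the data set and the sample_weight routing are illustrative assumptions.

    import numpy as np
    from sklearn.datasets import make_classification
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import StandardScaler
    from sklearn.svm import LinearSVC

    X, y = make_classification(n_samples=100, random_state=0)
    w = np.ones(100)

    pipe = Pipeline([("scale", StandardScaler()), ("svc", LinearSVC())])
    # fit params use the step-name prefix: parameter p of step s is passed as s__p
    pipe.fit(X, y, svc__sample_weight=w)
    scores = pipe.decision_function(X)   # transform through 'scale', then SVC decision_function
    print(scores.shape)                  # (100,) for a binary problem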
- } + }, + "refined_type": {} }, { "name": "y", @@ -149186,13 +160786,14 @@ "docstring": { "type": "iterable, default=None", "description": "Training targets. Must fulfill label requirements for all steps\nof the pipeline." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Transform the data, and apply `fit_predict` with the final estimator.\n\nCall `fit_transform` of each transformer in the pipeline. The transformed data are finally passed to the final estimator that calls `fit_predict` method. Only valid if the final estimator implements `fit_predict`.", - "docstring": "Transform the data, and apply `fit_predict` with the final estimator.\n\nCall `fit_transform` of each transformer in the pipeline. The\ntransformed data are finally passed to the final estimator that calls\n`fit_predict` method. Only valid if the final estimator implements\n`fit_predict`.\n\nParameters\n----------\nX : iterable\n Training data. Must fulfill input requirements of first step of\n the pipeline.\n\ny : iterable, default=None\n Training targets. Must fulfill label requirements for all steps\n of the pipeline.\n\n**fit_params : dict of string -> object\n Parameters passed to the ``fit`` method of each step, where\n each parameter name is prefixed such that parameter ``p`` for step\n ``s`` has key ``s__p``.\n\nReturns\n-------\ny_pred : ndarray\n Result of calling `fit_predict` on the final estimator.", + "description": "Transform the data, and apply `fit_predict` with the final estimator.\n\nCall `fit_transform` of each transformer in the pipeline. The\ntransformed data are finally passed to the final estimator that calls\n`fit_predict` method. Only valid if the final estimator implements\n`fit_predict`.", + "docstring": "Transform the data, and apply `fit_predict` with the final estimator.\n\n Call `fit_transform` of each transformer in the pipeline. The\n transformed data are finally passed to the final estimator that calls\n `fit_predict` method. Only valid if the final estimator implements\n `fit_predict`.\n\n Parameters\n ----------\n X : iterable\n Training data. Must fulfill input requirements of first step of\n the pipeline.\n\n y : iterable, default=None\n Training targets. Must fulfill label requirements for all steps\n of the pipeline.\n\n **fit_params : dict of string -> object\n Parameters passed to the ``fit`` method of each step, where\n each parameter name is prefixed such that parameter ``p`` for step\n ``s`` has key ``s__p``.\n\n Returns\n -------\n y_pred : ndarray\n Result of calling `fit_predict` on the final estimator.\n ", "source_code": "\n@available_if(_final_estimator_has('fit_predict'))\ndef fit_predict(self, X, y=None, **fit_params):\n \"\"\"Transform the data, and apply `fit_predict` with the final estimator.\n\n Call `fit_transform` of each transformer in the pipeline. The\n transformed data are finally passed to the final estimator that calls\n `fit_predict` method. Only valid if the final estimator implements\n `fit_predict`.\n\n Parameters\n ----------\n X : iterable\n Training data. Must fulfill input requirements of first step of\n the pipeline.\n\n y : iterable, default=None\n Training targets. 
Must fulfill label requirements for all steps\n of the pipeline.\n\n **fit_params : dict of string -> object\n Parameters passed to the ``fit`` method of each step, where\n each parameter name is prefixed such that parameter ``p`` for step\n ``s`` has key ``s__p``.\n\n Returns\n -------\n y_pred : ndarray\n Result of calling `fit_predict` on the final estimator.\n \"\"\"\n fit_params_steps = self._check_fit_params(**fit_params)\n Xt = self._fit(X, y, **fit_params_steps)\n fit_params_last_step = fit_params_steps[self.steps[-1][0]]\n with _print_elapsed_time('Pipeline', self._log_message(len(self.steps) - 1)):\n y_pred = self.steps[-1][1].fit_predict(Xt, y, **fit_params_last_step)\n return y_pred" }, { @@ -149210,7 +160811,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -149220,7 +160822,8 @@ "docstring": { "type": "iterable", "description": "Training data. Must fulfill input requirements of first step of the\npipeline." - } + }, + "refined_type": {} }, { "name": "y", @@ -149230,13 +160833,14 @@ "docstring": { "type": "iterable, default=None", "description": "Training targets. Must fulfill label requirements for all steps of\nthe pipeline." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Fit the model and transform with the final estimator.\n\nFits all the transformers one after the other and transform the data. Then uses `fit_transform` on transformed data with the final estimator.", - "docstring": "Fit the model and transform with the final estimator.\n\nFits all the transformers one after the other and transform the\ndata. Then uses `fit_transform` on transformed data with the final\nestimator.\n\nParameters\n----------\nX : iterable\n Training data. Must fulfill input requirements of first step of the\n pipeline.\n\ny : iterable, default=None\n Training targets. Must fulfill label requirements for all steps of\n the pipeline.\n\n**fit_params : dict of string -> object\n Parameters passed to the ``fit`` method of each step, where\n each parameter name is prefixed such that parameter ``p`` for step\n ``s`` has key ``s__p``.\n\nReturns\n-------\nXt : ndarray of shape (n_samples, n_transformed_features)\n Transformed samples.", + "description": "Fit the model and transform with the final estimator.\n\nFits all the transformers one after the other and transform the\ndata. Then uses `fit_transform` on transformed data with the final\nestimator.", + "docstring": "Fit the model and transform with the final estimator.\n\n Fits all the transformers one after the other and transform the\n data. Then uses `fit_transform` on transformed data with the final\n estimator.\n\n Parameters\n ----------\n X : iterable\n Training data. Must fulfill input requirements of first step of the\n pipeline.\n\n y : iterable, default=None\n Training targets. Must fulfill label requirements for all steps of\n the pipeline.\n\n **fit_params : dict of string -> object\n Parameters passed to the ``fit`` method of each step, where\n each parameter name is prefixed such that parameter ``p`` for step\n ``s`` has key ``s__p``.\n\n Returns\n -------\n Xt : ndarray of shape (n_samples, n_transformed_features)\n Transformed samples.\n ", "source_code": "\ndef fit_transform(self, X, y=None, **fit_params):\n \"\"\"Fit the model and transform with the final estimator.\n\n Fits all the transformers one after the other and transform the\n data. 
Then uses `fit_transform` on transformed data with the final\n estimator.\n\n Parameters\n ----------\n X : iterable\n Training data. Must fulfill input requirements of first step of the\n pipeline.\n\n y : iterable, default=None\n Training targets. Must fulfill label requirements for all steps of\n the pipeline.\n\n **fit_params : dict of string -> object\n Parameters passed to the ``fit`` method of each step, where\n each parameter name is prefixed such that parameter ``p`` for step\n ``s`` has key ``s__p``.\n\n Returns\n -------\n Xt : ndarray of shape (n_samples, n_transformed_features)\n Transformed samples.\n \"\"\"\n fit_params_steps = self._check_fit_params(**fit_params)\n Xt = self._fit(X, y, **fit_params_steps)\n last_step = self._final_estimator\n with _print_elapsed_time('Pipeline', self._log_message(len(self.steps) - 1)):\n if last_step == 'passthrough':\n return Xt\n fit_params_last_step = fit_params_steps[self.steps[-1][0]]\n if hasattr(last_step, 'fit_transform'):\n return last_step.fit_transform(Xt, y, **fit_params_last_step)\n else:\n return last_step.fit(Xt, y, **fit_params_last_step).transform(Xt)" }, { @@ -149254,7 +160858,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "input_features", @@ -149264,13 +160869,14 @@ "docstring": { "type": "array-like of str or None, default=None", "description": "Input features." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Get output feature names for transformation.\n\nTransform input features using the pipeline.", - "docstring": "Get output feature names for transformation.\n\nTransform input features using the pipeline.\n\nParameters\n----------\ninput_features : array-like of str or None, default=None\n Input features.\n\nReturns\n-------\nfeature_names_out : ndarray of str objects\n Transformed feature names.", + "docstring": "Get output feature names for transformation.\n\n Transform input features using the pipeline.\n\n Parameters\n ----------\n input_features : array-like of str or None, default=None\n Input features.\n\n Returns\n -------\n feature_names_out : ndarray of str objects\n Transformed feature names.\n ", "source_code": "\ndef get_feature_names_out(self, input_features=None):\n \"\"\"Get output feature names for transformation.\n\n Transform input features using the pipeline.\n\n Parameters\n ----------\n input_features : array-like of str or None, default=None\n Input features.\n\n Returns\n -------\n feature_names_out : ndarray of str objects\n Transformed feature names.\n \"\"\"\n feature_names_out = input_features\n for (_, name, transform) in self._iter():\n if not hasattr(transform, 'get_feature_names_out'):\n raise AttributeError('Estimator {} does not provide get_feature_names_out. Did you mean to call pipeline[:-1].get_feature_names_out()?'.format(name))\n feature_names_out = transform.get_feature_names_out(feature_names_out)\n return feature_names_out" }, { @@ -149288,7 +160894,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "deep", @@ -149298,13 +160905,14 @@ "docstring": { "type": "bool, default=True", "description": "If True, will return the parameters for this estimator and\ncontained subobjects that are estimators." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Get parameters for this estimator.\n\nReturns the parameters given in the constructor as well as the estimators contained within the `steps` of the `Pipeline`.", - "docstring": "Get parameters for this estimator.\n\nReturns the parameters given in the constructor as well as the\nestimators contained within the `steps` of the `Pipeline`.\n\nParameters\n----------\ndeep : bool, default=True\n If True, will return the parameters for this estimator and\n contained subobjects that are estimators.\n\nReturns\n-------\nparams : mapping of string to any\n Parameter names mapped to their values.", + "description": "Get parameters for this estimator.\n\nReturns the parameters given in the constructor as well as the\nestimators contained within the `steps` of the `Pipeline`.", + "docstring": "Get parameters for this estimator.\n\n Returns the parameters given in the constructor as well as the\n estimators contained within the `steps` of the `Pipeline`.\n\n Parameters\n ----------\n deep : bool, default=True\n If True, will return the parameters for this estimator and\n contained subobjects that are estimators.\n\n Returns\n -------\n params : mapping of string to any\n Parameter names mapped to their values.\n ", "source_code": "\ndef get_params(self, deep=True):\n \"\"\"Get parameters for this estimator.\n\n Returns the parameters given in the constructor as well as the\n estimators contained within the `steps` of the `Pipeline`.\n\n Parameters\n ----------\n deep : bool, default=True\n If True, will return the parameters for this estimator and\n contained subobjects that are estimators.\n\n Returns\n -------\n params : mapping of string to any\n Parameter names mapped to their values.\n \"\"\"\n return self._get_params('steps', deep=deep)" }, { @@ -149322,7 +160930,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "Xt", @@ -149332,13 +160941,14 @@ "docstring": { "type": "array-like of shape (n_samples, n_transformed_features)", "description": "Data samples, where ``n_samples`` is the number of samples and\n``n_features`` is the number of features. Must fulfill\ninput requirements of last step of pipeline's\n``inverse_transform`` method." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Apply `inverse_transform` for each step in a reverse order.\n\nAll estimators in the pipeline must support `inverse_transform`.", - "docstring": "Apply `inverse_transform` for each step in a reverse order.\n\nAll estimators in the pipeline must support `inverse_transform`.\n\nParameters\n----------\nXt : array-like of shape (n_samples, n_transformed_features)\n Data samples, where ``n_samples`` is the number of samples and\n ``n_features`` is the number of features. Must fulfill\n input requirements of last step of pipeline's\n ``inverse_transform`` method.\n\nReturns\n-------\nXt : ndarray of shape (n_samples, n_features)\n Inverse transformed data, that is, data in the original feature\n space.", + "docstring": "Apply `inverse_transform` for each step in a reverse order.\n\n All estimators in the pipeline must support `inverse_transform`.\n\n Parameters\n ----------\n Xt : array-like of shape (n_samples, n_transformed_features)\n Data samples, where ``n_samples`` is the number of samples and\n ``n_features`` is the number of features. 
Must fulfill\n input requirements of last step of pipeline's\n ``inverse_transform`` method.\n\n Returns\n -------\n Xt : ndarray of shape (n_samples, n_features)\n Inverse transformed data, that is, data in the original feature\n space.\n ", "source_code": "\n@available_if(_can_inverse_transform)\ndef inverse_transform(self, Xt):\n \"\"\"Apply `inverse_transform` for each step in a reverse order.\n\n All estimators in the pipeline must support `inverse_transform`.\n\n Parameters\n ----------\n Xt : array-like of shape (n_samples, n_transformed_features)\n Data samples, where ``n_samples`` is the number of samples and\n ``n_features`` is the number of features. Must fulfill\n input requirements of last step of pipeline's\n ``inverse_transform`` method.\n\n Returns\n -------\n Xt : ndarray of shape (n_samples, n_features)\n Inverse transformed data, that is, data in the original feature\n space.\n \"\"\"\n reverse_iter = reversed(list(self._iter()))\n for (_, _, transform) in reverse_iter:\n Xt = transform.inverse_transform(Xt)\n return Xt" }, { @@ -149356,7 +160966,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -149380,13 +160991,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Access the steps by name.\n\nRead-only attribute to access any step by given name. Keys are steps names and values are the steps objects.", - "docstring": "Access the steps by name.\n\nRead-only attribute to access any step by given name.\nKeys are steps names and values are the steps objects.", + "description": "Access the steps by name.\n\nRead-only attribute to access any step by given name.\nKeys are steps names and values are the steps objects.", + "docstring": "Access the steps by name.\n\n Read-only attribute to access any step by given name.\n Keys are steps names and values are the steps objects.", "source_code": "\n@property\ndef named_steps(self):\n \"\"\"Access the steps by name.\n\n Read-only attribute to access any step by given name.\n Keys are steps names and values are the steps objects.\"\"\"\n return Bunch(**dict(self.steps))" }, { @@ -149404,7 +161016,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -149414,13 +161027,14 @@ "docstring": { "type": "iterable", "description": "Data to predict on. Must fulfill input requirements of first step\nof the pipeline." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Transform the data, and apply `predict` with the final estimator.\n\nCall `transform` of each transformer in the pipeline. The transformed data are finally passed to the final estimator that calls `predict` method. Only valid if the final estimator implements `predict`.", - "docstring": "Transform the data, and apply `predict` with the final estimator.\n\nCall `transform` of each transformer in the pipeline. The transformed\ndata are finally passed to the final estimator that calls `predict`\nmethod. Only valid if the final estimator implements `predict`.\n\nParameters\n----------\nX : iterable\n Data to predict on. Must fulfill input requirements of first step\n of the pipeline.\n\n**predict_params : dict of string -> object\n Parameters to the ``predict`` called at the end of all\n transformations in the pipeline. 
Note that while this may be\n used to return uncertainties from some models with return_std\n or return_cov, uncertainties that are generated by the\n transformations in the pipeline are not propagated to the\n final estimator.\n\n .. versionadded:: 0.20\n\nReturns\n-------\ny_pred : ndarray\n Result of calling `predict` on the final estimator.", + "description": "Transform the data, and apply `predict` with the final estimator.\n\nCall `transform` of each transformer in the pipeline. The transformed\ndata are finally passed to the final estimator that calls `predict`\nmethod. Only valid if the final estimator implements `predict`.", + "docstring": "Transform the data, and apply `predict` with the final estimator.\n\n Call `transform` of each transformer in the pipeline. The transformed\n data are finally passed to the final estimator that calls `predict`\n method. Only valid if the final estimator implements `predict`.\n\n Parameters\n ----------\n X : iterable\n Data to predict on. Must fulfill input requirements of first step\n of the pipeline.\n\n **predict_params : dict of string -> object\n Parameters to the ``predict`` called at the end of all\n transformations in the pipeline. Note that while this may be\n used to return uncertainties from some models with return_std\n or return_cov, uncertainties that are generated by the\n transformations in the pipeline are not propagated to the\n final estimator.\n\n .. versionadded:: 0.20\n\n Returns\n -------\n y_pred : ndarray\n Result of calling `predict` on the final estimator.\n ", "source_code": "\n@available_if(_final_estimator_has('predict'))\ndef predict(self, X, **predict_params):\n \"\"\"Transform the data, and apply `predict` with the final estimator.\n\n Call `transform` of each transformer in the pipeline. The transformed\n data are finally passed to the final estimator that calls `predict`\n method. Only valid if the final estimator implements `predict`.\n\n Parameters\n ----------\n X : iterable\n Data to predict on. Must fulfill input requirements of first step\n of the pipeline.\n\n **predict_params : dict of string -> object\n Parameters to the ``predict`` called at the end of all\n transformations in the pipeline. Note that while this may be\n used to return uncertainties from some models with return_std\n or return_cov, uncertainties that are generated by the\n transformations in the pipeline are not propagated to the\n final estimator.\n\n .. versionadded:: 0.20\n\n Returns\n -------\n y_pred : ndarray\n Result of calling `predict` on the final estimator.\n \"\"\"\n Xt = X\n for (_, name, transform) in self._iter(with_final=False):\n Xt = transform.transform(Xt)\n return self.steps[-1][1].predict(Xt, **predict_params)" }, { @@ -149440,7 +161054,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -149450,13 +161065,14 @@ "docstring": { "type": "iterable", "description": "Data to predict on. Must fulfill input requirements of first step\nof the pipeline." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Transform the data, and apply `predict_log_proba` with the final estimator.\n\nCall `transform` of each transformer in the pipeline. The transformed data are finally passed to the final estimator that calls `predict_log_proba` method. 
Only valid if the final estimator implements `predict_log_proba`.", - "docstring": "Transform the data, and apply `predict_log_proba` with the final estimator.\n\nCall `transform` of each transformer in the pipeline. The transformed\ndata are finally passed to the final estimator that calls\n`predict_log_proba` method. Only valid if the final estimator\nimplements `predict_log_proba`.\n\nParameters\n----------\nX : iterable\n Data to predict on. Must fulfill input requirements of first step\n of the pipeline.\n\n**predict_log_proba_params : dict of string -> object\n Parameters to the ``predict_log_proba`` called at the end of all\n transformations in the pipeline.\n\nReturns\n-------\ny_log_proba : ndarray of shape (n_samples, n_classes)\n Result of calling `predict_log_proba` on the final estimator.", + "description": "Transform the data, and apply `predict_log_proba` with the final estimator.\n\nCall `transform` of each transformer in the pipeline. The transformed\ndata are finally passed to the final estimator that calls\n`predict_log_proba` method. Only valid if the final estimator\nimplements `predict_log_proba`.", + "docstring": "Transform the data, and apply `predict_log_proba` with the final estimator.\n\n Call `transform` of each transformer in the pipeline. The transformed\n data are finally passed to the final estimator that calls\n `predict_log_proba` method. Only valid if the final estimator\n implements `predict_log_proba`.\n\n Parameters\n ----------\n X : iterable\n Data to predict on. Must fulfill input requirements of first step\n of the pipeline.\n\n **predict_log_proba_params : dict of string -> object\n Parameters to the ``predict_log_proba`` called at the end of all\n transformations in the pipeline.\n\n Returns\n -------\n y_log_proba : ndarray of shape (n_samples, n_classes)\n Result of calling `predict_log_proba` on the final estimator.\n ", "source_code": "\n@available_if(_final_estimator_has('predict_log_proba'))\ndef predict_log_proba(self, X, **predict_log_proba_params):\n \"\"\"Transform the data, and apply `predict_log_proba` with the final estimator.\n\n Call `transform` of each transformer in the pipeline. The transformed\n data are finally passed to the final estimator that calls\n `predict_log_proba` method. Only valid if the final estimator\n implements `predict_log_proba`.\n\n Parameters\n ----------\n X : iterable\n Data to predict on. Must fulfill input requirements of first step\n of the pipeline.\n\n **predict_log_proba_params : dict of string -> object\n Parameters to the ``predict_log_proba`` called at the end of all\n transformations in the pipeline.\n\n Returns\n -------\n y_log_proba : ndarray of shape (n_samples, n_classes)\n Result of calling `predict_log_proba` on the final estimator.\n \"\"\"\n Xt = X\n for (_, name, transform) in self._iter(with_final=False):\n Xt = transform.transform(Xt)\n return self.steps[-1][1].predict_log_proba(Xt, **predict_log_proba_params)" }, { @@ -149476,7 +161092,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -149486,13 +161103,14 @@ "docstring": { "type": "iterable", "description": "Data to predict on. Must fulfill input requirements of first step\nof the pipeline." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Transform the data, and apply `predict_proba` with the final estimator.\n\nCall `transform` of each transformer in the pipeline. 
The transformed data are finally passed to the final estimator that calls `predict_proba` method. Only valid if the final estimator implements `predict_proba`.", - "docstring": "Transform the data, and apply `predict_proba` with the final estimator.\n\nCall `transform` of each transformer in the pipeline. The transformed\ndata are finally passed to the final estimator that calls\n`predict_proba` method. Only valid if the final estimator implements\n`predict_proba`.\n\nParameters\n----------\nX : iterable\n Data to predict on. Must fulfill input requirements of first step\n of the pipeline.\n\n**predict_proba_params : dict of string -> object\n Parameters to the `predict_proba` called at the end of all\n transformations in the pipeline.\n\nReturns\n-------\ny_proba : ndarray of shape (n_samples, n_classes)\n Result of calling `predict_proba` on the final estimator.", + "description": "Transform the data, and apply `predict_proba` with the final estimator.\n\nCall `transform` of each transformer in the pipeline. The transformed\ndata are finally passed to the final estimator that calls\n`predict_proba` method. Only valid if the final estimator implements\n`predict_proba`.", + "docstring": "Transform the data, and apply `predict_proba` with the final estimator.\n\n Call `transform` of each transformer in the pipeline. The transformed\n data are finally passed to the final estimator that calls\n `predict_proba` method. Only valid if the final estimator implements\n `predict_proba`.\n\n Parameters\n ----------\n X : iterable\n Data to predict on. Must fulfill input requirements of first step\n of the pipeline.\n\n **predict_proba_params : dict of string -> object\n Parameters to the `predict_proba` called at the end of all\n transformations in the pipeline.\n\n Returns\n -------\n y_proba : ndarray of shape (n_samples, n_classes)\n Result of calling `predict_proba` on the final estimator.\n ", "source_code": "\n@available_if(_final_estimator_has('predict_proba'))\ndef predict_proba(self, X, **predict_proba_params):\n \"\"\"Transform the data, and apply `predict_proba` with the final estimator.\n\n Call `transform` of each transformer in the pipeline. The transformed\n data are finally passed to the final estimator that calls\n `predict_proba` method. Only valid if the final estimator implements\n `predict_proba`.\n\n Parameters\n ----------\n X : iterable\n Data to predict on. Must fulfill input requirements of first step\n of the pipeline.\n\n **predict_proba_params : dict of string -> object\n Parameters to the `predict_proba` called at the end of all\n transformations in the pipeline.\n\n Returns\n -------\n y_proba : ndarray of shape (n_samples, n_classes)\n Result of calling `predict_proba` on the final estimator.\n \"\"\"\n Xt = X\n for (_, name, transform) in self._iter(with_final=False):\n Xt = transform.transform(Xt)\n return self.steps[-1][1].predict_proba(Xt, **predict_proba_params)" }, { @@ -149510,7 +161128,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -149520,7 +161139,8 @@ "docstring": { "type": "iterable", "description": "Data to predict on. Must fulfill input requirements of first step\nof the pipeline." - } + }, + "refined_type": {} }, { "name": "y", @@ -149530,7 +161150,8 @@ "docstring": { "type": "iterable, default=None", "description": "Targets used for scoring. Must fulfill label requirements for all\nsteps of the pipeline." 
- } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -149540,13 +161161,14 @@ "docstring": { "type": "array-like, default=None", "description": "If not None, this argument is passed as ``sample_weight`` keyword\nargument to the ``score`` method of the final estimator." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Transform the data, and apply `score` with the final estimator.\n\nCall `transform` of each transformer in the pipeline. The transformed data are finally passed to the final estimator that calls `score` method. Only valid if the final estimator implements `score`.", - "docstring": "Transform the data, and apply `score` with the final estimator.\n\nCall `transform` of each transformer in the pipeline. The transformed\ndata are finally passed to the final estimator that calls\n`score` method. Only valid if the final estimator implements `score`.\n\nParameters\n----------\nX : iterable\n Data to predict on. Must fulfill input requirements of first step\n of the pipeline.\n\ny : iterable, default=None\n Targets used for scoring. Must fulfill label requirements for all\n steps of the pipeline.\n\nsample_weight : array-like, default=None\n If not None, this argument is passed as ``sample_weight`` keyword\n argument to the ``score`` method of the final estimator.\n\nReturns\n-------\nscore : float\n Result of calling `score` on the final estimator.", + "description": "Transform the data, and apply `score` with the final estimator.\n\nCall `transform` of each transformer in the pipeline. The transformed\ndata are finally passed to the final estimator that calls\n`score` method. Only valid if the final estimator implements `score`.", + "docstring": "Transform the data, and apply `score` with the final estimator.\n\n Call `transform` of each transformer in the pipeline. The transformed\n data are finally passed to the final estimator that calls\n `score` method. Only valid if the final estimator implements `score`.\n\n Parameters\n ----------\n X : iterable\n Data to predict on. Must fulfill input requirements of first step\n of the pipeline.\n\n y : iterable, default=None\n Targets used for scoring. Must fulfill label requirements for all\n steps of the pipeline.\n\n sample_weight : array-like, default=None\n If not None, this argument is passed as ``sample_weight`` keyword\n argument to the ``score`` method of the final estimator.\n\n Returns\n -------\n score : float\n Result of calling `score` on the final estimator.\n ", "source_code": "\n@available_if(_final_estimator_has('score'))\ndef score(self, X, y=None, sample_weight=None):\n \"\"\"Transform the data, and apply `score` with the final estimator.\n\n Call `transform` of each transformer in the pipeline. The transformed\n data are finally passed to the final estimator that calls\n `score` method. Only valid if the final estimator implements `score`.\n\n Parameters\n ----------\n X : iterable\n Data to predict on. Must fulfill input requirements of first step\n of the pipeline.\n\n y : iterable, default=None\n Targets used for scoring. 
Must fulfill label requirements for all\n steps of the pipeline.\n\n sample_weight : array-like, default=None\n If not None, this argument is passed as ``sample_weight`` keyword\n argument to the ``score`` method of the final estimator.\n\n Returns\n -------\n score : float\n Result of calling `score` on the final estimator.\n \"\"\"\n Xt = X\n for (_, name, transform) in self._iter(with_final=False):\n Xt = transform.transform(Xt)\n score_params = {}\n if sample_weight is not None:\n score_params['sample_weight'] = sample_weight\n return self.steps[-1][1].score(Xt, y, **score_params)" }, { @@ -149566,7 +161188,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -149576,13 +161199,14 @@ "docstring": { "type": "iterable", "description": "Data to predict on. Must fulfill input requirements of first step\nof the pipeline." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Transform the data, and apply `score_samples` with the final estimator.\n\nCall `transform` of each transformer in the pipeline. The transformed data are finally passed to the final estimator that calls `score_samples` method. Only valid if the final estimator implements `score_samples`.", - "docstring": "Transform the data, and apply `score_samples` with the final estimator.\n\nCall `transform` of each transformer in the pipeline. The transformed\ndata are finally passed to the final estimator that calls\n`score_samples` method. Only valid if the final estimator implements\n`score_samples`.\n\nParameters\n----------\nX : iterable\n Data to predict on. Must fulfill input requirements of first step\n of the pipeline.\n\nReturns\n-------\ny_score : ndarray of shape (n_samples,)\n Result of calling `score_samples` on the final estimator.", + "description": "Transform the data, and apply `score_samples` with the final estimator.\n\nCall `transform` of each transformer in the pipeline. The transformed\ndata are finally passed to the final estimator that calls\n`score_samples` method. Only valid if the final estimator implements\n`score_samples`.", + "docstring": "Transform the data, and apply `score_samples` with the final estimator.\n\n Call `transform` of each transformer in the pipeline. The transformed\n data are finally passed to the final estimator that calls\n `score_samples` method. Only valid if the final estimator implements\n `score_samples`.\n\n Parameters\n ----------\n X : iterable\n Data to predict on. Must fulfill input requirements of first step\n of the pipeline.\n\n Returns\n -------\n y_score : ndarray of shape (n_samples,)\n Result of calling `score_samples` on the final estimator.\n ", "source_code": "\n@available_if(_final_estimator_has('score_samples'))\ndef score_samples(self, X):\n \"\"\"Transform the data, and apply `score_samples` with the final estimator.\n\n Call `transform` of each transformer in the pipeline. The transformed\n data are finally passed to the final estimator that calls\n `score_samples` method. Only valid if the final estimator implements\n `score_samples`.\n\n Parameters\n ----------\n X : iterable\n Data to predict on. 
Must fulfill input requirements of first step\n of the pipeline.\n\n Returns\n -------\n y_score : ndarray of shape (n_samples,)\n Result of calling `score_samples` on the final estimator.\n \"\"\"\n Xt = X\n for (_, _, transformer) in self._iter(with_final=False):\n Xt = transformer.transform(Xt)\n return self.steps[-1][1].score_samples(Xt)" }, { @@ -149600,13 +161224,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Set the parameters of this estimator.\n\nValid parameter keys can be listed with ``get_params()``. Note that you can directly set the parameters of the estimators contained in `steps`.", - "docstring": "Set the parameters of this estimator.\n\nValid parameter keys can be listed with ``get_params()``. Note that\nyou can directly set the parameters of the estimators contained in\n`steps`.\n\nParameters\n----------\n**kwargs : dict\n Parameters of this estimator or parameters of estimators contained\n in `steps`. Parameters of the steps may be set using its name and\n the parameter name separated by a '__'.\n\nReturns\n-------\nself : object\n Pipeline class instance.", + "description": "Set the parameters of this estimator.\n\nValid parameter keys can be listed with ``get_params()``. Note that\nyou can directly set the parameters of the estimators contained in\n`steps`.", + "docstring": "Set the parameters of this estimator.\n\n Valid parameter keys can be listed with ``get_params()``. Note that\n you can directly set the parameters of the estimators contained in\n `steps`.\n\n Parameters\n ----------\n **kwargs : dict\n Parameters of this estimator or parameters of estimators contained\n in `steps`. Parameters of the steps may be set using its name and\n the parameter name separated by a '__'.\n\n Returns\n -------\n self : object\n Pipeline class instance.\n ", "source_code": "\ndef set_params(self, **kwargs):\n \"\"\"Set the parameters of this estimator.\n\n Valid parameter keys can be listed with ``get_params()``. Note that\n you can directly set the parameters of the estimators contained in\n `steps`.\n\n Parameters\n ----------\n **kwargs : dict\n Parameters of this estimator or parameters of estimators contained\n in `steps`. Parameters of the steps may be set using its name and\n the parameter name separated by a '__'.\n\n Returns\n -------\n self : object\n Pipeline class instance.\n \"\"\"\n self._set_params('steps', **kwargs)\n return self" }, { @@ -149624,7 +161249,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -149634,13 +161260,14 @@ "docstring": { "type": "iterable", "description": "Data to transform. Must fulfill input requirements of first step\nof the pipeline." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Transform the data, and apply `transform` with the final estimator.\n\nCall `transform` of each transformer in the pipeline. The transformed data are finally passed to the final estimator that calls `transform` method. Only valid if the final estimator implements `transform`. This also works where final estimator is `None` in which case all prior transformations are applied.", - "docstring": "Transform the data, and apply `transform` with the final estimator.\n\nCall `transform` of each transformer in the pipeline. The transformed\ndata are finally passed to the final estimator that calls\n`transform` method. 
Only valid if the final estimator\nimplements `transform`.\n\nThis also works where final estimator is `None` in which case all prior\ntransformations are applied.\n\nParameters\n----------\nX : iterable\n Data to transform. Must fulfill input requirements of first step\n of the pipeline.\n\nReturns\n-------\nXt : ndarray of shape (n_samples, n_transformed_features)\n Transformed data.", + "description": "Transform the data, and apply `transform` with the final estimator.\n\nCall `transform` of each transformer in the pipeline. The transformed\ndata are finally passed to the final estimator that calls\n`transform` method. Only valid if the final estimator\nimplements `transform`.\n\nThis also works where final estimator is `None` in which case all prior\ntransformations are applied.", + "docstring": "Transform the data, and apply `transform` with the final estimator.\n\n Call `transform` of each transformer in the pipeline. The transformed\n data are finally passed to the final estimator that calls\n `transform` method. Only valid if the final estimator\n implements `transform`.\n\n This also works where final estimator is `None` in which case all prior\n transformations are applied.\n\n Parameters\n ----------\n X : iterable\n Data to transform. Must fulfill input requirements of first step\n of the pipeline.\n\n Returns\n -------\n Xt : ndarray of shape (n_samples, n_transformed_features)\n Transformed data.\n ", "source_code": "\n@available_if(_can_transform)\ndef transform(self, X):\n \"\"\"Transform the data, and apply `transform` with the final estimator.\n\n Call `transform` of each transformer in the pipeline. The transformed\n data are finally passed to the final estimator that calls\n `transform` method. Only valid if the final estimator\n implements `transform`.\n\n This also works where final estimator is `None` in which case all prior\n transformations are applied.\n\n Parameters\n ----------\n X : iterable\n Data to transform. 
Must fulfill input requirements of first step\n of the pipeline.\n\n Returns\n -------\n Xt : ndarray of shape (n_samples, n_transformed_features)\n Transformed data.\n \"\"\"\n Xt = X\n for (_, _, transform) in self._iter():\n Xt = transform.transform(Xt)\n return Xt" }, { @@ -149658,13 +161285,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Check that final_estimator has `attr`.\n\nUsed together with `avaliable_if` in `Pipeline`.", - "docstring": "Check that final_estimator has `attr`.\n\nUsed together with `avaliable_if` in `Pipeline`.", + "docstring": "Check that final_estimator has `attr`.\n\n Used together with `avaliable_if` in `Pipeline`.", "source_code": "\ndef _final_estimator_has(attr):\n \"\"\"Check that final_estimator has `attr`.\n\n Used together with `avaliable_if` in `Pipeline`.\"\"\"\n \n def check(self):\n getattr(self._final_estimator, attr)\n return True\n return check" }, { @@ -149682,7 +161310,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -149692,7 +161321,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -149702,7 +161332,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "weight", @@ -149712,7 +161343,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "message_clsname", @@ -149722,7 +161354,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "message", @@ -149732,13 +161365,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Fits ``transformer`` to ``X`` and ``y``.", - "docstring": "Fits ``transformer`` to ``X`` and ``y``.", + "docstring": "\n Fits ``transformer`` to ``X`` and ``y``.\n ", "source_code": "\ndef _fit_one(transformer, X, y, weight, message_clsname='', message=None, **fit_params):\n \"\"\"\n Fits ``transformer`` to ``X`` and ``y``.\n \"\"\"\n with _print_elapsed_time(message_clsname, message):\n return transformer.fit(X, y, **fit_params)" }, { @@ -149756,7 +161390,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -149766,7 +161401,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -149776,7 +161412,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "weight", @@ -149786,7 +161423,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "message_clsname", @@ -149796,7 +161434,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "message", @@ -149806,13 +161445,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Fits ``transformer`` to ``X`` and ``y``. The transformed result is returned with the fitted transformer. If ``weight`` is not ``None``, the result will be multiplied by ``weight``.", - "docstring": "Fits ``transformer`` to ``X`` and ``y``. The transformed result is returned\nwith the fitted transformer. If ``weight`` is not ``None``, the result will\nbe multiplied by ``weight``.", + "description": "Fits ``transformer`` to ``X`` and ``y``. The transformed result is returned\nwith the fitted transformer. 
If ``weight`` is not ``None``, the result will\nbe multiplied by ``weight``.", + "docstring": "\n Fits ``transformer`` to ``X`` and ``y``. The transformed result is returned\n with the fitted transformer. If ``weight`` is not ``None``, the result will\n be multiplied by ``weight``.\n ", "source_code": "\ndef _fit_transform_one(transformer, X, y, weight, message_clsname='', message=None, **fit_params):\n \"\"\"\n Fits ``transformer`` to ``X`` and ``y``. The transformed result is returned\n with the fitted transformer. If ``weight`` is not ``None``, the result will\n be multiplied by ``weight``.\n \"\"\"\n with _print_elapsed_time(message_clsname, message):\n if hasattr(transformer, 'fit_transform'):\n res = transformer.fit_transform(X, y, **fit_params)\n else:\n res = transformer.fit(X, y, **fit_params).transform(X)\n if weight is None:\n return res, transformer\n return res * weight, transformer" }, { @@ -149830,7 +161470,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -149854,7 +161495,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -149864,7 +161506,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -149874,7 +161517,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "weight", @@ -149884,13 +161528,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _transform_one(transformer, X, y, weight, **fit_params):\n res = transformer.transform(X)\n if weight is None:\n return res\n return res * weight" }, { @@ -149908,7 +161553,8 @@ "docstring": { "type": "str or object with the joblib.Memory interface, default=None", "description": "Used to cache the fitted transformers of the pipeline. By default,\nno caching is performed. If a string is given, it is the path to\nthe caching directory. Enabling caching triggers a clone of\nthe transformers before fitting. Therefore, the transformer\ninstance given to the pipeline cannot be inspected\ndirectly. Use the attribute ``named_steps`` or ``steps`` to\ninspect estimators within the pipeline. Caching the\ntransformers is advantageous when fitting is time consuming." - } + }, + "refined_type": {} }, { "name": "verbose", @@ -149918,13 +161564,14 @@ "docstring": { "type": "bool, default=False", "description": "If True, the time elapsed while fitting each step will be printed as it\nis completed." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Construct a :class:`Pipeline` from the given estimators.\n\nThis is a shorthand for the :class:`Pipeline` constructor; it does not require, and does not permit, naming the estimators. Instead, their names will be set to the lowercase of their types automatically.", - "docstring": "Construct a :class:`Pipeline` from the given estimators.\n\nThis is a shorthand for the :class:`Pipeline` constructor; it does not\nrequire, and does not permit, naming the estimators. Instead, their names\nwill be set to the lowercase of their types automatically.\n\nParameters\n----------\n*steps : list of Estimator objects\n List of the scikit-learn estimators that are chained together.\n\nmemory : str or object with the joblib.Memory interface, default=None\n Used to cache the fitted transformers of the pipeline. By default,\n no caching is performed. 
If a string is given, it is the path to\n the caching directory. Enabling caching triggers a clone of\n the transformers before fitting. Therefore, the transformer\n instance given to the pipeline cannot be inspected\n directly. Use the attribute ``named_steps`` or ``steps`` to\n inspect estimators within the pipeline. Caching the\n transformers is advantageous when fitting is time consuming.\n\nverbose : bool, default=False\n If True, the time elapsed while fitting each step will be printed as it\n is completed.\n\nReturns\n-------\np : Pipeline\n Returns a scikit-learn :class:`Pipeline` object.\n\nSee Also\n--------\nPipeline : Class for creating a pipeline of transforms with a final\n estimator.\n\nExamples\n--------\n>>> from sklearn.naive_bayes import GaussianNB\n>>> from sklearn.preprocessing import StandardScaler\n>>> from sklearn.pipeline import make_pipeline\n>>> make_pipeline(StandardScaler(), GaussianNB(priors=None))\nPipeline(steps=[('standardscaler', StandardScaler()),\n ('gaussiannb', GaussianNB())])", + "description": "Construct a :class:`Pipeline` from the given estimators.\n\nThis is a shorthand for the :class:`Pipeline` constructor; it does not\nrequire, and does not permit, naming the estimators. Instead, their names\nwill be set to the lowercase of their types automatically.", + "docstring": "Construct a :class:`Pipeline` from the given estimators.\n\n This is a shorthand for the :class:`Pipeline` constructor; it does not\n require, and does not permit, naming the estimators. Instead, their names\n will be set to the lowercase of their types automatically.\n\n Parameters\n ----------\n *steps : list of Estimator objects\n List of the scikit-learn estimators that are chained together.\n\n memory : str or object with the joblib.Memory interface, default=None\n Used to cache the fitted transformers of the pipeline. By default,\n no caching is performed. If a string is given, it is the path to\n the caching directory. Enabling caching triggers a clone of\n the transformers before fitting. Therefore, the transformer\n instance given to the pipeline cannot be inspected\n directly. Use the attribute ``named_steps`` or ``steps`` to\n inspect estimators within the pipeline. Caching the\n transformers is advantageous when fitting is time consuming.\n\n verbose : bool, default=False\n If True, the time elapsed while fitting each step will be printed as it\n is completed.\n\n Returns\n -------\n p : Pipeline\n Returns a scikit-learn :class:`Pipeline` object.\n\n See Also\n --------\n Pipeline : Class for creating a pipeline of transforms with a final\n estimator.\n\n Examples\n --------\n >>> from sklearn.naive_bayes import GaussianNB\n >>> from sklearn.preprocessing import StandardScaler\n >>> from sklearn.pipeline import make_pipeline\n >>> make_pipeline(StandardScaler(), GaussianNB(priors=None))\n Pipeline(steps=[('standardscaler', StandardScaler()),\n ('gaussiannb', GaussianNB())])\n ", "source_code": "\ndef make_pipeline(*steps, memory=None, verbose=False):\n \"\"\"Construct a :class:`Pipeline` from the given estimators.\n\n This is a shorthand for the :class:`Pipeline` constructor; it does not\n require, and does not permit, naming the estimators. 
Instead, their names\n will be set to the lowercase of their types automatically.\n\n Parameters\n ----------\n *steps : list of Estimator objects\n List of the scikit-learn estimators that are chained together.\n\n memory : str or object with the joblib.Memory interface, default=None\n Used to cache the fitted transformers of the pipeline. By default,\n no caching is performed. If a string is given, it is the path to\n the caching directory. Enabling caching triggers a clone of\n the transformers before fitting. Therefore, the transformer\n instance given to the pipeline cannot be inspected\n directly. Use the attribute ``named_steps`` or ``steps`` to\n inspect estimators within the pipeline. Caching the\n transformers is advantageous when fitting is time consuming.\n\n verbose : bool, default=False\n If True, the time elapsed while fitting each step will be printed as it\n is completed.\n\n Returns\n -------\n p : Pipeline\n Returns a scikit-learn :class:`Pipeline` object.\n\n See Also\n --------\n Pipeline : Class for creating a pipeline of transforms with a final\n estimator.\n\n Examples\n --------\n >>> from sklearn.naive_bayes import GaussianNB\n >>> from sklearn.preprocessing import StandardScaler\n >>> from sklearn.pipeline import make_pipeline\n >>> make_pipeline(StandardScaler(), GaussianNB(priors=None))\n Pipeline(steps=[('standardscaler', StandardScaler()),\n ('gaussiannb', GaussianNB())])\n \"\"\"\n return Pipeline(_name_estimators(steps), memory=memory, verbose=verbose)" }, { @@ -149942,7 +161589,8 @@ "docstring": { "type": "int, default=None", "description": "Number of jobs to run in parallel.\n``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n``-1`` means using all processors. See :term:`Glossary `\nfor more details.\n\n.. versionchanged:: v0.20\n `n_jobs` default changed from 1 to None" - } + }, + "refined_type": {} }, { "name": "verbose", @@ -149952,13 +161600,14 @@ "docstring": { "type": "bool, default=False", "description": "If True, the time elapsed while fitting each transformer will be\nprinted as it is completed." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Construct a FeatureUnion from the given transformers.\n\nThis is a shorthand for the FeatureUnion constructor; it does not require, and does not permit, naming the transformers. Instead, they will be given names automatically based on their types. It also does not allow weighting.", - "docstring": "Construct a FeatureUnion from the given transformers.\n\nThis is a shorthand for the FeatureUnion constructor; it does not require,\nand does not permit, naming the transformers. Instead, they will be given\nnames automatically based on their types. It also does not allow weighting.\n\nParameters\n----------\n*transformers : list of estimators\n\nn_jobs : int, default=None\n Number of jobs to run in parallel.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n .. 
versionchanged:: v0.20\n `n_jobs` default changed from 1 to None\n\nverbose : bool, default=False\n If True, the time elapsed while fitting each transformer will be\n printed as it is completed.\n\nReturns\n-------\nf : FeatureUnion\n\nSee Also\n--------\nFeatureUnion : Class for concatenating the results of multiple transformer\n objects.\n\nExamples\n--------\n>>> from sklearn.decomposition import PCA, TruncatedSVD\n>>> from sklearn.pipeline import make_union\n>>> make_union(PCA(), TruncatedSVD())\n FeatureUnion(transformer_list=[('pca', PCA()),\n ('truncatedsvd', TruncatedSVD())])", + "description": "Construct a FeatureUnion from the given transformers.\n\nThis is a shorthand for the FeatureUnion constructor; it does not require,\nand does not permit, naming the transformers. Instead, they will be given\nnames automatically based on their types. It also does not allow weighting.", + "docstring": "\n Construct a FeatureUnion from the given transformers.\n\n This is a shorthand for the FeatureUnion constructor; it does not require,\n and does not permit, naming the transformers. Instead, they will be given\n names automatically based on their types. It also does not allow weighting.\n\n Parameters\n ----------\n *transformers : list of estimators\n\n n_jobs : int, default=None\n Number of jobs to run in parallel.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n .. versionchanged:: v0.20\n `n_jobs` default changed from 1 to None\n\n verbose : bool, default=False\n If True, the time elapsed while fitting each transformer will be\n printed as it is completed.\n\n Returns\n -------\n f : FeatureUnion\n\n See Also\n --------\n FeatureUnion : Class for concatenating the results of multiple transformer\n objects.\n\n Examples\n --------\n >>> from sklearn.decomposition import PCA, TruncatedSVD\n >>> from sklearn.pipeline import make_union\n >>> make_union(PCA(), TruncatedSVD())\n FeatureUnion(transformer_list=[('pca', PCA()),\n ('truncatedsvd', TruncatedSVD())])\n ", "source_code": "\ndef make_union(*transformers, n_jobs=None, verbose=False):\n \"\"\"\n Construct a FeatureUnion from the given transformers.\n\n This is a shorthand for the FeatureUnion constructor; it does not require,\n and does not permit, naming the transformers. Instead, they will be given\n names automatically based on their types. It also does not allow weighting.\n\n Parameters\n ----------\n *transformers : list of estimators\n\n n_jobs : int, default=None\n Number of jobs to run in parallel.\n ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n ``-1`` means using all processors. See :term:`Glossary `\n for more details.\n\n .. 
versionchanged:: v0.20\n `n_jobs` default changed from 1 to None\n\n verbose : bool, default=False\n If True, the time elapsed while fitting each transformer will be\n printed as it is completed.\n\n Returns\n -------\n f : FeatureUnion\n\n See Also\n --------\n FeatureUnion : Class for concatenating the results of multiple transformer\n objects.\n\n Examples\n --------\n >>> from sklearn.decomposition import PCA, TruncatedSVD\n >>> from sklearn.pipeline import make_union\n >>> make_union(PCA(), TruncatedSVD())\n FeatureUnion(transformer_list=[('pca', PCA()),\n ('truncatedsvd', TruncatedSVD())])\n \"\"\"\n return FeatureUnion(_name_estimators(transformers), n_jobs=n_jobs, verbose=verbose)" }, { @@ -149976,7 +161625,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "threshold", @@ -149986,7 +161636,8 @@ "docstring": { "type": "float, default=0.0", "description": "Feature values below or equal to this are replaced by 0, above it by 1.\nThreshold may not be less than 0 for operations on sparse matrices." - } + }, + "refined_type": {} }, { "name": "copy", @@ -149996,13 +161647,14 @@ "docstring": { "type": "bool, default=True", "description": "Set to False to perform inplace binarization and avoid a copy (if\nthe input is already a numpy array or a scipy.sparse CSR matrix)." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, *, threshold=0.0, copy=True):\n self.threshold = threshold\n self.copy = copy" }, { @@ -150020,13 +161672,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _more_tags(self):\n return {'stateless': True}" }, { @@ -150044,7 +161697,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -150054,6 +161708,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "The data." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -150064,13 +161722,14 @@ "docstring": { "type": "None", "description": "Ignored." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Do nothing and return the estimator unchanged.\n\nThis method is just there to implement the usual API and hence work in pipelines.", - "docstring": "Do nothing and return the estimator unchanged.\n\nThis method is just there to implement the usual API and hence\nwork in pipelines.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n The data.\n\ny : None\n Ignored.\n\nReturns\n-------\nself : object\n Fitted transformer.", + "description": "Do nothing and return the estimator unchanged.\n\nThis method is just there to implement the usual API and hence\nwork in pipelines.", + "docstring": "Do nothing and return the estimator unchanged.\n\n This method is just there to implement the usual API and hence\n work in pipelines.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The data.\n\n y : None\n Ignored.\n\n Returns\n -------\n self : object\n Fitted transformer.\n ", "source_code": "\ndef fit(self, X, y=None):\n \"\"\"Do nothing and return the estimator unchanged.\n\n This method is just there to implement the usual API and hence\n work in pipelines.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The data.\n\n y : None\n Ignored.\n\n Returns\n -------\n self : object\n Fitted transformer.\n \"\"\"\n self._validate_data(X, accept_sparse='csr')\n return self" }, { @@ -150088,7 +161747,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -150098,6 +161758,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "The data to binarize, element by element.\nscipy.sparse matrices should be in CSR format to avoid an\nun-necessary copy." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -150108,13 +161772,14 @@ "docstring": { "type": "bool", "description": "Copy the input X or not." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Binarize each element of X.", - "docstring": "Binarize each element of X.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n The data to binarize, element by element.\n scipy.sparse matrices should be in CSR format to avoid an\n un-necessary copy.\n\ncopy : bool\n Copy the input X or not.\n\nReturns\n-------\nX_tr : {ndarray, sparse matrix} of shape (n_samples, n_features)\n Transformed array.", + "docstring": "Binarize each element of X.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The data to binarize, element by element.\n scipy.sparse matrices should be in CSR format to avoid an\n un-necessary copy.\n\n copy : bool\n Copy the input X or not.\n\n Returns\n -------\n X_tr : {ndarray, sparse matrix} of shape (n_samples, n_features)\n Transformed array.\n ", "source_code": "\ndef transform(self, X, copy=None):\n \"\"\"Binarize each element of X.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The data to binarize, element by element.\n scipy.sparse matrices should be in CSR format to avoid an\n un-necessary copy.\n\n copy : bool\n Copy the input X or not.\n\n Returns\n -------\n X_tr : {ndarray, sparse matrix} of shape (n_samples, n_features)\n Transformed array.\n \"\"\"\n copy = copy if copy is not None else self.copy\n X = self._validate_data(X, accept_sparse=['csr', 'csc'], copy=copy, reset=False)\n return binarize(X, threshold=self.threshold, copy=False)" }, { @@ -150132,13 +161797,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self):\n pass" }, { @@ -150156,13 +161822,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _more_tags(self):\n return {'pairwise': True}" }, { @@ -150183,13 +161850,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@deprecated('Attribute `_pairwise` was deprecated in version 0.24 and will be removed in 1.1.')\n@property\ndef _pairwise(self):\n return True" }, { @@ -150207,7 +161875,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "K", @@ -150217,7 +161886,8 @@ "docstring": { "type": "ndarray of shape (n_samples, n_samples)", "description": "Kernel matrix." - } + }, + "refined_type": {} }, { "name": "y", @@ -150227,13 +161897,14 @@ "docstring": { "type": "None", "description": "Ignored." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Fit KernelCenterer.", - "docstring": "Fit KernelCenterer.\n\nParameters\n----------\nK : ndarray of shape (n_samples, n_samples)\n Kernel matrix.\n\ny : None\n Ignored.\n\nReturns\n-------\nself : object\n Returns the instance itself.", + "docstring": "Fit KernelCenterer.\n\n Parameters\n ----------\n K : ndarray of shape (n_samples, n_samples)\n Kernel matrix.\n\n y : None\n Ignored.\n\n Returns\n -------\n self : object\n Returns the instance itself.\n ", "source_code": "\ndef fit(self, K, y=None):\n \"\"\"Fit KernelCenterer.\n\n Parameters\n ----------\n K : ndarray of shape (n_samples, n_samples)\n Kernel matrix.\n\n y : None\n Ignored.\n\n Returns\n -------\n self : object\n Returns the instance itself.\n \"\"\"\n K = self._validate_data(K, dtype=FLOAT_DTYPES)\n if K.shape[0] != K.shape[1]:\n raise ValueError('Kernel matrix must be a square matrix. Input is a {}x{} matrix.'.format(K.shape[0], K.shape[1]))\n n_samples = K.shape[0]\n self.K_fit_rows_ = np.sum(K, axis=0) / n_samples\n self.K_fit_all_ = self.K_fit_rows_.sum() / n_samples\n return self" }, { @@ -150251,7 +161922,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "K", @@ -150261,7 +161933,8 @@ "docstring": { "type": "ndarray of shape (n_samples1, n_samples2)", "description": "Kernel matrix." - } + }, + "refined_type": {} }, { "name": "copy", @@ -150271,13 +161944,14 @@ "docstring": { "type": "bool, default=True", "description": "Set to False to perform inplace computation." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Center kernel matrix.", - "docstring": "Center kernel matrix.\n\nParameters\n----------\nK : ndarray of shape (n_samples1, n_samples2)\n Kernel matrix.\n\ncopy : bool, default=True\n Set to False to perform inplace computation.\n\nReturns\n-------\nK_new : ndarray of shape (n_samples1, n_samples2)\n Returns the instance itself.", + "docstring": "Center kernel matrix.\n\n Parameters\n ----------\n K : ndarray of shape (n_samples1, n_samples2)\n Kernel matrix.\n\n copy : bool, default=True\n Set to False to perform inplace computation.\n\n Returns\n -------\n K_new : ndarray of shape (n_samples1, n_samples2)\n Returns the instance itself.\n ", "source_code": "\ndef transform(self, K, copy=True):\n \"\"\"Center kernel matrix.\n\n Parameters\n ----------\n K : ndarray of shape (n_samples1, n_samples2)\n Kernel matrix.\n\n copy : bool, default=True\n Set to False to perform inplace computation.\n\n Returns\n -------\n K_new : ndarray of shape (n_samples1, n_samples2)\n Returns the instance itself.\n \"\"\"\n check_is_fitted(self)\n K = self._validate_data(K, copy=copy, dtype=FLOAT_DTYPES, reset=False)\n K_pred_cols = (np.sum(K, axis=1) / self.K_fit_rows_.shape[0])[:, np.newaxis]\n K -= self.K_fit_rows_\n K -= K_pred_cols\n K += self.K_fit_all_\n return K" }, { @@ -150295,7 +161969,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "copy", @@ -150305,13 +161980,14 @@ "docstring": { "type": "bool, default=True", "description": "Set to False to perform inplace scaling and avoid a copy (if the input\nis already a numpy array)." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, *, copy=True):\n self.copy = copy" }, { @@ -150329,13 +162005,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _more_tags(self):\n return {'allow_nan': True}" }, { @@ -150353,13 +162030,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Reset internal data-dependent state of the scaler, if necessary.\n\n__init__ parameters are not touched.", - "docstring": "Reset internal data-dependent state of the scaler, if necessary.\n\n__init__ parameters are not touched.", + "docstring": "Reset internal data-dependent state of the scaler, if necessary.\n\n __init__ parameters are not touched.\n ", "source_code": "\ndef _reset(self):\n \"\"\"Reset internal data-dependent state of the scaler, if necessary.\n\n __init__ parameters are not touched.\n \"\"\"\n if hasattr(self, 'scale_'):\n del self.scale_\n del self.n_samples_seen_\n del self.max_abs_" }, { @@ -150377,7 +162055,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -150387,6 +162066,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "The data used to compute the per-feature minimum and maximum\nused for later scaling along the features axis." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -150397,13 +162080,14 @@ "docstring": { "type": "None", "description": "Ignored." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Compute the maximum absolute value to be used for later scaling.", - "docstring": "Compute the maximum absolute value to be used for later scaling.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n The data used to compute the per-feature minimum and maximum\n used for later scaling along the features axis.\n\ny : None\n Ignored.\n\nReturns\n-------\nself : object\n Fitted scaler.", + "docstring": "Compute the maximum absolute value to be used for later scaling.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The data used to compute the per-feature minimum and maximum\n used for later scaling along the features axis.\n\n y : None\n Ignored.\n\n Returns\n -------\n self : object\n Fitted scaler.\n ", "source_code": "\ndef fit(self, X, y=None):\n \"\"\"Compute the maximum absolute value to be used for later scaling.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The data used to compute the per-feature minimum and maximum\n used for later scaling along the features axis.\n\n y : None\n Ignored.\n\n Returns\n -------\n self : object\n Fitted scaler.\n \"\"\"\n self._reset()\n return self.partial_fit(X, y)" }, { @@ -150421,7 +162105,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -150431,13 +162116,17 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "The data that should be transformed back." 
+ }, + "refined_type": { + "kind": "EnumType", + "values": [] } } ], "results": [], "is_public": true, "description": "Scale back the data to the original representation.", - "docstring": "Scale back the data to the original representation.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n The data that should be transformed back.\n\nReturns\n-------\nX_tr : {ndarray, sparse matrix} of shape (n_samples, n_features)\n Transformed array.", + "docstring": "Scale back the data to the original representation.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The data that should be transformed back.\n\n Returns\n -------\n X_tr : {ndarray, sparse matrix} of shape (n_samples, n_features)\n Transformed array.\n ", "source_code": "\ndef inverse_transform(self, X):\n \"\"\"Scale back the data to the original representation.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The data that should be transformed back.\n\n Returns\n -------\n X_tr : {ndarray, sparse matrix} of shape (n_samples, n_features)\n Transformed array.\n \"\"\"\n check_is_fitted(self)\n X = check_array(X, accept_sparse=('csr', 'csc'), copy=self.copy, estimator=self, dtype=FLOAT_DTYPES, force_all_finite='allow-nan')\n if sparse.issparse(X):\n inplace_column_scale(X, self.scale_)\n else:\n X *= self.scale_\n return X" }, { @@ -150455,7 +162144,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -150465,6 +162155,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "The data used to compute the mean and standard deviation\nused for later scaling along the features axis." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -150475,13 +162169,14 @@ "docstring": { "type": "None", "description": "Ignored." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Online computation of max absolute value of X for later scaling.\n\nAll of X is processed as a single batch. This is intended for cases when :meth:`fit` is not feasible due to very large number of `n_samples` or because X is read from a continuous stream.", - "docstring": "Online computation of max absolute value of X for later scaling.\n\nAll of X is processed as a single batch. This is intended for cases\nwhen :meth:`fit` is not feasible due to very large number of\n`n_samples` or because X is read from a continuous stream.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n The data used to compute the mean and standard deviation\n used for later scaling along the features axis.\n\ny : None\n Ignored.\n\nReturns\n-------\nself : object\n Fitted scaler.", + "description": "Online computation of max absolute value of X for later scaling.\n\nAll of X is processed as a single batch. This is intended for cases\nwhen :meth:`fit` is not feasible due to very large number of\n`n_samples` or because X is read from a continuous stream.", + "docstring": "Online computation of max absolute value of X for later scaling.\n\n All of X is processed as a single batch. 
This is intended for cases\n when :meth:`fit` is not feasible due to very large number of\n `n_samples` or because X is read from a continuous stream.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The data used to compute the mean and standard deviation\n used for later scaling along the features axis.\n\n y : None\n Ignored.\n\n Returns\n -------\n self : object\n Fitted scaler.\n ", "source_code": "\ndef partial_fit(self, X, y=None):\n \"\"\"Online computation of max absolute value of X for later scaling.\n\n All of X is processed as a single batch. This is intended for cases\n when :meth:`fit` is not feasible due to very large number of\n `n_samples` or because X is read from a continuous stream.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The data used to compute the mean and standard deviation\n used for later scaling along the features axis.\n\n y : None\n Ignored.\n\n Returns\n -------\n self : object\n Fitted scaler.\n \"\"\"\n first_pass = not hasattr(self, 'n_samples_seen_')\n X = self._validate_data(X, reset=first_pass, accept_sparse=('csr', 'csc'), estimator=self, dtype=FLOAT_DTYPES, force_all_finite='allow-nan')\n if sparse.issparse(X):\n (mins, maxs) = min_max_axis(X, axis=0, ignore_nan=True)\n max_abs = np.maximum(np.abs(mins), np.abs(maxs))\n else:\n max_abs = np.nanmax(np.abs(X), axis=0)\n if first_pass:\n self.n_samples_seen_ = X.shape[0]\n else:\n max_abs = np.maximum(self.max_abs_, max_abs)\n self.n_samples_seen_ += X.shape[0]\n self.max_abs_ = max_abs\n self.scale_ = _handle_zeros_in_scale(max_abs, copy=True)\n return self" }, { @@ -150499,7 +162194,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -150509,13 +162205,17 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "The data that should be scaled." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } } ], "results": [], "is_public": true, "description": "Scale the data.", - "docstring": "Scale the data.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n The data that should be scaled.\n\nReturns\n-------\nX_tr : {ndarray, sparse matrix} of shape (n_samples, n_features)\n Transformed array.", + "docstring": "Scale the data.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The data that should be scaled.\n\n Returns\n -------\n X_tr : {ndarray, sparse matrix} of shape (n_samples, n_features)\n Transformed array.\n ", "source_code": "\ndef transform(self, X):\n \"\"\"Scale the data.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The data that should be scaled.\n\n Returns\n -------\n X_tr : {ndarray, sparse matrix} of shape (n_samples, n_features)\n Transformed array.\n \"\"\"\n check_is_fitted(self)\n X = self._validate_data(X, accept_sparse=('csr', 'csc'), copy=self.copy, reset=False, estimator=self, dtype=FLOAT_DTYPES, force_all_finite='allow-nan')\n if sparse.issparse(X):\n inplace_column_scale(X, 1.0 / self.scale_)\n else:\n X /= self.scale_\n return X" }, { @@ -150533,7 +162233,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "feature_range", @@ -150543,7 +162244,8 @@ "docstring": { "type": "tuple (min, max), default=(0, 1)", "description": "Desired range of transformed data." 
- } + }, + "refined_type": {} }, { "name": "copy", @@ -150553,7 +162255,8 @@ "docstring": { "type": "bool, default=True", "description": "Set to False to perform inplace row normalization and avoid a\ncopy (if the input is already a numpy array)." - } + }, + "refined_type": {} }, { "name": "clip", @@ -150563,13 +162266,14 @@ "docstring": { "type": "bool, default=False", "description": "Set to True to clip transformed values of held-out data to\nprovided `feature range`.\n\n.. versionadded:: 0.24" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, feature_range=(0, 1), *, copy=True, clip=False):\n self.feature_range = feature_range\n self.copy = copy\n self.clip = clip" }, { @@ -150587,13 +162291,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _more_tags(self):\n return {'allow_nan': True}" }, { @@ -150611,13 +162316,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Reset internal data-dependent state of the scaler, if necessary.\n\n__init__ parameters are not touched.", - "docstring": "Reset internal data-dependent state of the scaler, if necessary.\n\n__init__ parameters are not touched.", + "docstring": "Reset internal data-dependent state of the scaler, if necessary.\n\n __init__ parameters are not touched.\n ", "source_code": "\ndef _reset(self):\n \"\"\"Reset internal data-dependent state of the scaler, if necessary.\n\n __init__ parameters are not touched.\n \"\"\"\n if hasattr(self, 'scale_'):\n del self.scale_\n del self.min_\n del self.n_samples_seen_\n del self.data_min_\n del self.data_max_\n del self.data_range_" }, { @@ -150635,7 +162341,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -150645,7 +162352,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "The data used to compute the per-feature minimum and maximum\nused for later scaling along the features axis." - } + }, + "refined_type": {} }, { "name": "y", @@ -150655,13 +162363,14 @@ "docstring": { "type": "None", "description": "Ignored." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Compute the minimum and maximum to be used for later scaling.", - "docstring": "Compute the minimum and maximum to be used for later scaling.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n The data used to compute the per-feature minimum and maximum\n used for later scaling along the features axis.\n\ny : None\n Ignored.\n\nReturns\n-------\nself : object\n Fitted scaler.", + "docstring": "Compute the minimum and maximum to be used for later scaling.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The data used to compute the per-feature minimum and maximum\n used for later scaling along the features axis.\n\n y : None\n Ignored.\n\n Returns\n -------\n self : object\n Fitted scaler.\n ", "source_code": "\ndef fit(self, X, y=None):\n \"\"\"Compute the minimum and maximum to be used for later scaling.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The data used to compute the per-feature minimum and maximum\n used for later scaling along the features axis.\n\n y : None\n Ignored.\n\n Returns\n -------\n self : object\n Fitted scaler.\n \"\"\"\n self._reset()\n return self.partial_fit(X, y)" }, { @@ -150679,7 +162388,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -150689,13 +162399,14 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "Input data that will be transformed. It cannot be sparse." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Undo the scaling of X according to feature_range.", - "docstring": "Undo the scaling of X according to feature_range.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n Input data that will be transformed. It cannot be sparse.\n\nReturns\n-------\nXt : ndarray of shape (n_samples, n_features)\n Transformed data.", + "docstring": "Undo the scaling of X according to feature_range.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Input data that will be transformed. It cannot be sparse.\n\n Returns\n -------\n Xt : ndarray of shape (n_samples, n_features)\n Transformed data.\n ", "source_code": "\ndef inverse_transform(self, X):\n \"\"\"Undo the scaling of X according to feature_range.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Input data that will be transformed. It cannot be sparse.\n\n Returns\n -------\n Xt : ndarray of shape (n_samples, n_features)\n Transformed data.\n \"\"\"\n check_is_fitted(self)\n X = check_array(X, copy=self.copy, dtype=FLOAT_DTYPES, force_all_finite='allow-nan')\n X -= self.min_\n X /= self.scale_\n return X" }, { @@ -150713,7 +162424,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -150723,7 +162435,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "The data used to compute the mean and standard deviation\nused for later scaling along the features axis." - } + }, + "refined_type": {} }, { "name": "y", @@ -150733,13 +162446,14 @@ "docstring": { "type": "None", "description": "Ignored." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Online computation of min and max on X for later scaling.\n\nAll of X is processed as a single batch. 
This is intended for cases when :meth:`fit` is not feasible due to very large number of `n_samples` or because X is read from a continuous stream.", - "docstring": "Online computation of min and max on X for later scaling.\n\nAll of X is processed as a single batch. This is intended for cases\nwhen :meth:`fit` is not feasible due to very large number of\n`n_samples` or because X is read from a continuous stream.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n The data used to compute the mean and standard deviation\n used for later scaling along the features axis.\n\ny : None\n Ignored.\n\nReturns\n-------\nself : object\n Fitted scaler.", + "description": "Online computation of min and max on X for later scaling.\n\nAll of X is processed as a single batch. This is intended for cases\nwhen :meth:`fit` is not feasible due to very large number of\n`n_samples` or because X is read from a continuous stream.", + "docstring": "Online computation of min and max on X for later scaling.\n\n All of X is processed as a single batch. This is intended for cases\n when :meth:`fit` is not feasible due to very large number of\n `n_samples` or because X is read from a continuous stream.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The data used to compute the mean and standard deviation\n used for later scaling along the features axis.\n\n y : None\n Ignored.\n\n Returns\n -------\n self : object\n Fitted scaler.\n ", "source_code": "\ndef partial_fit(self, X, y=None):\n \"\"\"Online computation of min and max on X for later scaling.\n\n All of X is processed as a single batch. This is intended for cases\n when :meth:`fit` is not feasible due to very large number of\n `n_samples` or because X is read from a continuous stream.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The data used to compute the mean and standard deviation\n used for later scaling along the features axis.\n\n y : None\n Ignored.\n\n Returns\n -------\n self : object\n Fitted scaler.\n \"\"\"\n feature_range = self.feature_range\n if feature_range[0] >= feature_range[1]:\n raise ValueError('Minimum of desired feature range must be smaller than maximum. Got %s.' % str(feature_range))\n if sparse.issparse(X):\n raise TypeError('MinMaxScaler does not support sparse input. Consider using MaxAbsScaler instead.')\n first_pass = not hasattr(self, 'n_samples_seen_')\n X = self._validate_data(X, reset=first_pass, estimator=self, dtype=FLOAT_DTYPES, force_all_finite='allow-nan')\n data_min = np.nanmin(X, axis=0)\n data_max = np.nanmax(X, axis=0)\n if first_pass:\n self.n_samples_seen_ = X.shape[0]\n else:\n data_min = np.minimum(self.data_min_, data_min)\n data_max = np.maximum(self.data_max_, data_max)\n self.n_samples_seen_ += X.shape[0]\n data_range = data_max - data_min\n self.scale_ = (feature_range[1] - feature_range[0]) / _handle_zeros_in_scale(data_range, copy=True)\n self.min_ = feature_range[0] - data_min * self.scale_\n self.data_min_ = data_min\n self.data_max_ = data_max\n self.data_range_ = data_range\n return self" }, { @@ -150757,7 +162471,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -150767,13 +162482,14 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "Input data that will be transformed." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Scale features of X according to feature_range.", - "docstring": "Scale features of X according to feature_range.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n Input data that will be transformed.\n\nReturns\n-------\nXt : ndarray of shape (n_samples, n_features)\n Transformed data.", + "docstring": "Scale features of X according to feature_range.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Input data that will be transformed.\n\n Returns\n -------\n Xt : ndarray of shape (n_samples, n_features)\n Transformed data.\n ", "source_code": "\ndef transform(self, X):\n \"\"\"Scale features of X according to feature_range.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Input data that will be transformed.\n\n Returns\n -------\n Xt : ndarray of shape (n_samples, n_features)\n Transformed data.\n \"\"\"\n check_is_fitted(self)\n X = self._validate_data(X, copy=self.copy, dtype=FLOAT_DTYPES, force_all_finite='allow-nan', reset=False)\n X *= self.scale_\n X += self.min_\n if self.clip:\n np.clip(X, self.feature_range[0], self.feature_range[1], out=X)\n return X" }, { @@ -150791,7 +162507,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "norm", @@ -150801,6 +162518,10 @@ "docstring": { "type": "{'l1', 'l2', 'max'}, default='l2'", "description": "The norm to use to normalize each non zero sample. If norm='max'\nis used, values will be rescaled by the maximum of the absolute\nvalues." + }, + "refined_type": { + "kind": "EnumType", + "values": ["max", "l2", "l1"] } }, { @@ -150811,13 +162532,14 @@ "docstring": { "type": "bool, default=True", "description": "Set to False to perform inplace row normalization and avoid a\ncopy (if the input is already a numpy array or a scipy.sparse\nCSR matrix)." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, norm='l2', *, copy=True):\n self.norm = norm\n self.copy = copy" }, { @@ -150835,13 +162557,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _more_tags(self):\n return {'stateless': True}" }, { @@ -150859,7 +162582,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -150869,6 +162593,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "The data to estimate the normalization parameters." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -150879,13 +162607,14 @@ "docstring": { "type": "Ignored", "description": "Not used, present here for API consistency by convention." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Do nothing and return the estimator unchanged.\n\nThis method is just there to implement the usual API and hence work in pipelines.", - "docstring": "Do nothing and return the estimator unchanged.\n\nThis method is just there to implement the usual API and hence\nwork in pipelines.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n The data to estimate the normalization parameters.\n\ny : Ignored\n Not used, present here for API consistency by convention.\n\nReturns\n-------\nself : object\n Fitted transformer.", + "description": "Do nothing and return the estimator unchanged.\n\nThis method is just there to implement the usual API and hence\nwork in pipelines.", + "docstring": "Do nothing and return the estimator unchanged.\n\n This method is just there to implement the usual API and hence\n work in pipelines.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The data to estimate the normalization parameters.\n\n y : Ignored\n Not used, present here for API consistency by convention.\n\n Returns\n -------\n self : object\n Fitted transformer.\n ", "source_code": "\ndef fit(self, X, y=None):\n \"\"\"Do nothing and return the estimator unchanged.\n\n This method is just there to implement the usual API and hence\n work in pipelines.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The data to estimate the normalization parameters.\n\n y : Ignored\n Not used, present here for API consistency by convention.\n\n Returns\n -------\n self : object\n Fitted transformer.\n \"\"\"\n self._validate_data(X, accept_sparse='csr')\n return self" }, { @@ -150903,7 +162632,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -150913,6 +162643,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "The data to normalize, row by row. scipy.sparse matrices should be\nin CSR format to avoid an un-necessary copy." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -150923,13 +162657,14 @@ "docstring": { "type": "bool, default=None", "description": "Copy the input X or not." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Scale each non zero row of X to unit norm.", - "docstring": "Scale each non zero row of X to unit norm.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n The data to normalize, row by row. scipy.sparse matrices should be\n in CSR format to avoid an un-necessary copy.\n\ncopy : bool, default=None\n Copy the input X or not.\n\nReturns\n-------\nX_tr : {ndarray, sparse matrix} of shape (n_samples, n_features)\n Transformed array.", + "docstring": "Scale each non zero row of X to unit norm.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The data to normalize, row by row. 
scipy.sparse matrices should be\n in CSR format to avoid an un-necessary copy.\n\n copy : bool, default=None\n Copy the input X or not.\n\n Returns\n -------\n X_tr : {ndarray, sparse matrix} of shape (n_samples, n_features)\n Transformed array.\n ", "source_code": "\ndef transform(self, X, copy=None):\n \"\"\"Scale each non zero row of X to unit norm.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The data to normalize, row by row. scipy.sparse matrices should be\n in CSR format to avoid an un-necessary copy.\n\n copy : bool, default=None\n Copy the input X or not.\n\n Returns\n -------\n X_tr : {ndarray, sparse matrix} of shape (n_samples, n_features)\n Transformed array.\n \"\"\"\n copy = copy if copy is not None else self.copy\n X = self._validate_data(X, accept_sparse='csr', reset=False)\n return normalize(X, norm=self.norm, axis=1, copy=copy)" }, { @@ -150947,7 +162682,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "method", @@ -150957,6 +162693,10 @@ "docstring": { "type": "{'yeo-johnson', 'box-cox'}, default='yeo-johnson'", "description": "The power transform method. Available methods are:\n\n- 'yeo-johnson' [1]_, works with positive and negative values\n- 'box-cox' [2]_, only works with strictly positive values" + }, + "refined_type": { + "kind": "EnumType", + "values": ["box-cox", "yeo-johnson"] } }, { @@ -150967,7 +162707,8 @@ "docstring": { "type": "bool, default=True", "description": "Set to True to apply zero-mean, unit-variance normalization to the\ntransformed output." - } + }, + "refined_type": {} }, { "name": "copy", @@ -150977,13 +162718,14 @@ "docstring": { "type": "bool, default=True", "description": "Set to False to perform inplace computation during transformation." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, method='yeo-johnson', *, standardize=True, copy=True):\n self.method = method\n self.standardize = standardize\n self.copy = copy" }, { @@ -151001,7 +162743,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "x", @@ -151011,7 +162754,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "lmbda", @@ -151021,13 +162765,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Return inverse-transformed input x following Box-Cox inverse transform with parameter lambda.", - "docstring": "Return inverse-transformed input x following Box-Cox inverse\ntransform with parameter lambda.", + "description": "Return inverse-transformed input x following Box-Cox inverse\ntransform with parameter lambda.", + "docstring": "Return inverse-transformed input x following Box-Cox inverse\n transform with parameter lambda.\n ", "source_code": "\ndef _box_cox_inverse_tranform(self, x, lmbda):\n \"\"\"Return inverse-transformed input x following Box-Cox inverse\n transform with parameter lambda.\n \"\"\"\n if lmbda == 0:\n x_inv = np.exp(x)\n else:\n x_inv = (x * lmbda + 1)**(1 / lmbda)\n return x_inv" }, { @@ -151045,7 +162790,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "x", @@ -151055,13 +162801,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Find and return optimal lambda parameter of the Box-Cox transform by MLE, for observed data x.\n\nWe here use scipy builtins which uses the brent optimizer.", - "docstring": "Find and return optimal lambda parameter of the Box-Cox transform by\nMLE, for observed data x.\n\nWe here use scipy builtins which uses the brent optimizer.", + "description": "Find and return optimal lambda parameter of the Box-Cox transform by\nMLE, for observed data x.\n\nWe here use scipy builtins which uses the brent optimizer.", + "docstring": "Find and return optimal lambda parameter of the Box-Cox transform by\n MLE, for observed data x.\n\n We here use scipy builtins which uses the brent optimizer.\n ", "source_code": "\ndef _box_cox_optimize(self, x):\n \"\"\"Find and return optimal lambda parameter of the Box-Cox transform by\n MLE, for observed data x.\n\n We here use scipy builtins which uses the brent optimizer.\n \"\"\"\n (_, lmbda) = stats.boxcox(x[~np.isnan(x)], lmbda=None)\n return lmbda" }, { @@ -151079,7 +162826,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -151089,7 +162837,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "" - } + }, + "refined_type": {} }, { "name": "in_fit", @@ -151099,7 +162848,8 @@ "docstring": { "type": "bool", "description": "Whether or not `_check_input` is called from `fit` or other\nmethods, e.g. `predict`, `transform`, etc." - } + }, + "refined_type": {} }, { "name": "check_positive", @@ -151109,7 +162859,8 @@ "docstring": { "type": "bool, default=False", "description": "If True, check that all data is positive and non-zero (only if\n``self.method=='box-cox'``)." 
- } + }, + "refined_type": {} }, { "name": "check_shape", @@ -151119,7 +162870,8 @@ "docstring": { "type": "bool, default=False", "description": "If True, check that n_features matches the length of self.lambdas_" - } + }, + "refined_type": {} }, { "name": "check_method", @@ -151129,13 +162881,14 @@ "docstring": { "type": "bool, default=False", "description": "If True, check that the transformation method is valid." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Validate the input before fit and transform.", - "docstring": "Validate the input before fit and transform.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n\nin_fit : bool\n Whether or not `_check_input` is called from `fit` or other\n methods, e.g. `predict`, `transform`, etc.\n\ncheck_positive : bool, default=False\n If True, check that all data is positive and non-zero (only if\n ``self.method=='box-cox'``).\n\ncheck_shape : bool, default=False\n If True, check that n_features matches the length of self.lambdas_\n\ncheck_method : bool, default=False\n If True, check that the transformation method is valid.", + "docstring": "Validate the input before fit and transform.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n\n in_fit : bool\n Whether or not `_check_input` is called from `fit` or other\n methods, e.g. `predict`, `transform`, etc.\n\n check_positive : bool, default=False\n If True, check that all data is positive and non-zero (only if\n ``self.method=='box-cox'``).\n\n check_shape : bool, default=False\n If True, check that n_features matches the length of self.lambdas_\n\n check_method : bool, default=False\n If True, check that the transformation method is valid.\n ", "source_code": "\ndef _check_input(self, X, in_fit, check_positive=False, check_shape=False, check_method=False):\n \"\"\"Validate the input before fit and transform.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n\n in_fit : bool\n Whether or not `_check_input` is called from `fit` or other\n methods, e.g. `predict`, `transform`, etc.\n\n check_positive : bool, default=False\n If True, check that all data is positive and non-zero (only if\n ``self.method=='box-cox'``).\n\n check_shape : bool, default=False\n If True, check that n_features matches the length of self.lambdas_\n\n check_method : bool, default=False\n If True, check that the transformation method is valid.\n \"\"\"\n X = self._validate_data(X, ensure_2d=True, dtype=FLOAT_DTYPES, copy=self.copy, force_all_finite='allow-nan', reset=in_fit)\n with np.warnings.catch_warnings():\n np.warnings.filterwarnings('ignore', 'All-NaN (slice|axis) encountered')\n if check_positive and self.method == 'box-cox' and np.nanmin(X) <= 0:\n raise ValueError('The Box-Cox transformation can only be applied to strictly positive data')\n if check_shape and not X.shape[1] == len(self.lambdas_):\n raise ValueError('Input data has a different number of features than fitting data. 
Should have {n}, data has {m}'.format(n=len(self.lambdas_), m=X.shape[1]))\n valid_methods = ('box-cox', 'yeo-johnson')\n if check_method and self.method not in valid_methods:\n raise ValueError(\"'method' must be one of {}, got {} instead.\".format(valid_methods, self.method))\n return X" }, { @@ -151153,7 +162906,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -151163,7 +162917,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -151173,7 +162928,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "force_transform", @@ -151183,13 +162939,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _fit(self, X, y=None, force_transform=False):\n X = self._check_input(X, in_fit=True, check_positive=True, check_method=True)\n if not self.copy and not force_transform:\n X = X.copy()\n optim_function = {'box-cox': self._box_cox_optimize, 'yeo-johnson': self._yeo_johnson_optimize}[self.method]\n with np.errstate(invalid='ignore'):\n self.lambdas_ = np.array([optim_function(col) for col in X.T])\n if self.standardize or force_transform:\n transform_function = {'box-cox': boxcox, 'yeo-johnson': self._yeo_johnson_transform}[self.method]\n for (i, lmbda) in enumerate(self.lambdas_):\n with np.errstate(invalid='ignore'):\n X[:, i] = transform_function(X[:, i], lmbda)\n if self.standardize:\n self._scaler = StandardScaler(copy=False)\n if force_transform:\n X = self._scaler.fit_transform(X)\n else:\n self._scaler.fit(X)\n return X" }, { @@ -151207,13 +162964,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _more_tags(self):\n return {'allow_nan': True}" }, { @@ -151231,7 +162989,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "x", @@ -151241,7 +163000,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "lmbda", @@ -151251,13 +163011,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Return inverse-transformed input x following Yeo-Johnson inverse transform with parameter lambda.", - "docstring": "Return inverse-transformed input x following Yeo-Johnson inverse\ntransform with parameter lambda.", + "description": "Return inverse-transformed input x following Yeo-Johnson inverse\ntransform with parameter lambda.", + "docstring": "Return inverse-transformed input x following Yeo-Johnson inverse\n transform with parameter lambda.\n ", "source_code": "\ndef _yeo_johnson_inverse_transform(self, x, lmbda):\n \"\"\"Return inverse-transformed input x following Yeo-Johnson inverse\n transform with parameter lambda.\n \"\"\"\n x_inv = np.zeros_like(x)\n pos = x >= 0\n if abs(lmbda) < np.spacing(1.0):\n x_inv[pos] = np.exp(x[pos]) - 1\n else:\n x_inv[pos] = np.power(x[pos] * lmbda + 1, 1 / lmbda) - 1\n if abs(lmbda - 2) > np.spacing(1.0):\n x_inv[~pos] = 1 - np.power(-(2 - lmbda) * x[~pos] + 1, 1 / (2 - lmbda))\n else:\n x_inv[~pos] = 1 - np.exp(-x[~pos])\n return x_inv" }, { @@ -151275,7 +163036,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "x", @@ -151285,13 
+163047,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Find and return optimal lambda parameter of the Yeo-Johnson transform by MLE, for observed data x.\n\nLike for Box-Cox, MLE is done via the brent optimizer.", - "docstring": "Find and return optimal lambda parameter of the Yeo-Johnson\ntransform by MLE, for observed data x.\n\nLike for Box-Cox, MLE is done via the brent optimizer.", + "description": "Find and return optimal lambda parameter of the Yeo-Johnson\ntransform by MLE, for observed data x.\n\nLike for Box-Cox, MLE is done via the brent optimizer.", + "docstring": "Find and return optimal lambda parameter of the Yeo-Johnson\n transform by MLE, for observed data x.\n\n Like for Box-Cox, MLE is done via the brent optimizer.\n ", "source_code": "\ndef _yeo_johnson_optimize(self, x):\n \"\"\"Find and return optimal lambda parameter of the Yeo-Johnson\n transform by MLE, for observed data x.\n\n Like for Box-Cox, MLE is done via the brent optimizer.\n \"\"\"\n \n def _neg_log_likelihood(lmbda):\n \"\"\"Return the negative log likelihood of the observed data x as a\n function of lambda.\"\"\"\n x_trans = self._yeo_johnson_transform(x, lmbda)\n n_samples = x.shape[0]\n loglike = -n_samples / 2 * np.log(x_trans.var())\n loglike += (lmbda - 1) * (np.sign(x) * np.log1p(np.abs(x))).sum()\n return -loglike\n x = x[~np.isnan(x)]\n return optimize.brent(_neg_log_likelihood, brack=(-2, 2))" }, { @@ -151309,7 +163072,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "x", @@ -151319,7 +163083,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "lmbda", @@ -151329,13 +163094,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Return transformed input x following Yeo-Johnson transform with parameter lambda.", - "docstring": "Return transformed input x following Yeo-Johnson transform with\nparameter lambda.", + "description": "Return transformed input x following Yeo-Johnson transform with\nparameter lambda.", + "docstring": "Return transformed input x following Yeo-Johnson transform with\n parameter lambda.\n ", "source_code": "\ndef _yeo_johnson_transform(self, x, lmbda):\n \"\"\"Return transformed input x following Yeo-Johnson transform with\n parameter lambda.\n \"\"\"\n out = np.zeros_like(x)\n pos = x >= 0\n if abs(lmbda) < np.spacing(1.0):\n out[pos] = np.log1p(x[pos])\n else:\n out[pos] = (np.power(x[pos] + 1, lmbda) - 1) / lmbda\n if abs(lmbda - 2) > np.spacing(1.0):\n out[~pos] = -(np.power(-x[~pos] + 1, 2 - lmbda) - 1) / (2 - lmbda)\n else:\n out[~pos] = -np.log1p(-x[~pos])\n return out" }, { @@ -151353,7 +163119,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -151363,7 +163130,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "The data used to estimate the optimal transformation parameters." - } + }, + "refined_type": {} }, { "name": "y", @@ -151373,13 +163141,14 @@ "docstring": { "type": "None", "description": "Ignored." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Estimate the optimal parameter lambda for each feature.\n\nThe optimal lambda parameter for minimizing skewness is estimated on each feature independently using maximum likelihood.", - "docstring": "Estimate the optimal parameter lambda for each feature.\n\nThe optimal lambda parameter for minimizing skewness is estimated on\neach feature independently using maximum likelihood.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n The data used to estimate the optimal transformation parameters.\n\ny : None\n Ignored.\n\nReturns\n-------\nself : object\n Fitted transformer.", + "description": "Estimate the optimal parameter lambda for each feature.\n\nThe optimal lambda parameter for minimizing skewness is estimated on\neach feature independently using maximum likelihood.", + "docstring": "Estimate the optimal parameter lambda for each feature.\n\n The optimal lambda parameter for minimizing skewness is estimated on\n each feature independently using maximum likelihood.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The data used to estimate the optimal transformation parameters.\n\n y : None\n Ignored.\n\n Returns\n -------\n self : object\n Fitted transformer.\n ", "source_code": "\ndef fit(self, X, y=None):\n \"\"\"Estimate the optimal parameter lambda for each feature.\n\n The optimal lambda parameter for minimizing skewness is estimated on\n each feature independently using maximum likelihood.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The data used to estimate the optimal transformation parameters.\n\n y : None\n Ignored.\n\n Returns\n -------\n self : object\n Fitted transformer.\n \"\"\"\n self._fit(X, y=y, force_transform=False)\n return self" }, { @@ -151397,7 +163166,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -151407,7 +163177,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "The data used to estimate the optimal transformation parameters\nand to be transformed using a power transformation." - } + }, + "refined_type": {} }, { "name": "y", @@ -151417,13 +163188,14 @@ "docstring": { "type": "Ignored", "description": "Not used, present for API consistency by convention." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Fit `PowerTransformer` to `X`, then transform `X`.", - "docstring": "Fit `PowerTransformer` to `X`, then transform `X`.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n The data used to estimate the optimal transformation parameters\n and to be transformed using a power transformation.\n\ny : Ignored\n Not used, present for API consistency by convention.\n\nReturns\n-------\nX_new : ndarray of shape (n_samples, n_features)\n Transformed data.", + "docstring": "Fit `PowerTransformer` to `X`, then transform `X`.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The data used to estimate the optimal transformation parameters\n and to be transformed using a power transformation.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n Returns\n -------\n X_new : ndarray of shape (n_samples, n_features)\n Transformed data.\n ", "source_code": "\ndef fit_transform(self, X, y=None):\n \"\"\"Fit `PowerTransformer` to `X`, then transform `X`.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The data used to estimate the optimal transformation parameters\n and to be transformed using a power transformation.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n Returns\n -------\n X_new : ndarray of shape (n_samples, n_features)\n Transformed data.\n \"\"\"\n return self._fit(X, y, force_transform=True)" }, { @@ -151441,7 +163213,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -151451,13 +163224,14 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "The transformed data." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Apply the inverse power transformation using the fitted lambdas.\n\nThe inverse of the Box-Cox transformation is given by:: if lambda_ == 0: X = exp(X_trans) else: X = (X_trans * lambda_ + 1) ** (1 / lambda_) The inverse of the Yeo-Johnson transformation is given by:: if X >= 0 and lambda_ == 0: X = exp(X_trans) - 1 elif X >= 0 and lambda_ != 0: X = (X_trans * lambda_ + 1) ** (1 / lambda_) - 1 elif X < 0 and lambda_ != 2: X = 1 - (-(2 - lambda_) * X_trans + 1) ** (1 / (2 - lambda_)) elif X < 0 and lambda_ == 2: X = 1 - exp(-X_trans)", - "docstring": "Apply the inverse power transformation using the fitted lambdas.\n\nThe inverse of the Box-Cox transformation is given by::\n\n if lambda_ == 0:\n X = exp(X_trans)\n else:\n X = (X_trans * lambda_ + 1) ** (1 / lambda_)\n\nThe inverse of the Yeo-Johnson transformation is given by::\n\n if X >= 0 and lambda_ == 0:\n X = exp(X_trans) - 1\n elif X >= 0 and lambda_ != 0:\n X = (X_trans * lambda_ + 1) ** (1 / lambda_) - 1\n elif X < 0 and lambda_ != 2:\n X = 1 - (-(2 - lambda_) * X_trans + 1) ** (1 / (2 - lambda_))\n elif X < 0 and lambda_ == 2:\n X = 1 - exp(-X_trans)\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n The transformed data.\n\nReturns\n-------\nX : ndarray of shape (n_samples, n_features)\n The original data.", + "description": "Apply the inverse power transformation using the fitted lambdas.\n\nThe inverse of the Box-Cox transformation is given by::\n\n if lambda_ == 0:\n X = exp(X_trans)\n else:\n X = (X_trans * lambda_ + 1) ** (1 / lambda_)\n\nThe inverse of the Yeo-Johnson transformation is given by::\n\n if X >= 0 and lambda_ == 0:\n X = exp(X_trans) - 1\n elif X >= 0 and lambda_ != 0:\n X = (X_trans * lambda_ + 1) ** (1 / lambda_) - 1\n elif X < 0 and lambda_ != 2:\n X = 1 - (-(2 - lambda_) * X_trans + 1) ** (1 / (2 - lambda_))\n elif X < 0 and lambda_ == 2:\n X = 1 - exp(-X_trans)", + "docstring": "Apply the inverse power transformation using the fitted lambdas.\n\n The inverse of the Box-Cox transformation is given by::\n\n if lambda_ == 0:\n X = exp(X_trans)\n else:\n X = (X_trans * lambda_ + 1) ** (1 / lambda_)\n\n The inverse of the Yeo-Johnson transformation is given by::\n\n if X >= 0 and lambda_ == 0:\n X = exp(X_trans) - 1\n elif X >= 0 and lambda_ != 0:\n X = (X_trans * lambda_ + 1) ** (1 / lambda_) - 1\n elif X < 0 and lambda_ != 2:\n X = 1 - (-(2 - lambda_) * X_trans + 1) ** (1 / (2 - lambda_))\n elif X < 0 and lambda_ == 2:\n X = 1 - exp(-X_trans)\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The transformed data.\n\n Returns\n -------\n X : ndarray of shape (n_samples, n_features)\n The original data.\n ", "source_code": "\ndef inverse_transform(self, X):\n \"\"\"Apply the inverse power transformation using the fitted lambdas.\n\n The inverse of the Box-Cox transformation is given by::\n\n if lambda_ == 0:\n X = exp(X_trans)\n else:\n X = (X_trans * lambda_ + 1) ** (1 / lambda_)\n\n The inverse of the Yeo-Johnson transformation is given by::\n\n if X >= 0 and lambda_ == 0:\n X = exp(X_trans) - 1\n elif X >= 0 and lambda_ != 0:\n X = (X_trans * lambda_ + 1) ** (1 / lambda_) - 1\n elif X < 0 and lambda_ != 2:\n X = 1 - (-(2 - lambda_) * X_trans + 1) ** (1 / (2 - lambda_))\n elif X < 0 and lambda_ == 2:\n X = 1 - exp(-X_trans)\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The transformed data.\n\n Returns\n -------\n X : ndarray of shape 
(n_samples, n_features)\n The original data.\n \"\"\"\n check_is_fitted(self)\n X = self._check_input(X, in_fit=False, check_shape=True)\n if self.standardize:\n X = self._scaler.inverse_transform(X)\n inv_fun = {'box-cox': self._box_cox_inverse_tranform, 'yeo-johnson': self._yeo_johnson_inverse_transform}[self.method]\n for (i, lmbda) in enumerate(self.lambdas_):\n with np.errstate(invalid='ignore'):\n X[:, i] = inv_fun(X[:, i], lmbda)\n return X" }, { @@ -151475,7 +163249,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -151485,13 +163260,14 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "The data to be transformed using a power transformation." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Apply the power transform to each feature using the fitted lambdas.", - "docstring": "Apply the power transform to each feature using the fitted lambdas.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n The data to be transformed using a power transformation.\n\nReturns\n-------\nX_trans : ndarray of shape (n_samples, n_features)\n The transformed data.", + "docstring": "Apply the power transform to each feature using the fitted lambdas.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The data to be transformed using a power transformation.\n\n Returns\n -------\n X_trans : ndarray of shape (n_samples, n_features)\n The transformed data.\n ", "source_code": "\ndef transform(self, X):\n \"\"\"Apply the power transform to each feature using the fitted lambdas.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The data to be transformed using a power transformation.\n\n Returns\n -------\n X_trans : ndarray of shape (n_samples, n_features)\n The transformed data.\n \"\"\"\n check_is_fitted(self)\n X = self._check_input(X, in_fit=False, check_positive=True, check_shape=True)\n transform_function = {'box-cox': boxcox, 'yeo-johnson': self._yeo_johnson_transform}[self.method]\n for (i, lmbda) in enumerate(self.lambdas_):\n with np.errstate(invalid='ignore'):\n X[:, i] = transform_function(X[:, i], lmbda)\n if self.standardize:\n X = self._scaler.transform(X)\n return X" }, { @@ -151509,7 +163285,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_quantiles", @@ -151519,7 +163296,8 @@ "docstring": { "type": "int, default=1000 or n_samples", "description": "Number of quantiles to be computed. It corresponds to the number\nof landmarks used to discretize the cumulative distribution function.\nIf n_quantiles is larger than the number of samples, n_quantiles is set\nto the number of samples as a larger number of quantiles does not give\na better approximation of the cumulative distribution function\nestimator." - } + }, + "refined_type": {} }, { "name": "output_distribution", @@ -151529,6 +163307,10 @@ "docstring": { "type": "{'uniform', 'normal'}, default='uniform'", "description": "Marginal distribution for the transformed data. The choices are\n'uniform' (default) or 'normal'." + }, + "refined_type": { + "kind": "EnumType", + "values": ["uniform", "normal"] } }, { @@ -151539,7 +163321,8 @@ "docstring": { "type": "bool, default=False", "description": "Only applies to sparse matrices. If True, the sparse entries of the\nmatrix are discarded to compute the quantile statistics. If False,\nthese entries are treated as zeros." 
- } + }, + "refined_type": {} }, { "name": "subsample", @@ -151549,7 +163332,8 @@ "docstring": { "type": "int, default=1e5", "description": "Maximum number of samples used to estimate the quantiles for\ncomputational efficiency. Note that the subsampling procedure may\ndiffer for value-identical sparse and dense matrices." - } + }, + "refined_type": {} }, { "name": "random_state", @@ -151559,7 +163343,8 @@ "docstring": { "type": "int, RandomState instance or None, default=None", "description": "Determines random number generation for subsampling and smoothing\nnoise.\nPlease see ``subsample`` for more details.\nPass an int for reproducible results across multiple function calls.\nSee :term:`Glossary `." - } + }, + "refined_type": {} }, { "name": "copy", @@ -151569,13 +163354,14 @@ "docstring": { "type": "bool, default=True", "description": "Set to False to perform inplace transformation and avoid a copy (if the\ninput is already a numpy array)." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, *, n_quantiles=1000, output_distribution='uniform', ignore_implicit_zeros=False, subsample=int(100000.0), random_state=None, copy=True):\n self.n_quantiles = n_quantiles\n self.output_distribution = output_distribution\n self.ignore_implicit_zeros = ignore_implicit_zeros\n self.subsample = subsample\n self.random_state = random_state\n self.copy = copy" }, { @@ -151593,7 +163379,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -151603,7 +163390,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "in_fit", @@ -151613,7 +163401,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "accept_sparse_negative", @@ -151623,7 +163412,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "copy", @@ -151633,7 +163423,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -151657,7 +163448,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -151667,7 +163459,8 @@ "docstring": { "type": "ndarray of shape (n_samples, n_features)", "description": "The data used to scale along the features axis." - } + }, + "refined_type": {} }, { "name": "random_state", @@ -151677,13 +163470,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Compute percentiles for dense matrices.", - "docstring": "Compute percentiles for dense matrices.\n\nParameters\n----------\nX : ndarray of shape (n_samples, n_features)\n The data used to scale along the features axis.", + "docstring": "Compute percentiles for dense matrices.\n\n Parameters\n ----------\n X : ndarray of shape (n_samples, n_features)\n The data used to scale along the features axis.\n ", "source_code": "\ndef _dense_fit(self, X, random_state):\n \"\"\"Compute percentiles for dense matrices.\n\n Parameters\n ----------\n X : ndarray of shape (n_samples, n_features)\n The data used to scale along the features axis.\n \"\"\"\n if self.ignore_implicit_zeros:\n warnings.warn(\"'ignore_implicit_zeros' takes effect only with sparse matrix. 
This parameter has no effect.\")\n (n_samples, n_features) = X.shape\n references = self.references_ * 100\n self.quantiles_ = []\n for col in X.T:\n if self.subsample < n_samples:\n subsample_idx = random_state.choice(n_samples, size=self.subsample, replace=False)\n col = col.take(subsample_idx, mode='clip')\n self.quantiles_.append(np.nanpercentile(col, references))\n self.quantiles_ = np.transpose(self.quantiles_)\n self.quantiles_ = np.maximum.accumulate(self.quantiles_)" }, { @@ -151701,13 +163495,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _more_tags(self):\n return {'allow_nan': True}" }, { @@ -151725,7 +163520,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -151735,7 +163531,8 @@ "docstring": { "type": "sparse matrix of shape (n_samples, n_features)", "description": "The data used to scale along the features axis. The sparse matrix\nneeds to be nonnegative. If a sparse matrix is provided,\nit will be converted into a sparse ``csc_matrix``." - } + }, + "refined_type": {} }, { "name": "random_state", @@ -151745,13 +163542,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Compute percentiles for sparse matrices.", - "docstring": "Compute percentiles for sparse matrices.\n\nParameters\n----------\nX : sparse matrix of shape (n_samples, n_features)\n The data used to scale along the features axis. The sparse matrix\n needs to be nonnegative. If a sparse matrix is provided,\n it will be converted into a sparse ``csc_matrix``.", + "docstring": "Compute percentiles for sparse matrices.\n\n Parameters\n ----------\n X : sparse matrix of shape (n_samples, n_features)\n The data used to scale along the features axis. The sparse matrix\n needs to be nonnegative. If a sparse matrix is provided,\n it will be converted into a sparse ``csc_matrix``.\n ", "source_code": "\ndef _sparse_fit(self, X, random_state):\n \"\"\"Compute percentiles for sparse matrices.\n\n Parameters\n ----------\n X : sparse matrix of shape (n_samples, n_features)\n The data used to scale along the features axis. The sparse matrix\n needs to be nonnegative. 
If a sparse matrix is provided,\n it will be converted into a sparse ``csc_matrix``.\n \"\"\"\n (n_samples, n_features) = X.shape\n references = self.references_ * 100\n self.quantiles_ = []\n for feature_idx in range(n_features):\n column_nnz_data = X.data[X.indptr[feature_idx]:X.indptr[feature_idx + 1]]\n if len(column_nnz_data) > self.subsample:\n column_subsample = self.subsample * len(column_nnz_data) // n_samples\n if self.ignore_implicit_zeros:\n column_data = np.zeros(shape=column_subsample, dtype=X.dtype)\n else:\n column_data = np.zeros(shape=self.subsample, dtype=X.dtype)\n column_data[:column_subsample] = random_state.choice(column_nnz_data, size=column_subsample, replace=False)\n else:\n if self.ignore_implicit_zeros:\n column_data = np.zeros(shape=len(column_nnz_data), dtype=X.dtype)\n else:\n column_data = np.zeros(shape=n_samples, dtype=X.dtype)\n column_data[:len(column_nnz_data)] = column_nnz_data\n if not column_data.size:\n self.quantiles_.append([0] * len(references))\n else:\n self.quantiles_.append(np.nanpercentile(column_data, references))\n self.quantiles_ = np.transpose(self.quantiles_)\n self.quantiles_ = np.maximum.accumulate(self.quantiles_)" }, { @@ -151769,7 +163567,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -151779,7 +163578,8 @@ "docstring": { "type": "ndarray of shape (n_samples, n_features)", "description": "The data used to scale along the features axis." - } + }, + "refined_type": {} }, { "name": "inverse", @@ -151789,13 +163589,14 @@ "docstring": { "type": "bool, default=False", "description": "If False, apply forward transform. If True, apply\ninverse transform." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Forward and inverse transform.", - "docstring": "Forward and inverse transform.\n\nParameters\n----------\nX : ndarray of shape (n_samples, n_features)\n The data used to scale along the features axis.\n\ninverse : bool, default=False\n If False, apply forward transform. If True, apply\n inverse transform.\n\nReturns\n-------\nX : ndarray of shape (n_samples, n_features)\n Projected data.", + "docstring": "Forward and inverse transform.\n\n Parameters\n ----------\n X : ndarray of shape (n_samples, n_features)\n The data used to scale along the features axis.\n\n inverse : bool, default=False\n If False, apply forward transform. If True, apply\n inverse transform.\n\n Returns\n -------\n X : ndarray of shape (n_samples, n_features)\n Projected data.\n ", "source_code": "\ndef _transform(self, X, inverse=False):\n \"\"\"Forward and inverse transform.\n\n Parameters\n ----------\n X : ndarray of shape (n_samples, n_features)\n The data used to scale along the features axis.\n\n inverse : bool, default=False\n If False, apply forward transform. 
If True, apply\n inverse transform.\n\n Returns\n -------\n X : ndarray of shape (n_samples, n_features)\n Projected data.\n \"\"\"\n if sparse.issparse(X):\n for feature_idx in range(X.shape[1]):\n column_slice = slice(X.indptr[feature_idx], X.indptr[feature_idx + 1])\n X.data[column_slice] = self._transform_col(X.data[column_slice], self.quantiles_[:, feature_idx], inverse)\n else:\n for feature_idx in range(X.shape[1]):\n X[:, feature_idx] = self._transform_col(X[:, feature_idx], self.quantiles_[:, feature_idx], inverse)\n return X" }, { @@ -151813,7 +163614,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X_col", @@ -151823,7 +163625,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "quantiles", @@ -151833,7 +163636,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "inverse", @@ -151843,7 +163647,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -151867,7 +163672,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -151877,6 +163683,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "The data used to scale along the features axis. If a sparse\nmatrix is provided, it will be converted into a sparse\n``csc_matrix``. Additionally, the sparse matrix needs to be\nnonnegative if `ignore_implicit_zeros` is False." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -151887,13 +163697,14 @@ "docstring": { "type": "None", "description": "Ignored." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Compute the quantiles used for transforming.", - "docstring": "Compute the quantiles used for transforming.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n The data used to scale along the features axis. If a sparse\n matrix is provided, it will be converted into a sparse\n ``csc_matrix``. Additionally, the sparse matrix needs to be\n nonnegative if `ignore_implicit_zeros` is False.\n\ny : None\n Ignored.\n\nReturns\n-------\nself : object\n Fitted transformer.", + "docstring": "Compute the quantiles used for transforming.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The data used to scale along the features axis. If a sparse\n matrix is provided, it will be converted into a sparse\n ``csc_matrix``. Additionally, the sparse matrix needs to be\n nonnegative if `ignore_implicit_zeros` is False.\n\n y : None\n Ignored.\n\n Returns\n -------\n self : object\n Fitted transformer.\n ", "source_code": "\ndef fit(self, X, y=None):\n \"\"\"Compute the quantiles used for transforming.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The data used to scale along the features axis. If a sparse\n matrix is provided, it will be converted into a sparse\n ``csc_matrix``. Additionally, the sparse matrix needs to be\n nonnegative if `ignore_implicit_zeros` is False.\n\n y : None\n Ignored.\n\n Returns\n -------\n self : object\n Fitted transformer.\n \"\"\"\n if self.n_quantiles <= 0:\n raise ValueError(\"Invalid value for 'n_quantiles': %d. The number of quantiles must be at least one.\" % self.n_quantiles)\n if self.subsample <= 0:\n raise ValueError(\"Invalid value for 'subsample': %d. 
The number of subsamples must be at least one.\" % self.subsample)\n if self.n_quantiles > self.subsample:\n raise ValueError('The number of quantiles cannot be greater than the number of samples used. Got {} quantiles and {} samples.'.format(self.n_quantiles, self.subsample))\n X = self._check_inputs(X, in_fit=True, copy=False)\n n_samples = X.shape[0]\n if self.n_quantiles > n_samples:\n warnings.warn('n_quantiles (%s) is greater than the total number of samples (%s). n_quantiles is set to n_samples.' % (self.n_quantiles, n_samples))\n self.n_quantiles_ = max(1, min(self.n_quantiles, n_samples))\n rng = check_random_state(self.random_state)\n self.references_ = np.linspace(0, 1, self.n_quantiles_, endpoint=True)\n if sparse.issparse(X):\n self._sparse_fit(X, rng)\n else:\n self._dense_fit(X, rng)\n return self" }, { @@ -151911,7 +163722,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -151921,13 +163733,17 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "The data used to scale along the features axis. If a sparse\nmatrix is provided, it will be converted into a sparse\n``csc_matrix``. Additionally, the sparse matrix needs to be\nnonnegative if `ignore_implicit_zeros` is False." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } } ], "results": [], "is_public": true, "description": "Back-projection to the original space.", - "docstring": "Back-projection to the original space.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n The data used to scale along the features axis. If a sparse\n matrix is provided, it will be converted into a sparse\n ``csc_matrix``. Additionally, the sparse matrix needs to be\n nonnegative if `ignore_implicit_zeros` is False.\n\nReturns\n-------\nXt : {ndarray, sparse matrix} of (n_samples, n_features)\n The projected data.", + "docstring": "Back-projection to the original space.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The data used to scale along the features axis. If a sparse\n matrix is provided, it will be converted into a sparse\n ``csc_matrix``. Additionally, the sparse matrix needs to be\n nonnegative if `ignore_implicit_zeros` is False.\n\n Returns\n -------\n Xt : {ndarray, sparse matrix} of (n_samples, n_features)\n The projected data.\n ", "source_code": "\ndef inverse_transform(self, X):\n \"\"\"Back-projection to the original space.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The data used to scale along the features axis. If a sparse\n matrix is provided, it will be converted into a sparse\n ``csc_matrix``. Additionally, the sparse matrix needs to be\n nonnegative if `ignore_implicit_zeros` is False.\n\n Returns\n -------\n Xt : {ndarray, sparse matrix} of (n_samples, n_features)\n The projected data.\n \"\"\"\n check_is_fitted(self)\n X = self._check_inputs(X, in_fit=False, accept_sparse_negative=True, copy=self.copy)\n return self._transform(X, inverse=True)" }, { @@ -151945,7 +163761,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -151955,13 +163772,17 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "The data used to scale along the features axis. If a sparse\nmatrix is provided, it will be converted into a sparse\n``csc_matrix``. 
Additionally, the sparse matrix needs to be\nnonnegative if `ignore_implicit_zeros` is False." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } } ], "results": [], "is_public": true, "description": "Feature-wise transformation of the data.", - "docstring": "Feature-wise transformation of the data.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n The data used to scale along the features axis. If a sparse\n matrix is provided, it will be converted into a sparse\n ``csc_matrix``. Additionally, the sparse matrix needs to be\n nonnegative if `ignore_implicit_zeros` is False.\n\nReturns\n-------\nXt : {ndarray, sparse matrix} of shape (n_samples, n_features)\n The projected data.", + "docstring": "Feature-wise transformation of the data.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The data used to scale along the features axis. If a sparse\n matrix is provided, it will be converted into a sparse\n ``csc_matrix``. Additionally, the sparse matrix needs to be\n nonnegative if `ignore_implicit_zeros` is False.\n\n Returns\n -------\n Xt : {ndarray, sparse matrix} of shape (n_samples, n_features)\n The projected data.\n ", "source_code": "\ndef transform(self, X):\n \"\"\"Feature-wise transformation of the data.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The data used to scale along the features axis. If a sparse\n matrix is provided, it will be converted into a sparse\n ``csc_matrix``. Additionally, the sparse matrix needs to be\n nonnegative if `ignore_implicit_zeros` is False.\n\n Returns\n -------\n Xt : {ndarray, sparse matrix} of shape (n_samples, n_features)\n The projected data.\n \"\"\"\n check_is_fitted(self)\n X = self._check_inputs(X, in_fit=False, copy=self.copy)\n return self._transform(X, inverse=False)" }, { @@ -151979,7 +163800,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "with_centering", @@ -151989,7 +163811,8 @@ "docstring": { "type": "bool, default=True", "description": "If `True`, center the data before scaling.\nThis will cause :meth:`transform` to raise an exception when attempted\non sparse matrices, because centering them entails building a dense\nmatrix which in common use cases is likely to be too large to fit in\nmemory." - } + }, + "refined_type": {} }, { "name": "with_scaling", @@ -151999,7 +163822,8 @@ "docstring": { "type": "bool, default=True", "description": "If `True`, scale the data to interquartile range." - } + }, + "refined_type": {} }, { "name": "quantile_range", @@ -152009,7 +163833,8 @@ "docstring": { "type": "tuple (q_min, q_max), 0.0 < q_min < q_max < 100.0, default=(25.0, 75.0)", "description": "Quantile range used to calculate `scale_`. By default this is equal to\nthe IQR, i.e., `q_min` is the first quantile and `q_max` is the third\nquantile.\n\n.. versionadded:: 0.18" - } + }, + "refined_type": {} }, { "name": "copy", @@ -152019,7 +163844,8 @@ "docstring": { "type": "bool, default=True", "description": "If `False`, try to avoid a copy and do inplace scaling instead.\nThis is not guaranteed to always work inplace; e.g. if the data is\nnot a NumPy array or scipy.sparse CSR matrix, a copy may still be\nreturned." - } + }, + "refined_type": {} }, { "name": "unit_variance", @@ -152029,13 +163855,14 @@ "docstring": { "type": "bool, default=False", "description": "If `True`, scale data so that normally distributed features have a\nvariance of 1. 
In general, if the difference between the x-values of\n`q_max` and `q_min` for a standard normal distribution is greater\nthan 1, the dataset will be scaled down. If less than 1, the dataset\nwill be scaled up.\n\n.. versionadded:: 0.24" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, *, with_centering=True, with_scaling=True, quantile_range=(25.0, 75.0), copy=True, unit_variance=False):\n self.with_centering = with_centering\n self.with_scaling = with_scaling\n self.quantile_range = quantile_range\n self.unit_variance = unit_variance\n self.copy = copy" }, { @@ -152053,13 +163880,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _more_tags(self):\n return {'allow_nan': True}" }, { @@ -152077,7 +163905,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -152087,6 +163916,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "The data used to compute the median and quantiles\nused for later scaling along the features axis." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -152097,13 +163930,14 @@ "docstring": { "type": "Ignored", "description": "Not used, present here for API consistency by convention." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Compute the median and quantiles to be used for scaling.", - "docstring": "Compute the median and quantiles to be used for scaling.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n The data used to compute the median and quantiles\n used for later scaling along the features axis.\n\ny : Ignored\n Not used, present here for API consistency by convention.\n\nReturns\n-------\nself : object\n Fitted scaler.", + "docstring": "Compute the median and quantiles to be used for scaling.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The data used to compute the median and quantiles\n used for later scaling along the features axis.\n\n y : Ignored\n Not used, present here for API consistency by convention.\n\n Returns\n -------\n self : object\n Fitted scaler.\n ", "source_code": "\ndef fit(self, X, y=None):\n \"\"\"Compute the median and quantiles to be used for scaling.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The data used to compute the median and quantiles\n used for later scaling along the features axis.\n\n y : Ignored\n Not used, present here for API consistency by convention.\n\n Returns\n -------\n self : object\n Fitted scaler.\n \"\"\"\n X = self._validate_data(X, accept_sparse='csc', estimator=self, dtype=FLOAT_DTYPES, force_all_finite='allow-nan')\n (q_min, q_max) = self.quantile_range\n if not 0 <= q_min <= q_max <= 100:\n raise ValueError('Invalid quantile range: %s' % str(self.quantile_range))\n if self.with_centering:\n if sparse.issparse(X):\n raise ValueError('Cannot center sparse matrices: use `with_centering=False` instead. 
See docstring for motivation and alternatives.')\n self.center_ = np.nanmedian(X, axis=0)\n else:\n self.center_ = None\n if self.with_scaling:\n quantiles = []\n for feature_idx in range(X.shape[1]):\n if sparse.issparse(X):\n column_nnz_data = X.data[X.indptr[feature_idx]:X.indptr[feature_idx + 1]]\n column_data = np.zeros(shape=X.shape[0], dtype=X.dtype)\n column_data[:len(column_nnz_data)] = column_nnz_data\n else:\n column_data = X[:, feature_idx]\n quantiles.append(np.nanpercentile(column_data, self.quantile_range))\n quantiles = np.transpose(quantiles)\n self.scale_ = quantiles[1] - quantiles[0]\n self.scale_ = _handle_zeros_in_scale(self.scale_, copy=False)\n if self.unit_variance:\n adjust = stats.norm.ppf(q_max / 100.0) - stats.norm.ppf(q_min / 100.0)\n self.scale_ = self.scale_ / adjust\n else:\n self.scale_ = None\n return self" }, { @@ -152121,7 +163955,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -152131,13 +163966,17 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "The rescaled data to be transformed back." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } } ], "results": [], "is_public": true, "description": "Scale back the data to the original representation.", - "docstring": "Scale back the data to the original representation.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n The rescaled data to be transformed back.\n\nReturns\n-------\nX_tr : {ndarray, sparse matrix} of shape (n_samples, n_features)\n Transformed array.", + "docstring": "Scale back the data to the original representation.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The rescaled data to be transformed back.\n\n Returns\n -------\n X_tr : {ndarray, sparse matrix} of shape (n_samples, n_features)\n Transformed array.\n ", "source_code": "\ndef inverse_transform(self, X):\n \"\"\"Scale back the data to the original representation.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The rescaled data to be transformed back.\n\n Returns\n -------\n X_tr : {ndarray, sparse matrix} of shape (n_samples, n_features)\n Transformed array.\n \"\"\"\n check_is_fitted(self)\n X = check_array(X, accept_sparse=('csr', 'csc'), copy=self.copy, estimator=self, dtype=FLOAT_DTYPES, force_all_finite='allow-nan')\n if sparse.issparse(X):\n if self.with_scaling:\n inplace_column_scale(X, self.scale_)\n else:\n if self.with_scaling:\n X *= self.scale_\n if self.with_centering:\n X += self.center_\n return X" }, { @@ -152155,7 +163994,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -152165,13 +164005,17 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "The data used to scale along the specified axis." 
+ }, + "refined_type": { + "kind": "EnumType", + "values": [] } } ], "results": [], "is_public": true, "description": "Center and scale the data.", - "docstring": "Center and scale the data.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n The data used to scale along the specified axis.\n\nReturns\n-------\nX_tr : {ndarray, sparse matrix} of shape (n_samples, n_features)\n Transformed array.", + "docstring": "Center and scale the data.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The data used to scale along the specified axis.\n\n Returns\n -------\n X_tr : {ndarray, sparse matrix} of shape (n_samples, n_features)\n Transformed array.\n ", "source_code": "\ndef transform(self, X):\n \"\"\"Center and scale the data.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The data used to scale along the specified axis.\n\n Returns\n -------\n X_tr : {ndarray, sparse matrix} of shape (n_samples, n_features)\n Transformed array.\n \"\"\"\n check_is_fitted(self)\n X = self._validate_data(X, accept_sparse=('csr', 'csc'), copy=self.copy, estimator=self, dtype=FLOAT_DTYPES, reset=False, force_all_finite='allow-nan')\n if sparse.issparse(X):\n if self.with_scaling:\n inplace_column_scale(X, 1.0 / self.scale_)\n else:\n if self.with_centering:\n X -= self.center_\n if self.with_scaling:\n X /= self.scale_\n return X" }, { @@ -152189,7 +164033,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "copy", @@ -152199,7 +164044,8 @@ "docstring": { "type": "bool, default=True", "description": "If False, try to avoid a copy and do inplace scaling instead.\nThis is not guaranteed to always work inplace; e.g. if the data is\nnot a NumPy array or scipy.sparse CSR matrix, a copy may still be\nreturned." - } + }, + "refined_type": {} }, { "name": "with_mean", @@ -152209,7 +164055,8 @@ "docstring": { "type": "bool, default=True", "description": "If True, center the data before scaling.\nThis does not work (and will raise an exception) when attempted on\nsparse matrices, because centering them entails building a dense\nmatrix which in common use cases is likely to be too large to fit in\nmemory." - } + }, + "refined_type": {} }, { "name": "with_std", @@ -152219,13 +164066,14 @@ "docstring": { "type": "bool, default=True", "description": "If True, scale the data to unit variance (or equivalently,\nunit standard deviation)." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, *, copy=True, with_mean=True, with_std=True):\n self.with_mean = with_mean\n self.with_std = with_std\n self.copy = copy" }, { @@ -152243,13 +164091,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _more_tags(self):\n return {'allow_nan': True, 'preserves_dtype': [np.float64, np.float32]}" }, { @@ -152267,13 +164116,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Reset internal data-dependent state of the scaler, if necessary.\n\n__init__ parameters are not touched.", - "docstring": "Reset internal data-dependent state of the scaler, if necessary.\n\n__init__ parameters are not touched.", + "docstring": "Reset internal data-dependent state of the scaler, if necessary.\n\n __init__ parameters are not touched.\n ", "source_code": "\ndef _reset(self):\n \"\"\"Reset internal data-dependent state of the scaler, if necessary.\n\n __init__ parameters are not touched.\n \"\"\"\n if hasattr(self, 'scale_'):\n del self.scale_\n del self.n_samples_seen_\n del self.mean_\n del self.var_" }, { @@ -152291,7 +164141,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -152301,6 +164152,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "The data used to compute the mean and standard deviation\nused for later scaling along the features axis." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -152311,7 +164166,8 @@ "docstring": { "type": "None", "description": "Ignored." - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -152321,13 +164177,14 @@ "docstring": { "type": "array-like of shape (n_samples,), default=None", "description": "Individual weights for each sample.\n\n.. versionadded:: 0.24\n parameter *sample_weight* support to StandardScaler." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Compute the mean and std to be used for later scaling.", - "docstring": "Compute the mean and std to be used for later scaling.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n The data used to compute the mean and standard deviation\n used for later scaling along the features axis.\n\ny : None\n Ignored.\n\nsample_weight : array-like of shape (n_samples,), default=None\n Individual weights for each sample.\n\n .. versionadded:: 0.24\n parameter *sample_weight* support to StandardScaler.\n\nReturns\n-------\nself : object\n Fitted scaler.", + "docstring": "Compute the mean and std to be used for later scaling.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The data used to compute the mean and standard deviation\n used for later scaling along the features axis.\n\n y : None\n Ignored.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Individual weights for each sample.\n\n .. 
versionadded:: 0.24\n parameter *sample_weight* support to StandardScaler.\n\n Returns\n -------\n self : object\n Fitted scaler.\n ", "source_code": "\ndef fit(self, X, y=None, sample_weight=None):\n \"\"\"Compute the mean and std to be used for later scaling.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The data used to compute the mean and standard deviation\n used for later scaling along the features axis.\n\n y : None\n Ignored.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Individual weights for each sample.\n\n .. versionadded:: 0.24\n parameter *sample_weight* support to StandardScaler.\n\n Returns\n -------\n self : object\n Fitted scaler.\n \"\"\"\n self._reset()\n return self.partial_fit(X, y, sample_weight)" }, { @@ -152345,7 +164202,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -152355,6 +164213,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "The data used to scale along the features axis." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -152365,13 +164227,14 @@ "docstring": { "type": "bool, default=None", "description": "Copy the input X or not." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Scale back the data to the original representation.", - "docstring": "Scale back the data to the original representation.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n The data used to scale along the features axis.\ncopy : bool, default=None\n Copy the input X or not.\n\nReturns\n-------\nX_tr : {ndarray, sparse matrix} of shape (n_samples, n_features)\n Transformed array.", + "docstring": "Scale back the data to the original representation.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The data used to scale along the features axis.\n copy : bool, default=None\n Copy the input X or not.\n\n Returns\n -------\n X_tr : {ndarray, sparse matrix} of shape (n_samples, n_features)\n Transformed array.\n ", "source_code": "\ndef inverse_transform(self, X, copy=None):\n \"\"\"Scale back the data to the original representation.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The data used to scale along the features axis.\n copy : bool, default=None\n Copy the input X or not.\n\n Returns\n -------\n X_tr : {ndarray, sparse matrix} of shape (n_samples, n_features)\n Transformed array.\n \"\"\"\n check_is_fitted(self)\n copy = copy if copy is not None else self.copy\n X = check_array(X, accept_sparse='csr', copy=copy, estimator=self, dtype=FLOAT_DTYPES, force_all_finite='allow-nan')\n if sparse.issparse(X):\n if self.with_mean:\n raise ValueError('Cannot uncenter sparse matrices: pass `with_mean=False` instead See docstring for motivation and alternatives.')\n if self.scale_ is not None:\n inplace_column_scale(X, self.scale_)\n else:\n if self.with_std:\n X *= self.scale_\n if self.with_mean:\n X += self.mean_\n return X" }, { @@ -152389,7 +164252,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -152399,6 +164263,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "The data used to compute the mean and standard deviation\nused for later scaling along the features axis." 
+ }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -152409,7 +164277,8 @@ "docstring": { "type": "None", "description": "Ignored." - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -152419,13 +164288,14 @@ "docstring": { "type": "array-like of shape (n_samples,), default=None", "description": "Individual weights for each sample.\n\n.. versionadded:: 0.24\n parameter *sample_weight* support to StandardScaler." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Online computation of mean and std on X for later scaling.\n\nAll of X is processed as a single batch. This is intended for cases when :meth:`fit` is not feasible due to very large number of `n_samples` or because X is read from a continuous stream. The algorithm for incremental mean and std is given in Equation 1.5a,b in Chan, Tony F., Gene H. Golub, and Randall J. LeVeque. \"Algorithms for computing the sample variance: Analysis and recommendations.\" The American Statistician 37.3 (1983): 242-247:", - "docstring": "Online computation of mean and std on X for later scaling.\n\nAll of X is processed as a single batch. This is intended for cases\nwhen :meth:`fit` is not feasible due to very large number of\n`n_samples` or because X is read from a continuous stream.\n\nThe algorithm for incremental mean and std is given in Equation 1.5a,b\nin Chan, Tony F., Gene H. Golub, and Randall J. LeVeque. \"Algorithms\nfor computing the sample variance: Analysis and recommendations.\"\nThe American Statistician 37.3 (1983): 242-247:\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n The data used to compute the mean and standard deviation\n used for later scaling along the features axis.\n\ny : None\n Ignored.\n\nsample_weight : array-like of shape (n_samples,), default=None\n Individual weights for each sample.\n\n .. versionadded:: 0.24\n parameter *sample_weight* support to StandardScaler.\n\nReturns\n-------\nself : object\n Fitted scaler.", + "description": "Online computation of mean and std on X for later scaling.\n\nAll of X is processed as a single batch. This is intended for cases\nwhen :meth:`fit` is not feasible due to very large number of\n`n_samples` or because X is read from a continuous stream.\n\nThe algorithm for incremental mean and std is given in Equation 1.5a,b\nin Chan, Tony F., Gene H. Golub, and Randall J. LeVeque. \"Algorithms\nfor computing the sample variance: Analysis and recommendations.\"\nThe American Statistician 37.3 (1983): 242-247:", + "docstring": "Online computation of mean and std on X for later scaling.\n\n All of X is processed as a single batch. This is intended for cases\n when :meth:`fit` is not feasible due to very large number of\n `n_samples` or because X is read from a continuous stream.\n\n The algorithm for incremental mean and std is given in Equation 1.5a,b\n in Chan, Tony F., Gene H. Golub, and Randall J. LeVeque. \"Algorithms\n for computing the sample variance: Analysis and recommendations.\"\n The American Statistician 37.3 (1983): 242-247:\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The data used to compute the mean and standard deviation\n used for later scaling along the features axis.\n\n y : None\n Ignored.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Individual weights for each sample.\n\n .. 
versionadded:: 0.24\n parameter *sample_weight* support to StandardScaler.\n\n Returns\n -------\n self : object\n Fitted scaler.\n ", "source_code": "\ndef partial_fit(self, X, y=None, sample_weight=None):\n \"\"\"Online computation of mean and std on X for later scaling.\n\n All of X is processed as a single batch. This is intended for cases\n when :meth:`fit` is not feasible due to very large number of\n `n_samples` or because X is read from a continuous stream.\n\n The algorithm for incremental mean and std is given in Equation 1.5a,b\n in Chan, Tony F., Gene H. Golub, and Randall J. LeVeque. \"Algorithms\n for computing the sample variance: Analysis and recommendations.\"\n The American Statistician 37.3 (1983): 242-247:\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The data used to compute the mean and standard deviation\n used for later scaling along the features axis.\n\n y : None\n Ignored.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Individual weights for each sample.\n\n .. versionadded:: 0.24\n parameter *sample_weight* support to StandardScaler.\n\n Returns\n -------\n self : object\n Fitted scaler.\n \"\"\"\n first_call = not hasattr(self, 'n_samples_seen_')\n X = self._validate_data(X, accept_sparse=('csr', 'csc'), estimator=self, dtype=FLOAT_DTYPES, force_all_finite='allow-nan', reset=first_call)\n n_features = X.shape[1]\n if sample_weight is not None:\n sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype)\n dtype = np.int64 if sample_weight is None else X.dtype\n if not hasattr(self, 'n_samples_seen_'):\n self.n_samples_seen_ = np.zeros(n_features, dtype=dtype)\n elif np.size(self.n_samples_seen_) == 1:\n self.n_samples_seen_ = np.repeat(self.n_samples_seen_, X.shape[1])\n self.n_samples_seen_ = self.n_samples_seen_.astype(dtype, copy=False)\n if sparse.issparse(X):\n if self.with_mean:\n raise ValueError('Cannot center sparse matrices: pass `with_mean=False` instead. 
See docstring for motivation and alternatives.')\n sparse_constructor = sparse.csr_matrix if X.format == 'csr' else sparse.csc_matrix\n if self.with_std:\n if not hasattr(self, 'scale_'):\n (self.mean_, self.var_, self.n_samples_seen_) = mean_variance_axis(X, axis=0, weights=sample_weight, return_sum_weights=True)\n else:\n (self.mean_, self.var_, self.n_samples_seen_) = incr_mean_variance_axis(X, axis=0, last_mean=self.mean_, last_var=self.var_, last_n=self.n_samples_seen_, weights=sample_weight)\n self.mean_ = self.mean_.astype(np.float64, copy=False)\n self.var_ = self.var_.astype(np.float64, copy=False)\n else:\n self.mean_ = None\n self.var_ = None\n weights = _check_sample_weight(sample_weight, X)\n sum_weights_nan = weights @ sparse_constructor((np.isnan(X.data), X.indices, X.indptr), shape=X.shape)\n self.n_samples_seen_ += (np.sum(weights) - sum_weights_nan).astype(dtype)\n else:\n if not hasattr(self, 'scale_'):\n self.mean_ = 0.0\n if self.with_std:\n self.var_ = 0.0\n else:\n self.var_ = None\n if not self.with_mean and not self.with_std:\n self.mean_ = None\n self.var_ = None\n self.n_samples_seen_ += X.shape[0] - np.isnan(X).sum(axis=0)\n else:\n (self.mean_, self.var_, self.n_samples_seen_) = _incremental_mean_and_var(X, self.mean_, self.var_, self.n_samples_seen_, sample_weight=sample_weight)\n if np.ptp(self.n_samples_seen_) == 0:\n self.n_samples_seen_ = self.n_samples_seen_[0]\n if self.with_std:\n constant_mask = _is_constant_feature(self.var_, self.mean_, self.n_samples_seen_)\n self.scale_ = _handle_zeros_in_scale(np.sqrt(self.var_), copy=False, constant_mask=constant_mask)\n else:\n self.scale_ = None\n return self" }, { @@ -152443,7 +164313,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -152453,7 +164324,8 @@ "docstring": { "type": "{array-like, sparse matrix of shape (n_samples, n_features)", "description": "The data used to scale along the features axis." - } + }, + "refined_type": {} }, { "name": "copy", @@ -152463,13 +164335,14 @@ "docstring": { "type": "bool, default=None", "description": "Copy the input X or not." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Perform standardization by centering and scaling.", - "docstring": "Perform standardization by centering and scaling.\n\nParameters\n----------\nX : {array-like, sparse matrix of shape (n_samples, n_features)\n The data used to scale along the features axis.\ncopy : bool, default=None\n Copy the input X or not.\n\nReturns\n-------\nX_tr : {ndarray, sparse matrix} of shape (n_samples, n_features)\n Transformed array.", + "docstring": "Perform standardization by centering and scaling.\n\n Parameters\n ----------\n X : {array-like, sparse matrix of shape (n_samples, n_features)\n The data used to scale along the features axis.\n copy : bool, default=None\n Copy the input X or not.\n\n Returns\n -------\n X_tr : {ndarray, sparse matrix} of shape (n_samples, n_features)\n Transformed array.\n ", "source_code": "\ndef transform(self, X, copy=None):\n \"\"\"Perform standardization by centering and scaling.\n\n Parameters\n ----------\n X : {array-like, sparse matrix of shape (n_samples, n_features)\n The data used to scale along the features axis.\n copy : bool, default=None\n Copy the input X or not.\n\n Returns\n -------\n X_tr : {ndarray, sparse matrix} of shape (n_samples, n_features)\n Transformed array.\n \"\"\"\n check_is_fitted(self)\n copy = copy if copy is not None else self.copy\n X = self._validate_data(X, reset=False, accept_sparse='csr', copy=copy, estimator=self, dtype=FLOAT_DTYPES, force_all_finite='allow-nan')\n if sparse.issparse(X):\n if self.with_mean:\n raise ValueError('Cannot center sparse matrices: pass `with_mean=False` instead. See docstring for motivation and alternatives.')\n if self.scale_ is not None:\n inplace_column_scale(X, 1 / self.scale_)\n else:\n if self.with_mean:\n X -= self.mean_\n if self.with_std:\n X /= self.scale_\n return X" }, { @@ -152487,7 +164360,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "copy", @@ -152497,7 +164371,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "constant_mask", @@ -152507,13 +164382,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Set scales of near constant features to 1.\n\nThe goal is to avoid division by very small or zero values. Near constant features are detected automatically by identifying scales close to machine precision unless they are precomputed by the caller and passed with the `constant_mask` kwarg. 
Typically for standard scaling, the scales are the standard deviation while near constant features are better detected on the computed variances which are closer to machine precision by construction.", - "docstring": "Set scales of near constant features to 1.\n\nThe goal is to avoid division by very small or zero values.\n\nNear constant features are detected automatically by identifying\nscales close to machine precision unless they are precomputed by\nthe caller and passed with the `constant_mask` kwarg.\n\nTypically for standard scaling, the scales are the standard\ndeviation while near constant features are better detected on the\ncomputed variances which are closer to machine precision by\nconstruction.", + "description": "Set scales of near constant features to 1.\n\nThe goal is to avoid division by very small or zero values.\n\nNear constant features are detected automatically by identifying\nscales close to machine precision unless they are precomputed by\nthe caller and passed with the `constant_mask` kwarg.\n\nTypically for standard scaling, the scales are the standard\ndeviation while near constant features are better detected on the\ncomputed variances which are closer to machine precision by\nconstruction.", + "docstring": "Set scales of near constant features to 1.\n\n The goal is to avoid division by very small or zero values.\n\n Near constant features are detected automatically by identifying\n scales close to machine precision unless they are precomputed by\n the caller and passed with the `constant_mask` kwarg.\n\n Typically for standard scaling, the scales are the standard\n deviation while near constant features are better detected on the\n computed variances which are closer to machine precision by\n construction.\n ", "source_code": "\ndef _handle_zeros_in_scale(scale, copy=True, constant_mask=None):\n \"\"\"Set scales of near constant features to 1.\n\n The goal is to avoid division by very small or zero values.\n\n Near constant features are detected automatically by identifying\n scales close to machine precision unless they are precomputed by\n the caller and passed with the `constant_mask` kwarg.\n\n Typically for standard scaling, the scales are the standard\n deviation while near constant features are better detected on the\n computed variances which are closer to machine precision by\n construction.\n \"\"\"\n if np.isscalar(scale):\n if scale == 0.0:\n scale = 1.0\n return scale\n elif isinstance(scale, np.ndarray):\n if constant_mask is None:\n constant_mask = scale < 10 * np.finfo(scale.dtype).eps\n if copy:\n scale = scale.copy()\n scale[constant_mask] = 1.0\n return scale" }, { @@ -152531,7 +164407,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "mean", @@ -152541,7 +164418,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_samples", @@ -152551,13 +164429,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Detect if a feature is indistinguishable from a constant feature.\n\nThe detection is based on its computed variance and on the theoretical error bounds of the '2 pass algorithm' for variance computation. 
See \"Algorithms for computing the sample variance: analysis and recommendations\", by Chan, Golub, and LeVeque.", - "docstring": "Detect if a feature is indistinguishable from a constant feature.\n\nThe detection is based on its computed variance and on the theoretical\nerror bounds of the '2 pass algorithm' for variance computation.\n\nSee \"Algorithms for computing the sample variance: analysis and\nrecommendations\", by Chan, Golub, and LeVeque.", + "description": "Detect if a feature is indistinguishable from a constant feature.\n\nThe detection is based on its computed variance and on the theoretical\nerror bounds of the '2 pass algorithm' for variance computation.\n\nSee \"Algorithms for computing the sample variance: analysis and\nrecommendations\", by Chan, Golub, and LeVeque.", + "docstring": "Detect if a feature is indistinguishable from a constant feature.\n\n The detection is based on its computed variance and on the theoretical\n error bounds of the '2 pass algorithm' for variance computation.\n\n See \"Algorithms for computing the sample variance: analysis and\n recommendations\", by Chan, Golub, and LeVeque.\n ", "source_code": "\ndef _is_constant_feature(var, mean, n_samples):\n \"\"\"Detect if a feature is indistinguishable from a constant feature.\n\n The detection is based on its computed variance and on the theoretical\n error bounds of the '2 pass algorithm' for variance computation.\n\n See \"Algorithms for computing the sample variance: analysis and\n recommendations\", by Chan, Golub, and LeVeque.\n \"\"\"\n eps = np.finfo(np.float64).eps\n upper_bound = n_samples * eps * var + (n_samples * mean * eps)**2\n return var <= upper_bound" }, { @@ -152575,6 +164454,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "Data." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -152585,13 +164468,14 @@ "docstring": { "type": "float", "description": "Value to use for the dummy feature." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Augment dataset with an additional dummy feature.\n\nThis is useful for fitting an intercept term with implementations which cannot otherwise fit it directly.", - "docstring": "Augment dataset with an additional dummy feature.\n\nThis is useful for fitting an intercept term with implementations which\ncannot otherwise fit it directly.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n Data.\n\nvalue : float\n Value to use for the dummy feature.\n\nReturns\n-------\nX : {ndarray, sparse matrix} of shape (n_samples, n_features + 1)\n Same data with dummy feature added as first column.\n\nExamples\n--------\n>>> from sklearn.preprocessing import add_dummy_feature\n>>> add_dummy_feature([[0, 1], [1, 0]])\narray([[1., 0., 1.],\n [1., 1., 0.]])", + "description": "Augment dataset with an additional dummy feature.\n\nThis is useful for fitting an intercept term with implementations which\ncannot otherwise fit it directly.", + "docstring": "Augment dataset with an additional dummy feature.\n\n This is useful for fitting an intercept term with implementations which\n cannot otherwise fit it directly.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Data.\n\n value : float\n Value to use for the dummy feature.\n\n Returns\n -------\n X : {ndarray, sparse matrix} of shape (n_samples, n_features + 1)\n Same data with dummy feature added as first column.\n\n Examples\n --------\n >>> from sklearn.preprocessing import add_dummy_feature\n >>> add_dummy_feature([[0, 1], [1, 0]])\n array([[1., 0., 1.],\n [1., 1., 0.]])\n ", "source_code": "\ndef add_dummy_feature(X, value=1.0):\n \"\"\"Augment dataset with an additional dummy feature.\n\n This is useful for fitting an intercept term with implementations which\n cannot otherwise fit it directly.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Data.\n\n value : float\n Value to use for the dummy feature.\n\n Returns\n -------\n X : {ndarray, sparse matrix} of shape (n_samples, n_features + 1)\n Same data with dummy feature added as first column.\n\n Examples\n --------\n >>> from sklearn.preprocessing import add_dummy_feature\n >>> add_dummy_feature([[0, 1], [1, 0]])\n array([[1., 0., 1.],\n [1., 1., 0.]])\n \"\"\"\n X = check_array(X, accept_sparse=['csc', 'csr', 'coo'], dtype=FLOAT_DTYPES)\n (n_samples, n_features) = X.shape\n shape = (n_samples, n_features + 1)\n if sparse.issparse(X):\n if sparse.isspmatrix_coo(X):\n col = X.col + 1\n col = np.concatenate((np.zeros(n_samples), col))\n row = np.concatenate((np.arange(n_samples), X.row))\n data = np.concatenate((np.full(n_samples, value), X.data))\n return sparse.coo_matrix((data, (row, col)), shape)\n elif sparse.isspmatrix_csc(X):\n indptr = X.indptr + n_samples\n indptr = np.concatenate((np.array([0]), indptr))\n indices = np.concatenate((np.arange(n_samples), X.indices))\n data = np.concatenate((np.full(n_samples, value), X.data))\n return sparse.csc_matrix((data, indices, indptr), shape)\n else:\n klass = X.__class__\n return klass(add_dummy_feature(X.tocoo(), value))\n else:\n return np.hstack((np.full((n_samples, 1), value), X))" }, { @@ -152609,6 +164493,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "The data to binarize, element by element.\nscipy.sparse matrices should be in CSR or CSC format to avoid 
an\nun-necessary copy." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -152619,7 +164507,8 @@ "docstring": { "type": "float, default=0.0", "description": "Feature values below or equal to this are replaced by 0, above it by 1.\nThreshold may not be less than 0 for operations on sparse matrices." - } + }, + "refined_type": {} }, { "name": "copy", @@ -152629,13 +164518,14 @@ "docstring": { "type": "bool, default=True", "description": "set to False to perform inplace binarization and avoid a copy\n(if the input is already a numpy array or a scipy.sparse CSR / CSC\nmatrix and if axis is 1)." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Boolean thresholding of array-like or scipy.sparse matrix.\n\nRead more in the :ref:`User Guide `.", - "docstring": "Boolean thresholding of array-like or scipy.sparse matrix.\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n The data to binarize, element by element.\n scipy.sparse matrices should be in CSR or CSC format to avoid an\n un-necessary copy.\n\nthreshold : float, default=0.0\n Feature values below or equal to this are replaced by 0, above it by 1.\n Threshold may not be less than 0 for operations on sparse matrices.\n\ncopy : bool, default=True\n set to False to perform inplace binarization and avoid a copy\n (if the input is already a numpy array or a scipy.sparse CSR / CSC\n matrix and if axis is 1).\n\nReturns\n-------\nX_tr : {ndarray, sparse matrix} of shape (n_samples, n_features)\n The transformed data.\n\nSee Also\n--------\nBinarizer : Performs binarization using the Transformer API\n (e.g. as part of a preprocessing :class:`~sklearn.pipeline.Pipeline`).", + "docstring": "Boolean thresholding of array-like or scipy.sparse matrix.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The data to binarize, element by element.\n scipy.sparse matrices should be in CSR or CSC format to avoid an\n un-necessary copy.\n\n threshold : float, default=0.0\n Feature values below or equal to this are replaced by 0, above it by 1.\n Threshold may not be less than 0 for operations on sparse matrices.\n\n copy : bool, default=True\n set to False to perform inplace binarization and avoid a copy\n (if the input is already a numpy array or a scipy.sparse CSR / CSC\n matrix and if axis is 1).\n\n Returns\n -------\n X_tr : {ndarray, sparse matrix} of shape (n_samples, n_features)\n The transformed data.\n\n See Also\n --------\n Binarizer : Performs binarization using the Transformer API\n (e.g. 
as part of a preprocessing :class:`~sklearn.pipeline.Pipeline`).\n ", "source_code": "\ndef binarize(X, *, threshold=0.0, copy=True):\n \"\"\"Boolean thresholding of array-like or scipy.sparse matrix.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The data to binarize, element by element.\n scipy.sparse matrices should be in CSR or CSC format to avoid an\n un-necessary copy.\n\n threshold : float, default=0.0\n Feature values below or equal to this are replaced by 0, above it by 1.\n Threshold may not be less than 0 for operations on sparse matrices.\n\n copy : bool, default=True\n set to False to perform inplace binarization and avoid a copy\n (if the input is already a numpy array or a scipy.sparse CSR / CSC\n matrix and if axis is 1).\n\n Returns\n -------\n X_tr : {ndarray, sparse matrix} of shape (n_samples, n_features)\n The transformed data.\n\n See Also\n --------\n Binarizer : Performs binarization using the Transformer API\n (e.g. as part of a preprocessing :class:`~sklearn.pipeline.Pipeline`).\n \"\"\"\n X = check_array(X, accept_sparse=['csr', 'csc'], copy=copy)\n if sparse.issparse(X):\n if threshold < 0:\n raise ValueError('Cannot binarize a sparse matrix with threshold < 0')\n cond = X.data > threshold\n not_cond = np.logical_not(cond)\n X.data[cond] = 1\n X.data[not_cond] = 0\n X.eliminate_zeros()\n else:\n cond = X > threshold\n not_cond = np.logical_not(cond)\n X[cond] = 1\n X[not_cond] = 0\n return X" }, { @@ -152653,6 +164543,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "The data." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -152663,7 +164557,8 @@ "docstring": { "type": "int, default=0", "description": "axis used to scale along. If 0, independently scale each feature,\notherwise (if 1) scale each sample." - } + }, + "refined_type": {} }, { "name": "copy", @@ -152673,13 +164568,14 @@ "docstring": { "type": "bool, default=True", "description": "Set to False to perform inplace scaling and avoid a copy (if the input\nis already a numpy array)." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Scale each feature to the [-1, 1] range without breaking the sparsity.\n\nThis estimator scales each feature individually such that the maximal absolute value of each feature in the training set will be 1.0. This scaler can also be applied to sparse CSR or CSC matrices.", - "docstring": "Scale each feature to the [-1, 1] range without breaking the sparsity.\n\nThis estimator scales each feature individually such\nthat the maximal absolute value of each feature in the\ntraining set will be 1.0.\n\nThis scaler can also be applied to sparse CSR or CSC matrices.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n The data.\n\naxis : int, default=0\n axis used to scale along. If 0, independently scale each feature,\n otherwise (if 1) scale each sample.\n\ncopy : bool, default=True\n Set to False to perform inplace scaling and avoid a copy (if the input\n is already a numpy array).\n\nReturns\n-------\nX_tr : {ndarray, sparse matrix} of shape (n_samples, n_features)\n The transformed data.\n\n.. warning:: Risk of data leak\n\n Do not use :func:`~sklearn.preprocessing.maxabs_scale` unless you know\n what you are doing. A common mistake is to apply it to the entire data\n *before* splitting into training and test sets. 
This will bias the\n model evaluation because information would have leaked from the test\n set to the training set.\n In general, we recommend using\n :class:`~sklearn.preprocessing.MaxAbsScaler` within a\n :ref:`Pipeline ` in order to prevent most risks of data\n leaking: `pipe = make_pipeline(MaxAbsScaler(), LogisticRegression())`.\n\nSee Also\n--------\nMaxAbsScaler : Performs scaling to the [-1, 1] range using\n the Transformer API (e.g. as part of a preprocessing\n :class:`~sklearn.pipeline.Pipeline`).\n\nNotes\n-----\nNaNs are treated as missing values: disregarded to compute the statistics,\nand maintained during the data transformation.\n\nFor a comparison of the different scalers, transformers, and normalizers,\nsee :ref:`examples/preprocessing/plot_all_scaling.py\n`.", + "description": "Scale each feature to the [-1, 1] range without breaking the sparsity.\n\nThis estimator scales each feature individually such\nthat the maximal absolute value of each feature in the\ntraining set will be 1.0.\n\nThis scaler can also be applied to sparse CSR or CSC matrices.", + "docstring": "Scale each feature to the [-1, 1] range without breaking the sparsity.\n\n This estimator scales each feature individually such\n that the maximal absolute value of each feature in the\n training set will be 1.0.\n\n This scaler can also be applied to sparse CSR or CSC matrices.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The data.\n\n axis : int, default=0\n axis used to scale along. If 0, independently scale each feature,\n otherwise (if 1) scale each sample.\n\n copy : bool, default=True\n Set to False to perform inplace scaling and avoid a copy (if the input\n is already a numpy array).\n\n Returns\n -------\n X_tr : {ndarray, sparse matrix} of shape (n_samples, n_features)\n The transformed data.\n\n .. warning:: Risk of data leak\n\n Do not use :func:`~sklearn.preprocessing.maxabs_scale` unless you know\n what you are doing. A common mistake is to apply it to the entire data\n *before* splitting into training and test sets. This will bias the\n model evaluation because information would have leaked from the test\n set to the training set.\n In general, we recommend using\n :class:`~sklearn.preprocessing.MaxAbsScaler` within a\n :ref:`Pipeline ` in order to prevent most risks of data\n leaking: `pipe = make_pipeline(MaxAbsScaler(), LogisticRegression())`.\n\n See Also\n --------\n MaxAbsScaler : Performs scaling to the [-1, 1] range using\n the Transformer API (e.g. as part of a preprocessing\n :class:`~sklearn.pipeline.Pipeline`).\n\n Notes\n -----\n NaNs are treated as missing values: disregarded to compute the statistics,\n and maintained during the data transformation.\n\n For a comparison of the different scalers, transformers, and normalizers,\n see :ref:`examples/preprocessing/plot_all_scaling.py\n `.\n ", "source_code": "\ndef maxabs_scale(X, *, axis=0, copy=True):\n \"\"\"Scale each feature to the [-1, 1] range without breaking the sparsity.\n\n This estimator scales each feature individually such\n that the maximal absolute value of each feature in the\n training set will be 1.0.\n\n This scaler can also be applied to sparse CSR or CSC matrices.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The data.\n\n axis : int, default=0\n axis used to scale along. 
If 0, independently scale each feature,\n otherwise (if 1) scale each sample.\n\n copy : bool, default=True\n Set to False to perform inplace scaling and avoid a copy (if the input\n is already a numpy array).\n\n Returns\n -------\n X_tr : {ndarray, sparse matrix} of shape (n_samples, n_features)\n The transformed data.\n\n .. warning:: Risk of data leak\n\n Do not use :func:`~sklearn.preprocessing.maxabs_scale` unless you know\n what you are doing. A common mistake is to apply it to the entire data\n *before* splitting into training and test sets. This will bias the\n model evaluation because information would have leaked from the test\n set to the training set.\n In general, we recommend using\n :class:`~sklearn.preprocessing.MaxAbsScaler` within a\n :ref:`Pipeline ` in order to prevent most risks of data\n leaking: `pipe = make_pipeline(MaxAbsScaler(), LogisticRegression())`.\n\n See Also\n --------\n MaxAbsScaler : Performs scaling to the [-1, 1] range using\n the Transformer API (e.g. as part of a preprocessing\n :class:`~sklearn.pipeline.Pipeline`).\n\n Notes\n -----\n NaNs are treated as missing values: disregarded to compute the statistics,\n and maintained during the data transformation.\n\n For a comparison of the different scalers, transformers, and normalizers,\n see :ref:`examples/preprocessing/plot_all_scaling.py\n `.\n \"\"\"\n X = check_array(X, accept_sparse=('csr', 'csc'), copy=False, ensure_2d=False, dtype=FLOAT_DTYPES, force_all_finite='allow-nan')\n original_ndim = X.ndim\n if original_ndim == 1:\n X = X.reshape(X.shape[0], 1)\n s = MaxAbsScaler(copy=copy)\n if axis == 0:\n X = s.fit_transform(X)\n else:\n X = s.fit_transform(X.T).T\n if original_ndim == 1:\n X = X.ravel()\n return X" }, { @@ -152697,7 +164593,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "The data." - } + }, + "refined_type": {} }, { "name": "feature_range", @@ -152707,7 +164604,8 @@ "docstring": { "type": "tuple (min, max), default=(0, 1)", "description": "Desired range of transformed data." - } + }, + "refined_type": {} }, { "name": "axis", @@ -152717,7 +164615,8 @@ "docstring": { "type": "int, default=0", "description": "Axis used to scale along. If 0, independently scale each feature,\notherwise (if 1) scale each sample." - } + }, + "refined_type": {} }, { "name": "copy", @@ -152727,13 +164626,14 @@ "docstring": { "type": "bool, default=True", "description": "Set to False to perform inplace scaling and avoid a copy (if the input\nis already a numpy array)." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Transform features by scaling each feature to a given range.\n\nThis estimator scales and translates each feature individually such that it is in the given range on the training set, i.e. between zero and one. The transformation is given by (when ``axis=0``):: X_std = (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0)) X_scaled = X_std * (max - min) + min where min, max = feature_range. The transformation is calculated as (when ``axis=0``):: X_scaled = scale * X + min - X.min(axis=0) * scale where scale = (max - min) / (X.max(axis=0) - X.min(axis=0)) This transformation is often used as an alternative to zero mean, unit variance scaling. Read more in the :ref:`User Guide `. .. 
versionadded:: 0.17 *minmax_scale* function interface to :class:`~sklearn.preprocessing.MinMaxScaler`.", - "docstring": "Transform features by scaling each feature to a given range.\n\nThis estimator scales and translates each feature individually such\nthat it is in the given range on the training set, i.e. between\nzero and one.\n\nThe transformation is given by (when ``axis=0``)::\n\n X_std = (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0))\n X_scaled = X_std * (max - min) + min\n\nwhere min, max = feature_range.\n\nThe transformation is calculated as (when ``axis=0``)::\n\n X_scaled = scale * X + min - X.min(axis=0) * scale\n where scale = (max - min) / (X.max(axis=0) - X.min(axis=0))\n\nThis transformation is often used as an alternative to zero mean,\nunit variance scaling.\n\nRead more in the :ref:`User Guide `.\n\n.. versionadded:: 0.17\n *minmax_scale* function interface\n to :class:`~sklearn.preprocessing.MinMaxScaler`.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n The data.\n\nfeature_range : tuple (min, max), default=(0, 1)\n Desired range of transformed data.\n\naxis : int, default=0\n Axis used to scale along. If 0, independently scale each feature,\n otherwise (if 1) scale each sample.\n\ncopy : bool, default=True\n Set to False to perform inplace scaling and avoid a copy (if the input\n is already a numpy array).\n\nReturns\n-------\nX_tr : ndarray of shape (n_samples, n_features)\n The transformed data.\n\n.. warning:: Risk of data leak\n\n Do not use :func:`~sklearn.preprocessing.minmax_scale` unless you know\n what you are doing. A common mistake is to apply it to the entire data\n *before* splitting into training and test sets. This will bias the\n model evaluation because information would have leaked from the test\n set to the training set.\n In general, we recommend using\n :class:`~sklearn.preprocessing.MinMaxScaler` within a\n :ref:`Pipeline ` in order to prevent most risks of data\n leaking: `pipe = make_pipeline(MinMaxScaler(), LogisticRegression())`.\n\nSee Also\n--------\nMinMaxScaler : Performs scaling to a given range using the Transformer\n API (e.g. as part of a preprocessing\n :class:`~sklearn.pipeline.Pipeline`).\n\nNotes\n-----\nFor a comparison of the different scalers, transformers, and normalizers,\nsee :ref:`examples/preprocessing/plot_all_scaling.py\n`.", + "description": "Transform features by scaling each feature to a given range.\n\nThis estimator scales and translates each feature individually such\nthat it is in the given range on the training set, i.e. between\nzero and one.\n\nThe transformation is given by (when ``axis=0``)::\n\n X_std = (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0))\n X_scaled = X_std * (max - min) + min\n\nwhere min, max = feature_range.\n\nThe transformation is calculated as (when ``axis=0``)::\n\n X_scaled = scale * X + min - X.min(axis=0) * scale\n where scale = (max - min) / (X.max(axis=0) - X.min(axis=0))\n\nThis transformation is often used as an alternative to zero mean,\nunit variance scaling.\n\nRead more in the :ref:`User Guide `.\n\n.. versionadded:: 0.17\n *minmax_scale* function interface\n to :class:`~sklearn.preprocessing.MinMaxScaler`.", + "docstring": "Transform features by scaling each feature to a given range.\n\n This estimator scales and translates each feature individually such\n that it is in the given range on the training set, i.e. 
between\n zero and one.\n\n The transformation is given by (when ``axis=0``)::\n\n X_std = (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0))\n X_scaled = X_std * (max - min) + min\n\n where min, max = feature_range.\n\n The transformation is calculated as (when ``axis=0``)::\n\n X_scaled = scale * X + min - X.min(axis=0) * scale\n where scale = (max - min) / (X.max(axis=0) - X.min(axis=0))\n\n This transformation is often used as an alternative to zero mean,\n unit variance scaling.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.17\n *minmax_scale* function interface\n to :class:`~sklearn.preprocessing.MinMaxScaler`.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The data.\n\n feature_range : tuple (min, max), default=(0, 1)\n Desired range of transformed data.\n\n axis : int, default=0\n Axis used to scale along. If 0, independently scale each feature,\n otherwise (if 1) scale each sample.\n\n copy : bool, default=True\n Set to False to perform inplace scaling and avoid a copy (if the input\n is already a numpy array).\n\n Returns\n -------\n X_tr : ndarray of shape (n_samples, n_features)\n The transformed data.\n\n .. warning:: Risk of data leak\n\n Do not use :func:`~sklearn.preprocessing.minmax_scale` unless you know\n what you are doing. A common mistake is to apply it to the entire data\n *before* splitting into training and test sets. This will bias the\n model evaluation because information would have leaked from the test\n set to the training set.\n In general, we recommend using\n :class:`~sklearn.preprocessing.MinMaxScaler` within a\n :ref:`Pipeline ` in order to prevent most risks of data\n leaking: `pipe = make_pipeline(MinMaxScaler(), LogisticRegression())`.\n\n See Also\n --------\n MinMaxScaler : Performs scaling to a given range using the Transformer\n API (e.g. as part of a preprocessing\n :class:`~sklearn.pipeline.Pipeline`).\n\n Notes\n -----\n For a comparison of the different scalers, transformers, and normalizers,\n see :ref:`examples/preprocessing/plot_all_scaling.py\n `.\n ", "source_code": "\ndef minmax_scale(X, feature_range=(0, 1), *, axis=0, copy=True):\n \"\"\"Transform features by scaling each feature to a given range.\n\n This estimator scales and translates each feature individually such\n that it is in the given range on the training set, i.e. between\n zero and one.\n\n The transformation is given by (when ``axis=0``)::\n\n X_std = (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0))\n X_scaled = X_std * (max - min) + min\n\n where min, max = feature_range.\n\n The transformation is calculated as (when ``axis=0``)::\n\n X_scaled = scale * X + min - X.min(axis=0) * scale\n where scale = (max - min) / (X.max(axis=0) - X.min(axis=0))\n\n This transformation is often used as an alternative to zero mean,\n unit variance scaling.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.17\n *minmax_scale* function interface\n to :class:`~sklearn.preprocessing.MinMaxScaler`.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The data.\n\n feature_range : tuple (min, max), default=(0, 1)\n Desired range of transformed data.\n\n axis : int, default=0\n Axis used to scale along. 
If 0, independently scale each feature,\n otherwise (if 1) scale each sample.\n\n copy : bool, default=True\n Set to False to perform inplace scaling and avoid a copy (if the input\n is already a numpy array).\n\n Returns\n -------\n X_tr : ndarray of shape (n_samples, n_features)\n The transformed data.\n\n .. warning:: Risk of data leak\n\n Do not use :func:`~sklearn.preprocessing.minmax_scale` unless you know\n what you are doing. A common mistake is to apply it to the entire data\n *before* splitting into training and test sets. This will bias the\n model evaluation because information would have leaked from the test\n set to the training set.\n In general, we recommend using\n :class:`~sklearn.preprocessing.MinMaxScaler` within a\n :ref:`Pipeline ` in order to prevent most risks of data\n leaking: `pipe = make_pipeline(MinMaxScaler(), LogisticRegression())`.\n\n See Also\n --------\n MinMaxScaler : Performs scaling to a given range using the Transformer\n API (e.g. as part of a preprocessing\n :class:`~sklearn.pipeline.Pipeline`).\n\n Notes\n -----\n For a comparison of the different scalers, transformers, and normalizers,\n see :ref:`examples/preprocessing/plot_all_scaling.py\n `.\n \"\"\"\n X = check_array(X, copy=False, ensure_2d=False, dtype=FLOAT_DTYPES, force_all_finite='allow-nan')\n original_ndim = X.ndim\n if original_ndim == 1:\n X = X.reshape(X.shape[0], 1)\n s = MinMaxScaler(feature_range=feature_range, copy=copy)\n if axis == 0:\n X = s.fit_transform(X)\n else:\n X = s.fit_transform(X.T).T\n if original_ndim == 1:\n X = X.ravel()\n return X" }, { @@ -152751,6 +164651,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "The data to normalize, element by element.\nscipy.sparse matrices should be in CSR format to avoid an\nun-necessary copy." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -152761,6 +164665,10 @@ "docstring": { "type": "{'l1', 'l2', 'max'}, default='l2'", "description": "The norm to use to normalize each non zero sample (or each non-zero\nfeature if axis is 0)." + }, + "refined_type": { + "kind": "EnumType", + "values": ["max", "l2", "l1"] } }, { @@ -152771,6 +164679,10 @@ "docstring": { "type": "{0, 1}, default=1", "description": "axis used to normalize the data along. If 1, independently normalize\neach sample, otherwise (if 0) normalize each feature." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -152781,7 +164693,8 @@ "docstring": { "type": "bool, default=True", "description": "set to False to perform inplace row normalization and avoid a\ncopy (if the input is already a numpy array or a scipy.sparse\nCSR matrix and if axis is 1)." 
- } + }, + "refined_type": {} }, { "name": "return_norm", @@ -152791,13 +164704,14 @@ "docstring": { "type": "bool, default=False", "description": "whether to return the computed norms" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Scale input vectors individually to unit norm (vector length).\n\nRead more in the :ref:`User Guide `.", - "docstring": "Scale input vectors individually to unit norm (vector length).\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n The data to normalize, element by element.\n scipy.sparse matrices should be in CSR format to avoid an\n un-necessary copy.\n\nnorm : {'l1', 'l2', 'max'}, default='l2'\n The norm to use to normalize each non zero sample (or each non-zero\n feature if axis is 0).\n\naxis : {0, 1}, default=1\n axis used to normalize the data along. If 1, independently normalize\n each sample, otherwise (if 0) normalize each feature.\n\ncopy : bool, default=True\n set to False to perform inplace row normalization and avoid a\n copy (if the input is already a numpy array or a scipy.sparse\n CSR matrix and if axis is 1).\n\nreturn_norm : bool, default=False\n whether to return the computed norms\n\nReturns\n-------\nX : {ndarray, sparse matrix} of shape (n_samples, n_features)\n Normalized input X.\n\nnorms : ndarray of shape (n_samples, ) if axis=1 else (n_features, )\n An array of norms along given axis for X.\n When X is sparse, a NotImplementedError will be raised\n for norm 'l1' or 'l2'.\n\nSee Also\n--------\nNormalizer : Performs normalization using the Transformer API\n (e.g. as part of a preprocessing :class:`~sklearn.pipeline.Pipeline`).\n\nNotes\n-----\nFor a comparison of the different scalers, transformers, and normalizers,\nsee :ref:`examples/preprocessing/plot_all_scaling.py\n`.", + "docstring": "Scale input vectors individually to unit norm (vector length).\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The data to normalize, element by element.\n scipy.sparse matrices should be in CSR format to avoid an\n un-necessary copy.\n\n norm : {'l1', 'l2', 'max'}, default='l2'\n The norm to use to normalize each non zero sample (or each non-zero\n feature if axis is 0).\n\n axis : {0, 1}, default=1\n axis used to normalize the data along. If 1, independently normalize\n each sample, otherwise (if 0) normalize each feature.\n\n copy : bool, default=True\n set to False to perform inplace row normalization and avoid a\n copy (if the input is already a numpy array or a scipy.sparse\n CSR matrix and if axis is 1).\n\n return_norm : bool, default=False\n whether to return the computed norms\n\n Returns\n -------\n X : {ndarray, sparse matrix} of shape (n_samples, n_features)\n Normalized input X.\n\n norms : ndarray of shape (n_samples, ) if axis=1 else (n_features, )\n An array of norms along given axis for X.\n When X is sparse, a NotImplementedError will be raised\n for norm 'l1' or 'l2'.\n\n See Also\n --------\n Normalizer : Performs normalization using the Transformer API\n (e.g. 
as part of a preprocessing :class:`~sklearn.pipeline.Pipeline`).\n\n Notes\n -----\n For a comparison of the different scalers, transformers, and normalizers,\n see :ref:`examples/preprocessing/plot_all_scaling.py\n `.\n ", "source_code": "\ndef normalize(X, norm='l2', *, axis=1, copy=True, return_norm=False):\n \"\"\"Scale input vectors individually to unit norm (vector length).\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The data to normalize, element by element.\n scipy.sparse matrices should be in CSR format to avoid an\n un-necessary copy.\n\n norm : {'l1', 'l2', 'max'}, default='l2'\n The norm to use to normalize each non zero sample (or each non-zero\n feature if axis is 0).\n\n axis : {0, 1}, default=1\n axis used to normalize the data along. If 1, independently normalize\n each sample, otherwise (if 0) normalize each feature.\n\n copy : bool, default=True\n set to False to perform inplace row normalization and avoid a\n copy (if the input is already a numpy array or a scipy.sparse\n CSR matrix and if axis is 1).\n\n return_norm : bool, default=False\n whether to return the computed norms\n\n Returns\n -------\n X : {ndarray, sparse matrix} of shape (n_samples, n_features)\n Normalized input X.\n\n norms : ndarray of shape (n_samples, ) if axis=1 else (n_features, )\n An array of norms along given axis for X.\n When X is sparse, a NotImplementedError will be raised\n for norm 'l1' or 'l2'.\n\n See Also\n --------\n Normalizer : Performs normalization using the Transformer API\n (e.g. as part of a preprocessing :class:`~sklearn.pipeline.Pipeline`).\n\n Notes\n -----\n For a comparison of the different scalers, transformers, and normalizers,\n see :ref:`examples/preprocessing/plot_all_scaling.py\n `.\n \"\"\"\n if norm not in ('l1', 'l2', 'max'):\n raise ValueError(\"'%s' is not a supported norm\" % norm)\n if axis == 0:\n sparse_format = 'csc'\n elif axis == 1:\n sparse_format = 'csr'\n else:\n raise ValueError(\"'%d' is not a supported axis\" % axis)\n X = check_array(X, accept_sparse=sparse_format, copy=copy, estimator='the normalize function', dtype=FLOAT_DTYPES)\n if axis == 0:\n X = X.T\n if sparse.issparse(X):\n if return_norm and norm in ('l1', 'l2'):\n raise NotImplementedError(\"return_norm=True is not implemented for sparse matrices with norm 'l1' or norm 'l2'\")\n if norm == 'l1':\n inplace_csr_row_normalize_l1(X)\n elif norm == 'l2':\n inplace_csr_row_normalize_l2(X)\n elif norm == 'max':\n (mins, maxes) = min_max_axis(X, 1)\n norms = np.maximum(abs(mins), maxes)\n norms_elementwise = norms.repeat(np.diff(X.indptr))\n mask = norms_elementwise != 0\n X.data[mask] /= norms_elementwise[mask]\n else:\n if norm == 'l1':\n norms = np.abs(X).sum(axis=1)\n elif norm == 'l2':\n norms = row_norms(X)\n elif norm == 'max':\n norms = np.max(abs(X), axis=1)\n norms = _handle_zeros_in_scale(norms, copy=False)\n X /= norms[:, np.newaxis]\n if axis == 0:\n X = X.T\n if return_norm:\n return X, norms\n else:\n return X" }, { @@ -152815,7 +164729,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "The data to be transformed using a power transformation." - } + }, + "refined_type": {} }, { "name": "method", @@ -152825,6 +164740,10 @@ "docstring": { "type": "{'yeo-johnson', 'box-cox'}, default='yeo-johnson'", "description": "The power transform method. 
Available methods are:\n\n- 'yeo-johnson' [1]_, works with positive and negative values\n- 'box-cox' [2]_, only works with strictly positive values\n\n.. versionchanged:: 0.23\n The default value of the `method` parameter changed from\n 'box-cox' to 'yeo-johnson' in 0.23." + }, + "refined_type": { + "kind": "EnumType", + "values": ["box-cox", "yeo-johnson"] } }, { @@ -152835,7 +164754,8 @@ "docstring": { "type": "bool, default=True", "description": "Set to True to apply zero-mean, unit-variance normalization to the\ntransformed output." - } + }, + "refined_type": {} }, { "name": "copy", @@ -152845,13 +164765,14 @@ "docstring": { "type": "bool, default=True", "description": "Set to False to perform inplace computation during transformation." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Power transforms are a family of parametric, monotonic transformations that are applied to make data more Gaussian-like. This is useful for modeling issues related to heteroscedasticity (non-constant variance), or other situations where normality is desired.\n\nCurrently, power_transform supports the Box-Cox transform and the Yeo-Johnson transform. The optimal parameter for stabilizing variance and minimizing skewness is estimated through maximum likelihood. Box-Cox requires input data to be strictly positive, while Yeo-Johnson supports both positive or negative data. By default, zero-mean, unit-variance normalization is applied to the transformed data. Read more in the :ref:`User Guide `.", - "docstring": "Power transforms are a family of parametric, monotonic transformations\nthat are applied to make data more Gaussian-like. This is useful for\nmodeling issues related to heteroscedasticity (non-constant variance),\nor other situations where normality is desired.\n\nCurrently, power_transform supports the Box-Cox transform and the\nYeo-Johnson transform. The optimal parameter for stabilizing variance and\nminimizing skewness is estimated through maximum likelihood.\n\nBox-Cox requires input data to be strictly positive, while Yeo-Johnson\nsupports both positive or negative data.\n\nBy default, zero-mean, unit-variance normalization is applied to the\ntransformed data.\n\nRead more in the :ref:`User Guide `.\n\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n The data to be transformed using a power transformation.\n\nmethod : {'yeo-johnson', 'box-cox'}, default='yeo-johnson'\n The power transform method. Available methods are:\n\n - 'yeo-johnson' [1]_, works with positive and negative values\n - 'box-cox' [2]_, only works with strictly positive values\n\n .. versionchanged:: 0.23\n The default value of the `method` parameter changed from\n 'box-cox' to 'yeo-johnson' in 0.23.\n\nstandardize : bool, default=True\n Set to True to apply zero-mean, unit-variance normalization to the\n transformed output.\n\ncopy : bool, default=True\n Set to False to perform inplace computation during transformation.\n\nReturns\n-------\nX_trans : ndarray of shape (n_samples, n_features)\n The transformed data.\n\nExamples\n--------\n>>> import numpy as np\n>>> from sklearn.preprocessing import power_transform\n>>> data = [[1, 2], [3, 2], [4, 5]]\n>>> print(power_transform(data, method='box-cox'))\n[[-1.332... -0.707...]\n [ 0.256... -0.707...]\n [ 1.076... 1.414...]]\n\n.. warning:: Risk of data leak.\n Do not use :func:`~sklearn.preprocessing.power_transform` unless you\n know what you are doing. 
A common mistake is to apply it to the entire\n data *before* splitting into training and test sets. This will bias the\n model evaluation because information would have leaked from the test\n set to the training set.\n In general, we recommend using\n :class:`~sklearn.preprocessing.PowerTransformer` within a\n :ref:`Pipeline ` in order to prevent most risks of data\n leaking, e.g.: `pipe = make_pipeline(PowerTransformer(),\n LogisticRegression())`.\n\nSee Also\n--------\nPowerTransformer : Equivalent transformation with the\n Transformer API (e.g. as part of a preprocessing\n :class:`~sklearn.pipeline.Pipeline`).\n\nquantile_transform : Maps data to a standard normal distribution with\n the parameter `output_distribution='normal'`.\n\nNotes\n-----\nNaNs are treated as missing values: disregarded in ``fit``, and maintained\nin ``transform``.\n\nFor a comparison of the different scalers, transformers, and normalizers,\nsee :ref:`examples/preprocessing/plot_all_scaling.py\n`.\n\nReferences\n----------\n\n.. [1] I.K. Yeo and R.A. Johnson, \"A new family of power transformations to\n improve normality or symmetry.\" Biometrika, 87(4), pp.954-959,\n (2000).\n\n.. [2] G.E.P. Box and D.R. Cox, \"An Analysis of Transformations\", Journal\n of the Royal Statistical Society B, 26, 211-252 (1964).", + "description": "Power transforms are a family of parametric, monotonic transformations\nthat are applied to make data more Gaussian-like. This is useful for\nmodeling issues related to heteroscedasticity (non-constant variance),\nor other situations where normality is desired.\n\nCurrently, power_transform supports the Box-Cox transform and the\nYeo-Johnson transform. The optimal parameter for stabilizing variance and\nminimizing skewness is estimated through maximum likelihood.\n\nBox-Cox requires input data to be strictly positive, while Yeo-Johnson\nsupports both positive or negative data.\n\nBy default, zero-mean, unit-variance normalization is applied to the\ntransformed data.\n\nRead more in the :ref:`User Guide `.", + "docstring": "\n Power transforms are a family of parametric, monotonic transformations\n that are applied to make data more Gaussian-like. This is useful for\n modeling issues related to heteroscedasticity (non-constant variance),\n or other situations where normality is desired.\n\n Currently, power_transform supports the Box-Cox transform and the\n Yeo-Johnson transform. The optimal parameter for stabilizing variance and\n minimizing skewness is estimated through maximum likelihood.\n\n Box-Cox requires input data to be strictly positive, while Yeo-Johnson\n supports both positive or negative data.\n\n By default, zero-mean, unit-variance normalization is applied to the\n transformed data.\n\n Read more in the :ref:`User Guide `.\n\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The data to be transformed using a power transformation.\n\n method : {'yeo-johnson', 'box-cox'}, default='yeo-johnson'\n The power transform method. Available methods are:\n\n - 'yeo-johnson' [1]_, works with positive and negative values\n - 'box-cox' [2]_, only works with strictly positive values\n\n .. 
versionchanged:: 0.23\n The default value of the `method` parameter changed from\n 'box-cox' to 'yeo-johnson' in 0.23.\n\n standardize : bool, default=True\n Set to True to apply zero-mean, unit-variance normalization to the\n transformed output.\n\n copy : bool, default=True\n Set to False to perform inplace computation during transformation.\n\n Returns\n -------\n X_trans : ndarray of shape (n_samples, n_features)\n The transformed data.\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.preprocessing import power_transform\n >>> data = [[1, 2], [3, 2], [4, 5]]\n >>> print(power_transform(data, method='box-cox'))\n [[-1.332... -0.707...]\n [ 0.256... -0.707...]\n [ 1.076... 1.414...]]\n\n .. warning:: Risk of data leak.\n Do not use :func:`~sklearn.preprocessing.power_transform` unless you\n know what you are doing. A common mistake is to apply it to the entire\n data *before* splitting into training and test sets. This will bias the\n model evaluation because information would have leaked from the test\n set to the training set.\n In general, we recommend using\n :class:`~sklearn.preprocessing.PowerTransformer` within a\n :ref:`Pipeline ` in order to prevent most risks of data\n leaking, e.g.: `pipe = make_pipeline(PowerTransformer(),\n LogisticRegression())`.\n\n See Also\n --------\n PowerTransformer : Equivalent transformation with the\n Transformer API (e.g. as part of a preprocessing\n :class:`~sklearn.pipeline.Pipeline`).\n\n quantile_transform : Maps data to a standard normal distribution with\n the parameter `output_distribution='normal'`.\n\n Notes\n -----\n NaNs are treated as missing values: disregarded in ``fit``, and maintained\n in ``transform``.\n\n For a comparison of the different scalers, transformers, and normalizers,\n see :ref:`examples/preprocessing/plot_all_scaling.py\n `.\n\n References\n ----------\n\n .. [1] I.K. Yeo and R.A. Johnson, \"A new family of power transformations to\n improve normality or symmetry.\" Biometrika, 87(4), pp.954-959,\n (2000).\n\n .. [2] G.E.P. Box and D.R. Cox, \"An Analysis of Transformations\", Journal\n of the Royal Statistical Society B, 26, 211-252 (1964).\n ", "source_code": "\ndef power_transform(X, method='yeo-johnson', *, standardize=True, copy=True):\n \"\"\"\n Power transforms are a family of parametric, monotonic transformations\n that are applied to make data more Gaussian-like. This is useful for\n modeling issues related to heteroscedasticity (non-constant variance),\n or other situations where normality is desired.\n\n Currently, power_transform supports the Box-Cox transform and the\n Yeo-Johnson transform. The optimal parameter for stabilizing variance and\n minimizing skewness is estimated through maximum likelihood.\n\n Box-Cox requires input data to be strictly positive, while Yeo-Johnson\n supports both positive or negative data.\n\n By default, zero-mean, unit-variance normalization is applied to the\n transformed data.\n\n Read more in the :ref:`User Guide `.\n\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The data to be transformed using a power transformation.\n\n method : {'yeo-johnson', 'box-cox'}, default='yeo-johnson'\n The power transform method. Available methods are:\n\n - 'yeo-johnson' [1]_, works with positive and negative values\n - 'box-cox' [2]_, only works with strictly positive values\n\n .. 
versionchanged:: 0.23\n The default value of the `method` parameter changed from\n 'box-cox' to 'yeo-johnson' in 0.23.\n\n standardize : bool, default=True\n Set to True to apply zero-mean, unit-variance normalization to the\n transformed output.\n\n copy : bool, default=True\n Set to False to perform inplace computation during transformation.\n\n Returns\n -------\n X_trans : ndarray of shape (n_samples, n_features)\n The transformed data.\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.preprocessing import power_transform\n >>> data = [[1, 2], [3, 2], [4, 5]]\n >>> print(power_transform(data, method='box-cox'))\n [[-1.332... -0.707...]\n [ 0.256... -0.707...]\n [ 1.076... 1.414...]]\n\n .. warning:: Risk of data leak.\n Do not use :func:`~sklearn.preprocessing.power_transform` unless you\n know what you are doing. A common mistake is to apply it to the entire\n data *before* splitting into training and test sets. This will bias the\n model evaluation because information would have leaked from the test\n set to the training set.\n In general, we recommend using\n :class:`~sklearn.preprocessing.PowerTransformer` within a\n :ref:`Pipeline ` in order to prevent most risks of data\n leaking, e.g.: `pipe = make_pipeline(PowerTransformer(),\n LogisticRegression())`.\n\n See Also\n --------\n PowerTransformer : Equivalent transformation with the\n Transformer API (e.g. as part of a preprocessing\n :class:`~sklearn.pipeline.Pipeline`).\n\n quantile_transform : Maps data to a standard normal distribution with\n the parameter `output_distribution='normal'`.\n\n Notes\n -----\n NaNs are treated as missing values: disregarded in ``fit``, and maintained\n in ``transform``.\n\n For a comparison of the different scalers, transformers, and normalizers,\n see :ref:`examples/preprocessing/plot_all_scaling.py\n `.\n\n References\n ----------\n\n .. [1] I.K. Yeo and R.A. Johnson, \"A new family of power transformations to\n improve normality or symmetry.\" Biometrika, 87(4), pp.954-959,\n (2000).\n\n .. [2] G.E.P. Box and D.R. Cox, \"An Analysis of Transformations\", Journal\n of the Royal Statistical Society B, 26, 211-252 (1964).\n \"\"\"\n pt = PowerTransformer(method=method, standardize=standardize, copy=copy)\n return pt.fit_transform(X)" }, { @@ -152869,6 +164790,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "The data to transform." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -152879,7 +164804,8 @@ "docstring": { "type": "int, default=0", "description": "Axis used to compute the means and standard deviations along. If 0,\ntransform each feature, otherwise (if 1) transform each sample." - } + }, + "refined_type": {} }, { "name": "n_quantiles", @@ -152889,7 +164815,8 @@ "docstring": { "type": "int, default=1000 or n_samples", "description": "Number of quantiles to be computed. It corresponds to the number\nof landmarks used to discretize the cumulative distribution function.\nIf n_quantiles is larger than the number of samples, n_quantiles is set\nto the number of samples as a larger number of quantiles does not give\na better approximation of the cumulative distribution function\nestimator." - } + }, + "refined_type": {} }, { "name": "output_distribution", @@ -152899,6 +164826,10 @@ "docstring": { "type": "{'uniform', 'normal'}, default='uniform'", "description": "Marginal distribution for the transformed data. The choices are\n'uniform' (default) or 'normal'." 
+ }, + "refined_type": { + "kind": "EnumType", + "values": ["uniform", "normal"] } }, { @@ -152909,7 +164840,8 @@ "docstring": { "type": "bool, default=False", "description": "Only applies to sparse matrices. If True, the sparse entries of the\nmatrix are discarded to compute the quantile statistics. If False,\nthese entries are treated as zeros." - } + }, + "refined_type": {} }, { "name": "subsample", @@ -152919,7 +164851,8 @@ "docstring": { "type": "int, default=1e5", "description": "Maximum number of samples used to estimate the quantiles for\ncomputational efficiency. Note that the subsampling procedure may\ndiffer for value-identical sparse and dense matrices." - } + }, + "refined_type": {} }, { "name": "random_state", @@ -152929,7 +164862,8 @@ "docstring": { "type": "int, RandomState instance or None, default=None", "description": "Determines random number generation for subsampling and smoothing\nnoise.\nPlease see ``subsample`` for more details.\nPass an int for reproducible results across multiple function calls.\nSee :term:`Glossary `" - } + }, + "refined_type": {} }, { "name": "copy", @@ -152939,13 +164873,14 @@ "docstring": { "type": "bool, default=True", "description": "Set to False to perform inplace transformation and avoid a copy (if the\ninput is already a numpy array). If True, a copy of `X` is transformed,\nleaving the original `X` unchanged\n\n..versionchanged:: 0.23\n The default value of `copy` changed from False to True in 0.23." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Transform features using quantiles information.\n\nThis method transforms the features to follow a uniform or a normal distribution. Therefore, for a given feature, this transformation tends to spread out the most frequent values. It also reduces the impact of (marginal) outliers: this is therefore a robust preprocessing scheme. The transformation is applied on each feature independently. First an estimate of the cumulative distribution function of a feature is used to map the original values to a uniform distribution. The obtained values are then mapped to the desired output distribution using the associated quantile function. Features values of new/unseen data that fall below or above the fitted range will be mapped to the bounds of the output distribution. Note that this transform is non-linear. It may distort linear correlations between variables measured at the same scale but renders variables measured at different scales more directly comparable. Read more in the :ref:`User Guide `.", - "docstring": "Transform features using quantiles information.\n\nThis method transforms the features to follow a uniform or a normal\ndistribution. Therefore, for a given feature, this transformation tends\nto spread out the most frequent values. It also reduces the impact of\n(marginal) outliers: this is therefore a robust preprocessing scheme.\n\nThe transformation is applied on each feature independently. First an\nestimate of the cumulative distribution function of a feature is\nused to map the original values to a uniform distribution. The obtained\nvalues are then mapped to the desired output distribution using the\nassociated quantile function. Features values of new/unseen data that fall\nbelow or above the fitted range will be mapped to the bounds of the output\ndistribution. Note that this transform is non-linear. 
It may distort linear\ncorrelations between variables measured at the same scale but renders\nvariables measured at different scales more directly comparable.\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n The data to transform.\n\naxis : int, default=0\n Axis used to compute the means and standard deviations along. If 0,\n transform each feature, otherwise (if 1) transform each sample.\n\nn_quantiles : int, default=1000 or n_samples\n Number of quantiles to be computed. It corresponds to the number\n of landmarks used to discretize the cumulative distribution function.\n If n_quantiles is larger than the number of samples, n_quantiles is set\n to the number of samples as a larger number of quantiles does not give\n a better approximation of the cumulative distribution function\n estimator.\n\noutput_distribution : {'uniform', 'normal'}, default='uniform'\n Marginal distribution for the transformed data. The choices are\n 'uniform' (default) or 'normal'.\n\nignore_implicit_zeros : bool, default=False\n Only applies to sparse matrices. If True, the sparse entries of the\n matrix are discarded to compute the quantile statistics. If False,\n these entries are treated as zeros.\n\nsubsample : int, default=1e5\n Maximum number of samples used to estimate the quantiles for\n computational efficiency. Note that the subsampling procedure may\n differ for value-identical sparse and dense matrices.\n\nrandom_state : int, RandomState instance or None, default=None\n Determines random number generation for subsampling and smoothing\n noise.\n Please see ``subsample`` for more details.\n Pass an int for reproducible results across multiple function calls.\n See :term:`Glossary `\n\ncopy : bool, default=True\n Set to False to perform inplace transformation and avoid a copy (if the\n input is already a numpy array). If True, a copy of `X` is transformed,\n leaving the original `X` unchanged\n\n ..versionchanged:: 0.23\n The default value of `copy` changed from False to True in 0.23.\n\nReturns\n-------\nXt : {ndarray, sparse matrix} of shape (n_samples, n_features)\n The transformed data.\n\nExamples\n--------\n>>> import numpy as np\n>>> from sklearn.preprocessing import quantile_transform\n>>> rng = np.random.RandomState(0)\n>>> X = np.sort(rng.normal(loc=0.5, scale=0.25, size=(25, 1)), axis=0)\n>>> quantile_transform(X, n_quantiles=10, random_state=0, copy=True)\narray([...])\n\nSee Also\n--------\nQuantileTransformer : Performs quantile-based scaling using the\n Transformer API (e.g. as part of a preprocessing\n :class:`~sklearn.pipeline.Pipeline`).\npower_transform : Maps data to a normal distribution using a\n power transformation.\nscale : Performs standardization that is faster, but less robust\n to outliers.\nrobust_scale : Performs robust standardization that removes the influence\n of outliers but does not put outliers and inliers on the same scale.\n\nNotes\n-----\nNaNs are treated as missing values: disregarded in fit, and maintained in\ntransform.\n\n.. warning:: Risk of data leak\n\n Do not use :func:`~sklearn.preprocessing.quantile_transform` unless\n you know what you are doing. A common mistake is to apply it\n to the entire data *before* splitting into training and\n test sets. 
This will bias the model evaluation because\n information would have leaked from the test set to the\n training set.\n In general, we recommend using\n :class:`~sklearn.preprocessing.QuantileTransformer` within a\n :ref:`Pipeline ` in order to prevent most risks of data\n leaking:`pipe = make_pipeline(QuantileTransformer(),\n LogisticRegression())`.\n\nFor a comparison of the different scalers, transformers, and normalizers,\nsee :ref:`examples/preprocessing/plot_all_scaling.py\n`.", + "description": "Transform features using quantiles information.\n\nThis method transforms the features to follow a uniform or a normal\ndistribution. Therefore, for a given feature, this transformation tends\nto spread out the most frequent values. It also reduces the impact of\n(marginal) outliers: this is therefore a robust preprocessing scheme.\n\nThe transformation is applied on each feature independently. First an\nestimate of the cumulative distribution function of a feature is\nused to map the original values to a uniform distribution. The obtained\nvalues are then mapped to the desired output distribution using the\nassociated quantile function. Features values of new/unseen data that fall\nbelow or above the fitted range will be mapped to the bounds of the output\ndistribution. Note that this transform is non-linear. It may distort linear\ncorrelations between variables measured at the same scale but renders\nvariables measured at different scales more directly comparable.\n\nRead more in the :ref:`User Guide `.", + "docstring": "Transform features using quantiles information.\n\n This method transforms the features to follow a uniform or a normal\n distribution. Therefore, for a given feature, this transformation tends\n to spread out the most frequent values. It also reduces the impact of\n (marginal) outliers: this is therefore a robust preprocessing scheme.\n\n The transformation is applied on each feature independently. First an\n estimate of the cumulative distribution function of a feature is\n used to map the original values to a uniform distribution. The obtained\n values are then mapped to the desired output distribution using the\n associated quantile function. Features values of new/unseen data that fall\n below or above the fitted range will be mapped to the bounds of the output\n distribution. Note that this transform is non-linear. It may distort linear\n correlations between variables measured at the same scale but renders\n variables measured at different scales more directly comparable.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The data to transform.\n\n axis : int, default=0\n Axis used to compute the means and standard deviations along. If 0,\n transform each feature, otherwise (if 1) transform each sample.\n\n n_quantiles : int, default=1000 or n_samples\n Number of quantiles to be computed. It corresponds to the number\n of landmarks used to discretize the cumulative distribution function.\n If n_quantiles is larger than the number of samples, n_quantiles is set\n to the number of samples as a larger number of quantiles does not give\n a better approximation of the cumulative distribution function\n estimator.\n\n output_distribution : {'uniform', 'normal'}, default='uniform'\n Marginal distribution for the transformed data. The choices are\n 'uniform' (default) or 'normal'.\n\n ignore_implicit_zeros : bool, default=False\n Only applies to sparse matrices. 
If True, the sparse entries of the\n matrix are discarded to compute the quantile statistics. If False,\n these entries are treated as zeros.\n\n subsample : int, default=1e5\n Maximum number of samples used to estimate the quantiles for\n computational efficiency. Note that the subsampling procedure may\n differ for value-identical sparse and dense matrices.\n\n random_state : int, RandomState instance or None, default=None\n Determines random number generation for subsampling and smoothing\n noise.\n Please see ``subsample`` for more details.\n Pass an int for reproducible results across multiple function calls.\n See :term:`Glossary `\n\n copy : bool, default=True\n Set to False to perform inplace transformation and avoid a copy (if the\n input is already a numpy array). If True, a copy of `X` is transformed,\n leaving the original `X` unchanged\n\n ..versionchanged:: 0.23\n The default value of `copy` changed from False to True in 0.23.\n\n Returns\n -------\n Xt : {ndarray, sparse matrix} of shape (n_samples, n_features)\n The transformed data.\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.preprocessing import quantile_transform\n >>> rng = np.random.RandomState(0)\n >>> X = np.sort(rng.normal(loc=0.5, scale=0.25, size=(25, 1)), axis=0)\n >>> quantile_transform(X, n_quantiles=10, random_state=0, copy=True)\n array([...])\n\n See Also\n --------\n QuantileTransformer : Performs quantile-based scaling using the\n Transformer API (e.g. as part of a preprocessing\n :class:`~sklearn.pipeline.Pipeline`).\n power_transform : Maps data to a normal distribution using a\n power transformation.\n scale : Performs standardization that is faster, but less robust\n to outliers.\n robust_scale : Performs robust standardization that removes the influence\n of outliers but does not put outliers and inliers on the same scale.\n\n Notes\n -----\n NaNs are treated as missing values: disregarded in fit, and maintained in\n transform.\n\n .. warning:: Risk of data leak\n\n Do not use :func:`~sklearn.preprocessing.quantile_transform` unless\n you know what you are doing. A common mistake is to apply it\n to the entire data *before* splitting into training and\n test sets. This will bias the model evaluation because\n information would have leaked from the test set to the\n training set.\n In general, we recommend using\n :class:`~sklearn.preprocessing.QuantileTransformer` within a\n :ref:`Pipeline ` in order to prevent most risks of data\n leaking:`pipe = make_pipeline(QuantileTransformer(),\n LogisticRegression())`.\n\n For a comparison of the different scalers, transformers, and normalizers,\n see :ref:`examples/preprocessing/plot_all_scaling.py\n `.\n ", "source_code": "\ndef quantile_transform(X, *, axis=0, n_quantiles=1000, output_distribution='uniform', ignore_implicit_zeros=False, subsample=int(100000.0), random_state=None, copy=True):\n \"\"\"Transform features using quantiles information.\n\n This method transforms the features to follow a uniform or a normal\n distribution. Therefore, for a given feature, this transformation tends\n to spread out the most frequent values. It also reduces the impact of\n (marginal) outliers: this is therefore a robust preprocessing scheme.\n\n The transformation is applied on each feature independently. First an\n estimate of the cumulative distribution function of a feature is\n used to map the original values to a uniform distribution. 
The obtained\n values are then mapped to the desired output distribution using the\n associated quantile function. Features values of new/unseen data that fall\n below or above the fitted range will be mapped to the bounds of the output\n distribution. Note that this transform is non-linear. It may distort linear\n correlations between variables measured at the same scale but renders\n variables measured at different scales more directly comparable.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The data to transform.\n\n axis : int, default=0\n Axis used to compute the means and standard deviations along. If 0,\n transform each feature, otherwise (if 1) transform each sample.\n\n n_quantiles : int, default=1000 or n_samples\n Number of quantiles to be computed. It corresponds to the number\n of landmarks used to discretize the cumulative distribution function.\n If n_quantiles is larger than the number of samples, n_quantiles is set\n to the number of samples as a larger number of quantiles does not give\n a better approximation of the cumulative distribution function\n estimator.\n\n output_distribution : {'uniform', 'normal'}, default='uniform'\n Marginal distribution for the transformed data. The choices are\n 'uniform' (default) or 'normal'.\n\n ignore_implicit_zeros : bool, default=False\n Only applies to sparse matrices. If True, the sparse entries of the\n matrix are discarded to compute the quantile statistics. If False,\n these entries are treated as zeros.\n\n subsample : int, default=1e5\n Maximum number of samples used to estimate the quantiles for\n computational efficiency. Note that the subsampling procedure may\n differ for value-identical sparse and dense matrices.\n\n random_state : int, RandomState instance or None, default=None\n Determines random number generation for subsampling and smoothing\n noise.\n Please see ``subsample`` for more details.\n Pass an int for reproducible results across multiple function calls.\n See :term:`Glossary `\n\n copy : bool, default=True\n Set to False to perform inplace transformation and avoid a copy (if the\n input is already a numpy array). If True, a copy of `X` is transformed,\n leaving the original `X` unchanged\n\n ..versionchanged:: 0.23\n The default value of `copy` changed from False to True in 0.23.\n\n Returns\n -------\n Xt : {ndarray, sparse matrix} of shape (n_samples, n_features)\n The transformed data.\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.preprocessing import quantile_transform\n >>> rng = np.random.RandomState(0)\n >>> X = np.sort(rng.normal(loc=0.5, scale=0.25, size=(25, 1)), axis=0)\n >>> quantile_transform(X, n_quantiles=10, random_state=0, copy=True)\n array([...])\n\n See Also\n --------\n QuantileTransformer : Performs quantile-based scaling using the\n Transformer API (e.g. as part of a preprocessing\n :class:`~sklearn.pipeline.Pipeline`).\n power_transform : Maps data to a normal distribution using a\n power transformation.\n scale : Performs standardization that is faster, but less robust\n to outliers.\n robust_scale : Performs robust standardization that removes the influence\n of outliers but does not put outliers and inliers on the same scale.\n\n Notes\n -----\n NaNs are treated as missing values: disregarded in fit, and maintained in\n transform.\n\n .. warning:: Risk of data leak\n\n Do not use :func:`~sklearn.preprocessing.quantile_transform` unless\n you know what you are doing. 
A common mistake is to apply it\n to the entire data *before* splitting into training and\n test sets. This will bias the model evaluation because\n information would have leaked from the test set to the\n training set.\n In general, we recommend using\n :class:`~sklearn.preprocessing.QuantileTransformer` within a\n :ref:`Pipeline ` in order to prevent most risks of data\n leaking:`pipe = make_pipeline(QuantileTransformer(),\n LogisticRegression())`.\n\n For a comparison of the different scalers, transformers, and normalizers,\n see :ref:`examples/preprocessing/plot_all_scaling.py\n `.\n \"\"\"\n n = QuantileTransformer(n_quantiles=n_quantiles, output_distribution=output_distribution, subsample=subsample, ignore_implicit_zeros=ignore_implicit_zeros, random_state=random_state, copy=copy)\n if axis == 0:\n return n.fit_transform(X)\n elif axis == 1:\n return n.fit_transform(X.T).T\n else:\n raise ValueError('axis should be either equal to 0 or 1. Got axis={}'.format(axis))" }, { @@ -152963,6 +164898,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_sample, n_features)", "description": "The data to center and scale." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -152973,7 +164912,8 @@ "docstring": { "type": "int, default=0", "description": "Axis used to compute the medians and IQR along. If 0,\nindependently scale each feature, otherwise (if 1) scale\neach sample." - } + }, + "refined_type": {} }, { "name": "with_centering", @@ -152983,7 +164923,8 @@ "docstring": { "type": "bool, default=True", "description": "If `True`, center the data before scaling." - } + }, + "refined_type": {} }, { "name": "with_scaling", @@ -152993,7 +164934,8 @@ "docstring": { "type": "bool, default=True", "description": "If `True`, scale the data to unit variance (or equivalently,\nunit standard deviation)." - } + }, + "refined_type": {} }, { "name": "quantile_range", @@ -153003,7 +164945,8 @@ "docstring": { "type": "tuple (q_min, q_max), 0.0 < q_min < q_max < 100.0, default=(25.0, 75.0)", "description": "Quantile range used to calculate `scale_`. By default this is equal to\nthe IQR, i.e., `q_min` is the first quantile and `q_max` is the third\nquantile.\n\n.. versionadded:: 0.18" - } + }, + "refined_type": {} }, { "name": "copy", @@ -153013,7 +164956,8 @@ "docstring": { "type": "bool, default=True", "description": "Set to `False` to perform inplace row normalization and avoid a\ncopy (if the input is already a numpy array or a scipy.sparse\nCSR matrix and if axis is 1)." - } + }, + "refined_type": {} }, { "name": "unit_variance", @@ -153023,13 +164967,14 @@ "docstring": { "type": "bool, default=False", "description": "If `True`, scale data so that normally distributed features have a\nvariance of 1. In general, if the difference between the x-values of\n`q_max` and `q_min` for a standard normal distribution is greater\nthan 1, the dataset will be scaled down. If less than 1, the dataset\nwill be scaled up.\n\n.. versionadded:: 0.24" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Standardize a dataset along any axis.\n\nCenter to the median and component wise scale according to the interquartile range. 
Read more in the :ref:`User Guide `.", - "docstring": "Standardize a dataset along any axis.\n\nCenter to the median and component wise scale\naccording to the interquartile range.\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_sample, n_features)\n The data to center and scale.\n\naxis : int, default=0\n Axis used to compute the medians and IQR along. If 0,\n independently scale each feature, otherwise (if 1) scale\n each sample.\n\nwith_centering : bool, default=True\n If `True`, center the data before scaling.\n\nwith_scaling : bool, default=True\n If `True`, scale the data to unit variance (or equivalently,\n unit standard deviation).\n\nquantile_range : tuple (q_min, q_max), 0.0 < q_min < q_max < 100.0, default=(25.0, 75.0)\n Quantile range used to calculate `scale_`. By default this is equal to\n the IQR, i.e., `q_min` is the first quantile and `q_max` is the third\n quantile.\n\n .. versionadded:: 0.18\n\ncopy : bool, default=True\n Set to `False` to perform inplace row normalization and avoid a\n copy (if the input is already a numpy array or a scipy.sparse\n CSR matrix and if axis is 1).\n\nunit_variance : bool, default=False\n If `True`, scale data so that normally distributed features have a\n variance of 1. In general, if the difference between the x-values of\n `q_max` and `q_min` for a standard normal distribution is greater\n than 1, the dataset will be scaled down. If less than 1, the dataset\n will be scaled up.\n\n .. versionadded:: 0.24\n\nReturns\n-------\nX_tr : {ndarray, sparse matrix} of shape (n_samples, n_features)\n The transformed data.\n\nNotes\n-----\nThis implementation will refuse to center scipy.sparse matrices\nsince it would make them non-sparse and would potentially crash the\nprogram with memory exhaustion problems.\n\nInstead the caller is expected to either set explicitly\n`with_centering=False` (in that case, only variance scaling will be\nperformed on the features of the CSR matrix) or to call `X.toarray()`\nif he/she expects the materialized dense array to fit in memory.\n\nTo avoid memory copy the caller should pass a CSR matrix.\n\nFor a comparison of the different scalers, transformers, and normalizers,\nsee :ref:`examples/preprocessing/plot_all_scaling.py\n`.\n\n.. warning:: Risk of data leak\n\n Do not use :func:`~sklearn.preprocessing.robust_scale` unless you know\n what you are doing. A common mistake is to apply it to the entire data\n *before* splitting into training and test sets. This will bias the\n model evaluation because information would have leaked from the test\n set to the training set.\n In general, we recommend using\n :class:`~sklearn.preprocessing.RobustScaler` within a\n :ref:`Pipeline ` in order to prevent most risks of data\n leaking: `pipe = make_pipeline(RobustScaler(), LogisticRegression())`.\n\nSee Also\n--------\nRobustScaler : Performs centering and scaling using the Transformer API\n (e.g. 
as part of a preprocessing :class:`~sklearn.pipeline.Pipeline`).", + "description": "Standardize a dataset along any axis.\n\nCenter to the median and component wise scale\naccording to the interquartile range.\n\nRead more in the :ref:`User Guide `.", + "docstring": "Standardize a dataset along any axis.\n\n Center to the median and component wise scale\n according to the interquartile range.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_sample, n_features)\n The data to center and scale.\n\n axis : int, default=0\n Axis used to compute the medians and IQR along. If 0,\n independently scale each feature, otherwise (if 1) scale\n each sample.\n\n with_centering : bool, default=True\n If `True`, center the data before scaling.\n\n with_scaling : bool, default=True\n If `True`, scale the data to unit variance (or equivalently,\n unit standard deviation).\n\n quantile_range : tuple (q_min, q_max), 0.0 < q_min < q_max < 100.0, default=(25.0, 75.0)\n Quantile range used to calculate `scale_`. By default this is equal to\n the IQR, i.e., `q_min` is the first quantile and `q_max` is the third\n quantile.\n\n .. versionadded:: 0.18\n\n copy : bool, default=True\n Set to `False` to perform inplace row normalization and avoid a\n copy (if the input is already a numpy array or a scipy.sparse\n CSR matrix and if axis is 1).\n\n unit_variance : bool, default=False\n If `True`, scale data so that normally distributed features have a\n variance of 1. In general, if the difference between the x-values of\n `q_max` and `q_min` for a standard normal distribution is greater\n than 1, the dataset will be scaled down. If less than 1, the dataset\n will be scaled up.\n\n .. versionadded:: 0.24\n\n Returns\n -------\n X_tr : {ndarray, sparse matrix} of shape (n_samples, n_features)\n The transformed data.\n\n Notes\n -----\n This implementation will refuse to center scipy.sparse matrices\n since it would make them non-sparse and would potentially crash the\n program with memory exhaustion problems.\n\n Instead the caller is expected to either set explicitly\n `with_centering=False` (in that case, only variance scaling will be\n performed on the features of the CSR matrix) or to call `X.toarray()`\n if he/she expects the materialized dense array to fit in memory.\n\n To avoid memory copy the caller should pass a CSR matrix.\n\n For a comparison of the different scalers, transformers, and normalizers,\n see :ref:`examples/preprocessing/plot_all_scaling.py\n `.\n\n .. warning:: Risk of data leak\n\n Do not use :func:`~sklearn.preprocessing.robust_scale` unless you know\n what you are doing. A common mistake is to apply it to the entire data\n *before* splitting into training and test sets. This will bias the\n model evaluation because information would have leaked from the test\n set to the training set.\n In general, we recommend using\n :class:`~sklearn.preprocessing.RobustScaler` within a\n :ref:`Pipeline ` in order to prevent most risks of data\n leaking: `pipe = make_pipeline(RobustScaler(), LogisticRegression())`.\n\n See Also\n --------\n RobustScaler : Performs centering and scaling using the Transformer API\n (e.g. 
as part of a preprocessing :class:`~sklearn.pipeline.Pipeline`).\n ", "source_code": "\ndef robust_scale(X, *, axis=0, with_centering=True, with_scaling=True, quantile_range=(25.0, 75.0), copy=True, unit_variance=False):\n \"\"\"Standardize a dataset along any axis.\n\n Center to the median and component wise scale\n according to the interquartile range.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_sample, n_features)\n The data to center and scale.\n\n axis : int, default=0\n Axis used to compute the medians and IQR along. If 0,\n independently scale each feature, otherwise (if 1) scale\n each sample.\n\n with_centering : bool, default=True\n If `True`, center the data before scaling.\n\n with_scaling : bool, default=True\n If `True`, scale the data to unit variance (or equivalently,\n unit standard deviation).\n\n quantile_range : tuple (q_min, q_max), 0.0 < q_min < q_max < 100.0, default=(25.0, 75.0)\n Quantile range used to calculate `scale_`. By default this is equal to\n the IQR, i.e., `q_min` is the first quantile and `q_max` is the third\n quantile.\n\n .. versionadded:: 0.18\n\n copy : bool, default=True\n Set to `False` to perform inplace row normalization and avoid a\n copy (if the input is already a numpy array or a scipy.sparse\n CSR matrix and if axis is 1).\n\n unit_variance : bool, default=False\n If `True`, scale data so that normally distributed features have a\n variance of 1. In general, if the difference between the x-values of\n `q_max` and `q_min` for a standard normal distribution is greater\n than 1, the dataset will be scaled down. If less than 1, the dataset\n will be scaled up.\n\n .. versionadded:: 0.24\n\n Returns\n -------\n X_tr : {ndarray, sparse matrix} of shape (n_samples, n_features)\n The transformed data.\n\n Notes\n -----\n This implementation will refuse to center scipy.sparse matrices\n since it would make them non-sparse and would potentially crash the\n program with memory exhaustion problems.\n\n Instead the caller is expected to either set explicitly\n `with_centering=False` (in that case, only variance scaling will be\n performed on the features of the CSR matrix) or to call `X.toarray()`\n if he/she expects the materialized dense array to fit in memory.\n\n To avoid memory copy the caller should pass a CSR matrix.\n\n For a comparison of the different scalers, transformers, and normalizers,\n see :ref:`examples/preprocessing/plot_all_scaling.py\n `.\n\n .. warning:: Risk of data leak\n\n Do not use :func:`~sklearn.preprocessing.robust_scale` unless you know\n what you are doing. A common mistake is to apply it to the entire data\n *before* splitting into training and test sets. This will bias the\n model evaluation because information would have leaked from the test\n set to the training set.\n In general, we recommend using\n :class:`~sklearn.preprocessing.RobustScaler` within a\n :ref:`Pipeline ` in order to prevent most risks of data\n leaking: `pipe = make_pipeline(RobustScaler(), LogisticRegression())`.\n\n See Also\n --------\n RobustScaler : Performs centering and scaling using the Transformer API\n (e.g. 
as part of a preprocessing :class:`~sklearn.pipeline.Pipeline`).\n \"\"\"\n X = check_array(X, accept_sparse=('csr', 'csc'), copy=False, ensure_2d=False, dtype=FLOAT_DTYPES, force_all_finite='allow-nan')\n original_ndim = X.ndim\n if original_ndim == 1:\n X = X.reshape(X.shape[0], 1)\n s = RobustScaler(with_centering=with_centering, with_scaling=with_scaling, quantile_range=quantile_range, unit_variance=unit_variance, copy=copy)\n if axis == 0:\n X = s.fit_transform(X)\n else:\n X = s.fit_transform(X.T).T\n if original_ndim == 1:\n X = X.ravel()\n return X" }, { @@ -153047,6 +164992,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "The data to center and scale." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -153057,7 +165006,8 @@ "docstring": { "type": "int, default=0", "description": "axis used to compute the means and standard deviations along. If 0,\nindependently standardize each feature, otherwise (if 1) standardize\neach sample." - } + }, + "refined_type": {} }, { "name": "with_mean", @@ -153067,7 +165017,8 @@ "docstring": { "type": "bool, default=True", "description": "If True, center the data before scaling." - } + }, + "refined_type": {} }, { "name": "with_std", @@ -153077,7 +165028,8 @@ "docstring": { "type": "bool, default=True", "description": "If True, scale the data to unit variance (or equivalently,\nunit standard deviation)." - } + }, + "refined_type": {} }, { "name": "copy", @@ -153087,13 +165039,14 @@ "docstring": { "type": "bool, default=True", "description": "set to False to perform inplace row normalization and avoid a\ncopy (if the input is already a numpy array or a scipy.sparse\nCSC matrix and if axis is 1)." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Standardize a dataset along any axis.\n\nCenter to the mean and component wise scale to unit variance. Read more in the :ref:`User Guide `.", - "docstring": "Standardize a dataset along any axis.\n\nCenter to the mean and component wise scale to unit variance.\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n The data to center and scale.\n\naxis : int, default=0\n axis used to compute the means and standard deviations along. 
If 0,\n independently standardize each feature, otherwise (if 1) standardize\n each sample.\n\nwith_mean : bool, default=True\n If True, center the data before scaling.\n\nwith_std : bool, default=True\n If True, scale the data to unit variance (or equivalently,\n unit standard deviation).\n\ncopy : bool, default=True\n set to False to perform inplace row normalization and avoid a\n copy (if the input is already a numpy array or a scipy.sparse\n CSC matrix and if axis is 1).\n\nReturns\n-------\nX_tr : {ndarray, sparse matrix} of shape (n_samples, n_features)\n The transformed data.\n\nNotes\n-----\nThis implementation will refuse to center scipy.sparse matrices\nsince it would make them non-sparse and would potentially crash the\nprogram with memory exhaustion problems.\n\nInstead the caller is expected to either set explicitly\n`with_mean=False` (in that case, only variance scaling will be\nperformed on the features of the CSC matrix) or to call `X.toarray()`\nif he/she expects the materialized dense array to fit in memory.\n\nTo avoid memory copy the caller should pass a CSC matrix.\n\nNaNs are treated as missing values: disregarded to compute the statistics,\nand maintained during the data transformation.\n\nWe use a biased estimator for the standard deviation, equivalent to\n`numpy.std(x, ddof=0)`. Note that the choice of `ddof` is unlikely to\naffect model performance.\n\nFor a comparison of the different scalers, transformers, and normalizers,\nsee :ref:`examples/preprocessing/plot_all_scaling.py\n`.\n\n.. warning:: Risk of data leak\n\n Do not use :func:`~sklearn.preprocessing.scale` unless you know\n what you are doing. A common mistake is to apply it to the entire data\n *before* splitting into training and test sets. This will bias the\n model evaluation because information would have leaked from the test\n set to the training set.\n In general, we recommend using\n :class:`~sklearn.preprocessing.StandardScaler` within a\n :ref:`Pipeline ` in order to prevent most risks of data\n leaking: `pipe = make_pipeline(StandardScaler(), LogisticRegression())`.\n\nSee Also\n--------\nStandardScaler : Performs scaling to unit variance using the Transformer\n API (e.g. as part of a preprocessing\n :class:`~sklearn.pipeline.Pipeline`).", + "description": "Standardize a dataset along any axis.\n\nCenter to the mean and component wise scale to unit variance.\n\nRead more in the :ref:`User Guide `.", + "docstring": "Standardize a dataset along any axis.\n\n Center to the mean and component wise scale to unit variance.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The data to center and scale.\n\n axis : int, default=0\n axis used to compute the means and standard deviations along. 
If 0,\n independently standardize each feature, otherwise (if 1) standardize\n each sample.\n\n with_mean : bool, default=True\n If True, center the data before scaling.\n\n with_std : bool, default=True\n If True, scale the data to unit variance (or equivalently,\n unit standard deviation).\n\n copy : bool, default=True\n set to False to perform inplace row normalization and avoid a\n copy (if the input is already a numpy array or a scipy.sparse\n CSC matrix and if axis is 1).\n\n Returns\n -------\n X_tr : {ndarray, sparse matrix} of shape (n_samples, n_features)\n The transformed data.\n\n Notes\n -----\n This implementation will refuse to center scipy.sparse matrices\n since it would make them non-sparse and would potentially crash the\n program with memory exhaustion problems.\n\n Instead the caller is expected to either set explicitly\n `with_mean=False` (in that case, only variance scaling will be\n performed on the features of the CSC matrix) or to call `X.toarray()`\n if he/she expects the materialized dense array to fit in memory.\n\n To avoid memory copy the caller should pass a CSC matrix.\n\n NaNs are treated as missing values: disregarded to compute the statistics,\n and maintained during the data transformation.\n\n We use a biased estimator for the standard deviation, equivalent to\n `numpy.std(x, ddof=0)`. Note that the choice of `ddof` is unlikely to\n affect model performance.\n\n For a comparison of the different scalers, transformers, and normalizers,\n see :ref:`examples/preprocessing/plot_all_scaling.py\n `.\n\n .. warning:: Risk of data leak\n\n Do not use :func:`~sklearn.preprocessing.scale` unless you know\n what you are doing. A common mistake is to apply it to the entire data\n *before* splitting into training and test sets. This will bias the\n model evaluation because information would have leaked from the test\n set to the training set.\n In general, we recommend using\n :class:`~sklearn.preprocessing.StandardScaler` within a\n :ref:`Pipeline ` in order to prevent most risks of data\n leaking: `pipe = make_pipeline(StandardScaler(), LogisticRegression())`.\n\n See Also\n --------\n StandardScaler : Performs scaling to unit variance using the Transformer\n API (e.g. as part of a preprocessing\n :class:`~sklearn.pipeline.Pipeline`).\n\n ", "source_code": "\ndef scale(X, *, axis=0, with_mean=True, with_std=True, copy=True):\n \"\"\"Standardize a dataset along any axis.\n\n Center to the mean and component wise scale to unit variance.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The data to center and scale.\n\n axis : int, default=0\n axis used to compute the means and standard deviations along. 
If 0,\n independently standardize each feature, otherwise (if 1) standardize\n each sample.\n\n with_mean : bool, default=True\n If True, center the data before scaling.\n\n with_std : bool, default=True\n If True, scale the data to unit variance (or equivalently,\n unit standard deviation).\n\n copy : bool, default=True\n set to False to perform inplace row normalization and avoid a\n copy (if the input is already a numpy array or a scipy.sparse\n CSC matrix and if axis is 1).\n\n Returns\n -------\n X_tr : {ndarray, sparse matrix} of shape (n_samples, n_features)\n The transformed data.\n\n Notes\n -----\n This implementation will refuse to center scipy.sparse matrices\n since it would make them non-sparse and would potentially crash the\n program with memory exhaustion problems.\n\n Instead the caller is expected to either set explicitly\n `with_mean=False` (in that case, only variance scaling will be\n performed on the features of the CSC matrix) or to call `X.toarray()`\n if he/she expects the materialized dense array to fit in memory.\n\n To avoid memory copy the caller should pass a CSC matrix.\n\n NaNs are treated as missing values: disregarded to compute the statistics,\n and maintained during the data transformation.\n\n We use a biased estimator for the standard deviation, equivalent to\n `numpy.std(x, ddof=0)`. Note that the choice of `ddof` is unlikely to\n affect model performance.\n\n For a comparison of the different scalers, transformers, and normalizers,\n see :ref:`examples/preprocessing/plot_all_scaling.py\n `.\n\n .. warning:: Risk of data leak\n\n Do not use :func:`~sklearn.preprocessing.scale` unless you know\n what you are doing. A common mistake is to apply it to the entire data\n *before* splitting into training and test sets. This will bias the\n model evaluation because information would have leaked from the test\n set to the training set.\n In general, we recommend using\n :class:`~sklearn.preprocessing.StandardScaler` within a\n :ref:`Pipeline ` in order to prevent most risks of data\n leaking: `pipe = make_pipeline(StandardScaler(), LogisticRegression())`.\n\n See Also\n --------\n StandardScaler : Performs scaling to unit variance using the Transformer\n API (e.g. as part of a preprocessing\n :class:`~sklearn.pipeline.Pipeline`).\n\n \"\"\"\n X = check_array(X, accept_sparse='csc', copy=copy, ensure_2d=False, estimator='the scale function', dtype=FLOAT_DTYPES, force_all_finite='allow-nan')\n if sparse.issparse(X):\n if with_mean:\n raise ValueError('Cannot center sparse matrices: pass `with_mean=False` instead See docstring for motivation and alternatives.')\n if axis != 0:\n raise ValueError('Can only scale sparse matrix on axis=0, got axis=%d' % axis)\n if with_std:\n (_, var) = mean_variance_axis(X, axis=0)\n var = _handle_zeros_in_scale(var, copy=False)\n inplace_column_scale(X, 1 / np.sqrt(var))\n else:\n X = np.asarray(X)\n if with_mean:\n mean_ = np.nanmean(X, axis)\n if with_std:\n scale_ = np.nanstd(X, axis)\n Xr = np.rollaxis(X, axis)\n if with_mean:\n Xr -= mean_\n mean_1 = np.nanmean(Xr, axis=0)\n if not np.allclose(mean_1, 0):\n warnings.warn('Numerical issues were encountered when centering the data and might not be solved. Dataset may contain too large values. 
You may need to prescale your features.')\n Xr -= mean_1\n if with_std:\n scale_ = _handle_zeros_in_scale(scale_, copy=False)\n Xr /= scale_\n if with_mean:\n mean_2 = np.nanmean(Xr, axis=0)\n if not np.allclose(mean_2, 0):\n warnings.warn('Numerical issues were encountered when scaling the data and might not be solved. The standard deviation of the data is probably very close to 0. ')\n Xr -= mean_2\n return X" }, { @@ -153111,7 +165064,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_bins", @@ -153121,7 +165075,8 @@ "docstring": { "type": "int or array-like of shape (n_features,), default=5", "description": "The number of bins to produce. Raises ValueError if ``n_bins < 2``." - } + }, + "refined_type": {} }, { "name": "encode", @@ -153130,7 +165085,11 @@ "assigned_by": "NAME_ONLY", "docstring": { "type": "{'onehot', 'onehot-dense', 'ordinal'}, default='onehot'", - "description": "Method used to encode the transformed result.\n\nonehot\n Encode the transformed result with one-hot encoding\n and return a sparse matrix. Ignored features are always\n stacked to the right.\nonehot-dense\n Encode the transformed result with one-hot encoding\n and return a dense array. Ignored features are always\n stacked to the right.\nordinal\n Return the bin identifier encoded as an integer value." + "description": "Method used to encode the transformed result.\n\n- 'onehot': Encode the transformed result with one-hot encoding\n and return a sparse matrix. Ignored features are always\n stacked to the right.\n- 'onehot-dense': Encode the transformed result with one-hot encoding\n and return a dense array. Ignored features are always\n stacked to the right.\n- 'ordinal': Return the bin identifier encoded as an integer value." + }, + "refined_type": { + "kind": "EnumType", + "values": ["onehot-dense", "onehot", "ordinal"] } }, { @@ -153140,7 +165099,11 @@ "assigned_by": "NAME_ONLY", "docstring": { "type": "{'uniform', 'quantile', 'kmeans'}, default='quantile'", - "description": "Strategy used to define the widths of the bins.\n\nuniform\n All bins in each feature have identical widths.\nquantile\n All bins in each feature have the same number of points.\nkmeans\n Values in each bin have the same nearest center of a 1D k-means\n cluster." + "description": "Strategy used to define the widths of the bins.\n\n- 'uniform': All bins in each feature have identical widths.\n- 'quantile': All bins in each feature have the same number of points.\n- 'kmeans': Values in each bin have the same nearest center of a 1D\n k-means cluster." + }, + "refined_type": { + "kind": "EnumType", + "values": ["kmeans", "quantile", "uniform"] } }, { @@ -153151,13 +165114,17 @@ "docstring": { "type": "{np.float32, np.float64}, default=None", "description": "The desired data-type for the output. If None, output dtype is\nconsistent with input dtype. Only np.float32 and np.float64 are\nsupported.\n\n.. 
versionadded:: 0.24" + }, + "refined_type": { + "kind": "EnumType", + "values": [] } } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, n_bins=5, *, encode='onehot', strategy='quantile', dtype=None):\n self.n_bins = n_bins\n self.encode = encode\n self.strategy = strategy\n self.dtype = dtype" }, { @@ -153175,7 +165142,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_features", @@ -153185,7 +165153,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -153209,7 +165178,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -153219,7 +165189,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "Data to be discretized." - } + }, + "refined_type": {} }, { "name": "y", @@ -153229,13 +165200,14 @@ "docstring": { "type": "None", "description": "Ignored. This parameter exists only for compatibility with\n:class:`~sklearn.pipeline.Pipeline`." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Fit the estimator.", - "docstring": "Fit the estimator.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n Data to be discretized.\n\ny : None\n Ignored. This parameter exists only for compatibility with\n :class:`~sklearn.pipeline.Pipeline`.\n\nReturns\n-------\nself : object\n Returns the instance itself.", + "docstring": "\n Fit the estimator.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Data to be discretized.\n\n y : None\n Ignored. This parameter exists only for compatibility with\n :class:`~sklearn.pipeline.Pipeline`.\n\n Returns\n -------\n self : object\n Returns the instance itself.\n ", "source_code": "\ndef fit(self, X, y=None):\n \"\"\"\n Fit the estimator.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Data to be discretized.\n\n y : None\n Ignored. This parameter exists only for compatibility with\n :class:`~sklearn.pipeline.Pipeline`.\n\n Returns\n -------\n self : object\n Returns the instance itself.\n \"\"\"\n X = self._validate_data(X, dtype='numeric')\n supported_dtype = (np.float64, np.float32)\n if self.dtype in supported_dtype:\n output_dtype = self.dtype\n elif self.dtype is None:\n output_dtype = X.dtype\n else:\n raise ValueError(f\"Valid options for 'dtype' are {supported_dtype + (None, )}. Got dtype={self.dtype} instead.\")\n valid_encode = ('onehot', 'onehot-dense', 'ordinal')\n if self.encode not in valid_encode:\n raise ValueError(\"Valid options for 'encode' are {}. Got encode={!r} instead.\".format(valid_encode, self.encode))\n valid_strategy = ('uniform', 'quantile', 'kmeans')\n if self.strategy not in valid_strategy:\n raise ValueError(\"Valid options for 'strategy' are {}. Got strategy={!r} instead.\".format(valid_strategy, self.strategy))\n n_features = X.shape[1]\n n_bins = self._validate_n_bins(n_features)\n bin_edges = np.zeros(n_features, dtype=object)\n for jj in range(n_features):\n column = X[:, jj]\n (col_min, col_max) = (column.min(), column.max())\n if col_min == col_max:\n warnings.warn('Feature %d is constant and will be replaced with 0.' 
% jj)\n n_bins[jj] = 1\n bin_edges[jj] = np.array([-np.inf, np.inf])\n continue\n if self.strategy == 'uniform':\n bin_edges[jj] = np.linspace(col_min, col_max, n_bins[jj] + 1)\n elif self.strategy == 'quantile':\n quantiles = np.linspace(0, 100, n_bins[jj] + 1)\n bin_edges[jj] = np.asarray(np.percentile(column, quantiles))\n elif self.strategy == 'kmeans':\n from ..cluster import KMeans\n uniform_edges = np.linspace(col_min, col_max, n_bins[jj] + 1)\n init = (uniform_edges[1:] + uniform_edges[:-1])[:, None] * 0.5\n km = KMeans(n_clusters=n_bins[jj], init=init, n_init=1, algorithm='full')\n centers = km.fit(column[:, None]).cluster_centers_[:, 0]\n centers.sort()\n bin_edges[jj] = (centers[1:] + centers[:-1]) * 0.5\n bin_edges[jj] = np.r_[col_min, bin_edges[jj], col_max]\n if self.strategy in ('quantile', 'kmeans'):\n mask = np.ediff1d(bin_edges[jj], to_begin=np.inf) > 1e-08\n bin_edges[jj] = bin_edges[jj][mask]\n if len(bin_edges[jj]) - 1 != n_bins[jj]:\n warnings.warn('Bins whose width are too small (i.e., <= 1e-8) in feature %d are removed. Consider decreasing the number of bins.' % jj)\n n_bins[jj] = len(bin_edges[jj]) - 1\n self.bin_edges_ = bin_edges\n self.n_bins_ = n_bins\n if 'onehot' in self.encode:\n self._encoder = OneHotEncoder(categories=[np.arange(i) for i in self.n_bins_], sparse=self.encode == 'onehot', dtype=output_dtype)\n self._encoder.fit(np.zeros((1, len(self.n_bins_))))\n return self" }, { @@ -153253,7 +165225,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "input_features", @@ -153263,13 +165236,14 @@ "docstring": { "type": "array-like of str or None, default=None", "description": "Input features.\n\n- If `input_features` is `None`, then `feature_names_in_` is\n used as feature names in. If `feature_names_in_` is not defined,\n then names are generated: `[x0, x1, ..., x(n_features_in_)]`.\n- If `input_features` is an array-like, then `input_features` must\n match `feature_names_in_` if `feature_names_in_` is defined." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Get output feature names.", - "docstring": "Get output feature names.\n\nParameters\n----------\ninput_features : array-like of str or None, default=None\n Input features.\n\n - If `input_features` is `None`, then `feature_names_in_` is\n used as feature names in. If `feature_names_in_` is not defined,\n then names are generated: `[x0, x1, ..., x(n_features_in_)]`.\n - If `input_features` is an array-like, then `input_features` must\n match `feature_names_in_` if `feature_names_in_` is defined.\n\nReturns\n-------\nfeature_names_out : ndarray of str objects\n Transformed feature names.", + "docstring": "Get output feature names.\n\n Parameters\n ----------\n input_features : array-like of str or None, default=None\n Input features.\n\n - If `input_features` is `None`, then `feature_names_in_` is\n used as feature names in. 
If `feature_names_in_` is not defined,\n then names are generated: `[x0, x1, ..., x(n_features_in_)]`.\n - If `input_features` is an array-like, then `input_features` must\n match `feature_names_in_` if `feature_names_in_` is defined.\n\n Returns\n -------\n feature_names_out : ndarray of str objects\n Transformed feature names.\n ", "source_code": "\ndef get_feature_names_out(self, input_features=None):\n \"\"\"Get output feature names.\n\n Parameters\n ----------\n input_features : array-like of str or None, default=None\n Input features.\n\n - If `input_features` is `None`, then `feature_names_in_` is\n used as feature names in. If `feature_names_in_` is not defined,\n then names are generated: `[x0, x1, ..., x(n_features_in_)]`.\n - If `input_features` is an array-like, then `input_features` must\n match `feature_names_in_` if `feature_names_in_` is defined.\n\n Returns\n -------\n feature_names_out : ndarray of str objects\n Transformed feature names.\n \"\"\"\n input_features = _check_feature_names_in(self, input_features)\n return self._encoder.get_feature_names_out(input_features)" }, { @@ -153287,7 +165261,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "Xt", @@ -153297,13 +165272,14 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "Transformed data in the binned space." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Transform discretized data back to original feature space.\n\nNote that this function does not regenerate the original data due to discretization rounding.", - "docstring": "Transform discretized data back to original feature space.\n\nNote that this function does not regenerate the original data\ndue to discretization rounding.\n\nParameters\n----------\nXt : array-like of shape (n_samples, n_features)\n Transformed data in the binned space.\n\nReturns\n-------\nXinv : ndarray, dtype={np.float32, np.float64}\n Data in the original feature space.", + "description": "Transform discretized data back to original feature space.\n\nNote that this function does not regenerate the original data\ndue to discretization rounding.", + "docstring": "\n Transform discretized data back to original feature space.\n\n Note that this function does not regenerate the original data\n due to discretization rounding.\n\n Parameters\n ----------\n Xt : array-like of shape (n_samples, n_features)\n Transformed data in the binned space.\n\n Returns\n -------\n Xinv : ndarray, dtype={np.float32, np.float64}\n Data in the original feature space.\n ", "source_code": "\ndef inverse_transform(self, Xt):\n \"\"\"\n Transform discretized data back to original feature space.\n\n Note that this function does not regenerate the original data\n due to discretization rounding.\n\n Parameters\n ----------\n Xt : array-like of shape (n_samples, n_features)\n Transformed data in the binned space.\n\n Returns\n -------\n Xinv : ndarray, dtype={np.float32, np.float64}\n Data in the original feature space.\n \"\"\"\n check_is_fitted(self)\n if 'onehot' in self.encode:\n Xt = self._encoder.inverse_transform(Xt)\n Xinv = check_array(Xt, copy=True, dtype=(np.float64, np.float32))\n n_features = self.n_bins_.shape[0]\n if Xinv.shape[1] != n_features:\n raise ValueError('Incorrect number of features. 
Expecting {}, received {}.'.format(n_features, Xinv.shape[1]))\n for jj in range(n_features):\n bin_edges = self.bin_edges_[jj]\n bin_centers = (bin_edges[1:] + bin_edges[:-1]) * 0.5\n Xinv[:, jj] = bin_centers[np.int_(Xinv[:, jj])]\n return Xinv" }, { @@ -153321,7 +165297,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -153331,13 +165308,14 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "Data to be discretized." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Discretize the data.", - "docstring": "Discretize the data.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n Data to be discretized.\n\nReturns\n-------\nXt : {ndarray, sparse matrix}, dtype={np.float32, np.float64}\n Data in the binned space. Will be a sparse matrix if\n `self.encode='onehot'` and ndarray otherwise.", + "docstring": "\n Discretize the data.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Data to be discretized.\n\n Returns\n -------\n Xt : {ndarray, sparse matrix}, dtype={np.float32, np.float64}\n Data in the binned space. Will be a sparse matrix if\n `self.encode='onehot'` and ndarray otherwise.\n ", "source_code": "\ndef transform(self, X):\n \"\"\"\n Discretize the data.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Data to be discretized.\n\n Returns\n -------\n Xt : {ndarray, sparse matrix}, dtype={np.float32, np.float64}\n Data in the binned space. Will be a sparse matrix if\n `self.encode='onehot'` and ndarray otherwise.\n \"\"\"\n check_is_fitted(self)\n dtype = (np.float64, np.float32) if self.dtype is None else self.dtype\n Xt = self._validate_data(X, copy=True, dtype=dtype, reset=False)\n bin_edges = self.bin_edges_\n for jj in range(Xt.shape[1]):\n rtol = 1e-05\n atol = 1e-08\n eps = atol + rtol * np.abs(Xt[:, jj])\n Xt[:, jj] = np.digitize(Xt[:, jj] + eps, bin_edges[jj][1:])\n np.clip(Xt, 0, self.n_bins_ - 1, out=Xt)\n if self.encode == 'ordinal':\n return Xt\n dtype_init = None\n if 'onehot' in self.encode:\n dtype_init = self._encoder.dtype\n self._encoder.dtype = Xt.dtype\n try:\n Xt_enc = self._encoder.transform(Xt)\n finally:\n self._encoder.dtype = dtype_init\n return Xt_enc" }, { @@ -153355,7 +165333,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "categories", @@ -153365,7 +165344,8 @@ "docstring": { "type": "'auto' or a list of array-like, default='auto'", "description": "Categories (unique values) per feature:\n\n- 'auto' : Determine categories automatically from the training data.\n- list : ``categories[i]`` holds the categories expected in the ith\n column. The passed categories should not mix strings and numeric\n values within a single feature, and should be sorted in case of\n numeric values.\n\nThe used categories can be found in the ``categories_`` attribute.\n\n.. versionadded:: 0.20" - } + }, + "refined_type": {} }, { "name": "drop", @@ -153373,8 +165353,12 @@ "is_public": true, "assigned_by": "NAME_ONLY", "docstring": { - "type": "{'first', 'if_binary'} or a array-like of shape (n_features,), default=None", + "type": "{'first', 'if_binary'} or an array-like of shape (n_features,), default=None", "description": "Specifies a methodology to use to drop one of the categories per\nfeature. 
This is useful in situations where perfectly collinear\nfeatures cause problems, such as when feeding the resulting data\ninto a neural network or an unregularized regression.\n\nHowever, dropping one category breaks the symmetry of the original\nrepresentation and can therefore induce a bias in downstream models,\nfor instance for penalized linear classification or regression models.\n\n- None : retain all features (the default).\n- 'first' : drop the first category in each feature. If only one\n category is present, the feature will be dropped entirely.\n- 'if_binary' : drop the first category in each feature with two\n categories. Features with 1 or more than 2 categories are\n left intact.\n- array : ``drop[i]`` is the category in feature ``X[:, i]`` that\n should be dropped.\n\n.. versionadded:: 0.21\n The parameter `drop` was added in 0.21.\n\n.. versionchanged:: 0.23\n The option `drop='if_binary'` was added in 0.23." + }, + "refined_type": { + "kind": "EnumType", + "values": ["first", "if_binary"] } }, { @@ -153385,7 +165369,8 @@ "docstring": { "type": "bool, default=True", "description": "Will return sparse matrix if set True else will return an array." - } + }, + "refined_type": {} }, { "name": "dtype", @@ -153395,7 +165380,8 @@ "docstring": { "type": "number type, default=float", "description": "Desired dtype of output." - } + }, + "refined_type": {} }, { "name": "handle_unknown", @@ -153405,13 +165391,17 @@ "docstring": { "type": "{'error', 'ignore'}, default='error'", "description": "Whether to raise an error or ignore if an unknown categorical feature\nis present during transform (default is to raise). When this parameter\nis set to 'ignore' and an unknown category is encountered during\ntransform, the resulting one-hot encoded columns for this feature\nwill be all zeros. In the inverse transform, an unknown category\nwill be denoted as None." + }, + "refined_type": { + "kind": "EnumType", + "values": ["error", "ignore"] } } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, *, categories='auto', drop=None, sparse=True, dtype=np.float64, handle_unknown='error'):\n self.categories = categories\n self.sparse = sparse\n self.dtype = dtype\n self.handle_unknown = handle_unknown\n self.drop = drop" }, { @@ -153429,13 +165419,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _compute_drop_idx(self):\n if self.drop is None:\n return None\n elif isinstance(self.drop, str):\n if self.drop == 'first':\n return np.zeros(len(self.categories_), dtype=object)\n elif self.drop == 'if_binary':\n return np.array([0 if len(cats) == 2 else None for cats in self.categories_], dtype=object)\n else:\n msg = \"Wrong input for parameter `drop`. Expected 'first', 'if_binary', None or array of objects, got {}\"\n raise ValueError(msg.format(type(self.drop)))\n else:\n try:\n drop_array = np.asarray(self.drop, dtype=object)\n droplen = len(drop_array)\n except (ValueError, TypeError):\n msg = \"Wrong input for parameter `drop`. 
Expected 'first', 'if_binary', None or array of objects, got {}\"\n raise ValueError(msg.format(type(drop_array)))\n if droplen != len(self.categories_):\n msg = '`drop` should have length equal to the number of features ({}), got {}'\n raise ValueError(msg.format(len(self.categories_), droplen))\n missing_drops = []\n drop_indices = []\n for (col_idx, (val, cat_list)) in enumerate(zip(drop_array, self.categories_)):\n if not is_scalar_nan(val):\n drop_idx = np.where(cat_list == val)[0]\n if drop_idx.size:\n drop_indices.append(drop_idx[0])\n else:\n missing_drops.append((col_idx, val))\n continue\n for (cat_idx, cat) in enumerate(cat_list):\n if is_scalar_nan(cat):\n drop_indices.append(cat_idx)\n break\n else:\n missing_drops.append((col_idx, val))\n if any(missing_drops):\n msg = 'The following categories were supposed to be dropped, but were not found in the training data.\\n{}'.format('\\n'.join(['Category: {}, Feature: {}'.format(c, v) for (c, v) in missing_drops]))\n raise ValueError(msg)\n return np.array(drop_indices, dtype=object)" }, { @@ -153453,13 +165444,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _validate_keywords(self):\n if self.handle_unknown not in ('error', 'ignore'):\n msg = \"handle_unknown should be either 'error' or 'ignore', got {0}.\".format(self.handle_unknown)\n raise ValueError(msg)" }, { @@ -153477,7 +165469,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -153487,7 +165480,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "The data to determine the categories of each feature." - } + }, + "refined_type": {} }, { "name": "y", @@ -153497,13 +165491,14 @@ "docstring": { "type": "None", "description": "Ignored. This parameter exists only for compatibility with\n:class:`~sklearn.pipeline.Pipeline`." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Fit OneHotEncoder to X.", - "docstring": "Fit OneHotEncoder to X.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n The data to determine the categories of each feature.\n\ny : None\n Ignored. This parameter exists only for compatibility with\n :class:`~sklearn.pipeline.Pipeline`.\n\nReturns\n-------\nself\n Fitted encoder.", + "docstring": "\n Fit OneHotEncoder to X.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The data to determine the categories of each feature.\n\n y : None\n Ignored. This parameter exists only for compatibility with\n :class:`~sklearn.pipeline.Pipeline`.\n\n Returns\n -------\n self\n Fitted encoder.\n ", "source_code": "\ndef fit(self, X, y=None):\n \"\"\"\n Fit OneHotEncoder to X.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The data to determine the categories of each feature.\n\n y : None\n Ignored. 
This parameter exists only for compatibility with\n :class:`~sklearn.pipeline.Pipeline`.\n\n Returns\n -------\n self\n Fitted encoder.\n \"\"\"\n self._validate_keywords()\n self._fit(X, handle_unknown=self.handle_unknown, force_all_finite='allow-nan')\n self.drop_idx_ = self._compute_drop_idx()\n return self" }, { @@ -153521,7 +165516,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -153531,7 +165527,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "The data to encode." - } + }, + "refined_type": {} }, { "name": "y", @@ -153541,13 +165538,14 @@ "docstring": { "type": "None", "description": "Ignored. This parameter exists only for compatibility with\n:class:`~sklearn.pipeline.Pipeline`." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Fit OneHotEncoder to X, then transform X.\n\nEquivalent to fit(X).transform(X) but more convenient.", - "docstring": "Fit OneHotEncoder to X, then transform X.\n\nEquivalent to fit(X).transform(X) but more convenient.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n The data to encode.\n\ny : None\n Ignored. This parameter exists only for compatibility with\n :class:`~sklearn.pipeline.Pipeline`.\n\nReturns\n-------\nX_out : {ndarray, sparse matrix} of shape (n_samples, n_encoded_features)\n Transformed input. If `sparse=True`, a sparse matrix will be\n returned.", + "docstring": "\n Fit OneHotEncoder to X, then transform X.\n\n Equivalent to fit(X).transform(X) but more convenient.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The data to encode.\n\n y : None\n Ignored. This parameter exists only for compatibility with\n :class:`~sklearn.pipeline.Pipeline`.\n\n Returns\n -------\n X_out : {ndarray, sparse matrix} of shape (n_samples, n_encoded_features)\n Transformed input. If `sparse=True`, a sparse matrix will be\n returned.\n ", "source_code": "\ndef fit_transform(self, X, y=None):\n \"\"\"\n Fit OneHotEncoder to X, then transform X.\n\n Equivalent to fit(X).transform(X) but more convenient.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The data to encode.\n\n y : None\n Ignored. This parameter exists only for compatibility with\n :class:`~sklearn.pipeline.Pipeline`.\n\n Returns\n -------\n X_out : {ndarray, sparse matrix} of shape (n_samples, n_encoded_features)\n Transformed input. If `sparse=True`, a sparse matrix will be\n returned.\n \"\"\"\n self._validate_keywords()\n return super().fit_transform(X, y)" }, { @@ -153567,7 +165565,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "input_features", @@ -153577,13 +165576,14 @@ "docstring": { "type": "list of str of shape (n_features,)", "description": "String names for input features if available. By default,\n\"x0\", \"x1\", ... \"xn_features\" is used." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Return feature names for output features.", - "docstring": "Return feature names for output features.\n\nParameters\n----------\ninput_features : list of str of shape (n_features,)\n String names for input features if available. By default,\n \"x0\", \"x1\", ... 
\"xn_features\" is used.\n\nReturns\n-------\noutput_feature_names : ndarray of shape (n_output_features,)\n Array of feature names.", + "docstring": "Return feature names for output features.\n\n Parameters\n ----------\n input_features : list of str of shape (n_features,)\n String names for input features if available. By default,\n \"x0\", \"x1\", ... \"xn_features\" is used.\n\n Returns\n -------\n output_feature_names : ndarray of shape (n_output_features,)\n Array of feature names.\n ", "source_code": "\n@deprecated('get_feature_names is deprecated in 1.0 and will be removed in 1.2. Please use get_feature_names_out instead.')\ndef get_feature_names(self, input_features=None):\n \"\"\"Return feature names for output features.\n\n Parameters\n ----------\n input_features : list of str of shape (n_features,)\n String names for input features if available. By default,\n \"x0\", \"x1\", ... \"xn_features\" is used.\n\n Returns\n -------\n output_feature_names : ndarray of shape (n_output_features,)\n Array of feature names.\n \"\"\"\n check_is_fitted(self)\n cats = self.categories_\n if input_features is None:\n input_features = ['x%d' % i for i in range(len(cats))]\n elif len(input_features) != len(self.categories_):\n raise ValueError('input_features should have length equal to number of features ({}), got {}'.format(len(self.categories_), len(input_features)))\n feature_names = []\n for i in range(len(cats)):\n names = [input_features[i] + '_' + str(t) for t in cats[i]]\n if self.drop_idx_ is not None and self.drop_idx_[i] is not None:\n names.pop(self.drop_idx_[i])\n feature_names.extend(names)\n return np.array(feature_names, dtype=object)" }, { @@ -153601,7 +165601,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "input_features", @@ -153611,13 +165612,14 @@ "docstring": { "type": "array-like of str or None, default=None", "description": "Input features.\n\n- If `input_features` is `None`, then `feature_names_in_` is\n used as feature names in. If `feature_names_in_` is not defined,\n then names are generated: `[x0, x1, ..., x(n_features_in_)]`.\n- If `input_features` is an array-like, then `input_features` must\n match `feature_names_in_` if `feature_names_in_` is defined." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Get output feature names for transformation.", - "docstring": "Get output feature names for transformation.\n\nParameters\n----------\ninput_features : array-like of str or None, default=None\n Input features.\n\n - If `input_features` is `None`, then `feature_names_in_` is\n used as feature names in. If `feature_names_in_` is not defined,\n then names are generated: `[x0, x1, ..., x(n_features_in_)]`.\n - If `input_features` is an array-like, then `input_features` must\n match `feature_names_in_` if `feature_names_in_` is defined.\n\nReturns\n-------\nfeature_names_out : ndarray of str objects\n Transformed feature names.", + "docstring": "Get output feature names for transformation.\n\n Parameters\n ----------\n input_features : array-like of str or None, default=None\n Input features.\n\n - If `input_features` is `None`, then `feature_names_in_` is\n used as feature names in. 
If `feature_names_in_` is not defined,\n then names are generated: `[x0, x1, ..., x(n_features_in_)]`.\n - If `input_features` is an array-like, then `input_features` must\n match `feature_names_in_` if `feature_names_in_` is defined.\n\n Returns\n -------\n feature_names_out : ndarray of str objects\n Transformed feature names.\n ", "source_code": "\ndef get_feature_names_out(self, input_features=None):\n \"\"\"Get output feature names for transformation.\n\n Parameters\n ----------\n input_features : array-like of str or None, default=None\n Input features.\n\n - If `input_features` is `None`, then `feature_names_in_` is\n used as feature names in. If `feature_names_in_` is not defined,\n then names are generated: `[x0, x1, ..., x(n_features_in_)]`.\n - If `input_features` is an array-like, then `input_features` must\n match `feature_names_in_` if `feature_names_in_` is defined.\n\n Returns\n -------\n feature_names_out : ndarray of str objects\n Transformed feature names.\n \"\"\"\n check_is_fitted(self)\n cats = self.categories_\n input_features = _check_feature_names_in(self, input_features)\n feature_names = []\n for i in range(len(cats)):\n names = [input_features[i] + '_' + str(t) for t in cats[i]]\n if self.drop_idx_ is not None and self.drop_idx_[i] is not None:\n names.pop(self.drop_idx_[i])\n feature_names.extend(names)\n return np.asarray(feature_names, dtype=object)" }, { @@ -153635,7 +165637,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -153645,14 +165648,18 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_encoded_features)", "description": "The transformed data." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } } ], "results": [], "is_public": true, - "description": "Convert the data back to the original representation.\n\nWhen unknown categories are encountered (all zeros in the one-hot encoding), ``None`` is used to represent this category. If the feature with the unknown category has a dropped caregory, the dropped category will be its inverse.", - "docstring": "Convert the data back to the original representation.\n\nWhen unknown categories are encountered (all zeros in the\none-hot encoding), ``None`` is used to represent this category. If the\nfeature with the unknown category has a dropped caregory, the dropped\ncategory will be its inverse.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_encoded_features)\n The transformed data.\n\nReturns\n-------\nX_tr : ndarray of shape (n_samples, n_features)\n Inverse transformed array.", - "source_code": "\ndef inverse_transform(self, X):\n \"\"\"\n Convert the data back to the original representation.\n\n When unknown categories are encountered (all zeros in the\n one-hot encoding), ``None`` is used to represent this category. 
If the\n feature with the unknown category has a dropped caregory, the dropped\n category will be its inverse.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_encoded_features)\n The transformed data.\n\n Returns\n -------\n X_tr : ndarray of shape (n_samples, n_features)\n Inverse transformed array.\n \"\"\"\n check_is_fitted(self)\n X = check_array(X, accept_sparse='csr')\n (n_samples, _) = X.shape\n n_features = len(self.categories_)\n if self.drop_idx_ is None:\n n_transformed_features = sum((len(cats) for cats in self.categories_))\n else:\n n_transformed_features = sum((len(cats) - 1 if to_drop is not None else len(cats) for (cats, to_drop) in zip(self.categories_, self.drop_idx_)))\n msg = 'Shape of the passed X data is not correct. Expected {0} columns, got {1}.'\n if X.shape[1] != n_transformed_features:\n raise ValueError(msg.format(n_transformed_features, X.shape[1]))\n dt = np.find_common_type([cat.dtype for cat in self.categories_], [])\n X_tr = np.empty((n_samples, n_features), dtype=dt)\n j = 0\n found_unknown = {}\n for i in range(n_features):\n if self.drop_idx_ is None or self.drop_idx_[i] is None:\n cats = self.categories_[i]\n else:\n cats = np.delete(self.categories_[i], self.drop_idx_[i])\n n_categories = len(cats)\n if n_categories == 0:\n X_tr[:, i] = self.categories_[i][self.drop_idx_[i]]\n j += n_categories\n continue\n sub = X[:, j:j + n_categories]\n labels = np.asarray(sub.argmax(axis=1)).flatten()\n X_tr[:, i] = cats[labels]\n if self.handle_unknown == 'ignore':\n unknown = np.asarray(sub.sum(axis=1) == 0).flatten()\n if unknown.any():\n if self.drop_idx_ is None or self.drop_idx_[i] is None:\n found_unknown[i] = unknown\n else:\n X_tr[unknown, i] = self.categories_[i][self.drop_idx_[i]]\n else:\n dropped = np.asarray(sub.sum(axis=1) == 0).flatten()\n if dropped.any():\n if self.drop_idx_ is None:\n all_zero_samples = np.flatnonzero(dropped)\n raise ValueError(f\"Samples {all_zero_samples} can not be inverted when drop=None and handle_unknown='error' because they contain all zeros\")\n X_tr[dropped, i] = self.categories_[i][self.drop_idx_[i]]\n j += n_categories\n if found_unknown:\n if X_tr.dtype != object:\n X_tr = X_tr.astype(object)\n for (idx, mask) in found_unknown.items():\n X_tr[mask, idx] = None\n return X_tr" + "description": "Convert the data back to the original representation.\n\nWhen unknown categories are encountered (all zeros in the\none-hot encoding), ``None`` is used to represent this category. If the\nfeature with the unknown category has a dropped category, the dropped\ncategory will be its inverse.", + "docstring": "\n Convert the data back to the original representation.\n\n When unknown categories are encountered (all zeros in the\n one-hot encoding), ``None`` is used to represent this category. If the\n feature with the unknown category has a dropped category, the dropped\n category will be its inverse.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_encoded_features)\n The transformed data.\n\n Returns\n -------\n X_tr : ndarray of shape (n_samples, n_features)\n Inverse transformed array.\n ", + "source_code": "\ndef inverse_transform(self, X):\n \"\"\"\n Convert the data back to the original representation.\n\n When unknown categories are encountered (all zeros in the\n one-hot encoding), ``None`` is used to represent this category. 
If the\n feature with the unknown category has a dropped category, the dropped\n category will be its inverse.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_encoded_features)\n The transformed data.\n\n Returns\n -------\n X_tr : ndarray of shape (n_samples, n_features)\n Inverse transformed array.\n \"\"\"\n check_is_fitted(self)\n X = check_array(X, accept_sparse='csr')\n (n_samples, _) = X.shape\n n_features = len(self.categories_)\n if self.drop_idx_ is None:\n n_transformed_features = sum((len(cats) for cats in self.categories_))\n else:\n n_transformed_features = sum((len(cats) - 1 if to_drop is not None else len(cats) for (cats, to_drop) in zip(self.categories_, self.drop_idx_)))\n msg = 'Shape of the passed X data is not correct. Expected {0} columns, got {1}.'\n if X.shape[1] != n_transformed_features:\n raise ValueError(msg.format(n_transformed_features, X.shape[1]))\n dt = np.find_common_type([cat.dtype for cat in self.categories_], [])\n X_tr = np.empty((n_samples, n_features), dtype=dt)\n j = 0\n found_unknown = {}\n for i in range(n_features):\n if self.drop_idx_ is None or self.drop_idx_[i] is None:\n cats = self.categories_[i]\n else:\n cats = np.delete(self.categories_[i], self.drop_idx_[i])\n n_categories = len(cats)\n if n_categories == 0:\n X_tr[:, i] = self.categories_[i][self.drop_idx_[i]]\n j += n_categories\n continue\n sub = X[:, j:j + n_categories]\n labels = np.asarray(sub.argmax(axis=1)).flatten()\n X_tr[:, i] = cats[labels]\n if self.handle_unknown == 'ignore':\n unknown = np.asarray(sub.sum(axis=1) == 0).flatten()\n if unknown.any():\n if self.drop_idx_ is None or self.drop_idx_[i] is None:\n found_unknown[i] = unknown\n else:\n X_tr[unknown, i] = self.categories_[i][self.drop_idx_[i]]\n else:\n dropped = np.asarray(sub.sum(axis=1) == 0).flatten()\n if dropped.any():\n if self.drop_idx_ is None:\n all_zero_samples = np.flatnonzero(dropped)\n raise ValueError(f\"Samples {all_zero_samples} can not be inverted when drop=None and handle_unknown='error' because they contain all zeros\")\n X_tr[dropped, i] = self.categories_[i][self.drop_idx_[i]]\n j += n_categories\n if found_unknown:\n if X_tr.dtype != object:\n X_tr = X_tr.astype(object)\n for (idx, mask) in found_unknown.items():\n X_tr[mask, idx] = None\n return X_tr" }, { "name": "transform", @@ -153669,7 +165676,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -153679,14 +165687,15 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "The data to encode." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Transform X using one-hot encoding.", - "docstring": "Transform X using one-hot encoding.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n The data to encode.\n\nReturns\n-------\nX_out : {ndarray, sparse matrix} of shape (n_samples, n_encoded_features)\n Transformed input. If `sparse=True`, a sparse matrix will be\n returned.", - "source_code": "\ndef transform(self, X):\n \"\"\"\n Transform X using one-hot encoding.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The data to encode.\n\n Returns\n -------\n X_out : {ndarray, sparse matrix} of shape (n_samples, n_encoded_features)\n Transformed input. 
If `sparse=True`, a sparse matrix will be\n returned.\n \"\"\"\n check_is_fitted(self)\n warn_on_unknown = self.handle_unknown == 'ignore' and self.drop is not None\n (X_int, X_mask) = self._transform(X, handle_unknown=self.handle_unknown, force_all_finite='allow-nan', warn_on_unknown=warn_on_unknown)\n (n_samples, n_features) = X_int.shape\n if self.drop_idx_ is not None:\n to_drop = self.drop_idx_.copy()\n keep_cells = X_int != to_drop\n n_values = []\n for (i, cats) in enumerate(self.categories_):\n n_cats = len(cats)\n if to_drop[i] is None:\n to_drop[i] = n_cats\n n_values.append(n_cats)\n else:\n n_values.append(n_cats - 1)\n to_drop = to_drop.reshape(1, -1)\n X_int[X_int > to_drop] -= 1\n X_mask &= keep_cells\n else:\n n_values = [len(cats) for cats in self.categories_]\n mask = X_mask.ravel()\n feature_indices = np.cumsum([0] + n_values)\n indices = (X_int + feature_indices[:-1]).ravel()[mask]\n indptr = np.empty(n_samples + 1, dtype=int)\n indptr[0] = 0\n np.sum(X_mask, axis=1, out=indptr[1:])\n np.cumsum(indptr[1:], out=indptr[1:])\n data = np.ones(indptr[-1])\n out = sparse.csr_matrix((data, indices, indptr), shape=(n_samples, feature_indices[-1]), dtype=self.dtype)\n if not self.sparse:\n return out.toarray()\n else:\n return out" + "docstring": "\n Transform X using one-hot encoding.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The data to encode.\n\n Returns\n -------\n X_out : {ndarray, sparse matrix} of shape (n_samples, n_encoded_features)\n Transformed input. If `sparse=True`, a sparse matrix will be\n returned.\n ", + "source_code": "\ndef transform(self, X):\n \"\"\"\n Transform X using one-hot encoding.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The data to encode.\n\n Returns\n -------\n X_out : {ndarray, sparse matrix} of shape (n_samples, n_encoded_features)\n Transformed input. 
If `sparse=True`, a sparse matrix will be\n returned.\n \"\"\"\n check_is_fitted(self)\n warn_on_unknown = self.handle_unknown == 'ignore' and self.drop is not None\n (X_int, X_mask) = self._transform(X, handle_unknown=self.handle_unknown, force_all_finite='allow-nan', warn_on_unknown=warn_on_unknown)\n (n_samples, n_features) = X_int.shape\n if self.drop_idx_ is not None:\n to_drop = self.drop_idx_.copy()\n keep_cells = X_int != to_drop\n n_values = []\n for (i, cats) in enumerate(self.categories_):\n n_cats = len(cats)\n if to_drop[i] is None:\n to_drop[i] = n_cats\n n_values.append(n_cats)\n else:\n n_values.append(n_cats - 1)\n to_drop = to_drop.reshape(1, -1)\n X_int[X_int > to_drop] -= 1\n X_mask &= keep_cells\n else:\n n_values = [len(cats) for cats in self.categories_]\n mask = X_mask.ravel()\n feature_indices = np.cumsum([0] + n_values)\n indices = (X_int + feature_indices[:-1]).ravel()[mask]\n indptr = np.empty(n_samples + 1, dtype=int)\n indptr[0] = 0\n np.sum(X_mask, axis=1, out=indptr[1:], dtype=indptr.dtype)\n np.cumsum(indptr[1:], out=indptr[1:])\n data = np.ones(indptr[-1])\n out = sparse.csr_matrix((data, indices, indptr), shape=(n_samples, feature_indices[-1]), dtype=self.dtype)\n if not self.sparse:\n return out.toarray()\n else:\n return out" }, { "name": "__init__", @@ -153703,7 +165712,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "categories", @@ -153713,7 +165723,8 @@ "docstring": { "type": "'auto' or a list of array-like, default='auto'", "description": "Categories (unique values) per feature:\n\n- 'auto' : Determine categories automatically from the training data.\n- list : ``categories[i]`` holds the categories expected in the ith\n column. The passed categories should not mix strings and numeric\n values, and should be sorted in case of numeric values.\n\nThe used categories can be found in the ``categories_`` attribute." - } + }, + "refined_type": {} }, { "name": "dtype", @@ -153723,7 +165734,8 @@ "docstring": { "type": "number type, default np.float64", "description": "Desired dtype of output." - } + }, + "refined_type": {} }, { "name": "handle_unknown", @@ -153733,6 +165745,10 @@ "docstring": { "type": "{'error', 'use_encoded_value'}, default='error'", "description": "When set to 'error' an error will be raised in case an unknown\ncategorical feature is present during transform. When set to\n'use_encoded_value', the encoded value of unknown categories will be\nset to the value given for the parameter `unknown_value`. In\n:meth:`inverse_transform`, an unknown category will be denoted as None.\n\n.. versionadded:: 0.24" + }, + "refined_type": { + "kind": "EnumType", + "values": ["error", "use_encoded_value"] } }, { @@ -153743,13 +165759,14 @@ "docstring": { "type": "int or np.nan, default=None", "description": "When the parameter handle_unknown is set to 'use_encoded_value', this\nparameter is required and will set the encoded value of unknown\ncategories. It has to be distinct from the values used to encode any of\nthe categories in `fit`. If set to np.nan, the `dtype` parameter must\nbe a float dtype.\n\n.. 
versionadded:: 0.24" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, *, categories='auto', dtype=np.float64, handle_unknown='error', unknown_value=None):\n self.categories = categories\n self.dtype = dtype\n self.handle_unknown = handle_unknown\n self.unknown_value = unknown_value" }, { @@ -153767,7 +165784,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -153777,7 +165795,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "The data to determine the categories of each feature." - } + }, + "refined_type": {} }, { "name": "y", @@ -153787,13 +165806,14 @@ "docstring": { "type": "None", "description": "Ignored. This parameter exists only for compatibility with\n:class:`~sklearn.pipeline.Pipeline`." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Fit the OrdinalEncoder to X.", - "docstring": "Fit the OrdinalEncoder to X.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n The data to determine the categories of each feature.\n\ny : None\n Ignored. This parameter exists only for compatibility with\n :class:`~sklearn.pipeline.Pipeline`.\n\nReturns\n-------\nself : object\n Fitted encoder.", + "docstring": "\n Fit the OrdinalEncoder to X.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The data to determine the categories of each feature.\n\n y : None\n Ignored. This parameter exists only for compatibility with\n :class:`~sklearn.pipeline.Pipeline`.\n\n Returns\n -------\n self : object\n Fitted encoder.\n ", "source_code": "\ndef fit(self, X, y=None):\n \"\"\"\n Fit the OrdinalEncoder to X.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The data to determine the categories of each feature.\n\n y : None\n Ignored. This parameter exists only for compatibility with\n :class:`~sklearn.pipeline.Pipeline`.\n\n Returns\n -------\n self : object\n Fitted encoder.\n \"\"\"\n handle_unknown_strategies = ('error', 'use_encoded_value')\n if self.handle_unknown not in handle_unknown_strategies:\n raise ValueError(f\"handle_unknown should be either 'error' or 'use_encoded_value', got {self.handle_unknown}.\")\n if self.handle_unknown == 'use_encoded_value':\n if is_scalar_nan(self.unknown_value):\n if np.dtype(self.dtype).kind != 'f':\n raise ValueError(f'When unknown_value is np.nan, the dtype parameter should be a float dtype. 
Got {self.dtype}.')\n elif not isinstance(self.unknown_value, numbers.Integral):\n raise TypeError(f\"unknown_value should be an integer or np.nan when handle_unknown is 'use_encoded_value', got {self.unknown_value}.\")\n elif self.unknown_value is not None:\n raise TypeError(f\"unknown_value should only be set when handle_unknown is 'use_encoded_value', got {self.unknown_value}.\")\n self._fit(X, handle_unknown=self.handle_unknown, force_all_finite='allow-nan')\n if self.handle_unknown == 'use_encoded_value':\n for feature_cats in self.categories_:\n if 0 <= self.unknown_value < len(feature_cats):\n raise ValueError(f'The used value for unknown_value {self.unknown_value} is one of the values already used for encoding the seen categories.')\n self._missing_indices = {}\n for (cat_idx, categories_for_idx) in enumerate(self.categories_):\n for (i, cat) in enumerate(categories_for_idx):\n if is_scalar_nan(cat):\n self._missing_indices[cat_idx] = i\n continue\n if np.dtype(self.dtype).kind != 'f' and self._missing_indices:\n raise ValueError(f'There are missing values in features {list(self._missing_indices)}. For OrdinalEncoder to passthrough missing values, the dtype parameter must be a float')\n return self" }, { @@ -153811,7 +165831,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -153821,13 +165842,14 @@ "docstring": { "type": "array-like of shape (n_samples, n_encoded_features)", "description": "The transformed data." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Convert the data back to the original representation.", - "docstring": "Convert the data back to the original representation.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_encoded_features)\n The transformed data.\n\nReturns\n-------\nX_tr : ndarray of shape (n_samples, n_features)\n Inverse transformed array.", + "docstring": "\n Convert the data back to the original representation.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_encoded_features)\n The transformed data.\n\n Returns\n -------\n X_tr : ndarray of shape (n_samples, n_features)\n Inverse transformed array.\n ", "source_code": "\ndef inverse_transform(self, X):\n \"\"\"\n Convert the data back to the original representation.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_encoded_features)\n The transformed data.\n\n Returns\n -------\n X_tr : ndarray of shape (n_samples, n_features)\n Inverse transformed array.\n \"\"\"\n check_is_fitted(self)\n X = check_array(X, force_all_finite='allow-nan')\n (n_samples, _) = X.shape\n n_features = len(self.categories_)\n msg = 'Shape of the passed X data is not correct. 
Expected {0} columns, got {1}.'\n if X.shape[1] != n_features:\n raise ValueError(msg.format(n_features, X.shape[1]))\n dt = np.find_common_type([cat.dtype for cat in self.categories_], [])\n X_tr = np.empty((n_samples, n_features), dtype=dt)\n found_unknown = {}\n for i in range(n_features):\n labels = X[:, i].astype('int64', copy=False)\n if i in self._missing_indices:\n X_i_mask = _get_mask(X[:, i], np.nan)\n labels[X_i_mask] = self._missing_indices[i]\n if self.handle_unknown == 'use_encoded_value':\n unknown_labels = labels == self.unknown_value\n X_tr[:, i] = self.categories_[i][np.where(unknown_labels, 0, labels)]\n found_unknown[i] = unknown_labels\n else:\n X_tr[:, i] = self.categories_[i][labels]\n if found_unknown:\n X_tr = X_tr.astype(object, copy=False)\n for (idx, mask) in found_unknown.items():\n X_tr[mask, idx] = None\n return X_tr" }, { @@ -153845,7 +165867,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -153855,13 +165878,14 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "The data to encode." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Transform X to ordinal codes.", - "docstring": "Transform X to ordinal codes.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n The data to encode.\n\nReturns\n-------\nX_out : ndarray of shape (n_samples, n_features)\n Transformed input.", + "docstring": "\n Transform X to ordinal codes.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The data to encode.\n\n Returns\n -------\n X_out : ndarray of shape (n_samples, n_features)\n Transformed input.\n ", "source_code": "\ndef transform(self, X):\n \"\"\"\n Transform X to ordinal codes.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The data to encode.\n\n Returns\n -------\n X_out : ndarray of shape (n_samples, n_features)\n Transformed input.\n \"\"\"\n (X_int, X_mask) = self._transform(X, handle_unknown=self.handle_unknown, force_all_finite='allow-nan')\n X_trans = X_int.astype(self.dtype, copy=False)\n for (cat_idx, missing_idx) in self._missing_indices.items():\n X_missing_mask = X_int[:, cat_idx] == missing_idx\n X_trans[X_missing_mask, cat_idx] = np.nan\n if self.handle_unknown == 'use_encoded_value':\n X_trans[~X_mask] = self.unknown_value\n return X_trans" }, { @@ -153879,7 +165903,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -153889,7 +165914,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "force_all_finite", @@ -153899,14 +165925,15 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Perform custom check_array: - convert list of strings to object dtype - check for missing values for object dtype data (check_array does not do that) - return list of features (arrays): this list of features is constructed feature by feature to preserve the data types of pandas DataFrame columns, as otherwise information is lost and cannot be used, eg for the `categories_` attribute.", - "docstring": "Perform custom check_array:\n- convert list of strings to object dtype\n- check for missing values for object dtype data (check_array does\n not do that)\n- return list of features (arrays): this list of features is\n constructed feature by feature to preserve the data types\n of pandas DataFrame 
columns, as otherwise information is lost\n and cannot be used, eg for the `categories_` attribute.", - "source_code": "\ndef _check_X(self, X, force_all_finite=True):\n \"\"\"\n Perform custom check_array:\n - convert list of strings to object dtype\n - check for missing values for object dtype data (check_array does\n not do that)\n - return list of features (arrays): this list of features is\n constructed feature by feature to preserve the data types\n of pandas DataFrame columns, as otherwise information is lost\n and cannot be used, eg for the `categories_` attribute.\n\n \"\"\"\n if not (hasattr(X, 'iloc') and getattr(X, 'ndim', 0) == 2):\n X_temp = check_array(X, dtype=None, force_all_finite=force_all_finite)\n if not hasattr(X, 'dtype') and np.issubdtype(X_temp.dtype, np.str_):\n X = check_array(X, dtype=object, force_all_finite=force_all_finite)\n else:\n X = X_temp\n needs_validation = False\n else:\n needs_validation = force_all_finite\n (n_samples, n_features) = X.shape\n X_columns = []\n for i in range(n_features):\n Xi = self._get_feature(X, feature_idx=i)\n Xi = check_array(Xi, ensure_2d=False, dtype=None, force_all_finite=needs_validation)\n X_columns.append(Xi)\n return X_columns, n_samples, n_features" + "description": "Perform custom check_array:\n- convert list of strings to object dtype\n- check for missing values for object dtype data (check_array does\n not do that)\n- return list of features (arrays): this list of features is\n constructed feature by feature to preserve the data types\n of pandas DataFrame columns, as otherwise information is lost\n and cannot be used, e.g. for the `categories_` attribute.", + "docstring": "\n Perform custom check_array:\n - convert list of strings to object dtype\n - check for missing values for object dtype data (check_array does\n not do that)\n - return list of features (arrays): this list of features is\n constructed feature by feature to preserve the data types\n of pandas DataFrame columns, as otherwise information is lost\n and cannot be used, e.g. for the `categories_` attribute.\n\n ", + "source_code": "\ndef _check_X(self, X, force_all_finite=True):\n \"\"\"\n Perform custom check_array:\n - convert list of strings to object dtype\n - check for missing values for object dtype data (check_array does\n not do that)\n - return list of features (arrays): this list of features is\n constructed feature by feature to preserve the data types\n of pandas DataFrame columns, as otherwise information is lost\n and cannot be used, e.g. 
for the `categories_` attribute.\n\n \"\"\"\n if not (hasattr(X, 'iloc') and getattr(X, 'ndim', 0) == 2):\n X_temp = check_array(X, dtype=None, force_all_finite=force_all_finite)\n if not hasattr(X, 'dtype') and np.issubdtype(X_temp.dtype, np.str_):\n X = check_array(X, dtype=object, force_all_finite=force_all_finite)\n else:\n X = X_temp\n needs_validation = False\n else:\n needs_validation = force_all_finite\n (n_samples, n_features) = X.shape\n X_columns = []\n for i in range(n_features):\n Xi = self._get_feature(X, feature_idx=i)\n Xi = check_array(Xi, ensure_2d=False, dtype=None, force_all_finite=needs_validation)\n X_columns.append(Xi)\n return X_columns, n_samples, n_features" }, { "name": "_fit", @@ -153923,7 +165950,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -153933,7 +165961,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "handle_unknown", @@ -153943,7 +165972,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "force_all_finite", @@ -153953,13 +165983,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _fit(self, X, handle_unknown='error', force_all_finite=True):\n self._check_n_features(X, reset=True)\n self._check_feature_names(X, reset=True)\n (X_list, n_samples, n_features) = self._check_X(X, force_all_finite=force_all_finite)\n self.n_features_in_ = n_features\n if self.categories != 'auto':\n if len(self.categories) != n_features:\n raise ValueError('Shape mismatch: if categories is an array, it has to be of shape (n_features,).')\n self.categories_ = []\n for i in range(n_features):\n Xi = X_list[i]\n if self.categories == 'auto':\n cats = _unique(Xi)\n else:\n cats = np.array(self.categories[i], dtype=Xi.dtype)\n if Xi.dtype.kind not in 'OUS':\n sorted_cats = np.sort(cats)\n error_msg = 'Unsorted categories are not supported for numerical categories'\n stop_idx = -1 if np.isnan(sorted_cats[-1]) else None\n if np.any(sorted_cats[:stop_idx] != cats[:stop_idx]) or np.isnan(sorted_cats[-1]) and not np.isnan(sorted_cats[-1]):\n raise ValueError(error_msg)\n if handle_unknown == 'error':\n diff = _check_unknown(Xi, cats)\n if diff:\n msg = 'Found unknown categories {0} in column {1} during fit'.format(diff, i)\n raise ValueError(msg)\n self.categories_.append(cats)" }, { @@ -153977,7 +166008,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -153987,7 +166019,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "feature_idx", @@ -153997,13 +166030,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _get_feature(self, X, feature_idx):\n if hasattr(X, 'iloc'):\n return X.iloc[:, feature_idx]\n return X[:, feature_idx]" }, { @@ -154021,13 +166055,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _more_tags(self):\n return {'X_types': ['categorical']}" }, { @@ -154045,7 +166080,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -154055,7 +166091,8 @@ 
"docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "handle_unknown", @@ -154065,7 +166102,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "force_all_finite", @@ -154075,7 +166113,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "warn_on_unknown", @@ -154085,13 +166124,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _transform(self, X, handle_unknown='error', force_all_finite=True, warn_on_unknown=False):\n self._check_feature_names(X, reset=False)\n self._check_n_features(X, reset=False)\n (X_list, n_samples, n_features) = self._check_X(X, force_all_finite=force_all_finite)\n X_int = np.zeros((n_samples, n_features), dtype=int)\n X_mask = np.ones((n_samples, n_features), dtype=bool)\n columns_with_unknown = []\n for i in range(n_features):\n Xi = X_list[i]\n (diff, valid_mask) = _check_unknown(Xi, self.categories_[i], return_mask=True)\n if not np.all(valid_mask):\n if handle_unknown == 'error':\n msg = 'Found unknown categories {0} in column {1} during transform'.format(diff, i)\n raise ValueError(msg)\n else:\n if warn_on_unknown:\n columns_with_unknown.append(i)\n X_mask[:, i] = valid_mask\n if self.categories_[i].dtype.kind in ('U', 'S') and self.categories_[i].itemsize > Xi.itemsize:\n Xi = Xi.astype(self.categories_[i].dtype)\n elif self.categories_[i].dtype.kind == 'O' and Xi.dtype.kind == 'U':\n Xi = Xi.astype('O')\n else:\n Xi = Xi.copy()\n Xi[~valid_mask] = self.categories_[i][0]\n X_int[:, i] = _encode(Xi, uniques=self.categories_[i], check_unknown=False)\n if columns_with_unknown:\n warnings.warn(f'Found unknown categories in columns {columns_with_unknown} during transform. These unknown categories will be encoded as all zeros', UserWarning)\n return X_int, X_mask" }, { @@ -154109,7 +166149,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "func", @@ -154119,7 +166160,8 @@ "docstring": { "type": "callable, default=None", "description": "The callable to use for the transformation. This will be passed\nthe same arguments as transform, with args and kwargs forwarded.\nIf func is None, then func will be the identity function." - } + }, + "refined_type": {} }, { "name": "inverse_func", @@ -154129,7 +166171,8 @@ "docstring": { "type": "callable, default=None", "description": "The callable to use for the inverse transformation. This will be\npassed the same arguments as inverse transform, with args and\nkwargs forwarded. If inverse_func is None, then inverse_func\nwill be the identity function." - } + }, + "refined_type": {} }, { "name": "validate", @@ -154139,7 +166182,8 @@ "docstring": { "type": "bool, default=False", "description": "Indicate that the input X array should be checked before calling\n``func``. The possibilities are:\n\n- If False, there is no input validation.\n- If True, then X will be converted to a 2-dimensional NumPy array or\n sparse matrix. If the conversion is not possible an exception is\n raised.\n\n.. versionchanged:: 0.22\n The default of ``validate`` changed from True to False." - } + }, + "refined_type": {} }, { "name": "accept_sparse", @@ -154149,7 +166193,8 @@ "docstring": { "type": "bool, default=False", "description": "Indicate that func accepts a sparse matrix as input. If validate is\nFalse, this has no effect. 
Otherwise, if accept_sparse is false,\nsparse matrix inputs will cause an exception to be raised." - } + }, + "refined_type": {} }, { "name": "check_inverse", @@ -154159,7 +166204,8 @@ "docstring": { "type": "bool, default=True", "description": "Whether to check that or ``func`` followed by ``inverse_func`` leads to\nthe original inputs. It can be used for a sanity check, raising a\nwarning when the condition is not fulfilled.\n\n.. versionadded:: 0.20" - } + }, + "refined_type": {} }, { "name": "kw_args", @@ -154169,7 +166215,8 @@ "docstring": { "type": "dict, default=None", "description": "Dictionary of additional keyword arguments to pass to func.\n\n.. versionadded:: 0.18" - } + }, + "refined_type": {} }, { "name": "inv_kw_args", @@ -154179,13 +166226,14 @@ "docstring": { "type": "dict, default=None", "description": "Dictionary of additional keyword arguments to pass to inverse_func.\n\n.. versionadded:: 0.18" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, func=None, inverse_func=None, *, validate=False, accept_sparse=False, check_inverse=True, kw_args=None, inv_kw_args=None):\n self.func = func\n self.inverse_func = inverse_func\n self.validate = validate\n self.accept_sparse = accept_sparse\n self.check_inverse = check_inverse\n self.kw_args = kw_args\n self.inv_kw_args = inv_kw_args" }, { @@ -154203,7 +166251,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -154227,7 +166276,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -154237,7 +166287,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "reset", @@ -154247,13 +166298,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _check_input(self, X, *, reset):\n if self.validate:\n return self._validate_data(X, accept_sparse=self.accept_sparse, reset=reset)\n return X" }, { @@ -154271,7 +166323,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -154281,7 +166334,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -154305,13 +166359,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _more_tags(self):\n return {'no_validation': not self.validate, 'stateless': True}" }, { @@ -154329,7 +166384,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -154339,7 +166395,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "func", @@ -154349,7 +166406,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "kw_args", @@ -154359,13 +166417,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _transform(self, X, func=None, kw_args=None):\n if func is None:\n func = _identity\n return func(X, **kw_args if kw_args else {})" }, { @@ -154383,7 +166442,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, 
{ "name": "X", @@ -154393,7 +166453,8 @@ "docstring": { "type": "array-like, shape (n_samples, n_features)", "description": "Input array." - } + }, + "refined_type": {} }, { "name": "y", @@ -154403,13 +166464,14 @@ "docstring": { "type": "Ignored", "description": "Not used, present here for API consistency by convention." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Fit transformer by checking X.\n\nIf ``validate`` is ``True``, ``X`` will be checked.", - "docstring": "Fit transformer by checking X.\n\nIf ``validate`` is ``True``, ``X`` will be checked.\n\nParameters\n----------\nX : array-like, shape (n_samples, n_features)\n Input array.\n\ny : Ignored\n Not used, present here for API consistency by convention.\n\nReturns\n-------\nself : object\n FunctionTransformer class instance.", + "docstring": "Fit transformer by checking X.\n\n If ``validate`` is ``True``, ``X`` will be checked.\n\n Parameters\n ----------\n X : array-like, shape (n_samples, n_features)\n Input array.\n\n y : Ignored\n Not used, present here for API consistency by convention.\n\n Returns\n -------\n self : object\n FunctionTransformer class instance.\n ", "source_code": "\ndef fit(self, X, y=None):\n \"\"\"Fit transformer by checking X.\n\n If ``validate`` is ``True``, ``X`` will be checked.\n\n Parameters\n ----------\n X : array-like, shape (n_samples, n_features)\n Input array.\n\n y : Ignored\n Not used, present here for API consistency by convention.\n\n Returns\n -------\n self : object\n FunctionTransformer class instance.\n \"\"\"\n X = self._check_input(X, reset=True)\n if self.check_inverse and not (self.func is None or self.inverse_func is None):\n self._check_inverse_transform(X)\n return self" }, { @@ -154427,7 +166489,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -154437,13 +166500,14 @@ "docstring": { "type": "array-like, shape (n_samples, n_features)", "description": "Input array." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Transform X using the inverse function.", - "docstring": "Transform X using the inverse function.\n\nParameters\n----------\nX : array-like, shape (n_samples, n_features)\n Input array.\n\nReturns\n-------\nX_out : array-like, shape (n_samples, n_features)\n Transformed input.", + "docstring": "Transform X using the inverse function.\n\n Parameters\n ----------\n X : array-like, shape (n_samples, n_features)\n Input array.\n\n Returns\n -------\n X_out : array-like, shape (n_samples, n_features)\n Transformed input.\n ", "source_code": "\ndef inverse_transform(self, X):\n \"\"\"Transform X using the inverse function.\n\n Parameters\n ----------\n X : array-like, shape (n_samples, n_features)\n Input array.\n\n Returns\n -------\n X_out : array-like, shape (n_samples, n_features)\n Transformed input.\n \"\"\"\n if self.validate:\n X = check_array(X, accept_sparse=self.accept_sparse)\n return self._transform(X, func=self.inverse_func, kw_args=self.inv_kw_args)" }, { @@ -154461,7 +166525,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -154471,13 +166536,14 @@ "docstring": { "type": "array-like, shape (n_samples, n_features)", "description": "Input array." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Transform X using the forward function.", - "docstring": "Transform X using the forward function.\n\nParameters\n----------\nX : array-like, shape (n_samples, n_features)\n Input array.\n\nReturns\n-------\nX_out : array-like, shape (n_samples, n_features)\n Transformed input.", + "docstring": "Transform X using the forward function.\n\n Parameters\n ----------\n X : array-like, shape (n_samples, n_features)\n Input array.\n\n Returns\n -------\n X_out : array-like, shape (n_samples, n_features)\n Transformed input.\n ", "source_code": "\ndef transform(self, X):\n \"\"\"Transform X using the forward function.\n\n Parameters\n ----------\n X : array-like, shape (n_samples, n_features)\n Input array.\n\n Returns\n -------\n X_out : array-like, shape (n_samples, n_features)\n Transformed input.\n \"\"\"\n X = self._check_input(X, reset=False)\n return self._transform(X, func=self.func, kw_args=self.kw_args)" }, { @@ -154495,7 +166561,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -154519,7 +166586,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "neg_label", @@ -154529,7 +166597,8 @@ "docstring": { "type": "int, default=0", "description": "Value with which negative labels must be encoded." - } + }, + "refined_type": {} }, { "name": "pos_label", @@ -154539,7 +166608,8 @@ "docstring": { "type": "int, default=1", "description": "Value with which positive labels must be encoded." - } + }, + "refined_type": {} }, { "name": "sparse_output", @@ -154549,13 +166619,14 @@ "docstring": { "type": "bool, default=False", "description": "True if the returned array from transform is desired to be in sparse\nCSR format." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, *, neg_label=0, pos_label=1, sparse_output=False):\n if neg_label >= pos_label:\n raise ValueError('neg_label={0} must be strictly less than pos_label={1}.'.format(neg_label, pos_label))\n if sparse_output and (pos_label == 0 or neg_label != 0):\n raise ValueError('Sparse binarization is only supported with non zero pos_label and zero neg_label, got pos_label={0} and neg_label={1}'.format(pos_label, neg_label))\n self.neg_label = neg_label\n self.pos_label = pos_label\n self.sparse_output = sparse_output" }, { @@ -154573,13 +166644,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _more_tags(self):\n return {'X_types': ['1dlabels']}" }, { @@ -154597,7 +166669,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -154607,13 +166680,14 @@ "docstring": { "type": "ndarray of shape (n_samples,) or (n_samples, n_classes)", "description": "Target values. The 2-d matrix should only contain 0 and 1,\nrepresents multilabel classification." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Fit label binarizer.", - "docstring": "Fit label binarizer.\n\nParameters\n----------\ny : ndarray of shape (n_samples,) or (n_samples, n_classes)\n Target values. 
The 2-d matrix should only contain 0 and 1,\n represents multilabel classification.\n\nReturns\n-------\nself : object\n Returns the instance itself.", + "docstring": "Fit label binarizer.\n\n Parameters\n ----------\n y : ndarray of shape (n_samples,) or (n_samples, n_classes)\n Target values. The 2-d matrix should only contain 0 and 1,\n represents multilabel classification.\n\n Returns\n -------\n self : object\n Returns the instance itself.\n ", "source_code": "\ndef fit(self, y):\n \"\"\"Fit label binarizer.\n\n Parameters\n ----------\n y : ndarray of shape (n_samples,) or (n_samples, n_classes)\n Target values. The 2-d matrix should only contain 0 and 1,\n represents multilabel classification.\n\n Returns\n -------\n self : object\n Returns the instance itself.\n \"\"\"\n self.y_type_ = type_of_target(y)\n if 'multioutput' in self.y_type_:\n raise ValueError('Multioutput target data is not supported with label binarization')\n if _num_samples(y) == 0:\n raise ValueError('y has 0 samples: %r' % y)\n self.sparse_input_ = sp.issparse(y)\n self.classes_ = unique_labels(y)\n return self" }, { @@ -154631,7 +166705,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -154641,13 +166716,17 @@ "docstring": { "type": "{ndarray, sparse matrix} of shape (n_samples,) or (n_samples, n_classes)", "description": "Target values. The 2-d matrix should only contain 0 and 1,\nrepresents multilabel classification. Sparse matrix can be\nCSR, CSC, COO, DOK, or LIL." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } } ], "results": [], "is_public": true, - "description": "Fit label binarizer/transform multi-class labels to binary labels.\n\nThe output of transform is sometimes referred to as the 1-of-K coding scheme.", - "docstring": "Fit label binarizer/transform multi-class labels to binary labels.\n\nThe output of transform is sometimes referred to as\nthe 1-of-K coding scheme.\n\nParameters\n----------\ny : {ndarray, sparse matrix} of shape (n_samples,) or (n_samples, n_classes)\n Target values. The 2-d matrix should only contain 0 and 1,\n represents multilabel classification. Sparse matrix can be\n CSR, CSC, COO, DOK, or LIL.\n\nReturns\n-------\nY : {ndarray, sparse matrix} of shape (n_samples, n_classes)\n Shape will be (n_samples, 1) for binary problems. Sparse matrix\n will be of CSR format.", + "description": "Fit label binarizer/transform multi-class labels to binary labels.\n\nThe output of transform is sometimes referred to as\nthe 1-of-K coding scheme.", + "docstring": "Fit label binarizer/transform multi-class labels to binary labels.\n\n The output of transform is sometimes referred to as\n the 1-of-K coding scheme.\n\n Parameters\n ----------\n y : {ndarray, sparse matrix} of shape (n_samples,) or (n_samples, n_classes)\n Target values. The 2-d matrix should only contain 0 and 1,\n represents multilabel classification. Sparse matrix can be\n CSR, CSC, COO, DOK, or LIL.\n\n Returns\n -------\n Y : {ndarray, sparse matrix} of shape (n_samples, n_classes)\n Shape will be (n_samples, 1) for binary problems. Sparse matrix\n will be of CSR format.\n ", "source_code": "\ndef fit_transform(self, y):\n \"\"\"Fit label binarizer/transform multi-class labels to binary labels.\n\n The output of transform is sometimes referred to as\n the 1-of-K coding scheme.\n\n Parameters\n ----------\n y : {ndarray, sparse matrix} of shape (n_samples,) or (n_samples, n_classes)\n Target values. 
The 2-d matrix should only contain 0 and 1,\n represents multilabel classification. Sparse matrix can be\n CSR, CSC, COO, DOK, or LIL.\n\n Returns\n -------\n Y : {ndarray, sparse matrix} of shape (n_samples, n_classes)\n Shape will be (n_samples, 1) for binary problems. Sparse matrix\n will be of CSR format.\n \"\"\"\n return self.fit(y).transform(y)" }, { @@ -154665,7 +166744,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "Y", @@ -154675,6 +166755,10 @@ "docstring": { "type": "{ndarray, sparse matrix} of shape (n_samples, n_classes)", "description": "Target values. All sparse matrices are converted to CSR before\ninverse transformation." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -154685,13 +166769,14 @@ "docstring": { "type": "float, default=None", "description": "Threshold used in the binary and multi-label cases.\n\nUse 0 when ``Y`` contains the output of decision_function\n(classifier).\nUse 0.5 when ``Y`` contains the output of predict_proba.\n\nIf None, the threshold is assumed to be half way between\nneg_label and pos_label." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Transform binary labels back to multi-class labels.", - "docstring": "Transform binary labels back to multi-class labels.\n\nParameters\n----------\nY : {ndarray, sparse matrix} of shape (n_samples, n_classes)\n Target values. All sparse matrices are converted to CSR before\n inverse transformation.\n\nthreshold : float, default=None\n Threshold used in the binary and multi-label cases.\n\n Use 0 when ``Y`` contains the output of decision_function\n (classifier).\n Use 0.5 when ``Y`` contains the output of predict_proba.\n\n If None, the threshold is assumed to be half way between\n neg_label and pos_label.\n\nReturns\n-------\ny : {ndarray, sparse matrix} of shape (n_samples,)\n Target values. Sparse matrix will be of CSR format.\n\nNotes\n-----\nIn the case when the binary labels are fractional\n(probabilistic), inverse_transform chooses the class with the\ngreatest value. Typically, this allows to use the output of a\nlinear model's decision_function method directly as the input\nof inverse_transform.", + "docstring": "Transform binary labels back to multi-class labels.\n\n Parameters\n ----------\n Y : {ndarray, sparse matrix} of shape (n_samples, n_classes)\n Target values. All sparse matrices are converted to CSR before\n inverse transformation.\n\n threshold : float, default=None\n Threshold used in the binary and multi-label cases.\n\n Use 0 when ``Y`` contains the output of decision_function\n (classifier).\n Use 0.5 when ``Y`` contains the output of predict_proba.\n\n If None, the threshold is assumed to be half way between\n neg_label and pos_label.\n\n Returns\n -------\n y : {ndarray, sparse matrix} of shape (n_samples,)\n Target values. Sparse matrix will be of CSR format.\n\n Notes\n -----\n In the case when the binary labels are fractional\n (probabilistic), inverse_transform chooses the class with the\n greatest value. Typically, this allows to use the output of a\n linear model's decision_function method directly as the input\n of inverse_transform.\n ", "source_code": "\ndef inverse_transform(self, Y, threshold=None):\n \"\"\"Transform binary labels back to multi-class labels.\n\n Parameters\n ----------\n Y : {ndarray, sparse matrix} of shape (n_samples, n_classes)\n Target values. 
All sparse matrices are converted to CSR before\n inverse transformation.\n\n threshold : float, default=None\n Threshold used in the binary and multi-label cases.\n\n Use 0 when ``Y`` contains the output of decision_function\n (classifier).\n Use 0.5 when ``Y`` contains the output of predict_proba.\n\n If None, the threshold is assumed to be half way between\n neg_label and pos_label.\n\n Returns\n -------\n y : {ndarray, sparse matrix} of shape (n_samples,)\n Target values. Sparse matrix will be of CSR format.\n\n Notes\n -----\n In the case when the binary labels are fractional\n (probabilistic), inverse_transform chooses the class with the\n greatest value. Typically, this allows to use the output of a\n linear model's decision_function method directly as the input\n of inverse_transform.\n \"\"\"\n check_is_fitted(self)\n if threshold is None:\n threshold = (self.pos_label + self.neg_label) / 2.0\n if self.y_type_ == 'multiclass':\n y_inv = _inverse_binarize_multiclass(Y, self.classes_)\n else:\n y_inv = _inverse_binarize_thresholding(Y, self.y_type_, self.classes_, threshold)\n if self.sparse_input_:\n y_inv = sp.csr_matrix(y_inv)\n elif sp.issparse(y_inv):\n y_inv = y_inv.toarray()\n return y_inv" }, { @@ -154709,7 +166794,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -154719,13 +166805,17 @@ "docstring": { "type": "{array, sparse matrix} of shape (n_samples,) or (n_samples, n_classes)", "description": "Target values. The 2-d matrix should only contain 0 and 1,\nrepresents multilabel classification. Sparse matrix can be\nCSR, CSC, COO, DOK, or LIL." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } } ], "results": [], "is_public": true, - "description": "Transform multi-class labels to binary labels.\n\nThe output of transform is sometimes referred to by some authors as the 1-of-K coding scheme.", - "docstring": "Transform multi-class labels to binary labels.\n\nThe output of transform is sometimes referred to by some authors as\nthe 1-of-K coding scheme.\n\nParameters\n----------\ny : {array, sparse matrix} of shape (n_samples,) or (n_samples, n_classes)\n Target values. The 2-d matrix should only contain 0 and 1,\n represents multilabel classification. Sparse matrix can be\n CSR, CSC, COO, DOK, or LIL.\n\nReturns\n-------\nY : {ndarray, sparse matrix} of shape (n_samples, n_classes)\n Shape will be (n_samples, 1) for binary problems. Sparse matrix\n will be of CSR format.", + "description": "Transform multi-class labels to binary labels.\n\nThe output of transform is sometimes referred to by some authors as\nthe 1-of-K coding scheme.", + "docstring": "Transform multi-class labels to binary labels.\n\n The output of transform is sometimes referred to by some authors as\n the 1-of-K coding scheme.\n\n Parameters\n ----------\n y : {array, sparse matrix} of shape (n_samples,) or (n_samples, n_classes)\n Target values. The 2-d matrix should only contain 0 and 1,\n represents multilabel classification. Sparse matrix can be\n CSR, CSC, COO, DOK, or LIL.\n\n Returns\n -------\n Y : {ndarray, sparse matrix} of shape (n_samples, n_classes)\n Shape will be (n_samples, 1) for binary problems. 
Sparse matrix\n will be of CSR format.\n ", "source_code": "\ndef transform(self, y):\n \"\"\"Transform multi-class labels to binary labels.\n\n The output of transform is sometimes referred to by some authors as\n the 1-of-K coding scheme.\n\n Parameters\n ----------\n y : {array, sparse matrix} of shape (n_samples,) or (n_samples, n_classes)\n Target values. The 2-d matrix should only contain 0 and 1,\n represents multilabel classification. Sparse matrix can be\n CSR, CSC, COO, DOK, or LIL.\n\n Returns\n -------\n Y : {ndarray, sparse matrix} of shape (n_samples, n_classes)\n Shape will be (n_samples, 1) for binary problems. Sparse matrix\n will be of CSR format.\n \"\"\"\n check_is_fitted(self)\n y_is_multilabel = type_of_target(y).startswith('multilabel')\n if y_is_multilabel and not self.y_type_.startswith('multilabel'):\n raise ValueError('The object was not fitted with multilabel input.')\n return label_binarize(y, classes=self.classes_, pos_label=self.pos_label, neg_label=self.neg_label, sparse_output=self.sparse_output)" }, { @@ -154743,13 +166833,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _more_tags(self):\n return {'X_types': ['1dlabels']}" }, { @@ -154767,7 +166858,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -154777,13 +166869,14 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "Target values." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Fit label encoder.", - "docstring": "Fit label encoder.\n\nParameters\n----------\ny : array-like of shape (n_samples,)\n Target values.\n\nReturns\n-------\nself : returns an instance of self.\n Fitted label encoder.", + "docstring": "Fit label encoder.\n\n Parameters\n ----------\n y : array-like of shape (n_samples,)\n Target values.\n\n Returns\n -------\n self : returns an instance of self.\n Fitted label encoder.\n ", "source_code": "\ndef fit(self, y):\n \"\"\"Fit label encoder.\n\n Parameters\n ----------\n y : array-like of shape (n_samples,)\n Target values.\n\n Returns\n -------\n self : returns an instance of self.\n Fitted label encoder.\n \"\"\"\n y = column_or_1d(y, warn=True)\n self.classes_ = _unique(y)\n return self" }, { @@ -154801,7 +166894,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -154811,13 +166905,14 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "Target values." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Fit label encoder and return encoded labels.", - "docstring": "Fit label encoder and return encoded labels.\n\nParameters\n----------\ny : array-like of shape (n_samples,)\n Target values.\n\nReturns\n-------\ny : array-like of shape (n_samples,)\n Encoded labels.", + "docstring": "Fit label encoder and return encoded labels.\n\n Parameters\n ----------\n y : array-like of shape (n_samples,)\n Target values.\n\n Returns\n -------\n y : array-like of shape (n_samples,)\n Encoded labels.\n ", "source_code": "\ndef fit_transform(self, y):\n \"\"\"Fit label encoder and return encoded labels.\n\n Parameters\n ----------\n y : array-like of shape (n_samples,)\n Target values.\n\n Returns\n -------\n y : array-like of shape (n_samples,)\n Encoded labels.\n \"\"\"\n y = column_or_1d(y, warn=True)\n (self.classes_, y) = _unique(y, return_inverse=True)\n return y" }, { @@ -154835,7 +166930,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -154845,13 +166941,14 @@ "docstring": { "type": "ndarray of shape (n_samples,)", "description": "Target values." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Transform labels back to original encoding.", - "docstring": "Transform labels back to original encoding.\n\nParameters\n----------\ny : ndarray of shape (n_samples,)\n Target values.\n\nReturns\n-------\ny : ndarray of shape (n_samples,)\n Original encoding.", + "docstring": "Transform labels back to original encoding.\n\n Parameters\n ----------\n y : ndarray of shape (n_samples,)\n Target values.\n\n Returns\n -------\n y : ndarray of shape (n_samples,)\n Original encoding.\n ", "source_code": "\ndef inverse_transform(self, y):\n \"\"\"Transform labels back to original encoding.\n\n Parameters\n ----------\n y : ndarray of shape (n_samples,)\n Target values.\n\n Returns\n -------\n y : ndarray of shape (n_samples,)\n Original encoding.\n \"\"\"\n check_is_fitted(self)\n y = column_or_1d(y, warn=True)\n if _num_samples(y) == 0:\n return np.array([])\n diff = np.setdiff1d(y, np.arange(len(self.classes_)))\n if len(diff):\n raise ValueError('y contains previously unseen labels: %s' % str(diff))\n y = np.asarray(y)\n return self.classes_[y]" }, { @@ -154869,7 +166966,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -154879,13 +166977,14 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "Target values." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Transform labels to normalized encoding.", - "docstring": "Transform labels to normalized encoding.\n\nParameters\n----------\ny : array-like of shape (n_samples,)\n Target values.\n\nReturns\n-------\ny : array-like of shape (n_samples,)\n Labels as normalized encodings.", + "docstring": "Transform labels to normalized encoding.\n\n Parameters\n ----------\n y : array-like of shape (n_samples,)\n Target values.\n\n Returns\n -------\n y : array-like of shape (n_samples,)\n Labels as normalized encodings.\n ", "source_code": "\ndef transform(self, y):\n \"\"\"Transform labels to normalized encoding.\n\n Parameters\n ----------\n y : array-like of shape (n_samples,)\n Target values.\n\n Returns\n -------\n y : array-like of shape (n_samples,)\n Labels as normalized encodings.\n \"\"\"\n check_is_fitted(self)\n y = column_or_1d(y, warn=True)\n if _num_samples(y) == 0:\n return np.array([])\n return _encode(y, uniques=self.classes_)" }, { @@ -154903,7 +167002,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "classes", @@ -154913,7 +167013,8 @@ "docstring": { "type": "array-like of shape (n_classes,), default=None", "description": "Indicates an ordering for the class labels.\nAll entries should be unique (cannot contain duplicate classes)." - } + }, + "refined_type": {} }, { "name": "sparse_output", @@ -154923,13 +167024,14 @@ "docstring": { "type": "bool, default=False", "description": "Set to True if output binary array is desired in CSR sparse format." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, *, classes=None, sparse_output=False):\n self.classes = classes\n self.sparse_output = sparse_output" }, { @@ -154947,13 +167049,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _build_cache(self):\n if self._cached_dict is None:\n self._cached_dict = dict(zip(self.classes_, range(len(self.classes_))))\n return self._cached_dict" }, { @@ -154971,13 +167074,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _more_tags(self):\n return {'X_types': ['2dlabels']}" }, { @@ -154995,7 +167099,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -155005,7 +167110,8 @@ "docstring": { "type": "iterable of iterables", "description": "A set of labels (any orderable and hashable object) for each\nsample. If the `classes` parameter is set, `y` will not be\niterated." - } + }, + "refined_type": {} }, { "name": "class_mapping", @@ -155015,13 +167121,14 @@ "docstring": { "type": "Mapping", "description": "Maps from label to column index in label indicator matrix." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Transforms the label sets with a given mapping.", - "docstring": "Transforms the label sets with a given mapping.\n\nParameters\n----------\ny : iterable of iterables\n A set of labels (any orderable and hashable object) for each\n sample. 
If the `classes` parameter is set, `y` will not be\n iterated.\n\nclass_mapping : Mapping\n Maps from label to column index in label indicator matrix.\n\nReturns\n-------\ny_indicator : sparse matrix of shape (n_samples, n_classes)\n Label indicator matrix. Will be of CSR format.", + "docstring": "Transforms the label sets with a given mapping.\n\n Parameters\n ----------\n y : iterable of iterables\n A set of labels (any orderable and hashable object) for each\n sample. If the `classes` parameter is set, `y` will not be\n iterated.\n\n class_mapping : Mapping\n Maps from label to column index in label indicator matrix.\n\n Returns\n -------\n y_indicator : sparse matrix of shape (n_samples, n_classes)\n Label indicator matrix. Will be of CSR format.\n ", "source_code": "\ndef _transform(self, y, class_mapping):\n \"\"\"Transforms the label sets with a given mapping.\n\n Parameters\n ----------\n y : iterable of iterables\n A set of labels (any orderable and hashable object) for each\n sample. If the `classes` parameter is set, `y` will not be\n iterated.\n\n class_mapping : Mapping\n Maps from label to column index in label indicator matrix.\n\n Returns\n -------\n y_indicator : sparse matrix of shape (n_samples, n_classes)\n Label indicator matrix. Will be of CSR format.\n \"\"\"\n indices = array.array('i')\n indptr = array.array('i', [0])\n unknown = set()\n for labels in y:\n index = set()\n for label in labels:\n try:\n index.add(class_mapping[label])\n except KeyError:\n unknown.add(label)\n indices.extend(index)\n indptr.append(len(indices))\n if unknown:\n warnings.warn('unknown class(es) {0} will be ignored'.format(sorted(unknown, key=str)))\n data = np.ones(len(indices), dtype=int)\n return sp.csr_matrix((data, indices, indptr), shape=(len(indptr) - 1, len(class_mapping)))" }, { @@ -155039,7 +167146,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -155049,13 +167157,14 @@ "docstring": { "type": "iterable of iterables", "description": "A set of labels (any orderable and hashable object) for each\nsample. If the `classes` parameter is set, `y` will not be\niterated." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Fit the label sets binarizer, storing :term:`classes_`.", - "docstring": "Fit the label sets binarizer, storing :term:`classes_`.\n\nParameters\n----------\ny : iterable of iterables\n A set of labels (any orderable and hashable object) for each\n sample. If the `classes` parameter is set, `y` will not be\n iterated.\n\nReturns\n-------\nself : object\n Fitted estimator.", + "docstring": "Fit the label sets binarizer, storing :term:`classes_`.\n\n Parameters\n ----------\n y : iterable of iterables\n A set of labels (any orderable and hashable object) for each\n sample. If the `classes` parameter is set, `y` will not be\n iterated.\n\n Returns\n -------\n self : object\n Fitted estimator.\n ", "source_code": "\ndef fit(self, y):\n \"\"\"Fit the label sets binarizer, storing :term:`classes_`.\n\n Parameters\n ----------\n y : iterable of iterables\n A set of labels (any orderable and hashable object) for each\n sample. If the `classes` parameter is set, `y` will not be\n iterated.\n\n Returns\n -------\n self : object\n Fitted estimator.\n \"\"\"\n self._cached_dict = None\n if self.classes is None:\n classes = sorted(set(itertools.chain.from_iterable(y)))\n elif len(set(self.classes)) < len(self.classes):\n raise ValueError('The classes argument contains duplicate classes. 
Remove these duplicates before passing them to MultiLabelBinarizer.')\n else:\n classes = self.classes\n dtype = int if all((isinstance(c, int) for c in classes)) else object\n self.classes_ = np.empty(len(classes), dtype=dtype)\n self.classes_[:] = classes\n return self" }, { @@ -155073,7 +167182,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -155083,13 +167193,14 @@ "docstring": { "type": "iterable of iterables", "description": "A set of labels (any orderable and hashable object) for each\nsample. If the `classes` parameter is set, `y` will not be\niterated." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Fit the label sets binarizer and transform the given label sets.", - "docstring": "Fit the label sets binarizer and transform the given label sets.\n\nParameters\n----------\ny : iterable of iterables\n A set of labels (any orderable and hashable object) for each\n sample. If the `classes` parameter is set, `y` will not be\n iterated.\n\nReturns\n-------\ny_indicator : {ndarray, sparse matrix} of shape (n_samples, n_classes)\n A matrix such that `y_indicator[i, j] = 1` iff `classes_[j]`\n is in `y[i]`, and 0 otherwise. Sparse matrix will be of CSR\n format.", + "docstring": "Fit the label sets binarizer and transform the given label sets.\n\n Parameters\n ----------\n y : iterable of iterables\n A set of labels (any orderable and hashable object) for each\n sample. If the `classes` parameter is set, `y` will not be\n iterated.\n\n Returns\n -------\n y_indicator : {ndarray, sparse matrix} of shape (n_samples, n_classes)\n A matrix such that `y_indicator[i, j] = 1` iff `classes_[j]`\n is in `y[i]`, and 0 otherwise. Sparse matrix will be of CSR\n format.\n ", "source_code": "\ndef fit_transform(self, y):\n \"\"\"Fit the label sets binarizer and transform the given label sets.\n\n Parameters\n ----------\n y : iterable of iterables\n A set of labels (any orderable and hashable object) for each\n sample. If the `classes` parameter is set, `y` will not be\n iterated.\n\n Returns\n -------\n y_indicator : {ndarray, sparse matrix} of shape (n_samples, n_classes)\n A matrix such that `y_indicator[i, j] = 1` iff `classes_[j]`\n is in `y[i]`, and 0 otherwise. Sparse matrix will be of CSR\n format.\n \"\"\"\n self._cached_dict = None\n if self.classes is not None:\n return self.fit(y).transform(y)\n class_mapping = defaultdict(int)\n class_mapping.default_factory = class_mapping.__len__\n yt = self._transform(y, class_mapping)\n tmp = sorted(class_mapping, key=class_mapping.get)\n dtype = int if all((isinstance(c, int) for c in tmp)) else object\n class_mapping = np.empty(len(tmp), dtype=dtype)\n class_mapping[:] = tmp\n (self.classes_, inverse) = np.unique(class_mapping, return_inverse=True)\n yt.indices = np.array(inverse[yt.indices], dtype=yt.indices.dtype, copy=False)\n if not self.sparse_output:\n yt = yt.toarray()\n return yt" }, { @@ -155107,7 +167218,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "yt", @@ -155117,13 +167229,17 @@ "docstring": { "type": "{ndarray, sparse matrix} of shape (n_samples, n_classes)", "description": "A matrix containing only 1s ands 0s." 
+ }, + "refined_type": { + "kind": "EnumType", + "values": [] } } ], "results": [], "is_public": true, "description": "Transform the given indicator matrix into label sets.", - "docstring": "Transform the given indicator matrix into label sets.\n\nParameters\n----------\nyt : {ndarray, sparse matrix} of shape (n_samples, n_classes)\n A matrix containing only 1s ands 0s.\n\nReturns\n-------\ny : list of tuples\n The set of labels for each sample such that `y[i]` consists of\n `classes_[j]` for each `yt[i, j] == 1`.", + "docstring": "Transform the given indicator matrix into label sets.\n\n Parameters\n ----------\n yt : {ndarray, sparse matrix} of shape (n_samples, n_classes)\n A matrix containing only 1s ands 0s.\n\n Returns\n -------\n y : list of tuples\n The set of labels for each sample such that `y[i]` consists of\n `classes_[j]` for each `yt[i, j] == 1`.\n ", "source_code": "\ndef inverse_transform(self, yt):\n \"\"\"Transform the given indicator matrix into label sets.\n\n Parameters\n ----------\n yt : {ndarray, sparse matrix} of shape (n_samples, n_classes)\n A matrix containing only 1s ands 0s.\n\n Returns\n -------\n y : list of tuples\n The set of labels for each sample such that `y[i]` consists of\n `classes_[j]` for each `yt[i, j] == 1`.\n \"\"\"\n check_is_fitted(self)\n if yt.shape[1] != len(self.classes_):\n raise ValueError('Expected indicator for {0} classes, but got {1}'.format(len(self.classes_), yt.shape[1]))\n if sp.issparse(yt):\n yt = yt.tocsr()\n if len(yt.data) != 0 and len(np.setdiff1d(yt.data, [0, 1])) > 0:\n raise ValueError('Expected only 0s and 1s in label indicator.')\n return [tuple(self.classes_.take(yt.indices[start:end])) for (start, end) in zip(yt.indptr[:-1], yt.indptr[1:])]\n else:\n unexpected = np.setdiff1d(yt, [0, 1])\n if len(unexpected) > 0:\n raise ValueError('Expected only 0s and 1s in label indicator. Also got {0}'.format(unexpected))\n return [tuple(self.classes_.compress(indicators)) for indicators in yt]" }, { @@ -155141,7 +167257,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -155151,13 +167268,14 @@ "docstring": { "type": "iterable of iterables", "description": "A set of labels (any orderable and hashable object) for each\nsample. If the `classes` parameter is set, `y` will not be\niterated." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Transform the given label sets.", - "docstring": "Transform the given label sets.\n\nParameters\n----------\ny : iterable of iterables\n A set of labels (any orderable and hashable object) for each\n sample. If the `classes` parameter is set, `y` will not be\n iterated.\n\nReturns\n-------\ny_indicator : array or CSR matrix, shape (n_samples, n_classes)\n A matrix such that `y_indicator[i, j] = 1` iff `classes_[j]` is in\n `y[i]`, and 0 otherwise.", + "docstring": "Transform the given label sets.\n\n Parameters\n ----------\n y : iterable of iterables\n A set of labels (any orderable and hashable object) for each\n sample. If the `classes` parameter is set, `y` will not be\n iterated.\n\n Returns\n -------\n y_indicator : array or CSR matrix, shape (n_samples, n_classes)\n A matrix such that `y_indicator[i, j] = 1` iff `classes_[j]` is in\n `y[i]`, and 0 otherwise.\n ", "source_code": "\ndef transform(self, y):\n \"\"\"Transform the given label sets.\n\n Parameters\n ----------\n y : iterable of iterables\n A set of labels (any orderable and hashable object) for each\n sample. 
If the `classes` parameter is set, `y` will not be\n iterated.\n\n Returns\n -------\n y_indicator : array or CSR matrix, shape (n_samples, n_classes)\n A matrix such that `y_indicator[i, j] = 1` iff `classes_[j]` is in\n `y[i]`, and 0 otherwise.\n \"\"\"\n check_is_fitted(self)\n class_to_index = self._build_cache()\n yt = self._transform(y, class_to_index)\n if not self.sparse_output:\n yt = yt.toarray()\n return yt" }, { @@ -155175,7 +167293,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "classes", @@ -155185,13 +167304,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Inverse label binarization transformation for multiclass.\n\nMulticlass uses the maximal score instead of a threshold.", - "docstring": "Inverse label binarization transformation for multiclass.\n\nMulticlass uses the maximal score instead of a threshold.", + "docstring": "Inverse label binarization transformation for multiclass.\n\n Multiclass uses the maximal score instead of a threshold.\n ", "source_code": "\ndef _inverse_binarize_multiclass(y, classes):\n \"\"\"Inverse label binarization transformation for multiclass.\n\n Multiclass uses the maximal score instead of a threshold.\n \"\"\"\n classes = np.asarray(classes)\n if sp.issparse(y):\n y = y.tocsr()\n (n_samples, n_outputs) = y.shape\n outputs = np.arange(n_outputs)\n row_max = min_max_axis(y, 1)[1]\n row_nnz = np.diff(y.indptr)\n y_data_repeated_max = np.repeat(row_max, row_nnz)\n y_i_all_argmax = np.flatnonzero(y_data_repeated_max == y.data)\n if row_max[-1] == 0:\n y_i_all_argmax = np.append(y_i_all_argmax, [len(y.data)])\n index_first_argmax = np.searchsorted(y_i_all_argmax, y.indptr[:-1])\n y_ind_ext = np.append(y.indices, [0])\n y_i_argmax = y_ind_ext[y_i_all_argmax[index_first_argmax]]\n y_i_argmax[np.where(row_nnz == 0)[0]] = 0\n samples = np.arange(n_samples)[(row_nnz > 0) & (row_max.ravel() == 0)]\n for i in samples:\n ind = y.indices[y.indptr[i]:y.indptr[i + 1]]\n y_i_argmax[i] = classes[np.setdiff1d(outputs, ind)][0]\n return classes[y_i_argmax]\n else:\n return classes.take(y.argmax(axis=1), mode='clip')" }, { @@ -155209,7 +167329,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "output_type", @@ -155219,7 +167340,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "classes", @@ -155229,7 +167351,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "threshold", @@ -155239,7 +167362,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -155263,7 +167387,8 @@ "docstring": { "type": "array-like", "description": "Sequence of integer labels or multilabel data to encode." - } + }, + "refined_type": {} }, { "name": "classes", @@ -155273,7 +167398,8 @@ "docstring": { "type": "array-like of shape (n_classes,)", "description": "Uniquely holds the label for each class." - } + }, + "refined_type": {} }, { "name": "neg_label", @@ -155283,7 +167409,8 @@ "docstring": { "type": "int, default=0", "description": "Value with which negative labels must be encoded." - } + }, + "refined_type": {} }, { "name": "pos_label", @@ -155293,7 +167420,8 @@ "docstring": { "type": "int, default=1", "description": "Value with which positive labels must be encoded." 
- } + }, + "refined_type": {} }, { "name": "sparse_output", @@ -155303,13 +167431,14 @@ "docstring": { "type": "bool, default=False,", "description": "Set to true if output binary array is desired in CSR sparse format." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Binarize labels in a one-vs-all fashion.\n\nSeveral regression and binary classification algorithms are available in scikit-learn. A simple way to extend these algorithms to the multi-class classification case is to use the so-called one-vs-all scheme. This function makes it possible to compute this transformation for a fixed set of class labels known ahead of time.", - "docstring": "Binarize labels in a one-vs-all fashion.\n\nSeveral regression and binary classification algorithms are\navailable in scikit-learn. A simple way to extend these algorithms\nto the multi-class classification case is to use the so-called\none-vs-all scheme.\n\nThis function makes it possible to compute this transformation for a\nfixed set of class labels known ahead of time.\n\nParameters\n----------\ny : array-like\n Sequence of integer labels or multilabel data to encode.\n\nclasses : array-like of shape (n_classes,)\n Uniquely holds the label for each class.\n\nneg_label : int, default=0\n Value with which negative labels must be encoded.\n\npos_label : int, default=1\n Value with which positive labels must be encoded.\n\nsparse_output : bool, default=False,\n Set to true if output binary array is desired in CSR sparse format.\n\nReturns\n-------\nY : {ndarray, sparse matrix} of shape (n_samples, n_classes)\n Shape will be (n_samples, 1) for binary problems. Sparse matrix will\n be of CSR format.\n\nExamples\n--------\n>>> from sklearn.preprocessing import label_binarize\n>>> label_binarize([1, 6], classes=[1, 2, 4, 6])\narray([[1, 0, 0, 0],\n [0, 0, 0, 1]])\n\nThe class ordering is preserved:\n\n>>> label_binarize([1, 6], classes=[1, 6, 4, 2])\narray([[1, 0, 0, 0],\n [0, 1, 0, 0]])\n\nBinary targets transform to a column vector\n\n>>> label_binarize(['yes', 'no', 'no', 'yes'], classes=['no', 'yes'])\narray([[1],\n [0],\n [0],\n [1]])\n\nSee Also\n--------\nLabelBinarizer : Class used to wrap the functionality of label_binarize and\n allow for fitting to classes independently of the transform operation.", + "description": "Binarize labels in a one-vs-all fashion.\n\nSeveral regression and binary classification algorithms are\navailable in scikit-learn. A simple way to extend these algorithms\nto the multi-class classification case is to use the so-called\none-vs-all scheme.\n\nThis function makes it possible to compute this transformation for a\nfixed set of class labels known ahead of time.", + "docstring": "Binarize labels in a one-vs-all fashion.\n\n Several regression and binary classification algorithms are\n available in scikit-learn. 
A simple way to extend these algorithms\n to the multi-class classification case is to use the so-called\n one-vs-all scheme.\n\n This function makes it possible to compute this transformation for a\n fixed set of class labels known ahead of time.\n\n Parameters\n ----------\n y : array-like\n Sequence of integer labels or multilabel data to encode.\n\n classes : array-like of shape (n_classes,)\n Uniquely holds the label for each class.\n\n neg_label : int, default=0\n Value with which negative labels must be encoded.\n\n pos_label : int, default=1\n Value with which positive labels must be encoded.\n\n sparse_output : bool, default=False,\n Set to true if output binary array is desired in CSR sparse format.\n\n Returns\n -------\n Y : {ndarray, sparse matrix} of shape (n_samples, n_classes)\n Shape will be (n_samples, 1) for binary problems. Sparse matrix will\n be of CSR format.\n\n Examples\n --------\n >>> from sklearn.preprocessing import label_binarize\n >>> label_binarize([1, 6], classes=[1, 2, 4, 6])\n array([[1, 0, 0, 0],\n [0, 0, 0, 1]])\n\n The class ordering is preserved:\n\n >>> label_binarize([1, 6], classes=[1, 6, 4, 2])\n array([[1, 0, 0, 0],\n [0, 1, 0, 0]])\n\n Binary targets transform to a column vector\n\n >>> label_binarize(['yes', 'no', 'no', 'yes'], classes=['no', 'yes'])\n array([[1],\n [0],\n [0],\n [1]])\n\n See Also\n --------\n LabelBinarizer : Class used to wrap the functionality of label_binarize and\n allow for fitting to classes independently of the transform operation.\n ", "source_code": "\ndef label_binarize(y, *, classes, neg_label=0, pos_label=1, sparse_output=False):\n \"\"\"Binarize labels in a one-vs-all fashion.\n\n Several regression and binary classification algorithms are\n available in scikit-learn. A simple way to extend these algorithms\n to the multi-class classification case is to use the so-called\n one-vs-all scheme.\n\n This function makes it possible to compute this transformation for a\n fixed set of class labels known ahead of time.\n\n Parameters\n ----------\n y : array-like\n Sequence of integer labels or multilabel data to encode.\n\n classes : array-like of shape (n_classes,)\n Uniquely holds the label for each class.\n\n neg_label : int, default=0\n Value with which negative labels must be encoded.\n\n pos_label : int, default=1\n Value with which positive labels must be encoded.\n\n sparse_output : bool, default=False,\n Set to true if output binary array is desired in CSR sparse format.\n\n Returns\n -------\n Y : {ndarray, sparse matrix} of shape (n_samples, n_classes)\n Shape will be (n_samples, 1) for binary problems. 
Sparse matrix will\n be of CSR format.\n\n Examples\n --------\n >>> from sklearn.preprocessing import label_binarize\n >>> label_binarize([1, 6], classes=[1, 2, 4, 6])\n array([[1, 0, 0, 0],\n [0, 0, 0, 1]])\n\n The class ordering is preserved:\n\n >>> label_binarize([1, 6], classes=[1, 6, 4, 2])\n array([[1, 0, 0, 0],\n [0, 1, 0, 0]])\n\n Binary targets transform to a column vector\n\n >>> label_binarize(['yes', 'no', 'no', 'yes'], classes=['no', 'yes'])\n array([[1],\n [0],\n [0],\n [1]])\n\n See Also\n --------\n LabelBinarizer : Class used to wrap the functionality of label_binarize and\n allow for fitting to classes independently of the transform operation.\n \"\"\"\n if not isinstance(y, list):\n y = check_array(y, accept_sparse='csr', ensure_2d=False, dtype=None)\n elif _num_samples(y) == 0:\n raise ValueError('y has 0 samples: %r' % y)\n if neg_label >= pos_label:\n raise ValueError('neg_label={0} must be strictly less than pos_label={1}.'.format(neg_label, pos_label))\n if sparse_output and (pos_label == 0 or neg_label != 0):\n raise ValueError('Sparse binarization is only supported with non zero pos_label and zero neg_label, got pos_label={0} and neg_label={1}'.format(pos_label, neg_label))\n pos_switch = pos_label == 0\n if pos_switch:\n pos_label = -neg_label\n y_type = type_of_target(y)\n if 'multioutput' in y_type:\n raise ValueError('Multioutput target data is not supported with label binarization')\n if y_type == 'unknown':\n raise ValueError('The type of target data is not known')\n n_samples = y.shape[0] if sp.issparse(y) else len(y)\n n_classes = len(classes)\n classes = np.asarray(classes)\n if y_type == 'binary':\n if n_classes == 1:\n if sparse_output:\n return sp.csr_matrix((n_samples, 1), dtype=int)\n else:\n Y = np.zeros((len(y), 1), dtype=int)\n Y += neg_label\n return Y\n elif len(classes) >= 3:\n y_type = 'multiclass'\n sorted_class = np.sort(classes)\n if y_type == 'multilabel-indicator':\n y_n_classes = y.shape[1] if hasattr(y, 'shape') else len(y[0])\n if classes.size != y_n_classes:\n raise ValueError('classes {0} mismatch with the labels {1} found in the data'.format(classes, unique_labels(y)))\n if y_type in ('binary', 'multiclass'):\n y = column_or_1d(y)\n y_in_classes = np.in1d(y, classes)\n y_seen = y[y_in_classes]\n indices = np.searchsorted(sorted_class, y_seen)\n indptr = np.hstack((0, np.cumsum(y_in_classes)))\n data = np.empty_like(indices)\n data.fill(pos_label)\n Y = sp.csr_matrix((data, indices, indptr), shape=(n_samples, n_classes))\n elif y_type == 'multilabel-indicator':\n Y = sp.csr_matrix(y)\n if pos_label != 1:\n data = np.empty_like(Y.data)\n data.fill(pos_label)\n Y.data = data\n else:\n raise ValueError('%s target data is not supported with label binarization' % y_type)\n if not sparse_output:\n Y = Y.toarray()\n Y = Y.astype(int, copy=False)\n if neg_label != 0:\n Y[Y == 0] = neg_label\n if pos_switch:\n Y[Y == pos_label] = 0\n else:\n Y.data = Y.data.astype(int, copy=False)\n if np.any(classes != sorted_class):\n indices = np.searchsorted(sorted_class, classes)\n Y = Y[:, indices]\n if y_type == 'binary':\n if sparse_output:\n Y = Y.getcol(-1)\n else:\n Y = Y[:, -1].reshape((-1, 1))\n return Y" }, { @@ -155327,7 +167456,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "degree", @@ -155337,7 +167467,8 @@ "docstring": { "type": "int or tuple (min_degree, max_degree), default=2", "description": "If a single int is given, it specifies the maximal degree of the\npolynomial features. 
If a tuple `(min_degree, max_degree)` is passed,\nthen `min_degree` is the minimum and `max_degree` is the maximum\npolynomial degree of the generated features. Note that `min_degree=0`\nand `min_degree=1` are equivalent as outputting the degree zero term is\ndetermined by `include_bias`." - } + }, + "refined_type": {} }, { "name": "interaction_only", @@ -155347,7 +167478,8 @@ "docstring": { "type": "bool, default=False", "description": "If `True`, only interaction features are produced: features that are\nproducts of at most `degree` *distinct* input features, i.e. terms with\npower of 2 or higher of the same input feature are excluded:\n\n - included: `x[0]`, `x[1]`, `x[0] * x[1]`, etc.\n - excluded: `x[0] ** 2`, `x[0] ** 2 * x[1]`, etc." - } + }, + "refined_type": {} }, { "name": "include_bias", @@ -155357,7 +167489,8 @@ "docstring": { "type": "bool, default=True", "description": "If `True` (default), then include a bias column, the feature in which\nall polynomial powers are zero (i.e. a column of ones - acts as an\nintercept term in a linear model)." - } + }, + "refined_type": {} }, { "name": "order", @@ -155367,13 +167500,17 @@ "docstring": { "type": "{'C', 'F'}, default='C'", "description": "Order of output array in the dense case. `'F'` order is faster to\ncompute, but may slow down subsequent estimators.\n\n.. versionadded:: 0.21" + }, + "refined_type": { + "kind": "EnumType", + "values": ["F", "C"] } } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, degree=2, *, interaction_only=False, include_bias=True, order='C'):\n self.degree = degree\n self.interaction_only = interaction_only\n self.include_bias = include_bias\n self.order = order" }, { @@ -155391,7 +167528,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "min_degree", @@ -155401,7 +167539,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "max_degree", @@ -155411,7 +167550,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "interaction_only", @@ -155421,7 +167561,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "include_bias", @@ -155431,13 +167572,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@staticmethod\ndef _combinations(n_features, min_degree, max_degree, interaction_only, include_bias):\n comb = combinations if interaction_only else combinations_w_r\n start = max(1, min_degree)\n iter = chain.from_iterable((comb(range(n_features), i) for i in range(start, max_degree + 1)))\n if include_bias:\n iter = chain(comb(range(n_features), 0), iter)\n return iter" }, { @@ -155455,7 +167597,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "min_degree", @@ -155465,7 +167608,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "max_degree", @@ -155475,7 +167619,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "interaction_only", @@ -155485,7 +167630,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "include_bias", @@ -155495,13 +167641,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - 
"description": "Calculate number of terms in polynomial expansion\n\nThis should be equivalent to counting the number of terms returned by _combinations(...) but much faster.", - "docstring": "Calculate number of terms in polynomial expansion\n\nThis should be equivalent to counting the number of terms returned by\n_combinations(...) but much faster.", + "description": "Calculate number of terms in polynomial expansion\n\nThis should be equivalent to counting the number of terms returned by\n_combinations(...) but much faster.", + "docstring": "Calculate number of terms in polynomial expansion\n\n This should be equivalent to counting the number of terms returned by\n _combinations(...) but much faster.\n ", "source_code": "\n@staticmethod\ndef _num_combinations(n_features, min_degree, max_degree, interaction_only, include_bias):\n \"\"\"Calculate number of terms in polynomial expansion\n\n This should be equivalent to counting the number of terms returned by\n _combinations(...) but much faster.\n \"\"\"\n if interaction_only:\n combinations = sum([comb(n_features, i, exact=True) for i in range(max(1, min_degree), min(max_degree, n_features) + 1)])\n else:\n combinations = comb(n_features + max_degree, max_degree, exact=True) - 1\n if min_degree > 0:\n d = min_degree - 1\n combinations -= comb(n_features + d, d, exact=True) - 1\n if include_bias:\n combinations += 1\n return combinations" }, { @@ -155519,7 +167666,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -155529,6 +167677,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "The data." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -155539,13 +167691,14 @@ "docstring": { "type": "Ignored", "description": "Not used, present here for API consistency by convention." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Compute number of output features.", - "docstring": "Compute number of output features.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n The data.\n\ny : Ignored\n Not used, present here for API consistency by convention.\n\nReturns\n-------\nself : object\n Fitted transformer.", + "docstring": "\n Compute number of output features.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The data.\n\n y : Ignored\n Not used, present here for API consistency by convention.\n\n Returns\n -------\n self : object\n Fitted transformer.\n ", "source_code": "\ndef fit(self, X, y=None):\n \"\"\"\n Compute number of output features.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The data.\n\n y : Ignored\n Not used, present here for API consistency by convention.\n\n Returns\n -------\n self : object\n Fitted transformer.\n \"\"\"\n (_, n_features) = self._validate_data(X, accept_sparse=True).shape\n if isinstance(self.degree, numbers.Integral):\n if self.degree < 0:\n raise ValueError(f'degree must be a non-negative integer, got {self.degree}.')\n self._min_degree = 0\n self._max_degree = self.degree\n elif isinstance(self.degree, collections.abc.Iterable) and len(self.degree) == 2:\n (self._min_degree, self._max_degree) = self.degree\n if not (isinstance(self._min_degree, numbers.Integral) and isinstance(self._max_degree, numbers.Integral) and self._min_degree >= 0 and self._min_degree <= self._max_degree):\n raise ValueError(f'degree=(min_degree, max_degree) must be non-negative integers that fulfil min_degree <= max_degree, got {self.degree}.')\n else:\n raise ValueError(f'degree must be a non-negative int or tuple (min_degree, max_degree), got {self.degree}.')\n self.n_output_features_ = self._num_combinations(n_features=n_features, min_degree=self._min_degree, max_degree=self._max_degree, interaction_only=self.interaction_only, include_bias=self.include_bias)\n self._n_out_full = self._num_combinations(n_features=n_features, min_degree=0, max_degree=self._max_degree, interaction_only=self.interaction_only, include_bias=self.include_bias)\n return self" }, { @@ -155565,7 +167718,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "input_features", @@ -155575,13 +167729,14 @@ "docstring": { "type": "list of str of shape (n_features,), default=None", "description": "String names for input features if available. By default,\n\"x0\", \"x1\", ... \"xn_features\" is used." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Return feature names for output features.", - "docstring": "Return feature names for output features.\n\nParameters\n----------\ninput_features : list of str of shape (n_features,), default=None\n String names for input features if available. By default,\n \"x0\", \"x1\", ... \"xn_features\" is used.\n\nReturns\n-------\noutput_feature_names : list of str of shape (n_output_features,)\n Transformed feature names.", + "docstring": "Return feature names for output features.\n\n Parameters\n ----------\n input_features : list of str of shape (n_features,), default=None\n String names for input features if available. By default,\n \"x0\", \"x1\", ... 
\"xn_features\" is used.\n\n Returns\n -------\n output_feature_names : list of str of shape (n_output_features,)\n Transformed feature names.\n ", "source_code": "\n@deprecated('get_feature_names is deprecated in 1.0 and will be removed in 1.2. Please use get_feature_names_out instead.')\ndef get_feature_names(self, input_features=None):\n \"\"\"Return feature names for output features.\n\n Parameters\n ----------\n input_features : list of str of shape (n_features,), default=None\n String names for input features if available. By default,\n \"x0\", \"x1\", ... \"xn_features\" is used.\n\n Returns\n -------\n output_feature_names : list of str of shape (n_output_features,)\n Transformed feature names.\n \"\"\"\n powers = self.powers_\n if input_features is None:\n input_features = ['x%d' % i for i in range(powers.shape[1])]\n feature_names = []\n for row in powers:\n inds = np.where(row)[0]\n if len(inds):\n name = ' '.join(('%s^%d' % (input_features[ind], exp) if exp != 1 else input_features[ind] for (ind, exp) in zip(inds, row[inds])))\n else:\n name = '1'\n feature_names.append(name)\n return feature_names" }, { @@ -155599,7 +167754,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "input_features", @@ -155609,13 +167765,14 @@ "docstring": { "type": "array-like of str or None, default=None", "description": "Input features.\n\n- If `input_features is None`, then `feature_names_in_` is\n used as feature names in. If `feature_names_in_` is not defined,\n then names are generated: `[x0, x1, ..., x(n_features_in_)]`.\n- If `input_features` is an array-like, then `input_features` must\n match `feature_names_in_` if `feature_names_in_` is defined." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Get output feature names for transformation.", - "docstring": "Get output feature names for transformation.\n\nParameters\n----------\ninput_features : array-like of str or None, default=None\n Input features.\n\n - If `input_features is None`, then `feature_names_in_` is\n used as feature names in. If `feature_names_in_` is not defined,\n then names are generated: `[x0, x1, ..., x(n_features_in_)]`.\n - If `input_features` is an array-like, then `input_features` must\n match `feature_names_in_` if `feature_names_in_` is defined.\n\nReturns\n-------\nfeature_names_out : ndarray of str objects\n Transformed feature names.", + "docstring": "Get output feature names for transformation.\n\n Parameters\n ----------\n input_features : array-like of str or None, default=None\n Input features.\n\n - If `input_features is None`, then `feature_names_in_` is\n used as feature names in. If `feature_names_in_` is not defined,\n then names are generated: `[x0, x1, ..., x(n_features_in_)]`.\n - If `input_features` is an array-like, then `input_features` must\n match `feature_names_in_` if `feature_names_in_` is defined.\n\n Returns\n -------\n feature_names_out : ndarray of str objects\n Transformed feature names.\n ", "source_code": "\ndef get_feature_names_out(self, input_features=None):\n \"\"\"Get output feature names for transformation.\n\n Parameters\n ----------\n input_features : array-like of str or None, default=None\n Input features.\n\n - If `input_features is None`, then `feature_names_in_` is\n used as feature names in. 
If `feature_names_in_` is not defined,\n then names are generated: `[x0, x1, ..., x(n_features_in_)]`.\n - If `input_features` is an array-like, then `input_features` must\n match `feature_names_in_` if `feature_names_in_` is defined.\n\n Returns\n -------\n feature_names_out : ndarray of str objects\n Transformed feature names.\n \"\"\"\n powers = self.powers_\n input_features = _check_feature_names_in(self, input_features)\n feature_names = []\n for row in powers:\n inds = np.where(row)[0]\n if len(inds):\n name = ' '.join(('%s^%d' % (input_features[ind], exp) if exp != 1 else input_features[ind] for (ind, exp) in zip(inds, row[inds])))\n else:\n name = '1'\n feature_names.append(name)\n return np.asarray(feature_names, dtype=object)" }, { @@ -155636,13 +167793,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@deprecated('The attribute `n_input_features_` was deprecated in version 1.0 and will be removed in 1.2.')\n@property\ndef n_input_features_(self):\n return self.n_features_in_" }, { @@ -155660,7 +167818,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -155684,7 +167843,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -155694,13 +167854,17 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "The data to transform, row by row.\n\nPrefer CSR over CSC for sparse input (for speed), but CSC is\nrequired if the degree is 4 or higher. If the degree is less than\n4 and the input format is CSC, it will be converted to CSR, have\nits polynomial features generated, then converted back to CSC.\n\nIf the degree is 2 or 3, the method described in \"Leveraging\nSparsity to Speed Up Polynomial Feature Expansions of CSR Matrices\nUsing K-Simplex Numbers\" by Andrew Nystrom and John Hughes is\nused, which is much faster than the method used on CSC input. For\nthis reason, a CSC input will be converted to CSR, and the output\nwill be converted back to CSC prior to being returned, hence the\npreference of CSR." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } } ], "results": [], "is_public": true, "description": "Transform data to polynomial features.", - "docstring": "Transform data to polynomial features.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n The data to transform, row by row.\n\n Prefer CSR over CSC for sparse input (for speed), but CSC is\n required if the degree is 4 or higher. If the degree is less than\n 4 and the input format is CSC, it will be converted to CSR, have\n its polynomial features generated, then converted back to CSC.\n\n If the degree is 2 or 3, the method described in \"Leveraging\n Sparsity to Speed Up Polynomial Feature Expansions of CSR Matrices\n Using K-Simplex Numbers\" by Andrew Nystrom and John Hughes is\n used, which is much faster than the method used on CSC input. For\n this reason, a CSC input will be converted to CSR, and the output\n will be converted back to CSC prior to being returned, hence the\n preference of CSR.\n\nReturns\n-------\nXP : {ndarray, sparse matrix} of shape (n_samples, NP)\n The matrix of features, where `NP` is the number of polynomial\n features generated from the combination of inputs. 
If a sparse\n matrix is provided, it will be converted into a sparse\n `csr_matrix`.", + "docstring": "Transform data to polynomial features.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The data to transform, row by row.\n\n Prefer CSR over CSC for sparse input (for speed), but CSC is\n required if the degree is 4 or higher. If the degree is less than\n 4 and the input format is CSC, it will be converted to CSR, have\n its polynomial features generated, then converted back to CSC.\n\n If the degree is 2 or 3, the method described in \"Leveraging\n Sparsity to Speed Up Polynomial Feature Expansions of CSR Matrices\n Using K-Simplex Numbers\" by Andrew Nystrom and John Hughes is\n used, which is much faster than the method used on CSC input. For\n this reason, a CSC input will be converted to CSR, and the output\n will be converted back to CSC prior to being returned, hence the\n preference of CSR.\n\n Returns\n -------\n XP : {ndarray, sparse matrix} of shape (n_samples, NP)\n The matrix of features, where `NP` is the number of polynomial\n features generated from the combination of inputs. If a sparse\n matrix is provided, it will be converted into a sparse\n `csr_matrix`.\n ", "source_code": "\ndef transform(self, X):\n \"\"\"Transform data to polynomial features.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The data to transform, row by row.\n\n Prefer CSR over CSC for sparse input (for speed), but CSC is\n required if the degree is 4 or higher. If the degree is less than\n 4 and the input format is CSC, it will be converted to CSR, have\n its polynomial features generated, then converted back to CSC.\n\n If the degree is 2 or 3, the method described in \"Leveraging\n Sparsity to Speed Up Polynomial Feature Expansions of CSR Matrices\n Using K-Simplex Numbers\" by Andrew Nystrom and John Hughes is\n used, which is much faster than the method used on CSC input. For\n this reason, a CSC input will be converted to CSR, and the output\n will be converted back to CSC prior to being returned, hence the\n preference of CSR.\n\n Returns\n -------\n XP : {ndarray, sparse matrix} of shape (n_samples, NP)\n The matrix of features, where `NP` is the number of polynomial\n features generated from the combination of inputs. 
If a sparse\n matrix is provided, it will be converted into a sparse\n `csr_matrix`.\n \"\"\"\n check_is_fitted(self)\n X = self._validate_data(X, order='F', dtype=FLOAT_DTYPES, reset=False, accept_sparse=('csr', 'csc'))\n (n_samples, n_features) = X.shape\n if sparse.isspmatrix_csr(X):\n if self._max_degree > 3:\n return self.transform(X.tocsc()).tocsr()\n to_stack = []\n if self.include_bias:\n to_stack.append(sparse.csc_matrix(np.ones(shape=(n_samples, 1), dtype=X.dtype)))\n if self._min_degree <= 1:\n to_stack.append(X)\n for deg in range(max(2, self._min_degree), self._max_degree + 1):\n Xp_next = _csr_polynomial_expansion(X.data, X.indices, X.indptr, X.shape[1], self.interaction_only, deg)\n if Xp_next is None:\n break\n to_stack.append(Xp_next)\n if len(to_stack) == 0:\n XP = sparse.csr_matrix((n_samples, 0), dtype=X.dtype)\n else:\n XP = sparse.hstack(to_stack, format='csr')\n elif sparse.isspmatrix_csc(X) and self._max_degree < 4:\n return self.transform(X.tocsr()).tocsc()\n elif sparse.isspmatrix(X):\n combinations = self._combinations(n_features=n_features, min_degree=self._min_degree, max_degree=self._max_degree, interaction_only=self.interaction_only, include_bias=self.include_bias)\n columns = []\n for combi in combinations:\n if combi:\n out_col = 1\n for col_idx in combi:\n out_col = X[:, col_idx].multiply(out_col)\n columns.append(out_col)\n else:\n bias = sparse.csc_matrix(np.ones((X.shape[0], 1)))\n columns.append(bias)\n XP = sparse.hstack(columns, dtype=X.dtype).tocsc()\n else:\n XP = np.empty(shape=(n_samples, self._n_out_full), dtype=X.dtype, order=self.order)\n if self.include_bias:\n XP[:, 0] = 1\n current_col = 1\n else:\n current_col = 0\n XP[:, current_col:current_col + n_features] = X\n index = list(range(current_col, current_col + n_features))\n current_col += n_features\n index.append(current_col)\n for _ in range(2, self._max_degree + 1):\n new_index = []\n end = index[-1]\n for feature_idx in range(n_features):\n start = index[feature_idx]\n new_index.append(current_col)\n if self.interaction_only:\n start += index[feature_idx + 1] - index[feature_idx]\n next_col = current_col + end - start\n if next_col <= current_col:\n break\n np.multiply(XP[:, start:end], X[:, feature_idx:feature_idx + 1], out=XP[:, current_col:next_col], casting='no')\n current_col = next_col\n new_index.append(current_col)\n index = new_index\n if self._min_degree > 1:\n (n_XP, n_Xout) = (self._n_out_full, self.n_output_features_)\n if self.include_bias:\n Xout = np.empty(shape=(n_samples, n_Xout), dtype=XP.dtype, order=self.order)\n Xout[:, 0] = 1\n Xout[:, 1:] = XP[:, n_XP - n_Xout + 1:]\n else:\n Xout = XP[:, n_XP - n_Xout:].copy()\n XP = Xout\n return XP" }, { @@ -155718,7 +167882,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_knots", @@ -155728,6 +167893,10 @@ "docstring": { "type": "int, default=5", "description": "Number of knots of the splines if `knots` equals one of\n{'uniform', 'quantile'}. Must be larger or equal 2. Ignored if `knots`\nis array-like." + }, + "refined_type": { + "kind": "EnumType", + "values": ["quantile", "uniform"] } }, { @@ -155738,7 +167907,8 @@ "docstring": { "type": "int, default=3", "description": "The polynomial degree of the spline basis. Must be a non-negative\ninteger." 
- } + }, + "refined_type": {} }, { "name": "knots", @@ -155748,6 +167918,10 @@ "docstring": { "type": "{'uniform', 'quantile'} or array-like of shape (n_knots, n_features), default='uniform'", "description": "Set knot positions such that first knot <= features <= last knot.\n\n- If 'uniform', `n_knots` number of knots are distributed uniformly\n from min to max values of the features.\n- If 'quantile', they are distributed uniformly along the quantiles of\n the features.\n- If an array-like is given, it directly specifies the sorted knot\n positions including the boundary knots. Note that, internally,\n `degree` number of knots are added before the first knot, the same\n after the last knot." + }, + "refined_type": { + "kind": "EnumType", + "values": ["quantile", "uniform"] } }, { @@ -155758,6 +167932,16 @@ "docstring": { "type": "{'error', 'constant', 'linear', 'continue', 'periodic'}, default='constant'", "description": "If 'error', values outside the min and max values of the training\nfeatures raises a `ValueError`. If 'constant', the value of the\nsplines at minimum and maximum value of the features is used as\nconstant extrapolation. If 'linear', a linear extrapolation is used.\nIf 'continue', the splines are extrapolated as is, i.e. option\n`extrapolate=True` in :class:`scipy.interpolate.BSpline`. If\n'periodic', periodic splines with a periodicity equal to the distance\nbetween the first and last knot are used. Periodic splines enforce\nequal function values and derivatives at the first and last knot.\nFor example, this makes it possible to avoid introducing an arbitrary\njump between Dec 31st and Jan 1st in spline features derived from a\nnaturally periodic \"day-of-year\" input feature. In this case it is\nrecommended to manually set the knot values to control the period." + }, + "refined_type": { + "kind": "EnumType", + "values": [ + "error", + "periodic", + "constant", + "continue", + "linear" + ] } }, { @@ -155768,7 +167952,8 @@ "docstring": { "type": "bool, default=True", "description": "If True (default), then the last spline element inside the data range\nof a feature is dropped. As B-splines sum to one over the spline basis\nfunctions for each data point, they implicitly include a bias term,\ni.e. a column of ones. It acts as an intercept term in a linear models." - } + }, + "refined_type": {} }, { "name": "order", @@ -155778,13 +167963,17 @@ "docstring": { "type": "{'C', 'F'}, default='C'", "description": "Order of output array. 'F' order is faster to compute, but may slow\ndown subsequent estimators." 
+ }, + "refined_type": { + "kind": "EnumType", + "values": ["F", "C"] } } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, n_knots=5, degree=3, *, knots='uniform', extrapolation='constant', include_bias=True, order='C'):\n self.n_knots = n_knots\n self.degree = degree\n self.knots = knots\n self.extrapolation = extrapolation\n self.include_bias = include_bias\n self.order = order" }, { @@ -155802,7 +167991,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_knots", @@ -155812,7 +168002,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "knots", @@ -155822,7 +168013,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -155832,13 +168024,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Calculate base knot positions.\n\nBase knots such that first knot <= feature <= last knot. For the B-spline construction with scipy.interpolate.BSpline, 2*degree knots beyond the base interval are added.", - "docstring": "Calculate base knot positions.\n\nBase knots such that first knot <= feature <= last knot. For the\nB-spline construction with scipy.interpolate.BSpline, 2*degree knots\nbeyond the base interval are added.\n\nReturns\n-------\nknots : ndarray of shape (n_knots, n_features), dtype=np.float64\n Knot positions (points) of base interval.", + "description": "Calculate base knot positions.\n\nBase knots such that first knot <= feature <= last knot. For the\nB-spline construction with scipy.interpolate.BSpline, 2*degree knots\nbeyond the base interval are added.", + "docstring": "Calculate base knot positions.\n\n Base knots such that first knot <= feature <= last knot. For the\n B-spline construction with scipy.interpolate.BSpline, 2*degree knots\n beyond the base interval are added.\n\n Returns\n -------\n knots : ndarray of shape (n_knots, n_features), dtype=np.float64\n Knot positions (points) of base interval.\n ", "source_code": "\n@staticmethod\ndef _get_base_knot_positions(X, n_knots=10, knots='uniform', sample_weight=None):\n \"\"\"Calculate base knot positions.\n\n Base knots such that first knot <= feature <= last knot. For the\n B-spline construction with scipy.interpolate.BSpline, 2*degree knots\n beyond the base interval are added.\n\n Returns\n -------\n knots : ndarray of shape (n_knots, n_features), dtype=np.float64\n Knot positions (points) of base interval.\n \"\"\"\n if knots == 'quantile':\n percentiles = 100 * np.linspace(start=0, stop=1, num=n_knots, dtype=np.float64)\n if sample_weight is None:\n knots = np.percentile(X, percentiles, axis=0)\n else:\n knots = np.array([_weighted_percentile(X, sample_weight, percentile) for percentile in percentiles])\n else:\n mask = slice(None, None, 1) if sample_weight is None else sample_weight > 0\n x_min = np.amin(X[mask], axis=0)\n x_max = np.amax(X[mask], axis=0)\n knots = linspace(start=x_min, stop=x_max, num=n_knots, endpoint=True, dtype=np.float64)\n return knots" }, { @@ -155856,7 +168049,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -155866,7 +168060,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "The data." 
- } + }, + "refined_type": {} }, { "name": "y", @@ -155876,7 +168071,8 @@ "docstring": { "type": "None", "description": "Ignored." - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -155886,13 +168082,14 @@ "docstring": { "type": "array-like of shape (n_samples,), default = None", "description": "Individual weights for each sample. Used to calculate quantiles if\n`knots=\"quantile\"`. For `knots=\"uniform\"`, zero weighted\nobservations are ignored for finding the min and max of `X`." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Compute knot positions of splines.", - "docstring": "Compute knot positions of splines.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n The data.\n\ny : None\n Ignored.\n\nsample_weight : array-like of shape (n_samples,), default = None\n Individual weights for each sample. Used to calculate quantiles if\n `knots=\"quantile\"`. For `knots=\"uniform\"`, zero weighted\n observations are ignored for finding the min and max of `X`.\n\nReturns\n-------\nself : object\n Fitted transformer.", + "docstring": "Compute knot positions of splines.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The data.\n\n y : None\n Ignored.\n\n sample_weight : array-like of shape (n_samples,), default = None\n Individual weights for each sample. Used to calculate quantiles if\n `knots=\"quantile\"`. For `knots=\"uniform\"`, zero weighted\n observations are ignored for finding the min and max of `X`.\n\n Returns\n -------\n self : object\n Fitted transformer.\n ", "source_code": "\ndef fit(self, X, y=None, sample_weight=None):\n \"\"\"Compute knot positions of splines.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The data.\n\n y : None\n Ignored.\n\n sample_weight : array-like of shape (n_samples,), default = None\n Individual weights for each sample. Used to calculate quantiles if\n `knots=\"quantile\"`. 
For `knots=\"uniform\"`, zero weighted\n observations are ignored for finding the min and max of `X`.\n\n Returns\n -------\n self : object\n Fitted transformer.\n \"\"\"\n X = self._validate_data(X, reset=True, accept_sparse=False, ensure_min_samples=2, ensure_2d=True)\n if sample_weight is not None:\n sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype)\n (_, n_features) = X.shape\n if not (isinstance(self.degree, numbers.Integral) and self.degree >= 0):\n raise ValueError(f'degree must be a non-negative integer, got {self.degree}.')\n if isinstance(self.knots, str) and self.knots in ['uniform', 'quantile']:\n if not (isinstance(self.n_knots, numbers.Integral) and self.n_knots >= 2):\n raise ValueError(f'n_knots must be a positive integer >= 2, got: {self.n_knots}')\n base_knots = self._get_base_knot_positions(X, n_knots=self.n_knots, knots=self.knots, sample_weight=sample_weight)\n else:\n base_knots = check_array(self.knots, dtype=np.float64)\n if base_knots.shape[0] < 2:\n raise ValueError('Number of knots, knots.shape[0], must be >= 2.')\n elif base_knots.shape[1] != n_features:\n raise ValueError('knots.shape[1] == n_features is violated.')\n elif not np.all(np.diff(base_knots, axis=0) > 0):\n raise ValueError('knots must be sorted without duplicates.')\n if self.extrapolation not in ('error', 'constant', 'linear', 'continue', 'periodic'):\n raise ValueError(\"extrapolation must be one of 'error', 'constant', 'linear', 'continue' or 'periodic'.\")\n if not isinstance(self.include_bias, (bool, np.bool_)):\n raise ValueError('include_bias must be bool.')\n n_knots = base_knots.shape[0]\n if self.extrapolation == 'periodic' and n_knots <= self.degree:\n raise ValueError(f'Periodic splines require degree < n_knots. Got n_knots={n_knots} and degree={self.degree}.')\n if self.extrapolation != 'periodic':\n n_splines = n_knots + self.degree - 1\n else:\n n_splines = n_knots - 1\n degree = self.degree\n n_out = n_features * n_splines\n if self.extrapolation == 'periodic':\n period = base_knots[-1] - base_knots[0]\n knots = np.r_[base_knots[-(degree + 1):-1] - period, base_knots, base_knots[1:degree + 1] + period]\n else:\n dist_min = base_knots[1] - base_knots[0]\n dist_max = base_knots[-1] - base_knots[-2]\n knots = np.r_[linspace(base_knots[0] - degree * dist_min, base_knots[0] - dist_min, num=degree), base_knots, linspace(base_knots[-1] + dist_max, base_knots[-1] + degree * dist_max, num=degree)]\n coef = np.eye(n_splines, dtype=np.float64)\n if self.extrapolation == 'periodic':\n coef = np.concatenate((coef, coef[:degree, :]))\n extrapolate = self.extrapolation in ['periodic', 'continue']\n bsplines = [BSpline.construct_fast(knots[:, i], coef, self.degree, extrapolate=extrapolate) for i in range(n_features)]\n self.bsplines_ = bsplines\n self.n_features_out_ = n_out - n_features * (1 - self.include_bias)\n return self" }, { @@ -155912,7 +168109,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "input_features", @@ -155922,13 +168120,14 @@ "docstring": { "type": "list of str of shape (n_features,), default=None", "description": "String names for input features if available. By default,\n\"x0\", \"x1\", ... \"xn_features\" is used." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Return feature names for output features.", - "docstring": "Return feature names for output features.\n\nParameters\n----------\ninput_features : list of str of shape (n_features,), default=None\n String names for input features if available. By default,\n \"x0\", \"x1\", ... \"xn_features\" is used.\n\nReturns\n-------\noutput_feature_names : list of str of shape (n_output_features,)\n Transformed feature names.", + "docstring": "Return feature names for output features.\n\n Parameters\n ----------\n input_features : list of str of shape (n_features,), default=None\n String names for input features if available. By default,\n \"x0\", \"x1\", ... \"xn_features\" is used.\n\n Returns\n -------\n output_feature_names : list of str of shape (n_output_features,)\n Transformed feature names.\n ", "source_code": "\n@deprecated('get_feature_names is deprecated in 1.0 and will be removed in 1.2. Please use get_feature_names_out instead.')\ndef get_feature_names(self, input_features=None):\n \"\"\"Return feature names for output features.\n\n Parameters\n ----------\n input_features : list of str of shape (n_features,), default=None\n String names for input features if available. By default,\n \"x0\", \"x1\", ... \"xn_features\" is used.\n\n Returns\n -------\n output_feature_names : list of str of shape (n_output_features,)\n Transformed feature names.\n \"\"\"\n n_splines = self.bsplines_[0].c.shape[0]\n if input_features is None:\n input_features = ['x%d' % i for i in range(self.n_features_in_)]\n feature_names = []\n for i in range(self.n_features_in_):\n for j in range(n_splines - 1 + self.include_bias):\n feature_names.append(f'{input_features[i]}_sp_{j}')\n return feature_names" }, { @@ -155946,7 +168145,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "input_features", @@ -155956,13 +168156,14 @@ "docstring": { "type": "array-like of str or None, default=None", "description": "Input features.\n\n- If `input_features` is `None`, then `feature_names_in_` is\n used as feature names in. If `feature_names_in_` is not defined,\n then names are generated: `[x0, x1, ..., x(n_features_in_)]`.\n- If `input_features` is an array-like, then `input_features` must\n match `feature_names_in_` if `feature_names_in_` is defined." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Get output feature names for transformation.", - "docstring": "Get output feature names for transformation.\n\nParameters\n----------\ninput_features : array-like of str or None, default=None\n Input features.\n\n - If `input_features` is `None`, then `feature_names_in_` is\n used as feature names in. If `feature_names_in_` is not defined,\n then names are generated: `[x0, x1, ..., x(n_features_in_)]`.\n - If `input_features` is an array-like, then `input_features` must\n match `feature_names_in_` if `feature_names_in_` is defined.\n\nReturns\n-------\nfeature_names_out : ndarray of str objects\n Transformed feature names.", + "docstring": "Get output feature names for transformation.\n\n Parameters\n ----------\n input_features : array-like of str or None, default=None\n Input features.\n\n - If `input_features` is `None`, then `feature_names_in_` is\n used as feature names in. 
If `feature_names_in_` is not defined,\n then names are generated: `[x0, x1, ..., x(n_features_in_)]`.\n - If `input_features` is an array-like, then `input_features` must\n match `feature_names_in_` if `feature_names_in_` is defined.\n\n Returns\n -------\n feature_names_out : ndarray of str objects\n Transformed feature names.\n ", "source_code": "\ndef get_feature_names_out(self, input_features=None):\n \"\"\"Get output feature names for transformation.\n\n Parameters\n ----------\n input_features : array-like of str or None, default=None\n Input features.\n\n - If `input_features` is `None`, then `feature_names_in_` is\n used as feature names in. If `feature_names_in_` is not defined,\n then names are generated: `[x0, x1, ..., x(n_features_in_)]`.\n - If `input_features` is an array-like, then `input_features` must\n match `feature_names_in_` if `feature_names_in_` is defined.\n\n Returns\n -------\n feature_names_out : ndarray of str objects\n Transformed feature names.\n \"\"\"\n n_splines = self.bsplines_[0].c.shape[0]\n input_features = _check_feature_names_in(self, input_features)\n feature_names = []\n for i in range(self.n_features_in_):\n for j in range(n_splines - 1 + self.include_bias):\n feature_names.append(f'{input_features[i]}_sp_{j}')\n return np.asarray(feature_names, dtype=object)" }, { @@ -155980,7 +168181,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -155990,13 +168192,14 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "The data to transform." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Transform each feature data to B-splines.", - "docstring": "Transform each feature data to B-splines.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n The data to transform.\n\nReturns\n-------\nXBS : ndarray of shape (n_samples, n_features * n_splines)\n The matrix of features, where n_splines is the number of bases\n elements of the B-splines, n_knots + degree - 1.", + "docstring": "Transform each feature data to B-splines.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The data to transform.\n\n Returns\n -------\n XBS : ndarray of shape (n_samples, n_features * n_splines)\n The matrix of features, where n_splines is the number of bases\n elements of the B-splines, n_knots + degree - 1.\n ", "source_code": "\ndef transform(self, X):\n \"\"\"Transform each feature data to B-splines.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The data to transform.\n\n Returns\n -------\n XBS : ndarray of shape (n_samples, n_features * n_splines)\n The matrix of features, where n_splines is the number of bases\n elements of the B-splines, n_knots + degree - 1.\n \"\"\"\n check_is_fitted(self)\n X = self._validate_data(X, reset=False, accept_sparse=False, ensure_2d=True)\n (n_samples, n_features) = X.shape\n n_splines = self.bsplines_[0].c.shape[1]\n degree = self.degree\n n_out = self.n_features_out_ + n_features * (1 - self.include_bias)\n if X.dtype in FLOAT_DTYPES:\n dtype = X.dtype\n else:\n dtype = np.float64\n XBS = np.zeros((n_samples, n_out), dtype=dtype, order=self.order)\n for i in range(n_features):\n spl = self.bsplines_[i]\n if self.extrapolation in ('continue', 'error', 'periodic'):\n if self.extrapolation == 'periodic':\n n = spl.t.size - spl.k - 1\n x = spl.t[spl.k] + (X[:, i] - spl.t[spl.k]) % (spl.t[n] - spl.t[spl.k])\n else:\n x = X[:, i]\n XBS[:, i * 
n_splines:(i + 1) * n_splines] = spl(x)\n else:\n xmin = spl.t[degree]\n xmax = spl.t[-degree - 1]\n mask = (xmin <= X[:, i]) & (X[:, i] <= xmax)\n XBS[mask, i * n_splines:(i + 1) * n_splines] = spl(X[mask, i])\n if self.extrapolation == 'error':\n if np.any(np.isnan(XBS[:, i * n_splines:(i + 1) * n_splines])):\n raise ValueError('X contains values beyond the limits of the knots.')\n elif self.extrapolation == 'constant':\n f_min = spl(xmin)\n f_max = spl(xmax)\n mask = X[:, i] < xmin\n if np.any(mask):\n XBS[mask, i * n_splines:i * n_splines + degree] = f_min[:degree]\n mask = X[:, i] > xmax\n if np.any(mask):\n XBS[mask, (i + 1) * n_splines - degree:(i + 1) * n_splines] = f_max[-degree:]\n elif self.extrapolation == 'linear':\n (f_min, f_max) = (spl(xmin), spl(xmax))\n (fp_min, fp_max) = (spl(xmin, nu=1), spl(xmax, nu=1))\n if degree <= 1:\n degree += 1\n for j in range(degree):\n mask = X[:, i] < xmin\n if np.any(mask):\n XBS[mask, i * n_splines + j] = f_min[j] + (X[mask, i] - xmin) * fp_min[j]\n mask = X[:, i] > xmax\n if np.any(mask):\n k = n_splines - 1 - j\n XBS[mask, i * n_splines + k] = f_max[k] + (X[mask, i] - xmax) * fp_max[k]\n if self.include_bias:\n return XBS\n else:\n indices = [j for j in range(XBS.shape[1]) if (j + 1) % n_splines != 0]\n return XBS[:, indices]" }, { @@ -156014,7 +168217,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "top_path", @@ -156024,13 +168228,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef configuration(parent_package='', top_path=None):\n import numpy\n from numpy.distutils.misc_util import Configuration\n config = Configuration('preprocessing', parent_package, top_path)\n libraries = []\n if os.name == 'posix':\n libraries.append('m')\n config.add_extension('_csr_polynomial_expansion', sources=['_csr_polynomial_expansion.pyx'], include_dirs=[numpy.get_include()], libraries=libraries)\n config.add_subpackage('tests')\n return config" }, { @@ -156048,7 +168253,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_components", @@ -156058,7 +168264,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "eps", @@ -156068,7 +168275,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "dense_output", @@ -156078,7 +168286,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "random_state", @@ -156088,13 +168297,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@abstractmethod\ndef __init__(self, n_components='auto', *, eps=0.1, dense_output=False, random_state=None):\n self.n_components = n_components\n self.eps = eps\n self.dense_output = dense_output\n self.random_state = random_state" }, { @@ -156112,7 +168322,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_components", @@ -156122,7 +168333,8 @@ "docstring": { "type": "int,", "description": "Dimensionality of the target projection space." - } + }, + "refined_type": {} }, { "name": "n_features", @@ -156132,13 +168344,14 @@ "docstring": { "type": "int,", "description": "Dimensionality of the original source space." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Generate the random projection matrix.", - "docstring": "Generate the random projection matrix.\n\nParameters\n----------\nn_components : int,\n Dimensionality of the target projection space.\n\nn_features : int,\n Dimensionality of the original source space.\n\nReturns\n-------\ncomponents : {ndarray, sparse matrix} of shape (n_components, n_features)\n The generated random matrix. Sparse matrix will be of CSR format.", + "docstring": "Generate the random projection matrix.\n\n Parameters\n ----------\n n_components : int,\n Dimensionality of the target projection space.\n\n n_features : int,\n Dimensionality of the original source space.\n\n Returns\n -------\n components : {ndarray, sparse matrix} of shape (n_components, n_features)\n The generated random matrix. Sparse matrix will be of CSR format.\n\n ", "source_code": "\n@abstractmethod\ndef _make_random_matrix(self, n_components, n_features):\n \"\"\"Generate the random projection matrix.\n\n Parameters\n ----------\n n_components : int,\n Dimensionality of the target projection space.\n\n n_features : int,\n Dimensionality of the original source space.\n\n Returns\n -------\n components : {ndarray, sparse matrix} of shape (n_components, n_features)\n The generated random matrix. Sparse matrix will be of CSR format.\n\n \"\"\"\n " }, { @@ -156156,7 +168369,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -156166,6 +168380,10 @@ "docstring": { "type": "{ndarray, sparse matrix} of shape (n_samples, n_features)", "description": "Training set: only the shape is used to find optimal random\nmatrix dimensions based on the theory referenced in the\nafore mentioned papers." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -156176,13 +168394,14 @@ "docstring": { "type": "Ignored", "description": "Not used, present here for API consistency by convention." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Generate a sparse random projection matrix.", - "docstring": "Generate a sparse random projection matrix.\n\nParameters\n----------\nX : {ndarray, sparse matrix} of shape (n_samples, n_features)\n Training set: only the shape is used to find optimal random\n matrix dimensions based on the theory referenced in the\n afore mentioned papers.\n\ny : Ignored\n Not used, present here for API consistency by convention.\n\nReturns\n-------\nself : object\n BaseRandomProjection class instance.", + "docstring": "Generate a sparse random projection matrix.\n\n Parameters\n ----------\n X : {ndarray, sparse matrix} of shape (n_samples, n_features)\n Training set: only the shape is used to find optimal random\n matrix dimensions based on the theory referenced in the\n afore mentioned papers.\n\n y : Ignored\n Not used, present here for API consistency by convention.\n\n Returns\n -------\n self : object\n BaseRandomProjection class instance.\n ", "source_code": "\ndef fit(self, X, y=None):\n \"\"\"Generate a sparse random projection matrix.\n\n Parameters\n ----------\n X : {ndarray, sparse matrix} of shape (n_samples, n_features)\n Training set: only the shape is used to find optimal random\n matrix dimensions based on the theory referenced in the\n afore mentioned papers.\n\n y : Ignored\n Not used, present here for API consistency by convention.\n\n Returns\n -------\n self : object\n BaseRandomProjection class instance.\n \"\"\"\n X = self._validate_data(X, accept_sparse=['csr', 'csc'])\n (n_samples, n_features) = X.shape\n if self.n_components == 'auto':\n self.n_components_ = johnson_lindenstrauss_min_dim(n_samples=n_samples, eps=self.eps)\n if self.n_components_ <= 0:\n raise ValueError('eps=%f and n_samples=%d lead to a target dimension of %d which is invalid' % (self.eps, n_samples, self.n_components_))\n elif self.n_components_ > n_features:\n raise ValueError('eps=%f and n_samples=%d lead to a target dimension of %d which is larger than the original space with n_features=%d' % (self.eps, n_samples, self.n_components_, n_features))\n else:\n if self.n_components <= 0:\n raise ValueError('n_components must be greater than 0, got %s' % self.n_components)\n elif self.n_components > n_features:\n warnings.warn('The number of components is higher than the number of features: n_features < n_components (%s < %s).The dimensionality of the problem will not be reduced.' % (n_features, self.n_components), DataDimensionalityWarning)\n self.n_components_ = self.n_components\n self.components_ = self._make_random_matrix(self.n_components_, n_features)\n assert self.components_.shape == (self.n_components_, n_features), 'An error has occurred the self.components_ matrix has not the proper shape.'\n return self" }, { @@ -156200,7 +168419,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -156210,13 +168430,17 @@ "docstring": { "type": "{ndarray, sparse matrix} of shape (n_samples, n_features)", "description": "The input data to project into a smaller dimensional space." 
+ }, + "refined_type": { + "kind": "EnumType", + "values": [] } } ], "results": [], "is_public": true, "description": "Project the data by using matrix product with the random matrix.", - "docstring": "Project the data by using matrix product with the random matrix.\n\nParameters\n----------\nX : {ndarray, sparse matrix} of shape (n_samples, n_features)\n The input data to project into a smaller dimensional space.\n\nReturns\n-------\nX_new : {ndarray, sparse matrix} of shape (n_samples, n_components)\n Projected array.", + "docstring": "Project the data by using matrix product with the random matrix.\n\n Parameters\n ----------\n X : {ndarray, sparse matrix} of shape (n_samples, n_features)\n The input data to project into a smaller dimensional space.\n\n Returns\n -------\n X_new : {ndarray, sparse matrix} of shape (n_samples, n_components)\n Projected array.\n ", "source_code": "\ndef transform(self, X):\n \"\"\"Project the data by using matrix product with the random matrix.\n\n Parameters\n ----------\n X : {ndarray, sparse matrix} of shape (n_samples, n_features)\n The input data to project into a smaller dimensional space.\n\n Returns\n -------\n X_new : {ndarray, sparse matrix} of shape (n_samples, n_components)\n Projected array.\n \"\"\"\n check_is_fitted(self)\n X = self._validate_data(X, accept_sparse=['csr', 'csc'], reset=False)\n if X.shape[1] != self.components_.shape[1]:\n raise ValueError('Impossible to perform projection:X at fit stage had a different number of features. (%s != %s)' % (X.shape[1], self.components_.shape[1]))\n X_new = safe_sparse_dot(X, self.components_.T, dense_output=self.dense_output)\n return X_new" }, { @@ -156234,7 +168458,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_components", @@ -156244,7 +168469,8 @@ "docstring": { "type": "int or 'auto', default='auto'", "description": "Dimensionality of the target projection space.\n\nn_components can be automatically adjusted according to the\nnumber of samples in the dataset and the bound given by the\nJohnson-Lindenstrauss lemma. In that case the quality of the\nembedding is controlled by the ``eps`` parameter.\n\nIt should be noted that Johnson-Lindenstrauss lemma can yield\nvery conservative estimated of the required number of components\nas it makes no assumption on the structure of the dataset." - } + }, + "refined_type": {} }, { "name": "eps", @@ -156254,7 +168480,8 @@ "docstring": { "type": "float, default=0.1", "description": "Parameter to control the quality of the embedding according to\nthe Johnson-Lindenstrauss lemma when `n_components` is set to\n'auto'. The value should be strictly positive.\n\nSmaller values lead to better embedding and higher number of\ndimensions (n_components) in the target projection space." - } + }, + "refined_type": {} }, { "name": "random_state", @@ -156264,13 +168491,14 @@ "docstring": { "type": "int, RandomState instance or None, default=None", "description": "Controls the pseudo random number generator used to generate the\nprojection matrix at fit time.\nPass an int for reproducible output across multiple function calls.\nSee :term:`Glossary `." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, n_components='auto', *, eps=0.1, random_state=None):\n super().__init__(n_components=n_components, eps=eps, dense_output=True, random_state=random_state)" }, { @@ -156288,7 +168516,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_components", @@ -156298,7 +168527,8 @@ "docstring": { "type": "int,", "description": "Dimensionality of the target projection space." - } + }, + "refined_type": {} }, { "name": "n_features", @@ -156308,13 +168538,14 @@ "docstring": { "type": "int,", "description": "Dimensionality of the original source space." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Generate the random projection matrix.", - "docstring": "Generate the random projection matrix.\n\nParameters\n----------\nn_components : int,\n Dimensionality of the target projection space.\n\nn_features : int,\n Dimensionality of the original source space.\n\nReturns\n-------\ncomponents : {ndarray, sparse matrix} of shape (n_components, n_features)\n The generated random matrix. Sparse matrix will be of CSR format.", + "docstring": " Generate the random projection matrix.\n\n Parameters\n ----------\n n_components : int,\n Dimensionality of the target projection space.\n\n n_features : int,\n Dimensionality of the original source space.\n\n Returns\n -------\n components : {ndarray, sparse matrix} of shape (n_components, n_features)\n The generated random matrix. Sparse matrix will be of CSR format.\n\n ", "source_code": "\ndef _make_random_matrix(self, n_components, n_features):\n \"\"\" Generate the random projection matrix.\n\n Parameters\n ----------\n n_components : int,\n Dimensionality of the target projection space.\n\n n_features : int,\n Dimensionality of the original source space.\n\n Returns\n -------\n components : {ndarray, sparse matrix} of shape (n_components, n_features)\n The generated random matrix. Sparse matrix will be of CSR format.\n\n \"\"\"\n random_state = check_random_state(self.random_state)\n return _gaussian_random_matrix(n_components, n_features, random_state=random_state)" }, { @@ -156332,7 +168563,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_components", @@ -156342,7 +168574,8 @@ "docstring": { "type": "int or 'auto', default='auto'", "description": "Dimensionality of the target projection space.\n\nn_components can be automatically adjusted according to the\nnumber of samples in the dataset and the bound given by the\nJohnson-Lindenstrauss lemma. In that case the quality of the\nembedding is controlled by the ``eps`` parameter.\n\nIt should be noted that Johnson-Lindenstrauss lemma can yield\nvery conservative estimated of the required number of components\nas it makes no assumption on the structure of the dataset." - } + }, + "refined_type": {} }, { "name": "density", @@ -156352,6 +168585,14 @@ "docstring": { "type": "float or 'auto', default='auto'", "description": "Ratio in the range (0, 1] of non-zero component in the random\nprojection matrix.\n\nIf density = 'auto', the value is set to the minimum density\nas recommended by Ping Li et al.: 1 / sqrt(n_features).\n\nUse density = 1 / 3.0 if you want to reproduce the results from\nAchlioptas, 2001." 
+ }, + "refined_type": { + "kind": "BoundaryType", + "base_type": "float", + "min": 0.0, + "max": 1.0, + "min_inclusive": false, + "max_inclusive": true } }, { @@ -156362,7 +168603,8 @@ "docstring": { "type": "float, default=0.1", "description": "Parameter to control the quality of the embedding according to\nthe Johnson-Lindenstrauss lemma when n_components is set to\n'auto'. This value should be strictly positive.\n\nSmaller values lead to better embedding and higher number of\ndimensions (n_components) in the target projection space." - } + }, + "refined_type": {} }, { "name": "dense_output", @@ -156372,7 +168614,8 @@ "docstring": { "type": "bool, default=False", "description": "If True, ensure that the output of the random projection is a\ndense numpy array even if the input and random projection matrix\nare both sparse. In practice, if the number of components is\nsmall the number of zero components in the projected data will\nbe very small and it will be more CPU and memory efficient to\nuse a dense representation.\n\nIf False, the projected data uses a sparse representation if\nthe input is sparse." - } + }, + "refined_type": {} }, { "name": "random_state", @@ -156382,13 +168625,14 @@ "docstring": { "type": "int, RandomState instance or None, default=None", "description": "Controls the pseudo random number generator used to generate the\nprojection matrix at fit time.\nPass an int for reproducible output across multiple function calls.\nSee :term:`Glossary `." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, n_components='auto', *, density='auto', eps=0.1, dense_output=False, random_state=None):\n super().__init__(n_components=n_components, eps=eps, dense_output=dense_output, random_state=random_state)\n self.density = density" }, { @@ -156406,7 +168650,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_components", @@ -156416,7 +168661,8 @@ "docstring": { "type": "int", "description": "Dimensionality of the target projection space." - } + }, + "refined_type": {} }, { "name": "n_features", @@ -156426,13 +168672,14 @@ "docstring": { "type": "int", "description": "Dimensionality of the original source space." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Generate the random projection matrix", - "docstring": "Generate the random projection matrix\n\nParameters\n----------\nn_components : int\n Dimensionality of the target projection space.\n\nn_features : int\n Dimensionality of the original source space.\n\nReturns\n-------\ncomponents : {ndarray, sparse matrix} of shape (n_components, n_features)\n The generated random matrix. Sparse matrix will be of CSR format.", + "docstring": " Generate the random projection matrix\n\n Parameters\n ----------\n n_components : int\n Dimensionality of the target projection space.\n\n n_features : int\n Dimensionality of the original source space.\n\n Returns\n -------\n components : {ndarray, sparse matrix} of shape (n_components, n_features)\n The generated random matrix. 
Sparse matrix will be of CSR format.\n\n ", "source_code": "\ndef _make_random_matrix(self, n_components, n_features):\n \"\"\" Generate the random projection matrix\n\n Parameters\n ----------\n n_components : int\n Dimensionality of the target projection space.\n\n n_features : int\n Dimensionality of the original source space.\n\n Returns\n -------\n components : {ndarray, sparse matrix} of shape (n_components, n_features)\n The generated random matrix. Sparse matrix will be of CSR format.\n\n \"\"\"\n random_state = check_random_state(self.random_state)\n self.density_ = _check_density(self.density, n_features)\n return _sparse_random_matrix(n_components, n_features, density=self.density_, random_state=random_state)" }, { @@ -156450,7 +168697,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_features", @@ -156460,7 +168708,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -156484,7 +168733,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_features", @@ -156494,7 +168744,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -156518,7 +168769,8 @@ "docstring": { "type": "int,", "description": "Dimensionality of the target projection space." - } + }, + "refined_type": {} }, { "name": "n_features", @@ -156528,7 +168780,8 @@ "docstring": { "type": "int,", "description": "Dimensionality of the original source space." - } + }, + "refined_type": {} }, { "name": "random_state", @@ -156538,13 +168791,14 @@ "docstring": { "type": "int, RandomState instance or None, default=None", "description": "Controls the pseudo random number generator used to generate the matrix\nat fit time.\nPass an int for reproducible output across multiple function calls.\nSee :term:`Glossary `." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Generate a dense Gaussian random matrix.\n\nThe components of the random matrix are drawn from N(0, 1.0 / n_components). 
Read more in the :ref:`User Guide `.", - "docstring": "Generate a dense Gaussian random matrix.\n\nThe components of the random matrix are drawn from\n\n N(0, 1.0 / n_components).\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\nn_components : int,\n Dimensionality of the target projection space.\n\nn_features : int,\n Dimensionality of the original source space.\n\nrandom_state : int, RandomState instance or None, default=None\n Controls the pseudo random number generator used to generate the matrix\n at fit time.\n Pass an int for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\nReturns\n-------\ncomponents : ndarray of shape (n_components, n_features)\n The generated Gaussian random matrix.\n\nSee Also\n--------\nGaussianRandomProjection", + "description": "Generate a dense Gaussian random matrix.\n\nThe components of the random matrix are drawn from\n\n N(0, 1.0 / n_components).\n\nRead more in the :ref:`User Guide `.", + "docstring": "Generate a dense Gaussian random matrix.\n\n The components of the random matrix are drawn from\n\n N(0, 1.0 / n_components).\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n n_components : int,\n Dimensionality of the target projection space.\n\n n_features : int,\n Dimensionality of the original source space.\n\n random_state : int, RandomState instance or None, default=None\n Controls the pseudo random number generator used to generate the matrix\n at fit time.\n Pass an int for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n Returns\n -------\n components : ndarray of shape (n_components, n_features)\n The generated Gaussian random matrix.\n\n See Also\n --------\n GaussianRandomProjection\n ", "source_code": "\ndef _gaussian_random_matrix(n_components, n_features, random_state=None):\n \"\"\"Generate a dense Gaussian random matrix.\n\n The components of the random matrix are drawn from\n\n N(0, 1.0 / n_components).\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n n_components : int,\n Dimensionality of the target projection space.\n\n n_features : int,\n Dimensionality of the original source space.\n\n random_state : int, RandomState instance or None, default=None\n Controls the pseudo random number generator used to generate the matrix\n at fit time.\n Pass an int for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n Returns\n -------\n components : ndarray of shape (n_components, n_features)\n The generated Gaussian random matrix.\n\n See Also\n --------\n GaussianRandomProjection\n \"\"\"\n _check_input_size(n_components, n_features)\n rng = check_random_state(random_state)\n components = rng.normal(loc=0.0, scale=1.0 / np.sqrt(n_components), size=(n_components, n_features))\n return components" }, { @@ -156562,7 +168816,8 @@ "docstring": { "type": "int,", "description": "Dimensionality of the target projection space." - } + }, + "refined_type": {} }, { "name": "n_features", @@ -156572,7 +168827,8 @@ "docstring": { "type": "int,", "description": "Dimensionality of the original source space." 
- } + }, + "refined_type": {} }, { "name": "density", @@ -156582,7 +168838,8 @@ "docstring": { "type": "float or 'auto', default='auto'", "description": "Ratio of non-zero component in the random projection matrix in the\nrange `(0, 1]`\n\nIf density = 'auto', the value is set to the minimum density\nas recommended by Ping Li et al.: 1 / sqrt(n_features).\n\nUse density = 1 / 3.0 if you want to reproduce the results from\nAchlioptas, 2001." - } + }, + "refined_type": {} }, { "name": "random_state", @@ -156592,13 +168849,14 @@ "docstring": { "type": "int, RandomState instance or None, default=None", "description": "Controls the pseudo random number generator used to generate the matrix\nat fit time.\nPass an int for reproducible output across multiple function calls.\nSee :term:`Glossary `." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Generalized Achlioptas random sparse matrix for random projection.\n\nSetting density to 1 / 3 will yield the original matrix by Dimitris Achlioptas while setting a lower value will yield the generalization by Ping Li et al. If we note :math:`s = 1 / density`, the components of the random matrix are drawn from: - -sqrt(s) / sqrt(n_components) with probability 1 / 2s - 0 with probability 1 - 1 / s - +sqrt(s) / sqrt(n_components) with probability 1 / 2s Read more in the :ref:`User Guide `.", - "docstring": "Generalized Achlioptas random sparse matrix for random projection.\n\nSetting density to 1 / 3 will yield the original matrix by Dimitris\nAchlioptas while setting a lower value will yield the generalization\nby Ping Li et al.\n\nIf we note :math:`s = 1 / density`, the components of the random matrix are\ndrawn from:\n\n - -sqrt(s) / sqrt(n_components) with probability 1 / 2s\n - 0 with probability 1 - 1 / s\n - +sqrt(s) / sqrt(n_components) with probability 1 / 2s\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\nn_components : int,\n Dimensionality of the target projection space.\n\nn_features : int,\n Dimensionality of the original source space.\n\ndensity : float or 'auto', default='auto'\n Ratio of non-zero component in the random projection matrix in the\n range `(0, 1]`\n\n If density = 'auto', the value is set to the minimum density\n as recommended by Ping Li et al.: 1 / sqrt(n_features).\n\n Use density = 1 / 3.0 if you want to reproduce the results from\n Achlioptas, 2001.\n\nrandom_state : int, RandomState instance or None, default=None\n Controls the pseudo random number generator used to generate the matrix\n at fit time.\n Pass an int for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\nReturns\n-------\ncomponents : {ndarray, sparse matrix} of shape (n_components, n_features)\n The generated Gaussian random matrix. Sparse matrix will be of CSR\n format.\n\nSee Also\n--------\nSparseRandomProjection\n\nReferences\n----------\n\n.. [1] Ping Li, T. Hastie and K. W. Church, 2006,\n \"Very Sparse Random Projections\".\n https://web.stanford.edu/~hastie/Papers/Ping/KDD06_rp.pdf\n\n.. [2] D. 
Achlioptas, 2001, \"Database-friendly random projections\",\n http://www.cs.ucsc.edu/~optas/papers/jl.pdf", + "description": "Generalized Achlioptas random sparse matrix for random projection.\n\nSetting density to 1 / 3 will yield the original matrix by Dimitris\nAchlioptas while setting a lower value will yield the generalization\nby Ping Li et al.\n\nIf we note :math:`s = 1 / density`, the components of the random matrix are\ndrawn from:\n\n - -sqrt(s) / sqrt(n_components) with probability 1 / 2s\n - 0 with probability 1 - 1 / s\n - +sqrt(s) / sqrt(n_components) with probability 1 / 2s\n\nRead more in the :ref:`User Guide `.", + "docstring": "Generalized Achlioptas random sparse matrix for random projection.\n\n Setting density to 1 / 3 will yield the original matrix by Dimitris\n Achlioptas while setting a lower value will yield the generalization\n by Ping Li et al.\n\n If we note :math:`s = 1 / density`, the components of the random matrix are\n drawn from:\n\n - -sqrt(s) / sqrt(n_components) with probability 1 / 2s\n - 0 with probability 1 - 1 / s\n - +sqrt(s) / sqrt(n_components) with probability 1 / 2s\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n n_components : int,\n Dimensionality of the target projection space.\n\n n_features : int,\n Dimensionality of the original source space.\n\n density : float or 'auto', default='auto'\n Ratio of non-zero component in the random projection matrix in the\n range `(0, 1]`\n\n If density = 'auto', the value is set to the minimum density\n as recommended by Ping Li et al.: 1 / sqrt(n_features).\n\n Use density = 1 / 3.0 if you want to reproduce the results from\n Achlioptas, 2001.\n\n random_state : int, RandomState instance or None, default=None\n Controls the pseudo random number generator used to generate the matrix\n at fit time.\n Pass an int for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n Returns\n -------\n components : {ndarray, sparse matrix} of shape (n_components, n_features)\n The generated Gaussian random matrix. Sparse matrix will be of CSR\n format.\n\n See Also\n --------\n SparseRandomProjection\n\n References\n ----------\n\n .. [1] Ping Li, T. Hastie and K. W. Church, 2006,\n \"Very Sparse Random Projections\".\n https://web.stanford.edu/~hastie/Papers/Ping/KDD06_rp.pdf\n\n .. [2] D. 
Achlioptas, 2001, \"Database-friendly random projections\",\n http://www.cs.ucsc.edu/~optas/papers/jl.pdf\n\n ", "source_code": "\ndef _sparse_random_matrix(n_components, n_features, density='auto', random_state=None):\n \"\"\"Generalized Achlioptas random sparse matrix for random projection.\n\n Setting density to 1 / 3 will yield the original matrix by Dimitris\n Achlioptas while setting a lower value will yield the generalization\n by Ping Li et al.\n\n If we note :math:`s = 1 / density`, the components of the random matrix are\n drawn from:\n\n - -sqrt(s) / sqrt(n_components) with probability 1 / 2s\n - 0 with probability 1 - 1 / s\n - +sqrt(s) / sqrt(n_components) with probability 1 / 2s\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n n_components : int,\n Dimensionality of the target projection space.\n\n n_features : int,\n Dimensionality of the original source space.\n\n density : float or 'auto', default='auto'\n Ratio of non-zero component in the random projection matrix in the\n range `(0, 1]`\n\n If density = 'auto', the value is set to the minimum density\n as recommended by Ping Li et al.: 1 / sqrt(n_features).\n\n Use density = 1 / 3.0 if you want to reproduce the results from\n Achlioptas, 2001.\n\n random_state : int, RandomState instance or None, default=None\n Controls the pseudo random number generator used to generate the matrix\n at fit time.\n Pass an int for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n Returns\n -------\n components : {ndarray, sparse matrix} of shape (n_components, n_features)\n The generated Gaussian random matrix. Sparse matrix will be of CSR\n format.\n\n See Also\n --------\n SparseRandomProjection\n\n References\n ----------\n\n .. [1] Ping Li, T. Hastie and K. W. Church, 2006,\n \"Very Sparse Random Projections\".\n https://web.stanford.edu/~hastie/Papers/Ping/KDD06_rp.pdf\n\n .. [2] D. Achlioptas, 2001, \"Database-friendly random projections\",\n http://www.cs.ucsc.edu/~optas/papers/jl.pdf\n\n \"\"\"\n _check_input_size(n_components, n_features)\n density = _check_density(density, n_features)\n rng = check_random_state(random_state)\n if density == 1:\n components = rng.binomial(1, 0.5, (n_components, n_features)) * 2 - 1\n return 1 / np.sqrt(n_components) * components\n else:\n indices = []\n offset = 0\n indptr = [offset]\n for _ in range(n_components):\n n_nonzero_i = rng.binomial(n_features, density)\n indices_i = sample_without_replacement(n_features, n_nonzero_i, random_state=rng)\n indices.append(indices_i)\n offset += n_nonzero_i\n indptr.append(offset)\n indices = np.concatenate(indices)\n data = rng.binomial(1, 0.5, size=np.size(indices)) * 2 - 1\n components = sp.csr_matrix((data, indices, indptr), shape=(n_components, n_features))\n return np.sqrt(1 / density) / np.sqrt(n_components) * components" }, { @@ -156616,7 +168874,8 @@ "docstring": { "type": "int or array-like of int", "description": "Number of samples that should be a integer greater than 0. If an array\nis given, it will compute a safe number of components array-wise." - } + }, + "refined_type": {} }, { "name": "eps", @@ -156626,13 +168885,14 @@ "docstring": { "type": "float or ndarray of shape (n_components,), dtype=float, default=0.1", "description": "Maximum distortion rate in the range (0,1 ) as defined by the\nJohnson-Lindenstrauss lemma. If an array is given, it will compute a\nsafe number of components array-wise." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Find a 'safe' number of components to randomly project to.\n\nThe distortion introduced by a random projection `p` only changes the distance between two points by a factor (1 +- eps) in an euclidean space with good probability. The projection `p` is an eps-embedding as defined by: (1 - eps) ||u - v||^2 < ||p(u) - p(v)||^2 < (1 + eps) ||u - v||^2 Where u and v are any rows taken from a dataset of shape (n_samples, n_features), eps is in ]0, 1[ and p is a projection by a random Gaussian N(0, 1) matrix of shape (n_components, n_features) (or a sparse Achlioptas matrix). The minimum number of components to guarantee the eps-embedding is given by: n_components >= 4 log(n_samples) / (eps^2 / 2 - eps^3 / 3) Note that the number of dimensions is independent of the original number of features but instead depends on the size of the dataset: the larger the dataset, the higher is the minimal dimensionality of an eps-embedding. Read more in the :ref:`User Guide `.", - "docstring": "Find a 'safe' number of components to randomly project to.\n\nThe distortion introduced by a random projection `p` only changes the\ndistance between two points by a factor (1 +- eps) in an euclidean space\nwith good probability. The projection `p` is an eps-embedding as defined\nby:\n\n (1 - eps) ||u - v||^2 < ||p(u) - p(v)||^2 < (1 + eps) ||u - v||^2\n\nWhere u and v are any rows taken from a dataset of shape (n_samples,\nn_features), eps is in ]0, 1[ and p is a projection by a random Gaussian\nN(0, 1) matrix of shape (n_components, n_features) (or a sparse\nAchlioptas matrix).\n\nThe minimum number of components to guarantee the eps-embedding is\ngiven by:\n\n n_components >= 4 log(n_samples) / (eps^2 / 2 - eps^3 / 3)\n\nNote that the number of dimensions is independent of the original\nnumber of features but instead depends on the size of the dataset:\nthe larger the dataset, the higher is the minimal dimensionality of\nan eps-embedding.\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\nn_samples : int or array-like of int\n Number of samples that should be a integer greater than 0. If an array\n is given, it will compute a safe number of components array-wise.\n\neps : float or ndarray of shape (n_components,), dtype=float, default=0.1\n Maximum distortion rate in the range (0,1 ) as defined by the\n Johnson-Lindenstrauss lemma. If an array is given, it will compute a\n safe number of components array-wise.\n\nReturns\n-------\nn_components : int or ndarray of int\n The minimal number of components to guarantee with good probability\n an eps-embedding with n_samples.\n\nExamples\n--------\n>>> from sklearn.random_projection import johnson_lindenstrauss_min_dim\n>>> johnson_lindenstrauss_min_dim(1e6, eps=0.5)\n663\n\n>>> johnson_lindenstrauss_min_dim(1e6, eps=[0.5, 0.1, 0.01])\narray([ 663, 11841, 1112658])\n\n>>> johnson_lindenstrauss_min_dim([1e4, 1e5, 1e6], eps=0.1)\narray([ 7894, 9868, 11841])\n\nReferences\n----------\n\n.. [1] https://en.wikipedia.org/wiki/Johnson%E2%80%93Lindenstrauss_lemma\n\n.. [2] Sanjoy Dasgupta and Anupam Gupta, 1999,\n \"An elementary proof of the Johnson-Lindenstrauss Lemma.\"\n http://citeseer.ist.psu.edu/viewdoc/summary?doi=10.1.1.45.3654", + "description": "Find a 'safe' number of components to randomly project to.\n\nThe distortion introduced by a random projection `p` only changes the\ndistance between two points by a factor (1 +- eps) in an euclidean space\nwith good probability. 
The projection `p` is an eps-embedding as defined\nby:\n\n (1 - eps) ||u - v||^2 < ||p(u) - p(v)||^2 < (1 + eps) ||u - v||^2\n\nWhere u and v are any rows taken from a dataset of shape (n_samples,\nn_features), eps is in ]0, 1[ and p is a projection by a random Gaussian\nN(0, 1) matrix of shape (n_components, n_features) (or a sparse\nAchlioptas matrix).\n\nThe minimum number of components to guarantee the eps-embedding is\ngiven by:\n\n n_components >= 4 log(n_samples) / (eps^2 / 2 - eps^3 / 3)\n\nNote that the number of dimensions is independent of the original\nnumber of features but instead depends on the size of the dataset:\nthe larger the dataset, the higher is the minimal dimensionality of\nan eps-embedding.\n\nRead more in the :ref:`User Guide `.", + "docstring": "Find a 'safe' number of components to randomly project to.\n\n The distortion introduced by a random projection `p` only changes the\n distance between two points by a factor (1 +- eps) in an euclidean space\n with good probability. The projection `p` is an eps-embedding as defined\n by:\n\n (1 - eps) ||u - v||^2 < ||p(u) - p(v)||^2 < (1 + eps) ||u - v||^2\n\n Where u and v are any rows taken from a dataset of shape (n_samples,\n n_features), eps is in ]0, 1[ and p is a projection by a random Gaussian\n N(0, 1) matrix of shape (n_components, n_features) (or a sparse\n Achlioptas matrix).\n\n The minimum number of components to guarantee the eps-embedding is\n given by:\n\n n_components >= 4 log(n_samples) / (eps^2 / 2 - eps^3 / 3)\n\n Note that the number of dimensions is independent of the original\n number of features but instead depends on the size of the dataset:\n the larger the dataset, the higher is the minimal dimensionality of\n an eps-embedding.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n n_samples : int or array-like of int\n Number of samples that should be a integer greater than 0. If an array\n is given, it will compute a safe number of components array-wise.\n\n eps : float or ndarray of shape (n_components,), dtype=float, default=0.1\n Maximum distortion rate in the range (0,1 ) as defined by the\n Johnson-Lindenstrauss lemma. If an array is given, it will compute a\n safe number of components array-wise.\n\n Returns\n -------\n n_components : int or ndarray of int\n The minimal number of components to guarantee with good probability\n an eps-embedding with n_samples.\n\n Examples\n --------\n >>> from sklearn.random_projection import johnson_lindenstrauss_min_dim\n >>> johnson_lindenstrauss_min_dim(1e6, eps=0.5)\n 663\n\n >>> johnson_lindenstrauss_min_dim(1e6, eps=[0.5, 0.1, 0.01])\n array([ 663, 11841, 1112658])\n\n >>> johnson_lindenstrauss_min_dim([1e4, 1e5, 1e6], eps=0.1)\n array([ 7894, 9868, 11841])\n\n References\n ----------\n\n .. [1] https://en.wikipedia.org/wiki/Johnson%E2%80%93Lindenstrauss_lemma\n\n .. [2] Sanjoy Dasgupta and Anupam Gupta, 1999,\n \"An elementary proof of the Johnson-Lindenstrauss Lemma.\"\n http://citeseer.ist.psu.edu/viewdoc/summary?doi=10.1.1.45.3654\n\n ", "source_code": "\ndef johnson_lindenstrauss_min_dim(n_samples, *, eps=0.1):\n \"\"\"Find a 'safe' number of components to randomly project to.\n\n The distortion introduced by a random projection `p` only changes the\n distance between two points by a factor (1 +- eps) in an euclidean space\n with good probability. 
The projection `p` is an eps-embedding as defined\n by:\n\n (1 - eps) ||u - v||^2 < ||p(u) - p(v)||^2 < (1 + eps) ||u - v||^2\n\n Where u and v are any rows taken from a dataset of shape (n_samples,\n n_features), eps is in ]0, 1[ and p is a projection by a random Gaussian\n N(0, 1) matrix of shape (n_components, n_features) (or a sparse\n Achlioptas matrix).\n\n The minimum number of components to guarantee the eps-embedding is\n given by:\n\n n_components >= 4 log(n_samples) / (eps^2 / 2 - eps^3 / 3)\n\n Note that the number of dimensions is independent of the original\n number of features but instead depends on the size of the dataset:\n the larger the dataset, the higher is the minimal dimensionality of\n an eps-embedding.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n n_samples : int or array-like of int\n Number of samples that should be a integer greater than 0. If an array\n is given, it will compute a safe number of components array-wise.\n\n eps : float or ndarray of shape (n_components,), dtype=float, default=0.1\n Maximum distortion rate in the range (0,1 ) as defined by the\n Johnson-Lindenstrauss lemma. If an array is given, it will compute a\n safe number of components array-wise.\n\n Returns\n -------\n n_components : int or ndarray of int\n The minimal number of components to guarantee with good probability\n an eps-embedding with n_samples.\n\n Examples\n --------\n >>> from sklearn.random_projection import johnson_lindenstrauss_min_dim\n >>> johnson_lindenstrauss_min_dim(1e6, eps=0.5)\n 663\n\n >>> johnson_lindenstrauss_min_dim(1e6, eps=[0.5, 0.1, 0.01])\n array([ 663, 11841, 1112658])\n\n >>> johnson_lindenstrauss_min_dim([1e4, 1e5, 1e6], eps=0.1)\n array([ 7894, 9868, 11841])\n\n References\n ----------\n\n .. [1] https://en.wikipedia.org/wiki/Johnson%E2%80%93Lindenstrauss_lemma\n\n .. [2] Sanjoy Dasgupta and Anupam Gupta, 1999,\n \"An elementary proof of the Johnson-Lindenstrauss Lemma.\"\n http://citeseer.ist.psu.edu/viewdoc/summary?doi=10.1.1.45.3654\n\n \"\"\"\n eps = np.asarray(eps)\n n_samples = np.asarray(n_samples)\n if np.any(eps <= 0.0) or np.any(eps >= 1):\n raise ValueError('The JL bound is defined for eps in ]0, 1[, got %r' % eps)\n if np.any(n_samples) <= 0:\n raise ValueError('The JL bound is defined for n_samples greater than zero, got %r' % n_samples)\n denominator = eps**2 / 2 - eps**3 / 3\n return (4 * np.log(n_samples) / denominator).astype(np.int64)" }, { @@ -156650,7 +168910,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "kernel", @@ -156660,6 +168921,10 @@ "docstring": { "type": "{'knn', 'rbf'} or callable, default='rbf'", "description": " String identifier for kernel function to use or the kernel function\n itself. Only 'rbf' and 'knn' strings are valid inputs. The function\n passed should take two inputs, each of shape (n_samples, n_features),\n and return a (n_samples, n_samples) shaped weight matrix.\n\ngamma : float, default=20\n Parameter for rbf kernel.\n\nn_neighbors : int, default=7\n Parameter for knn kernel. Need to be strictly positive.\n\nalpha : float, default=1.0\n Clamping factor.\n\nmax_iter : int, default=30\n Change maximum number of iterations allowed.\n\ntol : float, default=1e-3\n Convergence tolerance: threshold to consider the system at steady\n state." 
+ }, + "refined_type": { + "kind": "EnumType", + "values": ["knn", "rbf"] } }, { @@ -156670,7 +168935,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_neighbors", @@ -156680,7 +168946,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "alpha", @@ -156690,7 +168957,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "max_iter", @@ -156700,7 +168968,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "tol", @@ -156710,7 +168979,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_jobs", @@ -156720,13 +168990,14 @@ "docstring": { "type": "int, default=None", "description": "The number of parallel jobs to run.\n``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n``-1`` means using all processors. See :term:`Glossary `\nfor more details." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, kernel='rbf', *, gamma=20, n_neighbors=7, alpha=1, max_iter=30, tol=0.001, n_jobs=None):\n self.max_iter = max_iter\n self.tol = tol\n self.kernel = kernel\n self.gamma = gamma\n self.n_neighbors = n_neighbors\n self.alpha = alpha\n self.n_jobs = n_jobs" }, { @@ -156744,13 +169015,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@abstractmethod\ndef _build_graph(self):\n raise NotImplementedError('Graph construction must be implemented to fit a label propagation model.')" }, { @@ -156768,7 +169040,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -156778,7 +169051,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -156788,13 +169062,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _get_kernel(self, X, y=None):\n if self.kernel == 'rbf':\n if y is None:\n return rbf_kernel(X, X, gamma=self.gamma)\n else:\n return rbf_kernel(X, y, gamma=self.gamma)\n elif self.kernel == 'knn':\n if self.nn_fit is None:\n self.nn_fit = NearestNeighbors(n_neighbors=self.n_neighbors, n_jobs=self.n_jobs).fit(X)\n if y is None:\n return self.nn_fit.kneighbors_graph(self.nn_fit._fit_X, self.n_neighbors, mode='connectivity')\n else:\n return self.nn_fit.kneighbors(y, return_distance=False)\n elif callable(self.kernel):\n if y is None:\n return self.kernel(X, X)\n else:\n return self.kernel(X, y)\n else:\n raise ValueError('%s is not a valid kernel. Only rbf and knn or an explicit function are supported at this time.' % self.kernel)" }, { @@ -156812,7 +169087,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -156822,7 +169098,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "Training data, where `n_samples` is the number of samples\nand `n_features` is the number of features." 
- } + }, + "refined_type": {} }, { "name": "y", @@ -156832,13 +169109,14 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "Target class values with unlabeled points marked as -1.\nAll unlabeled samples will be transductively assigned labels\ninternally." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Fit a semi-supervised label propagation model to X.\n\nThe input samples (labeled and unlabeled) are provided by matrix X, and target labels are provided by matrix y. We conventionally apply the label -1 to unlabeled samples in matrix y in a semi-supervised classification.", - "docstring": "Fit a semi-supervised label propagation model to X.\n\nThe input samples (labeled and unlabeled) are provided by matrix X,\nand target labels are provided by matrix y. We conventionally apply the\nlabel -1 to unlabeled samples in matrix y in a semi-supervised\nclassification.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n Training data, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\ny : array-like of shape (n_samples,)\n Target class values with unlabeled points marked as -1.\n All unlabeled samples will be transductively assigned labels\n internally.\n\nReturns\n-------\nself : object\n Returns the instance itself.", + "description": "Fit a semi-supervised label propagation model to X.\n\nThe input samples (labeled and unlabeled) are provided by matrix X,\nand target labels are provided by matrix y. We conventionally apply the\nlabel -1 to unlabeled samples in matrix y in a semi-supervised\nclassification.", + "docstring": "Fit a semi-supervised label propagation model to X.\n\n The input samples (labeled and unlabeled) are provided by matrix X,\n and target labels are provided by matrix y. We conventionally apply the\n label -1 to unlabeled samples in matrix y in a semi-supervised\n classification.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training data, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n y : array-like of shape (n_samples,)\n Target class values with unlabeled points marked as -1.\n All unlabeled samples will be transductively assigned labels\n internally.\n\n Returns\n -------\n self : object\n Returns the instance itself.\n ", "source_code": "\ndef fit(self, X, y):\n \"\"\"Fit a semi-supervised label propagation model to X.\n\n The input samples (labeled and unlabeled) are provided by matrix X,\n and target labels are provided by matrix y. 
We conventionally apply the\n label -1 to unlabeled samples in matrix y in a semi-supervised\n classification.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training data, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n y : array-like of shape (n_samples,)\n Target class values with unlabeled points marked as -1.\n All unlabeled samples will be transductively assigned labels\n internally.\n\n Returns\n -------\n self : object\n Returns the instance itself.\n \"\"\"\n (X, y) = self._validate_data(X, y)\n self.X_ = X\n check_classification_targets(y)\n graph_matrix = self._build_graph()\n classes = np.unique(y)\n classes = classes[classes != -1]\n self.classes_ = classes\n (n_samples, n_classes) = (len(y), len(classes))\n alpha = self.alpha\n if self._variant == 'spreading' and (alpha is None or alpha <= 0.0 or alpha >= 1.0):\n raise ValueError('alpha=%s is invalid: it must be inside the open interval (0, 1)' % alpha)\n y = np.asarray(y)\n unlabeled = y == -1\n self.label_distributions_ = np.zeros((n_samples, n_classes))\n for label in classes:\n self.label_distributions_[y == label, classes == label] = 1\n y_static = np.copy(self.label_distributions_)\n if self._variant == 'propagation':\n y_static[unlabeled] = 0\n else:\n y_static *= 1 - alpha\n l_previous = np.zeros((self.X_.shape[0], n_classes))\n unlabeled = unlabeled[:, np.newaxis]\n if sparse.isspmatrix(graph_matrix):\n graph_matrix = graph_matrix.tocsr()\n for self.n_iter_ in range(self.max_iter):\n if np.abs(self.label_distributions_ - l_previous).sum() < self.tol:\n break\n l_previous = self.label_distributions_\n self.label_distributions_ = safe_sparse_dot(graph_matrix, self.label_distributions_)\n if self._variant == 'propagation':\n normalizer = np.sum(self.label_distributions_, axis=1)[:, np.newaxis]\n normalizer[normalizer == 0] = 1\n self.label_distributions_ /= normalizer\n self.label_distributions_ = np.where(unlabeled, self.label_distributions_, y_static)\n else:\n self.label_distributions_ = np.multiply(alpha, self.label_distributions_) + y_static\n else:\n warnings.warn('max_iter=%d was reached without convergence.' % self.max_iter, category=ConvergenceWarning)\n self.n_iter_ += 1\n normalizer = np.sum(self.label_distributions_, axis=1)[:, np.newaxis]\n normalizer[normalizer == 0] = 1\n self.label_distributions_ /= normalizer\n transduction = self.classes_[np.argmax(self.label_distributions_, axis=1)]\n self.transduction_ = transduction.ravel()\n return self" }, { @@ -156856,7 +169134,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -156866,13 +169145,14 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "The data matrix." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Perform inductive inference across the model.", - "docstring": "Perform inductive inference across the model.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n The data matrix.\n\nReturns\n-------\ny : ndarray of shape (n_samples,)\n Predictions for input data.", + "docstring": "Perform inductive inference across the model.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The data matrix.\n\n Returns\n -------\n y : ndarray of shape (n_samples,)\n Predictions for input data.\n ", "source_code": "\ndef predict(self, X):\n \"\"\"Perform inductive inference across the model.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The data matrix.\n\n Returns\n -------\n y : ndarray of shape (n_samples,)\n Predictions for input data.\n \"\"\"\n probas = self.predict_proba(X)\n return self.classes_[np.argmax(probas, axis=1)].ravel()" }, { @@ -156890,7 +169170,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -156900,13 +169181,14 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "The data matrix." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Predict probability for each possible outcome.\n\nCompute the probability estimates for each single sample in X and each possible outcome seen during training (categorical distribution).", - "docstring": "Predict probability for each possible outcome.\n\nCompute the probability estimates for each single sample in X\nand each possible outcome seen during training (categorical\ndistribution).\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n The data matrix.\n\nReturns\n-------\nprobabilities : ndarray of shape (n_samples, n_classes)\n Normalized probability distributions across\n class labels.", + "description": "Predict probability for each possible outcome.\n\nCompute the probability estimates for each single sample in X\nand each possible outcome seen during training (categorical\ndistribution).", + "docstring": "Predict probability for each possible outcome.\n\n Compute the probability estimates for each single sample in X\n and each possible outcome seen during training (categorical\n distribution).\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The data matrix.\n\n Returns\n -------\n probabilities : ndarray of shape (n_samples, n_classes)\n Normalized probability distributions across\n class labels.\n ", "source_code": "\ndef predict_proba(self, X):\n \"\"\"Predict probability for each possible outcome.\n\n Compute the probability estimates for each single sample in X\n and each possible outcome seen during training (categorical\n distribution).\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The data matrix.\n\n Returns\n -------\n probabilities : ndarray of shape (n_samples, n_classes)\n Normalized probability distributions across\n class labels.\n \"\"\"\n check_is_fitted(self)\n X_2d = self._validate_data(X, accept_sparse=['csc', 'csr', 'coo', 'dok', 'bsr', 'lil', 'dia'], reset=False)\n weight_matrices = self._get_kernel(self.X_, X_2d)\n if self.kernel == 'knn':\n probabilities = np.array([np.sum(self.label_distributions_[weight_matrix], axis=0) for weight_matrix in weight_matrices])\n else:\n weight_matrices = weight_matrices.T\n probabilities = 
safe_sparse_dot(weight_matrices, self.label_distributions_)\n normalizer = np.atleast_2d(np.sum(probabilities, axis=1)).T\n probabilities /= normalizer\n return probabilities" }, { @@ -156924,7 +169206,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "kernel", @@ -156934,6 +169217,10 @@ "docstring": { "type": "{'knn', 'rbf'} or callable, default='rbf'", "description": "String identifier for kernel function to use or the kernel function\nitself. Only 'rbf' and 'knn' strings are valid inputs. The function\npassed should take two inputs, each of shape (n_samples, n_features),\nand return a (n_samples, n_samples) shaped weight matrix." + }, + "refined_type": { + "kind": "EnumType", + "values": ["knn", "rbf"] } }, { @@ -156944,7 +169231,8 @@ "docstring": { "type": "float, default=20", "description": "Parameter for rbf kernel." - } + }, + "refined_type": {} }, { "name": "n_neighbors", @@ -156954,7 +169242,8 @@ "docstring": { "type": "int, default=7", "description": "Parameter for knn kernel which need to be strictly positive." - } + }, + "refined_type": {} }, { "name": "max_iter", @@ -156964,7 +169253,8 @@ "docstring": { "type": "int, default=1000", "description": "Change maximum number of iterations allowed." - } + }, + "refined_type": {} }, { "name": "tol", @@ -156974,7 +169264,8 @@ "docstring": { "type": "float, 1e-3", "description": "Convergence tolerance: threshold to consider the system at steady\nstate." - } + }, + "refined_type": {} }, { "name": "n_jobs", @@ -156984,13 +169275,14 @@ "docstring": { "type": "int, default=None", "description": "The number of parallel jobs to run.\n``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n``-1`` means using all processors. See :term:`Glossary `\nfor more details." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, kernel='rbf', *, gamma=20, n_neighbors=7, max_iter=1000, tol=0.001, n_jobs=None):\n super().__init__(kernel=kernel, gamma=gamma, n_neighbors=n_neighbors, max_iter=max_iter, tol=tol, n_jobs=n_jobs, alpha=None)" }, { @@ -157008,13 +169300,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Matrix representing a fully connected graph between each sample\n\nThis basic implementation creates a non-stochastic affinity matrix, so class distributions will exceed 1 (normalization may be desired).", - "docstring": "Matrix representing a fully connected graph between each sample\n\nThis basic implementation creates a non-stochastic affinity matrix, so\nclass distributions will exceed 1 (normalization may be desired).", + "description": "Matrix representing a fully connected graph between each sample\n\nThis basic implementation creates a non-stochastic affinity matrix, so\nclass distributions will exceed 1 (normalization may be desired).", + "docstring": "Matrix representing a fully connected graph between each sample\n\n This basic implementation creates a non-stochastic affinity matrix, so\n class distributions will exceed 1 (normalization may be desired).\n ", "source_code": "\ndef _build_graph(self):\n \"\"\"Matrix representing a fully connected graph between each sample\n\n This basic implementation creates a non-stochastic affinity matrix, so\n class distributions will exceed 1 (normalization may be desired).\n \"\"\"\n if self.kernel == 'knn':\n self.nn_fit = None\n affinity_matrix = self._get_kernel(self.X_)\n normalizer = affinity_matrix.sum(axis=0)\n if sparse.isspmatrix(affinity_matrix):\n affinity_matrix.data /= np.diag(np.array(normalizer))\n else:\n affinity_matrix /= normalizer[:, np.newaxis]\n return affinity_matrix" }, { @@ -157032,7 +169325,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -157042,7 +169336,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "Training data, where `n_samples` is the number of samples\nand `n_features` is the number of features." - } + }, + "refined_type": {} }, { "name": "y", @@ -157052,13 +169347,14 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "Target class values with unlabeled points marked as -1.\nAll unlabeled samples will be transductively assigned labels\ninternally." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Fit a semi-supervised label propagation model to X.", - "docstring": "Fit a semi-supervised label propagation model to X.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n Training data, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\ny : array-like of shape (n_samples,)\n Target class values with unlabeled points marked as -1.\n All unlabeled samples will be transductively assigned labels\n internally.\n\nReturns\n-------\nself : object\n Returns the instance itself.", + "docstring": "Fit a semi-supervised label propagation model to X.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training data, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n y : array-like of shape (n_samples,)\n Target class values with unlabeled points marked as -1.\n All unlabeled samples will be transductively assigned labels\n internally.\n\n Returns\n -------\n self : object\n Returns the instance itself.\n ", "source_code": "\ndef fit(self, X, y):\n \"\"\"Fit a semi-supervised label propagation model to X.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training data, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n\n y : array-like of shape (n_samples,)\n Target class values with unlabeled points marked as -1.\n All unlabeled samples will be transductively assigned labels\n internally.\n\n Returns\n -------\n self : object\n Returns the instance itself.\n \"\"\"\n return super().fit(X, y)" }, { @@ -157076,7 +169372,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "kernel", @@ -157086,6 +169383,10 @@ "docstring": { "type": "{'knn', 'rbf'} or callable, default='rbf'", "description": "String identifier for kernel function to use or the kernel function\nitself. Only 'rbf' and 'knn' strings are valid inputs. The function\npassed should take two inputs, each of shape (n_samples, n_features),\nand return a (n_samples, n_samples) shaped weight matrix." + }, + "refined_type": { + "kind": "EnumType", + "values": ["knn", "rbf"] } }, { @@ -157096,7 +169397,8 @@ "docstring": { "type": "float, default=20", "description": "Parameter for rbf kernel." - } + }, + "refined_type": {} }, { "name": "n_neighbors", @@ -157106,7 +169408,8 @@ "docstring": { "type": "int, default=7", "description": "Parameter for knn kernel which is a strictly positive integer." - } + }, + "refined_type": {} }, { "name": "alpha", @@ -157116,7 +169419,8 @@ "docstring": { "type": "float, default=0.2", "description": "Clamping factor. A value in (0, 1) that specifies the relative amount\nthat an instance should adopt the information from its neighbors as\nopposed to its initial label.\nalpha=0 means keeping the initial label information; alpha=1 means\nreplacing all initial information." - } + }, + "refined_type": {} }, { "name": "max_iter", @@ -157126,7 +169430,8 @@ "docstring": { "type": "int, default=30", "description": "Maximum number of iterations allowed." - } + }, + "refined_type": {} }, { "name": "tol", @@ -157136,7 +169441,8 @@ "docstring": { "type": "float, default=1e-3", "description": "Convergence tolerance: threshold to consider the system at steady\nstate." 
- } + }, + "refined_type": {} }, { "name": "n_jobs", @@ -157146,13 +169452,14 @@ "docstring": { "type": "int, default=None", "description": "The number of parallel jobs to run.\n``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.\n``-1`` means using all processors. See :term:`Glossary `\nfor more details." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, kernel='rbf', *, gamma=20, n_neighbors=7, alpha=0.2, max_iter=30, tol=0.001, n_jobs=None):\n super().__init__(kernel=kernel, gamma=gamma, n_neighbors=n_neighbors, alpha=alpha, max_iter=max_iter, tol=tol, n_jobs=n_jobs)" }, { @@ -157170,7 +169477,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -157194,7 +169502,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "base_estimator", @@ -157204,7 +169513,8 @@ "docstring": { "type": "estimator object", "description": "An estimator object implementing `fit` and `predict_proba`.\nInvoking the `fit` method will fit a clone of the passed estimator,\nwhich will be stored in the `base_estimator_` attribute." - } + }, + "refined_type": {} }, { "name": "threshold", @@ -157214,7 +169524,8 @@ "docstring": { "type": "float, default=0.75", "description": "The decision threshold for use with `criterion='threshold'`.\nShould be in [0, 1). When using the `'threshold'` criterion, a\n:ref:`well calibrated classifier ` should be used." - } + }, + "refined_type": {} }, { "name": "criterion", @@ -157224,6 +169535,10 @@ "docstring": { "type": "{'threshold', 'k_best'}, default='threshold'", "description": "The selection criterion used to select which labels to add to the\ntraining set. If `'threshold'`, pseudo-labels with prediction\nprobabilities above `threshold` are added to the dataset. If `'k_best'`,\nthe `k_best` pseudo-labels with highest prediction probabilities are\nadded to the dataset. When using the 'threshold' criterion, a\n:ref:`well calibrated classifier ` should be used." + }, + "refined_type": { + "kind": "EnumType", + "values": ["threshold", "k_best"] } }, { @@ -157234,7 +169549,8 @@ "docstring": { "type": "int, default=10", "description": "The amount of samples to add in each iteration. Only used when\n`criterion='k_best'`." - } + }, + "refined_type": {} }, { "name": "max_iter", @@ -157244,7 +169560,8 @@ "docstring": { "type": "int or None, default=10", "description": "Maximum number of iterations allowed. Should be greater than or equal\nto 0. If it is `None`, the classifier will continue to predict labels\nuntil no new pseudo-labels are added, or all unlabeled samples have\nbeen labeled." - } + }, + "refined_type": {} }, { "name": "verbose", @@ -157254,13 +169571,14 @@ "docstring": { "type": "bool, default=False", "description": "Enable verbose output." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, base_estimator, threshold=0.75, criterion='threshold', k_best=10, max_iter=10, verbose=False):\n self.base_estimator = base_estimator\n self.threshold = threshold\n self.criterion = criterion\n self.k_best = k_best\n self.max_iter = max_iter\n self.verbose = verbose" }, { @@ -157278,7 +169596,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -157288,13 +169607,17 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "Array representing the data." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } } ], "results": [], "is_public": true, "description": "Call decision function of the `base_estimator`.", - "docstring": "Call decision function of the `base_estimator`.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n Array representing the data.\n\nReturns\n-------\ny : ndarray of shape (n_samples, n_features)\n Result of the decision function of the `base_estimator`.", + "docstring": "Call decision function of the `base_estimator`.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Array representing the data.\n\n Returns\n -------\n y : ndarray of shape (n_samples, n_features)\n Result of the decision function of the `base_estimator`.\n ", "source_code": "\n@if_delegate_has_method(delegate='base_estimator')\ndef decision_function(self, X):\n \"\"\"Call decision function of the `base_estimator`.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Array representing the data.\n\n Returns\n -------\n y : ndarray of shape (n_samples, n_features)\n Result of the decision function of the `base_estimator`.\n \"\"\"\n check_is_fitted(self)\n X = self._validate_data(X, accept_sparse=True, force_all_finite=False, reset=False)\n return self.base_estimator_.decision_function(X)" }, { @@ -157312,7 +169635,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -157322,6 +169646,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "Array representing the data." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -157332,13 +169660,17 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples,)", "description": "Array representing the labels. Unlabeled samples should have the\nlabel -1." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } } ], "results": [], "is_public": true, "description": "Fit self-training classifier using `X`, `y` as training data.", - "docstring": "Fit self-training classifier using `X`, `y` as training data.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n Array representing the data.\n\ny : {array-like, sparse matrix} of shape (n_samples,)\n Array representing the labels. Unlabeled samples should have the\n label -1.\n\nReturns\n-------\nself : object\n Fitted estimator.", + "docstring": "\n Fit self-training classifier using `X`, `y` as training data.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Array representing the data.\n\n y : {array-like, sparse matrix} of shape (n_samples,)\n Array representing the labels. 
Unlabeled samples should have the\n label -1.\n\n Returns\n -------\n self : object\n Fitted estimator.\n ", "source_code": "\ndef fit(self, X, y):\n \"\"\"\n Fit self-training classifier using `X`, `y` as training data.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Array representing the data.\n\n y : {array-like, sparse matrix} of shape (n_samples,)\n Array representing the labels. Unlabeled samples should have the\n label -1.\n\n Returns\n -------\n self : object\n Fitted estimator.\n \"\"\"\n (X, y) = self._validate_data(X, y, accept_sparse=['csr', 'csc', 'lil', 'dok'], force_all_finite=False)\n if self.base_estimator is None:\n raise ValueError('base_estimator cannot be None!')\n self.base_estimator_ = clone(self.base_estimator)\n if self.max_iter is not None and self.max_iter < 0:\n raise ValueError(f'max_iter must be >= 0 or None, got {self.max_iter}')\n if not 0 <= self.threshold < 1:\n raise ValueError(f'threshold must be in [0,1), got {self.threshold}')\n if self.criterion not in ['threshold', 'k_best']:\n raise ValueError(f\"criterion must be either 'threshold' or 'k_best', got {self.criterion}.\")\n if y.dtype.kind in ['U', 'S']:\n raise ValueError('y has dtype string. If you wish to predict on string targets, use dtype object, and use -1 as the label for unlabeled samples.')\n has_label = y != -1\n if np.all(has_label):\n warnings.warn('y contains no unlabeled samples', UserWarning)\n if self.criterion == 'k_best' and self.k_best > X.shape[0] - np.sum(has_label):\n warnings.warn('k_best is larger than the amount of unlabeled samples. All unlabeled samples will be labeled in the first iteration', UserWarning)\n self.transduction_ = np.copy(y)\n self.labeled_iter_ = np.full_like(y, -1)\n self.labeled_iter_[has_label] = 0\n self.n_iter_ = 0\n while not np.all(has_label) and (self.max_iter is None or self.n_iter_ < self.max_iter):\n self.n_iter_ += 1\n self.base_estimator_.fit(X[safe_mask(X, has_label)], self.transduction_[has_label])\n _validate_estimator(self.base_estimator_)\n prob = self.base_estimator_.predict_proba(X[safe_mask(X, ~has_label)])\n pred = self.base_estimator_.classes_[np.argmax(prob, axis=1)]\n max_proba = np.max(prob, axis=1)\n if self.criterion == 'threshold':\n selected = max_proba > self.threshold\n else:\n n_to_select = min(self.k_best, max_proba.shape[0])\n if n_to_select == max_proba.shape[0]:\n selected = np.ones_like(max_proba, dtype=bool)\n else:\n selected = np.argpartition(-max_proba, n_to_select)[:n_to_select]\n selected_full = np.nonzero(~has_label)[0][selected]\n self.transduction_[selected_full] = pred[selected]\n has_label[selected_full] = True\n self.labeled_iter_[selected_full] = self.n_iter_\n if selected_full.shape[0] == 0:\n self.termination_condition_ = 'no_change'\n break\n if self.verbose:\n print(f'End of iteration {self.n_iter_}, added {selected_full.shape[0]} new labels.')\n if self.n_iter_ == self.max_iter:\n self.termination_condition_ = 'max_iter'\n if np.all(has_label):\n self.termination_condition_ = 'all_labeled'\n self.base_estimator_.fit(X[safe_mask(X, has_label)], self.transduction_[has_label])\n self.classes_ = self.base_estimator_.classes_\n return self" }, { @@ -157356,7 +169688,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -157366,13 +169699,17 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "Array representing the data." 
+ }, + "refined_type": { + "kind": "EnumType", + "values": [] } } ], "results": [], "is_public": true, "description": "Predict the classes of `X`.", - "docstring": "Predict the classes of `X`.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n Array representing the data.\n\nReturns\n-------\ny : ndarray of shape (n_samples,)\n Array with predicted labels.", + "docstring": "Predict the classes of `X`.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Array representing the data.\n\n Returns\n -------\n y : ndarray of shape (n_samples,)\n Array with predicted labels.\n ", "source_code": "\n@if_delegate_has_method(delegate='base_estimator')\ndef predict(self, X):\n \"\"\"Predict the classes of `X`.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Array representing the data.\n\n Returns\n -------\n y : ndarray of shape (n_samples,)\n Array with predicted labels.\n \"\"\"\n check_is_fitted(self)\n X = self._validate_data(X, accept_sparse=True, force_all_finite=False, reset=False)\n return self.base_estimator_.predict(X)" }, { @@ -157390,7 +169727,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -157400,13 +169738,17 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "Array representing the data." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } } ], "results": [], "is_public": true, "description": "Predict log probability for each possible outcome.", - "docstring": "Predict log probability for each possible outcome.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n Array representing the data.\n\nReturns\n-------\ny : ndarray of shape (n_samples, n_features)\n Array with log prediction probabilities.", + "docstring": "Predict log probability for each possible outcome.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Array representing the data.\n\n Returns\n -------\n y : ndarray of shape (n_samples, n_features)\n Array with log prediction probabilities.\n ", "source_code": "\n@if_delegate_has_method(delegate='base_estimator')\ndef predict_log_proba(self, X):\n \"\"\"Predict log probability for each possible outcome.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Array representing the data.\n\n Returns\n -------\n y : ndarray of shape (n_samples, n_features)\n Array with log prediction probabilities.\n \"\"\"\n check_is_fitted(self)\n X = self._validate_data(X, accept_sparse=True, force_all_finite=False, reset=False)\n return self.base_estimator_.predict_log_proba(X)" }, { @@ -157424,7 +169766,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -157434,13 +169777,17 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "Array representing the data." 
+ }, + "refined_type": { + "kind": "EnumType", + "values": [] } } ], "results": [], "is_public": true, "description": "Predict probability for each possible outcome.", - "docstring": "Predict probability for each possible outcome.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n Array representing the data.\n\nReturns\n-------\ny : ndarray of shape (n_samples, n_features)\n Array with prediction probabilities.", + "docstring": "Predict probability for each possible outcome.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Array representing the data.\n\n Returns\n -------\n y : ndarray of shape (n_samples, n_features)\n Array with prediction probabilities.\n ", "source_code": "\ndef predict_proba(self, X):\n \"\"\"Predict probability for each possible outcome.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Array representing the data.\n\n Returns\n -------\n y : ndarray of shape (n_samples, n_features)\n Array with prediction probabilities.\n \"\"\"\n check_is_fitted(self)\n X = self._validate_data(X, accept_sparse=True, force_all_finite=False, reset=False)\n return self.base_estimator_.predict_proba(X)" }, { @@ -157458,7 +169805,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -157468,6 +169816,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "Array representing the data." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -157478,13 +169830,14 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "Array representing the labels." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Call score on the `base_estimator`.", - "docstring": "Call score on the `base_estimator`.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n Array representing the data.\n\ny : array-like of shape (n_samples,)\n Array representing the labels.\n\nReturns\n-------\nscore : float\n Result of calling score on the `base_estimator`.", + "docstring": "Call score on the `base_estimator`.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Array representing the data.\n\n y : array-like of shape (n_samples,)\n Array representing the labels.\n\n Returns\n -------\n score : float\n Result of calling score on the `base_estimator`.\n ", "source_code": "\n@if_delegate_has_method(delegate='base_estimator')\ndef score(self, X, y):\n \"\"\"Call score on the `base_estimator`.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Array representing the data.\n\n y : array-like of shape (n_samples,)\n Array representing the labels.\n\n Returns\n -------\n score : float\n Result of calling score on the `base_estimator`.\n \"\"\"\n check_is_fitted(self)\n X = self._validate_data(X, accept_sparse=True, force_all_finite=False, reset=False)\n return self.base_estimator_.score(X, y)" }, { @@ -157502,7 +169855,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -157526,7 +169880,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "top_path", @@ -157536,13 +169891,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": 
"", - "docstring": "", + "docstring": null, "source_code": "\ndef configuration(parent_package='', top_path=None):\n from numpy.distutils.misc_util import Configuration\n import numpy\n libraries = []\n if os.name == 'posix':\n libraries.append('m')\n config = Configuration('sklearn', parent_package, top_path)\n config.add_subpackage('__check_build')\n config.add_subpackage('_build_utils')\n config.add_subpackage('compose')\n config.add_subpackage('compose/tests')\n config.add_subpackage('covariance')\n config.add_subpackage('covariance/tests')\n config.add_subpackage('cross_decomposition')\n config.add_subpackage('cross_decomposition/tests')\n config.add_subpackage('feature_selection')\n config.add_subpackage('feature_selection/tests')\n config.add_subpackage('gaussian_process')\n config.add_subpackage('gaussian_process/tests')\n config.add_subpackage('impute')\n config.add_subpackage('impute/tests')\n config.add_subpackage('inspection')\n config.add_subpackage('inspection/tests')\n config.add_subpackage('mixture')\n config.add_subpackage('mixture/tests')\n config.add_subpackage('model_selection')\n config.add_subpackage('model_selection/tests')\n config.add_subpackage('neural_network')\n config.add_subpackage('neural_network/tests')\n config.add_subpackage('preprocessing')\n config.add_subpackage('preprocessing/tests')\n config.add_subpackage('semi_supervised')\n config.add_subpackage('semi_supervised/tests')\n config.add_subpackage('experimental')\n config.add_subpackage('experimental/tests')\n config.add_subpackage('ensemble/_hist_gradient_boosting')\n config.add_subpackage('ensemble/_hist_gradient_boosting/tests')\n config.add_subpackage('_loss/')\n config.add_subpackage('_loss/tests')\n config.add_subpackage('externals')\n config.add_subpackage('externals/_packaging')\n config.add_subpackage('cluster')\n config.add_subpackage('datasets')\n config.add_subpackage('decomposition')\n config.add_subpackage('ensemble')\n config.add_subpackage('feature_extraction')\n config.add_subpackage('manifold')\n config.add_subpackage('metrics')\n config.add_subpackage('neighbors')\n config.add_subpackage('tree')\n config.add_subpackage('utils')\n config.add_subpackage('svm')\n config.add_subpackage('linear_model')\n config.add_extension('_isotonic', sources=['_isotonic.pyx'], include_dirs=[numpy.get_include()], libraries=libraries)\n config.add_subpackage('tests')\n if 'sdist' not in sys.argv:\n cythonize_extensions(top_path, config)\n return config" }, { @@ -157560,7 +169916,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -157584,7 +169941,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "kernel", @@ -157594,7 +169952,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "degree", @@ -157604,7 +169963,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "gamma", @@ -157614,7 +169974,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "coef0", @@ -157624,7 +169985,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "tol", @@ -157634,7 +169996,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "C", @@ -157644,7 +170007,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "nu", @@ -157654,7 +170018,8 @@ "docstring": { "type": "", "description": "" - } + }, + 
"refined_type": {} }, { "name": "epsilon", @@ -157664,7 +170029,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "shrinking", @@ -157674,7 +170040,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "probability", @@ -157684,7 +170051,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "cache_size", @@ -157694,7 +170062,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "class_weight", @@ -157704,7 +170073,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "verbose", @@ -157714,7 +170084,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "max_iter", @@ -157724,7 +170095,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "random_state", @@ -157734,13 +170106,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@abstractmethod\ndef __init__(self, kernel, degree, gamma, coef0, tol, C, nu, epsilon, shrinking, probability, cache_size, class_weight, verbose, max_iter, random_state):\n if self._impl not in LIBSVM_IMPL:\n raise ValueError('impl should be one of %s, %s was given' % (LIBSVM_IMPL, self._impl))\n if gamma == 0:\n msg = \"The gamma value of 0.0 is invalid. Use 'auto' to set gamma to a value of 1 / n_features.\"\n raise ValueError(msg)\n self.kernel = kernel\n self.degree = degree\n self.gamma = gamma\n self.coef0 = coef0\n self.tol = tol\n self.C = C\n self.nu = nu\n self.epsilon = epsilon\n self.shrinking = shrinking\n self.probability = probability\n self.cache_size = cache_size\n self.class_weight = class_weight\n self.verbose = verbose\n self.max_iter = max_iter\n self.random_state = random_state" }, { @@ -157758,7 +170131,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -157768,7 +170142,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -157792,7 +170167,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -157802,13 +170178,14 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Evaluates the decision function for the samples in X.", - "docstring": "Evaluates the decision function for the samples in X.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n\nReturns\n-------\nX : array-like of shape (n_samples, n_class * (n_class-1) / 2)\n Returns the decision function of the sample for each class\n in the model.", + "docstring": "Evaluates the decision function for the samples in X.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n\n Returns\n -------\n X : array-like of shape (n_samples, n_class * (n_class-1) / 2)\n Returns the decision function of the sample for each class\n in the model.\n ", "source_code": "\ndef _decision_function(self, X):\n \"\"\"Evaluates the decision function for the samples in X.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n\n Returns\n -------\n X : array-like of shape (n_samples, n_class * (n_class-1) / 2)\n Returns the decision function of the sample for each 
class\n in the model.\n \"\"\"\n X = self._validate_for_predict(X)\n X = self._compute_kernel(X)\n if self._sparse:\n dec_func = self._sparse_decision_function(X)\n else:\n dec_func = self._dense_decision_function(X)\n if self._impl in ['c_svc', 'nu_svc'] and len(self.classes_) == 2:\n return -dec_func.ravel()\n return dec_func" }, { @@ -157826,7 +170203,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -157836,13 +170214,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _dense_decision_function(self, X):\n X = check_array(X, dtype=np.float64, order='C', accept_large_sparse=False)\n kernel = self.kernel\n if callable(kernel):\n kernel = 'precomputed'\n return libsvm.decision_function(X, self.support_, self.support_vectors_, self._n_support, self._dual_coef_, self._intercept_, self._probA, self._probB, svm_type=LIBSVM_IMPL.index(self._impl), kernel=kernel, degree=self.degree, cache_size=self.cache_size, coef0=self.coef0, gamma=self._gamma)" }, { @@ -157860,7 +170239,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -157870,7 +170250,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -157880,7 +170261,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -157890,7 +170272,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "solver_type", @@ -157900,7 +170283,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "kernel", @@ -157910,7 +170294,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "random_seed", @@ -157920,13 +170305,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _dense_fit(self, X, y, sample_weight, solver_type, kernel, random_seed):\n if callable(self.kernel):\n self.__Xfit = X\n X = self._compute_kernel(X)\n if X.shape[0] != X.shape[1]:\n raise ValueError('X.shape[0] should be equal to X.shape[1]')\n libsvm.set_verbosity_wrap(self.verbose)\n (self.support_, self.support_vectors_, self._n_support, self.dual_coef_, self.intercept_, self._probA, self._probB, self.fit_status_) = libsvm.fit(X, y, svm_type=solver_type, sample_weight=sample_weight, class_weight=self.class_weight_, kernel=kernel, C=self.C, nu=self.nu, probability=self.probability, degree=self.degree, shrinking=self.shrinking, tol=self.tol, cache_size=self.cache_size, coef0=self.coef0, gamma=self._gamma, epsilon=self.epsilon, max_iter=self.max_iter, random_seed=random_seed)\n self._warn_from_fit_status()" }, { @@ -157944,7 +170330,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -157954,13 +170341,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _dense_predict(self, X):\n X = self._compute_kernel(X)\n if X.ndim == 1:\n X = check_array(X, order='C', accept_large_sparse=False)\n kernel = self.kernel\n if callable(self.kernel):\n kernel = 'precomputed'\n if X.shape[1] != self.shape_fit_[0]:\n 
raise ValueError('X.shape[1] = %d should be equal to %d, the number of samples at training time' % (X.shape[1], self.shape_fit_[0]))\n svm_type = LIBSVM_IMPL.index(self._impl)\n return libsvm.predict(X, self.support_, self.support_vectors_, self._n_support, self._dual_coef_, self._intercept_, self._probA, self._probB, svm_type=svm_type, kernel=kernel, degree=self.degree, coef0=self.coef0, gamma=self._gamma, cache_size=self.cache_size)" }, { @@ -157978,13 +170366,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _get_coef(self):\n return safe_sparse_dot(self._dual_coef_, self.support_vectors_)" }, { @@ -158002,13 +170391,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _more_tags(self):\n return {'pairwise': self.kernel == 'precomputed'}" }, { @@ -158029,13 +170419,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@deprecated('Attribute `_pairwise` was deprecated in version 0.24 and will be removed in 1.1 (renaming of 0.26).')\n@property\ndef _pairwise(self):\n return self.kernel == 'precomputed'" }, { @@ -158053,7 +170444,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -158063,13 +170455,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _sparse_decision_function(self, X):\n X.data = np.asarray(X.data, dtype=np.float64, order='C')\n kernel = self.kernel\n if hasattr(kernel, '__call__'):\n kernel = 'precomputed'\n kernel_type = self._sparse_kernels.index(kernel)\n return libsvm_sparse.libsvm_sparse_decision_function(X.data, X.indices, X.indptr, self.support_vectors_.data, self.support_vectors_.indices, self.support_vectors_.indptr, self._dual_coef_.data, self._intercept_, LIBSVM_IMPL.index(self._impl), kernel_type, self.degree, self._gamma, self.coef0, self.tol, self.C, self.class_weight_, self.nu, self.epsilon, self.shrinking, self.probability, self._n_support, self._probA, self._probB)" }, { @@ -158087,7 +170480,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -158097,7 +170491,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -158107,7 +170502,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -158117,7 +170513,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "solver_type", @@ -158127,7 +170524,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "kernel", @@ -158137,7 +170535,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "random_seed", @@ -158147,13 +170546,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _sparse_fit(self, X, y, sample_weight, solver_type, kernel, random_seed):\n X.data = 
np.asarray(X.data, dtype=np.float64, order='C')\n X.sort_indices()\n kernel_type = self._sparse_kernels.index(kernel)\n libsvm_sparse.set_verbosity_wrap(self.verbose)\n (self.support_, self.support_vectors_, dual_coef_data, self.intercept_, self._n_support, self._probA, self._probB, self.fit_status_) = libsvm_sparse.libsvm_sparse_train(X.shape[1], X.data, X.indices, X.indptr, y, solver_type, kernel_type, self.degree, self._gamma, self.coef0, self.tol, self.C, self.class_weight_, sample_weight, self.nu, self.cache_size, self.epsilon, int(self.shrinking), int(self.probability), self.max_iter, random_seed)\n self._warn_from_fit_status()\n if hasattr(self, 'classes_'):\n n_class = len(self.classes_) - 1\n else:\n n_class = 1\n n_SV = self.support_vectors_.shape[0]\n dual_coef_indices = np.tile(np.arange(n_SV), n_class)\n if not n_SV:\n self.dual_coef_ = sp.csr_matrix([])\n else:\n dual_coef_indptr = np.arange(0, dual_coef_indices.size + 1, dual_coef_indices.size / n_class)\n self.dual_coef_ = sp.csr_matrix((dual_coef_data, dual_coef_indices, dual_coef_indptr), (n_class, n_SV))" }, { @@ -158171,7 +170571,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -158181,13 +170582,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _sparse_predict(self, X):\n kernel = self.kernel\n if callable(kernel):\n kernel = 'precomputed'\n kernel_type = self._sparse_kernels.index(kernel)\n C = 0.0\n return libsvm_sparse.libsvm_sparse_predict(X.data, X.indices, X.indptr, self.support_vectors_.data, self.support_vectors_.indices, self.support_vectors_.indptr, self._dual_coef_.data, self._intercept_, LIBSVM_IMPL.index(self._impl), kernel_type, self.degree, self._gamma, self.coef0, self.tol, C, self.class_weight_, self.nu, self.epsilon, self.shrinking, self.probability, self._n_support, self._probA, self._probB)" }, { @@ -158205,7 +170607,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -158215,13 +170618,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _validate_for_predict(self, X):\n check_is_fitted(self)\n if not callable(self.kernel):\n X = self._validate_data(X, accept_sparse='csr', dtype=np.float64, order='C', accept_large_sparse=False, reset=False)\n if self._sparse and not sp.isspmatrix(X):\n X = sp.csr_matrix(X)\n if self._sparse:\n X.sort_indices()\n if sp.issparse(X) and not self._sparse and not callable(self.kernel):\n raise ValueError('cannot use sparse input in %r trained on dense data' % type(self).__name__)\n if self.kernel == 'precomputed':\n if X.shape[1] != self.shape_fit_[0]:\n raise ValueError('X.shape[1] = %d should be equal to %d, the number of samples at training time' % (X.shape[1], self.shape_fit_[0]))\n sv = self.support_vectors_\n if not self._sparse and sv.size > 0 and self.n_support_.sum() != sv.shape[0]:\n raise ValueError(f'The internal representation of {self.__class__.__name__} was altered')\n return X" }, { @@ -158239,7 +170643,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -158249,13 +170654,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": 
"Validation of y and class_weight.\n\nDefault implementation for SVR and one-class; overridden in BaseSVC.", - "docstring": "Validation of y and class_weight.\n\nDefault implementation for SVR and one-class; overridden in BaseSVC.", + "docstring": "Validation of y and class_weight.\n\n Default implementation for SVR and one-class; overridden in BaseSVC.\n ", "source_code": "\ndef _validate_targets(self, y):\n \"\"\"Validation of y and class_weight.\n\n Default implementation for SVR and one-class; overridden in BaseSVC.\n \"\"\"\n self.class_weight_ = np.empty(0)\n return column_or_1d(y, warn=True).astype(np.float64, copy=False)" }, { @@ -158273,13 +170679,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _warn_from_fit_status(self):\n assert self.fit_status_ in (0, 1)\n if self.fit_status_ == 1:\n warnings.warn('Solver terminated early (max_iter=%i). Consider pre-processing your data with StandardScaler or MinMaxScaler.' % self.max_iter, ConvergenceWarning)" }, { @@ -158297,13 +170704,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Weights assigned to the features when `kernel=\"linear\"`.", - "docstring": "Weights assigned to the features when `kernel=\"linear\"`.\n\nReturns\n-------\nndarray of shape (n_features, n_classes)", + "docstring": "Weights assigned to the features when `kernel=\"linear\"`.\n\n Returns\n -------\n ndarray of shape (n_features, n_classes)\n ", "source_code": "\n@property\ndef coef_(self):\n \"\"\"Weights assigned to the features when `kernel=\"linear\"`.\n\n Returns\n -------\n ndarray of shape (n_features, n_classes)\n \"\"\"\n if self.kernel != 'linear':\n raise AttributeError('coef_ is only available when using a linear kernel')\n coef = self._get_coef()\n if sp.issparse(coef):\n coef.data.flags.writeable = False\n else:\n coef.flags.writeable = False\n return coef" }, { @@ -158321,7 +170729,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -158331,6 +170740,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features) or (n_samples, n_samples)", "description": "Training vectors, where `n_samples` is the number of samples\nand `n_features` is the number of features.\nFor kernel=\"precomputed\", the expected shape of X is\n(n_samples, n_samples)." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -158341,7 +170754,8 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "Target values (class labels in classification, real numbers in\nregression)." - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -158351,13 +170765,14 @@ "docstring": { "type": "array-like of shape (n_samples,), default=None", "description": "Per-sample weights. Rescale C per sample. Higher weights\nforce the classifier to put more emphasis on these points." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Fit the SVM model according to the given training data.", - "docstring": "Fit the SVM model according to the given training data.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features) or (n_samples, n_samples)\n Training vectors, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n For kernel=\"precomputed\", the expected shape of X is\n (n_samples, n_samples).\n\ny : array-like of shape (n_samples,)\n Target values (class labels in classification, real numbers in\n regression).\n\nsample_weight : array-like of shape (n_samples,), default=None\n Per-sample weights. Rescale C per sample. Higher weights\n force the classifier to put more emphasis on these points.\n\nReturns\n-------\nself : object\n Fitted estimator.\n\nNotes\n-----\nIf X and y are not C-ordered and contiguous arrays of np.float64 and\nX is not a scipy.sparse.csr_matrix, X and/or y may be copied.\n\nIf X is a dense array, then the other methods will not support sparse\nmatrices as input.", + "docstring": "Fit the SVM model according to the given training data.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features) or (n_samples, n_samples)\n Training vectors, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n For kernel=\"precomputed\", the expected shape of X is\n (n_samples, n_samples).\n\n y : array-like of shape (n_samples,)\n Target values (class labels in classification, real numbers in\n regression).\n\n sample_weight : array-like of shape (n_samples,), default=None\n Per-sample weights. Rescale C per sample. Higher weights\n force the classifier to put more emphasis on these points.\n\n Returns\n -------\n self : object\n Fitted estimator.\n\n Notes\n -----\n If X and y are not C-ordered and contiguous arrays of np.float64 and\n X is not a scipy.sparse.csr_matrix, X and/or y may be copied.\n\n If X is a dense array, then the other methods will not support sparse\n matrices as input.\n ", "source_code": "\ndef fit(self, X, y, sample_weight=None):\n \"\"\"Fit the SVM model according to the given training data.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features) or (n_samples, n_samples)\n Training vectors, where `n_samples` is the number of samples\n and `n_features` is the number of features.\n For kernel=\"precomputed\", the expected shape of X is\n (n_samples, n_samples).\n\n y : array-like of shape (n_samples,)\n Target values (class labels in classification, real numbers in\n regression).\n\n sample_weight : array-like of shape (n_samples,), default=None\n Per-sample weights. Rescale C per sample. 
Higher weights\n force the classifier to put more emphasis on these points.\n\n Returns\n -------\n self : object\n Fitted estimator.\n\n Notes\n -----\n If X and y are not C-ordered and contiguous arrays of np.float64 and\n X is not a scipy.sparse.csr_matrix, X and/or y may be copied.\n\n If X is a dense array, then the other methods will not support sparse\n matrices as input.\n \"\"\"\n rnd = check_random_state(self.random_state)\n sparse = sp.isspmatrix(X)\n if sparse and self.kernel == 'precomputed':\n raise TypeError('Sparse precomputed kernels are not supported.')\n self._sparse = sparse and not callable(self.kernel)\n if hasattr(self, 'decision_function_shape'):\n if self.decision_function_shape not in ('ovr', 'ovo'):\n raise ValueError(f\"decision_function_shape must be either 'ovr' or 'ovo', got {self.decision_function_shape}.\")\n if callable(self.kernel):\n check_consistent_length(X, y)\n else:\n (X, y) = self._validate_data(X, y, dtype=np.float64, order='C', accept_sparse='csr', accept_large_sparse=False)\n y = self._validate_targets(y)\n sample_weight = np.asarray([] if sample_weight is None else sample_weight, dtype=np.float64)\n solver_type = LIBSVM_IMPL.index(self._impl)\n n_samples = _num_samples(X)\n if solver_type != 2 and n_samples != y.shape[0]:\n raise ValueError('X and y have incompatible shapes.\\n' + 'X has %s samples, but y has %s.' % (n_samples, y.shape[0]))\n if self.kernel == 'precomputed' and n_samples != X.shape[1]:\n raise ValueError('Precomputed matrix must be a square matrix. Input is a {}x{} matrix.'.format(X.shape[0], X.shape[1]))\n if sample_weight.shape[0] > 0 and sample_weight.shape[0] != n_samples:\n raise ValueError('sample_weight and X have incompatible shapes: %r vs %r\\nNote: Sparse matrices cannot be indexed w/boolean masks (use `indices=True` in CV).' % (sample_weight.shape, X.shape))\n kernel = 'precomputed' if callable(self.kernel) else self.kernel\n if kernel == 'precomputed':\n self._gamma = 0.0\n elif isinstance(self.gamma, str):\n if self.gamma == 'scale':\n X_var = X.multiply(X).mean() - X.mean()**2 if sparse else X.var()\n self._gamma = 1.0 / (X.shape[1] * X_var) if X_var != 0 else 1.0\n elif self.gamma == 'auto':\n self._gamma = 1.0 / X.shape[1]\n else:\n raise ValueError(\"When 'gamma' is a string, it should be either 'scale' or 'auto'. Got '{}' instead.\".format(self.gamma))\n else:\n self._gamma = self.gamma\n fit = self._sparse_fit if self._sparse else self._dense_fit\n if self.verbose:\n print('[LibSVM]', end='')\n seed = rnd.randint(np.iinfo('i').max)\n fit(X, y, sample_weight, solver_type, kernel, random_seed=seed)\n self.shape_fit_ = X.shape if hasattr(X, 'shape') else (n_samples, )\n self._intercept_ = self.intercept_.copy()\n self._dual_coef_ = self.dual_coef_\n if self._impl in ['c_svc', 'nu_svc'] and len(self.classes_) == 2:\n self.intercept_ *= -1\n self.dual_coef_ = -self.dual_coef_\n return self" }, { @@ -158375,7 +170790,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -158399,7 +170815,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -158409,13 +170826,17 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "For kernel=\"precomputed\", the expected shape of X is\n(n_samples_test, n_samples_train)." 
+ }, + "refined_type": { + "kind": "EnumType", + "values": [] } } ], "results": [], "is_public": false, "description": "Perform regression on samples in X.\n\nFor an one-class model, +1 (inlier) or -1 (outlier) is returned.", - "docstring": "Perform regression on samples in X.\n\nFor an one-class model, +1 (inlier) or -1 (outlier) is returned.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n For kernel=\"precomputed\", the expected shape of X is\n (n_samples_test, n_samples_train).\n\nReturns\n-------\ny_pred : ndarray of shape (n_samples,)\n The predicted values.", + "docstring": "Perform regression on samples in X.\n\n For an one-class model, +1 (inlier) or -1 (outlier) is returned.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n For kernel=\"precomputed\", the expected shape of X is\n (n_samples_test, n_samples_train).\n\n Returns\n -------\n y_pred : ndarray of shape (n_samples,)\n The predicted values.\n ", "source_code": "\ndef predict(self, X):\n \"\"\"Perform regression on samples in X.\n\n For an one-class model, +1 (inlier) or -1 (outlier) is returned.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n For kernel=\"precomputed\", the expected shape of X is\n (n_samples_test, n_samples_train).\n\n Returns\n -------\n y_pred : ndarray of shape (n_samples,)\n The predicted values.\n \"\"\"\n X = self._validate_for_predict(X)\n predict = self._sparse_predict if self._sparse else self._dense_predict\n return predict(X)" }, { @@ -158433,7 +170854,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "kernel", @@ -158443,7 +170865,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "degree", @@ -158453,7 +170876,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "gamma", @@ -158463,7 +170887,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "coef0", @@ -158473,7 +170898,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "tol", @@ -158483,7 +170909,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "C", @@ -158493,7 +170920,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "nu", @@ -158503,7 +170931,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "shrinking", @@ -158513,7 +170942,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "probability", @@ -158523,7 +170953,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "cache_size", @@ -158533,7 +170964,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "class_weight", @@ -158543,7 +170975,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "verbose", @@ -158553,7 +170986,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "max_iter", @@ -158563,7 +170997,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "decision_function_shape", @@ -158573,7 +171008,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "random_state", @@ -158583,7 +171019,8 @@ "docstring": { "type": "", "description": "" 
- } + }, + "refined_type": {} }, { "name": "break_ties", @@ -158593,13 +171030,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@abstractmethod\ndef __init__(self, kernel, degree, gamma, coef0, tol, C, nu, shrinking, probability, cache_size, class_weight, verbose, max_iter, decision_function_shape, random_state, break_ties):\n self.decision_function_shape = decision_function_shape\n self.break_ties = break_ties\n super().__init__(kernel=kernel, degree=degree, gamma=gamma, coef0=coef0, tol=tol, C=C, nu=nu, epsilon=0.0, shrinking=shrinking, probability=probability, cache_size=cache_size, class_weight=class_weight, verbose=verbose, max_iter=max_iter, random_state=random_state)" }, { @@ -158617,13 +171055,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _check_proba(self):\n if not self.probability:\n raise AttributeError('predict_proba is not available when probability=False')\n if self._impl not in ('c_svc', 'nu_svc'):\n raise AttributeError('predict_proba only implemented for SVC and NuSVC')\n return True" }, { @@ -158641,7 +171080,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -158651,13 +171091,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _dense_predict_proba(self, X):\n X = self._compute_kernel(X)\n kernel = self.kernel\n if callable(kernel):\n kernel = 'precomputed'\n svm_type = LIBSVM_IMPL.index(self._impl)\n pprob = libsvm.predict_proba(X, self.support_, self.support_vectors_, self._n_support, self._dual_coef_, self._intercept_, self._probA, self._probB, svm_type=svm_type, kernel=kernel, degree=self.degree, cache_size=self.cache_size, coef0=self.coef0, gamma=self._gamma)\n return pprob" }, { @@ -158675,13 +171116,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _get_coef(self):\n if self.dual_coef_.shape[0] == 1:\n coef = safe_sparse_dot(self.dual_coef_, self.support_vectors_)\n else:\n coef = _one_vs_one_coef(self.dual_coef_, self._n_support, self.support_vectors_)\n if sp.issparse(coef[0]):\n coef = sp.vstack(coef).tocsr()\n else:\n coef = np.vstack(coef)\n return coef" }, { @@ -158699,7 +171141,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -158709,13 +171152,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _sparse_predict_proba(self, X):\n X.data = np.asarray(X.data, dtype=np.float64, order='C')\n kernel = self.kernel\n if callable(kernel):\n kernel = 'precomputed'\n kernel_type = self._sparse_kernels.index(kernel)\n return libsvm_sparse.libsvm_sparse_predict_proba(X.data, X.indices, X.indptr, self.support_vectors_.data, self.support_vectors_.indices, self.support_vectors_.indptr, self._dual_coef_.data, self._intercept_, LIBSVM_IMPL.index(self._impl), kernel_type, self.degree, self._gamma, self.coef0, self.tol, self.C, 
self.class_weight_, self.nu, self.epsilon, self.shrinking, self.probability, self._n_support, self._probA, self._probB)" }, { @@ -158733,7 +171177,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -158743,13 +171188,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _validate_targets(self, y):\n y_ = column_or_1d(y, warn=True)\n check_classification_targets(y)\n (cls, y) = np.unique(y_, return_inverse=True)\n self.class_weight_ = compute_class_weight(self.class_weight, classes=cls, y=y_)\n if len(cls) < 2:\n raise ValueError('The number of classes has to be greater than one; got %d class' % len(cls))\n self.classes_ = cls\n return np.asarray(y, dtype=np.float64, order='C')" }, { @@ -158767,7 +171213,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -158777,13 +171224,14 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "The input samples." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Evaluate the decision function for the samples in X.", - "docstring": "Evaluate the decision function for the samples in X.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n The input samples.\n\nReturns\n-------\nX : ndarray of shape (n_samples, n_classes * (n_classes-1) / 2)\n Returns the decision function of the sample for each class\n in the model.\n If decision_function_shape='ovr', the shape is (n_samples,\n n_classes).\n\nNotes\n-----\nIf decision_function_shape='ovo', the function values are proportional\nto the distance of the samples X to the separating hyperplane. If the\nexact distances are required, divide the function values by the norm of\nthe weight vector (``coef_``). See also `this question\n`_ for further details.\nIf decision_function_shape='ovr', the decision function is a monotonic\ntransformation of ovo decision function.", + "docstring": "Evaluate the decision function for the samples in X.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The input samples.\n\n Returns\n -------\n X : ndarray of shape (n_samples, n_classes * (n_classes-1) / 2)\n Returns the decision function of the sample for each class\n in the model.\n If decision_function_shape='ovr', the shape is (n_samples,\n n_classes).\n\n Notes\n -----\n If decision_function_shape='ovo', the function values are proportional\n to the distance of the samples X to the separating hyperplane. If the\n exact distances are required, divide the function values by the norm of\n the weight vector (``coef_``). 
See also `this question\n `_ for further details.\n If decision_function_shape='ovr', the decision function is a monotonic\n transformation of ovo decision function.\n ", "source_code": "\ndef decision_function(self, X):\n \"\"\"Evaluate the decision function for the samples in X.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The input samples.\n\n Returns\n -------\n X : ndarray of shape (n_samples, n_classes * (n_classes-1) / 2)\n Returns the decision function of the sample for each class\n in the model.\n If decision_function_shape='ovr', the shape is (n_samples,\n n_classes).\n\n Notes\n -----\n If decision_function_shape='ovo', the function values are proportional\n to the distance of the samples X to the separating hyperplane. If the\n exact distances are required, divide the function values by the norm of\n the weight vector (``coef_``). See also `this question\n `_ for further details.\n If decision_function_shape='ovr', the decision function is a monotonic\n transformation of ovo decision function.\n \"\"\"\n dec = self._decision_function(X)\n if self.decision_function_shape == 'ovr' and len(self.classes_) > 2:\n return _ovr_decision_function(dec < 0, -dec, len(self.classes_))\n return dec" }, { @@ -158801,7 +171249,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -158811,13 +171260,17 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features) or (n_samples_test, n_samples_train)", "description": "For kernel=\"precomputed\", the expected shape of X is\n(n_samples_test, n_samples_train)." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } } ], "results": [], "is_public": false, "description": "Perform classification on samples in X.\n\nFor an one-class model, +1 or -1 is returned.", - "docstring": "Perform classification on samples in X.\n\nFor an one-class model, +1 or -1 is returned.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features) or (n_samples_test, n_samples_train)\n For kernel=\"precomputed\", the expected shape of X is\n (n_samples_test, n_samples_train).\n\nReturns\n-------\ny_pred : ndarray of shape (n_samples,)\n Class labels for samples in X.", + "docstring": "Perform classification on samples in X.\n\n For an one-class model, +1 or -1 is returned.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features) or (n_samples_test, n_samples_train)\n For kernel=\"precomputed\", the expected shape of X is\n (n_samples_test, n_samples_train).\n\n Returns\n -------\n y_pred : ndarray of shape (n_samples,)\n Class labels for samples in X.\n ", "source_code": "\ndef predict(self, X):\n \"\"\"Perform classification on samples in X.\n\n For an one-class model, +1 or -1 is returned.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features) or (n_samples_test, n_samples_train)\n For kernel=\"precomputed\", the expected shape of X is\n (n_samples_test, n_samples_train).\n\n Returns\n -------\n y_pred : ndarray of shape (n_samples,)\n Class labels for samples in X.\n \"\"\"\n check_is_fitted(self)\n if self.break_ties and self.decision_function_shape == 'ovo':\n raise ValueError(\"break_ties must be False when decision_function_shape is 'ovo'\")\n if self.break_ties and self.decision_function_shape == 'ovr' and len(self.classes_) > 2:\n y = np.argmax(self.decision_function(X), axis=1)\n else:\n y = super().predict(X)\n return 
self.classes_.take(np.asarray(y, dtype=np.intp))" }, { @@ -158835,7 +171288,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -158845,13 +171299,14 @@ "docstring": { "type": "array-like of shape (n_samples, n_features) or (n_samples_test, n_samples_train)", "description": "For kernel=\"precomputed\", the expected shape of X is\n(n_samples_test, n_samples_train)." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Compute log probabilities of possible outcomes for samples in X.\n\nThe model need to have probability information computed at training time: fit with attribute `probability` set to True.", - "docstring": "Compute log probabilities of possible outcomes for samples in X.\n\nThe model need to have probability information computed at training\ntime: fit with attribute `probability` set to True.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features) or (n_samples_test, n_samples_train)\n For kernel=\"precomputed\", the expected shape of X is\n (n_samples_test, n_samples_train).\n\nReturns\n-------\nT : ndarray of shape (n_samples, n_classes)\n Returns the log-probabilities of the sample for each class in\n the model. The columns correspond to the classes in sorted\n order, as they appear in the attribute :term:`classes_`.\n\nNotes\n-----\nThe probability model is created using cross validation, so\nthe results can be slightly different than those obtained by\npredict. Also, it will produce meaningless results on very small\ndatasets.", + "description": "Compute log probabilities of possible outcomes for samples in X.\n\nThe model need to have probability information computed at training\ntime: fit with attribute `probability` set to True.", + "docstring": "Compute log probabilities of possible outcomes for samples in X.\n\n The model need to have probability information computed at training\n time: fit with attribute `probability` set to True.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features) or (n_samples_test, n_samples_train)\n For kernel=\"precomputed\", the expected shape of X is\n (n_samples_test, n_samples_train).\n\n Returns\n -------\n T : ndarray of shape (n_samples, n_classes)\n Returns the log-probabilities of the sample for each class in\n the model. The columns correspond to the classes in sorted\n order, as they appear in the attribute :term:`classes_`.\n\n Notes\n -----\n The probability model is created using cross validation, so\n the results can be slightly different than those obtained by\n predict. Also, it will produce meaningless results on very small\n datasets.\n ", "source_code": "\n@available_if(_check_proba)\ndef predict_log_proba(self, X):\n \"\"\"Compute log probabilities of possible outcomes for samples in X.\n\n The model need to have probability information computed at training\n time: fit with attribute `probability` set to True.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features) or (n_samples_test, n_samples_train)\n For kernel=\"precomputed\", the expected shape of X is\n (n_samples_test, n_samples_train).\n\n Returns\n -------\n T : ndarray of shape (n_samples, n_classes)\n Returns the log-probabilities of the sample for each class in\n the model. 
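Editor's note: the `decision_function` and `predict` entries recorded above show that, with `decision_function_shape='ovr'`, `break_ties=True` and more than two classes, `predict` is simply the argmax of the decision values. A minimal sketch of that relationship (an illustration only; the public `SVC` class and `load_iris` helper are assumed here and are not part of this record):

import numpy as np
from sklearn.datasets import load_iris
from sklearn.svm import SVC

X, y = load_iris(return_X_y=True)
clf = SVC(kernel='rbf', decision_function_shape='ovr', break_ties=True).fit(X, y)

dec = clf.decision_function(X[:5])    # shape (5, n_classes) with 'ovr'
pred = clf.predict(X[:5])             # ties broken via the decision values
# matches the recorded source: classes_.take(argmax(decision_function(X)))
assert np.array_equal(pred, clf.classes_.take(np.argmax(dec, axis=1)))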
The columns correspond to the classes in sorted\n order, as they appear in the attribute :term:`classes_`.\n\n Notes\n -----\n The probability model is created using cross validation, so\n the results can be slightly different than those obtained by\n predict. Also, it will produce meaningless results on very small\n datasets.\n \"\"\"\n return np.log(self.predict_proba(X))" }, { @@ -158869,7 +171324,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -158879,13 +171335,14 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "For kernel=\"precomputed\", the expected shape of X is\n(n_samples_test, n_samples_train)." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Compute probabilities of possible outcomes for samples in X.\n\nThe model need to have probability information computed at training time: fit with attribute `probability` set to True.", - "docstring": "Compute probabilities of possible outcomes for samples in X.\n\nThe model need to have probability information computed at training\ntime: fit with attribute `probability` set to True.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n For kernel=\"precomputed\", the expected shape of X is\n (n_samples_test, n_samples_train).\n\nReturns\n-------\nT : ndarray of shape (n_samples, n_classes)\n Returns the probability of the sample for each class in\n the model. The columns correspond to the classes in sorted\n order, as they appear in the attribute :term:`classes_`.\n\nNotes\n-----\nThe probability model is created using cross validation, so\nthe results can be slightly different than those obtained by\npredict. Also, it will produce meaningless results on very small\ndatasets.", + "description": "Compute probabilities of possible outcomes for samples in X.\n\nThe model need to have probability information computed at training\ntime: fit with attribute `probability` set to True.", + "docstring": "Compute probabilities of possible outcomes for samples in X.\n\n The model need to have probability information computed at training\n time: fit with attribute `probability` set to True.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n For kernel=\"precomputed\", the expected shape of X is\n (n_samples_test, n_samples_train).\n\n Returns\n -------\n T : ndarray of shape (n_samples, n_classes)\n Returns the probability of the sample for each class in\n the model. The columns correspond to the classes in sorted\n order, as they appear in the attribute :term:`classes_`.\n\n Notes\n -----\n The probability model is created using cross validation, so\n the results can be slightly different than those obtained by\n predict. Also, it will produce meaningless results on very small\n datasets.\n ", "source_code": "\n@available_if(_check_proba)\ndef predict_proba(self, X):\n \"\"\"Compute probabilities of possible outcomes for samples in X.\n\n The model need to have probability information computed at training\n time: fit with attribute `probability` set to True.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n For kernel=\"precomputed\", the expected shape of X is\n (n_samples_test, n_samples_train).\n\n Returns\n -------\n T : ndarray of shape (n_samples, n_classes)\n Returns the probability of the sample for each class in\n the model. 
The columns correspond to the classes in sorted\n order, as they appear in the attribute :term:`classes_`.\n\n Notes\n -----\n The probability model is created using cross validation, so\n the results can be slightly different than those obtained by\n predict. Also, it will produce meaningless results on very small\n datasets.\n \"\"\"\n X = self._validate_for_predict(X)\n if self.probA_.size == 0 or self.probB_.size == 0:\n raise NotFittedError('predict_proba is not available when fitted with probability=False')\n pred_proba = self._sparse_predict_proba if self._sparse else self._dense_predict_proba\n return pred_proba(X)" }, { @@ -158903,13 +171360,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Parameter learned in Platt scaling when `probability=True`.", - "docstring": "Parameter learned in Platt scaling when `probability=True`.\n\nReturns\n-------\nndarray of shape (n_classes * (n_classes - 1) / 2)", + "docstring": "Parameter learned in Platt scaling when `probability=True`.\n\n Returns\n -------\n ndarray of shape (n_classes * (n_classes - 1) / 2)\n ", "source_code": "\n@property\ndef probA_(self):\n \"\"\"Parameter learned in Platt scaling when `probability=True`.\n\n Returns\n -------\n ndarray of shape (n_classes * (n_classes - 1) / 2)\n \"\"\"\n return self._probA" }, { @@ -158927,13 +171385,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Parameter learned in Platt scaling when `probability=True`.", - "docstring": "Parameter learned in Platt scaling when `probability=True`.\n\nReturns\n-------\nndarray of shape (n_classes * (n_classes - 1) / 2)", + "docstring": "Parameter learned in Platt scaling when `probability=True`.\n\n Returns\n -------\n ndarray of shape (n_classes * (n_classes - 1) / 2)\n ", "source_code": "\n@property\ndef probB_(self):\n \"\"\"Parameter learned in Platt scaling when `probability=True`.\n\n Returns\n -------\n ndarray of shape (n_classes * (n_classes - 1) / 2)\n \"\"\"\n return self._probB" }, { @@ -158951,6 +171410,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "Training vector, where `n_samples` is the number of samples and\n`n_features` is the number of features." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -158961,7 +171424,8 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "Target vector relative to X" - } + }, + "refined_type": {} }, { "name": "C", @@ -158971,7 +171435,8 @@ "docstring": { "type": "float", "description": "Inverse of cross-validation parameter. Lower the C, the more\nthe penalization." - } + }, + "refined_type": {} }, { "name": "fit_intercept", @@ -158981,7 +171446,8 @@ "docstring": { "type": "bool", "description": "Whether or not to fit the intercept, that is to add a intercept\nterm to the decision function." - } + }, + "refined_type": {} }, { "name": "intercept_scaling", @@ -158991,7 +171457,8 @@ "docstring": { "type": "float", "description": "LibLinear internally penalizes the intercept and this term is subject\nto regularization just like the other terms of the feature vector.\nIn order to avoid this, one should increase the intercept_scaling.\nsuch that the feature vector becomes [x, intercept_scaling]." 
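Editor's note: the `predict_proba`, `predict_log_proba`, `probA_` and `probB_` entries above all require the estimator to have been fitted with `probability=True`; otherwise `predict_proba` raises `NotFittedError` as the recorded source shows. A minimal usage sketch (illustration only; `SVC` and `make_classification` are assumed, not part of this record):

import numpy as np
from sklearn.datasets import make_classification
from sklearn.svm import SVC

X, y = make_classification(n_samples=200, n_features=4, random_state=0)
clf = SVC(probability=True, random_state=0).fit(X, y)

proba = clf.predict_proba(X[:3])          # shape (3, n_classes), columns follow classes_
log_proba = clf.predict_log_proba(X[:3])  # equals np.log(proba)
print(clf.probA_, clf.probB_)             # Platt-scaling parameters learned during fit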
- } + }, + "refined_type": {} }, { "name": "class_weight", @@ -159001,6 +171468,10 @@ "docstring": { "type": "dict or 'balanced', default=None", "description": "Weights associated with classes in the form ``{class_label: weight}``.\nIf not given, all classes are supposed to have weight one. For\nmulti-output problems, a list of dicts can be provided in the same\norder as the columns of y.\n\nThe \"balanced\" mode uses the values of y to automatically adjust\nweights inversely proportional to class frequencies in the input data\nas ``n_samples / (n_classes * np.bincount(y))``" + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -159011,6 +171482,10 @@ "docstring": { "type": "{'l1', 'l2'}", "description": "The norm of the penalty used in regularization." + }, + "refined_type": { + "kind": "EnumType", + "values": ["l2", "l1"] } }, { @@ -159021,7 +171496,8 @@ "docstring": { "type": "bool", "description": "Dual or primal formulation," - } + }, + "refined_type": {} }, { "name": "verbose", @@ -159031,7 +171507,8 @@ "docstring": { "type": "int", "description": "Set verbose to any positive number for verbosity." - } + }, + "refined_type": {} }, { "name": "max_iter", @@ -159041,7 +171518,8 @@ "docstring": { "type": "int", "description": "Number of iterations." - } + }, + "refined_type": {} }, { "name": "tol", @@ -159051,7 +171529,8 @@ "docstring": { "type": "float", "description": "Stopping condition." - } + }, + "refined_type": {} }, { "name": "random_state", @@ -159061,7 +171540,8 @@ "docstring": { "type": "int, RandomState instance or None, default=None", "description": "Controls the pseudo random number generation for shuffling the data.\nPass an int for reproducible output across multiple function calls.\nSee :term:`Glossary `." - } + }, + "refined_type": {} }, { "name": "multi_class", @@ -159071,6 +171551,10 @@ "docstring": { "type": "{'ovr', 'crammer_singer'}, default='ovr'", "description": "`ovr` trains n_classes one-vs-rest classifiers, while `crammer_singer`\noptimizes a joint objective over all classes.\nWhile `crammer_singer` is interesting from an theoretical perspective\nas it is consistent it is seldom used in practice and rarely leads to\nbetter accuracy and is more expensive to compute.\nIf `crammer_singer` is chosen, the options loss, penalty and dual will\nbe ignored." + }, + "refined_type": { + "kind": "EnumType", + "values": ["crammer_singer", "ovr"] } }, { @@ -159081,6 +171565,15 @@ "docstring": { "type": "{'logistic_regression', 'hinge', 'squared_hinge', 'epsilon_insensitive', 'squared_epsilon_insensitive}, default='logistic_regression'", "description": "The loss function used to fit the model." + }, + "refined_type": { + "kind": "EnumType", + "values": [ + "logistic_regression", + "squared_hinge", + "hinge", + "epsilon_insensitive" + ] } }, { @@ -159091,7 +171584,8 @@ "docstring": { "type": "float, default=0.1", "description": "Epsilon parameter in the epsilon-insensitive loss function. Note\nthat the value of this parameter depends on the scale of the target\nvariable y. If unsure, set epsilon=0." - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -159101,13 +171595,14 @@ "docstring": { "type": "array-like of shape (n_samples,), default=None", "description": "Weights assigned to each sample." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Used by Logistic Regression (and CV) and LinearSVC/LinearSVR.\n\nPreprocessing is done in this function before supplying it to liblinear.", - "docstring": "Used by Logistic Regression (and CV) and LinearSVC/LinearSVR.\n\nPreprocessing is done in this function before supplying it to liblinear.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training vector, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\ny : array-like of shape (n_samples,)\n Target vector relative to X\n\nC : float\n Inverse of cross-validation parameter. Lower the C, the more\n the penalization.\n\nfit_intercept : bool\n Whether or not to fit the intercept, that is to add a intercept\n term to the decision function.\n\nintercept_scaling : float\n LibLinear internally penalizes the intercept and this term is subject\n to regularization just like the other terms of the feature vector.\n In order to avoid this, one should increase the intercept_scaling.\n such that the feature vector becomes [x, intercept_scaling].\n\nclass_weight : dict or 'balanced', default=None\n Weights associated with classes in the form ``{class_label: weight}``.\n If not given, all classes are supposed to have weight one. For\n multi-output problems, a list of dicts can be provided in the same\n order as the columns of y.\n\n The \"balanced\" mode uses the values of y to automatically adjust\n weights inversely proportional to class frequencies in the input data\n as ``n_samples / (n_classes * np.bincount(y))``\n\npenalty : {'l1', 'l2'}\n The norm of the penalty used in regularization.\n\ndual : bool\n Dual or primal formulation,\n\nverbose : int\n Set verbose to any positive number for verbosity.\n\nmax_iter : int\n Number of iterations.\n\ntol : float\n Stopping condition.\n\nrandom_state : int, RandomState instance or None, default=None\n Controls the pseudo random number generation for shuffling the data.\n Pass an int for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\nmulti_class : {'ovr', 'crammer_singer'}, default='ovr'\n `ovr` trains n_classes one-vs-rest classifiers, while `crammer_singer`\n optimizes a joint objective over all classes.\n While `crammer_singer` is interesting from an theoretical perspective\n as it is consistent it is seldom used in practice and rarely leads to\n better accuracy and is more expensive to compute.\n If `crammer_singer` is chosen, the options loss, penalty and dual will\n be ignored.\n\nloss : {'logistic_regression', 'hinge', 'squared_hinge', 'epsilon_insensitive', 'squared_epsilon_insensitive}, default='logistic_regression'\n The loss function used to fit the model.\n\nepsilon : float, default=0.1\n Epsilon parameter in the epsilon-insensitive loss function. Note\n that the value of this parameter depends on the scale of the target\n variable y. 
If unsure, set epsilon=0.\n\nsample_weight : array-like of shape (n_samples,), default=None\n Weights assigned to each sample.\n\nReturns\n-------\ncoef_ : ndarray of shape (n_features, n_features + 1)\n The coefficient vector got by minimizing the objective function.\n\nintercept_ : float\n The intercept term added to the vector.\n\nn_iter_ : int\n Maximum number of iterations run across all classes.", + "docstring": "Used by Logistic Regression (and CV) and LinearSVC/LinearSVR.\n\n Preprocessing is done in this function before supplying it to liblinear.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training vector, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\n y : array-like of shape (n_samples,)\n Target vector relative to X\n\n C : float\n Inverse of cross-validation parameter. Lower the C, the more\n the penalization.\n\n fit_intercept : bool\n Whether or not to fit the intercept, that is to add a intercept\n term to the decision function.\n\n intercept_scaling : float\n LibLinear internally penalizes the intercept and this term is subject\n to regularization just like the other terms of the feature vector.\n In order to avoid this, one should increase the intercept_scaling.\n such that the feature vector becomes [x, intercept_scaling].\n\n class_weight : dict or 'balanced', default=None\n Weights associated with classes in the form ``{class_label: weight}``.\n If not given, all classes are supposed to have weight one. For\n multi-output problems, a list of dicts can be provided in the same\n order as the columns of y.\n\n The \"balanced\" mode uses the values of y to automatically adjust\n weights inversely proportional to class frequencies in the input data\n as ``n_samples / (n_classes * np.bincount(y))``\n\n penalty : {'l1', 'l2'}\n The norm of the penalty used in regularization.\n\n dual : bool\n Dual or primal formulation,\n\n verbose : int\n Set verbose to any positive number for verbosity.\n\n max_iter : int\n Number of iterations.\n\n tol : float\n Stopping condition.\n\n random_state : int, RandomState instance or None, default=None\n Controls the pseudo random number generation for shuffling the data.\n Pass an int for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n multi_class : {'ovr', 'crammer_singer'}, default='ovr'\n `ovr` trains n_classes one-vs-rest classifiers, while `crammer_singer`\n optimizes a joint objective over all classes.\n While `crammer_singer` is interesting from an theoretical perspective\n as it is consistent it is seldom used in practice and rarely leads to\n better accuracy and is more expensive to compute.\n If `crammer_singer` is chosen, the options loss, penalty and dual will\n be ignored.\n\n loss : {'logistic_regression', 'hinge', 'squared_hinge', 'epsilon_insensitive', 'squared_epsilon_insensitive}, default='logistic_regression'\n The loss function used to fit the model.\n\n epsilon : float, default=0.1\n Epsilon parameter in the epsilon-insensitive loss function. Note\n that the value of this parameter depends on the scale of the target\n variable y. 
If unsure, set epsilon=0.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Weights assigned to each sample.\n\n Returns\n -------\n coef_ : ndarray of shape (n_features, n_features + 1)\n The coefficient vector got by minimizing the objective function.\n\n intercept_ : float\n The intercept term added to the vector.\n\n n_iter_ : int\n Maximum number of iterations run across all classes.\n ", "source_code": "\ndef _fit_liblinear(X, y, C, fit_intercept, intercept_scaling, class_weight, penalty, dual, verbose, max_iter, tol, random_state=None, multi_class='ovr', loss='logistic_regression', epsilon=0.1, sample_weight=None):\n \"\"\"Used by Logistic Regression (and CV) and LinearSVC/LinearSVR.\n\n Preprocessing is done in this function before supplying it to liblinear.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training vector, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\n y : array-like of shape (n_samples,)\n Target vector relative to X\n\n C : float\n Inverse of cross-validation parameter. Lower the C, the more\n the penalization.\n\n fit_intercept : bool\n Whether or not to fit the intercept, that is to add a intercept\n term to the decision function.\n\n intercept_scaling : float\n LibLinear internally penalizes the intercept and this term is subject\n to regularization just like the other terms of the feature vector.\n In order to avoid this, one should increase the intercept_scaling.\n such that the feature vector becomes [x, intercept_scaling].\n\n class_weight : dict or 'balanced', default=None\n Weights associated with classes in the form ``{class_label: weight}``.\n If not given, all classes are supposed to have weight one. For\n multi-output problems, a list of dicts can be provided in the same\n order as the columns of y.\n\n The \"balanced\" mode uses the values of y to automatically adjust\n weights inversely proportional to class frequencies in the input data\n as ``n_samples / (n_classes * np.bincount(y))``\n\n penalty : {'l1', 'l2'}\n The norm of the penalty used in regularization.\n\n dual : bool\n Dual or primal formulation,\n\n verbose : int\n Set verbose to any positive number for verbosity.\n\n max_iter : int\n Number of iterations.\n\n tol : float\n Stopping condition.\n\n random_state : int, RandomState instance or None, default=None\n Controls the pseudo random number generation for shuffling the data.\n Pass an int for reproducible output across multiple function calls.\n See :term:`Glossary `.\n\n multi_class : {'ovr', 'crammer_singer'}, default='ovr'\n `ovr` trains n_classes one-vs-rest classifiers, while `crammer_singer`\n optimizes a joint objective over all classes.\n While `crammer_singer` is interesting from an theoretical perspective\n as it is consistent it is seldom used in practice and rarely leads to\n better accuracy and is more expensive to compute.\n If `crammer_singer` is chosen, the options loss, penalty and dual will\n be ignored.\n\n loss : {'logistic_regression', 'hinge', 'squared_hinge', 'epsilon_insensitive', 'squared_epsilon_insensitive}, default='logistic_regression'\n The loss function used to fit the model.\n\n epsilon : float, default=0.1\n Epsilon parameter in the epsilon-insensitive loss function. Note\n that the value of this parameter depends on the scale of the target\n variable y. 
If unsure, set epsilon=0.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Weights assigned to each sample.\n\n Returns\n -------\n coef_ : ndarray of shape (n_features, n_features + 1)\n The coefficient vector got by minimizing the objective function.\n\n intercept_ : float\n The intercept term added to the vector.\n\n n_iter_ : int\n Maximum number of iterations run across all classes.\n \"\"\"\n if loss not in ['epsilon_insensitive', 'squared_epsilon_insensitive']:\n enc = LabelEncoder()\n y_ind = enc.fit_transform(y)\n classes_ = enc.classes_\n if len(classes_) < 2:\n raise ValueError('This solver needs samples of at least 2 classes in the data, but the data contains only one class: %r' % classes_[0])\n class_weight_ = compute_class_weight(class_weight, classes=classes_, y=y)\n else:\n class_weight_ = np.empty(0, dtype=np.float64)\n y_ind = y\n liblinear.set_verbosity_wrap(verbose)\n rnd = check_random_state(random_state)\n if verbose:\n print('[LibLinear]', end='')\n bias = -1.0\n if fit_intercept:\n if intercept_scaling <= 0:\n raise ValueError('Intercept scaling is %r but needs to be greater than 0. To disable fitting an intercept, set fit_intercept=False.' % intercept_scaling)\n else:\n bias = intercept_scaling\n libsvm.set_verbosity_wrap(verbose)\n libsvm_sparse.set_verbosity_wrap(verbose)\n liblinear.set_verbosity_wrap(verbose)\n if sp.issparse(X):\n _check_large_sparse(X)\n y_ind = np.asarray(y_ind, dtype=np.float64).ravel()\n y_ind = np.require(y_ind, requirements='W')\n sample_weight = _check_sample_weight(sample_weight, X, dtype=np.float64)\n solver_type = _get_liblinear_solver_type(multi_class, penalty, loss, dual)\n (raw_coef_, n_iter_) = liblinear.train_wrap(X, y_ind, sp.isspmatrix(X), solver_type, tol, bias, C, class_weight_, max_iter, rnd.randint(np.iinfo('i').max), epsilon, sample_weight)\n n_iter_ = max(n_iter_)\n if n_iter_ >= max_iter:\n warnings.warn('Liblinear failed to converge, increase the number of iterations.', ConvergenceWarning)\n if fit_intercept:\n coef_ = raw_coef_[:, :-1]\n intercept_ = intercept_scaling * raw_coef_[:, -1]\n else:\n coef_ = raw_coef_\n intercept_ = 0.0\n return coef_, intercept_, n_iter_" }, { @@ -159125,7 +171620,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "penalty", @@ -159135,7 +171631,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "loss", @@ -159145,7 +171642,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "dual", @@ -159155,13 +171653,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Find the liblinear magic number for the solver.\n\nThis number depends on the values of the following attributes: - multi_class - penalty - loss - dual The same number is also internally used by LibLinear to determine which solver to use.", - "docstring": "Find the liblinear magic number for the solver.\n\nThis number depends on the values of the following attributes:\n - multi_class\n - penalty\n - loss\n - dual\n\nThe same number is also internally used by LibLinear to determine\nwhich solver to use.", + "description": "Find the liblinear magic number for the solver.\n\nThis number depends on the values of the following attributes:\n - multi_class\n - penalty\n - loss\n - dual\n\nThe same number is also internally used by LibLinear to determine\nwhich solver to use.", + "docstring": "Find the liblinear 
magic number for the solver.\n\n This number depends on the values of the following attributes:\n - multi_class\n - penalty\n - loss\n - dual\n\n The same number is also internally used by LibLinear to determine\n which solver to use.\n ", "source_code": "\ndef _get_liblinear_solver_type(multi_class, penalty, loss, dual):\n \"\"\"Find the liblinear magic number for the solver.\n\n This number depends on the values of the following attributes:\n - multi_class\n - penalty\n - loss\n - dual\n\n The same number is also internally used by LibLinear to determine\n which solver to use.\n \"\"\"\n _solver_type_dict = {'logistic_regression': {'l1': {False: 6}, 'l2': {False: 0, True: 7}}, 'hinge': {'l2': {True: 3}}, 'squared_hinge': {'l1': {False: 5}, 'l2': {False: 2, True: 1}}, 'epsilon_insensitive': {'l2': {True: 13}}, 'squared_epsilon_insensitive': {'l2': {False: 11, True: 12}}, 'crammer_singer': 4}\n if multi_class == 'crammer_singer':\n return _solver_type_dict[multi_class]\n elif multi_class != 'ovr':\n raise ValueError('`multi_class` must be one of `ovr`, `crammer_singer`, got %r' % multi_class)\n _solver_pen = _solver_type_dict.get(loss, None)\n if _solver_pen is None:\n error_string = \"loss='%s' is not supported\" % loss\n else:\n _solver_dual = _solver_pen.get(penalty, None)\n if _solver_dual is None:\n error_string = \"The combination of penalty='%s' and loss='%s' is not supported\" % (penalty, loss)\n else:\n solver_num = _solver_dual.get(dual, None)\n if solver_num is None:\n error_string = \"The combination of penalty='%s' and loss='%s' are not supported when dual=%s\" % (penalty, loss, dual)\n else:\n return solver_num\n raise ValueError('Unsupported set of arguments: %s, Parameters: penalty=%r, loss=%r, dual=%r' % (error_string, penalty, loss, dual))" }, { @@ -159179,7 +171678,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_support", @@ -159189,7 +171689,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "support_vectors", @@ -159199,13 +171700,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Generate primal coefficients from dual coefficients for the one-vs-one multi class LibSVM in the case of a linear kernel.", - "docstring": "Generate primal coefficients from dual coefficients\nfor the one-vs-one multi class LibSVM in the case\nof a linear kernel.", + "description": "Generate primal coefficients from dual coefficients\nfor the one-vs-one multi class LibSVM in the case\nof a linear kernel.", + "docstring": "Generate primal coefficients from dual coefficients\n for the one-vs-one multi class LibSVM in the case\n of a linear kernel.", "source_code": "\ndef _one_vs_one_coef(dual_coef, n_support, support_vectors):\n \"\"\"Generate primal coefficients from dual coefficients\n for the one-vs-one multi class LibSVM in the case\n of a linear kernel.\"\"\"\n n_class = dual_coef.shape[0] + 1\n coef = []\n sv_locs = np.cumsum(np.hstack([[0], n_support]))\n for class1 in range(n_class):\n sv1 = support_vectors[sv_locs[class1]:sv_locs[class1 + 1], :]\n for class2 in range(class1 + 1, n_class):\n sv2 = support_vectors[sv_locs[class2]:sv_locs[class2 + 1], :]\n alpha1 = dual_coef[class2 - 1, sv_locs[class1]:sv_locs[class1 + 1]]\n alpha2 = dual_coef[class1, sv_locs[class2]:sv_locs[class2 + 1]]\n coef.append(safe_sparse_dot(alpha1, sv1) + safe_sparse_dot(alpha2, sv2))\n return coef" }, { @@ -159223,6 +171725,10 @@ 
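Editor's note: `_get_liblinear_solver_type` above maps `(multi_class, penalty, loss, dual)` to a liblinear solver id and raises for combinations that have no entry in its lookup table. From the public API this surfaces as a `ValueError` at fit time; a small sketch of that behaviour (illustration only; `LinearSVC` and `make_classification` are assumed, not part of this record):

from sklearn.datasets import make_classification
from sklearn.svm import LinearSVC

X, y = make_classification(n_samples=100, n_features=5, random_state=0)
try:
    # penalty='l1' with loss='hinge' has no liblinear solver id
    LinearSVC(penalty='l1', loss='hinge', dual=True).fit(X, y)
except ValueError as exc:
    print(exc)  # "Unsupported set of arguments: ..."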
"docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "Training vector, where `n_samples` is the number of samples and\n`n_features` is the number of features." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -159233,7 +171739,8 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "Target vector relative to X." - } + }, + "refined_type": {} }, { "name": "loss", @@ -159243,6 +171750,10 @@ "docstring": { "type": "{'squared_hinge', 'log'}, default='squared_hinge'", "description": "Specifies the loss function.\nWith 'squared_hinge' it is the squared hinge loss (a.k.a. L2 loss).\nWith 'log' it is the loss of logistic regression models." + }, + "refined_type": { + "kind": "EnumType", + "values": ["log", "squared_hinge"] } }, { @@ -159253,7 +171764,8 @@ "docstring": { "type": "bool, default=True", "description": "Specifies if the intercept should be fitted by the model.\nIt must match the fit() method parameter." - } + }, + "refined_type": {} }, { "name": "intercept_scaling", @@ -159263,13 +171775,14 @@ "docstring": { "type": "float, default=1.0", "description": "when fit_intercept is True, instance vector x becomes\n[x, intercept_scaling],\ni.e. a \"synthetic\" feature with constant value equals to\nintercept_scaling is appended to the instance vector.\nIt must match the fit() method parameter." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Return the lowest bound for C such that for C in (l1_min_C, infinity) the model is guaranteed not to be empty. This applies to l1 penalized classifiers, such as LinearSVC with penalty='l1' and linear_model.LogisticRegression with penalty='l1'.\n\nThis value is valid if class_weight parameter in fit() is not set.", - "docstring": "Return the lowest bound for C such that for C in (l1_min_C, infinity)\nthe model is guaranteed not to be empty. This applies to l1 penalized\nclassifiers, such as LinearSVC with penalty='l1' and\nlinear_model.LogisticRegression with penalty='l1'.\n\nThis value is valid if class_weight parameter in fit() is not set.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training vector, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\ny : array-like of shape (n_samples,)\n Target vector relative to X.\n\nloss : {'squared_hinge', 'log'}, default='squared_hinge'\n Specifies the loss function.\n With 'squared_hinge' it is the squared hinge loss (a.k.a. L2 loss).\n With 'log' it is the loss of logistic regression models.\n\nfit_intercept : bool, default=True\n Specifies if the intercept should be fitted by the model.\n It must match the fit() method parameter.\n\nintercept_scaling : float, default=1.0\n when fit_intercept is True, instance vector x becomes\n [x, intercept_scaling],\n i.e. a \"synthetic\" feature with constant value equals to\n intercept_scaling is appended to the instance vector.\n It must match the fit() method parameter.\n\nReturns\n-------\nl1_min_c : float\n minimum value for C", + "description": "Return the lowest bound for C such that for C in (l1_min_C, infinity)\nthe model is guaranteed not to be empty. 
This applies to l1 penalized\nclassifiers, such as LinearSVC with penalty='l1' and\nlinear_model.LogisticRegression with penalty='l1'.\n\nThis value is valid if class_weight parameter in fit() is not set.", + "docstring": "\n Return the lowest bound for C such that for C in (l1_min_C, infinity)\n the model is guaranteed not to be empty. This applies to l1 penalized\n classifiers, such as LinearSVC with penalty='l1' and\n linear_model.LogisticRegression with penalty='l1'.\n\n This value is valid if class_weight parameter in fit() is not set.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training vector, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\n y : array-like of shape (n_samples,)\n Target vector relative to X.\n\n loss : {'squared_hinge', 'log'}, default='squared_hinge'\n Specifies the loss function.\n With 'squared_hinge' it is the squared hinge loss (a.k.a. L2 loss).\n With 'log' it is the loss of logistic regression models.\n\n fit_intercept : bool, default=True\n Specifies if the intercept should be fitted by the model.\n It must match the fit() method parameter.\n\n intercept_scaling : float, default=1.0\n when fit_intercept is True, instance vector x becomes\n [x, intercept_scaling],\n i.e. a \"synthetic\" feature with constant value equals to\n intercept_scaling is appended to the instance vector.\n It must match the fit() method parameter.\n\n Returns\n -------\n l1_min_c : float\n minimum value for C\n ", "source_code": "\ndef l1_min_c(X, y, *, loss='squared_hinge', fit_intercept=True, intercept_scaling=1.0):\n \"\"\"\n Return the lowest bound for C such that for C in (l1_min_C, infinity)\n the model is guaranteed not to be empty. This applies to l1 penalized\n classifiers, such as LinearSVC with penalty='l1' and\n linear_model.LogisticRegression with penalty='l1'.\n\n This value is valid if class_weight parameter in fit() is not set.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training vector, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\n y : array-like of shape (n_samples,)\n Target vector relative to X.\n\n loss : {'squared_hinge', 'log'}, default='squared_hinge'\n Specifies the loss function.\n With 'squared_hinge' it is the squared hinge loss (a.k.a. L2 loss).\n With 'log' it is the loss of logistic regression models.\n\n fit_intercept : bool, default=True\n Specifies if the intercept should be fitted by the model.\n It must match the fit() method parameter.\n\n intercept_scaling : float, default=1.0\n when fit_intercept is True, instance vector x becomes\n [x, intercept_scaling],\n i.e. 
a \"synthetic\" feature with constant value equals to\n intercept_scaling is appended to the instance vector.\n It must match the fit() method parameter.\n\n Returns\n -------\n l1_min_c : float\n minimum value for C\n \"\"\"\n if loss not in ('squared_hinge', 'log'):\n raise ValueError('loss type not in (\"squared_hinge\", \"log\")')\n X = check_array(X, accept_sparse='csc')\n check_consistent_length(X, y)\n Y = LabelBinarizer(neg_label=-1).fit_transform(y).T\n den = np.max(np.abs(safe_sparse_dot(Y, X)))\n if fit_intercept:\n bias = np.full((np.size(y), 1), intercept_scaling, dtype=np.array(intercept_scaling).dtype)\n den = max(den, abs(np.dot(Y, bias)).max())\n if den == 0.0:\n raise ValueError('Ill-posed l1_min_c calculation: l1 will always select zero coefficients for this data')\n if loss == 'squared_hinge':\n return 0.5 / den\n else:\n return 2.0 / den" }, { @@ -159287,7 +171800,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "penalty", @@ -159297,6 +171811,10 @@ "docstring": { "type": "{'l1', 'l2'}, default='l2'", "description": "Specifies the norm used in the penalization. The 'l2'\npenalty is the standard used in SVC. The 'l1' leads to ``coef_``\nvectors that are sparse." + }, + "refined_type": { + "kind": "EnumType", + "values": ["l2", "l1"] } }, { @@ -159307,6 +171825,10 @@ "docstring": { "type": "{'hinge', 'squared_hinge'}, default='squared_hinge'", "description": "Specifies the loss function. 'hinge' is the standard SVM loss\n(used e.g. by the SVC class) while 'squared_hinge' is the\nsquare of the hinge loss. The combination of ``penalty='l1'``\nand ``loss='hinge'`` is not supported." + }, + "refined_type": { + "kind": "EnumType", + "values": ["hinge", "squared_hinge"] } }, { @@ -159317,7 +171839,8 @@ "docstring": { "type": "bool, default=True", "description": "Select the algorithm to either solve the dual or primal\noptimization problem. Prefer dual=False when n_samples > n_features." - } + }, + "refined_type": {} }, { "name": "tol", @@ -159327,7 +171850,8 @@ "docstring": { "type": "float, default=1e-4", "description": "Tolerance for stopping criteria." - } + }, + "refined_type": {} }, { "name": "C", @@ -159337,7 +171861,8 @@ "docstring": { "type": "float, default=1.0", "description": "Regularization parameter. The strength of the regularization is\ninversely proportional to C. Must be strictly positive." - } + }, + "refined_type": {} }, { "name": "multi_class", @@ -159347,6 +171872,10 @@ "docstring": { "type": "{'ovr', 'crammer_singer'}, default='ovr'", "description": "Determines the multi-class strategy if `y` contains more than\ntwo classes.\n``\"ovr\"`` trains n_classes one-vs-rest classifiers, while\n``\"crammer_singer\"`` optimizes a joint objective over all classes.\nWhile `crammer_singer` is interesting from a theoretical perspective\nas it is consistent, it is seldom used in practice as it rarely leads\nto better accuracy and is more expensive to compute.\nIf ``\"crammer_singer\"`` is chosen, the options loss, penalty and dual\nwill be ignored." + }, + "refined_type": { + "kind": "EnumType", + "values": ["crammer_singer", "ovr"] } }, { @@ -159357,7 +171886,8 @@ "docstring": { "type": "bool, default=True", "description": "Whether to calculate the intercept for this model. If set\nto false, no intercept will be used in calculations\n(i.e. data is expected to be already centered)." 
- } + }, + "refined_type": {} }, { "name": "intercept_scaling", @@ -159367,7 +171897,8 @@ "docstring": { "type": "float, default=1", "description": "When self.fit_intercept is True, instance vector x becomes\n``[x, self.intercept_scaling]``,\ni.e. a \"synthetic\" feature with constant value equals to\nintercept_scaling is appended to the instance vector.\nThe intercept becomes intercept_scaling * synthetic feature weight\nNote! the synthetic feature weight is subject to l1/l2 regularization\nas all other features.\nTo lessen the effect of regularization on synthetic feature weight\n(and therefore on the intercept) intercept_scaling has to be increased." - } + }, + "refined_type": {} }, { "name": "class_weight", @@ -159377,7 +171908,8 @@ "docstring": { "type": "dict or 'balanced', default=None", "description": "Set the parameter C of class i to ``class_weight[i]*C`` for\nSVC. If not given, all classes are supposed to have\nweight one.\nThe \"balanced\" mode uses the values of y to automatically adjust\nweights inversely proportional to class frequencies in the input data\nas ``n_samples / (n_classes * np.bincount(y))``." - } + }, + "refined_type": {} }, { "name": "verbose", @@ -159387,7 +171919,8 @@ "docstring": { "type": "int, default=0", "description": "Enable verbose output. Note that this setting takes advantage of a\nper-process runtime setting in liblinear that, if enabled, may not work\nproperly in a multithreaded context." - } + }, + "refined_type": {} }, { "name": "random_state", @@ -159397,7 +171930,8 @@ "docstring": { "type": "int, RandomState instance or None, default=None", "description": "Controls the pseudo random number generation for shuffling the data for\nthe dual coordinate descent (if ``dual=True``). When ``dual=False`` the\nunderlying implementation of :class:`LinearSVC` is not random and\n``random_state`` has no effect on the results.\nPass an int for reproducible output across multiple function calls.\nSee :term:`Glossary `." - } + }, + "refined_type": {} }, { "name": "max_iter", @@ -159407,13 +171941,14 @@ "docstring": { "type": "int, default=1000", "description": "The maximum number of iterations to be run." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, penalty='l2', loss='squared_hinge', *, dual=True, tol=0.0001, C=1.0, multi_class='ovr', fit_intercept=True, intercept_scaling=1, class_weight=None, verbose=0, random_state=None, max_iter=1000):\n self.dual = dual\n self.tol = tol\n self.C = C\n self.multi_class = multi_class\n self.fit_intercept = fit_intercept\n self.intercept_scaling = intercept_scaling\n self.class_weight = class_weight\n self.verbose = verbose\n self.random_state = random_state\n self.max_iter = max_iter\n self.penalty = penalty\n self.loss = loss" }, { @@ -159431,13 +171966,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _more_tags(self):\n return {'_xfail_checks': {'check_sample_weights_invariance': 'zero sample_weight is not equivalent to removing samples'}}" }, { @@ -159455,7 +171991,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -159465,6 +172002,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "Training vector, where `n_samples` is the number of samples and\n`n_features` is the number of features." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -159475,7 +172016,8 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "Target vector relative to X." - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -159485,13 +172027,14 @@ "docstring": { "type": "array-like of shape (n_samples,), default=None", "description": "Array of weights that are assigned to individual\nsamples. If not provided,\nthen each sample is given unit weight.\n\n.. versionadded:: 0.18" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Fit the model according to the given training data.", - "docstring": "Fit the model according to the given training data.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training vector, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\ny : array-like of shape (n_samples,)\n Target vector relative to X.\n\nsample_weight : array-like of shape (n_samples,), default=None\n Array of weights that are assigned to individual\n samples. If not provided,\n then each sample is given unit weight.\n\n .. versionadded:: 0.18\n\nReturns\n-------\nself : object\n An instance of the estimator.", + "docstring": "Fit the model according to the given training data.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training vector, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\n y : array-like of shape (n_samples,)\n Target vector relative to X.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Array of weights that are assigned to individual\n samples. If not provided,\n then each sample is given unit weight.\n\n .. 
versionadded:: 0.18\n\n Returns\n -------\n self : object\n An instance of the estimator.\n ", "source_code": "\ndef fit(self, X, y, sample_weight=None):\n \"\"\"Fit the model according to the given training data.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training vector, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\n y : array-like of shape (n_samples,)\n Target vector relative to X.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Array of weights that are assigned to individual\n samples. If not provided,\n then each sample is given unit weight.\n\n .. versionadded:: 0.18\n\n Returns\n -------\n self : object\n An instance of the estimator.\n \"\"\"\n if self.C < 0:\n raise ValueError('Penalty term must be positive; got (C=%r)' % self.C)\n (X, y) = self._validate_data(X, y, accept_sparse='csr', dtype=np.float64, order='C', accept_large_sparse=False)\n check_classification_targets(y)\n self.classes_ = np.unique(y)\n (self.coef_, self.intercept_, self.n_iter_) = _fit_liblinear(X, y, self.C, self.fit_intercept, self.intercept_scaling, self.class_weight, self.penalty, self.dual, self.verbose, self.max_iter, self.tol, self.random_state, self.multi_class, self.loss, sample_weight=sample_weight)\n if self.multi_class == 'crammer_singer' and len(self.classes_) == 2:\n self.coef_ = (self.coef_[1] - self.coef_[0]).reshape(1, -1)\n if self.fit_intercept:\n intercept = self.intercept_[1] - self.intercept_[0]\n self.intercept_ = np.array([intercept])\n return self" }, { @@ -159509,7 +172052,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "epsilon", @@ -159519,7 +172063,8 @@ "docstring": { "type": "float, default=0.0", "description": "Epsilon parameter in the epsilon-insensitive loss function. Note\nthat the value of this parameter depends on the scale of the target\nvariable y. If unsure, set ``epsilon=0``." - } + }, + "refined_type": {} }, { "name": "tol", @@ -159529,7 +172074,8 @@ "docstring": { "type": "float, default=1e-4", "description": "Tolerance for stopping criteria." - } + }, + "refined_type": {} }, { "name": "C", @@ -159539,7 +172085,8 @@ "docstring": { "type": "float, default=1.0", "description": "Regularization parameter. The strength of the regularization is\ninversely proportional to C. Must be strictly positive." - } + }, + "refined_type": {} }, { "name": "loss", @@ -159549,6 +172096,13 @@ "docstring": { "type": "{'epsilon_insensitive', 'squared_epsilon_insensitive'}, default='epsilon_insensitive'", "description": "Specifies the loss function. The epsilon-insensitive loss\n(standard SVR) is the L1 loss, while the squared epsilon-insensitive\nloss ('squared_epsilon_insensitive') is the L2 loss." + }, + "refined_type": { + "kind": "EnumType", + "values": [ + "squared_epsilon_insensitive", + "epsilon_insensitive" + ] } }, { @@ -159559,7 +172113,8 @@ "docstring": { "type": "bool, default=True", "description": "Whether to calculate the intercept for this model. If set\nto false, no intercept will be used in calculations\n(i.e. data is expected to be already centered)." - } + }, + "refined_type": {} }, { "name": "intercept_scaling", @@ -159569,7 +172124,8 @@ "docstring": { "type": "float, default=1.0", "description": "When self.fit_intercept is True, instance vector x becomes\n[x, self.intercept_scaling],\ni.e. 
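Editor's note: the `LinearSVC.fit` record above delegates to `_fit_liblinear`, and the `dual` docstring recommends `dual=False` when `n_samples > n_features`. A short usage sketch (illustration only; `make_classification` is assumed, not part of this record):

from sklearn.datasets import make_classification
from sklearn.svm import LinearSVC

X, y = make_classification(n_samples=500, n_features=20, random_state=0)
# primal formulation, since n_samples > n_features here
clf = LinearSVC(penalty='l2', loss='squared_hinge', dual=False, C=1.0, max_iter=1000)
clf.fit(X, y)
print(clf.coef_.shape, clf.intercept_, clf.n_iter_)  # (1, 20) for a binary problem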
a \"synthetic\" feature with constant value equals to\nintercept_scaling is appended to the instance vector.\nThe intercept becomes intercept_scaling * synthetic feature weight\nNote! the synthetic feature weight is subject to l1/l2 regularization\nas all other features.\nTo lessen the effect of regularization on synthetic feature weight\n(and therefore on the intercept) intercept_scaling has to be increased." - } + }, + "refined_type": {} }, { "name": "dual", @@ -159579,7 +172135,8 @@ "docstring": { "type": "bool, default=True", "description": "Select the algorithm to either solve the dual or primal\noptimization problem. Prefer dual=False when n_samples > n_features." - } + }, + "refined_type": {} }, { "name": "verbose", @@ -159589,7 +172146,8 @@ "docstring": { "type": "int, default=0", "description": "Enable verbose output. Note that this setting takes advantage of a\nper-process runtime setting in liblinear that, if enabled, may not work\nproperly in a multithreaded context." - } + }, + "refined_type": {} }, { "name": "random_state", @@ -159599,7 +172157,8 @@ "docstring": { "type": "int, RandomState instance or None, default=None", "description": "Controls the pseudo random number generation for shuffling the data.\nPass an int for reproducible output across multiple function calls.\nSee :term:`Glossary `." - } + }, + "refined_type": {} }, { "name": "max_iter", @@ -159609,13 +172168,14 @@ "docstring": { "type": "int, default=1000", "description": "The maximum number of iterations to be run." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, *, epsilon=0.0, tol=0.0001, C=1.0, loss='epsilon_insensitive', fit_intercept=True, intercept_scaling=1.0, dual=True, verbose=0, random_state=None, max_iter=1000):\n self.tol = tol\n self.C = C\n self.epsilon = epsilon\n self.fit_intercept = fit_intercept\n self.intercept_scaling = intercept_scaling\n self.verbose = verbose\n self.random_state = random_state\n self.max_iter = max_iter\n self.dual = dual\n self.loss = loss" }, { @@ -159633,13 +172193,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _more_tags(self):\n return {'_xfail_checks': {'check_sample_weights_invariance': 'zero sample_weight is not equivalent to removing samples'}}" }, { @@ -159657,7 +172218,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -159667,6 +172229,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "Training vector, where `n_samples` is the number of samples and\n`n_features` is the number of features." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -159677,7 +172243,8 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "Target vector relative to X." - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -159687,13 +172254,14 @@ "docstring": { "type": "array-like of shape (n_samples,), default=None", "description": "Array of weights that are assigned to individual\nsamples. If not provided,\nthen each sample is given unit weight.\n\n.. 
versionadded:: 0.18" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Fit the model according to the given training data.", - "docstring": "Fit the model according to the given training data.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training vector, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\ny : array-like of shape (n_samples,)\n Target vector relative to X.\n\nsample_weight : array-like of shape (n_samples,), default=None\n Array of weights that are assigned to individual\n samples. If not provided,\n then each sample is given unit weight.\n\n .. versionadded:: 0.18\n\nReturns\n-------\nself : object\n An instance of the estimator.", + "docstring": "Fit the model according to the given training data.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training vector, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\n y : array-like of shape (n_samples,)\n Target vector relative to X.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Array of weights that are assigned to individual\n samples. If not provided,\n then each sample is given unit weight.\n\n .. versionadded:: 0.18\n\n Returns\n -------\n self : object\n An instance of the estimator.\n ", "source_code": "\ndef fit(self, X, y, sample_weight=None):\n \"\"\"Fit the model according to the given training data.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Training vector, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\n y : array-like of shape (n_samples,)\n Target vector relative to X.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Array of weights that are assigned to individual\n samples. If not provided,\n then each sample is given unit weight.\n\n .. versionadded:: 0.18\n\n Returns\n -------\n self : object\n An instance of the estimator.\n \"\"\"\n if self.C < 0:\n raise ValueError('Penalty term must be positive; got (C=%r)' % self.C)\n (X, y) = self._validate_data(X, y, accept_sparse='csr', dtype=np.float64, order='C', accept_large_sparse=False)\n penalty = 'l2'\n (self.coef_, self.intercept_, self.n_iter_) = _fit_liblinear(X, y, self.C, self.fit_intercept, self.intercept_scaling, None, penalty, self.dual, self.verbose, self.max_iter, self.tol, self.random_state, loss=self.loss, epsilon=self.epsilon, sample_weight=sample_weight)\n self.coef_ = self.coef_.ravel()\n return self" }, { @@ -159711,7 +172279,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "nu", @@ -159721,6 +172290,14 @@ "docstring": { "type": "float, default=0.5", "description": "An upper bound on the fraction of margin errors (see :ref:`User Guide\n`) and a lower bound of the fraction of support vectors.\nShould be in the interval (0, 1]." + }, + "refined_type": { + "kind": "BoundaryType", + "base_type": "float", + "min": 0.0, + "max": 1.0, + "min_inclusive": false, + "max_inclusive": true } }, { @@ -159729,8 +172306,18 @@ "is_public": true, "assigned_by": "NAME_ONLY", "docstring": { - "type": "{'linear', 'poly', 'rbf', 'sigmoid', 'precomputed'}, default='rbf'", - "description": "Specifies the kernel type to be used in the algorithm.\nIt must be one of 'linear', 'poly', 'rbf', 'sigmoid', 'precomputed' or\na callable.\nIf none is given, 'rbf' will be used. 
If a callable is given it is\nused to precompute the kernel matrix." + "type": "{'linear', 'poly', 'rbf', 'sigmoid', 'precomputed'} or callable, default='rbf'", + "description": "Specifies the kernel type to be used in the algorithm.\nIf none is given, 'rbf' will be used. If a callable is given it is\nused to precompute the kernel matrix." + }, + "refined_type": { + "kind": "EnumType", + "values": [ + "poly", + "rbf", + "sigmoid", + "linear", + "precomputed" + ] } }, { @@ -159741,7 +172328,8 @@ "docstring": { "type": "int, default=3", "description": "Degree of the polynomial kernel function ('poly').\nIgnored by all other kernels." - } + }, + "refined_type": {} }, { "name": "gamma", @@ -159751,6 +172339,10 @@ "docstring": { "type": "{'scale', 'auto'} or float, default='scale'", "description": "Kernel coefficient for 'rbf', 'poly' and 'sigmoid'.\n\n- if ``gamma='scale'`` (default) is passed then it uses\n 1 / (n_features * X.var()) as value of gamma,\n- if 'auto', uses 1 / n_features.\n\n.. versionchanged:: 0.22\n The default value of ``gamma`` changed from 'auto' to 'scale'." + }, + "refined_type": { + "kind": "EnumType", + "values": ["auto", "scale"] } }, { @@ -159761,7 +172353,8 @@ "docstring": { "type": "float, default=0.0", "description": "Independent term in kernel function.\nIt is only significant in 'poly' and 'sigmoid'." - } + }, + "refined_type": {} }, { "name": "shrinking", @@ -159771,7 +172364,8 @@ "docstring": { "type": "bool, default=True", "description": "Whether to use the shrinking heuristic.\nSee the :ref:`User Guide `." - } + }, + "refined_type": {} }, { "name": "probability", @@ -159781,7 +172375,8 @@ "docstring": { "type": "bool, default=False", "description": "Whether to enable probability estimates. This must be enabled prior\nto calling `fit`, will slow down that method as it internally uses\n5-fold cross-validation, and `predict_proba` may be inconsistent with\n`predict`. Read more in the :ref:`User Guide `." - } + }, + "refined_type": {} }, { "name": "tol", @@ -159791,7 +172386,8 @@ "docstring": { "type": "float, default=1e-3", "description": "Tolerance for stopping criterion." - } + }, + "refined_type": {} }, { "name": "cache_size", @@ -159801,7 +172397,8 @@ "docstring": { "type": "float, default=200", "description": "Specify the size of the kernel cache (in MB)." - } + }, + "refined_type": {} }, { "name": "class_weight", @@ -159811,6 +172408,10 @@ "docstring": { "type": "{dict, 'balanced'}, default=None", "description": "Set the parameter C of class i to class_weight[i]*C for\nSVC. If not given, all classes are supposed to have\nweight one. The \"balanced\" mode uses the values of y to automatically\nadjust weights inversely proportional to class frequencies as\n``n_samples / (n_classes * np.bincount(y))``." + }, + "refined_type": { + "kind": "EnumType", + "values": ["balanced"] } }, { @@ -159821,7 +172422,8 @@ "docstring": { "type": "bool, default=False", "description": "Enable verbose output. Note that this setting takes advantage of a\nper-process runtime setting in libsvm that, if enabled, may not work\nproperly in a multithreaded context." - } + }, + "refined_type": {} }, { "name": "max_iter", @@ -159831,7 +172433,8 @@ "docstring": { "type": "int, default=-1", "description": "Hard limit on iterations within solver, or -1 for no limit." 
- } + }, + "refined_type": {} }, { "name": "decision_function_shape", @@ -159841,6 +172444,10 @@ "docstring": { "type": "{'ovo', 'ovr'}, default='ovr'", "description": "Whether to return a one-vs-rest ('ovr') decision function of shape\n(n_samples, n_classes) as all other classifiers, or the original\none-vs-one ('ovo') decision function of libsvm which has shape\n(n_samples, n_classes * (n_classes - 1) / 2). However, one-vs-one\n('ovo') is always used as multi-class strategy. The parameter is\nignored for binary classification.\n\n.. versionchanged:: 0.19\n decision_function_shape is 'ovr' by default.\n\n.. versionadded:: 0.17\n *decision_function_shape='ovr'* is recommended.\n\n.. versionchanged:: 0.17\n Deprecated *decision_function_shape='ovo' and None*." + }, + "refined_type": { + "kind": "EnumType", + "values": ["ovo", "ovr"] } }, { @@ -159851,7 +172458,8 @@ "docstring": { "type": "bool, default=False", "description": "If true, ``decision_function_shape='ovr'``, and number of classes > 2,\n:term:`predict` will break ties according to the confidence values of\n:term:`decision_function`; otherwise the first class among the tied\nclasses is returned. Please note that breaking ties comes at a\nrelatively high computational cost compared to a simple predict.\n\n.. versionadded:: 0.22" - } + }, + "refined_type": {} }, { "name": "random_state", @@ -159861,13 +172469,14 @@ "docstring": { "type": "int, RandomState instance or None, default=None", "description": "Controls the pseudo random number generation for shuffling the data for\nprobability estimates. Ignored when `probability` is False.\nPass an int for reproducible output across multiple function calls.\nSee :term:`Glossary `." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, *, nu=0.5, kernel='rbf', degree=3, gamma='scale', coef0=0.0, shrinking=True, probability=False, tol=0.001, cache_size=200, class_weight=None, verbose=False, max_iter=-1, decision_function_shape='ovr', break_ties=False, random_state=None):\n super().__init__(kernel=kernel, degree=degree, gamma=gamma, coef0=coef0, tol=tol, C=0.0, nu=nu, shrinking=shrinking, probability=probability, cache_size=cache_size, class_weight=class_weight, verbose=verbose, max_iter=max_iter, decision_function_shape=decision_function_shape, break_ties=break_ties, random_state=random_state)" }, { @@ -159885,13 +172494,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _more_tags(self):\n return {'_xfail_checks': {'check_methods_subset_invariance': 'fails for the decision_function method', 'check_class_weight_classifiers': 'class_weight is ignored.', 'check_sample_weights_invariance': 'zero sample_weight is not equivalent to removing samples'}}" }, { @@ -159909,7 +172519,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "nu", @@ -159919,6 +172530,14 @@ "docstring": { "type": "float, default=0.5", "description": "An upper bound on the fraction of training errors and a lower bound of\nthe fraction of support vectors. Should be in the interval (0, 1]. By\ndefault 0.5 will be taken." 
+ }, + "refined_type": { + "kind": "BoundaryType", + "base_type": "float", + "min": 0.0, + "max": 1.0, + "min_inclusive": false, + "max_inclusive": true } }, { @@ -159929,7 +172548,8 @@ "docstring": { "type": "float, default=1.0", "description": "Penalty parameter C of the error term." - } + }, + "refined_type": {} }, { "name": "kernel", @@ -159937,8 +172557,18 @@ "is_public": true, "assigned_by": "NAME_ONLY", "docstring": { - "type": "{'linear', 'poly', 'rbf', 'sigmoid', 'precomputed'}, default='rbf'", - "description": "Specifies the kernel type to be used in the algorithm.\nIt must be one of 'linear', 'poly', 'rbf', 'sigmoid', 'precomputed' or\na callable.\nIf none is given, 'rbf' will be used. If a callable is given it is\nused to precompute the kernel matrix." + "type": "{'linear', 'poly', 'rbf', 'sigmoid', 'precomputed'} or callable, default='rbf'", + "description": "Specifies the kernel type to be used in the algorithm.\nIf none is given, 'rbf' will be used. If a callable is given it is\nused to precompute the kernel matrix." + }, + "refined_type": { + "kind": "EnumType", + "values": [ + "poly", + "rbf", + "sigmoid", + "linear", + "precomputed" + ] } }, { @@ -159949,7 +172579,8 @@ "docstring": { "type": "int, default=3", "description": "Degree of the polynomial kernel function ('poly').\nIgnored by all other kernels." - } + }, + "refined_type": {} }, { "name": "gamma", @@ -159959,6 +172590,10 @@ "docstring": { "type": "{'scale', 'auto'} or float, default='scale'", "description": "Kernel coefficient for 'rbf', 'poly' and 'sigmoid'.\n\n- if ``gamma='scale'`` (default) is passed then it uses\n 1 / (n_features * X.var()) as value of gamma,\n- if 'auto', uses 1 / n_features.\n\n.. versionchanged:: 0.22\n The default value of ``gamma`` changed from 'auto' to 'scale'." + }, + "refined_type": { + "kind": "EnumType", + "values": ["auto", "scale"] } }, { @@ -159969,7 +172604,8 @@ "docstring": { "type": "float, default=0.0", "description": "Independent term in kernel function.\nIt is only significant in 'poly' and 'sigmoid'." - } + }, + "refined_type": {} }, { "name": "shrinking", @@ -159979,7 +172615,8 @@ "docstring": { "type": "bool, default=True", "description": "Whether to use the shrinking heuristic.\nSee the :ref:`User Guide `." - } + }, + "refined_type": {} }, { "name": "tol", @@ -159989,7 +172626,8 @@ "docstring": { "type": "float, default=1e-3", "description": "Tolerance for stopping criterion." - } + }, + "refined_type": {} }, { "name": "cache_size", @@ -159999,7 +172637,8 @@ "docstring": { "type": "float, default=200", "description": "Specify the size of the kernel cache (in MB)." - } + }, + "refined_type": {} }, { "name": "verbose", @@ -160009,7 +172648,8 @@ "docstring": { "type": "bool, default=False", "description": "Enable verbose output. Note that this setting takes advantage of a\nper-process runtime setting in libsvm that, if enabled, may not work\nproperly in a multithreaded context." - } + }, + "refined_type": {} }, { "name": "max_iter", @@ -160019,13 +172659,14 @@ "docstring": { "type": "int, default=-1", "description": "Hard limit on iterations within solver, or -1 for no limit." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, *, nu=0.5, C=1.0, kernel='rbf', degree=3, gamma='scale', coef0=0.0, shrinking=True, tol=0.001, cache_size=200, verbose=False, max_iter=-1):\n super().__init__(kernel=kernel, degree=degree, gamma=gamma, coef0=coef0, tol=tol, C=C, nu=nu, epsilon=0.0, shrinking=shrinking, probability=False, cache_size=cache_size, class_weight=None, verbose=verbose, max_iter=max_iter, random_state=None)" }, { @@ -160043,13 +172684,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _more_tags(self):\n return {'_xfail_checks': {'check_sample_weights_invariance': 'zero sample_weight is not equivalent to removing samples'}}" }, { @@ -160067,7 +172709,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "kernel", @@ -160075,8 +172718,18 @@ "is_public": true, "assigned_by": "NAME_ONLY", "docstring": { - "type": "{'linear', 'poly', 'rbf', 'sigmoid', 'precomputed'}, default='rbf'", - "description": "Specifies the kernel type to be used in the algorithm.\nIt must be one of 'linear', 'poly', 'rbf', 'sigmoid', 'precomputed' or\na callable.\nIf none is given, 'rbf' will be used. If a callable is given it is\nused to precompute the kernel matrix." + "type": "{'linear', 'poly', 'rbf', 'sigmoid', 'precomputed'} or callable, default='rbf'", + "description": "Specifies the kernel type to be used in the algorithm.\nIf none is given, 'rbf' will be used. If a callable is given it is\nused to precompute the kernel matrix." + }, + "refined_type": { + "kind": "EnumType", + "values": [ + "poly", + "rbf", + "sigmoid", + "linear", + "precomputed" + ] } }, { @@ -160087,7 +172740,8 @@ "docstring": { "type": "int, default=3", "description": "Degree of the polynomial kernel function ('poly').\nIgnored by all other kernels." - } + }, + "refined_type": {} }, { "name": "gamma", @@ -160097,6 +172751,10 @@ "docstring": { "type": "{'scale', 'auto'} or float, default='scale'", "description": "Kernel coefficient for 'rbf', 'poly' and 'sigmoid'.\n\n- if ``gamma='scale'`` (default) is passed then it uses\n 1 / (n_features * X.var()) as value of gamma,\n- if 'auto', uses 1 / n_features.\n\n.. versionchanged:: 0.22\n The default value of ``gamma`` changed from 'auto' to 'scale'." + }, + "refined_type": { + "kind": "EnumType", + "values": ["auto", "scale"] } }, { @@ -160107,7 +172765,8 @@ "docstring": { "type": "float, default=0.0", "description": "Independent term in kernel function.\nIt is only significant in 'poly' and 'sigmoid'." - } + }, + "refined_type": {} }, { "name": "tol", @@ -160117,7 +172776,8 @@ "docstring": { "type": "float, default=1e-3", "description": "Tolerance for stopping criterion." - } + }, + "refined_type": {} }, { "name": "nu", @@ -160127,6 +172787,14 @@ "docstring": { "type": "float, default=0.5", "description": "An upper bound on the fraction of training\nerrors and a lower bound of the fraction of support\nvectors. Should be in the interval (0, 1]. By default 0.5\nwill be taken." 
+ }, + "refined_type": { + "kind": "BoundaryType", + "base_type": "float", + "min": 0.0, + "max": 1.0, + "min_inclusive": false, + "max_inclusive": true } }, { @@ -160137,7 +172805,8 @@ "docstring": { "type": "bool, default=True", "description": "Whether to use the shrinking heuristic.\nSee the :ref:`User Guide `." - } + }, + "refined_type": {} }, { "name": "cache_size", @@ -160147,7 +172816,8 @@ "docstring": { "type": "float, default=200", "description": "Specify the size of the kernel cache (in MB)." - } + }, + "refined_type": {} }, { "name": "verbose", @@ -160157,7 +172827,8 @@ "docstring": { "type": "bool, default=False", "description": "Enable verbose output. Note that this setting takes advantage of a\nper-process runtime setting in libsvm that, if enabled, may not work\nproperly in a multithreaded context." - } + }, + "refined_type": {} }, { "name": "max_iter", @@ -160167,13 +172838,14 @@ "docstring": { "type": "int, default=-1", "description": "Hard limit on iterations within solver, or -1 for no limit." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, *, kernel='rbf', degree=3, gamma='scale', coef0=0.0, tol=0.001, nu=0.5, shrinking=True, cache_size=200, verbose=False, max_iter=-1):\n super().__init__(kernel, degree, gamma, coef0, tol, 0.0, nu, 0.0, shrinking, False, cache_size, None, verbose, max_iter, random_state=None)" }, { @@ -160191,13 +172863,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _more_tags(self):\n return {'_xfail_checks': {'check_sample_weights_invariance': 'zero sample_weight is not equivalent to removing samples'}}" }, { @@ -160215,7 +172888,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -160225,13 +172899,14 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "The data matrix." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Signed distance to the separating hyperplane.\n\nSigned distance is positive for an inlier and negative for an outlier.", - "docstring": "Signed distance to the separating hyperplane.\n\nSigned distance is positive for an inlier and negative for an outlier.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n The data matrix.\n\nReturns\n-------\ndec : ndarray of shape (n_samples,)\n Returns the decision function of the samples.", + "docstring": "Signed distance to the separating hyperplane.\n\n Signed distance is positive for an inlier and negative for an outlier.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The data matrix.\n\n Returns\n -------\n dec : ndarray of shape (n_samples,)\n Returns the decision function of the samples.\n ", "source_code": "\ndef decision_function(self, X):\n \"\"\"Signed distance to the separating hyperplane.\n\n Signed distance is positive for an inlier and negative for an outlier.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The data matrix.\n\n Returns\n -------\n dec : ndarray of shape (n_samples,)\n Returns the decision function of the samples.\n \"\"\"\n dec = self._decision_function(X).ravel()\n return dec" }, { @@ -160249,7 +172924,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -160259,6 +172935,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "Set of samples, where `n_samples` is the number of samples and\n`n_features` is the number of features." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -160269,7 +172949,8 @@ "docstring": { "type": "Ignored", "description": "Not used, present for API consistency by convention." - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -160279,13 +172960,14 @@ "docstring": { "type": "array-like of shape (n_samples,), default=None", "description": "Per-sample weights. Rescale C per sample. Higher weights\nforce the classifier to put more emphasis on these points." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Detect the soft boundary of the set of samples X.", - "docstring": "Detect the soft boundary of the set of samples X.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n Set of samples, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\ny : Ignored\n Not used, present for API consistency by convention.\n\nsample_weight : array-like of shape (n_samples,), default=None\n Per-sample weights. Rescale C per sample. Higher weights\n force the classifier to put more emphasis on these points.\n\n**params : dict\n Additional fit parameters.\n\n .. deprecated:: 1.0\n The `fit` method will not longer accept extra keyword\n parameters in 1.2. 
These keyword parameters were\n already discarded.\n\nReturns\n-------\nself : object\n Fitted estimator.\n\nNotes\n-----\nIf X is not a C-ordered contiguous array it is copied.", + "docstring": "Detect the soft boundary of the set of samples X.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Set of samples, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Per-sample weights. Rescale C per sample. Higher weights\n force the classifier to put more emphasis on these points.\n\n **params : dict\n Additional fit parameters.\n\n .. deprecated:: 1.0\n The `fit` method will not longer accept extra keyword\n parameters in 1.2. These keyword parameters were\n already discarded.\n\n Returns\n -------\n self : object\n Fitted estimator.\n\n Notes\n -----\n If X is not a C-ordered contiguous array it is copied.\n ", "source_code": "\ndef fit(self, X, y=None, sample_weight=None, **params):\n \"\"\"Detect the soft boundary of the set of samples X.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n Set of samples, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\n y : Ignored\n Not used, present for API consistency by convention.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Per-sample weights. Rescale C per sample. Higher weights\n force the classifier to put more emphasis on these points.\n\n **params : dict\n Additional fit parameters.\n\n .. deprecated:: 1.0\n The `fit` method will not longer accept extra keyword\n parameters in 1.2. These keyword parameters were\n already discarded.\n\n Returns\n -------\n self : object\n Fitted estimator.\n\n Notes\n -----\n If X is not a C-ordered contiguous array it is copied.\n \"\"\"\n if len(params) > 0:\n warnings.warn(f'Passing additional keyword parameters has no effect and is deprecated in 1.0. An error will be raised from 1.2 and beyond. The ignored keyword parameter(s) are: {params.keys()}.', FutureWarning)\n super().fit(X, np.ones(_num_samples(X)), sample_weight=sample_weight)\n self.offset_ = -self._intercept_\n return self" }, { @@ -160303,7 +172985,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -160313,13 +172996,17 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features) or (n_samples_test, n_samples_train)", "description": "For kernel=\"precomputed\", the expected shape of X is\n(n_samples_test, n_samples_train)." 
+ }, + "refined_type": { + "kind": "EnumType", + "values": [] } } ], "results": [], "is_public": true, "description": "Perform classification on samples in X.\n\nFor a one-class model, +1 or -1 is returned.", - "docstring": "Perform classification on samples in X.\n\nFor a one-class model, +1 or -1 is returned.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features) or (n_samples_test, n_samples_train)\n For kernel=\"precomputed\", the expected shape of X is\n (n_samples_test, n_samples_train).\n\nReturns\n-------\ny_pred : ndarray of shape (n_samples,)\n Class labels for samples in X.", + "docstring": "Perform classification on samples in X.\n\n For a one-class model, +1 or -1 is returned.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features) or (n_samples_test, n_samples_train)\n For kernel=\"precomputed\", the expected shape of X is\n (n_samples_test, n_samples_train).\n\n Returns\n -------\n y_pred : ndarray of shape (n_samples,)\n Class labels for samples in X.\n ", "source_code": "\ndef predict(self, X):\n \"\"\"Perform classification on samples in X.\n\n For a one-class model, +1 or -1 is returned.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features) or (n_samples_test, n_samples_train)\n For kernel=\"precomputed\", the expected shape of X is\n (n_samples_test, n_samples_train).\n\n Returns\n -------\n y_pred : ndarray of shape (n_samples,)\n Class labels for samples in X.\n \"\"\"\n y = super().predict(X)\n return np.asarray(y, dtype=np.intp)" }, { @@ -160337,7 +173024,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -160347,13 +173035,14 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "The data matrix." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Raw scoring function of the samples.", - "docstring": "Raw scoring function of the samples.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n The data matrix.\n\nReturns\n-------\nscore_samples : ndarray of shape (n_samples,)\n Returns the (unshifted) scoring function of the samples.", + "docstring": "Raw scoring function of the samples.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The data matrix.\n\n Returns\n -------\n score_samples : ndarray of shape (n_samples,)\n Returns the (unshifted) scoring function of the samples.\n ", "source_code": "\ndef score_samples(self, X):\n \"\"\"Raw scoring function of the samples.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The data matrix.\n\n Returns\n -------\n score_samples : ndarray of shape (n_samples,)\n Returns the (unshifted) scoring function of the samples.\n \"\"\"\n return self.decision_function(X) + self.offset_" }, { @@ -160371,7 +173060,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "C", @@ -160381,7 +173071,8 @@ "docstring": { "type": "float, default=1.0", "description": "Regularization parameter. The strength of the regularization is\ninversely proportional to C. Must be strictly positive. The penalty\nis a squared l2 penalty." 
- } + }, + "refined_type": {} }, { "name": "kernel", @@ -160389,8 +173080,18 @@ "is_public": true, "assigned_by": "NAME_ONLY", "docstring": { - "type": "{'linear', 'poly', 'rbf', 'sigmoid', 'precomputed'}, default='rbf'", - "description": "Specifies the kernel type to be used in the algorithm.\nIt must be one of 'linear', 'poly', 'rbf', 'sigmoid', 'precomputed' or\na callable.\nIf none is given, 'rbf' will be used. If a callable is given it is\nused to pre-compute the kernel matrix from data matrices; that matrix\nshould be an array of shape ``(n_samples, n_samples)``." + "type": "{'linear', 'poly', 'rbf', 'sigmoid', 'precomputed'} or callable, default='rbf'", + "description": "Specifies the kernel type to be used in the algorithm.\nIf none is given, 'rbf' will be used. If a callable is given it is\nused to pre-compute the kernel matrix from data matrices; that matrix\nshould be an array of shape ``(n_samples, n_samples)``." + }, + "refined_type": { + "kind": "EnumType", + "values": [ + "poly", + "rbf", + "sigmoid", + "linear", + "precomputed" + ] } }, { @@ -160401,7 +173102,8 @@ "docstring": { "type": "int, default=3", "description": "Degree of the polynomial kernel function ('poly').\nIgnored by all other kernels." - } + }, + "refined_type": {} }, { "name": "gamma", @@ -160411,6 +173113,10 @@ "docstring": { "type": "{'scale', 'auto'} or float, default='scale'", "description": "Kernel coefficient for 'rbf', 'poly' and 'sigmoid'.\n\n- if ``gamma='scale'`` (default) is passed then it uses\n 1 / (n_features * X.var()) as value of gamma,\n- if 'auto', uses 1 / n_features.\n\n.. versionchanged:: 0.22\n The default value of ``gamma`` changed from 'auto' to 'scale'." + }, + "refined_type": { + "kind": "EnumType", + "values": ["auto", "scale"] } }, { @@ -160421,7 +173127,8 @@ "docstring": { "type": "float, default=0.0", "description": "Independent term in kernel function.\nIt is only significant in 'poly' and 'sigmoid'." - } + }, + "refined_type": {} }, { "name": "shrinking", @@ -160431,7 +173138,8 @@ "docstring": { "type": "bool, default=True", "description": "Whether to use the shrinking heuristic.\nSee the :ref:`User Guide `." - } + }, + "refined_type": {} }, { "name": "probability", @@ -160441,7 +173149,8 @@ "docstring": { "type": "bool, default=False", "description": "Whether to enable probability estimates. This must be enabled prior\nto calling `fit`, will slow down that method as it internally uses\n5-fold cross-validation, and `predict_proba` may be inconsistent with\n`predict`. Read more in the :ref:`User Guide `." - } + }, + "refined_type": {} }, { "name": "tol", @@ -160451,7 +173160,8 @@ "docstring": { "type": "float, default=1e-3", "description": "Tolerance for stopping criterion." - } + }, + "refined_type": {} }, { "name": "cache_size", @@ -160461,7 +173171,8 @@ "docstring": { "type": "float, default=200", "description": "Specify the size of the kernel cache (in MB)." - } + }, + "refined_type": {} }, { "name": "class_weight", @@ -160471,7 +173182,8 @@ "docstring": { "type": "dict or 'balanced', default=None", "description": "Set the parameter C of class i to class_weight[i]*C for\nSVC. If not given, all classes are supposed to have\nweight one.\nThe \"balanced\" mode uses the values of y to automatically adjust\nweights inversely proportional to class frequencies in the input data\nas ``n_samples / (n_classes * np.bincount(y))``." 
- } + }, + "refined_type": {} }, { "name": "verbose", @@ -160481,7 +173193,8 @@ "docstring": { "type": "bool, default=False", "description": "Enable verbose output. Note that this setting takes advantage of a\nper-process runtime setting in libsvm that, if enabled, may not work\nproperly in a multithreaded context." - } + }, + "refined_type": {} }, { "name": "max_iter", @@ -160491,7 +173204,8 @@ "docstring": { "type": "int, default=-1", "description": "Hard limit on iterations within solver, or -1 for no limit." - } + }, + "refined_type": {} }, { "name": "decision_function_shape", @@ -160501,6 +173215,10 @@ "docstring": { "type": "{'ovo', 'ovr'}, default='ovr'", "description": "Whether to return a one-vs-rest ('ovr') decision function of shape\n(n_samples, n_classes) as all other classifiers, or the original\none-vs-one ('ovo') decision function of libsvm which has shape\n(n_samples, n_classes * (n_classes - 1) / 2). However, one-vs-one\n('ovo') is always used as multi-class strategy. The parameter is\nignored for binary classification.\n\n.. versionchanged:: 0.19\n decision_function_shape is 'ovr' by default.\n\n.. versionadded:: 0.17\n *decision_function_shape='ovr'* is recommended.\n\n.. versionchanged:: 0.17\n Deprecated *decision_function_shape='ovo' and None*." + }, + "refined_type": { + "kind": "EnumType", + "values": ["ovo", "ovr"] } }, { @@ -160511,7 +173229,8 @@ "docstring": { "type": "bool, default=False", "description": "If true, ``decision_function_shape='ovr'``, and number of classes > 2,\n:term:`predict` will break ties according to the confidence values of\n:term:`decision_function`; otherwise the first class among the tied\nclasses is returned. Please note that breaking ties comes at a\nrelatively high computational cost compared to a simple predict.\n\n.. versionadded:: 0.22" - } + }, + "refined_type": {} }, { "name": "random_state", @@ -160521,13 +173240,14 @@ "docstring": { "type": "int, RandomState instance or None, default=None", "description": "Controls the pseudo random number generation for shuffling the data for\nprobability estimates. Ignored when `probability` is False.\nPass an int for reproducible output across multiple function calls.\nSee :term:`Glossary `." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, *, C=1.0, kernel='rbf', degree=3, gamma='scale', coef0=0.0, shrinking=True, probability=False, tol=0.001, cache_size=200, class_weight=None, verbose=False, max_iter=-1, decision_function_shape='ovr', break_ties=False, random_state=None):\n super().__init__(kernel=kernel, degree=degree, gamma=gamma, coef0=coef0, tol=tol, C=C, nu=0.0, shrinking=shrinking, probability=probability, cache_size=cache_size, class_weight=class_weight, verbose=verbose, max_iter=max_iter, decision_function_shape=decision_function_shape, break_ties=break_ties, random_state=random_state)" }, { @@ -160545,13 +173265,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _more_tags(self):\n return {'_xfail_checks': {'check_sample_weights_invariance': 'zero sample_weight is not equivalent to removing samples'}}" }, { @@ -160569,7 +173290,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "kernel", @@ -160577,8 +173299,18 @@ "is_public": true, "assigned_by": "NAME_ONLY", "docstring": { - "type": "{'linear', 'poly', 'rbf', 'sigmoid', 'precomputed'}, default='rbf'", - "description": "Specifies the kernel type to be used in the algorithm.\nIt must be one of 'linear', 'poly', 'rbf', 'sigmoid', 'precomputed' or\na callable.\nIf none is given, 'rbf' will be used. If a callable is given it is\nused to precompute the kernel matrix." + "type": "{'linear', 'poly', 'rbf', 'sigmoid', 'precomputed'} or callable, default='rbf'", + "description": "Specifies the kernel type to be used in the algorithm.\nIf none is given, 'rbf' will be used. If a callable is given it is\nused to precompute the kernel matrix." + }, + "refined_type": { + "kind": "EnumType", + "values": [ + "poly", + "rbf", + "sigmoid", + "linear", + "precomputed" + ] } }, { @@ -160589,7 +173321,8 @@ "docstring": { "type": "int, default=3", "description": "Degree of the polynomial kernel function ('poly').\nIgnored by all other kernels." - } + }, + "refined_type": {} }, { "name": "gamma", @@ -160599,6 +173332,10 @@ "docstring": { "type": "{'scale', 'auto'} or float, default='scale'", "description": "Kernel coefficient for 'rbf', 'poly' and 'sigmoid'.\n\n- if ``gamma='scale'`` (default) is passed then it uses\n 1 / (n_features * X.var()) as value of gamma,\n- if 'auto', uses 1 / n_features.\n\n.. versionchanged:: 0.22\n The default value of ``gamma`` changed from 'auto' to 'scale'." + }, + "refined_type": { + "kind": "EnumType", + "values": ["auto", "scale"] } }, { @@ -160609,7 +173346,8 @@ "docstring": { "type": "float, default=0.0", "description": "Independent term in kernel function.\nIt is only significant in 'poly' and 'sigmoid'." - } + }, + "refined_type": {} }, { "name": "tol", @@ -160619,7 +173357,8 @@ "docstring": { "type": "float, default=1e-3", "description": "Tolerance for stopping criterion." - } + }, + "refined_type": {} }, { "name": "C", @@ -160629,7 +173368,8 @@ "docstring": { "type": "float, default=1.0", "description": "Regularization parameter. The strength of the regularization is\ninversely proportional to C. Must be strictly positive.\nThe penalty is a squared l2 penalty." 
- } + }, + "refined_type": {} }, { "name": "epsilon", @@ -160639,7 +173379,8 @@ "docstring": { "type": "float, default=0.1", "description": "Epsilon in the epsilon-SVR model. It specifies the epsilon-tube\nwithin which no penalty is associated in the training loss function\nwith points predicted within a distance epsilon from the actual\nvalue." - } + }, + "refined_type": {} }, { "name": "shrinking", @@ -160649,7 +173390,8 @@ "docstring": { "type": "bool, default=True", "description": "Whether to use the shrinking heuristic.\nSee the :ref:`User Guide `." - } + }, + "refined_type": {} }, { "name": "cache_size", @@ -160659,7 +173401,8 @@ "docstring": { "type": "float, default=200", "description": "Specify the size of the kernel cache (in MB)." - } + }, + "refined_type": {} }, { "name": "verbose", @@ -160669,7 +173412,8 @@ "docstring": { "type": "bool, default=False", "description": "Enable verbose output. Note that this setting takes advantage of a\nper-process runtime setting in libsvm that, if enabled, may not work\nproperly in a multithreaded context." - } + }, + "refined_type": {} }, { "name": "max_iter", @@ -160679,13 +173423,14 @@ "docstring": { "type": "int, default=-1", "description": "Hard limit on iterations within solver, or -1 for no limit." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, *, kernel='rbf', degree=3, gamma='scale', coef0=0.0, tol=0.001, C=1.0, epsilon=0.1, shrinking=True, cache_size=200, verbose=False, max_iter=-1):\n super().__init__(kernel=kernel, degree=degree, gamma=gamma, coef0=coef0, tol=tol, C=C, nu=0.0, epsilon=epsilon, verbose=verbose, shrinking=shrinking, probability=False, cache_size=cache_size, class_weight=None, max_iter=max_iter, random_state=None)" }, { @@ -160703,13 +173448,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _more_tags(self):\n return {'_xfail_checks': {'check_sample_weights_invariance': 'zero sample_weight is not equivalent to removing samples'}}" }, { @@ -160727,7 +173473,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "top_path", @@ -160737,13 +173484,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef configuration(parent_package='', top_path=None):\n from numpy.distutils.misc_util import Configuration\n config = Configuration('svm', parent_package, top_path)\n config.add_subpackage('tests')\n config.add_extension('_newrand', sources=['_newrand.pyx'], include_dirs=[numpy.get_include(), join('src', 'newrand')], depends=[join('src', 'newrand', 'newrand.h')], language='c++', extra_compile_args=['-std=c++11'])\n config.add_library('libsvm-skl', sources=[join('src', 'libsvm', 'libsvm_template.cpp')], depends=[join('src', 'libsvm', 'svm.cpp'), join('src', 'libsvm', 'svm.h'), join('src', 'newrand', 'newrand.h')], extra_link_args=['-lstdc++'], extra_compiler_args=['-std=c++11'])\n libsvm_sources = ['_libsvm.pyx']\n libsvm_depends = [join('src', 'libsvm', 'libsvm_helper.c'), join('src', 'libsvm', 'libsvm_template.cpp'), join('src', 'libsvm', 'svm.cpp'), join('src', 'libsvm', 'svm.h'), join('src', 'newrand', 'newrand.h')]\n config.add_extension('_libsvm', sources=libsvm_sources, 
include_dirs=[numpy.get_include(), join('src', 'libsvm'), join('src', 'newrand')], libraries=['libsvm-skl'], depends=libsvm_depends)\n libraries = []\n if os.name == 'posix':\n libraries.append('m')\n config.add_library('liblinear-skl', sources=[join('src', 'liblinear', 'linear.cpp'), join('src', 'liblinear', 'tron.cpp')], depends=[join('src', 'liblinear', 'linear.h'), join('src', 'liblinear', 'tron.h'), join('src', 'newrand', 'newrand.h')], extra_link_args=['-lstdc++'], extra_compiler_args=['-std=c++11'])\n liblinear_sources = ['_liblinear.pyx']\n liblinear_depends = [join('src', 'liblinear', '*.h'), join('src', 'newrand', 'newrand.h'), join('src', 'liblinear', 'liblinear_helper.c')]\n config.add_extension('_liblinear', sources=liblinear_sources, libraries=['liblinear-skl'] + libraries, include_dirs=[join('.', 'src', 'liblinear'), join('.', 'src', 'newrand'), join('..', 'utils'), numpy.get_include()], depends=liblinear_depends)\n libsvm_sparse_sources = ['_libsvm_sparse.pyx']\n config.add_extension('_libsvm_sparse', libraries=['libsvm-skl'], sources=libsvm_sparse_sources, include_dirs=[numpy.get_include(), join('src', 'libsvm'), join('src', 'newrand')], depends=[join('src', 'libsvm', 'svm.h'), join('src', 'newrand', 'newrand.h'), join('src', 'libsvm', 'libsvm_sparse_helper.c')])\n return config" }, { @@ -160761,7 +173509,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "criterion", @@ -160771,7 +173520,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "splitter", @@ -160781,7 +173531,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "max_depth", @@ -160791,7 +173542,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "min_samples_split", @@ -160801,7 +173553,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "min_samples_leaf", @@ -160811,7 +173564,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "min_weight_fraction_leaf", @@ -160821,7 +173575,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "max_features", @@ -160831,7 +173586,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "max_leaf_nodes", @@ -160841,7 +173597,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "random_state", @@ -160851,7 +173608,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "min_impurity_decrease", @@ -160861,7 +173619,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "class_weight", @@ -160871,7 +173630,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "ccp_alpha", @@ -160881,13 +173641,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@abstractmethod\ndef __init__(self, *, criterion, splitter, max_depth, min_samples_split, min_samples_leaf, min_weight_fraction_leaf, max_features, max_leaf_nodes, random_state, min_impurity_decrease, class_weight=None, ccp_alpha=0.0):\n self.criterion = criterion\n self.splitter = splitter\n self.max_depth = max_depth\n self.min_samples_split = min_samples_split\n self.min_samples_leaf = min_samples_leaf\n 
self.min_weight_fraction_leaf = min_weight_fraction_leaf\n self.max_features = max_features\n self.max_leaf_nodes = max_leaf_nodes\n self.random_state = random_state\n self.min_impurity_decrease = min_impurity_decrease\n self.class_weight = class_weight\n self.ccp_alpha = ccp_alpha" }, { @@ -160905,7 +173666,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -160929,7 +173691,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -160939,7 +173702,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "check_input", @@ -160949,7 +173713,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -160973,7 +173738,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -160983,6 +173749,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "The input samples. Internally, it will be converted to\n``dtype=np.float32`` and if a sparse matrix is provided\nto a sparse ``csr_matrix``." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -160993,13 +173763,14 @@ "docstring": { "type": "bool, default=True", "description": "Allow to bypass several input checking.\nDon't use this parameter unless you know what you do." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Return the index of the leaf that each sample is predicted as.\n\n.. versionadded:: 0.17", - "docstring": "Return the index of the leaf that each sample is predicted as.\n\n.. versionadded:: 0.17\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input samples. Internally, it will be converted to\n ``dtype=np.float32`` and if a sparse matrix is provided\n to a sparse ``csr_matrix``.\n\ncheck_input : bool, default=True\n Allow to bypass several input checking.\n Don't use this parameter unless you know what you do.\n\nReturns\n-------\nX_leaves : array-like of shape (n_samples,)\n For each datapoint x in X, return the index of the leaf x\n ends up in. Leaves are numbered within\n ``[0; self.tree_.node_count)``, possibly with gaps in the\n numbering.", + "docstring": "Return the index of the leaf that each sample is predicted as.\n\n .. versionadded:: 0.17\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input samples. Internally, it will be converted to\n ``dtype=np.float32`` and if a sparse matrix is provided\n to a sparse ``csr_matrix``.\n\n check_input : bool, default=True\n Allow to bypass several input checking.\n Don't use this parameter unless you know what you do.\n\n Returns\n -------\n X_leaves : array-like of shape (n_samples,)\n For each datapoint x in X, return the index of the leaf x\n ends up in. Leaves are numbered within\n ``[0; self.tree_.node_count)``, possibly with gaps in the\n numbering.\n ", "source_code": "\ndef apply(self, X, check_input=True):\n \"\"\"Return the index of the leaf that each sample is predicted as.\n\n .. versionadded:: 0.17\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input samples. 
Internally, it will be converted to\n ``dtype=np.float32`` and if a sparse matrix is provided\n to a sparse ``csr_matrix``.\n\n check_input : bool, default=True\n Allow to bypass several input checking.\n Don't use this parameter unless you know what you do.\n\n Returns\n -------\n X_leaves : array-like of shape (n_samples,)\n For each datapoint x in X, return the index of the leaf x\n ends up in. Leaves are numbered within\n ``[0; self.tree_.node_count)``, possibly with gaps in the\n numbering.\n \"\"\"\n check_is_fitted(self)\n X = self._validate_X_predict(X, check_input)\n return self.tree_.apply(X)" }, { @@ -161017,7 +173788,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -161027,6 +173799,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "The training input samples. Internally, it will be converted to\n``dtype=np.float32`` and if a sparse matrix is provided\nto a sparse ``csc_matrix``." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -161037,7 +173813,8 @@ "docstring": { "type": "array-like of shape (n_samples,) or (n_samples, n_outputs)", "description": "The target values (class labels) as integers or strings." - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -161047,13 +173824,14 @@ "docstring": { "type": "array-like of shape (n_samples,), default=None", "description": "Sample weights. If None, then samples are equally weighted. Splits\nthat would create child nodes with net zero or negative weight are\nignored while searching for a split in each node. Splits are also\nignored if they would result in any single class carrying a\nnegative weight in either child node." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Compute the pruning path during Minimal Cost-Complexity Pruning.\n\nSee :ref:`minimal_cost_complexity_pruning` for details on the pruning process.", - "docstring": "Compute the pruning path during Minimal Cost-Complexity Pruning.\n\nSee :ref:`minimal_cost_complexity_pruning` for details on the pruning\nprocess.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n The training input samples. Internally, it will be converted to\n ``dtype=np.float32`` and if a sparse matrix is provided\n to a sparse ``csc_matrix``.\n\ny : array-like of shape (n_samples,) or (n_samples, n_outputs)\n The target values (class labels) as integers or strings.\n\nsample_weight : array-like of shape (n_samples,), default=None\n Sample weights. If None, then samples are equally weighted. Splits\n that would create child nodes with net zero or negative weight are\n ignored while searching for a split in each node. 
Splits are also\n ignored if they would result in any single class carrying a\n negative weight in either child node.\n\nReturns\n-------\nccp_path : :class:`~sklearn.utils.Bunch`\n Dictionary-like object, with the following attributes.\n\n ccp_alphas : ndarray\n Effective alphas of subtree during pruning.\n\n impurities : ndarray\n Sum of the impurities of the subtree leaves for the\n corresponding alpha value in ``ccp_alphas``.", + "description": "Compute the pruning path during Minimal Cost-Complexity Pruning.\n\nSee :ref:`minimal_cost_complexity_pruning` for details on the pruning\nprocess.", + "docstring": "Compute the pruning path during Minimal Cost-Complexity Pruning.\n\n See :ref:`minimal_cost_complexity_pruning` for details on the pruning\n process.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The training input samples. Internally, it will be converted to\n ``dtype=np.float32`` and if a sparse matrix is provided\n to a sparse ``csc_matrix``.\n\n y : array-like of shape (n_samples,) or (n_samples, n_outputs)\n The target values (class labels) as integers or strings.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights. If None, then samples are equally weighted. Splits\n that would create child nodes with net zero or negative weight are\n ignored while searching for a split in each node. Splits are also\n ignored if they would result in any single class carrying a\n negative weight in either child node.\n\n Returns\n -------\n ccp_path : :class:`~sklearn.utils.Bunch`\n Dictionary-like object, with the following attributes.\n\n ccp_alphas : ndarray\n Effective alphas of subtree during pruning.\n\n impurities : ndarray\n Sum of the impurities of the subtree leaves for the\n corresponding alpha value in ``ccp_alphas``.\n ", "source_code": "\ndef cost_complexity_pruning_path(self, X, y, sample_weight=None):\n \"\"\"Compute the pruning path during Minimal Cost-Complexity Pruning.\n\n See :ref:`minimal_cost_complexity_pruning` for details on the pruning\n process.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The training input samples. Internally, it will be converted to\n ``dtype=np.float32`` and if a sparse matrix is provided\n to a sparse ``csc_matrix``.\n\n y : array-like of shape (n_samples,) or (n_samples, n_outputs)\n The target values (class labels) as integers or strings.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights. If None, then samples are equally weighted. Splits\n that would create child nodes with net zero or negative weight are\n ignored while searching for a split in each node. 
Splits are also\n ignored if they would result in any single class carrying a\n negative weight in either child node.\n\n Returns\n -------\n ccp_path : :class:`~sklearn.utils.Bunch`\n Dictionary-like object, with the following attributes.\n\n ccp_alphas : ndarray\n Effective alphas of subtree during pruning.\n\n impurities : ndarray\n Sum of the impurities of the subtree leaves for the\n corresponding alpha value in ``ccp_alphas``.\n \"\"\"\n est = clone(self).set_params(ccp_alpha=0.0)\n est.fit(X, y, sample_weight=sample_weight)\n return Bunch(**ccp_pruning_path(est.tree_))" }, { @@ -161071,7 +173849,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -161081,6 +173860,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "The input samples. Internally, it will be converted to\n``dtype=np.float32`` and if a sparse matrix is provided\nto a sparse ``csr_matrix``." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -161091,13 +173874,14 @@ "docstring": { "type": "bool, default=True", "description": "Allow to bypass several input checking.\nDon't use this parameter unless you know what you do." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Return the decision path in the tree.\n\n.. versionadded:: 0.18", - "docstring": "Return the decision path in the tree.\n\n.. versionadded:: 0.18\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input samples. Internally, it will be converted to\n ``dtype=np.float32`` and if a sparse matrix is provided\n to a sparse ``csr_matrix``.\n\ncheck_input : bool, default=True\n Allow to bypass several input checking.\n Don't use this parameter unless you know what you do.\n\nReturns\n-------\nindicator : sparse matrix of shape (n_samples, n_nodes)\n Return a node indicator CSR matrix where non zero elements\n indicates that the samples goes through the nodes.", + "docstring": "Return the decision path in the tree.\n\n .. versionadded:: 0.18\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input samples. Internally, it will be converted to\n ``dtype=np.float32`` and if a sparse matrix is provided\n to a sparse ``csr_matrix``.\n\n check_input : bool, default=True\n Allow to bypass several input checking.\n Don't use this parameter unless you know what you do.\n\n Returns\n -------\n indicator : sparse matrix of shape (n_samples, n_nodes)\n Return a node indicator CSR matrix where non zero elements\n indicates that the samples goes through the nodes.\n ", "source_code": "\ndef decision_path(self, X, check_input=True):\n \"\"\"Return the decision path in the tree.\n\n .. versionadded:: 0.18\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input samples. 
Internally, it will be converted to\n ``dtype=np.float32`` and if a sparse matrix is provided\n to a sparse ``csr_matrix``.\n\n check_input : bool, default=True\n Allow to bypass several input checking.\n Don't use this parameter unless you know what you do.\n\n Returns\n -------\n indicator : sparse matrix of shape (n_samples, n_nodes)\n Return a node indicator CSR matrix where non zero elements\n indicates that the samples goes through the nodes.\n \"\"\"\n X = self._validate_X_predict(X, check_input)\n return self.tree_.decision_path(X)" }, { @@ -161115,13 +173899,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Return the feature importances.\n\nThe importance of a feature is computed as the (normalized) total reduction of the criterion brought by that feature. It is also known as the Gini importance. Warning: impurity-based feature importances can be misleading for high cardinality features (many unique values). See :func:`sklearn.inspection.permutation_importance` as an alternative.", - "docstring": "Return the feature importances.\n\nThe importance of a feature is computed as the (normalized) total\nreduction of the criterion brought by that feature.\nIt is also known as the Gini importance.\n\nWarning: impurity-based feature importances can be misleading for\nhigh cardinality features (many unique values). See\n:func:`sklearn.inspection.permutation_importance` as an alternative.\n\nReturns\n-------\nfeature_importances_ : ndarray of shape (n_features,)\n Normalized total reduction of criteria by feature\n (Gini importance).", + "description": "Return the feature importances.\n\nThe importance of a feature is computed as the (normalized) total\nreduction of the criterion brought by that feature.\nIt is also known as the Gini importance.\n\nWarning: impurity-based feature importances can be misleading for\nhigh cardinality features (many unique values). See\n:func:`sklearn.inspection.permutation_importance` as an alternative.", + "docstring": "Return the feature importances.\n\n The importance of a feature is computed as the (normalized) total\n reduction of the criterion brought by that feature.\n It is also known as the Gini importance.\n\n Warning: impurity-based feature importances can be misleading for\n high cardinality features (many unique values). See\n :func:`sklearn.inspection.permutation_importance` as an alternative.\n\n Returns\n -------\n feature_importances_ : ndarray of shape (n_features,)\n Normalized total reduction of criteria by feature\n (Gini importance).\n ", "source_code": "\n@property\ndef feature_importances_(self):\n \"\"\"Return the feature importances.\n\n The importance of a feature is computed as the (normalized) total\n reduction of the criterion brought by that feature.\n It is also known as the Gini importance.\n\n Warning: impurity-based feature importances can be misleading for\n high cardinality features (many unique values). 
See\n :func:`sklearn.inspection.permutation_importance` as an alternative.\n\n Returns\n -------\n feature_importances_ : ndarray of shape (n_features,)\n Normalized total reduction of criteria by feature\n (Gini importance).\n \"\"\"\n check_is_fitted(self)\n return self.tree_.compute_feature_importances()" }, { @@ -161139,7 +173924,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -161149,7 +173935,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -161159,7 +173946,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -161169,7 +173957,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "check_input", @@ -161179,7 +173968,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X_idx_sorted", @@ -161189,13 +173979,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef fit(self, X, y, sample_weight=None, check_input=True, X_idx_sorted='deprecated'):\n random_state = check_random_state(self.random_state)\n if self.ccp_alpha < 0.0:\n raise ValueError('ccp_alpha must be greater than or equal to 0')\n if check_input:\n check_X_params = dict(dtype=DTYPE, accept_sparse='csc')\n check_y_params = dict(ensure_2d=False, dtype=None)\n (X, y) = self._validate_data(X, y, validate_separately=(check_X_params, check_y_params))\n if issparse(X):\n X.sort_indices()\n if X.indices.dtype != np.intc or X.indptr.dtype != np.intc:\n raise ValueError('No support for np.int64 index based sparse matrices')\n if self.criterion == 'poisson':\n if np.any(y < 0):\n raise ValueError('Some value(s) of y are negative which is not allowed for Poisson regression.')\n if np.sum(y) <= 0:\n raise ValueError('Sum of y is not positive which is necessary for Poisson regression.')\n (n_samples, self.n_features_in_) = X.shape\n is_classification = is_classifier(self)\n y = np.atleast_1d(y)\n expanded_class_weight = None\n if y.ndim == 1:\n y = np.reshape(y, (-1, 1))\n self.n_outputs_ = y.shape[1]\n if is_classification:\n check_classification_targets(y)\n y = np.copy(y)\n self.classes_ = []\n self.n_classes_ = []\n if self.class_weight is not None:\n y_original = np.copy(y)\n y_encoded = np.zeros(y.shape, dtype=int)\n for k in range(self.n_outputs_):\n (classes_k, y_encoded[:, k]) = np.unique(y[:, k], return_inverse=True)\n self.classes_.append(classes_k)\n self.n_classes_.append(classes_k.shape[0])\n y = y_encoded\n if self.class_weight is not None:\n expanded_class_weight = compute_sample_weight(self.class_weight, y_original)\n self.n_classes_ = np.array(self.n_classes_, dtype=np.intp)\n if getattr(y, 'dtype', None) != DOUBLE or not y.flags.contiguous:\n y = np.ascontiguousarray(y, dtype=DOUBLE)\n max_depth = np.iinfo(np.int32).max if self.max_depth is None else self.max_depth\n max_leaf_nodes = -1 if self.max_leaf_nodes is None else self.max_leaf_nodes\n if isinstance(self.min_samples_leaf, numbers.Integral):\n if not 1 <= self.min_samples_leaf:\n raise ValueError('min_samples_leaf must be at least 1 or in (0, 0.5], got %s' % self.min_samples_leaf)\n min_samples_leaf = self.min_samples_leaf\n else:\n if not 0.0 < self.min_samples_leaf <= 0.5:\n raise ValueError('min_samples_leaf must be at least 1 or in (0, 0.5], got %s' % self.min_samples_leaf)\n 
min_samples_leaf = int(ceil(self.min_samples_leaf * n_samples))\n if isinstance(self.min_samples_split, numbers.Integral):\n if not 2 <= self.min_samples_split:\n raise ValueError('min_samples_split must be an integer greater than 1 or a float in (0.0, 1.0]; got the integer %s' % self.min_samples_split)\n min_samples_split = self.min_samples_split\n else:\n if not 0.0 < self.min_samples_split <= 1.0:\n raise ValueError('min_samples_split must be an integer greater than 1 or a float in (0.0, 1.0]; got the float %s' % self.min_samples_split)\n min_samples_split = int(ceil(self.min_samples_split * n_samples))\n min_samples_split = max(2, min_samples_split)\n min_samples_split = max(min_samples_split, 2 * min_samples_leaf)\n if isinstance(self.max_features, str):\n if self.max_features == 'auto':\n if is_classification:\n max_features = max(1, int(np.sqrt(self.n_features_in_)))\n else:\n max_features = self.n_features_in_\n elif self.max_features == 'sqrt':\n max_features = max(1, int(np.sqrt(self.n_features_in_)))\n elif self.max_features == 'log2':\n max_features = max(1, int(np.log2(self.n_features_in_)))\n else:\n raise ValueError(\"Invalid value for max_features. Allowed string values are 'auto', 'sqrt' or 'log2'.\")\n elif self.max_features is None:\n max_features = self.n_features_in_\n elif isinstance(self.max_features, numbers.Integral):\n max_features = self.max_features\n elif self.max_features > 0.0:\n max_features = max(1, int(self.max_features * self.n_features_in_))\n else:\n max_features = 0\n self.max_features_ = max_features\n if len(y) != n_samples:\n raise ValueError('Number of labels=%d does not match number of samples=%d' % (len(y), n_samples))\n if not 0 <= self.min_weight_fraction_leaf <= 0.5:\n raise ValueError('min_weight_fraction_leaf must in [0, 0.5]')\n if max_depth <= 0:\n raise ValueError('max_depth must be greater than zero. ')\n if not 0 < max_features <= self.n_features_in_:\n raise ValueError('max_features must be in (0, n_features]')\n if not isinstance(max_leaf_nodes, numbers.Integral):\n raise ValueError('max_leaf_nodes must be integral number but was %r' % max_leaf_nodes)\n if -1 < max_leaf_nodes < 2:\n raise ValueError('max_leaf_nodes {0} must be either None or larger than 1'.format(max_leaf_nodes))\n if sample_weight is not None:\n sample_weight = _check_sample_weight(sample_weight, X, DOUBLE)\n if expanded_class_weight is not None:\n if sample_weight is not None:\n sample_weight = sample_weight * expanded_class_weight\n else:\n sample_weight = expanded_class_weight\n if sample_weight is None:\n min_weight_leaf = self.min_weight_fraction_leaf * n_samples\n else:\n min_weight_leaf = self.min_weight_fraction_leaf * np.sum(sample_weight)\n if self.min_impurity_decrease < 0.0:\n raise ValueError('min_impurity_decrease must be greater than or equal to 0')\n if X_idx_sorted != 'deprecated':\n warnings.warn(\"The parameter 'X_idx_sorted' is deprecated and has no effect. It will be removed in 1.1 (renaming of 0.26). You can suppress this warning by not passing any value to the 'X_idx_sorted' parameter.\", FutureWarning)\n criterion = self.criterion\n if not isinstance(criterion, Criterion):\n if is_classification:\n criterion = CRITERIA_CLF[self.criterion](self.n_outputs_, self.n_classes_)\n else:\n criterion = CRITERIA_REG[self.criterion](self.n_outputs_, n_samples)\n if self.criterion == 'mse':\n warnings.warn(\"Criterion 'mse' was deprecated in v1.0 and will be removed in version 1.2. 
Use `criterion='squared_error'` which is equivalent.\", FutureWarning)\n elif self.criterion == 'mae':\n warnings.warn(\"Criterion 'mae' was deprecated in v1.0 and will be removed in version 1.2. Use `criterion='absolute_error'` which is equivalent.\", FutureWarning)\n else:\n criterion = copy.deepcopy(criterion)\n SPLITTERS = SPARSE_SPLITTERS if issparse(X) else DENSE_SPLITTERS\n splitter = self.splitter\n if not isinstance(self.splitter, Splitter):\n splitter = SPLITTERS[self.splitter](criterion, self.max_features_, min_samples_leaf, min_weight_leaf, random_state)\n if is_classifier(self):\n self.tree_ = Tree(self.n_features_in_, self.n_classes_, self.n_outputs_)\n else:\n self.tree_ = Tree(self.n_features_in_, np.array([1] * self.n_outputs_, dtype=np.intp), self.n_outputs_)\n if max_leaf_nodes < 0:\n builder = DepthFirstTreeBuilder(splitter, min_samples_split, min_samples_leaf, min_weight_leaf, max_depth, self.min_impurity_decrease)\n else:\n builder = BestFirstTreeBuilder(splitter, min_samples_split, min_samples_leaf, min_weight_leaf, max_depth, max_leaf_nodes, self.min_impurity_decrease)\n builder.build(self.tree_, X, y, sample_weight)\n if self.n_outputs_ == 1 and is_classifier(self):\n self.n_classes_ = self.n_classes_[0]\n self.classes_ = self.classes_[0]\n self._prune_tree()\n return self" }, { @@ -161213,13 +174004,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Return the depth of the decision tree.\n\nThe depth of a tree is the maximum distance between the root and any leaf.", - "docstring": "Return the depth of the decision tree.\n\nThe depth of a tree is the maximum distance between the root\nand any leaf.\n\nReturns\n-------\nself.tree_.max_depth : int\n The maximum depth of the tree.", + "description": "Return the depth of the decision tree.\n\nThe depth of a tree is the maximum distance between the root\nand any leaf.", + "docstring": "Return the depth of the decision tree.\n\n The depth of a tree is the maximum distance between the root\n and any leaf.\n\n Returns\n -------\n self.tree_.max_depth : int\n The maximum depth of the tree.\n ", "source_code": "\ndef get_depth(self):\n \"\"\"Return the depth of the decision tree.\n\n The depth of a tree is the maximum distance between the root\n and any leaf.\n\n Returns\n -------\n self.tree_.max_depth : int\n The maximum depth of the tree.\n \"\"\"\n check_is_fitted(self)\n return self.tree_.max_depth" }, { @@ -161237,13 +174029,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Return the number of leaves of the decision tree.", - "docstring": "Return the number of leaves of the decision tree.\n\nReturns\n-------\nself.tree_.n_leaves : int\n Number of leaves.", + "docstring": "Return the number of leaves of the decision tree.\n\n Returns\n -------\n self.tree_.n_leaves : int\n Number of leaves.\n ", "source_code": "\ndef get_n_leaves(self):\n \"\"\"Return the number of leaves of the decision tree.\n\n Returns\n -------\n self.tree_.n_leaves : int\n Number of leaves.\n \"\"\"\n check_is_fitted(self)\n return self.tree_.n_leaves" }, { @@ -161261,7 +174054,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -161271,6 +174065,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "The input samples. 
Internally, it will be converted to\n``dtype=np.float32`` and if a sparse matrix is provided\nto a sparse ``csr_matrix``." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -161281,13 +174079,14 @@ "docstring": { "type": "bool, default=True", "description": "Allow to bypass several input checking.\nDon't use this parameter unless you know what you do." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Predict class or regression value for X.\n\nFor a classification model, the predicted class for each sample in X is returned. For a regression model, the predicted value based on X is returned.", - "docstring": "Predict class or regression value for X.\n\nFor a classification model, the predicted class for each sample in X is\nreturned. For a regression model, the predicted value based on X is\nreturned.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input samples. Internally, it will be converted to\n ``dtype=np.float32`` and if a sparse matrix is provided\n to a sparse ``csr_matrix``.\n\ncheck_input : bool, default=True\n Allow to bypass several input checking.\n Don't use this parameter unless you know what you do.\n\nReturns\n-------\ny : array-like of shape (n_samples,) or (n_samples, n_outputs)\n The predicted classes, or the predict values.", + "description": "Predict class or regression value for X.\n\nFor a classification model, the predicted class for each sample in X is\nreturned. For a regression model, the predicted value based on X is\nreturned.", + "docstring": "Predict class or regression value for X.\n\n For a classification model, the predicted class for each sample in X is\n returned. For a regression model, the predicted value based on X is\n returned.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input samples. Internally, it will be converted to\n ``dtype=np.float32`` and if a sparse matrix is provided\n to a sparse ``csr_matrix``.\n\n check_input : bool, default=True\n Allow to bypass several input checking.\n Don't use this parameter unless you know what you do.\n\n Returns\n -------\n y : array-like of shape (n_samples,) or (n_samples, n_outputs)\n The predicted classes, or the predict values.\n ", "source_code": "\ndef predict(self, X, check_input=True):\n \"\"\"Predict class or regression value for X.\n\n For a classification model, the predicted class for each sample in X is\n returned. For a regression model, the predicted value based on X is\n returned.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input samples. 
Internally, it will be converted to\n ``dtype=np.float32`` and if a sparse matrix is provided\n to a sparse ``csr_matrix``.\n\n check_input : bool, default=True\n Allow to bypass several input checking.\n Don't use this parameter unless you know what you do.\n\n Returns\n -------\n y : array-like of shape (n_samples,) or (n_samples, n_outputs)\n The predicted classes, or the predict values.\n \"\"\"\n check_is_fitted(self)\n X = self._validate_X_predict(X, check_input)\n proba = self.tree_.predict(X)\n n_samples = X.shape[0]\n if is_classifier(self):\n if self.n_outputs_ == 1:\n return self.classes_.take(np.argmax(proba, axis=1), axis=0)\n else:\n class_type = self.classes_[0].dtype\n predictions = np.zeros((n_samples, self.n_outputs_), dtype=class_type)\n for k in range(self.n_outputs_):\n predictions[:, k] = self.classes_[k].take(np.argmax(proba[:, k], axis=1), axis=0)\n return predictions\n elif self.n_outputs_ == 1:\n return proba[:, 0]\n else:\n return proba[:, :, 0]" }, { @@ -161305,7 +174104,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "criterion", @@ -161315,6 +174115,10 @@ "docstring": { "type": "{\"gini\", \"entropy\"}, default=\"gini\"", "description": "The function to measure the quality of a split. Supported criteria are\n\"gini\" for the Gini impurity and \"entropy\" for the information gain." + }, + "refined_type": { + "kind": "EnumType", + "values": ["gini", "entropy"] } }, { @@ -161325,6 +174129,10 @@ "docstring": { "type": "{\"best\", \"random\"}, default=\"best\"", "description": "The strategy used to choose the split at each node. Supported\nstrategies are \"best\" to choose the best split and \"random\" to choose\nthe best random split." + }, + "refined_type": { + "kind": "EnumType", + "values": ["random", "best"] } }, { @@ -161335,7 +174143,8 @@ "docstring": { "type": "int, default=None", "description": "The maximum depth of the tree. If None, then nodes are expanded until\nall leaves are pure or until all leaves contain less than\nmin_samples_split samples." - } + }, + "refined_type": {} }, { "name": "min_samples_split", @@ -161345,7 +174154,8 @@ "docstring": { "type": "int or float, default=2", "description": "The minimum number of samples required to split an internal node:\n\n- If int, then consider `min_samples_split` as the minimum number.\n- If float, then `min_samples_split` is a fraction and\n `ceil(min_samples_split * n_samples)` are the minimum\n number of samples for each split.\n\n.. versionchanged:: 0.18\n Added float values for fractions." - } + }, + "refined_type": {} }, { "name": "min_samples_leaf", @@ -161355,7 +174165,8 @@ "docstring": { "type": "int or float, default=1", "description": "The minimum number of samples required to be at a leaf node.\nA split point at any depth will only be considered if it leaves at\nleast ``min_samples_leaf`` training samples in each of the left and\nright branches. This may have the effect of smoothing the model,\nespecially in regression.\n\n- If int, then consider `min_samples_leaf` as the minimum number.\n- If float, then `min_samples_leaf` is a fraction and\n `ceil(min_samples_leaf * n_samples)` are the minimum\n number of samples for each node.\n\n.. versionchanged:: 0.18\n Added float values for fractions." 
- } + }, + "refined_type": {} }, { "name": "min_weight_fraction_leaf", @@ -161365,7 +174176,8 @@ "docstring": { "type": "float, default=0.0", "description": "The minimum weighted fraction of the sum total of weights (of all\nthe input samples) required to be at a leaf node. Samples have\nequal weight when sample_weight is not provided." - } + }, + "refined_type": {} }, { "name": "max_features", @@ -161375,6 +174187,10 @@ "docstring": { "type": "int, float or {\"auto\", \"sqrt\", \"log2\"}, default=None", "description": "The number of features to consider when looking for the best split:\n\n - If int, then consider `max_features` features at each split.\n - If float, then `max_features` is a fraction and\n `int(max_features * n_features)` features are considered at each\n split.\n - If \"auto\", then `max_features=sqrt(n_features)`.\n - If \"sqrt\", then `max_features=sqrt(n_features)`.\n - If \"log2\", then `max_features=log2(n_features)`.\n - If None, then `max_features=n_features`.\n\nNote: the search for a split does not stop until at least one\nvalid partition of the node samples is found, even if it requires to\neffectively inspect more than ``max_features`` features." + }, + "refined_type": { + "kind": "EnumType", + "values": ["auto", "sqrt", "log2"] } }, { @@ -161385,7 +174201,8 @@ "docstring": { "type": "int, RandomState instance or None, default=None", "description": "Controls the randomness of the estimator. The features are always\nrandomly permuted at each split, even if ``splitter`` is set to\n``\"best\"``. When ``max_features < n_features``, the algorithm will\nselect ``max_features`` at random at each split before finding the best\nsplit among them. But the best found split may vary across different\nruns, even if ``max_features=n_features``. That is the case, if the\nimprovement of the criterion is identical for several splits and one\nsplit has to be selected at random. To obtain a deterministic behaviour\nduring fitting, ``random_state`` has to be fixed to an integer.\nSee :term:`Glossary ` for details." - } + }, + "refined_type": {} }, { "name": "max_leaf_nodes", @@ -161395,7 +174212,8 @@ "docstring": { "type": "int, default=None", "description": "Grow a tree with ``max_leaf_nodes`` in best-first fashion.\nBest nodes are defined as relative reduction in impurity.\nIf None then unlimited number of leaf nodes." - } + }, + "refined_type": {} }, { "name": "min_impurity_decrease", @@ -161405,7 +174223,8 @@ "docstring": { "type": "float, default=0.0", "description": "A node will be split if this split induces a decrease of the impurity\ngreater than or equal to this value.\n\nThe weighted impurity decrease equation is the following::\n\n N_t / N * (impurity - N_t_R / N_t * right_impurity\n - N_t_L / N_t * left_impurity)\n\nwhere ``N`` is the total number of samples, ``N_t`` is the number of\nsamples at the current node, ``N_t_L`` is the number of samples in the\nleft child, and ``N_t_R`` is the number of samples in the right child.\n\n``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum,\nif ``sample_weight`` is passed.\n\n.. versionadded:: 0.19" - } + }, + "refined_type": {} }, { "name": "class_weight", @@ -161415,6 +174234,10 @@ "docstring": { "type": "dict, list of dict or \"balanced\", default=None", "description": "Weights associated with classes in the form ``{class_label: weight}``.\nIf None, all classes are supposed to have weight one. 
For\nmulti-output problems, a list of dicts can be provided in the same\norder as the columns of y.\n\nNote that for multioutput (including multilabel) weights should be\ndefined for each class of every column in its own dict. For example,\nfor four-class multilabel classification weights should be\n[{0: 1, 1: 1}, {0: 1, 1: 5}, {0: 1, 1: 1}, {0: 1, 1: 1}] instead of\n[{1:1}, {2:5}, {3:1}, {4:1}].\n\nThe \"balanced\" mode uses the values of y to automatically adjust\nweights inversely proportional to class frequencies in the input data\nas ``n_samples / (n_classes * np.bincount(y))``\n\nFor multi-output, the weights of each column of y will be multiplied.\n\nNote that these weights will be multiplied with sample_weight (passed\nthrough the fit method) if sample_weight is specified." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -161425,13 +174248,14 @@ "docstring": { "type": "non-negative float, default=0.0", "description": "Complexity parameter used for Minimal Cost-Complexity Pruning. The\nsubtree with the largest cost complexity that is smaller than\n``ccp_alpha`` will be chosen. By default, no pruning is performed. See\n:ref:`minimal_cost_complexity_pruning` for details.\n\n.. versionadded:: 0.22" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, *, criterion='gini', splitter='best', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features=None, random_state=None, max_leaf_nodes=None, min_impurity_decrease=0.0, class_weight=None, ccp_alpha=0.0):\n super().__init__(criterion=criterion, splitter=splitter, max_depth=max_depth, min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf, min_weight_fraction_leaf=min_weight_fraction_leaf, max_features=max_features, max_leaf_nodes=max_leaf_nodes, class_weight=class_weight, random_state=random_state, min_impurity_decrease=min_impurity_decrease, ccp_alpha=ccp_alpha)" }, { @@ -161449,13 +174273,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _more_tags(self):\n return {'multilabel': True}" }, { @@ -161473,7 +174298,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -161483,6 +174309,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "The training input samples. Internally, it will be converted to\n``dtype=np.float32`` and if a sparse matrix is provided\nto a sparse ``csc_matrix``." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -161493,7 +174323,8 @@ "docstring": { "type": "array-like of shape (n_samples,) or (n_samples, n_outputs)", "description": "The target values (class labels) as integers or strings." - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -161503,7 +174334,8 @@ "docstring": { "type": "array-like of shape (n_samples,), default=None", "description": "Sample weights. If None, then samples are equally weighted. Splits\nthat would create child nodes with net zero or negative weight are\nignored while searching for a split in each node. Splits are also\nignored if they would result in any single class carrying a\nnegative weight in either child node." 
- } + }, + "refined_type": {} }, { "name": "check_input", @@ -161513,7 +174345,8 @@ "docstring": { "type": "bool, default=True", "description": "Allow to bypass several input checking.\nDon't use this parameter unless you know what you do." - } + }, + "refined_type": {} }, { "name": "X_idx_sorted", @@ -161523,13 +174356,14 @@ "docstring": { "type": "deprecated, default=\"deprecated\"", "description": "This parameter is deprecated and has no effect.\nIt will be removed in 1.1 (renaming of 0.26).\n\n.. deprecated:: 0.24" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Build a decision tree classifier from the training set (X, y).", - "docstring": "Build a decision tree classifier from the training set (X, y).\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n The training input samples. Internally, it will be converted to\n ``dtype=np.float32`` and if a sparse matrix is provided\n to a sparse ``csc_matrix``.\n\ny : array-like of shape (n_samples,) or (n_samples, n_outputs)\n The target values (class labels) as integers or strings.\n\nsample_weight : array-like of shape (n_samples,), default=None\n Sample weights. If None, then samples are equally weighted. Splits\n that would create child nodes with net zero or negative weight are\n ignored while searching for a split in each node. Splits are also\n ignored if they would result in any single class carrying a\n negative weight in either child node.\n\ncheck_input : bool, default=True\n Allow to bypass several input checking.\n Don't use this parameter unless you know what you do.\n\nX_idx_sorted : deprecated, default=\"deprecated\"\n This parameter is deprecated and has no effect.\n It will be removed in 1.1 (renaming of 0.26).\n\n .. deprecated:: 0.24\n\nReturns\n-------\nself : DecisionTreeClassifier\n Fitted estimator.", + "docstring": "Build a decision tree classifier from the training set (X, y).\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The training input samples. Internally, it will be converted to\n ``dtype=np.float32`` and if a sparse matrix is provided\n to a sparse ``csc_matrix``.\n\n y : array-like of shape (n_samples,) or (n_samples, n_outputs)\n The target values (class labels) as integers or strings.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights. If None, then samples are equally weighted. Splits\n that would create child nodes with net zero or negative weight are\n ignored while searching for a split in each node. Splits are also\n ignored if they would result in any single class carrying a\n negative weight in either child node.\n\n check_input : bool, default=True\n Allow to bypass several input checking.\n Don't use this parameter unless you know what you do.\n\n X_idx_sorted : deprecated, default=\"deprecated\"\n This parameter is deprecated and has no effect.\n It will be removed in 1.1 (renaming of 0.26).\n\n .. deprecated:: 0.24\n\n Returns\n -------\n self : DecisionTreeClassifier\n Fitted estimator.\n ", "source_code": "\ndef fit(self, X, y, sample_weight=None, check_input=True, X_idx_sorted='deprecated'):\n \"\"\"Build a decision tree classifier from the training set (X, y).\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The training input samples. 
Internally, it will be converted to\n ``dtype=np.float32`` and if a sparse matrix is provided\n to a sparse ``csc_matrix``.\n\n y : array-like of shape (n_samples,) or (n_samples, n_outputs)\n The target values (class labels) as integers or strings.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights. If None, then samples are equally weighted. Splits\n that would create child nodes with net zero or negative weight are\n ignored while searching for a split in each node. Splits are also\n ignored if they would result in any single class carrying a\n negative weight in either child node.\n\n check_input : bool, default=True\n Allow to bypass several input checking.\n Don't use this parameter unless you know what you do.\n\n X_idx_sorted : deprecated, default=\"deprecated\"\n This parameter is deprecated and has no effect.\n It will be removed in 1.1 (renaming of 0.26).\n\n .. deprecated:: 0.24\n\n Returns\n -------\n self : DecisionTreeClassifier\n Fitted estimator.\n \"\"\"\n super().fit(X, y, sample_weight=sample_weight, check_input=check_input, X_idx_sorted=X_idx_sorted)\n return self" }, { @@ -161550,13 +174384,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@deprecated('The attribute `n_features_` is deprecated in 1.0 and will be removed in 1.2. Use `n_features_in_` instead.')\n@property\ndef n_features_(self):\n return self.n_features_in_" }, { @@ -161574,7 +174409,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -161584,13 +174420,17 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "The input samples. Internally, it will be converted to\n``dtype=np.float32`` and if a sparse matrix is provided\nto a sparse ``csr_matrix``." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } } ], "results": [], "is_public": true, "description": "Predict class log-probabilities of the input samples X.", - "docstring": "Predict class log-probabilities of the input samples X.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input samples. Internally, it will be converted to\n ``dtype=np.float32`` and if a sparse matrix is provided\n to a sparse ``csr_matrix``.\n\nReturns\n-------\nproba : ndarray of shape (n_samples, n_classes) or list of n_outputs such arrays if n_outputs > 1\n The class log-probabilities of the input samples. The order of the\n classes corresponds to that in the attribute :term:`classes_`.", + "docstring": "Predict class log-probabilities of the input samples X.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input samples. Internally, it will be converted to\n ``dtype=np.float32`` and if a sparse matrix is provided\n to a sparse ``csr_matrix``.\n\n Returns\n -------\n proba : ndarray of shape (n_samples, n_classes) or list of n_outputs such arrays if n_outputs > 1\n The class log-probabilities of the input samples. The order of the\n classes corresponds to that in the attribute :term:`classes_`.\n ", "source_code": "\ndef predict_log_proba(self, X):\n \"\"\"Predict class log-probabilities of the input samples X.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input samples. 
Internally, it will be converted to\n ``dtype=np.float32`` and if a sparse matrix is provided\n to a sparse ``csr_matrix``.\n\n Returns\n -------\n proba : ndarray of shape (n_samples, n_classes) or list of n_outputs such arrays if n_outputs > 1\n The class log-probabilities of the input samples. The order of the\n classes corresponds to that in the attribute :term:`classes_`.\n \"\"\"\n proba = self.predict_proba(X)\n if self.n_outputs_ == 1:\n return np.log(proba)\n else:\n for k in range(self.n_outputs_):\n proba[k] = np.log(proba[k])\n return proba" }, { @@ -161608,7 +174448,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -161618,6 +174459,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "The input samples. Internally, it will be converted to\n``dtype=np.float32`` and if a sparse matrix is provided\nto a sparse ``csr_matrix``." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -161628,13 +174473,14 @@ "docstring": { "type": "bool, default=True", "description": "Allow to bypass several input checking.\nDon't use this parameter unless you know what you do." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Predict class probabilities of the input samples X.\n\nThe predicted class probability is the fraction of samples of the same class in a leaf.", - "docstring": "Predict class probabilities of the input samples X.\n\nThe predicted class probability is the fraction of samples of the same\nclass in a leaf.\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input samples. Internally, it will be converted to\n ``dtype=np.float32`` and if a sparse matrix is provided\n to a sparse ``csr_matrix``.\n\ncheck_input : bool, default=True\n Allow to bypass several input checking.\n Don't use this parameter unless you know what you do.\n\nReturns\n-------\nproba : ndarray of shape (n_samples, n_classes) or list of n_outputs such arrays if n_outputs > 1\n The class probabilities of the input samples. The order of the\n classes corresponds to that in the attribute :term:`classes_`.", + "description": "Predict class probabilities of the input samples X.\n\nThe predicted class probability is the fraction of samples of the same\nclass in a leaf.", + "docstring": "Predict class probabilities of the input samples X.\n\n The predicted class probability is the fraction of samples of the same\n class in a leaf.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input samples. Internally, it will be converted to\n ``dtype=np.float32`` and if a sparse matrix is provided\n to a sparse ``csr_matrix``.\n\n check_input : bool, default=True\n Allow to bypass several input checking.\n Don't use this parameter unless you know what you do.\n\n Returns\n -------\n proba : ndarray of shape (n_samples, n_classes) or list of n_outputs such arrays if n_outputs > 1\n The class probabilities of the input samples. The order of the\n classes corresponds to that in the attribute :term:`classes_`.\n ", "source_code": "\ndef predict_proba(self, X, check_input=True):\n \"\"\"Predict class probabilities of the input samples X.\n\n The predicted class probability is the fraction of samples of the same\n class in a leaf.\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The input samples. 
Internally, it will be converted to\n ``dtype=np.float32`` and if a sparse matrix is provided\n to a sparse ``csr_matrix``.\n\n check_input : bool, default=True\n Allow to bypass several input checking.\n Don't use this parameter unless you know what you do.\n\n Returns\n -------\n proba : ndarray of shape (n_samples, n_classes) or list of n_outputs such arrays if n_outputs > 1\n The class probabilities of the input samples. The order of the\n classes corresponds to that in the attribute :term:`classes_`.\n \"\"\"\n check_is_fitted(self)\n X = self._validate_X_predict(X, check_input)\n proba = self.tree_.predict(X)\n if self.n_outputs_ == 1:\n proba = proba[:, :self.n_classes_]\n normalizer = proba.sum(axis=1)[:, np.newaxis]\n normalizer[normalizer == 0.0] = 1.0\n proba /= normalizer\n return proba\n else:\n all_proba = []\n for k in range(self.n_outputs_):\n proba_k = proba[:, k, :self.n_classes_[k]]\n normalizer = proba_k.sum(axis=1)[:, np.newaxis]\n normalizer[normalizer == 0.0] = 1.0\n proba_k /= normalizer\n all_proba.append(proba_k)\n return all_proba" }, { @@ -161652,7 +174498,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "criterion", @@ -161662,6 +174509,15 @@ "docstring": { "type": "{\"squared_error\", \"friedman_mse\", \"absolute_error\", \"poisson\"}, default=\"squared_error\"", "description": "The function to measure the quality of a split. Supported criteria\nare \"squared_error\" for the mean squared error, which is equal to\nvariance reduction as feature selection criterion and minimizes the L2\nloss using the mean of each terminal node, \"friedman_mse\", which uses\nmean squared error with Friedman's improvement score for potential\nsplits, \"absolute_error\" for the mean absolute error, which minimizes\nthe L1 loss using the median of each terminal node, and \"poisson\" which\nuses reduction in Poisson deviance to find splits.\n\n.. versionadded:: 0.18\n Mean Absolute Error (MAE) criterion.\n\n.. versionadded:: 0.24\n Poisson deviance criterion.\n\n.. deprecated:: 1.0\n Criterion \"mse\" was deprecated in v1.0 and will be removed in\n version 1.2. Use `criterion=\"squared_error\"` which is equivalent.\n\n.. deprecated:: 1.0\n Criterion \"mae\" was deprecated in v1.0 and will be removed in\n version 1.2. Use `criterion=\"absolute_error\"` which is equivalent." + }, + "refined_type": { + "kind": "EnumType", + "values": [ + "friedman_mse", + "squared_error", + "poisson", + "absolute_error" + ] } }, { @@ -161672,6 +174528,10 @@ "docstring": { "type": "{\"best\", \"random\"}, default=\"best\"", "description": "The strategy used to choose the split at each node. Supported\nstrategies are \"best\" to choose the best split and \"random\" to choose\nthe best random split." + }, + "refined_type": { + "kind": "EnumType", + "values": ["random", "best"] } }, { @@ -161682,7 +174542,8 @@ "docstring": { "type": "int, default=None", "description": "The maximum depth of the tree. If None, then nodes are expanded until\nall leaves are pure or until all leaves contain less than\nmin_samples_split samples." - } + }, + "refined_type": {} }, { "name": "min_samples_split", @@ -161692,7 +174553,8 @@ "docstring": { "type": "int or float, default=2", "description": "The minimum number of samples required to split an internal node:\n\n- If int, then consider `min_samples_split` as the minimum number.\n- If float, then `min_samples_split` is a fraction and\n `ceil(min_samples_split * n_samples)` are the minimum\n number of samples for each split.\n\n.. 
versionchanged:: 0.18\n Added float values for fractions." - } + }, + "refined_type": {} }, { "name": "min_samples_leaf", @@ -161702,7 +174564,8 @@ "docstring": { "type": "int or float, default=1", "description": "The minimum number of samples required to be at a leaf node.\nA split point at any depth will only be considered if it leaves at\nleast ``min_samples_leaf`` training samples in each of the left and\nright branches. This may have the effect of smoothing the model,\nespecially in regression.\n\n- If int, then consider `min_samples_leaf` as the minimum number.\n- If float, then `min_samples_leaf` is a fraction and\n `ceil(min_samples_leaf * n_samples)` are the minimum\n number of samples for each node.\n\n.. versionchanged:: 0.18\n Added float values for fractions." - } + }, + "refined_type": {} }, { "name": "min_weight_fraction_leaf", @@ -161712,7 +174575,8 @@ "docstring": { "type": "float, default=0.0", "description": "The minimum weighted fraction of the sum total of weights (of all\nthe input samples) required to be at a leaf node. Samples have\nequal weight when sample_weight is not provided." - } + }, + "refined_type": {} }, { "name": "max_features", @@ -161722,6 +174586,10 @@ "docstring": { "type": "int, float or {\"auto\", \"sqrt\", \"log2\"}, default=None", "description": "The number of features to consider when looking for the best split:\n\n- If int, then consider `max_features` features at each split.\n- If float, then `max_features` is a fraction and\n `int(max_features * n_features)` features are considered at each\n split.\n- If \"auto\", then `max_features=n_features`.\n- If \"sqrt\", then `max_features=sqrt(n_features)`.\n- If \"log2\", then `max_features=log2(n_features)`.\n- If None, then `max_features=n_features`.\n\nNote: the search for a split does not stop until at least one\nvalid partition of the node samples is found, even if it requires to\neffectively inspect more than ``max_features`` features." + }, + "refined_type": { + "kind": "EnumType", + "values": ["auto", "sqrt", "log2"] } }, { @@ -161732,7 +174600,8 @@ "docstring": { "type": "int, RandomState instance or None, default=None", "description": "Controls the randomness of the estimator. The features are always\nrandomly permuted at each split, even if ``splitter`` is set to\n``\"best\"``. When ``max_features < n_features``, the algorithm will\nselect ``max_features`` at random at each split before finding the best\nsplit among them. But the best found split may vary across different\nruns, even if ``max_features=n_features``. That is the case, if the\nimprovement of the criterion is identical for several splits and one\nsplit has to be selected at random. To obtain a deterministic behaviour\nduring fitting, ``random_state`` has to be fixed to an integer.\nSee :term:`Glossary ` for details." - } + }, + "refined_type": {} }, { "name": "max_leaf_nodes", @@ -161742,7 +174611,8 @@ "docstring": { "type": "int, default=None", "description": "Grow a tree with ``max_leaf_nodes`` in best-first fashion.\nBest nodes are defined as relative reduction in impurity.\nIf None then unlimited number of leaf nodes." 
- } + }, + "refined_type": {} }, { "name": "min_impurity_decrease", @@ -161752,7 +174622,8 @@ "docstring": { "type": "float, default=0.0", "description": "A node will be split if this split induces a decrease of the impurity\ngreater than or equal to this value.\n\nThe weighted impurity decrease equation is the following::\n\n N_t / N * (impurity - N_t_R / N_t * right_impurity\n - N_t_L / N_t * left_impurity)\n\nwhere ``N`` is the total number of samples, ``N_t`` is the number of\nsamples at the current node, ``N_t_L`` is the number of samples in the\nleft child, and ``N_t_R`` is the number of samples in the right child.\n\n``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum,\nif ``sample_weight`` is passed.\n\n.. versionadded:: 0.19" - } + }, + "refined_type": {} }, { "name": "ccp_alpha", @@ -161762,13 +174633,14 @@ "docstring": { "type": "non-negative float, default=0.0", "description": "Complexity parameter used for Minimal Cost-Complexity Pruning. The\nsubtree with the largest cost complexity that is smaller than\n``ccp_alpha`` will be chosen. By default, no pruning is performed. See\n:ref:`minimal_cost_complexity_pruning` for details.\n\n.. versionadded:: 0.22" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, *, criterion='squared_error', splitter='best', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features=None, random_state=None, max_leaf_nodes=None, min_impurity_decrease=0.0, ccp_alpha=0.0):\n super().__init__(criterion=criterion, splitter=splitter, max_depth=max_depth, min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf, min_weight_fraction_leaf=min_weight_fraction_leaf, max_features=max_features, max_leaf_nodes=max_leaf_nodes, random_state=random_state, min_impurity_decrease=min_impurity_decrease, ccp_alpha=ccp_alpha)" }, { @@ -161786,7 +174658,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "grid", @@ -161796,7 +174669,8 @@ "docstring": { "type": "ndarray of shape (n_samples, n_target_features)", "description": "The grid points on which the partial dependence should be\nevaluated." - } + }, + "refined_type": {} }, { "name": "target_features", @@ -161806,13 +174680,14 @@ "docstring": { "type": "ndarray of shape (n_target_features)", "description": "The set of target features for which the partial dependence\nshould be evaluated." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Fast partial dependence computation.", - "docstring": "Fast partial dependence computation.\n\nParameters\n----------\ngrid : ndarray of shape (n_samples, n_target_features)\n The grid points on which the partial dependence should be\n evaluated.\ntarget_features : ndarray of shape (n_target_features)\n The set of target features for which the partial dependence\n should be evaluated.\n\nReturns\n-------\naveraged_predictions : ndarray of shape (n_samples,)\n The value of the partial dependence function on each grid point.", + "docstring": "Fast partial dependence computation.\n\n Parameters\n ----------\n grid : ndarray of shape (n_samples, n_target_features)\n The grid points on which the partial dependence should be\n evaluated.\n target_features : ndarray of shape (n_target_features)\n The set of target features for which the partial dependence\n should be evaluated.\n\n Returns\n -------\n averaged_predictions : ndarray of shape (n_samples,)\n The value of the partial dependence function on each grid point.\n ", "source_code": "\ndef _compute_partial_dependence_recursion(self, grid, target_features):\n \"\"\"Fast partial dependence computation.\n\n Parameters\n ----------\n grid : ndarray of shape (n_samples, n_target_features)\n The grid points on which the partial dependence should be\n evaluated.\n target_features : ndarray of shape (n_target_features)\n The set of target features for which the partial dependence\n should be evaluated.\n\n Returns\n -------\n averaged_predictions : ndarray of shape (n_samples,)\n The value of the partial dependence function on each grid point.\n \"\"\"\n grid = np.asarray(grid, dtype=DTYPE, order='C')\n averaged_predictions = np.zeros(shape=grid.shape[0], dtype=np.float64, order='C')\n self.tree_.compute_partial_dependence(grid, target_features, averaged_predictions)\n return averaged_predictions" }, { @@ -161830,7 +174705,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -161840,6 +174716,10 @@ "docstring": { "type": "{array-like, sparse matrix} of shape (n_samples, n_features)", "description": "The training input samples. Internally, it will be converted to\n``dtype=np.float32`` and if a sparse matrix is provided\nto a sparse ``csc_matrix``." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -161850,7 +174730,8 @@ "docstring": { "type": "array-like of shape (n_samples,) or (n_samples, n_outputs)", "description": "The target values (real numbers). Use ``dtype=np.float64`` and\n``order='C'`` for maximum efficiency." - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -161860,7 +174741,8 @@ "docstring": { "type": "array-like of shape (n_samples,), default=None", "description": "Sample weights. If None, then samples are equally weighted. Splits\nthat would create child nodes with net zero or negative weight are\nignored while searching for a split in each node." - } + }, + "refined_type": {} }, { "name": "check_input", @@ -161870,7 +174752,8 @@ "docstring": { "type": "bool, default=True", "description": "Allow to bypass several input checking.\nDon't use this parameter unless you know what you do." - } + }, + "refined_type": {} }, { "name": "X_idx_sorted", @@ -161880,13 +174763,14 @@ "docstring": { "type": "deprecated, default=\"deprecated\"", "description": "This parameter is deprecated and has no effect.\nIt will be removed in 1.1 (renaming of 0.26).\n\n.. 
deprecated:: 0.24" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Build a decision tree regressor from the training set (X, y).", - "docstring": "Build a decision tree regressor from the training set (X, y).\n\nParameters\n----------\nX : {array-like, sparse matrix} of shape (n_samples, n_features)\n The training input samples. Internally, it will be converted to\n ``dtype=np.float32`` and if a sparse matrix is provided\n to a sparse ``csc_matrix``.\n\ny : array-like of shape (n_samples,) or (n_samples, n_outputs)\n The target values (real numbers). Use ``dtype=np.float64`` and\n ``order='C'`` for maximum efficiency.\n\nsample_weight : array-like of shape (n_samples,), default=None\n Sample weights. If None, then samples are equally weighted. Splits\n that would create child nodes with net zero or negative weight are\n ignored while searching for a split in each node.\n\ncheck_input : bool, default=True\n Allow to bypass several input checking.\n Don't use this parameter unless you know what you do.\n\nX_idx_sorted : deprecated, default=\"deprecated\"\n This parameter is deprecated and has no effect.\n It will be removed in 1.1 (renaming of 0.26).\n\n .. deprecated:: 0.24\n\nReturns\n-------\nself : DecisionTreeRegressor\n Fitted estimator.", + "docstring": "Build a decision tree regressor from the training set (X, y).\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The training input samples. Internally, it will be converted to\n ``dtype=np.float32`` and if a sparse matrix is provided\n to a sparse ``csc_matrix``.\n\n y : array-like of shape (n_samples,) or (n_samples, n_outputs)\n The target values (real numbers). Use ``dtype=np.float64`` and\n ``order='C'`` for maximum efficiency.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights. If None, then samples are equally weighted. Splits\n that would create child nodes with net zero or negative weight are\n ignored while searching for a split in each node.\n\n check_input : bool, default=True\n Allow to bypass several input checking.\n Don't use this parameter unless you know what you do.\n\n X_idx_sorted : deprecated, default=\"deprecated\"\n This parameter is deprecated and has no effect.\n It will be removed in 1.1 (renaming of 0.26).\n\n .. deprecated:: 0.24\n\n Returns\n -------\n self : DecisionTreeRegressor\n Fitted estimator.\n ", "source_code": "\ndef fit(self, X, y, sample_weight=None, check_input=True, X_idx_sorted='deprecated'):\n \"\"\"Build a decision tree regressor from the training set (X, y).\n\n Parameters\n ----------\n X : {array-like, sparse matrix} of shape (n_samples, n_features)\n The training input samples. Internally, it will be converted to\n ``dtype=np.float32`` and if a sparse matrix is provided\n to a sparse ``csc_matrix``.\n\n y : array-like of shape (n_samples,) or (n_samples, n_outputs)\n The target values (real numbers). Use ``dtype=np.float64`` and\n ``order='C'`` for maximum efficiency.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights. If None, then samples are equally weighted. 
Splits\n that would create child nodes with net zero or negative weight are\n ignored while searching for a split in each node.\n\n check_input : bool, default=True\n Allow to bypass several input checking.\n Don't use this parameter unless you know what you do.\n\n X_idx_sorted : deprecated, default=\"deprecated\"\n This parameter is deprecated and has no effect.\n It will be removed in 1.1 (renaming of 0.26).\n\n .. deprecated:: 0.24\n\n Returns\n -------\n self : DecisionTreeRegressor\n Fitted estimator.\n \"\"\"\n super().fit(X, y, sample_weight=sample_weight, check_input=check_input, X_idx_sorted=X_idx_sorted)\n return self" }, { @@ -161907,13 +174791,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@deprecated('The attribute `n_features_` is deprecated in 1.0 and will be removed in 1.2. Use `n_features_in_` instead.')\n@property\ndef n_features_(self):\n return self.n_features_in_" }, { @@ -161931,7 +174816,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "criterion", @@ -161941,6 +174827,10 @@ "docstring": { "type": "{\"gini\", \"entropy\"}, default=\"gini\"", "description": "The function to measure the quality of a split. Supported criteria are\n\"gini\" for the Gini impurity and \"entropy\" for the information gain." + }, + "refined_type": { + "kind": "EnumType", + "values": ["gini", "entropy"] } }, { @@ -161951,6 +174841,10 @@ "docstring": { "type": "{\"random\", \"best\"}, default=\"random\"", "description": "The strategy used to choose the split at each node. Supported\nstrategies are \"best\" to choose the best split and \"random\" to choose\nthe best random split." + }, + "refined_type": { + "kind": "EnumType", + "values": ["random", "best"] } }, { @@ -161961,7 +174855,8 @@ "docstring": { "type": "int, default=None", "description": "The maximum depth of the tree. If None, then nodes are expanded until\nall leaves are pure or until all leaves contain less than\nmin_samples_split samples." - } + }, + "refined_type": {} }, { "name": "min_samples_split", @@ -161971,7 +174866,8 @@ "docstring": { "type": "int or float, default=2", "description": "The minimum number of samples required to split an internal node:\n\n- If int, then consider `min_samples_split` as the minimum number.\n- If float, then `min_samples_split` is a fraction and\n `ceil(min_samples_split * n_samples)` are the minimum\n number of samples for each split.\n\n.. versionchanged:: 0.18\n Added float values for fractions." - } + }, + "refined_type": {} }, { "name": "min_samples_leaf", @@ -161981,7 +174877,8 @@ "docstring": { "type": "int or float, default=1", "description": "The minimum number of samples required to be at a leaf node.\nA split point at any depth will only be considered if it leaves at\nleast ``min_samples_leaf`` training samples in each of the left and\nright branches. This may have the effect of smoothing the model,\nespecially in regression.\n\n- If int, then consider `min_samples_leaf` as the minimum number.\n- If float, then `min_samples_leaf` is a fraction and\n `ceil(min_samples_leaf * n_samples)` are the minimum\n number of samples for each node.\n\n.. versionchanged:: 0.18\n Added float values for fractions." 
- } + }, + "refined_type": {} }, { "name": "min_weight_fraction_leaf", @@ -161991,7 +174888,8 @@ "docstring": { "type": "float, default=0.0", "description": "The minimum weighted fraction of the sum total of weights (of all\nthe input samples) required to be at a leaf node. Samples have\nequal weight when sample_weight is not provided." - } + }, + "refined_type": {} }, { "name": "max_features", @@ -162001,6 +174899,10 @@ "docstring": { "type": "int, float, {\"auto\", \"sqrt\", \"log2\"} or None, default=\"auto\"", "description": "The number of features to consider when looking for the best split:\n\n - If int, then consider `max_features` features at each split.\n - If float, then `max_features` is a fraction and\n `int(max_features * n_features)` features are considered at each\n split.\n - If \"auto\", then `max_features=sqrt(n_features)`.\n - If \"sqrt\", then `max_features=sqrt(n_features)`.\n - If \"log2\", then `max_features=log2(n_features)`.\n - If None, then `max_features=n_features`.\n\nNote: the search for a split does not stop until at least one\nvalid partition of the node samples is found, even if it requires to\neffectively inspect more than ``max_features`` features." + }, + "refined_type": { + "kind": "EnumType", + "values": ["auto", "sqrt", "log2"] } }, { @@ -162011,7 +174913,8 @@ "docstring": { "type": "int, RandomState instance or None, default=None", "description": "Used to pick randomly the `max_features` used at each split.\nSee :term:`Glossary ` for details." - } + }, + "refined_type": {} }, { "name": "max_leaf_nodes", @@ -162021,7 +174924,8 @@ "docstring": { "type": "int, default=None", "description": "Grow a tree with ``max_leaf_nodes`` in best-first fashion.\nBest nodes are defined as relative reduction in impurity.\nIf None then unlimited number of leaf nodes." - } + }, + "refined_type": {} }, { "name": "min_impurity_decrease", @@ -162031,7 +174935,8 @@ "docstring": { "type": "float, default=0.0", "description": "A node will be split if this split induces a decrease of the impurity\ngreater than or equal to this value.\n\nThe weighted impurity decrease equation is the following::\n\n N_t / N * (impurity - N_t_R / N_t * right_impurity\n - N_t_L / N_t * left_impurity)\n\nwhere ``N`` is the total number of samples, ``N_t`` is the number of\nsamples at the current node, ``N_t_L`` is the number of samples in the\nleft child, and ``N_t_R`` is the number of samples in the right child.\n\n``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum,\nif ``sample_weight`` is passed.\n\n.. versionadded:: 0.19" - } + }, + "refined_type": {} }, { "name": "class_weight", @@ -162041,6 +174946,10 @@ "docstring": { "type": "dict, list of dict or \"balanced\", default=None", "description": "Weights associated with classes in the form ``{class_label: weight}``.\nIf None, all classes are supposed to have weight one. For\nmulti-output problems, a list of dicts can be provided in the same\norder as the columns of y.\n\nNote that for multioutput (including multilabel) weights should be\ndefined for each class of every column in its own dict. 
For example,\nfor four-class multilabel classification weights should be\n[{0: 1, 1: 1}, {0: 1, 1: 5}, {0: 1, 1: 1}, {0: 1, 1: 1}] instead of\n[{1:1}, {2:5}, {3:1}, {4:1}].\n\nThe \"balanced\" mode uses the values of y to automatically adjust\nweights inversely proportional to class frequencies in the input data\nas ``n_samples / (n_classes * np.bincount(y))``\n\nFor multi-output, the weights of each column of y will be multiplied.\n\nNote that these weights will be multiplied with sample_weight (passed\nthrough the fit method) if sample_weight is specified." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -162051,13 +174960,14 @@ "docstring": { "type": "non-negative float, default=0.0", "description": "Complexity parameter used for Minimal Cost-Complexity Pruning. The\nsubtree with the largest cost complexity that is smaller than\n``ccp_alpha`` will be chosen. By default, no pruning is performed. See\n:ref:`minimal_cost_complexity_pruning` for details.\n\n.. versionadded:: 0.22" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, *, criterion='gini', splitter='random', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features='auto', random_state=None, max_leaf_nodes=None, min_impurity_decrease=0.0, class_weight=None, ccp_alpha=0.0):\n super().__init__(criterion=criterion, splitter=splitter, max_depth=max_depth, min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf, min_weight_fraction_leaf=min_weight_fraction_leaf, max_features=max_features, max_leaf_nodes=max_leaf_nodes, class_weight=class_weight, min_impurity_decrease=min_impurity_decrease, random_state=random_state, ccp_alpha=ccp_alpha)" }, { @@ -162075,7 +174985,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "criterion", @@ -162085,6 +174996,10 @@ "docstring": { "type": "{\"squared_error\", \"friedman_mse\"}, default=\"squared_error\"", "description": "The function to measure the quality of a split. Supported criteria\nare \"squared_error\" for the mean squared error, which is equal to\nvariance reduction as feature selection criterion and \"mae\" for the\nmean absolute error.\n\n.. versionadded:: 0.18\n Mean Absolute Error (MAE) criterion.\n\n.. versionadded:: 0.24\n Poisson deviance criterion.\n\n.. deprecated:: 1.0\n Criterion \"mse\" was deprecated in v1.0 and will be removed in\n version 1.2. Use `criterion=\"squared_error\"` which is equivalent.\n\n.. deprecated:: 1.0\n Criterion \"mae\" was deprecated in v1.0 and will be removed in\n version 1.2. Use `criterion=\"absolute_error\"` which is equivalent." + }, + "refined_type": { + "kind": "EnumType", + "values": ["friedman_mse", "squared_error"] } }, { @@ -162095,6 +175010,10 @@ "docstring": { "type": "{\"random\", \"best\"}, default=\"random\"", "description": "The strategy used to choose the split at each node. Supported\nstrategies are \"best\" to choose the best split and \"random\" to choose\nthe best random split." + }, + "refined_type": { + "kind": "EnumType", + "values": ["random", "best"] } }, { @@ -162105,7 +175024,8 @@ "docstring": { "type": "int, default=None", "description": "The maximum depth of the tree. If None, then nodes are expanded until\nall leaves are pure or until all leaves contain less than\nmin_samples_split samples." 
- } + }, + "refined_type": {} }, { "name": "min_samples_split", @@ -162115,7 +175035,8 @@ "docstring": { "type": "int or float, default=2", "description": "The minimum number of samples required to split an internal node:\n\n- If int, then consider `min_samples_split` as the minimum number.\n- If float, then `min_samples_split` is a fraction and\n `ceil(min_samples_split * n_samples)` are the minimum\n number of samples for each split.\n\n.. versionchanged:: 0.18\n Added float values for fractions." - } + }, + "refined_type": {} }, { "name": "min_samples_leaf", @@ -162125,7 +175046,8 @@ "docstring": { "type": "int or float, default=1", "description": "The minimum number of samples required to be at a leaf node.\nA split point at any depth will only be considered if it leaves at\nleast ``min_samples_leaf`` training samples in each of the left and\nright branches. This may have the effect of smoothing the model,\nespecially in regression.\n\n- If int, then consider `min_samples_leaf` as the minimum number.\n- If float, then `min_samples_leaf` is a fraction and\n `ceil(min_samples_leaf * n_samples)` are the minimum\n number of samples for each node.\n\n.. versionchanged:: 0.18\n Added float values for fractions." - } + }, + "refined_type": {} }, { "name": "min_weight_fraction_leaf", @@ -162135,7 +175057,8 @@ "docstring": { "type": "float, default=0.0", "description": "The minimum weighted fraction of the sum total of weights (of all\nthe input samples) required to be at a leaf node. Samples have\nequal weight when sample_weight is not provided." - } + }, + "refined_type": {} }, { "name": "max_features", @@ -162145,6 +175068,10 @@ "docstring": { "type": "int, float, {\"auto\", \"sqrt\", \"log2\"} or None, default=\"auto\"", "description": "The number of features to consider when looking for the best split:\n\n- If int, then consider `max_features` features at each split.\n- If float, then `max_features` is a fraction and\n `int(max_features * n_features)` features are considered at each\n split.\n- If \"auto\", then `max_features=n_features`.\n- If \"sqrt\", then `max_features=sqrt(n_features)`.\n- If \"log2\", then `max_features=log2(n_features)`.\n- If None, then `max_features=n_features`.\n\nNote: the search for a split does not stop until at least one\nvalid partition of the node samples is found, even if it requires to\neffectively inspect more than ``max_features`` features." + }, + "refined_type": { + "kind": "EnumType", + "values": ["auto", "sqrt", "log2"] } }, { @@ -162155,7 +175082,8 @@ "docstring": { "type": "int, RandomState instance or None, default=None", "description": "Used to pick randomly the `max_features` used at each split.\nSee :term:`Glossary ` for details." - } + }, + "refined_type": {} }, { "name": "min_impurity_decrease", @@ -162165,7 +175093,8 @@ "docstring": { "type": "float, default=0.0", "description": "A node will be split if this split induces a decrease of the impurity\ngreater than or equal to this value.\n\nThe weighted impurity decrease equation is the following::\n\n N_t / N * (impurity - N_t_R / N_t * right_impurity\n - N_t_L / N_t * left_impurity)\n\nwhere ``N`` is the total number of samples, ``N_t`` is the number of\nsamples at the current node, ``N_t_L`` is the number of samples in the\nleft child, and ``N_t_R`` is the number of samples in the right child.\n\n``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum,\nif ``sample_weight`` is passed.\n\n.. 
versionadded:: 0.19" - } + }, + "refined_type": {} }, { "name": "max_leaf_nodes", @@ -162175,7 +175104,8 @@ "docstring": { "type": "int, default=None", "description": "Grow a tree with ``max_leaf_nodes`` in best-first fashion.\nBest nodes are defined as relative reduction in impurity.\nIf None then unlimited number of leaf nodes." - } + }, + "refined_type": {} }, { "name": "ccp_alpha", @@ -162185,13 +175115,14 @@ "docstring": { "type": "non-negative float, default=0.0", "description": "Complexity parameter used for Minimal Cost-Complexity Pruning. The\nsubtree with the largest cost complexity that is smaller than\n``ccp_alpha`` will be chosen. By default, no pruning is performed. See\n:ref:`minimal_cost_complexity_pruning` for details.\n\n.. versionadded:: 0.22" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, *, criterion='squared_error', splitter='random', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features='auto', random_state=None, min_impurity_decrease=0.0, max_leaf_nodes=None, ccp_alpha=0.0):\n super().__init__(criterion=criterion, splitter=splitter, max_depth=max_depth, min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf, min_weight_fraction_leaf=min_weight_fraction_leaf, max_features=max_features, max_leaf_nodes=max_leaf_nodes, min_impurity_decrease=min_impurity_decrease, random_state=random_state, ccp_alpha=ccp_alpha)" }, { @@ -162209,13 +175140,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __repr__(self):\n return '\"tree.dot\"'" }, { @@ -162233,7 +175165,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "max_depth", @@ -162243,7 +175176,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "feature_names", @@ -162253,7 +175187,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "class_names", @@ -162263,7 +175198,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "label", @@ -162273,7 +175209,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "filled", @@ -162283,7 +175220,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "impurity", @@ -162293,7 +175231,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "node_ids", @@ -162303,7 +175242,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "proportion", @@ -162313,7 +175253,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "rounded", @@ -162323,7 +175264,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "precision", @@ -162333,7 +175275,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "fontsize", @@ -162343,13 +175286,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, max_depth=None, feature_names=None, class_names=None, label='all', filled=False, impurity=True, node_ids=False, 
proportion=False, rounded=False, precision=3, fontsize=None):\n self.max_depth = max_depth\n self.feature_names = feature_names\n self.class_names = class_names\n self.label = label\n self.filled = filled\n self.impurity = impurity\n self.node_ids = node_ids\n self.proportion = proportion\n self.rounded = rounded\n self.precision = precision\n self.fontsize = fontsize" }, { @@ -162367,7 +175311,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "value", @@ -162377,13 +175322,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef get_color(self, value):\n if self.colors['bounds'] is None:\n color = list(self.colors['rgb'][np.argmax(value)])\n sorted_values = sorted(value, reverse=True)\n if len(sorted_values) == 1:\n alpha = 0\n else:\n alpha = (sorted_values[0] - sorted_values[1]) / (1 - sorted_values[1])\n else:\n color = list(self.colors['rgb'][0])\n alpha = (value - self.colors['bounds'][0]) / (self.colors['bounds'][1] - self.colors['bounds'][0])\n alpha = float(alpha)\n color = [int(round(alpha * c + (1 - alpha) * 255, 0)) for c in color]\n return '#%2x%2x%2x' % tuple(color)" }, { @@ -162401,7 +175347,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "tree", @@ -162411,7 +175358,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "node_id", @@ -162421,13 +175369,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef get_fill_color(self, tree, node_id):\n if 'rgb' not in self.colors:\n self.colors['rgb'] = _color_brew(tree.n_classes[0])\n if tree.n_outputs != 1:\n self.colors['bounds'] = (np.min(-tree.impurity), np.max(-tree.impurity))\n elif tree.n_classes[0] == 1 and len(np.unique(tree.value)) != 1:\n self.colors['bounds'] = (np.min(tree.value), np.max(tree.value))\n if tree.n_outputs == 1:\n node_val = tree.value[node_id][0, :] / tree.weighted_n_node_samples[node_id]\n if tree.n_classes[0] == 1:\n node_val = tree.value[node_id][0, :]\n else:\n node_val = -tree.impurity[node_id]\n return self.get_color(node_val)" }, { @@ -162445,7 +175394,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "tree", @@ -162455,7 +175405,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "node_id", @@ -162465,7 +175416,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "criterion", @@ -162475,13 +175427,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef node_to_str(self, tree, node_id, criterion):\n if tree.n_outputs == 1:\n value = tree.value[node_id][0, :]\n else:\n value = tree.value[node_id]\n labels = self.label == 'root' and node_id == 0 or self.label == 'all'\n characters = self.characters\n node_string = characters[-1]\n if self.node_ids:\n if labels:\n node_string += 'node '\n node_string += characters[0] + str(node_id) + characters[4]\n if tree.children_left[node_id] != _tree.TREE_LEAF:\n if self.feature_names is not None:\n feature = self.feature_names[tree.feature[node_id]]\n else:\n feature = 'X%s%s%s' % 
(characters[1], tree.feature[node_id], characters[2])\n node_string += '%s %s %s%s' % (feature, characters[3], round(tree.threshold[node_id], self.precision), characters[4])\n if self.impurity:\n if isinstance(criterion, _criterion.FriedmanMSE):\n criterion = 'friedman_mse'\n elif isinstance(criterion, _criterion.MSE) or criterion == 'squared_error':\n criterion = 'squared_error'\n elif not isinstance(criterion, str):\n criterion = 'impurity'\n if labels:\n node_string += '%s = ' % criterion\n node_string += str(round(tree.impurity[node_id], self.precision)) + characters[4]\n if labels:\n node_string += 'samples = '\n if self.proportion:\n percent = 100.0 * tree.n_node_samples[node_id] / float(tree.n_node_samples[0])\n node_string += str(round(percent, 1)) + '%' + characters[4]\n else:\n node_string += str(tree.n_node_samples[node_id]) + characters[4]\n if self.proportion and tree.n_classes[0] != 1:\n value = value / tree.weighted_n_node_samples[node_id]\n if labels:\n node_string += 'value = '\n if tree.n_classes[0] == 1:\n value_text = np.around(value, self.precision)\n elif self.proportion:\n value_text = np.around(value, self.precision)\n elif np.all(np.equal(np.mod(value, 1), 0)):\n value_text = value.astype(int)\n else:\n value_text = np.around(value, self.precision)\n value_text = str(value_text.astype('S32')).replace(\"b'\", \"'\")\n value_text = value_text.replace(\"' '\", ', ').replace(\"'\", '')\n if tree.n_classes[0] == 1 and tree.n_outputs == 1:\n value_text = value_text.replace('[', '').replace(']', '')\n value_text = value_text.replace('\\n ', characters[4])\n node_string += value_text + characters[4]\n if self.class_names is not None and tree.n_classes[0] != 1 and tree.n_outputs == 1:\n if labels:\n node_string += 'class = '\n if self.class_names is not True:\n class_name = self.class_names[np.argmax(value)]\n else:\n class_name = 'y%s%s%s' % (characters[1], np.argmax(value), characters[2])\n node_string += class_name\n if node_string.endswith(characters[4]):\n node_string = node_string[:-len(characters[4])]\n return node_string + characters[5]" }, { @@ -162499,7 +175452,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "out_file", @@ -162509,7 +175463,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "max_depth", @@ -162519,7 +175474,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "feature_names", @@ -162529,7 +175485,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "class_names", @@ -162539,7 +175496,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "label", @@ -162549,7 +175507,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "filled", @@ -162559,7 +175518,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "leaves_parallel", @@ -162569,7 +175529,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "impurity", @@ -162579,7 +175540,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "node_ids", @@ -162589,7 +175551,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "proportion", @@ -162599,7 +175562,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "rotate", @@ -162609,7 +175573,8 @@ "docstring": { "type": "", 
"description": "" - } + }, + "refined_type": {} }, { "name": "rounded", @@ -162619,7 +175584,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "special_characters", @@ -162629,7 +175595,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "precision", @@ -162639,7 +175606,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "fontname", @@ -162649,13 +175617,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, out_file=SENTINEL, max_depth=None, feature_names=None, class_names=None, label='all', filled=False, leaves_parallel=False, impurity=True, node_ids=False, proportion=False, rotate=False, rounded=False, special_characters=False, precision=3, fontname='helvetica'):\n super().__init__(max_depth=max_depth, feature_names=feature_names, class_names=class_names, label=label, filled=filled, impurity=impurity, node_ids=node_ids, proportion=proportion, rounded=rounded, precision=precision)\n self.leaves_parallel = leaves_parallel\n self.out_file = out_file\n self.special_characters = special_characters\n self.fontname = fontname\n self.rotate = rotate\n if special_characters:\n self.characters = ['#', '', '', '≤', '
', '>', '<']\n else:\n self.characters = ['#', '[', ']', '<=', '\\\\n', '\"', '\"']\n if isinstance(precision, Integral):\n if precision < 0:\n raise ValueError(\"'precision' should be greater or equal to 0. Got {} instead.\".format(precision))\n else:\n raise ValueError(\"'precision' should be an integer. Got {} instead.\".format(type(precision)))\n self.ranks = {'leaves': []}\n self.colors = {'bounds': None}" }, { @@ -162673,7 +175642,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "decision_tree", @@ -162683,13 +175653,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef export(self, decision_tree):\n if self.feature_names is not None:\n if len(self.feature_names) != decision_tree.n_features_in_:\n raise ValueError('Length of feature_names, %d does not match number of features, %d' % (len(self.feature_names), decision_tree.n_features_in_))\n self.head()\n if isinstance(decision_tree, _tree.Tree):\n self.recurse(decision_tree, 0, criterion='impurity')\n else:\n self.recurse(decision_tree.tree_, 0, criterion=decision_tree.criterion)\n self.tail()" }, { @@ -162707,13 +175678,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef head(self):\n self.out_file.write('digraph Tree {\\n')\n self.out_file.write('node [shape=box')\n rounded_filled = []\n if self.filled:\n rounded_filled.append('filled')\n if self.rounded:\n rounded_filled.append('rounded')\n if len(rounded_filled) > 0:\n self.out_file.write(', style=\"%s\", color=\"black\"' % ', '.join(rounded_filled))\n self.out_file.write(', fontname=\"%s\"' % self.fontname)\n self.out_file.write('] ;\\n')\n if self.leaves_parallel:\n self.out_file.write('graph [ranksep=equally, splines=polyline] ;\\n')\n self.out_file.write('edge [fontname=\"%s\"] ;\\n' % self.fontname)\n if self.rotate:\n self.out_file.write('rankdir=LR ;\\n')" }, { @@ -162731,7 +175703,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "tree", @@ -162741,7 +175714,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "node_id", @@ -162751,7 +175725,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "criterion", @@ -162761,7 +175736,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "parent", @@ -162771,7 +175747,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "depth", @@ -162781,13 +175758,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef recurse(self, tree, node_id, criterion, parent=None, depth=0):\n if node_id == _tree.TREE_LEAF:\n raise ValueError('Invalid node_id %s' % _tree.TREE_LEAF)\n left_child = tree.children_left[node_id]\n right_child = tree.children_right[node_id]\n if self.max_depth is None or depth <= self.max_depth:\n if left_child == _tree.TREE_LEAF:\n self.ranks['leaves'].append(str(node_id))\n elif str(depth) not in self.ranks:\n self.ranks[str(depth)] = [str(node_id)]\n else:\n self.ranks[str(depth)].append(str(node_id))\n self.out_file.write('%d [label=%s' % 
(node_id, self.node_to_str(tree, node_id, criterion)))\n if self.filled:\n self.out_file.write(', fillcolor=\"%s\"' % self.get_fill_color(tree, node_id))\n self.out_file.write('] ;\\n')\n if parent is not None:\n self.out_file.write('%d -> %d' % (parent, node_id))\n if parent == 0:\n angles = np.array([45, -45]) * ((self.rotate - 0.5) * -2)\n self.out_file.write(' [labeldistance=2.5, labelangle=')\n if node_id == 1:\n self.out_file.write('%d, headlabel=\"True\"]' % angles[0])\n else:\n self.out_file.write('%d, headlabel=\"False\"]' % angles[1])\n self.out_file.write(' ;\\n')\n if left_child != _tree.TREE_LEAF:\n self.recurse(tree, left_child, criterion=criterion, parent=node_id, depth=depth + 1)\n self.recurse(tree, right_child, criterion=criterion, parent=node_id, depth=depth + 1)\n else:\n self.ranks['leaves'].append(str(node_id))\n self.out_file.write('%d [label=\"(...)\"' % node_id)\n if self.filled:\n self.out_file.write(', fillcolor=\"#C0C0C0\"')\n self.out_file.write('] ;\\n' % node_id)\n if parent is not None:\n self.out_file.write('%d -> %d ;\\n' % (parent, node_id))" }, { @@ -162805,13 +175783,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef tail(self):\n if self.leaves_parallel:\n for rank in sorted(self.ranks):\n self.out_file.write('{rank=same ; ' + '; '.join((r for r in self.ranks[rank])) + '} ;\\n')\n self.out_file.write('}')" }, { @@ -162829,7 +175808,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "max_depth", @@ -162839,7 +175819,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "feature_names", @@ -162849,7 +175830,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "class_names", @@ -162859,7 +175841,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "label", @@ -162869,7 +175852,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "filled", @@ -162879,7 +175863,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "impurity", @@ -162889,7 +175874,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "node_ids", @@ -162899,7 +175885,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "proportion", @@ -162909,7 +175896,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "rounded", @@ -162919,7 +175907,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "precision", @@ -162929,7 +175918,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "fontsize", @@ -162939,13 +175929,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, max_depth=None, feature_names=None, class_names=None, label='all', filled=False, impurity=True, node_ids=False, proportion=False, rounded=False, precision=3, fontsize=None):\n super().__init__(max_depth=max_depth, feature_names=feature_names, class_names=class_names, label=label, filled=filled, impurity=impurity, node_ids=node_ids, proportion=proportion, rounded=rounded, precision=precision)\n 
self.fontsize = fontsize\n if isinstance(precision, Integral):\n if precision < 0:\n raise ValueError(\"'precision' should be greater or equal to 0. Got {} instead.\".format(precision))\n else:\n raise ValueError(\"'precision' should be an integer. Got {} instead.\".format(type(precision)))\n self.ranks = {'leaves': []}\n self.colors = {'bounds': None}\n self.characters = ['#', '[', ']', '<=', '\\n', '', '']\n self.bbox_args = dict()\n if self.rounded:\n self.bbox_args['boxstyle'] = 'round'\n self.arrow_args = dict(arrowstyle='<-')" }, { @@ -162963,7 +175954,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "node_id", @@ -162973,7 +175965,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "et", @@ -162983,7 +175976,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "criterion", @@ -162993,7 +175987,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "depth", @@ -163003,13 +175998,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _make_tree(self, node_id, et, criterion, depth=0):\n name = self.node_to_str(et, node_id, criterion=criterion)\n if et.children_left[node_id] != _tree.TREE_LEAF and (self.max_depth is None or depth <= self.max_depth):\n children = [self._make_tree(et.children_left[node_id], et, criterion, depth=depth + 1), self._make_tree(et.children_right[node_id], et, criterion, depth=depth + 1)]\n else:\n return Tree(name, node_id)\n return Tree(name, node_id, *children)" }, { @@ -163027,7 +176023,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "decision_tree", @@ -163037,7 +176034,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "ax", @@ -163047,14 +176045,15 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", - "source_code": "\ndef export(self, decision_tree, ax=None):\n import matplotlib.pyplot as plt\n from matplotlib.text import Annotation\n if ax is None:\n ax = plt.gca()\n ax.clear()\n ax.set_axis_off()\n my_tree = self._make_tree(0, decision_tree.tree_, decision_tree.criterion)\n draw_tree = buchheim(my_tree)\n (max_x, max_y) = draw_tree.max_extents() + 1\n ax_width = ax.get_window_extent().width\n ax_height = ax.get_window_extent().height\n scale_x = ax_width / max_x\n scale_y = ax_height / max_y\n self.recurse(draw_tree, decision_tree.tree_, ax, scale_x, scale_y, ax_height)\n anns = [ann for ann in ax.get_children() if isinstance(ann, Annotation)]\n renderer = ax.figure.canvas.get_renderer()\n for ann in anns:\n ann.update_bbox_position_size(renderer)\n if self.fontsize is None:\n extents = [ann.get_bbox_patch().get_window_extent() for ann in anns]\n max_width = max([extent.width for extent in extents])\n max_height = max([extent.height for extent in extents])\n size = anns[0].get_fontsize() * min(scale_x / max_width, scale_y / max_height)\n for ann in anns:\n ann.set_fontsize(size)\n return anns" + "docstring": null, + "source_code": "\ndef export(self, decision_tree, ax=None):\n import matplotlib.pyplot as plt\n from matplotlib.text import Annotation\n if ax is None:\n ax = plt.gca()\n ax.clear()\n ax.set_axis_off()\n my_tree = self._make_tree(0, 
decision_tree.tree_, decision_tree.criterion)\n draw_tree = buchheim(my_tree)\n (max_x, max_y) = draw_tree.max_extents() + 1\n ax_width = ax.get_window_extent().width\n ax_height = ax.get_window_extent().height\n scale_x = ax_width / max_x\n scale_y = ax_height / max_y\n self.recurse(draw_tree, decision_tree.tree_, ax, max_x, max_y)\n anns = [ann for ann in ax.get_children() if isinstance(ann, Annotation)]\n renderer = ax.figure.canvas.get_renderer()\n for ann in anns:\n ann.update_bbox_position_size(renderer)\n if self.fontsize is None:\n extents = [ann.get_bbox_patch().get_window_extent() for ann in anns]\n max_width = max([extent.width for extent in extents])\n max_height = max([extent.height for extent in extents])\n size = anns[0].get_fontsize() * min(scale_x / max_width, scale_y / max_height)\n for ann in anns:\n ann.set_fontsize(size)\n return anns" }, { "name": "recurse", @@ -163071,7 +176070,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "node", @@ -163081,7 +176081,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "tree", @@ -163091,7 +176092,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "ax", @@ -163101,37 +176103,30 @@ "docstring": { "type": "", "description": "" - } - }, - { - "name": "scale_x", - "default_value": null, - "is_public": false, - "assigned_by": "POSITION_OR_NAME", - "docstring": { - "type": "", - "description": "" - } + }, + "refined_type": {} }, { - "name": "scale_y", + "name": "max_x", "default_value": null, "is_public": false, "assigned_by": "POSITION_OR_NAME", "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { - "name": "height", + "name": "max_y", "default_value": null, "is_public": false, "assigned_by": "POSITION_OR_NAME", "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "depth", @@ -163141,14 +176136,15 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", - "source_code": "\ndef recurse(self, node, tree, ax, scale_x, scale_y, height, depth=0):\n import matplotlib.pyplot as plt\n kwargs = dict(bbox=self.bbox_args.copy(), ha='center', va='center', zorder=100 - 10 * depth, xycoords='axes points', arrowprops=self.arrow_args.copy())\n kwargs['arrowprops']['edgecolor'] = plt.rcParams['text.color']\n if self.fontsize is not None:\n kwargs['fontsize'] = self.fontsize\n xy = ((node.x + 0.5) * scale_x, height - (node.y + 0.5) * scale_y)\n if self.max_depth is None or depth <= self.max_depth:\n if self.filled:\n kwargs['bbox']['fc'] = self.get_fill_color(tree, node.tree.node_id)\n else:\n kwargs['bbox']['fc'] = ax.get_facecolor()\n if node.parent is None:\n ax.annotate(node.tree.label, xy, **kwargs)\n else:\n xy_parent = ((node.parent.x + 0.5) * scale_x, height - (node.parent.y + 0.5) * scale_y)\n ax.annotate(node.tree.label, xy_parent, xy, **kwargs)\n for child in node.children:\n self.recurse(child, tree, ax, scale_x, scale_y, height, depth=depth + 1)\n else:\n xy_parent = ((node.parent.x + 0.5) * scale_x, height - (node.parent.y + 0.5) * scale_y)\n kwargs['bbox']['fc'] = 'grey'\n ax.annotate('\\n (...) 
\\n', xy_parent, xy, **kwargs)" + "docstring": null, + "source_code": "\ndef recurse(self, node, tree, ax, max_x, max_y, depth=0):\n import matplotlib.pyplot as plt\n kwargs = dict(bbox=self.bbox_args.copy(), ha='center', va='center', zorder=100 - 10 * depth, xycoords='axes fraction', arrowprops=self.arrow_args.copy())\n kwargs['arrowprops']['edgecolor'] = plt.rcParams['text.color']\n if self.fontsize is not None:\n kwargs['fontsize'] = self.fontsize\n xy = ((node.x + 0.5) / max_x, (max_y - node.y - 0.5) / max_y)\n if self.max_depth is None or depth <= self.max_depth:\n if self.filled:\n kwargs['bbox']['fc'] = self.get_fill_color(tree, node.tree.node_id)\n else:\n kwargs['bbox']['fc'] = ax.get_facecolor()\n if node.parent is None:\n ax.annotate(node.tree.label, xy, **kwargs)\n else:\n xy_parent = ((node.parent.x + 0.5) / max_x, (max_y - node.parent.y - 0.5) / max_y)\n ax.annotate(node.tree.label, xy_parent, xy, **kwargs)\n for child in node.children:\n self.recurse(child, tree, ax, max_x, max_y, depth=depth + 1)\n else:\n xy_parent = ((node.parent.x + 0.5) / max_x, (max_y - node.parent.y - 0.5) / max_y)\n kwargs['bbox']['fc'] = 'grey'\n ax.annotate('\\n (...) \\n', xy_parent, xy, **kwargs)" }, { "name": "_color_brew", @@ -163165,13 +176161,14 @@ "docstring": { "type": "int", "description": "The number of colors required." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Generate n colors with equally spaced hues.", - "docstring": "Generate n colors with equally spaced hues.\n\nParameters\n----------\nn : int\n The number of colors required.\n\nReturns\n-------\ncolor_list : list, length n\n List of n tuples of form (R, G, B) being the components of each color.", + "docstring": "Generate n colors with equally spaced hues.\n\n Parameters\n ----------\n n : int\n The number of colors required.\n\n Returns\n -------\n color_list : list, length n\n List of n tuples of form (R, G, B) being the components of each color.\n ", "source_code": "\ndef _color_brew(n):\n \"\"\"Generate n colors with equally spaced hues.\n\n Parameters\n ----------\n n : int\n The number of colors required.\n\n Returns\n -------\n color_list : list, length n\n List of n tuples of form (R, G, B) being the components of each color.\n \"\"\"\n color_list = []\n (s, v) = (0.75, 0.9)\n c = s * v\n m = v - c\n for h in np.arange(25, 385, 360.0 / n).astype(int):\n h_bar = h / 60.0\n x = c * (1 - abs(h_bar % 2 - 1))\n rgb = [(c, x, 0), (x, c, 0), (0, c, x), (0, x, c), (x, 0, c), (c, 0, x), (c, x, 0)]\n (r, g, b) = rgb[int(h_bar)]\n rgb = [int(255 * (r + m)), int(255 * (g + m)), int(255 * (b + m))]\n color_list.append(rgb)\n return color_list" }, { @@ -163189,7 +176186,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "node", @@ -163199,13 +176197,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Returns the depth of the subtree rooted in node.", - "docstring": "Returns the depth of the subtree rooted in node.", + "docstring": "\n Returns the depth of the subtree rooted in node.\n ", "source_code": "\ndef _compute_depth(tree, node):\n \"\"\"\n Returns the depth of the subtree rooted in node.\n \"\"\"\n \n def compute_depth_(current_node, current_depth, children_left, children_right, depths):\n depths += [current_depth]\n left = children_left[current_node]\n right = children_right[current_node]\n if left != -1 and right != -1:\n compute_depth_(left, current_depth + 
1, children_left, children_right, depths)\n compute_depth_(right, current_depth + 1, children_left, children_right, depths)\n depths = []\n compute_depth_(node, 1, tree.children_left, tree.children_right, depths)\n return max(depths)" }, { @@ -163223,7 +176222,8 @@ "docstring": { "type": "decision tree classifier", "description": "The decision tree to be exported to GraphViz." - } + }, + "refined_type": {} }, { "name": "out_file", @@ -163233,7 +176233,8 @@ "docstring": { "type": "object or str, default=None", "description": "Handle or name of the output file. If ``None``, the result is\nreturned as a string.\n\n.. versionchanged:: 0.20\n Default of out_file changed from \"tree.dot\" to None." - } + }, + "refined_type": {} }, { "name": "max_depth", @@ -163243,7 +176244,8 @@ "docstring": { "type": "int, default=None", "description": "The maximum depth of the representation. If None, the tree is fully\ngenerated." - } + }, + "refined_type": {} }, { "name": "feature_names", @@ -163253,7 +176255,8 @@ "docstring": { "type": "list of str, default=None", "description": "Names of each of the features.\nIf None generic names will be used (\"feature_0\", \"feature_1\", ...)." - } + }, + "refined_type": {} }, { "name": "class_names", @@ -163263,7 +176266,8 @@ "docstring": { "type": "list of str or bool, default=None", "description": "Names of each of the target classes in ascending numerical order.\nOnly relevant for classification and not supported for multi-output.\nIf ``True``, shows a symbolic representation of the class name." - } + }, + "refined_type": {} }, { "name": "label", @@ -163273,6 +176277,10 @@ "docstring": { "type": "{'all', 'root', 'none'}, default='all'", "description": "Whether to show informative labels for impurity, etc.\nOptions include 'all' to show at every node, 'root' to show only at\nthe top root node, or 'none' to not show at any node." + }, + "refined_type": { + "kind": "EnumType", + "values": ["none", "root", "all"] } }, { @@ -163283,7 +176291,8 @@ "docstring": { "type": "bool, default=False", "description": "When set to ``True``, paint nodes to indicate majority class for\nclassification, extremity of values for regression, or purity of node\nfor multi-output." - } + }, + "refined_type": {} }, { "name": "leaves_parallel", @@ -163293,7 +176302,8 @@ "docstring": { "type": "bool, default=False", "description": "When set to ``True``, draw all leaf nodes at the bottom of the tree." - } + }, + "refined_type": {} }, { "name": "impurity", @@ -163303,7 +176313,8 @@ "docstring": { "type": "bool, default=True", "description": "When set to ``True``, show the impurity at each node." - } + }, + "refined_type": {} }, { "name": "node_ids", @@ -163313,7 +176324,8 @@ "docstring": { "type": "bool, default=False", "description": "When set to ``True``, show the ID number on each node." - } + }, + "refined_type": {} }, { "name": "proportion", @@ -163323,7 +176335,8 @@ "docstring": { "type": "bool, default=False", "description": "When set to ``True``, change the display of 'values' and/or 'samples'\nto be proportions and percentages respectively." - } + }, + "refined_type": {} }, { "name": "rotate", @@ -163333,7 +176346,8 @@ "docstring": { "type": "bool, default=False", "description": "When set to ``True``, orient tree left to right rather than top-down." - } + }, + "refined_type": {} }, { "name": "rounded", @@ -163343,7 +176357,8 @@ "docstring": { "type": "bool, default=False", "description": "When set to ``True``, draw node boxes with rounded corners." 
- } + }, + "refined_type": {} }, { "name": "special_characters", @@ -163353,7 +176368,8 @@ "docstring": { "type": "bool, default=False", "description": "When set to ``False``, ignore special characters for PostScript\ncompatibility." - } + }, + "refined_type": {} }, { "name": "precision", @@ -163363,7 +176379,8 @@ "docstring": { "type": "int, default=3", "description": "Number of digits of precision for floating point in the values of\nimpurity, threshold and value attributes of each node." - } + }, + "refined_type": {} }, { "name": "fontname", @@ -163373,13 +176390,14 @@ "docstring": { "type": "str, default='helvetica'", "description": "Name of font used to render text." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Export a decision tree in DOT format.\n\nThis function generates a GraphViz representation of the decision tree, which is then written into `out_file`. Once exported, graphical renderings can be generated using, for example:: $ dot -Tps tree.dot -o tree.ps (PostScript format) $ dot -Tpng tree.dot -o tree.png (PNG format) The sample counts that are shown are weighted with any sample_weights that might be present. Read more in the :ref:`User Guide `.", - "docstring": "Export a decision tree in DOT format.\n\nThis function generates a GraphViz representation of the decision tree,\nwhich is then written into `out_file`. Once exported, graphical renderings\ncan be generated using, for example::\n\n $ dot -Tps tree.dot -o tree.ps (PostScript format)\n $ dot -Tpng tree.dot -o tree.png (PNG format)\n\nThe sample counts that are shown are weighted with any sample_weights that\nmight be present.\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\ndecision_tree : decision tree classifier\n The decision tree to be exported to GraphViz.\n\nout_file : object or str, default=None\n Handle or name of the output file. If ``None``, the result is\n returned as a string.\n\n .. versionchanged:: 0.20\n Default of out_file changed from \"tree.dot\" to None.\n\nmax_depth : int, default=None\n The maximum depth of the representation. 
If None, the tree is fully\n generated.\n\nfeature_names : list of str, default=None\n Names of each of the features.\n If None generic names will be used (\"feature_0\", \"feature_1\", ...).\n\nclass_names : list of str or bool, default=None\n Names of each of the target classes in ascending numerical order.\n Only relevant for classification and not supported for multi-output.\n If ``True``, shows a symbolic representation of the class name.\n\nlabel : {'all', 'root', 'none'}, default='all'\n Whether to show informative labels for impurity, etc.\n Options include 'all' to show at every node, 'root' to show only at\n the top root node, or 'none' to not show at any node.\n\nfilled : bool, default=False\n When set to ``True``, paint nodes to indicate majority class for\n classification, extremity of values for regression, or purity of node\n for multi-output.\n\nleaves_parallel : bool, default=False\n When set to ``True``, draw all leaf nodes at the bottom of the tree.\n\nimpurity : bool, default=True\n When set to ``True``, show the impurity at each node.\n\nnode_ids : bool, default=False\n When set to ``True``, show the ID number on each node.\n\nproportion : bool, default=False\n When set to ``True``, change the display of 'values' and/or 'samples'\n to be proportions and percentages respectively.\n\nrotate : bool, default=False\n When set to ``True``, orient tree left to right rather than top-down.\n\nrounded : bool, default=False\n When set to ``True``, draw node boxes with rounded corners.\n\nspecial_characters : bool, default=False\n When set to ``False``, ignore special characters for PostScript\n compatibility.\n\nprecision : int, default=3\n Number of digits of precision for floating point in the values of\n impurity, threshold and value attributes of each node.\n\nfontname : str, default='helvetica'\n Name of font used to render text.\n\nReturns\n-------\ndot_data : str\n String representation of the input tree in GraphViz dot format.\n Only returned if ``out_file`` is None.\n\n .. versionadded:: 0.18\n\nExamples\n--------\n>>> from sklearn.datasets import load_iris\n>>> from sklearn import tree\n\n>>> clf = tree.DecisionTreeClassifier()\n>>> iris = load_iris()\n\n>>> clf = clf.fit(iris.data, iris.target)\n>>> tree.export_graphviz(clf)\n'digraph Tree {...", + "description": "Export a decision tree in DOT format.\n\nThis function generates a GraphViz representation of the decision tree,\nwhich is then written into `out_file`. Once exported, graphical renderings\ncan be generated using, for example::\n\n $ dot -Tps tree.dot -o tree.ps (PostScript format)\n $ dot -Tpng tree.dot -o tree.png (PNG format)\n\nThe sample counts that are shown are weighted with any sample_weights that\nmight be present.\n\nRead more in the :ref:`User Guide `.", + "docstring": "Export a decision tree in DOT format.\n\n This function generates a GraphViz representation of the decision tree,\n which is then written into `out_file`. Once exported, graphical renderings\n can be generated using, for example::\n\n $ dot -Tps tree.dot -o tree.ps (PostScript format)\n $ dot -Tpng tree.dot -o tree.png (PNG format)\n\n The sample counts that are shown are weighted with any sample_weights that\n might be present.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n decision_tree : decision tree classifier\n The decision tree to be exported to GraphViz.\n\n out_file : object or str, default=None\n Handle or name of the output file. If ``None``, the result is\n returned as a string.\n\n .. 
versionchanged:: 0.20\n Default of out_file changed from \"tree.dot\" to None.\n\n max_depth : int, default=None\n The maximum depth of the representation. If None, the tree is fully\n generated.\n\n feature_names : list of str, default=None\n Names of each of the features.\n If None generic names will be used (\"feature_0\", \"feature_1\", ...).\n\n class_names : list of str or bool, default=None\n Names of each of the target classes in ascending numerical order.\n Only relevant for classification and not supported for multi-output.\n If ``True``, shows a symbolic representation of the class name.\n\n label : {'all', 'root', 'none'}, default='all'\n Whether to show informative labels for impurity, etc.\n Options include 'all' to show at every node, 'root' to show only at\n the top root node, or 'none' to not show at any node.\n\n filled : bool, default=False\n When set to ``True``, paint nodes to indicate majority class for\n classification, extremity of values for regression, or purity of node\n for multi-output.\n\n leaves_parallel : bool, default=False\n When set to ``True``, draw all leaf nodes at the bottom of the tree.\n\n impurity : bool, default=True\n When set to ``True``, show the impurity at each node.\n\n node_ids : bool, default=False\n When set to ``True``, show the ID number on each node.\n\n proportion : bool, default=False\n When set to ``True``, change the display of 'values' and/or 'samples'\n to be proportions and percentages respectively.\n\n rotate : bool, default=False\n When set to ``True``, orient tree left to right rather than top-down.\n\n rounded : bool, default=False\n When set to ``True``, draw node boxes with rounded corners.\n\n special_characters : bool, default=False\n When set to ``False``, ignore special characters for PostScript\n compatibility.\n\n precision : int, default=3\n Number of digits of precision for floating point in the values of\n impurity, threshold and value attributes of each node.\n\n fontname : str, default='helvetica'\n Name of font used to render text.\n\n Returns\n -------\n dot_data : str\n String representation of the input tree in GraphViz dot format.\n Only returned if ``out_file`` is None.\n\n .. versionadded:: 0.18\n\n Examples\n --------\n >>> from sklearn.datasets import load_iris\n >>> from sklearn import tree\n\n >>> clf = tree.DecisionTreeClassifier()\n >>> iris = load_iris()\n\n >>> clf = clf.fit(iris.data, iris.target)\n >>> tree.export_graphviz(clf)\n 'digraph Tree {...\n ", "source_code": "\ndef export_graphviz(decision_tree, out_file=None, *, max_depth=None, feature_names=None, class_names=None, label='all', filled=False, leaves_parallel=False, impurity=True, node_ids=False, proportion=False, rotate=False, rounded=False, special_characters=False, precision=3, fontname='helvetica'):\n \"\"\"Export a decision tree in DOT format.\n\n This function generates a GraphViz representation of the decision tree,\n which is then written into `out_file`. Once exported, graphical renderings\n can be generated using, for example::\n\n $ dot -Tps tree.dot -o tree.ps (PostScript format)\n $ dot -Tpng tree.dot -o tree.png (PNG format)\n\n The sample counts that are shown are weighted with any sample_weights that\n might be present.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n decision_tree : decision tree classifier\n The decision tree to be exported to GraphViz.\n\n out_file : object or str, default=None\n Handle or name of the output file. If ``None``, the result is\n returned as a string.\n\n .. 
versionchanged:: 0.20\n Default of out_file changed from \"tree.dot\" to None.\n\n max_depth : int, default=None\n The maximum depth of the representation. If None, the tree is fully\n generated.\n\n feature_names : list of str, default=None\n Names of each of the features.\n If None generic names will be used (\"feature_0\", \"feature_1\", ...).\n\n class_names : list of str or bool, default=None\n Names of each of the target classes in ascending numerical order.\n Only relevant for classification and not supported for multi-output.\n If ``True``, shows a symbolic representation of the class name.\n\n label : {'all', 'root', 'none'}, default='all'\n Whether to show informative labels for impurity, etc.\n Options include 'all' to show at every node, 'root' to show only at\n the top root node, or 'none' to not show at any node.\n\n filled : bool, default=False\n When set to ``True``, paint nodes to indicate majority class for\n classification, extremity of values for regression, or purity of node\n for multi-output.\n\n leaves_parallel : bool, default=False\n When set to ``True``, draw all leaf nodes at the bottom of the tree.\n\n impurity : bool, default=True\n When set to ``True``, show the impurity at each node.\n\n node_ids : bool, default=False\n When set to ``True``, show the ID number on each node.\n\n proportion : bool, default=False\n When set to ``True``, change the display of 'values' and/or 'samples'\n to be proportions and percentages respectively.\n\n rotate : bool, default=False\n When set to ``True``, orient tree left to right rather than top-down.\n\n rounded : bool, default=False\n When set to ``True``, draw node boxes with rounded corners.\n\n special_characters : bool, default=False\n When set to ``False``, ignore special characters for PostScript\n compatibility.\n\n precision : int, default=3\n Number of digits of precision for floating point in the values of\n impurity, threshold and value attributes of each node.\n\n fontname : str, default='helvetica'\n Name of font used to render text.\n\n Returns\n -------\n dot_data : str\n String representation of the input tree in GraphViz dot format.\n Only returned if ``out_file`` is None.\n\n .. versionadded:: 0.18\n\n Examples\n --------\n >>> from sklearn.datasets import load_iris\n >>> from sklearn import tree\n\n >>> clf = tree.DecisionTreeClassifier()\n >>> iris = load_iris()\n\n >>> clf = clf.fit(iris.data, iris.target)\n >>> tree.export_graphviz(clf)\n 'digraph Tree {...\n \"\"\"\n check_is_fitted(decision_tree)\n own_file = False\n return_string = False\n try:\n if isinstance(out_file, str):\n out_file = open(out_file, 'w', encoding='utf-8')\n own_file = True\n if out_file is None:\n return_string = True\n out_file = StringIO()\n exporter = _DOTTreeExporter(out_file=out_file, max_depth=max_depth, feature_names=feature_names, class_names=class_names, label=label, filled=filled, leaves_parallel=leaves_parallel, impurity=impurity, node_ids=node_ids, proportion=proportion, rotate=rotate, rounded=rounded, special_characters=special_characters, precision=precision, fontname=fontname)\n exporter.export(decision_tree)\n if return_string:\n return exporter.out_file.getvalue()\n finally:\n if own_file:\n out_file.close()" }, { @@ -163397,7 +176415,8 @@ "docstring": { "type": "object", "description": "The decision tree estimator to be exported.\nIt can be an instance of\nDecisionTreeClassifier or DecisionTreeRegressor." 
- } + }, + "refined_type": {} }, { "name": "feature_names", @@ -163407,7 +176426,8 @@ "docstring": { "type": "list of str, default=None", "description": "A list of length n_features containing the feature names.\nIf None generic names will be used (\"feature_0\", \"feature_1\", ...)." - } + }, + "refined_type": {} }, { "name": "max_depth", @@ -163417,7 +176437,8 @@ "docstring": { "type": "int, default=10", "description": "Only the first max_depth levels of the tree are exported.\nTruncated branches will be marked with \"...\"." - } + }, + "refined_type": {} }, { "name": "spacing", @@ -163427,7 +176448,8 @@ "docstring": { "type": "int, default=3", "description": "Number of spaces between edges. The higher it is, the wider the result." - } + }, + "refined_type": {} }, { "name": "decimals", @@ -163437,7 +176459,8 @@ "docstring": { "type": "int, default=2", "description": "Number of decimal digits to display." - } + }, + "refined_type": {} }, { "name": "show_weights", @@ -163447,13 +176470,14 @@ "docstring": { "type": "bool, default=False", "description": "If true the classification weights will be exported on each leaf.\nThe classification weights are the number of samples each class." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Build a text report showing the rules of a decision tree.\n\nNote that backwards compatibility may not be supported.", - "docstring": "Build a text report showing the rules of a decision tree.\n\nNote that backwards compatibility may not be supported.\n\nParameters\n----------\ndecision_tree : object\n The decision tree estimator to be exported.\n It can be an instance of\n DecisionTreeClassifier or DecisionTreeRegressor.\n\nfeature_names : list of str, default=None\n A list of length n_features containing the feature names.\n If None generic names will be used (\"feature_0\", \"feature_1\", ...).\n\nmax_depth : int, default=10\n Only the first max_depth levels of the tree are exported.\n Truncated branches will be marked with \"...\".\n\nspacing : int, default=3\n Number of spaces between edges. 
The higher it is, the wider the result.\n\ndecimals : int, default=2\n Number of decimal digits to display.\n\nshow_weights : bool, default=False\n If true the classification weights will be exported on each leaf.\n The classification weights are the number of samples each class.\n\nReturns\n-------\nreport : str\n Text summary of all the rules in the decision tree.\n\nExamples\n--------\n\n>>> from sklearn.datasets import load_iris\n>>> from sklearn.tree import DecisionTreeClassifier\n>>> from sklearn.tree import export_text\n>>> iris = load_iris()\n>>> X = iris['data']\n>>> y = iris['target']\n>>> decision_tree = DecisionTreeClassifier(random_state=0, max_depth=2)\n>>> decision_tree = decision_tree.fit(X, y)\n>>> r = export_text(decision_tree, feature_names=iris['feature_names'])\n>>> print(r)\n|--- petal width (cm) <= 0.80\n| |--- class: 0\n|--- petal width (cm) > 0.80\n| |--- petal width (cm) <= 1.75\n| | |--- class: 1\n| |--- petal width (cm) > 1.75\n| | |--- class: 2", + "docstring": "Build a text report showing the rules of a decision tree.\n\n Note that backwards compatibility may not be supported.\n\n Parameters\n ----------\n decision_tree : object\n The decision tree estimator to be exported.\n It can be an instance of\n DecisionTreeClassifier or DecisionTreeRegressor.\n\n feature_names : list of str, default=None\n A list of length n_features containing the feature names.\n If None generic names will be used (\"feature_0\", \"feature_1\", ...).\n\n max_depth : int, default=10\n Only the first max_depth levels of the tree are exported.\n Truncated branches will be marked with \"...\".\n\n spacing : int, default=3\n Number of spaces between edges. The higher it is, the wider the result.\n\n decimals : int, default=2\n Number of decimal digits to display.\n\n show_weights : bool, default=False\n If true the classification weights will be exported on each leaf.\n The classification weights are the number of samples each class.\n\n Returns\n -------\n report : str\n Text summary of all the rules in the decision tree.\n\n Examples\n --------\n\n >>> from sklearn.datasets import load_iris\n >>> from sklearn.tree import DecisionTreeClassifier\n >>> from sklearn.tree import export_text\n >>> iris = load_iris()\n >>> X = iris['data']\n >>> y = iris['target']\n >>> decision_tree = DecisionTreeClassifier(random_state=0, max_depth=2)\n >>> decision_tree = decision_tree.fit(X, y)\n >>> r = export_text(decision_tree, feature_names=iris['feature_names'])\n >>> print(r)\n |--- petal width (cm) <= 0.80\n | |--- class: 0\n |--- petal width (cm) > 0.80\n | |--- petal width (cm) <= 1.75\n | | |--- class: 1\n | |--- petal width (cm) > 1.75\n | | |--- class: 2\n ", "source_code": "\ndef export_text(decision_tree, *, feature_names=None, max_depth=10, spacing=3, decimals=2, show_weights=False):\n \"\"\"Build a text report showing the rules of a decision tree.\n\n Note that backwards compatibility may not be supported.\n\n Parameters\n ----------\n decision_tree : object\n The decision tree estimator to be exported.\n It can be an instance of\n DecisionTreeClassifier or DecisionTreeRegressor.\n\n feature_names : list of str, default=None\n A list of length n_features containing the feature names.\n If None generic names will be used (\"feature_0\", \"feature_1\", ...).\n\n max_depth : int, default=10\n Only the first max_depth levels of the tree are exported.\n Truncated branches will be marked with \"...\".\n\n spacing : int, default=3\n Number of spaces between edges. 
The higher it is, the wider the result.\n\n decimals : int, default=2\n Number of decimal digits to display.\n\n show_weights : bool, default=False\n If true the classification weights will be exported on each leaf.\n The classification weights are the number of samples each class.\n\n Returns\n -------\n report : str\n Text summary of all the rules in the decision tree.\n\n Examples\n --------\n\n >>> from sklearn.datasets import load_iris\n >>> from sklearn.tree import DecisionTreeClassifier\n >>> from sklearn.tree import export_text\n >>> iris = load_iris()\n >>> X = iris['data']\n >>> y = iris['target']\n >>> decision_tree = DecisionTreeClassifier(random_state=0, max_depth=2)\n >>> decision_tree = decision_tree.fit(X, y)\n >>> r = export_text(decision_tree, feature_names=iris['feature_names'])\n >>> print(r)\n |--- petal width (cm) <= 0.80\n | |--- class: 0\n |--- petal width (cm) > 0.80\n | |--- petal width (cm) <= 1.75\n | | |--- class: 1\n | |--- petal width (cm) > 1.75\n | | |--- class: 2\n \"\"\"\n check_is_fitted(decision_tree)\n tree_ = decision_tree.tree_\n if is_classifier(decision_tree):\n class_names = decision_tree.classes_\n right_child_fmt = '{} {} <= {}\\n'\n left_child_fmt = '{} {} > {}\\n'\n truncation_fmt = '{} {}\\n'\n if max_depth < 0:\n raise ValueError('max_depth bust be >= 0, given %d' % max_depth)\n if feature_names is not None and len(feature_names) != tree_.n_features:\n raise ValueError('feature_names must contain %d elements, got %d' % (tree_.n_features, len(feature_names)))\n if spacing <= 0:\n raise ValueError('spacing must be > 0, given %d' % spacing)\n if decimals < 0:\n raise ValueError('decimals must be >= 0, given %d' % decimals)\n if isinstance(decision_tree, DecisionTreeClassifier):\n value_fmt = '{}{} weights: {}\\n'\n if not show_weights:\n value_fmt = '{}{}{}\\n'\n else:\n value_fmt = '{}{} value: {}\\n'\n if feature_names:\n feature_names_ = [feature_names[i] if i != _tree.TREE_UNDEFINED else None for i in tree_.feature]\n else:\n feature_names_ = ['feature_{}'.format(i) for i in tree_.feature]\n export_text.report = ''\n \n def _add_leaf(value, class_name, indent):\n val = ''\n is_classification = isinstance(decision_tree, DecisionTreeClassifier)\n if show_weights or not is_classification:\n val = ['{1:.{0}f}, '.format(decimals, v) for v in value]\n val = '[' + ''.join(val)[:-2] + ']'\n if is_classification:\n val += ' class: ' + str(class_name)\n export_text.report += value_fmt.format(indent, '', val)\n \n def print_tree_recurse(node, depth):\n indent = ('|' + ' ' * spacing) * depth\n indent = indent[:-spacing] + '-' * spacing\n value = None\n if tree_.n_outputs == 1:\n value = tree_.value[node][0]\n else:\n value = tree_.value[node].T[0]\n class_name = np.argmax(value)\n if tree_.n_classes[0] != 1 and tree_.n_outputs == 1:\n class_name = class_names[class_name]\n if depth <= max_depth + 1:\n info_fmt = ''\n info_fmt_left = info_fmt\n info_fmt_right = info_fmt\n if tree_.feature[node] != _tree.TREE_UNDEFINED:\n name = feature_names_[node]\n threshold = tree_.threshold[node]\n threshold = '{1:.{0}f}'.format(decimals, threshold)\n export_text.report += right_child_fmt.format(indent, name, threshold)\n export_text.report += info_fmt_left\n print_tree_recurse(tree_.children_left[node], depth + 1)\n export_text.report += left_child_fmt.format(indent, name, threshold)\n export_text.report += info_fmt_right\n print_tree_recurse(tree_.children_right[node], depth + 1)\n else:\n _add_leaf(value, class_name, indent)\n else:\n subtree_depth = 
_compute_depth(tree_, node)\n if subtree_depth == 1:\n _add_leaf(value, class_name, indent)\n else:\n trunc_report = 'truncated branch of depth %d' % subtree_depth\n export_text.report += truncation_fmt.format(indent, trunc_report)\n print_tree_recurse(0, 1)\n return export_text.report" }, { @@ -163471,7 +176495,8 @@ "docstring": { "type": "decision tree regressor or classifier", "description": "The decision tree to be plotted." - } + }, + "refined_type": {} }, { "name": "max_depth", @@ -163481,7 +176506,8 @@ "docstring": { "type": "int, default=None", "description": "The maximum depth of the representation. If None, the tree is fully\ngenerated." - } + }, + "refined_type": {} }, { "name": "feature_names", @@ -163491,7 +176517,8 @@ "docstring": { "type": "list of strings, default=None", "description": "Names of each of the features.\nIf None, generic names will be used (\"X[0]\", \"X[1]\", ...)." - } + }, + "refined_type": {} }, { "name": "class_names", @@ -163501,7 +176528,8 @@ "docstring": { "type": "list of str or bool, default=None", "description": "Names of each of the target classes in ascending numerical order.\nOnly relevant for classification and not supported for multi-output.\nIf ``True``, shows a symbolic representation of the class name." - } + }, + "refined_type": {} }, { "name": "label", @@ -163511,6 +176539,10 @@ "docstring": { "type": "{'all', 'root', 'none'}, default='all'", "description": "Whether to show informative labels for impurity, etc.\nOptions include 'all' to show at every node, 'root' to show only at\nthe top root node, or 'none' to not show at any node." + }, + "refined_type": { + "kind": "EnumType", + "values": ["none", "root", "all"] } }, { @@ -163521,7 +176553,8 @@ "docstring": { "type": "bool, default=False", "description": "When set to ``True``, paint nodes to indicate majority class for\nclassification, extremity of values for regression, or purity of node\nfor multi-output." - } + }, + "refined_type": {} }, { "name": "impurity", @@ -163531,7 +176564,8 @@ "docstring": { "type": "bool, default=True", "description": "When set to ``True``, show the impurity at each node." - } + }, + "refined_type": {} }, { "name": "node_ids", @@ -163541,7 +176575,8 @@ "docstring": { "type": "bool, default=False", "description": "When set to ``True``, show the ID number on each node." - } + }, + "refined_type": {} }, { "name": "proportion", @@ -163551,7 +176586,8 @@ "docstring": { "type": "bool, default=False", "description": "When set to ``True``, change the display of 'values' and/or 'samples'\nto be proportions and percentages respectively." - } + }, + "refined_type": {} }, { "name": "rounded", @@ -163561,7 +176597,8 @@ "docstring": { "type": "bool, default=False", "description": "When set to ``True``, draw node boxes with rounded corners and use\nHelvetica fonts instead of Times-Roman." - } + }, + "refined_type": {} }, { "name": "precision", @@ -163571,7 +176608,8 @@ "docstring": { "type": "int, default=3", "description": "Number of digits of precision for floating point in the values of\nimpurity, threshold and value attributes of each node." - } + }, + "refined_type": {} }, { "name": "ax", @@ -163581,7 +176619,8 @@ "docstring": { "type": "matplotlib axis, default=None", "description": "Axes to plot to. If None, use current axis. Any previous content\nis cleared." - } + }, + "refined_type": {} }, { "name": "fontsize", @@ -163591,13 +176630,14 @@ "docstring": { "type": "int, default=None", "description": "Size of text font. 
If None, determined automatically to fit figure." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Plot a decision tree.\n\nThe sample counts that are shown are weighted with any sample_weights that might be present. The visualization is fit automatically to the size of the axis. Use the ``figsize`` or ``dpi`` arguments of ``plt.figure`` to control the size of the rendering. Read more in the :ref:`User Guide `. .. versionadded:: 0.21", - "docstring": "Plot a decision tree.\n\nThe sample counts that are shown are weighted with any sample_weights that\nmight be present.\n\nThe visualization is fit automatically to the size of the axis.\nUse the ``figsize`` or ``dpi`` arguments of ``plt.figure`` to control\nthe size of the rendering.\n\nRead more in the :ref:`User Guide `.\n\n.. versionadded:: 0.21\n\nParameters\n----------\ndecision_tree : decision tree regressor or classifier\n The decision tree to be plotted.\n\nmax_depth : int, default=None\n The maximum depth of the representation. If None, the tree is fully\n generated.\n\nfeature_names : list of strings, default=None\n Names of each of the features.\n If None, generic names will be used (\"X[0]\", \"X[1]\", ...).\n\nclass_names : list of str or bool, default=None\n Names of each of the target classes in ascending numerical order.\n Only relevant for classification and not supported for multi-output.\n If ``True``, shows a symbolic representation of the class name.\n\nlabel : {'all', 'root', 'none'}, default='all'\n Whether to show informative labels for impurity, etc.\n Options include 'all' to show at every node, 'root' to show only at\n the top root node, or 'none' to not show at any node.\n\nfilled : bool, default=False\n When set to ``True``, paint nodes to indicate majority class for\n classification, extremity of values for regression, or purity of node\n for multi-output.\n\nimpurity : bool, default=True\n When set to ``True``, show the impurity at each node.\n\nnode_ids : bool, default=False\n When set to ``True``, show the ID number on each node.\n\nproportion : bool, default=False\n When set to ``True``, change the display of 'values' and/or 'samples'\n to be proportions and percentages respectively.\n\nrounded : bool, default=False\n When set to ``True``, draw node boxes with rounded corners and use\n Helvetica fonts instead of Times-Roman.\n\nprecision : int, default=3\n Number of digits of precision for floating point in the values of\n impurity, threshold and value attributes of each node.\n\nax : matplotlib axis, default=None\n Axes to plot to. If None, use current axis. Any previous content\n is cleared.\n\nfontsize : int, default=None\n Size of text font. If None, determined automatically to fit figure.\n\nReturns\n-------\nannotations : list of artists\n List containing the artists for the annotation boxes making up the\n tree.\n\nExamples\n--------\n>>> from sklearn.datasets import load_iris\n>>> from sklearn import tree\n\n>>> clf = tree.DecisionTreeClassifier(random_state=0)\n>>> iris = load_iris()\n\n>>> clf = clf.fit(iris.data, iris.target)\n>>> tree.plot_tree(clf)\n[...]", + "description": "Plot a decision tree.\n\nThe sample counts that are shown are weighted with any sample_weights that\nmight be present.\n\nThe visualization is fit automatically to the size of the axis.\nUse the ``figsize`` or ``dpi`` arguments of ``plt.figure`` to control\nthe size of the rendering.\n\nRead more in the :ref:`User Guide `.\n\n.. 
versionadded:: 0.21", + "docstring": "Plot a decision tree.\n\n The sample counts that are shown are weighted with any sample_weights that\n might be present.\n\n The visualization is fit automatically to the size of the axis.\n Use the ``figsize`` or ``dpi`` arguments of ``plt.figure`` to control\n the size of the rendering.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.21\n\n Parameters\n ----------\n decision_tree : decision tree regressor or classifier\n The decision tree to be plotted.\n\n max_depth : int, default=None\n The maximum depth of the representation. If None, the tree is fully\n generated.\n\n feature_names : list of strings, default=None\n Names of each of the features.\n If None, generic names will be used (\"X[0]\", \"X[1]\", ...).\n\n class_names : list of str or bool, default=None\n Names of each of the target classes in ascending numerical order.\n Only relevant for classification and not supported for multi-output.\n If ``True``, shows a symbolic representation of the class name.\n\n label : {'all', 'root', 'none'}, default='all'\n Whether to show informative labels for impurity, etc.\n Options include 'all' to show at every node, 'root' to show only at\n the top root node, or 'none' to not show at any node.\n\n filled : bool, default=False\n When set to ``True``, paint nodes to indicate majority class for\n classification, extremity of values for regression, or purity of node\n for multi-output.\n\n impurity : bool, default=True\n When set to ``True``, show the impurity at each node.\n\n node_ids : bool, default=False\n When set to ``True``, show the ID number on each node.\n\n proportion : bool, default=False\n When set to ``True``, change the display of 'values' and/or 'samples'\n to be proportions and percentages respectively.\n\n rounded : bool, default=False\n When set to ``True``, draw node boxes with rounded corners and use\n Helvetica fonts instead of Times-Roman.\n\n precision : int, default=3\n Number of digits of precision for floating point in the values of\n impurity, threshold and value attributes of each node.\n\n ax : matplotlib axis, default=None\n Axes to plot to. If None, use current axis. Any previous content\n is cleared.\n\n fontsize : int, default=None\n Size of text font. If None, determined automatically to fit figure.\n\n Returns\n -------\n annotations : list of artists\n List containing the artists for the annotation boxes making up the\n tree.\n\n Examples\n --------\n >>> from sklearn.datasets import load_iris\n >>> from sklearn import tree\n\n >>> clf = tree.DecisionTreeClassifier(random_state=0)\n >>> iris = load_iris()\n\n >>> clf = clf.fit(iris.data, iris.target)\n >>> tree.plot_tree(clf)\n [...]\n\n ", "source_code": "\ndef plot_tree(decision_tree, *, max_depth=None, feature_names=None, class_names=None, label='all', filled=False, impurity=True, node_ids=False, proportion=False, rounded=False, precision=3, ax=None, fontsize=None):\n \"\"\"Plot a decision tree.\n\n The sample counts that are shown are weighted with any sample_weights that\n might be present.\n\n The visualization is fit automatically to the size of the axis.\n Use the ``figsize`` or ``dpi`` arguments of ``plt.figure`` to control\n the size of the rendering.\n\n Read more in the :ref:`User Guide `.\n\n .. versionadded:: 0.21\n\n Parameters\n ----------\n decision_tree : decision tree regressor or classifier\n The decision tree to be plotted.\n\n max_depth : int, default=None\n The maximum depth of the representation. 
If None, the tree is fully\n generated.\n\n feature_names : list of strings, default=None\n Names of each of the features.\n If None, generic names will be used (\"X[0]\", \"X[1]\", ...).\n\n class_names : list of str or bool, default=None\n Names of each of the target classes in ascending numerical order.\n Only relevant for classification and not supported for multi-output.\n If ``True``, shows a symbolic representation of the class name.\n\n label : {'all', 'root', 'none'}, default='all'\n Whether to show informative labels for impurity, etc.\n Options include 'all' to show at every node, 'root' to show only at\n the top root node, or 'none' to not show at any node.\n\n filled : bool, default=False\n When set to ``True``, paint nodes to indicate majority class for\n classification, extremity of values for regression, or purity of node\n for multi-output.\n\n impurity : bool, default=True\n When set to ``True``, show the impurity at each node.\n\n node_ids : bool, default=False\n When set to ``True``, show the ID number on each node.\n\n proportion : bool, default=False\n When set to ``True``, change the display of 'values' and/or 'samples'\n to be proportions and percentages respectively.\n\n rounded : bool, default=False\n When set to ``True``, draw node boxes with rounded corners and use\n Helvetica fonts instead of Times-Roman.\n\n precision : int, default=3\n Number of digits of precision for floating point in the values of\n impurity, threshold and value attributes of each node.\n\n ax : matplotlib axis, default=None\n Axes to plot to. If None, use current axis. Any previous content\n is cleared.\n\n fontsize : int, default=None\n Size of text font. If None, determined automatically to fit figure.\n\n Returns\n -------\n annotations : list of artists\n List containing the artists for the annotation boxes making up the\n tree.\n\n Examples\n --------\n >>> from sklearn.datasets import load_iris\n >>> from sklearn import tree\n\n >>> clf = tree.DecisionTreeClassifier(random_state=0)\n >>> iris = load_iris()\n\n >>> clf = clf.fit(iris.data, iris.target)\n >>> tree.plot_tree(clf)\n [...]\n\n \"\"\"\n check_is_fitted(decision_tree)\n exporter = _MPLTreeExporter(max_depth=max_depth, feature_names=feature_names, class_names=class_names, label=label, filled=filled, impurity=impurity, node_ids=node_ids, proportion=proportion, rounded=rounded, precision=precision, fontsize=fontsize)\n return exporter.export(decision_tree, ax=ax)" }, { @@ -163615,7 +176655,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "tree", @@ -163625,7 +176666,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "parent", @@ -163635,7 +176677,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "depth", @@ -163645,7 +176688,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "number", @@ -163655,13 +176699,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, tree, parent=None, depth=0, number=1):\n self.x = -1.0\n self.y = depth\n self.tree = tree\n self.children = [DrawTree(c, self, depth + 1, i + 1) for (i, c) in enumerate(tree.children)]\n self.parent = parent\n self.thread = None\n self.mod = 0\n self.ancestor = self\n self.change = self.shift = 0\n self._lmost_sibling = None\n self.number = 
number" }, { @@ -163679,13 +176724,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __repr__(self):\n return self.__str__()" }, { @@ -163703,13 +176749,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __str__(self):\n return '%s: x=%s mod=%s' % (self.tree, self.x, self.mod)" }, { @@ -163727,13 +176774,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef get_lmost_sibling(self):\n if not self._lmost_sibling and self.parent and self != self.parent.children[0]:\n self._lmost_sibling = self.parent.children[0]\n return self._lmost_sibling" }, { @@ -163751,13 +176799,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef lbrother(self):\n n = None\n if self.parent:\n for node in self.parent.children:\n if node == self:\n return n\n else:\n n = node\n return n" }, { @@ -163775,13 +176824,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef left(self):\n return self.thread or len(self.children) and self.children[0]" }, { @@ -163799,13 +176849,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef max_extents(self):\n extents = [c.max_extents() for c in self.children]\n extents.append((self.x, self.y))\n return np.max(extents, axis=0)" }, { @@ -163823,13 +176874,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef right(self):\n return self.thread or len(self.children) and self.children[-1]" }, { @@ -163847,7 +176899,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "label", @@ -163857,7 +176910,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "node_id", @@ -163867,13 +176921,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, label='', node_id=-1, *children):\n self.label = label\n self.node_id = node_id\n if children:\n self.children = children\n else:\n self.children = []" }, { @@ -163891,7 +176946,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "v", @@ -163901,7 +176957,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "default_ancestor", @@ -163911,13 +176968,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef ancestor(vil, v, default_ancestor):\n if vil.ancestor in v.parent.children:\n 
return vil.ancestor\n else:\n return default_ancestor" }, { @@ -163935,7 +176993,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "default_ancestor", @@ -163945,7 +177004,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "distance", @@ -163955,13 +177015,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef apportion(v, default_ancestor, distance):\n w = v.lbrother()\n if w is not None:\n vir = vor = v\n vil = w\n vol = v.lmost_sibling\n sir = sor = v.mod\n sil = vil.mod\n sol = vol.mod\n while vil.right() and vir.left():\n vil = vil.right()\n vir = vir.left()\n vol = vol.left()\n vor = vor.right()\n vor.ancestor = v\n shift = vil.x + sil - (vir.x + sir) + distance\n if shift > 0:\n move_subtree(ancestor(vil, v, default_ancestor), v, shift)\n sir = sir + shift\n sor = sor + shift\n sil += vil.mod\n sir += vir.mod\n sol += vol.mod\n sor += vor.mod\n if vil.right() and not vor.right():\n vor.thread = vil.right()\n vor.mod += sil - sor\n else:\n if vir.left() and not vol.left():\n vol.thread = vir.left()\n vol.mod += sir - sol\n default_ancestor = v\n return default_ancestor" }, { @@ -163979,13 +177040,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef buchheim(tree):\n dt = first_walk(DrawTree(tree))\n min = second_walk(dt)\n if min < 0:\n third_walk(dt, -min)\n return dt" }, { @@ -164003,13 +177065,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef execute_shifts(v):\n shift = change = 0\n for w in v.children[::-1]:\n w.x += shift\n w.mod += shift\n change += w.change\n shift += w.shift + change" }, { @@ -164027,7 +177090,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "distance", @@ -164037,13 +177101,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef first_walk(v, distance=1.0):\n if len(v.children) == 0:\n if v.lmost_sibling:\n v.x = v.lbrother().x + distance\n else:\n v.x = 0.0\n else:\n default_ancestor = v.children[0]\n for w in v.children:\n first_walk(w)\n default_ancestor = apportion(w, default_ancestor, distance)\n execute_shifts(v)\n midpoint = (v.children[0].x + v.children[-1].x) / 2\n w = v.lbrother()\n if w:\n v.x = w.x + distance\n v.mod = v.x - midpoint\n else:\n v.x = midpoint\n return v" }, { @@ -164061,7 +177126,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "wr", @@ -164071,7 +177137,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "shift", @@ -164081,13 +177148,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef move_subtree(wl, wr, shift):\n subtrees = wr.number - wl.number\n wr.change -= shift / subtrees\n wr.shift += shift\n wl.change += shift / subtrees\n wr.x += shift\n wr.mod += shift" }, { @@ -164105,7 
+177173,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "m", @@ -164115,7 +177184,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "depth", @@ -164125,7 +177195,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "min", @@ -164135,13 +177206,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef second_walk(v, m=0, depth=0, min=None):\n v.x += m\n v.y = depth\n if min is None or v.x < min:\n min = v.x\n for w in v.children:\n min = second_walk(w, m + v.mod, depth + 1, min)\n return min" }, { @@ -164159,7 +177231,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n", @@ -164169,13 +177242,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef third_walk(tree, n):\n tree.x += n\n for c in tree.children:\n third_walk(c, n)" }, { @@ -164193,7 +177267,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "top_path", @@ -164203,13 +177278,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef configuration(parent_package='', top_path=None):\n config = Configuration('tree', parent_package, top_path)\n libraries = []\n if os.name == 'posix':\n libraries.append('m')\n config.add_extension('_tree', sources=['_tree.pyx'], include_dirs=[numpy.get_include()], libraries=libraries, extra_compile_args=['-O3'])\n config.add_extension('_splitter', sources=['_splitter.pyx'], include_dirs=[numpy.get_include()], libraries=libraries, extra_compile_args=['-O3'])\n config.add_extension('_criterion', sources=['_criterion.pyx'], include_dirs=[numpy.get_include()], libraries=libraries, extra_compile_args=['-O3'])\n config.add_extension('_utils', sources=['_utils.pyx'], include_dirs=[numpy.get_include()], libraries=libraries, extra_compile_args=['-O3'])\n config.add_subpackage('tests')\n return config" }, { @@ -164227,13 +177303,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __dir__(self):\n return self.keys()" }, { @@ -164251,7 +177328,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "key", @@ -164261,13 +177339,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __getattr__(self, key):\n try:\n return self[key]\n except KeyError:\n raise AttributeError(key)" }, { @@ -164285,13 +177364,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, **kwargs):\n super().__init__(kwargs)" }, { @@ -164309,7 +177389,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "key", @@ -164319,7 +177400,8 @@ "docstring": { "type": "", "description": "" - } + 
}, + "refined_type": {} }, { "name": "value", @@ -164329,13 +177411,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __setattr__(self, key, value):\n self[key] = value" }, { @@ -164353,7 +177436,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "state", @@ -164363,13 +177447,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __setstate__(self, state):\n pass" }, { @@ -164387,7 +177472,8 @@ "docstring": { "type": "ndarray of int", "description": "Population per class." - } + }, + "refined_type": {} }, { "name": "n_draws", @@ -164397,7 +177483,8 @@ "docstring": { "type": "int", "description": "Number of draws (samples to draw) from the overall population." - } + }, + "refined_type": {} }, { "name": "rng", @@ -164407,13 +177494,14 @@ "docstring": { "type": "random state", "description": "Used to break ties." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Computes approximate mode of multivariate hypergeometric.\n\nThis is an approximation to the mode of the multivariate hypergeometric given by class_counts and n_draws. It shouldn't be off by more than one. It is the mostly likely outcome of drawing n_draws many samples from the population given by class_counts.", - "docstring": "Computes approximate mode of multivariate hypergeometric.\n\nThis is an approximation to the mode of the multivariate\nhypergeometric given by class_counts and n_draws.\nIt shouldn't be off by more than one.\n\nIt is the mostly likely outcome of drawing n_draws many\nsamples from the population given by class_counts.\n\nParameters\n----------\nclass_counts : ndarray of int\n Population per class.\nn_draws : int\n Number of draws (samples to draw) from the overall population.\nrng : random state\n Used to break ties.\n\nReturns\n-------\nsampled_classes : ndarray of int\n Number of samples drawn from each class.\n np.sum(sampled_classes) == n_draws\n\nExamples\n--------\n>>> import numpy as np\n>>> from sklearn.utils import _approximate_mode\n>>> _approximate_mode(class_counts=np.array([4, 2]), n_draws=3, rng=0)\narray([2, 1])\n>>> _approximate_mode(class_counts=np.array([5, 2]), n_draws=4, rng=0)\narray([3, 1])\n>>> _approximate_mode(class_counts=np.array([2, 2, 2, 1]),\n... n_draws=2, rng=0)\narray([0, 1, 1, 0])\n>>> _approximate_mode(class_counts=np.array([2, 2, 2, 1]),\n... 
n_draws=2, rng=42)\narray([1, 1, 0, 0])", + "description": "Computes approximate mode of multivariate hypergeometric.\n\nThis is an approximation to the mode of the multivariate\nhypergeometric given by class_counts and n_draws.\nIt shouldn't be off by more than one.\n\nIt is the mostly likely outcome of drawing n_draws many\nsamples from the population given by class_counts.", + "docstring": "Computes approximate mode of multivariate hypergeometric.\n\n This is an approximation to the mode of the multivariate\n hypergeometric given by class_counts and n_draws.\n It shouldn't be off by more than one.\n\n It is the mostly likely outcome of drawing n_draws many\n samples from the population given by class_counts.\n\n Parameters\n ----------\n class_counts : ndarray of int\n Population per class.\n n_draws : int\n Number of draws (samples to draw) from the overall population.\n rng : random state\n Used to break ties.\n\n Returns\n -------\n sampled_classes : ndarray of int\n Number of samples drawn from each class.\n np.sum(sampled_classes) == n_draws\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.utils import _approximate_mode\n >>> _approximate_mode(class_counts=np.array([4, 2]), n_draws=3, rng=0)\n array([2, 1])\n >>> _approximate_mode(class_counts=np.array([5, 2]), n_draws=4, rng=0)\n array([3, 1])\n >>> _approximate_mode(class_counts=np.array([2, 2, 2, 1]),\n ... n_draws=2, rng=0)\n array([0, 1, 1, 0])\n >>> _approximate_mode(class_counts=np.array([2, 2, 2, 1]),\n ... n_draws=2, rng=42)\n array([1, 1, 0, 0])\n ", "source_code": "\ndef _approximate_mode(class_counts, n_draws, rng):\n \"\"\"Computes approximate mode of multivariate hypergeometric.\n\n This is an approximation to the mode of the multivariate\n hypergeometric given by class_counts and n_draws.\n It shouldn't be off by more than one.\n\n It is the mostly likely outcome of drawing n_draws many\n samples from the population given by class_counts.\n\n Parameters\n ----------\n class_counts : ndarray of int\n Population per class.\n n_draws : int\n Number of draws (samples to draw) from the overall population.\n rng : random state\n Used to break ties.\n\n Returns\n -------\n sampled_classes : ndarray of int\n Number of samples drawn from each class.\n np.sum(sampled_classes) == n_draws\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.utils import _approximate_mode\n >>> _approximate_mode(class_counts=np.array([4, 2]), n_draws=3, rng=0)\n array([2, 1])\n >>> _approximate_mode(class_counts=np.array([5, 2]), n_draws=4, rng=0)\n array([3, 1])\n >>> _approximate_mode(class_counts=np.array([2, 2, 2, 1]),\n ... n_draws=2, rng=0)\n array([0, 1, 1, 0])\n >>> _approximate_mode(class_counts=np.array([2, 2, 2, 1]),\n ... n_draws=2, rng=42)\n array([1, 1, 0, 0])\n \"\"\"\n rng = check_random_state(rng)\n continuous = class_counts / class_counts.sum() * n_draws\n floored = np.floor(continuous)\n need_to_add = int(n_draws - floored.sum())\n if need_to_add > 0:\n remainder = continuous - floored\n values = np.sort(np.unique(remainder))[::-1]\n for value in values:\n (inds, ) = np.where(remainder == value)\n add_now = min(len(inds), need_to_add)\n inds = rng.choice(inds, size=add_now, replace=False)\n floored[inds] += 1\n need_to_add -= add_now\n if need_to_add == 0:\n break\n return floored.astype(int)" }, { @@ -164431,7 +177519,8 @@ "docstring": { "type": "int", "description": "The size of the eigenvalue vector to be initialized." 
- } + }, + "refined_type": {} }, { "name": "random_state", @@ -164441,13 +177530,14 @@ "docstring": { "type": "int, RandomState instance or None, default=None", "description": "The seed of the pseudo random number generator used to generate a\nuniform distribution. If int, random_state is the seed used by the\nrandom number generator; If RandomState instance, random_state is the\nrandom number generator; If None, the random number generator is the\nRandomState instance used by `np.random`." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Initialize the starting vector for iteration in ARPACK functions.\n\nInitialize a ndarray with values sampled from the uniform distribution on [-1, 1]. This initialization model has been chosen to be consistent with the ARPACK one as another initialization can lead to convergence issues.", - "docstring": "Initialize the starting vector for iteration in ARPACK functions.\n\nInitialize a ndarray with values sampled from the uniform distribution on\n[-1, 1]. This initialization model has been chosen to be consistent with\nthe ARPACK one as another initialization can lead to convergence issues.\n\nParameters\n----------\nsize : int\n The size of the eigenvalue vector to be initialized.\n\nrandom_state : int, RandomState instance or None, default=None\n The seed of the pseudo random number generator used to generate a\n uniform distribution. If int, random_state is the seed used by the\n random number generator; If RandomState instance, random_state is the\n random number generator; If None, the random number generator is the\n RandomState instance used by `np.random`.\n\nReturns\n-------\nv0 : ndarray of shape (size,)\n The initialized vector.", + "description": "Initialize the starting vector for iteration in ARPACK functions.\n\nInitialize a ndarray with values sampled from the uniform distribution on\n[-1, 1]. This initialization model has been chosen to be consistent with\nthe ARPACK one as another initialization can lead to convergence issues.", + "docstring": "Initialize the starting vector for iteration in ARPACK functions.\n\n Initialize a ndarray with values sampled from the uniform distribution on\n [-1, 1]. This initialization model has been chosen to be consistent with\n the ARPACK one as another initialization can lead to convergence issues.\n\n Parameters\n ----------\n size : int\n The size of the eigenvalue vector to be initialized.\n\n random_state : int, RandomState instance or None, default=None\n The seed of the pseudo random number generator used to generate a\n uniform distribution. If int, random_state is the seed used by the\n random number generator; If RandomState instance, random_state is the\n random number generator; If None, the random number generator is the\n RandomState instance used by `np.random`.\n\n Returns\n -------\n v0 : ndarray of shape (size,)\n The initialized vector.\n ", "source_code": "\ndef _init_arpack_v0(size, random_state):\n \"\"\"Initialize the starting vector for iteration in ARPACK functions.\n\n Initialize a ndarray with values sampled from the uniform distribution on\n [-1, 1]. This initialization model has been chosen to be consistent with\n the ARPACK one as another initialization can lead to convergence issues.\n\n Parameters\n ----------\n size : int\n The size of the eigenvalue vector to be initialized.\n\n random_state : int, RandomState instance or None, default=None\n The seed of the pseudo random number generator used to generate a\n uniform distribution. 
If int, random_state is the seed used by the\n random number generator; If RandomState instance, random_state is the\n random number generator; If None, the random number generator is the\n RandomState instance used by `np.random`.\n\n Returns\n -------\n v0 : ndarray of shape (size,)\n The initialized vector.\n \"\"\"\n random_state = check_random_state(random_state)\n v0 = random_state.uniform(-1, 1, size)\n return v0" }, { @@ -164465,7 +177555,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "key", @@ -164475,7 +177566,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "key_dtype", @@ -164485,7 +177577,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "axis", @@ -164495,7 +177588,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -164519,7 +177613,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "chunksize", @@ -164529,13 +177624,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Chunk generator, ``gen`` into lists of length ``chunksize``. The last chunk may have a length less than ``chunksize``.", - "docstring": "Chunk generator, ``gen`` into lists of length ``chunksize``. The last\nchunk may have a length less than ``chunksize``.", + "description": "Chunk generator, ``gen`` into lists of length ``chunksize``. The last\nchunk may have a length less than ``chunksize``.", + "docstring": "Chunk generator, ``gen`` into lists of length ``chunksize``. The last\n chunk may have a length less than ``chunksize``.", "source_code": "\ndef _chunk_generator(gen, chunksize):\n \"\"\"Chunk generator, ``gen`` into lists of length ``chunksize``. The last\n chunk may have a length less than ``chunksize``.\"\"\"\n while True:\n chunk = list(islice(gen, chunksize))\n if chunk:\n yield chunk\n else:\n return" }, { @@ -164553,7 +177649,8 @@ "docstring": { "type": "scalar, slice or array-like", "description": "The key from which we want to infer the data type." - } + }, + "refined_type": {} }, { "name": "accept_slice", @@ -164563,13 +177660,14 @@ "docstring": { "type": "bool, default=True", "description": "Whether or not to raise an error if the key is a slice." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Determine the data type of key.", - "docstring": "Determine the data type of key.\n\nParameters\n----------\nkey : scalar, slice or array-like\n The key from which we want to infer the data type.\n\naccept_slice : bool, default=True\n Whether or not to raise an error if the key is a slice.\n\nReturns\n-------\ndtype : {'int', 'str', 'bool', None}\n Returns the data type of key.", + "docstring": "Determine the data type of key.\n\n Parameters\n ----------\n key : scalar, slice or array-like\n The key from which we want to infer the data type.\n\n accept_slice : bool, default=True\n Whether or not to raise an error if the key is a slice.\n\n Returns\n -------\n dtype : {'int', 'str', 'bool', None}\n Returns the data type of key.\n ", "source_code": "\ndef _determine_key_type(key, accept_slice=True):\n \"\"\"Determine the data type of key.\n\n Parameters\n ----------\n key : scalar, slice or array-like\n The key from which we want to infer the data type.\n\n accept_slice : bool, default=True\n Whether or not to raise an error if the key is a slice.\n\n Returns\n -------\n dtype : {'int', 'str', 'bool', None}\n Returns the data type of key.\n \"\"\"\n err_msg = 'No valid specification of the columns. Only a scalar, list or slice of all integers or all strings, or boolean mask is allowed'\n dtype_to_str = {int: 'int', str: 'str', bool: 'bool', np.bool_: 'bool'}\n array_dtype_to_str = {'i': 'int', 'u': 'int', 'b': 'bool', 'O': 'str', 'U': 'str', 'S': 'str'}\n if key is None:\n return None\n if isinstance(key, tuple(dtype_to_str.keys())):\n try:\n return dtype_to_str[type(key)]\n except KeyError:\n raise ValueError(err_msg)\n if isinstance(key, slice):\n if not accept_slice:\n raise TypeError('Only array-like or scalar are supported. A Python slice was given.')\n if key.start is None and key.stop is None:\n return None\n key_start_type = _determine_key_type(key.start)\n key_stop_type = _determine_key_type(key.stop)\n if key_start_type is not None and key_stop_type is not None:\n if key_start_type != key_stop_type:\n raise ValueError(err_msg)\n if key_start_type is not None:\n return key_start_type\n return key_stop_type\n if isinstance(key, (list, tuple)):\n unique_key = set(key)\n key_type = {_determine_key_type(elt) for elt in unique_key}\n if not key_type:\n return None\n if len(key_type) != 1:\n raise ValueError(err_msg)\n return key_type.pop()\n if hasattr(key, 'dtype'):\n try:\n return array_dtype_to_str[key.dtype.kind]\n except KeyError:\n raise ValueError(err_msg)\n raise ValueError(err_msg)" }, { @@ -164587,7 +177685,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -164611,7 +177710,8 @@ "docstring": { "type": "array", "description": "Values to check for unknowns." - } + }, + "refined_type": {} }, { "name": "known_values", @@ -164621,7 +177721,8 @@ "docstring": { "type": "array", "description": "Known values. Must be unique." - } + }, + "refined_type": {} }, { "name": "return_mask", @@ -164631,13 +177732,14 @@ "docstring": { "type": "bool, default=False", "description": "If True, return a mask of the same shape as `values` indicating\nthe valid values." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Helper function to check for unknowns in values to be encoded.\n\nUses pure python method for object dtype, and numpy method for all other dtypes.", - "docstring": "Helper function to check for unknowns in values to be encoded.\n\nUses pure python method for object dtype, and numpy method for\nall other dtypes.\n\nParameters\n----------\nvalues : array\n Values to check for unknowns.\nknown_values : array\n Known values. Must be unique.\nreturn_mask : bool, default=False\n If True, return a mask of the same shape as `values` indicating\n the valid values.\n\nReturns\n-------\ndiff : list\n The unique values present in `values` and not in `know_values`.\nvalid_mask : boolean array\n Additionally returned if ``return_mask=True``.", + "description": "Helper function to check for unknowns in values to be encoded.\n\nUses pure python method for object dtype, and numpy method for\nall other dtypes.", + "docstring": "\n Helper function to check for unknowns in values to be encoded.\n\n Uses pure python method for object dtype, and numpy method for\n all other dtypes.\n\n Parameters\n ----------\n values : array\n Values to check for unknowns.\n known_values : array\n Known values. Must be unique.\n return_mask : bool, default=False\n If True, return a mask of the same shape as `values` indicating\n the valid values.\n\n Returns\n -------\n diff : list\n The unique values present in `values` and not in `know_values`.\n valid_mask : boolean array\n Additionally returned if ``return_mask=True``.\n\n ", "source_code": "\ndef _check_unknown(values, known_values, return_mask=False):\n \"\"\"\n Helper function to check for unknowns in values to be encoded.\n\n Uses pure python method for object dtype, and numpy method for\n all other dtypes.\n\n Parameters\n ----------\n values : array\n Values to check for unknowns.\n known_values : array\n Known values. 
Must be unique.\n return_mask : bool, default=False\n If True, return a mask of the same shape as `values` indicating\n the valid values.\n\n Returns\n -------\n diff : list\n The unique values present in `values` and not in `know_values`.\n valid_mask : boolean array\n Additionally returned if ``return_mask=True``.\n\n \"\"\"\n valid_mask = None\n if values.dtype.kind in 'OUS':\n values_set = set(values)\n (values_set, missing_in_values) = _extract_missing(values_set)\n uniques_set = set(known_values)\n (uniques_set, missing_in_uniques) = _extract_missing(uniques_set)\n diff = values_set - uniques_set\n nan_in_diff = missing_in_values.nan and not missing_in_uniques.nan\n none_in_diff = missing_in_values.none and not missing_in_uniques.none\n \n def is_valid(value):\n return value in uniques_set or missing_in_uniques.none and value is None or missing_in_uniques.nan and is_scalar_nan(value)\n if return_mask:\n if diff or nan_in_diff or none_in_diff:\n valid_mask = np.array([is_valid(value) for value in values])\n else:\n valid_mask = np.ones(len(values), dtype=bool)\n diff = list(diff)\n if none_in_diff:\n diff.append(None)\n if nan_in_diff:\n diff.append(np.nan)\n else:\n unique_values = np.unique(values)\n diff = np.setdiff1d(unique_values, known_values, assume_unique=True)\n if return_mask:\n if diff.size:\n valid_mask = np.in1d(values, known_values)\n else:\n valid_mask = np.ones(len(values), dtype=bool)\n if np.isnan(known_values).any():\n diff_is_nan = np.isnan(diff)\n if diff_is_nan.any():\n if diff.size and return_mask:\n is_nan = np.isnan(values)\n valid_mask[is_nan] = 1\n diff = diff[~diff_is_nan]\n diff = list(diff)\n if return_mask:\n return diff, valid_mask\n return diff" }, { @@ -164655,7 +177757,8 @@ "docstring": { "type": "ndarray", "description": "Values to encode." - } + }, + "refined_type": {} }, { "name": "uniques", @@ -164665,7 +177768,8 @@ "docstring": { "type": "ndarray", "description": "The unique values in `values`. If the dtype is not object, then\n`uniques` needs to be sorted." - } + }, + "refined_type": {} }, { "name": "check_unknown", @@ -164675,13 +177779,14 @@ "docstring": { "type": "bool, default=True", "description": "If True, check for values in `values` that are not in `unique`\nand raise an error. This is ignored for object dtype, and treated as\nTrue in this case. This parameter is useful for\n_BaseEncoder._transform() to avoid calling _check_unknown()\ntwice." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Helper function to encode values into [0, n_uniques - 1].\n\nUses pure python method for object dtype, and numpy method for all other dtypes. The numpy method has the limitation that the `uniques` need to be sorted. Importantly, this is not checked but assumed to already be the case. The calling method needs to ensure this for all non-object values.", - "docstring": "Helper function to encode values into [0, n_uniques - 1].\n\nUses pure python method for object dtype, and numpy method for\nall other dtypes.\nThe numpy method has the limitation that the `uniques` need to\nbe sorted. Importantly, this is not checked but assumed to already be\nthe case. The calling method needs to ensure this for all non-object\nvalues.\n\nParameters\n----------\nvalues : ndarray\n Values to encode.\nuniques : ndarray\n The unique values in `values`. 
If the dtype is not object, then\n `uniques` needs to be sorted.\ncheck_unknown : bool, default=True\n If True, check for values in `values` that are not in `unique`\n and raise an error. This is ignored for object dtype, and treated as\n True in this case. This parameter is useful for\n _BaseEncoder._transform() to avoid calling _check_unknown()\n twice.\n\nReturns\n-------\nencoded : ndarray\n Encoded values", + "description": "Helper function to encode values into [0, n_uniques - 1].\n\nUses pure python method for object dtype, and numpy method for\nall other dtypes.\nThe numpy method has the limitation that the `uniques` need to\nbe sorted. Importantly, this is not checked but assumed to already be\nthe case. The calling method needs to ensure this for all non-object\nvalues.", + "docstring": "Helper function to encode values into [0, n_uniques - 1].\n\n Uses pure python method for object dtype, and numpy method for\n all other dtypes.\n The numpy method has the limitation that the `uniques` need to\n be sorted. Importantly, this is not checked but assumed to already be\n the case. The calling method needs to ensure this for all non-object\n values.\n\n Parameters\n ----------\n values : ndarray\n Values to encode.\n uniques : ndarray\n The unique values in `values`. If the dtype is not object, then\n `uniques` needs to be sorted.\n check_unknown : bool, default=True\n If True, check for values in `values` that are not in `unique`\n and raise an error. This is ignored for object dtype, and treated as\n True in this case. This parameter is useful for\n _BaseEncoder._transform() to avoid calling _check_unknown()\n twice.\n\n Returns\n -------\n encoded : ndarray\n Encoded values\n ", "source_code": "\ndef _encode(values, *, uniques, check_unknown=True):\n \"\"\"Helper function to encode values into [0, n_uniques - 1].\n\n Uses pure python method for object dtype, and numpy method for\n all other dtypes.\n The numpy method has the limitation that the `uniques` need to\n be sorted. Importantly, this is not checked but assumed to already be\n the case. The calling method needs to ensure this for all non-object\n values.\n\n Parameters\n ----------\n values : ndarray\n Values to encode.\n uniques : ndarray\n The unique values in `values`. If the dtype is not object, then\n `uniques` needs to be sorted.\n check_unknown : bool, default=True\n If True, check for values in `values` that are not in `unique`\n and raise an error. This is ignored for object dtype, and treated as\n True in this case. 
This parameter is useful for\n _BaseEncoder._transform() to avoid calling _check_unknown()\n twice.\n\n Returns\n -------\n encoded : ndarray\n Encoded values\n \"\"\"\n if values.dtype.kind in 'OUS':\n try:\n return _map_to_integer(values, uniques)\n except KeyError as e:\n raise ValueError(f'y contains previously unseen labels: {str(e)}')\n else:\n if check_unknown:\n diff = _check_unknown(values, uniques)\n if diff:\n raise ValueError(f'y contains previously unseen labels: {str(diff)}')\n return np.searchsorted(uniques, values)" }, { @@ -164699,13 +177804,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Extract missing values from `values`.", - "docstring": "Extract missing values from `values`.\n\nParameters\n----------\nvalues: set\n Set of values to extract missing from.\n\nReturns\n-------\noutput: set\n Set with missing values extracted.\n\nmissing_values: MissingValues\n Object with missing value information.", + "docstring": "Extract missing values from `values`.\n\n Parameters\n ----------\n values: set\n Set of values to extract missing from.\n\n Returns\n -------\n output: set\n Set with missing values extracted.\n\n missing_values: MissingValues\n Object with missing value information.\n ", "source_code": "\ndef _extract_missing(values):\n \"\"\"Extract missing values from `values`.\n\n Parameters\n ----------\n values: set\n Set of values to extract missing from.\n\n Returns\n -------\n output: set\n Set with missing values extracted.\n\n missing_values: MissingValues\n Object with missing value information.\n \"\"\"\n missing_values_set = {value for value in values if value is None or is_scalar_nan(value)}\n if not missing_values_set:\n return values, MissingValues(nan=False, none=False)\n if None in missing_values_set:\n if len(missing_values_set) == 1:\n output_missing_values = MissingValues(nan=False, none=True)\n else:\n output_missing_values = MissingValues(nan=True, none=True)\n else:\n output_missing_values = MissingValues(nan=True, none=False)\n output = values - missing_values_set\n return output, output_missing_values" }, { @@ -164723,7 +177829,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "uniques", @@ -164733,7 +177840,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -164757,7 +177865,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "mapping", @@ -164767,13 +177876,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, mapping):\n super().__init__(mapping)\n for (key, value) in mapping.items():\n if is_scalar_nan(key):\n self.nan_value = value\n break" }, { @@ -164791,7 +177901,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "key", @@ -164801,13 +177912,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __missing__(self, key):\n if hasattr(self, 'nan_value') and is_scalar_nan(key):\n return self.nan_value\n raise KeyError(key)" }, { @@ -164825,7 +177937,8 @@ "docstring": { "type": "ndarray", "description": "Values to check for unknowns." 
- } + }, + "refined_type": {} }, { "name": "return_inverse", @@ -164835,13 +177948,14 @@ "docstring": { "type": "bool, default=False", "description": "If True, also return the indices of the unique values." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Helper function to find unique values with support for python objects.\n\nUses pure python method for object dtype, and numpy method for all other dtypes.", - "docstring": "Helper function to find unique values with support for python objects.\n\nUses pure python method for object dtype, and numpy method for\nall other dtypes.\n\nParameters\n----------\nvalues : ndarray\n Values to check for unknowns.\n\nreturn_inverse : bool, default=False\n If True, also return the indices of the unique values.\n\nReturns\n-------\nunique : ndarray\n The sorted unique values.\n\nunique_inverse : ndarray\n The indices to reconstruct the original array from the unique array.\n Only provided if `return_inverse` is True.", + "description": "Helper function to find unique values with support for python objects.\n\nUses pure python method for object dtype, and numpy method for\nall other dtypes.", + "docstring": "Helper function to find unique values with support for python objects.\n\n Uses pure python method for object dtype, and numpy method for\n all other dtypes.\n\n Parameters\n ----------\n values : ndarray\n Values to check for unknowns.\n\n return_inverse : bool, default=False\n If True, also return the indices of the unique values.\n\n Returns\n -------\n unique : ndarray\n The sorted unique values.\n\n unique_inverse : ndarray\n The indices to reconstruct the original array from the unique array.\n Only provided if `return_inverse` is True.\n ", "source_code": "\ndef _unique(values, *, return_inverse=False):\n \"\"\"Helper function to find unique values with support for python objects.\n\n Uses pure python method for object dtype, and numpy method for\n all other dtypes.\n\n Parameters\n ----------\n values : ndarray\n Values to check for unknowns.\n\n return_inverse : bool, default=False\n If True, also return the indices of the unique values.\n\n Returns\n -------\n unique : ndarray\n The sorted unique values.\n\n unique_inverse : ndarray\n The indices to reconstruct the original array from the unique array.\n Only provided if `return_inverse` is True.\n \"\"\"\n if values.dtype == object:\n return _unique_python(values, return_inverse=return_inverse)\n out = np.unique(values, return_inverse=return_inverse)\n if return_inverse:\n (uniques, inverse) = out\n else:\n uniques = out\n if uniques.size and is_scalar_nan(uniques[-1]):\n nan_idx = np.searchsorted(uniques, np.nan)\n uniques = uniques[:nan_idx + 1]\n if return_inverse:\n inverse[inverse > nan_idx] = nan_idx\n if return_inverse:\n return uniques, inverse\n return uniques" }, { @@ -164859,7 +177973,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "return_inverse", @@ -164869,13 +177984,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _unique_python(values, *, return_inverse):\n try:\n uniques_set = set(values)\n (uniques_set, missing_values) = _extract_missing(uniques_set)\n uniques = sorted(uniques_set)\n uniques.extend(missing_values.to_list())\n uniques = np.array(uniques, dtype=values.dtype)\n except TypeError:\n types = sorted((t.__qualname__ for t in 
set((type(v) for v in values))))\n raise TypeError(f'Encoders require their input to be uniformly strings or numbers. Got {types}')\n if return_inverse:\n return uniques, _map_to_integer(values, uniques)\n return uniques" }, { @@ -164893,7 +178009,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "kind", @@ -164903,6 +178020,10 @@ "docstring": { "type": "{'serial', 'parallel', 'single'}", "description": "kind of HTML block" + }, + "refined_type": { + "kind": "EnumType", + "values": ["serial", "parallel", "single"] } }, { @@ -164913,7 +178034,8 @@ "docstring": { "type": "list of estimators or `_VisualBlock`s or a single estimator", "description": "If kind != 'single', then `estimators` is a list of\nestimators.\nIf kind == 'single', then `estimators` is a single estimator." - } + }, + "refined_type": {} }, { "name": "names", @@ -164923,7 +178045,8 @@ "docstring": { "type": "list of str, default=None", "description": "If kind != 'single', then `names` corresponds to estimators.\nIf kind == 'single', then `names` is a single string corresponding to\nthe single estimator." - } + }, + "refined_type": {} }, { "name": "name_details", @@ -164933,7 +178056,8 @@ "docstring": { "type": "list of str, str, or None, default=None", "description": "If kind != 'single', then `name_details` corresponds to `names`.\nIf kind == 'single', then `name_details` is a single string\ncorresponding to the single estimator." - } + }, + "refined_type": {} }, { "name": "dash_wrapped", @@ -164943,13 +178067,14 @@ "docstring": { "type": "bool, default=True", "description": "If true, wrapped HTML element will be wrapped with a dashed border.\nOnly active when kind != 'single'." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, kind, estimators, *, names=None, name_details=None, dash_wrapped=True):\n self.kind = kind\n self.estimators = estimators\n self.dash_wrapped = dash_wrapped\n if self.kind in ('parallel', 'serial'):\n if names is None:\n names = (None, ) * len(estimators)\n if name_details is None:\n name_details = (None, ) * len(estimators)\n self.names = names\n self.name_details = name_details" }, { @@ -164967,13 +178092,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _sk_visual_block_(self):\n return self" }, { @@ -164991,7 +178117,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -165015,7 +178142,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "estimator", @@ -165025,7 +178153,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "estimator_label", @@ -165035,7 +178164,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "estimator_label_details", @@ -165045,7 +178175,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "first_call", @@ -165055,7 +178186,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -165079,7 +178211,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "name", @@ -165089,7 +178222,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": 
"name_details", @@ -165099,7 +178233,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "outer_class", @@ -165109,7 +178244,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "inner_class", @@ -165119,7 +178255,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "checked", @@ -165129,14 +178266,15 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Write labeled html with or without a dropdown with named details", "docstring": "Write labeled html with or without a dropdown with named details", - "source_code": "\ndef _write_label_html(out, name, name_details, outer_class='sk-label-container', inner_class='sk-label', checked=False):\n \"\"\"Write labeled html with or without a dropdown with named details\"\"\"\n out.write(f'
')\n name = html.escape(name)\n if name_details is not None:\n checked_str = 'checked' if checked else ''\n est_id = uuid.uuid4()\n out.write(f'
{name_details}
')\n else:\n out.write(f'')\n out.write('
')" + "source_code": "\ndef _write_label_html(out, name, name_details, outer_class='sk-label-container', inner_class='sk-label', checked=False):\n \"\"\"Write labeled html with or without a dropdown with named details\"\"\"\n out.write(f'
')\n name = html.escape(name)\n if name_details is not None:\n name_details = html.escape(str(name_details))\n label_class = 'sk-toggleable__label sk-toggleable__label-arrow'\n checked_str = 'checked' if checked else ''\n est_id = uuid.uuid4()\n out.write(f'
{name_details}
')\n else:\n out.write(f'')\n out.write('
')" }, { "name": "estimator_html_repr", @@ -165153,14 +178291,15 @@ "docstring": { "type": "estimator object", "description": "The estimator to visualize." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Build a HTML representation of an estimator.\n\nRead more in the :ref:`User Guide `.", - "docstring": "Build a HTML representation of an estimator.\n\nRead more in the :ref:`User Guide `.\n\nParameters\n----------\nestimator : estimator object\n The estimator to visualize.\n\nReturns\n-------\nhtml: str\n HTML representation of estimator.", - "source_code": "\ndef estimator_html_repr(estimator):\n \"\"\"Build a HTML representation of an estimator.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n estimator : estimator object\n The estimator to visualize.\n\n Returns\n -------\n html: str\n HTML representation of estimator.\n \"\"\"\n with closing(StringIO()) as out:\n container_id = 'sk-' + str(uuid.uuid4())\n style_template = Template(_STYLE)\n style_with_id = style_template.substitute(id=container_id)\n out.write(f'
')\n _write_estimator_html(out, estimator, estimator.__class__.__name__, str(estimator), first_call=True)\n out.write('
')\n html_output = out.getvalue()\n return html_output" + "docstring": "Build a HTML representation of an estimator.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n estimator : estimator object\n The estimator to visualize.\n\n Returns\n -------\n html: str\n HTML representation of estimator.\n ", + "source_code": "\ndef estimator_html_repr(estimator):\n \"\"\"Build a HTML representation of an estimator.\n\n Read more in the :ref:`User Guide `.\n\n Parameters\n ----------\n estimator : estimator object\n The estimator to visualize.\n\n Returns\n -------\n html: str\n HTML representation of estimator.\n \"\"\"\n with closing(StringIO()) as out:\n container_id = 'sk-' + str(uuid.uuid4())\n style_template = Template(_STYLE)\n style_with_id = style_template.substitute(id=container_id)\n estimator_str = str(estimator)\n fallback_msg = 'Please rerun this cell to show the HTML repr or trust the notebook.'\n out.write(f'
{html.escape(estimator_str)}
{fallback_msg}
')\n html_output = out.getvalue()\n return html_output" }, { "name": "_get_column_indices", @@ -165177,7 +178316,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "key", @@ -165187,13 +178327,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Get feature column indices for input data X and key.\n\nFor accepted values of `key`, see the docstring of :func:`_safe_indexing_column`.", - "docstring": "Get feature column indices for input data X and key.\n\nFor accepted values of `key`, see the docstring of\n:func:`_safe_indexing_column`.", + "description": "Get feature column indices for input data X and key.\n\nFor accepted values of `key`, see the docstring of\n:func:`_safe_indexing_column`.", + "docstring": "Get feature column indices for input data X and key.\n\n For accepted values of `key`, see the docstring of\n :func:`_safe_indexing_column`.\n ", "source_code": "\ndef _get_column_indices(X, key):\n \"\"\"Get feature column indices for input data X and key.\n\n For accepted values of `key`, see the docstring of\n :func:`_safe_indexing_column`.\n \"\"\"\n n_columns = X.shape[1]\n key_dtype = _determine_key_type(key)\n if isinstance(key, (list, tuple)) and not key:\n return []\n elif key_dtype in ('bool', 'int'):\n try:\n idx = _safe_indexing(np.arange(n_columns), key)\n except IndexError as e:\n raise ValueError('all features must be in [0, {}] or [-{}, 0]'.format(n_columns - 1, n_columns)) from e\n return np.atleast_1d(idx).tolist()\n elif key_dtype == 'str':\n try:\n all_columns = X.columns\n except AttributeError:\n raise ValueError('Specifying the columns using strings is only supported for pandas DataFrames')\n if isinstance(key, str):\n columns = [key]\n elif isinstance(key, slice):\n (start, stop) = (key.start, key.stop)\n if start is not None:\n start = all_columns.get_loc(start)\n if stop is not None:\n stop = all_columns.get_loc(stop) + 1\n else:\n stop = n_columns + 1\n return list(range(n_columns)[slice(start, stop)])\n else:\n columns = list(key)\n try:\n column_indices = []\n for col in columns:\n col_idx = all_columns.get_loc(col)\n if not isinstance(col_idx, numbers.Integral):\n raise ValueError(f'Selected columns, {columns}, are not unique in dataframe')\n column_indices.append(col_idx)\n except KeyError as e:\n raise ValueError('A given column is not a column of the dataframe') from e\n return column_indices\n else:\n raise ValueError('No valid specification of the columns. 
Only a scalar, list or slice of all integers or all strings, or boolean mask is allowed')" }, { @@ -165211,7 +178352,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "key", @@ -165221,7 +178363,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "key_dtype", @@ -165231,7 +178374,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -165255,7 +178399,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "value_to_mask", @@ -165265,13 +178410,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _get_dense_mask(X, value_to_mask):\n if is_scalar_nan(value_to_mask):\n if X.dtype.kind == 'f':\n Xt = np.isnan(X)\n elif X.dtype.kind in ('i', 'u'):\n Xt = np.zeros(X.shape, dtype=bool)\n else:\n Xt = _object_dtype_isnan(X)\n else:\n Xt = X == value_to_mask\n return Xt" }, { @@ -165289,6 +178435,10 @@ "docstring": { "type": "{ndarray, sparse matrix} of shape (n_samples, n_features)", "description": "Input data, where ``n_samples`` is the number of samples and\n``n_features`` is the number of features." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -165299,13 +178449,17 @@ "docstring": { "type": "{int, float}", "description": "The value which is to be masked in X." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } } ], "results": [], "is_public": false, "description": "Compute the boolean mask X == value_to_mask.", - "docstring": "Compute the boolean mask X == value_to_mask.\n\nParameters\n----------\nX : {ndarray, sparse matrix} of shape (n_samples, n_features)\n Input data, where ``n_samples`` is the number of samples and\n ``n_features`` is the number of features.\n\nvalue_to_mask : {int, float}\n The value which is to be masked in X.\n\nReturns\n-------\nX_mask : {ndarray, sparse matrix} of shape (n_samples, n_features)\n Missing mask.", + "docstring": "Compute the boolean mask X == value_to_mask.\n\n Parameters\n ----------\n X : {ndarray, sparse matrix} of shape (n_samples, n_features)\n Input data, where ``n_samples`` is the number of samples and\n ``n_features`` is the number of features.\n\n value_to_mask : {int, float}\n The value which is to be masked in X.\n\n Returns\n -------\n X_mask : {ndarray, sparse matrix} of shape (n_samples, n_features)\n Missing mask.\n ", "source_code": "\ndef _get_mask(X, value_to_mask):\n \"\"\"Compute the boolean mask X == value_to_mask.\n\n Parameters\n ----------\n X : {ndarray, sparse matrix} of shape (n_samples, n_features)\n Input data, where ``n_samples`` is the number of samples and\n ``n_features`` is the number of features.\n\n value_to_mask : {int, float}\n The value which is to be masked in X.\n\n Returns\n -------\n X_mask : {ndarray, sparse matrix} of shape (n_samples, n_features)\n Missing mask.\n \"\"\"\n if not sp.issparse(X):\n return _get_dense_mask(X, value_to_mask)\n Xt = _get_dense_mask(X.data, value_to_mask)\n sparse_constructor = sp.csr_matrix if X.format == 'csr' else sp.csc_matrix\n Xt_sparse = sparse_constructor((Xt, X.indices.copy(), X.indptr.copy()), shape=X.shape, dtype=bool)\n return Xt_sparse" }, { @@ -165323,7 +178477,8 @@ "docstring": { "type": "str", "description": "String indicating the source or the reference of the message." 
- } + }, + "refined_type": {} }, { "name": "message", @@ -165333,7 +178488,8 @@ "docstring": { "type": "str", "description": "Short message." - } + }, + "refined_type": {} }, { "name": "time", @@ -165343,13 +178499,14 @@ "docstring": { "type": "int", "description": "Time in seconds." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Create one line message for logging purposes.", - "docstring": "Create one line message for logging purposes.\n\nParameters\n----------\nsource : str\n String indicating the source or the reference of the message.\n\nmessage : str\n Short message.\n\ntime : int\n Time in seconds.", + "docstring": "Create one line message for logging purposes.\n\n Parameters\n ----------\n source : str\n String indicating the source or the reference of the message.\n\n message : str\n Short message.\n\n time : int\n Time in seconds.\n ", "source_code": "\ndef _message_with_time(source, message, time):\n \"\"\"Create one line message for logging purposes.\n\n Parameters\n ----------\n source : str\n String indicating the source or the reference of the message.\n\n message : str\n Short message.\n\n time : int\n Time in seconds.\n \"\"\"\n start_message = '[%s] ' % source\n if time > 60:\n time_str = '%4.1fmin' % (time / 60)\n else:\n time_str = ' %5.1fs' % time\n end_message = ' %s, total=%s' % (message, time_str)\n dots_len = 70 - len(start_message) - len(end_message)\n return '%s%s%s' % (start_message, dots_len * '.', end_message)" }, { @@ -165367,7 +178524,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "aslice", @@ -165377,13 +178535,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __getitem__(self, aslice):\n return MockDataFrame(self.array[aslice])" }, { @@ -165401,7 +178560,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "array", @@ -165411,13 +178571,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, array):\n self.array = array" }, { @@ -165435,7 +178596,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "check_y", @@ -165445,7 +178607,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "check_y_params", @@ -165455,7 +178618,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "check_X", @@ -165465,7 +178629,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "check_X_params", @@ -165475,7 +178640,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "methods_to_check", @@ -165485,7 +178651,8 @@ "docstring": { "type": "\"all\" or list of str, default=\"all\"", "description": "The methods in which the checks should be applied. By default,\nall checks will be done on all methods (`fit`, `predict`,\n`predict_proba`, `decision_function` and `score`)." - } + }, + "refined_type": {} }, { "name": "foo_param", @@ -165495,7 +178662,8 @@ "docstring": { "type": "int, default=0", "description": "A `foo` param. When `foo > 1`, the output of :meth:`score` will be 1\notherwise it is 0." 
- } + }, + "refined_type": {} }, { "name": "expected_fit_params", @@ -165505,13 +178673,14 @@ "docstring": { "type": "list of str, default=None", "description": "A list of the expected parameters given when calling `fit`." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, *, check_y=None, check_y_params=None, check_X=None, check_X_params=None, methods_to_check='all', foo_param=0, expected_fit_params=None):\n self.check_y = check_y\n self.check_y_params = check_y_params\n self.check_X = check_X\n self.check_X_params = check_X_params\n self.methods_to_check = methods_to_check\n self.foo_param = foo_param\n self.expected_fit_params = expected_fit_params" }, { @@ -165529,7 +178698,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -165539,7 +178709,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "The data set." - } + }, + "refined_type": {} }, { "name": "y", @@ -165549,7 +178720,8 @@ "docstring": { "type": "array-like of shape (n_samples), default=None", "description": "The corresponding target, by default None." - } + }, + "refined_type": {} }, { "name": "should_be_fitted", @@ -165559,13 +178731,14 @@ "docstring": { "type": "bool, default=True", "description": "Whether or not the classifier should be already fitted.\nBy default True." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Validate X and y and make extra check.", - "docstring": "Validate X and y and make extra check.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n The data set.\ny : array-like of shape (n_samples), default=None\n The corresponding target, by default None.\nshould_be_fitted : bool, default=True\n Whether or not the classifier should be already fitted.\n By default True.\n\nReturns\n-------\nX, y", + "docstring": "Validate X and y and make extra check.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The data set.\n y : array-like of shape (n_samples), default=None\n The corresponding target, by default None.\n should_be_fitted : bool, default=True\n Whether or not the classifier should be already fitted.\n By default True.\n\n Returns\n -------\n X, y\n ", "source_code": "\ndef _check_X_y(self, X, y=None, should_be_fitted=True):\n \"\"\"Validate X and y and make extra check.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The data set.\n y : array-like of shape (n_samples), default=None\n The corresponding target, by default None.\n should_be_fitted : bool, default=True\n Whether or not the classifier should be already fitted.\n By default True.\n\n Returns\n -------\n X, y\n \"\"\"\n if should_be_fitted:\n check_is_fitted(self)\n if self.check_X is not None:\n params = {} if self.check_X_params is None else self.check_X_params\n checked_X = self.check_X(X, **params)\n if isinstance(checked_X, (bool, np.bool_)):\n assert checked_X\n else:\n X = checked_X\n if y is not None and self.check_y is not None:\n params = {} if self.check_y_params is None else self.check_y_params\n checked_y = self.check_y(y, **params)\n if isinstance(checked_y, (bool, np.bool_)):\n assert checked_y\n else:\n y = checked_y\n return X, y" }, { @@ -165583,13 +178756,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - 
"docstring": "", + "docstring": null, "source_code": "\ndef _more_tags(self):\n return {'_skip_test': True, 'X_types': ['1dlabel']}" }, { @@ -165607,7 +178781,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -165617,13 +178792,14 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "The input data." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Confidence score.", - "docstring": "Confidence score.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n The input data.\n\nReturns\n-------\ndecision : ndarray of shape (n_samples,) if n_classes == 2 else (n_samples, n_classes)\n Confidence score.", + "docstring": "Confidence score.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The input data.\n\n Returns\n -------\n decision : ndarray of shape (n_samples,) if n_classes == 2 else (n_samples, n_classes)\n Confidence score.\n ", "source_code": "\ndef decision_function(self, X):\n \"\"\"Confidence score.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The input data.\n\n Returns\n -------\n decision : ndarray of shape (n_samples,) if n_classes == 2 else (n_samples, n_classes)\n Confidence score.\n \"\"\"\n if self.methods_to_check == 'all' or 'decision_function' in self.methods_to_check:\n (X, y) = self._check_X_y(X)\n if len(self.classes_) == 2:\n return np.zeros(_num_samples(X))\n else:\n decision = np.zeros((_num_samples(X), len(self.classes_)))\n decision[:, 0] = 1\n return decision" }, { @@ -165641,7 +178817,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -165651,7 +178828,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "Training vector, where `n_samples` is the number of samples and\n`n_features` is the number of features." - } + }, + "refined_type": {} }, { "name": "y", @@ -165661,13 +178839,14 @@ "docstring": { "type": "array-like of shape (n_samples, n_outputs) or (n_samples,), default=None", "description": "Target relative to X for classification or regression;\nNone for unsupervised learning." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Fit classifier.", - "docstring": "Fit classifier.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n Training vector, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\ny : array-like of shape (n_samples, n_outputs) or (n_samples,), default=None\n Target relative to X for classification or regression;\n None for unsupervised learning.\n\n**fit_params : dict of string -> object\n Parameters passed to the ``fit`` method of the estimator\n\nReturns\n-------\nself", + "docstring": "Fit classifier.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training vector, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\n y : array-like of shape (n_samples, n_outputs) or (n_samples,), default=None\n Target relative to X for classification or regression;\n None for unsupervised learning.\n\n **fit_params : dict of string -> object\n Parameters passed to the ``fit`` method of the estimator\n\n Returns\n -------\n self\n ", "source_code": "\ndef fit(self, X, y, **fit_params):\n \"\"\"Fit classifier.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Training vector, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\n y : array-like of shape (n_samples, n_outputs) or (n_samples,), default=None\n Target relative to X for classification or regression;\n None for unsupervised learning.\n\n **fit_params : dict of string -> object\n Parameters passed to the ``fit`` method of the estimator\n\n Returns\n -------\n self\n \"\"\"\n assert _num_samples(X) == _num_samples(y)\n if self.methods_to_check == 'all' or 'fit' in self.methods_to_check:\n (X, y) = self._check_X_y(X, y, should_be_fitted=False)\n self.n_features_in_ = np.shape(X)[1]\n self.classes_ = np.unique(check_array(y, ensure_2d=False, allow_nd=True))\n if self.expected_fit_params:\n missing = set(self.expected_fit_params) - set(fit_params)\n if missing:\n raise AssertionError(f'Expected fit parameter(s) {list(missing)} not seen.')\n for (key, value) in fit_params.items():\n if _num_samples(value) != _num_samples(X):\n raise AssertionError(f'Fit parameter {key} has length {_num_samples(value)}; expected {_num_samples(X)}.')\n return self" }, { @@ -165685,7 +178864,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -165695,13 +178875,14 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "The input data." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Predict the first class seen in `classes_`.", - "docstring": "Predict the first class seen in `classes_`.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n The input data.\n\nReturns\n-------\npreds : ndarray of shape (n_samples,)\n Predictions of the first class seens in `classes_`.", + "docstring": "Predict the first class seen in `classes_`.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The input data.\n\n Returns\n -------\n preds : ndarray of shape (n_samples,)\n Predictions of the first class seens in `classes_`.\n ", "source_code": "\ndef predict(self, X):\n \"\"\"Predict the first class seen in `classes_`.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The input data.\n\n Returns\n -------\n preds : ndarray of shape (n_samples,)\n Predictions of the first class seens in `classes_`.\n \"\"\"\n if self.methods_to_check == 'all' or 'predict' in self.methods_to_check:\n (X, y) = self._check_X_y(X)\n return self.classes_[np.zeros(_num_samples(X), dtype=int)]" }, { @@ -165719,7 +178900,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -165729,13 +178911,14 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "The input data." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Predict probabilities for each class.\n\nHere, the dummy classifier will provide a probability of 1 for the first class of `classes_` and 0 otherwise.", - "docstring": "Predict probabilities for each class.\n\nHere, the dummy classifier will provide a probability of 1 for the\nfirst class of `classes_` and 0 otherwise.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n The input data.\n\nReturns\n-------\nproba : ndarray of shape (n_samples, n_classes)\n The probabilities for each sample and class.", + "description": "Predict probabilities for each class.\n\nHere, the dummy classifier will provide a probability of 1 for the\nfirst class of `classes_` and 0 otherwise.", + "docstring": "Predict probabilities for each class.\n\n Here, the dummy classifier will provide a probability of 1 for the\n first class of `classes_` and 0 otherwise.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The input data.\n\n Returns\n -------\n proba : ndarray of shape (n_samples, n_classes)\n The probabilities for each sample and class.\n ", "source_code": "\ndef predict_proba(self, X):\n \"\"\"Predict probabilities for each class.\n\n Here, the dummy classifier will provide a probability of 1 for the\n first class of `classes_` and 0 otherwise.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n The input data.\n\n Returns\n -------\n proba : ndarray of shape (n_samples, n_classes)\n The probabilities for each sample and class.\n \"\"\"\n if self.methods_to_check == 'all' or 'predict_proba' in self.methods_to_check:\n (X, y) = self._check_X_y(X)\n proba = np.zeros((_num_samples(X), len(self.classes_)))\n proba[:, 0] = 1\n return proba" }, { @@ -165753,7 +178936,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -165763,7 +178947,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "Input data, where `n_samples` is the number of samples and\n`n_features` is the number of 
features." - } + }, + "refined_type": {} }, { "name": "Y", @@ -165773,13 +178958,14 @@ "docstring": { "type": "array-like of shape (n_samples, n_output) or (n_samples,)", "description": "Target relative to X for classification or regression;\nNone for unsupervised learning." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Fake score.", - "docstring": "Fake score.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n Input data, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\nY : array-like of shape (n_samples, n_output) or (n_samples,)\n Target relative to X for classification or regression;\n None for unsupervised learning.\n\nReturns\n-------\nscore : float\n Either 0 or 1 depending of `foo_param` (i.e. `foo_param > 1 =>\n score=1` otherwise `score=0`).", + "docstring": "Fake score.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Input data, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\n Y : array-like of shape (n_samples, n_output) or (n_samples,)\n Target relative to X for classification or regression;\n None for unsupervised learning.\n\n Returns\n -------\n score : float\n Either 0 or 1 depending of `foo_param` (i.e. `foo_param > 1 =>\n score=1` otherwise `score=0`).\n ", "source_code": "\ndef score(self, X=None, Y=None):\n \"\"\"Fake score.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Input data, where `n_samples` is the number of samples and\n `n_features` is the number of features.\n\n Y : array-like of shape (n_samples, n_output) or (n_samples,)\n Target relative to X for classification or regression;\n None for unsupervised learning.\n\n Returns\n -------\n score : float\n Either 0 or 1 depending of `foo_param` (i.e. 
`foo_param > 1 =>\n score=1` otherwise `score=0`).\n \"\"\"\n if self.methods_to_check == 'all' or 'score' in self.methods_to_check:\n self._check_X_y(X, Y)\n if self.foo_param > 1:\n score = 1.0\n else:\n score = 0.0\n return score" }, { @@ -165797,7 +178983,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "dtype", @@ -165807,13 +178994,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __array__(self, dtype=None):\n return self.array" }, { @@ -165831,7 +179019,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "other", @@ -165841,13 +179030,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __eq__(self, other):\n return MockDataFrame(self.array == other.array)" }, { @@ -165865,7 +179055,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "array", @@ -165875,13 +179066,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, array):\n self.array = array\n self.values = array\n self.shape = array.shape\n self.ndim = array.ndim\n self.iloc = ArraySlicingWrapper(array)" }, { @@ -165899,13 +179091,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __len__(self):\n return len(self.array)" }, { @@ -165923,7 +179116,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "other", @@ -165933,13 +179127,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __ne__(self, other):\n return not self == other" }, { @@ -165957,7 +179152,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "indices", @@ -165967,7 +179163,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "axis", @@ -165977,13 +179174,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef take(self, indices, axis=0):\n return MockDataFrame(self.array.take(indices, axis=axis))" }, { @@ -166001,7 +179199,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "est", @@ -166011,13 +179210,14 @@ "docstring": { "type": "estimator, default=None", "description": "The estimator to wrap." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, est=None):\n self.est = est" }, { @@ -166035,13 +179235,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _more_tags(self):\n return {'_skip_test': True}" }, { @@ -166059,7 +179260,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -166069,7 +179271,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -166079,13 +179282,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef fit(self, X, y):\n return self.est.fit(X, y)" }, { @@ -166103,7 +179307,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -166113,13 +179318,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef predict(self, X):\n return self.est.predict(X)" }, { @@ -166137,7 +179343,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -166147,13 +179354,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef predict_proba(self, X):\n return self.est.predict_proba(X)" }, { @@ -166171,7 +179379,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "key", @@ -166181,7 +179390,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "key_dtype", @@ -166191,7 +179401,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "axis", @@ -166201,7 +179412,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -166225,13 +179437,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __repr__(self):\n return super().__repr__()" }, { @@ -166249,7 +179462,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "indent", @@ -166259,7 +179473,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "width", @@ -166269,7 +179484,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "depth", @@ -166279,7 +179495,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "stream", @@ -166289,7 +179506,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "compact", @@ -166299,7 +179517,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "indent_at_name", @@ -166309,7 +179528,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_max_elements_to_show", @@ -166319,13 +179539,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} 
} ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, indent=1, width=80, depth=None, stream=None, *, compact=False, indent_at_name=True, n_max_elements_to_show=None):\n super().__init__(indent, width, depth, stream, compact=compact)\n self._indent_at_name = indent_at_name\n if self._indent_at_name:\n self._indent_per_level = 1\n self._changed_only = get_config()['print_changed_only']\n self.n_max_elements_to_show = n_max_elements_to_show" }, { @@ -166343,7 +179564,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "items", @@ -166353,7 +179575,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "stream", @@ -166363,7 +179586,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "indent", @@ -166373,7 +179597,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "allowance", @@ -166383,7 +179608,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "context", @@ -166393,7 +179619,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "level", @@ -166403,13 +179630,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _format_dict_items(self, items, stream, indent, allowance, context, level):\n return self._format_params_or_dict_items(items, stream, indent, allowance, context, level, is_dict=True)" }, { @@ -166427,7 +179655,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "items", @@ -166437,7 +179666,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "stream", @@ -166447,7 +179677,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "indent", @@ -166457,7 +179688,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "allowance", @@ -166467,7 +179699,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "context", @@ -166477,7 +179710,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "level", @@ -166487,13 +179721,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Format the items of an iterable (list, tuple...). Same as the built-in _format_items, with support for ellipsis if the number of elements is greater than self.n_max_elements_to_show.", - "docstring": "Format the items of an iterable (list, tuple...). Same as the\nbuilt-in _format_items, with support for ellipsis if the number of\nelements is greater than self.n_max_elements_to_show.", + "description": "Format the items of an iterable (list, tuple...). Same as the\nbuilt-in _format_items, with support for ellipsis if the number of\nelements is greater than self.n_max_elements_to_show.", + "docstring": "Format the items of an iterable (list, tuple...). Same as the\n built-in _format_items, with support for ellipsis if the number of\n elements is greater than self.n_max_elements_to_show.\n ", "source_code": "\ndef _format_items(self, items, stream, indent, allowance, context, level):\n \"\"\"Format the items of an iterable (list, tuple...). 
Same as the\n built-in _format_items, with support for ellipsis if the number of\n elements is greater than self.n_max_elements_to_show.\n \"\"\"\n write = stream.write\n indent += self._indent_per_level\n if self._indent_per_level > 1:\n write((self._indent_per_level - 1) * ' ')\n delimnl = ',\\n' + ' ' * indent\n delim = ''\n width = max_width = self._width - indent + 1\n it = iter(items)\n try:\n next_ent = next(it)\n except StopIteration:\n return\n last = False\n n_items = 0\n while not last:\n if n_items == self.n_max_elements_to_show:\n write(', ...')\n break\n n_items += 1\n ent = next_ent\n try:\n next_ent = next(it)\n except StopIteration:\n last = True\n max_width -= allowance\n width -= allowance\n if self._compact:\n rep = self._repr(ent, context, level)\n w = len(rep) + 2\n if width < w:\n width = max_width\n if delim:\n delim = delimnl\n if width >= w:\n width -= w\n write(delim)\n delim = ', '\n write(rep)\n continue\n write(delim)\n delim = delimnl\n self._format(ent, stream, indent, allowance if last else 1, context, level)" }, { @@ -166511,7 +179746,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "items", @@ -166521,7 +179757,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "stream", @@ -166531,7 +179768,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "indent", @@ -166541,7 +179779,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "allowance", @@ -166551,7 +179790,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "context", @@ -166561,7 +179801,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "level", @@ -166571,13 +179812,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _format_params(self, items, stream, indent, allowance, context, level):\n return self._format_params_or_dict_items(items, stream, indent, allowance, context, level, is_dict=False)" }, { @@ -166595,7 +179837,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "object", @@ -166605,7 +179848,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "stream", @@ -166615,7 +179859,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "indent", @@ -166625,7 +179870,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "allowance", @@ -166635,7 +179881,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "context", @@ -166645,7 +179892,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "level", @@ -166655,7 +179903,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "is_dict", @@ -166665,13 +179914,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Format dict items or parameters respecting the compact=True parameter. For some reason, the builtin rendering of dict items doesn't respect compact=True and will use one line per key-value if all cannot fit in a single line. 
Dict items will be rendered as <'key': value> while params will be rendered as . The implementation is mostly copy/pasting from the builtin _format_items(). This also adds ellipsis if the number of items is greater than self.n_max_elements_to_show.", - "docstring": "Format dict items or parameters respecting the compact=True\nparameter. For some reason, the builtin rendering of dict items doesn't\nrespect compact=True and will use one line per key-value if all cannot\nfit in a single line.\nDict items will be rendered as <'key': value> while params will be\nrendered as . The implementation is mostly copy/pasting from\nthe builtin _format_items().\nThis also adds ellipsis if the number of items is greater than\nself.n_max_elements_to_show.", + "description": "Format dict items or parameters respecting the compact=True\nparameter. For some reason, the builtin rendering of dict items doesn't\nrespect compact=True and will use one line per key-value if all cannot\nfit in a single line.\nDict items will be rendered as <'key': value> while params will be\nrendered as . The implementation is mostly copy/pasting from\nthe builtin _format_items().\nThis also adds ellipsis if the number of items is greater than\nself.n_max_elements_to_show.", + "docstring": "Format dict items or parameters respecting the compact=True\n parameter. For some reason, the builtin rendering of dict items doesn't\n respect compact=True and will use one line per key-value if all cannot\n fit in a single line.\n Dict items will be rendered as <'key': value> while params will be\n rendered as . The implementation is mostly copy/pasting from\n the builtin _format_items().\n This also adds ellipsis if the number of items is greater than\n self.n_max_elements_to_show.\n ", "source_code": "\ndef _format_params_or_dict_items(self, object, stream, indent, allowance, context, level, is_dict):\n \"\"\"Format dict items or parameters respecting the compact=True\n parameter. For some reason, the builtin rendering of dict items doesn't\n respect compact=True and will use one line per key-value if all cannot\n fit in a single line.\n Dict items will be rendered as <'key': value> while params will be\n rendered as . 
The implementation is mostly copy/pasting from\n the builtin _format_items().\n This also adds ellipsis if the number of items is greater than\n self.n_max_elements_to_show.\n \"\"\"\n write = stream.write\n indent += self._indent_per_level\n delimnl = ',\\n' + ' ' * indent\n delim = ''\n width = max_width = self._width - indent + 1\n it = iter(object)\n try:\n next_ent = next(it)\n except StopIteration:\n return\n last = False\n n_items = 0\n while not last:\n if n_items == self.n_max_elements_to_show:\n write(', ...')\n break\n n_items += 1\n ent = next_ent\n try:\n next_ent = next(it)\n except StopIteration:\n last = True\n max_width -= allowance\n width -= allowance\n if self._compact:\n (k, v) = ent\n krepr = self._repr(k, context, level)\n vrepr = self._repr(v, context, level)\n if not is_dict:\n krepr = krepr.strip(\"'\")\n middle = ': ' if is_dict else '='\n rep = krepr + middle + vrepr\n w = len(rep) + 2\n if width < w:\n width = max_width\n if delim:\n delim = delimnl\n if width >= w:\n width -= w\n write(delim)\n delim = ', '\n write(rep)\n continue\n write(delim)\n delim = delimnl\n class_ = KeyValTuple if is_dict else KeyValTupleParam\n self._format(class_(ent), stream, indent, allowance if last else 1, context, level)" }, { @@ -166689,7 +179939,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "object", @@ -166699,7 +179950,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "stream", @@ -166709,7 +179961,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "indent", @@ -166719,7 +179972,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "allowance", @@ -166729,7 +179983,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "context", @@ -166739,7 +179994,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "level", @@ -166749,13 +180005,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _pprint_estimator(self, object, stream, indent, allowance, context, level):\n stream.write(object.__class__.__name__ + '(')\n if self._indent_at_name:\n indent += len(object.__class__.__name__)\n if self._changed_only:\n params = _changed_params(object)\n else:\n params = object.get_params(deep=False)\n params = OrderedDict(((name, val) for (name, val) in sorted(params.items())))\n self._format_params(params.items(), stream, indent, allowance + 1, context, level)\n stream.write(')')" }, { @@ -166773,7 +180030,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "object", @@ -166783,7 +180041,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "stream", @@ -166793,7 +180052,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "indent", @@ -166803,7 +180063,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "allowance", @@ -166813,7 +180074,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "context", @@ -166823,7 +180085,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "level", @@ -166833,7 +180096,8 @@ "docstring": { "type": "", "description": "" - } + }, + 
"refined_type": {} } ], "results": [], @@ -166857,7 +180121,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "object", @@ -166867,7 +180132,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "context", @@ -166877,7 +180143,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "maxlevels", @@ -166887,7 +180154,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "level", @@ -166897,13 +180165,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef format(self, object, context, maxlevels, level):\n return _safe_repr(object, context, maxlevels, level, changed_only=self._changed_only)" }, { @@ -166921,13 +180190,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Return dict (param_name: value) of parameters that were given to estimator with non-default values.", - "docstring": "Return dict (param_name: value) of parameters that were given to\nestimator with non-default values.", + "description": "Return dict (param_name: value) of parameters that were given to\nestimator with non-default values.", + "docstring": "Return dict (param_name: value) of parameters that were given to\n estimator with non-default values.", "source_code": "\ndef _changed_params(estimator):\n \"\"\"Return dict (param_name: value) of parameters that were given to\n estimator with non-default values.\"\"\"\n params = estimator.get_params(deep=False)\n init_func = getattr(estimator.__init__, 'deprecated_original', estimator.__init__)\n init_params = inspect.signature(init_func).parameters\n init_params = {name: param.default for (name, param) in init_params.items()}\n \n def has_changed(k, v):\n if k not in init_params:\n return True\n if init_params[k] == inspect._empty:\n return True\n if isinstance(v, BaseEstimator) and v.__class__ != init_params[k].__class__:\n return True\n if repr(v) != repr(init_params[k]) and not (is_scalar_nan(init_params[k]) and is_scalar_nan(v)):\n return True\n return False\n return {k: v for (k, v) in params.items() if has_changed(k, v)}" }, { @@ -166945,7 +180215,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "context", @@ -166955,7 +180226,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "maxlevels", @@ -166965,7 +180237,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "level", @@ -166975,7 +180248,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "changed_only", @@ -166985,13 +180259,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Same as the builtin _safe_repr, with added support for Estimator objects.", - "docstring": "Same as the builtin _safe_repr, with added support for Estimator\nobjects.", + "description": "Same as the builtin _safe_repr, with added support for Estimator\nobjects.", + "docstring": "Same as the builtin _safe_repr, with added support for Estimator\n objects.", "source_code": "\ndef _safe_repr(object, context, maxlevels, level, changed_only=False):\n \"\"\"Same as the builtin _safe_repr, with added support for 
Estimator\n objects.\"\"\"\n typ = type(object)\n if typ in pprint._builtin_scalars:\n return repr(object), True, False\n r = getattr(typ, '__repr__', None)\n if issubclass(typ, dict) and r is dict.__repr__:\n if not object:\n return '{}', True, False\n objid = id(object)\n if maxlevels and level >= maxlevels:\n return '{...}', False, objid in context\n if objid in context:\n return pprint._recursion(object), False, True\n context[objid] = 1\n readable = True\n recursive = False\n components = []\n append = components.append\n level += 1\n saferepr = _safe_repr\n items = sorted(object.items(), key=pprint._safe_tuple)\n for (k, v) in items:\n (krepr, kreadable, krecur) = saferepr(k, context, maxlevels, level, changed_only=changed_only)\n (vrepr, vreadable, vrecur) = saferepr(v, context, maxlevels, level, changed_only=changed_only)\n append('%s: %s' % (krepr, vrepr))\n readable = readable and kreadable and vreadable\n if krecur or vrecur:\n recursive = True\n del context[objid]\n return '{%s}' % ', '.join(components), readable, recursive\n if issubclass(typ, list) and r is list.__repr__ or issubclass(typ, tuple) and r is tuple.__repr__:\n if issubclass(typ, list):\n if not object:\n return '[]', True, False\n format = '[%s]'\n elif len(object) == 1:\n format = '(%s,)'\n else:\n if not object:\n return '()', True, False\n format = '(%s)'\n objid = id(object)\n if maxlevels and level >= maxlevels:\n return format % '...', False, objid in context\n if objid in context:\n return pprint._recursion(object), False, True\n context[objid] = 1\n readable = True\n recursive = False\n components = []\n append = components.append\n level += 1\n for o in object:\n (orepr, oreadable, orecur) = _safe_repr(o, context, maxlevels, level, changed_only=changed_only)\n append(orepr)\n if not oreadable:\n readable = False\n if orecur:\n recursive = True\n del context[objid]\n return format % ', '.join(components), readable, recursive\n if issubclass(typ, BaseEstimator):\n objid = id(object)\n if maxlevels and level >= maxlevels:\n return '{...}', False, objid in context\n if objid in context:\n return pprint._recursion(object), False, True\n context[objid] = 1\n readable = True\n recursive = False\n if changed_only:\n params = _changed_params(object)\n else:\n params = object.get_params(deep=False)\n components = []\n append = components.append\n level += 1\n saferepr = _safe_repr\n items = sorted(params.items(), key=pprint._safe_tuple)\n for (k, v) in items:\n (krepr, kreadable, krecur) = saferepr(k, context, maxlevels, level, changed_only=changed_only)\n (vrepr, vreadable, vrecur) = saferepr(v, context, maxlevels, level, changed_only=changed_only)\n append('%s=%s' % (krepr.strip(\"'\"), vrepr))\n readable = readable and kreadable and vreadable\n if krecur or vrecur:\n recursive = True\n del context[objid]\n return '%s(%s)' % (typ.__name__, ', '.join(components)), readable, recursive\n rep = repr(object)\n return rep, rep and not rep.startswith('<'), False" }, { @@ -167009,7 +180284,8 @@ "docstring": { "type": "str", "description": "String indicating the source or the reference of the message." - } + }, + "refined_type": {} }, { "name": "message", @@ -167019,13 +180295,14 @@ "docstring": { "type": "str, default=None", "description": "Short message. If None, nothing will be printed." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Log elapsed time to stdout when the context is exited.", - "docstring": "Log elapsed time to stdout when the context is exited.\n\nParameters\n----------\nsource : str\n String indicating the source or the reference of the message.\n\nmessage : str, default=None\n Short message. If None, nothing will be printed.\n\nReturns\n-------\ncontext_manager\n Prints elapsed time upon exit if verbose.", + "docstring": "Log elapsed time to stdout when the context is exited.\n\n Parameters\n ----------\n source : str\n String indicating the source or the reference of the message.\n\n message : str, default=None\n Short message. If None, nothing will be printed.\n\n Returns\n -------\n context_manager\n Prints elapsed time upon exit if verbose.\n ", "source_code": "\n@contextmanager\ndef _print_elapsed_time(source, message=None):\n \"\"\"Log elapsed time to stdout when the context is exited.\n\n Parameters\n ----------\n source : str\n String indicating the source or the reference of the message.\n\n message : str, default=None\n Short message. If None, nothing will be printed.\n\n Returns\n -------\n context_manager\n Prints elapsed time upon exit if verbose.\n \"\"\"\n if message is None:\n yield\n else:\n start = timeit.default_timer()\n yield\n print(_message_with_time(source, message, timeit.default_timer() - start))" }, { @@ -167043,7 +180320,8 @@ "docstring": { "type": "array-like, sparse-matrix, list, pandas.DataFrame, pandas.Series", "description": "Data from which to sample rows, items or columns. `list` are only\nsupported when `axis=0`." - } + }, + "refined_type": {} }, { "name": "indices", @@ -167053,7 +180331,8 @@ "docstring": { "type": "bool, int, str, slice, array-like", "description": "- If `axis=0`, boolean and integer array-like, integer slice,\n and scalar integer are supported.\n- If `axis=1`:\n - to select a single column, `indices` can be of `int` type for\n all `X` types and `str` only for dataframe. The selected subset\n will be 1D, unless `X` is a sparse matrix in which case it will\n be 2D.\n - to select multiples columns, `indices` can be one of the\n following: `list`, `array`, `slice`. The type used in\n these containers can be one of the following: `int`, 'bool' and\n `str`. However, `str` is only supported when `X` is a dataframe.\n The selected subset will be 2D." - } + }, + "refined_type": {} }, { "name": "axis", @@ -167063,13 +180342,14 @@ "docstring": { "type": "int, default=0", "description": "The axis along which `X` will be subsampled. `axis=0` will select\nrows while `axis=1` will select columns." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Return rows, items or columns of X using indices.\n\n.. warning:: This utility is documented, but **private**. This means that backward compatibility might be broken without any deprecation cycle.", - "docstring": "Return rows, items or columns of X using indices.\n\n.. warning::\n\n This utility is documented, but **private**. This means that\n backward compatibility might be broken without any deprecation\n cycle.\n\nParameters\n----------\nX : array-like, sparse-matrix, list, pandas.DataFrame, pandas.Series\n Data from which to sample rows, items or columns. 
`list` are only\n supported when `axis=0`.\nindices : bool, int, str, slice, array-like\n - If `axis=0`, boolean and integer array-like, integer slice,\n and scalar integer are supported.\n - If `axis=1`:\n - to select a single column, `indices` can be of `int` type for\n all `X` types and `str` only for dataframe. The selected subset\n will be 1D, unless `X` is a sparse matrix in which case it will\n be 2D.\n - to select multiples columns, `indices` can be one of the\n following: `list`, `array`, `slice`. The type used in\n these containers can be one of the following: `int`, 'bool' and\n `str`. However, `str` is only supported when `X` is a dataframe.\n The selected subset will be 2D.\naxis : int, default=0\n The axis along which `X` will be subsampled. `axis=0` will select\n rows while `axis=1` will select columns.\n\nReturns\n-------\nsubset\n Subset of X on axis 0 or 1.\n\nNotes\n-----\nCSR, CSC, and LIL sparse matrices are supported. COO sparse matrices are\nnot supported.", + "description": "Return rows, items or columns of X using indices.\n\n.. warning::\n\n This utility is documented, but **private**. This means that\n backward compatibility might be broken without any deprecation\n cycle.", + "docstring": "Return rows, items or columns of X using indices.\n\n .. warning::\n\n This utility is documented, but **private**. This means that\n backward compatibility might be broken without any deprecation\n cycle.\n\n Parameters\n ----------\n X : array-like, sparse-matrix, list, pandas.DataFrame, pandas.Series\n Data from which to sample rows, items or columns. `list` are only\n supported when `axis=0`.\n indices : bool, int, str, slice, array-like\n - If `axis=0`, boolean and integer array-like, integer slice,\n and scalar integer are supported.\n - If `axis=1`:\n - to select a single column, `indices` can be of `int` type for\n all `X` types and `str` only for dataframe. The selected subset\n will be 1D, unless `X` is a sparse matrix in which case it will\n be 2D.\n - to select multiples columns, `indices` can be one of the\n following: `list`, `array`, `slice`. The type used in\n these containers can be one of the following: `int`, 'bool' and\n `str`. However, `str` is only supported when `X` is a dataframe.\n The selected subset will be 2D.\n axis : int, default=0\n The axis along which `X` will be subsampled. `axis=0` will select\n rows while `axis=1` will select columns.\n\n Returns\n -------\n subset\n Subset of X on axis 0 or 1.\n\n Notes\n -----\n CSR, CSC, and LIL sparse matrices are supported. COO sparse matrices are\n not supported.\n ", "source_code": "\ndef _safe_indexing(X, indices, *, axis=0):\n \"\"\"Return rows, items or columns of X using indices.\n\n .. warning::\n\n This utility is documented, but **private**. This means that\n backward compatibility might be broken without any deprecation\n cycle.\n\n Parameters\n ----------\n X : array-like, sparse-matrix, list, pandas.DataFrame, pandas.Series\n Data from which to sample rows, items or columns. `list` are only\n supported when `axis=0`.\n indices : bool, int, str, slice, array-like\n - If `axis=0`, boolean and integer array-like, integer slice,\n and scalar integer are supported.\n - If `axis=1`:\n - to select a single column, `indices` can be of `int` type for\n all `X` types and `str` only for dataframe. The selected subset\n will be 1D, unless `X` is a sparse matrix in which case it will\n be 2D.\n - to select multiples columns, `indices` can be one of the\n following: `list`, `array`, `slice`. 
The type used in\n these containers can be one of the following: `int`, 'bool' and\n `str`. However, `str` is only supported when `X` is a dataframe.\n The selected subset will be 2D.\n axis : int, default=0\n The axis along which `X` will be subsampled. `axis=0` will select\n rows while `axis=1` will select columns.\n\n Returns\n -------\n subset\n Subset of X on axis 0 or 1.\n\n Notes\n -----\n CSR, CSC, and LIL sparse matrices are supported. COO sparse matrices are\n not supported.\n \"\"\"\n if indices is None:\n return X\n if axis not in (0, 1):\n raise ValueError(\"'axis' should be either 0 (to index rows) or 1 (to index column). Got {} instead.\".format(axis))\n indices_dtype = _determine_key_type(indices)\n if axis == 0 and indices_dtype == 'str':\n raise ValueError(\"String indexing is not supported with 'axis=0'\")\n if axis == 1 and X.ndim != 2:\n raise ValueError(\"'X' should be a 2D NumPy array, 2D sparse matrix or pandas dataframe when indexing the columns (i.e. 'axis=1'). Got {} instead with {} dimension(s).\".format(type(X), X.ndim))\n if axis == 1 and indices_dtype == 'str' and not hasattr(X, 'loc'):\n raise ValueError('Specifying the columns using strings is only supported for pandas DataFrames')\n if hasattr(X, 'iloc'):\n return _pandas_indexing(X, indices, indices_dtype, axis=axis)\n elif hasattr(X, 'shape'):\n return _array_indexing(X, indices, indices_dtype, axis=axis)\n else:\n return _list_indexing(X, indices, indices_dtype)" }, { @@ -167082,7 +180362,7 @@ "results": [], "is_public": false, "description": "Overview of the installed version of main dependencies", - "docstring": "Overview of the installed version of main dependencies\n\nReturns\n-------\ndeps_info: dict\n version information on relevant Python libraries", + "docstring": "Overview of the installed version of main dependencies\n\n Returns\n -------\n deps_info: dict\n version information on relevant Python libraries\n\n ", "source_code": "\ndef _get_deps_info():\n \"\"\"Overview of the installed version of main dependencies\n\n Returns\n -------\n deps_info: dict\n version information on relevant Python libraries\n\n \"\"\"\n deps = ['pip', 'setuptools', 'sklearn', 'numpy', 'scipy', 'Cython', 'pandas', 'matplotlib', 'joblib', 'threadpoolctl']\n \n def get_version(module):\n return module.__version__\n deps_info = {}\n for modname in deps:\n try:\n if modname in sys.modules:\n mod = sys.modules[modname]\n else:\n mod = importlib.import_module(modname)\n ver = get_version(mod)\n deps_info[modname] = ver\n except ImportError:\n deps_info[modname] = None\n return deps_info" }, { @@ -167095,7 +180375,7 @@ "results": [], "is_public": false, "description": "System information", - "docstring": "System information\n\nReturns\n-------\nsys_info : dict\n system and Python version information", + "docstring": "System information\n\n Returns\n -------\n sys_info : dict\n system and Python version information\n\n ", "source_code": "\ndef _get_sys_info():\n \"\"\"System information\n\n Returns\n -------\n sys_info : dict\n system and Python version information\n\n \"\"\"\n python = sys.version.replace('\\n', ' ')\n blob = [('python', python), ('executable', sys.executable), ('machine', platform.platform())]\n return dict(blob)" }, { @@ -167108,7 +180388,7 @@ "results": [], "is_public": true, "description": "Print useful debugging information\"\n\n.. versionadded:: 0.20", - "docstring": "Print useful debugging information\"\n\n.. versionadded:: 0.20", + "docstring": "Print useful debugging information\"\n\n .. 
versionadded:: 0.20\n ", "source_code": "\ndef show_versions():\n \"\"\"Print useful debugging information\"\n\n .. versionadded:: 0.20\n \"\"\"\n sys_info = _get_sys_info()\n deps_info = _get_deps_info()\n print('\\nSystem:')\n for (k, stat) in sys_info.items():\n print('{k:>10}: {stat}'.format(k=k, stat=stat))\n print('\\nPython dependencies:')\n for (k, stat) in deps_info.items():\n print('{k:>13}: {stat}'.format(k=k, stat=stat))\n print('\\n{k}: {stat}'.format(k='Built with OpenMP', stat=_openmp_parallelism_enabled()))" }, { @@ -167126,7 +180406,8 @@ "docstring": { "type": "estimator object", "description": "The estimator from which to get the tag." - } + }, + "refined_type": {} }, { "name": "key", @@ -167136,13 +180417,14 @@ "docstring": { "type": "str, default=None", "description": "Tag name to get. By default (`None`), all tags are returned." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Safely get estimator tags.\n\n:class:`~sklearn.BaseEstimator` provides the estimator tags machinery. However, if an estimator does not inherit from this base class, we should fall-back to the default tags. For scikit-learn built-in estimators, we should still rely on `self._get_tags()`. `_safe_tags(est)` should be used when we are not sure where `est` comes from: typically `_safe_tags(self.base_estimator)` where `self` is a meta-estimator, or in the common checks.", - "docstring": "Safely get estimator tags.\n\n:class:`~sklearn.BaseEstimator` provides the estimator tags machinery.\nHowever, if an estimator does not inherit from this base class, we should\nfall-back to the default tags.\n\nFor scikit-learn built-in estimators, we should still rely on\n`self._get_tags()`. `_safe_tags(est)` should be used when we are not sure\nwhere `est` comes from: typically `_safe_tags(self.base_estimator)` where\n`self` is a meta-estimator, or in the common checks.\n\nParameters\n----------\nestimator : estimator object\n The estimator from which to get the tag.\n\nkey : str, default=None\n Tag name to get. By default (`None`), all tags are returned.\n\nReturns\n-------\ntags : dict or tag value\n The estimator tags. A single value is returned if `key` is not None.", + "description": "Safely get estimator tags.\n\n:class:`~sklearn.BaseEstimator` provides the estimator tags machinery.\nHowever, if an estimator does not inherit from this base class, we should\nfall-back to the default tags.\n\nFor scikit-learn built-in estimators, we should still rely on\n`self._get_tags()`. `_safe_tags(est)` should be used when we are not sure\nwhere `est` comes from: typically `_safe_tags(self.base_estimator)` where\n`self` is a meta-estimator, or in the common checks.", + "docstring": "Safely get estimator tags.\n\n :class:`~sklearn.BaseEstimator` provides the estimator tags machinery.\n However, if an estimator does not inherit from this base class, we should\n fall-back to the default tags.\n\n For scikit-learn built-in estimators, we should still rely on\n `self._get_tags()`. `_safe_tags(est)` should be used when we are not sure\n where `est` comes from: typically `_safe_tags(self.base_estimator)` where\n `self` is a meta-estimator, or in the common checks.\n\n Parameters\n ----------\n estimator : estimator object\n The estimator from which to get the tag.\n\n key : str, default=None\n Tag name to get. By default (`None`), all tags are returned.\n\n Returns\n -------\n tags : dict or tag value\n The estimator tags. 
A single value is returned if `key` is not None.\n ", "source_code": "\ndef _safe_tags(estimator, key=None):\n \"\"\"Safely get estimator tags.\n\n :class:`~sklearn.BaseEstimator` provides the estimator tags machinery.\n However, if an estimator does not inherit from this base class, we should\n fall-back to the default tags.\n\n For scikit-learn built-in estimators, we should still rely on\n `self._get_tags()`. `_safe_tags(est)` should be used when we are not sure\n where `est` comes from: typically `_safe_tags(self.base_estimator)` where\n `self` is a meta-estimator, or in the common checks.\n\n Parameters\n ----------\n estimator : estimator object\n The estimator from which to get the tag.\n\n key : str, default=None\n Tag name to get. By default (`None`), all tags are returned.\n\n Returns\n -------\n tags : dict or tag value\n The estimator tags. A single value is returned if `key` is not None.\n \"\"\"\n if hasattr(estimator, '_get_tags'):\n tags_provider = '_get_tags()'\n tags = estimator._get_tags()\n elif hasattr(estimator, '_more_tags'):\n tags_provider = '_more_tags()'\n tags = {**_DEFAULT_TAGS, **estimator._more_tags()}\n else:\n tags_provider = '_DEFAULT_TAGS'\n tags = _DEFAULT_TAGS\n if key is not None:\n if key not in tags:\n raise ValueError(f'The key {key} is not defined in {tags_provider} for the class {estimator.__class__.__name__}.')\n return tags[key]\n return tags" }, { @@ -167160,7 +180442,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "param", @@ -167170,13 +180453,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, param=None):\n self.param = param" }, { @@ -167194,7 +180478,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -167204,7 +180489,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -167214,13 +180500,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef fit(self, X, y):\n (X, y) = check_X_y(X, y)\n check_classification_targets(y)\n (self.classes_, counts) = np.unique(y, return_counts=True)\n self._most_frequent_class_idx = counts.argmax()\n return self" }, { @@ -167238,7 +180525,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "deep", @@ -167248,13 +180536,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef get_params(self, deep=True):\n return {'param': self.param}" }, { @@ -167272,7 +180561,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -167282,13 +180572,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef predict(self, X):\n y_proba = self.predict_proba(X)\n y_pred = y_proba.argmax(axis=1)\n return self.classes_[y_pred]" }, { @@ -167306,7 +180597,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -167316,13 +180608,14 @@ "docstring": { "type": "", 
"description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef predict_proba(self, X):\n check_is_fitted(self)\n X = check_array(X)\n proba_shape = (X.shape[0], self.classes_.size)\n y_proba = np.zeros(shape=proba_shape, dtype=np.float64)\n y_proba[:, self._most_frequent_class_idx] = 1.0\n return y_proba" }, { @@ -167340,7 +180633,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -167350,7 +180644,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -167360,13 +180655,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef score(self, X, y):\n from sklearn.metrics import accuracy_score\n return accuracy_score(y, self.predict(X))" }, { @@ -167384,13 +180680,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef set_params(self, **params):\n for (key, value) in params.items():\n setattr(self, key, value)\n return self" }, { @@ -167408,7 +180705,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "param", @@ -167418,13 +180716,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, param=None):\n self.param = param" }, { @@ -167442,7 +180741,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -167452,7 +180752,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -167462,13 +180763,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef fit(self, X, y):\n (X, y) = check_X_y(X, y)\n self.is_fitted_ = True\n self._mean = np.mean(y)\n return self" }, { @@ -167486,7 +180788,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "deep", @@ -167496,13 +180799,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef get_params(self, deep=True):\n return {'param': self.param}" }, { @@ -167520,7 +180824,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -167530,13 +180835,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef predict(self, X):\n check_is_fitted(self)\n X = check_array(X)\n return np.ones(shape=(X.shape[0], )) * self._mean" }, { @@ -167554,7 +180860,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -167564,7 +180871,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -167574,13 +180882,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], 
"is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef score(self, X, y):\n from sklearn.metrics import r2_score\n return r2_score(y, self.predict(X))" }, { @@ -167598,13 +180907,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef set_params(self, **params):\n for (key, value) in params.items():\n setattr(self, key, value)\n return self" }, { @@ -167622,7 +180932,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "param", @@ -167632,13 +180943,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, param=None):\n self.param = param" }, { @@ -167656,7 +180968,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -167666,7 +180979,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -167676,13 +180990,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef fit(self, X, y=None):\n check_array(X)\n self.is_fitted_ = True\n return self" }, { @@ -167700,7 +181015,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -167710,7 +181026,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -167720,13 +181037,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef fit_transform(self, X, y=None):\n return self.fit(X, y).transform(X, y)" }, { @@ -167744,7 +181062,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "deep", @@ -167754,13 +181073,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef get_params(self, deep=True):\n return {'param': self.param}" }, { @@ -167778,13 +181098,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef set_params(self, **params):\n for (key, value) in params.items():\n setattr(self, key, value)\n return self" }, { @@ -167802,7 +181123,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -167812,7 +181134,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -167822,13 +181145,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef transform(self, X, y=None):\n check_is_fitted(self)\n X = check_array(X)\n return X" }, { @@ -167846,13 +181170,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef 
__enter__(self):\n (data_read_only, self.temp_folder) = create_memmap_backed_data(self.data, mmap_mode=self.mmap_mode, return_folder=True)\n return data_read_only" }, { @@ -167870,7 +181195,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "exc_type", @@ -167880,7 +181206,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "exc_val", @@ -167890,7 +181217,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "exc_tb", @@ -167900,13 +181228,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __exit__(self, exc_type, exc_val, exc_tb):\n _delete_folder(self.temp_folder)" }, { @@ -167924,7 +181253,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "data", @@ -167934,7 +181264,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "mmap_mode", @@ -167944,13 +181275,14 @@ "docstring": { "type": "str, default='r'", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, data, mmap_mode='r'):\n self.mmap_mode = mmap_mode\n self.data = data" }, { @@ -167968,7 +181300,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "fn", @@ -167978,7 +181311,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -168002,13 +181336,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __enter__(self):\n if self._entered:\n raise RuntimeError('Cannot enter %r twice' % self)\n self._entered = True\n self._filters = self._module.filters\n self._module.filters = self._filters[:]\n self._showwarning = self._module.showwarning\n warnings.simplefilter('ignore', self.category)" }, { @@ -168026,13 +181361,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __exit__(self, *exc_info):\n if not self._entered:\n raise RuntimeError('Cannot exit %r without entering first' % self)\n self._module.filters = self._filters\n self._module.showwarning = self._showwarning\n self.log[:] = []" }, { @@ -168050,7 +181386,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "category", @@ -168060,13 +181397,14 @@ "docstring": { "type": "tuple of warning class, default=Warning", "description": "The category to filter. By default, all the categories will be muted." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, category):\n self._record = True\n self._module = sys.modules['warnings']\n self._entered = False\n self.log = []\n self.category = category" }, { @@ -168084,13 +181422,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __repr__(self):\n args = []\n if self._record:\n args.append('record=True')\n if self._module is not sys.modules['warnings']:\n args.append('module=%r' % self._module)\n name = type(self).__name__\n return '%s(%s)' % (name, ', '.join(args))" }, { @@ -168108,7 +181447,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "exc_type", @@ -168118,7 +181458,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "exc_value", @@ -168128,7 +181469,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "_", @@ -168138,13 +181480,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __exit__(self, exc_type, exc_value, _):\n if exc_type is None:\n if self.may_pass:\n return True\n else:\n err_msg = self.err_msg or f'Did not raise: {self.expected_exc_types}'\n raise AssertionError(err_msg)\n if not any((issubclass(exc_type, expected_type) for expected_type in self.expected_exc_types)):\n if self.err_msg is not None:\n raise AssertionError(self.err_msg) from exc_value\n else:\n return False\n if self.matches is not None:\n err_msg = self.err_msg or 'The error message should contain one of the following patterns:\\n{}\\nGot {}'.format('\\n'.join(self.matches), str(exc_value))\n if not any((re.search(match, str(exc_value)) for match in self.matches)):\n raise AssertionError(err_msg) from exc_value\n self.raised_and_matched = True\n return True" }, { @@ -168162,7 +181505,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "expected_exc_type", @@ -168172,7 +181516,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "match", @@ -168182,7 +181527,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "may_pass", @@ -168192,7 +181538,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "err_msg", @@ -168202,13 +181549,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, expected_exc_type, match, may_pass, err_msg):\n self.expected_exc_types = expected_exc_type if isinstance(expected_exc_type, Iterable) else [expected_exc_type]\n self.matches = [match] if isinstance(match, str) else match\n self.may_pass = may_pass\n self.err_msg = err_msg\n self.raised_and_matched = False" }, { @@ -168226,7 +181574,8 @@ "docstring": { "type": "array-like", "description": "The container to convert." 
- } + }, + "refined_type": {} }, { "name": "constructor_name", @@ -168236,6 +181585,21 @@ "docstring": { "type": "{\"list\", \"tuple\", \"array\", \"sparse\", \"dataframe\", \"series\", \"index\", \"slice\", \"sparse_csr\", \"sparse_csc\"}", "description": "The type of the returned container." + }, + "refined_type": { + "kind": "EnumType", + "values": [ + "sparse", + "tuple", + "array", + "series", + "index", + "dataframe", + "list", + "sparse_csr", + "sparse_csc", + "slice" + ] } }, { @@ -168246,7 +181610,8 @@ "docstring": { "type": "index or array-like, default=None", "description": "For pandas container supporting `columns_names`, it will affect\nspecific names." - } + }, + "refined_type": {} }, { "name": "dtype", @@ -168256,13 +181621,14 @@ "docstring": { "type": "dtype, default=None", "description": "Force the dtype of the container. Does not apply to `\"slice\"`\ncontainer." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Convert a given container to a specific array-like with a dtype.", - "docstring": "Convert a given container to a specific array-like with a dtype.\n\nParameters\n----------\ncontainer : array-like\n The container to convert.\nconstructor_name : {\"list\", \"tuple\", \"array\", \"sparse\", \"dataframe\", \"series\", \"index\", \"slice\", \"sparse_csr\", \"sparse_csc\"}\n The type of the returned container.\ncolumns_name : index or array-like, default=None\n For pandas container supporting `columns_names`, it will affect\n specific names.\ndtype : dtype, default=None\n Force the dtype of the container. Does not apply to `\"slice\"`\n container.\n\nReturns\n-------\nconverted_container", + "docstring": "Convert a given container to a specific array-like with a dtype.\n\n Parameters\n ----------\n container : array-like\n The container to convert.\n constructor_name : {\"list\", \"tuple\", \"array\", \"sparse\", \"dataframe\", \"series\", \"index\", \"slice\", \"sparse_csr\", \"sparse_csc\"}\n The type of the returned container.\n columns_name : index or array-like, default=None\n For pandas container supporting `columns_names`, it will affect\n specific names.\n dtype : dtype, default=None\n Force the dtype of the container. Does not apply to `\"slice\"`\n container.\n\n Returns\n -------\n converted_container\n ", "source_code": "\ndef _convert_container(container, constructor_name, columns_name=None, dtype=None):\n \"\"\"Convert a given container to a specific array-like with a dtype.\n\n Parameters\n ----------\n container : array-like\n The container to convert.\n constructor_name : {\"list\", \"tuple\", \"array\", \"sparse\", \"dataframe\", \"series\", \"index\", \"slice\", \"sparse_csr\", \"sparse_csc\"}\n The type of the returned container.\n columns_name : index or array-like, default=None\n For pandas container supporting `columns_names`, it will affect\n specific names.\n dtype : dtype, default=None\n Force the dtype of the container. 
Does not apply to `\"slice\"`\n container.\n\n Returns\n -------\n converted_container\n \"\"\"\n if constructor_name == 'list':\n if dtype is None:\n return list(container)\n else:\n return np.asarray(container, dtype=dtype).tolist()\n elif constructor_name == 'tuple':\n if dtype is None:\n return tuple(container)\n else:\n return tuple(np.asarray(container, dtype=dtype).tolist())\n elif constructor_name == 'array':\n return np.asarray(container, dtype=dtype)\n elif constructor_name == 'sparse':\n return sp.sparse.csr_matrix(container, dtype=dtype)\n elif constructor_name == 'dataframe':\n pd = pytest.importorskip('pandas')\n return pd.DataFrame(container, columns=columns_name, dtype=dtype)\n elif constructor_name == 'series':\n pd = pytest.importorskip('pandas')\n return pd.Series(container, dtype=dtype)\n elif constructor_name == 'index':\n pd = pytest.importorskip('pandas')\n return pd.Index(container, dtype=dtype)\n elif constructor_name == 'slice':\n return slice(container[0], container[1])\n elif constructor_name == 'sparse_csr':\n return sp.sparse.csr_matrix(container, dtype=dtype)\n elif constructor_name == 'sparse_csc':\n return sp.sparse.csc_matrix(container, dtype=dtype)" }, { @@ -168280,7 +181646,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "warn", @@ -168290,13 +181657,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Utility function to cleanup a temporary folder if still existing.\n\nCopy from joblib.pool (for independence).", - "docstring": "Utility function to cleanup a temporary folder if still existing.\n\nCopy from joblib.pool (for independence).", + "docstring": "Utility function to cleanup a temporary folder if still existing.\n\n Copy from joblib.pool (for independence).\n ", "source_code": "\ndef _delete_folder(folder_path, warn=False):\n \"\"\"Utility function to cleanup a temporary folder if still existing.\n\n Copy from joblib.pool (for independence).\n \"\"\"\n try:\n if os.path.exists(folder_path):\n shutil.rmtree(folder_path)\n except WindowsError:\n if warn:\n warnings.warn('Could not delete temporary folder %s' % folder_path)" }, { @@ -168314,7 +181682,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "varargs", @@ -168324,7 +181693,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -168348,13 +181718,14 @@ "docstring": { "type": "callable", "description": "The function object." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Get function full name.", - "docstring": "Get function full name.\n\nParameters\n----------\nfunc : callable\n The function object.\n\nReturns\n-------\nname : str\n The function name.", + "docstring": "Get function full name.\n\n Parameters\n ----------\n func : callable\n The function object.\n\n Returns\n -------\n name : str\n The function name.\n ", "source_code": "\ndef _get_func_name(func):\n \"\"\"Get function full name.\n\n Parameters\n ----------\n func : callable\n The function object.\n\n Returns\n -------\n name : str\n The function name.\n \"\"\"\n parts = []\n module = inspect.getmodule(func)\n if module:\n parts.append(module.__name__)\n qualname = func.__qualname__\n if qualname != func.__name__:\n parts.append(qualname[:qualname.find('.')])\n parts.append(func.__name__)\n return '.'.join(parts)" }, { @@ -168372,6 +181743,10 @@ "docstring": { "type": "{array-like, sparse matrix}", "description": "First array to compare." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -168382,6 +181757,10 @@ "docstring": { "type": "{array-like, sparse matrix}", "description": "Second array to compare." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -168392,7 +181771,8 @@ "docstring": { "type": "float, default=1e-07", "description": "relative tolerance; see numpy.allclose." - } + }, + "refined_type": {} }, { "name": "atol", @@ -168402,7 +181782,8 @@ "docstring": { "type": "float, default=1e-9", "description": "absolute tolerance; see numpy.allclose. Note that the default here is\nmore tolerant than the default for numpy.testing.assert_allclose, where\natol=0." - } + }, + "refined_type": {} }, { "name": "err_msg", @@ -168412,13 +181793,14 @@ "docstring": { "type": "str, default=''", "description": "Error message to raise." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Assert allclose for sparse and dense data.\n\nBoth x and y need to be either sparse or dense, they can't be mixed.", - "docstring": "Assert allclose for sparse and dense data.\n\nBoth x and y need to be either sparse or dense, they\ncan't be mixed.\n\nParameters\n----------\nx : {array-like, sparse matrix}\n First array to compare.\n\ny : {array-like, sparse matrix}\n Second array to compare.\n\nrtol : float, default=1e-07\n relative tolerance; see numpy.allclose.\n\natol : float, default=1e-9\n absolute tolerance; see numpy.allclose. Note that the default here is\n more tolerant than the default for numpy.testing.assert_allclose, where\n atol=0.\n\nerr_msg : str, default=''\n Error message to raise.", + "description": "Assert allclose for sparse and dense data.\n\nBoth x and y need to be either sparse or dense, they\ncan't be mixed.", + "docstring": "Assert allclose for sparse and dense data.\n\n Both x and y need to be either sparse or dense, they\n can't be mixed.\n\n Parameters\n ----------\n x : {array-like, sparse matrix}\n First array to compare.\n\n y : {array-like, sparse matrix}\n Second array to compare.\n\n rtol : float, default=1e-07\n relative tolerance; see numpy.allclose.\n\n atol : float, default=1e-9\n absolute tolerance; see numpy.allclose. 
Note that the default here is\n more tolerant than the default for numpy.testing.assert_allclose, where\n atol=0.\n\n err_msg : str, default=''\n Error message to raise.\n ", "source_code": "\ndef assert_allclose_dense_sparse(x, y, rtol=1e-07, atol=1e-09, err_msg=''):\n \"\"\"Assert allclose for sparse and dense data.\n\n Both x and y need to be either sparse or dense, they\n can't be mixed.\n\n Parameters\n ----------\n x : {array-like, sparse matrix}\n First array to compare.\n\n y : {array-like, sparse matrix}\n Second array to compare.\n\n rtol : float, default=1e-07\n relative tolerance; see numpy.allclose.\n\n atol : float, default=1e-9\n absolute tolerance; see numpy.allclose. Note that the default here is\n more tolerant than the default for numpy.testing.assert_allclose, where\n atol=0.\n\n err_msg : str, default=''\n Error message to raise.\n \"\"\"\n if sp.sparse.issparse(x) and sp.sparse.issparse(y):\n x = x.tocsr()\n y = y.tocsr()\n x.sum_duplicates()\n y.sum_duplicates()\n assert_array_equal(x.indices, y.indices, err_msg=err_msg)\n assert_array_equal(x.indptr, y.indptr, err_msg=err_msg)\n assert_allclose(x.data, y.data, rtol=rtol, atol=atol, err_msg=err_msg)\n elif not sp.sparse.issparse(x) and not sp.sparse.issparse(y):\n assert_allclose(x, y, rtol=rtol, atol=atol, err_msg=err_msg)\n else:\n raise ValueError('Can only compare two sparse matrices, not a sparse matrix and an array.')" }, { @@ -168436,13 +181818,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "Parameters\n----------\nfunc\n*args\n**kw", + "docstring": "\n Parameters\n ----------\n func\n *args\n **kw\n ", "source_code": "\ndef assert_no_warnings(func, *args, **kw):\n \"\"\"\n Parameters\n ----------\n func\n *args\n **kw\n \"\"\"\n with warnings.catch_warnings(record=True) as w:\n warnings.simplefilter('always')\n result = func(*args, **kw)\n if hasattr(np, 'FutureWarning'):\n w = [e for e in w if e.category is not np.VisibleDeprecationWarning]\n if len(w) > 0:\n raise AssertionError('Got warnings when calling %s: [%s]' % (func.__name__, ', '.join((str(warning) for warning in w))))\n return result" }, { @@ -168460,7 +181843,8 @@ "docstring": { "type": "exception or tuple of exception", "description": "An Exception object." - } + }, + "refined_type": {} }, { "name": "message", @@ -168470,7 +181854,8 @@ "docstring": { "type": "str", "description": "The error message or a substring of the error message." - } + }, + "refined_type": {} }, { "name": "function", @@ -168480,13 +181865,14 @@ "docstring": { "type": "callable", "description": "Callable object to raise error." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Helper function to test the message raised in an exception.\n\nGiven an exception, a callable to raise the exception, and a message string, tests that the correct exception is raised and that the message is a substring of the error thrown. Used to test that the specific message thrown during an exception is correct.", - "docstring": "Helper function to test the message raised in an exception.\n\nGiven an exception, a callable to raise the exception, and\na message string, tests that the correct exception is raised and\nthat the message is a substring of the error thrown. 
Used to test\nthat the specific message thrown during an exception is correct.\n\nParameters\n----------\nexceptions : exception or tuple of exception\n An Exception object.\n\nmessage : str\n The error message or a substring of the error message.\n\nfunction : callable\n Callable object to raise error.\n\n*args : the positional arguments to `function`.\n\n**kwargs : the keyword arguments to `function`.", + "description": "Helper function to test the message raised in an exception.\n\nGiven an exception, a callable to raise the exception, and\na message string, tests that the correct exception is raised and\nthat the message is a substring of the error thrown. Used to test\nthat the specific message thrown during an exception is correct.", + "docstring": "Helper function to test the message raised in an exception.\n\n Given an exception, a callable to raise the exception, and\n a message string, tests that the correct exception is raised and\n that the message is a substring of the error thrown. Used to test\n that the specific message thrown during an exception is correct.\n\n Parameters\n ----------\n exceptions : exception or tuple of exception\n An Exception object.\n\n message : str\n The error message or a substring of the error message.\n\n function : callable\n Callable object to raise error.\n\n *args : the positional arguments to `function`.\n\n **kwargs : the keyword arguments to `function`.\n ", "source_code": "\ndef assert_raise_message(exceptions, message, function, *args, **kwargs):\n \"\"\"Helper function to test the message raised in an exception.\n\n Given an exception, a callable to raise the exception, and\n a message string, tests that the correct exception is raised and\n that the message is a substring of the error thrown. Used to test\n that the specific message thrown during an exception is correct.\n\n Parameters\n ----------\n exceptions : exception or tuple of exception\n An Exception object.\n\n message : str\n The error message or a substring of the error message.\n\n function : callable\n Callable object to raise error.\n\n *args : the positional arguments to `function`.\n\n **kwargs : the keyword arguments to `function`.\n \"\"\"\n try:\n function(*args, **kwargs)\n except exceptions as e:\n error_message = str(e)\n if message not in error_message:\n raise AssertionError('Error message does not include the expected string: %r. Observed error message: %r' % (message, error_message))\n else:\n if isinstance(exceptions, tuple):\n names = ' or '.join((e.__name__ for e in exceptions))\n else:\n names = exceptions.__name__\n raise AssertionError('%s not raised by %s' % (names, function.__name__))" }, { @@ -168504,7 +181890,8 @@ "docstring": { "type": "str", "description": "The Python source code to execute." - } + }, + "refined_type": {} }, { "name": "timeout", @@ -168514,13 +181901,14 @@ "docstring": { "type": "int, default=60", "description": "Time in seconds before timeout." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Utility to check assertions in an independent Python subprocess.\n\nThe script provided in the source code should return 0 and not print anything on stderr or stdout. 
This is a port from cloudpickle https://github.com/cloudpipe/cloudpickle", - "docstring": "Utility to check assertions in an independent Python subprocess.\n\nThe script provided in the source code should return 0 and not print\nanything on stderr or stdout.\n\nThis is a port from cloudpickle https://github.com/cloudpipe/cloudpickle\n\nParameters\n----------\nsource_code : str\n The Python source code to execute.\ntimeout : int, default=60\n Time in seconds before timeout.", + "description": "Utility to check assertions in an independent Python subprocess.\n\nThe script provided in the source code should return 0 and not print\nanything on stderr or stdout.\n\nThis is a port from cloudpickle https://github.com/cloudpipe/cloudpickle", + "docstring": "Utility to check assertions in an independent Python subprocess.\n\n The script provided in the source code should return 0 and not print\n anything on stderr or stdout.\n\n This is a port from cloudpickle https://github.com/cloudpipe/cloudpickle\n\n Parameters\n ----------\n source_code : str\n The Python source code to execute.\n timeout : int, default=60\n Time in seconds before timeout.\n ", "source_code": "\ndef assert_run_python_script(source_code, timeout=60):\n \"\"\"Utility to check assertions in an independent Python subprocess.\n\n The script provided in the source code should return 0 and not print\n anything on stderr or stdout.\n\n This is a port from cloudpickle https://github.com/cloudpipe/cloudpickle\n\n Parameters\n ----------\n source_code : str\n The Python source code to execute.\n timeout : int, default=60\n Time in seconds before timeout.\n \"\"\"\n (fd, source_file) = tempfile.mkstemp(suffix='_src_test_sklearn.py')\n os.close(fd)\n try:\n with open(source_file, 'wb') as f:\n f.write(source_code.encode('utf-8'))\n cmd = [sys.executable, source_file]\n cwd = op.normpath(op.join(op.dirname(sklearn.__file__), '..'))\n env = os.environ.copy()\n try:\n env['PYTHONPATH'] = os.pathsep.join([cwd, env['PYTHONPATH']])\n except KeyError:\n env['PYTHONPATH'] = cwd\n kwargs = {'cwd': cwd, 'stderr': STDOUT, 'env': env}\n coverage_rc = os.environ.get('COVERAGE_PROCESS_START')\n if coverage_rc:\n kwargs['env']['COVERAGE_PROCESS_START'] = coverage_rc\n kwargs['timeout'] = timeout\n try:\n try:\n out = check_output(cmd, **kwargs)\n except CalledProcessError as e:\n raise RuntimeError('script errored with output:\\n%s' % e.output.decode('utf-8'))\n if out != b'':\n raise AssertionError(out.decode('utf-8'))\n except TimeoutExpired as e:\n raise RuntimeError('script timeout, output so far:\\n%s' % e.output.decode('utf-8'))\n finally:\n os.unlink(source_file)" }, { @@ -168540,7 +181928,8 @@ "docstring": { "type": "the warning class", "description": "The class to test for, e.g. UserWarning." - } + }, + "refined_type": {} }, { "name": "func", @@ -168550,13 +181939,14 @@ "docstring": { "type": "callable", "description": "Callable object to trigger warnings." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Test that a certain warning occurs.\n\n.. deprecated:: 1.0 `assert_warns` is deprecated in 1.0 and will be removed in 1.2. Use `pytest.warns` instead.", - "docstring": "Test that a certain warning occurs.\n\n.. deprecated:: 1.0\n `assert_warns` is deprecated in 1.0 and will be removed in 1.2.\n Use `pytest.warns` instead.\n\nParameters\n----------\nwarning_class : the warning class\n The class to test for, e.g. 
UserWarning.\n\nfunc : callable\n Callable object to trigger warnings.\n\n*args : the positional arguments to `func`.\n\n**kw : the keyword arguments to `func`\n\nReturns\n-------\nresult : the return value of `func`", + "description": "Test that a certain warning occurs.\n\n.. deprecated:: 1.0\n `assert_warns` is deprecated in 1.0 and will be removed in 1.2.\n Use `pytest.warns` instead.", + "docstring": "Test that a certain warning occurs.\n\n .. deprecated:: 1.0\n `assert_warns` is deprecated in 1.0 and will be removed in 1.2.\n Use `pytest.warns` instead.\n\n Parameters\n ----------\n warning_class : the warning class\n The class to test for, e.g. UserWarning.\n\n func : callable\n Callable object to trigger warnings.\n\n *args : the positional arguments to `func`.\n\n **kw : the keyword arguments to `func`\n\n Returns\n -------\n result : the return value of `func`\n\n ", "source_code": "\n@deprecated('`assert_warns` is deprecated in 1.0 and will be removed in 1.2.Use `pytest.warns` instead.')\ndef assert_warns(warning_class, func, *args, **kw):\n \"\"\"Test that a certain warning occurs.\n\n .. deprecated:: 1.0\n `assert_warns` is deprecated in 1.0 and will be removed in 1.2.\n Use `pytest.warns` instead.\n\n Parameters\n ----------\n warning_class : the warning class\n The class to test for, e.g. UserWarning.\n\n func : callable\n Callable object to trigger warnings.\n\n *args : the positional arguments to `func`.\n\n **kw : the keyword arguments to `func`\n\n Returns\n -------\n result : the return value of `func`\n\n \"\"\"\n with warnings.catch_warnings(record=True) as w:\n warnings.simplefilter('always')\n result = func(*args, **kw)\n if hasattr(np, 'FutureWarning'):\n w = [e for e in w if e.category is not np.VisibleDeprecationWarning]\n if not len(w) > 0:\n raise AssertionError('No warning raised when calling %s' % func.__name__)\n found = any((warning.category is warning_class for warning in w))\n if not found:\n raise AssertionError('%s did not give warning: %s( is %s)' % (func.__name__, warning_class, w))\n return result" }, { @@ -168576,7 +181966,8 @@ "docstring": { "type": "the warning class", "description": "The class to test for, e.g. UserWarning." - } + }, + "refined_type": {} }, { "name": "message", @@ -168586,7 +181977,8 @@ "docstring": { "type": "str or callable", "description": "The message or a substring of the message to test for. If callable,\nit takes a string as the argument and will trigger an AssertionError\nif the callable returns `False`." - } + }, + "refined_type": {} }, { "name": "func", @@ -168596,13 +181988,14 @@ "docstring": { "type": "callable", "description": "Callable object to trigger warnings." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Test that a certain warning occurs and with a certain message.\n\n.. deprecated:: 1.0 `assert_warns_message` is deprecated in 1.0 and will be removed in 1.2. Use `pytest.warns` instead.", - "docstring": "Test that a certain warning occurs and with a certain message.\n\n.. deprecated:: 1.0\n `assert_warns_message` is deprecated in 1.0 and will be removed in 1.2.\n Use `pytest.warns` instead.\n\nParameters\n----------\nwarning_class : the warning class\n The class to test for, e.g. UserWarning.\n\nmessage : str or callable\n The message or a substring of the message to test for. 
If callable,\n it takes a string as the argument and will trigger an AssertionError\n if the callable returns `False`.\n\nfunc : callable\n Callable object to trigger warnings.\n\n*args : the positional arguments to `func`.\n\n**kw : the keyword arguments to `func`.\n\nReturns\n-------\nresult : the return value of `func`", + "description": "Test that a certain warning occurs and with a certain message.\n\n.. deprecated:: 1.0\n `assert_warns_message` is deprecated in 1.0 and will be removed in 1.2.\n Use `pytest.warns` instead.", + "docstring": "Test that a certain warning occurs and with a certain message.\n\n .. deprecated:: 1.0\n `assert_warns_message` is deprecated in 1.0 and will be removed in 1.2.\n Use `pytest.warns` instead.\n\n Parameters\n ----------\n warning_class : the warning class\n The class to test for, e.g. UserWarning.\n\n message : str or callable\n The message or a substring of the message to test for. If callable,\n it takes a string as the argument and will trigger an AssertionError\n if the callable returns `False`.\n\n func : callable\n Callable object to trigger warnings.\n\n *args : the positional arguments to `func`.\n\n **kw : the keyword arguments to `func`.\n\n Returns\n -------\n result : the return value of `func`\n\n ", "source_code": "\n@deprecated('`assert_warns_message` is deprecated in 1.0 and will be removed in 1.2.Use `pytest.warns` instead.')\ndef assert_warns_message(warning_class, message, func, *args, **kw):\n \"\"\"Test that a certain warning occurs and with a certain message.\n\n .. deprecated:: 1.0\n `assert_warns_message` is deprecated in 1.0 and will be removed in 1.2.\n Use `pytest.warns` instead.\n\n Parameters\n ----------\n warning_class : the warning class\n The class to test for, e.g. UserWarning.\n\n message : str or callable\n The message or a substring of the message to test for. If callable,\n it takes a string as the argument and will trigger an AssertionError\n if the callable returns `False`.\n\n func : callable\n Callable object to trigger warnings.\n\n *args : the positional arguments to `func`.\n\n **kw : the keyword arguments to `func`.\n\n Returns\n -------\n result : the return value of `func`\n\n \"\"\"\n with warnings.catch_warnings(record=True) as w:\n warnings.simplefilter('always')\n if hasattr(np, 'FutureWarning'):\n warnings.simplefilter('ignore', np.VisibleDeprecationWarning)\n result = func(*args, **kw)\n if not len(w) > 0:\n raise AssertionError('No warning raised when calling %s' % func.__name__)\n found = [issubclass(warning.category, warning_class) for warning in w]\n if not any(found):\n raise AssertionError('No warning raised for %s with class %s' % (func.__name__, warning_class))\n message_found = False\n for index in [i for (i, x) in enumerate(found) if x]:\n msg = w[index].message\n msg = str(msg.args[0] if hasattr(msg, 'args') else msg)\n if callable(message):\n check_in_message = message\n else:\n \n def check_in_message(msg):\n return message in msg\n if check_in_message(msg):\n message_found = True\n break\n if not message_found:\n raise AssertionError(\"Did not receive the message you expected ('%s') for <%s>, got: '%s'\" % (message, func.__name__, msg))\n return result" }, { @@ -168620,7 +182013,8 @@ "docstring": { "type": "callable", "description": "The function object to test." - } + }, + "refined_type": {} }, { "name": "doc", @@ -168630,7 +182024,8 @@ "docstring": { "type": "str, default=None", "description": "Docstring if it is passed manually to the test." 
- } + }, + "refined_type": {} }, { "name": "ignore", @@ -168640,13 +182035,14 @@ "docstring": { "type": "list, default=None", "description": "Parameters to ignore." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Helper to check docstring.", - "docstring": "Helper to check docstring.\n\nParameters\n----------\nfunc : callable\n The function object to test.\ndoc : str, default=None\n Docstring if it is passed manually to the test.\nignore : list, default=None\n Parameters to ignore.\n\nReturns\n-------\nincorrect : list\n A list of string describing the incorrect results.", + "docstring": "Helper to check docstring.\n\n Parameters\n ----------\n func : callable\n The function object to test.\n doc : str, default=None\n Docstring if it is passed manually to the test.\n ignore : list, default=None\n Parameters to ignore.\n\n Returns\n -------\n incorrect : list\n A list of string describing the incorrect results.\n ", "source_code": "\ndef check_docstring_parameters(func, doc=None, ignore=None):\n \"\"\"Helper to check docstring.\n\n Parameters\n ----------\n func : callable\n The function object to test.\n doc : str, default=None\n Docstring if it is passed manually to the test.\n ignore : list, default=None\n Parameters to ignore.\n\n Returns\n -------\n incorrect : list\n A list of string describing the incorrect results.\n \"\"\"\n from numpydoc import docscrape\n incorrect = []\n ignore = [] if ignore is None else ignore\n func_name = _get_func_name(func)\n if not func_name.startswith('sklearn.') or func_name.startswith('sklearn.externals'):\n return incorrect\n if inspect.isdatadescriptor(func):\n return incorrect\n if func_name.split('.')[-1] in ('setup_module', 'teardown_module'):\n return incorrect\n if func_name.split('.')[2] == 'estimator_checks':\n return incorrect\n param_signature = list(filter(lambda x: x not in ignore, _get_args(func)))\n if len(param_signature) > 0 and param_signature[0] == 'self':\n param_signature.remove('self')\n if doc is None:\n with warnings.catch_warnings(record=True) as w:\n try:\n doc = docscrape.FunctionDoc(func)\n except Exception as exp:\n incorrect += [func_name + ' parsing error: ' + str(exp)]\n return incorrect\n if len(w):\n raise RuntimeError('Error for %s:\\n%s' % (func_name, w[0]))\n param_docs = []\n for (name, type_definition, param_doc) in doc['Parameters']:\n if not type_definition.strip():\n if ':' in name and name[:name.index(':')][-1:].strip():\n incorrect += [func_name + ' There was no space between the param name and colon (%r)' % name]\n elif name.rstrip().endswith(':'):\n incorrect += [func_name + ' Parameter %r has an empty type spec. Remove the colon' % name.lstrip()]\n if '*' not in name:\n param_docs.append(name.split(':')[0].strip('` '))\n if len(incorrect) > 0:\n return incorrect\n param_docs = list(filter(lambda x: x not in ignore, param_docs))\n message = []\n for i in range(min(len(param_docs), len(param_signature))):\n if param_signature[i] != param_docs[i]:\n message += [\"There's a parameter name mismatch in function docstring w.r.t. function signature, at index %s diff: %r != %r\" % (i, param_signature[i], param_docs[i])]\n break\n if len(param_signature) > len(param_docs):\n message += ['Parameters in function docstring have less items w.r.t. function signature, first missing item: %s' % param_signature[len(param_docs)]]\n elif len(param_signature) < len(param_docs):\n message += ['Parameters in function docstring have more items w.r.t. 
function signature, first extra item: %s' % param_docs[len(param_signature)]]\n if len(message) == 0:\n return []\n import difflib\n import pprint\n param_docs_formatted = pprint.pformat(param_docs).splitlines()\n param_signature_formatted = pprint.pformat(param_signature).splitlines()\n message += ['Full diff:']\n message.extend((line.strip() for line in difflib.ndiff(param_signature_formatted, param_docs_formatted)))\n incorrect.extend(message)\n incorrect = ['In function: ' + func_name] + incorrect\n return incorrect" }, { @@ -168659,7 +182055,7 @@ "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef check_skip_network():\n if int(os.environ.get('SKLEARN_SKIP_NETWORK_TESTS', 0)):\n raise SkipTest('Text tutorial requires large dataset download')" }, { @@ -168677,7 +182073,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "mmap_mode", @@ -168687,7 +182084,8 @@ "docstring": { "type": "str, default='r'", "description": "" - } + }, + "refined_type": {} }, { "name": "return_folder", @@ -168695,16 +182093,28 @@ "is_public": false, "assigned_by": "POSITION_OR_NAME", "docstring": { - "type": " bool, default=False", + "type": "bool, default=False", "description": "" - } + }, + "refined_type": {} + }, + { + "name": "aligned", + "default_value": "False", + "is_public": false, + "assigned_by": "POSITION_OR_NAME", + "docstring": { + "type": "bool, default=False", + "description": "If True, if input is a single numpy array and if the input array is aligned,\nthe memory mapped array will also be aligned. This is a workaround for\nhttps://github.com/joblib/joblib/issues/563." + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "Parameters\n----------\ndata\nmmap_mode : str, default='r'\nreturn_folder : bool, default=False", - "source_code": "\ndef create_memmap_backed_data(data, mmap_mode='r', return_folder=False):\n \"\"\"\n Parameters\n ----------\n data\n mmap_mode : str, default='r'\n return_folder : bool, default=False\n \"\"\"\n temp_folder = tempfile.mkdtemp(prefix='sklearn_testing_')\n atexit.register(functools.partial(_delete_folder, temp_folder, warn=True))\n filename = op.join(temp_folder, 'data.pkl')\n joblib.dump(data, filename)\n memmap_backed_data = joblib.load(filename, mmap_mode=mmap_mode)\n result = memmap_backed_data if not return_folder else (memmap_backed_data, temp_folder)\n return result" + "docstring": "\n Parameters\n ----------\n data\n mmap_mode : str, default='r'\n return_folder : bool, default=False\n aligned : bool, default=False\n If True, if input is a single numpy array and if the input array is aligned,\n the memory mapped array will also be aligned. This is a workaround for\n https://github.com/joblib/joblib/issues/563.\n ", + "source_code": "\ndef create_memmap_backed_data(data, mmap_mode='r', return_folder=False, aligned=False):\n \"\"\"\n Parameters\n ----------\n data\n mmap_mode : str, default='r'\n return_folder : bool, default=False\n aligned : bool, default=False\n If True, if input is a single numpy array and if the input array is aligned,\n the memory mapped array will also be aligned. 
This is a workaround for\n https://github.com/joblib/joblib/issues/563.\n \"\"\"\n temp_folder = tempfile.mkdtemp(prefix='sklearn_testing_')\n atexit.register(functools.partial(_delete_folder, temp_folder, warn=True))\n if aligned:\n if isinstance(data, np.ndarray) and data.flags.aligned:\n filename = op.join(temp_folder, 'data.dat')\n fp = np.memmap(filename, dtype=data.dtype, mode='w+', shape=data.shape)\n fp[:] = data[:]\n fp.flush()\n memmap_backed_data = np.memmap(filename, dtype=data.dtype, mode=mmap_mode, shape=data.shape)\n else:\n raise ValueError('If aligned=True, input must be a single numpy array.')\n else:\n filename = op.join(temp_folder, 'data.pkl')\n joblib.dump(data, filename)\n memmap_backed_data = joblib.load(filename, mmap_mode=mmap_mode)\n result = memmap_backed_data if not return_folder else (memmap_backed_data, temp_folder)\n return result" }, { "name": "ignore_warnings", @@ -168721,7 +182131,8 @@ "docstring": { "type": "callable, default=None", "description": "callable where you want to ignore the warnings." - } + }, + "refined_type": {} }, { "name": "category", @@ -168731,13 +182142,14 @@ "docstring": { "type": "warning class, default=Warning", "description": "The category to filter. If Warning, all categories will be muted." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Context manager and decorator to ignore warnings.\n\nNote: Using this (in both variants) will clear all warnings from all python modules loaded. In case you need to test cross-module-warning-logging, this is not your tool of choice.", - "docstring": "Context manager and decorator to ignore warnings.\n\nNote: Using this (in both variants) will clear all warnings\nfrom all python modules loaded. In case you need to test\ncross-module-warning-logging, this is not your tool of choice.\n\nParameters\n----------\nobj : callable, default=None\n callable where you want to ignore the warnings.\ncategory : warning class, default=Warning\n The category to filter. If Warning, all categories will be muted.\n\nExamples\n--------\n>>> import warnings\n>>> from sklearn.utils._testing import ignore_warnings\n>>> with ignore_warnings():\n... warnings.warn('buhuhuhu')\n\n>>> def nasty_warn():\n... warnings.warn('buhuhuhu')\n... print(42)\n\n>>> ignore_warnings(nasty_warn)()\n42", + "description": "Context manager and decorator to ignore warnings.\n\nNote: Using this (in both variants) will clear all warnings\nfrom all python modules loaded. In case you need to test\ncross-module-warning-logging, this is not your tool of choice.", + "docstring": "Context manager and decorator to ignore warnings.\n\n Note: Using this (in both variants) will clear all warnings\n from all python modules loaded. In case you need to test\n cross-module-warning-logging, this is not your tool of choice.\n\n Parameters\n ----------\n obj : callable, default=None\n callable where you want to ignore the warnings.\n category : warning class, default=Warning\n The category to filter. If Warning, all categories will be muted.\n\n Examples\n --------\n >>> import warnings\n >>> from sklearn.utils._testing import ignore_warnings\n >>> with ignore_warnings():\n ... warnings.warn('buhuhuhu')\n\n >>> def nasty_warn():\n ... warnings.warn('buhuhuhu')\n ... 
print(42)\n\n >>> ignore_warnings(nasty_warn)()\n 42\n ", "source_code": "\ndef ignore_warnings(obj=None, category=Warning):\n \"\"\"Context manager and decorator to ignore warnings.\n\n Note: Using this (in both variants) will clear all warnings\n from all python modules loaded. In case you need to test\n cross-module-warning-logging, this is not your tool of choice.\n\n Parameters\n ----------\n obj : callable, default=None\n callable where you want to ignore the warnings.\n category : warning class, default=Warning\n The category to filter. If Warning, all categories will be muted.\n\n Examples\n --------\n >>> import warnings\n >>> from sklearn.utils._testing import ignore_warnings\n >>> with ignore_warnings():\n ... warnings.warn('buhuhuhu')\n\n >>> def nasty_warn():\n ... warnings.warn('buhuhuhu')\n ... print(42)\n\n >>> ignore_warnings(nasty_warn)()\n 42\n \"\"\"\n if isinstance(obj, type) and issubclass(obj, Warning):\n warning_name = obj.__name__\n raise ValueError(\"'obj' should be a callable where you want to ignore warnings. You passed a warning class instead: 'obj={warning_name}'. If you want to pass a warning class to ignore_warnings, you should use 'category={warning_name}'\".format(warning_name=warning_name))\n elif callable(obj):\n return _IgnoreWarnings(category=category)(obj)\n else:\n return _IgnoreWarnings(category=category)" }, { @@ -168755,7 +182167,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "match", @@ -168765,7 +182178,8 @@ "docstring": { "type": "str or list of str, default=None", "description": "A regex that the exception message should match. If a list, one of\nthe entries must match. If None, match isn't enforced." - } + }, + "refined_type": {} }, { "name": "may_pass", @@ -168775,7 +182189,8 @@ "docstring": { "type": "bool, default=False", "description": "If True, the block is allowed to not raise an exception. Useful in\ncases where some estimators may support a feature but others must\nfail with an appropriate error message. By default, the context\nmanager will raise an exception if the block does not raise an\nexception." - } + }, + "refined_type": {} }, { "name": "err_msg", @@ -168785,13 +182200,14 @@ "docstring": { "type": "str, default=None", "description": "If the context manager fails (e.g. the block fails to raise the\nproper exception, or fails to match), then an AssertionError is\nraised with this message. By default, an AssertionError is raised\nwith a default error message (depends on the kind of failure). Use\nthis to indicate how users should fix their estimators to pass the\nchecks." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Context manager to ensure exceptions are raised within a code block.\n\nThis is similar to and inspired from pytest.raises, but supports a few other cases. This is only intended to be used in estimator_checks.py where we don't want to use pytest. In the rest of the code base, just use pytest.raises instead.", - "docstring": "Context manager to ensure exceptions are raised within a code block.\n\nThis is similar to and inspired from pytest.raises, but supports a few\nother cases.\n\nThis is only intended to be used in estimator_checks.py where we don't\nwant to use pytest. In the rest of the code base, just use pytest.raises\ninstead.\n\nParameters\n----------\nexcepted_exc_type : Exception or list of Exception\n The exception that should be raised by the block. 
If a list, the block\n should raise one of the exceptions.\nmatch : str or list of str, default=None\n A regex that the exception message should match. If a list, one of\n the entries must match. If None, match isn't enforced.\nmay_pass : bool, default=False\n If True, the block is allowed to not raise an exception. Useful in\n cases where some estimators may support a feature but others must\n fail with an appropriate error message. By default, the context\n manager will raise an exception if the block does not raise an\n exception.\nerr_msg : str, default=None\n If the context manager fails (e.g. the block fails to raise the\n proper exception, or fails to match), then an AssertionError is\n raised with this message. By default, an AssertionError is raised\n with a default error message (depends on the kind of failure). Use\n this to indicate how users should fix their estimators to pass the\n checks.\n\nAttributes\n----------\nraised_and_matched : bool\n True if an exception was raised and a match was found, False otherwise.", + "description": "Context manager to ensure exceptions are raised within a code block.\n\nThis is similar to and inspired from pytest.raises, but supports a few\nother cases.\n\nThis is only intended to be used in estimator_checks.py where we don't\nwant to use pytest. In the rest of the code base, just use pytest.raises\ninstead.", + "docstring": "Context manager to ensure exceptions are raised within a code block.\n\n This is similar to and inspired from pytest.raises, but supports a few\n other cases.\n\n This is only intended to be used in estimator_checks.py where we don't\n want to use pytest. In the rest of the code base, just use pytest.raises\n instead.\n\n Parameters\n ----------\n excepted_exc_type : Exception or list of Exception\n The exception that should be raised by the block. If a list, the block\n should raise one of the exceptions.\n match : str or list of str, default=None\n A regex that the exception message should match. If a list, one of\n the entries must match. If None, match isn't enforced.\n may_pass : bool, default=False\n If True, the block is allowed to not raise an exception. Useful in\n cases where some estimators may support a feature but others must\n fail with an appropriate error message. By default, the context\n manager will raise an exception if the block does not raise an\n exception.\n err_msg : str, default=None\n If the context manager fails (e.g. the block fails to raise the\n proper exception, or fails to match), then an AssertionError is\n raised with this message. By default, an AssertionError is raised\n with a default error message (depends on the kind of failure). Use\n this to indicate how users should fix their estimators to pass the\n checks.\n\n Attributes\n ----------\n raised_and_matched : bool\n True if an exception was raised and a match was found, False otherwise.\n ", "source_code": "\ndef raises(expected_exc_type, match=None, may_pass=False, err_msg=None):\n \"\"\"Context manager to ensure exceptions are raised within a code block.\n\n This is similar to and inspired from pytest.raises, but supports a few\n other cases.\n\n This is only intended to be used in estimator_checks.py where we don't\n want to use pytest. In the rest of the code base, just use pytest.raises\n instead.\n\n Parameters\n ----------\n excepted_exc_type : Exception or list of Exception\n The exception that should be raised by the block. 
If a list, the block\n should raise one of the exceptions.\n match : str or list of str, default=None\n A regex that the exception message should match. If a list, one of\n the entries must match. If None, match isn't enforced.\n may_pass : bool, default=False\n If True, the block is allowed to not raise an exception. Useful in\n cases where some estimators may support a feature but others must\n fail with an appropriate error message. By default, the context\n manager will raise an exception if the block does not raise an\n exception.\n err_msg : str, default=None\n If the context manager fails (e.g. the block fails to raise the\n proper exception, or fails to match), then an AssertionError is\n raised with this message. By default, an AssertionError is raised\n with a default error message (depends on the kind of failure). Use\n this to indicate how users should fix their estimators to pass the\n checks.\n\n Attributes\n ----------\n raised_and_matched : bool\n True if an exception was raised and a match was found, False otherwise.\n \"\"\"\n return _Raises(expected_exc_type, match, may_pass, err_msg)" }, { @@ -168809,7 +182225,8 @@ "docstring": { "type": "object", "description": "The estimator." - } + }, + "refined_type": {} }, { "name": "random_state", @@ -168819,13 +182236,14 @@ "docstring": { "type": "int, RandomState instance or None, default=0", "description": "Pseudo random number generator state.\nPass an int for reproducible results across multiple function calls.\nSee :term:`Glossary `." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Set random state of an estimator if it has the `random_state` param.", - "docstring": "Set random state of an estimator if it has the `random_state` param.\n\nParameters\n----------\nestimator : object\n The estimator.\nrandom_state : int, RandomState instance or None, default=0\n Pseudo random number generator state.\n Pass an int for reproducible results across multiple function calls.\n See :term:`Glossary `.", + "docstring": "Set random state of an estimator if it has the `random_state` param.\n\n Parameters\n ----------\n estimator : object\n The estimator.\n random_state : int, RandomState instance or None, default=0\n Pseudo random number generator state.\n Pass an int for reproducible results across multiple function calls.\n See :term:`Glossary `.\n ", "source_code": "\ndef set_random_state(estimator, random_state=0):\n \"\"\"Set random state of an estimator if it has the `random_state` param.\n\n Parameters\n ----------\n estimator : object\n The estimator.\n random_state : int, RandomState instance or None, default=0\n Pseudo random number generator state.\n Pass an int for reproducible results across multiple function calls.\n See :term:`Glossary `.\n \"\"\"\n if 'random_state' in estimator.get_params():\n estimator.set_params(random_state=random_state)" }, { @@ -168843,13 +182261,14 @@ "docstring": { "type": "array-like of shape (n_elements,)", "description": "The sequence to be converted." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Convert sequence to a 1-D NumPy array of object dtype.\n\nnumpy.array constructor has a similar use but it's output is ambiguous. It can be 1-D NumPy array of object dtype if the input is a ragged array, but if the input is a list of equal length arrays, then the output is a 2D numpy.array. 
_to_object_array solves this ambiguity by guarantying that the output is a 1-D NumPy array of objects for any input.", - "docstring": "Convert sequence to a 1-D NumPy array of object dtype.\n\nnumpy.array constructor has a similar use but it's output\nis ambiguous. It can be 1-D NumPy array of object dtype if\nthe input is a ragged array, but if the input is a list of\nequal length arrays, then the output is a 2D numpy.array.\n_to_object_array solves this ambiguity by guarantying that\nthe output is a 1-D NumPy array of objects for any input.\n\nParameters\n----------\nsequence : array-like of shape (n_elements,)\n The sequence to be converted.\n\nReturns\n-------\nout : ndarray of shape (n_elements,), dtype=object\n The converted sequence into a 1-D NumPy array of object dtype.\n\nExamples\n--------\n>>> import numpy as np\n>>> from sklearn.utils import _to_object_array\n>>> _to_object_array([np.array([0]), np.array([1])])\narray([array([0]), array([1])], dtype=object)\n>>> _to_object_array([np.array([0]), np.array([1, 2])])\narray([array([0]), array([1, 2])], dtype=object)\n>>> _to_object_array([np.array([0]), np.array([1, 2])])\narray([array([0]), array([1, 2])], dtype=object)", + "description": "Convert sequence to a 1-D NumPy array of object dtype.\n\nnumpy.array constructor has a similar use but it's output\nis ambiguous. It can be 1-D NumPy array of object dtype if\nthe input is a ragged array, but if the input is a list of\nequal length arrays, then the output is a 2D numpy.array.\n_to_object_array solves this ambiguity by guarantying that\nthe output is a 1-D NumPy array of objects for any input.", + "docstring": "Convert sequence to a 1-D NumPy array of object dtype.\n\n numpy.array constructor has a similar use but it's output\n is ambiguous. It can be 1-D NumPy array of object dtype if\n the input is a ragged array, but if the input is a list of\n equal length arrays, then the output is a 2D numpy.array.\n _to_object_array solves this ambiguity by guarantying that\n the output is a 1-D NumPy array of objects for any input.\n\n Parameters\n ----------\n sequence : array-like of shape (n_elements,)\n The sequence to be converted.\n\n Returns\n -------\n out : ndarray of shape (n_elements,), dtype=object\n The converted sequence into a 1-D NumPy array of object dtype.\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.utils import _to_object_array\n >>> _to_object_array([np.array([0]), np.array([1])])\n array([array([0]), array([1])], dtype=object)\n >>> _to_object_array([np.array([0]), np.array([1, 2])])\n array([array([0]), array([1, 2])], dtype=object)\n >>> _to_object_array([np.array([0]), np.array([1, 2])])\n array([array([0]), array([1, 2])], dtype=object)\n ", "source_code": "\ndef _to_object_array(sequence):\n \"\"\"Convert sequence to a 1-D NumPy array of object dtype.\n\n numpy.array constructor has a similar use but it's output\n is ambiguous. 
It can be 1-D NumPy array of object dtype if\n the input is a ragged array, but if the input is a list of\n equal length arrays, then the output is a 2D numpy.array.\n _to_object_array solves this ambiguity by guarantying that\n the output is a 1-D NumPy array of objects for any input.\n\n Parameters\n ----------\n sequence : array-like of shape (n_elements,)\n The sequence to be converted.\n\n Returns\n -------\n out : ndarray of shape (n_elements,), dtype=object\n The converted sequence into a 1-D NumPy array of object dtype.\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.utils import _to_object_array\n >>> _to_object_array([np.array([0]), np.array([1])])\n array([array([0]), array([1])], dtype=object)\n >>> _to_object_array([np.array([0]), np.array([1, 2])])\n array([array([0]), array([1, 2])], dtype=object)\n >>> _to_object_array([np.array([0]), np.array([1, 2])])\n array([array([0]), array([1, 2])], dtype=object)\n \"\"\"\n out = np.empty(len(sequence), dtype=object)\n out[:] = sequence\n return out" }, { @@ -168867,13 +182286,22 @@ "docstring": { "type": "{\"classifier\", \"regressor\", \"cluster\", \"transformer\"} or list of such str, default=None", "description": "Which kind of estimators should be returned. If None, no filter is\napplied and all estimators are returned. Possible values are\n'classifier', 'regressor', 'cluster' and 'transformer' to get\nestimators only of these specific types, or a list of these to\nget the estimators that fit at least one of the types." + }, + "refined_type": { + "kind": "EnumType", + "values": [ + "cluster", + "regressor", + "classifier", + "transformer" + ] } } ], "results": [], "is_public": true, - "description": "Get a list of all estimators from sklearn.\n\nThis function crawls the module and gets all classes that inherit from BaseEstimator. Classes that are defined in test-modules are not included.", - "docstring": "Get a list of all estimators from sklearn.\n\nThis function crawls the module and gets all classes that inherit\nfrom BaseEstimator. Classes that are defined in test-modules are not\nincluded.\n\nParameters\n----------\ntype_filter : {\"classifier\", \"regressor\", \"cluster\", \"transformer\"} or list of such str, default=None\n Which kind of estimators should be returned. If None, no filter is\n applied and all estimators are returned. Possible values are\n 'classifier', 'regressor', 'cluster' and 'transformer' to get\n estimators only of these specific types, or a list of these to\n get the estimators that fit at least one of the types.\n\nReturns\n-------\nestimators : list of tuples\n List of (name, class), where ``name`` is the class name as string\n and ``class`` is the actual type of the class.", + "description": "Get a list of all estimators from sklearn.\n\nThis function crawls the module and gets all classes that inherit\nfrom BaseEstimator. Classes that are defined in test-modules are not\nincluded.", + "docstring": "Get a list of all estimators from sklearn.\n\n This function crawls the module and gets all classes that inherit\n from BaseEstimator. Classes that are defined in test-modules are not\n included.\n\n Parameters\n ----------\n type_filter : {\"classifier\", \"regressor\", \"cluster\", \"transformer\"} or list of such str, default=None\n Which kind of estimators should be returned. If None, no filter is\n applied and all estimators are returned. 
Possible values are\n 'classifier', 'regressor', 'cluster' and 'transformer' to get\n estimators only of these specific types, or a list of these to\n get the estimators that fit at least one of the types.\n\n Returns\n -------\n estimators : list of tuples\n List of (name, class), where ``name`` is the class name as string\n and ``class`` is the actual type of the class.\n ", "source_code": "\ndef all_estimators(type_filter=None):\n \"\"\"Get a list of all estimators from sklearn.\n\n This function crawls the module and gets all classes that inherit\n from BaseEstimator. Classes that are defined in test-modules are not\n included.\n\n Parameters\n ----------\n type_filter : {\"classifier\", \"regressor\", \"cluster\", \"transformer\"} or list of such str, default=None\n Which kind of estimators should be returned. If None, no filter is\n applied and all estimators are returned. Possible values are\n 'classifier', 'regressor', 'cluster' and 'transformer' to get\n estimators only of these specific types, or a list of these to\n get the estimators that fit at least one of the types.\n\n Returns\n -------\n estimators : list of tuples\n List of (name, class), where ``name`` is the class name as string\n and ``class`` is the actual type of the class.\n \"\"\"\n from ._testing import ignore_warnings\n from ..base import BaseEstimator, ClassifierMixin, RegressorMixin, TransformerMixin, ClusterMixin\n \n def is_abstract(c):\n if not hasattr(c, '__abstractmethods__'):\n return False\n if not len(c.__abstractmethods__):\n return False\n return True\n all_classes = []\n modules_to_ignore = {'tests', 'externals', 'setup', 'conftest', 'enable_hist_gradient_boosting'}\n root = str(Path(__file__).parent.parent)\n with ignore_warnings(category=FutureWarning):\n for (importer, modname, ispkg) in pkgutil.walk_packages(path=[root], prefix='sklearn.'):\n mod_parts = modname.split('.')\n if any((part in modules_to_ignore for part in mod_parts)) or '._' in modname:\n continue\n module = import_module(modname)\n classes = inspect.getmembers(module, inspect.isclass)\n classes = [(name, est_cls) for (name, est_cls) in classes if not name.startswith('_')]\n if IS_PYPY and 'feature_extraction' in modname:\n classes = [(name, est_cls) for (name, est_cls) in classes if name == 'FeatureHasher']\n all_classes.extend(classes)\n all_classes = set(all_classes)\n estimators = [c for c in all_classes if issubclass(c[1], BaseEstimator) and c[0] != 'BaseEstimator']\n estimators = [c for c in estimators if not is_abstract(c[1])]\n if type_filter is not None:\n if not isinstance(type_filter, list):\n type_filter = [type_filter]\n else:\n type_filter = list(type_filter)\n filtered_estimators = []\n filters = {'classifier': ClassifierMixin, 'regressor': RegressorMixin, 'transformer': TransformerMixin, 'cluster': ClusterMixin}\n for (name, mixin) in filters.items():\n if name in type_filter:\n type_filter.remove(name)\n filtered_estimators.extend([est for est in estimators if issubclass(est[1], mixin)])\n estimators = filtered_estimators\n if type_filter:\n raise ValueError(\"Parameter type_filter must be 'classifier', 'regressor', 'transformer', 'cluster' or None, got %s.\" % repr(type_filter))\n return sorted(set(estimators), key=itemgetter(0))" }, { @@ -168891,6 +182319,10 @@ "docstring": { "type": "{array-like, sparse matrix}", "description": "Data on which to apply mask." 
+ }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -168901,7 +182333,8 @@ "docstring": { "type": "ndarray", "description": "Mask to be used on X." - } + }, + "refined_type": {} }, { "name": "len_mask", @@ -168911,13 +182344,14 @@ "docstring": { "type": "int", "description": "The length of the mask." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "This mask is safer than safe_mask since it returns an empty array, when a sparse matrix is sliced with a boolean mask with all False, instead of raising an unhelpful error in older versions of SciPy.\n\nSee: https://github.com/scipy/scipy/issues/5361 Also note that we can avoid doing the dot product by checking if the len_mask is not zero in _huber_loss_and_gradient but this is not going to be the bottleneck, since the number of outliers and non_outliers are typically non-zero and it makes the code tougher to follow.", - "docstring": "This mask is safer than safe_mask since it returns an\nempty array, when a sparse matrix is sliced with a boolean mask\nwith all False, instead of raising an unhelpful error in older\nversions of SciPy.\n\nSee: https://github.com/scipy/scipy/issues/5361\n\nAlso note that we can avoid doing the dot product by checking if\nthe len_mask is not zero in _huber_loss_and_gradient but this\nis not going to be the bottleneck, since the number of outliers\nand non_outliers are typically non-zero and it makes the code\ntougher to follow.\n\nParameters\n----------\nX : {array-like, sparse matrix}\n Data on which to apply mask.\n\nmask : ndarray\n Mask to be used on X.\n\nlen_mask : int\n The length of the mask.\n\nReturns\n-------\n mask", + "description": "This mask is safer than safe_mask since it returns an\nempty array, when a sparse matrix is sliced with a boolean mask\nwith all False, instead of raising an unhelpful error in older\nversions of SciPy.\n\nSee: https://github.com/scipy/scipy/issues/5361\n\nAlso note that we can avoid doing the dot product by checking if\nthe len_mask is not zero in _huber_loss_and_gradient but this\nis not going to be the bottleneck, since the number of outliers\nand non_outliers are typically non-zero and it makes the code\ntougher to follow.", + "docstring": "\n This mask is safer than safe_mask since it returns an\n empty array, when a sparse matrix is sliced with a boolean mask\n with all False, instead of raising an unhelpful error in older\n versions of SciPy.\n\n See: https://github.com/scipy/scipy/issues/5361\n\n Also note that we can avoid doing the dot product by checking if\n the len_mask is not zero in _huber_loss_and_gradient but this\n is not going to be the bottleneck, since the number of outliers\n and non_outliers are typically non-zero and it makes the code\n tougher to follow.\n\n Parameters\n ----------\n X : {array-like, sparse matrix}\n Data on which to apply mask.\n\n mask : ndarray\n Mask to be used on X.\n\n len_mask : int\n The length of the mask.\n\n Returns\n -------\n mask\n ", "source_code": "\ndef axis0_safe_slice(X, mask, len_mask):\n \"\"\"\n This mask is safer than safe_mask since it returns an\n empty array, when a sparse matrix is sliced with a boolean mask\n with all False, instead of raising an unhelpful error in older\n versions of SciPy.\n\n See: https://github.com/scipy/scipy/issues/5361\n\n Also note that we can avoid doing the dot product by checking if\n the len_mask is not zero in 
_huber_loss_and_gradient but this\n is not going to be the bottleneck, since the number of outliers\n and non_outliers are typically non-zero and it makes the code\n tougher to follow.\n\n Parameters\n ----------\n X : {array-like, sparse matrix}\n Data on which to apply mask.\n\n mask : ndarray\n Mask to be used on X.\n\n len_mask : int\n The length of the mask.\n\n Returns\n -------\n mask\n \"\"\"\n if len_mask != 0:\n return X[safe_mask(X, mask), :]\n return np.zeros(shape=(0, X.shape[1]))" }, { @@ -168935,13 +182369,14 @@ "docstring": { "type": "str", "description": "The name of the caller that requires matplotlib." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Raise ImportError with detailed error message if mpl is not installed.\n\nPlot utilities like any of the Display's plotting functions should lazily import matplotlib and call this helper before any computation.", - "docstring": "Raise ImportError with detailed error message if mpl is not installed.\n\nPlot utilities like any of the Display's plotting functions should lazily import\nmatplotlib and call this helper before any computation.\n\nParameters\n----------\ncaller_name : str\n The name of the caller that requires matplotlib.", + "description": "Raise ImportError with detailed error message if mpl is not installed.\n\nPlot utilities like any of the Display's plotting functions should lazily import\nmatplotlib and call this helper before any computation.", + "docstring": "Raise ImportError with detailed error message if mpl is not installed.\n\n Plot utilities like any of the Display's plotting functions should lazily import\n matplotlib and call this helper before any computation.\n\n Parameters\n ----------\n caller_name : str\n The name of the caller that requires matplotlib.\n ", "source_code": "\ndef check_matplotlib_support(caller_name):\n \"\"\"Raise ImportError with detailed error message if mpl is not installed.\n\n Plot utilities like any of the Display's plotting functions should lazily import\n matplotlib and call this helper before any computation.\n\n Parameters\n ----------\n caller_name : str\n The name of the caller that requires matplotlib.\n \"\"\"\n try:\n import matplotlib\n except ImportError as e:\n raise ImportError('{} requires matplotlib. You can install matplotlib with `pip install matplotlib`'.format(caller_name)) from e" }, { @@ -168959,14 +182394,15 @@ "docstring": { "type": "str", "description": "The name of the caller that requires pandas." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Raise ImportError with detailed error message if pandas is not installed.\n\nPlot utilities like :func:`fetch_openml` should lazily import pandas and call this helper before any computation.", - "docstring": "Raise ImportError with detailed error message if pandas is not\ninstalled.\n\nPlot utilities like :func:`fetch_openml` should lazily import\npandas and call this helper before any computation.\n\nParameters\n----------\ncaller_name : str\n The name of the caller that requires pandas.", - "source_code": "\ndef check_pandas_support(caller_name):\n \"\"\"Raise ImportError with detailed error message if pandas is not\n installed.\n\n Plot utilities like :func:`fetch_openml` should lazily import\n pandas and call this helper before any computation.\n\n Parameters\n ----------\n caller_name : str\n The name of the caller that requires pandas.\n \"\"\"\n try:\n import pandas\n return pandas\n except ImportError as e:\n raise ImportError('{} requires pandas.'.format(caller_name)) from e" + "description": "Raise ImportError with detailed error message if pandas is not installed.\n\nPlot utilities like :func:`fetch_openml` should lazily import\npandas and call this helper before any computation.", + "docstring": "Raise ImportError with detailed error message if pandas is not installed.\n\n Plot utilities like :func:`fetch_openml` should lazily import\n pandas and call this helper before any computation.\n\n Parameters\n ----------\n caller_name : str\n The name of the caller that requires pandas.\n\n Returns\n -------\n pandas\n The pandas package.\n ", + "source_code": "\ndef check_pandas_support(caller_name):\n \"\"\"Raise ImportError with detailed error message if pandas is not installed.\n\n Plot utilities like :func:`fetch_openml` should lazily import\n pandas and call this helper before any computation.\n\n Parameters\n ----------\n caller_name : str\n The name of the caller that requires pandas.\n\n Returns\n -------\n pandas\n The pandas package.\n \"\"\"\n try:\n import pandas\n return pandas\n except ImportError as e:\n raise ImportError('{} requires pandas.'.format(caller_name)) from e" }, { "name": "compute_class_weight", @@ -168983,7 +182419,8 @@ "docstring": { "type": "dict, 'balanced' or None", "description": "If 'balanced', class weights will be given by\n``n_samples / (n_classes * np.bincount(y))``.\nIf a dictionary is given, keys are classes and values\nare corresponding class weights.\nIf None is given, the class weights will be uniform." - } + }, + "refined_type": {} }, { "name": "classes", @@ -168993,7 +182430,8 @@ "docstring": { "type": "ndarray", "description": "Array of the classes occurring in the data, as given by\n``np.unique(y_org)`` with ``y_org`` the original class labels." - } + }, + "refined_type": {} }, { "name": "y", @@ -169003,13 +182441,14 @@ "docstring": { "type": "array-like of shape (n_samples,)", "description": "Array of original class labels per sample." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Estimate class weights for unbalanced datasets.", - "docstring": "Estimate class weights for unbalanced datasets.\n\nParameters\n----------\nclass_weight : dict, 'balanced' or None\n If 'balanced', class weights will be given by\n ``n_samples / (n_classes * np.bincount(y))``.\n If a dictionary is given, keys are classes and values\n are corresponding class weights.\n If None is given, the class weights will be uniform.\n\nclasses : ndarray\n Array of the classes occurring in the data, as given by\n ``np.unique(y_org)`` with ``y_org`` the original class labels.\n\ny : array-like of shape (n_samples,)\n Array of original class labels per sample.\n\nReturns\n-------\nclass_weight_vect : ndarray of shape (n_classes,)\n Array with class_weight_vect[i] the weight for i-th class.\n\nReferences\n----------\nThe \"balanced\" heuristic is inspired by\nLogistic Regression in Rare Events Data, King, Zen, 2001.", + "docstring": "Estimate class weights for unbalanced datasets.\n\n Parameters\n ----------\n class_weight : dict, 'balanced' or None\n If 'balanced', class weights will be given by\n ``n_samples / (n_classes * np.bincount(y))``.\n If a dictionary is given, keys are classes and values\n are corresponding class weights.\n If None is given, the class weights will be uniform.\n\n classes : ndarray\n Array of the classes occurring in the data, as given by\n ``np.unique(y_org)`` with ``y_org`` the original class labels.\n\n y : array-like of shape (n_samples,)\n Array of original class labels per sample.\n\n Returns\n -------\n class_weight_vect : ndarray of shape (n_classes,)\n Array with class_weight_vect[i] the weight for i-th class.\n\n References\n ----------\n The \"balanced\" heuristic is inspired by\n Logistic Regression in Rare Events Data, King, Zen, 2001.\n ", "source_code": "\ndef compute_class_weight(class_weight, *, classes, y):\n \"\"\"Estimate class weights for unbalanced datasets.\n\n Parameters\n ----------\n class_weight : dict, 'balanced' or None\n If 'balanced', class weights will be given by\n ``n_samples / (n_classes * np.bincount(y))``.\n If a dictionary is given, keys are classes and values\n are corresponding class weights.\n If None is given, the class weights will be uniform.\n\n classes : ndarray\n Array of the classes occurring in the data, as given by\n ``np.unique(y_org)`` with ``y_org`` the original class labels.\n\n y : array-like of shape (n_samples,)\n Array of original class labels per sample.\n\n Returns\n -------\n class_weight_vect : ndarray of shape (n_classes,)\n Array with class_weight_vect[i] the weight for i-th class.\n\n References\n ----------\n The \"balanced\" heuristic is inspired by\n Logistic Regression in Rare Events Data, King, Zen, 2001.\n \"\"\"\n from ..preprocessing import LabelEncoder\n if set(y) - set(classes):\n raise ValueError('classes should include all valid labels that can be in y')\n if class_weight is None or len(class_weight) == 0:\n weight = np.ones(classes.shape[0], dtype=np.float64, order='C')\n elif class_weight == 'balanced':\n le = LabelEncoder()\n y_ind = le.fit_transform(y)\n if not all(np.in1d(classes, le.classes_)):\n raise ValueError('classes should have valid labels that are in y')\n recip_freq = len(y) / (len(le.classes_) * np.bincount(y_ind).astype(np.float64))\n weight = recip_freq[le.transform(classes)]\n else:\n weight = np.ones(classes.shape[0], dtype=np.float64, order='C')\n if not isinstance(class_weight, dict):\n raise 
ValueError(\"class_weight must be dict, 'balanced', or None, got: %r\" % class_weight)\n for c in class_weight:\n i = np.searchsorted(classes, c)\n if i >= len(classes) or classes[i] != c:\n raise ValueError('Class label {} not present.'.format(c))\n else:\n weight[i] = class_weight[c]\n return weight" }, { @@ -169027,6 +182466,10 @@ "docstring": { "type": "dict, list of dicts, \"balanced\", or None", "description": "Weights associated with classes in the form ``{class_label: weight}``.\nIf not given, all classes are supposed to have weight one. For\nmulti-output problems, a list of dicts can be provided in the same\norder as the columns of y.\n\nNote that for multioutput (including multilabel) weights should be\ndefined for each class of every column in its own dict. For example,\nfor four-class multilabel classification weights should be\n[{0: 1, 1: 1}, {0: 1, 1: 5}, {0: 1, 1: 1}, {0: 1, 1: 1}] instead of\n[{1:1}, {2:5}, {3:1}, {4:1}].\n\nThe \"balanced\" mode uses the values of y to automatically adjust\nweights inversely proportional to class frequencies in the input data:\n``n_samples / (n_classes * np.bincount(y))``.\n\nFor multi-output, the weights of each column of y will be multiplied." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -169037,7 +182480,8 @@ "docstring": { "type": "array-like of shape (n_samples,) or (n_samples, n_outputs)", "description": "Array of original class labels per sample." - } + }, + "refined_type": {} }, { "name": "indices", @@ -169047,13 +182491,14 @@ "docstring": { "type": "array-like of shape (n_subsample,), default=None", "description": "Array of indices to be used in a subsample. Can be of length less than\nn_samples in the case of a subsample, or equal to n_samples in the\ncase of a bootstrap subsample with repeated indices. If None, the\nsample weight will be calculated over the full sample. Only \"balanced\"\nis supported for class_weight if this is provided." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Estimate sample weights by class for unbalanced datasets.", - "docstring": "Estimate sample weights by class for unbalanced datasets.\n\nParameters\n----------\nclass_weight : dict, list of dicts, \"balanced\", or None\n Weights associated with classes in the form ``{class_label: weight}``.\n If not given, all classes are supposed to have weight one. For\n multi-output problems, a list of dicts can be provided in the same\n order as the columns of y.\n\n Note that for multioutput (including multilabel) weights should be\n defined for each class of every column in its own dict. For example,\n for four-class multilabel classification weights should be\n [{0: 1, 1: 1}, {0: 1, 1: 5}, {0: 1, 1: 1}, {0: 1, 1: 1}] instead of\n [{1:1}, {2:5}, {3:1}, {4:1}].\n\n The \"balanced\" mode uses the values of y to automatically adjust\n weights inversely proportional to class frequencies in the input data:\n ``n_samples / (n_classes * np.bincount(y))``.\n\n For multi-output, the weights of each column of y will be multiplied.\n\ny : array-like of shape (n_samples,) or (n_samples, n_outputs)\n Array of original class labels per sample.\n\nindices : array-like of shape (n_subsample,), default=None\n Array of indices to be used in a subsample. Can be of length less than\n n_samples in the case of a subsample, or equal to n_samples in the\n case of a bootstrap subsample with repeated indices. If None, the\n sample weight will be calculated over the full sample. 
Only \"balanced\"\n is supported for class_weight if this is provided.\n\nReturns\n-------\nsample_weight_vect : ndarray of shape (n_samples,)\n Array with sample weights as applied to the original y.", + "docstring": "Estimate sample weights by class for unbalanced datasets.\n\n Parameters\n ----------\n class_weight : dict, list of dicts, \"balanced\", or None\n Weights associated with classes in the form ``{class_label: weight}``.\n If not given, all classes are supposed to have weight one. For\n multi-output problems, a list of dicts can be provided in the same\n order as the columns of y.\n\n Note that for multioutput (including multilabel) weights should be\n defined for each class of every column in its own dict. For example,\n for four-class multilabel classification weights should be\n [{0: 1, 1: 1}, {0: 1, 1: 5}, {0: 1, 1: 1}, {0: 1, 1: 1}] instead of\n [{1:1}, {2:5}, {3:1}, {4:1}].\n\n The \"balanced\" mode uses the values of y to automatically adjust\n weights inversely proportional to class frequencies in the input data:\n ``n_samples / (n_classes * np.bincount(y))``.\n\n For multi-output, the weights of each column of y will be multiplied.\n\n y : array-like of shape (n_samples,) or (n_samples, n_outputs)\n Array of original class labels per sample.\n\n indices : array-like of shape (n_subsample,), default=None\n Array of indices to be used in a subsample. Can be of length less than\n n_samples in the case of a subsample, or equal to n_samples in the\n case of a bootstrap subsample with repeated indices. If None, the\n sample weight will be calculated over the full sample. Only \"balanced\"\n is supported for class_weight if this is provided.\n\n Returns\n -------\n sample_weight_vect : ndarray of shape (n_samples,)\n Array with sample weights as applied to the original y.\n ", "source_code": "\ndef compute_sample_weight(class_weight, y, *, indices=None):\n \"\"\"Estimate sample weights by class for unbalanced datasets.\n\n Parameters\n ----------\n class_weight : dict, list of dicts, \"balanced\", or None\n Weights associated with classes in the form ``{class_label: weight}``.\n If not given, all classes are supposed to have weight one. For\n multi-output problems, a list of dicts can be provided in the same\n order as the columns of y.\n\n Note that for multioutput (including multilabel) weights should be\n defined for each class of every column in its own dict. For example,\n for four-class multilabel classification weights should be\n [{0: 1, 1: 1}, {0: 1, 1: 5}, {0: 1, 1: 1}, {0: 1, 1: 1}] instead of\n [{1:1}, {2:5}, {3:1}, {4:1}].\n\n The \"balanced\" mode uses the values of y to automatically adjust\n weights inversely proportional to class frequencies in the input data:\n ``n_samples / (n_classes * np.bincount(y))``.\n\n For multi-output, the weights of each column of y will be multiplied.\n\n y : array-like of shape (n_samples,) or (n_samples, n_outputs)\n Array of original class labels per sample.\n\n indices : array-like of shape (n_subsample,), default=None\n Array of indices to be used in a subsample. Can be of length less than\n n_samples in the case of a subsample, or equal to n_samples in the\n case of a bootstrap subsample with repeated indices. If None, the\n sample weight will be calculated over the full sample. 
Only \"balanced\"\n is supported for class_weight if this is provided.\n\n Returns\n -------\n sample_weight_vect : ndarray of shape (n_samples,)\n Array with sample weights as applied to the original y.\n \"\"\"\n y = np.atleast_1d(y)\n if y.ndim == 1:\n y = np.reshape(y, (-1, 1))\n n_outputs = y.shape[1]\n if isinstance(class_weight, str):\n if class_weight not in ['balanced']:\n raise ValueError('The only valid preset for class_weight is \"balanced\". Given \"%s\".' % class_weight)\n elif indices is not None and not isinstance(class_weight, str):\n raise ValueError('The only valid class_weight for subsampling is \"balanced\". Given \"%s\".' % class_weight)\n elif n_outputs > 1:\n if not hasattr(class_weight, '__iter__') or isinstance(class_weight, dict):\n raise ValueError('For multi-output, class_weight should be a list of dicts, or a valid string.')\n if len(class_weight) != n_outputs:\n raise ValueError('For multi-output, number of elements in class_weight should match number of outputs.')\n expanded_class_weight = []\n for k in range(n_outputs):\n y_full = y[:, k]\n classes_full = np.unique(y_full)\n classes_missing = None\n if class_weight == 'balanced' or n_outputs == 1:\n class_weight_k = class_weight\n else:\n class_weight_k = class_weight[k]\n if indices is not None:\n y_subsample = y[indices, k]\n classes_subsample = np.unique(y_subsample)\n weight_k = np.take(compute_class_weight(class_weight_k, classes=classes_subsample, y=y_subsample), np.searchsorted(classes_subsample, classes_full), mode='clip')\n classes_missing = set(classes_full) - set(classes_subsample)\n else:\n weight_k = compute_class_weight(class_weight_k, classes=classes_full, y=y_full)\n weight_k = weight_k[np.searchsorted(classes_full, y_full)]\n if classes_missing:\n weight_k[np.in1d(y_full, list(classes_missing))] = 0.0\n expanded_class_weight.append(weight_k)\n expanded_class_weight = np.prod(expanded_class_weight, axis=0, dtype=np.float64)\n return expanded_class_weight" }, { @@ -169071,7 +182516,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -169095,7 +182541,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "obj", @@ -169105,13 +182552,14 @@ "docstring": { "type": "object", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Call method", - "docstring": "Call method\n\nParameters\n----------\nobj : object", + "docstring": "Call method\n\n Parameters\n ----------\n obj : object\n ", "source_code": "\ndef __call__(self, obj):\n \"\"\"Call method\n\n Parameters\n ----------\n obj : object\n \"\"\"\n if isinstance(obj, type):\n return self._decorate_class(obj)\n elif isinstance(obj, property):\n return self._decorate_property(obj)\n else:\n return self._decorate_fun(obj)" }, { @@ -169129,7 +182577,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "extra", @@ -169139,13 +182588,14 @@ "docstring": { "type": "str, default=''", "description": "To be added to the deprecation messages." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, extra=''):\n self.extra = extra" }, { @@ -169163,7 +182613,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "cls", @@ -169173,13 +182624,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _decorate_class(self, cls):\n msg = 'Class %s is deprecated' % cls.__name__\n if self.extra:\n msg += '; %s' % self.extra\n init = cls.__init__\n \n def wrapped(*args, **kwargs):\n warnings.warn(msg, category=FutureWarning)\n return init(*args, **kwargs)\n cls.__init__ = wrapped\n wrapped.__name__ = '__init__'\n wrapped.__doc__ = self._update_doc(init.__doc__)\n wrapped.deprecated_original = init\n return cls" }, { @@ -169197,7 +182649,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "fun", @@ -169207,7 +182660,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -169231,7 +182685,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "prop", @@ -169241,13 +182696,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _decorate_property(self, prop):\n msg = self.extra\n \n @property\n @functools.wraps(prop)\n def wrapped(*args, **kwargs):\n warnings.warn(msg, category=FutureWarning)\n return prop.fget(*args, **kwargs)\n wrapped.__doc__ = self._update_doc(wrapped.__doc__)\n return wrapped" }, { @@ -169265,7 +182721,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "olddoc", @@ -169275,13 +182732,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _update_doc(self, olddoc):\n newdoc = 'DEPRECATED'\n if self.extra:\n newdoc = '%s: %s' % (newdoc, self.extra)\n if olddoc:\n newdoc = '%s\\n\\n %s' % (newdoc, olddoc)\n return newdoc" }, { @@ -169299,7 +182757,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "dtype", @@ -169309,13 +182768,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __array__(self, dtype=None):\n return self.data" }, { @@ -169333,7 +182793,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "func", @@ -169343,7 +182804,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "types", @@ -169353,7 +182815,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "args", @@ -169363,7 +182826,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "kwargs", @@ -169373,13 +182837,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __array_function__(self, func, types, args, kwargs):\n if 
func.__name__ == 'may_share_memory':\n return True\n raise TypeError(\"Don't want to call array_function {}!\".format(func.__name__))" }, { @@ -169397,7 +182862,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "data", @@ -169407,13 +182873,14 @@ "docstring": { "type": "array-like", "description": "The data." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, data):\n self.data = np.asarray(data)" }, { @@ -169431,7 +182898,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -169441,13 +182909,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _apply_on_subsets(func, X):\n result_full = func(X)\n n_features = X.shape[1]\n result_by_batch = [func(batch.reshape(1, n_features)) for batch in X]\n if type(result_full) == tuple:\n result_full = result_full[0]\n result_by_batch = list(map(lambda x: x[0], result_by_batch))\n if sparse.issparse(result_full):\n result_full = result_full.A\n result_by_batch = [x.A for x in result_by_batch]\n return np.ravel(result_full), np.ravel(result_by_batch)" }, { @@ -169465,7 +182934,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "transformer_orig", @@ -169475,7 +182945,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -169485,7 +182956,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -169495,13 +182967,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _check_transformer(name, transformer_orig, X, y):\n (n_samples, n_features) = np.asarray(X).shape\n transformer = clone(transformer_orig)\n set_random_state(transformer)\n if name in CROSS_DECOMPOSITION:\n y_ = np.c_[np.asarray(y), np.asarray(y)]\n y_[::2, 1] *= 2\n if isinstance(X, _NotAnArray):\n y_ = _NotAnArray(y_)\n else:\n y_ = y\n transformer.fit(X, y_)\n transformer_clone = clone(transformer)\n X_pred = transformer_clone.fit_transform(X, y=y_)\n if isinstance(X_pred, tuple):\n for x_pred in X_pred:\n assert x_pred.shape[0] == n_samples\n else:\n assert X_pred.shape[0] == n_samples\n if hasattr(transformer, 'transform'):\n if name in CROSS_DECOMPOSITION:\n X_pred2 = transformer.transform(X, y_)\n X_pred3 = transformer.fit_transform(X, y=y_)\n else:\n X_pred2 = transformer.transform(X)\n X_pred3 = transformer.fit_transform(X, y=y_)\n if _safe_tags(transformer_orig, key='non_deterministic'):\n msg = name + ' is non deterministic'\n raise SkipTest(msg)\n if isinstance(X_pred, tuple) and isinstance(X_pred2, tuple):\n for (x_pred, x_pred2, x_pred3) in zip(X_pred, X_pred2, X_pred3):\n assert_allclose_dense_sparse(x_pred, x_pred2, atol=0.01, err_msg='fit_transform and transform outcomes not consistent in %s' % transformer)\n assert_allclose_dense_sparse(x_pred, x_pred3, atol=0.01, err_msg='consecutive fit_transform outcomes not consistent in %s' % transformer)\n else:\n assert_allclose_dense_sparse(X_pred, X_pred2, err_msg='fit_transform and transform outcomes not consistent in %s' % transformer, atol=0.01)\n assert_allclose_dense_sparse(X_pred, X_pred3, atol=0.01, 
err_msg='consecutive fit_transform outcomes not consistent in %s' % transformer)\n assert _num_samples(X_pred2) == n_samples\n assert _num_samples(X_pred3) == n_samples\n if hasattr(X, 'shape') and not _safe_tags(transformer, key='stateless') and X.ndim == 2 and X.shape[1] > 1:\n with raises(ValueError, err_msg=f'The transformer {name} does not raise an error when the number of features in transform is different from the number of features in fit.'):\n transformer.transform(X[:, :-1])" }, { @@ -169519,7 +182992,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -169529,7 +183003,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y_names", @@ -169539,13 +183014,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _choose_check_classifiers_labels(name, y, y_names):\n return y if name in ['LabelPropagation', 'LabelSpreading', 'SelfTrainingClassifier'] else y_names" }, { @@ -169563,7 +183039,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -169572,6 +183049,31 @@ "docstring": "Construct Estimator instance if possible.", "source_code": "\ndef _construct_instance(Estimator):\n \"\"\"Construct Estimator instance if possible.\"\"\"\n required_parameters = getattr(Estimator, '_required_parameters', [])\n if len(required_parameters):\n if required_parameters in (['estimator'], ['base_estimator']):\n if issubclass(Estimator, RANSACRegressor):\n estimator = Estimator(LinearRegression())\n elif issubclass(Estimator, RegressorMixin):\n estimator = Estimator(Ridge())\n else:\n estimator = Estimator(LogisticRegression(C=1))\n elif required_parameters in (['estimators'], ):\n if issubclass(Estimator, RegressorMixin):\n estimator = Estimator(estimators=[('est1', Ridge(alpha=0.1)), ('est2', Ridge(alpha=1))])\n else:\n estimator = Estimator(estimators=[('est1', LogisticRegression(C=0.1)), ('est2', LogisticRegression(C=1))])\n else:\n msg = f\"Can't instantiate estimator {Estimator.__name__} parameters {required_parameters}\"\n warnings.warn(msg, SkipTestWarning)\n raise SkipTest(msg)\n else:\n estimator = Estimator()\n return estimator" }, + { + "name": "_create_memmap_backed_data", + "unique_name": "_create_memmap_backed_data", + "qname": "sklearn.utils.estimator_checks._create_memmap_backed_data", + "unique_qname": "sklearn.utils.estimator_checks._create_memmap_backed_data", + "decorators": [], + "parameters": [ + { + "name": "numpy_arrays", + "default_value": null, + "is_public": false, + "assigned_by": "POSITION_OR_NAME", + "docstring": { + "type": "", + "description": "" + }, + "refined_type": {} + } + ], + "results": [], + "is_public": false, + "description": "", + "docstring": null, + "source_code": "\ndef _create_memmap_backed_data(numpy_arrays):\n has_prescott_openblas = any((True for info in threadpool_info() if info['internal_api'] == 'openblas' and info.get('architecture', 'prescott').lower() == 'prescott'))\n return [create_memmap_backed_data(array, aligned=has_prescott_openblas) for array in numpy_arrays]" + }, { "name": "_enforce_estimator_tags_x", "unique_name": "_enforce_estimator_tags_x", @@ -169587,7 +183089,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -169597,13 +183100,14 @@ "docstring": { "type": "", "description": "" - } + }, + 
"refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _enforce_estimator_tags_x(estimator, X):\n if _is_pairwise(estimator):\n X = X.dot(X.T)\n if '1darray' in _safe_tags(estimator, key='X_types'):\n X = X[:, 0]\n if _safe_tags(estimator, key='requires_positive_X'):\n X -= X.min()\n if 'categorical' in _safe_tags(estimator, key='X_types'):\n X = (X - X.min()).astype(np.int32)\n return X" }, { @@ -169621,7 +183125,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -169631,13 +183136,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _enforce_estimator_tags_y(estimator, y):\n if _safe_tags(estimator, key='requires_positive_y'):\n y += 1 + abs(y.min())\n if _safe_tags(estimator, key='binary_only') and y.size > 0:\n y = np.where(y == y.flat[0], y, y.flat[0] + 1)\n if _safe_tags(estimator, key='multioutput_only'):\n return np.reshape(y, (-1, 1))\n return y" }, { @@ -169655,13 +183161,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Generate sparse matrices with {32,64}bit indices of diverse format.", - "docstring": "Generate sparse matrices with {32,64}bit indices of diverse format.\n\nParameters\n----------\nX_csr: CSR Matrix\n Input matrix in CSR format.\n\nReturns\n-------\nout: iter(Matrices)\n In format['dok', 'lil', 'dia', 'bsr', 'csr', 'csc', 'coo',\n 'coo_64', 'csc_64', 'csr_64']", + "docstring": "Generate sparse matrices with {32,64}bit indices of diverse format.\n\n Parameters\n ----------\n X_csr: CSR Matrix\n Input matrix in CSR format.\n\n Returns\n -------\n out: iter(Matrices)\n In format['dok', 'lil', 'dia', 'bsr', 'csr', 'csc', 'coo',\n 'coo_64', 'csc_64', 'csr_64']\n ", "source_code": "\ndef _generate_sparse_matrix(X_csr):\n \"\"\"Generate sparse matrices with {32,64}bit indices of diverse format.\n\n Parameters\n ----------\n X_csr: CSR Matrix\n Input matrix in CSR format.\n\n Returns\n -------\n out: iter(Matrices)\n In format['dok', 'lil', 'dia', 'bsr', 'csr', 'csc', 'coo',\n 'coo_64', 'csc_64', 'csr_64']\n \"\"\"\n assert X_csr.format == 'csr'\n yield ('csr', X_csr.copy())\n for sparse_format in ['dok', 'lil', 'dia', 'bsr', 'csc', 'coo']:\n yield (sparse_format, X_csr.asformat(sparse_format))\n X_coo = X_csr.asformat('coo')\n X_coo.row = X_coo.row.astype('int64')\n X_coo.col = X_coo.col.astype('int64')\n yield ('coo_64', X_coo)\n for sparse_format in ['csc', 'csr']:\n X = X_csr.asformat(sparse_format)\n X.indices = X.indices.astype('int64')\n X.indptr = X.indptr.astype('int64')\n yield (sparse_format + '_64', X)" }, { @@ -169679,13 +183186,14 @@ "docstring": { "type": "estimator or function", "description": "Items generated by `check_estimator`." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Create pytest ids for checks.\n\nWhen `obj` is an estimator, this returns the pprint version of the estimator (with `print_changed_only=True`). When `obj` is a function, the name of the function is returned with its keyword arguments. 
`_get_check_estimator_ids` is designed to be used as the `id` in `pytest.mark.parametrize` where `check_estimator(..., generate_only=True)` is yielding estimators and checks.", - "docstring": "Create pytest ids for checks.\n\nWhen `obj` is an estimator, this returns the pprint version of the\nestimator (with `print_changed_only=True`). When `obj` is a function, the\nname of the function is returned with its keyword arguments.\n\n`_get_check_estimator_ids` is designed to be used as the `id` in\n`pytest.mark.parametrize` where `check_estimator(..., generate_only=True)`\nis yielding estimators and checks.\n\nParameters\n----------\nobj : estimator or function\n Items generated by `check_estimator`.\n\nReturns\n-------\nid : str or None\n\nSee Also\n--------\ncheck_estimator", + "description": "Create pytest ids for checks.\n\nWhen `obj` is an estimator, this returns the pprint version of the\nestimator (with `print_changed_only=True`). When `obj` is a function, the\nname of the function is returned with its keyword arguments.\n\n`_get_check_estimator_ids` is designed to be used as the `id` in\n`pytest.mark.parametrize` where `check_estimator(..., generate_only=True)`\nis yielding estimators and checks.", + "docstring": "Create pytest ids for checks.\n\n When `obj` is an estimator, this returns the pprint version of the\n estimator (with `print_changed_only=True`). When `obj` is a function, the\n name of the function is returned with its keyword arguments.\n\n `_get_check_estimator_ids` is designed to be used as the `id` in\n `pytest.mark.parametrize` where `check_estimator(..., generate_only=True)`\n is yielding estimators and checks.\n\n Parameters\n ----------\n obj : estimator or function\n Items generated by `check_estimator`.\n\n Returns\n -------\n id : str or None\n\n See Also\n --------\n check_estimator\n ", "source_code": "\ndef _get_check_estimator_ids(obj):\n \"\"\"Create pytest ids for checks.\n\n When `obj` is an estimator, this returns the pprint version of the\n estimator (with `print_changed_only=True`). When `obj` is a function, the\n name of the function is returned with its keyword arguments.\n\n `_get_check_estimator_ids` is designed to be used as the `id` in\n `pytest.mark.parametrize` where `check_estimator(..., generate_only=True)`\n is yielding estimators and checks.\n\n Parameters\n ----------\n obj : estimator or function\n Items generated by `check_estimator`.\n\n Returns\n -------\n id : str or None\n\n See Also\n --------\n check_estimator\n \"\"\"\n if callable(obj):\n if not isinstance(obj, partial):\n return obj.__name__\n if not obj.keywords:\n return obj.func.__name__\n kwstring = ','.join(['{}={}'.format(k, v) for (k, v) in obj.keywords.items()])\n return '{}({})'.format(obj.func.__name__, kwstring)\n if hasattr(obj, 'get_params'):\n with config_context(print_changed_only=True):\n return re.sub('\\\\s', '', str(obj))" }, { @@ -169703,13 +183211,14 @@ "docstring": { "type": "object", "description": "Estimator object to test." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Returns True if estimator accepts pairwise metric.", - "docstring": "Returns True if estimator accepts pairwise metric.\n\nParameters\n----------\nestimator : object\n Estimator object to test.\n\nReturns\n-------\nout : bool\n True if _pairwise is set to True and False otherwise.", + "docstring": "Returns True if estimator accepts pairwise metric.\n\n Parameters\n ----------\n estimator : object\n Estimator object to test.\n\n Returns\n -------\n out : bool\n True if _pairwise is set to True and False otherwise.\n ", "source_code": "\ndef _is_pairwise_metric(estimator):\n \"\"\"Returns True if estimator accepts pairwise metric.\n\n Parameters\n ----------\n estimator : object\n Estimator object to test.\n\n Returns\n -------\n out : bool\n True if _pairwise is set to True and False otherwise.\n \"\"\"\n metric = getattr(estimator, 'metric', None)\n return bool(metric == 'precomputed')" }, { @@ -169727,13 +183236,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _is_public_parameter(attr):\n return not (attr.startswith('_') or attr.endswith('_'))" }, { @@ -169751,7 +183261,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "check", @@ -169761,7 +183272,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "pytest", @@ -169771,13 +183283,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _maybe_mark_xfail(estimator, check, pytest):\n (should_be_marked, reason) = _should_be_skipped_or_marked(estimator, check)\n if not should_be_marked:\n return estimator, check\n else:\n return pytest.param(estimator, check, marks=pytest.mark.xfail(reason=reason))" }, { @@ -169795,7 +183308,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "check", @@ -169805,13 +183319,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _maybe_skip(estimator, check):\n (should_be_skipped, reason) = _should_be_skipped_or_marked(estimator, check)\n if not should_be_skipped:\n return check\n check_name = check.func.__name__ if isinstance(check, partial) else check.__name__\n \n @wraps(check)\n def wrapped(*args, **kwargs):\n raise SkipTest(f'Skipping {check_name} for {estimator.__class__.__name__}: {reason}')\n return wrapped" }, { @@ -169829,7 +183344,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "estimator", @@ -169839,7 +183355,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "kernel", @@ -169849,13 +183366,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _pairwise_estimator_convert_X(X, estimator, kernel=linear_kernel):\n if _is_pairwise_metric(estimator):\n return pairwise_distances(X, metric='euclidean')\n if _is_pairwise(estimator):\n return kernel(X, X)\n return X" }, { @@ -169868,7 +183386,7 @@ "results": [], "is_public": false, 
"description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _regression_dataset():\n global REGRESSION_DATASET\n if REGRESSION_DATASET is None:\n (X, y) = make_regression(n_samples=200, n_features=10, n_informative=1, bias=5.0, noise=20, random_state=42)\n X = StandardScaler().fit_transform(X)\n REGRESSION_DATASET = (X, y)\n return REGRESSION_DATASET" }, { @@ -169886,14 +183404,15 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", - "source_code": "\ndef _set_checking_parameters(estimator):\n params = estimator.get_params()\n name = estimator.__class__.__name__\n if 'n_iter' in params and name != 'TSNE':\n estimator.set_params(n_iter=5)\n if 'max_iter' in params:\n if estimator.max_iter is not None:\n estimator.set_params(max_iter=min(5, estimator.max_iter))\n if estimator.__class__.__name__ in ['LinearSVR', 'LinearSVC']:\n estimator.set_params(max_iter=20)\n if estimator.__class__.__name__ == 'NMF':\n estimator.set_params(max_iter=500, init='nndsvda')\n if estimator.__class__.__name__ in ['MLPClassifier', 'MLPRegressor']:\n estimator.set_params(max_iter=100)\n if 'n_resampling' in params:\n estimator.set_params(n_resampling=5)\n if 'n_estimators' in params:\n estimator.set_params(n_estimators=min(5, estimator.n_estimators))\n if 'max_trials' in params:\n estimator.set_params(max_trials=10)\n if 'n_init' in params:\n estimator.set_params(n_init=2)\n if name == 'TruncatedSVD':\n estimator.n_components = 1\n if hasattr(estimator, 'n_clusters'):\n estimator.n_clusters = min(estimator.n_clusters, 2)\n if hasattr(estimator, 'n_best'):\n estimator.n_best = 1\n if name == 'SelectFdr':\n estimator.set_params(alpha=0.5)\n if name == 'TheilSenRegressor':\n estimator.max_subpopulation = 100\n if isinstance(estimator, BaseRandomProjection):\n estimator.set_params(n_components=2)\n if isinstance(estimator, SelectKBest):\n estimator.set_params(k=1)\n if name in ('HistGradientBoostingClassifier', 'HistGradientBoostingRegressor'):\n estimator.set_params(min_samples_leaf=5)\n if name == 'DummyClassifier':\n estimator.set_params(strategy='stratified')\n loo_cv = ['RidgeCV', 'RidgeClassifierCV']\n if name not in loo_cv and hasattr(estimator, 'cv'):\n estimator.set_params(cv=3)\n if hasattr(estimator, 'n_splits'):\n estimator.set_params(n_splits=3)\n if name == 'OneHotEncoder':\n estimator.set_params(handle_unknown='ignore')\n if name in CROSS_DECOMPOSITION:\n estimator.set_params(n_components=1)" + "docstring": null, + "source_code": "\ndef _set_checking_parameters(estimator):\n params = estimator.get_params()\n name = estimator.__class__.__name__\n if 'n_iter' in params and name != 'TSNE':\n estimator.set_params(n_iter=5)\n if 'max_iter' in params:\n if estimator.max_iter is not None:\n estimator.set_params(max_iter=min(5, estimator.max_iter))\n if estimator.__class__.__name__ in ['LinearSVR', 'LinearSVC']:\n estimator.set_params(max_iter=20)\n if estimator.__class__.__name__ == 'NMF':\n estimator.set_params(max_iter=500, init='nndsvda')\n if estimator.__class__.__name__ in ['MLPClassifier', 'MLPRegressor']:\n estimator.set_params(max_iter=100)\n if 'n_resampling' in params:\n estimator.set_params(n_resampling=5)\n if 'n_estimators' in params:\n estimator.set_params(n_estimators=min(5, estimator.n_estimators))\n if 'max_trials' in params:\n estimator.set_params(max_trials=10)\n if 'n_init' in params:\n estimator.set_params(n_init=2)\n if name == 'MeanShift':\n 
estimator.set_params(bandwidth=1.0)\n if name == 'TruncatedSVD':\n estimator.n_components = 1\n if name == 'LassoLarsIC':\n estimator.set_params(noise_variance=1.0)\n if hasattr(estimator, 'n_clusters'):\n estimator.n_clusters = min(estimator.n_clusters, 2)\n if hasattr(estimator, 'n_best'):\n estimator.n_best = 1\n if name == 'SelectFdr':\n estimator.set_params(alpha=0.5)\n if name == 'TheilSenRegressor':\n estimator.max_subpopulation = 100\n if isinstance(estimator, BaseRandomProjection):\n estimator.set_params(n_components=2)\n if isinstance(estimator, SelectKBest):\n estimator.set_params(k=1)\n if name in ('HistGradientBoostingClassifier', 'HistGradientBoostingRegressor'):\n estimator.set_params(min_samples_leaf=5)\n if name == 'DummyClassifier':\n estimator.set_params(strategy='stratified')\n loo_cv = ['RidgeCV', 'RidgeClassifierCV']\n if name not in loo_cv and hasattr(estimator, 'cv'):\n estimator.set_params(cv=3)\n if hasattr(estimator, 'n_splits'):\n estimator.set_params(n_splits=3)\n if name == 'OneHotEncoder':\n estimator.set_params(handle_unknown='ignore')\n if name in CROSS_DECOMPOSITION:\n estimator.set_params(n_components=1)" }, { "name": "_should_be_skipped_or_marked", @@ -169910,7 +183429,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "check", @@ -169920,13 +183440,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _should_be_skipped_or_marked(estimator, check):\n check_name = check.func.__name__ if isinstance(check, partial) else check.__name__\n xfail_checks = _safe_tags(estimator, key='_xfail_checks') or {}\n if check_name in xfail_checks:\n return True, xfail_checks[check_name]\n return False, 'placeholder reason that will never be used'" }, { @@ -169944,13 +183465,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _yield_all_checks(estimator):\n name = estimator.__class__.__name__\n tags = _safe_tags(estimator)\n if '2darray' not in tags['X_types']:\n warnings.warn(\"Can't test estimator {} which requires input of type {}\".format(name, tags['X_types']), SkipTestWarning)\n return\n if tags['_skip_test']:\n warnings.warn('Explicit SKIP via _skip_test tag for estimator {}.'.format(name), SkipTestWarning)\n return\n for check in _yield_checks(estimator):\n yield check\n if is_classifier(estimator):\n for check in _yield_classifier_checks(estimator):\n yield check\n if is_regressor(estimator):\n for check in _yield_regressor_checks(estimator):\n yield check\n if hasattr(estimator, 'transform'):\n for check in _yield_transformer_checks(estimator):\n yield check\n if isinstance(estimator, ClusterMixin):\n for check in _yield_clustering_checks(estimator):\n yield check\n if is_outlier_detector(estimator):\n for check in _yield_outliers_checks(estimator):\n yield check\n yield check_parameters_default_constructible\n yield check_methods_sample_order_invariance\n yield check_methods_subset_invariance\n yield check_fit2d_1sample\n yield check_fit2d_1feature\n yield check_get_params_invariance\n yield check_set_params\n yield check_dict_unchanged\n yield check_dont_overwrite_parameters\n yield check_fit_idempotent\n yield check_fit_check_is_fitted\n if not tags['no_validation']:\n yield check_n_features_in\n yield check_fit1d\n yield 
check_fit2d_predict1d\n if tags['requires_y']:\n yield check_requires_y_none\n if tags['requires_positive_X']:\n yield check_fit_non_negative" }, { @@ -169968,13 +183490,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _yield_checks(estimator):\n name = estimator.__class__.__name__\n tags = _safe_tags(estimator)\n pairwise = _is_pairwise(estimator)\n yield check_no_attributes_set_in_init\n yield check_estimators_dtypes\n yield check_fit_score_takes_y\n if has_fit_parameter(estimator, 'sample_weight'):\n yield check_sample_weights_pandas_series\n yield check_sample_weights_not_an_array\n yield check_sample_weights_list\n if not pairwise:\n yield check_sample_weights_shape\n yield check_sample_weights_not_overwritten\n yield partial(check_sample_weights_invariance, kind='ones')\n yield partial(check_sample_weights_invariance, kind='zeros')\n yield check_estimators_fit_returns_self\n yield partial(check_estimators_fit_returns_self, readonly_memmap=True)\n if not tags['no_validation']:\n yield check_complex_data\n yield check_dtype_object\n yield check_estimators_empty_data_messages\n if name not in CROSS_DECOMPOSITION:\n yield check_pipeline_consistency\n if not tags['allow_nan'] and not tags['no_validation']:\n yield check_estimators_nan_inf\n if pairwise:\n yield check_nonsquare_error\n yield check_estimators_overwrite_params\n if hasattr(estimator, 'sparsify'):\n yield check_sparsify_coefficients\n yield check_estimator_sparse_data\n yield check_estimators_pickle\n yield check_estimator_get_tags_default_keys" }, { @@ -169992,13 +183515,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _yield_classifier_checks(classifier):\n tags = _safe_tags(classifier)\n yield check_classifier_data_not_an_array\n yield check_classifiers_one_label\n yield check_classifiers_classes\n yield check_estimators_partial_fit_n_features\n if tags['multioutput']:\n yield check_classifier_multioutput\n yield check_classifiers_train\n yield partial(check_classifiers_train, readonly_memmap=True)\n yield partial(check_classifiers_train, readonly_memmap=True, X_dtype='float32')\n yield check_classifiers_regression_target\n if tags['multilabel']:\n yield check_classifiers_multilabel_representation_invariance\n yield check_classifiers_multilabel_output_format_predict\n yield check_classifiers_multilabel_output_format_predict_proba\n yield check_classifiers_multilabel_output_format_decision_function\n if not tags['no_validation']:\n yield check_supervised_y_no_nan\n if not tags['multioutput_only']:\n yield check_supervised_y_2d\n if tags['requires_fit']:\n yield check_estimators_unfitted\n if 'class_weight' in classifier.get_params().keys():\n yield check_class_weight_classifiers\n yield check_non_transformer_estimators_n_iter\n yield check_decision_proba_consistency" }, { @@ -170016,13 +183540,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _yield_clustering_checks(clusterer):\n yield check_clusterer_compute_labels_predict\n name = clusterer.__class__.__name__\n if name not in ('WardAgglomeration', 'FeatureAgglomeration'):\n yield check_clustering\n yield 
partial(check_clustering, readonly_memmap=True)\n yield check_estimators_partial_fit_n_features\n yield check_non_transformer_estimators_n_iter" }, { @@ -170040,13 +183565,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _yield_outliers_checks(estimator):\n if hasattr(estimator, 'fit_predict'):\n yield check_outliers_fit_predict\n if hasattr(estimator, 'predict'):\n yield check_outliers_train\n yield partial(check_outliers_train, readonly_memmap=True)\n yield check_classifier_data_not_an_array\n if _safe_tags(estimator, key='requires_fit'):\n yield check_estimators_unfitted" }, { @@ -170064,13 +183590,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _yield_regressor_checks(regressor):\n tags = _safe_tags(regressor)\n yield check_regressors_train\n yield partial(check_regressors_train, readonly_memmap=True)\n yield partial(check_regressors_train, readonly_memmap=True, X_dtype='float32')\n yield check_regressor_data_not_an_array\n yield check_estimators_partial_fit_n_features\n if tags['multioutput']:\n yield check_regressor_multioutput\n yield check_regressors_no_decision_function\n if not tags['no_validation'] and not tags['multioutput_only']:\n yield check_supervised_y_2d\n yield check_supervised_y_no_nan\n name = regressor.__class__.__name__\n if name != 'CCA':\n yield check_regressors_int\n if tags['requires_fit']:\n yield check_estimators_unfitted\n yield check_non_transformer_estimators_n_iter" }, { @@ -170088,13 +183615,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _yield_transformer_checks(transformer):\n tags = _safe_tags(transformer)\n if not tags['no_validation']:\n yield check_transformer_data_not_an_array\n yield check_transformer_general\n if tags['preserves_dtype']:\n yield check_transformer_preserve_dtypes\n yield partial(check_transformer_general, readonly_memmap=True)\n if not _safe_tags(transformer, key='stateless'):\n yield check_transformers_unfitted\n external_solver = ['Isomap', 'KernelPCA', 'LocallyLinearEmbedding', 'RandomizedLasso', 'LogisticRegressionCV']\n name = transformer.__class__.__name__\n if name not in external_solver:\n yield check_transformer_n_iter" }, { @@ -170112,7 +183640,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "classifier_orig", @@ -170122,7 +183651,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X_train", @@ -170132,7 +183662,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y_train", @@ -170142,7 +183673,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X_test", @@ -170152,7 +183684,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y_test", @@ -170162,7 +183695,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "weights", @@ -170172,13 +183706,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, 
"source_code": "\n@ignore_warnings(category=FutureWarning)\ndef check_class_weight_balanced_classifiers(name, classifier_orig, X_train, y_train, X_test, y_test, weights):\n classifier = clone(classifier_orig)\n if hasattr(classifier, 'n_iter'):\n classifier.set_params(n_iter=100)\n if hasattr(classifier, 'max_iter'):\n classifier.set_params(max_iter=1000)\n set_random_state(classifier)\n classifier.fit(X_train, y_train)\n y_pred = classifier.predict(X_test)\n classifier.set_params(class_weight='balanced')\n classifier.fit(X_train, y_train)\n y_pred_balanced = classifier.predict(X_test)\n assert f1_score(y_test, y_pred_balanced, average='weighted') > f1_score(y_test, y_pred, average='weighted')" }, { @@ -170196,7 +183731,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "Classifier", @@ -170206,7 +183742,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -170230,7 +183767,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "classifier_orig", @@ -170240,13 +183778,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@ignore_warnings(category=FutureWarning)\ndef check_class_weight_classifiers(name, classifier_orig):\n if _safe_tags(classifier_orig, key='binary_only'):\n problems = [2]\n else:\n problems = [2, 3]\n for n_centers in problems:\n (X, y) = make_blobs(centers=n_centers, random_state=0, cluster_std=20)\n (X_train, X_test, y_train, y_test) = train_test_split(X, y, test_size=0.5, random_state=0)\n if _is_pairwise(classifier_orig):\n X_test = rbf_kernel(X_test, X_train)\n X_train = rbf_kernel(X_train, X_train)\n n_centers = len(np.unique(y_train))\n if n_centers == 2:\n class_weight = {0: 1000, 1: 0.0001}\n else:\n class_weight = {0: 1000, 1: 0.0001, 2: 0.0001}\n classifier = clone(classifier_orig).set_params(class_weight=class_weight)\n if hasattr(classifier, 'n_iter'):\n classifier.set_params(n_iter=100)\n if hasattr(classifier, 'max_iter'):\n classifier.set_params(max_iter=1000)\n if hasattr(classifier, 'min_weight_fraction_leaf'):\n classifier.set_params(min_weight_fraction_leaf=0.01)\n if hasattr(classifier, 'n_iter_no_change'):\n classifier.set_params(n_iter_no_change=20)\n set_random_state(classifier)\n classifier.fit(X_train, y_train)\n y_pred = classifier.predict(X_test)\n if not _safe_tags(classifier_orig, key='poor_score'):\n assert np.mean(y_pred == 0) > 0.87" }, { @@ -170264,7 +183803,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "estimator_orig", @@ -170274,13 +183814,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@ignore_warnings(category=FutureWarning)\ndef check_classifier_data_not_an_array(name, estimator_orig):\n X = np.array([[3, 0], [0, 1], [0, 2], [1, 1], [1, 2], [2, 1], [0, 3], [1, 0], [2, 0], [4, 4], [2, 3], [3, 2]])\n X = _pairwise_estimator_convert_X(X, estimator_orig)\n y = np.array([1, 1, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2])\n y = _enforce_estimator_tags_y(estimator_orig, y)\n for obj_type in ['NotAnArray', 'PandasDataframe']:\n check_estimators_data_not_an_array(name, estimator_orig, X, y, obj_type)" }, { @@ -170298,7 +183839,8 @@ "docstring": { "type": "", "description": "" - } + }, + 
"refined_type": {} }, { "name": "estimator", @@ -170308,13 +183850,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@ignore_warnings(category=FutureWarning)\ndef check_classifier_multioutput(name, estimator):\n (n_samples, n_labels, n_classes) = (42, 5, 3)\n tags = _safe_tags(estimator)\n estimator = clone(estimator)\n (X, y) = make_multilabel_classification(random_state=42, n_samples=n_samples, n_labels=n_labels, n_classes=n_classes)\n estimator.fit(X, y)\n y_pred = estimator.predict(X)\n assert y_pred.shape == (n_samples, n_classes), 'The shape of the prediction for multioutput data is incorrect. Expected {}, got {}.'.format((n_samples, n_labels), y_pred.shape)\n assert y_pred.dtype.kind == 'i'\n if hasattr(estimator, 'decision_function'):\n decision = estimator.decision_function(X)\n assert isinstance(decision, np.ndarray)\n assert decision.shape == (n_samples, n_classes), 'The shape of the decision function output for multioutput data is incorrect. Expected {}, got {}.'.format((n_samples, n_classes), decision.shape)\n dec_pred = (decision > 0).astype(int)\n dec_exp = estimator.classes_[dec_pred]\n assert_array_equal(dec_exp, y_pred)\n if hasattr(estimator, 'predict_proba'):\n y_prob = estimator.predict_proba(X)\n if isinstance(y_prob, list) and not tags['poor_score']:\n for i in range(n_classes):\n assert y_prob[i].shape == (n_samples, 2), 'The shape of the probability for multioutput data is incorrect. Expected {}, got {}.'.format((n_samples, 2), y_prob[i].shape)\n assert_array_equal(np.argmax(y_prob[i], axis=1).astype(int), y_pred[:, i])\n elif not tags['poor_score']:\n assert y_prob.shape == (n_samples, n_classes), 'The shape of the probability for multioutput data is incorrect. 
Expected {}, got {}.'.format((n_samples, n_classes), y_prob.shape)\n assert_array_equal(y_prob.round().astype(int), y_pred)\n if hasattr(estimator, 'decision_function') and hasattr(estimator, 'predict_proba'):\n for i in range(n_classes):\n y_proba = estimator.predict_proba(X)[:, i]\n y_decision = estimator.decision_function(X)\n assert_array_equal(rankdata(y_proba), rankdata(y_decision[:, i]))" }, { @@ -170332,7 +183875,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "classifier_orig", @@ -170342,13 +183886,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef check_classifiers_classes(name, classifier_orig):\n (X_multiclass, y_multiclass) = make_blobs(n_samples=30, random_state=0, cluster_std=0.1)\n (X_multiclass, y_multiclass) = shuffle(X_multiclass, y_multiclass, random_state=7)\n X_multiclass = StandardScaler().fit_transform(X_multiclass)\n X_multiclass -= X_multiclass.min() - 0.1\n X_binary = X_multiclass[y_multiclass != 2]\n y_binary = y_multiclass[y_multiclass != 2]\n X_multiclass = _pairwise_estimator_convert_X(X_multiclass, classifier_orig)\n X_binary = _pairwise_estimator_convert_X(X_binary, classifier_orig)\n labels_multiclass = ['one', 'two', 'three']\n labels_binary = ['one', 'two']\n y_names_multiclass = np.take(labels_multiclass, y_multiclass)\n y_names_binary = np.take(labels_binary, y_binary)\n problems = [(X_binary, y_binary, y_names_binary)]\n if not _safe_tags(classifier_orig, key='binary_only'):\n problems.append((X_multiclass, y_multiclass, y_names_multiclass))\n for (X, y, y_names) in problems:\n for y_names_i in [y_names, y_names.astype('O')]:\n y_ = _choose_check_classifiers_labels(name, y, y_names_i)\n check_classifiers_predictions(X, y_, name, classifier_orig)\n labels_binary = [-1, 1]\n y_names_binary = np.take(labels_binary, y_binary)\n y_binary = _choose_check_classifiers_labels(name, y_binary, y_names_binary)\n check_classifiers_predictions(X_binary, y_binary, name, classifier_orig)" }, { @@ -170366,7 +183911,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "classifier_orig", @@ -170376,13 +183922,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Check the output of the `decision_function` method for classifiers supporting multilabel-indicator targets.", - "docstring": "Check the output of the `decision_function` method for classifiers supporting\nmultilabel-indicator targets.", + "description": "Check the output of the `decision_function` method for classifiers supporting\nmultilabel-indicator targets.", + "docstring": "Check the output of the `decision_function` method for classifiers supporting\n multilabel-indicator targets.", "source_code": "\n@ignore_warnings(category=FutureWarning)\ndef check_classifiers_multilabel_output_format_decision_function(name, classifier_orig):\n \"\"\"Check the output of the `decision_function` method for classifiers supporting\n multilabel-indicator targets.\"\"\"\n classifier = clone(classifier_orig)\n set_random_state(classifier)\n (n_samples, test_size, n_outputs) = (100, 25, 5)\n (X, y) = make_multilabel_classification(n_samples=n_samples, n_features=2, n_classes=n_outputs, n_labels=3, length=50, allow_unlabeled=True, random_state=0)\n X = scale(X)\n (X_train, X_test) = (X[:-test_size], 
X[-test_size:])\n y_train = y[:-test_size]\n classifier.fit(X_train, y_train)\n response_method_name = 'decision_function'\n decision_function_method = getattr(classifier, response_method_name, None)\n if decision_function_method is None:\n raise SkipTest(f'{name} does not have a {response_method_name} method.')\n y_pred = decision_function_method(X_test)\n assert isinstance(y_pred, np.ndarray), f'{name}.decision_function is expected to output a NumPy array. Got {type(y_pred)} instead.'\n assert y_pred.shape == (test_size, n_outputs), f'{name}.decision_function is expected to provide a NumPy array of shape (n_samples, n_outputs). Got {y_pred.shape} instead of {(test_size, n_outputs)}.'\n assert y_pred.dtype.kind == 'f', f'{name}.decision_function is expected to output a floating dtype. Got {y_pred.dtype} instead.'" }, { @@ -170400,7 +183947,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "classifier_orig", @@ -170410,13 +183958,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Check the output of the `predict` method for classifiers supporting multilabel-indicator targets.", - "docstring": "Check the output of the `predict` method for classifiers supporting\nmultilabel-indicator targets.", + "description": "Check the output of the `predict` method for classifiers supporting\nmultilabel-indicator targets.", + "docstring": "Check the output of the `predict` method for classifiers supporting\n multilabel-indicator targets.", "source_code": "\n@ignore_warnings(category=FutureWarning)\ndef check_classifiers_multilabel_output_format_predict(name, classifier_orig):\n \"\"\"Check the output of the `predict` method for classifiers supporting\n multilabel-indicator targets.\"\"\"\n classifier = clone(classifier_orig)\n set_random_state(classifier)\n (n_samples, test_size, n_outputs) = (100, 25, 5)\n (X, y) = make_multilabel_classification(n_samples=n_samples, n_features=2, n_classes=n_outputs, n_labels=3, length=50, allow_unlabeled=True, random_state=0)\n X = scale(X)\n (X_train, X_test) = (X[:-test_size], X[-test_size:])\n (y_train, y_test) = (y[:-test_size], y[-test_size:])\n classifier.fit(X_train, y_train)\n response_method_name = 'predict'\n predict_method = getattr(classifier, response_method_name, None)\n if predict_method is None:\n raise SkipTest(f'{name} does not have a {response_method_name} method.')\n y_pred = predict_method(X_test)\n assert isinstance(y_pred, np.ndarray), f'{name}.predict is expected to output a NumPy array. Got {type(y_pred)} instead.'\n assert y_pred.shape == y_test.shape, f'{name}.predict outputs a NumPy array of shape {y_pred.shape} instead of {y_test.shape}.'\n assert y_pred.dtype == y_test.dtype, f'{name}.predict does not output the same dtype than the targets. 
Got {y_pred.dtype} instead of {y_test.dtype}.'" }, { @@ -170434,7 +183983,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "classifier_orig", @@ -170444,13 +183994,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Check the output of the `predict_proba` method for classifiers supporting multilabel-indicator targets.", - "docstring": "Check the output of the `predict_proba` method for classifiers supporting\nmultilabel-indicator targets.", + "description": "Check the output of the `predict_proba` method for classifiers supporting\nmultilabel-indicator targets.", + "docstring": "Check the output of the `predict_proba` method for classifiers supporting\n multilabel-indicator targets.", "source_code": "\n@ignore_warnings(category=FutureWarning)\ndef check_classifiers_multilabel_output_format_predict_proba(name, classifier_orig):\n \"\"\"Check the output of the `predict_proba` method for classifiers supporting\n multilabel-indicator targets.\"\"\"\n classifier = clone(classifier_orig)\n set_random_state(classifier)\n (n_samples, test_size, n_outputs) = (100, 25, 5)\n (X, y) = make_multilabel_classification(n_samples=n_samples, n_features=2, n_classes=n_outputs, n_labels=3, length=50, allow_unlabeled=True, random_state=0)\n X = scale(X)\n (X_train, X_test) = (X[:-test_size], X[-test_size:])\n y_train = y[:-test_size]\n classifier.fit(X_train, y_train)\n response_method_name = 'predict_proba'\n predict_proba_method = getattr(classifier, response_method_name, None)\n if predict_proba_method is None:\n raise SkipTest(f'{name} does not have a {response_method_name} method.')\n y_pred = predict_proba_method(X_test)\n if isinstance(y_pred, list):\n assert len(y_pred) == n_outputs, f'When {name}.predict_proba returns a list, the list should be of length n_outputs and contain NumPy arrays. Got length of {len(y_pred)} instead of {n_outputs}.'\n for pred in y_pred:\n assert pred.shape == (test_size, 2), f'When {name}.predict_proba returns a list, this list should contain NumPy arrays of shape (n_samples, 2). Got NumPy arrays of shape {pred.shape} instead of {(test_size, 2)}.'\n assert pred.dtype.kind == 'f', f'When {name}.predict_proba returns a list, it should contain NumPy arrays with floating dtype. Got {pred.dtype} instead.'\n err_msg = f'When {name}.predict_proba returns a list, each NumPy array should contain probabilities for each class and thus each row should sum to 1 (or close to 1 due to numerical errors).'\n assert_allclose(pred.sum(axis=1), 1, err_msg=err_msg)\n elif isinstance(y_pred, np.ndarray):\n assert y_pred.shape == (test_size, n_outputs), f'When {name}.predict_proba returns a NumPy array, the expected shape is (n_samples, n_outputs). Got {y_pred.shape} instead of {(test_size, n_outputs)}.'\n assert y_pred.dtype.kind == 'f', f'When {name}.predict_proba returns a NumPy array, the expected data type is floating. Got {y_pred.dtype} instead.'\n err_msg = f'When {name}.predict_proba returns a NumPy array, this array is expected to provide probabilities of the positive class and should therefore contain values between 0 and 1.'\n assert_array_less(0, y_pred, err_msg=err_msg)\n assert_array_less(y_pred, 1, err_msg=err_msg)\n else:\n raise ValueError(f'Unknown returned type {type(y_pred)} by {name}.predict_proba. 
A list or a Numpy array is expected.')" }, { @@ -170468,7 +184019,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "classifier_orig", @@ -170478,13 +184030,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@ignore_warnings(category=FutureWarning)\ndef check_classifiers_multilabel_representation_invariance(name, classifier_orig):\n (X, y) = make_multilabel_classification(n_samples=100, n_features=2, n_classes=5, n_labels=3, length=50, allow_unlabeled=True, random_state=0)\n X = scale(X)\n (X_train, y_train) = (X[:80], y[:80])\n X_test = X[80:]\n y_train_list_of_lists = y_train.tolist()\n y_train_list_of_arrays = list(y_train)\n classifier = clone(classifier_orig)\n set_random_state(classifier)\n y_pred = classifier.fit(X_train, y_train).predict(X_test)\n y_pred_list_of_lists = classifier.fit(X_train, y_train_list_of_lists).predict(X_test)\n y_pred_list_of_arrays = classifier.fit(X_train, y_train_list_of_arrays).predict(X_test)\n assert_array_equal(y_pred, y_pred_list_of_arrays)\n assert_array_equal(y_pred, y_pred_list_of_lists)\n assert y_pred.dtype == y_pred_list_of_arrays.dtype\n assert y_pred.dtype == y_pred_list_of_lists.dtype\n assert type(y_pred) == type(y_pred_list_of_arrays)\n assert type(y_pred) == type(y_pred_list_of_lists)" }, { @@ -170502,7 +184055,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "classifier_orig", @@ -170512,13 +184066,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@ignore_warnings(category=FutureWarning)\ndef check_classifiers_one_label(name, classifier_orig):\n error_string_fit = \"Classifier can't train when only one class is present.\"\n error_string_predict = \"Classifier can't predict when only one class is present.\"\n rnd = np.random.RandomState(0)\n X_train = rnd.uniform(size=(10, 3))\n X_test = rnd.uniform(size=(10, 3))\n y = np.ones(10)\n with ignore_warnings(category=FutureWarning):\n classifier = clone(classifier_orig)\n with raises(ValueError, match='class', may_pass=True, err_msg=error_string_fit) as cm:\n classifier.fit(X_train, y)\n if cm.raised_and_matched:\n return\n assert_array_equal(classifier.predict(X_test), y, err_msg=error_string_predict)" }, { @@ -170536,7 +184091,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -170546,7 +184102,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "name", @@ -170556,7 +184113,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "classifier_orig", @@ -170566,13 +184124,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@ignore_warnings\ndef check_classifiers_predictions(X, y, name, classifier_orig):\n classes = np.unique(y)\n classifier = clone(classifier_orig)\n if name == 'BernoulliNB':\n X = X > X.mean()\n set_random_state(classifier)\n classifier.fit(X, y)\n y_pred = classifier.predict(X)\n if hasattr(classifier, 'decision_function'):\n decision = classifier.decision_function(X)\n assert isinstance(decision, np.ndarray)\n if len(classes) == 2:\n 
dec_pred = (decision.ravel() > 0).astype(int)\n dec_exp = classifier.classes_[dec_pred]\n assert_array_equal(dec_exp, y_pred, err_msg=\"decision_function does not match classifier for %r: expected '%s', got '%s'\" % (classifier, ', '.join(map(str, dec_exp)), ', '.join(map(str, y_pred))))\n elif getattr(classifier, 'decision_function_shape', 'ovr') == 'ovr':\n decision_y = np.argmax(decision, axis=1).astype(int)\n y_exp = classifier.classes_[decision_y]\n assert_array_equal(y_exp, y_pred, err_msg=\"decision_function does not match classifier for %r: expected '%s', got '%s'\" % (classifier, ', '.join(map(str, y_exp)), ', '.join(map(str, y_pred))))\n if name != 'ComplementNB':\n assert_array_equal(np.unique(y), np.unique(y_pred))\n assert_array_equal(classes, classifier.classes_, err_msg=\"Unexpected classes_ attribute for %r: expected '%s', got '%s'\" % (classifier, ', '.join(map(str, classes)), ', '.join(map(str, classifier.classes_))))" }, { @@ -170590,7 +184149,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "estimator_orig", @@ -170600,13 +184160,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@ignore_warnings(category=FutureWarning)\ndef check_classifiers_regression_target(name, estimator_orig):\n (X, y) = _regression_dataset()\n X = X + 1 + abs(X.min(axis=0))\n e = clone(estimator_orig)\n msg = 'Unknown label type: '\n if not _safe_tags(e, key='no_validation'):\n with raises(ValueError, match=msg):\n e.fit(X, y)" }, { @@ -170624,7 +184185,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "classifier_orig", @@ -170634,7 +184196,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "readonly_memmap", @@ -170644,7 +184207,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X_dtype", @@ -170654,14 +184218,15 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", - "source_code": "\n@ignore_warnings\ndef check_classifiers_train(name, classifier_orig, readonly_memmap=False, X_dtype='float64'):\n (X_m, y_m) = make_blobs(n_samples=300, random_state=0)\n X_m = X_m.astype(X_dtype)\n (X_m, y_m) = shuffle(X_m, y_m, random_state=7)\n X_m = StandardScaler().fit_transform(X_m)\n y_b = y_m[y_m != 2]\n X_b = X_m[y_m != 2]\n if name in ['BernoulliNB', 'MultinomialNB', 'ComplementNB', 'CategoricalNB']:\n X_m -= X_m.min()\n X_b -= X_b.min()\n if readonly_memmap:\n (X_m, y_m, X_b, y_b) = create_memmap_backed_data([X_m, y_m, X_b, y_b])\n problems = [(X_b, y_b)]\n tags = _safe_tags(classifier_orig)\n if not tags['binary_only']:\n problems.append((X_m, y_m))\n for (X, y) in problems:\n classes = np.unique(y)\n n_classes = len(classes)\n (n_samples, n_features) = X.shape\n classifier = clone(classifier_orig)\n X = _pairwise_estimator_convert_X(X, classifier)\n y = _enforce_estimator_tags_y(classifier, y)\n set_random_state(classifier)\n if not tags['no_validation']:\n with raises(ValueError, err_msg=f'The classifier {name} does not raise an error when incorrect/malformed input data for fit is passed. The number of training examples is not the same as the number of labels. 
Perhaps use check_X_y in fit.'):\n classifier.fit(X, y[:-1])\n classifier.fit(X, y)\n classifier.fit(X.tolist(), y.tolist())\n assert hasattr(classifier, 'classes_')\n y_pred = classifier.predict(X)\n assert y_pred.shape == (n_samples, )\n if not tags['poor_score']:\n assert accuracy_score(y, y_pred) > 0.83\n msg_pairwise = 'The classifier {} does not raise an error when shape of X in {} is not equal to (n_test_samples, n_training_samples)'\n msg = 'The classifier {} does not raise an error when the number of features in {} is different from the number of features in fit.'\n if not tags['no_validation']:\n if _is_pairwise(classifier):\n with raises(ValueError, err_msg=msg_pairwise.format(name, 'predict')):\n classifier.predict(X.reshape(-1, 1))\n else:\n with raises(ValueError, err_msg=msg.format(name, 'predict')):\n classifier.predict(X.T)\n if hasattr(classifier, 'decision_function'):\n try:\n decision = classifier.decision_function(X)\n if n_classes == 2:\n if not tags['multioutput_only']:\n assert decision.shape == (n_samples, )\n else:\n assert decision.shape == (n_samples, 1)\n dec_pred = (decision.ravel() > 0).astype(int)\n assert_array_equal(dec_pred, y_pred)\n else:\n assert decision.shape == (n_samples, n_classes)\n assert_array_equal(np.argmax(decision, axis=1), y_pred)\n if not tags['no_validation']:\n if _is_pairwise(classifier):\n with raises(ValueError, err_msg=msg_pairwise.format(name, 'decision_function')):\n classifier.decision_function(X.reshape(-1, 1))\n else:\n with raises(ValueError, err_msg=msg.format(name, 'decision_function')):\n classifier.decision_function(X.T)\n except NotImplementedError:\n pass\n if hasattr(classifier, 'predict_proba'):\n y_prob = classifier.predict_proba(X)\n assert y_prob.shape == (n_samples, n_classes)\n assert_array_equal(np.argmax(y_prob, axis=1), y_pred)\n assert_array_almost_equal(np.sum(y_prob, axis=1), np.ones(n_samples))\n if not tags['no_validation']:\n if _is_pairwise(classifier_orig):\n with raises(ValueError, err_msg=msg_pairwise.format(name, 'predict_proba')):\n classifier.predict_proba(X.reshape(-1, 1))\n else:\n with raises(ValueError, err_msg=msg.format(name, 'predict_proba')):\n classifier.predict_proba(X.T)\n if hasattr(classifier, 'predict_log_proba'):\n y_log_prob = classifier.predict_log_proba(X)\n assert_allclose(y_log_prob, np.log(y_prob), 8, atol=1e-09)\n assert_array_equal(np.argsort(y_log_prob), np.argsort(y_prob))" + "docstring": null, + "source_code": "\n@ignore_warnings\ndef check_classifiers_train(name, classifier_orig, readonly_memmap=False, X_dtype='float64'):\n (X_m, y_m) = make_blobs(n_samples=300, random_state=0)\n X_m = X_m.astype(X_dtype)\n (X_m, y_m) = shuffle(X_m, y_m, random_state=7)\n X_m = StandardScaler().fit_transform(X_m)\n y_b = y_m[y_m != 2]\n X_b = X_m[y_m != 2]\n if name in ['BernoulliNB', 'MultinomialNB', 'ComplementNB', 'CategoricalNB']:\n X_m -= X_m.min()\n X_b -= X_b.min()\n if readonly_memmap:\n (X_m, y_m, X_b, y_b) = _create_memmap_backed_data([X_m, y_m, X_b, y_b])\n problems = [(X_b, y_b)]\n tags = _safe_tags(classifier_orig)\n if not tags['binary_only']:\n problems.append((X_m, y_m))\n for (X, y) in problems:\n classes = np.unique(y)\n n_classes = len(classes)\n (n_samples, n_features) = X.shape\n classifier = clone(classifier_orig)\n X = _pairwise_estimator_convert_X(X, classifier)\n y = _enforce_estimator_tags_y(classifier, y)\n set_random_state(classifier)\n if not tags['no_validation']:\n with raises(ValueError, err_msg=f'The classifier {name} does not raise an error when 
incorrect/malformed input data for fit is passed. The number of training examples is not the same as the number of labels. Perhaps use check_X_y in fit.'):\n classifier.fit(X, y[:-1])\n classifier.fit(X, y)\n classifier.fit(X.tolist(), y.tolist())\n assert hasattr(classifier, 'classes_')\n y_pred = classifier.predict(X)\n assert y_pred.shape == (n_samples, )\n if not tags['poor_score']:\n assert accuracy_score(y, y_pred) > 0.83\n msg_pairwise = 'The classifier {} does not raise an error when shape of X in {} is not equal to (n_test_samples, n_training_samples)'\n msg = 'The classifier {} does not raise an error when the number of features in {} is different from the number of features in fit.'\n if not tags['no_validation']:\n if _is_pairwise(classifier):\n with raises(ValueError, err_msg=msg_pairwise.format(name, 'predict')):\n classifier.predict(X.reshape(-1, 1))\n else:\n with raises(ValueError, err_msg=msg.format(name, 'predict')):\n classifier.predict(X.T)\n if hasattr(classifier, 'decision_function'):\n try:\n decision = classifier.decision_function(X)\n if n_classes == 2:\n if not tags['multioutput_only']:\n assert decision.shape == (n_samples, )\n else:\n assert decision.shape == (n_samples, 1)\n dec_pred = (decision.ravel() > 0).astype(int)\n assert_array_equal(dec_pred, y_pred)\n else:\n assert decision.shape == (n_samples, n_classes)\n assert_array_equal(np.argmax(decision, axis=1), y_pred)\n if not tags['no_validation']:\n if _is_pairwise(classifier):\n with raises(ValueError, err_msg=msg_pairwise.format(name, 'decision_function')):\n classifier.decision_function(X.reshape(-1, 1))\n else:\n with raises(ValueError, err_msg=msg.format(name, 'decision_function')):\n classifier.decision_function(X.T)\n except NotImplementedError:\n pass\n if hasattr(classifier, 'predict_proba'):\n y_prob = classifier.predict_proba(X)\n assert y_prob.shape == (n_samples, n_classes)\n assert_array_equal(np.argmax(y_prob, axis=1), y_pred)\n assert_array_almost_equal(np.sum(y_prob, axis=1), np.ones(n_samples))\n if not tags['no_validation']:\n if _is_pairwise(classifier_orig):\n with raises(ValueError, err_msg=msg_pairwise.format(name, 'predict_proba')):\n classifier.predict_proba(X.reshape(-1, 1))\n else:\n with raises(ValueError, err_msg=msg.format(name, 'predict_proba')):\n classifier.predict_proba(X.T)\n if hasattr(classifier, 'predict_log_proba'):\n y_log_prob = classifier.predict_log_proba(X)\n assert_allclose(y_log_prob, np.log(y_prob), 8, atol=1e-09)\n assert_array_equal(np.argsort(y_log_prob), np.argsort(y_prob))" }, { "name": "check_clusterer_compute_labels_predict", @@ -170678,7 +184243,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "clusterer_orig", @@ -170688,7 +184254,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -170712,7 +184279,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "clusterer_orig", @@ -170722,7 +184290,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "readonly_memmap", @@ -170732,13 +184301,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@ignore_warnings(category=FutureWarning)\ndef check_clustering(name, clusterer_orig, readonly_memmap=False):\n clusterer = clone(clusterer_orig)\n (X, y) = make_blobs(n_samples=50, random_state=1)\n (X, y) 
= shuffle(X, y, random_state=7)\n X = StandardScaler().fit_transform(X)\n rng = np.random.RandomState(7)\n X_noise = np.concatenate([X, rng.uniform(low=-3, high=3, size=(5, 2))])\n if readonly_memmap:\n (X, y, X_noise) = create_memmap_backed_data([X, y, X_noise])\n (n_samples, n_features) = X.shape\n if hasattr(clusterer, 'n_clusters'):\n clusterer.set_params(n_clusters=3)\n set_random_state(clusterer)\n if name == 'AffinityPropagation':\n clusterer.set_params(preference=-100)\n clusterer.set_params(max_iter=100)\n clusterer.fit(X)\n clusterer.fit(X.tolist())\n pred = clusterer.labels_\n assert pred.shape == (n_samples, )\n assert adjusted_rand_score(pred, y) > 0.4\n if _safe_tags(clusterer, key='non_deterministic'):\n return\n set_random_state(clusterer)\n with warnings.catch_warnings(record=True):\n pred2 = clusterer.fit_predict(X)\n assert_array_equal(pred, pred2)\n assert pred.dtype in [np.dtype('int32'), np.dtype('int64')]\n assert pred2.dtype in [np.dtype('int32'), np.dtype('int64')]\n labels = clusterer.fit_predict(X_noise)\n labels_sorted = np.unique(labels)\n assert_array_equal(labels_sorted, np.arange(labels_sorted[0], labels_sorted[-1] + 1))\n assert labels_sorted[0] in [0, -1]\n if hasattr(clusterer, 'n_clusters'):\n n_clusters = getattr(clusterer, 'n_clusters')\n assert n_clusters - 1 >= labels_sorted[-1]" }, { @@ -170756,7 +184326,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "estimator_orig", @@ -170766,13 +184337,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef check_complex_data(name, estimator_orig):\n rng = np.random.RandomState(42)\n X = rng.uniform(size=10) + 1j * rng.uniform(size=10)\n X = X.reshape(-1, 1)\n y = rng.randint(low=0, high=2, size=10) + 1j\n estimator = clone(estimator_orig)\n set_random_state(estimator, random_state=0)\n with raises(ValueError, match='Complex data not supported'):\n estimator.fit(X, y)" }, { @@ -170790,7 +184362,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "estimator_orig", @@ -170800,14 +184373,15 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", - "source_code": "\ndef check_dataframe_column_names_consistency(name, estimator_orig):\n try:\n import pandas as pd\n except ImportError:\n raise SkipTest('pandas is not installed: not checking column name consistency for pandas')\n tags = _safe_tags(estimator_orig)\n is_supported_X_types = '2darray' in tags['X_types'] or 'categorical' in tags['X_types']\n if not is_supported_X_types or tags['no_validation']:\n return\n rng = np.random.RandomState(0)\n estimator = clone(estimator_orig)\n set_random_state(estimator)\n X_orig = rng.normal(size=(150, 8))\n X_orig -= X_orig.min() + 0.5\n X_orig = _enforce_estimator_tags_x(estimator, X_orig)\n X_orig = _pairwise_estimator_convert_X(X_orig, estimator)\n (n_samples, n_features) = X_orig.shape\n names = np.array([f'col_{i}' for i in range(n_features)])\n X = pd.DataFrame(X_orig, columns=names)\n if is_regressor(estimator):\n y = rng.normal(size=n_samples)\n else:\n y = rng.randint(low=0, high=2, size=n_samples)\n y = _enforce_estimator_tags_y(estimator, y)\n estimator.fit(X, y)\n if not hasattr(estimator, 'feature_names_in_'):\n raise ValueError('Estimator does not have a feature_names_in_ attribute after 
fitting with a dataframe')\n assert isinstance(estimator.feature_names_in_, np.ndarray)\n assert estimator.feature_names_in_.dtype == object\n assert_array_equal(estimator.feature_names_in_, names)\n module_name = estimator_orig.__module__\n if module_name.startswith('sklearn.') and not ('test_' in module_name or module_name.endswith('_testing')) and 'feature_names_in_' not in estimator_orig.__doc__:\n raise ValueError(f'Estimator {name} does not document its feature_names_in_ attribute')\n check_methods = []\n for method in ('predict', 'transform', 'decision_function', 'predict_proba', 'score', 'score_samples', 'predict_log_proba'):\n if not hasattr(estimator, method):\n continue\n callable_method = getattr(estimator, method)\n if method == 'score':\n callable_method = partial(callable_method, y=y)\n check_methods.append((method, callable_method))\n for (_, method) in check_methods:\n with warnings.catch_warnings():\n warnings.filterwarnings('error', message='X does not have valid feature names', category=UserWarning, module='sklearn')\n method(X)\n invalid_names = [(names[::-1], 'Feature names must be in the same order as they were in fit.'), ([f'another_prefix_{i}' for i in range(n_features)], 'Feature names unseen at fit time:\\n- another_prefix_0\\n- another_prefix_1\\n'), (names[:3], f'Feature names seen at fit time, yet now missing:\\n- {min(names[3:])}\\n')]\n for (invalid_name, additional_message) in invalid_names:\n X_bad = pd.DataFrame(X, columns=invalid_name)\n expected_msg = re.escape(f'The feature names should match those that were passed during fit. Starting version 1.2, an error will be raised.\\n{additional_message}')\n for (name, method) in check_methods:\n with warnings.catch_warnings():\n warnings.filterwarnings('error', category=FutureWarning, module='sklearn')\n with raises(FutureWarning, match=expected_msg, err_msg=f'{name} did not raise'):\n method(X_bad)\n if not hasattr(estimator, 'partial_fit'):\n continue\n estimator = clone(estimator_orig)\n if is_classifier(estimator):\n classes = np.unique(y)\n estimator.partial_fit(X, y, classes=classes)\n else:\n estimator.partial_fit(X, y)\n with warnings.catch_warnings():\n warnings.filterwarnings('error', category=FutureWarning, module='sklearn')\n with raises(FutureWarning, match=expected_msg):\n estimator.partial_fit(X_bad, y)" + "docstring": null, + "source_code": "\ndef check_dataframe_column_names_consistency(name, estimator_orig):\n try:\n import pandas as pd\n except ImportError:\n raise SkipTest('pandas is not installed: not checking column name consistency for pandas')\n tags = _safe_tags(estimator_orig)\n is_supported_X_types = '2darray' in tags['X_types'] or 'categorical' in tags['X_types']\n if not is_supported_X_types or tags['no_validation']:\n return\n rng = np.random.RandomState(0)\n estimator = clone(estimator_orig)\n set_random_state(estimator)\n X_orig = rng.normal(size=(150, 8))\n X_orig -= X_orig.min() + 0.5\n X_orig = _enforce_estimator_tags_x(estimator, X_orig)\n X_orig = _pairwise_estimator_convert_X(X_orig, estimator)\n (n_samples, n_features) = X_orig.shape\n names = np.array([f'col_{i}' for i in range(n_features)])\n X = pd.DataFrame(X_orig, columns=names)\n if is_regressor(estimator):\n y = rng.normal(size=n_samples)\n else:\n y = rng.randint(low=0, high=2, size=n_samples)\n y = _enforce_estimator_tags_y(estimator, y)\n with warnings.catch_warnings():\n warnings.filterwarnings('error', message='X does not have valid feature names', category=UserWarning, module='sklearn')\n estimator.fit(X, 
y)\n if not hasattr(estimator, 'feature_names_in_'):\n raise ValueError('Estimator does not have a feature_names_in_ attribute after fitting with a dataframe')\n assert isinstance(estimator.feature_names_in_, np.ndarray)\n assert estimator.feature_names_in_.dtype == object\n assert_array_equal(estimator.feature_names_in_, names)\n module_name = estimator_orig.__module__\n if module_name.startswith('sklearn.') and not ('test_' in module_name or module_name.endswith('_testing')) and 'feature_names_in_' not in estimator_orig.__doc__:\n raise ValueError(f'Estimator {name} does not document its feature_names_in_ attribute')\n check_methods = []\n for method in ('predict', 'transform', 'decision_function', 'predict_proba', 'score', 'score_samples', 'predict_log_proba'):\n if not hasattr(estimator, method):\n continue\n callable_method = getattr(estimator, method)\n if method == 'score':\n callable_method = partial(callable_method, y=y)\n check_methods.append((method, callable_method))\n for (_, method) in check_methods:\n with warnings.catch_warnings():\n warnings.filterwarnings('error', message='X does not have valid feature names', category=UserWarning, module='sklearn')\n method(X)\n invalid_names = [(names[::-1], 'Feature names must be in the same order as they were in fit.'), ([f'another_prefix_{i}' for i in range(n_features)], 'Feature names unseen at fit time:\\n- another_prefix_0\\n- another_prefix_1\\n'), (names[:3], f'Feature names seen at fit time, yet now missing:\\n- {min(names[3:])}\\n')]\n params = {key: value for (key, value) in estimator.get_params().items() if 'early_stopping' in key}\n early_stopping_enabled = any((value is True for value in params.values()))\n for (invalid_name, additional_message) in invalid_names:\n X_bad = pd.DataFrame(X, columns=invalid_name)\n expected_msg = re.escape(f'The feature names should match those that were passed during fit. 
Starting version 1.2, an error will be raised.\\n{additional_message}')\n for (name, method) in check_methods:\n with warnings.catch_warnings():\n warnings.filterwarnings('error', category=FutureWarning, module='sklearn')\n with raises(FutureWarning, match=expected_msg, err_msg=f'{name} did not raise'):\n method(X_bad)\n if not hasattr(estimator, 'partial_fit') or early_stopping_enabled:\n continue\n estimator = clone(estimator_orig)\n if is_classifier(estimator):\n classes = np.unique(y)\n estimator.partial_fit(X, y, classes=classes)\n else:\n estimator.partial_fit(X, y)\n with warnings.catch_warnings():\n warnings.filterwarnings('error', category=FutureWarning, module='sklearn')\n with raises(FutureWarning, match=expected_msg):\n estimator.partial_fit(X_bad, y)" }, { "name": "check_decision_proba_consistency", @@ -170824,7 +184398,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "estimator_orig", @@ -170834,13 +184409,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@ignore_warnings(category=FutureWarning)\ndef check_decision_proba_consistency(name, estimator_orig):\n centers = [(2, 2), (4, 4)]\n (X, y) = make_blobs(n_samples=100, random_state=0, n_features=4, centers=centers, cluster_std=1.0, shuffle=True)\n (X_train, X_test, y_train, y_test) = train_test_split(X, y, test_size=0.2, random_state=0)\n estimator = clone(estimator_orig)\n if hasattr(estimator, 'decision_function') and hasattr(estimator, 'predict_proba'):\n estimator.fit(X_train, y_train)\n a = estimator.predict_proba(X_test)[:, 1].round(decimals=10)\n b = estimator.decision_function(X_test).round(decimals=10)\n assert_array_equal(rankdata(a), rankdata(b))" }, { @@ -170858,7 +184434,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "estimator_orig", @@ -170868,13 +184445,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@ignore_warnings\ndef check_dict_unchanged(name, estimator_orig):\n if name in ['SpectralCoclustering']:\n return\n rnd = np.random.RandomState(0)\n if name in ['RANSACRegressor']:\n X = 3 * rnd.uniform(size=(20, 3))\n else:\n X = 2 * rnd.uniform(size=(20, 3))\n X = _pairwise_estimator_convert_X(X, estimator_orig)\n y = X[:, 0].astype(int)\n estimator = clone(estimator_orig)\n y = _enforce_estimator_tags_y(estimator, y)\n if hasattr(estimator, 'n_components'):\n estimator.n_components = 1\n if hasattr(estimator, 'n_clusters'):\n estimator.n_clusters = 1\n if hasattr(estimator, 'n_best'):\n estimator.n_best = 1\n set_random_state(estimator, 1)\n estimator.fit(X, y)\n for method in ['predict', 'transform', 'decision_function', 'predict_proba']:\n if hasattr(estimator, method):\n dict_before = estimator.__dict__.copy()\n getattr(estimator, method)(X)\n assert estimator.__dict__ == dict_before, 'Estimator changes __dict__ during %s' % method" }, { @@ -170892,7 +184470,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "estimator_orig", @@ -170902,13 +184481,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@ignore_warnings(category=FutureWarning)\ndef 
check_dont_overwrite_parameters(name, estimator_orig):\n if hasattr(estimator_orig.__init__, 'deprecated_original'):\n return\n estimator = clone(estimator_orig)\n rnd = np.random.RandomState(0)\n X = 3 * rnd.uniform(size=(20, 3))\n X = _pairwise_estimator_convert_X(X, estimator_orig)\n y = X[:, 0].astype(int)\n y = _enforce_estimator_tags_y(estimator, y)\n if hasattr(estimator, 'n_components'):\n estimator.n_components = 1\n if hasattr(estimator, 'n_clusters'):\n estimator.n_clusters = 1\n set_random_state(estimator, 1)\n dict_before_fit = estimator.__dict__.copy()\n estimator.fit(X, y)\n dict_after_fit = estimator.__dict__\n public_keys_after_fit = [key for key in dict_after_fit.keys() if _is_public_parameter(key)]\n attrs_added_by_fit = [key for key in public_keys_after_fit if key not in dict_before_fit.keys()]\n assert not attrs_added_by_fit, 'Estimator adds public attribute(s) during the fit method. Estimators are only allowed to add private attributes either started with _ or ended with _ but %s added' % ', '.join(attrs_added_by_fit)\n attrs_changed_by_fit = [key for key in public_keys_after_fit if dict_before_fit[key] is not dict_after_fit[key]]\n assert not attrs_changed_by_fit, 'Estimator changes public attribute(s) during the fit method. Estimators are only allowed to change attributes started or ended with _, but %s changed' % ', '.join(attrs_changed_by_fit)" }, { @@ -170928,7 +184508,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "estimator_orig", @@ -170938,13 +184519,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@ignore_warnings(category=(FutureWarning, UserWarning))\ndef check_dtype_object(name, estimator_orig):\n rng = np.random.RandomState(0)\n X = _pairwise_estimator_convert_X(rng.rand(40, 10), estimator_orig)\n X = X.astype(object)\n tags = _safe_tags(estimator_orig)\n y = (X[:, 0] * 4).astype(int)\n estimator = clone(estimator_orig)\n y = _enforce_estimator_tags_y(estimator, y)\n estimator.fit(X, y)\n if hasattr(estimator, 'predict'):\n estimator.predict(X)\n if hasattr(estimator, 'transform'):\n estimator.transform(X)\n with raises(Exception, match='Unknown label type', may_pass=True):\n estimator.fit(X, y.astype(object))\n if 'string' not in tags['X_types']:\n X[0, 0] = {'foo': 'bar'}\n msg = 'argument must be a string.* number'\n with raises(TypeError, match=msg):\n estimator.fit(X, y)\n else:\n estimator.fit(X, y)" }, { @@ -170962,7 +184544,8 @@ "docstring": { "type": "estimator object", "description": "Estimator instance to check.\n\n.. versionchanged:: 0.24\n Passing a class was deprecated in version 0.23, and support for\n classes was removed in 0.24." - } + }, + "refined_type": {} }, { "name": "generate_only", @@ -170972,13 +184555,14 @@ "docstring": { "type": "bool, default=False", "description": "When `False`, checks are evaluated when `check_estimator` is called.\nWhen `True`, `check_estimator` returns a generator that yields\n(estimator, check) tuples. The check is run by calling\n`check(estimator)`.\n\n.. 
versionadded:: 0.22" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Check if estimator adheres to scikit-learn conventions.\n\nThis estimator will run an extensive test-suite for input validation, shapes, etc, making sure that the estimator complies with `scikit-learn` conventions as detailed in :ref:`rolling_your_own_estimator`. Additional tests for classifiers, regressors, clustering or transformers will be run if the Estimator class inherits from the corresponding mixin from sklearn.base. Setting `generate_only=True` returns a generator that yields (estimator, check) tuples where the check can be called independently from each other, i.e. `check(estimator)`. This allows all checks to be run independently and report the checks that are failing. scikit-learn provides a pytest specific decorator, :func:`~sklearn.utils.parametrize_with_checks`, making it easier to test multiple estimators.", - "docstring": "Check if estimator adheres to scikit-learn conventions.\n\nThis estimator will run an extensive test-suite for input validation,\nshapes, etc, making sure that the estimator complies with `scikit-learn`\nconventions as detailed in :ref:`rolling_your_own_estimator`.\nAdditional tests for classifiers, regressors, clustering or transformers\nwill be run if the Estimator class inherits from the corresponding mixin\nfrom sklearn.base.\n\nSetting `generate_only=True` returns a generator that yields (estimator,\ncheck) tuples where the check can be called independently from each\nother, i.e. `check(estimator)`. This allows all checks to be run\nindependently and report the checks that are failing.\n\nscikit-learn provides a pytest specific decorator,\n:func:`~sklearn.utils.parametrize_with_checks`, making it easier to test\nmultiple estimators.\n\nParameters\n----------\nEstimator : estimator object\n Estimator instance to check.\n\n .. versionchanged:: 0.24\n Passing a class was deprecated in version 0.23, and support for\n classes was removed in 0.24.\n\ngenerate_only : bool, default=False\n When `False`, checks are evaluated when `check_estimator` is called.\n When `True`, `check_estimator` returns a generator that yields\n (estimator, check) tuples. The check is run by calling\n `check(estimator)`.\n\n .. versionadded:: 0.22\n\nReturns\n-------\nchecks_generator : generator\n Generator that yields (estimator, check) tuples. Returned when\n `generate_only=True`.\n\nSee Also\n--------\nparametrize_with_checks : Pytest specific decorator for parametrizing estimator\n checks.", + "description": "Check if estimator adheres to scikit-learn conventions.\n\nThis estimator will run an extensive test-suite for input validation,\nshapes, etc, making sure that the estimator complies with `scikit-learn`\nconventions as detailed in :ref:`rolling_your_own_estimator`.\nAdditional tests for classifiers, regressors, clustering or transformers\nwill be run if the Estimator class inherits from the corresponding mixin\nfrom sklearn.base.\n\nSetting `generate_only=True` returns a generator that yields (estimator,\ncheck) tuples where the check can be called independently from each\nother, i.e. `check(estimator)`. 
This allows all checks to be run\nindependently and report the checks that are failing.\n\nscikit-learn provides a pytest specific decorator,\n:func:`~sklearn.utils.parametrize_with_checks`, making it easier to test\nmultiple estimators.", + "docstring": "Check if estimator adheres to scikit-learn conventions.\n\n This estimator will run an extensive test-suite for input validation,\n shapes, etc, making sure that the estimator complies with `scikit-learn`\n conventions as detailed in :ref:`rolling_your_own_estimator`.\n Additional tests for classifiers, regressors, clustering or transformers\n will be run if the Estimator class inherits from the corresponding mixin\n from sklearn.base.\n\n Setting `generate_only=True` returns a generator that yields (estimator,\n check) tuples where the check can be called independently from each\n other, i.e. `check(estimator)`. This allows all checks to be run\n independently and report the checks that are failing.\n\n scikit-learn provides a pytest specific decorator,\n :func:`~sklearn.utils.parametrize_with_checks`, making it easier to test\n multiple estimators.\n\n Parameters\n ----------\n Estimator : estimator object\n Estimator instance to check.\n\n .. versionchanged:: 0.24\n Passing a class was deprecated in version 0.23, and support for\n classes was removed in 0.24.\n\n generate_only : bool, default=False\n When `False`, checks are evaluated when `check_estimator` is called.\n When `True`, `check_estimator` returns a generator that yields\n (estimator, check) tuples. The check is run by calling\n `check(estimator)`.\n\n .. versionadded:: 0.22\n\n Returns\n -------\n checks_generator : generator\n Generator that yields (estimator, check) tuples. Returned when\n `generate_only=True`.\n\n See Also\n --------\n parametrize_with_checks : Pytest specific decorator for parametrizing estimator\n checks.\n ", "source_code": "\ndef check_estimator(Estimator, generate_only=False):\n \"\"\"Check if estimator adheres to scikit-learn conventions.\n\n This estimator will run an extensive test-suite for input validation,\n shapes, etc, making sure that the estimator complies with `scikit-learn`\n conventions as detailed in :ref:`rolling_your_own_estimator`.\n Additional tests for classifiers, regressors, clustering or transformers\n will be run if the Estimator class inherits from the corresponding mixin\n from sklearn.base.\n\n Setting `generate_only=True` returns a generator that yields (estimator,\n check) tuples where the check can be called independently from each\n other, i.e. `check(estimator)`. This allows all checks to be run\n independently and report the checks that are failing.\n\n scikit-learn provides a pytest specific decorator,\n :func:`~sklearn.utils.parametrize_with_checks`, making it easier to test\n multiple estimators.\n\n Parameters\n ----------\n Estimator : estimator object\n Estimator instance to check.\n\n .. versionchanged:: 0.24\n Passing a class was deprecated in version 0.23, and support for\n classes was removed in 0.24.\n\n generate_only : bool, default=False\n When `False`, checks are evaluated when `check_estimator` is called.\n When `True`, `check_estimator` returns a generator that yields\n (estimator, check) tuples. The check is run by calling\n `check(estimator)`.\n\n .. versionadded:: 0.22\n\n Returns\n -------\n checks_generator : generator\n Generator that yields (estimator, check) tuples. 
Returned when\n `generate_only=True`.\n\n See Also\n --------\n parametrize_with_checks : Pytest specific decorator for parametrizing estimator\n checks.\n \"\"\"\n if isinstance(Estimator, type):\n msg = \"Passing a class was deprecated in version 0.23 and isn't supported anymore from 0.24.Please pass an instance instead.\"\n raise TypeError(msg)\n estimator = Estimator\n name = type(estimator).__name__\n \n def checks_generator():\n for check in _yield_all_checks(estimator):\n check = _maybe_skip(estimator, check)\n yield (estimator, partial(check, name))\n if generate_only:\n return checks_generator()\n for (estimator, check) in checks_generator():\n try:\n check(estimator)\n except SkipTest as exception:\n warnings.warn(str(exception), SkipTestWarning)" }, { @@ -170996,7 +184580,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "estimator_orig", @@ -171006,13 +184591,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef check_estimator_get_tags_default_keys(name, estimator_orig):\n estimator = clone(estimator_orig)\n if not hasattr(estimator, '_get_tags'):\n return\n tags_keys = set(estimator._get_tags().keys())\n default_tags_keys = set(_DEFAULT_TAGS.keys())\n assert tags_keys.intersection(default_tags_keys) == default_tags_keys, f'{name}._get_tags() is missing entries for the following default tags: {default_tags_keys - tags_keys.intersection(default_tags_keys)}'" }, { @@ -171030,7 +184616,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "estimator_orig", @@ -171040,14 +184627,15 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", - "source_code": "\ndef check_estimator_sparse_data(name, estimator_orig):\n rng = np.random.RandomState(0)\n X = rng.rand(40, 10)\n X[X < 0.8] = 0\n X = _pairwise_estimator_convert_X(X, estimator_orig)\n X_csr = sparse.csr_matrix(X)\n y = (4 * rng.rand(40)).astype(int)\n with ignore_warnings(category=FutureWarning):\n estimator = clone(estimator_orig)\n y = _enforce_estimator_tags_y(estimator, y)\n tags = _safe_tags(estimator_orig)\n for (matrix_format, X) in _generate_sparse_matrix(X_csr):\n with ignore_warnings(category=FutureWarning):\n estimator = clone(estimator_orig)\n if name in ['Scaler', 'StandardScaler']:\n estimator.set_params(with_mean=False)\n if '64' in matrix_format:\n err_msg = f\"Estimator {name} doesn't seem to support {matrix_format} matrix, and is not failing gracefully, e.g. 
by using check_array(X, accept_large_sparse=False)\"\n else:\n err_msg = f\"Estimator {name} doesn't seem to fail gracefully on sparse data: error message should state explicitly that sparse input is not supported if this is not the case.\"\n with raises((TypeError, ValueError), match=['sparse', 'Sparse'], may_pass=True, err_msg=err_msg):\n with ignore_warnings(category=FutureWarning):\n estimator.fit(X, y)\n if hasattr(estimator, 'predict'):\n pred = estimator.predict(X)\n if tags['multioutput_only']:\n assert pred.shape == (X.shape[0], 1)\n else:\n assert pred.shape == (X.shape[0], )\n if hasattr(estimator, 'predict_proba'):\n probs = estimator.predict_proba(X)\n if tags['binary_only']:\n expected_probs_shape = (X.shape[0], 2)\n else:\n expected_probs_shape = (X.shape[0], 4)\n assert probs.shape == expected_probs_shape" + "docstring": null, + "source_code": "\ndef check_estimator_sparse_data(name, estimator_orig):\n rng = np.random.RandomState(0)\n X = rng.rand(40, 3)\n X[X < 0.8] = 0\n X = _pairwise_estimator_convert_X(X, estimator_orig)\n X_csr = sparse.csr_matrix(X)\n y = (4 * rng.rand(40)).astype(int)\n with ignore_warnings(category=FutureWarning):\n estimator = clone(estimator_orig)\n y = _enforce_estimator_tags_y(estimator, y)\n tags = _safe_tags(estimator_orig)\n for (matrix_format, X) in _generate_sparse_matrix(X_csr):\n with ignore_warnings(category=FutureWarning):\n estimator = clone(estimator_orig)\n if name in ['Scaler', 'StandardScaler']:\n estimator.set_params(with_mean=False)\n if '64' in matrix_format:\n err_msg = f\"Estimator {name} doesn't seem to support {matrix_format} matrix, and is not failing gracefully, e.g. by using check_array(X, accept_large_sparse=False)\"\n else:\n err_msg = f\"Estimator {name} doesn't seem to fail gracefully on sparse data: error message should state explicitly that sparse input is not supported if this is not the case.\"\n with raises((TypeError, ValueError), match=['sparse', 'Sparse'], may_pass=True, err_msg=err_msg):\n with ignore_warnings(category=FutureWarning):\n estimator.fit(X, y)\n if hasattr(estimator, 'predict'):\n pred = estimator.predict(X)\n if tags['multioutput_only']:\n assert pred.shape == (X.shape[0], 1)\n else:\n assert pred.shape == (X.shape[0], )\n if hasattr(estimator, 'predict_proba'):\n probs = estimator.predict_proba(X)\n if tags['binary_only']:\n expected_probs_shape = (X.shape[0], 2)\n else:\n expected_probs_shape = (X.shape[0], 4)\n assert probs.shape == expected_probs_shape" }, { "name": "check_estimators_data_not_an_array", @@ -171064,7 +184652,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "estimator_orig", @@ -171074,7 +184663,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X", @@ -171084,7 +184674,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y", @@ -171094,7 +184685,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "obj_type", @@ -171104,13 +184696,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@ignore_warnings(category=FutureWarning)\ndef check_estimators_data_not_an_array(name, estimator_orig, X, y, obj_type):\n if name in CROSS_DECOMPOSITION:\n raise SkipTest('Skipping check_estimators_data_not_an_array for cross decomposition module as estimators are not deterministic.')\n 
estimator_1 = clone(estimator_orig)\n estimator_2 = clone(estimator_orig)\n set_random_state(estimator_1)\n set_random_state(estimator_2)\n if obj_type not in ['NotAnArray', 'PandasDataframe']:\n raise ValueError('Data type {0} not supported'.format(obj_type))\n if obj_type == 'NotAnArray':\n y_ = _NotAnArray(np.asarray(y))\n X_ = _NotAnArray(np.asarray(X))\n else:\n try:\n import pandas as pd\n y_ = np.asarray(y)\n if y_.ndim == 1:\n y_ = pd.Series(y_)\n else:\n y_ = pd.DataFrame(y_)\n X_ = pd.DataFrame(np.asarray(X))\n except ImportError:\n raise SkipTest('pandas is not installed: not checking estimators for pandas objects.')\n estimator_1.fit(X_, y_)\n pred1 = estimator_1.predict(X_)\n estimator_2.fit(X, y)\n pred2 = estimator_2.predict(X)\n assert_allclose(pred1, pred2, atol=0.01, err_msg=name)" }, { @@ -171128,7 +184721,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "estimator_orig", @@ -171138,13 +184732,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@ignore_warnings\ndef check_estimators_dtypes(name, estimator_orig):\n rnd = np.random.RandomState(0)\n X_train_32 = 3 * rnd.uniform(size=(20, 5)).astype(np.float32)\n X_train_32 = _pairwise_estimator_convert_X(X_train_32, estimator_orig)\n X_train_64 = X_train_32.astype(np.float64)\n X_train_int_64 = X_train_32.astype(np.int64)\n X_train_int_32 = X_train_32.astype(np.int32)\n y = X_train_int_64[:, 0]\n y = _enforce_estimator_tags_y(estimator_orig, y)\n methods = ['predict', 'transform', 'decision_function', 'predict_proba']\n for X_train in [X_train_32, X_train_64, X_train_int_64, X_train_int_32]:\n estimator = clone(estimator_orig)\n set_random_state(estimator, 1)\n estimator.fit(X_train, y)\n for method in methods:\n if hasattr(estimator, method):\n getattr(estimator, method)(X_train)" }, { @@ -171162,7 +184757,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "estimator_orig", @@ -171172,13 +184768,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@ignore_warnings(category=FutureWarning)\ndef check_estimators_empty_data_messages(name, estimator_orig):\n e = clone(estimator_orig)\n set_random_state(e, 1)\n X_zero_samples = np.empty(0).reshape(0, 3)\n err_msg = f'The estimator {name} does not raise a ValueError when an empty data is used to train. 
Perhaps use check_array in train.'\n with raises(ValueError, err_msg=err_msg):\n e.fit(X_zero_samples, [])\n X_zero_features = np.empty(0).reshape(12, 0)\n y = _enforce_estimator_tags_y(e, np.array([1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0]))\n msg = '0 feature\\\\(s\\\\) \\\\(shape=\\\\(\\\\d*, 0\\\\)\\\\) while a minimum of \\\\d* is required.'\n with raises(ValueError, match=msg):\n e.fit(X_zero_features, y)" }, { @@ -171196,7 +184793,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "estimator_orig", @@ -171206,7 +184804,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "readonly_memmap", @@ -171216,7 +184815,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -171240,7 +184840,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "estimator_orig", @@ -171250,13 +184851,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@ignore_warnings(category=FutureWarning)\ndef check_estimators_nan_inf(name, estimator_orig):\n rnd = np.random.RandomState(0)\n X_train_finite = _pairwise_estimator_convert_X(rnd.uniform(size=(10, 3)), estimator_orig)\n X_train_nan = rnd.uniform(size=(10, 3))\n X_train_nan[0, 0] = np.nan\n X_train_inf = rnd.uniform(size=(10, 3))\n X_train_inf[0, 0] = np.inf\n y = np.ones(10)\n y[:5] = 0\n y = _enforce_estimator_tags_y(estimator_orig, y)\n error_string_fit = \"Estimator doesn't check for NaN and inf in fit.\"\n error_string_predict = \"Estimator doesn't check for NaN and inf in predict.\"\n error_string_transform = \"Estimator doesn't check for NaN and inf in transform.\"\n for X_train in [X_train_nan, X_train_inf]:\n with ignore_warnings(category=FutureWarning):\n estimator = clone(estimator_orig)\n set_random_state(estimator, 1)\n with raises(ValueError, match=['inf', 'NaN'], err_msg=error_string_fit):\n estimator.fit(X_train, y)\n estimator.fit(X_train_finite, y)\n if hasattr(estimator, 'predict'):\n with raises(ValueError, match=['inf', 'NaN'], err_msg=error_string_predict):\n estimator.predict(X_train)\n if hasattr(estimator, 'transform'):\n with raises(ValueError, match=['inf', 'NaN'], err_msg=error_string_transform):\n estimator.transform(X_train)" }, { @@ -171274,7 +184876,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "estimator_orig", @@ -171284,13 +184887,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@ignore_warnings(category=FutureWarning)\ndef check_estimators_overwrite_params(name, estimator_orig):\n (X, y) = make_blobs(random_state=0, n_samples=21)\n X -= X.min()\n X = _pairwise_estimator_convert_X(X, estimator_orig, kernel=rbf_kernel)\n estimator = clone(estimator_orig)\n y = _enforce_estimator_tags_y(estimator, y)\n set_random_state(estimator)\n params = estimator.get_params()\n original_params = deepcopy(params)\n estimator.fit(X, y)\n new_params = estimator.get_params()\n for (param_name, original_value) in original_params.items():\n new_value = new_params[param_name]\n assert joblib.hash(new_value) == joblib.hash(original_value), 'Estimator %s should not change or mutate the parameter %s from %s to %s during fit.' 
% (name, param_name, original_value, new_value)" }, { @@ -171308,7 +184912,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "estimator_orig", @@ -171318,13 +184923,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@ignore_warnings(category=FutureWarning)\ndef check_estimators_partial_fit_n_features(name, estimator_orig):\n if not hasattr(estimator_orig, 'partial_fit'):\n return\n estimator = clone(estimator_orig)\n (X, y) = make_blobs(n_samples=50, random_state=1)\n X -= X.min()\n y = _enforce_estimator_tags_y(estimator_orig, y)\n try:\n if is_classifier(estimator):\n classes = np.unique(y)\n estimator.partial_fit(X, y, classes=classes)\n else:\n estimator.partial_fit(X, y)\n except NotImplementedError:\n return\n with raises(ValueError, err_msg=f'The estimator {name} does not raise an error when the number of features changes between calls to partial_fit.'):\n estimator.partial_fit(X[:, :-1], y)" }, { @@ -171342,7 +184948,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "estimator_orig", @@ -171352,7 +184959,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -171376,7 +184984,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "estimator_orig", @@ -171386,13 +184995,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Check that predict raises an exception in an unfitted estimator.\n\nUnfitted estimators should raise a NotFittedError.", - "docstring": "Check that predict raises an exception in an unfitted estimator.\n\nUnfitted estimators should raise a NotFittedError.", + "docstring": "Check that predict raises an exception in an unfitted estimator.\n\n Unfitted estimators should raise a NotFittedError.\n ", "source_code": "\n@ignore_warnings\ndef check_estimators_unfitted(name, estimator_orig):\n \"\"\"Check that predict raises an exception in an unfitted estimator.\n\n Unfitted estimators should raise a NotFittedError.\n \"\"\"\n (X, y) = _regression_dataset()\n estimator = clone(estimator_orig)\n for method in ('decision_function', 'predict', 'predict_proba', 'predict_log_proba'):\n if hasattr(estimator, method):\n with raises(NotFittedError):\n getattr(estimator, method)(X)" }, { @@ -171410,7 +185020,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "estimator_orig", @@ -171420,13 +185031,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@ignore_warnings\ndef check_fit1d(name, estimator_orig):\n rnd = np.random.RandomState(0)\n X = 3 * rnd.uniform(size=20)\n y = X.astype(int)\n estimator = clone(estimator_orig)\n y = _enforce_estimator_tags_y(estimator, y)\n if hasattr(estimator, 'n_components'):\n estimator.n_components = 1\n if hasattr(estimator, 'n_clusters'):\n estimator.n_clusters = 1\n set_random_state(estimator, 1)\n with raises(ValueError):\n estimator.fit(X, y)" }, { @@ -171444,7 +185056,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "estimator_orig", @@ -171454,13 +185067,14 @@ "docstring": { "type": "", "description": "" - } + }, + 
"refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@ignore_warnings\ndef check_fit2d_1feature(name, estimator_orig):\n rnd = np.random.RandomState(0)\n X = 3 * rnd.uniform(size=(10, 1))\n X = _pairwise_estimator_convert_X(X, estimator_orig)\n y = X[:, 0].astype(int)\n estimator = clone(estimator_orig)\n y = _enforce_estimator_tags_y(estimator, y)\n if hasattr(estimator, 'n_components'):\n estimator.n_components = 1\n if hasattr(estimator, 'n_clusters'):\n estimator.n_clusters = 1\n if name == 'RandomizedLogisticRegression':\n estimator.sample_fraction = 1\n if name == 'RANSACRegressor':\n estimator.residual_threshold = 0.5\n y = _enforce_estimator_tags_y(estimator, y)\n set_random_state(estimator, 1)\n msgs = ['1 feature\\\\(s\\\\)', 'n_features = 1', 'n_features=1']\n with raises(ValueError, match=msgs, may_pass=True):\n estimator.fit(X, y)" }, { @@ -171478,7 +185092,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "estimator_orig", @@ -171488,13 +185103,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@ignore_warnings\ndef check_fit2d_1sample(name, estimator_orig):\n rnd = np.random.RandomState(0)\n X = 3 * rnd.uniform(size=(1, 10))\n X = _pairwise_estimator_convert_X(X, estimator_orig)\n y = X[:, 0].astype(int)\n estimator = clone(estimator_orig)\n y = _enforce_estimator_tags_y(estimator, y)\n if hasattr(estimator, 'n_components'):\n estimator.n_components = 1\n if hasattr(estimator, 'n_clusters'):\n estimator.n_clusters = 1\n set_random_state(estimator, 1)\n if name == 'OPTICS':\n estimator.set_params(min_samples=1)\n msgs = ['1 sample', 'n_samples = 1', 'n_samples=1', 'one sample', '1 class', 'one class']\n with raises(ValueError, match=msgs, may_pass=True):\n estimator.fit(X, y)" }, { @@ -171512,7 +185128,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "estimator_orig", @@ -171522,13 +185139,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@ignore_warnings(category=FutureWarning)\ndef check_fit2d_predict1d(name, estimator_orig):\n rnd = np.random.RandomState(0)\n X = 3 * rnd.uniform(size=(20, 3))\n X = _pairwise_estimator_convert_X(X, estimator_orig)\n y = X[:, 0].astype(int)\n estimator = clone(estimator_orig)\n y = _enforce_estimator_tags_y(estimator, y)\n if hasattr(estimator, 'n_components'):\n estimator.n_components = 1\n if hasattr(estimator, 'n_clusters'):\n estimator.n_clusters = 1\n set_random_state(estimator, 1)\n estimator.fit(X, y)\n for method in ['predict', 'transform', 'decision_function', 'predict_proba']:\n if hasattr(estimator, method):\n assert_raise_message(ValueError, 'Reshape your data', getattr(estimator, method), X[0])" }, { @@ -171546,7 +185164,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "estimator_orig", @@ -171556,13 +185175,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef check_fit_check_is_fitted(name, estimator_orig):\n rng = np.random.RandomState(42)\n estimator = clone(estimator_orig)\n 
set_random_state(estimator)\n if 'warm_start' in estimator.get_params():\n estimator.set_params(warm_start=False)\n n_samples = 100\n X = rng.normal(loc=100, size=(n_samples, 2))\n X = _pairwise_estimator_convert_X(X, estimator)\n if is_regressor(estimator_orig):\n y = rng.normal(size=n_samples)\n else:\n y = rng.randint(low=0, high=2, size=n_samples)\n y = _enforce_estimator_tags_y(estimator, y)\n if not _safe_tags(estimator).get('stateless', False):\n try:\n check_is_fitted(estimator)\n raise AssertionError(f'{estimator.__class__.__name__} passes check_is_fitted before being fit!')\n except NotFittedError:\n pass\n estimator.fit(X, y)\n try:\n check_is_fitted(estimator)\n except NotFittedError as e:\n raise NotFittedError('Estimator fails to pass `check_is_fitted` even though it has been fit.') from e" }, { @@ -171580,7 +185200,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "estimator_orig", @@ -171590,13 +185211,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef check_fit_idempotent(name, estimator_orig):\n check_methods = ['predict', 'transform', 'decision_function', 'predict_proba']\n rng = np.random.RandomState(0)\n estimator = clone(estimator_orig)\n set_random_state(estimator)\n if 'warm_start' in estimator.get_params().keys():\n estimator.set_params(warm_start=False)\n n_samples = 100\n X = rng.normal(loc=100, size=(n_samples, 2))\n X = _pairwise_estimator_convert_X(X, estimator)\n if is_regressor(estimator_orig):\n y = rng.normal(size=n_samples)\n else:\n y = rng.randint(low=0, high=2, size=n_samples)\n y = _enforce_estimator_tags_y(estimator, y)\n (train, test) = next(ShuffleSplit(test_size=0.2, random_state=rng).split(X))\n (X_train, y_train) = _safe_split(estimator, X, y, train)\n (X_test, y_test) = _safe_split(estimator, X, y, test, train)\n estimator.fit(X_train, y_train)\n result = {method: getattr(estimator, method)(X_test) for method in check_methods if hasattr(estimator, method)}\n set_random_state(estimator)\n estimator.fit(X_train, y_train)\n for method in check_methods:\n if hasattr(estimator, method):\n new_result = getattr(estimator, method)(X_test)\n if np.issubdtype(new_result.dtype, np.floating):\n tol = 2 * np.finfo(new_result.dtype).eps\n else:\n tol = 2 * np.finfo(np.float64).eps\n assert_allclose_dense_sparse(result[method], new_result, atol=max(tol, 1e-09), rtol=max(tol, 1e-07), err_msg='Idempotency check failed for method {}'.format(method))" }, { @@ -171614,7 +185236,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "estimator_orig", @@ -171624,13 +185247,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef check_fit_non_negative(name, estimator_orig):\n X = np.array([[-1.0, 1], [-1.0, 1]])\n y = np.array([1, 2])\n estimator = clone(estimator_orig)\n with raises(ValueError):\n estimator.fit(X, y)" }, { @@ -171648,7 +185272,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "estimator_orig", @@ -171658,13 +185283,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@ignore_warnings\ndef 
check_fit_score_takes_y(name, estimator_orig):\n rnd = np.random.RandomState(0)\n n_samples = 30\n X = rnd.uniform(size=(n_samples, 3))\n X = _pairwise_estimator_convert_X(X, estimator_orig)\n y = np.arange(n_samples) % 3\n estimator = clone(estimator_orig)\n y = _enforce_estimator_tags_y(estimator, y)\n set_random_state(estimator)\n funcs = ['fit', 'score', 'partial_fit', 'fit_predict', 'fit_transform']\n for func_name in funcs:\n func = getattr(estimator, func_name, None)\n if func is not None:\n func(X, y)\n args = [p.name for p in signature(func).parameters.values()]\n if args[0] == 'self':\n args = args[1:]\n assert args[1] in ['y', 'Y'], 'Expected y or Y as second argument for method %s of %s. Got arguments: %r.' % (func_name, type(estimator).__name__, args)" }, { @@ -171682,7 +185308,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "estimator_orig", @@ -171692,13 +185319,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@ignore_warnings(category=FutureWarning)\ndef check_get_params_invariance(name, estimator_orig):\n e = clone(estimator_orig)\n shallow_params = e.get_params(deep=False)\n deep_params = e.get_params(deep=True)\n assert all((item in deep_params.items() for item in shallow_params.items()))" }, { @@ -171716,7 +185344,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "estimator_orig", @@ -171726,13 +185355,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@ignore_warnings(category=FutureWarning)\ndef check_methods_sample_order_invariance(name, estimator_orig):\n rnd = np.random.RandomState(0)\n X = 3 * rnd.uniform(size=(20, 3))\n X = _pairwise_estimator_convert_X(X, estimator_orig)\n y = X[:, 0].astype(np.int64)\n if _safe_tags(estimator_orig, key='binary_only'):\n y[y == 2] = 1\n estimator = clone(estimator_orig)\n y = _enforce_estimator_tags_y(estimator, y)\n if hasattr(estimator, 'n_components'):\n estimator.n_components = 1\n if hasattr(estimator, 'n_clusters'):\n estimator.n_clusters = 2\n set_random_state(estimator, 1)\n estimator.fit(X, y)\n idx = np.random.permutation(X.shape[0])\n for method in ['predict', 'transform', 'decision_function', 'score_samples', 'predict_proba']:\n msg = '{method} of {name} is not invariant when applied to a datasetwith different sample order.'.format(method=method, name=name)\n if hasattr(estimator, method):\n assert_allclose_dense_sparse(getattr(estimator, method)(X)[idx], getattr(estimator, method)(X[idx]), atol=1e-09, err_msg=msg)" }, { @@ -171750,7 +185380,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "estimator_orig", @@ -171760,13 +185391,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@ignore_warnings(category=FutureWarning)\ndef check_methods_subset_invariance(name, estimator_orig):\n rnd = np.random.RandomState(0)\n X = 3 * rnd.uniform(size=(20, 3))\n X = _pairwise_estimator_convert_X(X, estimator_orig)\n y = X[:, 0].astype(int)\n estimator = clone(estimator_orig)\n y = _enforce_estimator_tags_y(estimator, y)\n if hasattr(estimator, 'n_components'):\n 
estimator.n_components = 1\n if hasattr(estimator, 'n_clusters'):\n estimator.n_clusters = 1\n set_random_state(estimator, 1)\n estimator.fit(X, y)\n for method in ['predict', 'transform', 'decision_function', 'score_samples', 'predict_proba']:\n msg = '{method} of {name} is not invariant when applied to a subset.'.format(method=method, name=name)\n if hasattr(estimator, method):\n (result_full, result_by_batch) = _apply_on_subsets(getattr(estimator, method), X)\n assert_allclose(result_full, result_by_batch, atol=1e-07, err_msg=msg)" }, { @@ -171784,7 +185416,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "estimator_orig", @@ -171794,13 +185427,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef check_n_features_in(name, estimator_orig):\n rng = np.random.RandomState(0)\n estimator = clone(estimator_orig)\n set_random_state(estimator)\n if 'warm_start' in estimator.get_params():\n estimator.set_params(warm_start=False)\n n_samples = 100\n X = rng.normal(loc=100, size=(n_samples, 2))\n X = _pairwise_estimator_convert_X(X, estimator)\n if is_regressor(estimator_orig):\n y = rng.normal(size=n_samples)\n else:\n y = rng.randint(low=0, high=2, size=n_samples)\n y = _enforce_estimator_tags_y(estimator, y)\n assert not hasattr(estimator, 'n_features_in_')\n estimator.fit(X, y)\n if hasattr(estimator, 'n_features_in_'):\n assert estimator.n_features_in_ == X.shape[1]\n else:\n warnings.warn(\"As of scikit-learn 0.23, estimators should expose a n_features_in_ attribute, unless the 'no_validation' tag is True. This attribute should be equal to the number of features passed to the fit method. An error will be raised from version 1.0 (renaming of 0.25) when calling check_estimator(). 
See SLEP010: https://scikit-learn-enhancement-proposals.readthedocs.io/en/latest/slep010/proposal.html\", FutureWarning)" }, { @@ -171818,7 +185452,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "estimator_orig", @@ -171828,13 +185463,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@ignore_warnings(category=FutureWarning)\ndef check_n_features_in_after_fitting(name, estimator_orig):\n tags = _safe_tags(estimator_orig)\n is_supported_X_types = '2darray' in tags['X_types'] or 'categorical' in tags['X_types']\n if not is_supported_X_types or tags['no_validation']:\n return\n rng = np.random.RandomState(0)\n estimator = clone(estimator_orig)\n set_random_state(estimator)\n if 'warm_start' in estimator.get_params():\n estimator.set_params(warm_start=False)\n n_samples = 150\n X = rng.normal(size=(n_samples, 8))\n X = _enforce_estimator_tags_x(estimator, X)\n X = _pairwise_estimator_convert_X(X, estimator)\n if is_regressor(estimator):\n y = rng.normal(size=n_samples)\n else:\n y = rng.randint(low=0, high=2, size=n_samples)\n y = _enforce_estimator_tags_y(estimator, y)\n estimator.fit(X, y)\n assert estimator.n_features_in_ == X.shape[1]\n check_methods = ['predict', 'transform', 'decision_function', 'predict_proba', 'score']\n X_bad = X[:, [1]]\n msg = f'X has 1 features, but \\\\w+ is expecting {X.shape[1]} features as input'\n for method in check_methods:\n if not hasattr(estimator, method):\n continue\n callable_method = getattr(estimator, method)\n if method == 'score':\n callable_method = partial(callable_method, y=y)\n with raises(ValueError, match=msg):\n callable_method(X_bad)\n if not hasattr(estimator, 'partial_fit'):\n return\n estimator = clone(estimator_orig)\n if is_classifier(estimator):\n estimator.partial_fit(X, y, classes=np.unique(y))\n else:\n estimator.partial_fit(X, y)\n assert estimator.n_features_in_ == X.shape[1]\n with raises(ValueError, match=msg):\n estimator.partial_fit(X_bad, y)" }, { @@ -171852,7 +185488,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "estimator_orig", @@ -171862,7 +185499,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -171886,7 +185524,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "estimator_orig", @@ -171896,13 +185535,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@ignore_warnings(category=FutureWarning)\ndef check_non_transformer_estimators_n_iter(name, estimator_orig):\n not_run_check_n_iter = ['Ridge', 'SVR', 'NuSVR', 'NuSVC', 'RidgeClassifier', 'SVC', 'RandomizedLasso', 'LogisticRegressionCV', 'LinearSVC', 'LogisticRegression', 'SelfTrainingClassifier']\n not_run_check_n_iter += CROSS_DECOMPOSITION\n if name in not_run_check_n_iter:\n return\n if name == 'LassoLars':\n estimator = clone(estimator_orig).set_params(alpha=0.0)\n else:\n estimator = clone(estimator_orig)\n if hasattr(estimator, 'max_iter'):\n iris = load_iris()\n (X, y_) = (iris.data, iris.target)\n y_ = _enforce_estimator_tags_y(estimator, y_)\n set_random_state(estimator, 0)\n estimator.fit(X, y_)\n assert estimator.n_iter_ >= 1" }, { @@ -171920,7 +185560,8 @@ "docstring": { "type": "", 
"description": "" - } + }, + "refined_type": {} }, { "name": "estimator_orig", @@ -171930,7 +185571,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -171954,7 +185596,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "expected_outliers", @@ -171964,7 +185607,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "decision", @@ -171974,13 +185618,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef check_outlier_corruption(num_outliers, expected_outliers, decision):\n if num_outliers < expected_outliers:\n start = num_outliers\n end = expected_outliers + 1\n else:\n start = expected_outliers\n end = num_outliers + 1\n sorted_decision = np.sort(decision)\n msg = 'The number of predicted outliers is not equal to the expected number of outliers and this difference is not explained by the number of ties in the decision_function values'\n assert len(np.unique(sorted_decision[start:end])) == 1, msg" }, { @@ -171998,7 +185643,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "estimator_orig", @@ -172008,13 +185654,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef check_outliers_fit_predict(name, estimator_orig):\n n_samples = 300\n (X, _) = make_blobs(n_samples=n_samples, random_state=0)\n X = shuffle(X, random_state=7)\n (n_samples, n_features) = X.shape\n estimator = clone(estimator_orig)\n set_random_state(estimator)\n y_pred = estimator.fit_predict(X)\n assert y_pred.shape == (n_samples, )\n assert y_pred.dtype.kind == 'i'\n assert_array_equal(np.unique(y_pred), np.array([-1, 1]))\n if hasattr(estimator, 'predict'):\n y_pred_2 = estimator.fit(X).predict(X)\n assert_array_equal(y_pred, y_pred_2)\n if hasattr(estimator, 'contamination'):\n expected_outliers = 30\n contamination = float(expected_outliers) / n_samples\n estimator.set_params(contamination=contamination)\n y_pred = estimator.fit_predict(X)\n num_outliers = np.sum(y_pred != 1)\n if num_outliers != expected_outliers and hasattr(estimator, 'decision_function'):\n decision = estimator.decision_function(X)\n check_outlier_corruption(num_outliers, expected_outliers, decision)\n msg = 'contamination must be in \\\\(0, 0.5]'\n for contamination in [-0.5, -0.001, 0.5001, 2.3]:\n estimator.set_params(contamination=contamination)\n with raises(ValueError, match=msg):\n estimator.fit_predict(X)" }, { @@ -172032,7 +185679,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "estimator_orig", @@ -172042,7 +185690,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "readonly_memmap", @@ -172052,13 +185701,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef check_outliers_train(name, estimator_orig, readonly_memmap=True):\n n_samples = 300\n (X, _) = make_blobs(n_samples=n_samples, random_state=0)\n X = shuffle(X, random_state=7)\n if readonly_memmap:\n X = create_memmap_backed_data(X)\n (n_samples, n_features) = X.shape\n estimator = 
clone(estimator_orig)\n set_random_state(estimator)\n estimator.fit(X)\n estimator.fit(X.tolist())\n y_pred = estimator.predict(X)\n assert y_pred.shape == (n_samples, )\n assert y_pred.dtype.kind == 'i'\n assert_array_equal(np.unique(y_pred), np.array([-1, 1]))\n decision = estimator.decision_function(X)\n scores = estimator.score_samples(X)\n for output in [decision, scores]:\n assert output.dtype == np.dtype('float')\n assert output.shape == (n_samples, )\n with raises(ValueError):\n estimator.predict(X.T)\n dec_pred = (decision >= 0).astype(int)\n dec_pred[dec_pred == 0] = -1\n assert_array_equal(dec_pred, y_pred)\n with raises(ValueError):\n estimator.decision_function(X.T)\n y_dec = scores - estimator.offset_\n assert_allclose(y_dec, decision)\n with raises(ValueError):\n estimator.score_samples(X.T)\n if hasattr(estimator, 'contamination') and not hasattr(estimator, 'novelty'):\n expected_outliers = 30\n contamination = expected_outliers / n_samples\n estimator.set_params(contamination=contamination)\n estimator.fit(X)\n y_pred = estimator.predict(X)\n num_outliers = np.sum(y_pred != 1)\n if num_outliers != expected_outliers:\n decision = estimator.decision_function(X)\n check_outlier_corruption(num_outliers, expected_outliers, decision)\n msg = 'contamination must be in \\\\(0, 0.5]'\n for contamination in [-0.5, 2.3]:\n estimator.set_params(contamination=contamination)\n with raises(ValueError, match=msg):\n estimator.fit(X)" }, { @@ -172076,7 +185726,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "Estimator", @@ -172086,13 +185737,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef check_parameters_default_constructible(name, Estimator):\n Estimator = Estimator.__class__\n with ignore_warnings(category=FutureWarning):\n estimator = _construct_instance(Estimator)\n clone(estimator)\n repr(estimator)\n assert estimator.set_params() is estimator\n init = getattr(estimator.__init__, 'deprecated_original', estimator.__init__)\n try:\n \n def param_filter(p):\n \"\"\"Identify hyper parameters of an estimator.\"\"\"\n return p.name != 'self' and p.kind != p.VAR_KEYWORD and p.kind != p.VAR_POSITIONAL\n init_params = [p for p in signature(init).parameters.values() if param_filter(p)]\n except (TypeError, ValueError):\n return\n params = estimator.get_params()\n init_params = init_params[len(getattr(estimator, '_required_parameters', [])):]\n for init_param in init_params:\n assert init_param.default != init_param.empty, 'parameter %s for %s has no default value' % (init_param.name, type(estimator).__name__)\n allowed_types = {str, int, float, bool, tuple, type(None), type, types.FunctionType, joblib.Memory}\n allowed_types.update(np.core.numerictypes.allTypes.values())\n assert type(init_param.default) in allowed_types, f\"Parameter '{init_param.name}' of estimator '{Estimator.__name__}' is of type {type(init_param.default).__name__} which is not allowed. All init parameters have to be immutable to make cloning possible. Therefore we restrict the set of legal types to {set((type.__name__ for type in allowed_types))}.\"\n if init_param.name not in params.keys():\n assert init_param.default is None, f\"Estimator parameter '{init_param.name}' of estimator '{Estimator.__name__}' is not returned by get_params. 
If it is deprecated, set its default value to None.\"\n continue\n param_value = params[init_param.name]\n if isinstance(param_value, np.ndarray):\n assert_array_equal(param_value, init_param.default)\n else:\n failure_text = f'Parameter {init_param.name} was mutated on init. All parameters must be stored unchanged.'\n if is_scalar_nan(param_value):\n assert param_value is init_param.default, failure_text\n else:\n assert param_value == init_param.default, failure_text" }, { @@ -172110,7 +185762,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "estimator_orig", @@ -172120,13 +185773,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@ignore_warnings\ndef check_pipeline_consistency(name, estimator_orig):\n if _safe_tags(estimator_orig, key='non_deterministic'):\n msg = name + ' is non deterministic'\n raise SkipTest(msg)\n (X, y) = make_blobs(n_samples=30, centers=[[0, 0, 0], [1, 1, 1]], random_state=0, n_features=2, cluster_std=0.1)\n X -= X.min()\n X = _pairwise_estimator_convert_X(X, estimator_orig, kernel=rbf_kernel)\n estimator = clone(estimator_orig)\n y = _enforce_estimator_tags_y(estimator, y)\n set_random_state(estimator)\n pipeline = make_pipeline(estimator)\n estimator.fit(X, y)\n pipeline.fit(X, y)\n funcs = ['score', 'fit_transform']\n for func_name in funcs:\n func = getattr(estimator, func_name, None)\n if func is not None:\n func_pipeline = getattr(pipeline, func_name)\n result = func(X, y)\n result_pipe = func_pipeline(X, y)\n assert_allclose_dense_sparse(result, result_pipe)" }, { @@ -172144,7 +185798,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "estimator_orig", @@ -172154,13 +185809,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@ignore_warnings(category=FutureWarning)\ndef check_regressor_data_not_an_array(name, estimator_orig):\n (X, y) = _regression_dataset()\n X = _pairwise_estimator_convert_X(X, estimator_orig)\n y = _enforce_estimator_tags_y(estimator_orig, y)\n for obj_type in ['NotAnArray', 'PandasDataframe']:\n check_estimators_data_not_an_array(name, estimator_orig, X, y, obj_type)" }, { @@ -172178,7 +185834,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "estimator", @@ -172188,13 +185845,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@ignore_warnings(category=FutureWarning)\ndef check_regressor_multioutput(name, estimator):\n estimator = clone(estimator)\n n_samples = n_features = 10\n if not _is_pairwise_metric(estimator):\n n_samples = n_samples + 1\n (X, y) = make_regression(random_state=42, n_targets=5, n_samples=n_samples, n_features=n_features)\n X = _pairwise_estimator_convert_X(X, estimator)\n estimator.fit(X, y)\n y_pred = estimator.predict(X)\n assert y_pred.dtype == np.dtype('float64'), 'Multioutput predictions by a regressor are expected to be floating-point precision. Got {} instead'.format(y_pred.dtype)\n assert y_pred.shape == y.shape, 'The shape of the prediction for multioutput data is incorrect. 
Expected {}, got {}.'" }, { @@ -172212,7 +185870,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "regressor_orig", @@ -172222,13 +185881,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@ignore_warnings(category=FutureWarning)\ndef check_regressors_int(name, regressor_orig):\n (X, _) = _regression_dataset()\n X = _pairwise_estimator_convert_X(X[:50], regressor_orig)\n rnd = np.random.RandomState(0)\n y = rnd.randint(3, size=X.shape[0])\n y = _enforce_estimator_tags_y(regressor_orig, y)\n rnd = np.random.RandomState(0)\n regressor_1 = clone(regressor_orig)\n regressor_2 = clone(regressor_orig)\n set_random_state(regressor_1)\n set_random_state(regressor_2)\n if name in CROSS_DECOMPOSITION:\n y_ = np.vstack([y, 2 * y + rnd.randint(2, size=len(y))])\n y_ = y_.T\n else:\n y_ = y\n regressor_1.fit(X, y_)\n pred1 = regressor_1.predict(X)\n regressor_2.fit(X, y_.astype(float))\n pred2 = regressor_2.predict(X)\n assert_allclose(pred1, pred2, atol=0.01, err_msg=name)" }, { @@ -172246,7 +185906,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "regressor_orig", @@ -172256,13 +185917,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@ignore_warnings\ndef check_regressors_no_decision_function(name, regressor_orig):\n rng = np.random.RandomState(0)\n regressor = clone(regressor_orig)\n X = rng.normal(size=(10, 4))\n X = _pairwise_estimator_convert_X(X, regressor_orig)\n y = _enforce_estimator_tags_y(regressor, X[:, 0])\n regressor.fit(X, y)\n funcs = ['decision_function', 'predict_proba', 'predict_log_proba']\n for func_name in funcs:\n assert not hasattr(regressor, func_name)" }, { @@ -172280,7 +185942,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "regressor_orig", @@ -172290,7 +185953,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "readonly_memmap", @@ -172300,7 +185964,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "X_dtype", @@ -172310,14 +185975,15 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", - "source_code": "\n@ignore_warnings(category=FutureWarning)\ndef check_regressors_train(name, regressor_orig, readonly_memmap=False, X_dtype=np.float64):\n (X, y) = _regression_dataset()\n X = X.astype(X_dtype)\n X = _pairwise_estimator_convert_X(X, regressor_orig)\n y = scale(y)\n regressor = clone(regressor_orig)\n y = _enforce_estimator_tags_y(regressor, y)\n if name in CROSS_DECOMPOSITION:\n rnd = np.random.RandomState(0)\n y_ = np.vstack([y, 2 * y + rnd.randint(2, size=len(y))])\n y_ = y_.T\n else:\n y_ = y\n if readonly_memmap:\n (X, y, y_) = create_memmap_backed_data([X, y, y_])\n if not hasattr(regressor, 'alphas') and hasattr(regressor, 'alpha'):\n regressor.alpha = 0.01\n if name == 'PassiveAggressiveRegressor':\n regressor.C = 0.01\n with raises(ValueError, err_msg=f'The classifier {name} does not raise an error when incorrect/malformed input data for fit is passed. The number of training examples is not the same as the number of labels. 
Perhaps use check_X_y in fit.'):\n regressor.fit(X, y[:-1])\n set_random_state(regressor)\n regressor.fit(X, y_)\n regressor.fit(X.tolist(), y_.tolist())\n y_pred = regressor.predict(X)\n assert y_pred.shape == y_.shape\n if not _safe_tags(regressor, key='poor_score'):\n assert regressor.score(X, y_) > 0.5" + "docstring": null, + "source_code": "\n@ignore_warnings(category=FutureWarning)\ndef check_regressors_train(name, regressor_orig, readonly_memmap=False, X_dtype=np.float64):\n (X, y) = _regression_dataset()\n X = X.astype(X_dtype)\n X = _pairwise_estimator_convert_X(X, regressor_orig)\n y = scale(y)\n regressor = clone(regressor_orig)\n y = _enforce_estimator_tags_y(regressor, y)\n if name in CROSS_DECOMPOSITION:\n rnd = np.random.RandomState(0)\n y_ = np.vstack([y, 2 * y + rnd.randint(2, size=len(y))])\n y_ = y_.T\n else:\n y_ = y\n if readonly_memmap:\n (X, y, y_) = _create_memmap_backed_data([X, y, y_])\n if not hasattr(regressor, 'alphas') and hasattr(regressor, 'alpha'):\n regressor.alpha = 0.01\n if name == 'PassiveAggressiveRegressor':\n regressor.C = 0.01\n with raises(ValueError, err_msg=f'The classifier {name} does not raise an error when incorrect/malformed input data for fit is passed. The number of training examples is not the same as the number of labels. Perhaps use check_X_y in fit.'):\n regressor.fit(X, y[:-1])\n set_random_state(regressor)\n regressor.fit(X, y_)\n regressor.fit(X.tolist(), y_.tolist())\n y_pred = regressor.predict(X)\n assert y_pred.shape == y_.shape\n if not _safe_tags(regressor, key='poor_score'):\n assert regressor.score(X, y_) > 0.5" }, { "name": "check_requires_y_none", @@ -172334,7 +186000,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "estimator_orig", @@ -172344,13 +186011,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef check_requires_y_none(name, estimator_orig):\n rng = np.random.RandomState(0)\n estimator = clone(estimator_orig)\n set_random_state(estimator)\n n_samples = 100\n X = rng.normal(loc=100, size=(n_samples, 2))\n X = _pairwise_estimator_convert_X(X, estimator)\n warning_msg = \"As of scikit-learn 0.23, estimators should have a 'requires_y' tag set to the appropriate value. The default value of the tag is False. 
An error will be raised from version 1.0 when calling check_estimator() if the tag isn't properly set.\"\n expected_err_msgs = ('requires y to be passed, but the target y is None', 'Expected array-like (array or non-string sequence), got None', 'y should be a 1d array')\n try:\n estimator.fit(X, None)\n except ValueError as ve:\n if not any((msg in str(ve) for msg in expected_err_msgs)):\n warnings.warn(warning_msg, FutureWarning)" }, { @@ -172368,7 +186036,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "estimator_orig", @@ -172378,7 +186047,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "kind", @@ -172388,13 +186058,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@ignore_warnings(category=FutureWarning)\ndef check_sample_weights_invariance(name, estimator_orig, kind='ones'):\n estimator1 = clone(estimator_orig)\n estimator2 = clone(estimator_orig)\n set_random_state(estimator1, random_state=0)\n set_random_state(estimator2, random_state=0)\n X1 = np.array([[1, 3], [1, 3], [1, 3], [1, 3], [2, 1], [2, 1], [2, 1], [2, 1], [3, 3], [3, 3], [3, 3], [3, 3], [4, 1], [4, 1], [4, 1], [4, 1]], dtype=np.float64)\n y1 = np.array([1, 1, 1, 1, 2, 2, 2, 2, 1, 1, 1, 1, 2, 2, 2, 2], dtype=int)\n if kind == 'ones':\n X2 = X1\n y2 = y1\n sw2 = np.ones(shape=len(y1))\n err_msg = f'For {name} sample_weight=None is not equivalent to sample_weight=ones'\n elif kind == 'zeros':\n X2 = np.vstack([X1, X1 + 1])\n y2 = np.hstack([y1, 3 - y1])\n sw2 = np.ones(shape=len(y1) * 2)\n sw2[len(y1):] = 0\n (X2, y2, sw2) = shuffle(X2, y2, sw2, random_state=0)\n err_msg = f'For {name}, a zero sample_weight is not equivalent to removing the sample'\n else:\n raise ValueError\n y1 = _enforce_estimator_tags_y(estimator1, y1)\n y2 = _enforce_estimator_tags_y(estimator2, y2)\n estimator1.fit(X1, y=y1, sample_weight=None)\n estimator2.fit(X2, y=y2, sample_weight=sw2)\n for method in ['predict', 'predict_proba', 'decision_function', 'transform']:\n if hasattr(estimator_orig, method):\n X_pred1 = getattr(estimator1, method)(X1)\n X_pred2 = getattr(estimator2, method)(X1)\n assert_allclose_dense_sparse(X_pred1, X_pred2, err_msg=err_msg)" }, { @@ -172412,7 +186083,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "estimator_orig", @@ -172422,13 +186094,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@ignore_warnings(category=FutureWarning)\ndef check_sample_weights_list(name, estimator_orig):\n estimator = clone(estimator_orig)\n rnd = np.random.RandomState(0)\n n_samples = 30\n X = _pairwise_estimator_convert_X(rnd.uniform(size=(n_samples, 3)), estimator_orig)\n y = np.arange(n_samples) % 3\n y = _enforce_estimator_tags_y(estimator, y)\n sample_weight = [3] * n_samples\n estimator.fit(X, y, sample_weight=sample_weight)" }, { @@ -172446,7 +186119,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "estimator_orig", @@ -172456,13 +186130,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": 
"\n@ignore_warnings(category=FutureWarning)\ndef check_sample_weights_not_an_array(name, estimator_orig):\n estimator = clone(estimator_orig)\n X = np.array([[1, 1], [1, 2], [1, 3], [1, 4], [2, 1], [2, 2], [2, 3], [2, 4], [3, 1], [3, 2], [3, 3], [3, 4]])\n X = _NotAnArray(_pairwise_estimator_convert_X(X, estimator_orig))\n y = _NotAnArray([1, 1, 1, 1, 2, 2, 2, 2, 1, 1, 2, 2])\n weights = _NotAnArray([1] * 12)\n if _safe_tags(estimator, key='multioutput_only'):\n y = _NotAnArray(y.data.reshape(-1, 1))\n estimator.fit(X, y, sample_weight=weights)" }, { @@ -172480,7 +186155,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "estimator_orig", @@ -172490,13 +186166,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef check_sample_weights_not_overwritten(name, estimator_orig):\n estimator = clone(estimator_orig)\n set_random_state(estimator, random_state=0)\n X = np.array([[1, 3], [1, 3], [1, 3], [1, 3], [2, 1], [2, 1], [2, 1], [2, 1], [3, 3], [3, 3], [3, 3], [3, 3], [4, 1], [4, 1], [4, 1], [4, 1]], dtype=np.float64)\n y = np.array([1, 1, 1, 1, 2, 2, 2, 2, 1, 1, 1, 1, 2, 2, 2, 2], dtype=int)\n y = _enforce_estimator_tags_y(estimator, y)\n sample_weight_original = np.ones(y.shape[0])\n sample_weight_original[0] = 10.0\n sample_weight_fit = sample_weight_original.copy()\n estimator.fit(X, y, sample_weight=sample_weight_fit)\n err_msg = '{name} overwrote the original `sample_weight` given during fit'\n assert_allclose(sample_weight_fit, sample_weight_original, err_msg=err_msg)" }, { @@ -172514,7 +186191,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "estimator_orig", @@ -172524,13 +186202,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@ignore_warnings(category=FutureWarning)\ndef check_sample_weights_pandas_series(name, estimator_orig):\n estimator = clone(estimator_orig)\n try:\n import pandas as pd\n X = np.array([[1, 1], [1, 2], [1, 3], [1, 4], [2, 1], [2, 2], [2, 3], [2, 4], [3, 1], [3, 2], [3, 3], [3, 4]])\n X = pd.DataFrame(_pairwise_estimator_convert_X(X, estimator_orig))\n y = pd.Series([1, 1, 1, 1, 2, 2, 2, 2, 1, 1, 2, 2])\n weights = pd.Series([1] * 12)\n if _safe_tags(estimator, key='multioutput_only'):\n y = pd.DataFrame(y)\n try:\n estimator.fit(X, y, sample_weight=weights)\n except ValueError:\n raise ValueError(\"Estimator {0} raises error if 'sample_weight' parameter is of type pandas.Series\".format(name))\n except ImportError:\n raise SkipTest('pandas is not installed: not testing for input of type pandas.Series to class weight.')" }, { @@ -172548,7 +186227,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "estimator_orig", @@ -172558,13 +186238,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@ignore_warnings(category=FutureWarning)\ndef check_sample_weights_shape(name, estimator_orig):\n estimator = clone(estimator_orig)\n X = np.array([[1, 3], [1, 3], [1, 3], [1, 3], [2, 1], [2, 1], [2, 1], [2, 1], [3, 3], [3, 3], [3, 3], [3, 3], [4, 1], [4, 1], [4, 1], [4, 1]])\n y = np.array([1, 1, 1, 1, 2, 2, 2, 2, 1, 1, 1, 
1, 2, 2, 2, 2])\n y = _enforce_estimator_tags_y(estimator, y)\n estimator.fit(X, y, sample_weight=np.ones(len(y)))\n with raises(ValueError):\n estimator.fit(X, y, sample_weight=np.ones(2 * len(y)))\n with raises(ValueError):\n estimator.fit(X, y, sample_weight=np.ones((len(y), 2)))" }, { @@ -172582,7 +186263,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "estimator_orig", @@ -172592,13 +186274,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@ignore_warnings(category=FutureWarning)\ndef check_set_params(name, estimator_orig):\n estimator = clone(estimator_orig)\n orig_params = estimator.get_params(deep=False)\n msg = 'get_params result does not match what was passed to set_params'\n estimator.set_params(**orig_params)\n curr_params = estimator.get_params(deep=False)\n assert set(orig_params.keys()) == set(curr_params.keys()), msg\n for (k, v) in curr_params.items():\n assert orig_params[k] is v, msg\n test_values = [-np.inf, np.inf, None]\n test_params = deepcopy(orig_params)\n for param_name in orig_params.keys():\n default_value = orig_params[param_name]\n for value in test_values:\n test_params[param_name] = value\n try:\n estimator.set_params(**test_params)\n except (TypeError, ValueError) as e:\n e_type = e.__class__.__name__\n warnings.warn('{0} occurred during set_params of param {1} on {2}. It is recommended to delay parameter validation until fit.'.format(e_type, param_name, name))\n change_warning_msg = \"Estimator's parameters changed after set_params raised {}\".format(e_type)\n params_before_exception = curr_params\n curr_params = estimator.get_params(deep=False)\n try:\n assert set(params_before_exception.keys()) == set(curr_params.keys())\n for (k, v) in curr_params.items():\n assert params_before_exception[k] is v\n except AssertionError:\n warnings.warn(change_warning_msg)\n else:\n curr_params = estimator.get_params(deep=False)\n assert set(test_params.keys()) == set(curr_params.keys()), msg\n for (k, v) in curr_params.items():\n assert test_params[k] is v, msg\n test_params[param_name] = default_value" }, { @@ -172616,7 +186299,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "estimator_orig", @@ -172626,13 +186310,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@ignore_warnings(category=FutureWarning)\ndef check_sparsify_coefficients(name, estimator_orig):\n X = np.array([[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1], [-1, -2], [2, 2], [-2, -2]])\n y = np.array([1, 1, 1, 2, 2, 2, 3, 3, 3])\n y = _enforce_estimator_tags_y(estimator_orig, y)\n est = clone(estimator_orig)\n est.fit(X, y)\n pred_orig = est.predict(X)\n est.sparsify()\n assert sparse.issparse(est.coef_)\n pred = est.predict(X)\n assert_array_equal(pred, pred_orig)\n est = pickle.loads(pickle.dumps(est))\n assert sparse.issparse(est.coef_)\n pred = est.predict(X)\n assert_array_equal(pred, pred_orig)" }, { @@ -172650,7 +186335,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "estimator_orig", @@ -172660,13 +186346,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + 
"docstring": null, "source_code": "\n@ignore_warnings(category=FutureWarning)\ndef check_supervised_y_2d(name, estimator_orig):\n tags = _safe_tags(estimator_orig)\n rnd = np.random.RandomState(0)\n n_samples = 30\n X = _pairwise_estimator_convert_X(rnd.uniform(size=(n_samples, 3)), estimator_orig)\n y = np.arange(n_samples) % 3\n y = _enforce_estimator_tags_y(estimator_orig, y)\n estimator = clone(estimator_orig)\n set_random_state(estimator)\n estimator.fit(X, y)\n y_pred = estimator.predict(X)\n set_random_state(estimator)\n with warnings.catch_warnings(record=True) as w:\n warnings.simplefilter('always', DataConversionWarning)\n warnings.simplefilter('ignore', RuntimeWarning)\n estimator.fit(X, y[:, np.newaxis])\n y_pred_2d = estimator.predict(X)\n msg = 'expected 1 DataConversionWarning, got: %s' % ', '.join([str(w_x) for w_x in w])\n if not tags['multioutput']:\n assert len(w) > 0, msg\n assert \"DataConversionWarning('A column-vector y was passed when a 1d array was expected\" in msg\n assert_allclose(y_pred.ravel(), y_pred_2d.ravel())" }, { @@ -172684,7 +186371,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "estimator_orig", @@ -172694,13 +186382,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@ignore_warnings(category=FutureWarning)\ndef check_supervised_y_no_nan(name, estimator_orig):\n estimator = clone(estimator_orig)\n rng = np.random.RandomState(888)\n X = rng.randn(10, 5)\n y = np.full(10, np.inf)\n y = _enforce_estimator_tags_y(estimator, y)\n match = \"Input contains NaN, infinity or a value too large for dtype\\\\('float64'\\\\).\"\n err_msg = f'Estimator {name} should have raised error on fitting array y with NaN value.'\n with raises(ValueError, match=match, err_msg=err_msg):\n estimator.fit(X, y)" }, { @@ -172718,7 +186407,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "transformer", @@ -172728,13 +186418,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@ignore_warnings(category=FutureWarning)\ndef check_transformer_data_not_an_array(name, transformer):\n (X, y) = make_blobs(n_samples=30, centers=[[0, 0, 0], [1, 1, 1]], random_state=0, n_features=2, cluster_std=0.1)\n X = StandardScaler().fit_transform(X)\n X -= X.min() - 0.1\n X = _pairwise_estimator_convert_X(X, transformer)\n this_X = _NotAnArray(X)\n this_y = _NotAnArray(np.asarray(y))\n _check_transformer(name, transformer, this_X, this_y)\n _check_transformer(name, transformer, X.tolist(), y.tolist())" }, { @@ -172752,7 +186443,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "transformer", @@ -172762,7 +186454,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "readonly_memmap", @@ -172772,13 +186465,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@ignore_warnings(category=FutureWarning)\ndef check_transformer_general(name, transformer, readonly_memmap=False):\n (X, y) = make_blobs(n_samples=30, centers=[[0, 0, 0], [1, 1, 1]], random_state=0, n_features=2, cluster_std=0.1)\n X = 
StandardScaler().fit_transform(X)\n X -= X.min()\n X = _pairwise_estimator_convert_X(X, transformer)\n if readonly_memmap:\n (X, y) = create_memmap_backed_data([X, y])\n _check_transformer(name, transformer, X, y)" }, { @@ -172796,7 +186490,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "transformer_orig", @@ -172806,13 +186501,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef check_transformer_get_feature_names_out(name, transformer_orig):\n tags = transformer_orig._get_tags()\n if '2darray' not in tags['X_types'] or tags['no_validation']:\n return\n (X, y) = make_blobs(n_samples=30, centers=[[0, 0, 0], [1, 1, 1]], random_state=0, n_features=2, cluster_std=0.1)\n X = StandardScaler().fit_transform(X)\n X -= X.min()\n transformer = clone(transformer_orig)\n X = _enforce_estimator_tags_x(transformer, X)\n X = _pairwise_estimator_convert_X(X, transformer)\n n_features = X.shape[1]\n set_random_state(transformer)\n y_ = y\n if name in CROSS_DECOMPOSITION:\n y_ = np.c_[np.asarray(y), np.asarray(y)]\n y_[::2, 1] *= 2\n X_transform = transformer.fit_transform(X, y=y_)\n input_features = [f'feature{i}' for i in range(n_features)]\n with raises(ValueError, match='input_features should have length equal'):\n transformer.get_feature_names_out(input_features[::2])\n feature_names_out = transformer.get_feature_names_out(input_features)\n assert feature_names_out is not None\n assert isinstance(feature_names_out, np.ndarray)\n assert all((isinstance(name, str) for name in feature_names_out))\n if isinstance(X_transform, tuple):\n n_features_out = X_transform[0].shape[1]\n else:\n n_features_out = X_transform.shape[1]\n assert len(feature_names_out) == n_features_out, f'Expected {n_features_out} feature names, got {len(feature_names_out)}'" }, { @@ -172830,7 +186526,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "transformer_orig", @@ -172840,13 +186537,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef check_transformer_get_feature_names_out_pandas(name, transformer_orig):\n try:\n import pandas as pd\n except ImportError:\n raise SkipTest('pandas is not installed: not checking column name consistency for pandas')\n tags = transformer_orig._get_tags()\n if '2darray' not in tags['X_types'] or tags['no_validation']:\n return\n (X, y) = make_blobs(n_samples=30, centers=[[0, 0, 0], [1, 1, 1]], random_state=0, n_features=2, cluster_std=0.1)\n X = StandardScaler().fit_transform(X)\n X -= X.min()\n transformer = clone(transformer_orig)\n X = _enforce_estimator_tags_x(transformer, X)\n X = _pairwise_estimator_convert_X(X, transformer)\n n_features = X.shape[1]\n set_random_state(transformer)\n y_ = y\n if name in CROSS_DECOMPOSITION:\n y_ = np.c_[np.asarray(y), np.asarray(y)]\n y_[::2, 1] *= 2\n feature_names_in = [f'col{i}' for i in range(n_features)]\n df = pd.DataFrame(X, columns=feature_names_in)\n X_transform = transformer.fit_transform(df, y=y_)\n invalid_feature_names = [f'bad{i}' for i in range(n_features)]\n with raises(ValueError, match='input_features is not equal to feature_names_in_'):\n transformer.get_feature_names_out(invalid_feature_names)\n feature_names_out_default = transformer.get_feature_names_out()\n 
feature_names_in_explicit_names = transformer.get_feature_names_out(feature_names_in)\n assert_array_equal(feature_names_out_default, feature_names_in_explicit_names)\n if isinstance(X_transform, tuple):\n n_features_out = X_transform[0].shape[1]\n else:\n n_features_out = X_transform.shape[1]\n assert len(feature_names_out_default) == n_features_out, f'Expected {n_features_out} feature names, got {len(feature_names_out_default)}'" }, { @@ -172864,7 +186562,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "estimator_orig", @@ -172874,13 +186573,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@ignore_warnings(category=FutureWarning)\ndef check_transformer_n_iter(name, estimator_orig):\n estimator = clone(estimator_orig)\n if hasattr(estimator, 'max_iter'):\n if name in CROSS_DECOMPOSITION:\n X = [[0.0, 0.0, 1.0], [1.0, 0.0, 0.0], [2.0, 2.0, 2.0], [2.0, 5.0, 4.0]]\n y_ = [[0.1, -0.2], [0.9, 1.1], [0.1, -0.5], [0.3, -0.2]]\n else:\n (X, y_) = make_blobs(n_samples=30, centers=[[0, 0, 0], [1, 1, 1]], random_state=0, n_features=2, cluster_std=0.1)\n X -= X.min() - 0.1\n set_random_state(estimator, 0)\n estimator.fit(X, y_)\n if name in CROSS_DECOMPOSITION:\n for iter_ in estimator.n_iter_:\n assert iter_ >= 1\n else:\n assert estimator.n_iter_ >= 1" }, { @@ -172898,7 +186598,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "transformer_orig", @@ -172908,13 +186609,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef check_transformer_preserve_dtypes(name, transformer_orig):\n (X, y) = make_blobs(n_samples=30, centers=[[0, 0, 0], [1, 1, 1]], random_state=0, cluster_std=0.1)\n X = StandardScaler().fit_transform(X)\n X -= X.min()\n X = _pairwise_estimator_convert_X(X, transformer_orig)\n for dtype in _safe_tags(transformer_orig, key='preserves_dtype'):\n X_cast = X.astype(dtype)\n transformer = clone(transformer_orig)\n set_random_state(transformer)\n X_trans = transformer.fit_transform(X_cast, y)\n if isinstance(X_trans, tuple):\n X_trans = X_trans[0]\n assert X_trans.dtype == dtype, f'Estimator transform dtype: {X_trans.dtype} - original/expected dtype: {dtype.__name__}'" }, { @@ -172932,7 +186634,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "transformer", @@ -172942,13 +186645,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@ignore_warnings(category=FutureWarning)\ndef check_transformers_unfitted(name, transformer):\n (X, y) = _regression_dataset()\n transformer = clone(transformer)\n with raises((AttributeError, ValueError), err_msg=f'The unfitted transformer {name} does not raise an error when transform is called. Perhaps use check_is_fitted in transform.'):\n transformer.transform(X)" }, { @@ -172966,13 +186670,14 @@ "docstring": { "type": "list of estimators instances", "description": "Estimators to generated checks for.\n\n.. versionchanged:: 0.24\n Passing a class was deprecated in version 0.23, and support for\n classes was removed in 0.24. Pass an instance instead.\n\n.. 
versionadded:: 0.24" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Pytest specific decorator for parametrizing estimator checks.\n\nThe `id` of each check is set to be a pprint version of the estimator and the name of the check with its keyword arguments. This allows to use `pytest -k` to specify which tests to run:: pytest test_check_estimators.py -k check_estimators_fit_returns_self", - "docstring": "Pytest specific decorator for parametrizing estimator checks.\n\nThe `id` of each check is set to be a pprint version of the estimator\nand the name of the check with its keyword arguments.\nThis allows to use `pytest -k` to specify which tests to run::\n\n pytest test_check_estimators.py -k check_estimators_fit_returns_self\n\nParameters\n----------\nestimators : list of estimators instances\n Estimators to generated checks for.\n\n .. versionchanged:: 0.24\n Passing a class was deprecated in version 0.23, and support for\n classes was removed in 0.24. Pass an instance instead.\n\n .. versionadded:: 0.24\n\nReturns\n-------\ndecorator : `pytest.mark.parametrize`\n\nSee Also\n--------\ncheck_estimator : Check if estimator adheres to scikit-learn conventions.\n\nExamples\n--------\n>>> from sklearn.utils.estimator_checks import parametrize_with_checks\n>>> from sklearn.linear_model import LogisticRegression\n>>> from sklearn.tree import DecisionTreeRegressor\n\n>>> @parametrize_with_checks([LogisticRegression(),\n... DecisionTreeRegressor()])\n... def test_sklearn_compatible_estimator(estimator, check):\n... check(estimator)", + "description": "Pytest specific decorator for parametrizing estimator checks.\n\nThe `id` of each check is set to be a pprint version of the estimator\nand the name of the check with its keyword arguments.\nThis allows to use `pytest -k` to specify which tests to run::\n\n pytest test_check_estimators.py -k check_estimators_fit_returns_self", + "docstring": "Pytest specific decorator for parametrizing estimator checks.\n\n The `id` of each check is set to be a pprint version of the estimator\n and the name of the check with its keyword arguments.\n This allows to use `pytest -k` to specify which tests to run::\n\n pytest test_check_estimators.py -k check_estimators_fit_returns_self\n\n Parameters\n ----------\n estimators : list of estimators instances\n Estimators to generated checks for.\n\n .. versionchanged:: 0.24\n Passing a class was deprecated in version 0.23, and support for\n classes was removed in 0.24. Pass an instance instead.\n\n .. versionadded:: 0.24\n\n Returns\n -------\n decorator : `pytest.mark.parametrize`\n\n See Also\n --------\n check_estimator : Check if estimator adheres to scikit-learn conventions.\n\n Examples\n --------\n >>> from sklearn.utils.estimator_checks import parametrize_with_checks\n >>> from sklearn.linear_model import LogisticRegression\n >>> from sklearn.tree import DecisionTreeRegressor\n\n >>> @parametrize_with_checks([LogisticRegression(),\n ... DecisionTreeRegressor()])\n ... def test_sklearn_compatible_estimator(estimator, check):\n ... 
check(estimator)\n\n ", "source_code": "\ndef parametrize_with_checks(estimators):\n \"\"\"Pytest specific decorator for parametrizing estimator checks.\n\n The `id` of each check is set to be a pprint version of the estimator\n and the name of the check with its keyword arguments.\n This allows to use `pytest -k` to specify which tests to run::\n\n pytest test_check_estimators.py -k check_estimators_fit_returns_self\n\n Parameters\n ----------\n estimators : list of estimators instances\n Estimators to generated checks for.\n\n .. versionchanged:: 0.24\n Passing a class was deprecated in version 0.23, and support for\n classes was removed in 0.24. Pass an instance instead.\n\n .. versionadded:: 0.24\n\n Returns\n -------\n decorator : `pytest.mark.parametrize`\n\n See Also\n --------\n check_estimator : Check if estimator adheres to scikit-learn conventions.\n\n Examples\n --------\n >>> from sklearn.utils.estimator_checks import parametrize_with_checks\n >>> from sklearn.linear_model import LogisticRegression\n >>> from sklearn.tree import DecisionTreeRegressor\n\n >>> @parametrize_with_checks([LogisticRegression(),\n ... DecisionTreeRegressor()])\n ... def test_sklearn_compatible_estimator(estimator, check):\n ... check(estimator)\n\n \"\"\"\n import pytest\n if any((isinstance(est, type) for est in estimators)):\n msg = \"Passing a class was deprecated in version 0.23 and isn't supported anymore from 0.24.Please pass an instance instead.\"\n raise TypeError(msg)\n \n def checks_generator():\n for estimator in estimators:\n name = type(estimator).__name__\n for check in _yield_all_checks(estimator):\n check = partial(check, name)\n yield _maybe_mark_xfail(estimator, check, pytest)\n return pytest.mark.parametrize('estimator, check', checks_generator(), ids=_get_check_estimator_ids)" }, { @@ -172990,13 +186695,14 @@ "docstring": { "type": "ndarray", "description": "Array with vectors as its rows." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Modify the sign of vectors for reproducibility.\n\nFlips the sign of elements of all the vectors (rows of u) such that the absolute maximum element of each vector is positive.", - "docstring": "Modify the sign of vectors for reproducibility.\n\nFlips the sign of elements of all the vectors (rows of u) such that\nthe absolute maximum element of each vector is positive.\n\nParameters\n----------\nu : ndarray\n Array with vectors as its rows.\n\nReturns\n-------\nu_flipped : ndarray with same shape as u\n Array with the sign flipped vectors as its rows.", + "description": "Modify the sign of vectors for reproducibility.\n\nFlips the sign of elements of all the vectors (rows of u) such that\nthe absolute maximum element of each vector is positive.", + "docstring": "Modify the sign of vectors for reproducibility.\n\n Flips the sign of elements of all the vectors (rows of u) such that\n the absolute maximum element of each vector is positive.\n\n Parameters\n ----------\n u : ndarray\n Array with vectors as its rows.\n\n Returns\n -------\n u_flipped : ndarray with same shape as u\n Array with the sign flipped vectors as its rows.\n ", "source_code": "\ndef _deterministic_vector_sign_flip(u):\n \"\"\"Modify the sign of vectors for reproducibility.\n\n Flips the sign of elements of all the vectors (rows of u) such that\n the absolute maximum element of each vector is positive.\n\n Parameters\n ----------\n u : ndarray\n Array with vectors as its rows.\n\n Returns\n -------\n u_flipped : ndarray with same shape as u\n Array with the sign flipped vectors as its rows.\n \"\"\"\n max_abs_rows = np.argmax(np.abs(u), axis=1)\n signs = np.sign(u[range(u.shape[0]), max_abs_rows])\n u *= signs[:, np.newaxis]\n return u" }, { @@ -173014,7 +186720,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "Data to use for variance update." - } + }, + "refined_type": {} }, { "name": "last_mean", @@ -173024,7 +186731,8 @@ "docstring": { "type": "array-like of shape (n_features,)", "description": "" - } + }, + "refined_type": {} }, { "name": "last_variance", @@ -173034,7 +186742,8 @@ "docstring": { "type": "array-like of shape (n_features,)", "description": "" - } + }, + "refined_type": {} }, { "name": "last_sample_count", @@ -173044,7 +186753,8 @@ "docstring": { "type": "array-like of shape (n_features,)", "description": "The number of samples encountered until now if sample_weight is None.\nIf sample_weight is not None, this is the sum of sample_weight\nencountered." - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -173054,13 +186764,14 @@ "docstring": { "type": "array-like of shape (n_samples,) or None", "description": "Sample weights. If None, compute the unweighted mean/variance." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Calculate mean update and a Youngs and Cramer variance update.\n\nIf sample_weight is given, the weighted mean and variance is computed. Update a given mean and (possibly) variance according to new data given in X. last_mean is always required to compute the new mean. If last_variance is None, no variance is computed and None return for updated_variance. 
From the paper \"Algorithms for computing the sample variance: analysis and recommendations\", by Chan, Golub, and LeVeque.", - "docstring": "Calculate mean update and a Youngs and Cramer variance update.\n\nIf sample_weight is given, the weighted mean and variance is computed.\n\nUpdate a given mean and (possibly) variance according to new data given\nin X. last_mean is always required to compute the new mean.\nIf last_variance is None, no variance is computed and None return for\nupdated_variance.\n\nFrom the paper \"Algorithms for computing the sample variance: analysis and\nrecommendations\", by Chan, Golub, and LeVeque.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n Data to use for variance update.\n\nlast_mean : array-like of shape (n_features,)\n\nlast_variance : array-like of shape (n_features,)\n\nlast_sample_count : array-like of shape (n_features,)\n The number of samples encountered until now if sample_weight is None.\n If sample_weight is not None, this is the sum of sample_weight\n encountered.\n\nsample_weight : array-like of shape (n_samples,) or None\n Sample weights. If None, compute the unweighted mean/variance.\n\nReturns\n-------\nupdated_mean : ndarray of shape (n_features,)\n\nupdated_variance : ndarray of shape (n_features,)\n None if last_variance was None.\n\nupdated_sample_count : ndarray of shape (n_features,)\n\nNotes\n-----\nNaNs are ignored during the algorithm.\n\nReferences\n----------\nT. Chan, G. Golub, R. LeVeque. Algorithms for computing the sample\n variance: recommendations, The American Statistician, Vol. 37, No. 3,\n pp. 242-247\n\nAlso, see the sparse implementation of this in\n`utils.sparsefuncs.incr_mean_variance_axis` and\n`utils.sparsefuncs_fast.incr_mean_variance_axis0`", + "description": "Calculate mean update and a Youngs and Cramer variance update.\n\nIf sample_weight is given, the weighted mean and variance is computed.\n\nUpdate a given mean and (possibly) variance according to new data given\nin X. last_mean is always required to compute the new mean.\nIf last_variance is None, no variance is computed and None return for\nupdated_variance.\n\nFrom the paper \"Algorithms for computing the sample variance: analysis and\nrecommendations\", by Chan, Golub, and LeVeque.", + "docstring": "Calculate mean update and a Youngs and Cramer variance update.\n\n If sample_weight is given, the weighted mean and variance is computed.\n\n Update a given mean and (possibly) variance according to new data given\n in X. last_mean is always required to compute the new mean.\n If last_variance is None, no variance is computed and None return for\n updated_variance.\n\n From the paper \"Algorithms for computing the sample variance: analysis and\n recommendations\", by Chan, Golub, and LeVeque.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Data to use for variance update.\n\n last_mean : array-like of shape (n_features,)\n\n last_variance : array-like of shape (n_features,)\n\n last_sample_count : array-like of shape (n_features,)\n The number of samples encountered until now if sample_weight is None.\n If sample_weight is not None, this is the sum of sample_weight\n encountered.\n\n sample_weight : array-like of shape (n_samples,) or None\n Sample weights. 
If None, compute the unweighted mean/variance.\n\n Returns\n -------\n updated_mean : ndarray of shape (n_features,)\n\n updated_variance : ndarray of shape (n_features,)\n None if last_variance was None.\n\n updated_sample_count : ndarray of shape (n_features,)\n\n Notes\n -----\n NaNs are ignored during the algorithm.\n\n References\n ----------\n T. Chan, G. Golub, R. LeVeque. Algorithms for computing the sample\n variance: recommendations, The American Statistician, Vol. 37, No. 3,\n pp. 242-247\n\n Also, see the sparse implementation of this in\n `utils.sparsefuncs.incr_mean_variance_axis` and\n `utils.sparsefuncs_fast.incr_mean_variance_axis0`\n ", "source_code": "\ndef _incremental_mean_and_var(X, last_mean, last_variance, last_sample_count, sample_weight=None):\n \"\"\"Calculate mean update and a Youngs and Cramer variance update.\n\n If sample_weight is given, the weighted mean and variance is computed.\n\n Update a given mean and (possibly) variance according to new data given\n in X. last_mean is always required to compute the new mean.\n If last_variance is None, no variance is computed and None return for\n updated_variance.\n\n From the paper \"Algorithms for computing the sample variance: analysis and\n recommendations\", by Chan, Golub, and LeVeque.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Data to use for variance update.\n\n last_mean : array-like of shape (n_features,)\n\n last_variance : array-like of shape (n_features,)\n\n last_sample_count : array-like of shape (n_features,)\n The number of samples encountered until now if sample_weight is None.\n If sample_weight is not None, this is the sum of sample_weight\n encountered.\n\n sample_weight : array-like of shape (n_samples,) or None\n Sample weights. If None, compute the unweighted mean/variance.\n\n Returns\n -------\n updated_mean : ndarray of shape (n_features,)\n\n updated_variance : ndarray of shape (n_features,)\n None if last_variance was None.\n\n updated_sample_count : ndarray of shape (n_features,)\n\n Notes\n -----\n NaNs are ignored during the algorithm.\n\n References\n ----------\n T. Chan, G. Golub, R. LeVeque. Algorithms for computing the sample\n variance: recommendations, The American Statistician, Vol. 37, No. 3,\n pp. 
242-247\n\n Also, see the sparse implementation of this in\n `utils.sparsefuncs.incr_mean_variance_axis` and\n `utils.sparsefuncs_fast.incr_mean_variance_axis0`\n \"\"\"\n last_sum = last_mean * last_sample_count\n X_nan_mask = np.isnan(X)\n if np.any(X_nan_mask):\n sum_op = np.nansum\n else:\n sum_op = np.sum\n if sample_weight is not None:\n if np_version >= parse_version('1.16.6'):\n new_sum = _safe_accumulator_op(np.matmul, sample_weight, np.where(X_nan_mask, 0, X))\n else:\n new_sum = _safe_accumulator_op(np.nansum, X * sample_weight[:, None], axis=0)\n new_sample_count = _safe_accumulator_op(np.sum, sample_weight[:, None] * ~X_nan_mask, axis=0)\n else:\n new_sum = _safe_accumulator_op(sum_op, X, axis=0)\n n_samples = X.shape[0]\n new_sample_count = n_samples - np.sum(X_nan_mask, axis=0)\n updated_sample_count = last_sample_count + new_sample_count\n updated_mean = (last_sum + new_sum) / updated_sample_count\n if last_variance is None:\n updated_variance = None\n else:\n T = new_sum / new_sample_count\n temp = X - T\n if sample_weight is not None:\n if np_version >= parse_version('1.16.6'):\n correction = _safe_accumulator_op(np.matmul, sample_weight, np.where(X_nan_mask, 0, temp))\n temp **= 2\n new_unnormalized_variance = _safe_accumulator_op(np.matmul, sample_weight, np.where(X_nan_mask, 0, temp))\n else:\n correction = _safe_accumulator_op(sum_op, temp * sample_weight[:, None], axis=0)\n temp *= temp\n new_unnormalized_variance = _safe_accumulator_op(sum_op, temp * sample_weight[:, None], axis=0)\n else:\n correction = _safe_accumulator_op(sum_op, temp, axis=0)\n temp **= 2\n new_unnormalized_variance = _safe_accumulator_op(sum_op, temp, axis=0)\n new_unnormalized_variance -= correction**2 / new_sample_count\n last_unnormalized_variance = last_variance * last_sample_count\n with np.errstate(divide='ignore', invalid='ignore'):\n last_over_new_count = last_sample_count / new_sample_count\n updated_unnormalized_variance = last_unnormalized_variance + new_unnormalized_variance + last_over_new_count / updated_sample_count * (last_sum / last_over_new_count - new_sum)**2\n zeros = last_sample_count == 0\n updated_unnormalized_variance[zeros] = new_unnormalized_variance[zeros]\n updated_variance = updated_unnormalized_variance / updated_sample_count\n return updated_mean, updated_variance, updated_sample_count" }, { @@ -173078,7 +186789,8 @@ "docstring": { "type": "ndarray or sparse matrix", "description": "Matrix to decompose, it should be real symmetric square or complex\nhermitian" - } + }, + "refined_type": {} }, { "name": "n_components", @@ -173088,7 +186800,8 @@ "docstring": { "type": "int", "description": "Number of eigenvalues and vectors to extract." - } + }, + "refined_type": {} }, { "name": "n_oversamples", @@ -173098,7 +186811,8 @@ "docstring": { "type": "int, default=10", "description": "Additional number of random vectors to sample the range of M so as\nto ensure proper conditioning. The total number of random vectors\nused to find the range of M is n_components + n_oversamples. Smaller\nnumber can improve speed but can negatively impact the quality of\napproximation of eigenvectors and eigenvalues. Users might wish\nto increase this parameter up to `2*k - n_components` where k is the\neffective rank, for large matrices, noisy problems, matrices with\nslowly decaying spectrums, or to increase precision accuracy. See Halko\net al (pages 5, 23 and 26)." 
- } + }, + "refined_type": {} }, { "name": "n_iter", @@ -173108,7 +186822,8 @@ "docstring": { "type": "int or 'auto', default='auto'", "description": "Number of power iterations. It can be used to deal with very noisy\nproblems. When 'auto', it is set to 4, unless `n_components` is small\n(< .1 * min(X.shape)) in which case `n_iter` is set to 7.\nThis improves precision with few components. Note that in general\nusers should rather increase `n_oversamples` before increasing `n_iter`\nas the principle of the randomized method is to avoid usage of these\nmore costly power iterations steps. When `n_components` is equal\nor greater to the effective matrix rank and the spectrum does not\npresent a slow decay, `n_iter=0` or `1` should even work fine in theory\n(see Halko et al paper, page 9)." - } + }, + "refined_type": {} }, { "name": "power_iteration_normalizer", @@ -173118,6 +186833,10 @@ "docstring": { "type": "{'auto', 'QR', 'LU', 'none'}, default='auto'", "description": "Whether the power iterations are normalized with step-by-step\nQR factorization (the slowest but most accurate), 'none'\n(the fastest but numerically unstable when `n_iter` is large, e.g.\ntypically 5 or larger), or 'LU' factorization (numerically stable\nbut can lose slightly in accuracy). The 'auto' mode applies no\nnormalization if `n_iter` <= 2 and switches to LU otherwise." + }, + "refined_type": { + "kind": "EnumType", + "values": ["auto", "none", "QR", "LU"] } }, { @@ -173128,6 +186847,10 @@ "docstring": { "type": "{'value', 'module'}, default='module'", "description": "Strategy used to select the n components. When `selection` is `'value'`\n(not yet implemented, will become the default when implemented), the\ncomponents corresponding to the n largest eigenvalues are returned.\nWhen `selection` is `'module'`, the components corresponding to the n\neigenvalues with largest modules are returned." + }, + "refined_type": { + "kind": "EnumType", + "values": ["module", "value"] } }, { @@ -173138,13 +186861,14 @@ "docstring": { "type": "int, RandomState instance, default=None", "description": "The seed of the pseudo random number generator to use when shuffling\nthe data, i.e. getting the random vectors to initialize the algorithm.\nPass an int for reproducible results across multiple function calls.\nSee :term:`Glossary `." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Computes a truncated eigendecomposition using randomized methods\n\nThis method solves the fixed-rank approximation problem described in the Halko et al paper. The choice of which components to select can be tuned with the `selection` parameter. .. versionadded:: 0.24", - "docstring": "Computes a truncated eigendecomposition using randomized methods\n\nThis method solves the fixed-rank approximation problem described in the\nHalko et al paper.\n\nThe choice of which components to select can be tuned with the `selection`\nparameter.\n\n.. versionadded:: 0.24\n\nParameters\n----------\nM : ndarray or sparse matrix\n Matrix to decompose, it should be real symmetric square or complex\n hermitian\n\nn_components : int\n Number of eigenvalues and vectors to extract.\n\nn_oversamples : int, default=10\n Additional number of random vectors to sample the range of M so as\n to ensure proper conditioning. The total number of random vectors\n used to find the range of M is n_components + n_oversamples. Smaller\n number can improve speed but can negatively impact the quality of\n approximation of eigenvectors and eigenvalues. 
Users might wish\n to increase this parameter up to `2*k - n_components` where k is the\n effective rank, for large matrices, noisy problems, matrices with\n slowly decaying spectrums, or to increase precision accuracy. See Halko\n et al (pages 5, 23 and 26).\n\nn_iter : int or 'auto', default='auto'\n Number of power iterations. It can be used to deal with very noisy\n problems. When 'auto', it is set to 4, unless `n_components` is small\n (< .1 * min(X.shape)) in which case `n_iter` is set to 7.\n This improves precision with few components. Note that in general\n users should rather increase `n_oversamples` before increasing `n_iter`\n as the principle of the randomized method is to avoid usage of these\n more costly power iterations steps. When `n_components` is equal\n or greater to the effective matrix rank and the spectrum does not\n present a slow decay, `n_iter=0` or `1` should even work fine in theory\n (see Halko et al paper, page 9).\n\npower_iteration_normalizer : {'auto', 'QR', 'LU', 'none'}, default='auto'\n Whether the power iterations are normalized with step-by-step\n QR factorization (the slowest but most accurate), 'none'\n (the fastest but numerically unstable when `n_iter` is large, e.g.\n typically 5 or larger), or 'LU' factorization (numerically stable\n but can lose slightly in accuracy). The 'auto' mode applies no\n normalization if `n_iter` <= 2 and switches to LU otherwise.\n\nselection : {'value', 'module'}, default='module'\n Strategy used to select the n components. When `selection` is `'value'`\n (not yet implemented, will become the default when implemented), the\n components corresponding to the n largest eigenvalues are returned.\n When `selection` is `'module'`, the components corresponding to the n\n eigenvalues with largest modules are returned.\n\nrandom_state : int, RandomState instance, default=None\n The seed of the pseudo random number generator to use when shuffling\n the data, i.e. getting the random vectors to initialize the algorithm.\n Pass an int for reproducible results across multiple function calls.\n See :term:`Glossary `.\n\nNotes\n-----\nThis algorithm finds a (usually very good) approximate truncated\neigendecomposition using randomized methods to speed up the computations.\n\nThis method is particularly fast on large matrices on which\nyou wish to extract only a small number of components. In order to\nobtain further speed up, `n_iter` can be set <=2 (at the cost of\nloss of precision). To increase the precision it is recommended to\nincrease `n_oversamples`, up to `2*k-n_components` where k is the\neffective rank. Usually, `n_components` is chosen to be greater than k\nso increasing `n_oversamples` up to `n_components` should be enough.\n\nStrategy 'value': not implemented yet.\nAlgorithms 5.3, 5.4 and 5.5 in the Halko et al paper should provide good\ncondidates for a future implementation.\n\nStrategy 'module':\nThe principle is that for diagonalizable matrices, the singular values and\neigenvalues are related: if t is an eigenvalue of A, then :math:`|t|` is a\nsingular value of A. 
This method relies on a randomized SVD to find the n\nsingular components corresponding to the n singular values with largest\nmodules, and then uses the signs of the singular vectors to find the true\nsign of t: if the sign of left and right singular vectors are different\nthen the corresponding eigenvalue is negative.\n\nReturns\n-------\neigvals : 1D array of shape (n_components,) containing the `n_components`\n eigenvalues selected (see ``selection`` parameter).\neigvecs : 2D array of shape (M.shape[0], n_components) containing the\n `n_components` eigenvectors corresponding to the `eigvals`, in the\n corresponding order. Note that this follows the `scipy.linalg.eigh`\n convention.\n\nSee Also\n--------\n:func:`randomized_svd`\n\nReferences\n----------\n* Finding structure with randomness: Stochastic algorithms for constructing\n approximate matrix decompositions (Algorithm 4.3 for strategy 'module')\n Halko, et al., 2009 https://arxiv.org/abs/0909.4061", + "description": "Computes a truncated eigendecomposition using randomized methods\n\nThis method solves the fixed-rank approximation problem described in the\nHalko et al paper.\n\nThe choice of which components to select can be tuned with the `selection`\nparameter.\n\n.. versionadded:: 0.24", + "docstring": "Computes a truncated eigendecomposition using randomized methods\n\n This method solves the fixed-rank approximation problem described in the\n Halko et al paper.\n\n The choice of which components to select can be tuned with the `selection`\n parameter.\n\n .. versionadded:: 0.24\n\n Parameters\n ----------\n M : ndarray or sparse matrix\n Matrix to decompose, it should be real symmetric square or complex\n hermitian\n\n n_components : int\n Number of eigenvalues and vectors to extract.\n\n n_oversamples : int, default=10\n Additional number of random vectors to sample the range of M so as\n to ensure proper conditioning. The total number of random vectors\n used to find the range of M is n_components + n_oversamples. Smaller\n number can improve speed but can negatively impact the quality of\n approximation of eigenvectors and eigenvalues. Users might wish\n to increase this parameter up to `2*k - n_components` where k is the\n effective rank, for large matrices, noisy problems, matrices with\n slowly decaying spectrums, or to increase precision accuracy. See Halko\n et al (pages 5, 23 and 26).\n\n n_iter : int or 'auto', default='auto'\n Number of power iterations. It can be used to deal with very noisy\n problems. When 'auto', it is set to 4, unless `n_components` is small\n (< .1 * min(X.shape)) in which case `n_iter` is set to 7.\n This improves precision with few components. Note that in general\n users should rather increase `n_oversamples` before increasing `n_iter`\n as the principle of the randomized method is to avoid usage of these\n more costly power iterations steps. When `n_components` is equal\n or greater to the effective matrix rank and the spectrum does not\n present a slow decay, `n_iter=0` or `1` should even work fine in theory\n (see Halko et al paper, page 9).\n\n power_iteration_normalizer : {'auto', 'QR', 'LU', 'none'}, default='auto'\n Whether the power iterations are normalized with step-by-step\n QR factorization (the slowest but most accurate), 'none'\n (the fastest but numerically unstable when `n_iter` is large, e.g.\n typically 5 or larger), or 'LU' factorization (numerically stable\n but can lose slightly in accuracy). 
The 'auto' mode applies no\n normalization if `n_iter` <= 2 and switches to LU otherwise.\n\n selection : {'value', 'module'}, default='module'\n Strategy used to select the n components. When `selection` is `'value'`\n (not yet implemented, will become the default when implemented), the\n components corresponding to the n largest eigenvalues are returned.\n When `selection` is `'module'`, the components corresponding to the n\n eigenvalues with largest modules are returned.\n\n random_state : int, RandomState instance, default=None\n The seed of the pseudo random number generator to use when shuffling\n the data, i.e. getting the random vectors to initialize the algorithm.\n Pass an int for reproducible results across multiple function calls.\n See :term:`Glossary `.\n\n Notes\n -----\n This algorithm finds a (usually very good) approximate truncated\n eigendecomposition using randomized methods to speed up the computations.\n\n This method is particularly fast on large matrices on which\n you wish to extract only a small number of components. In order to\n obtain further speed up, `n_iter` can be set <=2 (at the cost of\n loss of precision). To increase the precision it is recommended to\n increase `n_oversamples`, up to `2*k-n_components` where k is the\n effective rank. Usually, `n_components` is chosen to be greater than k\n so increasing `n_oversamples` up to `n_components` should be enough.\n\n Strategy 'value': not implemented yet.\n Algorithms 5.3, 5.4 and 5.5 in the Halko et al paper should provide good\n condidates for a future implementation.\n\n Strategy 'module':\n The principle is that for diagonalizable matrices, the singular values and\n eigenvalues are related: if t is an eigenvalue of A, then :math:`|t|` is a\n singular value of A. This method relies on a randomized SVD to find the n\n singular components corresponding to the n singular values with largest\n modules, and then uses the signs of the singular vectors to find the true\n sign of t: if the sign of left and right singular vectors are different\n then the corresponding eigenvalue is negative.\n\n Returns\n -------\n eigvals : 1D array of shape (n_components,) containing the `n_components`\n eigenvalues selected (see ``selection`` parameter).\n eigvecs : 2D array of shape (M.shape[0], n_components) containing the\n `n_components` eigenvectors corresponding to the `eigvals`, in the\n corresponding order. Note that this follows the `scipy.linalg.eigh`\n convention.\n\n See Also\n --------\n :func:`randomized_svd`\n\n References\n ----------\n * Finding structure with randomness: Stochastic algorithms for constructing\n approximate matrix decompositions (Algorithm 4.3 for strategy 'module')\n Halko, et al., 2009 https://arxiv.org/abs/0909.4061\n\n ", "source_code": "\ndef _randomized_eigsh(M, n_components, *, n_oversamples=10, n_iter='auto', power_iteration_normalizer='auto', selection='module', random_state=None):\n \"\"\"Computes a truncated eigendecomposition using randomized methods\n\n This method solves the fixed-rank approximation problem described in the\n Halko et al paper.\n\n The choice of which components to select can be tuned with the `selection`\n parameter.\n\n .. 
versionadded:: 0.24\n\n Parameters\n ----------\n M : ndarray or sparse matrix\n Matrix to decompose, it should be real symmetric square or complex\n hermitian\n\n n_components : int\n Number of eigenvalues and vectors to extract.\n\n n_oversamples : int, default=10\n Additional number of random vectors to sample the range of M so as\n to ensure proper conditioning. The total number of random vectors\n used to find the range of M is n_components + n_oversamples. Smaller\n number can improve speed but can negatively impact the quality of\n approximation of eigenvectors and eigenvalues. Users might wish\n to increase this parameter up to `2*k - n_components` where k is the\n effective rank, for large matrices, noisy problems, matrices with\n slowly decaying spectrums, or to increase precision accuracy. See Halko\n et al (pages 5, 23 and 26).\n\n n_iter : int or 'auto', default='auto'\n Number of power iterations. It can be used to deal with very noisy\n problems. When 'auto', it is set to 4, unless `n_components` is small\n (< .1 * min(X.shape)) in which case `n_iter` is set to 7.\n This improves precision with few components. Note that in general\n users should rather increase `n_oversamples` before increasing `n_iter`\n as the principle of the randomized method is to avoid usage of these\n more costly power iterations steps. When `n_components` is equal\n or greater to the effective matrix rank and the spectrum does not\n present a slow decay, `n_iter=0` or `1` should even work fine in theory\n (see Halko et al paper, page 9).\n\n power_iteration_normalizer : {'auto', 'QR', 'LU', 'none'}, default='auto'\n Whether the power iterations are normalized with step-by-step\n QR factorization (the slowest but most accurate), 'none'\n (the fastest but numerically unstable when `n_iter` is large, e.g.\n typically 5 or larger), or 'LU' factorization (numerically stable\n but can lose slightly in accuracy). The 'auto' mode applies no\n normalization if `n_iter` <= 2 and switches to LU otherwise.\n\n selection : {'value', 'module'}, default='module'\n Strategy used to select the n components. When `selection` is `'value'`\n (not yet implemented, will become the default when implemented), the\n components corresponding to the n largest eigenvalues are returned.\n When `selection` is `'module'`, the components corresponding to the n\n eigenvalues with largest modules are returned.\n\n random_state : int, RandomState instance, default=None\n The seed of the pseudo random number generator to use when shuffling\n the data, i.e. getting the random vectors to initialize the algorithm.\n Pass an int for reproducible results across multiple function calls.\n See :term:`Glossary `.\n\n Notes\n -----\n This algorithm finds a (usually very good) approximate truncated\n eigendecomposition using randomized methods to speed up the computations.\n\n This method is particularly fast on large matrices on which\n you wish to extract only a small number of components. In order to\n obtain further speed up, `n_iter` can be set <=2 (at the cost of\n loss of precision). To increase the precision it is recommended to\n increase `n_oversamples`, up to `2*k-n_components` where k is the\n effective rank. 
Usually, `n_components` is chosen to be greater than k\n so increasing `n_oversamples` up to `n_components` should be enough.\n\n Strategy 'value': not implemented yet.\n Algorithms 5.3, 5.4 and 5.5 in the Halko et al paper should provide good\n condidates for a future implementation.\n\n Strategy 'module':\n The principle is that for diagonalizable matrices, the singular values and\n eigenvalues are related: if t is an eigenvalue of A, then :math:`|t|` is a\n singular value of A. This method relies on a randomized SVD to find the n\n singular components corresponding to the n singular values with largest\n modules, and then uses the signs of the singular vectors to find the true\n sign of t: if the sign of left and right singular vectors are different\n then the corresponding eigenvalue is negative.\n\n Returns\n -------\n eigvals : 1D array of shape (n_components,) containing the `n_components`\n eigenvalues selected (see ``selection`` parameter).\n eigvecs : 2D array of shape (M.shape[0], n_components) containing the\n `n_components` eigenvectors corresponding to the `eigvals`, in the\n corresponding order. Note that this follows the `scipy.linalg.eigh`\n convention.\n\n See Also\n --------\n :func:`randomized_svd`\n\n References\n ----------\n * Finding structure with randomness: Stochastic algorithms for constructing\n approximate matrix decompositions (Algorithm 4.3 for strategy 'module')\n Halko, et al., 2009 https://arxiv.org/abs/0909.4061\n\n \"\"\"\n if selection == 'value':\n raise NotImplementedError()\n elif selection == 'module':\n (U, S, Vt) = randomized_svd(M, n_components=n_components, n_oversamples=n_oversamples, n_iter=n_iter, power_iteration_normalizer=power_iteration_normalizer, flip_sign=False, random_state=random_state)\n eigvecs = U[:, :n_components]\n eigvals = S[:n_components]\n diag_VtU = np.einsum('ji,ij->j', Vt[:n_components, :], U[:, :n_components])\n signs = np.sign(diag_VtU)\n eigvals = eigvals * signs\n else:\n raise ValueError('Invalid `selection`: %r' % selection)\n return eigvals, eigvecs" }, { @@ -173162,7 +186886,8 @@ "docstring": { "type": "function", "description": "A numpy accumulator function such as np.mean or np.sum." - } + }, + "refined_type": {} }, { "name": "x", @@ -173172,13 +186897,14 @@ "docstring": { "type": "ndarray", "description": "A numpy array to apply the accumulator function." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "This function provides numpy accumulator functions with a float64 dtype when used on a floating point input. This prevents accumulator overflow on smaller floating point dtypes.", - "docstring": "This function provides numpy accumulator functions with a float64 dtype\nwhen used on a floating point input. This prevents accumulator overflow on\nsmaller floating point dtypes.\n\nParameters\n----------\nop : function\n A numpy accumulator function such as np.mean or np.sum.\nx : ndarray\n A numpy array to apply the accumulator function.\n*args : positional arguments\n Positional arguments passed to the accumulator function after the\n input x.\n**kwargs : keyword arguments\n Keyword arguments passed to the accumulator function.\n\nReturns\n-------\nresult\n The output of the accumulator function passed to this function.", + "description": "This function provides numpy accumulator functions with a float64 dtype\nwhen used on a floating point input. 
This prevents accumulator overflow on\nsmaller floating point dtypes.", + "docstring": "\n This function provides numpy accumulator functions with a float64 dtype\n when used on a floating point input. This prevents accumulator overflow on\n smaller floating point dtypes.\n\n Parameters\n ----------\n op : function\n A numpy accumulator function such as np.mean or np.sum.\n x : ndarray\n A numpy array to apply the accumulator function.\n *args : positional arguments\n Positional arguments passed to the accumulator function after the\n input x.\n **kwargs : keyword arguments\n Keyword arguments passed to the accumulator function.\n\n Returns\n -------\n result\n The output of the accumulator function passed to this function.\n ", "source_code": "\ndef _safe_accumulator_op(op, x, *args, **kwargs):\n \"\"\"\n This function provides numpy accumulator functions with a float64 dtype\n when used on a floating point input. This prevents accumulator overflow on\n smaller floating point dtypes.\n\n Parameters\n ----------\n op : function\n A numpy accumulator function such as np.mean or np.sum.\n x : ndarray\n A numpy array to apply the accumulator function.\n *args : positional arguments\n Positional arguments passed to the accumulator function after the\n input x.\n **kwargs : keyword arguments\n Keyword arguments passed to the accumulator function.\n\n Returns\n -------\n result\n The output of the accumulator function passed to this function.\n \"\"\"\n if np.issubdtype(x.dtype, np.floating) and x.dtype.itemsize < 8:\n result = op(x, *args, **kwargs, dtype=np.float64)\n else:\n result = op(x, *args, **kwargs)\n return result" }, { @@ -173196,7 +186922,8 @@ "docstring": { "type": "list of array-like", "description": "1-D arrays to form the cartesian product of." - } + }, + "refined_type": {} }, { "name": "out", @@ -173204,16 +186931,17 @@ "is_public": true, "assigned_by": "POSITION_OR_NAME", "docstring": { - "type": "ndarray, default=None", + "type": "ndarray of shape (M, len(arrays)), default=None", "description": "Array to place the cartesian product in." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Generate a cartesian product of input arrays.", - "docstring": "Generate a cartesian product of input arrays.\n\nParameters\n----------\narrays : list of array-like\n 1-D arrays to form the cartesian product of.\nout : ndarray, default=None\n Array to place the cartesian product in.\n\nReturns\n-------\nout : ndarray\n 2-D array of shape (M, len(arrays)) containing cartesian products\n formed of input arrays.\n\nExamples\n--------\n>>> from sklearn.utils.extmath import cartesian\n>>> cartesian(([1, 2, 3], [4, 5], [6, 7]))\narray([[1, 4, 6],\n [1, 4, 7],\n [1, 5, 6],\n [1, 5, 7],\n [2, 4, 6],\n [2, 4, 7],\n [2, 5, 6],\n [2, 5, 7],\n [3, 4, 6],\n [3, 4, 7],\n [3, 5, 6],\n [3, 5, 7]])\n\nNotes\n-----\nThis function may not be used on more than 32 arrays\nbecause the underlying numpy functions do not support it.", - "source_code": "\ndef cartesian(arrays, out=None):\n \"\"\"Generate a cartesian product of input arrays.\n\n Parameters\n ----------\n arrays : list of array-like\n 1-D arrays to form the cartesian product of.\n out : ndarray, default=None\n Array to place the cartesian product in.\n\n Returns\n -------\n out : ndarray\n 2-D array of shape (M, len(arrays)) containing cartesian products\n formed of input arrays.\n\n Examples\n --------\n >>> from sklearn.utils.extmath import cartesian\n >>> cartesian(([1, 2, 3], [4, 5], [6, 7]))\n array([[1, 4, 6],\n [1, 4, 7],\n [1, 5, 6],\n [1, 5, 7],\n [2, 4, 6],\n [2, 4, 7],\n [2, 5, 6],\n [2, 5, 7],\n [3, 4, 6],\n [3, 4, 7],\n [3, 5, 6],\n [3, 5, 7]])\n\n Notes\n -----\n This function may not be used on more than 32 arrays\n because the underlying numpy functions do not support it.\n \"\"\"\n arrays = [np.asarray(x) for x in arrays]\n shape = (len(x) for x in arrays)\n dtype = arrays[0].dtype\n ix = np.indices(shape)\n ix = ix.reshape(len(arrays), -1).T\n if out is None:\n out = np.empty_like(ix, dtype=dtype)\n for (n, arr) in enumerate(arrays):\n out[:, n] = arrays[n][ix[:, n]]\n return out" + "docstring": "Generate a cartesian product of input arrays.\n\n Parameters\n ----------\n arrays : list of array-like\n 1-D arrays to form the cartesian product of.\n out : ndarray of shape (M, len(arrays)), default=None\n Array to place the cartesian product in.\n\n Returns\n -------\n out : ndarray of shape (M, len(arrays))\n Array containing the cartesian products formed of input arrays.\n\n Notes\n -----\n This function may not be used on more than 32 arrays\n because the underlying numpy functions do not support it.\n\n Examples\n --------\n >>> from sklearn.utils.extmath import cartesian\n >>> cartesian(([1, 2, 3], [4, 5], [6, 7]))\n array([[1, 4, 6],\n [1, 4, 7],\n [1, 5, 6],\n [1, 5, 7],\n [2, 4, 6],\n [2, 4, 7],\n [2, 5, 6],\n [2, 5, 7],\n [3, 4, 6],\n [3, 4, 7],\n [3, 5, 6],\n [3, 5, 7]])\n ", + "source_code": "\ndef cartesian(arrays, out=None):\n \"\"\"Generate a cartesian product of input arrays.\n\n Parameters\n ----------\n arrays : list of array-like\n 1-D arrays to form the cartesian product of.\n out : ndarray of shape (M, len(arrays)), default=None\n Array to place the cartesian product in.\n\n Returns\n -------\n out : ndarray of shape (M, len(arrays))\n Array containing the cartesian products formed of input arrays.\n\n Notes\n -----\n This function may not be used on more than 32 arrays\n because the underlying numpy functions do not support it.\n\n Examples\n --------\n >>> from sklearn.utils.extmath import cartesian\n >>> cartesian(([1, 2, 3], [4, 
5], [6, 7]))\n array([[1, 4, 6],\n [1, 4, 7],\n [1, 5, 6],\n [1, 5, 7],\n [2, 4, 6],\n [2, 4, 7],\n [2, 5, 6],\n [2, 5, 7],\n [3, 4, 6],\n [3, 4, 7],\n [3, 5, 6],\n [3, 5, 7]])\n \"\"\"\n arrays = [np.asarray(x) for x in arrays]\n shape = (len(x) for x in arrays)\n dtype = arrays[0].dtype\n ix = np.indices(shape)\n ix = ix.reshape(len(arrays), -1).T\n if out is None:\n out = np.empty_like(ix, dtype=dtype)\n for (n, arr) in enumerate(arrays):\n out[:, n] = arrays[n][ix[:, n]]\n return out" }, { "name": "density", @@ -173230,13 +186958,14 @@ "docstring": { "type": "array-like", "description": "The sparse vector." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Compute density of a sparse vector.", - "docstring": "Compute density of a sparse vector.\n\nParameters\n----------\nw : array-like\n The sparse vector.\n\nReturns\n-------\nfloat\n The density of w, between 0 and 1.", + "docstring": "Compute density of a sparse vector.\n\n Parameters\n ----------\n w : array-like\n The sparse vector.\n\n Returns\n -------\n float\n The density of w, between 0 and 1.\n ", "source_code": "\ndef density(w, **kwargs):\n \"\"\"Compute density of a sparse vector.\n\n Parameters\n ----------\n w : array-like\n The sparse vector.\n\n Returns\n -------\n float\n The density of w, between 0 and 1.\n \"\"\"\n if hasattr(w, 'toarray'):\n d = float(w.nnz) / (w.shape[0] * w.shape[1])\n else:\n d = 0 if w is None else float((w != 0).sum()) / w.size\n return d" }, { @@ -173254,13 +186983,14 @@ "docstring": { "type": "array-like", "description": "The matrix." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Compute log(det(A)) for A symmetric.\n\nEquivalent to : np.log(nl.det(A)) but more robust. It returns -Inf if det(A) is non positive or is not defined.", - "docstring": "Compute log(det(A)) for A symmetric.\n\nEquivalent to : np.log(nl.det(A)) but more robust.\nIt returns -Inf if det(A) is non positive or is not defined.\n\nParameters\n----------\nA : array-like\n The matrix.", + "description": "Compute log(det(A)) for A symmetric.\n\nEquivalent to : np.log(nl.det(A)) but more robust.\nIt returns -Inf if det(A) is non positive or is not defined.", + "docstring": "Compute log(det(A)) for A symmetric.\n\n Equivalent to : np.log(nl.det(A)) but more robust.\n It returns -Inf if det(A) is non positive or is not defined.\n\n Parameters\n ----------\n A : array-like\n The matrix.\n ", "source_code": "\ndef fast_logdet(A):\n \"\"\"Compute log(det(A)) for A symmetric.\n\n Equivalent to : np.log(nl.det(A)) but more robust.\n It returns -Inf if det(A) is non positive or is not defined.\n\n Parameters\n ----------\n A : array-like\n The matrix.\n \"\"\"\n (sign, ld) = np.linalg.slogdet(A)\n if not sign > 0:\n return -np.inf\n return ld" }, { @@ -173278,7 +187008,8 @@ "docstring": { "type": "array-like of shape (M, N) or (M,)", "description": "Argument to the logistic function." - } + }, + "refined_type": {} }, { "name": "out", @@ -173288,13 +187019,14 @@ "docstring": { "type": "array-like of shape (M, N) or (M,), default=None", "description": "Preallocated output array." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Compute the log of the logistic function, ``log(1 / (1 + e ** -x))``.\n\nThis implementation is numerically stable because it splits positive and negative values:: -log(1 + exp(-x_i)) if x_i > 0 x_i - log(1 + exp(x_i)) if x_i <= 0 For the ordinary logistic function, use ``scipy.special.expit``.", - "docstring": "Compute the log of the logistic function, ``log(1 / (1 + e ** -x))``.\n\nThis implementation is numerically stable because it splits positive and\nnegative values::\n\n -log(1 + exp(-x_i)) if x_i > 0\n x_i - log(1 + exp(x_i)) if x_i <= 0\n\nFor the ordinary logistic function, use ``scipy.special.expit``.\n\nParameters\n----------\nX : array-like of shape (M, N) or (M,)\n Argument to the logistic function.\n\nout : array-like of shape (M, N) or (M,), default=None\n Preallocated output array.\n\nReturns\n-------\nout : ndarray of shape (M, N) or (M,)\n Log of the logistic function evaluated at every point in x.\n\nNotes\n-----\nSee the blog post describing this implementation:\nhttp://fa.bianp.net/blog/2013/numerical-optimizers-for-logistic-regression/", + "description": "Compute the log of the logistic function, ``log(1 / (1 + e ** -x))``.\n\nThis implementation is numerically stable because it splits positive and\nnegative values::\n\n -log(1 + exp(-x_i)) if x_i > 0\n x_i - log(1 + exp(x_i)) if x_i <= 0\n\nFor the ordinary logistic function, use ``scipy.special.expit``.", + "docstring": "Compute the log of the logistic function, ``log(1 / (1 + e ** -x))``.\n\n This implementation is numerically stable because it splits positive and\n negative values::\n\n -log(1 + exp(-x_i)) if x_i > 0\n x_i - log(1 + exp(x_i)) if x_i <= 0\n\n For the ordinary logistic function, use ``scipy.special.expit``.\n\n Parameters\n ----------\n X : array-like of shape (M, N) or (M,)\n Argument to the logistic function.\n\n out : array-like of shape (M, N) or (M,), default=None\n Preallocated output array.\n\n Returns\n -------\n out : ndarray of shape (M, N) or (M,)\n Log of the logistic function evaluated at every point in x.\n\n Notes\n -----\n See the blog post describing this implementation:\n http://fa.bianp.net/blog/2013/numerical-optimizers-for-logistic-regression/\n ", "source_code": "\ndef log_logistic(X, out=None):\n \"\"\"Compute the log of the logistic function, ``log(1 / (1 + e ** -x))``.\n\n This implementation is numerically stable because it splits positive and\n negative values::\n\n -log(1 + exp(-x_i)) if x_i > 0\n x_i - log(1 + exp(x_i)) if x_i <= 0\n\n For the ordinary logistic function, use ``scipy.special.expit``.\n\n Parameters\n ----------\n X : array-like of shape (M, N) or (M,)\n Argument to the logistic function.\n\n out : array-like of shape (M, N) or (M,), default=None\n Preallocated output array.\n\n Returns\n -------\n out : ndarray of shape (M, N) or (M,)\n Log of the logistic function evaluated at every point in x.\n\n Notes\n -----\n See the blog post describing this implementation:\n http://fa.bianp.net/blog/2013/numerical-optimizers-for-logistic-regression/\n \"\"\"\n is_1d = X.ndim == 1\n X = np.atleast_2d(X)\n X = check_array(X, dtype=np.float64)\n (n_samples, n_features) = X.shape\n if out is None:\n out = np.empty_like(X)\n _log_logistic_sigmoid(n_samples, n_features, X, out)\n if is_1d:\n return np.squeeze(out)\n return out" }, { @@ -173312,7 +187044,8 @@ "docstring": { "type": "array-like", "description": "The matrix to make non-negative." 
- } + }, + "refined_type": {} }, { "name": "min_value", @@ -173322,13 +187055,14 @@ "docstring": { "type": "float, default=0", "description": "The threshold value." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Ensure `X.min()` >= `min_value`.", - "docstring": "Ensure `X.min()` >= `min_value`.\n\nParameters\n----------\nX : array-like\n The matrix to make non-negative.\nmin_value : float, default=0\n The threshold value.\n\nReturns\n-------\narray-like\n The thresholded array.\n\nRaises\n------\nValueError\n When X is sparse.", + "docstring": "Ensure `X.min()` >= `min_value`.\n\n Parameters\n ----------\n X : array-like\n The matrix to make non-negative.\n min_value : float, default=0\n The threshold value.\n\n Returns\n -------\n array-like\n The thresholded array.\n\n Raises\n ------\n ValueError\n When X is sparse.\n ", "source_code": "\ndef make_nonnegative(X, min_value=0):\n \"\"\"Ensure `X.min()` >= `min_value`.\n\n Parameters\n ----------\n X : array-like\n The matrix to make non-negative.\n min_value : float, default=0\n The threshold value.\n\n Returns\n -------\n array-like\n The thresholded array.\n\n Raises\n ------\n ValueError\n When X is sparse.\n \"\"\"\n min_ = X.min()\n if min_ < min_value:\n if sparse.issparse(X):\n raise ValueError('Cannot make the data matrix nonnegative because it is sparse. Adding a value to every entry would make it no longer sparse.')\n X = X + (min_value - min_)\n return X" }, { @@ -173346,7 +187080,8 @@ "docstring": { "type": "2D array", "description": "The input data matrix." - } + }, + "refined_type": {} }, { "name": "size", @@ -173356,7 +187091,8 @@ "docstring": { "type": "int", "description": "Size of the return array." - } + }, + "refined_type": {} }, { "name": "n_iter", @@ -173366,7 +187102,8 @@ "docstring": { "type": "int", "description": "Number of power iterations used to stabilize the result." - } + }, + "refined_type": {} }, { "name": "power_iteration_normalizer", @@ -173376,6 +187113,10 @@ "docstring": { "type": "{'auto', 'QR', 'LU', 'none'}, default='auto'", "description": "Whether the power iterations are normalized with step-by-step\nQR factorization (the slowest but most accurate), 'none'\n(the fastest but numerically unstable when `n_iter` is large, e.g.\ntypically 5 or larger), or 'LU' factorization (numerically stable\nbut can lose slightly in accuracy). The 'auto' mode applies no\nnormalization if `n_iter` <= 2 and switches to LU otherwise.\n\n.. versionadded:: 0.18" + }, + "refined_type": { + "kind": "EnumType", + "values": ["auto", "none", "QR", "LU"] } }, { @@ -173386,14 +187127,15 @@ "docstring": { "type": "int, RandomState instance or None, default=None", "description": "The seed of the pseudo random number generator to use when shuffling\nthe data, i.e. getting the random vectors to initialize the algorithm.\nPass an int for reproducible results across multiple function calls.\nSee :term:`Glossary `." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Computes an orthonormal matrix whose range approximates the range of A.", - "docstring": "Computes an orthonormal matrix whose range approximates the range of A.\n\nParameters\n----------\nA : 2D array\n The input data matrix.\n\nsize : int\n Size of the return array.\n\nn_iter : int\n Number of power iterations used to stabilize the result.\n\npower_iteration_normalizer : {'auto', 'QR', 'LU', 'none'}, default='auto'\n Whether the power iterations are normalized with step-by-step\n QR factorization (the slowest but most accurate), 'none'\n (the fastest but numerically unstable when `n_iter` is large, e.g.\n typically 5 or larger), or 'LU' factorization (numerically stable\n but can lose slightly in accuracy). The 'auto' mode applies no\n normalization if `n_iter` <= 2 and switches to LU otherwise.\n\n .. versionadded:: 0.18\n\nrandom_state : int, RandomState instance or None, default=None\n The seed of the pseudo random number generator to use when shuffling\n the data, i.e. getting the random vectors to initialize the algorithm.\n Pass an int for reproducible results across multiple function calls.\n See :term:`Glossary `.\n\nReturns\n-------\nQ : ndarray\n A (size x size) projection matrix, the range of which\n approximates well the range of the input matrix A.\n\nNotes\n-----\n\nFollows Algorithm 4.3 of\nFinding structure with randomness: Stochastic algorithms for constructing\napproximate matrix decompositions\nHalko, et al., 2009 (arXiv:909) https://arxiv.org/pdf/0909.4061.pdf\n\nAn implementation of a randomized algorithm for principal component\nanalysis\nA. Szlam et al. 2014", - "source_code": "\ndef randomized_range_finder(A, *, size, n_iter, power_iteration_normalizer='auto', random_state=None):\n \"\"\"Computes an orthonormal matrix whose range approximates the range of A.\n\n Parameters\n ----------\n A : 2D array\n The input data matrix.\n\n size : int\n Size of the return array.\n\n n_iter : int\n Number of power iterations used to stabilize the result.\n\n power_iteration_normalizer : {'auto', 'QR', 'LU', 'none'}, default='auto'\n Whether the power iterations are normalized with step-by-step\n QR factorization (the slowest but most accurate), 'none'\n (the fastest but numerically unstable when `n_iter` is large, e.g.\n typically 5 or larger), or 'LU' factorization (numerically stable\n but can lose slightly in accuracy). The 'auto' mode applies no\n normalization if `n_iter` <= 2 and switches to LU otherwise.\n\n .. versionadded:: 0.18\n\n random_state : int, RandomState instance or None, default=None\n The seed of the pseudo random number generator to use when shuffling\n the data, i.e. getting the random vectors to initialize the algorithm.\n Pass an int for reproducible results across multiple function calls.\n See :term:`Glossary `.\n\n Returns\n -------\n Q : ndarray\n A (size x size) projection matrix, the range of which\n approximates well the range of the input matrix A.\n\n Notes\n -----\n\n Follows Algorithm 4.3 of\n Finding structure with randomness: Stochastic algorithms for constructing\n approximate matrix decompositions\n Halko, et al., 2009 (arXiv:909) https://arxiv.org/pdf/0909.4061.pdf\n\n An implementation of a randomized algorithm for principal component\n analysis\n A. Szlam et al. 
2014\n \"\"\"\n random_state = check_random_state(random_state)\n Q = random_state.normal(size=(A.shape[1], size))\n if A.dtype.kind == 'f':\n Q = Q.astype(A.dtype, copy=False)\n if power_iteration_normalizer == 'auto':\n if n_iter <= 2:\n power_iteration_normalizer = 'none'\n else:\n power_iteration_normalizer = 'LU'\n for i in range(n_iter):\n if power_iteration_normalizer == 'none':\n Q = safe_sparse_dot(A, Q)\n Q = safe_sparse_dot(A.T, Q)\n elif power_iteration_normalizer == 'LU':\n (Q, _) = linalg.lu(safe_sparse_dot(A, Q), permute_l=True)\n (Q, _) = linalg.lu(safe_sparse_dot(A.T, Q), permute_l=True)\n elif power_iteration_normalizer == 'QR':\n (Q, _) = linalg.qr(safe_sparse_dot(A, Q), mode='economic')\n (Q, _) = linalg.qr(safe_sparse_dot(A.T, Q), mode='economic')\n (Q, _) = linalg.qr(safe_sparse_dot(A, Q), mode='economic')\n return Q" + "description": "Compute an orthonormal matrix whose range approximates the range of A.", + "docstring": "Compute an orthonormal matrix whose range approximates the range of A.\n\n Parameters\n ----------\n A : 2D array\n The input data matrix.\n\n size : int\n Size of the return array.\n\n n_iter : int\n Number of power iterations used to stabilize the result.\n\n power_iteration_normalizer : {'auto', 'QR', 'LU', 'none'}, default='auto'\n Whether the power iterations are normalized with step-by-step\n QR factorization (the slowest but most accurate), 'none'\n (the fastest but numerically unstable when `n_iter` is large, e.g.\n typically 5 or larger), or 'LU' factorization (numerically stable\n but can lose slightly in accuracy). The 'auto' mode applies no\n normalization if `n_iter` <= 2 and switches to LU otherwise.\n\n .. versionadded:: 0.18\n\n random_state : int, RandomState instance or None, default=None\n The seed of the pseudo random number generator to use when shuffling\n the data, i.e. getting the random vectors to initialize the algorithm.\n Pass an int for reproducible results across multiple function calls.\n See :term:`Glossary `.\n\n Returns\n -------\n Q : ndarray\n A (size x size) projection matrix, the range of which\n approximates well the range of the input matrix A.\n\n Notes\n -----\n\n Follows Algorithm 4.3 of\n Finding structure with randomness: Stochastic algorithms for constructing\n approximate matrix decompositions\n Halko, et al., 2009 (arXiv:909) https://arxiv.org/pdf/0909.4061.pdf\n\n An implementation of a randomized algorithm for principal component\n analysis\n A. Szlam et al. 2014\n ", + "source_code": "\ndef randomized_range_finder(A, *, size, n_iter, power_iteration_normalizer='auto', random_state=None):\n \"\"\"Compute an orthonormal matrix whose range approximates the range of A.\n\n Parameters\n ----------\n A : 2D array\n The input data matrix.\n\n size : int\n Size of the return array.\n\n n_iter : int\n Number of power iterations used to stabilize the result.\n\n power_iteration_normalizer : {'auto', 'QR', 'LU', 'none'}, default='auto'\n Whether the power iterations are normalized with step-by-step\n QR factorization (the slowest but most accurate), 'none'\n (the fastest but numerically unstable when `n_iter` is large, e.g.\n typically 5 or larger), or 'LU' factorization (numerically stable\n but can lose slightly in accuracy). The 'auto' mode applies no\n normalization if `n_iter` <= 2 and switches to LU otherwise.\n\n .. versionadded:: 0.18\n\n random_state : int, RandomState instance or None, default=None\n The seed of the pseudo random number generator to use when shuffling\n the data, i.e. 
getting the random vectors to initialize the algorithm.\n Pass an int for reproducible results across multiple function calls.\n See :term:`Glossary `.\n\n Returns\n -------\n Q : ndarray\n A (size x size) projection matrix, the range of which\n approximates well the range of the input matrix A.\n\n Notes\n -----\n\n Follows Algorithm 4.3 of\n Finding structure with randomness: Stochastic algorithms for constructing\n approximate matrix decompositions\n Halko, et al., 2009 (arXiv:909) https://arxiv.org/pdf/0909.4061.pdf\n\n An implementation of a randomized algorithm for principal component\n analysis\n A. Szlam et al. 2014\n \"\"\"\n random_state = check_random_state(random_state)\n Q = random_state.normal(size=(A.shape[1], size))\n if A.dtype.kind == 'f':\n Q = Q.astype(A.dtype, copy=False)\n if power_iteration_normalizer == 'auto':\n if n_iter <= 2:\n power_iteration_normalizer = 'none'\n else:\n power_iteration_normalizer = 'LU'\n for i in range(n_iter):\n if power_iteration_normalizer == 'none':\n Q = safe_sparse_dot(A, Q)\n Q = safe_sparse_dot(A.T, Q)\n elif power_iteration_normalizer == 'LU':\n (Q, _) = linalg.lu(safe_sparse_dot(A, Q), permute_l=True)\n (Q, _) = linalg.lu(safe_sparse_dot(A.T, Q), permute_l=True)\n elif power_iteration_normalizer == 'QR':\n (Q, _) = linalg.qr(safe_sparse_dot(A, Q), mode='economic')\n (Q, _) = linalg.qr(safe_sparse_dot(A.T, Q), mode='economic')\n (Q, _) = linalg.qr(safe_sparse_dot(A, Q), mode='economic')\n return Q" }, { "name": "randomized_svd", @@ -173410,6 +187152,10 @@ "docstring": { "type": "{ndarray, sparse matrix}", "description": "Matrix to decompose." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -173420,7 +187166,8 @@ "docstring": { "type": "int", "description": "Number of singular values and vectors to extract." - } + }, + "refined_type": {} }, { "name": "n_oversamples", @@ -173430,7 +187177,8 @@ "docstring": { "type": "int, default=10", "description": "Additional number of random vectors to sample the range of M so as\nto ensure proper conditioning. The total number of random vectors\nused to find the range of M is n_components + n_oversamples. Smaller\nnumber can improve speed but can negatively impact the quality of\napproximation of singular vectors and singular values. Users might wish\nto increase this parameter up to `2*k - n_components` where k is the\neffective rank, for large matrices, noisy problems, matrices with\nslowly decaying spectrums, or to increase precision accuracy. See Halko\net al (pages 5, 23 and 26)." - } + }, + "refined_type": {} }, { "name": "n_iter", @@ -173440,7 +187188,8 @@ "docstring": { "type": "int or 'auto', default='auto'", "description": "Number of power iterations. It can be used to deal with very noisy\nproblems. When 'auto', it is set to 4, unless `n_components` is small\n(< .1 * min(X.shape)) in which case `n_iter` is set to 7.\nThis improves precision with few components. Note that in general\nusers should rather increase `n_oversamples` before increasing `n_iter`\nas the principle of the randomized method is to avoid usage of these\nmore costly power iterations steps. When `n_components` is equal\nor greater to the effective matrix rank and the spectrum does not\npresent a slow decay, `n_iter=0` or `1` should even work fine in theory\n(see Halko et al paper, page 9).\n\n.. 
versionchanged:: 0.18" - } + }, + "refined_type": {} }, { "name": "power_iteration_normalizer", @@ -173450,6 +187199,10 @@ "docstring": { "type": "{'auto', 'QR', 'LU', 'none'}, default='auto'", "description": "Whether the power iterations are normalized with step-by-step\nQR factorization (the slowest but most accurate), 'none'\n(the fastest but numerically unstable when `n_iter` is large, e.g.\ntypically 5 or larger), or 'LU' factorization (numerically stable\nbut can lose slightly in accuracy). The 'auto' mode applies no\nnormalization if `n_iter` <= 2 and switches to LU otherwise.\n\n.. versionadded:: 0.18" + }, + "refined_type": { + "kind": "EnumType", + "values": ["auto", "none", "QR", "LU"] } }, { @@ -173460,7 +187213,8 @@ "docstring": { "type": "bool or 'auto', default='auto'", "description": "Whether the algorithm should be applied to M.T instead of M. The\nresult should approximately be the same. The 'auto' mode will\ntrigger the transposition if M.shape[1] > M.shape[0] since this\nimplementation of randomized SVD tend to be a little faster in that\ncase.\n\n.. versionchanged:: 0.18" - } + }, + "refined_type": {} }, { "name": "flip_sign", @@ -173470,7 +187224,8 @@ "docstring": { "type": "bool, default=True", "description": "The output of a singular value decomposition is only unique up to a\npermutation of the signs of the singular vectors. If `flip_sign` is\nset to `True`, the sign ambiguity is resolved by making the largest\nloadings for each component in the left singular vectors positive." - } + }, + "refined_type": {} }, { "name": "random_state", @@ -173480,13 +187235,14 @@ "docstring": { "type": "int, RandomState instance or None, default='warn'", "description": "The seed of the pseudo random number generator to use when\nshuffling the data, i.e. getting the random vectors to initialize\nthe algorithm. Pass an int for reproducible results across multiple\nfunction calls. See :term:`Glossary `.\n\n.. versionchanged:: 1.2\n The previous behavior (`random_state=0`) is deprecated, and\n from v1.2 the default value will be `random_state=None`. Set\n the value of `random_state` explicitly to suppress the deprecation\n warning." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Computes a truncated randomized SVD.\n\nThis method solves the fixed-rank approximation problem described in the Halko et al paper (problem (1.5), p5).", - "docstring": "Computes a truncated randomized SVD.\n\nThis method solves the fixed-rank approximation problem described in the\nHalko et al paper (problem (1.5), p5).\n\nParameters\n----------\nM : {ndarray, sparse matrix}\n Matrix to decompose.\n\nn_components : int\n Number of singular values and vectors to extract.\n\nn_oversamples : int, default=10\n Additional number of random vectors to sample the range of M so as\n to ensure proper conditioning. The total number of random vectors\n used to find the range of M is n_components + n_oversamples. Smaller\n number can improve speed but can negatively impact the quality of\n approximation of singular vectors and singular values. Users might wish\n to increase this parameter up to `2*k - n_components` where k is the\n effective rank, for large matrices, noisy problems, matrices with\n slowly decaying spectrums, or to increase precision accuracy. See Halko\n et al (pages 5, 23 and 26).\n\nn_iter : int or 'auto', default='auto'\n Number of power iterations. It can be used to deal with very noisy\n problems. 
When 'auto', it is set to 4, unless `n_components` is small\n (< .1 * min(X.shape)) in which case `n_iter` is set to 7.\n This improves precision with few components. Note that in general\n users should rather increase `n_oversamples` before increasing `n_iter`\n as the principle of the randomized method is to avoid usage of these\n more costly power iterations steps. When `n_components` is equal\n or greater to the effective matrix rank and the spectrum does not\n present a slow decay, `n_iter=0` or `1` should even work fine in theory\n (see Halko et al paper, page 9).\n\n .. versionchanged:: 0.18\n\npower_iteration_normalizer : {'auto', 'QR', 'LU', 'none'}, default='auto'\n Whether the power iterations are normalized with step-by-step\n QR factorization (the slowest but most accurate), 'none'\n (the fastest but numerically unstable when `n_iter` is large, e.g.\n typically 5 or larger), or 'LU' factorization (numerically stable\n but can lose slightly in accuracy). The 'auto' mode applies no\n normalization if `n_iter` <= 2 and switches to LU otherwise.\n\n .. versionadded:: 0.18\n\ntranspose : bool or 'auto', default='auto'\n Whether the algorithm should be applied to M.T instead of M. The\n result should approximately be the same. The 'auto' mode will\n trigger the transposition if M.shape[1] > M.shape[0] since this\n implementation of randomized SVD tend to be a little faster in that\n case.\n\n .. versionchanged:: 0.18\n\nflip_sign : bool, default=True\n The output of a singular value decomposition is only unique up to a\n permutation of the signs of the singular vectors. If `flip_sign` is\n set to `True`, the sign ambiguity is resolved by making the largest\n loadings for each component in the left singular vectors positive.\n\nrandom_state : int, RandomState instance or None, default='warn'\n The seed of the pseudo random number generator to use when\n shuffling the data, i.e. getting the random vectors to initialize\n the algorithm. Pass an int for reproducible results across multiple\n function calls. See :term:`Glossary `.\n\n .. versionchanged:: 1.2\n The previous behavior (`random_state=0`) is deprecated, and\n from v1.2 the default value will be `random_state=None`. Set\n the value of `random_state` explicitly to suppress the deprecation\n warning.\n\nNotes\n-----\nThis algorithm finds a (usually very good) approximate truncated\nsingular value decomposition using randomization to speed up the\ncomputations. It is particularly fast on large matrices on which\nyou wish to extract only a small number of components. In order to\nobtain further speed up, `n_iter` can be set <=2 (at the cost of\nloss of precision). To increase the precision it is recommended to\nincrease `n_oversamples`, up to `2*k-n_components` where k is the\neffective rank. Usually, `n_components` is chosen to be greater than k\nso increasing `n_oversamples` up to `n_components` should be enough.\n\nReferences\n----------\n* Finding structure with randomness: Stochastic algorithms for constructing\n approximate matrix decompositions (Algorithm 4.3)\n Halko, et al., 2009 https://arxiv.org/abs/0909.4061\n\n* A randomized algorithm for the decomposition of matrices\n Per-Gunnar Martinsson, Vladimir Rokhlin and Mark Tygert\n\n* An implementation of a randomized algorithm for principal component\n analysis\n A. Szlam et al. 
2014", + "description": "Computes a truncated randomized SVD.\n\nThis method solves the fixed-rank approximation problem described in the\nHalko et al paper (problem (1.5), p5).", + "docstring": "Computes a truncated randomized SVD.\n\n This method solves the fixed-rank approximation problem described in the\n Halko et al paper (problem (1.5), p5).\n\n Parameters\n ----------\n M : {ndarray, sparse matrix}\n Matrix to decompose.\n\n n_components : int\n Number of singular values and vectors to extract.\n\n n_oversamples : int, default=10\n Additional number of random vectors to sample the range of M so as\n to ensure proper conditioning. The total number of random vectors\n used to find the range of M is n_components + n_oversamples. Smaller\n number can improve speed but can negatively impact the quality of\n approximation of singular vectors and singular values. Users might wish\n to increase this parameter up to `2*k - n_components` where k is the\n effective rank, for large matrices, noisy problems, matrices with\n slowly decaying spectrums, or to increase precision accuracy. See Halko\n et al (pages 5, 23 and 26).\n\n n_iter : int or 'auto', default='auto'\n Number of power iterations. It can be used to deal with very noisy\n problems. When 'auto', it is set to 4, unless `n_components` is small\n (< .1 * min(X.shape)) in which case `n_iter` is set to 7.\n This improves precision with few components. Note that in general\n users should rather increase `n_oversamples` before increasing `n_iter`\n as the principle of the randomized method is to avoid usage of these\n more costly power iterations steps. When `n_components` is equal\n or greater to the effective matrix rank and the spectrum does not\n present a slow decay, `n_iter=0` or `1` should even work fine in theory\n (see Halko et al paper, page 9).\n\n .. versionchanged:: 0.18\n\n power_iteration_normalizer : {'auto', 'QR', 'LU', 'none'}, default='auto'\n Whether the power iterations are normalized with step-by-step\n QR factorization (the slowest but most accurate), 'none'\n (the fastest but numerically unstable when `n_iter` is large, e.g.\n typically 5 or larger), or 'LU' factorization (numerically stable\n but can lose slightly in accuracy). The 'auto' mode applies no\n normalization if `n_iter` <= 2 and switches to LU otherwise.\n\n .. versionadded:: 0.18\n\n transpose : bool or 'auto', default='auto'\n Whether the algorithm should be applied to M.T instead of M. The\n result should approximately be the same. The 'auto' mode will\n trigger the transposition if M.shape[1] > M.shape[0] since this\n implementation of randomized SVD tend to be a little faster in that\n case.\n\n .. versionchanged:: 0.18\n\n flip_sign : bool, default=True\n The output of a singular value decomposition is only unique up to a\n permutation of the signs of the singular vectors. If `flip_sign` is\n set to `True`, the sign ambiguity is resolved by making the largest\n loadings for each component in the left singular vectors positive.\n\n random_state : int, RandomState instance or None, default='warn'\n The seed of the pseudo random number generator to use when\n shuffling the data, i.e. getting the random vectors to initialize\n the algorithm. Pass an int for reproducible results across multiple\n function calls. See :term:`Glossary `.\n\n .. versionchanged:: 1.2\n The previous behavior (`random_state=0`) is deprecated, and\n from v1.2 the default value will be `random_state=None`. 
Set\n the value of `random_state` explicitly to suppress the deprecation\n warning.\n\n Notes\n -----\n This algorithm finds a (usually very good) approximate truncated\n singular value decomposition using randomization to speed up the\n computations. It is particularly fast on large matrices on which\n you wish to extract only a small number of components. In order to\n obtain further speed up, `n_iter` can be set <=2 (at the cost of\n loss of precision). To increase the precision it is recommended to\n increase `n_oversamples`, up to `2*k-n_components` where k is the\n effective rank. Usually, `n_components` is chosen to be greater than k\n so increasing `n_oversamples` up to `n_components` should be enough.\n\n References\n ----------\n * Finding structure with randomness: Stochastic algorithms for constructing\n approximate matrix decompositions (Algorithm 4.3)\n Halko, et al., 2009 https://arxiv.org/abs/0909.4061\n\n * A randomized algorithm for the decomposition of matrices\n Per-Gunnar Martinsson, Vladimir Rokhlin and Mark Tygert\n\n * An implementation of a randomized algorithm for principal component\n analysis\n A. Szlam et al. 2014\n ", "source_code": "\ndef randomized_svd(M, n_components, *, n_oversamples=10, n_iter='auto', power_iteration_normalizer='auto', transpose='auto', flip_sign=True, random_state='warn'):\n \"\"\"Computes a truncated randomized SVD.\n\n This method solves the fixed-rank approximation problem described in the\n Halko et al paper (problem (1.5), p5).\n\n Parameters\n ----------\n M : {ndarray, sparse matrix}\n Matrix to decompose.\n\n n_components : int\n Number of singular values and vectors to extract.\n\n n_oversamples : int, default=10\n Additional number of random vectors to sample the range of M so as\n to ensure proper conditioning. The total number of random vectors\n used to find the range of M is n_components + n_oversamples. Smaller\n number can improve speed but can negatively impact the quality of\n approximation of singular vectors and singular values. Users might wish\n to increase this parameter up to `2*k - n_components` where k is the\n effective rank, for large matrices, noisy problems, matrices with\n slowly decaying spectrums, or to increase precision accuracy. See Halko\n et al (pages 5, 23 and 26).\n\n n_iter : int or 'auto', default='auto'\n Number of power iterations. It can be used to deal with very noisy\n problems. When 'auto', it is set to 4, unless `n_components` is small\n (< .1 * min(X.shape)) in which case `n_iter` is set to 7.\n This improves precision with few components. Note that in general\n users should rather increase `n_oversamples` before increasing `n_iter`\n as the principle of the randomized method is to avoid usage of these\n more costly power iterations steps. When `n_components` is equal\n or greater to the effective matrix rank and the spectrum does not\n present a slow decay, `n_iter=0` or `1` should even work fine in theory\n (see Halko et al paper, page 9).\n\n .. versionchanged:: 0.18\n\n power_iteration_normalizer : {'auto', 'QR', 'LU', 'none'}, default='auto'\n Whether the power iterations are normalized with step-by-step\n QR factorization (the slowest but most accurate), 'none'\n (the fastest but numerically unstable when `n_iter` is large, e.g.\n typically 5 or larger), or 'LU' factorization (numerically stable\n but can lose slightly in accuracy). The 'auto' mode applies no\n normalization if `n_iter` <= 2 and switches to LU otherwise.\n\n .. 
versionadded:: 0.18\n\n transpose : bool or 'auto', default='auto'\n Whether the algorithm should be applied to M.T instead of M. The\n result should approximately be the same. The 'auto' mode will\n trigger the transposition if M.shape[1] > M.shape[0] since this\n implementation of randomized SVD tend to be a little faster in that\n case.\n\n .. versionchanged:: 0.18\n\n flip_sign : bool, default=True\n The output of a singular value decomposition is only unique up to a\n permutation of the signs of the singular vectors. If `flip_sign` is\n set to `True`, the sign ambiguity is resolved by making the largest\n loadings for each component in the left singular vectors positive.\n\n random_state : int, RandomState instance or None, default='warn'\n The seed of the pseudo random number generator to use when\n shuffling the data, i.e. getting the random vectors to initialize\n the algorithm. Pass an int for reproducible results across multiple\n function calls. See :term:`Glossary `.\n\n .. versionchanged:: 1.2\n The previous behavior (`random_state=0`) is deprecated, and\n from v1.2 the default value will be `random_state=None`. Set\n the value of `random_state` explicitly to suppress the deprecation\n warning.\n\n Notes\n -----\n This algorithm finds a (usually very good) approximate truncated\n singular value decomposition using randomization to speed up the\n computations. It is particularly fast on large matrices on which\n you wish to extract only a small number of components. In order to\n obtain further speed up, `n_iter` can be set <=2 (at the cost of\n loss of precision). To increase the precision it is recommended to\n increase `n_oversamples`, up to `2*k-n_components` where k is the\n effective rank. Usually, `n_components` is chosen to be greater than k\n so increasing `n_oversamples` up to `n_components` should be enough.\n\n References\n ----------\n * Finding structure with randomness: Stochastic algorithms for constructing\n approximate matrix decompositions (Algorithm 4.3)\n Halko, et al., 2009 https://arxiv.org/abs/0909.4061\n\n * A randomized algorithm for the decomposition of matrices\n Per-Gunnar Martinsson, Vladimir Rokhlin and Mark Tygert\n\n * An implementation of a randomized algorithm for principal component\n analysis\n A. Szlam et al. 2014\n \"\"\"\n if isinstance(M, (sparse.lil_matrix, sparse.dok_matrix)):\n warnings.warn('Calculating SVD of a {} is expensive. csr_matrix is more efficient.'.format(type(M).__name__), sparse.SparseEfficiencyWarning)\n if random_state == 'warn':\n warnings.warn(\"If 'random_state' is not supplied, the current default is to use 0 as a fixed seed. This will change to None in version 1.2 leading to non-deterministic results that better reflect nature of the randomized_svd solver. 
If you want to silence this warning, set 'random_state' to an integer seed or to None explicitly depending if you want your code to be deterministic or not.\", FutureWarning)\n random_state = 0\n random_state = check_random_state(random_state)\n n_random = n_components + n_oversamples\n (n_samples, n_features) = M.shape\n if n_iter == 'auto':\n n_iter = 7 if n_components < 0.1 * min(M.shape) else 4\n if transpose == 'auto':\n transpose = n_samples < n_features\n if transpose:\n M = M.T\n Q = randomized_range_finder(M, size=n_random, n_iter=n_iter, power_iteration_normalizer=power_iteration_normalizer, random_state=random_state)\n B = safe_sparse_dot(Q.T, M)\n (Uhat, s, Vt) = linalg.svd(B, full_matrices=False)\n del B\n U = np.dot(Q, Uhat)\n if flip_sign:\n if not transpose:\n (U, Vt) = svd_flip(U, Vt)\n else:\n (U, Vt) = svd_flip(U, Vt, u_based_decision=False)\n if transpose:\n return Vt[:n_components, :].T, s[:n_components], U[:, :n_components].T\n else:\n return U[:, :n_components], s[:n_components], Vt[:n_components, :]" }, { @@ -173504,7 +187260,8 @@ "docstring": { "type": "array-like", "description": "The input array." - } + }, + "refined_type": {} }, { "name": "squared", @@ -173514,13 +187271,14 @@ "docstring": { "type": "bool, default=False", "description": "If True, return squared norms." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Row-wise (squared) Euclidean norm of X.\n\nEquivalent to np.sqrt((X * X).sum(axis=1)), but also supports sparse matrices and does not create an X.shape-sized temporary. Performs no input validation.", - "docstring": "Row-wise (squared) Euclidean norm of X.\n\nEquivalent to np.sqrt((X * X).sum(axis=1)), but also supports sparse\nmatrices and does not create an X.shape-sized temporary.\n\nPerforms no input validation.\n\nParameters\n----------\nX : array-like\n The input array.\nsquared : bool, default=False\n If True, return squared norms.\n\nReturns\n-------\narray-like\n The row-wise (squared) Euclidean norm of X.", + "description": "Row-wise (squared) Euclidean norm of X.\n\nEquivalent to np.sqrt((X * X).sum(axis=1)), but also supports sparse\nmatrices and does not create an X.shape-sized temporary.\n\nPerforms no input validation.", + "docstring": "Row-wise (squared) Euclidean norm of X.\n\n Equivalent to np.sqrt((X * X).sum(axis=1)), but also supports sparse\n matrices and does not create an X.shape-sized temporary.\n\n Performs no input validation.\n\n Parameters\n ----------\n X : array-like\n The input array.\n squared : bool, default=False\n If True, return squared norms.\n\n Returns\n -------\n array-like\n The row-wise (squared) Euclidean norm of X.\n ", "source_code": "\ndef row_norms(X, squared=False):\n \"\"\"Row-wise (squared) Euclidean norm of X.\n\n Equivalent to np.sqrt((X * X).sum(axis=1)), but also supports sparse\n matrices and does not create an X.shape-sized temporary.\n\n Performs no input validation.\n\n Parameters\n ----------\n X : array-like\n The input array.\n squared : bool, default=False\n If True, return squared norms.\n\n Returns\n -------\n array-like\n The row-wise (squared) Euclidean norm of X.\n \"\"\"\n if sparse.issparse(X):\n if not isinstance(X, sparse.csr_matrix):\n X = sparse.csr_matrix(X)\n norms = csr_row_norms(X)\n else:\n norms = np.einsum('ij,ij->i', X, X)\n if not squared:\n np.sqrt(norms, norms)\n return norms" }, { @@ -173538,6 +187296,10 @@ "docstring": { "type": "{ndarray, sparse matrix}", "description": "" + }, + "refined_type": { + "kind": "EnumType", + 
"values": [] } }, { @@ -173548,6 +187310,10 @@ "docstring": { "type": "{ndarray, sparse matrix}", "description": "" + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -173558,13 +187324,14 @@ "docstring": { "type": "bool, default=False", "description": "When False, ``a`` and ``b`` both being sparse will yield sparse output.\nWhen True, output will always be a dense array." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Dot product that handle the sparse matrix case correctly.", - "docstring": "Dot product that handle the sparse matrix case correctly.\n\nParameters\n----------\na : {ndarray, sparse matrix}\nb : {ndarray, sparse matrix}\ndense_output : bool, default=False\n When False, ``a`` and ``b`` both being sparse will yield sparse output.\n When True, output will always be a dense array.\n\nReturns\n-------\ndot_product : {ndarray, sparse matrix}\n Sparse if ``a`` and ``b`` are sparse and ``dense_output=False``.", + "docstring": "Dot product that handle the sparse matrix case correctly.\n\n Parameters\n ----------\n a : {ndarray, sparse matrix}\n b : {ndarray, sparse matrix}\n dense_output : bool, default=False\n When False, ``a`` and ``b`` both being sparse will yield sparse output.\n When True, output will always be a dense array.\n\n Returns\n -------\n dot_product : {ndarray, sparse matrix}\n Sparse if ``a`` and ``b`` are sparse and ``dense_output=False``.\n ", "source_code": "\ndef safe_sparse_dot(a, b, *, dense_output=False):\n \"\"\"Dot product that handle the sparse matrix case correctly.\n\n Parameters\n ----------\n a : {ndarray, sparse matrix}\n b : {ndarray, sparse matrix}\n dense_output : bool, default=False\n When False, ``a`` and ``b`` both being sparse will yield sparse output.\n When True, output will always be a dense array.\n\n Returns\n -------\n dot_product : {ndarray, sparse matrix}\n Sparse if ``a`` and ``b`` are sparse and ``dense_output=False``.\n \"\"\"\n if a.ndim > 2 or b.ndim > 2:\n if sparse.issparse(a):\n b_ = np.rollaxis(b, -2)\n b_2d = b_.reshape((b.shape[-2], -1))\n ret = a @ b_2d\n ret = ret.reshape(a.shape[0], *b_.shape[1:])\n elif sparse.issparse(b):\n a_2d = a.reshape(-1, a.shape[-1])\n ret = a_2d @ b\n ret = ret.reshape(*a.shape[:-1], b.shape[1])\n else:\n ret = np.dot(a, b)\n else:\n ret = a @ b\n if sparse.issparse(a) and sparse.issparse(b) and dense_output and hasattr(ret, 'toarray'):\n return ret.toarray()\n return ret" }, { @@ -173582,7 +187349,8 @@ "docstring": { "type": "array-like of float of shape (M, N)", "description": "Argument to the logistic function." - } + }, + "refined_type": {} }, { "name": "copy", @@ -173592,13 +187360,14 @@ "docstring": { "type": "bool, default=True", "description": "Copy X or not." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Calculate the softmax function.\n\nThe softmax function is calculated by np.exp(X) / np.sum(np.exp(X), axis=1) This will cause overflow when large values are exponentiated. 
Hence the largest value in each row is subtracted from each data point to prevent this.", - "docstring": "Calculate the softmax function.\n\nThe softmax function is calculated by\nnp.exp(X) / np.sum(np.exp(X), axis=1)\n\nThis will cause overflow when large values are exponentiated.\nHence the largest value in each row is subtracted from each data\npoint to prevent this.\n\nParameters\n----------\nX : array-like of float of shape (M, N)\n Argument to the logistic function.\n\ncopy : bool, default=True\n Copy X or not.\n\nReturns\n-------\nout : ndarray of shape (M, N)\n Softmax function evaluated at every point in x.", + "description": "Calculate the softmax function.\n\nThe softmax function is calculated by\nnp.exp(X) / np.sum(np.exp(X), axis=1)\n\nThis will cause overflow when large values are exponentiated.\nHence the largest value in each row is subtracted from each data\npoint to prevent this.", + "docstring": "\n Calculate the softmax function.\n\n The softmax function is calculated by\n np.exp(X) / np.sum(np.exp(X), axis=1)\n\n This will cause overflow when large values are exponentiated.\n Hence the largest value in each row is subtracted from each data\n point to prevent this.\n\n Parameters\n ----------\n X : array-like of float of shape (M, N)\n Argument to the logistic function.\n\n copy : bool, default=True\n Copy X or not.\n\n Returns\n -------\n out : ndarray of shape (M, N)\n Softmax function evaluated at every point in x.\n ", "source_code": "\ndef softmax(X, copy=True):\n \"\"\"\n Calculate the softmax function.\n\n The softmax function is calculated by\n np.exp(X) / np.sum(np.exp(X), axis=1)\n\n This will cause overflow when large values are exponentiated.\n Hence the largest value in each row is subtracted from each data\n point to prevent this.\n\n Parameters\n ----------\n X : array-like of float of shape (M, N)\n Argument to the logistic function.\n\n copy : bool, default=True\n Copy X or not.\n\n Returns\n -------\n out : ndarray of shape (M, N)\n Softmax function evaluated at every point in x.\n \"\"\"\n if copy:\n X = np.copy(X)\n max_prob = np.max(X, axis=1).reshape((-1, 1))\n X -= max_prob\n np.exp(X, X)\n sum_prob = np.sum(X, axis=1).reshape((-1, 1))\n X /= sum_prob\n return X" }, { @@ -173616,13 +187385,14 @@ "docstring": { "type": "array-like", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Squared Euclidean or Frobenius norm of x.\n\nFaster than norm(x) ** 2.", - "docstring": "Squared Euclidean or Frobenius norm of x.\n\nFaster than norm(x) ** 2.\n\nParameters\n----------\nx : array-like\n\nReturns\n-------\nfloat\n The Euclidean norm when x is a vector, the Frobenius norm when x\n is a matrix (2-d array).", + "docstring": "Squared Euclidean or Frobenius norm of x.\n\n Faster than norm(x) ** 2.\n\n Parameters\n ----------\n x : array-like\n\n Returns\n -------\n float\n The Euclidean norm when x is a vector, the Frobenius norm when x\n is a matrix (2-d array).\n ", "source_code": "\ndef squared_norm(x):\n \"\"\"Squared Euclidean or Frobenius norm of x.\n\n Faster than norm(x) ** 2.\n\n Parameters\n ----------\n x : array-like\n\n Returns\n -------\n float\n The Euclidean norm when x is a vector, the Frobenius norm when x\n is a matrix (2-d array).\n \"\"\"\n x = np.ravel(x, order='K')\n if np.issubdtype(x.dtype, np.integer):\n warnings.warn('Array type is integer, np.dot may overflow. 
Data should be float type to avoid this issue', UserWarning)\n return np.dot(x, x)" }, { @@ -173640,7 +187410,8 @@ "docstring": { "type": "array-like", "description": "To be cumulatively summed as flat." - } + }, + "refined_type": {} }, { "name": "axis", @@ -173650,7 +187421,8 @@ "docstring": { "type": "int, default=None", "description": "Axis along which the cumulative sum is computed.\nThe default (None) is to compute the cumsum over the flattened array." - } + }, + "refined_type": {} }, { "name": "rtol", @@ -173660,7 +187432,8 @@ "docstring": { "type": "float, default=1e-05", "description": "Relative tolerance, see ``np.allclose``." - } + }, + "refined_type": {} }, { "name": "atol", @@ -173670,13 +187443,14 @@ "docstring": { "type": "float, default=1e-08", "description": "Absolute tolerance, see ``np.allclose``." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Use high precision for cumsum and check that final value matches sum.", - "docstring": "Use high precision for cumsum and check that final value matches sum.\n\nParameters\n----------\narr : array-like\n To be cumulatively summed as flat.\naxis : int, default=None\n Axis along which the cumulative sum is computed.\n The default (None) is to compute the cumsum over the flattened array.\nrtol : float, default=1e-05\n Relative tolerance, see ``np.allclose``.\natol : float, default=1e-08\n Absolute tolerance, see ``np.allclose``.", + "docstring": "Use high precision for cumsum and check that final value matches sum.\n\n Parameters\n ----------\n arr : array-like\n To be cumulatively summed as flat.\n axis : int, default=None\n Axis along which the cumulative sum is computed.\n The default (None) is to compute the cumsum over the flattened array.\n rtol : float, default=1e-05\n Relative tolerance, see ``np.allclose``.\n atol : float, default=1e-08\n Absolute tolerance, see ``np.allclose``.\n ", "source_code": "\ndef stable_cumsum(arr, axis=None, rtol=1e-05, atol=1e-08):\n \"\"\"Use high precision for cumsum and check that final value matches sum.\n\n Parameters\n ----------\n arr : array-like\n To be cumulatively summed as flat.\n axis : int, default=None\n Axis along which the cumulative sum is computed.\n The default (None) is to compute the cumsum over the flattened array.\n rtol : float, default=1e-05\n Relative tolerance, see ``np.allclose``.\n atol : float, default=1e-08\n Absolute tolerance, see ``np.allclose``.\n \"\"\"\n out = np.cumsum(arr, axis=axis, dtype=np.float64)\n expected = np.sum(arr, axis=axis, dtype=np.float64)\n if not np.all(np.isclose(out.take(-1, axis=axis), expected, rtol=rtol, atol=atol, equal_nan=True)):\n warnings.warn('cumsum was found to be unstable: its last element does not correspond to sum', RuntimeWarning)\n return out" }, { @@ -173694,7 +187468,8 @@ "docstring": { "type": "ndarray", "description": "u and v are the output of `linalg.svd` or\n:func:`~sklearn.utils.extmath.randomized_svd`, with matching inner\ndimensions so one can compute `np.dot(u * s, v)`." - } + }, + "refined_type": {} }, { "name": "v", @@ -173704,7 +187479,8 @@ "docstring": { "type": "ndarray", "description": "u and v are the output of `linalg.svd` or\n:func:`~sklearn.utils.extmath.randomized_svd`, with matching inner\ndimensions so one can compute `np.dot(u * s, v)`.\nThe input v should really be called vt to be consistent with scipy's\noutput." 
- } + }, + "refined_type": {} }, { "name": "u_based_decision", @@ -173714,13 +187490,14 @@ "docstring": { "type": "bool, default=True", "description": "If True, use the columns of u as the basis for sign flipping.\nOtherwise, use the rows of v. The choice of which variable to base the\ndecision on is generally algorithm dependent." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Sign correction to ensure deterministic output from SVD.\n\nAdjusts the columns of u and the rows of v such that the loadings in the columns in u that are largest in absolute value are always positive.", - "docstring": "Sign correction to ensure deterministic output from SVD.\n\nAdjusts the columns of u and the rows of v such that the loadings in the\ncolumns in u that are largest in absolute value are always positive.\n\nParameters\n----------\nu : ndarray\n u and v are the output of `linalg.svd` or\n :func:`~sklearn.utils.extmath.randomized_svd`, with matching inner\n dimensions so one can compute `np.dot(u * s, v)`.\n\nv : ndarray\n u and v are the output of `linalg.svd` or\n :func:`~sklearn.utils.extmath.randomized_svd`, with matching inner\n dimensions so one can compute `np.dot(u * s, v)`.\n The input v should really be called vt to be consistent with scipy's\n output.\n\nu_based_decision : bool, default=True\n If True, use the columns of u as the basis for sign flipping.\n Otherwise, use the rows of v. The choice of which variable to base the\n decision on is generally algorithm dependent.\n\n\nReturns\n-------\nu_adjusted, v_adjusted : arrays with the same dimensions as the input.", + "description": "Sign correction to ensure deterministic output from SVD.\n\nAdjusts the columns of u and the rows of v such that the loadings in the\ncolumns in u that are largest in absolute value are always positive.", + "docstring": "Sign correction to ensure deterministic output from SVD.\n\n Adjusts the columns of u and the rows of v such that the loadings in the\n columns in u that are largest in absolute value are always positive.\n\n Parameters\n ----------\n u : ndarray\n u and v are the output of `linalg.svd` or\n :func:`~sklearn.utils.extmath.randomized_svd`, with matching inner\n dimensions so one can compute `np.dot(u * s, v)`.\n\n v : ndarray\n u and v are the output of `linalg.svd` or\n :func:`~sklearn.utils.extmath.randomized_svd`, with matching inner\n dimensions so one can compute `np.dot(u * s, v)`.\n The input v should really be called vt to be consistent with scipy's\n output.\n\n u_based_decision : bool, default=True\n If True, use the columns of u as the basis for sign flipping.\n Otherwise, use the rows of v. 
The choice of which variable to base the\n decision on is generally algorithm dependent.\n\n\n Returns\n -------\n u_adjusted, v_adjusted : arrays with the same dimensions as the input.\n\n ", "source_code": "\ndef svd_flip(u, v, u_based_decision=True):\n \"\"\"Sign correction to ensure deterministic output from SVD.\n\n Adjusts the columns of u and the rows of v such that the loadings in the\n columns in u that are largest in absolute value are always positive.\n\n Parameters\n ----------\n u : ndarray\n u and v are the output of `linalg.svd` or\n :func:`~sklearn.utils.extmath.randomized_svd`, with matching inner\n dimensions so one can compute `np.dot(u * s, v)`.\n\n v : ndarray\n u and v are the output of `linalg.svd` or\n :func:`~sklearn.utils.extmath.randomized_svd`, with matching inner\n dimensions so one can compute `np.dot(u * s, v)`.\n The input v should really be called vt to be consistent with scipy's\n output.\n\n u_based_decision : bool, default=True\n If True, use the columns of u as the basis for sign flipping.\n Otherwise, use the rows of v. The choice of which variable to base the\n decision on is generally algorithm dependent.\n\n\n Returns\n -------\n u_adjusted, v_adjusted : arrays with the same dimensions as the input.\n\n \"\"\"\n if u_based_decision:\n max_abs_cols = np.argmax(np.abs(u), axis=0)\n signs = np.sign(u[max_abs_cols, range(u.shape[1])])\n u *= signs\n v *= signs[:, np.newaxis]\n else:\n max_abs_rows = np.argmax(np.abs(v), axis=1)\n signs = np.sign(v[range(v.shape[0]), max_abs_rows])\n u *= signs\n v *= signs[:, np.newaxis]\n return u, v" }, { @@ -173738,7 +187515,8 @@ "docstring": { "type": "array-like", "description": "n-dimensional array of which to find mode(s)." - } + }, + "refined_type": {} }, { "name": "w", @@ -173748,7 +187526,8 @@ "docstring": { "type": "array-like", "description": "n-dimensional array of weights for each value." - } + }, + "refined_type": {} }, { "name": "axis", @@ -173758,13 +187537,14 @@ "docstring": { "type": "int, default=0", "description": "Axis along which to operate. Default is 0, i.e. the first axis." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Returns an array of the weighted modal (most common) value in a.\n\nIf there is more than one such value, only the first is returned. The bin-count for the modal bins is also returned. This is an extension of the algorithm in scipy.stats.mode.", - "docstring": "Returns an array of the weighted modal (most common) value in a.\n\nIf there is more than one such value, only the first is returned.\nThe bin-count for the modal bins is also returned.\n\nThis is an extension of the algorithm in scipy.stats.mode.\n\nParameters\n----------\na : array-like\n n-dimensional array of which to find mode(s).\nw : array-like\n n-dimensional array of weights for each value.\naxis : int, default=0\n Axis along which to operate. Default is 0, i.e. 
the first axis.\n\nReturns\n-------\nvals : ndarray\n Array of modal values.\nscore : ndarray\n Array of weighted counts for each mode.\n\nExamples\n--------\n>>> from sklearn.utils.extmath import weighted_mode\n>>> x = [4, 1, 4, 2, 4, 2]\n>>> weights = [1, 1, 1, 1, 1, 1]\n>>> weighted_mode(x, weights)\n(array([4.]), array([3.]))\n\nThe value 4 appears three times: with uniform weights, the result is\nsimply the mode of the distribution.\n\n>>> weights = [1, 3, 0.5, 1.5, 1, 2] # deweight the 4's\n>>> weighted_mode(x, weights)\n(array([2.]), array([3.5]))\n\nThe value 2 has the highest score: it appears twice with weights of\n1.5 and 2: the sum of these is 3.5.\n\nSee Also\n--------\nscipy.stats.mode", + "description": "Returns an array of the weighted modal (most common) value in a.\n\nIf there is more than one such value, only the first is returned.\nThe bin-count for the modal bins is also returned.\n\nThis is an extension of the algorithm in scipy.stats.mode.", + "docstring": "Returns an array of the weighted modal (most common) value in a.\n\n If there is more than one such value, only the first is returned.\n The bin-count for the modal bins is also returned.\n\n This is an extension of the algorithm in scipy.stats.mode.\n\n Parameters\n ----------\n a : array-like\n n-dimensional array of which to find mode(s).\n w : array-like\n n-dimensional array of weights for each value.\n axis : int, default=0\n Axis along which to operate. Default is 0, i.e. the first axis.\n\n Returns\n -------\n vals : ndarray\n Array of modal values.\n score : ndarray\n Array of weighted counts for each mode.\n\n Examples\n --------\n >>> from sklearn.utils.extmath import weighted_mode\n >>> x = [4, 1, 4, 2, 4, 2]\n >>> weights = [1, 1, 1, 1, 1, 1]\n >>> weighted_mode(x, weights)\n (array([4.]), array([3.]))\n\n The value 4 appears three times: with uniform weights, the result is\n simply the mode of the distribution.\n\n >>> weights = [1, 3, 0.5, 1.5, 1, 2] # deweight the 4's\n >>> weighted_mode(x, weights)\n (array([2.]), array([3.5]))\n\n The value 2 has the highest score: it appears twice with weights of\n 1.5 and 2: the sum of these is 3.5.\n\n See Also\n --------\n scipy.stats.mode\n ", "source_code": "\ndef weighted_mode(a, w, *, axis=0):\n \"\"\"Returns an array of the weighted modal (most common) value in a.\n\n If there is more than one such value, only the first is returned.\n The bin-count for the modal bins is also returned.\n\n This is an extension of the algorithm in scipy.stats.mode.\n\n Parameters\n ----------\n a : array-like\n n-dimensional array of which to find mode(s).\n w : array-like\n n-dimensional array of weights for each value.\n axis : int, default=0\n Axis along which to operate. Default is 0, i.e. 
the first axis.\n\n Returns\n -------\n vals : ndarray\n Array of modal values.\n score : ndarray\n Array of weighted counts for each mode.\n\n Examples\n --------\n >>> from sklearn.utils.extmath import weighted_mode\n >>> x = [4, 1, 4, 2, 4, 2]\n >>> weights = [1, 1, 1, 1, 1, 1]\n >>> weighted_mode(x, weights)\n (array([4.]), array([3.]))\n\n The value 4 appears three times: with uniform weights, the result is\n simply the mode of the distribution.\n\n >>> weights = [1, 3, 0.5, 1.5, 1, 2] # deweight the 4's\n >>> weighted_mode(x, weights)\n (array([2.]), array([3.5]))\n\n The value 2 has the highest score: it appears twice with weights of\n 1.5 and 2: the sum of these is 3.5.\n\n See Also\n --------\n scipy.stats.mode\n \"\"\"\n if axis is None:\n a = np.ravel(a)\n w = np.ravel(w)\n axis = 0\n else:\n a = np.asarray(a)\n w = np.asarray(w)\n if a.shape != w.shape:\n w = np.full(a.shape, w, dtype=w.dtype)\n scores = np.unique(np.ravel(a))\n testshape = list(a.shape)\n testshape[axis] = 1\n oldmostfreq = np.zeros(testshape)\n oldcounts = np.zeros(testshape)\n for score in scores:\n template = np.zeros(a.shape)\n ind = a == score\n template[ind] = w[ind]\n counts = np.expand_dims(np.sum(template, axis), axis)\n mostfrequent = np.where(counts > oldcounts, score, oldmostfreq)\n oldcounts = np.maximum(counts, oldcounts)\n oldmostfreq = mostfrequent\n return mostfrequent, oldcounts" }, { @@ -173782,13 +187562,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __call__(self, *args, **kwargs):\n with config_context(**self.config):\n return self.function(*args, **kwargs)" }, { @@ -173806,7 +187587,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "function", @@ -173816,13 +187598,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, function):\n self.function = function\n self.config = get_config()\n update_wrapper(self, self.function)" }, { @@ -173840,13 +187623,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Returns the copy=False parameter for {ndarray, csr_matrix, csc_matrix}.astype when possible, otherwise don't specify", - "docstring": "Returns the copy=False parameter for\n{ndarray, csr_matrix, csc_matrix}.astype when possible,\notherwise don't specify", + "description": "Returns the copy=False parameter for\n{ndarray, csr_matrix, csc_matrix}.astype when possible,\notherwise don't specify", + "docstring": "Returns the copy=False parameter for\n {ndarray, csr_matrix, csc_matrix}.astype when possible,\n otherwise don't specify\n ", "source_code": "\ndef _astype_copy_false(X):\n \"\"\"Returns the copy=False parameter for\n {ndarray, csr_matrix, csc_matrix}.astype when possible,\n otherwise don't specify\n \"\"\"\n if sp_version >= parse_version('1.1') or not sp.issparse(X):\n return {'copy': False}\n else:\n return {}" }, { @@ -173859,7 +187643,7 @@ "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _get_threadpool_controller():\n if not hasattr(threadpoolctl, 'ThreadpoolController'):\n return None\n if not hasattr(sklearn, '_sklearn_threadpool_controller'):\n 
sklearn._sklearn_threadpool_controller = threadpoolctl.ThreadpoolController()\n return sklearn._sklearn_threadpool_controller" }, { @@ -173871,8 +187655,8 @@ "parameters": [], "results": [], "is_public": false, - "description": "Set joblib.Parallel arguments in a compatible way for 0.11 and 0.12+\n\nFor joblib 0.11 this maps both ``prefer`` and ``require`` parameters to a specific ``backend``.", - "docstring": "Set joblib.Parallel arguments in a compatible way for 0.11 and 0.12+\n\nFor joblib 0.11 this maps both ``prefer`` and ``require`` parameters to\na specific ``backend``.\n\nParameters\n----------\n\nprefer : str in {'processes', 'threads'} or None\n Soft hint to choose the default backend if no specific backend\n was selected with the parallel_backend context manager.\n\nrequire : 'sharedmem' or None\n Hard condstraint to select the backend. If set to 'sharedmem',\n the selected backend will be single-host and thread-based even\n if the user asked for a non-thread based backend with\n parallel_backend.\n\nSee joblib.Parallel documentation for more details", + "description": "Set joblib.Parallel arguments in a compatible way for 0.11 and 0.12+\n\nFor joblib 0.11 this maps both ``prefer`` and ``require`` parameters to\na specific ``backend``.", + "docstring": "Set joblib.Parallel arguments in a compatible way for 0.11 and 0.12+\n\n For joblib 0.11 this maps both ``prefer`` and ``require`` parameters to\n a specific ``backend``.\n\n Parameters\n ----------\n\n prefer : str in {'processes', 'threads'} or None\n Soft hint to choose the default backend if no specific backend\n was selected with the parallel_backend context manager.\n\n require : 'sharedmem' or None\n Hard condstraint to select the backend. If set to 'sharedmem',\n the selected backend will be single-host and thread-based even\n if the user asked for a non-thread based backend with\n parallel_backend.\n\n See joblib.Parallel documentation for more details\n ", "source_code": "\ndef _joblib_parallel_args(**kwargs):\n \"\"\"Set joblib.Parallel arguments in a compatible way for 0.11 and 0.12+\n\n For joblib 0.11 this maps both ``prefer`` and ``require`` parameters to\n a specific ``backend``.\n\n Parameters\n ----------\n\n prefer : str in {'processes', 'threads'} or None\n Soft hint to choose the default backend if no specific backend\n was selected with the parallel_backend context manager.\n\n require : 'sharedmem' or None\n Hard condstraint to select the backend. 
If set to 'sharedmem',\n the selected backend will be single-host and thread-based even\n if the user asked for a non-thread based backend with\n parallel_backend.\n\n See joblib.Parallel documentation for more details\n \"\"\"\n import joblib\n if parse_version(joblib.__version__) >= parse_version('0.12'):\n return kwargs\n extra_args = set(kwargs.keys()).difference({'prefer', 'require'})\n if extra_args:\n raise NotImplementedError('unhandled arguments %s with joblib %s' % (list(extra_args), joblib.__version__))\n args = {}\n if 'prefer' in kwargs:\n prefer = kwargs['prefer']\n if prefer not in ['threads', 'processes', None]:\n raise ValueError('prefer=%s is not supported' % prefer)\n args['backend'] = {'threads': 'threading', 'processes': 'multiprocessing', None: None}[prefer]\n if 'require' in kwargs:\n require = kwargs['require']\n if require not in [None, 'sharedmem']:\n raise ValueError('require=%s is not supported' % require)\n if require == 'sharedmem':\n args['backend'] = 'threading'\n return args" }, { @@ -173890,15 +187674,63 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _object_dtype_isnan(X):\n return X != X" }, + { + "name": "_percentile", + "unique_name": "_percentile", + "qname": "sklearn.utils.fixes._percentile", + "unique_qname": "sklearn.utils.fixes._percentile", + "decorators": [], + "parameters": [ + { + "name": "a", + "default_value": null, + "is_public": false, + "assigned_by": "POSITION_OR_NAME", + "docstring": { + "type": "", + "description": "" + }, + "refined_type": {} + }, + { + "name": "q", + "default_value": null, + "is_public": false, + "assigned_by": "POSITION_OR_NAME", + "docstring": { + "type": "", + "description": "" + }, + "refined_type": {} + }, + { + "name": "method", + "default_value": "'linear'", + "is_public": false, + "assigned_by": "NAME_ONLY", + "docstring": { + "type": "", + "description": "" + }, + "refined_type": {} + } + ], + "results": [], + "is_public": false, + "description": "", + "docstring": null, + "source_code": "\ndef _percentile(a, q, *, method='linear', **kwargs):\n return np.percentile(a, q, interpolation=method, **kwargs)" + }, { "name": "_take_along_axis", "unique_name": "_take_along_axis", @@ -173914,7 +187746,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "indices", @@ -173924,7 +187757,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "axis", @@ -173934,13 +187768,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Implements a simplified version of np.take_along_axis if numpy version < 1.15", - "docstring": "Implements a simplified version of np.take_along_axis if numpy\nversion < 1.15", + "description": "Implements a simplified version of np.take_along_axis if numpy\nversion < 1.15", + "docstring": "Implements a simplified version of np.take_along_axis if numpy\n version < 1.15", "source_code": "\ndef _take_along_axis(arr, indices, axis):\n \"\"\"Implements a simplified version of np.take_along_axis if numpy\n version < 1.15\"\"\"\n if np_version >= parse_version('1.15'):\n return np.take_along_axis(arr=arr, indices=indices, axis=axis)\n else:\n if axis is None:\n arr = arr.flatten()\n if not np.issubdtype(indices.dtype, np.intp):\n raise IndexError('`indices` must be an integer array')\n if arr.ndim 
!= indices.ndim:\n raise ValueError('`indices` and `arr` must have the same number of dimensions')\n shape_ones = (1, ) * indices.ndim\n dest_dims = list(range(axis)) + [None] + list(range(axis + 1, indices.ndim))\n fancy_index = []\n for (dim, n) in zip(dest_dims, arr.shape):\n if dim is None:\n fancy_index.append(indices)\n else:\n ind_shape = shape_ones[:dim] + (-1, ) + shape_ones[dim + 1:]\n fancy_index.append(np.arange(n).reshape(ind_shape))\n fancy_index = tuple(fancy_index)\n return arr[fancy_index]" }, { @@ -173958,7 +187793,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -173982,7 +187818,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "stop", @@ -173992,7 +187829,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "num", @@ -174002,7 +187840,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "endpoint", @@ -174012,7 +187851,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "retstep", @@ -174022,7 +187862,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "dtype", @@ -174032,7 +187873,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "axis", @@ -174042,13 +187884,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Implements a simplified linspace function as of numpy version >= 1.16.\n\nAs of numpy 1.16, the arguments start and stop can be array-like and there is an optional argument `axis`. For simplicity, we only allow 1d array-like to be passed to start and stop. 
See: https://github.com/numpy/numpy/pull/12388 and numpy 1.16 release notes about start and stop arrays for linspace logspace and geomspace.", - "docstring": "Implements a simplified linspace function as of numpy version >= 1.16.\n\nAs of numpy 1.16, the arguments start and stop can be array-like and\nthere is an optional argument `axis`.\nFor simplicity, we only allow 1d array-like to be passed to start and stop.\nSee: https://github.com/numpy/numpy/pull/12388 and numpy 1.16 release\nnotes about start and stop arrays for linspace logspace and geomspace.\n\nReturns\n-------\nout : ndarray of shape (num, n_start) or (num,)\n The output array with `n_start=start.shape[0]` columns.", + "description": "Implements a simplified linspace function as of numpy version >= 1.16.\n\nAs of numpy 1.16, the arguments start and stop can be array-like and\nthere is an optional argument `axis`.\nFor simplicity, we only allow 1d array-like to be passed to start and stop.\nSee: https://github.com/numpy/numpy/pull/12388 and numpy 1.16 release\nnotes about start and stop arrays for linspace logspace and geomspace.", + "docstring": "Implements a simplified linspace function as of numpy version >= 1.16.\n\n As of numpy 1.16, the arguments start and stop can be array-like and\n there is an optional argument `axis`.\n For simplicity, we only allow 1d array-like to be passed to start and stop.\n See: https://github.com/numpy/numpy/pull/12388 and numpy 1.16 release\n notes about start and stop arrays for linspace logspace and geomspace.\n\n Returns\n -------\n out : ndarray of shape (num, n_start) or (num,)\n The output array with `n_start=start.shape[0]` columns.\n ", "source_code": "\ndef linspace(start, stop, num=50, endpoint=True, retstep=False, dtype=None, axis=0):\n \"\"\"Implements a simplified linspace function as of numpy version >= 1.16.\n\n As of numpy 1.16, the arguments start and stop can be array-like and\n there is an optional argument `axis`.\n For simplicity, we only allow 1d array-like to be passed to start and stop.\n See: https://github.com/numpy/numpy/pull/12388 and numpy 1.16 release\n notes about start and stop arrays for linspace logspace and geomspace.\n\n Returns\n -------\n out : ndarray of shape (num, n_start) or (num,)\n The output array with `n_start=start.shape[0]` columns.\n \"\"\"\n if np_version < parse_version('1.16'):\n start = np.asanyarray(start) * 1.0\n stop = np.asanyarray(stop) * 1.0\n dt = np.result_type(start, stop, float(num))\n if dtype is None:\n dtype = dt\n if start.ndim == 0 == stop.ndim:\n return np.linspace(start=start, stop=stop, num=num, endpoint=endpoint, retstep=retstep, dtype=dtype)\n if start.ndim != 1 or stop.ndim != 1 or start.shape != stop.shape:\n raise ValueError('start and stop must be 1d array-like of same shape.')\n n_start = start.shape[0]\n out = np.empty((num, n_start), dtype=dtype)\n step = np.empty(n_start, dtype=np.float)\n for i in range(n_start):\n (out[:, i], step[i]) = np.linspace(start=start[i], stop=stop[i], num=num, endpoint=endpoint, retstep=True, dtype=dtype)\n if axis != 0:\n out = np.moveaxis(out, 0, axis)\n if retstep:\n return out, step\n else:\n return out\n else:\n return np.linspace(start=start, stop=stop, num=num, endpoint=endpoint, retstep=retstep, dtype=dtype, axis=axis)" }, { @@ -174061,7 +187904,7 @@ "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef threadpool_info():\n 
controller = _get_threadpool_controller()\n if controller is not None:\n return controller.info()\n else:\n return threadpoolctl.threadpool_info()" }, { @@ -174079,7 +187922,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "user_api", @@ -174089,13 +187933,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef threadpool_limits(limits=None, user_api=None):\n controller = _get_threadpool_controller()\n if controller is not None:\n return controller.limit(limits=limits, user_api=user_api)\n else:\n return threadpoolctl.threadpool_limits(limits=limits, user_api=user_api)" }, { @@ -174113,7 +187958,8 @@ "docstring": { "type": "int", "description": "" - } + }, + "refined_type": {} }, { "name": "batch_size", @@ -174123,7 +187969,8 @@ "docstring": { "type": "int", "description": "Number of element in each batch." - } + }, + "refined_type": {} }, { "name": "min_batch_size", @@ -174133,13 +187980,14 @@ "docstring": { "type": "int, default=0", "description": "Minimum batch size to produce." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Generator to create slices containing batch_size elements, from 0 to n.\n\nThe last slice may contain less than batch_size elements, when batch_size does not divide n.", - "docstring": "Generator to create slices containing batch_size elements, from 0 to n.\n\nThe last slice may contain less than batch_size elements, when batch_size\ndoes not divide n.\n\nParameters\n----------\nn : int\nbatch_size : int\n Number of element in each batch.\nmin_batch_size : int, default=0\n Minimum batch size to produce.\n\nYields\n------\nslice of batch_size elements\n\nSee Also\n--------\ngen_even_slices: Generator to create n_packs slices going up to n.\n\nExamples\n--------\n>>> from sklearn.utils import gen_batches\n>>> list(gen_batches(7, 3))\n[slice(0, 3, None), slice(3, 6, None), slice(6, 7, None)]\n>>> list(gen_batches(6, 3))\n[slice(0, 3, None), slice(3, 6, None)]\n>>> list(gen_batches(2, 3))\n[slice(0, 2, None)]\n>>> list(gen_batches(7, 3, min_batch_size=0))\n[slice(0, 3, None), slice(3, 6, None), slice(6, 7, None)]\n>>> list(gen_batches(7, 3, min_batch_size=2))\n[slice(0, 3, None), slice(3, 7, None)]", + "description": "Generator to create slices containing batch_size elements, from 0 to n.\n\nThe last slice may contain less than batch_size elements, when batch_size\ndoes not divide n.", + "docstring": "Generator to create slices containing batch_size elements, from 0 to n.\n\n The last slice may contain less than batch_size elements, when batch_size\n does not divide n.\n\n Parameters\n ----------\n n : int\n batch_size : int\n Number of element in each batch.\n min_batch_size : int, default=0\n Minimum batch size to produce.\n\n Yields\n ------\n slice of batch_size elements\n\n See Also\n --------\n gen_even_slices: Generator to create n_packs slices going up to n.\n\n Examples\n --------\n >>> from sklearn.utils import gen_batches\n >>> list(gen_batches(7, 3))\n [slice(0, 3, None), slice(3, 6, None), slice(6, 7, None)]\n >>> list(gen_batches(6, 3))\n [slice(0, 3, None), slice(3, 6, None)]\n >>> list(gen_batches(2, 3))\n [slice(0, 2, None)]\n >>> list(gen_batches(7, 3, min_batch_size=0))\n [slice(0, 3, None), slice(3, 6, None), slice(6, 7, None)]\n >>> list(gen_batches(7, 3, min_batch_size=2))\n [slice(0, 3, None), slice(3, 7, None)]\n ", 
"source_code": "\ndef gen_batches(n, batch_size, *, min_batch_size=0):\n \"\"\"Generator to create slices containing batch_size elements, from 0 to n.\n\n The last slice may contain less than batch_size elements, when batch_size\n does not divide n.\n\n Parameters\n ----------\n n : int\n batch_size : int\n Number of element in each batch.\n min_batch_size : int, default=0\n Minimum batch size to produce.\n\n Yields\n ------\n slice of batch_size elements\n\n See Also\n --------\n gen_even_slices: Generator to create n_packs slices going up to n.\n\n Examples\n --------\n >>> from sklearn.utils import gen_batches\n >>> list(gen_batches(7, 3))\n [slice(0, 3, None), slice(3, 6, None), slice(6, 7, None)]\n >>> list(gen_batches(6, 3))\n [slice(0, 3, None), slice(3, 6, None)]\n >>> list(gen_batches(2, 3))\n [slice(0, 2, None)]\n >>> list(gen_batches(7, 3, min_batch_size=0))\n [slice(0, 3, None), slice(3, 6, None), slice(6, 7, None)]\n >>> list(gen_batches(7, 3, min_batch_size=2))\n [slice(0, 3, None), slice(3, 7, None)]\n \"\"\"\n if not isinstance(batch_size, numbers.Integral):\n raise TypeError('gen_batches got batch_size=%s, must be an integer' % batch_size)\n if batch_size <= 0:\n raise ValueError('gen_batches got batch_size=%s, must be positive' % batch_size)\n start = 0\n for _ in range(int(n // batch_size)):\n end = start + batch_size\n if end + min_batch_size > n:\n continue\n yield slice(start, end)\n start = end\n if start < n:\n yield slice(start, n)" }, { @@ -174157,7 +188005,8 @@ "docstring": { "type": "int", "description": "" - } + }, + "refined_type": {} }, { "name": "n_packs", @@ -174167,7 +188016,8 @@ "docstring": { "type": "int", "description": "Number of slices to generate." - } + }, + "refined_type": {} }, { "name": "n_samples", @@ -174177,13 +188027,14 @@ "docstring": { "type": "int, default=None", "description": "Number of samples. Pass n_samples when the slices are to be used for\nsparse matrix indexing; slicing off-the-end raises an exception, while\nit works for NumPy arrays." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Generator to create n_packs slices going up to n.", - "docstring": "Generator to create n_packs slices going up to n.\n\nParameters\n----------\nn : int\nn_packs : int\n Number of slices to generate.\nn_samples : int, default=None\n Number of samples. Pass n_samples when the slices are to be used for\n sparse matrix indexing; slicing off-the-end raises an exception, while\n it works for NumPy arrays.\n\nYields\n------\nslice\n\nSee Also\n--------\ngen_batches: Generator to create slices containing batch_size elements\n from 0 to n.\n\nExamples\n--------\n>>> from sklearn.utils import gen_even_slices\n>>> list(gen_even_slices(10, 1))\n[slice(0, 10, None)]\n>>> list(gen_even_slices(10, 10))\n[slice(0, 1, None), slice(1, 2, None), ..., slice(9, 10, None)]\n>>> list(gen_even_slices(10, 5))\n[slice(0, 2, None), slice(2, 4, None), ..., slice(8, 10, None)]\n>>> list(gen_even_slices(10, 3))\n[slice(0, 4, None), slice(4, 7, None), slice(7, 10, None)]", + "docstring": "Generator to create n_packs slices going up to n.\n\n Parameters\n ----------\n n : int\n n_packs : int\n Number of slices to generate.\n n_samples : int, default=None\n Number of samples. 
Pass n_samples when the slices are to be used for\n sparse matrix indexing; slicing off-the-end raises an exception, while\n it works for NumPy arrays.\n\n Yields\n ------\n slice\n\n See Also\n --------\n gen_batches: Generator to create slices containing batch_size elements\n from 0 to n.\n\n Examples\n --------\n >>> from sklearn.utils import gen_even_slices\n >>> list(gen_even_slices(10, 1))\n [slice(0, 10, None)]\n >>> list(gen_even_slices(10, 10))\n [slice(0, 1, None), slice(1, 2, None), ..., slice(9, 10, None)]\n >>> list(gen_even_slices(10, 5))\n [slice(0, 2, None), slice(2, 4, None), ..., slice(8, 10, None)]\n >>> list(gen_even_slices(10, 3))\n [slice(0, 4, None), slice(4, 7, None), slice(7, 10, None)]\n ", "source_code": "\ndef gen_even_slices(n, n_packs, *, n_samples=None):\n \"\"\"Generator to create n_packs slices going up to n.\n\n Parameters\n ----------\n n : int\n n_packs : int\n Number of slices to generate.\n n_samples : int, default=None\n Number of samples. Pass n_samples when the slices are to be used for\n sparse matrix indexing; slicing off-the-end raises an exception, while\n it works for NumPy arrays.\n\n Yields\n ------\n slice\n\n See Also\n --------\n gen_batches: Generator to create slices containing batch_size elements\n from 0 to n.\n\n Examples\n --------\n >>> from sklearn.utils import gen_even_slices\n >>> list(gen_even_slices(10, 1))\n [slice(0, 10, None)]\n >>> list(gen_even_slices(10, 10))\n [slice(0, 1, None), slice(1, 2, None), ..., slice(9, 10, None)]\n >>> list(gen_even_slices(10, 5))\n [slice(0, 2, None), slice(2, 4, None), ..., slice(8, 10, None)]\n >>> list(gen_even_slices(10, 3))\n [slice(0, 4, None), slice(4, 7, None), slice(7, 10, None)]\n \"\"\"\n start = 0\n if n_packs < 1:\n raise ValueError('gen_even_slices got n_packs=%s, must be >=1' % n_packs)\n for pack_num in range(n_packs):\n this_n = n // n_packs\n if pack_num < n % n_packs:\n this_n += 1\n if this_n > 0:\n end = start + this_n\n if n_samples is not None:\n end = min(n_samples, end)\n yield slice(start, end, None)\n start = end" }, { @@ -174201,7 +188052,8 @@ "docstring": { "type": "int", "description": "The expected number of bytes of memory that will be consumed\nduring the processing of each row." - } + }, + "refined_type": {} }, { "name": "max_n_rows", @@ -174211,7 +188063,8 @@ "docstring": { "type": "int, default=None", "description": "The maximum return value." - } + }, + "refined_type": {} }, { "name": "working_memory", @@ -174221,13 +188074,14 @@ "docstring": { "type": "int or float, default=None", "description": "The number of rows to fit inside this number of MiB will be returned.\nWhen None (default), the value of\n``sklearn.get_config()['working_memory']`` is used." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Calculates how many rows can be processed within working_memory.", - "docstring": "Calculates how many rows can be processed within working_memory.\n\nParameters\n----------\nrow_bytes : int\n The expected number of bytes of memory that will be consumed\n during the processing of each row.\nmax_n_rows : int, default=None\n The maximum return value.\nworking_memory : int or float, default=None\n The number of rows to fit inside this number of MiB will be returned.\n When None (default), the value of\n ``sklearn.get_config()['working_memory']`` is used.\n\nReturns\n-------\nint or the value of n_samples\n\nWarns\n-----\nIssues a UserWarning if ``row_bytes`` exceeds ``working_memory`` MiB.", + "docstring": "Calculates how many rows can be processed within working_memory.\n\n Parameters\n ----------\n row_bytes : int\n The expected number of bytes of memory that will be consumed\n during the processing of each row.\n max_n_rows : int, default=None\n The maximum return value.\n working_memory : int or float, default=None\n The number of rows to fit inside this number of MiB will be returned.\n When None (default), the value of\n ``sklearn.get_config()['working_memory']`` is used.\n\n Returns\n -------\n int or the value of n_samples\n\n Warns\n -----\n Issues a UserWarning if ``row_bytes`` exceeds ``working_memory`` MiB.\n ", "source_code": "\ndef get_chunk_n_rows(row_bytes, *, max_n_rows=None, working_memory=None):\n \"\"\"Calculates how many rows can be processed within working_memory.\n\n Parameters\n ----------\n row_bytes : int\n The expected number of bytes of memory that will be consumed\n during the processing of each row.\n max_n_rows : int, default=None\n The maximum return value.\n working_memory : int or float, default=None\n The number of rows to fit inside this number of MiB will be returned.\n When None (default), the value of\n ``sklearn.get_config()['working_memory']`` is used.\n\n Returns\n -------\n int or the value of n_samples\n\n Warns\n -----\n Issues a UserWarning if ``row_bytes`` exceeds ``working_memory`` MiB.\n \"\"\"\n if working_memory is None:\n working_memory = get_config()['working_memory']\n chunk_n_rows = int(working_memory * 2**20 // row_bytes)\n if max_n_rows is not None:\n chunk_n_rows = min(chunk_n_rows, max_n_rows)\n if chunk_n_rows < 1:\n warnings.warn('Could not adhere to working_memory config. Currently %.0fMiB, %.0fMiB required.' % (working_memory, np.ceil(row_bytes * 2**(-20))))\n chunk_n_rows = 1\n return chunk_n_rows" }, { @@ -174245,7 +188099,8 @@ "docstring": { "type": "array of shape (n_samples, n_features) or (n_samples, n_samples)", "description": "Features to compute the pairwise distances. If `metric =\n\"precomputed\"`, X is the matrix of pairwise distances." - } + }, + "refined_type": {} }, { "name": "graph", @@ -174255,7 +188110,8 @@ "docstring": { "type": "sparse matrix of shape (n_samples, n_samples)", "description": "Graph of connection between samples." - } + }, + "refined_type": {} }, { "name": "n_connected_components", @@ -174265,7 +188121,8 @@ "docstring": { "type": "int", "description": "Number of connected components, as computed by\n`scipy.sparse.csgraph.connected_components`." - } + }, + "refined_type": {} }, { "name": "component_labels", @@ -174275,7 +188132,8 @@ "docstring": { "type": "array of shape (n_samples)", "description": "Labels of connected components, as computed by\n`scipy.sparse.csgraph.connected_components`." 
- } + }, + "refined_type": {} }, { "name": "mode", @@ -174285,6 +188143,10 @@ "docstring": { "type": "{'connectivity', 'distance'}, default='distance'", "description": "Type of graph matrix: 'connectivity' corresponds to the connectivity\nmatrix with ones and zeros, and 'distance' corresponds to the distances\nbetween neighbors according to the given metric." + }, + "refined_type": { + "kind": "EnumType", + "values": ["distance", "connectivity"] } }, { @@ -174295,14 +188157,15 @@ "docstring": { "type": "str", "description": "Metric used in `sklearn.metrics.pairwise.pairwise_distances`." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Add connections to sparse graph to connect unconnected components.\n\nFor each pair of unconnected components, compute all pairwise distances from one component to the other, and add a connection on the closest pair of samples. This is a hacky way to get a graph with a single connected component, which is necessary for example to compute a shortest path between all pairs of samples in the graph.", - "docstring": "Add connections to sparse graph to connect unconnected components.\n\nFor each pair of unconnected components, compute all pairwise distances\nfrom one component to the other, and add a connection on the closest pair\nof samples. This is a hacky way to get a graph with a single connected\ncomponent, which is necessary for example to compute a shortest path\nbetween all pairs of samples in the graph.\n\nParameters\n----------\nX : array of shape (n_samples, n_features) or (n_samples, n_samples)\n Features to compute the pairwise distances. If `metric =\n \"precomputed\"`, X is the matrix of pairwise distances.\n\ngraph : sparse matrix of shape (n_samples, n_samples)\n Graph of connection between samples.\n\nn_connected_components : int\n Number of connected components, as computed by\n `scipy.sparse.csgraph.connected_components`.\n\ncomponent_labels : array of shape (n_samples)\n Labels of connected components, as computed by\n `scipy.sparse.csgraph.connected_components`.\n\nmode : {'connectivity', 'distance'}, default='distance'\n Type of graph matrix: 'connectivity' corresponds to the connectivity\n matrix with ones and zeros, and 'distance' corresponds to the distances\n between neighbors according to the given metric.\n\nmetric : str\n Metric used in `sklearn.metrics.pairwise.pairwise_distances`.\n\nkwargs : kwargs\n Keyword arguments passed to\n `sklearn.metrics.pairwise.pairwise_distances`.\n\nReturns\n-------\ngraph : sparse matrix of shape (n_samples, n_samples)\n Graph of connection between samples, with a single connected component.", - "source_code": "\ndef _fix_connected_components(X, graph, n_connected_components, component_labels, mode='distance', metric='euclidean', **kwargs):\n \"\"\"Add connections to sparse graph to connect unconnected components.\n\n For each pair of unconnected components, compute all pairwise distances\n from one component to the other, and add a connection on the closest pair\n of samples. This is a hacky way to get a graph with a single connected\n component, which is necessary for example to compute a shortest path\n between all pairs of samples in the graph.\n\n Parameters\n ----------\n X : array of shape (n_samples, n_features) or (n_samples, n_samples)\n Features to compute the pairwise distances. 
If `metric =\n \"precomputed\"`, X is the matrix of pairwise distances.\n\n graph : sparse matrix of shape (n_samples, n_samples)\n Graph of connection between samples.\n\n n_connected_components : int\n Number of connected components, as computed by\n `scipy.sparse.csgraph.connected_components`.\n\n component_labels : array of shape (n_samples)\n Labels of connected components, as computed by\n `scipy.sparse.csgraph.connected_components`.\n\n mode : {'connectivity', 'distance'}, default='distance'\n Type of graph matrix: 'connectivity' corresponds to the connectivity\n matrix with ones and zeros, and 'distance' corresponds to the distances\n between neighbors according to the given metric.\n\n metric : str\n Metric used in `sklearn.metrics.pairwise.pairwise_distances`.\n\n kwargs : kwargs\n Keyword arguments passed to\n `sklearn.metrics.pairwise.pairwise_distances`.\n\n Returns\n -------\n graph : sparse matrix of shape (n_samples, n_samples)\n Graph of connection between samples, with a single connected component.\n \"\"\"\n for i in range(n_connected_components):\n idx_i = np.flatnonzero(component_labels == i)\n Xi = X[idx_i]\n for j in range(i):\n idx_j = np.flatnonzero(component_labels == j)\n Xj = X[idx_j]\n if metric == 'precomputed':\n D = X[np.ix_(idx_i, idx_j)]\n else:\n D = pairwise_distances(Xi, Xj, metric=metric, **kwargs)\n (ii, jj) = np.unravel_index(D.argmin(axis=None), D.shape)\n if mode == 'connectivity':\n graph[idx_i[ii], idx_j[jj]] = 1\n graph[idx_j[jj], idx_i[ii]] = 1\n elif mode == 'distance':\n graph[idx_i[ii], idx_j[jj]] = D[ii, jj]\n graph[idx_j[jj], idx_i[ii]] = D[ii, jj]\n else:\n raise ValueError(\"Unknown mode=%r, should be one of ['connectivity', 'distance'].\" % mode)\n return graph" + "description": "Add connections to sparse graph to connect unconnected components.\n\nFor each pair of unconnected components, compute all pairwise distances\nfrom one component to the other, and add a connection on the closest pair\nof samples. This is a hacky way to get a graph with a single connected\ncomponent, which is necessary for example to compute a shortest path\nbetween all pairs of samples in the graph.", + "docstring": "Add connections to sparse graph to connect unconnected components.\n\n For each pair of unconnected components, compute all pairwise distances\n from one component to the other, and add a connection on the closest pair\n of samples. This is a hacky way to get a graph with a single connected\n component, which is necessary for example to compute a shortest path\n between all pairs of samples in the graph.\n\n Parameters\n ----------\n X : array of shape (n_samples, n_features) or (n_samples, n_samples)\n Features to compute the pairwise distances. 
If `metric =\n \"precomputed\"`, X is the matrix of pairwise distances.\n\n graph : sparse matrix of shape (n_samples, n_samples)\n Graph of connection between samples.\n\n n_connected_components : int\n Number of connected components, as computed by\n `scipy.sparse.csgraph.connected_components`.\n\n component_labels : array of shape (n_samples)\n Labels of connected components, as computed by\n `scipy.sparse.csgraph.connected_components`.\n\n mode : {'connectivity', 'distance'}, default='distance'\n Type of graph matrix: 'connectivity' corresponds to the connectivity\n matrix with ones and zeros, and 'distance' corresponds to the distances\n between neighbors according to the given metric.\n\n metric : str\n Metric used in `sklearn.metrics.pairwise.pairwise_distances`.\n\n kwargs : kwargs\n Keyword arguments passed to\n `sklearn.metrics.pairwise.pairwise_distances`.\n\n Returns\n -------\n graph : sparse matrix of shape (n_samples, n_samples)\n Graph of connection between samples, with a single connected component.\n ", + "source_code": "\ndef _fix_connected_components(X, graph, n_connected_components, component_labels, mode='distance', metric='euclidean', **kwargs):\n \"\"\"Add connections to sparse graph to connect unconnected components.\n\n For each pair of unconnected components, compute all pairwise distances\n from one component to the other, and add a connection on the closest pair\n of samples. This is a hacky way to get a graph with a single connected\n component, which is necessary for example to compute a shortest path\n between all pairs of samples in the graph.\n\n Parameters\n ----------\n X : array of shape (n_samples, n_features) or (n_samples, n_samples)\n Features to compute the pairwise distances. If `metric =\n \"precomputed\"`, X is the matrix of pairwise distances.\n\n graph : sparse matrix of shape (n_samples, n_samples)\n Graph of connection between samples.\n\n n_connected_components : int\n Number of connected components, as computed by\n `scipy.sparse.csgraph.connected_components`.\n\n component_labels : array of shape (n_samples)\n Labels of connected components, as computed by\n `scipy.sparse.csgraph.connected_components`.\n\n mode : {'connectivity', 'distance'}, default='distance'\n Type of graph matrix: 'connectivity' corresponds to the connectivity\n matrix with ones and zeros, and 'distance' corresponds to the distances\n between neighbors according to the given metric.\n\n metric : str\n Metric used in `sklearn.metrics.pairwise.pairwise_distances`.\n\n kwargs : kwargs\n Keyword arguments passed to\n `sklearn.metrics.pairwise.pairwise_distances`.\n\n Returns\n -------\n graph : sparse matrix of shape (n_samples, n_samples)\n Graph of connection between samples, with a single connected component.\n \"\"\"\n if metric == 'precomputed' and sparse.issparse(X):\n raise RuntimeError(\"_fix_connected_components with metric='precomputed' requires the full distance matrix in X, and does not work with a sparse neighbors graph.\")\n for i in range(n_connected_components):\n idx_i = np.flatnonzero(component_labels == i)\n Xi = X[idx_i]\n for j in range(i):\n idx_j = np.flatnonzero(component_labels == j)\n Xj = X[idx_j]\n if metric == 'precomputed':\n D = X[np.ix_(idx_i, idx_j)]\n else:\n D = pairwise_distances(Xi, Xj, metric=metric, **kwargs)\n (ii, jj) = np.unravel_index(D.argmin(axis=None), D.shape)\n if mode == 'connectivity':\n graph[idx_i[ii], idx_j[jj]] = 1\n graph[idx_j[jj], idx_i[ii]] = 1\n elif mode == 'distance':\n graph[idx_i[ii], idx_j[jj]] = D[ii, jj]\n 
graph[idx_j[jj], idx_i[ii]] = D[ii, jj]\n else:\n raise ValueError(\"Unknown mode=%r, should be one of ['connectivity', 'distance'].\" % mode)\n return graph" }, { "name": "graph_shortest_path", @@ -174321,7 +188184,8 @@ "docstring": { "type": "arraylike or sparse matrix, shape = (N,N)", "description": "Array of positive distances.\nIf vertex i is connected to vertex j, then dist_matrix[i,j] gives\nthe distance between the vertices.\nIf vertex i is not connected to vertex j, then dist_matrix[i,j] = 0" - } + }, + "refined_type": {} }, { "name": "directed", @@ -174331,7 +188195,8 @@ "docstring": { "type": "boolean", "description": "if True, then find the shortest path on a directed graph: only\nprogress from a point to its neighbors, not the other way around.\nif False, then find the shortest path on an undirected graph: the\nalgorithm can progress from a point to its neighbors and vice versa." - } + }, + "refined_type": {} }, { "name": "method", @@ -174341,13 +188206,17 @@ "docstring": { "type": "{'auto', 'FW', 'D'}, default='auto'", "description": "method to use. Options are\n'auto' : attempt to choose the best method for the current problem\n'FW' : Floyd-Warshall algorithm. O[N^3]\n'D' : Dijkstra's algorithm with Fibonacci stacks. O[(k+log(N))N^2]" + }, + "refined_type": { + "kind": "EnumType", + "values": ["auto", "FW", "D"] } } ], "results": [], "is_public": true, "description": "Shortest-path graph search on a positive directed or undirected graph.", - "docstring": "Shortest-path graph search on a positive directed or undirected graph.\n\nParameters\n----------\ndist_matrix : arraylike or sparse matrix, shape = (N,N)\n Array of positive distances.\n If vertex i is connected to vertex j, then dist_matrix[i,j] gives\n the distance between the vertices.\n If vertex i is not connected to vertex j, then dist_matrix[i,j] = 0\n\ndirected : boolean\n if True, then find the shortest path on a directed graph: only\n progress from a point to its neighbors, not the other way around.\n if False, then find the shortest path on an undirected graph: the\n algorithm can progress from a point to its neighbors and vice versa.\n\nmethod : {'auto', 'FW', 'D'}, default='auto'\n method to use. Options are\n 'auto' : attempt to choose the best method for the current problem\n 'FW' : Floyd-Warshall algorithm. O[N^3]\n 'D' : Dijkstra's algorithm with Fibonacci stacks. O[(k+log(N))N^2]\n\nReturns\n-------\nG : np.ndarray, float, shape = [N,N]\n G[i,j] gives the shortest distance from point i to point j\n along the graph.\n\nNotes\n-----\nAs currently implemented, Dijkstra's algorithm does not work for\ngraphs with direction-dependent distances when directed == False.\ni.e., if dist_matrix[i,j] and dist_matrix[j,i] are not equal and\nboth are nonzero, method='D' will not necessarily yield the correct\nresult.\nAlso, these routines have not been tested for graphs with negative\ndistances. 
Negative distances can lead to infinite cycles that must\nbe handled by specialized algorithms.", + "docstring": "Shortest-path graph search on a positive directed or undirected graph.\n\n Parameters\n ----------\n dist_matrix : arraylike or sparse matrix, shape = (N,N)\n Array of positive distances.\n If vertex i is connected to vertex j, then dist_matrix[i,j] gives\n the distance between the vertices.\n If vertex i is not connected to vertex j, then dist_matrix[i,j] = 0\n\n directed : boolean\n if True, then find the shortest path on a directed graph: only\n progress from a point to its neighbors, not the other way around.\n if False, then find the shortest path on an undirected graph: the\n algorithm can progress from a point to its neighbors and vice versa.\n\n method : {'auto', 'FW', 'D'}, default='auto'\n method to use. Options are\n 'auto' : attempt to choose the best method for the current problem\n 'FW' : Floyd-Warshall algorithm. O[N^3]\n 'D' : Dijkstra's algorithm with Fibonacci stacks. O[(k+log(N))N^2]\n\n Returns\n -------\n G : np.ndarray, float, shape = [N,N]\n G[i,j] gives the shortest distance from point i to point j\n along the graph.\n\n Notes\n -----\n As currently implemented, Dijkstra's algorithm does not work for\n graphs with direction-dependent distances when directed == False.\n i.e., if dist_matrix[i,j] and dist_matrix[j,i] are not equal and\n both are nonzero, method='D' will not necessarily yield the correct\n result.\n Also, these routines have not been tested for graphs with negative\n distances. Negative distances can lead to infinite cycles that must\n be handled by specialized algorithms.\n ", "source_code": "\n@deprecated('`graph_shortest_path` is deprecated in 1.0 (renaming of 0.25) and will be removed in 1.2. Use `scipy.sparse.csgraph.shortest_path` instead.')\ndef graph_shortest_path(dist_matrix, directed=True, method='auto'):\n \"\"\"Shortest-path graph search on a positive directed or undirected graph.\n\n Parameters\n ----------\n dist_matrix : arraylike or sparse matrix, shape = (N,N)\n Array of positive distances.\n If vertex i is connected to vertex j, then dist_matrix[i,j] gives\n the distance between the vertices.\n If vertex i is not connected to vertex j, then dist_matrix[i,j] = 0\n\n directed : boolean\n if True, then find the shortest path on a directed graph: only\n progress from a point to its neighbors, not the other way around.\n if False, then find the shortest path on an undirected graph: the\n algorithm can progress from a point to its neighbors and vice versa.\n\n method : {'auto', 'FW', 'D'}, default='auto'\n method to use. Options are\n 'auto' : attempt to choose the best method for the current problem\n 'FW' : Floyd-Warshall algorithm. O[N^3]\n 'D' : Dijkstra's algorithm with Fibonacci stacks. O[(k+log(N))N^2]\n\n Returns\n -------\n G : np.ndarray, float, shape = [N,N]\n G[i,j] gives the shortest distance from point i to point j\n along the graph.\n\n Notes\n -----\n As currently implemented, Dijkstra's algorithm does not work for\n graphs with direction-dependent distances when directed == False.\n i.e., if dist_matrix[i,j] and dist_matrix[j,i] are not equal and\n both are nonzero, method='D' will not necessarily yield the correct\n result.\n Also, these routines have not been tested for graphs with negative\n distances. 
Negative distances can lead to infinite cycles that must\n be handled by specialized algorithms.\n \"\"\"\n return sparse.csgraph.shortest_path(dist_matrix, method=method, directed=directed)" }, { @@ -174365,6 +188234,10 @@ "docstring": { "type": "{sparse matrix, ndarray} of shape (n, n)", "description": "Adjacency matrix of the graph. Sparse matrix of format LIL is\npreferred." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -174375,7 +188248,8 @@ "docstring": { "type": "int", "description": "Starting node for path." - } + }, + "refined_type": {} }, { "name": "cutoff", @@ -174385,13 +188259,14 @@ "docstring": { "type": "int, default=None", "description": "Depth to stop the search - only paths of length <= cutoff are returned." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Return the shortest path length from source to all reachable nodes.\n\nReturns a dictionary of shortest path lengths keyed by target.", - "docstring": "Return the shortest path length from source to all reachable nodes.\n\nReturns a dictionary of shortest path lengths keyed by target.\n\nParameters\n----------\ngraph : {sparse matrix, ndarray} of shape (n, n)\n Adjacency matrix of the graph. Sparse matrix of format LIL is\n preferred.\n\nsource : int\n Starting node for path.\n\ncutoff : int, default=None\n Depth to stop the search - only paths of length <= cutoff are returned.\n\nExamples\n--------\n>>> from sklearn.utils.graph import single_source_shortest_path_length\n>>> import numpy as np\n>>> graph = np.array([[ 0, 1, 0, 0],\n... [ 1, 0, 1, 0],\n... [ 0, 1, 0, 1],\n... [ 0, 0, 1, 0]])\n>>> list(sorted(single_source_shortest_path_length(graph, 0).items()))\n[(0, 0), (1, 1), (2, 2), (3, 3)]\n>>> graph = np.ones((6, 6))\n>>> list(sorted(single_source_shortest_path_length(graph, 2).items()))\n[(0, 1), (1, 1), (2, 0), (3, 1), (4, 1), (5, 1)]", + "docstring": "Return the shortest path length from source to all reachable nodes.\n\n Returns a dictionary of shortest path lengths keyed by target.\n\n Parameters\n ----------\n graph : {sparse matrix, ndarray} of shape (n, n)\n Adjacency matrix of the graph. Sparse matrix of format LIL is\n preferred.\n\n source : int\n Starting node for path.\n\n cutoff : int, default=None\n Depth to stop the search - only paths of length <= cutoff are returned.\n\n Examples\n --------\n >>> from sklearn.utils.graph import single_source_shortest_path_length\n >>> import numpy as np\n >>> graph = np.array([[ 0, 1, 0, 0],\n ... [ 1, 0, 1, 0],\n ... [ 0, 1, 0, 1],\n ... [ 0, 0, 1, 0]])\n >>> list(sorted(single_source_shortest_path_length(graph, 0).items()))\n [(0, 0), (1, 1), (2, 2), (3, 3)]\n >>> graph = np.ones((6, 6))\n >>> list(sorted(single_source_shortest_path_length(graph, 2).items()))\n [(0, 1), (1, 1), (2, 0), (3, 1), (4, 1), (5, 1)]\n ", "source_code": "\ndef single_source_shortest_path_length(graph, source, *, cutoff=None):\n \"\"\"Return the shortest path length from source to all reachable nodes.\n\n Returns a dictionary of shortest path lengths keyed by target.\n\n Parameters\n ----------\n graph : {sparse matrix, ndarray} of shape (n, n)\n Adjacency matrix of the graph. 
Sparse matrix of format LIL is\n preferred.\n\n source : int\n Starting node for path.\n\n cutoff : int, default=None\n Depth to stop the search - only paths of length <= cutoff are returned.\n\n Examples\n --------\n >>> from sklearn.utils.graph import single_source_shortest_path_length\n >>> import numpy as np\n >>> graph = np.array([[ 0, 1, 0, 0],\n ... [ 1, 0, 1, 0],\n ... [ 0, 1, 0, 1],\n ... [ 0, 0, 1, 0]])\n >>> list(sorted(single_source_shortest_path_length(graph, 0).items()))\n [(0, 0), (1, 1), (2, 2), (3, 3)]\n >>> graph = np.ones((6, 6))\n >>> list(sorted(single_source_shortest_path_length(graph, 2).items()))\n [(0, 1), (1, 1), (2, 0), (3, 1), (4, 1), (5, 1)]\n \"\"\"\n if sparse.isspmatrix(graph):\n graph = graph.tolil()\n else:\n graph = sparse.lil_matrix(graph)\n seen = {}\n level = 0\n next_level = [source]\n while next_level:\n this_level = next_level\n next_level = set()\n for v in this_level:\n if v not in seen:\n seen[v] = level\n next_level.update(graph.rows[v])\n if cutoff is not None and cutoff <= level:\n break\n level += 1\n return seen" }, { @@ -174409,7 +188284,8 @@ "docstring": { "type": "list-like", "description": "List of integers treated as indices." - } + }, + "refined_type": {} }, { "name": "mask_length", @@ -174419,13 +188295,14 @@ "docstring": { "type": "int", "description": "Length of boolean mask to be generated.\nThis parameter must be greater than max(indices)." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Convert list of indices to boolean mask.", - "docstring": "Convert list of indices to boolean mask.\n\nParameters\n----------\nindices : list-like\n List of integers treated as indices.\nmask_length : int\n Length of boolean mask to be generated.\n This parameter must be greater than max(indices).\n\nReturns\n-------\nmask : 1d boolean nd-array\n Boolean array that is True where indices are present, else False.\n\nExamples\n--------\n>>> from sklearn.utils import indices_to_mask\n>>> indices = [1, 2 , 3, 4]\n>>> indices_to_mask(indices, 5)\narray([False, True, True, True, True])", + "docstring": "Convert list of indices to boolean mask.\n\n Parameters\n ----------\n indices : list-like\n List of integers treated as indices.\n mask_length : int\n Length of boolean mask to be generated.\n This parameter must be greater than max(indices).\n\n Returns\n -------\n mask : 1d boolean nd-array\n Boolean array that is True where indices are present, else False.\n\n Examples\n --------\n >>> from sklearn.utils import indices_to_mask\n >>> indices = [1, 2 , 3, 4]\n >>> indices_to_mask(indices, 5)\n array([False, True, True, True, True])\n ", "source_code": "\ndef indices_to_mask(indices, mask_length):\n \"\"\"Convert list of indices to boolean mask.\n\n Parameters\n ----------\n indices : list-like\n List of integers treated as indices.\n mask_length : int\n Length of boolean mask to be generated.\n This parameter must be greater than max(indices).\n\n Returns\n -------\n mask : 1d boolean nd-array\n Boolean array that is True where indices are present, else False.\n\n Examples\n --------\n >>> from sklearn.utils import indices_to_mask\n >>> indices = [1, 2 , 3, 4]\n >>> indices_to_mask(indices, 5)\n array([False, True, True, True, True])\n \"\"\"\n if mask_length <= np.max(indices):\n raise ValueError('mask_length must be greater than max(indices)')\n mask = np.zeros(mask_length, dtype=bool)\n mask[indices] = True\n return mask" }, { @@ -174443,13 +188320,14 @@ "docstring": { "type": "any type", "description": "" - } + 
}, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Tests if x is NaN.\n\nThis function is meant to overcome the issue that np.isnan does not allow non-numerical types as input, and that np.nan is not float('nan').", - "docstring": "Tests if x is NaN.\n\nThis function is meant to overcome the issue that np.isnan does not allow\nnon-numerical types as input, and that np.nan is not float('nan').\n\nParameters\n----------\nx : any type\n\nReturns\n-------\nboolean\n\nExamples\n--------\n>>> import numpy as np\n>>> from sklearn.utils import is_scalar_nan\n>>> is_scalar_nan(np.nan)\nTrue\n>>> is_scalar_nan(float(\"nan\"))\nTrue\n>>> is_scalar_nan(None)\nFalse\n>>> is_scalar_nan(\"\")\nFalse\n>>> is_scalar_nan([np.nan])\nFalse", + "description": "Tests if x is NaN.\n\nThis function is meant to overcome the issue that np.isnan does not allow\nnon-numerical types as input, and that np.nan is not float('nan').", + "docstring": "Tests if x is NaN.\n\n This function is meant to overcome the issue that np.isnan does not allow\n non-numerical types as input, and that np.nan is not float('nan').\n\n Parameters\n ----------\n x : any type\n\n Returns\n -------\n boolean\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.utils import is_scalar_nan\n >>> is_scalar_nan(np.nan)\n True\n >>> is_scalar_nan(float(\"nan\"))\n True\n >>> is_scalar_nan(None)\n False\n >>> is_scalar_nan(\"\")\n False\n >>> is_scalar_nan([np.nan])\n False\n ", "source_code": "\ndef is_scalar_nan(x):\n \"\"\"Tests if x is NaN.\n\n This function is meant to overcome the issue that np.isnan does not allow\n non-numerical types as input, and that np.nan is not float('nan').\n\n Parameters\n ----------\n x : any type\n\n Returns\n -------\n boolean\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.utils import is_scalar_nan\n >>> is_scalar_nan(np.nan)\n True\n >>> is_scalar_nan(float(\"nan\"))\n True\n >>> is_scalar_nan(None)\n False\n >>> is_scalar_nan(\"\")\n False\n >>> is_scalar_nan([np.nan])\n False\n \"\"\"\n return isinstance(x, numbers.Real) and math.isnan(x)" }, { @@ -174467,7 +188345,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "obj", @@ -174477,7 +188356,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "owner", @@ -174487,13 +188367,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __get__(self, obj, owner=None):\n attr_err = AttributeError(f'This {repr(owner.__name__)} has no attribute {repr(self.attribute_name)}')\n if obj is not None:\n if not self.check(obj):\n raise attr_err\n out = lambda *args, **kwargs: self.fn(obj, *args, **kwargs)\n else:\n \n def fn(*args, **kwargs):\n if not self.check(args[0]):\n raise attr_err\n return self.fn(*args, **kwargs)\n out = lambda *args, **kwargs: fn(*args, **kwargs)\n update_wrapper(out, self.fn)\n return out" }, { @@ -174511,7 +188392,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "fn", @@ -174521,7 +188403,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "check", @@ -174531,7 +188414,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "attribute_name", @@ -174541,13 +188425,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": 
{} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, fn, check, attribute_name):\n self.fn = fn\n self.check = check\n self.attribute_name = attribute_name\n update_wrapper(self, fn)" }, { @@ -174565,13 +188450,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\n@abstractmethod\ndef __init__(self):\n pass" }, { @@ -174589,7 +188475,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "attr", @@ -174599,7 +188486,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "deep", @@ -174609,13 +188497,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _get_params(self, attr, deep=True):\n out = super().get_params(deep=deep)\n if not deep:\n return out\n estimators = getattr(self, attr)\n out.update(estimators)\n for (name, estimator) in estimators:\n if hasattr(estimator, 'get_params'):\n for (key, value) in estimator.get_params(deep=True).items():\n out['%s__%s' % (name, key)] = value\n return out" }, { @@ -174633,7 +188522,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "attr", @@ -174643,7 +188533,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "name", @@ -174653,7 +188544,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "new_val", @@ -174663,13 +188555,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _replace_estimator(self, attr, name, new_val):\n new_estimators = list(getattr(self, attr))\n for (i, (estimator_name, _)) in enumerate(new_estimators):\n if estimator_name == name:\n new_estimators[i] = (name, new_val)\n break\n setattr(self, attr, new_estimators)" }, { @@ -174687,7 +188580,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "attr", @@ -174697,13 +188591,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _set_params(self, attr, **params):\n if attr in params:\n setattr(self, attr, params.pop(attr))\n items = getattr(self, attr)\n names = []\n if items:\n (names, _) = zip(*items)\n for name in list(params.keys()):\n if '__' not in name and name in names:\n self._replace_estimator(attr, name, params.pop(name))\n super().set_params(**params)\n return self" }, { @@ -174721,7 +188616,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "names", @@ -174731,13 +188627,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _validate_names(self, names):\n if len(set(names)) != len(names):\n raise ValueError('Names provided are not unique: {0!r}'.format(list(names)))\n invalid_names = set(names).intersection(self.get_params(deep=False))\n if invalid_names:\n raise 
ValueError('Estimator names conflict with constructor arguments: {0!r}'.format(sorted(invalid_names)))\n invalid_names = [name for name in names if '__' in name]\n if invalid_names:\n raise ValueError('Estimator names must not contain __: got {0!r}'.format(invalid_names))" }, { @@ -174755,7 +188652,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "fn", @@ -174765,7 +188663,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "delegate_names", @@ -174775,7 +188674,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "attribute_name", @@ -174785,13 +188685,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef __init__(self, fn, delegate_names, attribute_name):\n super().__init__(fn, self._check, attribute_name)\n self.delegate_names = delegate_names" }, { @@ -174809,7 +188710,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "obj", @@ -174819,13 +188721,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _check(self, obj):\n delegate = None\n for delegate_name in self.delegate_names:\n try:\n delegate = attrgetter(delegate_name)(obj)\n break\n except AttributeError:\n continue\n if delegate is None:\n return False\n getattr(delegate, self.attribute_name)\n return True" }, { @@ -174843,7 +188746,8 @@ "docstring": { "type": "object", "description": "Estimator to determine whether we should slice only rows or rows and\ncolumns." - } + }, + "refined_type": {} }, { "name": "X", @@ -174853,7 +188757,8 @@ "docstring": { "type": "array-like, sparse matrix or iterable", "description": "Data to be indexed. If ``estimator._pairwise is True``,\nthis needs to be a square array-like or sparse matrix." - } + }, + "refined_type": {} }, { "name": "y", @@ -174863,7 +188768,8 @@ "docstring": { "type": "array-like, sparse matrix or iterable", "description": "Targets to be indexed." - } + }, + "refined_type": {} }, { "name": "indices", @@ -174873,7 +188779,8 @@ "docstring": { "type": "array of int", "description": "Rows to select from X and y.\nIf ``estimator._pairwise is True`` and ``train_indices is None``\nthen ``indices`` will also be used to slice columns." - } + }, + "refined_type": {} }, { "name": "train_indices", @@ -174883,13 +188790,14 @@ "docstring": { "type": "array of int or None, default=None", "description": "If ``estimator._pairwise is True`` and ``train_indices is not None``,\nthen ``train_indices`` will be use to slice the columns of X." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Create subset of dataset and properly handle kernels.\n\nSlice X, y according to indices for cross-validation, but take care of precomputed kernel-matrices or pairwise affinities / distances. If ``estimator._pairwise is True``, X needs to be square and we slice rows and columns. If ``train_indices`` is not None, we slice rows using ``indices`` (assumed the test set) and columns using ``train_indices``, indicating the training set. .. deprecated:: 0.24 The _pairwise attribute is deprecated in 0.24. From 1.1 (renaming of 0.26) and onward, this function will check for the pairwise estimator tag. 
Labels y will always be indexed only along the first axis.", - "docstring": "Create subset of dataset and properly handle kernels.\n\nSlice X, y according to indices for cross-validation, but take care of\nprecomputed kernel-matrices or pairwise affinities / distances.\n\nIf ``estimator._pairwise is True``, X needs to be square and\nwe slice rows and columns. If ``train_indices`` is not None,\nwe slice rows using ``indices`` (assumed the test set) and columns\nusing ``train_indices``, indicating the training set.\n\n.. deprecated:: 0.24\n\n The _pairwise attribute is deprecated in 0.24. From 1.1\n (renaming of 0.26) and onward, this function will check for the\n pairwise estimator tag.\n\nLabels y will always be indexed only along the first axis.\n\nParameters\n----------\nestimator : object\n Estimator to determine whether we should slice only rows or rows and\n columns.\n\nX : array-like, sparse matrix or iterable\n Data to be indexed. If ``estimator._pairwise is True``,\n this needs to be a square array-like or sparse matrix.\n\ny : array-like, sparse matrix or iterable\n Targets to be indexed.\n\nindices : array of int\n Rows to select from X and y.\n If ``estimator._pairwise is True`` and ``train_indices is None``\n then ``indices`` will also be used to slice columns.\n\ntrain_indices : array of int or None, default=None\n If ``estimator._pairwise is True`` and ``train_indices is not None``,\n then ``train_indices`` will be use to slice the columns of X.\n\nReturns\n-------\nX_subset : array-like, sparse matrix or list\n Indexed data.\n\ny_subset : array-like, sparse matrix or list\n Indexed targets.", + "description": "Create subset of dataset and properly handle kernels.\n\nSlice X, y according to indices for cross-validation, but take care of\nprecomputed kernel-matrices or pairwise affinities / distances.\n\nIf ``estimator._pairwise is True``, X needs to be square and\nwe slice rows and columns. If ``train_indices`` is not None,\nwe slice rows using ``indices`` (assumed the test set) and columns\nusing ``train_indices``, indicating the training set.\n\n.. deprecated:: 0.24\n\n The _pairwise attribute is deprecated in 0.24. From 1.1\n (renaming of 0.26) and onward, this function will check for the\n pairwise estimator tag.\n\nLabels y will always be indexed only along the first axis.", + "docstring": "Create subset of dataset and properly handle kernels.\n\n Slice X, y according to indices for cross-validation, but take care of\n precomputed kernel-matrices or pairwise affinities / distances.\n\n If ``estimator._pairwise is True``, X needs to be square and\n we slice rows and columns. If ``train_indices`` is not None,\n we slice rows using ``indices`` (assumed the test set) and columns\n using ``train_indices``, indicating the training set.\n\n .. deprecated:: 0.24\n\n The _pairwise attribute is deprecated in 0.24. From 1.1\n (renaming of 0.26) and onward, this function will check for the\n pairwise estimator tag.\n\n Labels y will always be indexed only along the first axis.\n\n Parameters\n ----------\n estimator : object\n Estimator to determine whether we should slice only rows or rows and\n columns.\n\n X : array-like, sparse matrix or iterable\n Data to be indexed. 
If ``estimator._pairwise is True``,\n this needs to be a square array-like or sparse matrix.\n\n y : array-like, sparse matrix or iterable\n Targets to be indexed.\n\n indices : array of int\n Rows to select from X and y.\n If ``estimator._pairwise is True`` and ``train_indices is None``\n then ``indices`` will also be used to slice columns.\n\n train_indices : array of int or None, default=None\n If ``estimator._pairwise is True`` and ``train_indices is not None``,\n then ``train_indices`` will be use to slice the columns of X.\n\n Returns\n -------\n X_subset : array-like, sparse matrix or list\n Indexed data.\n\n y_subset : array-like, sparse matrix or list\n Indexed targets.\n\n ", "source_code": "\ndef _safe_split(estimator, X, y, indices, train_indices=None):\n \"\"\"Create subset of dataset and properly handle kernels.\n\n Slice X, y according to indices for cross-validation, but take care of\n precomputed kernel-matrices or pairwise affinities / distances.\n\n If ``estimator._pairwise is True``, X needs to be square and\n we slice rows and columns. If ``train_indices`` is not None,\n we slice rows using ``indices`` (assumed the test set) and columns\n using ``train_indices``, indicating the training set.\n\n .. deprecated:: 0.24\n\n The _pairwise attribute is deprecated in 0.24. From 1.1\n (renaming of 0.26) and onward, this function will check for the\n pairwise estimator tag.\n\n Labels y will always be indexed only along the first axis.\n\n Parameters\n ----------\n estimator : object\n Estimator to determine whether we should slice only rows or rows and\n columns.\n\n X : array-like, sparse matrix or iterable\n Data to be indexed. If ``estimator._pairwise is True``,\n this needs to be a square array-like or sparse matrix.\n\n y : array-like, sparse matrix or iterable\n Targets to be indexed.\n\n indices : array of int\n Rows to select from X and y.\n If ``estimator._pairwise is True`` and ``train_indices is None``\n then ``indices`` will also be used to slice columns.\n\n train_indices : array of int or None, default=None\n If ``estimator._pairwise is True`` and ``train_indices is not None``,\n then ``train_indices`` will be use to slice the columns of X.\n\n Returns\n -------\n X_subset : array-like, sparse matrix or list\n Indexed data.\n\n y_subset : array-like, sparse matrix or list\n Indexed targets.\n\n \"\"\"\n if _is_pairwise(estimator):\n if not hasattr(X, 'shape'):\n raise ValueError('Precomputed kernels or affinity matrices have to be passed as arrays or sparse matrices.')\n if X.shape[0] != X.shape[1]:\n raise ValueError('X should be a square kernel matrix')\n if train_indices is None:\n X_subset = X[np.ix_(indices, indices)]\n else:\n X_subset = X[np.ix_(indices, train_indices)]\n else:\n X_subset = _safe_indexing(X, indices)\n if y is not None:\n y_subset = _safe_indexing(y, indices)\n else:\n y_subset = None\n return X_subset, y_subset" }, { @@ -174907,13 +188815,14 @@ "docstring": { "type": "callable", "description": "When passed the object with the decorated method, this should return\na truthy value if the attribute is available, and either return False\nor raise an AttributeError if not available." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "An attribute that is available only if check returns a truthy value", - "docstring": "An attribute that is available only if check returns a truthy value\n\nParameters\n----------\ncheck : callable\n When passed the object with the decorated method, this should return\n a truthy value if the attribute is available, and either return False\n or raise an AttributeError if not available.\n\nExamples\n--------\n>>> from sklearn.utils.metaestimators import available_if\n>>> class HelloIfEven:\n... def __init__(self, x):\n... self.x = x\n...\n... def _x_is_even(self):\n... return self.x % 2 == 0\n...\n... @available_if(_x_is_even)\n... def say_hello(self):\n... print(\"Hello\")\n...\n>>> obj = HelloIfEven(1)\n>>> hasattr(obj, \"say_hello\")\nFalse\n>>> obj.x = 2\n>>> hasattr(obj, \"say_hello\")\nTrue\n>>> obj.say_hello()\nHello", + "docstring": "An attribute that is available only if check returns a truthy value\n\n Parameters\n ----------\n check : callable\n When passed the object with the decorated method, this should return\n a truthy value if the attribute is available, and either return False\n or raise an AttributeError if not available.\n\n Examples\n --------\n >>> from sklearn.utils.metaestimators import available_if\n >>> class HelloIfEven:\n ... def __init__(self, x):\n ... self.x = x\n ...\n ... def _x_is_even(self):\n ... return self.x % 2 == 0\n ...\n ... @available_if(_x_is_even)\n ... def say_hello(self):\n ... print(\"Hello\")\n ...\n >>> obj = HelloIfEven(1)\n >>> hasattr(obj, \"say_hello\")\n False\n >>> obj.x = 2\n >>> hasattr(obj, \"say_hello\")\n True\n >>> obj.say_hello()\n Hello\n ", "source_code": "\ndef available_if(check):\n \"\"\"An attribute that is available only if check returns a truthy value\n\n Parameters\n ----------\n check : callable\n When passed the object with the decorated method, this should return\n a truthy value if the attribute is available, and either return False\n or raise an AttributeError if not available.\n\n Examples\n --------\n >>> from sklearn.utils.metaestimators import available_if\n >>> class HelloIfEven:\n ... def __init__(self, x):\n ... self.x = x\n ...\n ... def _x_is_even(self):\n ... return self.x % 2 == 0\n ...\n ... @available_if(_x_is_even)\n ... def say_hello(self):\n ... print(\"Hello\")\n ...\n >>> obj = HelloIfEven(1)\n >>> hasattr(obj, \"say_hello\")\n False\n >>> obj.x = 2\n >>> hasattr(obj, \"say_hello\")\n True\n >>> obj.say_hello()\n Hello\n \"\"\"\n return lambda fn: _AvailableIfDescriptor(fn, check, attribute_name=fn.__name__)" }, { @@ -174931,13 +188840,14 @@ "docstring": { "type": "str, list of str or tuple of str", "description": "Name of the sub-estimator that can be accessed as an attribute of the\nbase object. If a list or a tuple of names are provided, the first\nsub-estimator that is an attribute of the base object will be used." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Create a decorator for methods that are delegated to a sub-estimator\n\nThis enables ducktyping by hasattr returning True according to the sub-estimator.", - "docstring": "Create a decorator for methods that are delegated to a sub-estimator\n\nThis enables ducktyping by hasattr returning True according to the\nsub-estimator.\n\nParameters\n----------\ndelegate : str, list of str or tuple of str\n Name of the sub-estimator that can be accessed as an attribute of the\n base object. 
If a list or a tuple of names are provided, the first\n sub-estimator that is an attribute of the base object will be used.", + "description": "Create a decorator for methods that are delegated to a sub-estimator\n\nThis enables ducktyping by hasattr returning True according to the\nsub-estimator.", + "docstring": "Create a decorator for methods that are delegated to a sub-estimator\n\n This enables ducktyping by hasattr returning True according to the\n sub-estimator.\n\n Parameters\n ----------\n delegate : str, list of str or tuple of str\n Name of the sub-estimator that can be accessed as an attribute of the\n base object. If a list or a tuple of names are provided, the first\n sub-estimator that is an attribute of the base object will be used.\n\n ", "source_code": "\ndef if_delegate_has_method(delegate):\n \"\"\"Create a decorator for methods that are delegated to a sub-estimator\n\n This enables ducktyping by hasattr returning True according to the\n sub-estimator.\n\n Parameters\n ----------\n delegate : str, list of str or tuple of str\n Name of the sub-estimator that can be accessed as an attribute of the\n base object. If a list or a tuple of names are provided, the first\n sub-estimator that is an attribute of the base object will be used.\n\n \"\"\"\n if isinstance(delegate, list):\n delegate = tuple(delegate)\n if not isinstance(delegate, tuple):\n delegate = (delegate, )\n return lambda fn: _IffHasAttrDescriptor(fn, delegate, attribute_name=fn.__name__)" }, { @@ -174955,7 +188865,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "classes", @@ -174965,13 +188876,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Private helper function for factorizing common classes param logic.\n\nEstimators that implement the ``partial_fit`` API need to be provided with the list of possible classes at the first call to partial_fit. Subsequent calls to partial_fit should check that ``classes`` is still consistent with a previous value of ``clf.classes_`` when provided. This function returns True if it detects that this was the first call to ``partial_fit`` on ``clf``. In that case the ``classes_`` attribute is also set on ``clf``.", - "docstring": "Private helper function for factorizing common classes param logic.\n\nEstimators that implement the ``partial_fit`` API need to be provided with\nthe list of possible classes at the first call to partial_fit.\n\nSubsequent calls to partial_fit should check that ``classes`` is still\nconsistent with a previous value of ``clf.classes_`` when provided.\n\nThis function returns True if it detects that this was the first call to\n``partial_fit`` on ``clf``. In that case the ``classes_`` attribute is also\nset on ``clf``.", + "description": "Private helper function for factorizing common classes param logic.\n\nEstimators that implement the ``partial_fit`` API need to be provided with\nthe list of possible classes at the first call to partial_fit.\n\nSubsequent calls to partial_fit should check that ``classes`` is still\nconsistent with a previous value of ``clf.classes_`` when provided.\n\nThis function returns True if it detects that this was the first call to\n``partial_fit`` on ``clf``. 
In that case the ``classes_`` attribute is also\nset on ``clf``.", + "docstring": "Private helper function for factorizing common classes param logic.\n\n Estimators that implement the ``partial_fit`` API need to be provided with\n the list of possible classes at the first call to partial_fit.\n\n Subsequent calls to partial_fit should check that ``classes`` is still\n consistent with a previous value of ``clf.classes_`` when provided.\n\n This function returns True if it detects that this was the first call to\n ``partial_fit`` on ``clf``. In that case the ``classes_`` attribute is also\n set on ``clf``.\n\n ", "source_code": "\ndef _check_partial_fit_first_call(clf, classes=None):\n \"\"\"Private helper function for factorizing common classes param logic.\n\n Estimators that implement the ``partial_fit`` API need to be provided with\n the list of possible classes at the first call to partial_fit.\n\n Subsequent calls to partial_fit should check that ``classes`` is still\n consistent with a previous value of ``clf.classes_`` when provided.\n\n This function returns True if it detects that this was the first call to\n ``partial_fit`` on ``clf``. In that case the ``classes_`` attribute is also\n set on ``clf``.\n\n \"\"\"\n if getattr(clf, 'classes_', None) is None and classes is None:\n raise ValueError('classes must be passed on the first call to partial_fit.')\n elif classes is not None:\n if getattr(clf, 'classes_', None) is not None:\n if not np.array_equal(clf.classes_, unique_labels(classes)):\n raise ValueError('`classes=%r` is not the same as on last call to partial_fit, was: %r' % (classes, clf.classes_))\n else:\n clf.classes_ = unique_labels(classes)\n return True\n return False" }, { @@ -174989,13 +188901,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _is_integral_float(y):\n return y.dtype.kind == 'f' and np.all(y.astype(int) == y)" }, { @@ -175013,7 +188926,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_classifiers)", "description": "Predicted classes for each binary classifier." - } + }, + "refined_type": {} }, { "name": "confidences", @@ -175023,7 +188937,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_classifiers)", "description": "Decision functions or predicted probabilities for positive class\nfor each binary classifier." - } + }, + "refined_type": {} }, { "name": "n_classes", @@ -175033,13 +188948,14 @@ "docstring": { "type": "int", "description": "Number of classes. n_classifiers must be\n``n_classes * (n_classes - 1 ) / 2``." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Compute a continuous, tie-breaking OvR decision function from OvO.\n\nIt is important to include a continuous value, not only votes, to make computing AUC or calibration meaningful.", - "docstring": "Compute a continuous, tie-breaking OvR decision function from OvO.\n\nIt is important to include a continuous value, not only votes,\nto make computing AUC or calibration meaningful.\n\nParameters\n----------\npredictions : array-like of shape (n_samples, n_classifiers)\n Predicted classes for each binary classifier.\n\nconfidences : array-like of shape (n_samples, n_classifiers)\n Decision functions or predicted probabilities for positive class\n for each binary classifier.\n\nn_classes : int\n Number of classes. 
n_classifiers must be\n ``n_classes * (n_classes - 1 ) / 2``.", + "description": "Compute a continuous, tie-breaking OvR decision function from OvO.\n\nIt is important to include a continuous value, not only votes,\nto make computing AUC or calibration meaningful.", + "docstring": "Compute a continuous, tie-breaking OvR decision function from OvO.\n\n It is important to include a continuous value, not only votes,\n to make computing AUC or calibration meaningful.\n\n Parameters\n ----------\n predictions : array-like of shape (n_samples, n_classifiers)\n Predicted classes for each binary classifier.\n\n confidences : array-like of shape (n_samples, n_classifiers)\n Decision functions or predicted probabilities for positive class\n for each binary classifier.\n\n n_classes : int\n Number of classes. n_classifiers must be\n ``n_classes * (n_classes - 1 ) / 2``.\n ", "source_code": "\ndef _ovr_decision_function(predictions, confidences, n_classes):\n \"\"\"Compute a continuous, tie-breaking OvR decision function from OvO.\n\n It is important to include a continuous value, not only votes,\n to make computing AUC or calibration meaningful.\n\n Parameters\n ----------\n predictions : array-like of shape (n_samples, n_classifiers)\n Predicted classes for each binary classifier.\n\n confidences : array-like of shape (n_samples, n_classifiers)\n Decision functions or predicted probabilities for positive class\n for each binary classifier.\n\n n_classes : int\n Number of classes. n_classifiers must be\n ``n_classes * (n_classes - 1 ) / 2``.\n \"\"\"\n n_samples = predictions.shape[0]\n votes = np.zeros((n_samples, n_classes))\n sum_of_confidences = np.zeros((n_samples, n_classes))\n k = 0\n for i in range(n_classes):\n for j in range(i + 1, n_classes):\n sum_of_confidences[:, i] -= confidences[:, k]\n sum_of_confidences[:, j] += confidences[:, k]\n votes[predictions[:, k] == 0, i] += 1\n votes[predictions[:, k] == 1, j] += 1\n k += 1\n transformed_confidences = sum_of_confidences / (3 * (np.abs(sum_of_confidences) + 1))\n return votes + transformed_confidences" }, { @@ -175057,13 +188973,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _unique_indicator(y):\n return np.arange(check_array(y, accept_sparse=['csr', 'csc', 'coo']).shape[1])" }, { @@ -175081,13 +188998,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _unique_multiclass(y):\n if hasattr(y, '__array__'):\n return np.unique(np.asarray(y))\n else:\n return set(y)" }, { @@ -175105,13 +189023,14 @@ "docstring": { "type": "array-like", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Ensure that target y is of a non-regression type.\n\nOnly the following target types (as defined in type_of_target) are allowed: 'binary', 'multiclass', 'multiclass-multioutput', 'multilabel-indicator', 'multilabel-sequences'", - "docstring": "Ensure that target y is of a non-regression type.\n\nOnly the following target types (as defined in type_of_target) are allowed:\n 'binary', 'multiclass', 'multiclass-multioutput',\n 'multilabel-indicator', 'multilabel-sequences'\n\nParameters\n----------\ny : array-like", + "description": "Ensure that target y is of a non-regression type.\n\nOnly the following target 
types (as defined in type_of_target) are allowed:\n 'binary', 'multiclass', 'multiclass-multioutput',\n 'multilabel-indicator', 'multilabel-sequences'", + "docstring": "Ensure that target y is of a non-regression type.\n\n Only the following target types (as defined in type_of_target) are allowed:\n 'binary', 'multiclass', 'multiclass-multioutput',\n 'multilabel-indicator', 'multilabel-sequences'\n\n Parameters\n ----------\n y : array-like\n ", "source_code": "\ndef check_classification_targets(y):\n \"\"\"Ensure that target y is of a non-regression type.\n\n Only the following target types (as defined in type_of_target) are allowed:\n 'binary', 'multiclass', 'multiclass-multioutput',\n 'multilabel-indicator', 'multilabel-sequences'\n\n Parameters\n ----------\n y : array-like\n \"\"\"\n y_type = type_of_target(y)\n if y_type not in ['binary', 'multiclass', 'multiclass-multioutput', 'multilabel-indicator', 'multilabel-sequences']:\n raise ValueError('Unknown label type: %r' % y_type)" }, { @@ -175129,6 +189048,10 @@ "docstring": { "type": "{array-like, sparse matrix} of size (n_samples, n_outputs)", "description": "The labels for each example." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -175139,13 +189062,14 @@ "docstring": { "type": "array-like of shape (n_samples,), default=None", "description": "Sample weights." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Compute class priors from multioutput-multiclass target data.", - "docstring": "Compute class priors from multioutput-multiclass target data.\n\nParameters\n----------\ny : {array-like, sparse matrix} of size (n_samples, n_outputs)\n The labels for each example.\n\nsample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\nReturns\n-------\nclasses : list of size n_outputs of ndarray of size (n_classes,)\n List of classes for each column.\n\nn_classes : list of int of size n_outputs\n Number of classes in each column.\n\nclass_prior : list of size n_outputs of ndarray of size (n_classes,)\n Class distribution of each column.", + "docstring": "Compute class priors from multioutput-multiclass target data.\n\n Parameters\n ----------\n y : {array-like, sparse matrix} of size (n_samples, n_outputs)\n The labels for each example.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n Returns\n -------\n classes : list of size n_outputs of ndarray of size (n_classes,)\n List of classes for each column.\n\n n_classes : list of int of size n_outputs\n Number of classes in each column.\n\n class_prior : list of size n_outputs of ndarray of size (n_classes,)\n Class distribution of each column.\n\n ", "source_code": "\ndef class_distribution(y, sample_weight=None):\n \"\"\"Compute class priors from multioutput-multiclass target data.\n\n Parameters\n ----------\n y : {array-like, sparse matrix} of size (n_samples, n_outputs)\n The labels for each example.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Sample weights.\n\n Returns\n -------\n classes : list of size n_outputs of ndarray of size (n_classes,)\n List of classes for each column.\n\n n_classes : list of int of size n_outputs\n Number of classes in each column.\n\n class_prior : list of size n_outputs of ndarray of size (n_classes,)\n Class distribution of each column.\n\n \"\"\"\n classes = []\n n_classes = []\n class_prior = []\n (n_samples, n_outputs) = y.shape\n if sample_weight is not None:\n sample_weight = 
np.asarray(sample_weight)\n if issparse(y):\n y = y.tocsc()\n y_nnz = np.diff(y.indptr)\n for k in range(n_outputs):\n col_nonzero = y.indices[y.indptr[k]:y.indptr[k + 1]]\n if sample_weight is not None:\n nz_samp_weight = sample_weight[col_nonzero]\n zeros_samp_weight_sum = np.sum(sample_weight) - np.sum(nz_samp_weight)\n else:\n nz_samp_weight = None\n zeros_samp_weight_sum = y.shape[0] - y_nnz[k]\n (classes_k, y_k) = np.unique(y.data[y.indptr[k]:y.indptr[k + 1]], return_inverse=True)\n class_prior_k = np.bincount(y_k, weights=nz_samp_weight)\n if 0 in classes_k:\n class_prior_k[classes_k == 0] += zeros_samp_weight_sum\n if 0 not in classes_k and y_nnz[k] < y.shape[0]:\n classes_k = np.insert(classes_k, 0, 0)\n class_prior_k = np.insert(class_prior_k, 0, zeros_samp_weight_sum)\n classes.append(classes_k)\n n_classes.append(classes_k.shape[0])\n class_prior.append(class_prior_k / class_prior_k.sum())\n else:\n for k in range(n_outputs):\n (classes_k, y_k) = np.unique(y[:, k], return_inverse=True)\n classes.append(classes_k)\n n_classes.append(classes_k.shape[0])\n class_prior_k = np.bincount(y_k, weights=sample_weight)\n class_prior.append(class_prior_k / class_prior_k.sum())\n return classes, n_classes, class_prior" }, { @@ -175163,13 +189087,14 @@ "docstring": { "type": "ndarray of shape (n_samples,)", "description": "Target values." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Check if ``y`` is in a multilabel format.", - "docstring": "Check if ``y`` is in a multilabel format.\n\nParameters\n----------\ny : ndarray of shape (n_samples,)\n Target values.\n\nReturns\n-------\nout : bool\n Return ``True``, if ``y`` is in a multilabel format, else ```False``.\n\nExamples\n--------\n>>> import numpy as np\n>>> from sklearn.utils.multiclass import is_multilabel\n>>> is_multilabel([0, 1, 0, 1])\nFalse\n>>> is_multilabel([[1], [0, 2], []])\nFalse\n>>> is_multilabel(np.array([[1, 0], [0, 0]]))\nTrue\n>>> is_multilabel(np.array([[1], [0], [0]]))\nFalse\n>>> is_multilabel(np.array([[1, 0, 0]]))\nTrue", + "docstring": "Check if ``y`` is in a multilabel format.\n\n Parameters\n ----------\n y : ndarray of shape (n_samples,)\n Target values.\n\n Returns\n -------\n out : bool\n Return ``True``, if ``y`` is in a multilabel format, else ```False``.\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.utils.multiclass import is_multilabel\n >>> is_multilabel([0, 1, 0, 1])\n False\n >>> is_multilabel([[1], [0, 2], []])\n False\n >>> is_multilabel(np.array([[1, 0], [0, 0]]))\n True\n >>> is_multilabel(np.array([[1], [0], [0]]))\n False\n >>> is_multilabel(np.array([[1, 0, 0]]))\n True\n ", "source_code": "\ndef is_multilabel(y):\n \"\"\"Check if ``y`` is in a multilabel format.\n\n Parameters\n ----------\n y : ndarray of shape (n_samples,)\n Target values.\n\n Returns\n -------\n out : bool\n Return ``True``, if ``y`` is in a multilabel format, else ```False``.\n\n Examples\n --------\n >>> import numpy as np\n >>> from sklearn.utils.multiclass import is_multilabel\n >>> is_multilabel([0, 1, 0, 1])\n False\n >>> is_multilabel([[1], [0, 2], []])\n False\n >>> is_multilabel(np.array([[1, 0], [0, 0]]))\n True\n >>> is_multilabel(np.array([[1], [0], [0]]))\n False\n >>> is_multilabel(np.array([[1, 0, 0]]))\n True\n \"\"\"\n if hasattr(y, '__array__') or isinstance(y, Sequence):\n with warnings.catch_warnings():\n warnings.simplefilter('error', np.VisibleDeprecationWarning)\n try:\n y = np.asarray(y)\n except np.VisibleDeprecationWarning:\n y = 
np.array(y, dtype=object)\n if not (hasattr(y, 'shape') and y.ndim == 2 and y.shape[1] > 1):\n return False\n if issparse(y):\n if isinstance(y, (dok_matrix, lil_matrix)):\n y = y.tocsr()\n return len(y.data) == 0 or np.unique(y.data).size == 1 and (y.dtype.kind in 'biu' or _is_integral_float(np.unique(y.data)))\n else:\n labels = np.unique(y)\n return len(labels) < 3 and (y.dtype.kind in 'biu' or _is_integral_float(labels))" }, { @@ -175187,14 +189112,15 @@ "docstring": { "type": "array-like", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Determine the type of data indicated by the target.\n\nNote that this type is the most specific type that can be inferred. For example: * ``binary`` is more specific but compatible with ``multiclass``. * ``multiclass`` of integers is more specific but compatible with ``continuous``. * ``multilabel-indicator`` is more specific but compatible with ``multiclass-multioutput``.", - "docstring": "Determine the type of data indicated by the target.\n\nNote that this type is the most specific type that can be inferred.\nFor example:\n\n * ``binary`` is more specific but compatible with ``multiclass``.\n * ``multiclass`` of integers is more specific but compatible with\n ``continuous``.\n * ``multilabel-indicator`` is more specific but compatible with\n ``multiclass-multioutput``.\n\nParameters\n----------\ny : array-like\n\nReturns\n-------\ntarget_type : str\n One of:\n\n * 'continuous': `y` is an array-like of floats that are not all\n integers, and is 1d or a column vector.\n * 'continuous-multioutput': `y` is a 2d array of floats that are\n not all integers, and both dimensions are of size > 1.\n * 'binary': `y` contains <= 2 discrete values and is 1d or a column\n vector.\n * 'multiclass': `y` contains more than two discrete values, is not a\n sequence of sequences, and is 1d or a column vector.\n * 'multiclass-multioutput': `y` is a 2d array that contains more\n than two discrete values, is not a sequence of sequences, and both\n dimensions are of size > 1.\n * 'multilabel-indicator': `y` is a label indicator matrix, an array\n of two dimensions with at least two columns, and at most 2 unique\n values.\n * 'unknown': `y` is array-like but none of the above, such as a 3d\n array, sequence of sequences, or an array of non-sequence objects.\n\nExamples\n--------\n>>> from sklearn.utils.multiclass import type_of_target\n>>> import numpy as np\n>>> type_of_target([0.1, 0.6])\n'continuous'\n>>> type_of_target([1, -1, -1, 1])\n'binary'\n>>> type_of_target(['a', 'b', 'a'])\n'binary'\n>>> type_of_target([1.0, 2.0])\n'binary'\n>>> type_of_target([1, 0, 2])\n'multiclass'\n>>> type_of_target([1.0, 0.0, 3.0])\n'multiclass'\n>>> type_of_target(['a', 'b', 'c'])\n'multiclass'\n>>> type_of_target(np.array([[1, 2], [3, 1]]))\n'multiclass-multioutput'\n>>> type_of_target([[1, 2]])\n'multilabel-indicator'\n>>> type_of_target(np.array([[1.5, 2.0], [3.0, 1.6]]))\n'continuous-multioutput'\n>>> type_of_target(np.array([[0, 1], [1, 1]]))\n'multilabel-indicator'", - "source_code": "\ndef type_of_target(y):\n \"\"\"Determine the type of data indicated by the target.\n\n Note that this type is the most specific type that can be inferred.\n For example:\n\n * ``binary`` is more specific but compatible with ``multiclass``.\n * ``multiclass`` of integers is more specific but compatible with\n ``continuous``.\n * ``multilabel-indicator`` is more specific but compatible with\n ``multiclass-multioutput``.\n\n Parameters\n 
----------\n y : array-like\n\n Returns\n -------\n target_type : str\n One of:\n\n * 'continuous': `y` is an array-like of floats that are not all\n integers, and is 1d or a column vector.\n * 'continuous-multioutput': `y` is a 2d array of floats that are\n not all integers, and both dimensions are of size > 1.\n * 'binary': `y` contains <= 2 discrete values and is 1d or a column\n vector.\n * 'multiclass': `y` contains more than two discrete values, is not a\n sequence of sequences, and is 1d or a column vector.\n * 'multiclass-multioutput': `y` is a 2d array that contains more\n than two discrete values, is not a sequence of sequences, and both\n dimensions are of size > 1.\n * 'multilabel-indicator': `y` is a label indicator matrix, an array\n of two dimensions with at least two columns, and at most 2 unique\n values.\n * 'unknown': `y` is array-like but none of the above, such as a 3d\n array, sequence of sequences, or an array of non-sequence objects.\n\n Examples\n --------\n >>> from sklearn.utils.multiclass import type_of_target\n >>> import numpy as np\n >>> type_of_target([0.1, 0.6])\n 'continuous'\n >>> type_of_target([1, -1, -1, 1])\n 'binary'\n >>> type_of_target(['a', 'b', 'a'])\n 'binary'\n >>> type_of_target([1.0, 2.0])\n 'binary'\n >>> type_of_target([1, 0, 2])\n 'multiclass'\n >>> type_of_target([1.0, 0.0, 3.0])\n 'multiclass'\n >>> type_of_target(['a', 'b', 'c'])\n 'multiclass'\n >>> type_of_target(np.array([[1, 2], [3, 1]]))\n 'multiclass-multioutput'\n >>> type_of_target([[1, 2]])\n 'multilabel-indicator'\n >>> type_of_target(np.array([[1.5, 2.0], [3.0, 1.6]]))\n 'continuous-multioutput'\n >>> type_of_target(np.array([[0, 1], [1, 1]]))\n 'multilabel-indicator'\n \"\"\"\n valid = (isinstance(y, (Sequence, spmatrix)) or hasattr(y, '__array__')) and not isinstance(y, str)\n if not valid:\n raise ValueError('Expected array-like (array or non-string sequence), got %r' % y)\n sparse_pandas = y.__class__.__name__ in ['SparseSeries', 'SparseArray']\n if sparse_pandas:\n raise ValueError(\"y cannot be class 'SparseSeries' or 'SparseArray'\")\n if is_multilabel(y):\n return 'multilabel-indicator'\n with warnings.catch_warnings():\n warnings.simplefilter('error', np.VisibleDeprecationWarning)\n try:\n y = np.asarray(y)\n except np.VisibleDeprecationWarning:\n y = np.asarray(y, dtype=object)\n try:\n if not hasattr(y[0], '__array__') and isinstance(y[0], Sequence) and not isinstance(y[0], str):\n raise ValueError('You appear to be using a legacy multi-label data representation. 
Sequence of sequences are no longer supported; use a binary array or sparse matrix instead - the MultiLabelBinarizer transformer can convert to this format.')\n except IndexError:\n pass\n if y.ndim > 2 or y.dtype == object and len(y) and not isinstance(y.flat[0], str):\n return 'unknown'\n if y.ndim == 2 and y.shape[1] == 0:\n return 'unknown'\n if y.ndim == 2 and y.shape[1] > 1:\n suffix = '-multioutput'\n else:\n suffix = ''\n if y.dtype.kind == 'f' and np.any(y != y.astype(int)):\n _assert_all_finite(y)\n return 'continuous' + suffix\n if len(np.unique(y)) > 2 or y.ndim >= 2 and len(y[0]) > 1:\n return 'multiclass' + suffix\n else:\n return 'binary'" + "description": "Determine the type of data indicated by the target.\n\nNote that this type is the most specific type that can be inferred.\nFor example:\n\n * ``binary`` is more specific but compatible with ``multiclass``.\n * ``multiclass`` of integers is more specific but compatible with\n ``continuous``.\n * ``multilabel-indicator`` is more specific but compatible with\n ``multiclass-multioutput``.", + "docstring": "Determine the type of data indicated by the target.\n\n Note that this type is the most specific type that can be inferred.\n For example:\n\n * ``binary`` is more specific but compatible with ``multiclass``.\n * ``multiclass`` of integers is more specific but compatible with\n ``continuous``.\n * ``multilabel-indicator`` is more specific but compatible with\n ``multiclass-multioutput``.\n\n Parameters\n ----------\n y : array-like\n\n Returns\n -------\n target_type : str\n One of:\n\n * 'continuous': `y` is an array-like of floats that are not all\n integers, and is 1d or a column vector.\n * 'continuous-multioutput': `y` is a 2d array of floats that are\n not all integers, and both dimensions are of size > 1.\n * 'binary': `y` contains <= 2 discrete values and is 1d or a column\n vector.\n * 'multiclass': `y` contains more than two discrete values, is not a\n sequence of sequences, and is 1d or a column vector.\n * 'multiclass-multioutput': `y` is a 2d array that contains more\n than two discrete values, is not a sequence of sequences, and both\n dimensions are of size > 1.\n * 'multilabel-indicator': `y` is a label indicator matrix, an array\n of two dimensions with at least two columns, and at most 2 unique\n values.\n * 'unknown': `y` is array-like but none of the above, such as a 3d\n array, sequence of sequences, or an array of non-sequence objects.\n\n Examples\n --------\n >>> from sklearn.utils.multiclass import type_of_target\n >>> import numpy as np\n >>> type_of_target([0.1, 0.6])\n 'continuous'\n >>> type_of_target([1, -1, -1, 1])\n 'binary'\n >>> type_of_target(['a', 'b', 'a'])\n 'binary'\n >>> type_of_target([1.0, 2.0])\n 'binary'\n >>> type_of_target([1, 0, 2])\n 'multiclass'\n >>> type_of_target([1.0, 0.0, 3.0])\n 'multiclass'\n >>> type_of_target(['a', 'b', 'c'])\n 'multiclass'\n >>> type_of_target(np.array([[1, 2], [3, 1]]))\n 'multiclass-multioutput'\n >>> type_of_target([[1, 2]])\n 'multilabel-indicator'\n >>> type_of_target(np.array([[1.5, 2.0], [3.0, 1.6]]))\n 'continuous-multioutput'\n >>> type_of_target(np.array([[0, 1], [1, 1]]))\n 'multilabel-indicator'\n ", + "source_code": "\ndef type_of_target(y):\n \"\"\"Determine the type of data indicated by the target.\n\n Note that this type is the most specific type that can be inferred.\n For example:\n\n * ``binary`` is more specific but compatible with ``multiclass``.\n * ``multiclass`` of integers is more specific but compatible with\n 
``continuous``.\n * ``multilabel-indicator`` is more specific but compatible with\n ``multiclass-multioutput``.\n\n Parameters\n ----------\n y : array-like\n\n Returns\n -------\n target_type : str\n One of:\n\n * 'continuous': `y` is an array-like of floats that are not all\n integers, and is 1d or a column vector.\n * 'continuous-multioutput': `y` is a 2d array of floats that are\n not all integers, and both dimensions are of size > 1.\n * 'binary': `y` contains <= 2 discrete values and is 1d or a column\n vector.\n * 'multiclass': `y` contains more than two discrete values, is not a\n sequence of sequences, and is 1d or a column vector.\n * 'multiclass-multioutput': `y` is a 2d array that contains more\n than two discrete values, is not a sequence of sequences, and both\n dimensions are of size > 1.\n * 'multilabel-indicator': `y` is a label indicator matrix, an array\n of two dimensions with at least two columns, and at most 2 unique\n values.\n * 'unknown': `y` is array-like but none of the above, such as a 3d\n array, sequence of sequences, or an array of non-sequence objects.\n\n Examples\n --------\n >>> from sklearn.utils.multiclass import type_of_target\n >>> import numpy as np\n >>> type_of_target([0.1, 0.6])\n 'continuous'\n >>> type_of_target([1, -1, -1, 1])\n 'binary'\n >>> type_of_target(['a', 'b', 'a'])\n 'binary'\n >>> type_of_target([1.0, 2.0])\n 'binary'\n >>> type_of_target([1, 0, 2])\n 'multiclass'\n >>> type_of_target([1.0, 0.0, 3.0])\n 'multiclass'\n >>> type_of_target(['a', 'b', 'c'])\n 'multiclass'\n >>> type_of_target(np.array([[1, 2], [3, 1]]))\n 'multiclass-multioutput'\n >>> type_of_target([[1, 2]])\n 'multilabel-indicator'\n >>> type_of_target(np.array([[1.5, 2.0], [3.0, 1.6]]))\n 'continuous-multioutput'\n >>> type_of_target(np.array([[0, 1], [1, 1]]))\n 'multilabel-indicator'\n \"\"\"\n valid = (isinstance(y, Sequence) or issparse(y) or hasattr(y, '__array__')) and not isinstance(y, str)\n if not valid:\n raise ValueError('Expected array-like (array or non-string sequence), got %r' % y)\n sparse_pandas = y.__class__.__name__ in ['SparseSeries', 'SparseArray']\n if sparse_pandas:\n raise ValueError(\"y cannot be class 'SparseSeries' or 'SparseArray'\")\n if is_multilabel(y):\n return 'multilabel-indicator'\n with warnings.catch_warnings():\n warnings.simplefilter('error', np.VisibleDeprecationWarning)\n try:\n y = np.asarray(y)\n except np.VisibleDeprecationWarning:\n y = np.asarray(y, dtype=object)\n try:\n if not hasattr(y[0], '__array__') and isinstance(y[0], Sequence) and not isinstance(y[0], str):\n raise ValueError('You appear to be using a legacy multi-label data representation. 
Sequence of sequences are no longer supported; use a binary array or sparse matrix instead - the MultiLabelBinarizer transformer can convert to this format.')\n except IndexError:\n pass\n if y.ndim > 2 or y.dtype == object and len(y) and not isinstance(y.flat[0], str):\n return 'unknown'\n if y.ndim == 2 and y.shape[1] == 0:\n return 'unknown'\n if y.ndim == 2 and y.shape[1] > 1:\n suffix = '-multioutput'\n else:\n suffix = ''\n if y.dtype.kind == 'f' and np.any(y != y.astype(int)):\n _assert_all_finite(y)\n return 'continuous' + suffix\n if len(np.unique(y)) > 2 or y.ndim >= 2 and len(y[0]) > 1:\n return 'multiclass' + suffix\n else:\n return 'binary'" }, { "name": "unique_labels", @@ -175205,8 +189131,8 @@ "parameters": [], "results": [], "is_public": true, - "description": "Extract an ordered array of unique labels.\n\nWe don't allow: - mix of multilabel and multiclass (single label) targets - mix of label indicator matrix and anything else, because there are no explicit labels) - mix of label indicator matrices of different sizes - mix of string and integer labels At the moment, we also don't allow \"multiclass-multioutput\" input type.", - "docstring": "Extract an ordered array of unique labels.\n\nWe don't allow:\n - mix of multilabel and multiclass (single label) targets\n - mix of label indicator matrix and anything else,\n because there are no explicit labels)\n - mix of label indicator matrices of different sizes\n - mix of string and integer labels\n\nAt the moment, we also don't allow \"multiclass-multioutput\" input type.\n\nParameters\n----------\n*ys : array-likes\n\nReturns\n-------\nout : ndarray of shape (n_unique_labels,)\n An ordered array of unique labels.\n\nExamples\n--------\n>>> from sklearn.utils.multiclass import unique_labels\n>>> unique_labels([3, 5, 5, 5, 7, 7])\narray([3, 5, 7])\n>>> unique_labels([1, 2, 3, 4], [2, 2, 3, 4])\narray([1, 2, 3, 4])\n>>> unique_labels([1, 2, 10], [5, 11])\narray([ 1, 2, 5, 10, 11])", + "description": "Extract an ordered array of unique labels.\n\nWe don't allow:\n - mix of multilabel and multiclass (single label) targets\n - mix of label indicator matrix and anything else,\n because there are no explicit labels)\n - mix of label indicator matrices of different sizes\n - mix of string and integer labels\n\nAt the moment, we also don't allow \"multiclass-multioutput\" input type.", + "docstring": "Extract an ordered array of unique labels.\n\n We don't allow:\n - mix of multilabel and multiclass (single label) targets\n - mix of label indicator matrix and anything else,\n because there are no explicit labels)\n - mix of label indicator matrices of different sizes\n - mix of string and integer labels\n\n At the moment, we also don't allow \"multiclass-multioutput\" input type.\n\n Parameters\n ----------\n *ys : array-likes\n\n Returns\n -------\n out : ndarray of shape (n_unique_labels,)\n An ordered array of unique labels.\n\n Examples\n --------\n >>> from sklearn.utils.multiclass import unique_labels\n >>> unique_labels([3, 5, 5, 5, 7, 7])\n array([3, 5, 7])\n >>> unique_labels([1, 2, 3, 4], [2, 2, 3, 4])\n array([1, 2, 3, 4])\n >>> unique_labels([1, 2, 10], [5, 11])\n array([ 1, 2, 5, 10, 11])\n ", "source_code": "\ndef unique_labels(*ys):\n \"\"\"Extract an ordered array of unique labels.\n\n We don't allow:\n - mix of multilabel and multiclass (single label) targets\n - mix of label indicator matrix and anything else,\n because there are no explicit labels)\n - mix of label indicator matrices of different sizes\n - mix of 
string and integer labels\n\n At the moment, we also don't allow \"multiclass-multioutput\" input type.\n\n Parameters\n ----------\n *ys : array-likes\n\n Returns\n -------\n out : ndarray of shape (n_unique_labels,)\n An ordered array of unique labels.\n\n Examples\n --------\n >>> from sklearn.utils.multiclass import unique_labels\n >>> unique_labels([3, 5, 5, 5, 7, 7])\n array([3, 5, 7])\n >>> unique_labels([1, 2, 3, 4], [2, 2, 3, 4])\n array([1, 2, 3, 4])\n >>> unique_labels([1, 2, 10], [5, 11])\n array([ 1, 2, 5, 10, 11])\n \"\"\"\n if not ys:\n raise ValueError('No argument has been passed.')\n ys_types = set((type_of_target(x) for x in ys))\n if ys_types == {'binary', 'multiclass'}:\n ys_types = {'multiclass'}\n if len(ys_types) > 1:\n raise ValueError('Mix type of y not allowed, got types %s' % ys_types)\n label_type = ys_types.pop()\n if label_type == 'multilabel-indicator' and len(set((check_array(y, accept_sparse=['csr', 'csc', 'coo']).shape[1] for y in ys))) > 1:\n raise ValueError('Multi-label binary indicator input with different numbers of labels')\n _unique_labels = _FN_UNIQUE_LABELS.get(label_type, None)\n if not _unique_labels:\n raise ValueError('Unknown label type: %s' % repr(ys))\n ys_labels = set(chain.from_iterable((_unique_labels(y) for y in ys)))\n if len(set((isinstance(label, str) for label in ys_labels))) > 1:\n raise ValueError('Mix of label input types (string and number)')\n return np.array(sorted(ys_labels))" }, { @@ -175224,7 +189150,8 @@ "docstring": { "type": "callable", "description": "Function that takes the gradient as a parameter and returns the\nmatrix product of the Hessian and gradient." - } + }, + "refined_type": {} }, { "name": "fgrad", @@ -175234,7 +189161,8 @@ "docstring": { "type": "ndarray of shape (n_features,) or (n_features + 1,)", "description": "Gradient vector." - } + }, + "refined_type": {} }, { "name": "maxiter", @@ -175244,7 +189172,8 @@ "docstring": { "type": "int", "description": "Number of CG iterations." - } + }, + "refined_type": {} }, { "name": "tol", @@ -175254,13 +189183,14 @@ "docstring": { "type": "float", "description": "Stopping criterion." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Solve iteratively the linear system 'fhess_p . xsupi = fgrad' with a conjugate gradient descent.", - "docstring": "Solve iteratively the linear system 'fhess_p . xsupi = fgrad'\nwith a conjugate gradient descent.\n\nParameters\n----------\nfhess_p : callable\n Function that takes the gradient as a parameter and returns the\n matrix product of the Hessian and gradient.\n\nfgrad : ndarray of shape (n_features,) or (n_features + 1,)\n Gradient vector.\n\nmaxiter : int\n Number of CG iterations.\n\ntol : float\n Stopping criterion.\n\nReturns\n-------\nxsupi : ndarray of shape (n_features,) or (n_features + 1,)\n Estimated solution.", + "description": "Solve iteratively the linear system 'fhess_p . xsupi = fgrad'\nwith a conjugate gradient descent.", + "docstring": "\n Solve iteratively the linear system 'fhess_p . 
xsupi = fgrad'\n with a conjugate gradient descent.\n\n Parameters\n ----------\n fhess_p : callable\n Function that takes the gradient as a parameter and returns the\n matrix product of the Hessian and gradient.\n\n fgrad : ndarray of shape (n_features,) or (n_features + 1,)\n Gradient vector.\n\n maxiter : int\n Number of CG iterations.\n\n tol : float\n Stopping criterion.\n\n Returns\n -------\n xsupi : ndarray of shape (n_features,) or (n_features + 1,)\n Estimated solution.\n ", "source_code": "\ndef _cg(fhess_p, fgrad, maxiter, tol):\n \"\"\"\n Solve iteratively the linear system 'fhess_p . xsupi = fgrad'\n with a conjugate gradient descent.\n\n Parameters\n ----------\n fhess_p : callable\n Function that takes the gradient as a parameter and returns the\n matrix product of the Hessian and gradient.\n\n fgrad : ndarray of shape (n_features,) or (n_features + 1,)\n Gradient vector.\n\n maxiter : int\n Number of CG iterations.\n\n tol : float\n Stopping criterion.\n\n Returns\n -------\n xsupi : ndarray of shape (n_features,) or (n_features + 1,)\n Estimated solution.\n \"\"\"\n xsupi = np.zeros(len(fgrad), dtype=fgrad.dtype)\n ri = fgrad\n psupi = -ri\n i = 0\n dri0 = np.dot(ri, ri)\n while i <= maxiter:\n if np.sum(np.abs(ri)) <= tol:\n break\n Ap = fhess_p(psupi)\n curv = np.dot(psupi, Ap)\n if 0 <= curv <= 3 * np.finfo(np.float64).eps:\n break\n elif curv < 0:\n if i > 0:\n break\n else:\n xsupi += dri0 / curv * psupi\n break\n alphai = dri0 / curv\n xsupi += alphai * psupi\n ri = ri + alphai * Ap\n dri1 = np.dot(ri, ri)\n betai = dri1 / dri0\n psupi = -ri + betai * psupi\n i = i + 1\n dri0 = dri1\n return xsupi" }, { @@ -175278,7 +189208,8 @@ "docstring": { "type": "str", "description": "Solver name. Currently only `lbfgs` is supported." - } + }, + "refined_type": {} }, { "name": "result", @@ -175288,7 +189219,8 @@ "docstring": { "type": "OptimizeResult", "description": "Result of the scipy.optimize.minimize function." - } + }, + "refined_type": {} }, { "name": "max_iter", @@ -175298,7 +189230,8 @@ "docstring": { "type": "int, default=None", "description": "Expected maximum number of iterations." - } + }, + "refined_type": {} }, { "name": "extra_warning_msg", @@ -175308,13 +189241,14 @@ "docstring": { "type": "str, default=None", "description": "Extra warning message." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Check the OptimizeResult for successful convergence", - "docstring": "Check the OptimizeResult for successful convergence\n\nParameters\n----------\nsolver : str\n Solver name. Currently only `lbfgs` is supported.\n\nresult : OptimizeResult\n Result of the scipy.optimize.minimize function.\n\nmax_iter : int, default=None\n Expected maximum number of iterations.\n\nextra_warning_msg : str, default=None\n Extra warning message.\n\nReturns\n-------\nn_iter : int\n Number of iterations.", + "docstring": "Check the OptimizeResult for successful convergence\n\n Parameters\n ----------\n solver : str\n Solver name. 
Currently only `lbfgs` is supported.\n\n result : OptimizeResult\n Result of the scipy.optimize.minimize function.\n\n max_iter : int, default=None\n Expected maximum number of iterations.\n\n extra_warning_msg : str, default=None\n Extra warning message.\n\n Returns\n -------\n n_iter : int\n Number of iterations.\n ", "source_code": "\ndef _check_optimize_result(solver, result, max_iter=None, extra_warning_msg=None):\n \"\"\"Check the OptimizeResult for successful convergence\n\n Parameters\n ----------\n solver : str\n Solver name. Currently only `lbfgs` is supported.\n\n result : OptimizeResult\n Result of the scipy.optimize.minimize function.\n\n max_iter : int, default=None\n Expected maximum number of iterations.\n\n extra_warning_msg : str, default=None\n Extra warning message.\n\n Returns\n -------\n n_iter : int\n Number of iterations.\n \"\"\"\n if solver == 'lbfgs':\n if result.status != 0:\n try:\n result_message = result.message.decode('latin1')\n except AttributeError:\n result_message = result.message\n warning_msg = '{} failed to converge (status={}):\\n{}.\\n\\nIncrease the number of iterations (max_iter) or scale the data as shown in:\\n https://scikit-learn.org/stable/modules/preprocessing.html'.format(solver, result.status, result_message)\n if extra_warning_msg is not None:\n warning_msg += '\\n' + extra_warning_msg\n warnings.warn(warning_msg, ConvergenceWarning, stacklevel=2)\n if max_iter is not None:\n n_iter_i = min(result.nit, max_iter)\n else:\n n_iter_i = result.nit\n else:\n raise NotImplementedError\n return n_iter_i" }, { @@ -175332,7 +189266,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "fprime", @@ -175342,7 +189277,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "xk", @@ -175352,7 +189288,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "pk", @@ -175362,7 +189299,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "gfk", @@ -175372,7 +189310,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "old_fval", @@ -175382,7 +189321,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "old_old_fval", @@ -175392,13 +189332,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Same as line_search_wolfe1, but fall back to line_search_wolfe2 if suitable step length is not found, and raise an exception if a suitable step length is not found.", - "docstring": "Same as line_search_wolfe1, but fall back to line_search_wolfe2 if\nsuitable step length is not found, and raise an exception if a\nsuitable step length is not found.\n\nRaises\n------\n_LineSearchError\n If no suitable step size is found.", + "description": "Same as line_search_wolfe1, but fall back to line_search_wolfe2 if\nsuitable step length is not found, and raise an exception if a\nsuitable step length is not found.", + "docstring": "\n Same as line_search_wolfe1, but fall back to line_search_wolfe2 if\n suitable step length is not found, and raise an exception if a\n suitable step length is not found.\n\n Raises\n ------\n _LineSearchError\n If no suitable step size is found.\n\n ", "source_code": "\ndef _line_search_wolfe12(f, fprime, xk, pk, gfk, old_fval, old_old_fval, **kwargs):\n \"\"\"\n Same as line_search_wolfe1, but fall back to line_search_wolfe2 if\n 
suitable step length is not found, and raise an exception if a\n suitable step length is not found.\n\n Raises\n ------\n _LineSearchError\n If no suitable step size is found.\n\n \"\"\"\n ret = line_search_wolfe1(f, fprime, xk, pk, gfk, old_fval, old_old_fval, **kwargs)\n if ret[0] is None:\n ret = line_search_wolfe2(f, fprime, xk, pk, gfk, old_fval, old_old_fval, **kwargs)\n if ret[0] is None:\n raise _LineSearchError()\n return ret" }, { @@ -175416,7 +189357,8 @@ "docstring": { "type": "callable", "description": "Should return the gradient and a callable returning the matvec product\nof the Hessian." - } + }, + "refined_type": {} }, { "name": "func", @@ -175426,7 +189368,8 @@ "docstring": { "type": "callable", "description": "Should return the value of the function." - } + }, + "refined_type": {} }, { "name": "grad", @@ -175436,7 +189379,8 @@ "docstring": { "type": "callable", "description": "Should return the function value and the gradient. This is used\nby the linesearch functions." - } + }, + "refined_type": {} }, { "name": "x0", @@ -175446,7 +189390,8 @@ "docstring": { "type": "array of float", "description": "Initial guess." - } + }, + "refined_type": {} }, { "name": "args", @@ -175456,7 +189401,8 @@ "docstring": { "type": "tuple, default=()", "description": "Arguments passed to func_grad_hess, func and grad." - } + }, + "refined_type": {} }, { "name": "tol", @@ -175466,6 +189412,10 @@ "docstring": { "type": "float, default=1e-4", "description": "Stopping criterion. The iteration will stop when\n``max{|g_i | i = 1, ..., n} <= tol``\nwhere ``g_i`` is the i-th component of the gradient." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -175476,7 +189426,8 @@ "docstring": { "type": "int, default=100", "description": "Number of Newton iterations." - } + }, + "refined_type": {} }, { "name": "maxinner", @@ -175486,7 +189437,8 @@ "docstring": { "type": "int, default=200", "description": "Number of CG iterations." - } + }, + "refined_type": {} }, { "name": "line_search", @@ -175496,7 +189448,8 @@ "docstring": { "type": "bool, default=True", "description": "Whether to use a line search or not." - } + }, + "refined_type": {} }, { "name": "warn", @@ -175506,13 +189459,14 @@ "docstring": { "type": "bool, default=True", "description": "Whether to warn when didn't converge." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Minimization of scalar function of one or more variables using the Newton-CG algorithm.", - "docstring": "Minimization of scalar function of one or more variables using the\nNewton-CG algorithm.\n\nParameters\n----------\ngrad_hess : callable\n Should return the gradient and a callable returning the matvec product\n of the Hessian.\n\nfunc : callable\n Should return the value of the function.\n\ngrad : callable\n Should return the function value and the gradient. This is used\n by the linesearch functions.\n\nx0 : array of float\n Initial guess.\n\nargs : tuple, default=()\n Arguments passed to func_grad_hess, func and grad.\n\ntol : float, default=1e-4\n Stopping criterion. 
The iteration will stop when\n ``max{|g_i | i = 1, ..., n} <= tol``\n where ``g_i`` is the i-th component of the gradient.\n\nmaxiter : int, default=100\n Number of Newton iterations.\n\nmaxinner : int, default=200\n Number of CG iterations.\n\nline_search : bool, default=True\n Whether to use a line search or not.\n\nwarn : bool, default=True\n Whether to warn when didn't converge.\n\nReturns\n-------\nxk : ndarray of float\n Estimated minimum.", + "description": "Minimization of scalar function of one or more variables using the\nNewton-CG algorithm.", + "docstring": "\n Minimization of scalar function of one or more variables using the\n Newton-CG algorithm.\n\n Parameters\n ----------\n grad_hess : callable\n Should return the gradient and a callable returning the matvec product\n of the Hessian.\n\n func : callable\n Should return the value of the function.\n\n grad : callable\n Should return the function value and the gradient. This is used\n by the linesearch functions.\n\n x0 : array of float\n Initial guess.\n\n args : tuple, default=()\n Arguments passed to func_grad_hess, func and grad.\n\n tol : float, default=1e-4\n Stopping criterion. The iteration will stop when\n ``max{|g_i | i = 1, ..., n} <= tol``\n where ``g_i`` is the i-th component of the gradient.\n\n maxiter : int, default=100\n Number of Newton iterations.\n\n maxinner : int, default=200\n Number of CG iterations.\n\n line_search : bool, default=True\n Whether to use a line search or not.\n\n warn : bool, default=True\n Whether to warn when didn't converge.\n\n Returns\n -------\n xk : ndarray of float\n Estimated minimum.\n ", "source_code": "\ndef _newton_cg(grad_hess, func, grad, x0, args=(), tol=0.0001, maxiter=100, maxinner=200, line_search=True, warn=True):\n \"\"\"\n Minimization of scalar function of one or more variables using the\n Newton-CG algorithm.\n\n Parameters\n ----------\n grad_hess : callable\n Should return the gradient and a callable returning the matvec product\n of the Hessian.\n\n func : callable\n Should return the value of the function.\n\n grad : callable\n Should return the function value and the gradient. This is used\n by the linesearch functions.\n\n x0 : array of float\n Initial guess.\n\n args : tuple, default=()\n Arguments passed to func_grad_hess, func and grad.\n\n tol : float, default=1e-4\n Stopping criterion. 
The iteration will stop when\n ``max{|g_i | i = 1, ..., n} <= tol``\n where ``g_i`` is the i-th component of the gradient.\n\n maxiter : int, default=100\n Number of Newton iterations.\n\n maxinner : int, default=200\n Number of CG iterations.\n\n line_search : bool, default=True\n Whether to use a line search or not.\n\n warn : bool, default=True\n Whether to warn when didn't converge.\n\n Returns\n -------\n xk : ndarray of float\n Estimated minimum.\n \"\"\"\n x0 = np.asarray(x0).flatten()\n xk = x0\n k = 0\n if line_search:\n old_fval = func(x0, *args)\n old_old_fval = None\n while k < maxiter:\n (fgrad, fhess_p) = grad_hess(xk, *args)\n absgrad = np.abs(fgrad)\n if np.max(absgrad) <= tol:\n break\n maggrad = np.sum(absgrad)\n eta = min([0.5, np.sqrt(maggrad)])\n termcond = eta * maggrad\n xsupi = _cg(fhess_p, fgrad, maxiter=maxinner, tol=termcond)\n alphak = 1.0\n if line_search:\n try:\n (alphak, fc, gc, old_fval, old_old_fval, gfkp1) = _line_search_wolfe12(func, grad, xk, xsupi, fgrad, old_fval, old_old_fval, args=args)\n except _LineSearchError:\n warnings.warn('Line Search failed')\n break\n xk = xk + alphak * xsupi\n k += 1\n if warn and k >= maxiter:\n warnings.warn('newton-cg failed to converge. Increase the number of iterations.', ConvergenceWarning)\n return xk, k" }, { @@ -175530,7 +189484,8 @@ "docstring": { "type": "int,", "description": "Number of samples to draw in each column." - } + }, + "refined_type": {} }, { "name": "classes", @@ -175540,7 +189495,8 @@ "docstring": { "type": "list of size n_outputs of arrays of size (n_classes,)", "description": "List of classes for each column." - } + }, + "refined_type": {} }, { "name": "class_probability", @@ -175550,7 +189506,8 @@ "docstring": { "type": "list of size n_outputs of arrays of shape (n_classes,), default=None", "description": "Class distribution of each column. If None, uniform distribution is\nassumed." - } + }, + "refined_type": {} }, { "name": "random_state", @@ -175560,13 +189517,14 @@ "docstring": { "type": "int, RandomState instance or None, default=None", "description": "Controls the randomness of the sampled classes.\nSee :term:`Glossary `." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Generate a sparse random matrix given column class distributions", - "docstring": "Generate a sparse random matrix given column class distributions\n\nParameters\n----------\nn_samples : int,\n Number of samples to draw in each column.\n\nclasses : list of size n_outputs of arrays of size (n_classes,)\n List of classes for each column.\n\nclass_probability : list of size n_outputs of arrays of shape (n_classes,), default=None\n Class distribution of each column. If None, uniform distribution is\n assumed.\n\nrandom_state : int, RandomState instance or None, default=None\n Controls the randomness of the sampled classes.\n See :term:`Glossary `.\n\nReturns\n-------\nrandom_matrix : sparse csc matrix of size (n_samples, n_outputs)", + "docstring": "Generate a sparse random matrix given column class distributions\n\n Parameters\n ----------\n n_samples : int,\n Number of samples to draw in each column.\n\n classes : list of size n_outputs of arrays of size (n_classes,)\n List of classes for each column.\n\n class_probability : list of size n_outputs of arrays of shape (n_classes,), default=None\n Class distribution of each column. 
If None, uniform distribution is\n assumed.\n\n random_state : int, RandomState instance or None, default=None\n Controls the randomness of the sampled classes.\n See :term:`Glossary `.\n\n Returns\n -------\n random_matrix : sparse csc matrix of size (n_samples, n_outputs)\n\n ", "source_code": "\ndef _random_choice_csc(n_samples, classes, class_probability=None, random_state=None):\n \"\"\"Generate a sparse random matrix given column class distributions\n\n Parameters\n ----------\n n_samples : int,\n Number of samples to draw in each column.\n\n classes : list of size n_outputs of arrays of size (n_classes,)\n List of classes for each column.\n\n class_probability : list of size n_outputs of arrays of shape (n_classes,), default=None\n Class distribution of each column. If None, uniform distribution is\n assumed.\n\n random_state : int, RandomState instance or None, default=None\n Controls the randomness of the sampled classes.\n See :term:`Glossary `.\n\n Returns\n -------\n random_matrix : sparse csc matrix of size (n_samples, n_outputs)\n\n \"\"\"\n data = array.array('i')\n indices = array.array('i')\n indptr = array.array('i', [0])\n for j in range(len(classes)):\n classes[j] = np.asarray(classes[j])\n if classes[j].dtype.kind != 'i':\n raise ValueError('class dtype %s is not supported' % classes[j].dtype)\n classes[j] = classes[j].astype(np.int64, copy=False)\n if class_probability is None:\n class_prob_j = np.empty(shape=classes[j].shape[0])\n class_prob_j.fill(1 / classes[j].shape[0])\n else:\n class_prob_j = np.asarray(class_probability[j])\n if not np.isclose(np.sum(class_prob_j), 1.0):\n raise ValueError('Probability array at index {0} does not sum to one'.format(j))\n if class_prob_j.shape[0] != classes[j].shape[0]:\n raise ValueError('classes[{0}] (length {1}) and class_probability[{0}] (length {2}) have different length.'.format(j, classes[j].shape[0], class_prob_j.shape[0]))\n if 0 not in classes[j]:\n classes[j] = np.insert(classes[j], 0, 0)\n class_prob_j = np.insert(class_prob_j, 0, 0.0)\n rng = check_random_state(random_state)\n if classes[j].shape[0] > 1:\n p_nonzero = 1 - class_prob_j[classes[j] == 0]\n nnz = int(n_samples * p_nonzero)\n ind_sample = sample_without_replacement(n_population=n_samples, n_samples=nnz, random_state=random_state)\n indices.extend(ind_sample)\n classes_j_nonzero = classes[j] != 0\n class_probability_nz = class_prob_j[classes_j_nonzero]\n class_probability_nz_norm = class_probability_nz / np.sum(class_probability_nz)\n classes_ind = np.searchsorted(class_probability_nz_norm.cumsum(), rng.rand(nnz))\n data.extend(classes[j][classes_j_nonzero][classes_ind])\n indptr.append(len(indices))\n return sp.csc_matrix((data, indices, indptr), (n_samples, len(classes)), dtype=int)" }, { @@ -175584,7 +189542,8 @@ "docstring": { "type": "bool, default=True", "description": "Implements resampling with replacement. If False, this will implement\n(sliced) random permutations." - } + }, + "refined_type": {} }, { "name": "n_samples", @@ -175594,7 +189553,8 @@ "docstring": { "type": "int, default=None", "description": "Number of samples to generate. If left to None this is\nautomatically set to the first dimension of the arrays.\nIf replace is False it should not be larger than the length of\narrays." 
- } + }, + "refined_type": {} }, { "name": "random_state", @@ -175604,7 +189564,8 @@ "docstring": { "type": "int, RandomState instance or None, default=None", "description": "Determines random number generation for shuffling\nthe data.\nPass an int for reproducible results across multiple function calls.\nSee :term:`Glossary `." - } + }, + "refined_type": {} }, { "name": "stratify", @@ -175614,13 +189575,14 @@ "docstring": { "type": "array-like of shape (n_samples,) or (n_samples, n_outputs), default=None", "description": "If not None, data is split in a stratified fashion, using this as\nthe class labels." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Resample arrays or sparse matrices in a consistent way.\n\nThe default strategy implements one step of the bootstrapping procedure.", - "docstring": "Resample arrays or sparse matrices in a consistent way.\n\nThe default strategy implements one step of the bootstrapping\nprocedure.\n\nParameters\n----------\n*arrays : sequence of array-like of shape (n_samples,) or (n_samples, n_outputs)\n Indexable data-structures can be arrays, lists, dataframes or scipy\n sparse matrices with consistent first dimension.\n\nreplace : bool, default=True\n Implements resampling with replacement. If False, this will implement\n (sliced) random permutations.\n\nn_samples : int, default=None\n Number of samples to generate. If left to None this is\n automatically set to the first dimension of the arrays.\n If replace is False it should not be larger than the length of\n arrays.\n\nrandom_state : int, RandomState instance or None, default=None\n Determines random number generation for shuffling\n the data.\n Pass an int for reproducible results across multiple function calls.\n See :term:`Glossary `.\n\nstratify : array-like of shape (n_samples,) or (n_samples, n_outputs), default=None\n If not None, data is split in a stratified fashion, using this as\n the class labels.\n\nReturns\n-------\nresampled_arrays : sequence of array-like of shape (n_samples,) or (n_samples, n_outputs)\n Sequence of resampled copies of the collections. The original arrays\n are not impacted.\n\nExamples\n--------\nIt is possible to mix sparse and dense arrays in the same run::\n\n >>> import numpy as np\n >>> X = np.array([[1., 0.], [2., 1.], [0., 0.]])\n >>> y = np.array([0, 1, 2])\n\n >>> from scipy.sparse import coo_matrix\n >>> X_sparse = coo_matrix(X)\n\n >>> from sklearn.utils import resample\n >>> X, X_sparse, y = resample(X, X_sparse, y, random_state=0)\n >>> X\n array([[1., 0.],\n [2., 1.],\n [1., 0.]])\n\n >>> X_sparse\n <3x2 sparse matrix of type '<... 'numpy.float64'>'\n with 4 stored elements in Compressed Sparse Row format>\n\n >>> X_sparse.toarray()\n array([[1., 0.],\n [2., 1.],\n [1., 0.]])\n\n >>> y\n array([0, 1, 0])\n\n >>> resample(y, n_samples=2, random_state=0)\n array([0, 1])\n\nExample using stratification::\n\n >>> y = [0, 0, 1, 1, 1, 1, 1, 1, 1]\n >>> resample(y, n_samples=5, replace=False, stratify=y,\n ... 
random_state=0)\n [1, 1, 1, 0, 1]\n\nSee Also\n--------\nshuffle", + "description": "Resample arrays or sparse matrices in a consistent way.\n\nThe default strategy implements one step of the bootstrapping\nprocedure.", + "docstring": "Resample arrays or sparse matrices in a consistent way.\n\n The default strategy implements one step of the bootstrapping\n procedure.\n\n Parameters\n ----------\n *arrays : sequence of array-like of shape (n_samples,) or (n_samples, n_outputs)\n Indexable data-structures can be arrays, lists, dataframes or scipy\n sparse matrices with consistent first dimension.\n\n replace : bool, default=True\n Implements resampling with replacement. If False, this will implement\n (sliced) random permutations.\n\n n_samples : int, default=None\n Number of samples to generate. If left to None this is\n automatically set to the first dimension of the arrays.\n If replace is False it should not be larger than the length of\n arrays.\n\n random_state : int, RandomState instance or None, default=None\n Determines random number generation for shuffling\n the data.\n Pass an int for reproducible results across multiple function calls.\n See :term:`Glossary `.\n\n stratify : array-like of shape (n_samples,) or (n_samples, n_outputs), default=None\n If not None, data is split in a stratified fashion, using this as\n the class labels.\n\n Returns\n -------\n resampled_arrays : sequence of array-like of shape (n_samples,) or (n_samples, n_outputs)\n Sequence of resampled copies of the collections. The original arrays\n are not impacted.\n\n Examples\n --------\n It is possible to mix sparse and dense arrays in the same run::\n\n >>> import numpy as np\n >>> X = np.array([[1., 0.], [2., 1.], [0., 0.]])\n >>> y = np.array([0, 1, 2])\n\n >>> from scipy.sparse import coo_matrix\n >>> X_sparse = coo_matrix(X)\n\n >>> from sklearn.utils import resample\n >>> X, X_sparse, y = resample(X, X_sparse, y, random_state=0)\n >>> X\n array([[1., 0.],\n [2., 1.],\n [1., 0.]])\n\n >>> X_sparse\n <3x2 sparse matrix of type '<... 'numpy.float64'>'\n with 4 stored elements in Compressed Sparse Row format>\n\n >>> X_sparse.toarray()\n array([[1., 0.],\n [2., 1.],\n [1., 0.]])\n\n >>> y\n array([0, 1, 0])\n\n >>> resample(y, n_samples=2, random_state=0)\n array([0, 1])\n\n Example using stratification::\n\n >>> y = [0, 0, 1, 1, 1, 1, 1, 1, 1]\n >>> resample(y, n_samples=5, replace=False, stratify=y,\n ... random_state=0)\n [1, 1, 1, 0, 1]\n\n See Also\n --------\n shuffle\n ", "source_code": "\ndef resample(*arrays, replace=True, n_samples=None, random_state=None, stratify=None):\n \"\"\"Resample arrays or sparse matrices in a consistent way.\n\n The default strategy implements one step of the bootstrapping\n procedure.\n\n Parameters\n ----------\n *arrays : sequence of array-like of shape (n_samples,) or (n_samples, n_outputs)\n Indexable data-structures can be arrays, lists, dataframes or scipy\n sparse matrices with consistent first dimension.\n\n replace : bool, default=True\n Implements resampling with replacement. If False, this will implement\n (sliced) random permutations.\n\n n_samples : int, default=None\n Number of samples to generate. 
If left to None this is\n automatically set to the first dimension of the arrays.\n If replace is False it should not be larger than the length of\n arrays.\n\n random_state : int, RandomState instance or None, default=None\n Determines random number generation for shuffling\n the data.\n Pass an int for reproducible results across multiple function calls.\n See :term:`Glossary `.\n\n stratify : array-like of shape (n_samples,) or (n_samples, n_outputs), default=None\n If not None, data is split in a stratified fashion, using this as\n the class labels.\n\n Returns\n -------\n resampled_arrays : sequence of array-like of shape (n_samples,) or (n_samples, n_outputs)\n Sequence of resampled copies of the collections. The original arrays\n are not impacted.\n\n Examples\n --------\n It is possible to mix sparse and dense arrays in the same run::\n\n >>> import numpy as np\n >>> X = np.array([[1., 0.], [2., 1.], [0., 0.]])\n >>> y = np.array([0, 1, 2])\n\n >>> from scipy.sparse import coo_matrix\n >>> X_sparse = coo_matrix(X)\n\n >>> from sklearn.utils import resample\n >>> X, X_sparse, y = resample(X, X_sparse, y, random_state=0)\n >>> X\n array([[1., 0.],\n [2., 1.],\n [1., 0.]])\n\n >>> X_sparse\n <3x2 sparse matrix of type '<... 'numpy.float64'>'\n with 4 stored elements in Compressed Sparse Row format>\n\n >>> X_sparse.toarray()\n array([[1., 0.],\n [2., 1.],\n [1., 0.]])\n\n >>> y\n array([0, 1, 0])\n\n >>> resample(y, n_samples=2, random_state=0)\n array([0, 1])\n\n Example using stratification::\n\n >>> y = [0, 0, 1, 1, 1, 1, 1, 1, 1]\n >>> resample(y, n_samples=5, replace=False, stratify=y,\n ... random_state=0)\n [1, 1, 1, 0, 1]\n\n See Also\n --------\n shuffle\n \"\"\"\n max_n_samples = n_samples\n random_state = check_random_state(random_state)\n if len(arrays) == 0:\n return None\n first = arrays[0]\n n_samples = first.shape[0] if hasattr(first, 'shape') else len(first)\n if max_n_samples is None:\n max_n_samples = n_samples\n elif max_n_samples > n_samples and not replace:\n raise ValueError('Cannot sample %d out of arrays with dim %d when replace is False' % (max_n_samples, n_samples))\n check_consistent_length(*arrays)\n if stratify is None:\n if replace:\n indices = random_state.randint(0, n_samples, size=(max_n_samples, ))\n else:\n indices = np.arange(n_samples)\n random_state.shuffle(indices)\n indices = indices[:max_n_samples]\n else:\n y = check_array(stratify, ensure_2d=False, dtype=None)\n if y.ndim == 2:\n y = np.array([' '.join(row.astype('str')) for row in y])\n (classes, y_indices) = np.unique(y, return_inverse=True)\n n_classes = classes.shape[0]\n class_counts = np.bincount(y_indices)\n class_indices = np.split(np.argsort(y_indices, kind='mergesort'), np.cumsum(class_counts)[:-1])\n n_i = _approximate_mode(class_counts, max_n_samples, random_state)\n indices = []\n for i in range(n_classes):\n indices_i = random_state.choice(class_indices[i], n_i[i], replace=replace)\n indices.extend(indices_i)\n indices = random_state.permutation(indices)\n arrays = [a.tocsr() if issparse(a) else a for a in arrays]\n resampled_arrays = [_safe_indexing(a, indices) for a in arrays]\n if len(resampled_arrays) == 1:\n return resampled_arrays[0]\n else:\n return resampled_arrays" }, { @@ -175638,6 +189600,10 @@ "docstring": { "type": "{array-like, sparse matrix}", "description": "Data on which to apply mask." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -175648,13 +189614,14 @@ "docstring": { "type": "ndarray", "description": "Mask to be used on X." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Return a mask which is safe to use on X.", - "docstring": "Return a mask which is safe to use on X.\n\nParameters\n----------\nX : {array-like, sparse matrix}\n Data on which to apply mask.\n\nmask : ndarray\n Mask to be used on X.\n\nReturns\n-------\n mask", + "docstring": "Return a mask which is safe to use on X.\n\n Parameters\n ----------\n X : {array-like, sparse matrix}\n Data on which to apply mask.\n\n mask : ndarray\n Mask to be used on X.\n\n Returns\n -------\n mask\n ", "source_code": "\ndef safe_mask(X, mask):\n \"\"\"Return a mask which is safe to use on X.\n\n Parameters\n ----------\n X : {array-like, sparse matrix}\n Data on which to apply mask.\n\n mask : ndarray\n Mask to be used on X.\n\n Returns\n -------\n mask\n \"\"\"\n mask = np.asarray(mask)\n if np.issubdtype(mask.dtype, np.signedinteger):\n return mask\n if hasattr(X, 'toarray'):\n ind = np.arange(mask.shape[0])\n mask = ind[mask]\n return mask" }, { @@ -175672,6 +189639,10 @@ "docstring": { "type": "{array-like, ndarray, sparse matrix}", "description": "" + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -175682,13 +189653,14 @@ "docstring": { "type": "bool, default=True", "description": "Whether to create a copy of X and operate on it or to perform\ninplace computation (default behaviour)." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Element wise squaring of array-likes and sparse matrices.", - "docstring": "Element wise squaring of array-likes and sparse matrices.\n\nParameters\n----------\nX : {array-like, ndarray, sparse matrix}\n\ncopy : bool, default=True\n Whether to create a copy of X and operate on it or to perform\n inplace computation (default behaviour).\n\nReturns\n-------\nX ** 2 : element wise square", + "docstring": "Element wise squaring of array-likes and sparse matrices.\n\n Parameters\n ----------\n X : {array-like, ndarray, sparse matrix}\n\n copy : bool, default=True\n Whether to create a copy of X and operate on it or to perform\n inplace computation (default behaviour).\n\n Returns\n -------\n X ** 2 : element wise square\n ", "source_code": "\ndef safe_sqr(X, *, copy=True):\n \"\"\"Element wise squaring of array-likes and sparse matrices.\n\n Parameters\n ----------\n X : {array-like, ndarray, sparse matrix}\n\n copy : bool, default=True\n Whether to create a copy of X and operate on it or to perform\n inplace computation (default behaviour).\n\n Returns\n -------\n X ** 2 : element wise square\n \"\"\"\n X = check_array(X, accept_sparse=['csr', 'csc', 'coo'], ensure_2d=False)\n if issparse(X):\n if copy:\n X = X.copy()\n X.data **= 2\n elif copy:\n X = X**2\n else:\n X **= 2\n return X" }, { @@ -175706,7 +189678,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "top_path", @@ -175716,14 +189689,15 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "", - "docstring": "", - "source_code": "\ndef configuration(parent_package='', top_path=None):\n import numpy\n from numpy.distutils.misc_util import Configuration\n config = Configuration('utils', parent_package, top_path)\n libraries = []\n if os.name == 'posix':\n libraries.append('m')\n config.add_extension('sparsefuncs_fast', sources=['sparsefuncs_fast.pyx'], libraries=libraries)\n config.add_extension('_cython_blas', sources=['_cython_blas.pyx'], 
libraries=libraries)\n config.add_extension('arrayfuncs', sources=['arrayfuncs.pyx'], include_dirs=[numpy.get_include()], libraries=libraries)\n config.add_extension('murmurhash', sources=['murmurhash.pyx', join('src', 'MurmurHash3.cpp')], include_dirs=['src'])\n config.add_extension('_fast_dict', sources=['_fast_dict.pyx'], language='c++', include_dirs=[numpy.get_include()], libraries=libraries)\n config.add_extension('_openmp_helpers', sources=['_openmp_helpers.pyx'], libraries=libraries)\n templates = ['sklearn/utils/_seq_dataset.pyx.tp', 'sklearn/utils/_seq_dataset.pxd.tp', 'sklearn/utils/_weight_vector.pyx.tp', 'sklearn/utils/_weight_vector.pxd.tp']\n gen_from_templates(templates)\n config.add_extension('_seq_dataset', sources=['_seq_dataset.pyx'], include_dirs=[numpy.get_include()])\n config.add_extension('_weight_vector', sources=['_weight_vector.pyx'], include_dirs=[numpy.get_include()], libraries=libraries)\n config.add_extension('_random', sources=['_random.pyx'], include_dirs=[numpy.get_include()], libraries=libraries)\n config.add_extension('_logistic_sigmoid', sources=['_logistic_sigmoid.pyx'], include_dirs=[numpy.get_include()], libraries=libraries)\n config.add_extension('_readonly_array_wrapper', sources=['_readonly_array_wrapper.pyx'], libraries=libraries)\n config.add_subpackage('tests')\n return config" + "docstring": null, + "source_code": "\ndef configuration(parent_package='', top_path=None):\n import numpy\n from numpy.distutils.misc_util import Configuration\n config = Configuration('utils', parent_package, top_path)\n libraries = []\n if os.name == 'posix':\n libraries.append('m')\n config.add_extension('sparsefuncs_fast', sources=['sparsefuncs_fast.pyx'], libraries=libraries)\n config.add_extension('_cython_blas', sources=['_cython_blas.pyx'], libraries=libraries)\n config.add_extension('arrayfuncs', sources=['arrayfuncs.pyx'], include_dirs=[numpy.get_include()], libraries=libraries)\n config.add_extension('murmurhash', sources=['murmurhash.pyx', join('src', 'MurmurHash3.cpp')], include_dirs=['src'])\n config.add_extension('_fast_dict', sources=['_fast_dict.pyx'], language='c++', include_dirs=[numpy.get_include()], libraries=libraries)\n config.add_extension('_openmp_helpers', sources=['_openmp_helpers.pyx'], libraries=libraries)\n templates = ['sklearn/utils/_seq_dataset.pyx.tp', 'sklearn/utils/_seq_dataset.pxd.tp', 'sklearn/utils/_weight_vector.pyx.tp', 'sklearn/utils/_weight_vector.pxd.tp']\n gen_from_templates(templates)\n config.add_extension('_seq_dataset', sources=['_seq_dataset.pyx'], include_dirs=[numpy.get_include()])\n config.add_extension('_weight_vector', sources=['_weight_vector.pyx'], include_dirs=[numpy.get_include()], libraries=libraries)\n config.add_extension('_random', sources=['_random.pyx'], include_dirs=[numpy.get_include()], libraries=libraries)\n config.add_extension('_logistic_sigmoid', sources=['_logistic_sigmoid.pyx'], include_dirs=[numpy.get_include()], libraries=libraries)\n config.add_extension('_readonly_array_wrapper', sources=['_readonly_array_wrapper.pyx'], libraries=libraries)\n config.add_extension('_typedefs', sources=['_typedefs.pyx'], include_dirs=[numpy.get_include()], libraries=libraries)\n config.add_subpackage('tests')\n return config" }, { "name": "shuffle", @@ -175740,7 +189714,8 @@ "docstring": { "type": "int, RandomState instance or None, default=None", "description": "Determines random number generation for shuffling\nthe data.\nPass an int for reproducible results across multiple function calls.\nSee 
:term:`Glossary `." - } + }, + "refined_type": {} }, { "name": "n_samples", @@ -175750,13 +189725,14 @@ "docstring": { "type": "int, default=None", "description": "Number of samples to generate. If left to None this is\nautomatically set to the first dimension of the arrays. It should\nnot be larger than the length of arrays." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Shuffle arrays or sparse matrices in a consistent way.\n\nThis is a convenience alias to ``resample(*arrays, replace=False)`` to do random permutations of the collections.", - "docstring": "Shuffle arrays or sparse matrices in a consistent way.\n\nThis is a convenience alias to ``resample(*arrays, replace=False)`` to do\nrandom permutations of the collections.\n\nParameters\n----------\n*arrays : sequence of indexable data-structures\n Indexable data-structures can be arrays, lists, dataframes or scipy\n sparse matrices with consistent first dimension.\n\nrandom_state : int, RandomState instance or None, default=None\n Determines random number generation for shuffling\n the data.\n Pass an int for reproducible results across multiple function calls.\n See :term:`Glossary `.\n\nn_samples : int, default=None\n Number of samples to generate. If left to None this is\n automatically set to the first dimension of the arrays. It should\n not be larger than the length of arrays.\n\nReturns\n-------\nshuffled_arrays : sequence of indexable data-structures\n Sequence of shuffled copies of the collections. The original arrays\n are not impacted.\n\nExamples\n--------\nIt is possible to mix sparse and dense arrays in the same run::\n\n >>> import numpy as np\n >>> X = np.array([[1., 0.], [2., 1.], [0., 0.]])\n >>> y = np.array([0, 1, 2])\n\n >>> from scipy.sparse import coo_matrix\n >>> X_sparse = coo_matrix(X)\n\n >>> from sklearn.utils import shuffle\n >>> X, X_sparse, y = shuffle(X, X_sparse, y, random_state=0)\n >>> X\n array([[0., 0.],\n [2., 1.],\n [1., 0.]])\n\n >>> X_sparse\n <3x2 sparse matrix of type '<... 'numpy.float64'>'\n with 3 stored elements in Compressed Sparse Row format>\n\n >>> X_sparse.toarray()\n array([[0., 0.],\n [2., 1.],\n [1., 0.]])\n\n >>> y\n array([2, 1, 0])\n\n >>> shuffle(y, n_samples=2, random_state=0)\n array([0, 1])\n\nSee Also\n--------\nresample", + "description": "Shuffle arrays or sparse matrices in a consistent way.\n\nThis is a convenience alias to ``resample(*arrays, replace=False)`` to do\nrandom permutations of the collections.", + "docstring": "Shuffle arrays or sparse matrices in a consistent way.\n\n This is a convenience alias to ``resample(*arrays, replace=False)`` to do\n random permutations of the collections.\n\n Parameters\n ----------\n *arrays : sequence of indexable data-structures\n Indexable data-structures can be arrays, lists, dataframes or scipy\n sparse matrices with consistent first dimension.\n\n random_state : int, RandomState instance or None, default=None\n Determines random number generation for shuffling\n the data.\n Pass an int for reproducible results across multiple function calls.\n See :term:`Glossary `.\n\n n_samples : int, default=None\n Number of samples to generate. If left to None this is\n automatically set to the first dimension of the arrays. It should\n not be larger than the length of arrays.\n\n Returns\n -------\n shuffled_arrays : sequence of indexable data-structures\n Sequence of shuffled copies of the collections. 
The original arrays\n are not impacted.\n\n Examples\n --------\n It is possible to mix sparse and dense arrays in the same run::\n\n >>> import numpy as np\n >>> X = np.array([[1., 0.], [2., 1.], [0., 0.]])\n >>> y = np.array([0, 1, 2])\n\n >>> from scipy.sparse import coo_matrix\n >>> X_sparse = coo_matrix(X)\n\n >>> from sklearn.utils import shuffle\n >>> X, X_sparse, y = shuffle(X, X_sparse, y, random_state=0)\n >>> X\n array([[0., 0.],\n [2., 1.],\n [1., 0.]])\n\n >>> X_sparse\n <3x2 sparse matrix of type '<... 'numpy.float64'>'\n with 3 stored elements in Compressed Sparse Row format>\n\n >>> X_sparse.toarray()\n array([[0., 0.],\n [2., 1.],\n [1., 0.]])\n\n >>> y\n array([2, 1, 0])\n\n >>> shuffle(y, n_samples=2, random_state=0)\n array([0, 1])\n\n See Also\n --------\n resample\n ", "source_code": "\ndef shuffle(*arrays, random_state=None, n_samples=None):\n \"\"\"Shuffle arrays or sparse matrices in a consistent way.\n\n This is a convenience alias to ``resample(*arrays, replace=False)`` to do\n random permutations of the collections.\n\n Parameters\n ----------\n *arrays : sequence of indexable data-structures\n Indexable data-structures can be arrays, lists, dataframes or scipy\n sparse matrices with consistent first dimension.\n\n random_state : int, RandomState instance or None, default=None\n Determines random number generation for shuffling\n the data.\n Pass an int for reproducible results across multiple function calls.\n See :term:`Glossary `.\n\n n_samples : int, default=None\n Number of samples to generate. If left to None this is\n automatically set to the first dimension of the arrays. It should\n not be larger than the length of arrays.\n\n Returns\n -------\n shuffled_arrays : sequence of indexable data-structures\n Sequence of shuffled copies of the collections. The original arrays\n are not impacted.\n\n Examples\n --------\n It is possible to mix sparse and dense arrays in the same run::\n\n >>> import numpy as np\n >>> X = np.array([[1., 0.], [2., 1.], [0., 0.]])\n >>> y = np.array([0, 1, 2])\n\n >>> from scipy.sparse import coo_matrix\n >>> X_sparse = coo_matrix(X)\n\n >>> from sklearn.utils import shuffle\n >>> X, X_sparse, y = shuffle(X, X_sparse, y, random_state=0)\n >>> X\n array([[0., 0.],\n [2., 1.],\n [1., 0.]])\n\n >>> X_sparse\n <3x2 sparse matrix of type '<... 
'numpy.float64'>'\n with 3 stored elements in Compressed Sparse Row format>\n\n >>> X_sparse.toarray()\n array([[0., 0.],\n [2., 1.],\n [1., 0.]])\n\n >>> y\n array([2, 1, 0])\n\n >>> shuffle(y, n_samples=2, random_state=0)\n array([0, 1])\n\n See Also\n --------\n resample\n \"\"\"\n return resample(*arrays, replace=False, n_samples=n_samples, random_state=random_state)" }, { @@ -175774,7 +189750,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "data", @@ -175784,7 +189761,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_negative", @@ -175794,7 +189772,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_zeros", @@ -175804,7 +189783,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -175828,7 +189808,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "n_zeros", @@ -175838,13 +189819,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Compute the median of data with n_zeros additional zeros.\n\nThis function is used to support sparse matrices; it modifies data in-place.", - "docstring": "Compute the median of data with n_zeros additional zeros.\n\nThis function is used to support sparse matrices; it modifies data\nin-place.", + "description": "Compute the median of data with n_zeros additional zeros.\n\nThis function is used to support sparse matrices; it modifies data\nin-place.", + "docstring": "Compute the median of data with n_zeros additional zeros.\n\n This function is used to support sparse matrices; it modifies data\n in-place.\n ", "source_code": "\ndef _get_median(data, n_zeros):\n \"\"\"Compute the median of data with n_zeros additional zeros.\n\n This function is used to support sparse matrices; it modifies data\n in-place.\n \"\"\"\n n_elems = len(data) + n_zeros\n if not n_elems:\n return np.nan\n n_negative = np.count_nonzero(data < 0)\n (middle, is_odd) = divmod(n_elems, 2)\n data.sort()\n if is_odd:\n return _get_elem_at_rank(middle, data, n_negative, n_zeros)\n return (_get_elem_at_rank(middle - 1, data, n_negative, n_zeros) + _get_elem_at_rank(middle, data, n_negative, n_zeros)) / 2.0" }, { @@ -175862,7 +189844,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "axis", @@ -175872,7 +189855,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "min_or_max", @@ -175882,13 +189866,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _min_or_max_axis(X, axis, min_or_max):\n N = X.shape[axis]\n if N == 0:\n raise ValueError('zero-size array to reduction operation')\n M = X.shape[1 - axis]\n mat = X.tocsc() if axis == 0 else X.tocsr()\n mat.sum_duplicates()\n (major_index, value) = _minor_reduce(mat, min_or_max)\n not_full = np.diff(mat.indptr)[major_index] < N\n value[not_full] = min_or_max(value[not_full], 0)\n mask = value != 0\n major_index = np.compress(mask, major_index)\n value = np.compress(mask, value)\n if axis == 0:\n res = sp.coo_matrix((value, (np.zeros(len(value)), major_index)), dtype=X.dtype, shape=(1, M))\n else:\n res = sp.coo_matrix((value, (major_index, np.zeros(len(value)))), dtype=X.dtype, shape=(M, 1))\n return 
res.A.ravel()" }, { @@ -175906,7 +189891,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "ufunc", @@ -175916,13 +189902,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _minor_reduce(X, ufunc):\n major_index = np.flatnonzero(np.diff(X.indptr))\n X = type(X)((X.data, X.indices, X.indptr), shape=X.shape)\n value = ufunc.reduceat(X.data, X.indptr[major_index])\n return major_index, value" }, { @@ -175940,13 +189927,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _raise_error_wrong_axis(axis):\n if axis not in (0, 1):\n raise ValueError('Unknown axis value: %d. Use 0 for rows, or 1 for columns' % axis)" }, { @@ -175964,7 +189952,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -175988,7 +189977,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "axis", @@ -175998,13 +189988,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _sparse_min_max(X, axis):\n return _sparse_min_or_max(X, axis, np.minimum), _sparse_min_or_max(X, axis, np.maximum)" }, { @@ -176022,7 +190013,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "axis", @@ -176032,7 +190024,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "min_or_max", @@ -176042,13 +190035,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _sparse_min_or_max(X, axis, min_or_max):\n if axis is None:\n if 0 in X.shape:\n raise ValueError('zero-size array to reduction operation')\n zero = X.dtype.type(0)\n if X.nnz == 0:\n return zero\n m = min_or_max.reduce(X.data.ravel())\n if X.nnz != np.product(X.shape):\n m = min_or_max(zero, m)\n return m\n if axis < 0:\n axis += 2\n if axis == 0 or axis == 1:\n return _min_or_max_axis(X, axis, min_or_max)\n else:\n raise ValueError('invalid axis, use 0 for rows, or 1 for columns')" }, { @@ -176066,7 +190060,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "axis", @@ -176076,13 +190071,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _sparse_nan_min_max(X, axis):\n return _sparse_min_or_max(X, axis, np.fmin), _sparse_min_or_max(X, axis, np.fmax)" }, { @@ -176100,7 +190096,8 @@ "docstring": { "type": "sparse matrix of shape (n_samples, n_labels)", "description": "Input data. It should be of CSR format." - } + }, + "refined_type": {} }, { "name": "axis", @@ -176110,6 +190107,10 @@ "docstring": { "type": "{0, 1}, default=None", "description": "The axis on which the data is aggregated." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -176120,13 +190121,14 @@ "docstring": { "type": "array-like of shape (n_samples,), default=None", "description": "Weight for each row of X." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "A variant of X.getnnz() with extension to weighting on axis 0\n\nUseful in efficiently calculating multilabel metrics.", - "docstring": "A variant of X.getnnz() with extension to weighting on axis 0\n\nUseful in efficiently calculating multilabel metrics.\n\nParameters\n----------\nX : sparse matrix of shape (n_samples, n_labels)\n Input data. It should be of CSR format.\n\naxis : {0, 1}, default=None\n The axis on which the data is aggregated.\n\nsample_weight : array-like of shape (n_samples,), default=None\n Weight for each row of X.", + "docstring": "A variant of X.getnnz() with extension to weighting on axis 0\n\n Useful in efficiently calculating multilabel metrics.\n\n Parameters\n ----------\n X : sparse matrix of shape (n_samples, n_labels)\n Input data. It should be of CSR format.\n\n axis : {0, 1}, default=None\n The axis on which the data is aggregated.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Weight for each row of X.\n ", "source_code": "\ndef count_nonzero(X, axis=None, sample_weight=None):\n \"\"\"A variant of X.getnnz() with extension to weighting on axis 0\n\n Useful in efficiently calculating multilabel metrics.\n\n Parameters\n ----------\n X : sparse matrix of shape (n_samples, n_labels)\n Input data. It should be of CSR format.\n\n axis : {0, 1}, default=None\n The axis on which the data is aggregated.\n\n sample_weight : array-like of shape (n_samples,), default=None\n Weight for each row of X.\n \"\"\"\n if axis == -1:\n axis = 1\n elif axis == -2:\n axis = 0\n elif X.format != 'csr':\n raise TypeError('Expected CSR sparse format, got {0}'.format(X.format))\n if axis is None:\n if sample_weight is None:\n return X.nnz\n else:\n return np.dot(np.diff(X.indptr), sample_weight)\n elif axis == 1:\n out = np.diff(X.indptr)\n if sample_weight is None:\n return out.astype('intp')\n return out * sample_weight\n elif axis == 0:\n if sample_weight is None:\n return np.bincount(X.indices, minlength=X.shape[1])\n else:\n weights = np.repeat(sample_weight, np.diff(X.indptr))\n return np.bincount(X.indices, minlength=X.shape[1], weights=weights)\n else:\n raise ValueError('Unsupported axis: {0}'.format(axis))" }, { @@ -176144,13 +190146,14 @@ "docstring": { "type": "sparse matrix of shape (n_samples, n_features)", "description": "Input data. It should be of CSC format." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Find the median across axis 0 of a CSC matrix. It is equivalent to doing np.median(X, axis=0).", - "docstring": "Find the median across axis 0 of a CSC matrix.\nIt is equivalent to doing np.median(X, axis=0).\n\nParameters\n----------\nX : sparse matrix of shape (n_samples, n_features)\n Input data. It should be of CSC format.\n\nReturns\n-------\nmedian : ndarray of shape (n_features,)\n Median.", + "description": "Find the median across axis 0 of a CSC matrix.\nIt is equivalent to doing np.median(X, axis=0).", + "docstring": "Find the median across axis 0 of a CSC matrix.\n It is equivalent to doing np.median(X, axis=0).\n\n Parameters\n ----------\n X : sparse matrix of shape (n_samples, n_features)\n Input data. 
It should be of CSC format.\n\n Returns\n -------\n median : ndarray of shape (n_features,)\n Median.\n\n ", "source_code": "\ndef csc_median_axis_0(X):\n \"\"\"Find the median across axis 0 of a CSC matrix.\n It is equivalent to doing np.median(X, axis=0).\n\n Parameters\n ----------\n X : sparse matrix of shape (n_samples, n_features)\n Input data. It should be of CSC format.\n\n Returns\n -------\n median : ndarray of shape (n_features,)\n Median.\n\n \"\"\"\n if not isinstance(X, sp.csc_matrix):\n raise TypeError('Expected matrix of CSC format, got %s' % X.format)\n indptr = X.indptr\n (n_samples, n_features) = X.shape\n median = np.zeros(n_features)\n for (f_ind, (start, end)) in enumerate(zip(indptr[:-1], indptr[1:])):\n data = np.copy(X.data[start:end])\n nz = n_samples - data.size\n median[f_ind] = _get_median(data, nz)\n return median" }, { @@ -176168,7 +190171,8 @@ "docstring": { "type": "CSR or CSC sparse matrix of shape (n_samples, n_features)", "description": "Input data." - } + }, + "refined_type": {} }, { "name": "axis", @@ -176178,6 +190182,10 @@ "docstring": { "type": "{0, 1}", "description": "Axis along which the axis should be computed." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -176188,7 +190196,8 @@ "docstring": { "type": "ndarray of shape (n_features,) or (n_samples,), dtype=floating", "description": "Array of means to update with the new data X.\nShould be of shape (n_features,) if axis=0 or (n_samples,) if axis=1." - } + }, + "refined_type": {} }, { "name": "last_var", @@ -176198,7 +190207,8 @@ "docstring": { "type": "ndarray of shape (n_features,) or (n_samples,), dtype=floating", "description": "Array of variances to update with the new data X.\nShould be of shape (n_features,) if axis=0 or (n_samples,) if axis=1." - } + }, + "refined_type": {} }, { "name": "last_n", @@ -176208,7 +190218,8 @@ "docstring": { "type": "float or ndarray of shape (n_features,) or (n_samples,), dtype=floating", "description": "Sum of the weights seen so far, excluding the current weights\nIf not float, it should be of shape (n_samples,) if\naxis=0 or (n_features,) if axis=1. If float it corresponds to\nhaving same weights for all samples (or features)." - } + }, + "refined_type": {} }, { "name": "weights", @@ -176218,13 +190229,14 @@ "docstring": { "type": "ndarray of shape (n_samples,) or (n_features,), default=None", "description": "If axis is set to 0 shape is (n_samples,) or\nif axis is set to 1 shape is (n_features,).\nIf it is set to None, then samples are equally weighted.\n\n.. versionadded:: 0.24" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Compute incremental mean and variance along an axis on a CSR or CSC matrix.\n\nlast_mean, last_var are the statistics computed at the last step by this function. Both must be initialized to 0-arrays of the proper size, i.e. the number of features in X. last_n is the number of samples encountered until now.", - "docstring": "Compute incremental mean and variance along an axis on a CSR or\nCSC matrix.\n\nlast_mean, last_var are the statistics computed at the last step by this\nfunction. Both must be initialized to 0-arrays of the proper size, i.e.\nthe number of features in X. 
last_n is the number of samples encountered\nuntil now.\n\nParameters\n----------\nX : CSR or CSC sparse matrix of shape (n_samples, n_features)\n Input data.\n\naxis : {0, 1}\n Axis along which the axis should be computed.\n\nlast_mean : ndarray of shape (n_features,) or (n_samples,), dtype=floating\n Array of means to update with the new data X.\n Should be of shape (n_features,) if axis=0 or (n_samples,) if axis=1.\n\nlast_var : ndarray of shape (n_features,) or (n_samples,), dtype=floating\n Array of variances to update with the new data X.\n Should be of shape (n_features,) if axis=0 or (n_samples,) if axis=1.\n\nlast_n : float or ndarray of shape (n_features,) or (n_samples,), dtype=floating\n Sum of the weights seen so far, excluding the current weights\n If not float, it should be of shape (n_samples,) if\n axis=0 or (n_features,) if axis=1. If float it corresponds to\n having same weights for all samples (or features).\n\nweights : ndarray of shape (n_samples,) or (n_features,), default=None\n If axis is set to 0 shape is (n_samples,) or\n if axis is set to 1 shape is (n_features,).\n If it is set to None, then samples are equally weighted.\n\n .. versionadded:: 0.24\n\nReturns\n-------\nmeans : ndarray of shape (n_features,) or (n_samples,), dtype=floating\n Updated feature-wise means if axis = 0 or\n sample-wise means if axis = 1.\n\nvariances : ndarray of shape (n_features,) or (n_samples,), dtype=floating\n Updated feature-wise variances if axis = 0 or\n sample-wise variances if axis = 1.\n\nn : ndarray of shape (n_features,) or (n_samples,), dtype=integral\n Updated number of seen samples per feature if axis=0\n or number of seen features per sample if axis=1.\n\n If weights is not None, n is a sum of the weights of the seen\n samples or features instead of the actual number of seen\n samples or features.\n\nNotes\n-----\nNaNs are ignored in the algorithm.", + "description": "Compute incremental mean and variance along an axis on a CSR or\nCSC matrix.\n\nlast_mean, last_var are the statistics computed at the last step by this\nfunction. Both must be initialized to 0-arrays of the proper size, i.e.\nthe number of features in X. last_n is the number of samples encountered\nuntil now.", + "docstring": "Compute incremental mean and variance along an axis on a CSR or\n CSC matrix.\n\n last_mean, last_var are the statistics computed at the last step by this\n function. Both must be initialized to 0-arrays of the proper size, i.e.\n the number of features in X. last_n is the number of samples encountered\n until now.\n\n Parameters\n ----------\n X : CSR or CSC sparse matrix of shape (n_samples, n_features)\n Input data.\n\n axis : {0, 1}\n Axis along which the axis should be computed.\n\n last_mean : ndarray of shape (n_features,) or (n_samples,), dtype=floating\n Array of means to update with the new data X.\n Should be of shape (n_features,) if axis=0 or (n_samples,) if axis=1.\n\n last_var : ndarray of shape (n_features,) or (n_samples,), dtype=floating\n Array of variances to update with the new data X.\n Should be of shape (n_features,) if axis=0 or (n_samples,) if axis=1.\n\n last_n : float or ndarray of shape (n_features,) or (n_samples,), dtype=floating\n Sum of the weights seen so far, excluding the current weights\n If not float, it should be of shape (n_samples,) if\n axis=0 or (n_features,) if axis=1. 
If float it corresponds to\n having same weights for all samples (or features).\n\n weights : ndarray of shape (n_samples,) or (n_features,), default=None\n If axis is set to 0 shape is (n_samples,) or\n if axis is set to 1 shape is (n_features,).\n If it is set to None, then samples are equally weighted.\n\n .. versionadded:: 0.24\n\n Returns\n -------\n means : ndarray of shape (n_features,) or (n_samples,), dtype=floating\n Updated feature-wise means if axis = 0 or\n sample-wise means if axis = 1.\n\n variances : ndarray of shape (n_features,) or (n_samples,), dtype=floating\n Updated feature-wise variances if axis = 0 or\n sample-wise variances if axis = 1.\n\n n : ndarray of shape (n_features,) or (n_samples,), dtype=integral\n Updated number of seen samples per feature if axis=0\n or number of seen features per sample if axis=1.\n\n If weights is not None, n is a sum of the weights of the seen\n samples or features instead of the actual number of seen\n samples or features.\n\n Notes\n -----\n NaNs are ignored in the algorithm.\n ", "source_code": "\ndef incr_mean_variance_axis(X, *, axis, last_mean, last_var, last_n, weights=None):\n \"\"\"Compute incremental mean and variance along an axis on a CSR or\n CSC matrix.\n\n last_mean, last_var are the statistics computed at the last step by this\n function. Both must be initialized to 0-arrays of the proper size, i.e.\n the number of features in X. last_n is the number of samples encountered\n until now.\n\n Parameters\n ----------\n X : CSR or CSC sparse matrix of shape (n_samples, n_features)\n Input data.\n\n axis : {0, 1}\n Axis along which the axis should be computed.\n\n last_mean : ndarray of shape (n_features,) or (n_samples,), dtype=floating\n Array of means to update with the new data X.\n Should be of shape (n_features,) if axis=0 or (n_samples,) if axis=1.\n\n last_var : ndarray of shape (n_features,) or (n_samples,), dtype=floating\n Array of variances to update with the new data X.\n Should be of shape (n_features,) if axis=0 or (n_samples,) if axis=1.\n\n last_n : float or ndarray of shape (n_features,) or (n_samples,), dtype=floating\n Sum of the weights seen so far, excluding the current weights\n If not float, it should be of shape (n_samples,) if\n axis=0 or (n_features,) if axis=1. If float it corresponds to\n having same weights for all samples (or features).\n\n weights : ndarray of shape (n_samples,) or (n_features,), default=None\n If axis is set to 0 shape is (n_samples,) or\n if axis is set to 1 shape is (n_features,).\n If it is set to None, then samples are equally weighted.\n\n .. 
versionadded:: 0.24\n\n Returns\n -------\n means : ndarray of shape (n_features,) or (n_samples,), dtype=floating\n Updated feature-wise means if axis = 0 or\n sample-wise means if axis = 1.\n\n variances : ndarray of shape (n_features,) or (n_samples,), dtype=floating\n Updated feature-wise variances if axis = 0 or\n sample-wise variances if axis = 1.\n\n n : ndarray of shape (n_features,) or (n_samples,), dtype=integral\n Updated number of seen samples per feature if axis=0\n or number of seen features per sample if axis=1.\n\n If weights is not None, n is a sum of the weights of the seen\n samples or features instead of the actual number of seen\n samples or features.\n\n Notes\n -----\n NaNs are ignored in the algorithm.\n \"\"\"\n _raise_error_wrong_axis(axis)\n if not isinstance(X, (sp.csr_matrix, sp.csc_matrix)):\n _raise_typeerror(X)\n if np.size(last_n) == 1:\n last_n = np.full(last_mean.shape, last_n, dtype=last_mean.dtype)\n if not np.size(last_mean) == np.size(last_var) == np.size(last_n):\n raise ValueError('last_mean, last_var, last_n do not have the same shapes.')\n if axis == 1:\n if np.size(last_mean) != X.shape[0]:\n raise ValueError(f'If axis=1, then last_mean, last_n, last_var should be of size n_samples {X.shape[0]} (Got {np.size(last_mean)}).')\n elif np.size(last_mean) != X.shape[1]:\n raise ValueError(f'If axis=0, then last_mean, last_n, last_var should be of size n_features {X.shape[1]} (Got {np.size(last_mean)}).')\n X = X.T if axis == 1 else X\n if weights is not None:\n weights = _check_sample_weight(weights, X, dtype=X.dtype)\n return _incr_mean_var_axis0(X, last_mean=last_mean, last_var=last_var, last_n=last_n, weights=weights)" }, { @@ -176242,7 +190254,8 @@ "docstring": { "type": "sparse matrix of shape (n_samples, n_features)", "description": "Matrix to normalize using the variance of the features. It should be\nof CSC or CSR format." - } + }, + "refined_type": {} }, { "name": "scale", @@ -176252,13 +190265,17 @@ "docstring": { "type": "ndarray of shape (n_features,), dtype={np.float32, np.float64}", "description": "Array of precomputed feature-wise values to use for scaling." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } } ], "results": [], "is_public": true, - "description": "Inplace column scaling of a CSC/CSR matrix.\n\nScale each feature of the data matrix by multiplying with specific scale provided by the caller assuming a (n_samples, n_features) shape.", - "docstring": "Inplace column scaling of a CSC/CSR matrix.\n\nScale each feature of the data matrix by multiplying with specific scale\nprovided by the caller assuming a (n_samples, n_features) shape.\n\nParameters\n----------\nX : sparse matrix of shape (n_samples, n_features)\n Matrix to normalize using the variance of the features. It should be\n of CSC or CSR format.\n\nscale : ndarray of shape (n_features,), dtype={np.float32, np.float64}\n Array of precomputed feature-wise values to use for scaling.", + "description": "Inplace column scaling of a CSC/CSR matrix.\n\nScale each feature of the data matrix by multiplying with specific scale\nprovided by the caller assuming a (n_samples, n_features) shape.", + "docstring": "Inplace column scaling of a CSC/CSR matrix.\n\n Scale each feature of the data matrix by multiplying with specific scale\n provided by the caller assuming a (n_samples, n_features) shape.\n\n Parameters\n ----------\n X : sparse matrix of shape (n_samples, n_features)\n Matrix to normalize using the variance of the features. 
It should be\n of CSC or CSR format.\n\n scale : ndarray of shape (n_features,), dtype={np.float32, np.float64}\n Array of precomputed feature-wise values to use for scaling.\n ", "source_code": "\ndef inplace_column_scale(X, scale):\n \"\"\"Inplace column scaling of a CSC/CSR matrix.\n\n Scale each feature of the data matrix by multiplying with specific scale\n provided by the caller assuming a (n_samples, n_features) shape.\n\n Parameters\n ----------\n X : sparse matrix of shape (n_samples, n_features)\n Matrix to normalize using the variance of the features. It should be\n of CSC or CSR format.\n\n scale : ndarray of shape (n_features,), dtype={np.float32, np.float64}\n Array of precomputed feature-wise values to use for scaling.\n \"\"\"\n if isinstance(X, sp.csc_matrix):\n inplace_csr_row_scale(X.T, scale)\n elif isinstance(X, sp.csr_matrix):\n inplace_csr_column_scale(X, scale)\n else:\n _raise_typeerror(X)" }, { @@ -176276,7 +190293,8 @@ "docstring": { "type": "sparse matrix of shape (n_samples, n_features)", "description": "Matrix to normalize using the variance of the features.\nIt should be of CSR format." - } + }, + "refined_type": {} }, { "name": "scale", @@ -176286,13 +190304,17 @@ "docstring": { "type": "ndarray of shape (n_features,), dtype={np.float32, np.float64}", "description": "Array of precomputed feature-wise values to use for scaling." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } } ], "results": [], "is_public": true, - "description": "Inplace column scaling of a CSR matrix.\n\nScale each feature of the data matrix by multiplying with specific scale provided by the caller assuming a (n_samples, n_features) shape.", - "docstring": "Inplace column scaling of a CSR matrix.\n\nScale each feature of the data matrix by multiplying with specific scale\nprovided by the caller assuming a (n_samples, n_features) shape.\n\nParameters\n----------\nX : sparse matrix of shape (n_samples, n_features)\n Matrix to normalize using the variance of the features.\n It should be of CSR format.\n\nscale : ndarray of shape (n_features,), dtype={np.float32, np.float64}\n Array of precomputed feature-wise values to use for scaling.", + "description": "Inplace column scaling of a CSR matrix.\n\nScale each feature of the data matrix by multiplying with specific scale\nprovided by the caller assuming a (n_samples, n_features) shape.", + "docstring": "Inplace column scaling of a CSR matrix.\n\n Scale each feature of the data matrix by multiplying with specific scale\n provided by the caller assuming a (n_samples, n_features) shape.\n\n Parameters\n ----------\n X : sparse matrix of shape (n_samples, n_features)\n Matrix to normalize using the variance of the features.\n It should be of CSR format.\n\n scale : ndarray of shape (n_features,), dtype={np.float32, np.float64}\n Array of precomputed feature-wise values to use for scaling.\n ", "source_code": "\ndef inplace_csr_column_scale(X, scale):\n \"\"\"Inplace column scaling of a CSR matrix.\n\n Scale each feature of the data matrix by multiplying with specific scale\n provided by the caller assuming a (n_samples, n_features) shape.\n\n Parameters\n ----------\n X : sparse matrix of shape (n_samples, n_features)\n Matrix to normalize using the variance of the features.\n It should be of CSR format.\n\n scale : ndarray of shape (n_features,), dtype={np.float32, np.float64}\n Array of precomputed feature-wise values to use for scaling.\n \"\"\"\n assert scale.shape[0] == X.shape[1]\n X.data *= scale.take(X.indices, mode='clip')" 
}, { @@ -176310,7 +190332,8 @@ "docstring": { "type": "sparse matrix of shape (n_samples, n_features)", "description": "Matrix to be scaled. It should be of CSR format." - } + }, + "refined_type": {} }, { "name": "scale", @@ -176320,13 +190343,14 @@ "docstring": { "type": "ndarray of float of shape (n_samples,)", "description": "Array of precomputed sample-wise values to use for scaling." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Inplace row scaling of a CSR matrix.\n\nScale each sample of the data matrix by multiplying with specific scale provided by the caller assuming a (n_samples, n_features) shape.", - "docstring": "Inplace row scaling of a CSR matrix.\n\nScale each sample of the data matrix by multiplying with specific scale\nprovided by the caller assuming a (n_samples, n_features) shape.\n\nParameters\n----------\nX : sparse matrix of shape (n_samples, n_features)\n Matrix to be scaled. It should be of CSR format.\n\nscale : ndarray of float of shape (n_samples,)\n Array of precomputed sample-wise values to use for scaling.", + "description": "Inplace row scaling of a CSR matrix.\n\nScale each sample of the data matrix by multiplying with specific scale\nprovided by the caller assuming a (n_samples, n_features) shape.", + "docstring": "Inplace row scaling of a CSR matrix.\n\n Scale each sample of the data matrix by multiplying with specific scale\n provided by the caller assuming a (n_samples, n_features) shape.\n\n Parameters\n ----------\n X : sparse matrix of shape (n_samples, n_features)\n Matrix to be scaled. It should be of CSR format.\n\n scale : ndarray of float of shape (n_samples,)\n Array of precomputed sample-wise values to use for scaling.\n ", "source_code": "\ndef inplace_csr_row_scale(X, scale):\n \"\"\"Inplace row scaling of a CSR matrix.\n\n Scale each sample of the data matrix by multiplying with specific scale\n provided by the caller assuming a (n_samples, n_features) shape.\n\n Parameters\n ----------\n X : sparse matrix of shape (n_samples, n_features)\n Matrix to be scaled. It should be of CSR format.\n\n scale : ndarray of float of shape (n_samples,)\n Array of precomputed sample-wise values to use for scaling.\n \"\"\"\n assert scale.shape[0] == X.shape[0]\n X.data *= np.repeat(scale, np.diff(X.indptr))" }, { @@ -176344,7 +190368,8 @@ "docstring": { "type": "sparse matrix of shape (n_samples, n_features)", "description": "Matrix to be scaled. It should be of CSR or CSC format." - } + }, + "refined_type": {} }, { "name": "scale", @@ -176354,13 +190379,17 @@ "docstring": { "type": "ndarray of shape (n_features,), dtype={np.float32, np.float64}", "description": "Array of precomputed sample-wise values to use for scaling." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } } ], "results": [], "is_public": true, - "description": "Inplace row scaling of a CSR or CSC matrix.\n\nScale each row of the data matrix by multiplying with specific scale provided by the caller assuming a (n_samples, n_features) shape.", - "docstring": "Inplace row scaling of a CSR or CSC matrix.\n\nScale each row of the data matrix by multiplying with specific scale\nprovided by the caller assuming a (n_samples, n_features) shape.\n\nParameters\n----------\nX : sparse matrix of shape (n_samples, n_features)\n Matrix to be scaled. 
It should be of CSR or CSC format.\n\nscale : ndarray of shape (n_features,), dtype={np.float32, np.float64}\n Array of precomputed sample-wise values to use for scaling.", + "description": "Inplace row scaling of a CSR or CSC matrix.\n\nScale each row of the data matrix by multiplying with specific scale\nprovided by the caller assuming a (n_samples, n_features) shape.", + "docstring": "Inplace row scaling of a CSR or CSC matrix.\n\n Scale each row of the data matrix by multiplying with specific scale\n provided by the caller assuming a (n_samples, n_features) shape.\n\n Parameters\n ----------\n X : sparse matrix of shape (n_samples, n_features)\n Matrix to be scaled. It should be of CSR or CSC format.\n\n scale : ndarray of shape (n_features,), dtype={np.float32, np.float64}\n Array of precomputed sample-wise values to use for scaling.\n ", "source_code": "\ndef inplace_row_scale(X, scale):\n \"\"\"Inplace row scaling of a CSR or CSC matrix.\n\n Scale each row of the data matrix by multiplying with specific scale\n provided by the caller assuming a (n_samples, n_features) shape.\n\n Parameters\n ----------\n X : sparse matrix of shape (n_samples, n_features)\n Matrix to be scaled. It should be of CSR or CSC format.\n\n scale : ndarray of shape (n_features,), dtype={np.float32, np.float64}\n Array of precomputed sample-wise values to use for scaling.\n \"\"\"\n if isinstance(X, sp.csc_matrix):\n inplace_csr_column_scale(X.T, scale)\n elif isinstance(X, sp.csr_matrix):\n inplace_csr_row_scale(X, scale)\n else:\n _raise_typeerror(X)" }, { @@ -176378,7 +190407,8 @@ "docstring": { "type": "sparse matrix of shape (n_samples, n_features)", "description": "Matrix whose two columns are to be swapped. It should be of\nCSR or CSC format." - } + }, + "refined_type": {} }, { "name": "m", @@ -176388,7 +190418,8 @@ "docstring": { "type": "int", "description": "Index of the column of X to be swapped." - } + }, + "refined_type": {} }, { "name": "n", @@ -176398,13 +190429,14 @@ "docstring": { "type": "int", "description": "Index of the column of X to be swapped." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Swaps two columns of a CSC/CSR matrix in-place.", - "docstring": "Swaps two columns of a CSC/CSR matrix in-place.\n\nParameters\n----------\nX : sparse matrix of shape (n_samples, n_features)\n Matrix whose two columns are to be swapped. It should be of\n CSR or CSC format.\n\nm : int\n Index of the column of X to be swapped.\n\nn : int\n Index of the column of X to be swapped.", + "docstring": "\n Swaps two columns of a CSC/CSR matrix in-place.\n\n Parameters\n ----------\n X : sparse matrix of shape (n_samples, n_features)\n Matrix whose two columns are to be swapped. It should be of\n CSR or CSC format.\n\n m : int\n Index of the column of X to be swapped.\n\n n : int\n Index of the column of X to be swapped.\n ", "source_code": "\ndef inplace_swap_column(X, m, n):\n \"\"\"\n Swaps two columns of a CSC/CSR matrix in-place.\n\n Parameters\n ----------\n X : sparse matrix of shape (n_samples, n_features)\n Matrix whose two columns are to be swapped. 
It should be of\n CSR or CSC format.\n\n m : int\n Index of the column of X to be swapped.\n\n n : int\n Index of the column of X to be swapped.\n \"\"\"\n if m < 0:\n m += X.shape[1]\n if n < 0:\n n += X.shape[1]\n if isinstance(X, sp.csc_matrix):\n inplace_swap_row_csr(X, m, n)\n elif isinstance(X, sp.csr_matrix):\n inplace_swap_row_csc(X, m, n)\n else:\n _raise_typeerror(X)" }, { @@ -176422,7 +190454,8 @@ "docstring": { "type": "sparse matrix of shape (n_samples, n_features)", "description": "Matrix whose two rows are to be swapped. It should be of CSR or\nCSC format." - } + }, + "refined_type": {} }, { "name": "m", @@ -176432,7 +190465,8 @@ "docstring": { "type": "int", "description": "Index of the row of X to be swapped." - } + }, + "refined_type": {} }, { "name": "n", @@ -176442,13 +190476,14 @@ "docstring": { "type": "int", "description": "Index of the row of X to be swapped." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Swaps two rows of a CSC/CSR matrix in-place.", - "docstring": "Swaps two rows of a CSC/CSR matrix in-place.\n\nParameters\n----------\nX : sparse matrix of shape (n_samples, n_features)\n Matrix whose two rows are to be swapped. It should be of CSR or\n CSC format.\n\nm : int\n Index of the row of X to be swapped.\n\nn : int\n Index of the row of X to be swapped.", + "docstring": "\n Swaps two rows of a CSC/CSR matrix in-place.\n\n Parameters\n ----------\n X : sparse matrix of shape (n_samples, n_features)\n Matrix whose two rows are to be swapped. It should be of CSR or\n CSC format.\n\n m : int\n Index of the row of X to be swapped.\n\n n : int\n Index of the row of X to be swapped.\n ", "source_code": "\ndef inplace_swap_row(X, m, n):\n \"\"\"\n Swaps two rows of a CSC/CSR matrix in-place.\n\n Parameters\n ----------\n X : sparse matrix of shape (n_samples, n_features)\n Matrix whose two rows are to be swapped. It should be of CSR or\n CSC format.\n\n m : int\n Index of the row of X to be swapped.\n\n n : int\n Index of the row of X to be swapped.\n \"\"\"\n if isinstance(X, sp.csc_matrix):\n inplace_swap_row_csc(X, m, n)\n elif isinstance(X, sp.csr_matrix):\n inplace_swap_row_csr(X, m, n)\n else:\n _raise_typeerror(X)" }, { @@ -176466,7 +190501,8 @@ "docstring": { "type": "sparse matrix of shape (n_samples, n_features)", "description": "Matrix whose two rows are to be swapped. It should be of\nCSC format." - } + }, + "refined_type": {} }, { "name": "m", @@ -176476,7 +190512,8 @@ "docstring": { "type": "int", "description": "Index of the row of X to be swapped." - } + }, + "refined_type": {} }, { "name": "n", @@ -176486,13 +190523,14 @@ "docstring": { "type": "int", "description": "Index of the row of X to be swapped." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Swaps two rows of a CSC matrix in-place.", - "docstring": "Swaps two rows of a CSC matrix in-place.\n\nParameters\n----------\nX : sparse matrix of shape (n_samples, n_features)\n Matrix whose two rows are to be swapped. It should be of\n CSC format.\n\nm : int\n Index of the row of X to be swapped.\n\nn : int\n Index of the row of X to be swapped.", + "docstring": "\n Swaps two rows of a CSC matrix in-place.\n\n Parameters\n ----------\n X : sparse matrix of shape (n_samples, n_features)\n Matrix whose two rows are to be swapped. 
It should be of\n CSC format.\n\n m : int\n Index of the row of X to be swapped.\n\n n : int\n Index of the row of X to be swapped.\n ", "source_code": "\ndef inplace_swap_row_csc(X, m, n):\n \"\"\"\n Swaps two rows of a CSC matrix in-place.\n\n Parameters\n ----------\n X : sparse matrix of shape (n_samples, n_features)\n Matrix whose two rows are to be swapped. It should be of\n CSC format.\n\n m : int\n Index of the row of X to be swapped.\n\n n : int\n Index of the row of X to be swapped.\n \"\"\"\n for t in [m, n]:\n if isinstance(t, np.ndarray):\n raise TypeError('m and n should be valid integers')\n if m < 0:\n m += X.shape[0]\n if n < 0:\n n += X.shape[0]\n m_mask = X.indices == m\n X.indices[X.indices == n] = m\n X.indices[m_mask] = n" }, { @@ -176510,7 +190548,8 @@ "docstring": { "type": "sparse matrix of shape (n_samples, n_features)", "description": "Matrix whose two rows are to be swapped. It should be of\nCSR format." - } + }, + "refined_type": {} }, { "name": "m", @@ -176520,7 +190559,8 @@ "docstring": { "type": "int", "description": "Index of the row of X to be swapped." - } + }, + "refined_type": {} }, { "name": "n", @@ -176530,13 +190570,14 @@ "docstring": { "type": "int", "description": "Index of the row of X to be swapped." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Swaps two rows of a CSR matrix in-place.", - "docstring": "Swaps two rows of a CSR matrix in-place.\n\nParameters\n----------\nX : sparse matrix of shape (n_samples, n_features)\n Matrix whose two rows are to be swapped. It should be of\n CSR format.\n\nm : int\n Index of the row of X to be swapped.\n\nn : int\n Index of the row of X to be swapped.", + "docstring": "\n Swaps two rows of a CSR matrix in-place.\n\n Parameters\n ----------\n X : sparse matrix of shape (n_samples, n_features)\n Matrix whose two rows are to be swapped. It should be of\n CSR format.\n\n m : int\n Index of the row of X to be swapped.\n\n n : int\n Index of the row of X to be swapped.\n ", "source_code": "\ndef inplace_swap_row_csr(X, m, n):\n \"\"\"\n Swaps two rows of a CSR matrix in-place.\n\n Parameters\n ----------\n X : sparse matrix of shape (n_samples, n_features)\n Matrix whose two rows are to be swapped. It should be of\n CSR format.\n\n m : int\n Index of the row of X to be swapped.\n\n n : int\n Index of the row of X to be swapped.\n \"\"\"\n for t in [m, n]:\n if isinstance(t, np.ndarray):\n raise TypeError('m and n should be valid integers')\n if m < 0:\n m += X.shape[0]\n if n < 0:\n n += X.shape[0]\n if m > n:\n (m, n) = (n, m)\n indptr = X.indptr\n m_start = indptr[m]\n m_stop = indptr[m + 1]\n n_start = indptr[n]\n n_stop = indptr[n + 1]\n nz_m = m_stop - m_start\n nz_n = n_stop - n_start\n if nz_m != nz_n:\n X.indptr[m + 2:n] += nz_n - nz_m\n X.indptr[m + 1] = m_start + nz_n\n X.indptr[n] = n_stop - nz_m\n X.indices = np.concatenate([X.indices[:m_start], X.indices[n_start:n_stop], X.indices[m_stop:n_start], X.indices[m_start:m_stop], X.indices[n_stop:]])\n X.data = np.concatenate([X.data[:m_start], X.data[n_start:n_stop], X.data[m_stop:n_start], X.data[m_start:m_stop], X.data[n_stop:]])" }, { @@ -176554,7 +190595,8 @@ "docstring": { "type": "sparse matrix of shape (n_samples, n_features)", "description": "Input data. It can be of CSR or CSC format." - } + }, + "refined_type": {} }, { "name": "axis", @@ -176564,6 +190606,10 @@ "docstring": { "type": "{0, 1}", "description": "Axis along which the axis should be computed." 
+ }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -176574,7 +190620,8 @@ "docstring": { "type": "ndarray of shape (n_samples,) or (n_features,), default=None", "description": "if axis is set to 0 shape is (n_samples,) or\nif axis is set to 1 shape is (n_features,).\nIf it is set to None, then samples are equally weighted.\n\n.. versionadded:: 0.24" - } + }, + "refined_type": {} }, { "name": "return_sum_weights", @@ -176584,13 +190631,14 @@ "docstring": { "type": "bool, default=False", "description": "If True, returns the sum of weights seen for each feature\nif `axis=0` or each sample if `axis=1`.\n\n.. versionadded:: 0.24" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Compute mean and variance along an axis on a CSR or CSC matrix.", - "docstring": "Compute mean and variance along an axis on a CSR or CSC matrix.\n\nParameters\n----------\nX : sparse matrix of shape (n_samples, n_features)\n Input data. It can be of CSR or CSC format.\n\naxis : {0, 1}\n Axis along which the axis should be computed.\n\nweights : ndarray of shape (n_samples,) or (n_features,), default=None\n if axis is set to 0 shape is (n_samples,) or\n if axis is set to 1 shape is (n_features,).\n If it is set to None, then samples are equally weighted.\n\n .. versionadded:: 0.24\n\nreturn_sum_weights : bool, default=False\n If True, returns the sum of weights seen for each feature\n if `axis=0` or each sample if `axis=1`.\n\n .. versionadded:: 0.24\n\nReturns\n-------\n\nmeans : ndarray of shape (n_features,), dtype=floating\n Feature-wise means.\n\nvariances : ndarray of shape (n_features,), dtype=floating\n Feature-wise variances.\n\nsum_weights : ndarray of shape (n_features,), dtype=floating\n Returned if `return_sum_weights` is `True`.", + "docstring": "Compute mean and variance along an axis on a CSR or CSC matrix.\n\n Parameters\n ----------\n X : sparse matrix of shape (n_samples, n_features)\n Input data. It can be of CSR or CSC format.\n\n axis : {0, 1}\n Axis along which the axis should be computed.\n\n weights : ndarray of shape (n_samples,) or (n_features,), default=None\n if axis is set to 0 shape is (n_samples,) or\n if axis is set to 1 shape is (n_features,).\n If it is set to None, then samples are equally weighted.\n\n .. versionadded:: 0.24\n\n return_sum_weights : bool, default=False\n If True, returns the sum of weights seen for each feature\n if `axis=0` or each sample if `axis=1`.\n\n .. versionadded:: 0.24\n\n Returns\n -------\n\n means : ndarray of shape (n_features,), dtype=floating\n Feature-wise means.\n\n variances : ndarray of shape (n_features,), dtype=floating\n Feature-wise variances.\n\n sum_weights : ndarray of shape (n_features,), dtype=floating\n Returned if `return_sum_weights` is `True`.\n ", "source_code": "\ndef mean_variance_axis(X, axis, weights=None, return_sum_weights=False):\n \"\"\"Compute mean and variance along an axis on a CSR or CSC matrix.\n\n Parameters\n ----------\n X : sparse matrix of shape (n_samples, n_features)\n Input data. It can be of CSR or CSC format.\n\n axis : {0, 1}\n Axis along which the axis should be computed.\n\n weights : ndarray of shape (n_samples,) or (n_features,), default=None\n if axis is set to 0 shape is (n_samples,) or\n if axis is set to 1 shape is (n_features,).\n If it is set to None, then samples are equally weighted.\n\n .. 
versionadded:: 0.24\n\n return_sum_weights : bool, default=False\n If True, returns the sum of weights seen for each feature\n if `axis=0` or each sample if `axis=1`.\n\n .. versionadded:: 0.24\n\n Returns\n -------\n\n means : ndarray of shape (n_features,), dtype=floating\n Feature-wise means.\n\n variances : ndarray of shape (n_features,), dtype=floating\n Feature-wise variances.\n\n sum_weights : ndarray of shape (n_features,), dtype=floating\n Returned if `return_sum_weights` is `True`.\n \"\"\"\n _raise_error_wrong_axis(axis)\n if isinstance(X, sp.csr_matrix):\n if axis == 0:\n return _csr_mean_var_axis0(X, weights=weights, return_sum_weights=return_sum_weights)\n else:\n return _csc_mean_var_axis0(X.T, weights=weights, return_sum_weights=return_sum_weights)\n elif isinstance(X, sp.csc_matrix):\n if axis == 0:\n return _csc_mean_var_axis0(X, weights=weights, return_sum_weights=return_sum_weights)\n else:\n return _csr_mean_var_axis0(X.T, weights=weights, return_sum_weights=return_sum_weights)\n else:\n _raise_typeerror(X)" }, { @@ -176608,7 +190656,8 @@ "docstring": { "type": "sparse matrix of shape (n_samples, n_features)", "description": "Input data. It should be of CSR or CSC format." - } + }, + "refined_type": {} }, { "name": "axis", @@ -176618,6 +190667,10 @@ "docstring": { "type": "{0, 1}", "description": "Axis along which the axis should be computed." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -176628,13 +190681,14 @@ "docstring": { "type": "bool, default=False", "description": "Ignore or passing through NaN values.\n\n.. versionadded:: 0.20" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Compute minimum and maximum along an axis on a CSR or CSC matrix and optionally ignore NaN values.", - "docstring": "Compute minimum and maximum along an axis on a CSR or CSC matrix and\noptionally ignore NaN values.\n\nParameters\n----------\nX : sparse matrix of shape (n_samples, n_features)\n Input data. It should be of CSR or CSC format.\n\naxis : {0, 1}\n Axis along which the axis should be computed.\n\nignore_nan : bool, default=False\n Ignore or passing through NaN values.\n\n .. versionadded:: 0.20\n\nReturns\n-------\n\nmins : ndarray of shape (n_features,), dtype={np.float32, np.float64}\n Feature-wise minima.\n\nmaxs : ndarray of shape (n_features,), dtype={np.float32, np.float64}\n Feature-wise maxima.", + "description": "Compute minimum and maximum along an axis on a CSR or CSC matrix and\noptionally ignore NaN values.", + "docstring": "Compute minimum and maximum along an axis on a CSR or CSC matrix and\n optionally ignore NaN values.\n\n Parameters\n ----------\n X : sparse matrix of shape (n_samples, n_features)\n Input data. It should be of CSR or CSC format.\n\n axis : {0, 1}\n Axis along which the axis should be computed.\n\n ignore_nan : bool, default=False\n Ignore or passing through NaN values.\n\n .. versionadded:: 0.20\n\n Returns\n -------\n\n mins : ndarray of shape (n_features,), dtype={np.float32, np.float64}\n Feature-wise minima.\n\n maxs : ndarray of shape (n_features,), dtype={np.float32, np.float64}\n Feature-wise maxima.\n ", "source_code": "\ndef min_max_axis(X, axis, ignore_nan=False):\n \"\"\"Compute minimum and maximum along an axis on a CSR or CSC matrix and\n optionally ignore NaN values.\n\n Parameters\n ----------\n X : sparse matrix of shape (n_samples, n_features)\n Input data. 
It should be of CSR or CSC format.\n\n axis : {0, 1}\n Axis along which the axis should be computed.\n\n ignore_nan : bool, default=False\n Ignore or passing through NaN values.\n\n .. versionadded:: 0.20\n\n Returns\n -------\n\n mins : ndarray of shape (n_features,), dtype={np.float32, np.float64}\n Feature-wise minima.\n\n maxs : ndarray of shape (n_features,), dtype={np.float32, np.float64}\n Feature-wise maxima.\n \"\"\"\n if isinstance(X, sp.csr_matrix) or isinstance(X, sp.csc_matrix):\n if ignore_nan:\n return _sparse_nan_min_max(X, axis=axis)\n else:\n return _sparse_min_max(X, axis=axis)\n else:\n _raise_typeerror(X)" }, { @@ -176652,7 +190706,8 @@ "docstring": { "type": "1D or 2D array", "description": "Values to take the weighted percentile of." - } + }, + "refined_type": {} }, { "name": "sample_weight", @@ -176662,7 +190717,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "percentile", @@ -176672,13 +190728,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Compute weighted percentile\n\nComputes lower weighted percentile. If `array` is a 2D array, the `percentile` is computed along the axis 0. .. versionchanged:: 0.24 Accepts 2D `array`.", - "docstring": "Compute weighted percentile\n\nComputes lower weighted percentile. If `array` is a 2D array, the\n`percentile` is computed along the axis 0.\n\n .. versionchanged:: 0.24\n Accepts 2D `array`.\n\nParameters\n----------\narray : 1D or 2D array\n Values to take the weighted percentile of.\n\nsample_weight: 1D or 2D array\n Weights for each value in `array`. Must be same shape as `array` or\n of shape `(array.shape[0],)`.\n\npercentile: int or float, default=50\n Percentile to compute. Must be value between 0 and 100.\n\nReturns\n-------\npercentile : int if `array` 1D, ndarray if `array` 2D\n Weighted percentile.", + "description": "Compute weighted percentile\n\nComputes lower weighted percentile. If `array` is a 2D array, the\n`percentile` is computed along the axis 0.\n\n .. versionchanged:: 0.24\n Accepts 2D `array`.", + "docstring": "Compute weighted percentile\n\n Computes lower weighted percentile. If `array` is a 2D array, the\n `percentile` is computed along the axis 0.\n\n .. versionchanged:: 0.24\n Accepts 2D `array`.\n\n Parameters\n ----------\n array : 1D or 2D array\n Values to take the weighted percentile of.\n\n sample_weight: 1D or 2D array\n Weights for each value in `array`. Must be same shape as `array` or\n of shape `(array.shape[0],)`.\n\n percentile: int or float, default=50\n Percentile to compute. Must be value between 0 and 100.\n\n Returns\n -------\n percentile : int if `array` 1D, ndarray if `array` 2D\n Weighted percentile.\n ", "source_code": "\ndef _weighted_percentile(array, sample_weight, percentile=50):\n \"\"\"Compute weighted percentile\n\n Computes lower weighted percentile. If `array` is a 2D array, the\n `percentile` is computed along the axis 0.\n\n .. versionchanged:: 0.24\n Accepts 2D `array`.\n\n Parameters\n ----------\n array : 1D or 2D array\n Values to take the weighted percentile of.\n\n sample_weight: 1D or 2D array\n Weights for each value in `array`. Must be same shape as `array` or\n of shape `(array.shape[0],)`.\n\n percentile: int or float, default=50\n Percentile to compute. 
Must be value between 0 and 100.\n\n Returns\n -------\n percentile : int if `array` 1D, ndarray if `array` 2D\n Weighted percentile.\n \"\"\"\n n_dim = array.ndim\n if n_dim == 0:\n return array[()]\n if array.ndim == 1:\n array = array.reshape((-1, 1))\n if array.shape != sample_weight.shape and array.shape[0] == sample_weight.shape[0]:\n sample_weight = np.tile(sample_weight, (array.shape[1], 1)).T\n sorted_idx = np.argsort(array, axis=0)\n sorted_weights = _take_along_axis(sample_weight, sorted_idx, axis=0)\n weight_cdf = stable_cumsum(sorted_weights, axis=0)\n adjusted_percentile = percentile / 100 * weight_cdf[-1]\n mask = adjusted_percentile == 0\n adjusted_percentile[mask] = np.nextafter(adjusted_percentile[mask], adjusted_percentile[mask] + 1)\n percentile_idx = np.array([np.searchsorted(weight_cdf[:, i], adjusted_percentile[i]) for i in range(weight_cdf.shape[1])])\n percentile_idx = np.array(percentile_idx)\n max_idx = sorted_idx.shape[0] - 1\n percentile_idx = np.apply_along_axis(lambda x: np.clip(x, 0, max_idx), axis=0, arr=percentile_idx)\n col_index = np.arange(array.shape[1])\n percentile_in_sorted = sorted_idx[percentile_idx, col_index]\n percentile = array[percentile_in_sorted, col_index]\n return percentile[0] if n_dim == 1 else percentile" }, { @@ -176696,13 +190753,14 @@ "docstring": { "type": "iterable", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Cast iterable x to a Sequence, avoiding a copy if possible.", - "docstring": "Cast iterable x to a Sequence, avoiding a copy if possible.\n\nParameters\n----------\nx : iterable", + "docstring": "Cast iterable x to a Sequence, avoiding a copy if possible.\n\n Parameters\n ----------\n x : iterable\n ", "source_code": "\ndef tosequence(x):\n \"\"\"Cast iterable x to a Sequence, avoiding a copy if possible.\n\n Parameters\n ----------\n x : iterable\n \"\"\"\n if isinstance(x, np.ndarray):\n return np.asarray(x)\n elif isinstance(x, Sequence):\n return x\n else:\n return list(x)" }, { @@ -176720,6 +190778,10 @@ "docstring": { "type": "{array-like, sparse matrix}", "description": "First array to compare." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -176730,6 +190792,10 @@ "docstring": { "type": "{array-like, sparse matrix}", "description": "Second array to compare." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -176740,7 +190806,8 @@ "docstring": { "type": "float, default=1e-7", "description": "Relative tolerance; see numpy.allclose." - } + }, + "refined_type": {} }, { "name": "atol", @@ -176750,13 +190817,14 @@ "docstring": { "type": "float, default=1e-9", "description": "absolute tolerance; see numpy.allclose. Note that the default here is\nmore tolerant than the default for numpy.testing.assert_allclose, where\natol=0." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Check allclose for sparse and dense data.\n\nBoth x and y need to be either sparse or dense, they can't be mixed.", - "docstring": "Check allclose for sparse and dense data.\n\nBoth x and y need to be either sparse or dense, they\ncan't be mixed.\n\nParameters\n----------\nx : {array-like, sparse matrix}\n First array to compare.\n\ny : {array-like, sparse matrix}\n Second array to compare.\n\nrtol : float, default=1e-7\n Relative tolerance; see numpy.allclose.\n\natol : float, default=1e-9\n absolute tolerance; see numpy.allclose. 
Note that the default here is\n more tolerant than the default for numpy.testing.assert_allclose, where\n atol=0.", + "description": "Check allclose for sparse and dense data.\n\nBoth x and y need to be either sparse or dense, they\ncan't be mixed.", + "docstring": "Check allclose for sparse and dense data.\n\n Both x and y need to be either sparse or dense, they\n can't be mixed.\n\n Parameters\n ----------\n x : {array-like, sparse matrix}\n First array to compare.\n\n y : {array-like, sparse matrix}\n Second array to compare.\n\n rtol : float, default=1e-7\n Relative tolerance; see numpy.allclose.\n\n atol : float, default=1e-9\n absolute tolerance; see numpy.allclose. Note that the default here is\n more tolerant than the default for numpy.testing.assert_allclose, where\n atol=0.\n ", "source_code": "\ndef _allclose_dense_sparse(x, y, rtol=1e-07, atol=1e-09):\n \"\"\"Check allclose for sparse and dense data.\n\n Both x and y need to be either sparse or dense, they\n can't be mixed.\n\n Parameters\n ----------\n x : {array-like, sparse matrix}\n First array to compare.\n\n y : {array-like, sparse matrix}\n Second array to compare.\n\n rtol : float, default=1e-7\n Relative tolerance; see numpy.allclose.\n\n atol : float, default=1e-9\n absolute tolerance; see numpy.allclose. Note that the default here is\n more tolerant than the default for numpy.testing.assert_allclose, where\n atol=0.\n \"\"\"\n if sp.issparse(x) and sp.issparse(y):\n x = x.tocsr()\n y = y.tocsr()\n x.sum_duplicates()\n y.sum_duplicates()\n return np.array_equal(x.indices, y.indices) and np.array_equal(x.indptr, y.indptr) and np.allclose(x.data, y.data, rtol=rtol, atol=atol)\n elif not sp.issparse(x) and not sp.issparse(y):\n return np.allclose(x, y, rtol=rtol, atol=atol)\n raise ValueError('Can only compare two sparse matrices, not a sparse matrix and an array')" }, { @@ -176774,7 +190842,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "allow_nan", @@ -176784,7 +190853,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "msg_dtype", @@ -176794,7 +190864,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -176818,7 +190889,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "input_features", @@ -176828,13 +190900,14 @@ "docstring": { "type": "array-like of str or None, default=None", "description": "Input features.\n\n- If `input_features` is `None`, then `feature_names_in_` is\n used as feature names in. If `feature_names_in_` is not defined,\n then names are generated: `[x0, x1, ..., x(n_features_in_)]`.\n- If `input_features` is an array-like, then `input_features` must\n match `feature_names_in_` if `feature_names_in_` is defined." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Get output feature names for transformation.", - "docstring": "Get output feature names for transformation.\n\nParameters\n----------\ninput_features : array-like of str or None, default=None\n Input features.\n\n - If `input_features` is `None`, then `feature_names_in_` is\n used as feature names in. 
If `feature_names_in_` is not defined,\n then names are generated: `[x0, x1, ..., x(n_features_in_)]`.\n - If `input_features` is an array-like, then `input_features` must\n match `feature_names_in_` if `feature_names_in_` is defined.\n\nReturns\n-------\nfeature_names_in : ndarray of str\n Feature names in.", + "docstring": "Get output feature names for transformation.\n\n Parameters\n ----------\n input_features : array-like of str or None, default=None\n Input features.\n\n - If `input_features` is `None`, then `feature_names_in_` is\n used as feature names in. If `feature_names_in_` is not defined,\n then names are generated: `[x0, x1, ..., x(n_features_in_)]`.\n - If `input_features` is an array-like, then `input_features` must\n match `feature_names_in_` if `feature_names_in_` is defined.\n\n Returns\n -------\n feature_names_in : ndarray of str\n Feature names in.\n ", "source_code": "\ndef _check_feature_names_in(estimator, input_features=None):\n \"\"\"Get output feature names for transformation.\n\n Parameters\n ----------\n input_features : array-like of str or None, default=None\n Input features.\n\n - If `input_features` is `None`, then `feature_names_in_` is\n used as feature names in. If `feature_names_in_` is not defined,\n then names are generated: `[x0, x1, ..., x(n_features_in_)]`.\n - If `input_features` is an array-like, then `input_features` must\n match `feature_names_in_` if `feature_names_in_` is defined.\n\n Returns\n -------\n feature_names_in : ndarray of str\n Feature names in.\n \"\"\"\n feature_names_in_ = getattr(estimator, 'feature_names_in_', None)\n n_features_in_ = getattr(estimator, 'n_features_in_', None)\n if input_features is not None:\n input_features = np.asarray(input_features, dtype=object)\n if feature_names_in_ is not None and not np.array_equal(feature_names_in_, input_features):\n raise ValueError('input_features is not equal to feature_names_in_')\n if n_features_in_ is not None and len(input_features) != n_features_in_:\n raise ValueError(f'input_features should have length equal to number of features ({n_features_in_}), got {len(input_features)}')\n return input_features\n if feature_names_in_ is not None:\n return feature_names_in_\n if n_features_in_ is None:\n raise ValueError('Unable to generate feature names without n_features_in_')\n return np.asarray([f'x{i}' for i in range(n_features_in_)], dtype=object)" }, { @@ -176852,7 +190925,8 @@ "docstring": { "type": "array-like of shape (n_samples, n_features)", "description": "Data array." - } + }, + "refined_type": {} }, { "name": "fit_params", @@ -176862,7 +190936,8 @@ "docstring": { "type": "dict", "description": "Dictionary containing the parameters passed at fit." - } + }, + "refined_type": {} }, { "name": "indices", @@ -176872,13 +190947,14 @@ "docstring": { "type": "array-like of shape (n_samples,), default=None", "description": "Indices to be selected if the parameter has the same size as `X`." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Check and validate the parameters passed during `fit`.", - "docstring": "Check and validate the parameters passed during `fit`.\n\nParameters\n----------\nX : array-like of shape (n_samples, n_features)\n Data array.\n\nfit_params : dict\n Dictionary containing the parameters passed at fit.\n\nindices : array-like of shape (n_samples,), default=None\n Indices to be selected if the parameter has the same size as `X`.\n\nReturns\n-------\nfit_params_validated : dict\n Validated parameters. 
We ensure that the values support indexing.", + "docstring": "Check and validate the parameters passed during `fit`.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Data array.\n\n fit_params : dict\n Dictionary containing the parameters passed at fit.\n\n indices : array-like of shape (n_samples,), default=None\n Indices to be selected if the parameter has the same size as `X`.\n\n Returns\n -------\n fit_params_validated : dict\n Validated parameters. We ensure that the values support indexing.\n ", "source_code": "\ndef _check_fit_params(X, fit_params, indices=None):\n \"\"\"Check and validate the parameters passed during `fit`.\n\n Parameters\n ----------\n X : array-like of shape (n_samples, n_features)\n Data array.\n\n fit_params : dict\n Dictionary containing the parameters passed at fit.\n\n indices : array-like of shape (n_samples,), default=None\n Indices to be selected if the parameter has the same size as `X`.\n\n Returns\n -------\n fit_params_validated : dict\n Validated parameters. We ensure that the values support indexing.\n \"\"\"\n from . import _safe_indexing\n fit_params_validated = {}\n for (param_key, param_value) in fit_params.items():\n if not _is_arraylike(param_value) or _num_samples(param_value) != _num_samples(X):\n fit_params_validated[param_key] = param_value\n else:\n fit_params_validated[param_key] = _make_indexable(param_value)\n fit_params_validated[param_key] = _safe_indexing(fit_params_validated[param_key], indices)\n return fit_params_validated" }, { @@ -176896,7 +190972,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "accept_large_sparse", @@ -176906,7 +190983,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -176930,7 +191008,8 @@ "docstring": { "type": "array-like of shape (n_eigenvalues,)", "description": "Array of eigenvalues to check / fix." - } + }, + "refined_type": {} }, { "name": "enable_warnings", @@ -176940,13 +191019,14 @@ "docstring": { "type": "bool, default=False", "description": "When this is set to ``True``, a ``PositiveSpectrumWarning`` will be\nraised when there are imaginary parts, negative eigenvalues, or\nextremely small non-zero eigenvalues. Otherwise no warning will be\nraised. In both cases, imaginary parts, negative eigenvalues, and\nextremely small non-zero eigenvalues will be set to zero." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Check the eigenvalues of a positive semidefinite (PSD) matrix.\n\nChecks the provided array of PSD matrix eigenvalues for numerical or conditioning issues and returns a fixed validated version. This method should typically be used if the PSD matrix is user-provided (e.g. a Gram matrix) or computed using a user-provided dissimilarity metric (e.g. kernel function), or if the decomposition process uses approximation methods (randomized SVD, etc.). It checks for three things: - that there are no significant imaginary parts in eigenvalues (more than 1e-5 times the maximum real part). If this check fails, it raises a ``ValueError``. Otherwise all non-significant imaginary parts that may remain are set to zero. This operation is traced with a ``PositiveSpectrumWarning`` when ``enable_warnings=True``. - that eigenvalues are not all negative. 
If this check fails, it raises a ``ValueError`` - that there are no significant negative eigenvalues with absolute value more than 1e-10 (1e-6) and more than 1e-5 (5e-3) times the largest positive eigenvalue in double (simple) precision. If this check fails, it raises a ``ValueError``. Otherwise all negative eigenvalues that may remain are set to zero. This operation is traced with a ``PositiveSpectrumWarning`` when ``enable_warnings=True``. Finally, all the positive eigenvalues that are too small (with a value smaller than the maximum eigenvalue multiplied by 1e-12 (2e-7)) are set to zero. This operation is traced with a ``PositiveSpectrumWarning`` when ``enable_warnings=True``.", - "docstring": "Check the eigenvalues of a positive semidefinite (PSD) matrix.\n\nChecks the provided array of PSD matrix eigenvalues for numerical or\nconditioning issues and returns a fixed validated version. This method\nshould typically be used if the PSD matrix is user-provided (e.g. a\nGram matrix) or computed using a user-provided dissimilarity metric\n(e.g. kernel function), or if the decomposition process uses approximation\nmethods (randomized SVD, etc.).\n\nIt checks for three things:\n\n- that there are no significant imaginary parts in eigenvalues (more than\n 1e-5 times the maximum real part). If this check fails, it raises a\n ``ValueError``. Otherwise all non-significant imaginary parts that may\n remain are set to zero. This operation is traced with a\n ``PositiveSpectrumWarning`` when ``enable_warnings=True``.\n\n- that eigenvalues are not all negative. If this check fails, it raises a\n ``ValueError``\n\n- that there are no significant negative eigenvalues with absolute value\n more than 1e-10 (1e-6) and more than 1e-5 (5e-3) times the largest\n positive eigenvalue in double (simple) precision. If this check fails,\n it raises a ``ValueError``. Otherwise all negative eigenvalues that may\n remain are set to zero. This operation is traced with a\n ``PositiveSpectrumWarning`` when ``enable_warnings=True``.\n\nFinally, all the positive eigenvalues that are too small (with a value\nsmaller than the maximum eigenvalue multiplied by 1e-12 (2e-7)) are set to\nzero. This operation is traced with a ``PositiveSpectrumWarning`` when\n``enable_warnings=True``.\n\nParameters\n----------\nlambdas : array-like of shape (n_eigenvalues,)\n Array of eigenvalues to check / fix.\n\nenable_warnings : bool, default=False\n When this is set to ``True``, a ``PositiveSpectrumWarning`` will be\n raised when there are imaginary parts, negative eigenvalues, or\n extremely small non-zero eigenvalues. Otherwise no warning will be\n raised. In both cases, imaginary parts, negative eigenvalues, and\n extremely small non-zero eigenvalues will be set to zero.\n\nReturns\n-------\nlambdas_fixed : ndarray of shape (n_eigenvalues,)\n A fixed validated copy of the array of eigenvalues.\n\nExamples\n--------\n>>> from sklearn.utils.validation import _check_psd_eigenvalues\n>>> _check_psd_eigenvalues([1, 2]) # nominal case\narray([1, 2])\n>>> _check_psd_eigenvalues([5, 5j]) # significant imag part\nTraceback (most recent call last):\n ...\nValueError: There are significant imaginary parts in eigenvalues (1\n of the maximum real part). 
Either the matrix is not PSD, or there was\n an issue while computing the eigendecomposition of the matrix.\n>>> _check_psd_eigenvalues([5, 5e-5j]) # insignificant imag part\narray([5., 0.])\n>>> _check_psd_eigenvalues([-5, -1]) # all negative\nTraceback (most recent call last):\n ...\nValueError: All eigenvalues are negative (maximum is -1). Either the\n matrix is not PSD, or there was an issue while computing the\n eigendecomposition of the matrix.\n>>> _check_psd_eigenvalues([5, -1]) # significant negative\nTraceback (most recent call last):\n ...\nValueError: There are significant negative eigenvalues (0.2 of the\n maximum positive). Either the matrix is not PSD, or there was an issue\n while computing the eigendecomposition of the matrix.\n>>> _check_psd_eigenvalues([5, -5e-5]) # insignificant negative\narray([5., 0.])\n>>> _check_psd_eigenvalues([5, 4e-12]) # bad conditioning (too small)\narray([5., 0.])", + "description": "Check the eigenvalues of a positive semidefinite (PSD) matrix.\n\nChecks the provided array of PSD matrix eigenvalues for numerical or\nconditioning issues and returns a fixed validated version. This method\nshould typically be used if the PSD matrix is user-provided (e.g. a\nGram matrix) or computed using a user-provided dissimilarity metric\n(e.g. kernel function), or if the decomposition process uses approximation\nmethods (randomized SVD, etc.).\n\nIt checks for three things:\n\n- that there are no significant imaginary parts in eigenvalues (more than\n 1e-5 times the maximum real part). If this check fails, it raises a\n ``ValueError``. Otherwise all non-significant imaginary parts that may\n remain are set to zero. This operation is traced with a\n ``PositiveSpectrumWarning`` when ``enable_warnings=True``.\n\n- that eigenvalues are not all negative. If this check fails, it raises a\n ``ValueError``\n\n- that there are no significant negative eigenvalues with absolute value\n more than 1e-10 (1e-6) and more than 1e-5 (5e-3) times the largest\n positive eigenvalue in double (simple) precision. If this check fails,\n it raises a ``ValueError``. Otherwise all negative eigenvalues that may\n remain are set to zero. This operation is traced with a\n ``PositiveSpectrumWarning`` when ``enable_warnings=True``.\n\nFinally, all the positive eigenvalues that are too small (with a value\nsmaller than the maximum eigenvalue multiplied by 1e-12 (2e-7)) are set to\nzero. This operation is traced with a ``PositiveSpectrumWarning`` when\n``enable_warnings=True``.", + "docstring": "Check the eigenvalues of a positive semidefinite (PSD) matrix.\n\n Checks the provided array of PSD matrix eigenvalues for numerical or\n conditioning issues and returns a fixed validated version. This method\n should typically be used if the PSD matrix is user-provided (e.g. a\n Gram matrix) or computed using a user-provided dissimilarity metric\n (e.g. kernel function), or if the decomposition process uses approximation\n methods (randomized SVD, etc.).\n\n It checks for three things:\n\n - that there are no significant imaginary parts in eigenvalues (more than\n 1e-5 times the maximum real part). If this check fails, it raises a\n ``ValueError``. Otherwise all non-significant imaginary parts that may\n remain are set to zero. This operation is traced with a\n ``PositiveSpectrumWarning`` when ``enable_warnings=True``.\n\n - that eigenvalues are not all negative. 
If this check fails, it raises a\n ``ValueError``\n\n - that there are no significant negative eigenvalues with absolute value\n more than 1e-10 (1e-6) and more than 1e-5 (5e-3) times the largest\n positive eigenvalue in double (simple) precision. If this check fails,\n it raises a ``ValueError``. Otherwise all negative eigenvalues that may\n remain are set to zero. This operation is traced with a\n ``PositiveSpectrumWarning`` when ``enable_warnings=True``.\n\n Finally, all the positive eigenvalues that are too small (with a value\n smaller than the maximum eigenvalue multiplied by 1e-12 (2e-7)) are set to\n zero. This operation is traced with a ``PositiveSpectrumWarning`` when\n ``enable_warnings=True``.\n\n Parameters\n ----------\n lambdas : array-like of shape (n_eigenvalues,)\n Array of eigenvalues to check / fix.\n\n enable_warnings : bool, default=False\n When this is set to ``True``, a ``PositiveSpectrumWarning`` will be\n raised when there are imaginary parts, negative eigenvalues, or\n extremely small non-zero eigenvalues. Otherwise no warning will be\n raised. In both cases, imaginary parts, negative eigenvalues, and\n extremely small non-zero eigenvalues will be set to zero.\n\n Returns\n -------\n lambdas_fixed : ndarray of shape (n_eigenvalues,)\n A fixed validated copy of the array of eigenvalues.\n\n Examples\n --------\n >>> from sklearn.utils.validation import _check_psd_eigenvalues\n >>> _check_psd_eigenvalues([1, 2]) # nominal case\n array([1, 2])\n >>> _check_psd_eigenvalues([5, 5j]) # significant imag part\n Traceback (most recent call last):\n ...\n ValueError: There are significant imaginary parts in eigenvalues (1\n of the maximum real part). Either the matrix is not PSD, or there was\n an issue while computing the eigendecomposition of the matrix.\n >>> _check_psd_eigenvalues([5, 5e-5j]) # insignificant imag part\n array([5., 0.])\n >>> _check_psd_eigenvalues([-5, -1]) # all negative\n Traceback (most recent call last):\n ...\n ValueError: All eigenvalues are negative (maximum is -1). Either the\n matrix is not PSD, or there was an issue while computing the\n eigendecomposition of the matrix.\n >>> _check_psd_eigenvalues([5, -1]) # significant negative\n Traceback (most recent call last):\n ...\n ValueError: There are significant negative eigenvalues (0.2 of the\n maximum positive). Either the matrix is not PSD, or there was an issue\n while computing the eigendecomposition of the matrix.\n >>> _check_psd_eigenvalues([5, -5e-5]) # insignificant negative\n array([5., 0.])\n >>> _check_psd_eigenvalues([5, 4e-12]) # bad conditioning (too small)\n array([5., 0.])\n\n ", "source_code": "\ndef _check_psd_eigenvalues(lambdas, enable_warnings=False):\n \"\"\"Check the eigenvalues of a positive semidefinite (PSD) matrix.\n\n Checks the provided array of PSD matrix eigenvalues for numerical or\n conditioning issues and returns a fixed validated version. This method\n should typically be used if the PSD matrix is user-provided (e.g. a\n Gram matrix) or computed using a user-provided dissimilarity metric\n (e.g. kernel function), or if the decomposition process uses approximation\n methods (randomized SVD, etc.).\n\n It checks for three things:\n\n - that there are no significant imaginary parts in eigenvalues (more than\n 1e-5 times the maximum real part). If this check fails, it raises a\n ``ValueError``. Otherwise all non-significant imaginary parts that may\n remain are set to zero. 
This operation is traced with a\n ``PositiveSpectrumWarning`` when ``enable_warnings=True``.\n\n - that eigenvalues are not all negative. If this check fails, it raises a\n ``ValueError``\n\n - that there are no significant negative eigenvalues with absolute value\n more than 1e-10 (1e-6) and more than 1e-5 (5e-3) times the largest\n positive eigenvalue in double (simple) precision. If this check fails,\n it raises a ``ValueError``. Otherwise all negative eigenvalues that may\n remain are set to zero. This operation is traced with a\n ``PositiveSpectrumWarning`` when ``enable_warnings=True``.\n\n Finally, all the positive eigenvalues that are too small (with a value\n smaller than the maximum eigenvalue multiplied by 1e-12 (2e-7)) are set to\n zero. This operation is traced with a ``PositiveSpectrumWarning`` when\n ``enable_warnings=True``.\n\n Parameters\n ----------\n lambdas : array-like of shape (n_eigenvalues,)\n Array of eigenvalues to check / fix.\n\n enable_warnings : bool, default=False\n When this is set to ``True``, a ``PositiveSpectrumWarning`` will be\n raised when there are imaginary parts, negative eigenvalues, or\n extremely small non-zero eigenvalues. Otherwise no warning will be\n raised. In both cases, imaginary parts, negative eigenvalues, and\n extremely small non-zero eigenvalues will be set to zero.\n\n Returns\n -------\n lambdas_fixed : ndarray of shape (n_eigenvalues,)\n A fixed validated copy of the array of eigenvalues.\n\n Examples\n --------\n >>> from sklearn.utils.validation import _check_psd_eigenvalues\n >>> _check_psd_eigenvalues([1, 2]) # nominal case\n array([1, 2])\n >>> _check_psd_eigenvalues([5, 5j]) # significant imag part\n Traceback (most recent call last):\n ...\n ValueError: There are significant imaginary parts in eigenvalues (1\n of the maximum real part). Either the matrix is not PSD, or there was\n an issue while computing the eigendecomposition of the matrix.\n >>> _check_psd_eigenvalues([5, 5e-5j]) # insignificant imag part\n array([5., 0.])\n >>> _check_psd_eigenvalues([-5, -1]) # all negative\n Traceback (most recent call last):\n ...\n ValueError: All eigenvalues are negative (maximum is -1). Either the\n matrix is not PSD, or there was an issue while computing the\n eigendecomposition of the matrix.\n >>> _check_psd_eigenvalues([5, -1]) # significant negative\n Traceback (most recent call last):\n ...\n ValueError: There are significant negative eigenvalues (0.2 of the\n maximum positive). Either the matrix is not PSD, or there was an issue\n while computing the eigendecomposition of the matrix.\n >>> _check_psd_eigenvalues([5, -5e-5]) # insignificant negative\n array([5., 0.])\n >>> _check_psd_eigenvalues([5, 4e-12]) # bad conditioning (too small)\n array([5., 0.])\n\n \"\"\"\n lambdas = np.array(lambdas)\n is_double_precision = lambdas.dtype == np.float64\n significant_imag_ratio = 1e-05\n significant_neg_ratio = 1e-05 if is_double_precision else 0.005\n significant_neg_value = 1e-10 if is_double_precision else 1e-06\n small_pos_ratio = 1e-12 if is_double_precision else 2e-07\n if not np.isreal(lambdas).all():\n max_imag_abs = np.abs(np.imag(lambdas)).max()\n max_real_abs = np.abs(np.real(lambdas)).max()\n if max_imag_abs > significant_imag_ratio * max_real_abs:\n raise ValueError('There are significant imaginary parts in eigenvalues (%g of the maximum real part). Either the matrix is not PSD, or there was an issue while computing the eigendecomposition of the matrix.' 
% (max_imag_abs / max_real_abs))\n if enable_warnings:\n warnings.warn('There are imaginary parts in eigenvalues (%g of the maximum real part). Either the matrix is not PSD, or there was an issue while computing the eigendecomposition of the matrix. Only the real parts will be kept.' % (max_imag_abs / max_real_abs), PositiveSpectrumWarning)\n lambdas = np.real(lambdas)\n max_eig = lambdas.max()\n if max_eig < 0:\n raise ValueError('All eigenvalues are negative (maximum is %g). Either the matrix is not PSD, or there was an issue while computing the eigendecomposition of the matrix.' % max_eig)\n else:\n min_eig = lambdas.min()\n if min_eig < -significant_neg_ratio * max_eig and min_eig < -significant_neg_value:\n raise ValueError('There are significant negative eigenvalues (%g of the maximum positive). Either the matrix is not PSD, or there was an issue while computing the eigendecomposition of the matrix.' % (-min_eig / max_eig))\n elif min_eig < 0:\n if enable_warnings:\n warnings.warn('There are negative eigenvalues (%g of the maximum positive). Either the matrix is not PSD, or there was an issue while computing the eigendecomposition of the matrix. Negative eigenvalues will be replaced with 0.' % (-min_eig / max_eig), PositiveSpectrumWarning)\n lambdas[lambdas < 0] = 0\n too_small_lambdas = (0 < lambdas) & (lambdas < small_pos_ratio * max_eig)\n if too_small_lambdas.any():\n if enable_warnings:\n warnings.warn('Badly conditioned PSD matrix spectrum: the largest eigenvalue is more than %g times the smallest. Small eigenvalues will be replaced with 0.' % (1 / small_pos_ratio), PositiveSpectrumWarning)\n lambdas[too_small_lambdas] = 0\n return lambdas" }, { @@ -176964,6 +191044,10 @@ "docstring": { "type": "{ndarray, Number or None}, shape (n_samples,)", "description": "Input sample weights." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -176974,6 +191058,10 @@ "docstring": { "type": "{ndarray, list, sparse matrix}", "description": "Input data." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -176984,7 +191072,8 @@ "docstring": { "type": "dtype, default=None", "description": "dtype of the validated `sample_weight`.\nIf None, and the input `sample_weight` is an array, the dtype of the\ninput is preserved; otherwise an array with the default numpy dtype\nis be allocated. If `dtype` is not one of `float32`, `float64`,\n`None`, the output will be of dtype `float64`." - } + }, + "refined_type": {} }, { "name": "copy", @@ -176994,13 +191083,14 @@ "docstring": { "type": "bool, default=False", "description": "If True, a copy of sample_weight will be created." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Validate sample weights.\n\nNote that passing sample_weight=None will output an array of ones. 
Therefore, in some cases, you may want to protect the call with: if sample_weight is not None: sample_weight = _check_sample_weight(...)", - "docstring": "Validate sample weights.\n\nNote that passing sample_weight=None will output an array of ones.\nTherefore, in some cases, you may want to protect the call with:\nif sample_weight is not None:\n sample_weight = _check_sample_weight(...)\n\nParameters\n----------\nsample_weight : {ndarray, Number or None}, shape (n_samples,)\n Input sample weights.\n\nX : {ndarray, list, sparse matrix}\n Input data.\n\ndtype : dtype, default=None\n dtype of the validated `sample_weight`.\n If None, and the input `sample_weight` is an array, the dtype of the\n input is preserved; otherwise an array with the default numpy dtype\n is be allocated. If `dtype` is not one of `float32`, `float64`,\n `None`, the output will be of dtype `float64`.\n\ncopy : bool, default=False\n If True, a copy of sample_weight will be created.\n\nReturns\n-------\nsample_weight : ndarray of shape (n_samples,)\n Validated sample weight. It is guaranteed to be \"C\" contiguous.", + "description": "Validate sample weights.\n\nNote that passing sample_weight=None will output an array of ones.\nTherefore, in some cases, you may want to protect the call with:\nif sample_weight is not None:\n sample_weight = _check_sample_weight(...)", + "docstring": "Validate sample weights.\n\n Note that passing sample_weight=None will output an array of ones.\n Therefore, in some cases, you may want to protect the call with:\n if sample_weight is not None:\n sample_weight = _check_sample_weight(...)\n\n Parameters\n ----------\n sample_weight : {ndarray, Number or None}, shape (n_samples,)\n Input sample weights.\n\n X : {ndarray, list, sparse matrix}\n Input data.\n\n dtype : dtype, default=None\n dtype of the validated `sample_weight`.\n If None, and the input `sample_weight` is an array, the dtype of the\n input is preserved; otherwise an array with the default numpy dtype\n is be allocated. If `dtype` is not one of `float32`, `float64`,\n `None`, the output will be of dtype `float64`.\n\n copy : bool, default=False\n If True, a copy of sample_weight will be created.\n\n Returns\n -------\n sample_weight : ndarray of shape (n_samples,)\n Validated sample weight. It is guaranteed to be \"C\" contiguous.\n ", "source_code": "\ndef _check_sample_weight(sample_weight, X, dtype=None, copy=False):\n \"\"\"Validate sample weights.\n\n Note that passing sample_weight=None will output an array of ones.\n Therefore, in some cases, you may want to protect the call with:\n if sample_weight is not None:\n sample_weight = _check_sample_weight(...)\n\n Parameters\n ----------\n sample_weight : {ndarray, Number or None}, shape (n_samples,)\n Input sample weights.\n\n X : {ndarray, list, sparse matrix}\n Input data.\n\n dtype : dtype, default=None\n dtype of the validated `sample_weight`.\n If None, and the input `sample_weight` is an array, the dtype of the\n input is preserved; otherwise an array with the default numpy dtype\n is be allocated. If `dtype` is not one of `float32`, `float64`,\n `None`, the output will be of dtype `float64`.\n\n copy : bool, default=False\n If True, a copy of sample_weight will be created.\n\n Returns\n -------\n sample_weight : ndarray of shape (n_samples,)\n Validated sample weight. 
It is guaranteed to be \"C\" contiguous.\n \"\"\"\n n_samples = _num_samples(X)\n if dtype is not None and dtype not in [np.float32, np.float64]:\n dtype = np.float64\n if sample_weight is None:\n sample_weight = np.ones(n_samples, dtype=dtype)\n elif isinstance(sample_weight, numbers.Number):\n sample_weight = np.full(n_samples, sample_weight, dtype=dtype)\n else:\n if dtype is None:\n dtype = [np.float64, np.float32]\n sample_weight = check_array(sample_weight, accept_sparse=False, ensure_2d=False, dtype=dtype, order='C', copy=copy)\n if sample_weight.ndim != 1:\n raise ValueError('Sample weights must be 1D array or scalar')\n if sample_weight.shape != (n_samples, ):\n raise ValueError('sample_weight.shape == {}, expected {}!'.format(sample_weight.shape, (n_samples, )))\n return sample_weight" }, { @@ -177018,7 +191108,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "multi_output", @@ -177028,7 +191119,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} }, { "name": "y_numeric", @@ -177038,7 +191130,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -177062,7 +191155,8 @@ "docstring": { "type": "callable, default=None", "description": "Function to check arguments on." - } + }, + "refined_type": {} }, { "name": "version", @@ -177072,13 +191166,14 @@ "docstring": { "type": "callable, default=\"1.1 (renaming of 0.26)\"", "description": "The version when positional arguments will result in error." - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Decorator for methods that issues warnings for positional arguments.\n\nUsing the keyword-only argument syntax in pep 3102, arguments after the * will issue a warning when passed as a positional argument.", - "docstring": "Decorator for methods that issues warnings for positional arguments.\n\nUsing the keyword-only argument syntax in pep 3102, arguments after the\n* will issue a warning when passed as a positional argument.\n\nParameters\n----------\nfunc : callable, default=None\n Function to check arguments on.\nversion : callable, default=\"1.1 (renaming of 0.26)\"\n The version when positional arguments will result in error.", + "description": "Decorator for methods that issues warnings for positional arguments.\n\nUsing the keyword-only argument syntax in pep 3102, arguments after the\n* will issue a warning when passed as a positional argument.", + "docstring": "Decorator for methods that issues warnings for positional arguments.\n\n Using the keyword-only argument syntax in pep 3102, arguments after the\n * will issue a warning when passed as a positional argument.\n\n Parameters\n ----------\n func : callable, default=None\n Function to check arguments on.\n version : callable, default=\"1.1 (renaming of 0.26)\"\n The version when positional arguments will result in error.\n ", "source_code": "\ndef _deprecate_positional_args(func=None, *, version='1.1 (renaming of 0.26)'):\n \"\"\"Decorator for methods that issues warnings for positional arguments.\n\n Using the keyword-only argument syntax in pep 3102, arguments after the\n * will issue a warning when passed as a positional argument.\n\n Parameters\n ----------\n func : callable, default=None\n Function to check arguments on.\n version : callable, default=\"1.1 (renaming of 0.26)\"\n The version when positional arguments will result in error.\n \"\"\"\n \n def _inner_deprecate_positional_args(f):\n sig = signature(f)\n 
kwonly_args = []\n all_args = []\n for (name, param) in sig.parameters.items():\n if param.kind == Parameter.POSITIONAL_OR_KEYWORD:\n all_args.append(name)\n elif param.kind == Parameter.KEYWORD_ONLY:\n kwonly_args.append(name)\n \n @wraps(f)\n def inner_f(*args, **kwargs):\n extra_args = len(args) - len(all_args)\n if extra_args <= 0:\n return f(*args, **kwargs)\n args_msg = ['{}={}'.format(name, arg) for (name, arg) in zip(kwonly_args[:extra_args], args[-extra_args:])]\n args_msg = ', '.join(args_msg)\n warnings.warn(f'Pass {args_msg} as keyword args. From version {version} passing these as positional arguments will result in an error', FutureWarning)\n kwargs.update(zip(sig.parameters, args))\n return f(**kwargs)\n return inner_f\n if func is not None:\n return _inner_deprecate_positional_args(func)\n return _inner_deprecate_positional_args" }, { @@ -177096,13 +191191,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "", - "docstring": "", + "docstring": null, "source_code": "\ndef _ensure_no_complex_data(array):\n if hasattr(array, 'dtype') and array.dtype is not None and hasattr(array.dtype, 'kind') and array.dtype.kind == 'c':\n raise ValueError('Complex data not supported\\n{}\\n'.format(array))" }, { @@ -177120,7 +191216,8 @@ "docstring": { "type": "sparse matrix", "description": "Input to validate and convert." - } + }, + "refined_type": {} }, { "name": "accept_sparse", @@ -177130,7 +191227,8 @@ "docstring": { "type": "str, bool or list/tuple of str", "description": "String[s] representing allowed sparse matrix formats ('csc',\n'csr', 'coo', 'dok', 'bsr', 'lil', 'dia'). If the input is sparse but\nnot in the allowed format, it will be converted to the first listed\nformat. True allows the input to be any format. False means\nthat a sparse matrix input will raise an error." - } + }, + "refined_type": {} }, { "name": "dtype", @@ -177140,7 +191238,8 @@ "docstring": { "type": "str, type or None", "description": "Data type of result. If None, the dtype of the input is preserved." - } + }, + "refined_type": {} }, { "name": "copy", @@ -177150,7 +191249,8 @@ "docstring": { "type": "bool", "description": "Whether a forced copy will be triggered. If copy=False, a copy might\nbe triggered by a conversion." - } + }, + "refined_type": {} }, { "name": "force_all_finite", @@ -177160,7 +191260,8 @@ "docstring": { "type": "bool or 'allow-nan'", "description": "Whether to raise an error on np.inf, np.nan, pd.NA in X. The\npossibilities are:\n\n- True: Force all values of X to be finite.\n- False: accepts np.inf, np.nan, pd.NA in X.\n- 'allow-nan': accepts only np.nan and pd.NA values in X. Values cannot\n be infinite.\n\n.. versionadded:: 0.20\n ``force_all_finite`` accepts the string ``'allow-nan'``.\n\n.. 
versionchanged:: 0.23\n Accepts `pd.NA` and converts it into `np.nan`" - } + }, + "refined_type": {} }, { "name": "accept_large_sparse", @@ -177170,13 +191271,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, "description": "Convert a sparse matrix to a given format.\n\nChecks the sparse format of spmatrix and converts if necessary.", - "docstring": "Convert a sparse matrix to a given format.\n\nChecks the sparse format of spmatrix and converts if necessary.\n\nParameters\n----------\nspmatrix : sparse matrix\n Input to validate and convert.\n\naccept_sparse : str, bool or list/tuple of str\n String[s] representing allowed sparse matrix formats ('csc',\n 'csr', 'coo', 'dok', 'bsr', 'lil', 'dia'). If the input is sparse but\n not in the allowed format, it will be converted to the first listed\n format. True allows the input to be any format. False means\n that a sparse matrix input will raise an error.\n\ndtype : str, type or None\n Data type of result. If None, the dtype of the input is preserved.\n\ncopy : bool\n Whether a forced copy will be triggered. If copy=False, a copy might\n be triggered by a conversion.\n\nforce_all_finite : bool or 'allow-nan'\n Whether to raise an error on np.inf, np.nan, pd.NA in X. The\n possibilities are:\n\n - True: Force all values of X to be finite.\n - False: accepts np.inf, np.nan, pd.NA in X.\n - 'allow-nan': accepts only np.nan and pd.NA values in X. Values cannot\n be infinite.\n\n .. versionadded:: 0.20\n ``force_all_finite`` accepts the string ``'allow-nan'``.\n\n .. versionchanged:: 0.23\n Accepts `pd.NA` and converts it into `np.nan`\n\nReturns\n-------\nspmatrix_converted : sparse matrix.\n Matrix that is ensured to have an allowed type.", + "docstring": "Convert a sparse matrix to a given format.\n\n Checks the sparse format of spmatrix and converts if necessary.\n\n Parameters\n ----------\n spmatrix : sparse matrix\n Input to validate and convert.\n\n accept_sparse : str, bool or list/tuple of str\n String[s] representing allowed sparse matrix formats ('csc',\n 'csr', 'coo', 'dok', 'bsr', 'lil', 'dia'). If the input is sparse but\n not in the allowed format, it will be converted to the first listed\n format. True allows the input to be any format. False means\n that a sparse matrix input will raise an error.\n\n dtype : str, type or None\n Data type of result. If None, the dtype of the input is preserved.\n\n copy : bool\n Whether a forced copy will be triggered. If copy=False, a copy might\n be triggered by a conversion.\n\n force_all_finite : bool or 'allow-nan'\n Whether to raise an error on np.inf, np.nan, pd.NA in X. The\n possibilities are:\n\n - True: Force all values of X to be finite.\n - False: accepts np.inf, np.nan, pd.NA in X.\n - 'allow-nan': accepts only np.nan and pd.NA values in X. Values cannot\n be infinite.\n\n .. versionadded:: 0.20\n ``force_all_finite`` accepts the string ``'allow-nan'``.\n\n .. 
versionchanged:: 0.23\n Accepts `pd.NA` and converts it into `np.nan`\n\n Returns\n -------\n spmatrix_converted : sparse matrix.\n Matrix that is ensured to have an allowed type.\n ", "source_code": "\ndef _ensure_sparse_format(spmatrix, accept_sparse, dtype, copy, force_all_finite, accept_large_sparse):\n \"\"\"Convert a sparse matrix to a given format.\n\n Checks the sparse format of spmatrix and converts if necessary.\n\n Parameters\n ----------\n spmatrix : sparse matrix\n Input to validate and convert.\n\n accept_sparse : str, bool or list/tuple of str\n String[s] representing allowed sparse matrix formats ('csc',\n 'csr', 'coo', 'dok', 'bsr', 'lil', 'dia'). If the input is sparse but\n not in the allowed format, it will be converted to the first listed\n format. True allows the input to be any format. False means\n that a sparse matrix input will raise an error.\n\n dtype : str, type or None\n Data type of result. If None, the dtype of the input is preserved.\n\n copy : bool\n Whether a forced copy will be triggered. If copy=False, a copy might\n be triggered by a conversion.\n\n force_all_finite : bool or 'allow-nan'\n Whether to raise an error on np.inf, np.nan, pd.NA in X. The\n possibilities are:\n\n - True: Force all values of X to be finite.\n - False: accepts np.inf, np.nan, pd.NA in X.\n - 'allow-nan': accepts only np.nan and pd.NA values in X. Values cannot\n be infinite.\n\n .. versionadded:: 0.20\n ``force_all_finite`` accepts the string ``'allow-nan'``.\n\n .. versionchanged:: 0.23\n Accepts `pd.NA` and converts it into `np.nan`\n\n Returns\n -------\n spmatrix_converted : sparse matrix.\n Matrix that is ensured to have an allowed type.\n \"\"\"\n if dtype is None:\n dtype = spmatrix.dtype\n changed_format = False\n if isinstance(accept_sparse, str):\n accept_sparse = [accept_sparse]\n _check_large_sparse(spmatrix, accept_large_sparse)\n if accept_sparse is False:\n raise TypeError('A sparse matrix was passed, but dense data is required. Use X.toarray() to convert to a dense numpy array.')\n elif isinstance(accept_sparse, (list, tuple)):\n if len(accept_sparse) == 0:\n raise ValueError(\"When providing 'accept_sparse' as a tuple or list, it must contain at least one string value.\")\n if spmatrix.format not in accept_sparse:\n spmatrix = spmatrix.asformat(accept_sparse[0])\n changed_format = True\n elif accept_sparse is not True:\n raise ValueError(\"Parameter 'accept_sparse' should be a string, boolean or list of strings. You provided 'accept_sparse={}'.\".format(accept_sparse))\n if dtype != spmatrix.dtype:\n spmatrix = spmatrix.astype(dtype)\n elif copy and not changed_format:\n spmatrix = spmatrix.copy()\n if force_all_finite:\n if not hasattr(spmatrix, 'data'):\n warnings.warn(\"Can't check %s sparse matrix for nan or inf.\" % spmatrix.format, stacklevel=2)\n else:\n _assert_all_finite(spmatrix.data, allow_nan=force_all_finite == 'allow-nan')\n return spmatrix" }, { @@ -177194,13 +191296,17 @@ "docstring": { "type": "{ndarray, dataframe} of shape (n_samples, n_features)", "description": "Array container to extract feature names.\n\n- pandas dataframe : The columns will be considered to be feature\n names. If the dataframe contains non-string feature names, `None` is\n returned.\n- All other array containers will return `None`." 
+ }, + "refined_type": { + "kind": "EnumType", + "values": [] } } ], "results": [], "is_public": false, "description": "Get feature names from X.\n\nSupport for other array containers should place its implementation here.", - "docstring": "Get feature names from X.\n\nSupport for other array containers should place its implementation here.\n\nParameters\n----------\nX : {ndarray, dataframe} of shape (n_samples, n_features)\n Array container to extract feature names.\n\n - pandas dataframe : The columns will be considered to be feature\n names. If the dataframe contains non-string feature names, `None` is\n returned.\n - All other array containers will return `None`.\n\nReturns\n-------\nnames: ndarray or None\n Feature names of `X`. Unrecognized array containers will return `None`.", + "docstring": "Get feature names from X.\n\n Support for other array containers should place its implementation here.\n\n Parameters\n ----------\n X : {ndarray, dataframe} of shape (n_samples, n_features)\n Array container to extract feature names.\n\n - pandas dataframe : The columns will be considered to be feature\n names. If the dataframe contains non-string feature names, `None` is\n returned.\n - All other array containers will return `None`.\n\n Returns\n -------\n names: ndarray or None\n Feature names of `X`. Unrecognized array containers will return `None`.\n ", "source_code": "\ndef _get_feature_names(X):\n \"\"\"Get feature names from X.\n\n Support for other array containers should place its implementation here.\n\n Parameters\n ----------\n X : {ndarray, dataframe} of shape (n_samples, n_features)\n Array container to extract feature names.\n\n - pandas dataframe : The columns will be considered to be feature\n names. If the dataframe contains non-string feature names, `None` is\n returned.\n - All other array containers will return `None`.\n\n Returns\n -------\n names: ndarray or None\n Feature names of `X`. Unrecognized array containers will return `None`.\n \"\"\"\n feature_names = None\n if hasattr(X, 'columns'):\n feature_names = np.asarray(X.columns, dtype=object)\n if feature_names is None or len(feature_names) == 0:\n return\n types = sorted((t.__qualname__ for t in set((type(v) for v in feature_names))))\n if len(types) > 1 or not (types[0].startswith('int') or types[0] == 'str'):\n warnings.warn(f'Feature names only support names that are all strings. Got feature names with dtypes: {types}. An error will be raised in 1.2.', FutureWarning)\n return\n if types[0] == 'str':\n return feature_names" }, { @@ -177218,7 +191324,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -177242,13 +191349,17 @@ "docstring": { "type": "{list, dataframe, ndarray, sparse matrix} or None", "description": "Object to be converted to an indexable iterable." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } } ], "results": [], "is_public": false, - "description": "Ensure iterable supports indexing or convert to an indexable variant.\n\nConvert sparse matrices to csr and other non-indexable iterable to arrays. Let `None` and indexable objects (e.g. pandas dataframes) pass unchanged.", - "docstring": "Ensure iterable supports indexing or convert to an indexable variant.\n\nConvert sparse matrices to csr and other non-indexable iterable to arrays.\nLet `None` and indexable objects (e.g. 
pandas dataframes) pass unchanged.\n\nParameters\n----------\niterable : {list, dataframe, ndarray, sparse matrix} or None\n Object to be converted to an indexable iterable.", + "description": "Ensure iterable supports indexing or convert to an indexable variant.\n\nConvert sparse matrices to csr and other non-indexable iterable to arrays.\nLet `None` and indexable objects (e.g. pandas dataframes) pass unchanged.", + "docstring": "Ensure iterable supports indexing or convert to an indexable variant.\n\n Convert sparse matrices to csr and other non-indexable iterable to arrays.\n Let `None` and indexable objects (e.g. pandas dataframes) pass unchanged.\n\n Parameters\n ----------\n iterable : {list, dataframe, ndarray, sparse matrix} or None\n Object to be converted to an indexable iterable.\n ", "source_code": "\ndef _make_indexable(iterable):\n \"\"\"Ensure iterable supports indexing or convert to an indexable variant.\n\n Convert sparse matrices to csr and other non-indexable iterable to arrays.\n Let `None` and indexable objects (e.g. pandas dataframes) pass unchanged.\n\n Parameters\n ----------\n iterable : {list, dataframe, ndarray, sparse matrix} or None\n Object to be converted to an indexable iterable.\n \"\"\"\n if sp.issparse(iterable):\n return iterable.tocsr()\n elif hasattr(iterable, '__getitem__') or hasattr(iterable, 'iloc'):\n return iterable\n elif iterable is None:\n return iterable\n return np.array(iterable)" }, { @@ -177266,13 +191377,14 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": false, - "description": "Return the number of features in an array-like X.\n\nThis helper function tries hard to avoid to materialize an array version of X unless necessary. For instance, if X is a list of lists, this function will return the length of the first element, assuming that subsequent elements are all lists of the same length without checking. Parameters ---------- X : array-like array-like to get the number of features.", - "docstring": "Return the number of features in an array-like X.\n\nThis helper function tries hard to avoid to materialize an array version\nof X unless necessary. For instance, if X is a list of lists,\nthis function will return the length of the first element, assuming\nthat subsequent elements are all lists of the same length without\nchecking.\nParameters\n----------\nX : array-like\n array-like to get the number of features.\n\nReturns\n-------\nfeatures : int\n Number of features", + "description": "Return the number of features in an array-like X.\n\nThis helper function tries hard to avoid to materialize an array version\nof X unless necessary. For instance, if X is a list of lists,\nthis function will return the length of the first element, assuming\nthat subsequent elements are all lists of the same length without\nchecking.\nParameters\n----------\nX : array-like\n array-like to get the number of features.", + "docstring": "Return the number of features in an array-like X.\n\n This helper function tries hard to avoid to materialize an array version\n of X unless necessary. 
For instance, if X is a list of lists,\n this function will return the length of the first element, assuming\n that subsequent elements are all lists of the same length without\n checking.\n Parameters\n ----------\n X : array-like\n array-like to get the number of features.\n\n Returns\n -------\n features : int\n Number of features\n ", "source_code": "\ndef _num_features(X):\n \"\"\"Return the number of features in an array-like X.\n\n This helper function tries hard to avoid to materialize an array version\n of X unless necessary. For instance, if X is a list of lists,\n this function will return the length of the first element, assuming\n that subsequent elements are all lists of the same length without\n checking.\n Parameters\n ----------\n X : array-like\n array-like to get the number of features.\n\n Returns\n -------\n features : int\n Number of features\n \"\"\"\n type_ = type(X)\n if type_.__module__ == 'builtins':\n type_name = type_.__qualname__\n else:\n type_name = f'{type_.__module__}.{type_.__qualname__}'\n message = f'Unable to find the number of features from X of type {type_name}'\n if not hasattr(X, '__len__') and not hasattr(X, 'shape'):\n if not hasattr(X, '__array__'):\n raise TypeError(message)\n X = np.asarray(X)\n if hasattr(X, 'shape'):\n if not hasattr(X.shape, '__len__') or len(X.shape) <= 1:\n message += f' with shape {X.shape}'\n raise TypeError(message)\n return X.shape[1]\n first_sample = X[0]\n if isinstance(first_sample, (str, bytes, dict)):\n message += f' where the samples are of type {type(first_sample).__qualname__}'\n raise TypeError(message)\n try:\n return len(first_sample)\n except Exception as err:\n raise TypeError(message) from err" }, { @@ -177290,7 +191402,8 @@ "docstring": { "type": "", "description": "" - } + }, + "refined_type": {} } ], "results": [], @@ -177313,7 +191426,11 @@ "assigned_by": "POSITION_OR_NAME", "docstring": { "type": "{array-like, sparse matrix}", - "description": "" + "description": "The input data." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -177324,7 +191441,8 @@ "docstring": { "type": "bool, default=True", "description": "If True, a copy of X will be created. If False, a copy may still be\nreturned if X's dtype is not a floating point type." - } + }, + "refined_type": {} }, { "name": "force_all_finite", @@ -177334,14 +191452,15 @@ "docstring": { "type": "bool or 'allow-nan', default=True", "description": "Whether to raise an error on np.inf, np.nan, pd.NA in X. The\npossibilities are:\n\n- True: Force all values of X to be finite.\n- False: accepts np.inf, np.nan, pd.NA in X.\n- 'allow-nan': accepts only np.nan and pd.NA values in X. Values cannot\n be infinite.\n\n.. versionadded:: 0.20\n ``force_all_finite`` accepts the string ``'allow-nan'``.\n\n.. versionchanged:: 0.23\n Accepts `pd.NA` and converts it into `np.nan`" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Converts an array-like to an array of floats.\n\nThe new dtype will be np.float32 or np.float64, depending on the original type. The function can create a copy or modify the argument depending on the argument copy.", - "docstring": "Converts an array-like to an array of floats.\n\nThe new dtype will be np.float32 or np.float64, depending on the original\ntype. The function can create a copy or modify the argument depending\non the argument copy.\n\nParameters\n----------\nX : {array-like, sparse matrix}\n\ncopy : bool, default=True\n If True, a copy of X will be created. 
If False, a copy may still be\n returned if X's dtype is not a floating point type.\n\nforce_all_finite : bool or 'allow-nan', default=True\n Whether to raise an error on np.inf, np.nan, pd.NA in X. The\n possibilities are:\n\n - True: Force all values of X to be finite.\n - False: accepts np.inf, np.nan, pd.NA in X.\n - 'allow-nan': accepts only np.nan and pd.NA values in X. Values cannot\n be infinite.\n\n .. versionadded:: 0.20\n ``force_all_finite`` accepts the string ``'allow-nan'``.\n\n .. versionchanged:: 0.23\n Accepts `pd.NA` and converts it into `np.nan`\n\nReturns\n-------\nXT : {ndarray, sparse matrix}\n An array of type float.", - "source_code": "\ndef as_float_array(X, *, copy=True, force_all_finite=True):\n \"\"\"Converts an array-like to an array of floats.\n\n The new dtype will be np.float32 or np.float64, depending on the original\n type. The function can create a copy or modify the argument depending\n on the argument copy.\n\n Parameters\n ----------\n X : {array-like, sparse matrix}\n\n copy : bool, default=True\n If True, a copy of X will be created. If False, a copy may still be\n returned if X's dtype is not a floating point type.\n\n force_all_finite : bool or 'allow-nan', default=True\n Whether to raise an error on np.inf, np.nan, pd.NA in X. The\n possibilities are:\n\n - True: Force all values of X to be finite.\n - False: accepts np.inf, np.nan, pd.NA in X.\n - 'allow-nan': accepts only np.nan and pd.NA values in X. Values cannot\n be infinite.\n\n .. versionadded:: 0.20\n ``force_all_finite`` accepts the string ``'allow-nan'``.\n\n .. versionchanged:: 0.23\n Accepts `pd.NA` and converts it into `np.nan`\n\n Returns\n -------\n XT : {ndarray, sparse matrix}\n An array of type float.\n \"\"\"\n if isinstance(X, np.matrix) or not isinstance(X, np.ndarray) and not sp.issparse(X):\n return check_array(X, accept_sparse=['csr', 'csc', 'coo'], dtype=np.float64, copy=copy, force_all_finite=force_all_finite, ensure_2d=False)\n elif sp.issparse(X) and X.dtype in [np.float32, np.float64]:\n return X.copy() if copy else X\n elif X.dtype in [np.float32, np.float64]:\n return X.copy('F' if X.flags['F_CONTIGUOUS'] else 'C') if copy else X\n else:\n if X.dtype.kind in 'uib' and X.dtype.itemsize <= 4:\n return_dtype = np.float32\n else:\n return_dtype = np.float64\n return X.astype(return_dtype)" + "description": "Convert an array-like to an array of floats.\n\nThe new dtype will be np.float32 or np.float64, depending on the original\ntype. The function can create a copy or modify the argument depending\non the argument copy.", + "docstring": "Convert an array-like to an array of floats.\n\n The new dtype will be np.float32 or np.float64, depending on the original\n type. The function can create a copy or modify the argument depending\n on the argument copy.\n\n Parameters\n ----------\n X : {array-like, sparse matrix}\n The input data.\n\n copy : bool, default=True\n If True, a copy of X will be created. If False, a copy may still be\n returned if X's dtype is not a floating point type.\n\n force_all_finite : bool or 'allow-nan', default=True\n Whether to raise an error on np.inf, np.nan, pd.NA in X. The\n possibilities are:\n\n - True: Force all values of X to be finite.\n - False: accepts np.inf, np.nan, pd.NA in X.\n - 'allow-nan': accepts only np.nan and pd.NA values in X. Values cannot\n be infinite.\n\n .. versionadded:: 0.20\n ``force_all_finite`` accepts the string ``'allow-nan'``.\n\n .. 
versionchanged:: 0.23\n Accepts `pd.NA` and converts it into `np.nan`\n\n Returns\n -------\n XT : {ndarray, sparse matrix}\n An array of type float.\n ", + "source_code": "\ndef as_float_array(X, *, copy=True, force_all_finite=True):\n \"\"\"Convert an array-like to an array of floats.\n\n The new dtype will be np.float32 or np.float64, depending on the original\n type. The function can create a copy or modify the argument depending\n on the argument copy.\n\n Parameters\n ----------\n X : {array-like, sparse matrix}\n The input data.\n\n copy : bool, default=True\n If True, a copy of X will be created. If False, a copy may still be\n returned if X's dtype is not a floating point type.\n\n force_all_finite : bool or 'allow-nan', default=True\n Whether to raise an error on np.inf, np.nan, pd.NA in X. The\n possibilities are:\n\n - True: Force all values of X to be finite.\n - False: accepts np.inf, np.nan, pd.NA in X.\n - 'allow-nan': accepts only np.nan and pd.NA values in X. Values cannot\n be infinite.\n\n .. versionadded:: 0.20\n ``force_all_finite`` accepts the string ``'allow-nan'``.\n\n .. versionchanged:: 0.23\n Accepts `pd.NA` and converts it into `np.nan`\n\n Returns\n -------\n XT : {ndarray, sparse matrix}\n An array of type float.\n \"\"\"\n if isinstance(X, np.matrix) or not isinstance(X, np.ndarray) and not sp.issparse(X):\n return check_array(X, accept_sparse=['csr', 'csc', 'coo'], dtype=np.float64, copy=copy, force_all_finite=force_all_finite, ensure_2d=False)\n elif sp.issparse(X) and X.dtype in [np.float32, np.float64]:\n return X.copy() if copy else X\n elif X.dtype in [np.float32, np.float64]:\n return X.copy('F' if X.flags['F_CONTIGUOUS'] else 'C') if copy else X\n else:\n if X.dtype.kind in 'uib' and X.dtype.itemsize <= 4:\n return_dtype = np.float32\n else:\n return_dtype = np.float64\n return X.astype(return_dtype)" }, { "name": "assert_all_finite", @@ -177358,6 +191477,10 @@ "docstring": { "type": "{ndarray, sparse matrix}", "description": "" + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -177368,13 +191491,14 @@ "docstring": { "type": "bool, default=False", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Throw a ValueError if X contains NaN or infinity.", - "docstring": "Throw a ValueError if X contains NaN or infinity.\n\nParameters\n----------\nX : {ndarray, sparse matrix}\n\nallow_nan : bool, default=False", + "docstring": "Throw a ValueError if X contains NaN or infinity.\n\n Parameters\n ----------\n X : {ndarray, sparse matrix}\n\n allow_nan : bool, default=False\n ", "source_code": "\ndef assert_all_finite(X, *, allow_nan=False):\n \"\"\"Throw a ValueError if X contains NaN or infinity.\n\n Parameters\n ----------\n X : {ndarray, sparse matrix}\n\n allow_nan : bool, default=False\n \"\"\"\n _assert_all_finite(X.data if sp.issparse(X) else X, allow_nan)" }, { @@ -177392,6 +191516,10 @@ "docstring": { "type": "{ndarray, list, sparse matrix}", "description": "Input data." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -177402,6 +191530,10 @@ "docstring": { "type": "{ndarray, list, sparse matrix}", "description": "Labels." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -177412,7 +191544,8 @@ "docstring": { "type": "str, bool or list of str, default=False", "description": "String[s] representing allowed sparse matrix formats, such as 'csc',\n'csr', etc. 
If the input is sparse but not in the allowed format,\nit will be converted to the first listed format. True allows the input\nto be any format. False means that a sparse matrix input will\nraise an error." - } + }, + "refined_type": {} }, { "name": "accept_large_sparse", @@ -177422,7 +191555,8 @@ "docstring": { "type": "bool, default=True", "description": "If a CSR, CSC, COO or BSR sparse matrix is supplied and accepted by\naccept_sparse, accept_large_sparse will cause it to be accepted only\nif its indices are stored with a 32-bit dtype.\n\n.. versionadded:: 0.20" - } + }, + "refined_type": {} }, { "name": "dtype", @@ -177432,7 +191566,8 @@ "docstring": { "type": "'numeric', type, list of type or None, default='numeric'", "description": "Data type of result. If None, the dtype of the input is preserved.\nIf \"numeric\", dtype is preserved unless array.dtype is object.\nIf dtype is a list of types, conversion on the first type is only\nperformed if the dtype of the input is not in the list." - } + }, + "refined_type": {} }, { "name": "order", @@ -177442,6 +191577,10 @@ "docstring": { "type": "{'F', 'C'}, default=None", "description": "Whether an array will be forced to be fortran or c-style." + }, + "refined_type": { + "kind": "EnumType", + "values": ["F", "C"] } }, { @@ -177452,7 +191591,8 @@ "docstring": { "type": "bool, default=False", "description": "Whether a forced copy will be triggered. If copy=False, a copy might\nbe triggered by a conversion." - } + }, + "refined_type": {} }, { "name": "force_all_finite", @@ -177462,7 +191602,8 @@ "docstring": { "type": "bool or 'allow-nan', default=True", "description": "Whether to raise an error on np.inf, np.nan, pd.NA in X. This parameter\ndoes not influence whether y can have np.inf, np.nan, pd.NA values.\nThe possibilities are:\n\n- True: Force all values of X to be finite.\n- False: accepts np.inf, np.nan, pd.NA in X.\n- 'allow-nan': accepts only np.nan or pd.NA values in X. Values cannot\n be infinite.\n\n.. versionadded:: 0.20\n ``force_all_finite`` accepts the string ``'allow-nan'``.\n\n.. versionchanged:: 0.23\n Accepts `pd.NA` and converts it into `np.nan`" - } + }, + "refined_type": {} }, { "name": "ensure_2d", @@ -177472,7 +191613,8 @@ "docstring": { "type": "bool, default=True", "description": "Whether to raise a value error if X is not 2D." - } + }, + "refined_type": {} }, { "name": "allow_nd", @@ -177482,7 +191624,8 @@ "docstring": { "type": "bool, default=False", "description": "Whether to allow X.ndim > 2." - } + }, + "refined_type": {} }, { "name": "multi_output", @@ -177492,7 +191635,8 @@ "docstring": { "type": "bool, default=False", "description": "Whether to allow 2D y (array or sparse matrix). If false, y will be\nvalidated as a vector. y cannot have np.nan or np.inf values if\nmulti_output=True." - } + }, + "refined_type": {} }, { "name": "ensure_min_samples", @@ -177502,7 +191646,8 @@ "docstring": { "type": "int, default=1", "description": "Make sure that X has a minimum number of samples in its first\naxis (rows for a 2D array)." - } + }, + "refined_type": {} }, { "name": "ensure_min_features", @@ -177512,7 +191657,8 @@ "docstring": { "type": "int, default=1", "description": "Make sure that the 2D array has some minimum number of features\n(columns). The default value of 1 rejects empty datasets.\nThis check is only enforced when X has effectively 2 dimensions or\nis originally 1D and ``ensure_2d`` is True. Setting to 0 disables\nthis check." 
- } + }, + "refined_type": {} }, { "name": "y_numeric", @@ -177522,7 +191668,8 @@ "docstring": { "type": "bool, default=False", "description": "Whether to ensure that y has a numeric type. If dtype of y is object,\nit is converted to float64. Should only be used for regression\nalgorithms." - } + }, + "refined_type": {} }, { "name": "estimator", @@ -177532,13 +191679,14 @@ "docstring": { "type": "str or estimator instance, default=None", "description": "If passed, include the name of the estimator in warning messages." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Input validation for standard estimators.\n\nChecks X and y for consistent length, enforces X to be 2D and y 1D. By default, X is checked to be non-empty and containing only finite values. Standard input checks are also applied to y, such as checking that y does not have np.nan or np.inf targets. For multi-label y, set multi_output=True to allow 2D and sparse y. If the dtype of X is object, attempt converting to float, raising on failure.", - "docstring": "Input validation for standard estimators.\n\nChecks X and y for consistent length, enforces X to be 2D and y 1D. By\ndefault, X is checked to be non-empty and containing only finite values.\nStandard input checks are also applied to y, such as checking that y\ndoes not have np.nan or np.inf targets. For multi-label y, set\nmulti_output=True to allow 2D and sparse y. If the dtype of X is\nobject, attempt converting to float, raising on failure.\n\nParameters\n----------\nX : {ndarray, list, sparse matrix}\n Input data.\n\ny : {ndarray, list, sparse matrix}\n Labels.\n\naccept_sparse : str, bool or list of str, default=False\n String[s] representing allowed sparse matrix formats, such as 'csc',\n 'csr', etc. If the input is sparse but not in the allowed format,\n it will be converted to the first listed format. True allows the input\n to be any format. False means that a sparse matrix input will\n raise an error.\n\naccept_large_sparse : bool, default=True\n If a CSR, CSC, COO or BSR sparse matrix is supplied and accepted by\n accept_sparse, accept_large_sparse will cause it to be accepted only\n if its indices are stored with a 32-bit dtype.\n\n .. versionadded:: 0.20\n\ndtype : 'numeric', type, list of type or None, default='numeric'\n Data type of result. If None, the dtype of the input is preserved.\n If \"numeric\", dtype is preserved unless array.dtype is object.\n If dtype is a list of types, conversion on the first type is only\n performed if the dtype of the input is not in the list.\n\norder : {'F', 'C'}, default=None\n Whether an array will be forced to be fortran or c-style.\n\ncopy : bool, default=False\n Whether a forced copy will be triggered. If copy=False, a copy might\n be triggered by a conversion.\n\nforce_all_finite : bool or 'allow-nan', default=True\n Whether to raise an error on np.inf, np.nan, pd.NA in X. This parameter\n does not influence whether y can have np.inf, np.nan, pd.NA values.\n The possibilities are:\n\n - True: Force all values of X to be finite.\n - False: accepts np.inf, np.nan, pd.NA in X.\n - 'allow-nan': accepts only np.nan or pd.NA values in X. Values cannot\n be infinite.\n\n .. versionadded:: 0.20\n ``force_all_finite`` accepts the string ``'allow-nan'``.\n\n .. 
versionchanged:: 0.23\n Accepts `pd.NA` and converts it into `np.nan`\n\nensure_2d : bool, default=True\n Whether to raise a value error if X is not 2D.\n\nallow_nd : bool, default=False\n Whether to allow X.ndim > 2.\n\nmulti_output : bool, default=False\n Whether to allow 2D y (array or sparse matrix). If false, y will be\n validated as a vector. y cannot have np.nan or np.inf values if\n multi_output=True.\n\nensure_min_samples : int, default=1\n Make sure that X has a minimum number of samples in its first\n axis (rows for a 2D array).\n\nensure_min_features : int, default=1\n Make sure that the 2D array has some minimum number of features\n (columns). The default value of 1 rejects empty datasets.\n This check is only enforced when X has effectively 2 dimensions or\n is originally 1D and ``ensure_2d`` is True. Setting to 0 disables\n this check.\n\ny_numeric : bool, default=False\n Whether to ensure that y has a numeric type. If dtype of y is object,\n it is converted to float64. Should only be used for regression\n algorithms.\n\nestimator : str or estimator instance, default=None\n If passed, include the name of the estimator in warning messages.\n\nReturns\n-------\nX_converted : object\n The converted and validated X.\n\ny_converted : object\n The converted and validated y.", + "description": "Input validation for standard estimators.\n\nChecks X and y for consistent length, enforces X to be 2D and y 1D. By\ndefault, X is checked to be non-empty and containing only finite values.\nStandard input checks are also applied to y, such as checking that y\ndoes not have np.nan or np.inf targets. For multi-label y, set\nmulti_output=True to allow 2D and sparse y. If the dtype of X is\nobject, attempt converting to float, raising on failure.", + "docstring": "Input validation for standard estimators.\n\n Checks X and y for consistent length, enforces X to be 2D and y 1D. By\n default, X is checked to be non-empty and containing only finite values.\n Standard input checks are also applied to y, such as checking that y\n does not have np.nan or np.inf targets. For multi-label y, set\n multi_output=True to allow 2D and sparse y. If the dtype of X is\n object, attempt converting to float, raising on failure.\n\n Parameters\n ----------\n X : {ndarray, list, sparse matrix}\n Input data.\n\n y : {ndarray, list, sparse matrix}\n Labels.\n\n accept_sparse : str, bool or list of str, default=False\n String[s] representing allowed sparse matrix formats, such as 'csc',\n 'csr', etc. If the input is sparse but not in the allowed format,\n it will be converted to the first listed format. True allows the input\n to be any format. False means that a sparse matrix input will\n raise an error.\n\n accept_large_sparse : bool, default=True\n If a CSR, CSC, COO or BSR sparse matrix is supplied and accepted by\n accept_sparse, accept_large_sparse will cause it to be accepted only\n if its indices are stored with a 32-bit dtype.\n\n .. versionadded:: 0.20\n\n dtype : 'numeric', type, list of type or None, default='numeric'\n Data type of result. If None, the dtype of the input is preserved.\n If \"numeric\", dtype is preserved unless array.dtype is object.\n If dtype is a list of types, conversion on the first type is only\n performed if the dtype of the input is not in the list.\n\n order : {'F', 'C'}, default=None\n Whether an array will be forced to be fortran or c-style.\n\n copy : bool, default=False\n Whether a forced copy will be triggered. 
If copy=False, a copy might\n be triggered by a conversion.\n\n force_all_finite : bool or 'allow-nan', default=True\n Whether to raise an error on np.inf, np.nan, pd.NA in X. This parameter\n does not influence whether y can have np.inf, np.nan, pd.NA values.\n The possibilities are:\n\n - True: Force all values of X to be finite.\n - False: accepts np.inf, np.nan, pd.NA in X.\n - 'allow-nan': accepts only np.nan or pd.NA values in X. Values cannot\n be infinite.\n\n .. versionadded:: 0.20\n ``force_all_finite`` accepts the string ``'allow-nan'``.\n\n .. versionchanged:: 0.23\n Accepts `pd.NA` and converts it into `np.nan`\n\n ensure_2d : bool, default=True\n Whether to raise a value error if X is not 2D.\n\n allow_nd : bool, default=False\n Whether to allow X.ndim > 2.\n\n multi_output : bool, default=False\n Whether to allow 2D y (array or sparse matrix). If false, y will be\n validated as a vector. y cannot have np.nan or np.inf values if\n multi_output=True.\n\n ensure_min_samples : int, default=1\n Make sure that X has a minimum number of samples in its first\n axis (rows for a 2D array).\n\n ensure_min_features : int, default=1\n Make sure that the 2D array has some minimum number of features\n (columns). The default value of 1 rejects empty datasets.\n This check is only enforced when X has effectively 2 dimensions or\n is originally 1D and ``ensure_2d`` is True. Setting to 0 disables\n this check.\n\n y_numeric : bool, default=False\n Whether to ensure that y has a numeric type. If dtype of y is object,\n it is converted to float64. Should only be used for regression\n algorithms.\n\n estimator : str or estimator instance, default=None\n If passed, include the name of the estimator in warning messages.\n\n Returns\n -------\n X_converted : object\n The converted and validated X.\n\n y_converted : object\n The converted and validated y.\n ", "source_code": "\ndef check_X_y(X, y, accept_sparse=False, *, accept_large_sparse=True, dtype='numeric', order=None, copy=False, force_all_finite=True, ensure_2d=True, allow_nd=False, multi_output=False, ensure_min_samples=1, ensure_min_features=1, y_numeric=False, estimator=None):\n \"\"\"Input validation for standard estimators.\n\n Checks X and y for consistent length, enforces X to be 2D and y 1D. By\n default, X is checked to be non-empty and containing only finite values.\n Standard input checks are also applied to y, such as checking that y\n does not have np.nan or np.inf targets. For multi-label y, set\n multi_output=True to allow 2D and sparse y. If the dtype of X is\n object, attempt converting to float, raising on failure.\n\n Parameters\n ----------\n X : {ndarray, list, sparse matrix}\n Input data.\n\n y : {ndarray, list, sparse matrix}\n Labels.\n\n accept_sparse : str, bool or list of str, default=False\n String[s] representing allowed sparse matrix formats, such as 'csc',\n 'csr', etc. If the input is sparse but not in the allowed format,\n it will be converted to the first listed format. True allows the input\n to be any format. False means that a sparse matrix input will\n raise an error.\n\n accept_large_sparse : bool, default=True\n If a CSR, CSC, COO or BSR sparse matrix is supplied and accepted by\n accept_sparse, accept_large_sparse will cause it to be accepted only\n if its indices are stored with a 32-bit dtype.\n\n .. versionadded:: 0.20\n\n dtype : 'numeric', type, list of type or None, default='numeric'\n Data type of result. 
If None, the dtype of the input is preserved.\n If \"numeric\", dtype is preserved unless array.dtype is object.\n If dtype is a list of types, conversion on the first type is only\n performed if the dtype of the input is not in the list.\n\n order : {'F', 'C'}, default=None\n Whether an array will be forced to be fortran or c-style.\n\n copy : bool, default=False\n Whether a forced copy will be triggered. If copy=False, a copy might\n be triggered by a conversion.\n\n force_all_finite : bool or 'allow-nan', default=True\n Whether to raise an error on np.inf, np.nan, pd.NA in X. This parameter\n does not influence whether y can have np.inf, np.nan, pd.NA values.\n The possibilities are:\n\n - True: Force all values of X to be finite.\n - False: accepts np.inf, np.nan, pd.NA in X.\n - 'allow-nan': accepts only np.nan or pd.NA values in X. Values cannot\n be infinite.\n\n .. versionadded:: 0.20\n ``force_all_finite`` accepts the string ``'allow-nan'``.\n\n .. versionchanged:: 0.23\n Accepts `pd.NA` and converts it into `np.nan`\n\n ensure_2d : bool, default=True\n Whether to raise a value error if X is not 2D.\n\n allow_nd : bool, default=False\n Whether to allow X.ndim > 2.\n\n multi_output : bool, default=False\n Whether to allow 2D y (array or sparse matrix). If false, y will be\n validated as a vector. y cannot have np.nan or np.inf values if\n multi_output=True.\n\n ensure_min_samples : int, default=1\n Make sure that X has a minimum number of samples in its first\n axis (rows for a 2D array).\n\n ensure_min_features : int, default=1\n Make sure that the 2D array has some minimum number of features\n (columns). The default value of 1 rejects empty datasets.\n This check is only enforced when X has effectively 2 dimensions or\n is originally 1D and ``ensure_2d`` is True. Setting to 0 disables\n this check.\n\n y_numeric : bool, default=False\n Whether to ensure that y has a numeric type. If dtype of y is object,\n it is converted to float64. Should only be used for regression\n algorithms.\n\n estimator : str or estimator instance, default=None\n If passed, include the name of the estimator in warning messages.\n\n Returns\n -------\n X_converted : object\n The converted and validated X.\n\n y_converted : object\n The converted and validated y.\n \"\"\"\n if y is None:\n raise ValueError('y cannot be None')\n X = check_array(X, accept_sparse=accept_sparse, accept_large_sparse=accept_large_sparse, dtype=dtype, order=order, copy=copy, force_all_finite=force_all_finite, ensure_2d=ensure_2d, allow_nd=allow_nd, ensure_min_samples=ensure_min_samples, ensure_min_features=ensure_min_features, estimator=estimator)\n y = _check_y(y, multi_output=multi_output, y_numeric=y_numeric)\n check_consistent_length(X, y)\n return X, y" }, { @@ -177556,7 +191704,8 @@ "docstring": { "type": "object", "description": "Input object to check / convert." - } + }, + "refined_type": {} }, { "name": "accept_sparse", @@ -177566,7 +191715,8 @@ "docstring": { "type": "str, bool or list/tuple of str, default=False", "description": "String[s] representing allowed sparse matrix formats, such as 'csc',\n'csr', etc. If the input is sparse but not in the allowed format,\nit will be converted to the first listed format. True allows the input\nto be any format. False means that a sparse matrix input will\nraise an error." 
- } + }, + "refined_type": {} }, { "name": "accept_large_sparse", @@ -177576,7 +191726,8 @@ "docstring": { "type": "bool, default=True", "description": "If a CSR, CSC, COO or BSR sparse matrix is supplied and accepted by\naccept_sparse, accept_large_sparse=False will cause it to be accepted\nonly if its indices are stored with a 32-bit dtype.\n\n.. versionadded:: 0.20" - } + }, + "refined_type": {} }, { "name": "dtype", @@ -177586,7 +191737,8 @@ "docstring": { "type": "'numeric', type, list of type or None, default='numeric'", "description": "Data type of result. If None, the dtype of the input is preserved.\nIf \"numeric\", dtype is preserved unless array.dtype is object.\nIf dtype is a list of types, conversion on the first type is only\nperformed if the dtype of the input is not in the list." - } + }, + "refined_type": {} }, { "name": "order", @@ -177596,6 +191748,10 @@ "docstring": { "type": "{'F', 'C'} or None, default=None", "description": "Whether an array will be forced to be fortran or c-style.\nWhen order is None (default), then if copy=False, nothing is ensured\nabout the memory layout of the output array; otherwise (copy=True)\nthe memory layout of the returned array is kept as close as possible\nto the original array." + }, + "refined_type": { + "kind": "EnumType", + "values": ["F", "C"] } }, { @@ -177606,7 +191762,8 @@ "docstring": { "type": "bool, default=False", "description": "Whether a forced copy will be triggered. If copy=False, a copy might\nbe triggered by a conversion." - } + }, + "refined_type": {} }, { "name": "force_all_finite", @@ -177616,7 +191773,8 @@ "docstring": { "type": "bool or 'allow-nan', default=True", "description": "Whether to raise an error on np.inf, np.nan, pd.NA in array. The\npossibilities are:\n\n- True: Force all values of array to be finite.\n- False: accepts np.inf, np.nan, pd.NA in array.\n- 'allow-nan': accepts only np.nan and pd.NA values in array. Values\n cannot be infinite.\n\n.. versionadded:: 0.20\n ``force_all_finite`` accepts the string ``'allow-nan'``.\n\n.. versionchanged:: 0.23\n Accepts `pd.NA` and converts it into `np.nan`" - } + }, + "refined_type": {} }, { "name": "ensure_2d", @@ -177626,7 +191784,8 @@ "docstring": { "type": "bool, default=True", "description": "Whether to raise a value error if array is not 2D." - } + }, + "refined_type": {} }, { "name": "allow_nd", @@ -177636,7 +191795,8 @@ "docstring": { "type": "bool, default=False", "description": "Whether to allow array.ndim > 2." - } + }, + "refined_type": {} }, { "name": "ensure_min_samples", @@ -177646,7 +191806,8 @@ "docstring": { "type": "int, default=1", "description": "Make sure that the array has a minimum number of samples in its first\naxis (rows for a 2D array). Setting to 0 disables this check." - } + }, + "refined_type": {} }, { "name": "ensure_min_features", @@ -177656,7 +191817,8 @@ "docstring": { "type": "int, default=1", "description": "Make sure that the 2D array has some minimum number of features\n(columns). The default value of 1 rejects empty datasets.\nThis check is only enforced when the input data has effectively 2\ndimensions or is originally 1D and ``ensure_2d`` is True. Setting to 0\ndisables this check." - } + }, + "refined_type": {} }, { "name": "estimator", @@ -177666,13 +191828,14 @@ "docstring": { "type": "str or estimator instance, default=None", "description": "If passed, include the name of the estimator in warning messages." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Input validation on an array, list, sparse matrix or similar.\n\nBy default, the input is checked to be a non-empty 2D array containing only finite values. If the dtype of the array is object, attempt converting to float, raising on failure.", - "docstring": "Input validation on an array, list, sparse matrix or similar.\n\nBy default, the input is checked to be a non-empty 2D array containing\nonly finite values. If the dtype of the array is object, attempt\nconverting to float, raising on failure.\n\nParameters\n----------\narray : object\n Input object to check / convert.\n\naccept_sparse : str, bool or list/tuple of str, default=False\n String[s] representing allowed sparse matrix formats, such as 'csc',\n 'csr', etc. If the input is sparse but not in the allowed format,\n it will be converted to the first listed format. True allows the input\n to be any format. False means that a sparse matrix input will\n raise an error.\n\naccept_large_sparse : bool, default=True\n If a CSR, CSC, COO or BSR sparse matrix is supplied and accepted by\n accept_sparse, accept_large_sparse=False will cause it to be accepted\n only if its indices are stored with a 32-bit dtype.\n\n .. versionadded:: 0.20\n\ndtype : 'numeric', type, list of type or None, default='numeric'\n Data type of result. If None, the dtype of the input is preserved.\n If \"numeric\", dtype is preserved unless array.dtype is object.\n If dtype is a list of types, conversion on the first type is only\n performed if the dtype of the input is not in the list.\n\norder : {'F', 'C'} or None, default=None\n Whether an array will be forced to be fortran or c-style.\n When order is None (default), then if copy=False, nothing is ensured\n about the memory layout of the output array; otherwise (copy=True)\n the memory layout of the returned array is kept as close as possible\n to the original array.\n\ncopy : bool, default=False\n Whether a forced copy will be triggered. If copy=False, a copy might\n be triggered by a conversion.\n\nforce_all_finite : bool or 'allow-nan', default=True\n Whether to raise an error on np.inf, np.nan, pd.NA in array. The\n possibilities are:\n\n - True: Force all values of array to be finite.\n - False: accepts np.inf, np.nan, pd.NA in array.\n - 'allow-nan': accepts only np.nan and pd.NA values in array. Values\n cannot be infinite.\n\n .. versionadded:: 0.20\n ``force_all_finite`` accepts the string ``'allow-nan'``.\n\n .. versionchanged:: 0.23\n Accepts `pd.NA` and converts it into `np.nan`\n\nensure_2d : bool, default=True\n Whether to raise a value error if array is not 2D.\n\nallow_nd : bool, default=False\n Whether to allow array.ndim > 2.\n\nensure_min_samples : int, default=1\n Make sure that the array has a minimum number of samples in its first\n axis (rows for a 2D array). Setting to 0 disables this check.\n\nensure_min_features : int, default=1\n Make sure that the 2D array has some minimum number of features\n (columns). The default value of 1 rejects empty datasets.\n This check is only enforced when the input data has effectively 2\n dimensions or is originally 1D and ``ensure_2d`` is True. 
Setting to 0\n disables this check.\n\nestimator : str or estimator instance, default=None\n If passed, include the name of the estimator in warning messages.\n\nReturns\n-------\narray_converted : object\n The converted and validated array.", + "description": "Input validation on an array, list, sparse matrix or similar.\n\nBy default, the input is checked to be a non-empty 2D array containing\nonly finite values. If the dtype of the array is object, attempt\nconverting to float, raising on failure.", + "docstring": "Input validation on an array, list, sparse matrix or similar.\n\n By default, the input is checked to be a non-empty 2D array containing\n only finite values. If the dtype of the array is object, attempt\n converting to float, raising on failure.\n\n Parameters\n ----------\n array : object\n Input object to check / convert.\n\n accept_sparse : str, bool or list/tuple of str, default=False\n String[s] representing allowed sparse matrix formats, such as 'csc',\n 'csr', etc. If the input is sparse but not in the allowed format,\n it will be converted to the first listed format. True allows the input\n to be any format. False means that a sparse matrix input will\n raise an error.\n\n accept_large_sparse : bool, default=True\n If a CSR, CSC, COO or BSR sparse matrix is supplied and accepted by\n accept_sparse, accept_large_sparse=False will cause it to be accepted\n only if its indices are stored with a 32-bit dtype.\n\n .. versionadded:: 0.20\n\n dtype : 'numeric', type, list of type or None, default='numeric'\n Data type of result. If None, the dtype of the input is preserved.\n If \"numeric\", dtype is preserved unless array.dtype is object.\n If dtype is a list of types, conversion on the first type is only\n performed if the dtype of the input is not in the list.\n\n order : {'F', 'C'} or None, default=None\n Whether an array will be forced to be fortran or c-style.\n When order is None (default), then if copy=False, nothing is ensured\n about the memory layout of the output array; otherwise (copy=True)\n the memory layout of the returned array is kept as close as possible\n to the original array.\n\n copy : bool, default=False\n Whether a forced copy will be triggered. If copy=False, a copy might\n be triggered by a conversion.\n\n force_all_finite : bool or 'allow-nan', default=True\n Whether to raise an error on np.inf, np.nan, pd.NA in array. The\n possibilities are:\n\n - True: Force all values of array to be finite.\n - False: accepts np.inf, np.nan, pd.NA in array.\n - 'allow-nan': accepts only np.nan and pd.NA values in array. Values\n cannot be infinite.\n\n .. versionadded:: 0.20\n ``force_all_finite`` accepts the string ``'allow-nan'``.\n\n .. versionchanged:: 0.23\n Accepts `pd.NA` and converts it into `np.nan`\n\n ensure_2d : bool, default=True\n Whether to raise a value error if array is not 2D.\n\n allow_nd : bool, default=False\n Whether to allow array.ndim > 2.\n\n ensure_min_samples : int, default=1\n Make sure that the array has a minimum number of samples in its first\n axis (rows for a 2D array). Setting to 0 disables this check.\n\n ensure_min_features : int, default=1\n Make sure that the 2D array has some minimum number of features\n (columns). The default value of 1 rejects empty datasets.\n This check is only enforced when the input data has effectively 2\n dimensions or is originally 1D and ``ensure_2d`` is True. 
Setting to 0\n disables this check.\n\n estimator : str or estimator instance, default=None\n If passed, include the name of the estimator in warning messages.\n\n Returns\n -------\n array_converted : object\n The converted and validated array.\n ", "source_code": "\ndef check_array(array, accept_sparse=False, *, accept_large_sparse=True, dtype='numeric', order=None, copy=False, force_all_finite=True, ensure_2d=True, allow_nd=False, ensure_min_samples=1, ensure_min_features=1, estimator=None):\n \"\"\"Input validation on an array, list, sparse matrix or similar.\n\n By default, the input is checked to be a non-empty 2D array containing\n only finite values. If the dtype of the array is object, attempt\n converting to float, raising on failure.\n\n Parameters\n ----------\n array : object\n Input object to check / convert.\n\n accept_sparse : str, bool or list/tuple of str, default=False\n String[s] representing allowed sparse matrix formats, such as 'csc',\n 'csr', etc. If the input is sparse but not in the allowed format,\n it will be converted to the first listed format. True allows the input\n to be any format. False means that a sparse matrix input will\n raise an error.\n\n accept_large_sparse : bool, default=True\n If a CSR, CSC, COO or BSR sparse matrix is supplied and accepted by\n accept_sparse, accept_large_sparse=False will cause it to be accepted\n only if its indices are stored with a 32-bit dtype.\n\n .. versionadded:: 0.20\n\n dtype : 'numeric', type, list of type or None, default='numeric'\n Data type of result. If None, the dtype of the input is preserved.\n If \"numeric\", dtype is preserved unless array.dtype is object.\n If dtype is a list of types, conversion on the first type is only\n performed if the dtype of the input is not in the list.\n\n order : {'F', 'C'} or None, default=None\n Whether an array will be forced to be fortran or c-style.\n When order is None (default), then if copy=False, nothing is ensured\n about the memory layout of the output array; otherwise (copy=True)\n the memory layout of the returned array is kept as close as possible\n to the original array.\n\n copy : bool, default=False\n Whether a forced copy will be triggered. If copy=False, a copy might\n be triggered by a conversion.\n\n force_all_finite : bool or 'allow-nan', default=True\n Whether to raise an error on np.inf, np.nan, pd.NA in array. The\n possibilities are:\n\n - True: Force all values of array to be finite.\n - False: accepts np.inf, np.nan, pd.NA in array.\n - 'allow-nan': accepts only np.nan and pd.NA values in array. Values\n cannot be infinite.\n\n .. versionadded:: 0.20\n ``force_all_finite`` accepts the string ``'allow-nan'``.\n\n .. versionchanged:: 0.23\n Accepts `pd.NA` and converts it into `np.nan`\n\n ensure_2d : bool, default=True\n Whether to raise a value error if array is not 2D.\n\n allow_nd : bool, default=False\n Whether to allow array.ndim > 2.\n\n ensure_min_samples : int, default=1\n Make sure that the array has a minimum number of samples in its first\n axis (rows for a 2D array). Setting to 0 disables this check.\n\n ensure_min_features : int, default=1\n Make sure that the 2D array has some minimum number of features\n (columns). The default value of 1 rejects empty datasets.\n This check is only enforced when the input data has effectively 2\n dimensions or is originally 1D and ``ensure_2d`` is True. 
Setting to 0\n disables this check.\n\n estimator : str or estimator instance, default=None\n If passed, include the name of the estimator in warning messages.\n\n Returns\n -------\n array_converted : object\n The converted and validated array.\n \"\"\"\n if isinstance(array, np.matrix):\n warnings.warn('np.matrix usage is deprecated in 1.0 and will raise a TypeError in 1.2. Please convert to a numpy array with np.asarray. For more information see: https://numpy.org/doc/stable/reference/generated/numpy.matrix.html', FutureWarning)\n array_orig = array\n dtype_numeric = isinstance(dtype, str) and dtype == 'numeric'\n dtype_orig = getattr(array, 'dtype', None)\n if not hasattr(dtype_orig, 'kind'):\n dtype_orig = None\n dtypes_orig = None\n has_pd_integer_array = False\n if hasattr(array, 'dtypes') and hasattr(array.dtypes, '__array__'):\n with suppress(ImportError):\n from pandas.api.types import is_sparse\n if not hasattr(array, 'sparse') and array.dtypes.apply(is_sparse).any():\n warnings.warn('pandas.DataFrame with sparse columns found.It will be converted to a dense numpy array.')\n dtypes_orig = list(array.dtypes)\n for (i, dtype_iter) in enumerate(dtypes_orig):\n if dtype_iter.kind == 'b':\n dtypes_orig[i] = np.dtype(object)\n elif dtype_iter.name.startswith(('Int', 'UInt')):\n with suppress(ImportError):\n from pandas import Int8Dtype, Int16Dtype, Int32Dtype, Int64Dtype, UInt8Dtype, UInt16Dtype, UInt32Dtype, UInt64Dtype\n if isinstance(dtype_iter, (Int8Dtype, Int16Dtype, Int32Dtype, Int64Dtype, UInt8Dtype, UInt16Dtype, UInt32Dtype, UInt64Dtype)):\n has_pd_integer_array = True\n if all((isinstance(dtype, np.dtype) for dtype in dtypes_orig)):\n dtype_orig = np.result_type(*dtypes_orig)\n if dtype_numeric:\n if dtype_orig is not None and dtype_orig.kind == 'O':\n dtype = np.float64\n else:\n dtype = None\n if isinstance(dtype, (list, tuple)):\n if dtype_orig is not None and dtype_orig in dtype:\n dtype = None\n else:\n dtype = dtype[0]\n if has_pd_integer_array:\n array = array.astype(dtype)\n if force_all_finite not in (True, False, 'allow-nan'):\n raise ValueError('force_all_finite should be a bool or \"allow-nan\". 
Got {!r} instead'.format(force_all_finite))\n if estimator is not None:\n if isinstance(estimator, str):\n estimator_name = estimator\n else:\n estimator_name = estimator.__class__.__name__\n else:\n estimator_name = 'Estimator'\n context = ' by %s' % estimator_name if estimator is not None else ''\n if hasattr(array, 'sparse') and array.ndim > 1:\n array = array.sparse.to_coo()\n if array.dtype == np.dtype('object'):\n unique_dtypes = set([dt.subtype.name for dt in array_orig.dtypes])\n if len(unique_dtypes) > 1:\n raise ValueError('Pandas DataFrame with mixed sparse extension arrays generated a sparse matrix with object dtype which can not be converted to a scipy sparse matrix.Sparse extension arrays should all have the same numeric type.')\n if sp.issparse(array):\n _ensure_no_complex_data(array)\n array = _ensure_sparse_format(array, accept_sparse=accept_sparse, dtype=dtype, copy=copy, force_all_finite=force_all_finite, accept_large_sparse=accept_large_sparse)\n else:\n with warnings.catch_warnings():\n try:\n warnings.simplefilter('error', ComplexWarning)\n if dtype is not None and np.dtype(dtype).kind in 'iu':\n array = np.asarray(array, order=order)\n if array.dtype.kind == 'f':\n _assert_all_finite(array, allow_nan=False, msg_dtype=dtype)\n array = array.astype(dtype, casting='unsafe', copy=False)\n else:\n array = np.asarray(array, order=order, dtype=dtype)\n except ComplexWarning as complex_warning:\n raise ValueError('Complex data not supported\\n{}\\n'.format(array)) from complex_warning\n _ensure_no_complex_data(array)\n if ensure_2d:\n if array.ndim == 0:\n raise ValueError('Expected 2D array, got scalar array instead:\\narray={}.\\nReshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.'.format(array))\n if array.ndim == 1:\n raise ValueError('Expected 2D array, got 1D array instead:\\narray={}.\\nReshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.'.format(array))\n if dtype_numeric and array.dtype.kind in 'OUSV':\n warnings.warn(\"Arrays of bytes/strings is being converted to decimal numbers if dtype='numeric'. This behavior is deprecated in 0.24 and will be removed in 1.1 (renaming of 0.26). Please convert your data to numeric values explicitly instead.\", FutureWarning, stacklevel=2)\n try:\n array = array.astype(np.float64)\n except ValueError as e:\n raise ValueError(\"Unable to convert array of bytes/strings into decimal numbers with dtype='numeric'\") from e\n if not allow_nd and array.ndim >= 3:\n raise ValueError('Found array with dim %d. %s expected <= 2.' % (array.ndim, estimator_name))\n if force_all_finite:\n _assert_all_finite(array, allow_nan=force_all_finite == 'allow-nan')\n if ensure_min_samples > 0:\n n_samples = _num_samples(array)\n if n_samples < ensure_min_samples:\n raise ValueError('Found array with %d sample(s) (shape=%s) while a minimum of %d is required%s.' % (n_samples, array.shape, ensure_min_samples, context))\n if ensure_min_features > 0 and array.ndim == 2:\n n_features = array.shape[1]\n if n_features < ensure_min_features:\n raise ValueError('Found array with %d feature(s) (shape=%s) while a minimum of %d is required%s.' 
% (n_features, array.shape, ensure_min_features, context))\n if copy and np.may_share_memory(array, array_orig):\n array = np.array(array, dtype=dtype, order=order)\n return array" }, { @@ -177685,7 +191848,7 @@ "results": [], "is_public": true, "description": "Check that all arrays have consistent first dimensions.\n\nChecks whether all objects in arrays have the same shape or length.", - "docstring": "Check that all arrays have consistent first dimensions.\n\nChecks whether all objects in arrays have the same shape or length.\n\nParameters\n----------\n*arrays : list or tuple of input objects.\n Objects that will be checked for consistent length.", + "docstring": "Check that all arrays have consistent first dimensions.\n\n Checks whether all objects in arrays have the same shape or length.\n\n Parameters\n ----------\n *arrays : list or tuple of input objects.\n Objects that will be checked for consistent length.\n ", "source_code": "\ndef check_consistent_length(*arrays):\n \"\"\"Check that all arrays have consistent first dimensions.\n\n Checks whether all objects in arrays have the same shape or length.\n\n Parameters\n ----------\n *arrays : list or tuple of input objects.\n Objects that will be checked for consistent length.\n \"\"\"\n lengths = [_num_samples(X) for X in arrays if X is not None]\n uniques = np.unique(lengths)\n if len(uniques) > 1:\n raise ValueError('Found input variables with inconsistent numbers of samples: %r' % [int(l) for l in lengths])" }, { @@ -177703,7 +191866,8 @@ "docstring": { "type": "estimator instance", "description": "estimator instance for which the check is performed." - } + }, + "refined_type": {} }, { "name": "attributes", @@ -177713,7 +191877,8 @@ "docstring": { "type": "str, list or tuple of str, default=None", "description": "Attribute name(s) given as string or a list/tuple of strings\nEg.: ``[\"coef_\", \"estimator_\", ...], \"coef_\"``\n\nIf `None`, `estimator` is considered fitted if there exist an\nattribute that ends with a underscore and does not start with double\nunderscore." - } + }, + "refined_type": {} }, { "name": "msg", @@ -177723,7 +191888,8 @@ "docstring": { "type": "str, default=None", "description": "The default error message is, \"This %(name)s instance is not fitted\nyet. Call 'fit' with appropriate arguments before using this\nestimator.\"\n\nFor custom messages if \"%(name)s\" is present in the message string,\nit is substituted for the estimator name.\n\nEg. : \"Estimator, %(name)s, must be fitted before sparsifying\"." - } + }, + "refined_type": {} }, { "name": "all_or_any", @@ -177733,13 +191899,17 @@ "docstring": { "type": "callable, {all, any}, default=all", "description": "Specify whether all or any of the given attributes must exist." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } } ], "results": [], "is_public": true, - "description": "Perform is_fitted validation for estimator.\n\nChecks if the estimator is fitted by verifying the presence of fitted attributes (ending with a trailing underscore) and otherwise raises a NotFittedError with the given message. 
If an estimator does not set any attributes with a trailing underscore, it can define a ``__sklearn_is_fitted__`` method returning a boolean to specify if the estimator is fitted or not.", - "docstring": "Perform is_fitted validation for estimator.\n\nChecks if the estimator is fitted by verifying the presence of\nfitted attributes (ending with a trailing underscore) and otherwise\nraises a NotFittedError with the given message.\n\nIf an estimator does not set any attributes with a trailing underscore, it\ncan define a ``__sklearn_is_fitted__`` method returning a boolean to specify if the\nestimator is fitted or not.\n\nParameters\n----------\nestimator : estimator instance\n estimator instance for which the check is performed.\n\nattributes : str, list or tuple of str, default=None\n Attribute name(s) given as string or a list/tuple of strings\n Eg.: ``[\"coef_\", \"estimator_\", ...], \"coef_\"``\n\n If `None`, `estimator` is considered fitted if there exist an\n attribute that ends with a underscore and does not start with double\n underscore.\n\nmsg : str, default=None\n The default error message is, \"This %(name)s instance is not fitted\n yet. Call 'fit' with appropriate arguments before using this\n estimator.\"\n\n For custom messages if \"%(name)s\" is present in the message string,\n it is substituted for the estimator name.\n\n Eg. : \"Estimator, %(name)s, must be fitted before sparsifying\".\n\nall_or_any : callable, {all, any}, default=all\n Specify whether all or any of the given attributes must exist.\n\nReturns\n-------\nNone\n\nRaises\n------\nNotFittedError\n If the attributes are not found.", + "description": "Perform is_fitted validation for estimator.\n\nChecks if the estimator is fitted by verifying the presence of\nfitted attributes (ending with a trailing underscore) and otherwise\nraises a NotFittedError with the given message.\n\nIf an estimator does not set any attributes with a trailing underscore, it\ncan define a ``__sklearn_is_fitted__`` method returning a boolean to specify if the\nestimator is fitted or not.", + "docstring": "Perform is_fitted validation for estimator.\n\n Checks if the estimator is fitted by verifying the presence of\n fitted attributes (ending with a trailing underscore) and otherwise\n raises a NotFittedError with the given message.\n\n If an estimator does not set any attributes with a trailing underscore, it\n can define a ``__sklearn_is_fitted__`` method returning a boolean to specify if the\n estimator is fitted or not.\n\n Parameters\n ----------\n estimator : estimator instance\n estimator instance for which the check is performed.\n\n attributes : str, list or tuple of str, default=None\n Attribute name(s) given as string or a list/tuple of strings\n Eg.: ``[\"coef_\", \"estimator_\", ...], \"coef_\"``\n\n If `None`, `estimator` is considered fitted if there exist an\n attribute that ends with a underscore and does not start with double\n underscore.\n\n msg : str, default=None\n The default error message is, \"This %(name)s instance is not fitted\n yet. Call 'fit' with appropriate arguments before using this\n estimator.\"\n\n For custom messages if \"%(name)s\" is present in the message string,\n it is substituted for the estimator name.\n\n Eg. 
: \"Estimator, %(name)s, must be fitted before sparsifying\".\n\n all_or_any : callable, {all, any}, default=all\n Specify whether all or any of the given attributes must exist.\n\n Returns\n -------\n None\n\n Raises\n ------\n NotFittedError\n If the attributes are not found.\n ", "source_code": "\ndef check_is_fitted(estimator, attributes=None, *, msg=None, all_or_any=all):\n \"\"\"Perform is_fitted validation for estimator.\n\n Checks if the estimator is fitted by verifying the presence of\n fitted attributes (ending with a trailing underscore) and otherwise\n raises a NotFittedError with the given message.\n\n If an estimator does not set any attributes with a trailing underscore, it\n can define a ``__sklearn_is_fitted__`` method returning a boolean to specify if the\n estimator is fitted or not.\n\n Parameters\n ----------\n estimator : estimator instance\n estimator instance for which the check is performed.\n\n attributes : str, list or tuple of str, default=None\n Attribute name(s) given as string or a list/tuple of strings\n Eg.: ``[\"coef_\", \"estimator_\", ...], \"coef_\"``\n\n If `None`, `estimator` is considered fitted if there exist an\n attribute that ends with a underscore and does not start with double\n underscore.\n\n msg : str, default=None\n The default error message is, \"This %(name)s instance is not fitted\n yet. Call 'fit' with appropriate arguments before using this\n estimator.\"\n\n For custom messages if \"%(name)s\" is present in the message string,\n it is substituted for the estimator name.\n\n Eg. : \"Estimator, %(name)s, must be fitted before sparsifying\".\n\n all_or_any : callable, {all, any}, default=all\n Specify whether all or any of the given attributes must exist.\n\n Returns\n -------\n None\n\n Raises\n ------\n NotFittedError\n If the attributes are not found.\n \"\"\"\n if isclass(estimator):\n raise TypeError('{} is a class, not an instance.'.format(estimator))\n if msg is None:\n msg = \"This %(name)s instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.\"\n if not hasattr(estimator, 'fit'):\n raise TypeError('%s is not an estimator instance.' 
% estimator)\n if attributes is not None:\n if not isinstance(attributes, (list, tuple)):\n attributes = [attributes]\n fitted = all_or_any([hasattr(estimator, attr) for attr in attributes])\n elif hasattr(estimator, '__sklearn_is_fitted__'):\n fitted = estimator.__sklearn_is_fitted__()\n else:\n fitted = [v for v in vars(estimator) if v.endswith('_') and not v.startswith('__')]\n if not fitted:\n raise NotFittedError(msg % {'name': type(estimator).__name__})" }, { @@ -177757,13 +191927,14 @@ "docstring": { "type": "None, str or object with the joblib.Memory interface", "description": "" - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Check that ``memory`` is joblib.Memory-like.\n\njoblib.Memory-like means that ``memory`` can be converted into a joblib.Memory instance (typically a str denoting the ``location``) or has the same interface (has a ``cache`` method).", - "docstring": "Check that ``memory`` is joblib.Memory-like.\n\njoblib.Memory-like means that ``memory`` can be converted into a\njoblib.Memory instance (typically a str denoting the ``location``)\nor has the same interface (has a ``cache`` method).\n\nParameters\n----------\nmemory : None, str or object with the joblib.Memory interface\n\nReturns\n-------\nmemory : object with the joblib.Memory interface\n\nRaises\n------\nValueError\n If ``memory`` is not joblib.Memory-like.", + "description": "Check that ``memory`` is joblib.Memory-like.\n\njoblib.Memory-like means that ``memory`` can be converted into a\njoblib.Memory instance (typically a str denoting the ``location``)\nor has the same interface (has a ``cache`` method).", + "docstring": "Check that ``memory`` is joblib.Memory-like.\n\n joblib.Memory-like means that ``memory`` can be converted into a\n joblib.Memory instance (typically a str denoting the ``location``)\n or has the same interface (has a ``cache`` method).\n\n Parameters\n ----------\n memory : None, str or object with the joblib.Memory interface\n\n Returns\n -------\n memory : object with the joblib.Memory interface\n\n Raises\n ------\n ValueError\n If ``memory`` is not joblib.Memory-like.\n ", "source_code": "\ndef check_memory(memory):\n \"\"\"Check that ``memory`` is joblib.Memory-like.\n\n joblib.Memory-like means that ``memory`` can be converted into a\n joblib.Memory instance (typically a str denoting the ``location``)\n or has the same interface (has a ``cache`` method).\n\n Parameters\n ----------\n memory : None, str or object with the joblib.Memory interface\n\n Returns\n -------\n memory : object with the joblib.Memory interface\n\n Raises\n ------\n ValueError\n If ``memory`` is not joblib.Memory-like.\n \"\"\"\n if memory is None or isinstance(memory, str):\n if parse_version(joblib.__version__) < parse_version('0.12'):\n memory = joblib.Memory(cachedir=memory, verbose=0)\n else:\n memory = joblib.Memory(location=memory, verbose=0)\n elif not hasattr(memory, 'cache'):\n raise ValueError(\"'memory' should be None, a string or have the same interface as joblib.Memory. Got memory='{}' instead.\".format(memory))\n return memory" }, { @@ -177781,6 +191952,10 @@ "docstring": { "type": "{array-like, sparse matrix}", "description": "Input data." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -177791,13 +191966,14 @@ "docstring": { "type": "str", "description": "Who passed X to this function." 
- } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Check if there is any negative value in an array.", - "docstring": "Check if there is any negative value in an array.\n\nParameters\n----------\nX : {array-like, sparse matrix}\n Input data.\n\nwhom : str\n Who passed X to this function.", + "docstring": "\n Check if there is any negative value in an array.\n\n Parameters\n ----------\n X : {array-like, sparse matrix}\n Input data.\n\n whom : str\n Who passed X to this function.\n ", "source_code": "\ndef check_non_negative(X, whom):\n \"\"\"\n Check if there is any negative value in an array.\n\n Parameters\n ----------\n X : {array-like, sparse matrix}\n Input data.\n\n whom : str\n Who passed X to this function.\n \"\"\"\n if sp.issparse(X):\n if X.format in ['lil', 'dok']:\n X = X.tocsr()\n if X.data.size == 0:\n X_min = 0\n else:\n X_min = X.data.min()\n else:\n X_min = X.min()\n if X_min < 0:\n raise ValueError('Negative values in data passed to %s' % whom)" }, { @@ -177815,13 +191991,14 @@ "docstring": { "type": "None, int or instance of RandomState", "description": "If seed is None, return the RandomState singleton used by np.random.\nIf seed is an int, return a new RandomState instance seeded with seed.\nIf seed is already a RandomState instance, return it.\nOtherwise raise ValueError." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Turn seed into a np.random.RandomState instance", - "docstring": "Turn seed into a np.random.RandomState instance\n\nParameters\n----------\nseed : None, int or instance of RandomState\n If seed is None, return the RandomState singleton used by np.random.\n If seed is an int, return a new RandomState instance seeded with seed.\n If seed is already a RandomState instance, return it.\n Otherwise raise ValueError.", + "docstring": "Turn seed into a np.random.RandomState instance\n\n Parameters\n ----------\n seed : None, int or instance of RandomState\n If seed is None, return the RandomState singleton used by np.random.\n If seed is an int, return a new RandomState instance seeded with seed.\n If seed is already a RandomState instance, return it.\n Otherwise raise ValueError.\n ", "source_code": "\ndef check_random_state(seed):\n \"\"\"Turn seed into a np.random.RandomState instance\n\n Parameters\n ----------\n seed : None, int or instance of RandomState\n If seed is None, return the RandomState singleton used by np.random.\n If seed is an int, return a new RandomState instance seeded with seed.\n If seed is already a RandomState instance, return it.\n Otherwise raise ValueError.\n \"\"\"\n if seed is None or seed is np.random:\n return np.random.mtrand._rand\n if isinstance(seed, numbers.Integral):\n return np.random.RandomState(seed)\n if isinstance(seed, np.random.RandomState):\n return seed\n raise ValueError('%r cannot be used to seed a numpy.random.RandomState instance' % seed)" }, { @@ -177839,7 +192016,8 @@ "docstring": { "type": "object", "description": "The scalar parameter to validate." - } + }, + "refined_type": {} }, { "name": "name", @@ -177849,7 +192027,8 @@ "docstring": { "type": "str", "description": "The name of the parameter to be printed in error messages." - } + }, + "refined_type": {} }, { "name": "target_type", @@ -177859,7 +192038,8 @@ "docstring": { "type": "type or tuple", "description": "Acceptable data types for the parameter." 
- } + }, + "refined_type": {} }, { "name": "min_val", @@ -177869,7 +192049,8 @@ "docstring": { "type": "float or int, default=None", "description": "The minimum valid value the parameter can take. If None (default) it\nis implied that the parameter does not have a lower bound." - } + }, + "refined_type": {} }, { "name": "max_val", @@ -177877,9 +192058,10 @@ "is_public": true, "assigned_by": "NAME_ONLY", "docstring": { - "type": "float or int, default=False", + "type": "float or int, default=None", "description": "The maximum valid value the parameter can take. If None (default) it\nis implied that the parameter does not have an upper bound." - } + }, + "refined_type": {} }, { "name": "include_boundaries", @@ -177889,14 +192071,18 @@ "docstring": { "type": "{\"left\", \"right\", \"both\", \"neither\"}, default=\"both\"", "description": "Whether the interval defined by `min_val` and `max_val` should include\nthe boundaries. Possible choices are:\n\n- `\"left\"`: only `min_val` is included in the valid interval;\n- `\"right\"`: only `max_val` is included in the valid interval;\n- `\"both\"`: `min_val` and `max_val` are included in the valid interval;\n- `\"neither\"`: neither `min_val` nor `max_val` are included in the\n valid interval." + }, + "refined_type": { + "kind": "EnumType", + "values": ["right", "left", "neither", "both"] } } ], "results": [], "is_public": true, "description": "Validate scalar parameters type and value.", - "docstring": "Validate scalar parameters type and value.\n\nParameters\n----------\nx : object\n The scalar parameter to validate.\n\nname : str\n The name of the parameter to be printed in error messages.\n\ntarget_type : type or tuple\n Acceptable data types for the parameter.\n\nmin_val : float or int, default=None\n The minimum valid value the parameter can take. If None (default) it\n is implied that the parameter does not have a lower bound.\n\nmax_val : float or int, default=False\n The maximum valid value the parameter can take. If None (default) it\n is implied that the parameter does not have an upper bound.\n\ninclude_boundaries : {\"left\", \"right\", \"both\", \"neither\"}, default=\"both\"\n Whether the interval defined by `min_val` and `max_val` should include\n the boundaries. Possible choices are:\n\n - `\"left\"`: only `min_val` is included in the valid interval;\n - `\"right\"`: only `max_val` is included in the valid interval;\n - `\"both\"`: `min_val` and `max_val` are included in the valid interval;\n - `\"neither\"`: neither `min_val` nor `max_val` are included in the\n valid interval.\n\nReturns\n-------\nx : numbers.Number\n The validated number.\n\nRaises\n------\nTypeError\n If the parameter's type does not match the desired type.\n\nValueError\n If the parameter's value violates the given bounds.", - "source_code": "\ndef check_scalar(x, name, target_type, *, min_val=None, max_val=None, include_boundaries='both'):\n \"\"\"Validate scalar parameters type and value.\n\n Parameters\n ----------\n x : object\n The scalar parameter to validate.\n\n name : str\n The name of the parameter to be printed in error messages.\n\n target_type : type or tuple\n Acceptable data types for the parameter.\n\n min_val : float or int, default=None\n The minimum valid value the parameter can take. If None (default) it\n is implied that the parameter does not have a lower bound.\n\n max_val : float or int, default=False\n The maximum valid value the parameter can take. 
If None (default) it\n is implied that the parameter does not have an upper bound.\n\n include_boundaries : {\"left\", \"right\", \"both\", \"neither\"}, default=\"both\"\n Whether the interval defined by `min_val` and `max_val` should include\n the boundaries. Possible choices are:\n\n - `\"left\"`: only `min_val` is included in the valid interval;\n - `\"right\"`: only `max_val` is included in the valid interval;\n - `\"both\"`: `min_val` and `max_val` are included in the valid interval;\n - `\"neither\"`: neither `min_val` nor `max_val` are included in the\n valid interval.\n\n Returns\n -------\n x : numbers.Number\n The validated number.\n\n Raises\n ------\n TypeError\n If the parameter's type does not match the desired type.\n\n ValueError\n If the parameter's value violates the given bounds.\n \"\"\"\n if not isinstance(x, target_type):\n raise TypeError(f'{name} must be an instance of {target_type}, not {type(x)}.')\n expected_include_boundaries = ('left', 'right', 'both', 'neither')\n if include_boundaries not in expected_include_boundaries:\n raise ValueError(f'Unknown value for `include_boundaries`: {repr(include_boundaries)}. Possible values are: {expected_include_boundaries}.')\n comparison_operator = operator.lt if include_boundaries in ('left', 'both') else operator.le\n if min_val is not None and comparison_operator(x, min_val):\n raise ValueError(f\"{name} == {x}, must be {'>=' if include_boundaries in ('left', 'both') else '>'} {min_val}.\")\n comparison_operator = operator.gt if include_boundaries in ('right', 'both') else operator.ge\n if max_val is not None and comparison_operator(x, max_val):\n raise ValueError(f\"{name} == {x}, must be {'<=' if include_boundaries in ('right', 'both') else '<'} {max_val}.\")\n return x" + "docstring": "Validate scalar parameters type and value.\n\n Parameters\n ----------\n x : object\n The scalar parameter to validate.\n\n name : str\n The name of the parameter to be printed in error messages.\n\n target_type : type or tuple\n Acceptable data types for the parameter.\n\n min_val : float or int, default=None\n The minimum valid value the parameter can take. If None (default) it\n is implied that the parameter does not have a lower bound.\n\n max_val : float or int, default=None\n The maximum valid value the parameter can take. If None (default) it\n is implied that the parameter does not have an upper bound.\n\n include_boundaries : {\"left\", \"right\", \"both\", \"neither\"}, default=\"both\"\n Whether the interval defined by `min_val` and `max_val` should include\n the boundaries. 
Possible choices are:\n\n - `\"left\"`: only `min_val` is included in the valid interval;\n - `\"right\"`: only `max_val` is included in the valid interval;\n - `\"both\"`: `min_val` and `max_val` are included in the valid interval;\n - `\"neither\"`: neither `min_val` nor `max_val` are included in the\n valid interval.\n\n Returns\n -------\n x : numbers.Number\n The validated number.\n\n Raises\n ------\n TypeError\n If the parameter's type does not match the desired type.\n\n ValueError\n If the parameter's value violates the given bounds.\n ", + "source_code": "\ndef check_scalar(x, name, target_type, *, min_val=None, max_val=None, include_boundaries='both'):\n \"\"\"Validate scalar parameters type and value.\n\n Parameters\n ----------\n x : object\n The scalar parameter to validate.\n\n name : str\n The name of the parameter to be printed in error messages.\n\n target_type : type or tuple\n Acceptable data types for the parameter.\n\n min_val : float or int, default=None\n The minimum valid value the parameter can take. If None (default) it\n is implied that the parameter does not have a lower bound.\n\n max_val : float or int, default=None\n The maximum valid value the parameter can take. If None (default) it\n is implied that the parameter does not have an upper bound.\n\n include_boundaries : {\"left\", \"right\", \"both\", \"neither\"}, default=\"both\"\n Whether the interval defined by `min_val` and `max_val` should include\n the boundaries. Possible choices are:\n\n - `\"left\"`: only `min_val` is included in the valid interval;\n - `\"right\"`: only `max_val` is included in the valid interval;\n - `\"both\"`: `min_val` and `max_val` are included in the valid interval;\n - `\"neither\"`: neither `min_val` nor `max_val` are included in the\n valid interval.\n\n Returns\n -------\n x : numbers.Number\n The validated number.\n\n Raises\n ------\n TypeError\n If the parameter's type does not match the desired type.\n\n ValueError\n If the parameter's value violates the given bounds.\n \"\"\"\n if not isinstance(x, target_type):\n raise TypeError(f'{name} must be an instance of {target_type}, not {type(x)}.')\n expected_include_boundaries = ('left', 'right', 'both', 'neither')\n if include_boundaries not in expected_include_boundaries:\n raise ValueError(f'Unknown value for `include_boundaries`: {repr(include_boundaries)}. Possible values are: {expected_include_boundaries}.')\n comparison_operator = operator.lt if include_boundaries in ('left', 'both') else operator.le\n if min_val is not None and comparison_operator(x, min_val):\n raise ValueError(f\"{name} == {x}, must be {'>=' if include_boundaries in ('left', 'both') else '>'} {min_val}.\")\n comparison_operator = operator.gt if include_boundaries in ('right', 'both') else operator.ge\n if max_val is not None and comparison_operator(x, max_val):\n raise ValueError(f\"{name} == {x}, must be {'<=' if include_boundaries in ('right', 'both') else '<'} {max_val}.\")\n return x" }, { "name": "check_symmetric", @@ -177913,6 +192099,10 @@ "docstring": { "type": "{ndarray, sparse matrix}", "description": "Input object to check / convert. Must be two-dimensional and square,\notherwise a ValueError will be raised." + }, + "refined_type": { + "kind": "EnumType", + "values": [] } }, { @@ -177923,7 +192113,8 @@ "docstring": { "type": "float, default=1e-10", "description": "Absolute tolerance for equivalence of arrays. Default = 1E-10." 
- } + }, + "refined_type": {} }, { "name": "raise_warning", @@ -177933,7 +192124,8 @@ "docstring": { "type": "bool, default=True", "description": "If True then raise a warning if conversion is required." - } + }, + "refined_type": {} }, { "name": "raise_exception", @@ -177943,13 +192135,14 @@ "docstring": { "type": "bool, default=False", "description": "If True then raise an exception if array is not symmetric." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Make sure that array is 2D, square and symmetric.\n\nIf the array is not symmetric, then a symmetrized version is returned. Optionally, a warning or exception is raised if the matrix is not symmetric.", - "docstring": "Make sure that array is 2D, square and symmetric.\n\nIf the array is not symmetric, then a symmetrized version is returned.\nOptionally, a warning or exception is raised if the matrix is not\nsymmetric.\n\nParameters\n----------\narray : {ndarray, sparse matrix}\n Input object to check / convert. Must be two-dimensional and square,\n otherwise a ValueError will be raised.\n\ntol : float, default=1e-10\n Absolute tolerance for equivalence of arrays. Default = 1E-10.\n\nraise_warning : bool, default=True\n If True then raise a warning if conversion is required.\n\nraise_exception : bool, default=False\n If True then raise an exception if array is not symmetric.\n\nReturns\n-------\narray_sym : {ndarray, sparse matrix}\n Symmetrized version of the input array, i.e. the average of array\n and array.transpose(). If sparse, then duplicate entries are first\n summed and zeros are eliminated.", + "description": "Make sure that array is 2D, square and symmetric.\n\nIf the array is not symmetric, then a symmetrized version is returned.\nOptionally, a warning or exception is raised if the matrix is not\nsymmetric.", + "docstring": "Make sure that array is 2D, square and symmetric.\n\n If the array is not symmetric, then a symmetrized version is returned.\n Optionally, a warning or exception is raised if the matrix is not\n symmetric.\n\n Parameters\n ----------\n array : {ndarray, sparse matrix}\n Input object to check / convert. Must be two-dimensional and square,\n otherwise a ValueError will be raised.\n\n tol : float, default=1e-10\n Absolute tolerance for equivalence of arrays. Default = 1E-10.\n\n raise_warning : bool, default=True\n If True then raise a warning if conversion is required.\n\n raise_exception : bool, default=False\n If True then raise an exception if array is not symmetric.\n\n Returns\n -------\n array_sym : {ndarray, sparse matrix}\n Symmetrized version of the input array, i.e. the average of array\n and array.transpose(). If sparse, then duplicate entries are first\n summed and zeros are eliminated.\n ", "source_code": "\ndef check_symmetric(array, *, tol=1e-10, raise_warning=True, raise_exception=False):\n \"\"\"Make sure that array is 2D, square and symmetric.\n\n If the array is not symmetric, then a symmetrized version is returned.\n Optionally, a warning or exception is raised if the matrix is not\n symmetric.\n\n Parameters\n ----------\n array : {ndarray, sparse matrix}\n Input object to check / convert. Must be two-dimensional and square,\n otherwise a ValueError will be raised.\n\n tol : float, default=1e-10\n Absolute tolerance for equivalence of arrays. 
Default = 1E-10.\n\n raise_warning : bool, default=True\n If True then raise a warning if conversion is required.\n\n raise_exception : bool, default=False\n If True then raise an exception if array is not symmetric.\n\n Returns\n -------\n array_sym : {ndarray, sparse matrix}\n Symmetrized version of the input array, i.e. the average of array\n and array.transpose(). If sparse, then duplicate entries are first\n summed and zeros are eliminated.\n \"\"\"\n if array.ndim != 2 or array.shape[0] != array.shape[1]:\n raise ValueError('array must be 2-dimensional and square. shape = {0}'.format(array.shape))\n if sp.issparse(array):\n diff = array - array.T\n if diff.format not in ['csr', 'csc', 'coo']:\n diff = diff.tocsr()\n symmetric = np.all(abs(diff.data) < tol)\n else:\n symmetric = np.allclose(array, array.T, atol=tol)\n if not symmetric:\n if raise_exception:\n raise ValueError('Array must be symmetric')\n if raise_warning:\n warnings.warn('Array is not symmetric, and will be converted to symmetric by average with its transpose.', stacklevel=2)\n if sp.issparse(array):\n conversion = 'to' + array.format\n array = getattr(0.5 * (array + array.T), conversion)()\n else:\n array = 0.5 * (array + array.T)\n return array" }, { @@ -177966,8 +192159,9 @@ "assigned_by": "POSITION_OR_NAME", "docstring": { "type": "array-like", - "description": "" - } + "description": "Input data." + }, + "refined_type": {} }, { "name": "warn", @@ -177977,14 +192171,15 @@ "docstring": { "type": "bool, default=False", "description": "To control display of warnings." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, "description": "Ravel column or 1d numpy array, else raises an error.", - "docstring": "Ravel column or 1d numpy array, else raises an error.\n\nParameters\n----------\ny : array-like\n\nwarn : bool, default=False\n To control display of warnings.\n\nReturns\n-------\ny : ndarray", - "source_code": "\ndef column_or_1d(y, *, warn=False):\n \"\"\"Ravel column or 1d numpy array, else raises an error.\n\n Parameters\n ----------\n y : array-like\n\n warn : bool, default=False\n To control display of warnings.\n\n Returns\n -------\n y : ndarray\n\n \"\"\"\n y = np.asarray(y)\n shape = np.shape(y)\n if len(shape) == 1:\n return np.ravel(y)\n if len(shape) == 2 and shape[1] == 1:\n if warn:\n warnings.warn('A column-vector y was passed when a 1d array was expected. 
Please change the shape of y to (n_samples, ), for example using ravel().', DataConversionWarning, stacklevel=2)\n return np.ravel(y)\n raise ValueError('y should be a 1d array, got an array of shape {} instead.'.format(shape))" + "docstring": "Ravel column or 1d numpy array, else raises an error.\n\n Parameters\n ----------\n y : array-like\n Input data.\n\n warn : bool, default=False\n To control display of warnings.\n\n Returns\n -------\n y : ndarray\n Output data.\n\n Raises\n -------\n ValueError\n If `y` is not a 1D array or a 2D array with a single row or column.\n ", + "source_code": "\ndef column_or_1d(y, *, warn=False):\n \"\"\"Ravel column or 1d numpy array, else raises an error.\n\n Parameters\n ----------\n y : array-like\n Input data.\n\n warn : bool, default=False\n To control display of warnings.\n\n Returns\n -------\n y : ndarray\n Output data.\n\n Raises\n -------\n ValueError\n If `y` is not a 1D array or a 2D array with a single row or column.\n \"\"\"\n y = np.asarray(y)\n shape = np.shape(y)\n if len(shape) == 1:\n return np.ravel(y)\n if len(shape) == 2 and shape[1] == 1:\n if warn:\n warnings.warn('A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().', DataConversionWarning, stacklevel=2)\n return np.ravel(y)\n raise ValueError('y should be a 1d array, got an array of shape {} instead.'.format(shape))" }, { "name": "has_fit_parameter", @@ -178001,7 +192196,8 @@ "docstring": { "type": "object", "description": "An estimator to inspect." - } + }, + "refined_type": {} }, { "name": "parameter", @@ -178011,14 +192207,15 @@ "docstring": { "type": "str", "description": "The searched parameter." - } + }, + "refined_type": {} } ], "results": [], "is_public": true, - "description": "Checks whether the estimator's fit method supports the given parameter.", - "docstring": "Checks whether the estimator's fit method supports the given parameter.\n\nParameters\n----------\nestimator : object\n An estimator to inspect.\n\nparameter : str\n The searched parameter.\n\nReturns\n-------\nis_parameter: bool\n Whether the parameter was found to be a named parameter of the\n estimator's fit method.\n\nExamples\n--------\n>>> from sklearn.svm import SVC\n>>> from sklearn.utils.validation import has_fit_parameter\n>>> has_fit_parameter(SVC(), \"sample_weight\")\nTrue", - "source_code": "\ndef has_fit_parameter(estimator, parameter):\n \"\"\"Checks whether the estimator's fit method supports the given parameter.\n\n Parameters\n ----------\n estimator : object\n An estimator to inspect.\n\n parameter : str\n The searched parameter.\n\n Returns\n -------\n is_parameter: bool\n Whether the parameter was found to be a named parameter of the\n estimator's fit method.\n\n Examples\n --------\n >>> from sklearn.svm import SVC\n >>> from sklearn.utils.validation import has_fit_parameter\n >>> has_fit_parameter(SVC(), \"sample_weight\")\n True\n\n \"\"\"\n return parameter in signature(estimator.fit).parameters" + "description": "Check whether the estimator's fit method supports the given parameter.", + "docstring": "Check whether the estimator's fit method supports the given parameter.\n\n Parameters\n ----------\n estimator : object\n An estimator to inspect.\n\n parameter : str\n The searched parameter.\n\n Returns\n -------\n is_parameter : bool\n Whether the parameter was found to be a named parameter of the\n estimator's fit method.\n\n Examples\n --------\n >>> from sklearn.svm import SVC\n >>> from 
sklearn.utils.validation import has_fit_parameter\n >>> has_fit_parameter(SVC(), \"sample_weight\")\n True\n ", + "source_code": "\ndef has_fit_parameter(estimator, parameter):\n \"\"\"Check whether the estimator's fit method supports the given parameter.\n\n Parameters\n ----------\n estimator : object\n An estimator to inspect.\n\n parameter : str\n The searched parameter.\n\n Returns\n -------\n is_parameter : bool\n Whether the parameter was found to be a named parameter of the\n estimator's fit method.\n\n Examples\n --------\n >>> from sklearn.svm import SVC\n >>> from sklearn.utils.validation import has_fit_parameter\n >>> has_fit_parameter(SVC(), \"sample_weight\")\n True\n \"\"\"\n return parameter in signature(estimator.fit).parameters" }, { "name": "indexable", @@ -178029,9 +192226,9 @@ "parameters": [], "results": [], "is_public": true, - "description": "Make arrays indexable for cross-validation.\n\nChecks consistent length, passes through None, and ensures that everything can be indexed by converting sparse matrices to csr and converting non-interable objects to arrays.", - "docstring": "Make arrays indexable for cross-validation.\n\nChecks consistent length, passes through None, and ensures that everything\ncan be indexed by converting sparse matrices to csr and converting\nnon-interable objects to arrays.\n\nParameters\n----------\n*iterables : {lists, dataframes, ndarrays, sparse matrices}\n List of objects to ensure sliceability.", - "source_code": "\ndef indexable(*iterables):\n \"\"\"Make arrays indexable for cross-validation.\n\n Checks consistent length, passes through None, and ensures that everything\n can be indexed by converting sparse matrices to csr and converting\n non-interable objects to arrays.\n\n Parameters\n ----------\n *iterables : {lists, dataframes, ndarrays, sparse matrices}\n List of objects to ensure sliceability.\n \"\"\"\n result = [_make_indexable(X) for X in iterables]\n check_consistent_length(*result)\n return result" + "description": "Make arrays indexable for cross-validation.\n\nChecks consistent length, passes through None, and ensures that everything\ncan be indexed by converting sparse matrices to csr and converting\nnon-interable objects to arrays.", + "docstring": "Make arrays indexable for cross-validation.\n\n Checks consistent length, passes through None, and ensures that everything\n can be indexed by converting sparse matrices to csr and converting\n non-interable objects to arrays.\n\n Parameters\n ----------\n *iterables : {lists, dataframes, ndarrays, sparse matrices}\n List of objects to ensure sliceability.\n\n Returns\n -------\n result : list of {ndarray, sparse matrix, dataframe} or None\n Returns a list containing indexable arrays (i.e. NumPy array,\n sparse matrix, or dataframe) or `None`.\n ", + "source_code": "\ndef indexable(*iterables):\n \"\"\"Make arrays indexable for cross-validation.\n\n Checks consistent length, passes through None, and ensures that everything\n can be indexed by converting sparse matrices to csr and converting\n non-interable objects to arrays.\n\n Parameters\n ----------\n *iterables : {lists, dataframes, ndarrays, sparse matrices}\n List of objects to ensure sliceability.\n\n Returns\n -------\n result : list of {ndarray, sparse matrix, dataframe} or None\n Returns a list containing indexable arrays (i.e. 
NumPy array,\n sparse matrix, or dataframe) or `None`.\n \"\"\"\n result = [_make_indexable(X) for X in iterables]\n check_consistent_length(*result)\n return result" } ] } diff --git a/package-parser/README.md b/package-parser/README.md index 7275151ff..7c6223adb 100644 --- a/package-parser/README.md +++ b/package-parser/README.md @@ -1,6 +1,6 @@ -# library-analyzer +# package-parser -A tool to analyzer client and API code written in Python. +A tool to analyze client and API code written in Python. ## Usage diff --git a/package-parser/package_parser/commands/get_api/__init__.py b/package-parser/package_parser/commands/get_api/__init__.py index 17d7b8ccd..5784fde26 100644 --- a/package-parser/package_parser/commands/get_api/__init__.py +++ b/package-parser/package_parser/commands/get_api/__init__.py @@ -1,7 +1,11 @@ from ._get_api import get_api from ._model import ( API, + Action, + APIDependencies, Class, + Condition, + Dependency, FromImport, Function, Import, @@ -9,6 +13,10 @@ Parameter, ParameterAndResultDocstring, ParameterAssignment, + ParameterHasValue, + ParameterIsIgnored, + ParameterIsIllegal, + ParameterIsNone, Result, ) from ._package_metadata import ( diff --git a/package-parser/package_parser/commands/get_api/_ast_visitor.py b/package-parser/package_parser/commands/get_api/_ast_visitor.py index 460c8e90e..de6d069e2 100644 --- a/package-parser/package_parser/commands/get_api/_ast_visitor.py +++ b/package-parser/package_parser/commands/get_api/_ast_visitor.py @@ -160,11 +160,11 @@ def __description(numpydoc: NumpyDocString) -> str: result = "" if has_summary: - result += " ".join(numpydoc["Summary"]) + result += "\n".join(numpydoc["Summary"]) if has_summary and has_extended_summary: result += "\n\n" if has_extended_summary: - result += " ".join(numpydoc["Extended Summary"]) + result += "\n".join(numpydoc["Extended Summary"]) return result @staticmethod diff --git a/package-parser/package_parser/commands/get_api/_model.py b/package-parser/package_parser/commands/get_api/_model.py index d23aaa617..4c36c3408 100644 --- a/package-parser/package_parser/commands/get_api/_model.py +++ b/package-parser/package_parser/commands/get_api/_model.py @@ -167,29 +167,29 @@ def to_json(self) -> Any: } +@dataclass class Import: + module_name: str + alias: Optional[str] + @staticmethod def from_json(json: Any) -> Import: return Import(json["module"], json["alias"]) - def __init__(self, module_name: str, alias: Optional[str]): - self.module: str = module_name - self.alias: Optional[str] = alias - def to_json(self) -> Any: - return {"module": self.module, "alias": self.alias} + return {"module": self.module_name, "alias": self.alias} +@dataclass class FromImport: + module_name: str + declaration_name: str + alias: Optional[str] + @staticmethod def from_json(json: Any) -> FromImport: return FromImport(json["module"], json["declaration"], json["alias"]) - def __init__(self, module_name: str, declaration_name: str, alias: Optional[str]): - self.module_name: str = module_name - self.declaration_name: str = declaration_name - self.alias: Optional[str] = alias - def to_json(self) -> Any: return { "module": self.module_name, @@ -256,7 +256,17 @@ def to_json(self) -> Any: } +@dataclass class Function: + qname: str + decorators: list[str] + parameters: list[Parameter] + results: list[Result] + is_public: bool + description: str + docstring: str + source_code: str + @staticmethod def from_json(json: Any) -> Function: return Function( @@ -273,26 +283,6 @@ def from_json(json: Any) -> Function: 
json["source_code"], ) - def __init__( - self, - qname: str, - decorators: list[str], - parameters: list[Parameter], - results: list[Result], - is_public: bool, - description: str, - docstring: str, - source_code: str, - ) -> None: - self.qname: str = qname - self.decorators: list[str] = decorators - self.parameters: list[Parameter] = parameters - self.results: list[Result] = results - self.is_public: bool = is_public - self.description: str = description - self.docstring: str = inspect.cleandoc(docstring or "") - self.source_code: str = source_code - @property def name(self) -> str: return self.qname.split(".")[-1] @@ -370,7 +360,7 @@ def from_docstring(cls, docstring: ParameterAndResultDocstring) -> RefinedType: def __init__( self, - ref_type: Optional[Union[UnionType, BoundaryType, EnumType, NamedType]] = None, + ref_type: Union[UnionType, BoundaryType, EnumType, NamedType, None] = None, ) -> None: self.ref_type = ref_type @@ -423,34 +413,30 @@ class ParameterAssignment(Enum): NAME_ONLY = (auto(),) +@dataclass class Result: + name: str + docstring: ParameterAndResultDocstring + @staticmethod def from_json(json: Any) -> Result: return Result( json["name"], ParameterAndResultDocstring.from_json(json["docstring"]) ) - def __init__(self, name: str, docstring: ParameterAndResultDocstring) -> None: - self.name: str = name - self.docstring = docstring - def to_json(self) -> Any: return {"name": self.name, "docstring": self.docstring.to_json()} +@dataclass class ParameterAndResultDocstring: + type: str + description: str + @classmethod def from_json(cls, json: Any): return cls(json["type"], json["description"]) - def __init__( - self, - type_: str, - description: str, - ) -> None: - self.type: str = type_ - self.description: str = description - def to_json(self) -> Any: return {"type": self.type, "description": self.description} @@ -529,10 +515,10 @@ class Dependency: @classmethod def from_json(cls, json: Any): return cls( - Parameter.from_json(["hasDependentParameter"]), - Parameter.from_json(["isDependingOn"]), - Condition.from_json(["hasCondition"]), - Action.from_json(["hasAction"]), + Parameter.from_json(json["hasDependentParameter"]), + Parameter.from_json(json["isDependingOn"]), + Condition.from_json(json["hasCondition"]), + Action.from_json(json["hasAction"]), ) def to_json(self) -> Dict: diff --git a/package-parser/package_parser/commands/get_dependencies/README.md b/package-parser/package_parser/commands/get_dependencies/README.md index da1235502..90abeeddb 100644 --- a/package-parser/package_parser/commands/get_dependencies/README.md +++ b/package-parser/package_parser/commands/get_dependencies/README.md @@ -1,4 +1,4 @@ -# Dependency Extaction +# Dependency Extraction ## How do we imagine a Dependency @@ -16,13 +16,13 @@ Parsing a dependency subtree in an InOrder traversal, we can rebuild a sentence ### Dependency Tree Example -![Alt text](../../../package_parser/commands/get_dependencies/dependency_tree_example.png "Dependency Tree Example") +![Alt text](dependency_tree_example.png "Dependency Tree Example") ## Where to continue work? * Development of more patterns in _dependency_patterns.py, see [Dependency Parser](https://spacy.io/usage/linguistic-features#dependency-parse), [Dependency Matcher](https://spacy.io/usage/rule-based-matching#dependencymatcher) * Look into pattern matching over multiple sentences. Have a more theoretical understanding of the meaning of a dependency in the english language. -* Better classification of the subclasses of Actions, Condtions. 
These classes are found in __../get_api/_model.py__. +* Better classification of the subclasses of Actions, Conditions. These classes are found in __../get_api/_model.py__. ### How to continue work? It is currently constructed such that for each pattern, there exists a corresponding function __extract_(pattern_name)__ within the class DependencyExtractor. This is due to the assumption that different types of dependency patterns will have different structured sentences, and thus require slightly different combinations of methods to extract the wanted information. diff --git a/package-parser/package_parser/commands/get_dependencies/_get_dependency.py b/package-parser/package_parser/commands/get_dependencies/_get_dependency.py index 86e76f8c9..032babb2f 100644 --- a/package-parser/package_parser/commands/get_dependencies/_get_dependency.py +++ b/package-parser/package_parser/commands/get_dependencies/_get_dependency.py @@ -6,7 +6,7 @@ from spacy.tokens.doc import Doc from spacy.tokens.span import Span -from ..get_api._model import ( +from ..get_api import ( API, Action, APIDependencies, @@ -61,7 +61,7 @@ def extract_action(action_token: Token, condition_token: Token) -> Action: if token != condition_token: action_tokens.extend(extract_lefts_and_rights(token)) - # Remove trailing punctiation + # Remove trailing punctuation if any(p == action_tokens[-1] for p in [",", "."]): del action_tokens[-1] action_text = " ".join(action_tokens) diff --git a/package-parser/tests/commands/get_dependencies/test_get_dependency.py b/package-parser/tests/commands/get_dependencies/test_get_dependency.py index 0d6db7c34..634e6e4e1 100644 --- a/package-parser/tests/commands/get_dependencies/test_get_dependency.py +++ b/package-parser/tests/commands/get_dependencies/test_get_dependency.py @@ -5,6 +5,7 @@ Dependency, Parameter, ParameterAndResultDocstring, + ParameterAssignment, ParameterHasValue, ParameterIsIgnored, ParameterIsIllegal, @@ -99,18 +100,18 @@ def test_extract_dependencies_from_docstring_pattern_adverbial_clause(): name="random_state", default_value=None, is_public=True, - assigned_by="NAME_ONLY", + assigned_by=ParameterAssignment.NAME_ONLY, docstring=ParameterAndResultDocstring( - type_="param possible types", description=param_docstring_nlp.text + type="param possible types", description=param_docstring_nlp.text ), ) dependee_param = Parameter( name="probability", default_value=None, is_public=True, - assigned_by="NAME_ONLY", + assigned_by=ParameterAssignment.NAME_ONLY, docstring=ParameterAndResultDocstring( - type_="param possible types", description="param probability docstring" + type="param possible types", description="param probability docstring" ), ) pattern_parameter_subordinating_conjunction = nlp(